/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * SM4 Cipher Algorithm for ARMv8 NEON
 * as specified in
 * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html
 *
 * Copyright (C) 2022, Alibaba Group.
 * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

/* Register macros */

#define RTMP0   v8
#define RTMP1   v9
#define RTMP2   v10
#define RTMP3   v11

#define RTMP4   v12
#define RTMP5   v13
#define RTMP6   v14
#define RTMP7   v15

#define RX0     v12
#define RX1     v13
#define RKEY    v14
#define RIV     v15
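
/*
 * Note: RTMP4-RTMP7 and RX0/RX1/RKEY/RIV are aliases for the same
 * registers (v12-v15).  Macros that use RTMP4-RTMP7 (the 2x
 * transpose/rotate helpers and the bulk ciphertext loads below) must
 * therefore not run while RX0, RX1, RKEY or RIV still hold live values.
 */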

/* Helper macros. */

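/*
 * Load the 256-byte SM4 S-box (crypto_sm4_sbox) into v16-v31, 16 bytes
 * per register, for the tbl/tbx lookups in the round macros below.
 * Clobbers x5.
 */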
#define SM4_PREPARE()                                           \
        adr_l           x5, crypto_sm4_sbox;                    \
        ld1             {v16.16b-v19.16b}, [x5], #64;           \
        ld1             {v20.16b-v23.16b}, [x5], #64;           \
        ld1             {v24.16b-v27.16b}, [x5], #64;           \
        ld1             {v28.16b-v31.16b}, [x5];

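/*
 * Transpose a 4x4 matrix of 32-bit words held in s0-s3, i.e. convert
 * four blocks from "one block per register" into the word-sliced
 * "one state word per register" layout used by the round macros.
 * Clobbers RTMP0-RTMP3; the _2x variant transposes two independent
 * groups of four blocks and additionally clobbers RTMP4-RTMP7.
 */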
#define transpose_4x4(s0, s1, s2, s3)                           \
        zip1            RTMP0.4s, s0.4s, s1.4s;                 \
        zip1            RTMP1.4s, s2.4s, s3.4s;                 \
        zip2            RTMP2.4s, s0.4s, s1.4s;                 \
        zip2            RTMP3.4s, s2.4s, s3.4s;                 \
        zip1            s0.2d, RTMP0.2d, RTMP1.2d;              \
        zip2            s1.2d, RTMP0.2d, RTMP1.2d;              \
        zip1            s2.2d, RTMP2.2d, RTMP3.2d;              \
        zip2            s3.2d, RTMP2.2d, RTMP3.2d;

#define transpose_4x4_2x(s0, s1, s2, s3, s4, s5, s6, s7)        \
        zip1            RTMP0.4s, s0.4s, s1.4s;                 \
        zip1            RTMP1.4s, s2.4s, s3.4s;                 \
        zip2            RTMP2.4s, s0.4s, s1.4s;                 \
        zip2            RTMP3.4s, s2.4s, s3.4s;                 \
        zip1            RTMP4.4s, s4.4s, s5.4s;                 \
        zip1            RTMP5.4s, s6.4s, s7.4s;                 \
        zip2            RTMP6.4s, s4.4s, s5.4s;                 \
        zip2            RTMP7.4s, s6.4s, s7.4s;                 \
        zip1            s0.2d, RTMP0.2d, RTMP1.2d;              \
        zip2            s1.2d, RTMP0.2d, RTMP1.2d;              \
        zip1            s2.2d, RTMP2.2d, RTMP3.2d;              \
        zip2            s3.2d, RTMP2.2d, RTMP3.2d;              \
        zip1            s4.2d, RTMP4.2d, RTMP5.2d;              \
        zip2            s5.2d, RTMP4.2d, RTMP5.2d;              \
        zip1            s6.2d, RTMP6.2d, RTMP7.2d;              \
        zip2            s7.2d, RTMP6.2d, RTMP7.2d;

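/*
 * Rotate the 4x4 word matrix in s0-s3 by 90 degrees clockwise.  This
 * undoes the input transpose and at the same time reverses the order
 * of the four state words, which is the final reverse transform R
 * required by SM4, so each output register again holds one complete
 * block.  Clobbers RTMP0-RTMP3 (the _2x variant also RTMP4-RTMP7).
 */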
#define rotate_clockwise_4x4(s0, s1, s2, s3)                    \
        zip1            RTMP0.4s, s1.4s, s0.4s;                 \
        zip2            RTMP1.4s, s1.4s, s0.4s;                 \
        zip1            RTMP2.4s, s3.4s, s2.4s;                 \
        zip2            RTMP3.4s, s3.4s, s2.4s;                 \
        zip1            s0.2d, RTMP2.2d, RTMP0.2d;              \
        zip2            s1.2d, RTMP2.2d, RTMP0.2d;              \
        zip1            s2.2d, RTMP3.2d, RTMP1.2d;              \
        zip2            s3.2d, RTMP3.2d, RTMP1.2d;

#define rotate_clockwise_4x4_2x(s0, s1, s2, s3, s4, s5, s6, s7) \
        zip1            RTMP0.4s, s1.4s, s0.4s;                 \
        zip1            RTMP2.4s, s3.4s, s2.4s;                 \
        zip2            RTMP1.4s, s1.4s, s0.4s;                 \
        zip2            RTMP3.4s, s3.4s, s2.4s;                 \
        zip1            RTMP4.4s, s5.4s, s4.4s;                 \
        zip1            RTMP6.4s, s7.4s, s6.4s;                 \
        zip2            RTMP5.4s, s5.4s, s4.4s;                 \
        zip2            RTMP7.4s, s7.4s, s6.4s;                 \
        zip1            s0.2d, RTMP2.2d, RTMP0.2d;              \
        zip2            s1.2d, RTMP2.2d, RTMP0.2d;              \
        zip1            s2.2d, RTMP3.2d, RTMP1.2d;              \
        zip2            s3.2d, RTMP3.2d, RTMP1.2d;              \
        zip1            s4.2d, RTMP6.2d, RTMP4.2d;              \
        zip2            s5.2d, RTMP6.2d, RTMP4.2d;              \
        zip1            s6.2d, RTMP7.2d, RTMP5.2d;              \
        zip2            s7.2d, RTMP7.2d, RTMP5.2d;

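/*
 * One SM4 round on four blocks in parallel (word-sliced layout):
 *
 *   s0 ^= T(rk ^ s1 ^ s2 ^ s3)
 *
 * where T = L(tau(x)): tau applies the 8-bit S-box to each byte of the
 * word, done below with tbl/tbx over the 256-byte table in v16-v31
 * (the indices are reduced by 64 between the four 64-byte lookups),
 * and L(x) = x ^ rol32(x, 2) ^ rol32(x, 10) ^ rol32(x, 18) ^
 * rol32(x, 24), computed here as
 * (x ^ rol32(x, 24)) ^ rol32(x ^ rol32(x, 8) ^ rol32(x, 16), 2).
 * 'round' selects one of the four round keys currently held in RKEY.
 */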
#define ROUND4(round, s0, s1, s2, s3)                           \
        dup             RX0.4s, RKEY.s[round];                  \
        /* rk ^ s1 ^ s2 ^ s3 */                                 \
        eor             RTMP1.16b, s2.16b, s3.16b;              \
        eor             RX0.16b, RX0.16b, s1.16b;               \
        eor             RX0.16b, RX0.16b, RTMP1.16b;            \
                                                                \
        /* sbox, non-linear part */                             \
        movi            RTMP3.16b, #64;  /* sizeof(sbox) / 4 */ \
        tbl             RTMP0.16b, {v16.16b-v19.16b}, RX0.16b;  \
        sub             RX0.16b, RX0.16b, RTMP3.16b;            \
        tbx             RTMP0.16b, {v20.16b-v23.16b}, RX0.16b;  \
        sub             RX0.16b, RX0.16b, RTMP3.16b;            \
        tbx             RTMP0.16b, {v24.16b-v27.16b}, RX0.16b;  \
        sub             RX0.16b, RX0.16b, RTMP3.16b;            \
        tbx             RTMP0.16b, {v28.16b-v31.16b}, RX0.16b;  \
                                                                \
        /* linear part */                                       \
        shl             RTMP1.4s, RTMP0.4s, #8;                 \
        shl             RTMP2.4s, RTMP0.4s, #16;                \
        shl             RTMP3.4s, RTMP0.4s, #24;                \
        sri             RTMP1.4s, RTMP0.4s, #(32-8);            \
        sri             RTMP2.4s, RTMP0.4s, #(32-16);           \
        sri             RTMP3.4s, RTMP0.4s, #(32-24);           \
        /* RTMP1 = x ^ rol32(x, 8) ^ rol32(x, 16) */            \
        eor             RTMP1.16b, RTMP1.16b, RTMP0.16b;        \
        eor             RTMP1.16b, RTMP1.16b, RTMP2.16b;        \
        /* RTMP3 = x ^ rol32(x, 24) ^ rol32(RTMP1, 2) */        \
        eor             RTMP3.16b, RTMP3.16b, RTMP0.16b;        \
        shl             RTMP2.4s, RTMP1.4s, #2;                 \
        sri             RTMP2.4s, RTMP1.4s, #(32-2);            \
        eor             RTMP3.16b, RTMP3.16b, RTMP2.16b;        \
        /* s0 ^= RTMP3 */                                       \
        eor             s0.16b, s0.16b, RTMP3.16b;

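/*
 * Run the 32 SM4 rounds (8 loop iterations of 4 rounds) on four blocks
 * using the round keys at x0.  Input must already be byte-swapped
 * (rev32) and transposed into the word-sliced layout; the output is
 * rotated back so each register holds one finished block in memory
 * byte order.  x0 is rewound to the start of the round key array and
 * x6 is clobbered.  SM4_CRYPT_BLK4 below is the wrapper that performs
 * the rev32 first.
 */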
#define SM4_CRYPT_BLK4_BE(b0, b1, b2, b3)                       \
        mov             x6, 8;                                  \
4:                                                              \
        ld1             {RKEY.4s}, [x0], #16;                   \
        subs            x6, x6, #1;                             \
                                                                \
        ROUND4(0, b0, b1, b2, b3);                              \
        ROUND4(1, b1, b2, b3, b0);                              \
        ROUND4(2, b2, b3, b0, b1);                              \
        ROUND4(3, b3, b0, b1, b2);                              \
                                                                \
        bne             4b;                                     \
                                                                \
        rev32           b0.16b, b0.16b;                         \
        rev32           b1.16b, b1.16b;                         \
        rev32           b2.16b, b2.16b;                         \
        rev32           b3.16b, b3.16b;                         \
                                                                \
        rotate_clockwise_4x4(b0, b1, b2, b3);                   \
                                                                \
        /* repoint to rkey */                                   \
        sub             x0, x0, #128;

#define SM4_CRYPT_BLK4(b0, b1, b2, b3)                          \
        rev32           b0.16b, b0.16b;                         \
        rev32           b1.16b, b1.16b;                         \
        rev32           b2.16b, b2.16b;                         \
        rev32           b3.16b, b3.16b;                         \
        SM4_CRYPT_BLK4_BE(b0, b1, b2, b3);

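/*
 * Same as ROUND4, but processes two independent groups of four blocks
 * (s0-s3 and t0-t3) with interleaved instructions to hide the tbl/tbx
 * and shift latencies.
 */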
#define ROUND8(round, s0, s1, s2, s3, t0, t1, t2, t3)           \
        /* rk ^ s1 ^ s2 ^ s3 */                                 \
        dup             RX0.4s, RKEY.s[round];                  \
        eor             RTMP0.16b, s2.16b, s3.16b;              \
        mov             RX1.16b, RX0.16b;                       \
        eor             RTMP1.16b, t2.16b, t3.16b;              \
        eor             RX0.16b, RX0.16b, s1.16b;               \
        eor             RX1.16b, RX1.16b, t1.16b;               \
        eor             RX0.16b, RX0.16b, RTMP0.16b;            \
        eor             RX1.16b, RX1.16b, RTMP1.16b;            \
                                                                \
        /* sbox, non-linear part */                             \
        movi            RTMP3.16b, #64;  /* sizeof(sbox) / 4 */ \
        tbl             RTMP0.16b, {v16.16b-v19.16b}, RX0.16b;  \
        tbl             RTMP1.16b, {v16.16b-v19.16b}, RX1.16b;  \
        sub             RX0.16b, RX0.16b, RTMP3.16b;            \
        sub             RX1.16b, RX1.16b, RTMP3.16b;            \
        tbx             RTMP0.16b, {v20.16b-v23.16b}, RX0.16b;  \
        tbx             RTMP1.16b, {v20.16b-v23.16b}, RX1.16b;  \
        sub             RX0.16b, RX0.16b, RTMP3.16b;            \
        sub             RX1.16b, RX1.16b, RTMP3.16b;            \
        tbx             RTMP0.16b, {v24.16b-v27.16b}, RX0.16b;  \
        tbx             RTMP1.16b, {v24.16b-v27.16b}, RX1.16b;  \
        sub             RX0.16b, RX0.16b, RTMP3.16b;            \
        sub             RX1.16b, RX1.16b, RTMP3.16b;            \
        tbx             RTMP0.16b, {v28.16b-v31.16b}, RX0.16b;  \
        tbx             RTMP1.16b, {v28.16b-v31.16b}, RX1.16b;  \
                                                                \
        /* linear part */                                       \
        shl             RX0.4s, RTMP0.4s, #8;                   \
        shl             RX1.4s, RTMP1.4s, #8;                   \
        shl             RTMP2.4s, RTMP0.4s, #16;                \
        shl             RTMP3.4s, RTMP1.4s, #16;                \
        sri             RX0.4s, RTMP0.4s, #(32 - 8);            \
        sri             RX1.4s, RTMP1.4s, #(32 - 8);            \
        sri             RTMP2.4s, RTMP0.4s, #(32 - 16);         \
        sri             RTMP3.4s, RTMP1.4s, #(32 - 16);         \
        /* RX = x ^ rol32(x, 8) ^ rol32(x, 16) */               \
        eor             RX0.16b, RX0.16b, RTMP0.16b;            \
        eor             RX1.16b, RX1.16b, RTMP1.16b;            \
        eor             RX0.16b, RX0.16b, RTMP2.16b;            \
        eor             RX1.16b, RX1.16b, RTMP3.16b;            \
        /* RTMP0/1 ^= x ^ rol32(x, 24) ^ rol32(RX, 2) */        \
        shl             RTMP2.4s, RTMP0.4s, #24;                \
        shl             RTMP3.4s, RTMP1.4s, #24;                \
        sri             RTMP2.4s, RTMP0.4s, #(32 - 24);         \
        sri             RTMP3.4s, RTMP1.4s, #(32 - 24);         \
        eor             RTMP0.16b, RTMP0.16b, RTMP2.16b;        \
        eor             RTMP1.16b, RTMP1.16b, RTMP3.16b;        \
        shl             RTMP2.4s, RX0.4s, #2;                   \
        shl             RTMP3.4s, RX1.4s, #2;                   \
        sri             RTMP2.4s, RX0.4s, #(32 - 2);            \
        sri             RTMP3.4s, RX1.4s, #(32 - 2);            \
        eor             RTMP0.16b, RTMP0.16b, RTMP2.16b;        \
        eor             RTMP1.16b, RTMP1.16b, RTMP3.16b;        \
        /* s0/t0 ^= RTMP0/1 */                                  \
        eor             s0.16b, s0.16b, RTMP0.16b;              \
        eor             t0.16b, t0.16b, RTMP1.16b;

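/*
 * Eight-block variant of SM4_CRYPT_BLK4, taking natural byte order
 * input in word-sliced layout.  The "_norotate" form leaves the result
 * in the transposed layout; callers rotate back themselves.  CBC
 * decryption does that as two 4x4 rotates because the 2x rotate uses
 * RTMP4-RTMP7 (v12-v15), which would clobber RIV.
 */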
#define SM4_CRYPT_BLK8_norotate(b0, b1, b2, b3, b4, b5, b6, b7) \
        rev32           b0.16b, b0.16b;                         \
        rev32           b1.16b, b1.16b;                         \
        rev32           b2.16b, b2.16b;                         \
        rev32           b3.16b, b3.16b;                         \
        rev32           b4.16b, b4.16b;                         \
        rev32           b5.16b, b5.16b;                         \
        rev32           b6.16b, b6.16b;                         \
        rev32           b7.16b, b7.16b;                         \
                                                                \
        mov             x6, 8;                                  \
8:                                                              \
        ld1             {RKEY.4s}, [x0], #16;                   \
        subs            x6, x6, #1;                             \
                                                                \
        ROUND8(0, b0, b1, b2, b3, b4, b5, b6, b7);              \
        ROUND8(1, b1, b2, b3, b0, b5, b6, b7, b4);              \
        ROUND8(2, b2, b3, b0, b1, b6, b7, b4, b5);              \
        ROUND8(3, b3, b0, b1, b2, b7, b4, b5, b6);              \
                                                                \
        bne             8b;                                     \
                                                                \
        rev32           b0.16b, b0.16b;                         \
        rev32           b1.16b, b1.16b;                         \
        rev32           b2.16b, b2.16b;                         \
        rev32           b3.16b, b3.16b;                         \
        rev32           b4.16b, b4.16b;                         \
        rev32           b5.16b, b5.16b;                         \
        rev32           b6.16b, b6.16b;                         \
        rev32           b7.16b, b7.16b;                         \
                                                                \
        /* repoint to rkey */                                   \
        sub             x0, x0, #128;

#define SM4_CRYPT_BLK8(b0, b1, b2, b3, b4, b5, b6, b7)                  \
        SM4_CRYPT_BLK8_norotate(b0, b1, b2, b3, b4, b5, b6, b7);        \
        rotate_clockwise_4x4_2x(b0, b1, b2, b3, b4, b5, b6, b7);        \


.align 3
SYM_FUNC_START(sm4_neon_crypt)
        /* input:
         *   x0: round key array, CTX
         *   x1: dst
         *   x2: src
         *   w3: nblocks
         */
        SM4_PREPARE()

.Lcrypt_loop_8x:
        sub             w3, w3, #8
        tbnz            w3, #31, .Lcrypt_4x

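        /*
         * ld4 de-interleaves 32-bit elements while loading, so the
         * eight blocks arrive already in the word-sliced layout and no
         * explicit transpose_4x4 is needed here.
         */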
        ld4             {v0.4s-v3.4s}, [x2], #64
        ld4             {v4.4s-v7.4s}, [x2], #64

        SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

        st1             {v0.16b-v3.16b}, [x1], #64
        st1             {v4.16b-v7.16b}, [x1], #64

        cbz             w3, .Lcrypt_end
        b               .Lcrypt_loop_8x

.Lcrypt_4x:
        add             w3, w3, #8
        cmp             w3, #4
        blt             .Lcrypt_tail

        sub             w3, w3, #4

        ld4             {v0.4s-v3.4s}, [x2], #64

        SM4_CRYPT_BLK4(v0, v1, v2, v3)

        st1             {v0.16b-v3.16b}, [x1], #64

        cbz             w3, .Lcrypt_end

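        /*
         * 1 to 3 blocks remain: only the valid blocks are loaded and
         * stored, the remaining registers go through the cipher as
         * don't-care lanes.
         */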
.Lcrypt_tail:
        cmp             w3, #2
        ld1             {v0.16b}, [x2], #16
        blt             .Lcrypt_tail_load_done
        ld1             {v1.16b}, [x2], #16
        beq             .Lcrypt_tail_load_done
        ld1             {v2.16b}, [x2], #16

.Lcrypt_tail_load_done:
        transpose_4x4(v0, v1, v2, v3)

        SM4_CRYPT_BLK4(v0, v1, v2, v3)

        cmp             w3, #2
        st1             {v0.16b}, [x1], #16
        blt             .Lcrypt_end
        st1             {v1.16b}, [x1], #16
        beq             .Lcrypt_end
        st1             {v2.16b}, [x1], #16

.Lcrypt_end:
        ret
SYM_FUNC_END(sm4_neon_crypt)

.align 3
SYM_FUNC_START(sm4_neon_cbc_dec)
        /* input:
         *   x0: round key array, CTX
         *   x1: dst
         *   x2: src
         *   x3: iv (big endian, 128 bit)
         *   w4: nblocks
         */
        SM4_PREPARE()

        ld1             {RIV.16b}, [x3]

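        /*
         * CBC decryption: decrypt up to eight ciphertext blocks, then
         * XOR each result with the preceding ciphertext block (RIV for
         * the first).  The last ciphertext block of each chunk becomes
         * the next RIV.
         */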
.Lcbc_dec_loop_8x:
        sub             w4, w4, #8
        tbnz            w4, #31, .Lcbc_dec_4x

        ld4             {v0.4s-v3.4s}, [x2], #64
        ld4             {v4.4s-v7.4s}, [x2]

        SM4_CRYPT_BLK8_norotate(v0, v1, v2, v3, v4, v5, v6, v7)

        /*
         * Rotate in two 4x4 steps: rotate_clockwise_4x4_2x would
         * clobber RTMP4-RTMP7 (v12-v15) and thus overwrite RIV.
         */
        rotate_clockwise_4x4(v0, v1, v2, v3)
        rotate_clockwise_4x4(v4, v5, v6, v7)

        sub             x2, x2, #64

        eor             v0.16b, v0.16b, RIV.16b

        ld1             {RTMP0.16b-RTMP3.16b}, [x2], #64
        ld1             {RTMP4.16b-RTMP7.16b}, [x2], #64

        eor             v1.16b, v1.16b, RTMP0.16b
        eor             v2.16b, v2.16b, RTMP1.16b
        eor             v3.16b, v3.16b, RTMP2.16b
        eor             v4.16b, v4.16b, RTMP3.16b
        eor             v5.16b, v5.16b, RTMP4.16b
        eor             v6.16b, v6.16b, RTMP5.16b
        eor             v7.16b, v7.16b, RTMP6.16b

        mov             RIV.16b, RTMP7.16b

        st1             {v0.16b-v3.16b}, [x1], #64
        st1             {v4.16b-v7.16b}, [x1], #64

        cbz             w4, .Lcbc_dec_end
        b               .Lcbc_dec_loop_8x

.Lcbc_dec_4x:
        add             w4, w4, #8
        cmp             w4, #4
        blt             .Lcbc_dec_tail

        sub             w4, w4, #4

        ld1             {v0.16b-v3.16b}, [x2], #64

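        /*
         * Keep the ciphertext in v0-v3 and decrypt byte-swapped copies
         * in v4-v7, so the previous ciphertext blocks are still
         * available for the CBC XOR below.
         */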
        rev32           v4.16b, v0.16b
        rev32           v5.16b, v1.16b
        rev32           v6.16b, v2.16b
        rev32           v7.16b, v3.16b

        transpose_4x4(v4, v5, v6, v7)

        SM4_CRYPT_BLK4_BE(v4, v5, v6, v7)

        eor             v4.16b, v4.16b, RIV.16b
        eor             v5.16b, v5.16b, v0.16b
        eor             v6.16b, v6.16b, v1.16b
        eor             v7.16b, v7.16b, v2.16b

        mov             RIV.16b, v3.16b

        st1             {v4.16b-v7.16b}, [x1], #64

        cbz             w4, .Lcbc_dec_end

.Lcbc_dec_tail:
        cmp             w4, #2
        ld1             {v0.16b}, [x2], #16
        blt             .Lcbc_dec_tail_load_done
        ld1             {v1.16b}, [x2], #16
        beq             .Lcbc_dec_tail_load_done
        ld1             {v2.16b}, [x2], #16

.Lcbc_dec_tail_load_done:
        rev32           v4.16b, v0.16b
        rev32           v5.16b, v1.16b
        rev32           v6.16b, v2.16b

        transpose_4x4(v4, v5, v6, v7)

        SM4_CRYPT_BLK4_BE(v4, v5, v6, v7)

        cmp             w4, #2
        eor             v4.16b, v4.16b, RIV.16b
        mov             RIV.16b, v0.16b
        st1             {v4.16b}, [x1], #16
        blt             .Lcbc_dec_end

        eor             v5.16b, v5.16b, v0.16b
        mov             RIV.16b, v1.16b
        st1             {v5.16b}, [x1], #16
        beq             .Lcbc_dec_end

        eor             v6.16b, v6.16b, v1.16b
        mov             RIV.16b, v2.16b
        st1             {v6.16b}, [x1], #16

.Lcbc_dec_end:
        /* store new IV */
        st1             {RIV.16b}, [x3]

        ret
SYM_FUNC_END(sm4_neon_cbc_dec)

.align 3
SYM_FUNC_START(sm4_neon_cfb_dec)
        /* input:
         *   x0: round key array, CTX
         *   x1: dst
         *   x2: src
         *   x3: iv (big endian, 128 bit)
         *   w4: nblocks
         */
        SM4_PREPARE()

        ld1             {v0.16b}, [x3]

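        /*
         * CFB decryption: the keystream for block n is the encryption
         * of ciphertext block n-1 (the IV for block 0).  v0 carries the
         * IV, and later the last ciphertext block, between iterations.
         */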
.Lcfb_dec_loop_8x:
        sub             w4, w4, #8
        tbnz            w4, #31, .Lcfb_dec_4x

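        /*
         * The first four-block group is v0 (IV or previous ciphertext)
         * plus ciphertext blocks 0-2; the second group is blocks 3-6,
         * loaded word-sliced with ld4.  x2 is rewound below to reload
         * all eight ciphertext blocks for the XOR.
         */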
        ld1             {v1.16b-v3.16b}, [x2], #48
        ld4             {v4.4s-v7.4s}, [x2]

        transpose_4x4(v0, v1, v2, v3)

        SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

        sub             x2, x2, #48
        ld1             {RTMP0.16b-RTMP3.16b}, [x2], #64
        ld1             {RTMP4.16b-RTMP7.16b}, [x2], #64

        eor             v0.16b, v0.16b, RTMP0.16b
        eor             v1.16b, v1.16b, RTMP1.16b
        eor             v2.16b, v2.16b, RTMP2.16b
        eor             v3.16b, v3.16b, RTMP3.16b
        eor             v4.16b, v4.16b, RTMP4.16b
        eor             v5.16b, v5.16b, RTMP5.16b
        eor             v6.16b, v6.16b, RTMP6.16b
        eor             v7.16b, v7.16b, RTMP7.16b

        st1             {v0.16b-v3.16b}, [x1], #64
        st1             {v4.16b-v7.16b}, [x1], #64

        mov             v0.16b, RTMP7.16b

        cbz             w4, .Lcfb_dec_end
        b               .Lcfb_dec_loop_8x

.Lcfb_dec_4x:
        add             w4, w4, #8
        cmp             w4, #4
        blt             .Lcfb_dec_tail

        sub             w4, w4, #4

        ld1             {v4.16b-v7.16b}, [x2], #64

        rev32           v0.16b, v0.16b          /* v0 is IV register */
        rev32           v1.16b, v4.16b
        rev32           v2.16b, v5.16b
        rev32           v3.16b, v6.16b

        transpose_4x4(v0, v1, v2, v3)

        SM4_CRYPT_BLK4_BE(v0, v1, v2, v3)

        eor             v0.16b, v0.16b, v4.16b
        eor             v1.16b, v1.16b, v5.16b
        eor             v2.16b, v2.16b, v6.16b
        eor             v3.16b, v3.16b, v7.16b

        st1             {v0.16b-v3.16b}, [x1], #64

        mov             v0.16b, v7.16b

        cbz             w4, .Lcfb_dec_end

.Lcfb_dec_tail:
        cmp             w4, #2
        ld1             {v4.16b}, [x2], #16
        blt             .Lcfb_dec_tail_load_done
        ld1             {v5.16b}, [x2], #16
        beq             .Lcfb_dec_tail_load_done
        ld1             {v6.16b}, [x2], #16

.Lcfb_dec_tail_load_done:
        rev32           v0.16b, v0.16b          /* v0 is IV register */
        rev32           v1.16b, v4.16b
        rev32           v2.16b, v5.16b

        transpose_4x4(v0, v1, v2, v3)

        SM4_CRYPT_BLK4_BE(v0, v1, v2, v3)

        cmp             w4, #2
        eor             v0.16b, v0.16b, v4.16b
        st1             {v0.16b}, [x1], #16
        mov             v0.16b, v4.16b
        blt             .Lcfb_dec_end

        eor             v1.16b, v1.16b, v5.16b
        st1             {v1.16b}, [x1], #16
        mov             v0.16b, v5.16b
        beq             .Lcfb_dec_end

        eor             v2.16b, v2.16b, v6.16b
        st1             {v2.16b}, [x1], #16
        mov             v0.16b, v6.16b

.Lcfb_dec_end:
        /* store new IV */
        st1             {v0.16b}, [x3]

        ret
SYM_FUNC_END(sm4_neon_cfb_dec)

.align 3
SYM_FUNC_START(sm4_neon_ctr_crypt)
        /* input:
         *   x0: round key array, CTX
         *   x1: dst
         *   x2: src
         *   x3: ctr (big endian, 128 bit)
         *   w4: nblocks
         */
        SM4_PREPARE()

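        /*
         * Keep the 128-bit big-endian counter as a native-endian pair
         * in x7 (high 64 bits) and x8 (low 64 bits) so it can be
         * incremented with adds/adc.
         */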
        ldp             x7, x8, [x3]
        rev             x7, x7
        rev             x8, x8

.Lctr_crypt_loop_8x:
        sub             w4, w4, #8
        tbnz            w4, #31, .Lctr_crypt_4x

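/*
 * Write the current counter value into vctr as a big-endian 128-bit
 * block (rev64 byte-swaps each half back to big endian), then
 * increment the counter in x7:x8 with carry.  Note that the adds/adc
 * pair clobbers the condition flags.
 */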
#define inc_le128(vctr)                             \
                mov             vctr.d[1], x8;      \
                mov             vctr.d[0], x7;      \
                adds            x8, x8, #1;         \
                rev64           vctr.16b, vctr.16b; \
                adc             x7, x7, xzr;

        /* construct CTRs */
        inc_le128(v0)                   /* +0 */
        inc_le128(v1)                   /* +1 */
        inc_le128(v2)                   /* +2 */
        inc_le128(v3)                   /* +3 */
        inc_le128(v4)                   /* +4 */
        inc_le128(v5)                   /* +5 */
        inc_le128(v6)                   /* +6 */
        inc_le128(v7)                   /* +7 */

        transpose_4x4_2x(v0, v1, v2, v3, v4, v5, v6, v7)

        SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

        ld1             {RTMP0.16b-RTMP3.16b}, [x2], #64
        ld1             {RTMP4.16b-RTMP7.16b}, [x2], #64

        eor             v0.16b, v0.16b, RTMP0.16b
        eor             v1.16b, v1.16b, RTMP1.16b
        eor             v2.16b, v2.16b, RTMP2.16b
        eor             v3.16b, v3.16b, RTMP3.16b
        eor             v4.16b, v4.16b, RTMP4.16b
        eor             v5.16b, v5.16b, RTMP5.16b
        eor             v6.16b, v6.16b, RTMP6.16b
        eor             v7.16b, v7.16b, RTMP7.16b

        st1             {v0.16b-v3.16b}, [x1], #64
        st1             {v4.16b-v7.16b}, [x1], #64

        cbz             w4, .Lctr_crypt_end
        b               .Lctr_crypt_loop_8x

.Lctr_crypt_4x:
        add             w4, w4, #8
        cmp             w4, #4
        blt             .Lctr_crypt_tail

        sub             w4, w4, #4

        /* construct CTRs */
        inc_le128(v0)                   /* +0 */
        inc_le128(v1)                   /* +1 */
        inc_le128(v2)                   /* +2 */
        inc_le128(v3)                   /* +3 */

        ld1             {v4.16b-v7.16b}, [x2], #64

        transpose_4x4(v0, v1, v2, v3)

        SM4_CRYPT_BLK4(v0, v1, v2, v3)

        eor             v0.16b, v0.16b, v4.16b
        eor             v1.16b, v1.16b, v5.16b
        eor             v2.16b, v2.16b, v6.16b
        eor             v3.16b, v3.16b, v7.16b

        st1             {v0.16b-v3.16b}, [x1], #64

        cbz             w4, .Lctr_crypt_end

.Lctr_crypt_tail:
        /*
         * inc_le128 uses adds/adc and so clobbers the condition flags;
         * each cmp must therefore come after the corresponding
         * inc_le128.
         */
        ld1             {v4.16b}, [x2], #16
        inc_le128(v0)
        cmp             w4, #2
        blt             .Lctr_crypt_tail_load_done

        ld1             {v5.16b}, [x2], #16
        inc_le128(v1)
        cmp             w4, #2
        beq             .Lctr_crypt_tail_load_done

        ld1             {v6.16b}, [x2], #16
        inc_le128(v2)

.Lctr_crypt_tail_load_done:
        transpose_4x4(v0, v1, v2, v3)

        SM4_CRYPT_BLK4(v0, v1, v2, v3)

        cmp             w4, #2

        eor             v0.16b, v0.16b, v4.16b
        st1             {v0.16b}, [x1], #16
        blt             .Lctr_crypt_end

        eor             v1.16b, v1.16b, v5.16b
        st1             {v1.16b}, [x1], #16
        beq             .Lctr_crypt_end

        eor             v2.16b, v2.16b, v6.16b
        st1             {v2.16b}, [x1], #16

.Lctr_crypt_end:
        /* store new CTR */
        rev             x7, x7
        rev             x8, x8
        stp             x7, x8, [x3]

        ret
SYM_FUNC_END(sm4_neon_ctr_crypt)