/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * SM4-GCM AEAD Algorithm using ARMv8 Crypto Extensions
 * as specified in rfc8998
 * https://datatracker.ietf.org/doc/html/rfc8998
 *
 * Copyright (C) 2016 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
 */

#include <linux/linkage.h>
#include <linux/cfi_types.h>
#include <asm/assembler.h>
#include "sm4-ce-asm.h"

.arch   armv8-a+crypto

.irp b, 0, 1, 2, 3, 24, 25, 26, 27, 28, 29, 30, 31
        .set .Lv\b\().4s, \b
.endr

.macro sm4e, vd, vn
        .inst 0xcec08400 | (.L\vn << 5) | .L\vd
.endm
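
/*
 * SM4E performs four SM4 rounds on the 128-bit state in Vd using the
 * four round keys held in Vn.  The instruction is emitted by hand via
 * .inst (base opcode 0xcec08400, Vn in bits [9:5], Vd in bits [4:0]),
 * presumably so the file still assembles with toolchains that lack the
 * SM4 extension; the .irp/.set block above supplies the .Lv<n>.4s
 * register numbers that the macro looks up.
 */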

/* Register macros */

/* Used for both encryption and decryption */
#define RHASH   v21
#define RRCONST v22
#define RZERO   v23

/* Helper macros. */

/*
 * input: m0, m1
 * output: r0:r1 (low 128 bits in r0, high 128 bits in r1)
 */
#define PMUL_128x128(r0, r1, m0, m1, T0, T1)                    \
                ext             T0.16b, m1.16b, m1.16b, #8;     \
                pmull           r0.1q, m0.1d, m1.1d;            \
                pmull           T1.1q, m0.1d, T0.1d;            \
                pmull2          T0.1q, m0.2d, T0.2d;            \
                pmull2          r1.1q, m0.2d, m1.2d;            \
                eor             T0.16b, T0.16b, T1.16b;         \
                ext             T1.16b, RZERO.16b, T0.16b, #8;  \
                ext             T0.16b, T0.16b, RZERO.16b, #8;  \
                eor             r0.16b, r0.16b, T1.16b;         \
                eor             r1.16b, r1.16b, T0.16b;
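
/*
 * What the macro above computes (schoolbook carry-less multiply): with
 * m0 = a1:a0 and m1 = b1:b0 as 64-bit halves, the four pmull/pmull2
 * produce a0*b0, a0*b1, a1*b0 and a1*b1, and the 256-bit product is
 *
 *     r1:r0 = (a1*b1 << 128) ^ ((a0*b1 ^ a1*b0) << 64) ^ a0*b0
 *
 * The two ext against RZERO split the 128-bit middle term across the
 * low (r0) and high (r1) halves before it is xored in.
 */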

#define PMUL_128x128_4x(r0, r1, m0, m1, T0, T1,                 \
                        r2, r3, m2, m3, T2, T3,                 \
                        r4, r5, m4, m5, T4, T5,                 \
                        r6, r7, m6, m7, T6, T7)                 \
                ext             T0.16b, m1.16b, m1.16b, #8;     \
                ext             T2.16b, m3.16b, m3.16b, #8;     \
                ext             T4.16b, m5.16b, m5.16b, #8;     \
                ext             T6.16b, m7.16b, m7.16b, #8;     \
                pmull           r0.1q, m0.1d, m1.1d;            \
                pmull           r2.1q, m2.1d, m3.1d;            \
                pmull           r4.1q, m4.1d, m5.1d;            \
                pmull           r6.1q, m6.1d, m7.1d;            \
                pmull           T1.1q, m0.1d, T0.1d;            \
                pmull           T3.1q, m2.1d, T2.1d;            \
                pmull           T5.1q, m4.1d, T4.1d;            \
                pmull           T7.1q, m6.1d, T6.1d;            \
                pmull2          T0.1q, m0.2d, T0.2d;            \
                pmull2          T2.1q, m2.2d, T2.2d;            \
                pmull2          T4.1q, m4.2d, T4.2d;            \
                pmull2          T6.1q, m6.2d, T6.2d;            \
                pmull2          r1.1q, m0.2d, m1.2d;            \
                pmull2          r3.1q, m2.2d, m3.2d;            \
                pmull2          r5.1q, m4.2d, m5.2d;            \
                pmull2          r7.1q, m6.2d, m7.2d;            \
                eor             T0.16b, T0.16b, T1.16b;         \
                eor             T2.16b, T2.16b, T3.16b;         \
                eor             T4.16b, T4.16b, T5.16b;         \
                eor             T6.16b, T6.16b, T7.16b;         \
                ext             T1.16b, RZERO.16b, T0.16b, #8;  \
                ext             T3.16b, RZERO.16b, T2.16b, #8;  \
                ext             T5.16b, RZERO.16b, T4.16b, #8;  \
                ext             T7.16b, RZERO.16b, T6.16b, #8;  \
                ext             T0.16b, T0.16b, RZERO.16b, #8;  \
                ext             T2.16b, T2.16b, RZERO.16b, #8;  \
                ext             T4.16b, T4.16b, RZERO.16b, #8;  \
                ext             T6.16b, T6.16b, RZERO.16b, #8;  \
                eor             r0.16b, r0.16b, T1.16b;         \
                eor             r2.16b, r2.16b, T3.16b;         \
                eor             r4.16b, r4.16b, T5.16b;         \
                eor             r6.16b, r6.16b, T7.16b;         \
                eor             r1.16b, r1.16b, T0.16b;         \
                eor             r3.16b, r3.16b, T2.16b;         \
                eor             r5.16b, r5.16b, T4.16b;         \
                eor             r7.16b, r7.16b, T6.16b;

/*
 * input: r0:r1 (low 128 bits in r0, high 128 bits in r1)
 * output: a
 */
#define REDUCTION(a, r0, r1, rconst, T0, T1)                    \
                pmull2          T0.1q, r1.2d, rconst.2d;        \
                ext             T1.16b, T0.16b, RZERO.16b, #8;  \
                ext             T0.16b, RZERO.16b, T0.16b, #8;  \
                eor             r1.16b, r1.16b, T1.16b;         \
                eor             r0.16b, r0.16b, T0.16b;         \
                pmull           T0.1q, r1.1d, rconst.1d;        \
                eor             a.16b, r0.16b, T0.16b;
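
/*
 * The macro above reduces the 256-bit product r1:r0 back to 128 bits.
 * All GHASH operands in this file are bit-reflected (rbit), so the
 * reduction modulo the GHASH polynomial x^128 + x^7 + x^2 + x + 1 can
 * be done by folding the upper half into the lower half in two steps,
 * each step multiplying one 64-bit chunk of r1 by the constant 0x87
 * (the low coefficients x^7 + x^2 + x + 1) with a plain pmull.
 */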

#define SM4_CRYPT_PMUL_128x128_BLK(b0, r0, r1, m0, m1, T0, T1)  \
        rev32                   b0.16b, b0.16b;                 \
                ext             T0.16b, m1.16b, m1.16b, #8;     \
        sm4e                    b0.4s, v24.4s;                  \
                pmull           r0.1q, m0.1d, m1.1d;            \
        sm4e                    b0.4s, v25.4s;                  \
                pmull           T1.1q, m0.1d, T0.1d;            \
        sm4e                    b0.4s, v26.4s;                  \
                pmull2          T0.1q, m0.2d, T0.2d;            \
        sm4e                    b0.4s, v27.4s;                  \
                pmull2          r1.1q, m0.2d, m1.2d;            \
        sm4e                    b0.4s, v28.4s;                  \
                eor             T0.16b, T0.16b, T1.16b;         \
        sm4e                    b0.4s, v29.4s;                  \
                ext             T1.16b, RZERO.16b, T0.16b, #8;  \
        sm4e                    b0.4s, v30.4s;                  \
                ext             T0.16b, T0.16b, RZERO.16b, #8;  \
        sm4e                    b0.4s, v31.4s;                  \
                eor             r0.16b, r0.16b, T1.16b;         \
        rev64                   b0.4s, b0.4s;                   \
                eor             r1.16b, r1.16b, T0.16b;         \
        ext                     b0.16b, b0.16b, b0.16b, #8;     \
        rev32                   b0.16b, b0.16b;
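
/*
 * The macro above interleaves one SM4 block encryption with one 128x128
 * carry-less multiply so the two instruction streams can overlap.  Each
 * sm4e executes four SM4 rounds with the round keys that SM4_PREPARE()
 * loads into v24-v31 (8 x 4 = 32 rounds in total); the trailing
 * rev64/ext reverse the four state words (SM4 emits its state in
 * reverse word order) and the final rev32 restores byte order.
 */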

#define SM4_CRYPT_PMUL_128x128_BLK3(b0, b1, b2,                 \
                                    r0, r1, m0, m1, T0, T1,     \
                                    r2, r3, m2, m3, T2, T3,     \
                                    r4, r5, m4, m5, T4, T5)     \
        rev32                   b0.16b, b0.16b;                 \
        rev32                   b1.16b, b1.16b;                 \
        rev32                   b2.16b, b2.16b;                 \
                ext             T0.16b, m1.16b, m1.16b, #8;     \
                ext             T2.16b, m3.16b, m3.16b, #8;     \
                ext             T4.16b, m5.16b, m5.16b, #8;     \
        sm4e                    b0.4s, v24.4s;                  \
        sm4e                    b1.4s, v24.4s;                  \
        sm4e                    b2.4s, v24.4s;                  \
                pmull           r0.1q, m0.1d, m1.1d;            \
                pmull           r2.1q, m2.1d, m3.1d;            \
                pmull           r4.1q, m4.1d, m5.1d;            \
        sm4e                    b0.4s, v25.4s;                  \
        sm4e                    b1.4s, v25.4s;                  \
        sm4e                    b2.4s, v25.4s;                  \
                pmull           T1.1q, m0.1d, T0.1d;            \
                pmull           T3.1q, m2.1d, T2.1d;            \
                pmull           T5.1q, m4.1d, T4.1d;            \
        sm4e                    b0.4s, v26.4s;                  \
        sm4e                    b1.4s, v26.4s;                  \
        sm4e                    b2.4s, v26.4s;                  \
                pmull2          T0.1q, m0.2d, T0.2d;            \
                pmull2          T2.1q, m2.2d, T2.2d;            \
                pmull2          T4.1q, m4.2d, T4.2d;            \
        sm4e                    b0.4s, v27.4s;                  \
        sm4e                    b1.4s, v27.4s;                  \
        sm4e                    b2.4s, v27.4s;                  \
                pmull2          r1.1q, m0.2d, m1.2d;            \
                pmull2          r3.1q, m2.2d, m3.2d;            \
                pmull2          r5.1q, m4.2d, m5.2d;            \
        sm4e                    b0.4s, v28.4s;                  \
        sm4e                    b1.4s, v28.4s;                  \
        sm4e                    b2.4s, v28.4s;                  \
                eor             T0.16b, T0.16b, T1.16b;         \
                eor             T2.16b, T2.16b, T3.16b;         \
                eor             T4.16b, T4.16b, T5.16b;         \
        sm4e                    b0.4s, v29.4s;                  \
        sm4e                    b1.4s, v29.4s;                  \
        sm4e                    b2.4s, v29.4s;                  \
                ext             T1.16b, RZERO.16b, T0.16b, #8;  \
                ext             T3.16b, RZERO.16b, T2.16b, #8;  \
                ext             T5.16b, RZERO.16b, T4.16b, #8;  \
        sm4e                    b0.4s, v30.4s;                  \
        sm4e                    b1.4s, v30.4s;                  \
        sm4e                    b2.4s, v30.4s;                  \
                ext             T0.16b, T0.16b, RZERO.16b, #8;  \
                ext             T2.16b, T2.16b, RZERO.16b, #8;  \
                ext             T4.16b, T4.16b, RZERO.16b, #8;  \
        sm4e                    b0.4s, v31.4s;                  \
        sm4e                    b1.4s, v31.4s;                  \
        sm4e                    b2.4s, v31.4s;                  \
                eor             r0.16b, r0.16b, T1.16b;         \
                eor             r2.16b, r2.16b, T3.16b;         \
                eor             r4.16b, r4.16b, T5.16b;         \
        rev64                   b0.4s, b0.4s;                   \
        rev64                   b1.4s, b1.4s;                   \
        rev64                   b2.4s, b2.4s;                   \
                eor             r1.16b, r1.16b, T0.16b;         \
                eor             r3.16b, r3.16b, T2.16b;         \
                eor             r5.16b, r5.16b, T4.16b;         \
        ext                     b0.16b, b0.16b, b0.16b, #8;     \
        ext                     b1.16b, b1.16b, b1.16b, #8;     \
        ext                     b2.16b, b2.16b, b2.16b, #8;     \
                eor             r0.16b, r0.16b, r2.16b;         \
                eor             r1.16b, r1.16b, r3.16b;         \
        rev32                   b0.16b, b0.16b;                 \
        rev32                   b1.16b, b1.16b;                 \
        rev32                   b2.16b, b2.16b;                 \
                eor             r0.16b, r0.16b, r4.16b;         \
                eor             r1.16b, r1.16b, r5.16b;
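
/*
 * Three-block variant of the macro above: three SM4 encryptions are
 * interleaved with three carry-less multiplies, and the trailing eors
 * already fold r2:r3 and r4:r5 into r0:r1, so a single REDUCTION per
 * three blocks is enough.
 */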

#define inc32_le128(vctr)                                       \
                mov             vctr.d[1], x9;                  \
                add             w6, w9, #1;                     \
                mov             vctr.d[0], x8;                  \
                bfi             x9, x6, #0, #32;                \
                rev64           vctr.16b, vctr.16b;
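
/*
 * inc32_le128 builds the big-endian counter block for the current
 * iteration from x8:x9 (the counter kept as two host-endian 64-bit
 * halves) and then post-increments only the low 32 bits in x9, which is
 * GCM's inc32() behaviour; the rev64 restores big-endian byte order
 * within each 64-bit lane before the block is encrypted.
 */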

#define GTAG_HASH_LENGTHS(vctr0, vlen)                                  \
                ld1             {vlen.16b}, [x7];                       \
                /* construct CTR0 */                                    \
                /* the low 32 bits of the initial counter block are always be32(1) */ \
                mov             x6, #0x1;                               \
                bfi             x9, x6, #0, #32;                        \
                mov             vctr0.d[0], x8;                         \
                mov             vctr0.d[1], x9;                         \
                rbit            vlen.16b, vlen.16b;                     \
                rev64           vctr0.16b, vctr0.16b;                   \
                /* authtag = GCTR(CTR0, GHASH) */                       \
                eor             RHASH.16b, RHASH.16b, vlen.16b;         \
                SM4_CRYPT_PMUL_128x128_BLK(vctr0, RR0, RR1, RHASH, RH1, \
                                           RTMP0, RTMP1);               \
                REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3);      \
                rbit            RHASH.16b, RHASH.16b;                   \
                eor             RHASH.16b, RHASH.16b, vctr0.16b;
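
/*
 * GTAG_HASH_LENGTHS finalizes GCM: the 128-bit lengths block at [x7]
 * is folded into GHASH, the pre-counter block J0 is rebuilt in vctr0
 * (for the 96-bit IVs used here, J0 = IV || be32(1)) and encrypted in
 * parallel with the last GHASH multiply, and the authentication tag
 * T = E(K, J0) ^ GHASH is left in RHASH for the caller to store.
 */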


/* Register macros for encrypt and ghash */

/* can be the same as input v0-v3 */
#define RR1     v0
#define RR3     v1
#define RR5     v2
#define RR7     v3

#define RR0     v4
#define RR2     v5
#define RR4     v6
#define RR6     v7

#define RTMP0   v8
#define RTMP1   v9
#define RTMP2   v10
#define RTMP3   v11
#define RTMP4   v12
#define RTMP5   v13
#define RTMP6   v14
#define RTMP7   v15

#define RH1     v16
#define RH2     v17
#define RH3     v18
#define RH4     v19

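/*
 * sm4_ce_pmull_ghash_setup derives the hash key H = E(K, 0^128) and
 * precomputes the bit-reflected powers H^1..H^4.  With those powers the
 * bulk loops below can fold several blocks per reduction using the
 * aggregated form
 *
 *     Y' = (Y ^ C1)*H^4 ^ C2*H^3 ^ C3*H^2 ^ C4*H
 *
 * which is what the "(in0 ^ HASH) * H^4 ..." comments refer to.
 */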
.align 3
SYM_FUNC_START(sm4_ce_pmull_ghash_setup)
        /* input:
         *   x0: round key array, CTX
         *   x1: ghash table
         */
        SM4_PREPARE(x0)

        adr_l           x2, .Lghash_rconst
        ld1r            {RRCONST.2d}, [x2]

        eor             RZERO.16b, RZERO.16b, RZERO.16b

        /* H = E(K, 0^128) */
        rev32           v0.16b, RZERO.16b
        SM4_CRYPT_BLK_BE(v0)

        /* H ^ 1 */
        rbit            RH1.16b, v0.16b

        /* H ^ 2 */
        PMUL_128x128(RR0, RR1, RH1, RH1, RTMP0, RTMP1)
        REDUCTION(RH2, RR0, RR1, RRCONST, RTMP2, RTMP3)

        /* H ^ 3 */
        PMUL_128x128(RR0, RR1, RH2, RH1, RTMP0, RTMP1)
        REDUCTION(RH3, RR0, RR1, RRCONST, RTMP2, RTMP3)

        /* H ^ 4 */
        PMUL_128x128(RR0, RR1, RH2, RH2, RTMP0, RTMP1)
        REDUCTION(RH4, RR0, RR1, RRCONST, RTMP2, RTMP3)

        st1             {RH1.16b-RH4.16b}, [x1]

        ret
SYM_FUNC_END(sm4_ce_pmull_ghash_setup)

.align 3
SYM_FUNC_START(pmull_ghash_update)
        /* input:
         *   x0: ghash table
         *   x1: ghash result
         *   x2: src
         *   w3: nblocks
         */
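        /*
         * Plain GHASH update: Y_i = (Y_{i-1} ^ X_i) * H in GF(2^128).
         * The hash state and the data blocks are bit-reflected with rbit
         * on entry (and reflected back before the final store) so that
         * the pmull-based multiply and the 0x87 reduction constant can
         * be used directly.
         */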
        ld1             {RH1.16b-RH4.16b}, [x0]

        ld1             {RHASH.16b}, [x1]
        rbit            RHASH.16b, RHASH.16b

        adr_l           x4, .Lghash_rconst
        ld1r            {RRCONST.2d}, [x4]

        eor             RZERO.16b, RZERO.16b, RZERO.16b

.Lghash_loop_4x:
        cmp             w3, #4
        blt             .Lghash_loop_1x

        sub             w3, w3, #4

        ld1             {v0.16b-v3.16b}, [x2], #64

        rbit            v0.16b, v0.16b
        rbit            v1.16b, v1.16b
        rbit            v2.16b, v2.16b
        rbit            v3.16b, v3.16b

        /*
         * (in0 ^ HASH) * H^4 => rr0:rr1
         * (in1)        * H^3 => rr2:rr3
         * (in2)        * H^2 => rr4:rr5
         * (in3)        * H^1 => rr6:rr7
         */
        eor             RHASH.16b, RHASH.16b, v0.16b

        PMUL_128x128_4x(RR0, RR1, RHASH, RH4, RTMP0, RTMP1,
                        RR2, RR3, v1, RH3, RTMP2, RTMP3,
                        RR4, RR5, v2, RH2, RTMP4, RTMP5,
                        RR6, RR7, v3, RH1, RTMP6, RTMP7)

        eor             RR0.16b, RR0.16b, RR2.16b
        eor             RR1.16b, RR1.16b, RR3.16b
        eor             RR0.16b, RR0.16b, RR4.16b
        eor             RR1.16b, RR1.16b, RR5.16b
        eor             RR0.16b, RR0.16b, RR6.16b
        eor             RR1.16b, RR1.16b, RR7.16b

        REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP0, RTMP1)

        cbz             w3, .Lghash_end
        b               .Lghash_loop_4x

.Lghash_loop_1x:
        sub             w3, w3, #1

        ld1             {v0.16b}, [x2], #16
        rbit            v0.16b, v0.16b
        eor             RHASH.16b, RHASH.16b, v0.16b

        PMUL_128x128(RR0, RR1, RHASH, RH1, RTMP0, RTMP1)
        REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)

        cbnz            w3, .Lghash_loop_1x

.Lghash_end:
        rbit            RHASH.16b, RHASH.16b
        st1             {RHASH.2d}, [x1]

        ret
SYM_FUNC_END(pmull_ghash_update)

.align 3
SYM_TYPED_FUNC_START(sm4_ce_pmull_gcm_enc)
        /* input:
         *   x0: round key array, CTX
         *   x1: dst
         *   x2: src
         *   x3: ctr (big endian, 128 bit)
         *   w4: nbytes
         *   x5: ghash result
         *   x6: ghash table
         *   x7: lengths (only for last block)
         */
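        /*
         * The counter is carried in x8:x9 as host-endian halves and the
         * bit-reflected GHASH state lives at [x5]; both are written back
         * on return, so the function can be called repeatedly over a
         * long message, with x7 left NULL until the final call, which
         * hashes the lengths block and emits the tag instead.
         */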
        SM4_PREPARE(x0)

        ldp             x8, x9, [x3]
        rev             x8, x8
        rev             x9, x9

        ld1             {RH1.16b-RH4.16b}, [x6]

        ld1             {RHASH.16b}, [x5]
        rbit            RHASH.16b, RHASH.16b

        adr_l           x6, .Lghash_rconst
        ld1r            {RRCONST.2d}, [x6]

        eor             RZERO.16b, RZERO.16b, RZERO.16b

        cbz             w4, .Lgcm_enc_hash_len

.Lgcm_enc_loop_4x:
        cmp             w4, #(4 * 16)
        blt             .Lgcm_enc_loop_1x

        sub             w4, w4, #(4 * 16)

        /* construct CTRs */
        inc32_le128(v0)                 /* +0 */
        inc32_le128(v1)                 /* +1 */
        inc32_le128(v2)                 /* +2 */
        inc32_le128(v3)                 /* +3 */

        ld1             {RTMP0.16b-RTMP3.16b}, [x2], #64

        SM4_CRYPT_BLK4(v0, v1, v2, v3)

        eor             v0.16b, v0.16b, RTMP0.16b
        eor             v1.16b, v1.16b, RTMP1.16b
        eor             v2.16b, v2.16b, RTMP2.16b
        eor             v3.16b, v3.16b, RTMP3.16b
        st1             {v0.16b-v3.16b}, [x1], #64

        /* ghash update */

        rbit            v0.16b, v0.16b
        rbit            v1.16b, v1.16b
        rbit            v2.16b, v2.16b
        rbit            v3.16b, v3.16b

        /*
         * (in0 ^ HASH) * H^4 => rr0:rr1
         * (in1)        * H^3 => rr2:rr3
         * (in2)        * H^2 => rr4:rr5
         * (in3)        * H^1 => rr6:rr7
         */
        eor             RHASH.16b, RHASH.16b, v0.16b

        PMUL_128x128_4x(RR0, RR1, RHASH, RH4, RTMP0, RTMP1,
                        RR2, RR3, v1, RH3, RTMP2, RTMP3,
                        RR4, RR5, v2, RH2, RTMP4, RTMP5,
                        RR6, RR7, v3, RH1, RTMP6, RTMP7)

        eor             RR0.16b, RR0.16b, RR2.16b
        eor             RR1.16b, RR1.16b, RR3.16b
        eor             RR0.16b, RR0.16b, RR4.16b
        eor             RR1.16b, RR1.16b, RR5.16b
        eor             RR0.16b, RR0.16b, RR6.16b
        eor             RR1.16b, RR1.16b, RR7.16b

        REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP0, RTMP1)

        cbz             w4, .Lgcm_enc_hash_len
        b               .Lgcm_enc_loop_4x

.Lgcm_enc_loop_1x:
        cmp             w4, #16
        blt             .Lgcm_enc_tail

        sub             w4, w4, #16

        /* construct CTRs */
        inc32_le128(v0)

        ld1             {RTMP0.16b}, [x2], #16

        SM4_CRYPT_BLK(v0)

        eor             v0.16b, v0.16b, RTMP0.16b
        st1             {v0.16b}, [x1], #16

        /* ghash update */
        rbit            v0.16b, v0.16b
        eor             RHASH.16b, RHASH.16b, v0.16b
        PMUL_128x128(RR0, RR1, RHASH, RH1, RTMP0, RTMP1)
        REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)

        cbz             w4, .Lgcm_enc_hash_len
        b               .Lgcm_enc_loop_1x

.Lgcm_enc_tail:
        /* construct CTRs */
        inc32_le128(v0)
        SM4_CRYPT_BLK(v0)

        /* load permute table */
        adr_l           x0, .Lcts_permute_table
        add             x0, x0, #32
        sub             x0, x0, w4, uxtw
        ld1             {v3.16b}, [x0]
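        /*
         * With n = remaining bytes, x0 points at .Lcts_permute_table + 32 - n,
         * so v3 holds the indices 16-n..15 followed by 0xff padding.  The
         * byte loop below accumulates the n ciphertext bytes in the top of
         * v0; the tbl afterwards moves them down to the low lanes while the
         * out-of-range 0xff indices produce zero bytes, yielding the
         * zero-padded partial block that is folded into GHASH.
         */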

.Lgcm_enc_tail_loop:
        /* do encrypt */
        ldrb            w0, [x2], #1    /* get 1 byte from input */
        umov            w6, v0.b[0]     /* get next keystream byte */
        eor             w6, w6, w0      /* w6 = keystream ^ input */
        strb            w6, [x1], #1    /* store output byte */

        /* shift out the consumed keystream byte */
        ext             v0.16b, v0.16b, v0.16b, #1
        /* collect the ciphertext bytes in the high end of v0 */
        ins             v0.b[15], w6

        subs            w4, w4, #1
        bne             .Lgcm_enc_tail_loop

        /* pad the last block with zeros */
        tbl             v0.16b, {v0.16b}, v3.16b

        /* ghash update */
        rbit            v0.16b, v0.16b
        eor             RHASH.16b, RHASH.16b, v0.16b
        PMUL_128x128(RR0, RR1, RHASH, RH1, RTMP0, RTMP1)
        REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)

.Lgcm_enc_hash_len:
        cbz             x7, .Lgcm_enc_end

        GTAG_HASH_LENGTHS(v1, v3)

        b               .Lgcm_enc_ret

.Lgcm_enc_end:
        /* store new CTR */
        rev             x8, x8
        rev             x9, x9
        stp             x8, x9, [x3]

        rbit            RHASH.16b, RHASH.16b

.Lgcm_enc_ret:
        /* store new MAC */
        st1             {RHASH.2d}, [x5]

        ret
SYM_FUNC_END(sm4_ce_pmull_gcm_enc)

#undef  RR1
#undef  RR3
#undef  RR5
#undef  RR7
#undef  RR0
#undef  RR2
#undef  RR4
#undef  RR6
#undef RTMP0
#undef RTMP1
#undef RTMP2
#undef RTMP3
#undef RTMP4
#undef RTMP5
#undef RTMP6
#undef RTMP7
#undef  RH1
#undef  RH2
#undef  RH3
#undef  RH4


/* Register macros for decrypt */

/* v0-v2 for building CTRs, v3-v5 for saving inputs */

#define RR1     v6
#define RR3     v7
#define RR5     v8

#define RR0     v9
#define RR2     v10
#define RR4     v11

#define RTMP0   v12
#define RTMP1   v13
#define RTMP2   v14
#define RTMP3   v15
#define RTMP4   v16
#define RTMP5   v17

#define RH1     v18
#define RH2     v19
#define RH3     v20

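/*
 * The decrypt path hashes the ciphertext before decrypting it, so each
 * bulk iteration keeps the three input blocks live in v3-v5 alongside
 * the counter blocks in v0-v2; presumably because of that register
 * pressure it aggregates three blocks per reduction (H^1..H^3) instead
 * of the four used on the encrypt side.
 */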
.align 3
SYM_TYPED_FUNC_START(sm4_ce_pmull_gcm_dec)
        /* input:
         *   x0: round key array, CTX
         *   x1: dst
         *   x2: src
         *   x3: ctr (big endian, 128 bit)
         *   w4: nbytes
         *   x5: ghash result
         *   x6: ghash table
         *   x7: lengths (only for last block)
         */
        SM4_PREPARE(x0)

        ldp             x8, x9, [x3]
        rev             x8, x8
        rev             x9, x9

        ld1             {RH1.16b-RH3.16b}, [x6]

        ld1             {RHASH.16b}, [x5]
        rbit            RHASH.16b, RHASH.16b

        adr_l           x6, .Lghash_rconst
        ld1r            {RRCONST.2d}, [x6]

        eor             RZERO.16b, RZERO.16b, RZERO.16b

        cbz             w4, .Lgcm_dec_hash_len

.Lgcm_dec_loop_3x:
        cmp             w4, #(3 * 16)
        blt             .Lgcm_dec_loop_1x

        sub             w4, w4, #(3 * 16)

        ld1             {v3.16b-v5.16b}, [x2], #(3 * 16)

        /* construct CTRs */
        inc32_le128(v0)                 /* +0 */
        rbit            v6.16b, v3.16b
        inc32_le128(v1)                 /* +1 */
        rbit            v7.16b, v4.16b
        inc32_le128(v2)                 /* +2 */
        rbit            v8.16b, v5.16b

        eor             RHASH.16b, RHASH.16b, v6.16b

        /* decrypt & ghash update */
        SM4_CRYPT_PMUL_128x128_BLK3(v0, v1, v2,
                                    RR0, RR1, RHASH, RH3, RTMP0, RTMP1,
                                    RR2, RR3, v7, RH2, RTMP2, RTMP3,
                                    RR4, RR5, v8, RH1, RTMP4, RTMP5)

        eor             v0.16b, v0.16b, v3.16b
        eor             v1.16b, v1.16b, v4.16b
        eor             v2.16b, v2.16b, v5.16b

        REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP0, RTMP1)

        st1             {v0.16b-v2.16b}, [x1], #(3 * 16)

        cbz             w4, .Lgcm_dec_hash_len
        b               .Lgcm_dec_loop_3x

.Lgcm_dec_loop_1x:
        cmp             w4, #16
        blt             .Lgcm_dec_tail

        sub             w4, w4, #16

        ld1             {v3.16b}, [x2], #16

        /* construct CTRs */
        inc32_le128(v0)
        rbit            v6.16b, v3.16b

        eor             RHASH.16b, RHASH.16b, v6.16b

        SM4_CRYPT_PMUL_128x128_BLK(v0, RR0, RR1, RHASH, RH1, RTMP0, RTMP1)

        eor             v0.16b, v0.16b, v3.16b

        REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)

        st1             {v0.16b}, [x1], #16

        cbz             w4, .Lgcm_dec_hash_len
        b               .Lgcm_dec_loop_1x

.Lgcm_dec_tail:
        /* construct CTRs */
        inc32_le128(v0)
        SM4_CRYPT_BLK(v0)

        /* load permute table */
        adr_l           x0, .Lcts_permute_table
        add             x0, x0, #32
        sub             x0, x0, w4, uxtw
        ld1             {v3.16b}, [x0]
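        /*
         * Same zero-padding trick as in the encryption tail, except that
         * the byte inserted at v0.b[15] below is the ciphertext input
         * byte (w0), since on the decrypt path GHASH is computed over
         * the ciphertext.
         */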

.Lgcm_dec_tail_loop:
        /* do decrypt */
        ldrb            w0, [x2], #1    /* get 1 byte from input */
        umov            w6, v0.b[0]     /* get next keystream byte */
        eor             w6, w6, w0      /* w6 = keystream ^ input */
        strb            w6, [x1], #1    /* store output byte */

        /* shift out the consumed keystream byte */
        ext             v0.16b, v0.16b, v0.16b, #1
        /* collect the ciphertext (input) bytes in the high end of v0 */
        ins             v0.b[15], w0

        subs            w4, w4, #1
        bne             .Lgcm_dec_tail_loop

        /* pad the last block with zeros */
        tbl             v0.16b, {v0.16b}, v3.16b

        /* ghash update */
        rbit            v0.16b, v0.16b
        eor             RHASH.16b, RHASH.16b, v0.16b
        PMUL_128x128(RR0, RR1, RHASH, RH1, RTMP0, RTMP1)
        REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)

.Lgcm_dec_hash_len:
        cbz             x7, .Lgcm_dec_end

        GTAG_HASH_LENGTHS(v1, v3)

        b               .Lgcm_dec_ret

.Lgcm_dec_end:
        /* store new CTR */
        rev             x8, x8
        rev             x9, x9
        stp             x8, x9, [x3]

        rbit            RHASH.16b, RHASH.16b

.Lgcm_dec_ret:
        /* store new MAC */
        st1             {RHASH.2d}, [x5]

        ret
SYM_FUNC_END(sm4_ce_pmull_gcm_dec)

        .section        ".rodata", "a"
        .align 4
.Lcts_permute_table:
        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte            0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
        .byte            0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff

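/*
 * GHASH reduction constant: 0x87 encodes the low coefficients
 * x^7 + x^2 + x + 1 of the field polynomial x^128 + x^7 + x^2 + x + 1,
 * used with the bit-reflected operands produced by rbit.
 */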
.Lghash_rconst:
        .quad           0x87