/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Accelerated GHASH implementation with ARMv8 PMULL instructions.
 *
 * Copyright (C) 2014 - 2018 Linaro Ltd. <ard.biesheuvel@linaro.org>
 */

#include <linux/linkage.h>
#include <linux/cfi_types.h>
#include <asm/assembler.h>

        SHASH           .req    v0
        SHASH2          .req    v1
        T1              .req    v2
        T2              .req    v3
        MASK            .req    v4
        XM              .req    v5
        XL              .req    v6
        XH              .req    v7
        IN1             .req    v7

        k00_16          .req    v8
        k32_48          .req    v9

        t3              .req    v10
        t4              .req    v11
        t5              .req    v12
        t6              .req    v13
        t7              .req    v14
        t8              .req    v15
        t9              .req    v16

        perm1           .req    v17
        perm2           .req    v18
        perm3           .req    v19

        sh1             .req    v20
        sh2             .req    v21
        sh3             .req    v22
        sh4             .req    v23

        ss1             .req    v24
        ss2             .req    v25
        ss3             .req    v26
        ss4             .req    v27

        XL2             .req    v8
        XM2             .req    v9
        XH2             .req    v10
        XL3             .req    v11
        XM3             .req    v12
        XH3             .req    v13
        TT3             .req    v14
        TT4             .req    v15
        HH              .req    v16
        HH3             .req    v17
        HH4             .req    v18
        HH34            .req    v19

        .text
        .arch           armv8-a+crypto

        .macro          __pmull_p64, rd, rn, rm
        pmull           \rd\().1q, \rn\().1d, \rm\().1d
        .endm

        .macro          __pmull2_p64, rd, rn, rm
        pmull2          \rd\().1q, \rn\().2d, \rm\().2d
        .endm
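        /*
         * PMULL and PMULL2 perform a 64x64 -> 128 bit carry-less
         * (polynomial) multiplication on the low/high doublewords of their
         * operands. For reference, a minimal C sketch of the same operation
         * (illustrative only, not part of this file; the u128 struct and
         * clmul64() helper are made up for the example, using kernel u64
         * types):
         *
         *      struct u128 { u64 lo, hi; };
         *
         *      static struct u128 clmul64(u64 a, u64 b)
         *      {
         *              struct u128 r = {};
         *              int i;
         *
         *              for (i = 0; i < 64; i++)
         *                      if ((b >> i) & 1) {
         *                              r.lo ^= a << i;
         *                              if (i)
         *                                      r.hi ^= a >> (64 - i);
         *                      }
         *              return r;
         *      }
         */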

        .macro          __pmull_p8, rq, ad, bd
        ext             t3.8b, \ad\().8b, \ad\().8b, #1         // A1
        ext             t5.8b, \ad\().8b, \ad\().8b, #2         // A2
        ext             t7.8b, \ad\().8b, \ad\().8b, #3         // A3

        __pmull_p8_\bd  \rq, \ad
        .endm

        .macro          __pmull2_p8, rq, ad, bd
        tbl             t3.16b, {\ad\().16b}, perm1.16b         // A1
        tbl             t5.16b, {\ad\().16b}, perm2.16b         // A2
        tbl             t7.16b, {\ad\().16b}, perm3.16b         // A3

        __pmull2_p8_\bd \rq, \ad
        .endm

        .macro          __pmull_p8_SHASH, rq, ad
        __pmull_p8_tail \rq, \ad\().8b, SHASH.8b, 8b,, sh1, sh2, sh3, sh4
        .endm

        .macro          __pmull_p8_SHASH2, rq, ad
        __pmull_p8_tail \rq, \ad\().8b, SHASH2.8b, 8b,, ss1, ss2, ss3, ss4
        .endm

        .macro          __pmull2_p8_SHASH, rq, ad
        __pmull_p8_tail \rq, \ad\().16b, SHASH.16b, 16b, 2, sh1, sh2, sh3, sh4
        .endm

        .macro          __pmull_p8_tail, rq, ad, bd, nb, t, b1, b2, b3, b4
        pmull\t         t3.8h, t3.\nb, \bd                      // F = A1*B
        pmull\t         t4.8h, \ad, \b1\().\nb                  // E = A*B1
        pmull\t         t5.8h, t5.\nb, \bd                      // H = A2*B
        pmull\t         t6.8h, \ad, \b2\().\nb                  // G = A*B2
        pmull\t         t7.8h, t7.\nb, \bd                      // J = A3*B
        pmull\t         t8.8h, \ad, \b3\().\nb                  // I = A*B3
        pmull\t         t9.8h, \ad, \b4\().\nb                  // K = A*B4
        pmull\t         \rq\().8h, \ad, \bd                     // D = A*B

        eor             t3.16b, t3.16b, t4.16b                  // L = E + F
        eor             t5.16b, t5.16b, t6.16b                  // M = G + H
        eor             t7.16b, t7.16b, t8.16b                  // N = I + J

        uzp1            t4.2d, t3.2d, t5.2d
        uzp2            t3.2d, t3.2d, t5.2d
        uzp1            t6.2d, t7.2d, t9.2d
        uzp2            t7.2d, t7.2d, t9.2d

        // t3 = (L) (P0 + P1) << 8
        // t5 = (M) (P2 + P3) << 16
        eor             t4.16b, t4.16b, t3.16b
        and             t3.16b, t3.16b, k32_48.16b

        // t7 = (N) (P4 + P5) << 24
        // t9 = (K) (P6 + P7) << 32
        eor             t6.16b, t6.16b, t7.16b
        and             t7.16b, t7.16b, k00_16.16b

        eor             t4.16b, t4.16b, t3.16b
        eor             t6.16b, t6.16b, t7.16b

        zip2            t5.2d, t4.2d, t3.2d
        zip1            t3.2d, t4.2d, t3.2d
        zip2            t9.2d, t6.2d, t7.2d
        zip1            t7.2d, t6.2d, t7.2d

        ext             t3.16b, t3.16b, t3.16b, #15
        ext             t5.16b, t5.16b, t5.16b, #14
        ext             t7.16b, t7.16b, t7.16b, #13
        ext             t9.16b, t9.16b, t9.16b, #12

        eor             t3.16b, t3.16b, t5.16b
        eor             t7.16b, t7.16b, t9.16b
        eor             \rq\().16b, \rq\().16b, t3.16b
        eor             \rq\().16b, \rq\().16b, t7.16b
        .endm
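        /*
         * The p8 fallback above only has an 8x8 -> 16 bit polynomial
         * multiply to work with, so it assembles the 64x64 bit product from
         * byte-sized partial products: the rotated operands A1..A3 (and the
         * precomputed rotations of B), the k00_16/k32_48 masks and the final
         * ext/eor steps place each partial sum at its byte offset. A scalar
         * C sketch of the same idea (illustrative only, not the exact vector
         * algorithm used here; clmul8() and u128 are hypothetical helpers):
         *
         *      static u64 clmul8(u8 a, u8 b)
         *      {
         *              u64 r = 0;
         *              int i;
         *
         *              for (i = 0; i < 8; i++)
         *                      if ((b >> i) & 1)
         *                              r ^= (u64)a << i;
         *              return r;
         *      }
         *
         *      static struct u128 clmul64_by_bytes(u64 a, u64 b)
         *      {
         *              struct u128 r = {};
         *              int i, j, sh;
         *
         *              for (i = 0; i < 8; i++)
         *                      for (j = 0; j < 8; j++) {
         *                              u64 p = clmul8(a >> (8 * i), b >> (8 * j));
         *
         *                              sh = 8 * (i + j);
         *                              if (sh < 64) {
         *                                      r.lo ^= p << sh;
         *                                      if (sh > 48)
         *                                              r.hi ^= p >> (64 - sh);
         *                              } else {
         *                                      r.hi ^= p << (sh - 64);
         *                              }
         *                      }
         *              return r;
         *      }
         */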

        .macro          __pmull_pre_p64
        add             x8, x3, #16
        ld1             {HH.2d-HH4.2d}, [x8]

        trn1            SHASH2.2d, SHASH.2d, HH.2d
        trn2            T1.2d, SHASH.2d, HH.2d
        eor             SHASH2.16b, SHASH2.16b, T1.16b

        trn1            HH34.2d, HH3.2d, HH4.2d
        trn2            T1.2d, HH3.2d, HH4.2d
        eor             HH34.16b, HH34.16b, T1.16b

        movi            MASK.16b, #0xe1
        shl             MASK.2d, MASK.2d, #57
        .endm
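        /*
         * The "(a1 + a0)(b1 + b0)" comments throughout this file refer to
         * the Karatsuba decomposition used for every 128x128 bit multiply:
         * with a = a1*x^64 + a0 and b = b1*x^64 + b0 (additions are
         * carry-less, i.e. XOR),
         *
         *      a*b = a1*b1*x^128
         *          + ((a1 + a0)*(b1 + b0) + a1*b1 + a0*b0)*x^64
         *          + a0*b0
         *
         * so only three 64x64 bit multiplies are needed per block. SHASH2
         * holds the precomputed (b1 + b0) halves of H and H^2, and HH34
         * those of H^3 and H^4, for use as the second operand of the middle
         * product.
         */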

        .macro          __pmull_pre_p8
        ext             SHASH2.16b, SHASH.16b, SHASH.16b, #8
        eor             SHASH2.16b, SHASH2.16b, SHASH.16b

        // k00_16 := 0x0000000000000000_000000000000ffff
        // k32_48 := 0x00000000ffffffff_0000ffffffffffff
        movi            k32_48.2d, #0xffffffff
        mov             k32_48.h[2], k32_48.h[0]
        ushr            k00_16.2d, k32_48.2d, #32

        // prepare the permutation vectors
        mov_q           x5, 0x080f0e0d0c0b0a09
        movi            T1.8b, #8
        dup             perm1.2d, x5
        eor             perm1.16b, perm1.16b, T1.16b
        ushr            perm2.2d, perm1.2d, #8
        ushr            perm3.2d, perm1.2d, #16
        ushr            T1.2d, perm1.2d, #24
        sli             perm2.2d, perm1.2d, #56
        sli             perm3.2d, perm1.2d, #48
        sli             T1.2d, perm1.2d, #40

        // precompute loop invariants
        tbl             sh1.16b, {SHASH.16b}, perm1.16b
        tbl             sh2.16b, {SHASH.16b}, perm2.16b
        tbl             sh3.16b, {SHASH.16b}, perm3.16b
        tbl             sh4.16b, {SHASH.16b}, T1.16b
        ext             ss1.8b, SHASH2.8b, SHASH2.8b, #1
        ext             ss2.8b, SHASH2.8b, SHASH2.8b, #2
        ext             ss3.8b, SHASH2.8b, SHASH2.8b, #3
        ext             ss4.8b, SHASH2.8b, SHASH2.8b, #4
        .endm
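        /*
         * perm1, perm2, perm3 (and the value left in T1) are tbl index
         * vectors that reproduce, on full 16-byte registers, the byte
         * rotations that the .8b variants above obtain with ext: each
         * 64-bit half of the source is rotated by 1, 2, 3 and 4 byte
         * positions respectively, e.g.
         *
         *      perm1 = { 1, 2, 3, 4, 5, 6, 7, 0, 9, 10, 11, 12, 13, 14, 15, 8 }
         *
         * sh1..sh4 and ss1..ss4 are the correspondingly rotated copies of
         * SHASH and SHASH2; since the key is loop invariant they only need
         * to be computed once.
         */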

        //
        // PMULL (64x64->128) based reduction for CPUs that can do
        // it in a single instruction.
        //
        .macro          __pmull_reduce_p64
        pmull           T2.1q, XL.1d, MASK.1d
        eor             XM.16b, XM.16b, T1.16b

        mov             XH.d[0], XM.d[1]
        mov             XM.d[1], XL.d[0]

        eor             XL.16b, XM.16b, T2.16b
        ext             T2.16b, XL.16b, XL.16b, #8
        pmull           XL.1q, XL.1d, MASK.1d
        .endm

        //
        // Alternative reduction for CPUs that lack support for the
        // 64x64->128 PMULL instruction
        //
        .macro          __pmull_reduce_p8
        eor             XM.16b, XM.16b, T1.16b

        mov             XL.d[1], XM.d[0]
        mov             XH.d[0], XM.d[1]

        shl             T1.2d, XL.2d, #57
        shl             T2.2d, XL.2d, #62
        eor             T2.16b, T2.16b, T1.16b
        shl             T1.2d, XL.2d, #63
        eor             T2.16b, T2.16b, T1.16b
        ext             T1.16b, XL.16b, XH.16b, #8
        eor             T2.16b, T2.16b, T1.16b

        mov             XL.d[1], T2.d[0]
        mov             XH.d[0], T2.d[1]

        ushr            T2.2d, XL.2d, #1
        eor             XH.16b, XH.16b, XL.16b
        eor             XL.16b, XL.16b, T2.16b
        ushr            T2.2d, T2.2d, #6
        ushr            XL.2d, XL.2d, #1
        .endm
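        /*
         * Both reduction macros fold the 256-bit product (spread over
         * XL/XM/XH by the caller) back to 128 bits modulo the GHASH
         * polynomial
         *
         *      g(x) = x^128 + x^7 + x^2 + x + 1
         *
         * Since x^128 = x^7 + x^2 + x + 1 (mod g(x)), the high half can be
         * cancelled by folding in shifted copies of it. On the bit-reflected
         * representation used by this driver those foldings are, in effect,
         * what the shl/ushr pairs above implement, and what the p64 variant
         * collapses into two multiplications by MASK (0xe1 shifted left by
         * 57 bits).
         */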

        .macro          __pmull_ghash, pn
        ld1             {SHASH.2d}, [x3]
        ld1             {XL.2d}, [x1]

        __pmull_pre_\pn

        /* do the head block first, if supplied */
        cbz             x4, 0f
        ld1             {T1.2d}, [x4]
        mov             x4, xzr
        b               3f

0:      .ifc            \pn, p64
        tbnz            w0, #0, 2f              // skip until #blocks is a
        tbnz            w0, #1, 2f              // round multiple of 4

1:      ld1             {XM3.16b-TT4.16b}, [x2], #64

        sub             w0, w0, #4

        rev64           T1.16b, XM3.16b
        rev64           T2.16b, XH3.16b
        rev64           TT4.16b, TT4.16b
        rev64           TT3.16b, TT3.16b

        ext             IN1.16b, TT4.16b, TT4.16b, #8
        ext             XL3.16b, TT3.16b, TT3.16b, #8

        eor             TT4.16b, TT4.16b, IN1.16b
        pmull2          XH2.1q, SHASH.2d, IN1.2d        // a1 * b1
        pmull           XL2.1q, SHASH.1d, IN1.1d        // a0 * b0
        pmull           XM2.1q, SHASH2.1d, TT4.1d       // (a1 + a0)(b1 + b0)

        eor             TT3.16b, TT3.16b, XL3.16b
        pmull2          XH3.1q, HH.2d, XL3.2d           // a1 * b1
        pmull           XL3.1q, HH.1d, XL3.1d           // a0 * b0
        pmull2          XM3.1q, SHASH2.2d, TT3.2d       // (a1 + a0)(b1 + b0)

        ext             IN1.16b, T2.16b, T2.16b, #8
        eor             XL2.16b, XL2.16b, XL3.16b
        eor             XH2.16b, XH2.16b, XH3.16b
        eor             XM2.16b, XM2.16b, XM3.16b

        eor             T2.16b, T2.16b, IN1.16b
        pmull2          XH3.1q, HH3.2d, IN1.2d          // a1 * b1
        pmull           XL3.1q, HH3.1d, IN1.1d          // a0 * b0
        pmull           XM3.1q, HH34.1d, T2.1d          // (a1 + a0)(b1 + b0)

        eor             XL2.16b, XL2.16b, XL3.16b
        eor             XH2.16b, XH2.16b, XH3.16b
        eor             XM2.16b, XM2.16b, XM3.16b

        ext             IN1.16b, T1.16b, T1.16b, #8
        ext             TT3.16b, XL.16b, XL.16b, #8
        eor             XL.16b, XL.16b, IN1.16b
        eor             T1.16b, T1.16b, TT3.16b

        pmull2          XH.1q, HH4.2d, XL.2d            // a1 * b1
        eor             T1.16b, T1.16b, XL.16b
        pmull           XL.1q, HH4.1d, XL.1d            // a0 * b0
        pmull2          XM.1q, HH34.2d, T1.2d           // (a1 + a0)(b1 + b0)

        eor             XL.16b, XL.16b, XL2.16b
        eor             XH.16b, XH.16b, XH2.16b
        eor             XM.16b, XM.16b, XM2.16b

        eor             T2.16b, XL.16b, XH.16b
        ext             T1.16b, XL.16b, XH.16b, #8
        eor             XM.16b, XM.16b, T2.16b

        __pmull_reduce_p64

        eor             T2.16b, T2.16b, XH.16b
        eor             XL.16b, XL.16b, T2.16b

        cbz             w0, 5f
        b               1b
        .endif

2:      ld1             {T1.2d}, [x2], #16
        sub             w0, w0, #1

3:      /* multiply XL by SHASH in GF(2^128) */
CPU_LE( rev64           T1.16b, T1.16b  )

        ext             T2.16b, XL.16b, XL.16b, #8
        ext             IN1.16b, T1.16b, T1.16b, #8
        eor             T1.16b, T1.16b, T2.16b
        eor             XL.16b, XL.16b, IN1.16b

        __pmull2_\pn    XH, XL, SHASH                   // a1 * b1
        eor             T1.16b, T1.16b, XL.16b
        __pmull_\pn     XL, XL, SHASH                   // a0 * b0
        __pmull_\pn     XM, T1, SHASH2                  // (a1 + a0)(b1 + b0)

4:      eor             T2.16b, XL.16b, XH.16b
        ext             T1.16b, XL.16b, XH.16b, #8
        eor             XM.16b, XM.16b, T2.16b

        __pmull_reduce_\pn

        eor             T2.16b, T2.16b, XH.16b
        eor             XL.16b, XL.16b, T2.16b

        cbnz            w0, 0b

5:      st1             {XL.2d}, [x1]
        ret
        .endm

        /*
         * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
         *                         struct ghash_key const *k, const char *head)
         */
SYM_TYPED_FUNC_START(pmull_ghash_update_p64)
        __pmull_ghash   p64
SYM_FUNC_END(pmull_ghash_update_p64)

SYM_TYPED_FUNC_START(pmull_ghash_update_p8)
        __pmull_ghash   p8
SYM_FUNC_END(pmull_ghash_update_p8)
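        /*
         * Per 16-byte block, the update functions above compute
         * dg = (dg ^ block) * H in GF(2^128); the 4-way path aggregates four
         * blocks as (dg ^ b0)*H^4 + b1*H^3 + b2*H^2 + b3*H, which is why
         * powers of H up to H^4 are precomputed alongside the key. A
         * bit-at-a-time C reference of the underlying multiplication,
         * following the GCM specification's bit ordering (illustrative only;
         * the gf128 struct and ghash_mul() are made up for the example):
         *
         *      struct gf128 { u64 hi, lo; };   // hi holds bits 0..63 of the spec
         *
         *      static struct gf128 ghash_mul(struct gf128 x, struct gf128 y)
         *      {
         *              struct gf128 z = {};
         *              int i;
         *
         *              for (i = 0; i < 128; i++) {
         *                      u64 xi = i < 64 ? x.hi >> (63 - i)
         *                                      : x.lo >> (127 - i);
         *
         *                      if (xi & 1) {
         *                              z.hi ^= y.hi;
         *                              z.lo ^= y.lo;
         *                      }
         *                      // y = y*x mod x^128 + x^7 + x^2 + x + 1
         *                      if (y.lo & 1) {
         *                              y.lo = (y.lo >> 1) | (y.hi << 63);
         *                              y.hi = (y.hi >> 1) ^ 0xe100000000000000ULL;
         *                      } else {
         *                              y.lo = (y.lo >> 1) | (y.hi << 63);
         *                              y.hi >>= 1;
         *                      }
         *              }
         *              return z;
         *      }
         */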

        KS0             .req    v8
        KS1             .req    v9
        KS2             .req    v10
        KS3             .req    v11

        INP0            .req    v21
        INP1            .req    v22
        INP2            .req    v23
        INP3            .req    v24

        K0              .req    v25
        K1              .req    v26
        K2              .req    v27
        K3              .req    v28
        K4              .req    v12
        K5              .req    v13
        K6              .req    v4
        K7              .req    v5
        K8              .req    v14
        K9              .req    v15
        KK              .req    v29
        KL              .req    v30
        KM              .req    v31

        .macro          load_round_keys, rounds, rk, tmp
        add             \tmp, \rk, #64
        ld1             {K0.4s-K3.4s}, [\rk]
        ld1             {K4.4s-K5.4s}, [\tmp]
        add             \tmp, \rk, \rounds, lsl #4
        sub             \tmp, \tmp, #32
        ld1             {KK.4s-KM.4s}, [\tmp]
        .endm

        .macro          enc_round, state, key
        aese            \state\().16b, \key\().16b
        aesmc           \state\().16b, \state\().16b
        .endm

        .macro          enc_qround, s0, s1, s2, s3, key
        enc_round       \s0, \key
        enc_round       \s1, \key
        enc_round       \s2, \key
        enc_round       \s3, \key
        .endm

        .macro          enc_block, state, rounds, rk, tmp
        add             \tmp, \rk, #96
        ld1             {K6.4s-K7.4s}, [\tmp], #32
        .irp            key, K0, K1, K2, K3, K4, K5
        enc_round       \state, \key
        .endr

        tbnz            \rounds, #2, .Lnot128_\@
.Lout256_\@:
        enc_round       \state, K6
        enc_round       \state, K7

.Lout192_\@:
        enc_round       \state, KK
        aese            \state\().16b, KL.16b
        eor             \state\().16b, \state\().16b, KM.16b

        .subsection     1
.Lnot128_\@:
        ld1             {K8.4s-K9.4s}, [\tmp], #32
        enc_round       \state, K6
        enc_round       \state, K7
        ld1             {K6.4s-K7.4s}, [\tmp]
        enc_round       \state, K8
        enc_round       \state, K9
        tbz             \rounds, #1, .Lout192_\@
        b               .Lout256_\@
        .previous
        .endm
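        /*
         * Round key layout assumed by the macros above: \rk points at
         * (\rounds + 1) round keys of 16 bytes each, with \rounds being 10,
         * 12 or 14. K0-K5 (and, once needed, K6-K9) hold the leading keys,
         * while KK/KL/KM always hold the last three, so the tail of the
         * schedule is handled identically for all key sizes. In rough C
         * terms, enc_block amounts to (sketch only):
         *
         *      for (i = 0; i < rounds - 1; i++)
         *              state = aesmc(aese(state, key[i]));
         *      state = aese(state, key[rounds - 1]) ^ key[rounds];
         */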

        .align          6
        .macro          pmull_gcm_do_crypt, enc
        frame_push      1

        load_round_keys x7, x6, x8

        ld1             {SHASH.2d}, [x3], #16
        ld1             {HH.2d-HH4.2d}, [x3]

        trn1            SHASH2.2d, SHASH.2d, HH.2d
        trn2            T1.2d, SHASH.2d, HH.2d
        eor             SHASH2.16b, SHASH2.16b, T1.16b

        trn1            HH34.2d, HH3.2d, HH4.2d
        trn2            T1.2d, HH3.2d, HH4.2d
        eor             HH34.16b, HH34.16b, T1.16b

        ld1             {XL.2d}, [x4]

        cbz             x0, 3f                          // tag only?

        ldr             w8, [x5, #12]                   // load lower counter
CPU_LE( rev             w8, w8          )

0:      mov             w9, #4                          // max blocks per round
        add             x10, x0, #0xf
        lsr             x10, x10, #4                    // remaining blocks

        subs            x0, x0, #64
        csel            w9, w10, w9, mi
        add             w8, w8, w9

        bmi             1f
        ld1             {INP0.16b-INP3.16b}, [x2], #64
        .subsection     1
        /*
         * Populate the four input registers right to left with up to 63 bytes
         * of data, using overlapping loads to avoid branches.
         *
         *                INP0     INP1     INP2     INP3
         *  1 byte     |        |        |        |x       |
         * 16 bytes    |        |        |        |xxxxxxxx|
         * 17 bytes    |        |        |xxxxxxxx|x       |
         * 47 bytes    |        |xxxxxxxx|xxxxxxxx|xxxxxxx |
         * etc etc
         *
         * Note that this code may read up to 15 bytes before the start of
         * the input. It is up to the calling code to ensure this is safe if
         * this happens in the first iteration of the loop (i.e., when the
         * input size is < 16 bytes)
         */
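        /*
         * A rough scalar equivalent of the tail handling below, which loads
         * the 16 bytes ending at the end of the input and shifts the wanted
         * tail bytes to the start of the block (illustrative only; this
         * load_tail() helper is made up and ignores the in-place rewinding
         * of the src/dst pointers done by the code below):
         *
         *      static void load_tail(u8 block[16], const u8 *end, int tail)
         *      {
         *              memcpy(block, end - 16, 16);    // overlapping load
         *              memmove(block, block + 16 - tail, tail);
         *              memset(block + tail, 0, 16 - tail);
         *      }
         */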
1:      mov             x15, #16
        ands            x19, x0, #0xf
        csel            x19, x19, x15, ne
        adr_l           x17, .Lpermute_table + 16

        sub             x11, x15, x19
        add             x12, x17, x11
        sub             x17, x17, x11
        ld1             {T1.16b}, [x12]
        sub             x10, x1, x11
        sub             x11, x2, x11

        cmp             x0, #-16
        csel            x14, x15, xzr, gt
        cmp             x0, #-32
        csel            x15, x15, xzr, gt
        cmp             x0, #-48
        csel            x16, x19, xzr, gt
        csel            x1, x1, x10, gt
        csel            x2, x2, x11, gt

        ld1             {INP0.16b}, [x2], x14
        ld1             {INP1.16b}, [x2], x15
        ld1             {INP2.16b}, [x2], x16
        ld1             {INP3.16b}, [x2]
        tbl             INP3.16b, {INP3.16b}, T1.16b
        b               2f
        .previous

2:      .if             \enc == 0
        bl              pmull_gcm_ghash_4x
        .endif

        bl              pmull_gcm_enc_4x

        tbnz            x0, #63, 6f
        st1             {INP0.16b-INP3.16b}, [x1], #64
        .if             \enc == 1
        bl              pmull_gcm_ghash_4x
        .endif
        bne             0b

3:      ldr             x10, [sp, #.Lframe_local_offset]
        cbz             x10, 5f                         // output tag?

        ld1             {INP3.16b}, [x10]               // load lengths[]
        mov             w9, #1
        bl              pmull_gcm_ghash_4x

        mov             w11, #(0x1 << 24)               // BE '1U'
        ld1             {KS0.16b}, [x5]
        mov             KS0.s[3], w11

        enc_block       KS0, x7, x6, x12

        ext             XL.16b, XL.16b, XL.16b, #8
        rev64           XL.16b, XL.16b
        eor             XL.16b, XL.16b, KS0.16b

        .if             \enc == 1
        st1             {XL.16b}, [x10]                 // store tag
        .else
        ldp             x11, x12, [sp, #40]             // load tag pointer and authsize
        adr_l           x17, .Lpermute_table
        ld1             {KS0.16b}, [x11]                // load supplied tag
        add             x17, x17, x12
        ld1             {KS1.16b}, [x17]                // load permute vector

        cmeq            XL.16b, XL.16b, KS0.16b         // compare tags
        mvn             XL.16b, XL.16b                  // -1 for fail, 0 for pass
        tbl             XL.16b, {XL.16b}, KS1.16b       // keep authsize bytes only
        sminv           b0, XL.16b                      // signed minimum across XL
        smov            w0, v0.b[0]                     // return b0
        .endif
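        /*
         * The cmeq/mvn/tbl/sminv sequence in the decrypt path amounts to the
         * check below: 0 is returned if the first authsize bytes of the
         * calculated and the supplied tag match, a negative value otherwise
         * (illustrative C only; check_tag() is made up for the example):
         *
         *      static int check_tag(const u8 *calc, const u8 *given, int authsize)
         *      {
         *              u8 diff = 0;
         *              int i;
         *
         *              for (i = 0; i < authsize; i++)
         *                      diff |= calc[i] ^ given[i];
         *              return diff ? -1 : 0;
         *      }
         */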

4:      frame_pop
        ret

5:
CPU_LE( rev             w8, w8          )
        str             w8, [x5, #12]                   // store lower counter
        st1             {XL.2d}, [x4]
        b               4b

6:      ld1             {T1.16b-T2.16b}, [x17], #32     // permute vectors
        sub             x17, x17, x19, lsl #1

        cmp             w9, #1
        beq             7f
        .subsection     1
7:      ld1             {INP2.16b}, [x1]
        tbx             INP2.16b, {INP3.16b}, T1.16b
        mov             INP3.16b, INP2.16b
        b               8f
        .previous

        st1             {INP0.16b}, [x1], x14
        st1             {INP1.16b}, [x1], x15
        st1             {INP2.16b}, [x1], x16
        tbl             INP3.16b, {INP3.16b}, T1.16b
        tbx             INP3.16b, {INP2.16b}, T2.16b
8:      st1             {INP3.16b}, [x1]

        .if             \enc == 1
        ld1             {T1.16b}, [x17]
        tbl             INP3.16b, {INP3.16b}, T1.16b    // clear non-data bits
        bl              pmull_gcm_ghash_4x
        .endif
        b               3b
        .endm

        /*
         * void pmull_gcm_encrypt(int blocks, u8 dst[], const u8 src[],
         *                        struct ghash_key const *k, u64 dg[], u8 ctr[],
         *                        int rounds, u8 tag)
         */
SYM_FUNC_START(pmull_gcm_encrypt)
        pmull_gcm_do_crypt      1
SYM_FUNC_END(pmull_gcm_encrypt)

        /*
         * void pmull_gcm_decrypt(int blocks, u8 dst[], const u8 src[],
         *                        struct ghash_key const *k, u64 dg[], u8 ctr[],
         *                        int rounds, u8 tag)
         */
SYM_FUNC_START(pmull_gcm_decrypt)
        pmull_gcm_do_crypt      0
SYM_FUNC_END(pmull_gcm_decrypt)
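        /*
         * Taken together, these entry points implement the AES-GCM data
         * path: pmull_gcm_enc_4x produces the counter-mode keystream and
         * pmull_gcm_ghash_4x folds the ciphertext into the running GHASH
         * state, i.e. roughly (assuming the associated data has already
         * been hashed into dg by the caller):
         *
         *      C   = P ^ AES-CTR keystream
         *      dg  = GHASH_H(AAD, C, lengths)
         *      tag = E_K(initial counter block) ^ dg
         *
         * with H = E_K(0^128). The '3:' section of pmull_gcm_do_crypt above
         * hashes the lengths block, computes the tag and, on decrypt,
         * compares it against the supplied one.
         */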

SYM_FUNC_START_LOCAL(pmull_gcm_ghash_4x)
        movi            MASK.16b, #0xe1
        shl             MASK.2d, MASK.2d, #57

        rev64           T1.16b, INP0.16b
        rev64           T2.16b, INP1.16b
        rev64           TT3.16b, INP2.16b
        rev64           TT4.16b, INP3.16b

        ext             XL.16b, XL.16b, XL.16b, #8

        tbz             w9, #2, 0f                      // <4 blocks?
        .subsection     1
0:      movi            XH2.16b, #0
        movi            XM2.16b, #0
        movi            XL2.16b, #0

        tbz             w9, #0, 1f                      // 2 blocks?
        tbz             w9, #1, 2f                      // 1 block?

        eor             T2.16b, T2.16b, XL.16b
        ext             T1.16b, T2.16b, T2.16b, #8
        b               .Lgh3

1:      eor             TT3.16b, TT3.16b, XL.16b
        ext             T2.16b, TT3.16b, TT3.16b, #8
        b               .Lgh2

2:      eor             TT4.16b, TT4.16b, XL.16b
        ext             IN1.16b, TT4.16b, TT4.16b, #8
        b               .Lgh1
        .previous

        eor             T1.16b, T1.16b, XL.16b
        ext             IN1.16b, T1.16b, T1.16b, #8

        pmull2          XH2.1q, HH4.2d, IN1.2d          // a1 * b1
        eor             T1.16b, T1.16b, IN1.16b
        pmull           XL2.1q, HH4.1d, IN1.1d          // a0 * b0
        pmull2          XM2.1q, HH34.2d, T1.2d          // (a1 + a0)(b1 + b0)

        ext             T1.16b, T2.16b, T2.16b, #8
.Lgh3:  eor             T2.16b, T2.16b, T1.16b
        pmull2          XH.1q, HH3.2d, T1.2d            // a1 * b1
        pmull           XL.1q, HH3.1d, T1.1d            // a0 * b0
        pmull           XM.1q, HH34.1d, T2.1d           // (a1 + a0)(b1 + b0)

        eor             XH2.16b, XH2.16b, XH.16b
        eor             XL2.16b, XL2.16b, XL.16b
        eor             XM2.16b, XM2.16b, XM.16b

        ext             T2.16b, TT3.16b, TT3.16b, #8
.Lgh2:  eor             TT3.16b, TT3.16b, T2.16b
        pmull2          XH.1q, HH.2d, T2.2d             // a1 * b1
        pmull           XL.1q, HH.1d, T2.1d             // a0 * b0
        pmull2          XM.1q, SHASH2.2d, TT3.2d        // (a1 + a0)(b1 + b0)

        eor             XH2.16b, XH2.16b, XH.16b
        eor             XL2.16b, XL2.16b, XL.16b
        eor             XM2.16b, XM2.16b, XM.16b

        ext             IN1.16b, TT4.16b, TT4.16b, #8
.Lgh1:  eor             TT4.16b, TT4.16b, IN1.16b
        pmull           XL.1q, SHASH.1d, IN1.1d         // a0 * b0
        pmull2          XH.1q, SHASH.2d, IN1.2d         // a1 * b1
        pmull           XM.1q, SHASH2.1d, TT4.1d        // (a1 + a0)(b1 + b0)

        eor             XH.16b, XH.16b, XH2.16b
        eor             XL.16b, XL.16b, XL2.16b
        eor             XM.16b, XM.16b, XM2.16b

        eor             T2.16b, XL.16b, XH.16b
        ext             T1.16b, XL.16b, XH.16b, #8
        eor             XM.16b, XM.16b, T2.16b

        __pmull_reduce_p64

        eor             T2.16b, T2.16b, XH.16b
        eor             XL.16b, XL.16b, T2.16b

        ret
SYM_FUNC_END(pmull_gcm_ghash_4x)

SYM_FUNC_START_LOCAL(pmull_gcm_enc_4x)
        ld1             {KS0.16b}, [x5]                 // load upper counter
        sub             w10, w8, #4
        sub             w11, w8, #3
        sub             w12, w8, #2
        sub             w13, w8, #1
        rev             w10, w10
        rev             w11, w11
        rev             w12, w12
        rev             w13, w13
        mov             KS1.16b, KS0.16b
        mov             KS2.16b, KS0.16b
        mov             KS3.16b, KS0.16b
        ins             KS0.s[3], w10                   // set lower counter
        ins             KS1.s[3], w11
        ins             KS2.s[3], w12
        ins             KS3.s[3], w13

        add             x10, x6, #96                    // round key pointer
        ld1             {K6.4s-K7.4s}, [x10], #32
        .irp            key, K0, K1, K2, K3, K4, K5
        enc_qround      KS0, KS1, KS2, KS3, \key
        .endr

        tbnz            x7, #2, .Lnot128
        .subsection     1
.Lnot128:
        ld1             {K8.4s-K9.4s}, [x10], #32
        .irp            key, K6, K7
        enc_qround      KS0, KS1, KS2, KS3, \key
        .endr
        ld1             {K6.4s-K7.4s}, [x10]
        .irp            key, K8, K9
        enc_qround      KS0, KS1, KS2, KS3, \key
        .endr
        tbz             x7, #1, .Lout192
        b               .Lout256
        .previous

.Lout256:
        .irp            key, K6, K7
        enc_qround      KS0, KS1, KS2, KS3, \key
        .endr

.Lout192:
        enc_qround      KS0, KS1, KS2, KS3, KK

        aese            KS0.16b, KL.16b
        aese            KS1.16b, KL.16b
        aese            KS2.16b, KL.16b
        aese            KS3.16b, KL.16b

        eor             KS0.16b, KS0.16b, KM.16b
        eor             KS1.16b, KS1.16b, KM.16b
        eor             KS2.16b, KS2.16b, KM.16b
        eor             KS3.16b, KS3.16b, KM.16b

        eor             INP0.16b, INP0.16b, KS0.16b
        eor             INP1.16b, INP1.16b, KS1.16b
        eor             INP2.16b, INP2.16b, KS2.16b
        eor             INP3.16b, INP3.16b, KS3.16b

        ret
SYM_FUNC_END(pmull_gcm_enc_4x)
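        /*
         * The keystream blocks generated above are the AES encryptions of
         * four consecutive counter blocks: bytes 0-11 of [x5] hold the
         * IV/nonce part and bytes 12-15 a 32-bit big-endian block counter,
         * whose (already advanced) host-endian value arrives in w8. Roughly,
         * in C (sketch only; aes_encrypt_block() and xor_16() stand in for a
         * single block encryption and a 16-byte XOR):
         *
         *      for (i = 0; i < 4; i++) {
         *              memcpy(ks, ctr, 12);
         *              put_unaligned_be32(count - 4 + i, ks + 12);
         *              aes_encrypt_block(rk, rounds, ks);
         *              xor_16(dst + 16 * i, src + 16 * i, ks);
         *      }
         */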

        .section        ".rodata", "a"
        .align          6
.Lpermute_table:
        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte            0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
        .byte            0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte            0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
        .byte            0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
        .previous