/*
 * Accelerated GHASH implementation with ARMv8 PMULL instructions.
 *
 * Copyright (C) 2014 - 2018 Linaro Ltd. <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

        SHASH           .req    v0
        SHASH2          .req    v1
        T1              .req    v2
        T2              .req    v3
        MASK            .req    v4
        XL              .req    v5
        XM              .req    v6
        XH              .req    v7
        IN1             .req    v7

        k00_16          .req    v8
        k32_48          .req    v9

        t3              .req    v10
        t4              .req    v11
        t5              .req    v12
        t6              .req    v13
        t7              .req    v14
        t8              .req    v15
        t9              .req    v16

        perm1           .req    v17
        perm2           .req    v18
        perm3           .req    v19

        sh1             .req    v20
        sh2             .req    v21
        sh3             .req    v22
        sh4             .req    v23

        ss1             .req    v24
        ss2             .req    v25
        ss3             .req    v26
        ss4             .req    v27

        XL2             .req    v8
        XM2             .req    v9
        XH2             .req    v10
        XL3             .req    v11
        XM3             .req    v12
        XH3             .req    v13
        TT3             .req    v14
        TT4             .req    v15
        HH              .req    v16
        HH3             .req    v17
        HH4             .req    v18
        HH34            .req    v19

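        // Register usage: SHASH holds the hash key H, and HH/HH3/HH4 the
        // higher powers H^2..H^4 (expected to follow H in the key structure)
        // used by the 4-way aggregated p64 code path. XL/XM/XH accumulate
        // the low/middle/high Karatsuba partial products before reduction.
        // The t3..t9, perm*, sh* and ss* registers are scratch and loop
        // invariants used only by the p8 fallback, so they may safely alias
        // the XL2/XM2/... registers used only by the p64 path.
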
        .text
        .arch           armv8-a+crypto

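        // With the Crypto Extensions, a single PMULL (resp. PMULL2)
        // instruction performs the full 64x64->128 bit carryless multiply
        // on the low (resp. high) 64-bit lanes of the operands.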
        .macro          __pmull_p64, rd, rn, rm
        pmull           \rd\().1q, \rn\().1d, \rm\().1d
        .endm

        .macro          __pmull2_p64, rd, rn, rm
        pmull2          \rd\().1q, \rn\().2d, \rm\().2d
        .endm

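        // Fallback for CPUs that only implement the baseline NEON 8-bit
        // polynomial multiply: the 64x64 bit carryless product is assembled
        // from 8x8->16 bit partial products of byte-rotated copies of the
        // first operand (A1..A3) and byte-rotated copies of the key
        // precomputed in __pmull_pre_p8.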
        .macro          __pmull_p8, rq, ad, bd
        ext             t3.8b, \ad\().8b, \ad\().8b, #1         // A1
        ext             t5.8b, \ad\().8b, \ad\().8b, #2         // A2
        ext             t7.8b, \ad\().8b, \ad\().8b, #3         // A3

        __pmull_p8_\bd  \rq, \ad
        .endm

        .macro          __pmull2_p8, rq, ad, bd
        tbl             t3.16b, {\ad\().16b}, perm1.16b         // A1
        tbl             t5.16b, {\ad\().16b}, perm2.16b         // A2
        tbl             t7.16b, {\ad\().16b}, perm3.16b         // A3

        __pmull2_p8_\bd \rq, \ad
        .endm

        .macro          __pmull_p8_SHASH, rq, ad
        __pmull_p8_tail \rq, \ad\().8b, SHASH.8b, 8b,, sh1, sh2, sh3, sh4
        .endm

        .macro          __pmull_p8_SHASH2, rq, ad
        __pmull_p8_tail \rq, \ad\().8b, SHASH2.8b, 8b,, ss1, ss2, ss3, ss4
        .endm

        .macro          __pmull2_p8_SHASH, rq, ad
        __pmull_p8_tail \rq, \ad\().16b, SHASH.16b, 16b, 2, sh1, sh2, sh3, sh4
        .endm

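        // Recombine the partial products computed above: pairs of products
        // are summed, masked (k00_16/k32_48) to clear bits that must not
        // wrap around in the ext based byte rotations, rotated to their
        // byte offsets (<< 8, << 16, << 24, << 32, as noted below) and
        // folded into the plain A*B product in \rq.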
        .macro          __pmull_p8_tail, rq, ad, bd, nb, t, b1, b2, b3, b4
        pmull\t         t3.8h, t3.\nb, \bd                      // F = A1*B
        pmull\t         t4.8h, \ad, \b1\().\nb                  // E = A*B1
        pmull\t         t5.8h, t5.\nb, \bd                      // H = A2*B
        pmull\t         t6.8h, \ad, \b2\().\nb                  // G = A*B2
        pmull\t         t7.8h, t7.\nb, \bd                      // J = A3*B
        pmull\t         t8.8h, \ad, \b3\().\nb                  // I = A*B3
        pmull\t         t9.8h, \ad, \b4\().\nb                  // K = A*B4
        pmull\t         \rq\().8h, \ad, \bd                     // D = A*B

        eor             t3.16b, t3.16b, t4.16b                  // L = E + F
        eor             t5.16b, t5.16b, t6.16b                  // M = G + H
        eor             t7.16b, t7.16b, t8.16b                  // N = I + J

        uzp1            t4.2d, t3.2d, t5.2d
        uzp2            t3.2d, t3.2d, t5.2d
        uzp1            t6.2d, t7.2d, t9.2d
        uzp2            t7.2d, t7.2d, t9.2d

        // t3 = (L) (P0 + P1) << 8
        // t5 = (M) (P2 + P3) << 16
        eor             t4.16b, t4.16b, t3.16b
        and             t3.16b, t3.16b, k32_48.16b

        // t7 = (N) (P4 + P5) << 24
        // t9 = (K) (P6 + P7) << 32
        eor             t6.16b, t6.16b, t7.16b
        and             t7.16b, t7.16b, k00_16.16b

        eor             t4.16b, t4.16b, t3.16b
        eor             t6.16b, t6.16b, t7.16b

        zip2            t5.2d, t4.2d, t3.2d
        zip1            t3.2d, t4.2d, t3.2d
        zip2            t9.2d, t6.2d, t7.2d
        zip1            t7.2d, t6.2d, t7.2d

        ext             t3.16b, t3.16b, t3.16b, #15
        ext             t5.16b, t5.16b, t5.16b, #14
        ext             t7.16b, t7.16b, t7.16b, #13
        ext             t9.16b, t9.16b, t9.16b, #12

        eor             t3.16b, t3.16b, t5.16b
        eor             t7.16b, t7.16b, t9.16b
        eor             \rq\().16b, \rq\().16b, t3.16b
        eor             \rq\().16b, \rq\().16b, t7.16b
        .endm

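        // Setup for the p64 code path: load H^2, H^3 and H^4 (expected to
        // follow H in the key structure pointed to by x3), fold each power
        // into lo^hi form (SHASH2 for H/H^2, HH34 for H^3/H^4) for the
        // Karatsuba middle products, and set MASK to the reduction constant
        // 0xc200000000000000 in each 64-bit lane.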
        .macro          __pmull_pre_p64
        add             x8, x3, #16
        ld1             {HH.2d-HH4.2d}, [x8]

        trn1            SHASH2.2d, SHASH.2d, HH.2d
        trn2            T1.2d, SHASH.2d, HH.2d
        eor             SHASH2.16b, SHASH2.16b, T1.16b

        trn1            HH34.2d, HH3.2d, HH4.2d
        trn2            T1.2d, HH3.2d, HH4.2d
        eor             HH34.16b, HH34.16b, T1.16b

        movi            MASK.16b, #0xe1
        shl             MASK.2d, MASK.2d, #57
        .endm

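        // Setup for the p8 fallback: fold the key into SHASH2 = H.lo ^ H.hi
        // for the Karatsuba middle product, build the k00_16/k32_48 masks
        // used when recombining partial products, and precompute the tbl
        // permutation vectors plus the rotated key copies sh1..sh4 and
        // ss1..ss4, since the key is a loop invariant.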
        .macro          __pmull_pre_p8
        ext             SHASH2.16b, SHASH.16b, SHASH.16b, #8
        eor             SHASH2.16b, SHASH2.16b, SHASH.16b

        // k00_16 := 0x0000000000000000_000000000000ffff
        // k32_48 := 0x00000000ffffffff_0000ffffffffffff
        movi            k32_48.2d, #0xffffffff
        mov             k32_48.h[2], k32_48.h[0]
        ushr            k00_16.2d, k32_48.2d, #32

        // prepare the permutation vectors
        mov_q           x5, 0x080f0e0d0c0b0a09
        movi            T1.8b, #8
        dup             perm1.2d, x5
        eor             perm1.16b, perm1.16b, T1.16b
        ushr            perm2.2d, perm1.2d, #8
        ushr            perm3.2d, perm1.2d, #16
        ushr            T1.2d, perm1.2d, #24
        sli             perm2.2d, perm1.2d, #56
        sli             perm3.2d, perm1.2d, #48
        sli             T1.2d, perm1.2d, #40

        // precompute loop invariants
        tbl             sh1.16b, {SHASH.16b}, perm1.16b
        tbl             sh2.16b, {SHASH.16b}, perm2.16b
        tbl             sh3.16b, {SHASH.16b}, perm3.16b
        tbl             sh4.16b, {SHASH.16b}, T1.16b
        ext             ss1.8b, SHASH2.8b, SHASH2.8b, #1
        ext             ss2.8b, SHASH2.8b, SHASH2.8b, #2
        ext             ss3.8b, SHASH2.8b, SHASH2.8b, #3
        ext             ss4.8b, SHASH2.8b, SHASH2.8b, #4
        .endm

        //
        // PMULL (64x64->128) based reduction for CPUs that can do
        // it in a single instruction.
        //
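        // The caller leaves the Karatsuba result in XL (low), XM (middle)
        // and XH (high); the middle limb is folded in and the result is
        // reduced modulo the GHASH polynomial x^128 + x^7 + x^2 + x + 1,
        // with the constant prepared in MASK (0xc200000000000000 per 64-bit
        // lane) serving as the operand of the two folding PMULLs.
        //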
        .macro          __pmull_reduce_p64
        pmull           T2.1q, XL.1d, MASK.1d
        eor             XM.16b, XM.16b, T1.16b

        mov             XH.d[0], XM.d[1]
        mov             XM.d[1], XL.d[0]

        eor             XL.16b, XM.16b, T2.16b
        ext             T2.16b, XL.16b, XL.16b, #8
        pmull           XL.1q, XL.1d, MASK.1d
        .endm

        //
        // Alternative reduction for CPUs that lack support for the
        // 64x64->128 PMULL instruction
        //
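        // Same reduction as above, but the multiplications by the reduction
        // constant are open coded as shifts and eors: the shl #57/#62/#63
        // and the mirrored ushr sequence account for the x^7, x^2 and x^1
        // terms of the polynomial x^128 + x^7 + x^2 + x + 1.
        //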
        .macro          __pmull_reduce_p8
        eor             XM.16b, XM.16b, T1.16b

        mov             XL.d[1], XM.d[0]
        mov             XH.d[0], XM.d[1]

        shl             T1.2d, XL.2d, #57
        shl             T2.2d, XL.2d, #62
        eor             T2.16b, T2.16b, T1.16b
        shl             T1.2d, XL.2d, #63
        eor             T2.16b, T2.16b, T1.16b
        ext             T1.16b, XL.16b, XH.16b, #8
        eor             T2.16b, T2.16b, T1.16b

        mov             XL.d[1], T2.d[0]
        mov             XH.d[0], T2.d[1]

        ushr            T2.2d, XL.2d, #1
        eor             XH.16b, XH.16b, XL.16b
        eor             XL.16b, XL.16b, T2.16b
        ushr            T2.2d, T2.2d, #6
        ushr            XL.2d, XL.2d, #1
        .endm

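        // GHASH update: w0 = #blocks, x1 = digest, x2 = input, x3 = key
        // structure, x4 = optional head block. Once the remaining block
        // count is a multiple of 4, the p64 variant consumes 4 blocks per
        // iteration, multiplying them by H^4, H^3, H^2 and H respectively
        // so that only a single reduction is needed per 64 bytes of input;
        // the generic tail below handles one block at a time.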
        .macro          __pmull_ghash, pn
        ld1             {SHASH.2d}, [x3]
        ld1             {XL.2d}, [x1]

        __pmull_pre_\pn

        /* do the head block first, if supplied */
        cbz             x4, 0f
        ld1             {T1.2d}, [x4]
        mov             x4, xzr
        b               3f

0:      .ifc            \pn, p64
        tbnz            w0, #0, 2f              // skip until #blocks is a
        tbnz            w0, #1, 2f              // round multiple of 4

1:      ld1             {XM3.16b-TT4.16b}, [x2], #64

        sub             w0, w0, #4

        rev64           T1.16b, XM3.16b
        rev64           T2.16b, XH3.16b
        rev64           TT4.16b, TT4.16b
        rev64           TT3.16b, TT3.16b

        ext             IN1.16b, TT4.16b, TT4.16b, #8
        ext             XL3.16b, TT3.16b, TT3.16b, #8

        eor             TT4.16b, TT4.16b, IN1.16b
        pmull2          XH2.1q, SHASH.2d, IN1.2d        // a1 * b1
        pmull           XL2.1q, SHASH.1d, IN1.1d        // a0 * b0
        pmull           XM2.1q, SHASH2.1d, TT4.1d       // (a1 + a0)(b1 + b0)

        eor             TT3.16b, TT3.16b, XL3.16b
        pmull2          XH3.1q, HH.2d, XL3.2d           // a1 * b1
        pmull           XL3.1q, HH.1d, XL3.1d           // a0 * b0
        pmull2          XM3.1q, SHASH2.2d, TT3.2d       // (a1 + a0)(b1 + b0)

        ext             IN1.16b, T2.16b, T2.16b, #8
        eor             XL2.16b, XL2.16b, XL3.16b
        eor             XH2.16b, XH2.16b, XH3.16b
        eor             XM2.16b, XM2.16b, XM3.16b

        eor             T2.16b, T2.16b, IN1.16b
        pmull2          XH3.1q, HH3.2d, IN1.2d          // a1 * b1
        pmull           XL3.1q, HH3.1d, IN1.1d          // a0 * b0
        pmull           XM3.1q, HH34.1d, T2.1d          // (a1 + a0)(b1 + b0)

        eor             XL2.16b, XL2.16b, XL3.16b
        eor             XH2.16b, XH2.16b, XH3.16b
        eor             XM2.16b, XM2.16b, XM3.16b

        ext             IN1.16b, T1.16b, T1.16b, #8
        ext             TT3.16b, XL.16b, XL.16b, #8
        eor             XL.16b, XL.16b, IN1.16b
        eor             T1.16b, T1.16b, TT3.16b

        pmull2          XH.1q, HH4.2d, XL.2d            // a1 * b1
        eor             T1.16b, T1.16b, XL.16b
        pmull           XL.1q, HH4.1d, XL.1d            // a0 * b0
        pmull2          XM.1q, HH34.2d, T1.2d           // (a1 + a0)(b1 + b0)

        eor             XL.16b, XL.16b, XL2.16b
        eor             XH.16b, XH.16b, XH2.16b
        eor             XM.16b, XM.16b, XM2.16b

        eor             T2.16b, XL.16b, XH.16b
        ext             T1.16b, XL.16b, XH.16b, #8
        eor             XM.16b, XM.16b, T2.16b

        __pmull_reduce_p64

        eor             T2.16b, T2.16b, XH.16b
        eor             XL.16b, XL.16b, T2.16b

        cbz             w0, 5f
        b               1b
        .endif

2:      ld1             {T1.2d}, [x2], #16
        sub             w0, w0, #1

3:      /* multiply XL by SHASH in GF(2^128) */
CPU_LE( rev64           T1.16b, T1.16b  )

        ext             T2.16b, XL.16b, XL.16b, #8
        ext             IN1.16b, T1.16b, T1.16b, #8
        eor             T1.16b, T1.16b, T2.16b
        eor             XL.16b, XL.16b, IN1.16b

        __pmull2_\pn    XH, XL, SHASH                   // a1 * b1
        eor             T1.16b, T1.16b, XL.16b
        __pmull_\pn     XL, XL, SHASH                   // a0 * b0
        __pmull_\pn     XM, T1, SHASH2                  // (a1 + a0)(b1 + b0)

4:      eor             T2.16b, XL.16b, XH.16b
        ext             T1.16b, XL.16b, XH.16b, #8
        eor             XM.16b, XM.16b, T2.16b

        __pmull_reduce_\pn

        eor             T2.16b, T2.16b, XH.16b
        eor             XL.16b, XL.16b, T2.16b

        cbnz            w0, 0b

5:      st1             {XL.2d}, [x1]
        ret
        .endm

        /*
         * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
         *                         struct ghash_key const *k, const char *head)
         */
ENTRY(pmull_ghash_update_p64)
        __pmull_ghash   p64
ENDPROC(pmull_ghash_update_p64)

ENTRY(pmull_ghash_update_p8)
        __pmull_ghash   p8
ENDPROC(pmull_ghash_update_p8)

        KS0             .req    v12
        KS1             .req    v13
        INP0            .req    v14
        INP1            .req    v15

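        // AES round keys are kept in v17-v31: v17/v18 hold the extra rounds
        // for AES-256, v19/v20 those for AES-192, v21-v29 the rounds common
        // to all key sizes, v30 the key for the final AESE and v31 the key
        // xor'ed in at the end. \rounds (10/12/14) selects how many to load.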
        .macro          load_round_keys, rounds, rk
        cmp             \rounds, #12
        blo             2222f           /* 128 bits */
        beq             1111f           /* 192 bits */
        ld1             {v17.4s-v18.4s}, [\rk], #32
1111:   ld1             {v19.4s-v20.4s}, [\rk], #32
2222:   ld1             {v21.4s-v24.4s}, [\rk], #64
        ld1             {v25.4s-v28.4s}, [\rk], #64
        ld1             {v29.4s-v31.4s}, [\rk]
        .endm

        .macro          enc_round, state, key
        aese            \state\().16b, \key\().16b
        aesmc           \state\().16b, \state\().16b
        .endm

        .macro          enc_block, state, rounds
        cmp             \rounds, #12
        b.lo            2222f           /* 128 bits */
        b.eq            1111f           /* 192 bits */
        enc_round       \state, v17
        enc_round       \state, v18
1111:   enc_round       \state, v19
        enc_round       \state, v20
2222:   .irp            key, v21, v22, v23, v24, v25, v26, v27, v28, v29
        enc_round       \state, \key
        .endr
        aese            \state\().16b, v30.16b
        eor             \state\().16b, \state\().16b, v31.16b
        .endm

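        // Combined GCM en/decryption: w0 = #blocks, x1 = digest, x2 = dst,
        // x3 = src, x4 = key structure (H followed by H^2), x5 = counter
        // block, x6 = AES round keys (NULL if already loaded), w7 = #AES
        // rounds; the encrypt path takes an extra stack argument pointing
        // to a pair of precomputed keystream blocks, which is refreshed on
        // exit. Two blocks are processed per iteration, with the AES-CTR
        // keystream computation interleaved with the GHASH multiply/reduce
        // to hide instruction latencies.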
        .macro          pmull_gcm_do_crypt, enc
        ld1             {SHASH.2d}, [x4], #16
        ld1             {HH.2d}, [x4]
        ld1             {XL.2d}, [x1]
        ldr             x8, [x5, #8]                    // load lower counter

        movi            MASK.16b, #0xe1
        trn1            SHASH2.2d, SHASH.2d, HH.2d
        trn2            T1.2d, SHASH.2d, HH.2d
CPU_LE( rev             x8, x8          )
        shl             MASK.2d, MASK.2d, #57
        eor             SHASH2.16b, SHASH2.16b, T1.16b

        .if             \enc == 1
        ldr             x10, [sp]
        ld1             {KS0.16b-KS1.16b}, [x10]
        .endif

        cbnz            x6, 4f

0:      ld1             {INP0.16b-INP1.16b}, [x3], #32

        rev             x9, x8
        add             x11, x8, #1
        add             x8, x8, #2

        .if             \enc == 1
        eor             INP0.16b, INP0.16b, KS0.16b     // encrypt input
        eor             INP1.16b, INP1.16b, KS1.16b
        .endif

        ld1             {KS0.8b}, [x5]                  // load upper counter
        rev             x11, x11
        sub             w0, w0, #2
        mov             KS1.8b, KS0.8b
        ins             KS0.d[1], x9                    // set lower counter
        ins             KS1.d[1], x11

        rev64           T1.16b, INP1.16b

        cmp             w7, #12
        b.ge            2f                              // AES-192/256?

1:      enc_round       KS0, v21
        ext             IN1.16b, T1.16b, T1.16b, #8

        enc_round       KS1, v21
        pmull2          XH2.1q, SHASH.2d, IN1.2d        // a1 * b1

        enc_round       KS0, v22
        eor             T1.16b, T1.16b, IN1.16b

        enc_round       KS1, v22
        pmull           XL2.1q, SHASH.1d, IN1.1d        // a0 * b0

        enc_round       KS0, v23
        pmull           XM2.1q, SHASH2.1d, T1.1d        // (a1 + a0)(b1 + b0)

        enc_round       KS1, v23
        rev64           T1.16b, INP0.16b
        ext             T2.16b, XL.16b, XL.16b, #8

        enc_round       KS0, v24
        ext             IN1.16b, T1.16b, T1.16b, #8
        eor             T1.16b, T1.16b, T2.16b

        enc_round       KS1, v24
        eor             XL.16b, XL.16b, IN1.16b

        enc_round       KS0, v25
        eor             T1.16b, T1.16b, XL.16b

        enc_round       KS1, v25
        pmull2          XH.1q, HH.2d, XL.2d             // a1 * b1

        enc_round       KS0, v26
        pmull           XL.1q, HH.1d, XL.1d             // a0 * b0

        enc_round       KS1, v26
        pmull2          XM.1q, SHASH2.2d, T1.2d         // (a1 + a0)(b1 + b0)

        enc_round       KS0, v27
        eor             XL.16b, XL.16b, XL2.16b
        eor             XH.16b, XH.16b, XH2.16b

        enc_round       KS1, v27
        eor             XM.16b, XM.16b, XM2.16b
        ext             T1.16b, XL.16b, XH.16b, #8

        enc_round       KS0, v28
        eor             T2.16b, XL.16b, XH.16b
        eor             XM.16b, XM.16b, T1.16b

        enc_round       KS1, v28
        eor             XM.16b, XM.16b, T2.16b

        enc_round       KS0, v29
        pmull           T2.1q, XL.1d, MASK.1d

        enc_round       KS1, v29
        mov             XH.d[0], XM.d[1]
        mov             XM.d[1], XL.d[0]

        aese            KS0.16b, v30.16b
        eor             XL.16b, XM.16b, T2.16b

        aese            KS1.16b, v30.16b
        ext             T2.16b, XL.16b, XL.16b, #8

        eor             KS0.16b, KS0.16b, v31.16b
        pmull           XL.1q, XL.1d, MASK.1d
        eor             T2.16b, T2.16b, XH.16b

        eor             KS1.16b, KS1.16b, v31.16b
        eor             XL.16b, XL.16b, T2.16b

        .if             \enc == 0
        eor             INP0.16b, INP0.16b, KS0.16b
        eor             INP1.16b, INP1.16b, KS1.16b
        .endif

        st1             {INP0.16b-INP1.16b}, [x2], #32

        cbnz            w0, 0b

CPU_LE( rev             x8, x8          )
        st1             {XL.2d}, [x1]
        str             x8, [x5, #8]                    // store lower counter

        .if             \enc == 1
        st1             {KS0.16b-KS1.16b}, [x10]
        .endif

        ret

2:      b.eq            3f                              // AES-192?
        enc_round       KS0, v17
        enc_round       KS1, v17
        enc_round       KS0, v18
        enc_round       KS1, v18
3:      enc_round       KS0, v19
        enc_round       KS1, v19
        enc_round       KS0, v20
        enc_round       KS1, v20
        b               1b

4:      load_round_keys w7, x6
        b               0b
        .endm

        /*
         * void pmull_gcm_encrypt(int blocks, u64 dg[], u8 dst[], const u8 src[],
         *                        struct ghash_key const *k, u8 ctr[],
         *                        u32 const rk[], int rounds, u8 ks[])
         */
ENTRY(pmull_gcm_encrypt)
        pmull_gcm_do_crypt      1
ENDPROC(pmull_gcm_encrypt)

        /*
         * void pmull_gcm_decrypt(int blocks, u64 dg[], u8 dst[], const u8 src[],
         *                        struct ghash_key const *k, u8 ctr[],
         *                        u32 const rk[], int rounds)
         */
ENTRY(pmull_gcm_decrypt)
        pmull_gcm_do_crypt      0
ENDPROC(pmull_gcm_decrypt)

        /*
         * void pmull_gcm_encrypt_block(u8 dst[], u8 src[], u8 rk[], int rounds)
         */
ENTRY(pmull_gcm_encrypt_block)
        cbz             x2, 0f
        load_round_keys w3, x2
0:      ld1             {v0.16b}, [x1]
        enc_block       v0, w3
        st1             {v0.16b}, [x0]
        ret
ENDPROC(pmull_gcm_encrypt_block)