2 * Accelerated GHASH implementation with ARMv8 PMULL instructions.
4 * Copyright (C) 2014 - 2018 Linaro Ltd. <ard.biesheuvel@linaro.org>
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 as published
8 * by the Free Software Foundation.
11 #include <linux/linkage.h>
12 #include <asm/assembler.h>
// __pmull_p64: 64x64 -> 128-bit carryless (polynomial) multiply of the
// low doublewords using the ARMv8 Crypto Extensions PMULL instruction:
//   \rd.1q = \rn.d[0] * \rm.d[0]  in GF(2)[x]
65 .macro __pmull_p64, rd, rn, rm
66 pmull \rd\().1q, \rn\().1d, \rm\().1d
// __pmull2_p64: as __pmull_p64, but multiplies the HIGH doublewords
// (lane 1) of the two sources via PMULL2:
//   \rd.1q = \rn.d[1] * \rm.d[1]  in GF(2)[x]
69 .macro __pmull2_p64, rd, rn, rm
70 pmull2 \rd\().1q, \rn\().2d, \rm\().2d
// __pmull_p8: 64x64 -> 128-bit carryless multiply for CPUs that only
// implement the 8-bit form of PMULL.  Builds the byte-rotated copies of
// the low half of \ad (A1..A3) needed by the shared tail, then dispatches
// to __pmull_p8_<\bd> so the precomputed shifted copies of the matching
// key register (SHASH or SHASH2) are used for the B-side operands.
// Clobbers t3, t5, t7 (and whatever the tail macro clobbers).
73 .macro __pmull_p8, rq, ad, bd
74 ext t3.8b, \ad\().8b, \ad\().8b, #1 // A1
75 ext t5.8b, \ad\().8b, \ad\().8b, #2 // A2
76 ext t7.8b, \ad\().8b, \ad\().8b, #3 // A3
78 __pmull_p8_\bd \rq, \ad
// __pmull2_p8: high-half counterpart of __pmull_p8.  The byte rotations
// of \ad's upper 64 bits are produced with TBL and the permutation
// vectors perm1..perm3 (set up in __pmull_pre_p8) instead of EXT, then
// control passes to the __pmull2_p8_<\bd> tail.
// Clobbers t3, t5, t7 (and whatever the tail macro clobbers).
81 .macro __pmull2_p8, rq, ad, bd
82 tbl t3.16b, {\ad\().16b}, perm1.16b // A1
83 tbl t5.16b, {\ad\().16b}, perm2.16b // A2
84 tbl t7.16b, {\ad\().16b}, perm3.16b // A3
86 __pmull2_p8_\bd \rq, \ad
// __pmull_p8_SHASH: low-half p8 multiply against SHASH, feeding the
// precomputed byte-shifted key copies sh1..sh4 (loop invariants built in
// __pmull_pre_p8) to the generic tail.
89 .macro __pmull_p8_SHASH, rq, ad
90 __pmull_p8_tail \rq, \ad\().8b, SHASH.8b, 8b,, sh1, sh2, sh3, sh4
// __pmull_p8_SHASH2: low-half p8 multiply against SHASH2 (the folded
// key used for the Karatsuba middle product), using its precomputed
// byte-shifted copies ss1..ss4.
93 .macro __pmull_p8_SHASH2, rq, ad
94 __pmull_p8_tail \rq, \ad\().8b, SHASH2.8b, 8b,, ss1, ss2, ss3, ss4
// __pmull2_p8_SHASH: high-half (pmull2) variant of the SHASH multiply;
// same precomputed sh1..sh4 operands, 16b arrangement, "2" suffix.
97 .macro __pmull2_p8_SHASH, rq, ad
98 __pmull_p8_tail \rq, \ad\().16b, SHASH.16b, 16b, 2, sh1, sh2, sh3, sh4
// __pmull_p8_tail: core of the 8-bit-PMULL emulation of a 64x64 -> 128-bit
// carryless multiply: \rq = \ad * \bd in GF(2)[x].  Eight partial products
// of the byte-rotated operands (A1..A3 already in t3/t5/t7; B1..B4 passed
// in as \b1..\b4) are summed pairwise, masked, and byte-rotated back into
// position so the result matches a single wide PMULL.
//   \t  - empty or "2": selects pmull vs pmull2
//   \nb - operand arrangement matching \t (8b or 16b)
// Clobbers t3-t9.  Relies on the masks k00_16/k32_48 prepared by
// __pmull_pre_p8.
101 .macro __pmull_p8_tail, rq, ad, bd, nb, t, b1, b2, b3, b4
102 pmull\t t3.8h, t3.\nb, \bd // F = A1*B
103 pmull\t t4.8h, \ad, \b1\().\nb // E = A*B1
104 pmull\t t5.8h, t5.\nb, \bd // H = A2*B
105 pmull\t t6.8h, \ad, \b2\().\nb // G = A*B2
106 pmull\t t7.8h, t7.\nb, \bd // J = A3*B
107 pmull\t t8.8h, \ad, \b3\().\nb // I = A*B3
108 pmull\t t9.8h, \ad, \b4\().\nb // K = A*B4
109 pmull\t \rq\().8h, \ad, \bd // D = A*B
// sum the pairs of partial products that carry the same shift amount
111 eor t3.16b, t3.16b, t4.16b // L = E + F
112 eor t5.16b, t5.16b, t6.16b // M = G + H
113 eor t7.16b, t7.16b, t8.16b // N = I + J
// deinterleave so each sum's two halves land in separate registers
115 uzp1 t4.2d, t3.2d, t5.2d
116 uzp2 t3.2d, t3.2d, t5.2d
117 uzp1 t6.2d, t7.2d, t9.2d
118 uzp2 t7.2d, t7.2d, t9.2d
120 // t3 = (L) (P0 + P1) << 8
121 // t5 = (M) (P2 + P3) << 16
122 eor t4.16b, t4.16b, t3.16b
123 and t3.16b, t3.16b, k32_48.16b
125 // t7 = (N) (P4 + P5) << 24
126 // t9 = (K) (P6 + P7) << 32
127 eor t6.16b, t6.16b, t7.16b
128 and t7.16b, t7.16b, k00_16.16b
130 eor t4.16b, t4.16b, t3.16b
131 eor t6.16b, t6.16b, t7.16b
// re-interleave the masked/combined terms
133 zip2 t5.2d, t4.2d, t3.2d
134 zip1 t3.2d, t4.2d, t3.2d
135 zip2 t9.2d, t6.2d, t7.2d
136 zip1 t7.2d, t6.2d, t7.2d
// byte-rotate each term into place; together with the zips above this
// realises the << 8/16/24/32 shifts noted in the comments at 120-126
138 ext t3.16b, t3.16b, t3.16b, #15
139 ext t5.16b, t5.16b, t5.16b, #14
140 ext t7.16b, t7.16b, t7.16b, #13
141 ext t9.16b, t9.16b, t9.16b, #12
// fold all shifted terms into the base product D = A*B
143 eor t3.16b, t3.16b, t5.16b
144 eor t7.16b, t7.16b, t9.16b
145 eor \rq\().16b, \rq\().16b, t3.16b
146 eor \rq\().16b, \rq\().16b, t7.16b
// __pmull_pre_p64: per-key precomputation for the 64-bit PMULL code path.
// Loads the remaining powers of H (HH..HH4) and folds the two halves of
// each register pair (trn1/trn2 + eor) into SHASH2 and HH34, the operands
// used for the Karatsuba middle products; MASK is shifted left by 57 to
// form the reduction constant used by __pmull_reduce_p64.
// NOTE(review): the instructions that load SHASH and initialise MASK/x8
// are on lines not visible in this excerpt — confirm against the full
// file before relying on register liveness here.
149 .macro __pmull_pre_p64
151 ld1 {HH.2d-HH4.2d}, [x8]
153 trn1 SHASH2.2d, SHASH.2d, HH.2d
154 trn2 T1.2d, SHASH.2d, HH.2d
155 eor SHASH2.16b, SHASH2.16b, T1.16b
157 trn1 HH34.2d, HH3.2d, HH4.2d
158 trn2 T1.2d, HH3.2d, HH4.2d
159 eor HH34.16b, HH34.16b, T1.16b
162 shl MASK.2d, MASK.2d, #57
// __pmull_pre_p8: per-key precomputation for the 8-bit PMULL fallback.
// Builds SHASH2 = SHASH.hi ^ SHASH.lo, the constant masks k00_16/k32_48
// used by __pmull_p8_tail, the byte-rotation permutation vectors
// perm1..perm3 (and a fourth in T1), and the loop-invariant byte-shifted
// key copies sh1..sh4 / ss1..ss4.
// NOTE(review): part of the perm1 initialisation (between the mov_q and
// the eor into perm1) falls on lines not visible in this excerpt.
165 .macro __pmull_pre_p8
166 ext SHASH2.16b, SHASH.16b, SHASH.16b, #8
167 eor SHASH2.16b, SHASH2.16b, SHASH.16b
169 // k00_16 := 0x0000000000000000_000000000000ffff
170 // k32_48 := 0x00000000ffffffff_0000ffffffffffff
171 movi k32_48.2d, #0xffffffff
172 mov k32_48.h[2], k32_48.h[0]
173 ushr k00_16.2d, k32_48.2d, #32
175 // prepare the permutation vectors
176 mov_q x5, 0x080f0e0d0c0b0a09
179 eor perm1.16b, perm1.16b, T1.16b
// derive the remaining byte-rotation tables from perm1 by shifting each
// doubleword and re-inserting the wrapped-around bytes
180 ushr perm2.2d, perm1.2d, #8
181 ushr perm3.2d, perm1.2d, #16
182 ushr T1.2d, perm1.2d, #24
183 sli perm2.2d, perm1.2d, #56
184 sli perm3.2d, perm1.2d, #48
185 sli T1.2d, perm1.2d, #40
187 // precompute loop invariants
188 tbl sh1.16b, {SHASH.16b}, perm1.16b
189 tbl sh2.16b, {SHASH.16b}, perm2.16b
190 tbl sh3.16b, {SHASH.16b}, perm3.16b
191 tbl sh4.16b, {SHASH.16b}, T1.16b
192 ext ss1.8b, SHASH2.8b, SHASH2.8b, #1
193 ext ss2.8b, SHASH2.8b, SHASH2.8b, #2
194 ext ss3.8b, SHASH2.8b, SHASH2.8b, #3
195 ext ss4.8b, SHASH2.8b, SHASH2.8b, #4
199 // PMULL (64x64->128) based reduction for CPUs that can do
200 // it in a single instruction.
// __pmull_reduce_p64: fold the double-width GHASH product down modulo the
// field polynomial using two PMULLs by the precomputed MASK constant
// (single-instruction 64x64->128 reduction path).
202 .macro __pmull_reduce_p64
203 pmull T2.1q, XL.1d, MASK.1d
204 eor XM.16b, XM.16b, T1.16b
// NOTE(review): intermediate steps of this reduction (original lines
// 205-208) are not visible in this excerpt.
209 eor XL.16b, XM.16b, T2.16b
210 ext T2.16b, XL.16b, XL.16b, #8
211 pmull XL.1q, XL.1d, MASK.1d
215 // Alternative reduction for CPUs that lack support for the
216 // 64x64->128 PMULL instruction
// __pmull_reduce_p8: same reduction as __pmull_reduce_p64, but for CPUs
// without the 64-bit PMULL: the multiplications by the reduction constant
// are replaced with shift-and-xor steps (left shifts by 57/62/63, then
// right shifts by 1 and a further 6).
218 .macro __pmull_reduce_p8
219 eor XM.16b, XM.16b, T1.16b
// NOTE(review): lines 220-223 and 231-234 of the original are not
// visible in this excerpt; the visible code is kept byte-identical.
224 shl T1.2d, XL.2d, #57
225 shl T2.2d, XL.2d, #62
226 eor T2.16b, T2.16b, T1.16b
227 shl T1.2d, XL.2d, #63
228 eor T2.16b, T2.16b, T1.16b
229 ext T1.16b, XL.16b, XH.16b, #8
230 eor T2.16b, T2.16b, T1.16b
235 ushr T2.2d, XL.2d, #1
236 eor XH.16b, XH.16b, XL.16b
237 eor XL.16b, XL.16b, T2.16b
238 ushr T2.2d, T2.2d, #6
239 ushr XL.2d, XL.2d, #1
// __pmull_ghash: GHASH update loop, parameterised by \pn (p64 or p8).
// The 4-way path starting at 1: aggregates four input blocks against the
// key powers H^1..H^4 (SHASH, HH, HH3, HH4) before a single reduction;
// the tbnz tests on w0 skip it unless the remaining block count is a
// round multiple of 4.  Remaining blocks go one at a time through the
// generic multiply at 3:.
// NOTE(review): the prologue (loading XL/SHASH etc.), loop-control
// branches, reduction tail and epilogue fall on lines not visible in
// this excerpt.
242 .macro __pmull_ghash, pn
248 /* do the head block first, if supplied */
255 tbnz w0, #0, 2f // skip until #blocks is a
256 tbnz w0, #1, 2f // round multiple of 4
// 4-way aggregated path: load four 16-byte blocks per iteration
258 1: ld1 {XM3.16b-TT4.16b}, [x2], #64
262 rev64 T1.16b, XM3.16b
263 rev64 T2.16b, XH3.16b
264 rev64 TT4.16b, TT4.16b
265 rev64 TT3.16b, TT3.16b
267 ext IN1.16b, TT4.16b, TT4.16b, #8
268 ext XL3.16b, TT3.16b, TT3.16b, #8
// block 4 (most recent): Karatsuba multiply by H
270 eor TT4.16b, TT4.16b, IN1.16b
271 pmull2 XH2.1q, SHASH.2d, IN1.2d // a1 * b1
272 pmull XL2.1q, SHASH.1d, IN1.1d // a0 * b0
273 pmull XM2.1q, SHASH2.1d, TT4.1d // (a1 + a0)(b1 + b0)
// block 3: multiply by H^2 and fold into the accumulators
275 eor TT3.16b, TT3.16b, XL3.16b
276 pmull2 XH3.1q, HH.2d, XL3.2d // a1 * b1
277 pmull XL3.1q, HH.1d, XL3.1d // a0 * b0
278 pmull2 XM3.1q, SHASH2.2d, TT3.2d // (a1 + a0)(b1 + b0)
280 ext IN1.16b, T2.16b, T2.16b, #8
281 eor XL2.16b, XL2.16b, XL3.16b
282 eor XH2.16b, XH2.16b, XH3.16b
283 eor XM2.16b, XM2.16b, XM3.16b
// block 2: multiply by H^3
285 eor T2.16b, T2.16b, IN1.16b
286 pmull2 XH3.1q, HH3.2d, IN1.2d // a1 * b1
287 pmull XL3.1q, HH3.1d, IN1.1d // a0 * b0
288 pmull XM3.1q, HH34.1d, T2.1d // (a1 + a0)(b1 + b0)
290 eor XL2.16b, XL2.16b, XL3.16b
291 eor XH2.16b, XH2.16b, XH3.16b
292 eor XM2.16b, XM2.16b, XM3.16b
// block 1 xor running digest: multiply by H^4
294 ext IN1.16b, T1.16b, T1.16b, #8
295 ext TT3.16b, XL.16b, XL.16b, #8
296 eor XL.16b, XL.16b, IN1.16b
297 eor T1.16b, T1.16b, TT3.16b
299 pmull2 XH.1q, HH4.2d, XL.2d // a1 * b1
300 eor T1.16b, T1.16b, XL.16b
301 pmull XL.1q, HH4.1d, XL.1d // a0 * b0
302 pmull2 XM.1q, HH34.2d, T1.2d // (a1 + a0)(b1 + b0)
// combine the four aggregated products, then start the reduction
304 eor XL.16b, XL.16b, XL2.16b
305 eor XH.16b, XH.16b, XH2.16b
306 eor XM.16b, XM.16b, XM2.16b
308 eor T2.16b, XL.16b, XH.16b
309 ext T1.16b, XL.16b, XH.16b, #8
310 eor XM.16b, XM.16b, T2.16b
314 eor T2.16b, T2.16b, XH.16b
315 eor XL.16b, XL.16b, T2.16b
// single-block path: one 16-byte block at a time
321 2: ld1 {T1.2d}, [x2], #16
324 3: /* multiply XL by SHASH in GF(2^128) */
325 CPU_LE( rev64 T1.16b, T1.16b )
327 ext T2.16b, XL.16b, XL.16b, #8
328 ext IN1.16b, T1.16b, T1.16b, #8
329 eor T1.16b, T1.16b, T2.16b
330 eor XL.16b, XL.16b, IN1.16b
332 __pmull2_\pn XH, XL, SHASH // a1 * b1
333 eor T1.16b, T1.16b, XL.16b
334 __pmull_\pn XL, XL, SHASH // a0 * b0
335 __pmull_\pn XM, T1, SHASH2 // (a1 + a0)(b1 + b0)
337 4: eor T2.16b, XL.16b, XH.16b
338 ext T1.16b, XL.16b, XH.16b, #8
339 eor XM.16b, XM.16b, T2.16b
343 eor T2.16b, T2.16b, XH.16b
344 eor XL.16b, XL.16b, T2.16b
353 * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
354 * struct ghash_key const *k, const char *head)
// GHASH update entry point for CPUs with 64-bit PMULL; presumably
// expands __pmull_ghash with the p64 variant — the body falls on lines
// not visible in this excerpt.
356 ENTRY(pmull_ghash_update_p64)
358 ENDPROC(pmull_ghash_update_p64)
// GHASH update entry point for CPUs with only 8-bit PMULL; presumably
// expands __pmull_ghash with the p8 variant — the body falls on lines
// not visible in this excerpt.
360 ENTRY(pmull_ghash_update_p8)
362 ENDPROC(pmull_ghash_update_p8)
// load_round_keys: load the AES round keys for \rounds rounds from [\rk]
// into v17-v31.  The leading branches skip the extra round keys that
// only the AES-192 (v19-v20) and AES-256 (v17-v18) schedules have.
// NOTE(review): the comparison that sets the flags consumed by blo/beq
// is on a line not visible in this excerpt.
369 .macro load_round_keys, rounds, rk
371 blo 2222f /* 128 bits */
372 beq 1111f /* 192 bits */
373 ld1 {v17.4s-v18.4s}, [\rk], #32
374 1111: ld1 {v19.4s-v20.4s}, [\rk], #32
375 2222: ld1 {v21.4s-v24.4s}, [\rk], #64
376 ld1 {v25.4s-v28.4s}, [\rk], #64
377 ld1 {v29.4s-v31.4s}, [\rk]
// enc_round: one AES encryption round on \state with round key \key —
// AESE (AddRoundKey + SubBytes + ShiftRows) followed by AESMC
// (MixColumns).
380 .macro enc_round, state, key
381 aese \state\().16b, \key\().16b
382 aesmc \state\().16b, \state\().16b
// enc_block: encrypt one block in \state with the round keys previously
// loaded into v17-v31 (see load_round_keys), using the same branch
// pattern to skip the AES-192/256-only rounds.  The final round is AESE
// without AESMC, followed by the xor with the last round key in v31.
// NOTE(review): the flag-setting comparison for b.lo/b.eq and the .endr
// closing the .irp loop are on lines not visible in this excerpt.
385 .macro enc_block, state, rounds
387 b.lo 2222f /* 128 bits */
388 b.eq 1111f /* 192 bits */
389 enc_round \state, v17
390 enc_round \state, v18
391 1111: enc_round \state, v19
392 enc_round \state, v20
393 2222: .irp key, v21, v22, v23, v24, v25, v26, v27, v28, v29
394 enc_round \state, \key
396 aese \state\().16b, v30.16b
397 eor \state\().16b, \state\().16b, v31.16b
// pmull_gcm_do_crypt: combined GCM bulk routine, parameterised by \enc
// (encrypt vs decrypt).  Two counter blocks are AES-encrypted into the
// keystream KS0/KS1 while the GHASH multiply/reduce for two input blocks
// is computed with PMULL, the AES and PMULL instructions interleaved to
// hide latencies.  The lower 64 bits of the counter live in x8/x9; the
// GHASH key material (SHASH/HH, folded into SHASH2) and the reduction
// MASK are set up at entry, mirroring __pmull_pre_p64.
// NOTE(review): a large number of interleaved AES rounds, loop-control
// branches and the \enc-conditional sections fall on lines not visible
// in this excerpt; the visible lines are kept byte-identical.
400 .macro pmull_gcm_do_crypt, enc
401 ld1 {SHASH.2d}, [x4], #16
404 ldr x8, [x5, #8] // load lower counter
// fold H/H^2 into SHASH2 and build the reduction constant (cf. p64 path)
407 trn1 SHASH2.2d, SHASH.2d, HH.2d
408 trn2 T1.2d, SHASH.2d, HH.2d
410 shl MASK.2d, MASK.2d, #57
411 eor SHASH2.16b, SHASH2.16b, T1.16b
415 ld1 {KS0.16b-KS1.16b}, [x10]
// main loop: two 16-byte blocks per iteration
420 0: ld1 {INP0.16b-INP1.16b}, [x3], #32
427 eor INP0.16b, INP0.16b, KS0.16b // encrypt input
428 eor INP1.16b, INP1.16b, KS1.16b
431 ld1 {KS0.8b}, [x5] // load upper counter
435 ins KS0.d[1], x9 // set lower counter
438 rev64 T1.16b, INP1.16b
441 b.ge 2f // AES-192/256?
// AES-128 fast path: AES rounds on KS0/KS1 interleaved with the
// two-way GHASH multiply (block 2 by SHASH, block 1 by HH)
443 1: enc_round KS0, v21
444 ext IN1.16b, T1.16b, T1.16b, #8
447 pmull2 XH2.1q, SHASH.2d, IN1.2d // a1 * b1
450 eor T1.16b, T1.16b, IN1.16b
453 pmull XL2.1q, SHASH.1d, IN1.1d // a0 * b0
456 pmull XM2.1q, SHASH2.1d, T1.1d // (a1 + a0)(b1 + b0)
459 rev64 T1.16b, INP0.16b
460 ext T2.16b, XL.16b, XL.16b, #8
463 ext IN1.16b, T1.16b, T1.16b, #8
464 eor T1.16b, T1.16b, T2.16b
467 eor XL.16b, XL.16b, IN1.16b
470 eor T1.16b, T1.16b, XL.16b
473 pmull2 XH.1q, HH.2d, XL.2d // a1 * b1
476 pmull XL.1q, HH.1d, XL.1d // a0 * b0
479 pmull2 XM.1q, SHASH2.2d, T1.2d // (a1 + a0)(b1 + b0)
// combine the two products and reduce (cf. __pmull_reduce_p64),
// still interleaved with the final AES rounds
482 eor XL.16b, XL.16b, XL2.16b
483 eor XH.16b, XH.16b, XH2.16b
486 eor XM.16b, XM.16b, XM2.16b
487 ext T1.16b, XL.16b, XH.16b, #8
490 eor T2.16b, XL.16b, XH.16b
491 eor XM.16b, XM.16b, T1.16b
494 eor XM.16b, XM.16b, T2.16b
497 pmull T2.1q, XL.1d, MASK.1d
503 aese KS0.16b, v30.16b
504 eor XL.16b, XM.16b, T2.16b
506 aese KS1.16b, v30.16b
507 ext T2.16b, XL.16b, XL.16b, #8
509 eor KS0.16b, KS0.16b, v31.16b
510 pmull XL.1q, XL.1d, MASK.1d
511 eor T2.16b, T2.16b, XH.16b
513 eor KS1.16b, KS1.16b, v31.16b
514 eor XL.16b, XL.16b, T2.16b
517 eor INP0.16b, INP0.16b, KS0.16b
518 eor INP1.16b, INP1.16b, KS1.16b
521 st1 {INP0.16b-INP1.16b}, [x2], #32
// epilogue: persist counter and keystream state for the caller
527 str x8, [x5, #8] // store lower counter
530 st1 {KS0.16b-KS1.16b}, [x10]
// AES-192/256 handling: run the extra leading rounds, then rejoin
535 2: b.eq 3f // AES-192?
540 3: enc_round KS0, v19
546 4: load_round_keys w7, x6
551 * void pmull_gcm_encrypt(int blocks, u64 dg[], u8 dst[], const u8 src[],
552 * struct ghash_key const *k, u8 ctr[],
553 * int rounds, u8 ks[])
// GCM encrypt entry point; presumably expands pmull_gcm_do_crypt with
// \enc selecting encryption — the body falls on lines not visible in
// this excerpt.
555 ENTRY(pmull_gcm_encrypt)
557 ENDPROC(pmull_gcm_encrypt)
560 * void pmull_gcm_decrypt(int blocks, u64 dg[], u8 dst[], const u8 src[],
561 * struct ghash_key const *k, u8 ctr[],
// GCM decrypt entry point; presumably expands pmull_gcm_do_crypt with
// \enc selecting decryption — the body falls on lines not visible in
// this excerpt.
564 ENTRY(pmull_gcm_decrypt)
566 ENDPROC(pmull_gcm_decrypt)
569 * void pmull_gcm_encrypt_block(u8 dst[], u8 src[], u8 rk[], int rounds)
// pmull_gcm_encrypt_block: encrypt the single 16-byte block at [x1]
// using the round keys at [x2] (w3 = number of rounds), loaded via
// load_round_keys.
// NOTE(review): the rest of the body (the encryption itself and the
// store to [x0]) falls on lines not visible in this excerpt.
571 ENTRY(pmull_gcm_encrypt_block)
573 load_round_keys w3, x2
574 0: ld1 {v0.16b}, [x1]
578 ENDPROC(pmull_gcm_encrypt_block)