/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * linux/arch/arm64/crypto/aes-modes.S - chaining mode wrappers for AES
 *
 * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
 */

/* included by aes-ce.S and aes-neon.S */
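
/*
 * Note (added, explanatory): this file is never assembled on its own. It
 * only contains the mode-level loops; the including file - aes-ce.S (ARMv8
 * Crypto Extensions) or aes-neon.S (plain NEON) - supplies the per-block
 * primitives and key-handling macros used below, such as encrypt_block /
 * decrypt_block, their 4x/5x variants, enc_prepare, dec_prepare and
 * enc_switch_key.
 */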
SYM_FUNC_START_LOCAL(aes_encrypt_block4x)
	encrypt_block4x	v0, v1, v2, v3, w3, x2, x8, w7
	ret
SYM_FUNC_END(aes_encrypt_block4x)

SYM_FUNC_START_LOCAL(aes_decrypt_block4x)
	decrypt_block4x	v0, v1, v2, v3, w3, x2, x8, w7
	ret
SYM_FUNC_END(aes_decrypt_block4x)

#if MAX_STRIDE == 5
SYM_FUNC_START_LOCAL(aes_encrypt_block5x)
	encrypt_block5x	v0, v1, v2, v3, v4, w3, x2, x8, w7
	ret
SYM_FUNC_END(aes_encrypt_block5x)

SYM_FUNC_START_LOCAL(aes_decrypt_block5x)
	decrypt_block5x	v0, v1, v2, v3, v4, w3, x2, x8, w7
	ret
SYM_FUNC_END(aes_decrypt_block5x)
#endif
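
/*
 * Note (added, explanatory): the block4x/block5x helpers run four or five
 * independent AES blocks through the rounds at once, so the per-round
 * instructions of neighbouring blocks can overlap in the pipeline instead of
 * leaving the AES units idle between dependent rounds of a single block.
 * The ST4()/ST5() wrappers used below emit a statement only for the matching
 * MAX_STRIDE, so each mode loop is written once and assembles to either the
 * 4-way or the 5-way interleaved variant.
 */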
	/*
	 * aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int blocks)
	 * aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int blocks)
	 */
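
	/*
	 * Added note: judging from the code below, the arguments arrive as
	 * x0 = out, x1 = in, x2 = round keys, w3 = rounds, w4 = block count.
	 * Roughly, in C-style pseudocode (illustrative only):
	 *
	 *	while (blocks >= MAX_STRIDE)	// interleaved fast path
	 *		{ process MAX_STRIDE blocks; blocks -= MAX_STRIDE; }
	 *	while (blocks--)		// one block at a time
	 *		out[i] = AES(rk, in[i]);
	 */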
AES_FUNC_START(aes_ecb_encrypt)
	stp	x29, x30, [sp, #-16]!

	enc_prepare	w3, x2, x5

	subs	w4, w4, #MAX_STRIDE
	ld1	{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
ST4(	bl	aes_encrypt_block4x		)
ST5(	ld1	{v4.16b}, [x1], #16		)
ST5(	bl	aes_encrypt_block5x		)
	st1	{v0.16b-v3.16b}, [x0], #64
ST5(	st1	{v4.16b}, [x0], #16		)

	adds	w4, w4, #MAX_STRIDE

	ld1	{v0.16b}, [x1], #16		/* get next pt block */
	encrypt_block	v0, w3, x2, x5, w6
	st1	{v0.16b}, [x0], #16

	ldp	x29, x30, [sp], #16
	ret
AES_FUNC_END(aes_ecb_encrypt)
AES_FUNC_START(aes_ecb_decrypt)
	stp	x29, x30, [sp, #-16]!

	dec_prepare	w3, x2, x5

	subs	w4, w4, #MAX_STRIDE
	ld1	{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
ST4(	bl	aes_decrypt_block4x		)
ST5(	ld1	{v4.16b}, [x1], #16		)
ST5(	bl	aes_decrypt_block5x		)
	st1	{v0.16b-v3.16b}, [x0], #64
ST5(	st1	{v4.16b}, [x0], #16		)

	adds	w4, w4, #MAX_STRIDE

	ld1	{v0.16b}, [x1], #16		/* get next ct block */
	decrypt_block	v0, w3, x2, x5, w6
	st1	{v0.16b}, [x0], #16

	ldp	x29, x30, [sp], #16
	ret
AES_FUNC_END(aes_ecb_decrypt)
	/*
	 * aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int blocks, u8 iv[])
	 * aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int blocks, u8 iv[])
	 * aes_essiv_cbc_encrypt(u8 out[], u8 const in[], u32 const rk1[],
	 *			 int rounds, int blocks, u8 iv[],
	 *			 u32 const rk2[]);
	 * aes_essiv_cbc_decrypt(u8 out[], u8 const in[], u32 const rk1[],
	 *			 int rounds, int blocks, u8 iv[],
	 *			 u32 const rk2[]);
	 */
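
	/*
	 * Added note: the ESSIV entry points differ from plain CBC only in how
	 * the IV is derived. As a sketch (illustrative pseudocode, not the
	 * actual glue code):
	 *
	 *	iv' = AES-256-encrypt(rk2, iv);	// rk2: schedule of a hash of
	 *					// the main key; 14 rounds
	 *	cbc_{en,de}crypt(out, in, rk1, rounds, blocks, iv');
	 *
	 * which is why the code below hard-codes 14 rounds for the IV
	 * encryption and then switches back to the regular key schedule.
	 */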
AES_FUNC_START(aes_essiv_cbc_encrypt)
	ld1	{v4.16b}, [x5]			/* get iv */

	mov	w8, #14				/* AES-256: 14 rounds */
	enc_prepare	w8, x6, x7
	encrypt_block	v4, w8, x6, x7, w9
	enc_switch_key	w3, x2, x6

AES_FUNC_START(aes_cbc_encrypt)
	ld1	{v4.16b}, [x5]			/* get iv */
	enc_prepare	w3, x2, x6

	ld1	{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
	eor	v0.16b, v0.16b, v4.16b		/* ..and xor with iv */
	encrypt_block	v0, w3, x2, x6, w7
	eor	v1.16b, v1.16b, v0.16b
	encrypt_block	v1, w3, x2, x6, w7
	eor	v2.16b, v2.16b, v1.16b
	encrypt_block	v2, w3, x2, x6, w7
	eor	v3.16b, v3.16b, v2.16b
	encrypt_block	v3, w3, x2, x6, w7
	st1	{v0.16b-v3.16b}, [x0], #64

	ld1	{v0.16b}, [x1], #16		/* get next pt block */
	eor	v4.16b, v4.16b, v0.16b		/* ..and xor with iv */
	encrypt_block	v4, w3, x2, x6, w7
	st1	{v4.16b}, [x0], #16

	st1	{v4.16b}, [x5]			/* return iv */
	ret
AES_FUNC_END(aes_cbc_encrypt)
AES_FUNC_END(aes_essiv_cbc_encrypt)
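
	/*
	 * Added note: CBC encryption is inherently sequential, since each
	 * block is XORed with the previous ciphertext block before it is
	 * encrypted:
	 *
	 *	ct[i] = AES-encrypt(rk, pt[i] ^ ct[i-1]);	// ct[-1] = iv
	 *
	 * so, unlike the decrypt path below, there is no 4-way/5-way
	 * interleaved fast path here; the four encrypt_block invocations
	 * above are still strictly chained through the previous result.
	 */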
AES_FUNC_START(aes_essiv_cbc_decrypt)
	stp	x29, x30, [sp, #-16]!

	ld1	{cbciv.16b}, [x5]		/* get iv */

	mov	w8, #14				/* AES-256: 14 rounds */
	enc_prepare	w8, x6, x7
	encrypt_block	cbciv, w8, x6, x7, w9

AES_FUNC_START(aes_cbc_decrypt)
	stp	x29, x30, [sp, #-16]!

	ld1	{cbciv.16b}, [x5]		/* get iv */
	dec_prepare	w3, x2, x6

	subs	w4, w4, #MAX_STRIDE
	ld1	{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
#if MAX_STRIDE == 5
	ld1	{v4.16b}, [x1], #16		/* get 1 ct block */
	mov	v5.16b, v0.16b
	mov	v6.16b, v1.16b
	mov	v7.16b, v2.16b
	bl	aes_decrypt_block5x
	sub	x1, x1, #32
	eor	v0.16b, v0.16b, cbciv.16b
	eor	v1.16b, v1.16b, v5.16b
	ld1	{v5.16b}, [x1], #16		/* reload 1 ct block */
	ld1	{cbciv.16b}, [x1], #16		/* reload 1 ct block */
	eor	v2.16b, v2.16b, v6.16b
	eor	v3.16b, v3.16b, v7.16b
	eor	v4.16b, v4.16b, v5.16b
#else
	mov	v4.16b, v0.16b
	mov	v5.16b, v1.16b
	mov	v6.16b, v2.16b
	bl	aes_decrypt_block4x
	sub	x1, x1, #16
	eor	v0.16b, v0.16b, cbciv.16b
	eor	v1.16b, v1.16b, v4.16b
	ld1	{cbciv.16b}, [x1], #16		/* reload 1 ct block */
	eor	v2.16b, v2.16b, v5.16b
	eor	v3.16b, v3.16b, v6.16b
#endif
	st1	{v0.16b-v3.16b}, [x0], #64
ST5(	st1	{v4.16b}, [x0], #16		)

	adds	w4, w4, #MAX_STRIDE

	ld1	{v1.16b}, [x1], #16		/* get next ct block */
	mov	v0.16b, v1.16b			/* ...and copy to v0 */
	decrypt_block	v0, w3, x2, x6, w7
	eor	v0.16b, v0.16b, cbciv.16b	/* xor with iv => pt */
	mov	cbciv.16b, v1.16b		/* ct is next iv */
	st1	{v0.16b}, [x0], #16

	st1	{cbciv.16b}, [x5]		/* return iv */
	ldp	x29, x30, [sp], #16
	ret
AES_FUNC_END(aes_cbc_decrypt)
AES_FUNC_END(aes_essiv_cbc_decrypt)
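
	/*
	 * Added note: CBC decryption parallelises, because every ciphertext
	 * block is already available as input:
	 *
	 *	pt[i] = AES-decrypt(rk, ct[i]) ^ ct[i-1];	// ct[-1] = iv
	 *
	 * The loop above therefore decrypts MAX_STRIDE blocks per iteration
	 * and only needs copies (or reloads) of the ciphertext blocks for the
	 * final XOR, with cbciv carrying the last ciphertext block into the
	 * next iteration as the "previous" block.
	 */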
	/*
	 * aes_cbc_cts_encrypt(u8 out[], u8 const in[], u32 const rk[],
	 *		       int rounds, int bytes, u8 const iv[])
	 * aes_cbc_cts_decrypt(u8 out[], u8 const in[], u32 const rk[],
	 *		       int rounds, int bytes, u8 const iv[])
	 */
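
	/*
	 * Added note: these implement CBC with ciphertext stealing, i.e.
	 * messages that are not a multiple of 16 bytes are handled by letting
	 * the final, partial block borrow ("steal") ciphertext bytes from the
	 * block before it, so the output is exactly as long as the input.
	 * The byte shuffling of the partial block is done with tbl/tbx
	 * lookups into .Lcts_permute_table and with overlapping loads and
	 * stores rather than explicit byte copies.
	 */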
AES_FUNC_START(aes_cbc_cts_encrypt)
	adr_l	x8, .Lcts_permute_table

	ld1	{v0.16b}, [x1], x4		/* overlapping loads */

	ld1	{v5.16b}, [x5]			/* get iv */
	enc_prepare	w3, x2, x6

	eor	v0.16b, v0.16b, v5.16b		/* xor with iv */
	tbl	v1.16b, {v1.16b}, v4.16b
	encrypt_block	v0, w3, x2, x6, w7

	eor	v1.16b, v1.16b, v0.16b
	tbl	v0.16b, {v0.16b}, v3.16b
	encrypt_block	v1, w3, x2, x6, w7

	st1	{v0.16b}, [x4]			/* overlapping stores */

AES_FUNC_END(aes_cbc_cts_encrypt)
AES_FUNC_START(aes_cbc_cts_decrypt)
	adr_l	x8, .Lcts_permute_table

	ld1	{v0.16b}, [x1], x4		/* overlapping loads */

	ld1	{v5.16b}, [x5]			/* get iv */
	dec_prepare	w3, x2, x6

	decrypt_block	v0, w3, x2, x6, w7
	tbl	v2.16b, {v0.16b}, v3.16b
	eor	v2.16b, v2.16b, v1.16b

	tbx	v0.16b, {v1.16b}, v4.16b
	decrypt_block	v0, w3, x2, x6, w7
	eor	v0.16b, v0.16b, v5.16b		/* xor with iv */

	st1	{v2.16b}, [x4]			/* overlapping stores */

AES_FUNC_END(aes_cbc_cts_decrypt)
	.section	".rodata", "a"
.Lcts_permute_table:
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
	.byte		0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.previous
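
	/*
	 * Added note: the table above is 16 bytes of 0xff, the byte indexes
	 * 0x0-0xf, and 16 more bytes of 0xff. A 16-byte window taken at a
	 * small offset into it contains a run of out-of-range 0xff entries
	 * next to a run of ascending indexes; used as the index vector of
	 * tbl/tbx, such a window shifts a partial block up or down within a
	 * vector register (0xff lanes produce zero for tbl, or leave the
	 * destination lane untouched for tbx). The CTS and CTR tail code
	 * relies on this, together with overlapping loads and stores, to
	 * avoid byte-wise copy loops.
	 */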
	/*
	 * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int bytes, u8 ctr[], u8 finalbuf[])
	 */
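
	/*
	 * Added note: unlike the ECB/CBC helpers, this takes a byte count
	 * rather than a block count; a trailing partial block is written to
	 * finalbuf so the caller can copy out only the bytes it needs. The
	 * counter at ctr[] is the usual 128-bit big-endian CTR block; the
	 * low-order 64 bits are kept byte-swapped in a general purpose
	 * register (x12 below) so per-block increments stay cheap, and the
	 * rare carry into the upper half is handled by the "apply carry"
	 * code in the function body.
	 */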
AES_FUNC_START(aes_ctr_encrypt)
	stp	x29, x30, [sp, #-16]!

	enc_prepare	w3, x2, x12

	umov	x12, vctr.d[1]			/* keep swabbed ctr in reg */

	sub	w4, w4, #MAX_STRIDE << 4

ST5(	mov	v4.16b, vctr.16b		)

	/* apply carry to outgoing counter */
0:	umov	x8, vctr.d[0]

	/* apply carry to N counter blocks for N := x12 */

	sub	x16, x16, x12, lsl #3

	mov	v0.d[0], vctr.d[0]
	mov	v1.d[0], vctr.d[0]
	mov	v2.d[0], vctr.d[0]
	mov	v3.d[0], vctr.d[0]
ST5(	mov	v4.d[0], vctr.d[0]		)

	sub	x7, x12, #MAX_STRIDE - 1
	sub	x8, x12, #MAX_STRIDE - 2
	sub	x9, x12, #MAX_STRIDE - 3
ST5(	sub	x10, x12, #MAX_STRIDE - 4	)

ST5(	mov	v4.d[1], x10			)
	tbnz	w4, #31, .Lctrtail
	ld1	{v5.16b-v7.16b}, [x1], #48
ST4(	bl	aes_encrypt_block4x		)
ST5(	bl	aes_encrypt_block5x		)
	eor	v0.16b, v5.16b, v0.16b
ST4(	ld1	{v5.16b}, [x1], #16		)
	eor	v1.16b, v6.16b, v1.16b
ST5(	ld1	{v5.16b-v6.16b}, [x1], #32	)
	eor	v2.16b, v7.16b, v2.16b
	eor	v3.16b, v5.16b, v3.16b
ST5(	eor	v4.16b, v6.16b, v4.16b		)
	st1	{v0.16b-v3.16b}, [x0], #64
ST5(	st1	{v4.16b}, [x0], #16		)

	st1	{vctr.16b}, [x5]		/* return next CTR value */
	ldp	x29, x30, [sp], #16
	ret
.Lctrtail:
	/* XOR up to MAX_STRIDE * 16 - 1 bytes of in/output with v0 ... v3/v4 */

	csel	x13, x13, x16, ne

ST5(	cmp	w4, #64 - (MAX_STRIDE << 4)	)
ST5(	csel	x14, x16, xzr, gt		)
	cmp	w4, #48 - (MAX_STRIDE << 4)
	csel	x15, x16, xzr, gt
	cmp	w4, #32 - (MAX_STRIDE << 4)
	csel	x16, x16, xzr, gt
	cmp	w4, #16 - (MAX_STRIDE << 4)

	adr_l	x12, .Lcts_permute_table

ST5(	ld1	{v5.16b}, [x1], x14		)
	ld1	{v6.16b}, [x1], x15
	ld1	{v7.16b}, [x1], x16

ST4(	bl	aes_encrypt_block4x		)
ST5(	bl	aes_encrypt_block5x		)

	ld1	{v8.16b}, [x1], x13

ST4(	eor	v6.16b, v6.16b, v0.16b		)
ST4(	eor	v7.16b, v7.16b, v1.16b		)
ST4(	tbl	v3.16b, {v3.16b}, v10.16b	)
ST4(	eor	v8.16b, v8.16b, v2.16b		)
ST4(	eor	v9.16b, v9.16b, v3.16b		)

ST5(	eor	v5.16b, v5.16b, v0.16b		)
ST5(	eor	v6.16b, v6.16b, v1.16b		)
ST5(	tbl	v4.16b, {v4.16b}, v10.16b	)
ST5(	eor	v7.16b, v7.16b, v2.16b		)
ST5(	eor	v8.16b, v8.16b, v3.16b		)
ST5(	eor	v9.16b, v9.16b, v4.16b		)

ST5(	st1	{v5.16b}, [x0], x14		)
	st1	{v6.16b}, [x0], x15
	st1	{v7.16b}, [x0], x16

	st1	{v9.16b}, [x13]			// overlapping stores

	csel	x0, x0, x6, eq			// use finalbuf if less than a full block

ST5(	mov	v3.16b, v4.16b			)
	encrypt_block	v3, w3, x2, x8, w7
	eor	v5.16b, v5.16b, v3.16b

AES_FUNC_END(aes_ctr_encrypt)
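
	/*
	 * Added note on the .Lctrtail path above: the chain of cmp/csel
	 * instructions turns the (negative) remaining byte count into a
	 * per-block load/store length of either 0 or 16 bytes, so full
	 * blocks in the tail are still handled with ordinary 16-byte
	 * accesses while the single partial block is shifted into place via
	 * .Lcts_permute_table and overlapping accesses. Only the case of
	 * less than one full block falls back to the single-block path,
	 * which may redirect its output to finalbuf (see the csel on x0).
	 */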
	/*
	 * aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
	 *		   int bytes, u8 const rk2[], u8 iv[], int first)
	 * aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
	 *		   int bytes, u8 const rk2[], u8 iv[], int first)
	 */
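
	/*
	 * Added note: rk1 is the key schedule used for the data blocks, rk2
	 * the one used only to encrypt the IV into the initial tweak. When
	 * 'first' is non-zero this is the first call for the message, so the
	 * tweak is derived from iv[] with rk2 (the cbz w7 / encrypt_block v4
	 * sequence below); on continuation calls the tweak saved in iv[] is
	 * reused and only the next_tweak doubling is applied. 'bytes' may
	 * include a final partial block, which is handled with ciphertext
	 * stealing via .Lcts_permute_table.
	 */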
	.macro		next_tweak, out, in, tmp
	sshr		\tmp\().2d,  \in\().2d,   #63
	and		\tmp\().16b, \tmp\().16b, xtsmask.16b
	add		\out\().2d,  \in\().2d,   \in\().2d
	ext		\tmp\().16b, \tmp\().16b, \tmp\().16b, #8
	eor		\out\().16b, \out\().16b, \tmp\().16b
	.endm

	.macro		xts_load_mask, tmp
	movi		xtsmask.2s, #0x1
	movi		\tmp\().2s, #0x87
	uzp1		xtsmask.4s, xtsmask.4s, \tmp\().4s
	.endm
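
	/*
	 * Added note: next_tweak multiplies the 128-bit tweak by x in
	 * GF(2^128) with the XTS polynomial x^128 + x^7 + x^2 + x + 1.
	 * Conceptually:
	 *
	 *	carry = tweak >> 127;
	 *	tweak = (tweak << 1) ^ (carry ? 0x87 : 0);
	 *
	 * Since NEON has no 128-bit shift, the doubling is done per 64-bit
	 * lane with add, and the bit shifted out of each half is recovered
	 * with sshr #63, masked with xtsmask (0x87 for the carry out of the
	 * high half into the low byte, 0x1 for the carry out of the low half
	 * into the high half) and folded back in after an ext that swaps the
	 * two halves.
	 */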
AES_FUNC_START(aes_xts_encrypt)
	stp	x29, x30, [sp, #-16]!

	cbz	w7, .Lxtsencnotfirst

	enc_prepare	w3, x5, x8
	xts_cts_skip_tw	w7, .LxtsencNx
	encrypt_block	v4, w3, x5, x8, w7	/* first tweak */
	enc_switch_key	w3, x2, x8

	enc_prepare	w3, x2, x8

	next_tweak	v4, v4, v8

	ld1	{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
	next_tweak	v5, v4, v8
	eor	v0.16b, v0.16b, v4.16b
	next_tweak	v6, v5, v8
	eor	v1.16b, v1.16b, v5.16b
	eor	v2.16b, v2.16b, v6.16b
	next_tweak	v7, v6, v8
	eor	v3.16b, v3.16b, v7.16b
	bl	aes_encrypt_block4x
	eor	v3.16b, v3.16b, v7.16b
	eor	v0.16b, v0.16b, v4.16b
	eor	v1.16b, v1.16b, v5.16b
	eor	v2.16b, v2.16b, v6.16b
	st1	{v0.16b-v3.16b}, [x0], #64

	ld1	{v0.16b}, [x1], #16

	eor	v0.16b, v0.16b, v4.16b
	encrypt_block	v0, w3, x2, x8, w7
	eor	v0.16b, v0.16b, v4.16b

	next_tweak	v4, v4, v8

	st1	{v0.16b}, [x0], #16

	ldp	x29, x30, [sp], #16

	adr_l	x8, .Lcts_permute_table

	add	x1, x1, w4, sxtw		/* rewind input pointer */
	add	w4, w4, #16			/* # bytes in final block */

	add	x4, x0, x4			/* output address of final block */

	ld1	{v1.16b}, [x1]			/* load final block */

	tbl	v2.16b, {v0.16b}, v2.16b
	tbx	v0.16b, {v1.16b}, v3.16b
	st1	{v2.16b}, [x4]			/* overlapping stores */

AES_FUNC_END(aes_xts_encrypt)
AES_FUNC_START(aes_xts_decrypt)
	stp	x29, x30, [sp, #-16]!

	/* subtract 16 bytes if we are doing CTS */

	xts_cts_skip_tw	w7, .Lxtsdecskiptw
	cbz	w7, .Lxtsdecnotfirst

	enc_prepare	w3, x5, x8
	encrypt_block	v4, w3, x5, x8, w7	/* first tweak */

	dec_prepare	w3, x2, x8

	dec_prepare	w3, x2, x8

	next_tweak	v4, v4, v8

	ld1	{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
	next_tweak	v5, v4, v8
	eor	v0.16b, v0.16b, v4.16b
	next_tweak	v6, v5, v8
	eor	v1.16b, v1.16b, v5.16b
	eor	v2.16b, v2.16b, v6.16b
	next_tweak	v7, v6, v8
	eor	v3.16b, v3.16b, v7.16b
	bl	aes_decrypt_block4x
	eor	v3.16b, v3.16b, v7.16b
	eor	v0.16b, v0.16b, v4.16b
	eor	v1.16b, v1.16b, v5.16b
	eor	v2.16b, v2.16b, v6.16b
	st1	{v0.16b-v3.16b}, [x0], #64

	ld1	{v0.16b}, [x1], #16

	eor	v0.16b, v0.16b, v4.16b
	decrypt_block	v0, w3, x2, x8, w7
	eor	v0.16b, v0.16b, v4.16b
	st1	{v0.16b}, [x0], #16

	next_tweak	v4, v4, v8

	ldp	x29, x30, [sp], #16

	adr_l	x8, .Lcts_permute_table

	add	x1, x1, w4, sxtw		/* rewind input pointer */
	add	w4, w4, #16			/* # bytes in final block */

	add	x4, x0, x4			/* output address of final block */

	next_tweak	v5, v4, v8

	ld1	{v1.16b}, [x1]			/* load final block */

	eor	v0.16b, v0.16b, v5.16b
	decrypt_block	v0, w3, x2, x8, w7
	eor	v0.16b, v0.16b, v5.16b

	tbl	v2.16b, {v0.16b}, v2.16b
	tbx	v0.16b, {v1.16b}, v3.16b

	st1	{v2.16b}, [x4]			/* overlapping stores */

AES_FUNC_END(aes_xts_decrypt)
	/*
	 * aes_mac_update(u8 const in[], u32 const rk[], int rounds,
	 *		  int blocks, u8 dg[], int enc_before, int enc_after)
	 */
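
	/*
	 * Added note: this is the shared core of the CBC-MAC style hashes
	 * (cmac/xcbc/cbcmac glue), updating the digest as
	 *
	 *	dg = AES-encrypt(rk, dg ^ in[i]);	// over all blocks
	 *
	 * enc_before asks for dg to be encrypted once before the first block
	 * is mixed in, and enc_after for a final encryption after the last
	 * block (the csinv/encrypt_block pairs below), which the C glue uses
	 * to implement the differences between the MAC variants. cond_yield
	 * lets the loop stop early at .Lmacout when the scheduler needs the
	 * CPU, so long inputs do not monopolise kernel-mode NEON.
	 */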
AES_FUNC_START(aes_mac_update)
	ld1	{v0.16b}, [x4]			/* get dg */
	enc_prepare	w2, x1, x7

	encrypt_block	v0, w2, x1, x7, w8

	ld1	{v1.16b-v4.16b}, [x0], #64	/* get next pt block */
	eor	v0.16b, v0.16b, v1.16b		/* ..and xor with dg */
	encrypt_block	v0, w2, x1, x7, w8
	eor	v0.16b, v0.16b, v2.16b
	encrypt_block	v0, w2, x1, x7, w8
	eor	v0.16b, v0.16b, v3.16b
	encrypt_block	v0, w2, x1, x7, w8
	eor	v0.16b, v0.16b, v4.16b

	csinv	x5, x6, xzr, eq

	encrypt_block	v0, w2, x1, x7, w8
	st1	{v0.16b}, [x4]			/* return dg */
	cond_yield	.Lmacout, x7, x8

	ld1	{v1.16b}, [x0], #16		/* get next pt block */
	eor	v0.16b, v0.16b, v1.16b		/* ..and xor with dg */

	csinv	x5, x6, xzr, eq

	encrypt_block	v0, w2, x1, x7, w8

	st1	{v0.16b}, [x4]			/* return dg */

AES_FUNC_END(aes_mac_update)