/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * linux/arch/arm64/crypto/aes-modes.S - chaining mode wrappers for AES
 *
 * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
 */

/* included by aes-ce.S and aes-neon.S */

        .text
        .align          4

#ifndef MAX_STRIDE
#define MAX_STRIDE      4
#endif

#if MAX_STRIDE == 4
#define ST4(x...) x
#define ST5(x...)
#else
#define ST4(x...)
#define ST5(x...) x
#endif
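
/*
 * The bulk loops below process four or five blocks at a time so that
 * independent AES rounds can be in flight together and pipeline latency
 * is hidden. MAX_STRIDE defaults to 4; an including file may define it
 * as 5, in which case the ST5() lines are assembled instead of the ST4()
 * ones. The block4x/block5x helpers transform v0-v3 (plus v4 for the
 * 5-way variants) in place.
 */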

aes_encrypt_block4x:
        encrypt_block4x v0, v1, v2, v3, w3, x2, x8, w7
        ret
ENDPROC(aes_encrypt_block4x)

aes_decrypt_block4x:
        decrypt_block4x v0, v1, v2, v3, w3, x2, x8, w7
        ret
ENDPROC(aes_decrypt_block4x)

#if MAX_STRIDE == 5
aes_encrypt_block5x:
        encrypt_block5x v0, v1, v2, v3, v4, w3, x2, x8, w7
        ret
ENDPROC(aes_encrypt_block5x)

aes_decrypt_block5x:
        decrypt_block5x v0, v1, v2, v3, v4, w3, x2, x8, w7
        ret
ENDPROC(aes_decrypt_block5x)
#endif

        /*
         * aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
         *                 int blocks)
         * aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
         *                 int blocks)
         */
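        /*
         * On entry (AAPCS64): x0 - out, x1 - in, x2 - round key array,
         * w3 - number of rounds, w4 - number of 16-byte blocks.
         */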

AES_ENTRY(aes_ecb_encrypt)
        stp             x29, x30, [sp, #-16]!
        mov             x29, sp

        enc_prepare     w3, x2, x5

.LecbencloopNx:
        subs            w4, w4, #MAX_STRIDE
        bmi             .Lecbenc1x
        ld1             {v0.16b-v3.16b}, [x1], #64      /* get 4 pt blocks */
ST4(    bl              aes_encrypt_block4x             )
ST5(    ld1             {v4.16b}, [x1], #16             )
ST5(    bl              aes_encrypt_block5x             )
        st1             {v0.16b-v3.16b}, [x0], #64
ST5(    st1             {v4.16b}, [x0], #16             )
        b               .LecbencloopNx
.Lecbenc1x:
        adds            w4, w4, #MAX_STRIDE
        beq             .Lecbencout
.Lecbencloop:
        ld1             {v0.16b}, [x1], #16             /* get next pt block */
        encrypt_block   v0, w3, x2, x5, w6
        st1             {v0.16b}, [x0], #16
        subs            w4, w4, #1
        bne             .Lecbencloop
.Lecbencout:
        ldp             x29, x30, [sp], #16
        ret
AES_ENDPROC(aes_ecb_encrypt)


AES_ENTRY(aes_ecb_decrypt)
        stp             x29, x30, [sp, #-16]!
        mov             x29, sp

        dec_prepare     w3, x2, x5

.LecbdecloopNx:
        subs            w4, w4, #MAX_STRIDE
        bmi             .Lecbdec1x
        ld1             {v0.16b-v3.16b}, [x1], #64      /* get 4 ct blocks */
ST4(    bl              aes_decrypt_block4x             )
ST5(    ld1             {v4.16b}, [x1], #16             )
ST5(    bl              aes_decrypt_block5x             )
        st1             {v0.16b-v3.16b}, [x0], #64
ST5(    st1             {v4.16b}, [x0], #16             )
        b               .LecbdecloopNx
.Lecbdec1x:
        adds            w4, w4, #MAX_STRIDE
        beq             .Lecbdecout
.Lecbdecloop:
        ld1             {v0.16b}, [x1], #16             /* get next ct block */
        decrypt_block   v0, w3, x2, x5, w6
        st1             {v0.16b}, [x0], #16
        subs            w4, w4, #1
        bne             .Lecbdecloop
.Lecbdecout:
        ldp             x29, x30, [sp], #16
        ret
AES_ENDPROC(aes_ecb_decrypt)


        /*
         * aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
         *                 int blocks, u8 iv[])
         * aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
         *                 int blocks, u8 iv[])
         * aes_essiv_cbc_encrypt(u8 out[], u8 const in[], u32 const rk1[],
         *                       int rounds, int blocks, u8 iv[],
         *                       u32 const rk2[]);
         * aes_essiv_cbc_decrypt(u8 out[], u8 const in[], u32 const rk1[],
         *                       int rounds, int blocks, u8 iv[],
         *                       u32 const rk2[]);
         */
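        /*
         * Register usage: x0 - out, x1 - in, x2 - rk/rk1, w3 - rounds,
         * w4 - blocks, x5 - iv. The ESSIV variants take a second key
         * schedule rk2 in x6, which is always an AES-256 schedule (hence
         * the hard-coded 14 rounds below); it is used once to encrypt the
         * caller's IV before the CBC chain starts.
         */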

AES_ENTRY(aes_essiv_cbc_encrypt)
        ld1             {v4.16b}, [x5]                  /* get iv */

        mov             w8, #14                         /* AES-256: 14 rounds */
        enc_prepare     w8, x6, x7
        encrypt_block   v4, w8, x6, x7, w9
        enc_switch_key  w3, x2, x6
        b               .Lcbcencloop4x

AES_ENTRY(aes_cbc_encrypt)
        ld1             {v4.16b}, [x5]                  /* get iv */
        enc_prepare     w3, x2, x6

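        /*
         * CBC encryption is inherently sequential: each block must be
         * xor'ed with the previous ciphertext before it can be encrypted.
         * The 4x loop below therefore only amortises load/store overhead;
         * the four encrypt_block invocations cannot be interleaved.
         */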
.Lcbcencloop4x:
        subs            w4, w4, #4
        bmi             .Lcbcenc1x
        ld1             {v0.16b-v3.16b}, [x1], #64      /* get 4 pt blocks */
        eor             v0.16b, v0.16b, v4.16b          /* ..and xor with iv */
        encrypt_block   v0, w3, x2, x6, w7
        eor             v1.16b, v1.16b, v0.16b
        encrypt_block   v1, w3, x2, x6, w7
        eor             v2.16b, v2.16b, v1.16b
        encrypt_block   v2, w3, x2, x6, w7
        eor             v3.16b, v3.16b, v2.16b
        encrypt_block   v3, w3, x2, x6, w7
        st1             {v0.16b-v3.16b}, [x0], #64
        mov             v4.16b, v3.16b
        b               .Lcbcencloop4x
.Lcbcenc1x:
        adds            w4, w4, #4
        beq             .Lcbcencout
.Lcbcencloop:
        ld1             {v0.16b}, [x1], #16             /* get next pt block */
        eor             v4.16b, v4.16b, v0.16b          /* ..and xor with iv */
        encrypt_block   v4, w3, x2, x6, w7
        st1             {v4.16b}, [x0], #16
        subs            w4, w4, #1
        bne             .Lcbcencloop
.Lcbcencout:
        st1             {v4.16b}, [x5]                  /* return iv */
        ret
AES_ENDPROC(aes_cbc_encrypt)
AES_ENDPROC(aes_essiv_cbc_encrypt)

AES_ENTRY(aes_essiv_cbc_decrypt)
        stp             x29, x30, [sp, #-16]!
        mov             x29, sp

        ld1             {cbciv.16b}, [x5]               /* get iv */

        mov             w8, #14                         /* AES-256: 14 rounds */
        enc_prepare     w8, x6, x7
        encrypt_block   cbciv, w8, x6, x7, w9
        b               .Lessivcbcdecstart

AES_ENTRY(aes_cbc_decrypt)
        stp             x29, x30, [sp, #-16]!
        mov             x29, sp

        ld1             {cbciv.16b}, [x5]               /* get iv */
.Lessivcbcdecstart:
        dec_prepare     w3, x2, x6

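        /*
         * CBC decryption, by contrast, parallelises freely: all ciphertext
         * blocks are available up front, so MAX_STRIDE blocks are decrypted
         * at once and the saved ciphertext copies are xor'ed in afterwards.
         * cbciv carries the chaining value between iterations.
         */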
.LcbcdecloopNx:
        subs            w4, w4, #MAX_STRIDE
        bmi             .Lcbcdec1x
        ld1             {v0.16b-v3.16b}, [x1], #64      /* get 4 ct blocks */
#if MAX_STRIDE == 5
        ld1             {v4.16b}, [x1], #16             /* get 1 ct block */
        mov             v5.16b, v0.16b
        mov             v6.16b, v1.16b
        mov             v7.16b, v2.16b
        bl              aes_decrypt_block5x
        sub             x1, x1, #32
        eor             v0.16b, v0.16b, cbciv.16b
        eor             v1.16b, v1.16b, v5.16b
        ld1             {v5.16b}, [x1], #16             /* reload 1 ct block */
        ld1             {cbciv.16b}, [x1], #16          /* reload 1 ct block */
        eor             v2.16b, v2.16b, v6.16b
        eor             v3.16b, v3.16b, v7.16b
        eor             v4.16b, v4.16b, v5.16b
#else
        mov             v4.16b, v0.16b
        mov             v5.16b, v1.16b
        mov             v6.16b, v2.16b
        bl              aes_decrypt_block4x
        sub             x1, x1, #16
        eor             v0.16b, v0.16b, cbciv.16b
        eor             v1.16b, v1.16b, v4.16b
        ld1             {cbciv.16b}, [x1], #16          /* reload 1 ct block */
        eor             v2.16b, v2.16b, v5.16b
        eor             v3.16b, v3.16b, v6.16b
#endif
        st1             {v0.16b-v3.16b}, [x0], #64
ST5(    st1             {v4.16b}, [x0], #16             )
        b               .LcbcdecloopNx
.Lcbcdec1x:
        adds            w4, w4, #MAX_STRIDE
        beq             .Lcbcdecout
.Lcbcdecloop:
        ld1             {v1.16b}, [x1], #16             /* get next ct block */
        mov             v0.16b, v1.16b                  /* ...and copy to v0 */
        decrypt_block   v0, w3, x2, x6, w7
        eor             v0.16b, v0.16b, cbciv.16b       /* xor with iv => pt */
        mov             cbciv.16b, v1.16b               /* ct is next iv */
        st1             {v0.16b}, [x0], #16
        subs            w4, w4, #1
        bne             .Lcbcdecloop
.Lcbcdecout:
        st1             {cbciv.16b}, [x5]               /* return iv */
        ldp             x29, x30, [sp], #16
        ret
AES_ENDPROC(aes_cbc_decrypt)
AES_ENDPROC(aes_essiv_cbc_decrypt)


        /*
         * aes_cbc_cts_encrypt(u8 out[], u8 const in[], u32 const rk[],
         *                     int rounds, int bytes, u8 const iv[])
         * aes_cbc_cts_decrypt(u8 out[], u8 const in[], u32 const rk[],
         *                     int rounds, int bytes, u8 const iv[])
         */
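        /*
         * Ciphertext stealing (CTS): the last two blocks are fetched with
         * overlapping ld1s, the permute vectors taken from
         * .Lcts_permute_table shuffle the partial block into place, and
         * the results are written back with overlapping st1s so that the
         * swapped final blocks land at the right offsets.
         */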

AES_ENTRY(aes_cbc_cts_encrypt)
        adr_l           x8, .Lcts_permute_table
        sub             x4, x4, #16
        add             x9, x8, #32
        add             x8, x8, x4
        sub             x9, x9, x4
        ld1             {v3.16b}, [x8]
        ld1             {v4.16b}, [x9]

        ld1             {v0.16b}, [x1], x4              /* overlapping loads */
        ld1             {v1.16b}, [x1]

        ld1             {v5.16b}, [x5]                  /* get iv */
        enc_prepare     w3, x2, x6

        eor             v0.16b, v0.16b, v5.16b          /* xor with iv */
        tbl             v1.16b, {v1.16b}, v4.16b
        encrypt_block   v0, w3, x2, x6, w7

        eor             v1.16b, v1.16b, v0.16b
        tbl             v0.16b, {v0.16b}, v3.16b
        encrypt_block   v1, w3, x2, x6, w7

        add             x4, x0, x4
        st1             {v0.16b}, [x4]                  /* overlapping stores */
        st1             {v1.16b}, [x0]
        ret
AES_ENDPROC(aes_cbc_cts_encrypt)

AES_ENTRY(aes_cbc_cts_decrypt)
        adr_l           x8, .Lcts_permute_table
        sub             x4, x4, #16
        add             x9, x8, #32
        add             x8, x8, x4
        sub             x9, x9, x4
        ld1             {v3.16b}, [x8]
        ld1             {v4.16b}, [x9]

        ld1             {v0.16b}, [x1], x4              /* overlapping loads */
        ld1             {v1.16b}, [x1]

        ld1             {v5.16b}, [x5]                  /* get iv */
        dec_prepare     w3, x2, x6

        decrypt_block   v0, w3, x2, x6, w7
        tbl             v2.16b, {v0.16b}, v3.16b
        eor             v2.16b, v2.16b, v1.16b

        tbx             v0.16b, {v1.16b}, v4.16b
        decrypt_block   v0, w3, x2, x6, w7
        eor             v0.16b, v0.16b, v5.16b          /* xor with iv */

        add             x4, x0, x4
        st1             {v2.16b}, [x4]                  /* overlapping stores */
        st1             {v0.16b}, [x0]
        ret
AES_ENDPROC(aes_cbc_cts_decrypt)

        .section        ".rodata", "a"
        .align          6
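        /*
         * A sliding 16-byte window into this table, selected by the length
         * of the final partial block, forms the index vectors for the
         * tbl/tbx instructions in the CTS code: 0xff indices yield zero
         * bytes under tbl and leave the destination untouched under tbx,
         * while 0x0-0xf indices select bytes of the source register.
         */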
.Lcts_permute_table:
        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte            0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
        .byte            0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .previous


        /*
         * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
         *                 int blocks, u8 ctr[])
         */
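        /*
         * The 128-bit counter is big-endian in ctr[]/vctr. Its low 64 bits
         * are mirrored byte-swapped in x6 so they can be bumped with plain
         * adds; per-block counter values are formed by patching the
         * incremented low 32 bits into lane s[3] of copies of vctr. A
         * carry out of the low 64 bits takes the slow path at .Lctrcarry.
         */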

AES_ENTRY(aes_ctr_encrypt)
        stp             x29, x30, [sp, #-16]!
        mov             x29, sp

        enc_prepare     w3, x2, x6
        ld1             {vctr.16b}, [x5]

        umov            x6, vctr.d[1]           /* keep swabbed ctr in reg */
        rev             x6, x6
        cmn             w6, w4                  /* 32 bit overflow? */
        bcs             .Lctrloop
.LctrloopNx:
        subs            w4, w4, #MAX_STRIDE
        bmi             .Lctr1x
        add             w7, w6, #1
        mov             v0.16b, vctr.16b
        add             w8, w6, #2
        mov             v1.16b, vctr.16b
        add             w9, w6, #3
        mov             v2.16b, vctr.16b
        rev             w7, w7
        mov             v3.16b, vctr.16b
        rev             w8, w8
ST5(    mov             v4.16b, vctr.16b                )
        mov             v1.s[3], w7
        rev             w9, w9
ST5(    add             w10, w6, #4                     )
        mov             v2.s[3], w8
ST5(    rev             w10, w10                        )
        mov             v3.s[3], w9
ST5(    mov             v4.s[3], w10                    )
        ld1             {v5.16b-v7.16b}, [x1], #48      /* get 3 input blocks */
ST4(    bl              aes_encrypt_block4x             )
ST5(    bl              aes_encrypt_block5x             )
        eor             v0.16b, v5.16b, v0.16b
ST4(    ld1             {v5.16b}, [x1], #16             )
        eor             v1.16b, v6.16b, v1.16b
ST5(    ld1             {v5.16b-v6.16b}, [x1], #32      )
        eor             v2.16b, v7.16b, v2.16b
        eor             v3.16b, v5.16b, v3.16b
ST5(    eor             v4.16b, v6.16b, v4.16b          )
        st1             {v0.16b-v3.16b}, [x0], #64
ST5(    st1             {v4.16b}, [x0], #16             )
        add             x6, x6, #MAX_STRIDE
        rev             x7, x6
        ins             vctr.d[1], x7
        cbz             w4, .Lctrout
        b               .LctrloopNx
.Lctr1x:
        adds            w4, w4, #MAX_STRIDE
        beq             .Lctrout
.Lctrloop:
        mov             v0.16b, vctr.16b
        encrypt_block   v0, w3, x2, x8, w7

        adds            x6, x6, #1              /* increment BE ctr */
        rev             x7, x6
        ins             vctr.d[1], x7
        bcs             .Lctrcarry              /* overflow? */

.Lctrcarrydone:
        subs            w4, w4, #1
        bmi             .Lctrtailblock          /* blocks <0 means tail block */
        ld1             {v3.16b}, [x1], #16
        eor             v3.16b, v0.16b, v3.16b
        st1             {v3.16b}, [x0], #16
        bne             .Lctrloop

.Lctrout:
        st1             {vctr.16b}, [x5]        /* return next CTR value */
        ldp             x29, x30, [sp], #16
        ret

.Lctrtailblock:
        st1             {v0.16b}, [x0]
        b               .Lctrout

.Lctrcarry:
        umov            x7, vctr.d[0]           /* load upper word of ctr  */
        rev             x7, x7                  /* ... to handle the carry */
        add             x7, x7, #1
        rev             x7, x7
        ins             vctr.d[0], x7
        b               .Lctrcarrydone
AES_ENDPROC(aes_ctr_encrypt)


        /*
         * aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
         *                 int bytes, u8 const rk2[], u8 iv[], int first)
         * aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
         *                 int bytes, u8 const rk2[], u8 iv[], int first)
         */

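        /*
         * next_tweak computes \out = 2 * \in in GF(2^128), using the XTS
         * reduction polynomial x^128 + x^7 + x^2 + x + 1: the add doubles
         * each 64-bit lane, and the masked, swapped sign bits re-inject
         * what fell off the top: 0x87 into the low lane when bit 127 was
         * set, and 0x1 into the high lane as the carry out of bit 63.
         */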
        .macro          next_tweak, out, in, tmp
        sshr            \tmp\().2d,  \in\().2d,   #63
        and             \tmp\().16b, \tmp\().16b, xtsmask.16b
        add             \out\().2d,  \in\().2d,   \in\().2d
        ext             \tmp\().16b, \tmp\().16b, \tmp\().16b, #8
        eor             \out\().16b, \out\().16b, \tmp\().16b
        .endm

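        /*
         * After the uzp1 below, xtsmask holds 0x1 in its low 64-bit lane
         * and 0x87 in its high one, matching the per-lane feedback terms
         * used by next_tweak above.
         */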
        .macro          xts_load_mask, tmp
        movi            xtsmask.2s, #0x1
        movi            \tmp\().2s, #0x87
        uzp1            xtsmask.4s, xtsmask.4s, \tmp\().4s
        .endm

AES_ENTRY(aes_xts_encrypt)
        stp             x29, x30, [sp, #-16]!
        mov             x29, sp

        ld1             {v4.16b}, [x6]
        xts_load_mask   v8
        cbz             w7, .Lxtsencnotfirst

        enc_prepare     w3, x5, x8
        xts_cts_skip_tw w7, .LxtsencNx
        encrypt_block   v4, w3, x5, x8, w7              /* first tweak */
        enc_switch_key  w3, x2, x8
        b               .LxtsencNx

.Lxtsencnotfirst:
        enc_prepare     w3, x2, x8
.LxtsencloopNx:
        next_tweak      v4, v4, v8
.LxtsencNx:
        subs            w4, w4, #64
        bmi             .Lxtsenc1x
        ld1             {v0.16b-v3.16b}, [x1], #64      /* get 4 pt blocks */
        next_tweak      v5, v4, v8
        eor             v0.16b, v0.16b, v4.16b
        next_tweak      v6, v5, v8
        eor             v1.16b, v1.16b, v5.16b
        eor             v2.16b, v2.16b, v6.16b
        next_tweak      v7, v6, v8
        eor             v3.16b, v3.16b, v7.16b
        bl              aes_encrypt_block4x
        eor             v3.16b, v3.16b, v7.16b
        eor             v0.16b, v0.16b, v4.16b
        eor             v1.16b, v1.16b, v5.16b
        eor             v2.16b, v2.16b, v6.16b
        st1             {v0.16b-v3.16b}, [x0], #64
        mov             v4.16b, v7.16b
        cbz             w4, .Lxtsencret
        xts_reload_mask v8
        b               .LxtsencloopNx
.Lxtsenc1x:
        adds            w4, w4, #64
        beq             .Lxtsencout
        subs            w4, w4, #16
        bmi             .LxtsencctsNx
.Lxtsencloop:
        ld1             {v0.16b}, [x1], #16
.Lxtsencctsout:
        eor             v0.16b, v0.16b, v4.16b
        encrypt_block   v0, w3, x2, x8, w7
        eor             v0.16b, v0.16b, v4.16b
        cbz             w4, .Lxtsencout
        subs            w4, w4, #16
        next_tweak      v4, v4, v8
        bmi             .Lxtsenccts
        st1             {v0.16b}, [x0], #16
        b               .Lxtsencloop
.Lxtsencout:
        st1             {v0.16b}, [x0]
.Lxtsencret:
        st1             {v4.16b}, [x6]
        ldp             x29, x30, [sp], #16
        ret

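        /*
         * Encrypt-side ciphertext stealing: the ciphertext of the last
         * full block (in v0) is split via .Lcts_permute_table, its leading
         * bytes become the final partial ciphertext block, and the partial
         * plaintext merged into v0 goes back through .Lxtsencctsout for
         * one more encryption with the final tweak.
         */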
.LxtsencctsNx:
        mov             v0.16b, v3.16b
        sub             x0, x0, #16
.Lxtsenccts:
        adr_l           x8, .Lcts_permute_table

        add             x1, x1, w4, sxtw        /* rewind input pointer */
        add             w4, w4, #16             /* # bytes in final block */
        add             x9, x8, #32
        add             x8, x8, x4
        sub             x9, x9, x4
        add             x4, x0, x4              /* output address of final block */

        ld1             {v1.16b}, [x1]          /* load final block */
        ld1             {v2.16b}, [x8]
        ld1             {v3.16b}, [x9]

        tbl             v2.16b, {v0.16b}, v2.16b
        tbx             v0.16b, {v1.16b}, v3.16b
        st1             {v2.16b}, [x4]                  /* overlapping stores */
        mov             w4, wzr
        b               .Lxtsencctsout
AES_ENDPROC(aes_xts_encrypt)

AES_ENTRY(aes_xts_decrypt)
        stp             x29, x30, [sp, #-16]!
        mov             x29, sp

        /* subtract 16 bytes if we are doing CTS */
        sub             w8, w4, #0x10
        tst             w4, #0xf
        csel            w4, w4, w8, eq

        ld1             {v4.16b}, [x6]
        xts_load_mask   v8
        xts_cts_skip_tw w7, .Lxtsdecskiptw
        cbz             w7, .Lxtsdecnotfirst

        enc_prepare     w3, x5, x8
        encrypt_block   v4, w3, x5, x8, w7              /* first tweak */
.Lxtsdecskiptw:
        dec_prepare     w3, x2, x8
        b               .LxtsdecNx

.Lxtsdecnotfirst:
        dec_prepare     w3, x2, x8
.LxtsdecloopNx:
        next_tweak      v4, v4, v8
.LxtsdecNx:
        subs            w4, w4, #64
        bmi             .Lxtsdec1x
        ld1             {v0.16b-v3.16b}, [x1], #64      /* get 4 ct blocks */
        next_tweak      v5, v4, v8
        eor             v0.16b, v0.16b, v4.16b
        next_tweak      v6, v5, v8
        eor             v1.16b, v1.16b, v5.16b
        eor             v2.16b, v2.16b, v6.16b
        next_tweak      v7, v6, v8
        eor             v3.16b, v3.16b, v7.16b
        bl              aes_decrypt_block4x
        eor             v3.16b, v3.16b, v7.16b
        eor             v0.16b, v0.16b, v4.16b
        eor             v1.16b, v1.16b, v5.16b
        eor             v2.16b, v2.16b, v6.16b
        st1             {v0.16b-v3.16b}, [x0], #64
        mov             v4.16b, v7.16b
        cbz             w4, .Lxtsdecout
        xts_reload_mask v8
        b               .LxtsdecloopNx
.Lxtsdec1x:
        adds            w4, w4, #64
        beq             .Lxtsdecout
        subs            w4, w4, #16
.Lxtsdecloop:
        ld1             {v0.16b}, [x1], #16
        bmi             .Lxtsdeccts
.Lxtsdecctsout:
        eor             v0.16b, v0.16b, v4.16b
        decrypt_block   v0, w3, x2, x8, w7
        eor             v0.16b, v0.16b, v4.16b
        st1             {v0.16b}, [x0], #16
        cbz             w4, .Lxtsdecout
        subs            w4, w4, #16
        next_tweak      v4, v4, v8
        b               .Lxtsdecloop
.Lxtsdecout:
        st1             {v4.16b}, [x6]
        ldp             x29, x30, [sp], #16
        ret

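        /*
         * Decrypt-side ciphertext stealing needs the last two tweaks in
         * swapped order: the final full ciphertext block is decrypted here
         * with the next tweak (v5), and the reassembled block then loops
         * back through .Lxtsdecctsout to be decrypted with the preceding
         * tweak (v4).
         */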
.Lxtsdeccts:
        adr_l           x8, .Lcts_permute_table

        add             x1, x1, w4, sxtw        /* rewind input pointer */
        add             w4, w4, #16             /* # bytes in final block */
        add             x9, x8, #32
        add             x8, x8, x4
        sub             x9, x9, x4
        add             x4, x0, x4              /* output address of final block */

        next_tweak      v5, v4, v8

        ld1             {v1.16b}, [x1]          /* load final block */
        ld1             {v2.16b}, [x8]
        ld1             {v3.16b}, [x9]

        eor             v0.16b, v0.16b, v5.16b
        decrypt_block   v0, w3, x2, x8, w7
        eor             v0.16b, v0.16b, v5.16b

        tbl             v2.16b, {v0.16b}, v2.16b
        tbx             v0.16b, {v1.16b}, v3.16b

        st1             {v2.16b}, [x4]                  /* overlapping stores */
        mov             w4, wzr
        b               .Lxtsdecctsout
AES_ENDPROC(aes_xts_decrypt)

        /*
         * aes_mac_update(u8 const in[], u32 const rk[], int rounds,
         *                int blocks, u8 dg[], int enc_before, int enc_after)
         */
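        /*
         * CBC-MAC over the input: x0 - in, x1 - rk, w2 - rounds,
         * w3 - blocks, x4 - dg (the running digest), w5/w6 - flags.
         * A nonzero enc_before encrypts dg once on entry (a block left
         * pending by the previous call); enc_after selects whether the
         * final xor'ed block is encrypted or left pending for the next
         * call.
         */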
AES_ENTRY(aes_mac_update)
        frame_push      6

        mov             x19, x0
        mov             x20, x1
        mov             x21, x2
        mov             x22, x3
        mov             x23, x4
        mov             x24, x6

        ld1             {v0.16b}, [x23]                 /* get dg */
        enc_prepare     w2, x1, x7
        cbz             w5, .Lmacloop4x

        encrypt_block   v0, w2, x1, x7, w8

.Lmacloop4x:
        subs            w22, w22, #4
        bmi             .Lmac1x
        ld1             {v1.16b-v4.16b}, [x19], #64     /* get 4 pt blocks */
        eor             v0.16b, v0.16b, v1.16b          /* ..and xor with dg */
        encrypt_block   v0, w21, x20, x7, w8
        eor             v0.16b, v0.16b, v2.16b
        encrypt_block   v0, w21, x20, x7, w8
        eor             v0.16b, v0.16b, v3.16b
        encrypt_block   v0, w21, x20, x7, w8
        eor             v0.16b, v0.16b, v4.16b
        cmp             w22, wzr
        csinv           x5, x24, xzr, eq
        cbz             w5, .Lmacout
        encrypt_block   v0, w21, x20, x7, w8
        st1             {v0.16b}, [x23]                 /* return dg */
        cond_yield_neon .Lmacrestart
        b               .Lmacloop4x
.Lmac1x:
        add             w22, w22, #4
.Lmacloop:
        cbz             w22, .Lmacout
        ld1             {v1.16b}, [x19], #16            /* get next pt block */
        eor             v0.16b, v0.16b, v1.16b          /* ..and xor with dg */

        subs            w22, w22, #1
        csinv           x5, x24, xzr, eq
        cbz             w5, .Lmacout

.Lmacenc:
        encrypt_block   v0, w21, x20, x7, w8
        b               .Lmacloop

.Lmacout:
        st1             {v0.16b}, [x23]                 /* return dg */
        frame_pop
        ret

.Lmacrestart:
        ld1             {v0.16b}, [x23]                 /* get dg */
        enc_prepare     w21, x20, x0
        b               .Lmacloop4x
AES_ENDPROC(aes_mac_update)