/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Bit sliced AES using NEON instructions
 *
 * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
 *
 * The algorithm implemented here is described in detail by the paper
 * 'Faster and Timing-Attack Resistant AES-GCM' by Emilia Kaesper and
 * Peter Schwabe (https://eprint.iacr.org/2009/129.pdf)
 *
 * This implementation is based primarily on the OpenSSL implementation
 * for 32-bit ARM written by Andy Polyakov <appro@openssl.org>
 */
#include <linux/linkage.h>
#include <asm/assembler.h>
	.macro	in_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7

	.macro	out_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7

	.macro	inv_in_bs_ch, b6, b1, b2, b4, b7, b0, b3, b5

	.macro	inv_out_bs_ch, b6, b5, b0, b3, b7, b1, b4, b2
	.macro	mul_gf4, x0, x1, y0, y1, t0, t1

	.macro	mul_gf4_n_gf4, x0, x1, y0, y1, t0, x2, x3, y2, y3, t1
	.macro	mul_gf16_2, x0, x1, x2, x3, x4, x5, x6, x7, \
			    y0, y1, y2, y3, t0, t1, t2, t3
	mul_gf4		\x0, \x1, \y0, \y1, \t2, \t3
	mul_gf4_n_gf4	\t0, \t1, \y0, \y1, \t3, \x2, \x3, \y2, \y3, \t2
	mul_gf4_n_gf4	\t0, \t1, \y0, \y1, \t3, \x6, \x7, \y2, \y3, \t2
	mul_gf4		\x4, \x5, \y0, \y1, \t2, \t3
	.macro	inv_gf256, x0, x1, x2, x3, x4, x5, x6, x7, \
			   t0, t1, t2, t3, s0, s1, s2, s3
	mul_gf16_2	\x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
			\s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
	.endm
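/*
 * The GF(2^8) inversion above works in a tower of fields, GF(2^8) as
 * GF((2^4)^2) and GF(2^4) as GF((2^2)^2), so everything reduces to GF(4)
 * arithmetic on pairs of bit planes. As a rough sketch of the idea (one
 * common normal-basis formulation, not necessarily the exact gate
 * ordering used by mul_gf4), a GF(4) product costs three ANDs; each
 * C variable below stands in for a whole bit plane:
 *
 *	// (c1,c0) = (a1,a0) * (b1,b0) in GF(4), normal basis {v, v^2}
 *	static void gf4_mul(uint8_t a0, uint8_t a1, uint8_t b0, uint8_t b1,
 *			    uint8_t *c0, uint8_t *c1)
 *	{
 *		uint8_t t = (a0 ^ a1) & (b0 ^ b1);
 *
 *		*c0 = (a0 & b0) ^ t;
 *		*c1 = (a1 & b1) ^ t;
 *	}
 */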
	.macro	sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
		      t0, t1, t2, t3, s0, s1, s2, s3
	in_bs_ch	\b0\().16b, \b1\().16b, \b2\().16b, \b3\().16b, \
			\b4\().16b, \b5\().16b, \b6\().16b, \b7\().16b
	inv_gf256	\b6\().16b, \b5\().16b, \b0\().16b, \b3\().16b, \
			\b7\().16b, \b1\().16b, \b4\().16b, \b2\().16b, \
			\t0\().16b, \t1\().16b, \t2\().16b, \t3\().16b, \
			\s0\().16b, \s1\().16b, \s2\().16b, \s3\().16b
	out_bs_ch	\b7\().16b, \b1\().16b, \b4\().16b, \b2\().16b, \
			\b6\().16b, \b5\().16b, \b0\().16b, \b3\().16b
	.endm
	.macro	inv_sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
			  t0, t1, t2, t3, s0, s1, s2, s3
	inv_in_bs_ch	\b0\().16b, \b1\().16b, \b2\().16b, \b3\().16b, \
			\b4\().16b, \b5\().16b, \b6\().16b, \b7\().16b
	inv_gf256	\b5\().16b, \b1\().16b, \b2\().16b, \b6\().16b, \
			\b3\().16b, \b7\().16b, \b0\().16b, \b4\().16b, \
			\t0\().16b, \t1\().16b, \t2\().16b, \t3\().16b, \
			\s0\().16b, \s1\().16b, \s2\().16b, \s3\().16b
	inv_out_bs_ch	\b3\().16b, \b7\().16b, \b0\().16b, \b4\().16b, \
			\b5\().16b, \b1\().16b, \b2\().16b, \b6\().16b
	.endm
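/*
 * sbox/inv_sbox evaluate all 128 S-boxes of the eight interleaved states
 * at once as a boolean circuit over the bit planes. For comparison, a
 * byte-at-a-time C model of the forward S-box; gf256_inv() is a
 * hypothetical helper, not part of this file:
 *
 *	uint8_t gf256_inv(uint8_t x);	// inverse in GF(2^8), 0 -> 0
 *
 *	static uint8_t ror8(uint8_t b, int n)
 *	{
 *		return (b >> n) | (b << (8 - n));
 *	}
 *
 *	uint8_t aes_sbox(uint8_t x)
 *	{
 *		uint8_t y = gf256_inv(x);
 *
 *		// affine transform over GF(2)
 *		return y ^ ror8(y, 4) ^ ror8(y, 5) ^ ror8(y, 6) ^
 *		       ror8(y, 7) ^ 0x63;
 *	}
 */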
	ldp	q16, q17, [bskey], #128
	ldp	q18, q19, [bskey, #-96]
	ldp	q20, q21, [bskey, #-64]
	ldp	q22, q23, [bskey, #-32]

	ldp	q16, q17, [bskey, #-128]!
	ldp	q18, q19, [bskey, #32]
	ldp	q20, q21, [bskey, #64]
	ldp	q22, q23, [bskey, #96]
	.macro	add_round_key, x0, x1, x2, x3, x4, x5, x6, x7
	eor	\x0\().16b, \x0\().16b, v16.16b
	eor	\x1\().16b, \x1\().16b, v17.16b
	eor	\x2\().16b, \x2\().16b, v18.16b
	eor	\x3\().16b, \x3\().16b, v19.16b
	eor	\x4\().16b, \x4\().16b, v20.16b
	eor	\x5\().16b, \x5\().16b, v21.16b
	eor	\x6\().16b, \x6\().16b, v22.16b
	eor	\x7\().16b, \x7\().16b, v23.16b
	.endm
	.macro	shift_rows, x0, x1, x2, x3, x4, x5, x6, x7, mask
	tbl	\x0\().16b, {\x0\().16b}, \mask\().16b
	tbl	\x1\().16b, {\x1\().16b}, \mask\().16b
	tbl	\x2\().16b, {\x2\().16b}, \mask\().16b
	tbl	\x3\().16b, {\x3\().16b}, \mask\().16b
	tbl	\x4\().16b, {\x4\().16b}, \mask\().16b
	tbl	\x5\().16b, {\x5\().16b}, \mask\().16b
	tbl	\x6\().16b, {\x6\().16b}, \mask\().16b
	tbl	\x7\().16b, {\x7\().16b}, \mask\().16b
	.endm
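/*
 * ShiftRows is a fixed byte permutation, so each state vector can be
 * rearranged with a single tbl lookup through a mask such as the SR/ISR
 * constants defined below. A rough scalar model of what one tbl does
 * per 16-byte vector (out-of-range indices produce zero):
 *
 *	void tbl_16b(uint8_t out[16], const uint8_t in[16],
 *		     const uint8_t mask[16])
 *	{
 *		for (int i = 0; i < 16; i++)
 *			out[i] = mask[i] < 16 ? in[mask[i]] : 0;
 *	}
 */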
	.macro	mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
			  t0, t1, t2, t3, t4, t5, t6, t7, inv
	ext	\t0\().16b, \x0\().16b, \x0\().16b, #12
	ext	\t1\().16b, \x1\().16b, \x1\().16b, #12
	eor	\x0\().16b, \x0\().16b, \t0\().16b
	ext	\t2\().16b, \x2\().16b, \x2\().16b, #12
	eor	\x1\().16b, \x1\().16b, \t1\().16b
	ext	\t3\().16b, \x3\().16b, \x3\().16b, #12
	eor	\x2\().16b, \x2\().16b, \t2\().16b
	ext	\t4\().16b, \x4\().16b, \x4\().16b, #12
	eor	\x3\().16b, \x3\().16b, \t3\().16b
	ext	\t5\().16b, \x5\().16b, \x5\().16b, #12
	eor	\x4\().16b, \x4\().16b, \t4\().16b
	ext	\t6\().16b, \x6\().16b, \x6\().16b, #12
	eor	\x5\().16b, \x5\().16b, \t5\().16b
	ext	\t7\().16b, \x7\().16b, \x7\().16b, #12
	eor	\x6\().16b, \x6\().16b, \t6\().16b
	eor	\t1\().16b, \t1\().16b, \x0\().16b
	eor	\x7\().16b, \x7\().16b, \t7\().16b
	ext	\x0\().16b, \x0\().16b, \x0\().16b, #8
	eor	\t2\().16b, \t2\().16b, \x1\().16b
	eor	\t0\().16b, \t0\().16b, \x7\().16b
	eor	\t1\().16b, \t1\().16b, \x7\().16b
	ext	\x1\().16b, \x1\().16b, \x1\().16b, #8
	eor	\t5\().16b, \t5\().16b, \x4\().16b
	eor	\x0\().16b, \x0\().16b, \t0\().16b
	eor	\t6\().16b, \t6\().16b, \x5\().16b
	eor	\x1\().16b, \x1\().16b, \t1\().16b
	ext	\t0\().16b, \x4\().16b, \x4\().16b, #8
	eor	\t4\().16b, \t4\().16b, \x3\().16b
	ext	\t1\().16b, \x5\().16b, \x5\().16b, #8
	eor	\t7\().16b, \t7\().16b, \x6\().16b
	ext	\x4\().16b, \x3\().16b, \x3\().16b, #8
	eor	\t3\().16b, \t3\().16b, \x2\().16b
	ext	\x5\().16b, \x7\().16b, \x7\().16b, #8
	eor	\t4\().16b, \t4\().16b, \x7\().16b
	ext	\x3\().16b, \x6\().16b, \x6\().16b, #8
	eor	\t3\().16b, \t3\().16b, \x7\().16b
	ext	\x6\().16b, \x2\().16b, \x2\().16b, #8
	eor	\x7\().16b, \t1\().16b, \t5\().16b
	.ifb	\inv
	eor	\x2\().16b, \t0\().16b, \t4\().16b
	eor	\x4\().16b, \x4\().16b, \t3\().16b
	eor	\x5\().16b, \x5\().16b, \t7\().16b
	eor	\x3\().16b, \x3\().16b, \t6\().16b
	eor	\x6\().16b, \x6\().16b, \t2\().16b
	.else
	eor	\t3\().16b, \t3\().16b, \x4\().16b
	eor	\x5\().16b, \x5\().16b, \t7\().16b
	eor	\x2\().16b, \x3\().16b, \t6\().16b
	eor	\x3\().16b, \t0\().16b, \t4\().16b
	eor	\x4\().16b, \x6\().16b, \t2\().16b
	mov	\x6\().16b, \t3\().16b
	.endif
	.endm
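/*
 * In the bitsliced representation MixColumns becomes the ext/eor rotate
 * network above. The per-column operation it implements is the usual
 * GF(2^8) one; a plain C reference for a single column, using the
 * standard xtime() trick (multiplication by x mod x^8+x^4+x^3+x+1):
 *
 *	static uint8_t xtime(uint8_t b)
 *	{
 *		return (b << 1) ^ ((b >> 7) * 0x1b);
 *	}
 *
 *	void mix_one_column(uint8_t c[4])
 *	{
 *		uint8_t t = c[0] ^ c[1] ^ c[2] ^ c[3];
 *		uint8_t c0 = c[0];
 *
 *		c[0] ^= t ^ xtime(c[0] ^ c[1]);
 *		c[1] ^= t ^ xtime(c[1] ^ c[2]);
 *		c[2] ^= t ^ xtime(c[2] ^ c[3]);
 *		c[3] ^= t ^ xtime(c[3] ^ c0);
 *	}
 */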
	.macro	inv_mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
			      t0, t1, t2, t3, t4, t5, t6, t7
	ext	\t0\().16b, \x0\().16b, \x0\().16b, #8
	ext	\t6\().16b, \x6\().16b, \x6\().16b, #8
	ext	\t7\().16b, \x7\().16b, \x7\().16b, #8
	eor	\t0\().16b, \t0\().16b, \x0\().16b
	ext	\t1\().16b, \x1\().16b, \x1\().16b, #8
	eor	\t6\().16b, \t6\().16b, \x6\().16b
	ext	\t2\().16b, \x2\().16b, \x2\().16b, #8
	eor	\t7\().16b, \t7\().16b, \x7\().16b
	ext	\t3\().16b, \x3\().16b, \x3\().16b, #8
	eor	\t1\().16b, \t1\().16b, \x1\().16b
	ext	\t4\().16b, \x4\().16b, \x4\().16b, #8
	eor	\t2\().16b, \t2\().16b, \x2\().16b
	ext	\t5\().16b, \x5\().16b, \x5\().16b, #8
	eor	\t3\().16b, \t3\().16b, \x3\().16b
	eor	\t4\().16b, \t4\().16b, \x4\().16b
	eor	\t5\().16b, \t5\().16b, \x5\().16b
	eor	\x0\().16b, \x0\().16b, \t6\().16b
	eor	\x1\().16b, \x1\().16b, \t6\().16b
	eor	\x2\().16b, \x2\().16b, \t0\().16b
	eor	\x4\().16b, \x4\().16b, \t2\().16b
	eor	\x3\().16b, \x3\().16b, \t1\().16b
	eor	\x1\().16b, \x1\().16b, \t7\().16b
	eor	\x2\().16b, \x2\().16b, \t7\().16b
	eor	\x4\().16b, \x4\().16b, \t6\().16b
	eor	\x5\().16b, \x5\().16b, \t3\().16b
	eor	\x3\().16b, \x3\().16b, \t6\().16b
	eor	\x6\().16b, \x6\().16b, \t4\().16b
	eor	\x4\().16b, \x4\().16b, \t7\().16b
	eor	\x5\().16b, \x5\().16b, \t7\().16b
	eor	\x7\().16b, \x7\().16b, \t5\().16b
	mix_cols	\x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
			\t0, \t1, \t2, \t3, \t4, \t5, \t6, \t7, 1
	.endm
	.macro	swapmove_2x, a0, b0, a1, b1, n, mask, t0, t1
	ushr	\t0\().2d, \b0\().2d, #\n
	ushr	\t1\().2d, \b1\().2d, #\n
	eor	\t0\().16b, \t0\().16b, \a0\().16b
	eor	\t1\().16b, \t1\().16b, \a1\().16b
	and	\t0\().16b, \t0\().16b, \mask\().16b
	and	\t1\().16b, \t1\().16b, \mask\().16b
	eor	\a0\().16b, \a0\().16b, \t0\().16b
	shl	\t0\().2d, \t0\().2d, #\n
	eor	\a1\().16b, \a1\().16b, \t1\().16b
	shl	\t1\().2d, \t1\().2d, #\n
	eor	\b0\().16b, \b0\().16b, \t0\().16b
	eor	\b1\().16b, \b1\().16b, \t1\().16b
	.endm
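/*
 * swapmove_2x runs the classic SWAPMOVE bit-permutation step on two
 * register pairs at once: it exchanges the bits of b selected by mask
 * with the bits of a that sit n places to the left. A scalar C sketch
 * of a single SWAPMOVE over 64-bit words (matching the .2d shifts):
 *
 *	static void swapmove(uint64_t *a, uint64_t *b, int n, uint64_t mask)
 *	{
 *		uint64_t t = ((*b >> n) ^ *a) & mask;
 *
 *		*a ^= t;
 *		*b ^= t << n;
 *	}
 */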
	.macro	bitslice, x7, x6, x5, x4, x3, x2, x1, x0, t0, t1, t2, t3
	movi	\t0\().16b, #0x55
	movi	\t1\().16b, #0x33
	swapmove_2x \x0, \x1, \x2, \x3, 1, \t0, \t2, \t3
	swapmove_2x \x4, \x5, \x6, \x7, 1, \t0, \t2, \t3
	movi	\t0\().16b, #0x0f
	swapmove_2x \x0, \x2, \x1, \x3, 2, \t1, \t2, \t3
	swapmove_2x \x4, \x6, \x5, \x7, 2, \t1, \t2, \t3
	swapmove_2x \x0, \x4, \x1, \x5, 4, \t0, \t2, \t3
	swapmove_2x \x2, \x6, \x3, \x7, 4, \t0, \t2, \t3
	.endm
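/*
 * Three SWAPMOVE passes with masks 0x55, 0x33 and 0x0f transpose the
 * 8x8 bit matrix formed by the eight state registers, so that each
 * register afterwards holds one bit position of every byte. A byte-wide
 * C model of the same transposition, with swapmove8() standing for the
 * swapmove() sketch above narrowed to uint8_t:
 *
 *	void transpose8(uint8_t x[8])
 *	{
 *		swapmove8(&x[0], &x[1], 1, 0x55);
 *		swapmove8(&x[2], &x[3], 1, 0x55);
 *		swapmove8(&x[4], &x[5], 1, 0x55);
 *		swapmove8(&x[6], &x[7], 1, 0x55);
 *		swapmove8(&x[0], &x[2], 2, 0x33);
 *		swapmove8(&x[1], &x[3], 2, 0x33);
 *		swapmove8(&x[4], &x[6], 2, 0x33);
 *		swapmove8(&x[5], &x[7], 2, 0x33);
 *		swapmove8(&x[0], &x[4], 4, 0x0f);
 *		swapmove8(&x[1], &x[5], 4, 0x0f);
 *		swapmove8(&x[2], &x[6], 4, 0x0f);
 *		swapmove8(&x[3], &x[7], 4, 0x0f);
 *	}
 */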
M0:	.octa	0x0004080c0105090d02060a0e03070b0f

M0SR:	.octa	0x0004080c05090d010a0e02060f03070b
SR:	.octa	0x0f0e0d0c0a09080b0504070600030201
SRM0:	.octa	0x01060b0c0207080d0304090e00050a0f

M0ISR:	.octa	0x0004080c0d0105090a0e0206070b0f03
ISR:	.octa	0x0f0e0d0c080b0a090504070602010003
ISRM0:	.octa	0x0306090c00070a0d01040b0e0205080f
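/*
 * The .octa values above are tbl permutation masks: M0 moves bytes into
 * the bitsliced layout, SR/ISR apply (inverse) ShiftRows, and the
 * combined masks (M0SR, SRM0, M0ISR, ISRM0) are precomputed compositions
 * so a single tbl can apply both steps. Composing two masks in C
 * (illustrative helper, not part of this file):
 *
 *	// one lookup through m equals applying p first, then q
 *	void compose_masks(uint8_t m[16], const uint8_t p[16],
 *			   const uint8_t q[16])
 *	{
 *		for (int i = 0; i < 16; i++)
 *			m[i] = p[q[i]];
 *	}
 */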
/*
 * void aesbs_convert_key(u8 out[], u32 const rk[], int rounds)
 */
SYM_FUNC_START(aesbs_convert_key)
	ld1	{v7.4s}, [x1], #16		// load round 0 key
	ld1	{v17.4s}, [x1], #16		// load round 1 key

	movi	v8.16b, #0x01			// bit masks

	str	q7, [x0], #16			// save round 0 key

	tbl	v7.16b, {v17.16b}, v16.16b
	ld1	{v17.4s}, [x1], #16		// load next round key

	cmtst	v0.16b, v7.16b, v8.16b
	cmtst	v1.16b, v7.16b, v9.16b
	cmtst	v2.16b, v7.16b, v10.16b
	cmtst	v3.16b, v7.16b, v11.16b
	cmtst	v4.16b, v7.16b, v12.16b
	cmtst	v5.16b, v7.16b, v13.16b
	cmtst	v6.16b, v7.16b, v14.16b
	cmtst	v7.16b, v7.16b, v15.16b
	stp	q0, q1, [x0], #128
	stp	q2, q3, [x0, #-96]
	stp	q4, q5, [x0, #-64]
	stp	q6, q7, [x0, #-32]

	movi	v7.16b, #0x63			// compose .L63
	eor	v17.16b, v17.16b, v7.16b

SYM_FUNC_END(aesbs_convert_key)
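/*
 * The cmtst ladder in aesbs_convert_key spreads each round-key bit
 * across a full byte lane so the bitsliced round keys can be XORed
 * straight into the bit planes: cmtst writes all-ones to a lane when
 * (lane & mask) != 0. Scalar model of one lane (hypothetical helper,
 * mirroring cmtst semantics):
 *
 *	uint8_t cmtst8(uint8_t v, uint8_t mask)
 *	{
 *		return (v & mask) ? 0xff : 0x00;
 *	}
 */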
SYM_FUNC_START_LOCAL(aesbs_encrypt8)
	ldr	q9, [bskey], #16		// round 0 key

	eor	v10.16b, v0.16b, v9.16b		// xor with round0 key
	eor	v11.16b, v1.16b, v9.16b
	tbl	v0.16b, {v10.16b}, v8.16b
	eor	v12.16b, v2.16b, v9.16b
	tbl	v1.16b, {v11.16b}, v8.16b
	eor	v13.16b, v3.16b, v9.16b
	tbl	v2.16b, {v12.16b}, v8.16b
	eor	v14.16b, v4.16b, v9.16b
	tbl	v3.16b, {v13.16b}, v8.16b
	eor	v15.16b, v5.16b, v9.16b
	tbl	v4.16b, {v14.16b}, v8.16b
	eor	v10.16b, v6.16b, v9.16b
	tbl	v5.16b, {v15.16b}, v8.16b
	eor	v11.16b, v7.16b, v9.16b
	tbl	v6.16b, {v10.16b}, v8.16b
	tbl	v7.16b, {v11.16b}, v8.16b

	bitslice	v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11
	sub	rounds, rounds, #1

	shift_rows	v0, v1, v2, v3, v4, v5, v6, v7, v24
	sbox		v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, \
			v12, v13, v14, v15
	subs	rounds, rounds, #1
	mix_cols	v0, v1, v4, v6, v3, v7, v2, v5, v8, v9, v10, v11, \
			v12, v13, v14, v15

	add_round_key	v0, v1, v2, v3, v4, v5, v6, v7
	ldr	q12, [bskey]			// last round key

	bitslice	v0, v1, v4, v6, v3, v7, v2, v5, v8, v9, v10, v11

	eor	v0.16b, v0.16b, v12.16b
	eor	v1.16b, v1.16b, v12.16b
	eor	v4.16b, v4.16b, v12.16b
	eor	v6.16b, v6.16b, v12.16b
	eor	v3.16b, v3.16b, v12.16b
	eor	v7.16b, v7.16b, v12.16b
	eor	v2.16b, v2.16b, v12.16b
	eor	v5.16b, v5.16b, v12.16b
	ret
SYM_FUNC_END(aesbs_encrypt8)
SYM_FUNC_START_LOCAL(aesbs_decrypt8)
	ldr	q9, [bskey, #-112]!		// round 0 key

	eor	v10.16b, v0.16b, v9.16b		// xor with round0 key
	eor	v11.16b, v1.16b, v9.16b
	tbl	v0.16b, {v10.16b}, v8.16b
	eor	v12.16b, v2.16b, v9.16b
	tbl	v1.16b, {v11.16b}, v8.16b
	eor	v13.16b, v3.16b, v9.16b
	tbl	v2.16b, {v12.16b}, v8.16b
	eor	v14.16b, v4.16b, v9.16b
	tbl	v3.16b, {v13.16b}, v8.16b
	eor	v15.16b, v5.16b, v9.16b
	tbl	v4.16b, {v14.16b}, v8.16b
	eor	v10.16b, v6.16b, v9.16b
	tbl	v5.16b, {v15.16b}, v8.16b
	eor	v11.16b, v7.16b, v9.16b
	tbl	v6.16b, {v10.16b}, v8.16b
	tbl	v7.16b, {v11.16b}, v8.16b

	bitslice	v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11
	sub	rounds, rounds, #1

	shift_rows	v0, v1, v2, v3, v4, v5, v6, v7, v24
	inv_sbox	v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, \
			v12, v13, v14, v15
	subs	rounds, rounds, #1

	add_round_key	v0, v1, v6, v4, v2, v7, v3, v5

	inv_mix_cols	v0, v1, v6, v4, v2, v7, v3, v5, v8, v9, v10, v11, \
			v12, v13, v14, v15
	ldr	q12, [bskey, #-16]		// last round key

	bitslice	v0, v1, v6, v4, v2, v7, v3, v5, v8, v9, v10, v11

	eor	v0.16b, v0.16b, v12.16b
	eor	v1.16b, v1.16b, v12.16b
	eor	v6.16b, v6.16b, v12.16b
	eor	v4.16b, v4.16b, v12.16b
	eor	v2.16b, v2.16b, v12.16b
	eor	v7.16b, v7.16b, v12.16b
	eor	v3.16b, v3.16b, v12.16b
	eor	v5.16b, v5.16b, v12.16b
	ret
SYM_FUNC_END(aesbs_decrypt8)
/*
 * aesbs_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 *		     int blocks)
 * aesbs_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 *		     int blocks)
 */
	.macro	__ecb_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7

	csel	x23, x23, xzr, pl

	ld1	{v0.16b}, [x20], #16
	ld1	{v1.16b}, [x20], #16
	ld1	{v2.16b}, [x20], #16
	ld1	{v3.16b}, [x20], #16
	ld1	{v4.16b}, [x20], #16
	ld1	{v5.16b}, [x20], #16
	ld1	{v6.16b}, [x20], #16
	ld1	{v7.16b}, [x20], #16

	st1	{\o0\().16b}, [x19], #16
	st1	{\o1\().16b}, [x19], #16
	st1	{\o2\().16b}, [x19], #16
	st1	{\o3\().16b}, [x19], #16
	st1	{\o4\().16b}, [x19], #16
	st1	{\o5\().16b}, [x19], #16
	st1	{\o6\().16b}, [x19], #16
	st1	{\o7\().16b}, [x19], #16
SYM_FUNC_START(aesbs_ecb_encrypt)
	__ecb_crypt	aesbs_encrypt8, v0, v1, v4, v6, v3, v7, v2, v5
SYM_FUNC_END(aesbs_ecb_encrypt)

SYM_FUNC_START(aesbs_ecb_decrypt)
	__ecb_crypt	aesbs_decrypt8, v0, v1, v6, v4, v2, v7, v3, v5
SYM_FUNC_END(aesbs_ecb_decrypt)
/*
 * aesbs_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 *		     int blocks, u8 iv[])
 */
SYM_FUNC_START(aesbs_cbc_decrypt)

	csel	x23, x23, xzr, pl

	ld1	{v0.16b}, [x20], #16
	ld1	{v1.16b}, [x20], #16
	ld1	{v2.16b}, [x20], #16
	ld1	{v3.16b}, [x20], #16
	ld1	{v4.16b}, [x20], #16
	ld1	{v5.16b}, [x20], #16
	ld1	{v6.16b}, [x20], #16

	ld1	{v24.16b}, [x24]		// load IV

	eor	v1.16b, v1.16b, v25.16b
	eor	v6.16b, v6.16b, v26.16b
	eor	v4.16b, v4.16b, v27.16b
	eor	v2.16b, v2.16b, v28.16b
	eor	v7.16b, v7.16b, v29.16b
	eor	v0.16b, v0.16b, v24.16b
	eor	v3.16b, v3.16b, v30.16b
	eor	v5.16b, v5.16b, v31.16b

	st1	{v0.16b}, [x19], #16
	st1	{v1.16b}, [x19], #16
	st1	{v6.16b}, [x19], #16
	st1	{v4.16b}, [x19], #16
	st1	{v2.16b}, [x19], #16
	st1	{v7.16b}, [x19], #16
	st1	{v3.16b}, [x19], #16
	ld1	{v24.16b}, [x20], #16
	st1	{v5.16b}, [x19], #16
1:	st1	{v24.16b}, [x24]		// store IV

SYM_FUNC_END(aesbs_cbc_decrypt)
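/*
 * The interleaving above is just standard CBC decryption, eight blocks
 * per iteration. A one-block-at-a-time C reference, assuming a
 * hypothetical single-block primitive aes_decrypt(key, dst, src):
 *
 *	void cbc_decrypt(uint8_t *out, const uint8_t *in, int blocks,
 *			 uint8_t iv[16], const void *key)
 *	{
 *		while (blocks--) {
 *			uint8_t save[16];
 *
 *			memcpy(save, in, 16);	// ciphertext becomes next IV
 *			aes_decrypt(key, out, in);
 *			for (int i = 0; i < 16; i++)
 *				out[i] ^= iv[i];
 *			memcpy(iv, save, 16);
 *			in += 16;
 *			out += 16;
 *		}
 *	}
 */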
	.macro	next_tweak, out, in, const, tmp
	sshr	\tmp\().2d, \in\().2d, #63
	and	\tmp\().16b, \tmp\().16b, \const\().16b
	add	\out\().2d, \in\().2d, \in\().2d
	ext	\tmp\().16b, \tmp\().16b, \tmp\().16b, #8
	eor	\out\().16b, \out\().16b, \tmp\().16b
	.endm
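/*
 * next_tweak multiplies the 128-bit tweak by x in GF(2^128) modulo
 * x^128 + x^7 + x^2 + x + 1: the whole value is shifted left one bit
 * and, when a carry falls out of bit 127, 0x87 is folded back into the
 * low byte. C sketch over two little-endian 64-bit halves:
 *
 *	void xts_next_tweak(uint64_t t[2])
 *	{
 *		uint64_t carry = t[1] >> 63;
 *
 *		t[1] = (t[1] << 1) | (t[0] >> 63);
 *		t[0] = (t[0] << 1) ^ (carry * 0x87);
 *	}
 */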
/*
 * aesbs_xts_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 *		     int blocks, u8 iv[])
 * aesbs_xts_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 *		     int blocks, u8 iv[])
 */
SYM_FUNC_START_LOCAL(__xts_crypt8)

	csel	x23, x23, xzr, pl

	ld1	{v0.16b}, [x20], #16
	next_tweak	v26, v25, v30, v31
	eor	v0.16b, v0.16b, v25.16b

	ld1	{v1.16b}, [x20], #16
	next_tweak	v27, v26, v30, v31
	eor	v1.16b, v1.16b, v26.16b

	ld1	{v2.16b}, [x20], #16
	next_tweak	v28, v27, v30, v31
	eor	v2.16b, v2.16b, v27.16b

	ld1	{v3.16b}, [x20], #16
	next_tweak	v29, v28, v30, v31
	eor	v3.16b, v3.16b, v28.16b

	ld1	{v4.16b}, [x20], #16
	str	q29, [sp, #.Lframe_local_offset]
	eor	v4.16b, v4.16b, v29.16b
	next_tweak	v29, v29, v30, v31

	ld1	{v5.16b}, [x20], #16
	str	q29, [sp, #.Lframe_local_offset + 16]
	eor	v5.16b, v5.16b, v29.16b
	next_tweak	v29, v29, v30, v31

	ld1	{v6.16b}, [x20], #16
	str	q29, [sp, #.Lframe_local_offset + 32]
	eor	v6.16b, v6.16b, v29.16b
	next_tweak	v29, v29, v30, v31

	ld1	{v7.16b}, [x20], #16
	str	q29, [sp, #.Lframe_local_offset + 48]
	eor	v7.16b, v7.16b, v29.16b
	next_tweak	v29, v29, v30, v31

SYM_FUNC_END(__xts_crypt8)
	.macro	__xts_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7

	uzp1	v30.4s, v30.4s, v25.4s

	ldp	q16, q17, [sp, #.Lframe_local_offset]
	ldp	q18, q19, [sp, #.Lframe_local_offset + 32]

	eor	\o0\().16b, \o0\().16b, v25.16b
	eor	\o1\().16b, \o1\().16b, v26.16b
	eor	\o2\().16b, \o2\().16b, v27.16b
	eor	\o3\().16b, \o3\().16b, v28.16b

	st1	{\o0\().16b}, [x19], #16
	st1	{\o1\().16b}, [x19], #16
	st1	{\o2\().16b}, [x19], #16
	st1	{\o3\().16b}, [x19], #16

	eor	\o4\().16b, \o4\().16b, v16.16b
	eor	\o5\().16b, \o5\().16b, v17.16b
	eor	\o6\().16b, \o6\().16b, v18.16b
	eor	\o7\().16b, \o7\().16b, v19.16b

	st1	{\o4\().16b}, [x19], #16
	st1	{\o5\().16b}, [x19], #16
	st1	{\o6\().16b}, [x19], #16
	st1	{\o7\().16b}, [x19], #16

1:	st1	{v25.16b}, [x24]
SYM_FUNC_START(aesbs_xts_encrypt)
	__xts_crypt	aesbs_encrypt8, v0, v1, v4, v6, v3, v7, v2, v5
SYM_FUNC_END(aesbs_xts_encrypt)

SYM_FUNC_START(aesbs_xts_decrypt)
	__xts_crypt	aesbs_decrypt8, v0, v1, v6, v4, v2, v7, v3, v5
SYM_FUNC_END(aesbs_xts_decrypt)
	rev64	\v\().16b, \v\().16b
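/*
 * The counter is kept as a native 128-bit integer in a pair of general
 * purpose registers and only byte-reversed (rev64 above) when a counter
 * block is materialised in a vector register. C model of one increment,
 * with hi and lo standing for the two halves:
 *
 *	void ctr128_inc(uint64_t *hi, uint64_t *lo)
 *	{
 *		if (++*lo == 0)		// carry out of the low half
 *			(*hi)++;
 *	}
 */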
/*
 * aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
 *		     int rounds, int blocks, u8 iv[], u8 final[])
 */
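/*
 * For reference, the function below implements standard CTR mode, eight
 * blocks per iteration. One-block C model, with aes_encrypt() as a
 * hypothetical single-block primitive and iv[] holding the big-endian
 * counter block:
 *
 *	void ctr_encrypt(uint8_t *out, const uint8_t *in, int blocks,
 *			 uint8_t iv[16], const void *key)
 *	{
 *		while (blocks--) {
 *			uint8_t ks[16];
 *
 *			aes_encrypt(key, ks, iv);	// keystream block
 *			for (int i = 0; i < 16; i++)
 *				out[i] = in[i] ^ ks[i];
 *			for (int i = 15; i >= 0; i--)	// big-endian ++
 *				if (++iv[i])
 *					break;
 *			in += 16;
 *			out += 16;
 *		}
 *	}
 */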
SYM_FUNC_START(aesbs_ctr_encrypt)

	add	x23, x23, x26		// do one extra block if final

	csel	x23, x23, xzr, pl

	lsr	x9, x9, x26		// disregard the extra block

	ld1	{v8.16b}, [x20], #16
	eor	v0.16b, v0.16b, v8.16b
	st1	{v0.16b}, [x19], #16

	ld1	{v9.16b}, [x20], #16
	eor	v1.16b, v1.16b, v9.16b
	st1	{v1.16b}, [x19], #16

	ld1	{v10.16b}, [x20], #16
	eor	v4.16b, v4.16b, v10.16b
	st1	{v4.16b}, [x19], #16

	ld1	{v11.16b}, [x20], #16
	eor	v6.16b, v6.16b, v11.16b
	st1	{v6.16b}, [x19], #16

	ld1	{v12.16b}, [x20], #16
	eor	v3.16b, v3.16b, v12.16b
	st1	{v3.16b}, [x19], #16

	ld1	{v13.16b}, [x20], #16
	eor	v7.16b, v7.16b, v13.16b
	st1	{v7.16b}, [x19], #16

	ld1	{v14.16b}, [x20], #16
	eor	v2.16b, v2.16b, v14.16b
	st1	{v2.16b}, [x19], #16

	ld1	{v15.16b}, [x20], #16
	eor	v5.16b, v5.16b, v15.16b
	st1	{v5.16b}, [x19], #16
	/*
	 * If we are handling the tail of the input (x6 != NULL), return the
	 * final keystream block back to the caller.
	 */
SYM_FUNC_END(aesbs_ctr_encrypt)