/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * linux/arch/arm64/crypto/aes-modes.S - chaining mode wrappers for AES
 *
 * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
 */

/* included by aes-ce.S and aes-neon.S */

        .text
        .align          4

#ifndef MAX_STRIDE
#define MAX_STRIDE      4
#endif

#if MAX_STRIDE == 4
#define ST4(x...) x
#define ST5(x...)
#else
#define ST4(x...)
#define ST5(x...) x
#endif

SYM_FUNC_START_LOCAL(aes_encrypt_block4x)
        encrypt_block4x v0, v1, v2, v3, w3, x2, x8, w7
        ret
SYM_FUNC_END(aes_encrypt_block4x)

SYM_FUNC_START_LOCAL(aes_decrypt_block4x)
        decrypt_block4x v0, v1, v2, v3, w3, x2, x8, w7
        ret
SYM_FUNC_END(aes_decrypt_block4x)

#if MAX_STRIDE == 5
SYM_FUNC_START_LOCAL(aes_encrypt_block5x)
        encrypt_block5x v0, v1, v2, v3, v4, w3, x2, x8, w7
        ret
SYM_FUNC_END(aes_encrypt_block5x)

SYM_FUNC_START_LOCAL(aes_decrypt_block5x)
        decrypt_block5x v0, v1, v2, v3, v4, w3, x2, x8, w7
        ret
SYM_FUNC_END(aes_decrypt_block5x)
#endif

        /*
         * aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
         *                 int blocks)
         * aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
         *                 int blocks)
         */

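        /*
         * For reference, a rough C model of what the ECB routines compute
         * (illustrative only; aes_encrypt_one() stands in for a single-block
         * encryption and is not a kernel API):
         *
         *	void ecb_encrypt(u8 *out, const u8 *in, int blocks)
         *	{
         *		while (blocks--) {
         *			aes_encrypt_one(out, in);	// one 16-byte block
         *			in += 16;
         *			out += 16;
         *		}
         *	}
         *
         * The code below merely unrolls this loop by MAX_STRIDE blocks so that
         * the AES instructions of independent blocks can be interleaved.
         */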
AES_FUNC_START(aes_ecb_encrypt)
        frame_push      0

        enc_prepare     w3, x2, x5

.LecbencloopNx:
        subs            w4, w4, #MAX_STRIDE
        bmi             .Lecbenc1x
        ld1             {v0.16b-v3.16b}, [x1], #64      /* get 4 pt blocks */
ST4(    bl              aes_encrypt_block4x             )
ST5(    ld1             {v4.16b}, [x1], #16             )
ST5(    bl              aes_encrypt_block5x             )
        st1             {v0.16b-v3.16b}, [x0], #64
ST5(    st1             {v4.16b}, [x0], #16             )
        b               .LecbencloopNx
.Lecbenc1x:
        adds            w4, w4, #MAX_STRIDE
        beq             .Lecbencout
.Lecbencloop:
        ld1             {v0.16b}, [x1], #16             /* get next pt block */
        encrypt_block   v0, w3, x2, x5, w6
        st1             {v0.16b}, [x0], #16
        subs            w4, w4, #1
        bne             .Lecbencloop
.Lecbencout:
        frame_pop
        ret
AES_FUNC_END(aes_ecb_encrypt)


AES_FUNC_START(aes_ecb_decrypt)
        frame_push      0

        dec_prepare     w3, x2, x5

.LecbdecloopNx:
        subs            w4, w4, #MAX_STRIDE
        bmi             .Lecbdec1x
        ld1             {v0.16b-v3.16b}, [x1], #64      /* get 4 ct blocks */
ST4(    bl              aes_decrypt_block4x             )
ST5(    ld1             {v4.16b}, [x1], #16             )
ST5(    bl              aes_decrypt_block5x             )
        st1             {v0.16b-v3.16b}, [x0], #64
ST5(    st1             {v4.16b}, [x0], #16             )
        b               .LecbdecloopNx
.Lecbdec1x:
        adds            w4, w4, #MAX_STRIDE
        beq             .Lecbdecout
.Lecbdecloop:
        ld1             {v0.16b}, [x1], #16             /* get next ct block */
        decrypt_block   v0, w3, x2, x5, w6
        st1             {v0.16b}, [x0], #16
        subs            w4, w4, #1
        bne             .Lecbdecloop
.Lecbdecout:
        frame_pop
        ret
AES_FUNC_END(aes_ecb_decrypt)


        /*
         * aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
         *                 int blocks, u8 iv[])
         * aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
         *                 int blocks, u8 iv[])
         * aes_essiv_cbc_encrypt(u8 out[], u8 const in[], u32 const rk1[],
         *                       int rounds, int blocks, u8 iv[],
         *                       u32 const rk2[]);
         * aes_essiv_cbc_decrypt(u8 out[], u8 const in[], u32 const rk1[],
         *                       int rounds, int blocks, u8 iv[],
         *                       u32 const rk2[]);
         */

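        /*
         * Rough C model of the CBC chaining below (illustrative only; the
         * helpers are stand-ins, not kernel APIs):
         *
         *	void cbc_encrypt(u8 *out, const u8 *in, int blocks, u8 *iv)
         *	{
         *		while (blocks--) {
         *			for (int j = 0; j < 16; j++)
         *				out[j] = in[j] ^ iv[j];
         *			aes_encrypt_one(out, out);
         *			iv = out;		// ct becomes the next iv
         *			in += 16;
         *			out += 16;
         *		}
         *	}
         *
         * Encryption is inherently serial, so the 4-way unrolled loop below
         * still encrypts one block at a time.  Decryption has no such chain,
         * which is why the decrypt path can use the 4x/5x interleaved helpers.
         * The ESSIV variants differ only in the first iv: it is encrypted with
         * AES-256 under the second key (rk2) before use.
         */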
AES_FUNC_START(aes_essiv_cbc_encrypt)
        ld1             {v4.16b}, [x5]                  /* get iv */

        mov             w8, #14                         /* AES-256: 14 rounds */
        enc_prepare     w8, x6, x7
        encrypt_block   v4, w8, x6, x7, w9
        enc_switch_key  w3, x2, x6
        b               .Lcbcencloop4x

AES_FUNC_START(aes_cbc_encrypt)
        ld1             {v4.16b}, [x5]                  /* get iv */
        enc_prepare     w3, x2, x6

.Lcbcencloop4x:
        subs            w4, w4, #4
        bmi             .Lcbcenc1x
        ld1             {v0.16b-v3.16b}, [x1], #64      /* get 4 pt blocks */
        eor             v0.16b, v0.16b, v4.16b          /* ..and xor with iv */
        encrypt_block   v0, w3, x2, x6, w7
        eor             v1.16b, v1.16b, v0.16b
        encrypt_block   v1, w3, x2, x6, w7
        eor             v2.16b, v2.16b, v1.16b
        encrypt_block   v2, w3, x2, x6, w7
        eor             v3.16b, v3.16b, v2.16b
        encrypt_block   v3, w3, x2, x6, w7
        st1             {v0.16b-v3.16b}, [x0], #64
        mov             v4.16b, v3.16b
        b               .Lcbcencloop4x
.Lcbcenc1x:
        adds            w4, w4, #4
        beq             .Lcbcencout
.Lcbcencloop:
        ld1             {v0.16b}, [x1], #16             /* get next pt block */
        eor             v4.16b, v4.16b, v0.16b          /* ..and xor with iv */
        encrypt_block   v4, w3, x2, x6, w7
        st1             {v4.16b}, [x0], #16
        subs            w4, w4, #1
        bne             .Lcbcencloop
.Lcbcencout:
        st1             {v4.16b}, [x5]                  /* return iv */
        ret
AES_FUNC_END(aes_cbc_encrypt)
AES_FUNC_END(aes_essiv_cbc_encrypt)

AES_FUNC_START(aes_essiv_cbc_decrypt)
        ld1             {cbciv.16b}, [x5]               /* get iv */

        mov             w8, #14                         /* AES-256: 14 rounds */
        enc_prepare     w8, x6, x7
        encrypt_block   cbciv, w8, x6, x7, w9
        b               .Lessivcbcdecstart

AES_FUNC_START(aes_cbc_decrypt)
        ld1             {cbciv.16b}, [x5]               /* get iv */
.Lessivcbcdecstart:
        frame_push      0
        dec_prepare     w3, x2, x6

.LcbcdecloopNx:
        subs            w4, w4, #MAX_STRIDE
        bmi             .Lcbcdec1x
        ld1             {v0.16b-v3.16b}, [x1], #64      /* get 4 ct blocks */
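        /*
         * CBC decryption xors each decrypted block with the previous
         * ciphertext block, so keep copies of the ciphertext around in spare
         * registers before the in-place decryption overwrites it.
         */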
#if MAX_STRIDE == 5
        ld1             {v4.16b}, [x1], #16             /* get 1 ct block */
        mov             v5.16b, v0.16b
        mov             v6.16b, v1.16b
        mov             v7.16b, v2.16b
        bl              aes_decrypt_block5x
        sub             x1, x1, #32
        eor             v0.16b, v0.16b, cbciv.16b
        eor             v1.16b, v1.16b, v5.16b
        ld1             {v5.16b}, [x1], #16             /* reload 1 ct block */
        ld1             {cbciv.16b}, [x1], #16          /* reload 1 ct block */
        eor             v2.16b, v2.16b, v6.16b
        eor             v3.16b, v3.16b, v7.16b
        eor             v4.16b, v4.16b, v5.16b
#else
        mov             v4.16b, v0.16b
        mov             v5.16b, v1.16b
        mov             v6.16b, v2.16b
        bl              aes_decrypt_block4x
        sub             x1, x1, #16
        eor             v0.16b, v0.16b, cbciv.16b
        eor             v1.16b, v1.16b, v4.16b
        ld1             {cbciv.16b}, [x1], #16          /* reload 1 ct block */
        eor             v2.16b, v2.16b, v5.16b
        eor             v3.16b, v3.16b, v6.16b
#endif
        st1             {v0.16b-v3.16b}, [x0], #64
ST5(    st1             {v4.16b}, [x0], #16             )
        b               .LcbcdecloopNx
.Lcbcdec1x:
        adds            w4, w4, #MAX_STRIDE
        beq             .Lcbcdecout
.Lcbcdecloop:
        ld1             {v1.16b}, [x1], #16             /* get next ct block */
        mov             v0.16b, v1.16b                  /* ...and copy to v0 */
        decrypt_block   v0, w3, x2, x6, w7
        eor             v0.16b, v0.16b, cbciv.16b       /* xor with iv => pt */
        mov             cbciv.16b, v1.16b               /* ct is next iv */
        st1             {v0.16b}, [x0], #16
        subs            w4, w4, #1
        bne             .Lcbcdecloop
.Lcbcdecout:
        st1             {cbciv.16b}, [x5]               /* return iv */
        frame_pop
        ret
AES_FUNC_END(aes_cbc_decrypt)
AES_FUNC_END(aes_essiv_cbc_decrypt)


        /*
         * aes_cbc_cts_encrypt(u8 out[], u8 const in[], u32 const rk[],
         *                     int rounds, int bytes, u8 const iv[])
         * aes_cbc_cts_decrypt(u8 out[], u8 const in[], u32 const rk[],
         *                     int rounds, int bytes, u8 const iv[])
         */

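        /*
         * These two routines implement ciphertext stealing (CTS) for the final
         * 17 to 32 bytes of a CBC message, with the last two blocks emitted in
         * swapped order.  A rough C model of the encrypt side (illustrative
         * only; aes_encrypt_one() is a stand-in, t is the 1..16 bytes of the
         * final block):
         *
         *	void cbc_cts_encrypt(u8 *out, const u8 *in, int bytes,
         *			     const u8 *iv)
         *	{
         *		int t = bytes - 16;
         *		u8 prev[16], last[16];
         *
         *		for (int j = 0; j < 16; j++)
         *			prev[j] = in[j] ^ iv[j];
         *		aes_encrypt_one(prev, prev);	// C[n-1]
         *		for (int j = 0; j < 16; j++)	// steal C[n-1]'s tail
         *			last[j] = (j < t ? in[16 + j] : 0) ^ prev[j];
         *		aes_encrypt_one(last, last);	// C[n]
         *		memcpy(out, last, 16);		// swapped order
         *		memcpy(out + 16, prev, t);
         *	}
         *
         * The assembly gets the same effect branch-free via the permute table
         * below plus overlapping loads and stores.
         */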
AES_FUNC_START(aes_cbc_cts_encrypt)
        adr_l           x8, .Lcts_permute_table
        sub             x4, x4, #16
        add             x9, x8, #32
        add             x8, x8, x4
        sub             x9, x9, x4
        ld1             {v3.16b}, [x8]
        ld1             {v4.16b}, [x9]

        ld1             {v0.16b}, [x1], x4              /* overlapping loads */
        ld1             {v1.16b}, [x1]

        ld1             {v5.16b}, [x5]                  /* get iv */
        enc_prepare     w3, x2, x6

        eor             v0.16b, v0.16b, v5.16b          /* xor with iv */
        tbl             v1.16b, {v1.16b}, v4.16b
        encrypt_block   v0, w3, x2, x6, w7

        eor             v1.16b, v1.16b, v0.16b
        tbl             v0.16b, {v0.16b}, v3.16b
        encrypt_block   v1, w3, x2, x6, w7

        add             x4, x0, x4
        st1             {v0.16b}, [x4]                  /* overlapping stores */
        st1             {v1.16b}, [x0]
        ret
AES_FUNC_END(aes_cbc_cts_encrypt)

AES_FUNC_START(aes_cbc_cts_decrypt)
        adr_l           x8, .Lcts_permute_table
        sub             x4, x4, #16
        add             x9, x8, #32
        add             x8, x8, x4
        sub             x9, x9, x4
        ld1             {v3.16b}, [x8]
        ld1             {v4.16b}, [x9]

        ld1             {v0.16b}, [x1], x4              /* overlapping loads */
        ld1             {v1.16b}, [x1]

        ld1             {v5.16b}, [x5]                  /* get iv */
        dec_prepare     w3, x2, x6

        decrypt_block   v0, w3, x2, x6, w7
        tbl             v2.16b, {v0.16b}, v3.16b
        eor             v2.16b, v2.16b, v1.16b

        tbx             v0.16b, {v1.16b}, v4.16b
        decrypt_block   v0, w3, x2, x6, w7
        eor             v0.16b, v0.16b, v5.16b          /* xor with iv */

        add             x4, x0, x4
        st1             {v2.16b}, [x4]                  /* overlapping stores */
        st1             {v0.16b}, [x0]
        ret
AES_FUNC_END(aes_cbc_cts_decrypt)

        .section        ".rodata", "a"
        .align          6
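        /*
         * Permutation vectors for the tbl/tbx-based CTS and tail handling
         * above: a 16-byte read at offset t (1 <= t <= 16) gives a vector
         * that shifts the source up by 16 - t bytes, and a read at offset
         * 32 - t shifts it down by the same amount.  The 0xff entries produce
         * zero bytes with tbl and leave the destination untouched with tbx.
         */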
.Lcts_permute_table:
        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte            0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
        .byte            0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .previous

        /*
         * This macro generates the code for CTR and XCTR mode.
         */
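        /*
         * For reference, how the two modes form the counter block for the
         * 0-based block index i, in rough C (illustrative only; these helpers
         * are not kernel APIs):
         *
         *	// CTR: the 16-byte IV is one 128-bit big-endian counter.
         *	void ctr_block(u8 b[16], const u8 iv[16], u64 i)
         *	{
         *		memcpy(b, iv, 16);
         *		for (int j = 15; i && j >= 0; j--) {	// b += i
         *			u64 s = b[j] + (i & 0xff);
         *			b[j] = s;
         *			i = (i >> 8) + (s >> 8);	// carry
         *		}
         *	}
         *
         *	// XCTR: xor a little-endian block counter (starting at 1)
         *	// into the low 64 bits of the IV.
         *	void xctr_block(u8 b[16], const u8 iv[16], u64 i)
         *	{
         *		memcpy(b, iv, 16);
         *		for (int j = 0; j < 8; j++)
         *			b[j] ^= (u8)((i + 1) >> (8 * j));
         *	}
         *
         * The keystream block is then AES(b), xored into the data.  The macro
         * keeps 64 bits of the counter in a general purpose register and, for
         * CTR, handles overflow of those low 64 bits out of line.
         */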
.macro ctr_encrypt xctr
        // Arguments
        OUT             .req x0
        IN              .req x1
        KEY             .req x2
        ROUNDS_W        .req w3
        BYTES_W         .req w4
        IV              .req x5
        BYTE_CTR_W      .req w6         // XCTR only
        // Intermediate values
        CTR_W           .req w11        // XCTR only
        CTR             .req x11        // XCTR only
        IV_PART         .req x12
        BLOCKS          .req x13
        BLOCKS_W        .req w13

        frame_push      0

        enc_prepare     ROUNDS_W, KEY, IV_PART
        ld1             {vctr.16b}, [IV]

        /*
         * Keep 64 bits of the IV in a register.  For CTR mode this lets us
         * easily increment the IV.  For XCTR mode this lets us efficiently XOR
         * the 64-bit counter with the IV.
         */
        .if \xctr
                umov            IV_PART, vctr.d[0]
                lsr             CTR_W, BYTE_CTR_W, #4
        .else
                umov            IV_PART, vctr.d[1]
                rev             IV_PART, IV_PART
        .endif

.LctrloopNx\xctr:
        add             BLOCKS_W, BYTES_W, #15
        sub             BYTES_W, BYTES_W, #MAX_STRIDE << 4
        lsr             BLOCKS_W, BLOCKS_W, #4
        mov             w8, #MAX_STRIDE
        cmp             BLOCKS_W, w8
        csel            BLOCKS_W, BLOCKS_W, w8, lt

        /*
         * Set up the counter values in v0-v{MAX_STRIDE-1}.
         *
         * If we are encrypting less than MAX_STRIDE blocks, the tail block
         * handling code expects the last keystream block to be in
         * v{MAX_STRIDE-1}.  For example: if encrypting two blocks with
         * MAX_STRIDE=5, then v3 and v4 should have the next two counter blocks.
         */
        .if \xctr
                add             CTR, CTR, BLOCKS
        .else
                adds            IV_PART, IV_PART, BLOCKS
        .endif
        mov             v0.16b, vctr.16b
        mov             v1.16b, vctr.16b
        mov             v2.16b, vctr.16b
        mov             v3.16b, vctr.16b
ST5(    mov             v4.16b, vctr.16b                )
        .if \xctr
                sub             x6, CTR, #MAX_STRIDE - 1
                sub             x7, CTR, #MAX_STRIDE - 2
                sub             x8, CTR, #MAX_STRIDE - 3
                sub             x9, CTR, #MAX_STRIDE - 4
ST5(            sub             x10, CTR, #MAX_STRIDE - 5       )
                eor             x6, x6, IV_PART
                eor             x7, x7, IV_PART
                eor             x8, x8, IV_PART
                eor             x9, x9, IV_PART
ST5(            eor             x10, x10, IV_PART               )
                mov             v0.d[0], x6
                mov             v1.d[0], x7
                mov             v2.d[0], x8
                mov             v3.d[0], x9
ST5(            mov             v4.d[0], x10                    )
        .else
                bcs             0f
                .subsection     1
                /*
                 * This subsection handles carries.
                 *
                 * Conditional branching here is allowed with respect to time
                 * invariance since the branches are dependent on the IV instead
                 * of the plaintext or key.  This code is rarely executed in
                 * practice anyway.
                 */

                /* Apply carry to outgoing counter. */
0:              umov            x8, vctr.d[0]
                rev             x8, x8
                add             x8, x8, #1
                rev             x8, x8
                ins             vctr.d[0], x8

                /*
                 * Apply carry to counter blocks if needed.
                 *
                 * Since the carry flag was set, we know 0 <= IV_PART <
                 * MAX_STRIDE.  Using the value of IV_PART we can determine how
                 * many counter blocks need to be updated.
                 */
                cbz             IV_PART, 2f
                adr             x16, 1f
                sub             x16, x16, IV_PART, lsl #3
                br              x16
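                /*
                 * Each bti + mov pair below is 8 bytes, so the computed
                 * branch above lands IV_PART pairs before the 1: label.
                 * That updates v{MAX_STRIDE - IV_PART}..v{MAX_STRIDE - 1},
                 * i.e. exactly the blocks whose low counter half wrapped.
                 * The bti c instructions are landing pads for the br.
                 */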
                bti             c
                mov             v0.d[0], vctr.d[0]
                bti             c
                mov             v1.d[0], vctr.d[0]
                bti             c
                mov             v2.d[0], vctr.d[0]
                bti             c
                mov             v3.d[0], vctr.d[0]
ST5(            bti             c                               )
ST5(            mov             v4.d[0], vctr.d[0]              )
1:              b               2f
                .previous

2:              rev             x7, IV_PART
                ins             vctr.d[1], x7
                sub             x7, IV_PART, #MAX_STRIDE - 1
                sub             x8, IV_PART, #MAX_STRIDE - 2
                sub             x9, IV_PART, #MAX_STRIDE - 3
                rev             x7, x7
                rev             x8, x8
                mov             v1.d[1], x7
                rev             x9, x9
ST5(            sub             x10, IV_PART, #MAX_STRIDE - 4   )
                mov             v2.d[1], x8
ST5(            rev             x10, x10                        )
                mov             v3.d[1], x9
ST5(            mov             v4.d[1], x10                    )
        .endif

        /*
         * If there are at least MAX_STRIDE blocks left, XOR the data with
         * keystream and store.  Otherwise jump to tail handling.
         */
        tbnz            BYTES_W, #31, .Lctrtail\xctr
        ld1             {v5.16b-v7.16b}, [IN], #48
ST4(    bl              aes_encrypt_block4x             )
ST5(    bl              aes_encrypt_block5x             )
        eor             v0.16b, v5.16b, v0.16b
ST4(    ld1             {v5.16b}, [IN], #16             )
        eor             v1.16b, v6.16b, v1.16b
ST5(    ld1             {v5.16b-v6.16b}, [IN], #32      )
        eor             v2.16b, v7.16b, v2.16b
        eor             v3.16b, v5.16b, v3.16b
ST5(    eor             v4.16b, v6.16b, v4.16b          )
        st1             {v0.16b-v3.16b}, [OUT], #64
ST5(    st1             {v4.16b}, [OUT], #16            )
        cbz             BYTES_W, .Lctrout\xctr
        b               .LctrloopNx\xctr

.Lctrout\xctr:
        .if !\xctr
                st1             {vctr.16b}, [IV] /* return next CTR value */
        .endif
        frame_pop
        ret

.Lctrtail\xctr:
        /*
         * Handle up to MAX_STRIDE * 16 - 1 bytes of plaintext
         *
         * This code expects the last keystream block to be in v{MAX_STRIDE-1}.
         * For example: if encrypting two blocks with MAX_STRIDE=5, then v3 and
         * v4 should have the next two counter blocks.
         *
         * This allows us to store the ciphertext by writing to overlapping
         * regions of memory.  Any invalid ciphertext blocks get overwritten by
         * correctly computed blocks.  This approach greatly simplifies the
         * logic for storing the ciphertext.
         */
        mov             x16, #16
        ands            w7, BYTES_W, #0xf
        csel            x13, x7, x16, ne

ST5(    cmp             BYTES_W, #64 - (MAX_STRIDE << 4))
ST5(    csel            x14, x16, xzr, gt               )
        cmp             BYTES_W, #48 - (MAX_STRIDE << 4)
        csel            x15, x16, xzr, gt
        cmp             BYTES_W, #32 - (MAX_STRIDE << 4)
        csel            x16, x16, xzr, gt
        cmp             BYTES_W, #16 - (MAX_STRIDE << 4)

        adr_l           x9, .Lcts_permute_table
        add             x9, x9, x13
        ble             .Lctrtail1x\xctr

ST5(    ld1             {v5.16b}, [IN], x14             )
        ld1             {v6.16b}, [IN], x15
        ld1             {v7.16b}, [IN], x16

ST4(    bl              aes_encrypt_block4x             )
ST5(    bl              aes_encrypt_block5x             )

        ld1             {v8.16b}, [IN], x13
        ld1             {v9.16b}, [IN]
        ld1             {v10.16b}, [x9]

ST4(    eor             v6.16b, v6.16b, v0.16b          )
ST4(    eor             v7.16b, v7.16b, v1.16b          )
ST4(    tbl             v3.16b, {v3.16b}, v10.16b       )
ST4(    eor             v8.16b, v8.16b, v2.16b          )
ST4(    eor             v9.16b, v9.16b, v3.16b          )

ST5(    eor             v5.16b, v5.16b, v0.16b          )
ST5(    eor             v6.16b, v6.16b, v1.16b          )
ST5(    tbl             v4.16b, {v4.16b}, v10.16b       )
ST5(    eor             v7.16b, v7.16b, v2.16b          )
ST5(    eor             v8.16b, v8.16b, v3.16b          )
ST5(    eor             v9.16b, v9.16b, v4.16b          )

ST5(    st1             {v5.16b}, [OUT], x14            )
        st1             {v6.16b}, [OUT], x15
        st1             {v7.16b}, [OUT], x16
        add             x13, x13, OUT
        st1             {v9.16b}, [x13]         // overlapping stores
        st1             {v8.16b}, [OUT]
        b               .Lctrout\xctr

.Lctrtail1x\xctr:
        /*
         * Handle <= 16 bytes of plaintext
         *
         * This code always reads and writes 16 bytes.  To avoid out of bounds
         * accesses, XCTR and CTR modes must use a temporary buffer when
         * encrypting/decrypting less than 16 bytes.
         *
         * This code is unusual in that it loads the input and stores the
         * output relative to the end of the buffers rather than to the start.
         * So when fewer than 16 bytes are processed, the data must sit at the
         * end of the 16-byte temporary buffer, not at its start.
         */
        sub             x8, x7, #16
        csel            x7, x7, x8, eq
        add             IN, IN, x7
        add             OUT, OUT, x7
        ld1             {v5.16b}, [IN]
        ld1             {v6.16b}, [OUT]
ST5(    mov             v3.16b, v4.16b                  )
        encrypt_block   v3, ROUNDS_W, KEY, x8, w7
        ld1             {v10.16b-v11.16b}, [x9]
        tbl             v3.16b, {v3.16b}, v10.16b
        sshr            v11.16b, v11.16b, #7
        eor             v5.16b, v5.16b, v3.16b
        bif             v5.16b, v6.16b, v11.16b
        st1             {v5.16b}, [OUT]
        b               .Lctrout\xctr

        // Arguments
        .unreq OUT
        .unreq IN
        .unreq KEY
        .unreq ROUNDS_W
        .unreq BYTES_W
        .unreq IV
        .unreq BYTE_CTR_W       // XCTR only
        // Intermediate values
        .unreq CTR_W            // XCTR only
        .unreq CTR              // XCTR only
        .unreq IV_PART
        .unreq BLOCKS
        .unreq BLOCKS_W
.endm

        /*
         * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
         *                 int bytes, u8 ctr[])
         *
         * The input and output buffers must each be at least 16 bytes even if
         * fewer than 16 bytes are being encrypted/decrypted; otherwise out of
         * bounds accesses will occur.  When processing fewer than 16 bytes,
         * the data is expected to sit at the end of the 16-byte temporary
         * buffer rather than at its start.
         */

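        /*
         * Hypothetical caller-side illustration of that contract, modelled on
         * the C glue code (not verbatim kernel code): to encrypt 5 bytes,
         * place them at the end of a 16-byte buffer and pass that address.
         *
         *	u8 buf[16];
         *	u8 *p = memcpy(buf + 16 - 5, src, 5);
         *
         *	aes_ctr_encrypt(p, p, rk, rounds, 5, ctr);
         *	memcpy(dst, p, 5);
         */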
AES_FUNC_START(aes_ctr_encrypt)
        ctr_encrypt 0
AES_FUNC_END(aes_ctr_encrypt)

        /*
         * aes_xctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
         *                  int bytes, u8 const iv[], int byte_ctr)
         *
         * The same 16-byte minimum buffer requirement as for aes_ctr_encrypt
         * applies: when processing fewer than 16 bytes, the data is expected
         * to sit at the end of the 16-byte temporary buffer rather than at
         * its start.
         */

AES_FUNC_START(aes_xctr_encrypt)
        ctr_encrypt 1
AES_FUNC_END(aes_xctr_encrypt)


        /*
         * aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
         *                 int bytes, u8 const rk2[], u8 iv[], int first)
         * aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
         *                 int bytes, u8 const rk2[], u8 iv[], int first)
         */

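        /*
         * Rough C model of the XTS processing below (illustrative only;
         * aes_encrypt_one() and gf128_mul_x() are stand-ins, the latter being
         * what the next_tweak macro below implements, and ciphertext stealing
         * for a partial final block is omitted):
         *
         *	void xts_encrypt(u8 *out, const u8 *in, int bytes, u8 t[16])
         *	{
         *		aes_encrypt_one(t, t);		// with rk2, if 'first'
         *		for (int i = 0; i < bytes / 16; i++) {
         *			for (int j = 0; j < 16; j++)
         *				out[j] = in[j] ^ t[j];
         *			aes_encrypt_one(out, out);	// with rk1
         *			for (int j = 0; j < 16; j++)
         *				out[j] ^= t[j];
         *			gf128_mul_x(t);		// next tweak
         *			in += 16;
         *			out += 16;
         *		}
         *	}
         *
         * A request may be split over several calls; the running tweak is
         * carried in iv[], and 'first' tells the first call to derive it.
         */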
        .macro          next_tweak, out, in, tmp
        sshr            \tmp\().2d,  \in\().2d,   #63
        and             \tmp\().16b, \tmp\().16b, xtsmask.16b
        add             \out\().2d,  \in\().2d,   \in\().2d
        ext             \tmp\().16b, \tmp\().16b, \tmp\().16b, #8
        eor             \out\().16b, \out\().16b, \tmp\().16b
        .endm

        .macro          xts_load_mask, tmp
        movi            xtsmask.2s, #0x1
        movi            \tmp\().2s, #0x87
        uzp1            xtsmask.4s, xtsmask.4s, \tmp\().4s
        .endm
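        /*
         * next_tweak multiplies the tweak by x in GF(2^128) with the XTS
         * polynomial x^128 + x^7 + x^2 + x + 1.  xts_load_mask sets up
         * xtsmask = { 0x1, 0x87 } as two 64-bit lanes; sshr turns each lane's
         * sign bit into a full-lane mask, add doubles both lanes, and ext
         * swaps the masked carries so each lands in the other half.  Roughly
         * equivalent C (illustrative only, t in little-endian order):
         *
         *	void gf128_mul_x(u64 t[2])
         *	{
         *		u64 carry = t[1] >> 63;
         *
         *		t[1] = (t[1] << 1) | (t[0] >> 63);
         *		t[0] = (t[0] << 1) ^ (carry ? 0x87 : 0);
         *	}
         */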

AES_FUNC_START(aes_xts_encrypt)
        frame_push      0

        ld1             {v4.16b}, [x6]
        xts_load_mask   v8
        cbz             w7, .Lxtsencnotfirst

        enc_prepare     w3, x5, x8
        xts_cts_skip_tw w7, .LxtsencNx
        encrypt_block   v4, w3, x5, x8, w7              /* first tweak */
        enc_switch_key  w3, x2, x8
        b               .LxtsencNx

.Lxtsencnotfirst:
        enc_prepare     w3, x2, x8
.LxtsencloopNx:
        next_tweak      v4, v4, v8
.LxtsencNx:
        subs            w4, w4, #64
        bmi             .Lxtsenc1x
        ld1             {v0.16b-v3.16b}, [x1], #64      /* get 4 pt blocks */
        next_tweak      v5, v4, v8
        eor             v0.16b, v0.16b, v4.16b
        next_tweak      v6, v5, v8
        eor             v1.16b, v1.16b, v5.16b
        eor             v2.16b, v2.16b, v6.16b
        next_tweak      v7, v6, v8
        eor             v3.16b, v3.16b, v7.16b
        bl              aes_encrypt_block4x
        eor             v3.16b, v3.16b, v7.16b
        eor             v0.16b, v0.16b, v4.16b
        eor             v1.16b, v1.16b, v5.16b
        eor             v2.16b, v2.16b, v6.16b
        st1             {v0.16b-v3.16b}, [x0], #64
        mov             v4.16b, v7.16b
        cbz             w4, .Lxtsencret
        xts_reload_mask v8
        b               .LxtsencloopNx
.Lxtsenc1x:
        adds            w4, w4, #64
        beq             .Lxtsencout
        subs            w4, w4, #16
        bmi             .LxtsencctsNx
.Lxtsencloop:
        ld1             {v0.16b}, [x1], #16
.Lxtsencctsout:
        eor             v0.16b, v0.16b, v4.16b
        encrypt_block   v0, w3, x2, x8, w7
        eor             v0.16b, v0.16b, v4.16b
        cbz             w4, .Lxtsencout
        subs            w4, w4, #16
        next_tweak      v4, v4, v8
        bmi             .Lxtsenccts
        st1             {v0.16b}, [x0], #16
        b               .Lxtsencloop
.Lxtsencout:
        st1             {v0.16b}, [x0]
.Lxtsencret:
        st1             {v4.16b}, [x6]
        frame_pop
        ret

.LxtsencctsNx:
        mov             v0.16b, v3.16b
        sub             x0, x0, #16
.Lxtsenccts:
        adr_l           x8, .Lcts_permute_table

        add             x1, x1, w4, sxtw        /* rewind input pointer */
        add             w4, w4, #16             /* # bytes in final block */
        add             x9, x8, #32
        add             x8, x8, x4
        sub             x9, x9, x4
        add             x4, x0, x4              /* output address of final block */

        ld1             {v1.16b}, [x1]          /* load final block */
        ld1             {v2.16b}, [x8]
        ld1             {v3.16b}, [x9]

        tbl             v2.16b, {v0.16b}, v2.16b
        tbx             v0.16b, {v1.16b}, v3.16b
        st1             {v2.16b}, [x4]                  /* overlapping stores */
        mov             w4, wzr
        b               .Lxtsencctsout
AES_FUNC_END(aes_xts_encrypt)

AES_FUNC_START(aes_xts_decrypt)
        frame_push      0

        /* subtract 16 bytes if we are doing CTS */
        sub             w8, w4, #0x10
        tst             w4, #0xf
        csel            w4, w4, w8, eq

        ld1             {v4.16b}, [x6]
        xts_load_mask   v8
        xts_cts_skip_tw w7, .Lxtsdecskiptw
        cbz             w7, .Lxtsdecnotfirst

        enc_prepare     w3, x5, x8
        encrypt_block   v4, w3, x5, x8, w7              /* first tweak */
.Lxtsdecskiptw:
        dec_prepare     w3, x2, x8
        b               .LxtsdecNx

.Lxtsdecnotfirst:
        dec_prepare     w3, x2, x8
.LxtsdecloopNx:
        next_tweak      v4, v4, v8
.LxtsdecNx:
        subs            w4, w4, #64
        bmi             .Lxtsdec1x
        ld1             {v0.16b-v3.16b}, [x1], #64      /* get 4 ct blocks */
        next_tweak      v5, v4, v8
        eor             v0.16b, v0.16b, v4.16b
        next_tweak      v6, v5, v8
        eor             v1.16b, v1.16b, v5.16b
        eor             v2.16b, v2.16b, v6.16b
        next_tweak      v7, v6, v8
        eor             v3.16b, v3.16b, v7.16b
        bl              aes_decrypt_block4x
        eor             v3.16b, v3.16b, v7.16b
        eor             v0.16b, v0.16b, v4.16b
        eor             v1.16b, v1.16b, v5.16b
        eor             v2.16b, v2.16b, v6.16b
        st1             {v0.16b-v3.16b}, [x0], #64
        mov             v4.16b, v7.16b
        cbz             w4, .Lxtsdecout
        xts_reload_mask v8
        b               .LxtsdecloopNx
.Lxtsdec1x:
        adds            w4, w4, #64
        beq             .Lxtsdecout
        subs            w4, w4, #16
.Lxtsdecloop:
        ld1             {v0.16b}, [x1], #16
        bmi             .Lxtsdeccts
.Lxtsdecctsout:
        eor             v0.16b, v0.16b, v4.16b
        decrypt_block   v0, w3, x2, x8, w7
        eor             v0.16b, v0.16b, v4.16b
        st1             {v0.16b}, [x0], #16
        cbz             w4, .Lxtsdecout
        subs            w4, w4, #16
        next_tweak      v4, v4, v8
        b               .Lxtsdecloop
.Lxtsdecout:
        st1             {v4.16b}, [x6]
        frame_pop
        ret

.Lxtsdeccts:
        adr_l           x8, .Lcts_permute_table

        add             x1, x1, w4, sxtw        /* rewind input pointer */
        add             w4, w4, #16             /* # bytes in final block */
        add             x9, x8, #32
        add             x8, x8, x4
        sub             x9, x9, x4
        add             x4, x0, x4              /* output address of final block */

        next_tweak      v5, v4, v8

        ld1             {v1.16b}, [x1]          /* load final block */
        ld1             {v2.16b}, [x8]
        ld1             {v3.16b}, [x9]

        eor             v0.16b, v0.16b, v5.16b
        decrypt_block   v0, w3, x2, x8, w7
        eor             v0.16b, v0.16b, v5.16b

        tbl             v2.16b, {v0.16b}, v2.16b
        tbx             v0.16b, {v1.16b}, v3.16b

        st1             {v2.16b}, [x4]                  /* overlapping stores */
        mov             w4, wzr
        b               .Lxtsdecctsout
AES_FUNC_END(aes_xts_decrypt)

        /*
         * aes_mac_update(u8 const in[], u32 const rk[], int rounds,
         *                int blocks, u8 dg[], int enc_before, int enc_after)
         */
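        /*
         * Rough C model (illustrative only; aes_encrypt_one() is a stand-in).
         * This is the CBC-MAC core used by the CMAC/XCBC/CBCMAC glue:
         *
         *	void mac_update(const u8 *in, int blocks, u8 dg[16],
         *			int enc_before, int enc_after)
         *	{
         *		if (enc_before)
         *			aes_encrypt_one(dg, dg);
         *		while (blocks--) {
         *			for (int j = 0; j < 16; j++)
         *				dg[j] ^= *in++;
         *			if (blocks || enc_after)
         *				aes_encrypt_one(dg, dg);
         *		}
         *	}
         *
         * The assembly additionally returns the number of blocks it left
         * unprocessed so the caller can reschedule (see cond_yield) and call
         * again.
         */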
AES_FUNC_START(aes_mac_update)
        ld1             {v0.16b}, [x4]                  /* get dg */
        enc_prepare     w2, x1, x7
        cbz             w5, .Lmacloop4x

        encrypt_block   v0, w2, x1, x7, w8

.Lmacloop4x:
        subs            w3, w3, #4
        bmi             .Lmac1x
        ld1             {v1.16b-v4.16b}, [x0], #64      /* get next 4 pt blocks */
        eor             v0.16b, v0.16b, v1.16b          /* ..and xor with dg */
        encrypt_block   v0, w2, x1, x7, w8
        eor             v0.16b, v0.16b, v2.16b
        encrypt_block   v0, w2, x1, x7, w8
        eor             v0.16b, v0.16b, v3.16b
        encrypt_block   v0, w2, x1, x7, w8
        eor             v0.16b, v0.16b, v4.16b
        cmp             w3, wzr
        csinv           x5, x6, xzr, eq
        cbz             w5, .Lmacout
        encrypt_block   v0, w2, x1, x7, w8
        st1             {v0.16b}, [x4]                  /* return dg */
        cond_yield      .Lmacout, x7, x8
        b               .Lmacloop4x
.Lmac1x:
        add             w3, w3, #4
.Lmacloop:
        cbz             w3, .Lmacout
        ld1             {v1.16b}, [x0], #16             /* get next pt block */
        eor             v0.16b, v0.16b, v1.16b          /* ..and xor with dg */

        subs            w3, w3, #1
        csinv           x5, x6, xzr, eq
        cbz             w5, .Lmacout

.Lmacenc:
        encrypt_block   v0, w2, x1, x7, w8
        b               .Lmacloop

.Lmacout:
        st1             {v0.16b}, [x4]                  /* return dg */
        mov             w0, w3
        ret
AES_FUNC_END(aes_mac_update)