chacha-scalar-core.S

   1 /* SPDX-License-Identifier: GPL-2.0 */
   2 /*
   3  * Copyright (C) 2018 Google, Inc.
   4  */
   5
   6 #include <linux/linkage.h>
   7 #include <asm/assembler.h>
   8
   9 /*
  10  * Design notes:
  11  *
  12  * 16 registers would be needed to hold the state matrix, but only 14 are
  13  * available because 'sp' and 'pc' cannot be used.  So we spill the elements
  14  * (x8, x9) to the stack and swap them out with (x10, x11).  This adds one
  15  * 'ldrd' and one 'strd' instruction per round.
  16  *
  17  * All rotates are performed using the implicit rotate operand accepted by the
  18  * 'add' and 'eor' instructions.  This is faster than using explicit rotate
  19  * instructions.  To make this work, we allow the values in the second and last
  20  * rows of the ChaCha state matrix (rows 'b' and 'd') to temporarily have the
  21  * wrong rotation amount.  The rotation amount is then fixed up just in time
  22  * when the values are used.  'brot' is the number of bits the values in row 'b'
  23  * need to be rotated right to arrive at the correct values, and 'drot'
  24  * similarly for row 'd'.  (brot, drot) start out as (0, 0) but we make it such
  25  * that they end up as (25, 24) after every round.
  26  */
  27
  28         // ChaCha state registers
             //
             // Fourteen core registers carry the sixteen state words.  x8/x10 share
             // r8 and x9/x11 share r9, so only one word of each pair is register-
             // resident at a time; the other one lives in a stack slot (see
             // _doubleround).  r13 and r15 are not aliased here, and X15 is mapped
             // to r14.
  29         X0      .req    r0
  30         X1      .req    r1
  31         X2      .req    r2
  32         X3      .req    r3
  33         X4      .req    r4
  34         X5      .req    r5
  35         X6      .req    r6
  36         X7      .req    r7
  37         X8_X10  .req    r8      // shared by x8 and x10
  38         X9_X11  .req    r9      // shared by x9 and x11
  39         X12     .req    r10
  40         X13     .req    r11
  41         X14     .req    r12
  42         X15     .req    r14     // both entry points in this file push lr first
  43
  44 .macro _le32_bswap_4x   a, b, c, d,  tmp
             // Apply rev_l to each of the four registers a, b, c, d (in that
             // order), handing tmp to rev_l as its second operand.  Expands to
             // nothing unless __ARMEB__ is defined.
             // NOTE(review): rev_l comes from <asm/assembler.h>, which is not
             // visible here; presumably it byte-reverses a 32-bit word so that
             // big-endian builds emit little-endian keystream - confirm.
  45 #ifdef __ARMEB__
  46         .irp            word, \a, \b, \c, \d
  47         rev_l           \word,  \tmp
  48         .endr
  49 #endif
  50 .endm
  52
  53 .macro __ldrd           a, b, src, offset
             // Load the two consecutive words at [src + offset] and
             // [src + offset + 4] into a and b.  Builds for architecture
             // versions below 6 get a pair of single-word loads; everything
             // else gets one doubleword load.
  54 #if __LINUX_ARM_ARCH__ < 6
  55         ldr             \a, [\src, #\offset]
  56         ldr             \b, [\src, #\offset + 4]
  57 #else
  58         ldrd            \a, \b, [\src, #\offset]
  59 #endif
  60 .endm
  61
  62 .macro __strd           a, b, dst, offset
             // Store a and b to the two consecutive words at [dst + offset] and
             // [dst + offset + 4].  Builds for architecture versions below 6 get
             // a pair of single-word stores; everything else gets one
             // doubleword store.
  63 #if __LINUX_ARM_ARCH__ < 6
  64         str             \a, [\dst, #\offset]
  65         str             \b, [\dst, #\offset + 4]
  66 #else
  67         strd            \a, \b, [\dst, #\offset]
  68 #endif
  69 .endm
  70
  71 .macro _halfround       a1, b1, c1, d1,  a2, b2, c2, d2
             // Two ChaCha quarter-rounds, on (a1, b1, c1, d1) and on
             // (a2, b2, c2, d2), with the instructions of the two interleaved
             // pairwise.
             //
             // Rotations are never applied to the stored value.  On entry every
             // 'b' register must be rotated right by 'brot' bits to obtain the
             // true word, and every 'd' register by 'drot' bits; both are
             // assembly-time symbols set by the caller.  Each "rol(x, n)" step
             // below is performed by leaving the freshly XOR-ed value as it is
             // and having the next consumer read it through a "ror #(32 - n)"
             // shifted operand.
             //
             // On exit the 'b' registers used here need ror #25 and the 'd'
             // registers need ror #24.  This macro does not update brot/drot
             // itself; see _doubleround for where that happens.
  72
  73         // a += b; d ^= a; d = rol(d, 16);
  74         add             \a1, \a1, \b1, ror #brot
  75         add             \a2, \a2, \b2, ror #brot
  76         eor             \d1, \a1, \d1, ror #drot
  77         eor             \d2, \a2, \d2, ror #drot
  78         // drot == 32 - 16 == 16
  79
  80         // c += d; b ^= c; b = rol(b, 12);
  81         add             \c1, \c1, \d1, ror #16
  82         add             \c2, \c2, \d2, ror #16
  83         eor             \b1, \c1, \b1, ror #brot
  84         eor             \b2, \c2, \b2, ror #brot
  85         // brot == 32 - 12 == 20
  86
  87         // a += b; d ^= a; d = rol(d, 8);
  88         add             \a1, \a1, \b1, ror #20
  89         add             \a2, \a2, \b2, ror #20
  90         eor             \d1, \a1, \d1, ror #16
  91         eor             \d2, \a2, \d2, ror #16
  92         // drot == 32 - 8 == 24
  93
  94         // c += d; b ^= c; b = rol(b, 7);
  95         add             \c1, \c1, \d1, ror #24
  96         add             \c2, \c2, \d2, ror #24
  97         eor             \b1, \c1, \b1, ror #20
  98         eor             \b2, \c2, \b2, ror #20
  99         // brot == 32 - 7 == 25
 100 .endm
 101
 102 .macro _doubleround
             // One ChaCha double round: a column round followed by a diagonal
             // round, each built from two _halfround expansions.
             //
             // Stack slots used: [sp, #0] holds (x8, x9) while (x10, x11) are in
             // r8/r9, and [sp, #8] holds (x10, x11) while (x8, x9) are in r8/r9.
             // On entry r8/r9 hold (x8, x9) and [sp, #8] holds (x10, x11); the
             // same is true again on exit.
             //
             // Expands with whatever brot/drot are current for the column round
             // and leaves them set to (25, 24).
 103
 104         // column round
 105
 106         // quarterrounds: (x0, x4, x8, x12) and (x1, x5, x9, x13)
 107         _halfround      X0, X4, X8_X10, X12,  X1, X5, X9_X11, X13
 108
 109         // save (x8, x9); restore (x10, x11)
 110         __strd          X8_X10, X9_X11, sp, 0
 111         __ldrd          X8_X10, X9_X11, sp, 8
 112
 113         // quarterrounds: (x2, x6, x10, x14) and (x3, x7, x11, x15)
 114         _halfround      X2, X6, X8_X10, X14,  X3, X7, X9_X11, X15
 115
             // Only now have all four 'b' words (x4-x7) and all four 'd' words
             // (x12-x15) been through a _halfround, so only now do they all share
             // the (25, 24) rotation.  Updating the symbols any earlier would
             // make the second column _halfround misread x6, x7, x14 and x15.
 116         .set brot, 25
 117         .set drot, 24
 118
 119         // diagonal round
 120
 121         // quarterrounds: (x0, x5, x10, x15) and (x1, x6, x11, x12)
 122         _halfround      X0, X5, X8_X10, X15,  X1, X6, X9_X11, X12
 123
 124         // save (x10, x11); restore (x8, x9)
 125         __strd          X8_X10, X9_X11, sp, 8
 126         __ldrd          X8_X10, X9_X11, sp, 0
 127
 128         // quarterrounds: (x2, x7, x8, x13) and (x3, x4, x9, x14)
 129         _halfround      X2, X7, X8_X10, X13,  X3, X4, X9_X11, X14
 130 .endm
 131
 132 .macro _chacha_permute  nrounds
             // Fully unrolled ChaCha permutation: nrounds / 2 back-to-back
             // expansions of _doubleround (no run-time loop).
             //
             // Expects x0-x9 and x12-x15 in their alias registers with no pending
             // rotation, (x10, x11) at [sp, #8], and [sp, #0]..[sp, #7] free as
             // scratch for (x8, x9).
             //
             // brot/drot are reset to 0 here because the incoming words are
             // unrotated.  After the expansion they are left at (25, 24) by
             // _doubleround, and code following this macro uses them to undo the
             // pending rotation of x4-x7 and x12-x15.
 133         .set brot, 0
 134         .set drot, 0
 135         .rept \nrounds / 2
 136          _doubleround
 137         .endr
 138 .endm
 139
 140 .macro _chacha          nrounds
             // Body of chacha_doarm for a fixed round count: repeatedly generate
             // one 64-byte keystream block and XOR it into the data until LEN
             // bytes have been processed.
             //
             // Entry: registers hold x0-x9 and x12-x15; the stack is laid out as
             // described at .Lnext_block below.
             // Exit:  control falls out of the macro at .Ldone with sp pointing at
             // the saved state copy (x0-x15 OUT IN LEN).
             //
             // LEN is the 'unsigned int bytes' argument of chacha_doarm, so every
             // comparison on it uses unsigned condition codes (lo/ls/hi).  Signed
             // conditions would treat a length of 0x80000000 or more as negative
             // and end the loop after XOR-ing at most three bytes.
             //
             // All labels carry the per-expansion macro counter suffix so that
             // the macro can be expanded more than once in one file.
 141
 142 .Lnext_block\@:
 143         // Stack: unused0-unused1 x10-x11 x0-x15 OUT IN LEN
 144         // Registers contain x0-x9,x12-x15.
 145
 146         // Do the core ChaCha permutation to update x0-x15.
 147         _chacha_permute \nrounds
 148
 149         add             sp, #8
 150         // Stack: x10-x11 orig_x0-orig_x15 OUT IN LEN
 151         // Registers contain x0-x9,x12-x15.
 152         // x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
 153
 154         // Free up some registers (r8-r12,r14) by pushing (x8-x9,x12-x15).
 155         push            {X8_X10, X9_X11, X12, X13, X14, X15}
 156
 157         // Load (OUT, IN, LEN).
 158         ldr             r14, [sp, #96]
 159         ldr             r12, [sp, #100]
 160         ldr             r11, [sp, #104]
 161
 162         orr             r10, r14, r12
 163
 164         // Use slow path if fewer than 64 bytes remain.
             // (Unsigned compare: LEN is an unsigned byte count.)
 165         cmp             r11, #64
 166         blo             .Lxor_slowpath\@
 167
 168         // Use slow path if IN and/or OUT isn't 4-byte aligned.  Needed even on
 169         // ARMv6+, since ldmia and stmia (used below) still require alignment.
 170         tst             r10, #3
 171         bne             .Lxor_slowpath\@
 172
 173         // Fast path: XOR 64 bytes of aligned data.
 174
 175         // Stack: x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
 176         // Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is OUT.
 177         // x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
 178
 179         // x0-x3
 180         __ldrd          r8, r9, sp, 32
 181         __ldrd          r10, r11, sp, 40
 182         add             X0, X0, r8
 183         add             X1, X1, r9
 184         add             X2, X2, r10
 185         add             X3, X3, r11
 186         _le32_bswap_4x  X0, X1, X2, X3,  r8
 187         ldmia           r12!, {r8-r11}
 188         eor             X0, X0, r8
 189         eor             X1, X1, r9
 190         eor             X2, X2, r10
 191         eor             X3, X3, r11
 192         stmia           r14!, {X0-X3}
 193
 194         // x4-x7
             // The pending 'brot' rotation is undone by the shifted operand of
             // the feed-forward add itself.
 195         __ldrd          r8, r9, sp, 48
 196         __ldrd          r10, r11, sp, 56
 197         add             X4, r8, X4, ror #brot
 198         add             X5, r9, X5, ror #brot
 199         ldmia           r12!, {X0-X3}
 200         add             X6, r10, X6, ror #brot
 201         add             X7, r11, X7, ror #brot
 202         _le32_bswap_4x  X4, X5, X6, X7,  r8
 203         eor             X4, X4, X0
 204         eor             X5, X5, X1
 205         eor             X6, X6, X2
 206         eor             X7, X7, X3
 207         stmia           r14!, {X4-X7}
 208
 209         // x8-x15
 210         pop             {r0-r7}                 // (x8-x9,x12-x15,x10-x11)
 211         __ldrd          r8, r9, sp, 32
 212         __ldrd          r10, r11, sp, 40
 213         add             r0, r0, r8              // x8
 214         add             r1, r1, r9              // x9
 215         add             r6, r6, r10             // x10
 216         add             r7, r7, r11             // x11
 217         _le32_bswap_4x  r0, r1, r6, r7,  r8
 218         ldmia           r12!, {r8-r11}
 219         eor             r0, r0, r8              // x8
 220         eor             r1, r1, r9              // x9
 221         eor             r6, r6, r10             // x10
 222         eor             r7, r7, r11             // x11
 223         stmia           r14!, {r0,r1,r6,r7}
 224         ldmia           r12!, {r0,r1,r6,r7}
             // r8 receives orig_x12 (the block counter) here and is left alone
             // until .Lprepare_for_next_block; the byte swap below uses r9 as its
             // temporary for that reason.
 225         __ldrd          r8, r9, sp, 48
 226         __ldrd          r10, r11, sp, 56
 227         add             r2, r8, r2, ror #drot   // x12
 228         add             r3, r9, r3, ror #drot   // x13
 229         add             r4, r10, r4, ror #drot  // x14
 230         add             r5, r11, r5, ror #drot  // x15
 231         _le32_bswap_4x  r2, r3, r4, r5,  r9
 232           ldr           r9, [sp, #72]           // load LEN
 233         eor             r2, r2, r0              // x12
 234         eor             r3, r3, r1              // x13
 235         eor             r4, r4, r6              // x14
 236         eor             r5, r5, r7              // x15
 237           subs          r9, #64                 // decrement and check LEN
 238         stmia           r14!, {r2-r5}
 239
             // LEN was at least 64 on this path, so the subtraction cannot
             // borrow; zero means the whole request has been handled.
 240         beq             .Ldone\@
 241
 242 .Lprepare_for_next_block\@:
 243
 244         // Stack: x0-x15 OUT IN LEN
             // Registers: r8 is the block counter of the block just produced,
             // r9 is the remaining LEN, r12 is IN, r14 is OUT.
 245
 246         // Increment block counter (x12)
 247         add             r8, #1
 248
 249         // Store updated (OUT, IN, LEN)
 250         str             r14, [sp, #64]
 251         str             r12, [sp, #68]
 252         str             r9, [sp, #72]
 253
 254           mov           r14, sp
 255
 256         // Store updated block counter (x12)
 257         str             r8, [sp, #48]
 258
             // Re-create the (unused0-unused1, x10-x11) area below the state.
 259           sub           sp, #16
 260
 261         // Reload state and do next block
 262         ldmia           r14!, {r0-r11}          // load x0-x11
 263         __strd          r10, r11, sp, 8         // store x10-x11 before state
 264         ldmia           r14, {r10-r12,r14}      // load x12-x15
 265         b               .Lnext_block\@
 266
 267 .Lxor_slowpath\@:
 268         // Slow path: < 64 bytes remaining, or unaligned input or output buffer.
 269         // We handle it by storing the 64 bytes of keystream to the stack, then
 270         // XOR-ing the needed portion with the data.
 271
 272         // Allocate keystream buffer
 273         sub             sp, #64
 274         mov             r14, sp
 275
 276         // Stack: ks0-ks15 x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
 277         // Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is &ks0.
 278         // x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
 279
 280         // Save keystream for x0-x3
 281         __ldrd          r8, r9, sp, 96
 282         __ldrd          r10, r11, sp, 104
 283         add             X0, X0, r8
 284         add             X1, X1, r9
 285         add             X2, X2, r10
 286         add             X3, X3, r11
 287         _le32_bswap_4x  X0, X1, X2, X3,  r8
 288         stmia           r14!, {X0-X3}
 289
 290         // Save keystream for x4-x7
 291         __ldrd          r8, r9, sp, 112
 292         __ldrd          r10, r11, sp, 120
 293         add             X4, r8, X4, ror #brot
 294         add             X5, r9, X5, ror #brot
 295         add             X6, r10, X6, ror #brot
 296         add             X7, r11, X7, ror #brot
 297         _le32_bswap_4x  X4, X5, X6, X7,  r8
 298           add           r8, sp, #64
 299         stmia           r14!, {X4-X7}
 300
 301         // Save keystream for x8-x15
 302         ldm             r8, {r0-r7}             // (x8-x9,x12-x15,x10-x11)
 303         __ldrd          r8, r9, sp, 128
 304         __ldrd          r10, r11, sp, 136
 305         add             r0, r0, r8              // x8
 306         add             r1, r1, r9              // x9
 307         add             r6, r6, r10             // x10
 308         add             r7, r7, r11             // x11
 309         _le32_bswap_4x  r0, r1, r6, r7,  r8
 310         stmia           r14!, {r0,r1,r6,r7}
             // As on the fast path, r8 keeps orig_x12 (the block counter) from
             // here on; the byte swap below uses r9 as its temporary.
 311         __ldrd          r8, r9, sp, 144
 312         __ldrd          r10, r11, sp, 152
 313         add             r2, r8, r2, ror #drot   // x12
 314         add             r3, r9, r3, ror #drot   // x13
 315         add             r4, r10, r4, ror #drot  // x14
 316         add             r5, r11, r5, ror #drot  // x15
 317         _le32_bswap_4x  r2, r3, r4, r5,  r9
 318         stmia           r14, {r2-r5}
 319
 320         // Stack: ks0-ks15 unused0-unused7 x0-x15 OUT IN LEN
 321         // Registers: r8 is block counter, r12 is IN.
 322
 323         ldr             r9, [sp, #168]          // LEN
 324         ldr             r14, [sp, #160]         // OUT
 325         cmp             r9, #64
 326           mov           r0, sp
             // r1 = min(LEN, 64), compared as unsigned values.
 327         movls           r1, r9
 328         movhi           r1, #64
 329         // r1 is number of bytes to XOR, in range [1, 64]
 330
 331 .if __LINUX_ARM_ARCH__ < 6
 332         orr             r2, r12, r14
 333         tst             r2, #3                  // IN or OUT misaligned?
 334         bne             .Lxor_next_byte\@
 335 .endif
 336
 337         // XOR a word at a time
             // r1 is at most 64 here, so the signed 'blt' after the subtraction
             // is safe; 16 unrolled steps cover a full block.
 338 .rept 16
 339         subs            r1, #4
 340         blt             .Lxor_words_done\@
 341         ldr             r2, [r12], #4
 342         ldr             r3, [r0], #4
 343         eor             r2, r2, r3
 344         str             r2, [r14], #4
 345 .endr
 346         b               .Lxor_slowpath_done\@
 347 .Lxor_words_done\@:
             // r1 went negative by 1..4; its low two bits are the number of
             // trailing bytes (0..3) still to be XOR-ed.
 348         ands            r1, r1, #3
 349         beq             .Lxor_slowpath_done\@
 350
 351         // XOR a byte at a time
 352 .Lxor_next_byte\@:
 353         ldrb            r2, [r12], #1
 354         ldrb            r3, [r0], #1
 355         eor             r2, r2, r3
 356         strb            r2, [r14], #1
 357         subs            r1, #1
 358         bne             .Lxor_next_byte\@
 359
 360 .Lxor_slowpath_done\@:
             // Continue only if more than 64 bytes were remaining before this
             // block (unsigned 'hi': no borrow and a non-zero result).  The sp
             // adjustment in between leaves the flags alone; it drops the
             // keystream buffer (64 bytes) and the 32 bytes below the state.
 361         subs            r9, #64
 362         add             sp, #96
 363         bhi             .Lprepare_for_next_block\@
 364
 365 .Ldone\@:
 366 .endm   // _chacha
 367
 368 /*
 369  * void chacha_doarm(u8 *dst, const u8 *src, unsigned int bytes,
 370  *                   const u32 *state, int nrounds);
      *
      * XOR 'bytes' bytes from 'src' with ChaCha keystream and write them to
      * 'dst'.  Returns immediately when bytes == 0.
      *
      * r0 = dst, r1 = src, r2 = bytes, r3 = state; nrounds is read from the
      * stack at entry.  nrounds == 12 selects the 12-round expansion of
      * _chacha; every other value gets the 20-round expansion.
      *
      * 'state' is only read.  The block counter is advanced in the on-stack
      * copy of the state, not in the caller's array.
 371  */
 372 ENTRY(chacha_doarm)
 373         cmp             r2, #0                  // len == 0?
 374         reteq           lr
 375
             // Fetch nrounds before the pushes below move sp.  The flags set by
             // this compare are consumed by the 'beq 1f' further down; there is
             // no other compare in between.
 376         ldr             ip, [sp]
 377         cmp             ip, #12
 378
             // r0-r2 end up on the stack as (OUT, IN, LEN) directly above the
             // state copy; r4-r11 and lr are saved for the epilogue.
 379         push            {r0-r2,r4-r11,lr}
 380
 381         // Push state x0-x15 onto stack.
 382         // Also store an extra copy of x10-x11 just before the state.
             //
             // Resulting layout, matching what _chacha expects at its entry:
             //   [sp, #0]   unused0-unused1 (scratch for x8-x9)
             //   [sp, #8]   x10-x11 (working copy)
             //   [sp, #16]  x0-x15
             //   [sp, #80]  OUT IN LEN
 383
 384         add             X12, r3, #48
 385         ldm             X12, {X12,X13,X14,X15}
 386         push            {X12,X13,X14,X15}
 387         sub             sp, sp, #64
 388
 389         __ldrd          X8_X10, X9_X11, r3, 40
 390         __strd          X8_X10, X9_X11, sp, 8
 391         __strd          X8_X10, X9_X11, sp, 56
 392         ldm             r3, {X0-X9_X11}
 393         __strd          X0, X1, sp, 16
 394         __strd          X2, X3, sp, 24
 395         __strd          X4, X5, sp, 32
 396         __strd          X6, X7, sp, 40
 397         __strd          X8_X10, X9_X11, sp, 48
 398
 399         beq             1f
 400         _chacha         20
 401
             // Both expansions finish with sp at the state copy: skip x0-x15
             // (64 bytes) plus OUT, IN, LEN (12 bytes), then restore r4-r11 and
             // return by popping the saved lr into pc.
 402 0:      add             sp, #76
 403         pop             {r4-r11, pc}
 404
 405 1:      _chacha         12
 406         b               0b
 407 ENDPROC(chacha_doarm)
 408
 409 /*
 410  * void hchacha_block_arm(const u32 state[16], u32 out[8], int nrounds);
      *
      * Run the ChaCha permutation over 'state' and store words x0-x3 and
      * x12-x15 of the result to 'out'.  The input state is not added back to
      * the permuted words.
      *
      * r0 = state, r1 = out, r2 = nrounds.  nrounds == 12 selects the 12-round
      * permutation; every other value gets the 20-round one.
 411  */
 412 ENTRY(hchacha_block_arm)
             // r1 ('out') is pushed together with the saved registers so that it
             // survives the permutation, which uses every register in this push
             // list; it is popped back into r4 below.
 413         push            {r1,r4-r11,lr}
 414
             // The flags set here are consumed by the 'beq 1f' below; there is
             // no other compare in between.
 415         cmp             r2, #12                 // ChaCha12 ?
 416
 417         mov             r14, r0
 418         ldmia           r14!, {r0-r11}          // load x0-x11
 419         push            {r10-r11}               // store x10-x11 to stack
 420         ldm             r14, {r10-r12,r14}      // load x12-x15
             // Reserve the (unused0-unused1) scratch slot that _doubleround uses
             // for x8-x9.  Stack is now: unused0-unused1 x10-x11 out saved-regs.
 421         sub             sp, #8
 422
 423         beq             1f
 424         _chacha_permute 20
 425
 426         // Skip over (unused0-unused1, x10-x11)
 427 0:      add             sp, #16
 428
 429         // Fix up rotations of x12-x15
             // (_doubleround leaves drot at 24 for either round count, so this
             // shared tail serves both expansions.)
 430         ror             X12, X12, #drot
 431         ror             X13, X13, #drot
 432           pop           {r4}                    // load 'out'
 433         ror             X14, X14, #drot
 434         ror             X15, X15, #drot
 435
 436         // Store (x0-x3,x12-x15) to 'out'
 437         stm             r4, {X0,X1,X2,X3,X12,X13,X14,X15}
 438
 439         pop             {r4-r11,pc}
 440
 441 1:      _chacha_permute 12
 442         b               0b
 443 ENDPROC(hchacha_block_arm)