arch/arm/crypto/blake2b-neon-core.S

   1 /* SPDX-License-Identifier: GPL-2.0-or-later */
   2 /*
   3  * BLAKE2b digest algorithm, NEON accelerated
   4  *
   5  * Copyright 2020 Google LLC
   6  *
   7  * Author: Eric Biggers <ebiggers@google.com>
   8  */
   9
  10 #include <linux/linkage.h>
  11
  12         .text
  13         .fpu            neon
  14
  15         // The arguments to blake2b_compress_neon()
             //
             // These are r0-r3, in the same order as the parameters of the C
             // prototype (state, block, nblocks, inc).  Within the function BLOCK is
             // post-incremented by every message load, NBLOCKS is counted down to
             // zero, and INC is added to the low 32 bits of t[0] once per block.
  16         STATE           .req    r0
  17         BLOCK           .req    r1
  18         NBLOCKS         .req    r2
  19         INC             .req    r3
  20
  21         // Pointers to the rotation tables
             //
             // Both are set once with 'adr' at function entry and stay constant; the
             // round macro loads the byte-permutation tables through them.
  22         ROR24_TABLE     .req    r4
  23         ROR16_TABLE     .req    r5
  24
  25         // The original stack pointer
             //
             // sp is moved down to a 32-byte aligned scratch buffer on entry, so the
             // entry value is kept here and copied back to sp before the saved
             // registers are popped.
  26         ORIG_SP         .req    r6
  27
  28         // NEON registers which contain the message words of the current block.
  29         // M_0-M_3 are occasionally used for other purposes too.
             //
             // d16-d31 are the halves of q8-q15, so M_(2k) and M_(2k+1) together are
             // q(8+k).  M_0-M_3 (q8-q9) are the registers the round macro overwrites
             // as scratch; a copy of them is kept in the buffer at [sp] for reloading.
  30         M_0             .req    d16
  31         M_1             .req    d17
  32         M_2             .req    d18
  33         M_3             .req    d19
  34         M_4             .req    d20
  35         M_5             .req    d21
  36         M_6             .req    d22
  37         M_7             .req    d23
  38         M_8             .req    d24
  39         M_9             .req    d25
  40         M_10            .req    d26
  41         M_11            .req    d27
  42         M_12            .req    d28
  43         M_13            .req    d29
  44         M_14            .req    d30
  45         M_15            .req    d31
  46
  47         .align          4
  48         // Tables for computing ror64(x, 24) and ror64(x, 16) using the vtbl.8
  49         // instruction.  This is the most efficient way to implement these
  50         // rotation amounts with NEON.  (On Cortex-A53 it's the same speed as
  51         // vshr.u64 + vsli.u64, while on Cortex-A7 it's faster.)
             //
             // Entry i of a table is the index of the source byte that ends up in
             // byte i of the result, so a right rotation by 8*k bits lists
             // (i + k) mod 8 for i = 0..7.
             //
             // Each table is 8 bytes and is loaded with a 64-bit alignment hint, so
             // both must stay 8-byte aligned.  The first table gets that from the
             // .align above (a power-of-two argument on ARM, i.e. 16 bytes), the
             // second from following the first directly.  Keep this layout when
             // editing.
  52 .Lror24_table:
  53         .byte           3, 4, 5, 6, 7, 0, 1, 2
  54 .Lror16_table:
  55         .byte           2, 3, 4, 5, 6, 7, 0, 1
  56         // The BLAKE2b initialization vector
             //
             // Eight 64-bit words, IV[0..7].  blake2b_compress_neon reads all of them
             // through one post-incremented pointer at the start of every block to
             // initialize v[8..15] (q4-q7), so the four .quad lines must stay
             // contiguous.
  57 .Lblake2b_IV:
  58         .quad           0x6a09e667f3bcc908, 0xbb67ae8584caa73b
  59         .quad           0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1
  60         .quad           0x510e527fade682d1, 0x9b05688c2b3e6c1f
  61         .quad           0x1f83d9abfb41bd6b, 0x5be0cd19137e2179
  62
  63 // Execute one round of BLAKE2b by updating the state matrix v[0..15] in the
  64 // NEON registers q0-q7.  The message block is in q8..q15 (M_0-M_15).  The stack
  65 // pointer points to a 32-byte aligned buffer containing a copy of q8 and q9
  66 // (M_0-M_3), so that they can be reloaded if they are used as temporary
  67 // registers.  The macro arguments s0-s15 give the order in which the message
  68 // words are used in this round.  'final' is 1 if this is the final round.
     //
     // Register layout of the state matrix (two 64-bit words per q register, so
     // dN holds v[N] for N = 0..15):
     //      q0-q1 = v[0..3]    the 'a' words
     //      q2-q3 = v[4..7]    the 'b' words
     //      q4-q5 = v[8..11]   the 'c' words
     //      q6-q7 = v[12..15]  the 'd' words
     //
     // Other inputs: ROR24_TABLE and ROR16_TABLE must point at the rotation
     // tables, and sp at the saved copy of q8-q9.
     //
     // Clobbers: M_0 (d16) repeatedly holds a rotation table, and q8-q9
     // (M_0-M_3) serve as scratch for the rotations by 63.  They are reloaded
     // from [sp] before the macro ends, except when final=1, in which case
     // q8-q9 are left holding scratch values.
  69 .macro  _blake2b_round  s0, s1, s2, s3, s4, s5, s6, s7, \
  70                         s8, s9, s10, s11, s12, s13, s14, s15, final=0
  71
  72         // Mix the columns:
  73         // (v[0], v[4], v[8], v[12]), (v[1], v[5], v[9], v[13]),
  74         // (v[2], v[6], v[10], v[14]), and (v[3], v[7], v[11], v[15]).
             //
             // All four columns are processed at once: each q-register operation
             // below handles two columns, and each pair of q operations all four.
  75
  76         // a += b + m[blake2b_sigma[r][2*i + 0]];
  77         vadd.u64        q0, q0, q2
  78         vadd.u64        q1, q1, q3
  79         vadd.u64        d0, d0, M_\s0
  80         vadd.u64        d1, d1, M_\s2
  81         vadd.u64        d2, d2, M_\s4
  82         vadd.u64        d3, d3, M_\s6
  83
  84         // d = ror64(d ^ a, 32);
             // vrev64.32 swaps the two 32-bit halves of every 64-bit element, which
             // is exactly a rotation by 32 bits.
  85         veor            q6, q6, q0
  86         veor            q7, q7, q1
  87         vrev64.32       q6, q6
  88         vrev64.32       q7, q7
  89
  90         // c += d;
  91         vadd.u64        q4, q4, q6
  92         vadd.u64        q5, q5, q7
  93
  94         // b = ror64(b ^ c, 24);
             // The byte-permutation table is loaded into M_0, overwriting message
             // word 0 until it is reloaded from the stack copy.
  95         vld1.8          {M_0}, [ROR24_TABLE, :64]
  96         veor            q2, q2, q4
  97         veor            q3, q3, q5
  98         vtbl.8          d4, {d4}, M_0
  99         vtbl.8          d5, {d5}, M_0
 100         vtbl.8          d6, {d6}, M_0
 101         vtbl.8          d7, {d7}, M_0
 102
 103         // a += b + m[blake2b_sigma[r][2*i + 1]];
 104         //
 105         // M_0 got clobbered above, so we have to reload it if any of the four
 106         // message words this step needs happens to be M_0.  Otherwise we don't
 107         // need to reload it here, as it will just get clobbered again below.
 108 .if \s1 == 0 || \s3 == 0 || \s5 == 0 || \s7 == 0
 109         vld1.8          {M_0}, [sp, :64]
 110 .endif
 111         vadd.u64        q0, q0, q2
 112         vadd.u64        q1, q1, q3
 113         vadd.u64        d0, d0, M_\s1
 114         vadd.u64        d1, d1, M_\s3
 115         vadd.u64        d2, d2, M_\s5
 116         vadd.u64        d3, d3, M_\s7
 117
 118         // d = ror64(d ^ a, 16);
 119         vld1.8          {M_0}, [ROR16_TABLE, :64]
 120         veor            q6, q6, q0
 121         veor            q7, q7, q1
 122         vtbl.8          d12, {d12}, M_0
 123         vtbl.8          d13, {d13}, M_0
 124         vtbl.8          d14, {d14}, M_0
 125         vtbl.8          d15, {d15}, M_0
 126
 127         // c += d;
 128         vadd.u64        q4, q4, q6
 129         vadd.u64        q5, q5, q7
 130
 131         // b = ror64(b ^ c, 63);
 132         //
 133         // This rotation amount isn't a multiple of 8, so it has to be
 134         // implemented using a pair of shifts, which requires temporary
 135         // registers.  Use q8-q9 (M_0-M_3) for this, and reload them afterwards.
             //
             // With x = b ^ c in the temporary: vshr leaves x >> 63 in the
             // destination, then vsli inserts x << 1 above that bit, giving
             // rol64(x, 1), which equals ror64(x, 63).
 136         veor            q8, q2, q4
 137         veor            q9, q3, q5
 138         vshr.u64        q2, q8, #63
 139         vshr.u64        q3, q9, #63
 140         vsli.u64        q2, q8, #1
 141         vsli.u64        q3, q9, #1
 142         vld1.8          {q8-q9}, [sp, :256]
 143
 144         // Mix the diagonals:
 145         // (v[0], v[5], v[10], v[15]), (v[1], v[6], v[11], v[12]),
 146         // (v[2], v[7], v[8], v[13]), and (v[3], v[4], v[9], v[14]).
 147         //
 148         // There are two possible ways to do this: use 'vext' instructions to
 149         // shift the rows of the matrix so that the diagonals become columns,
 150         // and undo it afterwards; or just use 64-bit operations on 'd'
 151         // registers instead of 128-bit operations on 'q' registers.  We use the
 152         // latter approach, as it performs much better on Cortex-A7.
             //
             // Since dN holds v[N], diagonal i (i = 0..3) uses
             //      a = d(i), b = d(4 + (i+1)%4), c = d(8 + (i+2)%4),
             //      d = d(12 + (i+3)%4)
             // which is where the register pairings in the steps below come from.
 153
 154         // a += b + m[blake2b_sigma[r][2*i + 0]];
 155         vadd.u64        d0, d0, d5
 156         vadd.u64        d1, d1, d6
 157         vadd.u64        d2, d2, d7
 158         vadd.u64        d3, d3, d4
 159         vadd.u64        d0, d0, M_\s8
 160         vadd.u64        d1, d1, M_\s10
 161         vadd.u64        d2, d2, M_\s12
 162         vadd.u64        d3, d3, M_\s14
 163
 164         // d = ror64(d ^ a, 32);
 165         veor            d15, d15, d0
 166         veor            d12, d12, d1
 167         veor            d13, d13, d2
 168         veor            d14, d14, d3
 169         vrev64.32       d15, d15
 170         vrev64.32       d12, d12
 171         vrev64.32       d13, d13
 172         vrev64.32       d14, d14
 173
 174         // c += d;
 175         vadd.u64        d10, d10, d15
 176         vadd.u64        d11, d11, d12
 177         vadd.u64        d8, d8, d13
 178         vadd.u64        d9, d9, d14
 179
 180         // b = ror64(b ^ c, 24);
 181         vld1.8          {M_0}, [ROR24_TABLE, :64]
 182         veor            d5, d5, d10
 183         veor            d6, d6, d11
 184         veor            d7, d7, d8
 185         veor            d4, d4, d9
 186         vtbl.8          d5, {d5}, M_0
 187         vtbl.8          d6, {d6}, M_0
 188         vtbl.8          d7, {d7}, M_0
 189         vtbl.8          d4, {d4}, M_0
 190
 191         // a += b + m[blake2b_sigma[r][2*i + 1]];
             // Same rule as in the column half: M_0 currently holds the ror24
             // table, so it is reloaded only if one of these four words is M_0.
 192 .if \s9 == 0 || \s11 == 0 || \s13 == 0 || \s15 == 0
 193         vld1.8          {M_0}, [sp, :64]
 194 .endif
 195         vadd.u64        d0, d0, d5
 196         vadd.u64        d1, d1, d6
 197         vadd.u64        d2, d2, d7
 198         vadd.u64        d3, d3, d4
 199         vadd.u64        d0, d0, M_\s9
 200         vadd.u64        d1, d1, M_\s11
 201         vadd.u64        d2, d2, M_\s13
 202         vadd.u64        d3, d3, M_\s15
 203
 204         // d = ror64(d ^ a, 16);
 205         vld1.8          {M_0}, [ROR16_TABLE, :64]
 206         veor            d15, d15, d0
 207         veor            d12, d12, d1
 208         veor            d13, d13, d2
 209         veor            d14, d14, d3
 210         vtbl.8          d12, {d12}, M_0
 211         vtbl.8          d13, {d13}, M_0
 212         vtbl.8          d14, {d14}, M_0
 213         vtbl.8          d15, {d15}, M_0
 214
 215         // c += d;
 216         vadd.u64        d10, d10, d15
 217         vadd.u64        d11, d11, d12
 218         vadd.u64        d8, d8, d13
 219         vadd.u64        d9, d9, d14
 220
 221         // b = ror64(b ^ c, 63);
             // The XOR results go to d16-d19 (q8-q9) in the order of their 'b'
             // register d4-d7, so the q-register shifts below put each rotated
             // value straight back into d4-d7.
 222         veor            d16, d4, d9
 223         veor            d17, d5, d10
 224         veor            d18, d6, d11
 225         veor            d19, d7, d8
 226         vshr.u64        q2, q8, #63
 227         vshr.u64        q3, q9, #63
 228         vsli.u64        q2, q8, #1
 229         vsli.u64        q3, q9, #1
 230         // Reloading q8-q9 can be skipped on the final round.
 231 .if ! \final
 232         vld1.8          {q8-q9}, [sp, :256]
 233 .endif
 234 .endm
 235
 236 //
 237 // void blake2b_compress_neon(struct blake2b_state *state,
 238 //                            const u8 *block, size_t nblocks, u32 inc);
 239 //
 240 // Only the first three fields of struct blake2b_state are used:
 241 //      u64 h[8];       (inout)
 242 //      u64 t[2];       (inout)
 243 //      u64 f[2];       (in)
 244 //
     // Compresses 'nblocks' consecutive message blocks starting at 'block' into
     // the chaining value h.  Each block is 128 bytes (four 32-byte loads that
     // post-increment BLOCK).  Before each block is compressed, 'inc' is added
     // to the 128-bit counter t and the updated counter is stored back.
     //
     // The loop body runs before nblocks is tested (decrement, then branch if
     // nonzero), so at least one block is always processed; nblocks == 0 is not
     // handled here and must be excluded by the caller.
     //
     // Core registers r4-r10 are saved and restored.  NEON registers q0-q15 are
     // all overwritten and are not saved by this function.
     //
     // NOTE(review): the instructions indented by two extra spaces look like a
     // second, independent instruction stream interleaved with the main one for
     // scheduling purposes -- confirm before reordering anything.
     //
 245         .align          5
 246 ENTRY(blake2b_compress_neon)
 247         push            {r4-r10}
 248
 249         // Allocate a 32-byte stack buffer that is 32-byte aligned.
             // (Subtract 32, then clear the low five bits to round down; the
             // unaligned entry value of sp is kept in ORIG_SP for the epilogue.)
 250         mov             ORIG_SP, sp
 251         sub             ip, sp, #32
 252         bic             ip, ip, #31
 253         mov             sp, ip
 254
 255         adr             ROR24_TABLE, .Lror24_table
 256         adr             ROR16_TABLE, .Lror16_table
 257
             // h[0..7] stays in q0-q3 across the rounds of the first block.  After
             // the two post-incremented loads, ip points at t[0], which is what
             // .Lnext_block expects on every entry.
 258         mov             ip, STATE
 259         vld1.64         {q0-q1}, [ip]!          // Load h[0..3]
 260         vld1.64         {q2-q3}, [ip]!          // Load h[4..7]
 261 .Lnext_block:
             // Here q0-q3 = h[0..7] (v[0..7] of the new state matrix) and ip
             // points at t[0].  q14-q15 are used as temporaries for t and f until
             // the last part of the message block is loaded into them.
 262           adr           r10, .Lblake2b_IV
 263         vld1.64         {q14-q15}, [ip]         // Load t[0..1] and f[0..1]
 264         vld1.64         {q4-q5}, [r10]!         // Load IV[0..3]
 265           vmov          r7, r8, d28             // Copy t[0] to (r7, r8)
 266         vld1.64         {q6-q7}, [r10]          // Load IV[4..7]
 267           adds          r7, r7, INC             // Increment counter
 268         bcs             .Lslow_inc_ctr
             // Fast path: no carry out of the low 32 bits, so only the low word
             // of t[0] changed and only t[0] needs to be written back.
 269         vmov.i32        d28[0], r7
 270         vst1.64         {d28}, [ip]             // Update t[0]
 271 .Linc_ctr_done:
 272
 273         // Load the next message block and finish initializing the state matrix
 274         // 'v'.  Fortunately, there are exactly enough NEON registers to fit the
 275         // entire state matrix in q0-q7 and the entire message block in q8-15.
 276         //
 277         // However, _blake2b_round also needs some extra registers for rotates,
 278         // so we have to spill some registers.  It's better to spill the message
 279         // registers than the state registers, as the message doesn't change.
 280         // Therefore we store a copy of the first 32 bytes of the message block
 281         // (q8-q9) in an aligned buffer on the stack so that they can be
 282         // reloaded when needed.  (We could just reload directly from the
 283         // message buffer, but it's faster to use aligned loads.)
             //
             // q14-q15 still hold the updated t and f at this point; they are
             // XORed into q6-q7 before the last load replaces them with message
             // words M_12-M_15.  ip is reset to STATE so that the old h can be
             // reloaded after the rounds.
 284         vld1.8          {q8-q9}, [BLOCK]!
 285           veor          q6, q6, q14     // v[12..13] = IV[4..5] ^ t[0..1]
 286         vld1.8          {q10-q11}, [BLOCK]!
 287           veor          q7, q7, q15     // v[14..15] = IV[6..7] ^ f[0..1]
 288         vld1.8          {q12-q13}, [BLOCK]!
 289         vst1.8          {q8-q9}, [sp, :256]
 290           mov           ip, STATE
 291         vld1.8          {q14-q15}, [BLOCK]!
 292
 293         // Execute the rounds.  Each round is provided the order in which it
 294         // needs to use the message words.
             //
             // There are twelve rounds using ten distinct orderings; rounds 11 and
             // 12 repeat the orderings of rounds 1 and 2.  The last invocation
             // passes final=1, so q8-q9 are not reloaded and are free for the
             // old h values below.
 295         _blake2b_round  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
 296         _blake2b_round  14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3
 297         _blake2b_round  11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4
 298         _blake2b_round  7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8
 299         _blake2b_round  9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13
 300         _blake2b_round  2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9
 301         _blake2b_round  12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11
 302         _blake2b_round  13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10
 303         _blake2b_round  6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5
 304         _blake2b_round  10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0
 305         _blake2b_round  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
 306         _blake2b_round  14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 \
 307                         final=1
 308
 309         // Fold the final state matrix into the hash chaining value:
 310         //
 311         //      for (i = 0; i < 8; i++)
 312         //              h[i] ^= v[i] ^ v[i + 8];
 313         //
             // The old h is reloaded from memory into q8-q11, which no longer need
             // to hold message words.  The new h is left in q0-q3 for the next
             // block, and the final post-incremented store leaves ip pointing at
             // t[0] again, as .Lnext_block requires.
 314           vld1.64       {q8-q9}, [ip]!          // Load old h[0..3]
 315         veor            q0, q0, q4              // v[0..1] ^= v[8..9]
 316         veor            q1, q1, q5              // v[2..3] ^= v[10..11]
 317           vld1.64       {q10-q11}, [ip]         // Load old h[4..7]
 318         veor            q2, q2, q6              // v[4..5] ^= v[12..13]
 319         veor            q3, q3, q7              // v[6..7] ^= v[14..15]
 320         veor            q0, q0, q8              // v[0..1] ^= h[0..1]
 321         veor            q1, q1, q9              // v[2..3] ^= h[2..3]
 322           mov           ip, STATE
 323           subs          NBLOCKS, NBLOCKS, #1    // nblocks--
 324           vst1.64       {q0-q1}, [ip]!          // Store new h[0..3]
 325         veor            q2, q2, q10             // v[4..5] ^= h[4..5]
 326         veor            q3, q3, q11             // v[6..7] ^= h[6..7]
 327           vst1.64       {q2-q3}, [ip]!          // Store new h[4..7]
 328
 329         // Advance to the next block, if there is one.
             // The flags tested here were set by the 'subs' above; none of the
             // NEON instructions in between modify them.
 330         bne             .Lnext_block            // nblocks != 0?
 331
             // Epilogue: release the aligned buffer by restoring the entry sp, then
             // restore the saved core registers and return.
 332         mov             sp, ORIG_SP
 333         pop             {r4-r10}
 334         mov             pc, lr
 335
 336 .Lslow_inc_ctr:
 337         // Handle the case where the counter overflowed its low 32 bits, by
 338         // carrying the overflow bit into the full 128-bit counter.
             //
             // This is reached only through the 'bcs' above, so the carry flag
             // from the 'adds' is still set on entry (the vmov does not change
             // the flags).  r7 already holds the new low word; the carry is
             // rippled through the other three words (r8, then t[1] in r9:r10)
             // and the whole of t is written back.
 339         vmov            r9, r10, d29
 340         adcs            r8, r8, #0
 341         adcs            r9, r9, #0
 342         adc             r10, r10, #0
 343         vmov            d28, r7, r8
 344         vmov            d29, r9, r10
 345         vst1.64         {q14}, [ip]             // Update t[0] and t[1]
 346         b               .Linc_ctr_done
 347 ENDPROC(blake2b_compress_neon)