arch/arm64/crypto/crct10dif-ce-core.S

   1 //
   2 // Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions
   3 //
   4 // Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
   5 //
   6 // This program is free software; you can redistribute it and/or modify
   7 // it under the terms of the GNU General Public License version 2 as
   8 // published by the Free Software Foundation.
   9 //
  10
  11 //
  12 // Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
  13 //
  14 // Copyright (c) 2013, Intel Corporation
  15 //
  16 // Authors:
  17 //     Erdinc Ozturk <erdinc.ozturk@intel.com>
  18 //     Vinodh Gopal <vinodh.gopal@intel.com>
  19 //     James Guilford <james.guilford@intel.com>
  20 //     Tim Chen <tim.c.chen@linux.intel.com>
  21 //
  22 // This software is available to you under a choice of one of two
  23 // licenses.  You may choose to be licensed under the terms of the GNU
  24 // General Public License (GPL) Version 2, available from the file
  25 // COPYING in the main directory of this source tree, or the
  26 // OpenIB.org BSD license below:
  27 //
  28 // Redistribution and use in source and binary forms, with or without
  29 // modification, are permitted provided that the following conditions are
  30 // met:
  31 //
  32 // * Redistributions of source code must retain the above copyright
  33 //   notice, this list of conditions and the following disclaimer.
  34 //
  35 // * Redistributions in binary form must reproduce the above copyright
  36 //   notice, this list of conditions and the following disclaimer in the
  37 //   documentation and/or other materials provided with the
  38 //   distribution.
  39 //
  40 // * Neither the name of the Intel Corporation nor the names of its
  41 //   contributors may be used to endorse or promote products derived from
  42 //   this software without specific prior written permission.
  43 //
  44 //
  45 // THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY
  46 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  47 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  48 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
  49 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  50 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  51 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  52 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  53 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  54 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  55 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  56 //
  57 //       Function API:
  58 //       UINT16 crc_t10dif_pcl(
  59 //               UINT16 init_crc, //initial CRC value, 16 bits
  60 //               const unsigned char *buf, //buffer pointer to calculate CRC on
  61 //               UINT64 len //buffer length in bytes (64-bit data)
  62 //       );
  63 //
  64 //       Reference paper titled "Fast CRC Computation for Generic
  65 //      Polynomials Using PCLMULQDQ Instruction"
  66 //       URL: http://www.intel.com/content/dam/www/public/us/en/documents
  67 //  /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
  68 //
  69 //
  70
  71 #include <linux/linkage.h>
  72 #include <asm/assembler.h>
  73
   74         .text
              // enable the Crypto Extensions instructions used by this file
              // (pmull/pmull2 on 64-bit lanes)
   75         .cpu            generic+crypto
   76
              // Register aliases used by crc_t10dif_pmull. The three arguments
              // are moved from w0/x1/x2 into these registers on entry:
              //   arg1_low32 - initial crc, shifted left by 16 on entry
              //   arg2       - input pointer, advanced as data is consumed
              //   arg3       - byte counter, biased by the individual loops
   77         arg1_low32      .req    w19
   78         arg2            .req    x20
   79         arg3            .req    x21
   80
              // vector register kept at zero for the duration of the function;
              // used as the zero source for ext and lane moves
   81         vzr             .req    v13
  82
  83 ENTRY(crc_t10dif_pmull)
  84         frame_push      3, 128
  85
  86         mov             arg1_low32, w0
  87         mov             arg2, x1
  88         mov             arg3, x2
  89
  90         movi            vzr.16b, #0             // init zero register
  91
  92         // adjust the 16-bit initial_crc value, scale it to 32 bits
  93         lsl             arg1_low32, arg1_low32, #16
  94
  95         // check if smaller than 256
  96         cmp             arg3, #256
  97
  98         // for sizes less than 128, we can't fold 64B at a time...
  99         b.lt            _less_than_128
 100
 101         // load the initial crc value
 102         // crc value does not need to be byte-reflected, but it needs
 103         // to be moved to the high part of the register.
 104         // because data will be byte-reflected and will align with
 105         // initial crc at correct place.
 106         movi            v10.16b, #0
 107         mov             v10.s[3], arg1_low32            // initial crc
 108
 109         // receive the initial 64B data, xor the initial crc value
 110         ldp             q0, q1, [arg2]
 111         ldp             q2, q3, [arg2, #0x20]
 112         ldp             q4, q5, [arg2, #0x40]
 113         ldp             q6, q7, [arg2, #0x60]
 114         add             arg2, arg2, #0x80
 115
 116 CPU_LE( rev64           v0.16b, v0.16b                  )
 117 CPU_LE( rev64           v1.16b, v1.16b                  )
 118 CPU_LE( rev64           v2.16b, v2.16b                  )
 119 CPU_LE( rev64           v3.16b, v3.16b                  )
 120 CPU_LE( rev64           v4.16b, v4.16b                  )
 121 CPU_LE( rev64           v5.16b, v5.16b                  )
 122 CPU_LE( rev64           v6.16b, v6.16b                  )
 123 CPU_LE( rev64           v7.16b, v7.16b                  )
 124
 125 CPU_LE( ext             v0.16b, v0.16b, v0.16b, #8      )
 126 CPU_LE( ext             v1.16b, v1.16b, v1.16b, #8      )
 127 CPU_LE( ext             v2.16b, v2.16b, v2.16b, #8      )
 128 CPU_LE( ext             v3.16b, v3.16b, v3.16b, #8      )
 129 CPU_LE( ext             v4.16b, v4.16b, v4.16b, #8      )
 130 CPU_LE( ext             v5.16b, v5.16b, v5.16b, #8      )
 131 CPU_LE( ext             v6.16b, v6.16b, v6.16b, #8      )
 132 CPU_LE( ext             v7.16b, v7.16b, v7.16b, #8      )
 133
 134         // XOR the initial_crc value
 135         eor             v0.16b, v0.16b, v10.16b
 136
 137         ldr_l           q10, rk3, x8    // xmm10 has rk3 and rk4
 138                                         // type of pmull instruction
 139                                         // will determine which constant to use
 140
 141         //
 142         // we subtract 256 instead of 128 to save one instruction from the loop
 143         //
 144         sub             arg3, arg3, #256
 145
 146         // at this section of the code, there is 64*x+y (0<=y<64) bytes of
 147         // buffer. The _fold_64_B_loop will fold 64B at a time
 148         // until we have 64+y Bytes of buffer
 149
 150
 151         // fold 64B at a time. This section of the code folds 4 vector
 152         // registers in parallel
 153 _fold_64_B_loop:
 154
 155         .macro          fold64, reg1, reg2
 156         ldp             q11, q12, [arg2], #0x20
 157
 158         pmull2          v8.1q, \reg1\().2d, v10.2d
 159         pmull           \reg1\().1q, \reg1\().1d, v10.1d
 160
 161 CPU_LE( rev64           v11.16b, v11.16b                )
 162 CPU_LE( rev64           v12.16b, v12.16b                )
 163
 164         pmull2          v9.1q, \reg2\().2d, v10.2d
 165         pmull           \reg2\().1q, \reg2\().1d, v10.1d
 166
 167 CPU_LE( ext             v11.16b, v11.16b, v11.16b, #8   )
 168 CPU_LE( ext             v12.16b, v12.16b, v12.16b, #8   )
 169
 170         eor             \reg1\().16b, \reg1\().16b, v8.16b
 171         eor             \reg2\().16b, \reg2\().16b, v9.16b
 172         eor             \reg1\().16b, \reg1\().16b, v11.16b
 173         eor             \reg2\().16b, \reg2\().16b, v12.16b
 174         .endm
 175
 176         fold64          v0, v1
 177         fold64          v2, v3
 178         fold64          v4, v5
 179         fold64          v6, v7
 180
 181         subs            arg3, arg3, #128
 182
 183         // check if there is another 64B in the buffer to be able to fold
 184         b.lt            _fold_64_B_end
 185
 186         if_will_cond_yield_neon
 187         stp             q0, q1, [sp, #.Lframe_local_offset]
 188         stp             q2, q3, [sp, #.Lframe_local_offset + 32]
 189         stp             q4, q5, [sp, #.Lframe_local_offset + 64]
 190         stp             q6, q7, [sp, #.Lframe_local_offset + 96]
 191         do_cond_yield_neon
 192         ldp             q0, q1, [sp, #.Lframe_local_offset]
 193         ldp             q2, q3, [sp, #.Lframe_local_offset + 32]
 194         ldp             q4, q5, [sp, #.Lframe_local_offset + 64]
 195         ldp             q6, q7, [sp, #.Lframe_local_offset + 96]
 196         ldr_l           q10, rk3, x8
 197         movi            vzr.16b, #0             // init zero register
 198         endif_yield_neon
 199
 200         b               _fold_64_B_loop
 201
 202 _fold_64_B_end:
 203         // at this point, the buffer pointer is pointing at the last y Bytes
 204         // of the buffer the 64B of folded data is in 4 of the vector
 205         // registers: v0, v1, v2, v3
 206
 207         // fold the 8 vector registers to 1 vector register with different
 208         // constants
 209
 210         ldr_l           q10, rk9, x8
 211
 212         .macro          fold16, reg, rk
 213         pmull           v8.1q, \reg\().1d, v10.1d
 214         pmull2          \reg\().1q, \reg\().2d, v10.2d
 215         .ifnb           \rk
 216         ldr_l           q10, \rk, x8
 217         .endif
 218         eor             v7.16b, v7.16b, v8.16b
 219         eor             v7.16b, v7.16b, \reg\().16b
 220         .endm
 221
 222         fold16          v0, rk11
 223         fold16          v1, rk13
 224         fold16          v2, rk15
 225         fold16          v3, rk17
 226         fold16          v4, rk19
 227         fold16          v5, rk1
 228         fold16          v6
 229
 230         // instead of 64, we add 48 to the loop counter to save 1 instruction
 231         // from the loop instead of a cmp instruction, we use the negative
 232         // flag with the jl instruction
 233         adds            arg3, arg3, #(128-16)
 234         b.lt            _final_reduction_for_128
 235
 236         // now we have 16+y bytes left to reduce. 16 Bytes is in register v7
 237         // and the rest is in memory. We can fold 16 bytes at a time if y>=16
 238         // continue folding 16B at a time
 239
 240 _16B_reduction_loop:
 241         pmull           v8.1q, v7.1d, v10.1d
 242         pmull2          v7.1q, v7.2d, v10.2d
 243         eor             v7.16b, v7.16b, v8.16b
 244
 245         ldr             q0, [arg2], #16
 246 CPU_LE( rev64           v0.16b, v0.16b                  )
 247 CPU_LE( ext             v0.16b, v0.16b, v0.16b, #8      )
 248         eor             v7.16b, v7.16b, v0.16b
 249         subs            arg3, arg3, #16
 250
 251         // instead of a cmp instruction, we utilize the flags with the
 252         // jge instruction equivalent of: cmp arg3, 16-16
 253         // check if there is any more 16B in the buffer to be able to fold
 254         b.ge            _16B_reduction_loop
 255
 256         // now we have 16+z bytes left to reduce, where 0<= z < 16.
 257         // first, we reduce the data in the xmm7 register
 258
 259 _final_reduction_for_128:
 260         // check if any more data to fold. If not, compute the CRC of
 261         // the final 128 bits
 262         adds            arg3, arg3, #16
 263         b.eq            _128_done
 264
 265         // here we are getting data that is less than 16 bytes.
 266         // since we know that there was data before the pointer, we can
 267         // offset the input pointer before the actual point, to receive
 268         // exactly 16 bytes. after that the registers need to be adjusted.
 269 _get_last_two_regs:
 270         add             arg2, arg2, arg3
 271         ldr             q1, [arg2, #-16]
 272 CPU_LE( rev64           v1.16b, v1.16b                  )
 273 CPU_LE( ext             v1.16b, v1.16b, v1.16b, #8      )
 274
 275         // get rid of the extra data that was loaded before
 276         // load the shift constant
 277         adr_l           x4, tbl_shf_table + 16
 278         sub             x4, x4, arg3
 279         ld1             {v0.16b}, [x4]
 280
 281         // shift v2 to the left by arg3 bytes
 282         tbl             v2.16b, {v7.16b}, v0.16b
 283
 284         // shift v7 to the right by 16-arg3 bytes
 285         movi            v9.16b, #0x80
 286         eor             v0.16b, v0.16b, v9.16b
 287         tbl             v7.16b, {v7.16b}, v0.16b
 288
 289         // blend
 290         sshr            v0.16b, v0.16b, #7      // convert to 8-bit mask
 291         bsl             v0.16b, v2.16b, v1.16b
 292
 293         // fold 16 Bytes
 294         pmull           v8.1q, v7.1d, v10.1d
 295         pmull2          v7.1q, v7.2d, v10.2d
 296         eor             v7.16b, v7.16b, v8.16b
 297         eor             v7.16b, v7.16b, v0.16b
 298
 299 _128_done:
 300         // compute crc of a 128-bit value
 301         ldr_l           q10, rk5, x8            // rk5 and rk6 in xmm10
 302
 303         // 64b fold
 304         ext             v0.16b, vzr.16b, v7.16b, #8
 305         mov             v7.d[0], v7.d[1]
 306         pmull           v7.1q, v7.1d, v10.1d
 307         eor             v7.16b, v7.16b, v0.16b
 308
 309         // 32b fold
 310         ext             v0.16b, v7.16b, vzr.16b, #4
 311         mov             v7.s[3], vzr.s[0]
 312         pmull2          v0.1q, v0.2d, v10.2d
 313         eor             v7.16b, v7.16b, v0.16b
 314
 315         // barrett reduction
 316 _barrett:
 317         ldr_l           q10, rk7, x8
 318         mov             v0.d[0], v7.d[1]
 319
 320         pmull           v0.1q, v0.1d, v10.1d
 321         ext             v0.16b, vzr.16b, v0.16b, #12
 322         pmull2          v0.1q, v0.2d, v10.2d
 323         ext             v0.16b, vzr.16b, v0.16b, #12
 324         eor             v7.16b, v7.16b, v0.16b
 325         mov             w0, v7.s[1]
 326
 327 _cleanup:
 328         // scale the result back to 16 bits
 329         lsr             x0, x0, #16
 330         frame_pop
 331         ret
 332
 333 _less_than_128:
 334         cbz             arg3, _cleanup
 335
 336         movi            v0.16b, #0
 337         mov             v0.s[3], arg1_low32     // get the initial crc value
 338
 339         ldr             q7, [arg2], #0x10
 340 CPU_LE( rev64           v7.16b, v7.16b                  )
 341 CPU_LE( ext             v7.16b, v7.16b, v7.16b, #8      )
 342         eor             v7.16b, v7.16b, v0.16b  // xor the initial crc value
 343
 344         cmp             arg3, #16
 345         b.eq            _128_done               // exactly 16 left
 346         b.lt            _less_than_16_left
 347
 348         ldr_l           q10, rk1, x8            // rk1 and rk2 in xmm10
 349
 350         // update the counter. subtract 32 instead of 16 to save one
 351         // instruction from the loop
 352         subs            arg3, arg3, #32
 353         b.ge            _16B_reduction_loop
 354
 355         add             arg3, arg3, #16
 356         b               _get_last_two_regs
 357
 358 _less_than_16_left:
 359         // shl r9, 4
 360         adr_l           x0, tbl_shf_table + 16
 361         sub             x0, x0, arg3
 362         ld1             {v0.16b}, [x0]
 363         movi            v9.16b, #0x80
 364         eor             v0.16b, v0.16b, v9.16b
 365         tbl             v7.16b, {v7.16b}, v0.16b
 366         b               _128_done
 367 ENDPROC(crc_t10dif_pmull)
 368
  369 // precomputed constants
  370 // these constants are precomputed from the poly:
  371 // 0x8bb70000 (0x8bb7 scaled to 32 bits)
  372         .section        ".rodata", "a"
  373         .align          4
  374 // Q = 0x18BB70000
  375 // rk1 = 2^(32*3) mod Q << 32
  376 // rk2 = 2^(32*5) mod Q << 32
  377 // rk3 = 2^(32*15) mod Q << 32
  378 // rk4 = 2^(32*17) mod Q << 32
  379 // rk5 = 2^(32*3) mod Q << 32
  380 // rk6 = 2^(32*2) mod Q << 32
  381 // rk7 = floor(2^64/Q)
  382 // rk8 = Q
      //
      // Each .octa below packs a pair of constants and is loaded into v10 as a
      // whole: the odd-numbered constant sits in the low 64 bits (consumed by
      // pmull on v10.1d) and the following even-numbered constant in the high
      // 64 bits (consumed by pmull2 on v10.2d).
  383
  384 rk1:    .octa           0x06df0000000000002d56000000000000
  385 rk3:    .octa           0x7cf50000000000009d9d000000000000
  386 rk5:    .octa           0x13680000000000002d56000000000000
  387 rk7:    .octa           0x000000018bb7000000000001f65a57f8
  388 rk9:    .octa           0xbfd6000000000000ceae000000000000
  389 rk11:   .octa           0x713c0000000000001e16000000000000
  390 rk13:   .octa           0x80a6000000000000f7f9000000000000
  391 rk15:   .octa           0xe658000000000000044c000000000000
  392 rk17:   .octa           0xa497000000000000ad18000000000000
  393 rk19:   .octa           0xe7b50000000000006ee3000000000000
  394
  395 tbl_shf_table:
  396 // use these values for shift constants for the tbl/tbx instruction
  397 // different alignments result in values as shown:
  398 //      DDQ 0x008f8e8d8c8b8a898887868584838281 # shl 15 (16-1) / shr1
  399 //      DDQ 0x01008f8e8d8c8b8a8988878685848382 # shl 14 (16-2) / shr2
  400 //      DDQ 0x0201008f8e8d8c8b8a89888786858483 # shl 13 (16-3) / shr3
  401 //      DDQ 0x030201008f8e8d8c8b8a898887868584 # shl 12 (16-4) / shr4
  402 //      DDQ 0x04030201008f8e8d8c8b8a8988878685 # shl 11 (16-5) / shr5
  403 //      DDQ 0x0504030201008f8e8d8c8b8a89888786 # shl 10 (16-6) / shr6
  404 //      DDQ 0x060504030201008f8e8d8c8b8a898887 # shl 9  (16-7) / shr7
  405 //      DDQ 0x07060504030201008f8e8d8c8b8a8988 # shl 8  (16-8) / shr8
  406 //      DDQ 0x0807060504030201008f8e8d8c8b8a89 # shl 7  (16-9) / shr9
  407 //      DDQ 0x090807060504030201008f8e8d8c8b8a # shl 6  (16-10) / shr10
  408 //      DDQ 0x0a090807060504030201008f8e8d8c8b # shl 5  (16-11) / shr11
  409 //      DDQ 0x0b0a090807060504030201008f8e8d8c # shl 4  (16-12) / shr12
  410 //      DDQ 0x0c0b0a090807060504030201008f8e8d # shl 3  (16-13) / shr13
  411 //      DDQ 0x0d0c0b0a090807060504030201008f8e # shl 2  (16-14) / shr14
  412 //      DDQ 0x0e0d0c0b0a090807060504030201008f # shl 1  (16-15) / shr15
      //
      // The code loads 16 bytes starting at tbl_shf_table + 16 - n, where n is
      // the number of remaining bytes (1 <= n <= 15). Index bytes with bit 7
      // set are out of range for a single-register tbl and therefore select
      // zero; the code XORs the loaded vector with 0x80 to swap which half of
      // the window is live, turning the left shift into the matching right
      // shift.
  413
  414         .byte            0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
  415         .byte           0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
  416         .byte            0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
  417         .byte            0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe , 0x0