/*
 * Accelerated CRC32(C) using arm64 CRC, NEON and Crypto Extensions instructions
 *
 * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

/* GPL HEADER START
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 only,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License version 2 for more details (a copy is included
 * in the LICENSE file that accompanied this code).
 *
 * You should have received a copy of the GNU General Public License
 * version 2 along with this program; If not, see http://www.gnu.org/licenses
 *
 * Please  visit http://www.xyratex.com/contact if you need additional
 * information or have any questions.
 *
 * GPL HEADER END
 */

/*
 * Copyright 2012 Xyratex Technology Limited
 *
 * Using hardware provided PCLMULQDQ instruction to accelerate the CRC32
 * calculation.
 * CRC32 polynomial: 0x04c11db7 (BE) / 0xEDB88320 (LE)
 * PCLMULQDQ is a new instruction in Intel SSE4.2, the reference can be found
 * at:
 * http://www.intel.com/products/processor/manuals/
 * Intel(R) 64 and IA-32 Architectures Software Developer's Manual
 * Volume 2B: Instruction Set Reference, N-Z
 *
 * Authors:   Gregory Prestas <Gregory_Prestas@us.xyratex.com>
 *            Alexander Boyko <Alexander_Boyko@xyratex.com>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

        .text
        .align          6
        .cpu            generic+crypto+crc
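        /* the .cpu directive above lets the assembler accept the CRC32
         * and PMULL (crypto) instructions used below */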

.Lcrc32_constants:
        /*
         * [(x4*128+32 mod P(x) << 32)]' << 1   = 0x154442bd4
         * #define CONSTANT_R1  0x154442bd4LL
         *
         * [(x4*128-32 mod P(x) << 32)]' << 1   = 0x1c6e41596
         * #define CONSTANT_R2  0x1c6e41596LL
         */
        .octa           0x00000001c6e415960000000154442bd4

        /*
         * [(x128+32 mod P(x) << 32)]'   << 1   = 0x1751997d0
         * #define CONSTANT_R3  0x1751997d0LL
         *
         * [(x128-32 mod P(x) << 32)]'   << 1   = 0x0ccaa009e
         * #define CONSTANT_R4  0x0ccaa009eLL
         */
        .octa           0x00000000ccaa009e00000001751997d0

        /*
         * [(x64 mod P(x) << 32)]'       << 1   = 0x163cd6124
         * #define CONSTANT_R5  0x163cd6124LL
         */
        .quad           0x0000000163cd6124
        .quad           0x00000000FFFFFFFF

        /*
         * #define CRCPOLY_TRUE_LE_FULL 0x1DB710641LL
         *
         * Barrett Reduction constant (u64`) = u` = (x**64 / P(x))`
         *                                        = 0x1F7011641LL
         * #define CONSTANT_RU  0x1F7011641LL
         */
        .octa           0x00000001F701164100000001DB710641

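        /*
         * The equivalent fold and Barrett reduction constants, generated
         * for the CRC32C (Castagnoli) polynomial 0x1EDC6F41 (BE) /
         * 0x82F63B78 (LE), in the same layout as .Lcrc32_constants above.
         */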
.Lcrc32c_constants:
        .octa           0x000000009e4addf800000000740eef02
        .octa           0x000000014cd00bd600000000f20c0dfe
        .quad           0x00000000dd45aab8
        .quad           0x00000000FFFFFFFF
        .octa           0x00000000dea713f10000000105ec76f0

        vCONSTANT       .req    v0
        dCONSTANT       .req    d0
        qCONSTANT       .req    q0

        BUF             .req    x0
        LEN             .req    x1
        CRC             .req    x2

        vzr             .req    v9
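        /* vzr is kept all-zero and used by ext to shift in zero bytes */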

        /**
         * Calculate crc32
         * BUF - buffer
         * LEN - size of buffer (multiple of 16 bytes), must be > 63
         * CRC - initial crc32
         * return value: crc32 in w0
         * uint crc32_pmull_le(unsigned char const *buffer,
         *                     size_t len, uint crc32)
         */
ENTRY(crc32_pmull_le)
        adr             x3, .Lcrc32_constants
        b               0f

ENTRY(crc32c_pmull_le)
        adr             x3, .Lcrc32c_constants

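        /*
         * Round LEN down to a multiple of 16, load the first 64 bytes and
         * seed the fold by XORing the initial CRC into the first vector.
         */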
0:      bic             LEN, LEN, #15
        ld1             {v1.16b-v4.16b}, [BUF], #0x40
        movi            vzr.16b, #0
        fmov            dCONSTANT, CRC
        eor             v1.16b, v1.16b, vCONSTANT.16b
        sub             LEN, LEN, #0x40
        cmp             LEN, #0x40
        b.lt            less_64

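        /* q0 = R2:R1, the constants for folding 64 bytes (512 bits) ahead */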
        ldr             qCONSTANT, [x3]

loop_64:                /* fold a full 64-byte cache line per iteration */
        sub             LEN, LEN, #0x40

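        /*
         * For each of v1-v4, multiply the high half by R2 (pmull2) and the
         * low half by R1 (pmull), then XOR both products with the next
         * 64 bytes of input.
         */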
        pmull2          v5.1q, v1.2d, vCONSTANT.2d
        pmull2          v6.1q, v2.2d, vCONSTANT.2d
        pmull2          v7.1q, v3.2d, vCONSTANT.2d
        pmull2          v8.1q, v4.2d, vCONSTANT.2d

        pmull           v1.1q, v1.1d, vCONSTANT.1d
        pmull           v2.1q, v2.1d, vCONSTANT.1d
        pmull           v3.1q, v3.1d, vCONSTANT.1d
        pmull           v4.1q, v4.1d, vCONSTANT.1d

        eor             v1.16b, v1.16b, v5.16b
        ld1             {v5.16b}, [BUF], #0x10
        eor             v2.16b, v2.16b, v6.16b
        ld1             {v6.16b}, [BUF], #0x10
        eor             v3.16b, v3.16b, v7.16b
        ld1             {v7.16b}, [BUF], #0x10
        eor             v4.16b, v4.16b, v8.16b
        ld1             {v8.16b}, [BUF], #0x10

        eor             v1.16b, v1.16b, v5.16b
        eor             v2.16b, v2.16b, v6.16b
        eor             v3.16b, v3.16b, v7.16b
        eor             v4.16b, v4.16b, v8.16b

        cmp             LEN, #0x40
        b.ge            loop_64

less_64:                /* fold the cache line into 128 bits */
        ldr             qCONSTANT, [x3, #16]

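        /* q0 = R4:R3; fold v2, v3 and v4 into v1, 128 bits at a time */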
        pmull2          v5.1q, v1.2d, vCONSTANT.2d
        pmull           v1.1q, v1.1d, vCONSTANT.1d
        eor             v1.16b, v1.16b, v5.16b
        eor             v1.16b, v1.16b, v2.16b

        pmull2          v5.1q, v1.2d, vCONSTANT.2d
        pmull           v1.1q, v1.1d, vCONSTANT.1d
        eor             v1.16b, v1.16b, v5.16b
        eor             v1.16b, v1.16b, v3.16b

        pmull2          v5.1q, v1.2d, vCONSTANT.2d
        pmull           v1.1q, v1.1d, vCONSTANT.1d
        eor             v1.16b, v1.16b, v5.16b
        eor             v1.16b, v1.16b, v4.16b

        cbz             LEN, fold_64

loop_16:                /* fold the remaining buffer into 128 bits */
        subs            LEN, LEN, #0x10

        ld1             {v2.16b}, [BUF], #0x10
        pmull2          v5.1q, v1.2d, vCONSTANT.2d
        pmull           v1.1q, v1.1d, vCONSTANT.1d
        eor             v1.16b, v1.16b, v5.16b
        eor             v1.16b, v1.16b, v2.16b

        b.ne            loop_16

fold_64:
        /* perform the last 64-bit fold; this also appends 32 zero bits
         * to the input stream */
        ext             v2.16b, v1.16b, v1.16b, #8
        pmull2          v2.1q, v2.2d, vCONSTANT.2d
        ext             v1.16b, v1.16b, vzr.16b, #8
        eor             v1.16b, v1.16b, v2.16b

        /* final 32-bit fold */
        ldr             dCONSTANT, [x3, #32]
        ldr             d3, [x3, #40]

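        /* d0 = R5, d3 = 32-bit mask: fold the low 32 bits of v1 by R5 and
         * XOR with the remaining bits shifted down by 32 */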
        ext             v2.16b, v1.16b, vzr.16b, #4
        and             v1.16b, v1.16b, v3.16b
        pmull           v1.1q, v1.1d, vCONSTANT.1d
        eor             v1.16b, v1.16b, v2.16b

        /* finish up with the bit-reversed Barrett reduction, 64 -> 32 bits */
        ldr             qCONSTANT, [x3, #48]

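        /*
         * q0 = RU:CRCPOLY_TRUE_LE_FULL. Multiply the low 32 bits by RU to
         * estimate the quotient, mask, multiply by the polynomial and XOR;
         * the reduced CRC ends up in lane 1 of v1.
         */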
        and             v2.16b, v1.16b, v3.16b
        ext             v2.16b, vzr.16b, v2.16b, #8
        pmull2          v2.1q, v2.2d, vCONSTANT.2d
        and             v2.16b, v2.16b, v3.16b
        pmull           v2.1q, v2.1d, vCONSTANT.1d
        eor             v1.16b, v1.16b, v2.16b
        mov             w0, v1.s[1]

        ret
ENDPROC(crc32_pmull_le)
ENDPROC(crc32c_pmull_le)

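        /*
         * Scalar fallback using the ARMv8 CRC32 instructions: consume
         * 16 bytes per iteration, then handle the 8/4/2/1 byte tail by
         * testing the remaining length bits. Loads are byte-swapped on
         * big-endian kernels so the CRC instructions see LE data.
         */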
        .macro          __crc32, c
0:      subs            x2, x2, #16
        b.mi            8f
        ldp             x3, x4, [x1], #16
CPU_BE( rev             x3, x3          )
CPU_BE( rev             x4, x4          )
        crc32\c\()x     w0, w0, x3
        crc32\c\()x     w0, w0, x4
        b.ne            0b
        ret

8:      tbz             x2, #3, 4f
        ldr             x3, [x1], #8
CPU_BE( rev             x3, x3          )
        crc32\c\()x     w0, w0, x3
4:      tbz             x2, #2, 2f
        ldr             w3, [x1], #4
CPU_BE( rev             w3, w3          )
        crc32\c\()w     w0, w0, w3
2:      tbz             x2, #1, 1f
        ldrh            w3, [x1], #2
CPU_BE( rev16           w3, w3          )
        crc32\c\()h     w0, w0, w3
1:      tbz             x2, #0, 0f
        ldrb            w3, [x1]
        crc32\c\()b     w0, w0, w3
0:      ret
        .endm

        .align          5
ENTRY(crc32_armv8_le)
        __crc32
ENDPROC(crc32_armv8_le)

        .align          5
ENTRY(crc32c_armv8_le)
        __crc32         c
ENDPROC(crc32c_armv8_le)