/*
 * Accelerated CRC32(C) using arm64 CRC, NEON and Crypto Extensions instructions
 *
 * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

/* GPL HEADER START
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 only,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License version 2 for more details (a copy is included
 * in the LICENSE file that accompanied this code).
 *
 * You should have received a copy of the GNU General Public License
 * version 2 along with this program; If not, see http://www.gnu.org/licenses
 *
 * Please  visit http://www.xyratex.com/contact if you need additional
 * information or have any questions.
 *
 * GPL HEADER END
 */

/*
 * Copyright 2012 Xyratex Technology Limited
 *
 * Using hardware provided PCLMULQDQ instruction to accelerate the CRC32
 * calculation.
 * CRC32 polynomial: 0x04c11db7 (BE) / 0xEDB88320 (LE)
 * PCLMULQDQ is a new instruction in Intel SSE4.2, the reference can be found
 * at:
 * http://www.intel.com/products/processor/manuals/
 * Intel(R) 64 and IA-32 Architectures Software Developer's Manual
 * Volume 2B: Instruction Set Reference, N-Z
 *
 * Authors:   Gregory Prestas <Gregory_Prestas@us.xyratex.com>
 *            Alexander Boyko <Alexander_Boyko@xyratex.com>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

        .text
        .align          6
        .cpu            generic+crypto+crc
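        /* the .cpu directive above lets the assembler accept the CRC32
         * and PMULL (crypto) instructions used below */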

.Lcrc32_constants:
        /*
         * [(x4*128+32 mod P(x) << 32)]' << 1   = 0x154442bd4
         * #define CONSTANT_R1  0x154442bd4LL
         *
         * [(x4*128-32 mod P(x) << 32)]' << 1   = 0x1c6e41596
         * #define CONSTANT_R2  0x1c6e41596LL
         */
        .octa           0x00000001c6e415960000000154442bd4

        /*
         * [(x128+32 mod P(x) << 32)]'   << 1   = 0x1751997d0
         * #define CONSTANT_R3  0x1751997d0LL
         *
         * [(x128-32 mod P(x) << 32)]'   << 1   = 0x0ccaa009e
         * #define CONSTANT_R4  0x0ccaa009eLL
         */
        .octa           0x00000000ccaa009e00000001751997d0

        /*
         * [(x64 mod P(x) << 32)]'       << 1   = 0x163cd6124
         * #define CONSTANT_R5  0x163cd6124LL
         */
        .quad           0x0000000163cd6124
        .quad           0x00000000FFFFFFFF

        /*
         * #define CRCPOLY_TRUE_LE_FULL 0x1DB710641LL
         *
         * Barrett Reduction constant (u64`) = u` = (x**64 / P(x))`
         *                                        = 0x1F7011641LL
         * #define CONSTANT_RU  0x1F7011641LL
         */
        .octa           0x00000001F701164100000001DB710641

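        /*
         * The equivalent fold and Barrett reduction constants, generated
         * for the CRC32C (Castagnoli) polynomial 0x1EDC6F41 (BE) /
         * 0x82F63B78 (LE), in the same layout as .Lcrc32_constants above.
         */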
.Lcrc32c_constants:
        .octa           0x000000009e4addf800000000740eef02
        .octa           0x000000014cd00bd600000000f20c0dfe
        .quad           0x00000000dd45aab8
        .quad           0x00000000FFFFFFFF
        .octa           0x00000000dea713f10000000105ec76f0

        vCONSTANT       .req    v0
        dCONSTANT       .req    d0
        qCONSTANT       .req    q0

        BUF             .req    x0
        LEN             .req    x1
        CRC             .req    x2

        vzr             .req    v9
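        /* vzr is kept all-zero and used by ext to shift in zero bytes */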

        /**
         * Calculate crc32
         * BUF - buffer
         * LEN - size of buffer (multiple of 16 bytes), must be > 63
         * CRC - initial crc32
         * return value: crc32 in w0
         * uint crc32_pmull_le(unsigned char const *buffer,
         *                     size_t len, uint crc32)
         */
ENTRY(crc32_pmull_le)
        adr             x3, .Lcrc32_constants
        b               0f

ENTRY(crc32c_pmull_le)
        adr             x3, .Lcrc32c_constants

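        /*
         * Round LEN down to a multiple of 16, load the first 64 bytes and
         * seed the fold by XORing the initial CRC into the first vector.
         */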
0:      bic             LEN, LEN, #15
        ld1             {v1.16b-v4.16b}, [BUF], #0x40
        movi            vzr.16b, #0
        fmov            dCONSTANT, CRC
        eor             v1.16b, v1.16b, vCONSTANT.16b
        sub             LEN, LEN, #0x40
        cmp             LEN, #0x40
        b.lt            less_64

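        /* q0 = R2:R1, the constants for folding 64 bytes (512 bits) ahead */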
        ldr             qCONSTANT, [x3]

loop_64:                /* fold a full 64-byte cache line per iteration */
        sub             LEN, LEN, #0x40

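        /*
         * For each of v1-v4, multiply the high half by R2 (pmull2) and the
         * low half by R1 (pmull), then XOR both products with the next
         * 64 bytes of input.
         */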
        pmull2          v5.1q, v1.2d, vCONSTANT.2d
        pmull2          v6.1q, v2.2d, vCONSTANT.2d
        pmull2          v7.1q, v3.2d, vCONSTANT.2d
        pmull2          v8.1q, v4.2d, vCONSTANT.2d

        pmull           v1.1q, v1.1d, vCONSTANT.1d
        pmull           v2.1q, v2.1d, vCONSTANT.1d
        pmull           v3.1q, v3.1d, vCONSTANT.1d
        pmull           v4.1q, v4.1d, vCONSTANT.1d

        eor             v1.16b, v1.16b, v5.16b
        ld1             {v5.16b}, [BUF], #0x10
        eor             v2.16b, v2.16b, v6.16b
        ld1             {v6.16b}, [BUF], #0x10
        eor             v3.16b, v3.16b, v7.16b
        ld1             {v7.16b}, [BUF], #0x10
        eor             v4.16b, v4.16b, v8.16b
        ld1             {v8.16b}, [BUF], #0x10

        eor             v1.16b, v1.16b, v5.16b
        eor             v2.16b, v2.16b, v6.16b
        eor             v3.16b, v3.16b, v7.16b
        eor             v4.16b, v4.16b, v8.16b

        cmp             LEN, #0x40
        b.ge            loop_64

less_64:                /* fold the cache line into 128 bits */
        ldr             qCONSTANT, [x3, #16]

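        /* q0 = R4:R3; fold v2, v3 and v4 into v1, 128 bits at a time */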
        pmull2          v5.1q, v1.2d, vCONSTANT.2d
        pmull           v1.1q, v1.1d, vCONSTANT.1d
        eor             v1.16b, v1.16b, v5.16b
        eor             v1.16b, v1.16b, v2.16b

        pmull2          v5.1q, v1.2d, vCONSTANT.2d
        pmull           v1.1q, v1.1d, vCONSTANT.1d
        eor             v1.16b, v1.16b, v5.16b
        eor             v1.16b, v1.16b, v3.16b

        pmull2          v5.1q, v1.2d, vCONSTANT.2d
        pmull           v1.1q, v1.1d, vCONSTANT.1d
        eor             v1.16b, v1.16b, v5.16b
        eor             v1.16b, v1.16b, v4.16b

        cbz             LEN, fold_64

loop_16:                /* fold the remaining buffer into 128 bits */
        subs            LEN, LEN, #0x10

        ld1             {v2.16b}, [BUF], #0x10
        pmull2          v5.1q, v1.2d, vCONSTANT.2d
        pmull           v1.1q, v1.1d, vCONSTANT.1d
        eor             v1.16b, v1.16b, v5.16b
        eor             v1.16b, v1.16b, v2.16b

        b.ne            loop_16

fold_64:
        /* perform the last 64-bit fold; this also appends 32 zero bits
         * to the input stream */
        ext             v2.16b, v1.16b, v1.16b, #8
        pmull2          v2.1q, v2.2d, vCONSTANT.2d
        ext             v1.16b, v1.16b, vzr.16b, #8
        eor             v1.16b, v1.16b, v2.16b

        /* final 32-bit fold */
        ldr             dCONSTANT, [x3, #32]
        ldr             d3, [x3, #40]

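        /* d0 = R5, d3 = 32-bit mask: fold the low 32 bits of v1 by R5 and
         * XOR with the remaining bits shifted down by 32 */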
        ext             v2.16b, v1.16b, vzr.16b, #4
        and             v1.16b, v1.16b, v3.16b
        pmull           v1.1q, v1.1d, vCONSTANT.1d
        eor             v1.16b, v1.16b, v2.16b

        /* finish up with the bit-reversed Barrett reduction, 64 -> 32 bits */
        ldr             qCONSTANT, [x3, #48]

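        /*
         * q0 = RU:CRCPOLY_TRUE_LE_FULL. Multiply the low 32 bits by RU to
         * estimate the quotient, mask, multiply by the polynomial and XOR;
         * the reduced CRC ends up in lane 1 of v1.
         */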
        and             v2.16b, v1.16b, v3.16b
        ext             v2.16b, vzr.16b, v2.16b, #8
        pmull2          v2.1q, v2.2d, vCONSTANT.2d
        and             v2.16b, v2.16b, v3.16b
        pmull           v2.1q, v2.1d, vCONSTANT.1d
        eor             v1.16b, v1.16b, v2.16b
        mov             w0, v1.s[1]

        ret
ENDPROC(crc32_pmull_le)
ENDPROC(crc32c_pmull_le)

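        /*
         * Scalar fallback using the ARMv8 CRC32 instructions: consume
         * 16 bytes per iteration, then handle the 8/4/2/1 byte tail by
         * testing the remaining length bits. Loads are byte-swapped on
         * big-endian kernels so the CRC instructions see LE data.
         */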
        .macro          __crc32, c
0:      subs            x2, x2, #16
        b.mi            8f
        ldp             x3, x4, [x1], #16
CPU_BE( rev             x3, x3          )
CPU_BE( rev             x4, x4          )
        crc32\c\()x     w0, w0, x3
        crc32\c\()x     w0, w0, x4
        b.ne            0b
        ret

8:      tbz             x2, #3, 4f
        ldr             x3, [x1], #8
CPU_BE( rev             x3, x3          )
        crc32\c\()x     w0, w0, x3
4:      tbz             x2, #2, 2f
        ldr             w3, [x1], #4
CPU_BE( rev             w3, w3          )
        crc32\c\()w     w0, w0, w3
2:      tbz             x2, #1, 1f
        ldrh            w3, [x1], #2
CPU_BE( rev16           w3, w3          )
        crc32\c\()h     w0, w0, w3
1:      tbz             x2, #0, 0f
        ldrb            w3, [x1]
        crc32\c\()b     w0, w0, w3
0:      ret
        .endm

        .align          5
ENTRY(crc32_armv8_le)
        __crc32
ENDPROC(crc32_armv8_le)

        .align          5
ENTRY(crc32c_armv8_le)
        __crc32         c
ENDPROC(crc32c_armv8_le)