// SPDX-License-Identifier: GPL-2.0-only
/*
 * arch/arm64/lib/xor-neon.c
 *
 * Authors: Jackie Liu <liuyun01@kylinos.cn>
 * Copyright (C) 2018,Tianjin KYLIN Information Technology Co., Ltd.
 */
9 #include <linux/raid/xor.h>
10 #include <linux/module.h>
11 #include <asm/neon-intrinsics.h>
13 static void xor_arm64_neon_2(unsigned long bytes, unsigned long * __restrict p1,
14 const unsigned long * __restrict p2)
16 uint64_t *dp1 = (uint64_t *)p1;
17 uint64_t *dp2 = (uint64_t *)p2;
19 register uint64x2_t v0, v1, v2, v3;
20 long lines = bytes / (sizeof(uint64x2_t) * 4);
24 v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0));
25 v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2));
26 v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4));
27 v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6));
30 vst1q_u64(dp1 + 0, v0);
31 vst1q_u64(dp1 + 2, v1);
32 vst1q_u64(dp1 + 4, v2);
33 vst1q_u64(dp1 + 6, v3);
37 } while (--lines > 0);
40 static void xor_arm64_neon_3(unsigned long bytes, unsigned long * __restrict p1,
41 const unsigned long * __restrict p2,
42 const unsigned long * __restrict p3)
44 uint64_t *dp1 = (uint64_t *)p1;
45 uint64_t *dp2 = (uint64_t *)p2;
46 uint64_t *dp3 = (uint64_t *)p3;
48 register uint64x2_t v0, v1, v2, v3;
49 long lines = bytes / (sizeof(uint64x2_t) * 4);
53 v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0));
54 v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2));
55 v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4));
56 v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6));
59 v0 = veorq_u64(v0, vld1q_u64(dp3 + 0));
60 v1 = veorq_u64(v1, vld1q_u64(dp3 + 2));
61 v2 = veorq_u64(v2, vld1q_u64(dp3 + 4));
62 v3 = veorq_u64(v3, vld1q_u64(dp3 + 6));
65 vst1q_u64(dp1 + 0, v0);
66 vst1q_u64(dp1 + 2, v1);
67 vst1q_u64(dp1 + 4, v2);
68 vst1q_u64(dp1 + 6, v3);
73 } while (--lines > 0);
76 static void xor_arm64_neon_4(unsigned long bytes, unsigned long * __restrict p1,
77 const unsigned long * __restrict p2,
78 const unsigned long * __restrict p3,
79 const unsigned long * __restrict p4)
81 uint64_t *dp1 = (uint64_t *)p1;
82 uint64_t *dp2 = (uint64_t *)p2;
83 uint64_t *dp3 = (uint64_t *)p3;
84 uint64_t *dp4 = (uint64_t *)p4;
86 register uint64x2_t v0, v1, v2, v3;
87 long lines = bytes / (sizeof(uint64x2_t) * 4);
91 v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0));
92 v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2));
93 v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4));
94 v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6));
97 v0 = veorq_u64(v0, vld1q_u64(dp3 + 0));
98 v1 = veorq_u64(v1, vld1q_u64(dp3 + 2));
99 v2 = veorq_u64(v2, vld1q_u64(dp3 + 4));
100 v3 = veorq_u64(v3, vld1q_u64(dp3 + 6));
103 v0 = veorq_u64(v0, vld1q_u64(dp4 + 0));
104 v1 = veorq_u64(v1, vld1q_u64(dp4 + 2));
105 v2 = veorq_u64(v2, vld1q_u64(dp4 + 4));
106 v3 = veorq_u64(v3, vld1q_u64(dp4 + 6));
109 vst1q_u64(dp1 + 0, v0);
110 vst1q_u64(dp1 + 2, v1);
111 vst1q_u64(dp1 + 4, v2);
112 vst1q_u64(dp1 + 6, v3);
118 } while (--lines > 0);
121 static void xor_arm64_neon_5(unsigned long bytes, unsigned long * __restrict p1,
122 const unsigned long * __restrict p2,
123 const unsigned long * __restrict p3,
124 const unsigned long * __restrict p4,
125 const unsigned long * __restrict p5)
127 uint64_t *dp1 = (uint64_t *)p1;
128 uint64_t *dp2 = (uint64_t *)p2;
129 uint64_t *dp3 = (uint64_t *)p3;
130 uint64_t *dp4 = (uint64_t *)p4;
131 uint64_t *dp5 = (uint64_t *)p5;
133 register uint64x2_t v0, v1, v2, v3;
134 long lines = bytes / (sizeof(uint64x2_t) * 4);
138 v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0));
139 v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2));
140 v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4));
141 v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6));
144 v0 = veorq_u64(v0, vld1q_u64(dp3 + 0));
145 v1 = veorq_u64(v1, vld1q_u64(dp3 + 2));
146 v2 = veorq_u64(v2, vld1q_u64(dp3 + 4));
147 v3 = veorq_u64(v3, vld1q_u64(dp3 + 6));
150 v0 = veorq_u64(v0, vld1q_u64(dp4 + 0));
151 v1 = veorq_u64(v1, vld1q_u64(dp4 + 2));
152 v2 = veorq_u64(v2, vld1q_u64(dp4 + 4));
153 v3 = veorq_u64(v3, vld1q_u64(dp4 + 6));
156 v0 = veorq_u64(v0, vld1q_u64(dp5 + 0));
157 v1 = veorq_u64(v1, vld1q_u64(dp5 + 2));
158 v2 = veorq_u64(v2, vld1q_u64(dp5 + 4));
159 v3 = veorq_u64(v3, vld1q_u64(dp5 + 6));
162 vst1q_u64(dp1 + 0, v0);
163 vst1q_u64(dp1 + 2, v1);
164 vst1q_u64(dp1 + 4, v2);
165 vst1q_u64(dp1 + 6, v3);
172 } while (--lines > 0);
175 struct xor_block_template xor_block_inner_neon __ro_after_init = {
176 .name = "__inner_neon__",
177 .do_2 = xor_arm64_neon_2,
178 .do_3 = xor_arm64_neon_3,
179 .do_4 = xor_arm64_neon_4,
180 .do_5 = xor_arm64_neon_5,
182 EXPORT_SYMBOL(xor_block_inner_neon);
184 static inline uint64x2_t eor3(uint64x2_t p, uint64x2_t q, uint64x2_t r)
188 asm(ARM64_ASM_PREAMBLE ".arch_extension sha3\n"
189 "eor3 %0.16b, %1.16b, %2.16b, %3.16b"
190 : "=w"(res) : "w"(p), "w"(q), "w"(r));
194 static void xor_arm64_eor3_3(unsigned long bytes,
195 unsigned long * __restrict p1,
196 const unsigned long * __restrict p2,
197 const unsigned long * __restrict p3)
199 uint64_t *dp1 = (uint64_t *)p1;
200 uint64_t *dp2 = (uint64_t *)p2;
201 uint64_t *dp3 = (uint64_t *)p3;
203 register uint64x2_t v0, v1, v2, v3;
204 long lines = bytes / (sizeof(uint64x2_t) * 4);
208 v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
210 v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
212 v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
214 v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
218 vst1q_u64(dp1 + 0, v0);
219 vst1q_u64(dp1 + 2, v1);
220 vst1q_u64(dp1 + 4, v2);
221 vst1q_u64(dp1 + 6, v3);
226 } while (--lines > 0);
229 static void xor_arm64_eor3_4(unsigned long bytes,
230 unsigned long * __restrict p1,
231 const unsigned long * __restrict p2,
232 const unsigned long * __restrict p3,
233 const unsigned long * __restrict p4)
235 uint64_t *dp1 = (uint64_t *)p1;
236 uint64_t *dp2 = (uint64_t *)p2;
237 uint64_t *dp3 = (uint64_t *)p3;
238 uint64_t *dp4 = (uint64_t *)p4;
240 register uint64x2_t v0, v1, v2, v3;
241 long lines = bytes / (sizeof(uint64x2_t) * 4);
245 v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
247 v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
249 v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
251 v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
255 v0 = veorq_u64(v0, vld1q_u64(dp4 + 0));
256 v1 = veorq_u64(v1, vld1q_u64(dp4 + 2));
257 v2 = veorq_u64(v2, vld1q_u64(dp4 + 4));
258 v3 = veorq_u64(v3, vld1q_u64(dp4 + 6));
261 vst1q_u64(dp1 + 0, v0);
262 vst1q_u64(dp1 + 2, v1);
263 vst1q_u64(dp1 + 4, v2);
264 vst1q_u64(dp1 + 6, v3);
270 } while (--lines > 0);
273 static void xor_arm64_eor3_5(unsigned long bytes,
274 unsigned long * __restrict p1,
275 const unsigned long * __restrict p2,
276 const unsigned long * __restrict p3,
277 const unsigned long * __restrict p4,
278 const unsigned long * __restrict p5)
280 uint64_t *dp1 = (uint64_t *)p1;
281 uint64_t *dp2 = (uint64_t *)p2;
282 uint64_t *dp3 = (uint64_t *)p3;
283 uint64_t *dp4 = (uint64_t *)p4;
284 uint64_t *dp5 = (uint64_t *)p5;
286 register uint64x2_t v0, v1, v2, v3;
287 long lines = bytes / (sizeof(uint64x2_t) * 4);
291 v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
293 v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
295 v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
297 v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
301 v0 = eor3(v0, vld1q_u64(dp4 + 0), vld1q_u64(dp5 + 0));
302 v1 = eor3(v1, vld1q_u64(dp4 + 2), vld1q_u64(dp5 + 2));
303 v2 = eor3(v2, vld1q_u64(dp4 + 4), vld1q_u64(dp5 + 4));
304 v3 = eor3(v3, vld1q_u64(dp4 + 6), vld1q_u64(dp5 + 6));
307 vst1q_u64(dp1 + 0, v0);
308 vst1q_u64(dp1 + 2, v1);
309 vst1q_u64(dp1 + 4, v2);
310 vst1q_u64(dp1 + 6, v3);
317 } while (--lines > 0);
320 static int __init xor_neon_init(void)
322 if (IS_ENABLED(CONFIG_AS_HAS_SHA3) && cpu_have_named_feature(SHA3)) {
323 xor_block_inner_neon.do_3 = xor_arm64_eor3_3;
324 xor_block_inner_neon.do_4 = xor_arm64_eor3_4;
325 xor_block_inner_neon.do_5 = xor_arm64_eor3_5;
329 module_init(xor_neon_init);
331 static void __exit xor_neon_exit(void)
334 module_exit(xor_neon_exit);
336 MODULE_AUTHOR("Jackie Liu <liuyun01@kylinos.cn>");
337 MODULE_DESCRIPTION("ARMv8 XOR Extensions");
338 MODULE_LICENSE("GPL");