/* x86/include/asm/xor.h */
1 /* SPDX-License-Identifier: GPL-2.0-or-later */
2 #ifndef _ASM_X86_XOR_H
3 #define _ASM_X86_XOR_H
4
5 /*
6  * Optimized RAID-5 checksumming functions for SSE.
7  */
8
9 /*
10  * Cache avoiding checksumming functions utilizing KNI instructions
11  * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
12  */
13
14 /*
15  * Based on
16  * High-speed RAID5 checksumming functions utilizing SSE instructions.
17  * Copyright (C) 1998 Ingo Molnar.
18  */
19
20 /*
21  * x86-64 changes / gcc fixes from Andi Kleen.
22  * Copyright 2002 Andi Kleen, SuSE Labs.
23  *
24  * This hasn't been optimized for the hammer yet, but there are likely
25  * no advantages to be gotten from x86-64 here anyways.
26  */
27
28 #include <asm/fpu/api.h>
29
#ifdef CONFIG_X86_32
/*
 * reduce register pressure: on 32-bit, pass the per-iteration stripe
 * increment (256) as an immediate ("i") so it does not occupy one of
 * the few GP registers.
 */
# define XOR_CONSTANT_CONSTRAINT "i"
#else
/* 64-bit has registers to spare: register or sign-extended 32-bit imm. */
# define XOR_CONSTANT_CONSTRAINT "re"
#endif
36
/* Byte offset (as an asm string) of 16-byte chunk x in the current stripe. */
#define OFFS(x)		"16*("#x")"
/* Offset of chunk x one full 256-byte stripe ahead -- the prefetch target. */
#define PF_OFFS(x)	"256+16*("#x")"
/* Non-temporal prefetch of the next stripe's chunk x from stream p1. */
#define PF0(x)		"	prefetchnta "PF_OFFS(x)"(%[p1])		;\n"
/* Aligned 16-byte load of chunk x of p1 into register xmm<y>. */
#define LD(x, y)	"	movaps "OFFS(x)"(%[p1]), %%xmm"#y"	;\n"
/* Aligned 16-byte store of register xmm<y> back to chunk x of p1. */
#define ST(x, y)	"	movaps %%xmm"#y", "OFFS(x)"(%[p1])	;\n"
/* Non-temporal prefetches of the next stripe's chunk x from p2..p5. */
#define PF1(x)		"	prefetchnta "PF_OFFS(x)"(%[p2])		;\n"
#define PF2(x)		"	prefetchnta "PF_OFFS(x)"(%[p3])		;\n"
#define PF3(x)		"	prefetchnta "PF_OFFS(x)"(%[p4])		;\n"
#define PF4(x)		"	prefetchnta "PF_OFFS(x)"(%[p5])		;\n"
/* XOR chunk x of p2..p5 into register xmm<y>. */
#define XO1(x, y)	"	xorps "OFFS(x)"(%[p2]), %%xmm"#y"	;\n"
#define XO2(x, y)	"	xorps "OFFS(x)"(%[p3]), %%xmm"#y"	;\n"
#define XO3(x, y)	"	xorps "OFFS(x)"(%[p4]), %%xmm"#y"	;\n"
#define XO4(x, y)	"	xorps "OFFS(x)"(%[p5]), %%xmm"#y"	;\n"
/* Expands to nothing: lets BLK64 skip the prefetch slot (see NOP use below). */
#define NOP(x)

/*
 * BLK64(pf, op, i): issue one prefetch for the 64-byte group starting at
 * chunk i, then apply op to the four 16-byte chunks i..i+3 (xmm0-xmm3).
 */
#define BLK64(pf, op, i)				\
		pf(i)					\
		op(i, 0)				\
			op(i + 1, 1)			\
				op(i + 2, 2)		\
					op(i + 3, 3)
58
/*
 * p1 ^= p2 over 'bytes' bytes using SSE movaps/xorps through xmm0-xmm3.
 *
 * Each loop iteration processes one 256-byte stripe as 16 aligned 16-byte
 * chunks while non-temporally prefetching the stripe 256 bytes ahead of
 * both streams.
 *
 * NOTE(review): movaps faults on unaligned data and the loop count is
 * bytes >> 8, so callers are assumed to pass 16-byte-aligned buffers whose
 * length is a multiple of 256 -- confirm against callers.
 */
static void
xor_sse_2(unsigned long bytes, unsigned long * __restrict p1,
	  const unsigned long * __restrict p2)
{
	unsigned long lines = bytes >> 8;	/* number of 256-byte stripes */

	/* The XMM registers are clobbered below; enter kernel FPU context. */
	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
/*
 * BLOCK(i): load chunks i..i+3 of p1 into xmm0-3, XOR in the matching
 * chunks of p2, store back into p1.  Prefetches for both streams are
 * interleaved with the loads/xors to overlap memory latency.
 */
#define BLOCK(i)					\
		LD(i, 0)				\
			LD(i + 1, 1)			\
		PF1(i)					\
				PF1(i + 2)		\
				LD(i + 2, 2)		\
					LD(i + 3, 3)	\
		PF0(i + 4)				\
				PF0(i + 6)		\
		XO1(i, 0)				\
			XO1(i + 1, 1)			\
				XO1(i + 2, 2)		\
					XO1(i + 3, 3)	\
		ST(i, 0)				\
			ST(i + 1, 1)			\
				ST(i + 2, 2)		\
					ST(i + 3, 3)	\


	/* Prime the prefetcher for the first stripe of p1. */
		PF0(0)
				PF0(2)

	" .align 32			;\n"
	" 1:                            ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	/* Advance both pointers one stripe; loop until all stripes done. */
	"       add %[inc], %[p1]       ;\n"
	"       add %[inc], %[p2]       ;\n"
	"       dec %[cnt]              ;\n"
	"       jnz 1b                  ;\n"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}
110
/*
 * Same operation as xor_sse_2() (p1 ^= p2, 256 bytes per iteration), but
 * with BLK64 prefetch scheduling: one prefetchnta per 64-byte group,
 * issued immediately before the group's four loads/xors, one full
 * 256-byte stripe ahead.  Alignment/size assumptions as xor_sse_2().
 */
static void
xor_sse_2_pf64(unsigned long bytes, unsigned long * __restrict p1,
	       const unsigned long * __restrict p2)
{
	unsigned long lines = bytes >> 8;	/* number of 256-byte stripes */

	/* The XMM registers are clobbered below; enter kernel FPU context. */
	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
/* BLOCK(i): 64 bytes -- prefetch+load p1, prefetch+xor p2, store to p1. */
#define BLOCK(i)			\
		BLK64(PF0, LD, i)	\
		BLK64(PF1, XO1, i)	\
		BLK64(NOP, ST, i)	\

	" .align 32			;\n"
	" 1:                            ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	/* Advance both pointers one stripe; loop until all stripes done. */
	"       add %[inc], %[p1]       ;\n"
	"       add %[inc], %[p2]       ;\n"
	"       dec %[cnt]              ;\n"
	"       jnz 1b                  ;\n"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}
145
/*
 * p1 ^= p2 ^ p3 over 'bytes' bytes using SSE movaps/xorps (xmm0-xmm3),
 * 256 bytes per loop iteration, prefetching all three streams one stripe
 * ahead.  Alignment/size assumptions as xor_sse_2().
 */
static void
xor_sse_3(unsigned long bytes, unsigned long * __restrict p1,
	  const unsigned long * __restrict p2,
	  const unsigned long * __restrict p3)
{
	unsigned long lines = bytes >> 8;	/* number of 256-byte stripes */

	/* The XMM registers are clobbered below; enter kernel FPU context. */
	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
/*
 * BLOCK(i): load chunks i..i+3 of p1, XOR in p2 then p3, store back to
 * p1; prefetches for all three streams interleaved with the arithmetic.
 */
#define BLOCK(i) \
		PF1(i)					\
				PF1(i + 2)		\
		LD(i, 0)				\
			LD(i + 1, 1)			\
				LD(i + 2, 2)		\
					LD(i + 3, 3)	\
		PF2(i)					\
				PF2(i + 2)		\
		PF0(i + 4)				\
				PF0(i + 6)		\
		XO1(i, 0)				\
			XO1(i + 1, 1)			\
				XO1(i + 2, 2)		\
					XO1(i + 3, 3)	\
		XO2(i, 0)				\
			XO2(i + 1, 1)			\
				XO2(i + 2, 2)		\
					XO2(i + 3, 3)	\
		ST(i, 0)				\
			ST(i + 1, 1)			\
				ST(i + 2, 2)		\
					ST(i + 3, 3)	\


	/* Prime the prefetcher for the first stripe of p1. */
		PF0(0)
				PF0(2)

	" .align 32			;\n"
	" 1:                            ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	/* Advance all three pointers one stripe; loop until done. */
	"       add %[inc], %[p1]       ;\n"
	"       add %[inc], %[p2]       ;\n"
	"       add %[inc], %[p3]       ;\n"
	"       dec %[cnt]              ;\n"
	"       jnz 1b                  ;\n"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}
205
/*
 * Same operation as xor_sse_3() (p1 ^= p2 ^ p3), but with BLK64 prefetch
 * scheduling: one prefetchnta per 64-byte group right before that
 * group's loads/xors.  Alignment/size assumptions as xor_sse_2().
 */
static void
xor_sse_3_pf64(unsigned long bytes, unsigned long * __restrict p1,
	       const unsigned long * __restrict p2,
	       const unsigned long * __restrict p3)
{
	unsigned long lines = bytes >> 8;	/* number of 256-byte stripes */

	/* The XMM registers are clobbered below; enter kernel FPU context. */
	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
/* BLOCK(i): 64 bytes -- load p1, xor in p2 and p3, store back to p1. */
#define BLOCK(i)			\
		BLK64(PF0, LD, i)	\
		BLK64(PF1, XO1, i)	\
		BLK64(PF2, XO2, i)	\
		BLK64(NOP, ST, i)	\

	" .align 32			;\n"
	" 1:                            ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	/* Advance all three pointers one stripe; loop until done. */
	"       add %[inc], %[p1]       ;\n"
	"       add %[inc], %[p2]       ;\n"
	"       add %[inc], %[p3]       ;\n"
	"       dec %[cnt]              ;\n"
	"       jnz 1b                  ;\n"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}
243
/*
 * p1 ^= p2 ^ p3 ^ p4 over 'bytes' bytes using SSE movaps/xorps
 * (xmm0-xmm3), 256 bytes per loop iteration, prefetching all four
 * streams one stripe ahead.  Alignment/size assumptions as xor_sse_2().
 */
static void
xor_sse_4(unsigned long bytes, unsigned long * __restrict p1,
	  const unsigned long * __restrict p2,
	  const unsigned long * __restrict p3,
	  const unsigned long * __restrict p4)
{
	unsigned long lines = bytes >> 8;	/* number of 256-byte stripes */

	/* The XMM registers are clobbered below; enter kernel FPU context. */
	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
/*
 * BLOCK(i): load chunks i..i+3 of p1, XOR in p2, p3, p4 in turn, store
 * back to p1; prefetches for all four streams interleaved throughout.
 */
#define BLOCK(i) \
		PF1(i)					\
				PF1(i + 2)		\
		LD(i, 0)				\
			LD(i + 1, 1)			\
				LD(i + 2, 2)		\
					LD(i + 3, 3)	\
		PF2(i)					\
				PF2(i + 2)		\
		XO1(i, 0)				\
			XO1(i + 1, 1)			\
				XO1(i + 2, 2)		\
					XO1(i + 3, 3)	\
		PF3(i)					\
				PF3(i + 2)		\
		PF0(i + 4)				\
				PF0(i + 6)		\
		XO2(i, 0)				\
			XO2(i + 1, 1)			\
				XO2(i + 2, 2)		\
					XO2(i + 3, 3)	\
		XO3(i, 0)				\
			XO3(i + 1, 1)			\
				XO3(i + 2, 2)		\
					XO3(i + 3, 3)	\
		ST(i, 0)				\
			ST(i + 1, 1)			\
				ST(i + 2, 2)		\
					ST(i + 3, 3)	\


	/* Prime the prefetcher for the first stripe of p1. */
		PF0(0)
				PF0(2)

	" .align 32			;\n"
	" 1:                            ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	/* Advance all four pointers one stripe; loop until done. */
	"       add %[inc], %[p1]       ;\n"
	"       add %[inc], %[p2]       ;\n"
	"       add %[inc], %[p3]       ;\n"
	"       add %[inc], %[p4]       ;\n"
	"       dec %[cnt]              ;\n"
	"       jnz 1b                  ;\n"
	: [cnt] "+r" (lines), [p1] "+r" (p1),
	  [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}
311
/*
 * Same operation as xor_sse_4() (p1 ^= p2 ^ p3 ^ p4), but with BLK64
 * prefetch scheduling: one prefetchnta per 64-byte group right before
 * that group's loads/xors.  Alignment/size assumptions as xor_sse_2().
 */
static void
xor_sse_4_pf64(unsigned long bytes, unsigned long * __restrict p1,
	       const unsigned long * __restrict p2,
	       const unsigned long * __restrict p3,
	       const unsigned long * __restrict p4)
{
	unsigned long lines = bytes >> 8;	/* number of 256-byte stripes */

	/* The XMM registers are clobbered below; enter kernel FPU context. */
	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
/* BLOCK(i): 64 bytes -- load p1, xor in p2..p4, store back to p1. */
#define BLOCK(i)			\
		BLK64(PF0, LD, i)	\
		BLK64(PF1, XO1, i)	\
		BLK64(PF2, XO2, i)	\
		BLK64(PF3, XO3, i)	\
		BLK64(NOP, ST, i)	\

	" .align 32			;\n"
	" 1:                            ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	/* Advance all four pointers one stripe; loop until done. */
	"       add %[inc], %[p1]       ;\n"
	"       add %[inc], %[p2]       ;\n"
	"       add %[inc], %[p3]       ;\n"
	"       add %[inc], %[p4]       ;\n"
	"       dec %[cnt]              ;\n"
	"       jnz 1b                  ;\n"
	: [cnt] "+r" (lines), [p1] "+r" (p1),
	  [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}
352
/*
 * p1 ^= p2 ^ p3 ^ p4 ^ p5 over 'bytes' bytes using SSE movaps/xorps
 * (xmm0-xmm3), 256 bytes per loop iteration, prefetching all five
 * streams one stripe ahead.  Alignment/size assumptions as xor_sse_2().
 */
static void
xor_sse_5(unsigned long bytes, unsigned long * __restrict p1,
	  const unsigned long * __restrict p2,
	  const unsigned long * __restrict p3,
	  const unsigned long * __restrict p4,
	  const unsigned long * __restrict p5)
{
	unsigned long lines = bytes >> 8;	/* number of 256-byte stripes */

	/* The XMM registers are clobbered below; enter kernel FPU context. */
	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
/*
 * BLOCK(i): load chunks i..i+3 of p1, XOR in p2..p5 in turn, store back
 * to p1; prefetches for all five streams interleaved throughout.
 */
#define BLOCK(i) \
		PF1(i)					\
				PF1(i + 2)		\
		LD(i, 0)				\
			LD(i + 1, 1)			\
				LD(i + 2, 2)		\
					LD(i + 3, 3)	\
		PF2(i)					\
				PF2(i + 2)		\
		XO1(i, 0)				\
			XO1(i + 1, 1)			\
				XO1(i + 2, 2)		\
					XO1(i + 3, 3)	\
		PF3(i)					\
				PF3(i + 2)		\
		XO2(i, 0)				\
			XO2(i + 1, 1)			\
				XO2(i + 2, 2)		\
					XO2(i + 3, 3)	\
		PF4(i)					\
				PF4(i + 2)		\
		PF0(i + 4)				\
				PF0(i + 6)		\
		XO3(i, 0)				\
			XO3(i + 1, 1)			\
				XO3(i + 2, 2)		\
					XO3(i + 3, 3)	\
		XO4(i, 0)				\
			XO4(i + 1, 1)			\
				XO4(i + 2, 2)		\
					XO4(i + 3, 3)	\
		ST(i, 0)				\
			ST(i + 1, 1)			\
				ST(i + 2, 2)		\
					ST(i + 3, 3)	\


	/* Prime the prefetcher for the first stripe of p1. */
		PF0(0)
				PF0(2)

	" .align 32			;\n"
	" 1:                            ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	/* Advance all five pointers one stripe; loop until done. */
	"       add %[inc], %[p1]       ;\n"
	"       add %[inc], %[p2]       ;\n"
	"       add %[inc], %[p3]       ;\n"
	"       add %[inc], %[p4]       ;\n"
	"       add %[inc], %[p5]       ;\n"
	"       dec %[cnt]              ;\n"
	"       jnz 1b                  ;\n"
	: [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
	  [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}
428
/*
 * Same operation as xor_sse_5() (p1 ^= p2 ^ p3 ^ p4 ^ p5), but with
 * BLK64 prefetch scheduling: one prefetchnta per 64-byte group right
 * before that group's loads/xors.  Alignment/size assumptions as
 * xor_sse_2().
 */
static void
xor_sse_5_pf64(unsigned long bytes, unsigned long * __restrict p1,
	       const unsigned long * __restrict p2,
	       const unsigned long * __restrict p3,
	       const unsigned long * __restrict p4,
	       const unsigned long * __restrict p5)
{
	unsigned long lines = bytes >> 8;	/* number of 256-byte stripes */

	/* The XMM registers are clobbered below; enter kernel FPU context. */
	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
/* BLOCK(i): 64 bytes -- load p1, xor in p2..p5, store back to p1. */
#define BLOCK(i)			\
		BLK64(PF0, LD, i)	\
		BLK64(PF1, XO1, i)	\
		BLK64(PF2, XO2, i)	\
		BLK64(PF3, XO3, i)	\
		BLK64(PF4, XO4, i)	\
		BLK64(NOP, ST, i)	\

	" .align 32			;\n"
	" 1:                            ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	/* Advance all five pointers one stripe; loop until done. */
	"       add %[inc], %[p1]       ;\n"
	"       add %[inc], %[p2]       ;\n"
	"       add %[inc], %[p3]       ;\n"
	"       add %[inc], %[p4]       ;\n"
	"       add %[inc], %[p5]       ;\n"
	"       dec %[cnt]              ;\n"
	"       jnz 1b                  ;\n"
	: [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
	  [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}
472
/*
 * Template exposing the BLK64-scheduled (one prefetch per 64-byte group)
 * SSE variants above for the xor_blocks benchmark/selection machinery.
 */
static struct xor_block_template xor_block_sse_pf64 = {
	.name = "prefetch64-sse",
	.do_2 = xor_sse_2_pf64,
	.do_3 = xor_sse_3_pf64,
	.do_4 = xor_sse_4_pf64,
	.do_5 = xor_sse_5_pf64,
};
480
/* Drop the asm helper macros so the headers included below can't collide. */
#undef LD
#undef XO1
#undef XO2
#undef XO3
#undef XO4
#undef ST
#undef NOP
#undef BLK64
#undef BLOCK

#undef XOR_CONSTANT_CONSTRAINT

/* The per-arch headers supply the remaining templates (not visible here). */
#ifdef CONFIG_X86_32
# include <asm/xor_32.h>
#else
# include <asm/xor_64.h>
#endif

/*
 * NOTE(review): AVX_SELECT comes from the headers included above;
 * presumably it substitutes an AVX template for FASTEST when AVX is
 * usable -- confirm in asm/xor_avx.h.
 */
#define XOR_SELECT_TEMPLATE(FASTEST) \
	AVX_SELECT(FASTEST)

#endif /* _ASM_X86_XOR_H */