arch/x86/include/asm/xor_avx.h

   1 /* SPDX-License-Identifier: GPL-2.0-only */
   2 #ifndef _ASM_X86_XOR_AVX_H
   3 #define _ASM_X86_XOR_AVX_H
   4
   5 /*
   6  * Optimized RAID-5 checksumming functions for AVX
   7  *
   8  * Copyright (C) 2012 Intel Corporation
   9  * Author: Jim Kukunas <james.t.kukunas@linux.intel.com>
  10  *
  11  * Based on Ingo Molnar and Zach Brown's respective MMX and SSE routines
  12  */
  13
  14 #ifdef CONFIG_AS_AVX
  15
  16 #include <linux/compiler.h>
  17 #include <asm/fpu/api.h>
  18
  19 #define BLOCK4(i) \
  20                 BLOCK(32 * i, 0) \
  21                 BLOCK(32 * (i + 1), 1) \
  22                 BLOCK(32 * (i + 2), 2) \
  23                 BLOCK(32 * (i + 3), 3)
  24
  25 #define BLOCK16() \
  26                 BLOCK4(0) \
  27                 BLOCK4(4) \
  28                 BLOCK4(8) \
  29                 BLOCK4(12)
  30
  31 static void xor_avx_2(unsigned long bytes, unsigned long *p0, unsigned long *p1)
  32 {
  33         unsigned long lines = bytes >> 9;
  34
  35         kernel_fpu_begin();
  36
  37         while (lines--) {
  38 #undef BLOCK
  39 #define BLOCK(i, reg) \
  40 do { \
  41         asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p1[i / sizeof(*p1)])); \
  42         asm volatile("vxorps %0, %%ymm" #reg ", %%ymm"  #reg : : \
  43                 "m" (p0[i / sizeof(*p0)])); \
  44         asm volatile("vmovdqa %%ymm" #reg ", %0" : \
  45                 "=m" (p0[i / sizeof(*p0)])); \
  46 } while (0);
  47
  48                 BLOCK16()
  49
  50                 p0 = (unsigned long *)((uintptr_t)p0 + 512);
  51                 p1 = (unsigned long *)((uintptr_t)p1 + 512);
  52         }
  53
  54         kernel_fpu_end();
  55 }
  56
  57 static void xor_avx_3(unsigned long bytes, unsigned long *p0, unsigned long *p1,
  58         unsigned long *p2)
  59 {
  60         unsigned long lines = bytes >> 9;
  61
  62         kernel_fpu_begin();
  63
  64         while (lines--) {
  65 #undef BLOCK
  66 #define BLOCK(i, reg) \
  67 do { \
  68         asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p2[i / sizeof(*p2)])); \
  69         asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
  70                 "m" (p1[i / sizeof(*p1)])); \
  71         asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
  72                 "m" (p0[i / sizeof(*p0)])); \
  73         asm volatile("vmovdqa %%ymm" #reg ", %0" : \
  74                 "=m" (p0[i / sizeof(*p0)])); \
  75 } while (0);
  76
  77                 BLOCK16()
  78
  79                 p0 = (unsigned long *)((uintptr_t)p0 + 512);
  80                 p1 = (unsigned long *)((uintptr_t)p1 + 512);
  81                 p2 = (unsigned long *)((uintptr_t)p2 + 512);
  82         }
  83
  84         kernel_fpu_end();
  85 }
  86
  87 static void xor_avx_4(unsigned long bytes, unsigned long *p0, unsigned long *p1,
  88         unsigned long *p2, unsigned long *p3)
  89 {
  90         unsigned long lines = bytes >> 9;
  91
  92         kernel_fpu_begin();
  93
  94         while (lines--) {
  95 #undef BLOCK
  96 #define BLOCK(i, reg) \
  97 do { \
  98         asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p3[i / sizeof(*p3)])); \
  99         asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
 100                 "m" (p2[i / sizeof(*p2)])); \
 101         asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
 102                 "m" (p1[i / sizeof(*p1)])); \
 103         asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
 104                 "m" (p0[i / sizeof(*p0)])); \
 105         asm volatile("vmovdqa %%ymm" #reg ", %0" : \
 106                 "=m" (p0[i / sizeof(*p0)])); \
 107 } while (0);
 108
 109                 BLOCK16();
 110
 111                 p0 = (unsigned long *)((uintptr_t)p0 + 512);
 112                 p1 = (unsigned long *)((uintptr_t)p1 + 512);
 113                 p2 = (unsigned long *)((uintptr_t)p2 + 512);
 114                 p3 = (unsigned long *)((uintptr_t)p3 + 512);
 115         }
 116
 117         kernel_fpu_end();
 118 }
 119
 120 static void xor_avx_5(unsigned long bytes, unsigned long *p0, unsigned long *p1,
 121         unsigned long *p2, unsigned long *p3, unsigned long *p4)
 122 {
 123         unsigned long lines = bytes >> 9;
 124
 125         kernel_fpu_begin();
 126
 127         while (lines--) {
 128 #undef BLOCK
 129 #define BLOCK(i, reg) \
 130 do { \
 131         asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p4[i / sizeof(*p4)])); \
 132         asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
 133                 "m" (p3[i / sizeof(*p3)])); \
 134         asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
 135                 "m" (p2[i / sizeof(*p2)])); \
 136         asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
 137                 "m" (p1[i / sizeof(*p1)])); \
 138         asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
 139                 "m" (p0[i / sizeof(*p0)])); \
 140         asm volatile("vmovdqa %%ymm" #reg ", %0" : \
 141                 "=m" (p0[i / sizeof(*p0)])); \
 142 } while (0);
 143
 144                 BLOCK16()
 145
 146                 p0 = (unsigned long *)((uintptr_t)p0 + 512);
 147                 p1 = (unsigned long *)((uintptr_t)p1 + 512);
 148                 p2 = (unsigned long *)((uintptr_t)p2 + 512);
 149                 p3 = (unsigned long *)((uintptr_t)p3 + 512);
 150                 p4 = (unsigned long *)((uintptr_t)p4 + 512);
 151         }
 152
 153         kernel_fpu_end();
 154 }
 155
 156 static struct xor_block_template xor_block_avx = {
 157         .name = "avx",
 158         .do_2 = xor_avx_2,
 159         .do_3 = xor_avx_3,
 160         .do_4 = xor_avx_4,
 161         .do_5 = xor_avx_5,
 162 };
 163
 164 #define AVX_XOR_SPEED \
 165 do { \
 166         if (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_OSXSAVE)) \
 167                 xor_speed(&xor_block_avx); \
 168 } while (0)
 169
 170 #define AVX_SELECT(FASTEST) \
 171         (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_OSXSAVE) ? &xor_block_avx : FASTEST)
 172
 173 #else
 174
 175 #define AVX_XOR_SPEED {}
 176
 177 #define AVX_SELECT(FASTEST) (FASTEST)
 178
 179 #endif
 180 #endif