x86/lib/memcpy_64.S

   1 /* SPDX-License-Identifier: GPL-2.0-only */
   2 /* Copyright 2002 Andi Kleen */
   3
   4 #include <linux/linkage.h>
   5 #include <linux/cfi_types.h>
   6 #include <asm/errno.h>
   7 #include <asm/cpufeatures.h>
   8 #include <asm/alternative.h>
   9 #include <asm/export.h>
  10
  11 .pushsection .noinstr.text, "ax"
  12
  13 /*
  14  * We build a jump to memcpy_orig by default, which gets NOPped out on
  15  * CPUs which set REP_GOOD (the majority of x86 CPUs). In addition, CPUs
  16  * which have the enhanced REP MOVSB/STOSB feature (ERMS) change those
  17  * NOPs to a jmp to memcpy_erms, which does the copy with REP MOVSB.
  18  */
  19
  20 /*
  21  * __memcpy - Copy a memory block; memcpy is defined as a weak alias of it.
  22  *
  23  * Input:
  24  *  rdi destination
  25  *  rsi source
  26  *  rdx count, in bytes
  27  *
  28  * Output:
  29  *  rax original destination
  30  */
  31 SYM_TYPED_FUNC_START(__memcpy)
  32         ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \
  33                       "jmp memcpy_erms", X86_FEATURE_ERMS
  34         /* Fall-through copy, reached when the jmp above is patched to NOPs. */
  35         movq %rdi, %rax         /* return value: original destination */
  36         movq %rdx, %rcx
  37         shrq $3, %rcx           /* rcx = count / 8: number of whole quadwords */
  38         andl $7, %edx           /* edx = count % 8: leftover bytes */
  39         rep movsq               /* copy rcx quadwords from (%rsi) to (%rdi) */
  40         movl %edx, %ecx
  41         rep movsb               /* copy the leftover bytes */
  42         RET
  43 SYM_FUNC_END(__memcpy)
  44 EXPORT_SYMBOL(__memcpy)
  45
  46 SYM_FUNC_ALIAS_WEAK(memcpy, __memcpy)   /* memcpy: weak alias of __memcpy */
  47 EXPORT_SYMBOL(memcpy)
  48
  49 /*
  50  * memcpy_erms() - enhanced fast string memcpy. This is faster and
  51  * simpler than memcpy. Use memcpy_erms when possible.
  52  */
  53 SYM_FUNC_START_LOCAL(memcpy_erms)
  54         movq %rdi, %rax         /* return value: original destination */
  55         movq %rdx, %rcx         /* rcx = byte count consumed by REP */
  56         rep movsb               /* copy rcx bytes from (%rsi) to (%rdi) */
  57         RET
  58 SYM_FUNC_END(memcpy_erms)
  59
  60 SYM_FUNC_START_LOCAL(memcpy_orig)   /* in: rdi=dest, rsi=src, rdx=count; out: rax=dest */
  61         movq %rdi, %rax         /* return value: original destination */
  62         /* Clobbers rcx, rdx, rsi, rdi, r8-r11 and the flags. */
  63         cmpq $0x20, %rdx
  64         jb .Lhandle_tail        /* count < 32: only the tail code is needed */
  65
  66         /*
  67          * We check whether memory false dependence could occur,
  68          * then jump to corresponding copy mode.
  69          */
  70         cmp  %dil, %sil         /* compares only the low bytes of src and dest */
  71         jl .Lcopy_backward      /* signed: low byte of src < low byte of dest */
  72         subq $0x20, %rdx        /* bias count by -32 so the loop's subq borrows on the last block */
  73 .Lcopy_forward_loop:
  74         subq $0x20,     %rdx    /* sets CF for the jae below; movq/leaq leave flags alone */
  75
  76         /*
  77          * Move in blocks of 4x8 bytes:
  78          */
  79         movq 0*8(%rsi), %r8
  80         movq 1*8(%rsi), %r9
  81         movq 2*8(%rsi), %r10
  82         movq 3*8(%rsi), %r11
  83         leaq 4*8(%rsi), %rsi    /* src += 32 without touching flags */
  84
  85         movq %r8,       0*8(%rdi)
  86         movq %r9,       1*8(%rdi)
  87         movq %r10,      2*8(%rdi)
  88         movq %r11,      3*8(%rdi)
  89         leaq 4*8(%rdi), %rdi    /* dest += 32 without touching flags */
  90         jae  .Lcopy_forward_loop        /* no borrow: at least 32 more bytes remain */
  91         addl $0x20,     %edx    /* undo the bias: edx = bytes left, 0..31 */
  92         jmp  .Lhandle_tail
  93
  94 .Lcopy_backward:
  95         /*
  96          * Calculate copy position to tail.
  97          */
  98         addq %rdx,      %rsi    /* rsi = one past the end of the source */
  99         addq %rdx,      %rdi    /* rdi = one past the end of the destination */
 100         subq $0x20,     %rdx    /* same -32 bias as in the forward path */
 101         /*
 102          * At most 3 ALU operations in one cycle,
 103          * so append NOPs in the same 16-byte chunk.
 104          */
 105         .p2align 4              /* align the loop head to 16 bytes */
 106 .Lcopy_backward_loop:
 107         subq $0x20,     %rdx    /* sets CF for the jae below; movq/leaq leave flags alone */
 108         movq -1*8(%rsi),        %r8
 109         movq -2*8(%rsi),        %r9
 110         movq -3*8(%rsi),        %r10
 111         movq -4*8(%rsi),        %r11
 112         leaq -4*8(%rsi),        %rsi    /* src -= 32 without touching flags */
 113         movq %r8,               -1*8(%rdi)
 114         movq %r9,               -2*8(%rdi)
 115         movq %r10,              -3*8(%rdi)
 116         movq %r11,              -4*8(%rdi)
 117         leaq -4*8(%rdi),        %rdi    /* dest -= 32 without touching flags */
 118         jae  .Lcopy_backward_loop       /* no borrow: at least 32 more bytes remain */
 119
 120         /*
 121          * Calculate copy position to head.
 122          */
 123         addl $0x20,     %edx    /* undo the bias: edx = uncopied bytes at the head, 0..31 */
 124         subq %rdx,      %rsi    /* rsi back to the start of the source */
 125         subq %rdx,      %rdi    /* rdi back to the start of the destination */
 126 .Lhandle_tail:                  /* here edx = bytes still to copy, 0..31 */
 127         cmpl $16,       %edx
 128         jb   .Lless_16bytes
 129
 130         /*
 131          * Move data from 16 bytes to 31 bytes.
 132          */
 133         movq 0*8(%rsi), %r8
 134         movq 1*8(%rsi), %r9
 135         movq -2*8(%rsi, %rdx),  %r10    /* last 16 bytes; may overlap the first 16 */
 136         movq -1*8(%rsi, %rdx),  %r11
 137         movq %r8,       0*8(%rdi)       /* all loads are done before the first store */
 138         movq %r9,       1*8(%rdi)
 139         movq %r10,      -2*8(%rdi, %rdx)
 140         movq %r11,      -1*8(%rdi, %rdx)
 141         RET
 142         .p2align 4
 143 .Lless_16bytes:
 144         cmpl $8,        %edx
 145         jb   .Lless_8bytes
 146         /*
 147          * Move data from 8 bytes to 15 bytes.
 148          */
 149         movq 0*8(%rsi), %r8
 150         movq -1*8(%rsi, %rdx),  %r9     /* last 8 bytes; may overlap the first 8 */
 151         movq %r8,       0*8(%rdi)
 152         movq %r9,       -1*8(%rdi, %rdx)
 153         RET
 154         .p2align 4
 155 .Lless_8bytes:
 156         cmpl $4,        %edx
 157         jb   .Lless_3bytes
 158
 159         /*
 160          * Move data from 4 bytes to 7 bytes.
 161          */
 162         movl (%rsi), %ecx
 163         movl -4(%rsi, %rdx), %r8d       /* last 4 bytes; may overlap the first 4 */
 164         movl %ecx, (%rdi)
 165         movl %r8d, -4(%rdi, %rdx)
 166         RET
 167         .p2align 4
 168 .Lless_3bytes:                  /* despite the name, entered with edx = 0..3 */
 169         subl $1, %edx           /* edx = count - 1; CF set if count was 0, ZF if it was 1 */
 170         jb .Lend                /* count == 0: nothing to copy */
 171         /*
 172          * Move data from 1 byte to 3 bytes.
 173          */
 174         movzbl (%rsi), %ecx     /* first byte; movzbl leaves the subl flags intact */
 175         jz .Lstore_1byte        /* count == 1: only the first byte */
 176         movzbq 1(%rsi), %r8     /* second byte */
 177         movzbq (%rsi, %rdx), %r9        /* last byte, at index count - 1 */
 178         movb %r8b, 1(%rdi)
 179         movb %r9b, (%rdi, %rdx)
 180 .Lstore_1byte:
 181         movb %cl, (%rdi)        /* first byte */
 182
 183 .Lend:
 184         RET
 185 SYM_FUNC_END(memcpy_orig)
 186
 187 .popsection