/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2022 Michael T. Kloos <michael@michaelkloos.com>
 */

#include <linux/linkage.h>
#include <asm/asm.h>
SYM_FUNC_START(__memmove)
SYM_FUNC_START_WEAK(memmove)
	 *   a0 - Inclusive first byte of dest
	 *   a1 - Inclusive first byte of src
	 *   a2 - Length of copy n
	 *
	 * Because the return matches the parameter register a0,
	 * we will not clobber or modify that register.
	 *
	 * Note: This currently only works on little-endian.
	 * To port to big-endian, reverse the direction of shifts
	 * in the 2 misaligned fixup copy loops.
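	 *
	 * For reference, the C-level contract implemented here is the
	 * standard one (a sketch, not part of the original comment):
	 *
	 *	void *memmove(void *dest, const void *src, size_t n);
	 *
	 * i.e. an overlap-safe copy that returns dest.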
	/* Return if nothing to do */
	beq  a0, a1, return_from_memmove
	beqz a2, return_from_memmove
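	/*
	 * For example, memmove(buf, buf, 64) and memmove(dst, src, 0)
	 * both take these early exits and return dest (a0) untouched.
	 */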
	 *      Forward Copy: a1 - Index counter of src
	 *      Reverse Copy: a4 - Index counter of src
	 *      Forward Copy: t3 - Index counter of dest
	 *      Reverse Copy: t4 - Index counter of dest
	 *   Both Copy Modes: t5 - Inclusive first multibyte/aligned of dest
	 *   Both Copy Modes: t6 - Non-inclusive last multibyte/aligned of dest
	 *   Both Copy Modes: t0 - Link / Temporary for load-store
	 *   Both Copy Modes: t1 - Temporary for load-store
	 *   Both Copy Modes: t2 - Temporary for load-store
	 *   Both Copy Modes: a5 - dest to src alignment offset
	 *   Both Copy Modes: a6 - Shift amount
	 *   Both Copy Modes: a7 - Inverse shift amount
	 *   Both Copy Modes: a2 - Alternate breakpoint for unrolled loops
	 * Solve for some register values now.
	 * Byte copy does not need t5 or t6.
	 * Byte copy if copying less than (2 * SZREG) bytes. This can
	 * cause problems with the bulk copy implementation and is
	 * small enough that a plain byte copy costs little anyway.
	andi t0, a2, -(2 * SZREG)
	beqz t0, byte_copy
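	/*
	 * The masking above leaves t0 == 0 exactly when n < (2 * SZREG),
	 * i.e. n < 16 on RV64 (SZREG == 8) or n < 8 on RV32 (SZREG == 4),
	 * which is what sends small copies down the byte-copy path.
	 */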
	 * Now solve for t5 and t6.
	 * If dest (register t3), rounded down to the nearest naturally
	 * aligned SZREG address, does not equal dest, then add SZREG
	 * to find the low bound of SZREG alignment in the dest memory
	 * region. Note that this could overshoot the dest memory
	 * region if n is less than SZREG. This is one reason why
	 * we always byte copy if n is less than (2 * SZREG).
	 * Otherwise, dest is already naturally aligned to SZREG.
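	 *
	 * Example (RV64, SZREG == 8): dest == 0x1003 rounds down to
	 * 0x1000 != dest, so t5 becomes 0x1008, the first naturally
	 * aligned byte inside the dest region.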
	 * If the dest and src are co-aligned to SZREG, then there is
	 * no need for the full rigmarole of a misaligned fixup copy.
	 * Instead, do a simpler co-aligned copy.
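	 *
	 * For example, dest == 0x1003 and src == 0x2003 are co-aligned
	 * (same offset within an SZREG word), while dest == 0x1003 and
	 * src == 0x2004 are not and need the misaligned fixup copy.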
	xor  t0, a0, a1
	andi t1, t0, (SZREG - 1)
	beqz t1, coaligned_copy
	/* Fall through to misaligned fixup copy */
misaligned_fixup_copy:
	bltu a1, a0, misaligned_fixup_copy_reverse
misaligned_fixup_copy_forward:
	jal  t0, byte_copy_until_aligned_forward
	andi a5, a1, (SZREG - 1) /* Find the alignment offset of src (a1) */
	slli a6, a5, 3 /* Multiply by 8 to convert that to bits to shift */
	sub  a5, a1, t3 /* Find the difference between src and dest */
	andi a1, a1, -SZREG /* Align the src pointer */
	addi a2, t6, SZREG /* The other breakpoint for the unrolled loop */
	 * Compute The Inverse Shift
	 * a7 = XLEN - a6 = XLEN + -a6
	 * 2's complement negation to find the negative: -a6 = ~a6 + 1
	 * Add that to XLEN. XLEN = SZREG * 8.
	 */
	not  a7, a6
	addi a7, a7, (SZREG * 8 + 1)
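	/*
	 * Example (RV64): if src sits 3 bytes past an SZREG boundary,
	 * then a5 == 3, a6 == 24 and a7 == 64 - 24 == 40, so each
	 * destination word combines the upper 5 bytes of one aligned
	 * source word with the lower 3 bytes of the next (little-endian).
	 */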
	 * Fix Misalignment Copy Loop - Forward
	 * load_val0 = load_ptr[0];
	 * do {
	 *	load_val1 = load_ptr[1];
	 *	store_ptr += 2;
	 *	store_ptr[0 - 2] = (load_val0 >> {a6}) | (load_val1 << {a7});
	 *
	 *	if (store_ptr == {a2})
	 *		break;
	 *
	 *	load_val0 = load_ptr[2];
	 *	load_ptr += 2;
	 *	store_ptr[1 - 2] = (load_val1 >> {a6}) | (load_val0 << {a7});
	 * } while (store_ptr != store_ptr_end);
	 * store_ptr = store_ptr_end;
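	 *
	 * Note: the loop is unrolled by two so each aligned source word
	 * is loaded only once, serving as the high half of one stored
	 * word and the low half of the next.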
	REG_L t0, (0 * SZREG)(a1)
	1:
	REG_L t1, (1 * SZREG)(a1)
	addi  t3, t3, (2 * SZREG)
	srl   t0, t0, a6
	sll   t2, t1, a7
	or    t2, t0, t2
	REG_S t2, ((0 * SZREG) - (2 * SZREG))(t3)
	beq   t3, a2, 2f
	REG_L t0, (2 * SZREG)(a1)
	addi  a1, a1, (2 * SZREG)
	srl   t1, t1, a6
	sll   t2, t0, a7
	or    t2, t1, t2
	REG_S t2, ((1 * SZREG) - (2 * SZREG))(t3)
	bne   t3, t6, 1b
	2:
	mv    t3, t6 /* Fix the dest pointer in case the loop was broken */
	add  a1, t3, a5 /* Restore the src pointer */
	j byte_copy_forward /* Copy any remaining bytes */
misaligned_fixup_copy_reverse:
	jal  t0, byte_copy_until_aligned_reverse
	andi a5, a4, (SZREG - 1) /* Find the alignment offset of src (a4) */
	slli a6, a5, 3 /* Multiply by 8 to convert that to bits to shift */
	sub  a5, a4, t4 /* Find the difference between src and dest */
	andi a4, a4, -SZREG /* Align the src pointer */
	addi a2, t5, -SZREG /* The other breakpoint for the unrolled loop */
	 * Compute The Inverse Shift
	 * a7 = XLEN - a6 = XLEN + -a6
	 * 2's complement negation to find the negative: -a6 = ~a6 + 1
	 * Add that to XLEN. XLEN = SZREG * 8.
	 */
	not  a7, a6
	addi a7, a7, (SZREG * 8 + 1)
	 * Fix Misalignment Copy Loop - Reverse
	 * load_val1 = load_ptr[0];
	 * do {
	 *	load_val0 = load_ptr[-1];
	 *	store_ptr -= 2;
	 *	store_ptr[1] = (load_val0 >> {a6}) | (load_val1 << {a7});
	 *
	 *	if (store_ptr == {a2})
	 *		break;
	 *
	 *	load_val1 = load_ptr[-2];
	 *	load_ptr -= 2;
	 *	store_ptr[0] = (load_val1 >> {a6}) | (load_val0 << {a7});
	 * } while (store_ptr != store_ptr_end);
	 * store_ptr = store_ptr_end;
	REG_L t1, ( 0 * SZREG)(a4)
	1:
	REG_L t0, (-1 * SZREG)(a4)
	addi  t4, t4, (-2 * SZREG)
	sll   t1, t1, a7
	srl   t2, t0, a6
	or    t2, t1, t2
	REG_S t2, ( 1 * SZREG)(t4)
	beq   t4, a2, 2f
	REG_L t1, (-2 * SZREG)(a4)
	addi  a4, a4, (-2 * SZREG)
	sll   t0, t0, a7
	srl   t2, t1, a6
	or    t2, t0, t2
	REG_S t2, ( 0 * SZREG)(t4)
	bne   t4, t5, 1b
	2:
	mv    t4, t5 /* Fix the dest pointer in case the loop was broken */
	add  a4, t4, a5 /* Restore the src pointer */
	j byte_copy_reverse /* Copy any remaining bytes */
 * Simple copy loops for SZREG co-aligned memory locations.
 * These also make calls to do byte copies for any unaligned
 * data at their terminations.
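 *
 * Roughly, the forward case amounts to (a sketch, not part of the
 * original source):
 *
 *	while (store_ptr != store_ptr_end)
 *		*store_ptr++ = *load_ptr++;	// one SZREG word per iteration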
coaligned_copy:
	bltu a1, a0, coaligned_copy_reverse
coaligned_copy_forward:
	jal t0, byte_copy_until_aligned_forward
	1:
	REG_L t1, ( 0 * SZREG)(a1)
	addi  a1, a1, SZREG
	addi  t3, t3, SZREG
	REG_S t1, (-1 * SZREG)(t3)
	bne   t3, t6, 1b
	j byte_copy_forward /* Copy any remaining bytes */
coaligned_copy_reverse:
	jal t0, byte_copy_until_aligned_reverse
	1:
	REG_L t1, (-1 * SZREG)(a4)
	addi  a4, a4, -SZREG
	addi  t4, t4, -SZREG
	REG_S t1, ( 0 * SZREG)(t4)
	bne   t4, t5, 1b
	j byte_copy_reverse /* Copy any remaining bytes */
 * These are basically sub-functions within the function. They
 * are used to byte copy until the dest pointer is in alignment,
 * at which point a bulk copy method can be used by the calling
 * code. These work on the same registers as the bulk copy
 * loops. Therefore, the register values can be picked up from
 * where they were left, and we avoid code duplication without
 * any overhead except the call-in and return jumps.
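 *
 * Note: the "call" is made with jal t0, ... and the return with
 * jalr zero, 0x0(t0), using t0 as the link register instead of ra,
 * so no stack frame is needed.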
byte_copy_until_aligned_forward:
	jalr zero, 0x0(t0) /* Return to multibyte copy loop */
byte_copy_until_aligned_reverse:
	jalr zero, 0x0(t0) /* Return to multibyte copy loop */
 * Simple byte copy loops.
 * These will byte copy until they reach the end of data to copy.
 * At that point, they will call to return from memmove.
byte_copy:
	bltu a1, a0, byte_copy_reverse
return_from_memmove:
	ret

SYM_FUNC_END(memmove)
SYM_FUNC_END(__memmove)