/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2022 Michael T. Kloos <michael@michaelkloos.com>
 */

#include <linux/linkage.h>
#include <asm/asm.h>
SYM_FUNC_START(__memmove)
SYM_FUNC_START_WEAK(memmove)
	 *   a0 - Inclusive first byte of dest
	 *   a1 - Inclusive first byte of src
	 *   a2 - Length of copy n
	 *
	 * Because the return matches the parameter register a0,
	 * we will not clobber or modify that register.
	 *
	 * Note: This currently only works on little-endian.
	 * To port to big-endian, reverse the direction of shifts
	 * in the 2 misaligned fixup copy loops.
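	 *
	 * For reference, the C-level contract implemented here is the
	 * standard one (a sketch, not part of the original comment):
	 *
	 *	void *memmove(void *dest, const void *src, size_t n);
	 *
	 * i.e. an overlap-safe copy that returns dest.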
	/* Return if nothing to do */
	beq  a0, a1, return_from_memmove
	beqz a2, return_from_memmove
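	/*
	 * For example, memmove(buf, buf, 64) and memmove(dst, src, 0)
	 * both take these early exits and return dest (a0) untouched.
	 */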
	 *      Forward Copy: a1 - Index counter of src
	 *      Reverse Copy: a4 - Index counter of src
	 *      Forward Copy: t3 - Index counter of dest
	 *      Reverse Copy: t4 - Index counter of dest
	 *   Both Copy Modes: t5 - Inclusive first multibyte/aligned of dest
	 *   Both Copy Modes: t6 - Non-inclusive last multibyte/aligned of dest
	 *   Both Copy Modes: t0 - Link / Temporary for load-store
	 *   Both Copy Modes: t1 - Temporary for load-store
	 *   Both Copy Modes: t2 - Temporary for load-store
	 *   Both Copy Modes: a5 - dest to src alignment offset
	 *   Both Copy Modes: a6 - Shift amount
	 *   Both Copy Modes: a7 - Inverse shift amount
	 *   Both Copy Modes: a2 - Alternate breakpoint for unrolled loops
	 * Solve for some register values now.
	 * Byte copy does not need t5 or t6.
	 * Byte copy if copying less than (2 * SZREG) bytes. This can
	 * cause problems with the bulk copy implementation and is
	 * small enough that a plain byte copy costs little anyway.
	andi t0, a2, -(2 * SZREG)
	beqz t0, byte_copy
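	/*
	 * The masking above leaves t0 == 0 exactly when n < (2 * SZREG),
	 * i.e. n < 16 on RV64 (SZREG == 8) or n < 8 on RV32 (SZREG == 4),
	 * which is what sends small copies down the byte-copy path.
	 */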
	 * Now solve for t5 and t6.
	 * If dest (register t3), rounded down to the nearest naturally
	 * aligned SZREG address, does not equal dest, then add SZREG
	 * to find the low bound of SZREG alignment in the dest memory
	 * region. Note that this could overshoot the dest memory
	 * region if n is less than SZREG. This is one reason why
	 * we always byte copy if n is less than (2 * SZREG).
	 * Otherwise, dest is already naturally aligned to SZREG.
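	 *
	 * Example (RV64, SZREG == 8): dest == 0x1003 rounds down to
	 * 0x1000 != dest, so t5 becomes 0x1008, the first naturally
	 * aligned byte inside the dest region.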
	 * If the dest and src are co-aligned to SZREG, then there is
	 * no need for the full rigmarole of a misaligned fixup copy.
	 * Instead, do a simpler co-aligned copy.
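	 *
	 * For example, dest == 0x1003 and src == 0x2003 are co-aligned
	 * (same offset within an SZREG word), while dest == 0x1003 and
	 * src == 0x2004 are not and need the misaligned fixup copy.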
	xor  t0, a0, a1
	andi t1, t0, (SZREG - 1)
	beqz t1, coaligned_copy
	/* Fall through to misaligned fixup copy */
misaligned_fixup_copy:
	bltu a1, a0, misaligned_fixup_copy_reverse
misaligned_fixup_copy_forward:
	jal  t0, byte_copy_until_aligned_forward
	andi a5, a1, (SZREG - 1) /* Find the alignment offset of src (a1) */
	slli a6, a5, 3 /* Multiply by 8 to convert that to bits to shift */
	sub  a5, a1, t3 /* Find the difference between src and dest */
	andi a1, a1, -SZREG /* Align the src pointer */
	addi a2, t6, SZREG /* The other breakpoint for the unrolled loop */
	 * Compute The Inverse Shift
	 * a7 = XLEN - a6 = XLEN + -a6
	 * 2's complement negation to find the negative: -a6 = ~a6 + 1
	 * Add that to XLEN. XLEN = SZREG * 8.
	 */
	not  a7, a6
	addi a7, a7, (SZREG * 8 + 1)
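	/*
	 * Example (RV64): if src sits 3 bytes past an SZREG boundary,
	 * then a5 == 3, a6 == 24 and a7 == 64 - 24 == 40, so each
	 * destination word combines the upper 5 bytes of one aligned
	 * source word with the lower 3 bytes of the next (little-endian).
	 */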
	 * Fix Misalignment Copy Loop - Forward
	 * load_val0 = load_ptr[0];
	 * do {
	 *	load_val1 = load_ptr[1];
	 *	store_ptr += 2;
	 *	store_ptr[0 - 2] = (load_val0 >> {a6}) | (load_val1 << {a7});
	 *
	 *	if (store_ptr == {a2})
	 *		break;
	 *
	 *	load_val0 = load_ptr[2];
	 *	load_ptr += 2;
	 *	store_ptr[1 - 2] = (load_val1 >> {a6}) | (load_val0 << {a7});
	 * } while (store_ptr != store_ptr_end);
	 * store_ptr = store_ptr_end;
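	 *
	 * Note: the loop is unrolled by two so each aligned source word
	 * is loaded only once, serving as the high half of one stored
	 * word and the low half of the next.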
	REG_L t0, (0 * SZREG)(a1)
	1:
	REG_L t1, (1 * SZREG)(a1)
	addi  t3, t3, (2 * SZREG)
	srl   t0, t0, a6
	sll   t2, t1, a7
	or    t2, t0, t2
	REG_S t2, ((0 * SZREG) - (2 * SZREG))(t3)
	beq   t3, a2, 2f
	REG_L t0, (2 * SZREG)(a1)
	addi  a1, a1, (2 * SZREG)
	srl   t1, t1, a6
	sll   t2, t0, a7
	or    t2, t1, t2
	REG_S t2, ((1 * SZREG) - (2 * SZREG))(t3)
	bne   t3, t6, 1b
	2:
	mv    t3, t6 /* Fix the dest pointer in case the loop was broken */
	add  a1, t3, a5 /* Restore the src pointer */
	j byte_copy_forward /* Copy any remaining bytes */
misaligned_fixup_copy_reverse:
	jal  t0, byte_copy_until_aligned_reverse
	andi a5, a4, (SZREG - 1) /* Find the alignment offset of src (a4) */
	slli a6, a5, 3 /* Multiply by 8 to convert that to bits to shift */
	sub  a5, a4, t4 /* Find the difference between src and dest */
	andi a4, a4, -SZREG /* Align the src pointer */
	addi a2, t5, -SZREG /* The other breakpoint for the unrolled loop */
	 * Compute The Inverse Shift
	 * a7 = XLEN - a6 = XLEN + -a6
	 * 2's complement negation to find the negative: -a6 = ~a6 + 1
	 * Add that to XLEN. XLEN = SZREG * 8.
	 */
	not  a7, a6
	addi a7, a7, (SZREG * 8 + 1)
	 * Fix Misalignment Copy Loop - Reverse
	 * load_val1 = load_ptr[0];
	 * do {
	 *	load_val0 = load_ptr[-1];
	 *	store_ptr -= 2;
	 *	store_ptr[1] = (load_val0 >> {a6}) | (load_val1 << {a7});
	 *
	 *	if (store_ptr == {a2})
	 *		break;
	 *
	 *	load_val1 = load_ptr[-2];
	 *	load_ptr -= 2;
	 *	store_ptr[0] = (load_val1 >> {a6}) | (load_val0 << {a7});
	 * } while (store_ptr != store_ptr_end);
	 * store_ptr = store_ptr_end;
	REG_L t1, ( 0 * SZREG)(a4)
	1:
	REG_L t0, (-1 * SZREG)(a4)
	addi  t4, t4, (-2 * SZREG)
	sll   t1, t1, a7
	srl   t2, t0, a6
	or    t2, t1, t2
	REG_S t2, ( 1 * SZREG)(t4)
	beq   t4, a2, 2f
	REG_L t1, (-2 * SZREG)(a4)
	addi  a4, a4, (-2 * SZREG)
	sll   t0, t0, a7
	srl   t2, t1, a6
	or    t2, t0, t2
	REG_S t2, ( 0 * SZREG)(t4)
	bne   t4, t5, 1b
	2:
	mv    t4, t5 /* Fix the dest pointer in case the loop was broken */
	add  a4, t4, a5 /* Restore the src pointer */
	j byte_copy_reverse /* Copy any remaining bytes */
 * Simple copy loops for SZREG co-aligned memory locations.
 * These also make calls to do byte copies for any unaligned
 * data at their terminations.
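 *
 * Roughly, the forward case amounts to (a sketch, not part of the
 * original source):
 *
 *	while (store_ptr != store_ptr_end)
 *		*store_ptr++ = *load_ptr++;	// one SZREG word per iteration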
coaligned_copy:
	bltu a1, a0, coaligned_copy_reverse
coaligned_copy_forward:
	jal t0, byte_copy_until_aligned_forward
	1:
	REG_L t1, ( 0 * SZREG)(a1)
	addi  a1, a1, SZREG
	addi  t3, t3, SZREG
	REG_S t1, (-1 * SZREG)(t3)
	bne   t3, t6, 1b
	j byte_copy_forward /* Copy any remaining bytes */
coaligned_copy_reverse:
	jal t0, byte_copy_until_aligned_reverse
	1:
	REG_L t1, (-1 * SZREG)(a4)
	addi  a4, a4, -SZREG
	addi  t4, t4, -SZREG
	REG_S t1, ( 0 * SZREG)(t4)
	bne   t4, t5, 1b
	j byte_copy_reverse /* Copy any remaining bytes */
 * These are basically sub-functions within the function. They
 * are used to byte copy until the dest pointer is in alignment,
 * at which point a bulk copy method can be used by the calling
 * code. These work on the same registers as the bulk copy
 * loops. Therefore, the register values can be picked up from
 * where they were left, and we avoid code duplication without
 * any overhead except the call-in and return jumps.
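 *
 * Note: the "call" is made with jal t0, ... and the return with
 * jalr zero, 0x0(t0), using t0 as the link register instead of ra,
 * so no stack frame is needed.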
byte_copy_until_aligned_forward:
	jalr zero, 0x0(t0) /* Return to multibyte copy loop */
byte_copy_until_aligned_reverse:
	jalr zero, 0x0(t0) /* Return to multibyte copy loop */
 * Simple byte copy loops.
 * These will byte copy until they reach the end of data to copy.
 * At that point, they will call to return from memmove.
byte_copy:
	bltu a1, a0, byte_copy_reverse
return_from_memmove:
	ret

SYM_FUNC_END(memmove)
SYM_FUNC_END(__memmove)