GNU Linux-libre 4.19.245-gnu1
arch/powerpc/lib/copyuser_power7.S
/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 *
 * Copyright (C) IBM Corporation, 2011
 *
 * Author: Anton Blanchard <anton@au.ibm.com>
 */
#include <asm/ppc_asm.h>

#ifndef SELFTEST_CASE
/* 0 == don't use VMX, 1 == use VMX */
#define SELFTEST_CASE   0
#endif
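
/*
 * Note: the powerpc copyloops selftests appear to build this file in
 * userspace with SELFTEST_CASE overridden, so that both the VMX and
 * non-VMX paths can be exercised regardless of the host CPU.
 */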

#ifdef __BIG_ENDIAN__
#define LVS(VRT,RA,RB)          lvsl    VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)  vperm   VRT,VRA,VRB,VRC
#else
#define LVS(VRT,RA,RB)          lvsr    VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)  vperm   VRT,VRB,VRA,VRC
#endif
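
/*
 * These macros hide the endian difference in the unaligned VMX path:
 * on little-endian the permute control vector comes from lvsr and the
 * two vperm data operands are swapped, so the same realignment loop
 * works for either endianness.
 */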

        .macro err1
100:
        EX_TABLE(100b,.Ldo_err1)
        .endm

        .macro err2
200:
        EX_TABLE(200b,.Ldo_err2)
        .endm
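
/*
 * Each errN macro tags the user access that follows it with an
 * exception table entry.  A fault sends us to the matching .Ldo_errN
 * fixup below, which restores whatever state that stage of the copy
 * had set up (VMX and/or saved non-volatile GPRs) before falling back
 * to __copy_tofrom_user_base.
 */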

#ifdef CONFIG_ALTIVEC
        .macro err3
300:
        EX_TABLE(300b,.Ldo_err3)
        .endm

        .macro err4
400:
        EX_TABLE(400b,.Ldo_err4)
        .endm


.Ldo_err4:
        ld      r16,STK_REG(R16)(r1)
        ld      r15,STK_REG(R15)(r1)
        ld      r14,STK_REG(R14)(r1)
.Ldo_err3:
        bl      exit_vmx_usercopy
        ld      r0,STACKFRAMESIZE+16(r1)
        mtlr    r0
        b       .Lexit
#endif /* CONFIG_ALTIVEC */

.Ldo_err2:
        ld      r22,STK_REG(R22)(r1)
        ld      r21,STK_REG(R21)(r1)
        ld      r20,STK_REG(R20)(r1)
        ld      r19,STK_REG(R19)(r1)
        ld      r18,STK_REG(R18)(r1)
        ld      r17,STK_REG(R17)(r1)
        ld      r16,STK_REG(R16)(r1)
        ld      r15,STK_REG(R15)(r1)
        ld      r14,STK_REG(R14)(r1)
.Lexit:
        addi    r1,r1,STACKFRAMESIZE
.Ldo_err1:
        ld      r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
        ld      r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
        ld      r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
        b       __copy_tofrom_user_base

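/*
 * Returns the number of bytes not copied (0 on success).  Copies of
 * fewer than 16 bytes go straight to .Lshort_copy; larger copies use
 * the doubleword loop below, and copies of more than 3328 bytes take
 * the VMX path when Altivec is available and usable.
 */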
_GLOBAL(__copy_tofrom_user_power7)
        cmpldi  r5,16
        cmpldi  cr1,r5,3328

        std     r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
        std     r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
        std     r5,-STACKFRAMESIZE+STK_REG(R29)(r1)

        blt     .Lshort_copy

#ifdef CONFIG_ALTIVEC
test_feature = SELFTEST_CASE
BEGIN_FTR_SECTION
        bgt     cr1,.Lvmx_copy
END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
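/*
 * The bgt above sits in a feature section: it is patched out at boot
 * on CPUs without CPU_FTR_ALTIVEC, so those CPUs always fall through
 * to the non-VMX copy.
 */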
#endif

.Lnonvmx_copy:
        /* Get the source 8B aligned */
        neg     r6,r4
        mtocrf  0x01,r6
        clrldi  r6,r6,(64-3)
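        /*
         * cr7 now holds the low bits of the negated source address, so
         * the bf tests below peel off 1, 2 and 4 byte copies as needed;
         * r6 keeps the total number of head bytes to subtract from the
         * length.
         */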

        bf      cr7*4+3,1f
err1;   lbz     r0,0(r4)
        addi    r4,r4,1
err1;   stb     r0,0(r3)
        addi    r3,r3,1

1:      bf      cr7*4+2,2f
err1;   lhz     r0,0(r4)
        addi    r4,r4,2
err1;   sth     r0,0(r3)
        addi    r3,r3,2

2:      bf      cr7*4+1,3f
err1;   lwz     r0,0(r4)
        addi    r4,r4,4
err1;   stw     r0,0(r3)
        addi    r3,r3,4

3:      sub     r5,r5,r6
        cmpldi  r5,128
        blt     5f

        mflr    r0
        stdu    r1,-STACKFRAMESIZE(r1)
        std     r14,STK_REG(R14)(r1)
        std     r15,STK_REG(R15)(r1)
        std     r16,STK_REG(R16)(r1)
        std     r17,STK_REG(R17)(r1)
        std     r18,STK_REG(R18)(r1)
        std     r19,STK_REG(R19)(r1)
        std     r20,STK_REG(R20)(r1)
        std     r21,STK_REG(R21)(r1)
        std     r22,STK_REG(R22)(r1)
        std     r0,STACKFRAMESIZE+16(r1)

        srdi    r6,r5,7
        mtctr   r6

        /* Now do cacheline (128B) sized loads and stores. */
        .align  5
4:
err2;   ld      r0,0(r4)
err2;   ld      r6,8(r4)
err2;   ld      r7,16(r4)
err2;   ld      r8,24(r4)
err2;   ld      r9,32(r4)
err2;   ld      r10,40(r4)
err2;   ld      r11,48(r4)
err2;   ld      r12,56(r4)
err2;   ld      r14,64(r4)
err2;   ld      r15,72(r4)
err2;   ld      r16,80(r4)
err2;   ld      r17,88(r4)
err2;   ld      r18,96(r4)
err2;   ld      r19,104(r4)
err2;   ld      r20,112(r4)
err2;   ld      r21,120(r4)
        addi    r4,r4,128
err2;   std     r0,0(r3)
err2;   std     r6,8(r3)
err2;   std     r7,16(r3)
err2;   std     r8,24(r3)
err2;   std     r9,32(r3)
err2;   std     r10,40(r3)
err2;   std     r11,48(r3)
err2;   std     r12,56(r3)
err2;   std     r14,64(r3)
err2;   std     r15,72(r3)
err2;   std     r16,80(r3)
err2;   std     r17,88(r3)
err2;   std     r18,96(r3)
err2;   std     r19,104(r3)
err2;   std     r20,112(r3)
err2;   std     r21,120(r3)
        addi    r3,r3,128
        bdnz    4b

        clrldi  r5,r5,(64-7)

        ld      r14,STK_REG(R14)(r1)
        ld      r15,STK_REG(R15)(r1)
        ld      r16,STK_REG(R16)(r1)
        ld      r17,STK_REG(R17)(r1)
        ld      r18,STK_REG(R18)(r1)
        ld      r19,STK_REG(R19)(r1)
        ld      r20,STK_REG(R20)(r1)
        ld      r21,STK_REG(R21)(r1)
        ld      r22,STK_REG(R22)(r1)
        addi    r1,r1,STACKFRAMESIZE

        /* Up to 127B to go */
5:      srdi    r6,r5,4
        mtocrf  0x01,r6

6:      bf      cr7*4+1,7f
err1;   ld      r0,0(r4)
err1;   ld      r6,8(r4)
err1;   ld      r7,16(r4)
err1;   ld      r8,24(r4)
err1;   ld      r9,32(r4)
err1;   ld      r10,40(r4)
err1;   ld      r11,48(r4)
err1;   ld      r12,56(r4)
        addi    r4,r4,64
err1;   std     r0,0(r3)
err1;   std     r6,8(r3)
err1;   std     r7,16(r3)
err1;   std     r8,24(r3)
err1;   std     r9,32(r3)
err1;   std     r10,40(r3)
err1;   std     r11,48(r3)
err1;   std     r12,56(r3)
        addi    r3,r3,64

        /* Up to 63B to go */
7:      bf      cr7*4+2,8f
err1;   ld      r0,0(r4)
err1;   ld      r6,8(r4)
err1;   ld      r7,16(r4)
err1;   ld      r8,24(r4)
        addi    r4,r4,32
err1;   std     r0,0(r3)
err1;   std     r6,8(r3)
err1;   std     r7,16(r3)
err1;   std     r8,24(r3)
        addi    r3,r3,32

        /* Up to 31B to go */
8:      bf      cr7*4+3,9f
err1;   ld      r0,0(r4)
err1;   ld      r6,8(r4)
        addi    r4,r4,16
err1;   std     r0,0(r3)
err1;   std     r6,8(r3)
        addi    r3,r3,16

9:      clrldi  r5,r5,(64-4)

        /* Up to 15B to go */
.Lshort_copy:
        mtocrf  0x01,r5
        bf      cr7*4+0,12f
err1;   lwz     r0,0(r4)        /* Less chance of a reject with word ops */
err1;   lwz     r6,4(r4)
        addi    r4,r4,8
err1;   stw     r0,0(r3)
err1;   stw     r6,4(r3)
        addi    r3,r3,8

12:     bf      cr7*4+1,13f
err1;   lwz     r0,0(r4)
        addi    r4,r4,4
err1;   stw     r0,0(r3)
        addi    r3,r3,4

13:     bf      cr7*4+2,14f
err1;   lhz     r0,0(r4)
        addi    r4,r4,2
err1;   sth     r0,0(r3)
        addi    r3,r3,2

14:     bf      cr7*4+3,15f
err1;   lbz     r0,0(r4)
err1;   stb     r0,0(r3)

15:     li      r3,0
        blr

.Lunwind_stack_nonvmx_copy:
        addi    r1,r1,STACKFRAMESIZE
        b       .Lnonvmx_copy

.Lvmx_copy:
#ifdef CONFIG_ALTIVEC
        mflr    r0
        std     r0,16(r1)
        stdu    r1,-STACKFRAMESIZE(r1)
        bl      enter_vmx_usercopy
        cmpwi   cr1,r3,0
        ld      r0,STACKFRAMESIZE+16(r1)
        ld      r3,STK_REG(R31)(r1)
        ld      r4,STK_REG(R30)(r1)
        ld      r5,STK_REG(R29)(r1)
        mtlr    r0

        /*
         * We prefetch both the source and destination using enhanced touch
         * instructions. We use a stream ID of 0 for the load side and
         * 1 for the store side.
         */
        clrrdi  r6,r4,7
        clrrdi  r9,r3,7
        ori     r9,r9,1         /* stream=1 */

        srdi    r7,r5,7         /* length in cachelines, capped at 0x3FF */
        cmpldi  r7,0x3FF
        ble     1f
        li      r7,0x3FF
1:      lis     r0,0x0E00       /* depth=7 */
        sldi    r7,r7,7
        or      r7,r7,r0
        ori     r10,r7,1        /* stream=1 */

        lis     r8,0x8000       /* GO=1 */
        clrldi  r8,r8,32

        /* setup read stream 0 */
        dcbt    0,r6,0b01000   /* addr from */
        dcbt    0,r7,0b01010   /* length and depth from */
        /* setup write stream 1 */
        dcbtst  0,r9,0b01000   /* addr to */
        dcbtst  0,r10,0b01010  /* length and depth to */
        eieio
        dcbt    0,r8,0b01010    /* all streams GO */

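        /*
         * enter_vmx_usercopy() returned 0 (tested into cr1 above) when
         * VMX cannot be used here, e.g. because we were called from a
         * context where the vector unit must not be touched; in that
         * case drop the frame and fall back to the doubleword copy.
         */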
        beq     cr1,.Lunwind_stack_nonvmx_copy

        /*
         * If source and destination are not relatively aligned we use a
         * slower permute loop.
         */
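        /*
         * "Relatively aligned" means the two addresses share the same
         * offset within a 16B quadword: the rldicl. below tests the
         * low four bits of their xor.
         */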
        xor     r6,r4,r3
        rldicl. r6,r6,0,(64-4)
        bne     .Lvmx_unaligned_copy

        /* Get the destination 16B aligned */
        neg     r6,r3
        mtocrf  0x01,r6
        clrldi  r6,r6,(64-4)

        bf      cr7*4+3,1f
err3;   lbz     r0,0(r4)
        addi    r4,r4,1
err3;   stb     r0,0(r3)
        addi    r3,r3,1

1:      bf      cr7*4+2,2f
err3;   lhz     r0,0(r4)
        addi    r4,r4,2
err3;   sth     r0,0(r3)
        addi    r3,r3,2

2:      bf      cr7*4+1,3f
err3;   lwz     r0,0(r4)
        addi    r4,r4,4
err3;   stw     r0,0(r3)
        addi    r3,r3,4

3:      bf      cr7*4+0,4f
err3;   ld      r0,0(r4)
        addi    r4,r4,8
err3;   std     r0,0(r3)
        addi    r3,r3,8

4:      sub     r5,r5,r6

        /* Get the destination 128B aligned */
        neg     r6,r3
        srdi    r7,r6,4
        mtocrf  0x01,r7
        clrldi  r6,r6,(64-7)

        li      r9,16
        li      r10,32
        li      r11,48

        bf      cr7*4+3,5f
err3;   lvx     v1,0,r4
        addi    r4,r4,16
err3;   stvx    v1,0,r3
        addi    r3,r3,16

5:      bf      cr7*4+2,6f
err3;   lvx     v1,0,r4
err3;   lvx     v0,r4,r9
        addi    r4,r4,32
err3;   stvx    v1,0,r3
err3;   stvx    v0,r3,r9
        addi    r3,r3,32

6:      bf      cr7*4+1,7f
err3;   lvx     v3,0,r4
err3;   lvx     v2,r4,r9
err3;   lvx     v1,r4,r10
err3;   lvx     v0,r4,r11
        addi    r4,r4,64
err3;   stvx    v3,0,r3
err3;   stvx    v2,r3,r9
err3;   stvx    v1,r3,r10
err3;   stvx    v0,r3,r11
        addi    r3,r3,64

7:      sub     r5,r5,r6
        srdi    r6,r5,7

        std     r14,STK_REG(R14)(r1)
        std     r15,STK_REG(R15)(r1)
        std     r16,STK_REG(R16)(r1)

        li      r12,64
        li      r14,80
        li      r15,96
        li      r16,112

        mtctr   r6

        /*
         * Now do cacheline sized loads and stores. By this stage the
         * cacheline stores are also cacheline aligned.
         */
        .align  5
8:
err4;   lvx     v7,0,r4
err4;   lvx     v6,r4,r9
err4;   lvx     v5,r4,r10
err4;   lvx     v4,r4,r11
err4;   lvx     v3,r4,r12
err4;   lvx     v2,r4,r14
err4;   lvx     v1,r4,r15
err4;   lvx     v0,r4,r16
        addi    r4,r4,128
err4;   stvx    v7,0,r3
err4;   stvx    v6,r3,r9
err4;   stvx    v5,r3,r10
err4;   stvx    v4,r3,r11
err4;   stvx    v3,r3,r12
err4;   stvx    v2,r3,r14
err4;   stvx    v1,r3,r15
err4;   stvx    v0,r3,r16
        addi    r3,r3,128
        bdnz    8b

        ld      r14,STK_REG(R14)(r1)
        ld      r15,STK_REG(R15)(r1)
        ld      r16,STK_REG(R16)(r1)

        /* Up to 127B to go */
        clrldi  r5,r5,(64-7)
        srdi    r6,r5,4
        mtocrf  0x01,r6

        bf      cr7*4+1,9f
err3;   lvx     v3,0,r4
err3;   lvx     v2,r4,r9
err3;   lvx     v1,r4,r10
err3;   lvx     v0,r4,r11
        addi    r4,r4,64
err3;   stvx    v3,0,r3
err3;   stvx    v2,r3,r9
err3;   stvx    v1,r3,r10
err3;   stvx    v0,r3,r11
        addi    r3,r3,64

9:      bf      cr7*4+2,10f
err3;   lvx     v1,0,r4
err3;   lvx     v0,r4,r9
        addi    r4,r4,32
err3;   stvx    v1,0,r3
err3;   stvx    v0,r3,r9
        addi    r3,r3,32

10:     bf      cr7*4+3,11f
err3;   lvx     v1,0,r4
        addi    r4,r4,16
err3;   stvx    v1,0,r3
        addi    r3,r3,16

        /* Up to 15B to go */
11:     clrldi  r5,r5,(64-4)
        mtocrf  0x01,r5
        bf      cr7*4+0,12f
err3;   ld      r0,0(r4)
        addi    r4,r4,8
err3;   std     r0,0(r3)
        addi    r3,r3,8

12:     bf      cr7*4+1,13f
err3;   lwz     r0,0(r4)
        addi    r4,r4,4
err3;   stw     r0,0(r3)
        addi    r3,r3,4

13:     bf      cr7*4+2,14f
err3;   lhz     r0,0(r4)
        addi    r4,r4,2
err3;   sth     r0,0(r3)
        addi    r3,r3,2

14:     bf      cr7*4+3,15f
err3;   lbz     r0,0(r4)
err3;   stb     r0,0(r3)

15:     addi    r1,r1,STACKFRAMESIZE
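        /*
         * exit_vmx_usercopy() is expected to return 0, which also
         * serves as the "bytes not copied" return value of the
         * successful copy, hence the tail call rather than bl/blr.
         */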
        b       exit_vmx_usercopy       /* tail call optimise */

.Lvmx_unaligned_copy:
        /* Get the destination 16B aligned */
        neg     r6,r3
        mtocrf  0x01,r6
        clrldi  r6,r6,(64-4)

        bf      cr7*4+3,1f
err3;   lbz     r0,0(r4)
        addi    r4,r4,1
err3;   stb     r0,0(r3)
        addi    r3,r3,1

1:      bf      cr7*4+2,2f
err3;   lhz     r0,0(r4)
        addi    r4,r4,2
err3;   sth     r0,0(r3)
        addi    r3,r3,2

2:      bf      cr7*4+1,3f
err3;   lwz     r0,0(r4)
        addi    r4,r4,4
err3;   stw     r0,0(r3)
        addi    r3,r3,4

3:      bf      cr7*4+0,4f
err3;   lwz     r0,0(r4)        /* Less chance of a reject with word ops */
err3;   lwz     r7,4(r4)
        addi    r4,r4,8
err3;   stw     r0,0(r3)
err3;   stw     r7,4(r3)
        addi    r3,r3,8

4:      sub     r5,r5,r6

        /* Get the destination 128B aligned */
        neg     r6,r3
        srdi    r7,r6,4
        mtocrf  0x01,r7
        clrldi  r6,r6,(64-7)

        li      r9,16
        li      r10,32
        li      r11,48

        LVS(v16,0,r4)           /* Setup permute control vector */
err3;   lvx     v0,0,r4
        addi    r4,r4,16
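        /*
         * One quadword of source data is now held ahead in v0; each
         * VPERM below combines the previously loaded quadword with the
         * newly loaded one through the control vector in v16, yielding
         * a realigned 16B block to store.
         */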

        bf      cr7*4+3,5f
err3;   lvx     v1,0,r4
        VPERM(v8,v0,v1,v16)
        addi    r4,r4,16
err3;   stvx    v8,0,r3
        addi    r3,r3,16
        vor     v0,v1,v1

5:      bf      cr7*4+2,6f
err3;   lvx     v1,0,r4
        VPERM(v8,v0,v1,v16)
err3;   lvx     v0,r4,r9
        VPERM(v9,v1,v0,v16)
        addi    r4,r4,32
err3;   stvx    v8,0,r3
err3;   stvx    v9,r3,r9
        addi    r3,r3,32

6:      bf      cr7*4+1,7f
err3;   lvx     v3,0,r4
        VPERM(v8,v0,v3,v16)
err3;   lvx     v2,r4,r9
        VPERM(v9,v3,v2,v16)
err3;   lvx     v1,r4,r10
        VPERM(v10,v2,v1,v16)
err3;   lvx     v0,r4,r11
        VPERM(v11,v1,v0,v16)
        addi    r4,r4,64
err3;   stvx    v8,0,r3
err3;   stvx    v9,r3,r9
err3;   stvx    v10,r3,r10
err3;   stvx    v11,r3,r11
        addi    r3,r3,64

7:      sub     r5,r5,r6
        srdi    r6,r5,7

        std     r14,STK_REG(R14)(r1)
        std     r15,STK_REG(R15)(r1)
        std     r16,STK_REG(R16)(r1)

        li      r12,64
        li      r14,80
        li      r15,96
        li      r16,112

        mtctr   r6

        /*
         * Now do cacheline sized loads and stores. By this stage the
         * cacheline stores are also cacheline aligned.
         */
        .align  5
8:
err4;   lvx     v7,0,r4
        VPERM(v8,v0,v7,v16)
err4;   lvx     v6,r4,r9
        VPERM(v9,v7,v6,v16)
err4;   lvx     v5,r4,r10
        VPERM(v10,v6,v5,v16)
err4;   lvx     v4,r4,r11
        VPERM(v11,v5,v4,v16)
err4;   lvx     v3,r4,r12
        VPERM(v12,v4,v3,v16)
err4;   lvx     v2,r4,r14
        VPERM(v13,v3,v2,v16)
err4;   lvx     v1,r4,r15
        VPERM(v14,v2,v1,v16)
err4;   lvx     v0,r4,r16
        VPERM(v15,v1,v0,v16)
        addi    r4,r4,128
err4;   stvx    v8,0,r3
err4;   stvx    v9,r3,r9
err4;   stvx    v10,r3,r10
err4;   stvx    v11,r3,r11
err4;   stvx    v12,r3,r12
err4;   stvx    v13,r3,r14
err4;   stvx    v14,r3,r15
err4;   stvx    v15,r3,r16
        addi    r3,r3,128
        bdnz    8b

        ld      r14,STK_REG(R14)(r1)
        ld      r15,STK_REG(R15)(r1)
        ld      r16,STK_REG(R16)(r1)

        /* Up to 127B to go */
        clrldi  r5,r5,(64-7)
        srdi    r6,r5,4
        mtocrf  0x01,r6

        bf      cr7*4+1,9f
err3;   lvx     v3,0,r4
        VPERM(v8,v0,v3,v16)
err3;   lvx     v2,r4,r9
        VPERM(v9,v3,v2,v16)
err3;   lvx     v1,r4,r10
        VPERM(v10,v2,v1,v16)
err3;   lvx     v0,r4,r11
        VPERM(v11,v1,v0,v16)
        addi    r4,r4,64
err3;   stvx    v8,0,r3
err3;   stvx    v9,r3,r9
err3;   stvx    v10,r3,r10
err3;   stvx    v11,r3,r11
        addi    r3,r3,64

9:      bf      cr7*4+2,10f
err3;   lvx     v1,0,r4
        VPERM(v8,v0,v1,v16)
err3;   lvx     v0,r4,r9
        VPERM(v9,v1,v0,v16)
        addi    r4,r4,32
err3;   stvx    v8,0,r3
err3;   stvx    v9,r3,r9
        addi    r3,r3,32

10:     bf      cr7*4+3,11f
err3;   lvx     v1,0,r4
        VPERM(v8,v0,v1,v16)
        addi    r4,r4,16
err3;   stvx    v8,0,r3
        addi    r3,r3,16

        /* Up to 15B to go */
11:     clrldi  r5,r5,(64-4)
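        /*
         * The permute pipeline keeps one quadword of source data in
         * flight (v0 was primed before the loop), so r4 is 16 bytes
         * ahead of what has actually been consumed; step it back
         * before the scalar tail copy.
         */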
        addi    r4,r4,-16       /* Unwind the +16 load offset */
        mtocrf  0x01,r5
        bf      cr7*4+0,12f
err3;   lwz     r0,0(r4)        /* Less chance of a reject with word ops */
err3;   lwz     r6,4(r4)
        addi    r4,r4,8
err3;   stw     r0,0(r3)
err3;   stw     r6,4(r3)
        addi    r3,r3,8

12:     bf      cr7*4+1,13f
err3;   lwz     r0,0(r4)
        addi    r4,r4,4
err3;   stw     r0,0(r3)
        addi    r3,r3,4

13:     bf      cr7*4+2,14f
err3;   lhz     r0,0(r4)
        addi    r4,r4,2
err3;   sth     r0,0(r3)
        addi    r3,r3,2

14:     bf      cr7*4+3,15f
err3;   lbz     r0,0(r4)
err3;   stb     r0,0(r3)

15:     addi    r1,r1,STACKFRAMESIZE
        b       exit_vmx_usercopy       /* tail call optimise */
#endif /* CONFIG_ALTIVEC */