/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 *
 * Copyright (C) IBM Corporation, 2011
 *
 * Author: Anton Blanchard <anton@au.ibm.com>
 */
#include <asm/ppc_asm.h>

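/*
 * LVS/VPERM hide the endian differences in the unaligned (permute) copy
 * path: big-endian uses lvsl and vperm as-is, while little-endian uses
 * lvsr and swaps the two vector inputs to vperm so the merged quadword
 * comes out correctly for the reversed byte order.
 */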
#ifdef __BIG_ENDIAN__
#define LVS(VRT,RA,RB)          lvsl    VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)  vperm   VRT,VRA,VRB,VRC
#else
#define LVS(VRT,RA,RB)          lvsr    VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)  vperm   VRT,VRB,VRA,VRC
#endif

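/*
 * The errN macros tag the user-memory access that follows them with an
 * exception table entry, so a fault in that instruction branches to the
 * matching .Ldo_errN fixup below.
 */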
        .macro err1
100:
        EX_TABLE(100b,.Ldo_err1)
        .endm

        .macro err2
200:
        EX_TABLE(200b,.Ldo_err2)
        .endm

#ifdef CONFIG_ALTIVEC
        .macro err3
300:
        EX_TABLE(300b,.Ldo_err3)
        .endm

        .macro err4
400:
        EX_TABLE(400b,.Ldo_err4)
        .endm


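/*
 * Fault fixups: restore any saved non-volatile GPRs, leave VMX if it was
 * in use, pop the stack frame, then reload the original dst/src/len and
 * hand the whole copy to __copy_tofrom_user_base, which takes the fault
 * again and returns the number of bytes that could not be copied.
 */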
.Ldo_err4:
        ld      r16,STK_REG(R16)(r1)
        ld      r15,STK_REG(R15)(r1)
        ld      r14,STK_REG(R14)(r1)
.Ldo_err3:
        bl      exit_vmx_usercopy
        ld      r0,STACKFRAMESIZE+16(r1)
        mtlr    r0
        b       .Lexit
#endif /* CONFIG_ALTIVEC */

.Ldo_err2:
        ld      r22,STK_REG(R22)(r1)
        ld      r21,STK_REG(R21)(r1)
        ld      r20,STK_REG(R20)(r1)
        ld      r19,STK_REG(R19)(r1)
        ld      r18,STK_REG(R18)(r1)
        ld      r17,STK_REG(R17)(r1)
        ld      r16,STK_REG(R16)(r1)
        ld      r15,STK_REG(R15)(r1)
        ld      r14,STK_REG(R14)(r1)
.Lexit:
        addi    r1,r1,STACKFRAMESIZE
.Ldo_err1:
        ld      r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
        ld      r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
        ld      r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
        b       __copy_tofrom_user_base


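/*
 * Entry: r3 = destination, r4 = source, r5 = length. Copies shorter than
 * 16 bytes go straight to .Lshort_copy; with ALTIVEC, copies of 3328 bytes
 * or more use the VMX path; everything in between is done with GPRs.
 */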
_GLOBAL(__copy_tofrom_user_power7)
#ifdef CONFIG_ALTIVEC
        cmpldi  r5,16
        cmpldi  cr1,r5,3328

        std     r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
        std     r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
        std     r5,-STACKFRAMESIZE+STK_REG(R29)(r1)

        blt     .Lshort_copy
        bge     cr1,.Lvmx_copy
#else
        cmpldi  r5,16

        std     r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
        std     r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
        std     r5,-STACKFRAMESIZE+STK_REG(R29)(r1)

        blt     .Lshort_copy
#endif

.Lnonvmx_copy:
        /* Get the source 8B aligned */
        neg     r6,r4
        mtocrf  0x01,r6
        clrldi  r6,r6,(64-3)

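        /*
         * r6 = number of bytes needed to 8-byte align the source; its low
         * bits are copied into cr7 so each bf below conditionally moves
         * 1, 2 or 4 bytes.
         */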
        bf      cr7*4+3,1f
err1;   lbz     r0,0(r4)
        addi    r4,r4,1
err1;   stb     r0,0(r3)
        addi    r3,r3,1

1:      bf      cr7*4+2,2f
err1;   lhz     r0,0(r4)
        addi    r4,r4,2
err1;   sth     r0,0(r3)
        addi    r3,r3,2

2:      bf      cr7*4+1,3f
err1;   lwz     r0,0(r4)
        addi    r4,r4,4
err1;   stw     r0,0(r3)
        addi    r3,r3,4

3:      sub     r5,r5,r6
        cmpldi  r5,128
        blt     5f

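        /*
         * At least 128 bytes remain: set up a stack frame and save the
         * non-volatile GPRs (r14-r22) used by the unrolled cacheline loop;
         * LR is saved in the standard stack slot.
         */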
        mflr    r0
        stdu    r1,-STACKFRAMESIZE(r1)
        std     r14,STK_REG(R14)(r1)
        std     r15,STK_REG(R15)(r1)
        std     r16,STK_REG(R16)(r1)
        std     r17,STK_REG(R17)(r1)
        std     r18,STK_REG(R18)(r1)
        std     r19,STK_REG(R19)(r1)
        std     r20,STK_REG(R20)(r1)
        std     r21,STK_REG(R21)(r1)
        std     r22,STK_REG(R22)(r1)
        std     r0,STACKFRAMESIZE+16(r1)

        srdi    r6,r5,7
        mtctr   r6

        /* Now do cacheline (128B) sized loads and stores. */
        .align  5
4:
err2;   ld      r0,0(r4)
err2;   ld      r6,8(r4)
err2;   ld      r7,16(r4)
err2;   ld      r8,24(r4)
err2;   ld      r9,32(r4)
err2;   ld      r10,40(r4)
err2;   ld      r11,48(r4)
err2;   ld      r12,56(r4)
err2;   ld      r14,64(r4)
err2;   ld      r15,72(r4)
err2;   ld      r16,80(r4)
err2;   ld      r17,88(r4)
err2;   ld      r18,96(r4)
err2;   ld      r19,104(r4)
err2;   ld      r20,112(r4)
err2;   ld      r21,120(r4)
        addi    r4,r4,128
err2;   std     r0,0(r3)
err2;   std     r6,8(r3)
err2;   std     r7,16(r3)
err2;   std     r8,24(r3)
err2;   std     r9,32(r3)
err2;   std     r10,40(r3)
err2;   std     r11,48(r3)
err2;   std     r12,56(r3)
err2;   std     r14,64(r3)
err2;   std     r15,72(r3)
err2;   std     r16,80(r3)
err2;   std     r17,88(r3)
err2;   std     r18,96(r3)
err2;   std     r19,104(r3)
err2;   std     r20,112(r3)
err2;   std     r21,120(r3)
        addi    r3,r3,128
        bdnz    4b

        clrldi  r5,r5,(64-7)

        ld      r14,STK_REG(R14)(r1)
        ld      r15,STK_REG(R15)(r1)
        ld      r16,STK_REG(R16)(r1)
        ld      r17,STK_REG(R17)(r1)
        ld      r18,STK_REG(R18)(r1)
        ld      r19,STK_REG(R19)(r1)
        ld      r20,STK_REG(R20)(r1)
        ld      r21,STK_REG(R21)(r1)
        ld      r22,STK_REG(R22)(r1)
        addi    r1,r1,STACKFRAMESIZE

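        /*
         * Tail of the GPR path: the remaining length divided by 16 goes
         * into cr7, selecting optional 64, 32 and 16 byte copies before
         * .Lshort_copy handles the last 0-15 bytes.
         */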
        /* Up to 127B to go */
5:      srdi    r6,r5,4
        mtocrf  0x01,r6

6:      bf      cr7*4+1,7f
err1;   ld      r0,0(r4)
err1;   ld      r6,8(r4)
err1;   ld      r7,16(r4)
err1;   ld      r8,24(r4)
err1;   ld      r9,32(r4)
err1;   ld      r10,40(r4)
err1;   ld      r11,48(r4)
err1;   ld      r12,56(r4)
        addi    r4,r4,64
err1;   std     r0,0(r3)
err1;   std     r6,8(r3)
err1;   std     r7,16(r3)
err1;   std     r8,24(r3)
err1;   std     r9,32(r3)
err1;   std     r10,40(r3)
err1;   std     r11,48(r3)
err1;   std     r12,56(r3)
        addi    r3,r3,64

        /* Up to 63B to go */
7:      bf      cr7*4+2,8f
err1;   ld      r0,0(r4)
err1;   ld      r6,8(r4)
err1;   ld      r7,16(r4)
err1;   ld      r8,24(r4)
        addi    r4,r4,32
err1;   std     r0,0(r3)
err1;   std     r6,8(r3)
err1;   std     r7,16(r3)
err1;   std     r8,24(r3)
        addi    r3,r3,32

        /* Up to 31B to go */
8:      bf      cr7*4+3,9f
err1;   ld      r0,0(r4)
err1;   ld      r6,8(r4)
        addi    r4,r4,16
err1;   std     r0,0(r3)
err1;   std     r6,8(r3)
        addi    r3,r3,16

9:      clrldi  r5,r5,(64-4)

        /* Up to 15B to go */
.Lshort_copy:
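        /* The low 4 bits of the remaining length select 8/4/2/1 byte copies. */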
        mtocrf  0x01,r5
        bf      cr7*4+0,12f
err1;   lwz     r0,0(r4)        /* Less chance of a reject with word ops */
err1;   lwz     r6,4(r4)
        addi    r4,r4,8
err1;   stw     r0,0(r3)
err1;   stw     r6,4(r3)
        addi    r3,r3,8

12:     bf      cr7*4+1,13f
err1;   lwz     r0,0(r4)
        addi    r4,r4,4
err1;   stw     r0,0(r3)
        addi    r3,r3,4

13:     bf      cr7*4+2,14f
err1;   lhz     r0,0(r4)
        addi    r4,r4,2
err1;   sth     r0,0(r3)
        addi    r3,r3,2

14:     bf      cr7*4+3,15f
err1;   lbz     r0,0(r4)
err1;   stb     r0,0(r3)

15:     li      r3,0
        blr

.Lunwind_stack_nonvmx_copy:
        addi    r1,r1,STACKFRAMESIZE
        b       .Lnonvmx_copy

#ifdef CONFIG_ALTIVEC
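/*
 * VMX copy: enter_vmx_usercopy() checks whether the vector unit may be
 * used here and prepares it; if it returns 0 we fall back to the GPR copy
 * via .Lunwind_stack_nonvmx_copy.
 */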
.Lvmx_copy:
        mflr    r0
        std     r0,16(r1)
        stdu    r1,-STACKFRAMESIZE(r1)
        bl      enter_vmx_usercopy
        cmpwi   cr1,r3,0
        ld      r0,STACKFRAMESIZE+16(r1)
        ld      r3,STK_REG(R31)(r1)
        ld      r4,STK_REG(R30)(r1)
        ld      r5,STK_REG(R29)(r1)
        mtlr    r0

        /*
         * We prefetch both the source and destination using enhanced touch
         * instructions. We use a stream ID of 0 for the load side and
         * 1 for the store side.
         */
        clrrdi  r6,r4,7
        clrrdi  r9,r3,7
        ori     r9,r9,1         /* stream=1 */

        srdi    r7,r5,7         /* length in cachelines, capped at 0x3FF */
        cmpldi  r7,0x3FF
        ble     1f
        li      r7,0x3FF
1:      lis     r0,0x0E00       /* depth=7 */
        sldi    r7,r7,7
        or      r7,r7,r0
        ori     r10,r7,1        /* stream=1 */

        lis     r8,0x8000       /* GO=1 */
        clrldi  r8,r8,32

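        /*
         * The extended dcbt/dcbtst forms below program the hardware
         * prefetcher: TH=0b01000 supplies a stream's start address,
         * TH=0b01010 its length and depth, and the final dcbt with the
         * GO bit set in r8 starts all configured streams.
         */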
.machine push
.machine "power4"
        /* setup read stream 0 */
        dcbt    0,r6,0b01000   /* addr from */
        dcbt    0,r7,0b01010   /* length and depth from */
        /* setup write stream 1 */
        dcbtst  0,r9,0b01000   /* addr to */
        dcbtst  0,r10,0b01010  /* length and depth to */
        eieio
        dcbt    0,r8,0b01010    /* all streams GO */
.machine pop

        beq     cr1,.Lunwind_stack_nonvmx_copy

        /*
         * If source and destination are not relatively aligned we use a
         * slower permute loop.
         */
        xor     r6,r4,r3
        rldicl. r6,r6,0,(64-4)
        bne     .Lvmx_unaligned_copy

        /* Get the destination 16B aligned */
        neg     r6,r3
        mtocrf  0x01,r6
        clrldi  r6,r6,(64-4)

        bf      cr7*4+3,1f
err3;   lbz     r0,0(r4)
        addi    r4,r4,1
err3;   stb     r0,0(r3)
        addi    r3,r3,1

1:      bf      cr7*4+2,2f
err3;   lhz     r0,0(r4)
        addi    r4,r4,2
err3;   sth     r0,0(r3)
        addi    r3,r3,2

2:      bf      cr7*4+1,3f
err3;   lwz     r0,0(r4)
        addi    r4,r4,4
err3;   stw     r0,0(r3)
        addi    r3,r3,4

3:      bf      cr7*4+0,4f
err3;   ld      r0,0(r4)
        addi    r4,r4,8
err3;   std     r0,0(r3)
        addi    r3,r3,8

4:      sub     r5,r5,r6

        /* Get the destination 128B aligned */
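        /*
         * Copy up to 112 bytes in 16/32/64 byte pieces (selected from cr7)
         * so the stores in the main loop below are cacheline aligned.
         */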
        neg     r6,r3
        srdi    r7,r6,4
        mtocrf  0x01,r7
        clrldi  r6,r6,(64-7)

        li      r9,16
        li      r10,32
        li      r11,48

        bf      cr7*4+3,5f
err3;   lvx     v1,0,r4
        addi    r4,r4,16
err3;   stvx    v1,0,r3
        addi    r3,r3,16

5:      bf      cr7*4+2,6f
err3;   lvx     v1,0,r4
err3;   lvx     v0,r4,r9
        addi    r4,r4,32
err3;   stvx    v1,0,r3
err3;   stvx    v0,r3,r9
        addi    r3,r3,32

6:      bf      cr7*4+1,7f
err3;   lvx     v3,0,r4
err3;   lvx     v2,r4,r9
err3;   lvx     v1,r4,r10
err3;   lvx     v0,r4,r11
        addi    r4,r4,64
err3;   stvx    v3,0,r3
err3;   stvx    v2,r3,r9
err3;   stvx    v1,r3,r10
err3;   stvx    v0,r3,r11
        addi    r3,r3,64

7:      sub     r5,r5,r6
        srdi    r6,r5,7

        std     r14,STK_REG(R14)(r1)
        std     r15,STK_REG(R15)(r1)
        std     r16,STK_REG(R16)(r1)

        li      r12,64
        li      r14,80
        li      r15,96
        li      r16,112

        mtctr   r6

        /*
         * Now do cacheline sized loads and stores. By this stage the
         * cacheline stores are also cacheline aligned.
         */
        .align  5
8:
err4;   lvx     v7,0,r4
err4;   lvx     v6,r4,r9
err4;   lvx     v5,r4,r10
err4;   lvx     v4,r4,r11
err4;   lvx     v3,r4,r12
err4;   lvx     v2,r4,r14
err4;   lvx     v1,r4,r15
err4;   lvx     v0,r4,r16
        addi    r4,r4,128
err4;   stvx    v7,0,r3
err4;   stvx    v6,r3,r9
err4;   stvx    v5,r3,r10
err4;   stvx    v4,r3,r11
err4;   stvx    v3,r3,r12
err4;   stvx    v2,r3,r14
err4;   stvx    v1,r3,r15
err4;   stvx    v0,r3,r16
        addi    r3,r3,128
        bdnz    8b

        ld      r14,STK_REG(R14)(r1)
        ld      r15,STK_REG(R15)(r1)
        ld      r16,STK_REG(R16)(r1)

        /* Up to 127B to go */
        clrldi  r5,r5,(64-7)
        srdi    r6,r5,4
        mtocrf  0x01,r6

        bf      cr7*4+1,9f
err3;   lvx     v3,0,r4
err3;   lvx     v2,r4,r9
err3;   lvx     v1,r4,r10
err3;   lvx     v0,r4,r11
        addi    r4,r4,64
err3;   stvx    v3,0,r3
err3;   stvx    v2,r3,r9
err3;   stvx    v1,r3,r10
err3;   stvx    v0,r3,r11
        addi    r3,r3,64

9:      bf      cr7*4+2,10f
err3;   lvx     v1,0,r4
err3;   lvx     v0,r4,r9
        addi    r4,r4,32
err3;   stvx    v1,0,r3
err3;   stvx    v0,r3,r9
        addi    r3,r3,32

10:     bf      cr7*4+3,11f
err3;   lvx     v1,0,r4
        addi    r4,r4,16
err3;   stvx    v1,0,r3
        addi    r3,r3,16

        /* Up to 15B to go */
11:     clrldi  r5,r5,(64-4)
        mtocrf  0x01,r5
        bf      cr7*4+0,12f
err3;   ld      r0,0(r4)
        addi    r4,r4,8
err3;   std     r0,0(r3)
        addi    r3,r3,8

12:     bf      cr7*4+1,13f
err3;   lwz     r0,0(r4)
        addi    r4,r4,4
err3;   stw     r0,0(r3)
        addi    r3,r3,4

13:     bf      cr7*4+2,14f
err3;   lhz     r0,0(r4)
        addi    r4,r4,2
err3;   sth     r0,0(r3)
        addi    r3,r3,2

14:     bf      cr7*4+3,15f
err3;   lbz     r0,0(r4)
err3;   stb     r0,0(r3)

15:     addi    r1,r1,STACKFRAMESIZE
        b       exit_vmx_usercopy       /* tail call optimise */

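/*
 * Source and destination are misaligned with respect to each other: use
 * LVS to build a permute control vector (v16) and VPERM to combine each
 * pair of adjacent 16-byte loads into an aligned store. v0 always carries
 * the most recently loaded quadword into the next block, which is why r4
 * runs 16 bytes ahead and is unwound before the final sub-16-byte tail.
 */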
.Lvmx_unaligned_copy:
        /* Get the destination 16B aligned */
        neg     r6,r3
        mtocrf  0x01,r6
        clrldi  r6,r6,(64-4)

        bf      cr7*4+3,1f
err3;   lbz     r0,0(r4)
        addi    r4,r4,1
err3;   stb     r0,0(r3)
        addi    r3,r3,1

1:      bf      cr7*4+2,2f
err3;   lhz     r0,0(r4)
        addi    r4,r4,2
err3;   sth     r0,0(r3)
        addi    r3,r3,2

2:      bf      cr7*4+1,3f
err3;   lwz     r0,0(r4)
        addi    r4,r4,4
err3;   stw     r0,0(r3)
        addi    r3,r3,4

3:      bf      cr7*4+0,4f
err3;   lwz     r0,0(r4)        /* Less chance of a reject with word ops */
err3;   lwz     r7,4(r4)
        addi    r4,r4,8
err3;   stw     r0,0(r3)
err3;   stw     r7,4(r3)
        addi    r3,r3,8

4:      sub     r5,r5,r6

        /* Get the destination 128B aligned */
        neg     r6,r3
        srdi    r7,r6,4
        mtocrf  0x01,r7
        clrldi  r6,r6,(64-7)

        li      r9,16
        li      r10,32
        li      r11,48

        LVS(v16,0,r4)           /* Setup permute control vector */
err3;   lvx     v0,0,r4
        addi    r4,r4,16

        bf      cr7*4+3,5f
err3;   lvx     v1,0,r4
        VPERM(v8,v0,v1,v16)
        addi    r4,r4,16
err3;   stvx    v8,0,r3
        addi    r3,r3,16
        vor     v0,v1,v1

5:      bf      cr7*4+2,6f
err3;   lvx     v1,0,r4
        VPERM(v8,v0,v1,v16)
err3;   lvx     v0,r4,r9
        VPERM(v9,v1,v0,v16)
        addi    r4,r4,32
err3;   stvx    v8,0,r3
err3;   stvx    v9,r3,r9
        addi    r3,r3,32

6:      bf      cr7*4+1,7f
err3;   lvx     v3,0,r4
        VPERM(v8,v0,v3,v16)
err3;   lvx     v2,r4,r9
        VPERM(v9,v3,v2,v16)
err3;   lvx     v1,r4,r10
        VPERM(v10,v2,v1,v16)
err3;   lvx     v0,r4,r11
        VPERM(v11,v1,v0,v16)
        addi    r4,r4,64
err3;   stvx    v8,0,r3
err3;   stvx    v9,r3,r9
err3;   stvx    v10,r3,r10
err3;   stvx    v11,r3,r11
        addi    r3,r3,64

7:      sub     r5,r5,r6
        srdi    r6,r5,7

        std     r14,STK_REG(R14)(r1)
        std     r15,STK_REG(R15)(r1)
        std     r16,STK_REG(R16)(r1)

        li      r12,64
        li      r14,80
        li      r15,96
        li      r16,112

        mtctr   r6

        /*
         * Now do cacheline sized loads and stores. By this stage the
         * cacheline stores are also cacheline aligned.
         */
        .align  5
8:
err4;   lvx     v7,0,r4
        VPERM(v8,v0,v7,v16)
err4;   lvx     v6,r4,r9
        VPERM(v9,v7,v6,v16)
err4;   lvx     v5,r4,r10
        VPERM(v10,v6,v5,v16)
err4;   lvx     v4,r4,r11
        VPERM(v11,v5,v4,v16)
err4;   lvx     v3,r4,r12
        VPERM(v12,v4,v3,v16)
err4;   lvx     v2,r4,r14
        VPERM(v13,v3,v2,v16)
err4;   lvx     v1,r4,r15
        VPERM(v14,v2,v1,v16)
err4;   lvx     v0,r4,r16
        VPERM(v15,v1,v0,v16)
        addi    r4,r4,128
err4;   stvx    v8,0,r3
err4;   stvx    v9,r3,r9
err4;   stvx    v10,r3,r10
err4;   stvx    v11,r3,r11
err4;   stvx    v12,r3,r12
err4;   stvx    v13,r3,r14
err4;   stvx    v14,r3,r15
err4;   stvx    v15,r3,r16
        addi    r3,r3,128
        bdnz    8b

        ld      r14,STK_REG(R14)(r1)
        ld      r15,STK_REG(R15)(r1)
        ld      r16,STK_REG(R16)(r1)

        /* Up to 127B to go */
        clrldi  r5,r5,(64-7)
        srdi    r6,r5,4
        mtocrf  0x01,r6

        bf      cr7*4+1,9f
err3;   lvx     v3,0,r4
        VPERM(v8,v0,v3,v16)
err3;   lvx     v2,r4,r9
        VPERM(v9,v3,v2,v16)
err3;   lvx     v1,r4,r10
        VPERM(v10,v2,v1,v16)
err3;   lvx     v0,r4,r11
        VPERM(v11,v1,v0,v16)
        addi    r4,r4,64
err3;   stvx    v8,0,r3
err3;   stvx    v9,r3,r9
err3;   stvx    v10,r3,r10
err3;   stvx    v11,r3,r11
        addi    r3,r3,64

9:      bf      cr7*4+2,10f
err3;   lvx     v1,0,r4
        VPERM(v8,v0,v1,v16)
err3;   lvx     v0,r4,r9
        VPERM(v9,v1,v0,v16)
        addi    r4,r4,32
err3;   stvx    v8,0,r3
err3;   stvx    v9,r3,r9
        addi    r3,r3,32

10:     bf      cr7*4+3,11f
err3;   lvx     v1,0,r4
        VPERM(v8,v0,v1,v16)
        addi    r4,r4,16
err3;   stvx    v8,0,r3
        addi    r3,r3,16

        /* Up to 15B to go */
11:     clrldi  r5,r5,(64-4)
        addi    r4,r4,-16       /* Unwind the +16 load offset */
        mtocrf  0x01,r5
        bf      cr7*4+0,12f
err3;   lwz     r0,0(r4)        /* Less chance of a reject with word ops */
err3;   lwz     r6,4(r4)
        addi    r4,r4,8
err3;   stw     r0,0(r3)
err3;   stw     r6,4(r3)
        addi    r3,r3,8

12:     bf      cr7*4+1,13f
err3;   lwz     r0,0(r4)
        addi    r4,r4,4
err3;   stw     r0,0(r3)
        addi    r3,r3,4

13:     bf      cr7*4+2,14f
err3;   lhz     r0,0(r4)
        addi    r4,r4,2
err3;   sth     r0,0(r3)
        addi    r3,r3,2

14:     bf      cr7*4+3,15f
err3;   lbz     r0,0(r4)
err3;   stb     r0,0(r3)

15:     addi    r1,r1,STACKFRAMESIZE
        b       exit_vmx_usercopy       /* tail call optimise */
#endif /* CONFIG_ALTIVEC */