#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Permission to use under GPL terms is granted.
# ====================================================================

# SHA256 block procedure for ARMv4. May 2007.

# Performance is ~2x better than gcc 3.4 generated code and in "absolute"
# terms is ~2250 cycles per 64-byte block or ~35 cycles per byte
# [on single-issue Xscale PXA250 core].

# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 22% improvement on
# Cortex A8 core and ~20 cycles per processed byte.

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 16%
# improvement on Cortex A8 core and ~15.4 cycles per processed byte.

# September 2013.
#
# Add NEON implementation. On Cortex A8 it was measured to process one
# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
# S4 does it in 12.5 cycles too, but it is 50% faster than integer-only
# code (meaning that the latter performs sub-optimally; nothing was done
# about it).

# May 2014.
#
# Add ARMv8 code path performing at 2.0 cpb on Apple A7.

while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";

$ctx="r0";      $t0="r0";
$inp="r1";      $t4="r1";
$len="r2";      $t1="r2";
$T1="r3";       $t3="r3";
$A="r4";
$B="r5";
$C="r6";
$D="r7";
$E="r8";
$F="r9";
$G="r10";
$H="r11";
@V=($A,$B,$C,$D,$E,$F,$G,$H);
$t2="r12";
$Ktbl="r14";

@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);

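# The rotate/shift counts above implement the SHA-256 functions (FIPS 180-4);
# the last element of each lower-case sigma is a logical shift, not a rotate:
#   Sigma0(x) = ROR(x, 2) ^ ROR(x,13) ^ ROR(x,22)
#   Sigma1(x) = ROR(x, 6) ^ ROR(x,11) ^ ROR(x,25)
#   sigma0(x) = ROR(x, 7) ^ ROR(x,18) ^ SHR(x, 3)
#   sigma1(x) = ROR(x,17) ^ ROR(x,19) ^ SHR(x,10)
#
# Note on the round bodies below: the Maj(a,b,c) term of round i is not
# added to h right away; it is carried in $t2/$t3 and folded in at the
# start of the next round ("h+=Maj(a,b,c) from the past" in the comments).
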
sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___ if ($i<16);
#if __ARM_ARCH__>=7
        @ ldr   $t1,[$inp],#4                   @ $i
# if $i==15
        str     $inp,[sp,#17*4]                 @ make room for $t4
# endif
        eor     $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
        add     $a,$a,$t2                       @ h+=Maj(a,b,c) from the past
        eor     $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`  @ Sigma1(e)
# ifndef __ARMEB__
        rev     $t1,$t1
# endif
#else
        @ ldrb  $t1,[$inp,#3]                   @ $i
        add     $a,$a,$t2                       @ h+=Maj(a,b,c) from the past
        ldrb    $t2,[$inp,#2]
        ldrb    $t0,[$inp,#1]
        orr     $t1,$t1,$t2,lsl#8
        ldrb    $t2,[$inp],#4
        orr     $t1,$t1,$t0,lsl#16
# if $i==15
        str     $inp,[sp,#17*4]                 @ make room for $t4
# endif
        eor     $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
        orr     $t1,$t1,$t2,lsl#24
        eor     $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`  @ Sigma1(e)
#endif
___
$code.=<<___;
        ldr     $t2,[$Ktbl],#4                  @ *K256++
        add     $h,$h,$t1                       @ h+=X[i]
        str     $t1,[sp,#`$i%16`*4]
        eor     $t1,$f,$g
        add     $h,$h,$t0,ror#$Sigma1[0]        @ h+=Sigma1(e)
        and     $t1,$t1,$e
        add     $h,$h,$t2                       @ h+=K256[i]
        eor     $t1,$t1,$g                      @ Ch(e,f,g)
        eor     $t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
        add     $h,$h,$t1                       @ h+=Ch(e,f,g)
#if $i==31
        and     $t2,$t2,#0xff
        cmp     $t2,#0xf2                       @ done?
#endif
#if $i<15
# if __ARM_ARCH__>=7
        ldr     $t1,[$inp],#4                   @ prefetch
# else
        ldrb    $t1,[$inp,#3]
# endif
        eor     $t2,$a,$b                       @ a^b, b^c in next round
#else
        ldr     $t1,[sp,#`($i+2)%16`*4]         @ from future BODY_16_xx
        eor     $t2,$a,$b                       @ a^b, b^c in next round
        ldr     $t4,[sp,#`($i+15)%16`*4]        @ from future BODY_16_xx
#endif
        eor     $t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]`  @ Sigma0(a)
        and     $t3,$t3,$t2                     @ (b^c)&=(a^b)
        add     $d,$d,$h                        @ d+=h
        eor     $t3,$t3,$b                      @ Maj(a,b,c)
        add     $h,$h,$t0,ror#$Sigma0[0]        @ h+=Sigma0(a)
        @ add   $h,$h,$t3                       @ h+=Maj(a,b,c)
___
        ($t2,$t3)=($t3,$t2);
}

sub BODY_16_XX {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___;
        @ ldr   $t1,[sp,#`($i+1)%16`*4]         @ $i
        @ ldr   $t4,[sp,#`($i+14)%16`*4]
        mov     $t0,$t1,ror#$sigma0[0]
        add     $a,$a,$t2                       @ h+=Maj(a,b,c) from the past
        mov     $t2,$t4,ror#$sigma1[0]
        eor     $t0,$t0,$t1,ror#$sigma0[1]
        eor     $t2,$t2,$t4,ror#$sigma1[1]
        eor     $t0,$t0,$t1,lsr#$sigma0[2]      @ sigma0(X[i+1])
        ldr     $t1,[sp,#`($i+0)%16`*4]
        eor     $t2,$t2,$t4,lsr#$sigma1[2]      @ sigma1(X[i+14])
        ldr     $t4,[sp,#`($i+9)%16`*4]

        add     $t2,$t2,$t0
        eor     $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`   @ from BODY_00_15
        add     $t1,$t1,$t2
        eor     $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`  @ Sigma1(e)
        add     $t1,$t1,$t4                     @ X[i]
___
        &BODY_00_15(@_);
}

$code=<<___;
#ifndef __KERNEL__
# include "arm_arch.h"
#else
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7
#endif

.text
#if __ARM_ARCH__<7
.code   32
#else
.syntax unified
# ifdef __thumb2__
#  define adrl adr
.thumb
# else
.code   32
# endif
#endif

.type   K256,%object
.align  5
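@ First 32 bits of the fractional parts of the cube roots of the first
@ 64 prime numbers, i.e. the standard SHA-256 round constants (FIPS 180-4).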
K256:
.word   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.word   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.word   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.word   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.word   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.word   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.word   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.word   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.word   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.word   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.word   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.word   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.word   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.word   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.word   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.word   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size   K256,.-K256
.word   0                               @ terminator
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
.word   OPENSSL_armcap_P-sha256_block_data_order
#endif
.align  5

.global sha256_block_data_order
.type   sha256_block_data_order,%function
sha256_block_data_order:
.Lsha256_block_data_order:
#if __ARM_ARCH__<7
        sub     r3,pc,#8                @ sha256_block_data_order
#else
        adr     r3,.Lsha256_block_data_order
#endif
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
        ldr     r12,.LOPENSSL_armcap
        ldr     r12,[r3,r12]            @ OPENSSL_armcap_P
        tst     r12,#ARMV8_SHA256
        bne     .LARMv8
        tst     r12,#ARMV7_NEON
        bne     .LNEON
#endif
        add     $len,$inp,$len,lsl#6    @ len to point at the end of inp
        stmdb   sp!,{$ctx,$inp,$len,r4-r11,lr}
        ldmia   $ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
        sub     $Ktbl,r3,#256+32        @ K256
        sub     sp,sp,#16*4             @ alloca(X[16])
.Loop:
# if __ARM_ARCH__>=7
        ldr     $t1,[$inp],#4
# else
        ldrb    $t1,[$inp,#3]
# endif
        eor     $t3,$B,$C               @ magic
        eor     $t2,$t2,$t2
___
for($i=0;$i<16;$i++)    { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".Lrounds_16_xx:\n";
for (;$i<32;$i++)       { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
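# Rounds 0..15 are fully unrolled above; rounds 16..63 share the 16-round
# .Lrounds_16_xx body, which is executed three times. The "i==31" check in
# BODY_00_15 compares the low byte of the K256 word just fetched against
# 0xf2 (the low byte of the last constant, 0xc67178f2) to detect the final
# pass.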
$code.=<<___;
#if __ARM_ARCH__>=7
        ite     eq                      @ Thumb2 thing, sanity check in ARM
#endif
        ldreq   $t3,[sp,#16*4]          @ pull ctx
        bne     .Lrounds_16_xx

        add     $A,$A,$t2               @ h+=Maj(a,b,c) from the past
        ldr     $t0,[$t3,#0]
        ldr     $t1,[$t3,#4]
        ldr     $t2,[$t3,#8]
        add     $A,$A,$t0
        ldr     $t0,[$t3,#12]
        add     $B,$B,$t1
        ldr     $t1,[$t3,#16]
        add     $C,$C,$t2
        ldr     $t2,[$t3,#20]
        add     $D,$D,$t0
        ldr     $t0,[$t3,#24]
        add     $E,$E,$t1
        ldr     $t1,[$t3,#28]
        add     $F,$F,$t2
        ldr     $inp,[sp,#17*4]         @ pull inp
        ldr     $t2,[sp,#18*4]          @ pull inp+len
        add     $G,$G,$t0
        add     $H,$H,$t1
        stmia   $t3,{$A,$B,$C,$D,$E,$F,$G,$H}
        cmp     $inp,$t2
        sub     $Ktbl,$Ktbl,#256        @ rewind Ktbl
        bne     .Loop

        add     sp,sp,#`16+3`*4 @ destroy frame
#if __ARM_ARCH__>=5
        ldmia   sp!,{r4-r11,pc}
#else
        ldmia   sp!,{r4-r11,lr}
        tst     lr,#1
        moveq   pc,lr                   @ be binary compatible with V4, yet
        bx      lr                      @ interoperable with Thumb ISA:-)
#endif
.size   sha256_block_data_order,.-sha256_block_data_order
___
######################################################################
# NEON stuff
#
{{{
my @X=map("q$_",(0..3));
my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
my $Xfer=$t4;
my $j=0;

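# Dlo()/Dhi() turn a NEON quad register name into its low/high doubleword
# alias (q<n> overlaps d<2n> and d<2n+1>), so the 64-bit halves of the X[]
# vectors can be addressed individually.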
sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }

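# AUTOLOAD catches calls to otherwise undefined subs such as &vext_8() or
# &vshr_u32(), turning the name into a mnemonic ('_' becomes '.') and
# appending the arguments, with a '#' prefixed to a numeric final operand.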
sub AUTOLOAD()          # thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
    $arg = "#$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}

sub Xupdate()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

        &vext_8         ($T0,@X[0],@X[1],4);    # X[1..4]
         eval(shift(@insns));
         eval(shift(@insns));
         eval(shift(@insns));
        &vext_8         ($T1,@X[2],@X[3],4);    # X[9..12]
         eval(shift(@insns));
         eval(shift(@insns));
         eval(shift(@insns));
        &vshr_u32       ($T2,$T0,$sigma0[0]);
         eval(shift(@insns));
         eval(shift(@insns));
        &vadd_i32       (@X[0],@X[0],$T1);      # X[0..3] += X[9..12]
         eval(shift(@insns));
         eval(shift(@insns));
        &vshr_u32       ($T1,$T0,$sigma0[2]);
         eval(shift(@insns));
         eval(shift(@insns));
        &vsli_32        ($T2,$T0,32-$sigma0[0]);
         eval(shift(@insns));
         eval(shift(@insns));
        &vshr_u32       ($T3,$T0,$sigma0[1]);
         eval(shift(@insns));
         eval(shift(@insns));
        &veor           ($T1,$T1,$T2);
         eval(shift(@insns));
         eval(shift(@insns));
        &vsli_32        ($T3,$T0,32-$sigma0[1]);
         eval(shift(@insns));
         eval(shift(@insns));
          &vshr_u32     ($T4,&Dhi(@X[3]),$sigma1[0]);
         eval(shift(@insns));
         eval(shift(@insns));
        &veor           ($T1,$T1,$T3);          # sigma0(X[1..4])
         eval(shift(@insns));
         eval(shift(@insns));
          &vsli_32      ($T4,&Dhi(@X[3]),32-$sigma1[0]);
         eval(shift(@insns));
         eval(shift(@insns));
          &vshr_u32     ($T5,&Dhi(@X[3]),$sigma1[2]);
         eval(shift(@insns));
         eval(shift(@insns));
        &vadd_i32       (@X[0],@X[0],$T1);      # X[0..3] += sigma0(X[1..4])
         eval(shift(@insns));
         eval(shift(@insns));
          &veor         ($T5,$T5,$T4);
         eval(shift(@insns));
         eval(shift(@insns));
          &vshr_u32     ($T4,&Dhi(@X[3]),$sigma1[1]);
         eval(shift(@insns));
         eval(shift(@insns));
          &vsli_32      ($T4,&Dhi(@X[3]),32-$sigma1[1]);
         eval(shift(@insns));
         eval(shift(@insns));
          &veor         ($T5,$T5,$T4);          # sigma1(X[14..15])
         eval(shift(@insns));
         eval(shift(@insns));
        &vadd_i32       (&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
         eval(shift(@insns));
         eval(shift(@insns));
          &vshr_u32     ($T4,&Dlo(@X[0]),$sigma1[0]);
         eval(shift(@insns));
         eval(shift(@insns));
          &vsli_32      ($T4,&Dlo(@X[0]),32-$sigma1[0]);
         eval(shift(@insns));
         eval(shift(@insns));
          &vshr_u32     ($T5,&Dlo(@X[0]),$sigma1[2]);
         eval(shift(@insns));
         eval(shift(@insns));
          &veor         ($T5,$T5,$T4);
         eval(shift(@insns));
         eval(shift(@insns));
          &vshr_u32     ($T4,&Dlo(@X[0]),$sigma1[1]);
         eval(shift(@insns));
         eval(shift(@insns));
        &vld1_32        ("{$T0}","[$Ktbl,:128]!");
         eval(shift(@insns));
         eval(shift(@insns));
          &vsli_32      ($T4,&Dlo(@X[0]),32-$sigma1[1]);
         eval(shift(@insns));
         eval(shift(@insns));
          &veor         ($T5,$T5,$T4);          # sigma1(X[16..17])
         eval(shift(@insns));
         eval(shift(@insns));
        &vadd_i32       (&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
         eval(shift(@insns));
         eval(shift(@insns));
        &vadd_i32       ($T0,$T0,@X[0]);
         while($#insns>=2) { eval(shift(@insns)); }
        &vst1_32        ("{$T0}","[$Xfer,:128]!");
         eval(shift(@insns));
         eval(shift(@insns));

        push(@X,shift(@X));             # "rotate" X[]
}

sub Xpreload()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

         eval(shift(@insns));
         eval(shift(@insns));
         eval(shift(@insns));
         eval(shift(@insns));
        &vld1_32        ("{$T0}","[$Ktbl,:128]!");
         eval(shift(@insns));
         eval(shift(@insns));
         eval(shift(@insns));
         eval(shift(@insns));
        &vrev32_8       (@X[0],@X[0]);
         eval(shift(@insns));
         eval(shift(@insns));
         eval(shift(@insns));
         eval(shift(@insns));
        &vadd_i32       ($T0,$T0,@X[0]);
         foreach (@insns) { eval; }     # remaining instructions
        &vst1_32        ("{$T0}","[$Xfer,:128]!");

        push(@X,shift(@X));             # "rotate" X[]
}

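# body_00_15() does not emit code itself; it returns one round's worth of
# instruction strings, which Xupdate()/Xpreload() eval() a few at a time so
# the scalar round function is interleaved with the NEON message-schedule
# (or preload) instructions above.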
sub body_00_15 () {
        (
        '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
        '&add   ($h,$h,$t1)',                   # h+=X[i]+K[i]
        '&eor   ($t1,$f,$g)',
        '&eor   ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
        '&add   ($a,$a,$t2)',                   # h+=Maj(a,b,c) from the past
        '&and   ($t1,$t1,$e)',
        '&eor   ($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))',   # Sigma1(e)
        '&eor   ($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
        '&eor   ($t1,$t1,$g)',                  # Ch(e,f,g)
        '&add   ($h,$h,$t2,"ror#$Sigma1[0]")',  # h+=Sigma1(e)
        '&eor   ($t2,$a,$b)',                   # a^b, b^c in next round
        '&eor   ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',   # Sigma0(a)
        '&add   ($h,$h,$t1)',                   # h+=Ch(e,f,g)
        '&ldr   ($t1,sprintf "[sp,#%d]",4*(($j+1)&15))  if (($j&15)!=15);'.
        '&ldr   ($t1,"[$Ktbl]")                         if ($j==15);'.
        '&ldr   ($t1,"[sp,#64]")                        if ($j==31)',
        '&and   ($t3,$t3,$t2)',                 # (b^c)&=(a^b)
        '&add   ($d,$d,$h)',                    # d+=h
        '&add   ($h,$h,$t0,"ror#$Sigma0[0]");'. # h+=Sigma0(a)
        '&eor   ($t3,$t3,$b)',                  # Maj(a,b,c)
        '$j++;  unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
        )
}

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch   armv7-a
.fpu    neon

.global sha256_block_data_order_neon
.type   sha256_block_data_order_neon,%function
.align  4
sha256_block_data_order_neon:
.LNEON:
        stmdb   sp!,{r4-r12,lr}

        sub     $H,sp,#16*4+16
        adrl    $Ktbl,K256
        bic     $H,$H,#15               @ align for 128-bit stores
        mov     $t2,sp
        mov     sp,$H                   @ alloca
        add     $len,$inp,$len,lsl#6    @ len to point at the end of inp

        vld1.8          {@X[0]},[$inp]!
        vld1.8          {@X[1]},[$inp]!
        vld1.8          {@X[2]},[$inp]!
        vld1.8          {@X[3]},[$inp]!
        vld1.32         {$T0},[$Ktbl,:128]!
        vld1.32         {$T1},[$Ktbl,:128]!
        vld1.32         {$T2},[$Ktbl,:128]!
        vld1.32         {$T3},[$Ktbl,:128]!
        vrev32.8        @X[0],@X[0]             @ yes, even on
        str             $ctx,[sp,#64]
        vrev32.8        @X[1],@X[1]             @ big-endian
        str             $inp,[sp,#68]
        mov             $Xfer,sp
        vrev32.8        @X[2],@X[2]
        str             $len,[sp,#72]
        vrev32.8        @X[3],@X[3]
        str             $t2,[sp,#76]            @ save original sp
        vadd.i32        $T0,$T0,@X[0]
        vadd.i32        $T1,$T1,@X[1]
        vst1.32         {$T0},[$Xfer,:128]!
        vadd.i32        $T2,$T2,@X[2]
        vst1.32         {$T1},[$Xfer,:128]!
        vadd.i32        $T3,$T3,@X[3]
        vst1.32         {$T2},[$Xfer,:128]!
        vst1.32         {$T3},[$Xfer,:128]!

        ldmia           $ctx,{$A-$H}
        sub             $Xfer,$Xfer,#64
        ldr             $t1,[sp,#0]
        eor             $t2,$t2,$t2
        eor             $t3,$B,$C
        b               .L_00_48

.align  4
.L_00_48:
___
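# Each pass through .L_00_48 performs 16 rounds: the four Xupdate() calls
# interleave the scalar rounds with NEON computation of the next 16 schedule
# words, storing X[]+K256[] into the Xfer area on the stack.  The loop runs
# three times (rounds 0..47); the zero word placed after K256 serves as the
# terminator tested by "teq $t1,#0".  The final 16 rounds use Xpreload(),
# which instead byte-swaps the next input block (loaded by the code below)
# and pre-computes its first K256 additions.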
        &Xupdate(\&body_00_15);
        &Xupdate(\&body_00_15);
        &Xupdate(\&body_00_15);
        &Xupdate(\&body_00_15);
$code.=<<___;
        teq     $t1,#0                          @ check for K256 terminator
        ldr     $t1,[sp,#0]
        sub     $Xfer,$Xfer,#64
        bne     .L_00_48

        ldr             $inp,[sp,#68]
        ldr             $t0,[sp,#72]
        sub             $Ktbl,$Ktbl,#256        @ rewind $Ktbl
        teq             $inp,$t0
        it              eq
        subeq           $inp,$inp,#64           @ avoid SEGV
        vld1.8          {@X[0]},[$inp]!         @ load next input block
        vld1.8          {@X[1]},[$inp]!
        vld1.8          {@X[2]},[$inp]!
        vld1.8          {@X[3]},[$inp]!
        it              ne
        strne           $inp,[sp,#68]
        mov             $Xfer,sp
___
        &Xpreload(\&body_00_15);
        &Xpreload(\&body_00_15);
        &Xpreload(\&body_00_15);
        &Xpreload(\&body_00_15);
$code.=<<___;
        ldr     $t0,[$t1,#0]
        add     $A,$A,$t2                       @ h+=Maj(a,b,c) from the past
        ldr     $t2,[$t1,#4]
        ldr     $t3,[$t1,#8]
        ldr     $t4,[$t1,#12]
        add     $A,$A,$t0                       @ accumulate
        ldr     $t0,[$t1,#16]
        add     $B,$B,$t2
        ldr     $t2,[$t1,#20]
        add     $C,$C,$t3
        ldr     $t3,[$t1,#24]
        add     $D,$D,$t4
        ldr     $t4,[$t1,#28]
        add     $E,$E,$t0
        str     $A,[$t1],#4
        add     $F,$F,$t2
        str     $B,[$t1],#4
        add     $G,$G,$t3
        str     $C,[$t1],#4
        add     $H,$H,$t4
        str     $D,[$t1],#4
        stmia   $t1,{$E-$H}

        ittte   ne
        movne   $Xfer,sp
        ldrne   $t1,[sp,#0]
        eorne   $t2,$t2,$t2
        ldreq   sp,[sp,#76]                     @ restore original sp
        itt     ne
        eorne   $t3,$B,$C
        bne     .L_00_48

        ldmia   sp!,{r4-r12,pc}
.size   sha256_block_data_order_neon,.-sha256_block_data_order_neon
#endif
___
}}}
######################################################################
# ARMv8 stuff
#
{{{
my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
my @MSG=map("q$_",(8..11));
my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
my $Ktbl="r3";

$code.=<<___;
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)

# ifdef __thumb2__
#  define INST(a,b,c,d) .byte   c,d|0xc,a,b
# else
#  define INST(a,b,c,d) .byte   a,b,c,d
# endif

.type   sha256_block_data_order_armv8,%function
.align  5
sha256_block_data_order_armv8:
.LARMv8:
        vld1.32 {$ABCD,$EFGH},[$ctx]
# ifdef __thumb2__
        adr     $Ktbl,.LARMv8
        sub     $Ktbl,$Ktbl,#.LARMv8-K256
# else
        adrl    $Ktbl,K256
# endif
        add     $len,$inp,$len,lsl#6    @ len to point at the end of inp

.Loop_v8:
        vld1.8          {@MSG[0]-@MSG[1]},[$inp]!
        vld1.8          {@MSG[2]-@MSG[3]},[$inp]!
        vld1.32         {$W0},[$Ktbl]!
        vrev32.8        @MSG[0],@MSG[0]
        vrev32.8        @MSG[1],@MSG[1]
        vrev32.8        @MSG[2],@MSG[2]
        vrev32.8        @MSG[3],@MSG[3]
        vmov            $ABCD_SAVE,$ABCD        @ offload
        vmov            $EFGH_SAVE,$EFGH
        teq             $inp,$len
___
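# 16 "quad rounds" of four SHA-256 rounds each: the twelve generated by the
# loop below also update the message schedule (sha256su0/sha256su1); the
# last four, emitted after the loop, consume the remaining W values as is.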
for($i=0;$i<12;$i++) {
$code.=<<___;
        vld1.32         {$W1},[$Ktbl]!
        vadd.i32        $W0,$W0,@MSG[0]
        sha256su0       @MSG[0],@MSG[1]
        vmov            $abcd,$ABCD
        sha256h         $ABCD,$EFGH,$W0
        sha256h2        $EFGH,$abcd,$W0
        sha256su1       @MSG[0],@MSG[2],@MSG[3]
___
        ($W0,$W1)=($W1,$W0);    push(@MSG,shift(@MSG));
}
$code.=<<___;
        vld1.32         {$W1},[$Ktbl]!
        vadd.i32        $W0,$W0,@MSG[0]
        vmov            $abcd,$ABCD
        sha256h         $ABCD,$EFGH,$W0
        sha256h2        $EFGH,$abcd,$W0

        vld1.32         {$W0},[$Ktbl]!
        vadd.i32        $W1,$W1,@MSG[1]
        vmov            $abcd,$ABCD
        sha256h         $ABCD,$EFGH,$W1
        sha256h2        $EFGH,$abcd,$W1

        vld1.32         {$W1},[$Ktbl]
        vadd.i32        $W0,$W0,@MSG[2]
        sub             $Ktbl,$Ktbl,#256-16     @ rewind
        vmov            $abcd,$ABCD
        sha256h         $ABCD,$EFGH,$W0
        sha256h2        $EFGH,$abcd,$W0

        vadd.i32        $W1,$W1,@MSG[3]
        vmov            $abcd,$ABCD
        sha256h         $ABCD,$EFGH,$W1
        sha256h2        $EFGH,$abcd,$W1

        vadd.i32        $ABCD,$ABCD,$ABCD_SAVE
        vadd.i32        $EFGH,$EFGH,$EFGH_SAVE
        it              ne
        bne             .Loop_v8

        vst1.32         {$ABCD,$EFGH},[$ctx]

        ret             @ bx lr
.size   sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
#endif
___
}}}
$code.=<<___;
.asciz  "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align  2
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.comm   OPENSSL_armcap_P,4,4
#endif
___

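# Copy this script's leading '#' comment block (the license and notes at the
# top) into the generated output, converting it to '@' assembler comments.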
open SELF,$0;
while(<SELF>) {
        next if (/^#!/);
        last if (!s/^#/@/ and !/^$/);
        print;
}
close SELF;

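# The ARMv8 sha256h/sha256h2/sha256su0/sha256su1 mnemonics are rewritten
# below as raw byte sequences (via the INST macro) so the module still
# assembles with toolchains that lack the ARMv8 Crypto Extensions.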
{   my  %opcode = (
        "sha256h"       => 0xf3000c40,  "sha256h2"      => 0xf3100c40,
        "sha256su0"     => 0xf3ba03c0,  "sha256su1"     => 0xf3200c40   );

    sub unsha256 {
        my ($mnemonic,$arg)=@_;

        if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
            my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
                                         |(($2&7)<<17)|(($2&8)<<4)
                                         |(($3&7)<<1) |(($3&8)<<2);
            # ARMv7 instructions are always encoded little-endian, so the
            # word is emitted byte by byte in that order. The correct solution
            # would be the .inst directive, but older assemblers don't
            # implement it:-(
            sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
                        $word&0xff,($word>>8)&0xff,
                        ($word>>16)&0xff,($word>>24)&0xff,
                        $mnemonic,$arg;
        }
    }
}

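# Final pass over $code: evaluate compile-time expressions enclosed in
# backticks, encode the sha256* mnemonics via unsha256(), and rewrite
# "ret" as "bx lr" (or a pre-existing "bx lr" as a raw .word) so the
# result can also be built with -march=armv4.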
foreach (split($/,$code)) {

        s/\`([^\`]*)\`/eval $1/geo;

        s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;

        s/\bret\b/bx    lr/go           or
        s/\bbx\s+lr\b/.word\t0xe12fff1e/go;     # make it possible to compile with -march=armv4

        print $_,"\n";
}

close STDOUT; # enforce flush