GNU Linux-libre 5.4.274-gnu1
[releases.git] / arch / arm / crypto / sha256-armv4.pl
1 #!/usr/bin/env perl
2 # SPDX-License-Identifier: GPL-2.0
3
4 # This code is taken from the OpenSSL project but the author (Andy Polyakov)
5 # has relicensed it under the GPLv2. Therefore this program is free software;
6 # you can redistribute it and/or modify it under the terms of the GNU General
7 # Public License version 2 as published by the Free Software Foundation.
8 #
9 # The original headers, including the original license headers, are
10 # included below for completeness.
11
12 # ====================================================================
13 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
14 # project. The module is, however, dual licensed under OpenSSL and
15 # CRYPTOGAMS licenses depending on where you obtain it. For further
16 # details see http://www.openssl.org/~appro/cryptogams/.
17 # ====================================================================
18
19 # SHA256 block procedure for ARMv4. May 2007.
20
21 # Performance is ~2x better than gcc 3.4 generated code and in "abso-
22 # lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
23 # byte [on single-issue Xscale PXA250 core].
24
25 # July 2010.
26 #
27 # Rescheduling for dual-issue pipeline resulted in 22% improvement on
28 # Cortex A8 core and ~20 cycles per processed byte.
29
30 # February 2011.
31 #
32 # Profiler-assisted and platform-specific optimization resulted in 16%
33 # improvement on Cortex A8 core and ~15.4 cycles per processed byte.
34
35 # September 2013.
36 #
37 # Add NEON implementation. On Cortex A8 it was measured to process one
38 # byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
39 # S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
40 # code (meaning that latter performs sub-optimally, nothing was done
41 # about it).
42
43 # May 2014.
44 #
45 # Add ARMv8 code path performing at 2.0 cpb on Apple A7.
46
# Scan the command line for the first argument that looks like a plain
# file name ("name.ext"); every other argument is discarded.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}

# Redirect STDOUT to the requested file.  When no file name was given
# STDOUT is left untouched, so the generated assembly can still be
# captured with a shell redirect.  A file that cannot be created is a
# fatal error instead of silently writing to the inherited STDOUT.
if (defined($output) && $output ne "") {
	open(STDOUT, '>', $output) or die "can't open $output: $!";
}

# Register map.  The names in the right-hand column share a physical
# register with the argument named on the same line.
$ctx="r0";	$t0="r0";
$inp="r1";	$t4="r1";
$len="r2";	$t1="r2";
$T1="r3";	$t3="r3";
$A="r4";
$B="r5";
$C="r6";
$D="r7";
$E="r8";
$F="r9";
$G="r10";
$H="r11";
# Working variables a..h; the round generators rotate this list after
# every round instead of moving values between registers.
@V=($A,$B,$C,$D,$E,$F,$G,$H);
$t2="r12";
$Ktbl="r14";

# Rotate/shift amounts.  The templates use the third element of
# @sigma0/@sigma1 as a logical-shift-right amount (lsr / vshr only);
# all other elements are rotate amounts.
@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);
70
# BODY_00_15($i,$a,$b,$c,$d,$e,$f,$g,$h)
#
# Append the assembly for round $i of the integer-only code path to $code.
# $a..$h are the register names that currently play the roles of the
# eight working variables (the caller rotates @V after every round).
# Perl interpolates $i into the templates, so the generated text contains
# constant C-preprocessor conditions such as "#if 3==31" that are resolved
# when the output is assembled.
#
# The Maj(a,b,c) addition is deferred by one round: this round leaves
# Maj in $t3 (the final "add" is commented out) and the next round adds
# it through "add $a,$a,$t2 ... from the past".
71 sub BODY_00_15 {
72 my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
73
# Rounds 0..15 only: finish fetching input word X[i] into $t1 and start
# Sigma1(e).  The first load of each word is commented out here because
# it was already issued by the previous round's prefetch (or by the code
# emitted just before round 0).  ARMv7+ uses a word load plus "rev" on
# little-endian; older cores assemble the big-endian word from four byte
# loads.  In round 15 the advanced input pointer is saved on the stack.
74 $code.=<<___ if ($i<16);
75 #if __ARM_ARCH__>=7
76         @ ldr   $t1,[$inp],#4                   @ $i
77 # if $i==15
78         str     $inp,[sp,#17*4]                 @ make room for $t4
79 # endif
80         eor     $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
81         add     $a,$a,$t2                       @ h+=Maj(a,b,c) from the past
82         eor     $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`  @ Sigma1(e)
83 # ifndef __ARMEB__
84         rev     $t1,$t1
85 # endif
86 #else
87         @ ldrb  $t1,[$inp,#3]                   @ $i
88         add     $a,$a,$t2                       @ h+=Maj(a,b,c) from the past
89         ldrb    $t2,[$inp,#2]
90         ldrb    $t0,[$inp,#1]
91         orr     $t1,$t1,$t2,lsl#8
92         ldrb    $t2,[$inp],#4
93         orr     $t1,$t1,$t0,lsl#16
94 # if $i==15
95         str     $inp,[sp,#17*4]                 @ make room for $t4
96 # endif
97         eor     $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
98         orr     $t1,$t1,$t2,lsl#24
99         eor     $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`  @ Sigma1(e)
100 #endif
101 ___
# Common part of every round: h += X[i] + K256[i] + Sigma1(e) + Ch(e,f,g),
# X[i] is stored into the 16-word buffer on the stack, d += h, and
# h += Sigma0(a).  In round 31 the low byte of the K256 word just loaded
# is compared with 0xf2, the low byte of the last table entry, to set the
# flags the caller's loop branch tests.  Rounds below 15 prefetch the
# next input word; later rounds preload the two schedule words that
# BODY_16_XX consumes first.
102 $code.=<<___;
103         ldr     $t2,[$Ktbl],#4                  @ *K256++
104         add     $h,$h,$t1                       @ h+=X[i]
105         str     $t1,[sp,#`$i%16`*4]
106         eor     $t1,$f,$g
107         add     $h,$h,$t0,ror#$Sigma1[0]        @ h+=Sigma1(e)
108         and     $t1,$t1,$e
109         add     $h,$h,$t2                       @ h+=K256[i]
110         eor     $t1,$t1,$g                      @ Ch(e,f,g)
111         eor     $t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
112         add     $h,$h,$t1                       @ h+=Ch(e,f,g)
113 #if $i==31
114         and     $t2,$t2,#0xff
115         cmp     $t2,#0xf2                       @ done?
116 #endif
117 #if $i<15
118 # if __ARM_ARCH__>=7
119         ldr     $t1,[$inp],#4                   @ prefetch
120 # else
121         ldrb    $t1,[$inp,#3]
122 # endif
123         eor     $t2,$a,$b                       @ a^b, b^c in next round
124 #else
125         ldr     $t1,[sp,#`($i+2)%16`*4]         @ from future BODY_16_xx
126         eor     $t2,$a,$b                       @ a^b, b^c in next round
127         ldr     $t4,[sp,#`($i+15)%16`*4]        @ from future BODY_16_xx
128 #endif
129         eor     $t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]`  @ Sigma0(a)
130         and     $t3,$t3,$t2                     @ (b^c)&=(a^b)
131         add     $d,$d,$h                        @ d+=h
132         eor     $t3,$t3,$b                      @ Maj(a,b,c)
133         add     $h,$h,$t0,ror#$Sigma0[0]        @ h+=Sigma0(a)
134         @ add   $h,$h,$t3                       @ h+=Maj(a,b,c)
135 ___
# Swap the *names* $t2/$t3 (globals) for the next round: the register
# holding Maj becomes the "from the past" addend and the register
# holding a^b becomes the next round's b^c.
136         ($t2,$t3)=($t3,$t2);
137 }
138
# BODY_16_XX($i,$a,$b,$c,$d,$e,$f,$g,$h)
#
# Append the assembly for a round with $i>=16.  The template below
# computes the message-schedule word
#   X[i] = sigma0(X[i+1]) + sigma1(X[i+14]) + X[i] + X[i+9]
# (indices taken modulo 16 in the stack buffer) into $t1 and starts
# Sigma1(e); X[i+1] and X[i+14] were preloaded into $t1/$t4 by the
# previous round, hence the commented-out loads.  The rest of the round
# is emitted by BODY_00_15, which skips its input-loading template
# because $i is not below 16.
139 sub BODY_16_XX {
140 my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
141
142 $code.=<<___;
143         @ ldr   $t1,[sp,#`($i+1)%16`*4]         @ $i
144         @ ldr   $t4,[sp,#`($i+14)%16`*4]
145         mov     $t0,$t1,ror#$sigma0[0]
146         add     $a,$a,$t2                       @ h+=Maj(a,b,c) from the past
147         mov     $t2,$t4,ror#$sigma1[0]
148         eor     $t0,$t0,$t1,ror#$sigma0[1]
149         eor     $t2,$t2,$t4,ror#$sigma1[1]
150         eor     $t0,$t0,$t1,lsr#$sigma0[2]      @ sigma0(X[i+1])
151         ldr     $t1,[sp,#`($i+0)%16`*4]
152         eor     $t2,$t2,$t4,lsr#$sigma1[2]      @ sigma1(X[i+14])
153         ldr     $t4,[sp,#`($i+9)%16`*4]
154
155         add     $t2,$t2,$t0
156         eor     $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`   @ from BODY_00_15
157         add     $t1,$t1,$t2
158         eor     $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`  @ Sigma1(e)
159         add     $t1,$t1,$t4                     @ X[i]
160 ___
# Emit the common round body with the same arguments.
161         &BODY_00_15(@_);
162 }
163
# Start of the generated file.  This is a plain assignment, so it
# (re)initialises $code.  It emits the preamble, the K256 constant table
# followed by a zero terminator word, and the entry and prologue of the
# integer-only sha256_block_data_order, up to the top of the per-block
# loop (.Loop).
164 $code=<<___;
165 #ifndef __KERNEL__
166 # include "arm_arch.h"
167 #else
168 # define __ARM_ARCH__ __LINUX_ARM_ARCH__
169 # define __ARM_MAX_ARCH__ 7
170 #endif
171
172 .text
173 #if __ARM_ARCH__<7
174 .code   32
175 #else
176 .syntax unified
177 # ifdef __thumb2__
178 .thumb
179 # else
180 .code   32
181 # endif
182 #endif
183
184 .type   K256,%object
185 .align  5
186 K256:
187 .word   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
188 .word   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
189 .word   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
190 .word   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
191 .word   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
192 .word   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
193 .word   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
194 .word   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
195 .word   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
196 .word   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
197 .word   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
198 .word   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
199 .word   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
200 .word   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
201 .word   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
202 .word   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
203 .size   K256,.-K256
204 .word   0                               @ terminator
205 #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
206 .LOPENSSL_armcap:
207 .word   OPENSSL_armcap_P-sha256_block_data_order
208 #endif
209 .align  5
210
211 .global sha256_block_data_order
212 .type   sha256_block_data_order,%function
213 sha256_block_data_order:
214 .Lsha256_block_data_order:
215 #if __ARM_ARCH__<7
216         sub     r3,pc,#8                @ sha256_block_data_order
217 #else
218         adr     r3,.Lsha256_block_data_order
219 #endif
220 #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
221         ldr     r12,.LOPENSSL_armcap
222         ldr     r12,[r3,r12]            @ OPENSSL_armcap_P
223         tst     r12,#ARMV8_SHA256
224         bne     .LARMv8
225         tst     r12,#ARMV7_NEON
226         bne     .LNEON
227 #endif
228         add     $len,$inp,$len,lsl#6    @ len to point at the end of inp
229         stmdb   sp!,{$ctx,$inp,$len,r4-r11,lr}
230         ldmia   $ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
231         sub     $Ktbl,r3,#256+32        @ K256
232         sub     sp,sp,#16*4             @ alloca(X[16])
233 .Loop:
234 # if __ARM_ARCH__>=7
235         ldr     $t1,[$inp],#4
236 # else
237         ldrb    $t1,[$inp,#3]
238 # endif
239         eor     $t3,$B,$C               @ magic
240         eor     $t2,$t2,$t2
241 ___
# Unrolled rounds.  Rounds 0..15 consume the input block; the 16 rounds
# emitted after the .Lrounds_16_xx label also compute the message
# schedule, and the generated code branches back to that label until the
# round-31 check in BODY_00_15 has seen the last K256 constant.
# unshift(@V,pop(@V)) rotates the register roles by one position after
# every round, so no register moves are emitted.
242 for($i=0;$i<16;$i++)    { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
243 $code.=".Lrounds_16_xx:\n";
244 for (;$i<32;$i++)       { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
# Tail of the per-block loop: add the working variables into the context
# words, store them back, rewind the K256 pointer, branch to .Loop while
# input remains, then drop the stack frame and return.
245 $code.=<<___;
246 #if __ARM_ARCH__>=7
247         ite     eq                      @ Thumb2 thing, sanity check in ARM
248 #endif
249         ldreq   $t3,[sp,#16*4]          @ pull ctx
250         bne     .Lrounds_16_xx
251
252         add     $A,$A,$t2               @ h+=Maj(a,b,c) from the past
253         ldr     $t0,[$t3,#0]
254         ldr     $t1,[$t3,#4]
255         ldr     $t2,[$t3,#8]
256         add     $A,$A,$t0
257         ldr     $t0,[$t3,#12]
258         add     $B,$B,$t1
259         ldr     $t1,[$t3,#16]
260         add     $C,$C,$t2
261         ldr     $t2,[$t3,#20]
262         add     $D,$D,$t0
263         ldr     $t0,[$t3,#24]
264         add     $E,$E,$t1
265         ldr     $t1,[$t3,#28]
266         add     $F,$F,$t2
267         ldr     $inp,[sp,#17*4]         @ pull inp
268         ldr     $t2,[sp,#18*4]          @ pull inp+len
269         add     $G,$G,$t0
270         add     $H,$H,$t1
271         stmia   $t3,{$A,$B,$C,$D,$E,$F,$G,$H}
272         cmp     $inp,$t2
273         sub     $Ktbl,$Ktbl,#256        @ rewind Ktbl
274         bne     .Loop
275
276         add     sp,sp,#`16+3`*4 @ destroy frame
277 #if __ARM_ARCH__>=5
278         ldmia   sp!,{r4-r11,pc}
279 #else
280         ldmia   sp!,{r4-r11,lr}
281         tst     lr,#1
282         moveq   pc,lr                   @ be binary compatible with V4, yet
283         bx      lr                      @ interoperable with Thumb ISA:-)
284 #endif
285 .size   sha256_block_data_order,.-sha256_block_data_order
286 ___
287 ######################################################################
288 # NEON stuff
289 #
290 {{{
# Lexical state of the NEON section (visible up to the closing "}}}").
# @X:       q0..q3, the message-schedule vectors; Xupdate/Xpreload rotate
#           this list after each call.
# $T0..$T3: q8..q11 temporaries; $T4/$T5: d24/d25 temporaries.  Note that
#           this lexical $T1 hides the global $T1 ("r3") inside the section.
# $Xfer:    same register as $t4; the code below uses it as the pointer
#           for the vst1.32 stores into the stack buffer.
# $j:       round counter advanced by the snippets of body_00_15.
291 my @X=map("q$_",(0..3));
292 my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
293 my $Xfer=$t4;
294 my $j=0;
295
# Dlo("qN"): name of the D register forming the low half of NEON quad
# register qN, i.e. "d".(2*N).  Returns "" when the argument contains no
# q-register name.  (No prototype: the sub takes one argument, so the
# former empty prototype only worked because callers used the &-form.)
sub Dlo {
	my ($qreg) = @_;
	return $qreg =~ m|q([1]?[0-9])| ? "d".($1*2) : "";
}
# Dhi("qN"): name of the D register forming the high half of NEON quad
# register qN, i.e. "d".(2*N+1).  Returns "" when the argument contains
# no q-register name.  (No prototype: the sub takes one argument, so the
# former empty prototype only worked because callers used the &-form.)
sub Dhi {
	my ($qreg) = @_;
	return $qreg =~ m|q([1]?[0-9])| ? "d".($1*2+1) : "";
}
298
# Catch-all for undefined subs such as &vadd_i32(...) or &eor(...): the
# called name becomes the mnemonic (package prefix stripped, first "_"
# turned into "."), the arguments become the comma-separated operands,
# and the resulting instruction line is appended to $code.  A purely
# numeric last argument is emitted as an immediate ("#N").
sub AUTOLOAD()
{
	(my $mnemonic = $AUTOLOAD) =~ s/.*:://;
	$mnemonic =~ s/_/\./;

	my $last = pop;
	if ($last*1 eq $last) {
		$last = "#$last";
	}

	$code .= "\t" . $mnemonic . "\t" . join(',', @_, $last) . "\n";
}
305
# Xupdate($body)
#
# Emit the NEON instructions that extend the message schedule by four
# words (the result replaces @X[0]), add four K256 constants loaded from
# $Ktbl and store the sums through $Xfer, interleaved with the integer
# instructions of four rounds.
#
# $body is a code reference returning a list of Perl snippets (strings)
# for one round; it is called four times and the snippets are eval'ed
# a few at a time between the NEON instructions.  Mnemonic-like calls
# such as &vext_8 are not defined in this file; they end up in AUTOLOAD,
# which appends the instruction text to $code.
306 sub Xupdate()
307 { use integer;
308   my $body = shift;
309   my @insns = (&$body,&$body,&$body,&$body);
# Targets of the "($a,...,$h)=@V" assignment performed by the eval'ed
# snippets; they must be declared in this scope for the string evals.
310   my ($a,$b,$c,$d,$e,$f,$g,$h);
311
312         &vext_8         ($T0,@X[0],@X[1],4);    # X[1..4]
313          eval(shift(@insns));
314          eval(shift(@insns));
315          eval(shift(@insns));
316         &vext_8         ($T1,@X[2],@X[3],4);    # X[9..12]
317          eval(shift(@insns));
318          eval(shift(@insns));
319          eval(shift(@insns));
320         &vshr_u32       ($T2,$T0,$sigma0[0]);
321          eval(shift(@insns));
322          eval(shift(@insns));
323         &vadd_i32       (@X[0],@X[0],$T1);      # X[0..3] += X[9..12]
324          eval(shift(@insns));
325          eval(shift(@insns));
326         &vshr_u32       ($T1,$T0,$sigma0[2]);
327          eval(shift(@insns));
328          eval(shift(@insns));
329         &vsli_32        ($T2,$T0,32-$sigma0[0]);
330          eval(shift(@insns));
331          eval(shift(@insns));
332         &vshr_u32       ($T3,$T0,$sigma0[1]);
333          eval(shift(@insns));
334          eval(shift(@insns));
335         &veor           ($T1,$T1,$T2);
336          eval(shift(@insns));
337          eval(shift(@insns));
338         &vsli_32        ($T3,$T0,32-$sigma0[1]);
339          eval(shift(@insns));
340          eval(shift(@insns));
341           &vshr_u32     ($T4,&Dhi(@X[3]),$sigma1[0]);
342          eval(shift(@insns));
343          eval(shift(@insns));
344         &veor           ($T1,$T1,$T3);          # sigma0(X[1..4])
345          eval(shift(@insns));
346          eval(shift(@insns));
347           &vsli_32      ($T4,&Dhi(@X[3]),32-$sigma1[0]);
348          eval(shift(@insns));
349          eval(shift(@insns));
350           &vshr_u32     ($T5,&Dhi(@X[3]),$sigma1[2]);
351          eval(shift(@insns));
352          eval(shift(@insns));
353         &vadd_i32       (@X[0],@X[0],$T1);      # X[0..3] += sigma0(X[1..4])
354          eval(shift(@insns));
355          eval(shift(@insns));
356           &veor         ($T5,$T5,$T4);
357          eval(shift(@insns));
358          eval(shift(@insns));
359           &vshr_u32     ($T4,&Dhi(@X[3]),$sigma1[1]);
360          eval(shift(@insns));
361          eval(shift(@insns));
362           &vsli_32      ($T4,&Dhi(@X[3]),32-$sigma1[1]);
363          eval(shift(@insns));
364          eval(shift(@insns));
365           &veor         ($T5,$T5,$T4);          # sigma1(X[14..15])
366          eval(shift(@insns));
367          eval(shift(@insns));
368         &vadd_i32       (&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
369          eval(shift(@insns));
370          eval(shift(@insns));
371           &vshr_u32     ($T4,&Dlo(@X[0]),$sigma1[0]);
372          eval(shift(@insns));
373          eval(shift(@insns));
374           &vsli_32      ($T4,&Dlo(@X[0]),32-$sigma1[0]);
375          eval(shift(@insns));
376          eval(shift(@insns));
377           &vshr_u32     ($T5,&Dlo(@X[0]),$sigma1[2]);
378          eval(shift(@insns));
379          eval(shift(@insns));
380           &veor         ($T5,$T5,$T4);
381          eval(shift(@insns));
382          eval(shift(@insns));
383           &vshr_u32     ($T4,&Dlo(@X[0]),$sigma1[1]);
384          eval(shift(@insns));
385          eval(shift(@insns));
386         &vld1_32        ("{$T0}","[$Ktbl,:128]!");
387          eval(shift(@insns));
388          eval(shift(@insns));
389           &vsli_32      ($T4,&Dlo(@X[0]),32-$sigma1[1]);
390          eval(shift(@insns));
391          eval(shift(@insns));
392           &veor         ($T5,$T5,$T4);          # sigma1(X[16..17])
393          eval(shift(@insns));
394          eval(shift(@insns));
395         &vadd_i32       (&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
396          eval(shift(@insns));
397          eval(shift(@insns));
398         &vadd_i32       ($T0,$T0,@X[0]);
# Flush all but the last two integer snippets before the store, then
# emit the remaining two after it.
399          while($#insns>=2) { eval(shift(@insns)); }
400         &vst1_32        ("{$T0}","[$Xfer,:128]!");
401          eval(shift(@insns));
402          eval(shift(@insns));
403
404         push(@X,shift(@X));             # "rotate" X[]
405 }
406
# Xpreload($body)
#
# Counterpart of Xupdate that performs no schedule computation: it
# byte-swaps @X[0] (vrev32.8), adds four K256 constants loaded from
# $Ktbl and stores the sums through $Xfer, interleaved with the integer
# snippets of four rounds obtained from the code reference $body.
# The top-level code calls it right after emitting the loads of the
# next input block into @X.
407 sub Xpreload()
408 { use integer;
409   my $body = shift;
410   my @insns = (&$body,&$body,&$body,&$body);
# Targets of the "($a,...,$h)=@V" assignment performed by the eval'ed
# snippets; they must be declared in this scope for the string evals.
411   my ($a,$b,$c,$d,$e,$f,$g,$h);
412
413          eval(shift(@insns));
414          eval(shift(@insns));
415          eval(shift(@insns));
416          eval(shift(@insns));
417         &vld1_32        ("{$T0}","[$Ktbl,:128]!");
418          eval(shift(@insns));
419          eval(shift(@insns));
420          eval(shift(@insns));
421          eval(shift(@insns));
422         &vrev32_8       (@X[0],@X[0]);
423          eval(shift(@insns));
424          eval(shift(@insns));
425          eval(shift(@insns));
426          eval(shift(@insns));
427         &vadd_i32       ($T0,$T0,@X[0]);
428          foreach (@insns) { eval; }     # remaining instructions
429         &vst1_32        ("{$T0}","[$Xfer,:128]!");
430
431         push(@X,shift(@X));             # "rotate" X[]
432 }
433
# body_00_15()
#
# Return the integer instructions of one round for the NEON code path as
# a list of Perl snippets (strings).  Each snippet is eval'ed later by
# Xupdate/Xpreload, where the &add/&eor/... calls reach AUTOLOAD and
# append one instruction each to $code; snippets joined with "." are
# evaluated together.  Register names are resolved at eval time: the
# first snippet copies @V into $a..$h and the last one advances $j,
# rotates @V and swaps $t2/$t3 for the next round.
#
# Here $t1 already holds X[i]+K[i], read back from the stack buffer that
# the NEON code fills.  At $j==15 the word at $Ktbl is loaded instead
# (the caller tests it for the K256 terminator) and at $j==31 the word
# at [sp,#64], where the NEON prologue stored the context pointer.
434 sub body_00_15 () {
435         (
436         '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
437         '&add   ($h,$h,$t1)',                   # h+=X[i]+K[i]
438         '&eor   ($t1,$f,$g)',
439         '&eor   ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
440         '&add   ($a,$a,$t2)',                   # h+=Maj(a,b,c) from the past
441         '&and   ($t1,$t1,$e)',
442         '&eor   ($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))',   # Sigma1(e)
443         '&eor   ($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
444         '&eor   ($t1,$t1,$g)',                  # Ch(e,f,g)
445         '&add   ($h,$h,$t2,"ror#$Sigma1[0]")',  # h+=Sigma1(e)
446         '&eor   ($t2,$a,$b)',                   # a^b, b^c in next round
447         '&eor   ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',   # Sigma0(a)
448         '&add   ($h,$h,$t1)',                   # h+=Ch(e,f,g)
449         '&ldr   ($t1,sprintf "[sp,#%d]",4*(($j+1)&15))  if (($j&15)!=15);'.
450         '&ldr   ($t1,"[$Ktbl]")                         if ($j==15);'.
451         '&ldr   ($t1,"[sp,#64]")                        if ($j==31)',
452         '&and   ($t3,$t3,$t2)',                 # (b^c)&=(a^b)
453         '&add   ($d,$d,$h)',                    # d+=h
454         '&add   ($h,$h,$t0,"ror#$Sigma0[0]");'. # h+=Sigma0(a)
455         '&eor   ($t3,$t3,$b)',                  # Maj(a,b,c)
456         '$j++;  unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
457         )
458 }
459
# NEON entry point and prologue: save registers, carve a 16-byte aligned
# scratch area out of the stack (the original sp is kept at [sp,#76]),
# load and byte-swap the first input block, store X[0..15]+K256[0..15]
# to the scratch area and load the working variables from the context.
460 $code.=<<___;
461 #if __ARM_MAX_ARCH__>=7
462 .arch   armv7-a
463 .fpu    neon
464
465 .global sha256_block_data_order_neon
466 .type   sha256_block_data_order_neon,%function
467 .align  4
468 sha256_block_data_order_neon:
469 .LNEON:
470         stmdb   sp!,{r4-r12,lr}
471
472         sub     $H,sp,#16*4+16
473         adr     $Ktbl,.Lsha256_block_data_order
474         sub     $Ktbl,$Ktbl,#.Lsha256_block_data_order-K256
475         bic     $H,$H,#15               @ align for 128-bit stores
476         mov     $t2,sp
477         mov     sp,$H                   @ alloca
478         add     $len,$inp,$len,lsl#6    @ len to point at the end of inp
479
480         vld1.8          {@X[0]},[$inp]!
481         vld1.8          {@X[1]},[$inp]!
482         vld1.8          {@X[2]},[$inp]!
483         vld1.8          {@X[3]},[$inp]!
484         vld1.32         {$T0},[$Ktbl,:128]!
485         vld1.32         {$T1},[$Ktbl,:128]!
486         vld1.32         {$T2},[$Ktbl,:128]!
487         vld1.32         {$T3},[$Ktbl,:128]!
488         vrev32.8        @X[0],@X[0]             @ yes, even on
489         str             $ctx,[sp,#64]
490         vrev32.8        @X[1],@X[1]             @ big-endian
491         str             $inp,[sp,#68]
492         mov             $Xfer,sp
493         vrev32.8        @X[2],@X[2]
494         str             $len,[sp,#72]
495         vrev32.8        @X[3],@X[3]
496         str             $t2,[sp,#76]            @ save original sp
497         vadd.i32        $T0,$T0,@X[0]
498         vadd.i32        $T1,$T1,@X[1]
499         vst1.32         {$T0},[$Xfer,:128]!
500         vadd.i32        $T2,$T2,@X[2]
501         vst1.32         {$T1},[$Xfer,:128]!
502         vadd.i32        $T3,$T3,@X[3]
503         vst1.32         {$T2},[$Xfer,:128]!
504         vst1.32         {$T3},[$Xfer,:128]!
505
506         ldmia           $ctx,{$A-$H}
507         sub             $Xfer,$Xfer,#64
508         ldr             $t1,[sp,#0]
509         eor             $t2,$t2,$t2
510         eor             $t3,$B,$C
511         b               .L_00_48
512
513 .align  4
514 .L_00_48:
515 ___
# Body of the .L_00_48 loop: four Xupdate calls emit 16 rounds together
# with the schedule update for the following 16 rounds.
516         &Xupdate(\&body_00_15);
517         &Xupdate(\&body_00_15);
518         &Xupdate(\&body_00_15);
519         &Xupdate(\&body_00_15);
# Repeat .L_00_48 until the K256 terminator word has been reached, then
# rewind $Ktbl and load the next input block.  When the end of input has
# been reached the pointer is stepped back by 64 bytes first, so the
# loads stay inside the input buffer.
520 $code.=<<___;
521         teq     $t1,#0                          @ check for K256 terminator
522         ldr     $t1,[sp,#0]
523         sub     $Xfer,$Xfer,#64
524         bne     .L_00_48
525
526         ldr             $inp,[sp,#68]
527         ldr             $t0,[sp,#72]
528         sub             $Ktbl,$Ktbl,#256        @ rewind $Ktbl
529         teq             $inp,$t0
530         it              eq
531         subeq           $inp,$inp,#64           @ avoid SEGV
532         vld1.8          {@X[0]},[$inp]!         @ load next input block
533         vld1.8          {@X[1]},[$inp]!
534         vld1.8          {@X[2]},[$inp]!
535         vld1.8          {@X[3]},[$inp]!
536         it              ne
537         strne           $inp,[sp,#68]
538         mov             $Xfer,sp
539 ___
# Last 16 rounds of the current block, interleaved with preparing
# X[0..15]+K256[0..15] of the block just loaded.
540         &Xpreload(\&body_00_15);
541         &Xpreload(\&body_00_15);
542         &Xpreload(\&body_00_15);
543         &Xpreload(\&body_00_15);
# Accumulate the working variables into the context ($t1 holds the
# context pointer loaded by the round snippet at $j==31), then either
# start over at .L_00_48 for the next block or restore sp and return.
544 $code.=<<___;
545         ldr     $t0,[$t1,#0]
546         add     $A,$A,$t2                       @ h+=Maj(a,b,c) from the past
547         ldr     $t2,[$t1,#4]
548         ldr     $t3,[$t1,#8]
549         ldr     $t4,[$t1,#12]
550         add     $A,$A,$t0                       @ accumulate
551         ldr     $t0,[$t1,#16]
552         add     $B,$B,$t2
553         ldr     $t2,[$t1,#20]
554         add     $C,$C,$t3
555         ldr     $t3,[$t1,#24]
556         add     $D,$D,$t4
557         ldr     $t4,[$t1,#28]
558         add     $E,$E,$t0
559         str     $A,[$t1],#4
560         add     $F,$F,$t2
561         str     $B,[$t1],#4
562         add     $G,$G,$t3
563         str     $C,[$t1],#4
564         add     $H,$H,$t4
565         str     $D,[$t1],#4
566         stmia   $t1,{$E-$H}
567
568         ittte   ne
569         movne   $Xfer,sp
570         ldrne   $t1,[sp,#0]
571         eorne   $t2,$t2,$t2
572         ldreq   sp,[sp,#76]                     @ restore original sp
573         itt     ne
574         eorne   $t3,$B,$C
575         bne     .L_00_48
576
577         ldmia   sp!,{r4-r12,pc}
578 .size   sha256_block_data_order_neon,.-sha256_block_data_order_neon
579 #endif
580 ___
581 }}}
582 ######################################################################
583 # ARMv8 stuff
584 #
585 {{{
# Register names for the ARMv8 section (lexical, visible up to the
# closing "}}}"): state halves and a scratch copy in q0..q2, four message
# vectors in q8..q11, two K256+message vectors and the saved state in
# q12..q15.  This lexical $Ktbl ("r3") hides the global one ("r14")
# inside the section.
586 my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
587 my @MSG=map("q$_",(8..11));
588 my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
589 my $Ktbl="r3";
590
# Entry point and top of the per-block loop.  The section is compiled
# only outside the kernel (!defined(__KERNEL__)).  The sha256* mnemonics
# written here are replaced by INST(...) byte sequences in the final
# post-processing pass (see unsha256).
591 $code.=<<___;
592 #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
593
594 # ifdef __thumb2__
595 #  define INST(a,b,c,d) .byte   c,d|0xc,a,b
596 # else
597 #  define INST(a,b,c,d) .byte   a,b,c,d
598 # endif
599
600 .type   sha256_block_data_order_armv8,%function
601 .align  5
602 sha256_block_data_order_armv8:
603 .LARMv8:
604         vld1.32 {$ABCD,$EFGH},[$ctx]
605 # ifdef __thumb2__
606         adr     $Ktbl,.LARMv8
607         sub     $Ktbl,$Ktbl,#.LARMv8-K256
608 # else
609         adrl    $Ktbl,K256
610 # endif
611         add     $len,$inp,$len,lsl#6    @ len to point at the end of inp
612
613 .Loop_v8:
614         vld1.8          {@MSG[0]-@MSG[1]},[$inp]!
615         vld1.8          {@MSG[2]-@MSG[3]},[$inp]!
616         vld1.32         {$W0},[$Ktbl]!
617         vrev32.8        @MSG[0],@MSG[0]
618         vrev32.8        @MSG[1],@MSG[1]
619         vrev32.8        @MSG[2],@MSG[2]
620         vrev32.8        @MSG[3],@MSG[3]
621         vmov            $ABCD_SAVE,$ABCD        @ offload
622         vmov            $EFGH_SAVE,$EFGH
623         teq             $inp,$len
624 ___
# Twelve instruction groups that also update the message schedule
# (sha256su0/sha256su1).  Each group loads one q register of K256 for the
# following group; afterwards the names $W0/$W1 are swapped and @MSG is
# rotated, so the same template serves every group.
625 for($i=0;$i<12;$i++) {
626 $code.=<<___;
627         vld1.32         {$W1},[$Ktbl]!
628         vadd.i32        $W0,$W0,@MSG[0]
629         sha256su0       @MSG[0],@MSG[1]
630         vmov            $abcd,$ABCD
631         sha256h         $ABCD,$EFGH,$W0
632         sha256h2        $EFGH,$abcd,$W0
633         sha256su1       @MSG[0],@MSG[2],@MSG[3]
634 ___
635         ($W0,$W1)=($W1,$W0);    push(@MSG,shift(@MSG));
636 }
# Four final groups without schedule update, the K256 pointer rewind,
# accumulation into the saved state, the loop branch (flags were set by
# the "teq" at the top of the loop) and the store of the result.
637 $code.=<<___;
638         vld1.32         {$W1},[$Ktbl]!
639         vadd.i32        $W0,$W0,@MSG[0]
640         vmov            $abcd,$ABCD
641         sha256h         $ABCD,$EFGH,$W0
642         sha256h2        $EFGH,$abcd,$W0
643
644         vld1.32         {$W0},[$Ktbl]!
645         vadd.i32        $W1,$W1,@MSG[1]
646         vmov            $abcd,$ABCD
647         sha256h         $ABCD,$EFGH,$W1
648         sha256h2        $EFGH,$abcd,$W1
649
650         vld1.32         {$W1},[$Ktbl]
651         vadd.i32        $W0,$W0,@MSG[2]
652         sub             $Ktbl,$Ktbl,#256-16     @ rewind
653         vmov            $abcd,$ABCD
654         sha256h         $ABCD,$EFGH,$W0
655         sha256h2        $EFGH,$abcd,$W0
656
657         vadd.i32        $W1,$W1,@MSG[3]
658         vmov            $abcd,$ABCD
659         sha256h         $ABCD,$EFGH,$W1
660         sha256h2        $EFGH,$abcd,$W1
661
662         vadd.i32        $ABCD,$ABCD,$ABCD_SAVE
663         vadd.i32        $EFGH,$EFGH,$EFGH_SAVE
664         it              ne
665         bne             .Loop_v8
666
667         vst1.32         {$ABCD,$EFGH},[$ctx]
668
669         ret             @ bx lr
670 .size   sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
671 #endif
672 ___
673 }}}
# Trailer of the generated file: an identification string and, outside
# the kernel, the common symbol OPENSSL_armcap_P.  The "\@" keeps Perl
# from interpolating an array inside the here-document.
674 $code.=<<___;
675 .asciz  "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
676 .align  2
677 #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
678 .comm   OPENSSL_armcap_P,4,4
679 #endif
680 ___
681
# Copy this script's leading comment block (licence and history notes)
# to the output as assembler comments: the "#!" line is dropped, a
# leading '#' is rewritten to '@', and copying stops at the first line
# that is neither a comment nor empty.
#
# The script is opened read-only with three-argument open, so an unusual
# path in $0 can never be interpreted as an open mode.  Failing to
# re-read the script only costs the comment header, so it is reported
# with a warning rather than aborting the code generation.
if (open(my $self, '<', $0)) {
	while (my $line = <$self>) {
		next if ($line =~ /^#!/);
		last unless ($line =~ s/^#/@/ or $line =~ /^$/);
		print $line;
	}
	close $self;
} else {
	warn "can't open $0 to copy the header comments: $!";
}
689
{   # Base opcodes of the SHA-256 instructions; register fields are
    # OR-ed in by unsha256().
    my	%opcode = (
	"sha256h"	=> 0xf3000c40,	"sha256h2"	=> 0xf3100c40,
	"sha256su0"	=> 0xf3ba03c0,	"sha256su1"	=> 0xf3200c40	);

    # unsha256($mnemonic,$arg)
    #
    # Translate one SHA-256 instruction into an INST(b0,b1,b2,b3) byte
    # sequence (least significant byte first) followed by the original
    # text as an assembler comment, so the output can be assembled by
    # tools that do not know these instructions.
    #
    #   $mnemonic  one of the keys of %opcode
    #   $arg       operand text, "qD,qN,qM" or "qD,qM"
    #
    # Returns the replacement text.  If the mnemonic is unknown or the
    # operands are not two or three q registers, the instruction is
    # returned unchanged ("$mnemonic\t$arg") instead of being dropped or
    # encoded from a zero opcode.
    sub unsha256 {
	my ($mnemonic,$arg)=@_;

	if (exists $opcode{$mnemonic} &&
	    $arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/) {
	    # the middle operand is absent in the two-operand form
	    my ($rd,$rn,$rm) = ($1, (defined $2 ? $2 : 0), $3);
	    my $word = $opcode{$mnemonic}|(($rd&7)<<13)|(($rd&8)<<19)
					 |(($rn&7)<<17)|(($rn&8)<<4)
					 |(($rm&7)<<1) |(($rm&8)<<2);
	    # since ARMv7 instructions are always encoded little-endian.
	    # correct solution is to use .inst directive, but older
	    # assemblers don't implement it:-(
	    return sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
	}

	return "$mnemonic\t$arg";
    }
}
711
# Final pass: post-process the accumulated assembly line by line and
# print it to STDOUT.
foreach my $line (split($/,$code)) {

	# evaluate the `...` arithmetic left in the templates
	$line =~ s/\`([^\`]*)\`/eval $1/ge;

	# replace sha256* instructions by their INST() byte encoding
	$line =~ s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/ge;

	# "ret" becomes "bx lr"; an explicit "bx lr" is emitted as a raw
	# instruction word instead
	$line =~ s/\bret\b/bx    lr/g		or
	$line =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/g;	# make it possible to compile with -march=armv4

	print $line,"\n";
}

# Buffered write errors (e.g. a full disk) only surface when the handle
# is closed; treat them as fatal so a truncated output file is never
# mistaken for a successful run.
close STDOUT or die "error closing STDOUT: $!"; # enforce flush