#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Permission to use under GPL terms is granted.
# ====================================================================

# SHA512 block procedure for ARMv4. September 2007.

# This code is ~4.5 (four and a half) times faster than code generated
# by gcc 3.4 and it spends ~72 clock cycles per byte [on a single-issue
# Xscale PXA250 core].
#
# July 2010.
#
# Rescheduling for the dual-issue pipeline resulted in 6% improvement on
# Cortex A8 core and ~40 cycles per processed byte.

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 7%
# improvement on Cortex A8 core and ~38 cycles per byte.

# March 2011.
#
# Add NEON implementation. On Cortex A8 it was measured to process
# one byte in 23.3 cycles or ~60% faster than integer-only code.

# August 2012.
#
# Improve NEON performance by 12% on Snapdragon S4. In absolute
# terms it's 22.6 cycles per byte, which is a disappointing result.
# Technical writers asserted that the 3-way S4 pipeline can sustain
# multiple NEON instructions per cycle, but dual NEON issue could
# not be observed; see http://www.openssl.org/~appro/Snapdragon-S4.html
# for further details. On a side note, Cortex-A15 processes one byte in
# 16 cycles.

# Byte order [in]dependence. =========================================
#
# Originally the caller was expected to maintain a specific *dword*
# order in h[0-7], namely with the most significant dword at the *lower*
# address, which is what the two parameter values 0 and 4 below
# reflected. Now the caller is expected to maintain native byte order
# for whole 64-bit values.
$hi="HI";
$lo="LO";
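# For illustration, assuming the little-endian layout (__ARMEL__ below,
# LO=0, HI=4): the 64-bit e=h[4] at $ctx+32 is handled as the 32-bit pair
# loaded from [$ctx,#$Eoff+$lo] and [$ctx,#$Eoff+$hi]; on big-endian the
# two offsets simply swap, so the same source serves both byte orders.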
# ====================================================================

while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";

$ctx="r0";	# parameter block
$inp="r1";
$len="r2";

$Tlo="r3";
$Thi="r4";
$Alo="r5";
$Ahi="r6";
$Elo="r7";
$Ehi="r8";
$t0="r9";
$t1="r10";
$t2="r11";
$t3="r12";
############	r13 is stack pointer
$Ktbl="r14";
############	r15 is program counter

$Aoff=8*0;
$Boff=8*1;
$Coff=8*2;
$Doff=8*3;
$Eoff=8*4;
$Foff=8*5;
$Goff=8*6;
$Hoff=8*7;
$Xoff=8*8;
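# Layout of the stack frame set up below: 64-bit copies of the working
# variables a..h sit at sp+0..sp+63 (a and e are also kept in registers),
# and the message schedule X[] grows downward from sp+$Xoff, one
# "sub sp,sp,#8" per round, to be released in one go after 80 rounds.
#
# The Sigma/sigma annotations inside BODY_00_15 spell each 64-bit rotation
# out as paired 32-bit shifts. As a minimal reference (an illustrative
# helper only, never called by the generator), rotating the (hi,lo) halves
# right by 0 < n < 32 works out to:
sub ROTR64_demo {
my ($hi,$lo,$n) = @_;
return ((($hi>>$n)|($lo<<(32-$n))) & 0xffffffff,	# hi' = hi>>n ^ lo<<(32-n)
	(($lo>>$n)|($hi<<(32-$n))) & 0xffffffff);	# lo' = lo>>n ^ hi<<(32-n)
}
# For n >= 32 the halves swap roles first, which is why e.g. ROTR(x,41)
# shows up as hi>>9^lo<<23 in the lo half.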

sub BODY_00_15() {
my $magic = shift;
$code.=<<___;
	@ Sigma1(x)	(ROTR((x),14) ^ ROTR((x),18)  ^ ROTR((x),41))
	@ LO		lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
	@ HI		hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
	mov	$t0,$Elo,lsr#14
	str	$Tlo,[sp,#$Xoff+0]
	mov	$t1,$Ehi,lsr#14
	str	$Thi,[sp,#$Xoff+4]
	eor	$t0,$t0,$Ehi,lsl#18
	ldr	$t2,[sp,#$Hoff+0]	@ h.lo
	eor	$t1,$t1,$Elo,lsl#18
	ldr	$t3,[sp,#$Hoff+4]	@ h.hi
	eor	$t0,$t0,$Elo,lsr#18
	eor	$t1,$t1,$Ehi,lsr#18
	eor	$t0,$t0,$Ehi,lsl#14
	eor	$t1,$t1,$Elo,lsl#14
	eor	$t0,$t0,$Ehi,lsr#9
	eor	$t1,$t1,$Elo,lsr#9
	eor	$t0,$t0,$Elo,lsl#23
	eor	$t1,$t1,$Ehi,lsl#23	@ Sigma1(e)
	adds	$Tlo,$Tlo,$t0
	ldr	$t0,[sp,#$Foff+0]	@ f.lo
	adc	$Thi,$Thi,$t1		@ T += Sigma1(e)
	ldr	$t1,[sp,#$Foff+4]	@ f.hi
	adds	$Tlo,$Tlo,$t2
	ldr	$t2,[sp,#$Goff+0]	@ g.lo
	adc	$Thi,$Thi,$t3		@ T += h
	ldr	$t3,[sp,#$Goff+4]	@ g.hi

	eor	$t0,$t0,$t2
	str	$Elo,[sp,#$Eoff+0]
	eor	$t1,$t1,$t3
	str	$Ehi,[sp,#$Eoff+4]
	and	$t0,$t0,$Elo
	str	$Alo,[sp,#$Aoff+0]
	and	$t1,$t1,$Ehi
	str	$Ahi,[sp,#$Aoff+4]
	eor	$t0,$t0,$t2
	ldr	$t2,[$Ktbl,#$lo]	@ K[i].lo
	eor	$t1,$t1,$t3		@ Ch(e,f,g)
	ldr	$t3,[$Ktbl,#$hi]	@ K[i].hi

	adds	$Tlo,$Tlo,$t0
	ldr	$Elo,[sp,#$Doff+0]	@ d.lo
	adc	$Thi,$Thi,$t1		@ T += Ch(e,f,g)
	ldr	$Ehi,[sp,#$Doff+4]	@ d.hi
	adds	$Tlo,$Tlo,$t2
	and	$t0,$t2,#0xff
	adc	$Thi,$Thi,$t3		@ T += K[i]
	adds	$Elo,$Elo,$Tlo
	ldr	$t2,[sp,#$Boff+0]	@ b.lo
	adc	$Ehi,$Ehi,$Thi		@ d += T
	teq	$t0,#$magic

	ldr	$t3,[sp,#$Coff+0]	@ c.lo
#if __ARM_ARCH__>=7
	it	eq			@ Thumb2 thing, sanity check in ARM
#endif
	orreq	$Ktbl,$Ktbl,#1
	@ Sigma0(x)	(ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
	@ LO		lo>>28^hi<<4  ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
	@ HI		hi>>28^lo<<4  ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
	mov	$t0,$Alo,lsr#28
	mov	$t1,$Ahi,lsr#28
	eor	$t0,$t0,$Ahi,lsl#4
	eor	$t1,$t1,$Alo,lsl#4
	eor	$t0,$t0,$Ahi,lsr#2
	eor	$t1,$t1,$Alo,lsr#2
	eor	$t0,$t0,$Alo,lsl#30
	eor	$t1,$t1,$Ahi,lsl#30
	eor	$t0,$t0,$Ahi,lsr#7
	eor	$t1,$t1,$Alo,lsr#7
	eor	$t0,$t0,$Alo,lsl#25
	eor	$t1,$t1,$Ahi,lsl#25	@ Sigma0(a)
	adds	$Tlo,$Tlo,$t0
	and	$t0,$Alo,$t2
	adc	$Thi,$Thi,$t1		@ T += Sigma0(a)

	ldr	$t1,[sp,#$Boff+4]	@ b.hi
	orr	$Alo,$Alo,$t2
	ldr	$t2,[sp,#$Coff+4]	@ c.hi
	and	$Alo,$Alo,$t3
	and	$t3,$Ahi,$t1
	orr	$Ahi,$Ahi,$t1
	orr	$Alo,$Alo,$t0		@ Maj(a,b,c).lo
	and	$Ahi,$Ahi,$t2
	adds	$Alo,$Alo,$Tlo
	orr	$Ahi,$Ahi,$t3		@ Maj(a,b,c).hi
	sub	sp,sp,#8
	adc	$Ahi,$Ahi,$Thi		@ h += T
	tst	$Ktbl,#1
	add	$Ktbl,$Ktbl,#8
___
}
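# BODY_00_15 doubles as the loop terminator: it compares the low byte of
# K[i].lo against $magic and, on a match, sets bit 0 of $Ktbl as an
# end-of-loop flag, which the callers test with "tst $Ktbl,#1". The magic
# values come straight from the K512 table: K[15].lo ends in 0x94 and
# K[79].lo ends in 0x17, hence BODY_00_15(0x94) for rounds 0-15 and
# BODY_00_15(0x17) for rounds 16-79 below.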
$code=<<___;
#ifndef __KERNEL__
# include "arm_arch.h"
# define VFP_ABI_PUSH	vstmdb	sp!,{d8-d15}
# define VFP_ABI_POP	vldmia	sp!,{d8-d15}
#else
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7
# define VFP_ABI_PUSH
# define VFP_ABI_POP
#endif

#ifdef __ARMEL__
# define LO 0
# define HI 4
# define WORD64(hi0,lo0,hi1,lo1)	.word	lo0,hi0, lo1,hi1
#else
# define HI 0
# define LO 4
# define WORD64(hi0,lo0,hi1,lo1)	.word	hi0,lo0, hi1,lo1
#endif

.text
#if __ARM_ARCH__<7
.code	32
#else
.syntax unified
# ifdef __thumb2__
#  define adrl adr
.thumb
# else
.code	32
# endif
#endif

.type	K512,%object
.align	5
K512:
WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
.size	K512,.-K512
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-sha512_block_data_order
.skip	32-4
#else
.skip	32
#endif

.global	sha512_block_data_order
.type	sha512_block_data_order,%function
sha512_block_data_order:
.Lsha512_block_data_order:
#if __ARM_ARCH__<7
	sub	r3,pc,#8		@ sha512_block_data_order
#else
	adr	r3,.Lsha512_block_data_order
#endif
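	@ (in ARM state pc reads as the address of the current instruction
	@ plus 8, so the pre-v7 subtraction above likewise leaves the
	@ function entry address in r3)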
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
	ldr	r12,.LOPENSSL_armcap
	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
	tst	r12,#1
	bne	.LNEON
#endif
	add	$len,$inp,$len,lsl#7	@ len to point at the end of inp
	stmdb	sp!,{r4-r12,lr}
	sub	$Ktbl,r3,#672		@ K512
	sub	sp,sp,#9*8

	ldr	$Elo,[$ctx,#$Eoff+$lo]
	ldr	$Ehi,[$ctx,#$Eoff+$hi]
	ldr	$t0, [$ctx,#$Goff+$lo]
	ldr	$t1, [$ctx,#$Goff+$hi]
	ldr	$t2, [$ctx,#$Hoff+$lo]
	ldr	$t3, [$ctx,#$Hoff+$hi]
.Loop:
	str	$t0, [sp,#$Goff+0]
	str	$t1, [sp,#$Goff+4]
	str	$t2, [sp,#$Hoff+0]
	str	$t3, [sp,#$Hoff+4]
	ldr	$Alo,[$ctx,#$Aoff+$lo]
	ldr	$Ahi,[$ctx,#$Aoff+$hi]
	ldr	$Tlo,[$ctx,#$Boff+$lo]
	ldr	$Thi,[$ctx,#$Boff+$hi]
	ldr	$t0, [$ctx,#$Coff+$lo]
	ldr	$t1, [$ctx,#$Coff+$hi]
	ldr	$t2, [$ctx,#$Doff+$lo]
	ldr	$t3, [$ctx,#$Doff+$hi]
	str	$Tlo,[sp,#$Boff+0]
	str	$Thi,[sp,#$Boff+4]
	str	$t0, [sp,#$Coff+0]
	str	$t1, [sp,#$Coff+4]
	str	$t2, [sp,#$Doff+0]
	str	$t3, [sp,#$Doff+4]
	ldr	$Tlo,[$ctx,#$Foff+$lo]
	ldr	$Thi,[$ctx,#$Foff+$hi]
	str	$Tlo,[sp,#$Foff+0]
	str	$Thi,[sp,#$Foff+4]

.L00_15:
#if __ARM_ARCH__<7
	ldrb	$Tlo,[$inp,#7]
	ldrb	$t0, [$inp,#6]
	ldrb	$t1, [$inp,#5]
	ldrb	$t2, [$inp,#4]
	ldrb	$Thi,[$inp,#3]
	ldrb	$t3, [$inp,#2]
	orr	$Tlo,$Tlo,$t0,lsl#8
	ldrb	$t0, [$inp,#1]
	orr	$Tlo,$Tlo,$t1,lsl#16
	ldrb	$t1, [$inp],#8
	orr	$Tlo,$Tlo,$t2,lsl#24
	orr	$Thi,$Thi,$t3,lsl#8
	orr	$Thi,$Thi,$t0,lsl#16
	orr	$Thi,$Thi,$t1,lsl#24
#else
	ldr	$Tlo,[$inp,#4]
	ldr	$Thi,[$inp],#8
#ifdef __ARMEL__
	rev	$Tlo,$Tlo
	rev	$Thi,$Thi
#endif
#endif
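	@ (the pre-ARMv7 path above assembles each big-endian 64-bit input
	@ word byte by byte; ARMv7 and later use two word loads plus rev
	@ when running little-endian)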
___
	&BODY_00_15(0x94);
$code.=<<___;
	tst	$Ktbl,#1
	beq	.L00_15
	ldr	$t0,[sp,#`$Xoff+8*(16-1)`+0]
	ldr	$t1,[sp,#`$Xoff+8*(16-1)`+4]
	bic	$Ktbl,$Ktbl,#1
.L16_79:
	@ sigma0(x)	(ROTR((x),1)  ^ ROTR((x),8)  ^ ((x)>>7))
	@ LO		lo>>1^hi<<31  ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
	@ HI		hi>>1^lo<<31  ^ hi>>8^lo<<24 ^ hi>>7
	mov	$Tlo,$t0,lsr#1
	ldr	$t2,[sp,#`$Xoff+8*(16-14)`+0]
	mov	$Thi,$t1,lsr#1
	ldr	$t3,[sp,#`$Xoff+8*(16-14)`+4]
	eor	$Tlo,$Tlo,$t1,lsl#31
	eor	$Thi,$Thi,$t0,lsl#31
	eor	$Tlo,$Tlo,$t0,lsr#8
	eor	$Thi,$Thi,$t1,lsr#8
	eor	$Tlo,$Tlo,$t1,lsl#24
	eor	$Thi,$Thi,$t0,lsl#24
	eor	$Tlo,$Tlo,$t0,lsr#7
	eor	$Thi,$Thi,$t1,lsr#7
	eor	$Tlo,$Tlo,$t1,lsl#25

	@ sigma1(x)	(ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
	@ LO		lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
	@ HI		hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
	mov	$t0,$t2,lsr#19
	mov	$t1,$t3,lsr#19
	eor	$t0,$t0,$t3,lsl#13
	eor	$t1,$t1,$t2,lsl#13
	eor	$t0,$t0,$t3,lsr#29
	eor	$t1,$t1,$t2,lsr#29
	eor	$t0,$t0,$t2,lsl#3
	eor	$t1,$t1,$t3,lsl#3
	eor	$t0,$t0,$t2,lsr#6
	eor	$t1,$t1,$t3,lsr#6
	ldr	$t2,[sp,#`$Xoff+8*(16-9)`+0]
	eor	$t0,$t0,$t3,lsl#26

	ldr	$t3,[sp,#`$Xoff+8*(16-9)`+4]
	adds	$Tlo,$Tlo,$t0
	ldr	$t0,[sp,#`$Xoff+8*16`+0]
	adc	$Thi,$Thi,$t1

	ldr	$t1,[sp,#`$Xoff+8*16`+4]
	adds	$Tlo,$Tlo,$t2
	adc	$Thi,$Thi,$t3
	adds	$Tlo,$Tlo,$t0
	adc	$Thi,$Thi,$t1
___
	&BODY_00_15(0x17);
$code.=<<___;
#if __ARM_ARCH__>=7
	ittt	eq			@ Thumb2 thing, sanity check in ARM
#endif
	ldreq	$t0,[sp,#`$Xoff+8*(16-1)`+0]
	ldreq	$t1,[sp,#`$Xoff+8*(16-1)`+4]
	beq	.L16_79
	bic	$Ktbl,$Ktbl,#1

	ldr	$Tlo,[sp,#$Boff+0]
	ldr	$Thi,[sp,#$Boff+4]
	ldr	$t0, [$ctx,#$Aoff+$lo]
	ldr	$t1, [$ctx,#$Aoff+$hi]
	ldr	$t2, [$ctx,#$Boff+$lo]
	ldr	$t3, [$ctx,#$Boff+$hi]
	adds	$t0,$Alo,$t0
	str	$t0, [$ctx,#$Aoff+$lo]
	adc	$t1,$Ahi,$t1
	str	$t1, [$ctx,#$Aoff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Boff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Boff+$hi]

	ldr	$Alo,[sp,#$Coff+0]
	ldr	$Ahi,[sp,#$Coff+4]
	ldr	$Tlo,[sp,#$Doff+0]
	ldr	$Thi,[sp,#$Doff+4]
	ldr	$t0, [$ctx,#$Coff+$lo]
	ldr	$t1, [$ctx,#$Coff+$hi]
	ldr	$t2, [$ctx,#$Doff+$lo]
	ldr	$t3, [$ctx,#$Doff+$hi]
	adds	$t0,$Alo,$t0
	str	$t0, [$ctx,#$Coff+$lo]
	adc	$t1,$Ahi,$t1
	str	$t1, [$ctx,#$Coff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Doff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Doff+$hi]

	ldr	$Tlo,[sp,#$Foff+0]
	ldr	$Thi,[sp,#$Foff+4]
	ldr	$t0, [$ctx,#$Eoff+$lo]
	ldr	$t1, [$ctx,#$Eoff+$hi]
	ldr	$t2, [$ctx,#$Foff+$lo]
	ldr	$t3, [$ctx,#$Foff+$hi]
	adds	$Elo,$Elo,$t0
	str	$Elo,[$ctx,#$Eoff+$lo]
	adc	$Ehi,$Ehi,$t1
	str	$Ehi,[$ctx,#$Eoff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Foff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Foff+$hi]

	ldr	$Alo,[sp,#$Goff+0]
	ldr	$Ahi,[sp,#$Goff+4]
	ldr	$Tlo,[sp,#$Hoff+0]
	ldr	$Thi,[sp,#$Hoff+4]
	ldr	$t0, [$ctx,#$Goff+$lo]
	ldr	$t1, [$ctx,#$Goff+$hi]
	ldr	$t2, [$ctx,#$Hoff+$lo]
	ldr	$t3, [$ctx,#$Hoff+$hi]
	adds	$t0,$Alo,$t0
	str	$t0, [$ctx,#$Goff+$lo]
	adc	$t1,$Ahi,$t1
	str	$t1, [$ctx,#$Goff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Hoff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Hoff+$hi]

	add	sp,sp,#640
	sub	$Ktbl,$Ktbl,#640
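	@ each of the 80 rounds moved sp down and the K512 pointer up by
	@ 8 bytes, so 640 bytes rewind both the X[] area and the table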

	teq	$inp,$len
	bne	.Loop

	add	sp,sp,#8*9		@ destroy frame
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	sha512_block_data_order,.-sha512_block_data_order
___

{
my @Sigma0=(28,34,39);
my @Sigma1=(14,18,41);
my @sigma0=(1, 8, 7);
my @sigma1=(19,61,6);

my $Ktbl="r3";
my $cnt="r12";	# volatile register known as ip, intra-procedure-call scratch

my @X=map("d$_",(0..15));
my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23));

sub NEON_00_15() {
my $i=shift;
my ($a,$b,$c,$d,$e,$f,$g,$h)=@_;
my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31));	# temps

$code.=<<___ if ($i<16 || $i&1);
	vshr.u64	$t0,$e,#@Sigma1[0]	@ $i
#if $i<16
	vld1.64		{@X[$i%16]},[$inp]!	@ handles unaligned
#endif
	vshr.u64	$t1,$e,#@Sigma1[1]
#if $i>0
	 vadd.i64	$a,$Maj			@ h+=Maj from the past
#endif
	vshr.u64	$t2,$e,#@Sigma1[2]
___
$code.=<<___;
	vld1.64		{$K},[$Ktbl,:64]!	@ K[i++]
	vsli.64		$t0,$e,#`64-@Sigma1[0]`
	vsli.64		$t1,$e,#`64-@Sigma1[1]`
	vmov		$Ch,$e
	vsli.64		$t2,$e,#`64-@Sigma1[2]`
#if $i<16 && defined(__ARMEL__)
	vrev64.8	@X[$i],@X[$i]
#endif
	veor		$t1,$t0
	vbsl		$Ch,$f,$g		@ Ch(e,f,g)
	vshr.u64	$t0,$a,#@Sigma0[0]
	veor		$t2,$t1			@ Sigma1(e)
	vadd.i64	$T1,$Ch,$h
	vshr.u64	$t1,$a,#@Sigma0[1]
	vsli.64		$t0,$a,#`64-@Sigma0[0]`
	vadd.i64	$T1,$t2
	vshr.u64	$t2,$a,#@Sigma0[2]
	vadd.i64	$K,@X[$i%16]
	vsli.64		$t1,$a,#`64-@Sigma0[1]`
	veor		$Maj,$a,$b
	vsli.64		$t2,$a,#`64-@Sigma0[2]`
	veor		$h,$t0,$t1
	vadd.i64	$T1,$K
	vbsl		$Maj,$c,$b		@ Maj(a,b,c)
	veor		$h,$t2			@ Sigma0(a)
	vadd.i64	$d,$T1
	vadd.i64	$Maj,$T1
	@ vadd.i64	$h,$Maj
___
}
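# Ch and Maj each collapse to a single vbsl bit-select above: Ch(e,f,g)
# picks between f and g under mask e, while Maj(a,b,c) picks between c
# and b under mask a^b (where a and b agree, their common value wins).
# The final "vadd.i64 $h,$Maj" is deliberately left commented out: the
# addition is deferred to the next round, where the rotated h re-enters
# as a ("h+=Maj from the past"), shortening the dependency chain.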

sub NEON_16_79() {
my $i=shift;

if ($i&1)	{ &NEON_00_15($i,@_); return; }

# 2x-vectorized, therefore runs every 2nd round
my @X=map("q$_",(0..7));			# view @X as 128-bit vector
my ($t0,$t1,$s0,$s1) = map("q$_",(12..15));	# temps
my ($d0,$d1,$d2) = map("d$_",(24..26));		# temps from NEON_00_15
my $e=@_[4];					# $e from NEON_00_15
$i /= 2;
$code.=<<___;
	vshr.u64	$t0,@X[($i+7)%8],#@sigma1[0]
	vshr.u64	$t1,@X[($i+7)%8],#@sigma1[1]
	 vadd.i64	@_[0],d30			@ h+=Maj from the past
	vshr.u64	$s1,@X[($i+7)%8],#@sigma1[2]
	vsli.64		$t0,@X[($i+7)%8],#`64-@sigma1[0]`
	vext.8		$s0,@X[$i%8],@X[($i+1)%8],#8	@ X[i+1]
	vsli.64		$t1,@X[($i+7)%8],#`64-@sigma1[1]`
	veor		$s1,$t0
	vshr.u64	$t0,$s0,#@sigma0[0]
	veor		$s1,$t1				@ sigma1(X[i+14])
	vshr.u64	$t1,$s0,#@sigma0[1]
	vadd.i64	@X[$i%8],$s1
	vshr.u64	$s1,$s0,#@sigma0[2]
	vsli.64		$t0,$s0,#`64-@sigma0[0]`
	vsli.64		$t1,$s0,#`64-@sigma0[1]`
	vext.8		$s0,@X[($i+4)%8],@X[($i+5)%8],#8	@ X[i+9]
	veor		$s1,$t0
	vshr.u64	$d0,$e,#@Sigma1[0]		@ from NEON_00_15
	vadd.i64	@X[$i%8],$s0
	vshr.u64	$d1,$e,#@Sigma1[1]		@ from NEON_00_15
	veor		$s1,$t1				@ sigma0(X[i+1])
	vshr.u64	$d2,$e,#@Sigma1[2]		@ from NEON_00_15
	vadd.i64	@X[$i%8],$s1
___
	&NEON_00_15(2*$i,@_);
}
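# Even rounds run the 2x-vectorized schedule above: @X is reinterpreted as
# eight q registers so that sigma0/sigma1 are computed for two message
# words at once, and the first Sigma1(e) shifts of the following round are
# pre-issued into d24-d26 before falling through to NEON_00_15.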

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.global	sha512_block_data_order_neon
.type	sha512_block_data_order_neon,%function
.align	4
sha512_block_data_order_neon:
.LNEON:
	dmb				@ errata #451034 on early Cortex A8
	add	$len,$inp,$len,lsl#7	@ len to point at the end of inp
	VFP_ABI_PUSH
	adrl	$Ktbl,K512
	vldmia	$ctx,{$A-$H}		@ load context
.Loop_neon:
___
for($i=0;$i<16;$i++)	{ &NEON_00_15($i,@V); unshift(@V,pop(@V)); }
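# (unshift(@V,pop(@V)) renames the virtual registers so that the next
# round's (a,b,...,h) is this round's (h,a,...,g))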
$code.=<<___;
	mov		$cnt,#4
.L16_79_neon:
	subs		$cnt,#1
___
for(;$i<32;$i++)	{ &NEON_16_79($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	bne		.L16_79_neon

	 vadd.i64	$A,d30		@ h+=Maj from the past
	vldmia		$ctx,{d24-d31}	@ load context to temp
	vadd.i64	q8,q12		@ vectorized accumulate
	vadd.i64	q9,q13
	vadd.i64	q10,q14
	vadd.i64	q11,q15
	vstmia		$ctx,{$A-$H}	@ save context
	teq		$inp,$len
	sub		$Ktbl,#640	@ rewind K512
	bne		.Loop_neon

	VFP_ABI_POP
	ret				@ bx lr
.size	sha512_block_data_order_neon,.-sha512_block_data_order_neon
#endif
___
}
$code.=<<___;
.asciz	"SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.comm	OPENSSL_armcap_P,4,4
#endif
___

$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
$code =~ s/\bret\b/bx	lr/gm;
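# The first substitution above evaluates the `...` arithmetic embedded in
# the generated text; the second rewrites "bx lr" as a raw .word so the
# output still assembles with -march=armv4, and the third then spells the
# remaining "ret" as "bx lr". Below, the script re-emits its own license
# header at the top of the output, turning '#' comments into assembler '@'
# comments and stopping at the first non-comment, non-blank line.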

open SELF,$0;
while(<SELF>) {
	next if (/^#!/);
	last if (!s/^#/@/ and !/^$/);
	print;
}
close SELF;

print $code;
close STDOUT; # enforce flush