@ ====================================================================
@ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@ project. The module is, however, dual licensed under OpenSSL and
@ CRYPTOGAMS licenses depending on where you obtain it. For further
@ details see http://www.openssl.org/~appro/cryptogams/.
@
@ Permission to use under GPL terms is granted.
@ ====================================================================
@
@ SHA512 block procedure for ARMv4. September 2007.
@
@ This code is ~4.5 (four and a half) times faster than code generated
@ by gcc 3.4 and it spends ~72 clock cycles per byte [on a single-issue
@ Xscale PXA250 core].
@ Rescheduling for the dual-issue pipeline resulted in a 6% improvement
@ on the Cortex-A8 core and ~40 cycles per processed byte.
@
@ Profiler-assisted and platform-specific optimization resulted in a 7%
@ improvement on the Cortex-A8 core and ~38 cycles per byte.
@
@ Add NEON implementation. On Cortex-A8 it was measured to process
@ one byte in 23.3 cycles, or ~60% faster than the integer-only code.
@
@ Improve NEON performance by 12% on Snapdragon S4. In absolute
@ terms it's 22.6 cycles per byte, which is a disappointing result.
@ Technical writers asserted that the 3-way S4 pipeline can sustain
@ multiple NEON instructions per cycle, but dual NEON issue could
@ not be observed; see http://www.openssl.org/~appro/Snapdragon-S4.html
@ for further details. On a side note, Cortex-A15 processes one byte in
@ 16 cycles.
@
@ Byte order [in]dependence. =========================================
@
@ Originally the caller was expected to maintain a specific *dword*
@ order in h[0-7], namely with the most significant dword at the
@ *lower* address, which was reflected in the two parameters below as
@ 0 and 4. Now the caller is expected to maintain native byte order
@ for the whole 64-bit values.
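@ E.g. on a little-endian (__ARMEL__) build the least significant word
@ of h[0] sits at offset 0 and the most significant word at offset 4,
@ which is what the LO=0/HI=4 parameter pair below selects.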
# include "arm_arch.h"
# define VFP_ABI_PUSH vstmdb sp!,{d8-d15}
# define VFP_ABI_POP vldmia sp!,{d8-d15}
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7
# define WORD64(hi0,lo0,hi1,lo1) .word lo0,hi0, lo1,hi1
# define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1
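@ E.g. under __ARMEL__ the first constant K[0]=0x428a2f98d728ae22
@ below expands to ".word 0xd728ae22,0x428a2f98", low word first, so
@ the two 32-bit loads at [r14,#LO] and [r14,#HI] read it in native
@ order.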
WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.word OPENSSL_armcap_P-sha512_block_data_order
.global sha512_block_data_order
.type sha512_block_data_order,%function
sha512_block_data_order:
.Lsha512_block_data_order:
sub r3,pc,#8 @ sha512_block_data_order
adr r3,.Lsha512_block_data_order
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
ldr r12,.LOPENSSL_armcap
ldr r12,[r3,r12] @ OPENSSL_armcap_P
add r2,r1,r2,lsl#7 @ len to point at the end of inp
stmdb sp!,{r4-r12,lr}
sub r14,r3,#672 @ K512
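@ r3 holds the function's own address (set PC-relatively above); the
@ K512 table lives a fixed 672 bytes below that entry point, so it is
@ reached without a literal-pool load.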
@ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
@ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
@ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
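@ A 64-bit rotate splits across the two 32-bit halves: for n<32,
@ ROTR(x,n).lo = lo>>n | hi<<(32-n), while for n>=32 the halves swap
@ first, e.g. ROTR(x,41) = ROTR(halves swapped,9), giving hi>>9|lo<<23
@ in the low word as above. The pieces can be glued with eor because
@ the shifted bit fields of each term do not overlap.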
ldr r11,[sp,#56+0] @ h.lo
eor r10,r10,r7,lsl#18
ldr r12,[sp,#56+4] @ h.hi
eor r10,r10,r8,lsr#18
eor r10,r10,r7,lsl#14
eor r10,r10,r8,lsl#23 @ Sigma1(e)
ldr r9,[sp,#40+0] @ f.lo
adc r4,r4,r10 @ T += Sigma1(e)
ldr r10,[sp,#40+4] @ f.hi
ldr r11,[sp,#48+0] @ g.lo
adc r4,r4,r12 @ T += h
ldr r12,[sp,#48+4] @ g.hi
ldr r11,[r14,#LO] @ K[i].lo
eor r10,r10,r12 @ Ch(e,f,g)
ldr r12,[r14,#HI] @ K[i].hi
ldr r7,[sp,#24+0] @ d.lo
adc r4,r4,r10 @ T += Ch(e,f,g)
ldr r8,[sp,#24+4] @ d.hi
adc r4,r4,r12 @ T += K[i]
ldr r11,[sp,#8+0] @ b.lo
adc r8,r8,r4 @ d += T
ldr r12,[sp,#16+0] @ c.lo
it eq @ Thumb2 thing, sanity check in ARM
@ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
@ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
@ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
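@ Same decomposition as Sigma1: 28<32 keeps the halves in place
@ (lo>>28|hi<<4), while 34=32+2 and 39=32+7 swap them first, which is
@ why hi and lo trade roles in the last two terms.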
eor r10,r10,r6,lsl#30
eor r10,r10,r6,lsl#25 @ Sigma0(a)
adc r4,r4,r10 @ T += Sigma0(a)
ldr r10,[sp,#8+4] @ b.hi
ldr r11,[sp,#16+4] @ c.hi
orr r5,r5,r9 @ Maj(a,b,c).lo
orr r6,r6,r12 @ Maj(a,b,c).hi
adc r6,r6,r4 @ h += T
@ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7))
@ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
@ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7
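@ Unlike Sigma1/Sigma0, the last term is a plain 64-bit shift rather
@ than a rotate: its low word still takes bits from hi (lo>>7|hi<<25),
@ but the high word is just hi>>7 with nothing shifted in.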
@ sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
@ LO lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
@ HI hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
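@ As above: 61=32+29 swaps the halves before the 29-bit rotate, and
@ the plain x>>6 contributes lo>>6|hi<<26 to the low word but only
@ hi>>6 to the high one.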
eor r10,r10,r11,lsl#13
eor r10,r10,r11,lsr#29
eor r10,r10,r12,lsl#3
eor r10,r10,r12,lsr#6
@ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
@ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
@ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
ldr r11,[sp,#56+0] @ h.lo
eor r10,r10,r7,lsl#18
ldr r12,[sp,#56+4] @ h.hi
eor r10,r10,r8,lsr#18
eor r10,r10,r7,lsl#14
eor r10,r10,r8,lsl#23 @ Sigma1(e)
ldr r9,[sp,#40+0] @ f.lo
adc r4,r4,r10 @ T += Sigma1(e)
ldr r10,[sp,#40+4] @ f.hi
ldr r11,[sp,#48+0] @ g.lo
adc r4,r4,r12 @ T += h
ldr r12,[sp,#48+4] @ g.hi
ldr r11,[r14,#LO] @ K[i].lo
eor r10,r10,r12 @ Ch(e,f,g)
ldr r12,[r14,#HI] @ K[i].hi
ldr r7,[sp,#24+0] @ d.lo
adc r4,r4,r10 @ T += Ch(e,f,g)
ldr r8,[sp,#24+4] @ d.hi
adc r4,r4,r12 @ T += K[i]
ldr r11,[sp,#8+0] @ b.lo
adc r8,r8,r4 @ d += T
ldr r12,[sp,#16+0] @ c.lo
it eq @ Thumb2 thing, sanity check in ARM
@ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
@ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
@ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
eor r10,r10,r6,lsl#30
eor r10,r10,r6,lsl#25 @ Sigma0(a)
adc r4,r4,r10 @ T += Sigma0(a)
ldr r10,[sp,#8+4] @ b.hi
ldr r11,[sp,#16+4] @ c.hi
orr r5,r5,r9 @ Maj(a,b,c).lo
orr r6,r6,r12 @ Maj(a,b,c).hi
adc r6,r6,r4 @ h += T
ittt eq @ Thumb2 thing, sanity check in ARM
ldreq r10,[sp,#184+4]
add sp,sp,#8*9 @ destroy frame
ldmia sp!,{r4-r12,pc}
ldmia sp!,{r4-r12,lr}
moveq pc,lr @ be binary compatible with V4, yet
.word 0xe12fff1e @ interoperable with Thumb ISA:-)
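@ (0xe12fff1e is the encoding of "bx lr": emitting it as data lets the
@ module assemble even for ARMv4, where the mnemonic is unavailable,
@ while still returning correctly to Thumb callers on v4T and later)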
.size sha512_block_data_order,.-sha512_block_data_order
#if __ARM_MAX_ARCH__>=7
.global sha512_block_data_order_neon
.type sha512_block_data_order_neon,%function
sha512_block_data_order_neon:
dmb @ errata #451034 on early Cortex A8
add r2,r1,r2,lsl#7 @ len to point at the end of inp
vldmia r0,{d16-d23} @ load context
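@ d16-d23 now hold the working variables a,b,c,d,e,f,g,h in that
@ order (Sigma1 below therefore operates on d20=e, Sigma0 on d16=a),
@ with d24-d30 as scratch; each 64-bit rotate is assembled from a
@ vshr.u64/vsli.64 pair whose shift counts sum to 64.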
vshr.u64 d24,d20,#14 @ 0
vld1.64 {d0},[r1]! @ handles unaligned
vadd.i64 d16,d30 @ h+=Maj from the past
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 0<16 && defined(__ARMEL__)
vbsl d29,d21,d22 @ Ch(e,f,g)
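@ (d29 was seeded with e, so vbsl yields (e&f)|(~e&g) = Ch(e,f,g)
@ directly; Maj below plays the same trick with d30 = a^b selecting
@ bitwise between c and b)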
veor d26,d25 @ Sigma1(e)
vbsl d30,d18,d17 @ Maj(a,b,c)
veor d23,d26 @ Sigma0(a)
vshr.u64 d24,d19,#14 @ 1
vld1.64 {d1},[r1]! @ handles unaligned
vadd.i64 d23,d30 @ h+=Maj from the past
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 1<16 && defined(__ARMEL__)
vbsl d29,d20,d21 @ Ch(e,f,g)
veor d26,d25 @ Sigma1(e)
vbsl d30,d17,d16 @ Maj(a,b,c)
veor d22,d26 @ Sigma0(a)
vshr.u64 d24,d18,#14 @ 2
vld1.64 {d2},[r1]! @ handles unaligned
vadd.i64 d22,d30 @ h+=Maj from the past
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 2<16 && defined(__ARMEL__)
vbsl d29,d19,d20 @ Ch(e,f,g)
veor d26,d25 @ Sigma1(e)
vbsl d30,d16,d23 @ Maj(a,b,c)
veor d21,d26 @ Sigma0(a)
vshr.u64 d24,d17,#14 @ 3
vld1.64 {d3},[r1]! @ handles unaligned
vadd.i64 d21,d30 @ h+=Maj from the past
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 3<16 && defined(__ARMEL__)
vbsl d29,d18,d19 @ Ch(e,f,g)
veor d26,d25 @ Sigma1(e)
vbsl d30,d23,d22 @ Maj(a,b,c)
veor d20,d26 @ Sigma0(a)
vshr.u64 d24,d16,#14 @ 4
vld1.64 {d4},[r1]! @ handles unaligned
vadd.i64 d20,d30 @ h+=Maj from the past
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 4<16 && defined(__ARMEL__)
vbsl d29,d17,d18 @ Ch(e,f,g)
veor d26,d25 @ Sigma1(e)
vbsl d30,d22,d21 @ Maj(a,b,c)
veor d19,d26 @ Sigma0(a)
vshr.u64 d24,d23,#14 @ 5
vld1.64 {d5},[r1]! @ handles unaligned
vadd.i64 d19,d30 @ h+=Maj from the past
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 5<16 && defined(__ARMEL__)
vbsl d29,d16,d17 @ Ch(e,f,g)
veor d26,d25 @ Sigma1(e)
vbsl d30,d21,d20 @ Maj(a,b,c)
veor d18,d26 @ Sigma0(a)
vshr.u64 d24,d22,#14 @ 6
vld1.64 {d6},[r1]! @ handles unaligned
vadd.i64 d18,d30 @ h+=Maj from the past
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 6<16 && defined(__ARMEL__)
vbsl d29,d23,d16 @ Ch(e,f,g)
veor d26,d25 @ Sigma1(e)
vbsl d30,d20,d19 @ Maj(a,b,c)
veor d17,d26 @ Sigma0(a)
vshr.u64 d24,d21,#14 @ 7
vld1.64 {d7},[r1]! @ handles unaligned
vadd.i64 d17,d30 @ h+=Maj from the past
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 7<16 && defined(__ARMEL__)
vbsl d29,d22,d23 @ Ch(e,f,g)
veor d26,d25 @ Sigma1(e)
vbsl d30,d19,d18 @ Maj(a,b,c)
veor d16,d26 @ Sigma0(a)
vshr.u64 d24,d20,#14 @ 8
vld1.64 {d8},[r1]! @ handles unaligned
vadd.i64 d16,d30 @ h+=Maj from the past
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 8<16 && defined(__ARMEL__)
vbsl d29,d21,d22 @ Ch(e,f,g)
veor d26,d25 @ Sigma1(e)
vbsl d30,d18,d17 @ Maj(a,b,c)
veor d23,d26 @ Sigma0(a)
vshr.u64 d24,d19,#14 @ 9
vld1.64 {d9},[r1]! @ handles unaligned
vadd.i64 d23,d30 @ h+=Maj from the past
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 9<16 && defined(__ARMEL__)
vbsl d29,d20,d21 @ Ch(e,f,g)
veor d26,d25 @ Sigma1(e)
vbsl d30,d17,d16 @ Maj(a,b,c)
veor d22,d26 @ Sigma0(a)
vshr.u64 d24,d18,#14 @ 10
vld1.64 {d10},[r1]! @ handles unaligned
vadd.i64 d22,d30 @ h+=Maj from the past
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 10<16 && defined(__ARMEL__)
vbsl d29,d19,d20 @ Ch(e,f,g)
veor d26,d25 @ Sigma1(e)
vbsl d30,d16,d23 @ Maj(a,b,c)
veor d21,d26 @ Sigma0(a)
vshr.u64 d24,d17,#14 @ 11
vld1.64 {d11},[r1]! @ handles unaligned
vadd.i64 d21,d30 @ h+=Maj from the past
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 11<16 && defined(__ARMEL__)
vbsl d29,d18,d19 @ Ch(e,f,g)
veor d26,d25 @ Sigma1(e)
vbsl d30,d23,d22 @ Maj(a,b,c)
veor d20,d26 @ Sigma0(a)
vshr.u64 d24,d16,#14 @ 12
vld1.64 {d12},[r1]! @ handles unaligned
vadd.i64 d20,d30 @ h+=Maj from the past
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 12<16 && defined(__ARMEL__)
vbsl d29,d17,d18 @ Ch(e,f,g)
vshr.u64 d24,d20,#28
veor d26,d25 @ Sigma1(e)
vadd.i64 d27,d29,d19
vshr.u64 d25,d20,#34
vshr.u64 d26,d20,#39
vbsl d30,d22,d21 @ Maj(a,b,c)
veor d19,d26 @ Sigma0(a)
vshr.u64 d24,d23,#14 @ 13
vld1.64 {d13},[r1]! @ handles unaligned
vshr.u64 d25,d23,#18
vadd.i64 d19,d30 @ h+=Maj from the past
vshr.u64 d26,d23,#41
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 13<16 && defined(__ARMEL__)
vbsl d29,d16,d17 @ Ch(e,f,g)
vshr.u64 d24,d19,#28
veor d26,d25 @ Sigma1(e)
vadd.i64 d27,d29,d18
vshr.u64 d25,d19,#34
vshr.u64 d26,d19,#39
vbsl d30,d21,d20 @ Maj(a,b,c)
veor d18,d26 @ Sigma0(a)
vshr.u64 d24,d22,#14 @ 14
vld1.64 {d14},[r1]! @ handles unaligned
vshr.u64 d25,d22,#18
vadd.i64 d18,d30 @ h+=Maj from the past
vshr.u64 d26,d22,#41
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 14<16 && defined(__ARMEL__)
vbsl d29,d23,d16 @ Ch(e,f,g)
vshr.u64 d24,d18,#28
veor d26,d25 @ Sigma1(e)
vadd.i64 d27,d29,d17
vshr.u64 d25,d18,#34
vshr.u64 d26,d18,#39
vbsl d30,d20,d19 @ Maj(a,b,c)
veor d17,d26 @ Sigma0(a)
vshr.u64 d24,d21,#14 @ 15
vld1.64 {d15},[r1]! @ handles unaligned
vshr.u64 d25,d21,#18
vadd.i64 d17,d30 @ h+=Maj from the past
vshr.u64 d26,d21,#41
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 15<16 && defined(__ARMEL__)
vbsl d29,d22,d23 @ Ch(e,f,g)
vshr.u64 d24,d17,#28
veor d26,d25 @ Sigma1(e)
vadd.i64 d27,d29,d16
vshr.u64 d25,d17,#34
vshr.u64 d26,d17,#39
vbsl d30,d19,d18 @ Maj(a,b,c)
veor d16,d26 @ Sigma0(a)
vadd.i64 d16,d30 @ h+=Maj from the past
vext.8 q14,q0,q1,#8 @ X[i+1]
veor q15,q13 @ sigma1(X[i+14])
vext.8 q14,q4,q5,#8 @ X[i+9]
vshr.u64 d24,d20,#14 @ from NEON_00_15
vshr.u64 d25,d20,#18 @ from NEON_00_15
veor q15,q13 @ sigma0(X[i+1])
vshr.u64 d26,d20,#41 @ from NEON_00_15
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 16<16 && defined(__ARMEL__)
vbsl d29,d21,d22 @ Ch(e,f,g)
vshr.u64 d24,d16,#28
veor d26,d25 @ Sigma1(e)
vadd.i64 d27,d29,d23
vshr.u64 d25,d16,#34
vshr.u64 d26,d16,#39
vbsl d30,d18,d17 @ Maj(a,b,c)
veor d23,d26 @ Sigma0(a)
vshr.u64 d24,d19,#14 @ 17
vld1.64 {d1},[r1]! @ handles unaligned
vshr.u64 d25,d19,#18
vadd.i64 d23,d30 @ h+=Maj from the past
vshr.u64 d26,d19,#41
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 17<16 && defined(__ARMEL__)
vbsl d29,d20,d21 @ Ch(e,f,g)
vshr.u64 d24,d23,#28
veor d26,d25 @ Sigma1(e)
vadd.i64 d27,d29,d22
vshr.u64 d25,d23,#34
vshr.u64 d26,d23,#39
vbsl d30,d17,d16 @ Maj(a,b,c)
veor d22,d26 @ Sigma0(a)
vadd.i64 d22,d30 @ h+=Maj from the past
vext.8 q14,q1,q2,#8 @ X[i+1]
veor q15,q13 @ sigma1(X[i+14])
vext.8 q14,q5,q6,#8 @ X[i+9]
vshr.u64 d24,d18,#14 @ from NEON_00_15
vshr.u64 d25,d18,#18 @ from NEON_00_15
veor q15,q13 @ sigma0(X[i+1])
vshr.u64 d26,d18,#41 @ from NEON_00_15
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 18<16 && defined(__ARMEL__)
vbsl d29,d19,d20 @ Ch(e,f,g)
vshr.u64 d24,d22,#28
veor d26,d25 @ Sigma1(e)
vadd.i64 d27,d29,d21
vshr.u64 d25,d22,#34
vshr.u64 d26,d22,#39
vbsl d30,d16,d23 @ Maj(a,b,c)
veor d21,d26 @ Sigma0(a)
vshr.u64 d24,d17,#14 @ 19
vld1.64 {d3},[r1]! @ handles unaligned
vshr.u64 d25,d17,#18
vadd.i64 d21,d30 @ h+=Maj from the past
vshr.u64 d26,d17,#41
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 19<16 && defined(__ARMEL__)
vbsl d29,d18,d19 @ Ch(e,f,g)
vshr.u64 d24,d21,#28
veor d26,d25 @ Sigma1(e)
vadd.i64 d27,d29,d20
vshr.u64 d25,d21,#34
vshr.u64 d26,d21,#39
vbsl d30,d23,d22 @ Maj(a,b,c)
veor d20,d26 @ Sigma0(a)
vadd.i64 d20,d30 @ h+=Maj from the past
vext.8 q14,q2,q3,#8 @ X[i+1]
veor q15,q13 @ sigma1(X[i+14])
vext.8 q14,q6,q7,#8 @ X[i+9]
vshr.u64 d24,d16,#14 @ from NEON_00_15
vshr.u64 d25,d16,#18 @ from NEON_00_15
veor q15,q13 @ sigma0(X[i+1])
vshr.u64 d26,d16,#41 @ from NEON_00_15
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 20<16 && defined(__ARMEL__)
vbsl d29,d17,d18 @ Ch(e,f,g)
vshr.u64 d24,d20,#28
veor d26,d25 @ Sigma1(e)
vadd.i64 d27,d29,d19
vshr.u64 d25,d20,#34
vshr.u64 d26,d20,#39
vbsl d30,d22,d21 @ Maj(a,b,c)
veor d19,d26 @ Sigma0(a)
vshr.u64 d24,d23,#14 @ 21
vld1.64 {d5},[r1]! @ handles unaligned
vshr.u64 d25,d23,#18
vadd.i64 d19,d30 @ h+=Maj from the past
vshr.u64 d26,d23,#41
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 21<16 && defined(__ARMEL__)
vbsl d29,d16,d17 @ Ch(e,f,g)
vshr.u64 d24,d19,#28
veor d26,d25 @ Sigma1(e)
vadd.i64 d27,d29,d18
vshr.u64 d25,d19,#34
vshr.u64 d26,d19,#39
vbsl d30,d21,d20 @ Maj(a,b,c)
veor d18,d26 @ Sigma0(a)
vadd.i64 d18,d30 @ h+=Maj from the past
vext.8 q14,q3,q4,#8 @ X[i+1]
veor q15,q13 @ sigma1(X[i+14])
vext.8 q14,q7,q0,#8 @ X[i+9]
vshr.u64 d24,d22,#14 @ from NEON_00_15
vshr.u64 d25,d22,#18 @ from NEON_00_15
veor q15,q13 @ sigma0(X[i+1])
vshr.u64 d26,d22,#41 @ from NEON_00_15
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 22<16 && defined(__ARMEL__)
vbsl d29,d23,d16 @ Ch(e,f,g)
vshr.u64 d24,d18,#28
veor d26,d25 @ Sigma1(e)
vadd.i64 d27,d29,d17
vshr.u64 d25,d18,#34
vshr.u64 d26,d18,#39
vbsl d30,d20,d19 @ Maj(a,b,c)
veor d17,d26 @ Sigma0(a)
vshr.u64 d24,d21,#14 @ 23
vld1.64 {d7},[r1]! @ handles unaligned
vshr.u64 d25,d21,#18
vadd.i64 d17,d30 @ h+=Maj from the past
vshr.u64 d26,d21,#41
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 23<16 && defined(__ARMEL__)
vbsl d29,d22,d23 @ Ch(e,f,g)
vshr.u64 d24,d17,#28
veor d26,d25 @ Sigma1(e)
vadd.i64 d27,d29,d16
vshr.u64 d25,d17,#34
vshr.u64 d26,d17,#39
vbsl d30,d19,d18 @ Maj(a,b,c)
veor d16,d26 @ Sigma0(a)
vadd.i64 d16,d30 @ h+=Maj from the past
vext.8 q14,q4,q5,#8 @ X[i+1]
veor q15,q13 @ sigma1(X[i+14])
vext.8 q14,q0,q1,#8 @ X[i+9]
vshr.u64 d24,d20,#14 @ from NEON_00_15
vshr.u64 d25,d20,#18 @ from NEON_00_15
veor q15,q13 @ sigma0(X[i+1])
vshr.u64 d26,d20,#41 @ from NEON_00_15
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 24<16 && defined(__ARMEL__)
vbsl d29,d21,d22 @ Ch(e,f,g)
vshr.u64 d24,d16,#28
veor d26,d25 @ Sigma1(e)
vadd.i64 d27,d29,d23
vshr.u64 d25,d16,#34
vshr.u64 d26,d16,#39
vbsl d30,d18,d17 @ Maj(a,b,c)
veor d23,d26 @ Sigma0(a)
vshr.u64 d24,d19,#14 @ 25
vld1.64 {d9},[r1]! @ handles unaligned
vshr.u64 d25,d19,#18
vadd.i64 d23,d30 @ h+=Maj from the past
vshr.u64 d26,d19,#41
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 25<16 && defined(__ARMEL__)
vbsl d29,d20,d21 @ Ch(e,f,g)
vshr.u64 d24,d23,#28
veor d26,d25 @ Sigma1(e)
vadd.i64 d27,d29,d22
vshr.u64 d25,d23,#34
vshr.u64 d26,d23,#39
vbsl d30,d17,d16 @ Maj(a,b,c)
veor d22,d26 @ Sigma0(a)
vadd.i64 d22,d30 @ h+=Maj from the past
vext.8 q14,q5,q6,#8 @ X[i+1]
veor q15,q13 @ sigma1(X[i+14])
vext.8 q14,q1,q2,#8 @ X[i+9]
vshr.u64 d24,d18,#14 @ from NEON_00_15
vshr.u64 d25,d18,#18 @ from NEON_00_15
veor q15,q13 @ sigma0(X[i+1])
vshr.u64 d26,d18,#41 @ from NEON_00_15
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 26<16 && defined(__ARMEL__)
vbsl d29,d19,d20 @ Ch(e,f,g)
vshr.u64 d24,d22,#28
veor d26,d25 @ Sigma1(e)
vadd.i64 d27,d29,d21
vshr.u64 d25,d22,#34
vshr.u64 d26,d22,#39
vbsl d30,d16,d23 @ Maj(a,b,c)
veor d21,d26 @ Sigma0(a)
vshr.u64 d24,d17,#14 @ 27
vld1.64 {d11},[r1]! @ handles unaligned
vshr.u64 d25,d17,#18
vadd.i64 d21,d30 @ h+=Maj from the past
vshr.u64 d26,d17,#41
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 27<16 && defined(__ARMEL__)
vbsl d29,d18,d19 @ Ch(e,f,g)
vshr.u64 d24,d21,#28
veor d26,d25 @ Sigma1(e)
vadd.i64 d27,d29,d20
vshr.u64 d25,d21,#34
vshr.u64 d26,d21,#39
vbsl d30,d23,d22 @ Maj(a,b,c)
veor d20,d26 @ Sigma0(a)
vadd.i64 d20,d30 @ h+=Maj from the past
vext.8 q14,q6,q7,#8 @ X[i+1]
veor q15,q13 @ sigma1(X[i+14])
vext.8 q14,q2,q3,#8 @ X[i+9]
vshr.u64 d24,d16,#14 @ from NEON_00_15
vshr.u64 d25,d16,#18 @ from NEON_00_15
veor q15,q13 @ sigma0(X[i+1])
vshr.u64 d26,d16,#41 @ from NEON_00_15
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 28<16 && defined(__ARMEL__)
vbsl d29,d17,d18 @ Ch(e,f,g)
vshr.u64 d24,d20,#28
veor d26,d25 @ Sigma1(e)
vadd.i64 d27,d29,d19
vshr.u64 d25,d20,#34
vshr.u64 d26,d20,#39
vbsl d30,d22,d21 @ Maj(a,b,c)
veor d19,d26 @ Sigma0(a)
vshr.u64 d24,d23,#14 @ 29
vld1.64 {d13},[r1]! @ handles unaligned
vshr.u64 d25,d23,#18
vadd.i64 d19,d30 @ h+=Maj from the past
vshr.u64 d26,d23,#41
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 29<16 && defined(__ARMEL__)
vbsl d29,d16,d17 @ Ch(e,f,g)
vshr.u64 d24,d19,#28
veor d26,d25 @ Sigma1(e)
vadd.i64 d27,d29,d18
vshr.u64 d25,d19,#34
vshr.u64 d26,d19,#39
vbsl d30,d21,d20 @ Maj(a,b,c)
veor d18,d26 @ Sigma0(a)
vadd.i64 d18,d30 @ h+=Maj from the past
vext.8 q14,q7,q0,#8 @ X[i+1]
veor q15,q13 @ sigma1(X[i+14])
vext.8 q14,q3,q4,#8 @ X[i+9]
vshr.u64 d24,d22,#14 @ from NEON_00_15
vshr.u64 d25,d22,#18 @ from NEON_00_15
veor q15,q13 @ sigma0(X[i+1])
vshr.u64 d26,d22,#41 @ from NEON_00_15
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 30<16 && defined(__ARMEL__)
vbsl d29,d23,d16 @ Ch(e,f,g)
vshr.u64 d24,d18,#28
veor d26,d25 @ Sigma1(e)
vadd.i64 d27,d29,d17
vshr.u64 d25,d18,#34
vshr.u64 d26,d18,#39
vbsl d30,d20,d19 @ Maj(a,b,c)
veor d17,d26 @ Sigma0(a)
vshr.u64 d24,d21,#14 @ 31
vld1.64 {d15},[r1]! @ handles unaligned
vshr.u64 d25,d21,#18
vadd.i64 d17,d30 @ h+=Maj from the past
vshr.u64 d26,d21,#41
vld1.64 {d28},[r3,:64]! @ K[i++]
#if 31<16 && defined(__ARMEL__)
vbsl d29,d22,d23 @ Ch(e,f,g)
vshr.u64 d24,d17,#28
veor d26,d25 @ Sigma1(e)
vadd.i64 d27,d29,d16
vshr.u64 d25,d17,#34
vshr.u64 d26,d17,#39
vbsl d30,d19,d18 @ Maj(a,b,c)
veor d16,d26 @ Sigma0(a)
vadd.i64 d16,d30 @ h+=Maj from the past
vldmia r0,{d24-d31} @ load context to temp
vadd.i64 q8,q12 @ vectorized accumulate
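@ q8 aliases {d16,d17} and q12 {d24,d25}, so each vadd.i64 here folds
@ two 64-bit state words back into the hash value at once.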
vstmia r0,{d16-d23} @ save context
sub r3,#640 @ rewind K512
bx lr @ .word 0xe12fff1e
.size sha512_block_data_order_neon,.-sha512_block_data_order_neon
.asciz "SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro@openssl.org>"
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.comm OPENSSL_armcap_P,4,4