########################################################################
# Implement fast SHA-256 with AVX2 instructions. (x86_64)
#
# Copyright (C) 2013 Intel Corporation.
#
# Authors:
#     James Guilford <james.guilford@intel.com>
#     Kirk Yap <kirk.s.yap@intel.com>
#     Tim Chen <tim.c.chen@linux.intel.com>
#
# This software is available to you under a choice of one of two
# licenses.  You may choose to be licensed under the terms of the GNU
# General Public License (GPL) Version 2, available from the file
# COPYING in the main directory of this source tree, or the
# OpenIB.org BSD license below:
#
#     Redistribution and use in source and binary forms, with or
#     without modification, are permitted provided that the following
#     conditions are met:
#
#      - Redistributions of source code must retain the above
#        copyright notice, this list of conditions and the following
#        disclaimer.
#
#      - Redistributions in binary form must reproduce the above
#        copyright notice, this list of conditions and the following
#        disclaimer in the documentation and/or other materials
#        provided with the distribution.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
########################################################################
#
# This code is described in an Intel White-Paper:
# "Fast SHA-256 Implementations on Intel Architecture Processors"
#
# To find it, surf to http://www.intel.com/p/en_US/embedded
# and search for that title.
#
########################################################################
# This code schedules 2 blocks at a time, with 4 lanes per block
########################################################################
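#
# Overview: the message schedule for the two 64-byte blocks is computed in
# parallel, with the first block's dwords kept in the low 128-bit lane of
# each YMM register and the second block's in the high lane. The rounds of
# the first block are interleaved with that scheduling; the second block is
# then processed on its own, reusing the W + K values saved to the stack.
########################################################################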
#include <linux/linkage.h>

## assume buffers not aligned
#define VMOVDQ vmovdqu

################################ Define Macros

# Add reg to mem using reg-mem add and store

################################

# XMM versions of above

SHUF_00BA = %ymm10              # shuffle xBxA -> 00BA
SHUF_DC00 = %ymm12              # shuffle xDxC -> DC00
BYTE_FLIP_MASK = %ymm13

X_BYTE_FLIP_MASK = %xmm13       # XMM version of BYTE_FLIP_MASK

NUM_BLKS = %rdx                 # 3rd arg

e = %edx                        # clobbers NUM_BLKS
y3 = %esi                       # clobbers INP

SRND = CTX                      # SRND is same register as CTX
_XFER_SIZE = 2*64*4             # 2 blocks, 64 rounds, 4 bytes/round

_XMM_SAVE = _XFER + _XFER_SIZE
_INP_END = _XMM_SAVE + _XMM_SAVE_SIZE
_INP = _INP_END + _INP_END_SIZE
_CTX = _INP + _INP_SIZE
_RSP = _CTX + _CTX_SIZE
STACK_SIZE = _RSP + _RSP_SIZE
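
# Frame layout implied by the offsets above, from low to high addresses:
# the _XFER area (the pre-added w + k dwords for both blocks), the XMM save
# area, then the saved INP_END, INP, CTX and original RSP values.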
# Rotate values of symbols X0...X3

# Rotate values of symbols a...h
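
# FOUR_ROUNDS_AND_SCHED below interleaves four SHA-256 rounds for the current
# block with the computation of four future message-schedule words. For
# reference, a pseudocode sketch of one round, written in the same form as the
# inline comments (a..h are the working variables; the first instruction of
# each round adds the precomputed k + w dword from the _XFER area into h):
#
#       h  += k + w
#       S1  = (e ror 6) ^ (e ror 11) ^ (e ror 25)
#       ch  = ((f ^ g) & e) ^ g
#       S0  = (a ror 2) ^ (a ror 13) ^ (a ror 22)
#       maj = ((a | c) & b) | (a & c)
#       t1  = h + S1 + ch
#       d  += t1
#       h   = t1 + S0 + maj
#       (the symbols a..h are then rotated for the next round)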
.macro FOUR_ROUNDS_AND_SCHED disp
################################### RND N + 0 ############################

        mov a, y3 # y3 = a # MAJA
        rorx $25, e, y0 # y0 = e >> 25 # S1A
        rorx $11, e, y1 # y1 = e >> 11 # S1B

        addl \disp(%rsp, SRND), h # h = k + w + h # --
        or c, y3 # y3 = a|c # MAJA
        vpalignr $4, X2, X3, XTMP0 # XTMP0 = W[-7]
        mov f, y2 # y2 = f # CH
        rorx $13, a, T1 # T1 = a >> 13 # S0B

        xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
        xor g, y2 # y2 = f^g # CH
        vpaddd X0, XTMP0, XTMP0 # XTMP0 = W[-7] + W[-16]
        rorx $6, e, y1 # y1 = (e >> 6) # S1

        and e, y2 # y2 = (f^g)&e # CH
        xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
        rorx $22, a, y1 # y1 = a >> 22 # S0A
        add h, d # d = k + w + h + d # --

        and b, y3 # y3 = (a|c)&b # MAJA
        vpalignr $4, X0, X1, XTMP1 # XTMP1 = W[-15]
        xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
        rorx $2, a, T1 # T1 = (a >> 2) # S0

        xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
        vpsrld $7, XTMP1, XTMP2
        xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
        mov a, T1 # T1 = a # MAJB
        and c, T1 # T1 = a&c # MAJB

        add y0, y2 # y2 = S1 + CH # --
        vpslld $(32-7), XTMP1, XTMP3
        or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
        add y1, h # h = k + w + h + S0 # --

        add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
        vpor XTMP2, XTMP3, XTMP3 # XTMP3 = W[-15] ror 7

        vpsrld $18, XTMP1, XTMP2
        add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0 # --
        add y3, h # h = t1 + S0 + MAJ # --
################################### RND N + 1 ############################

        mov a, y3 # y3 = a # MAJA
        rorx $25, e, y0 # y0 = e >> 25 # S1A
        rorx $11, e, y1 # y1 = e >> 11 # S1B

        addl offset(%rsp, SRND), h # h = k + w + h # --
        or c, y3 # y3 = a|c # MAJA

        vpsrld $3, XTMP1, XTMP4 # XTMP4 = W[-15] >> 3
        mov f, y2 # y2 = f # CH
        rorx $13, a, T1 # T1 = a >> 13 # S0B
        xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
        xor g, y2 # y2 = f^g # CH

        rorx $6, e, y1 # y1 = (e >> 6) # S1
        xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
        rorx $22, a, y1 # y1 = a >> 22 # S0A
        and e, y2 # y2 = (f^g)&e # CH
        add h, d # d = k + w + h + d # --

        vpslld $(32-18), XTMP1, XTMP1
        and b, y3 # y3 = (a|c)&b # MAJA
        xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0

        vpxor XTMP1, XTMP3, XTMP3
        rorx $2, a, T1 # T1 = (a >> 2) # S0
        xor g, y2 # y2 = CH = ((f^g)&e)^g # CH

        vpxor XTMP2, XTMP3, XTMP3 # XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
        xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
        mov a, T1 # T1 = a # MAJB
        and c, T1 # T1 = a&c # MAJB
        add y0, y2 # y2 = S1 + CH # --

        vpxor XTMP4, XTMP3, XTMP1 # XTMP1 = s0
        vpshufd $0b11111010, X3, XTMP2 # XTMP2 = W[-2] {BBAA}
        or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
        add y1, h # h = k + w + h + S0 # --

        vpaddd XTMP1, XTMP0, XTMP0 # XTMP0 = W[-16] + W[-7] + s0
        add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
        add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0 # --
        add y3, h # h = t1 + S0 + MAJ # --

        vpsrld $10, XTMP2, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA}
################################### RND N + 2 ############################

        mov a, y3 # y3 = a # MAJA
        rorx $25, e, y0 # y0 = e >> 25 # S1A

        addl offset(%rsp, SRND), h # h = k + w + h # --

        vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xBxA}
        rorx $11, e, y1 # y1 = e >> 11 # S1B
        or c, y3 # y3 = a|c # MAJA
        mov f, y2 # y2 = f # CH
        xor g, y2 # y2 = f^g # CH

        rorx $13, a, T1 # T1 = a >> 13 # S0B
        xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
        vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] ror 17 {xBxA}
        and e, y2 # y2 = (f^g)&e # CH

        rorx $6, e, y1 # y1 = (e >> 6) # S1
        vpxor XTMP3, XTMP2, XTMP2
        add h, d # d = k + w + h + d # --
        and b, y3 # y3 = (a|c)&b # MAJA

        xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
        rorx $22, a, y1 # y1 = a >> 22 # S0A
        vpxor XTMP2, XTMP4, XTMP4 # XTMP4 = s1 {xBxA}
        xor g, y2 # y2 = CH = ((f^g)&e)^g # CH

        vpshufb SHUF_00BA, XTMP4, XTMP4 # XTMP4 = s1 {00BA}
        xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
        rorx $2, a, T1 # T1 = (a >> 2) # S0
        vpaddd XTMP4, XTMP0, XTMP0 # XTMP0 = {..., ..., W[1], W[0]}

        xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
        mov a, T1 # T1 = a # MAJB
        and c, T1 # T1 = a&c # MAJB
        add y0, y2 # y2 = S1 + CH # --
        vpshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC}

        or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
        add y1, h # h = k + w + h + S0 # --
        add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
        add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0 # --

        add y3, h # h = t1 + S0 + MAJ # --
################################### RND N + 3 ############################

        mov a, y3 # y3 = a # MAJA
        rorx $25, e, y0 # y0 = e >> 25 # S1A
        rorx $11, e, y1 # y1 = e >> 11 # S1B

        addl offset(%rsp, SRND), h # h = k + w + h # --
        or c, y3 # y3 = a|c # MAJA

        vpsrld $10, XTMP2, XTMP5 # XTMP5 = W[-2] >> 10 {DDCC}
        mov f, y2 # y2 = f # CH
        rorx $13, a, T1 # T1 = a >> 13 # S0B
        xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
        xor g, y2 # y2 = f^g # CH

        vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xDxC}
        rorx $6, e, y1 # y1 = (e >> 6) # S1
        and e, y2 # y2 = (f^g)&e # CH
        add h, d # d = k + w + h + d # --
        and b, y3 # y3 = (a|c)&b # MAJA

        vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] ror 17 {xDxC}
        xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
        xor g, y2 # y2 = CH = ((f^g)&e)^g # CH

        vpxor XTMP3, XTMP2, XTMP2
        rorx $22, a, y1 # y1 = a >> 22 # S0A
        add y0, y2 # y2 = S1 + CH # --

        vpxor XTMP2, XTMP5, XTMP5 # XTMP5 = s1 {xDxC}
        xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
        add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --

        rorx $2, a, T1 # T1 = (a >> 2) # S0
        vpshufb SHUF_DC00, XTMP5, XTMP5 # XTMP5 = s1 {DC00}

        vpaddd XTMP0, XTMP5, X0 # X0 = {W[3], W[2], W[1], W[0]}
        xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
        mov a, T1 # T1 = a # MAJB
        and c, T1 # T1 = a&c # MAJB
        or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ

        add y1, h # h = k + w + h + S0 # --
        add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0 # --
        add y3, h # h = t1 + S0 + MAJ # --
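
# DO_4ROUNDS below performs four rounds of the compression function with no
# message scheduling. It is used for the last 16 rounds of the first block and
# for the whole second block, which reuses the W + K dwords already saved in
# the _XFER area by the scheduled pass.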
.macro DO_4ROUNDS disp
################################### RND N + 0 ###########################

        mov f, y2 # y2 = f # CH
        rorx $25, e, y0 # y0 = e >> 25 # S1A
        rorx $11, e, y1 # y1 = e >> 11 # S1B
        xor g, y2 # y2 = f^g # CH

        xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
        rorx $6, e, y1 # y1 = (e >> 6) # S1
        and e, y2 # y2 = (f^g)&e # CH

        xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
        rorx $13, a, T1 # T1 = a >> 13 # S0B
        xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
        rorx $22, a, y1 # y1 = a >> 22 # S0A
        mov a, y3 # y3 = a # MAJA

        xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
        rorx $2, a, T1 # T1 = (a >> 2) # S0
        addl \disp(%rsp, SRND), h # h = k + w + h # --
        or c, y3 # y3 = a|c # MAJA

        xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
        mov a, T1 # T1 = a # MAJB
        and b, y3 # y3 = (a|c)&b # MAJA
        and c, T1 # T1 = a&c # MAJB
        add y0, y2 # y2 = S1 + CH # --

        add h, d # d = k + w + h + d # --
        or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
        add y1, h # h = k + w + h + S0 # --
        add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
################################### RND N + 1 ###########################

        add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0 # --
        mov f, y2 # y2 = f # CH
        rorx $25, e, y0 # y0 = e >> 25 # S1A
        rorx $11, e, y1 # y1 = e >> 11 # S1B
        xor g, y2 # y2 = f^g # CH

        xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
        rorx $6, e, y1 # y1 = (e >> 6) # S1
        and e, y2 # y2 = (f^g)&e # CH
        add y3, old_h # h = t1 + S0 + MAJ # --

        xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
        rorx $13, a, T1 # T1 = a >> 13 # S0B
        xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
        rorx $22, a, y1 # y1 = a >> 22 # S0A
        mov a, y3 # y3 = a # MAJA

        xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
        rorx $2, a, T1 # T1 = (a >> 2) # S0

        addl offset(%rsp, SRND), h # h = k + w + h # --
        or c, y3 # y3 = a|c # MAJA

        xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
        mov a, T1 # T1 = a # MAJB
        and b, y3 # y3 = (a|c)&b # MAJA
        and c, T1 # T1 = a&c # MAJB
        add y0, y2 # y2 = S1 + CH # --

        add h, d # d = k + w + h + d # --
        or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
        add y1, h # h = k + w + h + S0 # --

        add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
################################### RND N + 2 ##############################

        add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0 # --
        mov f, y2 # y2 = f # CH
        rorx $25, e, y0 # y0 = e >> 25 # S1A
        rorx $11, e, y1 # y1 = e >> 11 # S1B
        xor g, y2 # y2 = f^g # CH

        xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
        rorx $6, e, y1 # y1 = (e >> 6) # S1
        and e, y2 # y2 = (f^g)&e # CH
        add y3, old_h # h = t1 + S0 + MAJ # --

        xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
        rorx $13, a, T1 # T1 = a >> 13 # S0B
        xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
        rorx $22, a, y1 # y1 = a >> 22 # S0A
        mov a, y3 # y3 = a # MAJA

        xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
        rorx $2, a, T1 # T1 = (a >> 2) # S0

        addl offset(%rsp, SRND), h # h = k + w + h # --
        or c, y3 # y3 = a|c # MAJA

        xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
        mov a, T1 # T1 = a # MAJB
        and b, y3 # y3 = (a|c)&b # MAJA
        and c, T1 # T1 = a&c # MAJB
        add y0, y2 # y2 = S1 + CH # --

        add h, d # d = k + w + h + d # --
        or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
        add y1, h # h = k + w + h + S0 # --

        add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
################################### RND N + 3 ###########################

        add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0 # --
        mov f, y2 # y2 = f # CH
        rorx $25, e, y0 # y0 = e >> 25 # S1A
        rorx $11, e, y1 # y1 = e >> 11 # S1B
        xor g, y2 # y2 = f^g # CH

        xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
        rorx $6, e, y1 # y1 = (e >> 6) # S1
        and e, y2 # y2 = (f^g)&e # CH
        add y3, old_h # h = t1 + S0 + MAJ # --

        xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
        rorx $13, a, T1 # T1 = a >> 13 # S0B
        xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
        rorx $22, a, y1 # y1 = a >> 22 # S0A
        mov a, y3 # y3 = a # MAJA

        xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
        rorx $2, a, T1 # T1 = (a >> 2) # S0

        addl offset(%rsp, SRND), h # h = k + w + h # --
        or c, y3 # y3 = a|c # MAJA

        xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
        mov a, T1 # T1 = a # MAJB
        and b, y3 # y3 = (a|c)&b # MAJA
        and c, T1 # T1 = a&c # MAJB
        add y0, y2 # y2 = S1 + CH # --

        add h, d # d = k + w + h + d # --
        or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
        add y1, h # h = k + w + h + S0 # --

        add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --

        add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0 # --

        add y3, h # h = t1 + S0 + MAJ # --
########################################################################
## void sha256_transform_rorx(UINT32 digest[8], void *input_data, UINT64 num_blks)
## arg 1 : pointer to digest
## arg 2 : pointer to input data
## arg 3 : Num blocks
########################################################################
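## A minimal C-side sketch of the declaration and call, inferred from the
## argument list above (illustrative only; the exact types and names used by
## the C glue code may differ):
##
##      asmlinkage void sha256_transform_rorx(u32 *digest, const void *data,
##                                            u64 num_blks);
##      /* e.g.: sha256_transform_rorx(state, data, num_blks); */
########################################################################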
ENTRY(sha256_transform_rorx)
        subq $STACK_SIZE, %rsp
        and $-32, %rsp # align rsp to 32 byte boundary

        shl $6, NUM_BLKS # convert to bytes
        lea -64(INP, NUM_BLKS), NUM_BLKS # pointer to last block
        mov NUM_BLKS, _INP_END(%rsp)

        ## load initial digest

        vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
        vmovdqa _SHUF_00BA(%rip), SHUF_00BA
        vmovdqa _SHUF_DC00(%rip), SHUF_DC00

        ## Load first 16 dwords from two blocks
        VMOVDQ 0*32(INP), XTMP0
        VMOVDQ 1*32(INP), XTMP1
        VMOVDQ 2*32(INP), XTMP2
        VMOVDQ 3*32(INP), XTMP3

        vpshufb BYTE_FLIP_MASK, XTMP0, XTMP0
        vpshufb BYTE_FLIP_MASK, XTMP1, XTMP1
        vpshufb BYTE_FLIP_MASK, XTMP2, XTMP2
        vpshufb BYTE_FLIP_MASK, XTMP3, XTMP3

        ## transpose data into high/low halves
        vperm2i128 $0x20, XTMP2, XTMP0, X0
        vperm2i128 $0x31, XTMP2, XTMP0, X1
        vperm2i128 $0x20, XTMP3, XTMP1, X2
        vperm2i128 $0x31, XTMP3, XTMP1, X3
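
        ## After the transpose, each of X0..X3 should hold four dwords of the
        ## first block in its low 128-bit lane and the matching four dwords of
        ## the second block in its high lane.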
        ## schedule 48 input dwords, by doing 3 rounds of 12 each

        vpaddd K256+0*32(SRND), X0, XFER
        vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
        FOUR_ROUNDS_AND_SCHED _XFER + 0*32

        vpaddd K256+1*32(SRND), X0, XFER
        vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
        FOUR_ROUNDS_AND_SCHED _XFER + 1*32

        vpaddd K256+2*32(SRND), X0, XFER
        vmovdqa XFER, 2*32+_XFER(%rsp, SRND)
        FOUR_ROUNDS_AND_SCHED _XFER + 2*32

        vpaddd K256+3*32(SRND), X0, XFER
        vmovdqa XFER, 3*32+_XFER(%rsp, SRND)
        FOUR_ROUNDS_AND_SCHED _XFER + 3*32

        ## Do last 16 rounds with no scheduling
        vpaddd K256+0*32(SRND), X0, XFER
        vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
        DO_4ROUNDS _XFER + 0*32

        vpaddd K256+1*32(SRND), X1, XFER
        vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
        DO_4ROUNDS _XFER + 1*32
        cmp _INP_END(%rsp), INP

        #### Do second block using previously scheduled results
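        ## The "+ 16" displacement below selects the upper 128-bit half of
        ## each 32-byte _XFER entry, i.e. the w + k dwords that were scheduled
        ## for the second block.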
        DO_4ROUNDS _XFER + 0*32 + 16
        DO_4ROUNDS _XFER + 1*32 + 16

        cmp _INP_END(%rsp), INP

        VMOVDQ 0*16(INP), XWORD0
        VMOVDQ 1*16(INP), XWORD1
        VMOVDQ 2*16(INP), XWORD2
        VMOVDQ 3*16(INP), XWORD3

        vpshufb X_BYTE_FLIP_MASK, XWORD0, XWORD0
        vpshufb X_BYTE_FLIP_MASK, XWORD1, XWORD1
        vpshufb X_BYTE_FLIP_MASK, XWORD2, XWORD2
        vpshufb X_BYTE_FLIP_MASK, XWORD3, XWORD3

        ## load initial digest

        vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
        vmovdqa _SHUF_00BA(%rip), SHUF_00BA
        vmovdqa _SHUF_DC00(%rip), SHUF_DC00

ENDPROC(sha256_transform_rorx)
.section .rodata.cst512.K256, "aM", @progbits, 512
K256:
        .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
        .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
        .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
        .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
        .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
        .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
        .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
        .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
        .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
        .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
        .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
        .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
        .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
        .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
        .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
        .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
        .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
        .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
        .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
        .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
        .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
        .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
        .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
        .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
        .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
        .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
        .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
        .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
        .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
        .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
        .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
        .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
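
# Each row of constants above is stored twice so that a single vpaddd adds the
# same round constants to both 128-bit lanes, i.e. to both blocks at once.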
.section .rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
PSHUFFLE_BYTE_FLIP_MASK:
        .octa 0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203
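# The mask reverses the byte order within each 32-bit dword, converting the
# big-endian message words of SHA-256 into the little-endian form used by the
# dword arithmetic above.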
# shuffle xBxA -> 00BA
.section .rodata.cst32._SHUF_00BA, "aM", @progbits, 32
_SHUF_00BA:
        .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100

# shuffle xDxC -> DC00
.section .rodata.cst32._SHUF_DC00, "aM", @progbits, 32
_SHUF_DC00:
        .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF