GNU Linux-libre 4.14.266-gnu1
[releases.git] / arch / x86 / crypto / chacha20-ssse3-x86_64.S
1 /*
2  * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
3  *
4  * Copyright (C) 2015 Martin Willi
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 2 of the License, or
9  * (at your option) any later version.
10  */
11
12 #include <linux/linkage.h>
13
14 .section        .rodata.cst16.ROT8, "aM", @progbits, 16  # read-only, mergeable section of 16-byte constants
15 .align 16  # the constants are fetched with movdqa, which needs 16-byte alignment
16 ROT8:   .octa 0x0e0d0c0f0a09080b0605040702010003  # pshufb control mask: rotates every 32-bit lane left by 8 bits
17 .section        .rodata.cst16.ROT16, "aM", @progbits, 16
18 .align 16
19 ROT16:  .octa 0x0d0c0f0e09080b0a0504070601000302  # pshufb control mask: rotates every 32-bit lane left by 16 bits
20 .section        .rodata.cst16.CTRINC, "aM", @progbits, 16
21 .align 16
22 CTRINC: .octa 0x00000003000000020000000100000000  # 32-bit lanes hold 0,1,2,3: added to x12 in the 4-block routine
23
24 .text
25
26 ENTRY(chacha20_block_xor_ssse3)
27         # %rdi: Input state matrix, s (64 bytes, read only; loaded with movdqa, so it must be 16-byte aligned)
28         # %rsi: 1 data block output, o (64 bytes, stored with movdqu, no alignment needed)
29         # %rdx: 1 data block input, i (64 bytes, loaded with movdqu, no alignment needed)
30
31         # This function encrypts one ChaCha20 block by loading the state matrix
32         # in four SSE registers (xmm0..3, one row per register). It performs
33         # matrix operation on four words in parallel, but requires shuffling to
34         # rearrange the words after each round. 8/16-bit word rotation is done
35         # with the slightly better performing SSSE3 byte shuffling, 7/12-bit
36         # word rotation uses traditional shift+OR. Clobbers xmm0-11, %rcx, flags.
37
38         # x0..3 = s0..3
39         movdqa          0x00(%rdi),%xmm0
40         movdqa          0x10(%rdi),%xmm1
41         movdqa          0x20(%rdi),%xmm2
42         movdqa          0x30(%rdi),%xmm3
43         movdqa          %xmm0,%xmm8  # xmm8..11 keep the unmodified rows for the final x + s addition
44         movdqa          %xmm1,%xmm9
45         movdqa          %xmm2,%xmm10
46         movdqa          %xmm3,%xmm11
47
48         movdqa          ROT8(%rip),%xmm4  # xmm4 = pshufb mask for rotl32 by 8
49         movdqa          ROT16(%rip),%xmm5  # xmm5 = pshufb mask for rotl32 by 16
50
51         mov     $10,%ecx  # loop counter: ten passes, each pass performs two rounds
52
53 .Ldoubleround:
54
55         # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
56         paddd           %xmm1,%xmm0
57         pxor            %xmm0,%xmm3
58         pshufb          %xmm5,%xmm3  # byte shuffle with the ROT16 mask performs the rotate
59
60         # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
61         paddd           %xmm3,%xmm2
62         pxor            %xmm2,%xmm1
63         movdqa          %xmm1,%xmm6  # xmm6 = scratch copy that receives the left-shifted part
64         pslld           $12,%xmm6
65         psrld           $20,%xmm1
66         por             %xmm6,%xmm1  # (v << 12) | (v >> 20) = rotl32(v, 12)
67
68         # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
69         paddd           %xmm1,%xmm0
70         pxor            %xmm0,%xmm3
71         pshufb          %xmm4,%xmm3  # byte shuffle with the ROT8 mask performs the rotate
72
73         # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
74         paddd           %xmm3,%xmm2
75         pxor            %xmm2,%xmm1
76         movdqa          %xmm1,%xmm7  # xmm7 = scratch copy that receives the left-shifted part
77         pslld           $7,%xmm7
78         psrld           $25,%xmm1
79         por             %xmm7,%xmm1  # (v << 7) | (v >> 25) = rotl32(v, 7)
80
81         # x1 = shuffle32(x1, MASK(0, 3, 2, 1)) - lane n takes word n+1, so the diagonals line up as columns
82         pshufd          $0x39,%xmm1,%xmm1
83         # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - lane n takes word n+2
84         pshufd          $0x4e,%xmm2,%xmm2
85         # x3 = shuffle32(x3, MASK(2, 1, 0, 3)) - lane n takes word n+3
86         pshufd          $0x93,%xmm3,%xmm3
87
88         # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
89         paddd           %xmm1,%xmm0
90         pxor            %xmm0,%xmm3
91         pshufb          %xmm5,%xmm3
92
93         # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
94         paddd           %xmm3,%xmm2
95         pxor            %xmm2,%xmm1
96         movdqa          %xmm1,%xmm6
97         pslld           $12,%xmm6
98         psrld           $20,%xmm1
99         por             %xmm6,%xmm1
100
101         # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
102         paddd           %xmm1,%xmm0
103         pxor            %xmm0,%xmm3
104         pshufb          %xmm4,%xmm3
105
106         # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
107         paddd           %xmm3,%xmm2
108         pxor            %xmm2,%xmm1
109         movdqa          %xmm1,%xmm7
110         pslld           $7,%xmm7
111         psrld           $25,%xmm1
112         por             %xmm7,%xmm1
113
114         # x1 = shuffle32(x1, MASK(2, 1, 0, 3)) - inverse of the shuffle above, restores the row layout
115         pshufd          $0x93,%xmm1,%xmm1
116         # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - swapping the halves again undoes the earlier swap
117         pshufd          $0x4e,%xmm2,%xmm2
118         # x3 = shuffle32(x3, MASK(0, 3, 2, 1)) - inverse of the shuffle above, restores the row layout
119         pshufd          $0x39,%xmm3,%xmm3
120
121         dec             %ecx
122         jnz             .Ldoubleround  # loop until all ten double rounds are done
123
124         # o0 = i0 ^ (x0 + s0)
125         movdqu          0x00(%rdx),%xmm4  # the rotate masks are dead now, xmm4..7 are reused for input data
126         paddd           %xmm8,%xmm0
127         pxor            %xmm4,%xmm0
128         movdqu          %xmm0,0x00(%rsi)
129         # o1 = i1 ^ (x1 + s1)
130         movdqu          0x10(%rdx),%xmm5
131         paddd           %xmm9,%xmm1
132         pxor            %xmm5,%xmm1
133         movdqu          %xmm1,0x10(%rsi)
134         # o2 = i2 ^ (x2 + s2)
135         movdqu          0x20(%rdx),%xmm6
136         paddd           %xmm10,%xmm2
137         pxor            %xmm6,%xmm2
138         movdqu          %xmm2,0x20(%rsi)
139         # o3 = i3 ^ (x3 + s3)
140         movdqu          0x30(%rdx),%xmm7
141         paddd           %xmm11,%xmm3
142         pxor            %xmm7,%xmm3
143         movdqu          %xmm3,0x30(%rsi)
144
145         ret
146 ENDPROC(chacha20_block_xor_ssse3)
147
148 ENTRY(chacha20_4block_xor_ssse3)
149         # %rdi: Input state matrix, s (64 bytes, read only, fetched with 8-byte movq loads)
150         # %rsi: 4 data blocks output, o (256 bytes, stored with movdqu, no alignment needed)
151         # %rdx: 4 data blocks input, i (256 bytes, loaded with movdqu, no alignment needed)
152
153         # This function encrypts four consecutive ChaCha20 blocks by loading the
154         # state matrix in SSE registers four times: each state word is broadcast
155         # to all lanes of one register, and lane n works on block n. As we need
156         # some scratch registers, x0..3 are kept on the stack (x4..15 stay in
157         # xmm4..15). Each operation is performed on the corresponding word of each
158         # state matrix, hence requires no word shuffling. For final XORing step
159         # we transpose the matrix by interleaving 32- and then 64-bit words,
160         # which allows us to do XOR in SSE registers. 8/16-bit word rotation uses
161         # SSSE3 pshufb, 7/12-bit uses shift+OR. Clobbers xmm0-15, %rcx, %r10, flags.
162
163         lea             8(%rsp),%r10  # r10 = entry rsp + 8, used at the end to restore rsp
164         sub             $0x80,%rsp  # reserve scratch space; the code below touches 0x00..0x3f(%rsp) only
165         and             $~63,%rsp  # round rsp down to a 64-byte boundary so the movdqa spills are aligned
166
167         # x0..15[0-3] = s0..3[0..3]
168         movq            0x00(%rdi),%xmm1  # low 64 bits of xmm1 = state words 0 and 1
169         pshufd          $0x00,%xmm1,%xmm0  # x0 = word 0 replicated into all four lanes
170         pshufd          $0x55,%xmm1,%xmm1  # x1 = word 1 replicated into all four lanes
171         movq            0x08(%rdi),%xmm3
172         pshufd          $0x00,%xmm3,%xmm2
173         pshufd          $0x55,%xmm3,%xmm3
174         movq            0x10(%rdi),%xmm5
175         pshufd          $0x00,%xmm5,%xmm4
176         pshufd          $0x55,%xmm5,%xmm5
177         movq            0x18(%rdi),%xmm7
178         pshufd          $0x00,%xmm7,%xmm6
179         pshufd          $0x55,%xmm7,%xmm7
180         movq            0x20(%rdi),%xmm9
181         pshufd          $0x00,%xmm9,%xmm8
182         pshufd          $0x55,%xmm9,%xmm9
183         movq            0x28(%rdi),%xmm11
184         pshufd          $0x00,%xmm11,%xmm10
185         pshufd          $0x55,%xmm11,%xmm11
186         movq            0x30(%rdi),%xmm13
187         pshufd          $0x00,%xmm13,%xmm12
188         pshufd          $0x55,%xmm13,%xmm13
189         movq            0x38(%rdi),%xmm15
190         pshufd          $0x00,%xmm15,%xmm14
191         pshufd          $0x55,%xmm15,%xmm15
192         # x0..3 on stack, which frees xmm0..3 for scratch use and constants
193         movdqa          %xmm0,0x00(%rsp)
194         movdqa          %xmm1,0x10(%rsp)
195         movdqa          %xmm2,0x20(%rsp)
196         movdqa          %xmm3,0x30(%rsp)
197
198         movdqa          CTRINC(%rip),%xmm1  # xmm1 = lane offsets 0,1,2,3; stays live until after the loop
199         movdqa          ROT8(%rip),%xmm2  # xmm2 = pshufb mask for rotl32 by 8
200         movdqa          ROT16(%rip),%xmm3  # xmm3 = pshufb mask for rotl32 by 16
201
202         # x12 += counter values 0-3, so every lane works with its own counter value
203         paddd           %xmm1,%xmm12
204
205         mov             $10,%ecx  # loop counter: ten passes, each pass performs two rounds
206
207 .Ldoubleround4:
208         # x0 += x4, x12 = rotl32(x12 ^ x0, 16)
209         movdqa          0x00(%rsp),%xmm0  # x0..3 live on the stack: load into scratch xmm0, update, store back
210         paddd           %xmm4,%xmm0
211         movdqa          %xmm0,0x00(%rsp)
212         pxor            %xmm0,%xmm12
213         pshufb          %xmm3,%xmm12
214         # x1 += x5, x13 = rotl32(x13 ^ x1, 16)
215         movdqa          0x10(%rsp),%xmm0
216         paddd           %xmm5,%xmm0
217         movdqa          %xmm0,0x10(%rsp)
218         pxor            %xmm0,%xmm13
219         pshufb          %xmm3,%xmm13
220         # x2 += x6, x14 = rotl32(x14 ^ x2, 16)
221         movdqa          0x20(%rsp),%xmm0
222         paddd           %xmm6,%xmm0
223         movdqa          %xmm0,0x20(%rsp)
224         pxor            %xmm0,%xmm14
225         pshufb          %xmm3,%xmm14
226         # x3 += x7, x15 = rotl32(x15 ^ x3, 16)
227         movdqa          0x30(%rsp),%xmm0
228         paddd           %xmm7,%xmm0
229         movdqa          %xmm0,0x30(%rsp)
230         pxor            %xmm0,%xmm15
231         pshufb          %xmm3,%xmm15
232
233         # x8 += x12, x4 = rotl32(x4 ^ x8, 12)
234         paddd           %xmm12,%xmm8
235         pxor            %xmm8,%xmm4
236         movdqa          %xmm4,%xmm0  # xmm0 = scratch copy that receives the left-shifted part
237         pslld           $12,%xmm0
238         psrld           $20,%xmm4
239         por             %xmm0,%xmm4  # (v << 12) | (v >> 20) = rotl32(v, 12)
240         # x9 += x13, x5 = rotl32(x5 ^ x9, 12)
241         paddd           %xmm13,%xmm9
242         pxor            %xmm9,%xmm5
243         movdqa          %xmm5,%xmm0
244         pslld           $12,%xmm0
245         psrld           $20,%xmm5
246         por             %xmm0,%xmm5
247         # x10 += x14, x6 = rotl32(x6 ^ x10, 12)
248         paddd           %xmm14,%xmm10
249         pxor            %xmm10,%xmm6
250         movdqa          %xmm6,%xmm0
251         pslld           $12,%xmm0
252         psrld           $20,%xmm6
253         por             %xmm0,%xmm6
254         # x11 += x15, x7 = rotl32(x7 ^ x11, 12)
255         paddd           %xmm15,%xmm11
256         pxor            %xmm11,%xmm7
257         movdqa          %xmm7,%xmm0
258         pslld           $12,%xmm0
259         psrld           $20,%xmm7
260         por             %xmm0,%xmm7
261
262         # x0 += x4, x12 = rotl32(x12 ^ x0, 8)
263         movdqa          0x00(%rsp),%xmm0
264         paddd           %xmm4,%xmm0
265         movdqa          %xmm0,0x00(%rsp)
266         pxor            %xmm0,%xmm12
267         pshufb          %xmm2,%xmm12
268         # x1 += x5, x13 = rotl32(x13 ^ x1, 8)
269         movdqa          0x10(%rsp),%xmm0
270         paddd           %xmm5,%xmm0
271         movdqa          %xmm0,0x10(%rsp)
272         pxor            %xmm0,%xmm13
273         pshufb          %xmm2,%xmm13
274         # x2 += x6, x14 = rotl32(x14 ^ x2, 8)
275         movdqa          0x20(%rsp),%xmm0
276         paddd           %xmm6,%xmm0
277         movdqa          %xmm0,0x20(%rsp)
278         pxor            %xmm0,%xmm14
279         pshufb          %xmm2,%xmm14
280         # x3 += x7, x15 = rotl32(x15 ^ x3, 8)
281         movdqa          0x30(%rsp),%xmm0
282         paddd           %xmm7,%xmm0
283         movdqa          %xmm0,0x30(%rsp)
284         pxor            %xmm0,%xmm15
285         pshufb          %xmm2,%xmm15
286
287         # x8 += x12, x4 = rotl32(x4 ^ x8, 7)
288         paddd           %xmm12,%xmm8
289         pxor            %xmm8,%xmm4
290         movdqa          %xmm4,%xmm0
291         pslld           $7,%xmm0
292         psrld           $25,%xmm4
293         por             %xmm0,%xmm4  # (v << 7) | (v >> 25) = rotl32(v, 7)
294         # x9 += x13, x5 = rotl32(x5 ^ x9, 7)
295         paddd           %xmm13,%xmm9
296         pxor            %xmm9,%xmm5
297         movdqa          %xmm5,%xmm0
298         pslld           $7,%xmm0
299         psrld           $25,%xmm5
300         por             %xmm0,%xmm5
301         # x10 += x14, x6 = rotl32(x6 ^ x10, 7)
302         paddd           %xmm14,%xmm10
303         pxor            %xmm10,%xmm6
304         movdqa          %xmm6,%xmm0
305         pslld           $7,%xmm0
306         psrld           $25,%xmm6
307         por             %xmm0,%xmm6
308         # x11 += x15, x7 = rotl32(x7 ^ x11, 7)
309         paddd           %xmm15,%xmm11
310         pxor            %xmm11,%xmm7
311         movdqa          %xmm7,%xmm0
312         pslld           $7,%xmm0
313         psrld           $25,%xmm7
314         por             %xmm0,%xmm7
315
316         # x0 += x5, x15 = rotl32(x15 ^ x0, 16)  (second half: same steps on the word groups 0/5/10/15, 1/6/11/12, 2/7/8/13, 3/4/9/14)
317         movdqa          0x00(%rsp),%xmm0
318         paddd           %xmm5,%xmm0
319         movdqa          %xmm0,0x00(%rsp)
320         pxor            %xmm0,%xmm15
321         pshufb          %xmm3,%xmm15
322         # x1 += x6, x12 = rotl32(x12 ^ x1, 16)
323         movdqa          0x10(%rsp),%xmm0
324         paddd           %xmm6,%xmm0
325         movdqa          %xmm0,0x10(%rsp)
326         pxor            %xmm0,%xmm12
327         pshufb          %xmm3,%xmm12
328         # x2 += x7, x13 = rotl32(x13 ^ x2, 16)
329         movdqa          0x20(%rsp),%xmm0
330         paddd           %xmm7,%xmm0
331         movdqa          %xmm0,0x20(%rsp)
332         pxor            %xmm0,%xmm13
333         pshufb          %xmm3,%xmm13
334         # x3 += x4, x14 = rotl32(x14 ^ x3, 16)
335         movdqa          0x30(%rsp),%xmm0
336         paddd           %xmm4,%xmm0
337         movdqa          %xmm0,0x30(%rsp)
338         pxor            %xmm0,%xmm14
339         pshufb          %xmm3,%xmm14
340
341         # x10 += x15, x5 = rotl32(x5 ^ x10, 12)
342         paddd           %xmm15,%xmm10
343         pxor            %xmm10,%xmm5
344         movdqa          %xmm5,%xmm0
345         pslld           $12,%xmm0
346         psrld           $20,%xmm5
347         por             %xmm0,%xmm5
348         # x11 += x12, x6 = rotl32(x6 ^ x11, 12)
349         paddd           %xmm12,%xmm11
350         pxor            %xmm11,%xmm6
351         movdqa          %xmm6,%xmm0
352         pslld           $12,%xmm0
353         psrld           $20,%xmm6
354         por             %xmm0,%xmm6
355         # x8 += x13, x7 = rotl32(x7 ^ x8, 12)
356         paddd           %xmm13,%xmm8
357         pxor            %xmm8,%xmm7
358         movdqa          %xmm7,%xmm0
359         pslld           $12,%xmm0
360         psrld           $20,%xmm7
361         por             %xmm0,%xmm7
362         # x9 += x14, x4 = rotl32(x4 ^ x9, 12)
363         paddd           %xmm14,%xmm9
364         pxor            %xmm9,%xmm4
365         movdqa          %xmm4,%xmm0
366         pslld           $12,%xmm0
367         psrld           $20,%xmm4
368         por             %xmm0,%xmm4
369
370         # x0 += x5, x15 = rotl32(x15 ^ x0, 8)
371         movdqa          0x00(%rsp),%xmm0
372         paddd           %xmm5,%xmm0
373         movdqa          %xmm0,0x00(%rsp)
374         pxor            %xmm0,%xmm15
375         pshufb          %xmm2,%xmm15
376         # x1 += x6, x12 = rotl32(x12 ^ x1, 8)
377         movdqa          0x10(%rsp),%xmm0
378         paddd           %xmm6,%xmm0
379         movdqa          %xmm0,0x10(%rsp)
380         pxor            %xmm0,%xmm12
381         pshufb          %xmm2,%xmm12
382         # x2 += x7, x13 = rotl32(x13 ^ x2, 8)
383         movdqa          0x20(%rsp),%xmm0
384         paddd           %xmm7,%xmm0
385         movdqa          %xmm0,0x20(%rsp)
386         pxor            %xmm0,%xmm13
387         pshufb          %xmm2,%xmm13
388         # x3 += x4, x14 = rotl32(x14 ^ x3, 8)
389         movdqa          0x30(%rsp),%xmm0
390         paddd           %xmm4,%xmm0
391         movdqa          %xmm0,0x30(%rsp)
392         pxor            %xmm0,%xmm14
393         pshufb          %xmm2,%xmm14
394
395         # x10 += x15, x5 = rotl32(x5 ^ x10, 7)
396         paddd           %xmm15,%xmm10
397         pxor            %xmm10,%xmm5
398         movdqa          %xmm5,%xmm0
399         pslld           $7,%xmm0
400         psrld           $25,%xmm5
401         por             %xmm0,%xmm5
402         # x11 += x12, x6 = rotl32(x6 ^ x11, 7)
403         paddd           %xmm12,%xmm11
404         pxor            %xmm11,%xmm6
405         movdqa          %xmm6,%xmm0
406         pslld           $7,%xmm0
407         psrld           $25,%xmm6
408         por             %xmm0,%xmm6
409         # x8 += x13, x7 = rotl32(x7 ^ x8, 7)
410         paddd           %xmm13,%xmm8
411         pxor            %xmm8,%xmm7
412         movdqa          %xmm7,%xmm0
413         pslld           $7,%xmm0
414         psrld           $25,%xmm7
415         por             %xmm0,%xmm7
416         # x9 += x14, x4 = rotl32(x4 ^ x9, 7)
417         paddd           %xmm14,%xmm9
418         pxor            %xmm9,%xmm4
419         movdqa          %xmm4,%xmm0
420         pslld           $7,%xmm0
421         psrld           $25,%xmm4
422         por             %xmm0,%xmm4
423
424         dec             %ecx
425         jnz             .Ldoubleround4  # loop until all ten double rounds are done
426
427         # x0[0-3] += s0[0]
428         # x1[0-3] += s0[1]
429         movq            0x00(%rdi),%xmm3  # original state is re-read; the rotate masks in xmm2/xmm3 are dead, so both serve as temporaries
430         pshufd          $0x00,%xmm3,%xmm2
431         pshufd          $0x55,%xmm3,%xmm3
432         paddd           0x00(%rsp),%xmm2
433         movdqa          %xmm2,0x00(%rsp)
434         paddd           0x10(%rsp),%xmm3
435         movdqa          %xmm3,0x10(%rsp)
436         # x2[0-3] += s0[2]
437         # x3[0-3] += s0[3]
438         movq            0x08(%rdi),%xmm3
439         pshufd          $0x00,%xmm3,%xmm2
440         pshufd          $0x55,%xmm3,%xmm3
441         paddd           0x20(%rsp),%xmm2
442         movdqa          %xmm2,0x20(%rsp)
443         paddd           0x30(%rsp),%xmm3
444         movdqa          %xmm3,0x30(%rsp)
445
446         # x4[0-3] += s1[0]
447         # x5[0-3] += s1[1]
448         movq            0x10(%rdi),%xmm3
449         pshufd          $0x00,%xmm3,%xmm2
450         pshufd          $0x55,%xmm3,%xmm3
451         paddd           %xmm2,%xmm4
452         paddd           %xmm3,%xmm5
453         # x6[0-3] += s1[2]
454         # x7[0-3] += s1[3]
455         movq            0x18(%rdi),%xmm3
456         pshufd          $0x00,%xmm3,%xmm2
457         pshufd          $0x55,%xmm3,%xmm3
458         paddd           %xmm2,%xmm6
459         paddd           %xmm3,%xmm7
460
461         # x8[0-3] += s2[0]
462         # x9[0-3] += s2[1]
463         movq            0x20(%rdi),%xmm3
464         pshufd          $0x00,%xmm3,%xmm2
465         pshufd          $0x55,%xmm3,%xmm3
466         paddd           %xmm2,%xmm8
467         paddd           %xmm3,%xmm9
468         # x10[0-3] += s2[2]
469         # x11[0-3] += s2[3]
470         movq            0x28(%rdi),%xmm3
471         pshufd          $0x00,%xmm3,%xmm2
472         pshufd          $0x55,%xmm3,%xmm3
473         paddd           %xmm2,%xmm10
474         paddd           %xmm3,%xmm11
475
476         # x12[0-3] += s3[0]
477         # x13[0-3] += s3[1]
478         movq            0x30(%rdi),%xmm3
479         pshufd          $0x00,%xmm3,%xmm2
480         pshufd          $0x55,%xmm3,%xmm3
481         paddd           %xmm2,%xmm12
482         paddd           %xmm3,%xmm13
483         # x14[0-3] += s3[2]
484         # x15[0-3] += s3[3]
485         movq            0x38(%rdi),%xmm3
486         pshufd          $0x00,%xmm3,%xmm2
487         pshufd          $0x55,%xmm3,%xmm3
488         paddd           %xmm2,%xmm14
489         paddd           %xmm3,%xmm15
490
491         # x12 += counter values 0-3 (xmm1 still holds CTRINC; the s3[0] value added above lacks the per-lane offset)
492         paddd           %xmm1,%xmm12
493
494         # interleave 32-bit words in state n, n+1 (first transpose step; xmm0..2 are temporaries from here on)
495         movdqa          0x00(%rsp),%xmm0
496         movdqa          0x10(%rsp),%xmm1
497         movdqa          %xmm0,%xmm2
498         punpckldq       %xmm1,%xmm2  # xmm2 = x0[0], x1[0], x0[1], x1[1]
499         punpckhdq       %xmm1,%xmm0  # xmm0 = x0[2], x1[2], x0[3], x1[3]
500         movdqa          %xmm2,0x00(%rsp)
501         movdqa          %xmm0,0x10(%rsp)
502         movdqa          0x20(%rsp),%xmm0
503         movdqa          0x30(%rsp),%xmm1
504         movdqa          %xmm0,%xmm2
505         punpckldq       %xmm1,%xmm2
506         punpckhdq       %xmm1,%xmm0
507         movdqa          %xmm2,0x20(%rsp)
508         movdqa          %xmm0,0x30(%rsp)
509         movdqa          %xmm4,%xmm0  # register pairs: the even register gets the low interleave, the odd one the high interleave
510         punpckldq       %xmm5,%xmm4
511         punpckhdq       %xmm5,%xmm0
512         movdqa          %xmm0,%xmm5
513         movdqa          %xmm6,%xmm0
514         punpckldq       %xmm7,%xmm6
515         punpckhdq       %xmm7,%xmm0
516         movdqa          %xmm0,%xmm7
517         movdqa          %xmm8,%xmm0
518         punpckldq       %xmm9,%xmm8
519         punpckhdq       %xmm9,%xmm0
520         movdqa          %xmm0,%xmm9
521         movdqa          %xmm10,%xmm0
522         punpckldq       %xmm11,%xmm10
523         punpckhdq       %xmm11,%xmm0
524         movdqa          %xmm0,%xmm11
525         movdqa          %xmm12,%xmm0
526         punpckldq       %xmm13,%xmm12
527         punpckhdq       %xmm13,%xmm0
528         movdqa          %xmm0,%xmm13
529         movdqa          %xmm14,%xmm0
530         punpckldq       %xmm15,%xmm14
531         punpckhdq       %xmm15,%xmm0
532         movdqa          %xmm0,%xmm15
533
534         # interleave 64-bit words in state n, n+2 (second transpose step; afterwards every register holds four consecutive words of one block)
535         movdqa          0x00(%rsp),%xmm0
536         movdqa          0x20(%rsp),%xmm1
537         movdqa          %xmm0,%xmm2
538         punpcklqdq      %xmm1,%xmm2  # xmm2 = x0[0], x1[0], x2[0], x3[0] = words 0-3 of block 0
539         punpckhqdq      %xmm1,%xmm0  # xmm0 = x0[1], x1[1], x2[1], x3[1] = words 0-3 of block 1
540         movdqa          %xmm2,0x00(%rsp)
541         movdqa          %xmm0,0x20(%rsp)
542         movdqa          0x10(%rsp),%xmm0
543         movdqa          0x30(%rsp),%xmm1
544         movdqa          %xmm0,%xmm2
545         punpcklqdq      %xmm1,%xmm2  # words 0-3 of block 2
546         punpckhqdq      %xmm1,%xmm0  # words 0-3 of block 3
547         movdqa          %xmm2,0x10(%rsp)
548         movdqa          %xmm0,0x30(%rsp)
549         movdqa          %xmm4,%xmm0
550         punpcklqdq      %xmm6,%xmm4
551         punpckhqdq      %xmm6,%xmm0
552         movdqa          %xmm0,%xmm6
553         movdqa          %xmm5,%xmm0
554         punpcklqdq      %xmm7,%xmm5
555         punpckhqdq      %xmm7,%xmm0
556         movdqa          %xmm0,%xmm7
557         movdqa          %xmm8,%xmm0
558         punpcklqdq      %xmm10,%xmm8
559         punpckhqdq      %xmm10,%xmm0
560         movdqa          %xmm0,%xmm10
561         movdqa          %xmm9,%xmm0
562         punpcklqdq      %xmm11,%xmm9
563         punpckhqdq      %xmm11,%xmm0
564         movdqa          %xmm0,%xmm11
565         movdqa          %xmm12,%xmm0
566         punpcklqdq      %xmm14,%xmm12
567         punpckhqdq      %xmm14,%xmm0
568         movdqa          %xmm0,%xmm14
569         movdqa          %xmm13,%xmm0
570         punpcklqdq      %xmm15,%xmm13
571         punpckhqdq      %xmm15,%xmm0
572         movdqa          %xmm0,%xmm15
573
574         # xor with corresponding input, write to output (byte offset = block number * 0x40 + word group * 0x10)
575         movdqa          0x00(%rsp),%xmm0
576         movdqu          0x00(%rdx),%xmm1  # block 0, words 0-3
577         pxor            %xmm1,%xmm0
578         movdqu          %xmm0,0x00(%rsi)
579         movdqa          0x10(%rsp),%xmm0
580         movdqu          0x80(%rdx),%xmm1  # block 2, words 0-3
581         pxor            %xmm1,%xmm0
582         movdqu          %xmm0,0x80(%rsi)
583         movdqa          0x20(%rsp),%xmm0
584         movdqu          0x40(%rdx),%xmm1  # block 1, words 0-3
585         pxor            %xmm1,%xmm0
586         movdqu          %xmm0,0x40(%rsi)
587         movdqa          0x30(%rsp),%xmm0
588         movdqu          0xc0(%rdx),%xmm1  # block 3, words 0-3
589         pxor            %xmm1,%xmm0
590         movdqu          %xmm0,0xc0(%rsi)
591         movdqu          0x10(%rdx),%xmm1  # block 0, words 4-7
592         pxor            %xmm1,%xmm4
593         movdqu          %xmm4,0x10(%rsi)
594         movdqu          0x90(%rdx),%xmm1  # block 2, words 4-7
595         pxor            %xmm1,%xmm5
596         movdqu          %xmm5,0x90(%rsi)
597         movdqu          0x50(%rdx),%xmm1  # block 1, words 4-7
598         pxor            %xmm1,%xmm6
599         movdqu          %xmm6,0x50(%rsi)
600         movdqu          0xd0(%rdx),%xmm1  # block 3, words 4-7
601         pxor            %xmm1,%xmm7
602         movdqu          %xmm7,0xd0(%rsi)
603         movdqu          0x20(%rdx),%xmm1  # block 0, words 8-11
604         pxor            %xmm1,%xmm8
605         movdqu          %xmm8,0x20(%rsi)
606         movdqu          0xa0(%rdx),%xmm1  # block 2, words 8-11
607         pxor            %xmm1,%xmm9
608         movdqu          %xmm9,0xa0(%rsi)
609         movdqu          0x60(%rdx),%xmm1  # block 1, words 8-11
610         pxor            %xmm1,%xmm10
611         movdqu          %xmm10,0x60(%rsi)
612         movdqu          0xe0(%rdx),%xmm1  # block 3, words 8-11
613         pxor            %xmm1,%xmm11
614         movdqu          %xmm11,0xe0(%rsi)
615         movdqu          0x30(%rdx),%xmm1  # block 0, words 12-15
616         pxor            %xmm1,%xmm12
617         movdqu          %xmm12,0x30(%rsi)
618         movdqu          0xb0(%rdx),%xmm1  # block 2, words 12-15
619         pxor            %xmm1,%xmm13
620         movdqu          %xmm13,0xb0(%rsi)
621         movdqu          0x70(%rdx),%xmm1  # block 1, words 12-15
622         pxor            %xmm1,%xmm14
623         movdqu          %xmm14,0x70(%rsi)
624         movdqu          0xf0(%rdx),%xmm1  # block 3, words 12-15
625         pxor            %xmm1,%xmm15
626         movdqu          %xmm15,0xf0(%rsi)
627
628         lea             -8(%r10),%rsp  # undo the realignment: rsp = its value at function entry
629         ret
630 ENDPROC(chacha20_4block_xor_ssse3)