GNU Linux-libre 5.4.274-gnu1
[releases.git] / arch / arm64 / crypto / chacha-neon-core.S
1 /*
2  * ChaCha/XChaCha NEON helper functions
3  *
4  * Copyright (C) 2016-2018 Linaro, Ltd. <ard.biesheuvel@linaro.org>
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License version 2 as
8  * published by the Free Software Foundation.
9  *
10  * Originally based on:
11  * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
12  *
13  * Copyright (C) 2015 Martin Willi
14  *
15  * This program is free software; you can redistribute it and/or modify
16  * it under the terms of the GNU General Public License as published by
17  * the Free Software Foundation; either version 2 of the License, or
18  * (at your option) any later version.
19  */
20
21 #include <linux/linkage.h>
22 #include <asm/assembler.h>
23 #include <asm/cache.h>
24
25         .text
26         .align          6
27
28 /*
29  * chacha_permute - permute one block
30  *
31  * Permute one 64-byte block where the state matrix is stored in the four NEON
32  * registers v0-v3.  It performs matrix operations on four words in parallel,
33  * but requires shuffling to rearrange the words after each round.
34  *
35  * The round count is given in w3.
36  *
37  * Clobbers: w3, x10, v4, v12
38  */
39 chacha_permute:
40
           // In/out: v0-v3 hold the four rows of the state matrix and are
           // permuted in place.  v12 is loaded with the ROT8 byte-index
           // vector used by tbl for the rotate-by-8 steps; v4 is scratch.
           //
           // Each loop iteration performs two rounds and subtracts 2 from
           // w3; the loop only exits when w3 reaches exactly zero, so w3
           // must be even and non-zero on entry.
41         adr_l           x10, ROT8
42         ld1             {v12.4s}, [x10]
43
44 .Ldoubleround:
           // --- first round of the pair: rows as loaded ---
45         // x0 += x1, x3 = rotl32(x3 ^ x0, 16)
46         add             v0.4s, v0.4s, v1.4s
47         eor             v3.16b, v3.16b, v0.16b
           // swapping the two halfwords of each 32-bit lane == rotate by 16
48         rev32           v3.8h, v3.8h
49
50         // x2 += x3, x1 = rotl32(x1 ^ x2, 12)
51         add             v2.4s, v2.4s, v3.4s
52         eor             v4.16b, v1.16b, v2.16b
           // shl writes the left-shifted bits, sri inserts the bits that
           // wrapped around; together they form the rotate
53         shl             v1.4s, v4.4s, #12
54         sri             v1.4s, v4.4s, #20
55
56         // x0 += x1, x3 = rotl32(x3 ^ x0, 8)
57         add             v0.4s, v0.4s, v1.4s
58         eor             v3.16b, v3.16b, v0.16b
           // byte permutation through the ROT8 index vector in v12
59         tbl             v3.16b, {v3.16b}, v12.16b
60
61         // x2 += x3, x1 = rotl32(x1 ^ x2, 7)
62         add             v2.4s, v2.4s, v3.4s
63         eor             v4.16b, v1.16b, v2.16b
64         shl             v1.4s, v4.4s, #7
65         sri             v1.4s, v4.4s, #25
66
           // Rotate rows 1-3 by one, two and three 32-bit lanes so that the
           // next round operates on a different alignment of the words.
67         // x1 = shuffle32(x1, MASK(0, 3, 2, 1))
68         ext             v1.16b, v1.16b, v1.16b, #4
69         // x2 = shuffle32(x2, MASK(1, 0, 3, 2))
70         ext             v2.16b, v2.16b, v2.16b, #8
71         // x3 = shuffle32(x3, MASK(2, 1, 0, 3))
72         ext             v3.16b, v3.16b, v3.16b, #12
73
           // --- second round of the pair: same steps on the shuffled rows ---
74         // x0 += x1, x3 = rotl32(x3 ^ x0, 16)
75         add             v0.4s, v0.4s, v1.4s
76         eor             v3.16b, v3.16b, v0.16b
77         rev32           v3.8h, v3.8h
78
79         // x2 += x3, x1 = rotl32(x1 ^ x2, 12)
80         add             v2.4s, v2.4s, v3.4s
81         eor             v4.16b, v1.16b, v2.16b
82         shl             v1.4s, v4.4s, #12
83         sri             v1.4s, v4.4s, #20
84
85         // x0 += x1, x3 = rotl32(x3 ^ x0, 8)
86         add             v0.4s, v0.4s, v1.4s
87         eor             v3.16b, v3.16b, v0.16b
88         tbl             v3.16b, {v3.16b}, v12.16b
89
90         // x2 += x3, x1 = rotl32(x1 ^ x2, 7)
91         add             v2.4s, v2.4s, v3.4s
92         eor             v4.16b, v1.16b, v2.16b
93         shl             v1.4s, v4.4s, #7
94         sri             v1.4s, v4.4s, #25
95
           // Inverse lane rotation (12/8/4 bytes instead of 4/8/12): puts
           // the rows back in their original word order.
96         // x1 = shuffle32(x1, MASK(2, 1, 0, 3))
97         ext             v1.16b, v1.16b, v1.16b, #12
98         // x2 = shuffle32(x2, MASK(1, 0, 3, 2))
99         ext             v2.16b, v2.16b, v2.16b, #8
100         // x3 = shuffle32(x3, MASK(0, 3, 2, 1))
101         ext             v3.16b, v3.16b, v3.16b, #4
102
            // two rounds done; repeat until the round counter hits zero
103         subs            w3, w3, #2
104         b.ne            .Ldoubleround
105
106         ret
107 ENDPROC(chacha_permute)
108
109 ENTRY(chacha_block_xor_neon)
            // Encrypt/decrypt a single 64-byte block: permute the state,
            // add the original state back in, XOR with the input block and
            // store the result.
110         // x0: Input state matrix, s
111         // x1: 1 data block output, o
112         // x2: 1 data block input, i
113         // w3: nrounds
114
            // Save FP/LR: the bl below overwrites x30.
115         stp             x29, x30, [sp, #-16]!
116         mov             x29, sp
117
118         // x0..3 = s0..3
119         ld1             {v0.4s-v3.4s}, [x0]
            // Second, untouched copy of the state in v8-v11 for the final
            // addition (chacha_permute only clobbers v4 and v12).
            // NOTE(review): v8-v11 are overwritten without being saved;
            // presumably callers run this inside a kernel-mode NEON section
            // that preserves the FP/SIMD state -- confirm against the glue code.
120         ld1             {v8.4s-v11.4s}, [x0]
121
            // w3 (nrounds) is passed through unchanged.
122         bl              chacha_permute
123
            // v4-v7 = the 64 input bytes
124         ld1             {v4.16b-v7.16b}, [x2]
125
126         // o0 = i0 ^ (x0 + s0)
127         add             v0.4s, v0.4s, v8.4s
128         eor             v0.16b, v0.16b, v4.16b
129
130         // o1 = i1 ^ (x1 + s1)
131         add             v1.4s, v1.4s, v9.4s
132         eor             v1.16b, v1.16b, v5.16b
133
134         // o2 = i2 ^ (x2 + s2)
135         add             v2.4s, v2.4s, v10.4s
136         eor             v2.16b, v2.16b, v6.16b
137
138         // o3 = i3 ^ (x3 + s3)
139         add             v3.4s, v3.4s, v11.4s
140         eor             v3.16b, v3.16b, v7.16b
141
142         st1             {v0.16b-v3.16b}, [x1]
143
144         ldp             x29, x30, [sp], #16
145         ret
146 ENDPROC(chacha_block_xor_neon)
147
148 ENTRY(hchacha_block_neon)
            // Run the permutation on the input state and write rows 0 and 3
            // of the permuted state to the output.  Unlike
            // chacha_block_xor_neon, the original state is NOT added back
            // and nothing is XORed with data.
149         // x0: Input state matrix, s
150         // x1: output (8 32-bit words)
151         // w2: nrounds
152
            // Save FP/LR: the bl below overwrites x30.
153         stp             x29, x30, [sp, #-16]!
154         mov             x29, sp
155
156         ld1             {v0.4s-v3.4s}, [x0]
157
            // chacha_permute takes its round count in w3, not w2.
158         mov             w3, w2
159         bl              chacha_permute
160
            // Words 0-3 from row 0, then words 4-7 from row 3; the first
            // store post-increments x1 by 16 bytes.
161         st1             {v0.4s}, [x1], #16
162         st1             {v3.4s}, [x1]
163
164         ldp             x29, x30, [sp], #16
165         ret
166 ENDPROC(hchacha_block_neon)
167
            // Scalar register aliases a0-a15: one general-purpose register
            // per 32-bit state word of the extra block that
            // chacha_4block_xor_neon computes alongside the NEON blocks.
            // w18 is skipped (platform register).  a6-a15 live in w19-w28,
            // which are callee-saved under AAPCS64.
            // NOTE(review): presumably 'frame_push 10' in
            // chacha_4block_xor_neon is what saves x19-x28 -- confirm
            // against the macro in <asm/assembler.h>.
168         a0              .req    w12
169         a1              .req    w13
170         a2              .req    w14
171         a3              .req    w15
172         a4              .req    w16
173         a5              .req    w17
174         a6              .req    w19
175         a7              .req    w20
176         a8              .req    w21
177         a9              .req    w22
178         a10             .req    w23
179         a11             .req    w24
180         a12             .req    w25
181         a13             .req    w26
182         a14             .req    w27
183         a15             .req    w28
184
185         .align          6
186 ENTRY(chacha_4block_xor_neon)
187         frame_push      10
188
189         // x0: Input state matrix, s
190         // x1: 4 data blocks output, o
191         // x2: 4 data blocks input, i
192         // w3: nrounds
193         // x4: byte count
            //
            // Up to 320 bytes are processed per call: bytes 0-63 by the
            // scalar block, bytes 64-319 by the four NEON blocks.
            // NOTE(review): the scalar block always reads and writes a full
            // 64 bytes, and the tail paths access the 64 bytes that END at
            // 'byte count', so this appears to require byte count >= 64 --
            // confirm against the C glue code.
194
            // x10/x11 point into the .Lpermute table at offsets
            // (count % 64) and (count % 64) + 64.  They are only consumed
            // by the partial-block paths at local labels 0-3 below.
195         adr_l           x10, .Lpermute
196         and             x5, x4, #63
197         add             x10, x10, x5
198         add             x11, x10, #64
199
200         //
201         // This function encrypts four consecutive ChaCha blocks by loading
202         // the state matrix in NEON registers four times. The algorithm performs
203         // each operation on the corresponding word of each state matrix, hence
204         // requires no word shuffling. For final XORing step we transpose the
205         // matrix by interleaving 32- and then 64-bit words, which allows us to
206         // do XOR in NEON registers.
207         //
208         // At the same time, a fifth block is encrypted in parallel using
209         // scalar registers
210         //
            // One 32-byte load: v30 = CTRINC, v31 = ROT8.  This relies on
            // ROT8 being placed directly after CTRINC in .rodata.
211         adr_l           x9, CTRINC              // ... and ROT8
212         ld1             {v30.4s-v31.4s}, [x9]
213
214         // x0..15[0-3] = s0..3[0..3]
            // ld4r replicates each loaded word across all four lanes, so
            // vN holds state word N once per NEON block.
215         add             x8, x0, #16
216         ld4r            { v0.4s- v3.4s}, [x0]
217         ld4r            { v4.4s- v7.4s}, [x8], #16
218         ld4r            { v8.4s-v11.4s}, [x8], #16
219         ld4r            {v12.4s-v15.4s}, [x8]
220
            // Scalar copy of the state for the fifth block.  It is taken
            // before the counter increment below, so the scalar block uses
            // the counter value as passed in.
221         mov             a0, v0.s[0]
222         mov             a1, v1.s[0]
223         mov             a2, v2.s[0]
224         mov             a3, v3.s[0]
225         mov             a4, v4.s[0]
226         mov             a5, v5.s[0]
227         mov             a6, v6.s[0]
228         mov             a7, v7.s[0]
229         mov             a8, v8.s[0]
230         mov             a9, v9.s[0]
231         mov             a10, v10.s[0]
232         mov             a11, v11.s[0]
233         mov             a12, v12.s[0]
234         mov             a13, v13.s[0]
235         mov             a14, v14.s[0]
236         mov             a15, v15.s[0]
237
238         // x12 += counter values 1-4
239         add             v12.4s, v12.4s, v30.4s
240
            // Each iteration is one double round.  NEON instructions (four
            // blocks) are interleaved with the equivalent scalar
            // instructions (fifth block, extra indent).  The scalar side
            // rotates right by 16/20/24/25, which equals the rotate-left by
            // 16/12/8/7 stated in the comments.
241 .Ldoubleround4:
242         // x0 += x4, x12 = rotl32(x12 ^ x0, 16)
243         // x1 += x5, x13 = rotl32(x13 ^ x1, 16)
244         // x2 += x6, x14 = rotl32(x14 ^ x2, 16)
245         // x3 += x7, x15 = rotl32(x15 ^ x3, 16)
246         add             v0.4s, v0.4s, v4.4s
247           add           a0, a0, a4
248         add             v1.4s, v1.4s, v5.4s
249           add           a1, a1, a5
250         add             v2.4s, v2.4s, v6.4s
251           add           a2, a2, a6
252         add             v3.4s, v3.4s, v7.4s
253           add           a3, a3, a7
254
255         eor             v12.16b, v12.16b, v0.16b
256           eor           a12, a12, a0
257         eor             v13.16b, v13.16b, v1.16b
258           eor           a13, a13, a1
259         eor             v14.16b, v14.16b, v2.16b
260           eor           a14, a14, a2
261         eor             v15.16b, v15.16b, v3.16b
262           eor           a15, a15, a3
263
264         rev32           v12.8h, v12.8h
265           ror           a12, a12, #16
266         rev32           v13.8h, v13.8h
267           ror           a13, a13, #16
268         rev32           v14.8h, v14.8h
269           ror           a14, a14, #16
270         rev32           v15.8h, v15.8h
271           ror           a15, a15, #16
272
273         // x8 += x12, x4 = rotl32(x4 ^ x8, 12)
274         // x9 += x13, x5 = rotl32(x5 ^ x9, 12)
275         // x10 += x14, x6 = rotl32(x6 ^ x10, 12)
276         // x11 += x15, x7 = rotl32(x7 ^ x11, 12)
277         add             v8.4s, v8.4s, v12.4s
278           add           a8, a8, a12
279         add             v9.4s, v9.4s, v13.4s
280           add           a9, a9, a13
281         add             v10.4s, v10.4s, v14.4s
282           add           a10, a10, a14
283         add             v11.4s, v11.4s, v15.4s
284           add           a11, a11, a15
285
286         eor             v16.16b, v4.16b, v8.16b
287           eor           a4, a4, a8
288         eor             v17.16b, v5.16b, v9.16b
289           eor           a5, a5, a9
290         eor             v18.16b, v6.16b, v10.16b
291           eor           a6, a6, a10
292         eor             v19.16b, v7.16b, v11.16b
293           eor           a7, a7, a11
294
295         shl             v4.4s, v16.4s, #12
296         shl             v5.4s, v17.4s, #12
297         shl             v6.4s, v18.4s, #12
298         shl             v7.4s, v19.4s, #12
299
300         sri             v4.4s, v16.4s, #20
301           ror           a4, a4, #20
302         sri             v5.4s, v17.4s, #20
303           ror           a5, a5, #20
304         sri             v6.4s, v18.4s, #20
305           ror           a6, a6, #20
306         sri             v7.4s, v19.4s, #20
307           ror           a7, a7, #20
308
309         // x0 += x4, x12 = rotl32(x12 ^ x0, 8)
310         // x1 += x5, x13 = rotl32(x13 ^ x1, 8)
311         // x2 += x6, x14 = rotl32(x14 ^ x2, 8)
312         // x3 += x7, x15 = rotl32(x15 ^ x3, 8)
313         add             v0.4s, v0.4s, v4.4s
314           add           a0, a0, a4
315         add             v1.4s, v1.4s, v5.4s
316           add           a1, a1, a5
317         add             v2.4s, v2.4s, v6.4s
318           add           a2, a2, a6
319         add             v3.4s, v3.4s, v7.4s
320           add           a3, a3, a7
321
322         eor             v12.16b, v12.16b, v0.16b
323           eor           a12, a12, a0
324         eor             v13.16b, v13.16b, v1.16b
325           eor           a13, a13, a1
326         eor             v14.16b, v14.16b, v2.16b
327           eor           a14, a14, a2
328         eor             v15.16b, v15.16b, v3.16b
329           eor           a15, a15, a3
330
331         tbl             v12.16b, {v12.16b}, v31.16b
332           ror           a12, a12, #24
333         tbl             v13.16b, {v13.16b}, v31.16b
334           ror           a13, a13, #24
335         tbl             v14.16b, {v14.16b}, v31.16b
336           ror           a14, a14, #24
337         tbl             v15.16b, {v15.16b}, v31.16b
338           ror           a15, a15, #24
339
340         // x8 += x12, x4 = rotl32(x4 ^ x8, 7)
341         // x9 += x13, x5 = rotl32(x5 ^ x9, 7)
342         // x10 += x14, x6 = rotl32(x6 ^ x10, 7)
343         // x11 += x15, x7 = rotl32(x7 ^ x11, 7)
344         add             v8.4s, v8.4s, v12.4s
345           add           a8, a8, a12
346         add             v9.4s, v9.4s, v13.4s
347           add           a9, a9, a13
348         add             v10.4s, v10.4s, v14.4s
349           add           a10, a10, a14
350         add             v11.4s, v11.4s, v15.4s
351           add           a11, a11, a15
352
353         eor             v16.16b, v4.16b, v8.16b
354           eor           a4, a4, a8
355         eor             v17.16b, v5.16b, v9.16b
356           eor           a5, a5, a9
357         eor             v18.16b, v6.16b, v10.16b
358           eor           a6, a6, a10
359         eor             v19.16b, v7.16b, v11.16b
360           eor           a7, a7, a11
361
362         shl             v4.4s, v16.4s, #7
363         shl             v5.4s, v17.4s, #7
364         shl             v6.4s, v18.4s, #7
365         shl             v7.4s, v19.4s, #7
366
367         sri             v4.4s, v16.4s, #25
368           ror           a4, a4, #25
369         sri             v5.4s, v17.4s, #25
370           ror           a5, a5, #25
371         sri             v6.4s, v18.4s, #25
372          ror            a6, a6, #25
373         sri             v7.4s, v19.4s, #25
374           ror           a7, a7, #25
375
            // Second half of the double round: the same four steps, but on
            // the diagonals.  No lane shuffling is needed here because each
            // state word has its own register; the diagonal is selected
            // simply by pairing different registers.
376         // x0 += x5, x15 = rotl32(x15 ^ x0, 16)
377         // x1 += x6, x12 = rotl32(x12 ^ x1, 16)
378         // x2 += x7, x13 = rotl32(x13 ^ x2, 16)
379         // x3 += x4, x14 = rotl32(x14 ^ x3, 16)
380         add             v0.4s, v0.4s, v5.4s
381           add           a0, a0, a5
382         add             v1.4s, v1.4s, v6.4s
383           add           a1, a1, a6
384         add             v2.4s, v2.4s, v7.4s
385           add           a2, a2, a7
386         add             v3.4s, v3.4s, v4.4s
387           add           a3, a3, a4
388
389         eor             v15.16b, v15.16b, v0.16b
390           eor           a15, a15, a0
391         eor             v12.16b, v12.16b, v1.16b
392           eor           a12, a12, a1
393         eor             v13.16b, v13.16b, v2.16b
394           eor           a13, a13, a2
395         eor             v14.16b, v14.16b, v3.16b
396           eor           a14, a14, a3
397
398         rev32           v15.8h, v15.8h
399           ror           a15, a15, #16
400         rev32           v12.8h, v12.8h
401           ror           a12, a12, #16
402         rev32           v13.8h, v13.8h
403           ror           a13, a13, #16
404         rev32           v14.8h, v14.8h
405           ror           a14, a14, #16
406
407         // x10 += x15, x5 = rotl32(x5 ^ x10, 12)
408         // x11 += x12, x6 = rotl32(x6 ^ x11, 12)
409         // x8 += x13, x7 = rotl32(x7 ^ x8, 12)
410         // x9 += x14, x4 = rotl32(x4 ^ x9, 12)
411         add             v10.4s, v10.4s, v15.4s
412           add           a10, a10, a15
413         add             v11.4s, v11.4s, v12.4s
414           add           a11, a11, a12
415         add             v8.4s, v8.4s, v13.4s
416           add           a8, a8, a13
417         add             v9.4s, v9.4s, v14.4s
418           add           a9, a9, a14
419
420         eor             v16.16b, v5.16b, v10.16b
421           eor           a5, a5, a10
422         eor             v17.16b, v6.16b, v11.16b
423           eor           a6, a6, a11
424         eor             v18.16b, v7.16b, v8.16b
425           eor           a7, a7, a8
426         eor             v19.16b, v4.16b, v9.16b
427           eor           a4, a4, a9
428
429         shl             v5.4s, v16.4s, #12
430         shl             v6.4s, v17.4s, #12
431         shl             v7.4s, v18.4s, #12
432         shl             v4.4s, v19.4s, #12
433
434         sri             v5.4s, v16.4s, #20
435           ror           a5, a5, #20
436         sri             v6.4s, v17.4s, #20
437           ror           a6, a6, #20
438         sri             v7.4s, v18.4s, #20
439           ror           a7, a7, #20
440         sri             v4.4s, v19.4s, #20
441           ror           a4, a4, #20
442
443         // x0 += x5, x15 = rotl32(x15 ^ x0, 8)
444         // x1 += x6, x12 = rotl32(x12 ^ x1, 8)
445         // x2 += x7, x13 = rotl32(x13 ^ x2, 8)
446         // x3 += x4, x14 = rotl32(x14 ^ x3, 8)
447         add             v0.4s, v0.4s, v5.4s
448           add           a0, a0, a5
449         add             v1.4s, v1.4s, v6.4s
450           add           a1, a1, a6
451         add             v2.4s, v2.4s, v7.4s
452           add           a2, a2, a7
453         add             v3.4s, v3.4s, v4.4s
454           add           a3, a3, a4
455
456         eor             v15.16b, v15.16b, v0.16b
457           eor           a15, a15, a0
458         eor             v12.16b, v12.16b, v1.16b
459           eor           a12, a12, a1
460         eor             v13.16b, v13.16b, v2.16b
461           eor           a13, a13, a2
462         eor             v14.16b, v14.16b, v3.16b
463           eor           a14, a14, a3
464
465         tbl             v15.16b, {v15.16b}, v31.16b
466           ror           a15, a15, #24
467         tbl             v12.16b, {v12.16b}, v31.16b
468           ror           a12, a12, #24
469         tbl             v13.16b, {v13.16b}, v31.16b
470           ror           a13, a13, #24
471         tbl             v14.16b, {v14.16b}, v31.16b
472           ror           a14, a14, #24
473
474         // x10 += x15, x5 = rotl32(x5 ^ x10, 7)
475         // x11 += x12, x6 = rotl32(x6 ^ x11, 7)
476         // x8 += x13, x7 = rotl32(x7 ^ x8, 7)
477         // x9 += x14, x4 = rotl32(x4 ^ x9, 7)
478         add             v10.4s, v10.4s, v15.4s
479           add           a10, a10, a15
480         add             v11.4s, v11.4s, v12.4s
481           add           a11, a11, a12
482         add             v8.4s, v8.4s, v13.4s
483           add           a8, a8, a13
484         add             v9.4s, v9.4s, v14.4s
485           add           a9, a9, a14
486
487         eor             v16.16b, v5.16b, v10.16b
488           eor           a5, a5, a10
489         eor             v17.16b, v6.16b, v11.16b
490           eor           a6, a6, a11
491         eor             v18.16b, v7.16b, v8.16b
492           eor           a7, a7, a8
493         eor             v19.16b, v4.16b, v9.16b
494           eor           a4, a4, a9
495
496         shl             v5.4s, v16.4s, #7
497         shl             v6.4s, v17.4s, #7
498         shl             v7.4s, v18.4s, #7
499         shl             v4.4s, v19.4s, #7
500
501         sri             v5.4s, v16.4s, #25
502           ror           a5, a5, #25
503         sri             v6.4s, v17.4s, #25
504           ror           a6, a6, #25
505         sri             v7.4s, v18.4s, #25
506           ror           a7, a7, #25
507         sri             v4.4s, v19.4s, #25
508           ror           a4, a4, #25
509
            // Two rounds per iteration; as in chacha_permute, the loop only
            // terminates when w3 reaches exactly zero.
510         subs            w3, w3, #2
511         b.ne            .Ldoubleround4
512
            // Feed-forward: reload the original state (replicated per lane)
            // and add it to the permuted state.  x0 is advanced by the
            // post-indexed loads and no longer points at the state start.
            // w6-w9 are scratch for the scalar block from here on.
513         ld4r            {v16.4s-v19.4s}, [x0], #16
514         ld4r            {v20.4s-v23.4s}, [x0], #16
515
            // v30 still holds CTRINC here (it is only overwritten by the
            // ld4r into v28-v31 further down), so this re-applies the
            // per-block counter offsets that were part of the initial state.
516         // x12 += counter values 1-4
517         add             v12.4s, v12.4s, v30.4s
518
519         // x0[0-3] += s0[0]
520         // x1[0-3] += s0[1]
521         // x2[0-3] += s0[2]
522         // x3[0-3] += s0[3]
523         add             v0.4s, v0.4s, v16.4s
524           mov           w6, v16.s[0]
525           mov           w7, v17.s[0]
526         add             v1.4s, v1.4s, v17.4s
527           mov           w8, v18.s[0]
528           mov           w9, v19.s[0]
529         add             v2.4s, v2.4s, v18.4s
530           add           a0, a0, w6
531           add           a1, a1, w7
532         add             v3.4s, v3.4s, v19.4s
533           add           a2, a2, w8
534           add           a3, a3, w9
            // NOTE(review): CPU_BE() presumably emits its argument only on
            // big-endian kernels, byte-swapping the scalar keystream words
            // so they line up with data accessed via ldp/stp -- confirm
            // against <asm/assembler.h>.
535 CPU_BE(   rev           a0, a0          )
536 CPU_BE(   rev           a1, a1          )
537 CPU_BE(   rev           a2, a2          )
538 CPU_BE(   rev           a3, a3          )
539
            // This load overwrites v30/v31 (CTRINC/ROT8); both are no longer
            // needed at this point.
540         ld4r            {v24.4s-v27.4s}, [x0], #16
541         ld4r            {v28.4s-v31.4s}, [x0]
542
543         // x4[0-3] += s1[0]
544         // x5[0-3] += s1[1]
545         // x6[0-3] += s1[2]
546         // x7[0-3] += s1[3]
547         add             v4.4s, v4.4s, v20.4s
548           mov           w6, v20.s[0]
549           mov           w7, v21.s[0]
550         add             v5.4s, v5.4s, v21.4s
551           mov           w8, v22.s[0]
552           mov           w9, v23.s[0]
553         add             v6.4s, v6.4s, v22.4s
554           add           a4, a4, w6
555           add           a5, a5, w7
556         add             v7.4s, v7.4s, v23.4s
557           add           a6, a6, w8
558           add           a7, a7, w9
559 CPU_BE(   rev           a4, a4          )
560 CPU_BE(   rev           a5, a5          )
561 CPU_BE(   rev           a6, a6          )
562 CPU_BE(   rev           a7, a7          )
563
564         // x8[0-3] += s2[0]
565         // x9[0-3] += s2[1]
566         // x10[0-3] += s2[2]
567         // x11[0-3] += s2[3]
568         add             v8.4s, v8.4s, v24.4s
569           mov           w6, v24.s[0]
570           mov           w7, v25.s[0]
571         add             v9.4s, v9.4s, v25.4s
572           mov           w8, v26.s[0]
573           mov           w9, v27.s[0]
574         add             v10.4s, v10.4s, v26.4s
575           add           a8, a8, w6
576           add           a9, a9, w7
577         add             v11.4s, v11.4s, v27.4s
578           add           a10, a10, w8
579           add           a11, a11, w9
580 CPU_BE(   rev           a8, a8          )
581 CPU_BE(   rev           a9, a9          )
582 CPU_BE(   rev           a10, a10        )
583 CPU_BE(   rev           a11, a11        )
584
585         // x12[0-3] += s3[0]
586         // x13[0-3] += s3[1]
587         // x14[0-3] += s3[2]
588         // x15[0-3] += s3[3]
589         add             v12.4s, v12.4s, v28.4s
590           mov           w6, v28.s[0]
591           mov           w7, v29.s[0]
592         add             v13.4s, v13.4s, v29.4s
593           mov           w8, v30.s[0]
594           mov           w9, v31.s[0]
595         add             v14.4s, v14.4s, v30.4s
596           add           a12, a12, w6
597           add           a13, a13, w7
598         add             v15.4s, v15.4s, v31.4s
599           add           a14, a14, w8
600           add           a15, a15, w9
601 CPU_BE(   rev           a12, a12        )
602 CPU_BE(   rev           a13, a13        )
603 CPU_BE(   rev           a14, a14        )
604 CPU_BE(   rev           a15, a15        )
605
            // NEON: first transposition step (zip 32-bit words).
            // Scalar: XOR the first 64 input bytes with the scalar
            // keystream.  The first ldp post-increments x2 by 64, so the
            // remaining loads use negative offsets and x2 ends up pointing
            // at input byte 64.
606         // interleave 32-bit words in state n, n+1
607           ldp           w6, w7, [x2], #64
608         zip1            v16.4s, v0.4s, v1.4s
609           ldp           w8, w9, [x2, #-56]
610           eor           a0, a0, w6
611         zip2            v17.4s, v0.4s, v1.4s
612           eor           a1, a1, w7
613         zip1            v18.4s, v2.4s, v3.4s
614           eor           a2, a2, w8
615         zip2            v19.4s, v2.4s, v3.4s
616           eor           a3, a3, w9
617           ldp           w6, w7, [x2, #-48]
618         zip1            v20.4s, v4.4s, v5.4s
619           ldp           w8, w9, [x2, #-40]
620           eor           a4, a4, w6
621         zip2            v21.4s, v4.4s, v5.4s
622           eor           a5, a5, w7
623         zip1            v22.4s, v6.4s, v7.4s
624           eor           a6, a6, w8
625         zip2            v23.4s, v6.4s, v7.4s
626           eor           a7, a7, w9
627           ldp           w6, w7, [x2, #-32]
628         zip1            v24.4s, v8.4s, v9.4s
629           ldp           w8, w9, [x2, #-24]
630           eor           a8, a8, w6
631         zip2            v25.4s, v8.4s, v9.4s
632           eor           a9, a9, w7
633         zip1            v26.4s, v10.4s, v11.4s
634           eor           a10, a10, w8
635         zip2            v27.4s, v10.4s, v11.4s
636           eor           a11, a11, w9
637           ldp           w6, w7, [x2, #-16]
638         zip1            v28.4s, v12.4s, v13.4s
639           ldp           w8, w9, [x2, #-8]
640           eor           a12, a12, w6
641         zip2            v29.4s, v12.4s, v13.4s
642           eor           a13, a13, w7
643         zip1            v30.4s, v14.4s, v15.4s
644           eor           a14, a14, w8
645         zip2            v31.4s, v14.4s, v15.4s
646           eor           a15, a15, w9
647
            // Input pointer management for the NEON blocks.  x3 is the
            // post-index stride for the input loads (w3/nrounds is dead by
            // now).  x5 = count - 128: if non-negative, the first NEON
            // block is complete, so load it and advance by 64.  Otherwise
            // set the stride to 0 and point x2 at in + count - 64, i.e. the
            // last 64 bytes of the input, so no load reads past the end.
648         mov             x3, #64
649         subs            x5, x4, #128
650         add             x6, x5, x2
651         csel            x3, x3, xzr, ge
652         csel            x2, x2, x6, ge
653
            // Second transposition step (zip 64-bit words) produces the
            // keystream of NEON block 1 in v0-v3, block 2 in v4-v7, block 3
            // in v8-v11 and block 4 in v12-v15, one group per section
            // below.  The scalar block's output is stored in between;
            // the first stp post-increments x1 by 64.
654         // interleave 64-bit words in state n, n+2
655         zip1            v0.2d, v16.2d, v18.2d
656         zip2            v4.2d, v16.2d, v18.2d
657           stp           a0, a1, [x1], #64
658         zip1            v8.2d, v17.2d, v19.2d
659         zip2            v12.2d, v17.2d, v19.2d
660           stp           a2, a3, [x1, #-56]
661         ld1             {v16.16b-v19.16b}, [x2], x3
662
            // x6 = count - 192.  After the ccmp, 'eq' holds if this block
            // is complete (count >= 192: flags forced to Z) or if the
            // stride is already 0 (an earlier block was the partial one);
            // in both cases x2/x3 are kept.  Otherwise this is the first
            // partial block: stop advancing and point x2 at the last 64
            // input bytes.
663         subs            x6, x4, #192
664         ccmp            x3, xzr, #4, lt
665         add             x7, x6, x2
666         csel            x3, x3, xzr, eq
667         csel            x2, x2, x7, eq
668
669         zip1            v1.2d, v20.2d, v22.2d
670         zip2            v5.2d, v20.2d, v22.2d
671           stp           a4, a5, [x1, #-48]
672         zip1            v9.2d, v21.2d, v23.2d
673         zip2            v13.2d, v21.2d, v23.2d
674           stp           a6, a7, [x1, #-40]
675         ld1             {v20.16b-v23.16b}, [x2], x3
676
            // Same scheme for the third NEON block (x7 = count - 256).
677         subs            x7, x4, #256
678         ccmp            x3, xzr, #4, lt
679         add             x8, x7, x2
680         csel            x3, x3, xzr, eq
681         csel            x2, x2, x8, eq
682
683         zip1            v2.2d, v24.2d, v26.2d
684         zip2            v6.2d, v24.2d, v26.2d
685           stp           a8, a9, [x1, #-32]
686         zip1            v10.2d, v25.2d, v27.2d
687         zip2            v14.2d, v25.2d, v27.2d
688           stp           a10, a11, [x1, #-24]
689         ld1             {v24.16b-v27.16b}, [x2], x3
690
            // Fourth NEON block (x8 = count - 320); the stride is not
            // updated because the final load below is not post-indexed.
691         subs            x8, x4, #320
692         ccmp            x3, xzr, #4, lt
693         add             x9, x8, x2
694         csel            x2, x2, x9, eq
695
696         zip1            v3.2d, v28.2d, v30.2d
697         zip2            v7.2d, v28.2d, v30.2d
698           stp           a12, a13, [x1, #-16]
699         zip1            v11.2d, v29.2d, v31.2d
700         zip2            v15.2d, v29.2d, v31.2d
701           stp           a14, a15, [x1, #-8]
702         ld1             {v28.16b-v31.16b}, [x2]
703
            // For each NEON block: a set sign bit in (count - end_of_block)
            // means the block is partial -> branch to its tail handler.
            // Otherwise XOR and store the full block, and finish if the
            // byte count ended exactly on this block boundary.
704         // xor with corresponding input, write to output
705         tbnz            x5, #63, 0f
706         eor             v16.16b, v16.16b, v0.16b
707         eor             v17.16b, v17.16b, v1.16b
708         eor             v18.16b, v18.16b, v2.16b
709         eor             v19.16b, v19.16b, v3.16b
710         st1             {v16.16b-v19.16b}, [x1], #64
711         cbz             x5, .Lout
712
713         tbnz            x6, #63, 1f
714         eor             v20.16b, v20.16b, v4.16b
715         eor             v21.16b, v21.16b, v5.16b
716         eor             v22.16b, v22.16b, v6.16b
717         eor             v23.16b, v23.16b, v7.16b
718         st1             {v20.16b-v23.16b}, [x1], #64
719         cbz             x6, .Lout
720
721         tbnz            x7, #63, 2f
722         eor             v24.16b, v24.16b, v8.16b
723         eor             v25.16b, v25.16b, v9.16b
724         eor             v26.16b, v26.16b, v10.16b
725         eor             v27.16b, v27.16b, v11.16b
726         st1             {v24.16b-v27.16b}, [x1], #64
727         cbz             x7, .Lout
728
729         tbnz            x8, #63, 3f
730         eor             v28.16b, v28.16b, v12.16b
731         eor             v29.16b, v29.16b, v13.16b
732         eor             v30.16b, v30.16b, v14.16b
733         eor             v31.16b, v31.16b, v15.16b
734         st1             {v28.16b-v31.16b}, [x1]
735
736 .Lout:  frame_pop
737         ret
738
            // Tail handlers.  Each one produces the 64-byte window that
            // ends at the last byte of the buffer (the input for that
            // window was already loaded above) and stores it at
            // out + count - 64, overlapping output written earlier.
            //
            //  - The tbl lookups use the index vector at x10, whose entries
            //    are (count % 64) - 64 + i.  Negative entries are out of
            //    range for tbl and yield 0, so the keystream is shifted to
            //    the end of the window and zero-padded at the front.
            //  - The tbx lookups use the index vector at x11, whose entries
            //    are (count % 64) + i.  In-range entries replace the
            //    leading window bytes with the output bytes of the previous
            //    block; out-of-range entries leave the loaded input bytes
            //    untouched.
            //
            // After the XOR, the leading bytes are therefore the previous
            // block's output, unchanged (XORed with zero), and the trailing
            // bytes are the newly encrypted tail.
            //
739         // fewer than 128 bytes of in/output
740 0:      ld1             {v8.16b}, [x10]
741         ld1             {v9.16b}, [x11]
742         movi            v10.16b, #16
            // The previous block here is the scalar one, whose output only
            // exists in memory: reload it from the start of the output
            // (x1 was advanced by 64 by the scalar stores).
743         sub             x2, x1, #64
744         add             x1, x1, x5
745         ld1             {v16.16b-v19.16b}, [x2]
746         tbl             v4.16b, {v0.16b-v3.16b}, v8.16b
747         tbx             v20.16b, {v16.16b-v19.16b}, v9.16b
            // step both index vectors on to the next 16 bytes of the window
748         add             v8.16b, v8.16b, v10.16b
749         add             v9.16b, v9.16b, v10.16b
750         tbl             v5.16b, {v0.16b-v3.16b}, v8.16b
751         tbx             v21.16b, {v16.16b-v19.16b}, v9.16b
752         add             v8.16b, v8.16b, v10.16b
753         add             v9.16b, v9.16b, v10.16b
754         tbl             v6.16b, {v0.16b-v3.16b}, v8.16b
755         tbx             v22.16b, {v16.16b-v19.16b}, v9.16b
756         add             v8.16b, v8.16b, v10.16b
757         add             v9.16b, v9.16b, v10.16b
758         tbl             v7.16b, {v0.16b-v3.16b}, v8.16b
759         tbx             v23.16b, {v16.16b-v19.16b}, v9.16b
760
761         eor             v20.16b, v20.16b, v4.16b
762         eor             v21.16b, v21.16b, v5.16b
763         eor             v22.16b, v22.16b, v6.16b
764         eor             v23.16b, v23.16b, v7.16b
765         st1             {v20.16b-v23.16b}, [x1]
766         b               .Lout
767
            // Keystream in v4-v7; previous block's output is still in
            // v16-v19 from the full-block path above.
768         // fewer than 192 bytes of in/output
769 1:      ld1             {v8.16b}, [x10]
770         ld1             {v9.16b}, [x11]
771         movi            v10.16b, #16
772         add             x1, x1, x6
773         tbl             v0.16b, {v4.16b-v7.16b}, v8.16b
774         tbx             v20.16b, {v16.16b-v19.16b}, v9.16b
775         add             v8.16b, v8.16b, v10.16b
776         add             v9.16b, v9.16b, v10.16b
777         tbl             v1.16b, {v4.16b-v7.16b}, v8.16b
778         tbx             v21.16b, {v16.16b-v19.16b}, v9.16b
779         add             v8.16b, v8.16b, v10.16b
780         add             v9.16b, v9.16b, v10.16b
781         tbl             v2.16b, {v4.16b-v7.16b}, v8.16b
782         tbx             v22.16b, {v16.16b-v19.16b}, v9.16b
783         add             v8.16b, v8.16b, v10.16b
784         add             v9.16b, v9.16b, v10.16b
785         tbl             v3.16b, {v4.16b-v7.16b}, v8.16b
786         tbx             v23.16b, {v16.16b-v19.16b}, v9.16b
787
788         eor             v20.16b, v20.16b, v0.16b
789         eor             v21.16b, v21.16b, v1.16b
790         eor             v22.16b, v22.16b, v2.16b
791         eor             v23.16b, v23.16b, v3.16b
792         st1             {v20.16b-v23.16b}, [x1]
793         b               .Lout
794
            // Keystream in v8-v11; previous block's output in v20-v23.
            // v4-v6 are free to use as index/step registers here.
795         // fewer than 256 bytes of in/output
796 2:      ld1             {v4.16b}, [x10]
797         ld1             {v5.16b}, [x11]
798         movi            v6.16b, #16
799         add             x1, x1, x7
800         tbl             v0.16b, {v8.16b-v11.16b}, v4.16b
801         tbx             v24.16b, {v20.16b-v23.16b}, v5.16b
802         add             v4.16b, v4.16b, v6.16b
803         add             v5.16b, v5.16b, v6.16b
804         tbl             v1.16b, {v8.16b-v11.16b}, v4.16b
805         tbx             v25.16b, {v20.16b-v23.16b}, v5.16b
806         add             v4.16b, v4.16b, v6.16b
807         add             v5.16b, v5.16b, v6.16b
808         tbl             v2.16b, {v8.16b-v11.16b}, v4.16b
809         tbx             v26.16b, {v20.16b-v23.16b}, v5.16b
810         add             v4.16b, v4.16b, v6.16b
811         add             v5.16b, v5.16b, v6.16b
812         tbl             v3.16b, {v8.16b-v11.16b}, v4.16b
813         tbx             v27.16b, {v20.16b-v23.16b}, v5.16b
814
815         eor             v24.16b, v24.16b, v0.16b
816         eor             v25.16b, v25.16b, v1.16b
817         eor             v26.16b, v26.16b, v2.16b
818         eor             v27.16b, v27.16b, v3.16b
819         st1             {v24.16b-v27.16b}, [x1]
820         b               .Lout
821
            // Keystream in v12-v15; previous block's output in v24-v27.
822         // fewer than 320 bytes of in/output
823 3:      ld1             {v4.16b}, [x10]
824         ld1             {v5.16b}, [x11]
825         movi            v6.16b, #16
826         add             x1, x1, x8
827         tbl             v0.16b, {v12.16b-v15.16b}, v4.16b
828         tbx             v28.16b, {v24.16b-v27.16b}, v5.16b
829         add             v4.16b, v4.16b, v6.16b
830         add             v5.16b, v5.16b, v6.16b
831         tbl             v1.16b, {v12.16b-v15.16b}, v4.16b
832         tbx             v29.16b, {v24.16b-v27.16b}, v5.16b
833         add             v4.16b, v4.16b, v6.16b
834         add             v5.16b, v5.16b, v6.16b
835         tbl             v2.16b, {v12.16b-v15.16b}, v4.16b
836         tbx             v30.16b, {v24.16b-v27.16b}, v5.16b
837         add             v4.16b, v4.16b, v6.16b
838         add             v5.16b, v5.16b, v6.16b
839         tbl             v3.16b, {v12.16b-v15.16b}, v4.16b
840         tbx             v31.16b, {v24.16b-v27.16b}, v5.16b
841
842         eor             v28.16b, v28.16b, v0.16b
843         eor             v29.16b, v29.16b, v1.16b
844         eor             v30.16b, v30.16b, v2.16b
845         eor             v31.16b, v31.16b, v3.16b
846         st1             {v28.16b-v31.16b}, [x1]
847         b               .Lout
848 ENDPROC(chacha_4block_xor_neon)
849
            // Read-only constants used by the routines in this file.
850         .section        ".rodata", "a", %progbits
851         .align          L1_CACHE_SHIFT
            // .Lpermute: 192 consecutive bytes with the values -64 .. 127
            // (0xc0-0xff followed by 0x00-0x7f).  chacha_4block_xor_neon
            // loads 16-byte index vectors from offsets (count % 64) and
            // (count % 64) + 64 of this table and uses them with tbl/tbx to
            // shift keystream and data when the last block is partial.
852 .Lpermute:
853         .set            .Li, 0
854         .rept           192
855         .byte           (.Li - 64)
856         .set            .Li, .Li + 1
857         .endr
858
            // CTRINC: per-lane block counter increments for the four NEON
            // blocks.  ROT8: tbl byte indices that rotate every 32-bit lane
            // left by 8 bits.  ROT8 must stay directly after CTRINC:
            // chacha_4block_xor_neon fetches both with a single 32-byte load
            // starting at CTRINC.
859 CTRINC: .word           1, 2, 3, 4
860 ROT8:   .word           0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f