1 /* SPDX-License-Identifier: GPL-2.0-or-later */
3 * ARIA Cipher 16-way parallel algorithm (AVX)
5 * Copyright (c) 2022 Taehee Yoo <ap420073@gmail.com>
9 #include <linux/linkage.h>
10 #include <linux/cfi_types.h>
11 #include <asm/frame.h>
13 /* struct aria_ctx: */
22 #define BV8(a0, a1, a2, a3, a4, a5, a6, a7) \
23 ( (((a0) & 1) << 0) | \
32 #define BM8X8(l0, l1, l2, l3, l4, l5, l6, l7) \
33 ( ((l7) << (0 * 8)) | \
42 #define inc_le128(x, minus_one, tmp) \
43 vpcmpeqq minus_one, x, tmp; \
44 vpsubq minus_one, x, x; \
45 vpslldq $8, tmp, tmp; \
48 #define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
49 vpand x, mask4bit, tmp0; \
50 vpandn x, mask4bit, x; \
53 vpshufb tmp0, lo_t, tmp0; \
57 #define transpose_4x4(x0, x1, x2, x3, t1, t2) \
58 vpunpckhdq x1, x0, t2; \
59 vpunpckldq x1, x0, x0; \
61 vpunpckldq x3, x2, t1; \
62 vpunpckhdq x3, x2, x2; \
64 vpunpckhqdq t1, x0, x1; \
65 vpunpcklqdq t1, x0, x0; \
67 vpunpckhqdq x2, t2, x3; \
68 vpunpcklqdq x2, t2, x2;
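/*
 * Editor's note (illustrative, not part of the upstream code): transpose_4x4
 * above is the usual SSE/AVX 4x4 transpose of 32-bit lanes, with t1/t2 used
 * as scratch.  Viewing each register as four dwords, the effect is roughly:
 *
 *	before:                 after:
 *	x0 = a0 a1 a2 a3        x0 = a0 b0 c0 d0
 *	x1 = b0 b1 b2 b3        x1 = a1 b1 c1 d1
 *	x2 = c0 c1 c2 c3        x2 = a2 b2 c2 d2
 *	x3 = d0 d1 d2 d3        x3 = a3 b3 c3 d3
 */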
70 #define byteslice_16x16b(a0, b0, c0, d0, \
77 transpose_4x4(a0, a1, a2, a3, d2, d3); \
78 transpose_4x4(b0, b1, b2, b3, d2, d3); \
84 transpose_4x4(c0, c1, c2, c3, a0, a1); \
85 transpose_4x4(d0, d1, d2, d3, a0, a1); \
87 vmovdqu .Lshufb_16x16b, a0; \
100 vpshufb a0, d0, d0; \
101 vpshufb a0, d1, d1; \
102 vpshufb a0, d2, d2; \
103 vpshufb a0, d3, d3; \
106 vpshufb a0, d3, a0; \
109 transpose_4x4(a0, b0, c0, d0, d2, d3); \
110 transpose_4x4(a1, b1, c1, d1, d2, d3); \
116 transpose_4x4(a2, b2, c2, d2, b0, b1); \
117 transpose_4x4(a3, b3, c3, d3, b0, b1); \
120 /* does not adjust output bytes inside vectors */
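/*
 * Editor's note: byteslice_16x16b treats the 16 input registers as a 16x16
 * byte matrix (register r = the 16 bytes of block r) and transposes it so
 * that afterwards register b holds byte position b of all 16 blocks.  A
 * C-like sketch of the intent (names are illustrative only):
 *
 *	for (block = 0; block < 16; block++)
 *		for (pos = 0; pos < 16; pos++)
 *			sliced[pos][block] = blocks[block][pos];
 *
 * Every per-byte ARIA operation (key addition, S-box) can then be applied
 * to all 16 blocks with a single vector instruction.
 */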
122 #define debyteslice_16x16b(a0, b0, c0, d0, \
129 transpose_4x4(a0, a1, a2, a3, d2, d3); \
130 transpose_4x4(b0, b1, b2, b3, d2, d3); \
136 transpose_4x4(c0, c1, c2, c3, a0, a1); \
137 transpose_4x4(d0, d1, d2, d3, a0, a1); \
139 vmovdqu .Lshufb_16x16b, a0; \
141 vpshufb a0, a2, a2; \
142 vpshufb a0, a3, a3; \
143 vpshufb a0, b0, b0; \
144 vpshufb a0, b1, b1; \
145 vpshufb a0, b2, b2; \
146 vpshufb a0, b3, b3; \
147 vpshufb a0, a1, a1; \
148 vpshufb a0, c0, c0; \
149 vpshufb a0, c1, c1; \
150 vpshufb a0, c2, c2; \
151 vpshufb a0, c3, c3; \
152 vpshufb a0, d0, d0; \
153 vpshufb a0, d1, d1; \
154 vpshufb a0, d2, d2; \
155 vpshufb a0, d3, d3; \
158 vpshufb a0, d3, a0; \
161 transpose_4x4(c0, d0, a0, b0, d2, d3); \
162 transpose_4x4(c1, d1, a1, b1, d2, d3); \
168 transpose_4x4(c2, d2, a2, b2, b0, b1); \
169 transpose_4x4(c3, d3, a3, b3, b0, b1); \
172 /* does not adjust output bytes inside vectors */
174 /* load blocks to registers */
175 #define inpack16_pre(x0, x1, x2, x3, \
180 vmovdqu (0 * 16)(rio), x0; \
181 vmovdqu (1 * 16)(rio), x1; \
182 vmovdqu (2 * 16)(rio), x2; \
183 vmovdqu (3 * 16)(rio), x3; \
184 vmovdqu (4 * 16)(rio), x4; \
185 vmovdqu (5 * 16)(rio), x5; \
186 vmovdqu (6 * 16)(rio), x6; \
187 vmovdqu (7 * 16)(rio), x7; \
188 vmovdqu (8 * 16)(rio), y0; \
189 vmovdqu (9 * 16)(rio), y1; \
190 vmovdqu (10 * 16)(rio), y2; \
191 vmovdqu (11 * 16)(rio), y3; \
192 vmovdqu (12 * 16)(rio), y4; \
193 vmovdqu (13 * 16)(rio), y5; \
194 vmovdqu (14 * 16)(rio), y6; \
195 vmovdqu (15 * 16)(rio), y7;
197 /* byteslice loaded blocks and store to temporary memory */
198 #define inpack16_post(x0, x1, x2, x3, \
203 byteslice_16x16b(x0, x1, x2, x3, \
207 (mem_ab), (mem_cd)); \
209 vmovdqu x0, 0 * 16(mem_ab); \
210 vmovdqu x1, 1 * 16(mem_ab); \
211 vmovdqu x2, 2 * 16(mem_ab); \
212 vmovdqu x3, 3 * 16(mem_ab); \
213 vmovdqu x4, 4 * 16(mem_ab); \
214 vmovdqu x5, 5 * 16(mem_ab); \
215 vmovdqu x6, 6 * 16(mem_ab); \
216 vmovdqu x7, 7 * 16(mem_ab); \
217 vmovdqu y0, 0 * 16(mem_cd); \
218 vmovdqu y1, 1 * 16(mem_cd); \
219 vmovdqu y2, 2 * 16(mem_cd); \
220 vmovdqu y3, 3 * 16(mem_cd); \
221 vmovdqu y4, 4 * 16(mem_cd); \
222 vmovdqu y5, 5 * 16(mem_cd); \
223 vmovdqu y6, 6 * 16(mem_cd); \
224 vmovdqu y7, 7 * 16(mem_cd);
226 #define write_output(x0, x1, x2, x3, \
231 vmovdqu x0, 0 * 16(mem); \
232 vmovdqu x1, 1 * 16(mem); \
233 vmovdqu x2, 2 * 16(mem); \
234 vmovdqu x3, 3 * 16(mem); \
235 vmovdqu x4, 4 * 16(mem); \
236 vmovdqu x5, 5 * 16(mem); \
237 vmovdqu x6, 6 * 16(mem); \
238 vmovdqu x7, 7 * 16(mem); \
239 vmovdqu y0, 8 * 16(mem); \
240 vmovdqu y1, 9 * 16(mem); \
241 vmovdqu y2, 10 * 16(mem); \
242 vmovdqu y3, 11 * 16(mem); \
243 vmovdqu y4, 12 * 16(mem); \
244 vmovdqu y5, 13 * 16(mem); \
245 vmovdqu y6, 14 * 16(mem); \
246 vmovdqu y7, 15 * 16(mem); \
248 #define aria_store_state_8way(x0, x1, x2, x3, \
251 vmovdqu x0, ((idx + 0) * 16)(mem_tmp); \
252 vmovdqu x1, ((idx + 1) * 16)(mem_tmp); \
253 vmovdqu x2, ((idx + 2) * 16)(mem_tmp); \
254 vmovdqu x3, ((idx + 3) * 16)(mem_tmp); \
255 vmovdqu x4, ((idx + 4) * 16)(mem_tmp); \
256 vmovdqu x5, ((idx + 5) * 16)(mem_tmp); \
257 vmovdqu x6, ((idx + 6) * 16)(mem_tmp); \
258 vmovdqu x7, ((idx + 7) * 16)(mem_tmp);
260 #define aria_load_state_8way(x0, x1, x2, x3, \
263 vmovdqu ((idx + 0) * 16)(mem_tmp), x0; \
264 vmovdqu ((idx + 1) * 16)(mem_tmp), x1; \
265 vmovdqu ((idx + 2) * 16)(mem_tmp), x2; \
266 vmovdqu ((idx + 3) * 16)(mem_tmp), x3; \
267 vmovdqu ((idx + 4) * 16)(mem_tmp), x4; \
268 vmovdqu ((idx + 5) * 16)(mem_tmp), x5; \
269 vmovdqu ((idx + 6) * 16)(mem_tmp), x6; \
270 vmovdqu ((idx + 7) * 16)(mem_tmp), x7;
272 #define aria_ark_8way(x0, x1, x2, x3, \
274 t0, rk, idx, round) \
276 vpbroadcastb ((round * 16) + idx + 3)(rk), t0; \
278 vpbroadcastb ((round * 16) + idx + 2)(rk), t0; \
280 vpbroadcastb ((round * 16) + idx + 1)(rk), t0; \
282 vpbroadcastb ((round * 16) + idx + 0)(rk), t0; \
284 vpbroadcastb ((round * 16) + idx + 7)(rk), t0; \
286 vpbroadcastb ((round * 16) + idx + 6)(rk), t0; \
288 vpbroadcastb ((round * 16) + idx + 5)(rk), t0; \
290 vpbroadcastb ((round * 16) + idx + 4)(rk), t0; \
293 #define aria_sbox_8way_gfni(x0, x1, x2, x3, \
297 vpbroadcastq .Ltf_s2_bitmatrix, t0; \
298 vpbroadcastq .Ltf_inv_bitmatrix, t1; \
299 vpbroadcastq .Ltf_id_bitmatrix, t2; \
300 vpbroadcastq .Ltf_aff_bitmatrix, t3; \
301 vpbroadcastq .Ltf_x2_bitmatrix, t4; \
302 vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1; \
303 vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5; \
304 vgf2p8affineqb $(tf_inv_const), t1, x2, x2; \
305 vgf2p8affineqb $(tf_inv_const), t1, x6, x6; \
306 vgf2p8affineinvqb $0, t2, x2, x2; \
307 vgf2p8affineinvqb $0, t2, x6, x6; \
308 vgf2p8affineinvqb $(tf_aff_const), t3, x0, x0; \
309 vgf2p8affineinvqb $(tf_aff_const), t3, x4, x4; \
310 vgf2p8affineqb $(tf_x2_const), t4, x3, x3; \
311 vgf2p8affineqb $(tf_x2_const), t4, x7, x7; \
312 vgf2p8affineinvqb $0, t2, x3, x3; \
313 vgf2p8affineinvqb $0, t2, x7, x7
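/*
 * Editor's note: each of ARIA's four S-boxes (S1, S2 and their inverses) is
 * an affine map over GF(2) composed with inversion in GF(2^8), so the GFNI
 * path builds them directly from vgf2p8affineinvqb (affine of the field
 * inverse) and vgf2p8affineqb (plain affine).  Roughly:
 *
 *	S1(x), S2(x)       -> one vgf2p8affineinvqb with the matching matrix
 *	S1^-1(x), S2^-1(x) -> vgf2p8affineqb (undo the affine part) followed
 *	                      by vgf2p8affineinvqb with the identity matrix
 *	                      and constant 0 (a bare GF(2^8) inversion)
 *
 * This is an editorial summary of the instruction sequence above, not a
 * formal derivation of the matrices.
 */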
315 #define aria_sbox_8way(x0, x1, x2, x3, \
320 vmovdqa .Linv_shift_row, t0; \
321 vmovdqa .Lshift_row, t1; \
322 vpbroadcastd .L0f0f0f0f, t6; \
323 vmovdqa .Ltf_lo__inv_aff__and__s2, t2; \
324 vmovdqa .Ltf_hi__inv_aff__and__s2, t3; \
325 vmovdqa .Ltf_lo__x2__and__fwd_aff, t4; \
326 vmovdqa .Ltf_hi__x2__and__fwd_aff, t5; \
328 vaesenclast t7, x0, x0; \
329 vaesenclast t7, x4, x4; \
330 vaesenclast t7, x1, x1; \
331 vaesenclast t7, x5, x5; \
332 vaesdeclast t7, x2, x2; \
333 vaesdeclast t7, x6, x6; \
335 /* AES inverse shift rows */ \
336 vpshufb t0, x0, x0; \
337 vpshufb t0, x4, x4; \
338 vpshufb t0, x1, x1; \
339 vpshufb t0, x5, x5; \
340 vpshufb t1, x3, x3; \
341 vpshufb t1, x7, x7; \
342 vpshufb t1, x2, x2; \
343 vpshufb t1, x6, x6; \
345 /* affine transformation for S2 */ \
346 filter_8bit(x1, t2, t3, t6, t0); \
347 /* affine transformation for S2 */ \
348 filter_8bit(x5, t2, t3, t6, t0); \
350 /* affine transformation for X2 */ \
351 filter_8bit(x3, t4, t5, t6, t0); \
352 /* affine transformation for X2 */ \
353 filter_8bit(x7, t4, t5, t6, t0); \
354 vaesdeclast t7, x3, x3; \
355 vaesdeclast t7, x7, x7;
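/*
 * Editor's note on the AES-NI trick used above: with an all-zero round key
 * (t7), vaesenclast performs ShiftRows + SubBytes only, and the following
 * vpshufb with .Linv_shift_row undoes the ShiftRows, leaving a pure AES
 * SubBytes (ARIA's S1) in every byte lane.  vaesdeclast plus the forward
 * .Lshift_row shuffle isolates InvSubBytes the same way, and filter_8bit
 * applies the extra affine maps (the .Ltf_* tables below) that turn these
 * into ARIA's S2 and the inverse S-boxes.  t7 is assumed to be zeroed in
 * the part of the macro not shown in this excerpt.
 */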
357 #define aria_diff_m(x0, x1, x2, x3, \
359 /* T = rotr32(X, 8); */ \
365 /* X = T ^ rotr(X, 16); */ \
372 #define aria_diff_word(x0, x1, x2, x3, \
412 #define aria_fe(x0, x1, x2, x3, \
416 mem_tmp, rk, round) \
417 aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
420 aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
421 y0, y1, y2, y3, y4, y5, y6, y7); \
423 aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
424 aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
425 aria_store_state_8way(x0, x1, x2, x3, \
429 aria_load_state_8way(x0, x1, x2, x3, \
432 aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
435 aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
436 y0, y1, y2, y3, y4, y5, y6, y7); \
438 aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
439 aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
440 aria_store_state_8way(x0, x1, x2, x3, \
443 aria_load_state_8way(y0, y1, y2, y3, \
446 aria_diff_word(x0, x1, x2, x3, \
450 /* aria_diff_byte() \
451 * T3 = ABCD -> BADC \
452 * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6 \
453 * T0 = ABCD -> CDAB \
454 * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1 \
455 * T1 = ABCD -> DCBA \
456 * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4 \
458 aria_diff_word(x2, x3, x0, x1, \
462 aria_store_state_8way(x3, x2, x1, x0, \
466 #define aria_fo(x0, x1, x2, x3, \
470 mem_tmp, rk, round) \
471 aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
474 aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
475 y0, y1, y2, y3, y4, y5, y6, y7); \
477 aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
478 aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
479 aria_store_state_8way(x0, x1, x2, x3, \
483 aria_load_state_8way(x0, x1, x2, x3, \
486 aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
489 aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
490 y0, y1, y2, y3, y4, y5, y6, y7); \
492 aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
493 aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
494 aria_store_state_8way(x0, x1, x2, x3, \
497 aria_load_state_8way(y0, y1, y2, y3, \
500 aria_diff_word(x0, x1, x2, x3, \
504 /* aria_diff_byte() \
505 * T1 = ABCD -> BADC \
506 * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6 \
507 * T2 = ABCD -> CDAB \
508 * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1 \
509 * T3 = ABCD -> DCBA \
510 * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4 \
512 aria_diff_word(x0, x1, x2, x3, \
516 aria_store_state_8way(x3, x2, x1, x0, \
520 #define aria_ff(x0, x1, x2, x3, \
524 mem_tmp, rk, round, last_round) \
525 aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
528 aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
529 y0, y1, y2, y3, y4, y5, y6, y7); \
531 aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
532 y0, rk, 8, last_round); \
534 aria_store_state_8way(x0, x1, x2, x3, \
538 aria_load_state_8way(x0, x1, x2, x3, \
541 aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
544 aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
545 y0, y1, y2, y3, y4, y5, y6, y7); \
547 aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
548 y0, rk, 0, last_round); \
550 aria_load_state_8way(y0, y1, y2, y3, \
554 #define aria_fe_gfni(x0, x1, x2, x3, \
558 mem_tmp, rk, round) \
559 aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
562 aria_sbox_8way_gfni(x2, x3, x0, x1, \
567 aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
568 aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
569 aria_store_state_8way(x0, x1, x2, x3, \
573 aria_load_state_8way(x0, x1, x2, x3, \
576 aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
579 aria_sbox_8way_gfni(x2, x3, x0, x1, \
584 aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
585 aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
586 aria_store_state_8way(x0, x1, x2, x3, \
589 aria_load_state_8way(y0, y1, y2, y3, \
592 aria_diff_word(x0, x1, x2, x3, \
596 /* aria_diff_byte() \
597 * T3 = ABCD -> BADC \
598 * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6 \
599 * T0 = ABCD -> CDAB \
600 * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1 \
601 * T1 = ABCD -> DCBA \
602 * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4 \
604 aria_diff_word(x2, x3, x0, x1, \
608 aria_store_state_8way(x3, x2, x1, x0, \
612 #define aria_fo_gfni(x0, x1, x2, x3, \
616 mem_tmp, rk, round) \
617 aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
620 aria_sbox_8way_gfni(x0, x1, x2, x3, \
625 aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
626 aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
627 aria_store_state_8way(x0, x1, x2, x3, \
631 aria_load_state_8way(x0, x1, x2, x3, \
634 aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
637 aria_sbox_8way_gfni(x0, x1, x2, x3, \
642 aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
643 aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
644 aria_store_state_8way(x0, x1, x2, x3, \
647 aria_load_state_8way(y0, y1, y2, y3, \
650 aria_diff_word(x0, x1, x2, x3, \
654 /* aria_diff_byte() \
655 * T1 = ABCD -> BADC \
656 * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6 \
657 * T2 = ABCD -> CDAB \
658 * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1 \
659 * T3 = ABCD -> DCBA \
660 * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4 \
662 aria_diff_word(x0, x1, x2, x3, \
666 aria_store_state_8way(x3, x2, x1, x0, \
670 #define aria_ff_gfni(x0, x1, x2, x3, \
674 mem_tmp, rk, round, last_round) \
675 aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
678 aria_sbox_8way_gfni(x2, x3, x0, x1, \
683 aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
684 y0, rk, 8, last_round); \
686 aria_store_state_8way(x0, x1, x2, x3, \
690 aria_load_state_8way(x0, x1, x2, x3, \
693 aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
696 aria_sbox_8way_gfni(x2, x3, x0, x1, \
701 aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
702 y0, rk, 0, last_round); \
704 aria_load_state_8way(y0, y1, y2, y3, \
708 /* NB: section is mergeable, all elements must be aligned 16-byte blocks */
709 .section .rodata.cst16, "aM", @progbits, 16
712 #define SHUFB_BYTES(idx) \
713 0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
716 .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3);
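/*
 * Editor's note: SHUFB_BYTES expands the mask above to
 *	0, 4, 8, 12,  1, 5, 9, 13,  2, 6, 10, 14,  3, 7, 11, 15
 * i.e. a vpshufb control that transposes the 4x4 byte matrix inside one
 * register, which is the in-register half of the byte-slicing transpose.
 */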
717 /* For isolating SubBytes from AESENCLAST, inverse shift row */
719 .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
720 .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
722 .byte 0x00, 0x05, 0x0a, 0x0f, 0x04, 0x09, 0x0e, 0x03
723 .byte 0x08, 0x0d, 0x02, 0x07, 0x0c, 0x01, 0x06, 0x0b
724 /* For CTR-mode IV byteswap */
726 .byte 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08
727 .byte 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00
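/*
 * Editor's note: .Lbswap128_mask reverses all 16 bytes, turning the
 * big-endian CTR IV into a little-endian 128-bit integer.  The counter can
 * then be incremented with inc_le128 (defined near the top of the file),
 * which adds 1 to the low qword and propagates the carry into the high
 * qword via the vpcmpeqq/vpslldq mask trick, using a constant whose low
 * qword is -1 and whose high qword is 0.
 */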
729 /* AES inverse affine and S2 combined:
730 * 1 1 0 0 0 0 0 1 x0 0
731 * 0 1 0 0 1 0 0 0 x1 0
732 * 1 1 0 0 1 1 1 1 x2 0
733 * 0 1 1 0 1 0 0 1 x3 1
734 * 0 1 0 0 1 1 0 0 * x4 + 0
735 * 0 1 0 1 1 0 0 0 x5 0
736 * 0 0 0 0 0 1 0 1 x6 0
737 * 1 1 1 0 0 1 1 1 x7 1
739 .Ltf_lo__inv_aff__and__s2:
740 .octa 0x92172DA81A9FA520B2370D883ABF8500
741 .Ltf_hi__inv_aff__and__s2:
742 .octa 0x2B15FFC1AF917B45E6D8320C625CB688
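/*
 * Editor's note: the two .octa values above are 16-entry nibble tables for
 * filter_8bit.  Because the combined map is affine over GF(2), each byte can
 * be transformed by looking up its low and high nibbles separately and
 * XORing the results; a C-like sketch of what filter_8bit computes per byte
 * (illustrative names):
 *
 *	out = lo_table[b & 0x0f] ^ hi_table[b >> 4];
 *
 * vpshufb performs both 16-entry lookups on all bytes of a register at once.
 */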
744 /* X2 and AES forward affine combined:
745 * 1 0 1 1 0 0 0 1 x0 0
746 * 0 1 1 1 1 0 1 1 x1 0
747 * 0 0 0 1 1 0 1 0 x2 1
748 * 0 1 0 0 0 1 0 0 x3 0
749 * 0 0 1 1 1 0 1 1 * x4 + 0
750 * 0 1 0 0 1 0 0 0 x5 0
751 * 1 1 0 1 0 0 1 1 x6 0
752 * 0 1 0 0 1 0 1 0 x7 0
754 .Ltf_lo__x2__and__fwd_aff:
755 .octa 0xEFAE0544FCBD1657B8F95213ABEA4100
756 .Ltf_hi__x2__and__fwd_aff:
757 .octa 0x3F893781E95FE1576CDA64D2BA0CB204
759 .section .rodata.cst8, "aM", @progbits, 8
762 #define tf_aff_const BV8(1, 1, 0, 0, 0, 1, 1, 0)
764 .quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1),
765 BV8(1, 1, 0, 0, 0, 1, 1, 1),
766 BV8(1, 1, 1, 0, 0, 0, 1, 1),
767 BV8(1, 1, 1, 1, 0, 0, 0, 1),
768 BV8(1, 1, 1, 1, 1, 0, 0, 0),
769 BV8(0, 1, 1, 1, 1, 1, 0, 0),
770 BV8(0, 0, 1, 1, 1, 1, 1, 0),
771 BV8(0, 0, 0, 1, 1, 1, 1, 1))
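/*
 * Editor's note: BV8() packs eight bits into a byte with a0 as bit 0, and
 * BM8X8() packs eight such rows into the 64-bit matrix operand expected by
 * vgf2p8affineqb/vgf2p8affineinvqb, with l7 placed in the least significant
 * byte.  As a worked example, tf_aff_const = BV8(1, 1, 0, 0, 0, 1, 1, 0)
 * = 0x63, the familiar constant of the AES S-box affine transform.
 */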
773 /* AES inverse affine: */
774 #define tf_inv_const BV8(1, 0, 1, 0, 0, 0, 0, 0)
776 .quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1),
777 BV8(1, 0, 0, 1, 0, 0, 1, 0),
778 BV8(0, 1, 0, 0, 1, 0, 0, 1),
779 BV8(1, 0, 1, 0, 0, 1, 0, 0),
780 BV8(0, 1, 0, 1, 0, 0, 1, 0),
781 BV8(0, 0, 1, 0, 1, 0, 0, 1),
782 BV8(1, 0, 0, 1, 0, 1, 0, 0),
783 BV8(0, 1, 0, 0, 1, 0, 1, 0))
786 #define tf_s2_const BV8(0, 1, 0, 0, 0, 1, 1, 1)
788 .quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1),
789 BV8(0, 0, 1, 1, 1, 1, 1, 1),
790 BV8(1, 1, 1, 0, 1, 1, 0, 1),
791 BV8(1, 1, 0, 0, 0, 0, 1, 1),
792 BV8(0, 1, 0, 0, 0, 0, 1, 1),
793 BV8(1, 1, 0, 0, 1, 1, 1, 0),
794 BV8(0, 1, 1, 0, 0, 0, 1, 1),
795 BV8(1, 1, 1, 1, 0, 1, 1, 0))
798 #define tf_x2_const BV8(0, 0, 1, 1, 0, 1, 0, 0)
800 .quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0),
801 BV8(0, 0, 1, 0, 0, 1, 1, 0),
802 BV8(0, 0, 0, 0, 1, 0, 1, 0),
803 BV8(1, 1, 1, 0, 0, 0, 1, 1),
804 BV8(1, 1, 1, 0, 1, 1, 0, 0),
805 BV8(0, 1, 1, 0, 1, 0, 1, 1),
806 BV8(1, 0, 1, 1, 1, 1, 0, 1),
807 BV8(1, 0, 0, 1, 0, 0, 1, 1))
809 /* Identity matrix: */
811 .quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0),
812 BV8(0, 1, 0, 0, 0, 0, 0, 0),
813 BV8(0, 0, 1, 0, 0, 0, 0, 0),
814 BV8(0, 0, 0, 1, 0, 0, 0, 0),
815 BV8(0, 0, 0, 0, 1, 0, 0, 0),
816 BV8(0, 0, 0, 0, 0, 1, 0, 0),
817 BV8(0, 0, 0, 0, 0, 0, 1, 0),
818 BV8(0, 0, 0, 0, 0, 0, 0, 1))
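/*
 * Editor's note: with the identity matrix and an immediate of 0,
 * vgf2p8affineinvqb degenerates to a plain byte-wise inversion in GF(2^8)
 * (reduction polynomial x^8 + x^4 + x^3 + x + 1, as in AES); this is how
 * aria_sbox_8way_gfni obtains the inverse S-boxes after undoing the affine
 * part with vgf2p8affineqb.
 */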
821 .section .rodata.cst4.L0f0f0f0f, "aM", @progbits, 4
828 SYM_FUNC_START_LOCAL(__aria_aesni_avx_crypt_16way)
833 * %xmm0..%xmm15: 16 byte-sliced blocks
839 leaq 8 * 16(%rax), %r8;
841 inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
842 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
844 aria_fo(%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15,
845 %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
847 aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
848 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
849 %xmm15, %rax, %r9, 1);
850 aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
851 %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
853 aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
854 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
855 %xmm15, %rax, %r9, 3);
856 aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
857 %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
859 aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
860 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
861 %xmm15, %rax, %r9, 5);
862 aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
863 %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
865 aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
866 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
867 %xmm15, %rax, %r9, 7);
868 aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
869 %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
871 aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
872 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
873 %xmm15, %rax, %r9, 9);
874 aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
875 %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
877 cmpl $12, rounds(CTX);
879 aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
880 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
881 %xmm15, %rax, %r9, 11, 12);
884 aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
885 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
886 %xmm15, %rax, %r9, 11);
887 aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
888 %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
890 cmpl $14, rounds(CTX);
892 aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
893 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
894 %xmm15, %rax, %r9, 13, 14);
897 aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
898 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
899 %xmm15, %rax, %r9, 13);
900 aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
901 %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
903 aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
904 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
905 %xmm15, %rax, %r9, 15, 16);
907 debyteslice_16x16b(%xmm8, %xmm12, %xmm1, %xmm4,
908 %xmm9, %xmm13, %xmm0, %xmm5,
909 %xmm10, %xmm14, %xmm3, %xmm6,
910 %xmm11, %xmm15, %xmm2, %xmm7,
915 SYM_FUNC_END(__aria_aesni_avx_crypt_16way)
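/*
 * Editor's note: the control flow above alternates odd (aria_fo) and even
 * (aria_fe) rounds and finishes with aria_ff, which consumes two round keys.
 * A C-like sketch of the schedule, where rk[r] is the 16-byte round key at
 * offset r * 16 in the expanded key (illustrative only):
 *
 *	for (r = 0; r < rounds - 1; r++)
 *		(r & 1) ? aria_fe(state, rk[r]) : aria_fo(state, rk[r]);
 *	aria_ff(state, rk[rounds - 1], rk[rounds]);
 *
 * rounds is 12, 14 or 16 depending on the key length, which is why the code
 * compares rounds(CTX) against 12 and 14 to pick the exit point.
 */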
917 SYM_TYPED_FUNC_START(aria_aesni_avx_encrypt_16way)
926 leaq enc_key(CTX), %r9;
928 inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
929 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
932 call __aria_aesni_avx_crypt_16way;
934 write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
935 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
940 SYM_FUNC_END(aria_aesni_avx_encrypt_16way)
942 SYM_TYPED_FUNC_START(aria_aesni_avx_decrypt_16way)
951 leaq dec_key(CTX), %r9;
953 inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
954 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
957 call __aria_aesni_avx_crypt_16way;
959 write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
960 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
965 SYM_FUNC_END(aria_aesni_avx_decrypt_16way)
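/*
 * Editor's note: encryption and decryption share __aria_aesni_avx_crypt_16way;
 * the two wrappers differ only in whether %r9 points at the enc_key or the
 * dec_key schedule inside struct aria_ctx.
 */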
967 SYM_FUNC_START_LOCAL(__aria_aesni_avx_ctr_gen_keystream_16way)
973 * %r8: iv (big endian, 128bit)
977 /* load IV and byteswap */
978 vmovdqu (%r8), %xmm8;
980 vmovdqa .Lbswap128_mask (%rip), %xmm1;
981 vpshufb %xmm1, %xmm8, %xmm3; /* be => le */
983 vpcmpeqd %xmm0, %xmm0, %xmm0;
984 vpsrldq $8, %xmm0, %xmm0; /* low: -1, high: 0 */
987 inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
988 vpshufb %xmm1, %xmm3, %xmm9;
989 inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
990 vpshufb %xmm1, %xmm3, %xmm10;
991 inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
992 vpshufb %xmm1, %xmm3, %xmm11;
993 inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
994 vpshufb %xmm1, %xmm3, %xmm12;
995 inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
996 vpshufb %xmm1, %xmm3, %xmm13;
997 inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
998 vpshufb %xmm1, %xmm3, %xmm14;
999 inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1000 vpshufb %xmm1, %xmm3, %xmm15;
1001 vmovdqu %xmm8, (0 * 16)(%rcx);
1002 vmovdqu %xmm9, (1 * 16)(%rcx);
1003 vmovdqu %xmm10, (2 * 16)(%rcx);
1004 vmovdqu %xmm11, (3 * 16)(%rcx);
1005 vmovdqu %xmm12, (4 * 16)(%rcx);
1006 vmovdqu %xmm13, (5 * 16)(%rcx);
1007 vmovdqu %xmm14, (6 * 16)(%rcx);
1008 vmovdqu %xmm15, (7 * 16)(%rcx);
1010 inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1011 vpshufb %xmm1, %xmm3, %xmm8;
1012 inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1013 vpshufb %xmm1, %xmm3, %xmm9;
1014 inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1015 vpshufb %xmm1, %xmm3, %xmm10;
1016 inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1017 vpshufb %xmm1, %xmm3, %xmm11;
1018 inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1019 vpshufb %xmm1, %xmm3, %xmm12;
1020 inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1021 vpshufb %xmm1, %xmm3, %xmm13;
1022 inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1023 vpshufb %xmm1, %xmm3, %xmm14;
1024 inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1025 vpshufb %xmm1, %xmm3, %xmm15;
1026 inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1027 vpshufb %xmm1, %xmm3, %xmm4;
1028 vmovdqu %xmm4, (%r8);
1030 vmovdqu (0 * 16)(%rcx), %xmm0;
1031 vmovdqu (1 * 16)(%rcx), %xmm1;
1032 vmovdqu (2 * 16)(%rcx), %xmm2;
1033 vmovdqu (3 * 16)(%rcx), %xmm3;
1034 vmovdqu (4 * 16)(%rcx), %xmm4;
1035 vmovdqu (5 * 16)(%rcx), %xmm5;
1036 vmovdqu (6 * 16)(%rcx), %xmm6;
1037 vmovdqu (7 * 16)(%rcx), %xmm7;
1041 SYM_FUNC_END(__aria_aesni_avx_ctr_gen_keystream_16way)
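/*
 * Editor's note: a C-like sketch of what the keystream helper computes
 * (illustrative, not the exact kernel API).  It emits 16 consecutive counter
 * blocks and advances the IV:
 *
 *	ctr = be128_to_cpu(*iv);
 *	for (i = 0; i < 16; i++)
 *		block[i] = cpu_to_be128(ctr + i);
 *	*iv = cpu_to_be128(ctr + 16);
 *
 * Blocks 0-7 go through the buffer at %rcx and are reloaded into xmm0-xmm7;
 * blocks 8-15 stay in xmm8-xmm15.
 */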
1043 SYM_TYPED_FUNC_START(aria_aesni_avx_ctr_crypt_16way)
1049 * %r8: iv (big endian, 128bit)
1053 call __aria_aesni_avx_ctr_gen_keystream_16way;
1059 leaq enc_key(CTX), %r9;
1061 call __aria_aesni_avx_crypt_16way;
1063 vpxor (0 * 16)(%r11), %xmm1, %xmm1;
1064 vpxor (1 * 16)(%r11), %xmm0, %xmm0;
1065 vpxor (2 * 16)(%r11), %xmm3, %xmm3;
1066 vpxor (3 * 16)(%r11), %xmm2, %xmm2;
1067 vpxor (4 * 16)(%r11), %xmm4, %xmm4;
1068 vpxor (5 * 16)(%r11), %xmm5, %xmm5;
1069 vpxor (6 * 16)(%r11), %xmm6, %xmm6;
1070 vpxor (7 * 16)(%r11), %xmm7, %xmm7;
1071 vpxor (8 * 16)(%r11), %xmm8, %xmm8;
1072 vpxor (9 * 16)(%r11), %xmm9, %xmm9;
1073 vpxor (10 * 16)(%r11), %xmm10, %xmm10;
1074 vpxor (11 * 16)(%r11), %xmm11, %xmm11;
1075 vpxor (12 * 16)(%r11), %xmm12, %xmm12;
1076 vpxor (13 * 16)(%r11), %xmm13, %xmm13;
1077 vpxor (14 * 16)(%r11), %xmm14, %xmm14;
1078 vpxor (15 * 16)(%r11), %xmm15, %xmm15;
1079 write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
1080 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1085 SYM_FUNC_END(aria_aesni_avx_ctr_crypt_16way)
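/*
 * Editor's note: this is plain CTR mode -- the 16 keystream counter blocks
 * are run through the ARIA core and XORed into the data read via %r11 (the
 * source pointer at this point; it is set up in code outside this excerpt),
 * so the same routine serves both CTR encryption and decryption.
 */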
1087 SYM_FUNC_START_LOCAL(__aria_aesni_avx_gfni_crypt_16way)
1092 * %xmm0..%xmm15: 16 byte-sliced blocks
1098 leaq 8 * 16(%rax), %r8;
1100 inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3,
1101 %xmm4, %xmm5, %xmm6, %xmm7,
1102 %xmm8, %xmm9, %xmm10, %xmm11,
1103 %xmm12, %xmm13, %xmm14,
1105 aria_fo_gfni(%xmm8, %xmm9, %xmm10, %xmm11,
1106 %xmm12, %xmm13, %xmm14, %xmm15,
1107 %xmm0, %xmm1, %xmm2, %xmm3,
1108 %xmm4, %xmm5, %xmm6, %xmm7,
1110 aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1111 %xmm4, %xmm5, %xmm6, %xmm7,
1112 %xmm8, %xmm9, %xmm10, %xmm11,
1113 %xmm12, %xmm13, %xmm14,
1114 %xmm15, %rax, %r9, 1);
1115 aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
1116 %xmm12, %xmm13, %xmm14, %xmm15,
1117 %xmm0, %xmm1, %xmm2, %xmm3,
1118 %xmm4, %xmm5, %xmm6, %xmm7,
1120 aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1121 %xmm4, %xmm5, %xmm6, %xmm7,
1122 %xmm8, %xmm9, %xmm10, %xmm11,
1123 %xmm12, %xmm13, %xmm14,
1124 %xmm15, %rax, %r9, 3);
1125 aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
1126 %xmm12, %xmm13, %xmm14, %xmm15,
1127 %xmm0, %xmm1, %xmm2, %xmm3,
1128 %xmm4, %xmm5, %xmm6, %xmm7,
1130 aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1131 %xmm4, %xmm5, %xmm6, %xmm7,
1132 %xmm8, %xmm9, %xmm10, %xmm11,
1133 %xmm12, %xmm13, %xmm14,
1134 %xmm15, %rax, %r9, 5);
1135 aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
1136 %xmm12, %xmm13, %xmm14, %xmm15,
1137 %xmm0, %xmm1, %xmm2, %xmm3,
1138 %xmm4, %xmm5, %xmm6, %xmm7,
1140 aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1141 %xmm4, %xmm5, %xmm6, %xmm7,
1142 %xmm8, %xmm9, %xmm10, %xmm11,
1143 %xmm12, %xmm13, %xmm14,
1144 %xmm15, %rax, %r9, 7);
1145 aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
1146 %xmm12, %xmm13, %xmm14, %xmm15,
1147 %xmm0, %xmm1, %xmm2, %xmm3,
1148 %xmm4, %xmm5, %xmm6, %xmm7,
1150 aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1151 %xmm4, %xmm5, %xmm6, %xmm7,
1152 %xmm8, %xmm9, %xmm10, %xmm11,
1153 %xmm12, %xmm13, %xmm14,
1154 %xmm15, %rax, %r9, 9);
1155 aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
1156 %xmm12, %xmm13, %xmm14, %xmm15,
1157 %xmm0, %xmm1, %xmm2, %xmm3,
1158 %xmm4, %xmm5, %xmm6, %xmm7,
1160 cmpl $12, rounds(CTX);
1161 jne .Laria_gfni_192;
1162 aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
1163 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1164 %xmm15, %rax, %r9, 11, 12);
1165 jmp .Laria_gfni_end;
1167 aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1168 %xmm4, %xmm5, %xmm6, %xmm7,
1169 %xmm8, %xmm9, %xmm10, %xmm11,
1170 %xmm12, %xmm13, %xmm14,
1171 %xmm15, %rax, %r9, 11);
1172 aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
1173 %xmm12, %xmm13, %xmm14, %xmm15,
1174 %xmm0, %xmm1, %xmm2, %xmm3,
1175 %xmm4, %xmm5, %xmm6, %xmm7,
1177 cmpl $14, rounds(CTX);
1178 jne .Laria_gfni_256;
1179 aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1180 %xmm4, %xmm5, %xmm6, %xmm7,
1181 %xmm8, %xmm9, %xmm10, %xmm11,
1182 %xmm12, %xmm13, %xmm14,
1183 %xmm15, %rax, %r9, 13, 14);
1184 jmp .Laria_gfni_end;
1186 aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1187 %xmm4, %xmm5, %xmm6, %xmm7,
1188 %xmm8, %xmm9, %xmm10, %xmm11,
1189 %xmm12, %xmm13, %xmm14,
1190 %xmm15, %rax, %r9, 13);
1191 aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
1192 %xmm12, %xmm13, %xmm14, %xmm15,
1193 %xmm0, %xmm1, %xmm2, %xmm3,
1194 %xmm4, %xmm5, %xmm6, %xmm7,
1196 aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1197 %xmm4, %xmm5, %xmm6, %xmm7,
1198 %xmm8, %xmm9, %xmm10, %xmm11,
1199 %xmm12, %xmm13, %xmm14,
1200 %xmm15, %rax, %r9, 15, 16);
1202 debyteslice_16x16b(%xmm8, %xmm12, %xmm1, %xmm4,
1203 %xmm9, %xmm13, %xmm0, %xmm5,
1204 %xmm10, %xmm14, %xmm3, %xmm6,
1205 %xmm11, %xmm15, %xmm2, %xmm7,
1210 SYM_FUNC_END(__aria_aesni_avx_gfni_crypt_16way)
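/*
 * Editor's note: __aria_aesni_avx_gfni_crypt_16way mirrors
 * __aria_aesni_avx_crypt_16way round for round; only the substitution layer
 * differs, using the aria_*_gfni macros built on vgf2p8affineqb instead of
 * the AES-NI based S-box emulation.
 */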
1212 SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_encrypt_16way)
1221 leaq enc_key(CTX), %r9;
1223 inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
1224 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1227 call __aria_aesni_avx_gfni_crypt_16way;
1229 write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
1230 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1235 SYM_FUNC_END(aria_aesni_avx_gfni_encrypt_16way)
1237 SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_decrypt_16way)
1246 leaq dec_key(CTX), %r9;
1248 inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
1249 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1252 call __aria_aesni_avx_gfni_crypt_16way;
1254 write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
1255 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1260 SYM_FUNC_END(aria_aesni_avx_gfni_decrypt_16way)
1262 SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_ctr_crypt_16way)
1268 * %r8: iv (big endian, 128bit)
1272 call __aria_aesni_avx_ctr_gen_keystream_16way
1278 leaq enc_key(CTX), %r9;
1280 call __aria_aesni_avx_gfni_crypt_16way;
1282 vpxor (0 * 16)(%r11), %xmm1, %xmm1;
1283 vpxor (1 * 16)(%r11), %xmm0, %xmm0;
1284 vpxor (2 * 16)(%r11), %xmm3, %xmm3;
1285 vpxor (3 * 16)(%r11), %xmm2, %xmm2;
1286 vpxor (4 * 16)(%r11), %xmm4, %xmm4;
1287 vpxor (5 * 16)(%r11), %xmm5, %xmm5;
1288 vpxor (6 * 16)(%r11), %xmm6, %xmm6;
1289 vpxor (7 * 16)(%r11), %xmm7, %xmm7;
1290 vpxor (8 * 16)(%r11), %xmm8, %xmm8;
1291 vpxor (9 * 16)(%r11), %xmm9, %xmm9;
1292 vpxor (10 * 16)(%r11), %xmm10, %xmm10;
1293 vpxor (11 * 16)(%r11), %xmm11, %xmm11;
1294 vpxor (12 * 16)(%r11), %xmm12, %xmm12;
1295 vpxor (13 * 16)(%r11), %xmm13, %xmm13;
1296 vpxor (14 * 16)(%r11), %xmm14, %xmm14;
1297 vpxor (15 * 16)(%r11), %xmm15, %xmm15;
1298 write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
1299 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1304 SYM_FUNC_END(aria_aesni_avx_gfni_ctr_crypt_16way)