2 * x86_64/AVX2/AES-NI assembler implementation of Camellia
4 * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
13 #include <linux/linkage.h>
14 #include <asm/frame.h>
15 #include <asm/nospec-branch.h>
17 #define CAMELLIA_TABLE_BYTE_LEN 272
19 /* struct camellia_ctx: */
21 #define key_length CAMELLIA_TABLE_BYTE_LEN
27 /**********************************************************************
29 **********************************************************************/
30 #define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
31 vpand x, mask4bit, tmp0; \
32 vpandn x, mask4bit, x; \
35 vpshufb tmp0, lo_t, tmp0; \
56 /**********************************************************************
58 **********************************************************************/
62 * x0..x7: byte-sliced AB state
63 * mem_cd: register pointer storing CD state
64 * key: index for key material
66 * x0..x7: new byte-sliced CD state
68 #define roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, t6, \
71 * S-function with AES subbytes \
73 vbroadcasti128 .Linv_shift_row, t4; \
74 vpbroadcastd .L0f0f0f0f, t7; \
75 vbroadcasti128 .Lpre_tf_lo_s1, t5; \
76 vbroadcasti128 .Lpre_tf_hi_s1, t6; \
77 vbroadcasti128 .Lpre_tf_lo_s4, t2; \
78 vbroadcasti128 .Lpre_tf_hi_s4, t3; \
80 /* AES inverse shift rows */ \
90 /* prefilter sboxes 1, 2 and 3 */ \
91 /* prefilter sbox 4 */ \
92 filter_8bit(x0, t5, t6, t7, t4); \
93 filter_8bit(x7, t5, t6, t7, t4); \
94 vextracti128 $1, x0, t0##_x; \
95 vextracti128 $1, x7, t1##_x; \
96 filter_8bit(x3, t2, t3, t7, t4); \
97 filter_8bit(x6, t2, t3, t7, t4); \
98 vextracti128 $1, x3, t3##_x; \
99 vextracti128 $1, x6, t2##_x; \
100 filter_8bit(x2, t5, t6, t7, t4); \
101 filter_8bit(x5, t5, t6, t7, t4); \
102 filter_8bit(x1, t5, t6, t7, t4); \
103 filter_8bit(x4, t5, t6, t7, t4); \
105 vpxor t4##_x, t4##_x, t4##_x; \
107 /* AES subbytes + AES shift rows */ \
108 vextracti128 $1, x2, t6##_x; \
109 vextracti128 $1, x5, t5##_x; \
110 vaesenclast t4##_x, x0##_x, x0##_x; \
111 vaesenclast t4##_x, t0##_x, t0##_x; \
112 vinserti128 $1, t0##_x, x0, x0; \
113 vaesenclast t4##_x, x7##_x, x7##_x; \
114 vaesenclast t4##_x, t1##_x, t1##_x; \
115 vinserti128 $1, t1##_x, x7, x7; \
116 vaesenclast t4##_x, x3##_x, x3##_x; \
117 vaesenclast t4##_x, t3##_x, t3##_x; \
118 vinserti128 $1, t3##_x, x3, x3; \
119 vaesenclast t4##_x, x6##_x, x6##_x; \
120 vaesenclast t4##_x, t2##_x, t2##_x; \
121 vinserti128 $1, t2##_x, x6, x6; \
122 vextracti128 $1, x1, t3##_x; \
123 vextracti128 $1, x4, t2##_x; \
124 vbroadcasti128 .Lpost_tf_lo_s1, t0; \
125 vbroadcasti128 .Lpost_tf_hi_s1, t1; \
126 vaesenclast t4##_x, x2##_x, x2##_x; \
127 vaesenclast t4##_x, t6##_x, t6##_x; \
128 vinserti128 $1, t6##_x, x2, x2; \
129 vaesenclast t4##_x, x5##_x, x5##_x; \
130 vaesenclast t4##_x, t5##_x, t5##_x; \
131 vinserti128 $1, t5##_x, x5, x5; \
132 vaesenclast t4##_x, x1##_x, x1##_x; \
133 vaesenclast t4##_x, t3##_x, t3##_x; \
134 vinserti128 $1, t3##_x, x1, x1; \
135 vaesenclast t4##_x, x4##_x, x4##_x; \
136 vaesenclast t4##_x, t2##_x, t2##_x; \
137 vinserti128 $1, t2##_x, x4, x4; \
139 /* postfilter sboxes 1 and 4 */ \
140 vbroadcasti128 .Lpost_tf_lo_s3, t2; \
141 vbroadcasti128 .Lpost_tf_hi_s3, t3; \
142 filter_8bit(x0, t0, t1, t7, t6); \
143 filter_8bit(x7, t0, t1, t7, t6); \
144 filter_8bit(x3, t0, t1, t7, t6); \
145 filter_8bit(x6, t0, t1, t7, t6); \
147 /* postfilter sbox 3 */ \
148 vbroadcasti128 .Lpost_tf_lo_s2, t4; \
149 vbroadcasti128 .Lpost_tf_hi_s2, t5; \
150 filter_8bit(x2, t2, t3, t7, t6); \
151 filter_8bit(x5, t2, t3, t7, t6); \
153 vpbroadcastq key, t0; /* higher 64-bit duplicate ignored */ \
155 /* postfilter sbox 2 */ \
156 filter_8bit(x1, t4, t5, t7, t2); \
157 filter_8bit(x4, t4, t5, t7, t2); \
160 vpsrldq $1, t0, t1; \
161 vpsrldq $2, t0, t2; \
162 vpshufb t7, t1, t1; \
163 vpsrldq $3, t0, t3; \
171 vpshufb t7, t2, t2; \
172 vpsrldq $4, t0, t4; \
173 vpshufb t7, t3, t3; \
174 vpsrldq $5, t0, t5; \
175 vpshufb t7, t4, t4; \
182 vpsrldq $6, t0, t6; \
183 vpshufb t7, t5, t5; \
184 vpshufb t7, t6, t6; \
194 vpxor x2, x7, x7; /* note: high and low parts swapped */ \
196 /* Add key material and result to CD (x becomes new CD) */ \
199 vpxor 5 * 32(mem_cd), x1, x1; \
201 vpsrldq $7, t0, t6; \
202 vpshufb t7, t0, t0; \
203 vpshufb t7, t6, t7; \
206 vpxor 4 * 32(mem_cd), x0, x0; \
209 vpxor 6 * 32(mem_cd), x2, x2; \
212 vpxor 7 * 32(mem_cd), x3, x3; \
215 vpxor 0 * 32(mem_cd), x4, x4; \
218 vpxor 1 * 32(mem_cd), x5, x5; \
221 vpxor 2 * 32(mem_cd), x6, x6; \
224 vpxor 3 * 32(mem_cd), x7, x7;
227 * Size optimization... with roundsm32 inlined, the binary would be over 5
228 * times larger and would only be marginally faster.
231 roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd:
232 roundsm32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
233 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15,
236 ENDPROC(roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd)
239 roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab:
240 roundsm32(%ymm4, %ymm5, %ymm6, %ymm7, %ymm0, %ymm1, %ymm2, %ymm3,
241 %ymm12, %ymm13, %ymm14, %ymm15, %ymm8, %ymm9, %ymm10, %ymm11,
244 ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
248 * x0..x7: byte-sliced AB state preloaded
249 * mem_ab: byte-sliced AB state in memory
250 * mem_cd: byte-sliced CD state in memory
/*
 * two_roundsm32: run two consecutive rounds through the out-of-line
 * roundsm32 helper subroutines.
 *
 * Parameters:
 *  x0..x7:   registers written to mem_cd between the two calls and handed
 *            to store_ab at the end
 *  y0..y7:   not referenced by this macro body; the helpers it calls are
 *            instantiated on fixed %ymm registers
 *            (NOTE(review): callers presumably have to pass exactly the
 *            registers the helpers are hard-wired to -- confirm)
 *  mem_ab:   base register passed through to store_ab
 *  mem_cd:   base register of the eight 32-byte CD slots
 *  i:        key_table index (8-byte entries) used for the first round
 *  dir:      added to i to form the key index of the second round
 *  store_ab: macro invoked last as store_ab(x0..x7, mem_ab)
 *
 * Steps: point %r9 at key_table entry i and call the ..._cd helper; save
 * x0..x3 to mem_cd slots 4..7 and x4..x7 to mem_cd slots 0..3; point %r9 at
 * key_table entry i + dir and call the ..._ab helper; finally expand
 * store_ab.
 *
 * Overwrites %r9. CTX and key_table are defined outside this macro.
 */
252 #define two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
253 y6, y7, mem_ab, mem_cd, i, dir, store_ab) \
254 leaq (key_table + (i) * 8)(CTX), %r9; \
255 call roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd; \
257 vmovdqu x0, 4 * 32(mem_cd); \
258 vmovdqu x1, 5 * 32(mem_cd); \
259 vmovdqu x2, 6 * 32(mem_cd); \
260 vmovdqu x3, 7 * 32(mem_cd); \
261 vmovdqu x4, 0 * 32(mem_cd); \
262 vmovdqu x5, 1 * 32(mem_cd); \
263 vmovdqu x6, 2 * 32(mem_cd); \
264 vmovdqu x7, 3 * 32(mem_cd); \
266 leaq (key_table + ((i) + (dir)) * 8)(CTX), %r9; \
267 call roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab; \
269 store_ab(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab);
/*
 * dummy_store: no-op with the same parameter list as store_ab_state.
 * Passed as the store_ab argument of two_roundsm32 when the AB registers
 * should not be written to mem_ab after the round pair (the last pair in
 * enc_rounds32 and dec_rounds32).
 */
271 #define dummy_store(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) /* do nothing */
/*
 * store_ab_state: write the eight state registers x0..x7 to memory.
 *
 * Register xN goes to slot N (offset N * 32 bytes) of the area addressed by
 * mem_ab; x4..x7 are stored first, then x0..x3. The stores use vmovdqu, so
 * no alignment of mem_ab is required by this macro. No register is modified.
 */
273 #define store_ab_state(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) \
274 /* Store new AB state */ \
275 vmovdqu x4, 4 * 32(mem_ab); \
276 vmovdqu x5, 5 * 32(mem_ab); \
277 vmovdqu x6, 6 * 32(mem_ab); \
278 vmovdqu x7, 7 * 32(mem_ab); \
279 vmovdqu x0, 0 * 32(mem_ab); \
280 vmovdqu x1, 1 * 32(mem_ab); \
281 vmovdqu x2, 2 * 32(mem_ab); \
282 vmovdqu x3, 3 * 32(mem_ab);
/*
 * enc_rounds32: three back-to-back two_roundsm32 invocations that step the
 * key index upwards (dir = 1).
 *
 * The invocations start at key indices (i) + 2, (i) + 4 and (i) + 6. Each
 * one also uses its start index plus dir, so key_table entries
 * (i) + 2 .. (i) + 7 are consumed in ascending order.
 *
 * The first two invocations pass store_ab_state, so x0..x7 are written to
 * mem_ab after each of them. The last one passes dummy_store, so nothing is
 * stored to mem_ab after the final round pair.
 */
284 #define enc_rounds32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
285 y6, y7, mem_ab, mem_cd, i) \
286 two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
287 y6, y7, mem_ab, mem_cd, (i) + 2, 1, store_ab_state); \
288 two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
289 y6, y7, mem_ab, mem_cd, (i) + 4, 1, store_ab_state); \
290 two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
291 y6, y7, mem_ab, mem_cd, (i) + 6, 1, dummy_store);
/*
 * dec_rounds32: three back-to-back two_roundsm32 invocations that step the
 * key index downwards (dir = -1).
 *
 * The invocations start at key indices (i) + 7, (i) + 5 and (i) + 3. Each
 * one also uses its start index plus dir, so key_table entries
 * (i) + 7 .. (i) + 2 are consumed in descending order, the reverse of
 * enc_rounds32.
 *
 * The first two invocations pass store_ab_state, so x0..x7 are written to
 * mem_ab after each of them. The last one passes dummy_store, so nothing is
 * stored to mem_ab after the final round pair.
 */
293 #define dec_rounds32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
294 y6, y7, mem_ab, mem_cd, i) \
295 two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
296 y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \
297 two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
298 y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \
299 two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
300 y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store);
304 * v0..3: byte-sliced 32-bit integers
308 #define rol32_1_32(v0, v1, v2, v3, t0, t1, t2, zero) \
309 vpcmpgtb v0, zero, t0; \
313 vpcmpgtb v1, zero, t1; \
317 vpcmpgtb v2, zero, t2; \
323 vpcmpgtb v3, zero, t0; \
333 * r: byte-sliced AB state in memory
334 * l: byte-sliced CD state in memory
336 * x0..x7: new byte-sliced CD state
338 #define fls32(l, l0, l1, l2, l3, l4, l5, l6, l7, r, t0, t1, t2, t3, tt0, \
339 tt1, tt2, tt3, kll, klr, krl, krr) \
343 * lr ^= rol32(t0, 1); \
345 vpbroadcastd kll, t0; /* only lowest 32-bit used */ \
346 vpxor tt0, tt0, tt0; \
347 vpshufb tt0, t0, t3; \
348 vpsrldq $1, t0, t0; \
349 vpshufb tt0, t0, t2; \
350 vpsrldq $1, t0, t0; \
351 vpshufb tt0, t0, t1; \
352 vpsrldq $1, t0, t0; \
353 vpshufb tt0, t0, t0; \
360 rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
363 vpbroadcastd krr, t0; /* only lowest 32-bit used */ \
364 vmovdqu l4, 4 * 32(l); \
366 vmovdqu l5, 5 * 32(l); \
368 vmovdqu l6, 6 * 32(l); \
370 vmovdqu l7, 7 * 32(l); \
378 vpshufb tt0, t0, t3; \
379 vpsrldq $1, t0, t0; \
380 vpshufb tt0, t0, t2; \
381 vpsrldq $1, t0, t0; \
382 vpshufb tt0, t0, t1; \
383 vpsrldq $1, t0, t0; \
384 vpshufb tt0, t0, t0; \
386 vpor 4 * 32(r), t0, t0; \
387 vpor 5 * 32(r), t1, t1; \
388 vpor 6 * 32(r), t2, t2; \
389 vpor 7 * 32(r), t3, t3; \
391 vpxor 0 * 32(r), t0, t0; \
392 vpxor 1 * 32(r), t1, t1; \
393 vpxor 2 * 32(r), t2, t2; \
394 vpxor 3 * 32(r), t3, t3; \
395 vmovdqu t0, 0 * 32(r); \
396 vpbroadcastd krl, t0; /* only lowest 32-bit used */ \
397 vmovdqu t1, 1 * 32(r); \
398 vmovdqu t2, 2 * 32(r); \
399 vmovdqu t3, 3 * 32(r); \
404 * rr ^= rol32(t2, 1); \
406 vpshufb tt0, t0, t3; \
407 vpsrldq $1, t0, t0; \
408 vpshufb tt0, t0, t2; \
409 vpsrldq $1, t0, t0; \
410 vpshufb tt0, t0, t1; \
411 vpsrldq $1, t0, t0; \
412 vpshufb tt0, t0, t0; \
414 vpand 0 * 32(r), t0, t0; \
415 vpand 1 * 32(r), t1, t1; \
416 vpand 2 * 32(r), t2, t2; \
417 vpand 3 * 32(r), t3, t3; \
419 rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
421 vpxor 4 * 32(r), t0, t0; \
422 vpxor 5 * 32(r), t1, t1; \
423 vpxor 6 * 32(r), t2, t2; \
424 vpxor 7 * 32(r), t3, t3; \
425 vmovdqu t0, 4 * 32(r); \
426 vpbroadcastd klr, t0; /* only lowest 32-bit used */ \
427 vmovdqu t1, 5 * 32(r); \
428 vmovdqu t2, 6 * 32(r); \
429 vmovdqu t3, 7 * 32(r); \
437 vpshufb tt0, t0, t3; \
438 vpsrldq $1, t0, t0; \
439 vpshufb tt0, t0, t2; \
440 vpsrldq $1, t0, t0; \
441 vpshufb tt0, t0, t1; \
442 vpsrldq $1, t0, t0; \
443 vpshufb tt0, t0, t0; \
451 vmovdqu l0, 0 * 32(l); \
453 vmovdqu l1, 1 * 32(l); \
455 vmovdqu l2, 2 * 32(l); \
457 vmovdqu l3, 3 * 32(l);
/*
 * transpose_4x4: transpose a 4x4 matrix of 32-bit elements in place.
 *
 * x0..x3 are the four rows on entry and the four columns on exit:
 * afterwards x0 holds element 0 of each input row, x1 element 1, and so on.
 * t1 and t2 are scratch registers and are overwritten.
 *
 * The first four instructions interleave 32-bit elements of row pairs
 * (x0/x1 and x2/x3); the last four interleave the resulting 64-bit halves
 * to finish the transpose. With %ymm operands the AVX2 unpack instructions
 * operate on each 128-bit lane independently, so two matrices are
 * transposed at once.
 */
459 #define transpose_4x4(x0, x1, x2, x3, t1, t2) \
460 vpunpckhdq x1, x0, t2; \
461 vpunpckldq x1, x0, x0; \
463 vpunpckldq x3, x2, t1; \
464 vpunpckhdq x3, x2, x2; \
466 vpunpckhqdq t1, x0, x1; \
467 vpunpcklqdq t1, x0, x0; \
469 vpunpckhqdq x2, t2, x3; \
470 vpunpcklqdq x2, t2, x2;
472 #define byteslice_16x16b_fast(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, \
473 a3, b3, c3, d3, st0, st1) \
476 transpose_4x4(a0, a1, a2, a3, d2, d3); \
477 transpose_4x4(b0, b1, b2, b3, d2, d3); \
483 transpose_4x4(c0, c1, c2, c3, a0, a1); \
484 transpose_4x4(d0, d1, d2, d3, a0, a1); \
486 vbroadcasti128 .Lshufb_16x16b, a0; \
488 vpshufb a0, a2, a2; \
489 vpshufb a0, a3, a3; \
490 vpshufb a0, b0, b0; \
491 vpshufb a0, b1, b1; \
492 vpshufb a0, b2, b2; \
493 vpshufb a0, b3, b3; \
494 vpshufb a0, a1, a1; \
495 vpshufb a0, c0, c0; \
496 vpshufb a0, c1, c1; \
497 vpshufb a0, c2, c2; \
498 vpshufb a0, c3, c3; \
499 vpshufb a0, d0, d0; \
500 vpshufb a0, d1, d1; \
501 vpshufb a0, d2, d2; \
502 vpshufb a0, d3, d3; \
505 vpshufb a0, d3, a0; \
508 transpose_4x4(a0, b0, c0, d0, d2, d3); \
509 transpose_4x4(a1, b1, c1, d1, d2, d3); \
515 transpose_4x4(a2, b2, c2, d2, b0, b1); \
516 transpose_4x4(a3, b3, c3, d3, b0, b1); \
519 /* does not adjust output bytes inside vectors */
521 /* load blocks to registers and apply pre-whitening */
522 #define inpack32_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
524 vpbroadcastq key, x0; \
525 vpshufb .Lpack_bswap, x0, x0; \
527 vpxor 0 * 32(rio), x0, y7; \
528 vpxor 1 * 32(rio), x0, y6; \
529 vpxor 2 * 32(rio), x0, y5; \
530 vpxor 3 * 32(rio), x0, y4; \
531 vpxor 4 * 32(rio), x0, y3; \
532 vpxor 5 * 32(rio), x0, y2; \
533 vpxor 6 * 32(rio), x0, y1; \
534 vpxor 7 * 32(rio), x0, y0; \
535 vpxor 8 * 32(rio), x0, x7; \
536 vpxor 9 * 32(rio), x0, x6; \
537 vpxor 10 * 32(rio), x0, x5; \
538 vpxor 11 * 32(rio), x0, x4; \
539 vpxor 12 * 32(rio), x0, x3; \
540 vpxor 13 * 32(rio), x0, x2; \
541 vpxor 14 * 32(rio), x0, x1; \
542 vpxor 15 * 32(rio), x0, x0;
544 /* byteslice pre-whitened blocks and store to temporary memory */
/*
 * inpack32_post: byteslice the sixteen input registers and spill them.
 *
 * Parameters:
 *  x0..x7: first eight registers; stored to mem_ab slots 0..7 afterwards
 *  y0..y7: second eight registers; stored to mem_cd slots 0..7 afterwards
 *  mem_ab: base register of eight 32-byte slots; also passed, dereferenced
 *          as (mem_ab), as the first scratch operand of
 *          byteslice_16x16b_fast
 *  mem_cd: base register of eight 32-byte slots; also passed, dereferenced
 *          as (mem_cd), as the second scratch operand of
 *          byteslice_16x16b_fast
 *
 * The registers still hold the stored values when the macro ends; the
 * stores themselves (vmovdqu) do not modify them.
 */
545 #define inpack32_post(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
546 y6, y7, mem_ab, mem_cd) \
547 byteslice_16x16b_fast(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, \
548 y4, y5, y6, y7, (mem_ab), (mem_cd)); \
550 vmovdqu x0, 0 * 32(mem_ab); \
551 vmovdqu x1, 1 * 32(mem_ab); \
552 vmovdqu x2, 2 * 32(mem_ab); \
553 vmovdqu x3, 3 * 32(mem_ab); \
554 vmovdqu x4, 4 * 32(mem_ab); \
555 vmovdqu x5, 5 * 32(mem_ab); \
556 vmovdqu x6, 6 * 32(mem_ab); \
557 vmovdqu x7, 7 * 32(mem_ab); \
558 vmovdqu y0, 0 * 32(mem_cd); \
559 vmovdqu y1, 1 * 32(mem_cd); \
560 vmovdqu y2, 2 * 32(mem_cd); \
561 vmovdqu y3, 3 * 32(mem_cd); \
562 vmovdqu y4, 4 * 32(mem_cd); \
563 vmovdqu y5, 5 * 32(mem_cd); \
564 vmovdqu y6, 6 * 32(mem_cd); \
565 vmovdqu y7, 7 * 32(mem_cd);
567 /* de-byteslice, apply post-whitening and store blocks */
568 #define outunpack32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
569 y5, y6, y7, key, stack_tmp0, stack_tmp1) \
570 byteslice_16x16b_fast(y0, y4, x0, x4, y1, y5, x1, x5, y2, y6, x2, x6, \
571 y3, y7, x3, x7, stack_tmp0, stack_tmp1); \
573 vmovdqu x0, stack_tmp0; \
575 vpbroadcastq key, x0; \
576 vpshufb .Lpack_bswap, x0, x0; \
593 vpxor stack_tmp0, x0, x0;
595 #define write_output(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
597 vmovdqu x0, 0 * 32(rio); \
598 vmovdqu x1, 1 * 32(rio); \
599 vmovdqu x2, 2 * 32(rio); \
600 vmovdqu x3, 3 * 32(rio); \
601 vmovdqu x4, 4 * 32(rio); \
602 vmovdqu x5, 5 * 32(rio); \
603 vmovdqu x6, 6 * 32(rio); \
604 vmovdqu x7, 7 * 32(rio); \
605 vmovdqu y0, 8 * 32(rio); \
606 vmovdqu y1, 9 * 32(rio); \
607 vmovdqu y2, 10 * 32(rio); \
608 vmovdqu y3, 11 * 32(rio); \
609 vmovdqu y4, 12 * 32(rio); \
610 vmovdqu y5, 13 * 32(rio); \
611 vmovdqu y6, 14 * 32(rio); \
612 vmovdqu y7, 15 * 32(rio);
/*
 * SHUFB_BYTES(idx): expands to the four comma-separated byte values
 * idx, 4 + idx, 8 + idx, 12 + idx. Four expansions with idx = 0..3 are
 * concatenated by the .byte directives that follow to form one 16-byte
 * table row.
 */
617 #define SHUFB_BYTES(idx) \
618 0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
621 .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
622 .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
625 .long 0x00010203, 0x04050607, 0x80808080, 0x80808080
626 .long 0x00010203, 0x04050607, 0x80808080, 0x80808080
628 /* For CTR-mode IV byteswap */
630 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
633 .Lxts_gf128mul_and_shl1_mask_0:
634 .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
635 .Lxts_gf128mul_and_shl1_mask_1:
636 .byte 0x0e, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0
639 * pre-SubByte transform
641 * pre-lookup for sbox1, sbox2, sbox3:
642 * swap_bitendianness(
643 * isom_map_camellia_to_aes(
645 * swap_bitendianness(in)
650 * (note: '⊕ 0xc5' inside camellia_f())
653 .byte 0x45, 0xe8, 0x40, 0xed, 0x2e, 0x83, 0x2b, 0x86
654 .byte 0x4b, 0xe6, 0x4e, 0xe3, 0x20, 0x8d, 0x25, 0x88
656 .byte 0x00, 0x51, 0xf1, 0xa0, 0x8a, 0xdb, 0x7b, 0x2a
657 .byte 0x09, 0x58, 0xf8, 0xa9, 0x83, 0xd2, 0x72, 0x23
660 * pre-SubByte transform
662 * pre-lookup for sbox4:
663 * swap_bitendianness(
664 * isom_map_camellia_to_aes(
666 * swap_bitendianness(in <<< 1)
671 * (note: '⊕ 0xc5' inside camellia_f())
674 .byte 0x45, 0x40, 0x2e, 0x2b, 0x4b, 0x4e, 0x20, 0x25
675 .byte 0x14, 0x11, 0x7f, 0x7a, 0x1a, 0x1f, 0x71, 0x74
677 .byte 0x00, 0xf1, 0x8a, 0x7b, 0x09, 0xf8, 0x83, 0x72
678 .byte 0xad, 0x5c, 0x27, 0xd6, 0xa4, 0x55, 0x2e, 0xdf
681 * post-SubByte transform
683 * post-lookup for sbox1, sbox4:
684 * swap_bitendianness(
686 * isom_map_aes_to_camellia(
687 * swap_bitendianness(
688 * aes_inverse_affine_transform(in)
694 * (note: '⊕ 0x6e' inside camellia_h())
697 .byte 0x3c, 0xcc, 0xcf, 0x3f, 0x32, 0xc2, 0xc1, 0x31
698 .byte 0xdc, 0x2c, 0x2f, 0xdf, 0xd2, 0x22, 0x21, 0xd1
700 .byte 0x00, 0xf9, 0x86, 0x7f, 0xd7, 0x2e, 0x51, 0xa8
701 .byte 0xa4, 0x5d, 0x22, 0xdb, 0x73, 0x8a, 0xf5, 0x0c
704 * post-SubByte transform
706 * post-lookup for sbox2:
707 * swap_bitendianness(
709 * isom_map_aes_to_camellia(
710 * swap_bitendianness(
711 * aes_inverse_affine_transform(in)
717 * (note: '⊕ 0x6e' inside camellia_h())
720 .byte 0x78, 0x99, 0x9f, 0x7e, 0x64, 0x85, 0x83, 0x62
721 .byte 0xb9, 0x58, 0x5e, 0xbf, 0xa5, 0x44, 0x42, 0xa3
723 .byte 0x00, 0xf3, 0x0d, 0xfe, 0xaf, 0x5c, 0xa2, 0x51
724 .byte 0x49, 0xba, 0x44, 0xb7, 0xe6, 0x15, 0xeb, 0x18
727 * post-SubByte transform
729 * post-lookup for sbox3:
730 * swap_bitendianness(
732 * isom_map_aes_to_camellia(
733 * swap_bitendianness(
734 * aes_inverse_affine_transform(in)
740 * (note: '⊕ 0x6e' inside camellia_h())
743 .byte 0x1e, 0x66, 0xe7, 0x9f, 0x19, 0x61, 0xe0, 0x98
744 .byte 0x6e, 0x16, 0x97, 0xef, 0x69, 0x11, 0x90, 0xe8
746 .byte 0x00, 0xfc, 0x43, 0xbf, 0xeb, 0x17, 0xa8, 0x54
747 .byte 0x52, 0xae, 0x11, 0xed, 0xb9, 0x45, 0xfa, 0x06
749 /* For isolating SubBytes from AESENCLAST, inverse shift row */
751 .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
752 .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
762 __camellia_enc_blk32:
765 * %rax: temporary storage, 512 bytes
766 * %ymm0..%ymm15: 32 plaintext blocks
768 * %ymm0..%ymm15: 32 encrypted blocks, order swapped:
769 * 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
773 leaq 8 * 32(%rax), %rcx;
775 inpack32_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
776 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
779 enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
780 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
781 %ymm15, %rax, %rcx, 0);
783 fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
784 %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
786 ((key_table + (8) * 8) + 0)(CTX),
787 ((key_table + (8) * 8) + 4)(CTX),
788 ((key_table + (8) * 8) + 8)(CTX),
789 ((key_table + (8) * 8) + 12)(CTX));
791 enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
792 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
793 %ymm15, %rax, %rcx, 8);
795 fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
796 %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
798 ((key_table + (16) * 8) + 0)(CTX),
799 ((key_table + (16) * 8) + 4)(CTX),
800 ((key_table + (16) * 8) + 8)(CTX),
801 ((key_table + (16) * 8) + 12)(CTX));
803 enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
804 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
805 %ymm15, %rax, %rcx, 16);
808 cmpl $16, key_length(CTX);
812 /* load CD for output */
813 vmovdqu 0 * 32(%rcx), %ymm8;
814 vmovdqu 1 * 32(%rcx), %ymm9;
815 vmovdqu 2 * 32(%rcx), %ymm10;
816 vmovdqu 3 * 32(%rcx), %ymm11;
817 vmovdqu 4 * 32(%rcx), %ymm12;
818 vmovdqu 5 * 32(%rcx), %ymm13;
819 vmovdqu 6 * 32(%rcx), %ymm14;
820 vmovdqu 7 * 32(%rcx), %ymm15;
822 outunpack32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
823 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
824 %ymm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 32(%rax));
833 fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
834 %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
836 ((key_table + (24) * 8) + 0)(CTX),
837 ((key_table + (24) * 8) + 4)(CTX),
838 ((key_table + (24) * 8) + 8)(CTX),
839 ((key_table + (24) * 8) + 12)(CTX));
841 enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
842 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
843 %ymm15, %rax, %rcx, 24);
846 ENDPROC(__camellia_enc_blk32)
849 __camellia_dec_blk32:
852 * %rax: temporary storage, 512 bytes
853 * %r8d: 24 for 16 byte key, 32 for larger
854 * %ymm0..%ymm15: 32 encrypted blocks
856 * %ymm0..%ymm15: 32 plaintext blocks, order swapped:
857 * 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
861 leaq 8 * 32(%rax), %rcx;
863 inpack32_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
864 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
871 dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
872 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
873 %ymm15, %rax, %rcx, 16);
875 fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
876 %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
878 ((key_table + (16) * 8) + 8)(CTX),
879 ((key_table + (16) * 8) + 12)(CTX),
880 ((key_table + (16) * 8) + 0)(CTX),
881 ((key_table + (16) * 8) + 4)(CTX));
883 dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
884 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
885 %ymm15, %rax, %rcx, 8);
887 fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
888 %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
890 ((key_table + (8) * 8) + 8)(CTX),
891 ((key_table + (8) * 8) + 12)(CTX),
892 ((key_table + (8) * 8) + 0)(CTX),
893 ((key_table + (8) * 8) + 4)(CTX));
895 dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
896 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
897 %ymm15, %rax, %rcx, 0);
899 /* load CD for output */
900 vmovdqu 0 * 32(%rcx), %ymm8;
901 vmovdqu 1 * 32(%rcx), %ymm9;
902 vmovdqu 2 * 32(%rcx), %ymm10;
903 vmovdqu 3 * 32(%rcx), %ymm11;
904 vmovdqu 4 * 32(%rcx), %ymm12;
905 vmovdqu 5 * 32(%rcx), %ymm13;
906 vmovdqu 6 * 32(%rcx), %ymm14;
907 vmovdqu 7 * 32(%rcx), %ymm15;
909 outunpack32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
910 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
911 %ymm15, (key_table)(CTX), (%rax), 1 * 32(%rax));
918 dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
919 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
920 %ymm15, %rax, %rcx, 24);
922 fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
923 %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
925 ((key_table + (24) * 8) + 8)(CTX),
926 ((key_table + (24) * 8) + 12)(CTX),
927 ((key_table + (24) * 8) + 0)(CTX),
928 ((key_table + (24) * 8) + 4)(CTX));
931 ENDPROC(__camellia_dec_blk32)
933 ENTRY(camellia_ecb_enc_32way)
936 * %rsi: dst (32 blocks)
937 * %rdx: src (32 blocks)
943 inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
944 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
945 %ymm15, %rdx, (key_table)(CTX));
947 /* now dst can be used as temporary buffer (even in src == dst case) */
950 call __camellia_enc_blk32;
952 write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
953 %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
960 ENDPROC(camellia_ecb_enc_32way)
962 ENTRY(camellia_ecb_dec_32way)
965 * %rsi: dst (32 blocks)
966 * %rdx: src (32 blocks)
972 cmpl $16, key_length(CTX);
975 cmovel %eax, %r8d; /* max */
977 inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
978 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
979 %ymm15, %rdx, (key_table)(CTX, %r8, 8));
981 /* now dst can be used as temporary buffer (even in src == dst case) */
984 call __camellia_dec_blk32;
986 write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
987 %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
994 ENDPROC(camellia_ecb_dec_32way)
996 ENTRY(camellia_cbc_dec_32way)
999 * %rsi: dst (32 blocks)
1000 * %rdx: src (32 blocks)
1006 cmpl $16, key_length(CTX);
1009 cmovel %eax, %r8d; /* max */
1011 inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
1012 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
1013 %ymm15, %rdx, (key_table)(CTX, %r8, 8));
1017 je .Lcbc_dec_use_stack;
1019 /* dst can be used as temporary storage, src is not overwritten. */
1021 jmp .Lcbc_dec_continue;
1023 .Lcbc_dec_use_stack:
1025 * dst still in-use (because dst == src), so use stack for temporary
1028 subq $(16 * 32), %rsp;
1032 call __camellia_dec_blk32;
1034 vmovdqu %ymm7, (%rax);
1035 vpxor %ymm7, %ymm7, %ymm7;
1036 vinserti128 $1, (%rdx), %ymm7, %ymm7;
1037 vpxor (%rax), %ymm7, %ymm7;
1039 vpxor (0 * 32 + 16)(%rdx), %ymm6, %ymm6;
1040 vpxor (1 * 32 + 16)(%rdx), %ymm5, %ymm5;
1041 vpxor (2 * 32 + 16)(%rdx), %ymm4, %ymm4;
1042 vpxor (3 * 32 + 16)(%rdx), %ymm3, %ymm3;
1043 vpxor (4 * 32 + 16)(%rdx), %ymm2, %ymm2;
1044 vpxor (5 * 32 + 16)(%rdx), %ymm1, %ymm1;
1045 vpxor (6 * 32 + 16)(%rdx), %ymm0, %ymm0;
1046 vpxor (7 * 32 + 16)(%rdx), %ymm15, %ymm15;
1047 vpxor (8 * 32 + 16)(%rdx), %ymm14, %ymm14;
1048 vpxor (9 * 32 + 16)(%rdx), %ymm13, %ymm13;
1049 vpxor (10 * 32 + 16)(%rdx), %ymm12, %ymm12;
1050 vpxor (11 * 32 + 16)(%rdx), %ymm11, %ymm11;
1051 vpxor (12 * 32 + 16)(%rdx), %ymm10, %ymm10;
1052 vpxor (13 * 32 + 16)(%rdx), %ymm9, %ymm9;
1053 vpxor (14 * 32 + 16)(%rdx), %ymm8, %ymm8;
1054 write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
1055 %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
1062 ENDPROC(camellia_cbc_dec_32way)
1064 #define inc_le128(x, minus_one, tmp) \
1065 vpcmpeqq minus_one, x, tmp; \
1066 vpsubq minus_one, x, x; \
1067 vpslldq $8, tmp, tmp; \
1070 #define add2_le128(x, minus_one, minus_two, tmp1, tmp2) \
1071 vpcmpeqq minus_one, x, tmp1; \
1072 vpcmpeqq minus_two, x, tmp2; \
1073 vpsubq minus_two, x, x; \
1074 vpor tmp2, tmp1, tmp1; \
1075 vpslldq $8, tmp1, tmp1; \
1078 ENTRY(camellia_ctr_32way)
1081 * %rsi: dst (32 blocks)
1082 * %rdx: src (32 blocks)
1083 * %rcx: iv (little endian, 128bit)
1093 /* dst can be used as temporary storage, src is not overwritten. */
1098 subq $(16 * 32), %rsp;
1102 vpcmpeqd %ymm15, %ymm15, %ymm15;
1103 vpsrldq $8, %ymm15, %ymm15; /* ab: -1:0 ; cd: -1:0 */
1104 vpaddq %ymm15, %ymm15, %ymm12; /* ab: -2:0 ; cd: -2:0 */
1106 /* load IV and byteswap */
1107 vmovdqu (%rcx), %xmm0;
1108 vmovdqa %xmm0, %xmm1;
1109 inc_le128(%xmm0, %xmm15, %xmm14);
1110 vbroadcasti128 .Lbswap128_mask, %ymm14;
1111 vinserti128 $1, %xmm0, %ymm1, %ymm0;
1112 vpshufb %ymm14, %ymm0, %ymm13;
1113 vmovdqu %ymm13, 15 * 32(%rax);
1116 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); /* ab:le2 ; cd:le3 */
1117 vpshufb %ymm14, %ymm0, %ymm13;
1118 vmovdqu %ymm13, 14 * 32(%rax);
1119 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1120 vpshufb %ymm14, %ymm0, %ymm13;
1121 vmovdqu %ymm13, 13 * 32(%rax);
1122 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1123 vpshufb %ymm14, %ymm0, %ymm13;
1124 vmovdqu %ymm13, 12 * 32(%rax);
1125 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1126 vpshufb %ymm14, %ymm0, %ymm13;
1127 vmovdqu %ymm13, 11 * 32(%rax);
1128 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1129 vpshufb %ymm14, %ymm0, %ymm10;
1130 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1131 vpshufb %ymm14, %ymm0, %ymm9;
1132 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1133 vpshufb %ymm14, %ymm0, %ymm8;
1134 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1135 vpshufb %ymm14, %ymm0, %ymm7;
1136 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1137 vpshufb %ymm14, %ymm0, %ymm6;
1138 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1139 vpshufb %ymm14, %ymm0, %ymm5;
1140 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1141 vpshufb %ymm14, %ymm0, %ymm4;
1142 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1143 vpshufb %ymm14, %ymm0, %ymm3;
1144 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1145 vpshufb %ymm14, %ymm0, %ymm2;
1146 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1147 vpshufb %ymm14, %ymm0, %ymm1;
1148 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1149 vextracti128 $1, %ymm0, %xmm13;
1150 vpshufb %ymm14, %ymm0, %ymm0;
1151 inc_le128(%xmm13, %xmm15, %xmm14);
1152 vmovdqu %xmm13, (%rcx);
1155 vpbroadcastq (key_table)(CTX), %ymm15;
1156 vpshufb .Lpack_bswap, %ymm15, %ymm15;
1157 vpxor %ymm0, %ymm15, %ymm0;
1158 vpxor %ymm1, %ymm15, %ymm1;
1159 vpxor %ymm2, %ymm15, %ymm2;
1160 vpxor %ymm3, %ymm15, %ymm3;
1161 vpxor %ymm4, %ymm15, %ymm4;
1162 vpxor %ymm5, %ymm15, %ymm5;
1163 vpxor %ymm6, %ymm15, %ymm6;
1164 vpxor %ymm7, %ymm15, %ymm7;
1165 vpxor %ymm8, %ymm15, %ymm8;
1166 vpxor %ymm9, %ymm15, %ymm9;
1167 vpxor %ymm10, %ymm15, %ymm10;
1168 vpxor 11 * 32(%rax), %ymm15, %ymm11;
1169 vpxor 12 * 32(%rax), %ymm15, %ymm12;
1170 vpxor 13 * 32(%rax), %ymm15, %ymm13;
1171 vpxor 14 * 32(%rax), %ymm15, %ymm14;
1172 vpxor 15 * 32(%rax), %ymm15, %ymm15;
1174 call __camellia_enc_blk32;
1178 vpxor 0 * 32(%rdx), %ymm7, %ymm7;
1179 vpxor 1 * 32(%rdx), %ymm6, %ymm6;
1180 vpxor 2 * 32(%rdx), %ymm5, %ymm5;
1181 vpxor 3 * 32(%rdx), %ymm4, %ymm4;
1182 vpxor 4 * 32(%rdx), %ymm3, %ymm3;
1183 vpxor 5 * 32(%rdx), %ymm2, %ymm2;
1184 vpxor 6 * 32(%rdx), %ymm1, %ymm1;
1185 vpxor 7 * 32(%rdx), %ymm0, %ymm0;
1186 vpxor 8 * 32(%rdx), %ymm15, %ymm15;
1187 vpxor 9 * 32(%rdx), %ymm14, %ymm14;
1188 vpxor 10 * 32(%rdx), %ymm13, %ymm13;
1189 vpxor 11 * 32(%rdx), %ymm12, %ymm12;
1190 vpxor 12 * 32(%rdx), %ymm11, %ymm11;
1191 vpxor 13 * 32(%rdx), %ymm10, %ymm10;
1192 vpxor 14 * 32(%rdx), %ymm9, %ymm9;
1193 vpxor 15 * 32(%rdx), %ymm8, %ymm8;
1194 write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
1195 %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
1202 ENDPROC(camellia_ctr_32way)
1204 #define gf128mul_x_ble(iv, mask, tmp) \
1205 vpsrad $31, iv, tmp; \
1206 vpaddq iv, iv, iv; \
1207 vpshufd $0x13, tmp, tmp; \
1208 vpand mask, tmp, tmp; \
1211 #define gf128mul_x2_ble(iv, mask1, mask2, tmp0, tmp1) \
1212 vpsrad $31, iv, tmp0; \
1213 vpaddq iv, iv, tmp1; \
1214 vpsllq $2, iv, iv; \
1215 vpshufd $0x13, tmp0, tmp0; \
1216 vpsrad $31, tmp1, tmp1; \
1217 vpand mask2, tmp0, tmp0; \
1218 vpshufd $0x13, tmp1, tmp1; \
1219 vpxor tmp0, iv, iv; \
1220 vpand mask1, tmp1, tmp1; \
1224 camellia_xts_crypt_32way:
1227 * %rsi: dst (32 blocks)
1228 * %rdx: src (32 blocks)
1229 * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
1230 * %r8: index for input whitening key
1231 * %r9: pointer to __camellia_enc_blk32 or __camellia_dec_blk32
1237 subq $(16 * 32), %rsp;
1240 vbroadcasti128 .Lxts_gf128mul_and_shl1_mask_0, %ymm12;
1242 /* load IV and construct second IV */
1243 vmovdqu (%rcx), %xmm0;
1244 vmovdqa %xmm0, %xmm15;
1245 gf128mul_x_ble(%xmm0, %xmm12, %xmm13);
1246 vbroadcasti128 .Lxts_gf128mul_and_shl1_mask_1, %ymm13;
1247 vinserti128 $1, %xmm0, %ymm15, %ymm0;
1248 vpxor 0 * 32(%rdx), %ymm0, %ymm15;
1249 vmovdqu %ymm15, 15 * 32(%rax);
1250 vmovdqu %ymm0, 0 * 32(%rsi);
1253 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1254 vpxor 1 * 32(%rdx), %ymm0, %ymm15;
1255 vmovdqu %ymm15, 14 * 32(%rax);
1256 vmovdqu %ymm0, 1 * 32(%rsi);
1258 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1259 vpxor 2 * 32(%rdx), %ymm0, %ymm15;
1260 vmovdqu %ymm15, 13 * 32(%rax);
1261 vmovdqu %ymm0, 2 * 32(%rsi);
1263 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1264 vpxor 3 * 32(%rdx), %ymm0, %ymm15;
1265 vmovdqu %ymm15, 12 * 32(%rax);
1266 vmovdqu %ymm0, 3 * 32(%rsi);
1268 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1269 vpxor 4 * 32(%rdx), %ymm0, %ymm11;
1270 vmovdqu %ymm0, 4 * 32(%rsi);
1272 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1273 vpxor 5 * 32(%rdx), %ymm0, %ymm10;
1274 vmovdqu %ymm0, 5 * 32(%rsi);
1276 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1277 vpxor 6 * 32(%rdx), %ymm0, %ymm9;
1278 vmovdqu %ymm0, 6 * 32(%rsi);
1280 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1281 vpxor 7 * 32(%rdx), %ymm0, %ymm8;
1282 vmovdqu %ymm0, 7 * 32(%rsi);
1284 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1285 vpxor 8 * 32(%rdx), %ymm0, %ymm7;
1286 vmovdqu %ymm0, 8 * 32(%rsi);
1288 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1289 vpxor 9 * 32(%rdx), %ymm0, %ymm6;
1290 vmovdqu %ymm0, 9 * 32(%rsi);
1292 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1293 vpxor 10 * 32(%rdx), %ymm0, %ymm5;
1294 vmovdqu %ymm0, 10 * 32(%rsi);
1296 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1297 vpxor 11 * 32(%rdx), %ymm0, %ymm4;
1298 vmovdqu %ymm0, 11 * 32(%rsi);
1300 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1301 vpxor 12 * 32(%rdx), %ymm0, %ymm3;
1302 vmovdqu %ymm0, 12 * 32(%rsi);
1304 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1305 vpxor 13 * 32(%rdx), %ymm0, %ymm2;
1306 vmovdqu %ymm0, 13 * 32(%rsi);
1308 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1309 vpxor 14 * 32(%rdx), %ymm0, %ymm1;
1310 vmovdqu %ymm0, 14 * 32(%rsi);
1312 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1313 vpxor 15 * 32(%rdx), %ymm0, %ymm15;
1314 vmovdqu %ymm15, 0 * 32(%rax);
1315 vmovdqu %ymm0, 15 * 32(%rsi);
1317 vextracti128 $1, %ymm0, %xmm0;
1318 gf128mul_x_ble(%xmm0, %xmm12, %xmm15);
1319 vmovdqu %xmm0, (%rcx);
1322 vpbroadcastq (key_table)(CTX, %r8, 8), %ymm15;
1323 vpshufb .Lpack_bswap, %ymm15, %ymm15;
1324 vpxor 0 * 32(%rax), %ymm15, %ymm0;
1325 vpxor %ymm1, %ymm15, %ymm1;
1326 vpxor %ymm2, %ymm15, %ymm2;
1327 vpxor %ymm3, %ymm15, %ymm3;
1328 vpxor %ymm4, %ymm15, %ymm4;
1329 vpxor %ymm5, %ymm15, %ymm5;
1330 vpxor %ymm6, %ymm15, %ymm6;
1331 vpxor %ymm7, %ymm15, %ymm7;
1332 vpxor %ymm8, %ymm15, %ymm8;
1333 vpxor %ymm9, %ymm15, %ymm9;
1334 vpxor %ymm10, %ymm15, %ymm10;
1335 vpxor %ymm11, %ymm15, %ymm11;
1336 vpxor 12 * 32(%rax), %ymm15, %ymm12;
1337 vpxor 13 * 32(%rax), %ymm15, %ymm13;
1338 vpxor 14 * 32(%rax), %ymm15, %ymm14;
1339 vpxor 15 * 32(%rax), %ymm15, %ymm15;
1343 addq $(16 * 32), %rsp;
1345 vpxor 0 * 32(%rsi), %ymm7, %ymm7;
1346 vpxor 1 * 32(%rsi), %ymm6, %ymm6;
1347 vpxor 2 * 32(%rsi), %ymm5, %ymm5;
1348 vpxor 3 * 32(%rsi), %ymm4, %ymm4;
1349 vpxor 4 * 32(%rsi), %ymm3, %ymm3;
1350 vpxor 5 * 32(%rsi), %ymm2, %ymm2;
1351 vpxor 6 * 32(%rsi), %ymm1, %ymm1;
1352 vpxor 7 * 32(%rsi), %ymm0, %ymm0;
1353 vpxor 8 * 32(%rsi), %ymm15, %ymm15;
1354 vpxor 9 * 32(%rsi), %ymm14, %ymm14;
1355 vpxor 10 * 32(%rsi), %ymm13, %ymm13;
1356 vpxor 11 * 32(%rsi), %ymm12, %ymm12;
1357 vpxor 12 * 32(%rsi), %ymm11, %ymm11;
1358 vpxor 13 * 32(%rsi), %ymm10, %ymm10;
1359 vpxor 14 * 32(%rsi), %ymm9, %ymm9;
1360 vpxor 15 * 32(%rsi), %ymm8, %ymm8;
1361 write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
1362 %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
1369 ENDPROC(camellia_xts_crypt_32way)
1371 ENTRY(camellia_xts_enc_32way)
1374 * %rsi: dst (32 blocks)
1375 * %rdx: src (32 blocks)
1376 * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
1379 xorl %r8d, %r8d; /* input whitening key, 0 for enc */
1381 leaq __camellia_enc_blk32, %r9;
1383 jmp camellia_xts_crypt_32way;
1384 ENDPROC(camellia_xts_enc_32way)
1386 ENTRY(camellia_xts_dec_32way)
1389 * %rsi: dst (32 blocks)
1390 * %rdx: src (32 blocks)
1391 * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
1394 cmpl $16, key_length(CTX);
1397 cmovel %eax, %r8d; /* input whitening key, last for dec */
1399 leaq __camellia_dec_blk32, %r9;
1401 jmp camellia_xts_crypt_32way;
1402 ENDPROC(camellia_xts_dec_32way)