/* SPDX-License-Identifier: GPL-2.0-or-later */
 * x86_64/AVX2/AES-NI assembler implementation of Camellia
 * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
#include <linux/linkage.h>
#define CAMELLIA_TABLE_BYTE_LEN 272
/* struct camellia_ctx: */
#define key_length CAMELLIA_TABLE_BYTE_LEN
/**********************************************************************
 **********************************************************************/
#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
	vpand x, mask4bit, tmp0; \
	vpandn x, mask4bit, x; \
	vpsrld $4, x, x; \
	\
	vpshufb tmp0, lo_t, tmp0; \
	vpshufb x, hi_t, x; \
	vpxor tmp0, x, x;
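/*
 * Illustrative C equivalent of filter_8bit() (a sketch, not part of the
 * build): an 8-bit to 8-bit affine transform evaluated as two 16-entry
 * lookups, one per nibble, which is what the vpshufb pair above does for
 * every byte of the vector in parallel.
 *
 *   static unsigned char filter_8bit_ref(unsigned char x,
 *                                        const unsigned char lo_t[16],
 *                                        const unsigned char hi_t[16])
 *   {
 *           return lo_t[x & 0x0f] ^ hi_t[x >> 4];
 *   }
 */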
/**********************************************************************
 **********************************************************************/
 * x0..x7: byte-sliced AB state
 * mem_cd: register pointer storing CD state
 * key: index for key material
 * x0..x7: new byte-sliced CD state
#define roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, t6, \
	 * S-function with AES subbytes \
	vbroadcasti128 .Linv_shift_row, t4; \
	vpbroadcastd .L0f0f0f0f, t7; \
	vbroadcasti128 .Lpre_tf_lo_s1, t5; \
	vbroadcasti128 .Lpre_tf_hi_s1, t6; \
	vbroadcasti128 .Lpre_tf_lo_s4, t2; \
	vbroadcasti128 .Lpre_tf_hi_s4, t3; \
	/* AES inverse shift rows */ \
	/* prefilter sboxes 1, 2 and 3 */ \
	/* prefilter sbox 4 */ \
	filter_8bit(x0, t5, t6, t7, t4); \
	filter_8bit(x7, t5, t6, t7, t4); \
	vextracti128 $1, x0, t0##_x; \
	vextracti128 $1, x7, t1##_x; \
	filter_8bit(x3, t2, t3, t7, t4); \
	filter_8bit(x6, t2, t3, t7, t4); \
	vextracti128 $1, x3, t3##_x; \
	vextracti128 $1, x6, t2##_x; \
	filter_8bit(x2, t5, t6, t7, t4); \
	filter_8bit(x5, t5, t6, t7, t4); \
	filter_8bit(x1, t5, t6, t7, t4); \
	filter_8bit(x4, t5, t6, t7, t4); \
	vpxor t4##_x, t4##_x, t4##_x; \
	/* AES subbytes + AES shift rows */ \
	vextracti128 $1, x2, t6##_x; \
	vextracti128 $1, x5, t5##_x; \
	vaesenclast t4##_x, x0##_x, x0##_x; \
	vaesenclast t4##_x, t0##_x, t0##_x; \
	vinserti128 $1, t0##_x, x0, x0; \
	vaesenclast t4##_x, x7##_x, x7##_x; \
	vaesenclast t4##_x, t1##_x, t1##_x; \
	vinserti128 $1, t1##_x, x7, x7; \
	vaesenclast t4##_x, x3##_x, x3##_x; \
	vaesenclast t4##_x, t3##_x, t3##_x; \
	vinserti128 $1, t3##_x, x3, x3; \
	vaesenclast t4##_x, x6##_x, x6##_x; \
	vaesenclast t4##_x, t2##_x, t2##_x; \
	vinserti128 $1, t2##_x, x6, x6; \
	vextracti128 $1, x1, t3##_x; \
	vextracti128 $1, x4, t2##_x; \
	vbroadcasti128 .Lpost_tf_lo_s1, t0; \
	vbroadcasti128 .Lpost_tf_hi_s1, t1; \
	vaesenclast t4##_x, x2##_x, x2##_x; \
	vaesenclast t4##_x, t6##_x, t6##_x; \
	vinserti128 $1, t6##_x, x2, x2; \
	vaesenclast t4##_x, x5##_x, x5##_x; \
	vaesenclast t4##_x, t5##_x, t5##_x; \
	vinserti128 $1, t5##_x, x5, x5; \
	vaesenclast t4##_x, x1##_x, x1##_x; \
	vaesenclast t4##_x, t3##_x, t3##_x; \
	vinserti128 $1, t3##_x, x1, x1; \
	vaesenclast t4##_x, x4##_x, x4##_x; \
	vaesenclast t4##_x, t2##_x, t2##_x; \
	vinserti128 $1, t2##_x, x4, x4; \
	/* postfilter sboxes 1 and 4 */ \
	vbroadcasti128 .Lpost_tf_lo_s3, t2; \
	vbroadcasti128 .Lpost_tf_hi_s3, t3; \
	filter_8bit(x0, t0, t1, t7, t6); \
	filter_8bit(x7, t0, t1, t7, t6); \
	filter_8bit(x3, t0, t1, t7, t6); \
	filter_8bit(x6, t0, t1, t7, t6); \
	/* postfilter sbox 3 */ \
	vbroadcasti128 .Lpost_tf_lo_s2, t4; \
	vbroadcasti128 .Lpost_tf_hi_s2, t5; \
	filter_8bit(x2, t2, t3, t7, t6); \
	filter_8bit(x5, t2, t3, t7, t6); \
	vpbroadcastq key, t0; /* higher 64-bit duplicate ignored */ \
	/* postfilter sbox 2 */ \
	filter_8bit(x1, t4, t5, t7, t2); \
	filter_8bit(x4, t4, t5, t7, t2); \
	vpsrldq $1, t0, t1; \
	vpsrldq $2, t0, t2; \
	vpshufb t7, t1, t1; \
	vpsrldq $3, t0, t3; \
	vpshufb t7, t2, t2; \
	vpsrldq $4, t0, t4; \
	vpshufb t7, t3, t3; \
	vpsrldq $5, t0, t5; \
	vpshufb t7, t4, t4; \
	vpsrldq $6, t0, t6; \
	vpshufb t7, t5, t5; \
	vpshufb t7, t6, t6; \
	vpxor x2, x7, x7; /* note: high and low parts swapped */ \
	/* Add key material and result to CD (x becomes new CD) */ \
	vpxor 5 * 32(mem_cd), x1, x1; \
	vpsrldq $7, t0, t6; \
	vpshufb t7, t0, t0; \
	vpshufb t7, t6, t7; \
	vpxor 4 * 32(mem_cd), x0, x0; \
	vpxor 6 * 32(mem_cd), x2, x2; \
	vpxor 7 * 32(mem_cd), x3, x3; \
	vpxor 0 * 32(mem_cd), x4, x4; \
	vpxor 1 * 32(mem_cd), x5, x5; \
	vpxor 2 * 32(mem_cd), x6, x6; \
	vpxor 3 * 32(mem_cd), x7, x7;
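/*
 * Note on the S-layer above (descriptive, not from the original comments):
 * AESENCLAST with an all-zero round key computes ShiftRows(SubBytes(state)),
 * so pre-shuffling each 128-bit lane with .Linv_shift_row cancels the
 * ShiftRows step and leaves 16 parallel AES SubBytes lookups per lane.  The
 * filter_8bit() calls before and after are byte-wise affine maps between
 * Camellia's GF(2^8) representation and the AES one, so the whole chain
 * evaluates Camellia's s-boxes without any table lookups in memory:
 *
 *   camellia_sbox(x) == postfilter(aes_sbox(prefilter(x)))
 */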
 * Size optimization... with inlined roundsm32 the binary would be over 5 times
 * larger and only marginally faster.
SYM_FUNC_START_LOCAL(roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd)
	roundsm32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		  %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15,
SYM_FUNC_END(roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd)
SYM_FUNC_START_LOCAL(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
	roundsm32(%ymm4, %ymm5, %ymm6, %ymm7, %ymm0, %ymm1, %ymm2, %ymm3,
		  %ymm12, %ymm13, %ymm14, %ymm15, %ymm8, %ymm9, %ymm10, %ymm11,
SYM_FUNC_END(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
 * x0..x7: byte-sliced AB state preloaded
 * mem_ab: byte-sliced AB state in memory
 * mem_cd: byte-sliced CD state in memory
#define two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, i, dir, store_ab) \
	leaq (key_table + (i) * 8)(CTX), %r9; \
	call roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd; \
	vmovdqu x0, 4 * 32(mem_cd); \
	vmovdqu x1, 5 * 32(mem_cd); \
	vmovdqu x2, 6 * 32(mem_cd); \
	vmovdqu x3, 7 * 32(mem_cd); \
	vmovdqu x4, 0 * 32(mem_cd); \
	vmovdqu x5, 1 * 32(mem_cd); \
	vmovdqu x6, 2 * 32(mem_cd); \
	vmovdqu x7, 3 * 32(mem_cd); \
	leaq (key_table + ((i) + (dir)) * 8)(CTX), %r9; \
	call roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab; \
	store_ab(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab);
#define dummy_store(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) /* do nothing */
#define store_ab_state(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) \
	/* Store new AB state */ \
	vmovdqu x4, 4 * 32(mem_ab); \
	vmovdqu x5, 5 * 32(mem_ab); \
	vmovdqu x6, 6 * 32(mem_ab); \
	vmovdqu x7, 7 * 32(mem_ab); \
	vmovdqu x0, 0 * 32(mem_ab); \
	vmovdqu x1, 1 * 32(mem_ab); \
	vmovdqu x2, 2 * 32(mem_ab); \
	vmovdqu x3, 3 * 32(mem_ab);
#define enc_rounds32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		     y6, y7, mem_ab, mem_cd, i) \
	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 2, 1, store_ab_state); \
	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 4, 1, store_ab_state); \
	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 6, 1, dummy_store);
#define dec_rounds32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		     y6, y7, mem_ab, mem_cd, i) \
	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \
	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \
	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store);
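/*
 * Subkey-index walk (illustrative pseudocode, not in the original file;
 * 'pair' stands for one two_roundsm32 invocation): each pair consumes two
 * adjacent key_table entries, and 'dir' selects whether the second
 * half-round's index moves up (+1, encryption) or down (-1, decryption):
 *
 *   enc_rounds32(..., i):
 *           pair(i + 2, i + 3); pair(i + 4, i + 5); pair(i + 6, i + 7);
 *   dec_rounds32(..., i):
 *           pair(i + 7, i + 6); pair(i + 5, i + 4); pair(i + 3, i + 2);
 */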
 * v0..3: byte-sliced 32-bit integers
#define rol32_1_32(v0, v1, v2, v3, t0, t1, t2, zero) \
	vpcmpgtb v0, zero, t0; \
	vpcmpgtb v1, zero, t1; \
	vpcmpgtb v2, zero, t2; \
	vpcmpgtb v3, zero, t0; \
 * l: byte-sliced AB state in memory
 * r: byte-sliced CD state in memory
 * x0..x7: new byte-sliced CD state
#define fls32(l, l0, l1, l2, l3, l4, l5, l6, l7, r, t0, t1, t2, t3, tt0, \
	      tt1, tt2, tt3, kll, klr, krl, krr) \
	 * lr ^= rol32(t0, 1); \
	vpbroadcastd kll, t0; /* only lowest 32-bit used */ \
	vpxor tt0, tt0, tt0; \
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
	vpbroadcastd krr, t0; /* only lowest 32-bit used */ \
	vmovdqu l4, 4 * 32(l); \
	vmovdqu l5, 5 * 32(l); \
	vmovdqu l6, 6 * 32(l); \
	vmovdqu l7, 7 * 32(l); \
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	vpor 4 * 32(r), t0, t0; \
	vpor 5 * 32(r), t1, t1; \
	vpor 6 * 32(r), t2, t2; \
	vpor 7 * 32(r), t3, t3; \
	vpxor 0 * 32(r), t0, t0; \
	vpxor 1 * 32(r), t1, t1; \
	vpxor 2 * 32(r), t2, t2; \
	vpxor 3 * 32(r), t3, t3; \
	vmovdqu t0, 0 * 32(r); \
	vpbroadcastd krl, t0; /* only lowest 32-bit used */ \
	vmovdqu t1, 1 * 32(r); \
	vmovdqu t2, 2 * 32(r); \
	vmovdqu t3, 3 * 32(r); \
	 * rr ^= rol32(t2, 1); \
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	vpand 0 * 32(r), t0, t0; \
	vpand 1 * 32(r), t1, t1; \
	vpand 2 * 32(r), t2, t2; \
	vpand 3 * 32(r), t3, t3; \
	rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
	vpxor 4 * 32(r), t0, t0; \
	vpxor 5 * 32(r), t1, t1; \
	vpxor 6 * 32(r), t2, t2; \
	vpxor 7 * 32(r), t3, t3; \
	vmovdqu t0, 4 * 32(r); \
	vpbroadcastd klr, t0; /* only lowest 32-bit used */ \
	vmovdqu t1, 5 * 32(r); \
	vmovdqu t2, 6 * 32(r); \
	vmovdqu t3, 7 * 32(r); \
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	vmovdqu l0, 0 * 32(l); \
	vmovdqu l1, 1 * 32(l); \
	vmovdqu l2, 2 * 32(l); \
	vmovdqu l3, 3 * 32(l);
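/*
 * Reference sketch of fls32() per block (C-like, illustrative only, not
 * part of the build): it applies Camellia's FL to the left half and FL^-1
 * to the right half of all 32 blocks at once.  With 32-bit words ll, lr
 * (left half), rl, rr (right half) and subkey words kll, klr, krl, krr:
 *
 *   lr ^= rol32(ll & kll, 1);      FL, first step
 *   ll ^= (lr | klr);              FL, second step
 *   rl ^= (rr | krr);              FL^-1, first step
 *   rr ^= rol32(rl & krl, 1);      FL^-1, second step
 *
 * (the assembly interleaves the four steps across the two halves).
 * rol32_1_32() performs the rol32(..., 1) on byte-sliced words: vpcmpgtb
 * against zero marks bytes whose top bit is set, and that mark is reduced
 * to a single-bit carry which is OR'ed into the neighbouring byte position
 * after each byte has been doubled (shifted left by one).
 */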
#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
	vpunpckhdq x1, x0, t2; \
	vpunpckldq x1, x0, x0; \
	vpunpckldq x3, x2, t1; \
	vpunpckhdq x3, x2, x2; \
	vpunpckhqdq t1, x0, x1; \
	vpunpcklqdq t1, x0, x0; \
	vpunpckhqdq x2, t2, x3; \
	vpunpcklqdq x2, t2, x2;
#define byteslice_16x16b_fast(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, \
			      a3, b3, c3, d3, st0, st1) \
	transpose_4x4(a0, a1, a2, a3, d2, d3); \
	transpose_4x4(b0, b1, b2, b3, d2, d3); \
	transpose_4x4(c0, c1, c2, c3, a0, a1); \
	transpose_4x4(d0, d1, d2, d3, a0, a1); \
	vbroadcasti128 .Lshufb_16x16b, a0; \
	vpshufb a0, a2, a2; \
	vpshufb a0, a3, a3; \
	vpshufb a0, b0, b0; \
	vpshufb a0, b1, b1; \
	vpshufb a0, b2, b2; \
	vpshufb a0, b3, b3; \
	vpshufb a0, a1, a1; \
	vpshufb a0, c0, c0; \
	vpshufb a0, c1, c1; \
	vpshufb a0, c2, c2; \
	vpshufb a0, c3, c3; \
	vpshufb a0, d0, d0; \
	vpshufb a0, d1, d1; \
	vpshufb a0, d2, d2; \
	vpshufb a0, d3, d3; \
	vpshufb a0, d3, a0; \
	transpose_4x4(a0, b0, c0, d0, d2, d3); \
	transpose_4x4(a1, b1, c1, d1, d2, d3); \
	transpose_4x4(a2, b2, c2, d2, b0, b1); \
	transpose_4x4(a3, b3, c3, d3, b0, b1); \
	/* does not adjust output bytes inside vectors */
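/*
 * Layout note (illustrative, not from the original comments): byteslicing
 * transposes the data so that register n holds byte n of every block
 * (16 blocks per 128-bit lane, 32 in total), letting one SIMD instruction
 * apply the same s-box/filter step to that byte position of all blocks at
 * once.  Roughly:
 *
 *   before:  reg[b] = the 16 bytes of block b          (per lane)
 *   after:   reg[n] = byte n of each of the 16 blocks  (per lane)
 *
 * It is assembled from the 4x4 dword transposes above plus the
 * .Lshufb_16x16b byte shuffle.
 */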
/* load blocks to registers and apply pre-whitening */
#define inpack32_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
	vpbroadcastq key, x0; \
	vpshufb .Lpack_bswap, x0, x0; \
	vpxor 0 * 32(rio), x0, y7; \
	vpxor 1 * 32(rio), x0, y6; \
	vpxor 2 * 32(rio), x0, y5; \
	vpxor 3 * 32(rio), x0, y4; \
	vpxor 4 * 32(rio), x0, y3; \
	vpxor 5 * 32(rio), x0, y2; \
	vpxor 6 * 32(rio), x0, y1; \
	vpxor 7 * 32(rio), x0, y0; \
	vpxor 8 * 32(rio), x0, x7; \
	vpxor 9 * 32(rio), x0, x6; \
	vpxor 10 * 32(rio), x0, x5; \
	vpxor 11 * 32(rio), x0, x4; \
	vpxor 12 * 32(rio), x0, x3; \
	vpxor 13 * 32(rio), x0, x2; \
	vpxor 14 * 32(rio), x0, x1; \
	vpxor 15 * 32(rio), x0, x0;
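/*
 * Illustrative C view of inpack32_pre (a sketch, not part of the build):
 * build one 128-bit mask from the 64-bit whitening word ('key' is
 * reordered by .Lpack_bswap; the 0x80 indexes zero the other half of the
 * mask) and XOR the same mask into all 32 input blocks while loading them:
 *
 *   for (b = 0; b < 32; b++)
 *           for (i = 0; i < 16; i++)
 *                   state[b][i] = input[b][i] ^ mask[i];
 */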
/* byteslice pre-whitened blocks and store to temporary memory */
#define inpack32_post(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd) \
	byteslice_16x16b_fast(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, \
			      y4, y5, y6, y7, (mem_ab), (mem_cd)); \
	vmovdqu x0, 0 * 32(mem_ab); \
	vmovdqu x1, 1 * 32(mem_ab); \
	vmovdqu x2, 2 * 32(mem_ab); \
	vmovdqu x3, 3 * 32(mem_ab); \
	vmovdqu x4, 4 * 32(mem_ab); \
	vmovdqu x5, 5 * 32(mem_ab); \
	vmovdqu x6, 6 * 32(mem_ab); \
	vmovdqu x7, 7 * 32(mem_ab); \
	vmovdqu y0, 0 * 32(mem_cd); \
	vmovdqu y1, 1 * 32(mem_cd); \
	vmovdqu y2, 2 * 32(mem_cd); \
	vmovdqu y3, 3 * 32(mem_cd); \
	vmovdqu y4, 4 * 32(mem_cd); \
	vmovdqu y5, 5 * 32(mem_cd); \
	vmovdqu y6, 6 * 32(mem_cd); \
	vmovdqu y7, 7 * 32(mem_cd);
/* de-byteslice, apply post-whitening and store blocks */
#define outunpack32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
		    y5, y6, y7, key, stack_tmp0, stack_tmp1) \
	byteslice_16x16b_fast(y0, y4, x0, x4, y1, y5, x1, x5, y2, y6, x2, x6, \
			      y3, y7, x3, x7, stack_tmp0, stack_tmp1); \
	vmovdqu x0, stack_tmp0; \
	vpbroadcastq key, x0; \
	vpshufb .Lpack_bswap, x0, x0; \
	vpxor stack_tmp0, x0, x0;
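/*
 * Descriptive note (not from the original comments): outunpack32 mirrors
 * inpack32_pre/inpack32_post in reverse: de-byteslice the state, then XOR
 * the broadcast output-whitening word (built from 'key' exactly as in
 * inpack32_pre) into every block before write_output stores them.
 * stack_tmp0/stack_tmp1 are the two spill slots passed in by the callers.
 */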
#define write_output(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
	vmovdqu x0, 0 * 32(rio); \
	vmovdqu x1, 1 * 32(rio); \
	vmovdqu x2, 2 * 32(rio); \
	vmovdqu x3, 3 * 32(rio); \
	vmovdqu x4, 4 * 32(rio); \
	vmovdqu x5, 5 * 32(rio); \
	vmovdqu x6, 6 * 32(rio); \
	vmovdqu x7, 7 * 32(rio); \
	vmovdqu y0, 8 * 32(rio); \
	vmovdqu y1, 9 * 32(rio); \
	vmovdqu y2, 10 * 32(rio); \
	vmovdqu y3, 11 * 32(rio); \
	vmovdqu y4, 12 * 32(rio); \
	vmovdqu y5, 13 * 32(rio); \
	vmovdqu y6, 14 * 32(rio); \
	vmovdqu y7, 15 * 32(rio);
.section .rodata.cst32.shufb_16x16b, "aM", @progbits, 32
#define SHUFB_BYTES(idx) \
	0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
.section .rodata.cst32.pack_bswap, "aM", @progbits, 32
	.long 0x00010203, 0x04050607, 0x80808080, 0x80808080
	.long 0x00010203, 0x04050607, 0x80808080, 0x80808080
/* NB: section is mergeable, all elements must be aligned 16-byte blocks */
.section .rodata.cst16, "aM", @progbits, 16
 * pre-SubByte transform
 * pre-lookup for sbox1, sbox2, sbox3:
 * swap_bitendianness(
 * isom_map_camellia_to_aes(
 * swap_bitendianness(in)
 * (note: '⊕ 0xc5' inside camellia_f())
	.byte 0x45, 0xe8, 0x40, 0xed, 0x2e, 0x83, 0x2b, 0x86
	.byte 0x4b, 0xe6, 0x4e, 0xe3, 0x20, 0x8d, 0x25, 0x88
	.byte 0x00, 0x51, 0xf1, 0xa0, 0x8a, 0xdb, 0x7b, 0x2a
	.byte 0x09, 0x58, 0xf8, 0xa9, 0x83, 0xd2, 0x72, 0x23
 * pre-SubByte transform
 * pre-lookup for sbox4:
 * swap_bitendianness(
 * isom_map_camellia_to_aes(
 * swap_bitendianness(in <<< 1)
 * (note: '⊕ 0xc5' inside camellia_f())
	.byte 0x45, 0x40, 0x2e, 0x2b, 0x4b, 0x4e, 0x20, 0x25
	.byte 0x14, 0x11, 0x7f, 0x7a, 0x1a, 0x1f, 0x71, 0x74
	.byte 0x00, 0xf1, 0x8a, 0x7b, 0x09, 0xf8, 0x83, 0x72
	.byte 0xad, 0x5c, 0x27, 0xd6, 0xa4, 0x55, 0x2e, 0xdf
 * post-SubByte transform
 * post-lookup for sbox1, sbox4:
 * swap_bitendianness(
 * isom_map_aes_to_camellia(
 * swap_bitendianness(
 * aes_inverse_affine_transform(in)
 * (note: '⊕ 0x6e' inside camellia_h())
	.byte 0x3c, 0xcc, 0xcf, 0x3f, 0x32, 0xc2, 0xc1, 0x31
	.byte 0xdc, 0x2c, 0x2f, 0xdf, 0xd2, 0x22, 0x21, 0xd1
	.byte 0x00, 0xf9, 0x86, 0x7f, 0xd7, 0x2e, 0x51, 0xa8
	.byte 0xa4, 0x5d, 0x22, 0xdb, 0x73, 0x8a, 0xf5, 0x0c
 * post-SubByte transform
 * post-lookup for sbox2:
 * swap_bitendianness(
 * isom_map_aes_to_camellia(
 * swap_bitendianness(
 * aes_inverse_affine_transform(in)
 * (note: '⊕ 0x6e' inside camellia_h())
	.byte 0x78, 0x99, 0x9f, 0x7e, 0x64, 0x85, 0x83, 0x62
	.byte 0xb9, 0x58, 0x5e, 0xbf, 0xa5, 0x44, 0x42, 0xa3
	.byte 0x00, 0xf3, 0x0d, 0xfe, 0xaf, 0x5c, 0xa2, 0x51
	.byte 0x49, 0xba, 0x44, 0xb7, 0xe6, 0x15, 0xeb, 0x18
 * post-SubByte transform
 * post-lookup for sbox3:
 * swap_bitendianness(
 * isom_map_aes_to_camellia(
 * swap_bitendianness(
 * aes_inverse_affine_transform(in)
 * (note: '⊕ 0x6e' inside camellia_h())
	.byte 0x1e, 0x66, 0xe7, 0x9f, 0x19, 0x61, 0xe0, 0x98
	.byte 0x6e, 0x16, 0x97, 0xef, 0x69, 0x11, 0x90, 0xe8
	.byte 0x00, 0xfc, 0x43, 0xbf, 0xeb, 0x17, 0xa8, 0x54
	.byte 0x52, 0xae, 0x11, 0xed, 0xb9, 0x45, 0xfa, 0x06
/* For isolating SubBytes from AESENCLAST, inverse shift row */
	.byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
	.byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
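/*
 * Illustrative composition (C-like sketch; helper names are descriptive
 * only, not from this file): how the tables above build Camellia's four
 * s-boxes out of the single AES S-box.
 *
 *   s1(x) = post_s1(aes_sbox(pre_s1(x)))
 *   s2(x) = rol8(s1(x), 1)        rotation folded into post_tf_s2
 *   s3(x) = ror8(s1(x), 1)        rotation folded into post_tf_s3
 *   s4(x) = s1(rol8(x, 1))        rotation folded into pre_tf_s4
 *
 * pre_s1/post_s1 are the affine maps described in the comments above
 * (bit-endianness swaps plus the GF(2^8) isomorphism and the 0xc5/0x6e
 * constants).
 */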
.section .rodata.cst4.L0f0f0f0f, "aM", @progbits, 4
SYM_FUNC_START_LOCAL(__camellia_enc_blk32)
	 * %rax: temporary storage, 512 bytes
	 * %ymm0..%ymm15: 32 plaintext blocks
	 * %ymm0..%ymm15: 32 encrypted blocks, order swapped:
	 * 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
	leaq 8 * 32(%rax), %rcx;
	inpack32_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		      %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
	enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %rcx, 0);
	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
	      %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
	      ((key_table + (8) * 8) + 0)(CTX),
	      ((key_table + (8) * 8) + 4)(CTX),
	      ((key_table + (8) * 8) + 8)(CTX),
	      ((key_table + (8) * 8) + 12)(CTX));
	enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %rcx, 8);
	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
	      %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
	      ((key_table + (16) * 8) + 0)(CTX),
	      ((key_table + (16) * 8) + 4)(CTX),
	      ((key_table + (16) * 8) + 8)(CTX),
	      ((key_table + (16) * 8) + 12)(CTX));
	enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %rcx, 16);
	cmpl $16, key_length(CTX);
	/* load CD for output */
	vmovdqu 0 * 32(%rcx), %ymm8;
	vmovdqu 1 * 32(%rcx), %ymm9;
	vmovdqu 2 * 32(%rcx), %ymm10;
	vmovdqu 3 * 32(%rcx), %ymm11;
	vmovdqu 4 * 32(%rcx), %ymm12;
	vmovdqu 5 * 32(%rcx), %ymm13;
	vmovdqu 6 * 32(%rcx), %ymm14;
	vmovdqu 7 * 32(%rcx), %ymm15;
	outunpack32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		    %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		    %ymm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 32(%rax));
	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
	      %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
	      ((key_table + (24) * 8) + 0)(CTX),
	      ((key_table + (24) * 8) + 4)(CTX),
	      ((key_table + (24) * 8) + 8)(CTX),
	      ((key_table + (24) * 8) + 12)(CTX));
	enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %rcx, 24);
SYM_FUNC_END(__camellia_enc_blk32)
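/*
 * Round structure of __camellia_enc_blk32 (illustrative pseudocode, not
 * part of the original file): 6 rounds, FL/FL^-1, 6 rounds, FL/FL^-1,
 * 6 rounds for 128-bit keys; 192/256-bit keys take the longer path with
 * one more FL/FL^-1 layer and 6 more rounds before output whitening:
 *
 *   enc_rounds32(state, 0);
 *   fls32(state, subkey words at key_table[8..9]);
 *   enc_rounds32(state, 8);
 *   fls32(state, subkey words at key_table[16..17]);
 *   enc_rounds32(state, 16);
 *   if (key_length != 16) {
 *           fls32(state, subkey words at key_table[24..25]);
 *           enc_rounds32(state, 24);
 *   }
 *   outunpack32(state, key_table[24 or 32]);    output whitening, %r8 selects
 */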
SYM_FUNC_START_LOCAL(__camellia_dec_blk32)
	 * %rax: temporary storage, 512 bytes
	 * %r8d: 24 for 16 byte key, 32 for larger
	 * %ymm0..%ymm15: 32 encrypted blocks
	 * %ymm0..%ymm15: 32 plaintext blocks, order swapped:
	 * 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
	leaq 8 * 32(%rax), %rcx;
	inpack32_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		      %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
	dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %rcx, 16);
	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
	      %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
	      ((key_table + (16) * 8) + 8)(CTX),
	      ((key_table + (16) * 8) + 12)(CTX),
	      ((key_table + (16) * 8) + 0)(CTX),
	      ((key_table + (16) * 8) + 4)(CTX));
	dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %rcx, 8);
	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
	      %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
	      ((key_table + (8) * 8) + 8)(CTX),
	      ((key_table + (8) * 8) + 12)(CTX),
	      ((key_table + (8) * 8) + 0)(CTX),
	      ((key_table + (8) * 8) + 4)(CTX));
	dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %rcx, 0);
	/* load CD for output */
	vmovdqu 0 * 32(%rcx), %ymm8;
	vmovdqu 1 * 32(%rcx), %ymm9;
	vmovdqu 2 * 32(%rcx), %ymm10;
	vmovdqu 3 * 32(%rcx), %ymm11;
	vmovdqu 4 * 32(%rcx), %ymm12;
	vmovdqu 5 * 32(%rcx), %ymm13;
	vmovdqu 6 * 32(%rcx), %ymm14;
	vmovdqu 7 * 32(%rcx), %ymm15;
	outunpack32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		    %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		    %ymm15, (key_table)(CTX), (%rax), 1 * 32(%rax));
	dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %rcx, 24);
	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
	      %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
	      ((key_table + (24) * 8) + 8)(CTX),
	      ((key_table + (24) * 8) + 12)(CTX),
	      ((key_table + (24) * 8) + 0)(CTX),
	      ((key_table + (24) * 8) + 4)(CTX));
SYM_FUNC_END(__camellia_dec_blk32)
SYM_FUNC_START(camellia_ecb_enc_32way)
	 * %rsi: dst (32 blocks)
	 * %rdx: src (32 blocks)
	inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rdx, (key_table)(CTX));
	/* now dst can be used as temporary buffer (even in src == dst case) */
	call __camellia_enc_blk32;
	write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
		     %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
SYM_FUNC_END(camellia_ecb_enc_32way)
SYM_FUNC_START(camellia_ecb_dec_32way)
	 * %rsi: dst (32 blocks)
	 * %rdx: src (32 blocks)
	cmpl $16, key_length(CTX);
	cmovel %eax, %r8d; /* max */
	inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rdx, (key_table)(CTX, %r8, 8));
	/* now dst can be used as temporary buffer (even in src == dst case) */
	call __camellia_dec_blk32;
	write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
		     %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
SYM_FUNC_END(camellia_ecb_dec_32way)
SYM_FUNC_START(camellia_cbc_dec_32way)
	 * %rsi: dst (32 blocks)
	 * %rdx: src (32 blocks)
	subq $(16 * 32), %rsp;
	cmpl $16, key_length(CTX);
	cmovel %eax, %r8d; /* max */
	inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rdx, (key_table)(CTX, %r8, 8));
	je .Lcbc_dec_use_stack;
	/* dst can be used as temporary storage, src is not overwritten. */
	jmp .Lcbc_dec_continue;
.Lcbc_dec_use_stack:
	 * dst still in-use (because dst == src), so use stack for temporary
	call __camellia_dec_blk32;
	vmovdqu %ymm7, (%rax);
	vpxor %ymm7, %ymm7, %ymm7;
	vinserti128 $1, (%rdx), %ymm7, %ymm7;
	vpxor (%rax), %ymm7, %ymm7;
	vpxor (0 * 32 + 16)(%rdx), %ymm6, %ymm6;
	vpxor (1 * 32 + 16)(%rdx), %ymm5, %ymm5;
	vpxor (2 * 32 + 16)(%rdx), %ymm4, %ymm4;
	vpxor (3 * 32 + 16)(%rdx), %ymm3, %ymm3;
	vpxor (4 * 32 + 16)(%rdx), %ymm2, %ymm2;
	vpxor (5 * 32 + 16)(%rdx), %ymm1, %ymm1;
	vpxor (6 * 32 + 16)(%rdx), %ymm0, %ymm0;
	vpxor (7 * 32 + 16)(%rdx), %ymm15, %ymm15;
	vpxor (8 * 32 + 16)(%rdx), %ymm14, %ymm14;
	vpxor (9 * 32 + 16)(%rdx), %ymm13, %ymm13;
	vpxor (10 * 32 + 16)(%rdx), %ymm12, %ymm12;
	vpxor (11 * 32 + 16)(%rdx), %ymm11, %ymm11;
	vpxor (12 * 32 + 16)(%rdx), %ymm10, %ymm10;
	vpxor (13 * 32 + 16)(%rdx), %ymm9, %ymm9;
	vpxor (14 * 32 + 16)(%rdx), %ymm8, %ymm8;
	write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
		     %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
	addq $(16 * 32), %rsp;
SYM_FUNC_END(camellia_cbc_dec_32way)
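/*
 * CBC-decrypt chaining done above (illustrative C, not part of the build):
 * every block except the first is XORed with the preceding ciphertext
 * block after decryption; the (n * 32 + 16)(%rdx) operands read the
 * ciphertext stream shifted back by one 16-byte block, and the
 * vinserti128 sequence builds the mask for the first register pair.  The
 * IV XOR for the very first block is left to the caller.
 *
 *   dst[0] = dec[0];                        caller applies the IV
 *   for (i = 1; i < 32; i++)
 *           dst[i] = dec[i] ^ src[i - 1];   src = ciphertext blocks
 */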