/*
 * x86_64/AVX/AES-NI assembler implementation of Camellia
 *
 * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * Version licensed under 2-clause BSD License is available at:
 *	http://koti.mbnet.fi/axh/crypto/camellia-BSD-1.2.0-aesni1.tar.xz
 */
#include <linux/linkage.h>
#include <asm/frame.h>
#include <asm/nospec-branch.h>
#define CAMELLIA_TABLE_BYTE_LEN 272

/* struct camellia_ctx: */
#define key_table 0
#define key_length CAMELLIA_TABLE_BYTE_LEN

/* register macros */
#define CTX %rdi
/**********************************************************************
  16-way camellia
 **********************************************************************/
#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
	vpand x, mask4bit, tmp0; \
	vpandn x, mask4bit, x; \
	vpsrld $4, x, x; \
	\
	vpshufb tmp0, lo_t, tmp0; \
	vpshufb x, hi_t, x; \
	vpxor tmp0, x, x;
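/*
 * Scalar model of filter_8bit (an illustrative sketch, not part of the
 * original code; the helper name is made up): each byte is split into its
 * low and high nibble, each nibble indexes a 16-entry table via vpshufb,
 * and the two lookups are xored.  This evaluates an arbitrary 8-bit affine
 * transform on 16 bytes at once.
 *
 *	static inline uint8_t filter_8bit_model(uint8_t x,
 *						const uint8_t lo_t[16],
 *						const uint8_t hi_t[16])
 *	{
 *		return lo_t[x & 0x0f] ^ hi_t[x >> 4];
 *	}
 */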
/*
 * IN:  x0..x7: byte-sliced AB state
 *      mem_cd: register pointer storing CD state
 *      key: index for key material
 * OUT: x0..x7: new byte-sliced CD state
 */
#define roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, t6, \
		  t7, mem_cd, key) \
	/* \
	 * S-function with AES subbytes \
	 */ \
	vmovdqa .Linv_shift_row, t4; \
	vbroadcastss .L0f0f0f0f, t7; \
	vmovdqa .Lpre_tf_lo_s1, t0; \
	vmovdqa .Lpre_tf_hi_s1, t1; \
	\
	/* AES inverse shift rows */ \
	vpshufb t4, x0, x0; \
	vpshufb t4, x7, x7; \
	vpshufb t4, x1, x1; \
	vpshufb t4, x4, x4; \
	vpshufb t4, x2, x2; \
	vpshufb t4, x5, x5; \
	vpshufb t4, x3, x3; \
	vpshufb t4, x6, x6; \
	\
	/* prefilter sboxes 1, 2 and 3 */ \
	vmovdqa .Lpre_tf_lo_s4, t2; \
	vmovdqa .Lpre_tf_hi_s4, t3; \
	filter_8bit(x0, t0, t1, t7, t6); \
	filter_8bit(x7, t0, t1, t7, t6); \
	filter_8bit(x1, t0, t1, t7, t6); \
	filter_8bit(x4, t0, t1, t7, t6); \
	filter_8bit(x2, t0, t1, t7, t6); \
	filter_8bit(x5, t0, t1, t7, t6); \
	\
	/* prefilter sbox 4 */ \
	vpxor t4, t4, t4; \
	filter_8bit(x3, t2, t3, t7, t6); \
	filter_8bit(x6, t2, t3, t7, t6); \
	\
	/* AES subbytes + AES shift rows */ \
	vmovdqa .Lpost_tf_lo_s1, t0; \
	vmovdqa .Lpost_tf_hi_s1, t1; \
	vaesenclast t4, x0, x0; \
	vaesenclast t4, x7, x7; \
	vaesenclast t4, x1, x1; \
	vaesenclast t4, x4, x4; \
	vaesenclast t4, x2, x2; \
	vaesenclast t4, x5, x5; \
	vaesenclast t4, x3, x3; \
	vaesenclast t4, x6, x6; \
	\
	/* postfilter sboxes 1 and 4 */ \
	vmovdqa .Lpost_tf_lo_s3, t2; \
	vmovdqa .Lpost_tf_hi_s3, t3; \
	filter_8bit(x0, t0, t1, t7, t6); \
	filter_8bit(x7, t0, t1, t7, t6); \
	filter_8bit(x3, t0, t1, t7, t6); \
	filter_8bit(x6, t0, t1, t7, t6); \
	\
	/* postfilter sbox 3 */ \
	vmovdqa .Lpost_tf_lo_s2, t4; \
	vmovdqa .Lpost_tf_hi_s2, t5; \
	filter_8bit(x2, t2, t3, t7, t6); \
	filter_8bit(x5, t2, t3, t7, t6); \
	\
	vpxor t6, t6, t6; \
	vmovq key, t0; \
	\
	/* postfilter sbox 2 */ \
	filter_8bit(x1, t4, t5, t7, t2); \
	filter_8bit(x4, t4, t5, t7, t2); \
	\
	vpsrldq $5, t0, t5; \
	vpsrldq $1, t0, t1; \
	vpsrldq $2, t0, t2; \
	vpsrldq $3, t0, t3; \
	vpsrldq $4, t0, t4; \
	vpshufb t6, t0, t0; \
	vpshufb t6, t1, t1; \
	vpshufb t6, t2, t2; \
	vpshufb t6, t3, t3; \
	vpshufb t6, t4, t4; \
	vpsrldq $2, t5, t7; \
	vpshufb t6, t7, t7; \
	\
	vpxor x2, x7, x7; /* note: high and low parts swapped */ \
	\
	/* \
	 * Add key material and result to CD (x becomes new CD) \
	 */ \
	\
	vpxor 0 * 16(mem_cd), x4, x4; \
	\
	vpxor 1 * 16(mem_cd), x5, x5; \
	\
	vpsrldq $1, t5, t3; \
	vpshufb t6, t5, t5; \
	vpshufb t6, t3, t6; \
	\
	vpxor 2 * 16(mem_cd), x6, x6; \
	\
	vpxor 3 * 16(mem_cd), x7, x7; \
	\
	vpxor 4 * 16(mem_cd), x0, x0; \
	\
	vpxor 5 * 16(mem_cd), x1, x1; \
	\
	vpxor 6 * 16(mem_cd), x2, x2; \
	\
	vpxor 7 * 16(mem_cd), x3, x3;
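/*
 * How the S-function above evaluates the Camellia s-boxes per byte, as a
 * rough C sketch (illustrative only, not part of the original source;
 * names are made up): the AES S-box is applied with AESENCLAST (zero
 * round key plus a pre-applied inverse ShiftRows, so only SubBytes
 * remains) and is sandwiched between two 8-bit affine "filters" done by
 * filter_8bit() with the .Lpre_tf_* and .Lpost_tf_* tables below.
 *
 *	static uint8_t sbox_via_aes_model(uint8_t x,
 *					  const uint8_t pre_lo[16],
 *					  const uint8_t pre_hi[16],
 *					  const uint8_t post_lo[16],
 *					  const uint8_t post_hi[16],
 *					  uint8_t (*aes_sbox)(uint8_t))
 *	{
 *		uint8_t t;
 *
 *		t = pre_lo[x & 0x0f] ^ pre_hi[x >> 4];	// prefilter
 *		t = aes_sbox(t);			// vaesenclast
 *		return post_lo[t & 0x0f] ^ post_hi[t >> 4];	// postfilter
 *	}
 */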
/*
 * Size optimization... with inlined roundsm16, binary would be over 5 times
 * larger and would only be 0.5% faster (on sandy-bridge).
 */
roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd:
	roundsm16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		  %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15,
		  %rcx, (%r9));
	ret;
ENDPROC(roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd)

roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab:
	roundsm16(%xmm4, %xmm5, %xmm6, %xmm7, %xmm0, %xmm1, %xmm2, %xmm3,
		  %xmm12, %xmm13, %xmm14, %xmm15, %xmm8, %xmm9, %xmm10, %xmm11,
		  %rax, (%r9));
	ret;
ENDPROC(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
/*
 * IN/OUT: x0..x7: byte-sliced AB state preloaded
 *         mem_ab: byte-sliced AB state in memory
 *         mem_cd: byte-sliced CD state in memory
 */
#define two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, i, dir, store_ab) \
	leaq (key_table + (i) * 8)(CTX), %r9; \
	call roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd; \
	\
	vmovdqu x4, 0 * 16(mem_cd); \
	vmovdqu x5, 1 * 16(mem_cd); \
	vmovdqu x6, 2 * 16(mem_cd); \
	vmovdqu x7, 3 * 16(mem_cd); \
	vmovdqu x0, 4 * 16(mem_cd); \
	vmovdqu x1, 5 * 16(mem_cd); \
	vmovdqu x2, 6 * 16(mem_cd); \
	vmovdqu x3, 7 * 16(mem_cd); \
	\
	leaq (key_table + ((i) + (dir)) * 8)(CTX), %r9; \
	call roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab; \
	\
	store_ab(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab);
#define dummy_store(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) /* do nothing */
#define store_ab_state(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) \
	/* Store new AB state */ \
	vmovdqu x0, 0 * 16(mem_ab); \
	vmovdqu x1, 1 * 16(mem_ab); \
	vmovdqu x2, 2 * 16(mem_ab); \
	vmovdqu x3, 3 * 16(mem_ab); \
	vmovdqu x4, 4 * 16(mem_ab); \
	vmovdqu x5, 5 * 16(mem_ab); \
	vmovdqu x6, 6 * 16(mem_ab); \
	vmovdqu x7, 7 * 16(mem_ab);
#define enc_rounds16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		     y6, y7, mem_ab, mem_cd, i) \
	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 2, 1, store_ab_state); \
	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 4, 1, store_ab_state); \
	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 6, 1, dummy_store);

#define dec_rounds16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		     y6, y7, mem_ab, mem_cd, i) \
	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \
	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \
	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store);
/*
 * IN:  v0..3: byte-sliced 32-bit integers
 * OUT: v0..3: (IN <<< 1)
 */
#define rol32_1_16(v0, v1, v2, v3, t0, t1, t2, zero) \
	vpcmpgtb v0, zero, t0; \
	\
	vpcmpgtb v1, zero, t1; \
	\
	vpcmpgtb v2, zero, t2; \
	\
	vpcmpgtb v3, zero, t0;
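/*
 * For reference, the operation rol32_1_16 performs on each byte-sliced
 * 32-bit value, written as plain C (an illustrative sketch, not part of
 * the original source):
 *
 *	static inline uint32_t rol32_1_model(uint32_t x)
 *	{
 *		return (x << 1) | (x >> 31);
 *	}
 *
 * In the byte-sliced form, vpcmpgtb against zero extracts each byte's top
 * bit as a mask so it can be carried into the neighbouring byte slice.
 */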
/*
 * IN:  r: byte-sliced AB state in memory
 *      l: byte-sliced CD state in memory
 * OUT: x0..x7: new byte-sliced CD state
 */
#define fls16(l, l0, l1, l2, l3, l4, l5, l6, l7, r, t0, t1, t2, t3, tt0, \
	      tt1, tt2, tt3, kll, klr, krl, krr) \
	/* \
	 * t0 = kll; \
	 * t0 &= ll; \
	 * lr ^= rol32(t0, 1); \
	 */ \
	vpxor tt0, tt0, tt0; \
	vmovd kll, t0; \
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	rol32_1_16(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
	\
	vmovdqu l4, 4 * 16(l); \
	vmovdqu l5, 5 * 16(l); \
	vmovdqu l6, 6 * 16(l); \
	vmovdqu l7, 7 * 16(l); \
	\
	/* \
	 * rl ^= (rr | krr); \
	 */ \
	vmovd krr, t0; \
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vpor 4 * 16(r), t0, t0; \
	vpor 5 * 16(r), t1, t1; \
	vpor 6 * 16(r), t2, t2; \
	vpor 7 * 16(r), t3, t3; \
	\
	vpxor 0 * 16(r), t0, t0; \
	vpxor 1 * 16(r), t1, t1; \
	vpxor 2 * 16(r), t2, t2; \
	vpxor 3 * 16(r), t3, t3; \
	vmovdqu t0, 0 * 16(r); \
	vmovdqu t1, 1 * 16(r); \
	vmovdqu t2, 2 * 16(r); \
	vmovdqu t3, 3 * 16(r); \
	\
	/* \
	 * t2 = krl; \
	 * t2 &= rl; \
	 * rr ^= rol32(t2, 1); \
	 */ \
	vmovd krl, t0; \
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vpand 0 * 16(r), t0, t0; \
	vpand 1 * 16(r), t1, t1; \
	vpand 2 * 16(r), t2, t2; \
	vpand 3 * 16(r), t3, t3; \
	\
	rol32_1_16(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
	\
	vpxor 4 * 16(r), t0, t0; \
	vpxor 5 * 16(r), t1, t1; \
	vpxor 6 * 16(r), t2, t2; \
	vpxor 7 * 16(r), t3, t3; \
	vmovdqu t0, 4 * 16(r); \
	vmovdqu t1, 5 * 16(r); \
	vmovdqu t2, 6 * 16(r); \
	vmovdqu t3, 7 * 16(r); \
	\
	/* \
	 * ll ^= (lr | klr); \
	 */ \
	vmovd klr, t0; \
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vmovdqu l0, 0 * 16(l); \
	vmovdqu l1, 1 * 16(l); \
	vmovdqu l2, 2 * 16(l); \
	vmovdqu l3, 3 * 16(l);
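/*
 * Textbook model of the FL/FL⁻¹ layer that fls16 evaluates on every block
 * (an illustrative C sketch, not part of the original source; function
 * names are made up and the real code operates on byte-sliced state for
 * 16 blocks at once):
 *
 *	static void camellia_fl_model(uint32_t *ll, uint32_t *lr,
 *				      uint32_t kll, uint32_t klr)
 *	{
 *		uint32_t t = *ll & kll;
 *
 *		*lr ^= (t << 1) | (t >> 31);	// lr ^= rol32(ll & kll, 1)
 *		*ll ^= *lr | klr;		// ll ^= (lr | klr)
 *	}
 *
 *	static void camellia_flinv_model(uint32_t *rl, uint32_t *rr,
 *					 uint32_t krl, uint32_t krr)
 *	{
 *		uint32_t t;
 *
 *		*rl ^= *rr | krr;		// rl ^= (rr | krr)
 *		t = *rl & krl;
 *		*rr ^= (t << 1) | (t >> 31);	// rr ^= rol32(rl & krl, 1)
 *	}
 */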
#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
	vpunpckhdq x1, x0, t2; \
	vpunpckldq x1, x0, x0; \
	\
	vpunpckldq x3, x2, t1; \
	vpunpckhdq x3, x2, x2; \
	\
	vpunpckhqdq t1, x0, x1; \
	vpunpcklqdq t1, x0, x0; \
	\
	vpunpckhqdq x2, t2, x3; \
	vpunpcklqdq x2, t2, x2;
#define byteslice_16x16b(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, \
			 b3, c3, d3, st0, st1) \
	transpose_4x4(a0, a1, a2, a3, d2, d3); \
	transpose_4x4(b0, b1, b2, b3, d2, d3); \
	\
	transpose_4x4(c0, c1, c2, c3, a0, a1); \
	transpose_4x4(d0, d1, d2, d3, a0, a1); \
	\
	vmovdqu .Lshufb_16x16b, a0; \
	vpshufb a0, a2, a2; \
	vpshufb a0, a3, a3; \
	vpshufb a0, b0, b0; \
	vpshufb a0, b1, b1; \
	vpshufb a0, b2, b2; \
	vpshufb a0, b3, b3; \
	vpshufb a0, a1, a1; \
	vpshufb a0, c0, c0; \
	vpshufb a0, c1, c1; \
	vpshufb a0, c2, c2; \
	vpshufb a0, c3, c3; \
	vpshufb a0, d0, d0; \
	vpshufb a0, d1, d1; \
	vpshufb a0, d2, d2; \
	vpshufb a0, d3, d3; \
	vpshufb a0, d3, a0; \
	\
	transpose_4x4(a0, b0, c0, d0, d2, d3); \
	transpose_4x4(a1, b1, c1, d1, d2, d3); \
	\
	transpose_4x4(a2, b2, c2, d2, b0, b1); \
	transpose_4x4(a3, b3, c3, d3, b0, b1); \
	/* does not adjust output bytes inside vectors */
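/*
 * Layout sketch (illustrative only, not part of the original source):
 * byteslice_16x16b re-arranges 16 loaded blocks, one block per register,
 * into byte-sliced form where register n holds byte position n of all 16
 * blocks, roughly:
 *
 *	static void byteslice_model(uint8_t out[16][16],
 *				    const uint8_t in[16][16])
 *	{
 *		int blk, byte;
 *
 *		for (byte = 0; byte < 16; byte++)
 *			for (blk = 0; blk < 16; blk++)
 *				out[byte][blk] = in[blk][byte];
 *	}
 */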
/* load blocks to registers and apply pre-whitening */
#define inpack16_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		     y6, y7, rio, key) \
	vmovq key, x0; \
	vpshufb .Lpack_bswap, x0, x0; \
	\
	vpxor 0 * 16(rio), x0, y7; \
	vpxor 1 * 16(rio), x0, y6; \
	vpxor 2 * 16(rio), x0, y5; \
	vpxor 3 * 16(rio), x0, y4; \
	vpxor 4 * 16(rio), x0, y3; \
	vpxor 5 * 16(rio), x0, y2; \
	vpxor 6 * 16(rio), x0, y1; \
	vpxor 7 * 16(rio), x0, y0; \
	vpxor 8 * 16(rio), x0, x7; \
	vpxor 9 * 16(rio), x0, x6; \
	vpxor 10 * 16(rio), x0, x5; \
	vpxor 11 * 16(rio), x0, x4; \
	vpxor 12 * 16(rio), x0, x3; \
	vpxor 13 * 16(rio), x0, x2; \
	vpxor 14 * 16(rio), x0, x1; \
	vpxor 15 * 16(rio), x0, x0;
/* byteslice pre-whitened blocks and store to temporary memory */
#define inpack16_post(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd) \
	byteslice_16x16b(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
			 y5, y6, y7, (mem_ab), (mem_cd)); \
	\
	vmovdqu x0, 0 * 16(mem_ab); \
	vmovdqu x1, 1 * 16(mem_ab); \
	vmovdqu x2, 2 * 16(mem_ab); \
	vmovdqu x3, 3 * 16(mem_ab); \
	vmovdqu x4, 4 * 16(mem_ab); \
	vmovdqu x5, 5 * 16(mem_ab); \
	vmovdqu x6, 6 * 16(mem_ab); \
	vmovdqu x7, 7 * 16(mem_ab); \
	vmovdqu y0, 0 * 16(mem_cd); \
	vmovdqu y1, 1 * 16(mem_cd); \
	vmovdqu y2, 2 * 16(mem_cd); \
	vmovdqu y3, 3 * 16(mem_cd); \
	vmovdqu y4, 4 * 16(mem_cd); \
	vmovdqu y5, 5 * 16(mem_cd); \
	vmovdqu y6, 6 * 16(mem_cd); \
	vmovdqu y7, 7 * 16(mem_cd);
/* de-byteslice, apply post-whitening and store blocks */
#define outunpack16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
		    y5, y6, y7, key, stack_tmp0, stack_tmp1) \
	byteslice_16x16b(y0, y4, x0, x4, y1, y5, x1, x5, y2, y6, x2, x6, y3, \
			 y7, x3, x7, stack_tmp0, stack_tmp1); \
	\
	vmovdqu x0, stack_tmp0; \
	\
	vmovq key, x0; \
	vpshufb .Lpack_bswap, x0, x0; \
	\
	vpxor stack_tmp0, x0, x0;

#define write_output(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		     y6, y7, rio) \
	vmovdqu x0, 0 * 16(rio); \
	vmovdqu x1, 1 * 16(rio); \
	vmovdqu x2, 2 * 16(rio); \
	vmovdqu x3, 3 * 16(rio); \
	vmovdqu x4, 4 * 16(rio); \
	vmovdqu x5, 5 * 16(rio); \
	vmovdqu x6, 6 * 16(rio); \
	vmovdqu x7, 7 * 16(rio); \
	vmovdqu y0, 8 * 16(rio); \
	vmovdqu y1, 9 * 16(rio); \
	vmovdqu y2, 10 * 16(rio); \
	vmovdqu y3, 11 * 16(rio); \
	vmovdqu y4, 12 * 16(rio); \
	vmovdqu y5, 13 * 16(rio); \
	vmovdqu y6, 14 * 16(rio); \
	vmovdqu y7, 15 * 16(rio);
#define SHUFB_BYTES(idx) \
	0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)

.Lshufb_16x16b:
	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3);

/* For CTR-mode IV byteswap */
.Lbswap128_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

/* For XTS mode IV generation */
.Lxts_gf128mul_and_shl1_mask:
	.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
/*
 * pre-SubByte transform
 * pre-lookup for sbox1, sbox2, sbox3:
 *   swap_bitendianness(
 *       isom_map_camellia_to_aes(
 *           camellia_f(
 *               swap_bitendianness(in)
 *           )
 *       )
 *   )
 *
 * (note: '⊕ 0xc5' inside camellia_f())
 */
.Lpre_tf_lo_s1:
	.byte 0x45, 0xe8, 0x40, 0xed, 0x2e, 0x83, 0x2b, 0x86
	.byte 0x4b, 0xe6, 0x4e, 0xe3, 0x20, 0x8d, 0x25, 0x88
.Lpre_tf_hi_s1:
	.byte 0x00, 0x51, 0xf1, 0xa0, 0x8a, 0xdb, 0x7b, 0x2a
	.byte 0x09, 0x58, 0xf8, 0xa9, 0x83, 0xd2, 0x72, 0x23

/*
 * pre-SubByte transform
 * pre-lookup for sbox4:
 *   swap_bitendianness(
 *       isom_map_camellia_to_aes(
 *           camellia_f(
 *               swap_bitendianness(in <<< 1)
 *           )
 *       )
 *   )
 *
 * (note: '⊕ 0xc5' inside camellia_f())
 */
.Lpre_tf_lo_s4:
	.byte 0x45, 0x40, 0x2e, 0x2b, 0x4b, 0x4e, 0x20, 0x25
	.byte 0x14, 0x11, 0x7f, 0x7a, 0x1a, 0x1f, 0x71, 0x74
.Lpre_tf_hi_s4:
	.byte 0x00, 0xf1, 0x8a, 0x7b, 0x09, 0xf8, 0x83, 0x72
	.byte 0xad, 0x5c, 0x27, 0xd6, 0xa4, 0x55, 0x2e, 0xdf

/*
 * post-SubByte transform
 * post-lookup for sbox1, sbox4:
 *  swap_bitendianness(
 *      camellia_h(
 *          isom_map_aes_to_camellia(
 *              swap_bitendianness(
 *                  aes_inverse_affine_transform(in)
 *              )
 *          )
 *      )
 *  )
 *
 * (note: '⊕ 0x6e' inside camellia_h())
 */
.Lpost_tf_lo_s1:
	.byte 0x3c, 0xcc, 0xcf, 0x3f, 0x32, 0xc2, 0xc1, 0x31
	.byte 0xdc, 0x2c, 0x2f, 0xdf, 0xd2, 0x22, 0x21, 0xd1
.Lpost_tf_hi_s1:
	.byte 0x00, 0xf9, 0x86, 0x7f, 0xd7, 0x2e, 0x51, 0xa8
	.byte 0xa4, 0x5d, 0x22, 0xdb, 0x73, 0x8a, 0xf5, 0x0c

/*
 * post-SubByte transform
 * post-lookup for sbox2:
 *  swap_bitendianness(
 *      camellia_h(
 *          isom_map_aes_to_camellia(
 *              swap_bitendianness(
 *                  aes_inverse_affine_transform(in)
 *              )
 *          )
 *      )
 *  ) <<< 1
 *
 * (note: '⊕ 0x6e' inside camellia_h())
 */
.Lpost_tf_lo_s2:
	.byte 0x78, 0x99, 0x9f, 0x7e, 0x64, 0x85, 0x83, 0x62
	.byte 0xb9, 0x58, 0x5e, 0xbf, 0xa5, 0x44, 0x42, 0xa3
.Lpost_tf_hi_s2:
	.byte 0x00, 0xf3, 0x0d, 0xfe, 0xaf, 0x5c, 0xa2, 0x51
	.byte 0x49, 0xba, 0x44, 0xb7, 0xe6, 0x15, 0xeb, 0x18

/*
 * post-SubByte transform
 * post-lookup for sbox3:
 *  swap_bitendianness(
 *      camellia_h(
 *          isom_map_aes_to_camellia(
 *              swap_bitendianness(
 *                  aes_inverse_affine_transform(in)
 *              )
 *          )
 *      )
 *  ) >>> 1
 *
 * (note: '⊕ 0x6e' inside camellia_h())
 */
.Lpost_tf_lo_s3:
	.byte 0x1e, 0x66, 0xe7, 0x9f, 0x19, 0x61, 0xe0, 0x98
	.byte 0x6e, 0x16, 0x97, 0xef, 0x69, 0x11, 0x90, 0xe8
.Lpost_tf_hi_s3:
	.byte 0x00, 0xfc, 0x43, 0xbf, 0xeb, 0x17, 0xa8, 0x54
	.byte 0x52, 0xae, 0x11, 0xed, 0xb9, 0x45, 0xfa, 0x06

/* For isolating SubBytes from AESENCLAST, inverse shift row */
.Linv_shift_row:
	.byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
	.byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
__camellia_enc_blk16:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rax: temporary storage, 256 bytes
	 *	%xmm0..%xmm15: 16 plaintext blocks
	 * output:
	 *	%xmm0..%xmm15: 16 encrypted blocks, order swapped:
	 *	 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
	 */
	leaq 8 * 16(%rax), %rcx;

	inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		      %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		      %xmm15, %rax, %rcx);

	enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 0);

	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
	      %xmm15,
	      ((key_table + (8) * 8) + 0)(CTX),
	      ((key_table + (8) * 8) + 4)(CTX),
	      ((key_table + (8) * 8) + 8)(CTX),
	      ((key_table + (8) * 8) + 12)(CTX));
	enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 8);

	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
	      %xmm15,
	      ((key_table + (16) * 8) + 0)(CTX),
	      ((key_table + (16) * 8) + 4)(CTX),
	      ((key_table + (16) * 8) + 8)(CTX),
	      ((key_table + (16) * 8) + 12)(CTX));

	enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 16);
	movl $24, %r8d;
	cmpl $16, key_length(CTX);
	jne .Lenc_max32;

.Lenc_done:
	/* load CD for output */
	vmovdqu 0 * 16(%rcx), %xmm8;
	vmovdqu 1 * 16(%rcx), %xmm9;
	vmovdqu 2 * 16(%rcx), %xmm10;
	vmovdqu 3 * 16(%rcx), %xmm11;
	vmovdqu 4 * 16(%rcx), %xmm12;
	vmovdqu 5 * 16(%rcx), %xmm13;
	vmovdqu 6 * 16(%rcx), %xmm14;
	vmovdqu 7 * 16(%rcx), %xmm15;

	outunpack16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		    %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		    %xmm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 16(%rax));

	ret;

.align 8
.Lenc_max32:
	movl $32, %r8d;

	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
	      %xmm15,
	      ((key_table + (24) * 8) + 0)(CTX),
	      ((key_table + (24) * 8) + 4)(CTX),
	      ((key_table + (24) * 8) + 8)(CTX),
	      ((key_table + (24) * 8) + 12)(CTX));

	enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 24);

	jmp .Lenc_done;
ENDPROC(__camellia_enc_blk16)
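/*
 * Flow summary of __camellia_enc_blk16: three groups of six Feistel rounds
 * with an FL/FL⁻¹ layer between the groups; keys longer than 16 bytes run
 * one extra FL/FL⁻¹ layer and six more rounds (key table index 24 onwards)
 * before the output whitening applied by outunpack16.
 */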
__camellia_dec_blk16:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rax: temporary storage, 256 bytes
	 *	%r8d: 24 for 16 byte key, 32 for larger
	 *	%xmm0..%xmm15: 16 encrypted blocks
	 * output:
	 *	%xmm0..%xmm15: 16 plaintext blocks, order swapped:
	 *	 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
	 */
	leaq 8 * 16(%rax), %rcx;

	inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		      %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		      %xmm15, %rax, %rcx);

	cmpl $32, %r8d;
	je .Ldec_max32;

.Ldec_max24:
	dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 16);
	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
	      %xmm15,
	      ((key_table + (16) * 8) + 8)(CTX),
	      ((key_table + (16) * 8) + 12)(CTX),
	      ((key_table + (16) * 8) + 0)(CTX),
	      ((key_table + (16) * 8) + 4)(CTX));

	dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 8);

	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
	      %xmm15,
	      ((key_table + (8) * 8) + 8)(CTX),
	      ((key_table + (8) * 8) + 12)(CTX),
	      ((key_table + (8) * 8) + 0)(CTX),
	      ((key_table + (8) * 8) + 4)(CTX));

	dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 0);
	/* load CD for output */
	vmovdqu 0 * 16(%rcx), %xmm8;
	vmovdqu 1 * 16(%rcx), %xmm9;
	vmovdqu 2 * 16(%rcx), %xmm10;
	vmovdqu 3 * 16(%rcx), %xmm11;
	vmovdqu 4 * 16(%rcx), %xmm12;
	vmovdqu 5 * 16(%rcx), %xmm13;
	vmovdqu 6 * 16(%rcx), %xmm14;
	vmovdqu 7 * 16(%rcx), %xmm15;

	outunpack16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		    %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		    %xmm15, (key_table)(CTX), (%rax), 1 * 16(%rax));

	ret;

.align 8
.Ldec_max32:
	dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 24);

	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
	      %xmm15,
	      ((key_table + (24) * 8) + 8)(CTX),
	      ((key_table + (24) * 8) + 12)(CTX),
	      ((key_table + (24) * 8) + 0)(CTX),
	      ((key_table + (24) * 8) + 4)(CTX));

	jmp .Ldec_max24;
ENDPROC(__camellia_dec_blk16)
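/*
 * Decryption reuses the same round machinery as encryption: dec_rounds16
 * walks the subkeys in reverse (dir = -1 in two_roundsm16) and the fls16
 * calls above swap the kl/kr halves of each FL key pair, so the FL and
 * FL⁻¹ roles are exchanged relative to __camellia_enc_blk16.
 */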
ENTRY(camellia_ecb_enc_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (16 blocks)
	 *	%rdx: src (16 blocks)
	 */
	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rdx, (key_table)(CTX));

	/* now dst can be used as temporary buffer (even in src == dst case) */
	movq %rsi, %rax;

	call __camellia_enc_blk16;

	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
		     %xmm8, %rsi);

	ret;
ENDPROC(camellia_ecb_enc_16way)
ENTRY(camellia_ecb_dec_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (16 blocks)
	 *	%rdx: src (16 blocks)
	 */
	cmpl $16, key_length(CTX);
	movl $32, %r8d;
	movl $24, %eax;
	cmovel %eax, %r8d; /* max */

	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rdx, (key_table)(CTX, %r8, 8));

	/* now dst can be used as temporary buffer (even in src == dst case) */
	movq %rsi, %rax;

	call __camellia_dec_blk16;

	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
		     %xmm8, %rsi);

	ret;
ENDPROC(camellia_ecb_dec_16way)
ENTRY(camellia_cbc_dec_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (16 blocks)
	 *	%rdx: src (16 blocks)
	 */
	cmpl $16, key_length(CTX);
	movl $32, %r8d;
	movl $24, %eax;
	cmovel %eax, %r8d; /* max */

	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rdx, (key_table)(CTX, %r8, 8));

	/*
	 * dst might still be in-use (in case dst == src), so use stack for
	 * temporary storage.
	 */
	subq $(16 * 16), %rsp;
	movq %rsp, %rax;

	call __camellia_dec_blk16;

	addq $(16 * 16), %rsp;

	vpxor (0 * 16)(%rdx), %xmm6, %xmm6;
	vpxor (1 * 16)(%rdx), %xmm5, %xmm5;
	vpxor (2 * 16)(%rdx), %xmm4, %xmm4;
	vpxor (3 * 16)(%rdx), %xmm3, %xmm3;
	vpxor (4 * 16)(%rdx), %xmm2, %xmm2;
	vpxor (5 * 16)(%rdx), %xmm1, %xmm1;
	vpxor (6 * 16)(%rdx), %xmm0, %xmm0;
	vpxor (7 * 16)(%rdx), %xmm15, %xmm15;
	vpxor (8 * 16)(%rdx), %xmm14, %xmm14;
	vpxor (9 * 16)(%rdx), %xmm13, %xmm13;
	vpxor (10 * 16)(%rdx), %xmm12, %xmm12;
	vpxor (11 * 16)(%rdx), %xmm11, %xmm11;
	vpxor (12 * 16)(%rdx), %xmm10, %xmm10;
	vpxor (13 * 16)(%rdx), %xmm9, %xmm9;
	vpxor (14 * 16)(%rdx), %xmm8, %xmm8;
	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
		     %xmm8, %rsi);

	ret;
ENDPROC(camellia_cbc_dec_16way)
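/*
 * For reference, textbook CBC decryption over 16-byte blocks (an
 * illustrative C sketch, not part of the original source; names are made
 * up).  The vpxor chain above covers the C[i-1] terms for blocks 1..15,
 * while the first output block (%xmm7) is left for the caller to xor with
 * the IV:
 *
 *	void cbc_dec_model(uint8_t *dst, const uint8_t *src, size_t nblocks,
 *			   const uint8_t iv[16],
 *			   void (*decrypt_block)(uint8_t out[16],
 *						 const uint8_t in[16]))
 *	{
 *		size_t i, j;
 *
 *		for (i = nblocks; i-- > 0; ) {
 *			const uint8_t *prev = i ? src + 16 * (i - 1) : iv;
 *
 *			decrypt_block(dst + 16 * i, src + 16 * i);
 *			for (j = 0; j < 16; j++)
 *				dst[16 * i + j] ^= prev[j];
 *		}
 *	}
 */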
#define inc_le128(x, minus_one, tmp) \
	vpcmpeqq minus_one, x, tmp; \
	vpsubq minus_one, x, x; \
	vpslldq $8, tmp, tmp; \
	vpsubq tmp, x, x;
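/*
 * C model of inc_le128 (an illustrative sketch, not part of the original
 * source): a 128-bit little-endian increment with carry from the low to
 * the high 64-bit half, matching the vpcmpeqq/vpsubq/vpslldq sequence.
 *
 *	static void inc_le128_model(uint64_t ctr[2])
 *	{
 *		if (++ctr[0] == 0)
 *			++ctr[1];
 *	}
 */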
ENTRY(camellia_ctr_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (16 blocks)
	 *	%rdx: src (16 blocks)
	 *	%rcx: iv (little endian, 128bit)
	 */
	subq $(16 * 16), %rsp;
	movq %rsp, %rax;

	vmovdqa .Lbswap128_mask, %xmm14;

	/* load IV and byteswap */
	vmovdqu (%rcx), %xmm0;
	vpshufb %xmm14, %xmm0, %xmm15;
	vmovdqu %xmm15, 15 * 16(%rax);

	vpcmpeqd %xmm15, %xmm15, %xmm15;
	vpsrldq $8, %xmm15, %xmm15; /* low: -1, high: 0 */

	/* construct IVs */
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm13;
	vmovdqu %xmm13, 14 * 16(%rax);
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm13;
	vmovdqu %xmm13, 13 * 16(%rax);
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm12;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm11;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm10;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm9;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm8;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm7;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm6;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm5;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm4;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm3;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm2;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm1;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vmovdqa %xmm0, %xmm13;
	vpshufb %xmm14, %xmm0, %xmm0;
	inc_le128(%xmm13, %xmm15, %xmm14);
	vmovdqu %xmm13, (%rcx);
	/* inpack16_pre: */
	vmovq (key_table)(CTX), %xmm15;
	vpshufb .Lpack_bswap, %xmm15, %xmm15;
	vpxor %xmm0, %xmm15, %xmm0;
	vpxor %xmm1, %xmm15, %xmm1;
	vpxor %xmm2, %xmm15, %xmm2;
	vpxor %xmm3, %xmm15, %xmm3;
	vpxor %xmm4, %xmm15, %xmm4;
	vpxor %xmm5, %xmm15, %xmm5;
	vpxor %xmm6, %xmm15, %xmm6;
	vpxor %xmm7, %xmm15, %xmm7;
	vpxor %xmm8, %xmm15, %xmm8;
	vpxor %xmm9, %xmm15, %xmm9;
	vpxor %xmm10, %xmm15, %xmm10;
	vpxor %xmm11, %xmm15, %xmm11;
	vpxor %xmm12, %xmm15, %xmm12;
	vpxor 13 * 16(%rax), %xmm15, %xmm13;
	vpxor 14 * 16(%rax), %xmm15, %xmm14;
	vpxor 15 * 16(%rax), %xmm15, %xmm15;

	call __camellia_enc_blk16;

	addq $(16 * 16), %rsp;

	vpxor 0 * 16(%rdx), %xmm7, %xmm7;
	vpxor 1 * 16(%rdx), %xmm6, %xmm6;
	vpxor 2 * 16(%rdx), %xmm5, %xmm5;
	vpxor 3 * 16(%rdx), %xmm4, %xmm4;
	vpxor 4 * 16(%rdx), %xmm3, %xmm3;
	vpxor 5 * 16(%rdx), %xmm2, %xmm2;
	vpxor 6 * 16(%rdx), %xmm1, %xmm1;
	vpxor 7 * 16(%rdx), %xmm0, %xmm0;
	vpxor 8 * 16(%rdx), %xmm15, %xmm15;
	vpxor 9 * 16(%rdx), %xmm14, %xmm14;
	vpxor 10 * 16(%rdx), %xmm13, %xmm13;
	vpxor 11 * 16(%rdx), %xmm12, %xmm12;
	vpxor 12 * 16(%rdx), %xmm11, %xmm11;
	vpxor 13 * 16(%rdx), %xmm10, %xmm10;
	vpxor 14 * 16(%rdx), %xmm9, %xmm9;
	vpxor 15 * 16(%rdx), %xmm8, %xmm8;
	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
		     %xmm8, %rsi);

	ret;
ENDPROC(camellia_ctr_16way)
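/*
 * Textbook CTR mode for reference (an illustrative C sketch, not part of
 * the original source; names are made up).  The code above builds 16
 * byte-swapped counter values, encrypts them with __camellia_enc_blk16 and
 * xors the result with the source blocks:
 *
 *	void ctr_crypt_model(uint8_t *dst, const uint8_t *src, size_t nblocks,
 *			     uint8_t ctr[16],	// big-endian counter
 *			     void (*encrypt_block)(uint8_t out[16],
 *						   const uint8_t in[16]))
 *	{
 *		uint8_t keystream[16];
 *		size_t i, j;
 *
 *		for (i = 0; i < nblocks; i++) {
 *			encrypt_block(keystream, ctr);
 *			for (j = 0; j < 16; j++)
 *				dst[16 * i + j] = src[16 * i + j] ^ keystream[j];
 *			for (j = 16; j-- > 0; )	// increment counter
 *				if (++ctr[j] != 0)
 *					break;
 *		}
 *	}
 */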
#define gf128mul_x_ble(iv, mask, tmp) \
	vpsrad $31, iv, tmp; \
	vpaddq iv, iv, iv; \
	vpshufd $0x13, tmp, tmp; \
	vpand mask, tmp, tmp; \
	vpxor tmp, iv, iv;
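/*
 * C model of gf128mul_x_ble (an illustrative sketch, not part of the
 * original source): multiply the 128-bit XTS tweak, stored as two
 * little-endian 64-bit words, by x (alpha) in GF(2¹²⁸) with the usual
 * 0x87 reduction:
 *
 *	static void gf128mul_x_ble_model(uint64_t t[2])
 *	{
 *		uint64_t carry = t[1] >> 63;
 *
 *		t[1] = (t[1] << 1) | (t[0] >> 63);
 *		t[0] = (t[0] << 1) ^ (carry ? 0x87 : 0);
 *	}
 */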
camellia_xts_crypt_16way:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (16 blocks)
	 *	%rdx: src (16 blocks)
	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
	 *	%r8: index for input whitening key
	 *	%r9: pointer to __camellia_enc_blk16 or __camellia_dec_blk16
	 */
	subq $(16 * 16), %rsp;
	movq %rsp, %rax;

	vmovdqa .Lxts_gf128mul_and_shl1_mask, %xmm14;
	/* load IV and construct tweaked inputs */
	vmovdqu (%rcx), %xmm0;
	vpxor 0 * 16(%rdx), %xmm0, %xmm15;
	vmovdqu %xmm15, 15 * 16(%rax);
	vmovdqu %xmm0, 0 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 1 * 16(%rdx), %xmm0, %xmm15;
	vmovdqu %xmm15, 14 * 16(%rax);
	vmovdqu %xmm0, 1 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 2 * 16(%rdx), %xmm0, %xmm13;
	vmovdqu %xmm0, 2 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 3 * 16(%rdx), %xmm0, %xmm12;
	vmovdqu %xmm0, 3 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 4 * 16(%rdx), %xmm0, %xmm11;
	vmovdqu %xmm0, 4 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 5 * 16(%rdx), %xmm0, %xmm10;
	vmovdqu %xmm0, 5 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 6 * 16(%rdx), %xmm0, %xmm9;
	vmovdqu %xmm0, 6 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 7 * 16(%rdx), %xmm0, %xmm8;
	vmovdqu %xmm0, 7 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 8 * 16(%rdx), %xmm0, %xmm7;
	vmovdqu %xmm0, 8 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 9 * 16(%rdx), %xmm0, %xmm6;
	vmovdqu %xmm0, 9 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 10 * 16(%rdx), %xmm0, %xmm5;
	vmovdqu %xmm0, 10 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 11 * 16(%rdx), %xmm0, %xmm4;
	vmovdqu %xmm0, 11 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 12 * 16(%rdx), %xmm0, %xmm3;
	vmovdqu %xmm0, 12 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 13 * 16(%rdx), %xmm0, %xmm2;
	vmovdqu %xmm0, 13 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 14 * 16(%rdx), %xmm0, %xmm1;
	vmovdqu %xmm0, 14 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 15 * 16(%rdx), %xmm0, %xmm15;
	vmovdqu %xmm15, 0 * 16(%rax);
	vmovdqu %xmm0, 15 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vmovdqu %xmm0, (%rcx);
	/* inpack16_pre: */
	vmovq (key_table)(CTX, %r8, 8), %xmm15;
	vpshufb .Lpack_bswap, %xmm15, %xmm15;
	vpxor 0 * 16(%rax), %xmm15, %xmm0;
	vpxor %xmm1, %xmm15, %xmm1;
	vpxor %xmm2, %xmm15, %xmm2;
	vpxor %xmm3, %xmm15, %xmm3;
	vpxor %xmm4, %xmm15, %xmm4;
	vpxor %xmm5, %xmm15, %xmm5;
	vpxor %xmm6, %xmm15, %xmm6;
	vpxor %xmm7, %xmm15, %xmm7;
	vpxor %xmm8, %xmm15, %xmm8;
	vpxor %xmm9, %xmm15, %xmm9;
	vpxor %xmm10, %xmm15, %xmm10;
	vpxor %xmm11, %xmm15, %xmm11;
	vpxor %xmm12, %xmm15, %xmm12;
	vpxor %xmm13, %xmm15, %xmm13;
	vpxor 14 * 16(%rax), %xmm15, %xmm14;
	vpxor 15 * 16(%rax), %xmm15, %xmm15;

	CALL_NOSPEC %r9;

	addq $(16 * 16), %rsp;

	vpxor 0 * 16(%rsi), %xmm7, %xmm7;
	vpxor 1 * 16(%rsi), %xmm6, %xmm6;
	vpxor 2 * 16(%rsi), %xmm5, %xmm5;
	vpxor 3 * 16(%rsi), %xmm4, %xmm4;
	vpxor 4 * 16(%rsi), %xmm3, %xmm3;
	vpxor 5 * 16(%rsi), %xmm2, %xmm2;
	vpxor 6 * 16(%rsi), %xmm1, %xmm1;
	vpxor 7 * 16(%rsi), %xmm0, %xmm0;
	vpxor 8 * 16(%rsi), %xmm15, %xmm15;
	vpxor 9 * 16(%rsi), %xmm14, %xmm14;
	vpxor 10 * 16(%rsi), %xmm13, %xmm13;
	vpxor 11 * 16(%rsi), %xmm12, %xmm12;
	vpxor 12 * 16(%rsi), %xmm11, %xmm11;
	vpxor 13 * 16(%rsi), %xmm10, %xmm10;
	vpxor 14 * 16(%rsi), %xmm9, %xmm9;
	vpxor 15 * 16(%rsi), %xmm8, %xmm8;
	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
		     %xmm8, %rsi);

	ret;
ENDPROC(camellia_xts_crypt_16way)
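/*
 * Textbook XTS for reference (an illustrative C sketch, not part of the
 * original source; names are made up, and the tweak-doubling helper is the
 * gf128mul_x_ble_model() sketch above).  camellia_xts_crypt_16way applies
 * the tweak before and after the block cipher and advances it between
 * blocks:
 *
 *	void xts_crypt_model(uint8_t *dst, const uint8_t *src, size_t nblocks,
 *			     uint64_t tweak[2],
 *			     void (*crypt_block)(uint8_t out[16],
 *						 const uint8_t in[16]))
 *	{
 *		size_t i, j;
 *
 *		for (i = 0; i < nblocks; i++) {
 *			uint8_t buf[16];
 *
 *			memcpy(buf, src + 16 * i, 16);
 *			for (j = 0; j < 16; j++)
 *				buf[j] ^= ((uint8_t *)tweak)[j];
 *			crypt_block(buf, buf);
 *			for (j = 0; j < 16; j++)
 *				buf[j] ^= ((uint8_t *)tweak)[j];
 *			memcpy(dst + 16 * i, buf, 16);
 *			gf128mul_x_ble_model(tweak);	// next tweak
 *		}
 *	}
 */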
ENTRY(camellia_xts_enc_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (16 blocks)
	 *	%rdx: src (16 blocks)
	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
	 */
	xorl %r8d, %r8d; /* input whitening key, 0 for enc */

	leaq __camellia_enc_blk16, %r9;

	jmp camellia_xts_crypt_16way;
ENDPROC(camellia_xts_enc_16way)

ENTRY(camellia_xts_dec_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (16 blocks)
	 *	%rdx: src (16 blocks)
	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
	 */
	cmpl $16, key_length(CTX);
	movl $32, %r8d;
	movl $24, %eax;
	cmovel %eax, %r8d; /* input whitening key, last for dec */

	leaq __camellia_dec_blk16, %r9;

	jmp camellia_xts_crypt_16way;
ENDPROC(camellia_xts_dec_16way)