1 /* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Cast6 Cipher 8-way parallel algorithm (AVX/x86_64)
 *
 * Copyright (C) 2012 Johannes Goetzfried
 *     <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
 *
 * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 */
11 #include <linux/linkage.h>
12 #include <asm/frame.h>
13 #include "glue_helper-asm-avx.S"
15 .file "cast6-avx-x86_64-asm_64.S"
22 /* structure of crypto context */
32 /**********************************************************************
34 **********************************************************************/
/*
 * lookup_32bit(src, dst, op1, op2, op3, interleave_op, il_reg):
 * perform four 8-bit-indexed s-box lookups (tables s1..s4, 4-byte
 * entries) driven by the bh/bl byte views of GPR `src`, folding the
 * results into `dst` with mov/op1/op2/op3.  interleave_op(il_reg) is
 * expanded between the two lookup pairs so callers can advance `src`
 * (shr_next) or do nothing (dummy).
 * NOTE(review): this excerpt looks elided -- the original macro also
 * shifts `src` between the two movzbl pairs (a line is missing between
 * the "87" and "89" rows); confirm against the complete source.
 */
85 #define lookup_32bit(src, dst, op1, op2, op3, interleave_op, il_reg) \
86 movzbl src ## bh, RID1d; /* RID1 = byte from src's high-8 view */ \
87 movzbl src ## bl, RID2d; /* RID2 = byte from src's low-8 view */ \
89 movl s1(, RID1, 4), dst ## d; /* dst = s1[RID1] */ \
90 op1 s2(, RID2, 4), dst ## d; /* dst op1= s2[RID2] */ \
91 movzbl src ## bh, RID1d; \
92 movzbl src ## bl, RID2d; \
93 interleave_op(il_reg); /* caller-chosen: advance src or no-op */ \
94 op2 s3(, RID1, 4), dst ## d; \
95 op3 s4(, RID2, 4), dst ## d;
/* dummy(d): placeholder interleave_op that expands to nothing. */
97 #define dummy(d) /* do nothing */
/*
 * shr_next(reg): interleave op for lookup_32bit that advances `reg`
 * to its next pair of bytes.
 * F_head(a, x, gi1, gi2, op0): first half of the CAST6 round function
 * F -- mixes the round key into x and starts the data-dependent
 * rotation (vpslld by RKRF is the left half; its vpsrld-by-RKRR
 * companion pairs with RKRR = 32 - RKRF from get_round_keys).
 * NOTE(review): both macro bodies appear elided in this excerpt
 * (shr_next's shifts and most of F_head are missing, and the dangling
 * continuation after "#define shr_next(reg)" would mis-splice as
 * written) -- confirm against the complete source before assembling.
 */
99 #define shr_next(reg) \
102 #define F_head(a, x, gi1, gi2, op0) \
104 vpslld RKRF, x, RTMP; \
/* F_tail: finish the round function -- run the scalar s-box lookup  */ \
/* passes over both GPR halves (gi1/gi2), accumulating in RFS1..RFS3, */ \
/* then insert the result back into the high lane of vector x.       */ \
/* NOTE(review): several interior lines (vmovq/shift glue between the */ \
/* lookup passes) appear elided from this excerpt.                    */ \
111 #define F_tail(a, x, gi1, gi2, op1, op2, op3) \
112 lookup_32bit(##gi1, RFS1, op1, op2, op3, shr_next, ##gi1); \
113 lookup_32bit(##gi2, RFS3, op1, op2, op3, shr_next, ##gi2); \
115 lookup_32bit(##gi1, RFS2, op1, op2, op3, dummy, none); \
118 lookup_32bit(##gi2, RFS1, op1, op2, op3, dummy, none); \
123 vpinsrq $1, RFS3, x, x;
/*
 * F_2(a1, b1, a2, b2, op0..op3): apply round function F to two block
 * sets in parallel -- F_head on b1 and b2, then the F_tail lookup
 * passes; op0 is the vector key-mix op, op1..op3 the scalar combine
 * ops (together they select CAST6's f1/f2/f3 variant, see F1_2..F3_2).
 * NOTE(review): the final lines that xor the result into a1/a2 appear
 * elided from this excerpt.
 */
125 #define F_2(a1, b1, a2, b2, op0, op1, op2, op3) \
126 F_head(b1, RX, RGI1, RGI2, op0); \
127 F_head(b2, RX, RGI3, RGI4, op0); \
129 F_tail(b1, RX, RGI1, RGI2, op1, op2, op3); \
130 F_tail(b2, RTMP, RGI3, RGI4, op1, op2, op3); \
/* The three CAST6 round-function variants differ only in the ops     */ \
/* passed to F_2: the vector key-mix (add/xor/sub) and the rotation   */ \
/* of scalar combine ops over the s-box outputs.                      */ \
135 #define F1_2(a1, b1, a2, b2) \
136 F_2(a1, b1, a2, b2, vpaddd, xorl, subl, addl)
137 #define F2_2(a1, b1, a2, b2) \
138 F_2(a1, b1, a2, b2, vpxor, subl, addl, xorl)
139 #define F3_2(a1, b1, a2, b2) \
140 F_2(a1, b1, a2, b2, vpsubd, addl, xorl, subl)
/* qop(in, out, f): one quad-round sub-step -- token-pastes to the
 * matching F<f>_2 invocation on both block sets. */
142 #define qop(in, out, f) \
143 F ## f ## _2(out ## 1, in ## 1, out ## 2, in ## 2);
/*
 * get_round_keys(nn): broadcast 32-bit masking key km[nn] from the
 * context into RKM and derive the rotation controls from RKR:
 *   RKRF = RKR & R1ST   (current rotation count, low byte selected)
 *   RKRR = R32 - RKRF   (complementary count; R32 presumably holds the
 *                        constant 32 from .L32_mask -- value not
 *                        visible in this excerpt)
 * then byte-shift RKR right so the next call sees the next key.
 */
145 #define get_round_keys(nn) \
146 vbroadcastss (km+(4*(nn)))(CTX), RKM; \
147 vpand R1ST, RKR, RKRF; \
148 vpsubq RKRF, R32, RKRR; \
149 vpsrldq $1, RKR, RKR;
/*
 * NOTE(review): the two fragments below are continuation lines from
 * what appear to be the Q(n) and QBAR(n) quad-round macros; their
 * "#define" headers and the qop()/F-variant invocations between the
 * get_round_keys() calls are not visible in this excerpt.  The first
 * fragment walks keys 4n+0..4n+3 forward, the second walks them in
 * reverse (matching the inverse quad-round used for decryption).
 */
152 get_round_keys(4*n+0); \
155 get_round_keys(4*n+1); \
158 get_round_keys(4*n+2); \
161 get_round_keys(4*n+3); \
/* reverse key order below: inverse quad-round (QBAR) fragment */ \
165 get_round_keys(4*n+3); \
168 get_round_keys(4*n+2); \
171 get_round_keys(4*n+1); \
174 get_round_keys(4*n+0); \
/* shuffle(mask): byte-permute the rotation keys held in RKR.         */ \
177 #define shuffle(mask) \
178 vpshufb mask, RKR, RKR;
/*
 * preload_rkr(n, do_mask, mask): load the 16 rotation keys for round
 * group n -- broadcast .L16_mask into RKR, xor in kr[n*16..] from the
 * context, then let the caller permute them (do_mask = shuffle) or not
 * (do_mask = dummy).  NOTE(review): the trailing do_mask(mask)
 * expansion line is not visible in this excerpt.
 */
180 #define preload_rkr(n, do_mask, mask) \
181 vbroadcastss .L16_mask, RKR; \
182 /* add 16-bit rotation to key rotations (mod 32) */ \
183 vpxor (kr+n*16)(CTX), RKR, RKR; \
/* transpose_4x4: transpose a 4x4 matrix of 32-bit words held in      */ \
/* x0..x3 via the standard unpack-lo/hi dword then qword sequence;    */ \
/* t0..t2 are scratch, and x3 serves as scratch for its own column.   */ \
186 #define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
187 vpunpckldq x1, x0, t0; \
188 vpunpckhdq x1, x0, t2; \
189 vpunpckldq x3, x2, t1; \
190 vpunpckhdq x3, x2, x3; \
191 /* pair the interleaved halves into the transposed rows */ \
192 vpunpcklqdq t1, t0, x0; \
193 vpunpckhqdq t1, t0, x1; \
194 vpunpcklqdq x3, t2, x2; \
195 vpunpckhqdq x3, t2, x3;
/*
 * inpack_blocks: byte-shuffle each input vector through rmask (the
 * call sites pass RKM loaded from .Lbswap_mask, i.e. an endianness
 * swap) and transpose so each register holds one 32-bit word column
 * from four blocks.
 */
197 #define inpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask) \
198 vpshufb rmask, x0, x0; \
199 vpshufb rmask, x1, x1; \
200 vpshufb rmask, x2, x2; \
201 vpshufb rmask, x3, x3; \
202 /* column-major layout for the vectorized rounds */ \
203 transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
/*
 * outunpack_blocks: inverse of inpack_blocks -- transpose back to
 * block-major layout, then byte-shuffle each vector through rmask
 * to restore the external byte order.
 */
205 #define outunpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask) \
206 transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
207 /* restore external byte order */ \
208 vpshufb rmask, x0, x0; \
209 vpshufb rmask, x1, x1; \
210 vpshufb rmask, x2, x2; \
211 vpshufb rmask, x3, x3;
/* Read-only constant tables, grouped into mergeable 16-byte records. */
213 .section .rodata.cst16, "aM", @progbits, 16
/* NOTE(review): the .align directives and several labels (presumably
 * .Lbswap_mask for the first row, a bswap128-style mask for the
 * second, and .Lrkr_enc_Q_Q_Q_Q / .Lrkr_dec_Q_Q_Q_Q) appear elided in
 * this excerpt -- the unlabeled .byte rows below would be unreachable
 * as written; confirm against the complete source. */
216 .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
218 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
/* vpshufb masks reordering the RKR rotation-key bytes per round
 * group; the Q/QBAR suffix encodes which quad-round type consumes
 * each group of keys (see preload_rkr call sites). */
219 .Lrkr_enc_Q_Q_QBAR_QBAR:
220 .byte 0, 1, 2, 3, 4, 5, 6, 7, 11, 10, 9, 8, 15, 14, 13, 12
221 .Lrkr_enc_QBAR_QBAR_QBAR_QBAR:
222 .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
224 .byte 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3
225 .Lrkr_dec_Q_Q_QBAR_QBAR:
226 .byte 12, 13, 14, 15, 8, 9, 10, 11, 7, 6, 5, 4, 3, 2, 1, 0
227 .Lrkr_dec_QBAR_QBAR_QBAR_QBAR:
228 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
/* 4-byte scalar constants broadcast/moved into vector registers by
 * the _blk8 cores.  NOTE(review): the labels and .long value lines
 * for .L16_mask, .L32_mask and .Lfirst_mask are elided here. */
230 .section .rodata.cst4.L16_mask, "aM", @progbits, 4
235 .section .rodata.cst4.L32_mask, "aM", @progbits, 4
240 .section .rodata.cst4.first_mask, "aM", @progbits, 4
248 SYM_FUNC_START_LOCAL(__cast6_enc_blk8)
/*
 * 8-way CAST6 block encryption core.
 * input:
251 * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
 * output:
253 * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
 * CTX: cipher context providing the km/kr key arrays (register
 * mapping not visible in this excerpt).
 * NOTE(review): the prologue (FRAME_BEGIN, saving the GPRs used by
 * lookup_32bit), the Q()/QBAR() round invocations between the
 * preload_rkr() calls, and the epilogue/RET all appear elided here --
 * confirm against the complete source.
 */
261 vmovdqa .Lbswap_mask, RKM; /* byte-order mask for input shuffle */
262 vmovd .Lfirst_mask, R1ST; /* mask for extracting rotation counts */
263 vmovd .L32_mask, R32; /* constant used to form 32 - RKRF */
/* swap byte order and transpose to column-major */
265 inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
266 inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
/* key groups 0..2, forward order for encryption */
268 preload_rkr(0, dummy, none);
273 preload_rkr(1, shuffle, .Lrkr_enc_Q_Q_QBAR_QBAR);
278 preload_rkr(2, shuffle, .Lrkr_enc_QBAR_QBAR_QBAR_QBAR);
/* reload RKM (it was clobbered as a round-key register) */
287 vmovdqa .Lbswap_mask, RKM;
289 outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
290 outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
293 SYM_FUNC_END(__cast6_enc_blk8)
296 SYM_FUNC_START_LOCAL(__cast6_dec_blk8)
/*
 * 8-way CAST6 block decryption core (mirror of __cast6_enc_blk8:
 * key groups 2..0 consumed in reverse with the _dec rkr shuffles).
 * input:
299 * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
 * output:
301 * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: decrypted blocks
 * NOTE(review): prologue, the round invocations between the
 * preload_rkr() calls, and the epilogue/RET appear elided here --
 * confirm against the complete source.
 */
309 vmovdqa .Lbswap_mask, RKM; /* byte-order mask for input shuffle */
310 vmovd .Lfirst_mask, R1ST;
311 vmovd .L32_mask, R32;
313 inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
314 inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
/* key groups 2..0, reverse order for decryption */
316 preload_rkr(2, shuffle, .Lrkr_dec_Q_Q_Q_Q);
321 preload_rkr(1, shuffle, .Lrkr_dec_Q_Q_QBAR_QBAR);
326 preload_rkr(0, shuffle, .Lrkr_dec_QBAR_QBAR_QBAR_QBAR);
/* reload RKM (clobbered as a round-key register) */
335 vmovdqa .Lbswap_mask, RKM;
336 outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
337 outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
340 SYM_FUNC_END(__cast6_dec_blk8)
342 SYM_FUNC_START(cast6_ecb_enc_8way)
/*
 * ECB-mode encrypt of 8 blocks: load eight 16-byte blocks from the
 * source pointer in %rdx, run the 8-way encrypt core, store to the
 * destination pointer held in %r11.
 * NOTE(review): the prologue (FRAME_BEGIN and the instruction that
 * copies the dst argument -- presumably %rsi under SysV -- into %r11)
 * and the FRAME_END/RET epilogue are not visible in this excerpt.
 */
354 load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
356 call __cast6_enc_blk8;
358 store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
363 SYM_FUNC_END(cast6_ecb_enc_8way)
365 SYM_FUNC_START(cast6_ecb_dec_8way)
/*
 * ECB-mode decrypt of 8 blocks: load from src (%rdx), run the 8-way
 * decrypt core, store to the destination pointer held in %r11.
 * NOTE(review): prologue (FRAME_BEGIN, dst -> %r11 copy) and
 * epilogue are not visible in this excerpt.
 */
377 load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
379 call __cast6_dec_blk8;
381 store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
386 SYM_FUNC_END(cast6_ecb_dec_8way)
388 SYM_FUNC_START(cast6_cbc_dec_8way)
/*
 * CBC-mode decrypt of 8 blocks: decrypt the ciphertext loaded from
 * %rdx, then store_cbc_8way xors each plaintext block with the
 * preceding ciphertext block before storing.  %r12 presumably holds
 * a preserved copy of the src pointer (previous-ciphertext chain) and
 * %r11 the destination -- register setup not visible here.
 * NOTE(review): the prologue (FRAME_BEGIN, push of callee-saved %r12,
 * pointer copies) and the matching epilogue are elided in this
 * excerpt.
 */
402 load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
404 call __cast6_dec_blk8;
406 store_cbc_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
412 SYM_FUNC_END(cast6_cbc_dec_8way)