Mention branches and keyring.
[releases.git] / x86 / crypto / blowfish-x86_64-asm_64.S
1 /* SPDX-License-Identifier: GPL-2.0-or-later */
2 /*
3  * Blowfish Cipher Algorithm (x86_64)
4  *
5  * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
6  */
7
8 #include <linux/linkage.h>
9 #include <linux/cfi_types.h>
10
11 .file "blowfish-x86_64-asm.S"
12 .text
13
14 /* structure of crypto context */
/*
 * Byte offsets relative to the context base held in CTX.
 *   p       starts at offset 0 and is followed by s0 after (16 + 2)
 *           32-bit words.
 *   s0..s3  are spaced 256 * 4 bytes apart; the F()/F4() macros index
 *           them with a byte value scaled by 4 and read 32-bit entries.
 * NOTE(review): presumably mirrors the field order of the C-side Blowfish
 * context structure - confirm against its definition.
 */
15 #define p       0
16 #define s0      ((16 + 2) * 4)
17 #define s1      ((16 + 2 + (1 * 256)) * 4)
18 #define s2      ((16 + 2 + (2 * 256)) * 4)
19 #define s3      ((16 + 2 + (3 * 256)) * 4)
20
21 /* register macros */
/*
 * CTX - base of the crypto context. %r12 is callee-saved in the SysV
 *       AMD64 ABI, so each entry point in this file saves and restores it.
 * RIO - pointer used for block loads and stores. It is the same register
 *       as RT1 (%rsi), so F() and F4() overwrite it; the entry points keep
 *       dst in a spare register and reload RIO before storing.
 */
22 #define CTX %r12
23 #define RIO %rsi
24
/*
 * RX0..RX3 - one 64-bit cipher block each. The 1-way code uses RX0 only.
 * RX1 (%rbx) is callee-saved, RX2 (%rcx) and RX3 (%rdx) are also incoming
 * argument registers (xor flag and src in the entry points below).
 */
25 #define RX0 %rax
26 #define RX1 %rbx
27 #define RX2 %rcx
28 #define RX3 %rdx
29
/* 32-bit views of RX0..RX3 */
30 #define RX0d %eax
31 #define RX1d %ebx
32 #define RX2d %ecx
33 #define RX3d %edx
34
/* bits 7..0 of RX0..RX3 */
35 #define RX0bl %al
36 #define RX1bl %bl
37 #define RX2bl %cl
38 #define RX3bl %dl
39
/* bits 15..8 of RX0..RX3 */
40 #define RX0bh %ah
41 #define RX1bh %bh
42 #define RX2bh %ch
43 #define RX3bh %dh
44
/*
 * RT0..RT3 - scratch registers for S-box indices and the F result.
 * RT0 (%rdi) is also the incoming ctx argument, which the entry points
 * copy into CTX before any macro runs. RT1 aliases RIO (see above).
 */
45 #define RT0 %rdi
46 #define RT1 %rsi
47 #define RT2 %r8
48 #define RT3 %r9
49
/* 32-bit views of RT0..RT3 */
50 #define RT0d %edi
51 #define RT1d %esi
52 #define RT2d %r8d
53 #define RT3d %r9d
54
/*
 * RKEY - preloaded 64-bit round key used by the 4-way macros. The 1-way
 * entry points do not use RKEY and keep dst in %r10 instead.
 */
55 #define RKEY %r10
56
57 /***********************************************************************
58  * 1-way blowfish
59  ***********************************************************************/
/*
 * F(): one Feistel step on the block held in RX0.
 *
 * With a, b, c, d the bytes of the low 32 bits of RX0 (a = bits 31..24,
 * d = bits 7..0) this computes, in 32-bit arithmetic,
 *      f = ((s0[a] + s1[b]) ^ s2[c]) + s3[d]
 * then swaps the two 32-bit halves of RX0 and XORs f into the new low
 * half (i.e. into what was the high half on entry).
 *
 * Clobbers RT0, RT1, RT2 and the flags. RT1 is the same register as RIO,
 * so the I/O pointer does not survive this macro.
 */
60 #define F() \
61         rorq $16,               RX0; /* bring bits 31..16 down into %ax */ \
62         movzbl RX0bh,           RT0d; /* RT0 = a */ \
63         movzbl RX0bl,           RT1d; /* RT1 = b */ \
64         rolq $16,               RX0; /* undo the rotate */ \
65         movl s0(CTX,RT0,4),     RT0d; \
66         addl s1(CTX,RT1,4),     RT0d; /* RT0 = s0[a] + s1[b] */ \
67         movzbl RX0bh,           RT1d; /* RT1 = c */ \
68         movzbl RX0bl,           RT2d; /* RT2 = d */ \
69         rolq $32,               RX0; /* swap the 32-bit halves */ \
70         xorl s2(CTX,RT1,4),     RT0d; \
71         addl s3(CTX,RT2,4),     RT0d; /* RT0 = f; 32-bit write leaves bits 63..32 zero */ \
72         xorq RT0,               RX0;
73
/*
 * add_roundkey_enc(n): XOR the 8 bytes at p + 4*n into RX0 with a single
 * 64-bit load, so 32-bit word n of p meets the low half of RX0 and word
 * n + 1 the high half.
 */
74 #define add_roundkey_enc(n) \
75         xorq p+4*(n)(CTX),      RX0;
76
/*
 * round_enc(n): key addition with words n and n + 1 of p, followed by two
 * F() steps. Clobbers whatever F() clobbers (RT0, RT1/RIO, RT2, flags).
 */
77 #define round_enc(n) \
78         add_roundkey_enc(n); \
79         \
80         F(); \
81         F();
82
/*
 * add_roundkey_dec(n): XOR words n and n - 1 of p into RX0 in reversed
 * order. The 64-bit load at p + 4*(n - 1) puts word n - 1 in the low half
 * of RT0 and word n in the high half; the rotate by 32 swaps them, so
 * word n meets the low half of RX0 and word n - 1 the high half.
 * The argument is parenthesised so that an expression such as "k - 2" is
 * evaluated before the subtraction.
 * Clobbers RT0 and the flags.
 */
83 #define add_roundkey_dec(n) \
84         movq p+4*((n)-1)(CTX),  RT0; \
85         rorq $32,               RT0; \
86         xorq RT0,               RX0;
87
/*
 * round_dec(n): key addition with words n and n - 1 of p (see
 * add_roundkey_dec), followed by two F() steps. Clobbers RT0, RT1/RIO,
 * RT2 and the flags.
 */
88 #define round_dec(n) \
89         add_roundkey_dec(n); \
90         \
91         F(); \
92         F();
93
/*
 * read_block(): load the 8 bytes at (RIO) into RX0, swap the 32-bit halves
 * and reverse the byte order. Afterwards the first four bytes in memory,
 * read as a big-endian word, are in the low half of RX0 and the last four
 * bytes are in the high half.
 */
94 #define read_block() \
95         movq (RIO),             RX0; \
96         rorq $32,               RX0; \
97         bswapq                  RX0;
98
/*
 * write_block(): reverse the byte order of RX0 and store it to the 8 bytes
 * at (RIO). There is no half swap here, so the high half of RX0 becomes
 * the first four bytes in memory (big-endian) and the low half the last
 * four - the opposite half order from read_block().
 * RX0 is left byte-swapped.
 */
99 #define write_block() \
100         bswapq                  RX0; \
101         movq RX0,               (RIO);
102
/*
 * xor_block(): same byte reversal as write_block(), but the result is
 * XORed into the 8 bytes already at (RIO) instead of overwriting them.
 * RX0 is left byte-swapped; flags are modified by the xor.
 */
103 #define xor_block() \
104         bswapq                  RX0; \
105         xorq RX0,               (RIO);
106
/*
 * __blowfish_enc_blk: run one 8-byte block from src through eight
 * round_enc() steps (words 0..15 of p) and a final key addition with
 * words 16 and 17, then either store the result to dst or XOR it into
 * dst, depending on %cl.
 *
 * No stack is used: the callee-saved %r12 (CTX) is parked in %r11 and dst
 * in %r10, because RIO (%rsi) is overwritten by F().
 * Clobbers %rax, %rdi, %rsi, %r8, %r10, %r11 and the flags.
 */
107 SYM_FUNC_START(__blowfish_enc_blk)
108         /* input:
109          *      %rdi: ctx
110          *      %rsi: dst
111          *      %rdx: src
112          *      %rcx: bool, if true: xor output
113          */
114         movq %r12, %r11; /* keep caller's %r12 while it serves as CTX */
115
116         movq %rdi, CTX; /* %rdi becomes scratch (RT0) from here on */
117         movq %rsi, %r10; /* remember dst; %rsi is RIO and RT1 */
118         movq %rdx, RIO; /* RIO = src for the load */
119
120         read_block();
121
122         round_enc(0);
123         round_enc(2);
124         round_enc(4);
125         round_enc(6);
126         round_enc(8);
127         round_enc(10);
128         round_enc(12);
129         round_enc(14);
130         add_roundkey_enc(16);
131
132         movq %r11, %r12; /* CTX no longer needed: restore caller's %r12 */
133
134         movq %r10, RIO; /* RIO = dst for the store */
135         test %cl, %cl; /* %rcx is untouched by the 1-way macros */
136         jnz .L__enc_xor;
137
138         write_block();
139         RET;
140 .L__enc_xor:
141         xor_block();
142         RET;
143 SYM_FUNC_END(__blowfish_enc_blk)
144
/*
 * blowfish_dec_blk: run one 8-byte block from src through eight
 * round_dec() steps, which walk p downwards from words 17/16 to 3/2, and
 * a final key addition with words 1 and 0, then store the result to dst.
 *
 * No stack is used: the callee-saved %r12 (CTX) is parked in %r11 and dst
 * in %r10, because RIO (%rsi) is overwritten by F().
 * Clobbers %rax, %rdi, %rsi, %r8, %r10, %r11 and the flags.
 */
145 SYM_TYPED_FUNC_START(blowfish_dec_blk)
146         /* input:
147          *      %rdi: ctx
148          *      %rsi: dst
149          *      %rdx: src
150          */
151         movq %r12, %r11; /* keep caller's %r12 while it serves as CTX */
152
153         movq %rdi, CTX; /* %rdi becomes scratch (RT0) from here on */
154         movq %rsi, %r10; /* remember dst; %rsi is RIO and RT1 */
155         movq %rdx, RIO; /* RIO = src for the load */
156
157         read_block();
158
159         round_dec(17);
160         round_dec(15);
161         round_dec(13);
162         round_dec(11);
163         round_dec(9);
164         round_dec(7);
165         round_dec(5);
166         round_dec(3);
167         add_roundkey_dec(1);
168
169         movq %r10, RIO; /* RIO = dst for the store */
170         write_block();
171
172         movq %r11, %r12; /* restore caller's %r12 */
173
174         RET;
175 SYM_FUNC_END(blowfish_dec_blk)
176
177 /**********************************************************************
178   4-way blowfish, four blocks in parallel
179  **********************************************************************/
180
181 /* F() for 4-way. Slower when used alone (1-way), but faster when used in
182  * parallel (4-way); tested on AMD Phenom II & Intel Xeon E7330.
183  */
/*
 * F4(x): the same step as F(), applied to block register x.
 *
 * x is token-pasted with the "bh"/"bl" suffixes, so it has to be written
 * as one of RX0..RX3. With a, b, c, d the bytes of the low 32 bits of x
 * (a = bits 31..24, d = bits 7..0) this computes
 *      f = ((s0[a] + s1[b]) ^ s2[c]) + s3[d]
 * The two rotates by 16 add up to a swap of the 32-bit halves of x, and f
 * is XORed into the new low half.
 *
 * Unlike F(), all four index bytes are extracted before any table load
 * and each gets its own scratch register.
 * Clobbers RT0..RT3 and the flags (RT1 is the same register as RIO).
 */
184 #define F4(x) \
185         movzbl x ## bh,         RT1d; /* RT1 = c */ \
186         movzbl x ## bl,         RT3d; /* RT3 = d */ \
187         rorq $16,               x; \
188         movzbl x ## bh,         RT0d; /* RT0 = a */ \
189         movzbl x ## bl,         RT2d; /* RT2 = b */ \
190         rorq $16,               x; /* total rotation is 32: halves swapped */ \
191         movl s0(CTX,RT0,4),     RT0d; \
192         addl s1(CTX,RT2,4),     RT0d; \
193         xorl s2(CTX,RT1,4),     RT0d; \
194         addl s3(CTX,RT3,4),     RT0d; /* RT0 = f; bits 63..32 are zero */ \
195         xorq RT0,               x;
196
/*
 * add_preloaded_roundkey4(): XOR the 64-bit key currently held in RKEY
 * into all four block registers. RKEY itself is not modified.
 */
197 #define add_preloaded_roundkey4() \
198         xorq RKEY,              RX0; \
199         xorq RKEY,              RX1; \
200         xorq RKEY,              RX2; \
201         xorq RKEY,              RX3;
202
/*
 * preload_roundkey_enc(n): load the 8 bytes at p + 4*n into RKEY, i.e.
 * word n of p in the low half and word n + 1 in the high half.
 */
203 #define preload_roundkey_enc(n) \
204         movq p+4*(n)(CTX),      RKEY;
205
/*
 * add_roundkey_enc4(n): apply the key already sitting in RKEY to the four
 * blocks, then preload the key for n + 2. n is used only for the preload;
 * the caller must have loaded the key for n into RKEY beforehand.
 */
206 #define add_roundkey_enc4(n) \
207         add_preloaded_roundkey4(); \
208         preload_roundkey_enc(n + 2);
209
/*
 * round_enc4(n): 4-way counterpart of round_enc(n). Adds the preloaded
 * key (and preloads the one for n + 2), then runs F4() twice on every
 * block, going through RX0..RX3 for the first step before starting the
 * second. Clobbers RT0..RT3, RKEY and the flags.
 */
210 #define round_enc4(n) \
211         add_roundkey_enc4(n); \
212         \
213         F4(RX0); \
214         F4(RX1); \
215         F4(RX2); \
216         F4(RX3); \
217         \
218         F4(RX0); \
219         F4(RX1); \
220         F4(RX2); \
221         F4(RX3);
222
/*
 * preload_roundkey_dec(n): load the 8 bytes at p + 4*(n - 1) into RKEY and
 * swap its halves, leaving word n of p in the low half and word n - 1 in
 * the high half. Modifies the flags.
 */
223 #define preload_roundkey_dec(n) \
224         movq p+4*((n)-1)(CTX),  RKEY; \
225         rorq $32,               RKEY;
226
/*
 * add_roundkey_dec4(n): apply the key already sitting in RKEY to the four
 * blocks, then preload the key for n - 2. n is used only for the preload;
 * the caller must have loaded the key for n into RKEY beforehand.
 */
227 #define add_roundkey_dec4(n) \
228         add_preloaded_roundkey4(); \
229         preload_roundkey_dec(n - 2);
230
/*
 * round_dec4(n): 4-way counterpart of round_dec(n). Adds the preloaded
 * key (and preloads the one for n - 2), then runs F4() twice on every
 * block, going through RX0..RX3 for the first step before starting the
 * second. Clobbers RT0..RT3, RKEY and the flags.
 */
231 #define round_dec4(n) \
232         add_roundkey_dec4(n); \
233         \
234         F4(RX0); \
235         F4(RX1); \
236         F4(RX2); \
237         F4(RX3); \
238         \
239         F4(RX0); \
240         F4(RX1); \
241         F4(RX2); \
242         F4(RX3);
243
/*
 * read_block4(): load four consecutive 8-byte blocks from RIO + 0, 8, 16
 * and 24 into RX0..RX3. Each one is half-swapped and byte-reversed as in
 * read_block(), so the first four bytes of a block (big-endian) end up in
 * the low half of its register.
 * RX2 and RX3 are %rcx and %rdx, so both incoming argument registers are
 * overwritten here.
 */
244 #define read_block4() \
245         movq (RIO),             RX0; \
246         rorq $32,               RX0; \
247         bswapq                  RX0; \
248         \
249         movq 8(RIO),            RX1; \
250         rorq $32,               RX1; \
251         bswapq                  RX1; \
252         \
253         movq 16(RIO),           RX2; \
254         rorq $32,               RX2; \
255         bswapq                  RX2; \
256         \
257         movq 24(RIO),           RX3; \
258         rorq $32,               RX3; \
259         bswapq                  RX3;
260
/*
 * write_block4(): byte-reverse RX0..RX3 and store them to RIO + 0, 8, 16
 * and 24. As in write_block() there is no half swap, so the high half of
 * each register becomes the first four bytes of its block in memory.
 * The registers are left byte-swapped.
 */
261 #define write_block4() \
262         bswapq                  RX0; \
263         movq RX0,               (RIO); \
264         \
265         bswapq                  RX1; \
266         movq RX1,               8(RIO); \
267         \
268         bswapq                  RX2; \
269         movq RX2,               16(RIO); \
270         \
271         bswapq                  RX3; \
272         movq RX3,               24(RIO);
273
/*
 * xor_block4(): same byte reversal as write_block4(), but each result is
 * XORed into the 8 bytes already present at RIO + 0, 8, 16 and 24 instead
 * of overwriting them. The registers are left byte-swapped; flags are
 * modified.
 */
274 #define xor_block4() \
275         bswapq                  RX0; \
276         xorq RX0,               (RIO); \
277         \
278         bswapq                  RX1; \
279         xorq RX1,               8(RIO); \
280         \
281         bswapq                  RX2; \
282         xorq RX2,               16(RIO); \
283         \
284         bswapq                  RX3; \
285         xorq RX3,               24(RIO);
286
/*
 * __blowfish_enc_blk_4way: run four consecutive 8-byte blocks from src
 * through eight round_enc4() steps and a final key addition, then either
 * store the 32 result bytes to dst or XOR them into dst, depending on the
 * low byte of the incoming %rcx.
 *
 * Stack use: three pushes. %r12 and %rbx are callee-saved and serve as
 * CTX and RX1. %rcx is pushed because it doubles as RX2 and would
 * otherwise lose the xor flag. dst is kept in %r11 because %r10 is RKEY
 * and %rsi (RIO) is overwritten by F4().
 * Clobbers %rax, %rcx, %rdx, %rdi, %rsi, %r8..%r11 and the flags.
 */
287 SYM_FUNC_START(__blowfish_enc_blk_4way)
288         /* input:
289          *      %rdi: ctx
290          *      %rsi: dst
291          *      %rdx: src
292          *      %rcx: bool, if true: xor output
293          */
294         pushq %r12;
295         pushq %rbx;
296         pushq %rcx; /* xor flag; popped into %r12 after the rounds */
297
298         movq %rdi, CTX
299         movq %rsi, %r11; /* remember dst */
300         movq %rdx, RIO; /* RIO = src; %rdx is reused as RX3 by read_block4 */
301
302         preload_roundkey_enc(0); /* round_enc4(0) expects its key in RKEY */
303
304         read_block4();
305
306         round_enc4(0);
307         round_enc4(2);
308         round_enc4(4);
309         round_enc4(6);
310         round_enc4(8);
311         round_enc4(10);
312         round_enc4(12);
313         round_enc4(14);
314         add_preloaded_roundkey4(); /* key for 16 was preloaded by round_enc4(14) */
315
316         popq %r12; /* fetch the saved xor flag; CTX is no longer needed */
317         movq %r11, RIO; /* RIO = dst */
318
319         test %r12b, %r12b;
320         jnz .L__enc_xor4;
321
322         write_block4();
323
324         popq %rbx;
325         popq %r12; /* caller's %r12 */
326         RET;
327
328 .L__enc_xor4:
329         xor_block4();
330
331         popq %rbx;
332         popq %r12; /* caller's %r12 */
333         RET;
334 SYM_FUNC_END(__blowfish_enc_blk_4way)
335
/*
 * blowfish_dec_blk_4way: run four consecutive 8-byte blocks from src
 * through eight round_dec4() steps, which walk p downwards, and a final
 * key addition, then store the 32 result bytes to dst.
 *
 * Stack use: two pushes for the callee-saved %r12 (CTX) and %rbx (RX1).
 * dst is kept in %r11 because %r10 is RKEY and %rsi (RIO) is overwritten
 * by F4().
 * Clobbers %rax, %rcx, %rdx, %rdi, %rsi, %r8..%r11 and the flags.
 */
336 SYM_TYPED_FUNC_START(blowfish_dec_blk_4way)
337         /* input:
338          *      %rdi: ctx
339          *      %rsi: dst
340          *      %rdx: src
341          */
342         pushq %r12;
343         pushq %rbx;
344
345         movq %rdi, CTX;
346         movq %rsi, %r11
347         movq %rdx, RIO; /* RIO = src; %rdx is reused as RX3 by read_block4 */
348
349         preload_roundkey_dec(17); /* round_dec4(17) expects its key in RKEY */
350         read_block4();
351
352         round_dec4(17);
353         round_dec4(15);
354         round_dec4(13);
355         round_dec4(11);
356         round_dec4(9);
357         round_dec4(7);
358         round_dec4(5);
359         round_dec4(3);
360         add_preloaded_roundkey4(); /* key for 1 was preloaded by round_dec4(3) */
361
362         movq %r11, RIO; /* RIO = dst */
363         write_block4();
364
365         popq %rbx;
366         popq %r12;
367
368         RET;
369 SYM_FUNC_END(blowfish_dec_blk_4way)