2 ***********************************************************************
4 * Implementation of the Skein block functions.
6 * Source code author: Doug Whiting, 2008.
8 * This algorithm and source code is released to the public domain.
10 * Compile-time switches:
12 * SKEIN_USE_ASM -- set bits (256/512/1024) to select which
13 * versions use ASM code for block processing
14 * [default: use C for all block sizes]
16 ***********************************************************************
19 #include <linux/string.h>
20 #include <linux/bitops.h>
21 #include "skein_base.h"
22 #include "skein_block.h"
24 /***************************** SKEIN_256 ******************************/
25 #if !(SKEIN_USE_ASM & 256)
/*
 * skein_256_process_block() - C version of the Skein-256 block function,
 * built only when SKEIN_USE_ASM does not have the 256 bit set.
 *
 * ctx:          context; h.tweak[0..1] are read on entry and written back
 *               on exit, and x[0..3] receive the new chaining values.
 * blk_ptr:      input bytes; loaded into u64 words by
 *               skein_get64_lsb_first() and stepped by
 *               SKEIN_256_BLOCK_BYTES.
 * blk_cnt:      block count; must not be zero (checked by skein_assert()).
 * byte_cnt_add: amount added to tweak word 0, the processed-length counter.
 */
26 void skein_256_process_block(struct skein_256_ctx *ctx, const u8 *blk_ptr,
27 size_t blk_cnt, size_t byte_cnt_add)
30 WCNT = SKEIN_256_STATE_WORDS
34 /* key schedule: chaining vars + tweak + "rot"*/
35 u64 kw[WCNT + 4 + (RCNT * 2)];
37 /* key schedule words : chaining vars + tweak */
40 u64 X0, X1, X2, X3; /* local copy of context vars, for speed */
41 u64 w[WCNT]; /* local copy of input block */
43 const u64 *X_ptr[4]; /* use for debugging (help cc put Xn in regs) */
50 skein_assert(blk_cnt != 0); /* never call with blk_cnt == 0! */
/* work on local copies of the tweak words; they are stored back at the end */
51 ts[0] = ctx->h.tweak[0];
52 ts[1] = ctx->h.tweak[1];
55 * this implementation only supports 2**64 input bytes
58 ts[0] += byte_cnt_add; /* update processed length */
60 /* precompute the key schedule for this block */
/* extra key word: XOR of ks[0..3] with the SKEIN_KS_PARITY constant */
65 ks[4] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^ SKEIN_KS_PARITY;
/* third tweak word is the XOR of the first two */
67 ts[2] = ts[0] ^ ts[1];
69 /* get input block in little-endian format */
70 skein_get64_lsb_first(w, blk_ptr, WCNT);
71 debug_save_tweak(ctx);
73 /* do the first full key injection */
/* words 1 and 2 additionally take tweak words 0 and 1 */
75 X1 = w[1] + ks[1] + ts[0];
76 X2 = w[2] + ks[2] + ts[1];
79 blk_ptr += SKEIN_256_BLOCK_BYTES;
/*
 * Loop bound and step depend on SKEIN_UNROLL_256: when it is zero the
 * bound is 2 and the step is 1, otherwise the bound is 2 * RCNT and the
 * step is 2 * SKEIN_UNROLL_256.
 */
83 r < (SKEIN_UNROLL_256 ? 2 * RCNT : 2);
84 r += (SKEIN_UNROLL_256 ? 2 * SKEIN_UNROLL_256 : 1)) {
/*
 * NOTE(review): each R256_UNROLL_R(n) test presumably gates one more
 * unrolled pass of rounds; confirm against the macro definition.
 */
113 #if R256_UNROLL_R(10)
116 #if R256_UNROLL_R(11)
119 #if R256_UNROLL_R(12)
122 #if R256_UNROLL_R(13)
125 #if R256_UNROLL_R(14)
129 /* do the final "feedforward" xor, update context chaining */
/* each state word is XORed with the matching input word from w[] */
130 ctx->x[0] = X0 ^ w[0];
131 ctx->x[1] = X1 ^ w[1];
132 ctx->x[2] = X2 ^ w[2];
133 ctx->x[3] = X3 ^ w[3];
/* clear SKEIN_T1_FLAG_FIRST in the local copy of tweak word 1 */
135 ts[1] &= ~SKEIN_T1_FLAG_FIRST;
/* store the updated tweak words back into the context */
137 ctx->h.tweak[0] = ts[0];
138 ctx->h.tweak[1] = ts[1];
141 #if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF)
/*
 * Returns the byte distance from the address of skein_256_process_block()
 * to the address of this function. Built only when SKEIN_CODE_SIZE or
 * SKEIN_PERF is defined.
 *
 * NOTE(review): the result matches the size of skein_256_process_block()
 * only if the toolchain emits the two functions back to back in source
 * order, and the casts from function pointers to u8 pointers are not
 * defined by ISO C; confirm both assumptions for the target compiler.
 */
142 size_t skein_256_process_block_code_size(void)
144 return ((u8 *)skein_256_process_block_code_size) -
145 ((u8 *)skein_256_process_block);
/* Returns the value of SKEIN_UNROLL_256 this file was built with. */
148 unsigned int skein_256_unroll_cnt(void)
150 return SKEIN_UNROLL_256;
155 /***************************** SKEIN_512 ******************************/
156 #if !(SKEIN_USE_ASM & 512)
/*
 * skein_512_process_block() - C version of the Skein-512 block function,
 * built only when SKEIN_USE_ASM does not have the 512 bit set.
 *
 * ctx:          context; h.tweak[0..1] are read on entry and written back
 *               on exit, and x[0..7] receive the new chaining values.
 * blk_ptr:      input bytes; loaded into u64 words by
 *               skein_get64_lsb_first() and stepped by
 *               SKEIN_512_BLOCK_BYTES.
 * blk_cnt:      block count; must not be zero (checked by skein_assert()).
 * byte_cnt_add: amount added to tweak word 0, the processed-length counter.
 */
157 void skein_512_process_block(struct skein_512_ctx *ctx, const u8 *blk_ptr,
158 size_t blk_cnt, size_t byte_cnt_add)
161 WCNT = SKEIN_512_STATE_WORDS
165 /* key sched: chaining vars + tweak + "rot"*/
166 u64 kw[WCNT + 4 + RCNT * 2];
168 /* key schedule words : chaining vars + tweak */
171 u64 X0, X1, X2, X3, X4, X5, X6, X7; /* local copies, for speed */
172 u64 w[WCNT]; /* local copy of input block */
174 const u64 *X_ptr[8]; /* use for debugging (help cc put Xn in regs) */
186 skein_assert(blk_cnt != 0); /* never call with blk_cnt == 0! */
/* work on local copies of the tweak words; they are stored back at the end */
187 ts[0] = ctx->h.tweak[0];
188 ts[1] = ctx->h.tweak[1];
191 * this implementation only supports 2**64 input bytes
192 * (no carry out here)
194 ts[0] += byte_cnt_add; /* update processed length */
196 /* precompute the key schedule for this block */
/* extra key word: XOR of ks[0..7] with the SKEIN_KS_PARITY constant */
205 ks[8] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^
206 ks[4] ^ ks[5] ^ ks[6] ^ ks[7] ^ SKEIN_KS_PARITY;
/* third tweak word is the XOR of the first two */
208 ts[2] = ts[0] ^ ts[1];
210 /* get input block in little-endian format */
211 skein_get64_lsb_first(w, blk_ptr, WCNT);
212 debug_save_tweak(ctx);
214 /* do the first full key injection */
/* words 5 and 6 additionally take tweak words 0 and 1 */
220 X5 = w[5] + ks[5] + ts[0];
221 X6 = w[6] + ks[6] + ts[1];
224 blk_ptr += SKEIN_512_BLOCK_BYTES;
/*
 * Loop bound and step depend on SKEIN_UNROLL_512: when it is zero the
 * bound is 2 and the step is 1, otherwise the bound is 2 * RCNT and the
 * step is 2 * SKEIN_UNROLL_512.
 */
228 r < (SKEIN_UNROLL_512 ? 2 * RCNT : 2);
229 r += (SKEIN_UNROLL_512 ? 2 * SKEIN_UNROLL_512 : 1)) {
/*
 * NOTE(review): each R512_UNROLL_R(n) test presumably gates one more
 * unrolled pass of rounds; confirm against the macro definition.
 */
259 #if R512_UNROLL_R(10)
262 #if R512_UNROLL_R(11)
265 #if R512_UNROLL_R(12)
268 #if R512_UNROLL_R(13)
271 #if R512_UNROLL_R(14)
276 /* do the final "feedforward" xor, update context chaining */
/* each state word is XORed with the matching input word from w[] */
277 ctx->x[0] = X0 ^ w[0];
278 ctx->x[1] = X1 ^ w[1];
279 ctx->x[2] = X2 ^ w[2];
280 ctx->x[3] = X3 ^ w[3];
281 ctx->x[4] = X4 ^ w[4];
282 ctx->x[5] = X5 ^ w[5];
283 ctx->x[6] = X6 ^ w[6];
284 ctx->x[7] = X7 ^ w[7];
/* clear SKEIN_T1_FLAG_FIRST in the local copy of tweak word 1 */
286 ts[1] &= ~SKEIN_T1_FLAG_FIRST;
/* store the updated tweak words back into the context */
288 ctx->h.tweak[0] = ts[0];
289 ctx->h.tweak[1] = ts[1];
292 #if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF)
/*
 * Returns the byte distance from the address of skein_512_process_block()
 * to the address of this function. Built only when SKEIN_CODE_SIZE or
 * SKEIN_PERF is defined.
 *
 * NOTE(review): the result matches the size of skein_512_process_block()
 * only if the toolchain emits the two functions back to back in source
 * order, and the casts from function pointers to u8 pointers are not
 * defined by ISO C; confirm both assumptions for the target compiler.
 */
293 size_t skein_512_process_block_code_size(void)
295 return ((u8 *)skein_512_process_block_code_size) -
296 ((u8 *)skein_512_process_block);
/* Returns the value of SKEIN_UNROLL_512 this file was built with. */
299 unsigned int skein_512_unroll_cnt(void)
301 return SKEIN_UNROLL_512;
306 /***************************** SKEIN_1024 ******************************/
307 #if !(SKEIN_USE_ASM & 1024)
/*
 * skein_1024_process_block() - C version of the Skein-1024 block function,
 * built only when SKEIN_USE_ASM does not have the 1024 bit set.
 *
 * ctx:          context; h.tweak[0..1] are read on entry and written back
 *               on exit, and x[0..15] receive the new chaining values.
 * blk_ptr:      input bytes; loaded into u64 words by
 *               skein_get64_lsb_first() and stepped by
 *               SKEIN_1024_BLOCK_BYTES.
 * blk_cnt:      block count; must not be zero (checked by skein_assert()).
 * byte_cnt_add: amount added to tweak word 0, the processed-length counter.
 */
308 void skein_1024_process_block(struct skein_1024_ctx *ctx, const u8 *blk_ptr,
309 size_t blk_cnt, size_t byte_cnt_add)
310 { /* do it in C, always looping (unrolled is bigger AND slower!) */
312 WCNT = SKEIN_1024_STATE_WORDS
/* the kw[] form with RCNT * 2 extra words is used when unrolling is enabled */
315 #if (SKEIN_UNROLL_1024 != 0)
316 /* key sched: chaining vars + tweak + "rot" */
317 u64 kw[WCNT + 4 + (RCNT * 2)];
319 /* key schedule words : chaining vars + tweak */
323 /* local copy of vars, for speed */
324 u64 X00, X01, X02, X03, X04, X05, X06, X07,
325 X08, X09, X10, X11, X12, X13, X14, X15;
326 u64 w[WCNT]; /* local copy of input block */
328 skein_assert(blk_cnt != 0); /* never call with blk_cnt == 0! */
/* work on local copies of the tweak words; they are stored back at the end */
329 ts[0] = ctx->h.tweak[0];
330 ts[1] = ctx->h.tweak[1];
333 * this implementation only supports 2**64 input bytes
334 * (no carry out here)
336 ts[0] += byte_cnt_add; /* update processed length */
338 /* precompute the key schedule for this block */
/* extra key word: XOR of ks[0..15] with the SKEIN_KS_PARITY constant */
355 ks[16] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^
356 ks[4] ^ ks[5] ^ ks[6] ^ ks[7] ^
357 ks[8] ^ ks[9] ^ ks[10] ^ ks[11] ^
358 ks[12] ^ ks[13] ^ ks[14] ^ ks[15] ^ SKEIN_KS_PARITY;
/* third tweak word is the XOR of the first two */
360 ts[2] = ts[0] ^ ts[1];
362 /* get input block in little-endian format */
363 skein_get64_lsb_first(w, blk_ptr, WCNT);
364 debug_save_tweak(ctx);
366 /* do the first full key injection */
/* words 13 and 14 additionally take tweak words 0 and 1 */
377 X10 = w[10] + ks[10];
378 X11 = w[11] + ks[11];
379 X12 = w[12] + ks[12];
380 X13 = w[13] + ks[13] + ts[0];
381 X14 = w[14] + ks[14] + ts[1];
382 X15 = w[15] + ks[15];
/*
 * Loop bound and step depend on SKEIN_UNROLL_1024: when it is zero the
 * bound is 2 and the step is 1, otherwise the bound is 2 * RCNT and the
 * step is 2 * SKEIN_UNROLL_1024.
 */
385 r < (SKEIN_UNROLL_1024 ? 2 * RCNT : 2);
386 r += (SKEIN_UNROLL_1024 ? 2 * SKEIN_UNROLL_1024 : 1)) {
/*
 * NOTE(review): each R1024_UNROLL_R(n) test presumably gates one more
 * unrolled pass of rounds; confirm against the macro definition.
 */
388 #if R1024_UNROLL_R(1)
391 #if R1024_UNROLL_R(2)
394 #if R1024_UNROLL_R(3)
397 #if R1024_UNROLL_R(4)
400 #if R1024_UNROLL_R(5)
403 #if R1024_UNROLL_R(6)
406 #if R1024_UNROLL_R(7)
409 #if R1024_UNROLL_R(8)
412 #if R1024_UNROLL_R(9)
415 #if R1024_UNROLL_R(10)
418 #if R1024_UNROLL_R(11)
421 #if R1024_UNROLL_R(12)
424 #if R1024_UNROLL_R(13)
427 #if R1024_UNROLL_R(14)
431 /* do the final "feedforward" xor, update context chaining */
/* each state word is XORed with the matching input word from w[] */
433 ctx->x[0] = X00 ^ w[0];
434 ctx->x[1] = X01 ^ w[1];
435 ctx->x[2] = X02 ^ w[2];
436 ctx->x[3] = X03 ^ w[3];
437 ctx->x[4] = X04 ^ w[4];
438 ctx->x[5] = X05 ^ w[5];
439 ctx->x[6] = X06 ^ w[6];
440 ctx->x[7] = X07 ^ w[7];
441 ctx->x[8] = X08 ^ w[8];
442 ctx->x[9] = X09 ^ w[9];
443 ctx->x[10] = X10 ^ w[10];
444 ctx->x[11] = X11 ^ w[11];
445 ctx->x[12] = X12 ^ w[12];
446 ctx->x[13] = X13 ^ w[13];
447 ctx->x[14] = X14 ^ w[14];
448 ctx->x[15] = X15 ^ w[15];
/* clear SKEIN_T1_FLAG_FIRST in the local copy of tweak word 1 */
450 ts[1] &= ~SKEIN_T1_FLAG_FIRST;
/* step the input pointer past the block just consumed */
451 blk_ptr += SKEIN_1024_BLOCK_BYTES;
/* store the updated tweak words back into the context */
453 ctx->h.tweak[0] = ts[0];
454 ctx->h.tweak[1] = ts[1];
457 #if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF)
/*
 * Returns the byte distance from the address of skein_1024_process_block()
 * to the address of this function. Built only when SKEIN_CODE_SIZE or
 * SKEIN_PERF is defined.
 *
 * NOTE(review): the result matches the size of skein_1024_process_block()
 * only if the toolchain emits the two functions back to back in source
 * order, and the casts from function pointers to u8 pointers are not
 * defined by ISO C; confirm both assumptions for the target compiler.
 */
458 size_t skein_1024_process_block_code_size(void)
460 return ((u8 *)skein_1024_process_block_code_size) -
461 ((u8 *)skein_1024_process_block);
/* Returns the value of SKEIN_UNROLL_1024 this file was built with. */
464 unsigned int skein_1024_unroll_cnt(void)
466 return SKEIN_UNROLL_1024;