1 /* SPDX-License-Identifier: GPL-2.0-or-later */
3 * Poly1305 authenticator algorithm, RFC7539, x64 SSE2 functions
5 * Copyright (C) 2015 Martin Willi
8 #include <linux/linkage.h>
# ANMASK: 0x3ffffff replicated in both 64-bit lanes. Used with pand to
# extract 26-bit limbs when unpacking message words into the radix-2^26
# representation Poly1305 uses internally.
10 .section .rodata.cst16.ANMASK, "aM", @progbits, 16
12 ANMASK: .octa 0x0000000003ffffff0000000003ffffff
# ORMASK: bit 24 (1 << 24) set in each 64-bit lane. ORed into the top
# limb to add the 2^128 pad bit of a full 16-byte message block.
14 .section .rodata.cst16.ORMASK, "aM", @progbits, 16
16 ORMASK: .octa 0x00000000010000000000000001000000
#-----------------------------------------------------------------------
# poly1305_block_sse2 — fold message block(s) into the accumulator:
#   h = (h + m) * r  (mod 2^130 - 5), limbs kept in radix 2^26.
# ABI: SysV AMD64.
#   %rdi = accumulator h[5] (five 26-bit limbs)
#   %rsi = 16-byte input block m
#   %rdx = clamped Poly1305 key r[5] (five 26-bit limbs)
# NOTE(review): this is a sampled excerpt — most instruction lines are
# elided; comments below annotate only the visible lines.
#-----------------------------------------------------------------------
49 ENTRY(poly1305_block_sse2)
50 # %rdi: Accumulator h[5]
51 # %rsi: 16 byte input block m
52 # %rdx: Poly1305 key r[5]
55 # This single block variant tries to improve performance by doing two
56 # multiplications in parallel using SSE instructions. There is quite
57 # some quadword packing involved, hence the speedup is marginal.
# Precompute s[i] = r[i] * 5 for the mod 2^130-5 reduction; each
# lea (%eax,%eax,4),%eax computes eax *= 5 without touching flags.
65 lea (%eax,%eax,4),%eax
68 lea (%eax,%eax,4),%eax
71 lea (%eax,%eax,4),%eax
74 lea (%eax,%eax,4),%eax
# Load the 26-bit limb mask once; 'mask' is a register alias #define'd
# earlier in the file (not visible in this excerpt).
77 movdqa ANMASK(%rip),mask
# Pack the accumulator limbs pairwise into XMM registers (one limb per
# 64-bit lane) so two limb products can be computed per pmuludq.
80 # h01 = [0, h1, 0, h0]
81 # h23 = [0, h3, 0, h2]
82 # h44 = [0, h4, 0, h4]
# Add the message block: split m into 26-bit limbs (shift, then mask
# with ANMASK) and set the 2^128 pad bit (1 << 24) in the top limb.
92 # h01 += [ (m[3-6] >> 2) & 0x3ffffff, m[0-3] & 0x3ffffff ]
99 # h23 += [ (m[9-12] >> 6) & 0x3ffffff, (m[6-9] >> 4) & 0x3ffffff ]
107 # h44 += [ (m[12-15] >> 8) | (1 << 24), (m[12-15] >> 8) | (1 << 24) ]
# Schoolbook multiply (h + m) * r, two 32x32->64 products per pmuludq.
# The s[i] = 5*r[i] terms fold the limbs that wrap past 2^130 back in.
115 # t1[0] = h0 * r0 + h2 * s3
116 # t1[1] = h1 * s4 + h3 * s2
126 # t2[0] = h0 * r1 + h2 * s4
127 # t2[1] = h1 * r0 + h3 * s3
143 # d0 = t1[0] + t1[1] + t3[0]
144 # d1 = t2[0] + t2[1] + t3[1]
154 # t1[0] = h0 * r2 + h2 * r0
155 # t1[1] = h1 * r1 + h3 * s4
165 # t2[0] = h0 * r3 + h2 * r1
166 # t2[1] = h1 * r2 + h3 * r0
182 # d2 = t1[0] + t1[1] + t3[0]
183 # d3 = t2[0] + t2[1] + t3[1]
193 # t1[0] = h0 * r4 + h2 * r2
194 # t1[1] = h1 * r3 + h3 * r1
207 # d4 = t1[0] + t1[1] + t3[0]
# Partial reduction: propagate carries d0 -> d1 -> ... -> d4, keeping
# each limb below 2^26.
218 # h0 = d0 & 0x3ffffff
226 # h1 = d1 & 0x3ffffff
235 # h2 = d2 & 0x3ffffff
244 # h3 = d3 & 0x3ffffff
# The carry out of d4 wraps around modulo 2^130-5, hence the *5.
249 # h0 += (d4 >> 26) * 5
252 lea (%rax,%rax,4),%rax
254 # h4 = d4 & 0x3ffffff
263 # h0 = h0 & 0x3ffffff
# Clear registers that held key-derived values before returning, so key
# material does not leak to the caller via register state.
271 # Zeroing of key material
279 ENDPROC(poly1305_block_sse2)
#-----------------------------------------------------------------------
# poly1305_2block_sse2 — process message blocks two at a time using the
# precomputed squared key u = r^2:
#   h = (h + m1) * r^2 + m2 * r   (mod 2^130 - 5)
# ABI: SysV AMD64.
#   %rdi = accumulator h[5], %rsi = input, %rdx = key r[5],
#   %rcx = double-block count, %r8 = derived key u[5] = r^2
# NOTE(review): this is a sampled excerpt — most instruction lines are
# elided; comments below annotate only the visible lines.
#-----------------------------------------------------------------------
304 ENTRY(poly1305_2block_sse2)
305 # %rdi: Accumulator h[5]
306 # %rsi: 16 byte input block m
307 # %rdx: Poly1305 key r[5]
308 # %rcx: Doubleblock count
309 # %r8: Poly1305 derived key r^2 u[5]
311 # This two-block variant further improves performance by using loop
312 # unrolled block processing. This is more straightforward and does
313 # less byte shuffling, but requires a second Poly1305 key r^2:
314 # h = (h + m) * r => h = (h + m1) * r^2 + m2 * r
# Interleave the two keys into XMM registers, one lane per key, and
# precompute the *5 variants (s = 5r, v = 5u) for the reduction.
325 # combine r1,u1 and s1=r1*5,v1=u1*5
333 # combine r2,u2 and s2=r2*5,v2=u2*5
341 # combine r3,u3 and s3=r3*5,v3=u3*5
349 # combine r4,u4 and s4=r4*5,v4=u4*5
# Build hc[i] = [ limb i of block 2, h[i] + limb i of block 1 ]: the
# accumulator is added to block 1 only (it multiplies by r^2), while
# block 2 rides in the other lane (it multiplies by r).
358 # hc0 = [ m[16-19] & 0x3ffffff, h0 + m[0-3] & 0x3ffffff ]
362 pand ANMASK(%rip),hc0
365 # hc1 = [ (m[19-22] >> 2) & 0x3ffffff, h1 + (m[3-6] >> 2) & 0x3ffffff ]
370 pand ANMASK(%rip),hc1
373 # hc2 = [ (m[22-25] >> 4) & 0x3ffffff, h2 + (m[6-9] >> 4) & 0x3ffffff ]
378 pand ANMASK(%rip),hc2
381 # hc3 = [ (m[25-28] >> 6) & 0x3ffffff, h3 + (m[9-12] >> 6) & 0x3ffffff ]
386 pand ANMASK(%rip),hc3
# Top limbs get the 2^128 pad bit (1 << 24) for both blocks.
389 # hc4 = [ (m[28-31] >> 8) | (1<<24), h4 + (m[12-15] >> 8) | (1<<24) ]
# d0: both lanes of each pmuludq multiply in parallel — lane 1 by r/s
# (block 2), lane 0 by u/v (block 1 + accumulator).
398 # t1 = [ hc0[1] * r0, hc0[0] * u0 ]
401 # t1 += [ hc1[1] * s4, hc1[0] * v4 ]
405 # t1 += [ hc2[1] * s3, hc2[0] * v3 ]
409 # t1 += [ hc3[1] * s2, hc3[0] * v2 ]
413 # t1 += [ hc4[1] * s1, hc4[0] * v1 ]
# d1:
423 # t1 = [ hc0[1] * r1, hc0[0] * u1 ]
426 # t1 += [ hc1[1] * r0, hc1[0] * u0 ]
430 # t1 += [ hc2[1] * s4, hc2[0] * v4 ]
434 # t1 += [ hc3[1] * s3, hc3[0] * v3 ]
438 # t1 += [ hc4[1] * s2, hc4[0] * v2 ]
# d2:
448 # t1 = [ hc0[1] * r2, hc0[0] * u2 ]
451 # t1 += [ hc1[1] * r1, hc1[0] * u1 ]
455 # t1 += [ hc2[1] * r0, hc2[0] * u0 ]
459 # t1 += [ hc3[1] * s4, hc3[0] * v4 ]
463 # t1 += [ hc4[1] * s3, hc4[0] * v3 ]
# d3:
473 # t1 = [ hc0[1] * r3, hc0[0] * u3 ]
476 # t1 += [ hc1[1] * r2, hc1[0] * u2 ]
480 # t1 += [ hc2[1] * r1, hc2[0] * u1 ]
484 # t1 += [ hc3[1] * r0, hc3[0] * u0 ]
488 # t1 += [ hc4[1] * s4, hc4[0] * v4 ]
# d4:
498 # t1 = [ hc0[1] * r4, hc0[0] * u4 ]
501 # t1 += [ hc1[1] * r3, hc1[0] * u3 ]
505 # t1 += [ hc2[1] * r2, hc2[0] * u2 ]
509 # t1 += [ hc3[1] * r1, hc3[0] * u1 ]
513 # t1 += [ hc4[1] * r0, hc4[0] * u0 ]
523 # Now do a partial reduction mod (2^130)-5, carrying h0 -> h1 -> h2 ->
524 # h3 -> h4 -> h0 -> h1 to get h0,h2,h3,h4 < 2^26 and h1 < 2^26 + a small
525 # amount. Careful: we must not assume the carry bits 'd0 >> 26',
526 # 'd1 >> 26', 'd2 >> 26', 'd3 >> 26', and '(d4 >> 26) * 5' fit in 32-bit
527 # integers. It's true in a single-block implementation, but not here.
533 # h0 = d0 & 0x3ffffff
541 # h1 = d1 & 0x3ffffff
550 # h2 = d2 & 0x3ffffff
559 # h3 = d3 & 0x3ffffff
# The carry out of d4 wraps around modulo 2^130-5, hence the *5
# (lea computes rax *= 5 without touching flags).
564 # h0 += (d4 >> 26) * 5
567 lea (%rax,%rax,4),%rax
569 # h4 = d4 & 0x3ffffff
578 # h0 = h0 & 0x3ffffff
590 ENDPROC(poly1305_2block_sse2)