Linux 6.7-rc7
[linux-modified.git] / arch / arm64 / crypto / nh-neon-core.S
1 /* SPDX-License-Identifier: GPL-2.0 */
2 /*
3  * NH - ε-almost-universal hash function, ARM64 NEON accelerated version
4  *
5  * Copyright 2018 Google LLC
6  *
7  * Author: Eric Biggers <ebiggers@google.com>
8  */
9
10 #include <linux/linkage.h>
11 #include <linux/cfi_types.h>
12
13         KEY             .req    x0      // arg 0: key pointer (const u32 *)
14         MESSAGE         .req    x1      // arg 1: message pointer (const u8 *)
15         MESSAGE_LEN     .req    x2      // arg 2: message length in bytes
16         HASH            .req    x3      // arg 3: output, hash[NH_NUM_PASSES]
17
18         PASS0_SUMS      .req    v0      // one 2 x 64-bit accumulator per pass
19         PASS1_SUMS      .req    v1
20         PASS2_SUMS      .req    v2
21         PASS3_SUMS      .req    v3
22         K0              .req    v4      // key strides; K0/K1 are loaded as a pair,
23         K1              .req    v5      // so they must be consecutive registers
24         K2              .req    v6
25         K3              .req    v7
26         T0              .req    v16     // Temporaries use v16-v23, which AAPCS64
27         T1              .req    v17     // lets a callee clobber freely.  v8-v15 are
28         T2              .req    v18     // callee-saved (low 64 bits) and nothing here
29         T3              .req    v19     // saves or restores them, so keep out of that
30         T4              .req    v20     // range.  T0/T1 are stored as a pair and must
31         T5              .req    v21     // stay consecutive.
32         T6              .req    v22
33         T7              .req    v23
34
35 .macro _nh_stride       k0, k1, k2, k3
36         // One NH stride: 16 message bytes are combined with key vectors k0..k3, one per pass
37         // Load next message stride
38         ld1             {T3.16b}, [MESSAGE], #16        // post-increment: MESSAGE += 16
39
40         // Load next key stride
41         ld1             {\k3\().4s}, [KEY], #16         // only k3 is (re)loaded here; KEY += 16
42
43         // Add message words to key words
44         add             T0.4s, T3.4s, \k0\().4s
45         add             T1.4s, T3.4s, \k1\().4s
46         add             T2.4s, T3.4s, \k2\().4s
47         add             T3.4s, T3.4s, \k3\().4s         // last reader of the raw message, so T3 is reused as the result
48
49         // Multiply 32x32 => 64 and accumulate
50         mov             T4.d[0], T0.d[1]                // copy words 2,3 of each sum into lanes 0,1 of a
51         mov             T5.d[0], T1.d[1]                // second register; umlal reads only the low two
52         mov             T6.d[0], T2.d[1]                // 32-bit lanes, so the untouched upper halves of
53         mov             T7.d[0], T3.d[1]                // T4-T7 are never consumed
54         umlal           PASS0_SUMS.2d, T0.2s, T4.2s     // sums[0] += w0 * w2; sums[1] += w1 * w3
55         umlal           PASS1_SUMS.2d, T1.2s, T5.2s
56         umlal           PASS2_SUMS.2d, T2.2s, T6.2s
57         umlal           PASS3_SUMS.2d, T3.2s, T7.2s
58 .endm
59
60 /*
61  * void nh_neon(const u32 *key, const u8 *message, size_t message_len,
62  *              __le64 hash[NH_NUM_PASSES])
63  * Consumes the message in 16-byte strides and stores four 64-bit sums to hash.
64  * It's guaranteed that message_len % 16 == 0.
65  */
66 SYM_TYPED_FUNC_START(nh_neon)
67         // Preload three key strides (K0-K2) and zero the four accumulators
68         ld1             {K0.4s,K1.4s}, [KEY], #32
69           movi          PASS0_SUMS.2d, #0
70           movi          PASS1_SUMS.2d, #0
71         ld1             {K2.4s}, [KEY], #16
72           movi          PASS2_SUMS.2d, #0
73           movi          PASS3_SUMS.2d, #0
74
75         subs            MESSAGE_LEN, MESSAGE_LEN, #64   // keep the count biased by -64 from here on
76         blt             .Lloop4_done                    // fewer than 64 bytes: skip the 4-stride loop
77 .Lloop4:                                                // 64 bytes per iteration; each stride loads its last key argument, so the K roles rotate
78         _nh_stride      K0, K1, K2, K3
79         _nh_stride      K1, K2, K3, K0
80         _nh_stride      K2, K3, K0, K1
81         _nh_stride      K3, K0, K1, K2
82         subs            MESSAGE_LEN, MESSAGE_LEN, #64
83         bge             .Lloop4                         // repeat while at least 64 more bytes remain
84
85 .Lloop4_done:
86         ands            MESSAGE_LEN, MESSAGE_LEN, #63   // undo the bias: leftover is 0, 16, 32 or 48 bytes
87         beq             .Ldone
88         _nh_stride      K0, K1, K2, K3
89
90         subs            MESSAGE_LEN, MESSAGE_LEN, #16
91         beq             .Ldone                          // leftover was exactly one stride
92         _nh_stride      K1, K2, K3, K0
93
94         subs            MESSAGE_LEN, MESSAGE_LEN, #16
95         beq             .Ldone                          // leftover was exactly two strides
96         _nh_stride      K2, K3, K0, K1
97         // a leftover of three strides falls through; no further length check is needed
98 .Ldone:
99         // Sum the accumulators for each pass, then store the sums to 'hash'
100         addp            T0.2d, PASS0_SUMS.2d, PASS1_SUMS.2d     // T0 = { pass 0 total, pass 1 total }
101         addp            T1.2d, PASS2_SUMS.2d, PASS3_SUMS.2d     // T1 = { pass 2 total, pass 3 total }
102         st1             {T0.16b,T1.16b}, [HASH]                 // 32 bytes: the four totals in pass order
103         ret
104 SYM_FUNC_END(nh_neon)