]>
Commit | Line | Data |
---|---|---|
6aa36e8e | 1 | #! /usr/bin/env perl |
33388b44 | 2 | # Copyright 2012-2020 The OpenSSL Project Authors. All Rights Reserved. |
6aa36e8e | 3 | # |
81cae8ce | 4 | # Licensed under the Apache License 2.0 (the "License"). You may not use |
6aa36e8e RS |
5 | # this file except in compliance with the License. You can obtain a copy |
6 | # in the file LICENSE in the source distribution or at | |
7 | # https://www.openssl.org/source/license.html | |
8 | ||
3e181369 AP |
9 | # |
10 | # ==================================================================== | |
11 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | |
12 | # project. The module is, however, dual licensed under OpenSSL and | |
13 | # CRYPTOGAMS licenses depending on where you obtain it. For further | |
14 | # details see http://www.openssl.org/~appro/cryptogams/. | |
15 | # ==================================================================== | |
16 | # | |
17 | # December 2011 | |
18 | # | |
19 | # The module implements GCM GHASH function and underlying single | |
20 | # multiplication operation in GF(2^128). Even though subroutines | |
21 | # have _4bit suffix, they are not using any tables, but rely on | |
22 | # hardware Galois Field Multiply support. Streamed GHASH processes | |
23 | # a byte in ~7 cycles, which is >6x faster than "4-bit" table-driven | |
24 | # code compiled with TI's cl6x 6.0 with -mv6400+ -o2 flags. We are | |
25 | # comparing apples vs. oranges, but the compiler surely could have done | |
26 | # better, because the theoretical [though not necessarily achievable] | |
27 | # estimate for "4-bit" table-driven implementation is ~12 cycles. | |
28 | ||
1aa89a7a | 29 | if ($output = pop) { open STDOUT, '>', $output or die "can't open $output: $!"; } # last argument, if present, names the output file; fail loudly if it cannot be opened |
3e181369 AP |
30 | |
31 | ($Xip,$Htable,$inp,$len)=("A4","B4","A6","B6"); # registers holding the incoming arguments | |
32 | ||
33 | ($Z0,$Z1,$Z2,$Z3, $H0, $H1, $H2, $H3, | |
34 | $H0x,$H1x,$H2x,$H3x)=map("A$_",(16..27)); | |
35 | ($H01u,$H01y,$H2u,$H3u, $H0y,$H1y,$H2y,$H3y, | |
36 | $H0z,$H1z,$H2z,$H3z)=map("B$_",(16..27)); | |
37 | ($FF000000,$E10000)=("B30","B31"); | |
38 | ($xip,$x0,$x1,$xib)=map("B$_",(6..9)); # $xip is B6, the same register as $len, so writing $xip overwrites $len | |
39 | $xia="A9"; | |
40 | ($rem,$res)=("B4","B5"); # $rem is B4, the same register as $Htable, so writing $rem overwrites $Htable | |
41 | ||
42 | $code.=<<___; | |
43 | .text | |
bd227733 AP |
44 | |
45 | .if .ASSEMBLER_VERSION<7000000 | |
46 | .asg 0,__TI_EABI__ | |
47 | .endif | |
904732f6 AP |
48 | .if __TI_EABI__ |
49 | .asg gcm_gmult_1bit,_gcm_gmult_1bit | |
50 | .asg gcm_gmult_4bit,_gcm_gmult_4bit | |
51 | .asg gcm_ghash_4bit,_gcm_ghash_4bit | |
52 | .endif | |
3e181369 AP |
53 | |
54 | .asg B3,RA | |
55 | ||
56 | .if 0 | |
57 | .global _gcm_gmult_1bit | |
58 | _gcm_gmult_1bit: | |
59 | ADDAD $Htable,2,$Htable | |
60 | .endif | |
61 | .global _gcm_gmult_4bit | |
62 | _gcm_gmult_4bit: | |
63 | .asmfunc | |
64 | LDDW *${Htable}[-1],$H1:$H0 ; H.lo | |
65 | LDDW *${Htable}[-2],$H3:$H2 ; H.hi | |
66 | || MV $Xip,${xip} ; reassign Xi | |
67 | || MVK 15,B1 ; SPLOOPD constant | |
68 | ||
69 | MVK 0xE1,$E10000 | |
70 | || LDBU *++${xip}[15],$x1 ; Xi[15] | |
71 | MVK 0xFF,$FF000000 | |
72 | || LDBU *--${xip},$x0 ; Xi[14] | |
73 | SHL $E10000,16,$E10000 ; [pre-shifted] reduction polynomial | |
74 | SHL $FF000000,24,$FF000000 ; upper byte mask | |
75 | || BNOP ghash_loop? | |
76 | || MVK 1,B0 ; take a single spin | |
77 | ||
78 | PACKH2 $H0,$H1,$xia ; pack H0' and H1's upper bytes | |
79 | AND $H2,$FF000000,$H2u ; H2's upper byte | |
80 | AND $H3,$FF000000,$H3u ; H3's upper byte | |
81 | || SHRU $H2u,8,$H2u | |
82 | SHRU $H3u,8,$H3u | |
83 | || ZERO $Z1:$Z0 | |
84 | SHRU2 $xia,8,$H01u | |
85 | || ZERO $Z3:$Z2 | |
86 | .endasmfunc | |
87 | ||
88 | .global _gcm_ghash_4bit | |
89 | _gcm_ghash_4bit: | |
90 | .asmfunc | |
91 | LDDW *${Htable}[-1],$H1:$H0 ; H.lo | |
92 | || SHRU $len,4,B0 ; reassign len | |
93 | LDDW *${Htable}[-2],$H3:$H2 ; H.hi | |
94 | || MV $Xip,${xip} ; reassign Xi | |
95 | || MVK 15,B1 ; SPLOOPD constant | |
96 | ||
97 | MVK 0xE1,$E10000 | |
98 | || [B0] LDNDW *${inp}[1],$H1x:$H0x | |
99 | MVK 0xFF,$FF000000 | |
100 | || [B0] LDNDW *${inp}++[2],$H3x:$H2x | |
101 | SHL $E10000,16,$E10000 ; [pre-shifted] reduction polynomial | |
102 | || LDDW *${xip}[1],$Z1:$Z0 | |
103 | SHL $FF000000,24,$FF000000 ; upper byte mask | |
104 | || LDDW *${xip}[0],$Z3:$Z2 | |
105 | ||
106 | PACKH2 $H0,$H1,$xia ; pack H0' and H1's upper bytes | |
107 | AND $H2,$FF000000,$H2u ; H2's upper byte | |
108 | AND $H3,$FF000000,$H3u ; H3's upper byte | |
109 | || SHRU $H2u,8,$H2u | |
110 | SHRU $H3u,8,$H3u | |
111 | SHRU2 $xia,8,$H01u | |
112 | ||
113 | || [B0] XOR $H0x,$Z0,$Z0 ; Xi^=inp | |
114 | || [B0] XOR $H1x,$Z1,$Z1 | |
115 | .if .LITTLE_ENDIAN | |
116 | [B0] XOR $H2x,$Z2,$Z2 | |
117 | || [B0] XOR $H3x,$Z3,$Z3 | |
118 | || [B0] SHRU $Z1,24,$xia ; Xi[15], avoid cross-path stall | |
119 | STDW $Z1:$Z0,*${xip}[1] | |
120 | || [B0] SHRU $Z1,16,$x0 ; Xi[14] | |
121 | || [B0] ZERO $Z1:$Z0 | |
122 | .else | |
123 | [B0] XOR $H2x,$Z2,$Z2 | |
124 | || [B0] XOR $H3x,$Z3,$Z3 | |
125 | || [B0] MV $Z0,$xia ; Xi[15], avoid cross-path stall | |
126 | STDW $Z1:$Z0,*${xip}[1] | |
127 | || [B0] SHRU $Z0,8,$x0 ; Xi[14] | |
128 | || [B0] ZERO $Z1:$Z0 | |
129 | .endif | |
130 | STDW $Z3:$Z2,*${xip}[0] | |
131 | || [B0] ZERO $Z3:$Z2 | |
132 | || [B0] MV $xia,$x1 | |
133 | [B0] ADDK 14,${xip} | |
134 | ||
135 | ghash_loop?: | |
136 | SPLOOPD 6 ; 6*16+7 | |
137 | || MVC B1,ILC | |
138 | || [B0] SUB B0,1,B0 | |
139 | || ZERO A0 | |
140 | || ADD $x1,$x1,$xib ; SHL $x1,1,$xib | |
141 | || SHL $x1,1,$xia | |
142 | ___ | |
143 | \f | |
144 | ########____________________________ | |
145 | # 0 D2. M1 M2 | | |
146 | # 1 M1 | | |
147 | # 2 M1 M2 | | |
148 | # 3 D1. M1 M2 | | |
149 | # 4 S1. L1 | | |
150 | # 5 S2 S1x L1 D2 L2 |____________________________ | |
151 | # 6/0 L1 S1 L2 S2x |D2. M1 M2 | | |
152 | # 7/1 L1 S1 D1x S2 M2 | M1 | | |
153 | # 8/2 S1 L1x S2 | M1 M2 | | |
154 | # 9/3 S1 L1x | D1. M1 M2 | | |
155 | # 10/4 D1x | S1. L1 | | |
156 | # 11/5 |S2 S1x L1 D2 L2 |____________ | |
157 | # 12/6/0 D1x __| L1 S1 L2 S2x |D2. .... | |
158 | # 7/1 L1 S1 D1x S2 M2 | .... | |
159 | # 8/2 S1 L1x S2 | .... | |
160 | #####... ................|............ | |
161 | $code.=<<___; | |
053fa39a | 162 | XORMPY $H0,$xia,$H0x ; 0 ; H·(Xi[i]<<1) |
3e181369 AP |
163 | || XORMPY $H01u,$xib,$H01y |
164 | || [A0] LDBU *--${xip},$x0 | |
165 | XORMPY $H1,$xia,$H1x ; 1 | |
166 | XORMPY $H2,$xia,$H2x ; 2 | |
167 | || XORMPY $H2u,$xib,$H2y | |
168 | XORMPY $H3,$xia,$H3x ; 3 | |
169 | || XORMPY $H3u,$xib,$H3y | |
170 | ||[!A0] MVK.D 15,A0 ; *--${xip} counter | |
053fa39a | 171 | XOR.L $H0x,$Z0,$Z0 ; 4 ; Z^=H·(Xi[i]<<1) |
3e181369 AP |
172 | || [A0] SUB.S A0,1,A0 |
173 | XOR.L $H1x,$Z1,$Z1 ; 5 | |
174 | || AND.D $H01y,$FF000000,$H0z | |
175 | || SWAP2.L $H01y,$H1y ; ; SHL $H01y,16,$H1y | |
176 | || SHL $x0,1,$xib | |
177 | || SHL $x0,1,$xia | |
178 | ||
179 | XOR.L $H2x,$Z2,$Z2 ; 6/0 ; [0,0] in epilogue | |
180 | || SHL $Z0,1,$rem ; ; rem=Z<<1 | |
181 | || SHRMB.S $Z1,$Z0,$Z0 ; ; Z>>=8 | |
182 | || AND.L $H1y,$FF000000,$H1z | |
183 | XOR.L $H3x,$Z3,$Z3 ; 7/1 | |
184 | || SHRMB.S $Z2,$Z1,$Z1 | |
185 | || XOR.D $H0z,$Z0,$Z0 ; merge upper byte products | |
186 | || AND.S $H2y,$FF000000,$H2z | |
187 | || XORMPY $E10000,$rem,$res ; ; implicit rem&0x1FE | |
188 | XOR.L $H1z,$Z1,$Z1 ; 8/2 | |
189 | || SHRMB.S $Z3,$Z2,$Z2 | |
190 | || AND.S $H3y,$FF000000,$H3z | |
191 | XOR.L $H2z,$Z2,$Z2 ; 9/3 | |
192 | || SHRU $Z3,8,$Z3 | |
193 | XOR.D $H3z,$Z3,$Z3 ; 10/4 | |
194 | NOP ; 11/5 | |
195 | ||
196 | SPKERNEL 0,2 | |
197 | || XOR.D $res,$Z3,$Z3 ; 12/6/0; Z^=res | |
198 | ||
199 | ; input pre-fetch is possible where D1 slot is available... | |
200 | [B0] LDNDW *${inp}[1],$H1x:$H0x ; 8/- | |
201 | [B0] LDNDW *${inp}++[2],$H3x:$H2x ; 9/- | |
202 | NOP ; 10/- | |
203 | .if .LITTLE_ENDIAN | |
204 | SWAP2 $Z0,$Z1 ; 11/- | |
205 | || SWAP4 $Z1,$Z0 | |
206 | SWAP4 $Z1,$Z1 ; 12/- | |
207 | || SWAP2 $Z0,$Z0 | |
208 | SWAP2 $Z2,$Z3 | |
209 | || SWAP4 $Z3,$Z2 | |
210 | ||[!B0] BNOP RA | |
211 | SWAP4 $Z3,$Z3 | |
212 | || SWAP2 $Z2,$Z2 | |
213 | || [B0] BNOP ghash_loop? | |
214 | [B0] XOR $H0x,$Z0,$Z0 ; Xi^=inp | |
215 | || [B0] XOR $H1x,$Z1,$Z1 | |
216 | [B0] XOR $H2x,$Z2,$Z2 | |
217 | || [B0] XOR $H3x,$Z3,$Z3 | |
218 | || [B0] SHRU $Z1,24,$xia ; Xi[15], avoid cross-path stall | |
219 | STDW $Z1:$Z0,*${xip}[1] | |
220 | || [B0] SHRU $Z1,16,$x0 ; Xi[14] | |
221 | || [B0] ZERO $Z1:$Z0 | |
222 | .else | |
223 | [!B0] BNOP RA ; 11/- | |
224 | [B0] BNOP ghash_loop? ; 12/- | |
225 | [B0] XOR $H0x,$Z0,$Z0 ; Xi^=inp | |
226 | || [B0] XOR $H1x,$Z1,$Z1 | |
227 | [B0] XOR $H2x,$Z2,$Z2 | |
228 | || [B0] XOR $H3x,$Z3,$Z3 | |
229 | || [B0] MV $Z0,$xia ; Xi[15], avoid cross-path stall | |
230 | STDW $Z1:$Z0,*${xip}[1] | |
231 | || [B0] SHRU $Z0,8,$x0 ; Xi[14] | |
232 | || [B0] ZERO $Z1:$Z0 | |
233 | .endif | |
234 | STDW $Z3:$Z2,*${xip}[0] | |
235 | || [B0] ZERO $Z3:$Z2 | |
236 | || [B0] MV $xia,$x1 | |
237 | [B0] ADDK 14,${xip} | |
238 | .endasmfunc | |
239 | ||
240 | .sect .const | |
241 | .cstring "GHASH for C64x+, CRYPTOGAMS by <appro\@openssl.org>" | |
242 | .align 4 | |
243 | ___ | |
244 | ||
245 | print STDOUT $code; # emit the accumulated assembly text | |
a21314db | 246 | close(STDOUT) or die "error closing STDOUT: $!"; # abort if closing STDOUT fails |