]>
Commit | Line | Data |
---|---|---|
480cd6ab AP |
1 | #!/usr/bin/env perl |
2 | ||
3 | # ==================================================================== | |
4 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | |
5 | # project. The module is, however, dual licensed under OpenSSL and | |
6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | |
7 | # details see http://www.openssl.org/~appro/cryptogams/. | |
8 | # ==================================================================== | |
9 | # | |
10 | # March 2010 | |
11 | # | |
c3473126 AP |
12 | # The module implements "4-bit" GCM GHASH function and underlying |
13 | # single multiplication operation in GF(2^128). "4-bit" means that it | |
14 | # uses 256 bytes per-key table [+128 bytes shared table]. Streamed | |
15 | # GHASH performance was measured to be 6.35 cycles per processed byte | |
16 | # on Itanium 2, which is >90% better than Microsoft compiler generated | |
17 | # code. Well, the number should have been ~6.5. The deviation has | |
18 | # everything to do with the way performance is measured: as difference | |
19 | # between GCM and straightforward 128-bit counter mode. To anchor to | |
20 | # something else, the sha1-ia64.pl module processes one byte in 6.0 cycles. | |
21 | # On Itanium GHASH should run at ~8.5 cycles per byte. | |
480cd6ab | 22 | |
85e28dfa AP |
23 | # Note about "528B" variant. In the Itanium 2 case it makes less sense |
24 | # to implement it for the following reason. Because the number of functional | |
25 | # units is naturally limited, it's impossible to implement "528B" loop | |
26 | # in 4 cycles, only in 5. This means that theoretically performance | |
27 | # improvement can't be more than 20%, ~15% is more realistic. This | |
28 | # is considered below justification level for implementing new code. | |
29 | # Not to mention that on original Itanium it would actually run | |
30 | # slower, spending >9 cycles per byte. | |
31 | ||
480cd6ab AP |
# Optional first argument names the output file; when present, STDOUT is
# reopened onto it so that the final "print $code" writes the assembly there.
32 | $output=shift and (open STDOUT,">$output" or die "can't open $output: $!"); |
33 | ||
# Choose the mnemonic interpolated as $ADDP for address arithmetic on the
# incoming pointer arguments. HP-UX defaults to "addp4"; plain "add" is
# selected only when +DD64 or -mlp64 appears among the remaining arguments.
# The two flags are matched with a grouped alternation: the former character
# class [\+DD|\-mlp]64 also matched any argument merely containing one of
# those characters followed by "64" (e.g. "...p64" inside an include path).
34 | if ($^O eq "hpux") { | |
35 | $ADDP="addp4"; | |
36 | for (@ARGV) { $ADDP="add" if (/(?:\+DD|\-mlp)64/); } | |
37 | } else { $ADDP="add"; } | |
# Endianness: -DB_ENDIAN / -DL_ENDIAN among the arguments decide (a later
# argument overrides an earlier one). With neither present, probe the host:
# pack 1 as a big-endian 32-bit value and read it back in native order.
38 | for (@ARGV) { $big_endian=1 if (/\-DB_ENDIAN/); | |
39 | $big_endian=0 if (/\-DL_ENDIAN/); } | |
40 | if (!defined($big_endian)) | |
41 | { $big_endian=(unpack('L',pack('N',1))==1); } | |
42 | ||
# loop(LABEL [, MASK_INP]) -- append one copy of the software-pipelined
# inner loop to the global $code.
#   LABEL     assembler label emitted at the top of the loop and used as the
#             target of the closing br.ctop.
#   MASK_INP  when true, the two instructions that reference the input
#             stream (the "ld1 in[0]=[inp],-1" load and the
#             "xor xi[1]=xi[1],in[1]" fold) are predicated on p63 instead of
#             p16/p17; presumably p63 is never set, so they do not execute.
# The callers in this file ignore the return value; the only effect they rely
# on is the append to $code.
# NOTE(review): the empty prototype "()" does not describe the two arguments
# actually consumed. It is harmless only because both call sites use the
# "&loop(...)" form, which bypasses prototype checking.
43 | sub loop() { | |
44 | my $label=shift; | |
45 | my ($p16,$p17)=(shift)?("p63","p63"):("p16","p17"); # mask references to inp | |
46 | ||
47 | # Loop is scheduled for 6 ticks on Itanium 2 and 8 on Itanium, i.e. | |
48 | # in scalable manner;-) Naturally assuming data in L1 cache... | |
49 | # Special note about 'dep' instruction, which is used to construct | |
50 | # &rem_4bit[Zlo&0xf]. It works, because rem_4bit is aligned at 128 | |
51 | # bytes boundary and lower 7 bits of its address are guaranteed to | |
52 | # be zero. | |
# Only $label, $p16 and $p17 are interpolated below; names such as Hi[1],
# xi[1] and (p18) carry no Perl sigil and pass through to the assembler as-is.
53 | $code.=<<___; | |
54 | $label: | |
55 | { .mfi; (p18) ld8 Hlo=[Hi[1]],-8 | |
56 | (p19) dep rem=Zlo,rem_4bitp,3,4 } | |
57 | { .mfi; (p19) xor Zhi=Zhi,Hhi | |
58 | ($p17) xor xi[1]=xi[1],in[1] };; | |
59 | { .mfi; (p18) ld8 Hhi=[Hi[1]] | |
60 | (p19) shrp Zlo=Zhi,Zlo,4 } | |
61 | { .mfi; (p19) ld8 rem=[rem] | |
62 | (p18) and Hi[1]=mask0xf0,xi[2] };; | |
63 | { .mmi; ($p16) ld1 in[0]=[inp],-1 | |
64 | (p18) xor Zlo=Zlo,Hlo | |
65 | (p19) shr.u Zhi=Zhi,4 } | |
66 | { .mib; (p19) xor Hhi=Hhi,rem | |
67 | (p18) add Hi[1]=Htbl,Hi[1] };; | |
68 | ||
69 | { .mfi; (p18) ld8 Hlo=[Hi[1]],-8 | |
70 | (p18) dep rem=Zlo,rem_4bitp,3,4 } | |
71 | { .mfi; (p17) shladd Hi[0]=xi[1],4,r0 | |
72 | (p18) xor Zhi=Zhi,Hhi };; | |
73 | { .mfi; (p18) ld8 Hhi=[Hi[1]] | |
74 | (p18) shrp Zlo=Zhi,Zlo,4 } | |
75 | { .mfi; (p18) ld8 rem=[rem] | |
76 | (p17) and Hi[0]=mask0xf0,Hi[0] };; | |
77 | { .mmi; (p16) ld1 xi[0]=[Xi],-1 | |
78 | (p18) xor Zlo=Zlo,Hlo | |
79 | (p18) shr.u Zhi=Zhi,4 } | |
80 | { .mib; (p18) xor Hhi=Hhi,rem | |
81 | (p17) add Hi[0]=Htbl,Hi[0] | |
82 | br.ctop.sptk $label };; | |
83 | ___ | |
84 | } | |
85 | ||
# Start the generated assembly: assembler directives, the symbolic register
# names shared by both routines, and gcm_gmult_4bit up to (but not including)
# its main loop. $ADDP is interpolated for the pointer setup; "#" after a
# symbol name and "//" comments are assembler syntax, not Perl.
86 | $code=<<___; | |
87 | .explicit | |
88 | .text | |
89 | ||
90 | prevfs=r2; prevlc=r3; prevpr=r8; | |
91 | mask0xf0=r21; | |
92 | rem=r22; rem_4bitp=r23; | |
93 | Xi=r24; Htbl=r25; | |
94 | inp=r26; end=r27; | |
95 | Hhi=r28; Hlo=r29; | |
96 | Zhi=r30; Zlo=r31; | |
97 | ||
98 | .global gcm_gmult_4bit# | |
99 | .proc gcm_gmult_4bit# | |
100 | .align 128 | |
101 | .skip 16;; // aligns loop body | |
102 | gcm_gmult_4bit: | |
103 | .prologue | |
104 | { .mmi; .save ar.pfs,prevfs | |
105 | alloc prevfs=ar.pfs,2,6,0,8 | |
106 | $ADDP Xi=15,in0 // &Xi[15] | |
107 | mov rem_4bitp=ip } | |
108 | { .mii; $ADDP Htbl=8,in1 // &Htbl[0].lo | |
109 | .save ar.lc,prevlc | |
110 | mov prevlc=ar.lc | |
111 | .save pr,prevpr | |
112 | mov prevpr=pr };; | |
113 | ||
114 | .body | |
115 | .rotr in[3],xi[3],Hi[2] | |
116 | ||
117 | { .mib; ld1 xi[2]=[Xi],-1 // Xi[15] | |
118 | mov mask0xf0=0xf0 | |
119 | brp.loop.imp .Loop1,.Lend1-16};; | |
120 | { .mmi; ld1 xi[1]=[Xi],-1 // Xi[14] | |
121 | };; | |
122 | { .mii; shladd Hi[1]=xi[2],4,r0 | |
123 | mov pr.rot=0x7<<16 | |
124 | mov ar.lc=13 };; | |
125 | { .mii; and Hi[1]=mask0xf0,Hi[1] | |
126 | mov ar.ec=3 | |
127 | xor Zlo=Zlo,Zlo };; | |
128 | { .mii; add Hi[1]=Htbl,Hi[1] // &Htbl[nlo].lo | |
129 | add rem_4bitp=rem_4bit#-gcm_gmult_4bit#,rem_4bitp | |
130 | xor Zhi=Zhi,Zhi };; | |
131 | ___ | |
# Append the loop body under label .Loop1. The true second argument asks
# loop() to predicate its inp-related instructions on p63, since this routine
# never sets up inp.
132 | &loop (".Loop1",1); | |
# Append the tail of gcm_gmult_4bit (label .Lend1: final xor, the two
# "mux1 ...,@rev" steps, stores of Zlo/Zhi through addresses derived from Xi,
# restore of pr and ar.lc, return) and then the gcm_ghash_4bit prologue up to
# its loop. "\@" keeps Perl from interpolating @rev inside the here-document.
133 | $code.=<<___; | |
134 | .Lend1: | |
135 | { .mib; xor Zhi=Zhi,Hhi };; // modulo-scheduling artefact | |
136 | { .mib; mux1 Zlo=Zlo,\@rev };; | |
137 | { .mib; mux1 Zhi=Zhi,\@rev };; | |
138 | { .mmi; add Hlo=9,Xi;; // ;; is here to prevent | |
139 | add Hhi=1,Xi };; // pipeline flush on Itanium | |
140 | { .mib; st8 [Hlo]=Zlo | |
141 | mov pr=prevpr,-2 };; | |
142 | { .mib; st8 [Hhi]=Zhi | |
143 | mov ar.lc=prevlc | |
144 | br.ret.sptk.many b0 };; | |
145 | .endp gcm_gmult_4bit# | |
146 | ||
147 | .global gcm_ghash_4bit# | |
148 | .proc gcm_ghash_4bit# | |
149 | .align 32;; | |
150 | gcm_ghash_4bit: | |
151 | .prologue | |
152 | { .mmi; .save ar.pfs,prevfs | |
153 | alloc prevfs=ar.pfs,4,4,0,8 | |
4f39edbf | 154 | $ADDP inp=15,in2 // &inp[15] |
480cd6ab | 155 | mov rem_4bitp=ip } |
4f39edbf AP |
156 | { .mmi; $ADDP end=in3,in2 // &inp[len] |
157 | $ADDP Xi=15,in0 // &Xi[15] | |
480cd6ab AP |
158 | .save ar.lc,prevlc |
159 | mov prevlc=ar.lc };; | |
4f39edbf | 160 | { .mmi; $ADDP Htbl=8,in1 // &Htbl[0].lo |
480cd6ab AP |
161 | mov mask0xf0=0xf0 |
162 | .save pr,prevpr | |
163 | mov prevpr=pr } | |
164 | ||
165 | .body | |
166 | .rotr in[3],xi[3],Hi[2] | |
167 | ||
168 | { .mmi; ld1 in[2]=[inp],-1 // inp[15] | |
169 | ld1 xi[2]=[Xi],-1 // Xi[15] | |
170 | add end=-17,end };; | |
171 | { .mmi; ld1 in[1]=[inp],-1 // inp[14] | |
172 | ld1 xi[1]=[Xi],-1 // Xi[14] | |
173 | xor xi[2]=xi[2],in[2] };; | |
174 | { .mii; shladd Hi[1]=xi[2],4,r0 | |
175 | mov pr.rot=0x7<<16 | |
176 | mov ar.lc=13 };; | |
177 | { .mii; and Hi[1]=mask0xf0,Hi[1] | |
178 | mov ar.ec=3 | |
179 | xor Zlo=Zlo,Zlo };; | |
180 | { .mii; add Hi[1]=Htbl,Hi[1] // &Htbl[nlo].lo | |
181 | add rem_4bitp=rem_4bit#-gcm_ghash_4bit#,rem_4bitp | |
182 | xor Zhi=Zhi,Zhi };; | |
183 | ___ | |
# Append the loop body under label .LoopN. No second argument is passed, so
# loop() keeps p16/p17 on the instructions that read and fold in inp.
184 | &loop (".LoopN"); | |
# Append the remainder of gcm_ghash_4bit: finish the current block, compare
# inp against end into p6, and under p6 set up the next pass and branch back
# to .LoopN; otherwise restore pr and ar.lc and return. The here-document
# closes with the 128-byte-aligned rem_4bit table (16 data8 entries) and the
# identification string. "\@" yields a literal "@" in the output.
185 | $code.=<<___; | |
186 | { .mib; xor Zhi=Zhi,Hhi // modulo-scheduling artefact | |
187 | extr.u xi[2]=Zlo,0,8 } // Xi[15] | |
188 | { .mib; cmp.ltu p6,p0=inp,end // are we done? | |
189 | add inp=32,inp // advance inp | |
190 | clrrrb.pr };; | |
191 | { .mii; | |
192 | (p6) ld1 in[2]=[inp],-1 // inp[15] | |
193 | (p6) extr.u xi[1]=Zlo,8,8 // Xi[14] | |
194 | (p6) mov ar.lc=13 };; | |
195 | { .mii; | |
196 | (p6) ld1 in[1]=[inp],-1 // inp[14] | |
197 | (p6) mov ar.ec=3 | |
198 | mux1 Zlo=Zlo,\@rev };; | |
199 | { .mii; | |
200 | (p6) xor xi[2]=xi[2],in[2] | |
201 | mux1 Zhi=Zhi,\@rev };; | |
202 | { .mii; | |
203 | (p6) shladd Hi[1]=xi[2],4,r0 | |
204 | add Hlo=9,Xi // Xi is &Xi[-1] | |
205 | add Hhi=1,Xi };; | |
206 | { .mii; | |
207 | (p6) and Hi[1]=mask0xf0,Hi[1] | |
208 | (p6) add Xi=14,Xi // &Xi[13] | |
209 | (p6) mov pr.rot=0x7<<16 };; | |
210 | ||
211 | { .mii; st8 [Hlo]=Zlo | |
212 | (p6) xor Zlo=Zlo,Zlo | |
213 | (p6) add Hi[1]=Htbl,Hi[1] };; | |
214 | { .mib; st8 [Hhi]=Zhi | |
215 | (p6) xor Zhi=Zhi,Zhi | |
216 | (p6) br.cond.dptk.many .LoopN };; | |
217 | ||
218 | { .mib; mov pr=prevpr,-2 } | |
219 | { .mib; mov ar.lc=prevlc | |
220 | br.ret.sptk.many b0 };; | |
221 | .endp gcm_ghash_4bit# | |
222 | ||
223 | .align 128;; | |
224 | .type rem_4bit#,\@object | |
225 | rem_4bit: | |
226 | data8 0x0000<<48, 0x1C20<<48, 0x3840<<48, 0x2460<<48 | |
227 | data8 0x7080<<48, 0x6CA0<<48, 0x48C0<<48, 0x54E0<<48 | |
228 | data8 0xE100<<48, 0xFD20<<48, 0xD940<<48, 0xC560<<48 | |
229 | data8 0x9180<<48, 0x8DA0<<48, 0xA9C0<<48, 0xB5E0<<48 | |
230 | .size rem_4bit#,128 | |
231 | stringz "GHASH for IA64, CRYPTOGAMS by <appro\@openssl.org>" | |
232 | ___ | |
233 | ||
# On big-endian targets every "mux1 <operands>,@rev" in the generated text is
# replaced by "nop.i", the whitespace that followed the mnemonic, and " 0x0".
234 | $code =~ s/mux1(\s+)\S+\@rev/nop.i$1 0x0/gm if ($big_endian); | |
235 | ||
# Emit the assembly. STDOUT may have been reopened onto the output file at the
# top of the script, and buffered write errors (e.g. a full disk) only surface
# when the handle is closed, so a failed close is fatal rather than leaving a
# truncated file that looks like success.
236 | print $code; | |
237 | close STDOUT or die "error closing STDOUT: $!"; |