]>
Commit | Line | Data |
---|---|---|
480cd6ab AP |
1 | #!/usr/bin/env perl |
2 | ||
3 | # ==================================================================== | |
4 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | |
5 | # project. The module is, however, dual licensed under OpenSSL and | |
6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | |
7 | # details see http://www.openssl.org/~appro/cryptogams/. | |
8 | # ==================================================================== | |
9 | # | |
10 | # March 2010 | |
11 | # | |
12 | # The module implements "4-bit" Galois field multiplication and | |
13 | # streamed GHASH function. "4-bit" means that it uses 256 bytes | |
14 | # per-key table [+128 bytes shared table]. Streamed GHASH performance | |
15 | # was measured to be 6.35 cycles per processed byte on Itanium 2, | |
16 | # which is >90% better than Microsoft compiler generated code. Well, | |
17 | # the number should have been ~6.5. The deviation has everything to do | |
18 | # with the way performance is measured, as difference between GCM and | |
19 | # straightforward 128-bit counter mode. To anchor to something else | |
20 | # sha1-ia64.pl module processes one byte in 6.0 cycles. On Itanium | |
21 | # GHASH should run at ~8.5 cycles per byte. | |
22 | ||
23 | $output=shift and (open STDOUT,">",$output or die "can't open $output: $!"); # first argument, if present, names the output file; three-argument open keeps the name from being parsed as part of the mode | |
24 | ||
25 | if ($^O eq "hpux") { # HP-UX: default to addp4 unless a 64-bit flag is on the command line | |
26 | $ADDP="addp4"; | |
27 | for (@ARGV) { $ADDP="add" if (/[\+DD|\-mlp]64/); } # NOTE(review): [...] is a character class, not an alternation; +DD64 and -mlp64 match only because each ends in a class member followed by 64 | |
28 | } else { $ADDP="add"; } | |
29 | for (@ARGV) { $big_endian=1 if (/\-DB_ENDIAN/); # explicit endianness flags win; a later flag overrides an earlier one | |
30 | $big_endian=0 if (/\-DL_ENDIAN/); } | |
31 | if (!defined($big_endian)) # no explicit flag: fall back to the byte order of the machine running this script | |
32 | { $big_endian=(unpack('L',pack('N',1))==1); } | |
33 | ||
34 | sub loop { # appends one copy of the software-pipelined loop body to $code; arguments are (label[, mask_inp]), hence deliberately no prototype | |
35 | my $label=shift; # label emitted at the top of the loop and used as the br.ctop target | |
36 | my ($p16,$p17)=(shift)?("p63","p63"):("p16","p17"); # true 2nd argument: predicate the inp load/xor on p63 instead of p16/p17, i.e. mask references to inp | |
37 | ||
38 | # Loop is scheduled for 6 ticks on Itanium 2 and 8 on Itanium, i.e. | |
39 | # in scalable manner;-) Naturally assuming data in L1 cache... | |
40 | # Special note about 'dep' instruction, which is used to construct | |
41 | # &rem_4bit[Zlo&0xf]. It works, because rem_4bit is aligned at 128 | |
42 | # bytes boundary and lower 7 bits of its address are guaranteed to | |
43 | # be zero. | |
44 | $code.=<<___; | |
45 | $label: | |
46 | { .mfi; (p18) ld8 Hlo=[Hi[1]],-8 | |
47 | (p19) dep rem=Zlo,rem_4bitp,3,4 } | |
48 | { .mfi; (p19) xor Zhi=Zhi,Hhi | |
49 | ($p17) xor xi[1]=xi[1],in[1] };; | |
50 | { .mfi; (p18) ld8 Hhi=[Hi[1]] | |
51 | (p19) shrp Zlo=Zhi,Zlo,4 } | |
52 | { .mfi; (p19) ld8 rem=[rem] | |
53 | (p18) and Hi[1]=mask0xf0,xi[2] };; | |
54 | { .mmi; ($p16) ld1 in[0]=[inp],-1 | |
55 | (p18) xor Zlo=Zlo,Hlo | |
56 | (p19) shr.u Zhi=Zhi,4 } | |
57 | { .mib; (p19) xor Hhi=Hhi,rem | |
58 | (p18) add Hi[1]=Htbl,Hi[1] };; | |
59 | ||
60 | { .mfi; (p18) ld8 Hlo=[Hi[1]],-8 | |
61 | (p18) dep rem=Zlo,rem_4bitp,3,4 } | |
62 | { .mfi; (p17) shladd Hi[0]=xi[1],4,r0 | |
63 | (p18) xor Zhi=Zhi,Hhi };; | |
64 | { .mfi; (p18) ld8 Hhi=[Hi[1]] | |
65 | (p18) shrp Zlo=Zhi,Zlo,4 } | |
66 | { .mfi; (p18) ld8 rem=[rem] | |
67 | (p17) and Hi[0]=mask0xf0,Hi[0] };; | |
68 | { .mmi; (p16) ld1 xi[0]=[Xi],-1 | |
69 | (p18) xor Zlo=Zlo,Hlo | |
70 | (p18) shr.u Zhi=Zhi,4 } | |
71 | { .mib; (p18) xor Hhi=Hhi,rem | |
72 | (p17) add Hi[0]=Htbl,Hi[0] | |
73 | br.ctop.sptk $label };; | |
74 | ___ | |
75 | } | |
76 | ||
77 | $code=<<___; # start of generated assembly: register aliases, then gcm_gmult_4bit prologue (in0 -> Xi, in1 -> Htbl) and loop setup | |
78 | .explicit | |
79 | .text | |
80 | ||
81 | prevfs=r2; prevlc=r3; prevpr=r8; | |
82 | mask0xf0=r21; | |
83 | rem=r22; rem_4bitp=r23; | |
84 | Xi=r24; Htbl=r25; | |
85 | inp=r26; end=r27; | |
86 | Hhi=r28; Hlo=r29; | |
87 | Zhi=r30; Zlo=r31; | |
88 | ||
89 | .global gcm_gmult_4bit# | |
90 | .proc gcm_gmult_4bit# | |
91 | .align 128 | |
92 | .skip 16;; // aligns loop body | |
93 | gcm_gmult_4bit: | |
94 | .prologue | |
95 | { .mmi; .save ar.pfs,prevfs | |
96 | alloc prevfs=ar.pfs,2,6,0,8 | |
97 | $ADDP Xi=15,in0 // &Xi[15] | |
98 | mov rem_4bitp=ip } | |
99 | { .mii; $ADDP Htbl=8,in1 // &Htbl[0].lo | |
100 | .save ar.lc,prevlc | |
101 | mov prevlc=ar.lc | |
102 | .save pr,prevpr | |
103 | mov prevpr=pr };; | |
104 | ||
105 | .body | |
106 | .rotr in[3],xi[3],Hi[2] | |
107 | ||
108 | { .mib; ld1 xi[2]=[Xi],-1 // Xi[15] | |
109 | mov mask0xf0=0xf0 | |
110 | brp.loop.imp .Loop1,.Lend1-16};; | |
111 | { .mmi; ld1 xi[1]=[Xi],-1 // Xi[14] | |
112 | };; | |
113 | { .mii; shladd Hi[1]=xi[2],4,r0 | |
114 | mov pr.rot=0x7<<16 | |
115 | mov ar.lc=13 };; | |
116 | { .mii; and Hi[1]=mask0xf0,Hi[1] | |
117 | mov ar.ec=3 | |
118 | xor Zlo=Zlo,Zlo };; | |
119 | { .mii; add Hi[1]=Htbl,Hi[1] // &Htbl[nlo].lo | |
120 | add rem_4bitp=rem_4bit#-gcm_gmult_4bit#,rem_4bitp | |
121 | xor Zhi=Zhi,Zhi };; | |
122 | ___ | |
123 | &loop (".Loop1",1); # second argument set: the loop body is emitted with its inp load/xor predicated on p63 | |
124 | $code.=<<___; # gcm_gmult_4bit epilogue at .Lend1 (store Z back to Xi, restore pr and ar.lc, return), then gcm_ghash_4bit prologue (in0 -> inp, in1 -> len, in2 -> Xi, in3 -> Htbl) | |
125 | .Lend1: | |
126 | { .mib; xor Zhi=Zhi,Hhi };; // modulo-scheduling artefact | |
127 | { .mib; mux1 Zlo=Zlo,\@rev };; | |
128 | { .mib; mux1 Zhi=Zhi,\@rev };; | |
129 | { .mmi; add Hlo=9,Xi;; // ;; is here to prevent | |
130 | add Hhi=1,Xi };; // pipeline flush on Itanium | |
131 | { .mib; st8 [Hlo]=Zlo | |
132 | mov pr=prevpr,-2 };; | |
133 | { .mib; st8 [Hhi]=Zhi | |
134 | mov ar.lc=prevlc | |
135 | br.ret.sptk.many b0 };; | |
136 | .endp gcm_gmult_4bit# | |
137 | ||
138 | .global gcm_ghash_4bit# | |
139 | .proc gcm_ghash_4bit# | |
140 | .align 32;; | |
141 | gcm_ghash_4bit: | |
142 | .prologue | |
143 | { .mmi; .save ar.pfs,prevfs | |
144 | alloc prevfs=ar.pfs,4,4,0,8 | |
145 | $ADDP inp=15,in0 // &inp[15] | |
146 | mov rem_4bitp=ip } | |
147 | { .mmi; $ADDP end=in1,in0 // &inp[len] | |
148 | $ADDP Xi=15,in2 // &Xi[15] | |
149 | .save ar.lc,prevlc | |
150 | mov prevlc=ar.lc };; | |
151 | { .mmi; $ADDP Htbl=8,in3 // &Htbl[0].lo | |
152 | mov mask0xf0=0xf0 | |
153 | .save pr,prevpr | |
154 | mov prevpr=pr } | |
155 | ||
156 | .body | |
157 | .rotr in[3],xi[3],Hi[2] | |
158 | ||
159 | { .mmi; ld1 in[2]=[inp],-1 // inp[15] | |
160 | ld1 xi[2]=[Xi],-1 // Xi[15] | |
161 | add end=-17,end };; | |
162 | { .mmi; ld1 in[1]=[inp],-1 // inp[14] | |
163 | ld1 xi[1]=[Xi],-1 // Xi[14] | |
164 | xor xi[2]=xi[2],in[2] };; | |
165 | { .mii; shladd Hi[1]=xi[2],4,r0 | |
166 | mov pr.rot=0x7<<16 | |
167 | mov ar.lc=13 };; | |
168 | { .mii; and Hi[1]=mask0xf0,Hi[1] | |
169 | mov ar.ec=3 | |
170 | xor Zlo=Zlo,Zlo };; | |
171 | { .mii; add Hi[1]=Htbl,Hi[1] // &Htbl[nlo].lo | |
172 | add rem_4bitp=rem_4bit#-gcm_ghash_4bit#,rem_4bitp | |
173 | xor Zhi=Zhi,Zhi };; | |
174 | ___ | |
175 | &loop (".LoopN"); # no second argument: the inp load/xor keep their p16/p17 predicates | |
176 | $code.=<<___; # gcm_ghash_4bit tail: store Z to Xi, branch back to .LoopN while inp is below end, else restore pr and ar.lc and return; then the 128-byte rem_4bit table and ident string | |
177 | { .mib; xor Zhi=Zhi,Hhi // modulo-scheduling artefact | |
178 | extr.u xi[2]=Zlo,0,8 } // Xi[15] | |
179 | { .mib; cmp.ltu p6,p0=inp,end // are we done? | |
180 | add inp=32,inp // advance inp | |
181 | clrrrb.pr };; | |
182 | { .mii; | |
183 | (p6) ld1 in[2]=[inp],-1 // inp[15] | |
184 | (p6) extr.u xi[1]=Zlo,8,8 // Xi[14] | |
185 | (p6) mov ar.lc=13 };; | |
186 | { .mii; | |
187 | (p6) ld1 in[1]=[inp],-1 // inp[14] | |
188 | (p6) mov ar.ec=3 | |
189 | mux1 Zlo=Zlo,\@rev };; | |
190 | { .mii; | |
191 | (p6) xor xi[2]=xi[2],in[2] | |
192 | mux1 Zhi=Zhi,\@rev };; | |
193 | { .mii; | |
194 | (p6) shladd Hi[1]=xi[2],4,r0 | |
195 | add Hlo=9,Xi // Xi is &Xi[-1] | |
196 | add Hhi=1,Xi };; | |
197 | { .mii; | |
198 | (p6) and Hi[1]=mask0xf0,Hi[1] | |
199 | (p6) add Xi=14,Xi // &Xi[13] | |
200 | (p6) mov pr.rot=0x7<<16 };; | |
201 | ||
202 | { .mii; st8 [Hlo]=Zlo | |
203 | (p6) xor Zlo=Zlo,Zlo | |
204 | (p6) add Hi[1]=Htbl,Hi[1] };; | |
205 | { .mib; st8 [Hhi]=Zhi | |
206 | (p6) xor Zhi=Zhi,Zhi | |
207 | (p6) br.cond.dptk.many .LoopN };; | |
208 | ||
209 | { .mib; mov pr=prevpr,-2 } | |
210 | { .mib; mov ar.lc=prevlc | |
211 | br.ret.sptk.many b0 };; | |
212 | .endp gcm_ghash_4bit# | |
213 | ||
214 | .align 128;; | |
215 | .type rem_4bit#,\@object | |
216 | rem_4bit: | |
217 | data8 0x0000<<48, 0x1C20<<48, 0x3840<<48, 0x2460<<48 | |
218 | data8 0x7080<<48, 0x6CA0<<48, 0x48C0<<48, 0x54E0<<48 | |
219 | data8 0xE100<<48, 0xFD20<<48, 0xD940<<48, 0xC560<<48 | |
220 | data8 0x9180<<48, 0x8DA0<<48, 0xA9C0<<48, 0xB5E0<<48 | |
221 | .size rem_4bit#,128 | |
222 | stringz "GHASH for IA64, CRYPTOGAMS by <appro\@openssl.org>" | |
223 | ___ | |
224 | ||
225 | $code =~ s/mux1(\s+)\S+\@rev/nop.i$1 0x0/gm if ($big_endian); # big-endian builds: every "mux1 ...\@rev" (byte reversal) is replaced by a nop.i | |
226 | ||
227 | print $code; # emit the finished assembly on STDOUT (the output file when one was given) | |
228 | close STDOUT or die "error closing STDOUT: $!"; # buffered write errors only surface at close; fail loudly instead of leaving a truncated file | |