#! /usr/bin/env perl
# Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by David Mosberger <David.Mosberger@acm.org> based on the
# Itanium optimized Crypto code which was released by HP Labs at
# http://www.hpl.hp.com/research/linux/crypto/.
#
# Copyright (c) 2005 Hewlett-Packard Development Company, L.P.
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.


# This is a little helper program which generates a software-pipelined
# loop for RC4 encryption. The basic algorithm looks like this:
#
#   for (counter = 0; counter < len; ++counter)
#     {
#       in = inp[counter];
#       SI = S[I];
#       J = (SI + J) & 0xff;
#       SJ = S[J];
#       T = (SI + SJ) & 0xff;
#       S[I] = SJ, S[J] = SI;
#       ST = S[T];
#       outp[counter] = in ^ ST;
#       I = (I + 1) & 0xff;
#     }
#
# Pipelining this loop isn't easy, because the stores to the S[] array
# need to be observed in the right order. The loop generated by the
# code below has the following pipeline diagram:
#
#        cycle
#     | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 |10 |11 |12 |13 |14 |15 |16 |17 |
# iter
#   1: xxx LDI xxx xxx xxx LDJ xxx SWP xxx LDT xxx xxx
#   2:             xxx LDI xxx xxx xxx LDJ xxx SWP xxx LDT xxx xxx
#   3:                         xxx LDI xxx xxx xxx LDJ xxx SWP xxx LDT xxx xxx
#
# where:
#   LDI = load of S[I]
#   LDJ = load of S[J]
#   SWP = swap of S[I] and S[J]
#   LDT = load of S[T]
#
# Note that in the above diagram, the major trouble-spot is that LDI
# of the 2nd iteration is performed BEFORE the SWP of the first
# iteration. Fortunately, this is easy to detect (I of the 1st
# iteration will be equal to J of the 2nd iteration) and when this
# happens, we simply forward the proper value from the 1st iteration
# to the 2nd one. The proper value in this case is simply the value
# of S[I] from the first iteration (thanks to the fact that SWP
# simply swaps the contents of S[I] and S[J]).
#
# Another potential trouble-spot is in cycle 7, where SWP of the 1st
# iteration issues at the same time as the LDI of the 3rd iteration.
# However, thanks to IA-64 execution semantics, this can be taken
# care of simply by placing LDI later in the instruction-group than
# SWP. IA-64 CPUs will automatically forward the value if they
# detect that the SWP and LDI are accessing the same memory-location.

# The core-loop that can be pipelined then looks like this (annotated
# with McKinley/Madison issue port & latency numbers, assuming L1
# cache hits for the most part):

# operation:            instruction:                        issue-ports:     latency
# ------------------    -----------------------------       -------------    -------

# Data = *inp++         ld1 data = [inp], 1                 M0-M1            1 cyc    c0
#                       shladd Iptr = I, KeyTable, 3        M0-M3, I0, I1    1 cyc
# I = (I + 1) & 0xff    padd1 nextI = I, one                M0-M3, I0, I1    3 cyc
#                       ;;
# SI = S[I]             ld8 SI = [Iptr]                     M0-M1            1 cyc    c1 * after SWAP!
#                       ;;
#                       cmp.eq.unc pBypass = I, J                                     * after J is valid!
# J = SI + J            add J = J, SI                       M0-M3, I0, I1    1 cyc    c2
#                       (pBypass) br.cond.spnt Bypass
#                       ;;
# ---------------------------------------------------------------------------------------
# J = J & 0xff          zxt1 J = J                          I0, I1           1 cyc    c3
#                       ;;
#                       shladd Jptr = J, KeyTable, 3        M0-M3, I0, I1    1 cyc    c4
#                       ;;
# SJ = S[J]             ld8 SJ = [Jptr]                     M0-M1            1 cyc    c5
#                       ;;
# ---------------------------------------------------------------------------------------
# T = (SI + SJ)         add T = SI, SJ                      M0-M3, I0, I1    1 cyc    c6
#                       ;;
# T = T & 0xff          zxt1 T = T                          I0, I1           1 cyc
# S[I] = SJ             st8 [Iptr] = SJ                     M2-M3                     c7
# S[J] = SI             st8 [Jptr] = SI                     M2-M3
#                       ;;
#                       shladd Tptr = T, KeyTable, 3        M0-M3, I0, I1    1 cyc    c8
#                       ;;
# ---------------------------------------------------------------------------------------
# T = S[T]              ld8 T = [Tptr]                      M0-M1            1 cyc    c9
#                       ;;
# data ^= T             xor data = data, T                  M0-M3, I0, I1    1 cyc    c10
#                       ;;
# *out++ = Data ^ T     dep word = word, data, 8, POS       I0, I1           1 cyc    c11
#                       ;;
# ---------------------------------------------------------------------------------------

# There are several points worth making here:
#
#   - Note that due to the bypass/forwarding-path, the first two
#     phases of the loop are strangely mingled together. In
#     particular, note that the first stage of the pipeline is
#     using the value of "J", as calculated by the second stage.
#   - Each bundle-pair will have exactly 6 instructions.
#   - Pipelined, the loop can execute in 3 cycles/iteration and
#     4 stages. However, McKinley/Madison can issue "st1" to
#     the same bank at a rate of at most one per 4 cycles. Thus,
#     instead of storing each byte, we accumulate them in a word
#     and then write them back at once with a single "st8" (this
#     implies that the setup code needs to ensure that the output
#     buffer is properly aligned, if need be, by encoding the
#     first few bytes separately).
#   - There is no space for a "br.ctop" instruction. For this
#     reason we can't use the modulo-loop support in IA-64 and have
#     to do a traditional, purely software-pipelined loop.
#   - We can't replace any of the remaining "add/zxt1" pairs with
#     "padd1" because the latency for that instruction is too high
#     and would push the loop to the point where more bypasses
#     would be needed, which we don't have space for.
#   - The above loop runs at around 3.26 cycles/byte, or roughly
#     440 MByte/sec on a 1.5GHz Madison. This is well below the
#     system bus bandwidth and hence with judicious use of
#     "lfetch" this loop can run at (almost) peak speed even when
#     the input and output data reside in memory. The
#     max. latency that can be tolerated is (PREFETCH_DISTANCE *
#     L2_LINE_SIZE * 3 cyc), or about 384 cycles assuming (at
#     least) 1-ahead prefetching of 128 byte cache-lines. Note
#     that we do NOT prefetch into L1, since that would only
#     interfere with the S[] table values stored there. This is
#     acceptable because there is a 10 cycle latency between
#     load and first use of the input data.
#   - We use a branch to out-of-line bypass-code because of
#     cycle-pressure: we calculate the next J, check for the need
#     to activate the bypass path, and activate the bypass path
#     ALL IN THE SAME CYCLE. If we didn't have these constraints,
#     we could do the bypass with a simple conditional move
#     instruction. Fortunately, the bypass paths get activated
#     relatively infrequently, so the extra branches don't cost
#     all that much (about 0.04 cycles/byte, measured on a 16396
#     byte file with random input data).
#

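# For reference only: a plain Perl rendition of the scalar algorithm shown in
# the comment block above. The sub below (the name rc4_reference is ours, it is
# not part of OpenSSL) is never called by the generator; it merely documents
# the transformation that the pipelined assembly implements, operating on a
# bare 256-entry S[] array plus the I and J indices rather than on OpenSSL's
# RC4_KEY structure.
sub rc4_reference {
    my ($S, $I, $J, $inp) = @_;		# $S is a reference to the 256-entry state
    my $outp = '';
    foreach my $in (unpack("C*", $inp)) {
	my $SI = $S->[$I];
	$J = ($SI + $J) & 0xff;
	my $SJ = $S->[$J];
	my $T = ($SI + $SJ) & 0xff;
	@{$S}[$I, $J] = ($SJ, $SI);	# swap S[I] and S[J]
	$outp .= chr($in ^ $S->[$T]);	# ST = S[T] after the swap
	$I = ($I + 1) & 0xff;
    }
    return ($outp, $I, $J);
}
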
$output = pop;
open STDOUT,">$output";

$phases = 4;            # number of stages/phases in the pipelined-loop
$unroll_count = 6;      # number of times we unrolled it
$pComI = (1 << 0);
$pComJ = (1 << 1);
$pComT = (1 << 2);
$pOut  = (1 << 3);

$NData = 4;
$NIP = 3;
$NJP = 2;
$NI = 2;
$NSI = 3;
$NSJ = 2;
$NT = 2;
$NOutWord = 2;

#
# $threshold is the minimum length before we attempt to use the
# big software-pipelined loop. It MUST be greater-or-equal
# to:
#       PHASES * (UNROLL_COUNT + 1) + 7
#
# The "+ 7" comes from the fact we may have to encode up to
# 7 bytes separately before the output pointer is aligned.
#
$threshold = (3 * ($phases * ($unroll_count + 1)) + 7);
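# With the defaults above ($phases = 4, $unroll_count = 6) this works out to
# 3 * (4 * 7) + 7 = 91 bytes.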

sub I {
    local *code = shift;
    local $format = shift;
    $code .= sprintf ("\t\t".$format."\n", @_);
}

sub P {
    local *code = shift;
    local $format = shift;
    $code .= sprintf ($format."\n", @_);
}

sub STOP {
    local *code = shift;
    $code .=<<___;
		;;
___
}

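# A note on how the pieces below fit together: &I(\$code, "add J = J, SI[%u]", 2)
# appends "\t\tadd J = J, SI[2]\n" to the referenced buffer, &P() does the same
# without the leading tabs (used for labels and predicated branches), and
# &STOP() closes the current instruction group with ";;". emit_body() emits one
# three-cycle slice of the software pipeline into $c and collects out-of-line
# bypass code in $bypass; $p is a bitmask selecting which of the four phases
# ($pComI, $pComJ, $pComT, $pOut) are active, so the same routine can generate
# the prologue (pipeline filling), the steady-state body and the epilogue
# (pipeline draining).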
sub emit_body {
    local *c = shift;
    local *bypass = shift;
    local ($iteration, $p) = @_;

    local $i0 = $iteration;
    local $i1 = $iteration - 1;
    local $i2 = $iteration - 2;
    local $i3 = $iteration - 3;
    local $iw0 = ($iteration - 3) / 8;
    local $iw1 = ($iteration > 3) ? ($iteration - 4) / 8 : 1;
    local $byte_num = ($iteration - 3) % 8;
    local $label = $iteration + 1;
    local $pAny = ($p & 0xf) == 0xf;
    local $pByp = (($p & $pComI) && ($iteration > 0));

    $c.=<<___;
//////////////////////////////////////////////////
___

    if (($p & 0xf) == 0) {
        $c.="#ifdef HOST_IS_BIG_ENDIAN\n";
        &I(\$c,"shr.u OutWord[%u] = OutWord[%u], 32;;",
               $iw1 % $NOutWord, $iw1 % $NOutWord);
        $c.="#endif\n";
        &I(\$c, "st4 [OutPtr] = OutWord[%u], 4", $iw1 % $NOutWord);
        return;
    }

    # Cycle 0
    &I(\$c, "{ .mmi") if ($pAny);
    &I(\$c, "ld1 Data[%u] = [InPtr], 1", $i0 % $NData) if ($p & $pComI);
    &I(\$c, "padd1 I[%u] = One, I[%u]", $i0 % $NI, $i1 % $NI) if ($p & $pComI);
    &I(\$c, "zxt1 J = J") if ($p & $pComJ);
    &I(\$c, "}") if ($pAny);
    &I(\$c, "{ .mmi") if ($pAny);
    &I(\$c, "LKEY T[%u] = [T[%u]]", $i1 % $NT, $i1 % $NT) if ($p & $pOut);
    &I(\$c, "add T[%u] = SI[%u], SJ[%u]",
       $i0 % $NT, $i2 % $NSI, $i1 % $NSJ) if ($p & $pComT);
    &I(\$c, "KEYADDR(IPr[%u], I[%u])", $i0 % $NIP, $i1 % $NI) if ($p & $pComI);
    &I(\$c, "}") if ($pAny);
    &STOP(\$c);

    # Cycle 1
    &I(\$c, "{ .mmi") if ($pAny);
    &I(\$c, "SKEY [IPr[%u]] = SJ[%u]", $i2 % $NIP, $i1 % $NSJ) if ($p & $pComT);
    &I(\$c, "SKEY [JP[%u]] = SI[%u]", $i1 % $NJP, $i2 % $NSI) if ($p & $pComT);
    &I(\$c, "zxt1 T[%u] = T[%u]", $i0 % $NT, $i0 % $NT) if ($p & $pComT);
    &I(\$c, "}") if ($pAny);
    &I(\$c, "{ .mmi") if ($pAny);
    &I(\$c, "LKEY SI[%u] = [IPr[%u]]", $i0 % $NSI, $i0 % $NIP) if ($p & $pComI);
    &I(\$c, "KEYADDR(JP[%u], J)", $i0 % $NJP) if ($p & $pComJ);
    &I(\$c, "xor Data[%u] = Data[%u], T[%u]",
       $i3 % $NData, $i3 % $NData, $i1 % $NT) if ($p & $pOut);
    &I(\$c, "}") if ($pAny);
    &STOP(\$c);

    # Cycle 2
    &I(\$c, "{ .mmi") if ($pAny);
    &I(\$c, "LKEY SJ[%u] = [JP[%u]]", $i0 % $NSJ, $i0 % $NJP) if ($p & $pComJ);
    &I(\$c, "cmp.eq pBypass, p0 = I[%u], J", $i1 % $NI) if ($pByp);
    &I(\$c, "dep OutWord[%u] = Data[%u], OutWord[%u], BYTE_POS(%u), 8",
       $iw0 % $NOutWord, $i3 % $NData, $iw1 % $NOutWord, $byte_num) if ($p & $pOut);
    &I(\$c, "}") if ($pAny);
    &I(\$c, "{ .mmb") if ($pAny);
    &I(\$c, "add J = J, SI[%u]", $i0 % $NSI) if ($p & $pComI);
    &I(\$c, "KEYADDR(T[%u], T[%u])", $i0 % $NT, $i0 % $NT) if ($p & $pComT);
    &P(\$c, "(pBypass)\tbr.cond.spnt.many .rc4Bypass%u", $label) if ($pByp);
    &I(\$c, "}") if ($pAny);
    &STOP(\$c);

    &P(\$c, ".rc4Resume%u:", $label) if ($pByp);
    if ($byte_num == 0 && $iteration >= $phases) {
        &I(\$c, "st8 [OutPtr] = OutWord[%u], 8",
           $iw1 % $NOutWord) if ($p & $pOut);
        if ($iteration == (1 + $unroll_count) * $phases - 1) {
            if ($unroll_count == 6) {
                &I(\$c, "mov OutWord[%u] = OutWord[%u]",
                   $iw1 % $NOutWord, $iw0 % $NOutWord);
            }
            &I(\$c, "lfetch.nt1 [InPrefetch], %u",
               $unroll_count * $phases);
            &I(\$c, "lfetch.excl.nt1 [OutPrefetch], %u",
               $unroll_count * $phases);
            &I(\$c, "br.cloop.sptk.few .rc4Loop");
        }
    }

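    # Out-of-line bypass: the in-line "add J = J, SI[...]" above used the S[I]
    # value loaded this iteration, which is stale when this iteration's I equals
    # the previous iteration's J (the previous swap has not been stored yet).
    # The bypass backs that addition out and redoes it with the previous
    # iteration's SI, which is exactly the value the pending swap puts in S[I].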
    if ($pByp) {
        &P(\$bypass, ".rc4Bypass%u:", $label);
        &I(\$bypass, "sub J = J, SI[%u]", $i0 % $NSI);
        &I(\$bypass, "nop 0");
        &I(\$bypass, "nop 0");
        &I(\$bypass, ";;");
        &I(\$bypass, "add J = J, SI[%u]", $i1 % $NSI);
        &I(\$bypass, "mov SI[%u] = SI[%u]", $i0 % $NSI, $i1 % $NSI);
        &I(\$bypass, "br.sptk.many .rc4Resume%u\n", $label);
        &I(\$bypass, ";;");
    }
}

$code=<<___;
.ident \"rc4-ia64.s, version 3.0\"
.ident \"Copyright (c) 2005 Hewlett-Packard Development Company, L.P.\"

#define LCSave          r8
#define PRSave          r9

/* Inputs become invalid once rotation begins! */

#define StateTable      in0
#define DataLen         in1
#define InputBuffer     in2
#define OutputBuffer    in3

#define KTable          r14
#define J               r15
#define InPtr           r16
#define OutPtr          r17
#define InPrefetch      r18
#define OutPrefetch     r19
#define One             r20
#define LoopCount       r21
#define Remainder       r22
#define IFinal          r23
#define EndPtr          r24

#define tmp0            r25
#define tmp1            r26

#define pBypass         p6
#define pDone           p7
#define pSmall          p8
#define pAligned        p9
#define pUnaligned      p10

#define pComputeI       pPhase[0]
#define pComputeJ       pPhase[1]
#define pComputeT       pPhase[2]
#define pOutput         pPhase[3]

#define RetVal          r8
#define L_OK            p7
#define L_NOK           p8

#define _NINPUTS        4
#define _NOUTPUT        0

#define _NROTATE        24
#define _NLOCALS        (_NROTATE - _NINPUTS - _NOUTPUT)

#ifndef SZ
# define SZ     4       // this must be set to sizeof(RC4_INT)
#endif

#if SZ == 1
# define LKEY                   ld1
# define SKEY                   st1
# define KEYADDR(dst, i)        add dst = i, KTable
#elif SZ == 2
# define LKEY                   ld2
# define SKEY                   st2
# define KEYADDR(dst, i)        shladd dst = i, 1, KTable
#elif SZ == 4
# define LKEY                   ld4
# define SKEY                   st4
# define KEYADDR(dst, i)        shladd dst = i, 2, KTable
#else
# define LKEY                   ld8
# define SKEY                   st8
# define KEYADDR(dst, i)        shladd dst = i, 3, KTable
#endif

#if defined(_HPUX_SOURCE) && !defined(_LP64)
# define ADDP   addp4
#else
# define ADDP   add
#endif

/* Define a macro for the bit number of the n-th byte: */

#if defined(_HPUX_SOURCE) || defined(B_ENDIAN)
# define HOST_IS_BIG_ENDIAN
# define BYTE_POS(n)    (56 - (8 * (n)))
#else
# define BYTE_POS(n)    (8 * (n))
#endif
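/* For example, BYTE_POS(0) is bit 0 on a little-endian host but bit 56 on a
   big-endian one, so byte 0 of OutWord always ends up at the lowest address
   when the accumulated word is finally stored. */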

/*
   We must perform the first phase of the pipeline explicitly since
   we will always load from the state table the first time. The
   br.cexit will never be taken, regardless of the number of bytes,
   because the epilogue count is 4.
*/
/* MODSCHED_RC4 macro was split to _PROLOGUE and _LOOP, because HP-UX
   assembler failed on original macro with syntax error. <appro> */
#define MODSCHED_RC4_PROLOGUE \\
        { \\
          ld1 Data[0] = [InPtr], 1; \\
          add IFinal = 1, I[1]; \\
          KEYADDR(IPr[0], I[1]); \\
        } ;; \\
        { \\
          LKEY SI[0] = [IPr[0]]; \\
          mov pr.rot = 0x10000; \\
          mov ar.ec = 4; \\
        } ;; \\
        { \\
          add J = J, SI[0]; \\
          zxt1 I[0] = IFinal; \\
          br.cexit.spnt.few .+16; /* never taken */ \\
        } ;;
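/* The compact loop below relies on the rotating-register machinery set up in
   the prologue: pr.rot = 0x10000 seeds stage predicate pPhase[0] (p16),
   ar.ec = 4 sets the epilogue count to the number of pipeline stages, and
   br.ctop rotates the predicates and registers once per iteration. */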
#define MODSCHED_RC4_LOOP(label) \\
label: \\
        { .mmi; \\
          (pComputeI) ld1 Data[0] = [InPtr], 1; \\
          (pComputeI) add IFinal = 1, I[1]; \\
          (pComputeJ) zxt1 J = J; \\
        }{ .mmi; \\
          (pOutput) LKEY T[1] = [T[1]]; \\
          (pComputeT) add T[0] = SI[2], SJ[1]; \\
          (pComputeI) KEYADDR(IPr[0], I[1]); \\
        } ;; \\
        { .mmi; \\
          (pComputeT) SKEY [IPr[2]] = SJ[1]; \\
          (pComputeT) SKEY [JP[1]] = SI[2]; \\
          (pComputeT) zxt1 T[0] = T[0]; \\
        }{ .mmi; \\
          (pComputeI) LKEY SI[0] = [IPr[0]]; \\
          (pComputeJ) KEYADDR(JP[0], J); \\
          (pComputeI) cmp.eq.unc pBypass, p0 = I[1], J; \\
        } ;; \\
        { .mmi; \\
          (pComputeJ) LKEY SJ[0] = [JP[0]]; \\
          (pOutput) xor Data[3] = Data[3], T[1]; \\
          nop 0x0; \\
        }{ .mmi; \\
          (pComputeT) KEYADDR(T[0], T[0]); \\
          (pBypass) mov SI[0] = SI[1]; \\
          (pComputeI) zxt1 I[0] = IFinal; \\
        } ;; \\
        { .mmb; \\
          (pOutput) st1 [OutPtr] = Data[3], 1; \\
          (pComputeI) add J = J, SI[0]; \\
          br.ctop.sptk.few label; \\
        } ;;

        .text

        .align 32

        .type RC4, \@function
        .global RC4

        .proc RC4
        .prologue

RC4:
        {
          .mmi
          alloc r2 = ar.pfs, _NINPUTS, _NLOCALS, _NOUTPUT, _NROTATE

          .rotr Data[4], I[2], IPr[3], SI[3], JP[2], SJ[2], T[2], \\
                OutWord[2]
          .rotp pPhase[4]

          ADDP InPrefetch = 0, InputBuffer
          ADDP KTable = 0, StateTable
        }
        {
          .mmi
          ADDP InPtr = 0, InputBuffer
          ADDP OutPtr = 0, OutputBuffer
          mov RetVal = r0
        }
        ;;
        {
          .mmi
          lfetch.nt1 [InPrefetch], 0x80
          ADDP OutPrefetch = 0, OutputBuffer
        }
        { // Return 0 if the input length is nonsensical
          .mib
          ADDP StateTable = 0, StateTable
          cmp.ge.unc L_NOK, L_OK = r0, DataLen
(L_NOK)   br.ret.sptk.few rp
        }
        ;;
        {
          .mib
          cmp.eq.or L_NOK, L_OK = r0, InPtr
          cmp.eq.or L_NOK, L_OK = r0, OutPtr
          nop 0x0
        }
        {
          .mib
          cmp.eq.or L_NOK, L_OK = r0, StateTable
          nop 0x0
(L_NOK)   br.ret.sptk.few rp
        }
        ;;
        LKEY I[1] = [KTable], SZ
/* Prefetch the state-table. It contains 256 elements of size SZ */

#if SZ == 1
        ADDP tmp0 = 1*128, StateTable
#elif SZ == 2
        ADDP tmp0 = 3*128, StateTable
        ADDP tmp1 = 2*128, StateTable
#elif SZ == 4
        ADDP tmp0 = 7*128, StateTable
        ADDP tmp1 = 6*128, StateTable
#elif SZ == 8
        ADDP tmp0 = 15*128, StateTable
        ADDP tmp1 = 14*128, StateTable
#endif
        ;;
#if SZ >= 8
        lfetch.fault.nt1 [tmp0], -256   // 15
        lfetch.fault.nt1 [tmp1], -256;;
        lfetch.fault.nt1 [tmp0], -256   // 13
        lfetch.fault.nt1 [tmp1], -256;;
        lfetch.fault.nt1 [tmp0], -256   // 11
        lfetch.fault.nt1 [tmp1], -256;;
        lfetch.fault.nt1 [tmp0], -256   // 9
        lfetch.fault.nt1 [tmp1], -256;;
#endif
#if SZ >= 4
        lfetch.fault.nt1 [tmp0], -256   // 7
        lfetch.fault.nt1 [tmp1], -256;;
        lfetch.fault.nt1 [tmp0], -256   // 5
        lfetch.fault.nt1 [tmp1], -256;;
#endif
#if SZ >= 2
        lfetch.fault.nt1 [tmp0], -256   // 3
        lfetch.fault.nt1 [tmp1], -256;;
#endif
        {
          .mii
          lfetch.fault.nt1 [tmp0]       // 1
          add I[1]=1,I[1];;
          zxt1 I[1]=I[1]
        }
        {
          .mmi
          lfetch.nt1 [InPrefetch], 0x80
          lfetch.excl.nt1 [OutPrefetch], 0x80
          .save pr, PRSave
          mov PRSave = pr
        } ;;
        {
          .mmi
          lfetch.excl.nt1 [OutPrefetch], 0x80
          LKEY J = [KTable], SZ
          ADDP EndPtr = DataLen, InPtr
        } ;;
        {
          .mmi
          ADDP EndPtr = -1, EndPtr      // Make it point to
                                        // last data byte.
          mov One = 1
          .save ar.lc, LCSave
          mov LCSave = ar.lc
          .body
        } ;;
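/* Remainder = (0 - OutPtr) & 7 computed below is the number of bytes needed
   to bring the output pointer up to 8-byte alignment; those bytes are handled
   by the compact modulo-scheduled loop before the unrolled loop takes over. */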
        {
          .mmb
          sub Remainder = 0, OutPtr
          cmp.gtu pSmall, p0 = $threshold, DataLen
(pSmall)  br.cond.dpnt .rc4Remainder    // Data too small for
                                        // big loop.
        } ;;
        {
          .mmi
          and Remainder = 0x7, Remainder
          ;;
          cmp.eq pAligned, pUnaligned = Remainder, r0
          nop 0x0
        } ;;
        {
          .mmb
.pred.rel "mutex",pUnaligned,pAligned
(pUnaligned) add Remainder = -1, Remainder
(pAligned) sub Remainder = EndPtr, InPtr
(pAligned) br.cond.dptk.many .rc4Aligned
        } ;;
        {
          .mmi
          nop 0x0
          nop 0x0
          mov.i ar.lc = Remainder
        }

/* Do the initial few bytes via the compact, modulo-scheduled loop
   until the output pointer is 8-byte-aligned. */

        MODSCHED_RC4_PROLOGUE
        MODSCHED_RC4_LOOP(.RC4AlignLoop)

        {
          .mib
          sub Remainder = EndPtr, InPtr
          zxt1 IFinal = IFinal
          clrrrb                        // Clear CFM.rrb.pr so
          ;;                            // next "mov pr.rot = N"
                                        // does the right thing.
        }
        {
          .mmi
          mov I[1] = IFinal
          nop 0x0
          nop 0x0
        } ;;


.rc4Aligned:

/*
        Unrolled loop count = (Remainder - ($unroll_count+1)*$phases)/($unroll_count*$phases)
*/
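/* 0xaaaaaaaaaaaaaaab is roughly 2**64 * 2/3, so the xmpy.hu below combined
   with the final shr.u by 4 amounts to an unsigned division by 24, i.e. by
   unroll_count * phases iterations per pass of the unrolled loop. */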

        {
          .mlx
          add LoopCount = 1 - ($unroll_count + 1)*$phases, Remainder
          movl Remainder = 0xaaaaaaaaaaaaaaab
        } ;;
        {
          .mmi
          setf.sig f6 = LoopCount       // M2, M3 6 cyc
          setf.sig f7 = Remainder       // M2, M3 6 cyc
          nop 0x0
        } ;;
        {
          .mfb
          nop 0x0
          xmpy.hu f6 = f6, f7
          nop 0x0
        } ;;
        {
          .mmi
          getf.sig LoopCount = f6;;     // M2 5 cyc
          nop 0x0
          shr.u LoopCount = LoopCount, 4
        } ;;
        {
          .mmi
          nop 0x0
          nop 0x0
          mov.i ar.lc = LoopCount
        } ;;

/* Now comes the unrolled loop: */

.rc4Prologue:
___

$iteration = 0;

# Generate the prologue:
$predicates = 1;
for ($i = 0; $i < $phases; ++$i) {
    &emit_body (\$code, \$bypass, $iteration++, $predicates);
    $predicates = ($predicates << 1) | 1;
}

$code.=<<___;
.rc4Loop:
___

# Generate the body:
for ($i = 0; $i < $unroll_count*$phases; ++$i) {
    &emit_body (\$code, \$bypass, $iteration++, $predicates);
}

$code.=<<___;
.rc4Epilogue:
___

# Generate the epilogue:
for ($i = 0; $i < $phases; ++$i) {
    $predicates <<= 1;
    &emit_body (\$code, \$bypass, $iteration++, $predicates);
}

$code.=<<___;
        {
          .mmi
          lfetch.nt1 [EndPtr]           // fetch line with last byte
          mov IFinal = I[1]
          nop 0x0
        }

.rc4Remainder:
        {
          .mmi
          sub Remainder = EndPtr, InPtr // Calculate
                                        // # of bytes
                                        // left - 1
          nop 0x0
          nop 0x0
        } ;;
        {
          .mib
          cmp.eq pDone, p0 = -1, Remainder // done already?
          mov.i ar.lc = Remainder
(pDone)   br.cond.dptk.few .rc4Complete
        }

/* Do the remaining bytes via the compact, modulo-scheduled loop */

        MODSCHED_RC4_PROLOGUE
        MODSCHED_RC4_LOOP(.RC4RestLoop)

.rc4Complete:
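/* Write the updated I and J indices back into the first two RC4_INT slots of
   the state table (IFinal is decremented first because I is kept
   pre-incremented inside the loop) and return 1. */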
        {
          .mmi
          add KTable = -SZ, KTable
          add IFinal = -1, IFinal
          mov ar.lc = LCSave
        } ;;
        {
          .mii
          SKEY [KTable] = J,-SZ
          zxt1 IFinal = IFinal
          mov pr = PRSave, 0x1FFFF
        } ;;
        {
          .mib
          SKEY [KTable] = IFinal
          add RetVal = 1, r0
          br.ret.sptk.few rp
        } ;;
___

# Last but not least, emit the out-of-line bypass code for the unrolled loop:

$code.=$bypass;

$code.=<<___;
        .endp RC4
___

print $code;

close STDOUT;