]> git.ipfire.org Git - thirdparty/openssl.git/blob - crypto/md5/asm/md5-ia64.S
Add final(?) set of copyrights.
[thirdparty/openssl.git] / crypto / md5 / asm / md5-ia64.S
1 /*
2 *
3 * Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved.
4 *
5 * Licensed under the OpenSSL license (the "License"). You may not use
6 * this file except in compliance with the License. You can obtain a copy
7 * in the file LICENSE in the source distribution or at
8 * https://www.openssl.org/source/license.html
9 */
10
11 /* Copyright (c) 2005 Hewlett-Packard Development Company, L.P.
12
13 Permission is hereby granted, free of charge, to any person obtaining
14 a copy of this software and associated documentation files (the
15 "Software"), to deal in the Software without restriction, including
16 without limitation the rights to use, copy, modify, merge, publish,
17 distribute, sublicense, and/or sell copies of the Software, and to
18 permit persons to whom the Software is furnished to do so, subject to
19 the following conditions:
20
21 The above copyright notice and this permission notice shall be
22 included in all copies or substantial portions of the Software.
23
24 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
25 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
26 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
27 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
28 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
29 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
30 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */
31
32 // Common registers are assigned as follows:
33 //
34 // COMMON
35 //
36 // t0 Const Tbl Ptr TPtr
37 // t1 Round Constant TRound
38 // t4 Block residual LenResid
39 // t5 Residual Data DTmp
40 //
41 // {in,out}0 Block 0 Cycle RotateM0
42 // {in,out}1 Block Value 12 M12
43 // {in,out}2 Block Value 8 M8
44 // {in,out}3 Block Value 4 M4
45 // {in,out}4 Block Value 0 M0
46 // {in,out}5 Block 1 Cycle RotateM1
47 // {in,out}6 Block Value 13 M13
48 // {in,out}7 Block Value 9 M9
49 // {in,out}8 Block Value 5 M5
50 // {in,out}9 Block Value 1 M1
51 // {in,out}10 Block 2 Cycle RotateM2
52 // {in,out}11 Block Value 14 M14
53 // {in,out}12 Block Value 10 M10
54 // {in,out}13 Block Value 6 M6
55 // {in,out}14 Block Value 2 M2
56 // {in,out}15 Block 3 Cycle RotateM3
57 // {in,out}16 Block Value 15 M15
58 // {in,out}17 Block Value 11 M11
59 // {in,out}18 Block Value 7 M7
60 // {in,out}19 Block Value 3 M3
61 // {in,out}20 Scratch Z
62 // {in,out}21 Scratch Y
63 // {in,out}22 Scratch X
64 // {in,out}23 Scratch W
65 // {in,out}24 Digest A A
66 // {in,out}25 Digest B B
67 // {in,out}26 Digest C C
68 // {in,out}27 Digest D D
69 // {in,out}28 Active Data Ptr DPtr
70 // in28 Dummy Value -
71 // out28 Dummy Value -
72 // bt0 Coroutine Link QUICK_RTN
73 //
74 /// These predicates are used for computing the padding block(s) and
75 /// are shared between the driver and digest co-routines
76 //
77 // pt0 Extra Pad Block pExtra
78 // pt1 Load next word pLoad
79 // pt2 Skip next word pSkip
80 // pt3 Search for Pad pNoPad
81 // pt4 Pad Word 0 pPad0
82 // pt5 Pad Word 1 pPad1
83 // pt6 Pad Word 2 pPad2
84 // pt7 Pad Word 3 pPad3
85
// Concrete registers behind the shared scratch (t*) and predicate (pt*)
// roles listed in the table above.  QUICK_RTN (b6) is the branch register
// used as the link for calls from the driver into the digest helpers.
86 #define DTmp r19
87 #define LenResid r18
88 #define QUICK_RTN b6
89 #define TPtr r14
90 #define TRound r15
91 #define pExtra p6
92 #define pLoad p7
93 #define pNoPad p9
94 #define pPad0 p10
95 #define pPad1 p11
96 #define pPad2 p12
97 #define pPad3 p13
98 #define pSkip p8
99
// Driver-side aliases.  The trailing underscore marks the name a value
// has inside md5_block_asm_data_order's own frame, where it lives in an
// output register (outN).  The un-suffixed names defined below map the
// same N onto the digest helpers' input registers (inN).
100 #define A_ out24
101 #define B_ out25
102 #define C_ out26
103 #define D_ out27
104 #define DPtr_ out28
105 #define M0_ out4
106 #define M1_ out9
107 #define M10_ out12
108 #define M11_ out17
109 #define M12_ out1
110 #define M13_ out6
111 #define M14_ out11
112 #define M15_ out16
113 #define M2_ out14
114 #define M3_ out19
115 #define M4_ out3
116 #define M5_ out8
117 #define M6_ out13
118 #define M7_ out18
119 #define M8_ out2
120 #define M9_ out7
121 #define RotateM0_ out0
122 #define RotateM1_ out5
123 #define RotateM2_ out10
124 #define RotateM3_ out15
125 #define W_ out23
126 #define X_ out22
127 #define Y_ out21
128 #define Z_ out20
129
// Helper-side aliases: the same values as seen from the digest helpers,
// which allocate _NINPUTS (= MD5_NOUT) inputs and therefore see the
// driver's outN register as inN.  Each group of five consecutive
// registers (RotateMk, M(12+k), M(8+k), M(4+k), Mk) forms one chain that
// the rotating-register loops shift through.
130 #define A in24
131 #define B in25
132 #define C in26
133 #define D in27
134 #define DPtr in28
135 #define M0 in4
136 #define M1 in9
137 #define M10 in12
138 #define M11 in17
139 #define M12 in1
140 #define M13 in6
141 #define M14 in11
142 #define M15 in16
143 #define M2 in14
144 #define M3 in19
145 #define M4 in3
146 #define M5 in8
147 #define M6 in13
148 #define M7 in18
149 #define M8 in2
150 #define M9 in7
151 #define RotateM0 in0
152 #define RotateM1 in5
153 #define RotateM2 in10
154 #define RotateM3 in15
155 #define W in23
156 #define X in22
157 #define Y in21
158 #define Z in20
159
160 /* register stack configuration for md5_block_asm_data_order(): */
/* 3 inputs (context, data pointer, block count), no locals, and 29
   outputs (out0..out28) that are handed to the digest helpers. */
161 #define MD5_NINP 3
162 #define MD5_NLOC 0
163 #define MD5_NOUT 29
164 #define MD5_NROT 0
165
166 /* register stack configuration for helpers: */
/* The helpers take the driver's 29 outs as their inputs.  With
   _NROTATE = 24 the rotating region covers in0..in23 (the RotateM/M
   chains and the W/X/Y/Z scratch); in24..in28 (A, B, C, D, DPtr) are
   outside it and keep their positions across br.ctop. */
167 #define _NINPUTS MD5_NOUT
168 #define _NLOCALS 0
169 #define _NOUTPUT 0
170 #define _NROTATE 24 /* this must be <= _NINPUTS */
171
// ADDP is the instruction used to form pointers from the incoming
// arguments: addp4 when building for HP-UX without _LP64 (presumably the
// ILP32 model, where arguments are 32-bit pointers), plain add otherwise.
172 #if defined(_HPUX_SOURCE) && !defined(_LP64)
173 #define ADDP addp4
174 #else
175 #define ADDP add
176 #endif
177
// HOST_IS_BIG_ENDIAN makes the driver clear psr.be around the hashing
// loop and set it again at .md5_exit, so the ld4 loads of message words
// and round constants are performed little-endian.
178 #if defined(_HPUX_SOURCE) || defined(B_ENDIAN)
179 #define HOST_IS_BIG_ENDIAN
180 #endif
181
182 // Macros for getting the left and right portions of little-endian words
//
// "align" is the byte misalignment (1..3) of the input pointer.
//
//   GETLW: dst = the low 8*align bits of src, moved up to bit
//          position 32 - 8*align (everything else zero).
//   GETRW: dst = the 32 - 8*align bits of src starting at bit 8*align,
//          moved down to bit 0.
//
// OR-ing GETRW of one aligned word with GETLW of the following aligned
// word yields the 32-bit word that straddles the two.
183
184 #define GETLW(dst, src, align) dep.z dst = src, 32 - 8 * align, 8 * align
185 #define GETRW(dst, src, align) extr.u dst = src, 8 * align, 32 - 8 * align
186
187 // MD5 driver
188 //
189 // Reads an input block, then calls the digest block
190 // subroutine and adds the results to the accumulated
191 // digest. It allocates 29 outs (MD5_NOUT) which the subroutine
192 // uses as its inputs and rotating
193 // registers. Initializes the round constant pointer and
194 // takes care of saving/restoring ar.lc
195 //
196 /// INPUT
197 //
198 // in0 Context Ptr CtxPtr0
199 // in1 Input Data Ptr DPtrIn
200 // in2 Integral Blocks BlockCount
201 // rp Return Address -
202 //
203 /// CODE
204 //
205 // v2 Input Align InAlign
206 // t0 Shared w/digest -
207 // t1 Shared w/digest -
208 // t2 Shared w/digest -
209 // t3 Shared w/digest -
210 // t4 Shared w/digest -
211 // t5 Shared w/digest -
212 // t6 PFS Save PFSSave
213 // t7 ar.lc Save LCSave
214 // t8 Saved PR PRSave
215 // t9 2nd CtxPtr CtxPtr1
216 // t10 Table Base CTable
217 // t11 Table[0] CTable0
218 // t13 Accumulator A AccumA
219 // t14 Accumulator B AccumB
220 // t15 Accumulator C AccumC
221 // t16 Accumulator D AccumD
222 // pt0 Shared w/digest -
223 // pt1 Shared w/digest -
224 // pt2 Shared w/digest -
225 // pt3 Shared w/digest -
226 // pt4 Shared w/digest -
227 // pt5 Shared w/digest -
228 // pt6 Shared w/digest -
229 // pt7 Shared w/digest -
230 // pt8 Not Aligned pOff
231 // pt8 Blocks Left pAgain
232
// Registers private to the driver.  AccumA..AccumD hold the running
// digest between blocks; CTable0 caches the first round constant and
// CTable points at the second one once CTable0 has been loaded.
// pAgain and pOff deliberately share p63: pOff is only needed while
// dispatching on alignment, pAgain only at the bottom of a block loop.
233 #define AccumA r27
234 #define AccumB r28
235 #define AccumC r29
236 #define AccumD r30
237 #define CTable r24
238 #define CTable0 r25
239 #define CtxPtr0 in0
240 #define CtxPtr1 r23
241 #define DPtrIn in1
242 #define BlockCount in2
243 #define InAlign r10
244 #define LCSave r21
245 #define PFSSave r20
246 #define PRSave r22
247 #define pAgain p63
248 #define pOff p63
249
250 .text
251
252 /* md5_block_asm_data_order(MD5_CTX *c, const void *data, size_t num)
253
254 where:
255 c: a pointer to a structure of this type:
256
257 typedef struct MD5state_st
258 {
259 MD5_LONG A,B,C,D;
260 MD5_LONG Nl,Nh;
261 MD5_LONG data[MD5_LBLOCK];
262 unsigned int num;
263 }
264 MD5_CTX;
265
266 data: a pointer to the input data (may be misaligned)
267 num: the number of 64-byte blocks to hash (i.e., the length
268 of DATA is 64*NUM).
269
270 */
271
// Driver entry point.  Loads the digest A..D from the context, then for
// each block loads the first four message words, copies the digest into
// the helper's A/B/C/D registers, calls the digest helper through
// QUICK_RTN and adds the helper's result back into the accumulators.
// Inputs whose address is not 4-byte aligned are diverted to
// .md5_unaligned.  On exit the accumulators are stored back to the
// context and pr, ar.lc and ar.pfs are restored.
272 .type md5_block_asm_data_order, @function
273 .global md5_block_asm_data_order
274 .align 32
275 .proc md5_block_asm_data_order
276 md5_block_asm_data_order:
277 .md5_block:
278 .prologue
279 { .mmi
280 .save ar.pfs, PFSSave
281 alloc PFSSave = ar.pfs, MD5_NINP, MD5_NLOC, MD5_NOUT, MD5_NROT
282 ADDP CtxPtr1 = 8, CtxPtr0 // second context pointer, 8 bytes in (C, then D)
283 mov CTable = ip // address of this bundle, i.e. .md5_block
284 }
285 { .mmi
286 ADDP DPtrIn = 0, DPtrIn // normalise the data pointer via ADDP
287 ADDP CtxPtr0 = 0, CtxPtr0 // normalise the context pointer via ADDP
288 .save ar.lc, LCSave
289 mov LCSave = ar.lc
290 }
291 ;;
292 { .mmi
293 add CTable = .md5_tbl_data_order#-.md5_block#, CTable // CTable = &constant table
294 and InAlign = 0x3, DPtrIn // byte misalignment of the input (0..3)
295 }
296
297 { .mmi
298 ld4 AccumA = [CtxPtr0], 4 // A; CtxPtr0 now points at B
299 ld4 AccumC = [CtxPtr1], 4 // C; CtxPtr1 now points at D
300 .save pr, PRSave
301 mov PRSave = pr
302 .body
303 }
304 ;;
305 { .mmi
306 ld4 AccumB = [CtxPtr0] // pointers stay on B and D for the stores at .md5_exit
307 ld4 AccumD = [CtxPtr1]
308 dep DPtr_ = 0, DPtrIn, 0, 2 // data pointer with the low two bits cleared
309 } ;;
310 #ifdef HOST_IS_BIG_ENDIAN
311 rum psr.be;; // switch to little-endian
312 #endif
313 { .mmb
314 ld4 CTable0 = [CTable], 4 // cache constant 0; CTable now points at constant 1
315 cmp.ne pOff, p0 = 0, InAlign
316 (pOff) br.cond.spnt.many .md5_unaligned
317 } ;;
318
319 // The FF load/compute loop rotates values three times, so that
320 // loading into M12 here produces the M0 value, M13 -> M1, etc.
321
322 .md5_block_loop0:
323 { .mmi
324 ld4 M12_ = [DPtr_], 4
325 mov TPtr = CTable // helper fetches constants 1.. through TPtr
326 mov TRound = CTable0 // constant for the first step
327 } ;;
328 { .mmi
329 ld4 M13_ = [DPtr_], 4
330 mov A_ = AccumA
331 mov B_ = AccumB
332 } ;;
333 { .mmi
334 ld4 M14_ = [DPtr_], 4
335 mov C_ = AccumC
336 mov D_ = AccumD
337 } ;;
338 { .mmb
339 ld4 M15_ = [DPtr_], 4
340 add BlockCount = -1, BlockCount
341 br.call.sptk.many QUICK_RTN = md5_digest_block0 // helper loads the other 12 words via DPtr
342 } ;;
343
344 // Now, we add the new digest values and do some clean-up
345 // before checking if there's another full block to process
346
347 { .mmi
348 add AccumA = AccumA, A_
349 add AccumB = AccumB, B_
350 cmp.ne pAgain, p0 = 0, BlockCount
351 }
352 { .mib
353 add AccumC = AccumC, C_
354 add AccumD = AccumD, D_
355 (pAgain) br.cond.dptk.many .md5_block_loop0
356 } ;;
357
358 .md5_exit:
359 #ifdef HOST_IS_BIG_ENDIAN
360 sum psr.be;; // switch back to big-endian mode
361 #endif
362 { .mmi
363 st4 [CtxPtr0] = AccumB, -4 // store B, step back to A
364 st4 [CtxPtr1] = AccumD, -4 // store D, step back to C
365 mov pr = PRSave, 0x1ffff ;; // restore the saved predicates under mask 0x1ffff
366 }
367 { .mmi
368 st4 [CtxPtr0] = AccumA
369 st4 [CtxPtr1] = AccumC
370 mov ar.lc = LCSave
371 } ;;
372 { .mib
373 mov ar.pfs = PFSSave
374 br.ret.sptk.few rp
375 } ;;
376
// MD5UNALIGNED(offset): block loop for input misaligned by "offset" bytes.
//
// On entry at .md5_process<offset>, DTmp holds the aligned word that
// contains the first input bytes; it is reduced to its right portion
// with GETRW.  Each pass of .md5_block_loop<offset> then:
//   - loads the next aligned word into Y_ and builds the first message
//     word as M12_ = GETLW(Y_) | DTmp, leaving GETRW(Y_) in DTmp for
//     the following word;
//   - loads the next three aligned words, unmodified, into M13_..M15_
//     (md5_digest_block<offset> realigns them);
//   - copies the accumulators to A_..D_, calls the helper and adds the
//     result back, exactly as the aligned loop does.
// When BlockCount reaches zero control transfers to .md5_exit.
377 #define MD5UNALIGNED(offset) \
378 .md5_process##offset: \
379 { .mib ; \
380 nop 0x0 ; \
381 GETRW(DTmp, DTmp, offset) ; \
382 } ;; \
383 .md5_block_loop##offset: \
384 { .mmi ; \
385 ld4 Y_ = [DPtr_], 4 ; \
386 mov TPtr = CTable ; \
387 mov TRound = CTable0 ; \
388 } ;; \
389 { .mmi ; \
390 ld4 M13_ = [DPtr_], 4 ; \
391 mov A_ = AccumA ; \
392 mov B_ = AccumB ; \
393 } ;; \
394 { .mii ; \
395 ld4 M14_ = [DPtr_], 4 ; \
396 GETLW(W_, Y_, offset) ; \
397 mov C_ = AccumC ; \
398 } \
399 { .mmi ; \
400 mov D_ = AccumD ;; \
401 or M12_ = W_, DTmp ; \
402 GETRW(DTmp, Y_, offset) ; \
403 } \
404 { .mib ; \
405 ld4 M15_ = [DPtr_], 4 ; \
406 add BlockCount = -1, BlockCount ; \
407 br.call.sptk.many QUICK_RTN = md5_digest_block##offset; \
408 } ;; \
409 { .mmi ; \
410 add AccumA = AccumA, A_ ; \
411 add AccumB = AccumB, B_ ; \
412 cmp.ne pAgain, p0 = 0, BlockCount ; \
413 } \
414 { .mib ; \
415 add AccumC = AccumC, C_ ; \
416 add AccumD = AccumD, D_ ; \
417 (pAgain) br.cond.dptk.many .md5_block_loop##offset ; \
418 } ;; \
419 { .mib ; \
420 nop 0x0 ; \
421 nop 0x0 ; \
422 br.cond.sptk.many .md5_exit ; \
423 } ;;
424
425 .align 32
426 .md5_unaligned:
427 //
428 // Because variable shifts are expensive, we special case each of
429 // the four alignments. In practice, this won't hurt too much
430 // since only one working set of code will be loaded.
431 //
// DTmp receives the aligned word holding the first input bytes.
// Misalignments 1 and 2 branch to their own loops; misalignment 3
// falls through into MD5UNALIGNED(3), which is expanded first below.
432 { .mib
433 ld4 DTmp = [DPtr_], 4
434 cmp.eq pOff, p0 = 1, InAlign
435 (pOff) br.cond.dpnt.many .md5_process1
436 } ;;
437 { .mib
438 cmp.eq pOff, p0 = 2, InAlign
439 nop 0x0
440 (pOff) br.cond.dpnt.many .md5_process2
441 } ;;
442 MD5UNALIGNED(3)
443 MD5UNALIGNED(1)
444 MD5UNALIGNED(2)
445
446 .endp md5_block_asm_data_order
447
448
449 // MD5 Perform the F function and load
450 //
451 // Passed the first 4 words (M0 - M3) and initial (A, B, C, D) values,
452 // computes the FF() round of functions, then branches to the common
453 // digest code to finish up with GG(), HH, and II().
454 //
455 // INPUT
456 //
457 // rp Return Address -
458 //
459 // CODE
460 //
461 // v0 PFS bit bucket PFS
462 // v1 Loop Trip Count LTrip
463 // pt0 Load next word pMore
464
465 /* For F round: */
/* LTrip counts down the passes in which more words are loaded; PFS only
   receives the alloc result; pMore (p6, the same register as pExtra)
   gates the word loads. */
466 #define LTrip r9
467 #define PFS r8
468 #define pMore p6
469
470 /* For GHI rounds: */
/* Extra temporaries for the word shuffles.  T reuses r9 (LTrip) and U
   reuses r10 (the driver's InAlign, which the driver only reads before
   its first helper call). */
471 #define T r9
472 #define U r10
473 #define V r11
474
// COMPUTE(a, b, s, M, R): second half of a G/H/I step.
//
// Expects the step sum in Z.  Fetches the next round constant into
// TRound, rotates the low 32 bits of Z left by s (dep.z copies them to
// the upper half of Y so that shrp can pull them back in), and sets
// a = rotated Z + b.  M is copied to R, the head of its rotating chain.
475 #define COMPUTE(a, b, s, M, R) \
476 { \
477 .mii ; \
478 ld4 TRound = [TPtr], 4 ; \
479 dep.z Y = Z, 32, 32 ;; \
480 shrp Z = Z, Y, 64 - s ; \
481 } ;; \
482 { \
483 .mmi ; \
484 add a = Z, b ; \
485 mov R = M ; \
486 nop 0x0 ; \
487 } ;;
488
// LOOP(a, b, s, M, R, label): same work as COMPUTE, but the final bundle
// ends in br.ctop to "label", closing one pass of a counted round loop.
489 #define LOOP(a, b, s, M, R, label) \
490 { .mii ; \
491 ld4 TRound = [TPtr], 4 ; \
492 dep.z Y = Z, 32, 32 ;; \
493 shrp Z = Z, Y, 64 - s ; \
494 } ;; \
495 { .mib ; \
496 add a = Z, b ; \
497 mov R = M ; \
498 br.ctop.sptk.many label ; \
499 } ;;
500
501 // G(B, C, D) = (B & D) | (C & ~D)
//
// The macro leaves Z = a + M + TRound + G(b, c, d), ready for the
// rotate-and-add done by COMPUTE or LOOP.  X and Y are clobbered.
502
503 #define G(a, b, c, d, M) \
504 { .mmi ; \
505 add Z = M, TRound ; \
506 and Y = b, d ; \
507 andcm X = c, d ; \
508 } ;; \
509 { .mii ; \
510 add Z = Z, a ; \
511 or Y = Y, X ;; \
512 add Z = Z, Y ; \
513 } ;;
514
515 // H(B, C, D) = B ^ C ^ D
//
// The macro leaves Z = a + M + TRound + H(b, c, d), ready for the
// rotate-and-add done by COMPUTE or LOOP.  Y is clobbered.
516
517 #define H(a, b, c, d, M) \
518 { .mmi ; \
519 add Z = M, TRound ; \
520 xor Y = b, c ; \
521 nop 0x0 ; \
522 } ;; \
523 { .mii ; \
524 add Z = Z, a ; \
525 xor Y = Y, d ;; \
526 add Z = Z, Y ; \
527 } ;;
528
529 // I(B, C, D) = C ^ (B | ~D)
530 //
531 // However, since we have an andcm operator, we use the fact that
532 //
533 // Y ^ Z == ~Y ^ ~Z
534 //
535 // to rewrite the expression as
536 //
537 // I(B, C, D) = ~C ^ (~B & D)
//
// In the macro, Y = d & ~b and X = ~c (andcm with -1), so Y ^ X is the
// rewritten form.  It leaves Z = a + M + TRound + I(b, c, d); X and Y
// are clobbered.
538
539 #define I(a, b, c, d, M) \
540 { .mmi ; \
541 add Z = M, TRound ; \
542 andcm Y = d, b ; \
543 andcm X = -1, c ; \
544 } ;; \
545 { .mii ; \
546 add Z = Z, a ; \
547 xor Y = Y, X ;; \
548 add Z = Z, Y ; \
549 } ;;
550
// GG4(label): one loop pass of the G round, i.e. four G steps updating
// A, D, C, B in turn with left-rotations 5, 9, 14 and 20.  The last step
// uses LOOP, so the pass ends with br.ctop to "label".
551 #define GG4(label) \
552 G(A, B, C, D, M0) \
553 COMPUTE(A, B, 5, M0, RotateM0) \
554 G(D, A, B, C, M1) \
555 COMPUTE(D, A, 9, M1, RotateM1) \
556 G(C, D, A, B, M2) \
557 COMPUTE(C, D, 14, M2, RotateM2) \
558 G(B, C, D, A, M3) \
559 LOOP(B, C, 20, M3, RotateM3, label)
560
// HH4(label): one loop pass of the H round, i.e. four H steps updating
// A, D, C, B in turn with left-rotations 4, 11, 16 and 23.  The last
// step uses LOOP, so the pass ends with br.ctop to "label".
561 #define HH4(label) \
562 H(A, B, C, D, M0) \
563 COMPUTE(A, B, 4, M0, RotateM0) \
564 H(D, A, B, C, M1) \
565 COMPUTE(D, A, 11, M1, RotateM1) \
566 H(C, D, A, B, M2) \
567 COMPUTE(C, D, 16, M2, RotateM2) \
568 H(B, C, D, A, M3) \
569 LOOP(B, C, 23, M3, RotateM3, label)
570
// II4(label): one loop pass of the I round, i.e. four I steps updating
// A, D, C, B in turn with left-rotations 6, 10, 15 and 21.  The last
// step uses LOOP, so the pass ends with br.ctop to "label".
571 #define II4(label) \
572 I(A, B, C, D, M0) \
573 COMPUTE(A, B, 6, M0, RotateM0) \
574 I(D, A, B, C, M1) \
575 COMPUTE(D, A, 10, M1, RotateM1) \
576 I(C, D, A, B, M2) \
577 COMPUTE(C, D, 15, M2, RotateM2) \
578 I(B, C, D, A, M3) \
579 LOOP(B, C, 21, M3, RotateM3, label)
580
// FFLOAD(a, b, c, d, M, N, s): one F step combined with a word load.
//
// While pMore is set, the next input word is loaded from DPtr into N.
// The step computes F(b, c, d) = (b & c) | (~b & d) in Y, forms
// Z = a + M + TRound + F, fetches the next round constant, rotates the
// low 32 bits of Z left by s and sets a = rotated Z + b.
581 #define FFLOAD(a, b, c, d, M, N, s) \
582 { .mii ; \
583 (pMore) ld4 N = [DPtr], 4 ; \
584 add Z = M, TRound ; \
585 and Y = c, b ; \
586 } \
587 { .mmi ; \
588 andcm X = d, b ;; \
589 add Z = Z, a ; \
590 or Y = Y, X ; \
591 } ;; \
592 { .mii ; \
593 ld4 TRound = [TPtr], 4 ; \
594 add Z = Z, Y ;; \
595 dep.z Y = Z, 32, 32 ; \
596 } ;; \
597 { .mii ; \
598 nop 0x0 ; \
599 shrp Z = Z, Y, 64 - s ;; \
600 add a = Z, b ; \
601 } ;;
602
// FFLOOP(a, b, c, d, M, N, s, dest): FFLOAD plus the loop-closing bundle.
//
// The extra bundle sets pMore = (LTrip != 0) from the value LTrip has
// before its decrement, decrements LTrip, and branches with br.ctop to
// "dest".  With LTrip starting at 2, pMore stays set for the first three
// passes and is clear for the fourth.
603 #define FFLOOP(a, b, c, d, M, N, s, dest) \
604 { .mii ; \
605 (pMore) ld4 N = [DPtr], 4 ; \
606 add Z = M, TRound ; \
607 and Y = c, b ; \
608 } \
609 { .mmi ; \
610 andcm X = d, b ;; \
611 add Z = Z, a ; \
612 or Y = Y, X ; \
613 } ;; \
614 { .mii ; \
615 ld4 TRound = [TPtr], 4 ; \
616 add Z = Z, Y ;; \
617 dep.z Y = Z, 32, 32 ; \
618 } ;; \
619 { .mii ; \
620 nop 0x0 ; \
621 shrp Z = Z, Y, 64 - s ;; \
622 add a = Z, b ; \
623 } \
624 { .mib ; \
625 cmp.ne pMore, p0 = 0, LTrip ; \
626 add LTrip = -1, LTrip ; \
627 br.ctop.dptk.many dest ; \
628 } ;;
629
// F round for 4-byte-aligned input.  Entered from the driver by
// br.call with QUICK_RTN as the link.  Runs the four-step FF pass four
// times, loading the remaining twelve message words on the way, then
// falls through into md5_digest_GHI.
630 .type md5_digest_block0, @function
631 .align 32
632
633 .proc md5_digest_block0
634 .prologue
635 md5_digest_block0:
636 .altrp QUICK_RTN
637 .body
638 { .mmi
639 alloc PFS = ar.pfs, _NINPUTS, _NLOCALS, _NOUTPUT, _NROTATE
640 mov LTrip = 2 // keeps pMore set through the third pass
641 mov ar.lc = 3 // four passes of the loop below
642 } ;;
643 { .mii
644 cmp.eq pMore, p0 = r0, r0 // pMore = true: load words on the first pass
645 mov ar.ec = 0
646 nop 0x0
647 } ;;
648
649 .md5_FF_round0:
650 FFLOAD(A, B, C, D, M12, RotateM0, 7)
651 FFLOAD(D, A, B, C, M13, RotateM1, 12)
652 FFLOAD(C, D, A, B, M14, RotateM2, 17)
653 FFLOOP(B, C, D, A, M15, RotateM3, 22, .md5_FF_round0)
654 //
655 // !!! Fall through to md5_digest_GHI
656 //
657 .endp md5_digest_block0
658
// G, H and I rounds, shared by all F-round variants.  Reached by
// fall-through from md5_digest_block0 or by branch from the unaligned
// helpers, so it has no alloc of its own; .regstk repeats the frame
// layout for the assembler.  Returns to the driver through QUICK_RTN.
659 .type md5_digest_GHI, @function
660 .align 32
661
662 .proc md5_digest_GHI
663 .prologue
664 .regstk _NINPUTS, _NLOCALS, _NOUTPUT, _NROTATE
665 md5_digest_GHI:
666 .altrp QUICK_RTN
667 .body
668 //
669 // The following sequence shuffles the block constants round for the
670 // next round:
671 //
672 // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
673 // 1 6 11 0 5 10 15 4 9 14 3 8 13 2 7 12
674 //
// The upper row is the slot (M0..M15), the lower row the message word
// that the slot holds after the shuffle.  Six values are parked in
// Z, Y, X, W, V, U while the remaining moves overwrite their slots.
// ar.lc = 3 and ar.ec = 1 are set up here for the GG loop.
675 { .mmi
676 mov Z = M0
677 mov Y = M15
678 mov ar.lc = 3
679 }
680 { .mmi
681 mov X = M2
682 mov W = M9
683 mov V = M4
684 } ;;
685
686 { .mmi
687 mov M0 = M1
688 mov M15 = M12
689 mov ar.ec = 1
690 }
691 { .mmi
692 mov M2 = M11
693 mov M9 = M14
694 mov M4 = M5
695 } ;;
696
697 { .mmi
698 mov M1 = M6
699 mov M12 = M13
700 mov U = M3
701 }
702 { .mmi
703 mov M11 = M8
704 mov M14 = M7
705 mov M5 = M10
706 } ;;
707
708 { .mmi
709 mov M6 = Y
710 mov M13 = X
711 mov M3 = Z
712 }
713 { .mmi
714 mov M8 = W
715 mov M7 = V
716 mov M10 = U
717 } ;;
718
719 .md5_GG_round:
720 GG4(.md5_GG_round)
721
722 // The following sequence shuffles the block constants round for the
723 // next round:
724 //
725 // 1 6 11 0 5 10 15 4 9 14 3 8 13 2 7 12
726 // 5 8 11 14 1 4 7 10 13 0 3 6 9 12 15 2
727
// Upper row: the message word currently held by slots M0..M15 (the G
// round order).  Lower row: the word each slot must hold for the H
// round.  Z, Y, X, W, V, U and T park values whose slots are
// overwritten first.  ar.lc = 3 and ar.ec = 1 are reloaded for the HH
// loop.
728 { .mmi
729 mov Z = M0
730 mov Y = M1
731 mov ar.lc = 3
732 }
733 { .mmi
734 mov X = M3
735 mov W = M5
736 mov V = M6
737 } ;;
738
739 { .mmi
740 mov M0 = M4
741 mov M1 = M11
742 mov ar.ec = 1
743 }
744 { .mmi
745 mov M3 = M9
746 mov U = M8
747 mov T = M13
748 } ;;
749
750 { .mmi
751 mov M4 = Z
752 mov M11 = Y
753 mov M5 = M7
754 }
755 { .mmi
756 mov M6 = M14
757 mov M8 = M12
758 mov M13 = M15
759 } ;;
760
761 { .mmi
762 mov M7 = W
763 mov M14 = V
764 nop 0x0
765 }
766 { .mmi
767 mov M9 = X
768 mov M12 = U
769 mov M15 = T
770 } ;;
771
772 .md5_HH_round:
773 HH4(.md5_HH_round)
774
775 // The following sequence shuffles the block constants round for the
776 // next round:
777 //
778 // 5 8 11 14 1 4 7 10 13 0 3 6 9 12 15 2
779 // 0 7 14 5 12 3 10 1 8 15 6 13 4 11 2 9
780
// Upper row: the message word currently held by slots M0..M15 (the H
// round order).  Lower row: the word each slot must hold for the I
// round.  Z, Y, X, W, V and U park values whose slots are overwritten
// first.  ar.lc = 3 and ar.ec = 1 are reloaded for the II loop.
781 { .mmi
782 mov Z = M0
783 mov Y = M15
784 mov ar.lc = 3
785 }
786 { .mmi
787 mov X = M10
788 mov W = M1
789 mov V = M4
790 } ;;
791
792 { .mmi
793 mov M0 = M9
794 mov M15 = M12
795 mov ar.ec = 1
796 }
797 { .mmi
798 mov M10 = M11
799 mov M1 = M6
800 mov M4 = M13
801 } ;;
802
803 { .mmi
804 mov M9 = M14
805 mov M12 = M5
806 mov U = M3
807 }
808 { .mmi
809 mov M11 = M8
810 mov M6 = M7
811 mov M13 = M2
812 } ;;
813
814 { .mmi
815 mov M14 = Y
816 mov M5 = X
817 mov M3 = Z
818 }
819 { .mmi
820 mov M8 = W
821 mov M7 = V
822 mov M2 = U
823 } ;;
824
825 .md5_II_round:
826 II4(.md5_II_round)
827
// All four rounds are done: return to the driver through the QUICK_RTN
// link.  The driver reads the result from A_, B_, C_ and D_.
828 { .mib
829 nop 0x0
830 nop 0x0
831 br.ret.sptk.many QUICK_RTN
832 } ;;
833
834 .endp md5_digest_GHI
835
// FFLOADU(a, b, c, d, M, P, N, s, offset): F step for misaligned input.
//
// Performs the same load-and-F-step work as FFLOAD on M and N, and in
// addition realigns P, the raw aligned word that the following step
// will consume: W = GETLW(P) | DTmp, DTmp = GETRW(P), P = W.  DTmp thus
// always carries the leftover bytes of the most recently fixed word.
836 #define FFLOADU(a, b, c, d, M, P, N, s, offset) \
837 { .mii ; \
838 (pMore) ld4 N = [DPtr], 4 ; \
839 add Z = M, TRound ; \
840 and Y = c, b ; \
841 } \
842 { .mmi ; \
843 andcm X = d, b ;; \
844 add Z = Z, a ; \
845 or Y = Y, X ; \
846 } ;; \
847 { .mii ; \
848 ld4 TRound = [TPtr], 4 ; \
849 GETLW(W, P, offset) ; \
850 add Z = Z, Y ; \
851 } ;; \
852 { .mii ; \
853 or W = W, DTmp ; \
854 dep.z Y = Z, 32, 32 ;; \
855 shrp Z = Z, Y, 64 - s ; \
856 } ;; \
857 { .mii ; \
858 add a = Z, b ; \
859 GETRW(DTmp, P, offset) ; \
860 mov P = W ; \
861 } ;;
862
// FFLOOPU(a, b, c, d, M, P, N, s, offset): last F step of a pass for
// misaligned input.
//
// Like FFLOADU, except that the realignment of P is predicated on
// pMore, so it is skipped on the pass in which no further word is
// loaded.  The closing bundle updates pMore from LTrip, decrements
// LTrip and branches with br.ctop to .md5_FF_round<offset>.
863 #define FFLOOPU(a, b, c, d, M, P, N, s, offset) \
864 { .mii ; \
865 (pMore) ld4 N = [DPtr], 4 ; \
866 add Z = M, TRound ; \
867 and Y = c, b ; \
868 } \
869 { .mmi ; \
870 andcm X = d, b ;; \
871 add Z = Z, a ; \
872 or Y = Y, X ; \
873 } ;; \
874 { .mii ; \
875 ld4 TRound = [TPtr], 4 ; \
876 (pMore) GETLW(W, P, offset) ; \
877 add Z = Z, Y ; \
878 } ;; \
879 { .mii ; \
880 (pMore) or W = W, DTmp ; \
881 dep.z Y = Z, 32, 32 ;; \
882 shrp Z = Z, Y, 64 - s ; \
883 } ;; \
884 { .mii ; \
885 add a = Z, b ; \
886 (pMore) GETRW(DTmp, P, offset) ; \
887 (pMore) mov P = W ; \
888 } \
889 { .mib ; \
890 cmp.ne pMore, p0 = 0, LTrip ; \
891 add LTrip = -1, LTrip ; \
892 br.ctop.sptk.many .md5_FF_round##offset ; \
893 } ;;
894
// MD5FBLOCK(offset): defines md5_digest_block<offset>, the F round for
// input misaligned by "offset" bytes.
//
// The setup mirrors md5_digest_block0 (alloc, LTrip = 2, ar.lc = 3,
// ar.ec = 0, pMore = true).  The loop uses FFLOADU/FFLOOPU so each step
// also realigns the word needed by the next step; the word handed to
// FFLOOPU for fixing is RotateM0, which was loaded earlier in the same
// pass.  After the loop, control branches to md5_digest_GHI.
895 #define MD5FBLOCK(offset) \
896 .type md5_digest_block##offset, @function ; \
897 \
898 .align 32 ; \
899 .proc md5_digest_block##offset ; \
900 .prologue ; \
901 .altrp QUICK_RTN ; \
902 .body ; \
903 md5_digest_block##offset: \
904 { .mmi ; \
905 alloc PFS = ar.pfs, _NINPUTS, _NLOCALS, _NOUTPUT, _NROTATE ; \
906 mov LTrip = 2 ; \
907 mov ar.lc = 3 ; \
908 } ;; \
909 { .mii ; \
910 cmp.eq pMore, p0 = r0, r0 ; \
911 mov ar.ec = 0 ; \
912 nop 0x0 ; \
913 } ;; \
914 \
915 .pred.rel "mutex", pLoad, pSkip ; \
916 .md5_FF_round##offset: \
917 FFLOADU(A, B, C, D, M12, M13, RotateM0, 7, offset) \
918 FFLOADU(D, A, B, C, M13, M14, RotateM1, 12, offset) \
919 FFLOADU(C, D, A, B, M14, M15, RotateM2, 17, offset) \
920 FFLOOPU(B, C, D, A, M15, RotateM0, RotateM3, 22, offset) \
921 \
922 { .mib ; \
923 nop 0x0 ; \
924 nop 0x0 ; \
925 br.cond.sptk.many md5_digest_GHI ; \
926 } ;; \
927 .endp md5_digest_block##offset
928
// One helper per misalignment; the driver's MD5UNALIGNED(n) loop calls
// md5_digest_block<n>.
929 MD5FBLOCK(1)
930 MD5FBLOCK(2)
931 MD5FBLOCK(3)
932
// MD5 round constants, one 32-bit value per step, stored as explicit
// little-endian bytes so the layout does not depend on the assembler's
// data byte order.  The table sits in the same section as the code
// because the driver locates it relative to .md5_block.
//
// Access pattern: the driver loads entry 0 into CTable0 and leaves
// CTable pointing at entry 1; every one of the 64 steps then executes
// "ld4 TRound = [TPtr], 4" to prefetch the constant for the step after
// it.  The final step therefore reads entry 64.  That value is never
// used, but the load itself is real, so a pad entry is provided to keep
// the access inside this object.
933 .align 64
934 .type md5_constants, @object
935 md5_constants:
936 .md5_tbl_data_order: // To ensure little-endian data
937 // order, code as bytes.
938 data1 0x78, 0xa4, 0x6a, 0xd7 // 0
939 data1 0x56, 0xb7, 0xc7, 0xe8 // 1
940 data1 0xdb, 0x70, 0x20, 0x24 // 2
941 data1 0xee, 0xce, 0xbd, 0xc1 // 3
942 data1 0xaf, 0x0f, 0x7c, 0xf5 // 4
943 data1 0x2a, 0xc6, 0x87, 0x47 // 5
944 data1 0x13, 0x46, 0x30, 0xa8 // 6
945 data1 0x01, 0x95, 0x46, 0xfd // 7
946 data1 0xd8, 0x98, 0x80, 0x69 // 8
947 data1 0xaf, 0xf7, 0x44, 0x8b // 9
948 data1 0xb1, 0x5b, 0xff, 0xff // 10
949 data1 0xbe, 0xd7, 0x5c, 0x89 // 11
950 data1 0x22, 0x11, 0x90, 0x6b // 12
951 data1 0x93, 0x71, 0x98, 0xfd // 13
952 data1 0x8e, 0x43, 0x79, 0xa6 // 14
953 data1 0x21, 0x08, 0xb4, 0x49 // 15
954 data1 0x62, 0x25, 0x1e, 0xf6 // 16
955 data1 0x40, 0xb3, 0x40, 0xc0 // 17
956 data1 0x51, 0x5a, 0x5e, 0x26 // 18
957 data1 0xaa, 0xc7, 0xb6, 0xe9 // 19
958 data1 0x5d, 0x10, 0x2f, 0xd6 // 20
959 data1 0x53, 0x14, 0x44, 0x02 // 21
960 data1 0x81, 0xe6, 0xa1, 0xd8 // 22
961 data1 0xc8, 0xfb, 0xd3, 0xe7 // 23
962 data1 0xe6, 0xcd, 0xe1, 0x21 // 24
963 data1 0xd6, 0x07, 0x37, 0xc3 // 25
964 data1 0x87, 0x0d, 0xd5, 0xf4 // 26
965 data1 0xed, 0x14, 0x5a, 0x45 // 27
966 data1 0x05, 0xe9, 0xe3, 0xa9 // 28
967 data1 0xf8, 0xa3, 0xef, 0xfc // 29
968 data1 0xd9, 0x02, 0x6f, 0x67 // 30
969 data1 0x8a, 0x4c, 0x2a, 0x8d // 31
970 data1 0x42, 0x39, 0xfa, 0xff // 32
971 data1 0x81, 0xf6, 0x71, 0x87 // 33
972 data1 0x22, 0x61, 0x9d, 0x6d // 34
973 data1 0x0c, 0x38, 0xe5, 0xfd // 35
974 data1 0x44, 0xea, 0xbe, 0xa4 // 36
975 data1 0xa9, 0xcf, 0xde, 0x4b // 37
976 data1 0x60, 0x4b, 0xbb, 0xf6 // 38
977 data1 0x70, 0xbc, 0xbf, 0xbe // 39
978 data1 0xc6, 0x7e, 0x9b, 0x28 // 40
979 data1 0xfa, 0x27, 0xa1, 0xea // 41
980 data1 0x85, 0x30, 0xef, 0xd4 // 42
981 data1 0x05, 0x1d, 0x88, 0x04 // 43
982 data1 0x39, 0xd0, 0xd4, 0xd9 // 44
983 data1 0xe5, 0x99, 0xdb, 0xe6 // 45
984 data1 0xf8, 0x7c, 0xa2, 0x1f // 46
985 data1 0x65, 0x56, 0xac, 0xc4 // 47
986 data1 0x44, 0x22, 0x29, 0xf4 // 48
987 data1 0x97, 0xff, 0x2a, 0x43 // 49
988 data1 0xa7, 0x23, 0x94, 0xab // 50
989 data1 0x39, 0xa0, 0x93, 0xfc // 51
990 data1 0xc3, 0x59, 0x5b, 0x65 // 52
991 data1 0x92, 0xcc, 0x0c, 0x8f // 53
992 data1 0x7d, 0xf4, 0xef, 0xff // 54
993 data1 0xd1, 0x5d, 0x84, 0x85 // 55
994 data1 0x4f, 0x7e, 0xa8, 0x6f // 56
995 data1 0xe0, 0xe6, 0x2c, 0xfe // 57
996 data1 0x14, 0x43, 0x01, 0xa3 // 58
997 data1 0xa1, 0x11, 0x08, 0x4e // 59
998 data1 0x82, 0x7e, 0x53, 0xf7 // 60
999 data1 0x35, 0xf2, 0x3a, 0xbd // 61
1000 data1 0xbb, 0xd2, 0xd7, 0x2a // 62
1001 data1 0x91, 0xd3, 0x86, 0xeb // 63
data1 0x00, 0x00, 0x00, 0x00 // 64: pad, target of the last step's prefetch (value unused)
1002 .size md5_constants#,65*4