crypto/md5/asm/md5-ia64.S

   1 /*
   2  *
   3  * Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved.
   4  *
   5  * Licensed under the OpenSSL license (the "License").  You may not use
   6  * this file except in compliance with the License.  You can obtain a copy
   7  * in the file LICENSE in the source distribution or at
   8  * https://www.openssl.org/source/license.html
   9  */
  10
  11 /* Copyright (c) 2005 Hewlett-Packard Development Company, L.P.
  12
  13 Permission is hereby granted, free of charge, to any person obtaining
  14 a copy of this software and associated documentation files (the
  15 "Software"), to deal in the Software without restriction, including
  16 without limitation the rights to use, copy, modify, merge, publish,
  17 distribute, sublicense, and/or sell copies of the Software, and to
  18 permit persons to whom the Software is furnished to do so, subject to
  19 the following conditions:
  20
  21 The above copyright notice and this permission notice shall be
  22 included in all copies or substantial portions of the Software.
  23
  24 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  25 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  26 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  27 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
  28 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  29 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
  30 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.  */
  31
  32 //      Common registers are assigned as follows:
  33 //
  34 //      COMMON
  35 //
  36 //      t0              Const Tbl Ptr   TPtr
  37 //      t1              Round Constant  TRound
  38 //      t4              Block residual  LenResid
  39 //      t5              Residual Data   DTmp
  40 //
  41 //      {in,out}0       Block 0 Cycle   RotateM0
  42 //      {in,out}1       Block Value 12  M12
  43 //      {in,out}2       Block Value 8   M8
  44 //      {in,out}3       Block Value 4   M4
  45 //      {in,out}4       Block Value 0   M0
  46 //      {in,out}5       Block 1 Cycle   RotateM1
  47 //      {in,out}6       Block Value 13  M13
  48 //      {in,out}7       Block Value 9   M9
  49 //      {in,out}8       Block Value 5   M5
  50 //      {in,out}9       Block Value 1   M1
  51 //      {in,out}10      Block 2 Cycle   RotateM2
  52 //      {in,out}11      Block Value 14  M14
  53 //      {in,out}12      Block Value 10  M10
  54 //      {in,out}13      Block Value 6   M6
  55 //      {in,out}14      Block Value 2   M2
  56 //      {in,out}15      Block 3 Cycle   RotateM3
  57 //      {in,out}16      Block Value 15  M15
  58 //      {in,out}17      Block Value 11  M11
  59 //      {in,out}18      Block Value 7   M7
  60 //      {in,out}19      Block Value 3   M3
  61 //      {in,out}20      Scratch                 Z
  62 //      {in,out}21      Scratch                 Y
  63 //      {in,out}22      Scratch                 X
  64 //      {in,out}23      Scratch                 W
  65 //      {in,out}24      Digest A                A
  66 //      {in,out}25      Digest B                B
  67 //      {in,out}26      Digest C                C
  68 //      {in,out}27      Digest D                D
  69 //      {in,out}28      Active Data Ptr DPtr
  70 //      in28            Dummy Value             -
  71 //      out28           Dummy Value             -
  72 //      bt0                     Coroutine Link  QUICK_RTN
  73 //
  74 ///     These predicates are used for computing the padding block(s) and
  75 ///     are shared between the driver and digest co-routines
  76 //
  77 //      pt0                     Extra Pad Block pExtra
  78 //      pt1                     Load next word  pLoad
  79 //      pt2                     Skip next word  pSkip
  80 //      pt3                     Search for Pad  pNoPad
  81 //      pt4                     Pad Word 0              pPad0
  82 //      pt5                     Pad Word 1              pPad1
  83 //      pt6                     Pad Word 2              pPad2
  84 //      pt7                     Pad Word 3              pPad3
  85 // Scratch registers and predicates shared by the driver and the digest helpers.
  86 #define DTmp            r19
  87 #define LenResid        r18
  88 #define QUICK_RTN       b6
  89 #define TPtr            r14
  90 #define TRound          r15
  91 #define pExtra          p6
  92 #define pLoad           p7
  93 #define pNoPad          p9
  94 #define pPad0           p10
  95 #define pPad1           p11
  96 #define pPad2           p12
  97 #define pPad3           p13
  98 #define pSkip           p8
  99 // Trailing-underscore names map the working set onto outN registers (used by the driver).
 100 #define A_              out24
 101 #define B_              out25
 102 #define C_              out26
 103 #define D_              out27
 104 #define DPtr_           out28
 105 #define M0_             out4
 106 #define M1_             out9
 107 #define M10_            out12
 108 #define M11_            out17
 109 #define M12_            out1
 110 #define M13_            out6
 111 #define M14_            out11
 112 #define M15_            out16
 113 #define M2_             out14
 114 #define M3_             out19
 115 #define M4_             out3
 116 #define M5_             out8
 117 #define M6_             out13
 118 #define M7_             out18
 119 #define M8_             out2
 120 #define M9_             out7
 121 #define RotateM0_       out0
 122 #define RotateM1_       out5
 123 #define RotateM2_       out10
 124 #define RotateM3_       out15
 125 #define W_              out23
 126 #define X_              out22
 127 #define Y_              out21
 128 #define Z_              out20
 129 // The same working set mapped onto inN registers (used by the digest helpers).
 130 #define A               in24
 131 #define B               in25
 132 #define C               in26
 133 #define D               in27
 134 #define DPtr            in28
 135 #define M0              in4
 136 #define M1              in9
 137 #define M10             in12
 138 #define M11             in17
 139 #define M12             in1
 140 #define M13             in6
 141 #define M14             in11
 142 #define M15             in16
 143 #define M2              in14
 144 #define M3              in19
 145 #define M4              in3
 146 #define M5              in8
 147 #define M6              in13
 148 #define M7              in18
 149 #define M8              in2
 150 #define M9              in7
 151 #define RotateM0        in0
 152 #define RotateM1        in5
 153 #define RotateM2        in10
 154 #define RotateM3        in15
 155 #define W               in23
 156 #define X               in22
 157 #define Y               in21
 158 #define Z               in20
 159
 160 /* register stack configuration for md5_block_asm_data_order(): */
 161 #define MD5_NINP        3
 162 #define MD5_NLOC        0
 163 #define MD5_NOUT        29
 164 #define MD5_NROT        0
 165 // The helpers declare as many inputs as the driver declares outputs (_NINPUTS is MD5_NOUT).
 166 /* register stack configuration for helpers: */
 167 #define _NINPUTS        MD5_NOUT
 168 #define _NLOCALS        0
 169 #define _NOUTPUT        0
 170 #define _NROTATE        24      /* this must be <= _NINPUTS */
 171 // ADDP is addp4 for HP-UX builds without _LP64, and plain add otherwise.
 172 #if defined(_HPUX_SOURCE) && !defined(_LP64)
 173 #define ADDP    addp4
 174 #else
 175 #define ADDP    add
 176 #endif
 177 // HP-UX and B_ENDIAN builds are flagged as big-endian hosts.
 178 #if defined(_HPUX_SOURCE) || defined(B_ENDIAN)
 179 #define HOST_IS_BIG_ENDIAN
 180 #endif
 181
 182 //      Macros for getting the left and right portions of little-endian words
 183 //      GETLW: low 8*align bits of src placed at bit 32 - 8*align, rest zero.  GETRW: the 32 - 8*align bits of src starting at bit 8*align.
 184 #define GETLW(dst, src, align)  dep.z dst = src, 32 - 8 * align, 8 * align
 185 #define GETRW(dst, src, align)  extr.u dst = src, 8 * align, 32 - 8 * align
 186
 187 //      MD5 driver
 188 //
 189 //              Reads an input block, then calls the digest block
 190 //              subroutine and adds the results to the accumulated
 191 //              digest.  It allocates MD5_NOUT (29) outs which the subroutine
 192 //              uses as its inputs and rotating
 193 //              registers. Initializes the round constant pointer and
 194 //              takes care of saving/restoring ar.lc
 195 //
 196 ///     INPUT
 197 //
 198 //      in0             Context Ptr             CtxPtr0
 199 //      in1             Input Data Ptr          DPtrIn
 200 //      in2             Integral Blocks         BlockCount
 201 //      rp              Return Address          -
 202 //
 203 ///     CODE
 204 //
 205 //      v2              Input Align             InAlign
 206 //      t0              Shared w/digest         -
 207 //      t1              Shared w/digest         -
 208 //      t2              Shared w/digest         -
 209 //      t3              Shared w/digest         -
 210 //      t4              Shared w/digest         -
 211 //      t5              Shared w/digest         -
 212 //      t6              PFS Save                PFSSave
 213 //      t7              ar.lc Save              LCSave
 214 //      t8              Saved PR                PRSave
 215 //      t9              2nd CtxPtr              CtxPtr1
 216 //      t10             Table Base              CTable
 217 //      t11             Table[0]                CTable0
 218 //      t13             Accumulator A           AccumA
 219 //      t14             Accumulator B           AccumB
 220 //      t15             Accumulator C           AccumC
 221 //      t16             Accumulator D           AccumD
 222 //      pt0             Shared w/digest         -
 223 //      pt1             Shared w/digest         -
 224 //      pt2             Shared w/digest         -
 225 //      pt3             Shared w/digest         -
 226 //      pt4             Shared w/digest         -
 227 //      pt5             Shared w/digest         -
 228 //      pt6             Shared w/digest         -
 229 //      pt7             Shared w/digest         -
 230 //      pt8             Not Aligned             pOff
 231 //      pt8             Blocks Left             pAgain
 232 // Driver-only register assignments; note that pAgain and pOff both name p63.
 233 #define AccumA          r27
 234 #define AccumB          r28
 235 #define AccumC          r29
 236 #define AccumD          r30
 237 #define CTable          r24
 238 #define CTable0         r25
 239 #define CtxPtr0         in0
 240 #define CtxPtr1         r23
 241 #define DPtrIn          in1
 242 #define BlockCount      in2
 243 #define InAlign         r10
 244 #define LCSave          r21
 245 #define PFSSave         r20
 246 #define PRSave          r22
 247 #define pAgain          p63
 248 #define pOff            p63
 249
 250         .text
 251
 252 /* md5_block_asm_data_order(MD5_CTX *c, const void *data, size_t num)
 253
 254      where:
 255       c: a pointer to a structure of this type:
 256
 257            typedef struct MD5state_st
 258              {
 259                MD5_LONG A,B,C,D;
 260                MD5_LONG Nl,Nh;
 261                MD5_LONG data[MD5_LBLOCK];
 262                unsigned int num;
 263              }
 264            MD5_CTX;
 265
 266       data: a pointer to the input data (may be misaligned)
 267       num:  the number of 64-byte blocks to hash (i.e., the length
 268             of DATA is 64*NUM).
 269
 270    */
 271
 272         .type   md5_block_asm_data_order, @function
 273         .global md5_block_asm_data_order
 274         .align  32
 275         .proc   md5_block_asm_data_order
 276 md5_block_asm_data_order:
 277 .md5_block:
 278         .prologue
 279 {       .mmi
 280         .save   ar.pfs, PFSSave
 281         alloc   PFSSave = ar.pfs, MD5_NINP, MD5_NLOC, MD5_NOUT, MD5_NROT
 282         ADDP    CtxPtr1 = 8, CtxPtr0    // second context pointer, 8 bytes into the context
 283         mov     CTable = ip             // address of this bundle, i.e. .md5_block
 284 }
 285 {       .mmi
 286         ADDP    DPtrIn = 0, DPtrIn      // adds 0; only has an effect when ADDP is addp4
 287         ADDP    CtxPtr0 = 0, CtxPtr0    // likewise for the context pointer
 288         .save   ar.lc, LCSave
 289         mov     LCSave = ar.lc
 290 }
 291 ;;
 292 {       .mmi
 293         add     CTable = .md5_tbl_data_order#-.md5_block#, CTable   // CTable = address of .md5_tbl_data_order
 294         and     InAlign = 0x3, DPtrIn   // low two address bits: byte offset within a 4-byte word
 295 }
 296 // Load the running digest: A then B through CtxPtr0, C then D through CtxPtr1.
 297 {       .mmi
 298         ld4     AccumA = [CtxPtr0], 4
 299         ld4     AccumC = [CtxPtr1], 4
 300         .save pr, PRSave
 301         mov     PRSave = pr
 302         .body
 303 }
 304 ;;
 305 {       .mmi
 306         ld4     AccumB = [CtxPtr0]
 307         ld4     AccumD = [CtxPtr1]
 308         dep     DPtr_ = 0, DPtrIn, 0, 2 // DPtr_ = DPtrIn with the low two bits cleared
 309 } ;;
 310 #ifdef HOST_IS_BIG_ENDIAN
 311         rum     psr.be;;        // switch to little-endian
 312 #endif
 313 {       .mmb
 314         ld4     CTable0 = [CTable], 4   // first table word; CTable now addresses the second
 315         cmp.ne  pOff, p0 = 0, InAlign   // pOff = input is not 4-byte aligned
 316 (pOff)  br.cond.spnt.many .md5_unaligned
 317 } ;;
 318 // Aligned path: each pass below handles one block.
 319 //      The FF load/compute loop rotates values three times, so that
 320 //      loading into M12 here produces the M0 value, M13 -> M1, etc.
 321
 322 .md5_block_loop0:
 323 {       .mmi
 324         ld4     M12_ = [DPtr_], 4
 325         mov     TPtr = CTable           // restart the constant walk for this block
 326         mov     TRound = CTable0
 327 } ;;
 328 {       .mmi
 329         ld4     M13_ = [DPtr_], 4
 330         mov     A_ = AccumA             // hand the current digest to the helper
 331         mov     B_ = AccumB
 332 } ;;
 333 {       .mmi
 334         ld4     M14_ = [DPtr_], 4
 335         mov     C_ = AccumC
 336         mov     D_ = AccumD
 337 } ;;
 338 {       .mmb
 339         ld4     M15_ = [DPtr_], 4
 340         add     BlockCount = -1, BlockCount
 341         br.call.sptk.many QUICK_RTN = md5_digest_block0     // return link is kept in QUICK_RTN (b6), not rp
 342 } ;;
 343
 344 //      Now, we add the new digest values and do some clean-up
 345 //      before checking if there's another full block to process
 346
 347 {       .mmi
 348         add     AccumA = AccumA, A_
 349         add     AccumB = AccumB, B_
 350         cmp.ne  pAgain, p0 = 0, BlockCount      // pAgain = blocks remain
 351 }
 352 {       .mib
 353         add     AccumC = AccumC, C_
 354         add     AccumD = AccumD, D_
 355 (pAgain) br.cond.dptk.many .md5_block_loop0
 356 } ;;
 357 // Common exit, also reached from the unaligned loops: store the digest and restore saved state.
 358 .md5_exit:
 359 #ifdef HOST_IS_BIG_ENDIAN
 360         sum     psr.be;;        // switch back to big-endian mode
 361 #endif
 362 {       .mmi
 363         st4     [CtxPtr0] = AccumB, -4  // pointers were left on B and D by the loads above
 364         st4     [CtxPtr1] = AccumD, -4  // post-decrement steps back to A and C
 365         mov     pr = PRSave, 0x1ffff ;;
 366 }
 367 {       .mmi
 368         st4     [CtxPtr0] = AccumA
 369         st4     [CtxPtr1] = AccumC
 370         mov     ar.lc = LCSave
 371 } ;;
 372 {       .mib
 373         mov     ar.pfs = PFSSave
 374         br.ret.sptk.few rp
 375 } ;;
 376 // MD5UNALIGNED(offset): block loop for input misaligned by offset bytes.  The first word (M12_) is spliced from two aligned loads, with the leftover bits carried in DTmp; the rest is done by md5_digest_block##offset.
 377 #define MD5UNALIGNED(offset)                                            \
 378 .md5_process##offset:                                                   \
 379 {       .mib ;                                                          \
 380         nop     0x0     ;                                               \
 381         GETRW(DTmp, DTmp, offset) ;                                     \
 382 } ;;                                                                    \
 383 .md5_block_loop##offset:                                                \
 384 {       .mmi ;                                                          \
 385         ld4     Y_ = [DPtr_], 4 ;                                       \
 386         mov     TPtr = CTable ;                                         \
 387         mov     TRound = CTable0 ;                                      \
 388 } ;;                                                                    \
 389 {       .mmi ;                                                          \
 390         ld4     M13_ = [DPtr_], 4 ;                                     \
 391         mov     A_ = AccumA ;                                           \
 392         mov     B_ = AccumB ;                                           \
 393 } ;;                                                                    \
 394 {       .mii ;                                                          \
 395         ld4     M14_ = [DPtr_], 4 ;                                     \
 396         GETLW(W_, Y_, offset) ;                                         \
 397         mov     C_ = AccumC ;                                           \
 398 }                                                                       \
 399 {       .mmi ;                                                          \
 400         mov     D_ = AccumD ;;                                          \
 401         or      M12_ = W_, DTmp ;                                       \
 402         GETRW(DTmp, Y_, offset) ;                                       \
 403 }                                                                       \
 404 {       .mib ;                                                          \
 405         ld4     M15_ = [DPtr_], 4 ;                                     \
 406         add     BlockCount = -1, BlockCount ;                           \
 407         br.call.sptk.many QUICK_RTN = md5_digest_block##offset;         \
 408 } ;;                                                                    \
 409 {       .mmi ;                                                          \
 410         add     AccumA = AccumA, A_ ;                                   \
 411         add     AccumB = AccumB, B_ ;                                   \
 412         cmp.ne  pAgain, p0 = 0, BlockCount ;                            \
 413 }                                                                       \
 414 {       .mib ;                                                          \
 415         add     AccumC = AccumC, C_ ;                                   \
 416         add     AccumD = AccumD, D_ ;                                   \
 417 (pAgain) br.cond.dptk.many .md5_block_loop##offset ;                    \
 418 } ;;                                                                    \
 419 {       .mib ;                                                          \
 420         nop     0x0 ;                                                   \
 421         nop     0x0 ;                                                   \
 422         br.cond.sptk.many .md5_exit ;                                   \
 423 } ;;
 424
 425         .align  32
 426 .md5_unaligned:
 427 //
 428 //      Because variable shifts are expensive, we special case each of
 429 //      the four alignments. In practice, this won't hurt too much
 430 //      since only one working set of code will be loaded.
 431 //
 432 {       .mib
 433         ld4     DTmp = [DPtr_], 4       // aligned word holding the first input bytes
 434         cmp.eq  pOff, p0 = 1, InAlign
 435 (pOff)  br.cond.dpnt.many .md5_process1
 436 } ;;
 437 {       .mib
 438         cmp.eq  pOff, p0 = 2, InAlign
 439         nop     0x0
 440 (pOff)  br.cond.dpnt.many .md5_process2
 441 } ;;
 442         MD5UNALIGNED(3)         // reached by fall-through when InAlign is neither 1 nor 2
 443         MD5UNALIGNED(1)
 444         MD5UNALIGNED(2)
 445
 446         .endp md5_block_asm_data_order
 447
 448
 449 // MD5 Perform the F function and load
 450 //
 451 // Passed the first 4 words (M0 - M3) and initial (A, B, C, D) values,
 452 // computes the FF() round of functions, then branches to the common
 453 // digest code to finish up with GG(), HH, and II().
 454 //
 455 // INPUT
 456 //
 457 // rp Return Address -
 458 //
 459 // CODE
 460 //
 461 // v0 PFS bit bucket PFS
 462 // v1 Loop Trip Count LTrip
 463 // pt0 Load next word pMore
 464 // LTrip (F round) and T (GHI rounds) both name r9; U and V are further GHI scratch registers.
 465 /* For F round: */
 466 #define LTrip   r9
 467 #define PFS     r8
 468 #define pMore   p6
 469
 470 /* For GHI rounds: */
 471 #define T       r9
 472 #define U       r10
 473 #define V       r11
 474 // COMPUTE: loads the next TRound from [TPtr], rotates the low 32 bits of Z left by s, sets a = Z + b and copies M into R.  Only the low 32 bits of the results are meaningful.
 475 #define COMPUTE(a, b, s, M, R)                  \
 476 {                                               \
 477         .mii ;                                  \
 478         ld4 TRound = [TPtr], 4 ;                \
 479         dep.z Y = Z, 32, 32 ;;                  \
 480         shrp Z = Z, Y, 64 - s ;                 \
 481 } ;;                                            \
 482 {                                               \
 483         .mmi ;                                  \
 484         add a = Z, b ;                          \
 485         mov R = M ;                             \
 486         nop 0x0 ;                               \
 487 } ;;
 488 // LOOP: same work as COMPUTE, but the second bundle ends with br.ctop to label instead of a nop.
 489 #define LOOP(a, b, s, M, R, label)              \
 490 {       .mii ;                                  \
 491         ld4 TRound = [TPtr], 4 ;                \
 492         dep.z Y = Z, 32, 32 ;;                  \
 493         shrp Z = Z, Y, 64 - s ;                 \
 494 } ;;                                            \
 495 {       .mib ;                                  \
 496         add a = Z, b ;                          \
 497         mov R = M ;                             \
 498         br.ctop.sptk.many label ;               \
 499 } ;;
 500
 501 // G(B, C, D) = (B & D) | (C & ~D)
 502 // Leaves Z = M + TRound + a + G(b, c, d); X and Y are overwritten.
 503 #define G(a, b, c, d, M)                        \
 504 {       .mmi ;                                  \
 505         add Z = M, TRound ;                     \
 506         and Y = b, d ;                          \
 507         andcm X = c, d ;                        \
 508 } ;;                                            \
 509 {       .mii ;                                  \
 510         add Z = Z, a ;                          \
 511         or Y = Y, X ;;                          \
 512         add Z = Z, Y ;                          \
 513 } ;;
 514
 515 // H(B, C, D) = B ^ C ^ D
 516 // Leaves Z = M + TRound + a + H(b, c, d); Y is overwritten.
 517 #define H(a, b, c, d, M)                        \
 518 {       .mmi ;                                  \
 519         add Z = M, TRound ;                     \
 520         xor Y = b, c ;                          \
 521         nop 0x0 ;                               \
 522 } ;;                                            \
 523 {       .mii ;                                  \
 524         add Z = Z, a ;                          \
 525         xor Y = Y, d ;;                         \
 526         add Z = Z, Y ;                          \
 527 } ;;
 528
 529 // I(B, C, D) = C ^ (B | ~D)
 530 //
 531 // However, since we have an andcm operator, we use the fact that
 532 //
 533 // Y ^ Z == ~Y ^ ~Z
 534 //
 535 // to rewrite the expression as
 536 //
 537 // I(B, C, D) = ~C ^ (~B & D)
 538 // Leaves Z = M + TRound + a + I(b, c, d); X and Y are overwritten.
 539 #define I(a, b, c, d, M)                        \
 540 {       .mmi ;                                  \
 541         add Z = M, TRound ;                     \
 542         andcm Y = d, b ;                        \
 543         andcm X = -1, c ;                       \
 544 } ;;                                            \
 545 {       .mii ;                                  \
 546         add Z = Z, a ;                          \
 547         xor Y = Y, X ;;                         \
 548         add Z = Z, Y ;                          \
 549 } ;;
 550 // GG4: four G steps updating A, D, C, B in turn (rotations 5, 9, 14, 20); the last step branches back to label via LOOP.
 551 #define GG4(label)                              \
 552         G(A, B, C, D, M0)                       \
 553         COMPUTE(A, B, 5, M0, RotateM0)          \
 554         G(D, A, B, C, M1)                       \
 555         COMPUTE(D, A, 9, M1, RotateM1)          \
 556         G(C, D, A, B, M2)                       \
 557         COMPUTE(C, D, 14, M2, RotateM2)         \
 558         G(B, C, D, A, M3)                       \
 559         LOOP(B, C, 20, M3, RotateM3, label)
 560 // HH4: four H steps updating A, D, C, B in turn (rotations 4, 11, 16, 23); the last step branches back to label via LOOP.
 561 #define HH4(label)                              \
 562         H(A, B, C, D, M0)                       \
 563         COMPUTE(A, B, 4, M0, RotateM0)          \
 564         H(D, A, B, C, M1)                       \
 565         COMPUTE(D, A, 11, M1, RotateM1)         \
 566         H(C, D, A, B, M2)                       \
 567         COMPUTE(C, D, 16, M2, RotateM2)         \
 568         H(B, C, D, A, M3)                       \
 569         LOOP(B, C, 23, M3, RotateM3, label)
 570 // II4: four I steps updating A, D, C, B in turn (rotations 6, 10, 15, 21); the last step branches back to label via LOOP.
 571 #define II4(label)                              \
 572         I(A, B, C, D, M0)                       \
 573         COMPUTE(A, B, 6, M0, RotateM0)          \
 574         I(D, A, B, C, M1)                       \
 575         COMPUTE(D, A, 10, M1, RotateM1)         \
 576         I(C, D, A, B, M2)                       \
 577         COMPUTE(C, D, 15, M2, RotateM2)         \
 578         I(B, C, D, A, M3)                       \
 579         LOOP(B, C, 21, M3, RotateM3, label)
 580 // FFLOAD: one F step, with F(b, c, d) = (c & b) | (d & ~b): a = b + rotl32(a + F + M + TRound, s).  It also loads the next TRound and, when pMore is set, the next input word into N.
 581 #define FFLOAD(a, b, c, d, M, N, s)             \
 582 {       .mii ;                                  \
 583 (pMore) ld4 N = [DPtr], 4 ;                     \
 584         add Z = M, TRound ;                     \
 585         and Y = c, b ;                          \
 586 }                                               \
 587 {       .mmi ;                                  \
 588         andcm X = d, b ;;                       \
 589         add Z = Z, a ;                          \
 590         or Y = Y, X ;                           \
 591 } ;;                                            \
 592 {       .mii ;                                  \
 593         ld4 TRound = [TPtr], 4 ;                \
 594         add Z = Z, Y ;;                         \
 595         dep.z Y = Z, 32, 32 ;                   \
 596 } ;;                                            \
 597 {       .mii ;                                  \
 598         nop 0x0 ;                               \
 599         shrp Z = Z, Y, 64 - s ;;                \
 600         add a = Z, b ;                          \
 601 } ;;
 602 // FFLOOP: FFLOAD followed by loop control.  pMore is set from the value LTrip had before the decrement (LTrip != 0), then br.ctop goes to dest.
 603 #define FFLOOP(a, b, c, d, M, N, s, dest)       \
 604 {       .mii ;                                  \
 605 (pMore) ld4 N = [DPtr], 4 ;                     \
 606         add Z = M, TRound ;                     \
 607         and Y = c, b ;                          \
 608 }                                               \
 609 {       .mmi ;                                  \
 610         andcm X = d, b ;;                       \
 611         add Z = Z, a ;                          \
 612         or Y = Y, X ;                           \
 613 } ;;                                            \
 614 {       .mii ;                                  \
 615         ld4 TRound = [TPtr], 4 ;                \
 616         add Z = Z, Y ;;                         \
 617         dep.z Y = Z, 32, 32 ;                   \
 618 } ;;                                            \
 619 {       .mii ;                                  \
 620         nop 0x0 ;                               \
 621         shrp Z = Z, Y, 64 - s ;;                \
 622         add a = Z, b ;                          \
 623 }                                               \
 624 {       .mib ;                                  \
 625         cmp.ne pMore, p0 = 0, LTrip ;           \
 626         add LTrip = -1, LTrip ;                 \
 627         br.ctop.dptk.many dest ;                \
 628 } ;;
 629
 630         .type md5_digest_block0, @function
 631         .align 32
 632 // F round for 4-byte-aligned input.  The driver has already loaded the first four words into the M12..M15 slots.
 633         .proc md5_digest_block0
 634         .prologue
 635 md5_digest_block0:
 636         .altrp QUICK_RTN
 637         .body
 638 {       .mmi
 639         alloc PFS = ar.pfs, _NINPUTS, _NLOCALS, _NOUTPUT, _NROTATE
 640         mov LTrip = 2                   // FFLOOP tests LTrip before decrementing it
 641         mov ar.lc = 3                   // loop body runs four times
 642 } ;;
 643 {       .mii
 644         cmp.eq pMore, p0 = r0, r0       // r0 == r0, so pMore starts out true
 645         mov ar.ec = 0
 646         nop 0x0
 647 } ;;
 648 // Each pass does four F steps; while pMore is set each step also loads one more input word.
 649 .md5_FF_round0:
 650         FFLOAD(A, B, C, D, M12, RotateM0, 7)
 651         FFLOAD(D, A, B, C, M13, RotateM1, 12)
 652         FFLOAD(C, D, A, B, M14, RotateM2, 17)
 653         FFLOOP(B, C, D, A, M15, RotateM3, 22, .md5_FF_round0)
 654         //
 655         // !!! Fall through to md5_digest_GHI
 656         //
 657         .endp md5_digest_block0
 658
 659         .type md5_digest_GHI, @function
 660         .align 32
 661 // G, H and I rounds.  Entered by fall-through from the F round, returns through QUICK_RTN.
 662         .proc md5_digest_GHI
 663         .prologue
 664         .regstk _NINPUTS, _NLOCALS, _NOUTPUT, _NROTATE
 665 md5_digest_GHI:
 666         .altrp QUICK_RTN
 667         .body
 668 //
 669 // The following sequence shuffles the block constants round for the
 670 // next round:
 671 //
 672 // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
 673 // 1 6 11 0 5 10 15 4 9 14 3 8 13 2 7 12
 674 //
 675 {       .mmi
 676         mov Z = M0
 677         mov Y = M15
 678         mov ar.lc = 3
 679 }
 680 {       .mmi
 681         mov X = M2
 682         mov W = M9
 683         mov V = M4
 684 } ;;
 685
 686 {       .mmi
 687         mov M0 = M1
 688         mov M15 = M12
 689         mov ar.ec = 1
 690 }
 691 {       .mmi
 692         mov M2 = M11
 693         mov M9 = M14
 694         mov M4 = M5
 695 } ;;
 696
 697 {       .mmi
 698         mov M1 = M6
 699         mov M12 = M13
 700         mov U = M3
 701 }
 702 {       .mmi
 703         mov M11 = M8
 704         mov M14 = M7
 705         mov M5 = M10
 706 } ;;
 707
 708 {       .mmi
 709         mov M6 = Y
 710         mov M13 = X
 711         mov M3 = Z
 712 }
 713 {       .mmi
 714         mov M8 = W
 715         mov M7 = V
 716         mov M10 = U
 717 } ;;
 718 // G round: with ar.lc = 3 the four-step body runs four times.
 719 .md5_GG_round:
 720         GG4(.md5_GG_round)
 721
 722 // The following sequence shuffles the block constants round for the
 723 // next round:
 724 //
 725 // 1 6 11 0 5 10 15 4 9 14 3 8 13 2 7 12
 726 // 5 8 11 14 1 4 7 10 13 0 3 6 9 12 15 2
 727
 728 {       .mmi
 729         mov Z = M0
 730         mov Y = M1
 731         mov ar.lc = 3
 732 }
 733 {       .mmi
 734         mov X = M3
 735         mov W = M5
 736         mov V = M6
 737 } ;;
 738
 739 {       .mmi
 740         mov M0 = M4
 741         mov M1 = M11
 742         mov ar.ec = 1
 743 }
 744 {       .mmi
 745         mov M3 = M9
 746         mov U = M8
 747         mov T = M13
 748 } ;;
 749
 750 {       .mmi
 751         mov M4 = Z
 752         mov M11 = Y
 753         mov M5 = M7
 754 }
 755 {       .mmi
 756         mov M6 = M14
 757         mov M8 = M12
 758         mov M13 = M15
 759 } ;;
 760
 761 {       .mmi
 762         mov M7 = W
 763         mov M14 = V
 764         nop 0x0
 765 }
 766 {       .mmi
 767         mov M9 = X
 768         mov M12 = U
 769         mov M15 = T
 770 } ;;
 771 // H round: same loop shape as the G round.
 772 .md5_HH_round:
 773         HH4(.md5_HH_round)
 774
 775 // The following sequence shuffles the block constants round for the
 776 // next round:
 777 //
 778 // 5 8 11 14 1 4 7 10 13 0 3 6 9 12 15 2
 779 // 0 7 14 5 12 3 10 1 8 15 6 13 4 11 2 9
 780
 781 {       .mmi
 782         mov Z = M0
 783         mov Y = M15
 784         mov ar.lc = 3
 785 }
 786 {       .mmi
 787         mov X = M10
 788         mov W = M1
 789         mov V = M4
 790 } ;;
 791
 792 {       .mmi
 793         mov M0 = M9
 794         mov M15 = M12
 795         mov ar.ec = 1
 796 }
 797 {       .mmi
 798         mov M10 = M11
 799         mov M1 = M6
 800         mov M4 = M13
 801 } ;;
 802
 803 {       .mmi
 804         mov M9 = M14
 805         mov M12 = M5
 806         mov U = M3
 807 }
 808 {       .mmi
 809         mov M11 = M8
 810         mov M6 = M7
 811         mov M13 = M2
 812 } ;;
 813
 814 {       .mmi
 815         mov M14 = Y
 816         mov M5 = X
 817         mov M3 = Z
 818 }
 819 {       .mmi
 820         mov M8 = W
 821         mov M7 = V
 822         mov M2 = U
 823 } ;;
 824 // I round: same loop shape again, then return to the driver.
 825 .md5_II_round:
 826         II4(.md5_II_round)
 827
 828 {       .mib
 829         nop 0x0
 830         nop 0x0
 831         br.ret.sptk.many QUICK_RTN
 832 } ;;
 833
 834         .endp md5_digest_GHI
 835 // FFLOADU: the F step of FFLOAD for misaligned input.  It also rebuilds word P as GETLW(P) | DTmp and leaves GETRW(P) in DTmp for the following word; W is used as scratch.
 836 #define FFLOADU(a, b, c, d, M, P, N, s, offset) \
 837 {       .mii ;                                  \
 838 (pMore) ld4 N = [DPtr], 4 ;                     \
 839         add Z = M, TRound ;                     \
 840         and Y = c, b ;                          \
 841 }                                               \
 842 {       .mmi ;                                  \
 843         andcm X = d, b ;;                       \
 844         add Z = Z, a ;                          \
 845         or Y = Y, X ;                           \
 846 } ;;                                            \
 847 {       .mii ;                                  \
 848         ld4 TRound = [TPtr], 4 ;                \
 849         GETLW(W, P, offset) ;                   \
 850         add Z = Z, Y ;                          \
 851 } ;;                                            \
 852 {       .mii ;                                  \
 853         or W = W, DTmp ;                        \
 854         dep.z Y = Z, 32, 32 ;;                  \
 855         shrp Z = Z, Y, 64 - s ;                 \
 856 } ;;                                            \
 857 {       .mii ;                                  \
 858         add a = Z, b ;                          \
 859         GETRW(DTmp, P, offset) ;                \
 860         mov P = W ;                             \
 861 } ;;
 862
 // FFLOOPU(a, b, c, d, M, P, N, s, offset)
 //
 // Final step of the four-step group that forms the F round loop body of
 // md5_digest_block<offset>.  The arithmetic matches FFLOADU: in the low
 // 32 bits,
 //     a = b + rotl32(a + ((b & c) | (~b & d)) + M + TRound, s)
 // and TRound is reloaded from [TPtr], which is post-incremented by 4.
 // Parameters have the same meaning as in FFLOADU.
 //
 // Differences from FFLOADU:
 //   - GETLW, the OR of DTmp into W, GETRW and the copy of W into P are
 //     all predicated on pMore, like the ld4 into N.
 //   - A closing bundle recomputes pMore as (LTrip != 0), decrements
 //     LTrip, and issues br.ctop back to .md5_FF_round<offset>.  The
 //     compare and the add sit in the same instruction group, so the
 //     compare reads LTrip before it is decremented.
 //
 // NOTE(review): with LTrip = 2, ar.lc = 3 and ar.ec = 0 as set up by
 // MD5FBLOCK, this appears to give four passes over the group, with pMore
 // true on the first three only.  Confirm against the definition of pMore
 // (not visible in this excerpt) and the br.ctop rules.
 863 #define FFLOOPU(a, b, c, d, M, P, N, s, offset)         \
 864 {       .mii ;                                          \
 865 (pMore) ld4 N = [DPtr], 4 ;                             \
 866         add Z = M, TRound ;                             \
 867         and Y = c, b ;                                  \
 868 }                                                       \
 869 {       .mmi ;                                          \
 870         andcm X = d, b ;;                               \
 871         add Z = Z, a ;                                  \
 872         or Y = Y, X ;                                   \
 873 } ;;                                                    \
 874 {       .mii ;                                          \
 875         ld4 TRound = [TPtr], 4 ;                        \
 876 (pMore) GETLW(W, P, offset)     ;                       \
 877         add Z = Z, Y ;                                  \
 878 } ;;                                                    \
 879 {       .mii ;                                          \
 880 (pMore) or W = W, DTmp ;                                \
 881         dep.z Y = Z, 32, 32 ;;                          \
 882         shrp Z = Z, Y, 64 - s ;                         \
 883 } ;;                                                    \
 884 {       .mii ;                                          \
 885         add a = Z, b ;                                  \
 886 (pMore) GETRW(DTmp, P, offset)  ;                       \
 887 (pMore) mov P = W ;                                     \
 888 }                                                       \
 889 {       .mib ;                                          \
 890         cmp.ne pMore, p0 = 0, LTrip ;                   \
 891         add LTrip = -1, LTrip ;                         \
 892         br.ctop.sptk.many .md5_FF_round##offset ;       \
 893 } ;;
 894
 // MD5FBLOCK(offset)
 //
 // Emits the function md5_digest_block<offset>, aligned to 32 bytes.  It
 // runs the MD5 F round with the FFLOADU/FFLOOPU steps and then branches
 // unconditionally to md5_digest_GHI, whose visible tail returns through
 // QUICK_RTN.
 //
 // Setup performed before the loop label .md5_FF_round<offset>:
 //   - alloc sets up the register frame from _NINPUTS, _NLOCALS,
 //     _NOUTPUT and _NROTATE and copies ar.pfs into PFS
 //   - LTrip = 2, ar.lc = 3, ar.ec = 0
 //   - pMore is forced true by comparing r0 with itself
 //
 // The loop body is one group of four F steps with rotate counts 7, 12,
 // 17 and 22.  The state registers rotate through the roles
 // (A,B,C,D), (D,A,B,C), (C,D,A,B), (B,C,D,A), the steps consume M12..M15,
 // and the loads target RotateM0..RotateM3.  The group ends in FFLOOPU,
 // whose br.ctop branches back to the loop label.
 //
 // Assembler annotations:
 //   - .altrp QUICK_RTN is an unwind directive naming QUICK_RTN as the
 //     register that holds the return pointer
 //   - .pred.rel "mutex" declares pLoad and pSkip mutually exclusive
 //
 // NOTE(review): offset is only pasted into symbol names and forwarded to
 // GETLW/GETRW here; presumably it is the byte misalignment of the input
 // pointer -- confirm against the dispatch code.
 895 #define MD5FBLOCK(offset)                                               \
 896         .type md5_digest_block##offset, @function ;                     \
 897                                                                         \
 898         .align 32 ;                                                     \
 899         .proc md5_digest_block##offset ;                                \
 900         .prologue ;                                                     \
 901         .altrp QUICK_RTN ;                                              \
 902         .body ;                                                         \
 903 md5_digest_block##offset:                                               \
 904 {       .mmi ;                                                          \
 905         alloc PFS = ar.pfs, _NINPUTS, _NLOCALS, _NOUTPUT, _NROTATE ;    \
 906         mov LTrip = 2 ;                                                 \
 907         mov ar.lc = 3 ;                                                 \
 908 } ;;                                                                    \
 909 {       .mii ;                                                          \
 910         cmp.eq pMore, p0 = r0, r0 ;                                     \
 911         mov ar.ec = 0 ;                                                 \
 912         nop 0x0 ;                                                       \
 913 } ;;                                                                    \
 914                                                                         \
 915         .pred.rel "mutex", pLoad, pSkip ;                               \
 916 .md5_FF_round##offset:                                                  \
 917         FFLOADU(A, B, C, D, M12, M13, RotateM0, 7, offset)              \
 918         FFLOADU(D, A, B, C, M13, M14, RotateM1, 12, offset)             \
 919         FFLOADU(C, D, A, B, M14, M15, RotateM2, 17, offset)             \
 920         FFLOOPU(B, C, D, A, M15, RotateM0, RotateM3, 22, offset)        \
 921                                                                         \
 922 {       .mib ;                                                          \
 923         nop 0x0 ;                                                       \
 924         nop 0x0 ;                                                       \
 925         br.cond.sptk.many md5_digest_GHI ;                              \
 926 } ;;                                                                    \
 927         .endp md5_digest_block##offset
 928
 // Instantiate md5_digest_block1, md5_digest_block2 and md5_digest_block3
 // from the MD5FBLOCK template.
 // NOTE(review): the three offsets presumably cover input pointers that
 // are misaligned by 1, 2 or 3 bytes -- confirm against the dispatch code.
 929 MD5FBLOCK(1)
 930 MD5FBLOCK(2)
 931 MD5FBLOCK(3)
 932
 // md5_constants: the 64 MD5 additive constants, one 32-bit value per
 // step, stored in step order 0..63.  Entry i equals
 // floor(2^32 * abs(sin(i + 1))) as tabulated in RFC 1321; for example
 // entry 0 below decodes to 0xd76aa478.
 //
 // Each value is written as four data1 bytes, least significant byte
 // first, so the in-memory image is little-endian.  The table is aligned
 // to 64 bytes and occupies 64 * 4 = 256 bytes, matching the .size
 // directive at the end.  The trailing comment on each row is the entry
 // index.
 933         .align 64
 934         .type md5_constants, @object
 935 md5_constants:
 936 .md5_tbl_data_order:                    // To ensure little-endian data
 937                                         // order, code as bytes.
 938         data1 0x78, 0xa4, 0x6a, 0xd7    //     0
 939         data1 0x56, 0xb7, 0xc7, 0xe8    //     1
 940         data1 0xdb, 0x70, 0x20, 0x24    //     2
 941         data1 0xee, 0xce, 0xbd, 0xc1    //     3
 942         data1 0xaf, 0x0f, 0x7c, 0xf5    //     4
 943         data1 0x2a, 0xc6, 0x87, 0x47    //     5
 944         data1 0x13, 0x46, 0x30, 0xa8    //     6
 945         data1 0x01, 0x95, 0x46, 0xfd    //     7
 946         data1 0xd8, 0x98, 0x80, 0x69    //     8
 947         data1 0xaf, 0xf7, 0x44, 0x8b    //     9
 948         data1 0xb1, 0x5b, 0xff, 0xff    //    10
 949         data1 0xbe, 0xd7, 0x5c, 0x89    //    11
 950         data1 0x22, 0x11, 0x90, 0x6b    //    12
 951         data1 0x93, 0x71, 0x98, 0xfd    //    13
 952         data1 0x8e, 0x43, 0x79, 0xa6    //    14
 953         data1 0x21, 0x08, 0xb4, 0x49    //    15
 954         data1 0x62, 0x25, 0x1e, 0xf6    //    16
 955         data1 0x40, 0xb3, 0x40, 0xc0    //    17
 956         data1 0x51, 0x5a, 0x5e, 0x26    //    18
 957         data1 0xaa, 0xc7, 0xb6, 0xe9    //    19
 958         data1 0x5d, 0x10, 0x2f, 0xd6    //    20
 959         data1 0x53, 0x14, 0x44, 0x02    //    21
 960         data1 0x81, 0xe6, 0xa1, 0xd8    //    22
 961         data1 0xc8, 0xfb, 0xd3, 0xe7    //    23
 962         data1 0xe6, 0xcd, 0xe1, 0x21    //    24
 963         data1 0xd6, 0x07, 0x37, 0xc3    //    25
 964         data1 0x87, 0x0d, 0xd5, 0xf4    //    26
 965         data1 0xed, 0x14, 0x5a, 0x45    //    27
 966         data1 0x05, 0xe9, 0xe3, 0xa9    //    28
 967         data1 0xf8, 0xa3, 0xef, 0xfc    //    29
 968         data1 0xd9, 0x02, 0x6f, 0x67    //    30
 969         data1 0x8a, 0x4c, 0x2a, 0x8d    //    31
 970         data1 0x42, 0x39, 0xfa, 0xff    //    32
 971         data1 0x81, 0xf6, 0x71, 0x87    //    33
 972         data1 0x22, 0x61, 0x9d, 0x6d    //    34
 973         data1 0x0c, 0x38, 0xe5, 0xfd    //    35
 974         data1 0x44, 0xea, 0xbe, 0xa4    //    36
 975         data1 0xa9, 0xcf, 0xde, 0x4b    //    37
 976         data1 0x60, 0x4b, 0xbb, 0xf6    //    38
 977         data1 0x70, 0xbc, 0xbf, 0xbe    //    39
 978         data1 0xc6, 0x7e, 0x9b, 0x28    //    40
 979         data1 0xfa, 0x27, 0xa1, 0xea    //    41
 980         data1 0x85, 0x30, 0xef, 0xd4    //    42
 981         data1 0x05, 0x1d, 0x88, 0x04    //    43
 982         data1 0x39, 0xd0, 0xd4, 0xd9    //    44
 983         data1 0xe5, 0x99, 0xdb, 0xe6    //    45
 984         data1 0xf8, 0x7c, 0xa2, 0x1f    //    46
 985         data1 0x65, 0x56, 0xac, 0xc4    //    47
 986         data1 0x44, 0x22, 0x29, 0xf4    //    48
 987         data1 0x97, 0xff, 0x2a, 0x43    //    49
 988         data1 0xa7, 0x23, 0x94, 0xab    //    50
 989         data1 0x39, 0xa0, 0x93, 0xfc    //    51
 990         data1 0xc3, 0x59, 0x5b, 0x65    //    52
 991         data1 0x92, 0xcc, 0x0c, 0x8f    //    53
 992         data1 0x7d, 0xf4, 0xef, 0xff    //    54
 993         data1 0xd1, 0x5d, 0x84, 0x85    //    55
 994         data1 0x4f, 0x7e, 0xa8, 0x6f    //    56
 995         data1 0xe0, 0xe6, 0x2c, 0xfe    //    57
 996         data1 0x14, 0x43, 0x01, 0xa3    //    58
 997         data1 0xa1, 0x11, 0x08, 0x4e    //    59
 998         data1 0x82, 0x7e, 0x53, 0xf7    //    60
 999         data1 0x35, 0xf2, 0x3a, 0xbd    //    61
1000         data1 0xbb, 0xd2, 0xd7, 0x2a    //    62
1001         data1 0x91, 0xd3, 0x86, 0xeb    //    63
1002 .size   md5_constants#,64*4