alpha: Fix end-of-count checks in strncmp

author Richard Henderson <rth@twiddle.net>

Wed, 6 Jun 2012 21:51:05 +0000 (14:51 -0700)

committer Richard Henderson <rth@twiddle.net>

Wed, 6 Jun 2012 21:51:05 +0000 (14:51 -0700)
author Richard Henderson <rth@twiddle.net>
Wed, 6 Jun 2012 21:51:05 +0000 (14:51 -0700)
committer Richard Henderson <rth@twiddle.net>
Wed, 6 Jun 2012 21:51:05 +0000 (14:51 -0700)
diff --git a/ChangeLog.alpha b/ChangeLog.alpha

index d291df970a3e4dd41b1e838723b7f7594f3e480c..824083c32397601e8a1c08a57e364d9fa21517e2 100644 (file)
--- a/ChangeLog.alpha
+++ b/ChangeLog.alpha
@@ -1,5 +1,8 @@
  2012-06-06  Richard Henderson  <rth@twiddle.net>
  
+       * sysdeps/alpha/strncmp.S: Bound count to LONG_MAX at startup.
+       Re-organize checks vs s2 end-of-count.
+
         [BZ #13718]
         * sysdeps/alpha/stxncmp.S: Bound count to LONG_MAX at startup.
         * sysdeps/alpha/alphaev6/stxncmp.S: Likewise.
diff --git a/sysdeps/alpha/strncmp.S b/sysdeps/alpha/strncmp.S

index c9981e1b66972d3523888afad87f5960d7ddd6eb..828f1b97038c686c3c0f3557c65f5853471099c6 100644 (file)
--- a/sysdeps/alpha/strncmp.S
+++ b/sysdeps/alpha/strncmp.S
@@ -1,4 +1,4 @@
-/* Copyright (C) 1996, 1997, 2003 Free Software Foundation, Inc.
+/* Copyright (C) 1996-2012 Free Software Foundation, Inc.
     Contributed by Richard Henderson (rth@tamu.edu)
     This file is part of the GNU C Library.
  
@@ -23,6 +23,15 @@
         .set noat
         .set noreorder
  
+/* EV6 only predicts one branch per octaword.  We'll use these to push
+   subsequent branches back to the next bundle.  This will generally add
+   a fetch+decode cycle to older machines, so skip in that case.  */
+#ifdef __alpha_fix__
+# define ev6_unop      unop
+#else
+# define ev6_unop
+#endif
+
         .text
  
  ENTRY(strncmp)
@@ -35,128 +44,140 @@ ENTRY(strncmp)
         .prologue 0
  #endif
  
-       xor     a0, a1, t2      # e0    : are s1 and s2 co-aligned?
-       beq     a2, $zerolength # .. e1 :
-       ldq_u   t0, 0(a0)       # e0    : give cache time to catch up
-       ldq_u   t1, 0(a1)       # .. e1 :
-       and     t2, 7, t2       # e0    :
-       and     a0, 7, t4       # .. e1 : find s1 misalignment
-       lda     t3, -1          # e0    :
-       addq    a2, t4, a2      # .. e1 : bias count by s1 misalignment
-       and     a2, 7, t10      # e1    : ofs of last byte in last word
-       srl     a2, 3, a2       # .. e0 : remaining full words in count
-       and     a1, 7, t5       # e0    : find s2 misalignment
-       bne     t2, $unaligned  # .. e1 :
+       xor     a0, a1, t2      # are s1 and s2 co-aligned?
+       beq     a2, $zerolength
+       ldq_u   t0, 0(a0)       # load asap to give cache time to catch up
+       ldq_u   t1, 0(a1)
+       lda     t3, -1
+       and     t2, 7, t2
+       srl     t3, 1, t6
+       and     a0, 7, t4       # find s1 misalignment
+       and     a1, 7, t5       # find s2 misalignment
+       cmovlt  a2, t6, a2      # bound neg count to LONG_MAX
+       addq    a1, a2, a3      # s2+count
+       addq    a2, t4, a2      # bias count by s1 misalignment
+       and     a2, 7, t10      # ofs of last byte in s1 last word
+       srl     a2, 3, a2       # remaining full words in s1 count
+       bne     t2, $unaligned
  
         /* On entry to this basic block:
            t0 == the first word of s1.
            t1 == the first word of s2.
            t3 == -1.  */
-
  $aligned:
-       mskqh   t3, a1, t3      # e0    : mask off leading garbage
-       nop                     # .. e1 :
-       ornot   t1, t3, t1      # e0    :
-       ornot   t0, t3, t0      # .. e1 :
-       cmpbge  zero, t1, t7    # e0    : bits set iff null found
-       beq     a2, $eoc        # .. e1 : check end of count
-       unop                    # e0    :
-       bne     t7, $eos        # .. e1 :
-       unop                    # e0    :
-       beq     t10, $ant_loop  # .. e1 :
+       mskqh   t3, a1, t8      # mask off leading garbage
+       ornot   t1, t8, t1
+       ornot   t0, t8, t0
+       cmpbge  zero, t1, t7    # bits set iff null found
+       beq     a2, $eoc        # check end of count
+       bne     t7, $eos
+       beq     t10, $ant_loop
  
         /* Aligned compare main loop.
            On entry to this basic block:
            t0 == an s1 word.
            t1 == an s2 word not containing a null.  */
  
+       .align 4
  $a_loop:
         xor     t0, t1, t2      # e0    :
         bne     t2, $wordcmp    # .. e1 (zdb)
         ldq_u   t1, 8(a1)       # e0    :
         ldq_u   t0, 8(a0)       # .. e1 :
+
         subq    a2, 1, a2       # e0    :
         addq    a1, 8, a1       # .. e1 :
         addq    a0, 8, a0       # e0    :
         beq     a2, $eoc        # .. e1 :
+
         cmpbge  zero, t1, t7    # e0    :
         beq     t7, $a_loop     # .. e1 :
-       unop                    # e0    :
-       br      $eos            # .. e1 :
+
+       br      $eos
  
         /* Alternate aligned compare loop, for when there's no trailing
            bytes on the count.  We have to avoid reading too much data.  */
+       .align 4
  $ant_loop:
         xor     t0, t1, t2      # e0    :
+       ev6_unop
+       ev6_unop
         bne     t2, $wordcmp    # .. e1 (zdb)
+
         subq    a2, 1, a2       # e0    :
         beq     a2, $zerolength # .. e1 :
         ldq_u   t1, 8(a1)       # e0    :
         ldq_u   t0, 8(a0)       # .. e1 :
+
         addq    a1, 8, a1       # e0    :
         addq    a0, 8, a0       # .. e1 :
         cmpbge  zero, t1, t7    # e0    :
         beq     t7, $ant_loop   # .. e1 :
-       unop                    # e0    :
-       br      $eos            # .. e1 :
+
+       br      $eos
  
         /* The two strings are not co-aligned.  Align s1 and cope.  */
+       /* On entry to this basic block:
+          t0 == the first word of s1.
+          t1 == the first word of s2.
+          t3 == -1.
+          t4 == misalignment of s1.
+          t5 == misalignment of s2.
+         t10 == misalignment of s1 end.  */
+       .align  4
  $unaligned:
-       subq    a1, t4, a1      # e0     :
-       unop                    #        :
-
-       /* If s2 misalignment is larger than s2 misalignment, we need
+       /* If s2 misalignment is larger than s1 misalignment, we need
            extra startup checks to avoid SEGV.  */
+       subq    a1, t4, a1      # adjust s2 for s1 misalignment
+       cmpult  t4, t5, t9
+       subq    a3, 1, a3       # last byte of s2
+       bic     a1, 7, t8
+       mskqh   t3, t5, t7      # mask garbage in s2
+       subq    a3, t8, a3
+       ornot   t1, t7, t7
+       srl     a3, 3, a3       # remaining full words in s2 count
+       beq     t9, $u_head
+
+       /* Failing that, we need to look for both eos and eoc within the
+          first word of s2.  If we find either, we can continue by
+          pretending that the next word of s2 is all zeros.  */
+       lda     t2, 0           # next = zero
+       cmpeq   a3, 0, t8       # eoc in the first word of s2?
+       cmpbge  zero, t7, t7    # eos in the first word of s2?
+       or      t7, t8, t8
+       bne     t8, $u_head_nl
  
-       cmplt   t4, t5, t8      # .. e1 :
-       beq     t8, $u_head     # e1    :
-
-       mskqh   t3, t5, t3      # e0    :
-       ornot   t1, t3, t3      # e0    :
-       cmpbge  zero, t3, t7    # e1    : is there a zero?
-       beq     t7, $u_head     # e1    :
-
-       /* We've found a zero in the first partial word of s2.  Align
-          our current s1 and s2 words and compare what we've got.  */
-
-       extql   t1, t5, t1      # e0    :
-       lda     t3, -1          # .. e1 :
-       insql   t1, a0, t1      # e0    :
-       mskqh   t3, a0, t3      # e0    :
-       ornot   t1, t3, t1      # e0    :
-       ornot   t0, t3, t0      # .. e1 :
-       cmpbge  zero, t1, t7    # e0    : find that zero again
-       beq     a2, $eoc        # .. e1 : and finish up
-       br      $eos            # e1    :
-
-       .align 3
-$u_head:
         /* We know just enough now to be able to assemble the first
            full word of s2.  We can still find a zero at the end of it.
  
            On entry to this basic block:
            t0 == first word of s1
-          t1 == first partial word of s2.  */
-
-       ldq_u   t2, 8(a1)       # e0    : load second partial s2 word
-       lda     t3, -1          # .. e1 : create leading garbage mask
-       extql   t1, a1, t1      # e0    : create first s2 word
-       mskqh   t3, a0, t3      # e0    :
-       extqh   t2, a1, t4      # e0    :
-       ornot   t0, t3, t0      # .. e1 : kill s1 garbage
-       or      t1, t4, t1      # e0    : s2 word now complete
-       ornot   t1, t3, t1      # e1    : kill s2 garbage
-       cmpbge  zero, t0, t7    # e0    : find zero in first s1 word
-       beq     a2, $eoc        # .. e1 :
-       lda     t3, -1          # e0    :
-       bne     t7, $eos        # .. e1 :
-       subq    a2, 1, a2       # e0    :
-       xor     t0, t1, t4      # .. e1 : compare aligned words
-       mskql   t3, a1, t3      # e0    : mask out s2[1] bits we have seen
-       bne     t4, $wordcmp    # .. e1 :
-       or      t2, t3, t3      # e0    :
-       cmpbge  zero, t3, t7    # e1    : find zero in high bits of s2[1]
-       bne     t7, $u_final    # e1    :
+          t1 == first partial word of s2.
+          t3 == -1.
+          t10 == ofs of last byte in s1 last word.
+          a3 == remaining full words in s2 count.  */
+$u_head:
+       ldq_u   t2, 8(a1)       # load second partial s2 word
+       subq    a3, 1, a3
+$u_head_nl:
+       extql   t1, a1, t1      # create first s2 word
+       mskqh   t3, a0, t8
+       extqh   t2, a1, t4
+       ornot   t0, t8, t0      # kill s1 garbage
+       or      t1, t4, t1      # s2 word now complete
+       cmpbge  zero, t0, t7    # find eos in first s1 word
+       ornot   t1, t8, t1      # kill s2 garbage
+       beq     a2, $eoc
+       subq    a2, 1, a2
+       bne     t7, $eos
+       mskql   t3, a1, t8      # mask out s2[1] bits we have seen
+       xor     t0, t1, t4      # compare aligned words
+       or      t2, t8, t8
+       bne     t4, $wordcmp
+       cmpbge  zero, t8, t7    # eos in high bits of s2[1]?
+       cmpeq   a3, 0, t8       # eoc in s2[1]?
+       or      t7, t8, t7
+       bne     t7, $u_final
  
         /* Unaligned copy main loop.  In order to avoid reading too much,
            the loop is structured to detect zeros in aligned words from s2.
@@ -166,43 +187,54 @@ $u_head:
            to run as fast as possible.
  
            On entry to this basic block:
-          t2 == the unshifted low-bits from the next s2 word.  */
-
-       .align 3
+          t2 == the unshifted low-bits from the next s2 word.
+          t10 == ofs of last byte in s1 last word.
+          a3 == remaining full words in s2 count.  */
+       .align 4
  $u_loop:
         extql   t2, a1, t3      # e0    :
         ldq_u   t2, 16(a1)      # .. e1 : load next s2 high bits
         ldq_u   t0, 8(a0)       # e0    : load next s1 word
         addq    a1, 8, a1       # .. e1 :
+
         addq    a0, 8, a0       # e0    :
-       nop                     # .. e1 :
+       subq    a3, 1, a3       # .. e1 :
         extqh   t2, a1, t1      # e0    :
-       cmpbge  zero, t0, t7    # .. e1 : find zero in current s1 word
+       cmpbge  zero, t0, t7    # .. e1 : eos in current s1 word
+
         or      t1, t3, t1      # e0    :
-       beq     a2, $eoc        # .. e1 : check for end of count
+       beq     a2, $eoc        # .. e1 : eoc in current s1 word
         subq    a2, 1, a2       # e0    :
+       cmpbge  zero, t2, t4    # .. e1 : eos in s2[1]
+
+       xor     t0, t1, t3      # e0    : compare the words
+       ev6_unop
+       ev6_unop
         bne     t7, $eos        # .. e1 :
-       xor     t0, t1, t4      # e0    : compare the words
-       bne     t4, $wordcmp    # .. e1 (zdb)
-       cmpbge  zero, t2, t4    # e0    : find zero in next low bits
+
+       cmpeq   a3, 0, t5       # e0    : eoc in s2[1]
+       ev6_unop
+       ev6_unop
+       bne     t3, $wordcmp    # .. e1 :
+
+       or      t4, t5, t4      # e0    : eos or eoc in s2[1].
         beq     t4, $u_loop     # .. e1 (zdb)
  
         /* We've found a zero in the low bits of the last s2 word.  Get
            the next s1 word and align them.  */
+       .align 3
  $u_final:
-       ldq_u   t0, 8(a0)       # e1    :
-       extql   t2, a1, t1      # .. e0 :
-       cmpbge  zero, t1, t7    # e0    :
-       bne     a2, $eos        # .. e1 :
+       ldq_u   t0, 8(a0)
+       extql   t2, a1, t1
+       cmpbge  zero, t1, t7
+       bne     a2, $eos
  
         /* We've hit end of count.  Zero everything after the count
            and compare whats left.  */
-
         .align 3
  $eoc:
         mskql   t0, t10, t0
         mskql   t1, t10, t1
-       unop
         cmpbge  zero, t1, t7
  
         /* We've found a zero somewhere in a word we just read.
@@ -210,32 +242,31 @@ $eoc:
            t0 == s1 word
            t1 == s2 word
            t7 == cmpbge mask containing the zero.  */
-
+       .align 3
  $eos:
-       negq    t7, t6          # e0    : create bytemask of valid data
-       and     t6, t7, t8      # e1    :
-       subq    t8, 1, t6       # e0    :
-       or      t6, t8, t7      # e1    :
-       zapnot  t0, t7, t0      # e0    : kill the garbage
-       zapnot  t1, t7, t1      # .. e1 :
-       xor     t0, t1, v0      # e0    : and compare
-       beq     v0, $done       # .. e1 :
+       negq    t7, t6          # create bytemask of valid data
+       and     t6, t7, t8
+       subq    t8, 1, t6
+       or      t6, t8, t7
+       zapnot  t0, t7, t0      # kill the garbage
+       zapnot  t1, t7, t1
+       xor     t0, t1, v0      # ... and compare
+       beq     v0, $done
  
         /* Here we have two differing co-aligned words in t0 & t1.
            Bytewise compare them and return (t0 > t1 ? 1 : -1).  */
         .align 3
  $wordcmp:
-       cmpbge  t0, t1, t2      # e0    : comparison yields bit mask of ge
-       cmpbge  t1, t0, t3      # .. e1 :
-       xor     t2, t3, t0      # e0    : bits set iff t0/t1 bytes differ
-       negq    t0, t1          # e1    : clear all but least bit
-       and     t0, t1, t0      # e0    :
-       lda     v0, -1          # .. e1 :
-       and     t0, t2, t1      # e0    : was bit set in t0 > t1?
-       cmovne  t1, 1, v0       # .. e1 (zdb)
-
+       cmpbge  t0, t1, t2      # comparison yields bit mask of ge
+       cmpbge  t1, t0, t3
+       xor     t2, t3, t0      # bits set iff t0/t1 bytes differ
+       negq    t0, t1          # clear all but least bit
+       and     t0, t1, t0
+       lda     v0, -1
+       and     t0, t2, t1      # was bit set in t0 > t1?
+       cmovne  t1, 1, v0
  $done:
-       ret                     # e1    :
+       ret
  
         .align 3
  $zerolength:
author	Richard Henderson <rth@twiddle.net>
	Wed, 6 Jun 2012 21:51:05 +0000 (14:51 -0700)
committer	Richard Henderson <rth@twiddle.net>
	Wed, 6 Jun 2012 21:51:05 +0000 (14:51 -0700)
ChangeLog.alpha		patch \| blob \| blame \| history
sysdeps/alpha/strncmp.S		patch \| blob \| blame \| history