Update copyright dates with scripts/update-copyrights.

[thirdparty/glibc.git] / sysdeps / alpha / memchr.c
diff --git a/sysdeps/alpha/memchr.c b/sysdeps/alpha/memchr.c

index 048c9fa25055b202a4309d6053ef1e8ec761a7b2..c87c730ac6192988150fc6a533154b5f1e385629 100644 (file)
--- a/sysdeps/alpha/memchr.c
+++ b/sysdeps/alpha/memchr.c
@@ -1,72 +1,174 @@
-/* Copyright (C) 1992, 1993 Free Software Foundation, Inc.
+/* Copyright (C) 2010-2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
  
-The GNU C Library is free software; you can redistribute it and/or
-modify it under the terms of the GNU Library General Public License as
-published by the Free Software Foundation; either version 2 of the
-License, or (at your option) any later version.
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
  
-The GNU C Library is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-Library General Public License for more details.
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
  
-You should have received a copy of the GNU Library General Public
-License along with the GNU C Library; see the file COPYING.LIB.  If
-not, write to the Free Software Foundation, Inc., 675 Mass Ave,
-Cambridge, MA 02139, USA.  */
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <http://www.gnu.org/licenses/>.  */
  
  #include <string.h>
  
+typedef unsigned long word;
+
+static inline word
+ldq_u(const void *s)
+{
+  return *(const word *)((word)s & -8);
+}
+
+#define unlikely(X)    __builtin_expect ((X), 0)
+#define prefetch(X)    __builtin_prefetch ((void *)(X), 0)
+
+#define cmpbeq0(X)     __builtin_alpha_cmpbge(0, (X))
+#define find(X, Y)     cmpbeq0 ((X) ^ (Y))
+
  /* Search no more than N bytes of S for C.  */
  
  void *
-memchr (const void *s, int c, size_t n)
+__memchr (const void *s, int xc, size_t n)
  {
-  const char *char_ptr;
-  const unsigned long int *longword_ptr;
-  unsigned long int charmask;
+  const word *s_align;
+  word t, current, found, mask, offset;
+
+  if (unlikely (n == 0))
+    return 0;
  
-  c = (unsigned char) c;
+  current = ldq_u (s);
  
-  /* Handle the first few characters by reading one character at a time.
-     Do this until STR is aligned on a 8-byte border.  */
-  for (char_ptr = s; n > 0 && ((unsigned long int) char_ptr & 7) != 0;
-       --n, ++char_ptr)
-    if (*char_ptr == c)
-      return (void *) char_ptr;
+  /* Replicate low byte of XC into all bytes of C.  */
+  t = xc & 0xff;                       /* 0000000c */
+  t = (t << 8) | t;                    /* 000000cc */
+  t = (t << 16) | t;                   /* 0000cccc */
+  const word c = (t << 32) | t;                /* cccccccc */
  
-  longword_ptr = (unsigned long int *) char_ptr;
+  /* Align the source, and decrement the count by the number
+     of bytes searched in the first word.  */
+  s_align = (const word *)((word)s & -8);
+  n += ((word)s & 7);
  
-  /* Set up a longword, each of whose bytes is C.  */
-  charmask = c | (c << 8);
-  charmask |= charmask << 16;
-  charmask |= charmask << 32;
+  /* Deal with misalignment in the first word for the comparison.  */
+  mask = (1ul << ((word)s & 7)) - 1;
  
-  for (;;)
+  /* If the entire string fits within one word, we may need masking
+     at both the front and the back of the string.  */
+  if (unlikely (n <= 8))
      {
-      const unsigned long int longword = *longword_ptr++;
-      int ge, le;
+      mask |= -1ul << n;
+      goto last_quad;
+    }
  
-      /* Set bits in GE if bytes in CHARMASK are >= bytes in LONGWORD.  */
-      asm ("cmpbge %1, %2, %0" : "=r" (ge) : "r" (charmask), "r" (longword));
+  found = find (current, c) & ~mask;
+  if (unlikely (found))
+    goto found_it;
  
-      /* Set bits in LE if bytes in CHARMASK are <= bytes in LONGWORD.  */
-      asm ("cmpbge %2, %1, %0" : "=r" (le) : "r" (charmask), "r" (longword));
+  s_align++;
+  n -= 8;
  
-      /* Bytes that are both <= and >= are == to C.  */
-      if (ge & le)
-       {
-         /* Which of the bytes was the C?  */
+  /* If the block is sufficiently large, align to cacheline and prefetch.  */
+  if (unlikely (n >= 256))
+    {
+      /* Prefetch 3 cache lines beyond the one we're working on.  */
+      prefetch (s_align + 8);
+      prefetch (s_align + 16);
+      prefetch (s_align + 24);
  
-         char *cp = (char *) (longword_ptr - 1);
+      while ((word)s_align & 63)
+       {
+         current = *s_align;
+         found = find (current, c);
+         if (found)
+           goto found_it;
+         s_align++;
+         n -= 8;
+       }
  
-         if (cp[0] == c)
-           return cp;
-         if (cp[1] == c)
-           return &cp[1];
-         if (cp[2] == c)
-           return &cp[2];
-         return &cp[3];
+       /* Within each cacheline, advance the load for the next word
+          before the test for the previous word is complete.  This
+          allows us to hide the 3 cycle L1 cache load latency.  We
+          only perform this advance load within a cacheline to prevent
+          reading across page boundary.  */
+#define CACHELINE_LOOP                         \
+       do {                                    \
+         word i, next = s_align[0];            \
+         for (i = 0; i < 7; ++i)               \
+           {                                   \
+             current = next;                   \
+             next = s_align[1];                \
+             found = find (current, c);        \
+             if (unlikely (found))             \
+               goto found_it;                  \
+             s_align++;                        \
+           }                                   \
+         current = next;                       \
+         found = find (current, c);            \
+         if (unlikely (found))                 \
+           goto found_it;                      \
+         s_align++;                            \
+         n -= 64;                              \
+       } while (0)
+
+      /* While there's still lots more data to potentially be read,
+        continue issuing prefetches for the 4th cacheline out.  */
+      while (n >= 256)
+       {
+         prefetch (s_align + 24);
+         CACHELINE_LOOP;
         }
+
+      /* Up to 3 cache lines remaining.  Continue issuing advanced
+        loads, but stop prefetching.  */
+      while (n >= 64)
+       CACHELINE_LOOP;
+
+      /* We may have exhausted the buffer.  */
+      if (n == 0)
+       return NULL;
      }
+
+  /* Quadword aligned loop.  */
+  current = *s_align;
+  while (n > 8)
+    {
+      found = find (current, c);
+      if (unlikely (found))
+       goto found_it;
+      current = *++s_align;
+      n -= 8;
+    }
+
+  /* The last word may need masking at the tail of the compare.  */
+  mask = -1ul << n;
+ last_quad:
+  found = find (current, c) & ~mask;
+  if (found == 0)
+    return NULL;
+
+ found_it:
+#ifdef __alpha_cix__
+  offset = __builtin_alpha_cttz (found);
+#else
+  /* Extract LSB.  */
+  found &= -found;
+
+  /* Binary search for the LSB.  */
+  offset  = (found & 0x0f ? 0 : 4);
+  offset += (found & 0x33 ? 0 : 2);
+  offset += (found & 0x55 ? 0 : 1);
+#endif
+
+  return (void *)((word)s_align + offset);
  }
+
+#ifdef weak_alias
+weak_alias (__memchr, memchr)
+#endif
+libc_hidden_builtin_def (memchr)