[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[commits] r14620 - in /fsf/trunk/libc: ./ crypt/ elf/ manual/ nscd/ string/ sysdeps/generic/ sysdeps/unix/sysv/linux/s390/s390-32/ sys...



Author: eglibc
Date: Wed Jul 20 00:02:35 2011
New Revision: 14620

Log:
Import glibc-mainline for 2011-07-20

Added:
    fsf/trunk/libc/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S
    fsf/trunk/libc/sysdeps/x86_64/multiarch/strcat-ssse3.S
    fsf/trunk/libc/sysdeps/x86_64/multiarch/strcat.S
    fsf/trunk/libc/sysdeps/x86_64/multiarch/strlen-sse2-pminub.S
    fsf/trunk/libc/sysdeps/x86_64/multiarch/strncat-c.c
    fsf/trunk/libc/sysdeps/x86_64/multiarch/strncat-sse2-unaligned.S
    fsf/trunk/libc/sysdeps/x86_64/multiarch/strncat-ssse3.S
    fsf/trunk/libc/sysdeps/x86_64/multiarch/strncat.S
Modified:
    fsf/trunk/libc/ChangeLog
    fsf/trunk/libc/NEWS
    fsf/trunk/libc/crypt/md5.c
    fsf/trunk/libc/crypt/md5.h
    fsf/trunk/libc/crypt/sha256.c
    fsf/trunk/libc/crypt/sha256.h
    fsf/trunk/libc/crypt/sha512.c
    fsf/trunk/libc/crypt/sha512.h
    fsf/trunk/libc/elf/cache.c
    fsf/trunk/libc/elf/dl-close.c
    fsf/trunk/libc/elf/dl-fini.c
    fsf/trunk/libc/manual/intro.texi
    fsf/trunk/libc/nscd/nscd.c
    fsf/trunk/libc/nscd/nscd_conf.c
    fsf/trunk/libc/string/strncat.c
    fsf/trunk/libc/string/strxfrm_l.c
    fsf/trunk/libc/sysdeps/generic/ldsodefs.h
    fsf/trunk/libc/sysdeps/unix/sysv/linux/s390/s390-32/sysdep.h
    fsf/trunk/libc/sysdeps/unix/sysv/linux/s390/s390-64/sysdep.h
    fsf/trunk/libc/sysdeps/x86_64/multiarch/Makefile
    fsf/trunk/libc/sysdeps/x86_64/multiarch/init-arch.c
    fsf/trunk/libc/sysdeps/x86_64/multiarch/init-arch.h
    fsf/trunk/libc/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S
    fsf/trunk/libc/sysdeps/x86_64/multiarch/strcpy-ssse3.S
    fsf/trunk/libc/sysdeps/x86_64/multiarch/strlen-no-bsf.S
    fsf/trunk/libc/sysdeps/x86_64/multiarch/strlen.S

Modified: fsf/trunk/libc/ChangeLog
==============================================================================
--- fsf/trunk/libc/ChangeLog (original)
+++ fsf/trunk/libc/ChangeLog Wed Jul 20 00:02:35 2011
@@ -1,3 +1,80 @@
+2011-07-13  Andreas Krebbel  <Andreas.Krebbel@xxxxxxxxxx>
+
+	* sysdeps/unix/sysv/linux/s390/s390-32/sysdep.h
+	(INTERNAL_VSYSCALL_NCS): Use r10 for backing up the return address
+	register in order to avoid conflicts with the soft frame pointer
+	being held in r11 when necessary.
+	* sysdeps/unix/sysv/linux/s390/s390-64/sysdep.h
+	(INTERNAL_VSYSCALL_NCS): Likewise.
+
+2011-07-14  Marek Polacek  <mpolacek@xxxxxxxxxx>
+
+	* elf/dl-fini.c (_dl_sort_fini): Remove unused link_map *l argument.
+	* elf/dl-fini.c (_dl_fini): Adjust caller.
+	* elf/dl-close.c (_dl_close_worker): Likewise.
+	* sysdeps/generic/ldsodefs.h: Adjust declaration.
+
+2011-07-15  Marek Polacek  <mpolacek@xxxxxxxxxx>
+
+	* elf/cache.c (load_aux_cache): Remove unnecessary condition of
+	"aux_cache->nlibs < 0".
+
+	* nscd/nscd_conf.c (nscd_parse_file): Remove unnecessary condition
+	in the reload-count case.
+
+2011-07-15  Liubov Dmitrieva  <liubov.dmitrieva@xxxxxxxxx>
+
+	* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
+	strcat-ssse3 strcat-sse2-unaligned strncat-ssse3
+	strncat-sse2-unaligned strncat-c strlen-sse2-pminub.
+	* sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S: New file.
+	* sysdeps/x86_64/multiarch/strcat.S: New file.
+	* sysdeps/x86_64/multiarch/strncat.S: New file.
+	* sysdeps/x86_64/multiarch/strncat-c.c: New file.
+	* sysdeps/x86_64/multiarch/strcat-ssse3.S: New file.
+	* sysdeps/x86_64/multiarch/strncat-sse2-unaligned.S: New file.
+	* sysdeps/x86_64/multiarch/strncat-ssse3.S: New file.
+	* sysdeps/x86_64/multiarch/strcpy-ssse3.S
+	(USE_AS_STRCAT): Define.
+	Add strcat and strncat support.
+	* sysdeps/x86_64/multiarch/strlen-no-bsf.S: Likewise.
+	* sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S: Likewise.
+	* sysdeps/x86_64/multiarch/strlen-sse2-pminub.S: New file.
+	* string/strncat.c: Update.
+	(STRNCAT): Define.
+	* sysdeps/x86_64/multiarch/init-arch.c (__init_cpu_features):
+	Turn on bit_Prefer_PMINUB_for_stringop for Intel Core i3, i5
+	and i7.
+	* sysdeps/x86_64/multiarch/init-arch.h
+	(bit_Prefer_PMINUB_for_stringop): New.
+	(index_Prefer_PMINUB_for_stringop): Likewise.
+	* sysdeps/x86_64/multiarch/strlen.S (strlen): Check
+	bit_Prefer_PMINUB_for_stringop.
+
+2011-07-19  Ulrich Drepper  <drepper@xxxxxxxxx>
+
+	* crypt/sha512.h (struct sha512_ctx): Move buffer into union and add
+	buffer64.
+	* crypt/sha512.c (__sha512_finish_ctx): Use buffer64 for writes instead
+	of casting of buffer.
+	* crypt/sha256.h (struct sha256_ctx): Move buffer into union and add
+	buffer32 and buffer64.
+	* crypt/sha256.c (__sha256_finish_ctx): Use buffer32 or buffer64 for
+	writes instead of casting of buffer.
+	* crypt/md5.h (struct md5_ctx): Move buffer into union and add
+	buffer32.
+	* crypt/md5.c (md5_finish_ctx): Use buffer32 for writes instead of
+	casting of buffer.
+
+2011-07-19  Andreas Schwab  <schwab@xxxxxxxxxx>
+
+	* string/strxfrm_l.c (STRXFRM): Fix alloca accounting.
+
+2011-07-19  Ulrich Drepper  <drepper@xxxxxxxxx>
+
+	* nscd/nscd.c (termination_handler): Don't do anything for a database
+	if it has not yet been initialized.
+
 2011-07-18  Ulrich Drepper  <drepper@xxxxxxxxx>
 
 	* sysdeps/unix/sysv/linux/bits/sched.h (__CPU_EQUAL_S): Fix a typo.

Modified: fsf/trunk/libc/NEWS
==============================================================================
--- fsf/trunk/libc/NEWS (original)
+++ fsf/trunk/libc/NEWS Wed Jul 20 00:02:35 2011
@@ -1,4 +1,4 @@
-GNU C Library NEWS -- history of user-visible changes.  2011-7-6
+GNU C Library NEWS -- history of user-visible changes.  2011-7-19
 Copyright (C) 1992-2009, 2010, 2011 Free Software Foundation, Inc.
 See the end for copying conditions.
 
@@ -23,6 +23,9 @@
 
 * Improved strcpy, strncpy, stpcpy, stpncpy for SSE2 and SSSE3 on x86-64.
   Contributed by HJ Lu.
+
+* Optimized strcat and strncat on x86-64.
+  Contributed by Liubov Dmitrieva.
 
 Version 2.14
 

Modified: fsf/trunk/libc/crypt/md5.c
==============================================================================
--- fsf/trunk/libc/crypt/md5.c (original)
+++ fsf/trunk/libc/crypt/md5.c Wed Jul 20 00:02:35 2011
@@ -1,6 +1,6 @@
 /* Functions to compute MD5 message digest of files or memory blocks.
    according to the definition of MD5 in RFC 1321 from April 1992.
-   Copyright (C) 1995,1996,1997,1999,2000,2001,2005
+   Copyright (C) 1995,1996,1997,1999,2000,2001,2005,2011
 	Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
@@ -123,9 +123,9 @@
   memcpy (&ctx->buffer[bytes], fillbuf, pad);
 
   /* Put the 64-bit file length in *bits* at the end of the buffer.  */
-  *(md5_uint32 *) &ctx->buffer[bytes + pad] = SWAP (ctx->total[0] << 3);
-  *(md5_uint32 *) &ctx->buffer[bytes + pad + 4] = SWAP ((ctx->total[1] << 3) |
-							(ctx->total[0] >> 29));
+  ctx->buffer32[(bytes + pad) / 4] = SWAP (ctx->total[0] << 3);
+  ctx->buffer32[(bytes + pad + 4) / 4] = SWAP ((ctx->total[1] << 3) |
+					       (ctx->total[0] >> 29));
 
   /* Process last bytes.  */
   md5_process_block (ctx->buffer, bytes + pad + 8, ctx);
@@ -168,7 +168,7 @@
 	}
       while (sum < BLOCKSIZE && n != 0);
       if (n == 0 && ferror (stream))
-        return 1;
+	return 1;
 
       /* If end of file is reached, end the loop.  */
       if (n == 0)
@@ -340,12 +340,12 @@
 
 #define OP(a, b, c, d, s, T)						\
       do								\
-        {								\
+	{								\
 	  a += FF (b, c, d) + (*cwp++ = SWAP (*words)) + T;		\
 	  ++words;							\
 	  CYCLIC (a, s);						\
 	  a += b;							\
-        }								\
+	}								\
       while (0)
 
       /* It is unfortunate that C does not provide an operator for

Modified: fsf/trunk/libc/crypt/md5.h
==============================================================================
--- fsf/trunk/libc/crypt/md5.h (original)
+++ fsf/trunk/libc/crypt/md5.h Wed Jul 20 00:02:35 2011
@@ -1,6 +1,6 @@
 /* Declaration of functions and data types used for MD5 sum computing
    library functions.
-   Copyright (C) 1995-1997,1999,2000,2001,2004,2005
+   Copyright (C) 1995-1997,1999,2000,2001,2004,2005,2011
       Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
@@ -68,7 +68,7 @@
      typedef unsigned long md5_uint32;
 #   else
      /* The following line is intended to evoke an error.
-        Using #error is not portable enough.  */
+	Using #error is not portable enough.  */
      "Cannot determine unsigned 32-bit data type."
 #   endif
 #  endif
@@ -88,7 +88,11 @@
 
   md5_uint32 total[2];
   md5_uint32 buflen;
-  char buffer[128] __attribute__ ((__aligned__ (__alignof__ (md5_uint32))));
+  union
+  {
+    char buffer[128];
+    md5_uint32 buffer32[32];
+  };
 };
 
 /*

Modified: fsf/trunk/libc/crypt/sha256.c
==============================================================================
--- fsf/trunk/libc/crypt/sha256.c (original)
+++ fsf/trunk/libc/crypt/sha256.c Wed Jul 20 00:02:35 2011
@@ -222,13 +222,11 @@
 
   /* Put the 64-bit file length in *bits* at the end of the buffer.  */
 #ifdef _STRING_ARCH_unaligned
-  *(uint64_t *)  &ctx->buffer[bytes + pad] = SWAP64 (ctx->total64 << 3);
+  ctx->buffer64[(bytes + pad) / 8] = SWAP64 (ctx->total64 << 3);
 #else
-  *(uint32_t *) &ctx->buffer[bytes + pad + 4]
-    = SWAP (ctx->total[TOTAL64_low] << 3);
-  *(uint32_t *) &ctx->buffer[bytes + pad]
-    = SWAP ((ctx->total[TOTAL64_high] << 3) |
-	    (ctx->total[TOTAL64_low] >> 29));
+  ctx->buffer32[(bytes + pad + 4) / 4] = SWAP (ctx->total[TOTAL64_low] << 3);
+  ctx->buffer32[(bytes + pad) / 4] = SWAP ((ctx->total[TOTAL64_high] << 3) |
+					   (ctx->total[TOTAL64_low] >> 29));
 #endif
 
   /* Process last bytes.  */

Modified: fsf/trunk/libc/crypt/sha256.h
==============================================================================
--- fsf/trunk/libc/crypt/sha256.h (original)
+++ fsf/trunk/libc/crypt/sha256.h Wed Jul 20 00:02:35 2011
@@ -40,7 +40,12 @@
     uint32_t total[2];
   };
   uint32_t buflen;
-  char buffer[128] __attribute__ ((__aligned__ (__alignof__ (uint32_t))));
+  union
+  {
+    char buffer[128];
+    uint32_t buffer32[32];
+    uint64_t buffer64[16];
+  };
 };
 
 /* Initialize structure containing state of computation.

Modified: fsf/trunk/libc/crypt/sha512.c
==============================================================================
--- fsf/trunk/libc/crypt/sha512.c (original)
+++ fsf/trunk/libc/crypt/sha512.c Wed Jul 20 00:02:35 2011
@@ -253,11 +253,9 @@
   memcpy (&ctx->buffer[bytes], fillbuf, pad);
 
   /* Put the 128-bit file length in *bits* at the end of the buffer.  */
-  *(uint64_t *) &ctx->buffer[bytes + pad + 8]
-    = SWAP (ctx->total[TOTAL128_low] << 3);
-  *(uint64_t *) &ctx->buffer[bytes + pad]
-    = SWAP ((ctx->total[TOTAL128_high] << 3) |
-	    (ctx->total[TOTAL128_low] >> 61));
+  ctx->buffer64[(bytes + pad + 8) / 8] = SWAP (ctx->total[TOTAL128_low] << 3);
+  ctx->buffer64[(bytes + pad) / 8] = SWAP ((ctx->total[TOTAL128_high] << 3) |
+					   (ctx->total[TOTAL128_low] >> 61));
 
   /* Process last bytes.  */
   sha512_process_block (ctx->buffer, bytes + pad + 16, ctx);

Modified: fsf/trunk/libc/crypt/sha512.h
==============================================================================
--- fsf/trunk/libc/crypt/sha512.h (original)
+++ fsf/trunk/libc/crypt/sha512.h Wed Jul 20 00:02:35 2011
@@ -44,7 +44,11 @@
     uint64_t total[2];
   };
   uint64_t buflen;
-  char buffer[256] __attribute__ ((__aligned__ (__alignof__ (uint64_t))));
+  union
+  {
+    char buffer[256];
+    uint64_t buffer64[32];
+  };
 };
 
 /* Initialize structure containing state of computation.

Modified: fsf/trunk/libc/elf/cache.c
==============================================================================
--- fsf/trunk/libc/elf/cache.c (original)
+++ fsf/trunk/libc/elf/cache.c Wed Jul 20 00:02:35 2011
@@ -675,7 +675,6 @@
   if (aux_cache == MAP_FAILED
       || aux_cache_size < sizeof (struct aux_cache_file)
       || memcmp (aux_cache->magic, AUX_CACHEMAGIC, sizeof AUX_CACHEMAGIC - 1)
-      || aux_cache->nlibs < 0
       || aux_cache->nlibs >= aux_cache_size)
     {
       close (fd);

Modified: fsf/trunk/libc/elf/dl-close.c
==============================================================================
--- fsf/trunk/libc/elf/dl-close.c (original)
+++ fsf/trunk/libc/elf/dl-close.c Wed Jul 20 00:02:35 2011
@@ -231,7 +231,7 @@
     }
 
   /* Sort the entries.  */
-  _dl_sort_fini (ns->_ns_loaded, maps, nloaded, used, nsid);
+  _dl_sort_fini (maps, nloaded, used, nsid);
 
   /* Call all termination functions at once.  */
 #ifdef SHARED

Modified: fsf/trunk/libc/elf/dl-fini.c
==============================================================================
--- fsf/trunk/libc/elf/dl-fini.c (original)
+++ fsf/trunk/libc/elf/dl-fini.c Wed Jul 20 00:02:35 2011
@@ -30,8 +30,7 @@
 
 void
 internal_function
-_dl_sort_fini (struct link_map *l, struct link_map **maps, size_t nmaps,
-	       char *used, Lmid_t ns)
+_dl_sort_fini (struct link_map **maps, size_t nmaps, char *used, Lmid_t ns)
 {
   /* A list of one element need not be sorted.  */
   if (nmaps == 1)
@@ -199,7 +198,7 @@
       nmaps = i;
 
       /* Now we have to do the sorting.  */
-      _dl_sort_fini (GL(dl_ns)[ns]._ns_loaded, maps, nmaps, NULL, ns);
+      _dl_sort_fini (maps, nmaps, NULL, ns);
 
       /* We do not rely on the linked list of loaded object anymore from
 	 this point on.  We have our own list here (maps).  The various

Modified: fsf/trunk/libc/manual/intro.texi
==============================================================================
--- fsf/trunk/libc/manual/intro.texi (original)
+++ fsf/trunk/libc/manual/intro.texi Wed Jul 20 00:02:35 2011
@@ -714,9 +714,12 @@
 and says what standard or system each is derived from.
 
 @item
-@ref{Maintenance}, explains how to build and install the GNU C library on
-your system, how to report any bugs you might find, and how to add new
-functions or port the library to a new system.
+@ref{Installation}, explains how to build and install the GNU C library on
+your system, and how to report any bugs you might find.
+
+@item
+@ref{Maintenance}, explains how to add new functions or port the
+library to a new system.
 @end itemize
 
 If you already know the name of the facility you are interested in, you

Modified: fsf/trunk/libc/nscd/nscd.c
==============================================================================
--- fsf/trunk/libc/nscd/nscd.c (original)
+++ fsf/trunk/libc/nscd/nscd.c Wed Jul 20 00:02:35 2011
@@ -493,7 +493,7 @@
   /* Synchronize memory.  */
   for (int cnt = 0; cnt < lastdb; ++cnt)
     {
-      if (!dbs[cnt].enabled)
+      if (!dbs[cnt].enabled || dbs[cnt].head == NULL)
 	continue;
 
       /* Make sure nobody keeps using the database.  */

Modified: fsf/trunk/libc/nscd/nscd_conf.c
==============================================================================
--- fsf/trunk/libc/nscd/nscd_conf.c (original)
+++ fsf/trunk/libc/nscd/nscd_conf.c Wed Jul 20 00:02:35 2011
@@ -1,4 +1,4 @@
-/* Copyright (c) 1998, 2000, 2003-2007, 2008 Free Software Foundation, Inc.
+/* Copyright (c) 1998, 2000, 2003-2008, 2011 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Thorsten Kukuk <kukuk@xxxxxxx>, 1998.
 
@@ -189,17 +189,17 @@
 	  max_nthreads = MAX (atol (arg1), lastdb);
 	}
       else if (strcmp (entry, "server-user") == 0)
-        {
-          if (!arg1)
-            error (0, 0, _("Must specify user name for server-user option"));
-          else
-            server_user = xstrdup (arg1);
-        }
+	{
+	  if (!arg1)
+	    error (0, 0, _("Must specify user name for server-user option"));
+	  else
+	    server_user = xstrdup (arg1);
+	}
       else if (strcmp (entry, "stat-user") == 0)
-        {
-          if (arg1 == NULL)
-            error (0, 0, _("Must specify user name for stat-user option"));
-          else
+	{
+	  if (arg1 == NULL)
+	    error (0, 0, _("Must specify user name for stat-user option"));
+	  else
 	    {
 	      stat_user = xstrdup (arg1);
 
@@ -207,7 +207,7 @@
 	      if (pw != NULL)
 		stat_uid = pw->pw_uid;
 	    }
-        }
+	}
       else if (strcmp (entry, "persistent") == 0)
 	{
 	  int idx = find_db (arg1);
@@ -236,13 +236,11 @@
 	    reload_count = UINT_MAX;
 	  else
 	    {
-	      unsigned int count = strtoul (arg1, NULL, 0);
+	      unsigned long int count = strtoul (arg1, NULL, 0);
 	      if (count > UINT8_MAX - 1)
 		reload_count = UINT_MAX;
-	      else if (count >= 0)
-	    reload_count = count;
 	      else
-		error (0, 0, _("invalid value for 'reload-count': %u"), count);
+		reload_count = count;
 	    }
 	}
       else if (strcmp (entry, "paranoia") == 0)
@@ -257,7 +255,7 @@
 	  if (arg1 != NULL)
 	    restart_interval = atol (arg1);
 	  else
-            error (0, 0, _("Must specify value for restart-interval option"));
+	    error (0, 0, _("Must specify value for restart-interval option"));
 	}
       else if (strcmp (entry, "auto-propagate") == 0)
 	{

Modified: fsf/trunk/libc/string/strncat.c
==============================================================================
--- fsf/trunk/libc/string/strncat.c (original)
+++ fsf/trunk/libc/string/strncat.c Wed Jul 20 00:02:35 2011
@@ -24,10 +24,12 @@
 typedef char reg_char;
 #endif
 
-#undef strncat
+#ifndef STRNCAT
+# define STRNCAT  strncat
+#endif
 
 char *
-strncat (s1, s2, n)
+STRNCAT (s1, s2, n)
      char *s1;
      const char *s2;
      size_t n;

Modified: fsf/trunk/libc/string/strxfrm_l.c
==============================================================================
--- fsf/trunk/libc/string/strxfrm_l.c (original)
+++ fsf/trunk/libc/string/strxfrm_l.c Wed Jul 20 00:02:35 2011
@@ -1,4 +1,5 @@
-/* Copyright (C) 1995-1997,2002,2004-2006,2010 Free Software Foundation, Inc.
+/* Copyright (C) 1995-1997,2002,2004-2006,2010,2011
+   Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Written by Ulrich Drepper <drepper@xxxxxxx>, 1995.
 
@@ -150,7 +151,7 @@
      values.  But since there is no limit on the length of the string
      we have to use `malloc' if the string is too long.  We should be
      very conservative here.  */
-  if (! __libc_use_alloca (srclen))
+  if (! __libc_use_alloca ((srclen + 1) * (sizeof (int32_t) + 1)))
     {
       idxarr = (int32_t *) malloc ((srclen + 1) * (sizeof (int32_t) + 1));
       rulearr = (unsigned char *) &idxarr[srclen];

Modified: fsf/trunk/libc/sysdeps/generic/ldsodefs.h
==============================================================================
--- fsf/trunk/libc/sysdeps/generic/ldsodefs.h (original)
+++ fsf/trunk/libc/sysdeps/generic/ldsodefs.h Wed Jul 20 00:02:35 2011
@@ -1,5 +1,5 @@
 /* Run-time dynamic linker data structures for loaded ELF shared objects.
-   Copyright (C) 1995-2009, 2010 Free Software Foundation, Inc.
+   Copyright (C) 1995-2009, 2010, 2011 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -947,8 +947,8 @@
 extern void _dl_fini (void) internal_function;
 
 /* Sort array MAPS according to dependencies of the contained objects.  */
-extern void _dl_sort_fini (struct link_map *l, struct link_map **maps,
-			   size_t nmaps, char *used, Lmid_t ns)
+extern void _dl_sort_fini (struct link_map **maps, size_t nmaps, char *used,
+			   Lmid_t ns)
      internal_function;
 
 /* The dynamic linker calls this function before and having changing

Modified: fsf/trunk/libc/sysdeps/unix/sysv/linux/s390/s390-32/sysdep.h
==============================================================================
--- fsf/trunk/libc/sysdeps/unix/sysv/linux/s390/s390-32/sysdep.h (original)
+++ fsf/trunk/libc/sysdeps/unix/sysv/linux/s390/s390-32/sysdep.h Wed Jul 20 00:02:35 2011
@@ -50,7 +50,7 @@
    even if the call succeeded.  E.g., the `lseek' system call might return
    a large offset.  Therefore we must not anymore test for < 0, but test
    for a real error by making sure the value in gpr2 is a real error
-   number.  Linus said he will make sure the no syscall returns a value
+   number.  Linus said he will make sure that no syscall returns a value
    in -1 .. -4095 as a valid result so we can savely test with -4095.  */
 
 #undef PSEUDO
@@ -368,12 +368,12 @@
     DECLARGS_##nr(args)							      \
     register long _ret asm("2");						      \
     asm volatile (							      \
-    "lr 11,14\n\t"							      \
+    "lr 10,14\n\t"                                                           \
     "basr 14,%1\n\t"							      \
-    "lr 14,11\n\t"							      \
+    "lr 14,10\n\t"                                                           \
     : "=d" (_ret)							      \
     : "d" (fn) ASMFMT_##nr						      \
-    : "cc", "memory", "0", "1", "11" CLOBBER_##nr);			      \
+    : "cc", "memory", "0", "1", "10" CLOBBER_##nr);                          \
     _ret; })
 
 /* Pointer mangling support.  */

Modified: fsf/trunk/libc/sysdeps/unix/sysv/linux/s390/s390-64/sysdep.h
==============================================================================
--- fsf/trunk/libc/sysdeps/unix/sysv/linux/s390/s390-64/sysdep.h (original)
+++ fsf/trunk/libc/sysdeps/unix/sysv/linux/s390/s390-64/sysdep.h Wed Jul 20 00:02:35 2011
@@ -62,7 +62,7 @@
    even if the call succeeded.	E.g., the `lseek' system call might return
    a large offset.  Therefore we must not anymore test for < 0, but test
    for a real error by making sure the value in gpr2 is a real error
-   number.  Linus said he will make sure the no syscall returns a value
+   number.  Linus said he will make sure that no syscall returns a value
    in -1 .. -4095 as a valid result so we can savely test with -4095.  */
 
 #undef PSEUDO
@@ -370,12 +370,12 @@
     DECLARGS_##nr(args)							      \
     register long _ret asm("2");					      \
     asm volatile (							      \
-    "lgr 11,14\n\t"							      \
+    "lgr 10,14\n\t"                                                          \
     "basr 14,%1\n\t"							      \
-    "lgr 14,11\n\t"							      \
+    "lgr 14,10\n\t"                                                          \
     : "=d" (_ret)							      \
     : "a" (fn) ASMFMT_##nr						      \
-    : "cc", "memory", "0", "1", "11" CLOBBER_##nr);			      \
+    : "cc", "memory", "0", "1", "10" CLOBBER_##nr);                          \
     _ret; })
 
 /* Pointer mangling support.  */

Modified: fsf/trunk/libc/sysdeps/x86_64/multiarch/Makefile
==============================================================================
--- fsf/trunk/libc/sysdeps/x86_64/multiarch/Makefile (original)
+++ fsf/trunk/libc/sysdeps/x86_64/multiarch/Makefile Wed Jul 20 00:02:35 2011
@@ -5,14 +5,16 @@
 
 ifeq ($(subdir),string)
 
-sysdep_routines += stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \
+sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \
 		   strend-sse4 memcmp-sse4 memcpy-ssse3 mempcpy-ssse3 \
 		   memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \
 		   memmove-ssse3-back strcasestr-nonascii strcasecmp_l-ssse3 \
 		   strncase_l-ssse3 strlen-sse4 strlen-no-bsf memset-x86-64 \
 		   strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
 		   strcpy-sse2-unaligned strncpy-sse2-unaligned \
-		   stpcpy-sse2-unaligned stpncpy-sse2-unaligned
+		   stpcpy-sse2-unaligned stpncpy-sse2-unaligned \
+		   strcat-sse2-unaligned strncat-sse2-unaligned \
+		   strcat-ssse3 strncat-ssse3 strlen-sse2-pminub
 ifeq (yes,$(config-cflags-sse4))
 sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c varshift
 CFLAGS-varshift.c += -msse4

Modified: fsf/trunk/libc/sysdeps/x86_64/multiarch/init-arch.c
==============================================================================
--- fsf/trunk/libc/sysdeps/x86_64/multiarch/init-arch.c (original)
+++ fsf/trunk/libc/sysdeps/x86_64/multiarch/init-arch.c Wed Jul 20 00:02:35 2011
@@ -97,18 +97,22 @@
 	    case 0x2c:
 	    case 0x2e:
 	    case 0x2f:
-	      /* Rep string instructions, copy backward and unaligned loads
-		 are fast on Intel Core i3, i5 and i7.  */
+	      /* Rep string instructions, copy backward, unaligned loads
+		 and pminub are fast on Intel Core i3, i5 and i7.  */
 #if index_Fast_Rep_String != index_Fast_Copy_Backward
 # error index_Fast_Rep_String != index_Fast_Copy_Backward
 #endif
 #if index_Fast_Rep_String != index_Fast_Unaligned_Load
 # error index_Fast_Rep_String != index_Fast_Unaligned_Load
 #endif
+#if index_Fast_Rep_String != index_Prefer_PMINUB_for_stringop
+# error index_Fast_Rep_String != index_Prefer_PMINUB_for_stringop
+#endif
 	      __cpu_features.feature[index_Fast_Rep_String]
 		|= (bit_Fast_Rep_String
 		    | bit_Fast_Copy_Backward
-		    | bit_Fast_Unaligned_Load);
+		    | bit_Fast_Unaligned_Load
+		    | bit_Prefer_PMINUB_for_stringop);
 	      break;
 	    }
 	}

Modified: fsf/trunk/libc/sysdeps/x86_64/multiarch/init-arch.h
==============================================================================
--- fsf/trunk/libc/sysdeps/x86_64/multiarch/init-arch.h (original)
+++ fsf/trunk/libc/sysdeps/x86_64/multiarch/init-arch.h Wed Jul 20 00:02:35 2011
@@ -21,6 +21,7 @@
 #define bit_Slow_BSF			(1 << 2)
 #define bit_Prefer_SSE_for_memop	(1 << 3)
 #define bit_Fast_Unaligned_Load		(1 << 4)
+#define bit_Prefer_PMINUB_for_stringop	(1 << 5)
 
 #ifdef	__ASSEMBLER__
 
@@ -41,6 +42,7 @@
 # define index_Slow_BSF			FEATURE_INDEX_1*FEATURE_SIZE
 # define index_Prefer_SSE_for_memop	FEATURE_INDEX_1*FEATURE_SIZE
 # define index_Fast_Unaligned_Load	FEATURE_INDEX_1*FEATURE_SIZE
+# define index_Prefer_PMINUB_for_stringop FEATURE_INDEX_1*FEATURE_SIZE
 
 #else	/* __ASSEMBLER__ */
 

Added: fsf/trunk/libc/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S
==============================================================================
--- fsf/trunk/libc/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S (added)
+++ fsf/trunk/libc/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S Wed Jul 20 00:02:35 2011
@@ -1,0 +1,54 @@
+/* strcat with SSE2
+   Copyright (C) 2011 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#ifndef NOT_IN_libc
+
+# include <sysdep.h>
+
+# ifndef STRCAT
+#  define STRCAT  __strcat_sse2_unaligned
+# endif
+
+# define USE_AS_STRCAT
+
+.text
+ENTRY (STRCAT)
+	mov	%rdi, %r9
+# ifdef USE_AS_STRNCAT
+	mov	%rdx, %r8
+# endif
+
+# define RETURN  jmp L(StartStrcpyPart)
+# include "strlen-sse2-pminub.S"
+# undef RETURN
+
+L(StartStrcpyPart):
+	lea	(%r9, %rax), %rdi
+	mov	%rsi, %rcx
+	mov	%r9, %rax      /* save result */
+
+# ifdef USE_AS_STRNCAT
+	test	%r8, %r8
+	jz	L(ExitZero)
+#  define USE_AS_STRNCPY
+# endif
+
+# include "strcpy-sse2-unaligned.S"
+#endif

Added: fsf/trunk/libc/sysdeps/x86_64/multiarch/strcat-ssse3.S
==============================================================================
--- fsf/trunk/libc/sysdeps/x86_64/multiarch/strcat-ssse3.S (added)
+++ fsf/trunk/libc/sysdeps/x86_64/multiarch/strcat-ssse3.S Wed Jul 20 00:02:35 2011
@@ -1,0 +1,558 @@
+/* strcat with SSSE3
+   Copyright (C) 2011 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#ifndef NOT_IN_libc
+
+# include <sysdep.h>
+
+# ifndef STRCAT
+#  define STRCAT  __strcat_ssse3
+# endif
+
+# define USE_AS_STRCAT
+
+.text
+ENTRY (STRCAT)
+# ifdef USE_AS_STRNCAT
+	mov	%rdx, %r8	/* strncat only: copy the count argument from %rdx to %r8 */
+# endif
+
+# define RETURN  jmp L(StartStrcpyPart)	/* presumably used by the included strlen code in place of ret -- confirm in strlen-no-bsf.S */
+# include "strlen-no-bsf.S"
+
+# undef RETURN
+
+L(StartStrcpyPart):
+	mov	%rsi, %rcx	/* %rcx = src cursor */
+	lea	(%rdi, %rax), %rdx	/* %rdx = %rdi + %rax (assumed to be the dst length left by the strlen code) */
+# ifdef USE_AS_STRNCAT
+	test	%r8, %r8
+	jz	L(StrncatExit0)	/* count == 0: return without writing anything */
+	cmp	$8, %r8
+	jbe	L(StrncatExit8Bytes)	/* count <= 8: use the path that checks the count after every byte */
+# endif
+	cmpb	$0, (%rcx)	/* probe src one byte at a time; a NUL at index k leaves via Exit(k+1) */
+	jz	L(Exit1)
+	cmpb	$0, 1(%rcx)
+	jz	L(Exit2)
+	cmpb	$0, 2(%rcx)
+	jz	L(Exit3)
+	cmpb	$0, 3(%rcx)
+	jz	L(Exit4)
+	cmpb	$0, 4(%rcx)
+	jz	L(Exit5)
+	cmpb	$0, 5(%rcx)
+	jz	L(Exit6)
+	cmpb	$0, 6(%rcx)
+	jz	L(Exit7)
+	cmpb	$0, 7(%rcx)
+	jz	L(Exit8)
+	cmpb	$0, 8(%rcx)
+	jz	L(Exit9)
+# ifdef USE_AS_STRNCAT
+	cmp	$16, %r8
+	jb	L(StrncatExit15Bytes)	/* count < 16 (and > 8, given the jbe above): finish on the count-checked path */
+# endif
+	cmpb	$0, 9(%rcx)
+	jz	L(Exit10)
+	cmpb	$0, 10(%rcx)
+	jz	L(Exit11)
+	cmpb	$0, 11(%rcx)
+	jz	L(Exit12)
+	cmpb	$0, 12(%rcx)
+	jz	L(Exit13)
+	cmpb	$0, 13(%rcx)
+	jz	L(Exit14)
+	cmpb	$0, 14(%rcx)
+	jz	L(Exit15)
+	cmpb	$0, 15(%rcx)
+	jz	L(Exit16)
+# ifdef USE_AS_STRNCAT
+	cmp	$16, %r8
+	je	L(StrncatExit16)	/* count == 16 and no NUL in src[0..15]: copy 16 bytes and terminate */
+#  define USE_AS_STRNCPY	/* enables the USE_AS_STRNCPY-conditional code here and, presumably, in the file included below */
+# endif
+
+# include "strcpy-ssse3.S"
+
+	.p2align 4
+L(CopyFrom1To16Bytes):	/* dispatch on the lowest set bit of %al: bit k -> Exit(k+1) */
+	add	%rsi, %rdx	/* NOTE(review): %rsi is added to both cursors, so it presumably holds a byte offset set up by strcpy-ssse3.S -- confirm */
+	add	%rsi, %rcx
+
+	test	%al, %al
+	jz	L(ExitHigh)	/* no bit set in %al: look at %ah instead */
+	test	$0x01, %al
+	jnz	L(Exit1)
+	test	$0x02, %al
+	jnz	L(Exit2)
+	test	$0x04, %al
+	jnz	L(Exit3)
+	test	$0x08, %al
+	jnz	L(Exit4)
+	test	$0x10, %al
+	jnz	L(Exit5)
+	test	$0x20, %al
+	jnz	L(Exit6)
+	test	$0x40, %al
+	jnz	L(Exit7)
+	movlpd	(%rcx), %xmm0	/* %al is nonzero with bits 0..6 clear, so bit 7 is set: copy 8 bytes */
+	movlpd	%xmm0, (%rdx)
+	mov	%rdi, %rax	/* return value = %rdi */
+	ret
+
+	.p2align 4
+L(ExitHigh):	/* dispatch on the lowest set bit of %ah: bit k -> Exit(k+9) */
+	test	$0x01, %ah
+	jnz	L(Exit9)
+	test	$0x02, %ah
+	jnz	L(Exit10)
+	test	$0x04, %ah
+	jnz	L(Exit11)
+	test	$0x08, %ah
+	jnz	L(Exit12)
+	test	$0x10, %ah
+	jnz	L(Exit13)
+	test	$0x20, %ah
+	jnz	L(Exit14)
+	test	$0x40, %ah
+	jnz	L(Exit15)
+	movlpd	(%rcx), %xmm0	/* bits 0..6 of %ah clear: copy 16 bytes as two 8-byte moves */
+	movlpd	8(%rcx), %xmm1
+	movlpd	%xmm0, (%rdx)
+	movlpd	%xmm1, 8(%rdx)
+	mov	%rdi, %rax	/* return value = %rdi */
+	ret
+
+	.p2align 4
+L(StrncatExit1):	/* each StrncatExitN stores a zero byte at N(%rdx), then falls into ExitN */
+	xor	%ah, %ah
+	movb	%ah, 1(%rdx)
+L(Exit1):	/* each ExitN copies N bytes from (%rcx) to (%rdx) and returns %rdi */
+	movb	(%rcx), %al
+	movb	%al, (%rdx)
+	mov	%rdi, %rax
+	ret
+
+	.p2align 4
+L(StrncatExit2):
+	xor	%ah, %ah
+	movb	%ah, 2(%rdx)
+L(Exit2):
+	movw	(%rcx), %ax
+	movw	%ax, (%rdx)
+	mov	%rdi, %rax
+	ret
+
+	.p2align 4
+L(StrncatExit3):
+	xor	%ah, %ah
+	movb	%ah, 3(%rdx)
+L(Exit3):
+	movw	(%rcx), %ax	/* 3 bytes = one 2-byte move plus one 1-byte move */
+	movw	%ax, (%rdx)
+	movb	2(%rcx), %al
+	movb	%al, 2(%rdx)
+	mov	%rdi, %rax
+	ret
+
+	.p2align 4
+L(StrncatExit4):
+	xor	%ah, %ah
+	movb	%ah, 4(%rdx)
+L(Exit4):
+	mov	(%rcx), %eax
+	mov	%eax, (%rdx)
+	mov	%rdi, %rax
+	ret
+
+	.p2align 4
+L(StrncatExit5):
+	xor	%ah, %ah
+	movb	%ah, 5(%rdx)
+L(Exit5):
+	mov	(%rcx), %eax	/* 5 bytes = one 4-byte move plus one 1-byte move */
+	mov	%eax, (%rdx)
+	movb	4(%rcx), %al
+	movb	%al, 4(%rdx)
+	mov	%rdi, %rax
+	ret
+
+	.p2align 4
+L(StrncatExit6):
+	xor	%ah, %ah
+	movb	%ah, 6(%rdx)
+L(Exit6):
+	mov	(%rcx), %eax	/* 6 bytes = one 4-byte move plus one 2-byte move */
+	mov	%eax, (%rdx)
+	movw	4(%rcx), %ax
+	movw	%ax, 4(%rdx)
+	mov	%rdi, %rax
+	ret
+
+	.p2align 4
+L(StrncatExit7):
+	xor	%ah, %ah
+	movb	%ah, 7(%rdx)
+L(Exit7):
+	mov	(%rcx), %eax	/* 7 bytes = two 4-byte moves at offsets 0 and 3 (byte 3 is copied twice) */
+	mov	%eax, (%rdx)
+	mov	3(%rcx), %eax
+	mov	%eax, 3(%rdx)
+	mov	%rdi, %rax
+	ret
+
+	.p2align 4
+L(StrncatExit8):
+	xor	%ah, %ah
+	movb	%ah, 8(%rdx)
+L(Exit8):
+	movlpd	(%rcx), %xmm0	/* 8 bytes through the low half of %xmm0 */
+	movlpd	%xmm0, (%rdx)
+	mov	%rdi, %rax
+	ret
+
+	.p2align 4
+L(StrncatExit9):	/* each StrncatExitN stores a zero byte at N(%rdx), then falls into ExitN */
+	xor	%ah, %ah
+	movb	%ah, 9(%rdx)
+L(Exit9):	/* each ExitN copies N bytes from (%rcx) to (%rdx) and returns %rdi */
+	movlpd	(%rcx), %xmm0	/* 9 bytes = one 8-byte move plus one 1-byte move */
+	movlpd	%xmm0, (%rdx)
+	movb	8(%rcx), %al
+	movb	%al, 8(%rdx)
+	mov	%rdi, %rax
+	ret
+
+	.p2align 4
+L(StrncatExit10):
+	xor	%ah, %ah
+	movb	%ah, 10(%rdx)
+L(Exit10):
+	movlpd	(%rcx), %xmm0	/* 10 bytes = one 8-byte move plus one 2-byte move */
+	movlpd	%xmm0, (%rdx)
+	movw	8(%rcx), %ax
+	movw	%ax, 8(%rdx)
+	mov	%rdi, %rax
+	ret
+
+	.p2align 4
+L(StrncatExit11):
+	xor	%ah, %ah
+	movb	%ah, 11(%rdx)
+L(Exit11):
+	movlpd	(%rcx), %xmm0	/* 11 bytes = 8-byte move at 0 plus 4-byte move at 7 (byte 7 is copied twice) */
+	movlpd	%xmm0, (%rdx)
+	mov	7(%rcx), %eax
+	mov	%eax, 7(%rdx)
+	mov	%rdi, %rax
+	ret
+
+	.p2align 4
+L(StrncatExit12):
+	xor	%ah, %ah
+	movb	%ah, 12(%rdx)
+L(Exit12):
+	movlpd	(%rcx), %xmm0	/* 12 bytes = one 8-byte move plus one 4-byte move */
+	movlpd	%xmm0, (%rdx)
+	mov	8(%rcx), %eax
+	mov	%eax, 8(%rdx)
+	mov	%rdi, %rax
+	ret
+
+	.p2align 4
+L(StrncatExit13):
+	xor	%ah, %ah
+	movb	%ah, 13(%rdx)
+L(Exit13):
+	movlpd	(%rcx), %xmm0	/* 13 bytes = two 8-byte moves at offsets 0 and 5 (bytes 5..7 are copied twice) */
+	movlpd	%xmm0, (%rdx)
+	movlpd	5(%rcx), %xmm1
+	movlpd	%xmm1, 5(%rdx)
+	mov	%rdi, %rax
+	ret
+
+	.p2align 4
+L(StrncatExit14):
+	xor	%ah, %ah
+	movb	%ah, 14(%rdx)
+L(Exit14):
+	movlpd	(%rcx), %xmm0	/* 14 bytes = two 8-byte moves at offsets 0 and 6 (bytes 6..7 are copied twice) */
+	movlpd	%xmm0, (%rdx)
+	movlpd	6(%rcx), %xmm1
+	movlpd	%xmm1, 6(%rdx)
+	mov	%rdi, %rax
+	ret
+
+	.p2align 4
+L(StrncatExit15):
+	xor	%ah, %ah
+	movb	%ah, 15(%rdx)
+L(Exit15):
+	movlpd	(%rcx), %xmm0	/* 15 bytes = two 8-byte moves at offsets 0 and 7 (byte 7 is copied twice) */
+	movlpd	%xmm0, (%rdx)
+	movlpd	7(%rcx), %xmm1
+	movlpd	%xmm1, 7(%rdx)
+	mov	%rdi, %rax
+	ret
+
+	.p2align 4
+L(StrncatExit16):
+	xor	%ah, %ah
+	movb	%ah, 16(%rdx)
+L(Exit16):
+	movlpd	(%rcx), %xmm0	/* 16 bytes = two 8-byte moves at offsets 0 and 8 */
+	movlpd	8(%rcx), %xmm1
+	movlpd	%xmm0, (%rdx)
+	movlpd	%xmm1, 8(%rdx)
+	mov	%rdi, %rax
+	ret
+
+# ifdef USE_AS_STRNCPY
+
+	.p2align 4
+L(CopyFrom1To16BytesCase2):	/* both the bit mask in %ax and the count in %r8 can end the copy; whichever comes first wins */
+	add	$16, %r8	/* NOTE(review): looks like %r8 arrives biased by -16 -- confirm in strcpy-ssse3.S */
+	add	%rsi, %rcx
+	lea	(%rsi, %rdx), %rsi	/* park the adjusted %rdx in %rsi while %rdx/%dh is used as scratch */
+	lea	-9(%r8), %rdx	/* %rdx = %r8 - 9, negative when %r8 <= 8 */
+	and	$1<<7, %dh	/* keep only bit 15 of that value (set when it is negative) */
+	or	%al, %dh	/* merge in the low 8 mask bits */
+	test	%dh, %dh	/* zero only when bit 15 was clear and %al is zero */
+	lea	(%rsi), %rdx	/* restore %rdx; lea leaves the flags from the test intact */
+	jz	L(ExitHighCase2)
+
+	test	$0x01, %al	/* for N = 1..7: mask bit N-1 set -> ExitN; else %r8 == N -> StrncatExitN */
+	jnz	L(Exit1)
+	cmp	$1, %r8
+	je	L(StrncatExit1)
+	test	$0x02, %al
+	jnz	L(Exit2)
+	cmp	$2, %r8
+	je	L(StrncatExit2)
+	test	$0x04, %al
+	jnz	L(Exit3)
+	cmp	$3, %r8
+	je	L(StrncatExit3)
+	test	$0x08, %al
+	jnz	L(Exit4)
+	cmp	$4, %r8
+	je	L(StrncatExit4)
+	test	$0x10, %al
+	jnz	L(Exit5)
+	cmp	$5, %r8
+	je	L(StrncatExit5)
+	test	$0x20, %al
+	jnz	L(Exit6)
+	cmp	$6, %r8
+	je	L(StrncatExit6)
+	test	$0x40, %al
+	jnz	L(Exit7)
+	cmp	$7, %r8
+	je	L(StrncatExit7)
+	movlpd	(%rcx), %xmm0	/* none of the above: copy 8 bytes, then place the terminator */
+	movlpd	%xmm0, (%rdx)
+	lea	7(%rdx), %rax
+	cmpb	$1, (%rax)	/* CF = 1 iff the byte just copied to 7(%rdx) is zero */
+	sbb	$-1, %rax	/* %rax += 1 - CF: stay on that zero byte, else step to 8(%rdx) */
+	xor	%cl, %cl
+	movb	%cl, (%rax)	/* store the terminating zero byte */
+	mov	%rdi, %rax	/* return value = %rdi */
+	ret
+
+	.p2align 4
+L(ExitHighCase2):	/* for N = 9..15: bit N-9 of %ah set -> ExitN; else %r8 == N -> StrncatExitN */
+	test	$0x01, %ah
+	jnz	L(Exit9)
+	cmp	$9, %r8
+	je	L(StrncatExit9)
+	test	$0x02, %ah
+	jnz	L(Exit10)
+	cmp	$10, %r8
+	je	L(StrncatExit10)
+	test	$0x04, %ah
+	jnz	L(Exit11)
+	cmp	$11, %r8
+	je	L(StrncatExit11)
+	test	$0x8, %ah
+	jnz	L(Exit12)
+	cmp	$12, %r8
+	je	L(StrncatExit12)
+	test	$0x10, %ah
+	jnz	L(Exit13)
+	cmp	$13, %r8
+	je	L(StrncatExit13)
+	test	$0x20, %ah
+	jnz	L(Exit14)
+	cmp	$14, %r8
+	je	L(StrncatExit14)
+	test	$0x40, %ah
+	jnz	L(Exit15)
+	cmp	$15, %r8
+	je	L(StrncatExit15)
+	movlpd	(%rcx), %xmm0	/* none matched: copy 16 bytes, no extra terminator stored; presumably bit 7 of %ah is set so byte 15 is the NUL -- confirm */
+	movlpd	%xmm0, (%rdx)
+	movlpd	8(%rcx), %xmm1
+	movlpd	%xmm1, 8(%rdx)
+	mov	%rdi, %rax	/* return value = %rdi */
+	ret
+
+L(CopyFrom1To16BytesCase2OrCase3):
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16BytesCase2)	/* %rax nonzero: Case2; otherwise fall through into Case3 */
+
+	.p2align 4
+L(CopyFrom1To16BytesCase3):	/* only the count in %r8 decides how many bytes are copied */
+	add	$16, %r8	/* NOTE(review): looks like %r8 arrives biased by -16 -- confirm in strcpy-ssse3.S */
+	add	%rsi, %rdx	/* NOTE(review): %rsi presumably holds a byte offset set up by strcpy-ssse3.S -- confirm */
+	add	%rsi, %rcx
+
+	cmp	$8, %r8
+	ja	L(ExitHighCase3)	/* %r8 > 8: handled by the 9..16 ladder */
+	cmp	$1, %r8	/* %r8 == N (1..7): copy N bytes and terminate via StrncatExitN */
+	je	L(StrncatExit1)
+	cmp	$2, %r8
+	je	L(StrncatExit2)
+	cmp	$3, %r8
+	je	L(StrncatExit3)
+	cmp	$4, %r8
+	je	L(StrncatExit4)
+	cmp	$5, %r8
+	je	L(StrncatExit5)
+	cmp	$6, %r8
+	je	L(StrncatExit6)
+	cmp	$7, %r8
+	je	L(StrncatExit7)
+	movlpd	(%rcx), %xmm0	/* none of 1..7 matched: copy 8 bytes */
+	movlpd	%xmm0, (%rdx)
+	xor	%ah, %ah
+	movb	%ah, 8(%rdx)	/* store the terminating zero byte at 8(%rdx) */
+	mov	%rdi, %rax	/* return value = %rdi */
+	ret
+
+	.p2align 4
+L(ExitHighCase3):	/* %r8 == N (9..15): copy N bytes and terminate via StrncatExitN */
+	cmp	$9, %r8
+	je	L(StrncatExit9)
+	cmp	$10, %r8
+	je	L(StrncatExit10)
+	cmp	$11, %r8
+	je	L(StrncatExit11)
+	cmp	$12, %r8
+	je	L(StrncatExit12)
+	cmp	$13, %r8
+	je	L(StrncatExit13)
+	cmp	$14, %r8
+	je	L(StrncatExit14)
+	cmp	$15, %r8
+	je	L(StrncatExit15)
+	movlpd	(%rcx), %xmm0	/* none of 9..15 matched: copy 16 bytes */
+	movlpd	%xmm0, (%rdx)
+	movlpd	8(%rcx), %xmm1
+	movlpd	%xmm1, 8(%rdx)
+	xor	%ah, %ah
+	movb	%ah, 16(%rdx)	/* store the terminating zero byte at 16(%rdx) */
+	mov	%rdi, %rax	/* return value = %rdi */
+	ret
+
+	.p2align 4
+L(StrncatExit0):	/* reached when the count is zero: nothing is written */
+	mov	%rdi, %rax	/* return value = %rdi */
+	ret
+
+	.p2align 4
+L(StrncatExit15Bytes):	/* entered with 8 < %r8 < 16 after src[0..8] were found non-zero */
+	cmp	$9, %r8	/* alternate: %r8 == N -> StrncatExitN; src[N] == 0 -> Exit(N+1) */
+	je	L(StrncatExit9)
+	cmpb	$0, 9(%rcx)
+	jz	L(Exit10)
+	cmp	$10, %r8
+	je	L(StrncatExit10)
+	cmpb	$0, 10(%rcx)
+	jz	L(Exit11)
+	cmp	$11, %r8
+	je	L(StrncatExit11)
+	cmpb	$0, 11(%rcx)
+	jz	L(Exit12)
+	cmp	$12, %r8
+	je	L(StrncatExit12)
+	cmpb	$0, 12(%rcx)
+	jz	L(Exit13)
+	cmp	$13, %r8
+	je	L(StrncatExit13)
+	cmpb	$0, 13(%rcx)
+	jz	L(Exit14)
+	cmp	$14, %r8
+	je	L(StrncatExit14)
+	movlpd	(%rcx), %xmm0	/* none matched: copy 15 bytes as two 8-byte moves at offsets 0 and 7 */
+	movlpd	%xmm0, (%rdx)
+	movlpd	7(%rcx), %xmm1
+	movlpd	%xmm1, 7(%rdx)
+	lea	14(%rdx), %rax
+	cmpb	$1, (%rax)	/* CF = 1 iff the byte just copied to 14(%rdx) is zero */
+	sbb	$-1, %rax	/* %rax += 1 - CF: stay on that zero byte, else step to 15(%rdx) */
+	xor	%cl, %cl
+	movb	%cl, (%rax)	/* store the terminating zero byte */
+	mov	%rdi, %rax	/* return value = %rdi */
+	ret
+
+	.p2align 4
+L(StrncatExit8Bytes):	/* entered with a non-zero %r8 <= 8 */
+	cmpb	$0, (%rcx)	/* alternate: src[N-1] == 0 -> ExitN; %r8 == N -> StrncatExitN */
+	jz	L(Exit1)
+	cmp	$1, %r8
+	je	L(StrncatExit1)
+	cmpb	$0, 1(%rcx)
+	jz	L(Exit2)
+	cmp	$2, %r8
+	je	L(StrncatExit2)
+	cmpb	$0, 2(%rcx)
+	jz	L(Exit3)
+	cmp	$3, %r8
+	je	L(StrncatExit3)
+	cmpb	$0, 3(%rcx)
+	jz	L(Exit4)
+	cmp	$4, %r8
+	je	L(StrncatExit4)
+	cmpb	$0, 4(%rcx)
+	jz	L(Exit5)
+	cmp	$5, %r8
+	je	L(StrncatExit5)
+	cmpb	$0, 5(%rcx)
+	jz	L(Exit6)
+	cmp	$6, %r8
+	je	L(StrncatExit6)
+	cmpb	$0, 6(%rcx)
+	jz	L(Exit7)
+	cmp	$7, %r8
+	je	L(StrncatExit7)
+	movlpd	(%rcx), %xmm0	/* none matched: copy 8 bytes */
+	movlpd	%xmm0, (%rdx)
+	lea	7(%rdx), %rax
+	cmpb	$1, (%rax)	/* CF = 1 iff the byte just copied to 7(%rdx) is zero */
+	sbb	$-1, %rax	/* %rax += 1 - CF: stay on that zero byte, else step to 8(%rdx) */
+	xor	%cl, %cl
+	movb	%cl, (%rax)	/* store the terminating zero byte */
+	mov	%rdi, %rax	/* return value = %rdi */
+	ret
+
+# endif
+END (STRCAT)
+#endif

Added: fsf/trunk/libc/sysdeps/x86_64/multiarch/strcat.S
==============================================================================
--- fsf/trunk/libc/sysdeps/x86_64/multiarch/strcat.S (added)
+++ fsf/trunk/libc/sysdeps/x86_64/multiarch/strcat.S Wed Jul 20 00:02:35 2011
@@ -1,0 +1,85 @@
+/* Multiple versions of strcat
+   Copyright (C) 2009, 2011 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#ifndef USE_AS_STRNCAT
+# ifndef STRCAT
+#  define STRCAT strcat
+# endif
+#endif
+
+#ifdef USE_AS_STRNCAT
+# define STRCAT_SSSE3	         	__strncat_ssse3
+# define STRCAT_SSE2	            	__strncat_sse2
+# define STRCAT_SSE2_UNALIGNED    	__strncat_sse2_unaligned
+# define __GI_STRCAT	            	__GI_strncat
+# define __GI___STRCAT              __GI___strncat
+#else
+# define STRCAT_SSSE3	         	__strcat_ssse3
+# define STRCAT_SSE2	            	__strcat_sse2
+# define STRCAT_SSE2_UNALIGNED    	__strcat_sse2_unaligned
+# define __GI_STRCAT	            	__GI_strcat
+# define __GI___STRCAT              __GI___strcat
+#endif
+
+
+/* Define multiple versions only for the definition in libc.  */
+#ifndef NOT_IN_libc
+	.text
+ENTRY(STRCAT)
+	.type	STRCAT, @gnu_indirect_function	/* IFUNC symbol: this code returns in %rax the address of the implementation to use */
+	cmpl	$0, __cpu_features+KIND_OFFSET(%rip)
+	jne	1f	/* field at KIND_OFFSET is non-zero: skip the __init_cpu_features call */
+	call	__init_cpu_features
+1:	leaq	STRCAT_SSE2_UNALIGNED(%rip), %rax	/* first choice: the sse2_unaligned version */
+	testl	$bit_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_Fast_Unaligned_Load(%rip)
+	jnz	2f	/* Fast_Unaligned_Load bit set: return that choice */
+	leaq	STRCAT_SSE2(%rip), %rax	/* otherwise fall back to the plain sse2 version ... */
+	testl	$bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
+	jz	2f	/* ... and return it when the SSSE3 bit is clear */
+	leaq	STRCAT_SSSE3(%rip), %rax	/* SSSE3 bit set: pick the ssse3 version */
+2:	ret
+END(STRCAT)
+
+# undef ENTRY	/* from here on ENTRY ignores its argument and always opens the STRCAT_SSE2 symbol */
+# define ENTRY(name) \
+	.type STRCAT_SSE2, @function; \
+	.align 16; \
+	STRCAT_SSE2: cfi_startproc; \
+	CALL_MCOUNT
+# undef END	/* END likewise always closes STRCAT_SSE2, whatever name it is given */
+# define END(name) \
+	cfi_endproc; .size STRCAT_SSE2, .-STRCAT_SSE2
+# undef libc_hidden_builtin_def
+/* It doesn't make sense to send libc-internal strcat calls through a PLT.
+   The speedup we get from using SSSE3 instruction is likely eaten away
+   by the indirect call in the PLT.  */
+# define libc_hidden_builtin_def(name) \
+	.globl __GI_STRCAT; __GI_STRCAT = STRCAT_SSE2
+# undef libc_hidden_def	/* same treatment: __GI___STRCAT becomes a global alias of STRCAT_SSE2 */
+# define libc_hidden_def(name) \
+	.globl __GI___STRCAT; __GI___STRCAT = STRCAT_SSE2
+#endif
+
+#ifndef USE_AS_STRNCAT
+# include "../strcat.S"
+#endif

Modified: fsf/trunk/libc/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S
==============================================================================
--- fsf/trunk/libc/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S (original)
+++ fsf/trunk/libc/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S Wed Jul 20 00:02:35 2011
@@ -20,10 +20,13 @@
 
 #ifndef NOT_IN_libc
 
-# include <sysdep.h>
-
-# ifndef STRCPY
-#  define STRCPY  __strcpy_sse2_unaligned
+# ifndef USE_AS_STRCAT
+#  include <sysdep.h>
+
+#  ifndef STRCPY
+#   define STRCPY  __strcpy_sse2_unaligned
+#  endif
+
 # endif
 
 # define JMPTBL(I, B)	I - B
@@ -33,16 +36,20 @@
 	lea	(%r11, %rcx), %rcx;                             \
 	jmp	*%rcx
 
-	.text
+# ifndef USE_AS_STRCAT
+
+.text
 ENTRY (STRCPY)
-# ifdef USE_AS_STRNCPY
+#  ifdef USE_AS_STRNCPY
 	mov	%rdx, %r8
 	test	%r8, %r8
 	jz	L(ExitZero)
-# endif
+#  endif
 	mov	%rsi, %rcx
-# ifndef USE_AS_STPCPY
+#  ifndef USE_AS_STPCPY
 	mov	%rdi, %rax      /* save result */
+#  endif
+
 # endif
 
 	and	$15, %rcx
@@ -59,7 +66,7 @@
 	pmovmskb %xmm1, %rdx
 	shr	%cl, %rdx
 # ifdef USE_AS_STRNCPY
-#  if defined USE_AS_STPCPY
+#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
 	cmp	$16, %r8
 #  else
 	cmp	$17, %r8
@@ -72,7 +79,7 @@
 	pcmpeqb	16(%rsi), %xmm0
 	pmovmskb %xmm0, %rdx
 # ifdef USE_AS_STRNCPY
-#  if defined USE_AS_STPCPY
+#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
 	cmp	$32, %r8
 #  else
 	cmp	$33, %r8
@@ -102,7 +109,7 @@
 	jbe	L(CopyFrom1To16BytesCase2OrCase3)
 # endif
 	test	%rdx, %rdx
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 	jnz	L(CopyFrom1To16BytesUnalignedXmm2)
 # else
 	jnz	L(CopyFrom1To16Bytes)
@@ -118,7 +125,7 @@
 	jbe	L(CopyFrom1To16BytesCase2OrCase3)
 # endif
 	test	%rdx, %rdx
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 	jnz	L(CopyFrom1To16BytesUnalignedXmm3)
 # else
 	jnz	L(CopyFrom1To16Bytes)
@@ -134,7 +141,7 @@
 	jbe	L(CopyFrom1To16BytesCase2OrCase3)
 # endif
 	test	%rdx, %rdx
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 	jnz	L(CopyFrom1To16BytesUnalignedXmm4)
 # else
 	jnz	L(CopyFrom1To16Bytes)
@@ -150,7 +157,7 @@
 	jbe	L(CopyFrom1To16BytesCase2OrCase3)
 # endif
 	test	%rdx, %rdx
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 	jnz	L(CopyFrom1To16BytesUnalignedXmm1)
 # else
 	jnz	L(CopyFrom1To16Bytes)
@@ -166,7 +173,7 @@
 	jbe	L(CopyFrom1To16BytesCase2OrCase3)
 # endif
 	test	%rdx, %rdx
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 	jnz	L(CopyFrom1To16BytesUnalignedXmm2)
 # else
 	jnz	L(CopyFrom1To16Bytes)
@@ -182,7 +189,7 @@
 	jbe	L(CopyFrom1To16BytesCase2OrCase3)
 # endif
 	test	%rdx, %rdx
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 	jnz	L(CopyFrom1To16BytesUnalignedXmm3)
 # else
 	jnz	L(CopyFrom1To16Bytes)
@@ -264,10 +271,10 @@
 	movdqu	%xmm4, (%rdi)
 	movdqu	%xmm5, 16(%rdi)
 	movdqu	%xmm6, 32(%rdi)
-# if defined USE_AS_STRNCPY
-#  ifdef USE_AS_STPCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+# ifdef USE_AS_STPCPY
 	lea	48(%rdi, %rdx), %rax
-#  endif
+# endif
 	movdqu	%xmm7, 48(%rdi)
 	add	$15, %r8
 	sub	%rdx, %r8
@@ -288,7 +295,7 @@
 	pmovmskb %xmm0, %rdx
 
 # ifdef USE_AS_STRNCPY
-#  if defined USE_AS_STPCPY
+#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
 	cmp	$16, %r8
 #  else
 	cmp	$17, %r8
@@ -303,7 +310,7 @@
 	pmovmskb %xmm0, %rdx
 
 # ifdef USE_AS_STRNCPY
-#  if defined USE_AS_STPCPY
+#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
 	cmp	$32, %r8
 #  else
 	cmp	$33, %r8
@@ -314,11 +321,11 @@
 	jnz	L(CopyFrom1To32Bytes1)
 	jmp	L(Unalign16Both)
 
-/* ------End of main part with loops--------------------- */
+/*------End of main part with loops---------------------*/
 
 /* Case1 */
 
-# if (!defined USE_AS_STRNCPY)
+# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT)
 	.p2align 4
 L(CopyFrom1To16Bytes):
 	add	%rcx, %rdi
@@ -328,7 +335,7 @@
 # endif
 	.p2align 4
 L(CopyFrom1To16BytesTail):
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 	sub	%rcx, %r8
 # endif
 	add	%rcx, %rsi
@@ -339,7 +346,7 @@
 L(CopyFrom1To32Bytes1):
 	add	$16, %rsi
 	add	$16, %rdi
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 	sub	$16, %r8
 # endif
 L(CopyFrom1To16BytesTail1):
@@ -348,7 +355,7 @@
 
 	.p2align 4
 L(CopyFrom1To32Bytes):
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 	sub	%rcx, %r8
 # endif
 	bsf	%rdx, %rdx
@@ -360,10 +367,10 @@
 	.p2align 4
 L(CopyFrom1To16BytesUnaligned_0):
 	bsf	%rdx, %rdx
-# if defined USE_AS_STRNCPY
-#  ifdef USE_AS_STPCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+# ifdef USE_AS_STPCPY
 	lea	(%rdi, %rdx), %rax
-#  endif
+# endif
 	movdqu	%xmm4, (%rdi)
 	add	$63, %r8
 	sub	%rdx, %r8
@@ -377,10 +384,10 @@
 L(CopyFrom1To16BytesUnaligned_16):
 	bsf	%rcx, %rdx
 	movdqu	%xmm4, (%rdi)
-# if defined USE_AS_STRNCPY
-#  ifdef USE_AS_STPCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+# ifdef USE_AS_STPCPY
 	lea	16(%rdi, %rdx), %rax
-#  endif
+# endif
 	movdqu	%xmm5, 16(%rdi)
 	add	$47, %r8
 	sub	%rdx, %r8
@@ -397,10 +404,10 @@
 	bsf	%rdx, %rdx
 	movdqu	%xmm4, (%rdi)
 	movdqu	%xmm5, 16(%rdi)
-# if defined USE_AS_STRNCPY
-#  ifdef USE_AS_STPCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+# ifdef USE_AS_STPCPY
 	lea	32(%rdi, %rdx), %rax
-#  endif
+# endif
 	movdqu	%xmm6, 32(%rdi)
 	add	$31, %r8
 	sub	%rdx, %r8
@@ -413,6 +420,7 @@
 # endif
 
 # ifdef USE_AS_STRNCPY
+#  ifndef USE_AS_STRCAT
 	.p2align 4
 L(CopyFrom1To16BytesUnalignedXmm6):
 	movdqu	%xmm6, (%rdi, %rcx)
@@ -437,6 +445,7 @@
 L(CopyFrom1To16BytesUnalignedXmm1):
 	movdqu	%xmm1, (%rdi, %rcx)
 	jmp	L(CopyFrom1To16BytesXmmExit)
+#  endif
 
 	.p2align 4
 L(CopyFrom1To16BytesExit):
@@ -519,7 +528,7 @@
 
 # endif
 
-/* ----End labels regarding with copying 1-16 bytes--and 1-32 bytes---- */
+/*------------End of labels for copying 1-16 bytes and 1-32 bytes----*/
 
 	.p2align 4
 L(Exit1):
@@ -527,7 +536,7 @@
 # ifdef USE_AS_STPCPY
 	lea	(%rdi), %rax
 # endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 	sub	$1, %r8
 	lea	1(%rdi), %rdi
 	jnz	L(StrncpyFillTailWithZero)
@@ -541,7 +550,7 @@
 # ifdef USE_AS_STPCPY
 	lea	1(%rdi), %rax
 # endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 	sub	$2, %r8
 	lea	2(%rdi), %rdi
 	jnz	L(StrncpyFillTailWithZero)
@@ -556,7 +565,7 @@
 # ifdef USE_AS_STPCPY
 	lea	2(%rdi), %rax
 # endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 	sub	$3, %r8
 	lea	3(%rdi), %rdi
 	jnz	L(StrncpyFillTailWithZero)
@@ -570,7 +579,7 @@
 # ifdef USE_AS_STPCPY
 	lea	3(%rdi), %rax
 # endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 	sub	$4, %r8
 	lea	4(%rdi), %rdi
 	jnz	L(StrncpyFillTailWithZero)
@@ -585,7 +594,7 @@
 # ifdef USE_AS_STPCPY
 	lea	4(%rdi), %rax
 # endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 	sub	$5, %r8
 	lea	5(%rdi), %rdi
 	jnz	L(StrncpyFillTailWithZero)
@@ -601,7 +610,7 @@
 # ifdef USE_AS_STPCPY
 	lea	5(%rdi), %rax
 # endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 	sub	$6, %r8
 	lea	6(%rdi), %rdi
 	jnz	L(StrncpyFillTailWithZero)
@@ -617,7 +626,7 @@
 # ifdef USE_AS_STPCPY
 	lea	6(%rdi), %rax
 # endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 	sub	$7, %r8
 	lea	7(%rdi), %rdi
 	jnz	L(StrncpyFillTailWithZero)
@@ -631,7 +640,7 @@
 # ifdef USE_AS_STPCPY
 	lea	7(%rdi), %rax
 # endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 	sub	$8, %r8
 	lea	8(%rdi), %rdi
 	jnz	L(StrncpyFillTailWithZero)
@@ -646,7 +655,7 @@
 # ifdef USE_AS_STPCPY
 	lea	8(%rdi), %rax
 # endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 	sub	$9, %r8
 	lea	9(%rdi), %rdi
 	jnz	L(StrncpyFillTailWithZero)
@@ -662,7 +671,7 @@
 # ifdef USE_AS_STPCPY
 	lea	9(%rdi), %rax
 # endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 	sub	$10, %r8
 	lea	10(%rdi), %rdi
 	jnz	L(StrncpyFillTailWithZero)
@@ -678,7 +687,7 @@
 # ifdef USE_AS_STPCPY
 	lea	10(%rdi), %rax
 # endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 	sub	$11, %r8
 	lea	11(%rdi), %rdi
 	jnz	L(StrncpyFillTailWithZero)
@@ -694,7 +703,7 @@
 # ifdef USE_AS_STPCPY
 	lea	11(%rdi), %rax
 # endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 	sub	$12, %r8
 	lea	12(%rdi), %rdi
 	jnz	L(StrncpyFillTailWithZero)
@@ -710,7 +719,7 @@
 # ifdef USE_AS_STPCPY
 	lea	12(%rdi), %rax
 # endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 	sub	$13, %r8
 	lea	13(%rdi), %rdi
 	jnz	L(StrncpyFillTailWithZero)
@@ -726,7 +735,7 @@
 # ifdef USE_AS_STPCPY
 	lea	13(%rdi), %rax
 # endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 	sub	$14, %r8
 	lea	14(%rdi), %rdi
 	jnz	L(StrncpyFillTailWithZero)
@@ -742,7 +751,7 @@
 # ifdef USE_AS_STPCPY
 	lea	14(%rdi), %rax
 # endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 	sub	$15, %r8
 	lea	15(%rdi), %rdi
 	jnz	L(StrncpyFillTailWithZero)
@@ -756,7 +765,7 @@
 # ifdef USE_AS_STPCPY
 	lea	15(%rdi), %rax
 # endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 	sub	$16, %r8
 	lea	16(%rdi), %rdi
 	jnz	L(StrncpyFillTailWithZero)
@@ -771,7 +780,7 @@
 # ifdef USE_AS_STPCPY
 	lea	16(%rdi), %rax
 # endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 	sub	$17, %r8
 	lea	17(%rdi), %rdi
 	jnz	L(StrncpyFillTailWithZero)
@@ -787,7 +796,7 @@
 # ifdef USE_AS_STPCPY
 	lea	17(%rdi), %rax
 # endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 	sub	$18, %r8
 	lea	18(%rdi), %rdi
 	jnz	L(StrncpyFillTailWithZero)
@@ -803,7 +812,7 @@
 # ifdef USE_AS_STPCPY
 	lea	18(%rdi), %rax
 # endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 	sub	$19, %r8
 	lea	19(%rdi), %rdi
 	jnz	L(StrncpyFillTailWithZero)
@@ -819,7 +828,7 @@
 # ifdef USE_AS_STPCPY
 	lea	19(%rdi), %rax
 # endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 	sub	$20, %r8
 	lea	20(%rdi), %rdi
 	jnz	L(StrncpyFillTailWithZero)
@@ -836,7 +845,7 @@
 # ifdef USE_AS_STPCPY
 	lea	20(%rdi), %rax
 # endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 	sub	$21, %r8
 	lea	21(%rdi), %rdi
 	jnz	L(StrncpyFillTailWithZero)
@@ -852,7 +861,7 @@
 # ifdef USE_AS_STPCPY
 	lea	21(%rdi), %rax
 # endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 	sub	$22, %r8
 	lea	22(%rdi), %rdi
 	jnz	L(StrncpyFillTailWithZero)
@@ -868,7 +877,7 @@
 # ifdef USE_AS_STPCPY
 	lea	22(%rdi), %rax
 # endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 	sub	$23, %r8
 	lea	23(%rdi), %rdi
 	jnz	L(StrncpyFillTailWithZero)
@@ -884,7 +893,7 @@
 # ifdef USE_AS_STPCPY
 	lea	23(%rdi), %rax
 # endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 	sub	$24, %r8
 	lea	24(%rdi), %rdi
 	jnz	L(StrncpyFillTailWithZero)
@@ -901,7 +910,7 @@
 # ifdef USE_AS_STPCPY
 	lea	24(%rdi), %rax
 # endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 	sub	$25, %r8
 	lea	25(%rdi), %rdi
 	jnz	L(StrncpyFillTailWithZero)
@@ -919,7 +928,7 @@
 # ifdef USE_AS_STPCPY
 	lea	25(%rdi), %rax
 # endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 	sub	$26, %r8
 	lea	26(%rdi), %rdi
 	jnz	L(StrncpyFillTailWithZero)
@@ -937,7 +946,7 @@
 # ifdef USE_AS_STPCPY
 	lea	26(%rdi), %rax
 # endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 	sub	$27, %r8
 	lea	27(%rdi), %rdi
 	jnz	L(StrncpyFillTailWithZero)
@@ -955,7 +964,7 @@
 # ifdef USE_AS_STPCPY
 	lea	27(%rdi), %rax
 # endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 	sub	$28, %r8
 	lea	28(%rdi), %rdi
 	jnz	L(StrncpyFillTailWithZero)
@@ -971,7 +980,7 @@
 # ifdef USE_AS_STPCPY
 	lea	28(%rdi), %rax
 # endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 	sub	$29, %r8
 	lea	29(%rdi), %rdi
 	jnz	L(StrncpyFillTailWithZero)
@@ -987,7 +996,7 @@
 # ifdef USE_AS_STPCPY
 	lea	29(%rdi), %rax
 # endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 	sub	$30, %r8
 	lea	30(%rdi), %rdi
 	jnz	L(StrncpyFillTailWithZero)
@@ -1003,7 +1012,7 @@
 # ifdef USE_AS_STPCPY
 	lea	30(%rdi), %rax
 # endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 	sub	$31, %r8
 	lea	31(%rdi), %rdi
 	jnz	L(StrncpyFillTailWithZero)
@@ -1019,7 +1028,7 @@
 # ifdef USE_AS_STPCPY
 	lea	31(%rdi), %rax
 # endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 	sub	$32, %r8
 	lea	32(%rdi), %rdi
 	jnz	L(StrncpyFillTailWithZero)
@@ -1030,27 +1039,39 @@
 
 	.p2align 4
 L(StrncpyExit0):
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	mov	%rdi, %rax
-# endif
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, (%rdi)
+#  endif
 	ret
 
 	.p2align 4
 L(StrncpyExit1):
 	mov	(%rsi), %dl
 	mov	%dl, (%rdi)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	1(%rdi), %rax
-# endif
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 1(%rdi)
+#  endif
 	ret
 
 	.p2align 4
 L(StrncpyExit2):
 	mov	(%rsi), %dx
 	mov	%dx, (%rdi)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	2(%rdi), %rax
-# endif
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 2(%rdi)
+#  endif
 	ret
 
 	.p2align 4
@@ -1059,18 +1080,26 @@
 	mov	2(%rsi), %dl
 	mov	%cx, (%rdi)
 	mov	%dl, 2(%rdi)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	3(%rdi), %rax
-# endif
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 3(%rdi)
+#  endif
 	ret
 
 	.p2align 4
 L(StrncpyExit4):
 	mov	(%rsi), %edx
 	mov	%edx, (%rdi)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	4(%rdi), %rax
-# endif
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 4(%rdi)
+#  endif
 	ret
 
 	.p2align 4
@@ -1079,9 +1108,13 @@
 	mov	4(%rsi), %dl
 	mov	%ecx, (%rdi)
 	mov	%dl, 4(%rdi)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	5(%rdi), %rax
-# endif
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 5(%rdi)
+#  endif
 	ret
 
 	.p2align 4
@@ -1090,9 +1123,13 @@
 	mov	4(%rsi), %dx
 	mov	%ecx, (%rdi)
 	mov	%dx, 4(%rdi)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	6(%rdi), %rax
-# endif
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 6(%rdi)
+#  endif
 	ret
 
 	.p2align 4
@@ -1101,18 +1138,26 @@
 	mov	3(%rsi), %edx
 	mov	%ecx, (%rdi)
 	mov	%edx, 3(%rdi)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	7(%rdi), %rax
-# endif
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 7(%rdi)
+#  endif
 	ret
 
 	.p2align 4
 L(StrncpyExit8):
 	mov	(%rsi), %rdx
 	mov	%rdx, (%rdi)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	8(%rdi), %rax
-# endif
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 8(%rdi)
+#  endif
 	ret
 
 	.p2align 4
@@ -1121,9 +1166,13 @@
 	mov	8(%rsi), %dl
 	mov	%rcx, (%rdi)
 	mov	%dl, 8(%rdi)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	9(%rdi), %rax
-# endif
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 9(%rdi)
+#  endif
 	ret
 
 	.p2align 4
@@ -1132,9 +1181,13 @@
 	mov	8(%rsi), %dx
 	mov	%rcx, (%rdi)
 	mov	%dx, 8(%rdi)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	10(%rdi), %rax
-# endif
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 10(%rdi)
+#  endif
 	ret
 
 	.p2align 4
@@ -1143,9 +1196,13 @@
 	mov	7(%rsi), %edx
 	mov	%rcx, (%rdi)
 	mov	%edx, 7(%rdi)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	11(%rdi), %rax
-# endif
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 11(%rdi)
+#  endif
 	ret
 
 	.p2align 4
@@ -1154,9 +1211,13 @@
 	mov	8(%rsi), %edx
 	mov	%rcx, (%rdi)
 	mov	%edx, 8(%rdi)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	12(%rdi), %rax
-# endif
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 12(%rdi)
+#  endif
 	ret
 
 	.p2align 4
@@ -1165,9 +1226,13 @@
 	mov	5(%rsi), %rdx
 	mov	%rcx, (%rdi)
 	mov	%rdx, 5(%rdi)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	13(%rdi), %rax
-# endif
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 13(%rdi)
+#  endif
 	ret
 
 	.p2align 4
@@ -1176,9 +1241,13 @@
 	mov	6(%rsi), %rdx
 	mov	%rcx, (%rdi)
 	mov	%rdx, 6(%rdi)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	14(%rdi), %rax
-# endif
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 14(%rdi)
+#  endif
 	ret
 
 	.p2align 4
@@ -1187,18 +1256,26 @@
 	mov	7(%rsi), %rdx
 	mov	%rcx, (%rdi)
 	mov	%rdx, 7(%rdi)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	15(%rdi), %rax
-# endif
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 15(%rdi)
+#  endif
 	ret
 
 	.p2align 4
 L(StrncpyExit16):
 	movdqu	(%rsi), %xmm0
 	movdqu	%xmm0, (%rdi)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	16(%rdi), %rax
-# endif
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 16(%rdi)
+#  endif
 	ret
 
 	.p2align 4
@@ -1207,9 +1284,13 @@
 	mov	16(%rsi), %cl
 	movdqu	%xmm0, (%rdi)
 	mov	%cl, 16(%rdi)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	17(%rdi), %rax
-# endif
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 17(%rdi)
+#  endif
 	ret
 
 	.p2align 4
@@ -1218,9 +1299,13 @@
 	mov	16(%rsi), %cx
 	movdqu	%xmm0, (%rdi)
 	mov	%cx, 16(%rdi)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	18(%rdi), %rax
-# endif
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 18(%rdi)
+#  endif
 	ret
 
 	.p2align 4
@@ -1229,9 +1314,13 @@
 	mov	15(%rsi), %ecx
 	movdqu	%xmm0, (%rdi)
 	mov	%ecx, 15(%rdi)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	19(%rdi), %rax
-# endif
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 19(%rdi)
+#  endif
 	ret
 
 	.p2align 4
@@ -1240,9 +1329,13 @@
 	mov	16(%rsi), %ecx
 	movdqu	%xmm0, (%rdi)
 	mov	%ecx, 16(%rdi)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	20(%rdi), %rax
-# endif
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 20(%rdi)
+#  endif
 	ret
 
 	.p2align 4
@@ -1253,9 +1346,13 @@
 	movdqu	%xmm0, (%rdi)
 	mov	%ecx, 16(%rdi)
 	mov	%dl, 20(%rdi)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	21(%rdi), %rax
-# endif
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 21(%rdi)
+#  endif
 	ret
 
 	.p2align 4
@@ -1264,9 +1361,13 @@
 	mov	14(%rsi), %rcx
 	movdqu	%xmm0, (%rdi)
 	mov	%rcx, 14(%rdi)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	22(%rdi), %rax
-# endif
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 22(%rdi)
+#  endif
 	ret
 
 	.p2align 4
@@ -1275,9 +1376,13 @@
 	mov	15(%rsi), %rcx
 	movdqu	%xmm0, (%rdi)
 	mov	%rcx, 15(%rdi)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	23(%rdi), %rax
-# endif
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 23(%rdi)
+#  endif
 	ret
 
 	.p2align 4
@@ -1286,9 +1391,13 @@
 	mov	16(%rsi), %rcx
 	movdqu	%xmm0, (%rdi)
 	mov	%rcx, 16(%rdi)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	24(%rdi), %rax
-# endif
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 24(%rdi)
+#  endif
 	ret
 
 	.p2align 4
@@ -1299,9 +1408,13 @@
 	movdqu	%xmm0, (%rdi)
 	mov	%rdx, 16(%rdi)
 	mov	%cl, 24(%rdi)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	25(%rdi), %rax
-# endif
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 25(%rdi)
+#  endif
 	ret
 
 	.p2align 4
@@ -1312,9 +1425,13 @@
 	movdqu	%xmm0, (%rdi)
 	mov	%rdx, 16(%rdi)
 	mov	%cx, 24(%rdi)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	26(%rdi), %rax
-# endif
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 26(%rdi)
+#  endif
 	ret
 
 	.p2align 4
@@ -1325,9 +1442,13 @@
 	movdqu	%xmm0, (%rdi)
 	mov	%rdx, 16(%rdi)
 	mov	%ecx, 23(%rdi)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	27(%rdi), %rax
-# endif
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 27(%rdi)
+#  endif
 	ret
 
 	.p2align 4
@@ -1338,9 +1459,13 @@
 	movdqu	%xmm0, (%rdi)
 	mov	%rdx, 16(%rdi)
 	mov	%ecx, 24(%rdi)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	28(%rdi), %rax
-# endif
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 28(%rdi)
+#  endif
 	ret
 
 	.p2align 4
@@ -1349,9 +1474,13 @@
 	movdqu	13(%rsi), %xmm2
 	movdqu	%xmm0, (%rdi)
 	movdqu	%xmm2, 13(%rdi)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	29(%rdi), %rax
-# endif
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 29(%rdi)
+#  endif
 	ret
 
 	.p2align 4
@@ -1360,9 +1489,13 @@
 	movdqu	14(%rsi), %xmm2
 	movdqu	%xmm0, (%rdi)
 	movdqu	%xmm2, 14(%rdi)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	30(%rdi), %rax
-# endif
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 30(%rdi)
+#  endif
 	ret
 
 	.p2align 4
@@ -1371,9 +1504,13 @@
 	movdqu	15(%rsi), %xmm2
 	movdqu	%xmm0, (%rdi)
 	movdqu	%xmm2, 15(%rdi)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	31(%rdi), %rax
-# endif
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 31(%rdi)
+#  endif
 	ret
 
 	.p2align 4
@@ -1382,9 +1519,13 @@
 	movdqu	16(%rsi), %xmm2
 	movdqu	%xmm0, (%rdi)
 	movdqu	%xmm2, 16(%rdi)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	32(%rdi), %rax
-# endif
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 32(%rdi)
+#  endif
 	ret
 
 	.p2align 4
@@ -1395,7 +1536,13 @@
 	movdqu	%xmm0, (%rdi)
 	movdqu	%xmm2, 16(%rdi)
 	mov	%cl, 32(%rdi)
-	ret
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 33(%rdi)
+#  endif
+	ret
+
+#  ifndef USE_AS_STRCAT
 
 	.p2align 4
 L(Fill0):
@@ -1498,9 +1645,9 @@
 	bsf	%rdx, %rdx
 	add	$15, %r8
 	add	%rcx, %rdi
-# ifdef USE_AS_STPCPY
+#   ifdef USE_AS_STPCPY
 	lea	(%rdi, %rdx), %rax
-# endif
+#   endif
 	sub	%rdx, %r8
 	lea	1(%rdi, %rdx), %rdi
 
@@ -1552,6 +1699,9 @@
 L(StrncpyFillExit):
 	add	$16, %r8
 	BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %r8, 4)
+
+/* end of ifndef USE_AS_STRCAT */
+#  endif
 
 	.p2align 4
 L(UnalignedLeaveCase2OrCase3):
@@ -1572,9 +1722,13 @@
 	sub	$16, %r8
 	jb	L(CopyFrom1To16BytesCase3)
 	movdqu	%xmm7, 48(%rdi)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	64(%rdi), %rax
-# endif
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 64(%rdi)
+#  endif
 	ret
 
 	.p2align 4
@@ -1585,8 +1739,11 @@
 	add	$48, %r8
 	jle	L(CopyFrom1To16BytesCase2OrCase3)
 	test	%rdx, %rdx
+#  ifndef USE_AS_STRCAT
 	jnz	L(CopyFrom1To16BytesUnalignedXmm4)
-
+#  else
+	jnz	L(CopyFrom1To16Bytes)
+#  endif
 	pcmpeqb	%xmm5, %xmm0
 	pmovmskb %xmm0, %rdx
 	movdqu	%xmm4, (%rdi)
@@ -1594,7 +1751,11 @@
 	sub	$16, %r8
 	jbe	L(CopyFrom1To16BytesCase2OrCase3)
 	test	%rdx, %rdx
+#  ifndef USE_AS_STRCAT
 	jnz	L(CopyFrom1To16BytesUnalignedXmm5)
+#  else
+	jnz	L(CopyFrom1To16Bytes)
+#  endif
 
 	pcmpeqb	%xmm6, %xmm0
 	pmovmskb %xmm0, %rdx
@@ -1603,7 +1764,11 @@
 	sub	$16, %r8
 	jbe	L(CopyFrom1To16BytesCase2OrCase3)
 	test	%rdx, %rdx
+#  ifndef USE_AS_STRCAT
 	jnz	L(CopyFrom1To16BytesUnalignedXmm6)
+#  else
+	jnz	L(CopyFrom1To16Bytes)
+#  endif
 
 	pcmpeqb	%xmm7, %xmm0
 	pmovmskb %xmm0, %rdx
@@ -1617,13 +1782,18 @@
 
 	.p2align 4
 L(ExitZero):
+#  ifndef USE_AS_STRCAT
 	mov	%rdi, %rax
-	ret
-
-# endif
-
+#  endif
+	ret
+
+# endif
+
+# ifndef USE_AS_STRCAT
 END (STRCPY)
-
+# else
+END (STRCAT)
+# endif
 	.p2align 4
 	.section .rodata
 L(ExitTable):
@@ -1695,6 +1865,7 @@
 	.int	JMPTBL(L(StrncpyExit31), L(ExitStrncpyTable))
 	.int	JMPTBL(L(StrncpyExit32), L(ExitStrncpyTable))
 	.int	JMPTBL(L(StrncpyExit33), L(ExitStrncpyTable))
+#  ifndef USE_AS_STRCAT
 	.p2align 4
 L(FillTable):
 	.int	JMPTBL(L(Fill0), L(FillTable))
@@ -1714,5 +1885,6 @@
 	.int	JMPTBL(L(Fill14), L(FillTable))
 	.int	JMPTBL(L(Fill15), L(FillTable))
 	.int	JMPTBL(L(Fill16), L(FillTable))
+#  endif
 # endif
 #endif

Modified: fsf/trunk/libc/sysdeps/x86_64/multiarch/strcpy-ssse3.S
==============================================================================
--- fsf/trunk/libc/sysdeps/x86_64/multiarch/strcpy-ssse3.S (original)
+++ fsf/trunk/libc/sysdeps/x86_64/multiarch/strcpy-ssse3.S Wed Jul 20 00:02:35 2011
@@ -20,25 +20,26 @@
 
 #ifndef NOT_IN_libc
 
-# include <sysdep.h>
-
-# ifndef STRCPY
-#  define STRCPY  __strcpy_ssse3
-# endif
+# ifndef USE_AS_STRCAT
+#  include <sysdep.h>
+
+#  ifndef STRCPY
+#   define STRCPY  __strcpy_ssse3
+#  endif
 
 	.section .text.ssse3,"ax",@progbits
 ENTRY (STRCPY)
 	mov	%rsi, %rcx
-# ifdef USE_AS_STRNCPY
+#  ifdef USE_AS_STRNCPY
 	mov	%rdx, %r8
-# endif
+#  endif
 	mov	%rdi, %rdx
-# ifdef USE_AS_STRNCPY
+#  ifdef USE_AS_STRNCPY
 	test	%r8, %r8
 	jz	L(Exit0)
 	cmp	$8, %r8
 	jbe	L(StrncpyExit8Bytes)
-# endif
+#  endif
 	cmpb	$0, (%rcx)
 	jz	L(Exit1)
 	cmpb	$0, 1(%rcx)
@@ -55,10 +56,10 @@
 	jz	L(Exit7)
 	cmpb	$0, 7(%rcx)
 	jz	L(Exit8)
-# ifdef USE_AS_STRNCPY
+#  ifdef USE_AS_STRNCPY
 	cmp	$16, %r8
 	jb	L(StrncpyExit15Bytes)
-# endif
+#  endif
 	cmpb	$0, 8(%rcx)
 	jz	L(Exit9)
 	cmpb	$0, 9(%rcx)
@@ -73,12 +74,13 @@
 	jz	L(Exit14)
 	cmpb	$0, 14(%rcx)
 	jz	L(Exit15)
-# ifdef USE_AS_STRNCPY
+#  ifdef USE_AS_STRNCPY
 	cmp	$16, %r8
 	je	L(Exit16)
-# endif
+#  endif
 	cmpb	$0, 15(%rcx)
 	jz	L(Exit16)
+# endif
 
 # ifdef USE_AS_STRNCPY
 	mov	%rcx, %rsi
@@ -2180,12 +2182,12 @@
 	jmp	L(CopyFrom1To16Bytes)
 # endif
 
-
+# ifndef USE_AS_STRCAT
 	.p2align 4
 L(CopyFrom1To16Bytes):
-# ifdef USE_AS_STRNCPY
+#  ifdef USE_AS_STRNCPY
 	add	$16, %r8
-# endif
+#  endif
 	add	%rsi, %rdx
 	add	%rsi, %rcx
 
@@ -2210,20 +2212,20 @@
 L(Exit8):
 	mov	(%rcx), %rax
 	mov	%rax, (%rdx)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	7(%rdx), %rax
-# else
+#  else
 	mov	%rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
+#  endif
+#  ifdef USE_AS_STRNCPY
 	sub	$8, %r8
 	lea	8(%rdx), %rcx
 	jnz	L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
+#   ifdef USE_AS_STPCPY
 	cmpb	$1, (%rax)
 	sbb	$-1, %rax
-# endif
-# endif
+#   endif
+#  endif
 	ret
 
 	.p2align 4
@@ -2249,23 +2251,23 @@
 	mov	%rax, (%rdx)
 	mov	8(%rcx), %rax
 	mov	%rax, 8(%rdx)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	15(%rdx), %rax
-# else
+#  else
 	mov	%rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
+#  endif
+#  ifdef USE_AS_STRNCPY
 	sub	$16, %r8
 	lea	16(%rdx), %rcx
 	jnz	L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
+#   ifdef USE_AS_STPCPY
 	cmpb	$1, (%rax)
 	sbb	$-1, %rax
-# endif
-# endif
+#   endif
+#  endif
 	ret
 
-# ifdef USE_AS_STRNCPY
+#  ifdef USE_AS_STRNCPY
 
 	.p2align 4
 L(CopyFrom1To16BytesCase2):
@@ -2381,46 +2383,46 @@
 	jl	L(Exit9)
 	je	L(Exit10)
 	jg	L(Exit11)
-# endif
+#  endif
 
 	.p2align 4
 L(Exit1):
 	movb	(%rcx), %al
 	movb	%al, (%rdx)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	(%rdx), %rax
-# else
+#  else
 	mov	%rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
+#  endif
+#  ifdef USE_AS_STRNCPY
 	sub	$1, %r8
 	lea	1(%rdx), %rcx
 	jnz	L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
+#   ifdef USE_AS_STPCPY
 	cmpb	$1, (%rax)
 	sbb	$-1, %rax
-# endif
-# endif
+#   endif
+#  endif
 	ret
 
 	.p2align 4
 L(Exit2):
 	movw	(%rcx), %ax
 	movw	%ax, (%rdx)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	1(%rdx), %rax
-# else
+#  else
 	mov	%rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
+#  endif
+#  ifdef USE_AS_STRNCPY
 	sub	$2, %r8
 	lea	2(%rdx), %rcx
 	jnz	L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
+#   ifdef USE_AS_STPCPY
 	cmpb	$1, (%rax)
 	sbb	$-1, %rax
-# endif
-# endif
+#   endif
+#  endif
 	ret
 
 	.p2align 4
@@ -2429,40 +2431,40 @@
 	movw	%ax, (%rdx)
 	movb	2(%rcx), %al
 	movb	%al, 2(%rdx)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	2(%rdx), %rax
-# else
+#  else
 	mov	%rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
+#  endif
+#  ifdef USE_AS_STRNCPY
 	sub	$3, %r8
 	lea	3(%rdx), %rcx
 	jnz	L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
+#   ifdef USE_AS_STPCPY
 	cmpb	$1, (%rax)
 	sbb	$-1, %rax
-# endif
-# endif
+#   endif
+#  endif
 	ret
 
 	.p2align 4
 L(Exit4):
 	movl	(%rcx), %eax
 	movl	%eax, (%rdx)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	3(%rdx), %rax
-# else
+#  else
 	mov	%rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
+#  endif
+#  ifdef USE_AS_STRNCPY
 	sub	$4, %r8
 	lea	4(%rdx), %rcx
 	jnz	L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
+#   ifdef USE_AS_STPCPY
 	cmpb	$1, (%rax)
 	sbb	$-1, %rax
-# endif
-# endif
+#  endif
+#  endif
 	ret
 
 	.p2align 4
@@ -2471,20 +2473,20 @@
 	movl	%eax, (%rdx)
 	movb	4(%rcx), %al
 	movb	%al, 4(%rdx)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	4(%rdx), %rax
-# else
+#  else
 	mov	%rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
+#  endif
+#  ifdef USE_AS_STRNCPY
 	sub	$5, %r8
 	lea	5(%rdx), %rcx
 	jnz	L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
+#   ifdef USE_AS_STPCPY
 	cmpb	$1, (%rax)
 	sbb	$-1, %rax
-# endif
-# endif
+#   endif
+#  endif
 	ret
 
 	.p2align 4
@@ -2493,20 +2495,20 @@
 	movl	%eax, (%rdx)
 	movw	4(%rcx), %ax
 	movw	%ax, 4(%rdx)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	5(%rdx), %rax
-# else
+#  else
 	mov	%rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
+#  endif
+#  ifdef USE_AS_STRNCPY
 	sub	$6, %r8
 	lea	6(%rdx), %rcx
 	jnz	L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
+#   ifdef USE_AS_STPCPY
 	cmpb	$1, (%rax)
 	sbb	$-1, %rax
-# endif
-# endif
+#   endif
+#  endif
 	ret
 
 	.p2align 4
@@ -2515,20 +2517,20 @@
 	movl	%eax, (%rdx)
 	movl	3(%rcx), %eax
 	movl	%eax, 3(%rdx)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	6(%rdx), %rax
-# else
+#  else
 	mov	%rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
+#  endif
+#  ifdef USE_AS_STRNCPY
 	sub	$7, %r8
 	lea	7(%rdx), %rcx
 	jnz	L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
+#   ifdef USE_AS_STPCPY
 	cmpb	$1, (%rax)
 	sbb	$-1, %rax
-# endif
-# endif
+#   endif
+#  endif
 	ret
 
 	.p2align 4
@@ -2537,20 +2539,20 @@
 	mov	%rax, (%rdx)
 	mov	5(%rcx), %eax
 	mov	%eax, 5(%rdx)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	8(%rdx), %rax
-# else
+#  else
 	mov	%rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
+#  endif
+#  ifdef USE_AS_STRNCPY
 	sub	$9, %r8
 	lea	9(%rdx), %rcx
 	jnz	L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
+#   ifdef USE_AS_STPCPY
 	cmpb	$1, (%rax)
 	sbb	$-1, %rax
-# endif
-# endif
+#   endif
+#  endif
 	ret
 
 	.p2align 4
@@ -2559,20 +2561,20 @@
 	mov	%rax, (%rdx)
 	mov	6(%rcx), %eax
 	mov	%eax, 6(%rdx)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	9(%rdx), %rax
-# else
+#  else
 	mov	%rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
+#  endif
+#  ifdef USE_AS_STRNCPY
 	sub	$10, %r8
 	lea	10(%rdx), %rcx
 	jnz	L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
+#   ifdef USE_AS_STPCPY
 	cmpb	$1, (%rax)
 	sbb	$-1, %rax
-# endif
-# endif
+#   endif
+#  endif
 	ret
 
 	.p2align 4
@@ -2581,20 +2583,20 @@
 	mov	%rax, (%rdx)
 	mov	7(%rcx), %eax
 	mov	%eax, 7(%rdx)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	10(%rdx), %rax
-# else
+#  else
 	mov	%rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
+#  endif
+#  ifdef USE_AS_STRNCPY
 	sub	$11, %r8
 	lea	11(%rdx), %rcx
 	jnz	L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
+#   ifdef USE_AS_STPCPY
 	cmpb	$1, (%rax)
 	sbb	$-1, %rax
-# endif
-# endif
+#   endif
+#  endif
 	ret
 
 	.p2align 4
@@ -2603,20 +2605,20 @@
 	mov	%rax, (%rdx)
 	mov	8(%rcx), %eax
 	mov	%eax, 8(%rdx)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	11(%rdx), %rax
-# else
+#  else
 	mov	%rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
+#  endif
+#  ifdef USE_AS_STRNCPY
 	sub	$12, %r8
 	lea	12(%rdx), %rcx
 	jnz	L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
+#   ifdef USE_AS_STPCPY
 	cmpb	$1, (%rax)
 	sbb	$-1, %rax
-# endif
-# endif
+#   endif
+#  endif
 	ret
 
 	.p2align 4
@@ -2625,20 +2627,20 @@
 	mov	%rax, (%rdx)
 	mov	5(%rcx), %rax
 	mov	%rax, 5(%rdx)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	12(%rdx), %rax
-# else
+#  else
 	mov	%rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
+#  endif
+#  ifdef USE_AS_STRNCPY
 	sub	$13, %r8
 	lea	13(%rdx), %rcx
 	jnz	L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
+#   ifdef USE_AS_STPCPY
 	cmpb	$1, (%rax)
 	sbb	$-1, %rax
-# endif
-# endif
+#   endif
+#  endif
 	ret
 
 	.p2align 4
@@ -2647,20 +2649,20 @@
 	mov	%rax, (%rdx)
 	mov	6(%rcx), %rax
 	mov	%rax, 6(%rdx)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	13(%rdx), %rax
-# else
+#  else
 	mov	%rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
+#  endif
+#  ifdef USE_AS_STRNCPY
 	sub	$14, %r8
 	lea	14(%rdx), %rcx
 	jnz	L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
+#   ifdef USE_AS_STPCPY
 	cmpb	$1, (%rax)
 	sbb	$-1, %rax
-# endif
-# endif
+#   endif
+#  endif
 	ret
 
 	.p2align 4
@@ -2669,23 +2671,23 @@
 	mov	%rax, (%rdx)
 	mov	7(%rcx), %rax
 	mov	%rax, 7(%rdx)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	14(%rdx), %rax
-# else
+#  else
 	mov	%rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
+#  endif
+#  ifdef USE_AS_STRNCPY
 	sub	$15, %r8
 	lea	15(%rdx), %rcx
 	jnz	L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
+#   ifdef USE_AS_STPCPY
 	cmpb	$1, (%rax)
 	sbb	$-1, %rax
-# endif
-# endif
+#   endif
+#  endif
 	ret
 
-# ifdef USE_AS_STRNCPY
+#  ifdef USE_AS_STRNCPY
 	.p2align 4
 L(Fill0):
 	ret
@@ -2902,13 +2904,13 @@
 	mov	%rax, (%rdx)
 	mov	7(%rcx), %rax
 	mov	%rax, 7(%rdx)
-# ifdef USE_AS_STPCPY
+#   ifdef USE_AS_STPCPY
 	lea	14(%rdx), %rax
 	cmpb	$1, (%rax)
 	sbb	$-1, %rax
-# else
+#   else
 	mov	%rdi, %rax
-# endif
+#   endif
 	ret
 
 	.p2align 4
@@ -2943,14 +2945,16 @@
 	jz	L(Exit7)
 	mov	(%rcx), %rax
 	mov	%rax, (%rdx)
-# ifdef USE_AS_STPCPY
+#   ifdef USE_AS_STPCPY
 	lea	7(%rdx), %rax
 	cmpb	$1, (%rax)
 	sbb	$-1, %rax
-# else
+#   else
 	mov	%rdi, %rax
-# endif
+#   endif
 	ret
+
+#  endif
 
 # endif
 
@@ -3715,7 +3719,7 @@
 	lea	1(%rsi), %rsi
 	jmp	L(CopyFrom1To16BytesCase3)
 # endif
-
+# ifndef USE_AS_STRCAT
 END (STRCPY)
-
+# endif
 #endif

Modified: fsf/trunk/libc/sysdeps/x86_64/multiarch/strlen-no-bsf.S
==============================================================================
--- fsf/trunk/libc/sysdeps/x86_64/multiarch/strlen-no-bsf.S (original)
+++ fsf/trunk/libc/sysdeps/x86_64/multiarch/strlen-no-bsf.S Wed Jul 20 00:02:35 2011
@@ -1,5 +1,5 @@
-/* strlen without BSF
-   Copyright (C) 2010 Free Software Foundation, Inc.
+/* strlen SSE2 without bsf
+   Copyright (C) 2010, 2011 Free Software Foundation, Inc.
    Contributed by Intel Corporation.
    This file is part of the GNU C Library.
 
@@ -18,12 +18,17 @@
    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
    02111-1307 USA.  */
 
-#if defined SHARED && !defined NOT_IN_libc
-
-#include <sysdep.h>
-
-	.section .text.slow,"ax",@progbits
+#if (defined SHARED || defined USE_AS_STRCAT) && !defined NOT_IN_libc
+
+# ifndef USE_AS_STRCAT
+
+#  include <sysdep.h>
+
+#  define RETURN ret
+
+	.section .text.sse2,"ax",@progbits
 ENTRY (__strlen_no_bsf)
+# endif
 	xor	%eax, %eax
 	cmpb	$0, (%rdi)
 	jz	L(exit_tail0)
@@ -165,39 +170,37 @@
 	jnz	L(exit)
 
 	and	$-0x40, %rax
-	xor	%r8d, %r8d
 L(aligned_64):
 	pcmpeqb	(%rax), %xmm0
 	pcmpeqb	16(%rax), %xmm1
 	pcmpeqb	32(%rax), %xmm2
 	pcmpeqb	48(%rax), %xmm3
 	pmovmskb %xmm0, %edx
-	pmovmskb %xmm1, %esi
-	pmovmskb %xmm2, %edi
+	pmovmskb %xmm1, %r11d
+	pmovmskb %xmm2, %r10d
 	pmovmskb %xmm3, %r9d
-	or	%edx, %r8d
-	or	%esi, %r8d
-	or	%edi, %r8d
-	or	%r9d, %r8d
+	or	%edx, %r9d
+	or	%r11d, %r9d
+	or	%r10d, %r9d
 	lea	64(%rax), %rax
 	jz	L(aligned_64)
 
 	test	%edx, %edx
 	jnz	L(aligned_64_exit_16)
-	test	%esi, %esi
+	test	%r11d, %r11d
 	jnz	L(aligned_64_exit_32)
-	test	%edi, %edi
+	test	%r10d, %r10d
 	jnz	L(aligned_64_exit_48)
 L(aligned_64_exit_64):
-	mov	%r9d, %edx
+	pmovmskb %xmm3, %edx
 	jmp	L(aligned_64_exit)
 L(aligned_64_exit_48):
 	lea	-16(%rax), %rax
-	mov	%edi, %edx
+	mov	%r10d, %edx
 	jmp	L(aligned_64_exit)
 L(aligned_64_exit_32):
 	lea	-32(%rax), %rax
-	mov	%esi, %edx
+	mov	%r11d, %edx
 	jmp	L(aligned_64_exit)
 L(aligned_64_exit_16):
 	lea	-48(%rax), %rax
@@ -228,7 +231,7 @@
 	jnz	L(exit_tail6)
 	add	$7, %eax
 L(exit_tail0):
-	ret
+	RETURN
 
 L(exit_high):
 	add	$8, %eax
@@ -253,57 +256,58 @@
 	test	$0x40, %dh
 	jnz	L(exit_tail6)
 	add	$7, %eax
-	ret
+	RETURN
 	.p2align 4
 L(exit_tail1):
 	add	$1, %eax
-	ret
+	RETURN
 
 L(exit_tail2):
 	add	$2, %eax
-	ret
+	RETURN
 
 L(exit_tail3):
 	add	$3, %eax
-	ret
+	RETURN
 
 L(exit_tail4):
 	add	$4, %eax
-	ret
+	RETURN
 
 L(exit_tail5):
 	add	$5, %eax
-	ret
+	RETURN
 L(exit_tail6):
 	add	$6, %eax
-	ret
+	RETURN
 L(exit_tail7):
 	add	$7, %eax
-	ret
+	RETURN
 L(exit_tail8):
 	add	$8, %eax
-	ret
+	RETURN
 L(exit_tail9):
 	add	$9, %eax
-	ret
+	RETURN
 L(exit_tail10):
 	add	$10, %eax
-	ret
+	RETURN
 L(exit_tail11):
 	add	$11, %eax
-	ret
+	RETURN
 L(exit_tail12):
 	add	$12, %eax
-	ret
+	RETURN
 L(exit_tail13):
 	add	$13, %eax
-	ret
+	RETURN
 L(exit_tail14):
 	add	$14, %eax
-	ret
+	RETURN
 L(exit_tail15):
 	add	$15, %eax
-	ret
+# ifndef USE_AS_STRCAT
+	RETURN
 END (__strlen_no_bsf)
-
+# endif
 #endif

Added: fsf/trunk/libc/sysdeps/x86_64/multiarch/strlen-sse2-pminub.S
==============================================================================
--- fsf/trunk/libc/sysdeps/x86_64/multiarch/strlen-sse2-pminub.S (added)
+++ fsf/trunk/libc/sysdeps/x86_64/multiarch/strlen-sse2-pminub.S Wed Jul 20 00:02:35 2011
@@ -1,0 +1,260 @@
+/* strlen SSE2
+   Copyright (C) 2011 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#if !defined NOT_IN_libc && (defined SHARED || defined USE_AS_STRCAT)
+/* strlen with SSE2.  In: %rdi = string.  Out: %rax = length.  Uses %rcx, %rdx, %r10, %xmm0-%xmm5.  */
+# ifndef USE_AS_STRCAT
+/* Standalone build.  With USE_AS_STRCAT only the bare body below is emitted.  */
+#  include <sysdep.h>
+/* NOTE(review): with USE_AS_STRCAT nothing here defines RETURN; presumably the including file does.  */
+#  define RETURN ret
+
+	.section .text.sse2,"ax",@progbits
+ENTRY (__strlen_sse2_pminub)
+
+# endif
+	xor	%rax, %rax	/* rax = 0, so L(exit_less16) returns the bit index unchanged.  */
+	mov	%edi, %ecx
+	and	$0x3f, %ecx	/* ecx = offset of the string inside its 64-byte block.  */
+	pxor	%xmm0, %xmm0	/* xmm0 = 0, the byte value being searched for.  */
+	cmp	$0x30, %ecx
+	ja	L(next)	/* Offset > 48: a 16-byte load from (%rdi) would cross a 64-byte boundary.  */
+	movdqu	(%rdi), %xmm1	/* Unaligned load of the first 16 bytes.  */
+	pcmpeqb	%xmm1, %xmm0	/* 0xff in every byte position that holds NUL.  */
+	pmovmskb %xmm0, %edx	/* One mask bit per byte.  */
+	test	%edx, %edx
+	jnz	L(exit_less16)	/* NUL within the first 16 bytes.  */
+	mov	%rdi, %rax
+	and	$-16, %rax	/* rax = start rounded down to 16; scanning resumes at 16(%rax).  */
+	jmp	L(align16_start)
+L(next):
+	mov	%rdi, %rax
+	and	$-16, %rax	/* rax = aligned 16-byte block containing the start of the string.  */
+	pcmpeqb	(%rax), %xmm0
+	mov	$-1, %r10d
+	sub	%rax, %rcx	/* Only cl matters: modulo 32 it equals rdi & 15.  */
+	shl	%cl, %r10d	/* Clear the mask bits of the bytes that precede the string.  */
+	pmovmskb %xmm0, %edx
+	and	%r10d, %edx	/* Ignore NUL bytes located before the string start.  */
+	jnz	L(exit)
+L(align16_start):
+	pxor	%xmm0, %xmm0	/* Reset the four compare registers to zero.  */
+	pxor	%xmm1, %xmm1
+	pxor	%xmm2, %xmm2
+	pxor	%xmm3, %xmm3
+	pcmpeqb	16(%rax), %xmm0	/* Aligned compare against memory, one 16-byte block at a time.  */
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	L(exit16)
+
+	pcmpeqb	32(%rax), %xmm1
+	pmovmskb %xmm1, %edx
+	test	%edx, %edx
+	jnz	L(exit32)
+
+	pcmpeqb	48(%rax), %xmm2
+	pmovmskb %xmm2, %edx
+	test	%edx, %edx
+	jnz	L(exit48)
+
+	pcmpeqb	64(%rax), %xmm3
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	jnz	L(exit64)
+/* A compare that found no NUL leaves its register all-zero, so xmm0-xmm3 are reused as they are.  */
+	pcmpeqb	80(%rax), %xmm0
+	add	$64, %rax	/* Advance 64 bytes; the block just compared is now 16(%rax), as L(exit16) expects.  */
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	L(exit16)
+
+	pcmpeqb	32(%rax), %xmm1
+	pmovmskb %xmm1, %edx
+	test	%edx, %edx
+	jnz	L(exit32)
+
+	pcmpeqb	48(%rax), %xmm2
+	pmovmskb %xmm2, %edx
+	test	%edx, %edx
+	jnz	L(exit48)
+
+	pcmpeqb	64(%rax), %xmm3
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	jnz	L(exit64)
+
+	pcmpeqb	80(%rax), %xmm0
+	add	$64, %rax
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	L(exit16)
+
+	pcmpeqb	32(%rax), %xmm1
+	pmovmskb %xmm1, %edx
+	test	%edx, %edx
+	jnz	L(exit32)
+
+	pcmpeqb	48(%rax), %xmm2
+	pmovmskb %xmm2, %edx
+	test	%edx, %edx
+	jnz	L(exit48)
+
+	pcmpeqb	64(%rax), %xmm3
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	jnz	L(exit64)
+
+	pcmpeqb	80(%rax), %xmm0
+	add	$64, %rax
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	L(exit16)
+
+	pcmpeqb	32(%rax), %xmm1
+	pmovmskb %xmm1, %edx
+	test	%edx, %edx
+	jnz	L(exit32)
+
+	pcmpeqb	48(%rax), %xmm2
+	pmovmskb %xmm2, %edx
+	test	%edx, %edx
+	jnz	L(exit48)
+
+	pcmpeqb	64(%rax), %xmm3
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	jnz	L(exit64)
+
+/* Sixteen blocks checked without a NUL: step 16 bytes at a time until rax is 64-byte aligned.  */
+	test	$0x3f, %rax
+	jz	L(align64_loop)
+
+	pcmpeqb	80(%rax), %xmm0
+	add	$80, %rax	/* rax now addresses the block just compared, as L(exit) expects.  */
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	L(exit)
+
+	test	$0x3f, %rax
+	jz	L(align64_loop)
+
+	pcmpeqb	16(%rax), %xmm1
+	add	$16, %rax
+	pmovmskb %xmm1, %edx
+	test	%edx, %edx
+	jnz	L(exit)
+
+	test	$0x3f, %rax
+	jz	L(align64_loop)
+
+	pcmpeqb	16(%rax), %xmm2
+	add	$16, %rax
+	pmovmskb %xmm2, %edx
+	test	%edx, %edx
+	jnz	L(exit)
+
+	test	$0x3f, %rax
+	jz	L(align64_loop)
+
+	pcmpeqb	16(%rax), %xmm3
+	add	$16, %rax
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	jnz	L(exit)
+
+	add	$16, %rax
+	.p2align 4
+	L(align64_loop):
+	movaps	(%rax),	%xmm4
+	pminub	16(%rax),	%xmm4	/* Byte-wise unsigned minimum of blocks 0 and 1.  */
+	movaps	32(%rax),	%xmm5
+	pminub	48(%rax),	%xmm5	/* Byte-wise unsigned minimum of blocks 2 and 3.  */
+	add	$64,	%rax
+	pminub	%xmm4,	%xmm5	/* xmm5 holds a zero byte iff one of the 64 bytes is NUL.  */
+	pcmpeqb	%xmm0,	%xmm5	/* xmm0 is still all-zero: its last compare matched nothing.  */
+	pmovmskb %xmm5,	%edx
+	test	%edx,	%edx
+	jz	L(align64_loop)
+
+/* A NUL lies in the 64 bytes just scanned; find which 16-byte block holds it.  */
+	pcmpeqb	-64(%rax), %xmm0	/* Block 0 of that chunk (rax was already advanced by 64).  */
+	sub	$80,	%rax	/* Rebase so the four blocks sit at 16/32/48/64(%rax) for L(exit16)..L(exit64).  */
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	L(exit16)
+
+	pcmpeqb	32(%rax), %xmm1
+	pmovmskb %xmm1, %edx
+	test	%edx, %edx
+	jnz	L(exit32)
+
+	pcmpeqb	48(%rax), %xmm2
+	pmovmskb %xmm2, %edx
+	test	%edx, %edx
+	jnz	L(exit48)
+
+	pcmpeqb	64(%rax), %xmm3
+	pmovmskb %xmm3, %edx	/* The first three blocks had no NUL, so it is in this one: no test.  */
+	sub	%rdi, %rax
+	bsf	%rdx, %rdx
+	add	%rdx, %rax
+	add	$64, %rax
+	RETURN
+
+	.p2align 4
+L(exit):
+	sub	%rdi, %rax	/* rax = offset of the compared block relative to the string start.  */
+L(exit_less16):
+	bsf	%rdx, %rdx	/* Index of the first NUL inside the block.  */
+	add	%rdx, %rax
+	RETURN
+	.p2align 4
+L(exit16):	/* length = (rax - rdi) + bsf (mask) + 16; L(exit32/48/64) add 32/48/64 instead.  */
+	sub	%rdi, %rax
+	bsf	%rdx, %rdx
+	add	%rdx, %rax
+	add	$16, %rax
+	RETURN
+	.p2align 4
+L(exit32):
+	sub	%rdi, %rax
+	bsf	%rdx, %rdx
+	add	%rdx, %rax
+	add	$32, %rax
+	RETURN
+	.p2align 4
+L(exit48):
+	sub	%rdi, %rax
+	bsf	%rdx, %rdx
+	add	%rdx, %rax
+	add	$48, %rax
+	RETURN
+	.p2align 4
+L(exit64):
+	sub	%rdi, %rax
+	bsf	%rdx, %rdx
+	add	%rdx, %rax
+	add	$64, %rax
+# ifndef USE_AS_STRCAT	/* With USE_AS_STRCAT no RETURN is emitted here: control falls through with the length in %rax.  */
+	RETURN
+
+END (__strlen_sse2_pminub)
+# endif
+#endif

Modified: fsf/trunk/libc/sysdeps/x86_64/multiarch/strlen.S
==============================================================================
--- fsf/trunk/libc/sysdeps/x86_64/multiarch/strlen.S (original)
+++ fsf/trunk/libc/sysdeps/x86_64/multiarch/strlen.S Wed Jul 20 00:02:35 2011
@@ -32,7 +32,10 @@
 	cmpl	$0, __cpu_features+KIND_OFFSET(%rip)
 	jne	1f
 	call	__init_cpu_features
-1:	leaq	__strlen_sse2(%rip), %rax
+1:	leaq	__strlen_sse2_pminub(%rip), %rax
+	testl	$bit_Prefer_PMINUB_for_stringop, __cpu_features+FEATURE_OFFSET+index_Prefer_PMINUB_for_stringop(%rip)
+	jnz	2f
+	leaq	__strlen_sse2(%rip), %rax
 	testl	$bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip)
 	jz	2f
 	leaq	__strlen_sse42(%rip), %rax

Added: fsf/trunk/libc/sysdeps/x86_64/multiarch/strncat-c.c
==============================================================================
--- fsf/trunk/libc/sysdeps/x86_64/multiarch/strncat-c.c (added)
+++ fsf/trunk/libc/sysdeps/x86_64/multiarch/strncat-c.c Wed Jul 20 00:02:35 2011
@@ -1,0 +1,8 @@
+#define STRNCAT __strncat_sse2	/* Presumably the function name that string/strncat.c defines; confirm there.  */
+#ifdef SHARED
+#undef libc_hidden_def
+#define libc_hidden_def(name) \
+  __hidden_ver1 (__strncat_sse2, __GI___strncat, __strncat_sse2);
+#endif
+/* In SHARED builds libc_hidden_def ignores its argument and always expands to the __hidden_ver1 line above.  */
+#include "string/strncat.c"

Added: fsf/trunk/libc/sysdeps/x86_64/multiarch/strncat-sse2-unaligned.S
==============================================================================
--- fsf/trunk/libc/sysdeps/x86_64/multiarch/strncat-sse2-unaligned.S (added)
+++ fsf/trunk/libc/sysdeps/x86_64/multiarch/strncat-sse2-unaligned.S Wed Jul 20 00:02:35 2011
@@ -1,0 +1,3 @@
+#define USE_AS_STRNCAT	/* Presumably selects the strncat paths of the included source; confirm there.  */
+#define STRCAT __strncat_sse2_unaligned	/* Presumably the entry symbol the included source defines.  */
+#include "strcat-sse2-unaligned.S"

Added: fsf/trunk/libc/sysdeps/x86_64/multiarch/strncat-ssse3.S
==============================================================================
--- fsf/trunk/libc/sysdeps/x86_64/multiarch/strncat-ssse3.S (added)
+++ fsf/trunk/libc/sysdeps/x86_64/multiarch/strncat-ssse3.S Wed Jul 20 00:02:35 2011
@@ -1,0 +1,3 @@
+#define USE_AS_STRNCAT	/* Presumably selects the strncat paths of the included source; confirm there.  */
+#define STRCAT __strncat_ssse3	/* Presumably the entry symbol the included source defines.  */
+#include "strcat-ssse3.S"

Added: fsf/trunk/libc/sysdeps/x86_64/multiarch/strncat.S
==============================================================================
--- fsf/trunk/libc/sysdeps/x86_64/multiarch/strncat.S (added)
+++ fsf/trunk/libc/sysdeps/x86_64/multiarch/strncat.S Wed Jul 20 00:02:35 2011
@@ -1,0 +1,3 @@
+#define STRCAT strncat	/* Presumably the public symbol name the included source defines; confirm there.  */
+#define USE_AS_STRNCAT	/* Presumably selects the strncat paths of the included source.  */
+#include "strcat.S"