[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[commits] r8634 - in /fsf/trunk/libc: ./ elf/ malloc/ string/ sysdeps/x86_64/ sysdeps/x86_64/multiarch/



Author: eglibc
Date: Fri Jul  3 00:08:28 2009
New Revision: 8634

Log:
Import glibc-mainline for 2009-07-03

Added:
    fsf/trunk/libc/elf/tst-audit3.c
    fsf/trunk/libc/elf/tst-auditmod3a.c
    fsf/trunk/libc/elf/tst-auditmod3b.c
    fsf/trunk/libc/sysdeps/x86_64/multiarch/stpcpy.S
    fsf/trunk/libc/sysdeps/x86_64/multiarch/stpncpy-c.c
    fsf/trunk/libc/sysdeps/x86_64/multiarch/stpncpy.S
    fsf/trunk/libc/sysdeps/x86_64/multiarch/strcpy.S
    fsf/trunk/libc/sysdeps/x86_64/multiarch/strncpy-c.c
    fsf/trunk/libc/sysdeps/x86_64/multiarch/strncpy.S
Modified:
    fsf/trunk/libc/ChangeLog
    fsf/trunk/libc/elf/Makefile
    fsf/trunk/libc/malloc/malloc.c
    fsf/trunk/libc/string/stpncpy.c
    fsf/trunk/libc/string/strncpy.c
    fsf/trunk/libc/sysdeps/x86_64/dl-trampoline.S
    fsf/trunk/libc/sysdeps/x86_64/multiarch/Makefile

Modified: fsf/trunk/libc/ChangeLog
==============================================================================
--- fsf/trunk/libc/ChangeLog (original)
+++ fsf/trunk/libc/ChangeLog Fri Jul  3 00:08:28 2009
@@ -1,3 +1,40 @@
+2009-06-30  H.J. Lu  <hongjiu.lu@xxxxxxxxx>
+
+	* elf/Makefile (distribute): Remove tst-audit.sh.  Add
+	tst-audit2.c, tst-audit3.c, tst-auditmod3a.c, tst-auditmod3b.c.
+	(tests): Add tst-audit3 for x86_64.
+	(modules-names): Add tst-auditmod3a, tst-auditmod3b.
+	($(objpfx)tst-audit3): Define.
+	($(objpfx)tst-audit3.out): Define.
+	(tst-audit3-ENV): Define.
+	* elf/tst-audit3.c: New file.
+	* elf/tst-auditmod3a.c: New file.
+	* elf/tst-auditmod3b.c: New file.
+	* sysdeps/x86_64/dl-trampoline.S (_dl_runtime_profile): Save
+	and restore xmm6.
+
+	* string/stpncpy.c (STPNCPY): New.  Defined if not defined.
+	(__stpncpy): Renamed to ...
+	(STPNCPY): This.
+	(stpncpy): Create alias only if STPNCPY is not defined.
+	* string/strncpy.c (STRNCPY): New.  Defined to strncpy if not
+	defined.
+	(strncpy): Renamed to ...
+	(STRNCPY): This.
+	* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
+	 stpncpy-c strncpy-c for string.
+	* sysdeps/x86_64/multiarch/stpcpy.S: New file.
+	* sysdeps/x86_64/multiarch/stpncpy-c.c: New file.
+	* sysdeps/x86_64/multiarch/stpncpy.S: New file.
+	* sysdeps/x86_64/multiarch/strcpy.S: New file.
+	* sysdeps/x86_64/multiarch/strncpy-c.c: New file.
+	* sysdeps/x86_64/multiarch/strncpy.S: New file.
+
+2009-07-02  Ulrich Drepper  <drepper@xxxxxxxxxx>
+
+	* malloc/malloc.c [ATOMIC_FASTBINS] (_int_free): Add full barrier when
+	adding to fast bin list.
+
 2009-07-01  Ulrich Drepper  <drepper@xxxxxxxxxx>
 
 	* nis/nss_nis/nis-network.c (_nss_nis_getnetbyaddr_r): Don't use

Modified: fsf/trunk/libc/elf/Makefile
==============================================================================
--- fsf/trunk/libc/elf/Makefile (original)
+++ fsf/trunk/libc/elf/Makefile Fri Jul  3 00:08:28 2009
@@ -89,7 +89,8 @@
 		   unload4mod1.c unload4mod2.c unload4mod3.c unload4mod4.c \
 		   unload6mod1.c unload6mod2.c unload6mod3.c \
 		   unload7mod1.c unload7mod2.c \
-		   tst-auditmod1.c tst-audit.sh \
+		   tst-audit1.c tst-audit2.c tst-audit3.c \
+		   tst-auditmod1.c tst-auditmod3a.c tst-auditmod3b.c \
 		   order2mod1.c order2mod2.c order2mod3.c order2mod4.c \
 		   tst-stackguard1.c tst-stackguard1-static.c \
 		   tst-array5.c tst-array5-static.c tst-array5dep.c \
@@ -193,6 +194,9 @@
 #	 reldep9
 test-srcs = tst-pathopt
 tests-execstack-yes = tst-execstack tst-execstack-needed tst-execstack-prog
+ifeq (x86_64,$(config-machine))
+tests += tst-audit3
+endif
 endif
 ifeq (yesyes,$(have-fpie)$(build-shared))
 tests: $(objpfx)tst-pie1.out
@@ -230,6 +234,7 @@
 		$(modules-execstack-$(have-z-execstack)) \
 		tst-dlopenrpathmod tst-deep1mod1 tst-deep1mod2 tst-deep1mod3 \
 		tst-dlmopen1mod tst-auditmod1 \
+		tst-auditmod3a tst-auditmod3b \
 		unload3mod1 unload3mod2 unload3mod3 unload3mod4 \
 		unload4mod1 unload4mod2 unload4mod3 unload4mod4 \
 		unload6mod1 unload6mod2 unload6mod3 \
@@ -959,6 +964,10 @@
 $(objpfx)tst-audit2.out: $(objpfx)tst-auditmod1.so
 tst-audit2-ENV = LD_AUDIT=$(objpfx)tst-auditmod1.so
 
+$(objpfx)tst-audit3: $(objpfx)tst-auditmod3a.so
+$(objpfx)tst-audit3.out: $(objpfx)tst-auditmod3b.so
+tst-audit3-ENV = LD_AUDIT=$(objpfx)tst-auditmod3b.so
+
 $(objpfx)tst-global1: $(libdl)
 $(objpfx)tst-global1.out: $(objpfx)testobj6.so $(objpfx)testobj2.so
 

Added: fsf/trunk/libc/elf/tst-audit3.c
==============================================================================
--- fsf/trunk/libc/elf/tst-audit3.c (added)
+++ fsf/trunk/libc/elf/tst-audit3.c Fri Jul  3 00:08:28 2009
@@ -1,0 +1,20 @@
+/* Test case for x86-64 preserved registers in dynamic linker.  */
+
+#include <stdlib.h>
+#include <string.h>
+
+#include <emmintrin.h>
+
+extern __m128i audit_test (__m128i, __m128i, __m128i, __m128i,
+			   __m128i, __m128i, __m128i, __m128i);
+int
+main (void)
+{
+  __m128i xmm = _mm_setzero_si128 ();
+  __m128i ret = audit_test (xmm, xmm, xmm, xmm, xmm, xmm, xmm, xmm);
+
+  if (memcmp (&xmm, &ret, sizeof (ret)))
+    abort ();
+
+  return 0;
+}

Added: fsf/trunk/libc/elf/tst-auditmod3a.c
==============================================================================
--- fsf/trunk/libc/elf/tst-auditmod3a.c (added)
+++ fsf/trunk/libc/elf/tst-auditmod3a.c Fri Jul  3 00:08:28 2009
@@ -1,0 +1,24 @@
+/* Test case for x86-64 preserved registers in dynamic linker.  */
+
+#include <stdlib.h>
+#include <string.h>
+#include <emmintrin.h>
+
+__m128i
+audit_test (__m128i x0, __m128i x1, __m128i x2, __m128i x3,
+	    __m128i x4, __m128i x5, __m128i x6, __m128i x7)
+{
+  __m128i xmm = _mm_setzero_si128 ();
+
+  if (memcmp (&xmm, &x0, sizeof (xmm))
+      || memcmp (&xmm, &x1, sizeof (xmm))
+      || memcmp (&xmm, &x2, sizeof (xmm))
+      || memcmp (&xmm, &x3, sizeof (xmm))
+      || memcmp (&xmm, &x4, sizeof (xmm))
+      || memcmp (&xmm, &x5, sizeof (xmm))
+      || memcmp (&xmm, &x6, sizeof (xmm))
+      || memcmp (&xmm, &x7, sizeof (xmm)))
+    abort ();
+
+  return xmm;
+}

Added: fsf/trunk/libc/elf/tst-auditmod3b.c
==============================================================================
--- fsf/trunk/libc/elf/tst-auditmod3b.c (added)
+++ fsf/trunk/libc/elf/tst-auditmod3b.c Fri Jul  3 00:08:28 2009
@@ -1,0 +1,156 @@
+/* Verify that changing xmm registers in audit library won't affect
+   function parameter passing/return.  */
+
+#include <dlfcn.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <bits/wordsize.h>
+#include <gnu/lib-names.h>
+#include <emmintrin.h>
+
+unsigned int
+la_version (unsigned int v)
+{
+  setlinebuf (stdout);
+
+  printf ("version: %u\n", v);
+
+  char buf[20];
+  sprintf (buf, "%u", v);
+
+  return v;
+}
+
+void
+la_activity (uintptr_t *cookie, unsigned int flag)
+{
+  if (flag == LA_ACT_CONSISTENT)
+    printf ("activity: consistent\n");
+  else if (flag == LA_ACT_ADD)
+    printf ("activity: add\n");
+  else if (flag == LA_ACT_DELETE)
+    printf ("activity: delete\n");
+  else
+    printf ("activity: unknown activity %u\n", flag);
+}
+
+char *
+la_objsearch (const char *name, uintptr_t *cookie, unsigned int flag)
+{
+  char buf[100];
+  const char *flagstr;
+  if (flag == LA_SER_ORIG)
+    flagstr = "LA_SET_ORIG";
+  else if (flag == LA_SER_LIBPATH)
+    flagstr = "LA_SER_LIBPATH";
+  else if (flag == LA_SER_RUNPATH)
+    flagstr = "LA_SER_RUNPATH";
+  else if (flag == LA_SER_CONFIG)
+    flagstr = "LA_SER_CONFIG";
+  else if (flag == LA_SER_DEFAULT)
+    flagstr = "LA_SER_DEFAULT";
+  else if (flag == LA_SER_SECURE)
+    flagstr = "LA_SER_SECURE";
+  else
+    {
+       sprintf (buf, "unknown flag %d", flag);
+       flagstr = buf;
+    }
+  printf ("objsearch: %s, %s\n", name, flagstr);
+
+  return (char *) name;
+}
+
+unsigned int
+la_objopen (struct link_map *l, Lmid_t lmid, uintptr_t *cookie)
+{
+  printf ("objopen: %ld, %s\n", lmid, l->l_name);
+
+  return 3;
+}
+
+void
+la_preinit (uintptr_t *cookie)
+{
+  printf ("preinit\n");
+}
+
+unsigned int
+la_objclose  (uintptr_t *cookie)
+{
+  printf ("objclose\n");
+  return 0;
+}
+
+uintptr_t
+la_symbind32 (Elf32_Sym *sym, unsigned int ndx, uintptr_t *refcook,
+	      uintptr_t *defcook, unsigned int *flags, const char *symname)
+{
+  printf ("symbind32: symname=%s, st_value=%#lx, ndx=%u, flags=%u\n",
+	  symname, (long int) sym->st_value, ndx, *flags);
+
+  return sym->st_value;
+}
+
+uintptr_t
+la_symbind64 (Elf64_Sym *sym, unsigned int ndx, uintptr_t *refcook,
+	      uintptr_t *defcook, unsigned int *flags, const char *symname)
+{
+  printf ("symbind64: symname=%s, st_value=%#lx, ndx=%u, flags=%u\n",
+	  symname, (long int) sym->st_value, ndx, *flags);
+
+  return sym->st_value;
+}
+
+#define pltenter la_x86_64_gnu_pltenter
+#define pltexit la_x86_64_gnu_pltexit
+#define La_regs La_x86_64_regs
+#define La_retval La_x86_64_retval
+#define int_retval lrv_rax
+
+#include <tst-audit.h>
+
+ElfW(Addr)
+pltenter (ElfW(Sym) *sym, unsigned int ndx, uintptr_t *refcook,
+	  uintptr_t *defcook, La_regs *regs, unsigned int *flags,
+	  const char *symname, long int *framesizep)
+{
+  printf ("pltenter: symname=%s, st_value=%#lx, ndx=%u, flags=%u\n",
+	  symname, (long int) sym->st_value, ndx, *flags);
+
+  __m128i xmm = _mm_set1_epi32 (-1);
+  asm volatile ("movdqa %0, %%xmm0" : : "x" (xmm) : "xmm0" );
+  asm volatile ("movdqa %0, %%xmm1" : : "x" (xmm) : "xmm1" );
+  asm volatile ("movdqa %0, %%xmm2" : : "x" (xmm) : "xmm2" );
+  asm volatile ("movdqa %0, %%xmm3" : : "x" (xmm) : "xmm3" );
+  asm volatile ("movdqa %0, %%xmm4" : : "x" (xmm) : "xmm4" );
+  asm volatile ("movdqa %0, %%xmm5" : : "x" (xmm) : "xmm5" );
+  asm volatile ("movdqa %0, %%xmm6" : : "x" (xmm) : "xmm6" );
+  asm volatile ("movdqa %0, %%xmm7" : : "x" (xmm) : "xmm7" );
+
+  return sym->st_value;
+}
+
+unsigned int
+pltexit (ElfW(Sym) *sym, unsigned int ndx, uintptr_t *refcook,
+	 uintptr_t *defcook, const La_regs *inregs, La_retval *outregs,
+	 const char *symname)
+{
+  printf ("pltexit: symname=%s, st_value=%#lx, ndx=%u, retval=%tu\n",
+	  symname, (long int) sym->st_value, ndx, outregs->int_retval);
+
+  __m128i xmm = _mm_set1_epi32 (-1);
+  asm volatile ("movdqa %0, %%xmm0" : : "x" (xmm) : "xmm0" );
+  asm volatile ("movdqa %0, %%xmm1" : : "x" (xmm) : "xmm1" );
+  asm volatile ("movdqa %0, %%xmm2" : : "x" (xmm) : "xmm2" );
+  asm volatile ("movdqa %0, %%xmm3" : : "x" (xmm) : "xmm3" );
+  asm volatile ("movdqa %0, %%xmm4" : : "x" (xmm) : "xmm4" );
+  asm volatile ("movdqa %0, %%xmm5" : : "x" (xmm) : "xmm5" );
+  asm volatile ("movdqa %0, %%xmm6" : : "x" (xmm) : "xmm6" );
+  asm volatile ("movdqa %0, %%xmm7" : : "x" (xmm) : "xmm7" );
+
+  return 0;
+}

Modified: fsf/trunk/libc/malloc/malloc.c
==============================================================================
--- fsf/trunk/libc/malloc/malloc.c (original)
+++ fsf/trunk/libc/malloc/malloc.c Fri Jul  3 00:08:28 2009
@@ -4822,6 +4822,7 @@
 	    goto errout;
 	  }
 	p->fd = fd = old;
+	atomic_full_barrier ();
       }
     while ((old = catomic_compare_and_exchange_val_acq (fb, p, fd)) != fd);
 #else

Modified: fsf/trunk/libc/string/stpncpy.c
==============================================================================
--- fsf/trunk/libc/string/stpncpy.c (original)
+++ fsf/trunk/libc/string/stpncpy.c Fri Jul  3 00:08:28 2009
@@ -28,17 +28,19 @@
 # include <sys/types.h>
 #endif
 
-#ifndef weak_alias
-# define __stpncpy stpncpy
+#ifndef STPNCPY
+# ifdef weak_alias
+#  define STPNCPY	__stpncpy
+weak_alias (__stpncpy, stpncpy)
+# else
+#  define STPNCPY	stpncpy
+# endif
 #endif
 
 /* Copy no more than N characters of SRC to DEST, returning the address of
    the terminating '\0' in DEST, if any, or else DEST + N.  */
 char *
-__stpncpy (dest, src, n)
-     char *dest;
-     const char *src;
-     size_t n;
+STPNCPY (char *dest, const char *src, size_t n)
 {
   char c;
   char *s = dest;
@@ -96,5 +98,4 @@
 }
 #ifdef weak_alias
 libc_hidden_def (__stpncpy)
-weak_alias (__stpncpy, stpncpy)
 #endif

Modified: fsf/trunk/libc/string/strncpy.c
==============================================================================
--- fsf/trunk/libc/string/strncpy.c (original)
+++ fsf/trunk/libc/string/strncpy.c Fri Jul  3 00:08:28 2009
@@ -21,11 +21,12 @@
 
 #undef strncpy
 
+#ifndef STRNCPY
+#define STRNCPY strncpy
+#endif
+
 char *
-strncpy (s1, s2, n)
-     char *s1;
-     const char *s2;
-     size_t n;
+STRNCPY (char *s1, const char *s2, size_t n)
 {
   reg_char c;
   char *s = s1;

Modified: fsf/trunk/libc/sysdeps/x86_64/dl-trampoline.S
==============================================================================
--- fsf/trunk/libc/sysdeps/x86_64/dl-trampoline.S (original)
+++ fsf/trunk/libc/sysdeps/x86_64/dl-trampoline.S Fri Jul  3 00:08:28 2009
@@ -107,7 +107,8 @@
 	movaps %xmm3, 112(%rsp)
 	movaps %xmm4, 128(%rsp)
 	movaps %xmm5, 144(%rsp)
-	movaps %xmm7, 160(%rsp)
+	movaps %xmm6, 160(%rsp)
+	movaps %xmm7, 176(%rsp)
 
 	movq %rsp, %rcx		# La_x86_64_regs pointer to %rcx.
 	movq 48(%rbx), %rdx	# Load return address if needed.
@@ -128,7 +129,8 @@
 	movaps 112(%rsp), %xmm3
 	movaps 128(%rsp), %xmm4
 	movaps 144(%rsp), %xmm5
-	movaps 160(%rsp), %xmm7
+	movaps 160(%rsp), %xmm6
+	movaps 176(%rsp), %xmm7
 
 	movq 16(%rbx), %r10	# Anything in framesize?
 	testq %r10, %r10

Modified: fsf/trunk/libc/sysdeps/x86_64/multiarch/Makefile
==============================================================================
--- fsf/trunk/libc/sysdeps/x86_64/multiarch/Makefile (original)
+++ fsf/trunk/libc/sysdeps/x86_64/multiarch/Makefile Fri Jul  3 00:08:28 2009
@@ -4,5 +4,5 @@
 endif
 
 ifeq ($(subdir),string)
-sysdep_routines += strncmp-c
+sysdep_routines += stpncpy-c strncpy-c strncmp-c
 endif

Added: fsf/trunk/libc/sysdeps/x86_64/multiarch/stpcpy.S
==============================================================================
--- fsf/trunk/libc/sysdeps/x86_64/multiarch/stpcpy.S (added)
+++ fsf/trunk/libc/sysdeps/x86_64/multiarch/stpcpy.S Fri Jul  3 00:08:28 2009
@@ -1,0 +1,7 @@
+#define USE_AS_STPCPY
+#define STRCPY __stpcpy
+#include "strcpy.S"
+
+weak_alias (__stpcpy, stpcpy)
+libc_hidden_def (__stpcpy)
+libc_hidden_builtin_def (stpcpy)

Added: fsf/trunk/libc/sysdeps/x86_64/multiarch/stpncpy-c.c
==============================================================================
--- fsf/trunk/libc/sysdeps/x86_64/multiarch/stpncpy-c.c (added)
+++ fsf/trunk/libc/sysdeps/x86_64/multiarch/stpncpy-c.c Fri Jul  3 00:08:28 2009
@@ -1,0 +1,8 @@
+#define STPNCPY __stpncpy_sse2
+#ifdef SHARED
+#undef libc_hidden_def
+#define libc_hidden_def(name) \
+  __hidden_ver1 (__stpncpy_sse2, __GI___stpncpy, __stpncpy_sse2);
+#endif
+
+#include "stpncpy.c"

Added: fsf/trunk/libc/sysdeps/x86_64/multiarch/stpncpy.S
==============================================================================
--- fsf/trunk/libc/sysdeps/x86_64/multiarch/stpncpy.S (added)
+++ fsf/trunk/libc/sysdeps/x86_64/multiarch/stpncpy.S Fri Jul  3 00:08:28 2009
@@ -1,0 +1,6 @@
+#define STRCPY __stpncpy
+#define USE_AS_STPCPY
+#define USE_AS_STRNCPY
+#include "strcpy.S"
+
+weak_alias (__stpncpy, stpncpy)

Added: fsf/trunk/libc/sysdeps/x86_64/multiarch/strcpy.S
==============================================================================
--- fsf/trunk/libc/sysdeps/x86_64/multiarch/strcpy.S (added)
+++ fsf/trunk/libc/sysdeps/x86_64/multiarch/strcpy.S Fri Jul  3 00:08:28 2009
@@ -1,0 +1,1917 @@
+/* strcpy with SSSE3
+   Copyright (C) 2009 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sysdep.h>
+#include <ifunc-defines.h>
+
+#if !defined (USE_AS_STPCPY) && !defined (USE_AS_STRNCPY)
+# ifndef STRCPY
+#  define STRCPY strcpy
+# endif
+#endif
+
+#ifdef USE_AS_STPCPY
+# ifdef USE_AS_STRNCPY
+#  define STRCPY_SSSE3	__stpncpy_ssse3
+#  define STRCPY_SSE2	__stpncpy_sse2
+#  define __GI_STRCPY	__GI_stpncpy
+# else
+#  define STRCPY_SSSE3	__stpcpy_ssse3
+#  define STRCPY_SSE2	__stpcpy_sse2
+#  define __GI_STRCPY	__GI_stpcpy
+#  define __GI___STRCPY	__GI___stpcpy
+# endif
+#else
+# ifdef USE_AS_STRNCPY
+#  define STRCPY_SSSE3	__strncpy_ssse3
+#  define STRCPY_SSE2	__strncpy_sse2
+#  define __GI_STRCPY	__GI_strncpy
+# else
+#  define STRCPY_SSSE3	__strcpy_ssse3
+#  define STRCPY_SSE2	__strcpy_sse2
+#  define __GI_STRCPY	__GI_strcpy
+# endif
+#endif
+
+#ifndef LABEL
+#define LABEL(l) L(l)
+#endif
+
+/* Define multiple versions only for the definition in libc.  */
+#ifndef NOT_IN_libc
+	.text
+ENTRY(STRCPY)
+	.type	STRCPY, @gnu_indirect_function
+	cmpl	$0, __cpu_features+KIND_OFFSET(%rip)
+	jne	1f
+	call	__init_cpu_features
+1:	leaq	STRCPY_SSE2(%rip), %rax
+	testl	$(1<<9), __cpu_features+CPUID_OFFSET+COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET(%rip)
+	jz	3f
+/* Avoid SSSE3 strcpy on Atom since it is slow.  */
+	cmpl	$1, __cpu_features+KIND_OFFSET(%rip)
+	jne	2f
+	cmpl	$6, __cpu_features+FAMILY_OFFSET(%rip)
+	jne	2f
+	cmpl	$28, __cpu_features+MODEL_OFFSET(%rip)
+	jz	3f
+2:	leaq	STRCPY_SSSE3(%rip), %rax
+3:	ret
+END(STRCPY)
+
+	.section .text.ssse3,"ax",@progbits
+STRCPY_SSSE3:
+	cfi_startproc
+	CALL_MCOUNT
+
+/*
+ * This implementation uses SSE to copy up to 16 bytes at a time.
+ */
+#ifdef USE_AS_STRNCPY
+	test    %rdx, %rdx
+	jz      LABEL(strncpy_exitz)
+	mov     %rdx, %r8
+#else
+	xor	%edx, %edx
+#endif
+	mov	%esi, %ecx
+	and	$0xfffffffffffffff0, %rsi	/*force rsi 16 byte align*/
+	and	$15, %ecx
+	mov	%rdi, %rax			/*store return parameter*/
+
+
+	pxor	%xmm0, %xmm0			/* clear %xmm0 */
+	pcmpeqb	(%rsi), %xmm0			/* compare 16 bytes in (%rsi) and %xmm0 for equality, try to find null char*/
+	pmovmskb %xmm0, %edx			/* move each byte mask of %xmm0 to edx*/
+	shr	%cl, %edx			/* get real bits left in edx*/
+	test	%edx, %edx			/* edx must be 0 if there is no null char from rsi+%rcx */
+	jnz	LABEL(less16bytes)
+
+#ifdef USE_AS_STRNCPY
+	lea	-16(%r8,%rcx), %r11
+	cmp	$0, %r11
+	jle	LABEL(less16bytes)		/* if r8 + rcx <= 16, branch to less16bytes.  */
+#endif
+
+	mov	%rcx, %r9
+	or	%edi, %ecx
+	and	$15, %ecx
+	lea	-16(%r9), %r10
+	jz	LABEL(ashr_0)			/* ecx must be 0 if offset of rsi and rdi is 16 byte align*/
+
+	neg	%r10				/* store the rest in rsi aligned 16 bytes for unaligned_exit*/
+
+	pxor	%xmm0, %xmm0			/* clear %xmm0, may be polluted by unaligned operation*/
+	pcmpeqb	16(%rsi), %xmm0			/* compare 16 bytes in (%rsi) and %xmm0 for equality, try to find null char*/
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(less32bytes)
+	/*
+	* at least 16 byte available to fill destination rdi
+	*/
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	LABEL(less32bytes_strncpy_truncation)
+#endif
+	mov	(%rsi, %r9), %rdx
+	mov	%rdx, (%rdi)
+	mov	8(%rsi, %r9), %rdx
+	mov	%rdx, 8(%rdi)
+
+	/*
+	* so far destatination rdi may be aligned by 16, re-calculate rsi to jump
+	* crossponding case
+	* rcx is offset of rsi
+	* rax is offset of rdi
+	*/
+
+	and	$0xfffffffffffffff0, %rdi	/* force rdi 16 byte align */
+	mov	%rax, %rdx			/* rax store orignal rdi */
+	xor	%rdi, %rdx			/* equal to and $15, %rdx */
+#ifdef USE_AS_STRNCPY
+	add     %rdx, %r8
+#endif
+
+	add	$16, %rdi			/* next 16 bytes for rdi */
+	sub	%rdx, %r9
+
+	lea	16(%r9, %rsi), %rsi		/*re-calculate rsi by (16 - rdx)+ rcx */
+	mov	%esi, %ecx			/*store offset of rsi */
+	and	$0xfffffffffffffff0, %rsi	/* force rsi 16 byte align */
+
+	and	$15, %ecx			/* ecx must be 0 if rdx is equal to rcx*/
+	jz	LABEL(ashr_0)
+
+	lea	-16(%rcx), %r10
+	mov	%rcx, %r9
+	neg	%r10
+	lea	LABEL(unaligned_table)(%rip), %r11
+	movslq  (%r11, %rcx,4), %rcx
+	lea	(%r11, %rcx), %rcx
+	jmp	*%rcx
+
+ /*
+ * The following cases will be handled by ashr_0 & ashr_0_start
+ *  rcx(offset of rsi)  rax(offset of rdi)  relative offset  corresponding case
+ *	0		    0		  0		 ashr_0
+ *	n(1~15)	     n(1~15)	   0		 ashr_0_start
+ *
+ */
+	.p2align 5
+LABEL(ashr_0):
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	LABEL(strncpy_truncation_aligned)
+#endif
+	movdqa  (%rsi), %xmm1	   /* fetch first 16 bytes from rsi */
+	movdqa  %xmm1, (%rdi)	   /* store first 16 bytes into rdi */
+	add     $16, %rsi
+	add     $16, %rdi
+	pcmpeqb  (%rsi), %xmm0		   /* compare 16 bytes in (%rsi) and %xmm0 for equality, try to find null char */
+	pmovmskb  %xmm0, %edx		   /* move each byte mask of %xmm0 to edx*/
+
+	test    %edx, %edx		  /* edx must be 0 if there is no null char in rsi*/
+	jnz	LABEL(aligned_16bytes)
+
+LABEL(ashr_0_loop):
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	LABEL(strncpy_truncation_aligned)
+#endif
+	movdqa  (%rsi, %rcx), %xmm1
+	movdqa  %xmm1, (%rdi, %rcx)
+	add	$16, %rcx
+	pcmpeqb  (%rsi, %rcx), %xmm0
+	pmovmskb  %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(aligned_exit)
+
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	LABEL(strncpy_truncation_aligned)
+#endif
+	movdqa  (%rsi, %rcx), %xmm1
+	movdqa  %xmm1, (%rdi, %rcx)
+	add	$16, %rcx
+	pcmpeqb  (%rsi, %rcx), %xmm0
+	pmovmskb  %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(aligned_exit)
+
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	LABEL(strncpy_truncation_aligned)
+#endif
+	movdqa  (%rsi, %rcx), %xmm1
+	movdqa  %xmm1, (%rdi, %rcx)
+	add	$16, %rcx
+	pcmpeqb  (%rsi, %rcx), %xmm0
+	pmovmskb  %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(aligned_exit)
+
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	LABEL(strncpy_truncation_aligned)
+#endif
+	movdqa  (%rsi, %rcx), %xmm1
+	movdqa  %xmm1, (%rdi, %rcx)
+	add	$16, %rcx
+	pcmpeqb  (%rsi, %rcx), %xmm0
+	pmovmskb  %xmm0, %edx
+	test	%edx, %edx
+	jz	LABEL(ashr_0_loop)
+
+	jmp	LABEL(aligned_exit)
+        .p2align 4
+
+/*
+ * The following cases will be handled by ashr_15
+ *  rcx(offset of rsi)  rax(offset of rdi)	relative offset  	  corresponding case
+ *      n(15)		n - 15		15((16 - (n -15) + n)%16	 ashr_15
+ *
+ * Based on above operation , start from  (%r9 + rsi) to the left of this cache bank, there is no null byte
+ */
+	.p2align 4
+LABEL(ashr_15):
+	xor	%ecx, %ecx				/*clear ecx */
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+
+	.p2align 4
+LABEL(ashr_15_use_ssse3):
+	movdqa	16(%rsi, %rcx), %xmm3
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	palignr $15, (%rsi, %rcx), %xmm3
+	movdqa	%xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+
+	movdqa  16(%rsi, %rcx), %xmm3
+	pcmpeqb %xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	palignr $15, (%rsi, %rcx), %xmm3
+	movdqa  %xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	jmp	LABEL(ashr_15_use_ssse3)
+
+/*
+ * The following cases will be handled by ashr_14
+ *  rcx(offset of rsi)  rax(offset of rdi)	relative offset  	  corresponding case
+ *      n(14~15)		n - 14		14((16 - (n -14) + n)%16	 ashr_14
+ *
+ * Based on above operation , start from  (%r9 + rsi) to the left of this cache bank, there is no null byte
+ */
+	.p2align 4
+LABEL(ashr_14):
+	xor	%ecx, %ecx				/*clear ecx */
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+
+	.p2align 4
+LABEL(ashr_14_use_ssse3):
+	movdqa	16(%rsi, %rcx), %xmm3
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	palignr $14, (%rsi, %rcx), %xmm3
+	movdqa	%xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+
+	movdqa  16(%rsi, %rcx), %xmm3
+	pcmpeqb %xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	palignr $14, (%rsi, %rcx), %xmm3
+	movdqa  %xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	jmp	LABEL(ashr_14_use_ssse3)
+
+/*
+ * The following cases will be handled by ashr_13
+ *  rcx(offset of rsi)  rax(offset of rdi)	relative offset  	  corresponding case
+ *      n(13~15)		n - 13		13((16 - (n -13) + n)%16	 ashr_13
+ *
+ * Based on above operation , start from  (%r9 + rsi) to the left of this cache bank, there is no null byte
+ */
+	.p2align 4
+LABEL(ashr_13):
+	xor	%ecx, %ecx				/*clear ecx */
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+
+	.p2align 4
+LABEL(ashr_13_use_ssse3):
+	movdqa	16(%rsi, %rcx), %xmm3
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	palignr $13, (%rsi, %rcx), %xmm3
+	movdqa	%xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+
+	movdqa  16(%rsi, %rcx), %xmm3
+	pcmpeqb %xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	palignr $13, (%rsi, %rcx), %xmm3
+	movdqa  %xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	jmp	LABEL(ashr_13_use_ssse3)
+
+/*
+ * The following cases will be handled by ashr_12
+ *  rcx(offset of rsi)  rax(offset of rdi)	relative offset  	  corresponding case
+ *      n(12~15)		n - 12		12((16 - (n -12) + n)%16	 ashr_12
+ *
+ * Based on above operation , start from  (%r9 + rsi) to the left of this cache bank, there is no null byte
+ */
+	.p2align 4
+LABEL(ashr_12):
+	xor	%ecx, %ecx				/*clear ecx */
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+
+	.p2align 4
+LABEL(ashr_12_use_ssse3):
+	movdqa	16(%rsi, %rcx), %xmm3
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	palignr $12, (%rsi, %rcx), %xmm3
+	movdqa	%xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+
+	movdqa  16(%rsi, %rcx), %xmm3
+	pcmpeqb %xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	palignr $12, (%rsi, %rcx), %xmm3
+	movdqa  %xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	jmp	LABEL(ashr_12_use_ssse3)
+
+/*
+ * The following cases will be handled by ashr_11
+ *  rcx(offset of rsi)  rax(offset of rdi)	relative offset  	  corresponding case
+ *      n(11~15)		n - 11		11((16 - (n -11) + n)%16	 ashr_11
+ *
+ * Based on above operation , start from  (%r9 + rsi) to the left of this cache bank, there is no null byte
+ */
+	.p2align 4
+LABEL(ashr_11):
+	xor	%ecx, %ecx				/*clear ecx */
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+
+	.p2align 4
+LABEL(ashr_11_use_ssse3):
+	movdqa	16(%rsi, %rcx), %xmm3
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	palignr $11, (%rsi, %rcx), %xmm3
+	movdqa	%xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+
+	movdqa  16(%rsi, %rcx), %xmm3
+	pcmpeqb %xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	palignr $11, (%rsi, %rcx), %xmm3
+	movdqa  %xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	jmp	LABEL(ashr_11_use_ssse3)
+
+/*
+ * The following cases will be handled by ashr_10
+ *  rcx(offset of rsi)  rax(offset of rdi)	relative offset  	  corresponding case
+ *      n(10~15)		n - 10		10((16 - (n -10) + n)%16	 ashr_10
+ *
+ * Based on above operation , start from  (%r9 + rsi) to the left of this cache bank, there is no null byte
+ */
+	.p2align 4
+LABEL(ashr_10):
+	xor	%ecx, %ecx				/*clear ecx */
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+
+	.p2align 4
+LABEL(ashr_10_use_ssse3):
+	movdqa	16(%rsi, %rcx), %xmm3
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	palignr $10, (%rsi, %rcx), %xmm3
+	movdqa	%xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+
+	movdqa  16(%rsi, %rcx), %xmm3
+	pcmpeqb %xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	palignr $10, (%rsi, %rcx), %xmm3
+	movdqa  %xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	jmp	LABEL(ashr_10_use_ssse3)
+
+/*
+ * The following cases will be handled by ashr_9
+ *  rcx(offset of rsi)  rax(offset of rdi)	relative offset  	  corresponding case
+ *      n(9~15)		n - 9		9((16 - (n -9) + n)%16	 ashr_9
+ *
+ * Based on above operation , start from  (%r9 + rsi) to the left of this cache bank, there is no null byte
+ */
+	.p2align 4
+LABEL(ashr_9):
+	xor	%ecx, %ecx				/*clear ecx */
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+
+	.p2align 4
+LABEL(ashr_9_use_ssse3):
+	movdqa	16(%rsi, %rcx), %xmm3
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	palignr $9, (%rsi, %rcx), %xmm3
+	movdqa	%xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+
+	movdqa  16(%rsi, %rcx), %xmm3
+	pcmpeqb %xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	palignr $9, (%rsi, %rcx), %xmm3
+	movdqa  %xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	jmp	LABEL(ashr_9_use_ssse3)
+
+/*
+ * The following cases will be handled by ashr_8
+ *  rcx(offset of rsi)  rax(offset of rdi)	relative offset  	  corresponding case
+ *      n(8~15)		n - 8		8((16 - (n -8) + n)%16	 ashr_8
+ *
+ * Based on above operation , start from  (%r9 + rsi) to the left of this cache bank, there is no null byte
+ */
+	.p2align 4
+LABEL(ashr_8):
+	xor	%ecx, %ecx				/*clear ecx */
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+
+	.p2align 4
+LABEL(ashr_8_use_ssse3):
+	movdqa	16(%rsi, %rcx), %xmm3
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	palignr $8, (%rsi, %rcx), %xmm3
+	movdqa	%xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+
+	movdqa  16(%rsi, %rcx), %xmm3
+	pcmpeqb %xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	palignr $8, (%rsi, %rcx), %xmm3
+	movdqa  %xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	jmp	LABEL(ashr_8_use_ssse3)
+
+/*
+ * The following cases will be handled by ashr_7
+ *  rcx(offset of rsi)  rax(offset of rdi)	relative offset  	  corresponding case
+ *      n(7~15)		n - 7		7((16 - (n -7) + n)%16	 ashr_7
+ *
+ * Based on above operation , start from  (%r9 + rsi) to the left of this cache bank, there is no null byte
+ */
+	.p2align 4
+LABEL(ashr_7):
+	xor	%ecx, %ecx				/*clear ecx */
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	.p2align 4
+
+LABEL(ashr_7_use_ssse3):
+	movdqa	16(%rsi, %rcx), %xmm3
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	palignr $7, (%rsi, %rcx), %xmm3
+	movdqa	%xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+
+	movdqa  16(%rsi, %rcx), %xmm3
+	pcmpeqb %xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	palignr $7, (%rsi, %rcx), %xmm3
+	movdqa  %xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	jmp	LABEL(ashr_7_use_ssse3)
+
+/*
+ * The following cases will be handled by ashr_6
+ *  rcx(offset of rsi)  rax(offset of rdi)	relative offset  	  corresponding case
+ *      n(6~15)		n - 6		6((16 - (n -6) + n)%16	 ashr_6
+ *
+ * Based on above operation , start from  (%r9 + rsi) to the left of this cache bank, there is no null byte
+ */
+	.p2align 4
+LABEL(ashr_6):
+	xor	%ecx, %ecx				/*clear ecx */
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+
+	.p2align 4
+LABEL(ashr_6_use_ssse3):
+	movdqa	16(%rsi, %rcx), %xmm3
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	palignr $6, (%rsi, %rcx), %xmm3
+	movdqa	%xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+
+	movdqa  16(%rsi, %rcx), %xmm3
+	pcmpeqb %xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	palignr $6, (%rsi, %rcx), %xmm3
+	movdqa  %xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	jmp	LABEL(ashr_6_use_ssse3)
+
+ /*
+ * The following cases will be handled by ashr_5
+ *  rcx(offset of rsi)  rax(offset of rdi)	relative offset  	  corresponding case
+ *      n(5~15)		n - 5		5((16 - (n -5) + n)%16	 ashr_5
+ *
+ * Based on above operation , start from  (%r9 + rsi) to the left of this cache bank, there is no null byte
+ */
+	.p2align 4
+LABEL(ashr_5):
+	xor	%ecx, %ecx				/*clear ecx */
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+
+	.p2align 4
+LABEL(ashr_5_use_ssse3):
+	movdqa	16(%rsi, %rcx), %xmm3
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	palignr $5, (%rsi, %rcx), %xmm3
+	movdqa	%xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+
+	movdqa  16(%rsi, %rcx), %xmm3
+	pcmpeqb %xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	palignr $5, (%rsi, %rcx), %xmm3
+	movdqa  %xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	jmp	LABEL(ashr_5_use_ssse3)
+
+/*
+ *
+ * The following cases will be handled by ashr_4
+ *  rcx(offset of rsi)  rax(offset of rdi)	relative offset  	  corresponding case
+ *      n(4~15)		n - 4		4((16 - (n -4) + n)%16	 ashr_4
+ *
+ * Based on above operation , start from  (%r9 + rsi) to the left of this cache bank, there is no null byte
+ */
+	.p2align 4
+LABEL(ashr_4):
+	xor	%ecx, %ecx				/*clear ecx */
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+
+	.p2align 4
+LABEL(ashr_4_use_ssse3):
+	movdqa	16(%rsi, %rcx), %xmm3
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	palignr $4, (%rsi, %rcx), %xmm3
+	movdqa	%xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+
+	movdqa  16(%rsi, %rcx), %xmm3
+	pcmpeqb %xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	palignr $4, (%rsi, %rcx), %xmm3
+	movdqa  %xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	jmp	LABEL(ashr_4_use_ssse3)
+
+/*
+ *
+ * The following cases will be handled by ashr_3
+ *  rcx(offset of rsi)  rax(offset of rdi)	relative offset  	  corresponding case
+ *      n(3~15)		n - 3		3((16 - (n -3) + n)%16	 ashr_3
+ *
+ * Based on above operation , start from  (%r9 + rsi) to the left of this cache bank, there is no null byte
+ */
+	.p2align 4
+LABEL(ashr_3):
+	xor	%ecx, %ecx				/*clear ecx */
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+
+	.p2align 4
+LABEL(ashr_3_use_ssse3):
+	movdqa	16(%rsi, %rcx), %xmm3
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	palignr $3, (%rsi, %rcx), %xmm3
+	movdqa	%xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+
+	movdqa  16(%rsi, %rcx), %xmm3
+	pcmpeqb %xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	palignr $3, (%rsi, %rcx), %xmm3
+	movdqa  %xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	jmp	LABEL(ashr_3_use_ssse3)
+
+/*
+ *
+ * The following cases will be handled by ashr_2
+ *  rcx(offset of rsi)  rax(offset of rdi)	relative offset  	  corresponding case
+ *      n(2~15)		n - 2		2((16 - (n -2) + n)%16	 ashr_2
+ *
+ * Based on above operation , start from  (%r9 + rsi) to the left of this cache bank, there is no null byte
+ */
+	.p2align 4
+LABEL(ashr_2):
+	xor	%ecx, %ecx				/*clear ecx */
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+
+	.p2align 4
+LABEL(ashr_2_use_ssse3):
+	movdqa	16(%rsi, %rcx), %xmm3
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	palignr $2, (%rsi, %rcx), %xmm3
+	movdqa	%xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+
+	movdqa  16(%rsi, %rcx), %xmm3
+	pcmpeqb %xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	palignr $2, (%rsi, %rcx), %xmm3
+	movdqa  %xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	jmp	LABEL(ashr_2_use_ssse3)
+
+/*
+ *
+ * The following cases will be handled by ashr_1
+ *  rcx(offset of rsi)  rax(offset of rdi)  relative offset  		corresponding case
+ *	n(1~15)		n - 1	   	1 ((16 - (n -1) + n)%16	 ashr_1
+ *
+ * Based on above operation , start from  (%r9 + rsi) to the left of this cache bank, there is no null byte
+ */
+	.p2align 4
+LABEL(ashr_1):
+	xor	%ecx, %ecx				/*clear ecx */
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+
+	.p2align 4
+LABEL(ashr_1_use_ssse3):
+	movdqa	16(%rsi, %rcx), %xmm3
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	palignr $1, (%rsi, %rcx), %xmm3
+	movdqa	%xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+
+	movdqa  16(%rsi, %rcx), %xmm3
+	pcmpeqb %xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+	palignr $1, (%rsi, %rcx), %xmm3
+	movdqa  %xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	jmp	LABEL(ashr_1_use_ssse3)
+
+	.p2align 4
+LABEL(less32bytes):
+	xor	%ecx, %ecx
+LABEL(unaligned_exit):
+	add	%r9, %rsi		/* r9 stores original offset of rsi*/
+	mov	%rcx, %r9
+	mov	%r10, %rcx
+	shl	%cl, %edx		/* after shl, calculate the exact number to be filled*/
+	mov	%r9, %rcx
+	.p2align 4
+LABEL(aligned_exit):
+	add	%rcx, %rdi		/*locate exact address for rdi */
+LABEL(less16bytes):
+	add	%rcx, %rsi		/*locate exact address for rsi */
+LABEL(aligned_16bytes):
+#ifdef USE_AS_STRNCPY
+	mov     $1, %r9d
+	lea     -1(%r8), %rcx
+	shl     %cl, %r9d
+	cmp     $32, %r8
+	ja      LABEL(strncpy_tail)
+	or      %r9d, %edx
+LABEL(strncpy_tail):
+#endif
+	bsf	%rdx, %rcx		/*If a least significant 1 bit in %rdx is found, its bit index is stored in %rcx*/
+	lea	LABEL(tail_table)(%rip), %r11
+	movslq	(%r11, %rcx,4), %rcx
+	lea	(%r11, %rcx), %rcx
+	jmp	*%rcx
+
+#ifdef USE_AS_STRNCPY
+	.p2align 4
+LABEL(less32bytes_strncpy_truncation):
+	xor     %ecx, %ecx
+LABEL(strncpy_truncation_unaligned):
+	add      %r9, %rsi
+LABEL(strncpy_truncation_aligned):
+	add      %rcx, %rdi
+	add      %rcx, %rsi
+	add     $16, %r8
+	lea     -1(%r8), %rcx
+	lea     LABEL(tail_table)(%rip), %r11
+	movslq  (%r11, %rcx,4), %rcx
+	lea     (%r11, %rcx), %rcx
+	jmp     *%rcx
+	.p2align 4
+LABEL(strncpy_exitz):
+	mov     %rdi, %rax
+	ret
+#endif
+
+#ifdef USE_AS_STRNCPY
+	.p2align 4
+LABEL(strncpy_fill_tail):
+	mov	%rax, %rdx
+	movzx	%cl, %rax
+	mov	%r8, %rcx
+	add	%rax, %rdi
+	xor	%eax, %eax
+	shr	$3, %ecx
+	jz	LABEL(strncpy_fill_less_8)
+
+	rep	stosq
+LABEL(strncpy_fill_less_8):
+	mov	%r8, %rcx
+	and	$7, %ecx
+	jz	LABEL(strncpy_fill_return)
+LABEL(strncpy_fill_less_7):
+	sub	$1, %ecx
+	mov	%al, (%rdi, %rcx)
+	jnz	LABEL(strncpy_fill_less_7)
+LABEL(strncpy_fill_return):
+#ifdef USE_AS_STPCPY
+	cmpb	$1, (%rdx)
+	sbb	$-1, %rdx
+#endif
+	mov	%rdx, %rax
+	ret
+#endif
+	.p2align 4
+LABEL(tail_0):
+	mov	(%rsi), %cl
+	mov	%cl, (%rdi)
+#ifdef USE_AS_STPCPY
+	mov	%rdi, %rax
+#endif
+#ifdef USE_AS_STRNCPY
+	mov	$1, %cl
+	sub	$1, %r8
+	jnz	LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#endif
+#endif
+	ret
+	.p2align 4
+LABEL(tail_1):
+	mov	(%rsi), %cx
+	mov	%cx, (%rdi)
+#ifdef USE_AS_STPCPY
+	lea	1(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+	mov	$2, %cl
+	sub	$2, %r8
+	jnz	LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#endif
+#endif
+	ret
+	.p2align 4
+LABEL(tail_2):
+	mov	(%rsi), %cx
+	mov	%cx, (%rdi)
+	mov	1(%rsi), %cx
+	mov	%cx, 1(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	2(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+	mov	$3, %cl
+	sub	$3, %r8
+	jnz	LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#endif
+#endif
+	ret
+	.p2align 4
+LABEL(tail_3):
+	mov	(%rsi), %ecx
+	mov	%ecx, (%rdi)
+#ifdef USE_AS_STPCPY
+	lea	3(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+	mov	$4, %cl
+	sub	$4, %r8
+	jnz	LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#endif
+#endif
+	ret
+	.p2align 4
+LABEL(tail_4):
+	mov	(%rsi), %ecx
+	mov	%ecx, (%rdi)
+	mov	1(%rsi), %edx
+	mov	%edx, 1(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	4(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+	mov	$5, %cl
+	sub	$5, %r8
+	jnz	LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#endif
+#endif
+	ret
+	.p2align 4
+LABEL(tail_5):
+	mov	(%rsi), %ecx
+	mov	%ecx, (%rdi)
+	mov	2(%rsi), %edx
+	mov	%edx, 2(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	5(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+	mov	$6, %cl
+	sub	$6, %r8
+	jnz	LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#endif
+#endif
+	ret
+	.p2align 4
+LABEL(tail_6):
+	mov	(%rsi), %ecx
+	mov	%ecx, (%rdi)
+	mov	3(%rsi), %edx
+	mov	%edx,3(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	6(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+	mov	$7, %cl
+	sub	$7, %r8
+	jnz	LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#endif
+#endif
+	ret
+
+	.p2align 4
+LABEL(tail_7):
+	mov	(%rsi), %rcx
+	mov	%rcx, (%rdi)
+#ifdef USE_AS_STPCPY
+	lea	7(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+	mov	$8, %cl
+	sub	$8, %r8
+	jnz	LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#endif
+#endif
+	ret
+
+	.p2align 4
+LABEL(tail_8):
+
+	mov	(%rsi), %rcx
+	mov	%rcx, (%rdi)
+	mov	5(%rsi), %edx
+	mov	%edx, 5(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	8(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+	mov	$9, %cl
+	sub	$9, %r8
+	jnz	LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#endif
+#endif
+	ret
+
+	.p2align 4
+LABEL(tail_9):
+	mov	(%rsi), %rcx
+	mov	%rcx, (%rdi)
+	mov	6(%rsi), %edx
+	mov	%edx, 6(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	9(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+	mov	$10, %cl
+	sub	$10, %r8
+	jnz	LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#endif
+#endif
+	ret
+
+	.p2align 4
+LABEL(tail_10):
+	mov	(%rsi), %rcx
+	mov	%rcx, (%rdi)
+	mov	7(%rsi), %edx
+	mov	%edx, 7(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	10(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+	mov	$11, %cl
+	sub	$11, %r8
+	jnz	LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#endif
+#endif
+	ret
+	.p2align 4
+LABEL(tail_11):
+	mov	(%rsi), %rcx
+	mov	%rcx, (%rdi)
+	mov	8(%rsi), %edx
+	mov	%edx, 8(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	11(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+	mov	$12, %cl
+	sub	$12, %r8
+	jnz	LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#endif
+#endif
+	ret
+	.p2align 4
+LABEL(tail_12):
+	mov	(%rsi), %rcx
+	mov	%rcx, (%rdi)
+	mov	5(%rsi), %rcx
+	mov	%rcx, 5(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	12(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+	mov	$13, %cl
+	sub	$13, %r8
+	jnz	LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#endif
+#endif
+	ret
+
+	.p2align 4
+LABEL(tail_13):
+	mov	(%rsi), %rcx
+	mov	%rcx, (%rdi)
+	mov	6(%rsi), %rcx
+	mov	%rcx, 6(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	13(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+	mov	$14, %cl
+	sub	$14, %r8
+	jnz	LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#endif
+#endif
+	ret
+
+	.p2align 4
+LABEL(tail_14):
+	mov	(%rsi), %rcx
+	mov	%rcx, (%rdi)
+	mov	7(%rsi), %rcx
+	mov	%rcx, 7(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	14(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+	mov	$15, %cl
+	sub	$15, %r8
+	jnz	LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#endif
+#endif
+	ret
+
+LABEL(tail_15):
+	mov	(%rsi), %rcx
+	mov	%rcx, (%rdi)
+	mov	8(%rsi), %rdx
+	mov	%rdx, 8(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	15(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+	mov	$16, %cl
+	sub	$16, %r8
+	jnz	LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#endif
+#endif
+
+	ret
+
+	.p2align 4
+LABEL(tail_16):
+	mov	(%rsi), %rcx
+	mov	%rcx, (%rdi)
+	mov	8(%rsi), %rdx
+	mov	%rdx, 8(%rdi)
+	mov	16(%rsi), %cl
+	mov	%cl, 16(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	16(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+	mov	$17, %cl
+	sub	$17, %r8
+	jnz	LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#endif
+#endif
+	ret
+	.p2align 4
+LABEL(tail_17):
+	mov	(%rsi), %rcx
+	mov	%rcx, (%rdi)
+	mov	8(%rsi), %rdx
+	mov	%rdx, 8(%rdi)
+	mov	16(%rsi), %cx
+	mov	%cx, 16(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	17(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+	mov	$18, %cl
+	sub	$18, %r8
+	jnz	LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#endif
+#endif
+	ret
+
+	.p2align 4
+LABEL(tail_18):
+	mov	(%rsi), %rcx
+	mov	%rcx, (%rdi)
+	mov	8(%rsi), %rdx
+	mov	%rdx, 8(%rdi)
+	mov	15(%rsi), %ecx
+	mov	%ecx,15(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	18(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+	mov	$19, %cl
+	sub	$19, %r8
+	jnz	LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#endif
+#endif
+	ret
+
+	.p2align 4
+LABEL(tail_19):
+	mov	(%rsi), %rcx
+	mov	%rcx, (%rdi)
+	mov	8(%rsi), %rdx
+	mov	%rdx, 8(%rdi)
+	mov	16(%rsi), %ecx
+	mov	%ecx, 16(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	19(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+	mov	$20, %cl
+	sub	$20, %r8
+	jnz	LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#endif
+#endif
+	ret
+	.p2align 4
+LABEL(tail_20):
+	mov	(%rsi), %rcx
+	mov	%rcx, (%rdi)
+	mov	8(%rsi), %rdx
+	mov	%rdx, 8(%rdi)
+	mov	13(%rsi), %rcx
+	mov	%rcx, 13(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	20(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+	mov	$21, %cl
+	sub	$21, %r8
+	jnz	LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#endif
+#endif
+	ret
+	.p2align 4
+LABEL(tail_21):
+	mov	(%rsi), %rcx
+	mov	%rcx, (%rdi)
+	mov	8(%rsi), %rdx
+	mov	%rdx, 8(%rdi)
+	mov	14(%rsi), %rcx
+	mov	%rcx, 14(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	21(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+	mov	$22, %cl
+	sub	$22, %r8
+	jnz	LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#endif
+#endif
+	ret
+
+	.p2align 4
+LABEL(tail_22):
+	mov	(%rsi), %rcx
+	mov	%rcx, (%rdi)
+	mov	8(%rsi), %rdx
+	mov	%rdx, 8(%rdi)
+	mov	15(%rsi), %rcx
+	mov	%rcx, 15(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	22(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+	mov	$23, %cl
+	sub	$23, %r8
+	jnz	LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#endif
+#endif
+	ret
+
+	.p2align 4
+LABEL(tail_23):
+	mov	(%rsi), %rcx
+	mov	%rcx, (%rdi)
+	mov	8(%rsi), %rdx
+	mov	%rdx, 8(%rdi)
+	mov	16(%rsi), %rcx
+	mov	%rcx, 16(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	23(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+	mov	$24, %cl
+	sub	$24, %r8
+	jnz	LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#endif
+#endif
+
+	ret
+
+	.p2align 4
+LABEL(tail_24):
+	mov	(%rsi), %rcx
+	mov	%rcx, (%rdi)
+	mov	8(%rsi), %rdx
+	mov	%rdx, 8(%rdi)
+	mov	16(%rsi), %rcx
+	mov	%rcx, 16(%rdi)
+	mov	21(%rsi), %edx
+	mov	%edx, 21(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	24(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+	mov	$25, %cl
+	sub	$25, %r8
+	jnz	LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#endif
+#endif
+	ret
+
+	.p2align 4
+LABEL(tail_25):
+	mov	(%rsi), %rcx
+	mov	%rcx, (%rdi)
+	mov	8(%rsi), %rdx
+	mov	%rdx, 8(%rdi)
+	mov	16(%rsi), %rcx
+	mov	%rcx, 16(%rdi)
+	mov	22(%rsi), %edx
+	mov	%edx, 22(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	25(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+	mov	$26, %cl
+	sub	$26, %r8
+	jnz	LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#endif
+#endif
+	ret
+
+	.p2align 4
+LABEL(tail_26):
+	mov	(%rsi), %rcx
+	mov	%rcx, (%rdi)
+	mov	8(%rsi), %rdx
+	mov	%rdx, 8(%rdi)
+	mov	16(%rsi), %rcx
+	mov	%rcx, 16(%rdi)
+	mov	23(%rsi), %edx
+	mov	%edx, 23(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	26(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+	mov	$27, %cl
+	sub	$27, %r8
+	jnz	LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#endif
+#endif
+	ret
+
+	.p2align 4
+LABEL(tail_27):
+	mov	(%rsi), %rcx
+	mov	%rcx, (%rdi)
+	mov	8(%rsi), %rdx
+	mov	%rdx, 8(%rdi)
+	mov	16(%rsi), %rcx
+	mov	%rcx, 16(%rdi)
+	mov	24(%rsi), %edx
+	mov	%edx, 24(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	27(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+	mov	$28, %cl
+	sub	$28, %r8
+	jnz	LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#endif
+#endif
+	ret
+	.p2align 4
+LABEL(tail_28):
+	mov	(%rsi), %rcx
+	mov	%rcx, (%rdi)
+	mov	8(%rsi), %rdx
+	mov	%rdx, 8(%rdi)
+	mov	16(%rsi), %rcx
+	mov	%rcx, 16(%rdi)
+	mov	21(%rsi), %rdx
+	mov	%rdx, 21(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	28(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+	mov	$29, %cl
+	sub	$29, %r8
+	jnz	LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#endif
+#endif
+
+	ret
+
+	.p2align 4
+LABEL(tail_29):
+	mov	(%rsi), %rcx
+	mov	%rcx, (%rdi)
+	mov	8(%rsi), %rdx
+	mov	%rdx, 8(%rdi)
+	mov	16(%rsi), %rcx
+	mov	%rcx, 16(%rdi)
+	mov	22(%rsi), %rdx
+	mov	%rdx, 22(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	29(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+	mov	$30, %cl
+	sub	$30, %r8
+	jnz	LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#endif
+#endif
+
+	ret
+
+
+	.p2align 4
+LABEL(tail_30):
+	mov	(%rsi), %rcx
+	mov	%rcx, (%rdi)
+	mov	8(%rsi), %rdx
+	mov	%rdx, 8(%rdi)
+	mov	16(%rsi), %rcx
+	mov	%rcx, 16(%rdi)
+	mov	23(%rsi), %rdx
+	mov	%rdx, 23(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	30(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+	mov	$31, %cl
+	sub	$31, %r8
+	jnz	LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#endif
+#endif
+	ret
+
+	.p2align 4
+LABEL(tail_31):
+	mov	(%rsi), %rcx
+	mov	%rcx, (%rdi)
+	mov	8(%rsi), %rdx
+	mov	%rdx, 8(%rdi)
+	mov	16(%rsi), %rcx
+	mov	%rcx, 16(%rdi)
+	mov	24(%rsi), %rdx
+	mov	%rdx, 24(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	31(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+	mov	$32, %cl
+	sub	$32, %r8
+	jnz	LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#endif
+#endif
+	ret
+	cfi_endproc
+	.size	STRCPY_SSSE3, .-STRCPY_SSSE3
+
+	.p2align 4
+	.section .rodata.ssse3,"a",@progbits
+LABEL(tail_table):
+	.int	LABEL(tail_0) - LABEL(tail_table)
+	.int	LABEL(tail_1) - LABEL(tail_table)
+	.int	LABEL(tail_2) - LABEL(tail_table)
+	.int	LABEL(tail_3) - LABEL(tail_table)
+	.int	LABEL(tail_4) - LABEL(tail_table)
+	.int	LABEL(tail_5) - LABEL(tail_table)
+	.int	LABEL(tail_6) - LABEL(tail_table)
+	.int	LABEL(tail_7) - LABEL(tail_table)
+	.int	LABEL(tail_8) - LABEL(tail_table)
+	.int	LABEL(tail_9) - LABEL(tail_table)
+	.int	LABEL(tail_10) - LABEL(tail_table)
+	.int	LABEL(tail_11) - LABEL(tail_table)
+	.int	LABEL(tail_12) - LABEL(tail_table)
+	.int	LABEL(tail_13) - LABEL(tail_table)
+	.int	LABEL(tail_14) - LABEL(tail_table)
+	.int	LABEL(tail_15) - LABEL(tail_table)
+	.int	LABEL(tail_16) - LABEL(tail_table)
+	.int	LABEL(tail_17) - LABEL(tail_table)
+	.int	LABEL(tail_18) - LABEL(tail_table)
+	.int	LABEL(tail_19) - LABEL(tail_table)
+	.int	LABEL(tail_20) - LABEL(tail_table)
+	.int	LABEL(tail_21) - LABEL(tail_table)
+	.int	LABEL(tail_22) - LABEL(tail_table)
+	.int	LABEL(tail_23) - LABEL(tail_table)
+	.int	LABEL(tail_24) - LABEL(tail_table)
+	.int	LABEL(tail_25) - LABEL(tail_table)
+	.int	LABEL(tail_26) - LABEL(tail_table)
+	.int	LABEL(tail_27) - LABEL(tail_table)
+	.int	LABEL(tail_28) - LABEL(tail_table)
+	.int	LABEL(tail_29) - LABEL(tail_table)
+	.int	LABEL(tail_30) - LABEL(tail_table)
+	.int	LABEL(tail_31) - LABEL(tail_table)
+
+	.p2align 4
+LABEL(unaligned_table):
+	.int	LABEL(ashr_0) - LABEL(unaligned_table)
+	.int	LABEL(ashr_1) - LABEL(unaligned_table)
+	.int	LABEL(ashr_2) - LABEL(unaligned_table)
+	.int	LABEL(ashr_3) - LABEL(unaligned_table)
+	.int	LABEL(ashr_4) - LABEL(unaligned_table)
+	.int	LABEL(ashr_5) - LABEL(unaligned_table)
+	.int	LABEL(ashr_6) - LABEL(unaligned_table)
+	.int	LABEL(ashr_7) - LABEL(unaligned_table)
+	.int	LABEL(ashr_8) - LABEL(unaligned_table)
+	.int	LABEL(ashr_9) - LABEL(unaligned_table)
+	.int	LABEL(ashr_10) - LABEL(unaligned_table)
+	.int	LABEL(ashr_11) - LABEL(unaligned_table)
+	.int	LABEL(ashr_12) - LABEL(unaligned_table)
+	.int	LABEL(ashr_13) - LABEL(unaligned_table)
+	.int	LABEL(ashr_14) - LABEL(unaligned_table)
+	.int	LABEL(ashr_15) - LABEL(unaligned_table)
+
+# undef ENTRY
+# define ENTRY(name) \
+	.type STRCPY_SSE2, @function; \
+	STRCPY_SSE2: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size STRCPY_SSE2, .-STRCPY_SSE2
+# undef libc_hidden_builtin_def
+/* It doesn't make sense to send libc-internal strcpy calls through a PLT.
+   The speedup we get from using SSSE3 instruction is likely eaten away
+   by the indirect call in the PLT.  */
+# define libc_hidden_builtin_def(name) \
+	.globl __GI_STRCPY; __GI_STRCPY = STRCPY_SSE2
+# undef libc_hidden_def
+# define libc_hidden_def(name) \
+	.globl __GI___STRCPY; __GI___STRCPY = STRCPY_SSE2
+#endif
+
+#ifndef USE_AS_STRNCPY
+#include "../strcpy.S"
+#endif

Added: fsf/trunk/libc/sysdeps/x86_64/multiarch/strncpy-c.c
==============================================================================
--- fsf/trunk/libc/sysdeps/x86_64/multiarch/strncpy-c.c (added)
+++ fsf/trunk/libc/sysdeps/x86_64/multiarch/strncpy-c.c Fri Jul  3 00:08:28 2009
@@ -1,0 +1,8 @@
+#define STRNCPY __strncpy_sse2
+#ifdef SHARED
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name) \
+  __hidden_ver1 (__strncpy_sse2, __GI_strncpy, __strncpy_sse2);
+#endif
+
+#include "strncpy.c"

Added: fsf/trunk/libc/sysdeps/x86_64/multiarch/strncpy.S
==============================================================================
--- fsf/trunk/libc/sysdeps/x86_64/multiarch/strncpy.S (added)
+++ fsf/trunk/libc/sysdeps/x86_64/multiarch/strncpy.S Fri Jul  3 00:08:28 2009
@@ -1,0 +1,3 @@
+#define STRCPY strncpy
+#define USE_AS_STRNCPY
+#include "strcpy.S"