[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[commits] r14314 - in /fsf/trunk/libc: ./ nss/nss_db/ sysdeps/i386/i686/multiarch/ sysdeps/unix/sysv/linux/ sysdeps/wordsize-64/ sysde...



Author: eglibc
Date: Sat Jun 25 00:02:39 2011
New Revision: 14314

Log:
Import glibc-mainline for 2011-06-25

Added:
    fsf/trunk/libc/sysdeps/i386/i686/multiarch/stpcpy-sse2.S
    fsf/trunk/libc/sysdeps/i386/i686/multiarch/stpcpy-ssse3.S
    fsf/trunk/libc/sysdeps/i386/i686/multiarch/stpcpy.S
    fsf/trunk/libc/sysdeps/i386/i686/multiarch/stpncpy-sse2.S
    fsf/trunk/libc/sysdeps/i386/i686/multiarch/stpncpy-ssse3.S
    fsf/trunk/libc/sysdeps/i386/i686/multiarch/stpncpy.S
    fsf/trunk/libc/sysdeps/i386/i686/multiarch/strcpy-sse2.S
    fsf/trunk/libc/sysdeps/i386/i686/multiarch/strcpy-ssse3.S
    fsf/trunk/libc/sysdeps/i386/i686/multiarch/strcpy.S
    fsf/trunk/libc/sysdeps/i386/i686/multiarch/strncpy-c.c
    fsf/trunk/libc/sysdeps/i386/i686/multiarch/strncpy-sse2.S
    fsf/trunk/libc/sysdeps/i386/i686/multiarch/strncpy-ssse3.S
    fsf/trunk/libc/sysdeps/i386/i686/multiarch/strncpy.S
    fsf/trunk/libc/sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S
    fsf/trunk/libc/sysdeps/x86_64/multiarch/stpcpy-ssse3.S
    fsf/trunk/libc/sysdeps/x86_64/multiarch/stpncpy-sse2-unaligned.S
    fsf/trunk/libc/sysdeps/x86_64/multiarch/stpncpy-ssse3.S
    fsf/trunk/libc/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S
    fsf/trunk/libc/sysdeps/x86_64/multiarch/strcpy-ssse3.S
    fsf/trunk/libc/sysdeps/x86_64/multiarch/strncpy-sse2-unaligned.S
    fsf/trunk/libc/sysdeps/x86_64/multiarch/strncpy-ssse3.S
Modified:
    fsf/trunk/libc/ChangeLog
    fsf/trunk/libc/NEWS
    fsf/trunk/libc/nss/nss_db/db-open.c
    fsf/trunk/libc/sysdeps/i386/i686/multiarch/Makefile
    fsf/trunk/libc/sysdeps/unix/sysv/linux/Makefile
    fsf/trunk/libc/sysdeps/wordsize-64/tst-writev.c
    fsf/trunk/libc/sysdeps/x86_64/multiarch/Makefile
    fsf/trunk/libc/sysdeps/x86_64/multiarch/init-arch.c
    fsf/trunk/libc/sysdeps/x86_64/multiarch/init-arch.h
    fsf/trunk/libc/sysdeps/x86_64/multiarch/strcpy.S

Modified: fsf/trunk/libc/ChangeLog
==============================================================================
--- fsf/trunk/libc/ChangeLog (original)
+++ fsf/trunk/libc/ChangeLog Sat Jun 25 00:02:39 2011
@@ -1,3 +1,63 @@
+2011-06-24  H.J. Lu  <hongjiu.lu@xxxxxxxxx>
+
+	* sysdeps/i386/i686/multiarch/strcpy-sse2.S (RETURN): Fix a typo.
+
+	* sysdeps/i386/i686/multiarch/strcpy-ssse3.S: Correct unwind
+	info.
+
+2011-06-22  H.J. Lu  <hongjiu.lu@xxxxxxxxx>
+
+	* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
+	strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3
+	strcpy-sse2-unaligned strncpy-sse2-unaligned
+	stpcpy-sse2-unaligned stpncpy-sse2-unaligned.
+	* sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S: New file.
+	* sysdeps/x86_64/multiarch/stpcpy-ssse3.S: New file.
+	* sysdeps/x86_64/multiarch/stpncpy-sse2-unaligned.S: New file.
+	* sysdeps/x86_64/multiarch/stpncpy-ssse3.S: New file.
+	* sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S: New file.
+	* sysdeps/x86_64/multiarch/strcpy-ssse3.S: New file.
+	* sysdeps/x86_64/multiarch/strncpy-sse2-unaligned.S: New file.
+	* sysdeps/x86_64/multiarch/strncpy-ssse3.S: New file.
+	* sysdeps/x86_64/multiarch/strcpy.S: Remove strcpy with SSSE3.
+	(STRCPY): Support SSE2 and SSSE3 versions.
+
+2011-06-24  Ulrich Drepper  <drepper@xxxxxxxxx>
+
+	[BZ #12874]
+	* sysdeps/unix/sysv/linux/Makefile (CFLAGS-tst-writev.c): Define.
+	* sysdeps/wordsize-64/tst-writev.c: Work around problem with 2.6.38+
+	kernels which artificially limit size of requests.
+
+2011-06-22  H.J. Lu  <hongjiu.lu@xxxxxxxxx>
+
+	* sysdeps/i386/i686/multiarch/Makefile (sysdep_routines): Add
+	strncpy-c strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3
+	strcpy-sse2 strncpy-sse2 stpcpy-sse2 stpncpy-sse2.
+	* sysdeps/i386/i686/multiarch/stpcpy-sse2.S: New file.
+	* sysdeps/i386/i686/multiarch/stpcpy-ssse3.S: New file.
+	* sysdeps/i386/i686/multiarch/stpncpy-sse2.S: New file.
+	* sysdeps/i386/i686/multiarch/stpncpy-ssse3.S: New file.
+	* sysdeps/i386/i686/multiarch/stpncpy.S : New file.
+	* sysdeps/i386/i686/multiarch/strcpy-sse2.S : New file.
+	* sysdeps/i386/i686/multiarch/strcpy-ssse3.S: New file.
+	* sysdeps/i386/i686/multiarch/strcpy.S: New file.
+	* sysdeps/i386/i686/multiarch/strncpy-c.c: New file.
+	* sysdeps/i386/i686/multiarch/strncpy-sse2.S: New file.
+	* sysdeps/i386/i686/multiarch/strncpy-ssse3.S: New file.
+	* sysdeps/i386/i686/multiarch/strncpy.S: New file.
+	* sysdeps/x86_64/multiarch/init-arch.c (__init_cpu_features):
+	Enable unaligned load optimization for Intel Core i3, i5 and i7
+	processors.
+	* sysdeps/x86_64/multiarch/init-arch.h (bit_Fast_Unaligned_Load):
+	Define.
+	(index_Fast_Unaligned_Load): Define.
+	(HAS_FAST_UNALIGNED_LOAD): Define.
+
+2011-06-23  Marek Polacek  <mpolacek@xxxxxxxxxx>
+
+	* nss/nss_db/db-open.c: Include <unistd.h> for read declaration.
+
 2011-06-22  Ulrich Drepper  <drepper@xxxxxxxxx>
 
 	[BZ #12907]

Modified: fsf/trunk/libc/NEWS
==============================================================================
--- fsf/trunk/libc/NEWS (original)
+++ fsf/trunk/libc/NEWS Sat Jun 25 00:02:39 2011
@@ -1,4 +1,4 @@
-GNU C Library NEWS -- history of user-visible changes.  2011-6-22
+GNU C Library NEWS -- history of user-visible changes.  2011-6-24
 Copyright (C) 1992-2009, 2010, 2011 Free Software Foundation, Inc.
 See the end for copying conditions.
 
@@ -9,7 +9,7 @@
 
 * The following bugs are resolved with this release:
 
-  12885, 12907
+  12874, 12885, 12907
 
 * New program pldd to list loaded object of a process
   Implemented by Ulrich Drepper.
@@ -17,6 +17,12 @@
 * Add nss_db support back to glibc.  No more dependency on Berkeley db
   and support for initgroups lookups.
   Implemented by Ulrich Drepper.
+
+* Optimized strcpy, strncpy, stpcpy, stpncpy for SSE2 and SSSE3 on x86-32.
+  Contributed by HJ Lu.
+
+* Improved strcpy, strncpy, stpcpy, stpncpy for SSE2 and SSSE3 on x86-64.
+  Contributed by HJ Lu.
 
 Version 2.14
 

Modified: fsf/trunk/libc/nss/nss_db/db-open.c
==============================================================================
--- fsf/trunk/libc/nss/nss_db/db-open.c (original)
+++ fsf/trunk/libc/nss/nss_db/db-open.c Sat Jun 25 00:02:39 2011
@@ -22,6 +22,7 @@
 #include <dlfcn.h>
 #include <stdlib.h>
 #include <string.h>
+#include <unistd.h>
 #include <sys/mman.h>
 #include <not-cancel.h>
 

Modified: fsf/trunk/libc/sysdeps/i386/i686/multiarch/Makefile
==============================================================================
--- fsf/trunk/libc/sysdeps/i386/i686/multiarch/Makefile (original)
+++ fsf/trunk/libc/sysdeps/i386/i686/multiarch/Makefile Sat Jun 25 00:02:39 2011
@@ -10,7 +10,9 @@
 		   memset-sse2-rep bzero-sse2-rep strcmp-ssse3 \
 		   strcmp-sse4 strncmp-c strncmp-ssse3 strncmp-sse4 \
 		   memcmp-ssse3 memcmp-sse4 strcasestr-nonascii varshift \
-		   strlen-sse2 strlen-sse2-bsf
+		   strlen-sse2 strlen-sse2-bsf strncpy-c strcpy-ssse3 \
+		   strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 strcpy-sse2 \
+		   strncpy-sse2 stpcpy-sse2 stpncpy-sse2
 ifeq (yes,$(config-cflags-sse4))
 sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c
 CFLAGS-varshift.c += -msse4

Added: fsf/trunk/libc/sysdeps/i386/i686/multiarch/stpcpy-sse2.S
==============================================================================
--- fsf/trunk/libc/sysdeps/i386/i686/multiarch/stpcpy-sse2.S (added)
+++ fsf/trunk/libc/sysdeps/i386/i686/multiarch/stpcpy-sse2.S Sat Jun 25 00:02:39 2011
@@ -1,0 +1,3 @@
+#define USE_AS_STPCPY
+#define STRCPY __stpcpy_sse2
+#include "strcpy-sse2.S"

Added: fsf/trunk/libc/sysdeps/i386/i686/multiarch/stpcpy-ssse3.S
==============================================================================
--- fsf/trunk/libc/sysdeps/i386/i686/multiarch/stpcpy-ssse3.S (added)
+++ fsf/trunk/libc/sysdeps/i386/i686/multiarch/stpcpy-ssse3.S Sat Jun 25 00:02:39 2011
@@ -1,0 +1,3 @@
+#define USE_AS_STPCPY
+#define STRCPY __stpcpy_ssse3
+#include "strcpy-ssse3.S"

Added: fsf/trunk/libc/sysdeps/i386/i686/multiarch/stpcpy.S
==============================================================================
--- fsf/trunk/libc/sysdeps/i386/i686/multiarch/stpcpy.S (added)
+++ fsf/trunk/libc/sysdeps/i386/i686/multiarch/stpcpy.S Sat Jun 25 00:02:39 2011
@@ -1,0 +1,7 @@
+#define USE_AS_STPCPY
+#define STRCPY __stpcpy
+#include "strcpy.S"
+
+weak_alias (__stpcpy, stpcpy)
+libc_hidden_def (__stpcpy)
+libc_hidden_builtin_def (stpcpy)

Added: fsf/trunk/libc/sysdeps/i386/i686/multiarch/stpncpy-sse2.S
==============================================================================
--- fsf/trunk/libc/sysdeps/i386/i686/multiarch/stpncpy-sse2.S (added)
+++ fsf/trunk/libc/sysdeps/i386/i686/multiarch/stpncpy-sse2.S Sat Jun 25 00:02:39 2011
@@ -1,0 +1,4 @@
+#define USE_AS_STPCPY
+#define USE_AS_STRNCPY
+#define STRCPY __stpncpy_sse2
+#include "strcpy-sse2.S"

Added: fsf/trunk/libc/sysdeps/i386/i686/multiarch/stpncpy-ssse3.S
==============================================================================
--- fsf/trunk/libc/sysdeps/i386/i686/multiarch/stpncpy-ssse3.S (added)
+++ fsf/trunk/libc/sysdeps/i386/i686/multiarch/stpncpy-ssse3.S Sat Jun 25 00:02:39 2011
@@ -1,0 +1,4 @@
+#define USE_AS_STPCPY
+#define USE_AS_STRNCPY
+#define STRCPY __stpncpy_ssse3
+#include "strcpy-ssse3.S"

Added: fsf/trunk/libc/sysdeps/i386/i686/multiarch/stpncpy.S
==============================================================================
--- fsf/trunk/libc/sysdeps/i386/i686/multiarch/stpncpy.S (added)
+++ fsf/trunk/libc/sysdeps/i386/i686/multiarch/stpncpy.S Sat Jun 25 00:02:39 2011
@@ -1,0 +1,6 @@
+#define STRCPY __stpncpy
+#define USE_AS_STPCPY
+#define USE_AS_STRNCPY
+#include "strcpy.S"
+
+weak_alias (__stpncpy, stpncpy)

Added: fsf/trunk/libc/sysdeps/i386/i686/multiarch/strcpy-sse2.S
==============================================================================
--- fsf/trunk/libc/sysdeps/i386/i686/multiarch/strcpy-sse2.S (added)
+++ fsf/trunk/libc/sysdeps/i386/i686/multiarch/strcpy-sse2.S Sat Jun 25 00:02:39 2011
@@ -1,0 +1,2251 @@
+/* strcpy with SSE2 and unaligned load
+   Copyright (C) 2011 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+
+#ifndef NOT_IN_libc
+
+# include <sysdep.h>
+
+
+# define CFI_PUSH(REG)                  \
+	cfi_adjust_cfa_offset (4);     \
+	cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG)                   \
+	cfi_adjust_cfa_offset (-4);    \
+	cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# ifndef STRCPY
+#  define STRCPY  __strcpy_sse2
+# endif
+
+# define STR1  PARMS
+# define STR2  STR1+4
+# define LEN  STR2+4
+
+# ifdef USE_AS_STRNCPY
+#  define PARMS  16
+#  define ENTRANCE PUSH(%ebx); PUSH(%esi); PUSH(%edi)
+#  define RETURN  POP(%edi); POP(%esi); POP(%ebx); ret;          \
+	CFI_PUSH(%ebx); CFI_PUSH(%esi); CFI_PUSH(%edi);
+
+# ifdef SHARED
+#  define JMPTBL(I, B)	I - B
+
+/* Load an entry in a jump table into ECX and branch to it. TABLE is a
+	jump table with relative offsets.
+	INDEX is a register contains the index into the jump table.
+	SCALE is the scale of INDEX. */
+
+#  define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)            \
+	/* We first load PC into ECX.  */                       \
+	call	__i686.get_pc_thunk.cx;                         \
+	/* Get the address of the jump table.  */               \
+	addl	$(TABLE - .), %ecx;                             \
+	/* Get the entry and convert the relative offset to the \
+	absolute	address.  */                            \
+	addl	(%ecx,INDEX,SCALE), %ecx;                       \
+	/* We loaded the jump table and adjuested ECX. Go.  */  \
+	jmp	*%ecx
+# else
+#  define JMPTBL(I, B)	I
+
+/* Branch to an entry in a jump table.  TABLE is a jump table with
+	absolute	offsets.  INDEX is a register contains the index into the
+	jump	table.  SCALE is the scale of INDEX. */
+
+#  define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
+	jmp	*TABLE(,INDEX,SCALE)
+# endif
+
+.text
+ENTRY (STRCPY)
+	ENTRANCE
+	mov	STR1(%esp), %edi
+	mov	STR2(%esp), %esi
+	movl	LEN(%esp), %ebx
+	test	%ebx, %ebx
+	jz	L(ExitZero)
+
+	mov	%esi, %ecx
+# ifndef USE_AS_STPCPY
+	mov	%edi, %eax      /* save result */
+# endif
+	and	$15, %ecx
+	jz	L(SourceStringAlignmentZero)
+
+	and	$-16, %esi
+	pxor	%xmm0, %xmm0
+	pxor	%xmm1, %xmm1
+
+	pcmpeqb	(%esi), %xmm1
+	add	%ecx, %ebx
+	pmovmskb %xmm1, %edx
+	shr	%cl, %edx
+# ifdef USE_AS_STPCPY
+	cmp	$16, %ebx
+	jbe	L(CopyFrom1To16BytesTailCase2OrCase3)
+# else
+	cmp	$17, %ebx
+	jbe	L(CopyFrom1To16BytesTailCase2OrCase3)
+# endif
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesTail)
+
+	pcmpeqb	16(%esi), %xmm0
+	pmovmskb %xmm0, %edx
+# ifdef USE_AS_STPCPY
+	cmp	$32, %ebx
+	jbe	L(CopyFrom1To32BytesCase2OrCase3)
+# else
+	cmp	$33, %ebx
+	jbe	L(CopyFrom1To32BytesCase2OrCase3)
+# endif
+	test	%edx, %edx
+	jnz	L(CopyFrom1To32Bytes)
+
+	movdqu	(%esi, %ecx), %xmm1   /* copy 16 bytes */
+	movdqu	%xmm1, (%edi)
+
+	sub	%ecx, %edi
+
+/* If source adress alignment != destination adress alignment */
+	.p2align 4
+L(Unalign16Both):
+	mov	$16, %ecx
+	movdqa	(%esi, %ecx), %xmm1
+	movaps	16(%esi, %ecx), %xmm2
+	movdqu	%xmm1, (%edi, %ecx)
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %edx
+	add	$16, %ecx
+	sub	$48, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesUnalignedXmm2)
+
+	movaps	16(%esi, %ecx), %xmm3
+	movdqu	%xmm2, (%edi, %ecx)
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	add	$16, %ecx
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesUnalignedXmm3)
+
+	movaps	16(%esi, %ecx), %xmm4
+	movdqu	%xmm3, (%edi, %ecx)
+	pcmpeqb	%xmm4, %xmm0
+	pmovmskb %xmm0, %edx
+	add	$16, %ecx
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesUnalignedXmm4)
+
+	movaps	16(%esi, %ecx), %xmm1
+	movdqu	%xmm4, (%edi, %ecx)
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %edx
+	add	$16, %ecx
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesUnalignedXmm1)
+
+	movaps	16(%esi, %ecx), %xmm2
+	movdqu	%xmm1, (%edi, %ecx)
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %edx
+	add	$16, %ecx
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesUnalignedXmm2)
+
+	movaps	16(%esi, %ecx), %xmm3
+	movdqu	%xmm2, (%edi, %ecx)
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	add	$16, %ecx
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesUnalignedXmm3)
+
+	movdqu	%xmm3, (%edi, %ecx)
+	mov	%esi, %edx
+	lea	16(%esi, %ecx), %esi
+	and	$-0x40, %esi
+	sub	%esi, %edx
+	sub	%edx, %edi
+	lea	128(%ebx, %edx), %ebx
+
+L(Unaligned64Loop):
+	movaps	(%esi), %xmm2
+	movaps	%xmm2, %xmm4
+	movaps	16(%esi), %xmm5
+	movaps	32(%esi), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	48(%esi), %xmm7
+	pminub	%xmm5, %xmm2
+	pminub	%xmm7, %xmm3
+	pminub	%xmm2, %xmm3
+	pcmpeqb	%xmm0, %xmm3
+	pmovmskb %xmm3, %edx
+	sub	$64, %ebx
+	jbe	L(UnalignedLeaveCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(Unaligned64Leave)
+L(Unaligned64Loop_start):
+	add	$64, %edi
+	add	$64, %esi
+	movdqu	%xmm4, -64(%edi)
+	movaps	(%esi), %xmm2
+	movdqa	%xmm2, %xmm4
+	movdqu	%xmm5, -48(%edi)
+	movaps	16(%esi), %xmm5
+	pminub	%xmm5, %xmm2
+	movaps	32(%esi), %xmm3
+	movdqu	%xmm6, -32(%edi)
+	movaps	%xmm3, %xmm6
+	movdqu	%xmm7, -16(%edi)
+	movaps	48(%esi), %xmm7
+	pminub	%xmm7, %xmm3
+	pminub	%xmm2, %xmm3
+	pcmpeqb	%xmm0, %xmm3
+	pmovmskb %xmm3, %edx
+	sub	$64, %ebx
+	jbe	L(UnalignedLeaveCase2OrCase3)
+	test	%edx, %edx
+	jz	L(Unaligned64Loop_start)
+L(Unaligned64Leave):
+	pxor	%xmm1, %xmm1
+
+	pcmpeqb	%xmm4, %xmm0
+	pcmpeqb	%xmm5, %xmm1
+	pmovmskb %xmm0, %edx
+	pmovmskb %xmm1, %ecx
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesUnaligned_0)
+	test	%ecx, %ecx
+	jnz	L(CopyFrom1To16BytesUnaligned_16)
+
+	pcmpeqb	%xmm6, %xmm0
+	pcmpeqb	%xmm7, %xmm1
+	pmovmskb %xmm0, %edx
+	pmovmskb %xmm1, %ecx
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesUnaligned_32)
+
+	bsf	%ecx, %edx
+	movdqu	%xmm4, (%edi)
+	movdqu	%xmm5, 16(%edi)
+	movdqu	%xmm6, 32(%edi)
+# ifdef USE_AS_STPCPY
+	lea	48(%edi, %edx), %eax
+# endif
+	movdqu	%xmm7, 48(%edi)
+	add	$15, %ebx
+	sub	%edx, %ebx
+	lea	49(%edi, %edx), %edi
+	jmp	L(StrncpyFillTailWithZero)
+
+/* If source adress alignment == destination adress alignment */
+
+L(SourceStringAlignmentZero):
+	pxor	%xmm0, %xmm0
+	movdqa	(%esi), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %edx
+# ifdef USE_AS_STPCPY
+	cmp	$16, %ebx
+	jbe	L(CopyFrom1To16BytesTail1Case2OrCase3)
+# else
+	cmp	$17, %ebx
+	jbe	L(CopyFrom1To16BytesTail1Case2OrCase3)
+# endif
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesTail1)
+
+	pcmpeqb	16(%esi), %xmm0
+	movdqu	%xmm1, (%edi)
+	pmovmskb %xmm0, %edx
+# ifdef USE_AS_STPCPY
+	cmp	$32, %ebx
+	jbe	L(CopyFrom1To32Bytes1Case2OrCase3)
+# else
+	cmp	$33, %ebx
+	jbe	L(CopyFrom1To32Bytes1Case2OrCase3)
+# endif
+	test	%edx, %edx
+	jnz	L(CopyFrom1To32Bytes1)
+
+	jmp	L(Unalign16Both)
+
+/*-----------------End of main part---------------------------*/
+
+/* Case1 */
+	.p2align 4
+L(CopyFrom1To16BytesTail):
+	sub	%ecx, %ebx
+	add	%ecx, %esi
+	bsf	%edx, %edx
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+	.p2align 4
+L(CopyFrom1To32Bytes1):
+	add	$16, %esi
+	add	$16, %edi
+	sub	$16, %ebx
+L(CopyFrom1To16BytesTail1):
+	bsf	%edx, %edx
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+	.p2align 4
+L(CopyFrom1To32Bytes):
+	sub	%ecx, %ebx
+	bsf	%edx, %edx
+	add	%ecx, %esi
+	add	$16, %edx
+	sub	%ecx, %edx
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+	.p2align 4
+L(CopyFrom1To16BytesUnaligned_0):
+	bsf	%edx, %edx
+# ifdef USE_AS_STPCPY
+	lea	(%edi, %edx), %eax
+# endif
+	movdqu	%xmm4, (%edi)
+	add	$63, %ebx
+	sub	%edx, %ebx
+	lea	1(%edi, %edx), %edi
+	jmp	L(StrncpyFillTailWithZero)
+
+	.p2align 4
+L(CopyFrom1To16BytesUnaligned_16):
+	bsf	%ecx, %edx
+	movdqu	%xmm4, (%edi)
+# ifdef USE_AS_STPCPY
+	lea	16(%edi, %edx), %eax
+# endif
+	movdqu	%xmm5, 16(%edi)
+	add	$47, %ebx
+	sub	%edx, %ebx
+	lea	17(%edi, %edx), %edi
+	jmp	L(StrncpyFillTailWithZero)
+
+	.p2align 4
+L(CopyFrom1To16BytesUnaligned_32):
+	bsf	%edx, %edx
+	movdqu	%xmm4, (%edi)
+	movdqu	%xmm5, 16(%edi)
+# ifdef USE_AS_STPCPY
+	lea	32(%edi, %edx), %eax
+# endif
+	movdqu	%xmm6, 32(%edi)
+	add	$31, %ebx
+	sub	%edx, %ebx
+	lea	33(%edi, %edx), %edi
+	jmp	L(StrncpyFillTailWithZero)
+
+	.p2align 4
+L(CopyFrom1To16BytesUnalignedXmm6):
+	movdqu	%xmm6, (%edi, %ecx)
+	jmp	L(CopyFrom1To16BytesXmmExit)
+
+	.p2align 4
+L(CopyFrom1To16BytesUnalignedXmm5):
+	movdqu	%xmm5, (%edi, %ecx)
+	jmp	L(CopyFrom1To16BytesXmmExit)
+
+	.p2align 4
+L(CopyFrom1To16BytesUnalignedXmm4):
+	movdqu	%xmm4, (%edi, %ecx)
+	jmp	L(CopyFrom1To16BytesXmmExit)
+
+	.p2align 4
+L(CopyFrom1To16BytesUnalignedXmm3):
+	movdqu	%xmm3, (%edi, %ecx)
+	jmp	L(CopyFrom1To16BytesXmmExit)
+
+	.p2align 4
+L(CopyFrom1To16BytesUnalignedXmm1):
+	movdqu	%xmm1, (%edi, %ecx)
+	jmp	L(CopyFrom1To16BytesXmmExit)
+
+	.p2align 4
+L(CopyFrom1To16BytesExit):
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+/* Case2 */
+
+	.p2align 4
+L(CopyFrom1To16BytesCase2):
+	add	$16, %ebx
+	add	%ecx, %edi
+	add	%ecx, %esi
+	bsf	%edx, %edx
+	cmp	%ebx, %edx
+	jb	L(CopyFrom1To16BytesExit)
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+	.p2align 4
+L(CopyFrom1To32BytesCase2):
+	sub	%ecx, %ebx
+	add	%ecx, %esi
+	bsf	%edx, %edx
+	add	$16, %edx
+	sub	%ecx, %edx
+	cmp	%ebx, %edx
+	jb	L(CopyFrom1To16BytesExit)
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+L(CopyFrom1To16BytesTailCase2):
+	sub	%ecx, %ebx
+	add	%ecx, %esi
+	bsf	%edx, %edx
+	cmp	%ebx, %edx
+	jb	L(CopyFrom1To16BytesExit)
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+L(CopyFrom1To16BytesTail1Case2):
+	bsf	%edx, %edx
+	cmp	%ebx, %edx
+	jb	L(CopyFrom1To16BytesExit)
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+/* Case2 or Case3,  Case3 */
+
+	.p2align 4
+L(CopyFrom1To16BytesCase2OrCase3):
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesCase2)
+L(CopyFrom1To16BytesCase3):
+	add	$16, %ebx
+	add	%ecx, %edi
+	add	%ecx, %esi
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+	.p2align 4
+L(CopyFrom1To32BytesCase2OrCase3):
+	test	%edx, %edx
+	jnz	L(CopyFrom1To32BytesCase2)
+	sub	%ecx, %ebx
+	add	%ecx, %esi
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+	.p2align 4
+L(CopyFrom1To16BytesTailCase2OrCase3):
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesTailCase2)
+	sub	%ecx, %ebx
+	add	%ecx, %esi
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+	.p2align 4
+L(CopyFrom1To32Bytes1Case2OrCase3):
+	add	$16, %edi
+	add	$16, %esi
+	sub	$16, %ebx
+L(CopyFrom1To16BytesTail1Case2OrCase3):
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesTail1Case2)
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+	.p2align 4
+L(Exit0):
+# ifdef USE_AS_STPCPY
+	mov	%edi, %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(Exit1):
+	movb	%dh, (%edi)
+# ifdef USE_AS_STPCPY
+	lea	(%edi), %eax
+# endif
+	sub	$1, %ebx
+	lea	1(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit2):
+	movw	(%esi), %dx
+	movw	%dx, (%edi)
+# ifdef USE_AS_STPCPY
+	lea	1(%edi), %eax
+# endif
+	sub	$2, %ebx
+	lea	2(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit3):
+	movw	(%esi), %cx
+	movw	%cx, (%edi)
+	movb	%dh, 2(%edi)
+# ifdef USE_AS_STPCPY
+	lea	2(%edi), %eax
+# endif
+	sub	$3, %ebx
+	lea	3(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit4):
+	movl	(%esi), %edx
+	movl	%edx, (%edi)
+# ifdef USE_AS_STPCPY
+	lea	3(%edi), %eax
+# endif
+	sub	$4, %ebx
+	lea	4(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit5):
+	movl	(%esi), %ecx
+	movb	%dh, 4(%edi)
+	movl	%ecx, (%edi)
+# ifdef USE_AS_STPCPY
+	lea	4(%edi), %eax
+# endif
+	sub	$5, %ebx
+	lea	5(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit6):
+	movl	(%esi), %ecx
+	movw	4(%esi), %dx
+	movl	%ecx, (%edi)
+	movw	%dx, 4(%edi)
+# ifdef USE_AS_STPCPY
+	lea	5(%edi), %eax
+# endif
+	sub	$6, %ebx
+	lea	6(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit7):
+	movl	(%esi), %ecx
+	movl	3(%esi), %edx
+	movl	%ecx, (%edi)
+	movl	%edx, 3(%edi)
+# ifdef USE_AS_STPCPY
+	lea	6(%edi), %eax
+# endif
+	sub	$7, %ebx
+	lea	7(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit8):
+	movlpd	(%esi), %xmm0
+	movlpd	%xmm0, (%edi)
+# ifdef USE_AS_STPCPY
+	lea	7(%edi), %eax
+# endif
+	sub	$8, %ebx
+	lea	8(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit9):
+	movlpd	(%esi), %xmm0
+	movb	%dh, 8(%edi)
+	movlpd	%xmm0, (%edi)
+# ifdef USE_AS_STPCPY
+	lea	8(%edi), %eax
+# endif
+	sub	$9, %ebx
+	lea	9(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit10):
+	movlpd	(%esi), %xmm0
+	movw	8(%esi), %dx
+	movlpd	%xmm0, (%edi)
+	movw	%dx, 8(%edi)
+# ifdef USE_AS_STPCPY
+	lea	9(%edi), %eax
+# endif
+	sub	$10, %ebx
+	lea	10(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit11):
+	movlpd	(%esi), %xmm0
+	movl	7(%esi), %edx
+	movlpd	%xmm0, (%edi)
+	movl	%edx, 7(%edi)
+# ifdef USE_AS_STPCPY
+	lea	10(%edi), %eax
+# endif
+	sub	$11, %ebx
+	lea	11(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit12):
+	movlpd	(%esi), %xmm0
+	movl	8(%esi), %edx
+	movlpd	%xmm0, (%edi)
+	movl	%edx, 8(%edi)
+# ifdef USE_AS_STPCPY
+	lea	11(%edi), %eax
+# endif
+	sub	$12, %ebx
+	lea	12(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit13):
+	movlpd	(%esi), %xmm0
+	movlpd	5(%esi), %xmm1
+	movlpd	%xmm0, (%edi)
+	movlpd	%xmm1, 5(%edi)
+# ifdef USE_AS_STPCPY
+	lea	12(%edi), %eax
+# endif
+	sub	$13, %ebx
+	lea	13(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit14):
+	movlpd	(%esi), %xmm0
+	movlpd	6(%esi), %xmm1
+	movlpd	%xmm0, (%edi)
+	movlpd	%xmm1, 6(%edi)
+# ifdef USE_AS_STPCPY
+	lea	13(%edi), %eax
+# endif
+	sub	$14, %ebx
+	lea	14(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit15):
+	movlpd	(%esi), %xmm0
+	movlpd	7(%esi), %xmm1
+	movlpd	%xmm0, (%edi)
+	movlpd	%xmm1, 7(%edi)
+# ifdef USE_AS_STPCPY
+	lea	14(%edi), %eax
+# endif
+	sub	$15, %ebx
+	lea	15(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit16):
+	movdqu	(%esi), %xmm0
+	movdqu	%xmm0, (%edi)
+# ifdef USE_AS_STPCPY
+	lea	15(%edi), %eax
+# endif
+	sub	$16, %ebx
+	lea	16(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit17):
+	movdqu	(%esi), %xmm0
+	movdqu	%xmm0, (%edi)
+	movb	%dh, 16(%edi)
+# ifdef USE_AS_STPCPY
+	lea	16(%edi), %eax
+# endif
+	sub	$17, %ebx
+	lea	17(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit18):
+	movdqu	(%esi), %xmm0
+	movw	16(%esi), %cx
+	movdqu	%xmm0, (%edi)
+	movw	%cx, 16(%edi)
+# ifdef USE_AS_STPCPY
+	lea	17(%edi), %eax
+# endif
+	sub	$18, %ebx
+	lea	18(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit19):
+	movdqu	(%esi), %xmm0
+	movl	15(%esi), %ecx
+	movdqu	%xmm0, (%edi)
+	movl	%ecx, 15(%edi)
+# ifdef USE_AS_STPCPY
+	lea	18(%edi), %eax
+# endif
+	sub	$19, %ebx
+	lea	19(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit20):
+	movdqu	(%esi), %xmm0
+	movl	16(%esi), %ecx
+	movdqu	%xmm0, (%edi)
+	movl	%ecx, 16(%edi)
+# ifdef USE_AS_STPCPY
+	lea	19(%edi), %eax
+# endif
+	sub	$20, %ebx
+	lea	20(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit21):
+	movdqu	(%esi), %xmm0
+	movl	16(%esi), %ecx
+	movdqu	%xmm0, (%edi)
+	movl	%ecx, 16(%edi)
+	movb	%dh, 20(%edi)
+# ifdef USE_AS_STPCPY
+	lea	20(%edi), %eax
+# endif
+	sub	$21, %ebx
+	lea	21(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit22):
+	movdqu	(%esi), %xmm0
+	movlpd	14(%esi), %xmm3
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm3, 14(%edi)
+# ifdef USE_AS_STPCPY
+	lea	21(%edi), %eax
+# endif
+	sub	$22, %ebx
+	lea	22(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit23):
+	movdqu	(%esi), %xmm0
+	movlpd	15(%esi), %xmm3
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm3, 15(%edi)
+# ifdef USE_AS_STPCPY
+	lea	22(%edi), %eax
+# endif
+	sub	$23, %ebx
+	lea	23(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit24):
+	movdqu	(%esi), %xmm0
+	movlpd	16(%esi), %xmm2
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm2, 16(%edi)
+# ifdef USE_AS_STPCPY
+	lea	23(%edi), %eax
+# endif
+	sub	$24, %ebx
+	lea	24(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit25):
+	movdqu	(%esi), %xmm0
+	movlpd	16(%esi), %xmm2
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm2, 16(%edi)
+	movb	%dh, 24(%edi)
+# ifdef USE_AS_STPCPY
+	lea	24(%edi), %eax
+# endif
+	sub	$25, %ebx
+	lea	25(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit26):
+	movdqu	(%esi), %xmm0
+	movlpd	16(%esi), %xmm2
+	movw	24(%esi), %cx
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm2, 16(%edi)
+	movw	%cx, 24(%edi)
+# ifdef USE_AS_STPCPY
+	lea	25(%edi), %eax
+# endif
+	sub	$26, %ebx
+	lea	26(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit27):
+	movdqu	(%esi), %xmm0
+	movlpd	16(%esi), %xmm2
+	movl	23(%esi), %ecx
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm2, 16(%edi)
+	movl	%ecx, 23(%edi)
+# ifdef USE_AS_STPCPY
+	lea	26(%edi), %eax
+# endif
+	sub	$27, %ebx
+	lea	27(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit28):
+	movdqu	(%esi), %xmm0
+	movlpd	16(%esi), %xmm2
+	movl	24(%esi), %ecx
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm2, 16(%edi)
+	movl	%ecx, 24(%edi)
+# ifdef USE_AS_STPCPY
+	lea	27(%edi), %eax
+# endif
+	sub	$28, %ebx
+	lea	28(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit29):
+	movdqu	(%esi), %xmm0
+	movdqu	13(%esi), %xmm2
+	movdqu	%xmm0, (%edi)
+	movdqu	%xmm2, 13(%edi)
+# ifdef USE_AS_STPCPY
+	lea	28(%edi), %eax
+# endif
+	sub	$29, %ebx
+	lea	29(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit30):
+	movdqu	(%esi), %xmm0
+	movdqu	14(%esi), %xmm2
+	movdqu	%xmm0, (%edi)
+	movdqu	%xmm2, 14(%edi)
+# ifdef USE_AS_STPCPY
+	lea	29(%edi), %eax
+# endif
+	sub	$30, %ebx
+	lea	30(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+
+	.p2align 4
+L(Exit31):
+	movdqu	(%esi), %xmm0
+	movdqu	15(%esi), %xmm2
+	movdqu	%xmm0, (%edi)
+	movdqu	%xmm2, 15(%edi)
+# ifdef USE_AS_STPCPY
+	lea	30(%edi), %eax
+# endif
+	sub	$31, %ebx
+	lea	31(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(Exit32):
+	movdqu	(%esi), %xmm0
+	movdqu	16(%esi), %xmm2
+	movdqu	%xmm0, (%edi)
+	movdqu	%xmm2, 16(%edi)
+# ifdef USE_AS_STPCPY
+	lea	31(%edi), %eax
+# endif
+	sub	$32, %ebx
+	lea	32(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+	RETURN
+
+	.p2align 4
+L(StrncpyExit1):
+	movb	(%esi), %dl
+	movb	%dl, (%edi)
+# ifdef USE_AS_STPCPY
+	lea	1(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit2):
+	movw	(%esi), %dx
+	movw	%dx, (%edi)
+# ifdef USE_AS_STPCPY
+	lea	2(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit3):
+	movw	(%esi), %cx
+	movb	2(%esi), %dl
+	movw	%cx, (%edi)
+	movb	%dl, 2(%edi)
+# ifdef USE_AS_STPCPY
+	lea	3(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit4):
+	movl	(%esi), %edx
+	movl	%edx, (%edi)
+# ifdef USE_AS_STPCPY
+	lea	4(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit5):
+	movl	(%esi), %ecx
+	movb	4(%esi), %dl
+	movl	%ecx, (%edi)
+	movb	%dl, 4(%edi)
+# ifdef USE_AS_STPCPY
+	lea	5(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit6):
+	movl	(%esi), %ecx
+	movw	4(%esi), %dx
+	movl	%ecx, (%edi)
+	movw	%dx, 4(%edi)
+# ifdef USE_AS_STPCPY
+	lea	6(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit7):
+	movl	(%esi), %ecx
+	movl	3(%esi), %edx
+	movl	%ecx, (%edi)
+	movl	%edx, 3(%edi)
+# ifdef USE_AS_STPCPY
+	lea	7(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit8):
+	movlpd	(%esi), %xmm0
+	movlpd	%xmm0, (%edi)
+# ifdef USE_AS_STPCPY
+	lea	8(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit9):
+	movlpd	(%esi), %xmm0
+	movb	8(%esi), %dl
+	movlpd	%xmm0, (%edi)
+	movb	%dl, 8(%edi)
+# ifdef USE_AS_STPCPY
+	lea	9(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit10):
+	movlpd	(%esi), %xmm0
+	movw	8(%esi), %dx
+	movlpd	%xmm0, (%edi)
+	movw	%dx, 8(%edi)
+# ifdef USE_AS_STPCPY
+	lea	10(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit11):
+	movlpd	(%esi), %xmm0
+	movl	7(%esi), %edx
+	movlpd	%xmm0, (%edi)
+	movl	%edx, 7(%edi)
+# ifdef USE_AS_STPCPY
+	lea	11(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit12):
+	movlpd	(%esi), %xmm0
+	movl	8(%esi), %edx
+	movlpd	%xmm0, (%edi)
+	movl	%edx, 8(%edi)
+# ifdef USE_AS_STPCPY
+	lea	12(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit13):
+	movlpd	(%esi), %xmm0
+	movlpd	5(%esi), %xmm1
+	movlpd	%xmm0, (%edi)
+	movlpd	%xmm1, 5(%edi)
+# ifdef USE_AS_STPCPY
+	lea	13(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit14):
+	movlpd	(%esi), %xmm0
+	movlpd	6(%esi), %xmm1
+	movlpd	%xmm0, (%edi)
+	movlpd	%xmm1, 6(%edi)
+# ifdef USE_AS_STPCPY
+	lea	14(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit15):
+	movlpd	(%esi), %xmm0
+	movlpd	7(%esi), %xmm1
+	movlpd	%xmm0, (%edi)
+	movlpd	%xmm1, 7(%edi)
+# ifdef USE_AS_STPCPY
+	lea	15(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit16):
+	movdqu	(%esi), %xmm0
+	movdqu	%xmm0, (%edi)
+# ifdef USE_AS_STPCPY
+	lea	16(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit17):
+	movdqu	(%esi), %xmm0
+	movb	16(%esi), %cl
+	movdqu	%xmm0, (%edi)
+	movb	%cl, 16(%edi)
+# ifdef USE_AS_STPCPY
+	lea	17(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit18):
+	movdqu	(%esi), %xmm0
+	movw	16(%esi), %cx
+	movdqu	%xmm0, (%edi)
+	movw	%cx, 16(%edi)
+# ifdef USE_AS_STPCPY
+	lea	18(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit19):
+	movdqu	(%esi), %xmm0
+	movl	15(%esi), %ecx
+	movdqu	%xmm0, (%edi)
+	movl	%ecx, 15(%edi)
+# ifdef USE_AS_STPCPY
+	lea	19(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit20):
+	movdqu	(%esi), %xmm0
+	movl	16(%esi), %ecx
+	movdqu	%xmm0, (%edi)
+	movl	%ecx, 16(%edi)
+# ifdef USE_AS_STPCPY
+	lea	20(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit21):
+	movdqu	(%esi), %xmm0
+	movl	16(%esi), %ecx
+	movb	20(%esi), %dl
+	movdqu	%xmm0, (%edi)
+	movl	%ecx, 16(%edi)
+	movb	%dl, 20(%edi)
+# ifdef USE_AS_STPCPY
+	lea	21(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit22):
+	movdqu	(%esi), %xmm0
+	movlpd	14(%esi), %xmm3
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm3, 14(%edi)
+# ifdef USE_AS_STPCPY
+	lea	22(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit23):
+	movdqu	(%esi), %xmm0
+	movlpd	15(%esi), %xmm3
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm3, 15(%edi)
+# ifdef USE_AS_STPCPY
+	lea	23(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit24):
+	movdqu	(%esi), %xmm0
+	movlpd	16(%esi), %xmm2
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm2, 16(%edi)
+# ifdef USE_AS_STPCPY
+	lea	24(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit25):
+	movdqu	(%esi), %xmm0
+	movlpd	16(%esi), %xmm2
+	movb	24(%esi), %cl
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm2, 16(%edi)
+	movb	%cl, 24(%edi)
+# ifdef USE_AS_STPCPY
+	lea	25(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit26):
+	movdqu	(%esi), %xmm0
+	movlpd	16(%esi), %xmm2
+	movw	24(%esi), %cx
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm2, 16(%edi)
+	movw	%cx, 24(%edi)
+# ifdef USE_AS_STPCPY
+	lea	26(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit27):
+	movdqu	(%esi), %xmm0
+	movlpd	16(%esi), %xmm2
+	movl	23(%esi), %ecx
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm2, 16(%edi)
+	movl	%ecx, 23(%edi)
+# ifdef USE_AS_STPCPY
+	lea	27(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit28):
+	movdqu	(%esi), %xmm0
+	movlpd	16(%esi), %xmm2
+	movl	24(%esi), %ecx
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm2, 16(%edi)
+	movl	%ecx, 24(%edi)
+# ifdef USE_AS_STPCPY
+	lea	28(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit29):
+	movdqu	(%esi), %xmm0
+	movdqu	13(%esi), %xmm2
+	movdqu	%xmm0, (%edi)
+	movdqu	%xmm2, 13(%edi)
+# ifdef USE_AS_STPCPY
+	lea	29(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit30):
+	movdqu	(%esi), %xmm0
+	movdqu	14(%esi), %xmm2
+	movdqu	%xmm0, (%edi)
+	movdqu	%xmm2, 14(%edi)
+# ifdef USE_AS_STPCPY
+	lea	30(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit31):
+	movdqu	(%esi), %xmm0
+	movdqu	15(%esi), %xmm2
+	movdqu	%xmm0, (%edi)
+	movdqu	%xmm2, 15(%edi)
+# ifdef USE_AS_STPCPY
+	lea	31(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit32):
+	movdqu	(%esi), %xmm0
+	movdqu	16(%esi), %xmm2
+	movdqu	%xmm0, (%edi)
+	movdqu	%xmm2, 16(%edi)
+# ifdef USE_AS_STPCPY
+	lea	32(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit33):
+	movdqu	(%esi), %xmm0
+	movdqu	16(%esi), %xmm2
+	movb	32(%esi), %cl
+	movdqu	%xmm0, (%edi)
+	movdqu	%xmm2, 16(%edi)
+	movb	%cl, 32(%edi)
+	RETURN
+
+	.p2align 4
+L(Fill0):
+	RETURN
+
+	.p2align 4
+L(Fill1):
+	movb	%dl, (%edi)
+	RETURN
+
+	.p2align 4
+L(Fill2):
+	movw	%dx, (%edi)
+	RETURN
+
+	.p2align 4
+L(Fill3):
+	movl	%edx, -1(%edi)
+	RETURN
+
+	.p2align 4
+L(Fill4):
+	movl	%edx, (%edi)
+	RETURN
+
+	.p2align 4
+L(Fill5):
+	movl	%edx, (%edi)
+	movb	%dl, 4(%edi)
+	RETURN
+
+	.p2align 4
+L(Fill6):
+	movl	%edx, (%edi)
+	movw	%dx, 4(%edi)
+	RETURN
+
+	.p2align 4
+L(Fill7):
+	movlpd	%xmm0, -1(%edi)
+	RETURN
+
+	.p2align 4
+L(Fill8):
+	movlpd	%xmm0, (%edi)
+	RETURN
+
+	.p2align 4
+L(Fill9):
+	movlpd	%xmm0, (%edi)
+	movb	%dl, 8(%edi)
+	RETURN
+
+	.p2align 4
+L(Fill10):
+	movlpd	%xmm0, (%edi)
+	movw	%dx, 8(%edi)
+	RETURN
+
+	.p2align 4
+L(Fill11):
+	movlpd	%xmm0, (%edi)
+	movl	%edx, 7(%edi)
+	RETURN
+
+	.p2align 4
+L(Fill12):
+	movlpd	%xmm0, (%edi)
+	movl	%edx, 8(%edi)
+	RETURN
+
+	.p2align 4
+L(Fill13):
+	movlpd	%xmm0, (%edi)
+	movlpd	%xmm0, 5(%edi)
+	RETURN
+
+	.p2align 4
+L(Fill14):
+	movlpd	%xmm0, (%edi)
+	movlpd	%xmm0, 6(%edi)
+	RETURN
+
+	.p2align 4
+L(Fill15):
+	movdqu	%xmm0, -1(%edi)
+	RETURN
+
+	.p2align 4
+L(Fill16):
+	movdqu	%xmm0, (%edi)
+	RETURN
+
+	.p2align 4
+L(CopyFrom1To16BytesUnalignedXmm2):
+	movdqu	%xmm2, (%edi, %ecx)
+
+	.p2align 4
+L(CopyFrom1To16BytesXmmExit):
+	bsf	%edx, %edx
+	add	$15, %ebx
+	add	%ecx, %edi
+# ifdef USE_AS_STPCPY
+	lea	(%edi, %edx), %eax
+# endif
+	sub	%edx, %ebx
+	lea	1(%edi, %edx), %edi
+
+	.p2align 4
+L(StrncpyFillTailWithZero):
+	pxor	%xmm0, %xmm0
+	xor	%edx, %edx
+	sub	$16, %ebx
+	jbe	L(StrncpyFillExit)
+
+	movdqu	%xmm0, (%edi)
+	add	$16, %edi
+
+	mov	%edi, %esi
+	and	$0xf, %esi
+	sub	%esi, %edi
+	add	%esi, %ebx
+	sub	$64, %ebx
+	jb	L(StrncpyFillLess64)
+
+L(StrncpyFillLoopMovdqa):
+	movdqa	%xmm0, (%edi)
+	movdqa	%xmm0, 16(%edi)
+	movdqa	%xmm0, 32(%edi)
+	movdqa	%xmm0, 48(%edi)
+	add	$64, %edi
+	sub	$64, %ebx
+	jae	L(StrncpyFillLoopMovdqa)
+
+L(StrncpyFillLess64):
+	add	$32, %ebx
+	jl	L(StrncpyFillLess32)
+	movdqa	%xmm0, (%edi)
+	movdqa	%xmm0, 16(%edi)
+	add	$32, %edi
+	sub	$16, %ebx
+	jl	L(StrncpyFillExit)
+	movdqa	%xmm0, (%edi)
+	add	$16, %edi
+	BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %ebx, 4)
+
+L(StrncpyFillLess32):
+	add	$16, %ebx
+	jl	L(StrncpyFillExit)
+	movdqa	%xmm0, (%edi)
+	add	$16, %edi
+	BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %ebx, 4)
+
+L(StrncpyFillExit):
+	add	$16, %ebx
+	BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %ebx, 4)
+
+	.p2align 4
+L(UnalignedLeaveCase2OrCase3):
+	test	%edx, %edx
+	jnz	L(Unaligned64LeaveCase2)
+L(Unaligned64LeaveCase3):
+	lea	64(%ebx), %ecx
+	and	$-16, %ecx
+	add	$48, %ebx
+	jl	L(CopyFrom1To16BytesCase3)
+	movdqu	%xmm4, (%edi)
+	sub	$16, %ebx
+	jb	L(CopyFrom1To16BytesCase3)
+	movdqu	%xmm5, 16(%edi)
+	sub	$16, %ebx
+	jb	L(CopyFrom1To16BytesCase3)
+	movdqu	%xmm6, 32(%edi)
+	sub	$16, %ebx
+	jb	L(CopyFrom1To16BytesCase3)
+	movdqu	%xmm7, 48(%edi)
+# ifdef USE_AS_STPCPY
+	lea	64(%edi), %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(Unaligned64LeaveCase2):
+	xor	%ecx, %ecx
+	pcmpeqb	%xmm4, %xmm0
+	pmovmskb %xmm0, %edx
+	add	$48, %ebx
+	jle	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesUnalignedXmm4)
+
+	pcmpeqb	%xmm5, %xmm0
+	pmovmskb %xmm0, %edx
+	movdqu	%xmm4, (%edi)
+	add	$16, %ecx
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesUnalignedXmm5)
+
+	pcmpeqb	%xmm6, %xmm0
+	pmovmskb %xmm0, %edx
+	movdqu	%xmm5, 16(%edi)
+	add	$16, %ecx
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesUnalignedXmm6)
+
+	pcmpeqb	%xmm7, %xmm0
+	pmovmskb %xmm0, %edx
+	movdqu	%xmm6, 32(%edi)
+	lea	16(%edi, %ecx), %edi
+	lea	16(%esi, %ecx), %esi
+	bsf	%edx, %edx
+	cmp	%ebx, %edx
+	jb	L(CopyFrom1To16BytesExit)
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+	.p2align 4
+L(ExitZero):
+	movl	%edi, %eax
+	RETURN
+
+END (STRCPY)
+
+	.p2align 4
+	.section .rodata
+L(ExitTable):
+	.int	JMPTBL(L(Exit1), L(ExitTable))
+	.int	JMPTBL(L(Exit2), L(ExitTable))
+	.int	JMPTBL(L(Exit3), L(ExitTable))
+	.int	JMPTBL(L(Exit4), L(ExitTable))
+	.int	JMPTBL(L(Exit5), L(ExitTable))
+	.int	JMPTBL(L(Exit6), L(ExitTable))
+	.int	JMPTBL(L(Exit7), L(ExitTable))
+	.int	JMPTBL(L(Exit8), L(ExitTable))
+	.int	JMPTBL(L(Exit9), L(ExitTable))
+	.int	JMPTBL(L(Exit10), L(ExitTable))
+	.int	JMPTBL(L(Exit11), L(ExitTable))
+	.int	JMPTBL(L(Exit12), L(ExitTable))
+	.int	JMPTBL(L(Exit13), L(ExitTable))
+	.int	JMPTBL(L(Exit14), L(ExitTable))
+	.int	JMPTBL(L(Exit15), L(ExitTable))
+	.int	JMPTBL(L(Exit16), L(ExitTable))
+	.int	JMPTBL(L(Exit17), L(ExitTable))
+	.int	JMPTBL(L(Exit18), L(ExitTable))
+	.int	JMPTBL(L(Exit19), L(ExitTable))
+	.int	JMPTBL(L(Exit20), L(ExitTable))
+	.int	JMPTBL(L(Exit21), L(ExitTable))
+	.int	JMPTBL(L(Exit22), L(ExitTable))
+	.int    JMPTBL(L(Exit23), L(ExitTable))
+	.int	JMPTBL(L(Exit24), L(ExitTable))
+	.int	JMPTBL(L(Exit25), L(ExitTable))
+	.int	JMPTBL(L(Exit26), L(ExitTable))
+	.int	JMPTBL(L(Exit27), L(ExitTable))
+	.int	JMPTBL(L(Exit28), L(ExitTable))
+	.int	JMPTBL(L(Exit29), L(ExitTable))
+	.int	JMPTBL(L(Exit30), L(ExitTable))
+	.int	JMPTBL(L(Exit31), L(ExitTable))
+	.int	JMPTBL(L(Exit32), L(ExitTable))
+
+L(ExitStrncpyTable):
+	.int	JMPTBL(L(Exit0), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit1), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit2), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit3), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit4), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit5), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit6), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit7), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit8), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit9), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit10), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit11), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit12), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit13), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit14), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit15), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit16), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit17), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit18), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit19), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit20), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit21), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit22), L(ExitStrncpyTable))
+	.int    JMPTBL(L(StrncpyExit23), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit24), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit25), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit26), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit27), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit28), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit29), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit30), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit31), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit32), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit33), L(ExitStrncpyTable))
+
+	.p2align 4
+L(FillTable):
+	.int	JMPTBL(L(Fill0), L(FillTable))
+	.int	JMPTBL(L(Fill1), L(FillTable))
+	.int	JMPTBL(L(Fill2), L(FillTable))
+	.int	JMPTBL(L(Fill3), L(FillTable))
+	.int	JMPTBL(L(Fill4), L(FillTable))
+	.int	JMPTBL(L(Fill5), L(FillTable))
+	.int	JMPTBL(L(Fill6), L(FillTable))
+	.int	JMPTBL(L(Fill7), L(FillTable))
+	.int	JMPTBL(L(Fill8), L(FillTable))
+	.int	JMPTBL(L(Fill9), L(FillTable))
+	.int	JMPTBL(L(Fill10), L(FillTable))
+	.int	JMPTBL(L(Fill11), L(FillTable))
+	.int	JMPTBL(L(Fill12), L(FillTable))
+	.int	JMPTBL(L(Fill13), L(FillTable))
+	.int	JMPTBL(L(Fill14), L(FillTable))
+	.int	JMPTBL(L(Fill15), L(FillTable))
+	.int	JMPTBL(L(Fill16), L(FillTable))
+# else
+#  define PARMS  4
+#  define ENTRANCE
+#  define RETURN  POP (%edi); ret; CFI_PUSH (%edi)
+#  define RETURN1  ret
+
+	.text
+ENTRY (STRCPY)
+	ENTRANCE
+	mov	STR1(%esp), %edx
+	mov	STR2(%esp), %ecx
+
+	cmpb	$0, (%ecx)
+	jz	L(ExitTail1)
+	cmpb	$0, 1(%ecx)
+	jz	L(ExitTail2)
+	cmpb	$0, 2(%ecx)
+	jz	L(ExitTail3)
+	cmpb	$0, 3(%ecx)
+	jz	L(ExitTail4)
+	cmpb	$0, 4(%ecx)
+	jz	L(ExitTail5)
+	cmpb	$0, 5(%ecx)
+	jz	L(ExitTail6)
+	cmpb	$0, 6(%ecx)
+	jz	L(ExitTail7)
+	cmpb	$0, 7(%ecx)
+	jz	L(ExitTail8)
+	cmpb	$0, 8(%ecx)
+	jz	L(ExitTail9)
+	cmpb	$0, 9(%ecx)
+	jz	L(ExitTail10)
+	cmpb	$0, 10(%ecx)
+	jz	L(ExitTail11)
+	cmpb	$0, 11(%ecx)
+	jz	L(ExitTail12)
+	cmpb	$0, 12(%ecx)
+	jz	L(ExitTail13)
+	cmpb	$0, 13(%ecx)
+	jz	L(ExitTail14)
+	cmpb	$0, 14(%ecx)
+	jz	L(ExitTail15)
+	cmpb	$0, 15(%ecx)
+	jz	L(ExitTail16)
+
+	PUSH	(%edi)
+	PUSH	(%ebx)
+
+	mov	%edx, %edi
+	lea	16(%ecx), %ebx
+	and	$-16, %ebx
+	pxor	%xmm0, %xmm0
+	movdqu	(%ecx), %xmm1
+	movdqu	%xmm1, (%edx)
+	pcmpeqb	(%ebx), %xmm0
+	pmovmskb %xmm0, %eax
+	sub	%ecx, %ebx
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	mov	%ecx, %eax
+	lea	16(%ecx), %ecx
+	and	$-16, %ecx
+	sub	%ecx, %eax
+	sub	%eax, %edx
+	xor	%ebx, %ebx
+
+	.p2align 4
+	movdqa	(%ecx), %xmm1
+	movaps	16(%ecx), %xmm2
+	movdqu	%xmm1, (%edx)
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	add	$16, %ebx
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%ecx, %ebx), %xmm3
+	movdqu	%xmm2, (%edx, %ebx)
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %eax
+	add	$16, %ebx
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%ecx, %ebx), %xmm4
+	movdqu	%xmm3, (%edx, %ebx)
+	pcmpeqb	%xmm4, %xmm0
+	pmovmskb %xmm0, %eax
+	add	$16, %ebx
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%ecx, %ebx), %xmm1
+	movdqu	%xmm4, (%edx, %ebx)
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	add	$16, %ebx
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%ecx, %ebx), %xmm2
+	movdqu	%xmm1, (%edx, %ebx)
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	add	$16, %ebx
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%ecx, %ebx), %xmm3
+	movdqu	%xmm2, (%edx, %ebx)
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %eax
+	add	$16, %ebx
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movdqu	%xmm3, (%edx, %ebx)
+	mov	%ecx, %eax
+	lea	16(%ecx, %ebx), %ecx
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	sub	%eax, %edx
+
+L(Aligned64Loop):
+	movaps	(%ecx), %xmm2
+	movaps	%xmm2, %xmm4
+	movaps	16(%ecx), %xmm5
+	movaps	32(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	48(%ecx), %xmm7
+	pminub	%xmm5, %xmm2
+	add	$64, %ecx
+	pminub	%xmm7, %xmm3
+	add	$64, %edx
+	pminub	%xmm2, %xmm3
+	pcmpeqb	%xmm0, %xmm3
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(Aligned64Leave)
+L(Aligned64Loop_start):
+	movdqu	%xmm4, -64(%edx)
+	movaps	(%ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+	movdqu	%xmm5, -48(%edx)
+	movaps	16(%ecx), %xmm5
+	pminub	%xmm5, %xmm2
+	movaps	32(%ecx), %xmm3
+	movdqu	%xmm6, -32(%edx)
+	movaps	%xmm3, %xmm6
+	movdqu	%xmm7, -16(%edx)
+	movaps	48(%ecx), %xmm7
+	pminub	%xmm7, %xmm3
+	pminub	%xmm2, %xmm3
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %eax
+	add	$64, %edx
+	add	$64, %ecx
+	test	%eax, %eax
+	jz	L(Aligned64Loop_start)
+L(Aligned64Leave):
+	sub	$0xa0, %ebx
+	pxor	%xmm0, %xmm0
+	pcmpeqb	%xmm4, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	pcmpeqb	%xmm5, %xmm0
+	pmovmskb %xmm0, %eax
+	movdqu	%xmm4, -64(%edx)
+	test	%eax, %eax
+	lea	16(%ebx), %ebx
+	jnz	L(CopyFrom1To16Bytes)
+
+	pcmpeqb	%xmm6, %xmm0
+	pmovmskb %xmm0, %eax
+	movdqu	%xmm5, -48(%edx)
+	test	%eax, %eax
+	lea	16(%ebx), %ebx
+	jnz	L(CopyFrom1To16Bytes)
+
+	movdqu	%xmm6, -32(%edx)
+	pcmpeqb	%xmm7, %xmm0
+	pmovmskb %xmm0, %eax
+	lea	16(%ebx), %ebx
+
+/*-----------------End of main part---------------------------*/
+
+	.p2align 4
+L(CopyFrom1To16Bytes):
+	add	%ebx, %edx
+	add	%ebx, %ecx
+
+	POP	(%ebx)
+	test	%al, %al
+	jz	L(ExitHigh)
+	test	$0x01, %al
+	jnz	L(Exit1)
+	test	$0x02, %al
+	jnz	L(Exit2)
+	test	$0x04, %al
+	jnz	L(Exit3)
+	test	$0x08, %al
+	jnz	L(Exit4)
+	test	$0x10, %al
+	jnz	L(Exit5)
+	test	$0x20, %al
+	jnz	L(Exit6)
+	test	$0x40, %al
+	jnz	L(Exit7)
+	/* Exit 8 */
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	4(%ecx), %eax
+	movl	%eax, 4(%edx)
+# ifdef USE_AS_STPCPY
+	lea	7(%edx), %eax
+# else
+	movl	%edi, %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(ExitHigh):
+	test	$0x01, %ah
+	jnz	L(Exit9)
+	test	$0x02, %ah
+	jnz	L(Exit10)
+	test	$0x04, %ah
+	jnz	L(Exit11)
+	test	$0x08, %ah
+	jnz	L(Exit12)
+	test	$0x10, %ah
+	jnz	L(Exit13)
+	test	$0x20, %ah
+	jnz	L(Exit14)
+	test	$0x40, %ah
+	jnz	L(Exit15)
+	/* Exit 16 */
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movlpd	8(%ecx), %xmm0
+	movlpd	%xmm0, 8(%edx)
+# ifdef USE_AS_STPCPY
+	lea	15(%edx), %eax
+# else
+	movl	%edi, %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(Exit1):
+	movb	(%ecx), %al
+	movb	%al, (%edx)
+# ifdef USE_AS_STPCPY
+	lea	(%edx), %eax
+# else
+	movl	%edi, %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(Exit2):
+	movw	(%ecx), %ax
+	movw	%ax, (%edx)
+# ifdef USE_AS_STPCPY
+	lea	1(%edx), %eax
+# else
+	movl	%edi, %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(Exit3):
+	movw	(%ecx), %ax
+	movw	%ax, (%edx)
+	movb	2(%ecx), %al
+	movb	%al, 2(%edx)
+# ifdef USE_AS_STPCPY
+	lea	2(%edx), %eax
+# else
+	movl	%edi, %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(Exit4):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+# ifdef USE_AS_STPCPY
+	lea	3(%edx), %eax
+# else
+	movl	%edi, %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(Exit5):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movb	4(%ecx), %al
+	movb	%al, 4(%edx)
+# ifdef USE_AS_STPCPY
+	lea	4(%edx), %eax
+# else
+	movl	%edi, %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(Exit6):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movw	4(%ecx), %ax
+	movw	%ax, 4(%edx)
+# ifdef USE_AS_STPCPY
+	lea	5(%edx), %eax
+# else
+	movl	%edi, %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(Exit7):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	3(%ecx), %eax
+	movl	%eax, 3(%edx)
+# ifdef USE_AS_STPCPY
+	lea	6(%edx), %eax
+# else
+	movl	%edi, %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(Exit9):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	4(%ecx), %eax
+	movl	%eax, 4(%edx)
+	movb	8(%ecx), %al
+	movb	%al, 8(%edx)
+# ifdef USE_AS_STPCPY
+	lea	8(%edx), %eax
+# else
+	movl	%edi, %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(Exit10):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	4(%ecx), %eax
+	movl	%eax, 4(%edx)
+	movw	8(%ecx), %ax
+	movw	%ax, 8(%edx)
+# ifdef USE_AS_STPCPY
+	lea	9(%edx), %eax
+# else
+	movl	%edi, %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(Exit11):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	4(%ecx), %eax
+	movl	%eax, 4(%edx)
+	movl	7(%ecx), %eax
+	movl	%eax, 7(%edx)
+# ifdef USE_AS_STPCPY
+	lea	10(%edx), %eax
+# else
+	movl	%edi, %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(Exit12):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	4(%ecx), %eax
+	movl	%eax, 4(%edx)
+	movl	8(%ecx), %eax
+	movl	%eax, 8(%edx)
+# ifdef USE_AS_STPCPY
+	lea	11(%edx), %eax
+# else
+	movl	%edi, %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(Exit13):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movlpd	5(%ecx), %xmm0
+	movlpd	%xmm0, 5(%edx)
+# ifdef USE_AS_STPCPY
+	lea	12(%edx), %eax
+# else
+	movl	%edi, %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(Exit14):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movlpd	6(%ecx), %xmm0
+	movlpd	%xmm0, 6(%edx)
+# ifdef USE_AS_STPCPY
+	lea	13(%edx), %eax
+# else
+	movl	%edi, %eax
+# endif
+	RETURN
+
+	.p2align 4
+L(Exit15):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movlpd	7(%ecx), %xmm0
+	movlpd	%xmm0, 7(%edx)
+# ifdef USE_AS_STPCPY
+	lea	14(%edx), %eax
+# else
+	movl	%edi, %eax
+# endif
+	RETURN
+
+CFI_POP (%edi)
+
+	.p2align 4
+L(ExitTail1):
+	movb	(%ecx), %al
+	movb	%al, (%edx)
+	movl	%edx, %eax
+	RETURN1
+
+	.p2align 4
+L(ExitTail2):
+	movw	(%ecx), %ax
+	movw	%ax, (%edx)
+# ifdef USE_AS_STPCPY
+	lea	1(%edx), %eax
+# else
+	movl	%edx, %eax
+# endif
+	RETURN1
+
+	.p2align 4
+L(ExitTail3):
+	movw	(%ecx), %ax
+	movw	%ax, (%edx)
+	movb	2(%ecx), %al
+	movb	%al, 2(%edx)
+# ifdef USE_AS_STPCPY
+	lea	2(%edx), %eax
+# else
+	movl	%edx, %eax
+# endif
+	RETURN1
+
+	.p2align 4
+L(ExitTail4):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+# ifdef USE_AS_STPCPY
+	lea	3(%edx), %eax
+# else
+	movl	%edx, %eax
+# endif
+	RETURN1
+
+	.p2align 4
+L(ExitTail5):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movb	4(%ecx), %al
+	movb	%al, 4(%edx)
+# ifdef USE_AS_STPCPY
+	lea	4(%edx), %eax
+# else
+	movl	%edx, %eax
+# endif
+	RETURN1
+
+	.p2align 4
+L(ExitTail6):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movw	4(%ecx), %ax
+	movw	%ax, 4(%edx)
+# ifdef USE_AS_STPCPY
+	lea	5(%edx), %eax
+# else
+	movl	%edx, %eax
+# endif
+	RETURN1
+
+	.p2align 4
+L(ExitTail7):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	3(%ecx), %eax
+	movl	%eax, 3(%edx)
+# ifdef USE_AS_STPCPY
+	lea	6(%edx), %eax
+# else
+	movl	%edx, %eax
+# endif
+	RETURN1
+
+	.p2align 4
+L(ExitTail8):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	4(%ecx), %eax
+	movl	%eax, 4(%edx)
+# ifdef USE_AS_STPCPY
+	lea	7(%edx), %eax
+# else
+	movl	%edx, %eax
+# endif
+	RETURN1
+
+	.p2align 4
+L(ExitTail9):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	4(%ecx), %eax
+	movl	%eax, 4(%edx)
+	movb	8(%ecx), %al
+	movb	%al, 8(%edx)
+# ifdef USE_AS_STPCPY
+	lea	8(%edx), %eax
+# else
+	movl	%edx, %eax
+# endif
+	RETURN1
+
+	.p2align 4
+L(ExitTail10):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	4(%ecx), %eax
+	movl	%eax, 4(%edx)
+	movw	8(%ecx), %ax
+	movw	%ax, 8(%edx)
+# ifdef USE_AS_STPCPY
+	lea	9(%edx), %eax
+# else
+	movl	%edx, %eax
+# endif
+	RETURN1
+
+	.p2align 4
+L(ExitTail11):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	4(%ecx), %eax
+	movl	%eax, 4(%edx)
+	movl	7(%ecx), %eax
+	movl	%eax, 7(%edx)
+# ifdef USE_AS_STPCPY
+	lea	10(%edx), %eax
+# else
+	movl	%edx, %eax
+# endif
+	RETURN1
+
+	.p2align 4
+L(ExitTail12):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	4(%ecx), %eax
+	movl	%eax, 4(%edx)
+	movl	8(%ecx), %eax
+	movl	%eax, 8(%edx)
+# ifdef USE_AS_STPCPY
+	lea	11(%edx), %eax
+# else
+	movl	%edx, %eax
+# endif
+	RETURN1
+
+	.p2align 4
+L(ExitTail13):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movlpd	5(%ecx), %xmm0
+	movlpd	%xmm0, 5(%edx)
+# ifdef USE_AS_STPCPY
+	lea	12(%edx), %eax
+# else
+	movl	%edx, %eax
+# endif
+	RETURN1
+
+	.p2align 4
+L(ExitTail14):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movlpd	6(%ecx), %xmm0
+	movlpd	%xmm0, 6(%edx)
+# ifdef USE_AS_STPCPY
+	lea	13(%edx), %eax
+# else
+	movl	%edx, %eax
+# endif
+	RETURN1
+
+	.p2align 4
+L(ExitTail15):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movlpd	7(%ecx), %xmm0
+	movlpd	%xmm0, 7(%edx)
+# ifdef USE_AS_STPCPY
+	lea	14(%edx), %eax
+# else
+	movl	%edx, %eax
+# endif
+	RETURN1
+
+	.p2align 4
+L(ExitTail16):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movlpd	8(%ecx), %xmm0
+	movlpd	%xmm0, 8(%edx)
+# ifdef USE_AS_STPCPY
+	lea	15(%edx), %eax
+# else
+	movl	%edx, %eax
+# endif
+	RETURN1
+
+END (STRCPY)
+# endif
+
+#endif

Added: fsf/trunk/libc/sysdeps/i386/i686/multiarch/strcpy-ssse3.S
==============================================================================
--- fsf/trunk/libc/sysdeps/i386/i686/multiarch/strcpy-ssse3.S (added)
+++ fsf/trunk/libc/sysdeps/i386/i686/multiarch/strcpy-ssse3.S Sat Jun 25 00:02:39 2011
@@ -1,0 +1,4094 @@
+/* strcpy with SSSE3
+   Copyright (C) 2011 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+
+#ifndef NOT_IN_libc
+
+# include <sysdep.h>
+
+# define CFI_PUSH(REG)                  \
+	cfi_adjust_cfa_offset (4);     \
+	cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG)                   \
+	cfi_adjust_cfa_offset (-4);    \
+	cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# ifndef STRCPY
+#  define STRCPY  __strcpy_ssse3
+# endif
+
+# ifdef USE_AS_STRNCPY
+#  define PARMS  8
+#  define ENTRANCE PUSH(%ebx)
+#  define RETURN  POP(%ebx); ret; CFI_PUSH(%ebx);
+#  define RETURN1  POP(%edi); POP(%ebx); ret; CFI_PUSH(%ebx); CFI_PUSH(%edi)
+# else
+#  define PARMS  4
+#  define ENTRANCE
+#  define RETURN  ret
+#  define RETURN1  POP(%edi); ret; CFI_PUSH(%edi)
+# endif
+
+# define STR1  PARMS
+# define STR2  STR1+4
+# define LEN  STR2+4
+
+/* In this code following instructions are used for copying:
+	movb	- 1 byte
+	movw	- 2 byte
+	movl	- 4 byte
+	movlpd	- 8 byte
+	movaps	- 16 byte - requires 16 byte alignment
+	of	sourse and destination adresses.
+	16 byte alignment: adress is 32bit value,
+	right	four bit of adress shall be 0.
+*/
+
+.text
+ENTRY (STRCPY)
+	ENTRANCE
+	mov	STR1(%esp), %edx
+	mov	STR2(%esp), %ecx
+# ifdef USE_AS_STRNCPY
+	movl	LEN(%esp), %ebx
+	test	%ebx, %ebx
+	jz	L(ExitTail0)
+	cmp	$8, %ebx
+	jbe	L(StrncpyExit8Bytes)
+# endif
+	cmpb	$0, (%ecx)
+	jz	L(ExitTail1)
+	cmpb	$0, 1(%ecx)
+	jz	L(ExitTail2)
+	cmpb	$0, 2(%ecx)
+	jz	L(ExitTail3)
+	cmpb	$0, 3(%ecx)
+	jz	L(ExitTail4)
+	cmpb	$0, 4(%ecx)
+	jz	L(ExitTail5)
+	cmpb	$0, 5(%ecx)
+	jz	L(ExitTail6)
+	cmpb	$0, 6(%ecx)
+	jz	L(ExitTail7)
+	cmpb	$0, 7(%ecx)
+	jz	L(ExitTail8)
+# ifdef USE_AS_STRNCPY
+	cmp	$16, %ebx
+	jb	L(StrncpyExit15Bytes)
+# endif
+	cmpb	$0, 8(%ecx)
+	jz	L(ExitTail9)
+	cmpb	$0, 9(%ecx)
+	jz	L(ExitTail10)
+	cmpb	$0, 10(%ecx)
+	jz	L(ExitTail11)
+	cmpb	$0, 11(%ecx)
+	jz	L(ExitTail12)
+	cmpb	$0, 12(%ecx)
+	jz	L(ExitTail13)
+	cmpb	$0, 13(%ecx)
+	jz	L(ExitTail14)
+	cmpb	$0, 14(%ecx)
+	jz	L(ExitTail15)
+# ifdef USE_AS_STRNCPY
+	cmp	$16, %ebx
+	je	L(ExitTail16)
+# endif
+	cmpb	$0, 15(%ecx)
+	jz	L(ExitTail16)
+
+	PUSH	(%edi)
+	mov	%edx, %edi
+	PUSH	(%esi)
+# ifdef USE_AS_STRNCPY
+	mov	%ecx, %esi
+	and	$0xf, %esi
+
+/* add 16 bytes ecx_shift to ebx */
+
+	add	%esi, %ebx
+# endif
+	lea	16(%ecx), %esi
+/* Now:
+	esi	= alignment_16(ecx) + ecx_shift + 16;
+	ecx_shift = ecx - alignment_16(ecx)
+*/
+	and	$-16, %esi
+/* Now:
+	esi	= alignment_16(ecx) + 16
+*/
+	pxor	%xmm0, %xmm0
+	movlpd	(%ecx), %xmm1
+	movlpd	%xmm1, (%edx)
+/*
+	look	if there is zero symbol in next 16 bytes of string
+	from	esi to esi + 15 and form mask in xmm0
+*/
+	pcmpeqb	(%esi), %xmm0
+	movlpd	8(%ecx), %xmm1
+	movlpd	%xmm1, 8(%edx)
+
+/* convert byte mask in xmm0 to bit mask */
+
+	pmovmskb %xmm0, %eax
+	sub	%ecx, %esi
+
+/* esi = 16 - ecx_shift */
+
+/* eax = 0: there isn't end of string from position esi to esi+15 */
+
+# ifdef USE_AS_STRNCPY
+	sub	$32, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	mov	%edx, %eax
+	lea	16(%edx), %edx
+/* Now:
+	edx	= edx + 16 = alignment_16(edx) + edx_shift + 16
+*/
+	and	$-16, %edx
+
+/* Now: edx = alignment_16(edx) + 16 */
+
+	sub	%edx, %eax
+
+/* Now: eax = edx_shift - 16 */
+
+# ifdef USE_AS_STRNCPY
+	add	%eax, %esi
+	lea	-1(%esi), %esi
+	and	$1<<31, %esi
+	test	%esi, %esi
+	jnz	L(ContinueCopy)
+	lea	16(%ebx), %ebx
+
+L(ContinueCopy):
+# endif
+	sub	%eax, %ecx
+/* Now:
+	case	ecx_shift >= edx_shift:
+	ecx	= alignment_16(ecx) + (ecx_shift  - edx_shift) + 16
+	case	ecx_shift < edx_shift:
+	ecx	= alignment_16(ecx) + (16 + ecx_shift  - edx_shift)
+*/
+	mov	%ecx, %eax
+	and	$0xf, %eax
+/* Now:
+	case	ecx_shift >= edx_shift: eax = ecx_shift  - edx_shift
+	case	ecx_shift < edx_shift: eax = (16 + ecx_shift  - edx_shift)
+	eax	can be 0, 1, ..., 15
+*/
+	mov	$0, %esi
+
+/* case: ecx_shift == edx_shift */
+
+	jz	L(Align16Both)
+
+	cmp	$8, %eax
+	jae	L(ShlHigh8)
+	cmp	$1, %eax
+	je	L(Shl1)
+	cmp	$2, %eax
+	je	L(Shl2)
+	cmp	$3, %eax
+	je	L(Shl3)
+	cmp	$4, %eax
+	je	L(Shl4)
+	cmp	$5, %eax
+	je	L(Shl5)
+	cmp	$6, %eax
+	je	L(Shl6)
+	jmp	L(Shl7)
+
+L(ShlHigh8):
+	je	L(Shl8)
+	cmp	$9, %eax
+	je	L(Shl9)
+	cmp	$10, %eax
+	je	L(Shl10)
+	cmp	$11, %eax
+	je	L(Shl11)
+	cmp	$12, %eax
+	je	L(Shl12)
+	cmp	$13, %eax
+	je	L(Shl13)
+	cmp	$14, %eax
+	je	L(Shl14)
+	jmp	L(Shl15)
+
+L(Align16Both):
+	movaps	(%ecx), %xmm1
+	movaps	16(%ecx), %xmm2
+	movaps	%xmm1, (%edx)
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	lea	16(%esi), %esi
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%ecx, %esi), %xmm3
+	movaps	%xmm2, (%edx, %esi)
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %eax
+	lea	16(%esi), %esi
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%ecx, %esi), %xmm4
+	movaps	%xmm3, (%edx, %esi)
+	pcmpeqb	%xmm4, %xmm0
+	pmovmskb %xmm0, %eax
+	lea	16(%esi), %esi
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%ecx, %esi), %xmm1
+	movaps	%xmm4, (%edx, %esi)
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	lea	16(%esi), %esi
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%ecx, %esi), %xmm2
+	movaps	%xmm1, (%edx, %esi)
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	lea	16(%esi), %esi
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%ecx, %esi), %xmm3
+	movaps	%xmm2, (%edx, %esi)
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %eax
+	lea	16(%esi), %esi
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	%xmm3, (%edx, %esi)
+	mov	%ecx, %eax
+	lea	16(%ecx, %esi), %ecx
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	sub	%eax, %edx
+# ifdef USE_AS_STRNCPY
+	lea	48+64(%ebx, %eax), %ebx
+# endif
+	mov	$-0x40, %esi
+
+L(Aligned64Loop):
+	movaps	(%ecx), %xmm2
+	movaps	32(%ecx), %xmm3
+	movaps	%xmm2, %xmm4
+	movaps	16(%ecx), %xmm5
+	movaps	%xmm3, %xmm6
+	movaps	48(%ecx), %xmm7
+	pminub	%xmm5, %xmm2
+	pminub	%xmm7, %xmm3
+	pminub	%xmm2, %xmm3
+	lea	64(%edx), %edx
+	pcmpeqb	%xmm0, %xmm3
+	lea	64(%ecx), %ecx
+	pmovmskb %xmm3, %eax
+# ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeaveCase2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Aligned64Leave)
+	movaps	%xmm4, -64(%edx)
+	movaps	%xmm5, -48(%edx)
+	movaps	%xmm6, -32(%edx)
+	movaps	%xmm7, -16(%edx)
+	jmp	L(Aligned64Loop)
+
+L(Aligned64Leave):
+# ifdef USE_AS_STRNCPY
+	lea	48(%ebx), %ebx
+# endif
+	pcmpeqb	%xmm4, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	pcmpeqb	%xmm5, %xmm0
+# ifdef USE_AS_STRNCPY
+	lea	-16(%ebx), %ebx
+# endif
+	pmovmskb %xmm0, %eax
+	movaps	%xmm4, -64(%edx)
+	test	%eax, %eax
+	lea	16(%esi), %esi
+	jnz	L(CopyFrom1To16Bytes)
+
+	pcmpeqb	%xmm6, %xmm0
+# ifdef USE_AS_STRNCPY
+	lea	-16(%ebx), %ebx
+# endif
+	pmovmskb %xmm0, %eax
+	movaps	%xmm5, -48(%edx)
+	test	%eax, %eax
+	lea	16(%esi), %esi
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	%xmm6, -32(%edx)
+	pcmpeqb	%xmm7, %xmm0
+# ifdef USE_AS_STRNCPY
+	lea	-16(%ebx), %ebx
+# endif
+	pmovmskb %xmm0, %eax
+	lea	16(%esi), %esi
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl1):
+	movaps	-1(%ecx), %xmm1
+	movaps	15(%ecx), %xmm2
+L(Shl1Start):
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit1Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl1LoopExit)
+
+	palignr	$1, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	31(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit1Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl1LoopExit)
+
+	palignr	$1, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	31(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit1Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl1LoopExit)
+
+	palignr	$1, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	31(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit1Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl1LoopExit)
+
+	palignr	$1, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	lea	31(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	lea	-15(%ecx), %ecx
+	sub	%eax, %edx
+# ifdef USE_AS_STRNCPY
+	add	%eax, %ebx
+# endif
+	movaps	-1(%ecx), %xmm1
+
+L(Shl1LoopStart):
+	movaps	15(%ecx), %xmm2
+	movaps	31(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	47(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	63(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$1, %xmm4, %xmm5
+	test	%eax, %eax
+	palignr	$1, %xmm3, %xmm4
+	jnz	L(Shl1Start)
+# ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeave1)
+# endif
+	palignr	$1, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$1, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl1LoopStart)
+
+L(Shl1LoopExit):
+	movaps	(%edx), %xmm6
+	psrldq	$15, %xmm6
+	mov	$15, %esi
+	palignr	$1, %xmm1, %xmm6
+	movaps	%xmm6, (%edx)
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl2):
+	movaps	-2(%ecx), %xmm1
+	movaps	14(%ecx), %xmm2
+L(Shl2Start):
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit2Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl2LoopExit)
+
+	palignr	$2, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	30(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit2Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl2LoopExit)
+
+	palignr	$2, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	30(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit2Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl2LoopExit)
+
+	palignr	$2, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	30(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit2Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl2LoopExit)
+
+	palignr	$2, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	lea	30(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	lea	-14(%ecx), %ecx
+	sub	%eax, %edx
+# ifdef USE_AS_STRNCPY
+	add	%eax, %ebx
+# endif
+	movaps	-2(%ecx), %xmm1
+
+L(Shl2LoopStart):
+	movaps	14(%ecx), %xmm2
+	movaps	30(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	46(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	62(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$2, %xmm4, %xmm5
+	test	%eax, %eax
+	palignr	$2, %xmm3, %xmm4
+	jnz	L(Shl2Start)
+# ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeave2)
+# endif
+	palignr	$2, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$2, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl2LoopStart)
+
+L(Shl2LoopExit):
+	movaps	(%edx), %xmm6
+	psrldq	$14, %xmm6
+	mov	$14, %esi
+	palignr	$2, %xmm1, %xmm6
+	movaps	%xmm6, (%edx)
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl3):
+	movaps	-3(%ecx), %xmm1
+	movaps	13(%ecx), %xmm2
+L(Shl3Start):
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit3Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl3LoopExit)
+
+	palignr	$3, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	29(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit3Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl3LoopExit)
+
+	palignr	$3, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	29(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit3Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl3LoopExit)
+
+	palignr	$3, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	29(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit3Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl3LoopExit)
+
+	palignr	$3, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	lea	29(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	lea	-13(%ecx), %ecx
+	sub	%eax, %edx
+# ifdef USE_AS_STRNCPY
+	add	%eax, %ebx
+# endif
+	movaps	-3(%ecx), %xmm1
+
+L(Shl3LoopStart):
+	movaps	13(%ecx), %xmm2
+	movaps	29(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	45(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	61(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$3, %xmm4, %xmm5
+	test	%eax, %eax
+	palignr	$3, %xmm3, %xmm4
+	jnz	L(Shl3Start)
+# ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeave3)
+# endif
+	palignr	$3, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$3, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl3LoopStart)
+
+L(Shl3LoopExit):
+	movaps	(%edx), %xmm6
+	psrldq	$13, %xmm6
+	mov	$13, %esi
+	palignr	$3, %xmm1, %xmm6
+	movaps	%xmm6, (%edx)
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl4):
+	movaps	-4(%ecx), %xmm1
+	movaps	12(%ecx), %xmm2
+L(Shl4Start):
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit4Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl4LoopExit)
+
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	28(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit4Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl4LoopExit)
+
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	28(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit4Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl4LoopExit)
+
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	28(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit4Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl4LoopExit)
+
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	lea	28(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	lea	-12(%ecx), %ecx
+	sub	%eax, %edx
+# ifdef USE_AS_STRNCPY
+	add	%eax, %ebx
+# endif
+	movaps	-4(%ecx), %xmm1
+
+L(Shl4LoopStart):
+	movaps	12(%ecx), %xmm2
+	movaps	28(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	44(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	60(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$4, %xmm4, %xmm5
+	test	%eax, %eax
+	palignr	$4, %xmm3, %xmm4
+	jnz	L(Shl4Start)
+# ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeave4)
+# endif
+	palignr	$4, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl4LoopStart)
+
+L(Shl4LoopExit):
+	movaps	(%edx), %xmm6
+	psrldq	$12, %xmm6
+	mov	$12, %esi
+	palignr	$4, %xmm1, %xmm6
+	movaps	%xmm6, (%edx)
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl5):
+	movaps	-5(%ecx), %xmm1
+	movaps	11(%ecx), %xmm2
+L(Shl5Start):
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit5Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl5LoopExit)
+
+	palignr	$5, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	27(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit5Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl5LoopExit)
+
+	palignr	$5, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	27(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit5Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl5LoopExit)
+
+	palignr	$5, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	27(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit5Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl5LoopExit)
+
+	palignr	$5, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	lea	27(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	lea	-11(%ecx), %ecx
+	sub	%eax, %edx
+# ifdef USE_AS_STRNCPY
+	add	%eax, %ebx
+# endif
+	movaps	-5(%ecx), %xmm1
+
+L(Shl5LoopStart):
+	movaps	11(%ecx), %xmm2
+	movaps	27(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	43(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	59(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$5, %xmm4, %xmm5
+	test	%eax, %eax
+	palignr	$5, %xmm3, %xmm4
+	jnz	L(Shl5Start)
+# ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeave5)
+# endif
+	palignr	$5, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$5, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl5LoopStart)
+
+L(Shl5LoopExit):
+	movaps	(%edx), %xmm6
+	psrldq	$11, %xmm6
+	mov	$11, %esi
+	palignr	$5, %xmm1, %xmm6
+	movaps	%xmm6, (%edx)
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl6):
+	movaps	-6(%ecx), %xmm1
+	movaps	10(%ecx), %xmm2
+L(Shl6Start):
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit6Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl6LoopExit)
+
+	palignr	$6, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	26(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit6Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl6LoopExit)
+
+	palignr	$6, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	26(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit6Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl6LoopExit)
+
+	palignr	$6, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	26(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit6Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl6LoopExit)
+
+	palignr	$6, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	lea	26(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	lea	-10(%ecx), %ecx
+	sub	%eax, %edx
+# ifdef USE_AS_STRNCPY
+	add	%eax, %ebx
+# endif
+	movaps	-6(%ecx), %xmm1
+
+L(Shl6LoopStart):
+	movaps	10(%ecx), %xmm2
+	movaps	26(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	42(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	58(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$6, %xmm4, %xmm5
+	test	%eax, %eax
+	palignr	$6, %xmm3, %xmm4
+	jnz	L(Shl6Start)
+# ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeave6)
+# endif
+	palignr	$6, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$6, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl6LoopStart)
+
+L(Shl6LoopExit):
+	movaps	(%edx), %xmm6
+	psrldq	$10, %xmm6
+	mov	$10, %esi
+	palignr	$6, %xmm1, %xmm6
+	movaps	%xmm6, (%edx)
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl7):
+	movaps	-7(%ecx), %xmm1
+	movaps	9(%ecx), %xmm2
+L(Shl7Start):
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit7Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl7LoopExit)
+
+	palignr	$7, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	25(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit7Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl7LoopExit)
+
+	palignr	$7, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	25(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit7Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl7LoopExit)
+
+	palignr	$7, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	25(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit7Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl7LoopExit)
+
+	palignr	$7, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	lea	25(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	lea	-9(%ecx), %ecx
+	sub	%eax, %edx
+# ifdef USE_AS_STRNCPY
+	add	%eax, %ebx
+# endif
+	movaps	-7(%ecx), %xmm1
+
+L(Shl7LoopStart):
+	movaps	9(%ecx), %xmm2
+	movaps	25(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	41(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	57(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$7, %xmm4, %xmm5
+	test	%eax, %eax
+	palignr	$7, %xmm3, %xmm4
+	jnz	L(Shl7Start)
+# ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeave7)
+# endif
+	palignr	$7, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$7, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl7LoopStart)
+
+L(Shl7LoopExit):
+	movaps	(%edx), %xmm6
+	psrldq	$9, %xmm6
+	mov	$9, %esi
+	palignr	$7, %xmm1, %xmm6
+	movaps	%xmm6, (%edx)
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl8):
+	movaps	-8(%ecx), %xmm1
+	movaps	8(%ecx), %xmm2
+L(Shl8Start):
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit8Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl8LoopExit)
+
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	24(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit8Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl8LoopExit)
+
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	24(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit8Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl8LoopExit)
+
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	24(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit8Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl8LoopExit)
+
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	lea	24(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	lea	-8(%ecx), %ecx
+	sub	%eax, %edx
+# ifdef USE_AS_STRNCPY
+	add	%eax, %ebx
+# endif
+	movaps	-8(%ecx), %xmm1
+
+L(Shl8LoopStart):
+	movaps	8(%ecx), %xmm2
+	movaps	24(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	40(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	56(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$8, %xmm4, %xmm5
+	test	%eax, %eax
+	palignr	$8, %xmm3, %xmm4
+	jnz	L(Shl8Start)
+# ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeave8)
+# endif
+	palignr	$8, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl8LoopStart)
+
+L(Shl8LoopExit):
+	movaps	(%edx), %xmm6
+	psrldq	$8, %xmm6
+	mov	$8, %esi
+	palignr	$8, %xmm1, %xmm6
+	movaps	%xmm6, (%edx)
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl9):
+	movaps	-9(%ecx), %xmm1
+	movaps	7(%ecx), %xmm2
+L(Shl9Start):
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit9Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl9LoopExit)
+
+	palignr	$9, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	23(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit9Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl9LoopExit)
+
+	palignr	$9, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	23(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit9Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl9LoopExit)
+
+	palignr	$9, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	23(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit9Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl9LoopExit)
+
+	palignr	$9, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	lea	23(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	lea	-7(%ecx), %ecx
+	sub	%eax, %edx
+# ifdef USE_AS_STRNCPY
+	add	%eax, %ebx
+# endif
+	movaps	-9(%ecx), %xmm1
+
+L(Shl9LoopStart):
+	movaps	7(%ecx), %xmm2
+	movaps	23(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	39(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	55(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$9, %xmm4, %xmm5
+	test	%eax, %eax
+	palignr	$9, %xmm3, %xmm4
+	jnz	L(Shl9Start)
+# ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeave9)
+# endif
+	palignr	$9, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$9, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl9LoopStart)
+
+L(Shl9LoopExit):
+	movaps	(%edx), %xmm6
+	psrldq	$7, %xmm6
+	mov	$7, %esi
+	palignr	$9, %xmm1, %xmm6
+	movaps	%xmm6, (%edx)
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl10):
+	movaps	-10(%ecx), %xmm1
+	movaps	6(%ecx), %xmm2
+L(Shl10Start):
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit10Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl10LoopExit)
+
+	palignr	$10, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	22(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit10Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl10LoopExit)
+
+	palignr	$10, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	22(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit10Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl10LoopExit)
+
+	palignr	$10, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	22(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit10Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl10LoopExit)
+
+	palignr	$10, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	lea	22(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	lea	-6(%ecx), %ecx
+	sub	%eax, %edx
+# ifdef USE_AS_STRNCPY
+	add	%eax, %ebx
+# endif
+	movaps	-10(%ecx), %xmm1
+
+L(Shl10LoopStart):
+	movaps	6(%ecx), %xmm2
+	movaps	22(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	38(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	54(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$10, %xmm4, %xmm5
+	test	%eax, %eax
+	palignr	$10, %xmm3, %xmm4
+	jnz	L(Shl10Start)
+# ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeave10)
+# endif
+	palignr	$10, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$10, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl10LoopStart)
+
+L(Shl10LoopExit):
+	movaps	(%edx), %xmm6
+	psrldq	$6, %xmm6
+	mov	$6, %esi
+	palignr	$10, %xmm1, %xmm6
+	movaps	%xmm6, (%edx)
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl11):
+	movaps	-11(%ecx), %xmm1
+	movaps	5(%ecx), %xmm2
+L(Shl11Start):
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit11Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl11LoopExit)
+
+	palignr	$11, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	21(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit11Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl11LoopExit)
+
+	palignr	$11, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	21(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit11Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl11LoopExit)
+
+	palignr	$11, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	21(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit11Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl11LoopExit)
+
+	palignr	$11, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	lea	21(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	lea	-5(%ecx), %ecx
+	sub	%eax, %edx
+# ifdef USE_AS_STRNCPY
+	add	%eax, %ebx
+# endif
+	movaps	-11(%ecx), %xmm1
+
+L(Shl11LoopStart):
+	movaps	5(%ecx), %xmm2
+	movaps	21(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	37(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	53(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$11, %xmm4, %xmm5
+	test	%eax, %eax
+	palignr	$11, %xmm3, %xmm4
+	jnz	L(Shl11Start)
+# ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeave11)
+# endif
+	palignr	$11, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$11, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl11LoopStart)
+
+L(Shl11LoopExit):
+	movaps	(%edx), %xmm6
+	psrldq	$5, %xmm6
+	mov	$5, %esi
+	palignr	$11, %xmm1, %xmm6
+	movaps	%xmm6, (%edx)
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl12):
+	movaps	-12(%ecx), %xmm1
+	movaps	4(%ecx), %xmm2
+L(Shl12Start):
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit12Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl12LoopExit)
+
+	palignr	$12, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	20(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit12Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl12LoopExit)
+
+	palignr	$12, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	20(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit12Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl12LoopExit)
+
+	palignr	$12, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	20(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit12Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl12LoopExit)
+
+	palignr	$12, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	lea	20(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	lea	-4(%ecx), %ecx
+	sub	%eax, %edx
+# ifdef USE_AS_STRNCPY
+	add	%eax, %ebx
+# endif
+	movaps	-12(%ecx), %xmm1
+
+L(Shl12LoopStart):
+	movaps	4(%ecx), %xmm2
+	movaps	20(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	36(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	52(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$12, %xmm4, %xmm5
+	test	%eax, %eax
+	palignr	$12, %xmm3, %xmm4
+	jnz	L(Shl12Start)
+# ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeave12)
+# endif
+	palignr	$12, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$12, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl12LoopStart)
+
+L(Shl12LoopExit):
+	movaps	(%edx), %xmm6
+	psrldq	$4, %xmm6
+	mov	$4, %esi
+	palignr	$12, %xmm1, %xmm6
+	movaps	%xmm6, (%edx)
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl13):
+	movaps	-13(%ecx), %xmm1
+	movaps	3(%ecx), %xmm2
+L(Shl13Start):
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit13Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl13LoopExit)
+
+	palignr	$13, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	19(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit13Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl13LoopExit)
+
+	palignr	$13, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	19(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit13Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl13LoopExit)
+
+	palignr	$13, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	19(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit13Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl13LoopExit)
+
+	palignr	$13, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	lea	19(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	lea	-3(%ecx), %ecx
+	sub	%eax, %edx
+# ifdef USE_AS_STRNCPY
+	add	%eax, %ebx
+# endif
+	movaps	-13(%ecx), %xmm1
+
+L(Shl13LoopStart):
+	movaps	3(%ecx), %xmm2
+	movaps	19(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	35(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	51(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$13, %xmm4, %xmm5
+	test	%eax, %eax
+	palignr	$13, %xmm3, %xmm4
+	jnz	L(Shl13Start)
+# ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeave13)
+# endif
+	palignr	$13, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$13, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl13LoopStart)
+
+L(Shl13LoopExit):
+	movaps	(%edx), %xmm6
+	psrldq	$3, %xmm6
+	mov	$3, %esi
+	palignr	$13, %xmm1, %xmm6
+	movaps	%xmm6, (%edx)
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl14):
+	movaps	-14(%ecx), %xmm1
+	movaps	2(%ecx), %xmm2
+L(Shl14Start):
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit14Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl14LoopExit)
+
+	palignr	$14, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	18(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit14Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl14LoopExit)
+
+	palignr	$14, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	18(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit14Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl14LoopExit)
+
+	palignr	$14, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	18(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit14Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl14LoopExit)
+
+	palignr	$14, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	lea	18(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	lea	-2(%ecx), %ecx
+	sub	%eax, %edx
+# ifdef USE_AS_STRNCPY
+	add	%eax, %ebx
+# endif
+	movaps	-14(%ecx), %xmm1
+
+L(Shl14LoopStart):
+	movaps	2(%ecx), %xmm2
+	movaps	18(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	34(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	50(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$14, %xmm4, %xmm5
+	test	%eax, %eax
+	palignr	$14, %xmm3, %xmm4
+	jnz	L(Shl14Start)
+# ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeave14)
+# endif
+	palignr	$14, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$14, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl14LoopStart)
+
+L(Shl14LoopExit):
+	movaps	(%edx), %xmm6
+	psrldq	$2, %xmm6
+	mov	$2, %esi
+	palignr	$14, %xmm1, %xmm6
+	movaps	%xmm6, (%edx)
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl15):
+	movaps	-15(%ecx), %xmm1
+	movaps	1(%ecx), %xmm2
+L(Shl15Start):
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit15Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl15LoopExit)
+
+	palignr	$15, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	17(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit15Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl15LoopExit)
+
+	palignr	$15, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	17(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit15Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl15LoopExit)
+
+	palignr	$15, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	17(%ecx), %xmm2
+
+	pcmpeqb	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(StrncpyExit15Case2OrCase3)
+# endif
+	test	%eax, %eax
+	jnz	L(Shl15LoopExit)
+
+	palignr	$15, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	lea	17(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	lea	-1(%ecx), %ecx
+	sub	%eax, %edx
+# ifdef USE_AS_STRNCPY
+	add	%eax, %ebx
+# endif
+	movaps	-15(%ecx), %xmm1
+
+L(Shl15LoopStart):
+	movaps	1(%ecx), %xmm2
+	movaps	17(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	33(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	49(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqb	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$15, %xmm4, %xmm5
+	test	%eax, %eax
+	palignr	$15, %xmm3, %xmm4
+	jnz	L(Shl15Start)
+# ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(StrncpyLeave15)
+# endif
+	palignr	$15, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$15, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl15LoopStart)
+
+L(Shl15LoopExit):
+	movaps	(%edx), %xmm6
+	psrldq	$1, %xmm6
+	mov	$1, %esi
+	palignr	$15, %xmm1, %xmm6
+	movaps	%xmm6, (%edx)
+
+	.p2align 4
+L(CopyFrom1To16Bytes):
+# ifdef USE_AS_STRNCPY
+	add	$16, %ebx
+# endif
+	add	%esi, %edx
+	add	%esi, %ecx
+
+	POP	(%esi)
+	test	%al, %al
+	jz	L(ExitHigh)
+	test	$0x01, %al
+	jnz	L(Exit1)
+	test	$0x02, %al
+	jnz	L(Exit2)
+	test	$0x04, %al
+	jnz	L(Exit3)
+	test	$0x08, %al
+	jnz	L(Exit4)
+	test	$0x10, %al
+	jnz	L(Exit5)
+	test	$0x20, %al
+	jnz	L(Exit6)
+	test	$0x40, %al
+	jnz	L(Exit7)
+
+	.p2align 4
+L(Exit8):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+# ifdef USE_AS_STPCPY
+	lea	7(%edx), %eax
+# else
+	movl	%edi, %eax
+# endif
+# ifdef USE_AS_STRNCPY
+	sub	$8, %ebx
+	lea	8(%edx), %ecx
+	jnz	L(StrncpyFillTailWithZero1)
+#  ifdef USE_AS_STPCPY
+	cmpb	$1, (%eax)
+	sbb	$-1, %eax
+#  endif
+# endif
+	RETURN1
+
+	.p2align 4
+L(ExitHigh):
+	test	$0x01, %ah
+	jnz	L(Exit9)
+	test	$0x02, %ah
+	jnz	L(Exit10)
+	test	$0x04, %ah
+	jnz	L(Exit11)
+	test	$0x08, %ah
+	jnz	L(Exit12)
+	test	$0x10, %ah
+	jnz	L(Exit13)
+	test	$0x20, %ah
+	jnz	L(Exit14)

[... 9541 lines stripped ...]