[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Commits] r17844 - in /fsf/trunk/libc: ./ crypt/ iconvdata/ resolv/ resolv/nss_dns/ sysdeps/i386/i686/multiarch/ sysdeps/unix/sysv/lin...



Author: eglibc
Date: Sat Mar 31 00:01:56 2012
New Revision: 17844

Log:
Import glibc-mainline for 2012-03-31

Added:
    fsf/trunk/libc/wcsmbs/tst-mbsnrtowcs.c
Modified:
    fsf/trunk/libc/ChangeLog
    fsf/trunk/libc/NEWS
    fsf/trunk/libc/crypt/md5-crypt.c
    fsf/trunk/libc/crypt/sha256-crypt.c
    fsf/trunk/libc/crypt/sha512-crypt.c
    fsf/trunk/libc/iconvdata/tcvn5712-1.c
    fsf/trunk/libc/resolv/nss_dns/dns-host.c
    fsf/trunk/libc/resolv/res_send.c
    fsf/trunk/libc/sysdeps/i386/i686/multiarch/memcpy-ssse3.S
    fsf/trunk/libc/sysdeps/unix/sysv/linux/bits/socket.h
    fsf/trunk/libc/wcsmbs/Makefile

Modified: fsf/trunk/libc/ChangeLog
==============================================================================
--- fsf/trunk/libc/ChangeLog (original)
+++ fsf/trunk/libc/ChangeLog Sat Mar 31 00:01:56 2012
@@ -1,3 +1,42 @@
+2012-03-22  Liubov Dmitrieva  <liubov.dmitrieva@xxxxxxxxx>
+
+	* sysdeps/i386/i686/multiarch/memcpy-ssse3.S: Update.
+	Optimize memcpy with prefetch if
+	DATA_CACHE_SIZE_HALF <= len <  SHARED_CACHE_SIZE_HALF and
+	src, dst pointers have unequal 16 byte alignments.
+
+2012-03-30  Siddhesh Poyarekar  <siddhesh@xxxxxxxxxx>
+
+	[BZ #13928]
+	* resolv/nss_dns/dns-host.c (getanswer_r): Also consider ttl
+	from a CNAME entry and return the minimum ttl for the query.
+	(gaih_getanswer_slice): Likewise.
+
+2012-03-30  Jeff Law  <law@xxxxxxxxxx>
+
+	* crypt/md5-crypt.c (__md5_crypt_r): Avoid unbounded alloca uses
+	due to long keys.
+	* crypt/sha256-crypt.c (__sha256_crypt_r): Likewise.
+	* crypt/sha512-crypt.c (__sha512_crypt_r): Likewise.
+
+	* resolv/nss_dns/dns-host.c: Update copyright year.
+
+2012-03-30  Ulrich Drepper  <drepper@xxxxxxxxx>
+
+	* resolv/res_send.c (send_dg): Use sendmmsg if we have to write two
+	requests to save a system call.  Fix check that all bytes are sent.
+
+	* sysdeps/unix/sysv/linux/bits/socket.h (struct mmsghdr): Fix up
+	comments for sendmmsg.
+
+2012-03-30  Tulio Magno Quites Machado Filho  <tuliom@xxxxxxxxxxxxxxxxxx>
+
+	[BZ #13691]
+	* iconvdata/tcvn5712-1.c (FROM_LOOP): Fix a bug when converting strings
+	with only 1 character between 0x0041 and 0x01b0.
+	* wcsmbs/Makefile (tests): Add tst-mbsnrtowcs.
+	* wcsmbs/tst-mbsnrtowcs.c: New file.
+
 2012-03-29  David S. Miller  <davem@xxxxxxxxxxxxx>
 
 	* libio/fileops.c (_IO_new_file_xsputn): Don't try to optimize

Modified: fsf/trunk/libc/NEWS
==============================================================================
--- fsf/trunk/libc/NEWS (original)
+++ fsf/trunk/libc/NEWS Sat Mar 31 00:01:56 2012
@@ -15,10 +15,10 @@
   10110, 10135, 10140, 10210, 10545, 10716, 11174, 11322, 11365, 11451,
   11494, 12047, 13058, 13525, 13526, 13527, 13528, 13529, 13530, 13531,
   13532, 13533, 13547, 13551, 13552, 13553, 13555, 13559, 13566, 13583,
-  13618, 13637, 13656, 13658, 13673, 13695, 13704, 13706, 13726, 13738,
-  13760, 13761, 13786, 13792, 13806, 13824, 13840, 13841, 13844, 13846,
-  13851, 13852, 13854, 13871, 13879, 13883, 13892, 13910, 13911, 13912,
-  13913, 13915, 13916, 13917, 13918, 13919, 13920, 13921
+  13618, 13637, 13656, 13658, 13673, 13691, 13695, 13704, 13706, 13726,
+  13738, 13760, 13761, 13786, 13792, 13806, 13824, 13840, 13841, 13844,
+  13846, 13851, 13852, 13854, 13871, 13879, 13883, 13892, 13910, 13911,
+  13912, 13913, 13915, 13916, 13917, 13918, 13919, 13920, 13921, 13928
 
 * ISO C11 support:
 

Modified: fsf/trunk/libc/crypt/md5-crypt.c
==============================================================================
--- fsf/trunk/libc/crypt/md5-crypt.c (original)
+++ fsf/trunk/libc/crypt/md5-crypt.c Sat Mar 31 00:01:56 2012
@@ -1,6 +1,6 @@
 /* One way encryption based on MD5 sum.
    Compatible with the behavior of MD5 crypt introduced in FreeBSD 2.0.
-   Copyright (C) 1996, 1997, 1999, 2000, 2001, 2002, 2004, 2009
+   Copyright (C) 1996, 1997, 1999, 2000, 2001, 2002, 2004, 2009, 2012
    Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Ulrich Drepper <drepper@xxxxxxxxxx>, 1996.
@@ -107,6 +107,8 @@
   char *cp;
   char *copied_key = NULL;
   char *copied_salt = NULL;
+  char *free_key = NULL;
+  size_t alloca_used = 0;
 
   /* Find beginning of salt string.  The prefix should normally always
      be present.  Just in case it is not.  */
@@ -119,7 +121,17 @@
 
   if ((key - (char *) 0) % __alignof__ (md5_uint32) != 0)
     {
-      char *tmp = (char *) alloca (key_len + __alignof__ (md5_uint32));
+      char *tmp;
+
+      if (__libc_use_alloca (alloca_used + key_len + __alignof__ (md5_uint32)))
+	tmp = (char *) alloca (key_len + __alignof__ (md5_uint32));
+      else
+	{
+	  free_key = tmp = (char *) malloc (key_len + __alignof__ (md5_uint32));
+	  if (tmp == NULL)
+	    return NULL;
+	}
+
       key = copied_key =
 	memcpy (tmp + __alignof__ (md5_uint32)
 		- (tmp - (char *) 0) % __alignof__ (md5_uint32),
@@ -141,7 +153,10 @@
   /* Initialize libfreebl3.  */
   NSSLOWInitContext *nss_ictx = NSSLOW_Init ();
   if (nss_ictx == NULL)
-    return NULL;
+    {
+      free (free_key);
+      return NULL;
+    }
   NSSLOWHASHContext *nss_ctx = NULL;
   NSSLOWHASHContext *nss_alt_ctx = NULL;
 #else
@@ -295,6 +310,7 @@
   if (copied_salt != NULL)
     memset (copied_salt, '\0', salt_len);
 
+  free (free_key);
   return buffer;
 }
 

Modified: fsf/trunk/libc/crypt/sha256-crypt.c
==============================================================================
--- fsf/trunk/libc/crypt/sha256-crypt.c (original)
+++ fsf/trunk/libc/crypt/sha256-crypt.c Sat Mar 31 00:01:56 2012
@@ -1,5 +1,5 @@
 /* One way encryption based on SHA256 sum.
-   Copyright (C) 2007, 2009 Free Software Foundation, Inc.
+   Copyright (C) 2007, 2009, 2012 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Ulrich Drepper <drepper@xxxxxxxxxx>, 2007.
 
@@ -122,6 +122,9 @@
   /* Default number of rounds.  */
   size_t rounds = ROUNDS_DEFAULT;
   bool rounds_custom = false;
+  size_t alloca_used = 0;
+  char *free_key = NULL;
+  char *free_pbytes = NULL;
 
   /* Find beginning of salt string.  The prefix should normally always
      be present.  Just in case it is not.  */
@@ -148,7 +151,17 @@
 
   if ((key - (char *) 0) % __alignof__ (uint32_t) != 0)
     {
-      char *tmp = (char *) alloca (key_len + __alignof__ (uint32_t));
+      char *tmp;
+
+      if (__libc_use_alloca (alloca_used + key_len + __alignof__ (uint32_t)))
+	tmp = alloca_account (key_len + __alignof__ (uint32_t), alloca_used);
+      else
+	{
+	  free_key = tmp = (char *) malloc (key_len + __alignof__ (uint32_t));
+	  if (tmp == NULL)
+	    return NULL;
+	}
+
       key = copied_key =
 	memcpy (tmp + __alignof__ (uint32_t)
 		- (tmp - (char *) 0) % __alignof__ (uint32_t),
@@ -159,6 +172,7 @@
   if ((salt - (char *) 0) % __alignof__ (uint32_t) != 0)
     {
       char *tmp = (char *) alloca (salt_len + __alignof__ (uint32_t));
+      alloca_used += salt_len + __alignof__ (uint32_t);
       salt = copied_salt =
 	memcpy (tmp + __alignof__ (uint32_t)
 		- (tmp - (char *) 0) % __alignof__ (uint32_t),
@@ -170,7 +184,10 @@
   /* Initialize libfreebl3.  */
   NSSLOWInitContext *nss_ictx = NSSLOW_Init ();
   if (nss_ictx == NULL)
-    return NULL;
+    {
+      free (free_key);
+      return NULL;
+    }
   NSSLOWHASHContext *nss_ctx = NULL;
   NSSLOWHASHContext *nss_alt_ctx = NULL;
 #else
@@ -233,7 +250,18 @@
   sha256_finish_ctx (&alt_ctx, nss_alt_ctx, temp_result);
 
   /* Create byte sequence P.  */
-  cp = p_bytes = alloca (key_len);
+  if (__libc_use_alloca (alloca_used + key_len))
+    cp = p_bytes = (char *) alloca (key_len);
+  else
+    {
+      free_pbytes = cp = p_bytes = (char *)malloc (key_len);
+      if (free_pbytes == NULL)
+	{
+	  free (free_key);
+	  return NULL;
+	}
+    }
+
   for (cnt = key_len; cnt >= 32; cnt -= 32)
     cp = mempcpy (cp, temp_result, 32);
   memcpy (cp, temp_result, cnt);
@@ -361,6 +389,8 @@
   if (copied_salt != NULL)
     memset (copied_salt, '\0', salt_len);
 
+  free (free_key);
+  free (free_pbytes);
   return buffer;
 }
 

Modified: fsf/trunk/libc/crypt/sha512-crypt.c
==============================================================================
--- fsf/trunk/libc/crypt/sha512-crypt.c (original)
+++ fsf/trunk/libc/crypt/sha512-crypt.c Sat Mar 31 00:01:56 2012
@@ -1,5 +1,5 @@
 /* One way encryption based on SHA512 sum.
-   Copyright (C) 2007, 2009 Free Software Foundation, Inc.
+   Copyright (C) 2007, 2009, 2012 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Ulrich Drepper <drepper@xxxxxxxxxx>, 2007.
 
@@ -122,6 +122,9 @@
   /* Default number of rounds.  */
   size_t rounds = ROUNDS_DEFAULT;
   bool rounds_custom = false;
+  size_t alloca_used = 0;
+  char *free_key = NULL;
+  char *free_pbytes = NULL;
 
   /* Find beginning of salt string.  The prefix should normally always
      be present.  Just in case it is not.  */
@@ -148,7 +151,17 @@
 
   if ((key - (char *) 0) % __alignof__ (uint64_t) != 0)
     {
-      char *tmp = (char *) alloca (key_len + __alignof__ (uint64_t));
+      char *tmp;
+
+      if (__libc_use_alloca (alloca_used + key_len + __alignof__ (uint64_t)))
+	tmp = alloca_account (key_len + __alignof__ (uint64_t), alloca_used);
+      else
+	{
+	  free_key = tmp = (char *) malloc (key_len + __alignof__ (uint64_t));
+	  if (tmp == NULL)
+	    return NULL;
+	}
+
       key = copied_key =
 	memcpy (tmp + __alignof__ (uint64_t)
 		- (tmp - (char *) 0) % __alignof__ (uint64_t),
@@ -170,7 +183,10 @@
   /* Initialize libfreebl3.  */
   NSSLOWInitContext *nss_ictx = NSSLOW_Init ();
   if (nss_ictx == NULL)
-    return NULL;
+    {
+      free (free_key);
+      return NULL;
+    }
   NSSLOWHASHContext *nss_ctx = NULL;
   NSSLOWHASHContext *nss_alt_ctx = NULL;
 #else
@@ -233,7 +249,18 @@
   sha512_finish_ctx (&alt_ctx, nss_alt_ctx, temp_result);
 
   /* Create byte sequence P.  */
-  cp = p_bytes = alloca (key_len);
+  if (__libc_use_alloca (alloca_used + key_len))
+    cp = p_bytes = (char *) alloca (key_len);
+  else
+    {
+      free_pbytes = cp = p_bytes = (char *)malloc (key_len);
+      if (free_pbytes == NULL)
+	{
+	  free (free_key);
+	  return NULL;
+	}
+    }
+
   for (cnt = key_len; cnt >= 64; cnt -= 64)
     cp = mempcpy (cp, temp_result, 64);
   memcpy (cp, temp_result, cnt);
@@ -373,6 +400,8 @@
   if (copied_salt != NULL)
     memset (copied_salt, '\0', salt_len);
 
+  free (free_key);
+  free (free_pbytes);
   return buffer;
 }
 

Modified: fsf/trunk/libc/iconvdata/tcvn5712-1.c
==============================================================================
--- fsf/trunk/libc/iconvdata/tcvn5712-1.c (original)
+++ fsf/trunk/libc/iconvdata/tcvn5712-1.c Sat Mar 31 00:01:56 2012
@@ -1,5 +1,5 @@
 /* Conversion to and from TCVN5712-1.
-   Copyright (C) 2001, 2002, 2004, 2011 Free Software Foundation, Inc.
+   Copyright (C) 2001-2012 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Ulrich Drepper <drepper@xxxxxxxxxx>, 2001.
 
@@ -379,7 +379,7 @@
     last_ch = *statep >> 3;						      \
 									      \
     /* We have to buffer ch if it is a possible match in comp_table_data.  */ \
-    must_buffer_ch = (ch >= 0x0041 && ch <= 0x01b0);			      \
+    must_buffer_ch = last_ch && (ch >= 0x0041 && ch <= 0x01b0);		      \
 									      \
     if (last_ch)							      \
       {									      \

Modified: fsf/trunk/libc/resolv/nss_dns/dns-host.c
==============================================================================
--- fsf/trunk/libc/resolv/nss_dns/dns-host.c (original)
+++ fsf/trunk/libc/resolv/nss_dns/dns-host.c Sat Mar 31 00:01:56 2012
@@ -1,4 +1,4 @@
-/* Copyright (C) 1996-2004, 2007-2009, 2010 Free Software Foundation, Inc.
+/* Copyright (C) 1996-2004, 2007-2010, 2012 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Extended from original form by Ulrich Drepper <drepper@xxxxxxxxxx>, 1996.
 
@@ -744,6 +744,10 @@
 
       if ((qtype == T_A || qtype == T_AAAA) && type == T_CNAME)
 	{
+	  /* A CNAME could also have a TTL entry.  */
+	  if (ttlp != NULL && ttl < *ttlp)
+	      *ttlp = ttl;
+
 	  if (ap >= &host_data->aliases[MAX_NR_ALIASES - 1])
 	    continue;
 	  n = dn_expand (answer->buf, end_of_message, cp, tbuf, sizeof tbuf);
@@ -905,7 +909,10 @@
 	    {
 	      register int nn;
 
-	      if (ttlp != NULL)
+	      /* We compose a single hostent out of the entire chain of
+	         entries, so the TTL of the hostent is essentially the lowest
+		 TTL in the chain.  */
+	      if (ttlp != NULL && ttl < *ttlp)
 		*ttlp = ttl;
 	      if (canonp != NULL)
 		*canonp = bp;
@@ -1081,6 +1088,11 @@
       if (type == T_CNAME)
 	{
 	  char tbuf[MAXDNAME];
+
+	  /* A CNAME could also have a TTL entry.  */
+	  if (ttlp != NULL && ttl < *ttlp)
+	      *ttlp = ttl;
+
 	  n = dn_expand (answer->buf, end_of_message, cp, tbuf, sizeof tbuf);
 	  if (__builtin_expect (n < 0 || res_hnok (tbuf) == 0, 0))
 	    {
@@ -1161,7 +1173,10 @@
 
       if (*firstp)
 	{
-	  if (ttlp != NULL)
+	  /* We compose a single hostent out of the entire chain of
+	     entries, so the TTL of the hostent is essentially the lowest
+	     TTL in the chain.  */
+	  if (ttlp != NULL && ttl < *ttlp)
 	    *ttlp = ttl;
 
 	  (*pat)->name = canon ?: h_name;

Modified: fsf/trunk/libc/resolv/res_send.c
==============================================================================
--- fsf/trunk/libc/resolv/res_send.c (original)
+++ fsf/trunk/libc/resolv/res_send.c Sat Mar 31 00:01:56 2012
@@ -1013,8 +1013,9 @@
 		seconds /= statp->nscount;
 	if (seconds <= 0)
 		seconds = 1;
-	bool single_request = (statp->options & RES_SNGLKUP) != 0;
 	bool single_request_reopen = (statp->options & RES_SNGLKUPREOP) != 0;
+	bool single_request = (((statp->options & RES_SNGLKUP) != 0)
+			       | single_request_reopen);
 	int save_gotsomewhere = *gotsomewhere;
 
 	int retval;
@@ -1100,24 +1101,91 @@
 	}
 	__set_errno (0);
 	if (pfd[0].revents & POLLOUT) {
-		ssize_t sr;
-		if (nwritten != 0)
-		  sr = send (pfd[0].fd, buf2, buflen2, MSG_NOSIGNAL);
+#ifndef __ASSUME_SENDMMSG
+		static int have_sendmmsg;
+#else
+# define have_sendmmsg 1
+#endif
+		if (have_sendmmsg >= 0 && nwritten == 0 && buf2 != NULL
+		    && !single_request)
+		  {
+		    struct iovec iov[2];
+		    struct mmsghdr reqs[2];
+		    reqs[0].msg_hdr.msg_name = NULL;
+		    reqs[0].msg_hdr.msg_namelen = 0;
+		    reqs[0].msg_hdr.msg_iov = &iov[0];
+		    reqs[0].msg_hdr.msg_iovlen = 1;
+		    iov[0].iov_base = (void *) buf;
+		    iov[0].iov_len = buflen;
+		    reqs[0].msg_hdr.msg_control = NULL;
+		    reqs[0].msg_hdr.msg_controllen = 0;
+
+		    reqs[1].msg_hdr.msg_name = NULL;
+		    reqs[1].msg_hdr.msg_namelen = 0;
+		    reqs[1].msg_hdr.msg_iov = &iov[1];
+		    reqs[1].msg_hdr.msg_iovlen = 1;
+		    iov[1].iov_base = (void *) buf2;
+		    iov[1].iov_len = buflen2;
+		    reqs[1].msg_hdr.msg_control = NULL;
+		    reqs[1].msg_hdr.msg_controllen = 0;
+
+		    int ndg = sendmmsg (pfd[0].fd, reqs, 2, MSG_NOSIGNAL);
+		    if (__builtin_expect (ndg == 2, 1))
+		      {
+			if (reqs[0].msg_len != buflen
+			    || reqs[1].msg_len != buflen2)
+			  goto fail_sendmmsg;
+
+			pfd[0].events = POLLIN;
+			nwritten += 2;
+		      }
+		    else if (ndg == 1 && reqs[0].msg_len == buflen)
+		      goto just_one;
+		    else if (ndg < 0 && (errno == EINTR || errno == EAGAIN))
+		      goto recompute_resend;
+		    else
+		      {
+#ifndef __ASSUME_SENDMMSG
+			if (__builtin_expect (have_sendmmsg == 0, 0))
+			  {
+			    if (ndg < 0 && errno == ENOSYS)
+			      {
+				have_sendmmsg = -1;
+				goto try_send;
+			      }
+			    have_sendmmsg = 1;
+			  }
+#endif
+
+		      fail_sendmmsg:
+			Perror(statp, stderr, "sendmmsg", errno);
+			goto err_out;
+		      }
+		  }
 		else
-		  sr = send (pfd[0].fd, buf, buflen, MSG_NOSIGNAL);
-
-		if (sr != buflen) {
-			if (errno == EINTR || errno == EAGAIN)
-				goto recompute_resend;
-			Perror(statp, stderr, "send", errno);
-			goto err_out;
-		}
-		if (nwritten != 0 || buf2 == NULL
-		    || single_request || single_request_reopen)
-		  pfd[0].events = POLLIN;
-		else
-		  pfd[0].events = POLLIN | POLLOUT;
-		++nwritten;
+		  {
+		    ssize_t sr;
+#ifndef __ASSUME_SENDMMSG
+		  try_send:
+#endif
+		    if (nwritten != 0)
+		      sr = send (pfd[0].fd, buf2, buflen2, MSG_NOSIGNAL);
+		    else
+		      sr = send (pfd[0].fd, buf, buflen, MSG_NOSIGNAL);
+
+		    if (sr != (nwritten != 0 ? buflen2 : buflen)) {
+		      if (errno == EINTR || errno == EAGAIN)
+			goto recompute_resend;
+		      Perror(statp, stderr, "send", errno);
+		      goto err_out;
+		    }
+		  just_one:
+		    if (nwritten != 0 || buf2 == NULL || single_request)
+		      pfd[0].events = POLLIN;
+		    else
+		      pfd[0].events = POLLIN | POLLOUT;
+		    ++nwritten;
+		  }
 		goto wait;
 	} else if (pfd[0].revents & POLLIN) {
 		int *thisanssizp;
@@ -1327,7 +1395,7 @@
 			recvresp2 = 1;
 		/* Repeat waiting if we have a second answer to arrive.  */
 		if ((recvresp1 & recvresp2) == 0) {
-			if (single_request || single_request_reopen) {
+			if (single_request) {
 				pfd[0].events = POLLOUT;
 				if (single_request_reopen) {
 					__res_iclose (statp, false);

Modified: fsf/trunk/libc/sysdeps/i386/i686/multiarch/memcpy-ssse3.S
==============================================================================
--- fsf/trunk/libc/sysdeps/i386/i686/multiarch/memcpy-ssse3.S (original)
+++ fsf/trunk/libc/sysdeps/i386/i686/multiarch/memcpy-ssse3.S Sat Mar 31 00:01:56 2012
@@ -17,109 +17,100 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-#include <sysdep.h>
-
 #if !defined NOT_IN_libc \
     && (defined SHARED \
 	|| defined USE_AS_MEMMOVE \
 	|| !defined USE_MULTIARCH)
 
-#include "asm-syntax.h"
-
-#ifndef MEMCPY
-# define MEMCPY		__memcpy_ssse3
-# define MEMCPY_CHK	__memcpy_chk_ssse3
-#endif
-
-#ifdef USE_AS_BCOPY
-# define SRC		PARMS
-# define DEST		SRC+4
-# define LEN		DEST+4
-#else
-# define DEST		PARMS
-# define SRC		DEST+4
-# define LEN		SRC+4
-#endif
-
-#define CFI_PUSH(REG)						\
-  cfi_adjust_cfa_offset (4);					\
+# include <sysdep.h>
+# include "asm-syntax.h"
+
+# ifndef MEMCPY
+#  define MEMCPY		__memcpy_ssse3
+#  define MEMCPY_CHK	__memcpy_chk_ssse3
+# endif
+
+# ifdef USE_AS_BCOPY
+#  define SRC		PARMS
+#  define DEST		SRC+4
+#  define LEN		DEST+4
+# else
+#  define DEST		PARMS
+#  define SRC		DEST+4
+#  define LEN		SRC+4
+# endif
+
+# define CFI_PUSH(REG)		\
+  cfi_adjust_cfa_offset (4);		\
   cfi_rel_offset (REG, 0)
 
-#define CFI_POP(REG)						\
-  cfi_adjust_cfa_offset (-4);					\
+# define CFI_POP(REG)		\
+  cfi_adjust_cfa_offset (-4);		\
   cfi_restore (REG)
 
-#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
-#define POP(REG)	popl REG; CFI_POP (REG)
-
-#ifdef SHARED
-# define PARMS		8		/* Preserve EBX.  */
-# define ENTRANCE	PUSH (%ebx);
-# define RETURN_END	POP (%ebx); ret
-# define RETURN		RETURN_END; CFI_PUSH (%ebx)
-# define JMPTBL(I, B)	I - B
+# define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+# define POP(REG)	popl REG; CFI_POP (REG)
+
+# ifdef SHARED
+#  define PARMS		8		/* Preserve EBX.  */
+#  define ENTRANCE	PUSH (%ebx);
+#  define RETURN_END	POP (%ebx); ret
+#  define RETURN		RETURN_END; CFI_PUSH (%ebx)
+#  define JMPTBL(I, B)	I - B
 
 /* Load an entry in a jump table into EBX and branch to it.  TABLE is a
-   jump table with relative offsets.  INDEX is a register contains the
-   index into the jump table.   SCALE is the scale of INDEX. */
-# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
-    /* We first load PC into EBX.  */				\
-    SETUP_PIC_REG(bx);						\
-    /* Get the address of the jump table.  */			\
-    addl	$(TABLE - .), %ebx;				\
-    /* Get the entry and convert the relative offset to the	\
-       absolute address.  */					\
-    addl	(%ebx,INDEX,SCALE), %ebx;			\
-    /* We loaded the jump table.  Go.  */			\
-    jmp		*%ebx
-
-# define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE)			\
-    addl	$(TABLE - .), %ebx
-
-# define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE)	\
-    addl	(%ebx,INDEX,SCALE), %ebx;			\
-    /* We loaded the jump table.  Go.  */			\
-    jmp		*%ebx
-#else
-# define PARMS		4
-# define ENTRANCE
-# define RETURN_END	ret
-# define RETURN		RETURN_END
-# define JMPTBL(I, B)	I
+	jump table with relative offsets.  INDEX is a register contains the
+	index into the jump table.   SCALE is the scale of INDEX. */
+
+#  define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
+    /* We first load PC into EBX.  */		\
+	SETUP_PIC_REG(bx);		\
+    /* Get the address of the jump table.  */		\
+	addl	$(TABLE - .), %ebx;		\
+    /* Get the entry and convert the relative offset to the		\
+	absolute	address.  */		\
+	addl	(%ebx, INDEX, SCALE), %ebx;		\
+    /* We loaded the jump table.  Go.  */		\
+	jmp	*%ebx
+# else
+
+#  define PARMS		4
+#  define ENTRANCE
+#  define RETURN_END	ret
+#  define RETURN		RETURN_END
+#  define JMPTBL(I, B)	I
 
 /* Branch to an entry in a jump table.  TABLE is a jump table with
-   absolute offsets.  INDEX is a register contains the index into the
-   jump table.  SCALE is the scale of INDEX. */
-# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
-    jmp		*TABLE(,INDEX,SCALE)
-
-# define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE)
-
-# define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE)		\
-    jmp		*TABLE(,INDEX,SCALE)
-#endif
+	absolute offsets.  INDEX is a register contains the index into the
+	jump table.  SCALE is the scale of INDEX. */
+
+#  define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
+	jmp	*TABLE(, INDEX, SCALE)
+# endif
 
 	.section .text.ssse3,"ax",@progbits
-#if !defined USE_AS_BCOPY
+# if !defined USE_AS_BCOPY
 ENTRY (MEMCPY_CHK)
 	movl	12(%esp), %eax
 	cmpl	%eax, 16(%esp)
 	jb	HIDDEN_JUMPTARGET (__chk_fail)
 END (MEMCPY_CHK)
-#endif
+# endif
 ENTRY (MEMCPY)
 	ENTRANCE
 	movl	LEN(%esp), %ecx
 	movl	SRC(%esp), %eax
 	movl	DEST(%esp), %edx
 
-#ifdef USE_AS_MEMMOVE
+# ifdef USE_AS_MEMMOVE
 	cmp	%eax, %edx
 	jb	L(copy_forward)
 	je	L(fwd_write_0bytes)
 	cmp	$32, %ecx
 	jae	L(memmove_bwd)
 	jmp	L(bk_write_less32bytes_2)
+
+	.p2align 4
 L(memmove_bwd):
 	add	%ecx, %eax
 	cmp	%eax, %edx
@@ -127,67 +118,72 @@
 	jb	L(copy_backward)
 
 L(copy_forward):
-#endif
+# endif
 	cmp	$48, %ecx
 	jae	L(48bytesormore)
 
 L(fwd_write_less32bytes):
-#ifndef USE_AS_MEMMOVE
+# ifndef USE_AS_MEMMOVE
 	cmp	%dl, %al
 	jb	L(bk_write)
-#endif
+# endif
 	add	%ecx, %edx
 	add	%ecx, %eax
 	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
-#ifndef USE_AS_MEMMOVE
+# ifndef USE_AS_MEMMOVE
+	.p2align 4
 L(bk_write):
 	BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
-#endif
-
-	ALIGN (4)
-/* ECX > 32 and EDX is 4 byte aligned.  */
+# endif
+
+	.p2align 4
 L(48bytesormore):
+# ifndef USE_AS_MEMMOVE
+	movlpd	(%eax), %xmm0
+	movlpd	8(%eax), %xmm1
+	movlpd	%xmm0, (%edx)
+	movlpd	%xmm1, 8(%edx)
+# else
 	movdqu	(%eax), %xmm0
+# endif
 	PUSH (%edi)
 	movl	%edx, %edi
 	and	$-16, %edx
-	PUSH (%esi)
-	cfi_remember_state
 	add	$16, %edx
-	movl	%edi, %esi
 	sub	%edx, %edi
 	add	%edi, %ecx
 	sub	%edi, %eax
 
-#ifdef SHARED_CACHE_SIZE_HALF
+# ifdef SHARED_CACHE_SIZE_HALF
 	cmp	$SHARED_CACHE_SIZE_HALF, %ecx
-#else
-# ifdef SHARED
+# else
+#  ifdef SHARED
 	SETUP_PIC_REG(bx)
 	add	$_GLOBAL_OFFSET_TABLE_, %ebx
 	cmp	__x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
-# else
+#  else
 	cmp	__x86_shared_cache_size_half, %ecx
-# endif
-#endif
+#  endif
+# endif
 
 	mov	%eax, %edi
 	jae	L(large_page)
 	and	$0xf, %edi
 	jz	L(shl_0)
-
 	BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4)
 
-	cfi_restore_state
-	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shl_0):
-	movdqu	%xmm0, (%esi)
+# ifdef USE_AS_MEMMOVE
+	movl	DEST+4(%esp), %edi
+	movdqu	%xmm0, (%edi)
+# endif
 	xor	%edi, %edi
-	POP (%esi)
 	cmp	$127, %ecx
 	ja	L(shl_0_gobble)
 	lea	-32(%ecx), %ecx
+
+	.p2align 4
 L(shl_0_loop):
 	movdqa	(%eax, %edi), %xmm0
 	movdqa	16(%eax, %edi), %xmm1
@@ -219,6 +215,7 @@
 	movdqa	%xmm0, (%edx, %edi)
 	movdqa	%xmm1, 16(%edx, %edi)
 	lea	32(%edi), %edi
+
 L(shl_0_end):
 	lea	32(%ecx), %ecx
 	add	%ecx, %edi
@@ -228,23 +225,25 @@
 	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4)
 
 	CFI_PUSH (%edi)
+
+	.p2align 4
 L(shl_0_gobble):
-
-#ifdef DATA_CACHE_SIZE_HALF
+# ifdef DATA_CACHE_SIZE_HALF
 	cmp	$DATA_CACHE_SIZE_HALF, %ecx
-#else
-# ifdef SHARED
+# else
+#  ifdef SHARED
 	SETUP_PIC_REG(bx)
 	add	$_GLOBAL_OFFSET_TABLE_, %ebx
 	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
-# else
+#  else
 	cmp	__x86_data_cache_size_half, %ecx
-# endif
-#endif
-
-	POP (%edi)
+#  endif
+# endif
+	POP	(%edi)
 	lea	-128(%ecx), %ecx
 	jae	L(shl_0_gobble_mem_loop)
+
+	.p2align 4
 L(shl_0_gobble_cache_loop):
 	movdqa	(%eax), %xmm0
 	movdqa	0x10(%eax), %xmm1
@@ -274,17 +273,15 @@
 	movdqa	(%eax), %xmm0
 	sub	$0x40, %ecx
 	movdqa	0x10(%eax), %xmm1
-
 	movdqa	%xmm0, (%edx)
 	movdqa	%xmm1, 0x10(%edx)
-
 	movdqa	0x20(%eax), %xmm0
 	movdqa	0x30(%eax), %xmm1
 	add	$0x40, %eax
-
 	movdqa	%xmm0, 0x20(%edx)
 	movdqa	%xmm1, 0x30(%edx)
 	add	$0x40, %edx
+
 L(shl_0_cache_less_64bytes):
 	cmp	$0x20, %ecx
 	jb	L(shl_0_cache_less_32bytes)
@@ -295,6 +292,7 @@
 	movdqa	%xmm0, (%edx)
 	movdqa	%xmm1, 0x10(%edx)
 	add	$0x20, %edx
+
 L(shl_0_cache_less_32bytes):
 	cmp	$0x10, %ecx
 	jb	L(shl_0_cache_less_16bytes)
@@ -303,13 +301,13 @@
 	add	$0x10, %eax
 	movdqa	%xmm0, (%edx)
 	add	$0x10, %edx
+
 L(shl_0_cache_less_16bytes):
 	add	%ecx, %edx
 	add	%ecx, %eax
 	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
 
-
-	ALIGN (4)
+	.p2align 4
 L(shl_0_gobble_mem_loop):
 	prefetcht0 0x1c0(%eax)
 	prefetcht0 0x280(%eax)
@@ -354,6 +352,7 @@
 	movdqa	%xmm0, 0x20(%edx)
 	movdqa	%xmm1, 0x30(%edx)
 	add	$0x40, %edx
+
 L(shl_0_mem_less_64bytes):
 	cmp	$0x20, %ecx
 	jb	L(shl_0_mem_less_32bytes)
@@ -364,6 +363,7 @@
 	movdqa	%xmm0, (%edx)
 	movdqa	%xmm1, 0x10(%edx)
 	add	$0x20, %edx
+
 L(shl_0_mem_less_32bytes):
 	cmp	$0x10, %ecx
 	jb	L(shl_0_mem_less_16bytes)
@@ -372,24 +372,84 @@
 	add	$0x10, %eax
 	movdqa	%xmm0, (%edx)
 	add	$0x10, %edx
+
 L(shl_0_mem_less_16bytes):
 	add	%ecx, %edx
 	add	%ecx, %eax
 	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4)
 
-	cfi_restore_state
-	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shl_1):
-	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+# ifndef USE_AS_MEMMOVE
+	movaps	-1(%eax), %xmm1
+# else
+	movl	DEST+4(%esp), %edi
+	movaps	-1(%eax), %xmm1
+	movdqu	%xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+# else
+#  ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+#  else
+	cmp	__x86_data_cache_size_half, %ecx
+#  endif
+# endif
+	jb L(sh_1_no_prefetch)
+
+	lea	-64(%ecx), %ecx
+
+	.p2align 4
+L(Shl1LoopStart):
+	prefetcht0 0x1c0(%eax)
+	prefetcht0 0x1c0(%edx)
+	movaps	15(%eax), %xmm2
+	movaps	31(%eax), %xmm3
+	movaps	47(%eax), %xmm4
+	movaps	63(%eax), %xmm5
+	movaps	%xmm5, %xmm7
+	palignr	$1, %xmm4, %xmm5
+	palignr	$1, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$1, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$1, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl1LoopStart)
+
+L(Shl1LoopLeave):
+	add	$32, %ecx
+	jle	L(shl_end_0)
+
+	movaps	15(%eax), %xmm2
+	movaps	31(%eax), %xmm3
+	palignr	$1, %xmm2, %xmm3
+	palignr	$1, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx
+	lea	32(%eax, %ecx), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(sh_1_no_prefetch):
+	lea	-32(%ecx), %ecx
 	lea	-1(%eax), %eax
-	movaps	(%eax), %xmm1
 	xor	%edi, %edi
-	lea	-32(%ecx), %ecx
-	movdqu	%xmm0, (%esi)
-	POP (%esi)
-L(shl_1_loop):
-
+
+	.p2align 4
+L(sh_1_no_prefetch_loop):
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
 	movdqa	32(%eax, %edi), %xmm3
@@ -399,8 +459,7 @@
 	lea	32(%edi), %edi
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
-
-	jb	L(shl_1_end)
+	jb	L(sh_1_end_no_prefetch_loop)
 
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
@@ -411,30 +470,90 @@
 	lea	32(%edi), %edi
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
-
-	jae	L(shl_1_loop)
-
-L(shl_1_end):
+	jae	L(sh_1_no_prefetch_loop)
+
+L(sh_1_end_no_prefetch_loop):
 	lea	32(%ecx), %ecx
 	add	%ecx, %edi
 	add	%edi, %edx
 	lea	1(%edi, %eax), %eax
+	POP	(%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(shl_2):
+# ifndef USE_AS_MEMMOVE
+	movaps	-2(%eax), %xmm1
+# else
+	movl	DEST+4(%esp), %edi
+	movaps	-2(%eax), %xmm1
+	movdqu	%xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+# else
+#  ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+#  else
+	cmp	__x86_data_cache_size_half, %ecx
+#  endif
+# endif
+	jb L(sh_2_no_prefetch)
+
+	lea	-64(%ecx), %ecx
+
+	.p2align 4
+L(Shl2LoopStart):
+	prefetcht0 0x1c0(%eax)
+	prefetcht0 0x1c0(%edx)
+	movaps	14(%eax), %xmm2
+	movaps	30(%eax), %xmm3
+	movaps	46(%eax), %xmm4
+	movaps	62(%eax), %xmm5
+	movaps	%xmm5, %xmm7
+	palignr	$2, %xmm4, %xmm5
+	palignr	$2, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$2, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$2, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl2LoopStart)
+
+L(Shl2LoopLeave):
+	add	$32, %ecx
+	jle	L(shl_end_0)
+
+	movaps	14(%eax), %xmm2
+	movaps	30(%eax), %xmm3
+	palignr	$2, %xmm2, %xmm3
+	palignr	$2, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx
+	lea	32(%eax, %ecx), %eax
 	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
-
-	cfi_restore_state
-	cfi_remember_state
-	ALIGN (4)
-L(shl_2):
-	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(sh_2_no_prefetch):
+	lea	-32(%ecx), %ecx
 	lea	-2(%eax), %eax
-	movaps	(%eax), %xmm1
 	xor	%edi, %edi
-	lea	-32(%ecx), %ecx
-	movdqu	%xmm0, (%esi)
-	POP (%esi)
-L(shl_2_loop):
-
+
+	.p2align 4
+L(sh_2_no_prefetch_loop):
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
 	movdqa	32(%eax, %edi), %xmm3
@@ -444,8 +563,7 @@
 	lea	32(%edi), %edi
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
-
-	jb	L(shl_2_end)
+	jb	L(sh_2_end_no_prefetch_loop)
 
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
@@ -456,30 +574,90 @@
 	lea	32(%edi), %edi
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
-
-	jae	L(shl_2_loop)
-
-L(shl_2_end):
+	jae	L(sh_2_no_prefetch_loop)
+
+L(sh_2_end_no_prefetch_loop):
 	lea	32(%ecx), %ecx
 	add	%ecx, %edi
 	add	%edi, %edx
 	lea	2(%edi, %eax), %eax
+	POP	(%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(shl_3):
+# ifndef USE_AS_MEMMOVE
+	movaps	-3(%eax), %xmm1
+# else
+	movl	DEST+4(%esp), %edi
+	movaps	-3(%eax), %xmm1
+	movdqu	%xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+# else
+#  ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+#  else
+	cmp	__x86_data_cache_size_half, %ecx
+#  endif
+# endif
+	jb L(sh_3_no_prefetch)
+
+	lea	-64(%ecx), %ecx
+
+	.p2align 4
+L(Shl3LoopStart):
+	prefetcht0 0x1c0(%eax)
+	prefetcht0 0x1c0(%edx)
+	movaps	13(%eax), %xmm2
+	movaps	29(%eax), %xmm3
+	movaps	45(%eax), %xmm4
+	movaps	61(%eax), %xmm5
+	movaps	%xmm5, %xmm7
+	palignr	$3, %xmm4, %xmm5
+	palignr	$3, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$3, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$3, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl3LoopStart)
+
+L(Shl3LoopLeave):
+	add	$32, %ecx
+	jle	L(shl_end_0)
+
+	movaps	13(%eax), %xmm2
+	movaps	29(%eax), %xmm3
+	palignr	$3, %xmm2, %xmm3
+	palignr	$3, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx
+	lea	32(%eax, %ecx), %eax
 	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
-
-	cfi_restore_state
-	cfi_remember_state
-	ALIGN (4)
-L(shl_3):
-	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(sh_3_no_prefetch):
+	lea	-32(%ecx), %ecx
 	lea	-3(%eax), %eax
-	movaps	(%eax), %xmm1
 	xor	%edi, %edi
-	lea	-32(%ecx), %ecx
-	movdqu	%xmm0, (%esi)
-	POP (%esi)
-L(shl_3_loop):
-
+
+	.p2align 4
+L(sh_3_no_prefetch_loop):
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
 	movdqa	32(%eax, %edi), %xmm3
@@ -490,7 +668,7 @@
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
 
-	jb	L(shl_3_end)
+	jb	L(sh_3_end_no_prefetch_loop)
 
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
@@ -502,29 +680,90 @@
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
 
-	jae	L(shl_3_loop)
-
-L(shl_3_end):
+	jae	L(sh_3_no_prefetch_loop)
+
+L(sh_3_end_no_prefetch_loop):
 	lea	32(%ecx), %ecx
 	add	%ecx, %edi
 	add	%edi, %edx
 	lea	3(%edi, %eax), %eax
+	POP	(%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(shl_4):
+# ifndef USE_AS_MEMMOVE
+	movaps	-4(%eax), %xmm1
+# else
+	movl	DEST+4(%esp), %edi
+	movaps	-4(%eax), %xmm1
+	movdqu	%xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+# else
+#  ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+#  else
+	cmp	__x86_data_cache_size_half, %ecx
+#  endif
+# endif
+	jb L(sh_4_no_prefetch)
+
+	lea	-64(%ecx), %ecx
+
+	.p2align 4
+L(Shl4LoopStart):
+	prefetcht0 0x1c0(%eax)
+	prefetcht0 0x1c0(%edx)
+	movaps	12(%eax), %xmm2
+	movaps	28(%eax), %xmm3
+	movaps	44(%eax), %xmm4
+	movaps	60(%eax), %xmm5
+	movaps	%xmm5, %xmm7
+	palignr	$4, %xmm4, %xmm5
+	palignr	$4, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$4, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl4LoopStart)
+
+L(Shl4LoopLeave):
+	add	$32, %ecx
+	jle	L(shl_end_0)
+
+	movaps	12(%eax), %xmm2
+	movaps	28(%eax), %xmm3
+	palignr	$4, %xmm2, %xmm3
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx
+	lea	32(%eax, %ecx), %eax
 	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
-
-	cfi_restore_state
-	cfi_remember_state
-	ALIGN (4)
-L(shl_4):
-	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(sh_4_no_prefetch):
+	lea	-32(%ecx), %ecx
 	lea	-4(%eax), %eax
-	movaps	(%eax), %xmm1
 	xor	%edi, %edi
-	lea	-32(%ecx), %ecx
-	movdqu	%xmm0, (%esi)
-	POP (%esi)
-L(shl_4_loop):
-
+
+	.p2align 4
+L(sh_4_no_prefetch_loop):
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
 	movdqa	32(%eax, %edi), %xmm3
@@ -535,7 +774,7 @@
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
 
-	jb	L(shl_4_end)
+	jb	L(sh_4_end_no_prefetch_loop)
 
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
@@ -547,29 +786,90 @@
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
 
-	jae	L(shl_4_loop)
-
-L(shl_4_end):
+	jae	L(sh_4_no_prefetch_loop)
+
+L(sh_4_end_no_prefetch_loop):
 	lea	32(%ecx), %ecx
 	add	%ecx, %edi
 	add	%edi, %edx
 	lea	4(%edi, %eax), %eax
+	POP	(%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(shl_5):
+# ifndef USE_AS_MEMMOVE
+	movaps	-5(%eax), %xmm1
+# else
+	movl	DEST+4(%esp), %edi
+	movaps	-5(%eax), %xmm1
+	movdqu	%xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+# else
+#  ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+#  else
+	cmp	__x86_data_cache_size_half, %ecx
+#  endif
+# endif
+	jb L(sh_5_no_prefetch)
+
+	lea	-64(%ecx), %ecx
+
+	.p2align 4
+L(Shl5LoopStart):
+	prefetcht0 0x1c0(%eax)
+	prefetcht0 0x1c0(%edx)
+	movaps	11(%eax), %xmm2
+	movaps	27(%eax), %xmm3
+	movaps	43(%eax), %xmm4
+	movaps	59(%eax), %xmm5
+	movaps	%xmm5, %xmm7
+	palignr	$5, %xmm4, %xmm5
+	palignr	$5, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$5, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$5, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl5LoopStart)
+
+L(Shl5LoopLeave):
+	add	$32, %ecx
+	jle	L(shl_end_0)
+
+	movaps	11(%eax), %xmm2
+	movaps	27(%eax), %xmm3
+	palignr	$5, %xmm2, %xmm3
+	palignr	$5, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx
+	lea	32(%eax, %ecx), %eax
 	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
-
-	cfi_restore_state
-	cfi_remember_state
-	ALIGN (4)
-L(shl_5):
-	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(sh_5_no_prefetch):
+	lea	-32(%ecx), %ecx
 	lea	-5(%eax), %eax
-	movaps	(%eax), %xmm1
 	xor	%edi, %edi
-	lea	-32(%ecx), %ecx
-	movdqu	%xmm0, (%esi)
-	POP (%esi)
-L(shl_5_loop):
-
+
+	.p2align 4
+L(sh_5_no_prefetch_loop):
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
 	movdqa	32(%eax, %edi), %xmm3
@@ -580,7 +880,7 @@
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
 
-	jb	L(shl_5_end)
+	jb	L(sh_5_end_no_prefetch_loop)
 
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
@@ -592,29 +892,90 @@
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
 
-	jae	L(shl_5_loop)
-
-L(shl_5_end):
+	jae	L(sh_5_no_prefetch_loop)
+
+L(sh_5_end_no_prefetch_loop):
 	lea	32(%ecx), %ecx
 	add	%ecx, %edi
 	add	%edi, %edx
 	lea	5(%edi, %eax), %eax
+	POP	(%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(shl_6):
+# ifndef USE_AS_MEMMOVE
+	movaps	-6(%eax), %xmm1
+# else
+	movl	DEST+4(%esp), %edi
+	movaps	-6(%eax), %xmm1
+	movdqu	%xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+# else
+#  ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+#  else
+	cmp	__x86_data_cache_size_half, %ecx
+#  endif
+# endif
+	jb L(sh_6_no_prefetch)
+
+	lea	-64(%ecx), %ecx
+
+	.p2align 4
+L(Shl6LoopStart):
+	prefetcht0 0x1c0(%eax)
+	prefetcht0 0x1c0(%edx)
+	movaps	10(%eax), %xmm2
+	movaps	26(%eax), %xmm3
+	movaps	42(%eax), %xmm4
+	movaps	58(%eax), %xmm5
+	movaps	%xmm5, %xmm7
+	palignr	$6, %xmm4, %xmm5
+	palignr	$6, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$6, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$6, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl6LoopStart)
+
+L(Shl6LoopLeave):
+	add	$32, %ecx
+	jle	L(shl_end_0)
+
+	movaps	10(%eax), %xmm2
+	movaps	26(%eax), %xmm3
+	palignr	$6, %xmm2, %xmm3
+	palignr	$6, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx
+	lea	32(%eax, %ecx), %eax
 	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
-
-	cfi_restore_state
-	cfi_remember_state
-	ALIGN (4)
-L(shl_6):
-	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(sh_6_no_prefetch):
+	lea	-32(%ecx), %ecx
 	lea	-6(%eax), %eax
-	movaps	(%eax), %xmm1
 	xor	%edi, %edi
-	lea	-32(%ecx), %ecx
-	movdqu	%xmm0, (%esi)
-	POP (%esi)
-L(shl_6_loop):
-
+
+	.p2align 4
+L(sh_6_no_prefetch_loop):
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
 	movdqa	32(%eax, %edi), %xmm3
@@ -625,7 +986,7 @@
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
 
-	jb	L(shl_6_end)
+	jb	L(sh_6_end_no_prefetch_loop)
 
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
@@ -637,29 +998,90 @@
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
 
-	jae	L(shl_6_loop)
-
-L(shl_6_end):
+	jae	L(sh_6_no_prefetch_loop)
+
+L(sh_6_end_no_prefetch_loop):
 	lea	32(%ecx), %ecx
 	add	%ecx, %edi
 	add	%edi, %edx
 	lea	6(%edi, %eax), %eax
+	POP	(%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(shl_7):
+# ifndef USE_AS_MEMMOVE
+	movaps	-7(%eax), %xmm1
+# else
+	movl	DEST+4(%esp), %edi
+	movaps	-7(%eax), %xmm1
+	movdqu	%xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+# else
+#  ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+#  else
+	cmp	__x86_data_cache_size_half, %ecx
+#  endif
+# endif
+	jb L(sh_7_no_prefetch)
+
+	lea	-64(%ecx), %ecx
+
+	.p2align 4
+L(Shl7LoopStart):
+	prefetcht0 0x1c0(%eax)
+	prefetcht0 0x1c0(%edx)
+	movaps	9(%eax), %xmm2
+	movaps	25(%eax), %xmm3
+	movaps	41(%eax), %xmm4
+	movaps	57(%eax), %xmm5
+	movaps	%xmm5, %xmm7
+	palignr	$7, %xmm4, %xmm5
+	palignr	$7, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$7, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$7, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl7LoopStart)
+
+L(Shl7LoopLeave):
+	add	$32, %ecx
+	jle	L(shl_end_0)
+
+	movaps	9(%eax), %xmm2
+	movaps	25(%eax), %xmm3
+	palignr	$7, %xmm2, %xmm3
+	palignr	$7, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx
+	lea	32(%eax, %ecx), %eax
 	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
-
-	cfi_restore_state
-	cfi_remember_state
-	ALIGN (4)
-L(shl_7):
-	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(sh_7_no_prefetch):
+	lea	-32(%ecx), %ecx
 	lea	-7(%eax), %eax
-	movaps	(%eax), %xmm1
 	xor	%edi, %edi
-	lea	-32(%ecx), %ecx
-	movdqu	%xmm0, (%esi)
-	POP (%esi)
-L(shl_7_loop):
-
+
+	.p2align 4
+L(sh_7_no_prefetch_loop):
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
 	movdqa	32(%eax, %edi), %xmm3
@@ -669,8 +1091,7 @@
 	lea	32(%edi), %edi
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
-
-	jb	L(shl_7_end)
+	jb	L(sh_7_end_no_prefetch_loop)
 
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
@@ -681,30 +1102,90 @@
 	lea	32(%edi), %edi
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
-
-	jae	L(shl_7_loop)
-
-L(shl_7_end):
+	jae	L(sh_7_no_prefetch_loop)
+
+L(sh_7_end_no_prefetch_loop):
 	lea	32(%ecx), %ecx
 	add	%ecx, %edi
 	add	%edi, %edx
 	lea	7(%edi, %eax), %eax
+	POP	(%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(shl_8):
+# ifndef USE_AS_MEMMOVE
+	movaps	-8(%eax), %xmm1
+# else
+	movl	DEST+4(%esp), %edi
+	movaps	-8(%eax), %xmm1
+	movdqu	%xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+# else
+#  ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+#  else
+	cmp	__x86_data_cache_size_half, %ecx
+#  endif
+# endif
+	jb L(sh_8_no_prefetch)
+
+	lea	-64(%ecx), %ecx
+
+	.p2align 4
+L(Shl8LoopStart):
+	prefetcht0 0x1c0(%eax)
+	prefetcht0 0x1c0(%edx)
+	movaps	8(%eax), %xmm2
+	movaps	24(%eax), %xmm3
+	movaps	40(%eax), %xmm4
+	movaps	56(%eax), %xmm5
+	movaps	%xmm5, %xmm7
+	palignr	$8, %xmm4, %xmm5
+	palignr	$8, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$8, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl8LoopStart)
+
+L(LoopLeave8):
+	add	$32, %ecx
+	jle	L(shl_end_0)
+
+	movaps	8(%eax), %xmm2
+	movaps	24(%eax), %xmm3
+	palignr	$8, %xmm2, %xmm3
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx
+	lea	32(%eax, %ecx), %eax
 	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
-
-	cfi_restore_state
-	cfi_remember_state
-	ALIGN (4)
-L(shl_8):
-	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(sh_8_no_prefetch):
+	lea	-32(%ecx), %ecx
 	lea	-8(%eax), %eax
-	movaps	(%eax), %xmm1
 	xor	%edi, %edi
-	lea	-32(%ecx), %ecx
-	movdqu	%xmm0, (%esi)
-	POP (%esi)
-L(shl_8_loop):
-
+
+	.p2align 4
+L(sh_8_no_prefetch_loop):
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
 	movdqa	32(%eax, %edi), %xmm3
@@ -714,8 +1195,7 @@
 	lea	32(%edi), %edi
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
-
-	jb	L(shl_8_end)
+	jb	L(sh_8_end_no_prefetch_loop)
 
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
@@ -726,30 +1206,91 @@
 	lea	32(%edi), %edi
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
-
-	jae	L(shl_8_loop)
-
-L(shl_8_end):
+	jae	L(sh_8_no_prefetch_loop)
+
+L(sh_8_end_no_prefetch_loop):
 	lea	32(%ecx), %ecx
 	add	%ecx, %edi
 	add	%edi, %edx
 	lea	8(%edi, %eax), %eax
+	POP	(%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(shl_9):
+# ifndef USE_AS_MEMMOVE
+	movaps	-9(%eax), %xmm1
+# else
+	movl	DEST+4(%esp), %edi
+	movaps	-9(%eax), %xmm1
+	movdqu	%xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+# else
+#  ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+#  else
+	cmp	__x86_data_cache_size_half, %ecx
+#  endif
+# endif
+	jb L(sh_9_no_prefetch)
+
+	lea	-64(%ecx), %ecx
+
+	.p2align 4
+L(Shl9LoopStart):
+	prefetcht0 0x1c0(%eax)
+	prefetcht0 0x1c0(%edx)
+	movaps	7(%eax), %xmm2
+	movaps	23(%eax), %xmm3
+	movaps	39(%eax), %xmm4
+	movaps	55(%eax), %xmm5
+	movaps	%xmm5, %xmm7
+	palignr	$9, %xmm4, %xmm5
+	palignr	$9, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$9, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$9, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl9LoopStart)
+
+L(Shl9LoopLeave):
+	add	$32, %ecx
+	jle	L(shl_end_0)
+
+	movaps	7(%eax), %xmm2
+	movaps	23(%eax), %xmm3
+	palignr	$9, %xmm2, %xmm3
+	palignr	$9, %xmm1, %xmm2
+
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx
+	lea	32(%eax, %ecx), %eax
 	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
-
-	cfi_restore_state
-	cfi_remember_state
-	ALIGN (4)
-L(shl_9):
-	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(sh_9_no_prefetch):
+	lea	-32(%ecx), %ecx
 	lea	-9(%eax), %eax
-	movaps	(%eax), %xmm1
 	xor	%edi, %edi
-	lea	-32(%ecx), %ecx
-	movdqu	%xmm0, (%esi)
-	POP (%esi)
-L(shl_9_loop):
-
+
+	.p2align 4
+L(sh_9_no_prefetch_loop):
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
 	movdqa	32(%eax, %edi), %xmm3
@@ -759,8 +1300,7 @@
 	lea	32(%edi), %edi
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
-
-	jb	L(shl_9_end)
+	jb	L(sh_9_end_no_prefetch_loop)
 
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
@@ -771,30 +1311,91 @@
 	lea	32(%edi), %edi
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
-
-	jae	L(shl_9_loop)
-
-L(shl_9_end):
+	jae	L(sh_9_no_prefetch_loop)
+
+L(sh_9_end_no_prefetch_loop):
 	lea	32(%ecx), %ecx
 	add	%ecx, %edi
 	add	%edi, %edx
 	lea	9(%edi, %eax), %eax
+	POP	(%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(shl_10):
+# ifndef USE_AS_MEMMOVE
+	movaps	-10(%eax), %xmm1
+# else
+	movl	DEST+4(%esp), %edi
+	movaps	-10(%eax), %xmm1
+	movdqu	%xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+# else
+#  ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+#  else
+	cmp	__x86_data_cache_size_half, %ecx
+#  endif
+# endif
+	jb L(sh_10_no_prefetch)
+
+	lea	-64(%ecx), %ecx
+
+	.p2align 4
+L(Shl10LoopStart):
+	prefetcht0 0x1c0(%eax)
+	prefetcht0 0x1c0(%edx)
+	movaps	6(%eax), %xmm2
+	movaps	22(%eax), %xmm3
+	movaps	38(%eax), %xmm4
+	movaps	54(%eax), %xmm5
+	movaps	%xmm5, %xmm7
+	palignr	$10, %xmm4, %xmm5
+	palignr	$10, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$10, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$10, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl10LoopStart)
+
+L(Shl10LoopLeave):
+	add	$32, %ecx
+	jle	L(shl_end_0)
+
+	movaps	6(%eax), %xmm2
+	movaps	22(%eax), %xmm3
+	palignr	$10, %xmm2, %xmm3
+	palignr	$10, %xmm1, %xmm2
+
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx
+	lea	32(%eax, %ecx), %eax
 	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
-
-	cfi_restore_state
-	cfi_remember_state
-	ALIGN (4)
-L(shl_10):
-	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(sh_10_no_prefetch):
+	lea	-32(%ecx), %ecx
 	lea	-10(%eax), %eax
-	movaps	(%eax), %xmm1
 	xor	%edi, %edi
-	lea	-32(%ecx), %ecx
-	movdqu	%xmm0, (%esi)
-	POP (%esi)
-L(shl_10_loop):
-
+
+	.p2align 4
+L(sh_10_no_prefetch_loop):
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
 	movdqa	32(%eax, %edi), %xmm3
@@ -804,8 +1405,7 @@
 	lea	32(%edi), %edi
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
-
-	jb	L(shl_10_end)
+	jb	L(sh_10_end_no_prefetch_loop)
 
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
@@ -816,30 +1416,91 @@
 	lea	32(%edi), %edi
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
-
-	jae	L(shl_10_loop)
-
-L(shl_10_end):
+	jae	L(sh_10_no_prefetch_loop)
+
+L(sh_10_end_no_prefetch_loop):
 	lea	32(%ecx), %ecx
 	add	%ecx, %edi
 	add	%edi, %edx
 	lea	10(%edi, %eax), %eax
+	POP	(%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(shl_11):
+# ifndef USE_AS_MEMMOVE
+	movaps	-11(%eax), %xmm1
+# else
+	movl	DEST+4(%esp), %edi
+	movaps	-11(%eax), %xmm1
+	movdqu	%xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+# else
+#  ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+#  else
+	cmp	__x86_data_cache_size_half, %ecx
+#  endif
+# endif
+	jb L(sh_11_no_prefetch)
+
+	lea	-64(%ecx), %ecx
+
+	.p2align 4
+L(Shl11LoopStart):
+	prefetcht0 0x1c0(%eax)
+	prefetcht0 0x1c0(%edx)
+	movaps	5(%eax), %xmm2
+	movaps	21(%eax), %xmm3
+	movaps	37(%eax), %xmm4
+	movaps	53(%eax), %xmm5
+	movaps	%xmm5, %xmm7
+	palignr	$11, %xmm4, %xmm5
+	palignr	$11, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$11, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$11, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl11LoopStart)
+
+L(Shl11LoopLeave):
+	add	$32, %ecx
+	jle	L(shl_end_0)
+
+	movaps	5(%eax), %xmm2
+	movaps	21(%eax), %xmm3
+	palignr	$11, %xmm2, %xmm3
+	palignr	$11, %xmm1, %xmm2
+
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx
+	lea	32(%eax, %ecx), %eax
 	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
-
-	cfi_restore_state
-	cfi_remember_state
-	ALIGN (4)
-L(shl_11):
-	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(sh_11_no_prefetch):
+	lea	-32(%ecx), %ecx
 	lea	-11(%eax), %eax
-	movaps	(%eax), %xmm1
 	xor	%edi, %edi
-	lea	-32(%ecx), %ecx
-	movdqu	%xmm0, (%esi)
-	POP (%esi)
-L(shl_11_loop):
-
+
+	.p2align 4
+L(sh_11_no_prefetch_loop):
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
 	movdqa	32(%eax, %edi), %xmm3
@@ -849,8 +1510,7 @@
 	lea	32(%edi), %edi
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
-
-	jb	L(shl_11_end)
+	jb	L(sh_11_end_no_prefetch_loop)
 
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
@@ -861,30 +1521,91 @@
 	lea	32(%edi), %edi
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
-
-	jae	L(shl_11_loop)
-
-L(shl_11_end):
+	jae	L(sh_11_no_prefetch_loop)
+
+L(sh_11_end_no_prefetch_loop):
 	lea	32(%ecx), %ecx
 	add	%ecx, %edi
 	add	%edi, %edx
 	lea	11(%edi, %eax), %eax
+	POP	(%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(shl_12):
+# ifndef USE_AS_MEMMOVE
+	movaps	-12(%eax), %xmm1
+# else
+	movl	DEST+4(%esp), %edi
+	movaps	-12(%eax), %xmm1
+	movdqu	%xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+# else
+#  ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+#  else
+	cmp	__x86_data_cache_size_half, %ecx
+#  endif
+# endif
+	jb L(sh_12_no_prefetch)
+
+	lea	-64(%ecx), %ecx
+
+	.p2align 4
+L(Shl12LoopStart):
+	prefetcht0 0x1c0(%eax)
+	prefetcht0 0x1c0(%edx)
+	movaps	4(%eax), %xmm2
+	movaps	20(%eax), %xmm3
+	movaps	36(%eax), %xmm4
+	movaps	52(%eax), %xmm5
+	movaps	%xmm5, %xmm7
+	palignr	$12, %xmm4, %xmm5
+	palignr	$12, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$12, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$12, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl12LoopStart)
+
+L(Shl12LoopLeave):
+	add	$32, %ecx
+	jle	L(shl_end_0)
+
+	movaps	4(%eax), %xmm2
+	movaps	20(%eax), %xmm3
+	palignr	$12, %xmm2, %xmm3
+	palignr	$12, %xmm1, %xmm2
+
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx
+	lea	32(%eax, %ecx), %eax
 	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
-
-	cfi_restore_state
-	cfi_remember_state
-	ALIGN (4)
-L(shl_12):
-	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(sh_12_no_prefetch):
+	lea	-32(%ecx), %ecx
 	lea	-12(%eax), %eax
-	movaps	(%eax), %xmm1
 	xor	%edi, %edi
-	lea	-32(%ecx), %ecx
-	movdqu	%xmm0, (%esi)
-	POP (%esi)
-L(shl_12_loop):
-
+
+	.p2align 4
+L(sh_12_no_prefetch_loop):
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
 	movdqa	32(%eax, %edi), %xmm3
@@ -894,8 +1615,7 @@
 	lea	32(%edi), %edi
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
-
-	jb	L(shl_12_end)
+	jb	L(sh_12_end_no_prefetch_loop)
 
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
@@ -906,30 +1626,91 @@
 	lea	32(%edi), %edi
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
-
-	jae	L(shl_12_loop)
-
-L(shl_12_end):
+	jae	L(sh_12_no_prefetch_loop)
+
+L(sh_12_end_no_prefetch_loop):
 	lea	32(%ecx), %ecx
 	add	%ecx, %edi
 	add	%edi, %edx
 	lea	12(%edi, %eax), %eax
+	POP	(%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(shl_13):
+# ifndef USE_AS_MEMMOVE
+	movaps	-13(%eax), %xmm1
+# else
+	movl	DEST+4(%esp), %edi
+	movaps	-13(%eax), %xmm1
+	movdqu	%xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+# else
+#  ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+#  else
+	cmp	__x86_data_cache_size_half, %ecx
+#  endif
+# endif
+	jb L(sh_13_no_prefetch)
+
+	lea	-64(%ecx), %ecx
+
+	.p2align 4
+L(Shl13LoopStart):
+	prefetcht0 0x1c0(%eax)
+	prefetcht0 0x1c0(%edx)
+	movaps	3(%eax), %xmm2
+	movaps	19(%eax), %xmm3
+	movaps	35(%eax), %xmm4
+	movaps	51(%eax), %xmm5
+	movaps	%xmm5, %xmm7
+	palignr	$13, %xmm4, %xmm5
+	palignr	$13, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$13, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$13, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl13LoopStart)
+
+L(Shl13LoopLeave):
+	add	$32, %ecx
+	jle	L(shl_end_0)
+
+	movaps	3(%eax), %xmm2
+	movaps	19(%eax), %xmm3
+	palignr	$13, %xmm2, %xmm3
+	palignr	$13, %xmm1, %xmm2
+
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx
+	lea	32(%eax, %ecx), %eax
 	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
-
-	cfi_restore_state
-	cfi_remember_state
-	ALIGN (4)
-L(shl_13):
-	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(sh_13_no_prefetch):
+	lea	-32(%ecx), %ecx
 	lea	-13(%eax), %eax
-	movaps	(%eax), %xmm1
 	xor	%edi, %edi
-	lea	-32(%ecx), %ecx
-	movdqu	%xmm0, (%esi)
-	POP (%esi)
-L(shl_13_loop):
-
+
+	.p2align 4
+L(sh_13_no_prefetch_loop):
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
 	movdqa	32(%eax, %edi), %xmm3
@@ -939,8 +1720,7 @@
 	lea	32(%edi), %edi
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
-
-	jb	L(shl_13_end)
+	jb	L(sh_13_end_no_prefetch_loop)
 
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
@@ -951,30 +1731,91 @@
 	lea	32(%edi), %edi
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
-
-	jae	L(shl_13_loop)
-
-L(shl_13_end):
+	jae	L(sh_13_no_prefetch_loop)
+
+L(sh_13_end_no_prefetch_loop):
 	lea	32(%ecx), %ecx
 	add	%ecx, %edi
 	add	%edi, %edx
 	lea	13(%edi, %eax), %eax
+	POP	(%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(shl_14):
+# ifndef USE_AS_MEMMOVE
+	movaps	-14(%eax), %xmm1
+# else
+	movl	DEST+4(%esp), %edi
+	movaps	-14(%eax), %xmm1
+	movdqu	%xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+# else
+#  ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+#  else
+	cmp	__x86_data_cache_size_half, %ecx
+#  endif
+# endif
+	jb L(sh_14_no_prefetch)
+
+	lea	-64(%ecx), %ecx
+
+	.p2align 4
+L(Shl14LoopStart):
+	prefetcht0 0x1c0(%eax)
+	prefetcht0 0x1c0(%edx)
+	movaps	2(%eax), %xmm2
+	movaps	18(%eax), %xmm3
+	movaps	34(%eax), %xmm4
+	movaps	50(%eax), %xmm5
+	movaps	%xmm5, %xmm7
+	palignr	$14, %xmm4, %xmm5
+	palignr	$14, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$14, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$14, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl14LoopStart)
+
+L(Shl14LoopLeave):
+	add	$32, %ecx
+	jle	L(shl_end_0)
+
+	movaps	2(%eax), %xmm2
+	movaps	18(%eax), %xmm3
+	palignr	$14, %xmm2, %xmm3
+	palignr	$14, %xmm1, %xmm2
+
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx
+	lea	32(%eax, %ecx), %eax
 	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
-
-	cfi_restore_state
-	cfi_remember_state
-	ALIGN (4)
-L(shl_14):
-	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(sh_14_no_prefetch):
+	lea	-32(%ecx), %ecx
 	lea	-14(%eax), %eax
-	movaps	(%eax), %xmm1
 	xor	%edi, %edi
-	lea	-32(%ecx), %ecx
-	movdqu	%xmm0, (%esi)
-	POP (%esi)
-L(shl_14_loop):
-
+
+	.p2align 4
+L(sh_14_no_prefetch_loop):
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
 	movdqa	32(%eax, %edi), %xmm3
@@ -984,8 +1825,7 @@
 	lea	32(%edi), %edi
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
-
-	jb	L(shl_14_end)
+	jb	L(sh_14_end_no_prefetch_loop)
 
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
@@ -996,30 +1836,91 @@
 	lea	32(%edi), %edi
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
-
-	jae	L(shl_14_loop)
-
-L(shl_14_end):
+	jae	L(sh_14_no_prefetch_loop)
+
+L(sh_14_end_no_prefetch_loop):
 	lea	32(%ecx), %ecx
 	add	%ecx, %edi
 	add	%edi, %edx
 	lea	14(%edi, %eax), %eax
+	POP	(%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(shl_15):
+# ifndef USE_AS_MEMMOVE
+	movaps	-15(%eax), %xmm1
+# else
+	movl	DEST+4(%esp), %edi
+	movaps	-15(%eax), %xmm1
+	movdqu	%xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+# else
+#  ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+#  else
+	cmp	__x86_data_cache_size_half, %ecx
+#  endif
+# endif
+	jb L(sh_15_no_prefetch)
+
+	lea	-64(%ecx), %ecx
+
+	.p2align 4
+L(Shl15LoopStart):
+	prefetcht0 0x1c0(%eax)
+	prefetcht0 0x1c0(%edx)
+	movaps	1(%eax), %xmm2
+	movaps	17(%eax), %xmm3
+	movaps	33(%eax), %xmm4
+	movaps	49(%eax), %xmm5
+	movaps	%xmm5, %xmm7
+	palignr	$15, %xmm4, %xmm5
+	palignr	$15, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$15, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$15, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl15LoopStart)
+
+L(Shl15LoopLeave):
+	add	$32, %ecx
+	jle	L(shl_end_0)
+
+	movaps	1(%eax), %xmm2
+	movaps	17(%eax), %xmm3
+	palignr	$15, %xmm2, %xmm3
+	palignr	$15, %xmm1, %xmm2
+
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx
+	lea	32(%eax, %ecx), %eax
 	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
-
-	cfi_restore_state
-	cfi_remember_state
-	ALIGN (4)
-L(shl_15):
-	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(sh_15_no_prefetch):
+	lea	-32(%ecx), %ecx
 	lea	-15(%eax), %eax
-	movaps	(%eax), %xmm1
 	xor	%edi, %edi
-	lea	-32(%ecx), %ecx
-	movdqu	%xmm0, (%esi)
-	POP (%esi)
-L(shl_15_loop):
-
+
+	.p2align 4
+L(sh_15_no_prefetch_loop):
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
 	movdqa	32(%eax, %edi), %xmm3
@@ -1029,8 +1930,7 @@
 	lea	32(%edi), %edi
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
-
-	jb	L(shl_15_end)
+	jb	L(sh_15_end_no_prefetch_loop)
 
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
@@ -1041,19 +1941,27 @@
 	lea	32(%edi), %edi
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
-
-	jae	L(shl_15_loop)
-
-L(shl_15_end):
+	jae	L(sh_15_no_prefetch_loop)
+
+L(sh_15_end_no_prefetch_loop):
 	lea	32(%ecx), %ecx
 	add	%ecx, %edi
 	add	%edi, %edx
 	lea	15(%edi, %eax), %eax
-	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
-
-
-	ALIGN (4)
+	POP	(%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(shl_end_0):
+	lea	32(%ecx), %ecx
+	lea	(%edx, %ecx), %edx
+	lea	(%eax, %ecx), %eax
+	POP	(%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	.p2align 4
 L(fwd_write_44bytes):
 	movq	-44(%eax), %xmm0
 	movq	%xmm0, -44(%edx)
@@ -1072,16 +1980,16 @@
 L(fwd_write_4bytes):
 	movl	-4(%eax), %ecx
 	movl	%ecx, -4(%edx)
-#ifndef USE_AS_BCOPY
-# ifdef USE_AS_MEMPCPY
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
 	movl	%edx, %eax
-# else
-	movl	DEST(%esp), %eax
-# endif
-#endif
+#  else
+	movl	DEST(%esp), %eax
+#  endif
+# endif
 	RETURN
 
-	ALIGN (4)
+	.p2align 4
 L(fwd_write_40bytes):
 	movq	-40(%eax), %xmm0
 	movq	%xmm0, -40(%edx)
@@ -1098,31 +2006,31 @@
 	movq	-8(%eax), %xmm0
 	movq	%xmm0, -8(%edx)
 L(fwd_write_0bytes):
-#ifndef USE_AS_BCOPY
-# ifdef USE_AS_MEMPCPY
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
 	movl	%edx, %eax
-# else
-	movl	DEST(%esp), %eax
-# endif
-#endif
+#  else
+	movl	DEST(%esp), %eax
+#  endif
+# endif
 	RETURN
 
-	ALIGN (4)
+	.p2align 4
 L(fwd_write_5bytes):
 	movl	-5(%eax), %ecx
 	movl	-4(%eax), %eax
 	movl	%ecx, -5(%edx)
 	movl	%eax, -4(%edx)
-#ifndef USE_AS_BCOPY
-# ifdef USE_AS_MEMPCPY
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
 	movl	%edx, %eax
-# else
-	movl	DEST(%esp), %eax
-# endif
-#endif
+#  else
+	movl	DEST(%esp), %eax
+#  endif
+# endif
 	RETURN
 
-	ALIGN (4)
+	.p2align 4
 L(fwd_write_45bytes):
 	movq	-45(%eax), %xmm0
 	movq	%xmm0, -45(%edx)
@@ -1142,16 +2050,16 @@
 	movl	%ecx, -5(%edx)
 	movzbl	-1(%eax), %ecx
 	movb	%cl, -1(%edx)
-#ifndef USE_AS_BCOPY
-# ifdef USE_AS_MEMPCPY
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
 	movl	%edx, %eax
-# else
-	movl	DEST(%esp), %eax
-# endif
-#endif
+#  else
+	movl	DEST(%esp), %eax
+#  endif
+# endif
 	RETURN
 
-	ALIGN (4)
+	.p2align 4
 L(fwd_write_41bytes):
 	movq	-41(%eax), %xmm0
 	movq	%xmm0, -41(%edx)
@@ -1170,16 +2078,16 @@
 L(fwd_write_1bytes):
 	movzbl	-1(%eax), %ecx
 	movb	%cl, -1(%edx)
-#ifndef USE_AS_BCOPY
-# ifdef USE_AS_MEMPCPY
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
 	movl	%edx, %eax
-# else
-	movl	DEST(%esp), %eax
-# endif
-#endif
+#  else
+	movl	DEST(%esp), %eax
+#  endif
+# endif
 	RETURN
 
-	ALIGN (4)
+	.p2align 4
 L(fwd_write_46bytes):
 	movq	-46(%eax), %xmm0
 	movq	%xmm0, -46(%edx)
@@ -1200,16 +2108,16 @@
 	movl	%ecx, -6(%edx)
 	movzwl	-2(%eax), %ecx
 	movw	%cx, -2(%edx)
-#ifndef USE_AS_BCOPY
-# ifdef USE_AS_MEMPCPY
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
 	movl	%edx, %eax
-# else
-	movl	DEST(%esp), %eax
-# endif
-#endif
+#  else
+	movl	DEST(%esp), %eax
+#  endif
+# endif
 	RETURN
 
-	ALIGN (4)
+	.p2align 4
 L(fwd_write_42bytes):
 	movq	-42(%eax), %xmm0
 	movq	%xmm0, -42(%edx)
@@ -1228,16 +2136,16 @@
 L(fwd_write_2bytes):
 	movzwl	-2(%eax), %ecx
 	movw	%cx, -2(%edx)
-#ifndef USE_AS_BCOPY
-# ifdef USE_AS_MEMPCPY
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
 	movl	%edx, %eax
-# else
-	movl	DEST(%esp), %eax
-# endif
-#endif
+#  else
+	movl	DEST(%esp), %eax
+#  endif
+# endif
 	RETURN
 
-	ALIGN (4)
+	.p2align 4
 L(fwd_write_47bytes):
 	movq	-47(%eax), %xmm0
 	movq	%xmm0, -47(%edx)
@@ -1260,16 +2168,16 @@
 	movzbl	-1(%eax), %eax
 	movw	%cx, -3(%edx)
 	movb	%al, -1(%edx)
-#ifndef USE_AS_BCOPY
-# ifdef USE_AS_MEMPCPY
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
 	movl	%edx, %eax
-# else
-	movl	DEST(%esp), %eax
-# endif
-#endif
+#  else
+	movl	DEST(%esp), %eax
+#  endif
+# endif
 	RETURN
 
-	ALIGN (4)
+	.p2align 4
 L(fwd_write_43bytes):
 	movq	-43(%eax), %xmm0
 	movq	%xmm0, -43(%edx)
@@ -1290,16 +2198,16 @@
 	movzbl	-1(%eax), %eax
 	movw	%cx, -3(%edx)
 	movb	%al, -1(%edx)
-#ifndef USE_AS_BCOPY
-# ifdef USE_AS_MEMPCPY
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
 	movl	%edx, %eax
-# else
-	movl	DEST(%esp), %eax
-# endif
-#endif
+#  else
+	movl	DEST(%esp), %eax
+#  endif
+# endif
 	RETURN
 
-	ALIGN (4)
+	.p2align 4
 L(fwd_write_40bytes_align):
 	movdqa	-40(%eax), %xmm0
 	movdqa	%xmm0, -40(%edx)
@@ -1310,47 +2218,47 @@
 	movq	-8(%eax), %xmm0
 	movq	%xmm0, -8(%edx)
 L(fwd_write_0bytes_align):
-#ifndef USE_AS_BCOPY
-# ifdef USE_AS_MEMPCPY
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
 	movl	%edx, %eax
-# else
-	movl	DEST(%esp), %eax
-# endif
-#endif
+#  else
+	movl	DEST(%esp), %eax
+#  endif
+# endif
 	RETURN
 
-	ALIGN (4)
+	.p2align 4
 L(fwd_write_32bytes_align):
 	movdqa	-32(%eax), %xmm0
 	movdqa	%xmm0, -32(%edx)
 L(fwd_write_16bytes_align):
 	movdqa	-16(%eax), %xmm0
 	movdqa	%xmm0, -16(%edx)
-#ifndef USE_AS_BCOPY
-# ifdef USE_AS_MEMPCPY
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
 	movl	%edx, %eax
-# else
-	movl	DEST(%esp), %eax
-# endif
-#endif
+#  else
+	movl	DEST(%esp), %eax
+#  endif
+# endif
 	RETURN
 
-	ALIGN (4)
+	.p2align 4
 L(fwd_write_5bytes_align):
 	movl	-5(%eax), %ecx
 	movl	-4(%eax), %eax
 	movl	%ecx, -5(%edx)
 	movl	%eax, -4(%edx)
-#ifndef USE_AS_BCOPY
-# ifdef USE_AS_MEMPCPY
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
 	movl	%edx, %eax
-# else
-	movl	DEST(%esp), %eax
-# endif
-#endif
+#  else
+	movl	DEST(%esp), %eax
+#  endif
+# endif
 	RETURN
 
-	ALIGN (4)
+	.p2align 4
 L(fwd_write_45bytes_align):
 	movdqa	-45(%eax), %xmm0
 	movdqa	%xmm0, -45(%edx)
@@ -1364,16 +2272,16 @@
 	movl	%ecx, -5(%edx)
 	movzbl	-1(%eax), %ecx
 	movb	%cl, -1(%edx)
-#ifndef USE_AS_BCOPY
-# ifdef USE_AS_MEMPCPY
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
 	movl	%edx, %eax
-# else
-	movl	DEST(%esp), %eax
-# endif
-#endif
+#  else
+	movl	DEST(%esp), %eax
+#  endif
+# endif
 	RETURN
 
-	ALIGN (4)
+	.p2align 4
 L(fwd_write_37bytes_align):
 	movdqa	-37(%eax), %xmm0
 	movdqa	%xmm0, -37(%edx)
@@ -1384,16 +2292,16 @@
 	movl	%ecx, -5(%edx)
 	movzbl	-1(%eax), %ecx
 	movb	%cl, -1(%edx)
-#ifndef USE_AS_BCOPY
-# ifdef USE_AS_MEMPCPY
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
 	movl	%edx, %eax
-# else
-	movl	DEST(%esp), %eax
-# endif
-#endif
+#  else
+	movl	DEST(%esp), %eax
+#  endif
+# endif
 	RETURN
 
-	ALIGN (4)
+	.p2align 4
 L(fwd_write_41bytes_align):
 	movdqa	-41(%eax), %xmm0
 	movdqa	%xmm0, -41(%edx)
@@ -1406,16 +2314,16 @@
 L(fwd_write_1bytes_align):
 	movzbl	-1(%eax), %ecx
 	movb	%cl, -1(%edx)
-#ifndef USE_AS_BCOPY
-# ifdef USE_AS_MEMPCPY
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
 	movl	%edx, %eax
-# else
-	movl	DEST(%esp), %eax
-# endif
-#endif
+#  else
+	movl	DEST(%esp), %eax
+#  endif
+# endif
 	RETURN
 
-	ALIGN (4)
+	.p2align 4
 L(fwd_write_33bytes_align):
 	movdqa	-33(%eax), %xmm0
 	movdqa	%xmm0, -33(%edx)
@@ -1424,16 +2332,16 @@
 	movdqa	%xmm0, -17(%edx)
 	movzbl	-1(%eax), %ecx
 	movb	%cl, -1(%edx)
-#ifndef USE_AS_BCOPY
-# ifdef USE_AS_MEMPCPY
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
 	movl	%edx, %eax
-# else
-	movl	DEST(%esp), %eax
-# endif
-#endif
+#  else
+	movl	DEST(%esp), %eax
+#  endif
+# endif
 	RETURN
 
-	ALIGN (4)
+	.p2align 4
 L(fwd_write_46bytes_align):
 	movdqa	-46(%eax), %xmm0
 	movdqa	%xmm0, -46(%edx)
@@ -1448,16 +2356,16 @@
 	movl	%ecx, -6(%edx)
 	movzwl	-2(%eax), %ecx
 	movw	%cx, -2(%edx)
-#ifndef USE_AS_BCOPY
-# ifdef USE_AS_MEMPCPY
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
 	movl	%edx, %eax
-# else
-	movl	DEST(%esp), %eax
-# endif
-#endif
+#  else
+	movl	DEST(%esp), %eax
+#  endif
+# endif
 	RETURN
 
-	ALIGN (4)
+	.p2align 4
 L(fwd_write_38bytes_align):
 	movdqa	-38(%eax), %xmm0
 	movdqa	%xmm0, -38(%edx)
@@ -1468,16 +2376,16 @@
 	movl	%ecx, -6(%edx)
 	movzwl	-2(%eax), %ecx
 	movw	%cx, -2(%edx)
-#ifndef USE_AS_BCOPY
-# ifdef USE_AS_MEMPCPY
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
 	movl	%edx, %eax
-# else
-	movl	DEST(%esp), %eax
-# endif
-#endif
+#  else
+	movl	DEST(%esp), %eax
+#  endif
+# endif
 	RETURN
 
-	ALIGN (4)
+	.p2align 4
 L(fwd_write_42bytes_align):
 	movdqa	-42(%eax), %xmm0
 	movdqa	%xmm0, -42(%edx)
@@ -1490,16 +2398,16 @@
 L(fwd_write_2bytes_align):
 	movzwl	-2(%eax), %ecx
 	movw	%cx, -2(%edx)
-#ifndef USE_AS_BCOPY
-# ifdef USE_AS_MEMPCPY
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
 	movl	%edx, %eax
-# else
-	movl	DEST(%esp), %eax
-# endif
-#endif
+#  else
+	movl	DEST(%esp), %eax
+#  endif
+# endif
 	RETURN
 
-	ALIGN (4)
+	.p2align 4
 L(fwd_write_34bytes_align):
 	movdqa	-34(%eax), %xmm0
 	movdqa	%xmm0, -34(%edx)
@@ -1508,16 +2416,16 @@
 	movdqa	%xmm0, -18(%edx)
 	movzwl	-2(%eax), %ecx
 	movw	%cx, -2(%edx)
-#ifndef USE_AS_BCOPY
-# ifdef USE_AS_MEMPCPY
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
 	movl	%edx, %eax
-# else
-	movl	DEST(%esp), %eax
-# endif
-#endif
+#  else
+	movl	DEST(%esp), %eax
+#  endif
+# endif
 	RETURN
 
-	ALIGN (4)
+	.p2align 4
 L(fwd_write_47bytes_align):
 	movdqa	-47(%eax), %xmm0
 	movdqa	%xmm0, -47(%edx)
@@ -1534,16 +2442,16 @@
 	movzbl	-1(%eax), %eax
 	movw	%cx, -3(%edx)
 	movb	%al, -1(%edx)
-#ifndef USE_AS_BCOPY
-# ifdef USE_AS_MEMPCPY
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
 	movl	%edx, %eax
-# else
-	movl	DEST(%esp), %eax
-# endif
-#endif
+#  else
+	movl	DEST(%esp), %eax
+#  endif
+# endif
 	RETURN
 
-	ALIGN (4)
+	.p2align 4
 L(fwd_write_39bytes_align):
 	movdqa	-39(%eax), %xmm0
 	movdqa	%xmm0, -39(%edx)
@@ -1556,16 +2464,16 @@
 	movzbl	-1(%eax), %eax
 	movw	%cx, -3(%edx)
 	movb	%al, -1(%edx)
-#ifndef USE_AS_BCOPY
-# ifdef USE_AS_MEMPCPY
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
 	movl	%edx, %eax
-# else
-	movl	DEST(%esp), %eax
-# endif
-#endif
+#  else
+	movl	DEST(%esp), %eax
+#  endif
+# endif
 	RETURN
 
-	ALIGN (4)
+	.p2align 4
 L(fwd_write_43bytes_align):
 	movdqa	-43(%eax), %xmm0
 	movdqa	%xmm0, -43(%edx)
@@ -1580,16 +2488,16 @@
 	movzbl	-1(%eax), %eax
 	movw	%cx, -3(%edx)
 	movb	%al, -1(%edx)
-#ifndef USE_AS_BCOPY
-# ifdef USE_AS_MEMPCPY
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
 	movl	%edx, %eax
-# else
-	movl	DEST(%esp), %eax
-# endif
-#endif
+#  else
+	movl	DEST(%esp), %eax
+#  endif
+# endif
 	RETURN
 
-	ALIGN (4)
+	.p2align 4
 L(fwd_write_35bytes_align):
 	movdqa	-35(%eax), %xmm0
 	movdqa	%xmm0, -35(%edx)
@@ -1600,16 +2508,16 @@
 	movzbl	-1(%eax), %eax
 	movw	%cx, -3(%edx)
 	movb	%al, -1(%edx)
-#ifndef USE_AS_BCOPY
-# ifdef USE_AS_MEMPCPY
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
 	movl	%edx, %eax
-# else
-	movl	DEST(%esp), %eax
-# endif
-#endif
+#  else
+	movl	DEST(%esp), %eax
+#  endif
+# endif
 	RETURN
 
-	ALIGN (4)
+	.p2align 4
 L(fwd_write_44bytes_align):
 	movdqa	-44(%eax), %xmm0
 	movdqa	%xmm0, -44(%edx)
@@ -1622,16 +2530,16 @@
 L(fwd_write_4bytes_align):
 	movl	-4(%eax), %ecx
 	movl	%ecx, -4(%edx)
-#ifndef USE_AS_BCOPY
-# ifdef USE_AS_MEMPCPY
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
 	movl	%edx, %eax
-# else
-	movl	DEST(%esp), %eax
-# endif
-#endif
+#  else
+	movl	DEST(%esp), %eax
+#  endif
+# endif
 	RETURN
 
-	ALIGN (4)
+	.p2align 4
 L(fwd_write_36bytes_align):
 	movdqa	-36(%eax), %xmm0
 	movdqa	%xmm0, -36(%edx)
@@ -1640,27 +2548,31 @@
 	movdqa	%xmm0, -20(%edx)
 	movl	-4(%eax), %ecx
 	movl	%ecx, -4(%edx)
-#ifndef USE_AS_BCOPY
-# ifdef USE_AS_MEMPCPY
+# ifndef USE_AS_BCOPY
+#  ifdef USE_AS_MEMPCPY
 	movl	%edx, %eax
-# else
-	movl	DEST(%esp), %eax
-# endif
-#endif
+#  else
+	movl	DEST(%esp), %eax
+#  endif
+# endif
 	RETURN_END
 
-	cfi_restore_state
-	cfi_remember_state
-	ALIGN (4)
+	CFI_PUSH (%edi)
+
+	.p2align 4
 L(large_page):
 	movdqu	(%eax), %xmm1
+# ifdef USE_AS_MEMMOVE
+	movl	DEST+4(%esp), %edi
+	movdqu	%xmm0, (%edi)
+# endif
 	lea	16(%eax), %eax
-	movdqu	%xmm0, (%esi)
 	movntdq	%xmm1, (%edx)
 	lea	16(%edx), %edx
-	POP (%esi)
 	lea	-0x90(%ecx), %ecx
 	POP (%edi)
+
+	.p2align 4
 L(large_page_loop):
 	movdqu	(%eax), %xmm0
 	movdqu	0x10(%eax), %xmm1
@@ -1715,8 +2627,7 @@
 	sfence
 	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
 
-
-	ALIGN (4)
+	.p2align 4
 L(bk_write_44bytes):
 	movq	36(%eax), %xmm0
 	movq	%xmm0, 36(%edx)
@@ -1736,16 +2647,16 @@
 	movl	(%eax), %ecx
 	movl	%ecx, (%edx)
 L(bk_write_0bytes):
-#ifndef USE_AS_BCOPY
-	movl	DEST(%esp), %eax
-# ifdef USE_AS_MEMPCPY
+# ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax
+#  ifdef USE_AS_MEMPCPY
 	movl	LEN(%esp), %ecx
 	add	%ecx, %eax
-# endif
-#endif
+#  endif
+# endif
 	RETURN
 
-	ALIGN (4)
+	.p2align 4
 L(bk_write_40bytes):
 	movq	32(%eax), %xmm0
 	movq	%xmm0, 32(%edx)
@@ -1761,16 +2672,16 @@
 L(bk_write_8bytes):
 	movq	(%eax), %xmm0
 	movq	%xmm0, (%edx)
-#ifndef USE_AS_BCOPY
-	movl	DEST(%esp), %eax
-# ifdef USE_AS_MEMPCPY
+# ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax
+#  ifdef USE_AS_MEMPCPY
 	movl	LEN(%esp), %ecx
 	add	%ecx, %eax
-# endif
-#endif
+#  endif
+# endif
 	RETURN
 
-	ALIGN (4)
+	.p2align 4
 L(bk_write_45bytes):
 	movq	37(%eax), %xmm0
 	movq	%xmm0, 37(%edx)
@@ -1792,16 +2703,16 @@
 L(bk_write_1bytes):
 	movzbl	(%eax), %ecx
 	movb	%cl, (%edx)
-#ifndef USE_AS_BCOPY
-	movl	DEST(%esp), %eax
-# ifdef USE_AS_MEMPCPY
+# ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax
+#  ifdef USE_AS_MEMPCPY
 	movl	LEN(%esp), %ecx
 	add	%ecx, %eax
-# endif
-#endif
+#  endif
+# endif
 	RETURN
 
-	ALIGN (4)
+	.p2align 4
 L(bk_write_41bytes):
 	movq	33(%eax), %xmm0
 	movq	%xmm0, 33(%edx)
@@ -1819,16 +2730,16 @@
 	movq	%xmm0, 1(%edx)
 	movzbl	(%eax), %ecx
 	movb	%cl, (%edx)
-#ifndef USE_AS_BCOPY
-	movl	DEST(%esp), %eax
-# ifdef USE_AS_MEMPCPY
+# ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax
+#  ifdef USE_AS_MEMPCPY
 	movl	LEN(%esp), %ecx
 	add	%ecx, %eax
-# endif
-#endif
+#  endif
+# endif
 	RETURN
 
-	ALIGN (4)
+	.p2align 4
 L(bk_write_46bytes):
 	movq	38(%eax), %xmm0
 	movq	%xmm0, 38(%edx)
@@ -1849,16 +2760,16 @@
 	movl	%ecx, 2(%edx)
 	movzwl	(%eax), %ecx
 	movw	%cx, (%edx)
-#ifndef USE_AS_BCOPY
-	movl	DEST(%esp), %eax
-# ifdef USE_AS_MEMPCPY
+# ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax
+#  ifdef USE_AS_MEMPCPY
 	movl	LEN(%esp), %ecx
 	add	%ecx, %eax
-# endif
-#endif
+#  endif
+# endif
 	RETURN
 
-	ALIGN (4)
+	.p2align 4
 L(bk_write_42bytes):
 	movq	34(%eax), %xmm0
 	movq	%xmm0, 34(%edx)
@@ -1877,16 +2788,16 @@
 L(bk_write_2bytes):
 	movzwl	(%eax), %ecx
 	movw	%cx, (%edx)
-#ifndef USE_AS_BCOPY
-	movl	DEST(%esp), %eax
-# ifdef USE_AS_MEMPCPY
+# ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax
+#  ifdef USE_AS_MEMPCPY
 	movl	LEN(%esp), %ecx
 	add	%ecx, %eax
-# endif
-#endif
+#  endif
+# endif
 	RETURN
 
-	ALIGN (4)
+	.p2align 4
 L(bk_write_47bytes):
 	movq	39(%eax), %xmm0
 	movq	%xmm0, 39(%edx)
@@ -1909,16 +2820,16 @@
 	movw	%cx, 1(%edx)
 	movzbl	(%eax), %eax
 	movb	%al, (%edx)
-#ifndef USE_AS_BCOPY
-	movl	DEST(%esp), %eax
-# ifdef USE_AS_MEMPCPY
+# ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax
+#  ifdef USE_AS_MEMPCPY
 	movl	LEN(%esp), %ecx
 	add	%ecx, %eax
-# endif
-#endif
+#  endif
+# endif
 	RETURN
 
-	ALIGN (4)
+	.p2align 4
 L(bk_write_43bytes):
 	movq	35(%eax), %xmm0
 	movq	%xmm0, 35(%edx)
@@ -1939,18 +2850,18 @@
 	movw	%cx, 1(%edx)
 	movzbl	(%eax), %eax
 	movb	%al, (%edx)
-#ifndef USE_AS_BCOPY
-	movl	DEST(%esp), %eax
-# ifdef USE_AS_MEMPCPY
+# ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax
+#  ifdef USE_AS_MEMPCPY
 	movl	LEN(%esp), %ecx
 	add	%ecx, %eax
-# endif
-#endif
+#  endif
+# endif
 	RETURN_END
 
 
 	.pushsection .rodata.ssse3,"a",@progbits
-	ALIGN (2)
+	.p2align 2
 L(table_48bytes_fwd):
 	.int	JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd))
 	.int	JMPTBL (L(fwd_write_1bytes), L(table_48bytes_fwd))
@@ -2001,7 +2912,7 @@
 	.int	JMPTBL (L(fwd_write_46bytes), L(table_48bytes_fwd))
 	.int	JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd))
 
-	ALIGN (2)
+	.p2align 2
 L(table_48bytes_fwd_align):
 	.int	JMPTBL (L(fwd_write_0bytes_align), L(table_48bytes_fwd_align))
 	.int	JMPTBL (L(fwd_write_1bytes_align), L(table_48bytes_fwd_align))
@@ -2052,7 +2963,7 @@
 	.int	JMPTBL (L(fwd_write_46bytes_align), L(table_48bytes_fwd_align))
 	.int	JMPTBL (L(fwd_write_47bytes_align), L(table_48bytes_fwd_align))
 
-	ALIGN (2)
+	.p2align 2
 L(shl_table):
 	.int	JMPTBL (L(shl_0), L(shl_table))
 	.int	JMPTBL (L(shl_1), L(shl_table))
@@ -2071,7 +2982,7 @@
 	.int	JMPTBL (L(shl_14), L(shl_table))
 	.int	JMPTBL (L(shl_15), L(shl_table))
 
-	ALIGN (2)
+	.p2align 2
 L(table_48_bytes_bwd):
 	.int	JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd))
 	.int	JMPTBL (L(bk_write_1bytes), L(table_48_bytes_bwd))
@@ -2124,13 +3035,13 @@
 
 	.popsection
 
-#ifdef USE_AS_MEMMOVE
-	ALIGN (4)
+# ifdef USE_AS_MEMMOVE
+	.p2align 4
 L(copy_backward):
-	PUSH (%esi)
-	movl	%eax, %esi
+	PUSH (%edi)
+	movl	%eax, %edi
 	lea	(%ecx,%edx,1),%edx
-	lea	(%ecx,%esi,1),%esi
+	lea	(%ecx,%edi,1),%edi
 	testl	$0x3, %edx
 	jnz	L(bk_align)
 
@@ -2145,52 +3056,53 @@
 L(bk_write_more32bytes):
 	/* Copy 32 bytes at a time.  */
 	sub	$32, %ecx
-	movq	-8(%esi), %xmm0
+	movq	-8(%edi), %xmm0
 	movq	%xmm0, -8(%edx)
-	movq	-16(%esi), %xmm0
+	movq	-16(%edi), %xmm0
 	movq	%xmm0, -16(%edx)
-	movq	-24(%esi), %xmm0
+	movq	-24(%edi), %xmm0
 	movq	%xmm0, -24(%edx)
-	movq	-32(%esi), %xmm0
+	movq	-32(%edi), %xmm0
 	movq	%xmm0, -32(%edx)
 	sub	$32, %edx
-	sub	$32, %esi
+	sub	$32, %edi
 
 L(bk_write_less32bytes):
-	movl	%esi, %eax
+	movl	%edi, %eax
 	sub	%ecx, %edx
 	sub	%ecx, %eax
-	POP (%esi)
+	POP (%edi)
 L(bk_write_less32bytes_2):
 	BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
 
-	CFI_PUSH (%esi)
-	ALIGN (4)
+	CFI_PUSH (%edi)
+
+	.p2align 4
 L(bk_align):
 	cmp	$8, %ecx
 	jbe	L(bk_write_less32bytes)
 	testl	$1, %edx
 	/* We get here only if (EDX & 3 ) != 0 so if (EDX & 1) ==0,
-	   then (EDX & 2) must be != 0.  */
+	then	(EDX & 2) must be != 0.  */
 	jz	L(bk_got2)
-	sub	$1, %esi
+	sub	$1, %edi
 	sub	$1, %ecx
 	sub	$1, %edx
-	movzbl	(%esi), %eax
+	movzbl	(%edi), %eax
 	movb	%al, (%edx)
 
 	testl	$2, %edx
 	jz	L(bk_aligned_4)
 
 L(bk_got2):
-	sub	$2, %esi
+	sub	$2, %edi
 	sub	$2, %ecx
 	sub	$2, %edx
-	movzwl	(%esi), %eax
+	movzwl	(%edi), %eax
 	movw	%ax, (%edx)
 	jmp	L(bk_aligned_4)
 
-	ALIGN (4)
+	.p2align 4
 L(bk_write_more64bytes):
 	/* Check alignment of last byte.  */
 	testl	$15, %edx
@@ -2198,52 +3110,53 @@
 
 /* EDX is aligned 4 bytes, but not 16 bytes.  */
 L(bk_ssse3_align):
-	sub	$4, %esi
+	sub	$4, %edi
 	sub	$4, %ecx
 	sub	$4, %edx
-	movl	(%esi), %eax
+	movl	(%edi), %eax
 	movl	%eax, (%edx)
 
 	testl	$15, %edx
 	jz	L(bk_ssse3_cpy_pre)
 
-	sub	$4, %esi
+	sub	$4, %edi
 	sub	$4, %ecx
 	sub	$4, %edx
-	movl	(%esi), %eax
+	movl	(%edi), %eax
 	movl	%eax, (%edx)
 
 	testl	$15, %edx
 	jz	L(bk_ssse3_cpy_pre)
 
-	sub	$4, %esi
+	sub	$4, %edi
 	sub	$4, %ecx
 	sub	$4, %edx
-	movl	(%esi), %eax
+	movl	(%edi), %eax
 	movl	%eax, (%edx)
 
 L(bk_ssse3_cpy_pre):
 	cmp	$64, %ecx
 	jb	L(bk_write_more32bytes)
 
+	.p2align 4
 L(bk_ssse3_cpy):
-	sub	$64, %esi
+	sub	$64, %edi
 	sub	$64, %ecx
 	sub	$64, %edx
-	movdqu	0x30(%esi), %xmm3
+	movdqu	0x30(%edi), %xmm3
 	movdqa	%xmm3, 0x30(%edx)
-	movdqu	0x20(%esi), %xmm2
+	movdqu	0x20(%edi), %xmm2
 	movdqa	%xmm2, 0x20(%edx)
-	movdqu	0x10(%esi), %xmm1
+	movdqu	0x10(%edi), %xmm1
 	movdqa	%xmm1, 0x10(%edx)
-	movdqu	(%esi), %xmm0
+	movdqu	(%edi), %xmm0
 	movdqa	%xmm0, (%edx)
 	cmp	$64, %ecx
 	jae	L(bk_ssse3_cpy)
 	jmp	L(bk_write_64bytesless)
 
+# endif
+
+END (MEMCPY)
+
 #endif
-
-END (MEMCPY)
-
-#endif

Modified: fsf/trunk/libc/sysdeps/unix/sysv/linux/bits/socket.h
==============================================================================
--- fsf/trunk/libc/sysdeps/unix/sysv/linux/bits/socket.h (original)
+++ fsf/trunk/libc/sysdeps/unix/sysv/linux/bits/socket.h Sat Mar 31 00:01:56 2012
@@ -236,11 +236,12 @@
   };
 
 #ifdef __USE_GNU
-/* For `recvmmsg'.  */
+/* For `recvmmsg' and 'sendmmsg'.  */
 struct mmsghdr
   {
     struct msghdr msg_hdr;	/* Actual message header.  */
-    unsigned int msg_len;	/* Number of received bytes for the entry.  */
+    unsigned int msg_len;	/* Number of received or sent bytes
+				   for the entry.  */
   };
 #endif
 

Modified: fsf/trunk/libc/wcsmbs/Makefile
==============================================================================
--- fsf/trunk/libc/wcsmbs/Makefile (original)
+++ fsf/trunk/libc/wcsmbs/Makefile Sat Mar 31 00:01:56 2012
@@ -44,7 +44,7 @@
 strop-tests :=  wcscmp wmemcmp wcslen wcschr wcsrchr wcscpy
 tests := tst-wcstof wcsmbs-tst1 tst-wcsnlen tst-btowc tst-mbrtowc \
 	 tst-wcrtomb tst-wcpncpy tst-mbsrtowcs tst-wchar-h tst-mbrtowc2 \
-	 tst-c16c32-1 \
+	 tst-c16c32-1 tst-mbsnrtowcs \
 	 wcsatcliff $(addprefix test-,$(strop-tests))
 
 include ../Rules
@@ -85,3 +85,4 @@
 tst-wcrtomb-ENV = LOCPATH=$(common-objpfx)localedata
 tst-mbrtowc2-ENV = LOCPATH=$(common-objpfx)localedata
 tst-c16c32-1-ENV = LOCPATH=$(common-objpfx)localedata
+tst-mbsnrtowcs-ENV = LOCPATH=$(common-objpfx)localedata

Added: fsf/trunk/libc/wcsmbs/tst-mbsnrtowcs.c
==============================================================================
--- fsf/trunk/libc/wcsmbs/tst-mbsnrtowcs.c (added)
+++ fsf/trunk/libc/wcsmbs/tst-mbsnrtowcs.c Sat Mar 31 00:01:56 2012
@@ -1,0 +1,83 @@
+/* Copyright (C) 2012 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Tulio Magno Quites Machado Filho <tuliom@xxxxxxxxxxxxxxxxxx>,
+   2012.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* Test bugzilla 13691  */
+
+#include <stdio.h>
+#include <string.h>
+#include <wchar.h>
+#include <locale.h>
+
+static int
+do_test (void)
+{
+  const char * in = "A";
+  const char *inbuf = in;
+  size_t inlen = strchr (in, '\0') - inbuf;
+
+  wchar_t out[5];
+  mbstate_t ps;
+
+  const char *locale = "vi_VN.TCVN5712-1";
+  if (!setlocale (LC_ALL, locale))
+    {
+      printf ("Locale not available.\n");
+      return 1;
+    }
+
+  memset (&ps, '\0', sizeof (ps));
+  memset (out, '\0', sizeof (out));
+
+  /* If the bug isn't fixed, it isn't going to return from mbsnrtowcs due to
+     an assert().  */
+  size_t n = mbsnrtowcs (out, &inbuf, inlen, sizeof(out) - 1, &ps);
+
+  int result = 0;
+
+  if (n != 1)
+    {
+      printf ("n = %zu, expected 1\n", n);
+      result = 1;
+    }
+
+  int i;
+  printf ("in  = ");
+  for (i = 0; i < inlen; i++)
+    {
+      printf ("0x%X ", in[i]);
+    }
+  printf ("\n");
+
+  char * outb = (char *) out;
+  printf ("out =");
+  for (i = 0; i < sizeof (out); i++)
+    {
+      if (i % 4 == 0)
+	{
+	  printf (" 0x");
+	}
+      printf ("%X", outb[i]);
+    }
+  printf ("\n");
+
+  return result;
+}
+
+#define TEST_FUNCTION do_test ()
+#include "../test-skeleton.c"

_______________________________________________
Commits mailing list
Commits@xxxxxxxxxx
http://eglibc.org/cgi-bin/mailman/listinfo/commits