[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[commits] r10247 - in /fsf/trunk/libc: ./ elf/ sysdeps/i386/ sysdeps/i386/i686/multiarch/ sysdeps/x86_64/elf/ sysdeps/x86_64/multiarch/



Author: eglibc
Date: Thu Apr 15 00:03:15 2010
New Revision: 10247

Log:
Import glibc-mainline for 2010-04-15

Added:
    fsf/trunk/libc/sysdeps/i386/i686/multiarch/Versions
    fsf/trunk/libc/sysdeps/i386/i686/multiarch/s_fma-fma.c
    fsf/trunk/libc/sysdeps/i386/i686/multiarch/s_fma.c
    fsf/trunk/libc/sysdeps/i386/i686/multiarch/s_fmaf-fma.c
    fsf/trunk/libc/sysdeps/i386/i686/multiarch/s_fmaf.c
    fsf/trunk/libc/sysdeps/x86_64/multiarch/memcmp-sse4.S
    fsf/trunk/libc/sysdeps/x86_64/multiarch/memcmp.S
    fsf/trunk/libc/sysdeps/x86_64/multiarch/rtld-memcmp.c
Modified:
    fsf/trunk/libc/ChangeLog
    fsf/trunk/libc/elf/dl-version.c
    fsf/trunk/libc/sysdeps/i386/configure
    fsf/trunk/libc/sysdeps/i386/configure.in
    fsf/trunk/libc/sysdeps/i386/i686/multiarch/Makefile
    fsf/trunk/libc/sysdeps/x86_64/elf/configure
    fsf/trunk/libc/sysdeps/x86_64/elf/configure.in
    fsf/trunk/libc/sysdeps/x86_64/multiarch/Makefile

Modified: fsf/trunk/libc/ChangeLog
==============================================================================
--- fsf/trunk/libc/ChangeLog (original)
+++ fsf/trunk/libc/ChangeLog Thu Apr 15 00:03:15 2010
@@ -1,3 +1,37 @@
+2010-04-14  H.J. Lu  <hongjiu.lu@xxxxxxxxx>
+
+	* sysdeps/x86_64/elf/configure.in: Move AVX test to ....
+	* sysdeps/i386/configure.in: ...here.
+	* sysdeps/i386/i686/multiarch/Makefile (libm-sysdep_routines): Define.
+	(CFLAGS-s_fma-fma.c): Define.
+	(CFLAGS-s_fmaf-fma.c): Define.
+	* sysdeps/i386/i686/multiarch/Versions: New file.
+	* sysdeps/i386/i686/multiarch/s_fma-fma.c: New file.
+	* sysdeps/i386/i686/multiarch/s_fma.c: New file.
+	* sysdeps/i386/i686/multiarch/s_fmaf-fma.c: New file.
+	* sysdeps/i386/i686/multiarch/s_fmaf.c: New file.
+
+	* sysdeps/x86_64/multiarch/memcmp-sse4.S: Check
+	DATA_CACHE_SIZE_HALF instead of SHARED_CACHE_SIZE_HALF.
+
+2010-04-14  Andreas Schwab  <schwab@xxxxxxxxxx>
+
+	* elf/dl-version.c (_dl_check_map_versions): Avoid index overflow
+	when dependencies are missing.
+
+2010-04-14  H.J. Lu  <hongjiu.lu@xxxxxxxxx>
+
+	* sysdeps/x86_64/multiarch/memcmp-sse4.S: Optimized for unaligned
+	data.
+
+2010-04-12  H.J. Lu  <hongjiu.lu@xxxxxxxxx>
+
+	* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
+	memcmp-sse4.
+	* sysdeps/x86_64/multiarch/memcmp-sse4.S: New file.
+	* sysdeps/x86_64/multiarch/memcmp.S: New file.
+	* sysdeps/x86_64/multiarch/rtld-memcmp.c: New file.
+
 2010-04-13  Ulrich Drepper  <drepper@xxxxxxxxxx>
 
 	* sysdeps/x86_64/multiarch/init-arch.h: Pretty printing.

Modified: fsf/trunk/libc/elf/dl-version.c
==============================================================================
--- fsf/trunk/libc/elf/dl-version.c (original)
+++ fsf/trunk/libc/elf/dl-version.c Thu Apr 15 00:03:15 2010
@@ -322,10 +322,14 @@
 	      while (1)
 		{
 		  ElfW(Half) ndx = aux->vna_other & 0x7fff;
-		  map->l_versions[ndx].hash = aux->vna_hash;
-		  map->l_versions[ndx].hidden = aux->vna_other & 0x8000;
-		  map->l_versions[ndx].name = &strtab[aux->vna_name];
-		  map->l_versions[ndx].filename = &strtab[ent->vn_file];
+		  /* In trace mode, dependencies may be missing.  */
+		  if (__builtin_expect (ndx < map->l_nversions, 1))
+		    {
+		      map->l_versions[ndx].hash = aux->vna_hash;
+		      map->l_versions[ndx].hidden = aux->vna_other & 0x8000;
+		      map->l_versions[ndx].name = &strtab[aux->vna_name];
+		      map->l_versions[ndx].filename = &strtab[ent->vn_file];
+		    }
 
 		  if (aux->vna_next == 0)
 		    /* No more symbols.  */

Modified: fsf/trunk/libc/sysdeps/i386/configure
==============================================================================
--- fsf/trunk/libc/sysdeps/i386/configure (original)
+++ fsf/trunk/libc/sysdeps/i386/configure Thu Apr 15 00:03:15 2010
@@ -656,3 +656,28 @@
 fi
 { $as_echo "$as_me:$LINENO: result: $libc_cv_as_i686" >&5
 $as_echo "$libc_cv_as_i686" >&6; }
+
+{ $as_echo "$as_me:$LINENO: checking for AVX support" >&5
+$as_echo_n "checking for AVX support... " >&6; }
+if test "${libc_cv_cc_avx+set}" = set; then
+  $as_echo_n "(cached) " >&6
+else
+  if { ac_try='${CC-cc} -mavx -xc /dev/null -S -o /dev/null'
+  { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5
+  (eval $ac_try) 2>&5
+  ac_status=$?
+  $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); }; }; then
+  libc_cv_cc_avx=yes
+else
+  libc_cv_cc_avx=no
+fi
+fi
+{ $as_echo "$as_me:$LINENO: result: $libc_cv_cc_avx" >&5
+$as_echo "$libc_cv_cc_avx" >&6; }
+if test $libc_cv_cc_avx = yes; then
+  cat >>confdefs.h <<\_ACEOF
+#define HAVE_AVX_SUPPORT 1
+_ACEOF
+
+fi

Modified: fsf/trunk/libc/sysdeps/i386/configure.in
==============================================================================
--- fsf/trunk/libc/sysdeps/i386/configure.in (original)
+++ fsf/trunk/libc/sysdeps/i386/configure.in Thu Apr 15 00:03:15 2010
@@ -55,3 +55,14 @@
 else
   libc_cv_as_i686=no
 fi])
+
+dnl Check if -mavx works.
+AC_CACHE_CHECK(for AVX support, libc_cv_cc_avx, [dnl
+if AC_TRY_COMMAND([${CC-cc} -mavx -xc /dev/null -S -o /dev/null]); then
+  libc_cv_cc_avx=yes
+else
+  libc_cv_cc_avx=no
+fi])
+if test $libc_cv_cc_avx = yes; then
+  AC_DEFINE(HAVE_AVX_SUPPORT)
+fi

Modified: fsf/trunk/libc/sysdeps/i386/i686/multiarch/Makefile
==============================================================================
--- fsf/trunk/libc/sysdeps/i386/i686/multiarch/Makefile (original)
+++ fsf/trunk/libc/sysdeps/i386/i686/multiarch/Makefile Thu Apr 15 00:03:15 2010
@@ -19,3 +19,9 @@
 CFLAGS-strcasestr.c += -msse4
 endif
 endif
+
+ifeq (mathyes,$(subdir)$(config-cflags-avx))
+libm-sysdep_routines += s_fma-fma s_fmaf-fma
+CFLAGS-s_fma-fma.c += -mavx -mfpmath=sse
+CFLAGS-s_fmaf-fma.c += -mavx -mfpmath=sse
+endif

Added: fsf/trunk/libc/sysdeps/i386/i686/multiarch/Versions
==============================================================================
--- fsf/trunk/libc/sysdeps/i386/i686/multiarch/Versions (added)
+++ fsf/trunk/libc/sysdeps/i386/i686/multiarch/Versions Thu Apr 15 00:03:15 2010
@@ -1,0 +1,5 @@
+libc {
+  GLIBC_PRIVATE {
+    __get_cpu_features;
+  }
+}

Added: fsf/trunk/libc/sysdeps/i386/i686/multiarch/s_fma-fma.c
==============================================================================
--- fsf/trunk/libc/sysdeps/i386/i686/multiarch/s_fma-fma.c (added)
+++ fsf/trunk/libc/sysdeps/i386/i686/multiarch/s_fma-fma.c Thu Apr 15 00:03:15 2010
@@ -1,0 +1,30 @@
+/* FMA version of fma.
+   Copyright (C) 2010 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA. */
+
+#include <config.h>
+
+#ifdef HAVE_AVX_SUPPORT
+double
+__fma_fma (double x, double y, double z)
+{
+  asm ("vfmadd213sd %3, %2, %0" : "=x" (x) : "0" (x), "x" (y), "xm" (z));
+  return x;
+}
+#endif

Added: fsf/trunk/libc/sysdeps/i386/i686/multiarch/s_fma.c
==============================================================================
--- fsf/trunk/libc/sysdeps/i386/i686/multiarch/s_fma.c (added)
+++ fsf/trunk/libc/sysdeps/i386/i686/multiarch/s_fma.c Thu Apr 15 00:03:15 2010
@@ -1,0 +1,36 @@
+/* Multiple versions of fma.
+   Copyright (C) 2010 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA. */
+
+#include <config.h>
+
+#ifdef HAVE_AVX_SUPPORT
+#include <math.h>
+#include <init-arch.h>
+
+extern double __fma_ia32 (double x, double y, double z) attribute_hidden;
+extern double __fma_fma (double x, double y, double z) attribute_hidden;
+
+libm_ifunc (__fma, HAS_FMA ? __fma_fma : __fma_ia32);
+weak_alias (__fma, fma)
+
+# define __fma __fma_ia32
+#endif
+
+#include <math/s_fma.c>

Added: fsf/trunk/libc/sysdeps/i386/i686/multiarch/s_fmaf-fma.c
==============================================================================
--- fsf/trunk/libc/sysdeps/i386/i686/multiarch/s_fmaf-fma.c (added)
+++ fsf/trunk/libc/sysdeps/i386/i686/multiarch/s_fmaf-fma.c Thu Apr 15 00:03:15 2010
@@ -1,0 +1,30 @@
+/* FMA version of fmaf.
+   Copyright (C) 2010 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA. */
+
+#include <config.h>
+
+#ifdef HAVE_AVX_SUPPORT
+float
+__fmaf_fma (float x, float y, float z)
+{
+  asm ("vfmadd213ss %3, %2, %0" : "=x" (x) : "0" (x), "x" (y), "xm" (z));
+  return x;
+}
+#endif

Added: fsf/trunk/libc/sysdeps/i386/i686/multiarch/s_fmaf.c
==============================================================================
--- fsf/trunk/libc/sysdeps/i386/i686/multiarch/s_fmaf.c (added)
+++ fsf/trunk/libc/sysdeps/i386/i686/multiarch/s_fmaf.c Thu Apr 15 00:03:15 2010
@@ -1,0 +1,36 @@
+/* Multiple versions of fmaf.
+   Copyright (C) 2010 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA. */
+
+#include <config.h>
+
+#ifdef HAVE_AVX_SUPPORT
+#include <math.h>
+#include <init-arch.h>
+
+extern float __fmaf_ia32 (float x, float y, float z) attribute_hidden;
+extern float __fmaf_fma (float x, float y, float z) attribute_hidden;
+
+libm_ifunc (__fmaf, HAS_FMA ? __fmaf_fma : __fmaf_ia32);
+weak_alias (__fmaf, fmaf)
+
+# define __fmaf __fmaf_ia32
+#endif
+
+#include <math/s_fmaf.c>

Modified: fsf/trunk/libc/sysdeps/x86_64/elf/configure
==============================================================================
--- fsf/trunk/libc/sysdeps/x86_64/elf/configure (original)
+++ fsf/trunk/libc/sysdeps/x86_64/elf/configure Thu Apr 15 00:03:15 2010
@@ -46,29 +46,3 @@
 cat >>confdefs.h <<\_ACEOF
 #define PI_STATIC_AND_HIDDEN 1
 _ACEOF
-
-
-{ $as_echo "$as_me:$LINENO: checking for AVX support" >&5
-$as_echo_n "checking for AVX support... " >&6; }
-if test "${libc_cv_cc_avx+set}" = set; then
-  $as_echo_n "(cached) " >&6
-else
-  if { ac_try='${CC-cc} -mavx -xc /dev/null -S -o /dev/null'
-  { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5
-  (eval $ac_try) 2>&5
-  ac_status=$?
-  $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
-  (exit $ac_status); }; }; then
-  libc_cv_cc_avx=yes
-else
-  libc_cv_cc_avx=no
-fi
-fi
-{ $as_echo "$as_me:$LINENO: result: $libc_cv_cc_avx" >&5
-$as_echo "$libc_cv_cc_avx" >&6; }
-if test $libc_cv_cc_avx = yes; then
-  cat >>confdefs.h <<\_ACEOF
-#define HAVE_AVX_SUPPORT 1
-_ACEOF
-
-fi

Modified: fsf/trunk/libc/sysdeps/x86_64/elf/configure.in
==============================================================================
--- fsf/trunk/libc/sysdeps/x86_64/elf/configure.in (original)
+++ fsf/trunk/libc/sysdeps/x86_64/elf/configure.in Thu Apr 15 00:03:15 2010
@@ -32,14 +32,3 @@
 dnl It is always possible to access static and hidden symbols in an
 dnl position independent way.
 AC_DEFINE(PI_STATIC_AND_HIDDEN)
-
-dnl Check if -mavx works.
-AC_CACHE_CHECK(for AVX support, libc_cv_cc_avx, [dnl
-if AC_TRY_COMMAND([${CC-cc} -mavx -xc /dev/null -S -o /dev/null]); then
-  libc_cv_cc_avx=yes
-else
-  libc_cv_cc_avx=no
-fi])
-if test $libc_cv_cc_avx = yes; then
-  AC_DEFINE(HAVE_AVX_SUPPORT)
-fi

Modified: fsf/trunk/libc/sysdeps/x86_64/multiarch/Makefile
==============================================================================
--- fsf/trunk/libc/sysdeps/x86_64/multiarch/Makefile (original)
+++ fsf/trunk/libc/sysdeps/x86_64/multiarch/Makefile Thu Apr 15 00:03:15 2010
@@ -5,7 +5,7 @@
 
 ifeq ($(subdir),string)
 sysdep_routines += stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \
-		   strend-sse4
+		   strend-sse4 memcmp-sse4
 ifeq (yes,$(config-cflags-sse4))
 sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c
 CFLAGS-strcspn-c.c += -msse4

Added: fsf/trunk/libc/sysdeps/x86_64/multiarch/memcmp-sse4.S
==============================================================================
--- fsf/trunk/libc/sysdeps/x86_64/multiarch/memcmp-sse4.S (added)
+++ fsf/trunk/libc/sysdeps/x86_64/multiarch/memcmp-sse4.S Thu Apr 15 00:03:15 2010
@@ -1,0 +1,1635 @@
+/* memcmp with SSE4.1
+   Copyright (C) 2010 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#ifndef NOT_IN_libc
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#ifndef MEMCMP
+# define MEMCMP		__memcmp_sse4_1
+#endif
+
+#ifndef ALIGN
+# define ALIGN(n)	.p2align n
+#endif
+
+#define JMPTBL(I, B)	(I - B)
+
+#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
+  lea		TABLE(%rip), %r11;				\
+  movslq	(%r11, INDEX, SCALE), %rcx;			\
+  add		%r11, %rcx;					\
+  jmp		*%rcx;						\
+  ud2
+
+	.section .text.sse4.1,"ax",@progbits
+ENTRY (MEMCMP)
+	pxor	%xmm0, %xmm0
+	cmp	$79, %rdx
+	ja	L(79bytesormore)
+	cmp	$1, %rdx
+	je	L(firstbyte)
+	add	%rdx, %rsi
+	add	%rdx, %rdi
+	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
+
+	ALIGN (4)
+L(firstbyte):
+	movzbl	(%rdi), %eax
+	movzbl	(%rsi), %ecx
+	sub	%ecx, %eax
+	ret
+
+	ALIGN (4)
+L(79bytesormore):
+	movdqu	(%rsi), %xmm1
+	movdqu	(%rdi), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(16bytesin256)
+	mov	%rsi, %rcx
+	and	$-16, %rsi
+	add	$16, %rsi
+	sub	%rsi, %rcx
+
+	sub	%rcx, %rdi
+	add	%rcx, %rdx
+	test	$0xf, %rdi
+	jz	L(2aligned)
+
+	cmp	$128, %rdx
+	ja	L(128bytesormore)
+L(less128bytes):
+	sub	$64, %rdx
+
+	movdqu	(%rdi), %xmm2
+	pxor	(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(16bytesin256)
+
+	movdqu	16(%rdi), %xmm2
+	pxor	16(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(32bytesin256)
+
+	movdqu	32(%rdi), %xmm2
+	pxor	32(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(48bytesin256)
+
+	movdqu	48(%rdi), %xmm2
+	pxor	48(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(64bytesin256)
+	cmp	$32, %rdx
+	jb	L(less32bytesin64)
+
+	movdqu	64(%rdi), %xmm2
+	pxor	64(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(80bytesin256)
+
+	movdqu	80(%rdi), %xmm2
+	pxor	80(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(96bytesin256)
+	sub	$32, %rdx
+	add	$32, %rdi
+	add	$32, %rsi
+L(less32bytesin64):
+	add	$64, %rdi
+	add	$64, %rsi
+	add	%rdx, %rsi
+	add	%rdx, %rdi
+	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
+
+L(128bytesormore):
+	cmp	$512, %rdx
+	ja	L(512bytesormore)
+	cmp	$256, %rdx
+	ja	L(less512bytes)
+L(less256bytes):
+	sub	$128, %rdx
+
+	movdqu	(%rdi), %xmm2
+	pxor	(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(16bytesin256)
+
+	movdqu	16(%rdi), %xmm2
+	pxor	16(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(32bytesin256)
+
+	movdqu	32(%rdi), %xmm2
+	pxor	32(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(48bytesin256)
+
+	movdqu	48(%rdi), %xmm2
+	pxor	48(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(64bytesin256)
+
+	movdqu	64(%rdi), %xmm2
+	pxor	64(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(80bytesin256)
+
+	movdqu	80(%rdi), %xmm2
+	pxor	80(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(96bytesin256)
+
+	movdqu	96(%rdi), %xmm2
+	pxor	96(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(112bytesin256)
+
+	movdqu	112(%rdi), %xmm2
+	pxor	112(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(128bytesin256)
+
+	add	$128, %rsi
+	add	$128, %rdi
+
+	cmp	$64, %rdx
+	jae	L(less128bytes)
+
+	cmp	$32, %rdx
+	jb	L(less32bytesin128)
+
+	movdqu	(%rdi), %xmm2
+	pxor	(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(16bytesin256)
+
+	movdqu	16(%rdi), %xmm2
+	pxor	16(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(32bytesin256)
+	sub	$32, %rdx
+	add	$32, %rdi
+	add	$32, %rsi
+L(less32bytesin128):
+	add	%rdx, %rsi
+	add	%rdx, %rdi
+	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
+
+L(less512bytes):
+	sub	$256, %rdx
+	movdqu	(%rdi), %xmm2
+	pxor	(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(16bytesin256)
+
+	movdqu	16(%rdi), %xmm2
+	pxor	16(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(32bytesin256)
+
+	movdqu	32(%rdi), %xmm2
+	pxor	32(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(48bytesin256)
+
+	movdqu	48(%rdi), %xmm2
+	pxor	48(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(64bytesin256)
+
+	movdqu	64(%rdi), %xmm2
+	pxor	64(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(80bytesin256)
+
+	movdqu	80(%rdi), %xmm2
+	pxor	80(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(96bytesin256)
+
+	movdqu	96(%rdi), %xmm2
+	pxor	96(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(112bytesin256)
+
+	movdqu	112(%rdi), %xmm2
+	pxor	112(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(128bytesin256)
+
+	movdqu	128(%rdi), %xmm2
+	pxor	128(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(144bytesin256)
+
+	movdqu	144(%rdi), %xmm2
+	pxor	144(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(160bytesin256)
+
+	movdqu	160(%rdi), %xmm2
+	pxor	160(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(176bytesin256)
+
+	movdqu	176(%rdi), %xmm2
+	pxor	176(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(192bytesin256)
+
+	movdqu	192(%rdi), %xmm2
+	pxor	192(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(208bytesin256)
+
+	movdqu	208(%rdi), %xmm2
+	pxor	208(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(224bytesin256)
+
+	movdqu	224(%rdi), %xmm2
+	pxor	224(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(240bytesin256)
+
+	movdqu	240(%rdi), %xmm2
+	pxor	240(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(256bytesin256)
+
+	add	$256, %rsi
+	add	$256, %rdi
+
+	cmp	$128, %rdx
+	jae	L(less256bytes)
+
+	cmp	$64, %rdx
+	jae	L(less128bytes)
+
+	cmp	$32, %rdx
+	jb	L(less32bytesin256)
+
+	movdqu	(%rdi), %xmm2
+	pxor	(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(16bytesin256)
+
+	movdqu	16(%rdi), %xmm2
+	pxor	16(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(32bytesin256)
+	sub	$32, %rdx
+	add	$32, %rdi
+	add	$32, %rsi
+L(less32bytesin256):
+	add	%rdx, %rsi
+	add	%rdx, %rdi
+	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
+
+	ALIGN (4)
+L(512bytesormore):
+#ifdef DATA_CACHE_SIZE_HALF
+	mov	$DATA_CACHE_SIZE_HALF, %r8
+#else
+	mov	__x86_64_data_cache_size_half(%rip), %r8
+#endif
+	mov	%r8, %r9
+	shr	$1, %r8
+	add	%r9, %r8
+	cmp	%r8, %rdx
+	ja	L(L2_L3_cache_unaglined)
+	sub	$64, %rdx
+	ALIGN (4)
+L(64bytesormore_loop):
+	movdqu	(%rdi), %xmm2
+	pxor	(%rsi), %xmm2
+	movdqa	%xmm2, %xmm1
+
+	movdqu	16(%rdi), %xmm3
+	pxor	16(%rsi), %xmm3
+	por	%xmm3, %xmm1
+
+	movdqu	32(%rdi), %xmm4
+	pxor	32(%rsi), %xmm4
+	por	%xmm4, %xmm1
+
+	movdqu	48(%rdi), %xmm5
+	pxor	48(%rsi), %xmm5
+	por	%xmm5, %xmm1
+
+	ptest	%xmm1, %xmm0
+	jnc	L(64bytesormore_loop_end)
+	add	$64, %rsi
+	add	$64, %rdi
+	sub	$64, %rdx
+	jae	L(64bytesormore_loop)
+
+	add	$64, %rdx
+	add	%rdx, %rsi
+	add	%rdx, %rdi
+	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
+
+L(L2_L3_cache_unaglined):
+	sub	$64, %rdx
+	ALIGN (4)
+L(L2_L3_unaligned_128bytes_loop):
+	prefetchnta 0x1c0(%rdi)
+	prefetchnta 0x1c0(%rsi)
+	movdqu	(%rdi), %xmm2
+	pxor	(%rsi), %xmm2
+	movdqa	%xmm2, %xmm1
+
+	movdqu	16(%rdi), %xmm3
+	pxor	16(%rsi), %xmm3
+	por	%xmm3, %xmm1
+
+	movdqu	32(%rdi), %xmm4
+	pxor	32(%rsi), %xmm4
+	por	%xmm4, %xmm1
+
+	movdqu	48(%rdi), %xmm5
+	pxor	48(%rsi), %xmm5
+	por	%xmm5, %xmm1
+
+	ptest	%xmm1, %xmm0
+	jnc	L(64bytesormore_loop_end)
+	add	$64, %rsi
+	add	$64, %rdi
+	sub	$64, %rdx
+	jae	L(L2_L3_unaligned_128bytes_loop)
+
+	add	$64, %rdx
+	add	%rdx, %rsi
+	add	%rdx, %rdi
+	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
+
+/*
+ * This case is for machines which are sensitive for unaligned instructions.
+ */
+	ALIGN (4)
+L(2aligned):
+	cmp	$128, %rdx
+	ja	L(128bytesormorein2aligned)
+L(less128bytesin2aligned):
+	sub	$64, %rdx
+
+	movdqa	(%rdi), %xmm2
+	pxor	(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(16bytesin256)
+
+	movdqa	16(%rdi), %xmm2
+	pxor	16(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(32bytesin256)
+
+	movdqa	32(%rdi), %xmm2
+	pxor	32(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(48bytesin256)
+
+	movdqa	48(%rdi), %xmm2
+	pxor	48(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(64bytesin256)
+	cmp	$32, %rdx
+	jb	L(less32bytesin64in2alinged)
+
+	movdqa	64(%rdi), %xmm2
+	pxor	64(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(80bytesin256)
+
+	movdqa	80(%rdi), %xmm2
+	pxor	80(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(96bytesin256)
+	sub	$32, %rdx
+	add	$32, %rdi
+	add	$32, %rsi
+L(less32bytesin64in2alinged):
+	add	$64, %rdi
+	add	$64, %rsi
+	add	%rdx, %rsi
+	add	%rdx, %rdi
+	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
+
+	ALIGN (4)
+L(128bytesormorein2aligned):
+	cmp	$512, %rdx
+	ja	L(512bytesormorein2aligned)
+	cmp	$256, %rdx
+	ja	L(256bytesormorein2aligned)
+L(less256bytesin2alinged):
+	sub	$128, %rdx
+
+	movdqa	(%rdi), %xmm2
+	pxor	(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(16bytesin256)
+
+	movdqa	16(%rdi), %xmm2
+	pxor	16(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(32bytesin256)
+
+	movdqa	32(%rdi), %xmm2
+	pxor	32(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(48bytesin256)
+
+	movdqa	48(%rdi), %xmm2
+	pxor	48(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(64bytesin256)
+
+	movdqa	64(%rdi), %xmm2
+	pxor	64(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(80bytesin256)
+
+	movdqa	80(%rdi), %xmm2
+	pxor	80(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(96bytesin256)
+
+	movdqa	96(%rdi), %xmm2
+	pxor	96(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(112bytesin256)
+
+	movdqa	112(%rdi), %xmm2
+	pxor	112(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(128bytesin256)
+
+	add	$128, %rsi
+	add	$128, %rdi
+
+	cmp	$64, %rdx
+	jae	L(less128bytesin2aligned)
+
+	cmp	$32, %rdx
+	jb	L(less32bytesin128in2aligned)
+
+	movdqu	(%rdi), %xmm2
+	pxor	(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(16bytesin256)
+
+	movdqu	16(%rdi), %xmm2
+	pxor	16(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(32bytesin256)
+	sub	$32, %rdx
+	add	$32, %rdi
+	add	$32, %rsi
+L(less32bytesin128in2aligned):
+	add	%rdx, %rsi
+	add	%rdx, %rdi
+	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
+
+	ALIGN (4)
+L(256bytesormorein2aligned):
+
+	sub	$256, %rdx
+	movdqa	(%rdi), %xmm2
+	pxor	(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(16bytesin256)
+
+	movdqa	16(%rdi), %xmm2
+	pxor	16(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(32bytesin256)
+
+	movdqa	32(%rdi), %xmm2
+	pxor	32(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(48bytesin256)
+
+	movdqa	48(%rdi), %xmm2
+	pxor	48(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(64bytesin256)
+
+	movdqa	64(%rdi), %xmm2
+	pxor	64(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(80bytesin256)
+
+	movdqa	80(%rdi), %xmm2
+	pxor	80(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(96bytesin256)
+
+	movdqa	96(%rdi), %xmm2
+	pxor	96(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(112bytesin256)
+
+	movdqa	112(%rdi), %xmm2
+	pxor	112(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(128bytesin256)
+
+	movdqa	128(%rdi), %xmm2
+	pxor	128(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(144bytesin256)
+
+	movdqa	144(%rdi), %xmm2
+	pxor	144(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(160bytesin256)
+
+	movdqa	160(%rdi), %xmm2
+	pxor	160(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(176bytesin256)
+
+	movdqa	176(%rdi), %xmm2
+	pxor	176(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(192bytesin256)
+
+	movdqa	192(%rdi), %xmm2
+	pxor	192(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(208bytesin256)
+
+	movdqa	208(%rdi), %xmm2
+	pxor	208(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(224bytesin256)
+
+	movdqa	224(%rdi), %xmm2
+	pxor	224(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(240bytesin256)
+
+	movdqa	240(%rdi), %xmm2
+	pxor	240(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(256bytesin256)
+
+	add	$256, %rsi
+	add	$256, %rdi
+
+	cmp	$128, %rdx
+	jae	L(less256bytesin2alinged)
+
+	cmp	$64, %rdx
+	jae	L(less128bytesin2aligned)
+
+	cmp	$32, %rdx
+	jb	L(less32bytesin256in2alinged)
+
+	movdqa	(%rdi), %xmm2
+	pxor	(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(16bytesin256)
+
+	movdqa	16(%rdi), %xmm2
+	pxor	16(%rsi), %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(32bytesin256)
+	sub	$32, %rdx
+	add	$32, %rdi
+	add	$32, %rsi
+L(less32bytesin256in2alinged):
+	add	%rdx, %rsi
+	add	%rdx, %rdi
+	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
+
+	ALIGN (4)
+L(512bytesormorein2aligned):
+#ifdef DATA_CACHE_SIZE_HALF
+	mov	$DATA_CACHE_SIZE_HALF, %r8
+#else
+	mov	__x86_64_data_cache_size_half(%rip), %r8
+#endif
+	mov	%r8, %r9
+	shr	$1, %r8
+	add	%r9, %r8
+	cmp	%r8, %rdx
+	ja	L(L2_L3_cache_aglined)
+
+	sub	$64, %rdx
+	ALIGN (4)
+L(64bytesormore_loopin2aligned):
+	movdqa	(%rdi), %xmm2
+	pxor	(%rsi), %xmm2
+	movdqa	%xmm2, %xmm1
+
+	movdqa	16(%rdi), %xmm3
+	pxor	16(%rsi), %xmm3
+	por	%xmm3, %xmm1
+
+	movdqa	32(%rdi), %xmm4
+	pxor	32(%rsi), %xmm4
+	por	%xmm4, %xmm1
+
+	movdqa	48(%rdi), %xmm5
+	pxor	48(%rsi), %xmm5
+	por	%xmm5, %xmm1
+
+	ptest	%xmm1, %xmm0
+	jnc	L(64bytesormore_loop_end)
+	add	$64, %rsi
+	add	$64, %rdi
+	sub	$64, %rdx
+	jae	L(64bytesormore_loopin2aligned)
+
+	add	$64, %rdx
+	add	%rdx, %rsi
+	add	%rdx, %rdi
+	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
+L(L2_L3_cache_aglined):
+	sub	$64, %rdx
+	ALIGN (4)
+L(L2_L3_aligned_128bytes_loop):
+	prefetchnta 0x1c0(%rdi)
+	prefetchnta 0x1c0(%rsi)
+	movdqa	(%rdi), %xmm2
+	pxor	(%rsi), %xmm2
+	movdqa	%xmm2, %xmm1
+
+	movdqa	16(%rdi), %xmm3
+	pxor	16(%rsi), %xmm3
+	por	%xmm3, %xmm1
+
+	movdqa	32(%rdi), %xmm4
+	pxor	32(%rsi), %xmm4
+	por	%xmm4, %xmm1
+
+	movdqa	48(%rdi), %xmm5
+	pxor	48(%rsi), %xmm5
+	por	%xmm5, %xmm1
+
+	ptest	%xmm1, %xmm0
+	jnc	L(64bytesormore_loop_end)
+	add	$64, %rsi
+	add	$64, %rdi
+	sub	$64, %rdx
+	jae	L(L2_L3_aligned_128bytes_loop)
+
+	add	$64, %rdx
+	add	%rdx, %rsi
+	add	%rdx, %rdi
+	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
+
+
+	ALIGN (4)
+L(64bytesormore_loop_end):
+	add	$16, %rdi
+	add	$16, %rsi
+	ptest	%xmm2, %xmm0
+	jnc	L(16bytes)
+
+	add	$16, %rdi
+	add	$16, %rsi
+	ptest	%xmm3, %xmm0
+	jnc	L(16bytes)
+
+	add	$16, %rdi
+	add	$16, %rsi
+	ptest	%xmm4, %xmm0
+	jnc	L(16bytes)
+
+	add	$16, %rdi
+	add	$16, %rsi
+	jmp	L(16bytes)
+
+L(256bytesin256):
+	add	$256, %rdi
+	add	$256, %rsi
+	jmp	L(16bytes)
+L(240bytesin256):
+	add	$240, %rdi
+	add	$240, %rsi
+	jmp	L(16bytes)
+L(224bytesin256):
+	add	$224, %rdi
+	add	$224, %rsi
+	jmp	L(16bytes)
+L(208bytesin256):
+	add	$208, %rdi
+	add	$208, %rsi
+	jmp	L(16bytes)
+L(192bytesin256):
+	add	$192, %rdi
+	add	$192, %rsi
+	jmp	L(16bytes)
+L(176bytesin256):
+	add	$176, %rdi
+	add	$176, %rsi
+	jmp	L(16bytes)
+L(160bytesin256):
+	add	$160, %rdi
+	add	$160, %rsi
+	jmp	L(16bytes)
+L(144bytesin256):
+	add	$144, %rdi
+	add	$144, %rsi
+	jmp	L(16bytes)
+L(128bytesin256):
+	add	$128, %rdi
+	add	$128, %rsi
+	jmp	L(16bytes)
+L(112bytesin256):
+	add	$112, %rdi
+	add	$112, %rsi
+	jmp	L(16bytes)
+L(96bytesin256):
+	add	$96, %rdi
+	add	$96, %rsi
+	jmp	L(16bytes)
+L(80bytesin256):
+	add	$80, %rdi
+	add	$80, %rsi
+	jmp	L(16bytes)
+L(64bytesin256):
+	add	$64, %rdi
+	add	$64, %rsi
+	jmp	L(16bytes)
+L(48bytesin256):
+	add	$16, %rdi
+	add	$16, %rsi
+L(32bytesin256):
+	add	$16, %rdi
+	add	$16, %rsi
+L(16bytesin256):
+	add	$16, %rdi
+	add	$16, %rsi
+L(16bytes):
+	mov	-16(%rdi), %rax
+	mov	-16(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+L(8bytes):
+	mov	-8(%rdi), %rax
+	mov	-8(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	xor	%eax, %eax
+	ret
+
+	ALIGN (4)
+L(12bytes):
+	mov	-12(%rdi), %rax
+	mov	-12(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+L(4bytes):
+	mov	-4(%rsi), %ecx
+	mov	-4(%rdi), %eax
+	cmp	%eax, %ecx
+	jne	L(diffin4bytes)
+L(0bytes):
+	xor	%eax, %eax
+	ret
+
+	ALIGN (4)
+L(65bytes):
+	movdqu	-65(%rdi), %xmm1
+	movdqu	-65(%rsi), %xmm2
+	mov	$-65, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(49bytes):
+	movdqu	-49(%rdi), %xmm1
+	movdqu	-49(%rsi), %xmm2
+	mov	$-49, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(33bytes):
+	movdqu	-33(%rdi), %xmm1
+	movdqu	-33(%rsi), %xmm2
+	mov	$-33, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(17bytes):
+	mov	-17(%rdi), %rax
+	mov	-17(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+L(9bytes):
+	mov	-9(%rdi), %rax
+	mov	-9(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	movzbl	-1(%rdi), %eax
+	movzbl	-1(%rsi), %edx
+	sub	%edx, %eax
+	ret
+
+	ALIGN (4)
+L(13bytes):
+	mov	-13(%rdi), %rax
+	mov	-13(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	mov	-8(%rdi), %rax
+	mov	-8(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	xor	%eax, %eax
+	ret
+
+	ALIGN (4)
+L(5bytes):
+	mov	-5(%rdi), %eax
+	mov	-5(%rsi), %ecx
+	cmp	%eax, %ecx
+	jne	L(diffin4bytes)
+	movzbl	-1(%rdi), %eax
+	movzbl	-1(%rsi), %edx
+	sub	%edx, %eax
+	ret
+
+	ALIGN (4)
+L(66bytes):
+	movdqu	-66(%rdi), %xmm1
+	movdqu	-66(%rsi), %xmm2
+	mov	$-66, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(50bytes):
+	movdqu	-50(%rdi), %xmm1
+	movdqu	-50(%rsi), %xmm2
+	mov	$-50, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(34bytes):
+	movdqu	-34(%rdi), %xmm1
+	movdqu	-34(%rsi), %xmm2
+	mov	$-34, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(18bytes):
+	mov	-18(%rdi), %rax
+	mov	-18(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+L(10bytes):
+	mov	-10(%rdi), %rax
+	mov	-10(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	movzwl	-2(%rdi), %eax
+	movzwl	-2(%rsi), %ecx
+	cmp	%cl, %al
+	jne	L(end)
+	and	$0xffff, %eax
+	and	$0xffff, %ecx
+	sub	%ecx, %eax
+	ret
+
+	ALIGN (4)
+L(14bytes):
+	mov	-14(%rdi), %rax
+	mov	-14(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	mov	-8(%rdi), %rax
+	mov	-8(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	xor	%eax, %eax
+	ret
+
+	ALIGN (4)
+L(6bytes):
+	mov	-6(%rdi), %eax
+	mov	-6(%rsi), %ecx
+	cmp	%eax, %ecx
+	jne	L(diffin4bytes)
+L(2bytes):
+	movzwl	-2(%rsi), %ecx
+	movzwl	-2(%rdi), %eax
+	cmp	%cl, %al
+	jne	L(end)
+	and	$0xffff, %eax
+	and	$0xffff, %ecx
+	sub	%ecx, %eax
+	ret
+
+	ALIGN (4)
+L(67bytes):
+	movdqu	-67(%rdi), %xmm2
+	movdqu	-67(%rsi), %xmm1
+	mov	$-67, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(51bytes):
+	movdqu	-51(%rdi), %xmm2
+	movdqu	-51(%rsi), %xmm1
+	mov	$-51, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(35bytes):
+	movdqu	-35(%rsi), %xmm1
+	movdqu	-35(%rdi), %xmm2
+	mov	$-35, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(19bytes):
+	mov	-19(%rdi), %rax
+	mov	-19(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+L(11bytes):
+	mov	-11(%rdi), %rax
+	mov	-11(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	mov	-4(%rdi), %eax
+	mov	-4(%rsi), %ecx
+	cmp	%eax, %ecx
+	jne	L(diffin4bytes)
+	xor	%eax, %eax
+	ret
+
+	ALIGN (4)
+L(15bytes):
+	mov	-15(%rdi), %rax
+	mov	-15(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	mov	-8(%rdi), %rax
+	mov	-8(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	xor	%eax, %eax
+	ret
+
+	ALIGN (4)
+L(7bytes):
+	mov	-7(%rdi), %eax
+	mov	-7(%rsi), %ecx
+	cmp	%eax, %ecx
+	jne	L(diffin4bytes)
+	mov	-4(%rdi), %eax
+	mov	-4(%rsi), %ecx
+	cmp	%eax, %ecx
+	jne	L(diffin4bytes)
+	xor	%eax, %eax
+	ret
+
+	ALIGN (4)
+L(3bytes):
+	movzwl	-3(%rdi), %eax
+	movzwl	-3(%rsi), %ecx
+	cmp	%eax, %ecx
+	jne	L(diffin2bytes)
+L(1bytes):
+	movzbl	-1(%rdi), %eax
+	movzbl	-1(%rsi), %ecx
+	sub	%ecx, %eax
+	ret
+
+	ALIGN (4)
+L(68bytes):
+	movdqu	-68(%rdi), %xmm2
+	movdqu	-68(%rsi), %xmm1
+	mov	$-68, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(52bytes):
+	movdqu	-52(%rdi), %xmm2
+	movdqu	-52(%rsi), %xmm1
+	mov	$-52, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(36bytes):
+	movdqu	-36(%rdi), %xmm2
+	movdqu	-36(%rsi), %xmm1
+	mov	$-36, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(20bytes):
+	movdqu	-20(%rdi), %xmm2
+	movdqu	-20(%rsi), %xmm1
+	mov	$-20, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+	mov	-4(%rdi), %eax
+	mov	-4(%rsi), %ecx
+	cmp	%eax, %ecx
+	jne	L(diffin4bytes)
+	xor	%eax, %eax
+	ret
+
+	ALIGN (4)
+L(69bytes):
+	movdqu	-69(%rsi), %xmm1
+	movdqu	-69(%rdi), %xmm2
+	mov	$-69, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(53bytes):
+	movdqu	-53(%rsi), %xmm1
+	movdqu	-53(%rdi), %xmm2
+	mov	$-53, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(37bytes):
+	movdqu	-37(%rsi), %xmm1
+	movdqu	-37(%rdi), %xmm2
+	mov	$-37, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(21bytes):
+	movdqu	-21(%rsi), %xmm1
+	movdqu	-21(%rdi), %xmm2
+	mov	$-21, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+	mov	-8(%rdi), %rax
+	mov	-8(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	xor	%eax, %eax
+	ret
+
+	ALIGN (4)
+L(70bytes):
+	movdqu	-70(%rsi), %xmm1
+	movdqu	-70(%rdi), %xmm2
+	mov	$-70, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(54bytes):
+	movdqu	-54(%rsi), %xmm1
+	movdqu	-54(%rdi), %xmm2
+	mov	$-54, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(38bytes):
+	movdqu	-38(%rsi), %xmm1
+	movdqu	-38(%rdi), %xmm2
+	mov	$-38, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(22bytes):
+	movdqu	-22(%rsi), %xmm1
+	movdqu	-22(%rdi), %xmm2
+	mov	$-22, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+	mov	-8(%rdi), %rax
+	mov	-8(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	xor	%eax, %eax
+	ret
+
+	ALIGN (4)
+L(71bytes):
+	movdqu	-71(%rsi), %xmm1
+	movdqu	-71(%rdi), %xmm2
+	mov	$-71, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(55bytes):
+	movdqu	-55(%rdi), %xmm2
+	movdqu	-55(%rsi), %xmm1
+	mov	$-55, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(39bytes):
+	movdqu	-39(%rdi), %xmm2
+	movdqu	-39(%rsi), %xmm1
+	mov	$-39, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(23bytes):
+	movdqu	-23(%rdi), %xmm2
+	movdqu	-23(%rsi), %xmm1
+	mov	$-23, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+	mov	-8(%rdi), %rax
+	mov	-8(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	xor	%eax, %eax
+	ret
+
+	ALIGN (4)
+L(72bytes):
+	movdqu	-72(%rsi), %xmm1
+	movdqu	-72(%rdi), %xmm2
+	mov	$-72, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(56bytes):
+	movdqu	-56(%rdi), %xmm2
+	movdqu	-56(%rsi), %xmm1
+	mov	$-56, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(40bytes):
+	movdqu	-40(%rdi), %xmm2
+	movdqu	-40(%rsi), %xmm1
+	mov	$-40, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(24bytes):
+	movdqu	-24(%rdi), %xmm2
+	movdqu	-24(%rsi), %xmm1
+	mov	$-24, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+	mov	-8(%rdi), %rax
+	mov	-8(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	xor	%eax, %eax
+	ret
+
+	ALIGN (4)
+L(73bytes):
+	movdqu	-73(%rsi), %xmm1
+	movdqu	-73(%rdi), %xmm2
+	mov	$-73, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(57bytes):
+	movdqu	-57(%rdi), %xmm2
+	movdqu	-57(%rsi), %xmm1
+	mov	$-57, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(41bytes):
+	movdqu	-41(%rdi), %xmm2
+	movdqu	-41(%rsi), %xmm1
+	mov	$-41, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(25bytes):
+	movdqu	-25(%rdi), %xmm2
+	movdqu	-25(%rsi), %xmm1
+	mov	$-25, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+	mov	-9(%rdi), %rax
+	mov	-9(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	movzbl	-1(%rdi), %eax
+	movzbl	-1(%rsi), %ecx
+	sub	%ecx, %eax
+	ret
+
+	ALIGN (4)
+L(74bytes):
+	movdqu	-74(%rsi), %xmm1
+	movdqu	-74(%rdi), %xmm2
+	mov	$-74, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(58bytes):
+	movdqu	-58(%rdi), %xmm2
+	movdqu	-58(%rsi), %xmm1
+	mov	$-58, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(42bytes):
+	movdqu	-42(%rdi), %xmm2
+	movdqu	-42(%rsi), %xmm1
+	mov	$-42, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(26bytes):
+	movdqu	-26(%rdi), %xmm2
+	movdqu	-26(%rsi), %xmm1
+	mov	$-26, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+	mov	-10(%rdi), %rax
+	mov	-10(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	movzwl	-2(%rdi), %eax
+	movzwl	-2(%rsi), %ecx
+	jmp	L(diffin2bytes)
+
+	ALIGN (4)
+L(75bytes):
+	movdqu	-75(%rsi), %xmm1
+	movdqu	-75(%rdi), %xmm2
+	mov	$-75, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(59bytes):
+	movdqu	-59(%rdi), %xmm2
+	movdqu	-59(%rsi), %xmm1
+	mov	$-59, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(43bytes):
+	movdqu	-43(%rdi), %xmm2
+	movdqu	-43(%rsi), %xmm1
+	mov	$-43, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(27bytes):
+	movdqu	-27(%rdi), %xmm2
+	movdqu	-27(%rsi), %xmm1
+	mov	$-27, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+	mov	-11(%rdi), %rax
+	mov	-11(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	mov	-4(%rdi), %eax
+	mov	-4(%rsi), %ecx
+	cmp	%eax, %ecx
+	jne	L(diffin4bytes)
+	xor	%eax, %eax
+	ret
+
+	ALIGN (4)
+L(76bytes):
+	movdqu	-76(%rsi), %xmm1
+	movdqu	-76(%rdi), %xmm2
+	mov	$-76, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(60bytes):
+	movdqu	-60(%rdi), %xmm2
+	movdqu	-60(%rsi), %xmm1
+	mov	$-60, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(44bytes):
+	movdqu	-44(%rdi), %xmm2
+	movdqu	-44(%rsi), %xmm1
+	mov	$-44, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(28bytes):
+	movdqu	-28(%rdi), %xmm2
+	movdqu	-28(%rsi), %xmm1
+	mov	$-28, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+	mov	-12(%rdi), %rax
+	mov	-12(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	mov	-4(%rdi), %eax
+	mov	-4(%rsi), %ecx
+	cmp	%eax, %ecx
+	jne	L(diffin4bytes)
+	xor	%eax, %eax
+	ret
+
+	ALIGN (4)
+L(77bytes):
+	movdqu	-77(%rsi), %xmm1
+	movdqu	-77(%rdi), %xmm2
+	mov	$-77, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(61bytes):
+	movdqu	-61(%rdi), %xmm2
+	movdqu	-61(%rsi), %xmm1
+	mov	$-61, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(45bytes):
+	movdqu	-45(%rdi), %xmm2
+	movdqu	-45(%rsi), %xmm1
+	mov	$-45, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(29bytes):
+	movdqu	-29(%rdi), %xmm2
+	movdqu	-29(%rsi), %xmm1
+	mov	$-29, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+
+	mov	-13(%rdi), %rax
+	mov	-13(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+
+	mov	-8(%rdi), %rax
+	mov	-8(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	xor	%eax, %eax
+	ret
+
+	ALIGN (4)
+L(78bytes):
+	movdqu	-78(%rsi), %xmm1
+	movdqu	-78(%rdi), %xmm2
+	mov	$-78, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(62bytes):
+	movdqu	-62(%rdi), %xmm2
+	movdqu	-62(%rsi), %xmm1
+	mov	$-62, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(46bytes):
+	movdqu	-46(%rdi), %xmm2
+	movdqu	-46(%rsi), %xmm1
+	mov	$-46, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(30bytes):
+	movdqu	-30(%rdi), %xmm2
+	movdqu	-30(%rsi), %xmm1
+	mov	$-30, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+	mov	-14(%rdi), %rax
+	mov	-14(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	mov	-8(%rdi), %rax
+	mov	-8(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	xor	%eax, %eax
+	ret
+
+	ALIGN (4)
+L(79bytes):
+	movdqu	-79(%rsi), %xmm1
+	movdqu	-79(%rdi), %xmm2
+	mov	$-79, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(63bytes):
+	movdqu	-63(%rdi), %xmm2
+	movdqu	-63(%rsi), %xmm1
+	mov	$-63, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(47bytes):
+	movdqu	-47(%rdi), %xmm2
+	movdqu	-47(%rsi), %xmm1
+	mov	$-47, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(31bytes):
+	movdqu	-31(%rdi), %xmm2
+	movdqu	-31(%rsi), %xmm1
+	mov	$-31, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+	mov	-15(%rdi), %rax
+	mov	-15(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	mov	-8(%rdi), %rax
+	mov	-8(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	xor	%eax, %eax
+	ret
+
+	ALIGN (4)
+L(64bytes):
+	movdqu	-64(%rdi), %xmm2
+	movdqu	-64(%rsi), %xmm1
+	mov	$-64, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(48bytes):
+	movdqu	-48(%rdi), %xmm2
+	movdqu	-48(%rsi), %xmm1
+	mov	$-48, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(32bytes):
+	movdqu	-32(%rdi), %xmm2
+	movdqu	-32(%rsi), %xmm1
+	mov	$-32, %dl
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+
+	mov	-16(%rdi), %rax
+	mov	-16(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+
+	mov	-8(%rdi), %rax
+	mov	-8(%rsi), %rcx
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	xor	%eax, %eax
+	ret
+
+/*
+ * Aligned 8 bytes to avoid 2 branch "taken" in one 16 alinged code block.
+ */
+	ALIGN (3)
+L(less16bytes):
+	movsbq	%dl, %rdx
+	mov	(%rsi, %rdx), %rcx
+	mov	(%rdi, %rdx), %rax
+	cmp	%rax, %rcx
+	jne	L(diffin8bytes)
+	mov	8(%rsi, %rdx), %rcx
+	mov	8(%rdi, %rdx), %rax
+L(diffin8bytes):
+	cmp	%eax, %ecx
+	jne	L(diffin4bytes)
+	shr	$32, %rcx
+	shr	$32, %rax
+L(diffin4bytes):
+	cmp	%cx, %ax
+	jne	L(diffin2bytes)
+	shr	$16, %ecx
+	shr	$16, %eax
+L(diffin2bytes):
+	cmp	%cl, %al
+	jne	L(end)
+	and	$0xffff, %eax
+	and	$0xffff, %ecx
+	sub	%ecx, %eax
+	ret
+
+	ALIGN (4)
+L(end):
+	and	$0xff, %eax
+	and	$0xff, %ecx
+	sub	%ecx, %eax
+	ret
+
+END (MEMCMP)
+
+	.section .rodata.sse4.1,"a",@progbits
+	ALIGN (3)
+L(table_64bytes):
+	.int	JMPTBL (L(0bytes), L(table_64bytes))
+	.int	JMPTBL (L(1bytes), L(table_64bytes))
+	.int	JMPTBL (L(2bytes), L(table_64bytes))
+	.int	JMPTBL (L(3bytes), L(table_64bytes))
+	.int	JMPTBL (L(4bytes), L(table_64bytes))
+	.int	JMPTBL (L(5bytes), L(table_64bytes))
+	.int	JMPTBL (L(6bytes), L(table_64bytes))
+	.int	JMPTBL (L(7bytes), L(table_64bytes))
+	.int	JMPTBL (L(8bytes), L(table_64bytes))
+	.int	JMPTBL (L(9bytes), L(table_64bytes))
+	.int	JMPTBL (L(10bytes), L(table_64bytes))
+	.int	JMPTBL (L(11bytes), L(table_64bytes))
+	.int	JMPTBL (L(12bytes), L(table_64bytes))
+	.int	JMPTBL (L(13bytes), L(table_64bytes))
+	.int	JMPTBL (L(14bytes), L(table_64bytes))
+	.int	JMPTBL (L(15bytes), L(table_64bytes))
+	.int	JMPTBL (L(16bytes), L(table_64bytes))
+	.int	JMPTBL (L(17bytes), L(table_64bytes))
+	.int	JMPTBL (L(18bytes), L(table_64bytes))
+	.int	JMPTBL (L(19bytes), L(table_64bytes))
+	.int	JMPTBL (L(20bytes), L(table_64bytes))
+	.int	JMPTBL (L(21bytes), L(table_64bytes))
+	.int	JMPTBL (L(22bytes), L(table_64bytes))
+	.int	JMPTBL (L(23bytes), L(table_64bytes))
+	.int	JMPTBL (L(24bytes), L(table_64bytes))
+	.int	JMPTBL (L(25bytes), L(table_64bytes))
+	.int	JMPTBL (L(26bytes), L(table_64bytes))
+	.int	JMPTBL (L(27bytes), L(table_64bytes))
+	.int	JMPTBL (L(28bytes), L(table_64bytes))
+	.int	JMPTBL (L(29bytes), L(table_64bytes))
+	.int	JMPTBL (L(30bytes), L(table_64bytes))
+	.int	JMPTBL (L(31bytes), L(table_64bytes))
+	.int	JMPTBL (L(32bytes), L(table_64bytes))
+	.int	JMPTBL (L(33bytes), L(table_64bytes))
+	.int	JMPTBL (L(34bytes), L(table_64bytes))
+	.int	JMPTBL (L(35bytes), L(table_64bytes))
+	.int	JMPTBL (L(36bytes), L(table_64bytes))
+	.int	JMPTBL (L(37bytes), L(table_64bytes))
+	.int	JMPTBL (L(38bytes), L(table_64bytes))
+	.int	JMPTBL (L(39bytes), L(table_64bytes))
+	.int	JMPTBL (L(40bytes), L(table_64bytes))
+	.int	JMPTBL (L(41bytes), L(table_64bytes))
+	.int	JMPTBL (L(42bytes), L(table_64bytes))
+	.int	JMPTBL (L(43bytes), L(table_64bytes))
+	.int	JMPTBL (L(44bytes), L(table_64bytes))
+	.int	JMPTBL (L(45bytes), L(table_64bytes))
+	.int	JMPTBL (L(46bytes), L(table_64bytes))
+	.int	JMPTBL (L(47bytes), L(table_64bytes))
+	.int	JMPTBL (L(48bytes), L(table_64bytes))
+	.int	JMPTBL (L(49bytes), L(table_64bytes))
+	.int	JMPTBL (L(50bytes), L(table_64bytes))
+	.int	JMPTBL (L(51bytes), L(table_64bytes))
+	.int	JMPTBL (L(52bytes), L(table_64bytes))
+	.int	JMPTBL (L(53bytes), L(table_64bytes))
+	.int	JMPTBL (L(54bytes), L(table_64bytes))
+	.int	JMPTBL (L(55bytes), L(table_64bytes))
+	.int	JMPTBL (L(56bytes), L(table_64bytes))
+	.int	JMPTBL (L(57bytes), L(table_64bytes))
+	.int	JMPTBL (L(58bytes), L(table_64bytes))
+	.int	JMPTBL (L(59bytes), L(table_64bytes))
+	.int	JMPTBL (L(60bytes), L(table_64bytes))
+	.int	JMPTBL (L(61bytes), L(table_64bytes))
+	.int	JMPTBL (L(62bytes), L(table_64bytes))
+	.int	JMPTBL (L(63bytes), L(table_64bytes))
+	.int	JMPTBL (L(64bytes), L(table_64bytes))
+	.int	JMPTBL (L(65bytes), L(table_64bytes))
+	.int	JMPTBL (L(66bytes), L(table_64bytes))
+	.int	JMPTBL (L(67bytes), L(table_64bytes))
+	.int	JMPTBL (L(68bytes), L(table_64bytes))
+	.int	JMPTBL (L(69bytes), L(table_64bytes))
+	.int	JMPTBL (L(70bytes), L(table_64bytes))
+	.int	JMPTBL (L(71bytes), L(table_64bytes))
+	.int	JMPTBL (L(72bytes), L(table_64bytes))
+	.int	JMPTBL (L(73bytes), L(table_64bytes))
+	.int	JMPTBL (L(74bytes), L(table_64bytes))
+	.int	JMPTBL (L(75bytes), L(table_64bytes))
+	.int	JMPTBL (L(76bytes), L(table_64bytes))
+	.int	JMPTBL (L(77bytes), L(table_64bytes))
+	.int	JMPTBL (L(78bytes), L(table_64bytes))
+	.int	JMPTBL (L(79bytes), L(table_64bytes))
+#endif

Added: fsf/trunk/libc/sysdeps/x86_64/multiarch/memcmp.S
==============================================================================
--- fsf/trunk/libc/sysdeps/x86_64/multiarch/memcmp.S (added)
+++ fsf/trunk/libc/sysdeps/x86_64/multiarch/memcmp.S Thu Apr 15 00:03:15 2010
@@ -1,0 +1,59 @@
+/* Multiple versions of memcmp
+   Copyright (C) 2010 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc. */
+#ifndef NOT_IN_libc
+	.text
+ENTRY(memcmp)
+	.type	memcmp, @gnu_indirect_function
+	cmpl	$0, KIND_OFFSET+__cpu_features(%rip)
+	jne	1f
+	call	__init_cpu_features
+1:	leaq	__memcmp_sse2(%rip), %rax
+	testl	$bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
+	jz	2f
+	leaq	__memcmp_sse4_1(%rip), %rax
+2:	ret
+END(memcmp)
+
+# undef ENTRY
+# define ENTRY(name) \
+	.type __memcmp_sse2, @function; \
+	.p2align 4; \
+	__memcmp_sse2: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size __memcmp_sse2, .-__memcmp_sse2
+
+# ifdef SHARED
+#  undef libc_hidden_builtin_def
+/* IFUNC doesn't work with the hidden functions in shared library since
+   they will be called without setting up EBX needed for PLT which is
+   used by IFUNC.  */
+#  define libc_hidden_builtin_def(name) \
+	.globl __GI_memcmp; __GI_memcmp = __memcmp_sse2
+# endif
+#endif
+
+#include "../memcmp.S"

Added: fsf/trunk/libc/sysdeps/x86_64/multiarch/rtld-memcmp.c
==============================================================================
--- fsf/trunk/libc/sysdeps/x86_64/multiarch/rtld-memcmp.c (added)
+++ fsf/trunk/libc/sysdeps/x86_64/multiarch/rtld-memcmp.c Thu Apr 15 00:03:15 2010
@@ -1,0 +1,1 @@
+#include "../rtld-memcmp.c"