[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[commits] r8708 - in /fsf/trunk/libc: ./ elf/ nptl/ resolv/ sysdeps/unix/sysv/linux/ sysdeps/x86_64/ sysdeps/x86_64/multiarch/
- To: commits@xxxxxxxxxx
- Subject: [commits] r8708 - in /fsf/trunk/libc: ./ elf/ nptl/ resolv/ sysdeps/unix/sysv/linux/ sysdeps/x86_64/ sysdeps/x86_64/multiarch/
- From: eglibc@xxxxxxxxxx
- Date: Mon, 27 Jul 2009 07:07:22 -0000
Author: eglibc
Date: Mon Jul 27 00:07:22 2009
New Revision: 8708
Log:
Import glibc-mainline for 2009-07-27
Added:
fsf/trunk/libc/sysdeps/x86_64/multiarch/rtld-rawmemchr.c
fsf/trunk/libc/sysdeps/x86_64/multiarch/rtld-strlen.S
fsf/trunk/libc/sysdeps/x86_64/rtld-memchr.c
fsf/trunk/libc/sysdeps/x86_64/rtld-memcmp.c
fsf/trunk/libc/sysdeps/x86_64/rtld-rawmemchr.c
fsf/trunk/libc/sysdeps/x86_64/rtld-strchr.S
fsf/trunk/libc/sysdeps/x86_64/rtld-strcmp.S
fsf/trunk/libc/sysdeps/x86_64/rtld-strlen.S
fsf/trunk/libc/sysdeps/x86_64/strncmp.S
fsf/trunk/libc/sysdeps/x86_64/tst-xmmymm.sh (with props)
Removed:
fsf/trunk/libc/sysdeps/x86_64/multiarch/strncmp-c.c
Modified:
fsf/trunk/libc/ChangeLog
fsf/trunk/libc/elf/elf.h
fsf/trunk/libc/nptl/ChangeLog
fsf/trunk/libc/nptl/pthread_mutex_lock.c
fsf/trunk/libc/nptl/pthread_mutex_timedlock.c
fsf/trunk/libc/resolv/res_send.c
fsf/trunk/libc/sysdeps/unix/sysv/linux/eventfd.c
fsf/trunk/libc/sysdeps/unix/sysv/linux/kernel-features.h
fsf/trunk/libc/sysdeps/unix/sysv/linux/signalfd.c
fsf/trunk/libc/sysdeps/x86_64/Makefile
fsf/trunk/libc/sysdeps/x86_64/multiarch/Makefile
fsf/trunk/libc/sysdeps/x86_64/multiarch/strcmp.S
fsf/trunk/libc/sysdeps/x86_64/strcmp.S
Modified: fsf/trunk/libc/ChangeLog
==============================================================================
--- fsf/trunk/libc/ChangeLog (original)
+++ fsf/trunk/libc/ChangeLog Mon Jul 27 00:07:22 2009
@@ -1,3 +1,53 @@
+2009-07-26 Ulrich Drepper <drepper@xxxxxxxxxx>
+
+ * sysdeps/x86_64/tst-xmmymm.sh: New file. Check whether any of the
+ functions used in ld.so modify xmm/ymm registers.
+ * sysdeps/x86_64/Makefile: Hook new test up.
+ * sysdeps/x86_64/rtld-memchr.c: New file.
+ * sysdeps/x86_64/rtld-memcmp.c: New file.
+ * sysdeps/x86_64/rtld-rawmemchr.c: New file.
+ * sysdeps/x86_64/rtld-strchr.S: New file.
+ * sysdeps/x86_64/rtld-strcmp.S: New file.
+ * sysdeps/x86_64/rtld-strlen.S: New file.
+ * sysdeps/x86_64/multiarch/rtld-rawmemchr.c: New file.
+ * sysdeps/x86_64/multiarch/rtld-strlen.S: New file.
+
+2009-07-26 H.J. Lu <hongjiu.lu@xxxxxxxxx>
+
+ * sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Remove
+ strncmp-c.
+ * sysdeps/x86_64/multiarch/strcmp.S (aftertail): Removed.
+ (exit): Likewise.
+ (Byte1): Likewise.
+ (Byte2): Likewise.
+ (Byte3): Likewise.
+ (Byte4): Likewise.
+ (Byte5): Likewise.
+ (Byte6): Likewise.
+ (next_8_bytes): Likewise.
+ (Byte0): Remove commented out codes.
+ (unaligned_table): Align jump table at 8 bytes.
+ Add _sse4_2 to all labels. Always include "../strcmp.S".
+ * sysdeps/x86_64/multiarch/strncmp-c.c: Removed.
+ * sysdeps/x86_64/strcmp.S: Add SSE2 support.
+ * sysdeps/x86_64/strncmp.S: New file.
+
+2009-07-26 Ulrich Drepper <drepper@xxxxxxxxxx>
+
+ [BZ #10422]
+ * sysdeps/unix/sysv/linux/eventfd.c: Add compatibility for old
+ kernels, dropped when eventfd2 support was added.
+ * sysdeps/unix/sysv/linux/signalfd.c: Add compatibility for old
+ kernels, dropped when signalfd4 support was added.
+ * sysdeps/unix/sysv/linux/kernel-features.h: More CLOEXEC syscalls
+ added, name them.
+
+ [BZ #10452]
+ * resolv/res_send.c (send_dg): Pass full SERVFAIL, NOTIMP, REFUSED
+ replies up.
+
+ * elf/elf.h: Define NT_GNU_GOLD_VERSION.
+
2009-07-25 Ulrich Drepper <drepper@xxxxxxxxxx>
* sysdeps/x86_64/multiarch/strcmp.S: Some more optimizations for
Modified: fsf/trunk/libc/elf/elf.h
==============================================================================
--- fsf/trunk/libc/elf/elf.h (original)
+++ fsf/trunk/libc/elf/elf.h Mon Jul 27 00:07:22 2009
@@ -1053,6 +1053,9 @@
/* Build ID bits as generated by ld --build-id.
The descriptor consists of any nonzero number of bytes. */
#define NT_GNU_BUILD_ID 3
+
+/* Version note generated by GNU gold containing a version string. */
+#define NT_GNU_GOLD_VERSION 4
/* Move records. */
Modified: fsf/trunk/libc/nptl/ChangeLog
==============================================================================
--- fsf/trunk/libc/nptl/ChangeLog (original)
+++ fsf/trunk/libc/nptl/ChangeLog Mon Jul 27 00:07:22 2009
@@ -1,3 +1,10 @@
+2009-07-26 Ulrich Drepper <drepper@xxxxxxxxxx>
+
+ [BZ #10418]
+	* pthread_mutex_lock.c (pthread_mutex_lock): Use _rel instead of
+	_acq variants of cmpxchg.
+ _acq variants of cmpxchg.
+ * pthread_mutex_timedlock.c (pthread_mutex_timedlock): Likewise.
+
2009-07-23 Ulrich Drepper <drepper@xxxxxxxxxx>
* sysdeps/x86_64/configure.in: New file.
Modified: fsf/trunk/libc/nptl/pthread_mutex_lock.c
==============================================================================
--- fsf/trunk/libc/nptl/pthread_mutex_lock.c (original)
+++ fsf/trunk/libc/nptl/pthread_mutex_lock.c Mon Jul 27 00:07:22 2009
@@ -1,4 +1,4 @@
-/* Copyright (C) 2002-2007, 2008 Free Software Foundation, Inc.
+/* Copyright (C) 2002-2007, 2008, 2009 Free Software Foundation, Inc.
This file is part of the GNU C Library.
Contributed by Ulrich Drepper <drepper@xxxxxxxxxx>, 2002.
@@ -160,7 +160,7 @@
#endif
newval
- = atomic_compare_and_exchange_val_acq (&mutex->__data.__lock,
+ = atomic_compare_and_exchange_val_rel (&mutex->__data.__lock,
newval, oldval);
if (newval != oldval)
@@ -285,7 +285,7 @@
#ifdef NO_INCR
newval |= FUTEX_WAITERS;
#endif
- oldval = atomic_compare_and_exchange_val_acq (&mutex->__data.__lock,
+ oldval = atomic_compare_and_exchange_val_rel (&mutex->__data.__lock,
newval, 0);
if (oldval != 0)
@@ -420,7 +420,7 @@
oldprio = ceiling;
oldval
- = atomic_compare_and_exchange_val_acq (&mutex->__data.__lock,
+ = atomic_compare_and_exchange_val_rel (&mutex->__data.__lock,
#ifdef NO_INCR
ceilval | 2,
#else
@@ -434,7 +434,7 @@
do
{
oldval
- = atomic_compare_and_exchange_val_acq (&mutex->__data.__lock,
+ = atomic_compare_and_exchange_val_rel (&mutex->__data.__lock,
ceilval | 2,
ceilval | 1);
@@ -445,7 +445,7 @@
lll_futex_wait (&mutex->__data.__lock, ceilval | 2,
PTHREAD_MUTEX_PSHARED (mutex));
}
- while (atomic_compare_and_exchange_val_acq (&mutex->__data.__lock,
+ while (atomic_compare_and_exchange_val_rel (&mutex->__data.__lock,
ceilval | 2, ceilval)
!= ceilval);
}
Modified: fsf/trunk/libc/nptl/pthread_mutex_timedlock.c
==============================================================================
--- fsf/trunk/libc/nptl/pthread_mutex_timedlock.c (original)
+++ fsf/trunk/libc/nptl/pthread_mutex_timedlock.c Mon Jul 27 00:07:22 2009
@@ -1,4 +1,4 @@
-/* Copyright (C) 2002-2007, 2008 Free Software Foundation, Inc.
+/* Copyright (C) 2002-2007, 2008, 2009 Free Software Foundation, Inc.
This file is part of the GNU C Library.
Contributed by Ulrich Drepper <drepper@xxxxxxxxxx>, 2002.
@@ -126,7 +126,7 @@
int newval = id | (oldval & FUTEX_WAITERS);
newval
- = atomic_compare_and_exchange_val_acq (&mutex->__data.__lock,
+ = atomic_compare_and_exchange_val_rel (&mutex->__data.__lock,
newval, oldval);
if (newval != oldval)
{
@@ -246,7 +246,7 @@
}
}
- oldval = atomic_compare_and_exchange_val_acq (&mutex->__data.__lock,
+ oldval = atomic_compare_and_exchange_val_rel (&mutex->__data.__lock,
id, 0);
if (oldval != 0)
@@ -404,7 +404,7 @@
oldprio = ceiling;
oldval
- = atomic_compare_and_exchange_val_acq (&mutex->__data.__lock,
+ = atomic_compare_and_exchange_val_rel (&mutex->__data.__lock,
ceilval | 1, ceilval);
if (oldval == ceilval)
@@ -413,7 +413,7 @@
do
{
oldval
- = atomic_compare_and_exchange_val_acq (&mutex->__data.__lock,
+ = atomic_compare_and_exchange_val_rel (&mutex->__data.__lock,
ceilval | 2,
ceilval | 1);
@@ -456,7 +456,7 @@
PTHREAD_MUTEX_PSHARED (mutex));
}
}
- while (atomic_compare_and_exchange_val_acq (&mutex->__data.__lock,
+ while (atomic_compare_and_exchange_val_rel (&mutex->__data.__lock,
ceilval | 2, ceilval)
!= ceilval);
}
Modified: fsf/trunk/libc/resolv/res_send.c
==============================================================================
--- fsf/trunk/libc/resolv/res_send.c (original)
+++ fsf/trunk/libc/resolv/res_send.c Mon Jul 27 00:07:22 2009
@@ -1278,14 +1278,10 @@
? *thisanssiz : *thisresplen);
if (recvresp1 || (buf2 != NULL && recvresp2))
- {
- *resplen2 = 1;
- return resplen;
- }
+ return resplen;
if (buf2 != NULL)
{
/* We are waiting for a possible second reply. */
- resplen = 1;
if (hp->id == anhp->id)
recvresp1 = 1;
else
Modified: fsf/trunk/libc/sysdeps/unix/sysv/linux/eventfd.c
==============================================================================
--- fsf/trunk/libc/sysdeps/unix/sysv/linux/eventfd.c (original)
+++ fsf/trunk/libc/sysdeps/unix/sysv/linux/eventfd.c Mon Jul 27 00:07:22 2009
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007, 2008 Free Software Foundation, Inc.
+/* Copyright (C) 2007, 2008, 2009 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -19,14 +19,21 @@
#include <errno.h>
#include <sys/eventfd.h>
#include <sysdep.h>
+#include <kernel-features.h>
int
eventfd (int count, int flags)
{
#ifdef __NR_eventfd2
- return INLINE_SYSCALL (eventfd2, 2, count, flags);
-#else
+ int res = INLINE_SYSCALL (eventfd2, 2, count, flags);
+# ifndef __ASSUME_EVENTFD2
+ if (res != -1 || errno != ENOSYS)
+# endif
+ return res;
+#endif
+
+#ifndef __ASSUME_EVENTFD2
/* The old system call has no flag parameter which is bad. So we have
to wait until we have to support to pass additional values to the
kernel (sys_indirect) before implementing setting flags like
@@ -43,5 +50,7 @@
__set_errno (ENOSYS);
return -1;
# endif
+#elif !defined __NR_eventfd2
+# error "__ASSUME_EVENTFD2 defined but not __NR_eventfd2"
#endif
}
Modified: fsf/trunk/libc/sysdeps/unix/sysv/linux/kernel-features.h
==============================================================================
--- fsf/trunk/libc/sysdeps/unix/sysv/linux/kernel-features.h (original)
+++ fsf/trunk/libc/sysdeps/unix/sysv/linux/kernel-features.h Mon Jul 27 00:07:22 2009
@@ -516,6 +516,8 @@
# define __ASSUME_SOCK_CLOEXEC 1
# define __ASSUME_IN_NONBLOCK 1
# define __ASSUME_PIPE2 1
+# define __ASSUME_EVENTFD2 1
+# define __ASSUME_SIGNALFD4 1
#endif
/* Support for the accept4 syscall was added in 2.6.28. */
Modified: fsf/trunk/libc/sysdeps/unix/sysv/linux/signalfd.c
==============================================================================
--- fsf/trunk/libc/sysdeps/unix/sysv/linux/signalfd.c (original)
+++ fsf/trunk/libc/sysdeps/unix/sysv/linux/signalfd.c Mon Jul 27 00:07:22 2009
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007, 2008 Free Software Foundation, Inc.
+/* Copyright (C) 2007, 2008, 2009 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -20,14 +20,21 @@
#include <signal.h>
#include <sys/signalfd.h>
#include <sysdep.h>
+#include <kernel-features.h>
int
signalfd (int fd, const sigset_t *mask, int flags)
{
#ifdef __NR_signalfd4
- return INLINE_SYSCALL (signalfd4, 4, fd, mask, _NSIG / 8, flags);
-#else
+ int res = INLINE_SYSCALL (signalfd4, 4, fd, mask, _NSIG / 8, flags);
+# ifndef __ASSUME_SIGNALFD4
+ if (res != -1 || errno != ENOSYS)
+# endif
+ return res;
+#endif
+
+#ifndef __ASSUME_SIGNALFD4
/* The old system call has no flag parameter which is bad. So we have
to wait until we have to support to pass additional values to the
kernel (sys_indirect) before implementing setting flags like
@@ -44,5 +51,7 @@
__set_errno (ENOSYS);
return -1;
# endif
+#elif !defined __NR_signalfd4
+# error "__ASSUME_SIGNALFD4 defined but not __NR_signalfd4"
#endif
}
Modified: fsf/trunk/libc/sysdeps/x86_64/Makefile
==============================================================================
--- fsf/trunk/libc/sysdeps/x86_64/Makefile (original)
+++ fsf/trunk/libc/sysdeps/x86_64/Makefile Mon Jul 27 00:07:22 2009
@@ -19,6 +19,10 @@
sysdep-dl-routines += tlsdesc dl-tlsdesc
sysdep_routines += tlsdesc dl-tlsdesc
sysdep-rtld-routines += tlsdesc dl-tlsdesc
+
+tests: $(objpfx)tst-xmmymm.out
+$(objpfx)tst-xmmymm.out: ../sysdeps/x86_64/tst-xmmymm.sh $(objpfx)ld.so
+ $(SHELL) -e $< $(objpfx) > $@
endif
ifeq ($(subdir),csu)
Modified: fsf/trunk/libc/sysdeps/x86_64/multiarch/Makefile
==============================================================================
--- fsf/trunk/libc/sysdeps/x86_64/multiarch/Makefile (original)
+++ fsf/trunk/libc/sysdeps/x86_64/multiarch/Makefile Mon Jul 27 00:07:22 2009
@@ -4,7 +4,7 @@
endif
ifeq ($(subdir),string)
-sysdep_routines += stpncpy-c strncpy-c strncmp-c
+sysdep_routines += stpncpy-c strncpy-c
ifeq (yes,$(config-cflags-sse4))
sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c
CFLAGS-strcspn-c.c += -msse4
Added: fsf/trunk/libc/sysdeps/x86_64/multiarch/rtld-rawmemchr.c
==============================================================================
--- fsf/trunk/libc/sysdeps/x86_64/multiarch/rtld-rawmemchr.c (added)
+++ fsf/trunk/libc/sysdeps/x86_64/multiarch/rtld-rawmemchr.c Mon Jul 27 00:07:22 2009
@@ -1,0 +1,1 @@
+#include "../rtld-rawmemchr.c"
Added: fsf/trunk/libc/sysdeps/x86_64/multiarch/rtld-strlen.S
==============================================================================
--- fsf/trunk/libc/sysdeps/x86_64/multiarch/rtld-strlen.S (added)
+++ fsf/trunk/libc/sysdeps/x86_64/multiarch/rtld-strlen.S Mon Jul 27 00:07:22 2009
@@ -1,0 +1,1 @@
+#include "../rtld-strlen.S"
Modified: fsf/trunk/libc/sysdeps/x86_64/multiarch/strcmp.S
==============================================================================
--- fsf/trunk/libc/sysdeps/x86_64/multiarch/strcmp.S (original)
+++ fsf/trunk/libc/sysdeps/x86_64/multiarch/strcmp.S Mon Jul 27 00:07:22 2009
@@ -28,9 +28,9 @@
/* calculate left number to compare */ \
lea -16(%rcx, %r11), %r9; \
cmp %r9, %r11; \
- jb LABEL(strcmp_exitz); \
+ jb LABEL(strcmp_exitz_sse4_2); \
test %r9, %r9; \
- je LABEL(strcmp_exitz); \
+ je LABEL(strcmp_exitz_sse4_2); \
mov %r9, %r11
#define STRCMP_SSE42 __strncmp_sse42
@@ -106,9 +106,9 @@
*/
#ifdef USE_AS_STRNCMP
test %rdx, %rdx
- je LABEL(strcmp_exitz)
+ je LABEL(strcmp_exitz_sse4_2)
cmp $1, %rdx
- je LABEL(Byte0)
+ je LABEL(Byte0_sse4_2)
mov %rdx, %r11
#endif
mov %esi, %ecx
@@ -117,9 +117,9 @@
and $0x3f, %rcx /* rsi alignment in cache line */
and $0x3f, %rax /* rdi alignment in cache line */
cmp $0x30, %ecx
- ja LABEL(crosscache) /* rsi: 16-byte load will cross cache line */
+ ja LABEL(crosscache_sse4_2)/* rsi: 16-byte load will cross cache line */
cmp $0x30, %eax
- ja LABEL(crosscache) /* rdi: 16-byte load will cross cache line */
+ ja LABEL(crosscache_sse4_2)/* rdi: 16-byte load will cross cache line */
movdqu (%rdi), %xmm1
movdqu (%rsi), %xmm2
pxor %xmm0, %xmm0 /* clear %xmm0 for null char checks */
@@ -128,10 +128,10 @@
psubb %xmm0, %xmm1 /* packed sub of comparison results*/
pmovmskb %xmm1, %edx
sub $0xffff, %edx /* if first 16 bytes are same, edx == 0xffff */
- jnz LABEL(less16bytes) /* If not, find different value or null char */
-#ifdef USE_AS_STRNCMP
- sub $16, %r11
- jbe LABEL(strcmp_exitz) /* finish comparision */
+ jnz LABEL(less16bytes_sse4_2)/* If not, find different value or null char */
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz_sse4_2)/* finish comparision */
#endif
add $16, %rsi /* prepare to search next 16 bytes */
add $16, %rdi /* prepare to search next 16 bytes */
@@ -142,7 +142,7 @@
* below to use.
*/
.p2align 4
-LABEL(crosscache):
+LABEL(crosscache_sse4_2):
and $0xfffffffffffffff0, %rsi /* force %rsi is 16 byte aligned */
and $0xfffffffffffffff0, %rdi /* force %rdi is 16 byte aligned */
mov $0xffff, %edx /* for equivalent offset */
@@ -150,15 +150,15 @@
and $0xf, %ecx /* offset of rsi */
and $0xf, %eax /* offset of rdi */
cmp %eax, %ecx
- je LABEL(ashr_0) /* rsi and rdi relative offset same */
- ja LABEL(bigger)
+ je LABEL(ashr_0_sse4_2) /* rsi and rdi relative offset same */
+ ja LABEL(bigger_sse4_2)
mov %edx, %r8d /* r8d is offset flag for exit tail */
xchg %ecx, %eax
xchg %rsi, %rdi
-LABEL(bigger):
+LABEL(bigger_sse4_2):
lea 15(%rax), %r9
sub %rcx, %r9
- lea LABEL(unaligned_table)(%rip), %r10
+ lea LABEL(unaligned_table_sse4_2)(%rip), %r10
movslq (%r10, %r9,4), %r9
lea (%r10, %r9), %r10
jmp *%r10 /* jump to corresponding case */
@@ -169,7 +169,7 @@
* n(0~15) n(0~15) 15(15+ n-n) ashr_0
*/
.p2align 4
-LABEL(ashr_0):
+LABEL(ashr_0_sse4_2):
movdqa (%rsi), %xmm1
pxor %xmm0, %xmm0 /* clear %xmm0 for null char check */
@@ -184,7 +184,7 @@
* edx must be the same with r9d if in left byte (16-rcx) is equal to
* the start from (16-rax) and no null char was seen.
*/
- jne LABEL(less32bytes) /* mismatch or null char */
+ jne LABEL(less32bytes_sse4_2) /* mismatch or null char */
UPDATE_STRNCMP_COUNTER
mov $16, %rcx
mov $16, %r9
@@ -203,7 +203,7 @@
jbe LABEL(ashr_0_use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
sub $16, %r11
- jbe LABEL(strcmp_exitz)
+ jbe LABEL(strcmp_exitz_sse4_2)
#endif
movdqa (%rdi,%rdx), %xmm0
@@ -212,17 +212,17 @@
jbe LABEL(ashr_0_use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
sub $16, %r11
- jbe LABEL(strcmp_exitz)
+ jbe LABEL(strcmp_exitz_sse4_2)
#endif
jmp LABEL(ashr_0_use_sse4_2)
.p2align 4
LABEL(ashr_0_use_sse4_2_exit):
- jnc LABEL(strcmp_exitz)
+ jnc LABEL(strcmp_exitz_sse4_2)
#ifdef USE_AS_STRNCMP
sub %rcx, %r11
- jbe LABEL(strcmp_exitz)
+ jbe LABEL(strcmp_exitz_sse4_2)
#endif
lea -16(%rdx, %rcx), %rcx
movzbl (%rdi, %rcx), %eax
@@ -239,7 +239,7 @@
* n(15) n -15 0(15 +(n-15) - n) ashr_1
*/
.p2align 4
-LABEL(ashr_1):
+LABEL(ashr_1_sse4_2):
pxor %xmm0, %xmm0
movdqa (%rdi), %xmm2
movdqa (%rsi), %xmm1
@@ -251,7 +251,7 @@
shr %cl, %edx /* adjust 0xffff for offset */
shr %cl, %r9d /* adjust for 16-byte offset */
sub %r9d, %edx
- jnz LABEL(less32bytes) /* mismatch or null char seen */
+ jnz LABEL(less32bytes_sse4_2)/* mismatch or null char seen */
movdqa (%rdi), %xmm3
UPDATE_STRNCMP_COUNTER
@@ -279,7 +279,7 @@
jbe LABEL(use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
sub $16, %r11
- jbe LABEL(strcmp_exitz)
+ jbe LABEL(strcmp_exitz_sse4_2)
#endif
add $16, %rdx
@@ -292,7 +292,7 @@
jbe LABEL(use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
sub $16, %r11
- jbe LABEL(strcmp_exitz)
+ jbe LABEL(strcmp_exitz_sse4_2)
#endif
add $16, %rdx
jmp LABEL(loop_ashr_1_use_sse4_2)
@@ -318,7 +318,7 @@
* n(14~15) n -14 1(15 +(n-14) - n) ashr_2
*/
.p2align 4
-LABEL(ashr_2):
+LABEL(ashr_2_sse4_2):
pxor %xmm0, %xmm0
movdqa (%rdi), %xmm2
movdqa (%rsi), %xmm1
@@ -330,7 +330,7 @@
shr %cl, %edx
shr %cl, %r9d
sub %r9d, %edx
- jnz LABEL(less32bytes)
+ jnz LABEL(less32bytes_sse4_2)
movdqa (%rdi), %xmm3
UPDATE_STRNCMP_COUNTER
@@ -358,7 +358,7 @@
jbe LABEL(use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
sub $16, %r11
- jbe LABEL(strcmp_exitz)
+ jbe LABEL(strcmp_exitz_sse4_2)
#endif
add $16, %rdx
@@ -371,7 +371,7 @@
jbe LABEL(use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
sub $16, %r11
- jbe LABEL(strcmp_exitz)
+ jbe LABEL(strcmp_exitz_sse4_2)
#endif
add $16, %rdx
jmp LABEL(loop_ashr_2_use_sse4_2)
@@ -397,7 +397,7 @@
* n(13~15) n -13 2(15 +(n-13) - n) ashr_3
*/
.p2align 4
-LABEL(ashr_3):
+LABEL(ashr_3_sse4_2):
pxor %xmm0, %xmm0
movdqa (%rdi), %xmm2
movdqa (%rsi), %xmm1
@@ -409,7 +409,7 @@
shr %cl, %edx
shr %cl, %r9d
sub %r9d, %edx
- jnz LABEL(less32bytes)
+ jnz LABEL(less32bytes_sse4_2)
movdqa (%rdi), %xmm3
UPDATE_STRNCMP_COUNTER
@@ -437,7 +437,7 @@
jbe LABEL(use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
sub $16, %r11
- jbe LABEL(strcmp_exitz)
+ jbe LABEL(strcmp_exitz_sse4_2)
#endif
add $16, %rdx
@@ -450,7 +450,7 @@
jbe LABEL(use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
sub $16, %r11
- jbe LABEL(strcmp_exitz)
+ jbe LABEL(strcmp_exitz_sse4_2)
#endif
add $16, %rdx
jmp LABEL(loop_ashr_3_use_sse4_2)
@@ -476,7 +476,7 @@
* n(12~15) n -12 3(15 +(n-12) - n) ashr_4
*/
.p2align 4
-LABEL(ashr_4):
+LABEL(ashr_4_sse4_2):
pxor %xmm0, %xmm0
movdqa (%rdi), %xmm2
movdqa (%rsi), %xmm1
@@ -488,7 +488,7 @@
shr %cl, %edx
shr %cl, %r9d
sub %r9d, %edx
- jnz LABEL(less32bytes)
+ jnz LABEL(less32bytes_sse4_2)
movdqa (%rdi), %xmm3
UPDATE_STRNCMP_COUNTER
@@ -517,7 +517,7 @@
jbe LABEL(use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
sub $16, %r11
- jbe LABEL(strcmp_exitz)
+ jbe LABEL(strcmp_exitz_sse4_2)
#endif
add $16, %rdx
@@ -530,7 +530,7 @@
jbe LABEL(use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
sub $16, %r11
- jbe LABEL(strcmp_exitz)
+ jbe LABEL(strcmp_exitz_sse4_2)
#endif
add $16, %rdx
jmp LABEL(loop_ashr_4_use_sse4_2)
@@ -556,7 +556,7 @@
* n(11~15) n - 11 4(15 +(n-11) - n) ashr_5
*/
.p2align 4
-LABEL(ashr_5):
+LABEL(ashr_5_sse4_2):
pxor %xmm0, %xmm0
movdqa (%rdi), %xmm2
movdqa (%rsi), %xmm1
@@ -568,7 +568,7 @@
shr %cl, %edx
shr %cl, %r9d
sub %r9d, %edx
- jnz LABEL(less32bytes)
+ jnz LABEL(less32bytes_sse4_2)
movdqa (%rdi), %xmm3
UPDATE_STRNCMP_COUNTER
@@ -597,7 +597,7 @@
jbe LABEL(use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
sub $16, %r11
- jbe LABEL(strcmp_exitz)
+ jbe LABEL(strcmp_exitz_sse4_2)
#endif
add $16, %rdx
@@ -611,7 +611,7 @@
jbe LABEL(use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
sub $16, %r11
- jbe LABEL(strcmp_exitz)
+ jbe LABEL(strcmp_exitz_sse4_2)
#endif
add $16, %rdx
jmp LABEL(loop_ashr_5_use_sse4_2)
@@ -637,7 +637,7 @@
* n(10~15) n - 10 5(15 +(n-10) - n) ashr_6
*/
.p2align 4
-LABEL(ashr_6):
+LABEL(ashr_6_sse4_2):
pxor %xmm0, %xmm0
movdqa (%rdi), %xmm2
movdqa (%rsi), %xmm1
@@ -649,7 +649,7 @@
shr %cl, %edx
shr %cl, %r9d
sub %r9d, %edx
- jnz LABEL(less32bytes)
+ jnz LABEL(less32bytes_sse4_2)
movdqa (%rdi), %xmm3
UPDATE_STRNCMP_COUNTER
@@ -678,7 +678,7 @@
jbe LABEL(use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
sub $16, %r11
- jbe LABEL(strcmp_exitz)
+ jbe LABEL(strcmp_exitz_sse4_2)
#endif
add $16, %rdx
@@ -691,7 +691,7 @@
jbe LABEL(use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
sub $16, %r11
- jbe LABEL(strcmp_exitz)
+ jbe LABEL(strcmp_exitz_sse4_2)
#endif
add $16, %rdx
jmp LABEL(loop_ashr_6_use_sse4_2)
@@ -717,7 +717,7 @@
* n(9~15) n - 9 6(15 +(n - 9) - n) ashr_7
*/
.p2align 4
-LABEL(ashr_7):
+LABEL(ashr_7_sse4_2):
pxor %xmm0, %xmm0
movdqa (%rdi), %xmm2
movdqa (%rsi), %xmm1
@@ -729,7 +729,7 @@
shr %cl, %edx
shr %cl, %r9d
sub %r9d, %edx
- jnz LABEL(less32bytes)
+ jnz LABEL(less32bytes_sse4_2)
movdqa (%rdi), %xmm3
UPDATE_STRNCMP_COUNTER
@@ -758,7 +758,7 @@
jbe LABEL(use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
sub $16, %r11
- jbe LABEL(strcmp_exitz)
+ jbe LABEL(strcmp_exitz_sse4_2)
#endif
add $16, %rdx
@@ -771,7 +771,7 @@
jbe LABEL(use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
sub $16, %r11
- jbe LABEL(strcmp_exitz)
+ jbe LABEL(strcmp_exitz_sse4_2)
#endif
add $16, %rdx
jmp LABEL(loop_ashr_7_use_sse4_2)
@@ -797,7 +797,7 @@
* n(8~15) n - 8 7(15 +(n - 8) - n) ashr_8
*/
.p2align 4
-LABEL(ashr_8):
+LABEL(ashr_8_sse4_2):
pxor %xmm0, %xmm0
movdqa (%rdi), %xmm2
movdqa (%rsi), %xmm1
@@ -809,7 +809,7 @@
shr %cl, %edx
shr %cl, %r9d
sub %r9d, %edx
- jnz LABEL(less32bytes)
+ jnz LABEL(less32bytes_sse4_2)
movdqa (%rdi), %xmm3
UPDATE_STRNCMP_COUNTER
@@ -838,7 +838,7 @@
jbe LABEL(use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
sub $16, %r11
- jbe LABEL(strcmp_exitz)
+ jbe LABEL(strcmp_exitz_sse4_2)
#endif
add $16, %rdx
@@ -851,7 +851,7 @@
jbe LABEL(use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
sub $16, %r11
- jbe LABEL(strcmp_exitz)
+ jbe LABEL(strcmp_exitz_sse4_2)
#endif
add $16, %rdx
jmp LABEL(loop_ashr_8_use_sse4_2)
@@ -877,7 +877,7 @@
* n(7~15) n - 7 8(15 +(n - 7) - n) ashr_9
*/
.p2align 4
-LABEL(ashr_9):
+LABEL(ashr_9_sse4_2):
pxor %xmm0, %xmm0
movdqa (%rdi), %xmm2
movdqa (%rsi), %xmm1
@@ -889,7 +889,7 @@
shr %cl, %edx
shr %cl, %r9d
sub %r9d, %edx
- jnz LABEL(less32bytes)
+ jnz LABEL(less32bytes_sse4_2)
movdqa (%rdi), %xmm3
UPDATE_STRNCMP_COUNTER
@@ -919,7 +919,7 @@
jbe LABEL(use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
sub $16, %r11
- jbe LABEL(strcmp_exitz)
+ jbe LABEL(strcmp_exitz_sse4_2)
#endif
add $16, %rdx
@@ -932,7 +932,7 @@
jbe LABEL(use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
sub $16, %r11
- jbe LABEL(strcmp_exitz)
+ jbe LABEL(strcmp_exitz_sse4_2)
#endif
add $16, %rdx
jmp LABEL(loop_ashr_9_use_sse4_2)
@@ -958,7 +958,7 @@
* n(6~15) n - 6 9(15 +(n - 6) - n) ashr_10
*/
.p2align 4
-LABEL(ashr_10):
+LABEL(ashr_10_sse4_2):
pxor %xmm0, %xmm0
movdqa (%rdi), %xmm2
movdqa (%rsi), %xmm1
@@ -970,7 +970,7 @@
shr %cl, %edx
shr %cl, %r9d
sub %r9d, %edx
- jnz LABEL(less32bytes)
+ jnz LABEL(less32bytes_sse4_2)
movdqa (%rdi), %xmm3
UPDATE_STRNCMP_COUNTER
@@ -999,7 +999,7 @@
jbe LABEL(use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
sub $16, %r11
- jbe LABEL(strcmp_exitz)
+ jbe LABEL(strcmp_exitz_sse4_2)
#endif
add $16, %rdx
@@ -1012,7 +1012,7 @@
jbe LABEL(use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
sub $16, %r11
- jbe LABEL(strcmp_exitz)
+ jbe LABEL(strcmp_exitz_sse4_2)
#endif
add $16, %rdx
jmp LABEL(loop_ashr_10_use_sse4_2)
@@ -1038,7 +1038,7 @@
* n(5~15) n - 5 10(15 +(n - 5) - n) ashr_11
*/
.p2align 4
-LABEL(ashr_11):
+LABEL(ashr_11_sse4_2):
pxor %xmm0, %xmm0
movdqa (%rdi), %xmm2
movdqa (%rsi), %xmm1
@@ -1050,7 +1050,7 @@
shr %cl, %edx
shr %cl, %r9d
sub %r9d, %edx
- jnz LABEL(less32bytes)
+ jnz LABEL(less32bytes_sse4_2)
movdqa (%rdi), %xmm3
UPDATE_STRNCMP_COUNTER
@@ -1079,7 +1079,7 @@
jbe LABEL(use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
sub $16, %r11
- jbe LABEL(strcmp_exitz)
+ jbe LABEL(strcmp_exitz_sse4_2)
#endif
add $16, %rdx
@@ -1092,7 +1092,7 @@
jbe LABEL(use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
sub $16, %r11
- jbe LABEL(strcmp_exitz)
+ jbe LABEL(strcmp_exitz_sse4_2)
#endif
add $16, %rdx
jmp LABEL(loop_ashr_11_use_sse4_2)
@@ -1118,7 +1118,7 @@
* n(4~15) n - 4 11(15 +(n - 4) - n) ashr_12
*/
.p2align 4
-LABEL(ashr_12):
+LABEL(ashr_12_sse4_2):
pxor %xmm0, %xmm0
movdqa (%rdi), %xmm2
movdqa (%rsi), %xmm1
@@ -1130,7 +1130,7 @@
shr %cl, %edx
shr %cl, %r9d
sub %r9d, %edx
- jnz LABEL(less32bytes)
+ jnz LABEL(less32bytes_sse4_2)
movdqa (%rdi), %xmm3
UPDATE_STRNCMP_COUNTER
@@ -1159,7 +1159,7 @@
jbe LABEL(use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
sub $16, %r11
- jbe LABEL(strcmp_exitz)
+ jbe LABEL(strcmp_exitz_sse4_2)
#endif
add $16, %rdx
@@ -1172,7 +1172,7 @@
jbe LABEL(use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
sub $16, %r11
- jbe LABEL(strcmp_exitz)
+ jbe LABEL(strcmp_exitz_sse4_2)
#endif
add $16, %rdx
jmp LABEL(loop_ashr_12_use_sse4_2)
@@ -1198,7 +1198,7 @@
* n(3~15) n - 3 12(15 +(n - 3) - n) ashr_13
*/
.p2align 4
-LABEL(ashr_13):
+LABEL(ashr_13_sse4_2):
pxor %xmm0, %xmm0
movdqa (%rdi), %xmm2
movdqa (%rsi), %xmm1
@@ -1210,7 +1210,7 @@
shr %cl, %edx
shr %cl, %r9d
sub %r9d, %edx
- jnz LABEL(less32bytes)
+ jnz LABEL(less32bytes_sse4_2)
movdqa (%rdi), %xmm3
UPDATE_STRNCMP_COUNTER
@@ -1240,7 +1240,7 @@
jbe LABEL(use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
sub $16, %r11
- jbe LABEL(strcmp_exitz)
+ jbe LABEL(strcmp_exitz_sse4_2)
#endif
add $16, %rdx
@@ -1253,7 +1253,7 @@
jbe LABEL(use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
sub $16, %r11
- jbe LABEL(strcmp_exitz)
+ jbe LABEL(strcmp_exitz_sse4_2)
#endif
add $16, %rdx
jmp LABEL(loop_ashr_13_use_sse4_2)
@@ -1279,7 +1279,7 @@
* n(2~15) n - 2 13(15 +(n - 2) - n) ashr_14
*/
.p2align 4
-LABEL(ashr_14):
+LABEL(ashr_14_sse4_2):
pxor %xmm0, %xmm0
movdqa (%rdi), %xmm2
movdqa (%rsi), %xmm1
@@ -1291,7 +1291,7 @@
shr %cl, %edx
shr %cl, %r9d
sub %r9d, %edx
- jnz LABEL(less32bytes)
+ jnz LABEL(less32bytes_sse4_2)
movdqa (%rdi), %xmm3
UPDATE_STRNCMP_COUNTER
@@ -1321,7 +1321,7 @@
jbe LABEL(use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
sub $16, %r11
- jbe LABEL(strcmp_exitz)
+ jbe LABEL(strcmp_exitz_sse4_2)
#endif
add $16, %rdx
@@ -1334,7 +1334,7 @@
jbe LABEL(use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
sub $16, %r11
- jbe LABEL(strcmp_exitz)
+ jbe LABEL(strcmp_exitz_sse4_2)
#endif
add $16, %rdx
jmp LABEL(loop_ashr_14_use_sse4_2)
@@ -1360,7 +1360,7 @@
* n(1~15) n - 1 14(15 +(n - 1) - n) ashr_15
*/
.p2align 4
-LABEL(ashr_15):
+LABEL(ashr_15_sse4_2):
pxor %xmm0, %xmm0
movdqa (%rdi), %xmm2
movdqa (%rsi), %xmm1
@@ -1372,7 +1372,7 @@
shr %cl, %edx
shr %cl, %r9d
sub %r9d, %edx
- jnz LABEL(less32bytes)
+ jnz LABEL(less32bytes_sse4_2)
movdqa (%rdi), %xmm3
@@ -1404,7 +1404,7 @@
jbe LABEL(use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
sub $16, %r11
- jbe LABEL(strcmp_exitz)
+ jbe LABEL(strcmp_exitz_sse4_2)
#endif
add $16, %rdx
@@ -1417,7 +1417,7 @@
jbe LABEL(use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
sub $16, %r11
- jbe LABEL(strcmp_exitz)
+ jbe LABEL(strcmp_exitz_sse4_2)
#endif
add $16, %rdx
jmp LABEL(loop_ashr_15_use_sse4_2)
@@ -1439,56 +1439,37 @@
pcmpistri $0x1a,(%rsi,%rdx), %xmm0
.p2align 4
LABEL(use_sse4_2_exit):
- jnc LABEL(strcmp_exitz)
+ jnc LABEL(strcmp_exitz_sse4_2)
#ifdef USE_AS_STRNCMP
sub %rcx, %r11
- jbe LABEL(strcmp_exitz)
+ jbe LABEL(strcmp_exitz_sse4_2)
#endif
add %rcx, %rdx
lea -16(%rdi, %r9), %rdi
movzbl (%rdi, %rdx), %eax
movzbl (%rsi, %rdx), %edx
test %r8d, %r8d
- jz LABEL(use_sse4_2_ret)
+ jz LABEL(use_sse4_2_ret_sse4_2)
xchg %eax, %edx
-LABEL(use_sse4_2_ret):
+LABEL(use_sse4_2_ret_sse4_2):
sub %edx, %eax
ret
-#if 0
- /* This code was in the origial submission but isn't used.
- --drepper */
- .p2align 4
-LABEL(aftertail):
- pcmpeqb %xmm3, %xmm1
- psubb %xmm0, %xmm1
- pmovmskb %xmm1, %edx
- not %edx
-
- .p2align 4
-LABEL(exit):
- lea -16(%r9, %rcx), %rax /* locate the exact offset for rdi */
-#endif
-
-LABEL(less32bytes):
+LABEL(less32bytes_sse4_2):
lea (%rdi, %rax), %rdi /* locate the exact address for first operand(rdi) */
lea (%rsi, %rcx), %rsi /* locate the exact address for second operand(rsi) */
test %r8d, %r8d
- jz LABEL(ret)
+ jz LABEL(ret_sse4_2)
xchg %rsi, %rdi /* recover original order according to flag(%r8d) */
.p2align 4
-LABEL(ret):
-LABEL(less16bytes):
- /*
- * Check to see if BSF is fast on this processor. If not, use a different
- * exit tail.
- */
+LABEL(ret_sse4_2):
+LABEL(less16bytes_sse4_2):
bsf %rdx, %rdx /* find and store bit index in %rdx */
#ifdef USE_AS_STRNCMP
sub %rdx, %r11
- jbe LABEL(strcmp_exitz)
+ jbe LABEL(strcmp_exitz_sse4_2)
#endif
movzbl (%rsi, %rdx), %ecx
movzbl (%rdi, %rdx), %eax
@@ -1496,138 +1477,14 @@
sub %ecx, %eax
ret
-LABEL(strcmp_exitz):
+LABEL(strcmp_exitz_sse4_2):
xor %eax, %eax
ret
.p2align 4
-LABEL(Byte0):
- /*
- * never need to handle byte 0 for strncmpy
-#ifdef USE_AS_STRNCMP
- sub $0, %r11
- jbe LABEL(strcmp_exitz)
-#endif
- */
+LABEL(Byte0_sse4_2):
movzx (%rsi), %ecx
movzx (%rdi), %eax
-
- sub %ecx, %eax
- ret
-
- .p2align 4
-LABEL(Byte1):
-
-#ifdef USE_AS_STRNCMP
- sub $1, %r11
- jbe LABEL(strcmp_exitz)
-#endif
- movzx 1(%rsi), %ecx
- movzx 1(%rdi), %eax
-
- sub %ecx, %eax
- ret
-
- .p2align 4
-LABEL(Byte2):
-
-#ifdef USE_AS_STRNCMP
- sub $2, %r11
- jbe LABEL(strcmp_exitz)
-#endif
- movzx 2(%rsi), %ecx
- movzx 2(%rdi), %eax
-
- sub %ecx, %eax
- ret
-
- .p2align 4
-LABEL(Byte3):
-
-#ifdef USE_AS_STRNCMP
- sub $3, %r11
- jbe LABEL(strcmp_exitz)
-#endif
- movzx 3(%rsi), %ecx
- movzx 3(%rdi), %eax
-
- sub %ecx, %eax
- ret
-
- .p2align 4
-LABEL(Byte4):
-
-#ifdef USE_AS_STRNCMP
- sub $4, %r11
- jbe LABEL(strcmp_exitz)
-#endif
- movzx 4(%rsi), %ecx
- movzx 4(%rdi), %eax
-
- sub %ecx, %eax
- ret
-
- .p2align 4
-LABEL(Byte5):
-
-#ifdef USE_AS_STRNCMP
- sub $5, %r11
- jbe LABEL(strcmp_exitz)
-#endif
- movzx 5(%rsi), %ecx
- movzx 5(%rdi), %eax
-
- sub %ecx, %eax
- ret
-
- .p2align 4
-LABEL(Byte6):
-
-#ifdef USE_AS_STRNCMP
- sub $6, %r11
- jbe LABEL(strcmp_exitz)
-#endif
- movzx 6(%rsi), %ecx
- movzx 6(%rdi), %eax
-
- sub %ecx, %eax
- ret
-
- .p2align 4
-LABEL(next_8_bytes):
- add $8, %rdi
- add $8, %rsi
-#ifdef USE_AS_STRNCMP
- sub $8, %r11
- jbe LABEL(strcmp_exitz)
-#endif
- test $0x01, %dh
- jnz LABEL(Byte0)
-
- test $0x02, %dh
- jnz LABEL(Byte1)
-
- test $0x04, %dh
- jnz LABEL(Byte2)
-
- test $0x08, %dh
- jnz LABEL(Byte3)
-
- test $0x10, %dh
- jnz LABEL(Byte4)
-
- test $0x20, %dh
- jnz LABEL(Byte5)
-
- test $0x40, %dh
- jnz LABEL(Byte6)
-
-#ifdef USE_AS_STRNCMP
- sub $7, %r11
- jbe LABEL(strcmp_exitz)
-#endif
- movzx 7(%rsi), %ecx
- movzx 7(%rdi), %eax
sub %ecx, %eax
ret
@@ -1636,24 +1493,24 @@
/* Put all SSE 4.2 functions together. */
.section .rodata.sse4.2,"a",@progbits
- .p2align 4
-LABEL(unaligned_table):
- .int LABEL(ashr_1) - LABEL(unaligned_table)
- .int LABEL(ashr_2) - LABEL(unaligned_table)
- .int LABEL(ashr_3) - LABEL(unaligned_table)
- .int LABEL(ashr_4) - LABEL(unaligned_table)
- .int LABEL(ashr_5) - LABEL(unaligned_table)
- .int LABEL(ashr_6) - LABEL(unaligned_table)
- .int LABEL(ashr_7) - LABEL(unaligned_table)
- .int LABEL(ashr_8) - LABEL(unaligned_table)
- .int LABEL(ashr_9) - LABEL(unaligned_table)
- .int LABEL(ashr_10) - LABEL(unaligned_table)
- .int LABEL(ashr_11) - LABEL(unaligned_table)
- .int LABEL(ashr_12) - LABEL(unaligned_table)
- .int LABEL(ashr_13) - LABEL(unaligned_table)
- .int LABEL(ashr_14) - LABEL(unaligned_table)
- .int LABEL(ashr_15) - LABEL(unaligned_table)
- .int LABEL(ashr_0) - LABEL(unaligned_table)
+ .p2align 3
+LABEL(unaligned_table_sse4_2):
+ .int LABEL(ashr_1_sse4_2) - LABEL(unaligned_table_sse4_2)
+ .int LABEL(ashr_2_sse4_2) - LABEL(unaligned_table_sse4_2)
+ .int LABEL(ashr_3_sse4_2) - LABEL(unaligned_table_sse4_2)
+ .int LABEL(ashr_4_sse4_2) - LABEL(unaligned_table_sse4_2)
+ .int LABEL(ashr_5_sse4_2) - LABEL(unaligned_table_sse4_2)
+ .int LABEL(ashr_6_sse4_2) - LABEL(unaligned_table_sse4_2)
+ .int LABEL(ashr_7_sse4_2) - LABEL(unaligned_table_sse4_2)
+ .int LABEL(ashr_8_sse4_2) - LABEL(unaligned_table_sse4_2)
+ .int LABEL(ashr_9_sse4_2) - LABEL(unaligned_table_sse4_2)
+ .int LABEL(ashr_10_sse4_2) - LABEL(unaligned_table_sse4_2)
+ .int LABEL(ashr_11_sse4_2) - LABEL(unaligned_table_sse4_2)
+ .int LABEL(ashr_12_sse4_2) - LABEL(unaligned_table_sse4_2)
+ .int LABEL(ashr_13_sse4_2) - LABEL(unaligned_table_sse4_2)
+ .int LABEL(ashr_14_sse4_2) - LABEL(unaligned_table_sse4_2)
+ .int LABEL(ashr_15_sse4_2) - LABEL(unaligned_table_sse4_2)
+ .int LABEL(ashr_0_sse4_2) - LABEL(unaligned_table_sse4_2)
# undef ENTRY
@@ -1673,6 +1530,4 @@
.globl __GI_STRCMP; __GI_STRCMP = STRCMP_SSE2
#endif
-#ifndef USE_AS_STRNCMP
#include "../strcmp.S"
-#endif
Added: fsf/trunk/libc/sysdeps/x86_64/rtld-memchr.c
==============================================================================
--- fsf/trunk/libc/sysdeps/x86_64/rtld-memchr.c (added)
+++ fsf/trunk/libc/sysdeps/x86_64/rtld-memchr.c Mon Jul 27 00:07:22 2009
@@ -1,0 +1,1 @@
+#include <string/memchr.c>
Added: fsf/trunk/libc/sysdeps/x86_64/rtld-memcmp.c
==============================================================================
--- fsf/trunk/libc/sysdeps/x86_64/rtld-memcmp.c (added)
+++ fsf/trunk/libc/sysdeps/x86_64/rtld-memcmp.c Mon Jul 27 00:07:22 2009
@@ -1,0 +1,1 @@
+#include <string/memcmp.c>
Added: fsf/trunk/libc/sysdeps/x86_64/rtld-rawmemchr.c
==============================================================================
--- fsf/trunk/libc/sysdeps/x86_64/rtld-rawmemchr.c (added)
+++ fsf/trunk/libc/sysdeps/x86_64/rtld-rawmemchr.c Mon Jul 27 00:07:22 2009
@@ -1,0 +1,1 @@
+#include <string/rawmemchr.c>
Added: fsf/trunk/libc/sysdeps/x86_64/rtld-strchr.S
==============================================================================
--- fsf/trunk/libc/sysdeps/x86_64/rtld-strchr.S (added)
+++ fsf/trunk/libc/sysdeps/x86_64/rtld-strchr.S Mon Jul 27 00:07:22 2009
@@ -1,0 +1,291 @@
+/* strchr (str, ch) -- Return pointer to first occurrence of CH in STR.
+ For AMD x86-64.
+ Copyright (C) 2002, 2005 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+#include "bp-sym.h"
+#include "bp-asm.h"
+
+
+ .text
+ENTRY (BP_SYM (strchr))
+
+ /* Before we start with the main loop we process single bytes
+ until the source pointer is aligned. This has two reasons:
+ 1. aligned 64-bit memory access is faster
+ and (more important)
+ 2. we process in the main loop 64 bit in one step although
+ we don't know the end of the string. But accessing at
+ 8-byte alignment guarantees that we never access illegal
+ memory if this would not also be done by the trivial
+ implementation (this is because all processor inherent
+ boundaries are multiples of 8). */
+
+ movq %rdi, %rdx
+ andl $7, %edx /* Mask alignment bits */
+ movq %rdi, %rax /* duplicate destination. */
+ jz 1f /* aligned => start loop */
+ neg %edx
+ addl $8, %edx /* Align to 8 bytes. */
+
+ /* Search the first bytes directly. */
+0: movb (%rax), %cl /* load byte */
+ cmpb %cl,%sil /* compare byte. */
+ je 6f /* target found */
+ testb %cl,%cl /* is byte NUL? */
+ je 7f /* yes => return NULL */
+ incq %rax /* increment pointer */
+ decl %edx
+ jnz 0b
+
+
+1:
+ /* At the moment %rsi contains C. What we need for the
+ algorithm is C in all bytes of the register. Avoid
+ operations on 16 bit words because these require an
+ prefix byte (and one more cycle). */
+ /* Populate 8 bit data to full 64-bit. */
+ movabs $0x0101010101010101,%r9
+ movzbl %sil,%edx
+ imul %rdx,%r9
+
+ movq $0xfefefefefefefeff, %r8 /* Save magic. */
+
+ /* We exit the loop if adding MAGIC_BITS to LONGWORD fails to
+ change any of the hole bits of LONGWORD.
+
+ 1) Is this safe? Will it catch all the zero bytes?
+ Suppose there is a byte with all zeros. Any carry bits
+ propagating from its left will fall into the hole at its
+ least significant bit and stop. Since there will be no
+ carry from its most significant bit, the LSB of the
+ byte to the left will be unchanged, and the zero will be
+ detected.
+
+ 2) Is this worthwhile? Will it ignore everything except
+	   zero bytes?  Suppose every byte of QUADWORD has a bit set
+ somewhere. There will be a carry into bit 8. If bit 8
+ is set, this will carry into bit 16. If bit 8 is clear,
+ one of bits 9-15 must be set, so there will be a carry
+ into bit 16. Similarly, there will be a carry into bit
+	   24, etc.  If one of bits 54-63 is set, there will be a carry
+ into bit 64 (=carry flag), so all of the hole bits will
+ be changed.
+
+ 3) But wait! Aren't we looking for C, not zero?
+ Good point. So what we do is XOR LONGWORD with a longword,
+ each of whose bytes is C. This turns each byte that is C
+ into a zero. */
+
+ .p2align 4
+4:
+ /* Main Loop is unrolled 4 times. */
+ /* First unroll. */
+ movq (%rax), %rcx /* get double word (= 8 bytes) in question */
+ addq $8,%rax /* adjust pointer for next word */
+ movq %r8, %rdx /* magic value */
+ xorq %r9, %rcx /* XOR with qword c|...|c => bytes of str == c
+ are now 0 */
+ addq %rcx, %rdx /* add the magic value to the word. We get
+ carry bits reported for each byte which
+ is *not* 0 */
+ jnc 3f /* highest byte is NUL => return pointer */
+ xorq %rcx, %rdx /* (word+magic)^word */
+ orq %r8, %rdx /* set all non-carry bits */
+ incq %rdx /* add 1: if one carry bit was *not* set
+ the addition will not result in 0. */
+ jnz 3f /* found c => return pointer */
+
+ /* The quadword we looked at does not contain the value we're looking
+ for. Let's search now whether we have reached the end of the
+ string. */
+ xorq %r9, %rcx /* restore original dword without reload */
+ movq %r8, %rdx /* magic value */
+ addq %rcx, %rdx /* add the magic value to the word. We get
+ carry bits reported for each byte which
+ is *not* 0 */
+ jnc 7f /* highest byte is NUL => return NULL */
+ xorq %rcx, %rdx /* (word+magic)^word */
+ orq %r8, %rdx /* set all non-carry bits */
+ incq %rdx /* add 1: if one carry bit was *not* set
+ the addition will not result in 0. */
+ jnz 7f /* found NUL => return NULL */
+
+ /* Second unroll. */
+ movq (%rax), %rcx /* get double word (= 8 bytes) in question */
+ addq $8,%rax /* adjust pointer for next word */
+ movq %r8, %rdx /* magic value */
+ xorq %r9, %rcx /* XOR with qword c|...|c => bytes of str == c
+ are now 0 */
+ addq %rcx, %rdx /* add the magic value to the word. We get
+ carry bits reported for each byte which
+ is *not* 0 */
+ jnc 3f /* highest byte is NUL => return pointer */
+ xorq %rcx, %rdx /* (word+magic)^word */
+ orq %r8, %rdx /* set all non-carry bits */
+ incq %rdx /* add 1: if one carry bit was *not* set
+ the addition will not result in 0. */
+ jnz 3f /* found c => return pointer */
+
+ /* The quadword we looked at does not contain the value we're looking
+ for. Let's search now whether we have reached the end of the
+ string. */
+ xorq %r9, %rcx /* restore original dword without reload */
+ movq %r8, %rdx /* magic value */
+ addq %rcx, %rdx /* add the magic value to the word. We get
+ carry bits reported for each byte which
+ is *not* 0 */
+ jnc 7f /* highest byte is NUL => return NULL */
+ xorq %rcx, %rdx /* (word+magic)^word */
+ orq %r8, %rdx /* set all non-carry bits */
+ incq %rdx /* add 1: if one carry bit was *not* set
+ the addition will not result in 0. */
+ jnz 7f /* found NUL => return NULL */
+ /* Third unroll. */
+ movq (%rax), %rcx /* get double word (= 8 bytes) in question */
+ addq $8,%rax /* adjust pointer for next word */
+ movq %r8, %rdx /* magic value */
+ xorq %r9, %rcx /* XOR with qword c|...|c => bytes of str == c
+ are now 0 */
+ addq %rcx, %rdx /* add the magic value to the word. We get
+ carry bits reported for each byte which
+ is *not* 0 */
+ jnc 3f /* highest byte is NUL => return pointer */
+ xorq %rcx, %rdx /* (word+magic)^word */
+ orq %r8, %rdx /* set all non-carry bits */
+ incq %rdx /* add 1: if one carry bit was *not* set
+ the addition will not result in 0. */
+ jnz 3f /* found c => return pointer */
+
+ /* The quadword we looked at does not contain the value we're looking
+ for. Let's search now whether we have reached the end of the
+ string. */
+ xorq %r9, %rcx /* restore original dword without reload */
+ movq %r8, %rdx /* magic value */
+ addq %rcx, %rdx /* add the magic value to the word. We get
+ carry bits reported for each byte which
+ is *not* 0 */
+ jnc 7f /* highest byte is NUL => return NULL */
+ xorq %rcx, %rdx /* (word+magic)^word */
+ orq %r8, %rdx /* set all non-carry bits */
+ incq %rdx /* add 1: if one carry bit was *not* set
+ the addition will not result in 0. */
+ jnz 7f /* found NUL => return NULL */
+ /* Fourth unroll. */
+ movq (%rax), %rcx /* get double word (= 8 bytes) in question */
+ addq $8,%rax /* adjust pointer for next word */
+ movq %r8, %rdx /* magic value */
+ xorq %r9, %rcx /* XOR with qword c|...|c => bytes of str == c
+ are now 0 */
+ addq %rcx, %rdx /* add the magic value to the word. We get
+ carry bits reported for each byte which
+ is *not* 0 */
+ jnc 3f /* highest byte is NUL => return pointer */
+ xorq %rcx, %rdx /* (word+magic)^word */
+ orq %r8, %rdx /* set all non-carry bits */
+ incq %rdx /* add 1: if one carry bit was *not* set
+ the addition will not result in 0. */
+ jnz 3f /* found c => return pointer */
+
+ /* The quadword we looked at does not contain the value we're looking
+ for. Let's search now whether we have reached the end of the
+ string. */
+ xorq %r9, %rcx /* restore original dword without reload */
+ movq %r8, %rdx /* magic value */
+ addq %rcx, %rdx /* add the magic value to the word. We get
+ carry bits reported for each byte which
+ is *not* 0 */
+ jnc 7f /* highest byte is NUL => return NULL */
+ xorq %rcx, %rdx /* (word+magic)^word */
+ orq %r8, %rdx /* set all non-carry bits */
+ incq %rdx /* add 1: if one carry bit was *not* set
+ the addition will not result in 0. */
+ jz 4b /* no NUL found => restart loop */
+
+
+7: /* Return NULL. */
+ xorl %eax, %eax
+ retq
+
+
+ /* We now scan for the byte in which the character was matched.
+ But we have to take care of the case that a NUL char is
+ found before this in the dword. Note that we XORed %rcx
+ with the byte we're looking for, therefore the tests below look
+ reversed. */
+
+
+ .p2align 4 /* Align, it's a jump target. */
+3: movq %r9,%rdx /* move to %rdx so that we can access bytes */
+ subq $8,%rax /* correct pointer increment. */
+ testb %cl, %cl /* is first byte C? */
+ jz 6f /* yes => return pointer */
+ cmpb %dl, %cl /* is first byte NUL? */
+ je 7b /* yes => return NULL */
+ incq %rax /* increment pointer */
+
+ testb %ch, %ch /* is second byte C? */
+ jz 6f /* yes => return pointer */
+ cmpb %dl, %ch /* is second byte NUL? */
+ je 7b /* yes => return NULL? */
+ incq %rax /* increment pointer */
+
+ shrq $16, %rcx /* make upper bytes accessible */
+ testb %cl, %cl /* is third byte C? */
+ jz 6f /* yes => return pointer */
+ cmpb %dl, %cl /* is third byte NUL? */
+ je 7b /* yes => return NULL */
+ incq %rax /* increment pointer */
+
+ testb %ch, %ch /* is fourth byte C? */
+ jz 6f /* yes => return pointer */
+ cmpb %dl, %ch /* is fourth byte NUL? */
+ je 7b /* yes => return NULL? */
+ incq %rax /* increment pointer */
+
+ shrq $16, %rcx /* make upper bytes accessible */
+ testb %cl, %cl /* is fifth byte C? */
+ jz 6f /* yes => return pointer */
+ cmpb %dl, %cl /* is fifth byte NUL? */
+ je 7b /* yes => return NULL */
+ incq %rax /* increment pointer */
+
+ testb %ch, %ch /* is sixth byte C? */
+ jz 6f /* yes => return pointer */
+ cmpb %dl, %ch /* is sixth byte NUL? */
+ je 7b /* yes => return NULL? */
+ incq %rax /* increment pointer */
+
+ shrq $16, %rcx /* make upper bytes accessible */
+ testb %cl, %cl /* is seventh byte C? */
+ jz 6f /* yes => return pointer */
+ cmpb %dl, %cl /* is seventh byte NUL? */
+ je 7b /* yes => return NULL */
+
+	/* It must be in the eighth byte and it cannot be NUL.  */
+ incq %rax
+
+6:
+ nop
+ retq
+END (BP_SYM (strchr))
+
+weak_alias (BP_SYM (strchr), BP_SYM (index))
+libc_hidden_builtin_def (strchr)
Added: fsf/trunk/libc/sysdeps/x86_64/rtld-strcmp.S
==============================================================================
--- fsf/trunk/libc/sysdeps/x86_64/rtld-strcmp.S (added)
+++ fsf/trunk/libc/sysdeps/x86_64/rtld-strcmp.S Mon Jul 27 00:07:22 2009
@@ -1,0 +1,28 @@
+#include <sysdep.h>
+#include "asm-syntax.h"
+#include "bp-sym.h"
+#include "bp-asm.h"
+
+#ifndef LABEL
+#define LABEL(l) L(l)
+#endif
+
+ .text
+ENTRY (BP_SYM (STRCMP))
+/* Simple version since we can't use SSE registers in ld.so. */
+L(oop): movb (%rdi), %al
+ cmpb (%rsi), %al
+ jne L(neq)
+ incq %rdi
+ incq %rsi
+ testb %al, %al
+ jnz L(oop)
+
+ xorl %eax, %eax
+ ret
+
+L(neq): movl $1, %eax
+ movl $-1, %ecx
+ cmovbl %ecx, %eax
+ ret
+END (BP_SYM (STRCMP))
Added: fsf/trunk/libc/sysdeps/x86_64/rtld-strlen.S
==============================================================================
--- fsf/trunk/libc/sysdeps/x86_64/rtld-strlen.S (added)
+++ fsf/trunk/libc/sysdeps/x86_64/rtld-strlen.S Mon Jul 27 00:07:22 2009
@@ -1,0 +1,139 @@
+/* strlen(str) -- determine the length of the string STR.
+ Copyright (C) 2002, 2003 Free Software Foundation, Inc.
+ Based on i486 version contributed by Ulrich Drepper <drepper@xxxxxxxxxx>.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+#include "bp-sym.h"
+#include "bp-asm.h"
+
+
+ .text
+ENTRY (strlen)
+ movq %rdi, %rcx /* Duplicate source pointer. */
+ andl $7, %ecx /* mask alignment bits */
+ movq %rdi, %rax /* duplicate destination. */
+ jz 1f /* aligned => start loop */
+
+ neg %ecx /* We need to align to 8 bytes. */
+ addl $8,%ecx
+ /* Search the first bytes directly. */
+0: cmpb $0x0,(%rax) /* is byte NUL? */
+ je 2f /* yes => return */
+ incq %rax /* increment pointer */
+ decl %ecx
+ jnz 0b
+
+1: movq $0xfefefefefefefeff,%r8 /* Save magic. */
+
+ .p2align 4 /* Align loop. */
+4: /* Main Loop is unrolled 4 times. */
+ /* First unroll. */
+ movq (%rax), %rcx /* get double word (= 8 bytes) in question */
+ addq $8,%rax /* adjust pointer for next word */
+ movq %r8, %rdx /* magic value */
+ addq %rcx, %rdx /* add the magic value to the word. We get
+ carry bits reported for each byte which
+ is *not* 0 */
+ jnc 3f /* highest byte is NUL => return pointer */
+ xorq %rcx, %rdx /* (word+magic)^word */
+ orq %r8, %rdx /* set all non-carry bits */
+ incq %rdx /* add 1: if one carry bit was *not* set
+ the addition will not result in 0. */
+ jnz 3f /* found NUL => return pointer */
+
+ /* Second unroll. */
+ movq (%rax), %rcx /* get double word (= 8 bytes) in question */
+ addq $8,%rax /* adjust pointer for next word */
+ movq %r8, %rdx /* magic value */
+ addq %rcx, %rdx /* add the magic value to the word. We get
+ carry bits reported for each byte which
+ is *not* 0 */
+ jnc 3f /* highest byte is NUL => return pointer */
+ xorq %rcx, %rdx /* (word+magic)^word */
+ orq %r8, %rdx /* set all non-carry bits */
+ incq %rdx /* add 1: if one carry bit was *not* set
+ the addition will not result in 0. */
+ jnz 3f /* found NUL => return pointer */
+
+ /* Third unroll. */
+ movq (%rax), %rcx /* get double word (= 8 bytes) in question */
+ addq $8,%rax /* adjust pointer for next word */
+ movq %r8, %rdx /* magic value */
+ addq %rcx, %rdx /* add the magic value to the word. We get
+ carry bits reported for each byte which
+ is *not* 0 */
+ jnc 3f /* highest byte is NUL => return pointer */
+ xorq %rcx, %rdx /* (word+magic)^word */
+ orq %r8, %rdx /* set all non-carry bits */
+ incq %rdx /* add 1: if one carry bit was *not* set
+ the addition will not result in 0. */
+ jnz 3f /* found NUL => return pointer */
+
+ /* Fourth unroll. */
+ movq (%rax), %rcx /* get double word (= 8 bytes) in question */
+ addq $8,%rax /* adjust pointer for next word */
+ movq %r8, %rdx /* magic value */
+ addq %rcx, %rdx /* add the magic value to the word. We get
+ carry bits reported for each byte which
+ is *not* 0 */
+ jnc 3f /* highest byte is NUL => return pointer */
+ xorq %rcx, %rdx /* (word+magic)^word */
+ orq %r8, %rdx /* set all non-carry bits */
+ incq %rdx /* add 1: if one carry bit was *not* set
+ the addition will not result in 0. */
+ jz 4b /* no NUL found => continue loop */
+
+ .p2align 4 /* Align, it's a jump target. */
+3: subq $8,%rax /* correct pointer increment. */
+
+ testb %cl, %cl /* is first byte NUL? */
+ jz 2f /* yes => return */
+ incq %rax /* increment pointer */
+
+ testb %ch, %ch /* is second byte NUL? */
+ jz 2f /* yes => return */
+ incq %rax /* increment pointer */
+
+ testl $0x00ff0000, %ecx /* is third byte NUL? */
+ jz 2f /* yes => return pointer */
+ incq %rax /* increment pointer */
+
+ testl $0xff000000, %ecx /* is fourth byte NUL? */
+ jz 2f /* yes => return pointer */
+ incq %rax /* increment pointer */
+
+ shrq $32, %rcx /* look at other half. */
+
+ testb %cl, %cl /* is first byte NUL? */
+ jz 2f /* yes => return */
+ incq %rax /* increment pointer */
+
+ testb %ch, %ch /* is second byte NUL? */
+ jz 2f /* yes => return */
+ incq %rax /* increment pointer */
+
+ testl $0xff0000, %ecx /* is third byte NUL? */
+ jz 2f /* yes => return pointer */
+ incq %rax /* increment pointer */
+2:
+ subq %rdi, %rax /* compute difference to string start */
+ ret
+END (strlen)
+libc_hidden_builtin_def (strlen)
Modified: fsf/trunk/libc/sysdeps/x86_64/strcmp.S
==============================================================================
--- fsf/trunk/libc/sysdeps/x86_64/strcmp.S (original)
+++ fsf/trunk/libc/sysdeps/x86_64/strcmp.S Mon Jul 27 00:07:22 2009
@@ -1,8 +1,10 @@
/* Highly optimized version for x86-64.
- Copyright (C) 1999, 2000, 2002, 2003, 2005 Free Software Foundation, Inc.
+ Copyright (C) 1999, 2000, 2002, 2003, 2005, 2009
+ Free Software Foundation, Inc.
This file is part of the GNU C Library.
Based on i686 version contributed by Ulrich Drepper
<drepper@xxxxxxxxxx>, 1999.
+ Updated with SSE2 support contributed by Intel Corporation.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
@@ -24,8 +26,35 @@
#include "bp-sym.h"
#include "bp-asm.h"
- .text
-ENTRY (BP_SYM (strcmp))
+#undef UPDATE_STRNCMP_COUNTER
+
+#ifndef LABEL
+#define LABEL(l) L(l)
+#endif
+
+#ifdef USE_AS_STRNCMP
+/* Since the counter, %r11, is unsigned, we branch to strcmp_exitz
+ if the new counter > the old one or is 0. */
+# define UPDATE_STRNCMP_COUNTER \
+ /* calculate left number to compare */ \
+ lea -16(%rcx, %r11), %r9; \
+ cmp %r9, %r11; \
+ jb LABEL(strcmp_exitz); \
+ test %r9, %r9; \
+ je LABEL(strcmp_exitz); \
+ mov %r9, %r11
+
+#else
+# define UPDATE_STRNCMP_COUNTER
+# ifndef STRCMP
+# define STRCMP strcmp
+# endif
+#endif
+
+ .text
+ENTRY (BP_SYM (STRCMP))
+#ifdef NOT_IN_libc
+/* Simple version since we can't use SSE registers in ld.so. */
L(oop): movb (%rdi), %al
cmpb (%rsi), %al
jne L(neq)
@@ -41,5 +70,1914 @@
movl $-1, %ecx
cmovbl %ecx, %eax
ret
-END (BP_SYM (strcmp))
-libc_hidden_builtin_def (strcmp)
+END (BP_SYM (STRCMP))
+#else /* NOT_IN_libc */
+/*
+ * This implementation uses SSE to compare up to 16 bytes at a time.
+ */
+#ifdef USE_AS_STRNCMP
+ test %rdx, %rdx
+ je LABEL(strcmp_exitz)
+ cmp $1, %rdx
+ je LABEL(Byte0)
+ mov %rdx, %r11
+#endif
+ mov %esi, %ecx
+ mov %edi, %eax
+/* Use 64bit AND here to avoid long NOP padding. */
+ and $0x3f, %rcx /* rsi alignment in cache line */
+ and $0x3f, %rax /* rdi alignment in cache line */
+ cmp $0x30, %ecx
+ ja LABEL(crosscache) /* rsi: 16-byte load will cross cache line */
+ cmp $0x30, %eax
+ ja LABEL(crosscache) /* rdi: 16-byte load will cross cache line */
+ movlpd (%rdi), %xmm1
+ movlpd (%rsi), %xmm2
+ movhpd 8(%rdi), %xmm1
+ movhpd 8(%rsi), %xmm2
+ pxor %xmm0, %xmm0 /* clear %xmm0 for null char checks */
+ pcmpeqb %xmm1, %xmm0 /* Any null chars? */
+ pcmpeqb %xmm2, %xmm1 /* compare first 16 bytes for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 16 bytes are same, edx == 0xffff */
+ jnz LABEL(less16bytes) /* If not, find different value or null char */
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+	jbe	LABEL(strcmp_exitz)	/* finish comparison */
+#endif
+ add $16, %rsi /* prepare to search next 16 bytes */
+ add $16, %rdi /* prepare to search next 16 bytes */
+
+ /*
+ * Determine source and destination string offsets from 16-byte alignment.
+ * Use relative offset difference between the two to determine which case
+ * below to use.
+ */
+ .p2align 4
+LABEL(crosscache):
+ and $0xfffffffffffffff0, %rsi /* force %rsi is 16 byte aligned */
+ and $0xfffffffffffffff0, %rdi /* force %rdi is 16 byte aligned */
+ mov $0xffff, %edx /* for equivalent offset */
+ xor %r8d, %r8d
+ and $0xf, %ecx /* offset of rsi */
+ and $0xf, %eax /* offset of rdi */
+ cmp %eax, %ecx
+ je LABEL(ashr_0) /* rsi and rdi relative offset same */
+ ja LABEL(bigger)
+ mov %edx, %r8d /* r8d is offset flag for exit tail */
+ xchg %ecx, %eax
+ xchg %rsi, %rdi
+LABEL(bigger):
+ lea 15(%rax), %r9
+ sub %rcx, %r9
+ lea LABEL(unaligned_table)(%rip), %r10
+ movslq (%r10, %r9,4), %r9
+ lea (%r10, %r9), %r10
+ jmp *%r10 /* jump to corresponding case */
+
+/*
+ * The following cases will be handled by ashr_0
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(0~15) n(0~15) 15(15+ n-n) ashr_0
+ */
+ .p2align 4
+LABEL(ashr_0):
+
+ movdqa (%rsi), %xmm1
+ pxor %xmm0, %xmm0 /* clear %xmm0 for null char check */
+ pcmpeqb %xmm1, %xmm0 /* Any null chars? */
+ pcmpeqb (%rdi), %xmm1 /* compare 16 bytes for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %r9d
+ shr %cl, %edx /* adjust 0xffff for offset */
+ shr %cl, %r9d /* adjust for 16-byte offset */
+ sub %r9d, %edx
+ /*
+ * edx must be the same with r9d if in left byte (16-rcx) is equal to
+ * the start from (16-rax) and no null char was seen.
+ */
+ jne LABEL(less32bytes) /* mismatch or null char */
+ UPDATE_STRNCMP_COUNTER
+ mov $16, %rcx
+ mov $16, %r9
+ pxor %xmm0, %xmm0 /* clear xmm0, may have changed above */
+
+ /*
+ * Now both strings are aligned at 16-byte boundary. Loop over strings
+ * checking 32-bytes per iteration.
+ */
+ .p2align 4
+LABEL(loop_ashr_0):
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit) /* mismatch or null char seen */
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+#endif
+ add $16, %rcx
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit)
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+#endif
+ add $16, %rcx
+ jmp LABEL(loop_ashr_0)
+
+/*
+ * The following cases will be handled by ashr_1
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(15) n -15 0(15 +(n-15) - n) ashr_1
+ */
+ .p2align 4
+LABEL(ashr_1):
+ pxor %xmm0, %xmm0
+ movdqa (%rdi), %xmm2
+ movdqa (%rsi), %xmm1
+ pcmpeqb %xmm1, %xmm0 /* Any null chars? */
+ pslldq $15, %xmm2 /* shift first string to align with second */
+ pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */
+ psubb %xmm0, %xmm2 /* packed sub of comparison results*/
+ pmovmskb %xmm2, %r9d
+ shr %cl, %edx /* adjust 0xffff for offset */
+ shr %cl, %r9d /* adjust for 16-byte offset */
+ sub %r9d, %edx
+ jnz LABEL(less32bytes) /* mismatch or null char seen */
+ movdqa (%rdi), %xmm3
+ UPDATE_STRNCMP_COUNTER
+
+ pxor %xmm0, %xmm0
+ mov $16, %rcx /* index for loads*/
+ mov $1, %r9d /* byte position left over from less32bytes case */
+ /*
+ * Setup %r10 value allows us to detect crossing a page boundary.
+ * When %r10 goes positive we have crossed a page boundary and
+ * need to do a nibble.
+ */
+ lea 1(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+ sub $0x1000, %r10 /* subtract 4K pagesize */
+
+ .p2align 4
+LABEL(loop_ashr_1):
+ add $16, %r10
+ jg LABEL(nibble_ashr_1) /* cross page boundary */
+
+LABEL(gobble_ashr_1):
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4 /* store for next cycle */
+
+ psrldq $1, %xmm3
+ pslldq $15, %xmm2
+ por %xmm3, %xmm2 /* merge into one 16byte value */
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+#endif
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+
+ add $16, %r10
+ jg LABEL(nibble_ashr_1) /* cross page boundary */
+
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4 /* store for next cycle */
+
+ psrldq $1, %xmm3
+ pslldq $15, %xmm2
+ por %xmm3, %xmm2 /* merge into one 16byte value */
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+#endif
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+ jmp LABEL(loop_ashr_1)
+
+ /*
+ * Nibble avoids loads across page boundary. This is to avoid a potential
+ * access into unmapped memory.
+ */
+ .p2align 4
+LABEL(nibble_ashr_1):
+ pcmpeqb %xmm3, %xmm0 /* check nibble for null char*/
+ pmovmskb %xmm0, %edx
+ test $0xfffe, %edx
+ jnz LABEL(ashr_1_exittail) /* find null char*/
+
+#ifdef USE_AS_STRNCMP
+ cmp $14, %r11
+ jbe LABEL(ashr_1_exittail)
+#endif
+
+ pxor %xmm0, %xmm0
+	sub	$0x1000, %r10		/* subtract 4K from %r10 */
+ jmp LABEL(gobble_ashr_1)
+
+ /*
+ * Once find null char, determine if there is a string mismatch
+ * before the null char.
+ */
+ .p2align 4
+LABEL(ashr_1_exittail):
+ movdqa (%rsi, %rcx), %xmm1
+ psrldq $1, %xmm0
+ psrldq $1, %xmm3
+ jmp LABEL(aftertail)
+
+/*
+ * The following cases will be handled by ashr_2
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(14~15) n -14 1(15 +(n-14) - n) ashr_2
+ */
+ .p2align 4
+LABEL(ashr_2):
+ pxor %xmm0, %xmm0
+ movdqa (%rdi), %xmm2
+ movdqa (%rsi), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pslldq $14, %xmm2
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
+ pmovmskb %xmm2, %r9d
+ shr %cl, %edx
+ shr %cl, %r9d
+ sub %r9d, %edx
+ jnz LABEL(less32bytes)
+ movdqa (%rdi), %xmm3
+ UPDATE_STRNCMP_COUNTER
+
+ pxor %xmm0, %xmm0
+ mov $16, %rcx /* index for loads */
+ mov $2, %r9d /* byte position left over from less32bytes case */
+ /*
+ * Setup %r10 value allows us to detect crossing a page boundary.
+ * When %r10 goes positive we have crossed a page boundary and
+ * need to do a nibble.
+ */
+ lea 2(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+ sub $0x1000, %r10 /* subtract 4K pagesize */
+
+ .p2align 4
+LABEL(loop_ashr_2):
+ add $16, %r10
+ jg LABEL(nibble_ashr_2)
+
+LABEL(gobble_ashr_2):
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ psrldq $2, %xmm3
+ pslldq $14, %xmm2
+ por %xmm3, %xmm2
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+
+ add $16, %r10
+ jg LABEL(nibble_ashr_2) /* cross page boundary */
+
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ psrldq $2, %xmm3
+ pslldq $14, %xmm2
+ por %xmm3, %xmm2
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+ jmp LABEL(loop_ashr_2)
+
+ .p2align 4
+LABEL(nibble_ashr_2):
+ pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
+ pmovmskb %xmm0, %edx
+ test $0xfffc, %edx
+ jnz LABEL(ashr_2_exittail)
+
+#ifdef USE_AS_STRNCMP
+ cmp $13, %r11
+ jbe LABEL(ashr_2_exittail)
+#endif
+
+ pxor %xmm0, %xmm0
+ sub $0x1000, %r10
+ jmp LABEL(gobble_ashr_2)
+
+ .p2align 4
+LABEL(ashr_2_exittail):
+ movdqa (%rsi, %rcx), %xmm1
+ psrldq $2, %xmm0
+ psrldq $2, %xmm3
+ jmp LABEL(aftertail)
+
+/*
+ * The following cases will be handled by ashr_3
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(13~15) n -13 2(15 +(n-13) - n) ashr_3
+ */
+ .p2align 4
+LABEL(ashr_3):
+ pxor %xmm0, %xmm0
+ movdqa (%rdi), %xmm2
+ movdqa (%rsi), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pslldq $13, %xmm2
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
+ pmovmskb %xmm2, %r9d
+ shr %cl, %edx
+ shr %cl, %r9d
+ sub %r9d, %edx
+ jnz LABEL(less32bytes)
+ movdqa (%rdi), %xmm3
+
+ UPDATE_STRNCMP_COUNTER
+
+ pxor %xmm0, %xmm0
+ mov $16, %rcx /* index for loads */
+ mov $3, %r9d /* byte position left over from less32bytes case */
+ /*
+ * Setup %r10 value allows us to detect crossing a page boundary.
+ * When %r10 goes positive we have crossed a page boundary and
+ * need to do a nibble.
+ */
+ lea 3(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+ sub $0x1000, %r10 /* subtract 4K pagesize */
+
+ .p2align 4
+LABEL(loop_ashr_3):
+ add $16, %r10
+ jg LABEL(nibble_ashr_3)
+
+LABEL(gobble_ashr_3):
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ psrldq $3, %xmm3
+ pslldq $13, %xmm2
+ por %xmm3, %xmm2
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+
+ add $16, %r10
+ jg LABEL(nibble_ashr_3) /* cross page boundary */
+
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ psrldq $3, %xmm3
+ pslldq $13, %xmm2
+ por %xmm3, %xmm2
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+ jmp LABEL(loop_ashr_3)
+
+ .p2align 4
+LABEL(nibble_ashr_3):
+ pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
+ pmovmskb %xmm0, %edx
+ test $0xfff8, %edx
+ jnz LABEL(ashr_3_exittail)
+
+#ifdef USE_AS_STRNCMP
+ cmp $12, %r11
+ jbe LABEL(ashr_3_exittail)
+#endif
+
+ pxor %xmm0, %xmm0
+ sub $0x1000, %r10
+ jmp LABEL(gobble_ashr_3)
+
+ .p2align 4
+LABEL(ashr_3_exittail):
+ movdqa (%rsi, %rcx), %xmm1
+ psrldq $3, %xmm0
+ psrldq $3, %xmm3
+ jmp LABEL(aftertail)
+
+/*
+ * The following cases will be handled by ashr_4
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(12~15) n -12 3(15 +(n-12) - n) ashr_4
+ */
+ .p2align 4
+LABEL(ashr_4):
+ pxor %xmm0, %xmm0
+ movdqa (%rdi), %xmm2
+ movdqa (%rsi), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pslldq $12, %xmm2
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
+ pmovmskb %xmm2, %r9d
+ shr %cl, %edx
+ shr %cl, %r9d
+ sub %r9d, %edx
+ jnz LABEL(less32bytes)
+ movdqa (%rdi), %xmm3
+
+ UPDATE_STRNCMP_COUNTER
+
+ pxor %xmm0, %xmm0
+ mov $16, %rcx /* index for loads */
+ mov $4, %r9d /* byte position left over from less32bytes case */
+ /*
+ * Setup %r10 value allows us to detect crossing a page boundary.
+ * When %r10 goes positive we have crossed a page boundary and
+ * need to do a nibble.
+ */
+ lea 4(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+ sub $0x1000, %r10 /* subtract 4K pagesize */
+
+ .p2align 4
+LABEL(loop_ashr_4):
+ add $16, %r10
+ jg LABEL(nibble_ashr_4)
+
+LABEL(gobble_ashr_4):
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ psrldq $4, %xmm3
+ pslldq $12, %xmm2
+ por %xmm3, %xmm2
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+
+ add $16, %r10
+ jg LABEL(nibble_ashr_4) /* cross page boundary */
+
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ psrldq $4, %xmm3
+ pslldq $12, %xmm2
+ por %xmm3, %xmm2
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+ jmp LABEL(loop_ashr_4)
+
+ .p2align 4
+LABEL(nibble_ashr_4):
+ pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
+ pmovmskb %xmm0, %edx
+ test $0xfff0, %edx
+ jnz LABEL(ashr_4_exittail)
+
+#ifdef USE_AS_STRNCMP
+ cmp $11, %r11
+ jbe LABEL(ashr_4_exittail)
+#endif
+
+ pxor %xmm0, %xmm0
+ sub $0x1000, %r10
+ jmp LABEL(gobble_ashr_4)
+
+ .p2align 4
+LABEL(ashr_4_exittail):
+ movdqa (%rsi, %rcx), %xmm1
+ psrldq $4, %xmm0
+ psrldq $4, %xmm3
+ jmp LABEL(aftertail)
+
+/*
+ * The following cases will be handled by ashr_5
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(11~15) n - 11 4(15 +(n-11) - n) ashr_5
+ */
+ .p2align 4
+LABEL(ashr_5):
+ pxor %xmm0, %xmm0
+ movdqa (%rdi), %xmm2
+ movdqa (%rsi), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pslldq $11, %xmm2
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
+ pmovmskb %xmm2, %r9d
+ shr %cl, %edx
+ shr %cl, %r9d
+ sub %r9d, %edx
+ jnz LABEL(less32bytes)
+ movdqa (%rdi), %xmm3
+
+ UPDATE_STRNCMP_COUNTER
+
+ pxor %xmm0, %xmm0
+ mov $16, %rcx /* index for loads */
+ mov $5, %r9d /* byte position left over from less32bytes case */
+ /*
+ * Setup %r10 value allows us to detect crossing a page boundary.
+ * When %r10 goes positive we have crossed a page boundary and
+ * need to do a nibble.
+ */
+ lea 5(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+ sub $0x1000, %r10 /* subtract 4K pagesize */
+
+ .p2align 4
+LABEL(loop_ashr_5):
+ add $16, %r10
+ jg LABEL(nibble_ashr_5)
+
+LABEL(gobble_ashr_5):
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ psrldq $5, %xmm3
+ pslldq $11, %xmm2
+ por %xmm3, %xmm2
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+
+ add $16, %r10
+ jg LABEL(nibble_ashr_5) /* cross page boundary */
+
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ psrldq $5, %xmm3
+ pslldq $11, %xmm2
+ por %xmm3, %xmm2
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+ jmp LABEL(loop_ashr_5)
+
+ .p2align 4
+LABEL(nibble_ashr_5):
+ pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
+ pmovmskb %xmm0, %edx
+ test $0xffe0, %edx
+ jnz LABEL(ashr_5_exittail)
+
+#ifdef USE_AS_STRNCMP
+ cmp $10, %r11
+ jbe LABEL(ashr_5_exittail)
+#endif
+
+ pxor %xmm0, %xmm0
+ sub $0x1000, %r10
+ jmp LABEL(gobble_ashr_5)
+
+ .p2align 4
+LABEL(ashr_5_exittail):
+ movdqa (%rsi, %rcx), %xmm1
+ psrldq $5, %xmm0
+ psrldq $5, %xmm3
+ jmp LABEL(aftertail)
+
+/*
+ * The following cases will be handled by ashr_6
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(10~15) n - 10 5(15 +(n-10) - n) ashr_6
+ */
+ .p2align 4
+LABEL(ashr_6):
+ pxor %xmm0, %xmm0
+ movdqa (%rdi), %xmm2
+ movdqa (%rsi), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pslldq $10, %xmm2
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
+ pmovmskb %xmm2, %r9d
+ shr %cl, %edx
+ shr %cl, %r9d
+ sub %r9d, %edx
+ jnz LABEL(less32bytes)
+ movdqa (%rdi), %xmm3
+
+ UPDATE_STRNCMP_COUNTER
+
+ pxor %xmm0, %xmm0
+ mov $16, %rcx /* index for loads */
+ mov $6, %r9d /* byte position left over from less32bytes case */
+ /*
+ * Setup %r10 value allows us to detect crossing a page boundary.
+ * When %r10 goes positive we have crossed a page boundary and
+ * need to do a nibble.
+ */
+ lea 6(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+ sub $0x1000, %r10 /* subtract 4K pagesize */
+
+ .p2align 4
+LABEL(loop_ashr_6):
+ add $16, %r10
+ jg LABEL(nibble_ashr_6)
+
+LABEL(gobble_ashr_6):
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ psrldq $6, %xmm3
+ pslldq $10, %xmm2
+ por %xmm3, %xmm2
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+
+ add $16, %r10
+ jg LABEL(nibble_ashr_6) /* cross page boundary */
+
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ psrldq $6, %xmm3
+ pslldq $10, %xmm2
+ por %xmm3, %xmm2
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+ jmp LABEL(loop_ashr_6)
+
+ .p2align 4
+LABEL(nibble_ashr_6):
+ pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
+ pmovmskb %xmm0, %edx
+ test $0xffc0, %edx
+ jnz LABEL(ashr_6_exittail)
+
+#ifdef USE_AS_STRNCMP
+ cmp $9, %r11
+ jbe LABEL(ashr_6_exittail)
+#endif
+
+ pxor %xmm0, %xmm0
+ sub $0x1000, %r10
+ jmp LABEL(gobble_ashr_6)
+
+ .p2align 4
+LABEL(ashr_6_exittail):
+ movdqa (%rsi, %rcx), %xmm1
+ psrldq $6, %xmm0
+ psrldq $6, %xmm3
+ jmp LABEL(aftertail)
+
+/*
+ * The following cases will be handled by ashr_7
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(9~15) n - 9 6(15 +(n - 9) - n) ashr_7
+ */
+ .p2align 4
+LABEL(ashr_7):
+ pxor %xmm0, %xmm0
+ movdqa (%rdi), %xmm2
+ movdqa (%rsi), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pslldq $9, %xmm2
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
+ pmovmskb %xmm2, %r9d
+ shr %cl, %edx
+ shr %cl, %r9d
+ sub %r9d, %edx
+ jnz LABEL(less32bytes)
+ movdqa (%rdi), %xmm3
+
+ UPDATE_STRNCMP_COUNTER
+
+ pxor %xmm0, %xmm0
+ mov $16, %rcx /* index for loads */
+ mov $7, %r9d /* byte position left over from less32bytes case */
+ /*
+ * Setup %r10 value allows us to detect crossing a page boundary.
+ * When %r10 goes positive we have crossed a page boundary and
+ * need to do a nibble.
+ */
+ lea 7(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+ sub $0x1000, %r10 /* subtract 4K pagesize */
+
+ .p2align 4
+LABEL(loop_ashr_7):
+ add $16, %r10
+ jg LABEL(nibble_ashr_7)
+
+LABEL(gobble_ashr_7):
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ psrldq $7, %xmm3
+ pslldq $9, %xmm2
+ por %xmm3, %xmm2
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+
+ add $16, %r10
+ jg LABEL(nibble_ashr_7) /* cross page boundary */
+
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ psrldq $7, %xmm3
+ pslldq $9, %xmm2
+ por %xmm3, %xmm2
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+ jmp LABEL(loop_ashr_7)
+
+ .p2align 4
+LABEL(nibble_ashr_7):
+ pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
+ pmovmskb %xmm0, %edx
+ test $0xff80, %edx
+ jnz LABEL(ashr_7_exittail)
+
+#ifdef USE_AS_STRNCMP
+ cmp $8, %r11
+ jbe LABEL(ashr_7_exittail)
+#endif
+
+ pxor %xmm0, %xmm0
+ sub $0x1000, %r10
+ jmp LABEL(gobble_ashr_7)
+
+ .p2align 4
+LABEL(ashr_7_exittail):
+ movdqa (%rsi, %rcx), %xmm1
+ psrldq $7, %xmm0
+ psrldq $7, %xmm3
+ jmp LABEL(aftertail)
+
+/*
+ * The following cases will be handled by ashr_8
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(8~15) n - 8 7(15 +(n - 8) - n) ashr_8
+ */
+ .p2align 4
+LABEL(ashr_8):
+ pxor %xmm0, %xmm0
+ movdqa (%rdi), %xmm2
+ movdqa (%rsi), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pslldq $8, %xmm2
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
+ pmovmskb %xmm2, %r9d
+ shr %cl, %edx
+ shr %cl, %r9d
+ sub %r9d, %edx
+ jnz LABEL(less32bytes)
+ movdqa (%rdi), %xmm3
+
+ UPDATE_STRNCMP_COUNTER
+
+ pxor %xmm0, %xmm0
+ mov $16, %rcx /* index for loads */
+ mov $8, %r9d /* byte position left over from less32bytes case */
+ /*
+ * Setup %r10 value allows us to detect crossing a page boundary.
+ * When %r10 goes positive we have crossed a page boundary and
+ * need to do a nibble.
+ */
+ lea 8(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+ sub $0x1000, %r10 /* subtract 4K pagesize */
+
+ .p2align 4
+LABEL(loop_ashr_8):
+ add $16, %r10
+ jg LABEL(nibble_ashr_8)
+
+LABEL(gobble_ashr_8):
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ psrldq $8, %xmm3
+ pslldq $8, %xmm2
+ por %xmm3, %xmm2
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+
+ add $16, %r10
+ jg LABEL(nibble_ashr_8) /* cross page boundary */
+
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ psrldq $8, %xmm3
+ pslldq $8, %xmm2
+ por %xmm3, %xmm2
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+ jmp LABEL(loop_ashr_8)
+
+ .p2align 4
+LABEL(nibble_ashr_8):
+ pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
+ pmovmskb %xmm0, %edx
+ test $0xff00, %edx
+ jnz LABEL(ashr_8_exittail)
+
+#ifdef USE_AS_STRNCMP
+ cmp $7, %r11
+ jbe LABEL(ashr_8_exittail)
+#endif
+
+ pxor %xmm0, %xmm0
+ sub $0x1000, %r10
+ jmp LABEL(gobble_ashr_8)
+
+ .p2align 4
+LABEL(ashr_8_exittail):
+ movdqa (%rsi, %rcx), %xmm1
+ psrldq $8, %xmm0
+ psrldq $8, %xmm3
+ jmp LABEL(aftertail)
+
+/*
+ * The following cases will be handled by ashr_9
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(7~15) n - 7 8(15 +(n - 7) - n) ashr_9
+ */
+ .p2align 4
+LABEL(ashr_9):
+ pxor %xmm0, %xmm0
+ movdqa (%rdi), %xmm2
+ movdqa (%rsi), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pslldq $7, %xmm2
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
+ pmovmskb %xmm2, %r9d
+ shr %cl, %edx
+ shr %cl, %r9d
+ sub %r9d, %edx
+ jnz LABEL(less32bytes)
+ movdqa (%rdi), %xmm3
+
+ UPDATE_STRNCMP_COUNTER
+
+ pxor %xmm0, %xmm0
+ mov $16, %rcx /* index for loads */
+ mov $9, %r9d /* byte position left over from less32bytes case */
+ /*
+ * Setup %r10 value allows us to detect crossing a page boundary.
+ * When %r10 goes positive we have crossed a page boundary and
+ * need to do a nibble.
+ */
+ lea 9(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+ sub $0x1000, %r10 /* subtract 4K pagesize */
+
+ .p2align 4
+LABEL(loop_ashr_9):
+ add $16, %r10
+ jg LABEL(nibble_ashr_9)
+
+LABEL(gobble_ashr_9):
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ psrldq $9, %xmm3
+ pslldq $7, %xmm2
+ por %xmm3, %xmm2
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+
+ add $16, %r10
+ jg LABEL(nibble_ashr_9) /* cross page boundary */
+
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ psrldq $9, %xmm3
+ pslldq $7, %xmm2
+ por %xmm3, %xmm2
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3 /* store for next cycle */
+ jmp LABEL(loop_ashr_9)
+
+ .p2align 4
+LABEL(nibble_ashr_9):
+ pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
+ pmovmskb %xmm0, %edx
+ test $0xfe00, %edx
+ jnz LABEL(ashr_9_exittail)
+
+#ifdef USE_AS_STRNCMP
+ cmp $6, %r11
+ jbe LABEL(ashr_9_exittail)
+#endif
+
+ pxor %xmm0, %xmm0
+ sub $0x1000, %r10
+ jmp LABEL(gobble_ashr_9)
+
+ .p2align 4
+LABEL(ashr_9_exittail):
+ movdqa (%rsi, %rcx), %xmm1
+ psrldq $9, %xmm0
+ psrldq $9, %xmm3
+ jmp LABEL(aftertail)
+
+/*
+ * The following cases will be handled by ashr_10
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(6~15) n - 6 9(15 +(n - 6) - n) ashr_10
+ */
+ .p2align 4
+LABEL(ashr_10):
+ pxor %xmm0, %xmm0
+ movdqa (%rdi), %xmm2
+ movdqa (%rsi), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pslldq $6, %xmm2
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
+ pmovmskb %xmm2, %r9d
+ shr %cl, %edx
+ shr %cl, %r9d
+ sub %r9d, %edx
+ jnz LABEL(less32bytes)
+ movdqa (%rdi), %xmm3
+
+ UPDATE_STRNCMP_COUNTER
+
+ pxor %xmm0, %xmm0
+ mov $16, %rcx /* index for loads */
+ mov $10, %r9d /* byte position left over from less32bytes case */
+ /*
+ * Setup %r10 value allows us to detect crossing a page boundary.
+ * When %r10 goes positive we have crossed a page boundary and
+ * need to do a nibble.
+ */
+ lea 10(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+ sub $0x1000, %r10 /* subtract 4K pagesize */
+
+ .p2align 4
+LABEL(loop_ashr_10):
+ add $16, %r10
+ jg LABEL(nibble_ashr_10)
+
+LABEL(gobble_ashr_10):
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ psrldq $10, %xmm3
+ pslldq $6, %xmm2
+ por %xmm3, %xmm2
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+
+ add $16, %r10
+ jg LABEL(nibble_ashr_10) /* cross page boundary */
+
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ psrldq $10, %xmm3
+ pslldq $6, %xmm2
+ por %xmm3, %xmm2
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+ jmp LABEL(loop_ashr_10)
+
+ .p2align 4
+LABEL(nibble_ashr_10):
+ pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
+ pmovmskb %xmm0, %edx
+ test $0xfc00, %edx
+ jnz LABEL(ashr_10_exittail)
+
+#ifdef USE_AS_STRNCMP
+ cmp $5, %r11
+ jbe LABEL(ashr_10_exittail)
+#endif
+
+ pxor %xmm0, %xmm0
+ sub $0x1000, %r10
+ jmp LABEL(gobble_ashr_10)
+
+ .p2align 4
+LABEL(ashr_10_exittail):
+ movdqa (%rsi, %rcx), %xmm1
+ psrldq $10, %xmm0
+ psrldq $10, %xmm3
+ jmp LABEL(aftertail)
+
+/*
+ * The following cases will be handled by ashr_11
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(5~15) n - 5 10(15 +(n - 5) - n) ashr_11
+ */
+ .p2align 4
+LABEL(ashr_11):
+ pxor %xmm0, %xmm0
+ movdqa (%rdi), %xmm2
+ movdqa (%rsi), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pslldq $5, %xmm2
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
+ pmovmskb %xmm2, %r9d
+ shr %cl, %edx
+ shr %cl, %r9d
+ sub %r9d, %edx
+ jnz LABEL(less32bytes)
+ movdqa (%rdi), %xmm3
+
+ UPDATE_STRNCMP_COUNTER
+
+ pxor %xmm0, %xmm0
+ mov $16, %rcx /* index for loads */
+ mov $11, %r9d /* byte position left over from less32bytes case */
+ /*
+ * Setup %r10 value allows us to detect crossing a page boundary.
+ * When %r10 goes positive we have crossed a page boundary and
+ * need to do a nibble.
+ */
+ lea 11(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+ sub $0x1000, %r10 /* subtract 4K pagesize */
+
+ .p2align 4
+LABEL(loop_ashr_11):
+ add $16, %r10
+ jg LABEL(nibble_ashr_11)
+
+LABEL(gobble_ashr_11):
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ psrldq $11, %xmm3
+ pslldq $5, %xmm2
+ por %xmm3, %xmm2
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+
+ add $16, %r10
+ jg LABEL(nibble_ashr_11) /* cross page boundary */
+
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ psrldq $11, %xmm3
+ pslldq $5, %xmm2
+ por %xmm3, %xmm2
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+ jmp LABEL(loop_ashr_11)
+
+ .p2align 4
+LABEL(nibble_ashr_11):
+ pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
+ pmovmskb %xmm0, %edx
+ test $0xf800, %edx
+ jnz LABEL(ashr_11_exittail)
+
+#ifdef USE_AS_STRNCMP
+ cmp $4, %r11
+ jbe LABEL(ashr_11_exittail)
+#endif
+
+ pxor %xmm0, %xmm0
+ sub $0x1000, %r10
+ jmp LABEL(gobble_ashr_11)
+
+ .p2align 4
+LABEL(ashr_11_exittail):
+ movdqa (%rsi, %rcx), %xmm1
+ psrldq $11, %xmm0
+ psrldq $11, %xmm3
+ jmp LABEL(aftertail)
+
+/*
+ * The following cases will be handled by ashr_12
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(4~15) n - 4 11(15 +(n - 4) - n) ashr_12
+ */
+ .p2align 4
+LABEL(ashr_12):
+ pxor %xmm0, %xmm0
+ movdqa (%rdi), %xmm2
+ movdqa (%rsi), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pslldq $4, %xmm2
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
+ pmovmskb %xmm2, %r9d
+ shr %cl, %edx
+ shr %cl, %r9d
+ sub %r9d, %edx
+ jnz LABEL(less32bytes)
+ movdqa (%rdi), %xmm3
+
+ UPDATE_STRNCMP_COUNTER
+
+ pxor %xmm0, %xmm0
+ mov $16, %rcx /* index for loads */
+ mov $12, %r9d /* byte position left over from less32bytes case */
+ /*
+ * Setup %r10 value allows us to detect crossing a page boundary.
+ * When %r10 goes positive we have crossed a page boundary and
+ * need to do a nibble.
+ */
+ lea 12(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+ sub $0x1000, %r10 /* subtract 4K pagesize */
+
+ .p2align 4
+LABEL(loop_ashr_12):
+ add $16, %r10
+ jg LABEL(nibble_ashr_12)
+
+LABEL(gobble_ashr_12):
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ psrldq $12, %xmm3
+ pslldq $4, %xmm2
+ por %xmm3, %xmm2
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+
+ add $16, %r10
+ jg LABEL(nibble_ashr_12) /* cross page boundary */
+
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ psrldq $12, %xmm3
+ pslldq $4, %xmm2
+ por %xmm3, %xmm2
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+ jmp LABEL(loop_ashr_12)
+
+ .p2align 4
+LABEL(nibble_ashr_12):
+ pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
+ pmovmskb %xmm0, %edx
+ test $0xf000, %edx
+ jnz LABEL(ashr_12_exittail)
+
+#ifdef USE_AS_STRNCMP
+ cmp $3, %r11
+ jbe LABEL(ashr_12_exittail)
+#endif
+
+ pxor %xmm0, %xmm0
+ sub $0x1000, %r10
+ jmp LABEL(gobble_ashr_12)
+
+ .p2align 4
+LABEL(ashr_12_exittail):
+ movdqa (%rsi, %rcx), %xmm1
+ psrldq $12, %xmm0
+ psrldq $12, %xmm3
+ jmp LABEL(aftertail)
+
+/*
+ * The following cases will be handled by ashr_13
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(3~15) n - 3 12(15 +(n - 3) - n) ashr_13
+ */
+ .p2align 4
+LABEL(ashr_13):
+ pxor %xmm0, %xmm0
+ movdqa (%rdi), %xmm2
+ movdqa (%rsi), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pslldq $3, %xmm2
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
+ pmovmskb %xmm2, %r9d
+ shr %cl, %edx
+ shr %cl, %r9d
+ sub %r9d, %edx
+ jnz LABEL(less32bytes)
+ movdqa (%rdi), %xmm3
+
+ UPDATE_STRNCMP_COUNTER
+
+ pxor %xmm0, %xmm0
+ mov $16, %rcx /* index for loads */
+ mov $13, %r9d /* byte position left over from less32bytes case */
+ /*
+ * Setup %r10 value allows us to detect crossing a page boundary.
+ * When %r10 goes positive we have crossed a page boundary and
+ * need to do a nibble.
+ */
+ lea 13(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+ sub $0x1000, %r10 /* subtract 4K pagesize */
+
+ .p2align 4
+LABEL(loop_ashr_13):
+ add $16, %r10
+ jg LABEL(nibble_ashr_13)
+
+LABEL(gobble_ashr_13):
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ psrldq $13, %xmm3
+ pslldq $3, %xmm2
+ por %xmm3, %xmm2
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+
+ add $16, %r10
+ jg LABEL(nibble_ashr_13) /* cross page boundary */
+
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ psrldq $13, %xmm3
+ pslldq $3, %xmm2
+ por %xmm3, %xmm2
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+ jmp LABEL(loop_ashr_13)
+
+ .p2align 4
+LABEL(nibble_ashr_13):
+ pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
+ pmovmskb %xmm0, %edx
+ test $0xe000, %edx
+ jnz LABEL(ashr_13_exittail)
+
+#ifdef USE_AS_STRNCMP
+ cmp $2, %r11
+ jbe LABEL(ashr_13_exittail)
+#endif
+
+ pxor %xmm0, %xmm0
+ sub $0x1000, %r10
+ jmp LABEL(gobble_ashr_13)
+
+ .p2align 4
+LABEL(ashr_13_exittail):
+ movdqa (%rsi, %rcx), %xmm1
+ psrldq $13, %xmm0
+ psrldq $13, %xmm3
+ jmp LABEL(aftertail)
+
+/*
+ * The following cases will be handled by ashr_14
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(2~15) n - 2 13(15 +(n - 2) - n) ashr_14
+ */
+ .p2align 4
+LABEL(ashr_14):
+ pxor %xmm0, %xmm0
+ movdqa (%rdi), %xmm2
+ movdqa (%rsi), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pslldq $2, %xmm2
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
+ pmovmskb %xmm2, %r9d
+ shr %cl, %edx
+ shr %cl, %r9d
+ sub %r9d, %edx
+ jnz LABEL(less32bytes)
+ movdqa (%rdi), %xmm3
+
+ UPDATE_STRNCMP_COUNTER
+
+ pxor %xmm0, %xmm0
+ mov $16, %rcx /* index for loads */
+ mov $14, %r9d /* byte position left over from less32bytes case */
+ /*
+ * Setup %r10 value allows us to detect crossing a page boundary.
+ * When %r10 goes positive we have crossed a page boundary and
+ * need to do a nibble.
+ */
+ lea 14(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+ sub $0x1000, %r10 /* subtract 4K pagesize */
+
+ .p2align 4
+LABEL(loop_ashr_14):
+ add $16, %r10
+ jg LABEL(nibble_ashr_14)
+
+LABEL(gobble_ashr_14):
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ psrldq $14, %xmm3
+ pslldq $2, %xmm2
+ por %xmm3, %xmm2
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+
+ add $16, %r10
+ jg LABEL(nibble_ashr_14) /* cross page boundary */
+
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ psrldq $14, %xmm3
+ pslldq $2, %xmm2
+ por %xmm3, %xmm2
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+ jmp LABEL(loop_ashr_14)
+
+ .p2align 4
+LABEL(nibble_ashr_14):
+ pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
+ pmovmskb %xmm0, %edx
+ test $0xc000, %edx
+ jnz LABEL(ashr_14_exittail)
+
+#ifdef USE_AS_STRNCMP
+ cmp $1, %r11
+ jbe LABEL(ashr_14_exittail)
+#endif
+
+ pxor %xmm0, %xmm0
+ sub $0x1000, %r10
+ jmp LABEL(gobble_ashr_14)
+
+ .p2align 4
+LABEL(ashr_14_exittail):
+ movdqa (%rsi, %rcx), %xmm1
+ psrldq $14, %xmm0
+ psrldq $14, %xmm3
+ jmp LABEL(aftertail)
+
+/*
+ * The following cases will be handled by ashr_15
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(1~15) n - 1 14(15 +(n - 1) - n) ashr_15
+ */
+ .p2align 4
+LABEL(ashr_15):
+ pxor %xmm0, %xmm0
+ movdqa (%rdi), %xmm2
+ movdqa (%rsi), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pslldq $1, %xmm2
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
+ pmovmskb %xmm2, %r9d
+ shr %cl, %edx
+ shr %cl, %r9d
+ sub %r9d, %edx
+ jnz LABEL(less32bytes)
+
+ movdqa (%rdi), %xmm3
+
+ UPDATE_STRNCMP_COUNTER
+
+ pxor %xmm0, %xmm0
+ mov $16, %rcx /* index for loads */
+ mov $15, %r9d /* byte position left over from less32bytes case */
+ /*
+ * Setup %r10 value allows us to detect crossing a page boundary.
+ * When %r10 goes positive we have crossed a page boundary and
+ * need to do a nibble.
+ */
+ lea 15(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+
+ sub $0x1000, %r10 /* subtract 4K pagesize */
+
+ .p2align 4
+LABEL(loop_ashr_15):
+ add $16, %r10
+ jg LABEL(nibble_ashr_15)
+
+LABEL(gobble_ashr_15):
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ psrldq $15, %xmm3
+ pslldq $1, %xmm2
+ por %xmm3, %xmm2
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+
+ add $16, %r10
+ jg LABEL(nibble_ashr_15) /* cross page boundary */
+
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ psrldq $15, %xmm3
+ pslldq $1, %xmm2
+ por %xmm3, %xmm2
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+ jmp LABEL(loop_ashr_15)
+
+ .p2align 4
+LABEL(nibble_ashr_15):
+ pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
+ pmovmskb %xmm0, %edx
+ test $0x8000, %edx
+ jnz LABEL(ashr_15_exittail)
+
+#ifdef USE_AS_STRNCMP
+ test %r11, %r11
+ je LABEL(ashr_15_exittail)
+#endif
+
+ pxor %xmm0, %xmm0
+ sub $0x1000, %r10
+ jmp LABEL(gobble_ashr_15)
+
+ .p2align 4
+LABEL(ashr_15_exittail):
+ movdqa (%rsi, %rcx), %xmm1
+ psrldq $15, %xmm3
+ psrldq $15, %xmm0
+
+ .p2align 4
+LABEL(aftertail):
+ pcmpeqb %xmm3, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ not %edx
+
+ .p2align 4
+LABEL(exit):
+ lea -16(%r9, %rcx), %rax /* locate the exact offset for rdi */
+LABEL(less32bytes):
+ lea (%rdi, %rax), %rdi /* locate the exact address for first operand(rdi) */
+ lea (%rsi, %rcx), %rsi /* locate the exact address for second operand(rsi) */
+ test %r8d, %r8d
+ jz LABEL(ret)
+ xchg %rsi, %rdi /* recover original order according to flag(%r8d) */
+
+ .p2align 4
+LABEL(ret):
+LABEL(less16bytes):
+ bsf %rdx, %rdx /* find and store bit index in %rdx */
+
+#ifdef USE_AS_STRNCMP
+ sub %rdx, %r11
+ jbe LABEL(strcmp_exitz)
+#endif
+ movzbl (%rsi, %rdx), %ecx
+ movzbl (%rdi, %rdx), %eax
+
+ sub %ecx, %eax
+ ret
+
+LABEL(strcmp_exitz):
+ xor %eax, %eax
+ ret
+
+ .p2align 4
+LABEL(Byte0):
+ movzx (%rsi), %ecx
+ movzx (%rdi), %eax
+
+ sub %ecx, %eax
+ ret
+END (BP_SYM (STRCMP))
+
+ .section .rodata,"a",@progbits
+ .p2align 3
+LABEL(unaligned_table):
+ .int LABEL(ashr_1) - LABEL(unaligned_table)
+ .int LABEL(ashr_2) - LABEL(unaligned_table)
+ .int LABEL(ashr_3) - LABEL(unaligned_table)
+ .int LABEL(ashr_4) - LABEL(unaligned_table)
+ .int LABEL(ashr_5) - LABEL(unaligned_table)
+ .int LABEL(ashr_6) - LABEL(unaligned_table)
+ .int LABEL(ashr_7) - LABEL(unaligned_table)
+ .int LABEL(ashr_8) - LABEL(unaligned_table)
+ .int LABEL(ashr_9) - LABEL(unaligned_table)
+ .int LABEL(ashr_10) - LABEL(unaligned_table)
+ .int LABEL(ashr_11) - LABEL(unaligned_table)
+ .int LABEL(ashr_12) - LABEL(unaligned_table)
+ .int LABEL(ashr_13) - LABEL(unaligned_table)
+ .int LABEL(ashr_14) - LABEL(unaligned_table)
+ .int LABEL(ashr_15) - LABEL(unaligned_table)
+ .int LABEL(ashr_0) - LABEL(unaligned_table)
+#endif /* NOT_IN_libc */
+libc_hidden_builtin_def (STRCMP)
Added: fsf/trunk/libc/sysdeps/x86_64/strncmp.S
==============================================================================
--- fsf/trunk/libc/sysdeps/x86_64/strncmp.S (added)
+++ fsf/trunk/libc/sysdeps/x86_64/strncmp.S Mon Jul 27 00:07:22 2009
@@ -1,0 +1,3 @@
+#define STRCMP strncmp
+#define USE_AS_STRNCMP
+#include "strcmp.S"
Added: fsf/trunk/libc/sysdeps/x86_64/tst-xmmymm.sh
==============================================================================
--- fsf/trunk/libc/sysdeps/x86_64/tst-xmmymm.sh (added)
+++ fsf/trunk/libc/sysdeps/x86_64/tst-xmmymm.sh Mon Jul 27 00:07:22 2009
@@ -1,0 +1,17 @@
+#! /bin/sh
+objpfx="$1"
+
+tmp=$(mktemp ${objpfx}tst-xmmymm.XXXXXX)
+trap 'rm -f "$tmp"' 1 2 3 15
+
+objdump -d "${objpfx}ld.so" |
+awk 'BEGIN { last="" } /^[[:xdigit:]]* <[_[:alnum:]]*>:$/ { fct=substr($2, 2, length($2)-3) } /,%[xy]mm[[:digit:]]*$/ { if (last != fct) { print fct; last=fct} }' |
+tee "$tmp"
+
+echo "Functions which incorrectly modify xmm/ymm registers:"
+err=1
+egrep -vs '^_dl_runtime_profile$' "$tmp" || err=0
+if test $err -eq 0; then echo "None"; fi
+
+rm "$tmp"
+exit $err
Propchange: fsf/trunk/libc/sysdeps/x86_64/tst-xmmymm.sh
------------------------------------------------------------------------------
svn:executable = *