[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[commits] r11365 - in /fsf/trunk/libc: ./ sysdeps/x86_64/ sysdeps/x86_64/multiarch/
- To: commits@xxxxxxxxxx
- Subject: [commits] r11365 - in /fsf/trunk/libc: ./ sysdeps/x86_64/ sysdeps/x86_64/multiarch/
- From: eglibc@xxxxxxxxxx
- Date: Fri, 27 Aug 2010 07:03:09 -0000
Author: eglibc
Date: Fri Aug 27 00:03:08 2010
New Revision: 11365
Log:
Import glibc-mainline for 2010-08-27
Added:
fsf/trunk/libc/sysdeps/x86_64/multiarch/strlen-no-bsf.S
fsf/trunk/libc/sysdeps/x86_64/multiarch/strlen-sse4.S
Modified:
fsf/trunk/libc/ChangeLog
fsf/trunk/libc/sysdeps/x86_64/multiarch/Makefile
fsf/trunk/libc/sysdeps/x86_64/multiarch/strlen.S
fsf/trunk/libc/sysdeps/x86_64/multiarch/varshift.c
fsf/trunk/libc/sysdeps/x86_64/strlen.S
Modified: fsf/trunk/libc/ChangeLog
==============================================================================
--- fsf/trunk/libc/ChangeLog (original)
+++ fsf/trunk/libc/ChangeLog Fri Aug 27 00:03:08 2010
@@ -1,3 +1,20 @@
+2010-08-27 Ulrich Drepper <drepper@xxxxxxxxxx>
+
+ * sysdeps/x86_64/multiarch/strlen-no-bsf.S: Move to .text.slow section.
+
+ * sysdeps/x86_64/strlen.S: Minimal code improvement.
+
+2010-08-26 H.J. Lu <hongjiu.lu@xxxxxxxxx>
+
+ * sysdeps/x86_64/strlen.S: Unroll the loop.
+ * sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
+ strlen-sse4 strlen-no-bsf.
+ * sysdeps/x86_64/multiarch/strlen.S (strlen): Return
+ __strlen_no_bsf if bit_Slow_BSF is set.
+ (__strlen_sse42): Removed.
+ * sysdeps/x86_64/multiarch/strlen-no-bsf.S: New file.
+ * sysdeps/x86_64/multiarch/strlen-sse4.S: New file.
+
2010-08-25 Roland McGrath <roland@xxxxxxxxxx>
* sysdeps/x86_64/multiarch/varshift.S: File removed.
Modified: fsf/trunk/libc/sysdeps/x86_64/multiarch/Makefile
==============================================================================
--- fsf/trunk/libc/sysdeps/x86_64/multiarch/Makefile (original)
+++ fsf/trunk/libc/sysdeps/x86_64/multiarch/Makefile Fri Aug 27 00:03:08 2010
@@ -8,7 +8,7 @@
strend-sse4 memcmp-sse4 memcpy-ssse3 mempcpy-ssse3 \
memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \
memmove-ssse3-back strcasestr-nonascii strcasecmp_l-ssse3 \
- strncase_l-ssse3
+ strncase_l-ssse3 strlen-sse4 strlen-no-bsf
ifeq (yes,$(config-cflags-sse4))
sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c varshift
CFLAGS-varshift.c += -msse4
Added: fsf/trunk/libc/sysdeps/x86_64/multiarch/strlen-no-bsf.S
==============================================================================
--- fsf/trunk/libc/sysdeps/x86_64/multiarch/strlen-no-bsf.S (added)
+++ fsf/trunk/libc/sysdeps/x86_64/multiarch/strlen-no-bsf.S Fri Aug 27 00:03:08 2010
@@ -1,0 +1,309 @@
+/* strlen without BSF
+ Copyright (C) 2010 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#if defined SHARED && !defined NOT_IN_libc
+
+#include <sysdep.h>
+
+ .section .text.slow,"ax",@progbits
+/* size_t __strlen_no_bsf (const char *s)
+
+   strlen variant that does not use BSF: the position of the NUL byte
+   inside a 16-byte block is found with a chain of TEST instructions.
+
+   In:       %rdi = s
+   Out:      %rax = number of bytes before the first NUL byte
+   Clobbers: %rcx, %rdx, %rsi, %rdi, %r8, %r9, %xmm0-%xmm3, flags
+
+   The length is accumulated in the full 64-bit %rax.  An ADD to %eax
+   would zero bits 63:32 of %rax and truncate the result for strings
+   of 4 GiB or more.  */
+ENTRY (__strlen_no_bsf)
+ xor %eax, %eax
+ /* Check the first 16 bytes one at a time.  %rax is 0 here, so
+    exit_tailN returns N.  */
+ cmpb $0, (%rdi)
+ jz L(exit_tail0)
+ cmpb $0, 1(%rdi)
+ jz L(exit_tail1)
+ cmpb $0, 2(%rdi)
+ jz L(exit_tail2)
+ cmpb $0, 3(%rdi)
+ jz L(exit_tail3)
+ cmpb $0, 4(%rdi)
+ jz L(exit_tail4)
+ cmpb $0, 5(%rdi)
+ jz L(exit_tail5)
+ cmpb $0, 6(%rdi)
+ jz L(exit_tail6)
+ cmpb $0, 7(%rdi)
+ jz L(exit_tail7)
+ cmpb $0, 8(%rdi)
+ jz L(exit_tail8)
+ cmpb $0, 9(%rdi)
+ jz L(exit_tail9)
+ cmpb $0, 10(%rdi)
+ jz L(exit_tail10)
+ cmpb $0, 11(%rdi)
+ jz L(exit_tail11)
+ cmpb $0, 12(%rdi)
+ jz L(exit_tail12)
+ cmpb $0, 13(%rdi)
+ jz L(exit_tail13)
+ cmpb $0, 14(%rdi)
+ jz L(exit_tail14)
+ cmpb $0, 15(%rdi)
+ jz L(exit_tail15)
+ /* No NUL in s[0..15].  %rcx = s + 16 and %rax = first 16-byte
+    aligned address above s.  Every scan step below advances %rax by
+    16 before branching, so at L(exit) %rax points 16 bytes past the
+    block holding the NUL and %rax - %rcx is that block's offset
+    from s.  */
+ pxor %xmm0, %xmm0
+ mov %rdi, %rcx
+ mov %rdi, %rax
+ and $-16, %rax
+ add $16, %rax
+ add $16, %rcx
+
+ /* 16 unrolled aligned 16-byte compares.  Each PCMPEQB against a
+    zeroed register leaves 0xff in every byte position that holds a
+    NUL; a register with no match stays zero and is reused.  */
+ pcmpeqb (%rax), %xmm0
+ pmovmskb %xmm0, %edx
+ pxor %xmm1, %xmm1
+ test %edx, %edx
+ lea 16(%rax), %rax
+ jnz L(exit)
+
+ pcmpeqb (%rax), %xmm1
+ pmovmskb %xmm1, %edx
+ pxor %xmm2, %xmm2
+ test %edx, %edx
+ lea 16(%rax), %rax
+ jnz L(exit)
+
+ pcmpeqb (%rax), %xmm2
+ pmovmskb %xmm2, %edx
+ pxor %xmm3, %xmm3
+ test %edx, %edx
+ lea 16(%rax), %rax
+ jnz L(exit)
+
+ pcmpeqb (%rax), %xmm3
+ pmovmskb %xmm3, %edx
+ test %edx, %edx
+ lea 16(%rax), %rax
+ jnz L(exit)
+
+ pcmpeqb (%rax), %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ lea 16(%rax), %rax
+ jnz L(exit)
+
+ pcmpeqb (%rax), %xmm1
+ pmovmskb %xmm1, %edx
+ test %edx, %edx
+ lea 16(%rax), %rax
+ jnz L(exit)
+
+ pcmpeqb (%rax), %xmm2
+ pmovmskb %xmm2, %edx
+ test %edx, %edx
+ lea 16(%rax), %rax
+ jnz L(exit)
+
+ pcmpeqb (%rax), %xmm3
+ pmovmskb %xmm3, %edx
+ test %edx, %edx
+ lea 16(%rax), %rax
+ jnz L(exit)
+
+ pcmpeqb (%rax), %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ lea 16(%rax), %rax
+ jnz L(exit)
+
+ pcmpeqb (%rax), %xmm1
+ pmovmskb %xmm1, %edx
+ test %edx, %edx
+ lea 16(%rax), %rax
+ jnz L(exit)
+
+ pcmpeqb (%rax), %xmm2
+ pmovmskb %xmm2, %edx
+ test %edx, %edx
+ lea 16(%rax), %rax
+ jnz L(exit)
+
+ pcmpeqb (%rax), %xmm3
+ pmovmskb %xmm3, %edx
+ test %edx, %edx
+ lea 16(%rax), %rax
+ jnz L(exit)
+
+ pcmpeqb (%rax), %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ lea 16(%rax), %rax
+ jnz L(exit)
+
+ pcmpeqb (%rax), %xmm1
+ pmovmskb %xmm1, %edx
+ test %edx, %edx
+ lea 16(%rax), %rax
+ jnz L(exit)
+
+ pcmpeqb (%rax), %xmm2
+ pmovmskb %xmm2, %edx
+ test %edx, %edx
+ lea 16(%rax), %rax
+ jnz L(exit)
+
+ pcmpeqb (%rax), %xmm3
+ pmovmskb %xmm3, %edx
+ test %edx, %edx
+ lea 16(%rax), %rax
+ jnz L(exit)
+
+ /* 256 bytes scanned without a match, so %xmm0-%xmm3 are all zero.
+    Round %rax down to a 64-byte boundary (re-scanning a few bytes is
+    harmless) and check 64 bytes per iteration.  %r8d collects the OR
+    of the four byte masks.  */
+ and $-0x40, %rax
+ xor %r8d, %r8d
+L(aligned_64):
+ pcmpeqb (%rax), %xmm0
+ pcmpeqb 16(%rax), %xmm1
+ pcmpeqb 32(%rax), %xmm2
+ pcmpeqb 48(%rax), %xmm3
+ pmovmskb %xmm0, %edx
+ pmovmskb %xmm1, %esi
+ pmovmskb %xmm2, %edi
+ pmovmskb %xmm3, %r9d
+ or %edx, %r8d
+ or %esi, %r8d
+ or %edi, %r8d
+ or %r9d, %r8d
+ lea 64(%rax), %rax /* LEA keeps the flags of the last OR.  */
+ jz L(aligned_64)
+
+ /* Pick the first block with a match.  Move its mask into %edx and
+    step %rax back so that it points 16 bytes past that block.  */
+ test %edx, %edx
+ jnz L(aligned_64_exit_16)
+ test %esi, %esi
+ jnz L(aligned_64_exit_32)
+ test %edi, %edi
+ jnz L(aligned_64_exit_48)
+L(aligned_64_exit_64):
+ mov %r9d, %edx
+ jmp L(aligned_64_exit)
+L(aligned_64_exit_48):
+ lea -16(%rax), %rax
+ mov %edi, %edx
+ jmp L(aligned_64_exit)
+L(aligned_64_exit_32):
+ lea -32(%rax), %rax
+ mov %esi, %edx
+ jmp L(aligned_64_exit)
+L(aligned_64_exit_16):
+ lea -48(%rax), %rax
+L(aligned_64_exit):
+L(exit):
+ sub %rcx, %rax /* %rax = 64-bit offset of the block from s.  */
+ test %dl, %dl
+ jz L(exit_high) /* No NUL in bytes 0..7 of the block.  */
+ test $0x01, %dl
+ jnz L(exit_tail0)
+
+ test $0x02, %dl
+ jnz L(exit_tail1)
+
+ test $0x04, %dl
+ jnz L(exit_tail2)
+
+ test $0x08, %dl
+ jnz L(exit_tail3)
+
+ test $0x10, %dl
+ jnz L(exit_tail4)
+
+ test $0x20, %dl
+ jnz L(exit_tail5)
+
+ test $0x40, %dl
+ jnz L(exit_tail6)
+ add $7, %rax
+L(exit_tail0):
+ ret
+
+L(exit_high):
+ /* NUL is in bytes 8..15 of the block: add 8, then the bit index
+    within %dh.  */
+ add $8, %rax
+ test $0x01, %dh
+ jnz L(exit_tail0)
+
+ test $0x02, %dh
+ jnz L(exit_tail1)
+
+ test $0x04, %dh
+ jnz L(exit_tail2)
+
+ test $0x08, %dh
+ jnz L(exit_tail3)
+
+ test $0x10, %dh
+ jnz L(exit_tail4)
+
+ test $0x20, %dh
+ jnz L(exit_tail5)
+
+ test $0x40, %dh
+ jnz L(exit_tail6)
+ add $7, %rax
+ ret
+ .p2align 4
+ /* exit_tailN: return %rax + N.  */
+L(exit_tail1):
+ add $1, %rax
+ ret
+
+L(exit_tail2):
+ add $2, %rax
+ ret
+
+L(exit_tail3):
+ add $3, %rax
+ ret
+
+L(exit_tail4):
+ add $4, %rax
+ ret
+
+L(exit_tail5):
+ add $5, %rax
+ ret
+L(exit_tail6):
+ add $6, %rax
+ ret
+L(exit_tail7):
+ add $7, %rax
+ ret
+L(exit_tail8):
+ add $8, %rax
+ ret
+L(exit_tail9):
+ add $9, %rax
+ ret
+L(exit_tail10):
+ add $10, %rax
+ ret
+L(exit_tail11):
+ add $11, %rax
+ ret
+L(exit_tail12):
+ add $12, %rax
+ ret
+L(exit_tail13):
+ add $13, %rax
+ ret
+L(exit_tail14):
+ add $14, %rax
+ ret
+L(exit_tail15):
+ add $15, %rax
+ ret
+END (__strlen_no_bsf)
+
+#endif
Added: fsf/trunk/libc/sysdeps/x86_64/multiarch/strlen-sse4.S
==============================================================================
--- fsf/trunk/libc/sysdeps/x86_64/multiarch/strlen-sse4.S (added)
+++ fsf/trunk/libc/sysdeps/x86_64/multiarch/strlen-sse4.S Fri Aug 27 00:03:08 2010
@@ -1,0 +1,85 @@
+/* strlen with SSE4
+ Copyright (C) 2009, 2010 Free Software Foundation, Inc.
+ Contributed by Ulrich Drepper <drepper@xxxxxxxxxx>.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#if defined SHARED && !defined NOT_IN_libc
+
+#include <sysdep.h>
+/* __strlen_sse42: strlen variant that scans with the SSE4.2 PCMPISTRI
+   instruction.
+   In:  %rdi = address of the string.
+   Out: %rax = number of bytes before the first NUL byte.
+   Writes %rcx, %rdx, %rdi, %r8, %xmm1 and the flags.  */
+ .section .text.sse4.2,"ax",@progbits
+ENTRY (__strlen_sse42)
+ pxor %xmm1, %xmm1 /* %xmm1 = 0, compared against to find NUL bytes.  */
+ movl %edi, %ecx
+ movq %rdi, %r8 /* %r8 = original string address.  */
+ andq $~15, %rdi /* %rdi = address rounded down to 16 bytes.  */
+ xor %edi, %ecx /* %ecx = low 4 address bits (misalignment).  */
+ pcmpeqb (%rdi), %xmm1 /* 0xff in each byte position holding NUL.  */
+ pmovmskb %xmm1, %edx /* One mask bit per byte of the block.  */
+ shrl %cl, %edx /* Shift right then left by the misalignment ...  */
+ shll %cl, %edx /* ... to clear bits of bytes before the string.  */
+ andl %edx, %edx /* Set ZF from %edx; a zero-count shift leaves flags unchanged.  */
+ jnz L(less16bytes) /* NUL is inside the first aligned block.  */
+ pxor %xmm1, %xmm1 /* PCMPEQB overwrote %xmm1; zero it again.  */
+ /* Scan 64 bytes per iteration.  With %xmm1 all zero, PCMPISTRI $0x08
+    (unsigned bytes, "equal each", least significant index) sets ZF
+    when the 16-byte memory operand contains a NUL byte and leaves the
+    index of the first such byte in %ecx.  */
+ .p2align 4
+L(more64bytes_loop):
+ pcmpistri $0x08, 16(%rdi), %xmm1
+ jz L(more32bytes)
+
+ pcmpistri $0x08, 32(%rdi), %xmm1
+ jz L(more48bytes)
+
+ pcmpistri $0x08, 48(%rdi), %xmm1
+ jz L(more64bytes)
+
+ add $64, %rdi
+ pcmpistri $0x08, (%rdi), %xmm1
+ jnz L(more64bytes_loop)
+ leaq (%rdi,%rcx), %rax /* Address of the NUL: block at (%rdi) + index.  */
+ subq %r8, %rax /* Length = NUL address - string start.  */
+ ret
+ /* NUL found in the block at 16(%rdi).  */
+ .p2align 4
+L(more32bytes):
+ leaq 16(%rdi,%rcx, 1), %rax
+ subq %r8, %rax
+ ret
+ /* NUL found in the block at 32(%rdi).  */
+ .p2align 4
+L(more48bytes):
+ leaq 32(%rdi,%rcx, 1), %rax
+ subq %r8, %rax
+ ret
+ /* NUL found in the block at 48(%rdi).  */
+ .p2align 4
+L(more64bytes):
+ leaq 48(%rdi,%rcx, 1), %rax
+ subq %r8, %rax
+ ret
+ /* NUL found in the first aligned block; %edx holds its byte mask.  */
+ .p2align 4
+L(less16bytes):
+ subq %r8, %rdi /* %rdi = aligned block address - string start.  */
+ bsfl %edx, %eax /* %eax = index of the first NUL in the block.  */
+ addq %rdi, %rax /* Length = block offset + index.  */
+ ret
+
+END (__strlen_sse42)
+
+#endif
Modified: fsf/trunk/libc/sysdeps/x86_64/multiarch/strlen.S
==============================================================================
--- fsf/trunk/libc/sysdeps/x86_64/multiarch/strlen.S (original)
+++ fsf/trunk/libc/sysdeps/x86_64/multiarch/strlen.S Fri Aug 27 00:03:08 2010
@@ -1,5 +1,5 @@
/* strlen(str) -- determine the length of the string STR.
- Copyright (C) 2009 Free Software Foundation, Inc.
+ Copyright (C) 2009, 2010 Free Software Foundation, Inc.
Contributed by Ulrich Drepper <drepper@xxxxxxxxxx>.
This file is part of the GNU C Library.
@@ -36,74 +36,12 @@
testl $bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip)
jz 2f
leaq __strlen_sse42(%rip), %rax
-2: ret
+ ret
+2: testl $bit_Slow_BSF, __cpu_features+FEATURE_OFFSET+index_Slow_BSF(%rip)
+ jz 3f
+ leaq __strlen_no_bsf(%rip), %rax
+3: ret
END(strlen)
-
-
- .section .text.sse4.2,"ax",@progbits
- .align 16
- .type __strlen_sse42, @function
-__strlen_sse42:
- cfi_startproc
- CALL_MCOUNT
- pxor %xmm1, %xmm1
- movl %edi, %ecx
- movq %rdi, %r8
- andq $~15, %rdi
- xor %edi, %ecx
- pcmpeqb (%rdi), %xmm1
- pmovmskb %xmm1, %edx
- shrl %cl, %edx
- shll %cl, %edx
- andl %edx, %edx
- jnz L(less16bytes)
- pxor %xmm1, %xmm1
-
- .p2align 4
-L(more64bytes_loop):
- pcmpistri $0x08, 16(%rdi), %xmm1
- jz L(more32bytes)
-
- pcmpistri $0x08, 32(%rdi), %xmm1
- jz L(more48bytes)
-
- pcmpistri $0x08, 48(%rdi), %xmm1
- jz L(more64bytes)
-
- add $64, %rdi
- pcmpistri $0x08, (%rdi), %xmm1
- jnz L(more64bytes_loop)
- leaq (%rdi,%rcx), %rax
- subq %r8, %rax
- ret
-
- .p2align 4
-L(more32bytes):
- leaq 16(%rdi,%rcx, 1), %rax
- subq %r8, %rax
- ret
-
- .p2align 4
-L(more48bytes):
- leaq 32(%rdi,%rcx, 1), %rax
- subq %r8, %rax
- ret
-
- .p2align 4
-L(more64bytes):
- leaq 48(%rdi,%rcx, 1), %rax
- subq %r8, %rax
- ret
-
- .p2align 4
-L(less16bytes):
- subq %r8, %rdi
- bsfl %edx, %eax
- addq %rdi, %rax
- ret
- cfi_endproc
- .size __strlen_sse42, .-__strlen_sse42
-
# undef ENTRY
# define ENTRY(name) \
Modified: fsf/trunk/libc/sysdeps/x86_64/multiarch/varshift.c
==============================================================================
--- fsf/trunk/libc/sysdeps/x86_64/multiarch/varshift.c (original)
+++ fsf/trunk/libc/sysdeps/x86_64/multiarch/varshift.c Fri Aug 27 00:03:08 2010
@@ -21,6 +21,6 @@
const int8_t ___m128i_shift_right[31] attribute_hidden =
{
- 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
};
Modified: fsf/trunk/libc/sysdeps/x86_64/strlen.S
==============================================================================
--- fsf/trunk/libc/sysdeps/x86_64/strlen.S (original)
+++ fsf/trunk/libc/sysdeps/x86_64/strlen.S Fri Aug 27 00:03:08 2010
@@ -1,5 +1,5 @@
/* strlen(str) -- determine the length of the string STR.
- Copyright (C) 2009 Free Software Foundation, Inc.
+ Copyright (C) 2009, 2010 Free Software Foundation, Inc.
Contributed by Ulrich Drepper <drepper@xxxxxxxxxx>.
This file is part of the GNU C Library.
@@ -23,29 +23,80 @@
.text
ENTRY(strlen)
- pxor %xmm2, %xmm2
- movq %rdi, %rcx
- movq %rdi, %r8
- andq $~15, %rdi
- movdqa %xmm2, %xmm1
- pcmpeqb (%rdi), %xmm2
- orl $0xffffffff, %esi
- subq %rdi, %rcx
- shll %cl, %esi
- pmovmskb %xmm2, %edx
- andl %esi, %edx
- jnz 1f
-
-2: movdqa 16(%rdi), %xmm0
- leaq 16(%rdi), %rdi
+ xor %rax, %rax
+ mov %edi, %ecx
+ and $0x3f, %ecx
+ pxor %xmm0, %xmm0
+ cmp $0x30, %ecx
+ ja L(next)
+ movdqu (%rdi), %xmm1
pcmpeqb %xmm1, %xmm0
pmovmskb %xmm0, %edx
- testl %edx, %edx
- jz 2b
+ test %edx, %edx
+ jnz L(exit_less16)
+ mov %rdi, %rax
+ and $-16, %rax
+ jmp L(align16_start)
+L(next):
+ mov %rdi, %rax
+ and $-16, %rax
+ pcmpeqb (%rax), %xmm0
+ mov $-1, %esi
+ sub %rax, %rcx
+ shl %cl, %esi
+ pmovmskb %xmm0, %edx
+ and %esi, %edx
+ jnz L(exit)
+L(align16_start):
+ pxor %xmm0, %xmm0
+ pxor %xmm1, %xmm1
+ pxor %xmm2, %xmm2
+ pxor %xmm3, %xmm3
+ .p2align 4
+L(align16_loop):
+ pcmpeqb 16(%rax), %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz L(exit16)
-1: subq %r8, %rdi
- bsfl %edx, %eax
- addq %rdi, %rax
+ pcmpeqb 32(%rax), %xmm1
+ pmovmskb %xmm1, %edx
+ test %edx, %edx
+ jnz L(exit32)
+
+ pcmpeqb 48(%rax), %xmm2
+ pmovmskb %xmm2, %edx
+ test %edx, %edx
+ jnz L(exit48)
+
+ pcmpeqb 64(%rax), %xmm3
+ pmovmskb %xmm3, %edx
+ lea 64(%rax), %rax
+ test %edx, %edx
+ jz L(align16_loop)
+L(exit):
+ sub %rdi, %rax
+L(exit_less16):
+ bsf %rdx, %rdx
+ add %rdx, %rax
+ ret
+ .p2align 4
+L(exit16):
+ sub %rdi, %rax
+ bsf %rdx, %rdx
+ lea 16(%rdx,%rax), %rax
+ ret
+ .p2align 4
+L(exit32):
+ sub %rdi, %rax
+ bsf %rdx, %rdx
+ lea 32(%rdx,%rax), %rax
+ ret
+ .p2align 4
+L(exit48):
+ sub %rdi, %rax
+ bsf %rdx, %rdx
+ lea 48(%rdx,%rax), %rax
ret
END(strlen)
libc_hidden_builtin_def (strlen)