[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[commits] r8637 - in /fsf/trunk/libc: ./ sysdeps/i386/ sysdeps/x86_64/multiarch/
- To: commits@xxxxxxxxxx
- Subject: [commits] r8637 - in /fsf/trunk/libc: ./ sysdeps/i386/ sysdeps/x86_64/multiarch/
- From: eglibc@xxxxxxxxxx
- Date: Sat, 04 Jul 2009 07:05:52 -0000
Author: eglibc
Date: Sat Jul 4 00:05:52 2009
New Revision: 8637
Log:
Import glibc-mainline for 2009-07-04
Added:
fsf/trunk/libc/sysdeps/x86_64/multiarch/strcspn-c.c
fsf/trunk/libc/sysdeps/x86_64/multiarch/strcspn.S
fsf/trunk/libc/sysdeps/x86_64/multiarch/strpbrk-c.c
fsf/trunk/libc/sysdeps/x86_64/multiarch/strpbrk.S
fsf/trunk/libc/sysdeps/x86_64/multiarch/strspn-c.c
fsf/trunk/libc/sysdeps/x86_64/multiarch/strspn.S
Modified:
fsf/trunk/libc/ChangeLog
fsf/trunk/libc/config.h.in
fsf/trunk/libc/config.make.in
fsf/trunk/libc/configure
fsf/trunk/libc/configure.in
fsf/trunk/libc/sysdeps/i386/configure
fsf/trunk/libc/sysdeps/i386/configure.in
fsf/trunk/libc/sysdeps/x86_64/multiarch/Makefile
fsf/trunk/libc/sysdeps/x86_64/multiarch/rawmemchr.S
fsf/trunk/libc/sysdeps/x86_64/multiarch/strcmp.S
fsf/trunk/libc/sysdeps/x86_64/multiarch/strcpy.S
fsf/trunk/libc/sysdeps/x86_64/multiarch/strlen.S
Modified: fsf/trunk/libc/ChangeLog
==============================================================================
--- fsf/trunk/libc/ChangeLog (original)
+++ fsf/trunk/libc/ChangeLog Sat Jul 4 00:05:52 2009
@@ -1,3 +1,30 @@
+2009-07-03 Ulrich Drepper <drepper@xxxxxxxxxx>
+
+ * sysdeps/x86_64/multiarch/strcspn-c.c: Minor cleanups.
+ * sysdeps/x86_64/multiarch/strspn-c.c: Likewise.
+
+ * sysdeps/x86_64/multiarch/strcmp.S: Make sure functions are all
+ aligned to 16 byte boundaries.
+ * sysdeps/x86_64/multiarch/strcpy.S: Likewise.
+ * sysdeps/x86_64/multiarch/strlen.S: Likewise.
+ * sysdeps/x86_64/multiarch/rawmemchr.S: Likewise.
+
+2009-07-02 H.J. Lu <hongjiu.lu@xxxxxxxxx>
+
+ * config.h.in (HAVE_SSE4_SUPPORT): New macro.
+ * config.make.in (config-cflags-sse4): New variable.
+ * configure.in: Substitute libc_cv_cc_sse4.
+ * sysdeps/i386/configure.in: Set libc_cv_cc_sse4 and
+ HAVE_SSE4_SUPPORT.
+ * sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
+ strcspn-c, strpbrk-c, strspn-c for string if gcc supports SSE4.
+ * sysdeps/x86_64/multiarch/strcspn-c.c: New file.
+ * sysdeps/x86_64/multiarch/strcspn.S: New file.
+ * sysdeps/x86_64/multiarch/strpbrk-c.c: New file.
+ * sysdeps/x86_64/multiarch/strpbrk.S: New file.
+ * sysdeps/x86_64/multiarch/strspn-c.c: New file.
+ * sysdeps/x86_64/multiarch/strspn.S: New file.
+
2009-06-30 H.J. Lu <hongjiu.lu@xxxxxxxxx>
* elf/Makefile (distribute): Remove tst-audit.sh. Add
Modified: fsf/trunk/libc/config.h.in
==============================================================================
--- fsf/trunk/libc/config.h.in (original)
+++ fsf/trunk/libc/config.h.in Sat Jul 4 00:05:52 2009
@@ -128,6 +128,9 @@
/* Define if binutils support TLS handling. */
#undef HAVE_TLS_SUPPORT
+
+/* Define if gcc supports SSE4. */
+#undef HAVE_SSE4_SUPPORT
/* Define if the compiler's exception support is based on libunwind. */
#undef HAVE_CC_WITH_LIBUNWIND
Modified: fsf/trunk/libc/config.make.in
==============================================================================
--- fsf/trunk/libc/config.make.in (original)
+++ fsf/trunk/libc/config.make.in Sat Jul 4 00:05:52 2009
@@ -33,6 +33,8 @@
config-sysdirs = @sysnames@
cflags-cpu = @libc_cv_cc_submachine@
asflags-cpu = @libc_cv_cc_submachine@
+
+config-cflags-sse4 = @libc_cv_cc_sse4@
defines = @DEFINES@
sysincludes = @SYSINCLUDES@
Modified: fsf/trunk/libc/configure
==============================================================================
--- fsf/trunk/libc/configure (original)
+++ fsf/trunk/libc/configure Sat Jul 4 00:05:52 2009
@@ -657,6 +657,7 @@
elf
ldd_rewrite_script
use_ldconfig
+libc_cv_cc_sse4
libc_cv_cpp_asm_debuginfo
libc_cv_forced_unwind
libc_cv_rootsbindir
@@ -8744,6 +8745,7 @@
+
if test $elf = yes; then
cat >>confdefs.h <<\_ACEOF
#define HAVE_ELF 1
Modified: fsf/trunk/libc/configure.in
==============================================================================
--- fsf/trunk/libc/configure.in (original)
+++ fsf/trunk/libc/configure.in Sat Jul 4 00:05:52 2009
@@ -2259,6 +2259,7 @@
dnl sysdeps/CPU/configure.in checks set this via arch-specific asm tests
AC_SUBST(libc_cv_cpp_asm_debuginfo)
+AC_SUBST(libc_cv_cc_sse4)
AC_SUBST(use_ldconfig)
AC_SUBST(ldd_rewrite_script)
Modified: fsf/trunk/libc/sysdeps/i386/configure
==============================================================================
--- fsf/trunk/libc/sysdeps/i386/configure (original)
+++ fsf/trunk/libc/sysdeps/i386/configure Sat Jul 4 00:05:52 2009
@@ -1,10 +1,42 @@
+as_nl='
+'
+export as_nl
+# Printing a long string crashes Solaris 7 /usr/bin/printf.
+as_echo='\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\'
+as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo
+as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo$as_echo
+if (test "X`printf %s $as_echo`" = "X$as_echo") 2>/dev/null; then
+ as_echo='printf %s\n'
+ as_echo_n='printf %s'
+else
+ if test "X`(/usr/ucb/echo -n -n $as_echo) 2>/dev/null`" = "X-n $as_echo"; then
+ as_echo_body='eval /usr/ucb/echo -n "$1$as_nl"'
+ as_echo_n='/usr/ucb/echo -n'
+ else
+ as_echo_body='eval expr "X$1" : "X\\(.*\\)"'
+ as_echo_n_body='eval
+ arg=$1;
+ case $arg in
+ *"$as_nl"*)
+ expr "X$arg" : "X\\(.*\\)$as_nl";
+ arg=`expr "X$arg" : ".*$as_nl\\(.*\\)"`;;
+ esac;
+ expr "X$arg" : "X\\(.*\\)" | tr -d "$as_nl"
+ '
+ export as_echo_n_body
+ as_echo_n='sh -c $as_echo_n_body as_echo'
+ fi
+ export as_echo_body
+ as_echo='sh -c $as_echo_body as_echo'
+fi
+
# This file is generated from configure.in by Autoconf. DO NOT EDIT!
# Local configure fragment for sysdeps/i386.
-echo "$as_me:$LINENO: checking if -g produces usable source locations for assembler-with-cpp" >&5
-echo $ECHO_N "checking if -g produces usable source locations for assembler-with-cpp... $ECHO_C" >&6
+{ $as_echo "$as_me:$LINENO: checking if -g produces usable source locations for assembler-with-cpp" >&5
+$as_echo_n "checking if -g produces usable source locations for assembler-with-cpp... " >&6; }
if test "${libc_cv_cpp_asm_debuginfo+set}" = set; then
- echo $ECHO_N "(cached) $ECHO_C" >&6
+ $as_echo_n "(cached) " >&6
else
cat > conftest.S <<EOF
#include "confdefs.h"
@@ -27,7 +59,7 @@
{ (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5
(eval $ac_try) 2>&5
ac_status=$?
- echo "$as_me:$LINENO: \$? = $ac_status" >&5
+ $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
(exit $ac_status); }; } && {
ac_pattern='conftest\.S'
{ ac_try='readelf --debug-dump=line conftest.o |
@@ -35,7 +67,7 @@
{ (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5
(eval $ac_try) 2>&5
ac_status=$?
- echo "$as_me:$LINENO: \$? = $ac_status" >&5
+ $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
(exit $ac_status); }; }
}; then
libc_cv_cpp_asm_debuginfo=yes
@@ -44,11 +76,36 @@
fi
rm -f conftest*
fi
-echo "$as_me:$LINENO: result: $libc_cv_cpp_asm_debuginfo" >&5
-echo "${ECHO_T}$libc_cv_cpp_asm_debuginfo" >&6
+{ $as_echo "$as_me:$LINENO: result: $libc_cv_cpp_asm_debuginfo" >&5
+$as_echo "$libc_cv_cpp_asm_debuginfo" >&6; }
if test $libc_cv_cpp_asm_debuginfo = yes; then
cat >>confdefs.h <<\_ACEOF
#define HAVE_CPP_ASM_DEBUGINFO 1
_ACEOF
fi
+
+{ $as_echo "$as_me:$LINENO: checking for SSE4 support" >&5
+$as_echo_n "checking for SSE4 support... " >&6; }
+if test "${libc_cv_cc_sse4+set}" = set; then
+ $as_echo_n "(cached) " >&6
+else
+ if { ac_try='${CC-cc} -msse4 -xc /dev/null -S -o /dev/null'
+ { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5
+ (eval $ac_try) 2>&5
+ ac_status=$?
+ $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+ (exit $ac_status); }; }; then
+ libc_cv_cc_sse4=yes
+else
+ libc_cv_cc_sse4=no
+fi
+fi
+{ $as_echo "$as_me:$LINENO: result: $libc_cv_cc_sse4" >&5
+$as_echo "$libc_cv_cc_sse4" >&6; }
+if test $libc_cv_cc_sse4 = yes; then
+ cat >>confdefs.h <<\_ACEOF
+#define HAVE_SSE4_SUPPORT 1
+_ACEOF
+
+fi
Modified: fsf/trunk/libc/sysdeps/i386/configure.in
==============================================================================
--- fsf/trunk/libc/sysdeps/i386/configure.in (original)
+++ fsf/trunk/libc/sysdeps/i386/configure.in Sat Jul 4 00:05:52 2009
@@ -33,3 +33,14 @@
if test $libc_cv_cpp_asm_debuginfo = yes; then
AC_DEFINE(HAVE_CPP_ASM_DEBUGINFO)
fi
+
+dnl Check if -msse4 works.
+AC_CACHE_CHECK(for SSE4 support, libc_cv_cc_sse4, [dnl
+if AC_TRY_COMMAND([${CC-cc} -msse4 -xc /dev/null -S -o /dev/null]); then
+ libc_cv_cc_sse4=yes
+else
+ libc_cv_cc_sse4=no
+fi])
+if test $libc_cv_cc_sse4 = yes; then
+ AC_DEFINE(HAVE_SSE4_SUPPORT)
+fi
Modified: fsf/trunk/libc/sysdeps/x86_64/multiarch/Makefile
==============================================================================
--- fsf/trunk/libc/sysdeps/x86_64/multiarch/Makefile (original)
+++ fsf/trunk/libc/sysdeps/x86_64/multiarch/Makefile Sat Jul 4 00:05:52 2009
@@ -5,4 +5,10 @@
ifeq ($(subdir),string)
sysdep_routines += stpncpy-c strncpy-c strncmp-c
+ifeq (yes,$(config-cflags-sse4))
+sysdep_routines += strcspn-c strpbrk-c strspn-c
+CFLAGS-strcspn-c.c += -msse4
+CFLAGS-strpbrk-c.c += -msse4
+CFLAGS-strspn-c.c += -msse4
endif
+endif
Modified: fsf/trunk/libc/sysdeps/x86_64/multiarch/rawmemchr.S
==============================================================================
--- fsf/trunk/libc/sysdeps/x86_64/multiarch/rawmemchr.S (original)
+++ fsf/trunk/libc/sysdeps/x86_64/multiarch/rawmemchr.S Sat Jul 4 00:05:52 2009
@@ -77,6 +77,7 @@
# undef ENTRY
# define ENTRY(name) \
.type __rawmemchr_sse2, @function; \
+ .align 16; \
__rawmemchr_sse2: cfi_startproc; \
CALL_MCOUNT
# undef END
Modified: fsf/trunk/libc/sysdeps/x86_64/multiarch/strcmp.S
==============================================================================
--- fsf/trunk/libc/sysdeps/x86_64/multiarch/strcmp.S (original)
+++ fsf/trunk/libc/sysdeps/x86_64/multiarch/strcmp.S Sat Jul 4 00:05:52 2009
@@ -1659,6 +1659,7 @@
# undef ENTRY
# define ENTRY(name) \
.type STRCMP_SSE2, @function; \
+ .align 16; \
STRCMP_SSE2: cfi_startproc; \
CALL_MCOUNT
# undef END
Modified: fsf/trunk/libc/sysdeps/x86_64/multiarch/strcpy.S
==============================================================================
--- fsf/trunk/libc/sysdeps/x86_64/multiarch/strcpy.S (original)
+++ fsf/trunk/libc/sysdeps/x86_64/multiarch/strcpy.S Sat Jul 4 00:05:52 2009
@@ -1896,6 +1896,7 @@
# undef ENTRY
# define ENTRY(name) \
.type STRCPY_SSE2, @function; \
+ .align 16; \
STRCPY_SSE2: cfi_startproc; \
CALL_MCOUNT
# undef END
Added: fsf/trunk/libc/sysdeps/x86_64/multiarch/strcspn-c.c
==============================================================================
--- fsf/trunk/libc/sysdeps/x86_64/multiarch/strcspn-c.c (added)
+++ fsf/trunk/libc/sysdeps/x86_64/multiarch/strcspn-c.c Sat Jul 4 00:05:52 2009
@@ -1,0 +1,312 @@
+/* strcspn with SSE4.2 intrinsics
+ Copyright (C) 2009 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <nmmintrin.h>
+#include <string.h>
+
+/* We use 0x2:
+ _SIDD_SBYTE_OPS
+ | _SIDD_CMP_EQUAL_ANY
+ | _SIDD_POSITIVE_POLARITY
+ | _SIDD_LEAST_SIGNIFICANT
+ on pcmpistri to compare xmm/mem128
+
+ 0 1 2 3 4 5 6 7 8 9 A B C D E F
+ X X X X X X X X X X X X X X X X
+
+ against xmm
+
+ 0 1 2 3 4 5 6 7 8 9 A B C D E F
+ A A A A A A A A A A A A A A A A
+
+ to find out if the first 16byte data element has any byte A and
+ the offset of the first byte. There are 3 cases:
+
+ 1. The first 16byte data element has the byte A at the offset X.
+ 2. The first 16byte data element has EOS and doesn't have the byte A.
+ 3. The first 16byte data element is valid and doesn't have the byte A.
+
+ Here is the table of ECX, CFlag, ZFlag and SFlag for 2 cases:
+
+ 1 X 1 0/1 0
+ 2 16 0 1 0
+ 3 16 0 0 0
+
+ We exit from the loop for cases 1 and 2 with jbe which branches
+ when either CFlag or ZFlag is 1. If CFlag == 1, ECX has the offset
+ X for case 1. */
+
+#ifndef STRCSPN_SSE2
+# define STRCSPN_SSE2 __strcspn_sse2
+# define STRCSPN_SSE42 __strcspn_sse42
+#endif
+
+#ifdef USE_AS_STRPBRK
+# define RETURN(val1, val2) return val1
+#else
+# define RETURN(val1, val2) return val2
+#endif
+
+extern
+#ifdef USE_AS_STRPBRK
+char *
+#else
+size_t
+#endif
+STRCSPN_SSE2 (const char *, const char *);
+
+
+#ifdef USE_AS_STRPBRK
+char *
+#else
+size_t
+#endif
+__attribute__ ((section (".text.sse4.2")))
+STRCSPN_SSE42 (const char *s, const char *a)
+{
+ if (*a == 0)
+ RETURN (NULL, strlen (s));
+
+ const char *aligned;
+ __m128i mask;
+ int offset = (int) ((size_t) a & 15);
+ if (offset != 0)
+ {
+ /* Load masks. */
+ aligned = (const char *) ((size_t) a & 0xfffffffffffffff0L);
+ __m128i mask0 = _mm_load_si128 ((__m128i *) aligned);
+
+ switch (offset)
+ {
+ case 1:
+ mask = _mm_srli_si128 (mask0, 1);
+ break;
+ case 2:
+ mask = _mm_srli_si128 (mask0, 2);
+ break;
+ case 3:
+ mask = _mm_srli_si128 (mask0, 3);
+ break;
+ case 4:
+ mask = _mm_srli_si128 (mask0, 4);
+ break;
+ case 5:
+ mask = _mm_srli_si128 (mask0, 5);
+ break;
+ case 6:
+ mask = _mm_srli_si128 (mask0, 6);
+ break;
+ case 7:
+ mask = _mm_srli_si128 (mask0, 7);
+ break;
+ case 8:
+ mask = _mm_srli_si128 (mask0, 8);
+ break;
+ case 9:
+ mask = _mm_srli_si128 (mask0, 9);
+ break;
+ case 10:
+ mask = _mm_srli_si128 (mask0, 10);
+ break;
+ case 11:
+ mask = _mm_srli_si128 (mask0, 11);
+ break;
+ case 12:
+ mask = _mm_srli_si128 (mask0, 12);
+ break;
+ case 13:
+ mask = _mm_srli_si128 (mask0, 13);
+ break;
+ case 14:
+ mask = _mm_srli_si128 (mask0, 14);
+ break;
+ case 15:
+ mask = _mm_srli_si128 (mask0, 15);
+ break;
+ }
+
+ /* Find where the NULL terminator is. */
+ int length = _mm_cmpistri (mask, mask, 0x3a);
+ if (length == 16 - offset)
+ {
+ /* There is no NULL terminator. */
+ __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16));
+ int index = _mm_cmpistri (mask1, mask1, 0x3a);
+ length += index;
+
+ /* Don't use SSE4.2 if the length of A > 16. */
+ if (length > 16)
+ return STRCSPN_SSE2 (s, a);
+
+ if (index != 0)
+ {
+ /* Combine mask0 and mask1. */
+ switch (offset)
+ {
+ case 1:
+ mask = _mm_alignr_epi8 (mask1, mask0, 1);
+ break;
+ case 2:
+ mask = _mm_alignr_epi8 (mask1, mask0, 2);
+ break;
+ case 3:
+ mask = _mm_alignr_epi8 (mask1, mask0, 3);
+ break;
+ case 4:
+ mask = _mm_alignr_epi8 (mask1, mask0, 4);
+ break;
+ case 5:
+ mask = _mm_alignr_epi8 (mask1, mask0, 5);
+ break;
+ case 6:
+ mask = _mm_alignr_epi8 (mask1, mask0, 6);
+ break;
+ case 7:
+ mask = _mm_alignr_epi8 (mask1, mask0, 7);
+ break;
+ case 8:
+ mask = _mm_alignr_epi8 (mask1, mask0, 8);
+ break;
+ case 9:
+ mask = _mm_alignr_epi8 (mask1, mask0, 9);
+ break;
+ case 10:
+ mask = _mm_alignr_epi8 (mask1, mask0, 10);
+ break;
+ case 11:
+ mask = _mm_alignr_epi8 (mask1, mask0, 11);
+ break;
+ case 12:
+ mask = _mm_alignr_epi8 (mask1, mask0, 12);
+ break;
+ case 13:
+ mask = _mm_alignr_epi8 (mask1, mask0, 13);
+ break;
+ case 14:
+ mask = _mm_alignr_epi8 (mask1, mask0, 14);
+ break;
+ case 15:
+ mask = _mm_alignr_epi8 (mask1, mask0, 15);
+ break;
+ }
+ }
+ }
+ }
+ else
+ {
+ /* A is aligned. */
+ mask = _mm_load_si128 ((__m128i *) a);
+
+ /* Find where the NULL terminator is. */
+ int length = _mm_cmpistri (mask, mask, 0x3a);
+ if (length == 16)
+ {
+ /* There is no NULL terminator. Don't use SSE4.2 if the length
+ of A > 16. */
+ if (a[16] != 0)
+ return STRCSPN_SSE2 (s, a);
+ }
+ }
+
+ offset = (int) ((size_t) s & 15);
+ if (offset != 0)
+ {
+ /* Check partial string. */
+ aligned = (const char *) ((size_t) s & 0xfffffffffffffff0L);
+ __m128i value = _mm_load_si128 ((__m128i *) aligned);
+
+ switch (offset)
+ {
+ case 1:
+ value = _mm_srli_si128 (value, 1);
+ break;
+ case 2:
+ value = _mm_srli_si128 (value, 2);
+ break;
+ case 3:
+ value = _mm_srli_si128 (value, 3);
+ break;
+ case 4:
+ value = _mm_srli_si128 (value, 4);
+ break;
+ case 5:
+ value = _mm_srli_si128 (value, 5);
+ break;
+ case 6:
+ value = _mm_srli_si128 (value, 6);
+ break;
+ case 7:
+ value = _mm_srli_si128 (value, 7);
+ break;
+ case 8:
+ value = _mm_srli_si128 (value, 8);
+ break;
+ case 9:
+ value = _mm_srli_si128 (value, 9);
+ break;
+ case 10:
+ value = _mm_srli_si128 (value, 10);
+ break;
+ case 11:
+ value = _mm_srli_si128 (value, 11);
+ break;
+ case 12:
+ value = _mm_srli_si128 (value, 12);
+ break;
+ case 13:
+ value = _mm_srli_si128 (value, 13);
+ break;
+ case 14:
+ value = _mm_srli_si128 (value, 14);
+ break;
+ case 15:
+ value = _mm_srli_si128 (value, 15);
+ break;
+ }
+
+ int length = _mm_cmpistri (mask, value, 0x2);
+ /* No need to check ZFlag since ZFlag is always 1. */
+ int cflag = _mm_cmpistrc (mask, value, 0x2);
+ if (cflag)
+ RETURN ((char *) (s + length), length);
+ /* Find where the NULL terminator is. */
+ int index = _mm_cmpistri (value, value, 0x3a);
+ if (index < 16 - offset)
+ RETURN (NULL, index);
+ aligned += 16;
+ }
+ else
+ aligned = s;
+
+ while (1)
+ {
+ __m128i value = _mm_load_si128 ((__m128i *) aligned);
+ int index = _mm_cmpistri (mask, value, 0x2);
+ int cflag = _mm_cmpistrc (mask, value, 0x2);
+ int zflag = _mm_cmpistrz (mask, value, 0x2);
+ if (cflag)
+ RETURN ((char *) (aligned + index), (size_t) (aligned + index - s));
+ if (zflag)
+ RETURN (NULL,
+ /* Find where the NULL terminator is. */
+ (size_t) (aligned + _mm_cmpistri (value, value, 0x3a) - s));
+ aligned += 16;
+ }
+}
Added: fsf/trunk/libc/sysdeps/x86_64/multiarch/strcspn.S
==============================================================================
--- fsf/trunk/libc/sysdeps/x86_64/multiarch/strcspn.S (added)
+++ fsf/trunk/libc/sysdeps/x86_64/multiarch/strcspn.S Sat Jul 4 00:05:52 2009
@@ -1,0 +1,82 @@
+/* Multiple versions of strcspn
+ Copyright (C) 2009 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <config.h>
+
+#ifdef HAVE_SSE4_SUPPORT
+
+#include <sysdep.h>
+#include <ifunc-defines.h>
+
+#ifdef USE_AS_STRPBRK
+#define STRCSPN_SSE42 __strpbrk_sse42
+#define STRCSPN_SSE2 __strpbrk_sse2
+#define __GI_STRCSPN __GI_strpbrk
+#else
+#ifndef STRCSPN
+#define STRCSPN strcspn
+#define STRCSPN_SSE42 __strcspn_sse42
+#define STRCSPN_SSE2 __strcspn_sse2
+#define __GI_STRCSPN __GI_strcspn
+#endif
+#endif
+
+/* Define multiple versions only for the definition in libc. Don't
+ define multiple versions for strpbrk in static library since we
+ need strpbrk before the initialization happened. */
+#if (defined SHARED || !defined USE_AS_STRPBRK) && !defined NOT_IN_libc
+ .text
+ENTRY(STRCSPN)
+ .type STRCSPN, @gnu_indirect_function
+ cmpl $0, __cpu_features+KIND_OFFSET(%rip)
+ jne 1f
+ call __init_cpu_features
+1: leaq STRCSPN_SSE2(%rip), %rax
+ testl $(1<<20), __cpu_features+CPUID_OFFSET+COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET(%rip)
+ jz 2f
+ leaq STRCSPN_SSE42(%rip), %rax
+2: ret
+END(STRCSPN)
+
+# undef ENTRY
+# define ENTRY(name) \
+ .type STRCSPN_SSE2, @function; \
+ .globl STRCSPN_SSE2; \
+ .align 16; \
+ STRCSPN_SSE2: cfi_startproc; \
+ CALL_MCOUNT
+# undef END
+# define END(name) \
+ cfi_endproc; .size STRCSPN_SSE2, .-STRCSPN_SSE2
+# undef libc_hidden_builtin_def
+/* It doesn't make sense to send libc-internal strcspn calls through a PLT.
+ The speedup we get from using SSE4.2 instruction is likely eaten away
+ by the indirect call in the PLT. */
+# define libc_hidden_builtin_def(name) \
+ .globl __GI_STRCSPN; __GI_STRCSPN = STRCSPN_SSE2
+#endif
+
+#endif /* HAVE_SSE4_SUPPORT */
+
+#ifdef USE_AS_STRPBRK
+#include "../strpbrk.S"
+#else
+#include "../strcspn.S"
+#endif
Modified: fsf/trunk/libc/sysdeps/x86_64/multiarch/strlen.S
==============================================================================
--- fsf/trunk/libc/sysdeps/x86_64/multiarch/strlen.S (original)
+++ fsf/trunk/libc/sysdeps/x86_64/multiarch/strlen.S Sat Jul 4 00:05:52 2009
@@ -77,6 +77,7 @@
# undef ENTRY
# define ENTRY(name) \
.type __strlen_sse2, @function; \
+ .align 16; \
__strlen_sse2: cfi_startproc; \
CALL_MCOUNT
# undef END
Added: fsf/trunk/libc/sysdeps/x86_64/multiarch/strpbrk-c.c
==============================================================================
--- fsf/trunk/libc/sysdeps/x86_64/multiarch/strpbrk-c.c (added)
+++ fsf/trunk/libc/sysdeps/x86_64/multiarch/strpbrk-c.c Sat Jul 4 00:05:52 2009
@@ -1,0 +1,4 @@
+#define USE_AS_STRPBRK
+#define STRCSPN_SSE2 __strpbrk_sse2
+#define STRCSPN_SSE42 __strpbrk_sse42
+#include "strcspn-c.c"
Added: fsf/trunk/libc/sysdeps/x86_64/multiarch/strpbrk.S
==============================================================================
--- fsf/trunk/libc/sysdeps/x86_64/multiarch/strpbrk.S (added)
+++ fsf/trunk/libc/sysdeps/x86_64/multiarch/strpbrk.S Sat Jul 4 00:05:52 2009
@@ -1,0 +1,3 @@
+#define STRCSPN strpbrk
+#define USE_AS_STRPBRK
+#include "strcspn.S"
Added: fsf/trunk/libc/sysdeps/x86_64/multiarch/strspn-c.c
==============================================================================
--- fsf/trunk/libc/sysdeps/x86_64/multiarch/strspn-c.c (added)
+++ fsf/trunk/libc/sysdeps/x86_64/multiarch/strspn-c.c Sat Jul 4 00:05:52 2009
@@ -1,0 +1,284 @@
+/* strspn with SSE4.2 intrinsics
+ Copyright (C) 2009 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <nmmintrin.h>
+#include <string.h>
+
+/* We use 0x12:
+ _SIDD_SBYTE_OPS
+ | _SIDD_CMP_EQUAL_ANY
+ | _SIDD_NEGATIVE_POLARITY
+ | _SIDD_LEAST_SIGNIFICANT
+ on pcmpistri to compare xmm/mem128
+
+ 0 1 2 3 4 5 6 7 8 9 A B C D E F
+ X X X X X X X X X X X X X X X X
+
+ against xmm
+
+ 0 1 2 3 4 5 6 7 8 9 A B C D E F
+ A A A A A A A A A A A A A A A A
+
+ to find out if the first 16byte data element has any non-A byte and
+ the offset of the first byte. There are 2 cases:
+
+ 1. The first 16byte data element has the non-A byte, including
+ EOS, at the offset X.
+ 2. The first 16byte data element is valid and doesn't have the non-A
+ byte.
+
+ Here is the table of ECX, CFlag, ZFlag and SFlag for 2 cases:
+
+ case ECX CFlag ZFlag SFlag
+ 1 X 1 0/1 0
+ 2 16 0 0 0
+
+ We exit from the loop for case 1. */
+
+extern size_t __strspn_sse2 (const char *, const char *);
+
+
+size_t
+__attribute__ ((section (".text.sse4.2")))
+__strspn_sse42 (const char *s, const char *a)
+{
+ if (*a == 0)
+ return 0;
+
+ const char *aligned;
+ __m128i mask;
+ int offset = (int) ((size_t) a & 15);
+ if (offset != 0)
+ {
+ /* Load masks. */
+ aligned = (const char *) ((size_t) a & 0xfffffffffffffff0L);
+ __m128i mask0 = _mm_load_si128 ((__m128i *) aligned);
+
+ switch (offset)
+ {
+ case 1:
+ mask = _mm_srli_si128 (mask0, 1);
+ break;
+ case 2:
+ mask = _mm_srli_si128 (mask0, 2);
+ break;
+ case 3:
+ mask = _mm_srli_si128 (mask0, 3);
+ break;
+ case 4:
+ mask = _mm_srli_si128 (mask0, 4);
+ break;
+ case 5:
+ mask = _mm_srli_si128 (mask0, 5);
+ break;
+ case 6:
+ mask = _mm_srli_si128 (mask0, 6);
+ break;
+ case 7:
+ mask = _mm_srli_si128 (mask0, 7);
+ break;
+ case 8:
+ mask = _mm_srli_si128 (mask0, 8);
+ break;
+ case 9:
+ mask = _mm_srli_si128 (mask0, 9);
+ break;
+ case 10:
+ mask = _mm_srli_si128 (mask0, 10);
+ break;
+ case 11:
+ mask = _mm_srli_si128 (mask0, 11);
+ break;
+ case 12:
+ mask = _mm_srli_si128 (mask0, 12);
+ break;
+ case 13:
+ mask = _mm_srli_si128 (mask0, 13);
+ break;
+ case 14:
+ mask = _mm_srli_si128 (mask0, 14);
+ break;
+ case 15:
+ mask = _mm_srli_si128 (mask0, 15);
+ break;
+ }
+
+ /* Find where the NULL terminator is. */
+ int length = _mm_cmpistri (mask, mask, 0x3a);
+ if (length == 16 - offset)
+ {
+ /* There is no NULL terminator. */
+ __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16));
+ int index = _mm_cmpistri (mask1, mask1, 0x3a);
+ length += index;
+
+ /* Don't use SSE4.2 if the length of A > 16. */
+ if (length > 16)
+ return __strspn_sse2 (s, a);
+
+ if (index != 0)
+ {
+ /* Combine mask0 and mask1. */
+ switch (offset)
+ {
+ case 1:
+ mask = _mm_alignr_epi8 (mask1, mask0, 1);
+ break;
+ case 2:
+ mask = _mm_alignr_epi8 (mask1, mask0, 2);
+ break;
+ case 3:
+ mask = _mm_alignr_epi8 (mask1, mask0, 3);
+ break;
+ case 4:
+ mask = _mm_alignr_epi8 (mask1, mask0, 4);
+ break;
+ case 5:
+ mask = _mm_alignr_epi8 (mask1, mask0, 5);
+ break;
+ case 6:
+ mask = _mm_alignr_epi8 (mask1, mask0, 6);
+ break;
+ case 7:
+ mask = _mm_alignr_epi8 (mask1, mask0, 7);
+ break;
+ case 8:
+ mask = _mm_alignr_epi8 (mask1, mask0, 8);
+ break;
+ case 9:
+ mask = _mm_alignr_epi8 (mask1, mask0, 9);
+ break;
+ case 10:
+ mask = _mm_alignr_epi8 (mask1, mask0, 10);
+ break;
+ case 11:
+ mask = _mm_alignr_epi8 (mask1, mask0, 11);
+ break;
+ case 12:
+ mask = _mm_alignr_epi8 (mask1, mask0, 12);
+ break;
+ case 13:
+ mask = _mm_alignr_epi8 (mask1, mask0, 13);
+ break;
+ case 14:
+ mask = _mm_alignr_epi8 (mask1, mask0, 14);
+ break;
+ case 15:
+ mask = _mm_alignr_epi8 (mask1, mask0, 15);
+ break;
+ }
+ }
+ }
+ }
+ else
+ {
+ /* A is aligned. */
+ mask = _mm_load_si128 ((__m128i *) a);
+
+ /* Find where the NULL terminator is. */
+ int length = _mm_cmpistri (mask, mask, 0x3a);
+ if (length == 16)
+ {
+ /* There is no NULL terminator. Don't use SSE4.2 if the length
+ of A > 16. */
+ if (a[16] != 0)
+ return __strspn_sse2 (s, a);
+ }
+ }
+
+ offset = (int) ((size_t) s & 15);
+ if (offset != 0)
+ {
+ /* Check partial string. */
+ aligned = (const char *) ((size_t) s & 0xfffffffffffffff0L);
+ __m128i value = _mm_load_si128 ((__m128i *) aligned);
+
+ switch (offset)
+ {
+ case 1:
+ value = _mm_srli_si128 (value, 1);
+ break;
+ case 2:
+ value = _mm_srli_si128 (value, 2);
+ break;
+ case 3:
+ value = _mm_srli_si128 (value, 3);
+ break;
+ case 4:
+ value = _mm_srli_si128 (value, 4);
+ break;
+ case 5:
+ value = _mm_srli_si128 (value, 5);
+ break;
+ case 6:
+ value = _mm_srli_si128 (value, 6);
+ break;
+ case 7:
+ value = _mm_srli_si128 (value, 7);
+ break;
+ case 8:
+ value = _mm_srli_si128 (value, 8);
+ break;
+ case 9:
+ value = _mm_srli_si128 (value, 9);
+ break;
+ case 10:
+ value = _mm_srli_si128 (value, 10);
+ break;
+ case 11:
+ value = _mm_srli_si128 (value, 11);
+ break;
+ case 12:
+ value = _mm_srli_si128 (value, 12);
+ break;
+ case 13:
+ value = _mm_srli_si128 (value, 13);
+ break;
+ case 14:
+ value = _mm_srli_si128 (value, 14);
+ break;
+ case 15:
+ value = _mm_srli_si128 (value, 15);
+ break;
+ }
+
+ int length = _mm_cmpistri (mask, value, 0x12);
+ /* No need to check CFlag since it is always 1. */
+ if (length < 16 - offset)
+ return length;
+ /* Find where the NULL terminator is. */
+ int index = _mm_cmpistri (value, value, 0x3a);
+ if (index < 16 - offset)
+ return length;
+ aligned += 16;
+ }
+ else
+ aligned = s;
+
+ while (1)
+ {
+ __m128i value = _mm_load_si128 ((__m128i *) aligned);
+ int index = _mm_cmpistri (mask, value, 0x12);
+ int cflag = _mm_cmpistrc (mask, value, 0x12);
+ if (cflag)
+ return (size_t) (aligned + index - s);
+ aligned += 16;
+ }
+}
Added: fsf/trunk/libc/sysdeps/x86_64/multiarch/strspn.S
==============================================================================
--- fsf/trunk/libc/sysdeps/x86_64/multiarch/strspn.S (added)
+++ fsf/trunk/libc/sysdeps/x86_64/multiarch/strspn.S Sat Jul 4 00:05:52 2009
@@ -1,0 +1,63 @@
+/* Multiple versions of strspn
+ Copyright (C) 2009 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <config.h>
+
+#ifdef HAVE_SSE4_SUPPORT
+
+#include <sysdep.h>
+#include <ifunc-defines.h>
+
+/* Define multiple versions only for the definition in libc. */
+#ifndef NOT_IN_libc
+ .text
+ENTRY(strspn)
+ .type strspn, @gnu_indirect_function
+ cmpl $0, __cpu_features+KIND_OFFSET(%rip)
+ jne 1f
+ call __init_cpu_features
+1: leaq __strspn_sse2(%rip), %rax
+ testl $(1<<20), __cpu_features+CPUID_OFFSET+COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET(%rip)
+ jz 2f
+ leaq __strspn_sse42(%rip), %rax
+2: ret
+END(strspn)
+
+# undef ENTRY
+# define ENTRY(name) \
+ .type __strspn_sse2, @function; \
+ .globl __strspn_sse2; \
+ .align 16; \
+ __strspn_sse2: cfi_startproc; \
+ CALL_MCOUNT
+# undef END
+# define END(name) \
+ cfi_endproc; .size __strspn_sse2, .-__strspn_sse2
+# undef libc_hidden_builtin_def
+/* It doesn't make sense to send libc-internal strspn calls through a PLT.
+ The speedup we get from using SSE4.2 instruction is likely eaten away
+ by the indirect call in the PLT. */
+# define libc_hidden_builtin_def(name) \
+ .globl __GI_strspn; __GI_strspn = __strspn_sse2
+#endif
+
+#endif /* HAVE_SSE4_SUPPORT */
+
+#include "../strspn.S"