[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[commits] r8688 - in /fsf/trunk/libc: ./ locale/ locale/programs/ nptl/ nptl/sysdeps/unix/sysv/linux/x86_64/ string/ sysdeps/x86_64/mu...



Author: eglibc
Date: Tue Jul 21 00:08:39 2009
New Revision: 8688

Log:
Import glibc-mainline for 2009-07-21

Added:
    fsf/trunk/libc/sysdeps/x86_64/multiarch/strcasestr-c.c
    fsf/trunk/libc/sysdeps/x86_64/multiarch/strcasestr.c
    fsf/trunk/libc/sysdeps/x86_64/multiarch/strstr-c.c
    fsf/trunk/libc/sysdeps/x86_64/multiarch/strstr.c
Modified:
    fsf/trunk/libc/ChangeLog
    fsf/trunk/libc/locale/C-ctype.c
    fsf/trunk/libc/locale/langinfo.h
    fsf/trunk/libc/locale/localeinfo.h
    fsf/trunk/libc/locale/programs/ld-ctype.c
    fsf/trunk/libc/nptl/ChangeLog
    fsf/trunk/libc/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S
    fsf/trunk/libc/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S
    fsf/trunk/libc/string/strcasestr.c
    fsf/trunk/libc/string/strstr.c
    fsf/trunk/libc/sysdeps/x86_64/multiarch/Makefile

Modified: fsf/trunk/libc/ChangeLog
==============================================================================
--- fsf/trunk/libc/ChangeLog (original)
+++ fsf/trunk/libc/ChangeLog Tue Jul 21 00:08:39 2009
@@ -1,3 +1,39 @@
+2009-07-20  Ulrich Drepper  <drepper@xxxxxxxxxx>
+
+	* sysdeps/x86_64/multiarch/strstr.c [USE_AS_STRCASESTR] (STRSTR_SSE42):
+	Use NONASCII_CASE information provided by the locale to determine
+	whether optimized string load function can be used.  Minor cleanups.
+
+2009-07-20  H.J. Lu  <hongjiu.lu@xxxxxxxxx>
+
+	* string/strcasestr.c (STRCASESTR): New macro.
+	(__strcasestr): Renamed to ..
+	(STRCASESTR): ...this.
+	* string/strstr.c (STRSTR): New macro.
+	(strstr): Renamed to ..
+	(STRSTR): ...this.
+	* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
+	strstr-c strcasestr-c
+	(CFLAGS-strstr.c): New.
+	(CFLAGS-strcasestr.c): Likewise.
+	* sysdeps/x86_64/multiarch/strcasestr-c.c: New file.
+	* sysdeps/x86_64/multiarch/strcasestr.c: New file.
+	* sysdeps/x86_64/multiarch/strstr-c.c: New file.
+	* sysdeps/x86_64/multiarch/strstr.c: New file.
+
+2009-07-20  Ulrich Drepper  <drepper@xxxxxxxxxx>
+
+	* locale/localeinfo.h (LIMAGIC): Update value for LC_CTYPE.
+	* locale/langinfo.h: Define _NL_CTYPE_NONASCII_CASE.
+	* locale/C-ctype.c (_nl_C_LC_CTYPE): Add initializer for
+	_NL_CTYPE_NONASCII_CASE.
+	* locale/programs/ld-ctype.c (locale_ctype_t): Add nonascii_case
+	field.
+	(ctype_finish): Check whether there are any 8-bit characters outside
+	the range ASCII has or whether the mapping isn't the same as for
+	ASCII (±0x20).  Set nonascii_case appropriately.
+	(ctype_output): Add output handler for nonascii_case.
+
 2009-07-17  Ulrich Drepper  <drepper@xxxxxxxxxx>
 
 	* sysdeps/generic/sysdep.h: Define cfi_personality, cfi_lsda,

Modified: fsf/trunk/libc/locale/C-ctype.c
==============================================================================
--- fsf/trunk/libc/locale/C-ctype.c (original)
+++ fsf/trunk/libc/locale/C-ctype.c Tue Jul 21 00:08:39 2009
@@ -1,4 +1,4 @@
-/* Copyright (C) 1995-2002, 2003 Free Software Foundation, Inc.
+/* Copyright (C) 1995-2002, 2003, 2009 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Ulrich Drepper <drepper@xxxxxxxxxx>, 1995.
 
@@ -528,7 +528,7 @@
 };
 
 /* Number of fields with fixed meanings, starting at 0.  */
-#define NR_FIXED 71
+#define NR_FIXED 72
 /* Number of class fields, starting at CLASS_OFFSET.  */
 #define NR_CLASSES 12
 /* Number of map fields, starting at MAP_OFFSET.  */
@@ -667,6 +667,8 @@
     { .wstr = NULL },
     /* _NL_CTYPE_MAP_TO_NONASCII */
     { .word = 0 },
+    /* _NL_CTYPE_NONASCII_CASE */
+    { .word = 0 },
     /* NR_CLASSES wctype_tables */
     { .string = (const char *) _nl_C_LC_CTYPE_class_upper.header },
     { .string = (const char *) _nl_C_LC_CTYPE_class_lower.header },

Modified: fsf/trunk/libc/locale/langinfo.h
==============================================================================
--- fsf/trunk/libc/locale/langinfo.h (original)
+++ fsf/trunk/libc/locale/langinfo.h Tue Jul 21 00:08:39 2009
@@ -334,6 +334,7 @@
   _NL_CTYPE_TRANSLIT_IGNORE_LEN,
   _NL_CTYPE_TRANSLIT_IGNORE,
   _NL_CTYPE_MAP_TO_NONASCII,
+  _NL_CTYPE_NONASCII_CASE,
   _NL_CTYPE_EXTRA_MAP_1,
   _NL_CTYPE_EXTRA_MAP_2,
   _NL_CTYPE_EXTRA_MAP_3,

Modified: fsf/trunk/libc/locale/localeinfo.h
==============================================================================
--- fsf/trunk/libc/locale/localeinfo.h (original)
+++ fsf/trunk/libc/locale/localeinfo.h Tue Jul 21 00:08:39 2009
@@ -1,5 +1,5 @@
 /* Declarations for internal libc locale interfaces
-   Copyright (C) 1995-2003, 2005, 2006, 2007, 2008
+   Copyright (C) 1995-2003, 2005, 2006, 2007, 2008, 2009
    Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
@@ -35,6 +35,8 @@
 #define	LIMAGIC(category) \
   (category == LC_COLLATE						\
    ? ((unsigned int) (0x20051014 ^ (category)))				\
+   : category == LC_CTYPE						\
+   ? ((unsigned int) (0x20090720 ^ (category)))				\
    : ((unsigned int) (0x20031115 ^ (category))))
 
 /* Two special weight constants for the collation data.  */

Modified: fsf/trunk/libc/locale/programs/ld-ctype.c
==============================================================================
--- fsf/trunk/libc/locale/programs/ld-ctype.c (original)
+++ fsf/trunk/libc/locale/programs/ld-ctype.c Tue Jul 21 00:08:39 2009
@@ -1,4 +1,4 @@
-/* Copyright (C) 1995-2006, 2007 Free Software Foundation, Inc.
+/* Copyright (C) 1995-2006, 2007, 2009 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Ulrich Drepper <drepper@xxxxxxx>, 1995.
 
@@ -181,6 +181,7 @@
   size_t default_missing_lineno;
 
   uint32_t to_nonascii;
+  uint32_t nonascii_case;
 
   /* The arrays for the binary representation.  */
   char_class_t *ctype_b;
@@ -625,6 +626,27 @@
   else
     ctype->class256_collection[space_seq->bytes[0]] |= BIT (tok_print);
 
+  /* Check whether all single-byte characters map to their upper/lowercase
+     equivalent according to the ASCII rules.  */
+  for (cnt = 'A'; cnt <= 'Z'; ++cnt)
+    {
+      uint32_t uppval = ctype->map256_collection[0][cnt];
+      uint32_t lowval = ctype->map256_collection[1][cnt];
+      uint32_t lowuppval = ctype->map256_collection[0][lowval];
+      uint32_t lowlowval = ctype->map256_collection[1][lowval];
+
+      if (uppval != cnt
+	  || lowval != cnt + 0x20
+	  || lowuppval != cnt
+	  || lowlowval != cnt + 0x20)
+	ctype->nonascii_case = 1;
+    }
+  for (cnt = 0; cnt < 256; ++cnt)
+    if (cnt < 'A' || (cnt > 'Z' && cnt < 'a') || cnt > 'z')
+      if (ctype->map256_collection[0][cnt] != cnt
+	  || ctype->map256_collection[1][cnt] != cnt)
+	ctype->nonascii_case = 1;
+
   /* Now that the tests are done make sure the name array contains all
      characters which are handled in the WIDTH section of the
      character set definition file.  */
@@ -1044,6 +1066,9 @@
 
 	  CTYPE_DATA (_NL_CTYPE_MAP_TO_NONASCII,
 		      &ctype->to_nonascii, sizeof (uint32_t));
+
+	  CTYPE_DATA (_NL_CTYPE_NONASCII_CASE,
+		      &ctype->nonascii_case, sizeof (uint32_t));
 
 	  case _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS_MB_LEN):
 	    iov[2 + elem + offset].iov_base = alloca (sizeof (uint32_t));

Modified: fsf/trunk/libc/nptl/ChangeLog
==============================================================================
--- fsf/trunk/libc/nptl/ChangeLog (original)
+++ fsf/trunk/libc/nptl/ChangeLog Tue Jul 21 00:08:39 2009
@@ -1,3 +1,9 @@
+2009-07-20  Ulrich Drepper  <drepper@xxxxxxxxxx>
+
+	* sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S: Minor
+	optimizations of last changes.
+	* sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S: Likewise.
+
 2009-07-19  Ulrich Drepper  <drepper@xxxxxxxxxx>
 
 	* sysdeps/unix/sysv/linux/x86_64/lowlevellock.h: Define

Modified: fsf/trunk/libc/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S
==============================================================================
--- fsf/trunk/libc/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S (original)
+++ fsf/trunk/libc/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S Tue Jul 21 00:08:39 2009
@@ -160,16 +160,14 @@
 	movq	8(%rsp), %rdi
 
 	movq	%r13, %r10
+	movl	$FUTEX_WAIT_BITSET, %esi
 	cmpq	$-1, dep_mutex(%rdi)
-	movl	$FUTEX_WAIT_BITSET, %eax
-	movl	$(FUTEX_WAIT_BITSET|FUTEX_PRIVATE_FLAG), %esi
-	cmove	%eax, %esi
 	je	60f
 
 	movq	dep_mutex(%rdi), %r8
 	/* Requeue to a PI mutex if the PI bit is set.  */
 	testl	$PI_BIT, MUTEX_KIND(%r8)
-	je	60f
+	je	61f
 
 	movl	$(FUTEX_WAIT_REQUEUE_PI|FUTEX_PRIVATE_FLAG), %esi
 	xorl	%eax, %eax
@@ -191,10 +189,10 @@
 	cmpq	$-4095, %rax
 	jnae	62f
 
-	movl	$(FUTEX_WAIT_BITSET|FUTEX_PRIVATE_FLAG), %esi
 	subq	$cond_futex, %rdi
 #endif
 
+61:	movl	$(FUTEX_WAIT_BITSET|FUTEX_PRIVATE_FLAG), %esi
 60:	xorl	%r15d, %r15d
 	xorl	%eax, %eax
 	/* The following only works like this because we only support

Modified: fsf/trunk/libc/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S
==============================================================================
--- fsf/trunk/libc/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S (original)
+++ fsf/trunk/libc/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S Tue Jul 21 00:08:39 2009
@@ -128,28 +128,15 @@
 	movq	8(%rsp), %rdi
 	xorq	%r10, %r10
 	movq	%r12, %rdx
-	// XXX reverse + lea
-	addq	$cond_futex, %rdi
-	cmpq	$-1, dep_mutex-cond_futex(%rdi)
-#ifdef __ASSUME_PRIVATE_FUTEX
-	movl	$FUTEX_WAIT, %eax
-	movl	$(FUTEX_WAIT|FUTEX_PRIVATE_FLAG), %esi
-	cmove	%eax, %esi
-#else
-	movl	$0, %eax
-	movl	%fs:PRIVATE_FUTEX, %esi
-	cmove	%eax, %esi
-# if FUTEX_WAIT != 0
-#  error "cc destroyed by following orl"
-	orl	$FUTEX_WAIT, %esi
-# endif
-#endif
+	cmpq	$-1, dep_mutex(%rdi)
+	leaq	cond_futex(%rdi), %rdi
+	movl	$FUTEX_WAIT, %esi
 	je	60f
 
 	movq	dep_mutex-cond_futex(%rdi), %r8
 	/* Requeue to a PI mutex if the PI bit is set.  */
 	testl	$PI_BIT, MUTEX_KIND(%r8)
-	je	60f
+	je	61f
 
 	movl	$(FUTEX_WAIT_REQUEUE_PI|FUTEX_PRIVATE_FLAG), %esi
 	movl	$SYS_futex, %eax
@@ -162,9 +149,17 @@
 	cmpq	$-4095, %rax
 	jnae	62f
 
+# ifndef __ASSUME_PRIVATE_FUTEX
+	movl	$FUTEX_WAIT, %esi
+# endif
+#endif
+
+61:
+#ifdef __ASSUME_PRIVATE_FUTEX
 	movl	$(FUTEX_WAIT|FUTEX_PRIVATE_FLAG), %esi
-#endif
-
+#else
+	orl	%fs:PRIVATE_FUTEX, %esi
+#endif
 60:	xorl	%r13d, %r13d
 	movl	$SYS_futex, %eax
 	syscall

Modified: fsf/trunk/libc/string/strcasestr.c
==============================================================================
--- fsf/trunk/libc/string/strcasestr.c (original)
+++ fsf/trunk/libc/string/strcasestr.c Tue Jul 21 00:08:39 2009
@@ -1,5 +1,6 @@
 /* Return the offset of one string within another.
-   Copyright (C) 1994, 1996-2000, 2004, 2008 Free Software Foundation, Inc.
+   Copyright (C) 1994, 1996-2000, 2004, 2008, 2009
+   Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -52,11 +53,16 @@
 #undef strcasestr
 #undef __strcasestr
 
+#ifndef STRCASESTR
+#define STRCASESTR __strcasestr
+#endif
+
+
 /* Find the first occurrence of NEEDLE in HAYSTACK, using
    case-insensitive comparison.  This function gives unspecified
    results in multibyte locales.  */
 char *
-__strcasestr (const char *haystack_start, const char *needle_start)
+STRCASESTR (const char *haystack_start, const char *needle_start)
 {
   const char *haystack = haystack_start;
   const char *needle = needle_start;

Modified: fsf/trunk/libc/string/strstr.c
==============================================================================
--- fsf/trunk/libc/string/strstr.c (original)
+++ fsf/trunk/libc/string/strstr.c Tue Jul 21 00:08:39 2009
@@ -1,5 +1,6 @@
 /* Return the offset of one string within another.
-   Copyright (C) 1994,1996,1997,2000,2001,2003,2008 Free Software Foundation, Inc.
+   Copyright (C) 1994,1996,1997,2000,2001,2003,2008,2009
+   Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -40,11 +41,15 @@
 
 #undef strstr
 
+#ifndef STRSTR
+#define STRSTR strstr
+#endif
+
 /* Return the first occurrence of NEEDLE in HAYSTACK.  Return HAYSTACK
    if NEEDLE is empty, otherwise NULL if NEEDLE is not found in
    HAYSTACK.  */
 char *
-strstr (const char *haystack_start, const char *needle_start)
+STRSTR (const char *haystack_start, const char *needle_start)
 {
   const char *haystack = haystack_start;
   const char *needle = needle_start;

Modified: fsf/trunk/libc/sysdeps/x86_64/multiarch/Makefile
==============================================================================
--- fsf/trunk/libc/sysdeps/x86_64/multiarch/Makefile (original)
+++ fsf/trunk/libc/sysdeps/x86_64/multiarch/Makefile Tue Jul 21 00:08:39 2009
@@ -6,9 +6,11 @@
 ifeq ($(subdir),string)
 sysdep_routines += stpncpy-c strncpy-c strncmp-c
 ifeq (yes,$(config-cflags-sse4))
-sysdep_routines += strcspn-c strpbrk-c strspn-c
+sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c
 CFLAGS-strcspn-c.c += -msse4
 CFLAGS-strpbrk-c.c += -msse4
 CFLAGS-strspn-c.c += -msse4
+CFLAGS-strstr.c += -msse4
+CFLAGS-strcasestr.c += -msse4
 endif
 endif

Added: fsf/trunk/libc/sysdeps/x86_64/multiarch/strcasestr-c.c
==============================================================================
--- fsf/trunk/libc/sysdeps/x86_64/multiarch/strcasestr-c.c (added)
+++ fsf/trunk/libc/sysdeps/x86_64/multiarch/strcasestr-c.c Tue Jul 21 00:08:39 2009
@@ -1,0 +1,18 @@
+#include "init-arch.h"
+
+#define STRCASESTR __strcasestr_sse2	/* Name given to the generic C version.  */
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name) \
+  __hidden_ver1 (__strcasestr_sse2, __GI_strcasestr, __strcasestr_sse2);
+
+#include "string/strcasestr.c"	/* Builds the generic code under the STRCASESTR name.  */
+
+extern char *__strcasestr_sse42 (const char *, const char *);
+
+#if 1	/* Active arm: select the implementation on HAS_SSE4_2.  */
+libc_ifunc (__strcasestr,
+	    HAS_SSE4_2 ? __strcasestr_sse42 : __strcasestr_sse2);
+#else	/* Disabled arm: the constant 0 always selects __strcasestr_sse2.  */
+libc_ifunc (__strcasestr,
+	    0 ? __strcasestr_sse42 : __strcasestr_sse2);
+#endif

Added: fsf/trunk/libc/sysdeps/x86_64/multiarch/strcasestr.c
==============================================================================
--- fsf/trunk/libc/sysdeps/x86_64/multiarch/strcasestr.c (added)
+++ fsf/trunk/libc/sysdeps/x86_64/multiarch/strcasestr.c Tue Jul 21 00:08:39 2009
@@ -1,0 +1,3 @@
+#define USE_AS_STRCASESTR	/* Selects the tolower-based LOADBYTE/CMPBYTE and loaders.  */
+#define STRSTR_SSE42 __strcasestr_sse42	/* Function name used by strstr.c.  */
+#include "strstr.c"	/* Shared SSE4.2 implementation.  */

Added: fsf/trunk/libc/sysdeps/x86_64/multiarch/strstr-c.c
==============================================================================
--- fsf/trunk/libc/sysdeps/x86_64/multiarch/strstr-c.c (added)
+++ fsf/trunk/libc/sysdeps/x86_64/multiarch/strstr-c.c Tue Jul 21 00:08:39 2009
@@ -1,0 +1,12 @@
+#include "init-arch.h"
+
+#define STRSTR __strstr_sse2	/* Name given to the generic C version.  */
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name) \
+  __hidden_ver1 (__strstr_sse2, __GI_strstr, __strstr_sse2);
+
+#include "string/strstr.c"	/* Builds the generic code under the STRSTR name.  */
+
+/* NOTE(review): multiarch/strstr.c defines this with const unsigned char *
+   parameters.  */ extern char *__strstr_sse42 (const char *, const char *);
+
+libc_ifunc (strstr, HAS_SSE4_2 ? __strstr_sse42 : __strstr_sse2);

Added: fsf/trunk/libc/sysdeps/x86_64/multiarch/strstr.c
==============================================================================
--- fsf/trunk/libc/sysdeps/x86_64/multiarch/strstr.c (added)
+++ fsf/trunk/libc/sysdeps/x86_64/multiarch/strstr.c Tue Jul 21 00:08:39 2009
@@ -1,0 +1,487 @@
+/* strstr with SSE4.2 intrinsics
+   Copyright (C) 2009 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <nmmintrin.h>
+
+#ifndef STRSTR_SSE42
+# define STRSTR_SSE42 __strstr_sse42
+#endif
+
+#ifdef USE_AS_STRCASESTR
+# include <ctype.h>
+# include <locale/localeinfo.h>
+
+# define LOADBYTE(C)		tolower (C)
+# define CMPBYTE(C1, C2) \
+  ((C1) == (C2) || tolower (C1) == tolower (C2))
+#else
+# define LOADBYTE(C)		(C)
+# define CMPBYTE(C1, C2)	((C1) == (C2))
+#endif
+
+/* We use 0xc ordered-compare:
+	_SIDD_UBYTE_OPS
+	| _SIDD_CMP_EQUAL_ORDERED
+	| _SIDD_LEAST_SIGNIFICANT
+   on pcmpistri to do the scanning and string comparison requirements of
+   sub-string match.  In the scanning phase, we process Cflag and ECX
+   index to locate the first fragment match; once the first fragment
+   match position has been identified, we do comparison of subsequent
+   string fragments until we can conclude false or true match; when
+   concluding a false match, we may need to repeat the scanning process
+   from the next relevant offset in the target string.
+
+   In the scanning phase we have 4 cases:
+   case		ECX	CFlag	ZFlag	SFlag
+    1		16	  0	  0	  0
+    2a		16	  0	  0	  1
+    2b		16	  0	  1	  0
+    2c		16	  0	  1	  1
+
+   1. No ordered-comparison match, both 16B fragments are valid, so
+      continue to next fragment.
+   2. No ordered-comparison match, there is EOS in either fragment,
+   2a. Zflg = 0, Sflg = 1, we continue
+   2b. Zflg = 1, Sflg = 0, we conclude no match and return.
+   2c. Zflg = 1, Sflg = 1, length determines match or no match
+
+   In the string comparison phase, the 1st fragment match is fixed up
+   to produce ECX = 0.  Subsequent fragment compare of nonzero index
+   and no match conclude a false match.
+
+   case		ECX	CFlag	ZFlag	SFlag
+    3		 X	  1	  0	  0/1
+    4a		 0  	  1	  0	  0
+    4b		 0  	  1	  0	  1
+    4c		0 < X  	  1	  0	  0/1
+    5		16 	  0	  1	  0
+
+   3. An initial ordered-comparison fragment match, we fix up to do
+      subsequent string comparison
+   4a. Continuation of fragment comparison of a string compare.
+   4b. EOS reached in the reference string, we conclude true match and
+       return
+   4c. String compare failed if index is nonzero, we need to go back to
+       scanning
+   5.  failed string compare, go back to scanning
+ */
+
+/* Drop the OFFSET low-order bytes that a 16B aligned load picked up
+   ahead of the target address.  One case per byte count because
+   _mm_srli_si128 takes its shift count as an immediate.
+     value: 16B data loaded from 16B aligned address.
+     offset: Offset of target data address relative to 16B aligned load
+	     address; values outside 1..15 return VALUE unchanged.  */
+
+static __inline__ __m128i
+__m128i_shift_right (__m128i value, int offset)
+{
+  switch (offset)
+    {
+    case 1:
+      value = _mm_srli_si128 (value, 1);
+      break;
+    case 2:
+      value = _mm_srli_si128 (value, 2);
+      break;
+    case 3:
+      value = _mm_srli_si128 (value, 3);
+      break;
+    case 4:
+      value = _mm_srli_si128 (value, 4);
+      break;
+    case 5:
+      value = _mm_srli_si128 (value, 5);
+      break;
+    case 6:
+      value = _mm_srli_si128 (value, 6);
+      break;
+    case 7:
+      value = _mm_srli_si128 (value, 7);
+      break;
+    case 8:
+      value = _mm_srli_si128 (value, 8);
+      break;
+    case 9:
+      value = _mm_srli_si128 (value, 9);
+      break;
+    case 10:
+      value = _mm_srli_si128 (value, 10);
+      break;
+    case 11:
+      value = _mm_srli_si128 (value, 11);
+      break;
+    case 12:
+      value = _mm_srli_si128 (value, 12);
+      break;
+    case 13:
+      value = _mm_srli_si128 (value, 13);
+      break;
+    case 14:
+      value = _mm_srli_si128 (value, 14);
+      break;
+    case 15:
+      value = _mm_srli_si128 (value, 15);
+      break;
+    }
+  return value;
+}
+
+/* Replacement for movdqu that addresses the 4KB boundary crossing
+   issue: if EOS occurs within the last 16B before a 4KB boundary,
+   the load does not cross into the next page.  */
+
+static __m128i
+__attribute__ ((section (".text.sse4.2")))
+__m128i_strloadu (const unsigned char * p)
+{
+  int offset = ((size_t) p & (16 - 1));	/* P modulo 16.  */
+
+  if (offset && (int) ((size_t) p & 0xfff) > 0xff0)	/* Last 16B of a page.  */
+    {
+      __m128i a = _mm_load_si128 ((__m128i *) (p - offset));
+      __m128i zero = _mm_setzero_si128 ();
+      int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (a, zero));
+      if ((bmsk >> offset) != 0)	/* A NUL lies at or after P in this block.  */
+	return __m128i_shift_right (a, offset);
+    }
+  return _mm_loadu_si128 ((__m128i *) p);	/* Otherwise a plain unaligned load.  */
+}
+
+#ifdef USE_AS_STRCASESTR
+
+/* Similar to __m128i_strloadu.  Convert to lower case using the ASCII
+   rule ('A'..'Z' plus 0x20), as in the POSIX/C locale.  */
+
+static __m128i
+__attribute__ ((section (".text.sse4.2")))
+__m128i_strloadu_tolower_posix (const unsigned char * p)
+{
+  __m128i frag = __m128i_strloadu (p);
+
+  /* Convert frag to lower case for POSIX/C locale.  */
+  __m128i rangeuc = _mm_set_epi64x (0x0, 0x5a41);	/* Byte range 'A'..'Z'.  */
+  __m128i u2ldelta = _mm_set1_epi64x (0xe0e0e0e0e0e0e0e0);	/* 0xe0 == -0x20 in each byte.  */
+  __m128i mask1 = _mm_cmpistrm (rangeuc, frag, 0x44);	/* Range compare, byte mask result.  */
+  __m128i mask2 = _mm_blendv_epi8 (u2ldelta, frag, mask1);	/* frag where masked, else 0xe0.  */
+  mask2 = _mm_sub_epi8 (mask2, u2ldelta);	/* Masked bytes gain 0x20; the rest become 0.  */
+  return  _mm_blendv_epi8 (frag, mask2, mask1);
+}
+
+/* Similar to __m128i_strloadu.  Convert to lower case for non-POSIX/C
+   locale by calling tolower on each byte up to and excluding the NUL.  */
+
+static __m128i
+__attribute__ ((section (".text.sse4.2")))
+__m128i_strloadu_tolower (const unsigned char * p)
+{
+  union
+    {
+      char b[16];
+      __m128i x;
+    } u;
+
+  for (int i = 0; i < 16; i++)
+    if (p[i] == 0)
+      {
+	u.b[i] = 0;
+	break;	/* NOTE(review): bytes of U after the NUL stay uninitialized.  */
+      }
+    else
+      u.b[i] = tolower (p[i]);
+
+  return u.x;
+}
+#endif
+
+/* Calculate the Knuth-Morris-Pratt string searching algorithm (or KMP
+   algorithm) overlap for a fully populated 16B vector.
+   Input parameter: first 16 bytes loaded from the reference string of
+		    a strstr function.
+   We don't use the KMP algorithm if the reference string is less than
+   16B.  The caller uses the result as its restart distance.  */
+
+static int
+__inline__ __attribute__ ((__always_inline__,))
+KMP16Bovrlap (__m128i s2)
+{
+  __m128i a, b;
+  int bmsk, k1;
+
+  b = _mm_unpacklo_epi8 (s2, s2);
+  a = _mm_unpacklo_epi8 (b, b);
+  a = _mm_shuffle_epi32 (a, 0);	/* Every byte of A is now byte 0 of s2.  */
+  b = _mm_srli_si128 (s2, sizeof (char));	/* B[i] = s2[i + 1].  */
+  bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (b, a));	/* Bit i set: s2[i + 1] == s2[0].  */
+
+  /* _BitScanForward(&k1, bmsk); */
+  __asm ("bsfl %[bmsk], %[k1]" : [k1] "=r" (k1) : [bmsk] "r" (bmsk));
+  if (!bmsk)	/* k1 is not used on this path; bsf leaves it undefined for 0.  */
+    return 16;
+  else if (bmsk == 0x7fff)
+    return 1;
+  else if (!k1)
+    {
+      /* There are at least two distinct chars in s2.  If bytes 0 and 1
+	 are identical and the distinct value lies farther down, we can
+	 deduce that the next byte offset to restart the full compare is
+	 no earlier than byte 3.  */
+      return 3;
+    }
+  else
+    {
+      /* Byte 1 is not degenerated to byte 0.  */
+      return k1 + 1;
+    }
+}
+
+char *	/* S1 if S2 is empty; NULL if no match; else the match in S1.  */
+__attribute__ ((section (".text.sse4.2")))
+STRSTR_SSE42 (const unsigned char *s1, const unsigned char *s2)
+{
+  int len;
+  int len1;
+  const unsigned char *p1 = s1;
+  const unsigned char *p2 = s2;
+  __m128i frag1;
+  __m128i frag2;
+  __m128i zero;
+  int cmp;
+  int cmp_c;
+  int cmp_z;
+  int cmp_s;
+  int kmp_fwd;
+  int bmsk;
+  int bmsk1;
+  const unsigned char *pt;
+
+  if (!p2[0])
+    return (char *) p1;
+
+  if (!p1[0])
+    return NULL;
+
+  /* Check if p1 length is 1 byte long.  */
+  if (!p1[1])
+    return !p2[1] && CMPBYTE (p1[0], p2[0]) ? (char *) p1 : NULL;
+
+#ifdef USE_AS_STRCASESTR
+  __m128i (*strloadu) (const unsigned char *);
+
+  if (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_NONASCII_CASE) == 0)
+    strloadu = __m128i_strloadu_tolower_posix;	/* ASCII case rules suffice.  */
+  else
+    strloadu = __m128i_strloadu_tolower;	/* Per-byte tolower.  */
+#else
+# define strloadu __m128i_strloadu
+#endif
+
+  /* p1 > 1 byte long.  Load up to 16 bytes of fragment.  */
+  frag1 = strloadu (p1);
+
+  if (p2[1])
+    {
+      /* p2 is > 1 byte long.  */
+      frag2 = strloadu (p2);
+    }
+  else
+    {
+      zero = _mm_setzero_si128 ();
+      frag2 = _mm_insert_epi8 (zero, LOADBYTE(p2[0]), 0);
+    }
+
+  /* Unsigned bytes, equal order, does frag2 have a null?  */
+  cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
+  cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
+  cmp = _mm_cmpistri (frag2, frag1, 0x0c);
+  cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c);
+  if (cmp_s & cmp_c)
+    {
+      zero = _mm_setzero_si128 ();
+      bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (frag2, zero));
+      __asm ("bsfl %[bmsk], %[len]"
+	     : [len] "=r" (len) : [bmsk] "r" (bmsk));	/* LEN = index of first NUL in frag2.  */
+      p1 += cmp;
+      if ((len + cmp) <= 16)
+	return (char *) p1;
+      else
+	{
+	  /* Load up to 16 bytes of fragment.  */
+	  frag1 = strloadu (p1);
+	  cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
+	  cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c);
+	  cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
+	  cmp = _mm_cmpistri (frag2, frag1, 0x0c);
+	  if ((len + cmp) <= 16)
+	    return (char *) p1 + cmp;
+	}
+    }
+
+  if (cmp_s)
+    {
+      /* Adjust addr for 16B alignment in ensuing loop.  */
+      while (!cmp_z)
+	{
+	  p1 += cmp;
+	  /* Load up to 16 bytes of fragment.  */
+	  frag1 = strloadu (p1);
+	  cmp = _mm_cmpistri (frag2, frag1, 0x0c);
+	  cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
+	  cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
+	  /* Because s2 < 16 bytes and we adjusted p1 by non-zero cmp
+	     once already, this time cmp will be zero and we can exit.  */
+	  if ((!cmp) & cmp_c)
+	    break;
+	}
+
+      if (!cmp_c)
+	return NULL;
+      else
+	{
+	  /* Since s2 is less than 16 bytes, cmp_c is the definitive
+	     determination of full match.  */
+	  return (char *) p1 + cmp;
+	}
+    }
+
+  /* General case, s2 is at least 16 bytes or more.
+     First, the common case of false-match at first byte of p2.  */
+  pt = NULL;
+  kmp_fwd = 0;
+re_trace:
+  while (!cmp_c)
+    {
+      /* frag1 has null. */
+      if (cmp_z)
+	return NULL;
+
+      /* frag 1 has no null, advance 16 bytes.  */
+      p1 += 16;
+      /* Load up to 16 bytes of fragment.  */
+      frag1 = strloadu (p1);
+      /* Unsigned bytes, equal order, is there a partial match?  */
+      cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
+      cmp = _mm_cmpistri(frag2, frag1, 0x0c);
+      cmp_z = _mm_cmpistrz(frag2, frag1, 0x0c);
+    }
+
+  /* Next, handle initial positive match as first byte of p2.  We have
+     a partial fragment match, make full determination until we reached
+     end of s2.  */
+  if (!cmp)
+    {
+      if (cmp_z)
+	return (char *) p1;
+
+      pt = p1;
+      p1 += 16;
+      p2 += 16;
+      /* Load up to 16 bytes of fragment.  */
+      frag2 = strloadu(p2);
+    }
+  else
+    {
+      /* Adjust 16B alignment.  */
+      p1 += cmp;
+      pt = p1;
+    }
+
+  /* Load up to 16 bytes of fragment.  */
+  frag1 = strloadu (p1);
+
+  /* Unsigned bytes, equal order, does frag2 have a null?  */
+  cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
+  cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
+  cmp = _mm_cmpistri (frag2, frag1, 0x0c);
+  cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c);
+  while (!(cmp | cmp_z | cmp_s))
+    {
+      p1 += 16;
+      p2 += 16;
+      /* Load up to 16 bytes of fragment.  */
+      frag2 = strloadu (p2);
+      /* Load up to 16 bytes of fragment.  */
+      frag1 = strloadu (p1);
+      /* Unsigned bytes, equal order, does frag2 have a null?  */
+      cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
+      cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
+      cmp = _mm_cmpistri (frag2, frag1, 0x0c);
+      cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c);
+    }
+
+  /* Full determination yielded false result, retrace s1 to next
+     starting position.
+     Zflg	1      0      1			0/1
+     Sflg	0      1      1			0/1
+     cmp	na     0      0			>0
+     action   done   done   continue    continue if s2 < s1
+	      false  match  retrace s1     else false
+   */
+
+  if(cmp_s & !cmp)
+    return (char *) pt;
+  else if (cmp_z)
+    {
+      if (!cmp_s)
+	return NULL;
+
+      /* Handle both zero and sign flag set and s1 is shorter in
+	 length.  */
+      zero = _mm_setzero_si128 ();
+      bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (zero, frag2));
+      bmsk1 = _mm_movemask_epi8 (_mm_cmpeq_epi8 (zero, frag1));
+      __asm ("bsfl %[bmsk], %[len]"
+	     : [len] "=r" (len) : [bmsk] "r" (bmsk));
+      __asm ("bsfl %[bmsk1], %[len1]"
+	     : [len1] "=r" (len1) : [bmsk1] "r" (bmsk1));
+      if (len >= len1)
+	return NULL;
+    }
+  else if (!cmp)
+    return (char *) pt;
+
+  /* Otherwise, we have to retrace and continue.  Default of multiple
+     paths that need to retrace from next byte in s1.  */
+  p2 = s2;
+  frag2 = strloadu (p2);
+
+  if (!kmp_fwd)
+    kmp_fwd = KMP16Bovrlap (frag2);	/* Computed once, reused on later retraces.  */
+
+  /* KMP algorithm predicted overlap needs to be corrected for
+     partial fragment compare.  */
+  p1 = pt + (kmp_fwd > cmp ? cmp : kmp_fwd);
+
+  /* Since s2 is at least 16 bytes long, we're certain there is no
+     match.  */
+  if (!p1[0])
+    return NULL;
+  else
+    {
+      /* Load up to 16 bytes of fragment.  */
+      frag1 = strloadu (p1);
+    }
+
+  /* Unsigned bytes, equal order, is there a partial match?  */
+  cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
+  cmp = _mm_cmpistri (frag2, frag1, 0x0c);
+  cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
+  goto re_trace;
+}