[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[commits] r8786 - in /fsf/trunk/libc: ./ elf/ nptl/ nptl/sysdeps/unix/sysv/linux/x86_64/ sysdeps/i386/ sysdeps/i386/i686/multiarch/ sy...



Author: eglibc
Date: Sun Aug  9 00:05:21 2009
New Revision: 8786

Log:
Import glibc-mainline for 2009-08-09

Added:
    fsf/trunk/libc/elf/tst-audit6.c
    fsf/trunk/libc/elf/tst-audit7.c
    fsf/trunk/libc/elf/tst-auditmod6a.c
    fsf/trunk/libc/elf/tst-auditmod6b.c
    fsf/trunk/libc/elf/tst-auditmod6c.c
    fsf/trunk/libc/elf/tst-auditmod7a.c
    fsf/trunk/libc/elf/tst-auditmod7b.c
    fsf/trunk/libc/sysdeps/x86_64/dl-trampoline.h
    fsf/trunk/libc/sysdeps/x86_64/multiarch/strcmp-ssse3.S
    fsf/trunk/libc/sysdeps/x86_64/multiarch/strncmp-ssse3.S
Modified:
    fsf/trunk/libc/ChangeLog
    fsf/trunk/libc/NEWS
    fsf/trunk/libc/elf/Makefile
    fsf/trunk/libc/nptl/ChangeLog
    fsf/trunk/libc/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S
    fsf/trunk/libc/nptl/sysdeps/unix/sysv/linux/x86_64/sem_timedwait.S
    fsf/trunk/libc/sysdeps/i386/configure
    fsf/trunk/libc/sysdeps/i386/configure.in
    fsf/trunk/libc/sysdeps/i386/i686/multiarch/strcspn.S
    fsf/trunk/libc/sysdeps/i386/i686/multiarch/strspn.S
    fsf/trunk/libc/sysdeps/x86_64/dl-trampoline.S
    fsf/trunk/libc/sysdeps/x86_64/multiarch/Makefile
    fsf/trunk/libc/sysdeps/x86_64/multiarch/rawmemchr.S
    fsf/trunk/libc/sysdeps/x86_64/multiarch/strcmp.S
    fsf/trunk/libc/sysdeps/x86_64/multiarch/strlen.S
    fsf/trunk/libc/sysdeps/x86_64/strcmp.S

Modified: fsf/trunk/libc/ChangeLog
==============================================================================
--- fsf/trunk/libc/ChangeLog (original)
+++ fsf/trunk/libc/ChangeLog Sun Aug  9 00:05:21 2009
@@ -1,4 +1,57 @@
+2009-08-01  H.J. Lu  <hongjiu.lu@xxxxxxxxx>
+
+	* elf/Makefile (distribute): Add tst-audit6.c tst-auditmod6a.c
+	tst-auditmod6b.c tst-auditmod6c.c tst-audit7.c tst-auditmod7a.c
+	tst-auditmod7b.c.
+	(tests): Add tst-audit6 tst-audit7.
+	(modules-names): Add st-auditmod6a tst-auditmod6b tst-auditmod6c
+	tst-auditmod7a tst-auditmod7b.
+	($(objpfx)tst-audit6): New.
+	($(objpfx)tst-audit6.out): Likewise.
+	($(objpfx)tst-audit7): Likewise.
+	($(objpfx)tst-audit7.out): Likewise.
+	(tst-audit6-ENV): Likewise.
+	(tst-audit7-ENV): Likewise.
+	(CFLAGS-tst-auditmod6b.c): Likewise.
+	(CFLAGS-tst-auditmod6c.c): Likewise.
+	(CFLAGS-tst-auditmod7b.c): Likewise.
+	* elf/tst-audit6.c: New file.
+	* elf/tst-audit7.c: New file.
+	* elf/tst-auditmod6a.c: New file.
+	* elf/tst-auditmod6b.c: New file.
+	* elf/tst-auditmod6c.c: New file.
+	* elf/tst-auditmod7a.c: New file.
+	* elf/tst-auditmod7b.c: New file.
+	* sysdeps/x86_64/dl-trampoline.S (_dl_runtime_profile): Move
+	saving and restoring SSE/AVX registers to ...
+	* sysdeps/x86_64/dl-trampoline.h: This.  New file.
+
+2009-08-07  H.J. Lu  <hongjiu.lu@xxxxxxxxx>
+
+	* sysdeps/i386/i686/multiarch/strcspn.S (STRCSPN): Use PIC
+	only if SHARED is defined.
+	* sysdeps/i386/i686/multiarch/strspn.S (strspn): Likewise.
+
+2009-08-03  Jim Meyering  <meyering@xxxxxxxxxx>
+
+	* sysdeps/i386/configure.in: Use AC_HEADER_CHECK.
+
+2009-08-08  Ulrich Drepper  <drepper@xxxxxxxxxx>
+
+	* sysdeps/x86_64/multiarch/strlen.S: Move SSE4.2 version into the same
+	section as the other functions for this architecture.
+	* sysdeps/x86_64/multiarch/rawmemchr.S: Likewise.
+
 2009-08-07  Ulrich Drepper  <drepper@xxxxxxxxxx>
+
+	* sysdeps/x86_64/strcmp.S: Add support to compile with
+	USE_SSSE3.  In this case palignr is used.
+	* sysdeps/x86_64/multiarch/strcmp.S (strcmp): If SSE4.3 is not
+	available but SSSE3 is, pick __str{,n}cmp_ssse3.
+	* sysdeps/x86_64/multiarch/Makefile [subdir=string] (sysdep_routines):
+	Add strcmp-ssse3 and strncmp-ssse3.
+	* sysdeps/x86_64/multiarch/strcmp-ssse3.S: New file.
+	* sysdeps/x86_64/multiarch/strncmp-ssse3.S: New file.
 
 	* sysdeps/x86_64/multiarch/strcspn-c.c (STRCSPN_SSE42): Avoid
 	warning through fake initialization.

Modified: fsf/trunk/libc/NEWS
==============================================================================
--- fsf/trunk/libc/NEWS (original)
+++ fsf/trunk/libc/NEWS Sun Aug  9 00:05:21 2009
@@ -1,4 +1,4 @@
-GNU C Library NEWS -- history of user-visible changes.  2009-7-21
+GNU C Library NEWS -- history of user-visible changes.  2009-8-8
 Copyright (C) 1992-2008, 2009 Free Software Foundation, Inc.
 See the end for copying conditions.
 
@@ -17,11 +17,19 @@
   Implemented by H.J. Lu.
 
 * New optimized string functions for x86-64: strstr, strcasestr, memcmp,
-  strcspn, strpbrk, strspn, strcpy, stpcpy, strncpy, strcmp, strncmp.
+  strcspn, strpbrk, strspn, strcpy, stpcpy, strncpy, strcmp (SSE2, SSE4.2),
+  strncmp (SSE2, SSE4.2).
   Contributed by H.J. Lu.
 
-  strlen, rawmemchr.
-  Implemented by Ulrich Drepper.
+  strlen, rawmemchr, strcmp (SSSE3), strncmp (SSSE3).
+  Implemented by Ulrich Drepper.
+
+* New optimized string functions for x86: strlen, strcspn, strspn, strpbrk,
+  strstr, strcasestr.
+  Contributed by H.J. Lu.
+
+* Support for fma instruction in AVX on x86-64.
+  Implemented by H.J. Lu and Ulrich Drepper.
 
 * AVX support in x86-64 auditing support in ld.so.
   Implemented by H.J. Lu.
@@ -35,6 +43,16 @@
   the second request.  The 'single-request-reopen' option in /etc/resolv.conf
   can be used to select this mode right away, instead of rediscovering the
   necessity is every process again.
+  Implemented by Ulrich Drepper.
+
+* New resolver flag RES_USE_DNSSEC to enable use of verified lookup.
+  Implemented by Adam Tkac.
+
+* Optimized iconv conversions for S390x.
+  Implemented by Andreas Krebbel.
+
+* Using condvars with PI mutexes is now more efficient due to kernel
+  support for requeueing to PI futexes.  NPTL support added for x86-64.
   Implemented by Ulrich Drepper.
 
 

Modified: fsf/trunk/libc/elf/Makefile
==============================================================================
--- fsf/trunk/libc/elf/Makefile (original)
+++ fsf/trunk/libc/elf/Makefile Sun Aug  9 00:05:21 2009
@@ -93,6 +93,9 @@
 		   tst-auditmod1.c tst-auditmod3a.c tst-auditmod3b.c \
 		   tst-auditmod4a.c tst-auditmod4b.c \
 		   tst-audit5.c tst-auditmod5a.c tst-auditmod5b.c \
+		   tst-audit6.c tst-auditmod6a.c tst-auditmod6b.c \
+		   tst-auditmod6c.c \
+		   tst-audit7.c tst-auditmod7a.c tst-auditmod7b.c \
 		   order2mod1.c order2mod2.c order2mod3.c order2mod4.c \
 		   tst-stackguard1.c tst-stackguard1-static.c \
 		   tst-array5.c tst-array5-static.c tst-array5dep.c \
@@ -200,7 +203,7 @@
 test-srcs = tst-pathopt
 tests-execstack-yes = tst-execstack tst-execstack-needed tst-execstack-prog
 ifeq (x86_64,$(config-machine))
-tests += tst-audit3 tst-audit4 tst-audit5
+tests += tst-audit3 tst-audit4 tst-audit5 tst-audit6 tst-audit7
 endif
 endif
 ifeq (yesyes,$(have-fpie)$(build-shared))
@@ -255,7 +258,9 @@
 ifeq (x86_64,$(config-machine))
 modules-names += tst-auditmod3a tst-auditmod3b \
 		tst-auditmod4a tst-auditmod4b \
-		tst-auditmod5a tst-auditmod5b
+		tst-auditmod5a tst-auditmod5b \
+		tst-auditmod6a tst-auditmod6b tst-auditmod6c \
+		tst-auditmod7a tst-auditmod7b
 endif
 modules-execstack-yes = tst-execstack-mod
 extra-test-objs += $(addsuffix .os,$(strip $(modules-names)))
@@ -987,6 +992,15 @@
 $(objpfx)tst-audit5.out: $(objpfx)tst-auditmod5b.so
 tst-audit5-ENV = LD_AUDIT=$(objpfx)tst-auditmod5b.so
 
+$(objpfx)tst-audit6: $(objpfx)tst-auditmod6a.so
+$(objpfx)tst-audit6.out: $(objpfx)tst-auditmod6b.so \
+			 $(objpfx)tst-auditmod6c.so
+tst-audit6-ENV = LD_AUDIT=$(objpfx)tst-auditmod6b.so:$(objpfx)tst-auditmod6c.so
+
+$(objpfx)tst-audit7: $(objpfx)tst-auditmod7a.so
+$(objpfx)tst-audit7.out: $(objpfx)tst-auditmod7b.so
+tst-audit7-ENV = LD_AUDIT=$(objpfx)tst-auditmod7b.so
+
 $(objpfx)tst-global1: $(libdl)
 $(objpfx)tst-global1.out: $(objpfx)testobj6.so $(objpfx)testobj2.so
 
@@ -1134,4 +1148,7 @@
 CFLAGS-tst-audit4.c += -mavx
 CFLAGS-tst-auditmod4a.c += -mavx
 CFLAGS-tst-auditmod4b.c += -mavx
-endif
+CFLAGS-tst-auditmod6b.c += -mavx
+CFLAGS-tst-auditmod6c.c += -mavx
+CFLAGS-tst-auditmod7b.c += -mavx
+endif

Added: fsf/trunk/libc/elf/tst-audit6.c
==============================================================================
--- fsf/trunk/libc/elf/tst-audit6.c (added)
+++ fsf/trunk/libc/elf/tst-audit6.c Sun Aug  9 00:05:21 2009
@@ -1,0 +1,28 @@
+/* Test case for x86-64 preserved registers in dynamic linker.  */
+
+#include <stdlib.h>
+#include <string.h>
+#include <cpuid.h>
+#include <emmintrin.h>
+
+extern __m128i audit_test (__m128i, __m128i, __m128i, __m128i,
+			   __m128i, __m128i, __m128i, __m128i);
+
+int
+main (void)
+{
+  unsigned int eax, ebx, ecx, edx;
+
+  /* Run AVX test only if AVX is supported.  */
+  if (__get_cpuid (1, &eax, &ebx, &ecx, &edx)
+      && (ecx & bit_AVX))
+    {
+      __m128i xmm = _mm_setzero_si128 ();
+      __m128i ret = audit_test (xmm, xmm, xmm, xmm, xmm, xmm, xmm, xmm);
+
+      xmm = _mm_set1_epi32 (0x98abcdef);
+      if (memcmp (&xmm, &ret, sizeof (ret)))
+	abort ();
+    }
+  return 0;
+}

Added: fsf/trunk/libc/elf/tst-audit7.c
==============================================================================
--- fsf/trunk/libc/elf/tst-audit7.c (added)
+++ fsf/trunk/libc/elf/tst-audit7.c Sun Aug  9 00:05:21 2009
@@ -1,0 +1,1 @@
+#include "tst-audit6.c"

Added: fsf/trunk/libc/elf/tst-auditmod6a.c
==============================================================================
--- fsf/trunk/libc/elf/tst-auditmod6a.c (added)
+++ fsf/trunk/libc/elf/tst-auditmod6a.c Sun Aug  9 00:05:21 2009
@@ -1,0 +1,46 @@
+/* Test case for x86-64 preserved registers in dynamic linker.  */
+
+#include <stdlib.h>
+#include <string.h>
+#include <emmintrin.h>
+
+__m128i
+audit_test (__m128i x0, __m128i x1, __m128i x2, __m128i x3,
+	    __m128i x4, __m128i x5, __m128i x6, __m128i x7)
+{
+  __m128i xmm;
+
+  xmm =  _mm_set1_epi32 (0x100);
+  if (memcmp (&xmm, &x0, sizeof (xmm)))
+    abort ();
+
+  xmm =  _mm_set1_epi32 (0x101);
+  if (memcmp (&xmm, &x1, sizeof (xmm)))
+    abort ();
+
+  xmm =  _mm_set1_epi32 (0x102);
+  if (memcmp (&xmm, &x2, sizeof (xmm)))
+    abort ();
+
+  xmm =  _mm_set1_epi32 (0x103);
+  if (memcmp (&xmm, &x3, sizeof (xmm)))
+    abort ();
+
+  xmm =  _mm_set1_epi32 (0x104);
+  if (memcmp (&xmm, &x4, sizeof (xmm)))
+    abort ();
+
+  xmm =  _mm_set1_epi32 (0x105);
+  if (memcmp (&xmm, &x5, sizeof (xmm)))
+    abort ();
+
+  xmm =  _mm_set1_epi32 (0x106);
+  if (memcmp (&xmm, &x6, sizeof (xmm)))
+    abort ();
+
+  xmm =  _mm_set1_epi32 (0x107);
+  if (memcmp (&xmm, &x7, sizeof (xmm)))
+    abort ();
+
+  return _mm_setzero_si128 ();
+}

Added: fsf/trunk/libc/elf/tst-auditmod6b.c
==============================================================================
--- fsf/trunk/libc/elf/tst-auditmod6b.c (added)
+++ fsf/trunk/libc/elf/tst-auditmod6b.c Sun Aug  9 00:05:21 2009
@@ -1,0 +1,220 @@
+/* Verify that changing AVX registers in audit library won't affect
+   function parameter passing/return.  */
+
+#include <dlfcn.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <bits/wordsize.h>
+#include <gnu/lib-names.h>
+
+unsigned int
+la_version (unsigned int v)
+{
+  setlinebuf (stdout);
+
+  printf ("version: %u\n", v);
+
+  char buf[20];
+  sprintf (buf, "%u", v);
+
+  return v;
+}
+
+void
+la_activity (uintptr_t *cookie, unsigned int flag)
+{
+  if (flag == LA_ACT_CONSISTENT)
+    printf ("activity: consistent\n");
+  else if (flag == LA_ACT_ADD)
+    printf ("activity: add\n");
+  else if (flag == LA_ACT_DELETE)
+    printf ("activity: delete\n");
+  else
+    printf ("activity: unknown activity %u\n", flag);
+}
+
+char *
+la_objsearch (const char *name, uintptr_t *cookie, unsigned int flag)
+{
+  char buf[100];
+  const char *flagstr;
+  if (flag == LA_SER_ORIG)
+    flagstr = "LA_SET_ORIG";
+  else if (flag == LA_SER_LIBPATH)
+    flagstr = "LA_SER_LIBPATH";
+  else if (flag == LA_SER_RUNPATH)
+    flagstr = "LA_SER_RUNPATH";
+  else if (flag == LA_SER_CONFIG)
+    flagstr = "LA_SER_CONFIG";
+  else if (flag == LA_SER_DEFAULT)
+    flagstr = "LA_SER_DEFAULT";
+  else if (flag == LA_SER_SECURE)
+    flagstr = "LA_SER_SECURE";
+  else
+    {
+       sprintf (buf, "unknown flag %d", flag);
+       flagstr = buf;
+    }
+  printf ("objsearch: %s, %s\n", name, flagstr);
+
+  return (char *) name;
+}
+
+unsigned int
+la_objopen (struct link_map *l, Lmid_t lmid, uintptr_t *cookie)
+{
+  printf ("objopen: %ld, %s\n", lmid, l->l_name);
+
+  return 3;
+}
+
+void
+la_preinit (uintptr_t *cookie)
+{
+  printf ("preinit\n");
+}
+
+unsigned int
+la_objclose  (uintptr_t *cookie)
+{
+  printf ("objclose\n");
+  return 0;
+}
+
+uintptr_t
+la_symbind64 (Elf64_Sym *sym, unsigned int ndx, uintptr_t *refcook,
+	      uintptr_t *defcook, unsigned int *flags, const char *symname)
+{
+  printf ("symbind64: symname=%s, st_value=%#lx, ndx=%u, flags=%u\n",
+	  symname, (long int) sym->st_value, ndx, *flags);
+
+  return sym->st_value;
+}
+
+#define pltenter la_x86_64_gnu_pltenter
+#define pltexit la_x86_64_gnu_pltexit
+#define La_regs La_x86_64_regs
+#define La_retval La_x86_64_retval
+#define int_retval lrv_rax
+
+#include <tst-audit.h>
+
+#ifdef __AVX__
+#include <immintrin.h>
+#include <cpuid.h>
+
+static int avx = -1;
+
+static int
+__attribute ((always_inline))
+check_avx (void)
+{
+  if (avx == -1)
+    {
+      unsigned int eax, ebx, ecx, edx;
+
+      if (__get_cpuid (1, &eax, &ebx, &ecx, &edx)
+	  && (ecx & bit_AVX))
+	avx = 1;
+      else
+	avx = 0;
+    }
+  return avx;
+}
+#else
+#include <emmintrin.h>
+#endif
+
+ElfW(Addr)
+pltenter (ElfW(Sym) *sym, unsigned int ndx, uintptr_t *refcook,
+	  uintptr_t *defcook, La_regs *regs, unsigned int *flags,
+	  const char *symname, long int *framesizep)
+{
+  printf ("pltenter: symname=%s, st_value=%#lx, ndx=%u, flags=%u\n",
+	  symname, (long int) sym->st_value, ndx, *flags);
+
+#ifdef __AVX__
+  if (check_avx () && strcmp (symname, "audit_test") == 0)
+    {
+      int i;
+
+      __m128i xmm = _mm_setzero_si128 ();
+      for (i = 0; i < 8; i++)
+	if (memcmp (&regs->lr_xmm[i], &xmm, sizeof (xmm))
+	    || memcmp (&regs->lr_vector[i], &xmm, sizeof (xmm)))
+	  abort ();
+
+      for (i = 0; i < 8; i += 2)
+	{
+	  regs->lr_xmm[i] = (La_x86_64_xmm) _mm_set1_epi32 (i + 1);
+	  regs->lr_vector[i].xmm[0] = regs->lr_xmm[i];
+	  regs->lr_vector[i + 1].ymm[0]
+	    = (La_x86_64_ymm) _mm256_set1_epi32 (i + 2);
+	  regs->lr_xmm[i + 1] = regs->lr_vector[i + 1].xmm[0];
+	}
+
+      __m256i ymm = _mm256_set1_epi32 (-1);
+      asm volatile ("vmovdqa %0, %%ymm0" : : "x" (ymm) : "xmm0" );
+      asm volatile ("vmovdqa %0, %%ymm1" : : "x" (ymm) : "xmm1" );
+      asm volatile ("vmovdqa %0, %%ymm2" : : "x" (ymm) : "xmm2" );
+      asm volatile ("vmovdqa %0, %%ymm3" : : "x" (ymm) : "xmm3" );
+      asm volatile ("vmovdqa %0, %%ymm4" : : "x" (ymm) : "xmm4" );
+      asm volatile ("vmovdqa %0, %%ymm5" : : "x" (ymm) : "xmm5" );
+      asm volatile ("vmovdqa %0, %%ymm6" : : "x" (ymm) : "xmm6" );
+      asm volatile ("vmovdqa %0, %%ymm7" : : "x" (ymm) : "xmm7" );
+
+      *framesizep = 1024;
+    }
+#endif
+
+  return sym->st_value;
+}
+
+unsigned int
+pltexit (ElfW(Sym) *sym, unsigned int ndx, uintptr_t *refcook,
+	 uintptr_t *defcook, const La_regs *inregs, La_retval *outregs,
+	 const char *symname)
+{
+  printf ("pltexit: symname=%s, st_value=%#lx, ndx=%u, retval=%tu\n",
+	  symname, (long int) sym->st_value, ndx, outregs->int_retval);
+
+#ifdef __AVX__
+  if (check_avx () && strcmp (symname, "audit_test") == 0)
+    {
+      int i;
+
+      __m128i xmm = _mm_setzero_si128 ();
+      if (memcmp (&outregs->lrv_xmm0, &xmm, sizeof (xmm))
+	  || memcmp (&outregs->lrv_vector0, &xmm, sizeof (xmm)))
+	abort ();
+
+      __m256i ymm;
+
+      for (i = 0; i < 8; i += 2)
+	{
+	  xmm = _mm_set1_epi32 (i + 0x100);
+	  if (memcmp (&inregs->lr_xmm[i], &xmm, sizeof (xmm))
+	      || memcmp (&inregs->lr_vector[i], &xmm, sizeof (xmm)))
+	    abort ();
+
+	  ymm = _mm256_set1_epi32 (i + 0x101);
+	  if (memcmp (&inregs->lr_xmm[i + 1],
+		      &inregs->lr_vector[i + 1].xmm[0], sizeof (xmm))
+	      || memcmp (&inregs->lr_vector[i + 1], &ymm, sizeof (ymm)))
+	    abort ();
+	}
+
+      outregs->lrv_vector0.ymm[0]
+	= (La_x86_64_ymm) _mm256_set1_epi32 (0x12349876);
+
+      ymm = _mm256_set1_epi32 (-1);
+      asm volatile ("vmovdqa %0, %%ymm0" : : "x" (ymm) : "xmm0" );
+      asm volatile ("vmovdqa %0, %%ymm1" : : "x" (ymm) : "xmm1" );
+    }
+#endif
+
+  return 0;
+}

Added: fsf/trunk/libc/elf/tst-auditmod6c.c
==============================================================================
--- fsf/trunk/libc/elf/tst-auditmod6c.c (added)
+++ fsf/trunk/libc/elf/tst-auditmod6c.c Sun Aug  9 00:05:21 2009
@@ -1,0 +1,225 @@
+/* Verify that changing AVX registers in audit library won't affect
+   function parameter passing/return.  */
+
+#include <dlfcn.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <bits/wordsize.h>
+#include <gnu/lib-names.h>
+
+unsigned int
+la_version (unsigned int v)
+{
+  setlinebuf (stdout);
+
+  printf ("version: %u\n", v);
+
+  char buf[20];
+  sprintf (buf, "%u", v);
+
+  return v;
+}
+
+void
+la_activity (uintptr_t *cookie, unsigned int flag)
+{
+  if (flag == LA_ACT_CONSISTENT)
+    printf ("activity: consistent\n");
+  else if (flag == LA_ACT_ADD)
+    printf ("activity: add\n");
+  else if (flag == LA_ACT_DELETE)
+    printf ("activity: delete\n");
+  else
+    printf ("activity: unknown activity %u\n", flag);
+}
+
+char *
+la_objsearch (const char *name, uintptr_t *cookie, unsigned int flag)
+{
+  char buf[100];
+  const char *flagstr;
+  if (flag == LA_SER_ORIG)
+    flagstr = "LA_SET_ORIG";
+  else if (flag == LA_SER_LIBPATH)
+    flagstr = "LA_SER_LIBPATH";
+  else if (flag == LA_SER_RUNPATH)
+    flagstr = "LA_SER_RUNPATH";
+  else if (flag == LA_SER_CONFIG)
+    flagstr = "LA_SER_CONFIG";
+  else if (flag == LA_SER_DEFAULT)
+    flagstr = "LA_SER_DEFAULT";
+  else if (flag == LA_SER_SECURE)
+    flagstr = "LA_SER_SECURE";
+  else
+    {
+       sprintf (buf, "unknown flag %d", flag);
+       flagstr = buf;
+    }
+  printf ("objsearch: %s, %s\n", name, flagstr);
+
+  return (char *) name;
+}
+
+unsigned int
+la_objopen (struct link_map *l, Lmid_t lmid, uintptr_t *cookie)
+{
+  printf ("objopen: %ld, %s\n", lmid, l->l_name);
+
+  return 3;
+}
+
+void
+la_preinit (uintptr_t *cookie)
+{
+  printf ("preinit\n");
+}
+
+unsigned int
+la_objclose  (uintptr_t *cookie)
+{
+  printf ("objclose\n");
+  return 0;
+}
+
+uintptr_t
+la_symbind64 (Elf64_Sym *sym, unsigned int ndx, uintptr_t *refcook,
+	      uintptr_t *defcook, unsigned int *flags, const char *symname)
+{
+  printf ("symbind64: symname=%s, st_value=%#lx, ndx=%u, flags=%u\n",
+	  symname, (long int) sym->st_value, ndx, *flags);
+
+  return sym->st_value;
+}
+
+#define pltenter la_x86_64_gnu_pltenter
+#define pltexit la_x86_64_gnu_pltexit
+#define La_regs La_x86_64_regs
+#define La_retval La_x86_64_retval
+#define int_retval lrv_rax
+
+#include <tst-audit.h>
+
+#ifdef __AVX__
+#include <immintrin.h>
+#include <cpuid.h>
+
+static int avx = -1;
+
+static int
+__attribute ((always_inline))
+check_avx (void)
+{
+  if (avx == -1)
+    {
+      unsigned int eax, ebx, ecx, edx;
+
+      if (__get_cpuid (1, &eax, &ebx, &ecx, &edx)
+	  && (ecx & bit_AVX))
+	avx = 1;
+      else
+	avx = 0;
+    }
+  return avx;
+}
+#else
+#include <emmintrin.h>
+#endif
+
+ElfW(Addr)
+pltenter (ElfW(Sym) *sym, unsigned int ndx, uintptr_t *refcook,
+	  uintptr_t *defcook, La_regs *regs, unsigned int *flags,
+	  const char *symname, long int *framesizep)
+{
+  printf ("pltenter: symname=%s, st_value=%#lx, ndx=%u, flags=%u\n",
+	  symname, (long int) sym->st_value, ndx, *flags);
+
+#ifdef __AVX__
+  if (check_avx () && strcmp (symname, "audit_test") == 0)
+    {
+      int i;
+      __m128i xmm;
+      __m256i ymm;
+
+      for (i = 0; i < 8; i += 2)
+	{
+	  xmm = _mm_set1_epi32 (i + 1);
+	  if (memcmp (&regs->lr_xmm[i], &xmm, sizeof (xmm))
+	      || memcmp (&regs->lr_vector[i], &xmm, sizeof (xmm)))
+	    abort ();
+	  regs->lr_xmm[i] = (La_x86_64_xmm) _mm_set1_epi32 (i + 0x100);
+	  regs->lr_vector[i].xmm[0] = regs->lr_xmm[i];
+
+	  ymm = _mm256_set1_epi32 (i + 2);
+	  if (memcmp (&regs->lr_xmm[i + 1],
+		      &regs->lr_vector[i + 1].xmm[0], sizeof (xmm))
+	      || memcmp (&regs->lr_vector[i + 1], &ymm, sizeof (ymm)))
+	    abort ();
+	  regs->lr_vector[i + 1].ymm[0]
+	    = (La_x86_64_ymm) _mm256_set1_epi32 (i + 0x101);
+	  regs->lr_xmm[i + 1] = regs->lr_vector[i + 1].xmm[0];
+	}
+
+      ymm = _mm256_set1_epi32 (-1);
+      asm volatile ("vmovdqa %0, %%ymm0" : : "x" (ymm) : "xmm0" );
+      asm volatile ("vmovdqa %0, %%ymm1" : : "x" (ymm) : "xmm1" );
+      asm volatile ("vmovdqa %0, %%ymm2" : : "x" (ymm) : "xmm2" );
+      asm volatile ("vmovdqa %0, %%ymm3" : : "x" (ymm) : "xmm3" );
+      asm volatile ("vmovdqa %0, %%ymm4" : : "x" (ymm) : "xmm4" );
+      asm volatile ("vmovdqa %0, %%ymm5" : : "x" (ymm) : "xmm5" );
+      asm volatile ("vmovdqa %0, %%ymm6" : : "x" (ymm) : "xmm6" );
+      asm volatile ("vmovdqa %0, %%ymm7" : : "x" (ymm) : "xmm7" );
+
+      *framesizep = 1024;
+    }
+#endif
+
+  return sym->st_value;
+}
+
+unsigned int
+pltexit (ElfW(Sym) *sym, unsigned int ndx, uintptr_t *refcook,
+	 uintptr_t *defcook, const La_regs *inregs, La_retval *outregs,
+	 const char *symname)
+{
+  printf ("pltexit: symname=%s, st_value=%#lx, ndx=%u, retval=%tu\n",
+	  symname, (long int) sym->st_value, ndx, outregs->int_retval);
+
+#ifdef __AVX__
+  if (check_avx () && strcmp (symname, "audit_test") == 0)
+    {
+      int i;
+
+      __m256i ymm = _mm256_set1_epi32 (0x12349876);;
+      if (memcmp (&outregs->lrv_vector0, &ymm, sizeof (ymm)))
+	abort ();
+
+      __m128i xmm;
+
+      for (i = 0; i < 8; i += 2)
+	{
+	  xmm = _mm_set1_epi32 (i + 0x100);
+	  if (memcmp (&inregs->lr_xmm[i], &xmm, sizeof (xmm))
+	      || memcmp (&inregs->lr_vector[i], &xmm, sizeof (xmm)))
+	    abort ();
+
+	  ymm = _mm256_set1_epi32 (i + 0x101);
+	  if (memcmp (&inregs->lr_xmm[i + 1],
+		      &inregs->lr_vector[i + 1].xmm[0], sizeof (xmm))
+	      || memcmp (&inregs->lr_vector[i + 1], &ymm, sizeof (ymm)))
+	    abort ();
+	}
+
+      outregs->lrv_vector0.ymm[0]
+	= (La_x86_64_ymm) _mm256_set1_epi32 (0x98abcdef);
+
+      ymm = _mm256_set1_epi32 (-1);
+      asm volatile ("vmovdqa %0, %%ymm0" : : "x" (ymm) : "xmm0" );
+      asm volatile ("vmovdqa %0, %%ymm1" : : "x" (ymm) : "xmm1" );
+    }
+#endif
+
+  return 0;
+}

Added: fsf/trunk/libc/elf/tst-auditmod7a.c
==============================================================================
--- fsf/trunk/libc/elf/tst-auditmod7a.c (added)
+++ fsf/trunk/libc/elf/tst-auditmod7a.c Sun Aug  9 00:05:21 2009
@@ -1,0 +1,1 @@
+#include "tst-auditmod6a.c"

Added: fsf/trunk/libc/elf/tst-auditmod7b.c
==============================================================================
--- fsf/trunk/libc/elf/tst-auditmod7b.c (added)
+++ fsf/trunk/libc/elf/tst-auditmod7b.c Sun Aug  9 00:05:21 2009
@@ -1,0 +1,218 @@
+/* Verify that changing AVX registers in audit library won't affect
+   function parameter passing/return.  */
+
+#include <dlfcn.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <bits/wordsize.h>
+#include <gnu/lib-names.h>
+
+unsigned int
+la_version (unsigned int v)
+{
+  setlinebuf (stdout);
+
+  printf ("version: %u\n", v);
+
+  char buf[20];
+  sprintf (buf, "%u", v);
+
+  return v;
+}
+
+void
+la_activity (uintptr_t *cookie, unsigned int flag)
+{
+  if (flag == LA_ACT_CONSISTENT)
+    printf ("activity: consistent\n");
+  else if (flag == LA_ACT_ADD)
+    printf ("activity: add\n");
+  else if (flag == LA_ACT_DELETE)
+    printf ("activity: delete\n");
+  else
+    printf ("activity: unknown activity %u\n", flag);
+}
+
+char *
+la_objsearch (const char *name, uintptr_t *cookie, unsigned int flag)
+{
+  char buf[100];
+  const char *flagstr;
+  if (flag == LA_SER_ORIG)
+    flagstr = "LA_SET_ORIG";
+  else if (flag == LA_SER_LIBPATH)
+    flagstr = "LA_SER_LIBPATH";
+  else if (flag == LA_SER_RUNPATH)
+    flagstr = "LA_SER_RUNPATH";
+  else if (flag == LA_SER_CONFIG)
+    flagstr = "LA_SER_CONFIG";
+  else if (flag == LA_SER_DEFAULT)
+    flagstr = "LA_SER_DEFAULT";
+  else if (flag == LA_SER_SECURE)
+    flagstr = "LA_SER_SECURE";
+  else
+    {
+       sprintf (buf, "unknown flag %d", flag);
+       flagstr = buf;
+    }
+  printf ("objsearch: %s, %s\n", name, flagstr);
+
+  return (char *) name;
+}
+
+unsigned int
+la_objopen (struct link_map *l, Lmid_t lmid, uintptr_t *cookie)
+{
+  printf ("objopen: %ld, %s\n", lmid, l->l_name);
+
+  return 3;
+}
+
+void
+la_preinit (uintptr_t *cookie)
+{
+  printf ("preinit\n");
+}
+
+unsigned int
+la_objclose  (uintptr_t *cookie)
+{
+  printf ("objclose\n");
+  return 0;
+}
+
+uintptr_t
+la_symbind64 (Elf64_Sym *sym, unsigned int ndx, uintptr_t *refcook,
+	      uintptr_t *defcook, unsigned int *flags, const char *symname)
+{
+  printf ("symbind64: symname=%s, st_value=%#lx, ndx=%u, flags=%u\n",
+	  symname, (long int) sym->st_value, ndx, *flags);
+
+  return sym->st_value;
+}
+
+#define pltenter la_x86_64_gnu_pltenter
+#define pltexit la_x86_64_gnu_pltexit
+#define La_regs La_x86_64_regs
+#define La_retval La_x86_64_retval
+#define int_retval lrv_rax
+
+#include <tst-audit.h>
+
+#ifdef __AVX__
+#include <immintrin.h>
+#include <cpuid.h>
+
+static int avx = -1;
+
+static int
+__attribute ((always_inline))
+check_avx (void)
+{
+  if (avx == -1)
+    {
+      unsigned int eax, ebx, ecx, edx;
+
+      if (__get_cpuid (1, &eax, &ebx, &ecx, &edx)
+	  && (ecx & bit_AVX))
+	avx = 1;
+      else
+	avx = 0;
+    }
+  return avx;
+}
+#else
+#include <emmintrin.h>
+#endif
+
+ElfW(Addr)
+pltenter (ElfW(Sym) *sym, unsigned int ndx, uintptr_t *refcook,
+	  uintptr_t *defcook, La_regs *regs, unsigned int *flags,
+	  const char *symname, long int *framesizep)
+{
+  printf ("pltenter: symname=%s, st_value=%#lx, ndx=%u, flags=%u\n",
+	  symname, (long int) sym->st_value, ndx, *flags);
+
+#ifdef __AVX__
+  if (check_avx () && strcmp (symname, "audit_test") == 0)
+    {
+      int i;
+
+      __m128i xmm = _mm_setzero_si128 ();
+      for (i = 0; i < 8; i++)
+	if (memcmp (&regs->lr_xmm[i], &xmm, sizeof (xmm))
+	    || memcmp (&regs->lr_vector[i], &xmm, sizeof (xmm)))
+	  abort ();
+
+      for (i = 0; i < 8; i += 2)
+	{
+	  regs->lr_xmm[i] = (La_x86_64_xmm) _mm_set1_epi32 (i + 0x100);
+	  regs->lr_vector[i + 1].ymm[0]
+	    = (La_x86_64_ymm) _mm256_set1_epi32 (i + 0x101);
+	}
+
+      __m256i ymm = _mm256_set1_epi32 (-1);
+      asm volatile ("vmovdqa %0, %%ymm0" : : "x" (ymm) : "xmm0" );
+      asm volatile ("vmovdqa %0, %%ymm1" : : "x" (ymm) : "xmm1" );
+      asm volatile ("vmovdqa %0, %%ymm2" : : "x" (ymm) : "xmm2" );
+      asm volatile ("vmovdqa %0, %%ymm3" : : "x" (ymm) : "xmm3" );
+      asm volatile ("vmovdqa %0, %%ymm4" : : "x" (ymm) : "xmm4" );
+      asm volatile ("vmovdqa %0, %%ymm5" : : "x" (ymm) : "xmm5" );
+      asm volatile ("vmovdqa %0, %%ymm6" : : "x" (ymm) : "xmm6" );
+      asm volatile ("vmovdqa %0, %%ymm7" : : "x" (ymm) : "xmm7" );
+
+      *framesizep = 1024;
+    }
+#endif
+
+  return sym->st_value;
+}
+
+unsigned int
+pltexit (ElfW(Sym) *sym, unsigned int ndx, uintptr_t *refcook,
+	 uintptr_t *defcook, const La_regs *inregs, La_retval *outregs,
+	 const char *symname)
+{
+  printf ("pltexit: symname=%s, st_value=%#lx, ndx=%u, retval=%tu\n",
+	  symname, (long int) sym->st_value, ndx, outregs->int_retval);
+
+#ifdef __AVX__
+  if (check_avx () && strcmp (symname, "audit_test") == 0)
+    {
+      int i;
+
+      __m128i xmm = _mm_setzero_si128 ();
+      if (memcmp (&outregs->lrv_xmm0, &xmm, sizeof (xmm))
+	  || memcmp (&outregs->lrv_vector0, &xmm, sizeof (xmm)))
+	abort ();
+
+      __m256i ymm;
+
+      for (i = 0; i < 8; i += 2)
+	{
+	  xmm = _mm_set1_epi32 (i + 0x100);
+	  if (memcmp (&inregs->lr_xmm[i], &xmm, sizeof (xmm))
+	      || memcmp (&inregs->lr_vector[i], &xmm, sizeof (xmm)))
+	    abort ();
+
+	  ymm = _mm256_set1_epi32 (i + 0x101);
+	  if (memcmp (&inregs->lr_xmm[i + 1],
+		      &inregs->lr_vector[i + 1].xmm[0], sizeof (xmm))
+	      || memcmp (&inregs->lr_vector[i + 1], &ymm, sizeof (ymm)))
+	    abort ();
+	}
+
+      outregs->lrv_vector0.ymm[0]
+	= (La_x86_64_ymm) _mm256_set1_epi32 (0x98abcdef);
+
+      ymm = _mm256_set1_epi32 (-1);
+      asm volatile ("vmovdqa %0, %%ymm0" : : "x" (ymm) : "xmm0" );
+      asm volatile ("vmovdqa %0, %%ymm1" : : "x" (ymm) : "xmm1" );
+    }
+#endif
+
+  return 0;
+}

Modified: fsf/trunk/libc/nptl/ChangeLog
==============================================================================
--- fsf/trunk/libc/nptl/ChangeLog (original)
+++ fsf/trunk/libc/nptl/ChangeLog Sun Aug  9 00:05:21 2009
@@ -1,3 +1,12 @@
+2009-08-08  Ulrich Drepper  <drepper@xxxxxxxxxx>
+
+	* sysdeps/unix/sysv/linux/x86_64/sem_timedwait.S (sem_timedwait):
+	Optimize code path used when FUTEX_CLOCK_REALTIME is supported.
+
+	* sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S
+	(__pthread_cond_wait): Optimize by avoiding use of callee-safe
+	register.
+
 2009-08-07  Ulrich Drepper  <drepper@xxxxxxxxxx>
 
 	* sysdeps/unix/sysv/linux/x86_64/sem_wait.S: Little optimizations

Modified: fsf/trunk/libc/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S
==============================================================================
--- fsf/trunk/libc/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S (original)
+++ fsf/trunk/libc/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S Sun Aug  9 00:05:21 2009
@@ -45,9 +45,6 @@
 	cfi_lsda(DW_EH_PE_udata4, .LexceptSTART)
 #endif
 
-	pushq	%r13
-	cfi_adjust_cfa_offset(8)
-	cfi_rel_offset(%r13, 0)
 #define FRAME_SIZE 32
 	leaq	-FRAME_SIZE(%rsp), %rsp
 	cfi_adjust_cfa_offset(FRAME_SIZE)
@@ -140,7 +137,7 @@
 	movl	$SYS_futex, %eax
 	syscall
 
-	movl	$1, %r13d
+	movl	$1, %r8d
 #ifdef __ASSUME_REQUEUE_PI
 	jmp	62f
 #else
@@ -158,7 +155,7 @@
 #else
 	orl	%fs:PRIVATE_FUTEX, %esi
 #endif
-60:	xorl	%r13d, %r13d
+60:	xorl	%r8d, %r8d
 	movl	$SYS_futex, %eax
 	syscall
 
@@ -233,20 +230,18 @@
 	/* If requeue_pi is used the kernel performs the locking of the
 	   mutex. */
 11:	movq	16(%rsp), %rdi
-	testl	%r13d, %r13d
+	testl	%r8d, %r8d
 	jnz	18f
 
 	callq	__pthread_mutex_cond_lock
 
-14:	movq	FRAME_SIZE(%rsp), %r13
-	leaq	FRAME_SIZE+8(%rsp), %rsp
-	cfi_adjust_cfa_offset(-(FRAME_SIZE + 8))
+14:	leaq	FRAME_SIZE(%rsp), %rsp
+	cfi_adjust_cfa_offset(-FRAME_SIZE)
 
 	/* We return the result of the mutex_lock operation.  */
 	retq
 
-	cfi_adjust_cfa_offset(8 + FRAME_SIZE)
-	cfi_rel_offset(%r13, FRAME_SIZE)
+	cfi_adjust_cfa_offset(FRAME_SIZE)
 
 18:	callq	__pthread_mutex_cond_lock_adjust
 	xorl	%eax, %eax
@@ -341,9 +336,7 @@
 __condvar_cleanup1:
 	/* Stack frame:
 
-	   rsp + 40
-		    +--------------------------+
-	   rsp + 32 | %r13                     |
+	   rsp + 32
 		    +--------------------------+
 	   rsp + 24 | unused                   |
 	            +--------------------------+
@@ -465,7 +458,6 @@
 	callq	__pthread_mutex_cond_lock
 
 	movq	24(%rsp), %rdi
-	movq	32(%rsp), %r13
 .LcallUR:
 	call	_Unwind_Resume@PLT
 	hlt

Modified: fsf/trunk/libc/nptl/sysdeps/unix/sysv/linux/x86_64/sem_timedwait.S
==============================================================================
--- fsf/trunk/libc/nptl/sysdeps/unix/sysv/linux/x86_64/sem_timedwait.S (original)
+++ fsf/trunk/libc/nptl/sysdeps/unix/sysv/linux/x86_64/sem_timedwait.S Sun Aug  9 00:05:21 2009
@@ -65,33 +65,8 @@
 	retq
 
 	/* Check whether the timeout value is valid.  */
-1:	pushq	%r12
-	cfi_adjust_cfa_offset(8)
-	cfi_rel_offset(%r12, 0)
-	pushq	%r13
-	cfi_adjust_cfa_offset(8)
-	cfi_rel_offset(%r13, 0)
-	pushq	%r14
-	cfi_adjust_cfa_offset(8)
-	cfi_rel_offset(%r14, 0)
-#ifdef __ASSUME_FUTEX_CLOCK_REALTIME
-# define STACKFRAME 8
-#else
-# define STACKFRAME 24
-#endif
-	subq	$STACKFRAME, %rsp
-	cfi_adjust_cfa_offset(STACKFRAME)
-
-	movq	%rdi, %r12
-	movq	%rsi, %r13
-
-	/* Check for invalid nanosecond field.  */
-	cmpq	$1000000000, 8(%r13)
-	movl	$EINVAL, %r14d
+1:	cmpq	$1000000000, 8(%rsi)
 	jae	6f
-
-	LOCK
-	addq	$1, NWAITERS(%r12)
 
 #ifndef __ASSUME_FUTEX_CLOCK_REALTIME
 #  ifdef PIC
@@ -102,15 +77,22 @@
 	je	.Lreltmo
 #endif
 
+	/* This push is only needed to store the sem_t pointer for the
+	   exception handler.  */
+	pushq	%rdi
+	cfi_adjust_cfa_offset(8)
+
+	movq	%rsi, %r10
+
+	LOCK
+	addq	$1, NWAITERS(%rdi)
+
 .LcleanupSTART:
 13:	call	__pthread_enable_asynccancel
-	movl	%eax, (%rsp)
-
-	movq	%r13, %r10
-#if VALUE == 0
-	movq	%r12, %rdi
-#else
-	leaq	VALUE(%r12), %rdi
+	movl	%eax, %r8d
+
+#if VALUE != 0
+	leaq	VALUE(%rdi), %rdi
 #endif
 	movl	$0xffffffff, %r9d
 	movl	$FUTEX_WAIT_BITSET|FUTEX_CLOCK_REALTIME, %esi
@@ -118,22 +100,26 @@
 	movl	$SYS_futex, %eax
 	xorl	%edx, %edx
 	syscall
-	movq	%rax, %r14
-
-	movl	(%rsp), %edi
+	movq	%rax, %r9
+#if VALUE != 0
+	leaq	-VALUE(%rdi), %rdi
+#endif
+
+	xchgq	%r8, %rdi
 	call	__pthread_disable_asynccancel
 .LcleanupEND:
-
-	testq	%r14, %r14
+	movq	%r8, %rdi
+
+	testq	%r9, %r9
 	je	11f
-	cmpq	$-EWOULDBLOCK, %r14
+	cmpq	$-EWOULDBLOCK, %r9
 	jne	3f
 
 11:
 #if VALUE == 0
-	movl	(%r12), %eax
-#else
-	movl	VALUE(%r12), %eax
+	movl	(%rdi), %eax
+#else
+	movl	VALUE(%rdi), %eax
 #endif
 14:	testl	%eax, %eax
 	je	13b
@@ -141,49 +127,74 @@
 	leaq	-1(%rax), %rcx
 	LOCK
 #if VALUE == 0
-	cmpxchgl %ecx, (%r12)
-#else
-	cmpxchgl %ecx, VALUE(%r12)
+	cmpxchgl %ecx, (%rdi)
+#else
+	cmpxchgl %ecx, VALUE(%rdi)
 #endif
 	jne	14b
 
-10:	xorl	%eax, %eax
+	xorl	%eax, %eax
 
 15:	LOCK
-	subq	$1, NWAITERS(%r12)
-
-	addq	$STACKFRAME, %rsp
-	cfi_adjust_cfa_offset(-STACKFRAME)
-	popq	%r14
-	cfi_adjust_cfa_offset(-8)
-	cfi_restore(%r14)
-	popq	%r13
-	cfi_adjust_cfa_offset(-8)
-	cfi_restore(%r13)
-	popq	%r12
-	cfi_adjust_cfa_offset(-8)
-	cfi_restore(%r12)
+	subq	$1, NWAITERS(%rdi)
+
+	leaq	8(%rsp), %rsp
+	cfi_adjust_cfa_offset(-8)
 	retq
 
-	cfi_adjust_cfa_offset(STACKFRAME + 3 * 8)
-	cfi_rel_offset(%r12, STACKFRAME + 2 * 8)
-	cfi_rel_offset(%r13, STACKFRAME + 1 * 8)
-	cfi_rel_offset(%r14, STACKFRAME)
-3:	negq	%r14
+	cfi_adjust_cfa_offset(8)
+3:	negq	%r9
+#if USE___THREAD
+	movq	errno@gottpoff(%rip), %rdx
+	movl	%r9d, %fs:(%rdx)
+#else
+	callq	__errno_location@plt
+	movl	%r9d, (%rax)
+#endif
+
+	orl	$-1, %eax
+	jmp	15b
+
+	cfi_adjust_cfa_offset(-8)
 6:
 #if USE___THREAD
 	movq	errno@gottpoff(%rip), %rdx
-	movl	%r14d, %fs:(%rdx)
+	movl	$EINVAL, %fs:(%rdx)
 #else
 	callq	__errno_location@plt
-	movl	%r14d, (%rax)
+	movl	$EINVAL, (%rax)
 #endif
 
 	orl	$-1, %eax
-	jmp	15b
+
+	retq
 
 #ifndef __ASSUME_FUTEX_CLOCK_REALTIME
 .Lreltmo:
+	pushq	%r12
+	cfi_adjust_cfa_offset(8)
+	cfi_rel_offset(%r12, 0)
+	pushq	%r13
+	cfi_adjust_cfa_offset(8)
+	cfi_rel_offset(%r13, 0)
+	pushq	%r14
+	cfi_adjust_cfa_offset(8)
+	cfi_rel_offset(%r14, 0)
+
+#ifdef __ASSUME_FUTEX_CLOCK_REALTIME
+# define STACKFRAME 8
+#else
+# define STACKFRAME 24
+#endif
+	subq	$STACKFRAME, %rsp
+	cfi_adjust_cfa_offset(STACKFRAME)
+
+	movq	%rdi, %r12
+	movq	%rsi, %r13
+
+	LOCK
+	addq	$1, NWAITERS(%r12)
+
 7:	xorl	%esi, %esi
 	movq	%rsp, %rdi
 	movq	$VSYSCALL_ADDR_vgettimeofday, %rax
@@ -202,7 +213,7 @@
 	decq	%rdi
 5:	testq	%rdi, %rdi
 	movl	$ETIMEDOUT, %r14d
-	js	6b		/* Time is already up.  */
+	js	36f		/* Time is already up.  */
 
 	movq	%rdi, (%rsp)	/* Store relative timeout.  */
 	movq	%rsi, 8(%rsp)
@@ -235,7 +246,7 @@
 	testq	%r14, %r14
 	je	9f
 	cmpq	$-EWOULDBLOCK, %r14
-	jne	3b
+	jne	33f
 
 9:
 # if VALUE == 0
@@ -254,15 +265,54 @@
 	cmpxchgl %ecx, VALUE(%r12)
 # endif
 	jne	8b
-	jmp	10b
-#endif
+
+	xorl	%eax, %eax
+
+45:	LOCK
+	subq	$1, NWAITERS(%r12)
+
+	addq	$STACKFRAME, %rsp
+	cfi_adjust_cfa_offset(-STACKFRAME)
+	popq	%r14
+	cfi_adjust_cfa_offset(-8)
+	cfi_restore(%r14)
+	popq	%r13
+	cfi_adjust_cfa_offset(-8)
+	cfi_restore(%r13)
+	popq	%r12
+	cfi_adjust_cfa_offset(-8)
+	cfi_restore(%r12)
+	retq
+
+	cfi_adjust_cfa_offset(STACKFRAME + 3 * 8)
+	cfi_rel_offset(%r12, STACKFRAME + 2 * 8)
+	cfi_rel_offset(%r13, STACKFRAME + 1 * 8)
+	cfi_rel_offset(%r14, STACKFRAME)
+33:	negq	%r14
+36:
+#if USE___THREAD
+	movq	errno@gottpoff(%rip), %rdx
+	movl	%r14d, %fs:(%rdx)
+#else
+	callq	__errno_location@plt
+	movl	%r14d, (%rax)
+#endif
+
+	orl	$-1, %eax
+	jmp	45b
+#endif
+	cfi_endproc
 	.size	sem_timedwait,.-sem_timedwait
 
 
 	.type	sem_timedwait_cleanup,@function
 sem_timedwait_cleanup:
-	LOCK
-	subq	$1, NWAITERS(%r12)
+	cfi_startproc
+	cfi_adjust_cfa_offset(8)
+
+	movq	(%rsp), %rdi
+	LOCK
+	subq	$1, NWAITERS(%rdi)
 	movq	%rax, %rdi
 .LcallUR:
 	call	_Unwind_Resume@PLT
@@ -270,6 +320,30 @@
 .LENDCODE:
 	cfi_endproc
 	.size	sem_timedwait_cleanup,.-sem_timedwait_cleanup
+
+
+#ifndef __ASSUME_FUTEX_CLOCK_REALTIME
+	.type	sem_timedwait_cleanup2,@function
+sem_timedwait_cleanup2:
+	cfi_startproc
+	cfi_adjust_cfa_offset(STACKFRAME + 3 * 8)
+	cfi_rel_offset(%r12, STACKFRAME + 2 * 8)
+	cfi_rel_offset(%r13, STACKFRAME + 1 * 8)
+	cfi_rel_offset(%r14, STACKFRAME)
+
+	LOCK
+	subq	$1, NWAITERS(%r12)
+	movq	%rax, %rdi
+	movq	STACKFRAME(%rsp), %r14
+	movq	STACKFRAME+8(%rsp), %r13
+	movq	STACKFRAME+16(%rsp), %r12
+.LcallUR2:
+	call	_Unwind_Resume@PLT
+	hlt
+.LENDCODE2:
+	cfi_endproc
+	.size	sem_timedwait_cleanup2,.-sem_timedwait_cleanup2
+#endif
 
 
 	.section .gcc_except_table,"a",@progbits
@@ -286,13 +360,19 @@
 #ifndef __ASSUME_FUTEX_CLOCK_REALTIME
 	.uleb128 .LcleanupSTART2-.LSTARTCODE
 	.uleb128 .LcleanupEND2-.LcleanupSTART2
-	.uleb128 sem_timedwait_cleanup-.LSTARTCODE
+	.uleb128 sem_timedwait_cleanup2-.LSTARTCODE
 	.uleb128  0
 #endif
 	.uleb128 .LcallUR-.LSTARTCODE
 	.uleb128 .LENDCODE-.LcallUR
 	.uleb128 0
 	.uleb128  0
+#ifndef __ASSUME_FUTEX_CLOCK_REALTIME
+	.uleb128 .LcallUR2-.LSTARTCODE
+	.uleb128 .LENDCODE2-.LcallUR2
+	.uleb128 0
+	.uleb128  0
+#endif
 .Lcstend:
 
 

Modified: fsf/trunk/libc/sysdeps/i386/configure
==============================================================================
--- fsf/trunk/libc/sysdeps/i386/configure (original)
+++ fsf/trunk/libc/sysdeps/i386/configure Sun Aug  9 00:05:21 2009
@@ -1,14 +1,229 @@
+# Factoring default headers for most tests.
+ac_includes_default="\
+#include <stdio.h>
+#ifdef HAVE_SYS_TYPES_H
+# include <sys/types.h>
+#endif
+#ifdef HAVE_SYS_STAT_H
+# include <sys/stat.h>
+#endif
+#ifdef STDC_HEADERS
+# include <stdlib.h>
+# include <stddef.h>
+#else
+# ifdef HAVE_STDLIB_H
+#  include <stdlib.h>
+# endif
+#endif
+#ifdef HAVE_STRING_H
+# if !defined STDC_HEADERS && defined HAVE_MEMORY_H
+#  include <memory.h>
+# endif
+# include <string.h>
+#endif
+#ifdef HAVE_STRINGS_H
+# include <strings.h>
+#endif
+#ifdef HAVE_INTTYPES_H
+# include <inttypes.h>
+#endif
+#ifdef HAVE_STDINT_H
+# include <stdint.h>
+#endif
+#ifdef HAVE_UNISTD_H
+# include <unistd.h>
+#endif"
+
 # This file is generated from configure.in by Autoconf.  DO NOT EDIT!
  # Local configure fragment for sysdeps/i386.
 
 
-{ echo "$as_me:$LINENO: checking if gcc provides <cpuid.h>" >&5
-echo $ECHO_N "checking if gcc provides <cpuid.h>... $ECHO_C" >&6; }
-if test "${libc_cv_gcc_cpuid+set}" = set; then
+
+{ echo "$as_me:$LINENO: checking for grep that handles long lines and -e" >&5
+echo $ECHO_N "checking for grep that handles long lines and -e... $ECHO_C" >&6; }
+if test "${ac_cv_path_GREP+set}" = set; then
+  echo $ECHO_N "(cached) $ECHO_C" >&6
+else
+  # Extract the first word of "grep ggrep" to use in msg output
+if test -z "$GREP"; then
+set dummy grep ggrep; ac_prog_name=$2
+if test "${ac_cv_path_GREP+set}" = set; then
+  echo $ECHO_N "(cached) $ECHO_C" >&6
+else
+  ac_path_GREP_found=false
+# Loop through the user's path and test for each of PROGNAME-LIST
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH$PATH_SEPARATOR/usr/xpg4/bin
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+  for ac_prog in grep ggrep; do
+  for ac_exec_ext in '' $ac_executable_extensions; do
+    ac_path_GREP="$as_dir/$ac_prog$ac_exec_ext"
+    { test -f "$ac_path_GREP" && $as_test_x "$ac_path_GREP"; } || continue
+    # Check for GNU ac_path_GREP and select it if it is found.
+  # Check for GNU $ac_path_GREP
+case `"$ac_path_GREP" --version 2>&1` in
+*GNU*)
+  ac_cv_path_GREP="$ac_path_GREP" ac_path_GREP_found=:;;
+*)
+  ac_count=0
+  echo $ECHO_N "0123456789$ECHO_C" >"conftest.in"
+  while :
+  do
+    cat "conftest.in" "conftest.in" >"conftest.tmp"
+    mv "conftest.tmp" "conftest.in"
+    cp "conftest.in" "conftest.nl"
+    echo 'GREP' >> "conftest.nl"
+    "$ac_path_GREP" -e 'GREP$' -e '-(cannot match)-' < "conftest.nl" >"conftest.out" 2>/dev/null || break
+    diff "conftest.out" "conftest.nl" >/dev/null 2>&1 || break
+    ac_count=`expr $ac_count + 1`
+    if test $ac_count -gt ${ac_path_GREP_max-0}; then
+      # Best one so far, save it but keep looking for a better one
+      ac_cv_path_GREP="$ac_path_GREP"
+      ac_path_GREP_max=$ac_count
+    fi
+    # 10*(2^10) chars as input seems more than enough
+    test $ac_count -gt 10 && break
+  done
+  rm -f conftest.in conftest.tmp conftest.nl conftest.out;;
+esac
+
+
+    $ac_path_GREP_found && break 3
+  done
+done
+
+done
+IFS=$as_save_IFS
+
+
+fi
+
+GREP="$ac_cv_path_GREP"
+if test -z "$GREP"; then
+  { { echo "$as_me:$LINENO: error: no acceptable $ac_prog_name could be found in $PATH$PATH_SEPARATOR/usr/xpg4/bin" >&5
+echo "$as_me: error: no acceptable $ac_prog_name could be found in $PATH$PATH_SEPARATOR/usr/xpg4/bin" >&2;}
+   { (exit 1); exit 1; }; }
+fi
+
+else
+  ac_cv_path_GREP=$GREP
+fi
+
+
+fi
+{ echo "$as_me:$LINENO: result: $ac_cv_path_GREP" >&5
+echo "${ECHO_T}$ac_cv_path_GREP" >&6; }
+ GREP="$ac_cv_path_GREP"
+
+
+{ echo "$as_me:$LINENO: checking for egrep" >&5
+echo $ECHO_N "checking for egrep... $ECHO_C" >&6; }
+if test "${ac_cv_path_EGREP+set}" = set; then
+  echo $ECHO_N "(cached) $ECHO_C" >&6
+else
+  if echo a | $GREP -E '(a|b)' >/dev/null 2>&1
+   then ac_cv_path_EGREP="$GREP -E"
+   else
+     # Extract the first word of "egrep" to use in msg output
+if test -z "$EGREP"; then
+set dummy egrep; ac_prog_name=$2
+if test "${ac_cv_path_EGREP+set}" = set; then
+  echo $ECHO_N "(cached) $ECHO_C" >&6
+else
+  ac_path_EGREP_found=false
+# Loop through the user's path and test for each of PROGNAME-LIST
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH$PATH_SEPARATOR/usr/xpg4/bin
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+  for ac_prog in egrep; do
+  for ac_exec_ext in '' $ac_executable_extensions; do
+    ac_path_EGREP="$as_dir/$ac_prog$ac_exec_ext"
+    { test -f "$ac_path_EGREP" && $as_test_x "$ac_path_EGREP"; } || continue
+    # Check for GNU ac_path_EGREP and select it if it is found.
+  # Check for GNU $ac_path_EGREP
+case `"$ac_path_EGREP" --version 2>&1` in
+*GNU*)
+  ac_cv_path_EGREP="$ac_path_EGREP" ac_path_EGREP_found=:;;
+*)
+  ac_count=0
+  echo $ECHO_N "0123456789$ECHO_C" >"conftest.in"
+  while :
+  do
+    cat "conftest.in" "conftest.in" >"conftest.tmp"
+    mv "conftest.tmp" "conftest.in"
+    cp "conftest.in" "conftest.nl"
+    echo 'EGREP' >> "conftest.nl"
+    "$ac_path_EGREP" 'EGREP$' < "conftest.nl" >"conftest.out" 2>/dev/null || break
+    diff "conftest.out" "conftest.nl" >/dev/null 2>&1 || break
+    ac_count=`expr $ac_count + 1`
+    if test $ac_count -gt ${ac_path_EGREP_max-0}; then
+      # Best one so far, save it but keep looking for a better one
+      ac_cv_path_EGREP="$ac_path_EGREP"
+      ac_path_EGREP_max=$ac_count
+    fi
+    # 10*(2^10) chars as input seems more than enough
+    test $ac_count -gt 10 && break
+  done
+  rm -f conftest.in conftest.tmp conftest.nl conftest.out;;
+esac
+
+
+    $ac_path_EGREP_found && break 3
+  done
+done
+
+done
+IFS=$as_save_IFS
+
+
+fi
+
+EGREP="$ac_cv_path_EGREP"
+if test -z "$EGREP"; then
+  { { echo "$as_me:$LINENO: error: no acceptable $ac_prog_name could be found in $PATH$PATH_SEPARATOR/usr/xpg4/bin" >&5
+echo "$as_me: error: no acceptable $ac_prog_name could be found in $PATH$PATH_SEPARATOR/usr/xpg4/bin" >&2;}
+   { (exit 1); exit 1; }; }
+fi
+
+else
+  ac_cv_path_EGREP=$EGREP
+fi
+
+
+   fi
+fi
+{ echo "$as_me:$LINENO: result: $ac_cv_path_EGREP" >&5
+echo "${ECHO_T}$ac_cv_path_EGREP" >&6; }
+ EGREP="$ac_cv_path_EGREP"
+
+
+{ echo "$as_me:$LINENO: checking for ANSI C header files" >&5
+echo $ECHO_N "checking for ANSI C header files... $ECHO_C" >&6; }
+if test "${ac_cv_header_stdc+set}" = set; then
   echo $ECHO_N "(cached) $ECHO_C" >&6
 else
   cat >conftest.$ac_ext <<_ACEOF
-#include <cpuid.h>
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h.  */
+#include <stdlib.h>
+#include <stdarg.h>
+#include <string.h>
+#include <float.h>
+
+int
+main ()
+{
+
+  ;
+  return 0;
+}
 _ACEOF
 rm -f conftest.$ac_objext
 if { (ac_try="$ac_compile"
@@ -27,23 +242,342 @@
 	 test -z "$ac_c_werror_flag" ||
 	 test ! -s conftest.err
        } && test -s conftest.$ac_objext; then
-  libc_cv_gcc_cpuid=yes
+  ac_cv_header_stdc=yes
 else
   echo "$as_me: failed program was:" >&5
 sed 's/^/| /' conftest.$ac_ext >&5
 
-	libc_cv_gcc_cpuid=no
+	ac_cv_header_stdc=no
 fi
 
 rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
-fi
-{ echo "$as_me:$LINENO: result: $libc_cv_gcc_cpuid" >&5
-echo "${ECHO_T}$libc_cv_gcc_cpuid" >&6; }
-if test $libc_cv_gcc_cpuid != yes; then
+
+if test $ac_cv_header_stdc = yes; then
+  # SunOS 4.x string.h does not declare mem*, contrary to ANSI.
+  cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h.  */
+#include <string.h>
+
+_ACEOF
+if (eval "$ac_cpp conftest.$ac_ext") 2>&5 |
+  $EGREP "memchr" >/dev/null 2>&1; then
+  :
+else
+  ac_cv_header_stdc=no
+fi
+rm -f conftest*
+
+fi
+
+if test $ac_cv_header_stdc = yes; then
+  # ISC 2.0.2 stdlib.h does not declare free, contrary to ANSI.
+  cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h.  */
+#include <stdlib.h>
+
+_ACEOF
+if (eval "$ac_cpp conftest.$ac_ext") 2>&5 |
+  $EGREP "free" >/dev/null 2>&1; then
+  :
+else
+  ac_cv_header_stdc=no
+fi
+rm -f conftest*
+
+fi
+
+if test $ac_cv_header_stdc = yes; then
+  # /bin/cc in Irix-4.0.5 gets non-ANSI ctype macros unless using -ansi.
+  if test "$cross_compiling" = yes; then
+  :
+else
+  cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h.  */
+#include <ctype.h>
+#include <stdlib.h>
+#if ((' ' & 0x0FF) == 0x020)
+# define ISLOWER(c) ('a' <= (c) && (c) <= 'z')
+# define TOUPPER(c) (ISLOWER(c) ? 'A' + ((c) - 'a') : (c))
+#else
+# define ISLOWER(c) \
+		   (('a' <= (c) && (c) <= 'i') \
+		     || ('j' <= (c) && (c) <= 'r') \
+		     || ('s' <= (c) && (c) <= 'z'))
+# define TOUPPER(c) (ISLOWER(c) ? ((c) | 0x40) : (c))
+#endif
+
+#define XOR(e, f) (((e) && !(f)) || (!(e) && (f)))
+int
+main ()
+{
+  int i;
+  for (i = 0; i < 256; i++)
+    if (XOR (islower (i), ISLOWER (i))
+	|| toupper (i) != TOUPPER (i))
+      return 2;
+  return 0;
+}
+_ACEOF
+rm -f conftest$ac_exeext
+if { (ac_try="$ac_link"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+  (eval "$ac_link") 2>&5
+  ac_status=$?
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } && { ac_try='./conftest$ac_exeext'
+  { (case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+  (eval "$ac_try") 2>&5
+  ac_status=$?
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); }; }; then
+  :
+else
+  echo "$as_me: program exited with status $ac_status" >&5
+echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+( exit $ac_status )
+ac_cv_header_stdc=no
+fi
+rm -f core *.core core.conftest.* gmon.out bb.out conftest$ac_exeext conftest.$ac_objext conftest.$ac_ext
+fi
+
+
+fi
+fi
+{ echo "$as_me:$LINENO: result: $ac_cv_header_stdc" >&5
+echo "${ECHO_T}$ac_cv_header_stdc" >&6; }
+if test $ac_cv_header_stdc = yes; then
+
+cat >>confdefs.h <<\_ACEOF
+#define STDC_HEADERS 1
+_ACEOF
+
+fi
+
+# On IRIX 5.3, sys/types and inttypes.h are conflicting.
+
+
+
+
+
+
+
+
+
+for ac_header in sys/types.h sys/stat.h stdlib.h string.h memory.h strings.h \
+		  inttypes.h stdint.h unistd.h
+do
+as_ac_Header=`echo "ac_cv_header_$ac_header" | $as_tr_sh`
+{ echo "$as_me:$LINENO: checking for $ac_header" >&5
+echo $ECHO_N "checking for $ac_header... $ECHO_C" >&6; }
+if { as_var=$as_ac_Header; eval "test \"\${$as_var+set}\" = set"; }; then
+  echo $ECHO_N "(cached) $ECHO_C" >&6
+else
+  cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h.  */
+$ac_includes_default
+
+#include <$ac_header>
+_ACEOF
+rm -f conftest.$ac_objext
+if { (ac_try="$ac_compile"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+  (eval "$ac_compile") 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } && {
+	 test -z "$ac_c_werror_flag" ||
+	 test ! -s conftest.err
+       } && test -s conftest.$ac_objext; then
+  eval "$as_ac_Header=yes"
+else
+  echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+	eval "$as_ac_Header=no"
+fi
+
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+fi
+ac_res=`eval echo '${'$as_ac_Header'}'`
+	       { echo "$as_me:$LINENO: result: $ac_res" >&5
+echo "${ECHO_T}$ac_res" >&6; }
+if test `eval echo '${'$as_ac_Header'}'` = yes; then
+  cat >>confdefs.h <<_ACEOF
+#define `echo "HAVE_$ac_header" | $as_tr_cpp` 1
+_ACEOF
+
+fi
+
+done
+
+
+if test "${ac_cv_header_cpuid_h+set}" = set; then
+  { echo "$as_me:$LINENO: checking for cpuid.h" >&5
+echo $ECHO_N "checking for cpuid.h... $ECHO_C" >&6; }
+if test "${ac_cv_header_cpuid_h+set}" = set; then
+  echo $ECHO_N "(cached) $ECHO_C" >&6
+fi
+{ echo "$as_me:$LINENO: result: $ac_cv_header_cpuid_h" >&5
+echo "${ECHO_T}$ac_cv_header_cpuid_h" >&6; }
+else
+  # Is the header compilable?
+{ echo "$as_me:$LINENO: checking cpuid.h usability" >&5
+echo $ECHO_N "checking cpuid.h usability... $ECHO_C" >&6; }
+cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h.  */
+$ac_includes_default
+#include <cpuid.h>
+_ACEOF
+rm -f conftest.$ac_objext
+if { (ac_try="$ac_compile"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+  (eval "$ac_compile") 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } && {
+	 test -z "$ac_c_werror_flag" ||
+	 test ! -s conftest.err
+       } && test -s conftest.$ac_objext; then
+  ac_header_compiler=yes
+else
+  echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+	ac_header_compiler=no
+fi
+
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+{ echo "$as_me:$LINENO: result: $ac_header_compiler" >&5
+echo "${ECHO_T}$ac_header_compiler" >&6; }
+
+# Is the header present?
+{ echo "$as_me:$LINENO: checking cpuid.h presence" >&5
+echo $ECHO_N "checking cpuid.h presence... $ECHO_C" >&6; }
+cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h.  */
+#include <cpuid.h>
+_ACEOF
+if { (ac_try="$ac_cpp conftest.$ac_ext"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+  (eval "$ac_cpp conftest.$ac_ext") 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } >/dev/null && {
+	 test -z "$ac_c_preproc_warn_flag$ac_c_werror_flag" ||
+	 test ! -s conftest.err
+       }; then
+  ac_header_preproc=yes
+else
+  echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+  ac_header_preproc=no
+fi
+
+rm -f conftest.err conftest.$ac_ext
+{ echo "$as_me:$LINENO: result: $ac_header_preproc" >&5
+echo "${ECHO_T}$ac_header_preproc" >&6; }
+
+# So?  What about this header?
+case $ac_header_compiler:$ac_header_preproc:$ac_c_preproc_warn_flag in
+  yes:no: )
+    { echo "$as_me:$LINENO: WARNING: cpuid.h: accepted by the compiler, rejected by the preprocessor!" >&5
+echo "$as_me: WARNING: cpuid.h: accepted by the compiler, rejected by the preprocessor!" >&2;}
+    { echo "$as_me:$LINENO: WARNING: cpuid.h: proceeding with the compiler's result" >&5
+echo "$as_me: WARNING: cpuid.h: proceeding with the compiler's result" >&2;}
+    ac_header_preproc=yes
+    ;;
+  no:yes:* )
+    { echo "$as_me:$LINENO: WARNING: cpuid.h: present but cannot be compiled" >&5
+echo "$as_me: WARNING: cpuid.h: present but cannot be compiled" >&2;}
+    { echo "$as_me:$LINENO: WARNING: cpuid.h:     check for missing prerequisite headers?" >&5
+echo "$as_me: WARNING: cpuid.h:     check for missing prerequisite headers?" >&2;}
+    { echo "$as_me:$LINENO: WARNING: cpuid.h: see the Autoconf documentation" >&5
+echo "$as_me: WARNING: cpuid.h: see the Autoconf documentation" >&2;}
+    { echo "$as_me:$LINENO: WARNING: cpuid.h:     section \"Present But Cannot Be Compiled\"" >&5
+echo "$as_me: WARNING: cpuid.h:     section \"Present But Cannot Be Compiled\"" >&2;}
+    { echo "$as_me:$LINENO: WARNING: cpuid.h: proceeding with the preprocessor's result" >&5
+echo "$as_me: WARNING: cpuid.h: proceeding with the preprocessor's result" >&2;}
+    { echo "$as_me:$LINENO: WARNING: cpuid.h: in the future, the compiler will take precedence" >&5
+echo "$as_me: WARNING: cpuid.h: in the future, the compiler will take precedence" >&2;}
+
+    ;;
+esac
+{ echo "$as_me:$LINENO: checking for cpuid.h" >&5
+echo $ECHO_N "checking for cpuid.h... $ECHO_C" >&6; }
+if test "${ac_cv_header_cpuid_h+set}" = set; then
+  echo $ECHO_N "(cached) $ECHO_C" >&6
+else
+  ac_cv_header_cpuid_h=$ac_header_preproc
+fi
+{ echo "$as_me:$LINENO: result: $ac_cv_header_cpuid_h" >&5
+echo "${ECHO_T}$ac_cv_header_cpuid_h" >&6; }
+
+fi
+if test $ac_cv_header_cpuid_h = yes; then
+  :
+else
   { { echo "$as_me:$LINENO: error: gcc must provide the <cpuid.h> header" >&5
 echo "$as_me: error: gcc must provide the <cpuid.h> header" >&2;}
    { (exit 1); exit 1; }; }
 fi
+
+
 
 { echo "$as_me:$LINENO: checking if -g produces usable source locations for assembler-with-cpp" >&5
 echo $ECHO_N "checking if -g produces usable source locations for assembler-with-cpp... $ECHO_C" >&6; }

Modified: fsf/trunk/libc/sysdeps/i386/configure.in
==============================================================================
--- fsf/trunk/libc/sysdeps/i386/configure.in (original)
+++ fsf/trunk/libc/sysdeps/i386/configure.in Sun Aug  9 00:05:21 2009
@@ -1,12 +1,8 @@
 GLIBC_PROVIDES dnl See aclocal.m4 in the top level source directory.
 # Local configure fragment for sysdeps/i386.
 
-AC_CACHE_CHECK([if gcc provides <cpuid.h>], libc_cv_gcc_cpuid, [dnl
-AC_COMPILE_IFELSE([#include <cpuid.h>], libc_cv_gcc_cpuid=yes,
-		  libc_cv_gcc_cpuid=no)])
-if test $libc_cv_gcc_cpuid != yes; then
-  AC_MSG_ERROR([gcc must provide the <cpuid.h> header])
-fi
+AC_HEADER_CHECK([cpuid.h], ,
+  [AC_MSG_ERROR([gcc must provide the <cpuid.h> header])])
 
 AC_CACHE_CHECK(if -g produces usable source locations for assembler-with-cpp,
 	       libc_cv_cpp_asm_debuginfo, [dnl

Modified: fsf/trunk/libc/sysdeps/i386/i686/multiarch/strcspn.S
==============================================================================
--- fsf/trunk/libc/sysdeps/i386/i686/multiarch/strcspn.S (original)
+++ fsf/trunk/libc/sysdeps/i386/i686/multiarch/strcspn.S Sun Aug  9 00:05:21 2009
@@ -42,6 +42,7 @@
    define multiple versions for strpbrk in static library since we
    need strpbrk before the initialization happened.  */
 #if (defined SHARED || !defined USE_AS_STRPBRK) && !defined NOT_IN_libc
+# ifdef SHARED
 	.section	.gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
 	.globl	__i686.get_pc_thunk.bx
 	.hidden	__i686.get_pc_thunk.bx
@@ -71,6 +72,20 @@
 	cfi_restore (ebx)
 	ret
 END(STRCSPN)
+# else
+	.text
+ENTRY(STRCSPN)
+	.type	STRCSPN, @gnu_indirect_function
+	cmpl	$0, KIND_OFFSET+__cpu_features
+	jne	1f
+	call	__init_cpu_features
+1:	leal	STRCSPN_IA32, %eax
+	testl	$(1<<20), CPUID_OFFSET+COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET+__cpu_features
+	jz	2f
+	leal	STRCSPN_SSE42, %eax
+2:	ret
+END(STRCSPN)
+# endif
 
 # undef ENTRY
 # define ENTRY(name) \

Modified: fsf/trunk/libc/sysdeps/i386/i686/multiarch/strspn.S
==============================================================================
--- fsf/trunk/libc/sysdeps/i386/i686/multiarch/strspn.S (original)
+++ fsf/trunk/libc/sysdeps/i386/i686/multiarch/strspn.S Sun Aug  9 00:05:21 2009
@@ -27,6 +27,7 @@
 
 /* Define multiple versions only for the definition in libc.  */
 #ifndef NOT_IN_libc
+# ifdef SHARED
 	.section	.gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
 	.globl	__i686.get_pc_thunk.bx
 	.hidden	__i686.get_pc_thunk.bx
@@ -56,6 +57,20 @@
 	cfi_restore (ebx)
 	ret
 END(strspn)
+# else
+	.text
+ENTRY(strspn)
+	.type	strspn, @gnu_indirect_function
+	cmpl	$0, KIND_OFFSET+__cpu_features
+	jne	1f
+	call	__init_cpu_features
+1:	leal	__strspn_ia32, %eax
+	testl	$(1<<20), CPUID_OFFSET+COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET+__cpu_features
+	jz	2f
+	leal	__strspn_sse42, %eax
+2:	ret
+END(strspn)
+# endif
 
 # undef ENTRY
 # define ENTRY(name) \

Modified: fsf/trunk/libc/sysdeps/x86_64/dl-trampoline.S
==============================================================================
--- fsf/trunk/libc/sysdeps/x86_64/dl-trampoline.S (original)
+++ fsf/trunk/libc/sysdeps/x86_64/dl-trampoline.S Sun Aug  9 00:05:21 2009
@@ -146,247 +146,17 @@
 2:	movl	%eax, L(have_avx)(%rip)
 	cmpl	$0, %eax
 
-1:	js	L(no_avx1)
-
-	/* This is to support AVX audit modules.  */
-	vmovdqu %ymm0,		      (LR_VECTOR_OFFSET)(%rsp)
-	vmovdqu %ymm1, (LR_VECTOR_OFFSET +   VECTOR_SIZE)(%rsp)
-	vmovdqu %ymm2, (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp)
-	vmovdqu %ymm3, (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp)
-	vmovdqu %ymm4, (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp)
-	vmovdqu %ymm5, (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp)
-	vmovdqu %ymm6, (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp)
-	vmovdqu %ymm7, (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp)
-
-	/* Save xmm0-xmm7 registers to detect if any of them are
-	   changed by audit module.  */
-	vmovdqa %xmm0,		    (LR_SIZE)(%rsp)
-	vmovdqa %xmm1, (LR_SIZE +   XMM_SIZE)(%rsp)
-	vmovdqa %xmm2, (LR_SIZE + XMM_SIZE*2)(%rsp)
-	vmovdqa %xmm3, (LR_SIZE + XMM_SIZE*3)(%rsp)
-	vmovdqa %xmm4, (LR_SIZE + XMM_SIZE*4)(%rsp)
-	vmovdqa %xmm5, (LR_SIZE + XMM_SIZE*5)(%rsp)
-	vmovdqa %xmm6, (LR_SIZE + XMM_SIZE*6)(%rsp)
-	vmovdqa %xmm7, (LR_SIZE + XMM_SIZE*7)(%rsp)
-
-L(no_avx1):
-# endif
-
-	movq %rsp, %rcx		# La_x86_64_regs pointer to %rcx.
-	movq 48(%rbx), %rdx	# Load return address if needed.
-	movq 40(%rbx), %rsi	# Copy args pushed by PLT in register.
-	movq 32(%rbx), %rdi	# %rdi: link_map, %rsi: reloc_index
-	leaq 16(%rbx), %r8
-	call _dl_profile_fixup	# Call resolver.
-
-	movq %rax, %r11		# Save return value.
-
-	movq 8(%rbx), %rax	# Get back register content.
-	movq LR_RDX_OFFSET(%rsp), %rdx
-	movq  LR_R8_OFFSET(%rsp), %r8
-	movq  LR_R9_OFFSET(%rsp), %r9
-
-	movaps		    (LR_XMM_OFFSET)(%rsp), %xmm0
-	movaps	 (LR_XMM_OFFSET + XMM_SIZE)(%rsp), %xmm1
-	movaps (LR_XMM_OFFSET + XMM_SIZE*2)(%rsp), %xmm2
-	movaps (LR_XMM_OFFSET + XMM_SIZE*3)(%rsp), %xmm3
-	movaps (LR_XMM_OFFSET + XMM_SIZE*4)(%rsp), %xmm4
-	movaps (LR_XMM_OFFSET + XMM_SIZE*5)(%rsp), %xmm5
-	movaps (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp), %xmm6
-	movaps (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp), %xmm7
-
-# ifdef HAVE_AVX_SUPPORT
-	cmpl	$0, L(have_avx)(%rip)
-	js	L(no_avx2)
-
-	/* Check if any xmm0-xmm7 registers are changed by audit
-	   module.  */
-	vpcmpeqq (LR_SIZE)(%rsp), %xmm0, %xmm8
-	vpmovmskb %xmm8, %esi
-	cmpl $0xffff, %esi
-	jne 1f
-	vmovdqu			(LR_VECTOR_OFFSET)(%rsp), %ymm0
-
-1:	vpcmpeqq (LR_SIZE + XMM_SIZE)(%rsp), %xmm1, %xmm8
-	vpmovmskb %xmm8, %esi
-	cmpl $0xffff, %esi
-	jne 1f
-	vmovdqu	  (LR_VECTOR_OFFSET + VECTOR_SIZE)(%rsp), %ymm1
-
-1:	vpcmpeqq (LR_SIZE + XMM_SIZE*2)(%rsp), %xmm2, %xmm8
-	vpmovmskb %xmm8, %esi
-	cmpl $0xffff, %esi
-	jne 1f
-	vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp), %ymm2
-
-1:	vpcmpeqq (LR_SIZE + XMM_SIZE*3)(%rsp), %xmm3, %xmm8
-	vpmovmskb %xmm8, %esi
-	cmpl $0xffff, %esi
-	jne 1f
-	vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp), %ymm3
-
-1:	vpcmpeqq (LR_SIZE + XMM_SIZE*4)(%rsp), %xmm4, %xmm8
-	vpmovmskb %xmm8, %esi
-	cmpl $0xffff, %esi
-	jne 1f
-	vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp), %ymm4
-
-1:	vpcmpeqq (LR_SIZE + XMM_SIZE*5)(%rsp), %xmm5, %xmm8
-	vpmovmskb %xmm8, %esi
-	cmpl $0xffff, %esi
-	jne 1f
-	vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp), %ymm5
-
-1:	vpcmpeqq (LR_SIZE + XMM_SIZE*6)(%rsp), %xmm6, %xmm8
-	vpmovmskb %xmm8, %esi
-	cmpl $0xffff, %esi
-	jne 1f
-	vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp), %ymm6
-
-1:	vpcmpeqq (LR_SIZE + XMM_SIZE*7)(%rsp), %xmm7, %xmm8
-	vpmovmskb %xmm8, %esi
-	cmpl $0xffff, %esi
-	jne 1f
-	vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp), %ymm7
-
-L(no_avx2):
-1:
-# endif
-	movq 16(%rbx), %r10	# Anything in framesize?
-	testq %r10, %r10
-	jns 3f
-
-	/* There's nothing in the frame size, so there
-	   will be no call to the _dl_call_pltexit. */
-
-	/* Get back registers content.  */
-	movq LR_RCX_OFFSET(%rsp), %rcx
-	movq LR_RSI_OFFSET(%rsp), %rsi
-	movq LR_RDI_OFFSET(%rsp), %rdi
-
-	movq %rbx, %rsp
-	movq (%rsp), %rbx
-	cfi_restore(rbx)
-	cfi_def_cfa_register(%rsp)
-
-	addq $48, %rsp		# Adjust the stack to the return value
-				# (eats the reloc index and link_map)
-	cfi_adjust_cfa_offset(-48)
-	jmp *%r11		# Jump to function address.
-
-3:
-	cfi_adjust_cfa_offset(48)
-	cfi_rel_offset(%rbx, 0)
-	cfi_def_cfa_register(%rbx)
-
-	/* At this point we need to prepare new stack for the function
-	   which has to be called.  We copy the original stack to a
-	   temporary buffer of the size specified by the 'framesize'
-	   returned from _dl_profile_fixup */
-
-	leaq LR_RSP_OFFSET(%rbx), %rsi	# stack
-	addq $8, %r10
-	andq $0xfffffffffffffff0, %r10
-	movq %r10, %rcx
-	subq %r10, %rsp
-	movq %rsp, %rdi
-	shrq $3, %rcx
-	rep
-	movsq
-
-	movq 24(%rdi), %rcx	# Get back register content.
-	movq 32(%rdi), %rsi
-	movq 40(%rdi), %rdi
-
-	call *%r11
-
-	mov 24(%rbx), %rsp	# Drop the copied stack content
-
-	/* Now we have to prepare the La_x86_64_retval structure for the
-	   _dl_call_pltexit.  The La_x86_64_regs is being pointed by rsp now,
-	   so we just need to allocate the sizeof(La_x86_64_retval) space on
-	   the stack, since the alignment has already been taken care of. */
-# ifdef HAVE_AVX_SUPPORT
-	/* sizeof(La_x86_64_retval).  Need extra space for 2 SSE
-	   registers to detect if xmm0/xmm1 registers are changed
-	   by audit module.  */
-	subq $(LRV_SIZE + XMM_SIZE*2), %rsp
-# else
-	subq $LRV_SIZE, %rsp	# sizeof(La_x86_64_retval)
-# endif
-	movq %rsp, %rcx		# La_x86_64_retval argument to %rcx.
-
-	/* Fill in the La_x86_64_retval structure.  */
-	movq %rax, LRV_RAX_OFFSET(%rcx)
-	movq %rdx, LRV_RDX_OFFSET(%rcx)
-
-	movaps %xmm0, LRV_XMM0_OFFSET(%rcx)
-	movaps %xmm1, LRV_XMM1_OFFSET(%rcx)
-
-# ifdef HAVE_AVX_SUPPORT
-	cmpl	$0, L(have_avx)(%rip)
-	js	L(no_avx3)
-
-	/* This is to support AVX audit modules.  */
-	vmovdqu %ymm0, LRV_VECTOR0_OFFSET(%rcx)
-	vmovdqu %ymm1, LRV_VECTOR1_OFFSET(%rcx)
-
-	/* Save xmm0/xmm1 registers to detect if they are changed
-	   by audit module.  */
-	vmovdqa %xmm0,		  (LRV_SIZE)(%rcx)
-	vmovdqa %xmm1, (LRV_SIZE + XMM_SIZE)(%rcx)
-
-L(no_avx3):
-# endif
-
-	fstpt LRV_ST0_OFFSET(%rcx)
-	fstpt LRV_ST1_OFFSET(%rcx)
-
-	movq 24(%rbx), %rdx	# La_x86_64_regs argument to %rdx.
-	movq 40(%rbx), %rsi	# Copy args pushed by PLT in register.
-        movq 32(%rbx), %rdi	# %rdi: link_map, %rsi: reloc_index
-	call _dl_call_pltexit
-
-	/* Restore return registers.  */
-	movq LRV_RAX_OFFSET(%rsp), %rax
-	movq LRV_RDX_OFFSET(%rsp), %rdx
-
-	movaps LRV_XMM0_OFFSET(%rsp), %xmm0
-	movaps LRV_XMM1_OFFSET(%rsp), %xmm1
-
-# ifdef HAVE_AVX_SUPPORT
-	cmpl	$0, L(have_avx)(%rip)
-	js	L(no_avx4)
-
-	/* Check if xmm0/xmm1 registers are changed by audit module.  */
-	vpcmpeqq (LRV_SIZE)(%rsp), %xmm0, %xmm2
-	vpmovmskb %xmm2, %esi
-	cmpl $0xffff, %esi
-	jne 1f
-	vmovdqu LRV_VECTOR0_OFFSET(%rsp), %ymm0
-
-1:	vpcmpeqq (LRV_SIZE + XMM_SIZE)(%rsp), %xmm1, %xmm2
-	vpmovmskb %xmm2, %esi
-	cmpl $0xffff, %esi
-	jne 1f
-	vmovdqu LRV_VECTOR1_OFFSET(%rsp), %ymm1
-
-L(no_avx4):
-1:
-# endif
-
-	fldt LRV_ST1_OFFSET(%rsp)
-	fldt LRV_ST0_OFFSET(%rsp)
-
-	movq %rbx, %rsp
-	movq (%rsp), %rbx
-	cfi_restore(rbx)
-	cfi_def_cfa_register(%rsp)
-
-	addq $48, %rsp		# Adjust the stack to the return value
-				# (eats the reloc index and link_map)
-	cfi_adjust_cfa_offset(-48)
-	retq
+1:	js	L(no_avx)
+
+#  define RESTORE_AVX
+#  include "dl-trampoline.h"
+
+	.align 16
+L(no_avx):
+# endif
+
+#  undef RESTORE_AVX
+#  include "dl-trampoline.h"
 
 	cfi_endproc
 	.size _dl_runtime_profile, .-_dl_runtime_profile

Added: fsf/trunk/libc/sysdeps/x86_64/dl-trampoline.h
==============================================================================
--- fsf/trunk/libc/sysdeps/x86_64/dl-trampoline.h (added)
+++ fsf/trunk/libc/sysdeps/x86_64/dl-trampoline.h Sun Aug  9 00:05:21 2009
@@ -1,0 +1,269 @@
+/* Partial PLT profile trampoline to save and restore x86-64 vector
+   registers.
+   Copyright (C) 2009 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#ifdef RESTORE_AVX
+	/* This is to support AVX audit modules.  */
+	vmovdqu %ymm0,		      (LR_VECTOR_OFFSET)(%rsp)
+	vmovdqu %ymm1, (LR_VECTOR_OFFSET +   VECTOR_SIZE)(%rsp)
+	vmovdqu %ymm2, (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp)
+	vmovdqu %ymm3, (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp)
+	vmovdqu %ymm4, (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp)
+	vmovdqu %ymm5, (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp)
+	vmovdqu %ymm6, (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp)
+	vmovdqu %ymm7, (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp)
+
+	/* Save xmm0-xmm7 registers to detect if any of them are
+	   changed by audit module.  */
+	vmovdqa %xmm0,		    (LR_SIZE)(%rsp)
+	vmovdqa %xmm1, (LR_SIZE +   XMM_SIZE)(%rsp)
+	vmovdqa %xmm2, (LR_SIZE + XMM_SIZE*2)(%rsp)
+	vmovdqa %xmm3, (LR_SIZE + XMM_SIZE*3)(%rsp)
+	vmovdqa %xmm4, (LR_SIZE + XMM_SIZE*4)(%rsp)
+	vmovdqa %xmm5, (LR_SIZE + XMM_SIZE*5)(%rsp)
+	vmovdqa %xmm6, (LR_SIZE + XMM_SIZE*6)(%rsp)
+	vmovdqa %xmm7, (LR_SIZE + XMM_SIZE*7)(%rsp)
+#endif
+
+	movq %rsp, %rcx		# La_x86_64_regs pointer to %rcx.
+	movq 48(%rbx), %rdx	# Load return address if needed.
+	movq 40(%rbx), %rsi	# Copy args pushed by PLT in register.
+	movq 32(%rbx), %rdi	# %rdi: link_map, %rsi: reloc_index
+	leaq 16(%rbx), %r8
+	call _dl_profile_fixup	# Call resolver.
+
+	movq %rax, %r11		# Save return value.
+
+	movq 8(%rbx), %rax	# Get back register content.
+	movq LR_RDX_OFFSET(%rsp), %rdx
+	movq  LR_R8_OFFSET(%rsp), %r8
+	movq  LR_R9_OFFSET(%rsp), %r9
+
+	movaps		    (LR_XMM_OFFSET)(%rsp), %xmm0
+	movaps	 (LR_XMM_OFFSET + XMM_SIZE)(%rsp), %xmm1
+	movaps (LR_XMM_OFFSET + XMM_SIZE*2)(%rsp), %xmm2
+	movaps (LR_XMM_OFFSET + XMM_SIZE*3)(%rsp), %xmm3
+	movaps (LR_XMM_OFFSET + XMM_SIZE*4)(%rsp), %xmm4
+	movaps (LR_XMM_OFFSET + XMM_SIZE*5)(%rsp), %xmm5
+	movaps (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp), %xmm6
+	movaps (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp), %xmm7
+
+#ifdef RESTORE_AVX
+	/* Check if any xmm0-xmm7 registers are changed by audit
+	   module.  */
+	vpcmpeqq (LR_SIZE)(%rsp), %xmm0, %xmm8
+	vpmovmskb %xmm8, %esi
+	cmpl $0xffff, %esi
+	je 2f
+	vmovdqa	%xmm0, (LR_VECTOR_OFFSET)(%rsp)
+	jmp 1f
+2:	vmovdqu	(LR_VECTOR_OFFSET)(%rsp), %ymm0
+	vmovdqa	%xmm0, (LR_XMM_OFFSET)(%rsp)
+
+1:	vpcmpeqq (LR_SIZE + XMM_SIZE)(%rsp), %xmm1, %xmm8
+	vpmovmskb %xmm8, %esi
+	cmpl $0xffff, %esi
+	je 2f
+	vmovdqa	%xmm1, (LR_VECTOR_OFFSET + VECTOR_SIZE)(%rsp)
+	jmp 1f
+2:	vmovdqu	(LR_VECTOR_OFFSET + VECTOR_SIZE)(%rsp), %ymm1
+	vmovdqa	%xmm1, (LR_XMM_OFFSET + XMM_SIZE)(%rsp)
+
+1:	vpcmpeqq (LR_SIZE + XMM_SIZE*2)(%rsp), %xmm2, %xmm8
+	vpmovmskb %xmm8, %esi
+	cmpl $0xffff, %esi
+	je 2f
+	vmovdqa	%xmm2, (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp)
+	jmp 1f
+2:	vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp), %ymm2
+	vmovdqa	%xmm2, (LR_XMM_OFFSET + XMM_SIZE*2)(%rsp)
+
+1:	vpcmpeqq (LR_SIZE + XMM_SIZE*3)(%rsp), %xmm3, %xmm8
+	vpmovmskb %xmm8, %esi
+	cmpl $0xffff, %esi
+	je 2f
+	vmovdqa	%xmm3, (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp)
+	jmp 1f
+2:	vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp), %ymm3
+	vmovdqa	%xmm3, (LR_XMM_OFFSET + XMM_SIZE*3)(%rsp)
+
+1:	vpcmpeqq (LR_SIZE + XMM_SIZE*4)(%rsp), %xmm4, %xmm8
+	vpmovmskb %xmm8, %esi
+	cmpl $0xffff, %esi
+	je 2f
+	vmovdqa	%xmm4, (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp)
+	jmp 1f
+2:	vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp), %ymm4
+	vmovdqa	%xmm4, (LR_XMM_OFFSET + XMM_SIZE*4)(%rsp)
+
+1:	vpcmpeqq (LR_SIZE + XMM_SIZE*5)(%rsp), %xmm5, %xmm8
+	vpmovmskb %xmm8, %esi
+	cmpl $0xffff, %esi
+	je 2f
+	vmovdqa	%xmm5, (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp)
+	jmp 1f
+2:	vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp), %ymm5
+	vmovdqa	%xmm5, (LR_XMM_OFFSET + XMM_SIZE*5)(%rsp)
+
+1:	vpcmpeqq (LR_SIZE + XMM_SIZE*6)(%rsp), %xmm6, %xmm8
+	vpmovmskb %xmm8, %esi
+	cmpl $0xffff, %esi
+	je 2f
+	vmovdqa	%xmm6, (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp)
+	jmp 1f
+2:	vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp), %ymm6
+	vmovdqa	%xmm6, (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp)
+
+1:	vpcmpeqq (LR_SIZE + XMM_SIZE*7)(%rsp), %xmm7, %xmm8
+	vpmovmskb %xmm8, %esi
+	cmpl $0xffff, %esi
+	je 2f
+	vmovdqa	%xmm7, (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp)
+	jmp 1f
+2:	vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp), %ymm7
+	vmovdqa	%xmm7, (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp)
+
+1:
+#endif
+	movq 16(%rbx), %r10	# Anything in framesize?
+	testq %r10, %r10
+	jns 3f
+
+	/* There's nothing in the frame size, so there
+	   will be no call to the _dl_call_pltexit. */
+
+	/* Get back registers content.  */
+	movq LR_RCX_OFFSET(%rsp), %rcx
+	movq LR_RSI_OFFSET(%rsp), %rsi
+	movq LR_RDI_OFFSET(%rsp), %rdi
+
+	movq %rbx, %rsp
+	movq (%rsp), %rbx
+	cfi_restore(rbx)
+	cfi_def_cfa_register(%rsp)
+
+	addq $48, %rsp		# Adjust the stack to the return value
+				# (eats the reloc index and link_map)
+	cfi_adjust_cfa_offset(-48)
+	jmp *%r11		# Jump to function address.
+
+3:
+	cfi_adjust_cfa_offset(48)
+	cfi_rel_offset(%rbx, 0)
+	cfi_def_cfa_register(%rbx)
+
+	/* At this point we need to prepare new stack for the function
+	   which has to be called.  We copy the original stack to a
+	   temporary buffer of the size specified by the 'framesize'
+	   returned from _dl_profile_fixup */
+
+	leaq LR_RSP_OFFSET(%rbx), %rsi	# stack
+	addq $8, %r10
+	andq $0xfffffffffffffff0, %r10
+	movq %r10, %rcx
+	subq %r10, %rsp
+	movq %rsp, %rdi
+	shrq $3, %rcx
+	rep
+	movsq
+
+	movq 24(%rdi), %rcx	# Get back register content.
+	movq 32(%rdi), %rsi
+	movq 40(%rdi), %rdi
+
+	call *%r11
+
+	mov 24(%rbx), %rsp	# Drop the copied stack content
+
+	/* Now we have to prepare the La_x86_64_retval structure for the
+	   _dl_call_pltexit.  The La_x86_64_regs is being pointed by rsp now,
+	   so we just need to allocate the sizeof(La_x86_64_retval) space on
+	   the stack, since the alignment has already been taken care of. */
+# ifdef RESTORE_AVX
+	/* sizeof(La_x86_64_retval).  Need extra space for 2 SSE
+	   registers to detect if xmm0/xmm1 registers are changed
+	   by audit module.  */
+	subq $(LRV_SIZE + XMM_SIZE*2), %rsp
+# else
+	subq $LRV_SIZE, %rsp	# sizeof(La_x86_64_retval)
+# endif
+	movq %rsp, %rcx		# La_x86_64_retval argument to %rcx.
+
+	/* Fill in the La_x86_64_retval structure.  */
+	movq %rax, LRV_RAX_OFFSET(%rcx)
+	movq %rdx, LRV_RDX_OFFSET(%rcx)
+
+	movaps %xmm0, LRV_XMM0_OFFSET(%rcx)
+	movaps %xmm1, LRV_XMM1_OFFSET(%rcx)
+
+# ifdef RESTORE_AVX
+	/* This is to support AVX audit modules.  */
+	vmovdqu %ymm0, LRV_VECTOR0_OFFSET(%rcx)
+	vmovdqu %ymm1, LRV_VECTOR1_OFFSET(%rcx)
+
+	/* Save xmm0/xmm1 registers to detect if they are changed
+	   by audit module.  */
+	vmovdqa %xmm0,		  (LRV_SIZE)(%rcx)
+	vmovdqa %xmm1, (LRV_SIZE + XMM_SIZE)(%rcx)
+# endif
+
+	fstpt LRV_ST0_OFFSET(%rcx)
+	fstpt LRV_ST1_OFFSET(%rcx)
+
+	movq 24(%rbx), %rdx	# La_x86_64_regs argument to %rdx.
+	movq 40(%rbx), %rsi	# Copy args pushed by PLT in register.
+        movq 32(%rbx), %rdi	# %rdi: link_map, %rsi: reloc_index
+	call _dl_call_pltexit
+
+	/* Restore return registers.  */
+	movq LRV_RAX_OFFSET(%rsp), %rax
+	movq LRV_RDX_OFFSET(%rsp), %rdx
+
+	movaps LRV_XMM0_OFFSET(%rsp), %xmm0
+	movaps LRV_XMM1_OFFSET(%rsp), %xmm1
+
+# ifdef RESTORE_AVX
+	/* Check if xmm0/xmm1 registers are changed by audit module.  */
+	vpcmpeqq (LRV_SIZE)(%rsp), %xmm0, %xmm2
+	vpmovmskb %xmm2, %esi
+	cmpl $0xffff, %esi
+	jne 1f
+	vmovdqu LRV_VECTOR0_OFFSET(%rsp), %ymm0
+
+1:	vpcmpeqq (LRV_SIZE + XMM_SIZE)(%rsp), %xmm1, %xmm2
+	vpmovmskb %xmm2, %esi
+	cmpl $0xffff, %esi
+	jne 1f
+	vmovdqu LRV_VECTOR1_OFFSET(%rsp), %ymm1
+
+1:
+# endif
+
+	fldt LRV_ST1_OFFSET(%rsp)
+	fldt LRV_ST0_OFFSET(%rsp)
+
+	movq %rbx, %rsp
+	movq (%rsp), %rbx
+	cfi_restore(rbx)
+	cfi_def_cfa_register(%rsp)
+
+	addq $48, %rsp		# Adjust the stack to the return value
+				# (eats the reloc index and link_map)
+	cfi_adjust_cfa_offset(-48)
+	retq

Modified: fsf/trunk/libc/sysdeps/x86_64/multiarch/Makefile
==============================================================================
--- fsf/trunk/libc/sysdeps/x86_64/multiarch/Makefile (original)
+++ fsf/trunk/libc/sysdeps/x86_64/multiarch/Makefile Sun Aug  9 00:05:21 2009
@@ -4,7 +4,7 @@
 endif
 
 ifeq ($(subdir),string)
-sysdep_routines += stpncpy-c strncpy-c
+sysdep_routines += stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3
 ifeq (yes,$(config-cflags-sse4))
 sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c
 CFLAGS-strcspn-c.c += -msse4

Modified: fsf/trunk/libc/sysdeps/x86_64/multiarch/rawmemchr.S
==============================================================================
--- fsf/trunk/libc/sysdeps/x86_64/multiarch/rawmemchr.S (original)
+++ fsf/trunk/libc/sysdeps/x86_64/multiarch/rawmemchr.S Sun Aug  9 00:05:21 2009
@@ -38,6 +38,7 @@
 strong_alias (rawmemchr, __rawmemchr)
 
 
+	.section .text.sse4.2,"ax",@progbits
 	.align 	16
 	.type	__rawmemchr_sse42, @function
 __rawmemchr_sse42:

Added: fsf/trunk/libc/sysdeps/x86_64/multiarch/strcmp-ssse3.S
==============================================================================
--- fsf/trunk/libc/sysdeps/x86_64/multiarch/strcmp-ssse3.S (added)
+++ fsf/trunk/libc/sysdeps/x86_64/multiarch/strcmp-ssse3.S Sun Aug  9 00:05:21 2009
@@ -1,0 +1,3 @@
+#define USE_SSSE3 1
+#define STRCMP __strcmp_ssse3
+#include "../strcmp.S"

Modified: fsf/trunk/libc/sysdeps/x86_64/multiarch/strcmp.S
==============================================================================
--- fsf/trunk/libc/sysdeps/x86_64/multiarch/strcmp.S (original)
+++ fsf/trunk/libc/sysdeps/x86_64/multiarch/strcmp.S Sun Aug  9 00:05:21 2009
@@ -34,6 +34,7 @@
 	mov	%r9, %r11
 
 #define STRCMP_SSE42	__strncmp_sse42
+#define STRCMP_SSSE3	__strncmp_ssse3
 #define STRCMP_SSE2	__strncmp_sse2
 #define __GI_STRCMP	__GI_strncmp
 #else
@@ -41,6 +42,7 @@
 #ifndef STRCMP
 #define STRCMP		strcmp
 #define STRCMP_SSE42	__strcmp_sse42
+#define STRCMP_SSSE3	__strcmp_ssse3
 #define STRCMP_SSE2	__strcmp_sse2
 #define __GI_STRCMP	__GI_strcmp
 #endif
@@ -60,10 +62,14 @@
 	cmpl	$0, __cpu_features+KIND_OFFSET(%rip)
 	jne	1f
 	call	__init_cpu_features
-1:	leaq	STRCMP_SSE2(%rip), %rax
+1:
+	leaq	STRCMP_SSE42(%rip), %rax
 	testl	$(1<<20), __cpu_features+CPUID_OFFSET+COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET(%rip)
-	jz	2f
-	leaq	STRCMP_SSE42(%rip), %rax
+	jnz	2f
+	leaq	STRCMP_SSSE3(%rip), %rax
+	testl	$(1<<9), __cpu_features+CPUID_OFFSET+COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET(%rip)
+	jnz	2f
+	leaq	STRCMP_SSE2(%rip), %rax
 2:	ret
 END(STRCMP)
 

Modified: fsf/trunk/libc/sysdeps/x86_64/multiarch/strlen.S
==============================================================================
--- fsf/trunk/libc/sysdeps/x86_64/multiarch/strlen.S (original)
+++ fsf/trunk/libc/sysdeps/x86_64/multiarch/strlen.S Sun Aug  9 00:05:21 2009
@@ -40,6 +40,7 @@
 END(strlen)
 
 
+	.section .text.sse4.2,"ax",@progbits
 	.align 	16
 	.type	__strlen_sse42, @function
 __strlen_sse42:

Added: fsf/trunk/libc/sysdeps/x86_64/multiarch/strncmp-ssse3.S
==============================================================================
--- fsf/trunk/libc/sysdeps/x86_64/multiarch/strncmp-ssse3.S (added)
+++ fsf/trunk/libc/sysdeps/x86_64/multiarch/strncmp-ssse3.S Sun Aug  9 00:05:21 2009
@@ -1,0 +1,4 @@
+#define USE_SSSE3 1
+#define STRCMP __strncmp_ssse3
+#define USE_AS_STRNCMP
+#include "../strcmp.S"

Modified: fsf/trunk/libc/sysdeps/x86_64/strcmp.S
==============================================================================
--- fsf/trunk/libc/sysdeps/x86_64/strcmp.S (original)
+++ fsf/trunk/libc/sysdeps/x86_64/strcmp.S Sun Aug  9 00:05:21 2009
@@ -51,7 +51,12 @@
 # endif
 #endif
 
+#ifndef USE_SSSE3
 	.text
+#else
+        .section .text.ssse3,"ax",@progbits
+#endif
+
 ENTRY (BP_SYM (STRCMP))
 #ifdef NOT_IN_libc
 /* Simple version since we can't use SSE registers in ld.so.  */
@@ -244,9 +249,13 @@
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4		 /* store for next cycle */
 
+#ifndef USE_SSSE3
 	psrldq	$1, %xmm3
 	pslldq	$15, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
+#else
+	palignr	$1, %xmm3, %xmm2	/* merge into one 16byte value */
+#endif
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -269,9 +278,13 @@
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4		/* store for next cycle */
 
+#ifndef USE_SSSE3
 	psrldq	$1, %xmm3
-	pslldq 	$15, %xmm2
-	por	%xmm3, %xmm2		/* merge into one 16byte value */
+	pslldq	$15, %xmm2
+	por	%xmm3, %xmm2		/* merge into one 16byte value */
+#else
+	palignr	$1, %xmm3, %xmm2	/* merge into one 16byte value */
+#endif
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -363,9 +376,13 @@
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
+#ifndef USE_SSSE3
 	psrldq	$2, %xmm3
 	pslldq	$14, %xmm2
-	por	%xmm3, %xmm2
+	por	%xmm3, %xmm2		/* merge into one 16byte value */
+#else
+	palignr	$2, %xmm3, %xmm2	/* merge into one 16byte value */
+#endif
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -389,9 +406,13 @@
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
+#ifndef USE_SSSE3
 	psrldq	$2, %xmm3
-	pslldq 	$14, %xmm2
-	por	%xmm3, %xmm2
+	pslldq	$14, %xmm2
+	por	%xmm3, %xmm2		/* merge into one 16byte value */
+#else
+	palignr	$2, %xmm3, %xmm2	/* merge into one 16byte value */
+#endif
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -477,9 +498,13 @@
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
+#ifndef USE_SSSE3
 	psrldq	$3, %xmm3
 	pslldq	$13, %xmm2
-	por	%xmm3, %xmm2
+	por	%xmm3, %xmm2		/* merge into one 16byte value */
+#else
+	palignr	$3, %xmm3, %xmm2	/* merge into one 16byte value */
+#endif
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -503,9 +528,13 @@
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
+#ifndef USE_SSSE3
 	psrldq	$3, %xmm3
-	pslldq 	$13, %xmm2
-	por	%xmm3, %xmm2
+	pslldq	$13, %xmm2
+	por	%xmm3, %xmm2		/* merge into one 16byte value */
+#else
+	palignr	$3, %xmm3, %xmm2	/* merge into one 16byte value */
+#endif
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -591,9 +620,13 @@
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
+#ifndef USE_SSSE3
 	psrldq	$4, %xmm3
 	pslldq	$12, %xmm2
-	por	%xmm3, %xmm2
+	por	%xmm3, %xmm2		/* merge into one 16byte value */
+#else
+	palignr	$4, %xmm3, %xmm2	/* merge into one 16byte value */
+#endif
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -617,9 +650,13 @@
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
+#ifndef USE_SSSE3
 	psrldq	$4, %xmm3
-	pslldq 	$12, %xmm2
-	por	%xmm3, %xmm2
+	pslldq	$12, %xmm2
+	por	%xmm3, %xmm2		/* merge into one 16byte value */
+#else
+	palignr	$4, %xmm3, %xmm2	/* merge into one 16byte value */
+#endif
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -705,9 +742,13 @@
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
+#ifndef USE_SSSE3
 	psrldq	$5, %xmm3
 	pslldq	$11, %xmm2
-	por	%xmm3, %xmm2
+	por	%xmm3, %xmm2		/* merge into one 16byte value */
+#else
+	palignr	$5, %xmm3, %xmm2	/* merge into one 16byte value */
+#endif
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -731,9 +772,13 @@
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
+#ifndef USE_SSSE3
 	psrldq	$5, %xmm3
-	pslldq 	$11, %xmm2
-	por	%xmm3, %xmm2
+	pslldq	$11, %xmm2
+	por	%xmm3, %xmm2		/* merge into one 16byte value */
+#else
+	palignr	$5, %xmm3, %xmm2	/* merge into one 16byte value */
+#endif
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -819,9 +864,13 @@
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
+#ifndef USE_SSSE3
 	psrldq	$6, %xmm3
 	pslldq	$10, %xmm2
-	por	%xmm3, %xmm2
+	por	%xmm3, %xmm2		/* merge into one 16byte value */
+#else
+	palignr	$6, %xmm3, %xmm2	/* merge into one 16byte value */
+#endif
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -845,9 +894,13 @@
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
+#ifndef USE_SSSE3
 	psrldq	$6, %xmm3
-	pslldq 	$10, %xmm2
-	por	%xmm3, %xmm2
+	pslldq	$10, %xmm2
+	por	%xmm3, %xmm2		/* merge into one 16byte value */
+#else
+	palignr	$6, %xmm3, %xmm2	/* merge into one 16byte value */
+#endif
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -933,9 +986,13 @@
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
+#ifndef USE_SSSE3
 	psrldq	$7, %xmm3
 	pslldq	$9, %xmm2
-	por	%xmm3, %xmm2
+	por	%xmm3, %xmm2		/* merge into one 16byte value */
+#else
+	palignr	$7, %xmm3, %xmm2	/* merge into one 16byte value */
+#endif
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -959,9 +1016,13 @@
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
+#ifndef USE_SSSE3
 	psrldq	$7, %xmm3
-	pslldq 	$9, %xmm2
-	por	%xmm3, %xmm2
+	pslldq	$9, %xmm2
+	por	%xmm3, %xmm2		/* merge into one 16byte value */
+#else
+	palignr	$7, %xmm3, %xmm2	/* merge into one 16byte value */
+#endif
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -1047,9 +1108,13 @@
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
+#ifndef USE_SSSE3
 	psrldq	$8, %xmm3
 	pslldq	$8, %xmm2
-	por	%xmm3, %xmm2
+	por	%xmm3, %xmm2		/* merge into one 16byte value */
+#else
+	palignr	$8, %xmm3, %xmm2	/* merge into one 16byte value */
+#endif
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -1073,9 +1138,13 @@
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
+#ifndef USE_SSSE3
 	psrldq	$8, %xmm3
-	pslldq 	$8, %xmm2
-	por	%xmm3, %xmm2
+	pslldq	$8, %xmm2
+	por	%xmm3, %xmm2		/* merge into one 16byte value */
+#else
+	palignr	$8, %xmm3, %xmm2	/* merge into one 16byte value */
+#endif
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -1161,9 +1230,13 @@
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
+#ifndef USE_SSSE3
 	psrldq	$9, %xmm3
 	pslldq	$7, %xmm2
-	por	%xmm3, %xmm2
+	por	%xmm3, %xmm2		/* merge into one 16byte value */
+#else
+	palignr	$9, %xmm3, %xmm2	/* merge into one 16byte value */
+#endif
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -1187,9 +1260,13 @@
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
+#ifndef USE_SSSE3
 	psrldq	$9, %xmm3
-	pslldq 	$7, %xmm2
-	por	%xmm3, %xmm2
+	pslldq	$7, %xmm2
+	por	%xmm3, %xmm2		/* merge into one 16byte value */
+#else
+	palignr	$9, %xmm3, %xmm2	/* merge into one 16byte value */
+#endif
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -1275,9 +1352,13 @@
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
+#ifndef USE_SSSE3
 	psrldq	$10, %xmm3
 	pslldq	$6, %xmm2
-	por	%xmm3, %xmm2
+	por	%xmm3, %xmm2		/* merge into one 16byte value */
+#else
+	palignr	$10, %xmm3, %xmm2	/* merge into one 16byte value */
+#endif
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -1301,9 +1382,13 @@
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
+#ifndef USE_SSSE3
 	psrldq	$10, %xmm3
-	pslldq 	$6, %xmm2
-	por	%xmm3, %xmm2
+	pslldq	$6, %xmm2
+	por	%xmm3, %xmm2		/* merge into one 16byte value */
+#else
+	palignr	$10, %xmm3, %xmm2	/* merge into one 16byte value */
+#endif
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -1389,9 +1474,13 @@
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
+#ifndef USE_SSSE3
 	psrldq	$11, %xmm3
 	pslldq	$5, %xmm2
-	por	%xmm3, %xmm2
+	por	%xmm3, %xmm2		/* merge into one 16byte value */
+#else
+	palignr	$11, %xmm3, %xmm2	/* merge into one 16byte value */
+#endif
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -1415,9 +1504,13 @@
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
+#ifndef USE_SSSE3
 	psrldq	$11, %xmm3
-	pslldq 	$5, %xmm2
-	por	%xmm3, %xmm2
+	pslldq	$5, %xmm2
+	por	%xmm3, %xmm2		/* merge into one 16byte value */
+#else
+	palignr	$11, %xmm3, %xmm2	/* merge into one 16byte value */
+#endif
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -1503,9 +1596,13 @@
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
+#ifndef USE_SSSE3
 	psrldq	$12, %xmm3
 	pslldq	$4, %xmm2
-	por	%xmm3, %xmm2
+	por	%xmm3, %xmm2		/* merge into one 16byte value */
+#else
+	palignr	$12, %xmm3, %xmm2	/* merge into one 16byte value */
+#endif
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -1529,9 +1626,13 @@
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
+#ifndef USE_SSSE3
 	psrldq	$12, %xmm3
-	pslldq 	$4, %xmm2
-	por	%xmm3, %xmm2
+	pslldq	$4, %xmm2
+	por	%xmm3, %xmm2		/* merge into one 16byte value */
+#else
+	palignr	$12, %xmm3, %xmm2	/* merge into one 16byte value */
+#endif
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -1617,9 +1718,13 @@
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
+#ifndef USE_SSSE3
 	psrldq	$13, %xmm3
 	pslldq	$3, %xmm2
-	por	%xmm3, %xmm2
+	por	%xmm3, %xmm2		/* merge into one 16byte value */
+#else
+	palignr	$13, %xmm3, %xmm2	/* merge into one 16byte value */
+#endif
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -1643,9 +1748,13 @@
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
+#ifndef USE_SSSE3
 	psrldq	$13, %xmm3
-	pslldq 	$3, %xmm2
-	por	%xmm3, %xmm2
+	pslldq	$3, %xmm2
+	por	%xmm3, %xmm2		/* merge into one 16byte value */
+#else
+	palignr	$13, %xmm3, %xmm2	/* merge into one 16byte value */
+#endif
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -1731,9 +1840,13 @@
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
+#ifndef USE_SSSE3
 	psrldq	$14, %xmm3
 	pslldq	$2, %xmm2
-	por	%xmm3, %xmm2
+	por	%xmm3, %xmm2		/* merge into one 16byte value */
+#else
+	palignr	$14, %xmm3, %xmm2	/* merge into one 16byte value */
+#endif
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -1757,9 +1870,13 @@
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
+#ifndef USE_SSSE3
 	psrldq	$14, %xmm3
-	pslldq 	$2, %xmm2
-	por	%xmm3, %xmm2
+	pslldq	$2, %xmm2
+	por	%xmm3, %xmm2		/* merge into one 16byte value */
+#else
+	palignr	$14, %xmm3, %xmm2	/* merge into one 16byte value */
+#endif
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -1847,9 +1964,13 @@
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
+#ifndef USE_SSSE3
 	psrldq	$15, %xmm3
 	pslldq	$1, %xmm2
-	por	%xmm3, %xmm2
+	por	%xmm3, %xmm2		/* merge into one 16byte value */
+#else
+	palignr	$15, %xmm3, %xmm2	/* merge into one 16byte value */
+#endif
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -1873,9 +1994,13 @@
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
+#ifndef USE_SSSE3
 	psrldq	$15, %xmm3
-	pslldq 	$1, %xmm2
-	por	%xmm3, %xmm2
+	pslldq	$1, %xmm2
+	por	%xmm3, %xmm2		/* merge into one 16byte value */
+#else
+	palignr	$15, %xmm3, %xmm2	/* merge into one 16byte value */
+#endif
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1