
[commits] r8676 - in /fsf/trunk/libc: ./ elf/ include/ malloc/ nscd/ sysdeps/generic/ sysdeps/x86_64/



Author: eglibc
Date: Fri Jul 17 00:07:41 2009
New Revision: 8676

Log:
Import glibc-mainline for 2009-07-17

Added:
    fsf/trunk/libc/sysdeps/x86_64/memcmp.S
Modified:
    fsf/trunk/libc/ChangeLog
    fsf/trunk/libc/elf/dl-misc.c
    fsf/trunk/libc/include/atomic.h
    fsf/trunk/libc/malloc/malloc.c
    fsf/trunk/libc/nscd/aicache.c
    fsf/trunk/libc/nscd/cache.c
    fsf/trunk/libc/nscd/grpcache.c
    fsf/trunk/libc/nscd/hstcache.c
    fsf/trunk/libc/nscd/initgrcache.c
    fsf/trunk/libc/nscd/mem.c
    fsf/trunk/libc/nscd/pwdcache.c
    fsf/trunk/libc/nscd/servicescache.c
    fsf/trunk/libc/sysdeps/generic/ldsodefs.h
    fsf/trunk/libc/sysdeps/x86_64/dl-trampoline.S

Modified: fsf/trunk/libc/ChangeLog
==============================================================================
--- fsf/trunk/libc/ChangeLog (original)
+++ fsf/trunk/libc/ChangeLog Fri Jul 17 00:07:41 2009
@@ -1,3 +1,52 @@
+2009-07-16  Petr Baudis  <pasky@xxxxxxx>
+
+	* nscd/mem.c (mempool_alloc): Fix missing unlock in the else branch.
+	* nscd/aicache.c: Remove bogus db->lock unlock.
+	* nscd/grpcache.c: Likewise.
+	* nscd/hstcache.c: Likewise.
+	* nscd/initgrcache.c: Likewise.
+	* nscd/pwdcache.c: Likewise.
+	* nscd/servicescache.c: Likewise.
+
+2009-07-16  Ulrich Drepper  <drepper@xxxxxxxxxx>
+
+	* nscd/cache.c (cache_add): Use atomic_compare_and_exchange_bool_rel
+	instead of atomic_compare_and_exchange_bool_acq to ensure the pointer
+	is written before the list head update.
+	Patch by Andreas Schwab <aschwab@xxxxxxxxxx>.
+
+2009-07-16  Ulrich Drepper  <drepper@xxxxxxxxxx>
+	    Jakub Jelinek  <jakub@xxxxxxxxxx>
+
+	* malloc/malloc.c [ATOMIC_FASTBINS] (_int_free): Make check for
+	corruption thread-safe.
+
+2009-07-13  Jakub Jelinek  <jakub@xxxxxxxxxx>
+
+	* include/atomic.h (catomic_compare_and_exchange_val_rel): If the
+	arch overrides atomic_compare_and_exchange_val_rel, define to
+	atomic_compare_and_exchange_val_rel; otherwise default to
+	catomic_compare_and_exchange_val_acq.
+	(catomic_compare_and_exchange_bool_rel): If the arch overrides
+	atomic_compare_and_exchange_bool_rel, define to
+	atomic_compare_and_exchange_bool_rel; otherwise default to
+	catomic_compare_and_exchange_bool_acq.
+	* malloc/malloc.c (_int_free): Revert 2009-07-02 change.
+	Use catomic_compare_and_exchange_val_rel instead of
+	catomic_compare_and_exchange_val_acq.
+
+2009-07-16  Ulrich Drepper  <drepper@xxxxxxxxxx>
+
+	* sysdeps/generic/ldsodefs.h: Add prototype for
+	_dl_higher_prime_number.
+	* elf/dl-misc.c (_dl_higher_prime_number): Mark with internal_function.
+
+	* sysdeps/x86_64/dl-trampoline.S (_dl_runtime_profile): Optimize
+	restoring of ymm registers a bit.
+
+2009-07-15  H.J. Lu  <hongjiu.lu@xxxxxxxxx>
+
+	* sysdeps/x86_64/memcmp.S: New file.
+
 2009-07-15  Ulrich Drepper  <drepper@xxxxxxxxxx>
 
 	* sysdeps/x86_64/dl-trampoline.h: Remove after integrating code into...

Modified: fsf/trunk/libc/elf/dl-misc.c
==============================================================================
--- fsf/trunk/libc/elf/dl-misc.c (original)
+++ fsf/trunk/libc/elf/dl-misc.c Fri Jul 17 00:07:41 2009
@@ -315,6 +315,7 @@
 
 
 unsigned long int
+internal_function
 _dl_higher_prime_number (unsigned long int n)
 {
   /* These are primes that are near, but slightly smaller than, a

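For context, _dl_higher_prime_number picks a hash-table size for the dynamic linker: it returns a prime at least as large as its argument, drawn from a table of primes sitting just below powers of two, which is why the prototype added to ldsodefs.h below matters to other ld.so code. A minimal sketch under those assumptions (table truncated, and a linear scan here where the real function can afford a binary search):

#include <stddef.h>

/* Primes just below successive powers of two (truncated; purely
   illustrative, not the table from elf/dl-misc.c).  */
static const unsigned long int primes[] =
  {
    7, 13, 31, 61, 127, 251, 509, 1021, 2039, 4093, 8191, 16381
  };

/* Return the smallest table prime >= N, or 0 if N is out of range.  */
unsigned long int
higher_prime_number (unsigned long int n)
{
  for (size_t i = 0; i < sizeof primes / sizeof primes[0]; ++i)
    if (primes[i] >= n)
      return primes[i];
  return 0;
}
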
Modified: fsf/trunk/libc/include/atomic.h
==============================================================================
--- fsf/trunk/libc/include/atomic.h (original)
+++ fsf/trunk/libc/include/atomic.h Fri Jul 17 00:07:41 2009
@@ -107,14 +107,19 @@
 #endif
 
 
+#ifndef catomic_compare_and_exchange_val_rel
+# ifndef atomic_compare_and_exchange_val_rel
+#  define catomic_compare_and_exchange_val_rel(mem, newval, oldval)	      \
+  catomic_compare_and_exchange_val_acq (mem, newval, oldval)
+# else
+#  define catomic_compare_and_exchange_val_rel(mem, newval, oldval)	      \
+  atomic_compare_and_exchange_val_rel (mem, newval, oldval)
+# endif
+#endif
+
+
 #ifndef atomic_compare_and_exchange_val_rel
 # define atomic_compare_and_exchange_val_rel(mem, newval, oldval)	      \
-  atomic_compare_and_exchange_val_acq (mem, newval, oldval)
-#endif
-
-
-#ifndef catomic_compare_and_exchange_val_rel
-# define catomic_compare_and_exchange_val_rel(mem, newval, oldval)	      \
   atomic_compare_and_exchange_val_acq (mem, newval, oldval)
 #endif
 
@@ -155,15 +160,20 @@
 #endif
 
 
+#ifndef catomic_compare_and_exchange_bool_rel
+# ifndef atomic_compare_and_exchange_bool_rel
+#  define catomic_compare_and_exchange_bool_rel(mem, newval, oldval)	      \
+  catomic_compare_and_exchange_bool_acq (mem, newval, oldval)
+# else
+#  define catomic_compare_and_exchange_bool_rel(mem, newval, oldval)	      \
+  atomic_compare_and_exchange_bool_rel (mem, newval, oldval)
+# endif
+#endif
+
+
 #ifndef atomic_compare_and_exchange_bool_rel
 # define atomic_compare_and_exchange_bool_rel(mem, newval, oldval) \
   atomic_compare_and_exchange_bool_acq (mem, newval, oldval)
-#endif
-
-
-#ifndef catomic_compare_and_exchange_bool_rel
-# define catomic_compare_and_exchange_bool_rel(mem, newval, oldval) \
-  catomic_compare_and_exchange_bool_acq (mem, newval, oldval)
 #endif
 
 

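The reordering above changes which fallback wins: previously catomic_compare_and_exchange_val_rel always fell back to the acquire variant, even when an arch supplied a real release CAS; now an arch-provided atomic_compare_and_exchange_val_rel is preferred, and only in its absence does the catomic macro default to the acquire form. A compilable sketch of the same preference chain, with hypothetical macro names (arch_cas_rel stands in for an arch override; none of these names are glibc's):

#include <stdatomic.h>
#include <stdio.h>

/* Acquire fallback, the shape atomic.h uses when nothing better
   exists.  cas_acq/cas_rel/arch_cas_rel are hypothetical names.  */
#define cas_acq(mem, newval, oldval) \
  atomic_compare_exchange_strong_explicit ((mem), &(oldval), (newval), \
                                           memory_order_acquire, \
                                           memory_order_acquire)

#ifndef arch_cas_rel
/* No arch override: default to the acquire variant.  */
# define cas_rel(mem, newval, oldval) cas_acq (mem, newval, oldval)
#else
/* Arch override present: prefer the genuine release CAS.  */
# define cas_rel(mem, newval, oldval) arch_cas_rel (mem, newval, oldval)
#endif

int
main (void)
{
  _Atomic int x = 0;
  int expected = 0;
  cas_rel (&x, 1, expected);
  printf ("%d\n", atomic_load (&x));   /* prints 1 */
  return 0;
}
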
Modified: fsf/trunk/libc/malloc/malloc.c
==============================================================================
--- fsf/trunk/libc/malloc/malloc.c (original)
+++ fsf/trunk/libc/malloc/malloc.c Fri Jul 17 00:07:41 2009
@@ -4799,8 +4799,29 @@
 	|| __builtin_expect (chunksize (chunk_at_offset (p, size))
 			     >= av->system_mem, 0))
       {
-	errstr = "free(): invalid next size (fast)";
-	goto errout;
+#ifdef ATOMIC_FASTBINS
+	/* We might not have a lock at this point and concurrent modifications
+	   of system_mem might have led to a false positive.  Redo the test
+	   after getting the lock.  */
+	if (have_lock
+	    || ({ assert (locked == 0);
+		  mutex_lock(&av->mutex);
+		  locked = 1;
+		  chunk_at_offset (p, size)->size <= 2 * SIZE_SZ
+		    || chunksize (chunk_at_offset (p, size)) >= av->system_mem;
+	      }))
+#endif
+	  {
+	    errstr = "free(): invalid next size (fast)";
+	    goto errout;
+	  }
+#ifdef ATOMIC_FASTBINS
+	if (! have_lock)
+	  {
+	    (void)mutex_unlock(&av->mutex);
+	    locked = 0;
+	  }
+#endif
       }
 
     if (__builtin_expect (perturb_byte, 0))
@@ -4822,9 +4843,8 @@
 	    goto errout;
 	  }
 	p->fd = fd = old;
-	atomic_full_barrier ();
       }
-    while ((old = catomic_compare_and_exchange_val_acq (fb, p, fd)) != fd);
+    while ((old = catomic_compare_and_exchange_val_rel (fb, p, fd)) != fd);
 #else
     /* Another simple check: make sure the top of the bin is not the
        record we are going to add (i.e., double free).  */

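Two things happen in this malloc.c change. First, the invalid-next-size check can misfire without the arena lock, since another thread may be updating av->system_mem concurrently; an apparent failure is therefore re-tested after taking the lock before free() is declared corrupt. Second, the explicit full barrier before the fastbin CAS is dropped in favor of giving the CAS itself release semantics. A hedged sketch of the first pattern (recheck-under-lock); the names are illustrative, not glibc internals:

#include <pthread.h>
#include <stdbool.h>
#include <stddef.h>

static pthread_mutex_t arena_lock = PTHREAD_MUTEX_INITIALIZER;
static volatile size_t next_size;              /* next chunk's size field */
static volatile size_t system_mem = 1 << 20;   /* may grow concurrently */

/* Racy plausibility test; can report a false positive while another
   thread is enlarging system_mem.  */
static bool
looks_corrupt (void)
{
  return next_size <= 2 * sizeof (size_t) || next_size >= system_mem;
}

/* Return false iff the chunk is really corrupt.  */
static bool
check_free (bool have_lock)
{
  if (looks_corrupt ())
    {
      bool corrupt = true;
      if (!have_lock)
        {
          /* The first test ran unlocked; redo it now that system_mem
             cannot change under us.  */
          pthread_mutex_lock (&arena_lock);
          corrupt = looks_corrupt ();
          pthread_mutex_unlock (&arena_lock);
        }
      if (corrupt)
        return false;   /* report "invalid next size (fast)" */
    }
  return true;
}
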
Modified: fsf/trunk/libc/nscd/aicache.c
==============================================================================
--- fsf/trunk/libc/nscd/aicache.c (original)
+++ fsf/trunk/libc/nscd/aicache.c Fri Jul 17 00:07:41 2009
@@ -543,8 +543,6 @@
       (void) cache_add (req->type, key_copy, req->key_len, &dataset->head,
 			true, db, uid, he == NULL);
 
-      pthread_rwlock_unlock (&db->lock);
-
       /* Mark the old entry as obsolete.  */
       if (dh != NULL)
 	dh->usable = false;

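This unlock (and the matching ones removed from grpcache, hstcache, initgrcache, pwdcache and servicescache below) became bogus once mempool_alloc was made responsible for releasing db->lock on every exit path (see the mem.c hunk further down): releasing the rwlock a second time from the request handler would unlock a lock the thread no longer holds, which is undefined behavior for pthread_rwlock_t. A schematic of the single-owner discipline, with hypothetical names:

#include <pthread.h>

static pthread_rwlock_t db_lock = PTHREAD_RWLOCK_INITIALIZER;

/* Acquires and releases db_lock itself; callers must not touch it.  */
static void
pool_alloc_sketch (void)
{
  pthread_rwlock_rdlock (&db_lock);
  /* ... carve out memory for the new cache entry ... */
  pthread_rwlock_unlock (&db_lock);
}

static void
handler_sketch (void)
{
  pool_alloc_sketch ();
  /* Before the fix a second pthread_rwlock_unlock (&db_lock) sat
     here; unlocking a rwlock the thread does not hold is undefined.  */
}
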
Modified: fsf/trunk/libc/nscd/cache.c
==============================================================================
--- fsf/trunk/libc/nscd/cache.c (original)
+++ fsf/trunk/libc/nscd/cache.c Fri Jul 17 00:07:41 2009
@@ -179,7 +179,7 @@
   /* Put the new entry in the first position.  */
   do
     newp->next = table->head->array[hash];
-  while (atomic_compare_and_exchange_bool_acq (&table->head->array[hash],
+  while (atomic_compare_and_exchange_bool_rel (&table->head->array[hash],
 					       (ref_t) ((char *) newp
 							- table->data),
 					       (ref_t) newp->next));

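The CAS that publishes a new cache entry now has release rather than acquire semantics: the store that links newp into the list must not become visible before the stores that initialized newp itself. In C11 terms the same push looks like this (a sketch, not nscd's actual data structures):

#include <stdatomic.h>
#include <stdlib.h>

struct entry
{
  int key;
  struct entry *next;
};

static _Atomic (struct entry *) head;

static int
push (int key)
{
  struct entry *newp = malloc (sizeof *newp);
  if (newp == NULL)
    return -1;
  newp->key = key;

  struct entry *old = atomic_load_explicit (&head, memory_order_relaxed);
  do
    newp->next = old;
  /* Release: all stores to *newp happen-before the head update.  */
  while (!atomic_compare_exchange_weak_explicit (&head, &old, newp,
                                                 memory_order_release,
                                                 memory_order_relaxed));
  return 0;
}
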
Modified: fsf/trunk/libc/nscd/grpcache.c
==============================================================================
--- fsf/trunk/libc/nscd/grpcache.c (original)
+++ fsf/trunk/libc/nscd/grpcache.c Fri Jul 17 00:07:41 2009
@@ -146,8 +146,6 @@
 	      (void) cache_add (req->type, &dataset->strdata, req->key_len,
 				&dataset->head, true, db, owner, he == NULL);
 
-	      pthread_rwlock_unlock (&db->lock);
-
 	      /* Mark the old entry as obsolete.  */
 	      if (dh != NULL)
 		dh->usable = false;
@@ -367,12 +365,10 @@
 		(void) cache_add (GETGRBYGID, cp, key_offset, &dataset->head,
 				  false, db, owner, false);
 	    }
-
-	out:
-	  pthread_rwlock_unlock (&db->lock);
 	}
     }
 
+out:
   if (__builtin_expect (written != total, 0) && debug_level > 0)
     {
       char buf[256];

Modified: fsf/trunk/libc/nscd/hstcache.c
==============================================================================
--- fsf/trunk/libc/nscd/hstcache.c (original)
+++ fsf/trunk/libc/nscd/hstcache.c Fri Jul 17 00:07:41 2009
@@ -152,8 +152,6 @@
 
 	      (void) cache_add (req->type, &dataset->strdata, req->key_len,
 				&dataset->head, true, db, owner, he == NULL);
-
-	      pthread_rwlock_unlock (&db->lock);
 
 	      /* Mark the old entry as obsolete.  */
 	      if (dh != NULL)
@@ -404,8 +402,6 @@
 
 	  (void) cache_add (req->type, key_copy, req->key_len,
 			    &dataset->head, true, db, owner, he == NULL);
-
-	  pthread_rwlock_unlock (&db->lock);
 	}
     }
 

Modified: fsf/trunk/libc/nscd/initgrcache.c
==============================================================================
--- fsf/trunk/libc/nscd/initgrcache.c (original)
+++ fsf/trunk/libc/nscd/initgrcache.c Fri Jul 17 00:07:41 2009
@@ -229,8 +229,6 @@
 
 	      (void) cache_add (req->type, key_copy, req->key_len,
 				&dataset->head, true, db, uid, he == NULL);
-
-	      pthread_rwlock_unlock (&db->lock);
 
 	      /* Mark the old entry as obsolete.  */
 	      if (dh != NULL)
@@ -388,8 +386,6 @@
 
 	  (void) cache_add (INITGROUPS, cp, req->key_len, &dataset->head, true,
 			    db, uid, he == NULL);
-
-	  pthread_rwlock_unlock (&db->lock);
 	}
     }
 

Modified: fsf/trunk/libc/nscd/mem.c
==============================================================================
--- fsf/trunk/libc/nscd/mem.c (original)
+++ fsf/trunk/libc/nscd/mem.c Fri Jul 17 00:07:41 2009
@@ -566,9 +566,6 @@
 	    }
 	}
 
-      if (data_alloc)
-	pthread_rwlock_unlock (&db->lock);
-
       if (! db->last_alloc_failed)
 	{
 	  dbg_log (_("no more memory for database '%s'"), dbnames[db - dbs]);
@@ -591,5 +588,8 @@
 
   pthread_mutex_unlock (&db->memlock);
 
+  if (data_alloc)
+    pthread_rwlock_unlock (&db->lock);
+
   return res;
 }

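This is the other half of the nscd locking fix: mempool_alloc used to drop db->lock only inside one failure branch, so the else branch could return with the lock still held (the "missing unlock" in the ChangeLog). Moving the unlock to the common exit, after memlock is released, guarantees exactly one unlock on every path. A sketch of the shape of the fix, with illustrative names:

#include <pthread.h>
#include <stddef.h>

static pthread_mutex_t mem_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_rwlock_t db_lock = PTHREAD_RWLOCK_INITIALIZER;

static void *
pool_alloc (size_t len, int data_alloc)
{
  void *res = NULL;

  pthread_mutex_lock (&mem_lock);
  if (data_alloc)
    pthread_rwlock_rdlock (&db_lock);

  (void) len;   /* ... try to allocate LEN bytes; every branch just
                   sets res and falls through, instead of unlocking
                   on its own ... */

  pthread_mutex_unlock (&mem_lock);
  if (data_alloc)
    pthread_rwlock_unlock (&db_lock);   /* one unlock covers all paths */
  return res;
}
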
Modified: fsf/trunk/libc/nscd/pwdcache.c
==============================================================================
--- fsf/trunk/libc/nscd/pwdcache.c (original)
+++ fsf/trunk/libc/nscd/pwdcache.c Fri Jul 17 00:07:41 2009
@@ -153,8 +153,6 @@
 	      (void) cache_add (req->type, key_copy, req->key_len,
 				&dataset->head, true, db, owner, he == NULL);
 
-	      pthread_rwlock_unlock (&db->lock);
-
 	      /* Mark the old entry as obsolete.  */
 	      if (dh != NULL)
 		dh->usable = false;
@@ -362,12 +360,10 @@
 		(void) cache_add (GETPWBYUID, cp, key_offset, &dataset->head,
 				  false, db, owner, false);
 	    }
-
-	out:
-	  pthread_rwlock_unlock (&db->lock);
 	}
     }
 
+out:
   if (__builtin_expect (written != total, 0) && debug_level > 0)
     {
       char buf[256];

Modified: fsf/trunk/libc/nscd/servicescache.c
==============================================================================
--- fsf/trunk/libc/nscd/servicescache.c (original)
+++ fsf/trunk/libc/nscd/servicescache.c Fri Jul 17 00:07:41 2009
@@ -135,8 +135,6 @@
 
 	      (void) cache_add (req->type, &dataset->strdata, req->key_len,
 				&dataset->head, true, db, owner, he == NULL);
-
-	      pthread_rwlock_unlock (&db->lock);
 
 	      /* Mark the old entry as obsolete.  */
 	      if (dh != NULL)
@@ -317,8 +315,6 @@
 
 	  (void) cache_add (req->type, key_copy, req->key_len,
 			    &dataset->head, true, db, owner, he == NULL);
-
-	  pthread_rwlock_unlock (&db->lock);
 	}
     }
 

Modified: fsf/trunk/libc/sysdeps/generic/ldsodefs.h
==============================================================================
--- fsf/trunk/libc/sysdeps/generic/ldsodefs.h (original)
+++ fsf/trunk/libc/sysdeps/generic/ldsodefs.h Fri Jul 17 00:07:41 2009
@@ -335,6 +335,10 @@
 extern int _dl_name_match_p (const char *__name, const struct link_map *__map)
      internal_function;
 
+/* Compute next higher prime number.  */
+extern unsigned long int _dl_higher_prime_number (unsigned long int n)
+     internal_function;
+
 /* Function used as argument for `_dl_receive_error' function.  The
    arguments are the error code, error string, and the objname the
    error occurred in.  */

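The prototype matters because internal_function can change the calling convention: on targets where it expands to an attribute, a caller seeing a plain declaration would pass arguments differently from how the internal_function definition in dl-misc.c expects them. Roughly, and as an assumption about the historical i386 case rather than a quote of the glibc macro:

/* Sketch: on i386 glibc historically defined internal_function to a
   register-based calling convention; elsewhere it is empty.  */
#if defined __i386__
# define internal_function __attribute__ ((regparm (3), stdcall))
#else
# define internal_function /* nothing */
#endif

/* Declaration and definition must both carry the attribute, hence
   the prototype added to ldsodefs.h above.  */
extern unsigned long int _dl_higher_prime_number (unsigned long int n)
     internal_function;
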
Modified: fsf/trunk/libc/sysdeps/x86_64/dl-trampoline.S
==============================================================================
--- fsf/trunk/libc/sysdeps/x86_64/dl-trampoline.S (original)
+++ fsf/trunk/libc/sysdeps/x86_64/dl-trampoline.S Fri Jul 17 00:07:41 2009
@@ -185,71 +185,6 @@
 	movq  LR_R8_OFFSET(%rsp), %r8
 	movq  LR_R9_OFFSET(%rsp), %r9
 
-# ifdef HAVE_AVX_SUPPORT
-	cmpl	$0, L(have_avx)(%rip)
-	js	L(no_avx2)
-
-	/* Check if any xmm0-xmm7 registers are changed by audit
-	   module.  */
-	vmovdqa (LR_XMM_OFFSET)(%rsp), %xmm0
-	vpcmpeqq (LR_SIZE)(%rsp), %xmm0, %xmm1
-	vpmovmskb %xmm1, %esi
-	cmpl $0xffff, %esi
-	je 1f
-	vmovdqu			(LR_VECTOR_OFFSET)(%rsp), %ymm0
-
-1:	vmovdqa (LR_XMM_OFFSET + XMM_SIZE)(%rsp), %xmm1
-	vpcmpeqq (LR_SIZE + XMM_SIZE)(%rsp), %xmm1, %xmm2
-	vpmovmskb %xmm2, %esi
-	cmpl $0xffff, %esi
-	je 1f
-	vmovdqu	  (LR_VECTOR_OFFSET + VECTOR_SIZE)(%rsp), %ymm1
-
-1:	vmovdqa (LR_XMM_OFFSET + XMM_SIZE*2)(%rsp), %xmm2
-	vpcmpeqq (LR_SIZE + XMM_SIZE*2)(%rsp), %xmm2, %xmm3
-	vpmovmskb %xmm3, %esi
-	cmpl $0xffff, %esi
-	je 1f
-	vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp), %ymm2
-
-1:	vmovdqa (LR_XMM_OFFSET + XMM_SIZE*3)(%rsp), %xmm3
-	vpcmpeqq (LR_SIZE + XMM_SIZE*3)(%rsp), %xmm3, %xmm4
-	vpmovmskb %xmm4, %esi
-	cmpl $0xffff, %esi
-	je 1f
-	vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp), %ymm3
-
-1:	vmovdqa (LR_XMM_OFFSET + XMM_SIZE*4)(%rsp), %xmm4
-	vpcmpeqq (LR_SIZE + XMM_SIZE*4)(%rsp), %xmm4, %xmm5
-	vpmovmskb %xmm5, %esi
-	cmpl $0xffff, %esi
-	je 1f
-	vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp), %ymm4
-
-1:	vmovdqa (LR_XMM_OFFSET + XMM_SIZE*5)(%rsp), %xmm5
-	vpcmpeqq (LR_SIZE + XMM_SIZE*5)(%rsp), %xmm5, %xmm6
-	vpmovmskb %xmm6, %esi
-	cmpl $0xffff, %esi
-	je 1f
-	vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp), %ymm5
-
-1:	vmovdqa (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp), %xmm6
-	vpcmpeqq (LR_SIZE + XMM_SIZE*6)(%rsp), %xmm6, %xmm7
-	vpmovmskb %xmm7, %esi
-	cmpl $0xffff, %esi
-	je 1f
-	vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp), %ymm6
-
-1:	vmovdqa (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp), %xmm7
-	vpcmpeqq (LR_SIZE + XMM_SIZE*7)(%rsp), %xmm7, %xmm8
-	vpmovmskb %xmm8, %esi
-	cmpl $0xffff, %esi
-	je 1f
-	vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp), %ymm7
-	jmp	1f
-
-L(no_avx2):
-# endif
 	movaps		    (LR_XMM_OFFSET)(%rsp), %xmm0
 	movaps	 (LR_XMM_OFFSET + XMM_SIZE)(%rsp), %xmm1
 	movaps (LR_XMM_OFFSET + XMM_SIZE*2)(%rsp), %xmm2
@@ -259,7 +194,64 @@
 	movaps (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp), %xmm6
 	movaps (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp), %xmm7
 
-1:	movq 16(%rbx), %r10	# Anything in framesize?
+# ifdef HAVE_AVX_SUPPORT
+	cmpl	$0, L(have_avx)(%rip)
+	js	L(no_avx2)
+
+	/* Check if any xmm0-xmm7 registers are changed by audit
+	   module.  */
+	vpcmpeqq (LR_SIZE)(%rsp), %xmm0, %xmm8
+	vpmovmskb %xmm8, %esi
+	cmpl $0xffff, %esi
+	je 1f
+	vmovdqu			(LR_VECTOR_OFFSET)(%rsp), %ymm0
+
+1:	vpcmpeqq (LR_SIZE + XMM_SIZE)(%rsp), %xmm1, %xmm8
+	vpmovmskb %xmm8, %esi
+	cmpl $0xffff, %esi
+	je 1f
+	vmovdqu	  (LR_VECTOR_OFFSET + VECTOR_SIZE)(%rsp), %ymm1
+
+1:	vpcmpeqq (LR_SIZE + XMM_SIZE*2)(%rsp), %xmm2, %xmm8
+	vpmovmskb %xmm8, %esi
+	cmpl $0xffff, %esi
+	je 1f
+	vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp), %ymm2
+
+1:	vpcmpeqq (LR_SIZE + XMM_SIZE*3)(%rsp), %xmm3, %xmm8
+	vpmovmskb %xmm8, %esi
+	cmpl $0xffff, %esi
+	je 1f
+	vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp), %ymm3
+
+1:	vpcmpeqq (LR_SIZE + XMM_SIZE*4)(%rsp), %xmm4, %xmm8
+	vpmovmskb %xmm8, %esi
+	cmpl $0xffff, %esi
+	je 1f
+	vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp), %ymm4
+
+1:	vpcmpeqq (LR_SIZE + XMM_SIZE*5)(%rsp), %xmm5, %xmm8
+	vpmovmskb %xmm8, %esi
+	cmpl $0xffff, %esi
+	je 1f
+	vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp), %ymm5
+
+1:	vpcmpeqq (LR_SIZE + XMM_SIZE*6)(%rsp), %xmm6, %xmm8
+	vpmovmskb %xmm8, %esi
+	cmpl $0xffff, %esi
+	je 1f
+	vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp), %ymm6
+
+1:	vpcmpeqq (LR_SIZE + XMM_SIZE*7)(%rsp), %xmm7, %xmm8
+	vpmovmskb %xmm8, %esi
+	cmpl $0xffff, %esi
+	je 1f
+	vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp), %ymm7
+
+L(no_avx2):
+1:
+# endif
+	movq 16(%rbx), %r10	# Anything in framesize?
 	testq %r10, %r10
 	jns 3f
 
@@ -358,32 +350,31 @@
 	movq LRV_RAX_OFFSET(%rsp), %rax
 	movq LRV_RDX_OFFSET(%rsp), %rdx
 
+	movaps LRV_XMM0_OFFSET(%rsp), %xmm0
+	movaps LRV_XMM1_OFFSET(%rsp), %xmm1
+
 # ifdef HAVE_AVX_SUPPORT
 	cmpl	$0, L(have_avx)(%rip)
 	js	L(no_avx4)
 
 	/* Check if xmm0/xmm1 registers are changed by audit module.  */
-	vmovdqa LRV_XMM0_OFFSET(%rsp), %xmm0
-	vpcmpeqq (LRV_SIZE)(%rsp), %xmm0, %xmm1
-	vpmovmskb %xmm1, %esi
+	vpcmpeqq (LRV_SIZE)(%rsp), %xmm0, %xmm2
+	vpmovmskb %xmm2, %esi
 	cmpl $0xffff, %esi
 	je 1f
 	vmovdqu LRV_VECTOR0_OFFSET(%rsp), %ymm0
 
-1:	vmovdqa LRV_XMM1_OFFSET(%rsp), %xmm1
-	vpcmpeqq (LRV_SIZE + XMM_SIZE)(%rsp), %xmm1, %xmm2
+1:	vpcmpeqq (LRV_SIZE + XMM_SIZE)(%rsp), %xmm1, %xmm2
 	vpmovmskb %xmm2, %esi
 	cmpl $0xffff, %esi
 	je 1f
 	vmovdqu LRV_VECTOR1_OFFSET(%rsp), %ymm1
-	jmp 1f
 
 L(no_avx4):
-# endif
-	movaps LRV_XMM0_OFFSET(%rsp), %xmm0
-	movaps LRV_XMM1_OFFSET(%rsp), %xmm1
-
-1:	fldt LRV_ST1_OFFSET(%rsp)
+1:
+# endif
+
+	fldt LRV_ST1_OFFSET(%rsp)
 	fldt LRV_ST0_OFFSET(%rsp)
 
 	movq %rbx, %rsp

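The restructuring above is what the ChangeLog calls optimizing the ymm restore "a bit": the xmm registers are now restored unconditionally with movaps first, and the AVX path only compares each live xmm value against the copy saved before the audit hook ran, reloading the full ymm register solely when the audit module actually changed it. Using xmm8 (and xmm2 in the return-value path) as the single scratch register also drops the per-register vmovdqa reloads the old code needed before comparing. The per-register check corresponds to roughly this C, assuming SSE4.1 intrinsics (a sketch, not the trampoline itself):

#include <smmintrin.h>   /* _mm_cmpeq_epi64 (SSE4.1, like vpcmpeqq) */
#include <stdbool.h>

/* Return true if the live 128-bit value still equals the copy saved
   before the audit module ran; only on a mismatch does the caller
   need to reload the full 256-bit ymm register.  */
static bool
xmm_unchanged (__m128i live, const __m128i *saved)
{
  __m128i eq = _mm_cmpeq_epi64 (live, *saved);  /* per-64-bit-lane test */
  return _mm_movemask_epi8 (eq) == 0xffff;      /* all 16 bytes equal */
}
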
Added: fsf/trunk/libc/sysdeps/x86_64/memcmp.S
==============================================================================
--- fsf/trunk/libc/sysdeps/x86_64/memcmp.S (added)
+++ fsf/trunk/libc/sysdeps/x86_64/memcmp.S Fri Jul 17 00:07:41 2009
@@ -1,0 +1,359 @@
+/* memcmp with SSE2
+   Copyright (C) 2009 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sysdep.h>
+
+	.text
+ENTRY (memcmp)
+	test	%rdx, %rdx
+	jz	L(finz)
+	cmpq	$1, %rdx
+	jle	L(finr1b)
+	subq	%rdi, %rsi
+	movq	%rdx, %r10
+	cmpq	$32, %r10
+	jge	L(gt32)
+	/* Handle small chunks and last block of less than 32 bytes.  */
+L(small):
+	testq	$1, %r10
+	jz	L(s2b)
+	movzbl	(%rdi),	%eax
+	movzbl	(%rdi, %rsi), %edx
+	subq    $1, %r10
+	je	L(finz1)
+	addq	$1, %rdi
+	subl	%edx, %eax
+	jnz	L(exit)
+L(s2b):
+	testq	$2, %r10
+	jz	L(s4b)
+	movzwl	(%rdi),	%eax
+	movzwl	(%rdi, %rsi), %edx
+	subq    $2, %r10
+	je	L(fin2_7)
+	addq	$2, %rdi
+	cmpl	%edx, %eax
+	jnz	L(fin2_7)
+L(s4b):
+	testq	$4, %r10
+	jz	L(s8b)
+	movl	(%rdi),	%eax
+	movl	(%rdi, %rsi), %edx
+	subq    $4, %r10
+	je	L(fin2_7)
+	addq	$4, %rdi
+	cmpl	%edx, %eax
+	jnz	L(fin2_7)
+L(s8b):
+	testq	$8, %r10
+	jz	L(s16b)
+	movq	(%rdi),	%rax
+	movq	(%rdi, %rsi), %rdx
+	subq    $8, %r10
+	je	L(fin2_7)
+	addq	$8, %rdi
+	cmpq	%rdx, %rax
+	jnz	L(fin2_7)
+L(s16b):
+	movdqu    (%rdi), %xmm1
+	movdqu    (%rdi, %rsi), %xmm0
+	pcmpeqb   %xmm0, %xmm1
+	pmovmskb  %xmm1, %edx
+	xorl	  %eax, %eax
+	subl      $0xffff, %edx
+	jz	  L(finz)
+	bsfl      %edx, %ecx
+	leaq	 (%rdi, %rcx), %rcx
+	movzbl	 (%rcx), %eax
+	movzbl	 (%rsi, %rcx), %edx
+	jmp	 L(finz1)
+
+	.p2align 4,, 4
+L(finr1b):
+	movzbl	(%rdi), %eax
+	movzbl  (%rsi), %edx
+L(finz1):
+	subl	%edx, %eax
+L(exit):
+	ret
+
+	.p2align 4,, 4
+L(fin2_7):
+	cmpq	%rdx, %rax
+	jz	L(finz)
+	movq	%rax, %r11
+	subq	%rdx, %r11
+	bsfq	%r11, %rcx
+	sarq	$3, %rcx
+	salq	$3, %rcx
+	sarq	%cl, %rax
+	movzbl  %al, %eax
+	sarq	%cl, %rdx
+	movzbl  %dl, %edx
+	subl	%edx, %eax
+	ret
+
+	.p2align 4,, 4
+L(finz):
+	xorl	%eax, %eax
+	ret
+
+	/* For blocks bigger than 32 bytes:
+	   1. Advance one of the addr pointers to be 16B aligned.
+	   2. Treat the case of both addr pointers aligned to 16B
+	      separately to avoid movdqu.
+	   3. Handle any blocks of greater than 64 consecutive bytes with
+	      unrolling to reduce branches.
+	   4. At least one addr pointer is 16B aligned, use memory version
+	      of pcmpeqb.  */
+	.p2align 4,, 4
+L(gt32):
+	movq	%rdx, %r11
+	addq	%rdi, %r11
+	movq	%rdi, %r8
+
+	andq	$15, %r8
+	jz	L(16am)
+	/* Both pointers may be misaligned.  */
+	movdqu	(%rdi),	%xmm1
+	movdqu	(%rdi, %rsi), %xmm0
+	pcmpeqb   %xmm0, %xmm1
+	pmovmskb  %xmm1, %edx
+	subl      $0xffff, %edx
+	jnz       L(neq)
+	neg	 %r8
+	leaq    16(%rdi, %r8), %rdi
+L(16am):
+	/* Handle two 16B aligned pointers separately.  */
+	testq   $15, %rsi
+	jz      L(ATR)
+	testq	$16, %rdi
+	jz	L(A32)
+	movdqu	(%rdi, %rsi), %xmm0
+	pcmpeqb   (%rdi), %xmm0
+	pmovmskb  %xmm0, %edx
+	subl      $0xffff, %edx
+	jnz       L(neq)
+	addq	$16, %rdi
+L(A32):
+	movq	%r11, %r10
+	andq	$-32, %r10
+	cmpq	%r10, %rdi
+        jge	L(mt16)
+	/* Pre-unroll to be ready for unrolled 64B loop.  */
+	testq	$32, %rdi
+	jz	L(A64)
+	movdqu    (%rdi,%rsi), %xmm0
+	pcmpeqb   (%rdi), %xmm0
+	pmovmskb  %xmm0, %edx
+	subl      $0xffff, %edx
+	jnz       L(neq)
+	addq       $16, %rdi
+
+	movdqu    (%rdi,%rsi), %xmm0
+	pcmpeqb  (%rdi), %xmm0
+	pmovmskb  %xmm0, %edx
+	subl      $0xffff, %edx
+	jnz       L(neq)
+	addq       $16, %rdi
+
+L(A64):
+	movq	%r11, %r10
+	andq	$-64, %r10
+	cmpq	%r10, %rdi
+        jge	L(mt32)
+
+L(A64main):
+	movdqu    (%rdi,%rsi), %xmm0
+	pcmpeqb   (%rdi), %xmm0
+	pmovmskb  %xmm0, %edx
+	subl      $0xffff, %edx
+	jnz       L(neq)
+	addq       $16, %rdi
+
+	movdqu    (%rdi,%rsi), %xmm0
+	pcmpeqb   (%rdi), %xmm0
+	pmovmskb  %xmm0, %edx
+	subl      $0xffff, %edx
+	jnz       L(neq)
+	addq       $16, %rdi
+
+	movdqu    (%rdi,%rsi), %xmm0
+	pcmpeqb   (%rdi), %xmm0
+	pmovmskb  %xmm0, %edx
+	subl      $0xffff, %edx
+	jnz       L(neq)
+	addq       $16, %rdi
+
+	movdqu    (%rdi,%rsi), %xmm0
+	pcmpeqb  (%rdi), %xmm0
+	pmovmskb  %xmm0, %edx
+	subl      $0xffff, %edx
+	jnz       L(neq)
+	addq       $16, %rdi
+
+	cmpq       %rdi, %r10
+	jne       L(A64main)
+
+L(mt32):
+	movq	%r11, %r10
+	andq	$-32, %r10
+	cmpq	%r10, %rdi
+        jge	L(mt16)
+
+L(A32main):
+	movdqu    (%rdi,%rsi), %xmm0
+	pcmpeqb   (%rdi), %xmm0
+	pmovmskb  %xmm0, %edx
+	subl      $0xffff, %edx
+	jnz       L(neq)
+	addq       $16, %rdi
+
+	movdqu    (%rdi,%rsi), %xmm0
+	pcmpeqb  (%rdi), %xmm0
+	pmovmskb  %xmm0, %edx
+	subl      $0xffff, %edx
+	jnz       L(neq)
+	addq       $16, %rdi
+
+	cmpq       %rdi, %r10
+	jne       L(A32main)
+L(mt16):
+	subq       %rdi, %r11
+	je	  L(finz)
+	movq	  %r11, %r10
+	jmp	  L(small)
+
+	.p2align 4,, 4
+L(neq):
+	bsfl      %edx, %ecx
+	movzbl	 (%rdi, %rcx), %eax
+	addq	 %rdi, %rsi
+	movzbl	 (%rsi,%rcx), %edx
+	jmp	 L(finz1)
+
+	.p2align 4,, 4
+L(ATR):
+	movq	%r11, %r10
+	andq	$-32, %r10
+	cmpq	%r10, %rdi
+        jge	L(mt16)
+	testq	$16, %rdi
+	jz	L(ATR32)
+
+	movdqa    (%rdi,%rsi), %xmm0
+	pcmpeqb   (%rdi), %xmm0
+	pmovmskb  %xmm0, %edx
+	subl      $0xffff, %edx
+	jnz       L(neq)
+	addq       $16, %rdi
+	cmpq       %rdi, %r10
+	je       L(mt16)
+
+L(ATR32):
+	movq	%r11, %r10
+	andq	$-64, %r10
+	testq	$32, %rdi
+	jz	L(ATR64)
+
+	movdqa    (%rdi,%rsi), %xmm0
+	pcmpeqb   (%rdi), %xmm0
+	pmovmskb  %xmm0, %edx
+	subl      $0xffff, %edx
+	jnz       L(neq)
+	addq       $16, %rdi
+
+	movdqa    (%rdi,%rsi), %xmm0
+	pcmpeqb   (%rdi), %xmm0
+	pmovmskb  %xmm0, %edx
+	subl      $0xffff, %edx
+	jnz       L(neq)
+	addq       $16, %rdi
+
+L(ATR64):
+	cmpq       %rdi, %r10
+	je	   L(mt32)
+
+L(ATR64main):
+	movdqa    (%rdi,%rsi), %xmm0
+	pcmpeqb   (%rdi), %xmm0
+	pmovmskb  %xmm0, %edx
+	subl      $0xffff, %edx
+	jnz       L(neq)
+	addq       $16, %rdi
+
+	movdqa    (%rdi,%rsi), %xmm0
+	pcmpeqb   (%rdi), %xmm0
+	pmovmskb  %xmm0, %edx
+	subl      $0xffff, %edx
+	jnz       L(neq)
+	addq       $16, %rdi
+
+	movdqa    (%rdi,%rsi), %xmm0
+	pcmpeqb   (%rdi), %xmm0
+	pmovmskb  %xmm0, %edx
+	subl      $0xffff, %edx
+	jnz       L(neq)
+	addq       $16, %rdi
+
+	movdqa    (%rdi,%rsi), %xmm0
+	pcmpeqb   (%rdi), %xmm0
+	pmovmskb  %xmm0, %edx
+	subl      $0xffff, %edx
+	jnz       L(neq)
+	addq       $16, %rdi
+	cmpq       %rdi, %r10
+	jne       L(ATR64main)
+
+	movq	%r11, %r10
+	andq	$-32, %r10
+	cmpq	%r10, %rdi
+        jge	L(mt16)
+
+L(ATR32res):
+	movdqa    (%rdi,%rsi), %xmm0
+	pcmpeqb   (%rdi), %xmm0
+	pmovmskb  %xmm0, %edx
+	subl      $0xffff, %edx
+	jnz       L(neq)
+	addq       $16, %rdi
+
+	movdqa    (%rdi,%rsi), %xmm0
+	pcmpeqb   (%rdi), %xmm0
+	pmovmskb  %xmm0, %edx
+	subl      $0xffff, %edx
+	jnz       L(neq)
+	addq       $16, %rdi
+
+	cmpq	  %r10, %rdi
+	jne       L(ATR32res)
+
+	subq       %rdi, %r11
+	je	  L(finz)
+	movq	  %r11, %r10
+	jmp	  L(small)
+	/* Align to 16byte to improve instruction fetch.  */
+	.p2align 4,, 4
+END(memcmp)
+
+#undef bcmp
+weak_alias (memcmp, bcmp)
+libc_hidden_builtin_def (memcmp)
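
The core of the new memcmp is the same 16-bytes-at-a-time SSE2 idiom repeated throughout: load both sides, pcmpeqb to get a per-byte equality mask, pmovmskb to move it into a general register, and bsf on the inverted mask to find the first differing byte. (Note also the subq %rdi, %rsi trick near the top, which keeps one pointer plus an offset so only %rdi ever needs advancing.) In C with SSE2 intrinsics the inner step looks roughly like this (illustrative only, not the glibc build):

#include <emmintrin.h>

/* Compare one 16-byte block; returns <0, 0 or >0 like memcmp.  */
static int
cmp16 (const unsigned char *a, const unsigned char *b)
{
  __m128i va = _mm_loadu_si128 ((const __m128i *) a);
  __m128i vb = _mm_loadu_si128 ((const __m128i *) b);
  /* Bit i of MASK is set iff bytes a[i] and b[i] are equal.  */
  unsigned int mask = _mm_movemask_epi8 (_mm_cmpeq_epi8 (va, vb));
  if (mask == 0xffff)
    return 0;                                      /* all 16 bytes match */
  unsigned int i = __builtin_ctz (~mask & 0xffff); /* first mismatch */
  return a[i] - b[i];
}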