[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Commits] r21548 - in /fsf/trunk/libc: ./ math/ ports/ ports/sysdeps/tile/ ports/sysdeps/tile/nptl/ ports/sysdeps/tile/tilegx/ sysdeps...



Author: eglibc
Date: Tue Nov  6 16:19:40 2012
New Revision: 21548

Log:
Import glibc-mainline for 2012-11-06

Added:
    fsf/trunk/libc/ports/sysdeps/tile/memcopy.h
    fsf/trunk/libc/ports/sysdeps/tile/nptl/pthread_spin_unlock.c
    fsf/trunk/libc/ports/sysdeps/tile/tilegx/Makefile
    fsf/trunk/libc/ports/sysdeps/tile/wordcopy.c
Modified:
    fsf/trunk/libc/ChangeLog
    fsf/trunk/libc/NEWS
    fsf/trunk/libc/math/libm-test.inc
    fsf/trunk/libc/ports/ChangeLog.tile
    fsf/trunk/libc/ports/sysdeps/tile/crti.S
    fsf/trunk/libc/ports/sysdeps/tile/dl-runtime.c
    fsf/trunk/libc/ports/sysdeps/tile/math_private.h
    fsf/trunk/libc/ports/sysdeps/tile/start.S
    fsf/trunk/libc/ports/sysdeps/tile/tilegx/memcpy.c
    fsf/trunk/libc/sysdeps/ieee754/dbl-64/s_fma.c
    fsf/trunk/libc/sysdeps/ieee754/ldbl-128/s_fmal.c
    fsf/trunk/libc/sysdeps/ieee754/ldbl-96/s_fmal.c

Modified: fsf/trunk/libc/ChangeLog
==============================================================================
--- fsf/trunk/libc/ChangeLog (original)
+++ fsf/trunk/libc/ChangeLog Tue Nov  6 16:19:40 2012
@@ -1,3 +1,16 @@
+2012-11-06  Joseph Myers  <joseph@xxxxxxxxxxxxxxxx>
+
+	[BZ #14793]
+	* sysdeps/ieee754/dbl-64/s_fma.c (__fma): In case of large z
+	exponent and small x and y exponents, scale x or y up.  Increase
+	by 2 the exponent used in scaling up.
+	* sysdeps/ieee754/ldbl-128/s_fmal.c (__fmal): Likewise.
+	* sysdeps/ieee754/ldbl-96/s_fmal.c (__fmal): Likewise.
+	* math/libm-test.inc (fma_test): Add more tests.
+	(fma_test_towardzero): Likewise.
+	(fma_test_downward): Likewise.
+	(fma_test_upward): Likewise.
+
 2012-11-05  Joseph Myers  <joseph@xxxxxxxxxxxxxxxx>
 
 	[BZ #14805]

Modified: fsf/trunk/libc/NEWS
==============================================================================
--- fsf/trunk/libc/NEWS (original)
+++ fsf/trunk/libc/NEWS Tue Nov  6 16:19:40 2012
@@ -18,7 +18,7 @@
   14518, 14519, 14530, 14532, 14538, 14543, 14544, 14545, 14557, 14562,
   14568, 14576, 14579, 14583, 14587, 14595, 14602, 14610, 14621, 14638,
   14645, 14648, 14652, 14660, 14661, 14669, 14683, 14694, 14716, 14743,
-  14767, 14783, 14784, 14785, 14796, 14797, 14801, 14805.
+  14767, 14783, 14784, 14785, 14793, 14796, 14797, 14801, 14805.
 
 * Support for STT_GNU_IFUNC symbols added for s390 and s390x.
   Optimized versions of memcpy, memset, and memcmp added for System z10 and

Modified: fsf/trunk/libc/math/libm-test.inc
==============================================================================
--- fsf/trunk/libc/math/libm-test.inc (original)
+++ fsf/trunk/libc/math/libm-test.inc Tue Nov  6 16:19:40 2012
@@ -4662,6 +4662,14 @@
   TEST_fff_f (fma, 0x0.fffp0, -0x0.fffp0, 0x0.ffep0, -0x1p-24);
   TEST_fff_f (fma, -0x0.fffp0, 0x0.fffp0, 0x0.ffep0, -0x1p-24);
   TEST_fff_f (fma, -0x0.fffp0, -0x0.fffp0, -0x0.ffep0, 0x1p-24);
+  TEST_fff_f (fma, 0x1.000002p-126, 0x1.000002p-26, 0x1p127, 0x1p127);
+  TEST_fff_f (fma, 0x1.000002p-126, -0x1.000002p-26, 0x1p127, 0x1p127);
+  TEST_fff_f (fma, 0x1.000002p-126, 0x1.000002p-26, -0x1p127, -0x1p127);
+  TEST_fff_f (fma, 0x1.000002p-126, -0x1.000002p-26, -0x1p127, -0x1p127);
+  TEST_fff_f (fma, 0x1.000002p-126, 0x1.000002p-26, 0x1p103, 0x1p103);
+  TEST_fff_f (fma, 0x1.000002p-126, -0x1.000002p-26, 0x1p103, 0x1p103);
+  TEST_fff_f (fma, 0x1.000002p-126, 0x1.000002p-26, -0x1p103, -0x1p103);
+  TEST_fff_f (fma, 0x1.000002p-126, -0x1.000002p-26, -0x1p103, -0x1p103);
 #endif
 #if defined (TEST_DOUBLE) && DBL_MANT_DIG == 53
   TEST_fff_f (fma, 0x1.7fp+13, 0x1.0000000000001p+0, 0x1.ffep-48, 0x1.7f00000000001p+13);
@@ -4712,6 +4720,14 @@
   TEST_fff_f (fma, 0x0.fffffffffffff8p0, -0x0.fffffffffffff8p0, 0x0.fffffffffffffp0, -0x1p-106);
   TEST_fff_f (fma, -0x0.fffffffffffff8p0, 0x0.fffffffffffff8p0, 0x0.fffffffffffffp0, -0x1p-106);
   TEST_fff_f (fma, -0x0.fffffffffffff8p0, -0x0.fffffffffffff8p0, -0x0.fffffffffffffp0, 0x1p-106);
+  TEST_fff_f (fma, 0x1.0000000000001p-1022, 0x1.0000000000001p-55, 0x1p1023, 0x1p1023);
+  TEST_fff_f (fma, 0x1.0000000000001p-1022, -0x1.0000000000001p-55, 0x1p1023, 0x1p1023);
+  TEST_fff_f (fma, 0x1.0000000000001p-1022, 0x1.0000000000001p-55, -0x1p1023, -0x1p1023);
+  TEST_fff_f (fma, 0x1.0000000000001p-1022, -0x1.0000000000001p-55, -0x1p1023, -0x1p1023);
+  TEST_fff_f (fma, 0x1.0000000000001p-1022, 0x1.0000000000001p-55, 0x1p970, 0x1p970);
+  TEST_fff_f (fma, 0x1.0000000000001p-1022, -0x1.0000000000001p-55, 0x1p970, 0x1p970);
+  TEST_fff_f (fma, 0x1.0000000000001p-1022, 0x1.0000000000001p-55, -0x1p970, -0x1p970);
+  TEST_fff_f (fma, 0x1.0000000000001p-1022, -0x1.0000000000001p-55, -0x1p970, -0x1p970);
 #endif
 #if defined (TEST_LDOUBLE) && LDBL_MANT_DIG == 64
   TEST_fff_f (fma, -0x8.03fcp+3696L, 0xf.fffffffffffffffp-6140L, 0x8.3ffffffffffffffp-2450L, -0x8.01ecp-2440L);
@@ -4748,6 +4764,14 @@
   TEST_fff_f (fma, 0x0.ffffffffffffffffp0L, -0x0.ffffffffffffffffp0L, 0x0.fffffffffffffffep0L, -0x1p-128L);
   TEST_fff_f (fma, -0x0.ffffffffffffffffp0L, 0x0.ffffffffffffffffp0L, 0x0.fffffffffffffffep0L, -0x1p-128L);
   TEST_fff_f (fma, -0x0.ffffffffffffffffp0L, -0x0.ffffffffffffffffp0L, -0x0.fffffffffffffffep0L, 0x1p-128L);
+  TEST_fff_f (fma, 0x1.0000000000000002p-16382L, 0x1.0000000000000002p-66L, 0x1p16383L, 0x1p16383L);
+  TEST_fff_f (fma, 0x1.0000000000000002p-16382L, -0x1.0000000000000002p-66L, 0x1p16383L, 0x1p16383L);
+  TEST_fff_f (fma, 0x1.0000000000000002p-16382L, 0x1.0000000000000002p-66L, -0x1p16383L, -0x1p16383L);
+  TEST_fff_f (fma, 0x1.0000000000000002p-16382L, -0x1.0000000000000002p-66L, -0x1p16383L, -0x1p16383L);
+  TEST_fff_f (fma, 0x1.0000000000000002p-16382L, 0x1.0000000000000002p-66L, 0x1p16319L, 0x1p16319L);
+  TEST_fff_f (fma, 0x1.0000000000000002p-16382L, -0x1.0000000000000002p-66L, 0x1p16319L, 0x1p16319L);
+  TEST_fff_f (fma, 0x1.0000000000000002p-16382L, 0x1.0000000000000002p-66L, -0x1p16319L, -0x1p16319L);
+  TEST_fff_f (fma, 0x1.0000000000000002p-16382L, -0x1.0000000000000002p-66L, -0x1p16319L, -0x1p16319L);
 #endif
 #if defined (TEST_LDOUBLE) && LDBL_MANT_DIG == 113
   TEST_fff_f (fma, 0x1.bb2de33e02ccbbfa6e245a7c1f71p-2584L, -0x1.6b500daf0580d987f1bc0cadfcddp-13777L, 0x1.613cd91d9fed34b33820e5ab9d8dp-16378L, -0x1.3a79fb50eb9ce887cffa0f09bd9fp-16360L);
@@ -4791,6 +4815,14 @@
   TEST_fff_f (fma, 0x0.ffffffffffffffffffffffffffff8p0L, -0x0.ffffffffffffffffffffffffffff8p0L, 0x0.ffffffffffffffffffffffffffffp0L, -0x1p-226L);
   TEST_fff_f (fma, -0x0.ffffffffffffffffffffffffffff8p0L, 0x0.ffffffffffffffffffffffffffff8p0L, 0x0.ffffffffffffffffffffffffffffp0L, -0x1p-226L);
   TEST_fff_f (fma, -0x0.ffffffffffffffffffffffffffff8p0L, -0x0.ffffffffffffffffffffffffffff8p0L, -0x0.ffffffffffffffffffffffffffffp0L, 0x1p-226L);
+  TEST_fff_f (fma, 0x1.0000000000000000000000000001p-16382L, 0x1.0000000000000000000000000001p-66L, 0x1p16383L, 0x1p16383L);
+  TEST_fff_f (fma, 0x1.0000000000000000000000000001p-16382L, -0x1.0000000000000000000000000001p-66L, 0x1p16383L, 0x1p16383L);
+  TEST_fff_f (fma, 0x1.0000000000000000000000000001p-16382L, 0x1.0000000000000000000000000001p-66L, -0x1p16383L, -0x1p16383L);
+  TEST_fff_f (fma, 0x1.0000000000000000000000000001p-16382L, -0x1.0000000000000000000000000001p-66L, -0x1p16383L, -0x1p16383L);
+  TEST_fff_f (fma, 0x1.0000000000000000000000000001p-16382L, 0x1.0000000000000000000000000001p-66L, 0x1p16319L, 0x1p16319L);
+  TEST_fff_f (fma, 0x1.0000000000000000000000000001p-16382L, -0x1.0000000000000000000000000001p-66L, 0x1p16319L, 0x1p16319L);
+  TEST_fff_f (fma, 0x1.0000000000000000000000000001p-16382L, 0x1.0000000000000000000000000001p-66L, -0x1p16319L, -0x1p16319L);
+  TEST_fff_f (fma, 0x1.0000000000000000000000000001p-16382L, -0x1.0000000000000000000000000001p-66L, -0x1p16319L, -0x1p16319L);
 #endif
 
   END (fma);
@@ -4884,6 +4916,14 @@
       TEST_fff_f (fma, 0x0.fffp0, -0x0.fffp0, 0x0.ffep0, -0x1p-24);
       TEST_fff_f (fma, -0x0.fffp0, 0x0.fffp0, 0x0.ffep0, -0x1p-24);
       TEST_fff_f (fma, -0x0.fffp0, -0x0.fffp0, -0x0.ffep0, 0x1p-24);
+      TEST_fff_f (fma, 0x1.000002p-126, 0x1.000002p-26, 0x1p127, 0x1p127);
+      TEST_fff_f (fma, 0x1.000002p-126, -0x1.000002p-26, 0x1p127, 0x0.ffffffp127);
+      TEST_fff_f (fma, 0x1.000002p-126, 0x1.000002p-26, -0x1p127, -0x0.ffffffp127);
+      TEST_fff_f (fma, 0x1.000002p-126, -0x1.000002p-26, -0x1p127, -0x1p127);
+      TEST_fff_f (fma, 0x1.000002p-126, 0x1.000002p-26, 0x1p103, 0x1p103);
+      TEST_fff_f (fma, 0x1.000002p-126, -0x1.000002p-26, 0x1p103, 0x0.ffffffp103);
+      TEST_fff_f (fma, 0x1.000002p-126, 0x1.000002p-26, -0x1p103, -0x0.ffffffp103);
+      TEST_fff_f (fma, 0x1.000002p-126, -0x1.000002p-26, -0x1p103, -0x1p103);
 #endif
 #if defined (TEST_DOUBLE) && DBL_MANT_DIG == 53
       TEST_fff_f (fma, 0x1.4p-1022, 0x1.0000000000002p-1, 0x1p-1024, 0x1.c000000000002p-1023, UNDERFLOW_EXCEPTION);
@@ -4914,6 +4954,14 @@
       TEST_fff_f (fma, 0x0.fffffffffffff8p0, -0x0.fffffffffffff8p0, 0x0.fffffffffffffp0, -0x1p-106);
       TEST_fff_f (fma, -0x0.fffffffffffff8p0, 0x0.fffffffffffff8p0, 0x0.fffffffffffffp0, -0x1p-106);
       TEST_fff_f (fma, -0x0.fffffffffffff8p0, -0x0.fffffffffffff8p0, -0x0.fffffffffffffp0, 0x1p-106);
+      TEST_fff_f (fma, 0x1.0000000000001p-1022, 0x1.0000000000001p-55, 0x1p1023, 0x1p1023);
+      TEST_fff_f (fma, 0x1.0000000000001p-1022, -0x1.0000000000001p-55, 0x1p1023, 0x0.fffffffffffff8p1023);
+      TEST_fff_f (fma, 0x1.0000000000001p-1022, 0x1.0000000000001p-55, -0x1p1023, -0x0.fffffffffffff8p1023);
+      TEST_fff_f (fma, 0x1.0000000000001p-1022, -0x1.0000000000001p-55, -0x1p1023, -0x1p1023);
+      TEST_fff_f (fma, 0x1.0000000000001p-1022, 0x1.0000000000001p-55, 0x1p970, 0x1p970);
+      TEST_fff_f (fma, 0x1.0000000000001p-1022, -0x1.0000000000001p-55, 0x1p970, 0x0.fffffffffffff8p970);
+      TEST_fff_f (fma, 0x1.0000000000001p-1022, 0x1.0000000000001p-55, -0x1p970, -0x0.fffffffffffff8p970);
+      TEST_fff_f (fma, 0x1.0000000000001p-1022, -0x1.0000000000001p-55, -0x1p970, -0x1p970);
 #endif
 #if defined (TEST_LDOUBLE) && LDBL_MANT_DIG == 64
       TEST_fff_f (fma, 0x1.4p-16382L, 0x1.0000000000000004p-1L, 0x1p-16384L, 0x1.c000000000000004p-16383L, UNDERFLOW_EXCEPTION);
@@ -4944,6 +4992,14 @@
       TEST_fff_f (fma, 0x0.ffffffffffffffffp0L, -0x0.ffffffffffffffffp0L, 0x0.fffffffffffffffep0L, -0x1p-128L);
       TEST_fff_f (fma, -0x0.ffffffffffffffffp0L, 0x0.ffffffffffffffffp0L, 0x0.fffffffffffffffep0L, -0x1p-128L);
       TEST_fff_f (fma, -0x0.ffffffffffffffffp0L, -0x0.ffffffffffffffffp0L, -0x0.fffffffffffffffep0L, 0x1p-128L);
+      TEST_fff_f (fma, 0x1.0000000000000002p-16382L, 0x1.0000000000000002p-66L, 0x1p16383L, 0x1p16383L);
+      TEST_fff_f (fma, 0x1.0000000000000002p-16382L, -0x1.0000000000000002p-66L, 0x1p16383L, 0x0.ffffffffffffffffp16383L);
+      TEST_fff_f (fma, 0x1.0000000000000002p-16382L, 0x1.0000000000000002p-66L, -0x1p16383L, -0x0.ffffffffffffffffp16383L);
+      TEST_fff_f (fma, 0x1.0000000000000002p-16382L, -0x1.0000000000000002p-66L, -0x1p16383L, -0x1p16383L);
+      TEST_fff_f (fma, 0x1.0000000000000002p-16382L, 0x1.0000000000000002p-66L, 0x1p16319L, 0x1p16319L);
+      TEST_fff_f (fma, 0x1.0000000000000002p-16382L, -0x1.0000000000000002p-66L, 0x1p16319L, 0x0.ffffffffffffffffp16319L);
+      TEST_fff_f (fma, 0x1.0000000000000002p-16382L, 0x1.0000000000000002p-66L, -0x1p16319L, -0x0.ffffffffffffffffp16319L);
+      TEST_fff_f (fma, 0x1.0000000000000002p-16382L, -0x1.0000000000000002p-66L, -0x1p16319L, -0x1p16319L);
 #endif
 #if defined (TEST_LDOUBLE) && LDBL_MANT_DIG == 113
       TEST_fff_f (fma, 0x1.4p-16382L, 0x1.0000000000000000000000000002p-1L, 0x1p-16384L, 0x1.c000000000000000000000000002p-16383L, UNDERFLOW_EXCEPTION);
@@ -4974,6 +5030,14 @@
       TEST_fff_f (fma, 0x0.ffffffffffffffffffffffffffff8p0L, -0x0.ffffffffffffffffffffffffffff8p0L, 0x0.ffffffffffffffffffffffffffffp0L, -0x1p-226L);
       TEST_fff_f (fma, -0x0.ffffffffffffffffffffffffffff8p0L, 0x0.ffffffffffffffffffffffffffff8p0L, 0x0.ffffffffffffffffffffffffffffp0L, -0x1p-226L);
       TEST_fff_f (fma, -0x0.ffffffffffffffffffffffffffff8p0L, -0x0.ffffffffffffffffffffffffffff8p0L, -0x0.ffffffffffffffffffffffffffffp0L, 0x1p-226L);
+      TEST_fff_f (fma, 0x1.0000000000000000000000000001p-16382L, 0x1.0000000000000000000000000001p-66L, 0x1p16383L, 0x1p16383L);
+      TEST_fff_f (fma, 0x1.0000000000000000000000000001p-16382L, -0x1.0000000000000000000000000001p-66L, 0x1p16383L, 0x0.ffffffffffffffffffffffffffff8p16383L);
+      TEST_fff_f (fma, 0x1.0000000000000000000000000001p-16382L, 0x1.0000000000000000000000000001p-66L, -0x1p16383L, -0x0.ffffffffffffffffffffffffffff8p16383L);
+      TEST_fff_f (fma, 0x1.0000000000000000000000000001p-16382L, -0x1.0000000000000000000000000001p-66L, -0x1p16383L, -0x1p16383L);
+      TEST_fff_f (fma, 0x1.0000000000000000000000000001p-16382L, 0x1.0000000000000000000000000001p-66L, 0x1p16319L, 0x1p16319L);
+      TEST_fff_f (fma, 0x1.0000000000000000000000000001p-16382L, -0x1.0000000000000000000000000001p-66L, 0x1p16319L, 0x0.ffffffffffffffffffffffffffff8p16319L);
+      TEST_fff_f (fma, 0x1.0000000000000000000000000001p-16382L, 0x1.0000000000000000000000000001p-66L, -0x1p16319L, -0x0.ffffffffffffffffffffffffffff8p16319L);
+      TEST_fff_f (fma, 0x1.0000000000000000000000000001p-16382L, -0x1.0000000000000000000000000001p-66L, -0x1p16319L, -0x1p16319L);
 #endif
     }
 
@@ -5070,6 +5134,14 @@
       TEST_fff_f (fma, 0x0.fffp0, -0x0.fffp0, 0x0.ffep0, -0x1p-24);
       TEST_fff_f (fma, -0x0.fffp0, 0x0.fffp0, 0x0.ffep0, -0x1p-24);
       TEST_fff_f (fma, -0x0.fffp0, -0x0.fffp0, -0x0.ffep0, 0x1p-24);
+      TEST_fff_f (fma, 0x1.000002p-126, 0x1.000002p-26, 0x1p127, 0x1p127);
+      TEST_fff_f (fma, 0x1.000002p-126, -0x1.000002p-26, 0x1p127, 0x0.ffffffp127);
+      TEST_fff_f (fma, 0x1.000002p-126, 0x1.000002p-26, -0x1p127, -0x1p127);
+      TEST_fff_f (fma, 0x1.000002p-126, -0x1.000002p-26, -0x1p127, -0x1.000002p127);
+      TEST_fff_f (fma, 0x1.000002p-126, 0x1.000002p-26, 0x1p103, 0x1p103);
+      TEST_fff_f (fma, 0x1.000002p-126, -0x1.000002p-26, 0x1p103, 0x0.ffffffp103);
+      TEST_fff_f (fma, 0x1.000002p-126, 0x1.000002p-26, -0x1p103, -0x1p103);
+      TEST_fff_f (fma, 0x1.000002p-126, -0x1.000002p-26, -0x1p103, -0x1.000002p103);
 #endif
 #if defined (TEST_DOUBLE) && DBL_MANT_DIG == 53
       TEST_fff_f (fma, 0x1.4p-1022, 0x1.0000000000002p-1, 0x1p-1024, 0x1.c000000000002p-1023, UNDERFLOW_EXCEPTION);
@@ -5100,6 +5172,14 @@
       TEST_fff_f (fma, 0x0.fffffffffffff8p0, -0x0.fffffffffffff8p0, 0x0.fffffffffffffp0, -0x1p-106);
       TEST_fff_f (fma, -0x0.fffffffffffff8p0, 0x0.fffffffffffff8p0, 0x0.fffffffffffffp0, -0x1p-106);
       TEST_fff_f (fma, -0x0.fffffffffffff8p0, -0x0.fffffffffffff8p0, -0x0.fffffffffffffp0, 0x1p-106);
+      TEST_fff_f (fma, 0x1.0000000000001p-1022, 0x1.0000000000001p-55, 0x1p1023, 0x1p1023);
+      TEST_fff_f (fma, 0x1.0000000000001p-1022, -0x1.0000000000001p-55, 0x1p1023, 0x0.fffffffffffff8p1023);
+      TEST_fff_f (fma, 0x1.0000000000001p-1022, 0x1.0000000000001p-55, -0x1p1023, -0x1p1023);
+      TEST_fff_f (fma, 0x1.0000000000001p-1022, -0x1.0000000000001p-55, -0x1p1023, -0x1.0000000000001p1023);
+      TEST_fff_f (fma, 0x1.0000000000001p-1022, 0x1.0000000000001p-55, 0x1p970, 0x1p970);
+      TEST_fff_f (fma, 0x1.0000000000001p-1022, -0x1.0000000000001p-55, 0x1p970, 0x0.fffffffffffff8p970);
+      TEST_fff_f (fma, 0x1.0000000000001p-1022, 0x1.0000000000001p-55, -0x1p970, -0x1p970);
+      TEST_fff_f (fma, 0x1.0000000000001p-1022, -0x1.0000000000001p-55, -0x1p970, -0x1.0000000000001p970);
 #endif
 #if defined (TEST_LDOUBLE) && LDBL_MANT_DIG == 64
       TEST_fff_f (fma, 0x1.4p-16382L, 0x1.0000000000000004p-1L, 0x1p-16384L, 0x1.c000000000000004p-16383L, UNDERFLOW_EXCEPTION);
@@ -5130,6 +5210,14 @@
       TEST_fff_f (fma, 0x0.ffffffffffffffffp0L, -0x0.ffffffffffffffffp0L, 0x0.fffffffffffffffep0L, -0x1p-128L);
       TEST_fff_f (fma, -0x0.ffffffffffffffffp0L, 0x0.ffffffffffffffffp0L, 0x0.fffffffffffffffep0L, -0x1p-128L);
       TEST_fff_f (fma, -0x0.ffffffffffffffffp0L, -0x0.ffffffffffffffffp0L, -0x0.fffffffffffffffep0L, 0x1p-128L);
+      TEST_fff_f (fma, 0x1.0000000000000002p-16382L, 0x1.0000000000000002p-66L, 0x1p16383L, 0x1p16383L);
+      TEST_fff_f (fma, 0x1.0000000000000002p-16382L, -0x1.0000000000000002p-66L, 0x1p16383L, 0x0.ffffffffffffffffp16383L);
+      TEST_fff_f (fma, 0x1.0000000000000002p-16382L, 0x1.0000000000000002p-66L, -0x1p16383L, -0x1p16383L);
+      TEST_fff_f (fma, 0x1.0000000000000002p-16382L, -0x1.0000000000000002p-66L, -0x1p16383L, -0x1.0000000000000002p16383L);
+      TEST_fff_f (fma, 0x1.0000000000000002p-16382L, 0x1.0000000000000002p-66L, 0x1p16319L, 0x1p16319L);
+      TEST_fff_f (fma, 0x1.0000000000000002p-16382L, -0x1.0000000000000002p-66L, 0x1p16319L, 0x0.ffffffffffffffffp16319L);
+      TEST_fff_f (fma, 0x1.0000000000000002p-16382L, 0x1.0000000000000002p-66L, -0x1p16319L, -0x1p16319L);
+      TEST_fff_f (fma, 0x1.0000000000000002p-16382L, -0x1.0000000000000002p-66L, -0x1p16319L, -0x1.0000000000000002p16319L);
 #endif
 #if defined (TEST_LDOUBLE) && LDBL_MANT_DIG == 113
       TEST_fff_f (fma, 0x1.4p-16382L, 0x1.0000000000000000000000000002p-1L, 0x1p-16384L, 0x1.c000000000000000000000000002p-16383L, UNDERFLOW_EXCEPTION);
@@ -5160,6 +5248,14 @@
       TEST_fff_f (fma, 0x0.ffffffffffffffffffffffffffff8p0L, -0x0.ffffffffffffffffffffffffffff8p0L, 0x0.ffffffffffffffffffffffffffffp0L, -0x1p-226L);
       TEST_fff_f (fma, -0x0.ffffffffffffffffffffffffffff8p0L, 0x0.ffffffffffffffffffffffffffff8p0L, 0x0.ffffffffffffffffffffffffffffp0L, -0x1p-226L);
       TEST_fff_f (fma, -0x0.ffffffffffffffffffffffffffff8p0L, -0x0.ffffffffffffffffffffffffffff8p0L, -0x0.ffffffffffffffffffffffffffffp0L, 0x1p-226L);
+      TEST_fff_f (fma, 0x1.0000000000000000000000000001p-16382L, 0x1.0000000000000000000000000001p-66L, 0x1p16383L, 0x1p16383L);
+      TEST_fff_f (fma, 0x1.0000000000000000000000000001p-16382L, -0x1.0000000000000000000000000001p-66L, 0x1p16383L, 0x0.ffffffffffffffffffffffffffff8p16383L);
+      TEST_fff_f (fma, 0x1.0000000000000000000000000001p-16382L, 0x1.0000000000000000000000000001p-66L, -0x1p16383L, -0x1p16383L);
+      TEST_fff_f (fma, 0x1.0000000000000000000000000001p-16382L, -0x1.0000000000000000000000000001p-66L, -0x1p16383L, -0x1.0000000000000000000000000001p16383L);
+      TEST_fff_f (fma, 0x1.0000000000000000000000000001p-16382L, 0x1.0000000000000000000000000001p-66L, 0x1p16319L, 0x1p16319L);
+      TEST_fff_f (fma, 0x1.0000000000000000000000000001p-16382L, -0x1.0000000000000000000000000001p-66L, 0x1p16319L, 0x0.ffffffffffffffffffffffffffff8p16319L);
+      TEST_fff_f (fma, 0x1.0000000000000000000000000001p-16382L, 0x1.0000000000000000000000000001p-66L, -0x1p16319L, -0x1p16319L);
+      TEST_fff_f (fma, 0x1.0000000000000000000000000001p-16382L, -0x1.0000000000000000000000000001p-66L, -0x1p16319L, -0x1.0000000000000000000000000001p16319L);
 #endif
     }
 
@@ -5256,6 +5352,14 @@
       TEST_fff_f (fma, 0x0.fffp0, -0x0.fffp0, 0x0.ffep0, -0x1p-24);
       TEST_fff_f (fma, -0x0.fffp0, 0x0.fffp0, 0x0.ffep0, -0x1p-24);
       TEST_fff_f (fma, -0x0.fffp0, -0x0.fffp0, -0x0.ffep0, 0x1p-24);
+      TEST_fff_f (fma, 0x1.000002p-126, 0x1.000002p-26, 0x1p127, 0x1.000002p127);
+      TEST_fff_f (fma, 0x1.000002p-126, -0x1.000002p-26, 0x1p127, 0x1p127);
+      TEST_fff_f (fma, 0x1.000002p-126, 0x1.000002p-26, -0x1p127, -0x0.ffffffp127);
+      TEST_fff_f (fma, 0x1.000002p-126, -0x1.000002p-26, -0x1p127, -0x1p127);
+      TEST_fff_f (fma, 0x1.000002p-126, 0x1.000002p-26, 0x1p103, 0x1.000002p103);
+      TEST_fff_f (fma, 0x1.000002p-126, -0x1.000002p-26, 0x1p103, 0x1p103);
+      TEST_fff_f (fma, 0x1.000002p-126, 0x1.000002p-26, -0x1p103, -0x0.ffffffp103);
+      TEST_fff_f (fma, 0x1.000002p-126, -0x1.000002p-26, -0x1p103, -0x1p103);
 #endif
 #if defined (TEST_DOUBLE) && DBL_MANT_DIG == 53
       TEST_fff_f (fma, 0x1.4p-1022, 0x1.0000000000002p-1, 0x1p-1024, 0x1.c000000000004p-1023, UNDERFLOW_EXCEPTION);
@@ -5286,6 +5390,14 @@
       TEST_fff_f (fma, 0x0.fffffffffffff8p0, -0x0.fffffffffffff8p0, 0x0.fffffffffffffp0, -0x1p-106);
       TEST_fff_f (fma, -0x0.fffffffffffff8p0, 0x0.fffffffffffff8p0, 0x0.fffffffffffffp0, -0x1p-106);
       TEST_fff_f (fma, -0x0.fffffffffffff8p0, -0x0.fffffffffffff8p0, -0x0.fffffffffffffp0, 0x1p-106);
+      TEST_fff_f (fma, 0x1.0000000000001p-1022, 0x1.0000000000001p-55, 0x1p1023, 0x1.0000000000001p1023);
+      TEST_fff_f (fma, 0x1.0000000000001p-1022, -0x1.0000000000001p-55, 0x1p1023, 0x1p1023);
+      TEST_fff_f (fma, 0x1.0000000000001p-1022, 0x1.0000000000001p-55, -0x1p1023, -0x0.fffffffffffff8p1023);
+      TEST_fff_f (fma, 0x1.0000000000001p-1022, -0x1.0000000000001p-55, -0x1p1023, -0x1p1023);
+      TEST_fff_f (fma, 0x1.0000000000001p-1022, 0x1.0000000000001p-55, 0x1p970, 0x1.0000000000001p970);
+      TEST_fff_f (fma, 0x1.0000000000001p-1022, -0x1.0000000000001p-55, 0x1p970, 0x1p970);
+      TEST_fff_f (fma, 0x1.0000000000001p-1022, 0x1.0000000000001p-55, -0x1p970, -0x0.fffffffffffff8p970);
+      TEST_fff_f (fma, 0x1.0000000000001p-1022, -0x1.0000000000001p-55, -0x1p970, -0x1p970);
 #endif
 #if defined (TEST_LDOUBLE) && LDBL_MANT_DIG == 64
       TEST_fff_f (fma, 0x1.4p-16382L, 0x1.0000000000000004p-1L, 0x1p-16384L, 0x1.c000000000000008p-16383L, UNDERFLOW_EXCEPTION);
@@ -5316,6 +5428,14 @@
       TEST_fff_f (fma, 0x0.ffffffffffffffffp0L, -0x0.ffffffffffffffffp0L, 0x0.fffffffffffffffep0L, -0x1p-128L);
       TEST_fff_f (fma, -0x0.ffffffffffffffffp0L, 0x0.ffffffffffffffffp0L, 0x0.fffffffffffffffep0L, -0x1p-128L);
       TEST_fff_f (fma, -0x0.ffffffffffffffffp0L, -0x0.ffffffffffffffffp0L, -0x0.fffffffffffffffep0L, 0x1p-128L);
+      TEST_fff_f (fma, 0x1.0000000000000002p-16382L, 0x1.0000000000000002p-66L, 0x1p16383L, 0x1.0000000000000002p16383L);
+      TEST_fff_f (fma, 0x1.0000000000000002p-16382L, -0x1.0000000000000002p-66L, 0x1p16383L, 0x1p16383L);
+      TEST_fff_f (fma, 0x1.0000000000000002p-16382L, 0x1.0000000000000002p-66L, -0x1p16383L, -0x0.ffffffffffffffffp16383L);
+      TEST_fff_f (fma, 0x1.0000000000000002p-16382L, -0x1.0000000000000002p-66L, -0x1p16383L, -0x1p16383L);
+      TEST_fff_f (fma, 0x1.0000000000000002p-16382L, 0x1.0000000000000002p-66L, 0x1p16319L, 0x1.0000000000000002p16319L);
+      TEST_fff_f (fma, 0x1.0000000000000002p-16382L, -0x1.0000000000000002p-66L, 0x1p16319L, 0x1p16319L);
+      TEST_fff_f (fma, 0x1.0000000000000002p-16382L, 0x1.0000000000000002p-66L, -0x1p16319L, -0x0.ffffffffffffffffp16319L);
+      TEST_fff_f (fma, 0x1.0000000000000002p-16382L, -0x1.0000000000000002p-66L, -0x1p16319L, -0x1p16319L);
 #endif
 #if defined (TEST_LDOUBLE) && LDBL_MANT_DIG == 113
       TEST_fff_f (fma, 0x1.4p-16382L, 0x1.0000000000000000000000000002p-1L, 0x1p-16384L, 0x1.c000000000000000000000000004p-16383L, UNDERFLOW_EXCEPTION);
@@ -5346,6 +5466,14 @@
       TEST_fff_f (fma, 0x0.ffffffffffffffffffffffffffff8p0L, -0x0.ffffffffffffffffffffffffffff8p0L, 0x0.ffffffffffffffffffffffffffffp0L, -0x1p-226L);
       TEST_fff_f (fma, -0x0.ffffffffffffffffffffffffffff8p0L, 0x0.ffffffffffffffffffffffffffff8p0L, 0x0.ffffffffffffffffffffffffffffp0L, -0x1p-226L);
       TEST_fff_f (fma, -0x0.ffffffffffffffffffffffffffff8p0L, -0x0.ffffffffffffffffffffffffffff8p0L, -0x0.ffffffffffffffffffffffffffffp0L, 0x1p-226L);
+      TEST_fff_f (fma, 0x1.0000000000000000000000000001p-16382L, 0x1.0000000000000000000000000001p-66L, 0x1p16383L, 0x1.0000000000000000000000000001p16383L);
+      TEST_fff_f (fma, 0x1.0000000000000000000000000001p-16382L, -0x1.0000000000000000000000000001p-66L, 0x1p16383L, 0x1p16383L);
+      TEST_fff_f (fma, 0x1.0000000000000000000000000001p-16382L, 0x1.0000000000000000000000000001p-66L, -0x1p16383L, -0x0.ffffffffffffffffffffffffffff8p16383L);
+      TEST_fff_f (fma, 0x1.0000000000000000000000000001p-16382L, -0x1.0000000000000000000000000001p-66L, -0x1p16383L, -0x1p16383L);
+      TEST_fff_f (fma, 0x1.0000000000000000000000000001p-16382L, 0x1.0000000000000000000000000001p-66L, 0x1p16319L, 0x1.0000000000000000000000000001p16319L);
+      TEST_fff_f (fma, 0x1.0000000000000000000000000001p-16382L, -0x1.0000000000000000000000000001p-66L, 0x1p16319L, 0x1p16319L);
+      TEST_fff_f (fma, 0x1.0000000000000000000000000001p-16382L, 0x1.0000000000000000000000000001p-66L, -0x1p16319L, -0x0.ffffffffffffffffffffffffffff8p16319L);
+      TEST_fff_f (fma, 0x1.0000000000000000000000000001p-16382L, -0x1.0000000000000000000000000001p-66L, -0x1p16319L, -0x1p16319L);
 #endif
     }
 

Modified: fsf/trunk/libc/ports/ChangeLog.tile
==============================================================================
--- fsf/trunk/libc/ports/ChangeLog.tile (original)
+++ fsf/trunk/libc/ports/ChangeLog.tile Tue Nov  6 16:19:40 2012
@@ -1,3 +1,26 @@
+2012-11-06  Chris Metcalf  <cmetcalf@xxxxxxxxxx>
+
+	* sysdeps/tile/nptl/pthread_spin_unlock.c: New file.
+
+2012-11-05  Chris Metcalf  <cmetcalf@xxxxxxxxxx>
+
+	* sysdeps/tile/math_private.h: Provide additional no-op defines
+	for exception and rounding macros.
+
+	* sysdeps/tile/tilegx/Makefile: Generate Makefile fragment to determine
+	whether to build elf-init.c and gmon-start.c with -mcmodel=large.
+	* sysdeps/tile/crti.S: Support large memory model.
+	* sysdeps/tile/start.S: Likewise.
+
+2012-11-02  Chris Metcalf  <cmetcalf@xxxxxxxxxx>
+
+	* sysdeps/tile/dl-runtime.c (_dl_after_load): Handle simulator
+	notification better for dlopen() of relative paths.
+
+	* sysdeps/tile/tilegx/memcpy.c (__memcpy): Optimize.
+	* sysdeps/tile/memcopy.h: New file.
+	* sysdeps/tile/wordcopy.c: New file.
+
 2012-11-03  Joseph Myers  <joseph@xxxxxxxxxxxxxxxx>
 
 	[BZ #3439]

Modified: fsf/trunk/libc/ports/sysdeps/tile/crti.S
==============================================================================
--- fsf/trunk/libc/ports/sysdeps/tile/crti.S (original)
+++ fsf/trunk/libc/ports/sysdeps/tile/crti.S Tue Nov  6 16:19:40 2012
@@ -70,16 +70,17 @@
 #if PREINIT_FUNCTION_WEAK
 	lnk r2
 0:
-#ifdef __tilegx__
+# ifdef __tilegx__
+	moveli r1, hw2_last(_GLOBAL_OFFSET_TABLE_ - 0b)
 	{
-	 moveli r1, hw1_last(_GLOBAL_OFFSET_TABLE_ - 0b)
+	 shl16insli r1, r1, hw1(_GLOBAL_OFFSET_TABLE_ - 0b)
 	 moveli r0, hw1_last_got(PREINIT_FUNCTION)
 	}
 	{
 	 shl16insli r1, r1, hw0(_GLOBAL_OFFSET_TABLE_ - 0b)
 	 shl16insli r0, r0, hw0_got(PREINIT_FUNCTION)
 	}
-#else
+# else
 	{
 	 moveli r1, lo16(_GLOBAL_OFFSET_TABLE_ - 0b)
 	 moveli r0, got_lo16(PREINIT_FUNCTION)
@@ -88,13 +89,25 @@
 	 auli r1, r1, ha16(_GLOBAL_OFFSET_TABLE_ - 0b)
 	 auli r0, r0, got_ha16(PREINIT_FUNCTION)
 	}
-#endif
+# endif
 	ADD_PTR r0, r0, r1
 	ADD_PTR r0, r0, r2
 	LD_PTR r0, r0
 	BEQZ r0, .Lno_weak_fn
+	jalr r0
+#elif defined(__tilegx__)
+	/* Since we are calling from the start of the object to the PLT,
+	   call by loading the full address into a register.  */
+	lnk r2
+0:
+	moveli r0, hw2_last_plt(PREINIT_FUNCTION - 0b)
+	shl16insli r0, r0, hw1_plt(PREINIT_FUNCTION - 0b)
+	shl16insli r0, r0, hw0_plt(PREINIT_FUNCTION - 0b)
+	add r0, r0, r2
+	jalr r0
+#else
+	jal plt(PREINIT_FUNCTION)
 #endif
-	jal plt(PREINIT_FUNCTION)
 .Lno_weak_fn:
 
 	.section .fini,"ax",@progbits

Modified: fsf/trunk/libc/ports/sysdeps/tile/dl-runtime.c
==============================================================================
--- fsf/trunk/libc/ports/sysdeps/tile/dl-runtime.c (original)
+++ fsf/trunk/libc/ports/sysdeps/tile/dl-runtime.c Tue Nov  6 16:19:40 2012
@@ -1,4 +1,4 @@
-/* Copyright (C) 2011 Free Software Foundation, Inc.
+/* Copyright (C) 2011-2012 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Chris Metcalf <cmetcalf@xxxxxxxxxx>, 2011.
 
@@ -28,44 +28,119 @@
 #include <sys/mman.h>
 #include <arch/sim.h>
 
-/* Support notifying the simulator about new objects. */
+/* Like realpath(), but simplified: no dynamic memory use, no lstat(),
+   no set_errno(), no valid "rpath" on error, etc.  This handles some
+   simple cases where the simulator might not have a valid entry for
+   a loaded Elf object, in particular dlopen() with a relative path.
+   For this relatively rare case, one could also imagine using
+   link_map.l_origin to avoid the getcwd() here, but the simpler code
+   here seems like a better solution.  */
+static char *
+dl_realpath (const char *name, char *rpath)
+{
+  char *dest;
+  const char *start, *end;
+
+  if (name[0] != '/')
+    {
+      if (!__getcwd (rpath, PATH_MAX))
+        return NULL;
+      dest = __rawmemchr (rpath, '\0');
+    }
+  else
+    {
+      rpath[0] = '/';
+      dest = rpath + 1;
+    }
+
+  for (start = end = name; *start; start = end)
+    {
+      /* Skip sequence of multiple path-separators.  */
+      while (*start == '/')
+	++start;
+
+      /* Find end of path component.  */
+      for (end = start; *end && *end != '/'; ++end)
+	/* Nothing.  */;
+
+      if (end - start == 0)
+	break;
+      else if (end - start == 1 && start[0] == '.')
+	/* nothing */;
+      else if (end - start == 2 && start[0] == '.' && start[1] == '.')
+	{
+	  /* Back up to previous component, ignore if at root already.  */
+	  if (dest > rpath + 1)
+	    while ((--dest)[-1] != '/');
+	}
+      else
+	{
+	  if (dest[-1] != '/')
+	    *dest++ = '/';
+
+	  if (dest + (end - start) >= rpath + PATH_MAX)
+            return NULL;
+
+	  dest = __mempcpy (dest, start, end - start);
+	  *dest = '\0';
+	}
+    }
+  if (dest > rpath + 1 && dest[-1] == '/')
+    --dest;
+  *dest = '\0';
+
+  return rpath;
+}
+
+/* Support notifying the simulator about new objects.  */
 void internal_function
 _dl_after_load (struct link_map *l)
 {
   int shift;
+  char pathbuf[PATH_MAX];
+  char *path;
 
-#define DLPUTC(c) __insn_mtspr(SPR_SIM_CONTROL,                         \
-                               (SIM_CONTROL_DLOPEN                      \
-                                | ((c) << _SIM_CONTROL_OPERATOR_BITS)))
+  /* Don't bother if not in the simulator. */
+  if (__insn_mfspr (SPR_SIM_CONTROL) == 0)
+    return;
 
-  /* Write the library address in hex. */
+#define DLPUTC(c) __insn_mtspr (SPR_SIM_CONTROL,                         \
+                                (SIM_CONTROL_DLOPEN                      \
+                                 | ((c) << _SIM_CONTROL_OPERATOR_BITS)))
+
+  /* Write the library address in hex.  */
   DLPUTC ('0');
   DLPUTC ('x');
   for (shift = (int) sizeof (unsigned long) * 8 - 4; shift >= 0; shift -= 4)
     DLPUTC ("0123456789abcdef"[(l->l_map_start >> shift) & 0xF]);
   DLPUTC (':');
 
-  /* Write the library path, including the terminating '\0'. */
+  /* Write the library path, including the terminating '\0'.  */
+  path = dl_realpath (l->l_name, pathbuf) ?: l->l_name;
   for (size_t i = 0;; i++)
     {
-      DLPUTC (l->l_name[i]);
-      if (l->l_name[i] == '\0')
+      DLPUTC (path[i]);
+      if (path[i] == '\0')
         break;
     }
 #undef DLPUTC
 }
 
-/* Support notifying the simulator about removed objects prior to munmap(). */
+/* Support notifying the simulator about removed objects prior to munmap().  */
 void internal_function
 _dl_unmap (struct link_map *l)
 {
   int shift;
 
-#define DLPUTC(c) __insn_mtspr(SPR_SIM_CONTROL,                         \
-                               (SIM_CONTROL_DLCLOSE                     \
-                                | ((c) << _SIM_CONTROL_OPERATOR_BITS)))
+  /* Don't bother if not in the simulator.  */
+  if (__insn_mfspr (SPR_SIM_CONTROL) == 0)
+    return;
 
-  /* Write the library address in hex. */
+#define DLPUTC(c) __insn_mtspr (SPR_SIM_CONTROL,                         \
+                                (SIM_CONTROL_DLCLOSE                     \
+                                 | ((c) << _SIM_CONTROL_OPERATOR_BITS)))
+
+  /* Write the library address in hex.  */
   DLPUTC ('0');
   DLPUTC ('x');
   for (shift = (int) sizeof (unsigned long) * 8 - 4; shift >= 0; shift -= 4)

Modified: fsf/trunk/libc/ports/sysdeps/tile/math_private.h
==============================================================================
--- fsf/trunk/libc/ports/sysdeps/tile/math_private.h (original)
+++ fsf/trunk/libc/ports/sysdeps/tile/math_private.h Tue Nov  6 16:19:40 2012
@@ -1,13 +1,31 @@
 #ifndef _MATH_PRIVATE_H
+
+/* Internally, we suppress any use of exception or rounding other
+   than what is supported by the hardware.  This does mean that some
+   code will silently fail to report exceptions, set rounding mode
+   as expected, etc., but it allows math code to compile that otherwise
+   wouldn't (such as math/s_fma.c) and so is valuable.
+
+   We intentionally ignore the "exception" arguments of functions that
+   take an exception, since we can't even evaluate the argument
+   without causing a build failure.  The extra level of statement
+   expression wrapping avoids "statement with no effect" warnings.
+   Since the callers don't check for errors anyway, we just claim
+   success in every case.
+
+   The overrides for libc_ functions must happen before we include
+   the generic math_private.h, and the overrides for regular
+   <fenv.h> functions must happen afterwards, to avoid clashing with
+   the declarations of those functions.  */
+
+#define libc_fesetround(rnd)			({ 0; })
+#define libc_fetestexcept(exc)			({ 0; })
+#define libc_feholdexcept_setround(env, exc)	({ (void) (env); 0; })
+#define libc_feupdateenv_test(env, exc)		({ (void) (env); 0; })
 
 #include_next <math_private.h>
 
-/* We have no exception support, so feraiseexcept() must be a no-op.
-   And since we don't define FE_INVALID, FE_DIVBYZERO, etc., we
-   must ignore the argument of feraiseexcept() as well.  we return
-   "1" to indicate we failed to raise an exception, though none of
-   the callers in glibc actually care.  The extra level of statement
-   expression wrapping avoids "statement with no effect" warnings.  */
-#define feraiseexcept(excepts) ({ 1; })
+#define feraiseexcept(excepts)			({ 0; })
+#define feclearexcept(exc)			({ 0; })
 
 #endif

Added: fsf/trunk/libc/ports/sysdeps/tile/memcopy.h
==============================================================================
--- fsf/trunk/libc/ports/sysdeps/tile/memcopy.h (added)
+++ fsf/trunk/libc/ports/sysdeps/tile/memcopy.h Tue Nov  6 16:19:40 2012
@@ -1,0 +1,27 @@
+/* memcopy.h -- definitions for memory copy functions.  Tile version.
+   Copyright (C) 2012 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdeps/generic/memcopy.h>
+#include <bits/wordsize.h>
+
+/* Support more efficient copying on tilegx32, which supports
+   long long as a native 64-bit type.  */
+#if defined (__tilegx__) && __WORDSIZE == 32
+# undef op_t
+# define op_t	unsigned long long int
+#endif

Added: fsf/trunk/libc/ports/sysdeps/tile/nptl/pthread_spin_unlock.c
==============================================================================
--- fsf/trunk/libc/ports/sysdeps/tile/nptl/pthread_spin_unlock.c (added)
+++ fsf/trunk/libc/ports/sysdeps/tile/nptl/pthread_spin_unlock.c Tue Nov  6 16:19:40 2012
@@ -1,0 +1,33 @@
+/* pthread_spin_unlock -- unlock a spin lock.  Tile version.
+   Copyright (C) 2012 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include "pthreadP.h"
+#include <atomic.h>
+
+int
+pthread_spin_unlock (pthread_spinlock_t *lock)
+{
+#ifdef __tilegx__
+  /* Use exchange() to bypass the write buffer. */
+  atomic_exchange_rel (lock, 0);
+#else
+  atomic_full_barrier ();
+  *lock = 0;
+#endif
+  return 0;
+}

Modified: fsf/trunk/libc/ports/sysdeps/tile/start.S
==============================================================================
--- fsf/trunk/libc/ports/sysdeps/tile/start.S (original)
+++ fsf/trunk/libc/ports/sysdeps/tile/start.S Tue Nov  6 16:19:40 2012
@@ -126,27 +126,37 @@
 	 moveli r0, hw2_last(main - .Lmy_pc)
 	}
 	{
+	 shl16insli r0, r0, hw1(main - .Lmy_pc)
 	 moveli r3, hw2_last(__libc_csu_init - .Lmy_pc)
-	 shl16insli r0, r0, hw1(main - .Lmy_pc)
 	}
 	{
+	 shl16insli r0, r0, hw0(main - .Lmy_pc)
 	 shl16insli r3, r3, hw1(__libc_csu_init - .Lmy_pc)
-	 shl16insli r0, r0, hw0(main - .Lmy_pc)
 	}
 	{
+	 ADD_PTR r0, r0, r13
 	 shl16insli r3, r3, hw0(__libc_csu_init - .Lmy_pc)
+	}
+	{
+	 moveli r12, hw2_last_plt(__libc_start_main - .Lmy_pc)
+	 ADD_PTR r3, r3, r13
+	}
+	{
+	 shl16insli r12, r12, hw1_plt(__libc_start_main - .Lmy_pc)
 	 moveli r4, hw2_last(__libc_csu_fini - .Lmy_pc)
 	}
 	{
-	 ADD_PTR r0, r0, r13
+	 shl16insli r12, r12, hw0_plt(__libc_start_main - .Lmy_pc)
 	 shl16insli r4, r4, hw1(__libc_csu_fini - .Lmy_pc)
 	}
 	{
-	 ADD_PTR r3, r3, r13
+	 ADD_PTR r12, r12, r13
 	 shl16insli r4, r4, hw0(__libc_csu_fini - .Lmy_pc)
 	}
 	{
 	 ADD_PTR r4, r4, r13
+	 jalr r12
+	}
 #else
 	 addli r0, r13, lo16(main - .Lmy_pc)
 	}
@@ -160,13 +170,12 @@
 	}
 	{
 	 auli r4, r4, ha16(__libc_csu_fini - .Lmy_pc)
-
-#endif
-
 	 /* Call the user's main function, and exit with its value.
 	    But let the libc call main. */
 	 j plt(__libc_start_main)
 	}
+#endif
+
 	{
 	 /* Tell backtracer to give up (_start has no caller). */
 	 info INFO_OP_CANNOT_BACKTRACE

Added: fsf/trunk/libc/ports/sysdeps/tile/tilegx/Makefile
==============================================================================
--- fsf/trunk/libc/ports/sysdeps/tile/tilegx/Makefile (added)
+++ fsf/trunk/libc/ports/sysdeps/tile/tilegx/Makefile Tue Nov  6 16:19:40 2012
@@ -1,0 +1,18 @@
+include $(common-objpfx)cflags-mcmodel-large.mk
+
+$(common-objpfx)cflags-mcmodel-large.mk: $(common-objpfx)config.make
+	mcmodel=no; \
+	$(CC) -S -o /dev/null -xc /dev/null -mcmodel=large && mcmodel=yes; \
+	echo "cflags-mcmodel-large = $$mcmodel" > $@
+
+ifeq ($(subdir),csu)
+ifeq (yes,$(cflags-mcmodel-large))
+# elf-init.c is in libc_nonshared.o (the end of the shared object) but
+# must reach the _init symbol at the very start of the shared object.
+CFLAGS-elf-init.c += -mcmodel=large
+
+# __gmon_start__ is at the very start of the shared object when linked
+# with profiling, but calls to libc.so via the PLT at the very end.
+CFLAGS-gmon-start.c += -mcmodel=large
+endif
+endif

Modified: fsf/trunk/libc/ports/sysdeps/tile/tilegx/memcpy.c
==============================================================================
--- fsf/trunk/libc/ports/sysdeps/tile/tilegx/memcpy.c (original)
+++ fsf/trunk/libc/ports/sysdeps/tile/tilegx/memcpy.c Tue Nov  6 16:19:40 2012
@@ -19,10 +19,8 @@
 #include <string.h>
 #include <stdint.h>
 #include <stdlib.h>
+#include <memcopy.h>
 #include <arch/chip.h>
-
-/* Must be 8 bytes in size. */
-#define word_t uint64_t
 
 /* How many cache lines ahead should we prefetch? */
 #define PREFETCH_LINES_AHEAD 3
@@ -34,8 +32,8 @@
   const char *__restrict src1 = (const char *) srcv;
   const char *__restrict src1_end;
   const char *__restrict prefetch;
-  word_t *__restrict dst8; /* 8-byte pointer to destination memory. */
-  word_t final; /* Final bytes to write to trailing word, if any */
+  op_t *__restrict dst8; /* 8-byte pointer to destination memory. */
+  op_t final; /* Final bytes to write to trailing word, if any */
   long i;
 
   if (n < 16)
@@ -55,101 +53,169 @@
     {
       __insn_prefetch (prefetch);
       prefetch += CHIP_L2_LINE_SIZE ();
-      prefetch = (prefetch > src1_end) ? prefetch : src1;
+      prefetch = (prefetch < src1_end) ? prefetch : src1;
     }
 
   /* Copy bytes until dst is word-aligned. */
-  for (; (uintptr_t) dst1 & (sizeof (word_t) - 1); n--)
+  for (; (uintptr_t) dst1 & (sizeof (op_t) - 1); n--)
     *dst1++ = *src1++;
 
   /* 8-byte pointer to destination memory. */
-  dst8 = (word_t *) dst1;
-
-  if (__builtin_expect ((uintptr_t) src1 & (sizeof (word_t) - 1), 0))
-    {
-      /* Misaligned copy.  Copy 8 bytes at a time, but don't bother
-         with other fanciness.
-         TODO: Consider prefetching and using wh64 as well.  */
-
-      /* Create an aligned src8. */
-      const word_t *__restrict src8 =
-        (const word_t *) ((uintptr_t) src1 & -sizeof (word_t));
-      word_t b;
-
-      word_t a = *src8++;
-      for (; n >= sizeof (word_t); n -= sizeof (word_t))
-        {
-          b = *src8++;
-          a = __insn_dblalign (a, b, src1);
-          *dst8++ = a;
-          a = b;
-        }
-
+  dst8 = (op_t *) dst1;
+
+  if (__builtin_expect ((uintptr_t) src1 & (sizeof (op_t) - 1), 0))
+    {
+      /* Misaligned copy.  Use glibc's _wordcopy_fwd_dest_aligned, but
+         inline it to avoid prologue/epilogue.  TODO: Consider
+         prefetching and using wh64 as well.  */
+      void * srci;
+      op_t a0, a1, a2, a3;
+      long int dstp = (long int) dst1;
+      long int srcp = (long int) src1;
+      long int len = n / OPSIZ;
+
+      /* Save the initial source pointer so we know the number of
+         bytes to shift for merging two unaligned results.  */
+      srci = (void *) srcp;
+
+      /* Make SRCP aligned by rounding it down to the beginning of the
+         `op_t' it points in the middle of.  */
+      srcp &= -OPSIZ;
+
+      switch (len % 4)
+	{
+	case 2:
+	  a1 = ((op_t *) srcp)[0];
+	  a2 = ((op_t *) srcp)[1];
+	  len += 2;
+	  srcp += 2 * OPSIZ;
+	  goto do1;
+	case 3:
+	  a0 = ((op_t *) srcp)[0];
+	  a1 = ((op_t *) srcp)[1];
+	  len += 1;
+	  srcp += 2 * OPSIZ;
+	  goto do2;
+	case 0:
+	  if (OP_T_THRES <= 3 * OPSIZ && len == 0)
+	    return dstv;
+	  a3 = ((op_t *) srcp)[0];
+	  a0 = ((op_t *) srcp)[1];
+	  len += 0;
+	  srcp += 2 * OPSIZ;
+	  goto do3;
+	case 1:
+	  a2 = ((op_t *) srcp)[0];
+	  a3 = ((op_t *) srcp)[1];
+	  srcp += 2 * OPSIZ;
+	  len -= 1;
+	  if (OP_T_THRES <= 3 * OPSIZ && len == 0)
+	    goto do0;
+	  goto do4;			/* No-op.  */
+	}
+
+      do
+	{
+	do4:
+	  a0 = ((op_t *) srcp)[0];
+	  a2 = __insn_dblalign (a2, a3, srci);
+	  ((op_t *) dstp)[0] = a2;
+	  srcp += OPSIZ;
+	  dstp += OPSIZ;
+	do3:
+	  a1 = ((op_t *) srcp)[0];
+	  a3 = __insn_dblalign (a3, a0, srci);
+	  ((op_t *) dstp)[0] = a3;
+	  srcp += OPSIZ;
+	  dstp += OPSIZ;
+	do2:
+	  a2 = ((op_t *) srcp)[0];
+	  a0 = __insn_dblalign (a0, a1, srci);
+	  ((op_t *) dstp)[0] = a0;
+	  srcp += OPSIZ;
+	  dstp += OPSIZ;
+	do1:
+	  a3 = ((op_t *) srcp)[0];
+	  a1 = __insn_dblalign (a1, a2, srci);
+	  ((op_t *) dstp)[0] = a1;
+	  srcp += OPSIZ;
+	  dstp += OPSIZ;
+	  len -= 4;
+	}
+      while (len != 0);
+
+      /* This is the right position for do0.  Please don't move
+         it into the loop.  */
+    do0:
+      ((op_t *) dstp)[0] = __insn_dblalign (a2, a3, srci);
+
+      n = n % OPSIZ;
       if (n == 0)
-        return dstv;
-
-      b = ((const char *) src8 <= src1_end) ? *src8 : 0;
-
-      /* Final source bytes to write to trailing partial word, if any. */
-      final = __insn_dblalign (a, b, src1);
+	return dstv;
+
+      a0 = ((const char *) srcp <= src1_end) ? ((op_t *) srcp)[0] : 0;
+
+      final = __insn_dblalign (a3, a0, srci);
+      dst8 = (op_t *)(dstp + OPSIZ);
     }
   else
     {
       /* Aligned copy. */
 
-      const word_t *__restrict src8 = (const word_t *) src1;
+      const op_t *__restrict src8 = (const op_t *) src1;
 
       /* src8 and dst8 are both word-aligned. */
       if (n >= CHIP_L2_LINE_SIZE ())
         {
           /* Copy until 'dst' is cache-line-aligned. */
           for (; (uintptr_t) dst8 & (CHIP_L2_LINE_SIZE () - 1);
-               n -= sizeof (word_t))
+               n -= sizeof (op_t))
             *dst8++ = *src8++;
 
-          /* If copying to self, return.  The test is cheap enough
-             that we do it despite the fact that the memcpy() contract
-             doesn't require us to support overlapping dst and src.
-             This is the most common case of overlap, and any close
-             overlap will cause corruption due to the wh64 below.
-             This case is particularly important since the compiler
-             will emit memcpy() calls for aggregate copies even if it
-             can't prove that src != dst.  */
-          if (__builtin_expect (dst8 == src8, 0))
-            return dstv;
-
           for (; n >= CHIP_L2_LINE_SIZE ();)
-            {
-              __insn_wh64 (dst8);
-
-              /* Prefetch and advance to next line to prefetch, but
-                 don't go past the end.  */
-              __insn_prefetch (prefetch);
-              prefetch += CHIP_L2_LINE_SIZE ();
-              prefetch = (prefetch > src1_end) ? prefetch :
-                (const char *) src8;
-
-              /* Copy an entire cache line.  Manually unrolled to
-                 avoid idiosyncracies of compiler unrolling.  */
-#define COPY_WORD(offset) ({ dst8[offset] = src8[offset]; n -= 8; })
-              COPY_WORD (0);
-              COPY_WORD (1);
-              COPY_WORD (2);
-              COPY_WORD (3);
-              COPY_WORD (4);
-              COPY_WORD (5);
-              COPY_WORD (6);
-              COPY_WORD (7);
+	    {
+	      op_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+
+	      /* Prefetch and advance to next line to prefetch, but
+		 don't go past the end.  */
+	      __insn_prefetch (prefetch);
+	      prefetch += CHIP_L2_LINE_SIZE ();
+	      prefetch = (prefetch < src1_end) ? prefetch :
+		(const char *) src8;
+
+	      /* Do all the loads before wh64.  This is necessary if
+		 [src8, src8+7] and [dst8, dst8+7] share the same
+		 cache line and dst8 <= src8, as can be the case when
+		 called from memmove, or with code tested on x86 whose
+		 memcpy always works with forward copies.  */
+	      tmp0 = *src8++;
+	      tmp1 = *src8++;
+	      tmp2 = *src8++;
+	      tmp3 = *src8++;
+	      tmp4 = *src8++;
+	      tmp5 = *src8++;
+	      tmp6 = *src8++;
+	      tmp7 = *src8++;
+
+	      __insn_wh64 (dst8);
+
+	      *dst8++ = tmp0;
+	      *dst8++ = tmp1;
+	      *dst8++ = tmp2;
+	      *dst8++ = tmp3;
+	      *dst8++ = tmp4;
+	      *dst8++ = tmp5;
+	      *dst8++ = tmp6;
+	      *dst8++ = tmp7;
+
+	      n -= 64;
+	    }
 #if CHIP_L2_LINE_SIZE() != 64
 # error "Fix code that assumes particular L2 cache line size."
 #endif
-
-              dst8 += CHIP_L2_LINE_SIZE () / sizeof (word_t);
-              src8 += CHIP_L2_LINE_SIZE () / sizeof (word_t);
-            }
         }
 
-      for (; n >= sizeof (word_t); n -= sizeof (word_t))
+      for (; n >= sizeof (op_t); n -= sizeof (op_t))
         *dst8++ = *src8++;
 
       if (__builtin_expect (n == 0, 1))

Added: fsf/trunk/libc/ports/sysdeps/tile/wordcopy.c
==============================================================================
--- fsf/trunk/libc/ports/sysdeps/tile/wordcopy.c (added)
+++ fsf/trunk/libc/ports/sysdeps/tile/wordcopy.c Tue Nov  6 16:19:40 2012
@@ -1,0 +1,449 @@
+/* wordcopy.c -- subroutines for memory copy functions.  Tile version.
+   Copyright (C) 1991-2012 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* To optimize for tile, we make the following changes from the
+   default glibc version:
+   - Use the double align instruction instead of the MERGE macro.
+   - Since we don't have offset addressing mode, make sure the loads /
+     stores in the inner loop always have indices of 0.
+   - Use post-increment addresses in the inner loops, which yields
+     better scheduling.  */
+
+/* BE VERY CAREFUL IF YOU CHANGE THIS CODE...!  */
+
+#include <stddef.h>
+#include <memcopy.h>
+
+/* Provide the appropriate dblalign builtin to shift two registers
+   based on the alignment of a pointer held in a third register.  */
+#ifdef __tilegx__
+#define DBLALIGN __insn_dblalign
+#else
+#define DBLALIGN __insn_dword_align
+#endif
+
+/* _wordcopy_fwd_aligned -- Copy block beginning at SRCP to
+   block beginning at DSTP with LEN `op_t' words (not LEN bytes!).
+   Both SRCP and DSTP should be aligned for memory operations on `op_t's.  */
+
+void
+_wordcopy_fwd_aligned (dstp, srcp, len)
+     long int dstp;
+     long int srcp;
+     size_t len;
+{
+  op_t a0, a1;
+
+  switch (len % 8)
+    {
+    case 2:
+      a0 = ((op_t *) srcp)[0];
+      srcp += OPSIZ;
+      len += 6;
+      goto do1;
+    case 3:
+      a1 = ((op_t *) srcp)[0];
+      srcp += OPSIZ;
+      len += 5;
+      goto do2;
+    case 4:
+      a0 = ((op_t *) srcp)[0];
+      srcp += OPSIZ;
+      len += 4;
+      goto do3;
+    case 5:
+      a1 = ((op_t *) srcp)[0];
+      srcp += OPSIZ;
+      len += 3;
+      goto do4;
+    case 6:
+      a0 = ((op_t *) srcp)[0];
+      srcp += OPSIZ;
+      len += 2;
+      goto do5;
+    case 7:
+      a1 = ((op_t *) srcp)[0];
+      srcp += OPSIZ;
+      len += 1;
+      goto do6;
+
+    case 0:
+      if (OP_T_THRES <= 3 * OPSIZ && len == 0)
+	return;
+      a0 = ((op_t *) srcp)[0];
+      srcp += OPSIZ;
+      goto do7;
+    case 1:
+      a1 = ((op_t *) srcp)[0];
+      srcp += OPSIZ;
+      len -= 1;
+      if (OP_T_THRES <= 3 * OPSIZ && len == 0)
+	goto do0;
+      goto do8;			/* No-op.  */
+    }
+
+  do
+    {
+    do8:
+      a0 = ((op_t *) srcp)[0];
+      ((op_t *) dstp)[0] = a1;
+      srcp += OPSIZ;
+      dstp += OPSIZ;
+    do7:
+      a1 = ((op_t *) srcp)[0];
+      ((op_t *) dstp)[0] = a0;
+      srcp += OPSIZ;
+      dstp += OPSIZ;
+    do6:
+      a0 = ((op_t *) srcp)[0];
+      ((op_t *) dstp)[0] = a1;
+      srcp += OPSIZ;
+      dstp += OPSIZ;
+    do5:
+      a1 = ((op_t *) srcp)[0];
+      ((op_t *) dstp)[0] = a0;
+      srcp += OPSIZ;
+      dstp += OPSIZ;
+    do4:
+      a0 = ((op_t *) srcp)[0];
+      ((op_t *) dstp)[0] = a1;
+      srcp += OPSIZ;
+      dstp += OPSIZ;
+    do3:
+      a1 = ((op_t *) srcp)[0];
+      ((op_t *) dstp)[0] = a0;
+      srcp += OPSIZ;
+      dstp += OPSIZ;
+    do2:
+      a0 = ((op_t *) srcp)[0];
+      ((op_t *) dstp)[0] = a1;
+      srcp += OPSIZ;
+      dstp += OPSIZ;
+    do1:
+      a1 = ((op_t *) srcp)[0];
+      ((op_t *) dstp)[0] = a0;
+      srcp += OPSIZ;
+      dstp += OPSIZ;
+
+      len -= 8;
+    }
+  while (len != 0);
+
+  /* This is the right position for do0.  Please don't move
+     it into the loop.  */
+ do0:
+  ((op_t *) dstp)[0] = a1;
+}
+
+/* _wordcopy_fwd_dest_aligned -- Copy block beginning at SRCP to
+   block beginning at DSTP with LEN `op_t' words (not LEN bytes!).
+   DSTP should be aligned for memory operations on `op_t's, but SRCP must
+   *not* be aligned.  */
+
+void
+_wordcopy_fwd_dest_aligned (dstp, srcp, len)
+     long int dstp;
+     long int srcp;
+     size_t len;
+{
+  void * srci;
+  op_t a0, a1, a2, a3;
+
+  /* Save the initial source pointer so we know the number of bytes to
+     shift for merging two unaligned results.  */
+  srci = (void *) srcp;
+
+  /* Make SRCP aligned by rounding it down to the beginning of the `op_t'
+     it points in the middle of.  */
+  srcp &= -OPSIZ;
+
+  switch (len % 4)
+    {
+    case 2:
+      a1 = ((op_t *) srcp)[0];
+      a2 = ((op_t *) srcp)[1];
+      len += 2;
+      srcp += 2 * OPSIZ;
+      goto do1;
+    case 3:
+      a0 = ((op_t *) srcp)[0];
+      a1 = ((op_t *) srcp)[1];
+      len += 1;
+      srcp += 2 * OPSIZ;
+      goto do2;
+    case 0:
+      if (OP_T_THRES <= 3 * OPSIZ && len == 0)
+	return;
+      a3 = ((op_t *) srcp)[0];
+      a0 = ((op_t *) srcp)[1];
+      len += 0;
+      srcp += 2 * OPSIZ;
+      goto do3;
+    case 1:
+      a2 = ((op_t *) srcp)[0];
+      a3 = ((op_t *) srcp)[1];
+      srcp += 2 * OPSIZ;
+      len -= 1;
+      if (OP_T_THRES <= 3 * OPSIZ && len == 0)
+	goto do0;
+      goto do4;			/* No-op.  */
+    }
+
+  do
+    {
+    do4:
+      a0 = ((op_t *) srcp)[0];
+      a2 = DBLALIGN (a2, a3, srci);
+      ((op_t *) dstp)[0] = a2;
+      srcp += OPSIZ;
+      dstp += OPSIZ;
+    do3:
+      a1 = ((op_t *) srcp)[0];
+      a3 = DBLALIGN (a3, a0, srci);
+      ((op_t *) dstp)[0] = a3;
+      srcp += OPSIZ;
+      dstp += OPSIZ;
+    do2:
+      a2 = ((op_t *) srcp)[0];
+      a0 = DBLALIGN (a0, a1, srci);
+      ((op_t *) dstp)[0] = a0;
+      srcp += OPSIZ;
+      dstp += OPSIZ;
+    do1:
+      a3 = ((op_t *) srcp)[0];
+      a1 = DBLALIGN (a1, a2, srci);
+      ((op_t *) dstp)[0] = a1;
+      srcp += OPSIZ;
+      dstp += OPSIZ;
+      len -= 4;
+    }
+  while (len != 0);
+
+  /* This is the right position for do0.  Please don't move
+     it into the loop.  */
+ do0:
+  ((op_t *) dstp)[0] = DBLALIGN (a2, a3, srci);
+}
+
+/* _wordcopy_bwd_aligned -- Copy block finishing right before
+   SRCP to block finishing right before DSTP with LEN `op_t' words
+   (not LEN bytes!).  Both SRCP and DSTP should be aligned for memory
+   operations on `op_t's.  */
+
+void
+_wordcopy_bwd_aligned (dstp, srcp, len)
+     long int dstp;
+     long int srcp;
+     size_t len;
+{
+  op_t a0, a1;
+  long int srcp1;
+
+  srcp1 = srcp - 1 * OPSIZ;
+  srcp -= 2 * OPSIZ;
+  dstp -= 1 * OPSIZ;
+
+  switch (len % 8)
+    {
+    case 2:
+      a0 = ((op_t *) srcp1)[0];
+      len += 6;
+      goto do1;
+    case 3:
+      a1 = ((op_t *) srcp1)[0];
+      len += 5;
+      goto do2;
+    case 4:
+      a0 = ((op_t *) srcp1)[0];
+      len += 4;
+      goto do3;
+    case 5:
+      a1 = ((op_t *) srcp1)[0];
+      len += 3;
+      goto do4;
+    case 6:
+      a0 = ((op_t *) srcp1)[0];
+      len += 2;
+      goto do5;
+    case 7:
+      a1 = ((op_t *) srcp1)[0];
+      len += 1;
+      goto do6;
+
+    case 0:
+      if (OP_T_THRES <= 3 * OPSIZ && len == 0)
+	return;
+      a0 = ((op_t *) srcp1)[0];
+      goto do7;
+    case 1:
+      a1 = ((op_t *) srcp1)[0];
+      len -= 1;
+      if (OP_T_THRES <= 3 * OPSIZ && len == 0)
+	goto do0;
+      goto do8;			/* No-op.  */
+    }
+
+  do
+    {
+    do8:
+      a0 = ((op_t *) srcp)[0];
+      ((op_t *) dstp)[0] = a1;
+      srcp -= OPSIZ;
+      dstp -= OPSIZ;
+    do7:
+      a1 = ((op_t *) srcp)[0];
+      ((op_t *) dstp)[0] = a0;
+      srcp -= OPSIZ;
+      dstp -= OPSIZ;
+    do6:
+      a0 = ((op_t *) srcp)[0];
+      ((op_t *) dstp)[0] = a1;
+      srcp -= OPSIZ;
+      dstp -= OPSIZ;
+    do5:
+      a1 = ((op_t *) srcp)[0];
+      ((op_t *) dstp)[0] = a0;
+      srcp -= OPSIZ;
+      dstp -= OPSIZ;
+    do4:
+      a0 = ((op_t *) srcp)[0];
+      ((op_t *) dstp)[0] = a1;
+      srcp -= OPSIZ;
+      dstp -= OPSIZ;
+    do3:
+      a1 = ((op_t *) srcp)[0];
+      ((op_t *) dstp)[0] = a0;
+      srcp -= OPSIZ;
+      dstp -= OPSIZ;
+    do2:
+      a0 = ((op_t *) srcp)[0];
+      ((op_t *) dstp)[0] = a1;
+      srcp -= OPSIZ;
+      dstp -= OPSIZ;
+    do1:
+      a1 = ((op_t *) srcp)[0];
+      ((op_t *) dstp)[0] = a0;
+      srcp -= OPSIZ;
+      dstp -= OPSIZ;
+
+      len -= 8;
+    }
+  while (len != 0);
+
+  /* This is the right position for do0.  Please don't move
+     it into the loop.  */
+ do0:
+  ((op_t *) dstp)[0] = a1;
+}
+
+/* _wordcopy_bwd_dest_aligned -- Copy block finishing right
+   before SRCP to block finishing right before DSTP with LEN `op_t'
+   words (not LEN bytes!).  DSTP should be aligned for memory
+   operations on `op_t', but SRCP must *not* be aligned.  */
+
+void
+_wordcopy_bwd_dest_aligned (dstp, srcp, len)
+     long int dstp;
+     long int srcp;
+     size_t len;
+{
+  void * srci;
+  op_t a0, a1, a2, a3;
+  op_t b0, b1, b2, b3;
+
+  /* Save the initial source pointer so we know the number of bytes to
+     shift for merging two unaligned results.  */
+  srci = (void *) srcp;
+
+  /* Make SRCP aligned by rounding it down to the beginning of the op_t
+     it points in the middle of.  */
+  srcp &= -OPSIZ;
+  srcp += OPSIZ;
+
+  switch (len % 4)
+    {
+    case 2:
+      srcp -= 3 * OPSIZ;
+      dstp -= 1 * OPSIZ;
+      b2 = ((op_t *) srcp)[2];
+      b1 = a1 = ((op_t *) srcp)[1];
+      len += 2;
+      goto do1;
+    case 3:
+      srcp -= 3 * OPSIZ;
+      dstp -= 1 * OPSIZ;
+      b3 = ((op_t *) srcp)[2];
+      b2 = a2 = ((op_t *) srcp)[1];
+      len += 1;
+      goto do2;
+    case 0:
+      if (OP_T_THRES <= 3 * OPSIZ && len == 0)
+	return;
+      srcp -= 3 * OPSIZ;
+      dstp -= 1 * OPSIZ;
+      b0 = ((op_t *) srcp)[2];
+      b3 = a3 = ((op_t *) srcp)[1];
+      goto do3;
+    case 1:
+      srcp -= 3 * OPSIZ;
+      dstp -= 1 * OPSIZ;
+      b1 = ((op_t *) srcp)[2];
+      b0 = a0 = ((op_t *) srcp)[1];
+      len -= 1;
+      if (OP_T_THRES <= 3 * OPSIZ && len == 0)
+	goto do0;
+      goto do4;			/* No-op.  */
+    }
+
+  do
+    {
+    do4:
+      b3 = a3 = ((op_t *) srcp)[0];
+      a0 = DBLALIGN (a0, b1, srci);
+      ((op_t *) dstp)[0] = a0;
+      srcp -= OPSIZ;
+      dstp -= OPSIZ;
+    do3:
+      b2 = a2 = ((op_t *) srcp)[0];
+      a3 = DBLALIGN (a3, b0, srci);
+      ((op_t *) dstp)[0] = a3;
+      srcp -= OPSIZ;
+      dstp -= OPSIZ;
+    do2:
+      b1 = a1 = ((op_t *) srcp)[0];
+      a2 = DBLALIGN (a2, b3, srci);
+      ((op_t *) dstp)[0] = a2;
+      srcp -= OPSIZ;
+      dstp -= OPSIZ;
+    do1:
+      b0 = a0 = ((op_t *) srcp)[0];
+      a1 = DBLALIGN (a1, b2, srci);
+      ((op_t *) dstp)[0] = a1;
+      srcp -= OPSIZ;
+      dstp -= OPSIZ;
+
+      len -= 4;
+    }
+  while (len != 0);
+
+  /* This is the right position for do0.  Please don't move
+     it into the loop.  */
+ do0:
+  a0 = DBLALIGN (a0, b1, srci);
+  ((op_t *) dstp)[0] = a0;
+}

Modified: fsf/trunk/libc/sysdeps/ieee754/dbl-64/s_fma.c
==============================================================================
--- fsf/trunk/libc/sysdeps/ieee754/dbl-64/s_fma.c (original)
+++ fsf/trunk/libc/sysdeps/ieee754/dbl-64/s_fma.c Tue Nov  6 16:19:40 2012
@@ -114,8 +114,17 @@
 	{
 	  /* Similarly.
 	     If z exponent is very large and x and y exponents are
-	     very small, it doesn't matter if we don't adjust it.  */
-	  if (u.ieee.exponent > v.ieee.exponent)
+	     very small, adjust them up to avoid spurious underflows,
+	     rather than down.  */
+	  if (u.ieee.exponent + v.ieee.exponent
+	      <= IEEE754_DOUBLE_BIAS + DBL_MANT_DIG)
+	    {
+	      if (u.ieee.exponent > v.ieee.exponent)
+		u.ieee.exponent += 2 * DBL_MANT_DIG + 2;
+	      else
+		v.ieee.exponent += 2 * DBL_MANT_DIG + 2;
+	    }
+	  else if (u.ieee.exponent > v.ieee.exponent)
 	    {
 	      if (u.ieee.exponent > DBL_MANT_DIG)
 		u.ieee.exponent -= DBL_MANT_DIG;
@@ -145,15 +154,15 @@
 		  <= IEEE754_DOUBLE_BIAS + DBL_MANT_DIG) */
 	{
 	  if (u.ieee.exponent > v.ieee.exponent)
-	    u.ieee.exponent += 2 * DBL_MANT_DIG;
-	  else
-	    v.ieee.exponent += 2 * DBL_MANT_DIG;
-	  if (w.ieee.exponent <= 4 * DBL_MANT_DIG + 4)
+	    u.ieee.exponent += 2 * DBL_MANT_DIG + 2;
+	  else
+	    v.ieee.exponent += 2 * DBL_MANT_DIG + 2;
+	  if (w.ieee.exponent <= 4 * DBL_MANT_DIG + 6)
 	    {
 	      if (w.ieee.exponent)
-		w.ieee.exponent += 2 * DBL_MANT_DIG;
+		w.ieee.exponent += 2 * DBL_MANT_DIG + 2;
 	      else
-		w.d *= 0x1p106;
+		w.d *= 0x1p108;
 	      adjust = -1;
 	    }
 	  /* Otherwise x * y should just affect inexact
@@ -238,19 +247,19 @@
       /* If a1 + u.d is exact, the only rounding happens during
 	 scaling down.  */
       if (j == 0)
-	return v.d * 0x1p-106;
+	return v.d * 0x1p-108;
       /* If result rounded to zero is not subnormal, no double
 	 rounding will occur.  */
-      if (v.ieee.exponent > 106)
-	return (a1 + u.d) * 0x1p-106;
-      /* If v.d * 0x1p-106 with round to zero is a subnormal above
-	 or equal to DBL_MIN / 2, then v.d * 0x1p-106 shifts mantissa
+      if (v.ieee.exponent > 108)
+	return (a1 + u.d) * 0x1p-108;
+      /* If v.d * 0x1p-108 with round to zero is a subnormal above
+	 or equal to DBL_MIN / 2, then v.d * 0x1p-108 shifts mantissa
 	 down just by 1 bit, which means v.ieee.mantissa1 |= j would
 	 change the round bit, not sticky or guard bit.
-	 v.d * 0x1p-106 never normalizes by shifting up,
+	 v.d * 0x1p-108 never normalizes by shifting up,
 	 so round bit plus sticky bit should be already enough
 	 for proper rounding.  */
-      if (v.ieee.exponent == 106)
+      if (v.ieee.exponent == 108)
 	{
 	  /* If the exponent would be in the normal range when
 	     rounding to normal precision with unbounded exponent
@@ -260,8 +269,8 @@
 	  if (TININESS_AFTER_ROUNDING)
 	    {
 	      w.d = a1 + u.d;
-	      if (w.ieee.exponent == 107)
-		return w.d * 0x1p-106;
+	      if (w.ieee.exponent == 109)
+		return w.d * 0x1p-108;
 	    }
 	  /* v.ieee.mantissa1 & 2 is LSB bit of the result before rounding,
 	     v.ieee.mantissa1 & 1 is the round bit and j is our sticky
@@ -270,12 +279,12 @@
 	  w.ieee.mantissa1 = ((v.ieee.mantissa1 & 3) << 1) | j;
 	  w.ieee.negative = v.ieee.negative;
 	  v.ieee.mantissa1 &= ~3U;
-	  v.d *= 0x1p-106;
+	  v.d *= 0x1p-108;
 	  w.d *= 0x1p-2;
 	  return v.d + w.d;
 	}
       v.ieee.mantissa1 |= j;
-      return v.d * 0x1p-106;
+      return v.d * 0x1p-108;
     }
 }
 #ifndef __fma

Modified: fsf/trunk/libc/sysdeps/ieee754/ldbl-128/s_fmal.c
==============================================================================
--- fsf/trunk/libc/sysdeps/ieee754/ldbl-128/s_fmal.c (original)
+++ fsf/trunk/libc/sysdeps/ieee754/ldbl-128/s_fmal.c Tue Nov  6 16:19:40 2012
@@ -118,8 +118,17 @@
 	{
 	  /* Similarly.
 	     If z exponent is very large and x and y exponents are
-	     very small, it doesn't matter if we don't adjust it.  */
-	  if (u.ieee.exponent > v.ieee.exponent)
+	     very small, adjust them up to avoid spurious underflows,
+	     rather than down.  */
+	  if (u.ieee.exponent + v.ieee.exponent
+	      <= IEEE854_LONG_DOUBLE_BIAS + LDBL_MANT_DIG)
+	    {
+	      if (u.ieee.exponent > v.ieee.exponent)
+		u.ieee.exponent += 2 * LDBL_MANT_DIG + 2;
+	      else
+		v.ieee.exponent += 2 * LDBL_MANT_DIG + 2;
+	    }
+	  else if (u.ieee.exponent > v.ieee.exponent)
 	    {
 	      if (u.ieee.exponent > LDBL_MANT_DIG)
 		u.ieee.exponent -= LDBL_MANT_DIG;
@@ -149,15 +158,15 @@
 		  <= IEEE854_LONG_DOUBLE_BIAS + LDBL_MANT_DIG) */
 	{
 	  if (u.ieee.exponent > v.ieee.exponent)
-	    u.ieee.exponent += 2 * LDBL_MANT_DIG;
-	  else
-	    v.ieee.exponent += 2 * LDBL_MANT_DIG;
-	  if (w.ieee.exponent <= 4 * LDBL_MANT_DIG + 4)
+	    u.ieee.exponent += 2 * LDBL_MANT_DIG + 2;
+	  else
+	    v.ieee.exponent += 2 * LDBL_MANT_DIG + 2;
+	  if (w.ieee.exponent <= 4 * LDBL_MANT_DIG + 6)
 	    {
 	      if (w.ieee.exponent)
-		w.ieee.exponent += 2 * LDBL_MANT_DIG;
+		w.ieee.exponent += 2 * LDBL_MANT_DIG + 2;
 	      else
-		w.d *= 0x1p226L;
+		w.d *= 0x1p228L;
 	      adjust = -1;
 	    }
 	  /* Otherwise x * y should just affect inexact
@@ -242,19 +251,19 @@
       /* If a1 + u.d is exact, the only rounding happens during
 	 scaling down.  */
       if (j == 0)
-	return v.d * 0x1p-226L;
+	return v.d * 0x1p-228L;
       /* If result rounded to zero is not subnormal, no double
 	 rounding will occur.  */
-      if (v.ieee.exponent > 226)
-	return (a1 + u.d) * 0x1p-226L;
-      /* If v.d * 0x1p-226L with round to zero is a subnormal above
-	 or equal to LDBL_MIN / 2, then v.d * 0x1p-226L shifts mantissa
+      if (v.ieee.exponent > 228)
+	return (a1 + u.d) * 0x1p-228L;
+      /* If v.d * 0x1p-228L with round to zero is a subnormal above
+	 or equal to LDBL_MIN / 2, then v.d * 0x1p-228L shifts mantissa
 	 down just by 1 bit, which means v.ieee.mantissa3 |= j would
 	 change the round bit, not sticky or guard bit.
-	 v.d * 0x1p-226L never normalizes by shifting up,
+	 v.d * 0x1p-228L never normalizes by shifting up,
 	 so round bit plus sticky bit should be already enough
 	 for proper rounding.  */
-      if (v.ieee.exponent == 226)
+      if (v.ieee.exponent == 228)
 	{
 	  /* If the exponent would be in the normal range when
 	     rounding to normal precision with unbounded exponent
@@ -264,8 +273,8 @@
 	  if (TININESS_AFTER_ROUNDING)
 	    {
 	      w.d = a1 + u.d;
-	      if (w.ieee.exponent == 227)
-		return w.d * 0x1p-226L;
+	      if (w.ieee.exponent == 229)
+		return w.d * 0x1p-228L;
 	    }
 	  /* v.ieee.mantissa3 & 2 is LSB bit of the result before rounding,
 	     v.ieee.mantissa3 & 1 is the round bit and j is our sticky
@@ -274,12 +283,12 @@
 	  w.ieee.mantissa3 = ((v.ieee.mantissa3 & 3) << 1) | j;
 	  w.ieee.negative = v.ieee.negative;
 	  v.ieee.mantissa3 &= ~3U;
-	  v.d *= 0x1p-226L;
+	  v.d *= 0x1p-228L;
 	  w.d *= 0x1p-2L;
 	  return v.d + w.d;
 	}
       v.ieee.mantissa3 |= j;
-      return v.d * 0x1p-226L;
+      return v.d * 0x1p-228L;
     }
 }
 weak_alias (__fmal, fmal)

Modified: fsf/trunk/libc/sysdeps/ieee754/ldbl-96/s_fmal.c
==============================================================================
--- fsf/trunk/libc/sysdeps/ieee754/ldbl-96/s_fmal.c (original)
+++ fsf/trunk/libc/sysdeps/ieee754/ldbl-96/s_fmal.c Tue Nov  6 16:19:40 2012
@@ -116,8 +116,17 @@
 	{
 	  /* Similarly.
 	     If z exponent is very large and x and y exponents are
-	     very small, it doesn't matter if we don't adjust it.  */
-	  if (u.ieee.exponent > v.ieee.exponent)
+	     very small, adjust them up to avoid spurious underflows,
+	     rather than down.  */
+	  if (u.ieee.exponent + v.ieee.exponent
+	      <= IEEE854_LONG_DOUBLE_BIAS + LDBL_MANT_DIG)
+	    {
+	      if (u.ieee.exponent > v.ieee.exponent)
+		u.ieee.exponent += 2 * LDBL_MANT_DIG + 2;
+	      else
+		v.ieee.exponent += 2 * LDBL_MANT_DIG + 2;
+	    }
+	  else if (u.ieee.exponent > v.ieee.exponent)
 	    {
 	      if (u.ieee.exponent > LDBL_MANT_DIG)
 		u.ieee.exponent -= LDBL_MANT_DIG;
@@ -147,15 +156,15 @@
 		  <= IEEE854_LONG_DOUBLE_BIAS + LDBL_MANT_DIG) */
 	{
 	  if (u.ieee.exponent > v.ieee.exponent)
-	    u.ieee.exponent += 2 * LDBL_MANT_DIG;
-	  else
-	    v.ieee.exponent += 2 * LDBL_MANT_DIG;
-	  if (w.ieee.exponent <= 4 * LDBL_MANT_DIG + 4)
+	    u.ieee.exponent += 2 * LDBL_MANT_DIG + 2;
+	  else
+	    v.ieee.exponent += 2 * LDBL_MANT_DIG + 2;
+	  if (w.ieee.exponent <= 4 * LDBL_MANT_DIG + 6)
 	    {
 	      if (w.ieee.exponent)
-		w.ieee.exponent += 2 * LDBL_MANT_DIG;
+		w.ieee.exponent += 2 * LDBL_MANT_DIG + 2;
 	      else
-		w.d *= 0x1p128L;
+		w.d *= 0x1p130L;
 	      adjust = -1;
 	    }
 	  /* Otherwise x * y should just affect inexact
@@ -240,19 +249,19 @@
       /* If a1 + u.d is exact, the only rounding happens during
 	 scaling down.  */
       if (j == 0)
-	return v.d * 0x1p-128L;
+	return v.d * 0x1p-130L;
       /* If result rounded to zero is not subnormal, no double
 	 rounding will occur.  */
-      if (v.ieee.exponent > 128)
-	return (a1 + u.d) * 0x1p-128L;
-      /* If v.d * 0x1p-128L with round to zero is a subnormal above
-	 or equal to LDBL_MIN / 2, then v.d * 0x1p-128L shifts mantissa
+      if (v.ieee.exponent > 130)
+	return (a1 + u.d) * 0x1p-130L;
+      /* If v.d * 0x1p-130L with round to zero is a subnormal above
+	 or equal to LDBL_MIN / 2, then v.d * 0x1p-130L shifts mantissa
 	 down just by 1 bit, which means v.ieee.mantissa1 |= j would
 	 change the round bit, not sticky or guard bit.
-	 v.d * 0x1p-128L never normalizes by shifting up,
+	 v.d * 0x1p-130L never normalizes by shifting up,
 	 so round bit plus sticky bit should be already enough
 	 for proper rounding.  */
-      if (v.ieee.exponent == 128)
+      if (v.ieee.exponent == 130)
 	{
 	  /* If the exponent would be in the normal range when
 	     rounding to normal precision with unbounded exponent
@@ -262,8 +271,8 @@
 	  if (TININESS_AFTER_ROUNDING)
 	    {
 	      w.d = a1 + u.d;
-	      if (w.ieee.exponent == 129)
-		return w.d * 0x1p-128L;
+	      if (w.ieee.exponent == 131)
+		return w.d * 0x1p-130L;
 	    }
 	  /* v.ieee.mantissa1 & 2 is LSB bit of the result before rounding,
 	     v.ieee.mantissa1 & 1 is the round bit and j is our sticky
@@ -272,12 +281,12 @@
 	  w.ieee.mantissa1 = ((v.ieee.mantissa1 & 3) << 1) | j;
 	  w.ieee.negative = v.ieee.negative;
 	  v.ieee.mantissa1 &= ~3U;
-	  v.d *= 0x1p-128L;
+	  v.d *= 0x1p-130L;
 	  w.d *= 0x1p-2L;
 	  return v.d + w.d;
 	}
       v.ieee.mantissa1 |= j;
-      return v.d * 0x1p-128L;
+      return v.d * 0x1p-130L;
     }
 }
 weak_alias (__fmal, fmal)

_______________________________________________
Commits mailing list
Commits@xxxxxxxxxx
http://eglibc.org/cgi-bin/mailman/listinfo/commits