[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Commits] r23322 - in /fsf/trunk/libc: ./ elf/ ports/ ports/sysdeps/arm/ ports/sysdeps/arm/armv7/multiarch/



Author: eglibc
Date: Wed Jun 19 00:02:07 2013
New Revision: 23322

Log:
Import glibc-mainline for 2013-06-19

Added:
    fsf/trunk/libc/ports/sysdeps/arm/test-fpucw.c
Modified:
    fsf/trunk/libc/ChangeLog
    fsf/trunk/libc/elf/rtld-Rules
    fsf/trunk/libc/ports/ChangeLog.arm
    fsf/trunk/libc/ports/sysdeps/arm/arm-features.h
    fsf/trunk/libc/ports/sysdeps/arm/arm-mcount.S
    fsf/trunk/libc/ports/sysdeps/arm/armv7/multiarch/ifunc-impl-list.c
    fsf/trunk/libc/ports/sysdeps/arm/armv7/multiarch/memcpy_impl.S
    fsf/trunk/libc/ports/sysdeps/arm/fpu_control.h

Modified: fsf/trunk/libc/ChangeLog
==============================================================================
--- fsf/trunk/libc/ChangeLog (original)
+++ fsf/trunk/libc/ChangeLog Wed Jun 19 00:02:07 2013
@@ -1,3 +1,9 @@
+2013-06-18  Roland McGrath  <roland@xxxxxxxxxxxxx>
+
+	* elf/rtld-Rules (rtld-compile-command.S): New variable.
+	(rtld-compile-command.s, rtld-compile-command.c): New variables.
+	($(objpfx)rtld-%.os rules): Use them.
+
 2013-06-17  Adhemerval Zanella  <azanella@xxxxxxxxxxxxxxxxxx>
 
 	* nptl/sysdeps/powerpc/tls.h (tcbhead_t): Add Event-Based Branch

Modified: fsf/trunk/libc/elf/rtld-Rules
==============================================================================
--- fsf/trunk/libc/elf/rtld-Rules (original)
+++ fsf/trunk/libc/elf/rtld-Rules Wed Jun 19 00:02:07 2013
@@ -88,29 +88,39 @@
 # Some other subdir's Makefile has provided all its normal rules,
 # and we just provide some additional definitions.
 
+rtld-compile-command.S = $(compile-command.S) $(rtld-CPPFLAGS)
+rtld-compile-command.s = $(compile-command.s) $(rtld-CPPFLAGS)
+rtld-compile-command.c = $(compile-command.c) $(rtld-CPPFLAGS)
+
 # These are the basic compilation rules corresponding to the Makerules ones.
 # The sysd-rules generated makefile already defines pattern rules for rtld-%
 # targets built from sysdeps source files.
 $(objpfx)rtld-%.os: rtld-%.S $(before-compile)
-	$(compile-command.S) $(rtld-CPPFLAGS)
+	$(rtld-compile-command.S)
 $(objpfx)rtld-%.os: rtld-%.s $(before-compile)
-	$(compile-command.s) $(rtld-CPPFLAGS)
+	$(rtld-compile-command.s)
 $(objpfx)rtld-%.os: rtld-%.c $(before-compile)
-	$(compile-command.c) $(rtld-CPPFLAGS)
+	$(rtld-compile-command.c)
 $(objpfx)rtld-%.os: %.S $(before-compile)
-	$(compile-command.S) $(rtld-CPPFLAGS)
+	$(rtld-compile-command.S)
 $(objpfx)rtld-%.os: %.s $(before-compile)
-	$(compile-command.s) $(rtld-CPPFLAGS)
+	$(rtld-compile-command.s)
 $(objpfx)rtld-%.os: %.c $(before-compile)
-	$(compile-command.c) $(rtld-CPPFLAGS)
+	$(rtld-compile-command.c)
 
 # The rules for generated source files.
-$(objpfx)rtld-%.os: $(objpfx)rtld-%.S $(before-compile); $(compile-command.S)
-$(objpfx)rtld-%.os: $(objpfx)rtld-%.s $(before-compile); $(compile-command.s)
-$(objpfx)rtld-%.os: $(objpfx)rtld-%.c $(before-compile); $(compile-command.c)
-$(objpfx)rtld-%.os: $(objpfx)%.S $(before-compile); $(compile-command.S)
-$(objpfx)rtld-%.os: $(objpfx)%.s $(before-compile); $(compile-command.s)
-$(objpfx)rtld-%.os: $(objpfx)%.c $(before-compile); $(compile-command.c)
+$(objpfx)rtld-%.os: $(objpfx)rtld-%.S $(before-compile)
+	$(rtld-compile-command.S)
+$(objpfx)rtld-%.os: $(objpfx)rtld-%.s $(before-compile)
+	$(rtld-compile-command.s)
+$(objpfx)rtld-%.os: $(objpfx)rtld-%.c $(before-compile)
+	$(rtld-compile-command.c)
+$(objpfx)rtld-%.os: $(objpfx)%.S $(before-compile)
+	$(rtld-compile-command.S)
+$(objpfx)rtld-%.os: $(objpfx)%.s $(before-compile)
+	$(rtld-compile-command.s)
+$(objpfx)rtld-%.os: $(objpfx)%.c $(before-compile)
+	$(rtld-compile-command.c)
 
 # The command line setting of rtld-modules (see above) tells us
 # what we need to build, and that tells us what dependency files we need.

Modified: fsf/trunk/libc/ports/ChangeLog.arm
==============================================================================
--- fsf/trunk/libc/ports/ChangeLog.arm (original)
+++ fsf/trunk/libc/ports/ChangeLog.arm Wed Jun 19 00:02:07 2013
@@ -1,3 +1,30 @@
+2013-06-18  Roland McGrath  <roland@xxxxxxxxxxxxx>
+
+	* sysdeps/arm/arm-mcount.S: Comment typo fix.
+
+	* sysdeps/arm/arm-features.h (ARM_BX_NINSNS): New macro.
+	* sysdeps/arm/armv7/multiarch/memcpy_impl.S: Macroize the
+	computed-jump dispatch sections.  Use sfi_breg throughout.
+	[ARM_ALWAYS_BX]: Define a different version of the dispatch macros
+	that uses bx rather than add-to-pc, and respects ARM_BX_ALIGN_LOG2.
+	[!USE_NEON] (D_l, D_h): Use r10, r11 rather than r8, r9.
+	(tmp2): Use r8 rather than r10.
+
+	* sysdeps/arm/armv7/multiarch/ifunc-impl-list.c (__libc_ifunc_impl_list)
+	[__ARM_NEON__]: Do not refer to HWCAP_ARM_NEON.
+	[!__SOFTFP__]: Do not refer to HWCAP_ARM_VFP.
+
+2013-06-18  Joseph Myers  <joseph@xxxxxxxxxxxxxxxx>
+
+	* sysdeps/arm/fpu_control.h [!(_LIBC && !_LIBC_TEST) &&
+	__SOFTFP__] (_FPU_GETCW): Define to (cw) = 0.
+	[!(_LIBC && !_LIBC_TEST) && __SOFTFP__] (_FPU_SETCW): Define to
+	(void) (cw).
+
+	* sysdeps/arm/fpu_control.h [!_LIBC && __SOFTFP__]: Change
+	condition to [!(_LIBC && !_LIBC_TEST) && __SOFTFP__].
+	* sysdeps/arm/test-fpucw.c: New file.
+
 2013-06-17  Joseph Myers  <joseph@xxxxxxxxxxxxxxxx>
 
 	[BZ #14907]

Modified: fsf/trunk/libc/ports/sysdeps/arm/arm-features.h
==============================================================================
--- fsf/trunk/libc/ports/sysdeps/arm/arm-features.h (original)
+++ fsf/trunk/libc/ports/sysdeps/arm/arm-features.h Wed Jun 19 00:02:07 2013
@@ -53,6 +53,14 @@
 # define ARM_BX_ALIGN_LOG2	2
 #endif
 
+/* The number of instructions that 'bx' expands to.  A more-specific
+   arm-features.h that defines 'bx' as a macro should define this to the
+   number instructions it expands to.  This is used only in a context
+   where the 'bx' expansion won't cross an ARM_BX_ALIGN_LOG2 boundary.  */
+#ifndef ARM_BX_NINSNS
+# define ARM_BX_NINSNS		1
+#endif
+
 /* An OS-specific arm-features.h file may define ARM_NO_INDEX_REGISTER to
    indicate that the two-register addressing modes must never be used.  */
 

Modified: fsf/trunk/libc/ports/sysdeps/arm/arm-mcount.S
==============================================================================
--- fsf/trunk/libc/ports/sysdeps/arm/arm-mcount.S (original)
+++ fsf/trunk/libc/ports/sysdeps/arm/arm-mcount.S Wed Jun 19 00:02:07 2013
@@ -39,7 +39,7 @@
    The calling sequence looks something like:
 func:
    push {lr}
-   bl __gnu_mount_nc
+   bl __gnu_mcount_nc
    <function body>
 */
 

Modified: fsf/trunk/libc/ports/sysdeps/arm/armv7/multiarch/ifunc-impl-list.c
==============================================================================
--- fsf/trunk/libc/ports/sysdeps/arm/armv7/multiarch/ifunc-impl-list.c (original)
+++ fsf/trunk/libc/ports/sysdeps/arm/armv7/multiarch/ifunc-impl-list.c Wed Jun 19 00:02:07 2013
@@ -16,6 +16,7 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
+#include <stdbool.h>
 #include <string.h>
 #include <ldsodefs.h>
 #include <sysdep.h>
@@ -29,21 +30,25 @@
 			size_t max)
 {
   size_t i = 0;
-  int hwcap;
 
-  hwcap = GLRO(dl_hwcap);
+  bool use_neon = true;
+#ifdef __ARM_NEON__
+# define __memcpy_neon	memcpy
+#else
+  use_neon = (GLRO(dl_hwcap) & HWCAP_ARM_NEON) != 0;
+#endif
+
+#ifndef __ARM_NEON__
+  bool use_vfp = true;
+# ifdef __SOFTFP__
+  use_vfp = (GLRO(dl_hwcap) & HWCAP_ARM_VFP) != 0;
+# endif
+#endif
 
   IFUNC_IMPL (i, name, memcpy,
-	      IFUNC_IMPL_ADD (array, i, memcpy, hwcap & HWCAP_ARM_NEON,
-#ifdef __ARM_NEON__
-                              memcpy
-#else
-			      __memcpy_neon
-#endif
-                              )
+	      IFUNC_IMPL_ADD (array, i, memcpy, use_neon, __memcpy_neon)
 #ifndef __ARM_NEON__
-	      IFUNC_IMPL_ADD (array, i, memcpy, hwcap & HWCAP_ARM_VFP,
-			      __memcpy_vfp)
+	      IFUNC_IMPL_ADD (array, i, memcpy, use_vfp, __memcpy_vfp)
 #endif
 	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_arm));
 

Modified: fsf/trunk/libc/ports/sysdeps/arm/armv7/multiarch/memcpy_impl.S
==============================================================================
--- fsf/trunk/libc/ports/sysdeps/arm/armv7/multiarch/memcpy_impl.S (original)
+++ fsf/trunk/libc/ports/sysdeps/arm/armv7/multiarch/memcpy_impl.S Wed Jun 19 00:02:07 2013
@@ -33,6 +33,7 @@
 #define NO_THUMB
 #endif
 #include <sysdep.h>
+#include <arm-features.h>
 
 	.syntax unified
 	/* This implementation requires ARM state.  */
@@ -71,7 +72,139 @@
 /* Locals.  */
 #define tmp1	r3
 #define dst	ip
-#define tmp2	r10
+#define tmp2	r8
+
+/* These two macros both work by repeated invocation of the macro
+   dispatch_step (not defined here).  That macro performs one "step",
+   doing one load instruction and one store instruction to copy one
+   "unit".  On entry, TMP1 contains the number of bytes to be copied,
+   a multiple of the unit size.  The macro clobbers TMP1 in the
+   process of doing a computed jump to the tail containing the
+   appropriate number of steps.
+
+   In dispatch_7_dword, dispatch_step is invoked seven times, with an
+   argument that is 7 for the first and 1 for the last.  Units are
+   double-words (8 bytes).  TMP1 is at most 56.
+
+   In dispatch_15_word, dispatch_step is invoked fifteen times,
+   with an argument that is 15 for the first and 1 for the last.
+   Units are words (4 bytes).  TMP1 is at most 60.  */
+
+#ifndef ARM_ALWAYS_BX
+# if ARM_BX_ALIGN_LOG2 != 2
+#  error case not handled
+# endif
+	.macro dispatch_7_dword
+	rsb	tmp1, tmp1, #((7 * 8) - PC_OFS + INSN_SIZE)
+	add	pc, pc, tmp1
+	dispatch_step 7
+	dispatch_step 6
+	dispatch_step 5
+	dispatch_step 4
+	dispatch_step 3
+	dispatch_step 2
+	dispatch_step 1
+	.purgem dispatch_step
+	.endm
+
+	.macro dispatch_15_word
+	rsb	tmp1, tmp1, #((15 * 4) - PC_OFS/2 + INSN_SIZE/2)
+	add	pc, pc, tmp1, lsl #1
+	dispatch_step 15
+	dispatch_step 14
+	dispatch_step 13
+	dispatch_step 12
+	dispatch_step 11
+	dispatch_step 10
+	dispatch_step 9
+	dispatch_step 8
+	dispatch_step 7
+	dispatch_step 6
+	dispatch_step 5
+	dispatch_step 4
+	dispatch_step 3
+	dispatch_step 2
+	dispatch_step 1
+	.purgem dispatch_step
+	.endm
+#else
+# if ARM_BX_ALIGN_LOG2 < 4
+#  error case not handled
+# endif
+	.macro dispatch_helper steps, log2_bytes_per_step
+	.p2align ARM_BX_ALIGN_LOG2
+	/* TMP1 gets (max_bytes - bytes_to_copy), where max_bytes is
+	   (STEPS << LOG2_BYTES_PER_STEP).
+	   So this is (steps_to_skip << LOG2_BYTES_PER_STEP).  */
+	rsb	tmp1, tmp1, #(\steps << \log2_bytes_per_step)
+	/* Pad so that the add;bx pair immediately precedes an alignment
+	   boundary.  Hence, TMP1=0 will run all the steps.  */
+	.rept (1 << (ARM_BX_ALIGN_LOG2 - 2)) - (2 + ARM_BX_NINSNS)
+	nop
+	.endr
+	/* Shifting down LOG2_BYTES_PER_STEP gives us the number of
+	   steps to skip, then shifting up ARM_BX_ALIGN_LOG2 gives us
+	   the (byte) distance to add to the PC.  */
+	add	tmp1, pc, tmp1, lsl #(ARM_BX_ALIGN_LOG2 - \log2_bytes_per_step)
+	bx	tmp1
+	.endm
+
+	.macro dispatch_7_dword
+	dispatch_helper 7, 3
+	.p2align ARM_BX_ALIGN_LOG2
+	dispatch_step 7
+	.p2align ARM_BX_ALIGN_LOG2
+	dispatch_step 6
+	.p2align ARM_BX_ALIGN_LOG2
+	dispatch_step 5
+	.p2align ARM_BX_ALIGN_LOG2
+	dispatch_step 4
+	.p2align ARM_BX_ALIGN_LOG2
+	dispatch_step 3
+	.p2align ARM_BX_ALIGN_LOG2
+	dispatch_step 2
+	.p2align ARM_BX_ALIGN_LOG2
+	dispatch_step 1
+	.p2align ARM_BX_ALIGN_LOG2
+	.purgem dispatch_step
+	.endm
+
+	.macro dispatch_15_word
+	dispatch_helper 15, 2
+	dispatch_step 15
+	.p2align ARM_BX_ALIGN_LOG2
+	dispatch_step 14
+	.p2align ARM_BX_ALIGN_LOG2
+	dispatch_step 13
+	.p2align ARM_BX_ALIGN_LOG2
+	dispatch_step 12
+	.p2align ARM_BX_ALIGN_LOG2
+	dispatch_step 11
+	.p2align ARM_BX_ALIGN_LOG2
+	dispatch_step 10
+	.p2align ARM_BX_ALIGN_LOG2
+	dispatch_step 9
+	.p2align ARM_BX_ALIGN_LOG2
+	dispatch_step 8
+	.p2align ARM_BX_ALIGN_LOG2
+	dispatch_step 7
+	.p2align ARM_BX_ALIGN_LOG2
+	dispatch_step 6
+	.p2align ARM_BX_ALIGN_LOG2
+	dispatch_step 5
+	.p2align ARM_BX_ALIGN_LOG2
+	dispatch_step 4
+	.p2align ARM_BX_ALIGN_LOG2
+	dispatch_step 3
+	.p2align ARM_BX_ALIGN_LOG2
+	dispatch_step 2
+	.p2align ARM_BX_ALIGN_LOG2
+	dispatch_step 1
+	.p2align ARM_BX_ALIGN_LOG2
+	.purgem dispatch_step
+	.endm
+
+#endif
 
 #ifndef USE_NEON
 /* For bulk copies using GP registers.  */
@@ -81,8 +214,9 @@
 #define	B_h	r5
 #define	C_l	r6
 #define	C_h	r7
-#define	D_l	r8
-#define	D_h	r9
+/* Don't use the pair r8,r9 because in some EABI variants r9 is reserved.  */
+#define	D_l	r10
+#define	D_h	r11
 #endif
 
 /* Number of lines ahead to pre-fetch data.  If you change this the code
@@ -92,40 +226,71 @@
 
 #ifdef USE_VFP
 	.macro	cpy_line_vfp vreg, base
-	vstr	\vreg, [dst, #\base]
-	vldr	\vreg, [src, #\base]
-	vstr	d0, [dst, #\base + 8]
-	vldr	d0, [src, #\base + 8]
-	vstr	d1, [dst, #\base + 16]
-	vldr	d1, [src, #\base + 16]
-	vstr	d2, [dst, #\base + 24]
-	vldr	d2, [src, #\base + 24]
-	vstr	\vreg, [dst, #\base + 32]
-	vldr	\vreg, [src, #\base + prefetch_lines * 64 - 32]
-	vstr	d0, [dst, #\base + 40]
-	vldr	d0, [src, #\base + 40]
-	vstr	d1, [dst, #\base + 48]
-	vldr	d1, [src, #\base + 48]
-	vstr	d2, [dst, #\base + 56]
-	vldr	d2, [src, #\base + 56]
+	sfi_breg dst, \
+	vstr	\vreg, [\B, #\base]
+	sfi_breg src, \
+	vldr	\vreg, [\B, #\base]
+	sfi_breg dst, \
+	vstr	d0, [\B, #\base + 8]
+	sfi_breg src, \
+	vldr	d0, [\B, #\base + 8]
+	sfi_breg dst, \
+	vstr	d1, [\B, #\base + 16]
+	sfi_breg src, \
+	vldr	d1, [\B, #\base + 16]
+	sfi_breg dst, \
+	vstr	d2, [\B, #\base + 24]
+	sfi_breg src, \
+	vldr	d2, [\B, #\base + 24]
+	sfi_breg dst, \
+	vstr	\vreg, [\B, #\base + 32]
+	sfi_breg src, \
+	vldr	\vreg, [\B, #\base + prefetch_lines * 64 - 32]
+	sfi_breg dst, \
+	vstr	d0, [\B, #\base + 40]
+	sfi_breg src, \
+	vldr	d0, [\B, #\base + 40]
+	sfi_breg dst, \
+	vstr	d1, [\B, #\base + 48]
+	sfi_breg src, \
+	vldr	d1, [\B, #\base + 48]
+	sfi_breg dst, \
+	vstr	d2, [\B, #\base + 56]
+	sfi_breg src, \
+	vldr	d2, [\B, #\base + 56]
 	.endm
 
 	.macro	cpy_tail_vfp vreg, base
-	vstr	\vreg, [dst, #\base]
-	vldr	\vreg, [src, #\base]
-	vstr	d0, [dst, #\base + 8]
-	vldr	d0, [src, #\base + 8]
-	vstr	d1, [dst, #\base + 16]
-	vldr	d1, [src, #\base + 16]
-	vstr	d2, [dst, #\base + 24]
-	vldr	d2, [src, #\base + 24]
-	vstr	\vreg, [dst, #\base + 32]
-	vstr	d0, [dst, #\base + 40]
-	vldr	d0, [src, #\base + 40]
-	vstr	d1, [dst, #\base + 48]
-	vldr	d1, [src, #\base + 48]
-	vstr	d2, [dst, #\base + 56]
-	vldr	d2, [src, #\base + 56]
+	sfi_breg dst, \
+	vstr	\vreg, [\B, #\base]
+	sfi_breg src, \
+	vldr	\vreg, [\B, #\base]
+	sfi_breg dst, \
+	vstr	d0, [\B, #\base + 8]
+	sfi_breg src, \
+	vldr	d0, [\B, #\base + 8]
+	sfi_breg dst, \
+	vstr	d1, [\B, #\base + 16]
+	sfi_breg src, \
+	vldr	d1, [\B, #\base + 16]
+	sfi_breg dst, \
+	vstr	d2, [\B, #\base + 24]
+	sfi_breg src, \
+	vldr	d2, [\B, #\base + 24]
+	sfi_breg dst, \
+	vstr	\vreg, [\B, #\base + 32]
+	sfi_breg dst, \
+	vstr	d0, [\B, #\base + 40]
+	sfi_breg src, \
+	vldr	d0, [\B, #\base + 40]
+	sfi_breg dst, \
+	vstr	d1, [\B, #\base + 48]
+	sfi_breg src, \
+	vldr	d1, [\B, #\base + 48]
+	sfi_breg dst, \
+	vstr	d2, [\B, #\base + 56]
+	sfi_breg src, \
+	vldr	d2, [\B, #\base + 56]
 	.endm
 #endif
 
@@ -140,81 +305,62 @@
 
 .Ltail63unaligned:
 #ifdef USE_NEON
+	/* These need an extra layer of macro just to work around a
+	   bug in the assembler's parser when an operand starts with
+	   a {...}.  http://sourceware.org/bugzilla/show_bug.cgi?id=15647
+	   tracks that bug; it was not fixed as of binutils-2.23.2.  */
+	.macro neon_load_d0 reg
+	vld1.8	{d0}, [\reg]!
+	.endm
+	.macro neon_store_d0 reg
+	vst1.8	{d0}, [\reg]!
+	.endm
+
+	/* These are used by the NaCl sfi_breg macro.  */
+	.macro _sfi_breg_dmask_neon_load_d0 reg
+	_sfi_dmask \reg
+	.endm
+	.macro _sfi_breg_dmask_neon_store_d0 reg
+	_sfi_dmask \reg
+	.endm
+
 	and	tmp1, count, #0x38
-	rsb	tmp1, tmp1, #(56 - PC_OFS + INSN_SIZE)
-	add	pc, pc, tmp1
-	vld1.8	{d0}, [src]!	/* 14 words to go.  */
-	vst1.8	{d0}, [dst]!
-	vld1.8	{d0}, [src]!	/* 12 words to go.  */
-	vst1.8	{d0}, [dst]!
-	vld1.8	{d0}, [src]!	/* 10 words to go.  */
-	vst1.8	{d0}, [dst]!
-	vld1.8	{d0}, [src]!	/* 8 words to go.  */
-	vst1.8	{d0}, [dst]!
-	vld1.8	{d0}, [src]!	/* 6 words to go.  */
-	vst1.8	{d0}, [dst]!
-	vld1.8	{d0}, [src]!	/* 4 words to go.  */
-	vst1.8	{d0}, [dst]!
-	vld1.8	{d0}, [src]!	/* 2 words to go.  */
-	vst1.8	{d0}, [dst]!
+	.macro dispatch_step i
+	sfi_breg src, neon_load_d0 \B
+	sfi_breg dst, neon_store_d0 \B
+	.endm
+	dispatch_7_dword
 
 	tst	count, #4
-	ldrne	tmp1, [src], #4
-	strne	tmp1, [dst], #4
+	sfi_breg src, \
+	ldrne	tmp1, [\B], #4
+	sfi_breg dst, \
+	strne	tmp1, [\B], #4
 #else
 	/* Copy up to 15 full words of data.  May not be aligned.  */
 	/* Cannot use VFP for unaligned data.  */
 	and	tmp1, count, #0x3c
 	add	dst, dst, tmp1
 	add	src, src, tmp1
-	rsb	tmp1, tmp1, #(60 - PC_OFS/2 + INSN_SIZE/2)
 	/* Jump directly into the sequence below at the correct offset.  */
-	add	pc, pc, tmp1, lsl #1
-
-	ldr	tmp1, [src, #-60]	/* 15 words to go.  */
-	str	tmp1, [dst, #-60]
-
-	ldr	tmp1, [src, #-56]	/* 14 words to go.  */
-	str	tmp1, [dst, #-56]
-	ldr	tmp1, [src, #-52]
-	str	tmp1, [dst, #-52]
-
-	ldr	tmp1, [src, #-48]	/* 12 words to go.  */
-	str	tmp1, [dst, #-48]
-	ldr	tmp1, [src, #-44]
-	str	tmp1, [dst, #-44]
-
-	ldr	tmp1, [src, #-40]	/* 10 words to go.  */
-	str	tmp1, [dst, #-40]
-	ldr	tmp1, [src, #-36]
-	str	tmp1, [dst, #-36]
-
-	ldr	tmp1, [src, #-32]	/* 8 words to go.  */
-	str	tmp1, [dst, #-32]
-	ldr	tmp1, [src, #-28]
-	str	tmp1, [dst, #-28]
-
-	ldr	tmp1, [src, #-24]	/* 6 words to go.  */
-	str	tmp1, [dst, #-24]
-	ldr	tmp1, [src, #-20]
-	str	tmp1, [dst, #-20]
-
-	ldr	tmp1, [src, #-16]	/* 4 words to go.  */
-	str	tmp1, [dst, #-16]
-	ldr	tmp1, [src, #-12]
-	str	tmp1, [dst, #-12]
-
-	ldr	tmp1, [src, #-8]	/* 2 words to go.  */
-	str	tmp1, [dst, #-8]
-	ldr	tmp1, [src, #-4]
-	str	tmp1, [dst, #-4]
+	.macro dispatch_step i
+	sfi_breg src, \
+	ldr	tmp1, [\B, #-(\i * 4)]
+	sfi_breg dst, \
+	str	tmp1, [\B, #-(\i * 4)]
+	.endm
+	dispatch_15_word
 #endif
 
 	lsls	count, count, #31
-	ldrhcs	tmp1, [src], #2
-	ldrbne	src, [src]		/* Src is dead, use as a scratch.  */
-	strhcs	tmp1, [dst], #2
-	strbne	src, [dst]
+	sfi_breg src, \
+	ldrhcs	tmp1, [\B], #2
+	sfi_breg src, \
+	ldrbne	src, [\B]		/* Src is dead, use as a scratch.  */
+	sfi_breg dst, \
+	strhcs	tmp1, [\B], #2
+	sfi_breg dst, \
+	strbne	src, [\B]
 	bx	lr
 
 .Lcpy_not_short:
@@ -242,13 +388,19 @@
 	beq	1f
 	rsbs	tmp2, tmp2, #0
 	sub	count, count, tmp2, lsr #29
-	ldrmi	tmp1, [src], #4
-	strmi	tmp1, [dst], #4
+	sfi_breg src, \
+	ldrmi	tmp1, [\B], #4
+	sfi_breg dst, \
+	strmi	tmp1, [\B], #4
 	lsls	tmp2, tmp2, #2
-	ldrhcs	tmp1, [src], #2
-	ldrbne	tmp2, [src], #1
-	strhcs	tmp1, [dst], #2
-	strbne	tmp2, [dst], #1
+	sfi_breg src, \
+	ldrhcs	tmp1, [\B], #2
+	sfi_breg src, \
+	ldrbne	tmp2, [\B], #1
+	sfi_breg dst, \
+	strhcs	tmp1, [\B], #2
+	sfi_breg dst, \
+	strbne	tmp2, [\B], #1
 
 1:
 	subs	tmp2, count, #64	/* Use tmp2 for count.  */
@@ -260,24 +412,40 @@
 .Lcpy_body_medium:			/* Count in tmp2.  */
 #ifdef USE_VFP
 1:
-	vldr	d0, [src, #0]
+	sfi_breg src, \
+	vldr	d0, [\B, #0]
 	subs	tmp2, tmp2, #64
-	vldr	d1, [src, #8]
-	vstr	d0, [dst, #0]
-	vldr	d0, [src, #16]
-	vstr	d1, [dst, #8]
-	vldr	d1, [src, #24]
-	vstr	d0, [dst, #16]
-	vldr	d0, [src, #32]
-	vstr	d1, [dst, #24]
-	vldr	d1, [src, #40]
-	vstr	d0, [dst, #32]
-	vldr	d0, [src, #48]
-	vstr	d1, [dst, #40]
-	vldr	d1, [src, #56]
-	vstr	d0, [dst, #48]
+	sfi_breg src, \
+	vldr	d1, [\B, #8]
+	sfi_breg dst, \
+	vstr	d0, [\B, #0]
+	sfi_breg src, \
+	vldr	d0, [\B, #16]
+	sfi_breg dst, \
+	vstr	d1, [\B, #8]
+	sfi_breg src, \
+	vldr	d1, [\B, #24]
+	sfi_breg dst, \
+	vstr	d0, [\B, #16]
+	sfi_breg src, \
+	vldr	d0, [\B, #32]
+	sfi_breg dst, \
+	vstr	d1, [\B, #24]
+	sfi_breg src, \
+	vldr	d1, [\B, #40]
+	sfi_breg dst, \
+	vstr	d0, [\B, #32]
+	sfi_breg src, \
+	vldr	d0, [\B, #48]
+	sfi_breg dst, \
+	vstr	d1, [\B, #40]
+	sfi_breg src, \
+	vldr	d1, [\B, #56]
+	sfi_breg dst, \
+	vstr	d0, [\B, #48]
 	add	src, src, #64
-	vstr	d1, [dst, #56]
+	sfi_breg dst, \
+	vstr	d1, [\B, #56]
 	add	dst, dst, #64
 	bge	1b
 	tst	tmp2, #0x3f
@@ -287,43 +455,49 @@
 	and	tmp1, tmp2, #0x38
 	add	dst, dst, tmp1
 	add	src, src, tmp1
-	rsb	tmp1, tmp1, #(56 - PC_OFS + INSN_SIZE)
-	add	pc, pc, tmp1
-
-	vldr	d0, [src, #-56]	/* 14 words to go.  */
-	vstr	d0, [dst, #-56]
-	vldr	d0, [src, #-48]	/* 12 words to go.  */
-	vstr	d0, [dst, #-48]
-	vldr	d0, [src, #-40]	/* 10 words to go.  */
-	vstr	d0, [dst, #-40]
-	vldr	d0, [src, #-32]	/* 8 words to go.  */
-	vstr	d0, [dst, #-32]
-	vldr	d0, [src, #-24]	/* 6 words to go.  */
-	vstr	d0, [dst, #-24]
-	vldr	d0, [src, #-16]	/* 4 words to go.  */
-	vstr	d0, [dst, #-16]
-	vldr	d0, [src, #-8]	/* 2 words to go.  */
-	vstr	d0, [dst, #-8]
+	.macro dispatch_step i
+	sfi_breg src, \
+	vldr	d0, [\B, #-(\i * 8)]
+	sfi_breg dst, \
+	vstr	d0, [\B, #-(\i * 8)]
+	.endm
+	dispatch_7_dword
 #else
 	sub	src, src, #8
 	sub	dst, dst, #8
 1:
-	ldrd	A_l, A_h, [src, #8]
-	strd	A_l, A_h, [dst, #8]
-	ldrd	A_l, A_h, [src, #16]
-	strd	A_l, A_h, [dst, #16]
-	ldrd	A_l, A_h, [src, #24]
-	strd	A_l, A_h, [dst, #24]
-	ldrd	A_l, A_h, [src, #32]
-	strd	A_l, A_h, [dst, #32]
-	ldrd	A_l, A_h, [src, #40]
-	strd	A_l, A_h, [dst, #40]
-	ldrd	A_l, A_h, [src, #48]
-	strd	A_l, A_h, [dst, #48]
-	ldrd	A_l, A_h, [src, #56]
-	strd	A_l, A_h, [dst, #56]
-	ldrd	A_l, A_h, [src, #64]!
-	strd	A_l, A_h, [dst, #64]!
+	sfi_breg src, \
+	ldrd	A_l, A_h, [\B, #8]
+	sfi_breg dst, \
+	strd	A_l, A_h, [\B, #8]
+	sfi_breg src, \
+	ldrd	A_l, A_h, [\B, #16]
+	sfi_breg dst, \
+	strd	A_l, A_h, [\B, #16]
+	sfi_breg src, \
+	ldrd	A_l, A_h, [\B, #24]
+	sfi_breg dst, \
+	strd	A_l, A_h, [\B, #24]
+	sfi_breg src, \
+	ldrd	A_l, A_h, [\B, #32]
+	sfi_breg dst, \
+	strd	A_l, A_h, [\B, #32]
+	sfi_breg src, \
+	ldrd	A_l, A_h, [\B, #40]
+	sfi_breg dst, \
+	strd	A_l, A_h, [\B, #40]
+	sfi_breg src, \
+	ldrd	A_l, A_h, [\B, #48]
+	sfi_breg dst, \
+	strd	A_l, A_h, [\B, #48]
+	sfi_breg src, \
+	ldrd	A_l, A_h, [\B, #56]
+	sfi_breg dst, \
+	strd	A_l, A_h, [\B, #56]
+	sfi_breg src, \
+	ldrd	A_l, A_h, [\B, #64]!
+	sfi_breg dst, \
+	strd	A_l, A_h, [\B, #64]!
 	subs	tmp2, tmp2, #64
 	bge	1b
 	tst	tmp2, #0x3f
@@ -349,32 +523,29 @@
 	and	tmp1, tmp2, #0x38
 	add	dst, dst, tmp1
 	add	src, src, tmp1
-	rsb	tmp1, tmp1, #(56 - PC_OFS + INSN_SIZE)
-	add	pc, pc, tmp1
-	ldrd	A_l, A_h, [src, #-56]	/* 14 words to go.  */
-	strd	A_l, A_h, [dst, #-56]
-	ldrd	A_l, A_h, [src, #-48]	/* 12 words to go.  */
-	strd	A_l, A_h, [dst, #-48]
-	ldrd	A_l, A_h, [src, #-40]	/* 10 words to go.  */
-	strd	A_l, A_h, [dst, #-40]
-	ldrd	A_l, A_h, [src, #-32]	/* 8 words to go.  */
-	strd	A_l, A_h, [dst, #-32]
-	ldrd	A_l, A_h, [src, #-24]	/* 6 words to go.  */
-	strd	A_l, A_h, [dst, #-24]
-	ldrd	A_l, A_h, [src, #-16]	/* 4 words to go.  */
-	strd	A_l, A_h, [dst, #-16]
-	ldrd	A_l, A_h, [src, #-8]	/* 2 words to go.  */
-	strd	A_l, A_h, [dst, #-8]
-
+	.macro dispatch_step i
+	sfi_breg src, \
+	ldrd	A_l, A_h, [\B, #-(\i * 8)]
+	sfi_breg dst, \
+	strd	A_l, A_h, [\B, #-(\i * 8)]
+	.endm
+	dispatch_7_dword
 #endif
+
 	tst	tmp2, #4
-	ldrne	tmp1, [src], #4
-	strne	tmp1, [dst], #4
+	sfi_breg src, \
+	ldrne	tmp1, [\B], #4
+	sfi_breg dst, \
+	strne	tmp1, [\B], #4
 	lsls	tmp2, tmp2, #31		/* Count (tmp2) now dead. */
-	ldrhcs	tmp1, [src], #2
-	ldrbne	tmp2, [src]
-	strhcs	tmp1, [dst], #2
-	strbne	tmp2, [dst]
+	sfi_breg src, \
+	ldrhcs	tmp1, [\B], #2
+	sfi_breg src, \
+	ldrbne	tmp2, [\B]
+	sfi_breg dst, \
+	strhcs	tmp1, [\B], #2
+	sfi_breg dst, \
+	strbne	tmp2, [\B]
 
 .Ldone:
 	ldr	tmp2, [sp], #FRAME_SIZE
@@ -394,15 +565,23 @@
 	   copy position into a register.  This should act like a PLD
 	   operation but we won't have to repeat the transfer.  */
 
-	vldr	d3, [src, #0]
-	vldr	d4, [src, #64]
-	vldr	d5, [src, #128]
-	vldr	d6, [src, #192]
-	vldr	d7, [src, #256]
-
-	vldr	d0, [src, #8]
-	vldr	d1, [src, #16]
-	vldr	d2, [src, #24]
+	sfi_breg src, \
+	vldr	d3, [\B, #0]
+	sfi_breg src, \
+	vldr	d4, [\B, #64]
+	sfi_breg src, \
+	vldr	d5, [\B, #128]
+	sfi_breg src, \
+	vldr	d6, [\B, #192]
+	sfi_breg src, \
+	vldr	d7, [\B, #256]
+
+	sfi_breg src, \
+	vldr	d0, [\B, #8]
+	sfi_breg src, \
+	vldr	d1, [\B, #16]
+	sfi_breg src, \
+	vldr	d2, [\B, #24]
 	add	src, src, #32
 
 	subs	tmp2, tmp2, #prefetch_lines * 64 * 2
@@ -427,19 +606,31 @@
 	add	src, src, #3 * 64
 	add	dst, dst, #3 * 64
 	cpy_tail_vfp	d6, 0
-	vstr	d7, [dst, #64]
-	vldr	d7, [src, #64]
-	vstr	d0, [dst, #64 + 8]
-	vldr	d0, [src, #64 + 8]
-	vstr	d1, [dst, #64 + 16]
-	vldr	d1, [src, #64 + 16]
-	vstr	d2, [dst, #64 + 24]
-	vldr	d2, [src, #64 + 24]
-	vstr	d7, [dst, #64 + 32]
+	sfi_breg dst, \
+	vstr	d7, [\B, #64]
+	sfi_breg src, \
+	vldr	d7, [\B, #64]
+	sfi_breg dst, \
+	vstr	d0, [\B, #64 + 8]
+	sfi_breg src, \
+	vldr	d0, [\B, #64 + 8]
+	sfi_breg dst, \
+	vstr	d1, [\B, #64 + 16]
+	sfi_breg src, \
+	vldr	d1, [\B, #64 + 16]
+	sfi_breg dst, \
+	vstr	d2, [\B, #64 + 24]
+	sfi_breg src, \
+	vldr	d2, [\B, #64 + 24]
+	sfi_breg dst, \
+	vstr	d7, [\B, #64 + 32]
 	add	src, src, #96
-	vstr	d0, [dst, #64 + 40]
-	vstr	d1, [dst, #64 + 48]
-	vstr	d2, [dst, #64 + 56]
+	sfi_breg dst, \
+	vstr	d0, [\B, #64 + 40]
+	sfi_breg dst, \
+	vstr	d1, [\B, #64 + 48]
+	sfi_breg dst, \
+	vstr	d2, [\B, #64 + 56]
 	add	dst, dst, #128
 	add	tmp2, tmp2, #prefetch_lines * 64
 	b	.Lcpy_body_medium
@@ -450,59 +641,83 @@
 	/* Pre-bias src and dst.  */
 	sub	src, src, #8
 	sub	dst, dst, #8
-	pld	[src, #8]
-	pld	[src, #72]
+	sfi_pld	src, #8
+	sfi_pld	src, #72
 	subs	tmp2, tmp2, #64
-	pld	[src, #136]
-	ldrd	A_l, A_h, [src, #8]
+	sfi_pld	src, #136
+	sfi_breg src, \
+	ldrd	A_l, A_h, [\B, #8]
 	strd	B_l, B_h, [sp, #8]
 	cfi_rel_offset (B_l, 8)
 	cfi_rel_offset (B_h, 12)
-	ldrd	B_l, B_h, [src, #16]
+	sfi_breg src, \
+	ldrd	B_l, B_h, [\B, #16]
 	strd	C_l, C_h, [sp, #16]
 	cfi_rel_offset (C_l, 16)
 	cfi_rel_offset (C_h, 20)
-	ldrd	C_l, C_h, [src, #24]
+	sfi_breg src, \
+	ldrd	C_l, C_h, [\B, #24]
 	strd	D_l, D_h, [sp, #24]
 	cfi_rel_offset (D_l, 24)
 	cfi_rel_offset (D_h, 28)
-	pld	[src, #200]
-	ldrd	D_l, D_h, [src, #32]!
+	sfi_pld	src, #200
+	sfi_breg src, \
+	ldrd	D_l, D_h, [\B, #32]!
 	b	1f
 	.p2align	6
 2:
-	pld	[src, #232]
-	strd	A_l, A_h, [dst, #40]
-	ldrd	A_l, A_h, [src, #40]
-	strd	B_l, B_h, [dst, #48]
-	ldrd	B_l, B_h, [src, #48]
-	strd	C_l, C_h, [dst, #56]
-	ldrd	C_l, C_h, [src, #56]
-	strd	D_l, D_h, [dst, #64]!
-	ldrd	D_l, D_h, [src, #64]!
+	sfi_pld	src, #232
+	sfi_breg dst, \
+	strd	A_l, A_h, [\B, #40]
+	sfi_breg src, \
+	ldrd	A_l, A_h, [\B, #40]
+	sfi_breg dst, \
+	strd	B_l, B_h, [\B, #48]
+	sfi_breg src, \
+	ldrd	B_l, B_h, [\B, #48]
+	sfi_breg dst, \
+	strd	C_l, C_h, [\B, #56]
+	sfi_breg src, \
+	ldrd	C_l, C_h, [\B, #56]
+	sfi_breg dst, \
+	strd	D_l, D_h, [\B, #64]!
+	sfi_breg src, \
+	ldrd	D_l, D_h, [\B, #64]!
 	subs	tmp2, tmp2, #64
 1:
-	strd	A_l, A_h, [dst, #8]
-	ldrd	A_l, A_h, [src, #8]
-	strd	B_l, B_h, [dst, #16]
-	ldrd	B_l, B_h, [src, #16]
-	strd	C_l, C_h, [dst, #24]
-	ldrd	C_l, C_h, [src, #24]
-	strd	D_l, D_h, [dst, #32]
-	ldrd	D_l, D_h, [src, #32]
+	sfi_breg dst, \
+	strd	A_l, A_h, [\B, #8]
+	sfi_breg src, \
+	ldrd	A_l, A_h, [\B, #8]
+	sfi_breg dst, \
+	strd	B_l, B_h, [\B, #16]
+	sfi_breg src, \
+	ldrd	B_l, B_h, [\B, #16]
+	sfi_breg dst, \
+	strd	C_l, C_h, [\B, #24]
+	sfi_breg src, \
+	ldrd	C_l, C_h, [\B, #24]
+	sfi_breg dst, \
+	strd	D_l, D_h, [\B, #32]
+	sfi_breg src, \
+	ldrd	D_l, D_h, [\B, #32]
 	bcs	2b
 	/* Save the remaining bytes and restore the callee-saved regs.  */
-	strd	A_l, A_h, [dst, #40]
+	sfi_breg dst, \
+	strd	A_l, A_h, [\B, #40]
 	add	src, src, #40
-	strd	B_l, B_h, [dst, #48]
+	sfi_breg dst, \
+	strd	B_l, B_h, [\B, #48]
 	ldrd	B_l, B_h, [sp, #8]
 	cfi_restore (B_l)
 	cfi_restore (B_h)
-	strd	C_l, C_h, [dst, #56]
+	sfi_breg dst, \
+	strd	C_l, C_h, [\B, #56]
 	ldrd	C_l, C_h, [sp, #16]
 	cfi_restore (C_l)
 	cfi_restore (C_h)
-	strd	D_l, D_h, [dst, #64]
+	sfi_breg dst, \
+	strd	D_l, D_h, [\B, #64]
 	ldrd	D_l, D_h, [sp, #24]
 	cfi_restore (D_l)
 	cfi_restore (D_h)
@@ -519,113 +734,173 @@
 	cfi_remember_state
 
 .Lcpy_notaligned:
-	pld	[src]
-	pld	[src, #64]
+	sfi_pld	src
+	sfi_pld	src, #64
 	/* There's at least 64 bytes to copy, but there is no mutual
 	   alignment.  */
 	/* Bring DST to 64-bit alignment.  */
 	lsls	tmp2, dst, #29
-	pld	[src, #(2 * 64)]
+	sfi_pld	src, #(2 * 64)
 	beq	1f
 	rsbs	tmp2, tmp2, #0
 	sub	count, count, tmp2, lsr #29
-	ldrmi	tmp1, [src], #4
-	strmi	tmp1, [dst], #4
+	sfi_breg src, \
+	ldrmi	tmp1, [\B], #4
+	sfi_breg dst, \
+	strmi	tmp1, [\B], #4
 	lsls	tmp2, tmp2, #2
-	ldrbne	tmp1, [src], #1
-	ldrhcs	tmp2, [src], #2
-	strbne	tmp1, [dst], #1
-	strhcs	tmp2, [dst], #2
+	sfi_breg src, \
+	ldrbne	tmp1, [\B], #1
+	sfi_breg src, \
+	ldrhcs	tmp2, [\B], #2
+	sfi_breg dst, \
+	strbne	tmp1, [\B], #1
+	sfi_breg dst, \
+	strhcs	tmp2, [\B], #2
 1:
-	pld	[src, #(3 * 64)]
+	sfi_pld	src, #(3 * 64)
 	subs	count, count, #64
 	ldrmi	tmp2, [sp], #FRAME_SIZE
 	bmi	.Ltail63unaligned
-	pld	[src, #(4 * 64)]
+	sfi_pld	src, #(4 * 64)
 
 #ifdef USE_NEON
-	vld1.8	{d0-d3}, [src]!
-	vld1.8	{d4-d7}, [src]!
+	/* These need an extra layer of macro just to work around a
+	   bug in the assembler's parser when an operand starts with
+	   a {...}.  */
+	.macro neon_load_multi reglist, basereg
+	vld1.8	{\reglist}, [\basereg]!
+	.endm
+	.macro neon_store_multi reglist, basereg
+	vst1.8	{\reglist}, [ALIGN (\basereg, 64)]!
+	.endm
+
+	/* These are used by the NaCl sfi_breg macro.  */
+	.macro _sfi_breg_dmask_neon_load_multi reg
+	_sfi_dmask \reg
+	.endm
+	.macro _sfi_breg_dmask_neon_store_multi reg
+	_sfi_dmask \reg
+	.endm
+
+	sfi_breg src, neon_load_multi d0-d3, \B
+	sfi_breg src, neon_load_multi d4-d7, \B
 	subs	count, count, #64
 	bmi	2f
 1:
-	pld	[src, #(4 * 64)]
-	vst1.8	{d0-d3}, [ALIGN (dst, 64)]!
-	vld1.8	{d0-d3}, [src]!
-	vst1.8	{d4-d7}, [ALIGN (dst, 64)]!
-	vld1.8	{d4-d7}, [src]!
+	sfi_pld	src, #(4 * 64)
+	sfi_breg dst, neon_store_multi d0-d3, \B
+	sfi_breg src, neon_load_multi d0-d3, \B
+	sfi_breg dst, neon_store_multi d4-d7, \B
+	sfi_breg src, neon_load_multi d4-d7, \B
 	subs	count, count, #64
 	bpl	1b
 2:
-	vst1.8	{d0-d3}, [ALIGN (dst, 64)]!
-	vst1.8	{d4-d7}, [ALIGN (dst, 64)]!
+	sfi_breg dst, neon_store_multi d0-d3, \B
+	sfi_breg dst, neon_store_multi d4-d7, \B
 	ands	count, count, #0x3f
 #else
 	/* Use an SMS style loop to maximize the I/O bandwidth.  */
 	sub	src, src, #4
 	sub	dst, dst, #8
 	subs	tmp2, count, #64	/* Use tmp2 for count.  */
-	ldr	A_l, [src, #4]
-	ldr	A_h, [src, #8]
+	sfi_breg src, \
+	ldr	A_l, [\B, #4]
+	sfi_breg src, \
+	ldr	A_h, [\B, #8]
 	strd	B_l, B_h, [sp, #8]
 	cfi_rel_offset (B_l, 8)
 	cfi_rel_offset (B_h, 12)
-	ldr	B_l, [src, #12]
-	ldr	B_h, [src, #16]
+	sfi_breg src, \
+	ldr	B_l, [\B, #12]
+	sfi_breg src, \
+	ldr	B_h, [\B, #16]
 	strd	C_l, C_h, [sp, #16]
 	cfi_rel_offset (C_l, 16)
 	cfi_rel_offset (C_h, 20)
-	ldr	C_l, [src, #20]
-	ldr	C_h, [src, #24]
+	sfi_breg src, \
+	ldr	C_l, [\B, #20]
+	sfi_breg src, \
+	ldr	C_h, [\B, #24]
 	strd	D_l, D_h, [sp, #24]
 	cfi_rel_offset (D_l, 24)
 	cfi_rel_offset (D_h, 28)
-	ldr	D_l, [src, #28]
-	ldr	D_h, [src, #32]!
+	sfi_breg src, \
+	ldr	D_l, [\B, #28]
+	sfi_breg src, \
+	ldr	D_h, [\B, #32]!
 	b	1f
 	.p2align	6
 2:
-	pld	[src, #(5 * 64) - (32 - 4)]
-	strd	A_l, A_h, [dst, #40]
-	ldr	A_l, [src, #36]
-	ldr	A_h, [src, #40]
-	strd	B_l, B_h, [dst, #48]
-	ldr	B_l, [src, #44]
-	ldr	B_h, [src, #48]
-	strd	C_l, C_h, [dst, #56]
-	ldr	C_l, [src, #52]
-	ldr	C_h, [src, #56]
-	strd	D_l, D_h, [dst, #64]!
-	ldr	D_l, [src, #60]
-	ldr	D_h, [src, #64]!
+	sfi_pld	src, #(5 * 64) - (32 - 4)
+	sfi_breg dst, \
+	strd	A_l, A_h, [\B, #40]
+	sfi_breg src, \
+	ldr	A_l, [\B, #36]
+	sfi_breg src, \
+	ldr	A_h, [\B, #40]
+	sfi_breg dst, \
+	strd	B_l, B_h, [\B, #48]
+	sfi_breg src, \
+	ldr	B_l, [\B, #44]
+	sfi_breg src, \
+	ldr	B_h, [\B, #48]
+	sfi_breg dst, \
+	strd	C_l, C_h, [\B, #56]
+	sfi_breg src, \
+	ldr	C_l, [\B, #52]
+	sfi_breg src, \
+	ldr	C_h, [\B, #56]
+	sfi_breg dst, \
+	strd	D_l, D_h, [\B, #64]!
+	sfi_breg src, \
+	ldr	D_l, [\B, #60]
+	sfi_breg src, \
+	ldr	D_h, [\B, #64]!
 	subs	tmp2, tmp2, #64
 1:
-	strd	A_l, A_h, [dst, #8]
-	ldr	A_l, [src, #4]
-	ldr	A_h, [src, #8]
-	strd	B_l, B_h, [dst, #16]
-	ldr	B_l, [src, #12]
-	ldr	B_h, [src, #16]
-	strd	C_l, C_h, [dst, #24]
-	ldr	C_l, [src, #20]
-	ldr	C_h, [src, #24]
-	strd	D_l, D_h, [dst, #32]
-	ldr	D_l, [src, #28]
-	ldr	D_h, [src, #32]
+	sfi_breg dst, \
+	strd	A_l, A_h, [\B, #8]
+	sfi_breg src, \
+	ldr	A_l, [\B, #4]
+	sfi_breg src, \
+	ldr	A_h, [\B, #8]
+	sfi_breg dst, \
+	strd	B_l, B_h, [\B, #16]
+	sfi_breg src, \
+	ldr	B_l, [\B, #12]
+	sfi_breg src, \
+	ldr	B_h, [\B, #16]
+	sfi_breg dst, \
+	strd	C_l, C_h, [\B, #24]
+	sfi_breg src, \
+	ldr	C_l, [\B, #20]
+	sfi_breg src, \
+	ldr	C_h, [\B, #24]
+	sfi_breg dst, \
+	strd	D_l, D_h, [\B, #32]
+	sfi_breg src, \
+	ldr	D_l, [\B, #28]
+	sfi_breg src, \
+	ldr	D_h, [\B, #32]
 	bcs	2b
 
 	/* Save the remaining bytes and restore the callee-saved regs.  */
-	strd	A_l, A_h, [dst, #40]
+	sfi_breg dst, \
+	strd	A_l, A_h, [\B, #40]
 	add	src, src, #36
-	strd	B_l, B_h, [dst, #48]
+	sfi_breg dst, \
+	strd	B_l, B_h, [\B, #48]
 	ldrd	B_l, B_h, [sp, #8]
 	cfi_restore (B_l)
 	cfi_restore (B_h)
-	strd	C_l, C_h, [dst, #56]
+	sfi_breg dst, \
+	strd	C_l, C_h, [\B, #56]
 	ldrd	C_l, C_h, [sp, #16]
 	cfi_restore (C_l)
 	cfi_restore (C_h)
-	strd	D_l, D_h, [dst, #64]
+	sfi_breg dst, \
+	strd	D_l, D_h, [\B, #64]
 	ldrd	D_l, D_h, [sp, #24]
 	cfi_restore (D_l)
 	cfi_restore (D_h)

Modified: fsf/trunk/libc/ports/sysdeps/arm/fpu_control.h
==============================================================================
--- fsf/trunk/libc/ports/sysdeps/arm/fpu_control.h (original)
+++ fsf/trunk/libc/ports/sysdeps/arm/fpu_control.h Wed Jun 19 00:02:07 2013
@@ -19,13 +19,13 @@
 #ifndef _FPU_CONTROL_H
 #define _FPU_CONTROL_H
 
-#if !defined(_LIBC) && defined(__SOFTFP__)
+#if !(defined(_LIBC) && !defined(_LIBC_TEST)) && defined(__SOFTFP__)
 
 #define _FPU_RESERVED 0xffffffff
 #define _FPU_DEFAULT  0x00000000
 typedef unsigned int fpu_control_t;
-#define _FPU_GETCW(cw) 0
-#define _FPU_SETCW(cw) do { } while (0)
+#define _FPU_GETCW(cw) (cw) = 0
+#define _FPU_SETCW(cw) (void) (cw)
 extern fpu_control_t __fpu_control;
 
 #else

Added: fsf/trunk/libc/ports/sysdeps/arm/test-fpucw.c
==============================================================================
--- fsf/trunk/libc/ports/sysdeps/arm/test-fpucw.c (added)
+++ fsf/trunk/libc/ports/sysdeps/arm/test-fpucw.c Wed Jun 19 00:02:07 2013
@@ -1,0 +1,5 @@
+/* Defining _LIBC_TEST stops fpu_control.h from defining the
+   hard-float versions of macros (for use with dynamic VFP detection)
+   when compiling for soft-float.  */
+#define _LIBC_TEST
+#include <math/test-fpucw.c>

_______________________________________________
Commits mailing list
Commits@xxxxxxxxxx
http://eglibc.org/cgi-bin/mailman/listinfo/commits