[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Commits] r18634 - in /fsf/trunk/ports: ./ sysdeps/alpha/ sysdeps/alpha/alphaev5/ sysdeps/alpha/alphaev6/
- To: commits@xxxxxxxxxx
- Subject: [Commits] r18634 - in /fsf/trunk/ports: ./ sysdeps/alpha/ sysdeps/alpha/alphaev5/ sysdeps/alpha/alphaev6/
- From: eglibc@xxxxxxxxxx
- Date: Mon, 21 May 2012 00:02:37 -0000
Author: eglibc
Date: Mon May 21 00:02:36 2012
New Revision: 18634
Log:
Import glibc-ports-mainline for 2012-05-21
Added:
fsf/trunk/ports/sysdeps/alpha/add_n.S
fsf/trunk/ports/sysdeps/alpha/addmul_1.S
fsf/trunk/ports/sysdeps/alpha/alphaev5/add_n.S
fsf/trunk/ports/sysdeps/alpha/alphaev5/lshift.S
fsf/trunk/ports/sysdeps/alpha/alphaev5/rshift.S
fsf/trunk/ports/sysdeps/alpha/alphaev5/sub_n.S
fsf/trunk/ports/sysdeps/alpha/alphaev6/addmul_1.S
fsf/trunk/ports/sysdeps/alpha/lshift.S
fsf/trunk/ports/sysdeps/alpha/mul_1.S
fsf/trunk/ports/sysdeps/alpha/rshift.S
fsf/trunk/ports/sysdeps/alpha/sub_n.S
fsf/trunk/ports/sysdeps/alpha/submul_1.S
Removed:
fsf/trunk/ports/sysdeps/alpha/add_n.s
fsf/trunk/ports/sysdeps/alpha/addmul_1.s
fsf/trunk/ports/sysdeps/alpha/alphaev5/add_n.s
fsf/trunk/ports/sysdeps/alpha/alphaev5/lshift.s
fsf/trunk/ports/sysdeps/alpha/alphaev5/rshift.s
fsf/trunk/ports/sysdeps/alpha/alphaev5/sub_n.s
fsf/trunk/ports/sysdeps/alpha/alphaev6/addmul_1.s
fsf/trunk/ports/sysdeps/alpha/lshift.s
fsf/trunk/ports/sysdeps/alpha/mul_1.s
fsf/trunk/ports/sysdeps/alpha/rshift.s
fsf/trunk/ports/sysdeps/alpha/sub_n.s
fsf/trunk/ports/sysdeps/alpha/submul_1.s
Modified:
fsf/trunk/ports/ChangeLog.alpha
Modified: fsf/trunk/ports/ChangeLog.alpha
==============================================================================
--- fsf/trunk/ports/ChangeLog.alpha (original)
+++ fsf/trunk/ports/ChangeLog.alpha Mon May 21 00:02:36 2012
@@ -1,4 +1,21 @@
+2012-05-20 Richard Henderson <rth@xxxxxxxxxxx>
+
+ * sysdeps/alpha/add_n.S: Rename from add_n.s.
+ * sysdeps/alpha/addmul_1.S: Rename from addmul_1.s.
+ * sysdeps/alpha/alphaev5/add_n.S: Rename from add_n.s.
+ * sysdeps/alpha/alphaev5/lshift.S: Rename from lshift.s.
+ * sysdeps/alpha/alphaev5/rshift.S: Rename from rshift.s.
+ * sysdeps/alpha/alphaev5/sub_n.S: Rename from sub_n.s.
+ * sysdeps/alpha/alphaev6/addmul_1.S: Rename from addmul_1.s.
+ * sysdeps/alpha/lshift.S: Rename from lshift.s.
+ * sysdeps/alpha/mul_1.S: Rename from mul_1.s.
+ * sysdeps/alpha/rshift.S: Rename from rshift.s.
+ * sysdeps/alpha/sub_n.S: Rename from sub_n.s.
+ * sysdeps/alpha/submul_1.S: Rename from submul_1.s.
+
2012-05-18 Richard Henderson <rth@xxxxxxxxxxx>
+
+ * data/localplt-alpha-linux-gnu.data: New file.
* sysdeps/unix/sysv/linux/alpha/bits/typesizes.h (__FSWORD_T_TYPE,
__SYSCALL_SLONG_TYPE, __SYSCALL_ULONG_TYPE): New.
Added: fsf/trunk/ports/sysdeps/alpha/add_n.S
==============================================================================
--- fsf/trunk/ports/sysdeps/alpha/add_n.S (added)
+++ fsf/trunk/ports/sysdeps/alpha/add_n.S Mon May 21 00:02:36 2012
@@ -1,0 +1,118 @@
+ # Alpha __mpn_add_n -- Add two limb vectors of the same length > 0 and
+ # store sum in a third limb vector.
+
+ # Copyright (C) 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
+
+
+ # INPUT PARAMETERS
+ # res_ptr $16 (sum limbs are stored here)
+ # s1_ptr $17 (first source limb vector)
+ # s2_ptr $18 (second source limb vector)
+ # size $19 (number of limbs, > 0)
+
+ .set noreorder
+ .set noat
+.text
+ .align 3
+ .globl __mpn_add_n
+ .ent __mpn_add_n
+__mpn_add_n:
+ .frame $30,0,$26,0 # zero-size frame, return address in $26
+
+ ldq $3,0($17) # $3 = first s1 limb
+ ldq $4,0($18) # $4 = first s2 limb
+
+ subq $19,1,$19 # size-1: the last limb is always added at .Lend
+ and $19,4-1,$2 # number of limbs in first loop
+ bis $31,$31,$0 # $0 = running carry, starts at 0
+ beq $2,.L0 # if (size-1) is a multiple of 4, skip first loop
+
+ subq $19,$2,$19 # $19 = limbs left for the 4-way loop (multiple of 4)
+
+.Loop0: subq $2,1,$2 # one limb fewer to do in this loop
+ ldq $5,8($17) # fetch the next s1 limb ahead of use
+ addq $4,$0,$4 # s2 limb + carry-in
+ ldq $6,8($18) # fetch the next s2 limb ahead of use
+ cmpult $4,$0,$1 # $1 = carry out of (s2 limb + carry-in)
+ addq $3,$4,$4 # add the s1 limb
+ cmpult $4,$3,$0 # $0 = carry out of that add
+ stq $4,0($16) # store the sum limb
+ or $0,$1,$0 # combine the two carries
+
+ addq $17,8,$17 # s1_ptr++
+ addq $18,8,$18 # s2_ptr++
+ bis $5,$5,$3 # the prefetched limbs become the current pair
+ bis $6,$6,$4
+ addq $16,8,$16 # res_ptr++
+ bne $2,.Loop0
+
+.L0: beq $19,.Lend # only the last limb remains
+
+ .align 3
+.Loop: subq $19,4,$19 # four limbs are handled per iteration
+
+ ldq $5,8($17) # limb 0: current pair is $3/$4, prefetch into $5/$6
+ addq $4,$0,$4 # s2 limb + carry-in
+ ldq $6,8($18)
+ cmpult $4,$0,$1 # $1 = carry out of (s2 limb + carry-in)
+ addq $3,$4,$4 # add the s1 limb
+ cmpult $4,$3,$0 # $0 = carry out of that add
+ stq $4,0($16) # store sum limb 0
+ or $0,$1,$0 # combine the two carries
+
+ ldq $3,16($17) # limb 1: current pair is $5/$6, prefetch into $3/$4
+ addq $6,$0,$6
+ ldq $4,16($18)
+ cmpult $6,$0,$1
+ addq $5,$6,$6
+ cmpult $6,$5,$0
+ stq $6,8($16) # store sum limb 1
+ or $0,$1,$0
+
+ ldq $5,24($17) # limb 2: current pair is $3/$4, prefetch into $5/$6
+ addq $4,$0,$4
+ ldq $6,24($18)
+ cmpult $4,$0,$1
+ addq $3,$4,$4
+ cmpult $4,$3,$0
+ stq $4,16($16) # store sum limb 2
+ or $0,$1,$0
+
+ ldq $3,32($17) # limb 3: current pair is $5/$6, prefetch into $3/$4
+ addq $6,$0,$6
+ ldq $4,32($18)
+ cmpult $6,$0,$1
+ addq $5,$6,$6
+ cmpult $6,$5,$0
+ stq $6,24($16) # store sum limb 3
+ or $0,$1,$0
+
+ addq $17,32,$17 # advance all three pointers by four limbs
+ addq $18,32,$18
+ addq $16,32,$16
+ bne $19,.Loop
+
+.Lend: addq $4,$0,$4 # last limb: s2 limb + carry-in (pair already in $3/$4)
+ cmpult $4,$0,$1 # $1 = carry out of (s2 limb + carry-in)
+ addq $3,$4,$4 # add the s1 limb
+ cmpult $4,$3,$0 # $0 = carry out of that add
+ stq $4,0($16) # store the last sum limb
+ or $0,$1,$0 # $0 = final carry
+ ret $31,($26),1 # return; $0 holds the final carry
+
+ .end __mpn_add_n
Removed: fsf/trunk/ports/sysdeps/alpha/add_n.s
==============================================================================
--- fsf/trunk/ports/sysdeps/alpha/add_n.s (original)
+++ fsf/trunk/ports/sysdeps/alpha/add_n.s (removed)
@@ -1,118 +1,0 @@
- # Alpha __mpn_add_n -- Add two limb vectors of the same length > 0 and
- # store sum in a third limb vector.
-
- # Copyright (C) 1995 Free Software Foundation, Inc.
-
- # This file is part of the GNU MP Library.
-
- # The GNU MP Library is free software; you can redistribute it and/or modify
- # it under the terms of the GNU Lesser General Public License as published by
- # the Free Software Foundation; either version 2.1 of the License, or (at your
- # option) any later version.
-
- # The GNU MP Library is distributed in the hope that it will be useful, but
- # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
- # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
- # License for more details.
-
- # You should have received a copy of the GNU Lesser General Public License
- # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
-
-
- # INPUT PARAMETERS
- # res_ptr $16
- # s1_ptr $17
- # s2_ptr $18
- # size $19
-
- .set noreorder
- .set noat
-.text
- .align 3
- .globl __mpn_add_n
- .ent __mpn_add_n
-__mpn_add_n:
- .frame $30,0,$26,0
-
- ldq $3,0($17)
- ldq $4,0($18)
-
- subq $19,1,$19
- and $19,4-1,$2 # number of limbs in first loop
- bis $31,$31,$0
- beq $2,.L0 # if multiple of 4 limbs, skip first loop
-
- subq $19,$2,$19
-
-.Loop0: subq $2,1,$2
- ldq $5,8($17)
- addq $4,$0,$4
- ldq $6,8($18)
- cmpult $4,$0,$1
- addq $3,$4,$4
- cmpult $4,$3,$0
- stq $4,0($16)
- or $0,$1,$0
-
- addq $17,8,$17
- addq $18,8,$18
- bis $5,$5,$3
- bis $6,$6,$4
- addq $16,8,$16
- bne $2,.Loop0
-
-.L0: beq $19,.Lend
-
- .align 3
-.Loop: subq $19,4,$19
-
- ldq $5,8($17)
- addq $4,$0,$4
- ldq $6,8($18)
- cmpult $4,$0,$1
- addq $3,$4,$4
- cmpult $4,$3,$0
- stq $4,0($16)
- or $0,$1,$0
-
- ldq $3,16($17)
- addq $6,$0,$6
- ldq $4,16($18)
- cmpult $6,$0,$1
- addq $5,$6,$6
- cmpult $6,$5,$0
- stq $6,8($16)
- or $0,$1,$0
-
- ldq $5,24($17)
- addq $4,$0,$4
- ldq $6,24($18)
- cmpult $4,$0,$1
- addq $3,$4,$4
- cmpult $4,$3,$0
- stq $4,16($16)
- or $0,$1,$0
-
- ldq $3,32($17)
- addq $6,$0,$6
- ldq $4,32($18)
- cmpult $6,$0,$1
- addq $5,$6,$6
- cmpult $6,$5,$0
- stq $6,24($16)
- or $0,$1,$0
-
- addq $17,32,$17
- addq $18,32,$18
- addq $16,32,$16
- bne $19,.Loop
-
-.Lend: addq $4,$0,$4
- cmpult $4,$0,$1
- addq $3,$4,$4
- cmpult $4,$3,$0
- stq $4,0($16)
- or $0,$1,$0
- ret $31,($26),1
-
- .end __mpn_add_n
Added: fsf/trunk/ports/sysdeps/alpha/addmul_1.S
==============================================================================
--- fsf/trunk/ports/sysdeps/alpha/addmul_1.S (added)
+++ fsf/trunk/ports/sysdeps/alpha/addmul_1.S Mon May 21 00:02:36 2012
@@ -1,0 +1,90 @@
+ # Alpha 21064 __mpn_addmul_1 -- Multiply a limb vector with a limb and add
+ # the result to a second limb vector.
+
+ # Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
+
+
+ # INPUT PARAMETERS
+ # res_ptr r16 (limb vector that is read, added to and written back)
+ # s1_ptr r17 (source limb vector)
+ # size r18 (number of limbs)
+ # s2_limb r19 (the single limb multiplier)
+
+ # This code runs at 42 cycles/limb on EV4 and 18 cycles/limb on EV5.
+
+ .set noreorder
+ .set noat
+.text
+ .align 3
+ .globl __mpn_addmul_1
+ .ent __mpn_addmul_1 2
+__mpn_addmul_1:
+ .frame $30,0,$26
+
+ ldq $2,0($17) # $2 = s1_limb
+ addq $17,8,$17 # s1_ptr++
+ subq $18,1,$18 # size--
+ mulq $2,$19,$3 # $3 = prod_low
+ ldq $5,0($16) # $5 = *res_ptr
+ umulh $2,$19,$0 # $0 = prod_high
+ beq $18,.Lend1 # jump if size was == 1
+ ldq $2,0($17) # $2 = s1_limb
+ addq $17,8,$17 # s1_ptr++
+ subq $18,1,$18 # size--
+ addq $5,$3,$3 # $3 = *res_ptr + prod_low
+ cmpult $3,$5,$4 # $4 = carry from that add
+ stq $3,0($16) # store the first result limb
+ addq $16,8,$16 # res_ptr++
+ beq $18,.Lend2 # jump if size was == 2
+
+ .align 3
+.Loop: mulq $2,$19,$3 # $3 = prod_low
+ ldq $5,0($16) # $5 = *res_ptr
+ addq $4,$0,$0 # cy_limb = cy_limb + 'cy'
+ subq $18,1,$18 # size--
+ umulh $2,$19,$4 # $4 = cy_limb
+ ldq $2,0($17) # $2 = s1_limb
+ addq $17,8,$17 # s1_ptr++
+ addq $3,$0,$3 # $3 = cy_limb + prod_low
+ cmpult $3,$0,$0 # $0 = carry from (cy_limb + prod_low)
+ addq $5,$3,$3 # add *res_ptr
+ cmpult $3,$5,$5 # $5 = carry from adding *res_ptr
+ stq $3,0($16) # store the result limb
+ addq $16,8,$16 # res_ptr++
+ addq $5,$0,$0 # combine carries
+ bne $18,.Loop
+
+.Lend2: mulq $2,$19,$3 # $3 = prod_low
+ ldq $5,0($16) # $5 = *res_ptr
+ addq $4,$0,$0 # cy_limb = cy_limb + 'cy'
+ umulh $2,$19,$4 # $4 = cy_limb
+ addq $3,$0,$3 # $3 = cy_limb + prod_low
+ cmpult $3,$0,$0 # $0 = carry from (cy_limb + prod_low)
+ addq $5,$3,$3 # add *res_ptr
+ cmpult $3,$5,$5 # $5 = carry from adding *res_ptr
+ stq $3,0($16) # store the last result limb
+ addq $5,$0,$0 # combine carries
+ addq $4,$0,$0 # cy_limb = prod_high + cy
+ ret $31,($26),1 # return; $0 holds the final cy_limb
+.Lend1: addq $5,$3,$3 # size == 1: $3 = *res_ptr + prod_low
+ cmpult $3,$5,$5 # $5 = carry from that add
+ stq $3,0($16) # store the only result limb
+ addq $0,$5,$0 # $0 = prod_high + carry
+ ret $31,($26),1 # return; $0 holds prod_high + carry
+
+ .end __mpn_addmul_1
Removed: fsf/trunk/ports/sysdeps/alpha/addmul_1.s
==============================================================================
--- fsf/trunk/ports/sysdeps/alpha/addmul_1.s (original)
+++ fsf/trunk/ports/sysdeps/alpha/addmul_1.s (removed)
@@ -1,90 +1,0 @@
- # Alpha 21064 __mpn_addmul_1 -- Multiply a limb vector with a limb and add
- # the result to a second limb vector.
-
- # Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.
-
- # This file is part of the GNU MP Library.
-
- # The GNU MP Library is free software; you can redistribute it and/or modify
- # it under the terms of the GNU Lesser General Public License as published by
- # the Free Software Foundation; either version 2.1 of the License, or (at your
- # option) any later version.
-
- # The GNU MP Library is distributed in the hope that it will be useful, but
- # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
- # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
- # License for more details.
-
- # You should have received a copy of the GNU Lesser General Public License
- # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
-
-
- # INPUT PARAMETERS
- # res_ptr r16
- # s1_ptr r17
- # size r18
- # s2_limb r19
-
- # This code runs at 42 cycles/limb on EV4 and 18 cycles/limb on EV5.
-
- .set noreorder
- .set noat
-.text
- .align 3
- .globl __mpn_addmul_1
- .ent __mpn_addmul_1 2
-__mpn_addmul_1:
- .frame $30,0,$26
-
- ldq $2,0($17) # $2 = s1_limb
- addq $17,8,$17 # s1_ptr++
- subq $18,1,$18 # size--
- mulq $2,$19,$3 # $3 = prod_low
- ldq $5,0($16) # $5 = *res_ptr
- umulh $2,$19,$0 # $0 = prod_high
- beq $18,.Lend1 # jump if size was == 1
- ldq $2,0($17) # $2 = s1_limb
- addq $17,8,$17 # s1_ptr++
- subq $18,1,$18 # size--
- addq $5,$3,$3
- cmpult $3,$5,$4
- stq $3,0($16)
- addq $16,8,$16 # res_ptr++
- beq $18,.Lend2 # jump if size was == 2
-
- .align 3
-.Loop: mulq $2,$19,$3 # $3 = prod_low
- ldq $5,0($16) # $5 = *res_ptr
- addq $4,$0,$0 # cy_limb = cy_limb + 'cy'
- subq $18,1,$18 # size--
- umulh $2,$19,$4 # $4 = cy_limb
- ldq $2,0($17) # $2 = s1_limb
- addq $17,8,$17 # s1_ptr++
- addq $3,$0,$3 # $3 = cy_limb + prod_low
- cmpult $3,$0,$0 # $0 = carry from (cy_limb + prod_low)
- addq $5,$3,$3
- cmpult $3,$5,$5
- stq $3,0($16)
- addq $16,8,$16 # res_ptr++
- addq $5,$0,$0 # combine carries
- bne $18,.Loop
-
-.Lend2: mulq $2,$19,$3 # $3 = prod_low
- ldq $5,0($16) # $5 = *res_ptr
- addq $4,$0,$0 # cy_limb = cy_limb + 'cy'
- umulh $2,$19,$4 # $4 = cy_limb
- addq $3,$0,$3 # $3 = cy_limb + prod_low
- cmpult $3,$0,$0 # $0 = carry from (cy_limb + prod_low)
- addq $5,$3,$3
- cmpult $3,$5,$5
- stq $3,0($16)
- addq $5,$0,$0 # combine carries
- addq $4,$0,$0 # cy_limb = prod_high + cy
- ret $31,($26),1
-.Lend1: addq $5,$3,$3
- cmpult $3,$5,$5
- stq $3,0($16)
- addq $0,$5,$0
- ret $31,($26),1
-
- .end __mpn_addmul_1
Added: fsf/trunk/ports/sysdeps/alpha/alphaev5/add_n.S
==============================================================================
--- fsf/trunk/ports/sysdeps/alpha/alphaev5/add_n.S (added)
+++ fsf/trunk/ports/sysdeps/alpha/alphaev5/add_n.S Mon May 21 00:02:36 2012
@@ -1,0 +1,146 @@
+ # Alpha __mpn_add_n -- Add two limb vectors of the same length > 0 and
+ # store sum in a third limb vector.
+
+ # Copyright (C) 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
+
+
+ # INPUT PARAMETERS
+ # res_ptr $16 (sum limbs are stored here)
+ # s1_ptr $17 (first source limb vector)
+ # s2_ptr $18 (second source limb vector)
+ # size $19 (number of limbs, > 0)
+
+ .set noreorder
+ .set noat
+.text
+ .align 3
+ .globl __mpn_add_n
+ .ent __mpn_add_n
+__mpn_add_n:
+ .frame $30,0,$26,0 # zero-size frame, return address in $26
+
+ or $31,$31,$25 # clear cy
+ subq $19,4,$19 # decr loop cnt
+ blt $19,.Lend2 # if less than 4 limbs, goto 2nd loop
+ # Start software pipeline for 1st loop
+ ldq $0,0($18) # $0-$3 receive the four s2 limbs of a group
+ ldq $1,8($18)
+ ldq $4,0($17) # $4-$7 receive the four s1 limbs of a group
+ ldq $5,8($17)
+ addq $17,32,$17 # update s1_ptr
+ ldq $2,16($18)
+ addq $0,$4,$20 # 1st main add
+ ldq $3,24($18)
+ subq $19,4,$19 # decr loop cnt
+ ldq $6,-16($17) # s1_ptr was already advanced, hence the negative offsets
+ cmpult $20,$0,$25 # compute cy from last add
+ ldq $7,-8($17)
+ addq $1,$25,$28 # cy add
+ addq $18,32,$18 # update s2_ptr
+ addq $5,$28,$21 # 2nd main add
+ cmpult $28,$25,$8 # compute cy from the cy add
+ blt $19,.Lend1 # if less than 4 limbs remain, jump
+ # 1st loop handles groups of 4 limbs in a software pipeline
+ .align 4
+.Loop: cmpult $21,$28,$25 # compute cy from last add
+ ldq $0,0($18) # start loading the next group (s2 limbs 0 and 1)
+ or $8,$25,$25 # combine cy from the two adds
+ ldq $1,8($18)
+ addq $2,$25,$28 # cy add
+ ldq $4,0($17) # next group's s1 limbs 0 and 1
+ addq $28,$6,$22 # 3rd main add
+ ldq $5,8($17)
+ cmpult $28,$25,$8 # compute cy from the cy add
+ cmpult $22,$28,$25 # compute cy from last add
+ stq $20,0($16) # store sum limb 0 of the current group
+ or $8,$25,$25 # combine cy from the two adds
+ stq $21,8($16) # store sum limb 1 of the current group
+ addq $3,$25,$28 # cy add
+ addq $28,$7,$23 # 4th main add
+ cmpult $28,$25,$8 # compute cy from the cy add
+ cmpult $23,$28,$25 # compute cy from last add
+ addq $17,32,$17 # update s1_ptr
+ or $8,$25,$25 # combine cy from the two adds
+ addq $16,32,$16 # update res_ptr
+ addq $0,$25,$28 # cy add
+ ldq $2,16($18) # next group's s2 limbs 2 and 3
+ addq $4,$28,$20 # 1st main add
+ ldq $3,24($18)
+ cmpult $28,$25,$8 # compute cy from the cy add
+ ldq $6,-16($17) # next group's s1 limbs 2 and 3
+ cmpult $20,$28,$25 # compute cy from last add
+ ldq $7,-8($17)
+ or $8,$25,$25 # combine cy from the two adds
+ subq $19,4,$19 # decr loop cnt
+ stq $22,-16($16) # store sum limb 2 (res_ptr was already advanced)
+ addq $1,$25,$28 # cy add
+ stq $23,-8($16) # store sum limb 3
+ addq $5,$28,$21 # 2nd main add
+ addq $18,32,$18 # update s2_ptr
+ cmpult $28,$25,$8 # compute cy from the cy add
+ bge $19,.Loop
+ # Finish software pipeline for 1st loop
+.Lend1: cmpult $21,$28,$25 # compute cy from last add
+ or $8,$25,$25 # combine cy from the two adds
+ addq $2,$25,$28 # cy add
+ addq $28,$6,$22 # 3rd main add
+ cmpult $28,$25,$8 # compute cy from the cy add
+ cmpult $22,$28,$25 # compute cy from last add
+ stq $20,0($16) # store sum limb 0 of the last full group
+ or $8,$25,$25 # combine cy from the two adds
+ stq $21,8($16) # store sum limb 1
+ addq $3,$25,$28 # cy add
+ addq $28,$7,$23 # 4th main add
+ cmpult $28,$25,$8 # compute cy from the cy add
+ cmpult $23,$28,$25 # compute cy from last add
+ or $8,$25,$25 # combine cy from the two adds
+ addq $16,32,$16 # update res_ptr
+ stq $22,-16($16) # store sum limb 2
+ stq $23,-8($16) # store sum limb 3
+.Lend2: addq $19,4,$19 # restore loop cnt
+ beq $19,.Lret # no limbs left
+ # Start software pipeline for 2nd loop
+ ldq $0,0($18) # $0 = s2 limb
+ ldq $4,0($17) # $4 = s1 limb
+ subq $19,1,$19 # the last limb is added at .Lend0
+ beq $19,.Lend0 # exactly one limb left
+ # 2nd loop handles remaining 1-3 limbs
+ .align 4
+.Loop0: addq $0,$25,$28 # cy add
+ ldq $0,8($18) # fetch the next s2 limb
+ addq $4,$28,$20 # main add
+ ldq $4,8($17) # fetch the next s1 limb
+ addq $18,8,$18 # s2_ptr++
+ cmpult $28,$25,$8 # compute cy from the cy add
+ addq $17,8,$17 # s1_ptr++
+ stq $20,0($16) # store the sum limb
+ cmpult $20,$28,$25 # compute cy from last add
+ subq $19,1,$19 # decr loop cnt
+ or $8,$25,$25 # combine cy from the two adds
+ addq $16,8,$16 # res_ptr++
+ bne $19,.Loop0
+.Lend0: addq $0,$25,$28 # cy add
+ addq $4,$28,$20 # main add
+ cmpult $28,$25,$8 # compute cy from the cy add
+ cmpult $20,$28,$25 # compute cy from last add
+ stq $20,0($16) # store the last sum limb
+ or $8,$25,$25 # combine cy from the two adds
+
+.Lret: or $25,$31,$0 # return cy
+ ret $31,($26),1
+ .end __mpn_add_n
Removed: fsf/trunk/ports/sysdeps/alpha/alphaev5/add_n.s
==============================================================================
--- fsf/trunk/ports/sysdeps/alpha/alphaev5/add_n.s (original)
+++ fsf/trunk/ports/sysdeps/alpha/alphaev5/add_n.s (removed)
@@ -1,146 +1,0 @@
- # Alpha __mpn_add_n -- Add two limb vectors of the same length > 0 and
- # store sum in a third limb vector.
-
- # Copyright (C) 1995 Free Software Foundation, Inc.
-
- # This file is part of the GNU MP Library.
-
- # The GNU MP Library is free software; you can redistribute it and/or modify
- # it under the terms of the GNU Lesser General Public License as published by
- # the Free Software Foundation; either version 2.1 of the License, or (at your
- # option) any later version.
-
- # The GNU MP Library is distributed in the hope that it will be useful, but
- # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
- # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
- # License for more details.
-
- # You should have received a copy of the GNU Lesser General Public License
- # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
-
-
- # INPUT PARAMETERS
- # res_ptr $16
- # s1_ptr $17
- # s2_ptr $18
- # size $19
-
- .set noreorder
- .set noat
-.text
- .align 3
- .globl __mpn_add_n
- .ent __mpn_add_n
-__mpn_add_n:
- .frame $30,0,$26,0
-
- or $31,$31,$25 # clear cy
- subq $19,4,$19 # decr loop cnt
- blt $19,.Lend2 # if less than 4 limbs, goto 2nd loop
- # Start software pipeline for 1st loop
- ldq $0,0($18)
- ldq $1,8($18)
- ldq $4,0($17)
- ldq $5,8($17)
- addq $17,32,$17 # update s1_ptr
- ldq $2,16($18)
- addq $0,$4,$20 # 1st main add
- ldq $3,24($18)
- subq $19,4,$19 # decr loop cnt
- ldq $6,-16($17)
- cmpult $20,$0,$25 # compute cy from last add
- ldq $7,-8($17)
- addq $1,$25,$28 # cy add
- addq $18,32,$18 # update s2_ptr
- addq $5,$28,$21 # 2nd main add
- cmpult $28,$25,$8 # compute cy from last add
- blt $19,.Lend1 # if less than 4 limbs remain, jump
- # 1st loop handles groups of 4 limbs in a software pipeline
- .align 4
-.Loop: cmpult $21,$28,$25 # compute cy from last add
- ldq $0,0($18)
- or $8,$25,$25 # combine cy from the two adds
- ldq $1,8($18)
- addq $2,$25,$28 # cy add
- ldq $4,0($17)
- addq $28,$6,$22 # 3rd main add
- ldq $5,8($17)
- cmpult $28,$25,$8 # compute cy from last add
- cmpult $22,$28,$25 # compute cy from last add
- stq $20,0($16)
- or $8,$25,$25 # combine cy from the two adds
- stq $21,8($16)
- addq $3,$25,$28 # cy add
- addq $28,$7,$23 # 4th main add
- cmpult $28,$25,$8 # compute cy from last add
- cmpult $23,$28,$25 # compute cy from last add
- addq $17,32,$17 # update s1_ptr
- or $8,$25,$25 # combine cy from the two adds
- addq $16,32,$16 # update res_ptr
- addq $0,$25,$28 # cy add
- ldq $2,16($18)
- addq $4,$28,$20 # 1st main add
- ldq $3,24($18)
- cmpult $28,$25,$8 # compute cy from last add
- ldq $6,-16($17)
- cmpult $20,$28,$25 # compute cy from last add
- ldq $7,-8($17)
- or $8,$25,$25 # combine cy from the two adds
- subq $19,4,$19 # decr loop cnt
- stq $22,-16($16)
- addq $1,$25,$28 # cy add
- stq $23,-8($16)
- addq $5,$28,$21 # 2nd main add
- addq $18,32,$18 # update s2_ptr
- cmpult $28,$25,$8 # compute cy from last add
- bge $19,.Loop
- # Finish software pipeline for 1st loop
-.Lend1: cmpult $21,$28,$25 # compute cy from last add
- or $8,$25,$25 # combine cy from the two adds
- addq $2,$25,$28 # cy add
- addq $28,$6,$22 # 3rd main add
- cmpult $28,$25,$8 # compute cy from last add
- cmpult $22,$28,$25 # compute cy from last add
- stq $20,0($16)
- or $8,$25,$25 # combine cy from the two adds
- stq $21,8($16)
- addq $3,$25,$28 # cy add
- addq $28,$7,$23 # 4th main add
- cmpult $28,$25,$8 # compute cy from last add
- cmpult $23,$28,$25 # compute cy from last add
- or $8,$25,$25 # combine cy from the two adds
- addq $16,32,$16 # update res_ptr
- stq $22,-16($16)
- stq $23,-8($16)
-.Lend2: addq $19,4,$19 # restore loop cnt
- beq $19,.Lret
- # Start software pipeline for 2nd loop
- ldq $0,0($18)
- ldq $4,0($17)
- subq $19,1,$19
- beq $19,.Lend0
- # 2nd loop handles remaining 1-3 limbs
- .align 4
-.Loop0: addq $0,$25,$28 # cy add
- ldq $0,8($18)
- addq $4,$28,$20 # main add
- ldq $4,8($17)
- addq $18,8,$18
- cmpult $28,$25,$8 # compute cy from last add
- addq $17,8,$17
- stq $20,0($16)
- cmpult $20,$28,$25 # compute cy from last add
- subq $19,1,$19 # decr loop cnt
- or $8,$25,$25 # combine cy from the two adds
- addq $16,8,$16
- bne $19,.Loop0
-.Lend0: addq $0,$25,$28 # cy add
- addq $4,$28,$20 # main add
- cmpult $28,$25,$8 # compute cy from last add
- cmpult $20,$28,$25 # compute cy from last add
- stq $20,0($16)
- or $8,$25,$25 # combine cy from the two adds
-
-.Lret: or $25,$31,$0 # return cy
- ret $31,($26),1
- .end __mpn_add_n
Added: fsf/trunk/ports/sysdeps/alpha/alphaev5/lshift.S
==============================================================================
--- fsf/trunk/ports/sysdeps/alpha/alphaev5/lshift.S (added)
+++ fsf/trunk/ports/sysdeps/alpha/alphaev5/lshift.S Mon May 21 00:02:36 2012
@@ -1,0 +1,172 @@
+ # Alpha EV5 __mpn_lshift -- Shift a limb vector left by cnt bits.
+
+ # Copyright (C) 1994, 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
+
+
+ # INPUT PARAMETERS
+ # res_ptr r16 (result limbs are stored here)
+ # s1_ptr r17 (source limb vector)
+ # size r18 (number of limbs)
+ # cnt r19 (shift count in bits)
+
+ # This code runs at 3.25 cycles/limb on the EV5.
+
+ .set noreorder
+ .set noat
+.text
+ .align 3
+ .globl __mpn_lshift
+ .ent __mpn_lshift
+__mpn_lshift:
+ .frame $30,0,$26,0
+
+ s8addq $18,$17,$17 # make r17 point at end of s1
+ ldq $4,-8($17) # load first limb
+ subq $31,$19,$20 # $20 = -cnt; acts as 64-cnt since shifts use the low 6 bits of the count
+ s8addq $18,$16,$16 # make r16 point at end of RES
+ subq $18,1,$18 # size-1: the lowest limb is finished separately
+ and $18,4-1,$28 # number of limbs in first loop
+ srl $4,$20,$0 # compute function result
+
+ beq $28,.L0 # (size-1) is a multiple of 4: skip first loop
+ subq $18,$28,$18 # $18 = limbs left for the 4-way code (multiple of 4)
+
+ .align 3
+.Loop0: ldq $3,-16($17) # $3 = next lower limb
+ subq $16,8,$16 # res_ptr--
+ sll $4,$19,$5 # $5 = current limb << cnt
+ subq $17,8,$17 # s1_ptr--
+ subq $28,1,$28 # one limb fewer to do in this loop
+ srl $3,$20,$6 # $6 = bits moving up out of the lower limb
+ or $3,$3,$4 # the lower limb becomes the current limb
+ or $5,$6,$8 # $8 = finished result limb
+ stq $8,0($16) # store it
+ bne $28,.Loop0
+
+.L0: sll $4,$19,$24 # $24 = current limb << cnt, still missing the bits from below
+ beq $18,.Lend # no lower limbs left
+ # warm up phase 1
+ ldq $1,-16($17) # load the next four lower limbs into $1-$4
+ subq $18,4,$18
+ ldq $2,-24($17)
+ ldq $3,-32($17)
+ ldq $4,-40($17)
+ beq $18,.Lend1 # these four were the last ones
+ # warm up phase 2
+ srl $1,$20,$7 # $7/$8/$5/$6 = bits moving up out of $1/$2/$3/$4
+ sll $1,$19,$21 # $21/$22/$23/$24 = $1/$2/$3/$4 << cnt
+ srl $2,$20,$8
+ ldq $1,-48($17) # reload $1-$4 with the following four limbs
+ sll $2,$19,$22
+ ldq $2,-56($17)
+ srl $3,$20,$5
+ or $7,$24,$7 # $7 = finished result limb
+ sll $3,$19,$23
+ or $8,$21,$8 # $8 = finished result limb
+ srl $4,$20,$6
+ ldq $3,-64($17)
+ sll $4,$19,$24
+ ldq $4,-72($17)
+ subq $18,4,$18
+ beq $18,.Lend2 # no further groups: drain the pipeline
+ .align 4
+ # main loop
+.Loop: stq $7,-8($16) # store the two limbs finished earlier
+ or $5,$22,$5 # finish the third limb of the previous group
+ stq $8,-16($16)
+ or $6,$23,$6 # finish the fourth limb of the previous group
+
+ srl $1,$20,$7 # start on the group currently held in $1-$4
+ subq $18,4,$18
+ sll $1,$19,$21
+ unop # ldq $31,-96($17)
+
+ srl $2,$20,$8
+ ldq $1,-80($17) # reload $1/$2 for the following group
+ sll $2,$19,$22
+ ldq $2,-88($17)
+
+ stq $5,-24($16)
+ or $7,$24,$7 # $24 still holds the previous group's last limb << cnt
+ stq $6,-32($16)
+ or $8,$21,$8
+
+ srl $3,$20,$5
+ unop # ldq $31,-96($17)
+ sll $3,$19,$23
+ subq $16,32,$16 # res_ptr -= 4 limbs
+
+ srl $4,$20,$6
+ ldq $3,-96($17) # reload $3/$4 for the following group
+ sll $4,$19,$24
+ ldq $4,-104($17)
+
+ subq $17,32,$17 # s1_ptr -= 4 limbs
+ bne $18,.Loop
+ # cool down phase 2/1
+.Lend2: stq $7,-8($16) # store/finish the group computed before the exit
+ or $5,$22,$5
+ stq $8,-16($16)
+ or $6,$23,$6
+ srl $1,$20,$7 # process the last group, already loaded in $1-$4
+ sll $1,$19,$21
+ srl $2,$20,$8
+ sll $2,$19,$22
+ stq $5,-24($16)
+ or $7,$24,$7
+ stq $6,-32($16)
+ or $8,$21,$8
+ srl $3,$20,$5
+ sll $3,$19,$23
+ srl $4,$20,$6
+ sll $4,$19,$24
+ # cool down phase 2/2
+ stq $7,-40($16)
+ or $5,$22,$5
+ stq $8,-48($16)
+ or $6,$23,$6
+ stq $5,-56($16)
+ stq $6,-64($16)
+ # cool down phase 2/3
+ stq $24,-72($16) # lowest result limb: nothing below it to OR in
+ ret $31,($26),1 # return; $0 still holds the function result
+
+ # cool down phase 1/1
+.Lend1: srl $1,$20,$7 # only one group of four was loaded: process it
+ sll $1,$19,$21
+ srl $2,$20,$8
+ sll $2,$19,$22
+ srl $3,$20,$5
+ or $7,$24,$7
+ sll $3,$19,$23
+ or $8,$21,$8
+ srl $4,$20,$6
+ sll $4,$19,$24
+ # cool down phase 1/2
+ stq $7,-8($16)
+ or $5,$22,$5
+ stq $8,-16($16)
+ or $6,$23,$6
+ stq $5,-24($16)
+ stq $6,-32($16)
+ stq $24,-40($16) # lowest result limb: nothing below it to OR in
+ ret $31,($26),1 # return; $0 still holds the function result
+
+.Lend: stq $24,-8($16) # lowest result limb: nothing below it to OR in
+ ret $31,($26),1 # return; $0 still holds the function result
+ .end __mpn_lshift
Removed: fsf/trunk/ports/sysdeps/alpha/alphaev5/lshift.s
==============================================================================
--- fsf/trunk/ports/sysdeps/alpha/alphaev5/lshift.s (original)
+++ fsf/trunk/ports/sysdeps/alpha/alphaev5/lshift.s (removed)
@@ -1,172 +1,0 @@
- # Alpha EV5 __mpn_lshift --
-
- # Copyright (C) 1994, 1995 Free Software Foundation, Inc.
-
- # This file is part of the GNU MP Library.
-
- # The GNU MP Library is free software; you can redistribute it and/or modify
- # it under the terms of the GNU Lesser General Public License as published by
- # the Free Software Foundation; either version 2.1 of the License, or (at your
- # option) any later version.
-
- # The GNU MP Library is distributed in the hope that it will be useful, but
- # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
- # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
- # License for more details.
-
- # You should have received a copy of the GNU Lesser General Public License
- # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
-
-
- # INPUT PARAMETERS
- # res_ptr r16
- # s1_ptr r17
- # size r18
- # cnt r19
-
- # This code runs at 3.25 cycles/limb on the EV5.
-
- .set noreorder
- .set noat
-.text
- .align 3
- .globl __mpn_lshift
- .ent __mpn_lshift
-__mpn_lshift:
- .frame $30,0,$26,0
-
- s8addq $18,$17,$17 # make r17 point at end of s1
- ldq $4,-8($17) # load first limb
- subq $31,$19,$20
- s8addq $18,$16,$16 # make r16 point at end of RES
- subq $18,1,$18
- and $18,4-1,$28 # number of limbs in first loop
- srl $4,$20,$0 # compute function result
-
- beq $28,.L0
- subq $18,$28,$18
-
- .align 3
-.Loop0: ldq $3,-16($17)
- subq $16,8,$16
- sll $4,$19,$5
- subq $17,8,$17
- subq $28,1,$28
- srl $3,$20,$6
- or $3,$3,$4
- or $5,$6,$8
- stq $8,0($16)
- bne $28,.Loop0
-
-.L0: sll $4,$19,$24
- beq $18,.Lend
- # warm up phase 1
- ldq $1,-16($17)
- subq $18,4,$18
- ldq $2,-24($17)
- ldq $3,-32($17)
- ldq $4,-40($17)
- beq $18,.Lend1
- # warm up phase 2
- srl $1,$20,$7
- sll $1,$19,$21
- srl $2,$20,$8
- ldq $1,-48($17)
- sll $2,$19,$22
- ldq $2,-56($17)
- srl $3,$20,$5
- or $7,$24,$7
- sll $3,$19,$23
- or $8,$21,$8
- srl $4,$20,$6
- ldq $3,-64($17)
- sll $4,$19,$24
- ldq $4,-72($17)
- subq $18,4,$18
- beq $18,.Lend2
- .align 4
- # main loop
-.Loop: stq $7,-8($16)
- or $5,$22,$5
- stq $8,-16($16)
- or $6,$23,$6
-
- srl $1,$20,$7
- subq $18,4,$18
- sll $1,$19,$21
- unop # ldq $31,-96($17)
-
- srl $2,$20,$8
- ldq $1,-80($17)
- sll $2,$19,$22
- ldq $2,-88($17)
-
- stq $5,-24($16)
- or $7,$24,$7
- stq $6,-32($16)
- or $8,$21,$8
-
- srl $3,$20,$5
- unop # ldq $31,-96($17)
- sll $3,$19,$23
- subq $16,32,$16
-
- srl $4,$20,$6
- ldq $3,-96($17)
- sll $4,$19,$24
- ldq $4,-104($17)
-
- subq $17,32,$17
- bne $18,.Loop
- # cool down phase 2/1
-.Lend2: stq $7,-8($16)
- or $5,$22,$5
- stq $8,-16($16)
- or $6,$23,$6
- srl $1,$20,$7
- sll $1,$19,$21
- srl $2,$20,$8
- sll $2,$19,$22
- stq $5,-24($16)
- or $7,$24,$7
- stq $6,-32($16)
- or $8,$21,$8
- srl $3,$20,$5
- sll $3,$19,$23
- srl $4,$20,$6
- sll $4,$19,$24
- # cool down phase 2/2
- stq $7,-40($16)
- or $5,$22,$5
- stq $8,-48($16)
- or $6,$23,$6
- stq $5,-56($16)
- stq $6,-64($16)
- # cool down phase 2/3
- stq $24,-72($16)
- ret $31,($26),1
-
- # cool down phase 1/1
-.Lend1: srl $1,$20,$7
- sll $1,$19,$21
- srl $2,$20,$8
- sll $2,$19,$22
- srl $3,$20,$5
- or $7,$24,$7
- sll $3,$19,$23
- or $8,$21,$8
- srl $4,$20,$6
- sll $4,$19,$24
- # cool down phase 1/2
- stq $7,-8($16)
- or $5,$22,$5
- stq $8,-16($16)
- or $6,$23,$6
- stq $5,-24($16)
- stq $6,-32($16)
- stq $24,-40($16)
- ret $31,($26),1
-
-.Lend: stq $24,-8($16)
- ret $31,($26),1
- .end __mpn_lshift
Added: fsf/trunk/ports/sysdeps/alpha/alphaev5/rshift.S
==============================================================================
--- fsf/trunk/ports/sysdeps/alpha/alphaev5/rshift.S (added)
+++ fsf/trunk/ports/sysdeps/alpha/alphaev5/rshift.S Mon May 21 00:02:36 2012
@@ -1,0 +1,170 @@
+ # Alpha EV5 __mpn_rshift --
+
+ # Copyright (C) 1994, 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
+
+
+ # INPUT PARAMETERS
+ # res_ptr r16
+ # s1_ptr r17
+ # size r18
+ # cnt r19
+
+ # This code runs at 3.25 cycles/limb on the EV5.
+
+ .set noreorder
+ .set noat
+.text
+ .align 3
+ .globl __mpn_rshift
+ .ent __mpn_rshift
+__mpn_rshift: # shift the size-limb vector at s1_ptr right by cnt bits into res_ptr
+ .frame $30,0,$26,0 # no stack frame is allocated; return address is in $26
+
+ ldq $4,0($17) # load first limb
+ subq $31,$19,$20 # $20 = -cnt; shifts use only the low 6 count bits, so it acts as 64-cnt
+ subq $18,1,$18 # $18 = size-1 = limbs still to be loaded
+ and $18,4-1,$28 # number of limbs in first loop
+ sll $4,$20,$0 # compute function result (first limb << (64-cnt), returned in $0)
+
+ beq $28,.L0 # remaining count already a multiple of 4: skip first loop
+ subq $18,$28,$18 # $18 = limbs left for the 4-way unrolled code
+
+ .align 3
+.Loop0: ldq $3,8($17) # $3 = next source limb
+ addq $16,8,$16 # advance res_ptr
+ srl $4,$19,$5 # current limb >> cnt
+ addq $17,8,$17 # advance s1_ptr
+ subq $28,1,$28 # one fewer limb for this loop
+ sll $3,$20,$6 # next limb << (64-cnt)
+ or $3,$3,$4 # $4 = $3: next limb becomes current limb
+ or $5,$6,$8 # merge both parts into the result limb
+ stq $8,-8($16) # store it (res_ptr was already advanced)
+ bne $28,.Loop0
+
+.L0: srl $4,$19,$24 # $24 = current limb >> cnt, still lacking its upper bits
+ beq $18,.Lend # no limbs left: store $24 as the last result limb
+ # warm up phase 1
+ ldq $1,8($17) # load the next four limbs into $1-$4
+ subq $18,4,$18
+ ldq $2,16($17)
+ ldq $3,24($17)
+ ldq $4,32($17)
+ beq $18,.Lend1 # that was the last group of four: short cool down
+ # warm up phase 2
+ sll $1,$20,$7 # $7,$8,$5,$6 = limbs $1-$4 << (64-cnt)
+ srl $1,$19,$21 # $21,$22,$23,$24 = limbs $1-$4 >> cnt
+ sll $2,$20,$8
+ ldq $1,40($17) # start loading the following group of four
+ srl $2,$19,$22
+ ldq $2,48($17)
+ sll $3,$20,$5
+ or $7,$24,$7 # finish previous limb: its low part | upper bits from $1
+ srl $3,$19,$23
+ or $8,$21,$8
+ sll $4,$20,$6
+ ldq $3,56($17)
+ srl $4,$19,$24
+ ldq $4,64($17)
+ subq $18,4,$18
+ beq $18,.Lend2 # no further group to load: cool down
+ .align 4
+ # main loop
+.Loop: stq $7,0($16) # store two finished result limbs
+ or $5,$22,$5 # finish the other two limbs of the previous group
+ stq $8,8($16)
+ or $6,$23,$6
+
+ sll $1,$20,$7
+ subq $18,4,$18 # four more limbs consumed
+ srl $1,$19,$21
+ unop # ldq $31,-96($17) NOTE(review): disabled load; its negative offset points behind s1_ptr although this loop walks upward -- confirm intent
+
+ sll $2,$20,$8
+ ldq $1,72($17)
+ srl $2,$19,$22
+ ldq $2,80($17)
+
+ stq $5,16($16)
+ or $7,$24,$7
+ stq $6,24($16)
+ or $8,$21,$8
+
+ sll $3,$20,$5
+ unop # ldq $31,-96($17) NOTE(review): same disabled load as above -- confirm intent
+ srl $3,$19,$23
+ addq $16,32,$16 # res_ptr += 4 limbs
+
+ sll $4,$20,$6
+ ldq $3,88($17)
+ srl $4,$19,$24
+ ldq $4,96($17)
+
+ addq $17,32,$17 # s1_ptr += 4 limbs
+ bne $18,.Loop
+ # cool down phase 2/1
+.Lend2: stq $7,0($16) # drain: same steps as the main loop, without further loads
+ or $5,$22,$5
+ stq $8,8($16)
+ or $6,$23,$6
+ sll $1,$20,$7
+ srl $1,$19,$21
+ sll $2,$20,$8
+ srl $2,$19,$22
+ stq $5,16($16)
+ or $7,$24,$7
+ stq $6,24($16)
+ or $8,$21,$8
+ sll $3,$20,$5
+ srl $3,$19,$23
+ sll $4,$20,$6
+ srl $4,$19,$24
+ # cool down phase 2/2
+ stq $7,32($16)
+ or $5,$22,$5
+ stq $8,40($16)
+ or $6,$23,$6
+ stq $5,48($16)
+ stq $6,56($16)
+ # cool down phase 2/3
+ stq $24,64($16) # last result limb: only the >> cnt part, nothing to merge above it
+ ret $31,($26),1
+
+ # cool down phase 1/1
+.Lend1: sll $1,$20,$7 # only one group of four was loaded: compute and store it
+ srl $1,$19,$21
+ sll $2,$20,$8
+ srl $2,$19,$22
+ sll $3,$20,$5
+ or $7,$24,$7
+ srl $3,$19,$23
+ or $8,$21,$8
+ sll $4,$20,$6
+ srl $4,$19,$24
+ # cool down phase 1/2
+ stq $7,0($16)
+ or $5,$22,$5
+ stq $8,8($16)
+ or $6,$23,$6
+ stq $5,16($16)
+ stq $6,24($16)
+ stq $24,32($16) # last result limb: only the >> cnt part
+ ret $31,($26),1
+
+.Lend: stq $24,0($16) # last result limb: only the >> cnt part
+ ret $31,($26),1
+ .end __mpn_rshift
Removed: fsf/trunk/ports/sysdeps/alpha/alphaev5/rshift.s
==============================================================================
--- fsf/trunk/ports/sysdeps/alpha/alphaev5/rshift.s (original)
+++ fsf/trunk/ports/sysdeps/alpha/alphaev5/rshift.s (removed)
@@ -1,170 +1,0 @@
- # Alpha EV5 __mpn_rshift --
-
- # Copyright (C) 1994, 1995 Free Software Foundation, Inc.
-
- # This file is part of the GNU MP Library.
-
- # The GNU MP Library is free software; you can redistribute it and/or modify
- # it under the terms of the GNU Lesser General Public License as published by
- # the Free Software Foundation; either version 2.1 of the License, or (at your
- # option) any later version.
-
- # The GNU MP Library is distributed in the hope that it will be useful, but
- # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
- # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
- # License for more details.
-
- # You should have received a copy of the GNU Lesser General Public License
- # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
-
-
- # INPUT PARAMETERS
- # res_ptr r16
- # s1_ptr r17
- # size r18
- # cnt r19
-
- # This code runs at 3.25 cycles/limb on the EV5.
-
- .set noreorder
- .set noat
-.text
- .align 3
- .globl __mpn_rshift
- .ent __mpn_rshift
-__mpn_rshift:
- .frame $30,0,$26,0
-
- ldq $4,0($17) # load first limb
- subq $31,$19,$20
- subq $18,1,$18
- and $18,4-1,$28 # number of limbs in first loop
- sll $4,$20,$0 # compute function result
-
- beq $28,.L0
- subq $18,$28,$18
-
- .align 3
-.Loop0: ldq $3,8($17)
- addq $16,8,$16
- srl $4,$19,$5
- addq $17,8,$17
- subq $28,1,$28
- sll $3,$20,$6
- or $3,$3,$4
- or $5,$6,$8
- stq $8,-8($16)
- bne $28,.Loop0
-
-.L0: srl $4,$19,$24
- beq $18,.Lend
- # warm up phase 1
- ldq $1,8($17)
- subq $18,4,$18
- ldq $2,16($17)
- ldq $3,24($17)
- ldq $4,32($17)
- beq $18,.Lend1
- # warm up phase 2
- sll $1,$20,$7
- srl $1,$19,$21
- sll $2,$20,$8
- ldq $1,40($17)
- srl $2,$19,$22
- ldq $2,48($17)
- sll $3,$20,$5
- or $7,$24,$7
- srl $3,$19,$23
- or $8,$21,$8
- sll $4,$20,$6
- ldq $3,56($17)
- srl $4,$19,$24
- ldq $4,64($17)
- subq $18,4,$18
- beq $18,.Lend2
- .align 4
- # main loop
-.Loop: stq $7,0($16)
- or $5,$22,$5
- stq $8,8($16)
- or $6,$23,$6
-
- sll $1,$20,$7
- subq $18,4,$18
- srl $1,$19,$21
- unop # ldq $31,-96($17)
-
- sll $2,$20,$8
- ldq $1,72($17)
- srl $2,$19,$22
- ldq $2,80($17)
-
- stq $5,16($16)
- or $7,$24,$7
- stq $6,24($16)
- or $8,$21,$8
-
- sll $3,$20,$5
- unop # ldq $31,-96($17)
- srl $3,$19,$23
- addq $16,32,$16
-
- sll $4,$20,$6
- ldq $3,88($17)
- srl $4,$19,$24
- ldq $4,96($17)
-
- addq $17,32,$17
- bne $18,.Loop
- # cool down phase 2/1
-.Lend2: stq $7,0($16)
- or $5,$22,$5
- stq $8,8($16)
- or $6,$23,$6
- sll $1,$20,$7
- srl $1,$19,$21
- sll $2,$20,$8
- srl $2,$19,$22
- stq $5,16($16)
- or $7,$24,$7
- stq $6,24($16)
- or $8,$21,$8
- sll $3,$20,$5
- srl $3,$19,$23
- sll $4,$20,$6
- srl $4,$19,$24
- # cool down phase 2/2
- stq $7,32($16)
- or $5,$22,$5
- stq $8,40($16)
- or $6,$23,$6
- stq $5,48($16)
- stq $6,56($16)
- # cool down phase 2/3
- stq $24,64($16)
- ret $31,($26),1
-
- # cool down phase 1/1
-.Lend1: sll $1,$20,$7
- srl $1,$19,$21
- sll $2,$20,$8
- srl $2,$19,$22
- sll $3,$20,$5
- or $7,$24,$7
- srl $3,$19,$23
- or $8,$21,$8
- sll $4,$20,$6
- srl $4,$19,$24
- # cool down phase 1/2
- stq $7,0($16)
- or $5,$22,$5
- stq $8,8($16)
- or $6,$23,$6
- stq $5,16($16)
- stq $6,24($16)
- stq $24,32($16)
- ret $31,($26),1
-
-.Lend: stq $24,0($16)
- ret $31,($26),1
- .end __mpn_rshift
Added: fsf/trunk/ports/sysdeps/alpha/alphaev5/sub_n.S
==============================================================================
--- fsf/trunk/ports/sysdeps/alpha/alphaev5/sub_n.S (added)
+++ fsf/trunk/ports/sysdeps/alpha/alphaev5/sub_n.S Mon May 21 00:02:36 2012
@@ -1,0 +1,147 @@
+ # Alpha __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+ # store difference in a third limb vector.
+
+ # Copyright (C) 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
+
+
+ # INPUT PARAMETERS
+ # res_ptr $16
+ # s1_ptr $17
+ # s2_ptr $18
+ # size $19
+
+ .set noreorder
+ .set noat
+.text
+ .align 3
+ .globl __mpn_sub_n
+ .ent __mpn_sub_n
+__mpn_sub_n: # res = s1 - s2 over size limbs; the final borrow is returned in $0
+ .frame $30,0,$26,0 # no stack frame is allocated; return address is in $26
+
+ or $31,$31,$25 # clear cy
+ subq $19,4,$19 # decr loop cnt
+ blt $19,.Lend2 # if less than 4 limbs, goto 2nd loop
+ # Start software pipeline for 1st loop
+ ldq $0,0($18) # $0-$3 receive s2 limbs, $4-$7 receive s1 limbs
+ ldq $1,8($18)
+ ldq $4,0($17)
+ ldq $5,8($17)
+ addq $17,32,$17 # update s1_ptr
+ ldq $2,16($18)
+ subq $4,$0,$20 # 1st main sub
+ ldq $3,24($18)
+ subq $19,4,$19 # decr loop cnt
+ ldq $6,-16($17)
+ cmpult $4,$20,$25 # compute cy from last sub
+ ldq $7,-8($17)
+ addq $1,$25,$28 # cy add
+ addq $18,32,$18 # update s2_ptr
+ subq $5,$28,$21 # 2nd main sub
+ cmpult $28,$25,$8 # compute cy from last add
+ blt $19,.Lend1 # if less than 4 limbs remain, jump
+ # 1st loop handles groups of 4 limbs in a software pipeline
+ .align 4
+.Loop: cmpult $5,$21,$25 # compute cy (borrow) from last sub
+ ldq $0,0($18) # next group: s2 limb 0
+ or $8,$25,$25 # combine cy from the add and the sub
+ ldq $1,8($18) # next group: s2 limb 1
+ addq $2,$25,$28 # cy add
+ ldq $4,0($17) # next group: s1 limb 0
+ subq $6,$28,$22 # 3rd main sub
+ ldq $5,8($17) # next group: s1 limb 1
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $6,$22,$25 # compute cy (borrow) from last sub
+ stq $20,0($16) # store 1st difference limb
+ or $8,$25,$25 # combine cy from the add and the sub
+ stq $21,8($16) # store 2nd difference limb
+ addq $3,$25,$28 # cy add
+ subq $7,$28,$23 # 4th main sub
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $7,$23,$25 # compute cy (borrow) from last sub
+ addq $17,32,$17 # update s1_ptr
+ or $8,$25,$25 # combine cy from the add and the sub
+ addq $16,32,$16 # update res_ptr
+ addq $0,$25,$28 # cy add
+ ldq $2,16($18) # next group: s2 limb 2
+ subq $4,$28,$20 # 1st main sub
+ ldq $3,24($18) # next group: s2 limb 3
+ cmpult $28,$25,$8 # compute cy from last add
+ ldq $6,-16($17) # next group: s1 limb 2 (s1_ptr already advanced)
+ cmpult $4,$20,$25 # compute cy (borrow) from last sub
+ ldq $7,-8($17) # next group: s1 limb 3
+ or $8,$25,$25 # combine cy from the add and the sub
+ subq $19,4,$19 # decr loop cnt
+ stq $22,-16($16) # store 3rd difference limb (res_ptr already advanced)
+ addq $1,$25,$28 # cy add
+ stq $23,-8($16) # store 4th difference limb
+ subq $5,$28,$21 # 2nd main sub
+ addq $18,32,$18 # update s2_ptr
+ cmpult $28,$25,$8 # compute cy from last add
+ bge $19,.Loop
+ # Finish software pipeline for 1st loop
+.Lend1: cmpult $5,$21,$25 # compute cy (borrow) from last sub
+ or $8,$25,$25 # combine cy from the add and the sub
+ addq $2,$25,$28 # cy add
+ subq $6,$28,$22 # 3rd main sub
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $6,$22,$25 # compute cy (borrow) from last sub
+ stq $20,0($16) # store 1st difference limb
+ or $8,$25,$25 # combine cy from the add and the sub
+ stq $21,8($16) # store 2nd difference limb
+ addq $3,$25,$28 # cy add
+ subq $7,$28,$23 # 4th main sub
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $7,$23,$25 # compute cy (borrow) from last sub
+ or $8,$25,$25 # combine cy from the add and the sub
+ addq $16,32,$16 # update res_ptr
+ stq $22,-16($16) # store 3rd difference limb
+ stq $23,-8($16) # store 4th difference limb
+.Lend2: addq $19,4,$19 # restore loop cnt
+ beq $19,.Lret # nothing left: return the carry in $25
+ # Start software pipeline for 2nd loop
+ ldq $0,0($18) # $0 = s2 limb
+ ldq $4,0($17) # $4 = s1 limb
+ subq $19,1,$19
+ beq $19,.Lend0 # exactly one limb left: no loop needed
+ # 2nd loop handles remaining 1-3 limbs
+ .align 4
+.Loop0: addq $0,$25,$28 # cy add
+ ldq $0,8($18) # $0 = next s2 limb
+ subq $4,$28,$20 # main sub
+ ldq $1,8($17) # $1 = next s1 limb ($4 is still needed below)
+ addq $18,8,$18 # update s2_ptr
+ cmpult $28,$25,$8 # compute cy from last add
+ addq $17,8,$17 # update s1_ptr
+ stq $20,0($16) # store difference limb
+ cmpult $4,$20,$25 # compute cy (borrow) from last sub
+ subq $19,1,$19 # decr loop cnt
+ or $8,$25,$25 # combine cy from the add and the sub
+ addq $16,8,$16 # update res_ptr
+ or $1,$31,$4 # $4 = $1: next s1 limb becomes current
+ bne $19,.Loop0
+.Lend0: addq $0,$25,$28 # cy add
+ subq $4,$28,$20 # main sub
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $4,$20,$25 # compute cy (borrow) from last sub
+ stq $20,0($16) # store last difference limb
+ or $8,$25,$25 # combine cy from the add and the sub
+
+.Lret: or $25,$31,$0 # return cy
+ ret $31,($26),1
+ .end __mpn_sub_n
Removed: fsf/trunk/ports/sysdeps/alpha/alphaev5/sub_n.s
==============================================================================
--- fsf/trunk/ports/sysdeps/alpha/alphaev5/sub_n.s (original)
+++ fsf/trunk/ports/sysdeps/alpha/alphaev5/sub_n.s (removed)
@@ -1,147 +1,0 @@
- # Alpha __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
- # store difference in a third limb vector.
-
- # Copyright (C) 1995 Free Software Foundation, Inc.
-
- # This file is part of the GNU MP Library.
-
- # The GNU MP Library is free software; you can redistribute it and/or modify
- # it under the terms of the GNU Lesser General Public License as published by
- # the Free Software Foundation; either version 2.1 of the License, or (at your
- # option) any later version.
-
- # The GNU MP Library is distributed in the hope that it will be useful, but
- # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
- # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
- # License for more details.
-
- # You should have received a copy of the GNU Lesser General Public License
- # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
-
-
- # INPUT PARAMETERS
- # res_ptr $16
- # s1_ptr $17
- # s2_ptr $18
- # size $19
-
- .set noreorder
- .set noat
-.text
- .align 3
- .globl __mpn_sub_n
- .ent __mpn_sub_n
-__mpn_sub_n:
- .frame $30,0,$26,0
-
- or $31,$31,$25 # clear cy
- subq $19,4,$19 # decr loop cnt
- blt $19,.Lend2 # if less than 4 limbs, goto 2nd loop
- # Start software pipeline for 1st loop
- ldq $0,0($18)
- ldq $1,8($18)
- ldq $4,0($17)
- ldq $5,8($17)
- addq $17,32,$17 # update s1_ptr
- ldq $2,16($18)
- subq $4,$0,$20 # 1st main sub
- ldq $3,24($18)
- subq $19,4,$19 # decr loop cnt
- ldq $6,-16($17)
- cmpult $4,$20,$25 # compute cy from last sub
- ldq $7,-8($17)
- addq $1,$25,$28 # cy add
- addq $18,32,$18 # update s2_ptr
- subq $5,$28,$21 # 2nd main sub
- cmpult $28,$25,$8 # compute cy from last add
- blt $19,.Lend1 # if less than 4 limbs remain, jump
- # 1st loop handles groups of 4 limbs in a software pipeline
- .align 4
-.Loop: cmpult $5,$21,$25 # compute cy from last add
- ldq $0,0($18)
- or $8,$25,$25 # combine cy from the two adds
- ldq $1,8($18)
- addq $2,$25,$28 # cy add
- ldq $4,0($17)
- subq $6,$28,$22 # 3rd main sub
- ldq $5,8($17)
- cmpult $28,$25,$8 # compute cy from last add
- cmpult $6,$22,$25 # compute cy from last add
- stq $20,0($16)
- or $8,$25,$25 # combine cy from the two adds
- stq $21,8($16)
- addq $3,$25,$28 # cy add
- subq $7,$28,$23 # 4th main sub
- cmpult $28,$25,$8 # compute cy from last add
- cmpult $7,$23,$25 # compute cy from last add
- addq $17,32,$17 # update s1_ptr
- or $8,$25,$25 # combine cy from the two adds
- addq $16,32,$16 # update res_ptr
- addq $0,$25,$28 # cy add
- ldq $2,16($18)
- subq $4,$28,$20 # 1st main sub
- ldq $3,24($18)
- cmpult $28,$25,$8 # compute cy from last add
- ldq $6,-16($17)
- cmpult $4,$20,$25 # compute cy from last add
- ldq $7,-8($17)
- or $8,$25,$25 # combine cy from the two adds
- subq $19,4,$19 # decr loop cnt
- stq $22,-16($16)
- addq $1,$25,$28 # cy add
- stq $23,-8($16)
- subq $5,$28,$21 # 2nd main sub
- addq $18,32,$18 # update s2_ptr
- cmpult $28,$25,$8 # compute cy from last add
- bge $19,.Loop
- # Finish software pipeline for 1st loop
-.Lend1: cmpult $5,$21,$25 # compute cy from last add
- or $8,$25,$25 # combine cy from the two adds
- addq $2,$25,$28 # cy add
- subq $6,$28,$22 # 3rd main sub
- cmpult $28,$25,$8 # compute cy from last add
- cmpult $6,$22,$25 # compute cy from last add
- stq $20,0($16)
- or $8,$25,$25 # combine cy from the two adds
- stq $21,8($16)
- addq $3,$25,$28 # cy add
- subq $7,$28,$23 # 4th main sub
- cmpult $28,$25,$8 # compute cy from last add
- cmpult $7,$23,$25 # compute cy from last add
- or $8,$25,$25 # combine cy from the two adds
- addq $16,32,$16 # update res_ptr
- stq $22,-16($16)
- stq $23,-8($16)
-.Lend2: addq $19,4,$19 # restore loop cnt
- beq $19,.Lret
- # Start software pipeline for 2nd loop
- ldq $0,0($18)
- ldq $4,0($17)
- subq $19,1,$19
- beq $19,.Lend0
- # 2nd loop handles remaining 1-3 limbs
- .align 4
-.Loop0: addq $0,$25,$28 # cy add
- ldq $0,8($18)
- subq $4,$28,$20 # main sub
- ldq $1,8($17)
- addq $18,8,$18
- cmpult $28,$25,$8 # compute cy from last add
- addq $17,8,$17
- stq $20,0($16)
- cmpult $4,$20,$25 # compute cy from last add
- subq $19,1,$19 # decr loop cnt
- or $8,$25,$25 # combine cy from the two adds
- addq $16,8,$16
- or $1,$31,$4
- bne $19,.Loop0
-.Lend0: addq $0,$25,$28 # cy add
- subq $4,$28,$20 # main sub
- cmpult $28,$25,$8 # compute cy from last add
- cmpult $4,$20,$25 # compute cy from last add
- stq $20,0($16)
- or $8,$25,$25 # combine cy from the two adds
-
-.Lret: or $25,$31,$0 # return cy
- ret $31,($26),1
- .end __mpn_sub_n
Added: fsf/trunk/ports/sysdeps/alpha/alphaev6/addmul_1.S
==============================================================================
--- fsf/trunk/ports/sysdeps/alpha/alphaev6/addmul_1.S (added)
+++ fsf/trunk/ports/sysdeps/alpha/alphaev6/addmul_1.S Mon May 21 00:02:36 2012
@@ -1,0 +1,477 @@
+ # Alpha ev6 mpn_addmul_1 -- Multiply a limb vector with a limb and add
+ # the result to a second limb vector.
+ #
+ # Copyright (C) 2000 Free Software Foundation, Inc.
+ #
+ # This file is part of the GNU MP Library.
+ #
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published
+ # by the Free Software Foundation; either version 2.1 of the License, or (at
+ # your option) any later version.
+ #
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+ #
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
+
+ # INPUT PARAMETERS
+ # res_ptr $16
+ # s1_ptr $17
+ # size $18
+ # s2_limb $19
+ #
+ # This code runs at 42 cycles/limb on EV4, 18 cycles/limb on EV5, and
+ # exactly 3.625 cycles/limb on EV6...
+ #
+ # This code was written in close cooperation with ev6 pipeline expert
+ # Steve Root (root@xxxxxxxxxxxxxxxxxx). Any errors are tege's fault, though.
+ #
+ # Register usages for unrolled loop:
+ # 0-3 mul's
+ # 4-7 acc's
+ # 8-15 mul results
+ # 20,21 carry's
+ # 22,23 save for stores
+ #
+ # Sustains 8 mul-adds in 29 cycles in the unrolled inner loop.
+ #
+ # The stores can issue a cycle late so we have paired no-op's to 'catch'
+ # them, so that further disturbance to the schedule is damped.
+ #
+ # We couldn't pair the loads, because the entangled schedule of the
+ # carry's has to happen on one side {0} of the machine. Note, the total
+ # use of U0, and the total use of L0 (after attending to the stores).
+ # which is part of the reason why....
+ #
+ # This is a great schedule for the d_cache, a poor schedule for the
+ # b_cache. The lockup on U0 means that any stall can't be recovered
+ # from. Consider a ldq in L1. say that load gets stalled because it
+ # collides with a fill from the b_Cache. On the next cycle, this load
+ # gets priority. It first looks at L0, and goes there. The instruction
+ # we intended for L0 gets to look at L1, which is NOT where we want
+ # it. It either stalls 1, because it can't go in L0, or goes there, and
+ # causes a further instruction to stall.
+ #
+ # So for b_cache, we're likely going to want to put one or more cycles
+ # back into the code! And, of course, put in prefetches. For the
+ # accumulator, lds, intent to modify. For the multiplier, you might
+ # want ldq, evict next, if you're not wanting to use it again soon. Use
+ # 256 ahead of present pointer value. At a place where we have an mt
+ # followed by a bookkeeping, put the bookkeeping in upper, and the
+ # prefetch into lower.
+ #
+ # Note, the usage of physical registers per cycle is smoothed off, as
+ # much as possible.
+ #
+ # Note, the ldq's and stq's are at the end of the quadpacks. Note, we'd
+ # like not to have a ldq or stq precede a conditional branch in a
+ # quadpack. The conditional branch moves the retire pointer one cycle
+ # later.
+ #
+ # Optimization notes:
+ # Callee-saves regs: $9 $10 $11 $12 $13 $14 $15 $26 ?$27?
+ # Reserved regs: $29 $30 $31
+ # Free caller-saves regs in unrolled code: $24 $25 $28
+ # We should swap some of the callee-saves regs for some of the free
+ # caller-saves regs, saving some overhead cycles.
+ # Most importantly, we should write fast code for the 0-7 case.
+ # The code we use there is for the 21164, and runs at 7 cycles/limb
+ # on the 21264. Should not be hard, if we write specialized code for
+ # 1-7 limbs (the one for 0 limbs should be straightforward). We then just
+ # need a jump table indexed by the low 3 bits of the count argument.
+
+ .set noreorder
+ .set noat
+ .text
+
+ .globl __mpn_addmul_1
+ .ent __mpn_addmul_1
+__mpn_addmul_1: # res[i] += s1[i] * s2_limb for size limbs; carry-out limb returned in $0
+ .frame $30,0,$26,0 # NOTE(review): declares a 0-byte frame, yet the $Large path allocates 240 bytes and saves $9-$15 -- confirm the unwind info is adequate
+ .prologue 0
+
+ cmpult $18, 8, $1 # $1 = (size < 8)
+ beq $1, $Large # size >= 8: use the unrolled code
+
+ ldq $2, 0($17) # $2 = s1_limb
+ addq $17, 8, $17 # s1_ptr++
+ subq $18, 1, $18 # size--
+ mulq $2, $19, $3 # $3 = prod_low
+ ldq $5, 0($16) # $5 = *res_ptr
+ umulh $2, $19, $0 # $0 = prod_high
+ beq $18, $Lend0b # jump if size was == 1
+ ldq $2, 0($17) # $2 = s1_limb
+ addq $17, 8, $17 # s1_ptr++
+ subq $18, 1, $18 # size--
+ addq $5, $3, $3 # $3 = *res_ptr + prod_low
+ cmpult $3, $5, $4 # $4 = carry out of that add
+ stq $3, 0($16) # store first result limb
+ addq $16, 8, $16 # res_ptr++
+ beq $18, $Lend0a # jump if size was == 2
+
+ .align 3
+$Loop0: mulq $2, $19, $3 # $3 = prod_low
+ ldq $5, 0($16) # $5 = *res_ptr
+ addq $4, $0, $0 # cy_limb = cy_limb + 'cy'
+ subq $18, 1, $18 # size--
+ umulh $2, $19, $4 # $4 = cy_limb
+ ldq $2, 0($17) # $2 = s1_limb
+ addq $17, 8, $17 # s1_ptr++
+ addq $3, $0, $3 # $3 = cy_limb + prod_low
+ cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low)
+ addq $5, $3, $3 # $3 += *res_ptr
+ cmpult $3, $5, $5 # $5 = carry out of that add
+ stq $3, 0($16) # store result limb
+ addq $16, 8, $16 # res_ptr++
+ addq $5, $0, $0 # combine carries
+ bne $18, $Loop0
+$Lend0a:
+ mulq $2, $19, $3 # $3 = prod_low
+ ldq $5, 0($16) # $5 = *res_ptr
+ addq $4, $0, $0 # cy_limb = cy_limb + 'cy'
+ umulh $2, $19, $4 # $4 = cy_limb
+ addq $3, $0, $3 # $3 = cy_limb + prod_low
+ cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low)
+ addq $5, $3, $3 # $3 += *res_ptr
+ cmpult $3, $5, $5 # $5 = carry out of that add
+ stq $3, 0($16) # store last result limb
+ addq $5, $0, $0 # combine carries
+ addq $4, $0, $0 # cy_limb = prod_high + cy
+ ret $31, ($26), 1 # return carry-out limb in $0
+$Lend0b:
+ addq $5, $3, $3 # size was 1: $3 = *res_ptr + prod_low
+ cmpult $3, $5, $5 # $5 = carry out of that add
+ stq $3, 0($16) # store the only result limb
+ addq $0, $5, $0 # $0 = prod_high + carry
+ ret $31, ($26), 1 # return carry-out limb in $0
+
+$Large:
+ lda $30, -240($30) # allocate 240 bytes of stack (only offsets 8-56 are used below)
+ stq $9, 8($30) # save $9-$15, which the unrolled loop uses for mul results
+ stq $10, 16($30)
+ stq $11, 24($30)
+ stq $12, 32($30)
+ stq $13, 40($30)
+ stq $14, 48($30)
+ stq $15, 56($30)
+
+ and $18, 7, $20 # count for the first loop, 0-7
+ srl $18, 3, $18 # count for unrolled loop
+ bis $31, $31, $0 # $0 = 0: no carry limb yet
+ beq $20, $Lunroll # size is a multiple of 8: no leading limbs
+ ldq $2, 0($17) # $2 = s1_limb
+ addq $17, 8, $17 # s1_ptr++
+ subq $20, 1, $20 # size--
+ mulq $2, $19, $3 # $3 = prod_low
+ ldq $5, 0($16) # $5 = *res_ptr
+ umulh $2, $19, $0 # $0 = prod_high
+ beq $20, $Lend1b # jump if size was == 1
+ ldq $2, 0($17) # $2 = s1_limb
+ addq $17, 8, $17 # s1_ptr++
+ subq $20, 1, $20 # size--
+ addq $5, $3, $3 # $3 = *res_ptr + prod_low
+ cmpult $3, $5, $4 # $4 = carry out of that add
+ stq $3, 0($16) # store first result limb
+ addq $16, 8, $16 # res_ptr++
+ beq $20, $Lend1a # jump if size was == 2
+
+ .align 3
+$Loop1: mulq $2, $19, $3 # $3 = prod_low
+ ldq $5, 0($16) # $5 = *res_ptr
+ addq $4, $0, $0 # cy_limb = cy_limb + 'cy'
+ subq $20, 1, $20 # size--
+ umulh $2, $19, $4 # $4 = cy_limb
+ ldq $2, 0($17) # $2 = s1_limb
+ addq $17, 8, $17 # s1_ptr++
+ addq $3, $0, $3 # $3 = cy_limb + prod_low
+ cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low)
+ addq $5, $3, $3 # $3 += *res_ptr
+ cmpult $3, $5, $5 # $5 = carry out of that add
+ stq $3, 0($16) # store result limb
+ addq $16, 8, $16 # res_ptr++
+ addq $5, $0, $0 # combine carries
+ bne $20, $Loop1
+
+$Lend1a:
+ mulq $2, $19, $3 # $3 = prod_low
+ ldq $5, 0($16) # $5 = *res_ptr
+ addq $4, $0, $0 # cy_limb = cy_limb + 'cy'
+ umulh $2, $19, $4 # $4 = cy_limb
+ addq $3, $0, $3 # $3 = cy_limb + prod_low
+ cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low)
+ addq $5, $3, $3 # $3 += *res_ptr
+ cmpult $3, $5, $5 # $5 = carry out of that add
+ stq $3, 0($16) # store last leading result limb
+ addq $16, 8, $16 # res_ptr++
+ addq $5, $0, $0 # combine carries
+ addq $4, $0, $0 # cy_limb = prod_high + cy
+ br $31, $Lunroll # continue with the unrolled code, carry limb in $0
+$Lend1b:
+ addq $5, $3, $3 # single leading limb: $3 = *res_ptr + prod_low
+ cmpult $3, $5, $5 # $5 = carry out of that add
+ stq $3, 0($16) # store it
+ addq $16, 8, $16 # res_ptr++
+ addq $0, $5, $0 # $0 = prod_high + carry
+
+$Lunroll:
+ lda $17, -16($17) # L1 bookkeeping
+ lda $16, -16($16) # L1 bookkeeping
+ bis $0, $31, $12 # $12 = incoming carry limb ($0 from the leading limbs, or 0)
+
+ # ____ UNROLLED LOOP SOFTWARE PIPELINE STARTUP ____
+
+ ldq $2, 16($17) # L1
+ ldq $3, 24($17) # L1
+ lda $18, -1($18) # L1 bookkeeping
+ ldq $6, 16($16) # L1
+ ldq $7, 24($16) # L1
+ ldq $0, 32($17) # L1
+ mulq $19, $2, $13 # U1
+ ldq $1, 40($17) # L1
+ umulh $19, $2, $14 # U1
+ mulq $19, $3, $15 # U1
+ lda $17, 64($17) # L1 bookkeeping
+ ldq $4, 32($16) # L1
+ ldq $5, 40($16) # L1
+ umulh $19, $3, $8 # U1
+ ldq $2, -16($17) # L1
+ mulq $19, $0, $9 # U1
+ ldq $3, -8($17) # L1
+ umulh $19, $0, $10 # U1
+ addq $6, $13, $6 # L0 lo + acc
+ mulq $19, $1, $11 # U1
+ cmpult $6, $13, $20 # L0 lo add => carry
+ lda $16, 64($16) # L1 bookkeeping
+ addq $6, $12, $22 # U0 hi add => answer
+ cmpult $22, $12, $21 # L0 hi add => carry
+ addq $14, $20, $14 # U0 hi mul + carry
+ ldq $6, -16($16) # L1
+ addq $7, $15, $23 # L0 lo + acc
+ addq $14, $21, $14 # U0 hi mul + carry
+ ldq $7, -8($16) # L1
+ umulh $19, $1, $12 # U1
+ cmpult $23, $15, $20 # L0 lo add => carry
+ addq $23, $14, $23 # U0 hi add => answer
+ ldq $0, 0($17) # L1
+ mulq $19, $2, $13 # U1
+ cmpult $23, $14, $21 # L0 hi add => carry
+ addq $8, $20, $8 # U0 hi mul + carry
+ ldq $1, 8($17) # L1
+ umulh $19, $2, $14 # U1
+ addq $4, $9, $4 # L0 lo + acc
+ stq $22, -48($16) # L0
+ stq $23, -40($16) # L1
+ mulq $19, $3, $15 # U1
+ addq $8, $21, $8 # U0 hi mul + carry
+ cmpult $4, $9, $20 # L0 lo add => carry
+ addq $4, $8, $22 # U0 hi add => answer
+ ble $18, $Lend # U1 bookkeeping: only one group of 8 limbs, skip the main loop
+
+ # ____ MAIN UNROLLED LOOP ____
+ .align 4
+$Loop: # each pass stores 8 result limbs and advances both pointers by 64 bytes
+ bis $31, $31, $31 # U1 mt (writes only $31, so it acts as a no-op; "mt" presumably marks an empty slot)
+ cmpult $22, $8, $21 # L0 hi add => carry
+ addq $10, $20, $10 # U0 hi mul + carry
+ ldq $4, 0($16) # L1
+
+ bis $31, $31, $31 # U1 mt
+ addq $5, $11, $23 # L0 lo + acc
+ addq $10, $21, $10 # L0 hi mul + carry
+ ldq $5, 8($16) # L1
+
+ umulh $19, $3, $8 # U1
+ cmpult $23, $11, $20 # L0 lo add => carry
+ addq $23, $10, $23 # U0 hi add => answer
+ ldq $2, 16($17) # L1
+
+ mulq $19, $0, $9 # U1
+ cmpult $23, $10, $21 # L0 hi add => carry
+ addq $12, $20, $12 # U0 hi mul + carry
+ ldq $3, 24($17) # L1
+
+ umulh $19, $0, $10 # U1
+ addq $6, $13, $6 # L0 lo + acc
+ stq $22, -32($16) # L0
+ stq $23, -24($16) # L1
+
+ bis $31, $31, $31 # L0 st slosh (no-op that catches a store issuing a cycle late, per the file header notes)
+ mulq $19, $1, $11 # U1
+ bis $31, $31, $31 # L1 st slosh
+ addq $12, $21, $12 # U0 hi mul + carry
+
+ cmpult $6, $13, $20 # L0 lo add => carry
+ bis $31, $31, $31 # U1 mt
+ lda $18, -1($18) # L1 bookkeeping: one fewer group of 8 limbs to go
+ addq $6, $12, $22 # U0 hi add => answer
+
+ bis $31, $31, $31 # U1 mt
+ cmpult $22, $12, $21 # L0 hi add => carry
+ addq $14, $20, $14 # U0 hi mul + carry
+ ldq $6, 16($16) # L1
+
+ bis $31, $31, $31 # U1 mt
+ addq $7, $15, $23 # L0 lo + acc
+ addq $14, $21, $14 # U0 hi mul + carry
+ ldq $7, 24($16) # L1
+
+ umulh $19, $1, $12 # U1
+ cmpult $23, $15, $20 # L0 lo add => carry
+ addq $23, $14, $23 # U0 hi add => answer
+ ldq $0, 32($17) # L1
+
+ mulq $19, $2, $13 # U1
+ cmpult $23, $14, $21 # L0 hi add => carry
+ addq $8, $20, $8 # U0 hi mul + carry
+ ldq $1, 40($17) # L1
+
+ umulh $19, $2, $14 # U1
+ addq $4, $9, $4 # U0 lo + acc (NOTE(review): the same step is tagged L0 elsewhere in this loop -- confirm slot)
+ stq $22, -16($16) # L0
+ stq $23, -8($16) # L1
+
+ bis $31, $31, $31 # L0 st slosh
+ mulq $19, $3, $15 # U1
+ bis $31, $31, $31 # L1 st slosh
+ addq $8, $21, $8 # L0 hi mul + carry
+
+ cmpult $4, $9, $20 # L0 lo add => carry
+ bis $31, $31, $31 # U1 mt
+ lda $17, 64($17) # L1 bookkeeping
+ addq $4, $8, $22 # U0 hi add => answer
+
+ bis $31, $31, $31 # U1 mt
+ cmpult $22, $8, $21 # L0 hi add => carry
+ addq $10, $20, $10 # U0 hi mul + carry
+ ldq $4, 32($16) # L1
+
+ bis $31, $31, $31 # U1 mt
+ addq $5, $11, $23 # L0 lo + acc
+ addq $10, $21, $10 # L0 hi mul + carry
+ ldq $5, 40($16) # L1
+
+ umulh $19, $3, $8 # U1
+ cmpult $23, $11, $20 # L0 lo add => carry
+ addq $23, $10, $23 # U0 hi add => answer
+ ldq $2, -16($17) # L1
+
+ mulq $19, $0, $9 # U1
+ cmpult $23, $10, $21 # L0 hi add => carry
+ addq $12, $20, $12 # U0 hi mul + carry
+ ldq $3, -8($17) # L1
+
+ umulh $19, $0, $10 # U1
+ addq $6, $13, $6 # L0 lo + acc
+ stq $22, 0($16) # L0
+ stq $23, 8($16) # L1
+
+ bis $31, $31, $31 # L0 st slosh
+ mulq $19, $1, $11 # U1
+ bis $31, $31, $31 # L1 st slosh
+ addq $12, $21, $12 # U0 hi mul + carry
+
+ cmpult $6, $13, $20 # L0 lo add => carry
+ bis $31, $31, $31 # U1 mt
+ lda $16, 64($16) # L1 bookkeeping
+ addq $6, $12, $22 # U0 hi add => answer
+
+ bis $31, $31, $31 # U1 mt
+ cmpult $22, $12, $21 # L0 hi add => carry
+ addq $14, $20, $14 # U0 hi mul + carry
+ ldq $6, -16($16) # L1
+
+ bis $31, $31, $31 # U1 mt
+ addq $7, $15, $23 # L0 lo + acc
+ addq $14, $21, $14 # U0 hi mul + carry
+ ldq $7, -8($16) # L1
+
+ umulh $19, $1, $12 # U1
+ cmpult $23, $15, $20 # L0 lo add => carry
+ addq $23, $14, $23 # U0 hi add => answer
+ ldq $0, 0($17) # L1
+
+ mulq $19, $2, $13 # U1
+ cmpult $23, $14, $21 # L0 hi add => carry
+ addq $8, $20, $8 # U0 hi mul + carry
+ ldq $1, 8($17) # L1
+
+ umulh $19, $2, $14 # U1
+ addq $4, $9, $4 # L0 lo + acc
+ stq $22, -48($16) # L0
+ stq $23, -40($16) # L1
+
+ bis $31, $31, $31 # L0 st slosh
+ mulq $19, $3, $15 # U1
+ bis $31, $31, $31 # L1 st slosh
+ addq $8, $21, $8 # U0 hi mul + carry
+
+ cmpult $4, $9, $20 # L0 lo add => carry
+ addq $4, $8, $22 # U0 hi add => answer
+ bis $31, $31, $31 # L1 mt
+ bgt $18, $Loop # U1 bookkeeping: repeat while groups of 8 limbs remain
+
+# ____ UNROLLED LOOP SOFTWARE PIPELINE FINISH ____
+$Lend: # drain the pipeline: finish the 6 limbs still in flight, issuing no new s1 loads
+ cmpult $22, $8, $21 # L0 hi add => carry
+ addq $10, $20, $10 # U0 hi mul + carry
+ ldq $4, 0($16) # L1
+ addq $5, $11, $23 # L0 lo + acc
+ addq $10, $21, $10 # L0 hi mul + carry
+ ldq $5, 8($16) # L1
+ umulh $19, $3, $8 # U1
+ cmpult $23, $11, $20 # L0 lo add => carry
+ addq $23, $10, $23 # U0 hi add => answer
+ mulq $19, $0, $9 # U1
+ cmpult $23, $10, $21 # L0 hi add => carry
+ addq $12, $20, $12 # U0 hi mul + carry
+ umulh $19, $0, $10 # U1
+ addq $6, $13, $6 # L0 lo + acc
+ stq $22, -32($16) # L0
+ stq $23, -24($16) # L1
+ mulq $19, $1, $11 # U1
+ addq $12, $21, $12 # U0 hi mul + carry
+ cmpult $6, $13, $20 # L0 lo add => carry
+ addq $6, $12, $22 # U0 hi add => answer
+ cmpult $22, $12, $21 # L0 hi add => carry
+ addq $14, $20, $14 # U0 hi mul + carry
+ addq $7, $15, $23 # L0 lo + acc
+ addq $14, $21, $14 # U0 hi mul + carry
+ umulh $19, $1, $12 # U1
+ cmpult $23, $15, $20 # L0 lo add => carry
+ addq $23, $14, $23 # U0 hi add => answer
+ cmpult $23, $14, $21 # L0 hi add => carry
+ addq $8, $20, $8 # U0 hi mul + carry
+ addq $4, $9, $4 # U0 lo + acc
+ stq $22, -16($16) # L0
+ stq $23, -8($16) # L1
+ bis $31, $31, $31 # L0 st slosh
+ addq $8, $21, $8 # L0 hi mul + carry
+ cmpult $4, $9, $20 # L0 lo add => carry
+ addq $4, $8, $22 # U0 hi add => answer
+ cmpult $22, $8, $21 # L0 hi add => carry
+ addq $10, $20, $10 # U0 hi mul + carry
+ addq $5, $11, $23 # L0 lo + acc
+ addq $10, $21, $10 # L0 hi mul + carry
+ cmpult $23, $11, $20 # L0 lo add => carry
+ addq $23, $10, $23 # U0 hi add => answer
+ cmpult $23, $10, $21 # L0 hi add => carry
+ addq $12, $20, $12 # U0 hi mul + carry
+ stq $22, 0($16) # L0
+ stq $23, 8($16) # L1
+ addq $12, $21, $0 # U0 hi mul + carry; $0 = final carry-out limb (return value)
+
+ ldq $9, 8($30) # restore $9-$15 from the slots written at $Large
+ ldq $10, 16($30)
+ ldq $11, 24($30)
+ ldq $12, 32($30)
+ ldq $13, 40($30)
+ ldq $14, 48($30)
+ ldq $15, 56($30)
+ lda $30, 240($30) # release the 240-byte stack area
+ ret $31, ($26), 1
+
+ .end __mpn_addmul_1
Removed: fsf/trunk/ports/sysdeps/alpha/alphaev6/addmul_1.s
==============================================================================
--- fsf/trunk/ports/sysdeps/alpha/alphaev6/addmul_1.s (original)
+++ fsf/trunk/ports/sysdeps/alpha/alphaev6/addmul_1.s (removed)
@@ -1,477 +1,0 @@
- # Alpha ev6 mpn_addmul_1 -- Multiply a limb vector with a limb and add
- # the result to a second limb vector.
- #
- # Copyright (C) 2000 Free Software Foundation, Inc.
- #
- # This file is part of the GNU MP Library.
- #
- # The GNU MP Library is free software; you can redistribute it and/or modify
- # it under the terms of the GNU Lesser General Public License as published
- # by the Free Software Foundation; either version 2.1 of the License, or (at
- # your option) any later version.
- #
- # The GNU MP Library is distributed in the hope that it will be useful, but
- # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
- # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
- # License for more details.
- #
- # You should have received a copy of the GNU Lesser General Public License
- # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
-
- # INPUT PARAMETERS
- # res_ptr $16
- # s1_ptr $17
- # size $18
- # s2_limb $19
- #
- # This code runs at 42 cycles/limb on EV4, 18 cycles/limb on EV5, and
- # exactly 3.625 cycles/limb on EV6...
- #
- # This code was written in close cooperation with ev6 pipeline expert
- # Steve Root (root@xxxxxxxxxxxxxxxxxx). Any errors are tege's fault, though.
- #
- # Register usages for unrolled loop:
- # 0-3 mul's
- # 4-7 acc's
- # 8-15 mul results
- # 20,21 carry's
- # 22,23 save for stores
- #
- # Sustains 8 mul-adds in 29 cycles in the unrolled inner loop.
- #
- # The stores can issue a cycle late so we have paired no-op's to 'catch'
- # them, so that further disturbance to the schedule is damped.
- #
- # We couldn't pair the loads, because the entangled schedule of the
- # carry's has to happen on one side {0} of the machine. Note, the total
- # use of U0, and the total use of L0 (after attending to the stores).
- # which is part of the reason why....
- #
- # This is a great schedule for the d_cache, a poor schedule for the
- # b_cache. The lockup on U0 means that any stall can't be recovered
- # from. Consider a ldq in L1. say that load gets stalled because it
- # collides with a fill from the b_Cache. On the next cycle, this load
- # gets priority. If first looks at L0, and goes there. The instruction
- # we intended for L0 gets to look at L1, which is NOT where we want
- # it. It either stalls 1, because it can't go in L0, or goes there, and
- # causes a further instruction to stall.
- #
- # So for b_cache, we're likely going to want to put one or more cycles
- # back into the code! And, of course, put in prefetches. For the
- # accumulator, lds, intent to modify. For the multiplier, you might
- # want ldq, evict next, if you're not wanting to use it again soon. Use
- # 256 ahead of present pointer value. At a place where we have an mt
- # followed by a bookkeeping, put the bookkeeping in upper, and the
- # prefetch into lower.
- #
- # Note, the usage of physical registers per cycle is smoothed off, as
- # much as possible.
- #
- # Note, the ldq's and stq's are at the end of the quadpacks. note, we'd
- # like not to have a ldq or stq to preceded a conditional branch in a
- # quadpack. The conditional branch moves the retire pointer one cycle
- # later.
- #
- # Optimization notes:
- # Callee-saves regs: $9 $10 $11 $12 $13 $14 $15 $26 ?$27?
- # Reserved regs: $29 $30 $31
- # Free caller-saves regs in unrolled code: $24 $25 $28
- # We should swap some of the callee-saves regs for some of the free
- # caller-saves regs, saving some overhead cycles.
- # Most importantly, we should write fast code for the 0-7 case.
- # The code we use there are for the 21164, and runs at 7 cycles/limb
- # on the 21264. Should not be hard, if we write specialized code for
- # 1-7 limbs (the one for 0 limbs should be straightforward). We then just
- # need a jump table indexed by the low 3 bits of the count argument.
-
- .set noreorder
- .set noat
- .text
-
- .globl __mpn_addmul_1
- .ent __mpn_addmul_1
-__mpn_addmul_1:
- .frame $30,0,$26,0
- .prologue 0
-
- cmpult $18, 8, $1
- beq $1, $Large
-
- ldq $2, 0($17) # $2 = s1_limb
- addq $17, 8, $17 # s1_ptr++
- subq $18, 1, $18 # size--
- mulq $2, $19, $3 # $3 = prod_low
- ldq $5, 0($16) # $5 = *res_ptr
- umulh $2, $19, $0 # $0 = prod_high
- beq $18, $Lend0b # jump if size was == 1
- ldq $2, 0($17) # $2 = s1_limb
- addq $17, 8, $17 # s1_ptr++
- subq $18, 1, $18 # size--
- addq $5, $3, $3
- cmpult $3, $5, $4
- stq $3, 0($16)
- addq $16, 8, $16 # res_ptr++
- beq $18, $Lend0a # jump if size was == 2
-
- .align 3
-$Loop0: mulq $2, $19, $3 # $3 = prod_low
- ldq $5, 0($16) # $5 = *res_ptr
- addq $4, $0, $0 # cy_limb = cy_limb + 'cy'
- subq $18, 1, $18 # size--
- umulh $2, $19, $4 # $4 = cy_limb
- ldq $2, 0($17) # $2 = s1_limb
- addq $17, 8, $17 # s1_ptr++
- addq $3, $0, $3 # $3 = cy_limb + prod_low
- cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low)
- addq $5, $3, $3
- cmpult $3, $5, $5
- stq $3, 0($16)
- addq $16, 8, $16 # res_ptr++
- addq $5, $0, $0 # combine carries
- bne $18, $Loop0
-$Lend0a:
- mulq $2, $19, $3 # $3 = prod_low
- ldq $5, 0($16) # $5 = *res_ptr
- addq $4, $0, $0 # cy_limb = cy_limb + 'cy'
- umulh $2, $19, $4 # $4 = cy_limb
- addq $3, $0, $3 # $3 = cy_limb + prod_low
- cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low)
- addq $5, $3, $3
- cmpult $3, $5, $5
- stq $3, 0($16)
- addq $5, $0, $0 # combine carries
- addq $4, $0, $0 # cy_limb = prod_high + cy
- ret $31, ($26), 1
-$Lend0b:
- addq $5, $3, $3
- cmpult $3, $5, $5
- stq $3, 0($16)
- addq $0, $5, $0
- ret $31, ($26), 1
-
-$Large:
- lda $30, -240($30)
- stq $9, 8($30)
- stq $10, 16($30)
- stq $11, 24($30)
- stq $12, 32($30)
- stq $13, 40($30)
- stq $14, 48($30)
- stq $15, 56($30)
-
- and $18, 7, $20 # count for the first loop, 0-7
- srl $18, 3, $18 # count for unrolled loop
- bis $31, $31, $0
- beq $20, $Lunroll
- ldq $2, 0($17) # $2 = s1_limb
- addq $17, 8, $17 # s1_ptr++
- subq $20, 1, $20 # size--
- mulq $2, $19, $3 # $3 = prod_low
- ldq $5, 0($16) # $5 = *res_ptr
- umulh $2, $19, $0 # $0 = prod_high
- beq $20, $Lend1b # jump if size was == 1
- ldq $2, 0($17) # $2 = s1_limb
- addq $17, 8, $17 # s1_ptr++
- subq $20, 1, $20 # size--
- addq $5, $3, $3
- cmpult $3, $5, $4
- stq $3, 0($16)
- addq $16, 8, $16 # res_ptr++
- beq $20, $Lend1a # jump if size was == 2
-
- .align 3
-$Loop1: mulq $2, $19, $3 # $3 = prod_low
- ldq $5, 0($16) # $5 = *res_ptr
- addq $4, $0, $0 # cy_limb = cy_limb + 'cy'
- subq $20, 1, $20 # size--
- umulh $2, $19, $4 # $4 = cy_limb
- ldq $2, 0($17) # $2 = s1_limb
- addq $17, 8, $17 # s1_ptr++
- addq $3, $0, $3 # $3 = cy_limb + prod_low
- cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low)
- addq $5, $3, $3
- cmpult $3, $5, $5
- stq $3, 0($16)
- addq $16, 8, $16 # res_ptr++
- addq $5, $0, $0 # combine carries
- bne $20, $Loop1
-
-$Lend1a:
- mulq $2, $19, $3 # $3 = prod_low
- ldq $5, 0($16) # $5 = *res_ptr
- addq $4, $0, $0 # cy_limb = cy_limb + 'cy'
- umulh $2, $19, $4 # $4 = cy_limb
- addq $3, $0, $3 # $3 = cy_limb + prod_low
- cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low)
- addq $5, $3, $3
- cmpult $3, $5, $5
- stq $3, 0($16)
- addq $16, 8, $16 # res_ptr++
- addq $5, $0, $0 # combine carries
- addq $4, $0, $0 # cy_limb = prod_high + cy
- br $31, $Lunroll
-$Lend1b:
- addq $5, $3, $3
- cmpult $3, $5, $5
- stq $3, 0($16)
- addq $16, 8, $16 # res_ptr++
- addq $0, $5, $0
-
-$Lunroll:
- lda $17, -16($17) # L1 bookkeeping
- lda $16, -16($16) # L1 bookkeeping
- bis $0, $31, $12
-
- # ____ UNROLLED LOOP SOFTWARE PIPELINE STARTUP ____
-
- ldq $2, 16($17) # L1
- ldq $3, 24($17) # L1
- lda $18, -1($18) # L1 bookkeeping
- ldq $6, 16($16) # L1
- ldq $7, 24($16) # L1
- ldq $0, 32($17) # L1
- mulq $19, $2, $13 # U1
- ldq $1, 40($17) # L1
- umulh $19, $2, $14 # U1
- mulq $19, $3, $15 # U1
- lda $17, 64($17) # L1 bookkeeping
- ldq $4, 32($16) # L1
- ldq $5, 40($16) # L1
- umulh $19, $3, $8 # U1
- ldq $2, -16($17) # L1
- mulq $19, $0, $9 # U1
- ldq $3, -8($17) # L1
- umulh $19, $0, $10 # U1
- addq $6, $13, $6 # L0 lo + acc
- mulq $19, $1, $11 # U1
- cmpult $6, $13, $20 # L0 lo add => carry
- lda $16, 64($16) # L1 bookkeeping
- addq $6, $12, $22 # U0 hi add => answer
- cmpult $22, $12, $21 # L0 hi add => carry
- addq $14, $20, $14 # U0 hi mul + carry
- ldq $6, -16($16) # L1
- addq $7, $15, $23 # L0 lo + acc
- addq $14, $21, $14 # U0 hi mul + carry
- ldq $7, -8($16) # L1
- umulh $19, $1, $12 # U1
- cmpult $23, $15, $20 # L0 lo add => carry
- addq $23, $14, $23 # U0 hi add => answer
- ldq $0, 0($17) # L1
- mulq $19, $2, $13 # U1
- cmpult $23, $14, $21 # L0 hi add => carry
- addq $8, $20, $8 # U0 hi mul + carry
- ldq $1, 8($17) # L1
- umulh $19, $2, $14 # U1
- addq $4, $9, $4 # L0 lo + acc
- stq $22, -48($16) # L0
- stq $23, -40($16) # L1
- mulq $19, $3, $15 # U1
- addq $8, $21, $8 # U0 hi mul + carry
- cmpult $4, $9, $20 # L0 lo add => carry
- addq $4, $8, $22 # U0 hi add => answer
- ble $18, $Lend # U1 bookkeeping
-
- # ____ MAIN UNROLLED LOOP ____
- .align 4
-$Loop:
- bis $31, $31, $31 # U1 mt
- cmpult $22, $8, $21 # L0 hi add => carry
- addq $10, $20, $10 # U0 hi mul + carry
- ldq $4, 0($16) # L1
-
- bis $31, $31, $31 # U1 mt
- addq $5, $11, $23 # L0 lo + acc
- addq $10, $21, $10 # L0 hi mul + carry
- ldq $5, 8($16) # L1
-
- umulh $19, $3, $8 # U1
- cmpult $23, $11, $20 # L0 lo add => carry
- addq $23, $10, $23 # U0 hi add => answer
- ldq $2, 16($17) # L1
-
- mulq $19, $0, $9 # U1
- cmpult $23, $10, $21 # L0 hi add => carry
- addq $12, $20, $12 # U0 hi mul + carry
- ldq $3, 24($17) # L1
-
- umulh $19, $0, $10 # U1
- addq $6, $13, $6 # L0 lo + acc
- stq $22, -32($16) # L0
- stq $23, -24($16) # L1
-
- bis $31, $31, $31 # L0 st slosh
- mulq $19, $1, $11 # U1
- bis $31, $31, $31 # L1 st slosh
- addq $12, $21, $12 # U0 hi mul + carry
-
- cmpult $6, $13, $20 # L0 lo add => carry
- bis $31, $31, $31 # U1 mt
- lda $18, -1($18) # L1 bookkeeping
- addq $6, $12, $22 # U0 hi add => answer
-
- bis $31, $31, $31 # U1 mt
- cmpult $22, $12, $21 # L0 hi add => carry
- addq $14, $20, $14 # U0 hi mul + carry
- ldq $6, 16($16) # L1
-
- bis $31, $31, $31 # U1 mt
- addq $7, $15, $23 # L0 lo + acc
- addq $14, $21, $14 # U0 hi mul + carry
- ldq $7, 24($16) # L1
-
- umulh $19, $1, $12 # U1
- cmpult $23, $15, $20 # L0 lo add => carry
- addq $23, $14, $23 # U0 hi add => answer
- ldq $0, 32($17) # L1
-
- mulq $19, $2, $13 # U1
- cmpult $23, $14, $21 # L0 hi add => carry
- addq $8, $20, $8 # U0 hi mul + carry
- ldq $1, 40($17) # L1
-
- umulh $19, $2, $14 # U1
- addq $4, $9, $4 # U0 lo + acc
- stq $22, -16($16) # L0
- stq $23, -8($16) # L1
-
- bis $31, $31, $31 # L0 st slosh
- mulq $19, $3, $15 # U1
- bis $31, $31, $31 # L1 st slosh
- addq $8, $21, $8 # L0 hi mul + carry
-
- cmpult $4, $9, $20 # L0 lo add => carry
- bis $31, $31, $31 # U1 mt
- lda $17, 64($17) # L1 bookkeeping
- addq $4, $8, $22 # U0 hi add => answer
-
- bis $31, $31, $31 # U1 mt
- cmpult $22, $8, $21 # L0 hi add => carry
- addq $10, $20, $10 # U0 hi mul + carry
- ldq $4, 32($16) # L1
-
- bis $31, $31, $31 # U1 mt
- addq $5, $11, $23 # L0 lo + acc
- addq $10, $21, $10 # L0 hi mul + carry
- ldq $5, 40($16) # L1
-
- umulh $19, $3, $8 # U1
- cmpult $23, $11, $20 # L0 lo add => carry
- addq $23, $10, $23 # U0 hi add => answer
- ldq $2, -16($17) # L1
-
- mulq $19, $0, $9 # U1
- cmpult $23, $10, $21 # L0 hi add => carry
- addq $12, $20, $12 # U0 hi mul + carry
- ldq $3, -8($17) # L1
-
- umulh $19, $0, $10 # U1
- addq $6, $13, $6 # L0 lo + acc
- stq $22, 0($16) # L0
- stq $23, 8($16) # L1
-
- bis $31, $31, $31 # L0 st slosh
- mulq $19, $1, $11 # U1
- bis $31, $31, $31 # L1 st slosh
- addq $12, $21, $12 # U0 hi mul + carry
-
- cmpult $6, $13, $20 # L0 lo add => carry
- bis $31, $31, $31 # U1 mt
- lda $16, 64($16) # L1 bookkeeping
- addq $6, $12, $22 # U0 hi add => answer
-
- bis $31, $31, $31 # U1 mt
- cmpult $22, $12, $21 # L0 hi add => carry
- addq $14, $20, $14 # U0 hi mul + carry
- ldq $6, -16($16) # L1
-
- bis $31, $31, $31 # U1 mt
- addq $7, $15, $23 # L0 lo + acc
- addq $14, $21, $14 # U0 hi mul + carry
- ldq $7, -8($16) # L1
-
- umulh $19, $1, $12 # U1
- cmpult $23, $15, $20 # L0 lo add => carry
- addq $23, $14, $23 # U0 hi add => answer
- ldq $0, 0($17) # L1
-
- mulq $19, $2, $13 # U1
- cmpult $23, $14, $21 # L0 hi add => carry
- addq $8, $20, $8 # U0 hi mul + carry
- ldq $1, 8($17) # L1
-
- umulh $19, $2, $14 # U1
- addq $4, $9, $4 # L0 lo + acc
- stq $22, -48($16) # L0
- stq $23, -40($16) # L1
-
- bis $31, $31, $31 # L0 st slosh
- mulq $19, $3, $15 # U1
- bis $31, $31, $31 # L1 st slosh
- addq $8, $21, $8 # U0 hi mul + carry
-
- cmpult $4, $9, $20 # L0 lo add => carry
- addq $4, $8, $22 # U0 hi add => answer
- bis $31, $31, $31 # L1 mt
- bgt $18, $Loop # U1 bookkeeping
-
-# ____ UNROLLED LOOP SOFTWARE PIPELINE FINISH ____
-$Lend:
- cmpult $22, $8, $21 # L0 hi add => carry
- addq $10, $20, $10 # U0 hi mul + carry
- ldq $4, 0($16) # L1
- addq $5, $11, $23 # L0 lo + acc
- addq $10, $21, $10 # L0 hi mul + carry
- ldq $5, 8($16) # L1
- umulh $19, $3, $8 # U1
- cmpult $23, $11, $20 # L0 lo add => carry
- addq $23, $10, $23 # U0 hi add => answer
- mulq $19, $0, $9 # U1
- cmpult $23, $10, $21 # L0 hi add => carry
- addq $12, $20, $12 # U0 hi mul + carry
- umulh $19, $0, $10 # U1
- addq $6, $13, $6 # L0 lo + acc
- stq $22, -32($16) # L0
- stq $23, -24($16) # L1
- mulq $19, $1, $11 # U1
- addq $12, $21, $12 # U0 hi mul + carry
- cmpult $6, $13, $20 # L0 lo add => carry
- addq $6, $12, $22 # U0 hi add => answer
- cmpult $22, $12, $21 # L0 hi add => carry
- addq $14, $20, $14 # U0 hi mul + carry
- addq $7, $15, $23 # L0 lo + acc
- addq $14, $21, $14 # U0 hi mul + carry
- umulh $19, $1, $12 # U1
- cmpult $23, $15, $20 # L0 lo add => carry
- addq $23, $14, $23 # U0 hi add => answer
- cmpult $23, $14, $21 # L0 hi add => carry
- addq $8, $20, $8 # U0 hi mul + carry
- addq $4, $9, $4 # U0 lo + acc
- stq $22, -16($16) # L0
- stq $23, -8($16) # L1
- bis $31, $31, $31 # L0 st slosh
- addq $8, $21, $8 # L0 hi mul + carry
- cmpult $4, $9, $20 # L0 lo add => carry
- addq $4, $8, $22 # U0 hi add => answer
- cmpult $22, $8, $21 # L0 hi add => carry
- addq $10, $20, $10 # U0 hi mul + carry
- addq $5, $11, $23 # L0 lo + acc
- addq $10, $21, $10 # L0 hi mul + carry
- cmpult $23, $11, $20 # L0 lo add => carry
- addq $23, $10, $23 # U0 hi add => answer
- cmpult $23, $10, $21 # L0 hi add => carry
- addq $12, $20, $12 # U0 hi mul + carry
- stq $22, 0($16) # L0
- stq $23, 8($16) # L1
- addq $12, $21, $0 # U0 hi mul + carry
-
- ldq $9, 8($30)
- ldq $10, 16($30)
- ldq $11, 24($30)
- ldq $12, 32($30)
- ldq $13, 40($30)
- ldq $14, 48($30)
- ldq $15, 56($30)
- lda $30, 240($30)
- ret $31, ($26), 1
-
- .end __mpn_addmul_1
Added: fsf/trunk/ports/sysdeps/alpha/lshift.S
==============================================================================
--- fsf/trunk/ports/sysdeps/alpha/lshift.S (added)
+++ fsf/trunk/ports/sysdeps/alpha/lshift.S Mon May 21 00:02:36 2012
@@ -1,0 +1,107 @@
+ # Alpha 21064 __mpn_lshift -- Shift a limb vector left by cnt bits.
+
+ # Copyright (C) 1994, 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
+
+
+ # INPUT PARAMETERS
+ # res_ptr r16
+ # s1_ptr r17
+ # size r18
+ # cnt r19
+
+ # This code runs at 4.8 cycles/limb on the 21064. With infinite unrolling,
+ # it would take 4 cycles/limb. It should be possible to get down to 3
+ # cycles/limb since both ldq and stq can be paired with the other used
+ # instructions. But there are many restrictions in the 21064 pipeline that
+ # make it hard, if not impossible, to get down to 3 cycles/limb:
+
+ # 1. ldq has a 3 cycle delay, srl and sll have a 2 cycle delay.
+ # 2. Only aligned instruction pairs can be paired.
+ # 3. The store buffer or silo might not be able to deal with the bandwidth.
+
+ .set noreorder
+ .set noat
+.text
+ .align 3
+ .globl __mpn_lshift
+ .ent __mpn_lshift
+__mpn_lshift:
+ .frame $30,0,$26,0 # frame size 0; return address register is $26
+
+ s8addq $18,$17,$17 # r17 = s1_ptr + 8*size, one limb past the top of s1
+ ldq $4,-8($17) # $4 = most significant limb (limbs are walked from the top down)
+ subq $17,8,$17 # r17 now points at the limb just loaded
+ subq $31,$19,$7 # $7 = -cnt; shifts use only the low 6 count bits, so it acts as 64-cnt
+ s8addq $18,$16,$16 # r16 = res_ptr + 8*size, one limb past the top of RES
+ subq $18,1,$18 # $18 = size-1 = result limbs built from two source limbs
+ and $18,4-1,$20 # $20 = (size-1) mod 4 = limbs handled by the one-at-a-time loop
+ srl $4,$7,$0 # function result in $0: bits shifted out of the top limb (cnt in 1..63)
+
+ beq $20,.L0 # no leftover limbs: skip the one-at-a-time loop
+ subq $18,$20,$18 # $18 = multiple of 4 left for the unrolled loop
+
+ .align 3
+.Loop0:
+ ldq $3,-8($17) # $3 = next lower source limb
+ subq $16,8,$16 # res_ptr down one limb
+ subq $17,8,$17 # s1_ptr down one limb
+ subq $20,1,$20 # one fewer leftover limb
+ sll $4,$19,$5 # $5 = current limb << cnt
+ srl $3,$7,$6 # $6 = top cnt bits of the lower limb, moved to the bottom
+ bis $3,$3,$4 # the lower limb becomes the current limb
+ bis $5,$6,$8 # merge the two parts into one result limb
+ stq $8,0($16) # store the result limb
+ bne $20,.Loop0
+
+.L0: beq $18,.Lend # no groups of 4 left: only the bottom limb remains
+
+ .align 3
+.Loop: ldq $3,-8($17) # 4 limbs per pass; $3 and $4 alternate as current and next limb
+ subq $16,32,$16 # res_ptr down four limbs
+ subq $18,4,$18 # four fewer limbs to go
+ sll $4,$19,$5 # high part of result limb 3 of this group
+ srl $3,$7,$6 # low part of result limb 3 of this group
+
+ ldq $4,-16($17)
+ sll $3,$19,$1 # high part of result limb 2
+ bis $5,$6,$8
+ stq $8,24($16) # store result limb 3 (highest of the group)
+ srl $4,$7,$2 # low part of result limb 2
+
+ ldq $3,-24($17)
+ sll $4,$19,$5 # high part of result limb 1
+ bis $1,$2,$8
+ stq $8,16($16) # store result limb 2
+ srl $3,$7,$6 # low part of result limb 1
+
+ ldq $4,-32($17) # after this pass $4 again holds the current limb
+ sll $3,$19,$1 # high part of result limb 0
+ bis $5,$6,$8
+ stq $8,8($16) # store result limb 1
+ srl $4,$7,$2 # low part of result limb 0
+
+ subq $17,32,$17 # s1_ptr down four limbs
+ bis $1,$2,$8
+ stq $8,0($16) # store result limb 0 (lowest of the group)
+
+ bgt $18,.Loop
+
+.Lend: sll $4,$19,$8 # bottom limb: the vacated low bits are zero
+ stq $8,-8($16) # store the least significant result limb
+ ret $31,($26),1
+ .end __mpn_lshift
Removed: fsf/trunk/ports/sysdeps/alpha/lshift.s
==============================================================================
--- fsf/trunk/ports/sysdeps/alpha/lshift.s (original)
+++ fsf/trunk/ports/sysdeps/alpha/lshift.s (removed)
@@ -1,107 +1,0 @@
- # Alpha 21064 __mpn_lshift --
-
- # Copyright (C) 1994, 1995 Free Software Foundation, Inc.
-
- # This file is part of the GNU MP Library.
-
- # The GNU MP Library is free software; you can redistribute it and/or modify
- # it under the terms of the GNU Lesser General Public License as published by
- # the Free Software Foundation; either version 2.1 of the License, or (at your
- # option) any later version.
-
- # The GNU MP Library is distributed in the hope that it will be useful, but
- # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
- # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
- # License for more details.
-
- # You should have received a copy of the GNU Lesser General Public License
- # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
-
-
- # INPUT PARAMETERS
- # res_ptr r16
- # s1_ptr r17
- # size r18
- # cnt r19
-
- # This code runs at 4.8 cycles/limb on the 21064. With infinite unrolling,
- # it would take 4 cycles/limb. It should be possible to get down to 3
- # cycles/limb since both ldq and stq can be paired with the other used
- # instructions. But there are many restrictions in the 21064 pipeline that
- # makes it hard, if not impossible, to get down to 3 cycles/limb:
-
- # 1. ldq has a 3 cycle delay, srl and sll have a 2 cycle delay.
- # 2. Only aligned instruction pairs can be paired.
- # 3. The store buffer or silo might not be able to deal with the bandwidth.
-
- .set noreorder
- .set noat
-.text
- .align 3
- .globl __mpn_lshift
- .ent __mpn_lshift
-__mpn_lshift:
- .frame $30,0,$26,0
-
- s8addq $18,$17,$17 # make r17 point at end of s1
- ldq $4,-8($17) # load first limb
- subq $17,8,$17
- subq $31,$19,$7
- s8addq $18,$16,$16 # make r16 point at end of RES
- subq $18,1,$18
- and $18,4-1,$20 # number of limbs in first loop
- srl $4,$7,$0 # compute function result
-
- beq $20,.L0
- subq $18,$20,$18
-
- .align 3
-.Loop0:
- ldq $3,-8($17)
- subq $16,8,$16
- subq $17,8,$17
- subq $20,1,$20
- sll $4,$19,$5
- srl $3,$7,$6
- bis $3,$3,$4
- bis $5,$6,$8
- stq $8,0($16)
- bne $20,.Loop0
-
-.L0: beq $18,.Lend
-
- .align 3
-.Loop: ldq $3,-8($17)
- subq $16,32,$16
- subq $18,4,$18
- sll $4,$19,$5
- srl $3,$7,$6
-
- ldq $4,-16($17)
- sll $3,$19,$1
- bis $5,$6,$8
- stq $8,24($16)
- srl $4,$7,$2
-
- ldq $3,-24($17)
- sll $4,$19,$5
- bis $1,$2,$8
- stq $8,16($16)
- srl $3,$7,$6
-
- ldq $4,-32($17)
- sll $3,$19,$1
- bis $5,$6,$8
- stq $8,8($16)
- srl $4,$7,$2
-
- subq $17,32,$17
- bis $1,$2,$8
- stq $8,0($16)
-
- bgt $18,.Loop
-
-.Lend: sll $4,$19,$8
- stq $8,-8($16)
- ret $31,($26),1
- .end __mpn_lshift
Added: fsf/trunk/ports/sysdeps/alpha/mul_1.S
==============================================================================
--- fsf/trunk/ports/sysdeps/alpha/mul_1.S (added)
+++ fsf/trunk/ports/sysdeps/alpha/mul_1.S Mon May 21 00:02:36 2012
@@ -1,0 +1,83 @@
+ # Alpha 21064 __mpn_mul_1 -- Multiply a limb vector with a limb and store
+ # the result in a second limb vector.
+
+ # Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
+
+
+ # INPUT PARAMETERS
+ # res_ptr r16
+ # s1_ptr r17
+ # size r18
+ # s2_limb r19
+
+ # This code runs at 42 cycles/limb on the EV4 and 18 cycles/limb on the EV5.
+
+ # To improve performance for long multiplications, we would use
+ # 'fetch' for S1 and 'fetch_m' for RES. It's not obvious how to use
+ # these instructions without slowing down the general code: 1. We can
+ # only have two prefetches in operation at any time in the Alpha
+ # architecture. 2. There will seldom be any special alignment
+ # between RES_PTR and S1_PTR. Maybe we can simply divide the current
+ # loop into an inner and outer loop, having the inner loop handle
+ # exactly one prefetch block?
+
+ .set noreorder
+ .set noat
+.text
+ .align 3
+ .globl __mpn_mul_1
+ .ent __mpn_mul_1 2
+__mpn_mul_1:
+ .frame $30,0,$26
+
+ ldq $2,0($17) # $2 = s1_limb (first limb)
+ subq $18,1,$18 # size--
+ mulq $2,$19,$3 # $3 = prod_low
+ bic $31,$31,$4 # clear cy_limb ($4 = 0)
+ umulh $2,$19,$0 # $0 = prod_high
+ beq $18,Lend1 # jump if size was == 1
+ ldq $2,8($17) # $2 = s1_limb (second limb)
+ subq $18,1,$18 # size--
+ stq $3,0($16) # first result limb = prod_low, there is no carry in yet
+ beq $18,Lend2 # jump if size was == 2
+
+ .align 3 # NOTE(review): Loop, Lend1 and Lend2 lack a local-label prefix (.L or $); presumably they end up in the symbol table -- confirm
+Loop: mulq $2,$19,$3 # $3 = prod_low
+ addq $4,$0,$0 # cy_limb = cy_limb + 'cy' ($0 = previous prod_high plus carry bit)
+ subq $18,1,$18 # size--
+ umulh $2,$19,$4 # $4 = cy_limb (prod_high of the limb being processed)
+ ldq $2,16($17) # $2 = next s1_limb; s1_ptr lags two limbs behind this load
+ addq $17,8,$17 # s1_ptr++
+ addq $3,$0,$3 # $3 = cy_limb + prod_low
+ stq $3,8($16) # store result limb; res_ptr is bumped only at the end of the pass
+ cmpult $3,$0,$0 # $0 = carry from (cy_limb + prod_low)
+ addq $16,8,$16 # res_ptr++
+ bne $18,Loop
+
+Lend2: mulq $2,$19,$3 # $3 = prod_low (last limb, already loaded in $2)
+ addq $4,$0,$0 # cy_limb = cy_limb + 'cy'
+ umulh $2,$19,$4 # $4 = cy_limb
+ addq $3,$0,$3 # $3 = cy_limb + prod_low
+ cmpult $3,$0,$0 # $0 = carry from (cy_limb + prod_low)
+ stq $3,8($16) # store the last result limb
+ addq $4,$0,$0 # cy_limb = prod_high + cy, left in $0 at return
+ ret $31,($26),1
+Lend1: stq $3,0($16) # size == 1: store prod_low; $0 still holds prod_high
+ ret $31,($26),1
+
+ .end __mpn_mul_1
Removed: fsf/trunk/ports/sysdeps/alpha/mul_1.s
==============================================================================
--- fsf/trunk/ports/sysdeps/alpha/mul_1.s (original)
+++ fsf/trunk/ports/sysdeps/alpha/mul_1.s (removed)
@@ -1,83 +1,0 @@
- # Alpha 21064 __mpn_mul_1 -- Multiply a limb vector with a limb and store
- # the result in a second limb vector.
-
- # Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.
-
- # This file is part of the GNU MP Library.
-
- # The GNU MP Library is free software; you can redistribute it and/or modify
- # it under the terms of the GNU Lesser General Public License as published by
- # the Free Software Foundation; either version 2.1 of the License, or (at your
- # option) any later version.
-
- # The GNU MP Library is distributed in the hope that it will be useful, but
- # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
- # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
- # License for more details.
-
- # You should have received a copy of the GNU Lesser General Public License
- # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
-
-
- # INPUT PARAMETERS
- # res_ptr r16
- # s1_ptr r17
- # size r18
- # s2_limb r19
-
- # This code runs at 42 cycles/limb on the EV4 and 18 cycles/limb on the EV5.
-
- # To improve performance for long multiplications, we would use
- # 'fetch' for S1 and 'fetch_m' for RES. It's not obvious how to use
- # these instructions without slowing down the general code: 1. We can
- # only have two prefetches in operation at any time in the Alpha
- # architecture. 2. There will seldom be any special alignment
- # between RES_PTR and S1_PTR. Maybe we can simply divide the current
- # loop into an inner and outer loop, having the inner loop handle
- # exactly one prefetch block?
-
- .set noreorder
- .set noat
-.text
- .align 3
- .globl __mpn_mul_1
- .ent __mpn_mul_1 2
-__mpn_mul_1:
- .frame $30,0,$26
-
- ldq $2,0($17) # $2 = s1_limb
- subq $18,1,$18 # size--
- mulq $2,$19,$3 # $3 = prod_low
- bic $31,$31,$4 # clear cy_limb
- umulh $2,$19,$0 # $0 = prod_high
- beq $18,Lend1 # jump if size was == 1
- ldq $2,8($17) # $2 = s1_limb
- subq $18,1,$18 # size--
- stq $3,0($16)
- beq $18,Lend2 # jump if size was == 2
-
- .align 3
-Loop: mulq $2,$19,$3 # $3 = prod_low
- addq $4,$0,$0 # cy_limb = cy_limb + 'cy'
- subq $18,1,$18 # size--
- umulh $2,$19,$4 # $4 = cy_limb
- ldq $2,16($17) # $2 = s1_limb
- addq $17,8,$17 # s1_ptr++
- addq $3,$0,$3 # $3 = cy_limb + prod_low
- stq $3,8($16)
- cmpult $3,$0,$0 # $0 = carry from (cy_limb + prod_low)
- addq $16,8,$16 # res_ptr++
- bne $18,Loop
-
-Lend2: mulq $2,$19,$3 # $3 = prod_low
- addq $4,$0,$0 # cy_limb = cy_limb + 'cy'
- umulh $2,$19,$4 # $4 = cy_limb
- addq $3,$0,$3 # $3 = cy_limb + prod_low
- cmpult $3,$0,$0 # $0 = carry from (cy_limb + prod_low)
- stq $3,8($16)
- addq $4,$0,$0 # cy_limb = prod_high + cy
- ret $31,($26),1
-Lend1: stq $3,0($16)
- ret $31,($26),1
-
- .end __mpn_mul_1
Added: fsf/trunk/ports/sysdeps/alpha/rshift.S
==============================================================================
--- fsf/trunk/ports/sysdeps/alpha/rshift.S (added)
+++ fsf/trunk/ports/sysdeps/alpha/rshift.S Mon May 21 00:02:36 2012
@@ -1,0 +1,105 @@
+ # Alpha 21064 __mpn_rshift -- Shift a limb vector right by cnt bits.
+
+ # Copyright (C) 1994, 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
+
+
+ # INPUT PARAMETERS
+ # res_ptr r16
+ # s1_ptr r17
+ # size r18
+ # cnt r19
+
+ # This code runs at 4.8 cycles/limb on the 21064. With infinite unrolling,
+ # it would take 4 cycles/limb. It should be possible to get down to 3
+ # cycles/limb since both ldq and stq can be paired with the other used
+ # instructions. But there are many restrictions in the 21064 pipeline that
+ # make it hard, if not impossible, to get down to 3 cycles/limb:
+
+ # 1. ldq has a 3 cycle delay, srl and sll have a 2 cycle delay.
+ # 2. Only aligned instruction pairs can be paired.
+ # 3. The store buffer or silo might not be able to deal with the bandwidth.
+
+ .set noreorder
+ .set noat
+.text
+ .align 3
+ .globl __mpn_rshift
+ .ent __mpn_rshift
+__mpn_rshift:
+ .frame $30,0,$26,0 # frame size 0; return address register is $26
+
+ ldq $4,0($17) # $4 = least significant limb (limbs are walked from the bottom up)
+ addq $17,8,$17 # s1_ptr up one limb
+ subq $31,$19,$7 # $7 = -cnt; shifts use only the low 6 count bits, so it acts as 64-cnt
+ subq $18,1,$18 # $18 = size-1 = result limbs built from two source limbs
+ and $18,4-1,$20 # $20 = (size-1) mod 4 = limbs handled by the one-at-a-time loop
+ sll $4,$7,$0 # function result in $0: bits shifted out of the bottom limb, moved to the top (cnt in 1..63)
+
+ beq $20,.L0 # no leftover limbs: skip the one-at-a-time loop
+ subq $18,$20,$18 # $18 = multiple of 4 left for the unrolled loop
+
+ .align 3
+.Loop0:
+ ldq $3,0($17) # $3 = next higher source limb
+ addq $16,8,$16 # res_ptr up one limb (the store below uses offset -8)
+ addq $17,8,$17 # s1_ptr up one limb
+ subq $20,1,$20 # one fewer leftover limb
+ srl $4,$19,$5 # $5 = current limb >> cnt
+ sll $3,$7,$6 # $6 = low cnt bits of the higher limb, moved to the top
+ bis $3,$3,$4 # the higher limb becomes the current limb
+ bis $5,$6,$8 # merge the two parts into one result limb
+ stq $8,-8($16) # store the result limb
+ bne $20,.Loop0
+
+.L0: beq $18,.Lend # no groups of 4 left: only the top limb remains
+
+ .align 3
+.Loop: ldq $3,0($17) # 4 limbs per pass; $3 and $4 alternate as current and next limb
+ addq $16,32,$16 # res_ptr up four limbs (the stores below use negative offsets)
+ subq $18,4,$18 # four fewer limbs to go
+ srl $4,$19,$5 # low part of result limb 0 of this group
+ sll $3,$7,$6 # high part of result limb 0 of this group
+
+ ldq $4,8($17)
+ srl $3,$19,$1 # low part of result limb 1
+ bis $5,$6,$8
+ stq $8,-32($16) # store result limb 0 (lowest of the group)
+ sll $4,$7,$2 # high part of result limb 1
+
+ ldq $3,16($17)
+ srl $4,$19,$5 # low part of result limb 2
+ bis $1,$2,$8
+ stq $8,-24($16) # store result limb 1
+ sll $3,$7,$6 # high part of result limb 2
+
+ ldq $4,24($17) # after this pass $4 again holds the current limb
+ srl $3,$19,$1 # low part of result limb 3
+ bis $5,$6,$8
+ stq $8,-16($16) # store result limb 2
+ sll $4,$7,$2 # high part of result limb 3
+
+ addq $17,32,$17 # s1_ptr up four limbs
+ bis $1,$2,$8
+ stq $8,-8($16) # store result limb 3 (highest of the group)
+
+ bgt $18,.Loop
+
+.Lend: srl $4,$19,$8 # top limb: the vacated high bits are zero
+ stq $8,0($16) # store the most significant result limb
+ ret $31,($26),1
+ .end __mpn_rshift
Removed: fsf/trunk/ports/sysdeps/alpha/rshift.s
==============================================================================
--- fsf/trunk/ports/sysdeps/alpha/rshift.s (original)
+++ fsf/trunk/ports/sysdeps/alpha/rshift.s (removed)
@@ -1,105 +1,0 @@
- # Alpha 21064 __mpn_rshift --
-
- # Copyright (C) 1994, 1995 Free Software Foundation, Inc.
-
- # This file is part of the GNU MP Library.
-
- # The GNU MP Library is free software; you can redistribute it and/or modify
- # it under the terms of the GNU Lesser General Public License as published by
- # the Free Software Foundation; either version 2.1 of the License, or (at your
- # option) any later version.
-
- # The GNU MP Library is distributed in the hope that it will be useful, but
- # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
- # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
- # License for more details.
-
- # You should have received a copy of the GNU Lesser General Public License
- # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
-
-
- # INPUT PARAMETERS
- # res_ptr r16
- # s1_ptr r17
- # size r18
- # cnt r19
-
- # This code runs at 4.8 cycles/limb on the 21064. With infinite unrolling,
- # it would take 4 cycles/limb. It should be possible to get down to 3
- # cycles/limb since both ldq and stq can be paired with the other used
- # instructions. But there are many restrictions in the 21064 pipeline that
- # makes it hard, if not impossible, to get down to 3 cycles/limb:
-
- # 1. ldq has a 3 cycle delay, srl and sll have a 2 cycle delay.
- # 2. Only aligned instruction pairs can be paired.
- # 3. The store buffer or silo might not be able to deal with the bandwidth.
-
- .set noreorder
- .set noat
-.text
- .align 3
- .globl __mpn_rshift
- .ent __mpn_rshift
-__mpn_rshift:
- .frame $30,0,$26,0
-
- ldq $4,0($17) # load first limb
- addq $17,8,$17
- subq $31,$19,$7
- subq $18,1,$18
- and $18,4-1,$20 # number of limbs in first loop
- sll $4,$7,$0 # compute function result
-
- beq $20,.L0
- subq $18,$20,$18
-
- .align 3
-.Loop0:
- ldq $3,0($17)
- addq $16,8,$16
- addq $17,8,$17
- subq $20,1,$20
- srl $4,$19,$5
- sll $3,$7,$6
- bis $3,$3,$4
- bis $5,$6,$8
- stq $8,-8($16)
- bne $20,.Loop0
-
-.L0: beq $18,.Lend
-
- .align 3
-.Loop: ldq $3,0($17)
- addq $16,32,$16
- subq $18,4,$18
- srl $4,$19,$5
- sll $3,$7,$6
-
- ldq $4,8($17)
- srl $3,$19,$1
- bis $5,$6,$8
- stq $8,-32($16)
- sll $4,$7,$2
-
- ldq $3,16($17)
- srl $4,$19,$5
- bis $1,$2,$8
- stq $8,-24($16)
- sll $3,$7,$6
-
- ldq $4,24($17)
- srl $3,$19,$1
- bis $5,$6,$8
- stq $8,-16($16)
- sll $4,$7,$2
-
- addq $17,32,$17
- bis $1,$2,$8
- stq $8,-8($16)
-
- bgt $18,.Loop
-
-.Lend: srl $4,$19,$8
- stq $8,0($16)
- ret $31,($26),1
- .end __mpn_rshift
Added: fsf/trunk/ports/sysdeps/alpha/sub_n.S
==============================================================================
--- fsf/trunk/ports/sysdeps/alpha/sub_n.S (added)
+++ fsf/trunk/ports/sysdeps/alpha/sub_n.S Mon May 21 00:02:36 2012
@@ -1,0 +1,118 @@
+ # Alpha __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+ # store difference in a third limb vector.
+
+ # Copyright (C) 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
+
+
+ # INPUT PARAMETERS
+ # res_ptr $16
+ # s1_ptr $17
+ # s2_ptr $18
+ # size $19
+
+ .set noreorder # keep instructions in exactly the order written below
+ .set noat # assembler must not use the $at scratch register on its own
+.text
+ .align 3 # 2^3 = 8-byte alignment
+ .globl __mpn_sub_n
+ .ent __mpn_sub_n
+__mpn_sub_n: # res_ptr[i] = s1_ptr[i] - s2_ptr[i] - borrow, for size limbs
+ .frame $30,0,$26,0 # frame base $30, frame size 0, return address in $26
+
+ ldq $3,0($17) # $3 = first s1 limb
+ ldq $4,0($18) # $4 = first s2 limb
+
+ subq $19,1,$19 # $19 = size-1; the last limb is always done at .Lend
+ and $19,4-1,$2 # $2 = (size-1) mod 4 = number of limbs in first loop
+ bis $31,$31,$0 # $0 = running borrow, starts at 0 ($31 reads as zero)
+ beq $2,.L0 # if size-1 is a multiple of 4 limbs, skip first loop
+
+ subq $19,$2,$19 # $19 = limbs left for .Loop, now a multiple of 4
+
+.Loop0: subq $2,1,$2 # one limb per pass; $2 counts passes left
+ ldq $5,8($17) # load the next s1 limb early
+ addq $4,$0,$4 # s2 limb + borrow in
+ ldq $6,8($18) # load the next s2 limb early
+ cmpult $4,$0,$1 # $1 = 1 if that addition wrapped around
+ subq $3,$4,$4 # difference limb
+ cmpult $3,$4,$0 # $0 = 1 if the subtraction wrapped (result > s1 limb)
+ stq $4,0($16) # store difference limb
+ or $0,$1,$0 # borrow out = either wrap
+
+ addq $17,8,$17 # advance s1_ptr by one limb
+ addq $18,8,$18 # advance s2_ptr by one limb
+ bis $5,$5,$3 # move the preloaded limbs into $3/$4 for the next pass
+ bis $6,$6,$4
+ addq $16,8,$16 # advance res_ptr by one limb
+ bne $2,.Loop0
+
+.L0: beq $19,.Lend # no group of 4 left: only the final limb remains
+
+ .align 3
+.Loop: subq $19,4,$19 # four limbs per pass; operands alternate $3/$4 and $5/$6
+
+ ldq $5,8($17) # limb 0 of the group uses $3/$4; preload limb 1 into $5/$6
+ addq $4,$0,$4 # s2 limb + borrow in
+ ldq $6,8($18)
+ cmpult $4,$0,$1 # $1 = 1 if the addition wrapped
+ subq $3,$4,$4 # difference limb
+ cmpult $3,$4,$0 # $0 = 1 if the subtraction wrapped
+ stq $4,0($16)
+ or $0,$1,$0 # borrow out
+
+ ldq $3,16($17) # limb 1 uses $5/$6; preload limb 2 into $3/$4
+ addq $6,$0,$6
+ ldq $4,16($18)
+ cmpult $6,$0,$1
+ subq $5,$6,$6
+ cmpult $5,$6,$0
+ stq $6,8($16)
+ or $0,$1,$0
+
+ ldq $5,24($17) # limb 2 uses $3/$4; preload limb 3 into $5/$6
+ addq $4,$0,$4
+ ldq $6,24($18)
+ cmpult $4,$0,$1
+ subq $3,$4,$4
+ cmpult $3,$4,$0
+ stq $4,16($16)
+ or $0,$1,$0
+
+ ldq $3,32($17) # limb 3 uses $5/$6; preload the limb for the next pass or .Lend
+ addq $6,$0,$6
+ ldq $4,32($18)
+ cmpult $6,$0,$1
+ subq $5,$6,$6
+ cmpult $5,$6,$0
+ stq $6,24($16)
+ or $0,$1,$0
+
+ addq $17,32,$17 # advance all three pointers by four limbs
+ addq $18,32,$18
+ addq $16,32,$16
+ bne $19,.Loop
+
+.Lend: addq $4,$0,$4 # final limb, already loaded in $3/$4: s2 limb + borrow in
+ cmpult $4,$0,$1 # $1 = 1 if the addition wrapped
+ subq $3,$4,$4 # difference limb
+ cmpult $3,$4,$0 # $0 = 1 if the subtraction wrapped
+ stq $4,0($16)
+ or $0,$1,$0 # final borrow (0 or 1) is left in $0, the integer return register
+ ret $31,($26),1 # return to the address held in $26
+
+ .end __mpn_sub_n
Removed: fsf/trunk/ports/sysdeps/alpha/sub_n.s
==============================================================================
--- fsf/trunk/ports/sysdeps/alpha/sub_n.s (original)
+++ fsf/trunk/ports/sysdeps/alpha/sub_n.s (removed)
@@ -1,118 +1,0 @@
- # Alpha __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
- # store difference in a third limb vector.
-
- # Copyright (C) 1995 Free Software Foundation, Inc.
-
- # This file is part of the GNU MP Library.
-
- # The GNU MP Library is free software; you can redistribute it and/or modify
- # it under the terms of the GNU Lesser General Public License as published by
- # the Free Software Foundation; either version 2.1 of the License, or (at your
- # option) any later version.
-
- # The GNU MP Library is distributed in the hope that it will be useful, but
- # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
- # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
- # License for more details.
-
- # You should have received a copy of the GNU Lesser General Public License
- # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
-
-
- # INPUT PARAMETERS
- # res_ptr $16
- # s1_ptr $17
- # s2_ptr $18
- # size $19
-
- .set noreorder
- .set noat
-.text
- .align 3
- .globl __mpn_sub_n
- .ent __mpn_sub_n
-__mpn_sub_n:
- .frame $30,0,$26,0
-
- ldq $3,0($17)
- ldq $4,0($18)
-
- subq $19,1,$19
- and $19,4-1,$2 # number of limbs in first loop
- bis $31,$31,$0
- beq $2,.L0 # if multiple of 4 limbs, skip first loop
-
- subq $19,$2,$19
-
-.Loop0: subq $2,1,$2
- ldq $5,8($17)
- addq $4,$0,$4
- ldq $6,8($18)
- cmpult $4,$0,$1
- subq $3,$4,$4
- cmpult $3,$4,$0
- stq $4,0($16)
- or $0,$1,$0
-
- addq $17,8,$17
- addq $18,8,$18
- bis $5,$5,$3
- bis $6,$6,$4
- addq $16,8,$16
- bne $2,.Loop0
-
-.L0: beq $19,.Lend
-
- .align 3
-.Loop: subq $19,4,$19
-
- ldq $5,8($17)
- addq $4,$0,$4
- ldq $6,8($18)
- cmpult $4,$0,$1
- subq $3,$4,$4
- cmpult $3,$4,$0
- stq $4,0($16)
- or $0,$1,$0
-
- ldq $3,16($17)
[... 233 lines stripped ...]
_______________________________________________
Commits mailing list
Commits@xxxxxxxxxx
http://eglibc.org/cgi-bin/mailman/listinfo/commits