From 554fd8c5195424bdbcabf5de30fdc183aba391bd Mon Sep 17 00:00:00 2001 From: upstream source tree Date: Sun, 15 Mar 2015 20:14:05 -0400 Subject: obtained gcc-4.6.4.tar.bz2 from upstream website; verified gcc-4.6.4.tar.bz2.sig; imported gcc-4.6.4 source tree from verified upstream tarball. downloading a git-generated archive based on the 'upstream' tag should provide you with a source tree that is binary identical to the one extracted from the above tarball. if you have obtained the source via the command 'git clone', however, do note that line-endings of files in your working directory might differ from line-endings of the respective files in the upstream repository. --- gcc/config/ia64/lib1funcs.asm | 795 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 795 insertions(+) create mode 100644 gcc/config/ia64/lib1funcs.asm (limited to 'gcc/config/ia64/lib1funcs.asm') diff --git a/gcc/config/ia64/lib1funcs.asm b/gcc/config/ia64/lib1funcs.asm new file mode 100644 index 000000000..b7eaa6eca --- /dev/null +++ b/gcc/config/ia64/lib1funcs.asm @@ -0,0 +1,795 @@ +/* Copyright (C) 2000, 2001, 2003, 2005, 2009 Free Software Foundation, Inc. + Contributed by James E. Wilson . + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +#ifdef L__divxf3 +// Compute a 80-bit IEEE double-extended quotient. +// +// From the Intel IA-64 Optimization Guide, choose the minimum latency +// alternative. +// +// farg0 holds the dividend. farg1 holds the divisor. +// +// __divtf3 is an alternate symbol name for backward compatibility. + + .text + .align 16 + .global __divxf3 + .proc __divxf3 +__divxf3: +#ifdef SHARED + .global __divtf3 +__divtf3: +#endif + cmp.eq p7, p0 = r0, r0 + frcpa.s0 f10, p6 = farg0, farg1 + ;; +(p6) cmp.ne p7, p0 = r0, r0 + .pred.rel.mutex p6, p7 +(p6) fnma.s1 f11 = farg1, f10, f1 +(p6) fma.s1 f12 = farg0, f10, f0 + ;; +(p6) fma.s1 f13 = f11, f11, f0 +(p6) fma.s1 f14 = f11, f11, f11 + ;; +(p6) fma.s1 f11 = f13, f13, f11 +(p6) fma.s1 f13 = f14, f10, f10 + ;; +(p6) fma.s1 f10 = f13, f11, f10 +(p6) fnma.s1 f11 = farg1, f12, farg0 + ;; +(p6) fma.s1 f11 = f11, f10, f12 +(p6) fnma.s1 f12 = farg1, f10, f1 + ;; +(p6) fma.s1 f10 = f12, f10, f10 +(p6) fnma.s1 f12 = farg1, f11, farg0 + ;; +(p6) fma.s0 fret0 = f12, f10, f11 +(p7) mov fret0 = f10 + br.ret.sptk rp + .endp __divxf3 +#endif + +#ifdef L__divdf3 +// Compute a 64-bit IEEE double quotient. +// +// From the Intel IA-64 Optimization Guide, choose the minimum latency +// alternative. +// +// farg0 holds the dividend. farg1 holds the divisor. 
+ + .text + .align 16 + .global __divdf3 + .proc __divdf3 +__divdf3: + cmp.eq p7, p0 = r0, r0 + frcpa.s0 f10, p6 = farg0, farg1 + ;; +(p6) cmp.ne p7, p0 = r0, r0 + .pred.rel.mutex p6, p7 +(p6) fmpy.s1 f11 = farg0, f10 +(p6) fnma.s1 f12 = farg1, f10, f1 + ;; +(p6) fma.s1 f11 = f12, f11, f11 +(p6) fmpy.s1 f13 = f12, f12 + ;; +(p6) fma.s1 f10 = f12, f10, f10 +(p6) fma.s1 f11 = f13, f11, f11 + ;; +(p6) fmpy.s1 f12 = f13, f13 +(p6) fma.s1 f10 = f13, f10, f10 + ;; +(p6) fma.d.s1 f11 = f12, f11, f11 +(p6) fma.s1 f10 = f12, f10, f10 + ;; +(p6) fnma.d.s1 f8 = farg1, f11, farg0 + ;; +(p6) fma.d fret0 = f8, f10, f11 +(p7) mov fret0 = f10 + br.ret.sptk rp + ;; + .endp __divdf3 +#endif + +#ifdef L__divsf3 +// Compute a 32-bit IEEE float quotient. +// +// From the Intel IA-64 Optimization Guide, choose the minimum latency +// alternative. +// +// farg0 holds the dividend. farg1 holds the divisor. + + .text + .align 16 + .global __divsf3 + .proc __divsf3 +__divsf3: + cmp.eq p7, p0 = r0, r0 + frcpa.s0 f10, p6 = farg0, farg1 + ;; +(p6) cmp.ne p7, p0 = r0, r0 + .pred.rel.mutex p6, p7 +(p6) fmpy.s1 f8 = farg0, f10 +(p6) fnma.s1 f9 = farg1, f10, f1 + ;; +(p6) fma.s1 f8 = f9, f8, f8 +(p6) fmpy.s1 f9 = f9, f9 + ;; +(p6) fma.s1 f8 = f9, f8, f8 +(p6) fmpy.s1 f9 = f9, f9 + ;; +(p6) fma.d.s1 f10 = f9, f8, f8 + ;; +(p6) fnorm.s.s0 fret0 = f10 +(p7) mov fret0 = f10 + br.ret.sptk rp + ;; + .endp __divsf3 +#endif + +#ifdef L__divdi3 +// Compute a 64-bit integer quotient. +// +// From the Intel IA-64 Optimization Guide, choose the minimum latency +// alternative. +// +// in0 holds the dividend. in1 holds the divisor. + + .text + .align 16 + .global __divdi3 + .proc __divdi3 +__divdi3: + .regstk 2,0,0,0 + // Transfer inputs to FP registers. + setf.sig f8 = in0 + setf.sig f9 = in1 + // Check divide by zero. + cmp.ne.unc p0,p7=0,in1 + ;; + // Convert the inputs to FP, so that they won't be treated as unsigned. + fcvt.xf f8 = f8 + fcvt.xf f9 = f9 +(p7) break 1 + ;; + // Compute the reciprocal approximation. + frcpa.s1 f10, p6 = f8, f9 + ;; + // 3 Newton-Raphson iterations. +(p6) fnma.s1 f11 = f9, f10, f1 +(p6) fmpy.s1 f12 = f8, f10 + ;; +(p6) fmpy.s1 f13 = f11, f11 +(p6) fma.s1 f12 = f11, f12, f12 + ;; +(p6) fma.s1 f10 = f11, f10, f10 +(p6) fma.s1 f11 = f13, f12, f12 + ;; +(p6) fma.s1 f10 = f13, f10, f10 +(p6) fnma.s1 f12 = f9, f11, f8 + ;; +(p6) fma.s1 f10 = f12, f10, f11 + ;; + // Round quotient to an integer. + fcvt.fx.trunc.s1 f10 = f10 + ;; + // Transfer result to GP registers. + getf.sig ret0 = f10 + br.ret.sptk rp + ;; + .endp __divdi3 +#endif + +#ifdef L__moddi3 +// Compute a 64-bit integer modulus. +// +// From the Intel IA-64 Optimization Guide, choose the minimum latency +// alternative. +// +// in0 holds the dividend (a). in1 holds the divisor (b). + + .text + .align 16 + .global __moddi3 + .proc __moddi3 +__moddi3: + .regstk 2,0,0,0 + // Transfer inputs to FP registers. + setf.sig f14 = in0 + setf.sig f9 = in1 + // Check divide by zero. + cmp.ne.unc p0,p7=0,in1 + ;; + // Convert the inputs to FP, so that they won't be treated as unsigned. + fcvt.xf f8 = f14 + fcvt.xf f9 = f9 +(p7) break 1 + ;; + // Compute the reciprocal approximation. + frcpa.s1 f10, p6 = f8, f9 + ;; + // 3 Newton-Raphson iterations. 
+(p6) fmpy.s1 f12 = f8, f10 +(p6) fnma.s1 f11 = f9, f10, f1 + ;; +(p6) fma.s1 f12 = f11, f12, f12 +(p6) fmpy.s1 f13 = f11, f11 + ;; +(p6) fma.s1 f10 = f11, f10, f10 +(p6) fma.s1 f11 = f13, f12, f12 + ;; + sub in1 = r0, in1 +(p6) fma.s1 f10 = f13, f10, f10 +(p6) fnma.s1 f12 = f9, f11, f8 + ;; + setf.sig f9 = in1 +(p6) fma.s1 f10 = f12, f10, f11 + ;; + fcvt.fx.trunc.s1 f10 = f10 + ;; + // r = q * (-b) + a + xma.l f10 = f10, f9, f14 + ;; + // Transfer result to GP registers. + getf.sig ret0 = f10 + br.ret.sptk rp + ;; + .endp __moddi3 +#endif + +#ifdef L__udivdi3 +// Compute a 64-bit unsigned integer quotient. +// +// From the Intel IA-64 Optimization Guide, choose the minimum latency +// alternative. +// +// in0 holds the dividend. in1 holds the divisor. + + .text + .align 16 + .global __udivdi3 + .proc __udivdi3 +__udivdi3: + .regstk 2,0,0,0 + // Transfer inputs to FP registers. + setf.sig f8 = in0 + setf.sig f9 = in1 + // Check divide by zero. + cmp.ne.unc p0,p7=0,in1 + ;; + // Convert the inputs to FP, to avoid FP software-assist faults. + fcvt.xuf.s1 f8 = f8 + fcvt.xuf.s1 f9 = f9 +(p7) break 1 + ;; + // Compute the reciprocal approximation. + frcpa.s1 f10, p6 = f8, f9 + ;; + // 3 Newton-Raphson iterations. +(p6) fnma.s1 f11 = f9, f10, f1 +(p6) fmpy.s1 f12 = f8, f10 + ;; +(p6) fmpy.s1 f13 = f11, f11 +(p6) fma.s1 f12 = f11, f12, f12 + ;; +(p6) fma.s1 f10 = f11, f10, f10 +(p6) fma.s1 f11 = f13, f12, f12 + ;; +(p6) fma.s1 f10 = f13, f10, f10 +(p6) fnma.s1 f12 = f9, f11, f8 + ;; +(p6) fma.s1 f10 = f12, f10, f11 + ;; + // Round quotient to an unsigned integer. + fcvt.fxu.trunc.s1 f10 = f10 + ;; + // Transfer result to GP registers. + getf.sig ret0 = f10 + br.ret.sptk rp + ;; + .endp __udivdi3 +#endif + +#ifdef L__umoddi3 +// Compute a 64-bit unsigned integer modulus. +// +// From the Intel IA-64 Optimization Guide, choose the minimum latency +// alternative. +// +// in0 holds the dividend (a). in1 holds the divisor (b). + + .text + .align 16 + .global __umoddi3 + .proc __umoddi3 +__umoddi3: + .regstk 2,0,0,0 + // Transfer inputs to FP registers. + setf.sig f14 = in0 + setf.sig f9 = in1 + // Check divide by zero. + cmp.ne.unc p0,p7=0,in1 + ;; + // Convert the inputs to FP, to avoid FP software assist faults. + fcvt.xuf.s1 f8 = f14 + fcvt.xuf.s1 f9 = f9 +(p7) break 1; + ;; + // Compute the reciprocal approximation. + frcpa.s1 f10, p6 = f8, f9 + ;; + // 3 Newton-Raphson iterations. +(p6) fmpy.s1 f12 = f8, f10 +(p6) fnma.s1 f11 = f9, f10, f1 + ;; +(p6) fma.s1 f12 = f11, f12, f12 +(p6) fmpy.s1 f13 = f11, f11 + ;; +(p6) fma.s1 f10 = f11, f10, f10 +(p6) fma.s1 f11 = f13, f12, f12 + ;; + sub in1 = r0, in1 +(p6) fma.s1 f10 = f13, f10, f10 +(p6) fnma.s1 f12 = f9, f11, f8 + ;; + setf.sig f9 = in1 +(p6) fma.s1 f10 = f12, f10, f11 + ;; + // Round quotient to an unsigned integer. + fcvt.fxu.trunc.s1 f10 = f10 + ;; + // r = q * (-b) + a + xma.l f10 = f10, f9, f14 + ;; + // Transfer result to GP registers. + getf.sig ret0 = f10 + br.ret.sptk rp + ;; + .endp __umoddi3 +#endif + +#ifdef L__divsi3 +// Compute a 32-bit integer quotient. +// +// From the Intel IA-64 Optimization Guide, choose the minimum latency +// alternative. +// +// in0 holds the dividend. in1 holds the divisor. + + .text + .align 16 + .global __divsi3 + .proc __divsi3 +__divsi3: + .regstk 2,0,0,0 + // Check divide by zero. 
+ cmp.ne.unc p0,p7=0,in1 + sxt4 in0 = in0 + sxt4 in1 = in1 + ;; + setf.sig f8 = in0 + setf.sig f9 = in1 +(p7) break 1 + ;; + mov r2 = 0x0ffdd + fcvt.xf f8 = f8 + fcvt.xf f9 = f9 + ;; + setf.exp f11 = r2 + frcpa.s1 f10, p6 = f8, f9 + ;; +(p6) fmpy.s1 f8 = f8, f10 +(p6) fnma.s1 f9 = f9, f10, f1 + ;; +(p6) fma.s1 f8 = f9, f8, f8 +(p6) fma.s1 f9 = f9, f9, f11 + ;; +(p6) fma.s1 f10 = f9, f8, f8 + ;; + fcvt.fx.trunc.s1 f10 = f10 + ;; + getf.sig ret0 = f10 + br.ret.sptk rp + ;; + .endp __divsi3 +#endif + +#ifdef L__modsi3 +// Compute a 32-bit integer modulus. +// +// From the Intel IA-64 Optimization Guide, choose the minimum latency +// alternative. +// +// in0 holds the dividend. in1 holds the divisor. + + .text + .align 16 + .global __modsi3 + .proc __modsi3 +__modsi3: + .regstk 2,0,0,0 + mov r2 = 0x0ffdd + sxt4 in0 = in0 + sxt4 in1 = in1 + ;; + setf.sig f13 = r32 + setf.sig f9 = r33 + // Check divide by zero. + cmp.ne.unc p0,p7=0,in1 + ;; + sub in1 = r0, in1 + fcvt.xf f8 = f13 + fcvt.xf f9 = f9 + ;; + setf.exp f11 = r2 + frcpa.s1 f10, p6 = f8, f9 +(p7) break 1 + ;; +(p6) fmpy.s1 f12 = f8, f10 +(p6) fnma.s1 f10 = f9, f10, f1 + ;; + setf.sig f9 = in1 +(p6) fma.s1 f12 = f10, f12, f12 +(p6) fma.s1 f10 = f10, f10, f11 + ;; +(p6) fma.s1 f10 = f10, f12, f12 + ;; + fcvt.fx.trunc.s1 f10 = f10 + ;; + xma.l f10 = f10, f9, f13 + ;; + getf.sig ret0 = f10 + br.ret.sptk rp + ;; + .endp __modsi3 +#endif + +#ifdef L__udivsi3 +// Compute a 32-bit unsigned integer quotient. +// +// From the Intel IA-64 Optimization Guide, choose the minimum latency +// alternative. +// +// in0 holds the dividend. in1 holds the divisor. + + .text + .align 16 + .global __udivsi3 + .proc __udivsi3 +__udivsi3: + .regstk 2,0,0,0 + mov r2 = 0x0ffdd + zxt4 in0 = in0 + zxt4 in1 = in1 + ;; + setf.sig f8 = in0 + setf.sig f9 = in1 + // Check divide by zero. + cmp.ne.unc p0,p7=0,in1 + ;; + fcvt.xf f8 = f8 + fcvt.xf f9 = f9 +(p7) break 1 + ;; + setf.exp f11 = r2 + frcpa.s1 f10, p6 = f8, f9 + ;; +(p6) fmpy.s1 f8 = f8, f10 +(p6) fnma.s1 f9 = f9, f10, f1 + ;; +(p6) fma.s1 f8 = f9, f8, f8 +(p6) fma.s1 f9 = f9, f9, f11 + ;; +(p6) fma.s1 f10 = f9, f8, f8 + ;; + fcvt.fxu.trunc.s1 f10 = f10 + ;; + getf.sig ret0 = f10 + br.ret.sptk rp + ;; + .endp __udivsi3 +#endif + +#ifdef L__umodsi3 +// Compute a 32-bit unsigned integer modulus. +// +// From the Intel IA-64 Optimization Guide, choose the minimum latency +// alternative. +// +// in0 holds the dividend. in1 holds the divisor. + + .text + .align 16 + .global __umodsi3 + .proc __umodsi3 +__umodsi3: + .regstk 2,0,0,0 + mov r2 = 0x0ffdd + zxt4 in0 = in0 + zxt4 in1 = in1 + ;; + setf.sig f13 = in0 + setf.sig f9 = in1 + // Check divide by zero. + cmp.ne.unc p0,p7=0,in1 + ;; + sub in1 = r0, in1 + fcvt.xf f8 = f13 + fcvt.xf f9 = f9 + ;; + setf.exp f11 = r2 + frcpa.s1 f10, p6 = f8, f9 +(p7) break 1; + ;; +(p6) fmpy.s1 f12 = f8, f10 +(p6) fnma.s1 f10 = f9, f10, f1 + ;; + setf.sig f9 = in1 +(p6) fma.s1 f12 = f10, f12, f12 +(p6) fma.s1 f10 = f10, f10, f11 + ;; +(p6) fma.s1 f10 = f10, f12, f12 + ;; + fcvt.fxu.trunc.s1 f10 = f10 + ;; + xma.l f10 = f10, f9, f13 + ;; + getf.sig ret0 = f10 + br.ret.sptk rp + ;; + .endp __umodsi3 +#endif + +#ifdef L__save_stack_nonlocal +// Notes on save/restore stack nonlocal: We read ar.bsp but write +// ar.bspstore. This is because ar.bsp can be read at all times +// (independent of the RSE mode) but since it's read-only we need to +// restore the value via ar.bspstore. This is OK because +// ar.bsp==ar.bspstore after executing "flushrs". 
+ +// void __ia64_save_stack_nonlocal(void *save_area, void *stack_pointer) + + .text + .align 16 + .global __ia64_save_stack_nonlocal + .proc __ia64_save_stack_nonlocal +__ia64_save_stack_nonlocal: + { .mmf + alloc r18 = ar.pfs, 2, 0, 0, 0 + mov r19 = ar.rsc + ;; + } + { .mmi + flushrs + st8 [in0] = in1, 24 + and r19 = 0x1c, r19 + ;; + } + { .mmi + st8 [in0] = r18, -16 + mov ar.rsc = r19 + or r19 = 0x3, r19 + ;; + } + { .mmi + mov r16 = ar.bsp + mov r17 = ar.rnat + adds r2 = 8, in0 + ;; + } + { .mmi + st8 [in0] = r16 + st8 [r2] = r17 + } + { .mib + mov ar.rsc = r19 + br.ret.sptk.few rp + ;; + } + .endp __ia64_save_stack_nonlocal +#endif + +#ifdef L__nonlocal_goto +// void __ia64_nonlocal_goto(void *target_label, void *save_area, +// void *static_chain); + + .text + .align 16 + .global __ia64_nonlocal_goto + .proc __ia64_nonlocal_goto +__ia64_nonlocal_goto: + { .mmi + alloc r20 = ar.pfs, 3, 0, 0, 0 + ld8 r12 = [in1], 8 + mov.ret.sptk rp = in0, .L0 + ;; + } + { .mmf + ld8 r16 = [in1], 8 + mov r19 = ar.rsc + ;; + } + { .mmi + flushrs + ld8 r17 = [in1], 8 + and r19 = 0x1c, r19 + ;; + } + { .mmi + ld8 r18 = [in1] + mov ar.rsc = r19 + or r19 = 0x3, r19 + ;; + } + { .mmi + mov ar.bspstore = r16 + ;; + mov ar.rnat = r17 + ;; + } + { .mmi + loadrs + invala + mov r15 = in2 + ;; + } +.L0: { .mib + mov ar.rsc = r19 + mov ar.pfs = r18 + br.ret.sptk.few rp + ;; + } + .endp __ia64_nonlocal_goto +#endif + +#ifdef L__restore_stack_nonlocal +// This is mostly the same as nonlocal_goto above. +// ??? This has not been tested yet. + +// void __ia64_restore_stack_nonlocal(void *save_area) + + .text + .align 16 + .global __ia64_restore_stack_nonlocal + .proc __ia64_restore_stack_nonlocal +__ia64_restore_stack_nonlocal: + { .mmf + alloc r20 = ar.pfs, 4, 0, 0, 0 + ld8 r12 = [in0], 8 + ;; + } + { .mmb + ld8 r16=[in0], 8 + mov r19 = ar.rsc + ;; + } + { .mmi + flushrs + ld8 r17 = [in0], 8 + and r19 = 0x1c, r19 + ;; + } + { .mmf + ld8 r18 = [in0] + mov ar.rsc = r19 + ;; + } + { .mmi + mov ar.bspstore = r16 + ;; + mov ar.rnat = r17 + or r19 = 0x3, r19 + ;; + } + { .mmf + loadrs + invala + ;; + } +.L0: { .mib + mov ar.rsc = r19 + mov ar.pfs = r18 + br.ret.sptk.few rp + ;; + } + .endp __ia64_restore_stack_nonlocal +#endif + +#ifdef L__trampoline +// Implement the nested function trampoline. This is out of line +// so that we don't have to bother with flushing the icache, as +// well as making the on-stack trampoline smaller. +// +// The trampoline has the following form: +// +// +-------------------+ > +// TRAMP: | __ia64_trampoline | | +// +-------------------+ > fake function descriptor +// | TRAMP+16 | | +// +-------------------+ > +// | target descriptor | +// +-------------------+ +// | static link | +// +-------------------+ + + .text + .align 16 + .global __ia64_trampoline + .proc __ia64_trampoline +__ia64_trampoline: + { .mmi + ld8 r2 = [r1], 8 + ;; + ld8 r15 = [r1] + } + { .mmi + ld8 r3 = [r2], 8 + ;; + ld8 r1 = [r2] + mov b6 = r3 + } + { .bbb + br.sptk.many b6 + ;; + } + .endp __ia64_trampoline +#endif + +#ifdef SHARED +// Thunks for backward compatibility. 
+#ifdef L_fixtfdi + .text + .align 16 + .global __fixtfti + .proc __fixtfti +__fixtfti: + { .bbb + br.sptk.many __fixxfti + ;; + } + .endp __fixtfti +#endif +#ifdef L_fixunstfdi + .align 16 + .global __fixunstfti + .proc __fixunstfti +__fixunstfti: + { .bbb + br.sptk.many __fixunsxfti + ;; + } + .endp __fixunstfti +#endif +#ifdef L_floatditf + .align 16 + .global __floattitf + .proc __floattitf +__floattitf: + { .bbb + br.sptk.many __floattixf + ;; + } + .endp __floattitf +#endif +#endif -- cgit v1.2.3
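Editorial note: the division routines in this patch all follow the same pattern, commented above as "3 Newton-Raphson iterations": frcpa produces a rough reciprocal of the divisor, fma/fnma steps square away the error, and a final residual correction produces the rounded quotient. The following is a minimal C sketch of that scheme, not part of the patch and not bit-exact: recip_estimate() is a hypothetical stand-in for the frcpa instruction, and ordinary C arithmetic replaces the fused multiply-adds.

```c
/* Sketch of the reciprocal-refinement division used by __divsf3/__divdf3/
 * __divxf3 above.  Assumptions: recip_estimate() stands in for frcpa, and
 * plain C rounding replaces the fused fma/fnma steps, so results are only
 * approximately those of the assembly sequence. */
#include <stdio.h>

static double recip_estimate(double b)
{
    return 1.0 / b;                 /* frcpa gives only ~8 valid bits; exact here */
}

static double nr_divide(double a, double b)
{
    double y = recip_estimate(b);   /* y ~= 1/b                          */
    double q = a * y;               /* q ~= a/b          (fmpy)          */
    for (int i = 0; i < 3; i++) {   /* "3 Newton-Raphson iterations"     */
        double e = 1.0 - b * y;     /* error term        (fnma: 1 - b*y) */
        y += y * e;                 /* refine reciprocal (fma)           */
        q += q * e;                 /* refine quotient   (fma)           */
    }
    double r = a - b * q;           /* residual          (fnma: a - b*q) */
    return q + r * y;               /* final correction  (fma)           */
}

int main(void)
{
    printf("355/113 ~= %.17g\n", nr_divide(355.0, 113.0));
    return 0;
}
```

Because the error term e is squared by each full iteration, an initial estimate good to about 8 bits reaches double-extended precision after three steps, which is why the assembly needs no more.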
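The integer routines (__divdi3, __moddi3, and friends) wrap the same quotient computation: convert the operands to floating point (fcvt.xf / fcvt.xuf), refine the quotient, truncate it back to an integer (fcvt.fx.trunc), and for the modulus recover the remainder as r = q*(-b) + a with xma.l. A small C sketch of just that truncation and remainder identity, again illustrative and not part of the patch, with the host FP divide standing in for the refined quotient and no attempt to preserve full 64-bit precision:

```c
/* Sketch of the __divdi3/__moddi3 structure: quotient formed in FP,
 * truncated toward zero, remainder recovered as r = q*(-b) + a.
 * Assumption: the host double divide replaces the frcpa refinement, so
 * this only demonstrates the truncation and remainder identity. */
#include <stdio.h>
#include <stdint.h>

static void divmod64(int64_t a, int64_t b, int64_t *q, int64_t *r)
{
    double fa = (double)a, fb = (double)b;  /* fcvt.xf: operands to FP        */
    *q = (int64_t)(fa / fb);                /* fcvt.fx.trunc: round toward 0  */
    *r = *q * (-b) + a;                     /* xma.l: r = q*(-b) + a          */
}

int main(void)
{
    int64_t q, r;
    divmod64(-7, 3, &q, &r);
    printf("q = %lld, r = %lld\n", (long long)q, (long long)r);  /* -2, -1 */
    return 0;
}
```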