From 554fd8c5195424bdbcabf5de30fdc183aba391bd Mon Sep 17 00:00:00 2001 From: upstream source tree Date: Sun, 15 Mar 2015 20:14:05 -0400 Subject: obtained gcc-4.6.4.tar.bz2 from upstream website; verified gcc-4.6.4.tar.bz2.sig; imported gcc-4.6.4 source tree from verified upstream tarball. downloading a git-generated archive based on the 'upstream' tag should provide you with a source tree that is binary identical to the one extracted from the above tarball. if you have obtained the source via the command 'git clone', however, do note that line-endings of files in your working directory might differ from line-endings of the respective files in the upstream repository. --- gcc/config/i386/abmintrin.h | 55 + gcc/config/i386/ammintrin.h | 88 + gcc/config/i386/athlon.md | 1187 ++ gcc/config/i386/atom.md | 796 + gcc/config/i386/att.h | 92 + gcc/config/i386/avxintrin.h | 1426 ++ gcc/config/i386/avxmath.h | 29 + gcc/config/i386/bdver1.md | 796 + gcc/config/i386/biarch64.h | 29 + gcc/config/i386/bmiintrin.h | 145 + gcc/config/i386/bmmintrin.h | 29 + gcc/config/i386/bsd.h | 100 + gcc/config/i386/constraints.md | 175 + gcc/config/i386/core2.md | 691 + gcc/config/i386/cpuid.h | 188 + gcc/config/i386/cross-stdarg.h | 73 + gcc/config/i386/crtdll.h | 42 + gcc/config/i386/crtfastmath.c | 89 + gcc/config/i386/crtprec.c | 47 + gcc/config/i386/cygming-crtbegin.c | 135 + gcc/config/i386/cygming-crtend.c | 88 + gcc/config/i386/cygming.h | 478 + gcc/config/i386/cygming.opt | 54 + gcc/config/i386/cygwin-stdint.h | 62 + gcc/config/i386/cygwin.asm | 188 + gcc/config/i386/cygwin.h | 142 + gcc/config/i386/darwin-libgcc.10.4.ver | 98 + gcc/config/i386/darwin-libgcc.10.5.ver | 102 + gcc/config/i386/darwin.h | 323 + gcc/config/i386/darwin64.h | 35 + gcc/config/i386/djgpp-stdint.h | 62 + gcc/config/i386/djgpp.h | 182 + gcc/config/i386/djgpp.opt | 28 + gcc/config/i386/driver-i386.c | 769 + gcc/config/i386/emmintrin.h | 1513 ++ gcc/config/i386/fma4intrin.h | 236 + gcc/config/i386/freebsd.h | 152 + gcc/config/i386/freebsd64.h | 46 + gcc/config/i386/gas.h | 124 + gcc/config/i386/geode.md | 152 + gcc/config/i386/gmm_malloc.h | 74 + gcc/config/i386/gmon-sol2.c | 459 + gcc/config/i386/gnu.h | 56 + gcc/config/i386/gstabs.h | 7 + gcc/config/i386/gthr-win32.c | 260 + gcc/config/i386/host-cygwin.c | 78 + gcc/config/i386/host-i386-darwin.c | 30 + gcc/config/i386/host-mingw32.c | 179 + gcc/config/i386/i386-builtin-types.awk | 280 + gcc/config/i386/i386-builtin-types.def | 420 + gcc/config/i386/i386-c.c | 401 + gcc/config/i386/i386-interix.h | 357 + gcc/config/i386/i386-interix3.h | 23 + gcc/config/i386/i386-modes.def | 91 + gcc/config/i386/i386-protos.h | 292 + gcc/config/i386/i386.c | 35376 +++++++++++++++++++++++++++++++ gcc/config/i386/i386.h | 2400 +++ gcc/config/i386/i386.md | 18347 ++++++++++++++++ gcc/config/i386/i386.opt | 425 + gcc/config/i386/i386elf.h | 125 + gcc/config/i386/ia32intrin.h | 234 + gcc/config/i386/immintrin.h | 203 + gcc/config/i386/k6.md | 267 + gcc/config/i386/kfreebsd-gnu.h | 25 + gcc/config/i386/knetbsd-gnu.h | 23 + gcc/config/i386/kopensolaris-gnu.h | 22 + gcc/config/i386/libgcc-glibc.ver | 186 + gcc/config/i386/linux-unwind.h | 197 + gcc/config/i386/linux.h | 215 + gcc/config/i386/linux64.h | 132 + gcc/config/i386/lwpintrin.h | 100 + gcc/config/i386/lynx.h | 90 + gcc/config/i386/mingw-stdint.h | 50 + gcc/config/i386/mingw-w64.h | 79 + gcc/config/i386/mingw-w64.opt | 23 + gcc/config/i386/mingw.opt | 27 + gcc/config/i386/mingw32.h | 247 + gcc/config/i386/mm3dnow.h | 215 + gcc/config/i386/mmintrin.h | 921 
+ gcc/config/i386/mmx.md | 1716 ++ gcc/config/i386/msformat-c.c | 197 + gcc/config/i386/netbsd-elf.h | 124 + gcc/config/i386/netbsd.h | 96 + gcc/config/i386/netbsd64.h | 72 + gcc/config/i386/netware-crt0.c | 79 + gcc/config/i386/netware-libgcc.c | 58 + gcc/config/i386/netware-libgcc.def | 2 + gcc/config/i386/netware-libgcc.exp | 83 + gcc/config/i386/netware.c | 229 + gcc/config/i386/netware.h | 177 + gcc/config/i386/netware.opt | 33 + gcc/config/i386/nmmintrin.h | 37 + gcc/config/i386/nto.h | 108 + gcc/config/i386/nto.opt | 33 + gcc/config/i386/nwld.c | 73 + gcc/config/i386/nwld.h | 69 + gcc/config/i386/openbsd.h | 101 + gcc/config/i386/openbsdelf.h | 134 + gcc/config/i386/pentium.md | 306 + gcc/config/i386/pmm_malloc.h | 57 + gcc/config/i386/pmmintrin.h | 128 + gcc/config/i386/popcntintrin.h | 46 + gcc/config/i386/ppro.md | 758 + gcc/config/i386/predicates.md | 1226 ++ gcc/config/i386/rtemself.h | 32 + gcc/config/i386/sfp-machine.h | 5 + gcc/config/i386/smmintrin.h | 831 + gcc/config/i386/sol2-10.h | 138 + gcc/config/i386/sol2-c1.asm | 151 + gcc/config/i386/sol2-ci.asm | 40 + gcc/config/i386/sol2-cn.asm | 35 + gcc/config/i386/sol2-gas.h | 31 + gcc/config/i386/sol2-gc1.asm | 155 + gcc/config/i386/sol2-unwind.h | 289 + gcc/config/i386/sol2.h | 182 + gcc/config/i386/sse.md | 12125 +++++++++++ gcc/config/i386/ssemath.h | 25 + gcc/config/i386/sync.md | 242 + gcc/config/i386/sysv4.h | 73 + gcc/config/i386/t-crtfm | 8 + gcc/config/i386/t-crtpc | 34 + gcc/config/i386/t-crtpic | 10 + gcc/config/i386/t-crtstuff | 7 + gcc/config/i386/t-cygming | 109 + gcc/config/i386/t-cygwin | 39 + gcc/config/i386/t-darwin | 5 + gcc/config/i386/t-darwin64 | 8 + gcc/config/i386/t-djgpp | 2 + gcc/config/i386/t-dlldir | 6 + gcc/config/i386/t-dlldir-x | 9 + gcc/config/i386/t-dw2-eh | 3 + gcc/config/i386/t-fprules-softfp | 6 + gcc/config/i386/t-gmm_malloc | 6 + gcc/config/i386/t-gnu | 1 + gcc/config/i386/t-gthr-win32 | 2 + gcc/config/i386/t-i386 | 41 + gcc/config/i386/t-i386elf | 4 + gcc/config/i386/t-interix | 8 + gcc/config/i386/t-kfreebsd | 5 + gcc/config/i386/t-linux | 9 + gcc/config/i386/t-linux64 | 36 + gcc/config/i386/t-mingw-w32 | 12 + gcc/config/i386/t-mingw-w64 | 12 + gcc/config/i386/t-mingw32 | 5 + gcc/config/i386/t-netware | 10 + gcc/config/i386/t-nto | 4 + gcc/config/i386/t-nwld | 50 + gcc/config/i386/t-openbsd | 6 + gcc/config/i386/t-pmm_malloc | 6 + gcc/config/i386/t-rtems-i386 | 69 + gcc/config/i386/t-sjlj-eh | 3 + gcc/config/i386/t-sol2-10 | 29 + gcc/config/i386/t-svr3dbx | 7 + gcc/config/i386/t-vxworks | 8 + gcc/config/i386/t-vxworksae | 5 + gcc/config/i386/tbmintrin.h | 191 + gcc/config/i386/tmmintrin.h | 244 + gcc/config/i386/unix.h | 81 + gcc/config/i386/vx-common.h | 33 + gcc/config/i386/vxworks.h | 76 + gcc/config/i386/vxworksae.h | 35 + gcc/config/i386/w32-unwind.h | 204 + gcc/config/i386/winnt-cxx.c | 175 + gcc/config/i386/winnt-stubs.c | 52 + gcc/config/i386/winnt.c | 1134 + gcc/config/i386/wmmintrin.h | 120 + gcc/config/i386/x-cygwin | 4 + gcc/config/i386/x-darwin | 4 + gcc/config/i386/x-i386 | 4 + gcc/config/i386/x-mingw32 | 31 + gcc/config/i386/x86-64.h | 106 + gcc/config/i386/x86intrin.h | 96 + gcc/config/i386/xm-cygwin.h | 22 + gcc/config/i386/xm-djgpp.h | 84 + gcc/config/i386/xm-mingw32.h | 35 + gcc/config/i386/xmmintrin.h | 1251 ++ gcc/config/i386/xopintrin.h | 835 + 177 files changed, 99679 insertions(+) create mode 100644 gcc/config/i386/abmintrin.h create mode 100644 gcc/config/i386/ammintrin.h create mode 100644 gcc/config/i386/athlon.md create mode 100644 gcc/config/i386/atom.md 
create mode 100644 gcc/config/i386/att.h create mode 100644 gcc/config/i386/avxintrin.h create mode 100644 gcc/config/i386/avxmath.h create mode 100644 gcc/config/i386/bdver1.md create mode 100644 gcc/config/i386/biarch64.h create mode 100644 gcc/config/i386/bmiintrin.h create mode 100644 gcc/config/i386/bmmintrin.h create mode 100644 gcc/config/i386/bsd.h create mode 100644 gcc/config/i386/constraints.md create mode 100644 gcc/config/i386/core2.md create mode 100644 gcc/config/i386/cpuid.h create mode 100644 gcc/config/i386/cross-stdarg.h create mode 100644 gcc/config/i386/crtdll.h create mode 100644 gcc/config/i386/crtfastmath.c create mode 100644 gcc/config/i386/crtprec.c create mode 100644 gcc/config/i386/cygming-crtbegin.c create mode 100644 gcc/config/i386/cygming-crtend.c create mode 100644 gcc/config/i386/cygming.h create mode 100644 gcc/config/i386/cygming.opt create mode 100644 gcc/config/i386/cygwin-stdint.h create mode 100644 gcc/config/i386/cygwin.asm create mode 100644 gcc/config/i386/cygwin.h create mode 100644 gcc/config/i386/darwin-libgcc.10.4.ver create mode 100644 gcc/config/i386/darwin-libgcc.10.5.ver create mode 100644 gcc/config/i386/darwin.h create mode 100644 gcc/config/i386/darwin64.h create mode 100644 gcc/config/i386/djgpp-stdint.h create mode 100644 gcc/config/i386/djgpp.h create mode 100644 gcc/config/i386/djgpp.opt create mode 100644 gcc/config/i386/driver-i386.c create mode 100644 gcc/config/i386/emmintrin.h create mode 100644 gcc/config/i386/fma4intrin.h create mode 100644 gcc/config/i386/freebsd.h create mode 100644 gcc/config/i386/freebsd64.h create mode 100644 gcc/config/i386/gas.h create mode 100644 gcc/config/i386/geode.md create mode 100644 gcc/config/i386/gmm_malloc.h create mode 100644 gcc/config/i386/gmon-sol2.c create mode 100644 gcc/config/i386/gnu.h create mode 100644 gcc/config/i386/gstabs.h create mode 100644 gcc/config/i386/gthr-win32.c create mode 100644 gcc/config/i386/host-cygwin.c create mode 100644 gcc/config/i386/host-i386-darwin.c create mode 100644 gcc/config/i386/host-mingw32.c create mode 100644 gcc/config/i386/i386-builtin-types.awk create mode 100644 gcc/config/i386/i386-builtin-types.def create mode 100644 gcc/config/i386/i386-c.c create mode 100644 gcc/config/i386/i386-interix.h create mode 100644 gcc/config/i386/i386-interix3.h create mode 100644 gcc/config/i386/i386-modes.def create mode 100644 gcc/config/i386/i386-protos.h create mode 100644 gcc/config/i386/i386.c create mode 100644 gcc/config/i386/i386.h create mode 100644 gcc/config/i386/i386.md create mode 100644 gcc/config/i386/i386.opt create mode 100644 gcc/config/i386/i386elf.h create mode 100644 gcc/config/i386/ia32intrin.h create mode 100644 gcc/config/i386/immintrin.h create mode 100644 gcc/config/i386/k6.md create mode 100644 gcc/config/i386/kfreebsd-gnu.h create mode 100644 gcc/config/i386/knetbsd-gnu.h create mode 100644 gcc/config/i386/kopensolaris-gnu.h create mode 100644 gcc/config/i386/libgcc-glibc.ver create mode 100644 gcc/config/i386/linux-unwind.h create mode 100644 gcc/config/i386/linux.h create mode 100644 gcc/config/i386/linux64.h create mode 100644 gcc/config/i386/lwpintrin.h create mode 100644 gcc/config/i386/lynx.h create mode 100644 gcc/config/i386/mingw-stdint.h create mode 100644 gcc/config/i386/mingw-w64.h create mode 100644 gcc/config/i386/mingw-w64.opt create mode 100644 gcc/config/i386/mingw.opt create mode 100644 gcc/config/i386/mingw32.h create mode 100644 gcc/config/i386/mm3dnow.h create mode 100644 gcc/config/i386/mmintrin.h create mode 
100644 gcc/config/i386/mmx.md create mode 100644 gcc/config/i386/msformat-c.c create mode 100644 gcc/config/i386/netbsd-elf.h create mode 100644 gcc/config/i386/netbsd.h create mode 100644 gcc/config/i386/netbsd64.h create mode 100644 gcc/config/i386/netware-crt0.c create mode 100644 gcc/config/i386/netware-libgcc.c create mode 100644 gcc/config/i386/netware-libgcc.def create mode 100644 gcc/config/i386/netware-libgcc.exp create mode 100644 gcc/config/i386/netware.c create mode 100644 gcc/config/i386/netware.h create mode 100644 gcc/config/i386/netware.opt create mode 100644 gcc/config/i386/nmmintrin.h create mode 100644 gcc/config/i386/nto.h create mode 100644 gcc/config/i386/nto.opt create mode 100644 gcc/config/i386/nwld.c create mode 100644 gcc/config/i386/nwld.h create mode 100644 gcc/config/i386/openbsd.h create mode 100644 gcc/config/i386/openbsdelf.h create mode 100644 gcc/config/i386/pentium.md create mode 100644 gcc/config/i386/pmm_malloc.h create mode 100644 gcc/config/i386/pmmintrin.h create mode 100644 gcc/config/i386/popcntintrin.h create mode 100644 gcc/config/i386/ppro.md create mode 100644 gcc/config/i386/predicates.md create mode 100644 gcc/config/i386/rtemself.h create mode 100644 gcc/config/i386/sfp-machine.h create mode 100644 gcc/config/i386/smmintrin.h create mode 100644 gcc/config/i386/sol2-10.h create mode 100644 gcc/config/i386/sol2-c1.asm create mode 100644 gcc/config/i386/sol2-ci.asm create mode 100644 gcc/config/i386/sol2-cn.asm create mode 100644 gcc/config/i386/sol2-gas.h create mode 100644 gcc/config/i386/sol2-gc1.asm create mode 100644 gcc/config/i386/sol2-unwind.h create mode 100644 gcc/config/i386/sol2.h create mode 100644 gcc/config/i386/sse.md create mode 100644 gcc/config/i386/ssemath.h create mode 100644 gcc/config/i386/sync.md create mode 100644 gcc/config/i386/sysv4.h create mode 100644 gcc/config/i386/t-crtfm create mode 100644 gcc/config/i386/t-crtpc create mode 100644 gcc/config/i386/t-crtpic create mode 100644 gcc/config/i386/t-crtstuff create mode 100644 gcc/config/i386/t-cygming create mode 100644 gcc/config/i386/t-cygwin create mode 100644 gcc/config/i386/t-darwin create mode 100644 gcc/config/i386/t-darwin64 create mode 100644 gcc/config/i386/t-djgpp create mode 100644 gcc/config/i386/t-dlldir create mode 100644 gcc/config/i386/t-dlldir-x create mode 100644 gcc/config/i386/t-dw2-eh create mode 100644 gcc/config/i386/t-fprules-softfp create mode 100644 gcc/config/i386/t-gmm_malloc create mode 100644 gcc/config/i386/t-gnu create mode 100644 gcc/config/i386/t-gthr-win32 create mode 100644 gcc/config/i386/t-i386 create mode 100644 gcc/config/i386/t-i386elf create mode 100644 gcc/config/i386/t-interix create mode 100644 gcc/config/i386/t-kfreebsd create mode 100644 gcc/config/i386/t-linux create mode 100644 gcc/config/i386/t-linux64 create mode 100644 gcc/config/i386/t-mingw-w32 create mode 100644 gcc/config/i386/t-mingw-w64 create mode 100644 gcc/config/i386/t-mingw32 create mode 100644 gcc/config/i386/t-netware create mode 100644 gcc/config/i386/t-nto create mode 100644 gcc/config/i386/t-nwld create mode 100644 gcc/config/i386/t-openbsd create mode 100644 gcc/config/i386/t-pmm_malloc create mode 100644 gcc/config/i386/t-rtems-i386 create mode 100644 gcc/config/i386/t-sjlj-eh create mode 100644 gcc/config/i386/t-sol2-10 create mode 100644 gcc/config/i386/t-svr3dbx create mode 100644 gcc/config/i386/t-vxworks create mode 100644 gcc/config/i386/t-vxworksae create mode 100644 gcc/config/i386/tbmintrin.h create mode 100644 
gcc/config/i386/tmmintrin.h create mode 100644 gcc/config/i386/unix.h create mode 100644 gcc/config/i386/vx-common.h create mode 100644 gcc/config/i386/vxworks.h create mode 100644 gcc/config/i386/vxworksae.h create mode 100644 gcc/config/i386/w32-unwind.h create mode 100644 gcc/config/i386/winnt-cxx.c create mode 100644 gcc/config/i386/winnt-stubs.c create mode 100644 gcc/config/i386/winnt.c create mode 100644 gcc/config/i386/wmmintrin.h create mode 100644 gcc/config/i386/x-cygwin create mode 100644 gcc/config/i386/x-darwin create mode 100644 gcc/config/i386/x-i386 create mode 100644 gcc/config/i386/x-mingw32 create mode 100644 gcc/config/i386/x86-64.h create mode 100644 gcc/config/i386/x86intrin.h create mode 100644 gcc/config/i386/xm-cygwin.h create mode 100644 gcc/config/i386/xm-djgpp.h create mode 100644 gcc/config/i386/xm-mingw32.h create mode 100644 gcc/config/i386/xmmintrin.h create mode 100644 gcc/config/i386/xopintrin.h (limited to 'gcc/config/i386') diff --git a/gcc/config/i386/abmintrin.h b/gcc/config/i386/abmintrin.h new file mode 100644 index 000000000..9d87f5745 --- /dev/null +++ b/gcc/config/i386/abmintrin.h @@ -0,0 +1,55 @@ +/* Copyright (C) 2009 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +#ifndef _X86INTRIN_H_INCLUDED +# error "Never use directly; include instead." +#endif + +#ifndef __ABM__ +# error "ABM instruction set not enabled" +#endif /* __ABM__ */ + +#ifndef _ABMINTRIN_H_INCLUDED +#define _ABMINTRIN_H_INCLUDED + +extern __inline unsigned short __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__lzcnt16 (unsigned short __X) +{ + return __builtin_clzs (__X); +} + +extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__lzcnt (unsigned int __X) +{ + return __builtin_clz (__X); +} + +#ifdef __x86_64__ +extern __inline unsigned long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__lzcnt64 (unsigned long __X) +{ + return __builtin_clzl (__X); +} +#endif + +#endif /* _ABMINTRIN_H_INCLUDED */ diff --git a/gcc/config/i386/ammintrin.h b/gcc/config/i386/ammintrin.h new file mode 100644 index 000000000..3647b3193 --- /dev/null +++ b/gcc/config/i386/ammintrin.h @@ -0,0 +1,88 @@ +/* Copyright (C) 2007, 2008, 2009 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. 
+ + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +/* Implemented from the specification included in the AMD Programmers + Manual Update, version 2.x */ + +#ifndef _AMMINTRIN_H_INCLUDED +#define _AMMINTRIN_H_INCLUDED + +#ifndef __SSE4A__ +# error "SSE4A instruction set not enabled" +#else + +/* We need definitions from the SSE3, SSE2 and SSE header files*/ +#include + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_stream_sd (double * __P, __m128d __Y) +{ + __builtin_ia32_movntsd (__P, (__v2df) __Y); +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_stream_ss (float * __P, __m128 __Y) +{ + __builtin_ia32_movntss (__P, (__v4sf) __Y); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_extract_si64 (__m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_extrq ((__v2di) __X, (__v16qi) __Y); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_extracti_si64 (__m128i __X, unsigned const int __I, unsigned const int __L) +{ + return (__m128i) __builtin_ia32_extrqi ((__v2di) __X, __I, __L); +} +#else +#define _mm_extracti_si64(X, I, L) \ + ((__m128i) __builtin_ia32_extrqi ((__v2di)(__m128i)(X), \ + (unsigned int)(I), (unsigned int)(L))) +#endif + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_insert_si64 (__m128i __X,__m128i __Y) +{ + return (__m128i) __builtin_ia32_insertq ((__v2di)__X, (__v2di)__Y); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_inserti_si64(__m128i __X, __m128i __Y, unsigned const int __I, unsigned const int __L) +{ + return (__m128i) __builtin_ia32_insertqi ((__v2di)__X, (__v2di)__Y, __I, __L); +} +#else +#define _mm_inserti_si64(X, Y, I, L) \ + ((__m128i) __builtin_ia32_insertqi ((__v2di)(__m128i)(X), \ + (__v2di)(__m128i)(Y), \ + (unsigned int)(I), (unsigned int)(L))) +#endif + +#endif /* __SSE4A__ */ + +#endif /* _AMMINTRIN_H_INCLUDED */ diff --git a/gcc/config/i386/athlon.md b/gcc/config/i386/athlon.md new file mode 100644 index 000000000..2896a154d --- /dev/null +++ b/gcc/config/i386/athlon.md @@ -0,0 +1,1187 @@ +;; Copyright (C) 2002, 2003, 2004, 2005, 2006, +;; 2007 Free Software Foundation, Inc. +;; +;; This file is part of GCC. +;; +;; GCC is free software; you can redistribute it and/or modify +;; it under the terms of the GNU General Public License as published by +;; the Free Software Foundation; either version 3, or (at your option) +;; any later version. +;; +;; GCC is distributed in the hope that it will be useful, +;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;; GNU General Public License for more details. 
+;; +;; You should have received a copy of the GNU General Public License +;; along with GCC; see the file COPYING3. If not see +;; . +;; +;; AMD Athlon Scheduling +;; +;; The Athlon does contain three pipelined FP units, three integer units and +;; three address generation units. +;; +;; The predecode logic is determining boundaries of instructions in the 64 +;; byte cache line. So the cache line straddling problem of K6 might be issue +;; here as well, but it is not noted in the documentation. +;; +;; Three DirectPath instructions decoders and only one VectorPath decoder +;; is available. They can decode three DirectPath instructions or one VectorPath +;; instruction per cycle. +;; Decoded macro instructions are then passed to 72 entry instruction control +;; unit, that passes +;; it to the specialized integer (18 entry) and fp (36 entry) schedulers. +;; +;; The load/store queue unit is not attached to the schedulers but +;; communicates with all the execution units separately instead. + +(define_attr "athlon_decode" "direct,vector,double" + (cond [(eq_attr "type" "call,imul,idiv,other,multi,fcmov,fpspc,str,pop,leave") + (const_string "vector") + (and (eq_attr "type" "push") + (match_operand 1 "memory_operand" "")) + (const_string "vector") + (and (eq_attr "type" "fmov") + (and (eq_attr "memory" "load,store") + (eq_attr "mode" "XF"))) + (const_string "vector")] + (const_string "direct"))) + +(define_attr "amdfam10_decode" "direct,vector,double" + (const_string "direct")) +;; +;; decode0 decode1 decode2 +;; \ | / +;; instruction control unit (72 entry scheduler) +;; | | +;; integer scheduler (18) stack map +;; / | | | | \ stack rename +;; ieu0 agu0 ieu1 agu1 ieu2 agu2 scheduler +;; | agu0 | agu1 agu2 register file +;; | \ | | / | | | +;; \ /\ | / fadd fmul fstore +;; \ / \ | / fadd fmul fstore +;; imul load/store (2x) fadd fmul fstore + +(define_automaton "athlon,athlon_load,athlon_mult,athlon_fp") +(define_cpu_unit "athlon-decode0" "athlon") +(define_cpu_unit "athlon-decode1" "athlon") +(define_cpu_unit "athlon-decode2" "athlon") +(define_cpu_unit "athlon-decodev" "athlon") +;; Model the fact that double decoded instruction may take 2 cycles +;; to decode when decoder2 and decoder0 in next cycle +;; is used (this is needed to allow troughput of 1.5 double decoded +;; instructions per cycle). +;; +;; In order to avoid dependence between reservation of decoder +;; and other units, we model decoder as two stage fully pipelined unit +;; and only double decoded instruction may occupy unit in the first cycle. +;; With this scheme however two double instructions can be issued cycle0. +;; +;; Avoid this by using presence set requiring decoder0 to be allocated +;; too. Vector decoded instructions then can't be issued when +;; modeled as consuming decoder0+decoder1+decoder2. +;; We solve that by specialized vector decoder unit and exclusion set. +(presence_set "athlon-decode2" "athlon-decode0") +(exclusion_set "athlon-decodev" "athlon-decode0,athlon-decode1,athlon-decode2") +(define_reservation "athlon-vector" "nothing,athlon-decodev") +(define_reservation "athlon-direct0" "nothing,athlon-decode0") +(define_reservation "athlon-direct" "nothing, + (athlon-decode0 | athlon-decode1 + | athlon-decode2)") +;; Double instructions behaves like two direct instructions. 
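+;; A short reading aid for the reservation strings used throughout this
+;; file (standard automaton pipeline-description syntax, as documented in
+;; the GCC internals manual): "," advances to the next cycle, "+" reserves
+;; units in the same cycle, "|" chooses one of the alternatives, "unit*N"
+;; repeats a unit for N consecutive cycles, and "nothing" reserves no unit
+;; for that cycle.  The "athlon-double" reservation below therefore reads:
+;; either decode2 followed by decode0 in the next cycle, or an idle cycle
+;; followed by two adjacent decoders taken in parallel.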
+(define_reservation "athlon-double" "((athlon-decode2, athlon-decode0) + | (nothing,(athlon-decode0 + athlon-decode1)) + | (nothing,(athlon-decode1 + athlon-decode2)))") + +;; Agu and ieu unit results in extremely large automatons and +;; in our approximation they are hardly filled in. Only ieu +;; unit can, as issue rate is 3 and agu unit is always used +;; first in the insn reservations. Skip the models. + +;(define_cpu_unit "athlon-ieu0" "athlon_ieu") +;(define_cpu_unit "athlon-ieu1" "athlon_ieu") +;(define_cpu_unit "athlon-ieu2" "athlon_ieu") +;(define_reservation "athlon-ieu" "(athlon-ieu0 | athlon-ieu1 | athlon-ieu2)") +(define_reservation "athlon-ieu" "nothing") +(define_cpu_unit "athlon-ieu0" "athlon") +;(define_cpu_unit "athlon-agu0" "athlon_agu") +;(define_cpu_unit "athlon-agu1" "athlon_agu") +;(define_cpu_unit "athlon-agu2" "athlon_agu") +;(define_reservation "athlon-agu" "(athlon-agu0 | athlon-agu1 | athlon-agu2)") +(define_reservation "athlon-agu" "nothing") + +(define_cpu_unit "athlon-mult" "athlon_mult") + +(define_cpu_unit "athlon-load0" "athlon_load") +(define_cpu_unit "athlon-load1" "athlon_load") +(define_reservation "athlon-load" "athlon-agu, + (athlon-load0 | athlon-load1),nothing") +;; 128bit SSE instructions issue two loads at once +(define_reservation "athlon-load2" "athlon-agu, + (athlon-load0 + athlon-load1),nothing") + +(define_reservation "athlon-store" "(athlon-load0 | athlon-load1)") +;; 128bit SSE instructions issue two stores at once +(define_reservation "athlon-store2" "(athlon-load0 + athlon-load1)") + + +;; The FP operations start to execute at stage 12 in the pipeline, while +;; integer operations start to execute at stage 9 for Athlon and 11 for K8 +;; Compensate the difference for Athlon because it results in significantly +;; smaller automata. +(define_reservation "athlon-fpsched" "nothing,nothing,nothing") +;; The floating point loads. +(define_reservation "athlon-fpload" "(athlon-fpsched + athlon-load)") +(define_reservation "athlon-fpload2" "(athlon-fpsched + athlon-load2)") +(define_reservation "athlon-fploadk8" "(athlon-fpsched + athlon-load)") +(define_reservation "athlon-fpload2k8" "(athlon-fpsched + athlon-load2)") + + +;; The three fp units are fully pipelined with latency of 3 +(define_cpu_unit "athlon-fadd" "athlon_fp") +(define_cpu_unit "athlon-fmul" "athlon_fp") +(define_cpu_unit "athlon-fstore" "athlon_fp") +(define_reservation "athlon-fany" "(athlon-fstore | athlon-fmul | athlon-fadd)") +(define_reservation "athlon-faddmul" "(athlon-fadd | athlon-fmul)") + +;; Vector operations usually consume many of pipes. 
+(define_reservation "athlon-fvector" "(athlon-fadd + athlon-fmul + athlon-fstore)") + + +;; Jump instructions are executed in the branch unit completely transparent to us +(define_insn_reservation "athlon_branch" 0 + (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") + (eq_attr "type" "ibr")) + "athlon-direct,athlon-ieu") +(define_insn_reservation "athlon_call" 0 + (and (eq_attr "cpu" "athlon,k8,generic64") + (eq_attr "type" "call,callv")) + "athlon-vector,athlon-ieu") +(define_insn_reservation "athlon_call_amdfam10" 0 + (and (eq_attr "cpu" "amdfam10") + (eq_attr "type" "call,callv")) + "athlon-double,athlon-ieu") + +;; Latency of push operation is 3 cycles, but ESP value is available +;; earlier +(define_insn_reservation "athlon_push" 2 + (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") + (eq_attr "type" "push")) + "athlon-direct,athlon-agu,athlon-store") +(define_insn_reservation "athlon_pop" 4 + (and (eq_attr "cpu" "athlon,k8,generic64") + (eq_attr "type" "pop")) + "athlon-vector,athlon-load,athlon-ieu") +(define_insn_reservation "athlon_pop_k8" 3 + (and (eq_attr "cpu" "k8,generic64") + (eq_attr "type" "pop")) + "athlon-double,(athlon-ieu+athlon-load)") +(define_insn_reservation "athlon_pop_amdfam10" 3 + (and (eq_attr "cpu" "amdfam10") + (eq_attr "type" "pop")) + "athlon-direct,(athlon-ieu+athlon-load)") +(define_insn_reservation "athlon_leave" 3 + (and (eq_attr "cpu" "athlon") + (eq_attr "type" "leave")) + "athlon-vector,(athlon-ieu+athlon-load)") +(define_insn_reservation "athlon_leave_k8" 3 + (and (eq_attr "cpu" "k8,generic64,amdfam10") + (eq_attr "type" "leave")) + "athlon-double,(athlon-ieu+athlon-load)") + +;; Lea executes in AGU unit with 2 cycles latency. +(define_insn_reservation "athlon_lea" 2 + (and (eq_attr "cpu" "athlon,k8,generic64") + (eq_attr "type" "lea")) + "athlon-direct,athlon-agu,nothing") +;; Lea executes in AGU unit with 1 cycle latency on AMDFAM10 +(define_insn_reservation "athlon_lea_amdfam10" 1 + (and (eq_attr "cpu" "amdfam10") + (eq_attr "type" "lea")) + "athlon-direct,athlon-agu,nothing") + +;; Mul executes in special multiplier unit attached to IEU0 +(define_insn_reservation "athlon_imul" 5 + (and (eq_attr "cpu" "athlon") + (and (eq_attr "type" "imul") + (eq_attr "memory" "none,unknown"))) + "athlon-vector,athlon-ieu0,athlon-mult,nothing,nothing,athlon-ieu0") +;; ??? Widening multiply is vector or double. 
+(define_insn_reservation "athlon_imul_k8_DI" 4 + (and (eq_attr "cpu" "k8,generic64,amdfam10") + (and (eq_attr "type" "imul") + (and (eq_attr "mode" "DI") + (eq_attr "memory" "none,unknown")))) + "athlon-direct0,athlon-ieu0,athlon-mult,nothing,athlon-ieu0") +(define_insn_reservation "athlon_imul_k8" 3 + (and (eq_attr "cpu" "k8,generic64,amdfam10") + (and (eq_attr "type" "imul") + (eq_attr "memory" "none,unknown"))) + "athlon-direct0,athlon-ieu0,athlon-mult,athlon-ieu0") +(define_insn_reservation "athlon_imul_amdfam10_HI" 4 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "imul") + (and (eq_attr "mode" "HI") + (eq_attr "memory" "none,unknown")))) + "athlon-vector,athlon-ieu0,athlon-mult,nothing,athlon-ieu0") +(define_insn_reservation "athlon_imul_mem" 8 + (and (eq_attr "cpu" "athlon") + (and (eq_attr "type" "imul") + (eq_attr "memory" "load,both"))) + "athlon-vector,athlon-load,athlon-ieu,athlon-mult,nothing,nothing,athlon-ieu") +(define_insn_reservation "athlon_imul_mem_k8_DI" 7 + (and (eq_attr "cpu" "k8,generic64,amdfam10") + (and (eq_attr "type" "imul") + (and (eq_attr "mode" "DI") + (eq_attr "memory" "load,both")))) + "athlon-vector,athlon-load,athlon-ieu,athlon-mult,nothing,athlon-ieu") +(define_insn_reservation "athlon_imul_mem_k8" 6 + (and (eq_attr "cpu" "k8,generic64,amdfam10") + (and (eq_attr "type" "imul") + (eq_attr "memory" "load,both"))) + "athlon-vector,athlon-load,athlon-ieu,athlon-mult,athlon-ieu") + +;; Idiv cannot execute in parallel with other instructions. Dealing with it +;; as with short latency vector instruction is good approximation avoiding +;; scheduler from trying too hard to can hide it's latency by overlap with +;; other instructions. +;; ??? Experiments show that the idiv can overlap with roughly 6 cycles +;; of the other code +;; Using the same heuristics for amdfam10 as K8 with idiv + +(define_insn_reservation "athlon_idiv" 6 + (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") + (and (eq_attr "type" "idiv") + (eq_attr "memory" "none,unknown"))) + "athlon-vector,(athlon-ieu0*6+(athlon-fpsched,athlon-fvector))") +(define_insn_reservation "athlon_idiv_mem" 9 + (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") + (and (eq_attr "type" "idiv") + (eq_attr "memory" "load,both"))) + "athlon-vector,((athlon-load,athlon-ieu0*6)+(athlon-fpsched,athlon-fvector))") +;; The parallelism of string instructions is not documented. Model it same way +;; as idiv to create smaller automata. This probably does not matter much. 
+;; Using the same heuristics for amdfam10 as K8 with idiv +(define_insn_reservation "athlon_str" 6 + (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") + (and (eq_attr "type" "str") + (eq_attr "memory" "load,both,store"))) + "athlon-vector,athlon-load,athlon-ieu0*6") + +(define_insn_reservation "athlon_idirect" 1 + (and (eq_attr "cpu" "athlon,k8,generic64") + (and (eq_attr "athlon_decode" "direct") + (and (eq_attr "unit" "integer,unknown") + (eq_attr "memory" "none,unknown")))) + "athlon-direct,athlon-ieu") +(define_insn_reservation "athlon_idirect_amdfam10" 1 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "amdfam10_decode" "direct") + (and (eq_attr "unit" "integer,unknown") + (eq_attr "memory" "none,unknown")))) + "athlon-direct,athlon-ieu") +(define_insn_reservation "athlon_ivector" 2 + (and (eq_attr "cpu" "athlon,k8,generic64") + (and (eq_attr "athlon_decode" "vector") + (and (eq_attr "unit" "integer,unknown") + (eq_attr "memory" "none,unknown")))) + "athlon-vector,athlon-ieu,athlon-ieu") +(define_insn_reservation "athlon_ivector_amdfam10" 2 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "amdfam10_decode" "vector") + (and (eq_attr "unit" "integer,unknown") + (eq_attr "memory" "none,unknown")))) + "athlon-vector,athlon-ieu,athlon-ieu") + +(define_insn_reservation "athlon_idirect_loadmov" 3 + (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") + (and (eq_attr "type" "imov") + (eq_attr "memory" "load"))) + "athlon-direct,athlon-load") + +(define_insn_reservation "athlon_idirect_load" 4 + (and (eq_attr "cpu" "athlon,k8,generic64") + (and (eq_attr "athlon_decode" "direct") + (and (eq_attr "unit" "integer,unknown") + (eq_attr "memory" "load")))) + "athlon-direct,athlon-load,athlon-ieu") +(define_insn_reservation "athlon_idirect_load_amdfam10" 4 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "amdfam10_decode" "direct") + (and (eq_attr "unit" "integer,unknown") + (eq_attr "memory" "load")))) + "athlon-direct,athlon-load,athlon-ieu") +(define_insn_reservation "athlon_ivector_load" 6 + (and (eq_attr "cpu" "athlon,k8,generic64") + (and (eq_attr "athlon_decode" "vector") + (and (eq_attr "unit" "integer,unknown") + (eq_attr "memory" "load")))) + "athlon-vector,athlon-load,athlon-ieu,athlon-ieu") +(define_insn_reservation "athlon_ivector_load_amdfam10" 6 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "amdfam10_decode" "vector") + (and (eq_attr "unit" "integer,unknown") + (eq_attr "memory" "load")))) + "athlon-vector,athlon-load,athlon-ieu,athlon-ieu") + +(define_insn_reservation "athlon_idirect_movstore" 1 + (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") + (and (eq_attr "type" "imov") + (eq_attr "memory" "store"))) + "athlon-direct,athlon-agu,athlon-store") + +(define_insn_reservation "athlon_idirect_both" 4 + (and (eq_attr "cpu" "athlon,k8,generic64") + (and (eq_attr "athlon_decode" "direct") + (and (eq_attr "unit" "integer,unknown") + (eq_attr "memory" "both")))) + "athlon-direct,athlon-load, + athlon-ieu,athlon-store, + athlon-store") +(define_insn_reservation "athlon_idirect_both_amdfam10" 4 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "amdfam10_decode" "direct") + (and (eq_attr "unit" "integer,unknown") + (eq_attr "memory" "both")))) + "athlon-direct,athlon-load, + athlon-ieu,athlon-store, + athlon-store") + +(define_insn_reservation "athlon_ivector_both" 6 + (and (eq_attr "cpu" "athlon,k8,generic64") + (and (eq_attr "athlon_decode" "vector") + (and (eq_attr "unit" "integer,unknown") + (eq_attr "memory" "both")))) + "athlon-vector,athlon-load, + athlon-ieu, + athlon-ieu, 
+ athlon-store") +(define_insn_reservation "athlon_ivector_both_amdfam10" 6 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "amdfam10_decode" "vector") + (and (eq_attr "unit" "integer,unknown") + (eq_attr "memory" "both")))) + "athlon-vector,athlon-load, + athlon-ieu, + athlon-ieu, + athlon-store") + +(define_insn_reservation "athlon_idirect_store" 1 + (and (eq_attr "cpu" "athlon,k8,generic64") + (and (eq_attr "athlon_decode" "direct") + (and (eq_attr "unit" "integer,unknown") + (eq_attr "memory" "store")))) + "athlon-direct,(athlon-ieu+athlon-agu), + athlon-store") +(define_insn_reservation "athlon_idirect_store_amdfam10" 1 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "amdfam10_decode" "direct") + (and (eq_attr "unit" "integer,unknown") + (eq_attr "memory" "store")))) + "athlon-direct,(athlon-ieu+athlon-agu), + athlon-store") + +(define_insn_reservation "athlon_ivector_store" 2 + (and (eq_attr "cpu" "athlon,k8,generic64") + (and (eq_attr "athlon_decode" "vector") + (and (eq_attr "unit" "integer,unknown") + (eq_attr "memory" "store")))) + "athlon-vector,(athlon-ieu+athlon-agu),athlon-ieu, + athlon-store") +(define_insn_reservation "athlon_ivector_store_amdfam10" 2 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "amdfam10_decode" "vector") + (and (eq_attr "unit" "integer,unknown") + (eq_attr "memory" "store")))) + "athlon-vector,(athlon-ieu+athlon-agu),athlon-ieu, + athlon-store") + +;; Athlon floatin point unit +(define_insn_reservation "athlon_fldxf" 12 + (and (eq_attr "cpu" "athlon") + (and (eq_attr "type" "fmov") + (and (eq_attr "memory" "load") + (eq_attr "mode" "XF")))) + "athlon-vector,athlon-fpload2,athlon-fvector*9") +(define_insn_reservation "athlon_fldxf_k8" 13 + (and (eq_attr "cpu" "k8,generic64,amdfam10") + (and (eq_attr "type" "fmov") + (and (eq_attr "memory" "load") + (eq_attr "mode" "XF")))) + "athlon-vector,athlon-fpload2k8,athlon-fvector*9") +;; Assume superforwarding to take place so effective latency of fany op is 0. 
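+;; ("Superforwarding" here presumably refers to the Athlon/K8 feature of
+;; forwarding a floating-point load result directly to the dependent FP
+;; operation, so the load itself adds no visible latency; hence the 0
+;; latency of athlon_fld below, while the K8 variant keeps a latency of 2.)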
+(define_insn_reservation "athlon_fld" 0 + (and (eq_attr "cpu" "athlon") + (and (eq_attr "type" "fmov") + (eq_attr "memory" "load"))) + "athlon-direct,athlon-fpload,athlon-fany") +(define_insn_reservation "athlon_fld_k8" 2 + (and (eq_attr "cpu" "k8,generic64,amdfam10") + (and (eq_attr "type" "fmov") + (eq_attr "memory" "load"))) + "athlon-direct,athlon-fploadk8,athlon-fstore") + +(define_insn_reservation "athlon_fstxf" 10 + (and (eq_attr "cpu" "athlon") + (and (eq_attr "type" "fmov") + (and (eq_attr "memory" "store,both") + (eq_attr "mode" "XF")))) + "athlon-vector,(athlon-fpsched+athlon-agu),(athlon-store2+(athlon-fvector*7))") +(define_insn_reservation "athlon_fstxf_k8" 8 + (and (eq_attr "cpu" "k8,generic64,amdfam10") + (and (eq_attr "type" "fmov") + (and (eq_attr "memory" "store,both") + (eq_attr "mode" "XF")))) + "athlon-vector,(athlon-fpsched+athlon-agu),(athlon-store2+(athlon-fvector*6))") +(define_insn_reservation "athlon_fst" 4 + (and (eq_attr "cpu" "athlon") + (and (eq_attr "type" "fmov") + (eq_attr "memory" "store,both"))) + "athlon-direct,(athlon-fpsched+athlon-agu),(athlon-fstore+athlon-store)") +(define_insn_reservation "athlon_fst_k8" 2 + (and (eq_attr "cpu" "k8,generic64,amdfam10") + (and (eq_attr "type" "fmov") + (eq_attr "memory" "store,both"))) + "athlon-direct,(athlon-fpsched+athlon-agu),(athlon-fstore+athlon-store)") +(define_insn_reservation "athlon_fist" 4 + (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") + (eq_attr "type" "fistp,fisttp")) + "athlon-direct,(athlon-fpsched+athlon-agu),(athlon-fstore+athlon-store)") +(define_insn_reservation "athlon_fmov" 2 + (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") + (eq_attr "type" "fmov")) + "athlon-direct,athlon-fpsched,athlon-faddmul") +(define_insn_reservation "athlon_fadd_load" 4 + (and (eq_attr "cpu" "athlon") + (and (eq_attr "type" "fop") + (eq_attr "memory" "load"))) + "athlon-direct,athlon-fpload,athlon-fadd") +(define_insn_reservation "athlon_fadd_load_k8" 6 + (and (eq_attr "cpu" "k8,generic64,amdfam10") + (and (eq_attr "type" "fop") + (eq_attr "memory" "load"))) + "athlon-direct,athlon-fploadk8,athlon-fadd") +(define_insn_reservation "athlon_fadd" 4 + (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") + (eq_attr "type" "fop")) + "athlon-direct,athlon-fpsched,athlon-fadd") +(define_insn_reservation "athlon_fmul_load" 4 + (and (eq_attr "cpu" "athlon") + (and (eq_attr "type" "fmul") + (eq_attr "memory" "load"))) + "athlon-direct,athlon-fpload,athlon-fmul") +(define_insn_reservation "athlon_fmul_load_k8" 6 + (and (eq_attr "cpu" "k8,generic64,amdfam10") + (and (eq_attr "type" "fmul") + (eq_attr "memory" "load"))) + "athlon-direct,athlon-fploadk8,athlon-fmul") +(define_insn_reservation "athlon_fmul" 4 + (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") + (eq_attr "type" "fmul")) + "athlon-direct,athlon-fpsched,athlon-fmul") +(define_insn_reservation "athlon_fsgn" 2 + (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") + (eq_attr "type" "fsgn")) + "athlon-direct,athlon-fpsched,athlon-fmul") +(define_insn_reservation "athlon_fdiv_load" 24 + (and (eq_attr "cpu" "athlon") + (and (eq_attr "type" "fdiv") + (eq_attr "memory" "load"))) + "athlon-direct,athlon-fpload,athlon-fmul") +(define_insn_reservation "athlon_fdiv_load_k8" 13 + (and (eq_attr "cpu" "k8,generic64,amdfam10") + (and (eq_attr "type" "fdiv") + (eq_attr "memory" "load"))) + "athlon-direct,athlon-fploadk8,athlon-fmul") +(define_insn_reservation "athlon_fdiv" 24 + (and (eq_attr "cpu" "athlon") + (eq_attr "type" "fdiv")) + 
"athlon-direct,athlon-fpsched,athlon-fmul") +(define_insn_reservation "athlon_fdiv_k8" 11 + (and (eq_attr "cpu" "k8,generic64,amdfam10") + (eq_attr "type" "fdiv")) + "athlon-direct,athlon-fpsched,athlon-fmul") +(define_insn_reservation "athlon_fpspc_load" 103 + (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") + (and (eq_attr "type" "fpspc") + (eq_attr "memory" "load"))) + "athlon-vector,athlon-fpload,athlon-fvector") +(define_insn_reservation "athlon_fpspc" 100 + (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") + (eq_attr "type" "fpspc")) + "athlon-vector,athlon-fpsched,athlon-fvector") +(define_insn_reservation "athlon_fcmov_load" 7 + (and (eq_attr "cpu" "athlon") + (and (eq_attr "type" "fcmov") + (eq_attr "memory" "load"))) + "athlon-vector,athlon-fpload,athlon-fvector") +(define_insn_reservation "athlon_fcmov" 7 + (and (eq_attr "cpu" "athlon") + (eq_attr "type" "fcmov")) + "athlon-vector,athlon-fpsched,athlon-fvector") +(define_insn_reservation "athlon_fcmov_load_k8" 17 + (and (eq_attr "cpu" "k8,generic64,amdfam10") + (and (eq_attr "type" "fcmov") + (eq_attr "memory" "load"))) + "athlon-vector,athlon-fploadk8,athlon-fvector") +(define_insn_reservation "athlon_fcmov_k8" 15 + (and (eq_attr "cpu" "k8,generic64,amdfam10") + (eq_attr "type" "fcmov")) + "athlon-vector,athlon-fpsched,athlon-fvector") +;; fcomi is vector decoded by uses only one pipe. +(define_insn_reservation "athlon_fcomi_load" 3 + (and (eq_attr "cpu" "athlon") + (and (eq_attr "type" "fcmp") + (and (eq_attr "athlon_decode" "vector") + (eq_attr "memory" "load")))) + "athlon-vector,athlon-fpload,athlon-fadd") +(define_insn_reservation "athlon_fcomi_load_k8" 5 + (and (eq_attr "cpu" "k8,generic64,amdfam10") + (and (eq_attr "type" "fcmp") + (and (eq_attr "athlon_decode" "vector") + (eq_attr "memory" "load")))) + "athlon-vector,athlon-fploadk8,athlon-fadd") +(define_insn_reservation "athlon_fcomi" 3 + (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") + (and (eq_attr "athlon_decode" "vector") + (eq_attr "type" "fcmp"))) + "athlon-vector,athlon-fpsched,athlon-fadd") +(define_insn_reservation "athlon_fcom_load" 2 + (and (eq_attr "cpu" "athlon") + (and (eq_attr "type" "fcmp") + (eq_attr "memory" "load"))) + "athlon-direct,athlon-fpload,athlon-fadd") +(define_insn_reservation "athlon_fcom_load_k8" 4 + (and (eq_attr "cpu" "k8,generic64,amdfam10") + (and (eq_attr "type" "fcmp") + (eq_attr "memory" "load"))) + "athlon-direct,athlon-fploadk8,athlon-fadd") +(define_insn_reservation "athlon_fcom" 2 + (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") + (eq_attr "type" "fcmp")) + "athlon-direct,athlon-fpsched,athlon-fadd") +;; Never seen by the scheduler because we still don't do post reg-stack +;; scheduling. 
+;(define_insn_reservation "athlon_fxch" 2 +; (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") +; (eq_attr "type" "fxch")) +; "athlon-direct,athlon-fpsched,athlon-fany") + +;; Athlon handle MMX operations in the FPU unit with shorter latencies + +(define_insn_reservation "athlon_movlpd_load" 0 + (and (eq_attr "cpu" "athlon") + (and (eq_attr "type" "ssemov") + (match_operand:DF 1 "memory_operand" ""))) + "athlon-direct,athlon-fpload,athlon-fany") +(define_insn_reservation "athlon_movlpd_load_k8" 2 + (and (eq_attr "cpu" "k8") + (and (eq_attr "type" "ssemov") + (match_operand:DF 1 "memory_operand" ""))) + "athlon-direct,athlon-fploadk8,athlon-fstore") +(define_insn_reservation "athlon_movsd_load_generic64" 2 + (and (eq_attr "cpu" "generic64") + (and (eq_attr "type" "ssemov") + (match_operand:DF 1 "memory_operand" ""))) + "athlon-double,athlon-fploadk8,(athlon-fstore+athlon-fmul)") +(define_insn_reservation "athlon_movaps_load_k8" 2 + (and (eq_attr "cpu" "k8,generic64") + (and (eq_attr "type" "ssemov") + (and (eq_attr "mode" "V4SF,V2DF,TI") + (eq_attr "memory" "load")))) + "athlon-double,athlon-fpload2k8,athlon-fstore,athlon-fstore") +(define_insn_reservation "athlon_movaps_load" 0 + (and (eq_attr "cpu" "athlon") + (and (eq_attr "type" "ssemov") + (and (eq_attr "mode" "V4SF,V2DF,TI") + (eq_attr "memory" "load")))) + "athlon-vector,athlon-fpload2,(athlon-fany+athlon-fany)") +(define_insn_reservation "athlon_movss_load" 1 + (and (eq_attr "cpu" "athlon") + (and (eq_attr "type" "ssemov") + (and (eq_attr "mode" "SF,DI") + (eq_attr "memory" "load")))) + "athlon-vector,athlon-fpload,(athlon-fany*2)") +(define_insn_reservation "athlon_movss_load_k8" 1 + (and (eq_attr "cpu" "k8,generic64") + (and (eq_attr "type" "ssemov") + (and (eq_attr "mode" "SF,DI") + (eq_attr "memory" "load")))) + "athlon-double,athlon-fploadk8,(athlon-fstore+athlon-fany)") +(define_insn_reservation "athlon_mmxsseld" 0 + (and (eq_attr "cpu" "athlon") + (and (eq_attr "type" "mmxmov,ssemov") + (eq_attr "memory" "load"))) + "athlon-direct,athlon-fpload,athlon-fany") +(define_insn_reservation "athlon_mmxsseld_k8" 2 + (and (eq_attr "cpu" "k8,generic64") + (and (eq_attr "type" "mmxmov,ssemov") + (eq_attr "memory" "load"))) + "athlon-direct,athlon-fploadk8,athlon-fstore") +;; On AMDFAM10 all double, single and integer packed and scalar SSEx data +;; loads generated are direct path, latency of 2 and do not use any FP +;; executions units. No separate entries for movlpx/movhpx loads, which +;; are direct path, latency of 4 and use the FADD/FMUL FP execution units, +;; as they will not be generated. 
+(define_insn_reservation "athlon_sseld_amdfam10" 2 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "ssemov") + (eq_attr "memory" "load"))) + "athlon-direct,athlon-fploadk8") +;; On AMDFAM10 MMX data loads generated are direct path, latency of 4 +;; and can use any FP executions units +(define_insn_reservation "athlon_mmxld_amdfam10" 4 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "mmxmov") + (eq_attr "memory" "load"))) + "athlon-direct,athlon-fploadk8, athlon-fany") +(define_insn_reservation "athlon_mmxssest" 3 + (and (eq_attr "cpu" "k8,generic64") + (and (eq_attr "type" "mmxmov,ssemov") + (and (eq_attr "mode" "V4SF,V2DF,TI") + (eq_attr "memory" "store,both")))) + "athlon-vector,(athlon-fpsched+athlon-agu),((athlon-fstore+athlon-store2)*2)") +(define_insn_reservation "athlon_mmxssest_k8" 3 + (and (eq_attr "cpu" "k8,generic64") + (and (eq_attr "type" "mmxmov,ssemov") + (and (eq_attr "mode" "V4SF,V2DF,TI") + (eq_attr "memory" "store,both")))) + "athlon-double,(athlon-fpsched+athlon-agu),((athlon-fstore+athlon-store2)*2)") +(define_insn_reservation "athlon_mmxssest_short" 2 + (and (eq_attr "cpu" "athlon,k8,generic64") + (and (eq_attr "type" "mmxmov,ssemov") + (eq_attr "memory" "store,both"))) + "athlon-direct,(athlon-fpsched+athlon-agu),(athlon-fstore+athlon-store)") +;; On AMDFAM10 all double, single and integer packed SSEx data stores +;; generated are all double path, latency of 2 and use the FSTORE FP +;; execution unit. No entries separate for movupx/movdqu, which are +;; vector path, latency of 3 and use the FSTORE*2 FP execution unit, +;; as they will not be generated. +(define_insn_reservation "athlon_ssest_amdfam10" 2 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "ssemov") + (and (eq_attr "mode" "V4SF,V2DF,TI") + (eq_attr "memory" "store,both")))) + "athlon-double,(athlon-fpsched+athlon-agu),((athlon-fstore+athlon-store)*2)") +;; On AMDFAM10 all double, single and integer scalar SSEx and MMX +;; data stores generated are all direct path, latency of 2 and use +;; the FSTORE FP execution unit +(define_insn_reservation "athlon_mmxssest_short_amdfam10" 2 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "mmxmov,ssemov") + (eq_attr "memory" "store,both"))) + "athlon-direct,(athlon-fpsched+athlon-agu),(athlon-fstore+athlon-store)") +(define_insn_reservation "athlon_movaps_k8" 2 + (and (eq_attr "cpu" "k8,generic64") + (and (eq_attr "type" "ssemov") + (eq_attr "mode" "V4SF,V2DF,TI"))) + "athlon-double,athlon-fpsched,((athlon-faddmul+athlon-faddmul) | (athlon-faddmul, athlon-faddmul))") +(define_insn_reservation "athlon_movaps" 2 + (and (eq_attr "cpu" "athlon") + (and (eq_attr "type" "ssemov") + (eq_attr "mode" "V4SF,V2DF,TI"))) + "athlon-vector,athlon-fpsched,(athlon-faddmul+athlon-faddmul)") +(define_insn_reservation "athlon_mmxssemov" 2 + (and (eq_attr "cpu" "athlon,k8,generic64") + (eq_attr "type" "mmxmov,ssemov")) + "athlon-direct,athlon-fpsched,athlon-faddmul") +(define_insn_reservation "athlon_mmxmul_load" 4 + (and (eq_attr "cpu" "athlon,k8,generic64") + (and (eq_attr "type" "mmxmul") + (eq_attr "memory" "load"))) + "athlon-direct,athlon-fpload,athlon-fmul") +(define_insn_reservation "athlon_mmxmul" 3 + (and (eq_attr "cpu" "athlon,k8,generic64") + (eq_attr "type" "mmxmul")) + "athlon-direct,athlon-fpsched,athlon-fmul") +(define_insn_reservation "athlon_mmx_load" 3 + (and (eq_attr "cpu" "athlon,k8,generic64") + (and (eq_attr "unit" "mmx") + (eq_attr "memory" "load"))) + "athlon-direct,athlon-fpload,athlon-faddmul") +(define_insn_reservation 
"athlon_mmx" 2 + (and (eq_attr "cpu" "athlon,k8,generic64") + (eq_attr "unit" "mmx")) + "athlon-direct,athlon-fpsched,athlon-faddmul") +;; SSE operations are handled by the i387 unit as well. The latency +;; is same as for i387 operations for scalar operations + +(define_insn_reservation "athlon_sselog_load" 3 + (and (eq_attr "cpu" "athlon") + (and (eq_attr "type" "sselog,sselog1") + (eq_attr "memory" "load"))) + "athlon-vector,athlon-fpload2,(athlon-fmul*2)") +(define_insn_reservation "athlon_sselog_load_k8" 5 + (and (eq_attr "cpu" "k8,generic64") + (and (eq_attr "type" "sselog,sselog1") + (eq_attr "memory" "load"))) + "athlon-double,athlon-fpload2k8,(athlon-fmul*2)") +(define_insn_reservation "athlon_sselog_load_amdfam10" 4 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "sselog,sselog1") + (eq_attr "memory" "load"))) + "athlon-direct,athlon-fploadk8,(athlon-fadd|athlon-fmul)") +(define_insn_reservation "athlon_sselog" 3 + (and (eq_attr "cpu" "athlon") + (eq_attr "type" "sselog,sselog1")) + "athlon-vector,athlon-fpsched,athlon-fmul*2") +(define_insn_reservation "athlon_sselog_k8" 3 + (and (eq_attr "cpu" "k8,generic64") + (eq_attr "type" "sselog,sselog1")) + "athlon-double,athlon-fpsched,athlon-fmul") +(define_insn_reservation "athlon_sselog_amdfam10" 2 + (and (eq_attr "cpu" "amdfam10") + (eq_attr "type" "sselog,sselog1")) + "athlon-direct,athlon-fpsched,(athlon-fadd|athlon-fmul)") + +;; ??? pcmp executes in addmul, probably not worthwhile to bother about that. +(define_insn_reservation "athlon_ssecmp_load" 2 + (and (eq_attr "cpu" "athlon") + (and (eq_attr "type" "ssecmp") + (and (eq_attr "mode" "SF,DF,DI") + (eq_attr "memory" "load")))) + "athlon-direct,athlon-fpload,athlon-fadd") +(define_insn_reservation "athlon_ssecmp_load_k8" 4 + (and (eq_attr "cpu" "k8,generic64,amdfam10") + (and (eq_attr "type" "ssecmp") + (and (eq_attr "mode" "SF,DF,DI,TI") + (eq_attr "memory" "load")))) + "athlon-direct,athlon-fploadk8,athlon-fadd") +(define_insn_reservation "athlon_ssecmp" 2 + (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") + (and (eq_attr "type" "ssecmp") + (eq_attr "mode" "SF,DF,DI,TI"))) + "athlon-direct,athlon-fpsched,athlon-fadd") +(define_insn_reservation "athlon_ssecmpvector_load" 3 + (and (eq_attr "cpu" "athlon") + (and (eq_attr "type" "ssecmp") + (eq_attr "memory" "load"))) + "athlon-vector,athlon-fpload2,(athlon-fadd*2)") +(define_insn_reservation "athlon_ssecmpvector_load_k8" 5 + (and (eq_attr "cpu" "k8,generic64") + (and (eq_attr "type" "ssecmp") + (eq_attr "memory" "load"))) + "athlon-double,athlon-fpload2k8,(athlon-fadd*2)") +(define_insn_reservation "athlon_ssecmpvector_load_amdfam10" 4 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "ssecmp") + (eq_attr "memory" "load"))) + "athlon-direct,athlon-fploadk8,athlon-fadd") +(define_insn_reservation "athlon_ssecmpvector" 3 + (and (eq_attr "cpu" "athlon") + (eq_attr "type" "ssecmp")) + "athlon-vector,athlon-fpsched,(athlon-fadd*2)") +(define_insn_reservation "athlon_ssecmpvector_k8" 3 + (and (eq_attr "cpu" "k8,generic64") + (eq_attr "type" "ssecmp")) + "athlon-double,athlon-fpsched,(athlon-fadd*2)") +(define_insn_reservation "athlon_ssecmpvector_amdfam10" 2 + (and (eq_attr "cpu" "amdfam10") + (eq_attr "type" "ssecmp")) + "athlon-direct,athlon-fpsched,athlon-fadd") +(define_insn_reservation "athlon_ssecomi_load" 4 + (and (eq_attr "cpu" "athlon") + (and (eq_attr "type" "ssecomi") + (eq_attr "memory" "load"))) + "athlon-vector,athlon-fpload,athlon-fadd") +(define_insn_reservation "athlon_ssecomi_load_k8" 6 + (and 
(eq_attr "cpu" "k8,generic64") + (and (eq_attr "type" "ssecomi") + (eq_attr "memory" "load"))) + "athlon-vector,athlon-fploadk8,athlon-fadd") +(define_insn_reservation "athlon_ssecomi_load_amdfam10" 5 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "ssecomi") + (eq_attr "memory" "load"))) + "athlon-direct,athlon-fploadk8,athlon-fadd") +(define_insn_reservation "athlon_ssecomi" 4 + (and (eq_attr "cpu" "athlon,k8,generic64") + (eq_attr "type" "ssecomi")) + "athlon-vector,athlon-fpsched,athlon-fadd") +(define_insn_reservation "athlon_ssecomi_amdfam10" 3 + (and (eq_attr "cpu" "amdfam10") +;; It seems athlon_ssecomi has a bug in the attr_type, fixed for amdfam10 + (eq_attr "type" "ssecomi")) + "athlon-direct,athlon-fpsched,athlon-fadd") +(define_insn_reservation "athlon_sseadd_load" 4 + (and (eq_attr "cpu" "athlon") + (and (eq_attr "type" "sseadd") + (and (eq_attr "mode" "SF,DF,DI") + (eq_attr "memory" "load")))) + "athlon-direct,athlon-fpload,athlon-fadd") +(define_insn_reservation "athlon_sseadd_load_k8" 6 + (and (eq_attr "cpu" "k8,generic64,amdfam10") + (and (eq_attr "type" "sseadd") + (and (eq_attr "mode" "SF,DF,DI") + (eq_attr "memory" "load")))) + "athlon-direct,athlon-fploadk8,athlon-fadd") +(define_insn_reservation "athlon_sseadd" 4 + (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") + (and (eq_attr "type" "sseadd") + (eq_attr "mode" "SF,DF,DI"))) + "athlon-direct,athlon-fpsched,athlon-fadd") +(define_insn_reservation "athlon_sseaddvector_load" 5 + (and (eq_attr "cpu" "athlon") + (and (eq_attr "type" "sseadd") + (eq_attr "memory" "load"))) + "athlon-vector,athlon-fpload2,(athlon-fadd*2)") +(define_insn_reservation "athlon_sseaddvector_load_k8" 7 + (and (eq_attr "cpu" "k8,generic64") + (and (eq_attr "type" "sseadd") + (eq_attr "memory" "load"))) + "athlon-double,athlon-fpload2k8,(athlon-fadd*2)") +(define_insn_reservation "athlon_sseaddvector_load_amdfam10" 6 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "sseadd") + (eq_attr "memory" "load"))) + "athlon-direct,athlon-fploadk8,athlon-fadd") +(define_insn_reservation "athlon_sseaddvector" 5 + (and (eq_attr "cpu" "athlon") + (eq_attr "type" "sseadd")) + "athlon-vector,athlon-fpsched,(athlon-fadd*2)") +(define_insn_reservation "athlon_sseaddvector_k8" 5 + (and (eq_attr "cpu" "k8,generic64") + (eq_attr "type" "sseadd")) + "athlon-double,athlon-fpsched,(athlon-fadd*2)") +(define_insn_reservation "athlon_sseaddvector_amdfam10" 4 + (and (eq_attr "cpu" "amdfam10") + (eq_attr "type" "sseadd")) + "athlon-direct,athlon-fpsched,athlon-fadd") + +;; Conversions behaves very irregularly and the scheduling is critical here. +;; Take each instruction separately. Assume that the mode is always set to the +;; destination one and athlon_decode is set to the K8 versions. 
+ +;; cvtss2sd +(define_insn_reservation "athlon_ssecvt_cvtss2sd_load_k8" 4 + (and (eq_attr "cpu" "k8,athlon,generic64") + (and (eq_attr "type" "ssecvt") + (and (eq_attr "athlon_decode" "direct") + (and (eq_attr "mode" "DF") + (eq_attr "memory" "load"))))) + "athlon-direct,athlon-fploadk8,athlon-fstore") +(define_insn_reservation "athlon_ssecvt_cvtss2sd_load_amdfam10" 7 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "ssecvt") + (and (eq_attr "amdfam10_decode" "double") + (and (eq_attr "mode" "DF") + (eq_attr "memory" "load"))))) + "athlon-double,athlon-fploadk8,(athlon-faddmul+athlon-fstore)") +(define_insn_reservation "athlon_ssecvt_cvtss2sd" 2 + (and (eq_attr "cpu" "athlon,k8,generic64") + (and (eq_attr "type" "ssecvt") + (and (eq_attr "athlon_decode" "direct") + (eq_attr "mode" "DF")))) + "athlon-direct,athlon-fpsched,athlon-fstore") +(define_insn_reservation "athlon_ssecvt_cvtss2sd_amdfam10" 7 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "ssecvt") + (and (eq_attr "amdfam10_decode" "vector") + (eq_attr "mode" "DF")))) + "athlon-vector,athlon-fpsched,athlon-faddmul,(athlon-fstore*2)") +;; cvtps2pd. Model same way the other double decoded FP conversions. +(define_insn_reservation "athlon_ssecvt_cvtps2pd_load_k8" 5 + (and (eq_attr "cpu" "k8,athlon,generic64") + (and (eq_attr "type" "ssecvt") + (and (eq_attr "athlon_decode" "double") + (and (eq_attr "mode" "V2DF,V4SF,TI") + (eq_attr "memory" "load"))))) + "athlon-double,athlon-fpload2k8,(athlon-fstore*2)") +(define_insn_reservation "athlon_ssecvt_cvtps2pd_load_amdfam10" 4 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "ssecvt") + (and (eq_attr "amdfam10_decode" "direct") + (and (eq_attr "mode" "V2DF,V4SF,TI") + (eq_attr "memory" "load"))))) + "athlon-direct,athlon-fploadk8,athlon-fstore") +(define_insn_reservation "athlon_ssecvt_cvtps2pd_k8" 3 + (and (eq_attr "cpu" "k8,athlon,generic64") + (and (eq_attr "type" "ssecvt") + (and (eq_attr "athlon_decode" "double") + (eq_attr "mode" "V2DF,V4SF,TI")))) + "athlon-double,athlon-fpsched,athlon-fstore,athlon-fstore") +(define_insn_reservation "athlon_ssecvt_cvtps2pd_amdfam10" 2 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "ssecvt") + (and (eq_attr "amdfam10_decode" "direct") + (eq_attr "mode" "V2DF,V4SF,TI")))) + "athlon-direct,athlon-fpsched,athlon-fstore") +;; cvtsi2sd mem,reg is directpath path (cvtsi2sd reg,reg is doublepath) +;; cvtsi2sd has troughput 1 and is executed in store unit with latency of 6 +(define_insn_reservation "athlon_sseicvt_cvtsi2sd_load" 6 + (and (eq_attr "cpu" "athlon,k8") + (and (eq_attr "type" "sseicvt") + (and (eq_attr "athlon_decode" "direct") + (and (eq_attr "mode" "SF,DF") + (eq_attr "memory" "load"))))) + "athlon-direct,athlon-fploadk8,athlon-fstore") +(define_insn_reservation "athlon_sseicvt_cvtsi2sd_load_amdfam10" 9 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "sseicvt") + (and (eq_attr "amdfam10_decode" "double") + (and (eq_attr "mode" "SF,DF") + (eq_attr "memory" "load"))))) + "athlon-double,athlon-fploadk8,(athlon-faddmul+athlon-fstore)") +;; cvtsi2ss mem, reg is doublepath +(define_insn_reservation "athlon_sseicvt_cvtsi2ss_load" 9 + (and (eq_attr "cpu" "athlon") + (and (eq_attr "type" "sseicvt") + (and (eq_attr "athlon_decode" "double") + (and (eq_attr "mode" "SF,DF") + (eq_attr "memory" "load"))))) + "athlon-vector,athlon-fpload,(athlon-fstore*2)") +(define_insn_reservation "athlon_sseicvt_cvtsi2ss_load_k8" 9 + (and (eq_attr "cpu" "k8,generic64") + (and (eq_attr "type" "sseicvt") + (and (eq_attr 
"athlon_decode" "double") + (and (eq_attr "mode" "SF,DF") + (eq_attr "memory" "load"))))) + "athlon-double,athlon-fploadk8,(athlon-fstore*2)") +(define_insn_reservation "athlon_sseicvt_cvtsi2ss_load_amdfam10" 9 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "sseicvt") + (and (eq_attr "amdfam10_decode" "double") + (and (eq_attr "mode" "SF,DF") + (eq_attr "memory" "load"))))) + "athlon-double,athlon-fploadk8,(athlon-faddmul+athlon-fstore)") +;; cvtsi2sd reg,reg is double decoded (vector on Athlon) +(define_insn_reservation "athlon_sseicvt_cvtsi2sd_k8" 11 + (and (eq_attr "cpu" "k8,athlon,generic64") + (and (eq_attr "type" "sseicvt") + (and (eq_attr "athlon_decode" "double") + (and (eq_attr "mode" "SF,DF") + (eq_attr "memory" "none"))))) + "athlon-double,athlon-fploadk8,athlon-fstore") +(define_insn_reservation "athlon_sseicvt_cvtsi2sd_amdfam10" 14 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "sseicvt") + (and (eq_attr "amdfam10_decode" "vector") + (and (eq_attr "mode" "SF,DF") + (eq_attr "memory" "none"))))) + "athlon-vector,athlon-fploadk8,(athlon-faddmul+athlon-fstore)") +;; cvtsi2ss reg, reg is doublepath +(define_insn_reservation "athlon_sseicvt_cvtsi2ss" 14 + (and (eq_attr "cpu" "athlon,k8,generic64") + (and (eq_attr "type" "sseicvt") + (and (eq_attr "athlon_decode" "vector") + (and (eq_attr "mode" "SF,DF") + (eq_attr "memory" "none"))))) + "athlon-vector,athlon-fploadk8,(athlon-fvector*2)") +(define_insn_reservation "athlon_sseicvt_cvtsi2ss_amdfam10" 14 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "sseicvt") + (and (eq_attr "amdfam10_decode" "vector") + (and (eq_attr "mode" "SF,DF") + (eq_attr "memory" "none"))))) + "athlon-vector,athlon-fploadk8,(athlon-faddmul+athlon-fstore)") +;; cvtsd2ss mem,reg is doublepath, troughput unknown, latency 9 +(define_insn_reservation "athlon_ssecvt_cvtsd2ss_load_k8" 9 + (and (eq_attr "cpu" "k8,athlon,generic64") + (and (eq_attr "type" "ssecvt") + (and (eq_attr "athlon_decode" "double") + (and (eq_attr "mode" "SF") + (eq_attr "memory" "load"))))) + "athlon-double,athlon-fploadk8,(athlon-fstore*3)") +(define_insn_reservation "athlon_ssecvt_cvtsd2ss_load_amdfam10" 9 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "ssecvt") + (and (eq_attr "amdfam10_decode" "double") + (and (eq_attr "mode" "SF") + (eq_attr "memory" "load"))))) + "athlon-double,athlon-fploadk8,(athlon-faddmul+athlon-fstore)") +;; cvtsd2ss reg,reg is vectorpath, troughput unknown, latency 12 +(define_insn_reservation "athlon_ssecvt_cvtsd2ss" 12 + (and (eq_attr "cpu" "athlon,k8,generic64") + (and (eq_attr "type" "ssecvt") + (and (eq_attr "athlon_decode" "vector") + (and (eq_attr "mode" "SF") + (eq_attr "memory" "none"))))) + "athlon-vector,athlon-fpsched,(athlon-fvector*3)") +(define_insn_reservation "athlon_ssecvt_cvtsd2ss_amdfam10" 8 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "ssecvt") + (and (eq_attr "amdfam10_decode" "vector") + (and (eq_attr "mode" "SF") + (eq_attr "memory" "none"))))) + "athlon-vector,athlon-fpsched,athlon-faddmul,(athlon-fstore*2)") +(define_insn_reservation "athlon_ssecvt_cvtpd2ps_load_k8" 8 + (and (eq_attr "cpu" "athlon,k8,generic64") + (and (eq_attr "type" "ssecvt") + (and (eq_attr "athlon_decode" "vector") + (and (eq_attr "mode" "V4SF,V2DF,TI") + (eq_attr "memory" "load"))))) + "athlon-double,athlon-fpload2k8,(athlon-fstore*3)") +(define_insn_reservation "athlon_ssecvt_cvtpd2ps_load_amdfam10" 9 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "ssecvt") + (and (eq_attr "amdfam10_decode" "double") + 
(and (eq_attr "mode" "V4SF,V2DF,TI") + (eq_attr "memory" "load"))))) + "athlon-double,athlon-fploadk8,(athlon-faddmul+athlon-fstore)") +;; cvtpd2ps mem,reg is vectorpath, troughput unknown, latency 10 +;; ??? Why it is fater than cvtsd2ss? +(define_insn_reservation "athlon_ssecvt_cvtpd2ps" 8 + (and (eq_attr "cpu" "athlon,k8,generic64") + (and (eq_attr "type" "ssecvt") + (and (eq_attr "athlon_decode" "vector") + (and (eq_attr "mode" "V4SF,V2DF,TI") + (eq_attr "memory" "none"))))) + "athlon-vector,athlon-fpsched,athlon-fvector*2") +(define_insn_reservation "athlon_ssecvt_cvtpd2ps_amdfam10" 7 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "ssecvt") + (and (eq_attr "amdfam10_decode" "double") + (and (eq_attr "mode" "V4SF,V2DF,TI") + (eq_attr "memory" "none"))))) + "athlon-double,athlon-fpsched,(athlon-faddmul+athlon-fstore)") +;; cvtsd2si mem,reg is doublepath, troughput 1, latency 9 +(define_insn_reservation "athlon_secvt_cvtsX2si_load" 9 + (and (eq_attr "cpu" "athlon,k8,generic64") + (and (eq_attr "type" "sseicvt") + (and (eq_attr "athlon_decode" "vector") + (and (eq_attr "mode" "SI,DI") + (eq_attr "memory" "load"))))) + "athlon-vector,athlon-fploadk8,athlon-fvector") +(define_insn_reservation "athlon_secvt_cvtsX2si_load_amdfam10" 10 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "sseicvt") + (and (eq_attr "amdfam10_decode" "double") + (and (eq_attr "mode" "SI,DI") + (eq_attr "memory" "load"))))) + "athlon-double,athlon-fploadk8,(athlon-fadd+athlon-fstore)") +;; cvtsd2si reg,reg is doublepath, troughput 1, latency 9 +(define_insn_reservation "athlon_ssecvt_cvtsX2si" 9 + (and (eq_attr "cpu" "athlon") + (and (eq_attr "type" "sseicvt") + (and (eq_attr "athlon_decode" "double") + (and (eq_attr "mode" "SI,DI") + (eq_attr "memory" "none"))))) + "athlon-vector,athlon-fpsched,athlon-fvector") +(define_insn_reservation "athlon_ssecvt_cvtsX2si_k8" 9 + (and (eq_attr "cpu" "k8,generic64") + (and (eq_attr "type" "sseicvt") + (and (eq_attr "athlon_decode" "double") + (and (eq_attr "mode" "SI,DI") + (eq_attr "memory" "none"))))) + "athlon-double,athlon-fpsched,athlon-fstore") +(define_insn_reservation "athlon_ssecvt_cvtsX2si_amdfam10" 8 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "sseicvt") + (and (eq_attr "amdfam10_decode" "double") + (and (eq_attr "mode" "SI,DI") + (eq_attr "memory" "none"))))) + "athlon-double,athlon-fpsched,(athlon-fadd+athlon-fstore)") +;; cvtpd2dq reg,mem is doublepath, troughput 1, latency 9 on amdfam10 +(define_insn_reservation "athlon_sseicvt_cvtpd2dq_load_amdfam10" 9 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "sseicvt") + (and (eq_attr "amdfam10_decode" "double") + (and (eq_attr "mode" "TI") + (eq_attr "memory" "load"))))) + "athlon-double,athlon-fploadk8,(athlon-faddmul+athlon-fstore)") +;; cvtpd2dq reg,mem is doublepath, troughput 1, latency 7 on amdfam10 +(define_insn_reservation "athlon_sseicvt_cvtpd2dq_amdfam10" 7 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "sseicvt") + (and (eq_attr "amdfam10_decode" "double") + (and (eq_attr "mode" "TI") + (eq_attr "memory" "none"))))) + "athlon-double,athlon-fpsched,(athlon-faddmul+athlon-fstore)") + + +(define_insn_reservation "athlon_ssemul_load" 4 + (and (eq_attr "cpu" "athlon") + (and (eq_attr "type" "ssemul") + (and (eq_attr "mode" "SF,DF") + (eq_attr "memory" "load")))) + "athlon-direct,athlon-fpload,athlon-fmul") +(define_insn_reservation "athlon_ssemul_load_k8" 6 + (and (eq_attr "cpu" "k8,generic64,amdfam10") + (and (eq_attr "type" "ssemul") + (and (eq_attr "mode" 
"SF,DF") + (eq_attr "memory" "load")))) + "athlon-direct,athlon-fploadk8,athlon-fmul") +(define_insn_reservation "athlon_ssemul" 4 + (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") + (and (eq_attr "type" "ssemul") + (eq_attr "mode" "SF,DF"))) + "athlon-direct,athlon-fpsched,athlon-fmul") +(define_insn_reservation "athlon_ssemulvector_load" 5 + (and (eq_attr "cpu" "athlon") + (and (eq_attr "type" "ssemul") + (eq_attr "memory" "load"))) + "athlon-vector,athlon-fpload2,(athlon-fmul*2)") +(define_insn_reservation "athlon_ssemulvector_load_k8" 7 + (and (eq_attr "cpu" "k8,generic64") + (and (eq_attr "type" "ssemul") + (eq_attr "memory" "load"))) + "athlon-double,athlon-fpload2k8,(athlon-fmul*2)") +(define_insn_reservation "athlon_ssemulvector_load_amdfam10" 6 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "ssemul") + (eq_attr "memory" "load"))) + "athlon-direct,athlon-fploadk8,athlon-fmul") +(define_insn_reservation "athlon_ssemulvector" 5 + (and (eq_attr "cpu" "athlon") + (eq_attr "type" "ssemul")) + "athlon-vector,athlon-fpsched,(athlon-fmul*2)") +(define_insn_reservation "athlon_ssemulvector_k8" 5 + (and (eq_attr "cpu" "k8,generic64") + (eq_attr "type" "ssemul")) + "athlon-double,athlon-fpsched,(athlon-fmul*2)") +(define_insn_reservation "athlon_ssemulvector_amdfam10" 4 + (and (eq_attr "cpu" "amdfam10") + (eq_attr "type" "ssemul")) + "athlon-direct,athlon-fpsched,athlon-fmul") +;; divsd timings. divss is faster +(define_insn_reservation "athlon_ssediv_load" 20 + (and (eq_attr "cpu" "athlon") + (and (eq_attr "type" "ssediv") + (and (eq_attr "mode" "SF,DF") + (eq_attr "memory" "load")))) + "athlon-direct,athlon-fpload,athlon-fmul*17") +(define_insn_reservation "athlon_ssediv_load_k8" 22 + (and (eq_attr "cpu" "k8,generic64,amdfam10") + (and (eq_attr "type" "ssediv") + (and (eq_attr "mode" "SF,DF") + (eq_attr "memory" "load")))) + "athlon-direct,athlon-fploadk8,athlon-fmul*17") +(define_insn_reservation "athlon_ssediv" 20 + (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") + (and (eq_attr "type" "ssediv") + (eq_attr "mode" "SF,DF"))) + "athlon-direct,athlon-fpsched,athlon-fmul*17") +(define_insn_reservation "athlon_ssedivvector_load" 39 + (and (eq_attr "cpu" "athlon") + (and (eq_attr "type" "ssediv") + (eq_attr "memory" "load"))) + "athlon-vector,athlon-fpload2,athlon-fmul*34") +(define_insn_reservation "athlon_ssedivvector_load_k8" 35 + (and (eq_attr "cpu" "k8,generic64") + (and (eq_attr "type" "ssediv") + (eq_attr "memory" "load"))) + "athlon-double,athlon-fpload2k8,athlon-fmul*34") +(define_insn_reservation "athlon_ssedivvector_load_amdfam10" 22 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "ssediv") + (eq_attr "memory" "load"))) + "athlon-direct,athlon-fploadk8,athlon-fmul*17") +(define_insn_reservation "athlon_ssedivvector" 39 + (and (eq_attr "cpu" "athlon") + (eq_attr "type" "ssediv")) + "athlon-vector,athlon-fmul*34") +(define_insn_reservation "athlon_ssedivvector_k8" 39 + (and (eq_attr "cpu" "k8,generic64") + (eq_attr "type" "ssediv")) + "athlon-double,athlon-fmul*34") +(define_insn_reservation "athlon_ssedivvector_amdfam10" 20 + (and (eq_attr "cpu" "amdfam10") + (eq_attr "type" "ssediv")) + "athlon-direct,athlon-fmul*17") +(define_insn_reservation "athlon_sseins_amdfam10" 5 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "sseins") + (eq_attr "mode" "TI"))) + "athlon-vector,athlon-fpsched,athlon-faddmul") diff --git a/gcc/config/i386/atom.md b/gcc/config/i386/atom.md new file mode 100644 index 000000000..3c2b95758 --- /dev/null +++ 
b/gcc/config/i386/atom.md @@ -0,0 +1,796 @@ +;; Atom Scheduling +;; Copyright (C) 2009, 2010 Free Software Foundation, Inc. +;; +;; This file is part of GCC. +;; +;; GCC is free software; you can redistribute it and/or modify +;; it under the terms of the GNU General Public License as published by +;; the Free Software Foundation; either version 3, or (at your option) +;; any later version. +;; +;; GCC is distributed in the hope that it will be useful, +;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;; GNU General Public License for more details. +;; +;; You should have received a copy of the GNU General Public License +;; along with GCC; see the file COPYING3. If not see +;; . +;; +;; Atom is an in-order core with two integer pipelines. + + +(define_attr "atom_unit" "sishuf,simul,jeu,complex,other" + (const_string "other")) + +(define_attr "atom_sse_attr" "rcp,movdup,lfence,fence,prefetch,sqrt,mxcsr,other" + (const_string "other")) + +(define_automaton "atom") + +;; Atom has two ports: port 0 and port 1 connecting to all execution units +(define_cpu_unit "atom-port-0,atom-port-1" "atom") + +;; EU: Execution Unit +;; Atom EUs are connected by port 0 or port 1. + +(define_cpu_unit "atom-eu-0, atom-eu-1, + atom-imul-1, atom-imul-2, atom-imul-3, atom-imul-4" + "atom") + +;; Some EUs have duplicated copied and can be accessed via either +;; port 0 or port 1 +;; (define_reservation "atom-port-either" "(atom-port-0 | atom-port-1)") + +;;; Some instructions is dual-pipe execution, need both ports +;;; Complex multi-op macro-instructoins need both ports and all EUs +(define_reservation "atom-port-dual" "(atom-port-0 + atom-port-1)") +(define_reservation "atom-all-eu" "(atom-eu-0 + atom-eu-1 + + atom-imul-1 + atom-imul-2 + atom-imul-3 + + atom-imul-4)") + +;;; Most of simple instructions have 1 cycle latency. Some of them +;;; issue in port 0, some in port 0 and some in either port. +(define_reservation "atom-simple-0" "(atom-port-0 + atom-eu-0)") +(define_reservation "atom-simple-1" "(atom-port-1 + atom-eu-1)") +(define_reservation "atom-simple-either" "(atom-simple-0 | atom-simple-1)") + +;;; Some insn issues in port 0 with 3 cycle latency and 1 cycle tput +(define_reservation "atom-eu-0-3-1" "(atom-port-0 + atom-eu-0, nothing*2)") + +;;; fmul insn can have 4 or 5 cycles latency +(define_reservation "atom-fmul-5c" "(atom-port-0 + atom-eu-0), nothing*4") +(define_reservation "atom-fmul-4c" "(atom-port-0 + atom-eu-0), nothing*3") + +;;; fadd can has 5 cycles latency depends on instruction forms +(define_reservation "atom-fadd-5c" "(atom-port-1 + atom-eu-1), nothing*5") + +;;; imul insn has 5 cycles latency +(define_reservation "atom-imul-32" + "atom-imul-1, atom-imul-2, atom-imul-3, atom-imul-4, + atom-port-0") +;;; imul instruction excludes other non-FP instructions. +(exclusion_set "atom-eu-0, atom-eu-1" + "atom-imul-1, atom-imul-2, atom-imul-3, atom-imul-4") + +;;; dual-execution instructions can have 1,2,4,5 cycles latency depends on +;;; instruction forms +(define_reservation "atom-dual-1c" "(atom-port-dual + atom-eu-0 + atom-eu-1)") +(define_reservation "atom-dual-2c" + "(atom-port-dual + atom-eu-0 + atom-eu-1, nothing)") +(define_reservation "atom-dual-5c" + "(atom-port-dual + atom-eu-0 + atom-eu-1, nothing*4)") + +;;; Complex macro-instruction has variants of latency, and uses both ports. 
+(define_reservation "atom-complex" "(atom-port-dual + atom-all-eu)") + +(define_insn_reservation "atom_other" 9 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "other") + (eq_attr "atom_unit" "!jeu"))) + "atom-complex, atom-all-eu*8") + +;; return has type "other" with atom_unit "jeu" +(define_insn_reservation "atom_other_2" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "other") + (eq_attr "atom_unit" "jeu"))) + "atom-dual-1c") + +(define_insn_reservation "atom_multi" 9 + (and (eq_attr "cpu" "atom") + (eq_attr "type" "multi")) + "atom-complex, atom-all-eu*8") + +;; Normal alu insns without carry +(define_insn_reservation "atom_alu" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "alu") + (and (eq_attr "memory" "none") + (eq_attr "use_carry" "0")))) + "atom-simple-either") + +;; Normal alu insns without carry +(define_insn_reservation "atom_alu_mem" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "alu") + (and (eq_attr "memory" "!none") + (eq_attr "use_carry" "0")))) + "atom-simple-either") + +;; Alu insn consuming CF, such as add/sbb +(define_insn_reservation "atom_alu_carry" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "alu") + (and (eq_attr "memory" "none") + (eq_attr "use_carry" "1")))) + "atom-simple-either") + +;; Alu insn consuming CF, such as add/sbb +(define_insn_reservation "atom_alu_carry_mem" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "alu") + (and (eq_attr "memory" "!none") + (eq_attr "use_carry" "1")))) + "atom-simple-either") + +(define_insn_reservation "atom_alu1" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "alu1") + (eq_attr "memory" "none"))) + "atom-simple-either") + +(define_insn_reservation "atom_alu1_mem" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "alu1") + (eq_attr "memory" "!none"))) + "atom-simple-either") + +(define_insn_reservation "atom_negnot" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "negnot") + (eq_attr "memory" "none"))) + "atom-simple-either") + +(define_insn_reservation "atom_negnot_mem" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "negnot") + (eq_attr "memory" "!none"))) + "atom-simple-either") + +(define_insn_reservation "atom_imov" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "imov") + (eq_attr "memory" "none"))) + "atom-simple-either") + +(define_insn_reservation "atom_imov_mem" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "imov") + (eq_attr "memory" "!none"))) + "atom-simple-either") + +;; 16<-16, 32<-32 +(define_insn_reservation "atom_imovx" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "imovx") + (and (eq_attr "memory" "none") + (ior (and (match_operand:HI 0 "register_operand") + (match_operand:HI 1 "general_operand")) + (and (match_operand:SI 0 "register_operand") + (match_operand:SI 1 "general_operand")))))) + "atom-simple-either") + +;; 16<-16, 32<-32, mem +(define_insn_reservation "atom_imovx_mem" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "imovx") + (and (eq_attr "memory" "!none") + (ior (and (match_operand:HI 0 "register_operand") + (match_operand:HI 1 "general_operand")) + (and (match_operand:SI 0 "register_operand") + (match_operand:SI 1 "general_operand")))))) + "atom-simple-either") + +;; 32<-16, 32<-8, 64<-16, 64<-8, 64<-32, 8<-8 +(define_insn_reservation "atom_imovx_2" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "imovx") + (and (eq_attr "memory" "none") + (ior (match_operand:QI 0 "register_operand") + (ior (and (match_operand:SI 0 "register_operand") + (not (match_operand:SI 1 
"general_operand"))) + (match_operand:DI 0 "register_operand")))))) + "atom-simple-0") + +;; 32<-16, 32<-8, 64<-16, 64<-8, 64<-32, 8<-8, mem +(define_insn_reservation "atom_imovx_2_mem" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "imovx") + (and (eq_attr "memory" "!none") + (ior (match_operand:QI 0 "register_operand") + (ior (and (match_operand:SI 0 "register_operand") + (not (match_operand:SI 1 "general_operand"))) + (match_operand:DI 0 "register_operand")))))) + "atom-simple-0") + +;; 16<-8 +(define_insn_reservation "atom_imovx_3" 3 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "imovx") + (and (match_operand:HI 0 "register_operand") + (match_operand:QI 1 "general_operand")))) + "atom-complex, atom-all-eu*2") + +(define_insn_reservation "atom_lea" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "lea") + (eq_attr "mode" "!HI"))) + "atom-simple-either") + +;; lea 16bit address is complex insn +(define_insn_reservation "atom_lea_2" 2 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "lea") + (eq_attr "mode" "HI"))) + "atom-complex, atom-all-eu") + +(define_insn_reservation "atom_incdec" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "incdec") + (eq_attr "memory" "none"))) + "atom-simple-either") + +(define_insn_reservation "atom_incdec_mem" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "incdec") + (eq_attr "memory" "!none"))) + "atom-simple-either") + +;; simple shift instruction use SHIFT eu, none memory +(define_insn_reservation "atom_ishift" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "ishift") + (and (eq_attr "memory" "none") (eq_attr "prefix_0f" "0")))) + "atom-simple-0") + +;; simple shift instruction use SHIFT eu, memory +(define_insn_reservation "atom_ishift_mem" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "ishift") + (and (eq_attr "memory" "!none") (eq_attr "prefix_0f" "0")))) + "atom-simple-0") + +;; DF shift (prefixed with 0f) is complex insn with latency of 7 cycles +(define_insn_reservation "atom_ishift_3" 7 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "ishift") + (eq_attr "prefix_0f" "1"))) + "atom-complex, atom-all-eu*6") + +(define_insn_reservation "atom_ishift1" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "ishift1") + (eq_attr "memory" "none"))) + "atom-simple-0") + +(define_insn_reservation "atom_ishift1_mem" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "ishift1") + (eq_attr "memory" "!none"))) + "atom-simple-0") + +(define_insn_reservation "atom_rotate" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "rotate") + (eq_attr "memory" "none"))) + "atom-simple-0") + +(define_insn_reservation "atom_rotate_mem" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "rotate") + (eq_attr "memory" "!none"))) + "atom-simple-0") + +(define_insn_reservation "atom_rotate1" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "rotate1") + (eq_attr "memory" "none"))) + "atom-simple-0") + +(define_insn_reservation "atom_rotate1_mem" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "rotate1") + (eq_attr "memory" "!none"))) + "atom-simple-0") + +(define_insn_reservation "atom_imul" 5 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "imul") + (and (eq_attr "memory" "none") (eq_attr "mode" "SI")))) + "atom-imul-32") + +(define_insn_reservation "atom_imul_mem" 5 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "imul") + (and (eq_attr "memory" "!none") (eq_attr "mode" "SI")))) + "atom-imul-32") + +;; latency set to 10 as common 64x64 imul +(define_insn_reservation "atom_imul_3" 
10 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "imul") + (eq_attr "mode" "!SI"))) + "atom-complex, atom-all-eu*9") + +(define_insn_reservation "atom_idiv" 65 + (and (eq_attr "cpu" "atom") + (eq_attr "type" "idiv")) + "atom-complex, atom-all-eu*32, nothing*32") + +(define_insn_reservation "atom_icmp" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "icmp") + (eq_attr "memory" "none"))) + "atom-simple-either") + +(define_insn_reservation "atom_icmp_mem" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "icmp") + (eq_attr "memory" "!none"))) + "atom-simple-either") + +(define_insn_reservation "atom_test" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "test") + (eq_attr "memory" "none"))) + "atom-simple-either") + +(define_insn_reservation "atom_test_mem" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "test") + (eq_attr "memory" "!none"))) + "atom-simple-either") + +(define_insn_reservation "atom_ibr" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "ibr") + (eq_attr "memory" "!load"))) + "atom-simple-1") + +;; complex if jump target is from address +(define_insn_reservation "atom_ibr_2" 2 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "ibr") + (eq_attr "memory" "load"))) + "atom-complex, atom-all-eu") + +(define_insn_reservation "atom_setcc" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "setcc") + (eq_attr "memory" "!store"))) + "atom-simple-either") + +;; 2 cycles complex if target is in memory +(define_insn_reservation "atom_setcc_2" 2 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "setcc") + (eq_attr "memory" "store"))) + "atom-complex, atom-all-eu") + +(define_insn_reservation "atom_icmov" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "icmov") + (eq_attr "memory" "none"))) + "atom-simple-either") + +(define_insn_reservation "atom_icmov_mem" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "icmov") + (eq_attr "memory" "!none"))) + "atom-simple-either") + +;; UCODE if segreg, ignored +(define_insn_reservation "atom_push" 2 + (and (eq_attr "cpu" "atom") + (eq_attr "type" "push")) + "atom-dual-2c") + +;; pop r64 is 1 cycle. UCODE if segreg, ignored +(define_insn_reservation "atom_pop" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "pop") + (eq_attr "mode" "DI"))) + "atom-dual-1c") + +;; pop non-r64 is 2 cycles. 
UCODE if segreg, ignored +(define_insn_reservation "atom_pop_2" 2 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "pop") + (eq_attr "mode" "!DI"))) + "atom-dual-2c") + +;; UCODE if segreg, ignored +(define_insn_reservation "atom_call" 1 + (and (eq_attr "cpu" "atom") + (eq_attr "type" "call")) + "atom-dual-1c") + +(define_insn_reservation "atom_callv" 1 + (and (eq_attr "cpu" "atom") + (eq_attr "type" "callv")) + "atom-dual-1c") + +(define_insn_reservation "atom_leave" 3 + (and (eq_attr "cpu" "atom") + (eq_attr "type" "leave")) + "atom-complex, atom-all-eu*2") + +(define_insn_reservation "atom_str" 3 + (and (eq_attr "cpu" "atom") + (eq_attr "type" "str")) + "atom-complex, atom-all-eu*2") + +(define_insn_reservation "atom_sselog" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "sselog") + (eq_attr "memory" "none"))) + "atom-simple-either") + +(define_insn_reservation "atom_sselog_mem" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "sselog") + (eq_attr "memory" "!none"))) + "atom-simple-either") + +(define_insn_reservation "atom_sselog1" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "sselog1") + (eq_attr "memory" "none"))) + "atom-simple-0") + +(define_insn_reservation "atom_sselog1_mem" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "sselog1") + (eq_attr "memory" "!none"))) + "atom-simple-0") + +;; not pmad, not psad +(define_insn_reservation "atom_sseiadd" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "sseiadd") + (and (not (match_operand:V2DI 0 "register_operand")) + (and (eq_attr "atom_unit" "!simul") + (eq_attr "atom_unit" "!complex"))))) + "atom-simple-either") + +;; pmad, psad and 64 +(define_insn_reservation "atom_sseiadd_2" 4 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "sseiadd") + (and (not (match_operand:V2DI 0 "register_operand")) + (and (eq_attr "atom_unit" "simul" ) + (eq_attr "mode" "DI"))))) + "atom-fmul-4c") + +;; pmad, psad and 128 +(define_insn_reservation "atom_sseiadd_3" 5 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "sseiadd") + (and (not (match_operand:V2DI 0 "register_operand")) + (and (eq_attr "atom_unit" "simul" ) + (eq_attr "mode" "TI"))))) + "atom-fmul-5c") + +;; if paddq(64 bit op), phadd/phsub +(define_insn_reservation "atom_sseiadd_4" 6 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "sseiadd") + (ior (match_operand:V2DI 0 "register_operand") + (eq_attr "atom_unit" "complex")))) + "atom-complex, atom-all-eu*5") + +;; if immediate op. +(define_insn_reservation "atom_sseishft" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "sseishft") + (and (eq_attr "atom_unit" "!sishuf") + (match_operand 2 "immediate_operand")))) + "atom-simple-either") + +;; if palignr or psrldq +(define_insn_reservation "atom_sseishft_2" 1 + (and (eq_attr "cpu" "atom") + (ior (eq_attr "type" "sseishft1") + (and (eq_attr "type" "sseishft") + (and (eq_attr "atom_unit" "sishuf") + (match_operand 2 "immediate_operand"))))) + "atom-simple-0") + +;; if reg/mem op +(define_insn_reservation "atom_sseishft_3" 2 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "sseishft") + (not (match_operand 2 "immediate_operand")))) + "atom-complex, atom-all-eu") + +(define_insn_reservation "atom_sseimul" 1 + (and (eq_attr "cpu" "atom") + (eq_attr "type" "sseimul")) + "atom-simple-0") + +;; rcpss or rsqrtss +(define_insn_reservation "atom_sse" 4 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "sse") + (and (eq_attr "atom_sse_attr" "rcp") (eq_attr "mode" "SF")))) + "atom-fmul-4c") + +;; movshdup, movsldup. 
Suggest to type sseishft +(define_insn_reservation "atom_sse_2" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "sse") + (eq_attr "atom_sse_attr" "movdup"))) + "atom-simple-0") + +;; lfence +(define_insn_reservation "atom_sse_3" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "sse") + (eq_attr "atom_sse_attr" "lfence"))) + "atom-simple-either") + +;; sfence,clflush,mfence, prefetch +(define_insn_reservation "atom_sse_4" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "sse") + (ior (eq_attr "atom_sse_attr" "fence") + (eq_attr "atom_sse_attr" "prefetch")))) + "atom-simple-0") + +;; rcpps, rsqrtss, sqrt, ldmxcsr +(define_insn_reservation "atom_sse_5" 7 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "sse") + (ior (ior (eq_attr "atom_sse_attr" "sqrt") + (eq_attr "atom_sse_attr" "mxcsr")) + (and (eq_attr "atom_sse_attr" "rcp") + (eq_attr "mode" "V4SF"))))) + "atom-complex, atom-all-eu*6") + +;; xmm->xmm +(define_insn_reservation "atom_ssemov" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "ssemov") + (and (match_operand 0 "register_operand" "xy") (match_operand 1 "register_operand" "xy")))) + "atom-simple-either") + +;; reg->xmm +(define_insn_reservation "atom_ssemov_2" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "ssemov") + (and (match_operand 0 "register_operand" "xy") (match_operand 1 "register_operand" "r")))) + "atom-simple-0") + +;; xmm->reg +(define_insn_reservation "atom_ssemov_3" 3 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "ssemov") + (and (match_operand 0 "register_operand" "r") (match_operand 1 "register_operand" "xy")))) + "atom-eu-0-3-1") + +;; mov mem +(define_insn_reservation "atom_ssemov_4" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "ssemov") + (and (eq_attr "movu" "0") (eq_attr "memory" "!none")))) + "atom-simple-0") + +;; movu mem +(define_insn_reservation "atom_ssemov_5" 2 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "ssemov") + (ior (eq_attr "movu" "1") (eq_attr "memory" "!none")))) + "atom-complex, atom-all-eu") + +;; no memory simple +(define_insn_reservation "atom_sseadd" 5 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "sseadd") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "!V2DF") + (eq_attr "atom_unit" "!complex"))))) + "atom-fadd-5c") + +;; memory simple +(define_insn_reservation "atom_sseadd_mem" 5 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "sseadd") + (and (eq_attr "memory" "!none") + (and (eq_attr "mode" "!V2DF") + (eq_attr "atom_unit" "!complex"))))) + "atom-dual-5c") + +;; maxps, minps, *pd, hadd, hsub +(define_insn_reservation "atom_sseadd_3" 8 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "sseadd") + (ior (eq_attr "mode" "V2DF") (eq_attr "atom_unit" "complex")))) + "atom-complex, atom-all-eu*7") + +;; Except dppd/dpps +(define_insn_reservation "atom_ssemul" 5 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "ssemul") + (eq_attr "mode" "!SF"))) + "atom-fmul-5c") + +;; Except dppd/dpps, 4 cycle if mulss +(define_insn_reservation "atom_ssemul_2" 4 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "ssemul") + (eq_attr "mode" "SF"))) + "atom-fmul-4c") + +(define_insn_reservation "atom_ssecmp" 1 + (and (eq_attr "cpu" "atom") + (eq_attr "type" "ssecmp")) + "atom-simple-either") + +(define_insn_reservation "atom_ssecomi" 10 + (and (eq_attr "cpu" "atom") + (eq_attr "type" "ssecomi")) + "atom-complex, atom-all-eu*9") + +;; no memory and cvtpi2ps, cvtps2pi, cvttps2pi +(define_insn_reservation "atom_ssecvt" 5 + (and (eq_attr "cpu" "atom") + (and (eq_attr 
"type" "ssecvt") + (ior (and (match_operand:V2SI 0 "register_operand") + (match_operand:V4SF 1 "register_operand")) + (and (match_operand:V4SF 0 "register_operand") + (match_operand:V2SI 1 "register_operand"))))) + "atom-fadd-5c") + +;; memory and cvtpi2ps, cvtps2pi, cvttps2pi +(define_insn_reservation "atom_ssecvt_2" 5 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "ssecvt") + (ior (and (match_operand:V2SI 0 "register_operand") + (match_operand:V4SF 1 "memory_operand")) + (and (match_operand:V4SF 0 "register_operand") + (match_operand:V2SI 1 "memory_operand"))))) + "atom-dual-5c") + +;; otherwise. 7 cycles average for cvtss2sd +(define_insn_reservation "atom_ssecvt_3" 7 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "ssecvt") + (not (ior (and (match_operand:V2SI 0 "register_operand") + (match_operand:V4SF 1 "nonimmediate_operand")) + (and (match_operand:V4SF 0 "register_operand") + (match_operand:V2SI 1 "nonimmediate_operand")))))) + "atom-complex, atom-all-eu*6") + +;; memory and cvtsi2sd +(define_insn_reservation "atom_sseicvt" 5 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "sseicvt") + (and (match_operand:V2DF 0 "register_operand") + (match_operand:SI 1 "memory_operand")))) + "atom-dual-5c") + +;; otherwise. 8 cycles average for cvtsd2si +(define_insn_reservation "atom_sseicvt_2" 8 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "sseicvt") + (not (and (match_operand:V2DF 0 "register_operand") + (match_operand:SI 1 "memory_operand"))))) + "atom-complex, atom-all-eu*7") + +(define_insn_reservation "atom_ssediv" 62 + (and (eq_attr "cpu" "atom") + (eq_attr "type" "ssediv")) + "atom-complex, atom-all-eu*12, nothing*49") + +;; simple for fmov +(define_insn_reservation "atom_fmov" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "fmov") + (eq_attr "memory" "none"))) + "atom-simple-either") + +;; simple for fmov +(define_insn_reservation "atom_fmov_mem" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "fmov") + (eq_attr "memory" "!none"))) + "atom-simple-either") + +;; Define bypass here + +;; There will be no stall from lea to non-mem EX insns +(define_bypass 0 "atom_lea" + "atom_alu_carry, + atom_alu,atom_alu1,atom_negnot,atom_imov,atom_imovx, + atom_incdec, atom_setcc, atom_icmov, atom_pop") + +(define_bypass 0 "atom_lea" + "atom_alu_mem, atom_alu_carry_mem, atom_alu1_mem, + atom_imovx_mem, atom_imovx_2_mem, + atom_imov_mem, atom_icmov_mem, atom_fmov_mem" + "!ix86_agi_dependent") + +;; There will be 3 cycles stall from EX insns to AGAN insns LEA +(define_bypass 4 "atom_alu_carry, + atom_alu,atom_alu1,atom_negnot,atom_imov,atom_imovx, + atom_incdec,atom_ishift,atom_ishift1,atom_rotate, + atom_rotate1, atom_setcc, atom_icmov, atom_pop, + atom_alu_mem, atom_alu_carry_mem, atom_alu1_mem, + atom_imovx_mem, atom_imovx_2_mem, + atom_imov_mem, atom_icmov_mem, atom_fmov_mem" + "atom_lea") + +;; There will be 3 cycles stall from EX insns to insns need addr calculation +(define_bypass 4 "atom_alu_carry, + atom_alu,atom_alu1,atom_negnot,atom_imov,atom_imovx, + atom_incdec,atom_ishift,atom_ishift1,atom_rotate, + atom_rotate1, atom_setcc, atom_icmov, atom_pop, + atom_imovx_mem, atom_imovx_2_mem, + atom_alu_mem, atom_alu_carry_mem, atom_alu1_mem, + atom_imov_mem, atom_icmov_mem, atom_fmov_mem" + "atom_alu_mem, atom_alu_carry_mem, atom_alu1_mem, + atom_negnot_mem, atom_imov_mem, atom_incdec_mem, + atom_imovx_mem, atom_imovx_2_mem, + atom_imul_mem, atom_icmp_mem, + atom_test_mem, atom_icmov_mem, atom_sselog_mem, + atom_sselog1_mem, atom_fmov_mem, atom_sseadd_mem, + 
atom_ishift_mem, atom_ishift1_mem, + atom_rotate_mem, atom_rotate1_mem" + "ix86_agi_dependent") + +;; Stall from imul to lea is 8 cycles. +(define_bypass 9 "atom_imul, atom_imul_mem" "atom_lea") + +;; Stall from imul to memory address is 8 cycles. +(define_bypass 9 "atom_imul, atom_imul_mem" + "atom_alu_mem, atom_alu_carry_mem, atom_alu1_mem, + atom_negnot_mem, atom_imov_mem, atom_incdec_mem, + atom_ishift_mem, atom_ishift1_mem, atom_rotate_mem, + atom_rotate1_mem, atom_imul_mem, atom_icmp_mem, + atom_test_mem, atom_icmov_mem, atom_sselog_mem, + atom_sselog1_mem, atom_fmov_mem, atom_sseadd_mem" + "ix86_agi_dependent") + +;; There will be 0 cycle stall from cmp/test to jcc + +;; There will be 1 cycle stall from flag producer to cmov and adc/sbb +(define_bypass 2 "atom_icmp, atom_test, atom_alu, atom_alu_carry, + atom_alu1, atom_negnot, atom_incdec, atom_ishift, + atom_ishift1, atom_rotate, atom_rotate1" + "atom_icmov, atom_alu_carry") + +;; lea to shift count stall is 2 cycles +(define_bypass 3 "atom_lea" + "atom_ishift, atom_ishift1, atom_rotate, atom_rotate1, + atom_ishift_mem, atom_ishift1_mem, + atom_rotate_mem, atom_rotate1_mem" + "ix86_dep_by_shift_count") + +;; lea to shift source stall is 1 cycle +(define_bypass 2 "atom_lea" + "atom_ishift, atom_ishift1, atom_rotate, atom_rotate1" + "!ix86_dep_by_shift_count") + +;; non-lea to shift count stall is 1 cycle +(define_bypass 2 "atom_alu_carry, + atom_alu,atom_alu1,atom_negnot,atom_imov,atom_imovx, + atom_incdec,atom_ishift,atom_ishift1,atom_rotate, + atom_rotate1, atom_setcc, atom_icmov, atom_pop, + atom_alu_mem, atom_alu_carry_mem, atom_alu1_mem, + atom_imovx_mem, atom_imovx_2_mem, + atom_imov_mem, atom_icmov_mem, atom_fmov_mem" + "atom_ishift, atom_ishift1, atom_rotate, atom_rotate1, + atom_ishift_mem, atom_ishift1_mem, + atom_rotate_mem, atom_rotate1_mem" + "ix86_dep_by_shift_count") diff --git a/gcc/config/i386/att.h b/gcc/config/i386/att.h new file mode 100644 index 000000000..c16b2f922 --- /dev/null +++ b/gcc/config/i386/att.h @@ -0,0 +1,92 @@ +/* Definitions for AT&T assembler syntax for the Intel 80386. + Copyright (C) 1988, 1996, 2000, 2001, 2002, 2007, 2009, 2010 + Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +Under Section 7 of GPL version 3, you are granted additional +permissions described in the GCC Runtime Library Exception, version +3.1, as published by the Free Software Foundation. + +You should have received a copy of the GNU General Public License and +a copy of the GCC Runtime Library Exception along with this program; +see the files COPYING3 and COPYING.RUNTIME respectively. If not, see +. */ + + +/* Define the syntax of instructions and addresses. */ + +/* Prefix for internally generated assembler labels. */ +#define LPREFIX ".L" + +/* Assembler pseudos to introduce constants of various size. */ + +#define ASM_BYTE "\t.byte\t" +#define ASM_SHORT "\t.value\t" +#define ASM_LONG "\t.long\t" +#define ASM_QUAD "\t.quad\t" /* Should not be used for 32bit compilation. */ + +/* How to output an ASCII string constant. 
*/ + +#undef ASM_OUTPUT_ASCII +#define ASM_OUTPUT_ASCII(FILE, PTR, SIZE) \ +do \ +{ size_t i = 0, limit = (SIZE); \ + while (i < limit) \ + { if (i%10 == 0) { if (i!=0) putc ('\n', (FILE)); \ + fputs (ASM_BYTE, (FILE)); } \ + else putc (',', (FILE)); \ + fprintf ((FILE), "0x%x", ((PTR)[i++] & 0377)) ;} \ + putc ('\n', (FILE)); \ +} while (0) + +/* Output at beginning of assembler file. */ +#define TARGET_ASM_FILE_START_FILE_DIRECTIVE true + +/* This is how to output an assembler line + that says to advance the location counter + to a multiple of 2**LOG bytes. */ + +#define ASM_OUTPUT_ALIGN(FILE,LOG) \ + if ((LOG)!=0) fprintf ((FILE), "\t.align %d\n", 1<<(LOG)) + +/* This is how to output an assembler line + that says to advance the location counter by SIZE bytes. */ + +#undef ASM_OUTPUT_SKIP +#define ASM_OUTPUT_SKIP(FILE,SIZE) \ + fprintf ((FILE), "\t.set .,.+%u\n", (int)(SIZE)) + +/* Can't use ASM_OUTPUT_SKIP in text section; it doesn't leave 0s. */ + +#define ASM_NO_SKIP_IN_TEXT 1 + +/* Define the syntax of labels and symbol definitions/declarations. */ + +/* The prefix to add for compiler private assembler symbols. */ +#undef LOCAL_LABEL_PREFIX +#define LOCAL_LABEL_PREFIX "." + +/* This is how to store into the string BUF + the symbol_ref name of an internal numbered label where + PREFIX is the class of label and NUM is the number within the class. + This is suitable for output with `assemble_name'. */ + +#undef ASM_GENERATE_INTERNAL_LABEL +#define ASM_GENERATE_INTERNAL_LABEL(BUF,PREFIX,NUMBER) \ + sprintf ((BUF), LOCAL_LABEL_PREFIX "%s%ld", (PREFIX), (long)(NUMBER)) + +/* The prefix to add to user-visible assembler symbols. */ + +#undef USER_LABEL_PREFIX +#define USER_LABEL_PREFIX "" diff --git a/gcc/config/i386/avxintrin.h b/gcc/config/i386/avxintrin.h new file mode 100644 index 000000000..6d4213dc8 --- /dev/null +++ b/gcc/config/i386/avxintrin.h @@ -0,0 +1,1426 @@ +/* Copyright (C) 2008, 2009 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +/* Implemented from the specification included in the Intel C++ Compiler + User Guide and Reference, version 11.0. */ + +#ifndef _IMMINTRIN_H_INCLUDED +# error "Never use directly; include instead." +#endif + +/* Internal data types for implementing the intrinsics. 
*/ +typedef double __v4df __attribute__ ((__vector_size__ (32))); +typedef float __v8sf __attribute__ ((__vector_size__ (32))); +typedef long long __v4di __attribute__ ((__vector_size__ (32))); +typedef int __v8si __attribute__ ((__vector_size__ (32))); +typedef short __v16hi __attribute__ ((__vector_size__ (32))); +typedef char __v32qi __attribute__ ((__vector_size__ (32))); + +/* The Intel API is flexible enough that we must allow aliasing with other + vector types, and their scalar components. */ +typedef float __m256 __attribute__ ((__vector_size__ (32), + __may_alias__)); +typedef long long __m256i __attribute__ ((__vector_size__ (32), + __may_alias__)); +typedef double __m256d __attribute__ ((__vector_size__ (32), + __may_alias__)); + +/* Compare predicates for scalar and packed compare intrinsics. */ + +/* Equal (ordered, non-signaling) */ +#define _CMP_EQ_OQ 0x00 +/* Less-than (ordered, signaling) */ +#define _CMP_LT_OS 0x01 +/* Less-than-or-equal (ordered, signaling) */ +#define _CMP_LE_OS 0x02 +/* Unordered (non-signaling) */ +#define _CMP_UNORD_Q 0x03 +/* Not-equal (unordered, non-signaling) */ +#define _CMP_NEQ_UQ 0x04 +/* Not-less-than (unordered, signaling) */ +#define _CMP_NLT_US 0x05 +/* Not-less-than-or-equal (unordered, signaling) */ +#define _CMP_NLE_US 0x06 +/* Ordered (nonsignaling) */ +#define _CMP_ORD_Q 0x07 +/* Equal (unordered, non-signaling) */ +#define _CMP_EQ_UQ 0x08 +/* Not-greater-than-or-equal (unordered, signaling) */ +#define _CMP_NGE_US 0x09 +/* Not-greater-than (unordered, signaling) */ +#define _CMP_NGT_US 0x0a +/* False (ordered, non-signaling) */ +#define _CMP_FALSE_OQ 0x0b +/* Not-equal (ordered, non-signaling) */ +#define _CMP_NEQ_OQ 0x0c +/* Greater-than-or-equal (ordered, signaling) */ +#define _CMP_GE_OS 0x0d +/* Greater-than (ordered, signaling) */ +#define _CMP_GT_OS 0x0e +/* True (unordered, non-signaling) */ +#define _CMP_TRUE_UQ 0x0f +/* Equal (ordered, signaling) */ +#define _CMP_EQ_OS 0x10 +/* Less-than (ordered, non-signaling) */ +#define _CMP_LT_OQ 0x11 +/* Less-than-or-equal (ordered, non-signaling) */ +#define _CMP_LE_OQ 0x12 +/* Unordered (signaling) */ +#define _CMP_UNORD_S 0x13 +/* Not-equal (unordered, signaling) */ +#define _CMP_NEQ_US 0x14 +/* Not-less-than (unordered, non-signaling) */ +#define _CMP_NLT_UQ 0x15 +/* Not-less-than-or-equal (unordered, non-signaling) */ +#define _CMP_NLE_UQ 0x16 +/* Ordered (signaling) */ +#define _CMP_ORD_S 0x17 +/* Equal (unordered, signaling) */ +#define _CMP_EQ_US 0x18 +/* Not-greater-than-or-equal (unordered, non-signaling) */ +#define _CMP_NGE_UQ 0x19 +/* Not-greater-than (unordered, non-signaling) */ +#define _CMP_NGT_UQ 0x1a +/* False (ordered, signaling) */ +#define _CMP_FALSE_OS 0x1b +/* Not-equal (ordered, signaling) */ +#define _CMP_NEQ_OS 0x1c +/* Greater-than-or-equal (ordered, non-signaling) */ +#define _CMP_GE_OQ 0x1d +/* Greater-than (ordered, non-signaling) */ +#define _CMP_GT_OQ 0x1e +/* True (unordered, signaling) */ +#define _CMP_TRUE_US 0x1f + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_add_pd (__m256d __A, __m256d __B) +{ + return (__m256d) __builtin_ia32_addpd256 ((__v4df)__A, (__v4df)__B); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_add_ps (__m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_addps256 ((__v8sf)__A, (__v8sf)__B); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_addsub_pd (__m256d 
__A, __m256d __B) +{ + return (__m256d) __builtin_ia32_addsubpd256 ((__v4df)__A, (__v4df)__B); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_addsub_ps (__m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_addsubps256 ((__v8sf)__A, (__v8sf)__B); +} + + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_and_pd (__m256d __A, __m256d __B) +{ + return (__m256d) __builtin_ia32_andpd256 ((__v4df)__A, (__v4df)__B); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_and_ps (__m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_andps256 ((__v8sf)__A, (__v8sf)__B); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_andnot_pd (__m256d __A, __m256d __B) +{ + return (__m256d) __builtin_ia32_andnpd256 ((__v4df)__A, (__v4df)__B); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_andnot_ps (__m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_andnps256 ((__v8sf)__A, (__v8sf)__B); +} + +/* Double/single precision floating point blend instructions - select + data from 2 sources using constant/variable mask. */ + +#ifdef __OPTIMIZE__ +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_blend_pd (__m256d __X, __m256d __Y, const int __M) +{ + return (__m256d) __builtin_ia32_blendpd256 ((__v4df)__X, + (__v4df)__Y, + __M); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_blend_ps (__m256 __X, __m256 __Y, const int __M) +{ + return (__m256) __builtin_ia32_blendps256 ((__v8sf)__X, + (__v8sf)__Y, + __M); +} +#else +#define _mm256_blend_pd(X, Y, M) \ + ((__m256d) __builtin_ia32_blendpd256 ((__v4df)(__m256d)(X), \ + (__v4df)(__m256d)(Y), (int)(M))) + +#define _mm256_blend_ps(X, Y, M) \ + ((__m256) __builtin_ia32_blendps256 ((__v8sf)(__m256)(X), \ + (__v8sf)(__m256)(Y), (int)(M))) +#endif + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_blendv_pd (__m256d __X, __m256d __Y, __m256d __M) +{ + return (__m256d) __builtin_ia32_blendvpd256 ((__v4df)__X, + (__v4df)__Y, + (__v4df)__M); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_blendv_ps (__m256 __X, __m256 __Y, __m256 __M) +{ + return (__m256) __builtin_ia32_blendvps256 ((__v8sf)__X, + (__v8sf)__Y, + (__v8sf)__M); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_div_pd (__m256d __A, __m256d __B) +{ + return (__m256d) __builtin_ia32_divpd256 ((__v4df)__A, (__v4df)__B); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_div_ps (__m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_divps256 ((__v8sf)__A, (__v8sf)__B); +} + +/* Dot product instructions with mask-defined summing and zeroing parts + of result. 
*/ + +#ifdef __OPTIMIZE__ +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_dp_ps (__m256 __X, __m256 __Y, const int __M) +{ + return (__m256) __builtin_ia32_dpps256 ((__v8sf)__X, + (__v8sf)__Y, + __M); +} +#else +#define _mm256_dp_ps(X, Y, M) \ + ((__m256) __builtin_ia32_dpps256 ((__v8sf)(__m256)(X), \ + (__v8sf)(__m256)(Y), (int)(M))) +#endif + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_hadd_pd (__m256d __X, __m256d __Y) +{ + return (__m256d) __builtin_ia32_haddpd256 ((__v4df)__X, (__v4df)__Y); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_hadd_ps (__m256 __X, __m256 __Y) +{ + return (__m256) __builtin_ia32_haddps256 ((__v8sf)__X, (__v8sf)__Y); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_hsub_pd (__m256d __X, __m256d __Y) +{ + return (__m256d) __builtin_ia32_hsubpd256 ((__v4df)__X, (__v4df)__Y); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_hsub_ps (__m256 __X, __m256 __Y) +{ + return (__m256) __builtin_ia32_hsubps256 ((__v8sf)__X, (__v8sf)__Y); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_max_pd (__m256d __A, __m256d __B) +{ + return (__m256d) __builtin_ia32_maxpd256 ((__v4df)__A, (__v4df)__B); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_max_ps (__m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_maxps256 ((__v8sf)__A, (__v8sf)__B); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_min_pd (__m256d __A, __m256d __B) +{ + return (__m256d) __builtin_ia32_minpd256 ((__v4df)__A, (__v4df)__B); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_min_ps (__m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_minps256 ((__v8sf)__A, (__v8sf)__B); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mul_pd (__m256d __A, __m256d __B) +{ + return (__m256d) __builtin_ia32_mulpd256 ((__v4df)__A, (__v4df)__B); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mul_ps (__m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_mulps256 ((__v8sf)__A, (__v8sf)__B); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_or_pd (__m256d __A, __m256d __B) +{ + return (__m256d) __builtin_ia32_orpd256 ((__v4df)__A, (__v4df)__B); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_or_ps (__m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_orps256 ((__v8sf)__A, (__v8sf)__B); +} + +#ifdef __OPTIMIZE__ +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_shuffle_pd (__m256d __A, __m256d __B, const int __mask) +{ + return (__m256d) __builtin_ia32_shufpd256 ((__v4df)__A, (__v4df)__B, + __mask); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_shuffle_ps (__m256 __A, __m256 __B, const int __mask) +{ + return (__m256) __builtin_ia32_shufps256 ((__v8sf)__A, (__v8sf)__B, + __mask); +} +#else +#define _mm256_shuffle_pd(A, B, N) \ + ((__m256d)__builtin_ia32_shufpd256 ((__v4df)(__m256d)(A), \ + 
(__v4df)(__m256d)(B), (int)(N))) + +#define _mm256_shuffle_ps(A, B, N) \ + ((__m256) __builtin_ia32_shufps256 ((__v8sf)(__m256)(A), \ + (__v8sf)(__m256)(B), (int)(N))) +#endif + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_sub_pd (__m256d __A, __m256d __B) +{ + return (__m256d) __builtin_ia32_subpd256 ((__v4df)__A, (__v4df)__B); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_sub_ps (__m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_subps256 ((__v8sf)__A, (__v8sf)__B); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_xor_pd (__m256d __A, __m256d __B) +{ + return (__m256d) __builtin_ia32_xorpd256 ((__v4df)__A, (__v4df)__B); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_xor_ps (__m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_xorps256 ((__v8sf)__A, (__v8sf)__B); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmp_pd (__m128d __X, __m128d __Y, const int __P) +{ + return (__m128d) __builtin_ia32_cmppd ((__v2df)__X, (__v2df)__Y, __P); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmp_ps (__m128 __X, __m128 __Y, const int __P) +{ + return (__m128) __builtin_ia32_cmpps ((__v4sf)__X, (__v4sf)__Y, __P); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmp_pd (__m256d __X, __m256d __Y, const int __P) +{ + return (__m256d) __builtin_ia32_cmppd256 ((__v4df)__X, (__v4df)__Y, + __P); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmp_ps (__m256 __X, __m256 __Y, const int __P) +{ + return (__m256) __builtin_ia32_cmpps256 ((__v8sf)__X, (__v8sf)__Y, + __P); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmp_sd (__m128d __X, __m128d __Y, const int __P) +{ + return (__m128d) __builtin_ia32_cmpsd ((__v2df)__X, (__v2df)__Y, __P); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmp_ss (__m128 __X, __m128 __Y, const int __P) +{ + return (__m128) __builtin_ia32_cmpss ((__v4sf)__X, (__v4sf)__Y, __P); +} +#else +#define _mm_cmp_pd(X, Y, P) \ + ((__m128d) __builtin_ia32_cmppd ((__v2df)(__m128d)(X), \ + (__v2df)(__m128d)(Y), (int)(P))) + +#define _mm_cmp_ps(X, Y, P) \ + ((__m128) __builtin_ia32_cmpps ((__v4sf)(__m128)(X), \ + (__v4sf)(__m128)(Y), (int)(P))) + +#define _mm256_cmp_pd(X, Y, P) \ + ((__m256d) __builtin_ia32_cmppd256 ((__v4df)(__m256d)(X), \ + (__v4df)(__m256d)(Y), (int)(P))) + +#define _mm256_cmp_ps(X, Y, P) \ + ((__m256) __builtin_ia32_cmpps256 ((__v8sf)(__m256)(X), \ + (__v8sf)(__m256)(Y), (int)(P))) + +#define _mm_cmp_sd(X, Y, P) \ + ((__m128d) __builtin_ia32_cmpsd ((__v2df)(__m128d)(X), \ + (__v2df)(__m128d)(Y), (int)(P))) + +#define _mm_cmp_ss(X, Y, P) \ + ((__m128) __builtin_ia32_cmpss ((__v4sf)(__m128)(X), \ + (__v4sf)(__m128)(Y), (int)(P))) +#endif + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtepi32_pd (__m128i __A) +{ + return (__m256d)__builtin_ia32_cvtdq2pd256 ((__v4si) __A); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtepi32_ps (__m256i __A) +{ + return (__m256)__builtin_ia32_cvtdq2ps256 
((__v8si) __A); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtpd_ps (__m256d __A) +{ + return (__m128)__builtin_ia32_cvtpd2ps256 ((__v4df) __A); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtps_epi32 (__m256 __A) +{ + return (__m256i)__builtin_ia32_cvtps2dq256 ((__v8sf) __A); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtps_pd (__m128 __A) +{ + return (__m256d)__builtin_ia32_cvtps2pd256 ((__v4sf) __A); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvttpd_epi32 (__m256d __A) +{ + return (__m128i)__builtin_ia32_cvttpd2dq256 ((__v4df) __A); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtpd_epi32 (__m256d __A) +{ + return (__m128i)__builtin_ia32_cvtpd2dq256 ((__v4df) __A); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvttps_epi32 (__m256 __A) +{ + return (__m256i)__builtin_ia32_cvttps2dq256 ((__v8sf) __A); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_extractf128_pd (__m256d __X, const int __N) +{ + return (__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)__X, __N); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_extractf128_ps (__m256 __X, const int __N) +{ + return (__m128) __builtin_ia32_vextractf128_ps256 ((__v8sf)__X, __N); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_extractf128_si256 (__m256i __X, const int __N) +{ + return (__m128i) __builtin_ia32_vextractf128_si256 ((__v8si)__X, __N); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_extract_epi32 (__m256i __X, int const __N) +{ + __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 2); + return _mm_extract_epi32 (__Y, __N % 4); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_extract_epi16 (__m256i __X, int const __N) +{ + __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 3); + return _mm_extract_epi16 (__Y, __N % 8); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_extract_epi8 (__m256i __X, int const __N) +{ + __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 4); + return _mm_extract_epi8 (__Y, __N % 16); +} + +#ifdef __x86_64__ +extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_extract_epi64 (__m256i __X, const int __N) +{ + __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 1); + return _mm_extract_epi64 (__Y, __N % 2); +} +#endif +#else +#define _mm256_extractf128_pd(X, N) \ + ((__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)(__m256d)(X), \ + (int)(N))) + +#define _mm256_extractf128_ps(X, N) \ + ((__m128) __builtin_ia32_vextractf128_ps256 ((__v8sf)(__m256)(X), \ + (int)(N))) + +#define _mm256_extractf128_si256(X, N) \ + ((__m128i) __builtin_ia32_vextractf128_si256 ((__v8si)(__m256i)(X), \ + (int)(N))) + +#define _mm256_extract_epi32(X, N) \ + (__extension__ \ + ({ \ + __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 2); \ + _mm_extract_epi32 (__Y, (N) % 4); \ + })) + +#define _mm256_extract_epi16(X, N) \ + (__extension__ \ + ({ \ + __m128i __Y = 
_mm256_extractf128_si256 ((X), (N) >> 3); \ + _mm_extract_epi16 (__Y, (N) % 8); \ + })) + +#define _mm256_extract_epi8(X, N) \ + (__extension__ \ + ({ \ + __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 4); \ + _mm_extract_epi8 (__Y, (N) % 16); \ + })) + +#ifdef __x86_64__ +#define _mm256_extract_epi64(X, N) \ + (__extension__ \ + ({ \ + __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 1); \ + _mm_extract_epi64 (__Y, (N) % 2); \ + })) +#endif +#endif + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_zeroall (void) +{ + __builtin_ia32_vzeroall (); +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_zeroupper (void) +{ + __builtin_ia32_vzeroupper (); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_permutevar_pd (__m128d __A, __m128i __C) +{ + return (__m128d) __builtin_ia32_vpermilvarpd ((__v2df)__A, + (__v2di)__C); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permutevar_pd (__m256d __A, __m256i __C) +{ + return (__m256d) __builtin_ia32_vpermilvarpd256 ((__v4df)__A, + (__v4di)__C); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_permutevar_ps (__m128 __A, __m128i __C) +{ + return (__m128) __builtin_ia32_vpermilvarps ((__v4sf)__A, + (__v4si)__C); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permutevar_ps (__m256 __A, __m256i __C) +{ + return (__m256) __builtin_ia32_vpermilvarps256 ((__v8sf)__A, + (__v8si)__C); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_permute_pd (__m128d __X, const int __C) +{ + return (__m128d) __builtin_ia32_vpermilpd ((__v2df)__X, __C); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permute_pd (__m256d __X, const int __C) +{ + return (__m256d) __builtin_ia32_vpermilpd256 ((__v4df)__X, __C); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_permute_ps (__m128 __X, const int __C) +{ + return (__m128) __builtin_ia32_vpermilps ((__v4sf)__X, __C); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permute_ps (__m256 __X, const int __C) +{ + return (__m256) __builtin_ia32_vpermilps256 ((__v8sf)__X, __C); +} +#else +#define _mm_permute_pd(X, C) \ + ((__m128d) __builtin_ia32_vpermilpd ((__v2df)(__m128d)(X), (int)(C))) + +#define _mm256_permute_pd(X, C) \ + ((__m256d) __builtin_ia32_vpermilpd256 ((__v4df)(__m256d)(X), (int)(C))) + +#define _mm_permute_ps(X, C) \ + ((__m128) __builtin_ia32_vpermilps ((__v4sf)(__m128)(X), (int)(C))) + +#define _mm256_permute_ps(X, C) \ + ((__m256) __builtin_ia32_vpermilps256 ((__v8sf)(__m256)(X), (int)(C))) +#endif + +#ifdef __OPTIMIZE__ +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permute2f128_pd (__m256d __X, __m256d __Y, const int __C) +{ + return (__m256d) __builtin_ia32_vperm2f128_pd256 ((__v4df)__X, + (__v4df)__Y, + __C); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permute2f128_ps (__m256 __X, __m256 __Y, const int __C) +{ + return (__m256) __builtin_ia32_vperm2f128_ps256 ((__v8sf)__X, + (__v8sf)__Y, + __C); +} + +extern __inline __m256i __attribute__((__gnu_inline__, 
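
The permute forms take an immediate lane-selection control while the permutevar forms read it from a vector register, and _mm256_zeroupper is the usual fence before calling legacy SSE code. A hedged sketch of the immediate forms; the function name and output buffers are illustrative:

#include <immintrin.h>

/* Reverse the four floats of an XMM register, then duplicate the low
   double of each 128-bit lane of a YMM register.  Compile with -mavx.  */
void permute_demo (float *out4, double *out4d)
{
  __m128 x = _mm_set_ps (4.0f, 3.0f, 2.0f, 1.0f);
  /* Immediate control: pick elements 3,2,1,0, i.e. reverse order.  */
  __m128 rev = _mm_permute_ps (x, _MM_SHUFFLE (0, 1, 2, 3));
  _mm_storeu_ps (out4, rev);

  __m256d y = _mm256_set_pd (4.0, 3.0, 2.0, 1.0);
  /* Each control bit selects within a 128-bit lane; 0x0 picks the low
     double of each lane for both result slots.  */
  __m256d dup = _mm256_permute_pd (y, 0x0);
  _mm256_storeu_pd (out4d, dup);

  /* Clear the upper YMM halves before any legacy SSE call to avoid
     AVX/SSE transition penalties.  */
  _mm256_zeroupper ();
}
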
__always_inline__, __artificial__)) +_mm256_permute2f128_si256 (__m256i __X, __m256i __Y, const int __C) +{ + return (__m256i) __builtin_ia32_vperm2f128_si256 ((__v8si)__X, + (__v8si)__Y, + __C); +} +#else +#define _mm256_permute2f128_pd(X, Y, C) \ + ((__m256d) __builtin_ia32_vperm2f128_pd256 ((__v4df)(__m256d)(X), \ + (__v4df)(__m256d)(Y), \ + (int)(C))) + +#define _mm256_permute2f128_ps(X, Y, C) \ + ((__m256) __builtin_ia32_vperm2f128_ps256 ((__v8sf)(__m256)(X), \ + (__v8sf)(__m256)(Y), \ + (int)(C))) + +#define _mm256_permute2f128_si256(X, Y, C) \ + ((__m256i) __builtin_ia32_vperm2f128_si256 ((__v8si)(__m256i)(X), \ + (__v8si)(__m256i)(Y), \ + (int)(C))) +#endif + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_broadcast_ss (float const *__X) +{ + return (__m128) __builtin_ia32_vbroadcastss (__X); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_broadcast_sd (double const *__X) +{ + return (__m256d) __builtin_ia32_vbroadcastsd256 (__X); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_broadcast_ss (float const *__X) +{ + return (__m256) __builtin_ia32_vbroadcastss256 (__X); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_broadcast_pd (__m128d const *__X) +{ + return (__m256d) __builtin_ia32_vbroadcastf128_pd256 (__X); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_broadcast_ps (__m128 const *__X) +{ + return (__m256) __builtin_ia32_vbroadcastf128_ps256 (__X); +} + +#ifdef __OPTIMIZE__ +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_insertf128_pd (__m256d __X, __m128d __Y, const int __O) +{ + return (__m256d) __builtin_ia32_vinsertf128_pd256 ((__v4df)__X, + (__v2df)__Y, + __O); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_insertf128_ps (__m256 __X, __m128 __Y, const int __O) +{ + return (__m256) __builtin_ia32_vinsertf128_ps256 ((__v8sf)__X, + (__v4sf)__Y, + __O); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_insertf128_si256 (__m256i __X, __m128i __Y, const int __O) +{ + return (__m256i) __builtin_ia32_vinsertf128_si256 ((__v8si)__X, + (__v4si)__Y, + __O); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_insert_epi32 (__m256i __X, int __D, int const __N) +{ + __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 2); + __Y = _mm_insert_epi32 (__Y, __D, __N % 4); + return _mm256_insertf128_si256 (__X, __Y, __N >> 2); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_insert_epi16 (__m256i __X, int __D, int const __N) +{ + __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 3); + __Y = _mm_insert_epi16 (__Y, __D, __N % 8); + return _mm256_insertf128_si256 (__X, __Y, __N >> 3); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_insert_epi8 (__m256i __X, int __D, int const __N) +{ + __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 4); + __Y = _mm_insert_epi8 (__Y, __D, __N % 16); + return _mm256_insertf128_si256 (__X, __Y, __N >> 4); +} + +#ifdef __x86_64__ +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_insert_epi64 (__m256i __X, long long 
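
The broadcast intrinsics replicate a scalar or a 128-bit block from memory, and insertf128 rebuilds a 256-bit value one half at a time; combined with the 128-to-256 cast defined later in this header, that is the usual way to assemble a YMM value from two XMM halves. Illustrative sketch (combine_halves is not from the patch):

#include <immintrin.h>

/* Build { lo | hi } from two 128-bit halves and scale by a scalar
   splatted straight from memory.  Compile with -mavx.  */
__m256 combine_halves (__m128 lo, __m128 hi, const float *scale)
{
  /* Low half via a free cast, then the high half at offset 1.  */
  __m256 v = _mm256_castps128_ps256 (lo);
  v = _mm256_insertf128_ps (v, hi, 1);
  /* vbroadcastss: one 32-bit load replicated to all eight lanes.  */
  __m256 s = _mm256_broadcast_ss (scale);
  return _mm256_mul_ps (v, s);
}
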
__D, int const __N) +{ + __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 1); + __Y = _mm_insert_epi64 (__Y, __D, __N % 2); + return _mm256_insertf128_si256 (__X, __Y, __N >> 1); +} +#endif +#else +#define _mm256_insertf128_pd(X, Y, O) \ + ((__m256d) __builtin_ia32_vinsertf128_pd256 ((__v4df)(__m256d)(X), \ + (__v2df)(__m128d)(Y), \ + (int)(O))) + +#define _mm256_insertf128_ps(X, Y, O) \ + ((__m256) __builtin_ia32_vinsertf128_ps256 ((__v8sf)(__m256)(X), \ + (__v4sf)(__m128)(Y), \ + (int)(O))) + +#define _mm256_insertf128_si256(X, Y, O) \ + ((__m256i) __builtin_ia32_vinsertf128_si256 ((__v8si)(__m256i)(X), \ + (__v4si)(__m128i)(Y), \ + (int)(O))) + +#define _mm256_insert_epi32(X, D, N) \ + (__extension__ \ + ({ \ + __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 2); \ + __Y = _mm_insert_epi32 (__Y, (D), (N) % 4); \ + _mm256_insertf128_si256 ((X), __Y, (N) >> 2); \ + })) + +#define _mm256_insert_epi16(X, D, N) \ + (__extension__ \ + ({ \ + __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 3); \ + __Y = _mm_insert_epi16 (__Y, (D), (N) % 8); \ + _mm256_insertf128_si256 ((X), __Y, (N) >> 3); \ + })) + +#define _mm256_insert_epi8(X, D, N) \ + (__extension__ \ + ({ \ + __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 4); \ + __Y = _mm_insert_epi8 (__Y, (D), (N) % 16); \ + _mm256_insertf128_si256 ((X), __Y, (N) >> 4); \ + })) + +#ifdef __x86_64__ +#define _mm256_insert_epi64(X, D, N) \ + (__extension__ \ + ({ \ + __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 1); \ + __Y = _mm_insert_epi64 (__Y, (D), (N) % 2); \ + _mm256_insertf128_si256 ((X), __Y, (N) >> 1); \ + })) +#endif +#endif + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_load_pd (double const *__P) +{ + return *(__m256d *)__P; +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_store_pd (double *__P, __m256d __A) +{ + *(__m256d *)__P = __A; +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_load_ps (float const *__P) +{ + return *(__m256 *)__P; +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_store_ps (float *__P, __m256 __A) +{ + *(__m256 *)__P = __A; +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_loadu_pd (double const *__P) +{ + return (__m256d) __builtin_ia32_loadupd256 (__P); +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_storeu_pd (double *__P, __m256d __A) +{ + __builtin_ia32_storeupd256 (__P, (__v4df)__A); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_loadu_ps (float const *__P) +{ + return (__m256) __builtin_ia32_loadups256 (__P); +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_storeu_ps (float *__P, __m256 __A) +{ + __builtin_ia32_storeups256 (__P, (__v8sf)__A); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_load_si256 (__m256i const *__P) +{ + return *__P; +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_store_si256 (__m256i *__P, __m256i __A) +{ + *__P = __A; +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_loadu_si256 (__m256i const *__P) +{ + return (__m256i) __builtin_ia32_loaddqu256 ((char const *)__P); +} + +extern 
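
The aligned load/store intrinsics compile to a plain 32-byte-aligned dereference, while the loadu/storeu forms go through the unaligned-move builtins and accept any pointer. A short sketch; saxpy8 and buf are illustrative names:

#include <immintrin.h>

/* dst[i] = a * x[i] + y[i] for eight floats, any alignment.  */
void saxpy8 (float *dst, const float *x, const float *y, float a)
{
  __m256 vx = _mm256_loadu_ps (x);
  __m256 vy = _mm256_loadu_ps (y);
  __m256 va = _mm256_set1_ps (a);
  _mm256_storeu_ps (dst, _mm256_add_ps (_mm256_mul_ps (va, vx), vy));
}

/* The aligned forms require 32-byte alignment of the pointer.  */
static float buf[8] __attribute__ ((aligned (32)));

void fill_aligned (float v)
{
  _mm256_store_ps (buf, _mm256_set1_ps (v));
}
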
__inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_storeu_si256 (__m256i *__P, __m256i __A) +{ + __builtin_ia32_storedqu256 ((char *)__P, (__v32qi)__A); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskload_pd (double const *__P, __m128i __M) +{ + return (__m128d) __builtin_ia32_maskloadpd ((const __v2df *)__P, + (__v2di)__M); +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskstore_pd (double *__P, __m128i __M, __m128d __A) +{ + __builtin_ia32_maskstorepd ((__v2df *)__P, (__v2di)__M, (__v2df)__A); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskload_pd (double const *__P, __m256i __M) +{ + return (__m256d) __builtin_ia32_maskloadpd256 ((const __v4df *)__P, + (__v4di)__M); +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskstore_pd (double *__P, __m256i __M, __m256d __A) +{ + __builtin_ia32_maskstorepd256 ((__v4df *)__P, (__v4di)__M, (__v4df)__A); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskload_ps (float const *__P, __m128i __M) +{ + return (__m128) __builtin_ia32_maskloadps ((const __v4sf *)__P, + (__v4si)__M); +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskstore_ps (float *__P, __m128i __M, __m128 __A) +{ + __builtin_ia32_maskstoreps ((__v4sf *)__P, (__v4si)__M, (__v4sf)__A); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskload_ps (float const *__P, __m256i __M) +{ + return (__m256) __builtin_ia32_maskloadps256 ((const __v8sf *)__P, + (__v8si)__M); +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskstore_ps (float *__P, __m256i __M, __m256 __A) +{ + __builtin_ia32_maskstoreps256 ((__v8sf *)__P, (__v8si)__M, (__v8sf)__A); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_movehdup_ps (__m256 __X) +{ + return (__m256) __builtin_ia32_movshdup256 ((__v8sf)__X); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_moveldup_ps (__m256 __X) +{ + return (__m256) __builtin_ia32_movsldup256 ((__v8sf)__X); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_movedup_pd (__m256d __X) +{ + return (__m256d) __builtin_ia32_movddup256 ((__v4df)__X); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_lddqu_si256 (__m256i const *__P) +{ + return (__m256i) __builtin_ia32_lddqu256 ((char const *)__P); +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_stream_si256 (__m256i *__A, __m256i __B) +{ + __builtin_ia32_movntdq256 ((__v4di *)__A, (__v4di)__B); +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_stream_pd (double *__A, __m256d __B) +{ + __builtin_ia32_movntpd256 (__A, (__v4df)__B); +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_stream_ps (float *__P, __m256 __A) +{ + __builtin_ia32_movntps256 (__P, (__v8sf)__A); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_rcp_ps (__m256 __A) +{ + return (__m256) 
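
maskload/maskstore touch only the lanes whose mask element has its sign bit set, which is handy for loop remainders, and the stream intrinsics are non-temporal stores that require 32-byte-aligned destinations. A hedged sketch with illustrative names:

#include <immintrin.h>

/* Store only the first n (0..4) doubles of v to p, leaving the rest
   of the destination untouched.  Compile with -mavx.  */
void store_prefix (double *p, __m256d v, int n)
{
  /* Lane i is written iff the top bit of mask element i is set;
     set_epi64x takes the highest element first.  */
  __m256i mask = _mm256_set_epi64x (n > 3 ? -1LL : 0, n > 2 ? -1LL : 0,
                                    n > 1 ? -1LL : 0, n > 0 ? -1LL : 0);
  _mm256_maskstore_pd (p, mask, v);
}

/* Non-temporal store: bypasses the cache; dst must be 32-byte aligned.  */
void stream_store (double *dst, __m256d v)
{
  _mm256_stream_pd (dst, v);
}
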
__builtin_ia32_rcpps256 ((__v8sf)__A); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_rsqrt_ps (__m256 __A) +{ + return (__m256) __builtin_ia32_rsqrtps256 ((__v8sf)__A); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_sqrt_pd (__m256d __A) +{ + return (__m256d) __builtin_ia32_sqrtpd256 ((__v4df)__A); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_sqrt_ps (__m256 __A) +{ + return (__m256) __builtin_ia32_sqrtps256 ((__v8sf)__A); +} + +#ifdef __OPTIMIZE__ +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_round_pd (__m256d __V, const int __M) +{ + return (__m256d) __builtin_ia32_roundpd256 ((__v4df)__V, __M); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_round_ps (__m256 __V, const int __M) +{ + return (__m256) __builtin_ia32_roundps256 ((__v8sf)__V, __M); +} +#else +#define _mm256_round_pd(V, M) \ + ((__m256d) __builtin_ia32_roundpd256 ((__v4df)(__m256d)(V), (int)(M))) + +#define _mm256_round_ps(V, M) \ + ((__m256) __builtin_ia32_roundps256 ((__v8sf)(__m256)(V), (int)(M))) +#endif + +#define _mm256_ceil_pd(V) _mm256_round_pd ((V), _MM_FROUND_CEIL) +#define _mm256_floor_pd(V) _mm256_round_pd ((V), _MM_FROUND_FLOOR) +#define _mm256_ceil_ps(V) _mm256_round_ps ((V), _MM_FROUND_CEIL) +#define _mm256_floor_ps(V) _mm256_round_ps ((V), _MM_FROUND_FLOOR) + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_unpackhi_pd (__m256d __A, __m256d __B) +{ + return (__m256d) __builtin_ia32_unpckhpd256 ((__v4df)__A, (__v4df)__B); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_unpacklo_pd (__m256d __A, __m256d __B) +{ + return (__m256d) __builtin_ia32_unpcklpd256 ((__v4df)__A, (__v4df)__B); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_unpackhi_ps (__m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_unpckhps256 ((__v8sf)__A, (__v8sf)__B); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_unpacklo_ps (__m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_unpcklps256 ((__v8sf)__A, (__v8sf)__B); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_testz_pd (__m128d __M, __m128d __V) +{ + return __builtin_ia32_vtestzpd ((__v2df)__M, (__v2df)__V); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_testc_pd (__m128d __M, __m128d __V) +{ + return __builtin_ia32_vtestcpd ((__v2df)__M, (__v2df)__V); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_testnzc_pd (__m128d __M, __m128d __V) +{ + return __builtin_ia32_vtestnzcpd ((__v2df)__M, (__v2df)__V); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_testz_ps (__m128 __M, __m128 __V) +{ + return __builtin_ia32_vtestzps ((__v4sf)__M, (__v4sf)__V); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_testc_ps (__m128 __M, __m128 __V) +{ + return __builtin_ia32_vtestcps ((__v4sf)__M, (__v4sf)__V); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_testnzc_ps (__m128 __M, __m128 __V) +{ + return 
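
_mm256_ceil_* and _mm256_floor_* are thin wrappers over _mm256_round_* with fixed rounding immediates, while vrcpps/vrsqrtps return roughly 12-bit approximations that callers normally refine with a Newton-Raphson step. A sketch of both (illustrative names, -mavx assumed):

#include <immintrin.h>

/* Floor eight floats and compute a refined reciprocal of each.  */
void round_and_recip (float *out_floor, float *out_recip, const float *in)
{
  __m256 x = _mm256_loadu_ps (in);
  _mm256_storeu_ps (out_floor, _mm256_floor_ps (x));

  /* One Newton-Raphson iteration, r' = r * (2 - x * r), roughly
     doubles the precision of the vrcpps estimate.  */
  __m256 r = _mm256_rcp_ps (x);
  __m256 two = _mm256_set1_ps (2.0f);
  r = _mm256_mul_ps (r, _mm256_sub_ps (two, _mm256_mul_ps (x, r)));
  _mm256_storeu_ps (out_recip, r);
}
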
__builtin_ia32_vtestnzcps ((__v4sf)__M, (__v4sf)__V); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_testz_pd (__m256d __M, __m256d __V) +{ + return __builtin_ia32_vtestzpd256 ((__v4df)__M, (__v4df)__V); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_testc_pd (__m256d __M, __m256d __V) +{ + return __builtin_ia32_vtestcpd256 ((__v4df)__M, (__v4df)__V); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_testnzc_pd (__m256d __M, __m256d __V) +{ + return __builtin_ia32_vtestnzcpd256 ((__v4df)__M, (__v4df)__V); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_testz_ps (__m256 __M, __m256 __V) +{ + return __builtin_ia32_vtestzps256 ((__v8sf)__M, (__v8sf)__V); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_testc_ps (__m256 __M, __m256 __V) +{ + return __builtin_ia32_vtestcps256 ((__v8sf)__M, (__v8sf)__V); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_testnzc_ps (__m256 __M, __m256 __V) +{ + return __builtin_ia32_vtestnzcps256 ((__v8sf)__M, (__v8sf)__V); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_testz_si256 (__m256i __M, __m256i __V) +{ + return __builtin_ia32_ptestz256 ((__v4di)__M, (__v4di)__V); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_testc_si256 (__m256i __M, __m256i __V) +{ + return __builtin_ia32_ptestc256 ((__v4di)__M, (__v4di)__V); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_testnzc_si256 (__m256i __M, __m256i __V) +{ + return __builtin_ia32_ptestnzc256 ((__v4di)__M, (__v4di)__V); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_movemask_pd (__m256d __A) +{ + return __builtin_ia32_movmskpd256 ((__v4df)__A); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_movemask_ps (__m256 __A) +{ + return __builtin_ia32_movmskps256 ((__v8sf)__A); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_setzero_pd (void) +{ + return __extension__ (__m256d){ 0.0, 0.0, 0.0, 0.0 }; +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_setzero_ps (void) +{ + return __extension__ (__m256){ 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0 }; +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_setzero_si256 (void) +{ + return __extension__ (__m256i)(__v4di){ 0, 0, 0, 0 }; +} + +/* Create the vector [A B C D]. */ +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_set_pd (double __A, double __B, double __C, double __D) +{ + return __extension__ (__m256d){ __D, __C, __B, __A }; +} + +/* Create the vector [A B C D E F G H]. */ +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_set_ps (float __A, float __B, float __C, float __D, + float __E, float __F, float __G, float __H) +{ + return __extension__ (__m256){ __H, __G, __F, __E, + __D, __C, __B, __A }; +} + +/* Create the vector [A B C D E F G H]. 
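
The vtest/ptest wrappers and movemask let scalar code branch on a whole-vector comparison: testz reports whether the ANDed sign bits are all clear, and movemask packs the eight sign bits into an ordinary int. Two small illustrative helpers, not from the patch:

#include <immintrin.h>

/* Non-zero iff any element of a is greater than the matching element
   of b.  Compile with -mavx.  */
int any_greater (__m256d a, __m256d b)
{
  __m256d gt = _mm256_cmp_pd (a, b, _CMP_GT_OS);
  /* testz (gt, gt) is 1 only when every mask lane is all-zero.  */
  return !_mm256_testz_pd (gt, gt);
}

/* Index (0-7) of the first float lane with its sign bit set, or -1.  */
int first_negative (__m256 x)
{
  int m = _mm256_movemask_ps (x);
  return m ? __builtin_ctz (m) : -1;
}
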
*/ +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_set_epi32 (int __A, int __B, int __C, int __D, + int __E, int __F, int __G, int __H) +{ + return __extension__ (__m256i)(__v8si){ __H, __G, __F, __E, + __D, __C, __B, __A }; +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_set_epi16 (short __q15, short __q14, short __q13, short __q12, + short __q11, short __q10, short __q09, short __q08, + short __q07, short __q06, short __q05, short __q04, + short __q03, short __q02, short __q01, short __q00) +{ + return __extension__ (__m256i)(__v16hi){ + __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07, + __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15 + }; +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_set_epi8 (char __q31, char __q30, char __q29, char __q28, + char __q27, char __q26, char __q25, char __q24, + char __q23, char __q22, char __q21, char __q20, + char __q19, char __q18, char __q17, char __q16, + char __q15, char __q14, char __q13, char __q12, + char __q11, char __q10, char __q09, char __q08, + char __q07, char __q06, char __q05, char __q04, + char __q03, char __q02, char __q01, char __q00) +{ + return __extension__ (__m256i)(__v32qi){ + __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07, + __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15, + __q16, __q17, __q18, __q19, __q20, __q21, __q22, __q23, + __q24, __q25, __q26, __q27, __q28, __q29, __q30, __q31 + }; +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_set_epi64x (long long __A, long long __B, long long __C, + long long __D) +{ + return __extension__ (__m256i)(__v4di){ __D, __C, __B, __A }; +} + +/* Create a vector with all elements equal to A. */ +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_set1_pd (double __A) +{ + return __extension__ (__m256d){ __A, __A, __A, __A }; +} + +/* Create a vector with all elements equal to A. */ +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_set1_ps (float __A) +{ + return __extension__ (__m256){ __A, __A, __A, __A, + __A, __A, __A, __A }; +} + +/* Create a vector with all elements equal to A. */ +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_set1_epi32 (int __A) +{ + return __extension__ (__m256i)(__v8si){ __A, __A, __A, __A, + __A, __A, __A, __A }; +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_set1_epi16 (short __A) +{ + return _mm256_set_epi16 (__A, __A, __A, __A, __A, __A, __A, __A, + __A, __A, __A, __A, __A, __A, __A, __A); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_set1_epi8 (char __A) +{ + return _mm256_set_epi8 (__A, __A, __A, __A, __A, __A, __A, __A, + __A, __A, __A, __A, __A, __A, __A, __A, + __A, __A, __A, __A, __A, __A, __A, __A, + __A, __A, __A, __A, __A, __A, __A, __A); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_set1_epi64x (long long __A) +{ + return __extension__ (__m256i)(__v4di){ __A, __A, __A, __A }; +} + +/* Create vectors of elements in the reversed order from the + _mm256_set_XXX functions. 
*/ + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_setr_pd (double __A, double __B, double __C, double __D) +{ + return _mm256_set_pd (__D, __C, __B, __A); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_setr_ps (float __A, float __B, float __C, float __D, + float __E, float __F, float __G, float __H) +{ + return _mm256_set_ps (__H, __G, __F, __E, __D, __C, __B, __A); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_setr_epi32 (int __A, int __B, int __C, int __D, + int __E, int __F, int __G, int __H) +{ + return _mm256_set_epi32 (__H, __G, __F, __E, __D, __C, __B, __A); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_setr_epi16 (short __q15, short __q14, short __q13, short __q12, + short __q11, short __q10, short __q09, short __q08, + short __q07, short __q06, short __q05, short __q04, + short __q03, short __q02, short __q01, short __q00) +{ + return _mm256_set_epi16 (__q00, __q01, __q02, __q03, + __q04, __q05, __q06, __q07, + __q08, __q09, __q10, __q11, + __q12, __q13, __q14, __q15); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_setr_epi8 (char __q31, char __q30, char __q29, char __q28, + char __q27, char __q26, char __q25, char __q24, + char __q23, char __q22, char __q21, char __q20, + char __q19, char __q18, char __q17, char __q16, + char __q15, char __q14, char __q13, char __q12, + char __q11, char __q10, char __q09, char __q08, + char __q07, char __q06, char __q05, char __q04, + char __q03, char __q02, char __q01, char __q00) +{ + return _mm256_set_epi8 (__q00, __q01, __q02, __q03, + __q04, __q05, __q06, __q07, + __q08, __q09, __q10, __q11, + __q12, __q13, __q14, __q15, + __q16, __q17, __q18, __q19, + __q20, __q21, __q22, __q23, + __q24, __q25, __q26, __q27, + __q28, __q29, __q30, __q31); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_setr_epi64x (long long __A, long long __B, long long __C, + long long __D) +{ + return _mm256_set_epi64x (__D, __C, __B, __A); +} + +/* Casts between various SP, DP, INT vector types. Note that these do no + conversion of values, they just change the type. 
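
The setr constructors exist for readability: they take elements in memory order and simply forward to the matching set function with the arguments reversed, so the two calls below build the same vector. Illustrative:

#include <immintrin.h>

void same_vector (double *out_a, double *out_b)
{
  /* set:  first argument is the highest element (index 3).  */
  __m256d a = _mm256_set_pd (4.0, 3.0, 2.0, 1.0);
  /* setr: first argument is element 0, i.e. memory order.  */
  __m256d b = _mm256_setr_pd (1.0, 2.0, 3.0, 4.0);
  _mm256_storeu_pd (out_a, a);   /* both store 1, 2, 3, 4 */
  _mm256_storeu_pd (out_b, b);
}
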
*/ +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_castpd_ps (__m256d __A) +{ + return (__m256) __A; +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_castpd_si256 (__m256d __A) +{ + return (__m256i) __A; +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_castps_pd (__m256 __A) +{ + return (__m256d) __A; +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_castps_si256(__m256 __A) +{ + return (__m256i) __A; +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_castsi256_ps (__m256i __A) +{ + return (__m256) __A; +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_castsi256_pd (__m256i __A) +{ + return (__m256d) __A; +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_castpd256_pd128 (__m256d __A) +{ + return (__m128d) __builtin_ia32_pd_pd256 ((__v4df)__A); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_castps256_ps128 (__m256 __A) +{ + return (__m128) __builtin_ia32_ps_ps256 ((__v8sf)__A); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_castsi256_si128 (__m256i __A) +{ + return (__m128i) __builtin_ia32_si_si256 ((__v8si)__A); +} + +/* When cast is done from a 128 to 256-bit type, the low 128 bits of + the 256-bit result contain source parameter value and the upper 128 + bits of the result are undefined. Those intrinsics shouldn't + generate any extra moves. */ + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_castpd128_pd256 (__m128d __A) +{ + return (__m256d) __builtin_ia32_pd256_pd ((__v2df)__A); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_castps128_ps256 (__m128 __A) +{ + return (__m256) __builtin_ia32_ps256_ps ((__v4sf)__A); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_castsi128_si256 (__m128i __A) +{ + return (__m256i) __builtin_ia32_si256_si ((__v4si)__A); +} diff --git a/gcc/config/i386/avxmath.h b/gcc/config/i386/avxmath.h new file mode 100644 index 000000000..997842b10 --- /dev/null +++ b/gcc/config/i386/avxmath.h @@ -0,0 +1,29 @@ +/* Copyright (C) 2010 + Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. 
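
The cast intrinsics above only relabel a register, so the 256-to-128 direction is free and pairs naturally with extractf128 in reduction patterns; in the 128-to-256 direction the upper half is left undefined, which is why widening casts are normally followed by insertf128. A sketch of the cheap direction (hsum256 is an illustrative name):

#include <immintrin.h>

/* Horizontal sum of eight floats via the low and high 128-bit halves.  */
float hsum256 (__m256 v)
{
  __m128 lo = _mm256_castps256_ps128 (v);      /* no instruction emitted  */
  __m128 hi = _mm256_extractf128_ps (v, 1);
  __m128 s  = _mm_add_ps (lo, hi);
  s = _mm_hadd_ps (s, s);
  s = _mm_hadd_ps (s, s);
  return _mm_cvtss_f32 (s);
}
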
*/ + +#undef TARGET_FPMATH_DEFAULT +#define TARGET_FPMATH_DEFAULT FPMATH_SSE + +#undef TARGET_SUBTARGET_ISA_DEFAULT +#define TARGET_SUBTARGET_ISA_DEFAULT \ + (OPTION_MASK_ISA_MMX | OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_SSE2 \ + | OPTION_MASK_ISA_SSE3 | OPTION_MASK_ISA_SSSE3 \ + | OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_SSE4_2 \ + | OPTION_MASK_ISA_AVX) + diff --git a/gcc/config/i386/bdver1.md b/gcc/config/i386/bdver1.md new file mode 100644 index 000000000..3cde476b5 --- /dev/null +++ b/gcc/config/i386/bdver1.md @@ -0,0 +1,796 @@ +;; Copyright (C) 2010, Free Software Foundation, Inc. +;; +;; This file is part of GCC. +;; +;; GCC is free software; you can redistribute it and/or modify +;; it under the terms of the GNU General Public License as published by +;; the Free Software Foundation; either version 3, or (at your option) +;; any later version. +;; +;; GCC is distributed in the hope that it will be useful, +;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;; GNU General Public License for more details. +;; +;; You should have received a copy of the GNU General Public License +;; along with GCC; see the file COPYING3. If not see +;; . +;; +;; AMD bdver1 Scheduling +;; +;; The bdver1 contains four pipelined FP units, two integer units and +;; two address generation units. +;; +;; The predecode logic is determining boundaries of instructions in the 64 +;; byte cache line. So the cache line straddling problem of K6 might be issue +;; here as well, but it is not noted in the documentation. +;; +;; Three DirectPath instructions decoders and only one VectorPath decoder +;; is available. They can decode three DirectPath instructions or one +;; VectorPath instruction per cycle. +;; +;; The load/store queue unit is not attached to the schedulers but +;; communicates with all the execution units separately instead. + + +(define_attr "bdver1_decode" "direct,vector,double" + (const_string "direct")) + +(define_automaton "bdver1,bdver1_int,bdver1_load,bdver1_mult,bdver1_fp") + +(define_cpu_unit "bdver1-decode0" "bdver1") +(define_cpu_unit "bdver1-decode1" "bdver1") +(define_cpu_unit "bdver1-decode2" "bdver1") +(define_cpu_unit "bdver1-decodev" "bdver1") + +;; Model the fact that double decoded instruction may take 2 cycles +;; to decode when decoder2 and decoder0 in next cycle +;; is used (this is needed to allow throughput of 1.5 double decoded +;; instructions per cycle). +;; +;; In order to avoid dependence between reservation of decoder +;; and other units, we model decoder as two stage fully pipelined unit +;; and only double decoded instruction may occupy unit in the first cycle. +;; With this scheme however two double instructions can be issued cycle0. +;; +;; Avoid this by using presence set requiring decoder0 to be allocated +;; too. Vector decoded instructions then can't be issued when modeled +;; as consuming decoder0+decoder1+decoder2. +;; We solve that by specialized vector decoder unit and exclusion set. +(presence_set "bdver1-decode2" "bdver1-decode0") +(exclusion_set "bdver1-decodev" "bdver1-decode0,bdver1-decode1,bdver1-decode2") + +(define_reservation "bdver1-vector" "nothing,bdver1-decodev") +(define_reservation "bdver1-direct1" "nothing,bdver1-decode1") +(define_reservation "bdver1-direct" "nothing, + (bdver1-decode0 | bdver1-decode1 + | bdver1-decode2)") +;; Double instructions behaves like two direct instructions. 
+(define_reservation "bdver1-double" "((bdver1-decode2,bdver1-decode0) + | (nothing,(bdver1-decode0 + bdver1-decode1)) + | (nothing,(bdver1-decode1 + bdver1-decode2)))") + + +(define_cpu_unit "bdver1-ieu0" "bdver1_int") +(define_cpu_unit "bdver1-ieu1" "bdver1_int") +(define_reservation "bdver1-ieu" "(bdver1-ieu0 | bdver1-ieu1)") + +(define_cpu_unit "bdver1-agu0" "bdver1_int") +(define_cpu_unit "bdver1-agu1" "bdver1_int") +(define_reservation "bdver1-agu" "(bdver1-agu0 | bdver1-agu1)") + +(define_cpu_unit "bdver1-mult" "bdver1_mult") + +(define_cpu_unit "bdver1-load0" "bdver1_load") +(define_cpu_unit "bdver1-load1" "bdver1_load") +(define_reservation "bdver1-load" "bdver1-agu, + (bdver1-load0 | bdver1-load1),nothing") +;; 128bit SSE instructions issue two loads at once. +(define_reservation "bdver1-load2" "bdver1-agu, + (bdver1-load0 + bdver1-load1),nothing") + +(define_reservation "bdver1-store" "(bdver1-load0 | bdver1-load1)") +;; 128bit SSE instructions issue two stores at once. +(define_reservation "bdver1-store2" "(bdver1-load0 + bdver1-load1)") + +;; The FP operations start to execute at stage 12 in the pipeline, while +;; integer operations start to execute at stage 9 for athlon and 11 for K8 +;; Compensate the difference for athlon because it results in significantly +;; smaller automata. +;; NOTE: the above information was just copied from athlon.md, and was not +;; actually verified for bdver1. +(define_reservation "bdver1-fpsched" "nothing,nothing,nothing") +;; The floating point loads. +(define_reservation "bdver1-fpload" "(bdver1-fpsched + bdver1-load)") +(define_reservation "bdver1-fpload2" "(bdver1-fpsched + bdver1-load2)") + +;; Four FP units. +(define_cpu_unit "bdver1-ffma0" "bdver1_fp") +(define_cpu_unit "bdver1-ffma1" "bdver1_fp") +(define_cpu_unit "bdver1-fmal0" "bdver1_fp") +(define_cpu_unit "bdver1-fmal1" "bdver1_fp") + +(define_reservation "bdver1-ffma" "(bdver1-ffma0 | bdver1-ffma1)") +(define_reservation "bdver1-fcvt" "bdver1-ffma0") +(define_reservation "bdver1-fmma" "bdver1-ffma0") +(define_reservation "bdver1-fxbar" "bdver1-ffma1") +(define_reservation "bdver1-fmal" "(bdver1-fmal0 | bdver1-fmal1)") +(define_reservation "bdver1-fsto" "bdver1-fmal1") + +;; Vector operations usually consume many of pipes. +(define_reservation "bdver1-fvector" "(bdver1-ffma0 + bdver1-ffma1 + + bdver1-fmal0 + bdver1-fmal1)") + +;; Jump instructions are executed in the branch unit completely transparent to us. +(define_insn_reservation "bdver1_call" 0 + (and (eq_attr "cpu" "bdver1") + (eq_attr "type" "call,callv")) + "bdver1-double,bdver1-agu,bdver1-ieu") +;; PUSH mem is double path. +(define_insn_reservation "bdver1_push" 1 + (and (eq_attr "cpu" "bdver1") + (eq_attr "type" "push")) + "bdver1-direct,bdver1-agu,bdver1-store") +;; POP r16/mem are double path. +(define_insn_reservation "bdver1_pop" 1 + (and (eq_attr "cpu" "bdver1") + (eq_attr "type" "pop")) + "bdver1-direct,(bdver1-ieu+bdver1-load)") +;; LEAVE no latency info so far, assume same with amdfam10. +(define_insn_reservation "bdver1_leave" 3 + (and (eq_attr "cpu" "bdver1") + (eq_attr "type" "leave")) + "bdver1-vector,(bdver1-ieu+bdver1-load)") +;; LEA executes in AGU unit with 1 cycle latency on BDVER1. +(define_insn_reservation "bdver1_lea" 1 + (and (eq_attr "cpu" "bdver1") + (eq_attr "type" "lea")) + "bdver1-direct,bdver1-agu,nothing") + +;; MUL executes in special multiplier unit attached to IEU1. 
+(define_insn_reservation "bdver1_imul_DI" 6 + (and (eq_attr "cpu" "bdver1") + (and (eq_attr "type" "imul") + (and (eq_attr "mode" "DI") + (eq_attr "memory" "none,unknown")))) + "bdver1-direct1,bdver1-ieu1,bdver1-mult,nothing,bdver1-ieu1") +(define_insn_reservation "bdver1_imul" 4 + (and (eq_attr "cpu" "bdver1") + (and (eq_attr "type" "imul") + (eq_attr "memory" "none,unknown"))) + "bdver1-direct1,bdver1-ieu1,bdver1-mult,bdver1-ieu1") +(define_insn_reservation "bdver1_imul_mem_DI" 10 + (and (eq_attr "cpu" "bdver1") + (and (eq_attr "type" "imul") + (and (eq_attr "mode" "DI") + (eq_attr "memory" "load,both")))) + "bdver1-direct1,bdver1-load,bdver1-ieu,bdver1-mult,nothing,bdver1-ieu") +(define_insn_reservation "bdver1_imul_mem" 8 + (and (eq_attr "cpu" "bdver1") + (and (eq_attr "type" "imul") + (eq_attr "memory" "load,both"))) + "bdver1-direct1,bdver1-load,bdver1-ieu,bdver1-mult,bdver1-ieu") + +;; IDIV cannot execute in parallel with other instructions. Dealing with it +;; as with short latency vector instruction is good approximation avoiding +;; scheduler from trying too hard to can hide it's latency by overlap with +;; other instructions. +;; ??? Experiments show that the IDIV can overlap with roughly 6 cycles +;; of the other code. +(define_insn_reservation "bdver1_idiv" 6 + (and (eq_attr "cpu" "bdver1") + (and (eq_attr "type" "idiv") + (eq_attr "memory" "none,unknown"))) + "bdver1-vector,(bdver1-ieu0*6+(bdver1-fpsched,bdver1-fvector))") + +(define_insn_reservation "bdver1_idiv_mem" 10 + (and (eq_attr "cpu" "bdver1") + (and (eq_attr "type" "idiv") + (eq_attr "memory" "load,both"))) + "bdver1-vector,((bdver1-load,bdver1-ieu0*6)+(bdver1-fpsched,bdver1-fvector))") + +;; The parallelism of string instructions is not documented. Model it same way +;; as IDIV to create smaller automata. This probably does not matter much. +;; Using the same heuristics for bdver1 as amdfam10 and K8 with IDIV. +(define_insn_reservation "bdver1_str" 6 + (and (eq_attr "cpu" "bdver1") + (and (eq_attr "type" "str") + (eq_attr "memory" "load,both,store"))) + "bdver1-vector,bdver1-load,bdver1-ieu0*6") + +;; Integer instructions. 
+(define_insn_reservation "bdver1_idirect" 1 + (and (eq_attr "cpu" "bdver1") + (and (eq_attr "bdver1_decode" "direct") + (and (eq_attr "unit" "integer,unknown") + (eq_attr "memory" "none,unknown")))) + "bdver1-direct,bdver1-ieu") +(define_insn_reservation "bdver1_ivector" 2 + (and (eq_attr "cpu" "bdver1") + (and (eq_attr "bdver1_decode" "vector") + (and (eq_attr "unit" "integer,unknown") + (eq_attr "memory" "none,unknown")))) + "bdver1-vector,bdver1-ieu,bdver1-ieu") +(define_insn_reservation "bdver1_idirect_loadmov" 4 + (and (eq_attr "cpu" "bdver1") + (and (eq_attr "type" "imov") + (eq_attr "memory" "load"))) + "bdver1-direct,bdver1-load") +(define_insn_reservation "bdver1_idirect_load" 5 + (and (eq_attr "cpu" "bdver1") + (and (eq_attr "bdver1_decode" "direct") + (and (eq_attr "unit" "integer,unknown") + (eq_attr "memory" "load")))) + "bdver1-direct,bdver1-load,bdver1-ieu") +(define_insn_reservation "bdver1_ivector_load" 6 + (and (eq_attr "cpu" "bdver1") + (and (eq_attr "bdver1_decode" "vector") + (and (eq_attr "unit" "integer,unknown") + (eq_attr "memory" "load")))) + "bdver1-vector,bdver1-load,bdver1-ieu,bdver1-ieu") +(define_insn_reservation "bdver1_idirect_movstore" 4 + (and (eq_attr "cpu" "bdver1") + (and (eq_attr "type" "imov") + (eq_attr "memory" "store"))) + "bdver1-direct,bdver1-agu,bdver1-store") +(define_insn_reservation "bdver1_idirect_both" 4 + (and (eq_attr "cpu" "bdver1") + (and (eq_attr "bdver1_decode" "direct") + (and (eq_attr "unit" "integer,unknown") + (eq_attr "memory" "both")))) + "bdver1-direct,bdver1-load, + bdver1-ieu,bdver1-store, + bdver1-store") +(define_insn_reservation "bdver1_ivector_both" 5 + (and (eq_attr "cpu" "bdver1") + (and (eq_attr "bdver1_decode" "vector") + (and (eq_attr "unit" "integer,unknown") + (eq_attr "memory" "both")))) + "bdver1-vector,bdver1-load, + bdver1-ieu, + bdver1-ieu, + bdver1-store") +(define_insn_reservation "bdver1_idirect_store" 4 + (and (eq_attr "cpu" "bdver1") + (and (eq_attr "bdver1_decode" "direct") + (and (eq_attr "unit" "integer,unknown") + (eq_attr "memory" "store")))) + "bdver1-direct,(bdver1-ieu+bdver1-agu), + bdver1-store") +(define_insn_reservation "bdver1_ivector_store" 5 + (and (eq_attr "cpu" "bdver1") + (and (eq_attr "bdver1_decode" "vector") + (and (eq_attr "unit" "integer,unknown") + (eq_attr "memory" "store")))) + "bdver1-vector,(bdver1-ieu+bdver1-agu),bdver1-ieu, + bdver1-store") + +;; BDVER1 floating point units. 
+(define_insn_reservation "bdver1_fldxf" 13 + (and (eq_attr "cpu" "bdver1") + (and (eq_attr "type" "fmov") + (and (eq_attr "memory" "load") + (eq_attr "mode" "XF")))) + "bdver1-vector,bdver1-fpload2,bdver1-fvector*9") +(define_insn_reservation "bdver1_fld" 5 + (and (eq_attr "cpu" "bdver1") + (and (eq_attr "type" "fmov") + (eq_attr "memory" "load"))) + "bdver1-direct,bdver1-fpload,bdver1-ffma") +(define_insn_reservation "bdver1_fstxf" 8 + (and (eq_attr "cpu" "bdver1") + (and (eq_attr "type" "fmov") + (and (eq_attr "memory" "store,both") + (eq_attr "mode" "XF")))) + "bdver1-vector,(bdver1-fpsched+bdver1-agu),(bdver1-store2+(bdver1-fvector*6))") +(define_insn_reservation "bdver1_fst" 2 + (and (eq_attr "cpu" "bdver1") + (and (eq_attr "type" "fmov") + (eq_attr "memory" "store,both"))) + "bdver1-double,(bdver1-fpsched+bdver1-agu),(bdver1-fsto+bdver1-store)") +(define_insn_reservation "bdver1_fist" 2 + (and (eq_attr "cpu" "bdver1") + (eq_attr "type" "fistp,fisttp")) + "bdver1-double,(bdver1-fpsched+bdver1-agu),(bdver1-fsto+bdver1-store)") +(define_insn_reservation "bdver1_fmov_bdver1" 2 + (and (eq_attr "cpu" "bdver1") + (eq_attr "type" "fmov")) + "bdver1-direct,bdver1-fpsched,bdver1-ffma") +(define_insn_reservation "bdver1_fadd_load" 10 + (and (eq_attr "cpu" "bdver1") + (and (eq_attr "type" "fop") + (eq_attr "memory" "load"))) + "bdver1-direct,bdver1-fpload,bdver1-ffma") +(define_insn_reservation "bdver1_fadd" 6 + (and (eq_attr "cpu" "bdver1") + (eq_attr "type" "fop")) + "bdver1-direct,bdver1-fpsched,bdver1-ffma") +(define_insn_reservation "bdver1_fmul_load" 10 + (and (eq_attr "cpu" "bdver1") + (and (eq_attr "type" "fmul") + (eq_attr "memory" "load"))) + "bdver1-double,bdver1-fpload,bdver1-ffma") +(define_insn_reservation "bdver1_fmul" 6 + (and (eq_attr "cpu" "bdver1") + (eq_attr "type" "fmul")) + "bdver1-direct,bdver1-fpsched,bdver1-ffma") +(define_insn_reservation "bdver1_fsgn" 2 + (and (eq_attr "cpu" "bdver1") + (eq_attr "type" "fsgn")) + "bdver1-direct,bdver1-fpsched,bdver1-ffma") +(define_insn_reservation "bdver1_fdiv_load" 46 + (and (eq_attr "cpu" "bdver1") + (and (eq_attr "type" "fdiv") + (eq_attr "memory" "load"))) + "bdver1-direct,bdver1-fpload,bdver1-ffma") +(define_insn_reservation "bdver1_fdiv" 42 + (and (eq_attr "cpu" "bdver1") + (eq_attr "type" "fdiv")) + "bdver1-direct,bdver1-fpsched,bdver1-ffma") +(define_insn_reservation "bdver1_fpspc_load" 103 + (and (eq_attr "cpu" "bdver1") + (and (eq_attr "type" "fpspc") + (eq_attr "memory" "load"))) + "bdver1-vector,bdver1-fpload,bdver1-fvector") +(define_insn_reservation "bdver1_fpspc" 100 + (and (eq_attr "cpu" "bdver1") + (and (eq_attr "type" "fpspc") + (eq_attr "memory" "load"))) + "bdver1-vector,bdver1-fpload,bdver1-fvector") +(define_insn_reservation "bdver1_fcmov_load" 17 + (and (eq_attr "cpu" "bdver1") + (and (eq_attr "type" "fcmov") + (eq_attr "memory" "load"))) + "bdver1-vector,bdver1-fpload,bdver1-fvector") +(define_insn_reservation "bdver1_fcmov" 15 + (and (eq_attr "cpu" "bdver1") + (eq_attr "type" "fcmov")) + "bdver1-vector,bdver1-fpsched,bdver1-fvector") +(define_insn_reservation "bdver1_fcomi_load" 6 + (and (eq_attr "cpu" "bdver1") + (and (eq_attr "type" "fcmp") + (and (eq_attr "bdver1_decode" "double") + (eq_attr "memory" "load")))) + "bdver1-double,bdver1-fpload,(bdver1-ffma | bdver1-fsto)") +(define_insn_reservation "bdver1_fcomi" 2 + (and (eq_attr "cpu" "bdver1") + (and (eq_attr "bdver1_decode" "double") + (eq_attr "type" "fcmp"))) + "bdver1-double,bdver1-fpsched,(bdver1-ffma | bdver1-fsto)") +(define_insn_reservation 
"bdver1_fcom_load" 6 + (and (eq_attr "cpu" "bdver1") + (and (eq_attr "type" "fcmp") + (eq_attr "memory" "load"))) + "bdver1-direct,bdver1-fpload,bdver1-ffma") +(define_insn_reservation "bdver1_fcom" 2 + (and (eq_attr "cpu" "bdver1") + (eq_attr "type" "fcmp")) + "bdver1-direct,bdver1-fpsched,bdver1-ffma") +(define_insn_reservation "bdver1_fxch" 2 + (and (eq_attr "cpu" "bdver1") + (eq_attr "type" "fxch")) + "bdver1-direct,bdver1-fpsched,bdver1-ffma") + +;; SSE loads. +(define_insn_reservation "bdver1_ssevector_avx128_unaligned_load" 4 + (and (eq_attr "cpu" "bdver1") + (and (eq_attr "type" "ssemov") + (and (eq_attr "prefix" "vex") + (and (eq_attr "movu" "1") + (and (eq_attr "mode" "V4SF,V2DF") + (eq_attr "memory" "load")))))) + "bdver1-direct,bdver1-fpload") +(define_insn_reservation "bdver1_ssevector_avx256_unaligned_load" 5 + (and (eq_attr "cpu" "bdver1") + (and (eq_attr "type" "ssemov") + (and (eq_attr "movu" "1") + (and (eq_attr "mode" "V8SF,V4DF") + (eq_attr "memory" "load"))))) + "bdver1-double,bdver1-fpload") +(define_insn_reservation "bdver1_ssevector_sse128_unaligned_load" 4 + (and (eq_attr "cpu" "bdver1") + (and (eq_attr "type" "ssemov") + (and (eq_attr "movu" "1") + (and (eq_attr "mode" "V4SF,V2DF") + (eq_attr "memory" "load"))))) + "bdver1-direct,bdver1-fpload,bdver1-fmal") +(define_insn_reservation "bdver1_ssevector_avx128_load" 4 + (and (eq_attr "cpu" "bdver1") + (and (eq_attr "type" "ssemov") + (and (eq_attr "prefix" "vex") + (and (eq_attr "mode" "V4SF,V2DF,TI") + (eq_attr "memory" "load"))))) + "bdver1-direct,bdver1-fpload,bdver1-fmal") +(define_insn_reservation "bdver1_ssevector_avx256_load" 5 + (and (eq_attr "cpu" "bdver1") + (and (eq_attr "type" "ssemov") + (and (eq_attr "mode" "V8SF,V4DF,OI") + (eq_attr "memory" "load")))) + "bdver1-double,bdver1-fpload,bdver1-fmal") +(define_insn_reservation "bdver1_ssevector_sse128_load" 4 + (and (eq_attr "cpu" "bdver1") + (and (eq_attr "type" "ssemov") + (and (eq_attr "mode" "V4SF,V2DF,TI") + (eq_attr "memory" "load")))) + "bdver1-direct,bdver1-fpload") +(define_insn_reservation "bdver1_ssescalar_movq_load" 4 + (and (eq_attr "cpu" "bdver1") + (and (eq_attr "type" "ssemov") + (and (eq_attr "mode" "DI") + (eq_attr "memory" "load")))) + "bdver1-direct,bdver1-fpload,bdver1-fmal") +(define_insn_reservation "bdver1_ssescalar_vmovss_load" 4 + (and (eq_attr "cpu" "bdver1") + (and (eq_attr "type" "ssemov") + (and (eq_attr "prefix" "vex") + (and (eq_attr "mode" "SF") + (eq_attr "memory" "load"))))) + "bdver1-direct,bdver1-fpload") +(define_insn_reservation "bdver1_ssescalar_sse128_load" 4 + (and (eq_attr "cpu" "bdver1") + (and (eq_attr "type" "ssemov") + (and (eq_attr "mode" "SF,DF") + (eq_attr "memory" "load")))) + "bdver1-direct,bdver1-fpload, bdver1-ffma") +(define_insn_reservation "bdver1_mmxsse_load" 4 + (and (eq_attr "cpu" "bdver1") + (and (eq_attr "type" "mmxmov,ssemov") + (eq_attr "memory" "load"))) + "bdver1-direct,bdver1-fpload, bdver1-fmal") + +;; SSE stores. 
+(define_insn_reservation "bdver1_sse_store_avx256" 5 + (and (eq_attr "cpu" "bdver1") + (and (eq_attr "type" "ssemov") + (and (eq_attr "mode" "V8SF,V4DF,OI") + (eq_attr "memory" "store,both")))) + "bdver1-double,(bdver1-fpsched+bdver1-agu),((bdver1-fsto+bdver1-store)*2)") +(define_insn_reservation "bdver1_sse_store" 4 + (and (eq_attr "cpu" "bdver1") + (and (eq_attr "type" "ssemov") + (and (eq_attr "mode" "V4SF,V2DF,TI") + (eq_attr "memory" "store,both")))) + "bdver1-direct,(bdver1-fpsched+bdver1-agu),((bdver1-fsto+bdver1-store)*2)") +(define_insn_reservation "bdver1_mmxsse_store_short" 4 + (and (eq_attr "cpu" "bdver1") + (and (eq_attr "type" "mmxmov,ssemov") + (eq_attr "memory" "store,both"))) + "bdver1-direct,(bdver1-fpsched+bdver1-agu),(bdver1-fsto+bdver1-store)") + +;; Register moves. +(define_insn_reservation "bdver1_ssevector_avx256" 3 + (and (eq_attr "cpu" "bdver1") + (and (eq_attr "type" "ssemov") + (and (eq_attr "mode" "V8SF,V4DF,OI") + (eq_attr "memory" "none")))) + "bdver1-double,bdver1-fpsched,bdver1-fmal") +(define_insn_reservation "bdver1_movss_movsd" 2 + (and (eq_attr "cpu" "bdver1") + (and (eq_attr "type" "ssemov") + (and (eq_attr "mode" "SF,DF") + (eq_attr "memory" "none")))) + "bdver1-direct,bdver1-fpsched,bdver1-ffma") +(define_insn_reservation "bdver1_mmxssemov" 2 + (and (eq_attr "cpu" "bdver1") + (and (eq_attr "type" "mmxmov,ssemov") + (eq_attr "memory" "none"))) + "bdver1-direct,bdver1-fpsched,bdver1-fmal") +;; SSE logs. +(define_insn_reservation "bdver1_sselog_load_256" 7 + (and (eq_attr "cpu" "bdver1") + (and (eq_attr "type" "sselog,sselog1") + (and (eq_attr "mode" "V8SF") + (eq_attr "memory" "load")))) + "bdver1-double,bdver1-fpload,bdver1-fmal") +(define_insn_reservation "bdver1_sselog_256" 3 + (and (eq_attr "cpu" "bdver1") + (and (eq_attr "type" "sselog,sselog1") + (eq_attr "mode" "V8SF"))) + "bdver1-double,bdver1-fpsched,bdver1-fmal") +(define_insn_reservation "bdver1_sselog_load" 6 + (and (eq_attr "cpu" "bdver1") + (and (eq_attr "type" "sselog,sselog1") + (eq_attr "memory" "load"))) + "bdver1-direct,bdver1-fpload,bdver1-fxbar") +(define_insn_reservation "bdver1_sselog" 2 + (and (eq_attr "cpu" "bdver1") + (eq_attr "type" "sselog,sselog1")) + "bdver1-direct,bdver1-fpsched,bdver1-fxbar") + +;; PCMP actually executes in FMAL. +(define_insn_reservation "bdver1_ssecmp_load" 6 + (and (eq_attr "cpu" "bdver1") + (and (eq_attr "type" "ssecmp") + (eq_attr "memory" "load"))) + "bdver1-direct,bdver1-fpload,bdver1-ffma") +(define_insn_reservation "bdver1_ssecmp" 2 + (and (eq_attr "cpu" "bdver1") + (eq_attr "type" "ssecmp")) + "bdver1-direct,bdver1-fpsched,bdver1-ffma") +(define_insn_reservation "bdver1_ssecomi_load" 6 + (and (eq_attr "cpu" "bdver1") + (and (eq_attr "type" "ssecomi") + (eq_attr "memory" "load"))) + "bdver1-double,bdver1-fpload,(bdver1-ffma | bdver1-fsto)") +(define_insn_reservation "bdver1_ssecomi" 2 + (and (eq_attr "cpu" "bdver1") + (eq_attr "type" "ssecomi")) + "bdver1-double,bdver1-fpsched,(bdver1-ffma | bdver1-fsto)") + +;; Conversions behaves very irregularly and the scheduling is critical here. +;; Take each instruction separately. + +;; 256 bit conversion. 
+(define_insn_reservation "bdver1_vcvtX2Y_avx256_load" 8 + (and (eq_attr "cpu" "bdver1") + (and (eq_attr "type" "ssecvt") + (and (eq_attr "memory" "load") + (ior (ior (match_operand:V4DF 0 "register_operand") + (ior (match_operand:V8SF 0 "register_operand") + (match_operand:V8SI 0 "register_operand"))) + (ior (match_operand:V4DF 1 "nonimmediate_operand") + (ior (match_operand:V8SF 1 "nonimmediate_operand") + (match_operand:V8SI 1 "nonimmediate_operand"))))))) + "bdver1-vector,bdver1-fpload,bdver1-fvector") +(define_insn_reservation "bdver1_vcvtX2Y_avx256" 4 + (and (eq_attr "cpu" "bdver1") + (and (eq_attr "type" "ssecvt") + (and (eq_attr "memory" "none") + (ior (ior (match_operand:V4DF 0 "register_operand") + (ior (match_operand:V8SF 0 "register_operand") + (match_operand:V8SI 0 "register_operand"))) + (ior (match_operand:V4DF 1 "nonimmediate_operand") + (ior (match_operand:V8SF 1 "nonimmediate_operand") + (match_operand:V8SI 1 "nonimmediate_operand"))))))) + "bdver1-vector,bdver1-fpsched,bdver1-fvector") +;; CVTSS2SD, CVTSD2SS. +(define_insn_reservation "bdver1_ssecvt_cvtss2sd_load" 8 + (and (eq_attr "cpu" "bdver1") + (and (eq_attr "type" "ssecvt") + (and (eq_attr "mode" "SF,DF") + (eq_attr "memory" "load")))) + "bdver1-direct,bdver1-fpload,bdver1-fcvt") +(define_insn_reservation "bdver1_ssecvt_cvtss2sd" 4 + (and (eq_attr "cpu" "bdver1") + (and (eq_attr "type" "ssecvt") + (and (eq_attr "mode" "SF,DF") + (eq_attr "memory" "none")))) + "bdver1-direct,bdver1-fpsched,bdver1-fcvt") +;; CVTSI2SD, CVTSI2SS, CVTSI2SDQ, CVTSI2SSQ. +(define_insn_reservation "bdver1_sseicvt_cvtsi2sd_load" 8 + (and (eq_attr "cpu" "bdver1") + (and (eq_attr "type" "sseicvt") + (and (eq_attr "mode" "SF,DF") + (eq_attr "memory" "load")))) + "bdver1-direct,bdver1-fpload,bdver1-fcvt") +(define_insn_reservation "bdver1_sseicvt_cvtsi2sd" 4 + (and (eq_attr "cpu" "bdver1") + (and (eq_attr "type" "sseicvt") + (and (eq_attr "mode" "SF,DF") + (eq_attr "memory" "none")))) + "bdver1-double,bdver1-fpsched,(nothing | bdver1-fcvt)") +;; CVTPD2PS. +(define_insn_reservation "bdver1_ssecvt_cvtpd2ps_load" 8 + (and (eq_attr "cpu" "bdver1") + (and (eq_attr "type" "ssecvt") + (and (eq_attr "memory" "load") + (and (match_operand:V4SF 0 "register_operand") + (match_operand:V2DF 1 "nonimmediate_operand"))))) + "bdver1-double,bdver1-fpload,(bdver1-fxbar | bdver1-fcvt)") +(define_insn_reservation "bdver1_ssecvt_cvtpd2ps" 4 + (and (eq_attr "cpu" "bdver1") + (and (eq_attr "type" "ssecvt") + (and (eq_attr "memory" "none") + (and (match_operand:V4SF 0 "register_operand") + (match_operand:V2DF 1 "nonimmediate_operand"))))) + "bdver1-double,bdver1-fpsched,(bdver1-fxbar | bdver1-fcvt)") +;; CVTPI2PS, CVTDQ2PS. +(define_insn_reservation "bdver1_ssecvt_cvtdq2ps_load" 8 + (and (eq_attr "cpu" "bdver1") + (and (eq_attr "type" "ssecvt") + (and (eq_attr "memory" "load") + (and (match_operand:V4SF 0 "register_operand") + (ior (match_operand:V2SI 1 "nonimmediate_operand") + (match_operand:V4SI 1 "nonimmediate_operand")))))) + "bdver1-direct,bdver1-fpload,bdver1-fcvt") +(define_insn_reservation "bdver1_ssecvt_cvtdq2ps" 4 + (and (eq_attr "cpu" "bdver1") + (and (eq_attr "type" "ssecvt") + (and (eq_attr "memory" "none") + (and (match_operand:V4SF 0 "register_operand") + (ior (match_operand:V2SI 1 "nonimmediate_operand") + (match_operand:V4SI 1 "nonimmediate_operand")))))) + "bdver1-direct,bdver1-fpsched,bdver1-fcvt") +;; CVTDQ2PD. 
+(define_insn_reservation "bdver1_ssecvt_cvtdq2pd_load" 8 + (and (eq_attr "cpu" "bdver1") + (and (eq_attr "type" "ssecvt") + (and (eq_attr "memory" "load") + (and (match_operand:V2DF 0 "register_operand") + (match_operand:V4SI 1 "nonimmediate_operand"))))) + "bdver1-double,bdver1-fpload,(bdver1-fxbar | bdver1-fcvt)") +(define_insn_reservation "bdver1_ssecvt_cvtdq2pd" 4 + (and (eq_attr "cpu" "bdver1") + (and (eq_attr "type" "ssecvt") + (and (eq_attr "memory" "none") + (and (match_operand:V2DF 0 "register_operand") + (match_operand:V4SI 1 "nonimmediate_operand"))))) + "bdver1-double,bdver1-fpsched,(bdver1-fxbar | bdver1-fcvt)") +;; CVTPS2PD, CVTPI2PD. +(define_insn_reservation "bdver1_ssecvt_cvtps2pd_load" 6 + (and (eq_attr "cpu" "bdver1") + (and (eq_attr "type" "ssecvt") + (and (eq_attr "memory" "load") + (and (match_operand:V2DF 0 "register_operand") + (ior (match_operand:V2SI 1 "nonimmediate_operand") + (match_operand:V4SF 1 "nonimmediate_operand")))))) + "bdver1-double,bdver1-fpload,(bdver1-fxbar | bdver1-fcvt)") +(define_insn_reservation "bdver1_ssecvt_cvtps2pd" 2 + (and (eq_attr "cpu" "bdver1") + (and (eq_attr "type" "ssecvt") + (and (eq_attr "memory" "load") + (and (match_operand:V2DF 0 "register_operand") + (ior (match_operand:V2SI 1 "nonimmediate_operand") + (match_operand:V4SF 1 "nonimmediate_operand")))))) + "bdver1-double,bdver1-fpsched,(bdver1-fxbar | bdver1-fcvt)") +;; CVTSD2SI, CVTSD2SIQ, CVTSS2SI, CVTSS2SIQ, CVTTSD2SI, CVTTSD2SIQ, CVTTSS2SI, CVTTSS2SIQ. +(define_insn_reservation "bdver1_ssecvt_cvtsX2si_load" 8 + (and (eq_attr "cpu" "bdver1") + (and (eq_attr "type" "sseicvt") + (and (eq_attr "mode" "SI,DI") + (eq_attr "memory" "load")))) + "bdver1-double,bdver1-fpload,(bdver1-fcvt | bdver1-fsto)") +(define_insn_reservation "bdver1_ssecvt_cvtsX2si" 4 + (and (eq_attr "cpu" "bdver1") + (and (eq_attr "type" "sseicvt") + (and (eq_attr "mode" "SI,DI") + (eq_attr "memory" "none")))) + "bdver1-double,bdver1-fpsched,(bdver1-fcvt | bdver1-fsto)") +;; CVTPD2PI, CVTTPD2PI. +(define_insn_reservation "bdver1_ssecvt_cvtpd2pi_load" 8 + (and (eq_attr "cpu" "bdver1") + (and (eq_attr "type" "ssecvt") + (and (eq_attr "memory" "load") + (and (match_operand:V2DF 1 "nonimmediate_operand") + (match_operand:V2SI 0 "register_operand"))))) + "bdver1-double,bdver1-fpload,(bdver1-fcvt | bdver1-fxbar)") +(define_insn_reservation "bdver1_ssecvt_cvtpd2pi" 4 + (and (eq_attr "cpu" "bdver1") + (and (eq_attr "type" "ssecvt") + (and (eq_attr "memory" "none") + (and (match_operand:V2DF 1 "nonimmediate_operand") + (match_operand:V2SI 0 "register_operand"))))) + "bdver1-double,bdver1-fpsched,(bdver1-fcvt | bdver1-fxbar)") +;; CVTPD2DQ, CVTTPD2DQ. +(define_insn_reservation "bdver1_ssecvt_cvtpd2dq_load" 6 + (and (eq_attr "cpu" "bdver1") + (and (eq_attr "type" "ssecvt") + (and (eq_attr "memory" "load") + (and (match_operand:V2DF 1 "nonimmediate_operand") + (match_operand:V4SI 0 "register_operand"))))) + "bdver1-double,bdver1-fpload,(bdver1-fcvt | bdver1-fxbar)") +(define_insn_reservation "bdver1_ssecvt_cvtpd2dq" 2 + (and (eq_attr "cpu" "bdver1") + (and (eq_attr "type" "ssecvt") + (and (eq_attr "memory" "none") + (and (match_operand:V2DF 1 "nonimmediate_operand") + (match_operand:V4SI 0 "register_operand"))))) + "bdver1-double,bdver1-fpsched,(bdver1-fcvt | bdver1-fxbar)") +;; CVTPS2PI, CVTTPS2PI, CVTPS2DQ, CVTTPS2DQ. 
+(define_insn_reservation "bdver1_ssecvt_cvtps2pi_load" 8 + (and (eq_attr "cpu" "bdver1") + (and (eq_attr "type" "ssecvt") + (and (eq_attr "memory" "load") + (and (match_operand:V4SF 1 "nonimmediate_operand") + (ior (match_operand: V2SI 0 "register_operand") + (match_operand: V4SI 0 "register_operand")))))) + "bdver1-direct,bdver1-fpload,bdver1-fcvt") +(define_insn_reservation "bdver1_ssecvt_cvtps2pi" 4 + (and (eq_attr "cpu" "bdver1") + (and (eq_attr "type" "ssecvt") + (and (eq_attr "memory" "none") + (and (match_operand:V4SF 1 "nonimmediate_operand") + (ior (match_operand: V2SI 0 "register_operand") + (match_operand: V4SI 0 "register_operand")))))) + "bdver1-direct,bdver1-fpsched,bdver1-fcvt") + +;; SSE MUL, ADD, and MULADD. +(define_insn_reservation "bdver1_ssemuladd_load_256" 11 + (and (eq_attr "cpu" "bdver1") + (and (eq_attr "type" "ssemul,sseadd,ssemuladd") + (and (eq_attr "mode" "V8SF,V4DF") + (eq_attr "memory" "load")))) + "bdver1-double,bdver1-fpload,bdver1-ffma") +(define_insn_reservation "bdver1_ssemuladd_256" 7 + (and (eq_attr "cpu" "bdver1") + (and (eq_attr "type" "ssemul,sseadd,ssemuladd") + (and (eq_attr "mode" "V8SF,V4DF") + (eq_attr "memory" "none")))) + "bdver1-double,bdver1-fpsched,bdver1-ffma") +(define_insn_reservation "bdver1_ssemuladd_load" 10 + (and (eq_attr "cpu" "bdver1") + (and (eq_attr "type" "ssemul,sseadd,ssemuladd") + (eq_attr "memory" "load"))) + "bdver1-direct,bdver1-fpload,bdver1-ffma") +(define_insn_reservation "bdver1_ssemuladd" 6 + (and (eq_attr "cpu" "bdver1") + (and (eq_attr "type" "ssemul,sseadd,ssemuladd") + (eq_attr "memory" "none"))) + "bdver1-direct,bdver1-fpsched,bdver1-ffma") +(define_insn_reservation "bdver1_sseimul_load" 8 + (and (eq_attr "cpu" "bdver1") + (and (eq_attr "type" "sseimul") + (eq_attr "memory" "load"))) + "bdver1-direct,bdver1-fpload,bdver1-fmma") +(define_insn_reservation "bdver1_sseimul" 4 + (and (eq_attr "cpu" "bdver1") + (and (eq_attr "type" "sseimul") + (eq_attr "memory" "none"))) + "bdver1-direct,bdver1-fpsched,bdver1-fmma") +(define_insn_reservation "bdver1_sseiadd_load" 6 + (and (eq_attr "cpu" "bdver1") + (and (eq_attr "type" "sseiadd") + (eq_attr "memory" "load"))) + "bdver1-direct,bdver1-fpload,bdver1-fmal") +(define_insn_reservation "bdver1_sseiadd" 2 + (and (eq_attr "cpu" "bdver1") + (and (eq_attr "type" "sseiadd") + (eq_attr "memory" "none"))) + "bdver1-direct,bdver1-fpsched,bdver1-fmal") + +;; SSE DIV: no throughput information (assume same as amdfam10). 
+(define_insn_reservation "bdver1_ssediv_double_load_256" 31 + (and (eq_attr "cpu" "bdver1") + (and (eq_attr "type" "ssediv") + (and (eq_attr "mode" "V4DF") + (eq_attr "memory" "load")))) + "bdver1-double,bdver1-fpload,(bdver1-ffma0*17 | bdver1-ffma1*17)") +(define_insn_reservation "bdver1_ssediv_double_256" 27 + (and (eq_attr "cpu" "bdver1") + (and (eq_attr "type" "ssediv") + (and (eq_attr "mode" "V4DF") + (eq_attr "memory" "none")))) + "bdver1-double,bdver1-fpsched,(bdver1-ffma0*17 | bdver1-ffma1*17)") +(define_insn_reservation "bdver1_ssediv_single_load_256" 28 + (and (eq_attr "cpu" "bdver1") + (and (eq_attr "type" "ssediv") + (and (eq_attr "mode" "V8SF") + (eq_attr "memory" "load")))) + "bdver1-double,bdver1-fpload,(bdver1-ffma0*17 | bdver1-ffma1*17)") +(define_insn_reservation "bdver1_ssediv_single_256" 24 + (and (eq_attr "cpu" "bdver1") + (and (eq_attr "type" "ssediv") + (and (eq_attr "mode" "V8SF") + (eq_attr "memory" "none")))) + "bdver1-double,bdver1-fpsched,(bdver1-ffma0*17 | bdver1-ffma1*17)") +(define_insn_reservation "bdver1_ssediv_double_load" 31 + (and (eq_attr "cpu" "bdver1") + (and (eq_attr "type" "ssediv") + (and (eq_attr "mode" "DF,V2DF") + (eq_attr "memory" "load")))) + "bdver1-direct,bdver1-fpload,(bdver1-ffma0*17 | bdver1-ffma1*17)") +(define_insn_reservation "bdver1_ssediv_double" 27 + (and (eq_attr "cpu" "bdver1") + (and (eq_attr "type" "ssediv") + (and (eq_attr "mode" "DF,V2DF") + (eq_attr "memory" "none")))) + "bdver1-direct,bdver1-fpsched,(bdver1-ffma0*17 | bdver1-ffma1*17)") +(define_insn_reservation "bdver1_ssediv_single_load" 28 + (and (eq_attr "cpu" "bdver1") + (and (eq_attr "type" "ssediv") + (and (eq_attr "mode" "SF,V4SF") + (eq_attr "memory" "load")))) + "bdver1-direct,bdver1-fpload,(bdver1-ffma0*17 | bdver1-ffma1*17)") +(define_insn_reservation "bdver1_ssediv_single" 24 + (and (eq_attr "cpu" "bdver1") + (and (eq_attr "type" "ssediv") + (and (eq_attr "mode" "SF,V4SF") + (eq_attr "memory" "none")))) + "bdver1-direct,bdver1-fpsched,(bdver1-ffma0*17 | bdver1-ffma1*17)") + +(define_insn_reservation "bdver1_sseins" 3 + (and (eq_attr "cpu" "bdver1") + (and (eq_attr "type" "sseins") + (eq_attr "mode" "TI"))) + "bdver1-direct,bdver1-fpsched,bdver1-fxbar") + diff --git a/gcc/config/i386/biarch64.h b/gcc/config/i386/biarch64.h new file mode 100644 index 000000000..629ec980d --- /dev/null +++ b/gcc/config/i386/biarch64.h @@ -0,0 +1,29 @@ +/* Make configure files to produce biarch compiler defaulting to 64bit mode. + This file must be included very first, while the OS specific file later + to overwrite otherwise wrong defaults. + Copyright (C) 2001, 2007, 2009 Free Software Foundation, Inc. + Contributed by Bo Thorsen . + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +Under Section 7 of GPL version 3, you are granted additional +permissions described in the GCC Runtime Library Exception, version +3.1, as published by the Free Software Foundation. 
+ +You should have received a copy of the GNU General Public License and +a copy of the GCC Runtime Library Exception along with this program; +see the files COPYING3 and COPYING.RUNTIME respectively. If not, see +. */ + +#define TARGET_64BIT_DEFAULT OPTION_MASK_ISA_64BIT +#define TARGET_BI_ARCH 1 diff --git a/gcc/config/i386/bmiintrin.h b/gcc/config/i386/bmiintrin.h new file mode 100644 index 000000000..225f2ecbe --- /dev/null +++ b/gcc/config/i386/bmiintrin.h @@ -0,0 +1,145 @@ +/* Copyright (C) 2010 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +#ifndef _X86INTRIN_H_INCLUDED +# error "Never use directly; include instead." +#endif + +#ifndef __BMI__ +# error "BMI instruction set not enabled" +#endif /* __BMI__ */ + +#ifndef _BMIINTRIN_H_INCLUDED +#define _BMIINTRIN_H_INCLUDED + +extern __inline unsigned short __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__lzcnt_u16 (unsigned short __X) +{ + return __builtin_clzs (__X); +} + +extern __inline unsigned short __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__tzcnt_u16 (unsigned short __X) +{ + return __builtin_ctzs (__X); +} + + +extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__andn_u32 (unsigned int __X, unsigned int __Y) +{ + unsigned int tmp = ~(__X) & (__Y); + return tmp; +} + +extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__bextr_u32 (unsigned int __X, unsigned int __Y) +{ + return __builtin_ia32_bextr_u32 (__X, __Y); +} + +extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__blsi_u32 (unsigned int __X) +{ + unsigned int tmp = (__X) & (-(__X)); + return tmp; +} + +extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__blsmsk_u32 (unsigned int __X) +{ + unsigned int tmp = (__X) ^ (__X - 1); + return tmp; +} + +extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__blsr_u32 (unsigned int __X) +{ + unsigned int tmp = (__X) & (__X - 1); + return tmp; +} + +extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__lzcnt_u32 (unsigned int __X) +{ + return __builtin_clz (__X); +} + + +extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__tzcnt_u32 (unsigned int __X) +{ + return __builtin_ctz (__X); +} + + +#ifdef __x86_64__ +extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__andn_u64 (unsigned long long __X, unsigned long long __Y) +{ + unsigned long long tmp = 
~(__X) & (__Y); + return tmp; +} + +extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__bextr_u64 (unsigned long long __X, unsigned long long __Y) +{ + return __builtin_ia32_bextr_u64 (__X, __Y); +} + +extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__blsi_u64 (unsigned long long __X) +{ + unsigned long long tmp = (__X) & (-(__X)); + return tmp; +} + +extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__blsmsk_u64 (unsigned long long __X) +{ + unsigned long long tmp = (__X) ^ (__X - 1); + return tmp; +} + +extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__blsr_u64 (unsigned long long __X) +{ + unsigned long long tmp = (__X) & (__X - 1); + return tmp; +} + +extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__lzcnt_u64 (unsigned long long __X) +{ + return __builtin_clzll (__X); +} + +extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__tzcnt_u64 (unsigned long long __X) +{ + return __builtin_ctzll (__X); +} + +#endif /* __x86_64__ */ + +#endif /* _BMIINTRIN_H_INCLUDED */ diff --git a/gcc/config/i386/bmmintrin.h b/gcc/config/i386/bmmintrin.h new file mode 100644 index 000000000..91d4e7742 --- /dev/null +++ b/gcc/config/i386/bmmintrin.h @@ -0,0 +1,29 @@ +/* Copyright (C) 2007, 2008, 2009 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +#ifndef _BMMINTRIN_H_INCLUDED +#define _BMMINTRIN_H_INCLUDED + +# error "SSE5 instruction set removed from compiler" + +#endif /* _BMMINTRIN_H_INCLUDED */ diff --git a/gcc/config/i386/bsd.h b/gcc/config/i386/bsd.h new file mode 100644 index 000000000..e408ccdb0 --- /dev/null +++ b/gcc/config/i386/bsd.h @@ -0,0 +1,100 @@ +/* Definitions for BSD assembler syntax for Intel 386 + (actually AT&T syntax for insns and operands, + adapted to BSD conventions for symbol names and debugging.) + Copyright (C) 1988, 1996, 2000, 2002, 2007, 2008 + Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. */ + +/* Use the Sequent Symmetry assembler syntax. */ + +/* Define the syntax of pseudo-ops, labels and comments. */ + +/* Prefix for internally generated assembler labels. If we aren't using + underscores, we are using prefix `.'s to identify labels that should + be ignored, as in `i386/gas.h' --karl@cs.umb.edu */ + +#define LPREFIX "L" + +/* Assembler pseudos to introduce constants of various size. */ + +#define ASM_BYTE "\t.byte\t" +#define ASM_SHORT "\t.word\t" +#define ASM_LONG "\t.long\t" +#define ASM_QUAD "\t.quad\t" /* Should not be used for 32bit compilation. */ + +/* This was suggested, but it shouldn't be right for DBX output. -- RMS + #define ASM_OUTPUT_SOURCE_FILENAME(FILE, NAME) */ + + +/* Define the syntax of labels and symbol definitions/declarations. */ + +/* This is how to output an assembler line + that says to advance the location counter by SIZE bytes. */ + +#define ASM_OUTPUT_SKIP(FILE,SIZE) \ + fprintf (FILE, "\t.space "HOST_WIDE_INT_PRINT_UNSIGNED"\n", (SIZE)) + +/* Define the syntax of labels and symbol definitions/declarations. */ + +/* This says how to output an assembler line + to define a global common symbol. */ + +#define ASM_OUTPUT_COMMON(FILE, NAME, SIZE, ROUNDED) \ +( fputs (".comm ", (FILE)), \ + assemble_name ((FILE), (NAME)), \ + fprintf ((FILE), ",%u\n", (int)(ROUNDED))) + +/* This says how to output an assembler line + to define a local common symbol. */ + +#define ASM_OUTPUT_LOCAL(FILE, NAME, SIZE, ROUNDED) \ +( fputs (".lcomm ", (FILE)), \ + assemble_name ((FILE), (NAME)), \ + fprintf ((FILE), ",%u\n", (int)(ROUNDED))) + +#ifdef HAVE_GAS_LCOMM_WITH_ALIGNMENT +#define ASM_OUTPUT_ALIGNED_LOCAL(FILE, NAME, SIZE, ALIGNMENT) \ +( fputs (".lcomm ", (FILE)), \ + assemble_name ((FILE), (NAME)), \ + fprintf ((FILE), ",%u,%u\n", (int)(SIZE), (int)(ALIGNMENT) / BITS_PER_UNIT)) +#endif + +/* This is how to output an assembler line + that says to advance the location counter + to a multiple of 2**LOG bytes. */ + +#define ASM_OUTPUT_ALIGN(FILE,LOG) \ + if ((LOG)!=0) fprintf ((FILE), "\t.align %d\n", (LOG)) + +/* This is how to store into the string BUF + the symbol_ref name of an internal numbered label where + PREFIX is the class of label and NUM is the number within the class. + This is suitable for output with `assemble_name'. */ + +#define ASM_GENERATE_INTERNAL_LABEL(BUF,PREFIX,NUMBER) \ + sprintf ((BUF), "*%s%ld", (PREFIX), (long)(NUMBER)) + +/* The prefix to add to user-visible assembler symbols. */ + +#define USER_LABEL_PREFIX "_" + +/* Sequent has some changes in the format of DBX symbols. */ +#define DBX_NO_XREFS 1 + +/* Don't split DBX symbols into continuations. */ +#define DBX_CONTIN_LENGTH 0 diff --git a/gcc/config/i386/constraints.md b/gcc/config/i386/constraints.md new file mode 100644 index 000000000..6233b79ec --- /dev/null +++ b/gcc/config/i386/constraints.md @@ -0,0 +1,175 @@ +;; Constraint definitions for IA-32 and x86-64. +;; Copyright (C) 2006, 2007 Free Software Foundation, Inc. +;; +;; This file is part of GCC. +;; +;; GCC is free software; you can redistribute it and/or modify +;; it under the terms of the GNU General Public License as published by +;; the Free Software Foundation; either version 3, or (at your option) +;; any later version. 
+;; +;; GCC is distributed in the hope that it will be useful, +;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;; GNU General Public License for more details. +;; +;; You should have received a copy of the GNU General Public License +;; along with GCC; see the file COPYING3. If not see +;; . + +;;; Unused letters: +;;; B H T W +;;; h jk vw + +;; Integer register constraints. +;; It is not necessary to define 'r' here. +(define_register_constraint "R" "LEGACY_REGS" + "Legacy register---the eight integer registers available on all + i386 processors (@code{a}, @code{b}, @code{c}, @code{d}, + @code{si}, @code{di}, @code{bp}, @code{sp}).") + +(define_register_constraint "q" "TARGET_64BIT ? GENERAL_REGS : Q_REGS" + "Any register accessible as @code{@var{r}l}. In 32-bit mode, @code{a}, + @code{b}, @code{c}, and @code{d}; in 64-bit mode, any integer register.") + +(define_register_constraint "Q" "Q_REGS" + "Any register accessible as @code{@var{r}h}: @code{a}, @code{b}, + @code{c}, and @code{d}.") + +(define_register_constraint "l" "INDEX_REGS" + "@internal Any register that can be used as the index in a base+index + memory access: that is, any general register except the stack pointer.") + +(define_register_constraint "a" "AREG" + "The @code{a} register.") + +(define_register_constraint "b" "BREG" + "The @code{b} register.") + +(define_register_constraint "c" "CREG" + "The @code{c} register.") + +(define_register_constraint "d" "DREG" + "The @code{d} register.") + +(define_register_constraint "S" "SIREG" + "The @code{si} register.") + +(define_register_constraint "D" "DIREG" + "The @code{di} register.") + +(define_register_constraint "A" "AD_REGS" + "The @code{a} and @code{d} registers, as a pair (for instructions + that return half the result in one and half in the other).") + +(define_register_constraint "U" "CLOBBERED_REGS" + "The call-clobbered integer registers.") + +;; Floating-point register constraints. +(define_register_constraint "f" + "TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387 ? FLOAT_REGS : NO_REGS" + "Any 80387 floating-point (stack) register.") + +(define_register_constraint "t" + "TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387 ? FP_TOP_REG : NO_REGS" + "Top of 80387 floating-point stack (@code{%st(0)}).") + +(define_register_constraint "u" + "TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387 ? FP_SECOND_REG : NO_REGS" + "Second from top of 80387 floating-point stack (@code{%st(1)}).") + +;; Vector registers (also used for plain floating point nowadays). +(define_register_constraint "y" "TARGET_MMX ? MMX_REGS : NO_REGS" + "Any MMX register.") + +(define_register_constraint "x" "TARGET_SSE ? SSE_REGS : NO_REGS" + "Any SSE register.") + +;; We use the Y prefix to denote any number of conditional register sets: +;; z First SSE register. +;; 2 SSE2 enabled +;; i SSE2 inter-unit moves enabled +;; m MMX inter-unit moves enabled + +(define_register_constraint "Yz" "TARGET_SSE ? SSE_FIRST_REG : NO_REGS" + "First SSE register (@code{%xmm0}).") + +(define_register_constraint "Y2" "TARGET_SSE2 ? SSE_REGS : NO_REGS" + "@internal Any SSE register, when SSE2 is enabled.") + +(define_register_constraint "Yi" + "TARGET_SSE2 && TARGET_INTER_UNIT_MOVES ? SSE_REGS : NO_REGS" + "@internal Any SSE register, when SSE2 and inter-unit moves are enabled.") + +(define_register_constraint "Ym" + "TARGET_MMX && TARGET_INTER_UNIT_MOVES ? 
MMX_REGS : NO_REGS" + "@internal Any MMX register, when inter-unit moves are enabled.") + +(define_constraint "z" + "@internal Constant call address operand." + (match_operand 0 "constant_call_address_operand")) + +;; Integer constant constraints. +(define_constraint "I" + "Integer constant in the range 0 @dots{} 31, for 32-bit shifts." + (and (match_code "const_int") + (match_test "IN_RANGE (ival, 0, 31)"))) + +(define_constraint "J" + "Integer constant in the range 0 @dots{} 63, for 64-bit shifts." + (and (match_code "const_int") + (match_test "IN_RANGE (ival, 0, 63)"))) + +(define_constraint "K" + "Signed 8-bit integer constant." + (and (match_code "const_int") + (match_test "IN_RANGE (ival, -128, 127)"))) + +(define_constraint "L" + "@code{0xFF} or @code{0xFFFF}, for andsi as a zero-extending move." + (and (match_code "const_int") + (match_test "ival == 0xFF || ival == 0xFFFF"))) + +(define_constraint "M" + "0, 1, 2, or 3 (shifts for the @code{lea} instruction)." + (and (match_code "const_int") + (match_test "IN_RANGE (ival, 0, 3)"))) + +(define_constraint "N" + "Unsigned 8-bit integer constant (for @code{in} and @code{out} + instructions)." + (and (match_code "const_int") + (match_test "IN_RANGE (ival, 0, 255)"))) + +(define_constraint "O" + "@internal Integer constant in the range 0 @dots{} 127, for 128-bit shifts." + (and (match_code "const_int") + (match_test "IN_RANGE (ival, 0, 127)"))) + +;; Floating-point constant constraints. +;; We allow constants even if TARGET_80387 isn't set, because the +;; stack register converter may need to load 0.0 into the function +;; value register (top of stack). +(define_constraint "G" + "Standard 80387 floating point constant." + (and (match_code "const_double") + (match_test "standard_80387_constant_p (op) > 0"))) + +;; This can theoretically be any mode's CONST0_RTX. +(define_constraint "C" + "Standard SSE floating point constant." + (match_test "standard_sse_constant_p (op)")) + +;; Constant-or-symbol-reference constraints. + +(define_constraint "e" + "32-bit signed integer constant, or a symbolic reference known + to fit that range (for immediate operands in sign-extending x86-64 + instructions)." + (match_operand 0 "x86_64_immediate_operand")) + +(define_constraint "Z" + "32-bit unsigned integer constant, or a symbolic reference known + to fit that range (for immediate operands in zero-extending x86-64 + instructions)." + (match_operand 0 "x86_64_zext_immediate_operand")) diff --git a/gcc/config/i386/core2.md b/gcc/config/i386/core2.md new file mode 100644 index 000000000..d154cdc07 --- /dev/null +++ b/gcc/config/i386/core2.md @@ -0,0 +1,691 @@ +;; Scheduling for Core 2 and derived processors. +;; Copyright (C) 2004, 2005, 2007, 2008, 2010 Free Software Foundation, Inc. +;; +;; This file is part of GCC. +;; +;; GCC is free software; you can redistribute it and/or modify +;; it under the terms of the GNU General Public License as published by +;; the Free Software Foundation; either version 3, or (at your option) +;; any later version. +;; +;; GCC is distributed in the hope that it will be useful, +;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;; GNU General Public License for more details. +;; +;; You should have received a copy of the GNU General Public License +;; along with GCC; see the file COPYING3. If not see +;; . 
*/ + +;; The scheduling description in this file is based on the one in ppro.md, +;; with additional information obtained from +;; +;; "How to optimize for the Pentium family of microprocessors", +;; by Agner Fog, PhD. +;; +;; The major difference from the P6 pipeline is one extra decoder, and +;; one extra execute unit. Due to micro-op fusion, many insns no longer +;; need to be decoded in decoder 0, but can be handled by all of them. + +;; The core2_idiv, core2_fdiv and core2_ssediv automata are used to +;; model issue latencies of idiv, fdiv and ssediv type insns. +(define_automaton "core2_decoder,core2_core,core2_idiv,core2_fdiv,core2_ssediv,core2_load,core2_store") + +;; The CPU domain, used for Core i7 bypass latencies +(define_attr "i7_domain" "int,float,simd" + (cond [(eq_attr "type" "fmov,fop,fsgn,fmul,fdiv,fpspc,fcmov,fcmp,fxch,fistp,fisttp,frndint") + (const_string "float") + (eq_attr "type" "sselog,sselog1,sseiadd,sseiadd1,sseishft,sseishft1,sseimul, + sse,ssemov,sseadd,ssemul,ssecmp,ssecomi,ssecvt, + ssecvt1,sseicvt,ssediv,sseins,ssemuladd,sse4arg") + (cond [(eq_attr "mode" "V4DF,V8SF,V2DF,V4SF,SF,DF") + (const_string "float") + (eq_attr "mode" "SI") + (const_string "int")] + (const_string "simd")) + (eq_attr "type" "mmx,mmxmov,mmxadd,mmxmul,mmxcmp,mmxcvt,mmxshft") + (const_string "simd")] + (const_string "int"))) + +;; As for the Pentium Pro, +;; - an instruction with 1 uop can be decoded by any of the three +;; decoders in one cycle. +;; - an instruction with 1 to 4 uops can be decoded only by decoder 0 +;; but still in only one cycle. +;; - a complex (microcode) instruction can also only be decoded by +;; decoder 0, and this takes an unspecified number of cycles. +;; +;; The goal is to schedule such that we have a few-one-one uops sequence +;; in each cycle, to decode as many instructions per cycle as possible. +(define_cpu_unit "c2_decoder0" "core2_decoder") +(define_cpu_unit "c2_decoder1" "core2_decoder") +(define_cpu_unit "c2_decoder2" "core2_decoder") +(define_cpu_unit "c2_decoder3" "core2_decoder") + +;; We first wish to find an instruction for c2_decoder0, so exclude +;; c2_decoder1 and c2_decoder2 from being reserved until c2_decoder 0 is +;; reserved. +(presence_set "c2_decoder1" "c2_decoder0") +(presence_set "c2_decoder2" "c2_decoder0") +(presence_set "c2_decoder3" "c2_decoder0") + +;; Most instructions can be decoded on any of the three decoders. +(define_reservation "c2_decodern" "(c2_decoder0|c2_decoder1|c2_decoder2|c2_decoder3)") + +;; The out-of-order core has six pipelines. These are similar to the +;; Pentium Pro's five pipelines. Port 2 is responsible for memory loads, +;; port 3 for store address calculations, port 4 for memory stores, and +;; ports 0, 1 and 5 for everything else. + +(define_cpu_unit "c2_p0,c2_p1,c2_p5" "core2_core") +(define_cpu_unit "c2_p2" "core2_load") +(define_cpu_unit "c2_p3,c2_p4" "core2_store") +(define_cpu_unit "c2_idiv" "core2_idiv") +(define_cpu_unit "c2_fdiv" "core2_fdiv") +(define_cpu_unit "c2_ssediv" "core2_ssediv") + +;; Only the irregular instructions have to be modeled here. A load +;; increases the latency by 2 or 3, or by nothing if the manual gives +;; a latency already. Store latencies are not accounted for. +;; +;; The simple instructions follow a very regular pattern of 1 uop per +;; reg-reg operation, 1 uop per load on port 2. and 2 uops per store +;; on port 4 and port 3. These instructions are modelled at the bottom +;; of this file. +;; +;; For microcoded instructions we don't know how many uops are produced. 
+;; These instructions are the "complex" ones in the Intel manuals. All +;; we _do_ know is that they typically produce four or more uops, so +;; they can only be decoded on c2_decoder0. Modelling their latencies +;; doesn't make sense because we don't know how these instructions are +;; executed in the core. So we just model that they can only be decoded +;; on decoder 0, and say that it takes a little while before the result +;; is available. +(define_insn_reservation "c2_complex_insn" 6 + (and (eq_attr "cpu" "core2,corei7") + (eq_attr "type" "other,multi,str")) + "c2_decoder0") + +(define_insn_reservation "c2_call" 1 + (and (eq_attr "cpu" "core2,corei7") + (eq_attr "type" "call,callv")) + "c2_decoder0") + +;; imov with memory operands does not use the integer units. +;; imovx always decodes to one uop, and also doesn't use the integer +;; units if it has memory operands. +(define_insn_reservation "c2_imov" 1 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "none") + (eq_attr "type" "imov,imovx"))) + "c2_decodern,(c2_p0|c2_p1|c2_p5)") + +(define_insn_reservation "c2_imov_load" 4 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "load") + (eq_attr "type" "imov,imovx"))) + "c2_decodern,c2_p2") + +(define_insn_reservation "c2_imov_store" 1 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "store") + (eq_attr "type" "imov"))) + "c2_decodern,c2_p4+c2_p3") + +(define_insn_reservation "c2_icmov" 2 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "none") + (eq_attr "type" "icmov"))) + "c2_decoder0,(c2_p0|c2_p1|c2_p5)*2") + +(define_insn_reservation "c2_icmov_load" 2 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "load") + (eq_attr "type" "icmov"))) + "c2_decoder0,c2_p2,(c2_p0|c2_p1|c2_p5)*2") + +(define_insn_reservation "c2_push_reg" 1 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "store") + (eq_attr "type" "push"))) + "c2_decodern,c2_p4+c2_p3") + +(define_insn_reservation "c2_push_mem" 1 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "both") + (eq_attr "type" "push"))) + "c2_decoder0,c2_p2,c2_p4+c2_p3") + +;; lea executes on port 0 with latency one and throughput 1. +(define_insn_reservation "c2_lea" 1 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "none") + (eq_attr "type" "lea"))) + "c2_decodern,c2_p0") + +;; Shift and rotate decode as two uops which can go to port 0 or 5. +;; The load and store units need to be reserved when memory operands +;; are involved. +(define_insn_reservation "c2_shift_rotate" 1 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "none") + (eq_attr "type" "ishift,ishift1,rotate,rotate1"))) + "c2_decodern,(c2_p0|c2_p5)") + +(define_insn_reservation "c2_shift_rotate_mem" 4 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "!none") + (eq_attr "type" "ishift,ishift1,rotate,rotate1"))) + "c2_decoder0,c2_p2,(c2_p0|c2_p5),c2_p4+c2_p3") + +;; See comments in ppro.md for the corresponding reservation. +(define_insn_reservation "c2_branch" 1 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "none") + (eq_attr "type" "ibr"))) + "c2_decodern,c2_p5") + +;; ??? Indirect branches probably have worse latency than this. 
+(define_insn_reservation "c2_indirect_branch" 6 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "!none") + (eq_attr "type" "ibr"))) + "c2_decoder0,c2_p2+c2_p5") + +(define_insn_reservation "c2_leave" 4 + (and (eq_attr "cpu" "core2,corei7") + (eq_attr "type" "leave")) + "c2_decoder0,c2_p2+(c2_p0|c2_p1),(c2_p0|c2_p1)") + +;; mul and imul with two/three operands only execute on port 1 for HImode +;; and SImode, port 0 for DImode. +(define_insn_reservation "c2_imul_hisi" 3 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "HI,SI") + (eq_attr "type" "imul")))) + "c2_decodern,c2_p1") + +(define_insn_reservation "c2_imul_hisi_mem" 3 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "!none") + (and (eq_attr "mode" "HI,SI") + (eq_attr "type" "imul")))) + "c2_decoder0,c2_p2+c2_p1") + +(define_insn_reservation "c2_imul_di" 5 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "DI") + (eq_attr "type" "imul")))) + "c2_decodern,c2_p0") + +(define_insn_reservation "c2_imul_di_mem" 5 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "!none") + (and (eq_attr "mode" "DI") + (eq_attr "type" "imul")))) + "c2_decoder0,c2_p2+c2_p0") + +;; div and idiv are very similar, so we model them the same. +;; QI, HI, and SI have issue latency 12, 21, and 37, respectively. +;; These issue latencies are modelled via the c2_div automaton. +(define_insn_reservation "c2_idiv_QI" 19 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "QI") + (eq_attr "type" "idiv")))) + "c2_decoder0,(c2_p0+c2_idiv)*2,(c2_p0|c2_p1)+c2_idiv,c2_idiv*9") + +(define_insn_reservation "c2_idiv_QI_load" 19 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "QI") + (eq_attr "type" "idiv")))) + "c2_decoder0,c2_p2+c2_p0+c2_idiv,c2_p0+c2_idiv,(c2_p0|c2_p1)+c2_idiv,c2_idiv*9") + +(define_insn_reservation "c2_idiv_HI" 23 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "HI") + (eq_attr "type" "idiv")))) + "c2_decoder0,(c2_p0+c2_idiv)*3,(c2_p0|c2_p1)+c2_idiv,c2_idiv*17") + +(define_insn_reservation "c2_idiv_HI_load" 23 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "HI") + (eq_attr "type" "idiv")))) + "c2_decoder0,c2_p2+c2_p0+c2_idiv,c2_p0+c2_idiv,(c2_p0|c2_p1)+c2_idiv,c2_idiv*18") + +(define_insn_reservation "c2_idiv_SI" 39 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "SI") + (eq_attr "type" "idiv")))) + "c2_decoder0,(c2_p0+c2_idiv)*3,(c2_p0|c2_p1)+c2_idiv,c2_idiv*33") + +(define_insn_reservation "c2_idiv_SI_load" 39 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "SI") + (eq_attr "type" "idiv")))) + "c2_decoder0,c2_p2+c2_p0+c2_idiv,c2_p0+c2_idiv,(c2_p0|c2_p1)+c2_idiv,c2_idiv*34") + +;; x87 floating point operations. 
+ +(define_insn_reservation "c2_fxch" 0 + (and (eq_attr "cpu" "core2,corei7") + (eq_attr "type" "fxch")) + "c2_decodern") + +(define_insn_reservation "c2_fop" 3 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "none,unknown") + (eq_attr "type" "fop"))) + "c2_decodern,c2_p1") + +(define_insn_reservation "c2_fop_load" 5 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "load") + (eq_attr "type" "fop"))) + "c2_decoder0,c2_p2+c2_p1,c2_p1") + +(define_insn_reservation "c2_fop_store" 3 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "store") + (eq_attr "type" "fop"))) + "c2_decoder0,c2_p0,c2_p0,c2_p0+c2_p4+c2_p3") + +(define_insn_reservation "c2_fop_both" 5 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "both") + (eq_attr "type" "fop"))) + "c2_decoder0,c2_p2+c2_p0,c2_p0+c2_p4+c2_p3") + +(define_insn_reservation "c2_fsgn" 1 + (and (eq_attr "cpu" "core2,corei7") + (eq_attr "type" "fsgn")) + "c2_decodern,c2_p0") + +(define_insn_reservation "c2_fistp" 5 + (and (eq_attr "cpu" "core2,corei7") + (eq_attr "type" "fistp")) + "c2_decoder0,c2_p0*2,c2_p4+c2_p3") + +(define_insn_reservation "c2_fcmov" 2 + (and (eq_attr "cpu" "core2,corei7") + (eq_attr "type" "fcmov")) + "c2_decoder0,c2_p0*2") + +(define_insn_reservation "c2_fcmp" 1 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "none") + (eq_attr "type" "fcmp"))) + "c2_decodern,c2_p1") + +(define_insn_reservation "c2_fcmp_load" 4 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "load") + (eq_attr "type" "fcmp"))) + "c2_decoder0,c2_p2+c2_p1") + +(define_insn_reservation "c2_fmov" 1 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "none") + (eq_attr "type" "fmov"))) + "c2_decodern,c2_p0") + +(define_insn_reservation "c2_fmov_load" 1 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "!XF") + (eq_attr "type" "fmov")))) + "c2_decodern,c2_p2") + +(define_insn_reservation "c2_fmov_XF_load" 3 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "XF") + (eq_attr "type" "fmov")))) + "c2_decoder0,(c2_p2+c2_p0)*2") + +(define_insn_reservation "c2_fmov_store" 1 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "store") + (and (eq_attr "mode" "!XF") + (eq_attr "type" "fmov")))) + "c2_decodern,c2_p3+c2_p4") + +(define_insn_reservation "c2_fmov_XF_store" 3 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "store") + (and (eq_attr "mode" "XF") + (eq_attr "type" "fmov")))) + "c2_decoder0,(c2_p3+c2_p4),(c2_p3+c2_p4)") + +;; fmul executes on port 0 with latency 5. It has issue latency 2, +;; but we don't model this. +(define_insn_reservation "c2_fmul" 5 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "none") + (eq_attr "type" "fmul"))) + "c2_decoder0,c2_p0*2") + +(define_insn_reservation "c2_fmul_load" 6 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "load") + (eq_attr "type" "fmul"))) + "c2_decoder0,c2_p2+c2_p0,c2_p0") + +;; fdiv latencies depend on the mode of the operands. XFmode gives +;; a latency of 38 cycles, DFmode gives 32, and SFmode gives latency 18. +;; Division by a power of 2 takes only 9 cycles, but we cannot model +;; that. Throughput is equal to latency - 1, which we model using the +;; c2_div automaton. 
+(define_insn_reservation "c2_fdiv_SF" 18 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "SF") + (eq_attr "type" "fdiv,fpspc")))) + "c2_decodern,c2_p0+c2_fdiv,c2_fdiv*16") + +(define_insn_reservation "c2_fdiv_SF_load" 19 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "SF") + (eq_attr "type" "fdiv,fpspc")))) + "c2_decoder0,c2_p2+c2_p0+c2_fdiv,c2_fdiv*16") + +(define_insn_reservation "c2_fdiv_DF" 32 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "DF") + (eq_attr "type" "fdiv,fpspc")))) + "c2_decodern,c2_p0+c2_fdiv,c2_fdiv*30") + +(define_insn_reservation "c2_fdiv_DF_load" 33 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "DF") + (eq_attr "type" "fdiv,fpspc")))) + "c2_decoder0,c2_p2+c2_p0+c2_fdiv,c2_fdiv*30") + +(define_insn_reservation "c2_fdiv_XF" 38 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "XF") + (eq_attr "type" "fdiv,fpspc")))) + "c2_decodern,c2_p0+c2_fdiv,c2_fdiv*36") + +(define_insn_reservation "c2_fdiv_XF_load" 39 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "XF") + (eq_attr "type" "fdiv,fpspc")))) + "c2_decoder0,c2_p2+c2_p0+c2_fdiv,c2_fdiv*36") + +;; MMX instructions. + +(define_insn_reservation "c2_mmx_add" 1 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "none") + (eq_attr "type" "mmxadd,sseiadd"))) + "c2_decodern,c2_p0|c2_p5") + +(define_insn_reservation "c2_mmx_add_load" 2 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "load") + (eq_attr "type" "mmxadd,sseiadd"))) + "c2_decodern,c2_p2+c2_p0|c2_p5") + +(define_insn_reservation "c2_mmx_shft" 1 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "none") + (eq_attr "type" "mmxshft"))) + "c2_decodern,c2_p0|c2_p5") + +(define_insn_reservation "c2_mmx_shft_load" 2 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "load") + (eq_attr "type" "mmxshft"))) + "c2_decoder0,c2_p2+c2_p1") + +(define_insn_reservation "c2_mmx_sse_shft" 1 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "none") + (and (eq_attr "type" "sseishft") + (eq_attr "length_immediate" "!0")))) + "c2_decodern,c2_p1") + +(define_insn_reservation "c2_mmx_sse_shft_load" 2 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "load") + (and (eq_attr "type" "sseishft") + (eq_attr "length_immediate" "!0")))) + "c2_decodern,c2_p1") + +(define_insn_reservation "c2_mmx_sse_shft1" 2 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "none") + (and (eq_attr "type" "sseishft") + (eq_attr "length_immediate" "0")))) + "c2_decodern,c2_p1") + +(define_insn_reservation "c2_mmx_sse_shft1_load" 3 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "load") + (and (eq_attr "type" "sseishft") + (eq_attr "length_immediate" "0")))) + "c2_decodern,c2_p1") + +(define_insn_reservation "c2_mmx_mul" 3 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "none") + (eq_attr "type" "mmxmul,sseimul"))) + "c2_decodern,c2_p1") + +(define_insn_reservation "c2_mmx_mul_load" 3 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "none") + (eq_attr "type" "mmxmul,sseimul"))) + "c2_decoder0,c2_p2+c2_p1") + +(define_insn_reservation "c2_sse_mmxcvt" 4 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "mode" "DI") + (eq_attr "type" "mmxcvt"))) + "c2_decodern,c2_p1") + +;; FIXME: These are Pentium III only, but 
we cannot tell here if +;; we're generating code for PentiumPro/Pentium II or Pentium III +;; (define_insn_reservation "c2_sse_mmxshft" 2 +;; (and (eq_attr "cpu" "core2,corei7") +;; (and (eq_attr "mode" "TI") +;; (eq_attr "type" "mmxshft"))) +;; "c2_decodern,c2_p0") + +;; The sfence instruction. +(define_insn_reservation "c2_sse_sfence" 3 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "unknown") + (eq_attr "type" "sse"))) + "c2_decoder0,c2_p4+c2_p3") + +;; FIXME: This reservation is all wrong when we're scheduling sqrtss. +(define_insn_reservation "c2_sse_SFDF" 3 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "mode" "SF,DF") + (eq_attr "type" "sse"))) + "c2_decodern,c2_p0") + +(define_insn_reservation "c2_sse_V4SF" 4 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "mode" "V4SF") + (eq_attr "type" "sse"))) + "c2_decoder0,c2_p1*2") + +(define_insn_reservation "c2_sse_addcmp" 3 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "none") + (eq_attr "type" "sseadd,ssecmp,ssecomi"))) + "c2_decodern,c2_p1") + +(define_insn_reservation "c2_sse_addcmp_load" 3 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "load") + (eq_attr "type" "sseadd,ssecmp,ssecomi"))) + "c2_decodern,c2_p2+c2_p1") + +(define_insn_reservation "c2_sse_mul_SF" 4 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "SF,V4SF") + (eq_attr "type" "ssemul")))) + "c2_decodern,c2_p0") + +(define_insn_reservation "c2_sse_mul_SF_load" 4 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "SF,V4SF") + (eq_attr "type" "ssemul")))) + "c2_decodern,c2_p2+c2_p0") + +(define_insn_reservation "c2_sse_mul_DF" 5 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "DF,V2DF") + (eq_attr "type" "ssemul")))) + "c2_decodern,c2_p0") + +(define_insn_reservation "c2_sse_mul_DF_load" 5 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "DF,V2DF") + (eq_attr "type" "ssemul")))) + "c2_decodern,c2_p2+c2_p0") + +(define_insn_reservation "c2_sse_div_SF" 18 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "SF,V4SF") + (eq_attr "type" "ssediv")))) + "c2_decodern,c2_p0,c2_ssediv*17") + +(define_insn_reservation "c2_sse_div_SF_load" 18 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "SF,V4SF") + (eq_attr "type" "ssediv")))) + "c2_decodern,(c2_p2+c2_p0),c2_ssediv*17") + +(define_insn_reservation "c2_sse_div_DF" 32 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "DF,V2DF") + (eq_attr "type" "ssediv")))) + "c2_decodern,c2_p0,c2_ssediv*31") + +(define_insn_reservation "c2_sse_div_DF_load" 32 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "DF,V2DF") + (eq_attr "type" "ssediv")))) + "c2_decodern,(c2_p2+c2_p0),c2_ssediv*31") + +;; FIXME: these have limited throughput +(define_insn_reservation "c2_sse_icvt_SF" 4 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "SF") + (eq_attr "type" "sseicvt")))) + "c2_decodern,c2_p1") + +(define_insn_reservation "c2_sse_icvt_SF_load" 4 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "!none") + (and (eq_attr "mode" "SF") + (eq_attr "type" "sseicvt")))) + "c2_decodern,c2_p2+c2_p1") + +(define_insn_reservation "c2_sse_icvt_DF" 4 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr 
"memory" "none") + (and (eq_attr "mode" "DF") + (eq_attr "type" "sseicvt")))) + "c2_decoder0,c2_p0+c2_p1") + +(define_insn_reservation "c2_sse_icvt_DF_load" 4 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "!none") + (and (eq_attr "mode" "DF") + (eq_attr "type" "sseicvt")))) + "c2_decoder0,(c2_p2+c2_p1)") + +(define_insn_reservation "c2_sse_icvt_SI" 3 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "SI") + (eq_attr "type" "sseicvt")))) + "c2_decodern,c2_p1") + +(define_insn_reservation "c2_sse_icvt_SI_load" 3 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "!none") + (and (eq_attr "mode" "SI") + (eq_attr "type" "sseicvt")))) + "c2_decodern,(c2_p2+c2_p1)") + +(define_insn_reservation "c2_sse_mov" 1 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "none") + (eq_attr "type" "ssemov"))) + "c2_decodern,(c2_p0|c2_p1|c2_p5)") + +(define_insn_reservation "c2_sse_mov_load" 2 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "load") + (eq_attr "type" "ssemov"))) + "c2_decodern,c2_p2") + +(define_insn_reservation "c2_sse_mov_store" 1 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "store") + (eq_attr "type" "ssemov"))) + "c2_decodern,c2_p4+c2_p3") + +;; All other instructions are modelled as simple instructions. +;; We have already modelled all i387 floating point instructions, so all +;; other instructions execute on either port 0, 1 or 5. This includes +;; the ALU units, and the MMX units. +;; +;; reg-reg instructions produce 1 uop so they can be decoded on any of +;; the three decoders. Loads benefit from micro-op fusion and can be +;; treated in the same way. +(define_insn_reservation "c2_insn" 1 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "none,unknown") + (eq_attr "type" "alu,alu1,negnot,incdec,icmp,test,setcc,sseishft1,mmx,mmxcmp"))) + "c2_decodern,(c2_p0|c2_p1|c2_p5)") + +(define_insn_reservation "c2_insn_load" 4 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "load") + (eq_attr "type" "alu,alu1,negnot,incdec,icmp,test,setcc,pop,sseishft1,mmx,mmxcmp"))) + "c2_decodern,c2_p2,(c2_p0|c2_p1|c2_p5)") + +;; register-memory instructions have three uops, so they have to be +;; decoded on c2_decoder0. +(define_insn_reservation "c2_insn_store" 1 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "store") + (eq_attr "type" "alu,alu1,negnot,incdec,icmp,test,setcc,sseishft1,mmx,mmxcmp"))) + "c2_decoder0,(c2_p0|c2_p1|c2_p5),c2_p4+c2_p3") + +;; read-modify-store instructions produce 4 uops so they have to be +;; decoded on c2_decoder0 as well. +(define_insn_reservation "c2_insn_both" 4 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "both") + (eq_attr "type" "alu,alu1,negnot,incdec,icmp,test,setcc,pop,sseishft1,mmx,mmxcmp"))) + "c2_decoder0,c2_p2,(c2_p0|c2_p1|c2_p5),c2_p4+c2_p3") diff --git a/gcc/config/i386/cpuid.h b/gcc/config/i386/cpuid.h new file mode 100644 index 000000000..3c3f47b00 --- /dev/null +++ b/gcc/config/i386/cpuid.h @@ -0,0 +1,188 @@ +/* + * Copyright (C) 2007, 2008, 2009, 2010 Free Software Foundation, Inc. + * + * This file is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 3, or (at your option) any + * later version. 
+ * + * This file is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Under Section 7 of GPL version 3, you are granted additional + * permissions described in the GCC Runtime Library Exception, version + * 3.1, as published by the Free Software Foundation. + * + * You should have received a copy of the GNU General Public License and + * a copy of the GCC Runtime Library Exception along with this program; + * see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + * . + */ + +/* %ecx */ +#define bit_SSE3 (1 << 0) +#define bit_PCLMUL (1 << 1) +#define bit_SSSE3 (1 << 9) +#define bit_FMA (1 << 12) +#define bit_CMPXCHG16B (1 << 13) +#define bit_SSE4_1 (1 << 19) +#define bit_SSE4_2 (1 << 20) +#define bit_MOVBE (1 << 22) +#define bit_POPCNT (1 << 23) +#define bit_AES (1 << 25) +#define bit_XSAVE (1 << 26) +#define bit_OSXSAVE (1 << 27) +#define bit_AVX (1 << 28) +#define bit_F16C (1 << 29) +#define bit_RDRND (1 << 30) + +/* %edx */ +#define bit_CMPXCHG8B (1 << 8) +#define bit_CMOV (1 << 15) +#define bit_MMX (1 << 23) +#define bit_FXSAVE (1 << 24) +#define bit_SSE (1 << 25) +#define bit_SSE2 (1 << 26) + +/* Extended Features */ +/* %ecx */ +#define bit_LAHF_LM (1 << 0) +#define bit_ABM (1 << 5) +#define bit_SSE4a (1 << 6) +#define bit_XOP (1 << 11) +#define bit_LWP (1 << 15) +#define bit_FMA4 (1 << 16) +#define bit_TBM (1 << 21) + +/* %edx */ +#define bit_MMXEXT (1 << 22) +#define bit_LM (1 << 29) +#define bit_3DNOWP (1 << 30) +#define bit_3DNOW (1 << 31) + +/* Extended Features (%eax == 7) */ +#define bit_FSGSBASE (1 << 0) +#define bit_BMI (1 << 3) + +#if defined(__i386__) && defined(__PIC__) +/* %ebx may be the PIC register. */ +#if __GNUC__ >= 3 +#define __cpuid(level, a, b, c, d) \ + __asm__ ("xchg{l}\t{%%}ebx, %1\n\t" \ + "cpuid\n\t" \ + "xchg{l}\t{%%}ebx, %1\n\t" \ + : "=a" (a), "=r" (b), "=c" (c), "=d" (d) \ + : "0" (level)) + +#define __cpuid_count(level, count, a, b, c, d) \ + __asm__ ("xchg{l}\t{%%}ebx, %1\n\t" \ + "cpuid\n\t" \ + "xchg{l}\t{%%}ebx, %1\n\t" \ + : "=a" (a), "=r" (b), "=c" (c), "=d" (d) \ + : "0" (level), "2" (count)) +#else +/* Host GCCs older than 3.0 weren't supporting Intel asm syntax + nor alternatives in i386 code. */ +#define __cpuid(level, a, b, c, d) \ + __asm__ ("xchgl\t%%ebx, %1\n\t" \ + "cpuid\n\t" \ + "xchgl\t%%ebx, %1\n\t" \ + : "=a" (a), "=r" (b), "=c" (c), "=d" (d) \ + : "0" (level)) + +#define __cpuid_count(level, count, a, b, c, d) \ + __asm__ ("xchgl\t%%ebx, %1\n\t" \ + "cpuid\n\t" \ + "xchgl\t%%ebx, %1\n\t" \ + : "=a" (a), "=r" (b), "=c" (c), "=d" (d) \ + : "0" (level), "2" (count)) +#endif +#else +#define __cpuid(level, a, b, c, d) \ + __asm__ ("cpuid\n\t" \ + : "=a" (a), "=b" (b), "=c" (c), "=d" (d) \ + : "0" (level)) + +#define __cpuid_count(level, count, a, b, c, d) \ + __asm__ ("cpuid\n\t" \ + : "=a" (a), "=b" (b), "=c" (c), "=d" (d) \ + : "0" (level), "2" (count)) +#endif + +/* Return highest supported input value for cpuid instruction. ext can + be either 0x0 or 0x8000000 to return highest supported value for + basic or extended cpuid information. Function returns 0 if cpuid + is not supported or whatever cpuid returns in eax register. If sig + pointer is non-null, then first four bytes of the signature + (as found in ebx register) are returned in location pointed by sig. 
*/ + +static __inline unsigned int +__get_cpuid_max (unsigned int __ext, unsigned int *__sig) +{ + unsigned int __eax, __ebx, __ecx, __edx; + +#ifndef __x86_64__ + /* See if we can use cpuid. On AMD64 we always can. */ +#if __GNUC__ >= 3 + __asm__ ("pushf{l|d}\n\t" + "pushf{l|d}\n\t" + "pop{l}\t%0\n\t" + "mov{l}\t{%0, %1|%1, %0}\n\t" + "xor{l}\t{%2, %0|%0, %2}\n\t" + "push{l}\t%0\n\t" + "popf{l|d}\n\t" + "pushf{l|d}\n\t" + "pop{l}\t%0\n\t" + "popf{l|d}\n\t" + : "=&r" (__eax), "=&r" (__ebx) + : "i" (0x00200000)); +#else +/* Host GCCs older than 3.0 weren't supporting Intel asm syntax + nor alternatives in i386 code. */ + __asm__ ("pushfl\n\t" + "pushfl\n\t" + "popl\t%0\n\t" + "movl\t%0, %1\n\t" + "xorl\t%2, %0\n\t" + "pushl\t%0\n\t" + "popfl\n\t" + "pushfl\n\t" + "popl\t%0\n\t" + "popfl\n\t" + : "=&r" (__eax), "=&r" (__ebx) + : "i" (0x00200000)); +#endif + + if (!((__eax ^ __ebx) & 0x00200000)) + return 0; +#endif + + /* Host supports cpuid. Return highest supported cpuid input value. */ + __cpuid (__ext, __eax, __ebx, __ecx, __edx); + + if (__sig) + *__sig = __ebx; + + return __eax; +} + +/* Return cpuid data for requested cpuid level, as found in returned + eax, ebx, ecx and edx registers. The function checks if cpuid is + supported and returns 1 for valid cpuid information or 0 for + unsupported cpuid level. All pointers are required to be non-null. */ + +static __inline int +__get_cpuid (unsigned int __level, + unsigned int *__eax, unsigned int *__ebx, + unsigned int *__ecx, unsigned int *__edx) +{ + unsigned int __ext = __level & 0x80000000; + + if (__get_cpuid_max (__ext, 0) < __level) + return 0; + + __cpuid (__level, *__eax, *__ebx, *__ecx, *__edx); + return 1; +} diff --git a/gcc/config/i386/cross-stdarg.h b/gcc/config/i386/cross-stdarg.h new file mode 100644 index 000000000..7139ffa74 --- /dev/null +++ b/gcc/config/i386/cross-stdarg.h @@ -0,0 +1,73 @@ +/* Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009 + Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +#ifndef __CROSS_STDARG_H_INCLUDED +#define __CROSS_STDARG_H_INCLUDED + +/* Make sure that for non x64 targets cross builtins are defined. */ +#ifndef __x86_64__ +/* Call abi ms_abi. */ +#define __builtin_ms_va_list __builtin_va_list +#define __builtin_ms_va_copy __builtin_va_copy +#define __builtin_ms_va_start __builtin_va_start +#define __builtin_ms_va_end __builtin_va_end + +/* Call abi sysv_abi. 
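A short usage sketch for the __get_cpuid helper in cpuid.h above, checking a few of the feature bits defined earlier in that header (which is installed as <cpuid.h>):

#include <stdio.h>
#include <cpuid.h>

int main (void)
{
  unsigned int eax, ebx, ecx, edx;

  if (!__get_cpuid (1, &eax, &ebx, &ecx, &edx))
    {
      puts ("cpuid level 1 not supported");
      return 1;
    }

  /* bit_SSE2 lives in %edx; bit_SSE4_2 and bit_AVX in %ecx, as defined above.  */
  printf ("SSE2:   %s\n", (edx & bit_SSE2)   ? "yes" : "no");
  printf ("SSE4.2: %s\n", (ecx & bit_SSE4_2) ? "yes" : "no");
  printf ("AVX:    %s\n", (ecx & bit_AVX)    ? "yes" : "no");
  return 0;
}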
*/ +#define __builtin_sysv_va_list __builtin_va_list +#define __builtin_sysv_va_copy __builtin_va_copy +#define __builtin_sysv_va_start __builtin_va_start +#define __builtin_sysv_va_end __builtin_va_end +#endif + +#define __ms_va_copy(__d,__s) __builtin_ms_va_copy(__d,__s) +#define __ms_va_start(__v,__l) __builtin_ms_va_start(__v,__l) +#define __ms_va_arg(__v,__l) __builtin_va_arg(__v,__l) +#define __ms_va_end(__v) __builtin_ms_va_end(__v) + +#define __sysv_va_copy(__d,__s) __builtin_sysv_va_copy(__d,__s) +#define __sysv_va_start(__v,__l) __builtin_sysv_va_start(__v,__l) +#define __sysv_va_arg(__v,__l) __builtin_va_arg(__v,__l) +#define __sysv_va_end(__v) __builtin_sysv_va_end(__v) + +#ifndef __GNUC_SYSV_VA_LIST +#define __GNUC_SYSV_VA_LIST + typedef __builtin_sysv_va_list __gnuc_sysv_va_list; +#endif + +#ifndef _SYSV_VA_LIST_DEFINED +#define _SYSV_VA_LIST_DEFINED + typedef __gnuc_sysv_va_list sysv_va_list; +#endif + +#ifndef __GNUC_MS_VA_LIST +#define __GNUC_MS_VA_LIST + typedef __builtin_ms_va_list __gnuc_ms_va_list; +#endif + +#ifndef _MS_VA_LIST_DEFINED +#define _MS_VA_LIST_DEFINED + typedef __gnuc_ms_va_list ms_va_list; +#endif + +#endif /* __CROSS_STDARG_H_INCLUDED */ diff --git a/gcc/config/i386/crtdll.h b/gcc/config/i386/crtdll.h new file mode 100644 index 000000000..1e5cefd62 --- /dev/null +++ b/gcc/config/i386/crtdll.h @@ -0,0 +1,42 @@ +/* Operating system specific defines to be used when targeting GCC for + hosting on Windows32, using GNU tools and the Windows32 API Library. + This variant uses CRTDLL.DLL instead of MSVCRTDLL.DLL. + Copyright (C) 1998, 1999, 2000, 2007 Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. */ + +#undef EXTRA_OS_CPP_BUILTINS +#define EXTRA_OS_CPP_BUILTINS() \ + do \ + { \ + builtin_define ("__CRTDLL__"); \ + builtin_define ("__MINGW32__"); \ + builtin_define ("_WIN32"); \ + builtin_define_std ("WIN32"); \ + builtin_define_std ("WINNT"); \ + } \ + while (0) + +#undef LIBGCC_SPEC +#define LIBGCC_SPEC \ + "%{mthreads:-lmingwthrd} -lmingw32 -lgcc -lcoldname -libmingwex -lcrtdll" + +/* Specify a different entry point when linking a DLL */ +#undef STARTFILE_SPEC +#define STARTFILE_SPEC "%{shared|mdll:dllcrt1%O%s} \ + %{!shared:%{!mdll:crt1%O%s}} %{pg:gcrt1%O%s}" + diff --git a/gcc/config/i386/crtfastmath.c b/gcc/config/i386/crtfastmath.c new file mode 100644 index 000000000..1c1ce2c78 --- /dev/null +++ b/gcc/config/i386/crtfastmath.c @@ -0,0 +1,89 @@ +/* + * Copyright (C) 2005, 2007, 2009 Free Software Foundation, Inc. + * + * This file is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 3, or (at your option) any + * later version. + * + * This file is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
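The __ms_va_* and __sysv_va_* macros in cross-stdarg.h above exist so that, on x86-64, one translation unit can walk both Microsoft-ABI and SysV-ABI variable argument lists; a minimal sketch of the ms_abi side, assuming a 64-bit compiler with __attribute__((ms_abi)) support and the header installed as <cross-stdarg.h>:

#include <cross-stdarg.h>

/* An ms_abi varargs routine callable from SysV code.  */
static int __attribute__ ((ms_abi))
sum_ints (int count, ...)
{
  __builtin_ms_va_list ap;
  int i, total = 0;

  __ms_va_start (ap, count);
  for (i = 0; i < count; i++)
    total += __ms_va_arg (ap, int);
  __ms_va_end (ap);

  return total;
}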
See the GNU + * General Public License for more details. + * + * Under Section 7 of GPL version 3, you are granted additional + * permissions described in the GCC Runtime Library Exception, version + * 3.1, as published by the Free Software Foundation. + * + * You should have received a copy of the GNU General Public License and + * a copy of the GCC Runtime Library Exception along with this program; + * see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + * . + */ + +#define MXCSR_DAZ (1 << 6) /* Enable denormals are zero mode */ +#define MXCSR_FTZ (1 << 15) /* Enable flush to zero mode */ + +#ifndef __x86_64__ +/* All 64-bit targets have SSE and DAZ; + only check them explicitly for 32-bit ones. */ +#include "cpuid.h" +#endif + +static void __attribute__((constructor)) +#ifndef __x86_64__ +/* The i386 ABI only requires 4-byte stack alignment, so this is necessary + to make sure the fxsave struct gets correct alignment. + See PR27537 and PR28621. */ +__attribute__ ((force_align_arg_pointer)) +#endif +set_fast_math (void) +{ +#ifndef __x86_64__ + unsigned int eax, ebx, ecx, edx; + + if (!__get_cpuid (1, &eax, &ebx, &ecx, &edx)) + return; + + if (edx & bit_SSE) + { + unsigned int mxcsr = __builtin_ia32_stmxcsr (); + + mxcsr |= MXCSR_FTZ; + + if (edx & bit_FXSAVE) + { + /* Check if DAZ is available. */ + struct + { + unsigned short int cwd; + unsigned short int swd; + unsigned short int twd; + unsigned short int fop; + long int fip; + long int fcs; + long int foo; + long int fos; + long int mxcsr; + long int mxcsr_mask; + long int st_space[32]; + long int xmm_space[32]; + long int padding[56]; + } __attribute__ ((aligned (16))) fxsave; + + __builtin_memset (&fxsave, 0, sizeof (fxsave)); + + asm volatile ("fxsave %0" : "=m" (fxsave) : "m" (fxsave)); + + if (fxsave.mxcsr_mask & MXCSR_DAZ) + mxcsr |= MXCSR_DAZ; + } + + __builtin_ia32_ldmxcsr (mxcsr); + } +#else + unsigned int mxcsr = __builtin_ia32_stmxcsr (); + mxcsr |= MXCSR_DAZ | MXCSR_FTZ; + __builtin_ia32_ldmxcsr (mxcsr); +#endif +} diff --git a/gcc/config/i386/crtprec.c b/gcc/config/i386/crtprec.c new file mode 100644 index 000000000..4f42a8fa1 --- /dev/null +++ b/gcc/config/i386/crtprec.c @@ -0,0 +1,47 @@ +/* + * Copyright (C) 2007, 2009 Free Software Foundation, Inc. + * + * This file is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 3, or (at your option) any + * later version. + * + * This file is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Under Section 7 of GPL version 3, you are granted additional + * permissions described in the GCC Runtime Library Exception, version + * 3.1, as published by the Free Software Foundation. + * + * You should have received a copy of the GNU General Public License and + * a copy of the GCC Runtime Library Exception along with this program; + * see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + * . + */ + +#if __PREC == 32 + #define X87CW (0 << 8) /* Single precision (24 bits) */ +#elif __PREC == 64 + #define X87CW (2 << 8) /* Double precision (53 bits) */ +#elif __PREC == 80 + #define X87CW (3 << 8) /* Extended precision (64 bits) */ +#else + #error "Wrong precision requested." 
+#endif + +#define X87CW_PCMASK (3 << 8) + +static void __attribute__((constructor)) +set_precision (void) +{ + unsigned short int cwd; + + asm volatile ("fstcw\t%0" : "=m" (cwd)); + + cwd &= ~X87CW_PCMASK; + cwd |= X87CW; + + asm volatile ("fldcw\t%0" : : "m" (cwd)); +} diff --git a/gcc/config/i386/cygming-crtbegin.c b/gcc/config/i386/cygming-crtbegin.c new file mode 100644 index 000000000..fc36cce25 --- /dev/null +++ b/gcc/config/i386/cygming-crtbegin.c @@ -0,0 +1,135 @@ +/* crtbegin object for windows32 targets. + Copyright (C) 2007, 2009, 2010 Free Software Foundation, Inc. + + Contributed by Danny Smith + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 3, or (at your option) any later +version. + +GCC is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +Under Section 7 of GPL version 3, you are granted additional +permissions described in the GCC Runtime Library Exception, version +3.1, as published by the Free Software Foundation. + +You should have received a copy of the GNU General Public License and +a copy of the GCC Runtime Library Exception along with this program; +see the files COPYING3 and COPYING.RUNTIME respectively. If not, see +. */ + +/* Target machine header files require this define. */ +#define IN_LIBGCC2 + +#include "auto-host.h" +#include "tconfig.h" +#include "tsystem.h" +#include "coretypes.h" +#include "tm.h" +#include "unwind-dw2-fde.h" + +#define WIN32_LEAN_AND_MEAN +#include + +#ifndef LIBGCC_SONAME +#define LIBGCC_SONAME "libgcc_s.dll" +#endif + +#ifndef LIBGCJ_SONAME +#define LIBGCJ_SONAME "libgcj_s.dll" +#endif + + +/* Make the declarations weak. This is critical for + _Jv_RegisterClasses because it lives in libgcj.a */ +extern void __register_frame_info (const void *, struct object *) + TARGET_ATTRIBUTE_WEAK; +extern void *__deregister_frame_info (const void *) + TARGET_ATTRIBUTE_WEAK; +extern void _Jv_RegisterClasses (const void *) TARGET_ATTRIBUTE_WEAK; + +#if defined(HAVE_LD_RO_RW_SECTION_MIXING) +# define EH_FRAME_SECTION_CONST const +#else +# define EH_FRAME_SECTION_CONST +#endif + +/* Stick a label at the beginning of the frame unwind info so we can + register/deregister it with the exception handling library code. */ +#if DWARF2_UNWIND_INFO +static EH_FRAME_SECTION_CONST char __EH_FRAME_BEGIN__[] + __attribute__((used, section(EH_FRAME_SECTION_NAME), aligned(4))) + = { }; + +static struct object obj; +#endif + +#if TARGET_USE_JCR_SECTION +static void *__JCR_LIST__[] + __attribute__ ((used, section(JCR_SECTION_NAME), aligned(4))) + = { }; +#endif + +/* Pull in references from libgcc.a(unwind-dw2-fde.o) in the + startfile. These are referenced by a ctor and dtor in crtend.o. */ +extern void __gcc_register_frame (void); +extern void __gcc_deregister_frame (void); + +void +__gcc_register_frame (void) +{ +#if DWARF2_UNWIND_INFO +/* Weak undefined symbols won't be pulled in from dlls; hence + we first test if the dll is already loaded and, if so, + get the symbol's address at run-time. If the dll is not loaded, + fallback to weak linkage to static archive. 
*/ + + void (*register_frame_fn) (const void *, struct object *); + HANDLE h = GetModuleHandle (LIBGCC_SONAME); + if (h) + register_frame_fn = (void (*) (const void *, struct object *)) + GetProcAddress (h, "__register_frame_info"); + else + register_frame_fn = __register_frame_info; + if (register_frame_fn) + register_frame_fn (__EH_FRAME_BEGIN__, &obj); +#endif + +#if TARGET_USE_JCR_SECTION + if (__JCR_LIST__[0]) + { + void (*register_class_fn) (const void *); + HANDLE h = GetModuleHandle (LIBGCJ_SONAME); + if (h) + register_class_fn = (void (*) (const void *)) + GetProcAddress (h, "_Jv_RegisterClasses"); + else + register_class_fn = _Jv_RegisterClasses; + + if (register_class_fn) + register_class_fn (__JCR_LIST__); + } +#endif +} + +void +__gcc_deregister_frame (void) +{ +#if DWARF2_UNWIND_INFO + void * (*deregister_frame_fn) (const void *); + HANDLE h = GetModuleHandle (LIBGCC_SONAME); + if (h) + deregister_frame_fn = (void* (*) (const void *)) + GetProcAddress (h, "__deregister_frame_info"); + else + deregister_frame_fn = __deregister_frame_info; + if (deregister_frame_fn) + deregister_frame_fn (__EH_FRAME_BEGIN__); +#endif +} diff --git a/gcc/config/i386/cygming-crtend.c b/gcc/config/i386/cygming-crtend.c new file mode 100644 index 000000000..8545420b2 --- /dev/null +++ b/gcc/config/i386/cygming-crtend.c @@ -0,0 +1,88 @@ +/* crtend object for windows32 targets. + Copyright (C) 2007, 2008, 2009, 2010 Free Software Foundation, Inc. + + Contributed by Danny Smith + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 3, or (at your option) any later +version. + +GCC is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +Under Section 7 of GPL version 3, you are granted additional +permissions described in the GCC Runtime Library Exception, version +3.1, as published by the Free Software Foundation. + +You should have received a copy of the GNU General Public License and +a copy of the GCC Runtime Library Exception along with this program; +see the files COPYING3 and COPYING.RUNTIME respectively. If not, see +. */ + +/* Target machine header files require this define. */ +#define IN_LIBGCC2 + +/* auto-host.h is needed by cygming.h for HAVE_GAS_WEAK and here + for HAVE_LD_RO_RW_SECTION_MIXING. */ +#include "auto-host.h" +#include "tconfig.h" +#include "tsystem.h" +#include "coretypes.h" +#include "tm.h" +#include "unwind-dw2-fde.h" + +#if defined(HAVE_LD_RO_RW_SECTION_MIXING) +# define EH_FRAME_SECTION_CONST const +#else +# define EH_FRAME_SECTION_CONST +#endif + +#if DWARF2_UNWIND_INFO +/* Terminate the frame unwind info section with a 0 as a sentinel; + this would be the 'length' field in a real FDE. */ + +static EH_FRAME_SECTION_CONST int __FRAME_END__[] + __attribute__ ((used, section(EH_FRAME_SECTION_NAME), + aligned(4))) + = { 0 }; +#endif + +#if TARGET_USE_JCR_SECTION +/* Null terminate the .jcr section array. 
*/ +static void *__JCR_END__[1] + __attribute__ ((used, section(JCR_SECTION_NAME), + aligned(sizeof(void *)))) + = { 0 }; +#endif + +extern void __gcc_register_frame (void); +extern void __gcc_deregister_frame (void); + +static void register_frame_ctor (void) __attribute__ ((constructor (0))); + +static void +register_frame_ctor (void) +{ + __gcc_register_frame (); +#if DEFAULT_USE_CXA_ATEXIT + /* If we use the __cxa_atexit method to register C++ dtors + at object construction, also use atexit to register eh frame + info cleanup. */ + atexit (__gcc_deregister_frame); +#endif +} + +#if !DEFAULT_USE_CXA_ATEXIT +static void deregister_frame_dtor (void) __attribute__ ((destructor (0))); + +static void +deregister_frame_dtor (void) +{ + __gcc_deregister_frame (); +} +#endif diff --git a/gcc/config/i386/cygming.h b/gcc/config/i386/cygming.h new file mode 100644 index 000000000..9c32dda2e --- /dev/null +++ b/gcc/config/i386/cygming.h @@ -0,0 +1,478 @@ +/* Operating system specific defines to be used when targeting GCC for + hosting on Windows32, using a Unix style C library and tools. + Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, + 2004, 2005, 2007, 2008, 2009, 2010 + Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. */ + +#define DBX_DEBUGGING_INFO 1 +#define SDB_DEBUGGING_INFO 1 +#if TARGET_64BIT_DEFAULT || defined (HAVE_GAS_PE_SECREL32_RELOC) +#define DWARF2_DEBUGGING_INFO 1 +#endif + +#undef PREFERRED_DEBUGGING_TYPE +#if (DWARF2_DEBUGGING_INFO) +#define PREFERRED_DEBUGGING_TYPE DWARF2_DEBUG +#else +#define PREFERRED_DEBUGGING_TYPE DBX_DEBUG +#endif + +#undef TARGET_SEH +#define TARGET_SEH (TARGET_64BIT_MS_ABI && flag_unwind_tables) + +/* Win64 with SEH cannot represent DRAP stack frames. Disable its use. + Force the use of different mechanisms to allocate aligned local data. */ +#undef MAX_STACK_ALIGNMENT +#define MAX_STACK_ALIGNMENT (TARGET_SEH ? 128 : MAX_OFILE_ALIGNMENT) + +/* Support hooks for SEH. */ +#undef TARGET_ASM_UNWIND_EMIT +#define TARGET_ASM_UNWIND_EMIT i386_pe_seh_unwind_emit +#undef TARGET_ASM_UNWIND_EMIT_BEFORE_INSN +#define TARGET_ASM_UNWIND_EMIT_BEFORE_INSN false +#undef TARGET_ASM_FUNCTION_END_PROLOGUE +#define TARGET_ASM_FUNCTION_END_PROLOGUE i386_pe_seh_end_prologue +#define SUBTARGET_ASM_UNWIND_INIT i386_pe_seh_init + +#undef DEFAULT_ABI +#define DEFAULT_ABI (TARGET_64BIT ? MS_ABI : SYSV_ABI) + +#if ! defined (USE_MINGW64_LEADING_UNDERSCORES) +#undef USER_LABEL_PREFIX +#define USER_LABEL_PREFIX (TARGET_64BIT ? "" : "_") + +#undef LOCAL_LABEL_PREFIX +#define LOCAL_LABEL_PREFIX (TARGET_64BIT ? "." : "") + +#undef ASM_GENERATE_INTERNAL_LABEL +#define ASM_GENERATE_INTERNAL_LABEL(BUF,PREFIX,NUMBER) \ + sprintf ((BUF), "*%s%s%ld", LOCAL_LABEL_PREFIX, \ + (PREFIX), (long)(NUMBER)) + +#undef LPREFIX +#define LPREFIX (TARGET_64BIT ? ".L" : "L") + +#endif + +#undef DBX_REGISTER_NUMBER +#define DBX_REGISTER_NUMBER(n) \ + (TARGET_64BIT ? 
dbx64_register_map[n] \ + : (write_symbols == DWARF2_DEBUG \ + ? svr4_dbx_register_map[n] : dbx_register_map[n])) + +/* Map gcc register number to DWARF 2 CFA column number. For 32 bit + target, always use the svr4_dbx_register_map for DWARF .eh_frame + even if we don't use DWARF .debug_frame. */ +#undef DWARF_FRAME_REGNUM +#define DWARF_FRAME_REGNUM(n) \ + (TARGET_64BIT ? dbx64_register_map[(n)] \ + : svr4_dbx_register_map[(n)]) + +/* The MS_ABI changes the set of call-used registers. */ +#undef DWARF_FRAME_REGISTERS +#define DWARF_FRAME_REGISTERS (TARGET_64BIT ? 33 : 17) + +#ifdef HAVE_GAS_PE_SECREL32_RELOC +/* Use section relative relocations for debugging offsets. Unlike + other targets that fake this by putting the section VMA at 0, PE + won't allow it. */ +#define ASM_OUTPUT_DWARF_OFFSET(FILE, SIZE, LABEL, SECTION) \ + do { \ + switch (SIZE) \ + { \ + case 4: \ + fputs ("\t.secrel32\t", FILE); \ + assemble_name (FILE, LABEL); \ + break; \ + case 8: \ + /* This is a hack. There is no 64-bit section relative \ + relocation. However, the COFF format also does not \ + support 64-bit file offsets; 64-bit applications are \ + limited to 32-bits of code+data in any one module. \ + Fake the 64-bit offset by zero-extending it. */ \ + fputs ("\t.secrel32\t", FILE); \ + assemble_name (FILE, LABEL); \ + fputs ("\n\t.long\t0", FILE); \ + break; \ + default: \ + gcc_unreachable (); \ + } \ + } while (0) +#endif + +#define TARGET_EXECUTABLE_SUFFIX ".exe" + +#define TARGET_OS_CPP_BUILTINS() \ + do \ + { \ + if (!TARGET_64BIT) \ + builtin_define ("_X86_=1"); \ + if (TARGET_SEH) \ + builtin_define ("__SEH__"); \ + builtin_assert ("system=winnt"); \ + builtin_define ("__stdcall=__attribute__((__stdcall__))"); \ + builtin_define ("__fastcall=__attribute__((__fastcall__))"); \ + builtin_define ("__thiscall=__attribute__((__thiscall__))"); \ + builtin_define ("__cdecl=__attribute__((__cdecl__))"); \ + if (!flag_iso) \ + { \ + builtin_define ("_stdcall=__attribute__((__stdcall__))"); \ + builtin_define ("_fastcall=__attribute__((__fastcall__))"); \ + builtin_define ("_thiscall=__attribute__((__thiscall__))"); \ + builtin_define ("_cdecl=__attribute__((__cdecl__))"); \ + } \ + /* Even though linkonce works with static libs, this is needed \ + to compare typeinfo symbols across dll boundaries. */ \ + builtin_define ("__GXX_MERGED_TYPEINFO_NAMES=0"); \ + builtin_define ("__GXX_TYPEINFO_EQUALITY_INLINE=0"); \ + EXTRA_OS_CPP_BUILTINS (); \ + } \ + while (0) + +/* Get tree.c to declare a target-specific specialization of + merge_decl_attributes. */ +#define TARGET_DLLIMPORT_DECL_ATTRIBUTES 1 + +/* This macro defines names of additional specifications to put in the specs + that can be used in various specifications like CC1_SPEC. Its definition + is an initializer with a subgrouping for each command option. + + Each subgrouping contains a string constant, that defines the + specification name, and a string constant that used by the GCC driver + program. + + Do not define this macro if it does not need to do anything. */ + +#undef SUBTARGET_EXTRA_SPECS +#define SUBTARGET_EXTRA_SPECS \ + { "mingw_include_path", DEFAULT_TARGET_MACHINE } + +#undef MATH_LIBRARY +#define MATH_LIBRARY "" + +#define SIZE_TYPE (TARGET_64BIT ? "long long unsigned int" : "unsigned int") +#define PTRDIFF_TYPE (TARGET_64BIT ? "long long int" : "int") + +#define WCHAR_TYPE_SIZE 16 +#define WCHAR_TYPE "short unsigned int" + +/* Windows64 continues to use a 32-bit long type. 
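The LLP64 choice encoded by SIZE_TYPE above and LONG_TYPE_SIZE just below can be seen from a small program; a sketch only, with the expected output stated for an x86_64 MinGW-style target built with these definitions.

#include <stdio.h>

int
main (void)
{
  /* Expected on a 64-bit Windows target: long=4, long long=8, void*=8
     (LLP64).  LP64 Unix targets print 8/8/8; 32-bit targets print
     4/8/4.  */
  printf ("long=%d long long=%d void*=%d\n",
          (int) sizeof (long), (int) sizeof (long long),
          (int) sizeof (void *));
  return 0;
}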
*/ +#undef LONG_TYPE_SIZE +#define LONG_TYPE_SIZE 32 + +union tree_node; +#define TREE union tree_node * + +#define drectve_section() \ + (fprintf (asm_out_file, "\t.section .drectve\n"), \ + in_section = NULL) + +/* Older versions of gas don't handle 'r' as data. + Explicitly set data flag with 'd'. */ +#define READONLY_DATA_SECTION_ASM_OP "\t.section .rdata,\"dr\"" + +/* Don't allow flag_pic to propagate since gas may produce invalid code + otherwise. */ + +#undef SUBTARGET_OVERRIDE_OPTIONS +#define SUBTARGET_OVERRIDE_OPTIONS \ +do { \ + if (TARGET_64BIT && flag_pic != 1) \ + { \ + if (flag_pic > 1) \ + warning (0, \ + "-fPIC ignored for target (all code is position independent)"\ + ); \ + flag_pic = 1; \ + } \ + else if (!TARGET_64BIT && flag_pic) \ + { \ + warning (0, "-f%s ignored for target (all code is position independent)",\ + (flag_pic > 1) ? "PIC" : "pic"); \ + flag_pic = 0; \ + } \ +} while (0) \ + +/* Define this macro if references to a symbol must be treated + differently depending on something about the variable or + function named by the symbol (such as what section it is in). + + On i386 running Windows NT, modify the assembler name with a suffix + consisting of an atsign (@) followed by string of digits that represents + the number of bytes of arguments passed to the function, if it has the + attribute STDCALL. + + In addition, we must mark dll symbols specially. Definitions of + dllexport'd objects install some info in the .drectve section. + References to dllimport'd objects are fetched indirectly via + _imp__. If both are declared, dllexport overrides. This is also + needed to implement one-only vtables: they go into their own + section and we need to set DECL_SECTION_NAME so we do that here. + Note that we can be called twice on the same decl. */ + +#define SUBTARGET_ENCODE_SECTION_INFO i386_pe_encode_section_info + +/* Output a common block. */ +#undef ASM_OUTPUT_ALIGNED_DECL_COMMON +#define ASM_OUTPUT_ALIGNED_DECL_COMMON \ + i386_pe_asm_output_aligned_decl_common + +/* Output the label for an initialized variable. */ +#undef ASM_DECLARE_OBJECT_NAME +#define ASM_DECLARE_OBJECT_NAME(STREAM, NAME, DECL) \ +do { \ + i386_pe_maybe_record_exported_symbol (DECL, NAME, 1); \ + ASM_OUTPUT_LABEL ((STREAM), (NAME)); \ +} while (0) + +/* Output a reference to a label. Fastcall function symbols + keep their '@' prefix, while other symbols are prefixed + with user_label_prefix. */ +#undef ASM_OUTPUT_LABELREF +#define ASM_OUTPUT_LABELREF(STREAM, NAME) \ +do { \ + if ((NAME)[0] != FASTCALL_PREFIX) \ + fputs (user_label_prefix, (STREAM)); \ + fputs ((NAME), (STREAM)); \ +} while (0) + +/* This does much the same in memory rather than to a stream. */ +#undef TARGET_MANGLE_ASSEMBLER_NAME +#define TARGET_MANGLE_ASSEMBLER_NAME i386_pe_mangle_assembler_name + + +/* Emit code to check the stack when allocating more than 4000 + bytes in one go. */ +#define CHECK_STACK_LIMIT 4000 + +#undef STACK_BOUNDARY +#define STACK_BOUNDARY (ix86_abi == MS_ABI ? 128 : BITS_PER_WORD) + +/* By default, target has a 80387, uses IEEE compatible arithmetic, + returns float values in the 387 and needs stack probes. + We also align doubles to 64-bits for MSVC default compatibility. 
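What the 64-bit double alignment mentioned above (MASK_ALIGN_DOUBLE in the subtarget default that follows) means for struct layout, as a sketch for 32-bit code; struct mixed is an invented example type.

#include <stdio.h>

struct mixed
{
  int i;
  double d;
};

int
main (void)
{
  /* With the traditional 4-byte i386 alignment this prints
     sizeof=12 align=4; with doubles aligned to 64 bits, as these
     targets default to, it prints sizeof=16 align=8.  */
  printf ("sizeof=%d align=%d\n",
          (int) sizeof (struct mixed),
          (int) __alignof__ (struct mixed));
  return 0;
}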
*/ + +#undef TARGET_SUBTARGET_DEFAULT +#define TARGET_SUBTARGET_DEFAULT \ + (MASK_80387 | MASK_IEEE_FP | MASK_FLOAT_RETURNS \ + | MASK_STACK_PROBE | MASK_ALIGN_DOUBLE) + +#undef TARGET_SUBTARGET64_DEFAULT +#define TARGET_SUBTARGET64_DEFAULT \ + MASK_128BIT_LONG_DOUBLE + +/* This is how to output an assembler line + that says to advance the location counter + to a multiple of 2**LOG bytes. */ + +#undef ASM_OUTPUT_ALIGN +#define ASM_OUTPUT_ALIGN(FILE,LOG) \ + if ((LOG)!=0) fprintf ((FILE), "\t.align %d\n", 1<<(LOG)) + +/* Windows uses explicit import from shared libraries. */ +#define MULTIPLE_SYMBOL_SPACES 1 + +#define TARGET_ASM_UNIQUE_SECTION i386_pe_unique_section +#define TARGET_ASM_FUNCTION_RODATA_SECTION default_no_function_rodata_section + +#define SUPPORTS_ONE_ONLY 1 + +/* Switch into a generic section. */ +#define TARGET_ASM_NAMED_SECTION i386_pe_asm_named_section + +/* Select attributes for named sections. */ +#define TARGET_SECTION_TYPE_FLAGS i386_pe_section_type_flags + +/* Write the extra assembler code needed to declare a function + properly. If we are generating SDB debugging information, this + will happen automatically, so we only need to handle other cases. */ +#undef ASM_DECLARE_FUNCTION_NAME +#define ASM_DECLARE_FUNCTION_NAME(FILE, NAME, DECL) \ + i386_pe_start_function (FILE, NAME, DECL) + +#undef ASM_DECLARE_FUNCTION_SIZE +#define ASM_DECLARE_FUNCTION_SIZE(FILE,NAME,DECL) \ + i386_pe_end_function (FILE, NAME, DECL) + +/* Add an external function to the list of functions to be declared at + the end of the file. */ +#define ASM_OUTPUT_EXTERNAL(FILE, DECL, NAME) \ + do \ + { \ + if (TREE_CODE (DECL) == FUNCTION_DECL) \ + i386_pe_record_external_function ((DECL), (NAME)); \ + } \ + while (0) + +/* Declare the type properly for any external libcall. */ +#define ASM_OUTPUT_EXTERNAL_LIBCALL(FILE, FUN) \ + i386_pe_declare_function_type (FILE, XSTR (FUN, 0), 1) + +/* This says out to put a global symbol in the BSS section. */ +#undef ASM_OUTPUT_ALIGNED_BSS +#define ASM_OUTPUT_ALIGNED_BSS(FILE, DECL, NAME, SIZE, ALIGN) \ + asm_output_aligned_bss ((FILE), (DECL), (NAME), (SIZE), (ALIGN)) + +/* Put all *tf routines in libgcc. */ +#undef LIBGCC2_HAS_TF_MODE +#define LIBGCC2_HAS_TF_MODE 1 +#define LIBGCC2_TF_CEXT q +#define TF_SIZE 113 + +/* Output function declarations at the end of the file. */ +#undef TARGET_ASM_FILE_END +#define TARGET_ASM_FILE_END i386_pe_file_end + +#undef ASM_COMMENT_START +#define ASM_COMMENT_START " #" + +#ifndef DWARF2_UNWIND_INFO +/* If configured with --disable-sjlj-exceptions, use DWARF2, else + default to SJLJ. */ +#if (defined (CONFIG_SJLJ_EXCEPTIONS) && !CONFIG_SJLJ_EXCEPTIONS) +/* The logic of this #if must be kept synchronised with the logic + for selecting the tmake_eh_file fragment in config.gcc. */ +#define DWARF2_UNWIND_INFO 1 +/* If multilib is selected break build as sjlj is required. */ +#if defined (TARGET_BI_ARCH) +#error For 64-bit windows and 32-bit based multilib version of gcc just SJLJ exceptions are supported. +#endif +#else +#define DWARF2_UNWIND_INFO 0 +#endif +#endif + +/* Don't assume anything about the header files. */ +#define NO_IMPLICIT_EXTERN_C + +#undef PROFILE_HOOK +#define PROFILE_HOOK(LABEL) \ + if (MAIN_NAME_P (DECL_NAME (current_function_decl))) \ + { \ + emit_call_insn (gen_rtx_CALL (VOIDmode, \ + gen_rtx_MEM (FUNCTION_MODE, \ + gen_rtx_SYMBOL_REF (Pmode, "_monstartup")), \ + const0_rtx)); \ + } + +/* Java Native Interface (JNI) methods on Win32 are invoked using the + stdcall calling convention. 
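The stdcall name decoration described in this header (and relied on by the JNI rule above) can be illustrated with a short function; a sketch assuming a 32-bit PE target, where the user label prefix is "_", with sum3 as a hypothetical name.

/* Expected to be emitted as the symbol _sum3@12: the usual '_'
   prefix, then '@' plus the 12 bytes of stack arguments (three
   4-byte ints).  */
int __attribute__ ((stdcall))
sum3 (int a, int b, int c)
{
  return a + b + c;
}

int
main (void)
{
  return sum3 (1, 2, 3) == 6 ? 0 : 1;
}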
*/ +#undef MODIFY_JNI_METHOD_CALL +#define MODIFY_JNI_METHOD_CALL(MDECL) \ + build_type_attribute_variant ((MDECL), \ + build_tree_list (get_identifier ("stdcall"), \ + NULL)) + +/* For Win32 ABI compatibility */ +#undef DEFAULT_PCC_STRUCT_RETURN +#define DEFAULT_PCC_STRUCT_RETURN 0 + +/* MSVC returns aggregate types of up to 8 bytes via registers. + See i386.c:ix86_return_in_memory. */ +#undef MS_AGGREGATE_RETURN +#define MS_AGGREGATE_RETURN 1 + +/* Biggest alignment supported by the object file format of this + machine. Use this macro to limit the alignment which can be + specified using the `__attribute__ ((aligned (N)))' construct. If + not defined, the default value is `BIGGEST_ALIGNMENT'. */ +/* IMAGE_SCN_ALIGN_8192BYTES is the largest section alignment flag + specified in the PECOFF60 spec. Native MS compiler also limits + user-specified alignment to 8192 bytes. */ +#undef MAX_OFILE_ALIGNMENT +#define MAX_OFILE_ALIGNMENT (8192 * 8) + +/* BIGGEST_FIELD_ALIGNMENT macro is used directly by libobjc, There, we + align internal doubles in structures on dword boundaries. Otherwise, + support vector modes using ADJUST_FIELD_ALIGN, defined in i386.h. */ +#ifdef IN_TARGET_LIBS +#undef BIGGEST_FIELD_ALIGNMENT +#define BIGGEST_FIELD_ALIGNMENT 64 +#endif + +/* A bit-field declared as `int' forces `int' alignment for the struct. */ +#undef PCC_BITFIELD_TYPE_MATTERS +#define PCC_BITFIELD_TYPE_MATTERS 1 +#define GROUP_BITFIELDS_BY_ALIGN TYPE_NATIVE(rec) + +/* Enable alias attribute support. */ +#ifndef SET_ASM_OP +#define SET_ASM_OP "\t.set\t" +#endif + +/* This implements the `alias' attribute, keeping any stdcall or + fastcall decoration. */ +#undef ASM_OUTPUT_DEF_FROM_DECLS +#define ASM_OUTPUT_DEF_FROM_DECLS(STREAM, DECL, TARGET) \ + do \ + { \ + const char *alias \ + = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (DECL)); \ + i386_pe_maybe_record_exported_symbol (DECL, alias, 0); \ + if (TREE_CODE (DECL) == FUNCTION_DECL) \ + i386_pe_declare_function_type (STREAM, alias, \ + TREE_PUBLIC (DECL)); \ + ASM_OUTPUT_DEF (STREAM, alias, IDENTIFIER_POINTER (TARGET)); \ + } while (0) + +/* GNU as supports weak symbols on PECOFF. */ +#ifdef HAVE_GAS_WEAK +#define ASM_WEAKEN_LABEL(FILE, NAME) \ + do \ + { \ + fputs ("\t.weak\t", (FILE)); \ + assemble_name ((FILE), (NAME)); \ + fputc ('\n', (FILE)); \ + } \ + while (0) +#endif /* HAVE_GAS_WEAK */ + +/* FIXME: SUPPORTS_WEAK && TARGET_HAVE_NAMED_SECTIONS is true, + but for .jcr section to work we also need crtbegin and crtend + objects. */ +#define TARGET_USE_JCR_SECTION 1 + +/* Decide whether it is safe to use a local alias for a virtual function + when constructing thunks. */ +#undef TARGET_USE_LOCAL_THUNK_ALIAS_P +#define TARGET_USE_LOCAL_THUNK_ALIAS_P(DECL) (!DECL_ONE_ONLY (DECL)) + +#define SUBTARGET_ATTRIBUTE_TABLE \ + { "selectany", 0, 0, true, false, false, ix86_handle_selectany_attribute } + /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler } */ + +/* mcount() does not need a counter variable. */ +#undef NO_PROFILE_COUNTERS +#define NO_PROFILE_COUNTERS 1 + +#define TARGET_VALID_DLLIMPORT_ATTRIBUTE_P i386_pe_valid_dllimport_attribute_p +#define TARGET_CXX_ADJUST_CLASS_AT_DEFINITION i386_pe_adjust_class_at_definition +#define TARGET_MANGLE_DECL_ASSEMBLER_NAME i386_pe_mangle_decl_assembler_name + +#undef TARGET_ASM_ASSEMBLE_VISIBILITY +#define TARGET_ASM_ASSEMBLE_VISIBILITY i386_pe_assemble_visibility + +/* Static stack checking is supported by means of probes. 
*/ +#define STACK_CHECK_STATIC_BUILTIN 1 + +#undef TREE + +#ifndef BUFSIZ +# undef FILE +#endif diff --git a/gcc/config/i386/cygming.opt b/gcc/config/i386/cygming.opt new file mode 100644 index 000000000..0fb325bde --- /dev/null +++ b/gcc/config/i386/cygming.opt @@ -0,0 +1,54 @@ +; Cygwin- and MinGW-specific options. + +; Copyright (C) 2005, 2007, 2009, 2010, 2011 Free Software Foundation, Inc. +; +; This file is part of GCC. +; +; GCC is free software; you can redistribute it and/or modify it under +; the terms of the GNU General Public License as published by the Free +; Software Foundation; either version 3, or (at your option) any later +; version. +; +; GCC is distributed in the hope that it will be useful, but WITHOUT ANY +; WARRANTY; without even the implied warranty of MERCHANTABILITY or +; FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +; for more details. +; +; You should have received a copy of the GNU General Public License +; along with GCC; see the file COPYING3. If not see +; . + +mconsole +Target RejectNegative +Create console application + +mdll +Target RejectNegative +Generate code for a DLL + +mnop-fun-dllimport +Target Report Var(TARGET_NOP_FUN_DLLIMPORT) +Ignore dllimport for functions + +mthreads +Target RejectNegative +Use Mingw-specific thread support + +mwin32 +Target +Set Windows defines + +mwindows +Target +Create GUI application + +mpe-aligned-commons +Target Var(use_pe_aligned_common) Init(HAVE_GAS_ALIGNED_COMM) +Use the GNU extension to the PE format for aligned common data + +muse-libstdc-wrappers +Target Condition({defined (USE_CYGWIN_LIBSTDCXX_WRAPPERS)}) +Compile code that relies on Cygwin DLL wrappers to support C++ operator new/delete replacement + +posix +Driver diff --git a/gcc/config/i386/cygwin-stdint.h b/gcc/config/i386/cygwin-stdint.h new file mode 100644 index 000000000..df865f717 --- /dev/null +++ b/gcc/config/i386/cygwin-stdint.h @@ -0,0 +1,62 @@ +/* Definitions for types on systems using Cygwin. + Copyright (C) 2009 Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. 
*/ + +#define SIG_ATOMIC_TYPE "int" + +/* Exact-width integer types */ + +#define INT8_TYPE "signed char" +#define INT16_TYPE "short int" +#define INT32_TYPE "int" +#define INT64_TYPE "long long int" + +#define UINT8_TYPE "unsigned char" +#define UINT16_TYPE "short unsigned int" +#define UINT32_TYPE "unsigned int" +#define UINT64_TYPE "long long unsigned int" + +/* Minimum-width integer types */ + +#define INT_LEAST8_TYPE "signed char" +#define INT_LEAST16_TYPE "short int" +#define INT_LEAST32_TYPE "int" +#define INT_LEAST64_TYPE "long long int" + +#define UINT_LEAST8_TYPE "unsigned char" +#define UINT_LEAST16_TYPE "short unsigned int" +#define UINT_LEAST32_TYPE "unsigned int" +#define UINT_LEAST64_TYPE "long long unsigned int" + +/* Fastest minimum-width integer types */ + +#define INT_FAST8_TYPE "signed char" +#define INT_FAST16_TYPE "int" +#define INT_FAST32_TYPE "int" +#define INT_FAST64_TYPE "long long int" + +#define UINT_FAST8_TYPE "unsigned char" +#define UINT_FAST16_TYPE "unsigned int" +#define UINT_FAST32_TYPE "unsigned int" +#define UINT_FAST64_TYPE "long long unsigned int" + +/* Integer types capable of holding object pointers */ + +#define INTPTR_TYPE "int" +#define UINTPTR_TYPE "unsigned int" + diff --git a/gcc/config/i386/cygwin.asm b/gcc/config/i386/cygwin.asm new file mode 100644 index 000000000..8f9c48685 --- /dev/null +++ b/gcc/config/i386/cygwin.asm @@ -0,0 +1,188 @@ +/* stuff needed for libgcc on win32. + * + * Copyright (C) 1996, 1998, 2001, 2003, 2008, 2009, 2010 + * Free Software Foundation, Inc. + * Written By Steve Chamberlain + * + * This file is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 3, or (at your option) any + * later version. + * + * This file is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Under Section 7 of GPL version 3, you are granted additional + * permissions described in the GCC Runtime Library Exception, version + * 3.1, as published by the Free Software Foundation. + * + * You should have received a copy of the GNU General Public License and + * a copy of the GCC Runtime Library Exception along with this program; + * see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + * . + */ + +#include "auto-host.h" + +#ifdef HAVE_GAS_CFI_SECTIONS_DIRECTIVE + .cfi_sections .debug_frame +# define cfi_startproc() .cfi_startproc +# define cfi_endproc() .cfi_endproc +# define cfi_adjust_cfa_offset(X) .cfi_adjust_cfa_offset X +# define cfi_def_cfa_register(X) .cfi_def_cfa_register X +# define cfi_register(D,S) .cfi_register D, S +# ifdef _WIN64 +# define cfi_push(X) .cfi_adjust_cfa_offset 8; .cfi_rel_offset X, 0 +# define cfi_pop(X) .cfi_adjust_cfa_offset -8; .cfi_restore X +# else +# define cfi_push(X) .cfi_adjust_cfa_offset 4; .cfi_rel_offset X, 0 +# define cfi_pop(X) .cfi_adjust_cfa_offset -4; .cfi_restore X +# endif +#else +# define cfi_startproc() +# define cfi_endproc() +# define cfi_adjust_cfa_offset(X) +# define cfi_def_cfa_register(X) +# define cfi_register(D,S) +# define cfi_push(X) +# define cfi_pop(X) +#endif /* HAVE_GAS_CFI_SECTIONS_DIRECTIVE */ + +#ifdef L_chkstk +/* Function prologue calls __chkstk to probe the stack when allocating more + than CHECK_STACK_LIMIT bytes in one go. 
Touching the stack at 4K + increments is necessary to ensure that the guard pages used + by the OS virtual memory manger are allocated in correct sequence. */ + + .global ___chkstk + .global __alloca +#ifdef _WIN64 +/* __alloca is a normal function call, which uses %rcx as the argument. */ + cfi_startproc() +__alloca: + movq %rcx, %rax + /* FALLTHRU */ + +/* ___chkstk is a *special* function call, which uses %rax as the argument. + We avoid clobbering the 4 integer argument registers, %rcx, %rdx, + %r8 and %r9, which leaves us with %rax, %r10, and %r11 to use. */ + .align 4 +___chkstk: + popq %r11 /* pop return address */ + cfi_adjust_cfa_offset(-8) /* indicate return address in r11 */ + cfi_register(%rip, %r11) + movq %rsp, %r10 + cmpq $0x1000, %rax /* > 4k ?*/ + jb 2f + +1: subq $0x1000, %r10 /* yes, move pointer down 4k*/ + orl $0x0, (%r10) /* probe there */ + subq $0x1000, %rax /* decrement count */ + cmpq $0x1000, %rax + ja 1b /* and do it again */ + +2: subq %rax, %r10 + movq %rsp, %rax /* hold CFA until return */ + cfi_def_cfa_register(%rax) + orl $0x0, (%r10) /* less than 4k, just peek here */ + movq %r10, %rsp /* decrement stack */ + + /* Push the return value back. Doing this instead of just + jumping to %r11 preserves the cached call-return stack + used by most modern processors. */ + pushq %r11 + ret + cfi_endproc() +#else + cfi_startproc() +___chkstk: +__alloca: + pushl %ecx /* save temp */ + cfi_push(%eax) + leal 8(%esp), %ecx /* point past return addr */ + cmpl $0x1000, %eax /* > 4k ?*/ + jb 2f + +1: subl $0x1000, %ecx /* yes, move pointer down 4k*/ + orl $0x0, (%ecx) /* probe there */ + subl $0x1000, %eax /* decrement count */ + cmpl $0x1000, %eax + ja 1b /* and do it again */ + +2: subl %eax, %ecx + orl $0x0, (%ecx) /* less than 4k, just peek here */ + movl %esp, %eax /* save current stack pointer */ + cfi_def_cfa_register(%eax) + movl %ecx, %esp /* decrement stack */ + movl (%eax), %ecx /* recover saved temp */ + + /* Copy the return register. Doing this instead of just jumping to + the address preserves the cached call-return stack used by most + modern processors. */ + pushl 4(%eax) + ret + cfi_endproc() +#endif /* _WIN64 */ +#endif /* L_chkstk */ + +#ifdef L_chkstk_ms +/* ___chkstk_ms is a *special* function call, which uses %rax as the argument. + We avoid clobbering any registers. Unlike ___chkstk, it just probes the + stack and does no stack allocation. 
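For context, a sketch of the kind of C frame that is expected to make the compiler call these probe routines: with MASK_STACK_PROBE on by default and CHECK_STACK_LIMIT set to 4000 in cygming.h above, a prologue allocating more than that goes through one of the __chkstk variants in this file (which one depends on target and compiler version). The function name below is invented.

/* A frame well past one 4 KiB page; the prologue probes it page by
   page before the stack pointer moves past it, so the OS guard page
   is always touched in order.  */
void
use_big_frame (void)
{
  volatile char buf[16384];

  buf[0] = 1;
  buf[sizeof buf - 1] = 2;
}

int
main (void)
{
  use_big_frame ();
  return 0;
}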
*/ + .global ___chkstk_ms +#ifdef _WIN64 + cfi_startproc() +___chkstk_ms: + pushq %rcx /* save temps */ + cfi_push(%rcx) + pushq %rax + cfi_push(%rax) + cmpq $0x1000, %rax /* > 4k ?*/ + leaq 24(%rsp), %rcx /* point past return addr */ + jb 2f + +1: subq $0x1000, %rcx /* yes, move pointer down 4k */ + orq $0x0, (%rcx) /* probe there */ + subq $0x1000, %rax /* decrement count */ + cmpq $0x1000, %rax + ja 1b /* and do it again */ + +2: subq %rax, %rcx + orq $0x0, (%rcx) /* less than 4k, just peek here */ + + popq %rax + cfi_pop(%rax) + popq %rcx + cfi_pop(%rcx) + ret + cfi_endproc() +#else + cfi_startproc() +___chkstk_ms: + pushl %ecx /* save temp */ + cfi_push(%ecx) + pushl %eax + cfi_push(%eax) + cmpl $0x1000, %eax /* > 4k ?*/ + leal 12(%esp), %ecx /* point past return addr */ + jb 2f + +1: subl $0x1000, %ecx /* yes, move pointer down 4k*/ + orl $0x0, (%ecx) /* probe there */ + subl $0x1000, %eax /* decrement count */ + cmpl $0x1000, %eax + ja 1b /* and do it again */ + +2: subl %eax, %ecx + orl $0x0, (%ecx) /* less than 4k, just peek here */ + + popl %eax + cfi_pop(%eax) + popl %ecx + cfi_pop(%ecx) + ret + cfi_endproc() +#endif /* _WIN64 */ +#endif /* L_chkstk_ms */ diff --git a/gcc/config/i386/cygwin.h b/gcc/config/i386/cygwin.h new file mode 100644 index 000000000..f8daeecec --- /dev/null +++ b/gcc/config/i386/cygwin.h @@ -0,0 +1,142 @@ +/* Operating system specific defines to be used when targeting GCC for + hosting on Windows32, using a Unix style C library and tools. + Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, + 2007, 2008, 2009, 2010 Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. */ + +#define TARGET_VERSION fprintf (stderr, " (x86 Cygwin)"); + +#define EXTRA_OS_CPP_BUILTINS() /* Nothing. */ + +#undef CPP_SPEC +#define CPP_SPEC "%(cpp_cpu) %{posix:-D_POSIX_SOURCE} \ + -D__CYGWIN32__ -D__CYGWIN__ %{!ansi:-Dunix} -D__unix__ -D__unix \ + %{mwin32:-DWIN32 -D_WIN32 -D__WIN32 -D__WIN32__ %{!ansi:-DWINNT}} \ + %{!nostdinc:%{!mno-win32:-idirafter ../include/w32api%s -idirafter ../../include/w32api%s}}\ +" + +#undef STARTFILE_SPEC +#define STARTFILE_SPEC "\ + %{!shared: %{!mdll: crt0%O%s \ + %{pg:gcrt0%O%s}}}\ + crtbegin.o%s" + +#undef ENDFILE_SPEC +#define ENDFILE_SPEC \ + "%{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s}\ + crtend.o%s" + +/* Normally, -lgcc is not needed since everything in it is in the DLL, but we + want to allow things to be added to it when installing new versions of + GCC without making a new CYGWIN.DLL, so we leave it. Profiling is handled + by calling the init function from main. 
*/ + +#ifdef ENABLE_SHARED_LIBGCC +#define SHARED_LIBGCC_SPEC " \ + %{static|static-libgcc:-lgcc -lgcc_eh} \ + %{!static: \ + %{!static-libgcc: \ + %{!shared: \ + %{!shared-libgcc:-lgcc -lgcc_eh} \ + %{shared-libgcc:-lgcc_s -lgcc} \ + } \ + %{shared:-lgcc_s -lgcc} \ + } \ + } " +#else +#define SHARED_LIBGCC_SPEC " -lgcc " +#endif + +#undef REAL_LIBGCC_SPEC +#define REAL_LIBGCC_SPEC SHARED_LIBGCC_SPEC + +/* We have to dynamic link to get to the system DLLs. All of libc, libm and + the Unix stuff is in cygwin.dll. The import library is called + 'libcygwin.a'. For Windows applications, include more libraries, but + always include kernel32. We'd like to specific subsystem windows to + ld, but that doesn't work just yet. */ + +#undef LIB_SPEC +#define LIB_SPEC "\ + %{pg:-lgmon} \ + -lcygwin \ + %{mwindows:-lgdi32 -lcomdlg32} \ + -ladvapi32 -lshell32 -luser32 -lkernel32" + +/* To implement C++ function replacement we always wrap the cxx + malloc-like operators. See N2800 #17.6.4.6 [replacement.functions] */ +#define CXX_WRAP_SPEC_LIST " \ + --wrap _Znwj \ + --wrap _Znaj \ + --wrap _ZdlPv \ + --wrap _ZdaPv \ + --wrap _ZnwjRKSt9nothrow_t \ + --wrap _ZnajRKSt9nothrow_t \ + --wrap _ZdlPvRKSt9nothrow_t \ + --wrap _ZdaPvRKSt9nothrow_t \ +" + +#if defined (USE_CYGWIN_LIBSTDCXX_WRAPPERS) + +#if USE_CYGWIN_LIBSTDCXX_WRAPPERS +/* Default on, only explict -mno disables. */ +#define CXX_WRAP_SPEC_OPT "!mno-use-libstdc-wrappers" +#else +/* Default off, only explict -m enables. */ +#define CXX_WRAP_SPEC_OPT "muse-libstdc-wrappers" +#endif + +#define CXX_WRAP_SPEC "%{" CXX_WRAP_SPEC_OPT ":" CXX_WRAP_SPEC_LIST "}" + +#else /* !defined (USE_CYGWIN_LIBSTDCXX_WRAPPERS) */ + +#define CXX_WRAP_SPEC "" + +#endif /* ?defined (USE_CYGWIN_LIBSTDCXX_WRAPPERS) */ + +#define LINK_SPEC "\ + %{mwindows:--subsystem windows} \ + %{mconsole:--subsystem console} \ + " CXX_WRAP_SPEC " \ + %{shared: %{mdll: %eshared and mdll are not compatible}} \ + %{shared: --shared} %{mdll:--dll} \ + %{static:-Bstatic} %{!static:-Bdynamic} \ + %{shared|mdll: --enable-auto-image-base -e __cygwin_dll_entry@12} \ + --dll-search-prefix=cyg -tsaware" + +/* Binutils does not handle weak symbols from dlls correctly. For now, + do not use them unnecessarily in gthr-posix.h. */ +#define GTHREAD_USE_WEAK 0 + +/* Every program on cygwin links against cygwin1.dll which contains + the pthread routines. There is no need to explicitly link them + and the -pthread flag is not recognized. */ +#undef GOMP_SELF_SPECS +#define GOMP_SELF_SPECS "" + +/* This matches SHLIB_SONAME and SHLIB_SOVERSION in t-cygwin. */ +#if DWARF2_UNWIND_INFO +#define LIBGCC_EH_EXTN "" +#else +#define LIBGCC_EH_EXTN "-sjlj" +#endif +#define LIBGCC_SONAME "cyggcc_s" LIBGCC_EH_EXTN "-1.dll" + +/* We should find a way to not have to update this manually. */ +#define LIBGCJ_SONAME "cyggcj" /*LIBGCC_EH_EXTN*/ "-12.dll" + diff --git a/gcc/config/i386/darwin-libgcc.10.4.ver b/gcc/config/i386/darwin-libgcc.10.4.ver new file mode 100644 index 000000000..67f5e239c --- /dev/null +++ b/gcc/config/i386/darwin-libgcc.10.4.ver @@ -0,0 +1,98 @@ +# Copyright (C) 2005 Free Software Foundation, Inc. +# +# This file is part of GCC. +# +# GCC is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. 
+# +# GCC is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with GCC; see the file COPYING3. If not see +# . +__Unwind_Backtrace +__Unwind_DeleteException +__Unwind_FindEnclosingFunction +__Unwind_Find_FDE +__Unwind_ForcedUnwind +__Unwind_GetCFA +__Unwind_GetDataRelBase +__Unwind_GetGR +__Unwind_GetIP +__Unwind_GetLanguageSpecificData +__Unwind_GetRegionStart +__Unwind_GetTextRelBase +__Unwind_RaiseException +__Unwind_Resume +__Unwind_Resume_or_Rethrow +__Unwind_SetGR +__Unwind_SetIP +___absvdi2 +___absvsi2 +___addvdi3 +___addvsi3 +___ashldi3 +___ashrdi3 +___clear_cache +___clzdi2 +___clzsi2 +___cmpdi2 +___ctzdi2 +___ctzsi2 +___deregister_frame +___deregister_frame_info +___deregister_frame_info_bases +___divdc3 +___divdi3 +___divsc3 +___divxc3 +___enable_execute_stack +___ffsdi2 +___fixdfdi +___fixsfdi +___fixunsdfdi +___fixunsdfsi +___fixunssfdi +___fixunssfsi +___fixunsxfdi +___fixunsxfsi +___fixxfdi +___floatdidf +___floatdisf +___floatdixf +___gcc_personality_v0 +___lshrdi3 +___moddi3 +___muldc3 +___muldi3 +___mulsc3 +___mulvdi3 +___mulvsi3 +___mulxc3 +___negdi2 +___negvdi2 +___negvsi2 +___paritydi2 +___paritysi2 +___popcountdi2 +___popcountsi2 +___powidf2 +___powisf2 +___powixf2 +___register_frame +___register_frame_info +___register_frame_info_bases +___register_frame_info_table +___register_frame_info_table_bases +___register_frame_table +___subvdi3 +___subvsi3 +___ucmpdi2 +___udivdi3 +___udivmoddi4 +___umoddi3 diff --git a/gcc/config/i386/darwin-libgcc.10.5.ver b/gcc/config/i386/darwin-libgcc.10.5.ver new file mode 100644 index 000000000..eeec9fbfc --- /dev/null +++ b/gcc/config/i386/darwin-libgcc.10.5.ver @@ -0,0 +1,102 @@ +# Copyright (C) 2005, 2006 Free Software Foundation, Inc. +# +# This file is part of GCC. +# +# GCC is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# GCC is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with GCC; see the file COPYING3. If not see +# . 
+__Unwind_Backtrace +__Unwind_DeleteException +__Unwind_FindEnclosingFunction +__Unwind_Find_FDE +__Unwind_ForcedUnwind +__Unwind_GetCFA +__Unwind_GetDataRelBase +__Unwind_GetGR +__Unwind_GetIP +__Unwind_GetIPInfo +__Unwind_GetLanguageSpecificData +__Unwind_GetRegionStart +__Unwind_GetTextRelBase +__Unwind_RaiseException +__Unwind_Resume +__Unwind_Resume_or_Rethrow +__Unwind_SetGR +__Unwind_SetIP +___absvdi2 +___absvsi2 +___addvdi3 +___addvsi3 +___ashldi3 +___ashrdi3 +___clear_cache +___clzdi2 +___clzsi2 +___cmpdi2 +___ctzdi2 +___ctzsi2 +___deregister_frame +___deregister_frame_info +___deregister_frame_info_bases +___divdc3 +___divdi3 +___divsc3 +___divxc3 +___enable_execute_stack +___ffsdi2 +___fixdfdi +___fixsfdi +___fixunsdfdi +___fixunsdfsi +___fixunssfdi +___fixunssfsi +___fixunsxfdi +___fixunsxfsi +___fixxfdi +___floatdidf +___floatdisf +___floatdixf +___floatundidf +___floatundisf +___floatundixf +___gcc_personality_v0 +___lshrdi3 +___moddi3 +___muldc3 +___muldi3 +___mulsc3 +___mulvdi3 +___mulvsi3 +___mulxc3 +___negdi2 +___negvdi2 +___negvsi2 +___paritydi2 +___paritysi2 +___popcountdi2 +___popcountsi2 +___powidf2 +___powisf2 +___powixf2 +___register_frame +___register_frame_info +___register_frame_info_bases +___register_frame_info_table +___register_frame_info_table_bases +___register_frame_table +___subvdi3 +___subvsi3 +___ucmpdi2 +___udivdi3 +___udivmoddi4 +___umoddi3 diff --git a/gcc/config/i386/darwin.h b/gcc/config/i386/darwin.h new file mode 100644 index 000000000..08b6c5253 --- /dev/null +++ b/gcc/config/i386/darwin.h @@ -0,0 +1,323 @@ +/* Target definitions for x86 running Darwin. + Copyright (C) 2001, 2002, 2004, 2005, 2006, 2007, 2008, 2010 + Free Software Foundation, Inc. + Contributed by Apple Computer Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. */ + +/* Enable Mach-O bits in generic x86 code. */ +#undef TARGET_MACHO +#define TARGET_MACHO 1 + +#undef DARWIN_X86 +#define DARWIN_X86 1 + +#define TARGET_VERSION fprintf (stderr, " (i686 Darwin)"); + +#undef TARGET_64BIT +#define TARGET_64BIT OPTION_ISA_64BIT + +#ifdef IN_LIBGCC2 +#undef TARGET_64BIT +#ifdef __x86_64__ +#define TARGET_64BIT 1 +#else +#define TARGET_64BIT 0 +#endif +#endif + +/* Size of the Obj-C jump buffer. */ +#define OBJC_JBLEN ((TARGET_64BIT) ? ((9 * 2) + 3 + 16) : (18)) + +#undef TARGET_FPMATH_DEFAULT +#define TARGET_FPMATH_DEFAULT (TARGET_SSE ? FPMATH_SSE : FPMATH_387) + +#define TARGET_OS_CPP_BUILTINS() \ + do \ + { \ + builtin_define ("__LITTLE_ENDIAN__"); \ + darwin_cpp_builtins (pfile); \ + } \ + while (0) + +#undef PTRDIFF_TYPE +#define PTRDIFF_TYPE (TARGET_64BIT ? "long int" : "int") + +#undef WCHAR_TYPE +#define WCHAR_TYPE "int" + +#undef WCHAR_TYPE_SIZE +#define WCHAR_TYPE_SIZE 32 + +/* Generate branch islands stubs if this is true. 
*/ +extern int darwin_emit_branch_islands; + +#undef TARGET_MACHO_BRANCH_ISLANDS +#define TARGET_MACHO_BRANCH_ISLANDS darwin_emit_branch_islands + +/* For compatibility with OSX system tools, use the new style of pic stub + if this is set. */ +#undef MACHOPIC_ATT_STUB +#define MACHOPIC_ATT_STUB (darwin_macho_att_stub) + +#undef MAX_BITS_PER_WORD +#define MAX_BITS_PER_WORD 64 + +#undef FORCE_PREFERRED_STACK_BOUNDARY_IN_MAIN +#define FORCE_PREFERRED_STACK_BOUNDARY_IN_MAIN (0) + +#undef TARGET_KEEPS_VECTOR_ALIGNED_STACK +#define TARGET_KEEPS_VECTOR_ALIGNED_STACK 1 + +/* On Darwin, the stack is 128-bit aligned at the point of every call. + Failure to ensure this will lead to a crash in the system libraries + or dynamic loader. */ +#undef STACK_BOUNDARY +#define STACK_BOUNDARY \ + ((profile_flag || (TARGET_64BIT && ix86_abi == MS_ABI)) \ + ? 128 : BITS_PER_WORD) + +#undef MAIN_STACK_BOUNDARY +#define MAIN_STACK_BOUNDARY 128 + +/* Since we'll never want a stack boundary less aligned than 128 bits + we need the extra work here otherwise bits of gcc get very grumpy + when we ask for lower alignment. We could just reject values less + than 128 bits for Darwin, but it's easier to up the alignment if + it's below the minimum. */ +#undef PREFERRED_STACK_BOUNDARY +#define PREFERRED_STACK_BOUNDARY \ + MAX (128, ix86_preferred_stack_boundary) + +/* We want -fPIC by default, unless we're using -static to compile for + the kernel or some such. */ + +#undef CC1_SPEC +#define CC1_SPEC "%(cc1_cpu) \ + %{!mkernel:%{!static:%{!mdynamic-no-pic:-fPIC}}} \ + %{!mmacosx-version-min=*:-mmacosx-version-min=%(darwin_minversion)} \ + %{g: %{!fno-eliminate-unused-debug-symbols: -feliminate-unused-debug-symbols }} " \ + DARWIN_CC1_SPEC + +#undef ASM_SPEC +#define ASM_SPEC "-arch %(darwin_arch) -force_cpusubtype_ALL \ + %{static}" + +#define DARWIN_ARCH_SPEC "%{m64:x86_64;:i386}" +#define DARWIN_SUBARCH_SPEC DARWIN_ARCH_SPEC + +/* Determine a minimum version based on compiler options. */ +#define DARWIN_MINVERSION_SPEC \ + "%{!m64|fgnu-runtime:10.4; \ + ,objective-c|,objc-cpp-output:10.5; \ + ,objective-c-header:10.5; \ + ,objective-c++|,objective-c++-cpp-output:10.5; \ + ,objective-c++-header|,objc++-cpp-output:10.5; \ + :10.4}" + +#undef ENDFILE_SPEC +#define ENDFILE_SPEC \ + "%{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s} \ + %{mpc32:crtprec32.o%s} \ + %{mpc64:crtprec64.o%s} \ + %{mpc80:crtprec80.o%s}" + +#undef SUBTARGET_EXTRA_SPECS +#define SUBTARGET_EXTRA_SPECS \ + DARWIN_EXTRA_SPECS \ + { "darwin_arch", DARWIN_ARCH_SPEC }, \ + { "darwin_crt2", "" }, \ + { "darwin_subarch", DARWIN_SUBARCH_SPEC }, + +/* The Darwin assembler mostly follows AT&T syntax. */ +#undef ASSEMBLER_DIALECT +#define ASSEMBLER_DIALECT ASM_ATT + +/* Define macro used to output shift-double opcodes when the shift + count is in %cl. Some assemblers require %cl as an argument; + some don't. This macro controls what to do: by default, don't + print %cl. */ + +#define SHIFT_DOUBLE_OMITS_COUNT 0 + +/* Put all *tf routines in libgcc. */ +#undef LIBGCC2_HAS_TF_MODE +#define LIBGCC2_HAS_TF_MODE 1 +#define LIBGCC2_TF_CEXT q +#define TF_SIZE 113 + +#undef TARGET_ASM_FILE_END +#define TARGET_ASM_FILE_END darwin_file_end + +/* Define the syntax of pseudo-ops, labels and comments. */ + +/* String containing the assembler's comment-starter. */ + +#define ASM_COMMENT_START "#" + +/* By default, target has a 80387, uses IEEE compatible arithmetic, + and returns float values in the 387. 
*/ + +#undef TARGET_SUBTARGET_DEFAULT +#define TARGET_SUBTARGET_DEFAULT (MASK_80387 | MASK_IEEE_FP | MASK_FLOAT_RETURNS | MASK_128BIT_LONG_DOUBLE) + +/* For darwin we want to target specific processor features as a minimum, + but these unfortunately don't correspond to a specific processor. */ +#undef TARGET_SUBTARGET32_ISA_DEFAULT +#define TARGET_SUBTARGET32_ISA_DEFAULT (OPTION_MASK_ISA_MMX \ + | OPTION_MASK_ISA_SSE \ + | OPTION_MASK_ISA_SSE2 \ + | OPTION_MASK_ISA_SSE3) + +#undef TARGET_SUBTARGET64_ISA_DEFAULT +#define TARGET_SUBTARGET64_ISA_DEFAULT TARGET_SUBTARGET32_ISA_DEFAULT + +#undef GOT_SYMBOL_NAME +#define GOT_SYMBOL_NAME MACHOPIC_FUNCTION_BASE_NAME + +/* Define the syntax of pseudo-ops, labels and comments. */ + +#define LPREFIX "L" + +/* Assembler pseudos to introduce constants of various size. */ + +#define ASM_BYTE "\t.byte\t" +#define ASM_SHORT "\t.word\t" +#define ASM_LONG "\t.long\t" +#define ASM_QUAD "\t.quad\t" + +#define SUBTARGET_ENCODE_SECTION_INFO darwin_encode_section_info + +#undef ASM_OUTPUT_ALIGN +#define ASM_OUTPUT_ALIGN(FILE,LOG) \ + do { if ((LOG) != 0) \ + { \ + if (in_section == text_section) \ + fprintf (FILE, "\t%s %d,0x90\n", ALIGN_ASM_OP, (LOG)); \ + else \ + fprintf (FILE, "\t%s %d\n", ALIGN_ASM_OP, (LOG)); \ + } \ + } while (0) + +/* Darwin profiling -- call mcount. */ +#undef FUNCTION_PROFILER +#define FUNCTION_PROFILER(FILE, LABELNO) \ + do { \ + if (TARGET_MACHO_BRANCH_ISLANDS \ + && MACHOPIC_INDIRECT && !TARGET_64BIT) \ + { \ + const char *name = machopic_mcount_stub_name (); \ + fprintf (FILE, "\tcall %s\n", name+1); /* skip '&' */ \ + machopic_validate_stub_or_non_lazy_ptr (name); \ + } \ + else fprintf (FILE, "\tcall mcount\n"); \ + } while (0) + +#define C_COMMON_OVERRIDE_OPTIONS \ + do { \ + SUBTARGET_C_COMMON_OVERRIDE_OPTIONS; \ + } while (0) + +#undef SUBTARGET_OVERRIDE_OPTIONS +#define SUBTARGET_OVERRIDE_OPTIONS \ +do { \ + if (TARGET_64BIT && MACHO_DYNAMIC_NO_PIC_P) \ + target_flags &= ~MASK_MACHO_DYNAMIC_NO_PIC; \ +} while (0) + +/* Darwin on x86_64 uses dwarf-2 by default. Pre-darwin9 32-bit + compiles default to stabs+. darwin9+ defaults to dwarf-2. */ +#ifndef DARWIN_PREFER_DWARF +#undef PREFERRED_DEBUGGING_TYPE +#define PREFERRED_DEBUGGING_TYPE (TARGET_64BIT ? DWARF2_DEBUG : DBX_DEBUG) +#endif + +/* Darwin uses the standard DWARF register numbers but the default + register numbers for STABS. Fortunately for 64-bit code the + default and the standard are the same. */ +#undef DBX_REGISTER_NUMBER +#define DBX_REGISTER_NUMBER(n) \ + (TARGET_64BIT ? dbx64_register_map[n] \ + : write_symbols == DWARF2_DEBUG ? svr4_dbx_register_map[n] \ + : dbx_register_map[n]) + +/* Unfortunately, the 32-bit EH information also doesn't use the standard + DWARF register numbers. */ +#define DWARF2_FRAME_REG_OUT(n, for_eh) \ + (! (for_eh) || write_symbols != DWARF2_DEBUG || TARGET_64BIT ? (n) \ + : (n) == 5 ? 4 \ + : (n) == 4 ? 5 \ + : (n) >= 11 && (n) <= 18 ? (n) + 1 \ + : (n)) + +#undef REGISTER_SUBTARGET_PRAGMAS +#define REGISTER_SUBTARGET_PRAGMAS() DARWIN_REGISTER_TARGET_PRAGMAS() + +#undef TARGET_SET_DEFAULT_TYPE_ATTRIBUTES +#define TARGET_SET_DEFAULT_TYPE_ATTRIBUTES darwin_set_default_type_attributes + +/* For 64-bit, we need to add 4 because @GOTPCREL is relative to the + end of the instruction, but without the 4 we'd only have the right + address for the start of the instruction. 
*/ +#undef ASM_MAYBE_OUTPUT_ENCODED_ADDR_RTX +#define ASM_MAYBE_OUTPUT_ENCODED_ADDR_RTX(FILE, ENCODING, SIZE, ADDR, DONE) \ + if (TARGET_64BIT) \ + { \ + if ((SIZE) == 4 && ((ENCODING) & 0x70) == DW_EH_PE_pcrel) \ + { \ + fputs (ASM_LONG, FILE); \ + assemble_name (FILE, XSTR (ADDR, 0)); \ + fputs ("+4@GOTPCREL", FILE); \ + goto DONE; \ + } \ + } \ + else \ + { \ + if (ENCODING == ASM_PREFERRED_EH_DATA_FORMAT (2, 1)) \ + { \ + darwin_non_lazy_pcrel (FILE, ADDR); \ + goto DONE; \ + } \ + } + +/* This needs to move since i386 uses the first flag and other flags are + used in Mach-O. */ +#undef MACHO_SYMBOL_FLAG_VARIABLE +#define MACHO_SYMBOL_FLAG_VARIABLE ((SYMBOL_FLAG_MACH_DEP) << 3) + +#undef MACHOPIC_NL_SYMBOL_PTR_SECTION +#define MACHOPIC_NL_SYMBOL_PTR_SECTION \ + ".section __IMPORT,__pointers,non_lazy_symbol_pointers" + +#define SUBTARGET32_DEFAULT_CPU "i686" + +#undef SUBTARGET_INIT_BUILTINS +#define SUBTARGET_INIT_BUILTINS \ +do { \ + ix86_builtins[(int) IX86_BUILTIN_CFSTRING] \ + = darwin_init_cfstring_builtins ((unsigned) (IX86_BUILTIN_CFSTRING)); \ + darwin_rename_builtins (); \ +} while(0) + +/* The system ___divdc3 routine in libSystem on darwin10 is not + accurate to 1ulp, ours is, so we avoid ever using the system name + for this routine and instead install a non-conflicting name that is + accurate. See darwin_rename_builtins. */ +#ifdef L_divdc3 +#define DECLARE_LIBRARY_RENAMES \ + asm(".text; ___divdc3: jmp ___ieee_divdc3 ; .globl ___divdc3"); +#endif diff --git a/gcc/config/i386/darwin64.h b/gcc/config/i386/darwin64.h new file mode 100644 index 000000000..9562faa90 --- /dev/null +++ b/gcc/config/i386/darwin64.h @@ -0,0 +1,35 @@ +/* Target definitions for x86_64 running Darwin. + Copyright (C) 2006, 2007 Free Software Foundation, Inc. + Contributed by Apple Computer Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. */ + +#undef TARGET_VERSION +#define TARGET_VERSION fprintf (stderr, " (x86_64 Darwin)"); + +#undef DARWIN_ARCH_SPEC +#define DARWIN_ARCH_SPEC "%{m32:i386;:x86_64}" + +#undef DARWIN_SUBARCH_SPEC +#define DARWIN_SUBARCH_SPEC DARWIN_ARCH_SPEC + +#undef SUBTARGET_EXTRA_SPECS +#define SUBTARGET_EXTRA_SPECS \ + DARWIN_EXTRA_SPECS \ + { "darwin_arch", DARWIN_ARCH_SPEC }, \ + { "darwin_crt2", "" }, \ + { "darwin_subarch", DARWIN_SUBARCH_SPEC }, diff --git a/gcc/config/i386/djgpp-stdint.h b/gcc/config/i386/djgpp-stdint.h new file mode 100644 index 000000000..8fd3a2565 --- /dev/null +++ b/gcc/config/i386/djgpp-stdint.h @@ -0,0 +1,62 @@ +/* Definitions for types on systems using DJGPP. + Copyright (C) 2009 Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. 
+ +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. */ + +#define SIG_ATOMIC_TYPE "int" + +/* Exact-width integer types */ + +#define INT8_TYPE "signed char" +#define INT16_TYPE "signed short int" +#define INT32_TYPE "signed long int" +#define INT64_TYPE "signed long long int" + +#define UINT8_TYPE "unsigned char" +#define UINT16_TYPE "short unsigned int" +#define UINT32_TYPE "long unsigned int" +#define UINT64_TYPE "long long unsigned int" + +/* Minimum-width integer types */ + +#define INT_LEAST8_TYPE "signed char" +#define INT_LEAST16_TYPE "signed short int" +#define INT_LEAST32_TYPE "signed int" +#define INT_LEAST64_TYPE "signed long long int" + +#define UINT_LEAST8_TYPE "unsigned char" +#define UINT_LEAST16_TYPE "short unsigned int" +#define UINT_LEAST32_TYPE "unsigned int" +#define UINT_LEAST64_TYPE "long long unsigned int" + +/* Fastest minimum-width integer types */ + +#define INT_FAST8_TYPE "signed char" +#define INT_FAST16_TYPE "signed int" +#define INT_FAST32_TYPE "signed int" +#define INT_FAST64_TYPE "long long signed int" + +#define UINT_FAST8_TYPE "unsigned char" +#define UINT_FAST16_TYPE "unsigned int" +#define UINT_FAST32_TYPE "unsigned int" +#define UINT_FAST64_TYPE "long long unsigned int" + +/* Integer types capable of holding object pointers */ + +#define INTPTR_TYPE "long int" +#define UINTPTR_TYPE "long unsigned int" + diff --git a/gcc/config/i386/djgpp.h b/gcc/config/i386/djgpp.h new file mode 100644 index 000000000..34a15facb --- /dev/null +++ b/gcc/config/i386/djgpp.h @@ -0,0 +1,182 @@ +/* Configuration for an i386 running MS-DOS with DJGPP. + Copyright (C) 1997, 1998, 1999, 2000, 2001, 2002, 2004, 2005, 2007, + 2010 Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. */ + +/* Support generation of DWARF2 debugging info. */ +#define DWARF2_DEBUGGING_INFO 1 + +/* Don't assume anything about the header files. */ +#define NO_IMPLICIT_EXTERN_C + +/* If defined, a C expression whose value is a string containing the + assembler operation to identify the following data as + uninitialized global data. If not defined, and neither + `ASM_OUTPUT_BSS' nor `ASM_OUTPUT_ALIGNED_BSS' are defined, + uninitialized global data will be output in the data section if + `-fno-common' is passed, otherwise `ASM_OUTPUT_COMMON' will be + used. */ +#undef BSS_SECTION_ASM_OP +#define BSS_SECTION_ASM_OP "\t.section\t.bss" + +/* Define the name of the .data section. */ +#undef DATA_SECTION_ASM_OP +#define DATA_SECTION_ASM_OP "\t.section .data" + +/* Define the name of the .ident op. */ +#undef IDENT_ASM_OP +#define IDENT_ASM_OP "\t.ident\t" + +/* Enable alias attribute support. 
*/ +#ifndef SET_ASM_OP +#define SET_ASM_OP "\t.set\t" +#endif + +/* Define the name of the .text section. */ +#undef TEXT_SECTION_ASM_OP +#define TEXT_SECTION_ASM_OP "\t.section .text" + +/* Define standard DJGPP installation paths. */ +/* We override default /usr or /usr/local part with /dev/env/DJDIR which */ +/* points to actual DJGPP installation directory. */ + +/* Standard include directory */ +#undef STANDARD_INCLUDE_DIR +#define STANDARD_INCLUDE_DIR "/dev/env/DJDIR/include/" + +/* Search for as.exe and ld.exe in DJGPP's binary directory. */ +#undef MD_EXEC_PREFIX +#define MD_EXEC_PREFIX "/dev/env/DJDIR/bin/" + +/* Standard DJGPP library and startup files */ +#undef MD_STARTFILE_PREFIX +#define MD_STARTFILE_PREFIX "/dev/env/DJDIR/lib/" + +/* Correctly handle absolute filename detection in cp/xref.c */ +#define FILE_NAME_ABSOLUTE_P(NAME) \ + (((NAME)[0] == '/') || ((NAME)[0] == '\\') || \ + (((NAME)[0] >= 'A') && ((NAME)[0] <= 'z') && ((NAME)[1] == ':'))) + +#define TARGET_OS_CPP_BUILTINS() \ + do \ + { \ + builtin_define_std ("MSDOS"); \ + builtin_define_std ("GO32"); \ + builtin_assert ("system=msdos"); \ + } \ + while (0) + +/* Include so __DJGPP__ and __DJGPP_MINOR__ are defined. */ +#undef CPP_SPEC +#define CPP_SPEC "-remap %{posix:-D_POSIX_SOURCE} \ + -imacros %s../include/sys/version.h" + +/* We need to override link_command_spec in gcc.c so support -Tdjgpp.djl. + This cannot be done in LINK_SPECS as that LINK_SPECS is processed + before library search directories are known by the linker. + This avoids problems when specs file is not available. An alternate way, + suggested by Robert Hoehne, is to use SUBTARGET_EXTRA_SPECS instead. +*/ + +#undef LINK_COMMAND_SPEC +#define LINK_COMMAND_SPEC \ +"%{!fsyntax-only: \ +%{!c:%{!M:%{!MM:%{!E:%{!S:%(linker) %l %X %{o*} %{e*} %{N} %{n} \ +\t%{r} %{s} %{t} %{u*} %{z} %{Z}\ +\t%{!nostdlib:%{!nostartfiles:%S}}\ +\t%{static:} %{L*} %D %o\ +\t%{!nostdlib:%{!nodefaultlibs:%G %L %G}}\ +\t%{!nostdlib:%{!nostartfiles:%E}}\ +\t-Tdjgpp.djl %{T*}}}}}}}\n\ +%{!c:%{!M:%{!MM:%{!E:%{!S:stubify %{v} %{o*:%*} %{!o*:a.out} }}}}}" + +/* Always just link in 'libc.a'. */ +#undef LIB_SPEC +#define LIB_SPEC "-lc" + +/* Pick the right startup code depending on the -pg flag. */ +#undef STARTFILE_SPEC +#define STARTFILE_SPEC "%{pg:gcrt0.o%s}%{!pg:crt0.o%s}" + +/* Make sure that gcc will not look for .h files in /usr/local/include + unless user explicitly requests it. */ +#undef LOCAL_INCLUDE_DIR + +/* Switch into a generic section. */ +#define TARGET_ASM_NAMED_SECTION default_coff_asm_named_section + +/* This is how to output an assembler line + that says to advance the location counter + to a multiple of 2**LOG bytes. */ + +#undef ASM_OUTPUT_ALIGN +#define ASM_OUTPUT_ALIGN(FILE,LOG) \ + if ((LOG) != 0) fprintf ((FILE), "\t.p2align %d\n", LOG) + +/* This is how to output a global symbol in the BSS section. */ +#undef ASM_OUTPUT_ALIGNED_BSS +#define ASM_OUTPUT_ALIGNED_BSS(FILE, DECL, NAME, SIZE, ALIGN) \ + asm_output_aligned_bss ((FILE), (DECL), (NAME), (SIZE), (ALIGN)) + +/* This is how to tell assembler that a symbol is weak */ +#undef ASM_WEAKEN_LABEL +#define ASM_WEAKEN_LABEL(FILE,NAME) \ + do { fputs ("\t.weak\t", FILE); assemble_name (FILE, NAME); \ + fputc ('\n', FILE); } while (0) + +/* djgpp automatically calls its own version of __main, so don't define one + in libgcc, nor call one in main(). */ +#define HAS_INIT_SECTION + +/* Definitions for types and sizes. Wide characters are 16-bits long so + Win32 compiler add-ons will be wide character compatible. 
*/ +#undef WCHAR_TYPE_SIZE +#define WCHAR_TYPE_SIZE 16 + +#undef WCHAR_TYPE +#define WCHAR_TYPE "short unsigned int" + +#undef WINT_TYPE +#define WINT_TYPE "int" + +#undef SIZE_TYPE +#define SIZE_TYPE "long unsigned int" + +#undef PTRDIFF_TYPE +#define PTRDIFF_TYPE "int" + +/* Used to be defined in xm-djgpp.h, but moved here for cross-compilers. */ +#define LIBSTDCXX "stdcxx" + +#define TARGET_VERSION fprintf (stderr, " (80386, MS-DOS DJGPP)"); + +/* Warn that -mbnu210 is now obsolete. */ +#undef SUBTARGET_OVERRIDE_OPTIONS +#define SUBTARGET_OVERRIDE_OPTIONS \ +do \ + { \ + if (TARGET_BNU210) \ + { \ + warning (0, "-mbnu210 is ignored (option is obsolete)"); \ + } \ + } \ +while (0) + +/* Support for C++ templates. */ +#undef MAKE_DECL_ONE_ONLY +#define MAKE_DECL_ONE_ONLY(DECL) (DECL_WEAK (DECL) = 1) diff --git a/gcc/config/i386/djgpp.opt b/gcc/config/i386/djgpp.opt new file mode 100644 index 000000000..7e4affca9 --- /dev/null +++ b/gcc/config/i386/djgpp.opt @@ -0,0 +1,28 @@ +; DJGPP-specific options. + +; Copyright (C) 2005, 2007, 2011 Free Software Foundation, Inc. +; +; This file is part of GCC. +; +; GCC is free software; you can redistribute it and/or modify it under +; the terms of the GNU General Public License as published by the Free +; Software Foundation; either version 3, or (at your option) any later +; version. +; +; GCC is distributed in the hope that it will be useful, but WITHOUT ANY +; WARRANTY; without even the implied warranty of MERCHANTABILITY or +; FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +; for more details. +; +; You should have received a copy of the GNU General Public License +; along with GCC; see the file COPYING3. If not see +; . + +;; -mbnu210 is now ignored and obsolete. It was used to enable support for +;; weak symbols, and .gnu.linkonce support. +mbnu210 +Target Var(TARGET_BNU210) +Ignored (obsolete) + +posix +Driver diff --git a/gcc/config/i386/driver-i386.c b/gcc/config/i386/driver-i386.c new file mode 100644 index 000000000..69128c58c --- /dev/null +++ b/gcc/config/i386/driver-i386.c @@ -0,0 +1,769 @@ +/* Subroutines for the gcc driver. + Copyright (C) 2006-2012 Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. */ + +#include "config.h" +#include "system.h" +#include "coretypes.h" +#include "tm.h" + +const char *host_detect_local_cpu (int argc, const char **argv); + +#ifdef __GNUC__ +#include "cpuid.h" + +struct cache_desc +{ + unsigned sizekb; + unsigned assoc; + unsigned line; +}; + +/* Returns command line parameters that describe size and + cache line size of the processor caches. */ + +static char * +describe_cache (struct cache_desc level1, struct cache_desc level2) +{ + char size[100], line[100], size2[100]; + + /* At the moment, gcc does not use the information + about the associativity of the cache. 
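As a purely illustrative example with made-up numbers: a 32 KB L1 cache with 64-byte lines plus a 2048 KB L2 cache would make this function return "--param l1-cache-size=32 --param l1-cache-line-size=64 --param l2-cache-size=2048 ", which ends up at the front of the string the driver substitutes for -mtune=native.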
*/ + + snprintf (size, sizeof (size), + "--param l1-cache-size=%u ", level1.sizekb); + snprintf (line, sizeof (line), + "--param l1-cache-line-size=%u ", level1.line); + + snprintf (size2, sizeof (size2), + "--param l2-cache-size=%u ", level2.sizekb); + + return concat (size, line, size2, NULL); +} + +/* Detect L2 cache parameters using CPUID extended function 0x80000006. */ + +static void +detect_l2_cache (struct cache_desc *level2) +{ + unsigned eax, ebx, ecx, edx; + unsigned assoc; + + __cpuid (0x80000006, eax, ebx, ecx, edx); + + level2->sizekb = (ecx >> 16) & 0xffff; + level2->line = ecx & 0xff; + + assoc = (ecx >> 12) & 0xf; + if (assoc == 6) + assoc = 8; + else if (assoc == 8) + assoc = 16; + else if (assoc >= 0xa && assoc <= 0xc) + assoc = 32 + (assoc - 0xa) * 16; + else if (assoc >= 0xd && assoc <= 0xe) + assoc = 96 + (assoc - 0xd) * 32; + + level2->assoc = assoc; +} + +/* Returns the description of caches for an AMD processor. */ + +static const char * +detect_caches_amd (unsigned max_ext_level) +{ + unsigned eax, ebx, ecx, edx; + + struct cache_desc level1, level2 = {0, 0, 0}; + + if (max_ext_level < 0x80000005) + return ""; + + __cpuid (0x80000005, eax, ebx, ecx, edx); + + level1.sizekb = (ecx >> 24) & 0xff; + level1.assoc = (ecx >> 16) & 0xff; + level1.line = ecx & 0xff; + + if (max_ext_level >= 0x80000006) + detect_l2_cache (&level2); + + return describe_cache (level1, level2); +} + +/* Decodes the size, the associativity and the cache line size of + L1/L2 caches of an Intel processor. Values are based on + "Intel Processor Identification and the CPUID Instruction" + [Application Note 485], revision -032, December 2007. */ + +static void +decode_caches_intel (unsigned reg, bool xeon_mp, + struct cache_desc *level1, struct cache_desc *level2) +{ + int i; + + for (i = 24; i >= 0; i -= 8) + switch ((reg >> i) & 0xff) + { + case 0x0a: + level1->sizekb = 8; level1->assoc = 2; level1->line = 32; + break; + case 0x0c: + level1->sizekb = 16; level1->assoc = 4; level1->line = 32; + break; + case 0x2c: + level1->sizekb = 32; level1->assoc = 8; level1->line = 64; + break; + case 0x39: + level2->sizekb = 128; level2->assoc = 4; level2->line = 64; + break; + case 0x3a: + level2->sizekb = 192; level2->assoc = 6; level2->line = 64; + break; + case 0x3b: + level2->sizekb = 128; level2->assoc = 2; level2->line = 64; + break; + case 0x3c: + level2->sizekb = 256; level2->assoc = 4; level2->line = 64; + break; + case 0x3d: + level2->sizekb = 384; level2->assoc = 6; level2->line = 64; + break; + case 0x3e: + level2->sizekb = 512; level2->assoc = 4; level2->line = 64; + break; + case 0x41: + level2->sizekb = 128; level2->assoc = 4; level2->line = 32; + break; + case 0x42: + level2->sizekb = 256; level2->assoc = 4; level2->line = 32; + break; + case 0x43: + level2->sizekb = 512; level2->assoc = 4; level2->line = 32; + break; + case 0x44: + level2->sizekb = 1024; level2->assoc = 4; level2->line = 32; + break; + case 0x45: + level2->sizekb = 2048; level2->assoc = 4; level2->line = 32; + break; + case 0x49: + if (xeon_mp) + break; + level2->sizekb = 4096; level2->assoc = 16; level2->line = 64; + break; + case 0x4e: + level2->sizekb = 6144; level2->assoc = 24; level2->line = 64; + break; + case 0x60: + level1->sizekb = 16; level1->assoc = 8; level1->line = 64; + break; + case 0x66: + level1->sizekb = 8; level1->assoc = 4; level1->line = 64; + break; + case 0x67: + level1->sizekb = 16; level1->assoc = 4; level1->line = 64; + break; + case 0x68: + level1->sizekb = 32; level1->assoc = 4; level1->line = 
64; + break; + case 0x78: + level2->sizekb = 1024; level2->assoc = 4; level2->line = 64; + break; + case 0x79: + level2->sizekb = 128; level2->assoc = 8; level2->line = 64; + break; + case 0x7a: + level2->sizekb = 256; level2->assoc = 8; level2->line = 64; + break; + case 0x7b: + level2->sizekb = 512; level2->assoc = 8; level2->line = 64; + break; + case 0x7c: + level2->sizekb = 1024; level2->assoc = 8; level2->line = 64; + break; + case 0x7d: + level2->sizekb = 2048; level2->assoc = 8; level2->line = 64; + break; + case 0x7f: + level2->sizekb = 512; level2->assoc = 2; level2->line = 64; + break; + case 0x82: + level2->sizekb = 256; level2->assoc = 8; level2->line = 32; + break; + case 0x83: + level2->sizekb = 512; level2->assoc = 8; level2->line = 32; + break; + case 0x84: + level2->sizekb = 1024; level2->assoc = 8; level2->line = 32; + break; + case 0x85: + level2->sizekb = 2048; level2->assoc = 8; level2->line = 32; + break; + case 0x86: + level2->sizekb = 512; level2->assoc = 4; level2->line = 64; + break; + case 0x87: + level2->sizekb = 1024; level2->assoc = 8; level2->line = 64; + + default: + break; + } +} + +/* Detect cache parameters using CPUID function 2. */ + +static void +detect_caches_cpuid2 (bool xeon_mp, + struct cache_desc *level1, struct cache_desc *level2) +{ + unsigned regs[4]; + int nreps, i; + + __cpuid (2, regs[0], regs[1], regs[2], regs[3]); + + nreps = regs[0] & 0x0f; + regs[0] &= ~0x0f; + + while (--nreps >= 0) + { + for (i = 0; i < 4; i++) + if (regs[i] && !((regs[i] >> 31) & 1)) + decode_caches_intel (regs[i], xeon_mp, level1, level2); + + if (nreps) + __cpuid (2, regs[0], regs[1], regs[2], regs[3]); + } +} + +/* Detect cache parameters using CPUID function 4. This + method doesn't require hardcoded tables. */ + +enum cache_type +{ + CACHE_END = 0, + CACHE_DATA = 1, + CACHE_INST = 2, + CACHE_UNIFIED = 3 +}; + +static void +detect_caches_cpuid4 (struct cache_desc *level1, struct cache_desc *level2, + struct cache_desc *level3) +{ + struct cache_desc *cache; + + unsigned eax, ebx, ecx, edx; + int count; + + for (count = 0;; count++) + { + __cpuid_count(4, count, eax, ebx, ecx, edx); + switch (eax & 0x1f) + { + case CACHE_END: + return; + case CACHE_DATA: + case CACHE_UNIFIED: + { + switch ((eax >> 5) & 0x07) + { + case 1: + cache = level1; + break; + case 2: + cache = level2; + break; + case 3: + cache = level3; + break; + default: + cache = NULL; + } + + if (cache) + { + unsigned sets = ecx + 1; + unsigned part = ((ebx >> 12) & 0x03ff) + 1; + + cache->assoc = ((ebx >> 22) & 0x03ff) + 1; + cache->line = (ebx & 0x0fff) + 1; + + cache->sizekb = (cache->assoc * part + * cache->line * sets) / 1024; + } + } + default: + break; + } + } +} + +/* Returns the description of caches for an Intel processor. */ + +static const char * +detect_caches_intel (bool xeon_mp, unsigned max_level, + unsigned max_ext_level, unsigned *l2sizekb) +{ + struct cache_desc level1 = {0, 0, 0}, level2 = {0, 0, 0}, level3 = {0, 0, 0}; + + if (max_level >= 4) + detect_caches_cpuid4 (&level1, &level2, &level3); + else if (max_level >= 2) + detect_caches_cpuid2 (xeon_mp, &level1, &level2); + else + return ""; + + if (level1.sizekb == 0) + return ""; + + /* Let the L3 replace the L2. This assumes inclusive caches + and single threaded program for now. */ + if (level3.sizekb) + level2 = level3; + + /* Intel CPUs are equipped with AMD style L2 cache info. Try this + method if other methods fail to provide L2 cache parameters. 
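(detect_l2_cache above reads CPUID leaf 0x80000006 and decodes ECX: bits 31:16 give the L2 size in KB, bits 15:12 an encoded associativity, and bits 7:0 the line size in bytes.)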
*/ + if (level2.sizekb == 0 && max_ext_level >= 0x80000006) + detect_l2_cache (&level2); + + *l2sizekb = level2.sizekb; + + return describe_cache (level1, level2); +} + +enum vendor_signatures +{ + SIG_INTEL = 0x756e6547 /* Genu */, + SIG_AMD = 0x68747541 /* Auth */ +}; + +enum processor_signatures +{ + SIG_GEODE = 0x646f6547 /* Geod */ +}; + +/* This will be called by the spec parser in gcc.c when it sees + a %:local_cpu_detect(args) construct. Currently it will be called + with either "arch" or "tune" as argument depending on if -march=native + or -mtune=native is to be substituted. + + It returns a string containing new command line parameters to be + put at the place of the above two options, depending on what CPU + this is executed. E.g. "-march=k8" on an AMD64 machine + for -march=native. + + ARGC and ARGV are set depending on the actual arguments given + in the spec. */ + +const char *host_detect_local_cpu (int argc, const char **argv) +{ + enum processor_type processor = PROCESSOR_I386; + const char *cpu = "i386"; + + const char *cache = ""; + const char *options = ""; + + unsigned int eax, ebx, ecx, edx; + + unsigned int max_level, ext_level; + + unsigned int vendor; + unsigned int model, family; + + unsigned int has_sse3, has_ssse3, has_cmpxchg16b; + unsigned int has_cmpxchg8b, has_cmov, has_mmx, has_sse, has_sse2; + + /* Extended features */ + unsigned int has_lahf_lm = 0, has_sse4a = 0; + unsigned int has_longmode = 0, has_3dnowp = 0, has_3dnow = 0; + unsigned int has_movbe = 0, has_sse4_1 = 0, has_sse4_2 = 0; + unsigned int has_popcnt = 0, has_aes = 0, has_avx = 0; + unsigned int has_pclmul = 0, has_abm = 0, has_lwp = 0; + unsigned int has_fma = 0, has_fma4 = 0, has_xop = 0; + unsigned int has_bmi = 0, has_tbm = 0; + unsigned int has_rdrnd = 0, has_f16c = 0, has_fsgsbase = 0; + unsigned int has_osxsave = 0; + + bool arch; + + unsigned int l2sizekb = 0; + + if (argc < 1) + return NULL; + + arch = !strcmp (argv[0], "arch"); + + if (!arch && strcmp (argv[0], "tune")) + return NULL; + + max_level = __get_cpuid_max (0, &vendor); + if (max_level < 1) + goto done; + + __cpuid (1, eax, ebx, ecx, edx); + + model = (eax >> 4) & 0x0f; + family = (eax >> 8) & 0x0f; + if (vendor == SIG_INTEL) + { + unsigned int extended_model, extended_family; + + extended_model = (eax >> 12) & 0xf0; + extended_family = (eax >> 20) & 0xff; + if (family == 0x0f) + { + family += extended_family; + model += extended_model; + } + else if (family == 0x06) + model += extended_model; + } + + has_sse3 = ecx & bit_SSE3; + has_ssse3 = ecx & bit_SSSE3; + has_sse4_1 = ecx & bit_SSE4_1; + has_sse4_2 = ecx & bit_SSE4_2; + has_avx = ecx & bit_AVX; + has_osxsave = ecx & bit_OSXSAVE; + has_cmpxchg16b = ecx & bit_CMPXCHG16B; + has_movbe = ecx & bit_MOVBE; + has_popcnt = ecx & bit_POPCNT; + has_aes = ecx & bit_AES; + has_pclmul = ecx & bit_PCLMUL; + has_fma = ecx & bit_FMA; + has_f16c = ecx & bit_F16C; + has_rdrnd = ecx & bit_RDRND; + + has_cmpxchg8b = edx & bit_CMPXCHG8B; + has_cmov = edx & bit_CMOV; + has_mmx = edx & bit_MMX; + has_sse = edx & bit_SSE; + has_sse2 = edx & bit_SSE2; + + if (max_level >= 7) + { + __cpuid_count (7, 0, eax, ebx, ecx, edx); + + has_fsgsbase = ebx & bit_FSGSBASE; + } + + /* Get XCR_XFEATURE_ENABLED_MASK register with xgetbv. 
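The .byte sequence below, 0x0f 0x01 0xd0, is the raw encoding of the xgetbv instruction, presumably spelled out so the file still assembles with tools that do not know the mnemonic; with ECX selecting XCR0 it returns the enabled-state mask in EDX:EAX, and the XSTATE_SSE and XSTATE_YMM bits are checked before any of the AVX/FMA/FMA4/XOP flags are trusted.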
*/ +#define XCR_XFEATURE_ENABLED_MASK 0x0 +#define XSTATE_FP 0x1 +#define XSTATE_SSE 0x2 +#define XSTATE_YMM 0x4 + if (has_osxsave) + asm (".byte 0x0f; .byte 0x01; .byte 0xd0" + : "=a" (eax), "=d" (edx) + : "c" (XCR_XFEATURE_ENABLED_MASK)); + + /* Check if SSE and YMM states are supported. */ + if (!has_osxsave + || (eax & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) + { + has_avx = 0; + has_fma = 0; + has_fma4 = 0; + has_xop = 0; + } + + /* Check cpuid level of extended features. */ + __cpuid (0x80000000, ext_level, ebx, ecx, edx); + + if (ext_level > 0x80000000) + { + __cpuid (0x80000001, eax, ebx, ecx, edx); + + has_lahf_lm = ecx & bit_LAHF_LM; + has_sse4a = ecx & bit_SSE4a; + has_abm = ecx & bit_ABM; + has_lwp = ecx & bit_LWP; + has_fma4 = ecx & bit_FMA4; + has_xop = ecx & bit_XOP; + has_tbm = ecx & bit_TBM; + + has_longmode = edx & bit_LM; + has_3dnowp = edx & bit_3DNOWP; + has_3dnow = edx & bit_3DNOW; + + __cpuid (0x7, eax, ebx, ecx, edx); + + has_bmi = ebx & bit_BMI; + } + + if (!arch) + { + if (vendor == SIG_AMD) + cache = detect_caches_amd (ext_level); + else if (vendor == SIG_INTEL) + { + bool xeon_mp = (family == 15 && model == 6); + cache = detect_caches_intel (xeon_mp, max_level, + ext_level, &l2sizekb); + } + } + + if (vendor == SIG_AMD) + { + unsigned int name; + + /* Detect geode processor by its processor signature. */ + if (ext_level > 0x80000001) + __cpuid (0x80000002, name, ebx, ecx, edx); + else + name = 0; + + if (name == SIG_GEODE) + processor = PROCESSOR_GEODE; + else if (has_xop) + processor = PROCESSOR_BDVER1; + else if (has_sse4a && has_ssse3) + processor = PROCESSOR_BTVER1; + else if (has_sse4a) + processor = PROCESSOR_AMDFAM10; + else if (has_sse2 || has_longmode) + processor = PROCESSOR_K8; + else if (has_3dnowp && family == 6) + processor = PROCESSOR_ATHLON; + else if (has_mmx) + processor = PROCESSOR_K6; + else + processor = PROCESSOR_PENTIUM; + } + else + { + switch (family) + { + case 4: + processor = PROCESSOR_I486; + break; + case 5: + processor = PROCESSOR_PENTIUM; + break; + case 6: + processor = PROCESSOR_PENTIUMPRO; + break; + case 15: + processor = PROCESSOR_PENTIUM4; + break; + default: + /* We have no idea. */ + processor = PROCESSOR_GENERIC32; + } + } + + switch (processor) + { + case PROCESSOR_I386: + /* Default. */ + break; + case PROCESSOR_I486: + cpu = "i486"; + break; + case PROCESSOR_PENTIUM: + if (arch && has_mmx) + cpu = "pentium-mmx"; + else + cpu = "pentium"; + break; + case PROCESSOR_PENTIUMPRO: + switch (model) + { + case 0x1c: + case 0x26: + /* Atom. */ + cpu = "atom"; + break; + case 0x1a: + case 0x1e: + case 0x1f: + case 0x2e: + /* Nehalem. */ + cpu = "corei7"; + break; + case 0x25: + case 0x2c: + case 0x2f: + /* Westmere. */ + cpu = "corei7"; + break; + case 0x2a: + /* Sandy Bridge. */ + cpu = "corei7-avx"; + break; + case 0x17: + case 0x1d: + /* Penryn. */ + cpu = "core2"; + break; + case 0x0f: + /* Merom. */ + cpu = "core2"; + break; + default: + if (arch) + { + /* This is unknown family 0x6 CPU. */ + if (has_avx) + /* Assume Sandy Bridge. */ + cpu = "corei7-avx"; + else if (has_sse4_2) + /* Assume Core i7. */ + cpu = "corei7"; + else if (has_ssse3) + { + if (has_movbe) + /* Assume Atom. */ + cpu = "atom"; + else + /* Assume Core 2. */ + cpu = "core2"; + } + else if (has_sse3) + /* It is Core Duo. */ + cpu = "pentium-m"; + else if (has_sse2) + /* It is Pentium M. */ + cpu = "pentium-m"; + else if (has_sse) + /* It is Pentium III. */ + cpu = "pentium3"; + else if (has_mmx) + /* It is Pentium II. 
*/ + cpu = "pentium2"; + else + /* Default to Pentium Pro. */ + cpu = "pentiumpro"; + } + else + /* For -mtune, we default to -mtune=generic. */ + cpu = "generic"; + break; + } + break; + case PROCESSOR_PENTIUM4: + if (has_sse3) + { + if (has_longmode) + cpu = "nocona"; + else + cpu = "prescott"; + } + else + cpu = "pentium4"; + break; + case PROCESSOR_GEODE: + cpu = "geode"; + break; + case PROCESSOR_K6: + if (arch && has_3dnow) + cpu = "k6-3"; + else + cpu = "k6"; + break; + case PROCESSOR_ATHLON: + if (arch && has_sse) + cpu = "athlon-4"; + else + cpu = "athlon"; + break; + case PROCESSOR_K8: + if (arch && has_sse3) + cpu = "k8-sse3"; + else + cpu = "k8"; + break; + case PROCESSOR_AMDFAM10: + cpu = "amdfam10"; + break; + case PROCESSOR_BDVER1: + cpu = "bdver1"; + break; + case PROCESSOR_BTVER1: + cpu = "btver1"; + break; + + default: + /* Use something reasonable. */ + if (arch) + { + if (has_ssse3) + cpu = "core2"; + else if (has_sse3) + { + if (has_longmode) + cpu = "nocona"; + else + cpu = "prescott"; + } + else if (has_sse2) + cpu = "pentium4"; + else if (has_cmov) + cpu = "pentiumpro"; + else if (has_mmx) + cpu = "pentium-mmx"; + else if (has_cmpxchg8b) + cpu = "pentium"; + } + else + cpu = "generic"; + } + + if (arch) + { + const char *cx16 = has_cmpxchg16b ? " -mcx16" : " -mno-cx16"; + const char *sahf = has_lahf_lm ? " -msahf" : " -mno-sahf"; + const char *movbe = has_movbe ? " -mmovbe" : " -mno-movbe"; + const char *ase = has_aes ? " -maes" : " -mno-aes"; + const char *pclmul = has_pclmul ? " -mpclmul" : " -mno-pclmul"; + const char *popcnt = has_popcnt ? " -mpopcnt" : " -mno-popcnt"; + const char *abm = has_abm ? " -mabm" : " -mno-abm"; + const char *lwp = has_lwp ? " -mlwp" : " -mno-lwp"; + const char *fma = has_fma ? " -mfma" : " -mno-fma"; + const char *fma4 = has_fma4 ? " -mfma4" : " -mno-fma4"; + const char *xop = has_xop ? " -mxop" : " -mno-xop"; + const char *bmi = has_bmi ? " -mbmi" : " -mno-bmi"; + const char *tbm = has_tbm ? " -mtbm" : " -mno-tbm"; + const char *avx = has_avx ? " -mavx" : " -mno-avx"; + const char *sse4_2 = has_sse4_2 ? " -msse4.2" : " -mno-sse4.2"; + const char *sse4_1 = has_sse4_1 ? " -msse4.1" : " -mno-sse4.1"; + const char *rdrnd = has_rdrnd ? " -mrdrnd" : " -mno-rdrnd"; + const char *f16c = has_f16c ? " -mf16c" : " -mno-f16c"; + const char *fsgsbase = has_fsgsbase ? " -mfsgsbase" : " -mno-fsgsbase"; + + options = concat (options, cx16, sahf, movbe, ase, pclmul, + popcnt, abm, lwp, fma, fma4, xop, bmi, tbm, + avx, sse4_2, sse4_1, rdrnd, f16c, fsgsbase, NULL); + } + +done: + return concat (cache, "-m", argv[0], "=", cpu, options, NULL); +} +#else + +/* If we aren't compiling with GCC then the driver will just ignore + -march and -mtune "native" target and will leave to the newly + built compiler to generate code for its default target. */ + +const char *host_detect_local_cpu (int argc ATTRIBUTE_UNUSED, + const char **argv ATTRIBUTE_UNUSED) +{ + return NULL; +} +#endif /* __GNUC__ */ diff --git a/gcc/config/i386/emmintrin.h b/gcc/config/i386/emmintrin.h new file mode 100644 index 000000000..fe4cd6aba --- /dev/null +++ b/gcc/config/i386/emmintrin.h @@ -0,0 +1,1513 @@ +/* Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 + Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. 
+ + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +/* Implemented from the specification included in the Intel C++ Compiler + User Guide and Reference, version 9.0. */ + +#ifndef _EMMINTRIN_H_INCLUDED +#define _EMMINTRIN_H_INCLUDED + +#ifndef __SSE2__ +# error "SSE2 instruction set not enabled" +#else + +/* We need definitions from the SSE header files */ +#include <xmmintrin.h> + +/* SSE2 */ +typedef double __v2df __attribute__ ((__vector_size__ (16))); +typedef long long __v2di __attribute__ ((__vector_size__ (16))); +typedef int __v4si __attribute__ ((__vector_size__ (16))); +typedef short __v8hi __attribute__ ((__vector_size__ (16))); +typedef char __v16qi __attribute__ ((__vector_size__ (16))); + +/* The Intel API is flexible enough that we must allow aliasing with other + vector types, and their scalar components. */ +typedef long long __m128i __attribute__ ((__vector_size__ (16), __may_alias__)); +typedef double __m128d __attribute__ ((__vector_size__ (16), __may_alias__)); + +/* Create a selector for use with the SHUFPD instruction. */ +#define _MM_SHUFFLE2(fp1,fp0) \ + (((fp1) << 1) | (fp0)) + +/* Create a vector with element 0 as F and the rest zero. */ +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set_sd (double __F) +{ + return __extension__ (__m128d){ __F, 0.0 }; +} + +/* Create a vector with both elements equal to F. */ +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set1_pd (double __F) +{ + return __extension__ (__m128d){ __F, __F }; +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set_pd1 (double __F) +{ + return _mm_set1_pd (__F); +} + +/* Create a vector with the lower value X and upper value W. */ +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set_pd (double __W, double __X) +{ + return __extension__ (__m128d){ __X, __W }; +} + +/* Create a vector with the lower value W and upper value X. */ +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_setr_pd (double __W, double __X) +{ + return __extension__ (__m128d){ __W, __X }; +} + +/* Create a vector of zeros. */ +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_setzero_pd (void) +{ + return __extension__ (__m128d){ 0.0, 0.0 }; +} + +/* Sets the low DPFP value of A from the low value of B. */ +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_move_sd (__m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_movsd ((__v2df)__A, (__v2df)__B); +} + +/* Load two DPFP values from P. The address must be 16-byte aligned. */ +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_load_pd (double const *__P) +{ + return *(__m128d *)__P; +} + +/* Load two DPFP values from P.
The address need not be 16-byte aligned. */ +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_loadu_pd (double const *__P) +{ + return __builtin_ia32_loadupd (__P); +} + +/* Create a vector with all two elements equal to *P. */ +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_load1_pd (double const *__P) +{ + return _mm_set1_pd (*__P); +} + +/* Create a vector with element 0 as *P and the rest zero. */ +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_load_sd (double const *__P) +{ + return _mm_set_sd (*__P); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_load_pd1 (double const *__P) +{ + return _mm_load1_pd (__P); +} + +/* Load two DPFP values in reverse order. The address must be aligned. */ +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_loadr_pd (double const *__P) +{ + __m128d __tmp = _mm_load_pd (__P); + return __builtin_ia32_shufpd (__tmp, __tmp, _MM_SHUFFLE2 (0,1)); +} + +/* Store two DPFP values. The address must be 16-byte aligned. */ +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_store_pd (double *__P, __m128d __A) +{ + *(__m128d *)__P = __A; +} + +/* Store two DPFP values. The address need not be 16-byte aligned. */ +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_storeu_pd (double *__P, __m128d __A) +{ + __builtin_ia32_storeupd (__P, __A); +} + +/* Stores the lower DPFP value. */ +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_store_sd (double *__P, __m128d __A) +{ + *__P = __builtin_ia32_vec_ext_v2df (__A, 0); +} + +extern __inline double __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsd_f64 (__m128d __A) +{ + return __builtin_ia32_vec_ext_v2df (__A, 0); +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_storel_pd (double *__P, __m128d __A) +{ + _mm_store_sd (__P, __A); +} + +/* Stores the upper DPFP value. */ +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_storeh_pd (double *__P, __m128d __A) +{ + *__P = __builtin_ia32_vec_ext_v2df (__A, 1); +} + +/* Store the lower DPFP value across two words. + The address must be 16-byte aligned. */ +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_store1_pd (double *__P, __m128d __A) +{ + _mm_store_pd (__P, __builtin_ia32_shufpd (__A, __A, _MM_SHUFFLE2 (0,0))); +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_store_pd1 (double *__P, __m128d __A) +{ + _mm_store1_pd (__P, __A); +} + +/* Store two DPFP values in reverse order. The address must be aligned. */ +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_storer_pd (double *__P, __m128d __A) +{ + _mm_store_pd (__P, __builtin_ia32_shufpd (__A, __A, _MM_SHUFFLE2 (0,1))); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsi128_si32 (__m128i __A) +{ + return __builtin_ia32_vec_ext_v4si ((__v4si)__A, 0); +} + +#ifdef __x86_64__ +/* Intel intrinsic. 
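(The _mm_cvtsi128_si64x variant that follows is the Microsoft spelling of the same operation; both simply extract element 0 of the vector through the same builtin.)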
*/ +extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsi128_si64 (__m128i __A) +{ + return __builtin_ia32_vec_ext_v2di ((__v2di)__A, 0); +} + +/* Microsoft intrinsic. */ +extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsi128_si64x (__m128i __A) +{ + return __builtin_ia32_vec_ext_v2di ((__v2di)__A, 0); +} +#endif + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_add_pd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_addpd ((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_add_sd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_addsd ((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sub_pd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_subpd ((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sub_sd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_subsd ((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mul_pd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_mulpd ((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mul_sd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_mulsd ((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_div_pd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_divpd ((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_div_sd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_divsd ((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sqrt_pd (__m128d __A) +{ + return (__m128d)__builtin_ia32_sqrtpd ((__v2df)__A); +} + +/* Return pair {sqrt (A[0), B[1]}. 
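More precisely, the result is { sqrt (B[0]), A[1] }: movsd takes its low element from B and its high element from A, and sqrtsd then replaces that low element with its square root.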
*/ +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sqrt_sd (__m128d __A, __m128d __B) +{ + __v2df __tmp = __builtin_ia32_movsd ((__v2df)__A, (__v2df)__B); + return (__m128d)__builtin_ia32_sqrtsd ((__v2df)__tmp); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_min_pd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_minpd ((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_min_sd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_minsd ((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_max_pd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_maxpd ((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_max_sd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_maxsd ((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_and_pd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_andpd ((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_andnot_pd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_andnpd ((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_or_pd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_orpd ((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_xor_pd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_xorpd ((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpeq_pd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_cmpeqpd ((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmplt_pd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_cmpltpd ((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmple_pd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_cmplepd ((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpgt_pd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_cmpgtpd ((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpge_pd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_cmpgepd ((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpneq_pd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_cmpneqpd ((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpnlt_pd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_cmpnltpd ((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpnle_pd (__m128d __A, __m128d __B) +{ + return 
(__m128d)__builtin_ia32_cmpnlepd ((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpngt_pd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_cmpngtpd ((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpnge_pd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_cmpngepd ((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpord_pd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_cmpordpd ((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpunord_pd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_cmpunordpd ((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpeq_sd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_cmpeqsd ((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmplt_sd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_cmpltsd ((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmple_sd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_cmplesd ((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpgt_sd (__m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_movsd ((__v2df) __A, + (__v2df) + __builtin_ia32_cmpltsd ((__v2df) __B, + (__v2df) + __A)); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpge_sd (__m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_movsd ((__v2df) __A, + (__v2df) + __builtin_ia32_cmplesd ((__v2df) __B, + (__v2df) + __A)); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpneq_sd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_cmpneqsd ((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpnlt_sd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_cmpnltsd ((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpnle_sd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_cmpnlesd ((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpngt_sd (__m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_movsd ((__v2df) __A, + (__v2df) + __builtin_ia32_cmpnltsd ((__v2df) __B, + (__v2df) + __A)); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpnge_sd (__m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_movsd ((__v2df) __A, + (__v2df) + __builtin_ia32_cmpnlesd ((__v2df) __B, + (__v2df) + __A)); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpord_sd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_cmpordsd ((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, 
__artificial__)) +_mm_cmpunord_sd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_cmpunordsd ((__v2df)__A, (__v2df)__B); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comieq_sd (__m128d __A, __m128d __B) +{ + return __builtin_ia32_comisdeq ((__v2df)__A, (__v2df)__B); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comilt_sd (__m128d __A, __m128d __B) +{ + return __builtin_ia32_comisdlt ((__v2df)__A, (__v2df)__B); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comile_sd (__m128d __A, __m128d __B) +{ + return __builtin_ia32_comisdle ((__v2df)__A, (__v2df)__B); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comigt_sd (__m128d __A, __m128d __B) +{ + return __builtin_ia32_comisdgt ((__v2df)__A, (__v2df)__B); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comige_sd (__m128d __A, __m128d __B) +{ + return __builtin_ia32_comisdge ((__v2df)__A, (__v2df)__B); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comineq_sd (__m128d __A, __m128d __B) +{ + return __builtin_ia32_comisdneq ((__v2df)__A, (__v2df)__B); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_ucomieq_sd (__m128d __A, __m128d __B) +{ + return __builtin_ia32_ucomisdeq ((__v2df)__A, (__v2df)__B); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_ucomilt_sd (__m128d __A, __m128d __B) +{ + return __builtin_ia32_ucomisdlt ((__v2df)__A, (__v2df)__B); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_ucomile_sd (__m128d __A, __m128d __B) +{ + return __builtin_ia32_ucomisdle ((__v2df)__A, (__v2df)__B); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_ucomigt_sd (__m128d __A, __m128d __B) +{ + return __builtin_ia32_ucomisdgt ((__v2df)__A, (__v2df)__B); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_ucomige_sd (__m128d __A, __m128d __B) +{ + return __builtin_ia32_ucomisdge ((__v2df)__A, (__v2df)__B); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_ucomineq_sd (__m128d __A, __m128d __B) +{ + return __builtin_ia32_ucomisdneq ((__v2df)__A, (__v2df)__B); +} + +/* Create a vector of Qi, where i is the element number. 
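For example, _mm_set_epi32 (3, 2, 1, 0) yields a vector whose element 0 is 0 and whose element 3 is 3; the first argument always supplies the highest-numbered element.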
*/ + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set_epi64x (long long __q1, long long __q0) +{ + return __extension__ (__m128i)(__v2di){ __q0, __q1 }; +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set_epi64 (__m64 __q1, __m64 __q0) +{ + return _mm_set_epi64x ((long long)__q1, (long long)__q0); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set_epi32 (int __q3, int __q2, int __q1, int __q0) +{ + return __extension__ (__m128i)(__v4si){ __q0, __q1, __q2, __q3 }; +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set_epi16 (short __q7, short __q6, short __q5, short __q4, + short __q3, short __q2, short __q1, short __q0) +{ + return __extension__ (__m128i)(__v8hi){ + __q0, __q1, __q2, __q3, __q4, __q5, __q6, __q7 }; +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set_epi8 (char __q15, char __q14, char __q13, char __q12, + char __q11, char __q10, char __q09, char __q08, + char __q07, char __q06, char __q05, char __q04, + char __q03, char __q02, char __q01, char __q00) +{ + return __extension__ (__m128i)(__v16qi){ + __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07, + __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15 + }; +} + +/* Set all of the elements of the vector to A. */ + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set1_epi64x (long long __A) +{ + return _mm_set_epi64x (__A, __A); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set1_epi64 (__m64 __A) +{ + return _mm_set_epi64 (__A, __A); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set1_epi32 (int __A) +{ + return _mm_set_epi32 (__A, __A, __A, __A); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set1_epi16 (short __A) +{ + return _mm_set_epi16 (__A, __A, __A, __A, __A, __A, __A, __A); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set1_epi8 (char __A) +{ + return _mm_set_epi8 (__A, __A, __A, __A, __A, __A, __A, __A, + __A, __A, __A, __A, __A, __A, __A, __A); +} + +/* Create a vector of Qi, where i is the element number. + The parameter order is reversed from the _mm_set_epi* functions. 
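For example, _mm_setr_epi32 (0, 1, 2, 3) is equivalent to _mm_set_epi32 (3, 2, 1, 0); here the first argument supplies element 0.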
*/ + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_setr_epi64 (__m64 __q0, __m64 __q1) +{ + return _mm_set_epi64 (__q1, __q0); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_setr_epi32 (int __q0, int __q1, int __q2, int __q3) +{ + return _mm_set_epi32 (__q3, __q2, __q1, __q0); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_setr_epi16 (short __q0, short __q1, short __q2, short __q3, + short __q4, short __q5, short __q6, short __q7) +{ + return _mm_set_epi16 (__q7, __q6, __q5, __q4, __q3, __q2, __q1, __q0); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_setr_epi8 (char __q00, char __q01, char __q02, char __q03, + char __q04, char __q05, char __q06, char __q07, + char __q08, char __q09, char __q10, char __q11, + char __q12, char __q13, char __q14, char __q15) +{ + return _mm_set_epi8 (__q15, __q14, __q13, __q12, __q11, __q10, __q09, __q08, + __q07, __q06, __q05, __q04, __q03, __q02, __q01, __q00); +} + +/* Create a vector with element 0 as *P and the rest zero. */ + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_load_si128 (__m128i const *__P) +{ + return *__P; +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_loadu_si128 (__m128i const *__P) +{ + return (__m128i) __builtin_ia32_loaddqu ((char const *)__P); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_loadl_epi64 (__m128i const *__P) +{ + return _mm_set_epi64 ((__m64)0LL, *(__m64 *)__P); +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_store_si128 (__m128i *__P, __m128i __B) +{ + *__P = __B; +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_storeu_si128 (__m128i *__P, __m128i __B) +{ + __builtin_ia32_storedqu ((char *)__P, (__v16qi)__B); +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_storel_epi64 (__m128i *__P, __m128i __B) +{ + *(long long *)__P = __builtin_ia32_vec_ext_v2di ((__v2di)__B, 0); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_movepi64_pi64 (__m128i __B) +{ + return (__m64) __builtin_ia32_vec_ext_v2di ((__v2di)__B, 0); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_movpi64_epi64 (__m64 __A) +{ + return _mm_set_epi64 ((__m64)0LL, __A); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_move_epi64 (__m128i __A) +{ + return (__m128i)__builtin_ia32_movq128 ((__v2di) __A); +} + +/* Create a vector of zeros. 
*/ +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_setzero_si128 (void) +{ + return __extension__ (__m128i)(__v4si){ 0, 0, 0, 0 }; +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepi32_pd (__m128i __A) +{ + return (__m128d)__builtin_ia32_cvtdq2pd ((__v4si) __A); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepi32_ps (__m128i __A) +{ + return (__m128)__builtin_ia32_cvtdq2ps ((__v4si) __A); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtpd_epi32 (__m128d __A) +{ + return (__m128i)__builtin_ia32_cvtpd2dq ((__v2df) __A); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtpd_pi32 (__m128d __A) +{ + return (__m64)__builtin_ia32_cvtpd2pi ((__v2df) __A); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtpd_ps (__m128d __A) +{ + return (__m128)__builtin_ia32_cvtpd2ps ((__v2df) __A); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttpd_epi32 (__m128d __A) +{ + return (__m128i)__builtin_ia32_cvttpd2dq ((__v2df) __A); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttpd_pi32 (__m128d __A) +{ + return (__m64)__builtin_ia32_cvttpd2pi ((__v2df) __A); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtpi32_pd (__m64 __A) +{ + return (__m128d)__builtin_ia32_cvtpi2pd ((__v2si) __A); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtps_epi32 (__m128 __A) +{ + return (__m128i)__builtin_ia32_cvtps2dq ((__v4sf) __A); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttps_epi32 (__m128 __A) +{ + return (__m128i)__builtin_ia32_cvttps2dq ((__v4sf) __A); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtps_pd (__m128 __A) +{ + return (__m128d)__builtin_ia32_cvtps2pd ((__v4sf) __A); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsd_si32 (__m128d __A) +{ + return __builtin_ia32_cvtsd2si ((__v2df) __A); +} + +#ifdef __x86_64__ +/* Intel intrinsic. */ +extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsd_si64 (__m128d __A) +{ + return __builtin_ia32_cvtsd2si64 ((__v2df) __A); +} + +/* Microsoft intrinsic. */ +extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsd_si64x (__m128d __A) +{ + return __builtin_ia32_cvtsd2si64 ((__v2df) __A); +} +#endif + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttsd_si32 (__m128d __A) +{ + return __builtin_ia32_cvttsd2si ((__v2df) __A); +} + +#ifdef __x86_64__ +/* Intel intrinsic. */ +extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttsd_si64 (__m128d __A) +{ + return __builtin_ia32_cvttsd2si64 ((__v2df) __A); +} + +/* Microsoft intrinsic. 
*/ +extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttsd_si64x (__m128d __A) +{ + return __builtin_ia32_cvttsd2si64 ((__v2df) __A); +} +#endif + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsd_ss (__m128 __A, __m128d __B) +{ + return (__m128)__builtin_ia32_cvtsd2ss ((__v4sf) __A, (__v2df) __B); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsi32_sd (__m128d __A, int __B) +{ + return (__m128d)__builtin_ia32_cvtsi2sd ((__v2df) __A, __B); +} + +#ifdef __x86_64__ +/* Intel intrinsic. */ +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsi64_sd (__m128d __A, long long __B) +{ + return (__m128d)__builtin_ia32_cvtsi642sd ((__v2df) __A, __B); +} + +/* Microsoft intrinsic. */ +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsi64x_sd (__m128d __A, long long __B) +{ + return (__m128d)__builtin_ia32_cvtsi642sd ((__v2df) __A, __B); +} +#endif + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtss_sd (__m128d __A, __m128 __B) +{ + return (__m128d)__builtin_ia32_cvtss2sd ((__v2df) __A, (__v4sf)__B); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_shuffle_pd(__m128d __A, __m128d __B, const int __mask) +{ + return (__m128d)__builtin_ia32_shufpd ((__v2df)__A, (__v2df)__B, __mask); +} +#else +#define _mm_shuffle_pd(A, B, N) \ + ((__m128d)__builtin_ia32_shufpd ((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), (int)(N))) +#endif + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_unpackhi_pd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_unpckhpd ((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_unpacklo_pd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_unpcklpd ((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_loadh_pd (__m128d __A, double const *__B) +{ + return (__m128d)__builtin_ia32_loadhpd ((__v2df)__A, __B); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_loadl_pd (__m128d __A, double const *__B) +{ + return (__m128d)__builtin_ia32_loadlpd ((__v2df)__A, __B); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_movemask_pd (__m128d __A) +{ + return __builtin_ia32_movmskpd ((__v2df)__A); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_packs_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_packsswb128 ((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_packs_epi32 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_packssdw128 ((__v4si)__A, (__v4si)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_packus_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_packuswb128 ((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_unpackhi_epi8 (__m128i __A, __m128i __B) +{ + return 
(__m128i)__builtin_ia32_punpckhbw128 ((__v16qi)__A, (__v16qi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_unpackhi_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_punpckhwd128 ((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_unpackhi_epi32 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_punpckhdq128 ((__v4si)__A, (__v4si)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_unpackhi_epi64 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_punpckhqdq128 ((__v2di)__A, (__v2di)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_unpacklo_epi8 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_punpcklbw128 ((__v16qi)__A, (__v16qi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_unpacklo_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_punpcklwd128 ((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_unpacklo_epi32 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_punpckldq128 ((__v4si)__A, (__v4si)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_unpacklo_epi64 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_punpcklqdq128 ((__v2di)__A, (__v2di)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_add_epi8 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_paddb128 ((__v16qi)__A, (__v16qi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_add_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_paddw128 ((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_add_epi32 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_paddd128 ((__v4si)__A, (__v4si)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_add_epi64 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_paddq128 ((__v2di)__A, (__v2di)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_adds_epi8 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_paddsb128 ((__v16qi)__A, (__v16qi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_adds_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_paddsw128 ((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_adds_epu8 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_paddusb128 ((__v16qi)__A, (__v16qi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_adds_epu16 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_paddusw128 ((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sub_epi8 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_psubb128 ((__v16qi)__A, (__v16qi)__B); +} + +extern 
__inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sub_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_psubw128 ((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sub_epi32 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_psubd128 ((__v4si)__A, (__v4si)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sub_epi64 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_psubq128 ((__v2di)__A, (__v2di)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_subs_epi8 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_psubsb128 ((__v16qi)__A, (__v16qi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_subs_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_psubsw128 ((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_subs_epu8 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_psubusb128 ((__v16qi)__A, (__v16qi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_subs_epu16 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_psubusw128 ((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_madd_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_pmaddwd128 ((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mulhi_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_pmulhw128 ((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mullo_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_pmullw128 ((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mul_su32 (__m64 __A, __m64 __B) +{ + return (__m64)__builtin_ia32_pmuludq ((__v2si)__A, (__v2si)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mul_epu32 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_pmuludq128 ((__v4si)__A, (__v4si)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_slli_epi16 (__m128i __A, int __B) +{ + return (__m128i)__builtin_ia32_psllwi128 ((__v8hi)__A, __B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_slli_epi32 (__m128i __A, int __B) +{ + return (__m128i)__builtin_ia32_pslldi128 ((__v4si)__A, __B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_slli_epi64 (__m128i __A, int __B) +{ + return (__m128i)__builtin_ia32_psllqi128 ((__v2di)__A, __B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_srai_epi16 (__m128i __A, int __B) +{ + return (__m128i)__builtin_ia32_psrawi128 ((__v8hi)__A, __B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_srai_epi32 (__m128i __A, int __B) +{ + return (__m128i)__builtin_ia32_psradi128 ((__v4si)__A, __B); 
+} + +#ifdef __OPTIMIZE__ +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_srli_si128 (__m128i __A, const int __N) +{ + return (__m128i)__builtin_ia32_psrldqi128 (__A, __N * 8); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_slli_si128 (__m128i __A, const int __N) +{ + return (__m128i)__builtin_ia32_pslldqi128 (__A, __N * 8); +} +#else +#define _mm_srli_si128(A, N) \ + ((__m128i)__builtin_ia32_psrldqi128 ((__m128i)(A), (int)(N) * 8)) +#define _mm_slli_si128(A, N) \ + ((__m128i)__builtin_ia32_pslldqi128 ((__m128i)(A), (int)(N) * 8)) +#endif + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_srli_epi16 (__m128i __A, int __B) +{ + return (__m128i)__builtin_ia32_psrlwi128 ((__v8hi)__A, __B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_srli_epi32 (__m128i __A, int __B) +{ + return (__m128i)__builtin_ia32_psrldi128 ((__v4si)__A, __B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_srli_epi64 (__m128i __A, int __B) +{ + return (__m128i)__builtin_ia32_psrlqi128 ((__v2di)__A, __B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sll_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_psllw128((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sll_epi32 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_pslld128((__v4si)__A, (__v4si)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sll_epi64 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_psllq128((__v2di)__A, (__v2di)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sra_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_psraw128 ((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sra_epi32 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_psrad128 ((__v4si)__A, (__v4si)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_srl_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_psrlw128 ((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_srl_epi32 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_psrld128 ((__v4si)__A, (__v4si)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_srl_epi64 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_psrlq128 ((__v2di)__A, (__v2di)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_and_si128 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_pand128 ((__v2di)__A, (__v2di)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_andnot_si128 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_pandn128 ((__v2di)__A, (__v2di)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_or_si128 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_por128 
((__v2di)__A, (__v2di)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_xor_si128 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_pxor128 ((__v2di)__A, (__v2di)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpeq_epi8 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_pcmpeqb128 ((__v16qi)__A, (__v16qi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpeq_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_pcmpeqw128 ((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpeq_epi32 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_pcmpeqd128 ((__v4si)__A, (__v4si)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmplt_epi8 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_pcmpgtb128 ((__v16qi)__B, (__v16qi)__A); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmplt_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_pcmpgtw128 ((__v8hi)__B, (__v8hi)__A); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmplt_epi32 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_pcmpgtd128 ((__v4si)__B, (__v4si)__A); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpgt_epi8 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_pcmpgtb128 ((__v16qi)__A, (__v16qi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpgt_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_pcmpgtw128 ((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpgt_epi32 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_pcmpgtd128 ((__v4si)__A, (__v4si)__B); +} + +#ifdef __OPTIMIZE__ +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_extract_epi16 (__m128i const __A, int const __N) +{ + return (unsigned short) __builtin_ia32_vec_ext_v8hi ((__v8hi)__A, __N); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_insert_epi16 (__m128i const __A, int const __D, int const __N) +{ + return (__m128i) __builtin_ia32_vec_set_v8hi ((__v8hi)__A, __D, __N); +} +#else +#define _mm_extract_epi16(A, N) \ + ((int) (unsigned short) __builtin_ia32_vec_ext_v8hi ((__v8hi)(__m128i)(A), (int)(N))) +#define _mm_insert_epi16(A, D, N) \ + ((__m128i) __builtin_ia32_vec_set_v8hi ((__v8hi)(__m128i)(A), \ + (int)(D), (int)(N))) +#endif + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_max_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_pmaxsw128 ((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_max_epu8 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_pmaxub128 ((__v16qi)__A, (__v16qi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_min_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_pminsw128 
((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_min_epu8 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_pminub128 ((__v16qi)__A, (__v16qi)__B); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_movemask_epi8 (__m128i __A) +{ + return __builtin_ia32_pmovmskb128 ((__v16qi)__A); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mulhi_epu16 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_pmulhuw128 ((__v8hi)__A, (__v8hi)__B); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_shufflehi_epi16 (__m128i __A, const int __mask) +{ + return (__m128i)__builtin_ia32_pshufhw ((__v8hi)__A, __mask); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_shufflelo_epi16 (__m128i __A, const int __mask) +{ + return (__m128i)__builtin_ia32_pshuflw ((__v8hi)__A, __mask); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_shuffle_epi32 (__m128i __A, const int __mask) +{ + return (__m128i)__builtin_ia32_pshufd ((__v4si)__A, __mask); +} +#else +#define _mm_shufflehi_epi16(A, N) \ + ((__m128i)__builtin_ia32_pshufhw ((__v8hi)(__m128i)(A), (int)(N))) +#define _mm_shufflelo_epi16(A, N) \ + ((__m128i)__builtin_ia32_pshuflw ((__v8hi)(__m128i)(A), (int)(N))) +#define _mm_shuffle_epi32(A, N) \ + ((__m128i)__builtin_ia32_pshufd ((__v4si)(__m128i)(A), (int)(N))) +#endif + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskmoveu_si128 (__m128i __A, __m128i __B, char *__C) +{ + __builtin_ia32_maskmovdqu ((__v16qi)__A, (__v16qi)__B, __C); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_avg_epu8 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_pavgb128 ((__v16qi)__A, (__v16qi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_avg_epu16 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_pavgw128 ((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sad_epu8 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_psadbw128 ((__v16qi)__A, (__v16qi)__B); +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_stream_si32 (int *__A, int __B) +{ + __builtin_ia32_movnti (__A, __B); +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_stream_si128 (__m128i *__A, __m128i __B) +{ + __builtin_ia32_movntdq ((__v2di *)__A, (__v2di)__B); +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_stream_pd (double *__A, __m128d __B) +{ + __builtin_ia32_movntpd (__A, (__v2df)__B); +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_clflush (void const *__A) +{ + __builtin_ia32_clflush (__A); +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_lfence (void) +{ + __builtin_ia32_lfence (); +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mfence (void) +{ + __builtin_ia32_mfence (); +} + +extern __inline __m128i 
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsi32_si128 (int __A) +{ + return _mm_set_epi32 (0, 0, 0, __A); +} + +#ifdef __x86_64__ +/* Intel intrinsic. */ +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsi64_si128 (long long __A) +{ + return _mm_set_epi64x (0, __A); +} + +/* Microsoft intrinsic. */ +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsi64x_si128 (long long __A) +{ + return _mm_set_epi64x (0, __A); +} +#endif + +/* Casts between various SP, DP, INT vector types. Note that these do no + conversion of values, they just change the type. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_castpd_ps(__m128d __A) +{ + return (__m128) __A; +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_castpd_si128(__m128d __A) +{ + return (__m128i) __A; +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_castps_pd(__m128 __A) +{ + return (__m128d) __A; +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_castps_si128(__m128 __A) +{ + return (__m128i) __A; +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_castsi128_ps(__m128i __A) +{ + return (__m128) __A; +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_castsi128_pd(__m128i __A) +{ + return (__m128d) __A; +} + +#endif /* __SSE2__ */ + +#endif /* _EMMINTRIN_H_INCLUDED */ diff --git a/gcc/config/i386/fma4intrin.h b/gcc/config/i386/fma4intrin.h new file mode 100644 index 000000000..ae30bfec9 --- /dev/null +++ b/gcc/config/i386/fma4intrin.h @@ -0,0 +1,236 @@ +/* Copyright (C) 2007, 2008, 2009, 2010 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +#ifndef _X86INTRIN_H_INCLUDED +# error "Never use directly; include instead." +#endif + +#ifndef _FMA4INTRIN_H_INCLUDED +#define _FMA4INTRIN_H_INCLUDED + +#ifndef __FMA4__ +# error "FMA4 instruction set not enabled" +#else + +/* We need definitions from the SSE4A, SSE3, SSE2 and SSE header files. */ +#include + +/* 128b Floating point multiply/add type instructions. 
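As a quick illustration of how the 128-bit multiply/add intrinsics declared just below compose, here is a minimal sketch. It assumes a GCC build with FMA4 enabled (roughly: gcc -mfma4 fma4_demo.c) and reaches this header through x86intrin.h; the file name is illustrative only.

/* Minimal sketch: per-lane a*b+c and a*b-c via the FMA4 intrinsics
   declared below.  */
#include <x86intrin.h>
#include <stdio.h>

int
main (void)
{
  __m128 a = _mm_set_ps (4.0f, 3.0f, 2.0f, 1.0f);
  __m128 b = _mm_set_ps (8.0f, 7.0f, 6.0f, 5.0f);
  __m128 c = _mm_set1_ps (0.5f);

  __m128 macc = _mm_macc_ps (a, b, c);   /* a*b + c in each lane */
  __m128 msub = _mm_msub_ps (a, b, c);   /* a*b - c in each lane */

  float out[4];
  _mm_storeu_ps (out, macc);
  printf ("macc: %g %g %g %g\n", out[0], out[1], out[2], out[3]);
  _mm_storeu_ps (out, msub);
  printf ("msub: %g %g %g %g\n", out[0], out[1], out[2], out[3]);
  return 0;
}

The same pattern carries over to the _mm256_* forms further down, which simply widen the operands to 256-bit AVX vectors.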
*/ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_macc_ps (__m128 __A, __m128 __B, __m128 __C) +{ + return (__m128) __builtin_ia32_vfmaddps ((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_macc_pd (__m128d __A, __m128d __B, __m128d __C) +{ + return (__m128d) __builtin_ia32_vfmaddpd ((__v2df)__A, (__v2df)__B, (__v2df)__C); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_macc_ss (__m128 __A, __m128 __B, __m128 __C) +{ + return (__m128) __builtin_ia32_vfmaddss ((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_macc_sd (__m128d __A, __m128d __B, __m128d __C) +{ + return (__m128d) __builtin_ia32_vfmaddsd ((__v2df)__A, (__v2df)__B, (__v2df)__C); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_msub_ps (__m128 __A, __m128 __B, __m128 __C) + +{ + return (__m128) __builtin_ia32_vfmaddps ((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_msub_pd (__m128d __A, __m128d __B, __m128d __C) +{ + return (__m128d) __builtin_ia32_vfmaddpd ((__v2df)__A, (__v2df)__B, -(__v2df)__C); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_msub_ss (__m128 __A, __m128 __B, __m128 __C) +{ + return (__m128) __builtin_ia32_vfmaddss ((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_msub_sd (__m128d __A, __m128d __B, __m128d __C) +{ + return (__m128d) __builtin_ia32_vfmaddsd ((__v2df)__A, (__v2df)__B, -(__v2df)__C); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_nmacc_ps (__m128 __A, __m128 __B, __m128 __C) +{ + return (__m128) __builtin_ia32_vfmaddps (-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_nmacc_pd (__m128d __A, __m128d __B, __m128d __C) +{ + return (__m128d) __builtin_ia32_vfmaddpd (-(__v2df)__A, (__v2df)__B, (__v2df)__C); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_nmacc_ss (__m128 __A, __m128 __B, __m128 __C) +{ + return (__m128) __builtin_ia32_vfmaddss (-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_nmacc_sd (__m128d __A, __m128d __B, __m128d __C) +{ + return (__m128d) __builtin_ia32_vfmaddsd (-(__v2df)__A, (__v2df)__B, (__v2df)__C); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_nmsub_ps (__m128 __A, __m128 __B, __m128 __C) +{ + return (__m128) __builtin_ia32_vfmaddps (-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_nmsub_pd (__m128d __A, __m128d __B, __m128d __C) +{ + return (__m128d) __builtin_ia32_vfmaddpd (-(__v2df)__A, (__v2df)__B, -(__v2df)__C); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_nmsub_ss (__m128 __A, __m128 __B, __m128 __C) +{ + return (__m128) __builtin_ia32_vfmaddss (-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C); +} + +extern 
__inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_nmsub_sd (__m128d __A, __m128d __B, __m128d __C) +{ + return (__m128d) __builtin_ia32_vfmaddsd (-(__v2df)__A, (__v2df)__B, -(__v2df)__C); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maddsub_ps (__m128 __A, __m128 __B, __m128 __C) +{ + return (__m128) __builtin_ia32_vfmaddsubps ((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maddsub_pd (__m128d __A, __m128d __B, __m128d __C) +{ + return (__m128d) __builtin_ia32_vfmaddsubpd ((__v2df)__A, (__v2df)__B, (__v2df)__C); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_msubadd_ps (__m128 __A, __m128 __B, __m128 __C) +{ + return (__m128) __builtin_ia32_vfmaddsubps ((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_msubadd_pd (__m128d __A, __m128d __B, __m128d __C) +{ + return (__m128d) __builtin_ia32_vfmaddsubpd ((__v2df)__A, (__v2df)__B, -(__v2df)__C); +} + +/* 256b Floating point multiply/add type instructions. */ +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_macc_ps (__m256 __A, __m256 __B, __m256 __C) +{ + return (__m256) __builtin_ia32_vfmaddps256 ((__v8sf)__A, (__v8sf)__B, (__v8sf)__C); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_macc_pd (__m256d __A, __m256d __B, __m256d __C) +{ + return (__m256d) __builtin_ia32_vfmaddpd256 ((__v4df)__A, (__v4df)__B, (__v4df)__C); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_msub_ps (__m256 __A, __m256 __B, __m256 __C) + +{ + return (__m256) __builtin_ia32_vfmaddps256 ((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_msub_pd (__m256d __A, __m256d __B, __m256d __C) +{ + return (__m256d) __builtin_ia32_vfmaddpd256 ((__v4df)__A, (__v4df)__B, -(__v4df)__C); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_nmacc_ps (__m256 __A, __m256 __B, __m256 __C) +{ + return (__m256) __builtin_ia32_vfmaddps256 (-(__v8sf)__A, (__v8sf)__B, (__v8sf)__C); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_nmacc_pd (__m256d __A, __m256d __B, __m256d __C) +{ + return (__m256d) __builtin_ia32_vfmaddpd256 (-(__v4df)__A, (__v4df)__B, (__v4df)__C); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_nmsub_ps (__m256 __A, __m256 __B, __m256 __C) +{ + return (__m256) __builtin_ia32_vfmaddps256 (-(__v8sf)__A, (__v8sf)__B, -(__v8sf)__C); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_nmsub_pd (__m256d __A, __m256d __B, __m256d __C) +{ + return (__m256d) __builtin_ia32_vfmaddpd256 (-(__v4df)__A, (__v4df)__B, -(__v4df)__C); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maddsub_ps (__m256 __A, __m256 __B, __m256 __C) +{ + return (__m256) __builtin_ia32_vfmaddsubps256 ((__v8sf)__A, (__v8sf)__B, (__v8sf)__C); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maddsub_pd 
(__m256d __A, __m256d __B, __m256d __C) +{ + return (__m256d) __builtin_ia32_vfmaddsubpd256 ((__v4df)__A, (__v4df)__B, (__v4df)__C); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_msubadd_ps (__m256 __A, __m256 __B, __m256 __C) +{ + return (__m256) __builtin_ia32_vfmaddsubps256 ((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_msubadd_pd (__m256d __A, __m256d __B, __m256d __C) +{ + return (__m256d) __builtin_ia32_vfmaddsubpd256 ((__v4df)__A, (__v4df)__B, -(__v4df)__C); +} + +#endif + +#endif diff --git a/gcc/config/i386/freebsd.h b/gcc/config/i386/freebsd.h new file mode 100644 index 000000000..6d2c559e7 --- /dev/null +++ b/gcc/config/i386/freebsd.h @@ -0,0 +1,152 @@ +/* Definitions for Intel 386 running FreeBSD with ELF format + Copyright (C) 1996, 2000, 2002, 2004, 2007, 2010 + Free Software Foundation, Inc. + Contributed by Eric Youngdale. + Modified for stabs-in-ELF by H.J. Lu. + Adapted from GNU/Linux version by John Polstra. + Continued development by David O'Brien + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. */ + + +#define TARGET_VERSION fprintf (stderr, " (i386 FreeBSD/ELF)"); + +/* Override the default comment-starter of "/". */ +#undef ASM_COMMENT_START +#define ASM_COMMENT_START "#" + +#undef ASM_APP_ON +#define ASM_APP_ON "#APP\n" + +#undef ASM_APP_OFF +#define ASM_APP_OFF "#NO_APP\n" + +#undef DBX_REGISTER_NUMBER +#define DBX_REGISTER_NUMBER(n) \ + (TARGET_64BIT ? dbx64_register_map[n] : svr4_dbx_register_map[n]) + +#undef NO_PROFILE_COUNTERS +#define NO_PROFILE_COUNTERS 1 + +/* Tell final.c that we don't need a label passed to mcount. */ + +#undef MCOUNT_NAME +#define MCOUNT_NAME ".mcount" + +/* Make gcc agree with . */ + +#undef SIZE_TYPE +#define SIZE_TYPE (TARGET_64BIT ? "long unsigned int" : "unsigned int") + +#undef PTRDIFF_TYPE +#define PTRDIFF_TYPE (TARGET_64BIT ? "long int" : "int") + +#undef WCHAR_TYPE_SIZE +#define WCHAR_TYPE_SIZE (TARGET_64BIT ? 32 : BITS_PER_WORD) + +#undef SUBTARGET_EXTRA_SPECS /* i386.h bogusly defines it. */ +#define SUBTARGET_EXTRA_SPECS \ + { "fbsd_dynamic_linker", FBSD_DYNAMIC_LINKER } + +/* Provide a STARTFILE_SPEC appropriate for FreeBSD. Here we add + the magical crtbegin.o file (see crtstuff.c) which provides part + of the support for getting C++ file-scope static object constructed + before entering `main'. */ + +#undef STARTFILE_SPEC +#define STARTFILE_SPEC \ + "%{!shared: \ + %{pg:gcrt1.o%s} %{!pg:%{p:gcrt1.o%s} \ + %{!p:%{profile:gcrt1.o%s} \ + %{!profile:crt1.o%s}}}} \ + crti.o%s %{!shared:crtbegin.o%s} %{shared:crtbeginS.o%s}" + +/* Provide a ENDFILE_SPEC appropriate for FreeBSD. Here we tack on + the magical crtend.o file (see crtstuff.c) which provides part of + the support for getting C++ file-scope static object constructed + before entering `main', followed by a normal "finalizer" file, + `crtn.o'. 
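The crtbegin.o/crtend.o objects named in the STARTFILE_SPEC above and the ENDFILE_SPEC just below bracket the constructor/destructor tables that let code run before and after main. A small C sketch of the effect they enable, using GCC's constructor/destructor attributes (the C analogue of the C++ file-scope static objects the comment mentions; the hook names are made up):

/* init_hook runs before main and fini_hook after it returns; the
   table walking that makes this happen is what crtbegin.o/crtend.o
   contribute to the link line.  */
#include <stdio.h>

static void __attribute__((constructor))
init_hook (void)
{
  puts ("before main");
}

static void __attribute__((destructor))
fini_hook (void)
{
  puts ("after main");
}

int
main (void)
{
  puts ("main");
  return 0;
}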
*/ + +#undef ENDFILE_SPEC +#define ENDFILE_SPEC \ + "%{!shared:crtend.o%s} %{shared:crtendS.o%s} crtn.o%s" + +/* Provide a LINK_SPEC appropriate for FreeBSD. Here we provide support + for the special GCC options -static and -shared, which allow us to + link things in one of these three modes by applying the appropriate + combinations of options at link-time. + + When the -shared link option is used a final link is not being + done. */ + +#undef LINK_SPEC +#define LINK_SPEC "\ + %{p:%nconsider using '-pg' instead of '-p' with gprof(1)} \ + %{v:-V} \ + %{assert*} %{R*} %{rpath*} %{defsym*} \ + %{shared:-Bshareable %{h*} %{soname*}} \ + %{!shared: \ + %{!static: \ + %{rdynamic:-export-dynamic} \ + -dynamic-linker %(fbsd_dynamic_linker) } \ + %{static:-Bstatic}} \ + %{symbolic:-Bsymbolic}" + +/* A C statement to output to the stdio stream FILE an assembler + command to advance the location counter to a multiple of 1< and std::numeric_limits correct. */ +#undef TARGET_96_ROUND_53_LONG_DOUBLE +#define TARGET_96_ROUND_53_LONG_DOUBLE (!TARGET_64BIT) + +/* Put all *tf routines in libgcc. */ +#undef LIBGCC2_HAS_TF_MODE +#define LIBGCC2_HAS_TF_MODE 1 +#define LIBGCC2_TF_CEXT q +#define TF_SIZE 113 + +/* Static stack checking is supported by means of probes. */ +#define STACK_CHECK_STATIC_BUILTIN 1 + +/* Support for i386 has been removed from FreeBSD 6.0 onward. */ +#if FBSD_MAJOR >= 6 +#define SUBTARGET32_DEFAULT_CPU "i486" +#endif + +#define TARGET_ASM_FILE_END file_end_indicate_exec_stack + diff --git a/gcc/config/i386/freebsd64.h b/gcc/config/i386/freebsd64.h new file mode 100644 index 000000000..8b5b149b0 --- /dev/null +++ b/gcc/config/i386/freebsd64.h @@ -0,0 +1,46 @@ +/* Definitions for AMD x86-64 running FreeBSD with ELF format + Copyright (C) 2002, 2004, 2007, 2010, 2011 Free Software Foundation, Inc. + Contributed by David O'Brien + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. */ + + +#undef TARGET_VERSION +#define TARGET_VERSION fprintf (stderr, " (FreeBSD/x86-64 ELF)"); + +#define SUBTARGET_EXTRA_SPECS \ + { "fbsd_dynamic_linker", FBSD_DYNAMIC_LINKER } + +#undef CC1_SPEC +#define CC1_SPEC "%(cc1_cpu) %{profile:-p}" + +/* Provide a LINK_SPEC appropriate for the FreeBSD/x86-64 ELF target. + This is a copy of LINK_SPEC from tweaked for + the x86-64 target. */ + +#undef LINK_SPEC +#define LINK_SPEC "\ + %{m32:-m elf_i386_fbsd} \ + %{v:-V} \ + %{assert*} %{R*} %{rpath*} %{defsym*} \ + %{shared:-Bshareable %{h*} %{soname*}} \ + %{!shared: \ + %{!static: \ + %{rdynamic:-export-dynamic} \ + -dynamic-linker %(fbsd_dynamic_linker) } \ + %{static:-Bstatic}} \ + %{symbolic:-Bsymbolic}" diff --git a/gcc/config/i386/gas.h b/gcc/config/i386/gas.h new file mode 100644 index 000000000..4c7c9d1ac --- /dev/null +++ b/gcc/config/i386/gas.h @@ -0,0 +1,124 @@ +/* Definitions for Intel 386 using GAS. + Copyright (C) 1988, 1993, 1994, 1996, 2002, 2004, 2007, 2008 + Free Software Foundation, Inc. + +This file is part of GCC. 
+ +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. */ + +/* Note that i386/seq-gas.h is a GAS configuration that does not use this + file. */ + +/* Use the bsd assembler syntax. */ +/* we need to do this because gas is really a bsd style assembler, + * and so doesn't work well this these att-isms: + * + * ASM_OUTPUT_SKIP is .set .,.+N, which isn't implemented in gas + * ASM_OUTPUT_LOCAL is done with .set .,.+N, but that can't be + * used to define bss static space + * + * Next is the question of whether to uses underscores. RMS didn't + * like this idea at first, but since it is now obvious that we + * need this separate tm file for use with gas, at least to get + * dbx debugging info, I think we should also switch to underscores. + * We can keep i386v for real att style output, and the few + * people who want both form will have to compile twice. + */ + +/* these come from i386/bsd.h, but are specific to sequent */ +#undef DBX_NO_XREFS +#undef DBX_CONTIN_LENGTH + +/* Ask for COFF symbols. */ + +#define SDB_DEBUGGING_INFO 1 + +/* Output #ident as a .ident. */ + +#define ASM_OUTPUT_IDENT(FILE, NAME) fprintf (FILE, "\t.ident \"%s\"\n", NAME); + +/* In the past there was confusion as to what the argument to .align was + in GAS. For the last several years the rule has been this: for a.out + file formats that argument is LOG, and for all other file formats the + argument is 1<. +;; +;; The Geode architecture is one insn issue processor. +;; +;; This description is based on data from the following documents: +;; +;; "AMD Geode GX Processor Data Book" +;; Advanced Micro Devices, Inc., Aug 2005. +;; +;; "AMD Geode LX Processor Data Book" +;; Advanced Micro Devices, Inc., Jan 2006. +;; +;; +;; CPU execution units of the Geode: +;; +;; issue describes the issue pipeline. +;; alu describes the Integer unit +;; fpu describes the FP unit +;; +;; The fp unit is out of order execution unit with register renaming. +;; There is also memory management unit and execution pipeline for +;; load/store operations. We ignore it and difference between insns +;; using memory and registers. + +(define_automaton "geode") + +(define_cpu_unit "geode_issue,geode_alu,geode_fpu" "geode") + +(define_insn_reservation "alu" 1 + (and (eq_attr "cpu" "geode") + (eq_attr "type" "alu,alu1,negnot,icmp,lea,test,imov,imovx,icmov,incdec,setcc")) + "geode_issue,geode_alu") + +(define_insn_reservation "shift" 2 + (and (eq_attr "cpu" "geode") + (eq_attr "type" "ishift,ishift1,rotate,rotate1")) + "geode_issue,geode_alu*2") + +(define_insn_reservation "imul" 7 + (and (eq_attr "cpu" "geode") + (eq_attr "type" "imul")) + "geode_issue,geode_alu*7") + +(define_insn_reservation "idiv" 40 + (and (eq_attr "cpu" "geode") + (eq_attr "type" "idiv")) + "geode_issue,geode_alu*40") + +;; The branch unit. 
+(define_insn_reservation "call" 2 + (and (eq_attr "cpu" "geode") + (eq_attr "type" "call,callv")) + "geode_issue,geode_alu*2") + +(define_insn_reservation "geode_branch" 1 + (and (eq_attr "cpu" "geode") + (eq_attr "type" "ibr")) + "geode_issue,geode_alu") + +(define_insn_reservation "geode_pop_push" 1 + (and (eq_attr "cpu" "geode") + (eq_attr "type" "pop,push")) + "geode_issue,geode_alu") + +(define_insn_reservation "geode_leave" 2 + (and (eq_attr "cpu" "geode") + (eq_attr "type" "leave")) + "geode_issue,geode_alu*2") + +(define_insn_reservation "geode_load_str" 4 + (and (eq_attr "cpu" "geode") + (and (eq_attr "type" "str") + (eq_attr "memory" "load,both"))) + "geode_issue,geode_alu*4") + +(define_insn_reservation "geode_store_str" 2 + (and (eq_attr "cpu" "geode") + (and (eq_attr "type" "str") + (eq_attr "memory" "store"))) + "geode_issue,geode_alu*2") + +;; Be optimistic +(define_insn_reservation "geode_unknown" 1 + (and (eq_attr "cpu" "geode") + (eq_attr "type" "multi,other")) + "geode_issue,geode_alu") + +;; FPU + +(define_insn_reservation "geode_fop" 6 + (and (eq_attr "cpu" "geode") + (eq_attr "type" "fop,fcmp")) + "geode_issue,geode_fpu*6") + +(define_insn_reservation "geode_fsimple" 1 + (and (eq_attr "cpu" "geode") + (eq_attr "type" "fmov,fcmov,fsgn,fxch")) + "geode_issue,geode_fpu") + +(define_insn_reservation "geode_fist" 4 + (and (eq_attr "cpu" "geode") + (eq_attr "type" "fistp,fisttp")) + "geode_issue,geode_fpu*4") + +(define_insn_reservation "geode_fmul" 10 + (and (eq_attr "cpu" "geode") + (eq_attr "type" "fmul")) + "geode_issue,geode_fpu*10") + +(define_insn_reservation "geode_fdiv" 47 + (and (eq_attr "cpu" "geode") + (eq_attr "type" "fdiv")) + "geode_issue,geode_fpu*47") + +;; We use minimal latency (fsin) here +(define_insn_reservation "geode_fpspc" 54 + (and (eq_attr "cpu" "geode") + (eq_attr "type" "fpspc")) + "geode_issue,geode_fpu*54") + +(define_insn_reservation "geode_frndint" 12 + (and (eq_attr "cpu" "geode") + (eq_attr "type" "frndint")) + "geode_issue,geode_fpu*12") + +(define_insn_reservation "geode_mmxmov" 1 + (and (eq_attr "cpu" "geode") + (eq_attr "type" "mmxmov")) + "geode_issue,geode_fpu") + +(define_insn_reservation "geode_mmx" 2 + (and (eq_attr "cpu" "geode") + (eq_attr "type" "mmx,mmxadd,mmxmul,mmxcmp,mmxcvt,mmxshft")) + "geode_issue,geode_fpu*2") diff --git a/gcc/config/i386/gmm_malloc.h b/gcc/config/i386/gmm_malloc.h new file mode 100644 index 000000000..7a7e84069 --- /dev/null +++ b/gcc/config/i386/gmm_malloc.h @@ -0,0 +1,74 @@ +/* Copyright (C) 2004, 2009 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . 
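The _mm_malloc/_mm_free pair defined just below hands back over-aligned storage and hides the raw malloc pointer one slot before the aligned block so that _mm_free can recover it. A minimal usage sketch, assuming an SSE-enabled build (on GCC, xmmintrin.h pulls this allocator in via mm_malloc.h):

/* Request 16-byte-aligned storage, use it for an aligned SSE load,
   then release it; _mm_free reads the original malloc pointer stored
   just before the aligned block.  */
#include <xmmintrin.h>
#include <stdio.h>

int
main (void)
{
  float *buf = (float *) _mm_malloc (4 * sizeof (float), 16);
  if (!buf)
    return 1;

  buf[0] = 1.0f; buf[1] = 2.0f; buf[2] = 3.0f; buf[3] = 4.0f;
  __m128 v = _mm_load_ps (buf);            /* aligned load is now safe */

  float out[4];
  _mm_storeu_ps (out, v);
  printf ("%g %g %g %g\n", out[0], out[1], out[2], out[3]);

  _mm_free (buf);
  return 0;
}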
*/ + +#ifndef _MM_MALLOC_H_INCLUDED +#define _MM_MALLOC_H_INCLUDED + +#include +#include + +static __inline__ void* +_mm_malloc (size_t size, size_t align) +{ + void * malloc_ptr; + void * aligned_ptr; + + /* Error if align is not a power of two. */ + if (align & (align - 1)) + { + errno = EINVAL; + return ((void*) 0); + } + + if (size == 0) + return ((void *) 0); + + /* Assume malloc'd pointer is aligned at least to sizeof (void*). + If necessary, add another sizeof (void*) to store the value + returned by malloc. Effectively this enforces a minimum alignment + of sizeof double. */ + if (align < 2 * sizeof (void *)) + align = 2 * sizeof (void *); + + malloc_ptr = malloc (size + align); + if (!malloc_ptr) + return ((void *) 0); + + /* Align We have at least sizeof (void *) space below malloc'd ptr. */ + aligned_ptr = (void *) (((size_t) malloc_ptr + align) + & ~((size_t) (align) - 1)); + + /* Store the original pointer just before p. */ + ((void **) aligned_ptr) [-1] = malloc_ptr; + + return aligned_ptr; +} + +static __inline__ void +_mm_free (void * aligned_ptr) +{ + if (aligned_ptr) + free (((void **) aligned_ptr) [-1]); +} + +#endif /* _MM_MALLOC_H_INCLUDED */ diff --git a/gcc/config/i386/gmon-sol2.c b/gcc/config/i386/gmon-sol2.c new file mode 100644 index 000000000..44bbb4448 --- /dev/null +++ b/gcc/config/i386/gmon-sol2.c @@ -0,0 +1,459 @@ +/*- + * Copyright (c) 1991 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. [rescinded 22 July 1999] + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * This is a modified gmon.c by J.W.Hawtin , + * 14/8/96 based on the original gmon.c in GCC and the hacked version + * solaris 2 sparc version (config/sparc/gmon-sol.c) by Mark Eichin. To do + * process profiling on solaris 2.X X86 + * + * It must be used in conjunction with sol2-gc1.asm, which is used to start + * and stop process monitoring. + * + * Differences. + * + * On Solaris 2 _mcount is called by library functions not mcount, so support + * has been added for both. 
+ * + * Also the prototype for profil() is different + * + * Solaris 2 does not seem to have char *minbrk whcih allows the setting of + * the minimum SBRK region so this code has been removed and lets pray malloc + * does not mess it up. + * + * Notes + * + * This code could easily be integrated with the original gmon.c and perhaps + * should be. + */ +#include "tconfig.h" +#include "tsystem.h" +#include /* for creat() */ + +#ifdef DEBUG +#include +#endif + +static void moncontrol (int); +extern void monstartup (char *, char *); +extern void _mcleanup (void); +extern void internal_mcount ( +#ifdef __x86_64__ + char *, unsigned short * +#else + void +#endif + ); + + +struct phdr { + char *lpc; + char *hpc; + int ncnt; +}; + + +#define HISTFRACTION 2 +#define HISTCOUNTER unsigned short +#define HASHFRACTION 1 +#define ARCDENSITY 2 +#define MINARCS 50 +#define BASEADDRESS 0x8000000 /* On Solaris 2 X86 all executables start here + and not at 0 */ + +struct tostruct { + char *selfpc; + long count; + unsigned short link; +}; + +struct rawarc { + unsigned long raw_frompc; + unsigned long raw_selfpc; + long raw_count; +}; +#define ROUNDDOWN(x,y) (((x)/(y))*(y)) +#define ROUNDUP(x,y) ((((x)+(y)-1)/(y))*(y)) + +/* char *minbrk; */ + +typedef __SIZE_TYPE__ size_t; +typedef __PTRDIFF_TYPE__ intptr_t; + + /* + * froms is actually a bunch of unsigned shorts indexing tos + */ +static int profiling = 3; +static unsigned short *froms; +static struct tostruct *tos = 0; +static long tolimit = 0; +static char *s_lowpc = 0; +static char *s_highpc = 0; +static size_t s_textsize = 0; + +static int ssiz; +static char *sbuf; +static int s_scale; + /* see profil(2) where this is describe (incorrectly) */ +#define SCALE_1_TO_1 0x10000L + +#define MSG "No space for profiling buffer(s)\n" + +extern int errno; + +extern void *sbrk (intptr_t); + +void +monstartup(char *lowpc, char *highpc) +{ + size_t monsize; + char *buffer; + register size_t o; + + /* + * round lowpc and highpc to multiples of the density we're using + * so the rest of the scaling (here and in gprof) stays in ints. 
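The rounding the comment above describes uses the ROUNDDOWN/ROUNDUP macros defined earlier in this file; a short worked example with made-up text addresses and the default HISTFRACTION of 2, so the granularity is 2 * sizeof (HISTCOUNTER) = 4 bytes:

/* Snap a low/high pc pair to the histogram granularity, as monstartup
   does below; the addresses here are arbitrary examples.  */
#include <stdio.h>

#define ROUNDDOWN(x,y) (((x)/(y))*(y))
#define ROUNDUP(x,y)   ((((x)+(y)-1)/(y))*(y))

int
main (void)
{
  unsigned long lowpc  = 0x08049123UL;
  unsigned long highpc = 0x0805a457UL;
  unsigned long gran   = 2 * sizeof (unsigned short);  /* HISTFRACTION * sizeof (HISTCOUNTER) */

  printf ("lowpc  %#lx -> %#lx\n", lowpc,  ROUNDDOWN (lowpc,  gran));
  printf ("highpc %#lx -> %#lx\n", highpc, ROUNDUP (highpc, gran));
  return 0;
}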
+ */ + lowpc = (char *) + ROUNDDOWN((size_t)lowpc, HISTFRACTION*sizeof(HISTCOUNTER)); + s_lowpc = lowpc; + highpc = (char *) + ROUNDUP((size_t)highpc, HISTFRACTION*sizeof(HISTCOUNTER)); + s_highpc = highpc; + s_textsize = highpc - lowpc; + monsize = (s_textsize / HISTFRACTION) + sizeof(struct phdr); + buffer = (char *) sbrk( monsize ); + if ( buffer == (char *) -1 ) { + write( 2 , MSG , sizeof(MSG) ); + return; + } + froms = (unsigned short *) sbrk( s_textsize / HASHFRACTION ); + if ( froms == (unsigned short *) -1 ) { + write( 2 , MSG , sizeof(MSG) ); + froms = 0; + return; + } + tolimit = s_textsize * ARCDENSITY / 100; + if ( tolimit < MINARCS ) { + tolimit = MINARCS; + } else if ( tolimit > 65534 ) { + tolimit = 65534; + } + tos = (struct tostruct *) sbrk( tolimit * sizeof( struct tostruct ) ); + if ( tos == (struct tostruct *) -1 ) { + write( 2 , MSG , sizeof(MSG) ); + froms = 0; + tos = 0; + return; + } +/* minbrk = (char *) sbrk(0);*/ + tos[0].link = 0; + sbuf = buffer; + ssiz = monsize; + ( (struct phdr *) buffer ) -> lpc = lowpc; + ( (struct phdr *) buffer ) -> hpc = highpc; + ( (struct phdr *) buffer ) -> ncnt = ssiz; + monsize -= sizeof(struct phdr); + if ( monsize <= 0 ) + return; + o = highpc - lowpc; + if( monsize < o ) +#ifndef hp300 + s_scale = ( (float) monsize / o ) * SCALE_1_TO_1; +#else /* avoid floating point */ + { + int quot = o / monsize; + + if (quot >= 0x10000) + s_scale = 1; + else if (quot >= 0x100) + s_scale = 0x10000 / quot; + else if (o >= 0x800000) + s_scale = 0x1000000 / (o / (monsize >> 8)); + else + s_scale = 0x1000000 / ((o << 8) / monsize); + } +#endif + else + s_scale = SCALE_1_TO_1; + moncontrol(1); +} + +void +_mcleanup (void) +{ + int fd; + int fromindex; + int endfrom; + char *frompc; + int toindex; + struct rawarc rawarc; + + moncontrol(0); + fd = creat( "gmon.out" , 0666 ); + if ( fd < 0 ) { + perror( "mcount: gmon.out" ); + return; + } +# ifdef DEBUG + fprintf( stderr , "[mcleanup] sbuf %#x ssiz %d\n" , sbuf , ssiz ); +# endif /* DEBUG */ + + write( fd , sbuf , ssiz ); + endfrom = s_textsize / (HASHFRACTION * sizeof(*froms)); + for ( fromindex = 0 ; fromindex < endfrom ; fromindex++ ) { + if ( froms[fromindex] == 0 ) { + continue; + } + frompc = s_lowpc + (fromindex * HASHFRACTION * sizeof(*froms)); + for (toindex=froms[fromindex]; toindex!=0; toindex=tos[toindex].link) { +# ifdef DEBUG + fprintf( stderr , + "[mcleanup] frompc %#x selfpc %#x count %d\n" , + frompc , tos[toindex].selfpc , tos[toindex].count ); +# endif /* DEBUG */ + rawarc.raw_frompc = (unsigned long) frompc; + rawarc.raw_selfpc = (unsigned long) tos[toindex].selfpc; + rawarc.raw_count = tos[toindex].count; + write( fd , &rawarc , sizeof rawarc ); + } + } + close( fd ); +} + +#ifdef __x86_64__ +/* See GLIBC for additional information about this technique. */ +asm(".globl _mcount\n" + "\t.type\t_mcount, @function\n" + "_mcount:\n" + /* The compiler calls _mcount after the prologue, and does not + save any of the registers. Therefore we must preserve all + seven registers which may contain function arguments. */ + "\tsubq\t$0x38,%rsp\n" + "\tmovq\t%rax,(%rsp)\n" + "\tmovq\t%rcx,0x08(%rsp)\n" + "\tmovq\t%rdx,0x10(%rsp)\n" + "\tmovq\t%rsi,0x18(%rsp)\n" + "\tmovq\t%rdi,0x20(%rsp)\n" + "\tmovq\t%r8,0x28(%rsp)\n" + "\tmovq\t%r9,0x30(%rsp)\n" + /* Get SELFPC (pushed by the call to this function) and + FROMPCINDEX (via the frame pointer. */ + "\tmovq\t0x38(%rsp),%rdi\n" + "\tmovq\t0x8(%rbp),%rsi\n" + "\tcall\tinternal_mcount\n" + /* Restore the saved registers. 
*/ + "\tmovq\t0x30(%rsp),%r9\n" + "\tmovq\t0x28(%rsp),%r8\n" + "\tmovq\t0x20(%rsp),%rdi\n" + "\tmovq\t0x18(%rsp),%rsi\n" + "\tmovq\t0x10(%rsp),%rdx\n" + "\tmovq\t0x08(%rsp),%rcx\n" + "\tmovq\t(%rsp),%rax\n" + "\taddq\t$0x38,%rsp\n" + "\tretq\n" + ); +#else +/* Solaris 2 libraries use _mcount. */ +asm(".globl _mcount; _mcount: jmp internal_mcount"); +/* This is for compatibility with old versions of gcc which used mcount. */ +asm(".globl mcount; mcount: jmp internal_mcount"); +#endif + +void +internal_mcount ( +#ifdef __x86_64__ + char *selfpc, + unsigned short *frompcindex +#else + void +#endif + ) +{ +#ifndef __x86_64__ + register char *selfpc; + register unsigned short *frompcindex; +#endif + register struct tostruct *top; + register struct tostruct *prevtop; + register long toindex; + static char already_setup; + +#ifndef __x86_64__ + /* + * find the return address for mcount, + * and the return address for mcount's caller. + */ + + /* selfpc = pc pushed by mcount call. + This identifies the function that was just entered. */ + selfpc = (void *) __builtin_return_address (0); + /* frompcindex = pc in preceding frame. + This identifies the caller of the function just entered. */ + frompcindex = (void *) __builtin_return_address (1); +#endif + + if(!already_setup) { + extern char etext[]; + already_setup = 1; +#ifdef __x86_64__ + monstartup(0, etext); +#else + monstartup((char*)0x08040000, etext); +#endif +#ifdef USE_ONEXIT + on_exit(_mcleanup, 0); +#else + atexit(_mcleanup); +#endif + } + /* + * check that we are profiling + * and that we aren't recursively invoked. + */ + if (profiling) { + goto out; + } + profiling++; + /* + * check that frompcindex is a reasonable pc value. + * for example: signal catchers get called from the stack, + * not from text space. too bad. + */ + frompcindex = (unsigned short *)((long)frompcindex - (long)s_lowpc); + if ((unsigned long)frompcindex > s_textsize) { + goto done; + } + frompcindex = + &froms[((long)frompcindex) / (HASHFRACTION * sizeof(*froms))]; + toindex = *frompcindex; + if (toindex == 0) { + /* + * first time traversing this arc + */ + toindex = ++tos[0].link; + if (toindex >= tolimit) { + goto overflow; + } + *frompcindex = toindex; + top = &tos[toindex]; + top->selfpc = selfpc; + top->count = 1; + top->link = 0; + goto done; + } + top = &tos[toindex]; + if (top->selfpc == selfpc) { + /* + * arc at front of chain; usual case. + */ + top->count++; + goto done; + } + /* + * have to go looking down chain for it. + * top points to what we are looking at, + * prevtop points to previous top. + * we know it is not at the head of the chain. + */ + for (; /* goto done */; ) { + if (top->link == 0) { + /* + * top is end of the chain and none of the chain + * had top->selfpc == selfpc. + * so we allocate a new tostruct + * and link it to the head of the chain. + */ + toindex = ++tos[0].link; + if (toindex >= tolimit) { + goto overflow; + } + top = &tos[toindex]; + top->selfpc = selfpc; + top->count = 1; + top->link = *frompcindex; + *frompcindex = toindex; + goto done; + } + /* + * otherwise, check the next arc on the chain. + */ + prevtop = top; + top = &tos[top->link]; + if (top->selfpc == selfpc) { + /* + * there it is. + * increment its count + * move it to the head of the chain. 
+ */ + top->count++; + toindex = prevtop->link; + prevtop->link = top->link; + top->link = *frompcindex; + *frompcindex = toindex; + goto done; + } + + } +done: + profiling--; + /* and fall through */ +out: + return; /* normal return restores saved registers */ + +overflow: + profiling++; /* halt further profiling */ +# define TOLIMIT "mcount: tos overflow\n" + write(2, TOLIMIT, sizeof(TOLIMIT)); + goto out; +} + +/* + * Control profiling + * profiling is what mcount checks to see if + * all the data structures are ready. + */ +static void +moncontrol(int mode) +{ + if (mode) + { + /* start */ + profil((unsigned short *)(sbuf + sizeof(struct phdr)), + ssiz - sizeof(struct phdr), + (size_t)s_lowpc, s_scale); + + profiling = 0; + } else { + /* stop */ + profil((unsigned short *)0, 0, 0, 0); + profiling = 3; + } +} diff --git a/gcc/config/i386/gnu.h b/gcc/config/i386/gnu.h new file mode 100644 index 000000000..ce37683b4 --- /dev/null +++ b/gcc/config/i386/gnu.h @@ -0,0 +1,56 @@ +/* Configuration for an i386 running GNU with ELF as the target machine. */ + +/* +Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, +2005, 2007, 2008, 2011 Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC. If not, see . +*/ + +#undef GLIBC_DYNAMIC_LINKER +#define GLIBC_DYNAMIC_LINKER "/lib/ld.so" + +#undef TARGET_VERSION +#define TARGET_VERSION fprintf (stderr, " (i386 GNU)"); + +#undef CPP_SPEC +#define CPP_SPEC "%{pthread:-D_REENTRANT} %{posix:-D_POSIX_SOURCE}" + +#undef CC1_SPEC +#define CC1_SPEC "%(cc1_cpu)" + +#undef STARTFILE_SPEC +#if defined HAVE_LD_PIE +#define STARTFILE_SPEC \ + "%{!shared: %{pg|p|profile:gcrt0.o%s;pie:Scrt1.o%s;static:crt0.o%s;:crt1.o%s}} \ + crti.o%s %{static:crtbeginT.o%s;shared|pie:crtbeginS.o%s;:crtbegin.o%s}" +#else +#define STARTFILE_SPEC \ + "%{!shared: %{pg|p|profile:gcrt0.o%s;static:crt0.o%s;:crt1.o%s}} \ + crti.o%s %{static:crtbeginT.o%s;shared|pie:crtbeginS.o%s;:crtbegin.o%s}" +#endif + +#undef ENDFILE_SPEC +#define ENDFILE_SPEC \ + "%{shared|pie:crtendS.o%s;:crtend.o%s} crtn.o%s" + +/* FIXME: Is a Hurd-specific fallback mechanism necessary? */ +#undef MD_UNWIND_SUPPORT + +#ifdef TARGET_LIBC_PROVIDES_SSP +/* Not supported yet. */ +#undef TARGET_THREAD_SSP_OFFSET +#endif diff --git a/gcc/config/i386/gstabs.h b/gcc/config/i386/gstabs.h new file mode 100644 index 000000000..e9a621871 --- /dev/null +++ b/gcc/config/i386/gstabs.h @@ -0,0 +1,7 @@ +/* We do not want to output SDB debugging information. */ + +#undef SDB_DEBUGGING_INFO + +/* We want to output DBX debugging information. */ + +#define DBX_DEBUGGING_INFO 1 diff --git a/gcc/config/i386/gthr-win32.c b/gcc/config/i386/gthr-win32.c new file mode 100644 index 000000000..46ecb0d4b --- /dev/null +++ b/gcc/config/i386/gthr-win32.c @@ -0,0 +1,260 @@ +/* Implementation of W32-specific threads compatibility routines for + libgcc2. */ + +/* Copyright (C) 1999, 2000, 2002, 2004, 2008, 2009 Free Software Foundation, Inc. + Contributed by Mumit Khan . 
+ Modified and moved to separate file by Danny Smith + . + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 3, or (at your option) any later +version. + +GCC is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +Under Section 7 of GPL version 3, you are granted additional +permissions described in the GCC Runtime Library Exception, version +3.1, as published by the Free Software Foundation. + +You should have received a copy of the GNU General Public License and +a copy of the GCC Runtime Library Exception along with this program; +see the files COPYING3 and COPYING.RUNTIME respectively. If not, see +. */ + +#include +#ifndef __GTHREAD_HIDE_WIN32API +# define __GTHREAD_HIDE_WIN32API 1 +#endif +#undef __GTHREAD_I486_INLINE_LOCK_PRIMITIVES +#define __GTHREAD_I486_INLINE_LOCK_PRIMITIVES +#include + +/* Windows32 threads specific definitions. The windows32 threading model + does not map well into pthread-inspired gcc's threading model, and so + there are caveats one needs to be aware of. + + 1. The destructor supplied to __gthread_key_create is ignored for + generic x86-win32 ports. This will certainly cause memory leaks + due to unreclaimed eh contexts (sizeof (eh_context) is at least + 24 bytes for x86 currently). + + This memory leak may be significant for long-running applications + that make heavy use of C++ EH. + + However, Mingw runtime (version 0.3 or newer) provides a mechanism + to emulate pthreads key dtors; the runtime provides a special DLL, + linked in if -mthreads option is specified, that runs the dtors in + the reverse order of registration when each thread exits. If + -mthreads option is not given, a stub is linked in instead of the + DLL, which results in memory leak. Other x86-win32 ports can use + the same technique of course to avoid the leak. + + 2. The error codes returned are non-POSIX like, and cast into ints. + This may cause incorrect error return due to truncation values on + hw where sizeof (DWORD) > sizeof (int). + + 3. We are currently using a special mutex instead of the Critical + Sections, since Win9x does not support TryEnterCriticalSection + (while NT does). + + The basic framework should work well enough. In the long term, GCC + needs to use Structured Exception Handling on Windows32. */ + +int +__gthr_win32_once (__gthread_once_t *once, void (*func) (void)) +{ + if (once == NULL || func == NULL) + return EINVAL; + + if (! once->done) + { + if (InterlockedIncrement (&(once->started)) == 0) + { + (*func) (); + once->done = TRUE; + } + else + { + /* Another thread is currently executing the code, so wait for it + to finish; yield the CPU in the meantime. If performance + does become an issue, the solution is to use an Event that + we wait on here (and set above), but that implies a place to + create the event before this routine is called. */ + while (! once->done) + Sleep (0); + } + } + return 0; +} + +/* Windows32 thread local keys don't support destructors; this leads to + leaks, especially in threaded applications making extensive use of + C++ EH. Mingw uses a thread-support DLL to work-around this problem. 
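   A minimal usage sketch (an annotation, not part of the upstream file): per-thread
   state with cleanup goes through the __gthr_win32_* entry points defined just below,
   and the destructor only takes effect when the MinGW -mthreads helper DLL registers
   it via __mingwthr_key_dtor; otherwise the payload leaks as described above.  The
   context size and helper names here are invented for illustration, and the
   __gthread_key_t typedef is assumed to come from gthr-win32.h.

       #include <stdlib.h>

       static __gthread_key_t ctx_key;          // key allocated once via TlsAlloc

       static void
       ctx_dtor (void *p)                       // runs at thread exit only with -mthreads
       {
         free (p);
       }

       static int
       init_ctx_key (void)                      // call once, e.g. through __gthr_win32_once
       {
         return __gthr_win32_key_create (&ctx_key, ctx_dtor);
       }

       static void *
       get_ctx (void)
       {
         void *p = __gthr_win32_getspecific (ctx_key);
         if (p == NULL)
           {
             p = calloc (1, 64);                // hypothetical per-thread payload
             __gthr_win32_setspecific (ctx_key, p);
           }
         return p;
       }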
*/ + +int +__gthr_win32_key_create (__gthread_key_t *key, + void (*dtor) (void *) __attribute__((unused))) +{ + int status = 0; + DWORD tls_index = TlsAlloc (); + if (tls_index != 0xFFFFFFFF) + { + *key = tls_index; +#ifdef MINGW32_SUPPORTS_MT_EH + /* Mingw runtime will run the dtors in reverse order for each thread + when the thread exits. */ + status = __mingwthr_key_dtor (*key, dtor); +#endif + } + else + status = (int) GetLastError (); + return status; +} + +int +__gthr_win32_key_delete (__gthread_key_t key) +{ + return (TlsFree (key) != 0) ? 0 : (int) GetLastError (); +} + +void * +__gthr_win32_getspecific (__gthread_key_t key) +{ + DWORD lasterror; + void *ptr; + lasterror = GetLastError(); + ptr = TlsGetValue(key); + SetLastError( lasterror ); + return ptr; +} + +int +__gthr_win32_setspecific (__gthread_key_t key, const void *ptr) +{ + if (TlsSetValue (key, CONST_CAST2(void *, const void *, ptr)) != 0) + return 0; + else + return GetLastError (); +} + +void +__gthr_win32_mutex_init_function (__gthread_mutex_t *mutex) +{ + mutex->counter = -1; + mutex->sema = CreateSemaphore (NULL, 0, 65535, NULL); +} + +void +__gthr_win32_mutex_destroy (__gthread_mutex_t *mutex) +{ + CloseHandle ((HANDLE) mutex->sema); +} + +int +__gthr_win32_mutex_lock (__gthread_mutex_t *mutex) +{ + if (InterlockedIncrement (&mutex->counter) == 0 || + WaitForSingleObject (mutex->sema, INFINITE) == WAIT_OBJECT_0) + return 0; + else + { + /* WaitForSingleObject returns WAIT_FAILED, and we can only do + some best-effort cleanup here. */ + InterlockedDecrement (&mutex->counter); + return 1; + } +} + +int +__gthr_win32_mutex_trylock (__gthread_mutex_t *mutex) +{ + if (__GTHR_W32_InterlockedCompareExchange (&mutex->counter, 0, -1) < 0) + return 0; + else + return 1; +} + +int +__gthr_win32_mutex_unlock (__gthread_mutex_t *mutex) +{ + if (InterlockedDecrement (&mutex->counter) >= 0) + return ReleaseSemaphore (mutex->sema, 1, NULL) ? 0 : 1; + else + return 0; +} + +void +__gthr_win32_recursive_mutex_init_function (__gthread_recursive_mutex_t *mutex) +{ + mutex->counter = -1; + mutex->depth = 0; + mutex->owner = 0; + mutex->sema = CreateSemaphore (NULL, 0, 65535, NULL); +} + +int +__gthr_win32_recursive_mutex_lock (__gthread_recursive_mutex_t *mutex) +{ + DWORD me = GetCurrentThreadId(); + if (InterlockedIncrement (&mutex->counter) == 0) + { + mutex->depth = 1; + mutex->owner = me; + } + else if (mutex->owner == me) + { + InterlockedDecrement (&mutex->counter); + ++(mutex->depth); + } + else if (WaitForSingleObject (mutex->sema, INFINITE) == WAIT_OBJECT_0) + { + mutex->depth = 1; + mutex->owner = me; + } + else + { + /* WaitForSingleObject returns WAIT_FAILED, and we can only do + some best-effort cleanup here. */ + InterlockedDecrement (&mutex->counter); + return 1; + } + return 0; +} + +int +__gthr_win32_recursive_mutex_trylock (__gthread_recursive_mutex_t *mutex) +{ + DWORD me = GetCurrentThreadId(); + if (__GTHR_W32_InterlockedCompareExchange (&mutex->counter, 0, -1) < 0) + { + mutex->depth = 1; + mutex->owner = me; + } + else if (mutex->owner == me) + ++(mutex->depth); + else + return 1; + + return 0; +} + +int +__gthr_win32_recursive_mutex_unlock (__gthread_recursive_mutex_t *mutex) +{ + --(mutex->depth); + if (mutex->depth == 0) + { + mutex->owner = 0; + + if (InterlockedDecrement (&mutex->counter) >= 0) + return ReleaseSemaphore (mutex->sema, 1, NULL) ? 
0 : 1; + } + + return 0; +} diff --git a/gcc/config/i386/host-cygwin.c b/gcc/config/i386/host-cygwin.c new file mode 100644 index 000000000..7d975af43 --- /dev/null +++ b/gcc/config/i386/host-cygwin.c @@ -0,0 +1,78 @@ +/* Cygwin host-specific hook definitions. + Copyright (C) 2004, 2007, 2010 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3, or (at your + option) any later version. + + GCC is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public + License for more details. + + You should have received a copy of the GNU General Public License + along with GCC; see the file COPYING3. If not see + . */ + +#include "config.h" +#include "system.h" +#include "coretypes.h" +#include "hosthooks.h" +#include "hosthooks-def.h" +#include "diagnostic.h" + +static void * cygwin_gt_pch_get_address (size_t, int fd); +static size_t cygwin_gt_pch_alloc_granularity (void); + +#undef HOST_HOOKS_GT_PCH_GET_ADDRESS +#define HOST_HOOKS_GT_PCH_GET_ADDRESS cygwin_gt_pch_get_address +#undef HOST_HOOKS_GT_PCH_ALLOC_GRANULARITY +#define HOST_HOOKS_GT_PCH_ALLOC_GRANULARITY cygwin_gt_pch_alloc_granularity + +/* Granularity for reserving address space. */ +static const size_t va_granularity = 0x10000; + +/* Return the alignment required for allocating virtual memory. */ +static size_t +cygwin_gt_pch_alloc_granularity (void) +{ + return va_granularity; +} + +/* Identify an address that's likely to be free in a subsequent invocation + of the compiler. The area should be able to hold SIZE bytes. FD is an + open file descriptor if the host would like to probe with mmap. */ +static void * +cygwin_gt_pch_get_address (size_t sz, int fd) +{ + void *base; + off_t p = lseek(fd, 0, SEEK_CUR); + + if (p == (off_t) -1) + fatal_error ("can%'t get position in PCH file: %m"); + + /* Cygwin requires that the underlying file be at least + as large as the requested mapping. */ + if ((size_t) p < sz) + { + if ( ftruncate (fd, sz) == -1 ) + fatal_error ("can%'t extend PCH file: %m"); + } + + base = mmap (NULL, sz, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0); + + if (base == MAP_FAILED) + base = NULL; + else + munmap (base, sz); + + if (lseek (fd, p, SEEK_SET) == (off_t) -1 ) + fatal_error ("can%'t set position in PCH file: %m"); + + return base; +} + +const struct host_hooks host_hooks = HOST_HOOKS_INITIALIZER; diff --git a/gcc/config/i386/host-i386-darwin.c b/gcc/config/i386/host-i386-darwin.c new file mode 100644 index 000000000..03a19aa4c --- /dev/null +++ b/gcc/config/i386/host-i386-darwin.c @@ -0,0 +1,30 @@ +/* i386-darwin host-specific hook definitions. + Copyright (C) 2003, 2005, 2007 Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 3, or (at your option) any later +version. + +GCC is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. 
If not see +. */ + +#include "config.h" +#include "system.h" +#include "coretypes.h" +#include "hosthooks.h" +#include "hosthooks-def.h" +#include "config/host-darwin.h" + +/* Darwin doesn't do anything special for x86 hosts; this file exists just + to include config/host-darwin.h. */ + +const struct host_hooks host_hooks = HOST_HOOKS_INITIALIZER; diff --git a/gcc/config/i386/host-mingw32.c b/gcc/config/i386/host-mingw32.c new file mode 100644 index 000000000..c224b2807 --- /dev/null +++ b/gcc/config/i386/host-mingw32.c @@ -0,0 +1,179 @@ +/* mingw32 host-specific hook definitions. + Copyright (C) 2004, 2007, 2009, 2010 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3, or (at your + option) any later version. + + GCC is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public + License for more details. + + You should have received a copy of the GNU General Public License + along with GCC; see the file COPYING3. If not see + . */ + +#include "config.h" +#include "system.h" +#include "coretypes.h" +#include "hosthooks.h" +#include "hosthooks-def.h" +#include "diagnostic.h" + + +#define WIN32_LEAN_AND_MEAN /* Not so important if we have windows.h.gch. */ +#include + +static void * mingw32_gt_pch_get_address (size_t, int); +static int mingw32_gt_pch_use_address (void *, size_t, int, size_t); +static size_t mingw32_gt_pch_alloc_granularity (void); + +#undef HOST_HOOKS_GT_PCH_GET_ADDRESS +#define HOST_HOOKS_GT_PCH_GET_ADDRESS mingw32_gt_pch_get_address +#undef HOST_HOOKS_GT_PCH_USE_ADDRESS +#define HOST_HOOKS_GT_PCH_USE_ADDRESS mingw32_gt_pch_use_address +#undef HOST_HOOKS_GT_PCH_ALLOC_GRANULARITY +#define HOST_HOOKS_GT_PCH_ALLOC_GRANULARITY mingw32_gt_pch_alloc_granularity + +static inline void w32_error(const char*, const char*, int, const char*); + +/* FIXME: Is this big enough? */ +static const size_t pch_VA_max_size = 128 * 1024 * 1024; + +/* Granularity for reserving address space. */ +static const size_t va_granularity = 0x10000; + +/* Print out the GetLastError() translation. */ +static inline void +w32_error (const char* function, const char* file, int line, + const char* my_msg) +{ + LPSTR w32_msgbuf; + FormatMessageA (FORMAT_MESSAGE_ALLOCATE_BUFFER + | FORMAT_MESSAGE_FROM_SYSTEM + | FORMAT_MESSAGE_IGNORE_INSERTS + | FORMAT_MESSAGE_MAX_WIDTH_MASK, + NULL, GetLastError(), + MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), + (LPSTR) &w32_msgbuf, 0, NULL); + fprintf(stderr, "internal error in %s, at %s:%d: %s: %s\n", + function, trim_filename (file), line, my_msg, w32_msgbuf); + LocalFree ((HLOCAL)w32_msgbuf); +} + +/* Granularity for reserving address space. */ +static size_t mingw32_gt_pch_alloc_granularity (void) +{ + return va_granularity; +} + +/* Identify an address that's likely to be free in a subsequent invocation + of the compiler. The area should be able to hold SIZE bytes. FD is an + open file descriptor if the host would like to probe with mmap. */ + +static void * +mingw32_gt_pch_get_address (size_t size, int fd ATTRIBUTE_UNUSED) +{ + void* res; + size = (size + va_granularity - 1) & ~(va_granularity - 1); + if (size > pch_VA_max_size) + return NULL; + + /* FIXME: We let system determine base by setting first arg to NULL. 
+ Allocating at top of available address space avoids unnecessary + fragmentation of "ordinary" (malloc's) address space but may not + be safe with delayed load of system dll's. Preferred addresses + for NT system dlls is in 0x70000000 to 0x78000000 range. + If we allocate at bottom we need to reserve the address as early + as possible and at the same point in each invocation. */ + + res = VirtualAlloc (NULL, pch_VA_max_size, + MEM_RESERVE | MEM_TOP_DOWN, + PAGE_NOACCESS); + if (!res) + w32_error (__FUNCTION__, __FILE__, __LINE__, "VirtualAlloc"); + else + /* We do not need the address space for now, so free it. */ + VirtualFree (res, 0, MEM_RELEASE); + + return res; +} + +/* ADDR is an address returned by gt_pch_get_address. Attempt to allocate + SIZE bytes at the same address and load it with the data from FD at + OFFSET. Return -1 if we couldn't allocate memory at ADDR, return 0 + if the memory is allocated but the data not loaded, return 1 if done. */ + +static int +mingw32_gt_pch_use_address (void *addr, size_t size, int fd, + size_t offset) +{ + void * mmap_addr; + HANDLE mmap_handle; + + /* Apparently, MS Vista puts unnamed file mapping objects into Global + namespace when running an application in a Terminal Server + session. This causes failure since, by default, applications + don't get SeCreateGlobalPrivilege. We don't need global + memory sharing so explicitly put object into Local namespace. + + If multiple concurrent GCC processes are using PCH functionality, + MapViewOfFileEx returns "Access Denied" error. So we ensure the + session-wide mapping name is unique by appending process ID. */ + +#define OBJECT_NAME_FMT "Local\\MinGWGCCPCH-" + + char* object_name = NULL; + /* However, the documentation for CreateFileMapping says that on NT4 + and earlier, backslashes are invalid in object name. So, we need + to check if we are on Windows2000 or higher. */ + OSVERSIONINFO version_info; + version_info.dwOSVersionInfoSize = sizeof (version_info); + + if (size == 0) + return 0; + + /* Offset must be also be a multiple of allocation granularity for + this to work. We can't change the offset. */ + if ((offset & (va_granularity - 1)) != 0 || size > pch_VA_max_size) + return -1; + + + /* Determine the version of Windows we are running on and use a + uniquely-named local object if running > 4. */ + GetVersionEx (&version_info); + if (version_info.dwMajorVersion > 4) + { + char local_object_name [sizeof (OBJECT_NAME_FMT) + + sizeof (DWORD) * 2]; + snprintf (local_object_name, sizeof (local_object_name), + OBJECT_NAME_FMT "%lx", GetCurrentProcessId()); + object_name = local_object_name; + } + + mmap_handle = CreateFileMappingA ((HANDLE) _get_osfhandle (fd), NULL, + PAGE_WRITECOPY | SEC_COMMIT, 0, 0, + object_name); + + if (mmap_handle == NULL) + { + w32_error (__FUNCTION__, __FILE__, __LINE__, "CreateFileMapping"); + return -1; + } + mmap_addr = MapViewOfFileEx (mmap_handle, FILE_MAP_COPY, 0, offset, + size, addr); + if (mmap_addr != addr) + { + w32_error (__FUNCTION__, __FILE__, __LINE__, "MapViewOfFileEx"); + CloseHandle(mmap_handle); + return -1; + } + + return 1; +} + +const struct host_hooks host_hooks = HOST_HOOKS_INITIALIZER; diff --git a/gcc/config/i386/i386-builtin-types.awk b/gcc/config/i386/i386-builtin-types.awk new file mode 100644 index 000000000..7b016f44c --- /dev/null +++ b/gcc/config/i386/i386-builtin-types.awk @@ -0,0 +1,280 @@ +# Copyright (C) 2009 Free Software Foundation, Inc. 
+# +# This program is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation; either version 3, or (at your option) any +# later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; see the file COPYING3. If not see +# . + +# Generates compressed tables for types for i386 builtin functions. + +function do_error(string) { + print FILENAME ":" FNR ": " string > "/dev/stderr" + errors = 1 +} + +function check_type(string) { + if (!(string in type_hash)) + do_error("undefined type code " string) +} + +# We can significantly reduce the size of the read-only tables +# by forcing the compiler to use a smaller implementation type +# for the enumerations. +function attribute_mode(count) { + # ??? Except that we get strange "comparison always false" warnings + # for comparisons between different elements of the enumeration. + # print "#ifdef __GNUC__" + # if (count < 256) + # print " __attribute__((__mode__(__QI__)))" + # else + # print " __attribute__((__mode__(__HI__)))" + # print "#endif" +} + +BEGIN { + FS = "[() \t,]+" + + prim_defs = 0 + vect_defs = 0 + ptr_defs = 0 + cptr_defs = 0 + func_defs = 0 + func_args = 0 + alias_defs = 0 +} + +# Skip blank lines or comments. +/^[ \t]*(#|$)/ { + next +} + +$1 == "DEF_PRIMITIVE_TYPE" { + if (NF == 4) { + type_hash[$2] = 1 + prim_name[prim_defs] = $2 + prim_base[prim_defs] = $3 + prim_defs++ + } else + do_error("DEF_PRIMITIVE_TYPE expected 2 arguments") + next +} + +$1 == "DEF_VECTOR_TYPE" { + if (NF == 4 || NF == 5) { + check_type($3) + type_hash[$2] = 1 + vect_name[vect_defs] = $2 + vect_base[vect_defs] = $3 + vect_mode[vect_defs] = (NF == 5 ? $4 : $2) + vect_defs++ + } else + do_error("DEF_VECTOR_TYPE expected 2 arguments") + next +} + +$1 == "DEF_POINTER_TYPE" { + if (NF == 4) { + check_type($3) + type_hash[$2] = 1 + ptr_name[ptr_defs] = $2 + ptr_base[ptr_defs] = $3 + ptr_defs++ + } else if (NF == 5) { + check_type($3) + if ($4 == "CONST") { + type_hash[$2] = 1 + cptr_name[cptr_defs] = $2 + cptr_base[cptr_defs] = $3 + cptr_defs++ + } else + do_error("invalid qualifier \"" $4 "\"") + } + else + do_error("DEF_POINTER_TYPE expected 2 or 3 arguments") + next +} + +$1 == "DEF_FUNCTION_TYPE" { + func_start[func_defs] = func_args + for (i = 2; i < NF; ++i) { + check_type($i) + func_types[func_args++] = $i + } + + if (NF < 3) + do_error("DEF_FUNCTION_TYPE expected at least 1 argument") + else if (NF == 3) + name = $2 "_FTYPE_VOID" + else { + name = $2 "_FTYPE" + for (i = 3; i < NF; ++i) + name = name "_" $i + } + func_hash[name] = 1 + func_name[func_defs++] = name + next +} + +$1 == "DEF_FUNCTION_TYPE_ALIAS" { + if (NF == 4) { + if ($2 in func_hash) { + alias_base[alias_defs] = $2 + alias_name[alias_defs] = $2 "_" $3 + alias_defs++ + } else + do_error("undefined function code " $2) + } else + do_error("DEF_FUNCTION_TYPE_ALIAS expected 2 arguments") + next +} + +{ + do_error("unknown directive \"" $1 "\""); +} + +END { + if (errors) + exit 1 + + print "/* This file is auto-generated by i386-builtin-types.awk. */\n" + + # This first enumeration contains all of the non-function types. 
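# Illustrative shape of what the prints below emit (an annotation, not part
# of the upstream script; entries elided, names taken from
# i386-builtin-types.def):
#
#   enum ix86_builtin_type {
#     IX86_BT_VOID,
#     IX86_BT_CHAR,
#     ...
#     IX86_BT_LAST_PRIM = IX86_BT_FLOAT128,
#     IX86_BT_V2SF,
#     ...
#     IX86_BT_LAST_VECT = IX86_BT_V32QI,
#     ...
#     IX86_BT_LAST_CPTR = IX86_BT_PCV8SF
#   };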
+ print "enum ix86_builtin_type {" + for (i = 0; i < prim_defs; ++i) + print " IX86_BT_" prim_name[i] "," + print " IX86_BT_LAST_PRIM = IX86_BT_" prim_name[i-1] "," + for (i = 0; i < vect_defs; ++i) + print " IX86_BT_" vect_name[i] "," + print " IX86_BT_LAST_VECT = IX86_BT_" vect_name[i-1] "," + for (i = 0; i < ptr_defs; ++i) + print " IX86_BT_" ptr_name[i] "," + print " IX86_BT_LAST_PTR = IX86_BT_" ptr_name[i-1] "," + for (i = 0; i < cptr_defs; ++i) + print " IX86_BT_" cptr_name[i] "," + print " IX86_BT_LAST_CPTR = IX86_BT_" cptr_name[i-1] "\n}" + attribute_mode(prim_defs + vect_defs + ptr_defs + cptr_defs) + print ";\n\n" + + # We can't tabularize the initialization of the primitives, since + # at least one of them is created via a local variable. That's ok, + # just create a nice big macro to do all the work. + print "#define DEFINE_BUILTIN_PRIMITIVE_TYPES \\" + for (i = 0; i < prim_defs; ++i) { + printf " ix86_builtin_type_tab[(int)IX86_BT_" prim_name[i] \ + "] = " prim_base[i] + if (i < prim_defs - 1) + print ", \\" + } + print "\n\n" + + # The vector types are defined via two tables defining the real + # machine mode and the builtin primitive type. We use two tables + # rather than a structure to avoid structure padding and save space. + print "static const enum machine_mode ix86_builtin_type_vect_mode[] = {" + for (i = 0; i < vect_defs; ++i) { + if (i == 0) + printf " " + else if (i % 6 == 0) + printf ",\n " + else + printf ", " + printf vect_mode[i] "mode" + } + print "\n};\n\n" + + print "static const enum ix86_builtin_type " \ + "ix86_builtin_type_vect_base[] = {" + for (i = 0; i < vect_defs; ++i) { + if (i == 0) + printf " " + else if (i % 4 == 0) + printf ",\n " + else + printf ", " + printf "IX86_BT_" vect_base[i] + } + print "\n};\n\n" + + # The pointer types are defined via a single table defining the + # builtin primitive type. The const-ness of the pointer is taken + # from the enumeration value > IX86_BT_LAST_PTR. + print "static const enum ix86_builtin_type " \ + "ix86_builtin_type_ptr_base[] = {" + for (i = 0; i < ptr_defs; ++i) { + if (i == 0) + printf " " + else if (i % 4 == 0) + printf "\n " + printf " IX86_BT_" ptr_base[i] "," + } + print "\n /* pointer-to-constant defs start here */" + for (i = 0; i < cptr_defs; ++i) { + if (i == 0) + printf " " + else if (i % 4 == 0) + printf ",\n " + else + printf ", " + printf "IX86_BT_" cptr_base[i] + } + print "\n};\n\n" + + # This second enumeration contains all of the function types. + print "enum ix86_builtin_func_type {" + for (i = 0; i < func_defs; ++i) + print " " func_name[i] "," + print " IX86_BT_LAST_FUNC = " func_name[i-1] "," + for (i = 0; i < alias_defs; ++i) + print " " alias_name[i] "," + print " IX86_BT_LAST_ALIAS = " alias_name[i-1] "\n}" + attribute_mode(func_defs + alias_defs) + print ";\n\n" + + # The function types are defined via two tables. The first contains + # ranges consiting of the function's return type, followed by all of + # the function argument types. The ranges for all of the builtin + # functions are smooshed together in the same array. The second array + # contains, for each builtin, the index of the function's return type + # within the first array. 
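# Worked example (an annotation, not part of the upstream script): if the
# .def file contained only FLOAT_FTYPE_FLOAT and INT_FTYPE_V2DF_V2DF, the
# two tables below would come out as
#
#   static const enum ix86_builtin_type ix86_builtin_func_args[] = {
#     IX86_BT_FLOAT, IX86_BT_FLOAT,
#     IX86_BT_INT, IX86_BT_V2DF, IX86_BT_V2DF
#   };
#   static const unsigned short ix86_builtin_func_start[] = { 0, 2, 5 };
#
# i.e. the entries for function type N occupy the half-open range
# [func_start[N], func_start[N+1]), return type first, arguments after.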
+ print "static const enum ix86_builtin_type ix86_builtin_func_args[] = {" + for (i = 0; i < func_args; ++i) { + if (i == 0) + printf " " + else if (i % 4 == 0) + printf ",\n " + else + printf ", " + printf "IX86_BT_" func_types[i] + } + print "\n};\n\n" + + print "static const unsigned short ix86_builtin_func_start[] = {" + for (i = 0; i < func_defs; ++i) { + if (i == 0) + printf " " + else if (i % 10 == 0) + printf "\n " + printf " " func_start[i] "," + } + print " " func_args "\n};\n\n" + + print "static const enum ix86_builtin_func_type " \ + "ix86_builtin_func_alias_base[] = {" + for (i = 0; i < alias_defs; ++i) { + if (i == 0) + printf " " + else + printf ",\n " + printf alias_base[i] + } + print "\n};" +} diff --git a/gcc/config/i386/i386-builtin-types.def b/gcc/config/i386/i386-builtin-types.def new file mode 100644 index 000000000..05a7f5468 --- /dev/null +++ b/gcc/config/i386/i386-builtin-types.def @@ -0,0 +1,420 @@ +# This file provides a declarative way of describing the types that +# are used when declaring ix86 builtin functions. It is processed +# with i386-builtin-type.awk to produce C code. +# +# DEF_PRIMITIVE_TYPE (ENUM, TYPE) +# +# The ENUM is an identifier indicating which type is being defined. +# TYPE is a variable that represents the type. +# ??? Note that the awk program expects a single token for TYPE. +# At present, that's all that's required; revisit if it turns out +# that we need more than that. +# +# DEF_VECTOR_TYPE (ENUM, TYPE [, MODE]) +# +# This describes a vector type. ENUM is an identifier as above. +# TYPE is the enumeral for the inner type which should of course +# name a type of the proper inner mode. If present, MODE is the +# machine mode, else the machine mode should be the same as ENUM. +# +# DEF_POINTER_TYPE (ENUM, TYPE [, CONST]) +# +# This describes a pointer type. ENUM is an identifier as above; +# TYPE is the enumeral for the type pointed to. An optional third +# argument is the keyword CONST, which defines this to be a pointer to +# a constant type. +# +# DEF_FUNCTION_TYPE (RETURN, ARGN*) +# +# This describes a function type. The return type and the arguments +# are the enumerals defined above. The enumeration name for the +# function is formed by RETURN ## _FTYPE_ ## ARG1 ## _ ## ARG2 ... +# +# DEF_FUNCTION_TYPE_ALIAS (ENUM, SUFFIX) +# +# This defines an enumeration ENUM ## _ ## SUFFIX and arranges for +# the function type to be copied from ENUM. This is used to control +# how the expanders treat the function. +# + +DEF_PRIMITIVE_TYPE (VOID, void_type_node) +DEF_PRIMITIVE_TYPE (CHAR, char_type_node) +DEF_PRIMITIVE_TYPE (UCHAR, unsigned_char_type_node) +# ??? Logically this should be intQI_type_node, but that maps to "signed char" +# which is a different type than "char" even if "char" is signed. This must +# match the usage in emmintrin.h and changing this would change name mangling +# and so is not advisable. +DEF_PRIMITIVE_TYPE (QI, char_type_node) +DEF_PRIMITIVE_TYPE (HI, intHI_type_node) +DEF_PRIMITIVE_TYPE (SI, intSI_type_node) +# ??? Logically this should be intDI_type_node, but that maps to "long" +# with 64-bit, and that's not how the emmintrin.h is written. Again, +# changing this would change name mangling. +DEF_PRIMITIVE_TYPE (DI, long_long_integer_type_node) +DEF_PRIMITIVE_TYPE (UQI, unsigned_intQI_type_node) +DEF_PRIMITIVE_TYPE (UHI, unsigned_intHI_type_node) +DEF_PRIMITIVE_TYPE (USI, unsigned_intSI_type_node) +DEF_PRIMITIVE_TYPE (UDI, long_long_unsigned_type_node) +# ??? Some of the types below should use the mode types above. 
+DEF_PRIMITIVE_TYPE (USHORT, short_unsigned_type_node) +DEF_PRIMITIVE_TYPE (INT, integer_type_node) +DEF_PRIMITIVE_TYPE (UINT, unsigned_type_node) +DEF_PRIMITIVE_TYPE (UNSIGNED, unsigned_type_node) +DEF_PRIMITIVE_TYPE (LONGLONG, long_long_integer_type_node) +DEF_PRIMITIVE_TYPE (ULONGLONG, long_long_unsigned_type_node) +DEF_PRIMITIVE_TYPE (UINT8, unsigned_char_type_node) +DEF_PRIMITIVE_TYPE (UINT16, short_unsigned_type_node) +DEF_PRIMITIVE_TYPE (INT64, long_long_integer_type_node) +DEF_PRIMITIVE_TYPE (UINT64, long_long_unsigned_type_node) +DEF_PRIMITIVE_TYPE (FLOAT, float_type_node) +DEF_PRIMITIVE_TYPE (DOUBLE, double_type_node) +DEF_PRIMITIVE_TYPE (FLOAT80, float80_type_node) +DEF_PRIMITIVE_TYPE (FLOAT128, float128_type_node) + +# MMX vectors +DEF_VECTOR_TYPE (V2SF, FLOAT) +DEF_VECTOR_TYPE (V1DI, DI) +DEF_VECTOR_TYPE (V2SI, SI) +DEF_VECTOR_TYPE (V4HI, HI) +DEF_VECTOR_TYPE (V8QI, QI) + +# SSE vectors +DEF_VECTOR_TYPE (V2DF, DOUBLE) +DEF_VECTOR_TYPE (V4SF, FLOAT) +DEF_VECTOR_TYPE (V2DI, DI) +DEF_VECTOR_TYPE (V4SI, SI) +DEF_VECTOR_TYPE (V8HI, HI) +DEF_VECTOR_TYPE (V16QI, QI) +DEF_VECTOR_TYPE (V2UDI, UDI, V2DI) +DEF_VECTOR_TYPE (V4USI, USI, V4SI) +DEF_VECTOR_TYPE (V8UHI, UHI, V8HI) +DEF_VECTOR_TYPE (V16UQI, UQI, V16QI) + +# AVX vectors +DEF_VECTOR_TYPE (V4DF, DOUBLE) +DEF_VECTOR_TYPE (V8SF, FLOAT) +DEF_VECTOR_TYPE (V4DI, DI) +DEF_VECTOR_TYPE (V8SI, SI) +DEF_VECTOR_TYPE (V16HI, HI) +DEF_VECTOR_TYPE (V32QI, QI) + + +DEF_POINTER_TYPE (PCCHAR, CHAR, CONST) +DEF_POINTER_TYPE (PCDOUBLE, DOUBLE, CONST) +DEF_POINTER_TYPE (PCFLOAT, FLOAT, CONST) +DEF_POINTER_TYPE (PCHAR, CHAR) +DEF_POINTER_TYPE (PCVOID, VOID, CONST) +DEF_POINTER_TYPE (PVOID, VOID) +DEF_POINTER_TYPE (PDOUBLE, DOUBLE) +DEF_POINTER_TYPE (PFLOAT, FLOAT) +DEF_POINTER_TYPE (PUSHORT, USHORT) +DEF_POINTER_TYPE (PINT, INT) +DEF_POINTER_TYPE (PULONGLONG, ULONGLONG) +DEF_POINTER_TYPE (PUNSIGNED, UNSIGNED) + +DEF_POINTER_TYPE (PV2DF, V2DF) +DEF_POINTER_TYPE (PV2DI, V2DI) +DEF_POINTER_TYPE (PV2SF, V2SF) +DEF_POINTER_TYPE (PV4DF, V4DF) +DEF_POINTER_TYPE (PV4DI, V4DI) +DEF_POINTER_TYPE (PV4SF, V4SF) +DEF_POINTER_TYPE (PV8SF, V8SF) + +DEF_POINTER_TYPE (PCV2DF, V2DF, CONST) +DEF_POINTER_TYPE (PCV2SF, V2SF, CONST) +DEF_POINTER_TYPE (PCV4DF, V4DF, CONST) +DEF_POINTER_TYPE (PCV4SF, V4SF, CONST) +DEF_POINTER_TYPE (PCV8SF, V8SF, CONST) + +DEF_FUNCTION_TYPE (FLOAT128) +DEF_FUNCTION_TYPE (UINT64) +DEF_FUNCTION_TYPE (UNSIGNED) +DEF_FUNCTION_TYPE (VOID) +DEF_FUNCTION_TYPE (PVOID) + +DEF_FUNCTION_TYPE (FLOAT, FLOAT) +DEF_FUNCTION_TYPE (FLOAT128, FLOAT128) +DEF_FUNCTION_TYPE (INT, INT) +DEF_FUNCTION_TYPE (INT, V16QI) +DEF_FUNCTION_TYPE (INT, V2DF) +DEF_FUNCTION_TYPE (INT, V4DF) +DEF_FUNCTION_TYPE (INT, V4SF) +DEF_FUNCTION_TYPE (INT, V8QI) +DEF_FUNCTION_TYPE (INT, V8SF) +DEF_FUNCTION_TYPE (INT64, INT64) +DEF_FUNCTION_TYPE (INT64, V2DF) +DEF_FUNCTION_TYPE (INT64, V4SF) +DEF_FUNCTION_TYPE (UINT64, INT) +DEF_FUNCTION_TYPE (UINT16, UINT16) +DEF_FUNCTION_TYPE (UINT64, PUNSIGNED) +DEF_FUNCTION_TYPE (V16QI, PCCHAR) +DEF_FUNCTION_TYPE (V16QI, V16QI) +DEF_FUNCTION_TYPE (V2DF, PCDOUBLE) +DEF_FUNCTION_TYPE (V2DF, V2DF) +DEF_FUNCTION_TYPE (V2DF, V2SI) +DEF_FUNCTION_TYPE (V2DF, V4DF) +DEF_FUNCTION_TYPE (V2DF, V4SF) +DEF_FUNCTION_TYPE (V2DF, V4SI) +DEF_FUNCTION_TYPE (V2DI, PV2DI) +DEF_FUNCTION_TYPE (V2DI, V16QI) +DEF_FUNCTION_TYPE (V2DI, V2DI) +DEF_FUNCTION_TYPE (V2DI, V4SI) +DEF_FUNCTION_TYPE (V2DI, V8HI) +DEF_FUNCTION_TYPE (V2SF, V2SF) +DEF_FUNCTION_TYPE (V2SF, V2SI) +DEF_FUNCTION_TYPE (V2SI, V2DF) +DEF_FUNCTION_TYPE (V2SI, V2SF) +DEF_FUNCTION_TYPE (V2SI, V2SI) 
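# (Annotation, not in the upstream file: per the naming rule in the header
#  comment, each entry expands to an enumerator RETURN_FTYPE_ARG1_...; the
#  entry just above therefore becomes V2SI_FTYPE_V2SI, and the later
#  DEF_FUNCTION_TYPE (V2DF, V2DF, V2DF) becomes V2DF_FTYPE_V2DF_V2DF.)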
+DEF_FUNCTION_TYPE (V2SI, V4SF) +DEF_FUNCTION_TYPE (V32QI, PCCHAR) +DEF_FUNCTION_TYPE (V4DF, PCDOUBLE) +DEF_FUNCTION_TYPE (V4DF, PCV2DF) +DEF_FUNCTION_TYPE (V4DF, V2DF) +DEF_FUNCTION_TYPE (V4DF, V4DF) +DEF_FUNCTION_TYPE (V4DF, V4SF) +DEF_FUNCTION_TYPE (V4DF, V4SI) +DEF_FUNCTION_TYPE (V4HI, V4HI) +DEF_FUNCTION_TYPE (V4SF, PCFLOAT) +DEF_FUNCTION_TYPE (V4SF, V2DF) +DEF_FUNCTION_TYPE (V4SF, V4DF) +DEF_FUNCTION_TYPE (V4SF, V4SF) +DEF_FUNCTION_TYPE (V4SF, V4SI) +DEF_FUNCTION_TYPE (V4SF, V8SF) +DEF_FUNCTION_TYPE (V4SF, V8HI) +DEF_FUNCTION_TYPE (V4SI, V16QI) +DEF_FUNCTION_TYPE (V4SI, V2DF) +DEF_FUNCTION_TYPE (V4SI, V4DF) +DEF_FUNCTION_TYPE (V4SI, V4SF) +DEF_FUNCTION_TYPE (V4SI, V4SI) +DEF_FUNCTION_TYPE (V4SI, V8HI) +DEF_FUNCTION_TYPE (V4SI, V8SI) +DEF_FUNCTION_TYPE (V8HI, V16QI) +DEF_FUNCTION_TYPE (V8HI, V8HI) +DEF_FUNCTION_TYPE (V8QI, V8QI) +DEF_FUNCTION_TYPE (V8SF, PCFLOAT) +DEF_FUNCTION_TYPE (V8SF, PCV4SF) +DEF_FUNCTION_TYPE (V8SF, V4SF) +DEF_FUNCTION_TYPE (V8SF, V8SF) +DEF_FUNCTION_TYPE (V8SF, V8SI) +DEF_FUNCTION_TYPE (V8SF, V8HI) +DEF_FUNCTION_TYPE (V8SI, V4SI) +DEF_FUNCTION_TYPE (V8SI, V8SF) +DEF_FUNCTION_TYPE (VOID, PCVOID) +DEF_FUNCTION_TYPE (VOID, PVOID) +DEF_FUNCTION_TYPE (VOID, UINT64) +DEF_FUNCTION_TYPE (VOID, UNSIGNED) +DEF_FUNCTION_TYPE (INT, PUSHORT) +DEF_FUNCTION_TYPE (INT, PUNSIGNED) +DEF_FUNCTION_TYPE (INT, PULONGLONG) + +DEF_FUNCTION_TYPE (DI, V2DI, INT) +DEF_FUNCTION_TYPE (DOUBLE, V2DF, INT) +DEF_FUNCTION_TYPE (FLOAT, V4SF, INT) +DEF_FUNCTION_TYPE (FLOAT128, FLOAT128, FLOAT128) +DEF_FUNCTION_TYPE (HI, V4HI, INT) +DEF_FUNCTION_TYPE (HI, V8HI, INT) +DEF_FUNCTION_TYPE (INT, V2DF, V2DF) +DEF_FUNCTION_TYPE (INT, V2DI, V2DI) +DEF_FUNCTION_TYPE (INT, V4DF, V4DF) +DEF_FUNCTION_TYPE (INT, V4DI, V4DI) +DEF_FUNCTION_TYPE (INT, V4SF, V4SF) +DEF_FUNCTION_TYPE (INT, V8SF, V8SF) +DEF_FUNCTION_TYPE (QI, V16QI, INT) +DEF_FUNCTION_TYPE (QI, V8QI, INT) +DEF_FUNCTION_TYPE (SI, V2SI, INT) +DEF_FUNCTION_TYPE (SI, V4SI, INT) +DEF_FUNCTION_TYPE (UINT, UINT, UCHAR) +DEF_FUNCTION_TYPE (UINT, UINT, UINT) +DEF_FUNCTION_TYPE (UINT, UINT, USHORT) +DEF_FUNCTION_TYPE (UINT16, UINT16, INT) +DEF_FUNCTION_TYPE (UINT64, UINT64, UINT64) +DEF_FUNCTION_TYPE (UINT8, UINT8, INT) +DEF_FUNCTION_TYPE (V16QI, V16QI, SI) +DEF_FUNCTION_TYPE (V16QI, V16QI, V16QI) +DEF_FUNCTION_TYPE (V16QI, V8HI, V8HI) +DEF_FUNCTION_TYPE (V1DI, V1DI, SI) +DEF_FUNCTION_TYPE (V1DI, V1DI, V1DI) +DEF_FUNCTION_TYPE (V1DI, V2SI, V2SI) +DEF_FUNCTION_TYPE (V1DI, V8QI, V8QI) +DEF_FUNCTION_TYPE (V2DF, PCV2DF, V2DI) +DEF_FUNCTION_TYPE (V2DF, V2DF, DI) +DEF_FUNCTION_TYPE (V2DF, V2DF, INT) +DEF_FUNCTION_TYPE (V2DF, V2DF, PCDOUBLE) +DEF_FUNCTION_TYPE (V2DF, V2DF, SI) +DEF_FUNCTION_TYPE (V2DF, V2DF, V2DF) +DEF_FUNCTION_TYPE (V2DF, V2DF, V2DI) +DEF_FUNCTION_TYPE (V2DF, V2DF, V4SF) +DEF_FUNCTION_TYPE (V2DF, V4DF, INT) +DEF_FUNCTION_TYPE (V2DI, V16QI, V16QI) +DEF_FUNCTION_TYPE (V2DI, V2DF, V2DF) +DEF_FUNCTION_TYPE (V2DI, V2DI, INT) +DEF_FUNCTION_TYPE (V2DI, V2DI, SI) +DEF_FUNCTION_TYPE (V2DI, V2DI, V16QI) +DEF_FUNCTION_TYPE (V2DI, V2DI, V2DI) +DEF_FUNCTION_TYPE (V2DI, V4SI, V4SI) +DEF_FUNCTION_TYPE (V2SF, V2SF, V2SF) +DEF_FUNCTION_TYPE (V2SI, INT, INT) +DEF_FUNCTION_TYPE (V2SI, V2SF, V2SF) +DEF_FUNCTION_TYPE (V2SI, V2SI, SI) +DEF_FUNCTION_TYPE (V2SI, V2SI, V2SI) +DEF_FUNCTION_TYPE (V2SI, V4HI, V4HI) +DEF_FUNCTION_TYPE (V4DF, PCV4DF, V4DI) +DEF_FUNCTION_TYPE (V4DF, V4DF, INT) +DEF_FUNCTION_TYPE (V4DF, V4DF, V4DF) +DEF_FUNCTION_TYPE (V4DF, V4DF, V4DI) +DEF_FUNCTION_TYPE (V4HI, V2SI, V2SI) +DEF_FUNCTION_TYPE (V4HI, V4HI, INT) +DEF_FUNCTION_TYPE (V4HI, V4HI, SI) 
+DEF_FUNCTION_TYPE (V4HI, V4HI, V4HI) +DEF_FUNCTION_TYPE (V4HI, V8QI, V8QI) +DEF_FUNCTION_TYPE (V4SF, PCV4SF, V4SI) +DEF_FUNCTION_TYPE (V4SF, V4SF, DI) +DEF_FUNCTION_TYPE (V4SF, V4SF, INT) +DEF_FUNCTION_TYPE (V4SF, V4SF, PCV2SF) +DEF_FUNCTION_TYPE (V4SF, V4SF, SI) +DEF_FUNCTION_TYPE (V4SF, V4SF, V2DF) +DEF_FUNCTION_TYPE (V4SF, V4SF, V2SI) +DEF_FUNCTION_TYPE (V4SF, V4SF, V4SF) +DEF_FUNCTION_TYPE (V4SF, V4SF, V4SI) +DEF_FUNCTION_TYPE (V4SF, V8SF, INT) +DEF_FUNCTION_TYPE (V4SI, V2DF, V2DF) +DEF_FUNCTION_TYPE (V4SI, V4SF, V4SF) +DEF_FUNCTION_TYPE (V4SI, V4SI, INT) +DEF_FUNCTION_TYPE (V4SI, V4SI, SI) +DEF_FUNCTION_TYPE (V4SI, V4SI, V4SI) +DEF_FUNCTION_TYPE (V4SI, V8HI, V8HI) +DEF_FUNCTION_TYPE (V4SI, V8SI, INT) +DEF_FUNCTION_TYPE (V8HI, V16QI, V16QI) +DEF_FUNCTION_TYPE (V8HI, V4SI, V4SI) +DEF_FUNCTION_TYPE (V8HI, V8HI, INT) +DEF_FUNCTION_TYPE (V8HI, V8HI, SI) +DEF_FUNCTION_TYPE (V8HI, V8HI, V8HI) +DEF_FUNCTION_TYPE (V8HI, V8SF, INT) +DEF_FUNCTION_TYPE (V8HI, V4SF, INT) +DEF_FUNCTION_TYPE (V8QI, V4HI, V4HI) +DEF_FUNCTION_TYPE (V8QI, V8QI, V8QI) +DEF_FUNCTION_TYPE (V8SF, PCV8SF, V8SI) +DEF_FUNCTION_TYPE (V8SF, V8SF, INT) +DEF_FUNCTION_TYPE (V8SF, V8SF, V8SF) +DEF_FUNCTION_TYPE (V8SF, V8SF, V8SI) +DEF_FUNCTION_TYPE (VOID, PCHAR, V16QI) +DEF_FUNCTION_TYPE (VOID, PCHAR, V32QI) +DEF_FUNCTION_TYPE (VOID, PDOUBLE, V2DF) +DEF_FUNCTION_TYPE (VOID, PDOUBLE, V4DF) +DEF_FUNCTION_TYPE (VOID, PFLOAT, V4SF) +DEF_FUNCTION_TYPE (VOID, PFLOAT, V8SF) +DEF_FUNCTION_TYPE (VOID, PINT, INT) +DEF_FUNCTION_TYPE (VOID, PULONGLONG, ULONGLONG) +DEF_FUNCTION_TYPE (VOID, PV2DI, V2DI) +DEF_FUNCTION_TYPE (VOID, PV2SF, V4SF) +DEF_FUNCTION_TYPE (VOID, PV4DI, V4DI) +DEF_FUNCTION_TYPE (VOID, UNSIGNED, UNSIGNED) + +DEF_FUNCTION_TYPE (INT, V16QI, V16QI, INT) +DEF_FUNCTION_TYPE (UCHAR, UINT, UINT, UINT) +DEF_FUNCTION_TYPE (UCHAR, UINT64, UINT, UINT) +DEF_FUNCTION_TYPE (V16HI, V16HI, V16HI, V16HI) +DEF_FUNCTION_TYPE (V16QI, V16QI, QI, INT) +DEF_FUNCTION_TYPE (V16QI, V16QI, V16QI, INT) +DEF_FUNCTION_TYPE (V16QI, V16QI, V16QI, V16QI) +DEF_FUNCTION_TYPE (V1DI, V1DI, V1DI, INT) +DEF_FUNCTION_TYPE (V2DF, V2DF, V2DF, INT) +DEF_FUNCTION_TYPE (V2DF, V2DF, V2DF, V2DF) +DEF_FUNCTION_TYPE (V2DF, V2DF, V2DF, V2DI, INT) +DEF_FUNCTION_TYPE (V2DI, V2DI, DI, INT) +DEF_FUNCTION_TYPE (V2DI, V2DI, UINT, UINT) +DEF_FUNCTION_TYPE (V2DI, V2DI, V2DI, INT) +DEF_FUNCTION_TYPE (V2DI, V2DI, V2DI, V2DI) +DEF_FUNCTION_TYPE (V32QI, V32QI, V32QI, V32QI) +DEF_FUNCTION_TYPE (V4DF, V4DF, V2DF, INT) +DEF_FUNCTION_TYPE (V4DF, V4DF, V4DF, INT) +DEF_FUNCTION_TYPE (V4DF, V4DF, V4DF, V4DF) +DEF_FUNCTION_TYPE (V4DF, V4DF, V4DF, V4DI, INT) +DEF_FUNCTION_TYPE (V4DI, V4DI, V4DI, V4DI) +DEF_FUNCTION_TYPE (V4HI, V4HI, HI, INT) +DEF_FUNCTION_TYPE (V4SF, V4SF, FLOAT, INT) +DEF_FUNCTION_TYPE (V4SF, V4SF, V4SF, INT) +DEF_FUNCTION_TYPE (V4SF, V4SF, V4SF, V4SF) +DEF_FUNCTION_TYPE (V4SF, V4SF, V4SF, V4SI, INT) +DEF_FUNCTION_TYPE (V4SI, V4SI, SI, INT) +DEF_FUNCTION_TYPE (V4SI, V4SI, V4SI, INT) +DEF_FUNCTION_TYPE (V4SI, V4SI, V4SI, V2DI) +DEF_FUNCTION_TYPE (V4SI, V4SI, V4SI, V4SI) +DEF_FUNCTION_TYPE (V8HI, V8HI, HI, INT) +DEF_FUNCTION_TYPE (V8HI, V8HI, V8HI, INT) +DEF_FUNCTION_TYPE (V8HI, V8HI, V8HI, V4SI) +DEF_FUNCTION_TYPE (V8HI, V8HI, V8HI, V8HI) +DEF_FUNCTION_TYPE (V8SF, V8SF, V4SF, INT) +DEF_FUNCTION_TYPE (V8SF, V8SF, V8SF, INT) +DEF_FUNCTION_TYPE (V8SF, V8SF, V8SF, V8SF) +DEF_FUNCTION_TYPE (V8SF, V8SF, V8SF, V8SI, INT) +DEF_FUNCTION_TYPE (V8SI, V8SI, V4SI, INT) +DEF_FUNCTION_TYPE (V8SI, V8SI, V8SI, INT) +DEF_FUNCTION_TYPE (V8SI, V8SI, V8SI, V8SI) +DEF_FUNCTION_TYPE (VOID, PCVOID, 
UNSIGNED, UNSIGNED) +DEF_FUNCTION_TYPE (VOID, PV2DF, V2DI, V2DF) +DEF_FUNCTION_TYPE (VOID, PV4DF, V4DI, V4DF) +DEF_FUNCTION_TYPE (VOID, PV4SF, V4SI, V4SF) +DEF_FUNCTION_TYPE (VOID, PV8SF, V8SI, V8SF) +DEF_FUNCTION_TYPE (VOID, UINT, UINT, UINT) +DEF_FUNCTION_TYPE (VOID, UINT64, UINT, UINT) +DEF_FUNCTION_TYPE (VOID, V16QI, V16QI, PCHAR) +DEF_FUNCTION_TYPE (VOID, V8QI, V8QI, PCHAR) +DEF_FUNCTION_TYPE (V2DF, V2DF, V2DF, V2DI) +DEF_FUNCTION_TYPE (V4SF, V4SF, V4SF, V4SI) +DEF_FUNCTION_TYPE (V2UDI, V2UDI, V2UDI, V2UDI) +DEF_FUNCTION_TYPE (V4USI, V4USI, V4USI, V4USI) +DEF_FUNCTION_TYPE (V8UHI, V8UHI, V8UHI, V8UHI) +DEF_FUNCTION_TYPE (V16UQI, V16UQI, V16UQI, V16UQI) +DEF_FUNCTION_TYPE (V4DF, V4DF, V4DF, V4DI) +DEF_FUNCTION_TYPE (V8SF, V8SF, V8SF, V8SI) + +DEF_FUNCTION_TYPE (V2DI, V2DI, V2DI, UINT, UINT) +DEF_FUNCTION_TYPE (V4HI, HI, HI, HI, HI) + +DEF_FUNCTION_TYPE (INT, V16QI, INT, V16QI, INT, INT) +DEF_FUNCTION_TYPE (V16QI, V16QI, INT, V16QI, INT, INT) + +DEF_FUNCTION_TYPE (V8QI, QI, QI, QI, QI, QI, QI, QI, QI) + +DEF_FUNCTION_TYPE_ALIAS (INT_FTYPE_V2DF_V2DF, PTEST) +DEF_FUNCTION_TYPE_ALIAS (INT_FTYPE_V2DI_V2DI, PTEST) +DEF_FUNCTION_TYPE_ALIAS (INT_FTYPE_V4DF_V4DF, PTEST) +DEF_FUNCTION_TYPE_ALIAS (INT_FTYPE_V4DI_V4DI, PTEST) +DEF_FUNCTION_TYPE_ALIAS (INT_FTYPE_V4SF_V4SF, PTEST) +DEF_FUNCTION_TYPE_ALIAS (INT_FTYPE_V8SF_V8SF, PTEST) + +DEF_FUNCTION_TYPE_ALIAS (V2DF_FTYPE_V2DF, VEC_MERGE) +DEF_FUNCTION_TYPE_ALIAS (V4SF_FTYPE_V4SF, VEC_MERGE) + +DEF_FUNCTION_TYPE_ALIAS (V1DI_FTYPE_V1DI_SI, COUNT) +DEF_FUNCTION_TYPE_ALIAS (V2DI_FTYPE_V2DI_SI, COUNT) +DEF_FUNCTION_TYPE_ALIAS (V2SI_FTYPE_V2SI_SI, COUNT) +DEF_FUNCTION_TYPE_ALIAS (V4HI_FTYPE_V4HI_SI, COUNT) +DEF_FUNCTION_TYPE_ALIAS (V4SI_FTYPE_V4SI_SI, COUNT) +DEF_FUNCTION_TYPE_ALIAS (V8HI_FTYPE_V8HI_SI, COUNT) +DEF_FUNCTION_TYPE_ALIAS (V1DI_FTYPE_V1DI_V1DI, COUNT) +DEF_FUNCTION_TYPE_ALIAS (V2DI_FTYPE_V2DI_V2DI, COUNT) +DEF_FUNCTION_TYPE_ALIAS (V2SI_FTYPE_V2SI_V2SI, COUNT) +DEF_FUNCTION_TYPE_ALIAS (V4HI_FTYPE_V4HI_V4HI, COUNT) +DEF_FUNCTION_TYPE_ALIAS (V4SI_FTYPE_V4SI_V4SI, COUNT) +DEF_FUNCTION_TYPE_ALIAS (V8HI_FTYPE_V8HI_V8HI, COUNT) + +DEF_FUNCTION_TYPE_ALIAS (V2DF_FTYPE_V2DF_V2DF, SWAP) +DEF_FUNCTION_TYPE_ALIAS (V4SF_FTYPE_V4SF_V4SF, SWAP) + +DEF_FUNCTION_TYPE_ALIAS (V2DI_FTYPE_V2DI_INT, CONVERT) +DEF_FUNCTION_TYPE_ALIAS (V2DI_FTYPE_V2DI_V2DI_INT, CONVERT) +DEF_FUNCTION_TYPE_ALIAS (V1DI_FTYPE_V1DI_V1DI_INT, CONVERT) + +DEF_FUNCTION_TYPE_ALIAS (V16QI_FTYPE_V16QI_V16QI, CMP) +DEF_FUNCTION_TYPE_ALIAS (V2DI_FTYPE_V2DI_V2DI, CMP) +DEF_FUNCTION_TYPE_ALIAS (V4SI_FTYPE_V4SI_V4SI, CMP) +DEF_FUNCTION_TYPE_ALIAS (V8HI_FTYPE_V8HI_V8HI, CMP) + +DEF_FUNCTION_TYPE_ALIAS (V16QI_FTYPE_V16QI_V16QI, TF) +DEF_FUNCTION_TYPE_ALIAS (V2DF_FTYPE_V2DF_V2DF, TF) +DEF_FUNCTION_TYPE_ALIAS (V2DI_FTYPE_V2DI_V2DI, TF) +DEF_FUNCTION_TYPE_ALIAS (V4SF_FTYPE_V4SF_V4SF, TF) +DEF_FUNCTION_TYPE_ALIAS (V4SI_FTYPE_V4SI_V4SI, TF) +DEF_FUNCTION_TYPE_ALIAS (V8HI_FTYPE_V8HI_V8HI, TF) diff --git a/gcc/config/i386/i386-c.c b/gcc/config/i386/i386-c.c new file mode 100644 index 000000000..149735133 --- /dev/null +++ b/gcc/config/i386/i386-c.c @@ -0,0 +1,401 @@ +/* Subroutines used for macro/preprocessor support on the ia-32. + Copyright (C) 2008, 2009, 2010 + Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. 
+ +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. */ + +#include "config.h" +#include "system.h" +#include "coretypes.h" +#include "tm.h" +#include "tree.h" +#include "tm_p.h" +#include "flags.h" +#include "c-family/c-common.h" +#include "ggc.h" +#include "target.h" +#include "target-def.h" +#include "cpplib.h" +#include "c-family/c-pragma.h" + +static bool ix86_pragma_target_parse (tree, tree); +static void ix86_target_macros_internal + (int, enum processor_type, enum processor_type, enum fpmath_unit, + void (*def_or_undef) (cpp_reader *, const char *)); + + +/* Internal function to either define or undef the appropriate system + macros. */ +static void +ix86_target_macros_internal (int isa_flag, + enum processor_type arch, + enum processor_type tune, + enum fpmath_unit fpmath, + void (*def_or_undef) (cpp_reader *, + const char *)) +{ + /* For some of the k6/pentium varients there weren't seperate ISA bits to + identify which tune/arch flag was passed, so figure it out here. */ + size_t arch_len = strlen (ix86_arch_string); + size_t tune_len = strlen (ix86_tune_string); + int last_arch_char = ix86_arch_string[arch_len - 1]; + int last_tune_char = ix86_tune_string[tune_len - 1]; + + /* Built-ins based on -march=. */ + switch (arch) + { + case PROCESSOR_I386: + break; + case PROCESSOR_I486: + def_or_undef (parse_in, "__i486"); + def_or_undef (parse_in, "__i486__"); + break; + case PROCESSOR_PENTIUM: + def_or_undef (parse_in, "__i586"); + def_or_undef (parse_in, "__i586__"); + def_or_undef (parse_in, "__pentium"); + def_or_undef (parse_in, "__pentium__"); + if (isa_flag & OPTION_MASK_ISA_MMX) + def_or_undef (parse_in, "__pentium_mmx__"); + break; + case PROCESSOR_PENTIUMPRO: + def_or_undef (parse_in, "__i686"); + def_or_undef (parse_in, "__i686__"); + def_or_undef (parse_in, "__pentiumpro"); + def_or_undef (parse_in, "__pentiumpro__"); + break; + case PROCESSOR_GEODE: + def_or_undef (parse_in, "__geode"); + def_or_undef (parse_in, "__geode__"); + break; + case PROCESSOR_K6: + def_or_undef (parse_in, "__k6"); + def_or_undef (parse_in, "__k6__"); + if (last_arch_char == '2') + def_or_undef (parse_in, "__k6_2__"); + else if (last_arch_char == '3') + def_or_undef (parse_in, "__k6_3__"); + else if (isa_flag & OPTION_MASK_ISA_3DNOW) + def_or_undef (parse_in, "__k6_3__"); + break; + case PROCESSOR_ATHLON: + def_or_undef (parse_in, "__athlon"); + def_or_undef (parse_in, "__athlon__"); + if (isa_flag & OPTION_MASK_ISA_SSE) + def_or_undef (parse_in, "__athlon_sse__"); + break; + case PROCESSOR_K8: + def_or_undef (parse_in, "__k8"); + def_or_undef (parse_in, "__k8__"); + break; + case PROCESSOR_AMDFAM10: + def_or_undef (parse_in, "__amdfam10"); + def_or_undef (parse_in, "__amdfam10__"); + break; + case PROCESSOR_BDVER1: + def_or_undef (parse_in, "__bdver1"); + def_or_undef (parse_in, "__bdver1__"); + break; + case PROCESSOR_BTVER1: + def_or_undef (parse_in, "__btver1"); + def_or_undef (parse_in, "__btver1__"); + break; + case PROCESSOR_PENTIUM4: + def_or_undef (parse_in, "__pentium4"); + def_or_undef (parse_in, "__pentium4__"); + break; + case PROCESSOR_NOCONA: + def_or_undef (parse_in, "__nocona"); + def_or_undef (parse_in, "__nocona__"); + break; + case PROCESSOR_CORE2_32: + case PROCESSOR_CORE2_64: + 
def_or_undef (parse_in, "__core2"); + def_or_undef (parse_in, "__core2__"); + break; + case PROCESSOR_COREI7_32: + case PROCESSOR_COREI7_64: + def_or_undef (parse_in, "__corei7"); + def_or_undef (parse_in, "__corei7__"); + break; + case PROCESSOR_ATOM: + def_or_undef (parse_in, "__atom"); + def_or_undef (parse_in, "__atom__"); + break; + /* use PROCESSOR_max to not set/unset the arch macro. */ + case PROCESSOR_max: + break; + case PROCESSOR_GENERIC32: + case PROCESSOR_GENERIC64: + gcc_unreachable (); + } + + /* Built-ins based on -mtune=. */ + switch (tune) + { + case PROCESSOR_I386: + def_or_undef (parse_in, "__tune_i386__"); + break; + case PROCESSOR_I486: + def_or_undef (parse_in, "__tune_i486__"); + break; + case PROCESSOR_PENTIUM: + def_or_undef (parse_in, "__tune_i586__"); + def_or_undef (parse_in, "__tune_pentium__"); + if (last_tune_char == 'x') + def_or_undef (parse_in, "__tune_pentium_mmx__"); + break; + case PROCESSOR_PENTIUMPRO: + def_or_undef (parse_in, "__tune_i686__"); + def_or_undef (parse_in, "__tune_pentiumpro__"); + switch (last_tune_char) + { + case '3': + def_or_undef (parse_in, "__tune_pentium3__"); + /* FALLTHRU */ + case '2': + def_or_undef (parse_in, "__tune_pentium2__"); + break; + } + break; + case PROCESSOR_GEODE: + def_or_undef (parse_in, "__tune_geode__"); + break; + case PROCESSOR_K6: + def_or_undef (parse_in, "__tune_k6__"); + if (last_tune_char == '2') + def_or_undef (parse_in, "__tune_k6_2__"); + else if (last_tune_char == '3') + def_or_undef (parse_in, "__tune_k6_3__"); + else if (isa_flag & OPTION_MASK_ISA_3DNOW) + def_or_undef (parse_in, "__tune_k6_3__"); + break; + case PROCESSOR_ATHLON: + def_or_undef (parse_in, "__tune_athlon__"); + if (isa_flag & OPTION_MASK_ISA_SSE) + def_or_undef (parse_in, "__tune_athlon_sse__"); + break; + case PROCESSOR_K8: + def_or_undef (parse_in, "__tune_k8__"); + break; + case PROCESSOR_AMDFAM10: + def_or_undef (parse_in, "__tune_amdfam10__"); + break; + case PROCESSOR_BDVER1: + def_or_undef (parse_in, "__tune_bdver1__"); + break; + case PROCESSOR_BTVER1: + def_or_undef (parse_in, "__tune_btver1__"); + break; + case PROCESSOR_PENTIUM4: + def_or_undef (parse_in, "__tune_pentium4__"); + break; + case PROCESSOR_NOCONA: + def_or_undef (parse_in, "__tune_nocona__"); + break; + case PROCESSOR_CORE2_32: + case PROCESSOR_CORE2_64: + def_or_undef (parse_in, "__tune_core2__"); + break; + case PROCESSOR_COREI7_32: + case PROCESSOR_COREI7_64: + def_or_undef (parse_in, "__tune_corei7__"); + break; + case PROCESSOR_ATOM: + def_or_undef (parse_in, "__tune_atom__"); + break; + case PROCESSOR_GENERIC32: + case PROCESSOR_GENERIC64: + break; + /* use PROCESSOR_max to not set/unset the tune macro. 
*/ + case PROCESSOR_max: + break; + } + + if (isa_flag & OPTION_MASK_ISA_MMX) + def_or_undef (parse_in, "__MMX__"); + if (isa_flag & OPTION_MASK_ISA_3DNOW) + def_or_undef (parse_in, "__3dNOW__"); + if (isa_flag & OPTION_MASK_ISA_3DNOW_A) + def_or_undef (parse_in, "__3dNOW_A__"); + if (isa_flag & OPTION_MASK_ISA_SSE) + def_or_undef (parse_in, "__SSE__"); + if (isa_flag & OPTION_MASK_ISA_SSE2) + def_or_undef (parse_in, "__SSE2__"); + if (isa_flag & OPTION_MASK_ISA_SSE3) + def_or_undef (parse_in, "__SSE3__"); + if (isa_flag & OPTION_MASK_ISA_SSSE3) + def_or_undef (parse_in, "__SSSE3__"); + if (isa_flag & OPTION_MASK_ISA_SSE4_1) + def_or_undef (parse_in, "__SSE4_1__"); + if (isa_flag & OPTION_MASK_ISA_SSE4_2) + def_or_undef (parse_in, "__SSE4_2__"); + if (isa_flag & OPTION_MASK_ISA_AES) + def_or_undef (parse_in, "__AES__"); + if (isa_flag & OPTION_MASK_ISA_PCLMUL) + def_or_undef (parse_in, "__PCLMUL__"); + if (isa_flag & OPTION_MASK_ISA_AVX) + def_or_undef (parse_in, "__AVX__"); + if (isa_flag & OPTION_MASK_ISA_FMA) + def_or_undef (parse_in, "__FMA__"); + if (isa_flag & OPTION_MASK_ISA_SSE4A) + def_or_undef (parse_in, "__SSE4A__"); + if (isa_flag & OPTION_MASK_ISA_FMA4) + def_or_undef (parse_in, "__FMA4__"); + if (isa_flag & OPTION_MASK_ISA_XOP) + def_or_undef (parse_in, "__XOP__"); + if (isa_flag & OPTION_MASK_ISA_LWP) + def_or_undef (parse_in, "__LWP__"); + if (isa_flag & OPTION_MASK_ISA_ABM) + def_or_undef (parse_in, "__ABM__"); + if (isa_flag & OPTION_MASK_ISA_BMI) + def_or_undef (parse_in, "__BMI__"); + if (isa_flag & OPTION_MASK_ISA_TBM) + def_or_undef (parse_in, "__TBM__"); + if (isa_flag & OPTION_MASK_ISA_POPCNT) + def_or_undef (parse_in, "__POPCNT__"); + if (isa_flag & OPTION_MASK_ISA_FSGSBASE) + def_or_undef (parse_in, "__FSGSBASE__"); + if (isa_flag & OPTION_MASK_ISA_RDRND) + def_or_undef (parse_in, "__RDRND__"); + if (isa_flag & OPTION_MASK_ISA_F16C) + def_or_undef (parse_in, "__F16C__"); + if ((fpmath & FPMATH_SSE) && (isa_flag & OPTION_MASK_ISA_SSE)) + def_or_undef (parse_in, "__SSE_MATH__"); + if ((fpmath & FPMATH_SSE) && (isa_flag & OPTION_MASK_ISA_SSE2)) + def_or_undef (parse_in, "__SSE2_MATH__"); +} + + +/* Hook to validate the current #pragma GCC target and set the state, and + update the macros based on what was changed. If ARGS is NULL, then + POP_TARGET is used to reset the options. */ + +static bool +ix86_pragma_target_parse (tree args, tree pop_target) +{ + tree prev_tree = build_target_option_node (); + tree cur_tree; + struct cl_target_option *prev_opt; + struct cl_target_option *cur_opt; + int prev_isa; + int cur_isa; + int diff_isa; + enum processor_type prev_arch; + enum processor_type prev_tune; + enum processor_type cur_arch; + enum processor_type cur_tune; + + if (! args) + { + cur_tree = ((pop_target) + ? pop_target + : target_option_default_node); + cl_target_option_restore (&global_options, + TREE_TARGET_OPTION (cur_tree)); + } + else + { + cur_tree = ix86_valid_target_attribute_tree (args); + if (!cur_tree) + return false; + } + + target_option_current_node = cur_tree; + + /* Figure out the previous/current isa, arch, tune and the differences. 
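   A concrete illustration (an editorial sketch, not part of the patch; it
   assumes AVX was not already enabled on the command line): with

       #pragma GCC target ("avx")

   in effect, cur_isa gains OPTION_MASK_ISA_AVX, so that bit is set in
   diff_isa.  The cpp_undef pass below receives prev_isa & diff_isa (the
   bit is clear there, so nothing is undefined for it), while the
   cpp_define pass receives cur_isa & diff_isa and defines __AVX__.
   Popping back to the previous target swaps the roles and __AVX__ is
   undefined again; arch and tune values that did not change are
   neutralized by forcing them to PROCESSOR_max, whose switch cases
   deliberately do nothing.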
*/ + prev_opt = TREE_TARGET_OPTION (prev_tree); + cur_opt = TREE_TARGET_OPTION (cur_tree); + prev_isa = prev_opt->x_ix86_isa_flags; + cur_isa = cur_opt->x_ix86_isa_flags; + diff_isa = (prev_isa ^ cur_isa); + prev_arch = (enum processor_type) prev_opt->arch; + prev_tune = (enum processor_type) prev_opt->tune; + cur_arch = (enum processor_type) cur_opt->arch; + cur_tune = (enum processor_type) cur_opt->tune; + + /* If the same processor is used for both previous and current options, don't + change the macros. */ + if (cur_arch == prev_arch) + cur_arch = prev_arch = PROCESSOR_max; + + if (cur_tune == prev_tune) + cur_tune = prev_tune = PROCESSOR_max; + + /* Undef all of the macros for that are no longer current. */ + ix86_target_macros_internal (prev_isa & diff_isa, + prev_arch, + prev_tune, + (enum fpmath_unit) prev_opt->fpmath, + cpp_undef); + + /* Define all of the macros for new options that were just turned on. */ + ix86_target_macros_internal (cur_isa & diff_isa, + cur_arch, + cur_tune, + (enum fpmath_unit) cur_opt->fpmath, + cpp_define); + + return true; +} + +/* Function to tell the preprocessor about the defines for the current target. */ + +void +ix86_target_macros (void) +{ + /* 32/64-bit won't change with target specific options, so do the assert and + builtin_define_std calls here. */ + if (TARGET_64BIT) + { + cpp_assert (parse_in, "cpu=x86_64"); + cpp_assert (parse_in, "machine=x86_64"); + cpp_define (parse_in, "__amd64"); + cpp_define (parse_in, "__amd64__"); + cpp_define (parse_in, "__x86_64"); + cpp_define (parse_in, "__x86_64__"); + } + else + { + cpp_assert (parse_in, "cpu=i386"); + cpp_assert (parse_in, "machine=i386"); + builtin_define_std ("i386"); + } + + ix86_target_macros_internal (ix86_isa_flags, + ix86_arch, + ix86_tune, + ix86_fpmath, + cpp_define); +} + + +/* Register target pragmas. We need to add the hook for parsing #pragma GCC + option here rather than in i386.c since it will pull in various preprocessor + functions, and those are not present in languages like fortran without a + preprocessor. */ + +void +ix86_register_pragmas (void) +{ + /* Update pragma hook to allow parsing #pragma GCC target. */ + targetm.target_option.pragma_parse = ix86_pragma_target_parse; + +#ifdef REGISTER_SUBTARGET_PRAGMAS + REGISTER_SUBTARGET_PRAGMAS (); +#endif +} diff --git a/gcc/config/i386/i386-interix.h b/gcc/config/i386/i386-interix.h new file mode 100644 index 000000000..a2f579a1c --- /dev/null +++ b/gcc/config/i386/i386-interix.h @@ -0,0 +1,357 @@ +/* Target definitions for GCC for Intel 80386 running Interix + Parts Copyright (C) 1991, 1999, 2000, 2002, 2003, 2004, 2007, 2008, 2009, + 2010 Free Software Foundation, Inc. + + Parts: + by Douglas B. Rupp (drupp@cs.washington.edu). + by Ron Guilmette (rfg@netcom.com). + by Donn Terry (donn@softway.com). + by Mumit Khan (khan@xraylith.wisc.edu). + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. */ + +/* The rest must follow. 
*/ + +#define DBX_DEBUGGING_INFO 1 +#define SDB_DEBUGGING_INFO 1 +#define PREFERRED_DEBUGGING_TYPE DBX_DEBUG + +/* By default, target has a 80387, uses IEEE compatible arithmetic, + and returns float values in the 387 and needs stack probes + We also align doubles to 64-bits for MSVC default compatibility + We do bitfields MSVC-compatibly by default, too. */ +#undef TARGET_SUBTARGET_DEFAULT +#define TARGET_SUBTARGET_DEFAULT \ + (MASK_80387 | MASK_IEEE_FP | MASK_FLOAT_RETURNS | MASK_STACK_PROBE | \ + MASK_ALIGN_DOUBLE | MASK_MS_BITFIELD_LAYOUT) + +#undef TARGET_CPU_DEFAULT +#define TARGET_CPU_DEFAULT TARGET_CPU_DEFAULT_i486 + +#define WCHAR_TYPE_SIZE 16 +#define WCHAR_TYPE "short unsigned int" + +/* WinNT (and thus Interix) use unsigned int */ +#define SIZE_TYPE "unsigned int" + +#define ASM_LOAD_ADDR(loc, reg) " leal " #loc "," #reg "\n" + +#define TARGET_DECLSPEC 1 + +/* cpp handles __STDC__ */ +#define TARGET_OS_CPP_BUILTINS() \ + do \ + { \ + builtin_define ("__INTERIX"); \ + builtin_define ("__OPENNT"); \ + builtin_define ("_M_IX86=300"); \ + builtin_define ("_X86_=1"); \ + builtin_define ("__stdcall=__attribute__((__stdcall__))"); \ + builtin_define ("__cdecl=__attribute__((__cdecl__))"); \ + builtin_assert ("system=unix"); \ + builtin_assert ("system=interix"); \ + if (preprocessing_asm_p ()) \ + builtin_define_std ("LANGUAGE_ASSEMBLY"); \ + else \ + { \ + builtin_define_std ("LANGUAGE_C"); \ + if (c_dialect_cxx ()) \ + builtin_define_std ("LANGUAGE_C_PLUS_PLUS"); \ + if (c_dialect_objc ()) \ + builtin_define_std ("LANGUAGE_OBJECTIVE_C"); \ + } \ + } \ + while (0) + +#undef CPP_SPEC +/* Write out the correct language type definition for the header files. + Unless we have assembler language, write out the symbols for C. + mieee is an Alpha specific variant. Cross pollination a bad idea. + */ +#define CPP_SPEC "-remap %{posix:-D_POSIX_SOURCE} \ +-isystem %$INTERIX_ROOT/usr/include" + +#define TARGET_VERSION fprintf (stderr, " (i386 Interix)"); + +/* The global __fltused is necessary to cause the printf/scanf routines + for outputting/inputting floating point numbers to be loaded. Since this + is kind of hard to detect, we just do it all the time. */ +#undef X86_FILE_START_FLTUSED +#define X86_FILE_START_FLTUSED 1 + +/* A table of bytes codes used by the ASM_OUTPUT_ASCII and + ASM_OUTPUT_LIMITED_STRING macros. Each byte in the table + corresponds to a particular byte value [0..255]. For any + given byte value, if the value in the corresponding table + position is zero, the given character can be output directly. + If the table value is 1, the byte must be output as a \ooo + octal escape. If the tables value is anything else, then the + byte value should be output as a \ followed by the value + in the table. Note that we can use standard UN*X escape + sequences for many control characters, but we don't use + \a to represent BEL because some svr4 assemblers (e.g. on + the i386) don't know about that. Also, we don't use \v + since some versions of gas, such as 2.2 did not accept it. 
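   A worked example (an editorial annotation, not part of the upstream
   header): for the bytes 'a', TAB, '"' and 0x01, the table below yields
   0, 't', '"' and 1 respectively, so ASM_OUTPUT_LIMITED_STRING emits

       .string "a\t\"\001"

   i.e. bytes whose table value is 0 pass through unchanged, a value of 1
   forces a three-digit octal escape, and any other value is printed as a
   backslash followed by that value.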
*/ + +#define ESCAPES \ +"\1\1\1\1\1\1\1\1btn\1fr\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\ +\0\0\"\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\\\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\1\ +\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\ +\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\ +\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\ +\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1" + +/* Some svr4 assemblers have a limit on the number of characters which + can appear in the operand of a .string directive. If your assembler + has such a limitation, you should define STRING_LIMIT to reflect that + limit. Note that at least some svr4 assemblers have a limit on the + actual number of bytes in the double-quoted string, and that they + count each character in an escape sequence as one byte. Thus, an + escape sequence like \377 would count as four bytes. + + If your target assembler doesn't support the .string directive, you + should define this to zero. +*/ + +#define STRING_LIMIT ((unsigned) 256) + +#define STRING_ASM_OP "\t.string\t" + +/* The routine used to output NUL terminated strings. We use a special + version of this for most svr4 targets because doing so makes the + generated assembly code more compact (and thus faster to assemble) + as well as more readable, especially for targets like the i386 + (where the only alternative is to output character sequences as + comma separated lists of numbers). */ + +#define ASM_OUTPUT_LIMITED_STRING(FILE, STR) \ + do \ + { \ + const unsigned char *_limited_str = \ + (const unsigned char *) (STR); \ + unsigned ch; \ + fprintf ((FILE), "%s\"", STRING_ASM_OP); \ + for (; (ch = *_limited_str); _limited_str++) \ + { \ + int escape = ESCAPES[ch]; \ + switch (escape) \ + { \ + case 0: \ + putc (ch, (FILE)); \ + break; \ + case 1: \ + fprintf ((FILE), "\\%03o", ch); \ + break; \ + default: \ + putc ('\\', (FILE)); \ + putc (escape, (FILE)); \ + break; \ + } \ + } \ + fprintf ((FILE), "\"\n"); \ + } \ + while (0) + +/* The routine used to output sequences of byte values. We use a special + version of this for most svr4 targets because doing so makes the + generated assembly code more compact (and thus faster to assemble) + as well as more readable. Note that if we find subparts of the + character sequence which end with NUL (and which are shorter than + STRING_LIMIT) we output those using ASM_OUTPUT_LIMITED_STRING. 
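To see concretely what ASM_OUTPUT_LIMITED_STRING above emits, here is a small self-contained harness; escape_class is an approximation of the 256-entry ESCAPES table rather than a copy of it: printable bytes pass through, a few control characters become two-character escapes, and everything else is forced to a three-digit octal escape.

    #include <stdio.h>

    /* Return 0 to print the byte directly, 1 to force a \ooo octal
       escape, or the letter to emit after a backslash.  */
    static int
    escape_class (unsigned ch)
    {
      switch (ch)
        {
        case '\b': return 'b';
        case '\t': return 't';
        case '\n': return 'n';
        case '\f': return 'f';
        case '\r': return 'r';
        case '"':  return '"';
        case '\\': return '\\';
        default:   return (ch >= ' ' && ch < 0x7f) ? 0 : 1;
        }
    }

    int
    main (void)
    {
      const unsigned char *s = (const unsigned char *) "hi\tthere\n";
      printf ("\t.string\t\"");
      for (; *s; s++)
        {
          int e = escape_class (*s);
          if (e == 0)
            putchar (*s);
          else if (e == 1)
            printf ("\\%03o", *s);
          else
            printf ("\\%c", e);
        }
      printf ("\"\n");   /* emits:  .string "hi\tthere\n"  */
      return 0;
    }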
*/ + +#undef ASM_OUTPUT_ASCII +#define ASM_OUTPUT_ASCII(FILE, STR, LENGTH) \ + do \ + { \ + const unsigned char *_ascii_bytes = \ + (const unsigned char *) (STR); \ + const unsigned char *limit = _ascii_bytes + (LENGTH); \ + unsigned bytes_in_chunk = 0; \ + for (; _ascii_bytes < limit; _ascii_bytes++) \ + { \ + const unsigned char *p; \ + if (bytes_in_chunk >= 64) \ + { \ + fputc ('\n', (FILE)); \ + bytes_in_chunk = 0; \ + } \ + for (p = _ascii_bytes; p < limit && *p != '\0'; p++) \ + continue; \ + if (p < limit && (p - _ascii_bytes) <= (long) STRING_LIMIT) \ + { \ + if (bytes_in_chunk > 0) \ + { \ + fputc ('\n', (FILE)); \ + bytes_in_chunk = 0; \ + } \ + ASM_OUTPUT_LIMITED_STRING ((FILE), _ascii_bytes); \ + _ascii_bytes = p; \ + } \ + else \ + { \ + if (bytes_in_chunk == 0) \ + fputs (ASM_BYTE, (FILE)); \ + else \ + fputc (',', (FILE)); \ + fprintf ((FILE), "0x%02x", *_ascii_bytes); \ + bytes_in_chunk += 5; \ + } \ + } \ + if (bytes_in_chunk > 0) \ + fputc ('\n', (FILE)); \ + } \ + while (0) + +/* Emit code to check the stack when allocating more that 4000 + bytes in one go. */ + +#define CHECK_STACK_LIMIT 0x1000 + +/* the following are OSF linker (not gld) specific... we don't want them */ +#undef HAS_INIT_SECTION +#undef LD_INIT_SWITCH +#undef LD_FINI_SWITCH + +/* The following are needed for us to be able to use winnt.c, but are not + otherwise meaningful to Interix. (The functions that use these are + never called because we don't do DLLs.) */ +#define TARGET_NOP_FUN_DLLIMPORT 1 +#define drectve_section() /* nothing */ + +/* Objective-C has its own packing rules... + Objc tries to parallel the code in stor-layout.c at runtime + (see libobjc/encoding.c). This (compile-time) packing info isn't + available at runtime, so it's hopeless to try. + + And if the user tries to set the flag for objc, give an error + so he has some clue. */ + +#undef SUBTARGET_OVERRIDE_OPTIONS +#define SUBTARGET_OVERRIDE_OPTIONS \ +do { \ + if (strcmp (lang_hooks.name, "GNU Objective-C") == 0) \ + { \ + if ((target_flags & MASK_MS_BITFIELD_LAYOUT) != 0 \ + && (target_flags_explicit & MASK_MS_BITFIELD_LAYOUT) != 0) \ + { \ + error ("ms-bitfields not supported for objc"); \ + } \ + target_flags &= ~MASK_MS_BITFIELD_LAYOUT; \ + } \ +} while (0) + +#define EH_FRAME_IN_DATA_SECTION + +#define READONLY_DATA_SECTION_ASM_OP "\t.section\t.rdata,\"r\"" + +/* The MS compilers take alignment as a number of bytes, so we do as well */ +#undef ASM_OUTPUT_ALIGN +#define ASM_OUTPUT_ALIGN(FILE,LOG) \ + if ((LOG)!=0) fprintf ((FILE), "\t.balign %d\n", 1<<(LOG)) + +/* The linker will take care of this, and having them causes problems with + ld -r (specifically -rU). */ +#define CTOR_LISTS_DEFINED_EXTERNALLY 1 + +#define SET_ASM_OP "\t.set\t" +/* Output a definition (implements alias) */ +#define ASM_OUTPUT_DEF(FILE,LABEL1,LABEL2) \ +do \ +{ \ + fputs (SET_ASM_OP, (FILE)); \ + assemble_name (FILE, LABEL1); \ + fputc (',', (FILE)); \ + assemble_name (FILE, LABEL2); \ + fputc ('\n', (FILE)); \ + } \ +while (0) + +#define HOST_PTR_AS_INT unsigned long + +/* The following two flags are usually "off" for i386, because some non-gnu + tools (for the i386) don't handle them. However, we don't have that + problem, so.... */ + +/* Forward references to tags are allowed. */ +#define SDB_ALLOW_FORWARD_REFERENCES + +/* Unknown tags are also allowed. */ +#define SDB_ALLOW_UNKNOWN_REFERENCES + +/* The integer half of this list needs to be constant. However, there's + a lot of disagreement about what the floating point adjustments should + be. 
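A quick worked example of ASM_OUTPUT_ALIGN above: GCC hands the macro the base-2 logarithm of the requested alignment, and the macro prints the alignment as a byte count for .balign, matching the MS convention noted in the comment. output_align below is a plain-function restatement of the same behaviour:

    #include <stdio.h>

    static void
    output_align (FILE *file, int log)
    {
      if (log != 0)
        fprintf (file, "\t.balign %d\n", 1 << log);
    }

    int
    main (void)
    {
      output_align (stdout, 2);   /* prints:  .balign 4   */
      output_align (stdout, 4);   /* prints:  .balign 16  */
      return 0;
    }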
We pick one that works with gdb. (The underlying problem is + what to do about the segment registers. Since we have access to them + from /proc, we'll allow them to be accessed in gdb, even tho the + gcc compiler can't generate them. (There's some evidence that + MSVC does, but possibly only for certain special "canned" sequences.) */ + +#undef DBX_REGISTER_NUMBER +#define DBX_REGISTER_NUMBER(n) \ +(TARGET_64BIT ? dbx64_register_map[n] \ + : (n) == 0 ? 0 \ + : (n) == 1 ? 2 \ + : (n) == 2 ? 1 \ + : (n) == 3 ? 3 \ + : (n) == 4 ? 6 \ + : (n) == 5 ? 7 \ + : (n) == 6 ? 5 \ + : (n) == 7 ? 4 \ + : ((n) >= FIRST_STACK_REG && (n) <= LAST_STACK_REG) ? (n)+8 \ + : (-1)) + +/* Define this macro if references to a symbol must be treated + differently depending on something about the variable or + function named by the symbol (such as what section it is in). */ + +#define SUBTARGET_ENCODE_SECTION_INFO i386_pe_encode_section_info +#undef TARGET_STRIP_NAME_ENCODING +#define TARGET_STRIP_NAME_ENCODING i386_pe_strip_name_encoding_full + +#if 0 +/* Turn this back on when the linker is updated to handle grouped + .data$ sections correctly. See corresponding note in i386/interix.c. + MK. */ + +/* Interix uses explicit import from shared libraries. */ +#define MULTIPLE_SYMBOL_SPACES 1 + +extern void i386_pe_unique_section (tree, int); +#define TARGET_ASM_UNIQUE_SECTION i386_pe_unique_section +#define TARGET_ASM_FUNCTION_RODATA_SECTION default_no_function_rodata_section + +#define SUPPORTS_ONE_ONLY 1 +#endif /* 0 */ + +/* Switch into a generic section. */ +#define TARGET_ASM_NAMED_SECTION default_pe_asm_named_section + +/* DWARF2 Unwinding doesn't work with exception handling yet. */ +#define DWARF2_UNWIND_INFO 0 + +/* Don't assume anything about the header files. */ +#define NO_IMPLICIT_EXTERN_C + +/* MSVC returns structs of up to 8 bytes via registers. */ + +#define DEFAULT_PCC_STRUCT_RETURN 0 + +#define SUBTARGET_RETURN_IN_MEMORY(TYPE, FNTYPE) \ + (TYPE_MODE (TYPE) == BLKmode \ + || (AGGREGATE_TYPE_P (TYPE) && int_size_in_bytes (TYPE) > 8 )) diff --git a/gcc/config/i386/i386-interix3.h b/gcc/config/i386/i386-interix3.h new file mode 100644 index 000000000..abd202c91 --- /dev/null +++ b/gcc/config/i386/i386-interix3.h @@ -0,0 +1,23 @@ +/* Target definitions for GCC for Intel 80386 running Interix V3. + Copyright (C) 2001, 2007 Free Software Foundation, Inc. + Contributed by Douglas B. Rupp (rupp@gnat.com) + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. */ + +#undef CPP_SPEC +#define CPP_SPEC "%{posix:-D_POSIX_SOURCE}" + diff --git a/gcc/config/i386/i386-modes.def b/gcc/config/i386/i386-modes.def new file mode 100644 index 000000000..c1e82cc6d --- /dev/null +++ b/gcc/config/i386/i386-modes.def @@ -0,0 +1,91 @@ +/* Definitions of target machine for GCC for IA-32. + Copyright (C) 2002, 2004, 2005, 2007, 2008 Free Software Foundation, Inc. + +This file is part of GCC. 
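The ternary chain in DBX_REGISTER_NUMBER above is a small permutation of the first eight integer registers, with the FP stack registers simply offset by 8. The table below is read straight off that chain (32-bit integer-register branch only) and can make the mapping easier to eyeball:

    #include <stdio.h>

    /* Index = GCC register number, value = dbx/stabs register number.  */
    static const int dbx32_int_reg_map[8] = { 0, 2, 1, 3, 6, 7, 5, 4 };

    int
    main (void)
    {
      for (int i = 0; i < 8; i++)
        printf ("gcc reg %d -> dbx reg %d\n", i, dbx32_int_reg_map[i]);
      return 0;
    }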
+ +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. */ + +/* The x86_64 ABI specifies both XF and TF modes. + XFmode is __float80 is IEEE extended; TFmode is __float128 + is IEEE quad. */ + +FRACTIONAL_FLOAT_MODE (XF, 80, 12, ieee_extended_intel_96_format); +FLOAT_MODE (TF, 16, ieee_quad_format); + +/* In ILP32 mode, XFmode has size 12 and alignment 4. + In LP64 mode, XFmode has size and alignment 16. */ +ADJUST_FLOAT_FORMAT (XF, (TARGET_128BIT_LONG_DOUBLE + ? &ieee_extended_intel_128_format + : TARGET_96_ROUND_53_LONG_DOUBLE + ? &ieee_extended_intel_96_round_53_format + : &ieee_extended_intel_96_format)); +ADJUST_BYTESIZE (XF, TARGET_128BIT_LONG_DOUBLE ? 16 : 12); +ADJUST_ALIGNMENT (XF, TARGET_128BIT_LONG_DOUBLE ? 16 : 4); + +/* Add any extra modes needed to represent the condition code. + + For the i386, we need separate modes when floating-point + equality comparisons are being done. + + Add CCNO to indicate comparisons against zero that requires + Overflow flag to be unset. Sign bit test is used instead and + thus can be used to form "a&b>0" type of tests. + + Add CCGC to indicate comparisons against zero that allows + unspecified garbage in the Carry flag. This mode is used + by inc/dec instructions. + + Add CCGOC to indicate comparisons against zero that allows + unspecified garbage in the Carry and Overflow flag. This + mode is used to simulate comparisons of (a-b) and (a+b) + against zero using sub/cmp/add operations. + + Add CCA to indicate that only the Above flag is valid. + Add CCC to indicate that only the Carry flag is valid. + Add CCO to indicate that only the Overflow flag is valid. + Add CCS to indicate that only the Sign flag is valid. + Add CCZ to indicate that only the Zero flag is valid. */ + +CC_MODE (CCGC); +CC_MODE (CCGOC); +CC_MODE (CCNO); +CC_MODE (CCA); +CC_MODE (CCC); +CC_MODE (CCO); +CC_MODE (CCS); +CC_MODE (CCZ); +CC_MODE (CCFP); +CC_MODE (CCFPU); + +/* Vector modes. Note that VEC_CONCAT patterns require vector + sizes twice as big as implemented in hardware. */ +VECTOR_MODES (INT, 4); /* V4QI V2HI */ +VECTOR_MODES (INT, 8); /* V8QI V4HI V2SI */ +VECTOR_MODES (INT, 16); /* V16QI V8HI V4SI V2DI */ +VECTOR_MODES (INT, 32); /* V32QI V16HI V8SI V4DI */ +VECTOR_MODES (INT, 64); /* V64QI V32HI V16SI V8DI */ +VECTOR_MODES (FLOAT, 8); /* V4HF V2SF */ +VECTOR_MODES (FLOAT, 16); /* V8HF V4SF V2DF */ +VECTOR_MODES (FLOAT, 32); /* V16HF V8SF V4DF */ +VECTOR_MODES (FLOAT, 64); /* V32HF V16SF V8DF */ +VECTOR_MODE (INT, TI, 1); /* V1TI */ +VECTOR_MODE (INT, DI, 1); /* V1DI */ +VECTOR_MODE (INT, SI, 1); /* V1SI */ +VECTOR_MODE (INT, QI, 2); /* V2QI */ + +INT_MODE (OI, 32); + +/* The symbol Pmode stands for one of the above machine modes (usually SImode). + The tm.h file specifies which one. It is not a distinct mode. */ diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h new file mode 100644 index 000000000..d4513fa8e --- /dev/null +++ b/gcc/config/i386/i386-protos.h @@ -0,0 +1,292 @@ +/* Definitions of target machine for GCC for IA-32. 
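The ADJUST_BYTESIZE / ADJUST_ALIGNMENT pair above is what user code observes as the size and alignment of long double. A quick way to see the effect; the numbers in the comment below assume the defaults described above (plain XFmode, no -m128bit-long-double in 32-bit mode):

    #include <stdio.h>

    int
    main (void)
    {
      /* Typically prints 12 and 4 for -m32, 16 and 16 for -m64.  */
      printf ("sizeof (long double)   = %zu\n", sizeof (long double));
      printf ("_Alignof (long double) = %zu\n", (size_t) _Alignof (long double));
      return 0;
    }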
+ Copyright (C) 1988, 1992, 1994, 1995, 1996, 1996, 1997, 1998, 1999, + 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 + Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. */ + +/* Functions in i386.c */ +extern bool ix86_target_stack_probe (void); +extern bool ix86_can_use_return_insn_p (void); +extern void ix86_setup_frame_addresses (void); + +extern HOST_WIDE_INT ix86_initial_elimination_offset (int, int); +extern void ix86_expand_prologue (void); +extern void ix86_expand_epilogue (int); +extern void ix86_expand_split_stack_prologue (void); + +extern void ix86_output_addr_vec_elt (FILE *, int); +extern void ix86_output_addr_diff_elt (FILE *, int, int); + +extern enum calling_abi ix86_cfun_abi (void); +extern enum calling_abi ix86_function_type_abi (const_tree); + +#ifdef RTX_CODE +extern int standard_80387_constant_p (rtx); +extern const char *standard_80387_constant_opcode (rtx); +extern rtx standard_80387_constant_rtx (int); +extern int standard_sse_constant_p (rtx); +extern const char *standard_sse_constant_opcode (rtx, rtx); +extern bool symbolic_reference_mentioned_p (rtx); +extern bool extended_reg_mentioned_p (rtx); +extern bool x86_extended_QIreg_mentioned_p (rtx); +extern bool x86_extended_reg_mentioned_p (rtx); +extern bool x86_maybe_negate_const_int (rtx *, enum machine_mode); +extern enum machine_mode ix86_cc_mode (enum rtx_code, rtx, rtx); + +extern int avx_vpermilp_parallel (rtx par, enum machine_mode mode); +extern int avx_vperm2f128_parallel (rtx par, enum machine_mode mode); + +extern bool ix86_expand_movmem (rtx, rtx, rtx, rtx, rtx, rtx); +extern bool ix86_expand_setmem (rtx, rtx, rtx, rtx, rtx, rtx); +extern bool ix86_expand_strlen (rtx, rtx, rtx, rtx); + +extern bool legitimate_constant_p (rtx); +extern bool constant_address_p (rtx); +extern bool legitimate_pic_operand_p (rtx); +extern bool legitimate_pic_address_disp_p (rtx); +extern bool ix86_legitimize_reload_address (rtx, enum machine_mode, + int, int, int); +extern void print_reg (rtx, int, FILE*); +extern void ix86_print_operand (FILE *, rtx, int); + +extern void split_double_mode (enum machine_mode, rtx[], int, rtx[], rtx[]); + +extern const char *output_set_got (rtx, rtx); +extern const char *output_387_binary_op (rtx, rtx*); +extern const char *output_387_reg_move (rtx, rtx*); +extern const char *output_fix_trunc (rtx, rtx*, int); +extern const char *output_fp_compare (rtx, rtx*, int, int); +extern const char *output_adjust_stack_and_probe (rtx); +extern const char *output_probe_stack_range (rtx, rtx); + +extern void ix86_expand_clear (rtx); +extern void ix86_expand_move (enum machine_mode, rtx[]); +extern void ix86_expand_vector_move (enum machine_mode, rtx[]); +extern void ix86_expand_vector_move_misalign (enum machine_mode, rtx[]); +extern void ix86_expand_push (enum machine_mode, rtx); +extern rtx ix86_fixup_binary_operands (enum rtx_code, + enum machine_mode, rtx[]); +extern void 
ix86_fixup_binary_operands_no_copy (enum rtx_code, + enum machine_mode, rtx[]); +extern void ix86_expand_binary_operator (enum rtx_code, + enum machine_mode, rtx[]); +extern bool ix86_binary_operator_ok (enum rtx_code, enum machine_mode, rtx[]); +extern bool ix86_lea_for_add_ok (rtx, rtx[]); +extern bool ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high); +extern bool ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn); +extern bool ix86_agi_dependent (rtx set_insn, rtx use_insn); +extern void ix86_expand_unary_operator (enum rtx_code, enum machine_mode, + rtx[]); +extern rtx ix86_build_const_vector (enum machine_mode, bool, rtx); +extern rtx ix86_build_signbit_mask (enum machine_mode, bool, bool); +extern void ix86_split_convert_uns_si_sse (rtx[]); +extern void ix86_expand_convert_uns_didf_sse (rtx, rtx); +extern void ix86_expand_convert_uns_sixf_sse (rtx, rtx); +extern void ix86_expand_convert_uns_sidf_sse (rtx, rtx); +extern void ix86_expand_convert_uns_sisf_sse (rtx, rtx); +extern void ix86_expand_convert_sign_didf_sse (rtx, rtx); +extern enum ix86_fpcmp_strategy ix86_fp_comparison_strategy (enum rtx_code); +extern void ix86_expand_fp_absneg_operator (enum rtx_code, enum machine_mode, + rtx[]); +extern void ix86_expand_copysign (rtx []); +extern void ix86_split_copysign_const (rtx []); +extern void ix86_split_copysign_var (rtx []); +extern bool ix86_unary_operator_ok (enum rtx_code, enum machine_mode, rtx[]); +extern bool ix86_match_ccmode (rtx, enum machine_mode); +extern void ix86_expand_branch (enum rtx_code, rtx, rtx, rtx); +extern void ix86_expand_setcc (rtx, enum rtx_code, rtx, rtx); +extern bool ix86_expand_int_movcc (rtx[]); +extern bool ix86_expand_fp_movcc (rtx[]); +extern bool ix86_expand_fp_vcond (rtx[]); +extern bool ix86_expand_int_vcond (rtx[]); +extern void ix86_expand_sse_unpack (rtx[], bool, bool); +extern void ix86_expand_sse4_unpack (rtx[], bool, bool); +extern bool ix86_expand_int_addcc (rtx[]); +extern rtx ix86_expand_call (rtx, rtx, rtx, rtx, rtx, int); +extern void ix86_split_call_vzeroupper (rtx, rtx); +extern void x86_initialize_trampoline (rtx, rtx, rtx); +extern rtx ix86_zero_extend_to_Pmode (rtx); +extern void ix86_split_long_move (rtx[]); +extern void ix86_split_ashl (rtx *, rtx, enum machine_mode); +extern void ix86_split_ashr (rtx *, rtx, enum machine_mode); +extern void ix86_split_lshr (rtx *, rtx, enum machine_mode); +extern rtx ix86_find_base_term (rtx); +extern bool ix86_check_movabs (rtx, int); +extern void ix86_split_idivmod (enum machine_mode, rtx[], bool); + +extern rtx assign_386_stack_local (enum machine_mode, enum ix86_stack_slot); +extern int ix86_attr_length_immediate_default (rtx, int); +extern int ix86_attr_length_address_default (rtx); +extern int ix86_attr_length_vex_default (rtx, int, int); + +extern enum machine_mode ix86_fp_compare_mode (enum rtx_code); + +extern rtx ix86_libcall_value (enum machine_mode); +extern bool ix86_function_arg_regno_p (int); +extern void ix86_asm_output_function_label (FILE *, const char *, tree); +extern rtx ix86_force_to_memory (enum machine_mode, rtx); +extern void ix86_free_from_memory (enum machine_mode); +extern void ix86_call_abi_override (const_tree); +extern int ix86_reg_parm_stack_space (const_tree); + +extern void ix86_split_fp_branch (enum rtx_code code, rtx, rtx, + rtx, rtx, rtx, rtx); +extern bool ix86_hard_regno_mode_ok (int, enum machine_mode); +extern bool ix86_modes_tieable_p (enum machine_mode, enum machine_mode); +extern bool ix86_secondary_memory_needed (enum 
reg_class, enum reg_class, + enum machine_mode, int); +extern bool ix86_cannot_change_mode_class (enum machine_mode, + enum machine_mode, enum reg_class); +extern int ix86_mode_needed (int, rtx); +extern void emit_i387_cw_initialization (int); +extern void x86_order_regs_for_local_alloc (void); +extern void x86_function_profiler (FILE *, int); +extern void x86_emit_floatuns (rtx [2]); +extern void ix86_emit_fp_unordered_jump (rtx); + +extern void ix86_emit_i387_log1p (rtx, rtx); +extern void ix86_emit_swdivsf (rtx, rtx, rtx, enum machine_mode); +extern void ix86_emit_swsqrtsf (rtx, rtx, enum machine_mode, bool); + +extern enum rtx_code ix86_reverse_condition (enum rtx_code, enum machine_mode); + +extern void ix86_expand_lround (rtx, rtx); +extern void ix86_expand_lfloorceil (rtx, rtx, bool); +extern void ix86_expand_rint (rtx, rtx); +extern void ix86_expand_floorceil (rtx, rtx, bool); +extern void ix86_expand_floorceildf_32 (rtx, rtx, bool); +extern void ix86_expand_round (rtx, rtx); +extern void ix86_expand_rounddf_32 (rtx, rtx); +extern void ix86_expand_trunc (rtx, rtx); +extern void ix86_expand_truncdf_32 (rtx, rtx); + +#ifdef TREE_CODE +extern void init_cumulative_args (CUMULATIVE_ARGS *, tree, rtx, tree, int); +#endif /* TREE_CODE */ + +#endif /* RTX_CODE */ + +#ifdef TREE_CODE +extern int ix86_data_alignment (tree, int); +extern unsigned int ix86_local_alignment (tree, enum machine_mode, + unsigned int); +extern unsigned int ix86_minimum_alignment (tree, enum machine_mode, + unsigned int); +extern int ix86_constant_alignment (tree, int); +extern tree ix86_handle_shared_attribute (tree *, tree, tree, int, bool *); +extern tree ix86_handle_selectany_attribute (tree *, tree, tree, int, bool *); +extern int x86_field_alignment (tree, int); +extern tree ix86_valid_target_attribute_tree (tree); +#endif + +extern rtx ix86_tls_get_addr (void); +extern rtx ix86_tls_module_base (void); + +extern void ix86_expand_vector_init (bool, rtx, rtx); +extern void ix86_expand_vector_set (bool, rtx, rtx, int); +extern void ix86_expand_vector_extract (bool, rtx, rtx, int); +extern void ix86_expand_reduc_v4sf (rtx (*)(rtx, rtx, rtx), rtx, rtx); + +extern void ix86_expand_vec_extract_even_odd (rtx, rtx, rtx, unsigned); + +/* In i386-c.c */ +extern void ix86_target_macros (void); +extern void ix86_register_pragmas (void); + +/* In winnt.c */ +extern void i386_pe_unique_section (tree, int); +extern void i386_pe_declare_function_type (FILE *, const char *, int); +extern void i386_pe_record_external_function (tree, const char *); +extern void i386_pe_maybe_record_exported_symbol (tree, const char *, int); +extern void i386_pe_encode_section_info (tree, rtx, int); +extern bool i386_pe_binds_local_p (const_tree); +extern const char *i386_pe_strip_name_encoding_full (const char *); +extern bool i386_pe_valid_dllimport_attribute_p (const_tree); +extern unsigned int i386_pe_section_type_flags (tree, const char *, int); +extern void i386_pe_asm_named_section (const char *, unsigned int, tree); +extern void i386_pe_asm_output_aligned_decl_common (FILE *, tree, + const char *, + HOST_WIDE_INT, + HOST_WIDE_INT); +extern void i386_pe_file_end (void); +extern void i386_pe_start_function (FILE *, const char *, tree); +extern void i386_pe_end_function (FILE *, const char *, tree); +extern void i386_pe_assemble_visibility (tree, int); +extern tree i386_pe_mangle_decl_assembler_name (tree, tree); +extern tree i386_pe_mangle_assembler_name (const char *); + +extern void i386_pe_seh_init (FILE *); +extern void 
i386_pe_seh_end_prologue (FILE *); +extern void i386_pe_seh_unwind_emit (FILE *, rtx); + +/* In winnt-cxx.c and winnt-stubs.c */ +extern void i386_pe_adjust_class_at_definition (tree); +extern bool i386_pe_type_dllimport_p (tree); +extern bool i386_pe_type_dllexport_p (tree); + +extern rtx maybe_get_pool_constant (rtx); + +extern char internal_label_prefix[16]; +extern int internal_label_prefix_len; + +enum ix86_address_seg { SEG_DEFAULT, SEG_FS, SEG_GS }; +struct ix86_address +{ + rtx base, index, disp; + HOST_WIDE_INT scale; + enum ix86_address_seg seg; +}; + +extern int ix86_decompose_address (rtx, struct ix86_address *); +extern int memory_address_length (rtx addr); +extern void x86_output_aligned_bss (FILE *, tree, const char *, + unsigned HOST_WIDE_INT, int); +extern void x86_elf_aligned_common (FILE *, const char *, + unsigned HOST_WIDE_INT, int); + +#ifdef RTX_CODE +extern void ix86_fp_comparison_codes (enum rtx_code code, enum rtx_code *, + enum rtx_code *, enum rtx_code *); +extern enum rtx_code ix86_fp_compare_code_to_integer (enum rtx_code); +extern rtx construct_plt_address (rtx); +#endif +extern int asm_preferred_eh_data_format (int, int); + +#ifdef HAVE_ATTR_cpu +extern enum attr_cpu ix86_schedule; +#endif + +extern const char * ix86_output_call_insn (rtx insn, rtx call_op, int addr_op); + +#ifdef RTX_CODE +/* Target data for multipass lookahead scheduling. + Currently used for Core 2/i7 tuning. */ +struct ix86_first_cycle_multipass_data_ +{ + /* The length (in bytes) of ifetch block in this solution. */ + int ifetch_block_len; + /* Number of instructions in ifetch block in this solution. */ + int ifetch_block_n_insns; + /* Bitmap to remember changes to ready_try for backtracking. */ + sbitmap ready_try_change; + /* Size of the bitmap. */ + int ready_try_change_size; +}; +# define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DATA_T \ + struct ix86_first_cycle_multipass_data_ +#endif /* RTX_CODE */ diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c new file mode 100644 index 000000000..c43d3ed57 --- /dev/null +++ b/gcc/config/i386/i386.c @@ -0,0 +1,35376 @@ +/* Subroutines used for code generation on IA-32. + Copyright (C) 1988, 1992, 1994, 1995, 1996, 1997, 1998, 1999, 2000, + 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 + Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. 
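struct ix86_address above is how the backend passes around a decomposed x86 memory operand: base, index, displacement, scale factor and segment, filled in by ix86_decompose_address. As a purely illustrative stand-in (the real fields are rtx values, not strings), the AT&T operand 12(%ebx,%esi,4) corresponds to base %ebx, index %esi, scale 4, disp 12 and the default segment:

    #include <stdio.h>

    /* Toy, string-based counterpart of struct ix86_address.  */
    struct toy_address
    {
      const char *base, *index;
      long scale, disp;
    };

    int
    main (void)
    {
      struct toy_address a = { "%ebx", "%esi", 4, 12 };
      /* Reassemble the AT&T-syntax operand from its parts.  */
      printf ("%ld(%s,%s,%ld)\n", a.disp, a.base, a.index, a.scale);
      return 0;
    }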
*/ + +#include "config.h" +#include "system.h" +#include "coretypes.h" +#include "tm.h" +#include "rtl.h" +#include "tree.h" +#include "tm_p.h" +#include "regs.h" +#include "hard-reg-set.h" +#include "insn-config.h" +#include "conditions.h" +#include "output.h" +#include "insn-codes.h" +#include "insn-attr.h" +#include "flags.h" +#include "except.h" +#include "function.h" +#include "recog.h" +#include "expr.h" +#include "optabs.h" +#include "diagnostic-core.h" +#include "toplev.h" +#include "basic-block.h" +#include "ggc.h" +#include "target.h" +#include "target-def.h" +#include "langhooks.h" +#include "reload.h" +#include "cgraph.h" +#include "gimple.h" +#include "dwarf2.h" +#include "df.h" +#include "tm-constrs.h" +#include "params.h" +#include "cselib.h" +#include "debug.h" +#include "dwarf2out.h" +#include "sched-int.h" +#include "sbitmap.h" +#include "fibheap.h" + +enum upper_128bits_state +{ + unknown = 0, + unused, + used +}; + +typedef struct block_info_def +{ + /* State of the upper 128bits of AVX registers at exit. */ + enum upper_128bits_state state; + /* TRUE if state of the upper 128bits of AVX registers is unchanged + in this block. */ + bool unchanged; + /* TRUE if block has been processed. */ + bool processed; + /* TRUE if block has been scanned. */ + bool scanned; + /* Previous state of the upper 128bits of AVX registers at entry. */ + enum upper_128bits_state prev; +} *block_info; + +#define BLOCK_INFO(B) ((block_info) (B)->aux) + +enum call_avx256_state +{ + /* Callee returns 256bit AVX register. */ + callee_return_avx256 = -1, + /* Callee returns and passes 256bit AVX register. */ + callee_return_pass_avx256, + /* Callee passes 256bit AVX register. */ + callee_pass_avx256, + /* Callee doesn't return nor passe 256bit AVX register, or no + 256bit AVX register in function return. */ + call_no_avx256, + /* vzeroupper intrinsic. */ + vzeroupper_intrinsic +}; + +/* Check if a 256bit AVX register is referenced in stores. */ + +static void +check_avx256_stores (rtx dest, const_rtx set, void *data) +{ + if ((REG_P (dest) + && VALID_AVX256_REG_MODE (GET_MODE (dest))) + || (GET_CODE (set) == SET + && REG_P (SET_SRC (set)) + && VALID_AVX256_REG_MODE (GET_MODE (SET_SRC (set))))) + { + enum upper_128bits_state *state + = (enum upper_128bits_state *) data; + *state = used; + } +} + +/* Helper function for move_or_delete_vzeroupper_1. Look for vzeroupper + in basic block BB. Delete it if upper 128bit AVX registers are + unused. If it isn't deleted, move it to just before a jump insn. + + STATE is state of the upper 128bits of AVX registers at entry. */ + +static void +move_or_delete_vzeroupper_2 (basic_block bb, + enum upper_128bits_state state) +{ + rtx insn, bb_end; + rtx vzeroupper_insn = NULL_RTX; + rtx pat; + int avx256; + bool unchanged; + + if (BLOCK_INFO (bb)->unchanged) + { + if (dump_file) + fprintf (dump_file, " [bb %i] unchanged: upper 128bits: %d\n", + bb->index, state); + + BLOCK_INFO (bb)->state = state; + return; + } + + if (BLOCK_INFO (bb)->scanned && BLOCK_INFO (bb)->prev == state) + { + if (dump_file) + fprintf (dump_file, " [bb %i] scanned: upper 128bits: %d\n", + bb->index, BLOCK_INFO (bb)->state); + return; + } + + BLOCK_INFO (bb)->prev = state; + + if (dump_file) + fprintf (dump_file, " [bb %i] entry: upper 128bits: %d\n", + bb->index, state); + + unchanged = true; + + /* BB_END changes when it is deleted. 
*/ + bb_end = BB_END (bb); + insn = BB_HEAD (bb); + while (insn != bb_end) + { + insn = NEXT_INSN (insn); + + if (!NONDEBUG_INSN_P (insn)) + continue; + + /* Move vzeroupper before jump/call. */ + if (JUMP_P (insn) || CALL_P (insn)) + { + if (!vzeroupper_insn) + continue; + + if (PREV_INSN (insn) != vzeroupper_insn) + { + if (dump_file) + { + fprintf (dump_file, "Move vzeroupper after:\n"); + print_rtl_single (dump_file, PREV_INSN (insn)); + fprintf (dump_file, "before:\n"); + print_rtl_single (dump_file, insn); + } + reorder_insns_nobb (vzeroupper_insn, vzeroupper_insn, + PREV_INSN (insn)); + } + vzeroupper_insn = NULL_RTX; + continue; + } + + pat = PATTERN (insn); + + /* Check insn for vzeroupper intrinsic. */ + if (GET_CODE (pat) == UNSPEC_VOLATILE + && XINT (pat, 1) == UNSPECV_VZEROUPPER) + { + if (dump_file) + { + /* Found vzeroupper intrinsic. */ + fprintf (dump_file, "Found vzeroupper:\n"); + print_rtl_single (dump_file, insn); + } + } + else + { + /* Check insn for vzeroall intrinsic. */ + if (GET_CODE (pat) == PARALLEL + && GET_CODE (XVECEXP (pat, 0, 0)) == UNSPEC_VOLATILE + && XINT (XVECEXP (pat, 0, 0), 1) == UNSPECV_VZEROALL) + { + state = unused; + unchanged = false; + + /* Delete pending vzeroupper insertion. */ + if (vzeroupper_insn) + { + delete_insn (vzeroupper_insn); + vzeroupper_insn = NULL_RTX; + } + } + else if (state != used) + { + note_stores (pat, check_avx256_stores, &state); + if (state == used) + unchanged = false; + } + continue; + } + + /* Process vzeroupper intrinsic. */ + avx256 = INTVAL (XVECEXP (pat, 0, 0)); + + if (state == unused) + { + /* Since the upper 128bits are cleared, callee must not pass + 256bit AVX register. We only need to check if callee + returns 256bit AVX register. */ + if (avx256 == callee_return_avx256) + { + state = used; + unchanged = false; + } + + /* Remove unnecessary vzeroupper since upper 128bits are + cleared. */ + if (dump_file) + { + fprintf (dump_file, "Delete redundant vzeroupper:\n"); + print_rtl_single (dump_file, insn); + } + delete_insn (insn); + } + else + { + /* Set state to UNUSED if callee doesn't return 256bit AVX + register. */ + if (avx256 != callee_return_pass_avx256) + state = unused; + + if (avx256 == callee_return_pass_avx256 + || avx256 == callee_pass_avx256) + { + /* Must remove vzeroupper since callee passes in 256bit + AVX register. */ + if (dump_file) + { + fprintf (dump_file, "Delete callee pass vzeroupper:\n"); + print_rtl_single (dump_file, insn); + } + delete_insn (insn); + } + else + { + vzeroupper_insn = insn; + unchanged = false; + } + } + } + + BLOCK_INFO (bb)->state = state; + BLOCK_INFO (bb)->unchanged = unchanged; + BLOCK_INFO (bb)->scanned = true; + + if (dump_file) + fprintf (dump_file, " [bb %i] exit: %s: upper 128bits: %d\n", + bb->index, unchanged ? "unchanged" : "changed", + state); +} + +/* Helper function for move_or_delete_vzeroupper. Process vzeroupper + in BLOCK and check its predecessor blocks. Treat UNKNOWN state + as USED if UNKNOWN_IS_UNUSED is true. Return TRUE if the exit + state is changed. */ + +static bool +move_or_delete_vzeroupper_1 (basic_block block, bool unknown_is_unused) +{ + edge e; + edge_iterator ei; + enum upper_128bits_state state, old_state, new_state; + bool seen_unknown; + + if (dump_file) + fprintf (dump_file, " Process [bb %i]: status: %d\n", + block->index, BLOCK_INFO (block)->processed); + + if (BLOCK_INFO (block)->processed) + return false; + + state = unused; + + /* Check all predecessor edges of this block. 
*/ + seen_unknown = false; + FOR_EACH_EDGE (e, ei, block->preds) + { + if (e->src == block) + continue; + switch (BLOCK_INFO (e->src)->state) + { + case unknown: + if (!unknown_is_unused) + seen_unknown = true; + case unused: + break; + case used: + state = used; + goto done; + } + } + + if (seen_unknown) + state = unknown; + +done: + old_state = BLOCK_INFO (block)->state; + move_or_delete_vzeroupper_2 (block, state); + new_state = BLOCK_INFO (block)->state; + + if (state != unknown || new_state == used) + BLOCK_INFO (block)->processed = true; + + /* Need to rescan if the upper 128bits of AVX registers are changed + to USED at exit. */ + if (new_state != old_state) + { + if (new_state == used) + cfun->machine->rescan_vzeroupper_p = 1; + return true; + } + else + return false; +} + +/* Go through the instruction stream looking for vzeroupper. Delete + it if upper 128bit AVX registers are unused. If it isn't deleted, + move it to just before a jump insn. */ + +static void +move_or_delete_vzeroupper (void) +{ + edge e; + edge_iterator ei; + basic_block bb; + fibheap_t worklist, pending, fibheap_swap; + sbitmap visited, in_worklist, in_pending, sbitmap_swap; + int *bb_order; + int *rc_order; + int i; + + /* Set up block info for each basic block. */ + alloc_aux_for_blocks (sizeof (struct block_info_def)); + + /* Process outgoing edges of entry point. */ + if (dump_file) + fprintf (dump_file, "Process outgoing edges of entry point\n"); + + FOR_EACH_EDGE (e, ei, ENTRY_BLOCK_PTR->succs) + { + move_or_delete_vzeroupper_2 (e->dest, + cfun->machine->caller_pass_avx256_p + ? used : unused); + BLOCK_INFO (e->dest)->processed = true; + } + + /* Compute reverse completion order of depth first search of the CFG + so that the data-flow runs faster. */ + rc_order = XNEWVEC (int, n_basic_blocks - NUM_FIXED_BLOCKS); + bb_order = XNEWVEC (int, last_basic_block); + pre_and_rev_post_order_compute (NULL, rc_order, false); + for (i = 0; i < n_basic_blocks - NUM_FIXED_BLOCKS; i++) + bb_order[rc_order[i]] = i; + free (rc_order); + + worklist = fibheap_new (); + pending = fibheap_new (); + visited = sbitmap_alloc (last_basic_block); + in_worklist = sbitmap_alloc (last_basic_block); + in_pending = sbitmap_alloc (last_basic_block); + sbitmap_zero (in_worklist); + + /* Don't check outgoing edges of entry point. */ + sbitmap_ones (in_pending); + FOR_EACH_BB (bb) + if (BLOCK_INFO (bb)->processed) + RESET_BIT (in_pending, bb->index); + else + { + move_or_delete_vzeroupper_1 (bb, false); + fibheap_insert (pending, bb_order[bb->index], bb); + } + + if (dump_file) + fprintf (dump_file, "Check remaining basic blocks\n"); + + while (!fibheap_empty (pending)) + { + fibheap_swap = pending; + pending = worklist; + worklist = fibheap_swap; + sbitmap_swap = in_pending; + in_pending = in_worklist; + in_worklist = sbitmap_swap; + + sbitmap_zero (visited); + + cfun->machine->rescan_vzeroupper_p = 0; + + while (!fibheap_empty (worklist)) + { + bb = (basic_block) fibheap_extract_min (worklist); + RESET_BIT (in_worklist, bb->index); + gcc_assert (!TEST_BIT (visited, bb->index)); + if (!TEST_BIT (visited, bb->index)) + { + edge_iterator ei; + + SET_BIT (visited, bb->index); + + if (move_or_delete_vzeroupper_1 (bb, false)) + FOR_EACH_EDGE (e, ei, bb->succs) + { + if (e->dest == EXIT_BLOCK_PTR + || BLOCK_INFO (e->dest)->processed) + continue; + + if (TEST_BIT (visited, e->dest->index)) + { + if (!TEST_BIT (in_pending, e->dest->index)) + { + /* Send E->DEST to next round. 
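The predecessor scan in move_or_delete_vzeroupper_1 above is an ordinary dataflow meet over a three-element lattice: a "used" predecessor forces the entry state to used; otherwise an "unknown" predecessor keeps it unknown (unless unknown is being treated as unused on the final pass); otherwise the entry state is unused. A self-contained restatement of just that rule, with the enum re-declared locally so the sketch compiles on its own:

    #include <stdbool.h>
    #include <stdio.h>

    enum upper_state { UNKNOWN = 0, UNUSED, USED };

    static enum upper_state
    meet (const enum upper_state *preds, int n, bool unknown_is_unused)
    {
      bool seen_unknown = false;
      for (int i = 0; i < n; i++)
        {
          if (preds[i] == USED)
            return USED;                 /* used dominates everything */
          if (preds[i] == UNKNOWN && !unknown_is_unused)
            seen_unknown = true;
        }
      return seen_unknown ? UNKNOWN : UNUSED;
    }

    int
    main (void)
    {
      enum upper_state a[] = { UNUSED, UNKNOWN };
      enum upper_state b[] = { UNUSED, USED };
      printf ("%d %d\n", meet (a, 2, false), meet (b, 2, false)); /* 0 2 */
      return 0;
    }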
*/ + SET_BIT (in_pending, e->dest->index); + fibheap_insert (pending, + bb_order[e->dest->index], + e->dest); + } + } + else if (!TEST_BIT (in_worklist, e->dest->index)) + { + /* Add E->DEST to current round. */ + SET_BIT (in_worklist, e->dest->index); + fibheap_insert (worklist, bb_order[e->dest->index], + e->dest); + } + } + } + } + + if (!cfun->machine->rescan_vzeroupper_p) + break; + } + + free (bb_order); + fibheap_delete (worklist); + fibheap_delete (pending); + sbitmap_free (visited); + sbitmap_free (in_worklist); + sbitmap_free (in_pending); + + if (dump_file) + fprintf (dump_file, "Process remaining basic blocks\n"); + + FOR_EACH_BB (bb) + move_or_delete_vzeroupper_1 (bb, true); + + free_aux_for_blocks (); +} + +static rtx legitimize_dllimport_symbol (rtx, bool); + +#ifndef CHECK_STACK_LIMIT +#define CHECK_STACK_LIMIT (-1) +#endif + +/* Return index of given mode in mult and division cost tables. */ +#define MODE_INDEX(mode) \ + ((mode) == QImode ? 0 \ + : (mode) == HImode ? 1 \ + : (mode) == SImode ? 2 \ + : (mode) == DImode ? 3 \ + : 4) + +/* Processor costs (relative to an add) */ +/* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes. */ +#define COSTS_N_BYTES(N) ((N) * 2) + +#define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall}}} + +const +struct processor_costs ix86_size_cost = {/* costs for tuning for size */ + COSTS_N_BYTES (2), /* cost of an add instruction */ + COSTS_N_BYTES (3), /* cost of a lea instruction */ + COSTS_N_BYTES (2), /* variable shift costs */ + COSTS_N_BYTES (3), /* constant shift costs */ + {COSTS_N_BYTES (3), /* cost of starting multiply for QI */ + COSTS_N_BYTES (3), /* HI */ + COSTS_N_BYTES (3), /* SI */ + COSTS_N_BYTES (3), /* DI */ + COSTS_N_BYTES (5)}, /* other */ + 0, /* cost of multiply per each bit set */ + {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */ + COSTS_N_BYTES (3), /* HI */ + COSTS_N_BYTES (3), /* SI */ + COSTS_N_BYTES (3), /* DI */ + COSTS_N_BYTES (5)}, /* other */ + COSTS_N_BYTES (3), /* cost of movsx */ + COSTS_N_BYTES (3), /* cost of movzx */ + 0, /* "large" insn */ + 2, /* MOVE_RATIO */ + 2, /* cost for loading QImode using movzbl */ + {2, 2, 2}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). */ + {2, 2, 2}, /* cost of storing integer registers */ + 2, /* cost of reg,reg fld/fst */ + {2, 2, 2}, /* cost of loading fp registers + in SFmode, DFmode and XFmode */ + {2, 2, 2}, /* cost of storing fp registers + in SFmode, DFmode and XFmode */ + 3, /* cost of moving MMX register */ + {3, 3}, /* cost of loading MMX registers + in SImode and DImode */ + {3, 3}, /* cost of storing MMX registers + in SImode and DImode */ + 3, /* cost of moving SSE register */ + {3, 3, 3}, /* cost of loading SSE registers + in SImode, DImode and TImode */ + {3, 3, 3}, /* cost of storing SSE registers + in SImode, DImode and TImode */ + 3, /* MMX or SSE register to integer */ + 0, /* size of l1 cache */ + 0, /* size of l2 cache */ + 0, /* size of prefetch block */ + 0, /* number of parallel prefetches */ + 2, /* Branch cost */ + COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */ + COSTS_N_BYTES (2), /* cost of FMUL instruction. */ + COSTS_N_BYTES (2), /* cost of FDIV instruction. */ + COSTS_N_BYTES (2), /* cost of FABS instruction. */ + COSTS_N_BYTES (2), /* cost of FCHS instruction. */ + COSTS_N_BYTES (2), /* cost of FSQRT instruction. 
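Two conventions in the cost machinery above are easy to miss: COSTS_N_BYTES measures cost in bytes, scaled by 2 so that a two-byte add comes out equal to COSTS_N_INSNS (1), and MODE_INDEX selects the per-mode row (QI, HI, SI, DI, other) of arrays such as the multiply and divide costs. A toy lookup with the same shape, using a local copy of the size-tuning multiply row rather than the real struct processor_costs:

    #include <stdio.h>

    #define COSTS_N_BYTES(N) ((N) * 2)   /* same definition as above */

    /* Toy copy of the size-tuning multiply costs: QI, HI, SI, DI, other.  */
    static const int size_mult_cost[5] =
      { COSTS_N_BYTES (3), COSTS_N_BYTES (3), COSTS_N_BYTES (3),
        COSTS_N_BYTES (3), COSTS_N_BYTES (5) };

    /* Plays the role of MODE_INDEX for this example.  */
    enum toy_mode { QI, HI, SI, DI, OTHER };

    int
    main (void)
    {
      printf ("SImode multiply cost under -Os: %d\n", size_mult_cost[SI]); /* 6 */
      return 0;
    }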
*/ + {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}, + {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}}, + {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}, + {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}}, + 1, /* scalar_stmt_cost. */ + 1, /* scalar load_cost. */ + 1, /* scalar_store_cost. */ + 1, /* vec_stmt_cost. */ + 1, /* vec_to_scalar_cost. */ + 1, /* scalar_to_vec_cost. */ + 1, /* vec_align_load_cost. */ + 1, /* vec_unalign_load_cost. */ + 1, /* vec_store_cost. */ + 1, /* cond_taken_branch_cost. */ + 1, /* cond_not_taken_branch_cost. */ +}; + +/* Processor costs (relative to an add) */ +static const +struct processor_costs i386_cost = { /* 386 specific costs */ + COSTS_N_INSNS (1), /* cost of an add instruction */ + COSTS_N_INSNS (1), /* cost of a lea instruction */ + COSTS_N_INSNS (3), /* variable shift costs */ + COSTS_N_INSNS (2), /* constant shift costs */ + {COSTS_N_INSNS (6), /* cost of starting multiply for QI */ + COSTS_N_INSNS (6), /* HI */ + COSTS_N_INSNS (6), /* SI */ + COSTS_N_INSNS (6), /* DI */ + COSTS_N_INSNS (6)}, /* other */ + COSTS_N_INSNS (1), /* cost of multiply per each bit set */ + {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */ + COSTS_N_INSNS (23), /* HI */ + COSTS_N_INSNS (23), /* SI */ + COSTS_N_INSNS (23), /* DI */ + COSTS_N_INSNS (23)}, /* other */ + COSTS_N_INSNS (3), /* cost of movsx */ + COSTS_N_INSNS (2), /* cost of movzx */ + 15, /* "large" insn */ + 3, /* MOVE_RATIO */ + 4, /* cost for loading QImode using movzbl */ + {2, 4, 2}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). */ + {2, 4, 2}, /* cost of storing integer registers */ + 2, /* cost of reg,reg fld/fst */ + {8, 8, 8}, /* cost of loading fp registers + in SFmode, DFmode and XFmode */ + {8, 8, 8}, /* cost of storing fp registers + in SFmode, DFmode and XFmode */ + 2, /* cost of moving MMX register */ + {4, 8}, /* cost of loading MMX registers + in SImode and DImode */ + {4, 8}, /* cost of storing MMX registers + in SImode and DImode */ + 2, /* cost of moving SSE register */ + {4, 8, 16}, /* cost of loading SSE registers + in SImode, DImode and TImode */ + {4, 8, 16}, /* cost of storing SSE registers + in SImode, DImode and TImode */ + 3, /* MMX or SSE register to integer */ + 0, /* size of l1 cache */ + 0, /* size of l2 cache */ + 0, /* size of prefetch block */ + 0, /* number of parallel prefetches */ + 1, /* Branch cost */ + COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */ + COSTS_N_INSNS (27), /* cost of FMUL instruction. */ + COSTS_N_INSNS (88), /* cost of FDIV instruction. */ + COSTS_N_INSNS (22), /* cost of FABS instruction. */ + COSTS_N_INSNS (24), /* cost of FCHS instruction. */ + COSTS_N_INSNS (122), /* cost of FSQRT instruction. */ + {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}, + DUMMY_STRINGOP_ALGS}, + {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}, + DUMMY_STRINGOP_ALGS}, + 1, /* scalar_stmt_cost. */ + 1, /* scalar load_cost. */ + 1, /* scalar_store_cost. */ + 1, /* vec_stmt_cost. */ + 1, /* vec_to_scalar_cost. */ + 1, /* scalar_to_vec_cost. */ + 1, /* vec_align_load_cost. */ + 2, /* vec_unalign_load_cost. */ + 1, /* vec_store_cost. */ + 3, /* cond_taken_branch_cost. */ + 1, /* cond_not_taken_branch_cost. 
*/ +}; + +static const +struct processor_costs i486_cost = { /* 486 specific costs */ + COSTS_N_INSNS (1), /* cost of an add instruction */ + COSTS_N_INSNS (1), /* cost of a lea instruction */ + COSTS_N_INSNS (3), /* variable shift costs */ + COSTS_N_INSNS (2), /* constant shift costs */ + {COSTS_N_INSNS (12), /* cost of starting multiply for QI */ + COSTS_N_INSNS (12), /* HI */ + COSTS_N_INSNS (12), /* SI */ + COSTS_N_INSNS (12), /* DI */ + COSTS_N_INSNS (12)}, /* other */ + 1, /* cost of multiply per each bit set */ + {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */ + COSTS_N_INSNS (40), /* HI */ + COSTS_N_INSNS (40), /* SI */ + COSTS_N_INSNS (40), /* DI */ + COSTS_N_INSNS (40)}, /* other */ + COSTS_N_INSNS (3), /* cost of movsx */ + COSTS_N_INSNS (2), /* cost of movzx */ + 15, /* "large" insn */ + 3, /* MOVE_RATIO */ + 4, /* cost for loading QImode using movzbl */ + {2, 4, 2}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). */ + {2, 4, 2}, /* cost of storing integer registers */ + 2, /* cost of reg,reg fld/fst */ + {8, 8, 8}, /* cost of loading fp registers + in SFmode, DFmode and XFmode */ + {8, 8, 8}, /* cost of storing fp registers + in SFmode, DFmode and XFmode */ + 2, /* cost of moving MMX register */ + {4, 8}, /* cost of loading MMX registers + in SImode and DImode */ + {4, 8}, /* cost of storing MMX registers + in SImode and DImode */ + 2, /* cost of moving SSE register */ + {4, 8, 16}, /* cost of loading SSE registers + in SImode, DImode and TImode */ + {4, 8, 16}, /* cost of storing SSE registers + in SImode, DImode and TImode */ + 3, /* MMX or SSE register to integer */ + 4, /* size of l1 cache. 486 has 8kB cache + shared for code and data, so 4kB is + not really precise. */ + 4, /* size of l2 cache */ + 0, /* size of prefetch block */ + 0, /* number of parallel prefetches */ + 1, /* Branch cost */ + COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */ + COSTS_N_INSNS (16), /* cost of FMUL instruction. */ + COSTS_N_INSNS (73), /* cost of FDIV instruction. */ + COSTS_N_INSNS (3), /* cost of FABS instruction. */ + COSTS_N_INSNS (3), /* cost of FCHS instruction. */ + COSTS_N_INSNS (83), /* cost of FSQRT instruction. */ + {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}}, + DUMMY_STRINGOP_ALGS}, + {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}}, + DUMMY_STRINGOP_ALGS}, + 1, /* scalar_stmt_cost. */ + 1, /* scalar load_cost. */ + 1, /* scalar_store_cost. */ + 1, /* vec_stmt_cost. */ + 1, /* vec_to_scalar_cost. */ + 1, /* scalar_to_vec_cost. */ + 1, /* vec_align_load_cost. */ + 2, /* vec_unalign_load_cost. */ + 1, /* vec_store_cost. */ + 3, /* cond_taken_branch_cost. */ + 1, /* cond_not_taken_branch_cost. 
*/ +}; + +static const +struct processor_costs pentium_cost = { + COSTS_N_INSNS (1), /* cost of an add instruction */ + COSTS_N_INSNS (1), /* cost of a lea instruction */ + COSTS_N_INSNS (4), /* variable shift costs */ + COSTS_N_INSNS (1), /* constant shift costs */ + {COSTS_N_INSNS (11), /* cost of starting multiply for QI */ + COSTS_N_INSNS (11), /* HI */ + COSTS_N_INSNS (11), /* SI */ + COSTS_N_INSNS (11), /* DI */ + COSTS_N_INSNS (11)}, /* other */ + 0, /* cost of multiply per each bit set */ + {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */ + COSTS_N_INSNS (25), /* HI */ + COSTS_N_INSNS (25), /* SI */ + COSTS_N_INSNS (25), /* DI */ + COSTS_N_INSNS (25)}, /* other */ + COSTS_N_INSNS (3), /* cost of movsx */ + COSTS_N_INSNS (2), /* cost of movzx */ + 8, /* "large" insn */ + 6, /* MOVE_RATIO */ + 6, /* cost for loading QImode using movzbl */ + {2, 4, 2}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). */ + {2, 4, 2}, /* cost of storing integer registers */ + 2, /* cost of reg,reg fld/fst */ + {2, 2, 6}, /* cost of loading fp registers + in SFmode, DFmode and XFmode */ + {4, 4, 6}, /* cost of storing fp registers + in SFmode, DFmode and XFmode */ + 8, /* cost of moving MMX register */ + {8, 8}, /* cost of loading MMX registers + in SImode and DImode */ + {8, 8}, /* cost of storing MMX registers + in SImode and DImode */ + 2, /* cost of moving SSE register */ + {4, 8, 16}, /* cost of loading SSE registers + in SImode, DImode and TImode */ + {4, 8, 16}, /* cost of storing SSE registers + in SImode, DImode and TImode */ + 3, /* MMX or SSE register to integer */ + 8, /* size of l1 cache. */ + 8, /* size of l2 cache */ + 0, /* size of prefetch block */ + 0, /* number of parallel prefetches */ + 2, /* Branch cost */ + COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */ + COSTS_N_INSNS (3), /* cost of FMUL instruction. */ + COSTS_N_INSNS (39), /* cost of FDIV instruction. */ + COSTS_N_INSNS (1), /* cost of FABS instruction. */ + COSTS_N_INSNS (1), /* cost of FCHS instruction. */ + COSTS_N_INSNS (70), /* cost of FSQRT instruction. */ + {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}}, + DUMMY_STRINGOP_ALGS}, + {{libcall, {{-1, rep_prefix_4_byte}}}, + DUMMY_STRINGOP_ALGS}, + 1, /* scalar_stmt_cost. */ + 1, /* scalar load_cost. */ + 1, /* scalar_store_cost. */ + 1, /* vec_stmt_cost. */ + 1, /* vec_to_scalar_cost. */ + 1, /* scalar_to_vec_cost. */ + 1, /* vec_align_load_cost. */ + 2, /* vec_unalign_load_cost. */ + 1, /* vec_store_cost. */ + 3, /* cond_taken_branch_cost. */ + 1, /* cond_not_taken_branch_cost. */ +}; + +static const +struct processor_costs pentiumpro_cost = { + COSTS_N_INSNS (1), /* cost of an add instruction */ + COSTS_N_INSNS (1), /* cost of a lea instruction */ + COSTS_N_INSNS (1), /* variable shift costs */ + COSTS_N_INSNS (1), /* constant shift costs */ + {COSTS_N_INSNS (4), /* cost of starting multiply for QI */ + COSTS_N_INSNS (4), /* HI */ + COSTS_N_INSNS (4), /* SI */ + COSTS_N_INSNS (4), /* DI */ + COSTS_N_INSNS (4)}, /* other */ + 0, /* cost of multiply per each bit set */ + {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */ + COSTS_N_INSNS (17), /* HI */ + COSTS_N_INSNS (17), /* SI */ + COSTS_N_INSNS (17), /* DI */ + COSTS_N_INSNS (17)}, /* other */ + COSTS_N_INSNS (1), /* cost of movsx */ + COSTS_N_INSNS (1), /* cost of movzx */ + 8, /* "large" insn */ + 6, /* MOVE_RATIO */ + 2, /* cost for loading QImode using movzbl */ + {4, 4, 4}, /* cost of loading integer registers + in QImode, HImode and SImode. 
+ Relative to reg-reg move (2). */ + {2, 2, 2}, /* cost of storing integer registers */ + 2, /* cost of reg,reg fld/fst */ + {2, 2, 6}, /* cost of loading fp registers + in SFmode, DFmode and XFmode */ + {4, 4, 6}, /* cost of storing fp registers + in SFmode, DFmode and XFmode */ + 2, /* cost of moving MMX register */ + {2, 2}, /* cost of loading MMX registers + in SImode and DImode */ + {2, 2}, /* cost of storing MMX registers + in SImode and DImode */ + 2, /* cost of moving SSE register */ + {2, 2, 8}, /* cost of loading SSE registers + in SImode, DImode and TImode */ + {2, 2, 8}, /* cost of storing SSE registers + in SImode, DImode and TImode */ + 3, /* MMX or SSE register to integer */ + 8, /* size of l1 cache. */ + 256, /* size of l2 cache */ + 32, /* size of prefetch block */ + 6, /* number of parallel prefetches */ + 2, /* Branch cost */ + COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */ + COSTS_N_INSNS (5), /* cost of FMUL instruction. */ + COSTS_N_INSNS (56), /* cost of FDIV instruction. */ + COSTS_N_INSNS (2), /* cost of FABS instruction. */ + COSTS_N_INSNS (2), /* cost of FCHS instruction. */ + COSTS_N_INSNS (56), /* cost of FSQRT instruction. */ + /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes + (we ensure the alignment). For small blocks inline loop is still a + noticeable win, for bigger blocks either rep movsl or rep movsb is + way to go. Rep movsb has apparently more expensive startup time in CPU, + but after 4K the difference is down in the noise. */ + {{rep_prefix_4_byte, {{128, loop}, {1024, unrolled_loop}, + {8192, rep_prefix_4_byte}, {-1, rep_prefix_1_byte}}}, + DUMMY_STRINGOP_ALGS}, + {{rep_prefix_4_byte, {{1024, unrolled_loop}, + {8192, rep_prefix_4_byte}, {-1, libcall}}}, + DUMMY_STRINGOP_ALGS}, + 1, /* scalar_stmt_cost. */ + 1, /* scalar load_cost. */ + 1, /* scalar_store_cost. */ + 1, /* vec_stmt_cost. */ + 1, /* vec_to_scalar_cost. */ + 1, /* scalar_to_vec_cost. */ + 1, /* vec_align_load_cost. */ + 2, /* vec_unalign_load_cost. */ + 1, /* vec_store_cost. */ + 3, /* cond_taken_branch_cost. */ + 1, /* cond_not_taken_branch_cost. */ +}; + +static const +struct processor_costs geode_cost = { + COSTS_N_INSNS (1), /* cost of an add instruction */ + COSTS_N_INSNS (1), /* cost of a lea instruction */ + COSTS_N_INSNS (2), /* variable shift costs */ + COSTS_N_INSNS (1), /* constant shift costs */ + {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ + COSTS_N_INSNS (4), /* HI */ + COSTS_N_INSNS (7), /* SI */ + COSTS_N_INSNS (7), /* DI */ + COSTS_N_INSNS (7)}, /* other */ + 0, /* cost of multiply per each bit set */ + {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */ + COSTS_N_INSNS (23), /* HI */ + COSTS_N_INSNS (39), /* SI */ + COSTS_N_INSNS (39), /* DI */ + COSTS_N_INSNS (39)}, /* other */ + COSTS_N_INSNS (1), /* cost of movsx */ + COSTS_N_INSNS (1), /* cost of movzx */ + 8, /* "large" insn */ + 4, /* MOVE_RATIO */ + 1, /* cost for loading QImode using movzbl */ + {1, 1, 1}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). 
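The PentiumPro comment above is a convenient place to note how these stringop tables are read: each {max, alg} pair covers block sizes up to max, and max == -1 terminates the list and covers everything larger. pick_alg below is a deliberately simplified stand-in for the real selection logic in i386.c (which also weighs options, alignment and whether the size is known at compile time), applied to a toy copy of the PentiumPro known-size memcpy row:

    #include <stdio.h>

    enum stringop_alg { libcall, loop, unrolled_loop,
                        rep_prefix_1_byte, rep_prefix_4_byte };

    struct stringop_strategy { long max; enum stringop_alg alg; };

    static const struct stringop_strategy ppro_memcpy[] = {
      { 128,  loop },
      { 1024, unrolled_loop },
      { 8192, rep_prefix_4_byte },
      { -1,   rep_prefix_1_byte },
    };

    /* First entry whose MAX covers the requested size wins.  */
    static enum stringop_alg
    pick_alg (const struct stringop_strategy *s, long nbytes)
    {
      while (s->max != -1 && nbytes > s->max)
        s++;
      return s->alg;
    }

    int
    main (void)
    {
      printf ("%d\n", pick_alg (ppro_memcpy, 64));      /* loop              */
      printf ("%d\n", pick_alg (ppro_memcpy, 4096));    /* rep_prefix_4_byte */
      printf ("%d\n", pick_alg (ppro_memcpy, 100000));  /* rep_prefix_1_byte */
      return 0;
    }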
*/ + {1, 1, 1}, /* cost of storing integer registers */ + 1, /* cost of reg,reg fld/fst */ + {1, 1, 1}, /* cost of loading fp registers + in SFmode, DFmode and XFmode */ + {4, 6, 6}, /* cost of storing fp registers + in SFmode, DFmode and XFmode */ + + 1, /* cost of moving MMX register */ + {1, 1}, /* cost of loading MMX registers + in SImode and DImode */ + {1, 1}, /* cost of storing MMX registers + in SImode and DImode */ + 1, /* cost of moving SSE register */ + {1, 1, 1}, /* cost of loading SSE registers + in SImode, DImode and TImode */ + {1, 1, 1}, /* cost of storing SSE registers + in SImode, DImode and TImode */ + 1, /* MMX or SSE register to integer */ + 64, /* size of l1 cache. */ + 128, /* size of l2 cache. */ + 32, /* size of prefetch block */ + 1, /* number of parallel prefetches */ + 1, /* Branch cost */ + COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */ + COSTS_N_INSNS (11), /* cost of FMUL instruction. */ + COSTS_N_INSNS (47), /* cost of FDIV instruction. */ + COSTS_N_INSNS (1), /* cost of FABS instruction. */ + COSTS_N_INSNS (1), /* cost of FCHS instruction. */ + COSTS_N_INSNS (54), /* cost of FSQRT instruction. */ + {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}}, + DUMMY_STRINGOP_ALGS}, + {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}}, + DUMMY_STRINGOP_ALGS}, + 1, /* scalar_stmt_cost. */ + 1, /* scalar load_cost. */ + 1, /* scalar_store_cost. */ + 1, /* vec_stmt_cost. */ + 1, /* vec_to_scalar_cost. */ + 1, /* scalar_to_vec_cost. */ + 1, /* vec_align_load_cost. */ + 2, /* vec_unalign_load_cost. */ + 1, /* vec_store_cost. */ + 3, /* cond_taken_branch_cost. */ + 1, /* cond_not_taken_branch_cost. */ +}; + +static const +struct processor_costs k6_cost = { + COSTS_N_INSNS (1), /* cost of an add instruction */ + COSTS_N_INSNS (2), /* cost of a lea instruction */ + COSTS_N_INSNS (1), /* variable shift costs */ + COSTS_N_INSNS (1), /* constant shift costs */ + {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ + COSTS_N_INSNS (3), /* HI */ + COSTS_N_INSNS (3), /* SI */ + COSTS_N_INSNS (3), /* DI */ + COSTS_N_INSNS (3)}, /* other */ + 0, /* cost of multiply per each bit set */ + {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */ + COSTS_N_INSNS (18), /* HI */ + COSTS_N_INSNS (18), /* SI */ + COSTS_N_INSNS (18), /* DI */ + COSTS_N_INSNS (18)}, /* other */ + COSTS_N_INSNS (2), /* cost of movsx */ + COSTS_N_INSNS (2), /* cost of movzx */ + 8, /* "large" insn */ + 4, /* MOVE_RATIO */ + 3, /* cost for loading QImode using movzbl */ + {4, 5, 4}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). */ + {2, 3, 2}, /* cost of storing integer registers */ + 4, /* cost of reg,reg fld/fst */ + {6, 6, 6}, /* cost of loading fp registers + in SFmode, DFmode and XFmode */ + {4, 4, 4}, /* cost of storing fp registers + in SFmode, DFmode and XFmode */ + 2, /* cost of moving MMX register */ + {2, 2}, /* cost of loading MMX registers + in SImode and DImode */ + {2, 2}, /* cost of storing MMX registers + in SImode and DImode */ + 2, /* cost of moving SSE register */ + {2, 2, 8}, /* cost of loading SSE registers + in SImode, DImode and TImode */ + {2, 2, 8}, /* cost of storing SSE registers + in SImode, DImode and TImode */ + 6, /* MMX or SSE register to integer */ + 32, /* size of l1 cache. */ + 32, /* size of l2 cache. Some models + have integrated l2 cache, but + optimizing for k6 is not important + enough to worry about that. 
*/ + 32, /* size of prefetch block */ + 1, /* number of parallel prefetches */ + 1, /* Branch cost */ + COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */ + COSTS_N_INSNS (2), /* cost of FMUL instruction. */ + COSTS_N_INSNS (56), /* cost of FDIV instruction. */ + COSTS_N_INSNS (2), /* cost of FABS instruction. */ + COSTS_N_INSNS (2), /* cost of FCHS instruction. */ + COSTS_N_INSNS (56), /* cost of FSQRT instruction. */ + {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}}, + DUMMY_STRINGOP_ALGS}, + {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}}, + DUMMY_STRINGOP_ALGS}, + 1, /* scalar_stmt_cost. */ + 1, /* scalar load_cost. */ + 1, /* scalar_store_cost. */ + 1, /* vec_stmt_cost. */ + 1, /* vec_to_scalar_cost. */ + 1, /* scalar_to_vec_cost. */ + 1, /* vec_align_load_cost. */ + 2, /* vec_unalign_load_cost. */ + 1, /* vec_store_cost. */ + 3, /* cond_taken_branch_cost. */ + 1, /* cond_not_taken_branch_cost. */ +}; + +static const +struct processor_costs athlon_cost = { + COSTS_N_INSNS (1), /* cost of an add instruction */ + COSTS_N_INSNS (2), /* cost of a lea instruction */ + COSTS_N_INSNS (1), /* variable shift costs */ + COSTS_N_INSNS (1), /* constant shift costs */ + {COSTS_N_INSNS (5), /* cost of starting multiply for QI */ + COSTS_N_INSNS (5), /* HI */ + COSTS_N_INSNS (5), /* SI */ + COSTS_N_INSNS (5), /* DI */ + COSTS_N_INSNS (5)}, /* other */ + 0, /* cost of multiply per each bit set */ + {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */ + COSTS_N_INSNS (26), /* HI */ + COSTS_N_INSNS (42), /* SI */ + COSTS_N_INSNS (74), /* DI */ + COSTS_N_INSNS (74)}, /* other */ + COSTS_N_INSNS (1), /* cost of movsx */ + COSTS_N_INSNS (1), /* cost of movzx */ + 8, /* "large" insn */ + 9, /* MOVE_RATIO */ + 4, /* cost for loading QImode using movzbl */ + {3, 4, 3}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). */ + {3, 4, 3}, /* cost of storing integer registers */ + 4, /* cost of reg,reg fld/fst */ + {4, 4, 12}, /* cost of loading fp registers + in SFmode, DFmode and XFmode */ + {6, 6, 8}, /* cost of storing fp registers + in SFmode, DFmode and XFmode */ + 2, /* cost of moving MMX register */ + {4, 4}, /* cost of loading MMX registers + in SImode and DImode */ + {4, 4}, /* cost of storing MMX registers + in SImode and DImode */ + 2, /* cost of moving SSE register */ + {4, 4, 6}, /* cost of loading SSE registers + in SImode, DImode and TImode */ + {4, 4, 5}, /* cost of storing SSE registers + in SImode, DImode and TImode */ + 5, /* MMX or SSE register to integer */ + 64, /* size of l1 cache. */ + 256, /* size of l2 cache. */ + 64, /* size of prefetch block */ + 6, /* number of parallel prefetches */ + 5, /* Branch cost */ + COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */ + COSTS_N_INSNS (4), /* cost of FMUL instruction. */ + COSTS_N_INSNS (24), /* cost of FDIV instruction. */ + COSTS_N_INSNS (2), /* cost of FABS instruction. */ + COSTS_N_INSNS (2), /* cost of FCHS instruction. */ + COSTS_N_INSNS (35), /* cost of FSQRT instruction. */ + /* For some reason, Athlon deals better with REP prefix (relative to loops) + compared to K8. Alignment becomes important after 8 bytes for memcpy and + 128 bytes for memset. */ + {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}}, + DUMMY_STRINGOP_ALGS}, + {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}}, + DUMMY_STRINGOP_ALGS}, + 1, /* scalar_stmt_cost. */ + 1, /* scalar load_cost. */ + 1, /* scalar_store_cost. */ + 1, /* vec_stmt_cost. */ + 1, /* vec_to_scalar_cost. 
*/ + 1, /* scalar_to_vec_cost. */ + 1, /* vec_align_load_cost. */ + 2, /* vec_unalign_load_cost. */ + 1, /* vec_store_cost. */ + 3, /* cond_taken_branch_cost. */ + 1, /* cond_not_taken_branch_cost. */ +}; + +static const +struct processor_costs k8_cost = { + COSTS_N_INSNS (1), /* cost of an add instruction */ + COSTS_N_INSNS (2), /* cost of a lea instruction */ + COSTS_N_INSNS (1), /* variable shift costs */ + COSTS_N_INSNS (1), /* constant shift costs */ + {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ + COSTS_N_INSNS (4), /* HI */ + COSTS_N_INSNS (3), /* SI */ + COSTS_N_INSNS (4), /* DI */ + COSTS_N_INSNS (5)}, /* other */ + 0, /* cost of multiply per each bit set */ + {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */ + COSTS_N_INSNS (26), /* HI */ + COSTS_N_INSNS (42), /* SI */ + COSTS_N_INSNS (74), /* DI */ + COSTS_N_INSNS (74)}, /* other */ + COSTS_N_INSNS (1), /* cost of movsx */ + COSTS_N_INSNS (1), /* cost of movzx */ + 8, /* "large" insn */ + 9, /* MOVE_RATIO */ + 4, /* cost for loading QImode using movzbl */ + {3, 4, 3}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). */ + {3, 4, 3}, /* cost of storing integer registers */ + 4, /* cost of reg,reg fld/fst */ + {4, 4, 12}, /* cost of loading fp registers + in SFmode, DFmode and XFmode */ + {6, 6, 8}, /* cost of storing fp registers + in SFmode, DFmode and XFmode */ + 2, /* cost of moving MMX register */ + {3, 3}, /* cost of loading MMX registers + in SImode and DImode */ + {4, 4}, /* cost of storing MMX registers + in SImode and DImode */ + 2, /* cost of moving SSE register */ + {4, 3, 6}, /* cost of loading SSE registers + in SImode, DImode and TImode */ + {4, 4, 5}, /* cost of storing SSE registers + in SImode, DImode and TImode */ + 5, /* MMX or SSE register to integer */ + 64, /* size of l1 cache. */ + 512, /* size of l2 cache. */ + 64, /* size of prefetch block */ + /* New AMD processors never drop prefetches; if they cannot be performed + immediately, they are queued. We set number of simultaneous prefetches + to a large constant to reflect this (it probably is not a good idea not + to limit number of prefetches at all, as their execution also takes some + time). */ + 100, /* number of parallel prefetches */ + 3, /* Branch cost */ + COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */ + COSTS_N_INSNS (4), /* cost of FMUL instruction. */ + COSTS_N_INSNS (19), /* cost of FDIV instruction. */ + COSTS_N_INSNS (2), /* cost of FABS instruction. */ + COSTS_N_INSNS (2), /* cost of FCHS instruction. */ + COSTS_N_INSNS (35), /* cost of FSQRT instruction. */ + /* K8 has optimized REP instruction for medium sized blocks, but for very + small blocks it is better to use loop. For large blocks, libcall can + do nontemporary accesses and beat inline considerably. */ + {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}}, + {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}}, + {{libcall, {{8, loop}, {24, unrolled_loop}, + {2048, rep_prefix_4_byte}, {-1, libcall}}}, + {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}}, + 4, /* scalar_stmt_cost. */ + 2, /* scalar load_cost. */ + 2, /* scalar_store_cost. */ + 5, /* vec_stmt_cost. */ + 0, /* vec_to_scalar_cost. */ + 2, /* scalar_to_vec_cost. */ + 2, /* vec_align_load_cost. */ + 3, /* vec_unalign_load_cost. */ + 3, /* vec_store_cost. */ + 3, /* cond_taken_branch_cost. */ + 2, /* cond_not_taken_branch_cost. 
*/ +}; + +struct processor_costs amdfam10_cost = { + COSTS_N_INSNS (1), /* cost of an add instruction */ + COSTS_N_INSNS (2), /* cost of a lea instruction */ + COSTS_N_INSNS (1), /* variable shift costs */ + COSTS_N_INSNS (1), /* constant shift costs */ + {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ + COSTS_N_INSNS (4), /* HI */ + COSTS_N_INSNS (3), /* SI */ + COSTS_N_INSNS (4), /* DI */ + COSTS_N_INSNS (5)}, /* other */ + 0, /* cost of multiply per each bit set */ + {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */ + COSTS_N_INSNS (35), /* HI */ + COSTS_N_INSNS (51), /* SI */ + COSTS_N_INSNS (83), /* DI */ + COSTS_N_INSNS (83)}, /* other */ + COSTS_N_INSNS (1), /* cost of movsx */ + COSTS_N_INSNS (1), /* cost of movzx */ + 8, /* "large" insn */ + 9, /* MOVE_RATIO */ + 4, /* cost for loading QImode using movzbl */ + {3, 4, 3}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). */ + {3, 4, 3}, /* cost of storing integer registers */ + 4, /* cost of reg,reg fld/fst */ + {4, 4, 12}, /* cost of loading fp registers + in SFmode, DFmode and XFmode */ + {6, 6, 8}, /* cost of storing fp registers + in SFmode, DFmode and XFmode */ + 2, /* cost of moving MMX register */ + {3, 3}, /* cost of loading MMX registers + in SImode and DImode */ + {4, 4}, /* cost of storing MMX registers + in SImode and DImode */ + 2, /* cost of moving SSE register */ + {4, 4, 3}, /* cost of loading SSE registers + in SImode, DImode and TImode */ + {4, 4, 5}, /* cost of storing SSE registers + in SImode, DImode and TImode */ + 3, /* MMX or SSE register to integer */ + /* On K8: + MOVD reg64, xmmreg Double FSTORE 4 + MOVD reg32, xmmreg Double FSTORE 4 + On AMDFAM10: + MOVD reg64, xmmreg Double FADD 3 + 1/1 1/1 + MOVD reg32, xmmreg Double FADD 3 + 1/1 1/1 */ + 64, /* size of l1 cache. */ + 512, /* size of l2 cache. */ + 64, /* size of prefetch block */ + /* New AMD processors never drop prefetches; if they cannot be performed + immediately, they are queued. We set number of simultaneous prefetches + to a large constant to reflect this (it probably is not a good idea not + to limit number of prefetches at all, as their execution also takes some + time). */ + 100, /* number of parallel prefetches */ + 2, /* Branch cost */ + COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */ + COSTS_N_INSNS (4), /* cost of FMUL instruction. */ + COSTS_N_INSNS (19), /* cost of FDIV instruction. */ + COSTS_N_INSNS (2), /* cost of FABS instruction. */ + COSTS_N_INSNS (2), /* cost of FCHS instruction. */ + COSTS_N_INSNS (35), /* cost of FSQRT instruction. */ + + /* AMDFAM10 has optimized REP instruction for medium sized blocks, but for + very small blocks it is better to use loop. For large blocks, libcall can + do nontemporary accesses and beat inline considerably. */ + {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}}, + {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}}, + {{libcall, {{8, loop}, {24, unrolled_loop}, + {2048, rep_prefix_4_byte}, {-1, libcall}}}, + {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}}, + 4, /* scalar_stmt_cost. */ + 2, /* scalar load_cost. */ + 2, /* scalar_store_cost. */ + 6, /* vec_stmt_cost. */ + 0, /* vec_to_scalar_cost. */ + 2, /* scalar_to_vec_cost. */ + 2, /* vec_align_load_cost. */ + 2, /* vec_unalign_load_cost. */ + 2, /* vec_store_cost. */ + 2, /* cond_taken_branch_cost. */ + 1, /* cond_not_taken_branch_cost. 
*/ +}; + +struct processor_costs bdver1_cost = { + COSTS_N_INSNS (1), /* cost of an add instruction */ + COSTS_N_INSNS (1), /* cost of a lea instruction */ + COSTS_N_INSNS (1), /* variable shift costs */ + COSTS_N_INSNS (1), /* constant shift costs */ + {COSTS_N_INSNS (4), /* cost of starting multiply for QI */ + COSTS_N_INSNS (4), /* HI */ + COSTS_N_INSNS (4), /* SI */ + COSTS_N_INSNS (6), /* DI */ + COSTS_N_INSNS (6)}, /* other */ + 0, /* cost of multiply per each bit set */ + {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */ + COSTS_N_INSNS (35), /* HI */ + COSTS_N_INSNS (51), /* SI */ + COSTS_N_INSNS (83), /* DI */ + COSTS_N_INSNS (83)}, /* other */ + COSTS_N_INSNS (1), /* cost of movsx */ + COSTS_N_INSNS (1), /* cost of movzx */ + 8, /* "large" insn */ + 9, /* MOVE_RATIO */ + 4, /* cost for loading QImode using movzbl */ + {5, 5, 4}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). */ + {4, 4, 4}, /* cost of storing integer registers */ + 2, /* cost of reg,reg fld/fst */ + {5, 5, 12}, /* cost of loading fp registers + in SFmode, DFmode and XFmode */ + {4, 4, 8}, /* cost of storing fp registers + in SFmode, DFmode and XFmode */ + 2, /* cost of moving MMX register */ + {4, 4}, /* cost of loading MMX registers + in SImode and DImode */ + {4, 4}, /* cost of storing MMX registers + in SImode and DImode */ + 2, /* cost of moving SSE register */ + {4, 4, 4}, /* cost of loading SSE registers + in SImode, DImode and TImode */ + {4, 4, 4}, /* cost of storing SSE registers + in SImode, DImode and TImode */ + 2, /* MMX or SSE register to integer */ + /* On K8: + MOVD reg64, xmmreg Double FSTORE 4 + MOVD reg32, xmmreg Double FSTORE 4 + On AMDFAM10: + MOVD reg64, xmmreg Double FADD 3 + 1/1 1/1 + MOVD reg32, xmmreg Double FADD 3 + 1/1 1/1 */ + 16, /* size of l1 cache. */ + 2048, /* size of l2 cache. */ + 64, /* size of prefetch block */ + /* New AMD processors never drop prefetches; if they cannot be performed + immediately, they are queued. We set number of simultaneous prefetches + to a large constant to reflect this (it probably is not a good idea not + to limit number of prefetches at all, as their execution also takes some + time). */ + 100, /* number of parallel prefetches */ + 2, /* Branch cost */ + COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */ + COSTS_N_INSNS (6), /* cost of FMUL instruction. */ + COSTS_N_INSNS (42), /* cost of FDIV instruction. */ + COSTS_N_INSNS (2), /* cost of FABS instruction. */ + COSTS_N_INSNS (2), /* cost of FCHS instruction. */ + COSTS_N_INSNS (52), /* cost of FSQRT instruction. */ + + /* BDVER1 has optimized REP instruction for medium sized blocks, but for + very small blocks it is better to use loop. For large blocks, libcall + can do nontemporary accesses and beat inline considerably. */ + {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}}, + {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}}, + {{libcall, {{8, loop}, {24, unrolled_loop}, + {2048, rep_prefix_4_byte}, {-1, libcall}}}, + {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}}, + 6, /* scalar_stmt_cost. */ + 4, /* scalar load_cost. */ + 4, /* scalar_store_cost. */ + 6, /* vec_stmt_cost. */ + 0, /* vec_to_scalar_cost. */ + 2, /* scalar_to_vec_cost. */ + 4, /* vec_align_load_cost. */ + 4, /* vec_unalign_load_cost. */ + 4, /* vec_store_cost. */ + 2, /* cond_taken_branch_cost. */ + 1, /* cond_not_taken_branch_cost. 
*/ +}; + +struct processor_costs btver1_cost = { + COSTS_N_INSNS (1), /* cost of an add instruction */ + COSTS_N_INSNS (2), /* cost of a lea instruction */ + COSTS_N_INSNS (1), /* variable shift costs */ + COSTS_N_INSNS (1), /* constant shift costs */ + {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ + COSTS_N_INSNS (4), /* HI */ + COSTS_N_INSNS (3), /* SI */ + COSTS_N_INSNS (4), /* DI */ + COSTS_N_INSNS (5)}, /* other */ + 0, /* cost of multiply per each bit set */ + {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */ + COSTS_N_INSNS (35), /* HI */ + COSTS_N_INSNS (51), /* SI */ + COSTS_N_INSNS (83), /* DI */ + COSTS_N_INSNS (83)}, /* other */ + COSTS_N_INSNS (1), /* cost of movsx */ + COSTS_N_INSNS (1), /* cost of movzx */ + 8, /* "large" insn */ + 9, /* MOVE_RATIO */ + 4, /* cost for loading QImode using movzbl */ + {3, 4, 3}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). */ + {3, 4, 3}, /* cost of storing integer registers */ + 4, /* cost of reg,reg fld/fst */ + {4, 4, 12}, /* cost of loading fp registers + in SFmode, DFmode and XFmode */ + {6, 6, 8}, /* cost of storing fp registers + in SFmode, DFmode and XFmode */ + 2, /* cost of moving MMX register */ + {3, 3}, /* cost of loading MMX registers + in SImode and DImode */ + {4, 4}, /* cost of storing MMX registers + in SImode and DImode */ + 2, /* cost of moving SSE register */ + {4, 4, 3}, /* cost of loading SSE registers + in SImode, DImode and TImode */ + {4, 4, 5}, /* cost of storing SSE registers + in SImode, DImode and TImode */ + 3, /* MMX or SSE register to integer */ + /* On K8: + MOVD reg64, xmmreg Double FSTORE 4 + MOVD reg32, xmmreg Double FSTORE 4 + On AMDFAM10: + MOVD reg64, xmmreg Double FADD 3 + 1/1 1/1 + MOVD reg32, xmmreg Double FADD 3 + 1/1 1/1 */ + 32, /* size of l1 cache. */ + 512, /* size of l2 cache. */ + 64, /* size of prefetch block */ + 100, /* number of parallel prefetches */ + 2, /* Branch cost */ + COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */ + COSTS_N_INSNS (4), /* cost of FMUL instruction. */ + COSTS_N_INSNS (19), /* cost of FDIV instruction. */ + COSTS_N_INSNS (2), /* cost of FABS instruction. */ + COSTS_N_INSNS (2), /* cost of FCHS instruction. */ + COSTS_N_INSNS (35), /* cost of FSQRT instruction. */ + + /* BTVER1 has optimized REP instruction for medium sized blocks, but for + very small blocks it is better to use loop. For large blocks, libcall can + do nontemporary accesses and beat inline considerably. */ + {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}}, + {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}}, + {{libcall, {{8, loop}, {24, unrolled_loop}, + {2048, rep_prefix_4_byte}, {-1, libcall}}}, + {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}}, + 4, /* scalar_stmt_cost. */ + 2, /* scalar load_cost. */ + 2, /* scalar_store_cost. */ + 6, /* vec_stmt_cost. */ + 0, /* vec_to_scalar_cost. */ + 2, /* scalar_to_vec_cost. */ + 2, /* vec_align_load_cost. */ + 2, /* vec_unalign_load_cost. */ + 2, /* vec_store_cost. */ + 2, /* cond_taken_branch_cost. */ + 1, /* cond_not_taken_branch_cost. 
*/ +}; + +static const +struct processor_costs pentium4_cost = { + COSTS_N_INSNS (1), /* cost of an add instruction */ + COSTS_N_INSNS (3), /* cost of a lea instruction */ + COSTS_N_INSNS (4), /* variable shift costs */ + COSTS_N_INSNS (4), /* constant shift costs */ + {COSTS_N_INSNS (15), /* cost of starting multiply for QI */ + COSTS_N_INSNS (15), /* HI */ + COSTS_N_INSNS (15), /* SI */ + COSTS_N_INSNS (15), /* DI */ + COSTS_N_INSNS (15)}, /* other */ + 0, /* cost of multiply per each bit set */ + {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */ + COSTS_N_INSNS (56), /* HI */ + COSTS_N_INSNS (56), /* SI */ + COSTS_N_INSNS (56), /* DI */ + COSTS_N_INSNS (56)}, /* other */ + COSTS_N_INSNS (1), /* cost of movsx */ + COSTS_N_INSNS (1), /* cost of movzx */ + 16, /* "large" insn */ + 6, /* MOVE_RATIO */ + 2, /* cost for loading QImode using movzbl */ + {4, 5, 4}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). */ + {2, 3, 2}, /* cost of storing integer registers */ + 2, /* cost of reg,reg fld/fst */ + {2, 2, 6}, /* cost of loading fp registers + in SFmode, DFmode and XFmode */ + {4, 4, 6}, /* cost of storing fp registers + in SFmode, DFmode and XFmode */ + 2, /* cost of moving MMX register */ + {2, 2}, /* cost of loading MMX registers + in SImode and DImode */ + {2, 2}, /* cost of storing MMX registers + in SImode and DImode */ + 12, /* cost of moving SSE register */ + {12, 12, 12}, /* cost of loading SSE registers + in SImode, DImode and TImode */ + {2, 2, 8}, /* cost of storing SSE registers + in SImode, DImode and TImode */ + 10, /* MMX or SSE register to integer */ + 8, /* size of l1 cache. */ + 256, /* size of l2 cache. */ + 64, /* size of prefetch block */ + 6, /* number of parallel prefetches */ + 2, /* Branch cost */ + COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */ + COSTS_N_INSNS (7), /* cost of FMUL instruction. */ + COSTS_N_INSNS (43), /* cost of FDIV instruction. */ + COSTS_N_INSNS (2), /* cost of FABS instruction. */ + COSTS_N_INSNS (2), /* cost of FCHS instruction. */ + COSTS_N_INSNS (43), /* cost of FSQRT instruction. */ + {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}}, + DUMMY_STRINGOP_ALGS}, + {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte}, + {-1, libcall}}}, + DUMMY_STRINGOP_ALGS}, + 1, /* scalar_stmt_cost. */ + 1, /* scalar load_cost. */ + 1, /* scalar_store_cost. */ + 1, /* vec_stmt_cost. */ + 1, /* vec_to_scalar_cost. */ + 1, /* scalar_to_vec_cost. */ + 1, /* vec_align_load_cost. */ + 2, /* vec_unalign_load_cost. */ + 1, /* vec_store_cost. */ + 3, /* cond_taken_branch_cost. */ + 1, /* cond_not_taken_branch_cost. 
*/ +}; + +static const +struct processor_costs nocona_cost = { + COSTS_N_INSNS (1), /* cost of an add instruction */ + COSTS_N_INSNS (1), /* cost of a lea instruction */ + COSTS_N_INSNS (1), /* variable shift costs */ + COSTS_N_INSNS (1), /* constant shift costs */ + {COSTS_N_INSNS (10), /* cost of starting multiply for QI */ + COSTS_N_INSNS (10), /* HI */ + COSTS_N_INSNS (10), /* SI */ + COSTS_N_INSNS (10), /* DI */ + COSTS_N_INSNS (10)}, /* other */ + 0, /* cost of multiply per each bit set */ + {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */ + COSTS_N_INSNS (66), /* HI */ + COSTS_N_INSNS (66), /* SI */ + COSTS_N_INSNS (66), /* DI */ + COSTS_N_INSNS (66)}, /* other */ + COSTS_N_INSNS (1), /* cost of movsx */ + COSTS_N_INSNS (1), /* cost of movzx */ + 16, /* "large" insn */ + 17, /* MOVE_RATIO */ + 4, /* cost for loading QImode using movzbl */ + {4, 4, 4}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). */ + {4, 4, 4}, /* cost of storing integer registers */ + 3, /* cost of reg,reg fld/fst */ + {12, 12, 12}, /* cost of loading fp registers + in SFmode, DFmode and XFmode */ + {4, 4, 4}, /* cost of storing fp registers + in SFmode, DFmode and XFmode */ + 6, /* cost of moving MMX register */ + {12, 12}, /* cost of loading MMX registers + in SImode and DImode */ + {12, 12}, /* cost of storing MMX registers + in SImode and DImode */ + 6, /* cost of moving SSE register */ + {12, 12, 12}, /* cost of loading SSE registers + in SImode, DImode and TImode */ + {12, 12, 12}, /* cost of storing SSE registers + in SImode, DImode and TImode */ + 8, /* MMX or SSE register to integer */ + 8, /* size of l1 cache. */ + 1024, /* size of l2 cache. */ + 128, /* size of prefetch block */ + 8, /* number of parallel prefetches */ + 1, /* Branch cost */ + COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */ + COSTS_N_INSNS (8), /* cost of FMUL instruction. */ + COSTS_N_INSNS (40), /* cost of FDIV instruction. */ + COSTS_N_INSNS (3), /* cost of FABS instruction. */ + COSTS_N_INSNS (3), /* cost of FCHS instruction. */ + COSTS_N_INSNS (44), /* cost of FSQRT instruction. */ + {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}}, + {libcall, {{32, loop}, {20000, rep_prefix_8_byte}, + {100000, unrolled_loop}, {-1, libcall}}}}, + {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte}, + {-1, libcall}}}, + {libcall, {{24, loop}, {64, unrolled_loop}, + {8192, rep_prefix_8_byte}, {-1, libcall}}}}, + 1, /* scalar_stmt_cost. */ + 1, /* scalar load_cost. */ + 1, /* scalar_store_cost. */ + 1, /* vec_stmt_cost. */ + 1, /* vec_to_scalar_cost. */ + 1, /* scalar_to_vec_cost. */ + 1, /* vec_align_load_cost. */ + 2, /* vec_unalign_load_cost. */ + 1, /* vec_store_cost. */ + 3, /* cond_taken_branch_cost. */ + 1, /* cond_not_taken_branch_cost. 
*/ +}; + +static const +struct processor_costs atom_cost = { + COSTS_N_INSNS (1), /* cost of an add instruction */ + COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ + COSTS_N_INSNS (1), /* variable shift costs */ + COSTS_N_INSNS (1), /* constant shift costs */ + {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ + COSTS_N_INSNS (4), /* HI */ + COSTS_N_INSNS (3), /* SI */ + COSTS_N_INSNS (4), /* DI */ + COSTS_N_INSNS (2)}, /* other */ + 0, /* cost of multiply per each bit set */ + {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */ + COSTS_N_INSNS (26), /* HI */ + COSTS_N_INSNS (42), /* SI */ + COSTS_N_INSNS (74), /* DI */ + COSTS_N_INSNS (74)}, /* other */ + COSTS_N_INSNS (1), /* cost of movsx */ + COSTS_N_INSNS (1), /* cost of movzx */ + 8, /* "large" insn */ + 17, /* MOVE_RATIO */ + 2, /* cost for loading QImode using movzbl */ + {4, 4, 4}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). */ + {4, 4, 4}, /* cost of storing integer registers */ + 4, /* cost of reg,reg fld/fst */ + {12, 12, 12}, /* cost of loading fp registers + in SFmode, DFmode and XFmode */ + {6, 6, 8}, /* cost of storing fp registers + in SFmode, DFmode and XFmode */ + 2, /* cost of moving MMX register */ + {8, 8}, /* cost of loading MMX registers + in SImode and DImode */ + {8, 8}, /* cost of storing MMX registers + in SImode and DImode */ + 2, /* cost of moving SSE register */ + {8, 8, 8}, /* cost of loading SSE registers + in SImode, DImode and TImode */ + {8, 8, 8}, /* cost of storing SSE registers + in SImode, DImode and TImode */ + 5, /* MMX or SSE register to integer */ + 32, /* size of l1 cache. */ + 256, /* size of l2 cache. */ + 64, /* size of prefetch block */ + 6, /* number of parallel prefetches */ + 3, /* Branch cost */ + COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */ + COSTS_N_INSNS (8), /* cost of FMUL instruction. */ + COSTS_N_INSNS (20), /* cost of FDIV instruction. */ + COSTS_N_INSNS (8), /* cost of FABS instruction. */ + COSTS_N_INSNS (8), /* cost of FCHS instruction. */ + COSTS_N_INSNS (40), /* cost of FSQRT instruction. */ + {{libcall, {{11, loop}, {-1, rep_prefix_4_byte}}}, + {libcall, {{32, loop}, {64, rep_prefix_4_byte}, + {8192, rep_prefix_8_byte}, {-1, libcall}}}}, + {{libcall, {{8, loop}, {15, unrolled_loop}, + {2048, rep_prefix_4_byte}, {-1, libcall}}}, + {libcall, {{24, loop}, {32, unrolled_loop}, + {8192, rep_prefix_8_byte}, {-1, libcall}}}}, + 1, /* scalar_stmt_cost. */ + 1, /* scalar load_cost. */ + 1, /* scalar_store_cost. */ + 1, /* vec_stmt_cost. */ + 1, /* vec_to_scalar_cost. */ + 1, /* scalar_to_vec_cost. */ + 1, /* vec_align_load_cost. */ + 2, /* vec_unalign_load_cost. */ + 1, /* vec_store_cost. */ + 3, /* cond_taken_branch_cost. */ + 1, /* cond_not_taken_branch_cost. */ +}; + +/* Generic64 should produce code tuned for Nocona and K8. */ +static const +struct processor_costs generic64_cost = { + COSTS_N_INSNS (1), /* cost of an add instruction */ + /* On all chips taken into consideration lea is 2 cycles and more. With + this cost however our current implementation of synth_mult results in + use of unnecessary temporary registers causing regression on several + SPECfp benchmarks. 
*/ + COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ + COSTS_N_INSNS (1), /* variable shift costs */ + COSTS_N_INSNS (1), /* constant shift costs */ + {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ + COSTS_N_INSNS (4), /* HI */ + COSTS_N_INSNS (3), /* SI */ + COSTS_N_INSNS (4), /* DI */ + COSTS_N_INSNS (2)}, /* other */ + 0, /* cost of multiply per each bit set */ + {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */ + COSTS_N_INSNS (26), /* HI */ + COSTS_N_INSNS (42), /* SI */ + COSTS_N_INSNS (74), /* DI */ + COSTS_N_INSNS (74)}, /* other */ + COSTS_N_INSNS (1), /* cost of movsx */ + COSTS_N_INSNS (1), /* cost of movzx */ + 8, /* "large" insn */ + 17, /* MOVE_RATIO */ + 4, /* cost for loading QImode using movzbl */ + {4, 4, 4}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). */ + {4, 4, 4}, /* cost of storing integer registers */ + 4, /* cost of reg,reg fld/fst */ + {12, 12, 12}, /* cost of loading fp registers + in SFmode, DFmode and XFmode */ + {6, 6, 8}, /* cost of storing fp registers + in SFmode, DFmode and XFmode */ + 2, /* cost of moving MMX register */ + {8, 8}, /* cost of loading MMX registers + in SImode and DImode */ + {8, 8}, /* cost of storing MMX registers + in SImode and DImode */ + 2, /* cost of moving SSE register */ + {8, 8, 8}, /* cost of loading SSE registers + in SImode, DImode and TImode */ + {8, 8, 8}, /* cost of storing SSE registers + in SImode, DImode and TImode */ + 5, /* MMX or SSE register to integer */ + 32, /* size of l1 cache. */ + 512, /* size of l2 cache. */ + 64, /* size of prefetch block */ + 6, /* number of parallel prefetches */ + /* Benchmarks shows large regressions on K8 sixtrack benchmark when this + value is increased to perhaps more appropriate value of 5. */ + 3, /* Branch cost */ + COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */ + COSTS_N_INSNS (8), /* cost of FMUL instruction. */ + COSTS_N_INSNS (20), /* cost of FDIV instruction. */ + COSTS_N_INSNS (8), /* cost of FABS instruction. */ + COSTS_N_INSNS (8), /* cost of FCHS instruction. */ + COSTS_N_INSNS (40), /* cost of FSQRT instruction. */ + {DUMMY_STRINGOP_ALGS, + {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}}, + {DUMMY_STRINGOP_ALGS, + {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}}, + 1, /* scalar_stmt_cost. */ + 1, /* scalar load_cost. */ + 1, /* scalar_store_cost. */ + 1, /* vec_stmt_cost. */ + 1, /* vec_to_scalar_cost. */ + 1, /* scalar_to_vec_cost. */ + 1, /* vec_align_load_cost. */ + 2, /* vec_unalign_load_cost. */ + 1, /* vec_store_cost. */ + 3, /* cond_taken_branch_cost. */ + 1, /* cond_not_taken_branch_cost. */ +}; + +/* Generic32 should produce code tuned for PPro, Pentium4, Nocona, + Athlon and K8. 
*/ +static const +struct processor_costs generic32_cost = { + COSTS_N_INSNS (1), /* cost of an add instruction */ + COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ + COSTS_N_INSNS (1), /* variable shift costs */ + COSTS_N_INSNS (1), /* constant shift costs */ + {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ + COSTS_N_INSNS (4), /* HI */ + COSTS_N_INSNS (3), /* SI */ + COSTS_N_INSNS (4), /* DI */ + COSTS_N_INSNS (2)}, /* other */ + 0, /* cost of multiply per each bit set */ + {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */ + COSTS_N_INSNS (26), /* HI */ + COSTS_N_INSNS (42), /* SI */ + COSTS_N_INSNS (74), /* DI */ + COSTS_N_INSNS (74)}, /* other */ + COSTS_N_INSNS (1), /* cost of movsx */ + COSTS_N_INSNS (1), /* cost of movzx */ + 8, /* "large" insn */ + 17, /* MOVE_RATIO */ + 4, /* cost for loading QImode using movzbl */ + {4, 4, 4}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). */ + {4, 4, 4}, /* cost of storing integer registers */ + 4, /* cost of reg,reg fld/fst */ + {12, 12, 12}, /* cost of loading fp registers + in SFmode, DFmode and XFmode */ + {6, 6, 8}, /* cost of storing fp registers + in SFmode, DFmode and XFmode */ + 2, /* cost of moving MMX register */ + {8, 8}, /* cost of loading MMX registers + in SImode and DImode */ + {8, 8}, /* cost of storing MMX registers + in SImode and DImode */ + 2, /* cost of moving SSE register */ + {8, 8, 8}, /* cost of loading SSE registers + in SImode, DImode and TImode */ + {8, 8, 8}, /* cost of storing SSE registers + in SImode, DImode and TImode */ + 5, /* MMX or SSE register to integer */ + 32, /* size of l1 cache. */ + 256, /* size of l2 cache. */ + 64, /* size of prefetch block */ + 6, /* number of parallel prefetches */ + 3, /* Branch cost */ + COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */ + COSTS_N_INSNS (8), /* cost of FMUL instruction. */ + COSTS_N_INSNS (20), /* cost of FDIV instruction. */ + COSTS_N_INSNS (8), /* cost of FABS instruction. */ + COSTS_N_INSNS (8), /* cost of FCHS instruction. */ + COSTS_N_INSNS (40), /* cost of FSQRT instruction. */ + {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}}, + DUMMY_STRINGOP_ALGS}, + {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}}, + DUMMY_STRINGOP_ALGS}, + 1, /* scalar_stmt_cost. */ + 1, /* scalar load_cost. */ + 1, /* scalar_store_cost. */ + 1, /* vec_stmt_cost. */ + 1, /* vec_to_scalar_cost. */ + 1, /* scalar_to_vec_cost. */ + 1, /* vec_align_load_cost. */ + 2, /* vec_unalign_load_cost. */ + 1, /* vec_store_cost. */ + 3, /* cond_taken_branch_cost. */ + 1, /* cond_not_taken_branch_cost. */ +}; + +const struct processor_costs *ix86_cost = &pentium_cost; + +/* Processor feature/optimization bitmasks. */ +#define m_386 (1< 70) + { + *ptr++ = '\\'; + *ptr++ = '\n'; + line_len = 0; + } + } + + for (j = 0; j < 2; j++) + if (opts[i][j]) + { + memcpy (ptr, opts[i][j], len2[j]); + ptr += len2[j]; + line_len += len2[j]; + } + } + + *ptr = '\0'; + gcc_assert (ret + len >= ptr); + + return ret; +} + +/* Return TRUE if software prefetching is beneficial for the + given CPU. */ + +static bool +software_prefetching_beneficial_p (void) +{ + switch (ix86_tune) + { + case PROCESSOR_GEODE: + case PROCESSOR_K6: + case PROCESSOR_ATHLON: + case PROCESSOR_K8: + case PROCESSOR_AMDFAM10: + case PROCESSOR_BTVER1: + return true; + + default: + return false; + } +} + +/* Return true, if profiling code should be emitted before + prologue. Otherwise it returns false. 
+ Note: For x86 with "hotfix" it is sorried. */ +static bool +ix86_profile_before_prologue (void) +{ + return flag_fentry != 0; +} + +/* Function that is callable from the debugger to print the current + options. */ +void +ix86_debug_options (void) +{ + char *opts = ix86_target_string (ix86_isa_flags, target_flags, + ix86_arch_string, ix86_tune_string, + ix86_fpmath_string, true); + + if (opts) + { + fprintf (stderr, "%s\n\n", opts); + free (opts); + } + else + fputs ("\n\n", stderr); + + return; +} + +/* Override various settings based on options. If MAIN_ARGS_P, the + options are from the command line, otherwise they are from + attributes. */ + +static void +ix86_option_override_internal (bool main_args_p) +{ + int i; + unsigned int ix86_arch_mask, ix86_tune_mask; + const bool ix86_tune_specified = (ix86_tune_string != NULL); + const char *prefix; + const char *suffix; + const char *sw; + + /* Comes from final.c -- no real reason to change it. */ +#define MAX_CODE_ALIGN 16 + + enum pta_flags + { + PTA_SSE = 1 << 0, + PTA_SSE2 = 1 << 1, + PTA_SSE3 = 1 << 2, + PTA_MMX = 1 << 3, + PTA_PREFETCH_SSE = 1 << 4, + PTA_3DNOW = 1 << 5, + PTA_3DNOW_A = 1 << 6, + PTA_64BIT = 1 << 7, + PTA_SSSE3 = 1 << 8, + PTA_CX16 = 1 << 9, + PTA_POPCNT = 1 << 10, + PTA_ABM = 1 << 11, + PTA_SSE4A = 1 << 12, + PTA_NO_SAHF = 1 << 13, + PTA_SSE4_1 = 1 << 14, + PTA_SSE4_2 = 1 << 15, + PTA_AES = 1 << 16, + PTA_PCLMUL = 1 << 17, + PTA_AVX = 1 << 18, + PTA_FMA = 1 << 19, + PTA_MOVBE = 1 << 20, + PTA_FMA4 = 1 << 21, + PTA_XOP = 1 << 22, + PTA_LWP = 1 << 23, + PTA_FSGSBASE = 1 << 24, + PTA_RDRND = 1 << 25, + PTA_F16C = 1 << 26, + PTA_BMI = 1 << 27, + PTA_TBM = 1 << 28 + /* if this reaches 32, need to widen struct pta flags below */ + }; + + static struct pta + { + const char *const name; /* processor name or nickname. 
*/ + const enum processor_type processor; + const enum attr_cpu schedule; + const unsigned /*enum pta_flags*/ flags; + } + const processor_alias_table[] = + { + {"i386", PROCESSOR_I386, CPU_NONE, 0}, + {"i486", PROCESSOR_I486, CPU_NONE, 0}, + {"i586", PROCESSOR_PENTIUM, CPU_PENTIUM, 0}, + {"pentium", PROCESSOR_PENTIUM, CPU_PENTIUM, 0}, + {"pentium-mmx", PROCESSOR_PENTIUM, CPU_PENTIUM, PTA_MMX}, + {"winchip-c6", PROCESSOR_I486, CPU_NONE, PTA_MMX}, + {"winchip2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW}, + {"c3", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW}, + {"c3-2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX | PTA_SSE}, + {"i686", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0}, + {"pentiumpro", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0}, + {"pentium2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX}, + {"pentium3", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, + PTA_MMX | PTA_SSE}, + {"pentium3m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, + PTA_MMX | PTA_SSE}, + {"pentium-m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, + PTA_MMX | PTA_SSE | PTA_SSE2}, + {"pentium4", PROCESSOR_PENTIUM4, CPU_NONE, + PTA_MMX |PTA_SSE | PTA_SSE2}, + {"pentium4m", PROCESSOR_PENTIUM4, CPU_NONE, + PTA_MMX | PTA_SSE | PTA_SSE2}, + {"prescott", PROCESSOR_NOCONA, CPU_NONE, + PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3}, + {"nocona", PROCESSOR_NOCONA, CPU_NONE, + PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 + | PTA_CX16 | PTA_NO_SAHF}, + {"core2", PROCESSOR_CORE2_64, CPU_CORE2, + PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 + | PTA_SSSE3 | PTA_CX16}, + {"corei7", PROCESSOR_COREI7_64, CPU_COREI7, + PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 + | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_CX16}, + {"corei7-avx", PROCESSOR_COREI7_64, CPU_COREI7, + PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 + | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX + | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL}, + {"core-avx-i", PROCESSOR_COREI7_64, CPU_COREI7, + PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 + | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX + | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE + | PTA_RDRND | PTA_F16C}, + {"atom", PROCESSOR_ATOM, CPU_ATOM, + PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 + | PTA_SSSE3 | PTA_CX16 | PTA_MOVBE}, + {"geode", PROCESSOR_GEODE, CPU_GEODE, + PTA_MMX | PTA_3DNOW | PTA_3DNOW_A |PTA_PREFETCH_SSE}, + {"k6", PROCESSOR_K6, CPU_K6, PTA_MMX}, + {"k6-2", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW}, + {"k6-3", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW}, + {"athlon", PROCESSOR_ATHLON, CPU_ATHLON, + PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE}, + {"athlon-tbird", PROCESSOR_ATHLON, CPU_ATHLON, + PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE}, + {"athlon-4", PROCESSOR_ATHLON, CPU_ATHLON, + PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE}, + {"athlon-xp", PROCESSOR_ATHLON, CPU_ATHLON, + PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE}, + {"athlon-mp", PROCESSOR_ATHLON, CPU_ATHLON, + PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE}, + {"x86-64", PROCESSOR_K8, CPU_K8, + PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF}, + {"k8", PROCESSOR_K8, CPU_K8, + PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE + | PTA_SSE2 | PTA_NO_SAHF}, + {"k8-sse3", PROCESSOR_K8, CPU_K8, + PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE + | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF}, + {"opteron", PROCESSOR_K8, CPU_K8, + PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE + | PTA_SSE2 | PTA_NO_SAHF}, + {"opteron-sse3", PROCESSOR_K8, CPU_K8, + PTA_64BIT | PTA_MMX | PTA_3DNOW 
| PTA_3DNOW_A | PTA_SSE + | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF}, + {"athlon64", PROCESSOR_K8, CPU_K8, + PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE + | PTA_SSE2 | PTA_NO_SAHF}, + {"athlon64-sse3", PROCESSOR_K8, CPU_K8, + PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE + | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF}, + {"athlon-fx", PROCESSOR_K8, CPU_K8, + PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE + | PTA_SSE2 | PTA_NO_SAHF}, + {"amdfam10", PROCESSOR_AMDFAM10, CPU_AMDFAM10, + PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE + | PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM}, + {"barcelona", PROCESSOR_AMDFAM10, CPU_AMDFAM10, + PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE + | PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM}, + {"bdver1", PROCESSOR_BDVER1, CPU_BDVER1, + PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 + | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1 + | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4 + | PTA_XOP | PTA_LWP}, + {"btver1", PROCESSOR_BTVER1, CPU_GENERIC64, + PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 + | PTA_SSSE3 | PTA_SSE4A |PTA_ABM | PTA_CX16}, + {"generic32", PROCESSOR_GENERIC32, CPU_PENTIUMPRO, + 0 /* flags are only used for -march switch. */ }, + {"generic64", PROCESSOR_GENERIC64, CPU_GENERIC64, + PTA_64BIT /* flags are only used for -march switch. */ }, + }; + + int const pta_size = ARRAY_SIZE (processor_alias_table); + + /* Set up prefix/suffix so the error messages refer to either the command + line argument, or the attribute(target). */ + if (main_args_p) + { + prefix = "-m"; + suffix = ""; + sw = "switch"; + } + else + { + prefix = "option(\""; + suffix = "\")"; + sw = "attribute"; + } + +#ifdef SUBTARGET_OVERRIDE_OPTIONS + SUBTARGET_OVERRIDE_OPTIONS; +#endif + +#ifdef SUBSUBTARGET_OVERRIDE_OPTIONS + SUBSUBTARGET_OVERRIDE_OPTIONS; +#endif + + /* -fPIC is the default for x86_64. */ + if (TARGET_MACHO && TARGET_64BIT) + flag_pic = 2; + + /* Need to check -mtune=generic first. */ + if (ix86_tune_string) + { + if (!strcmp (ix86_tune_string, "generic") + || !strcmp (ix86_tune_string, "i686") + /* As special support for cross compilers we read -mtune=native + as -mtune=generic. With native compilers we won't see the + -mtune=native, as it was changed by the driver. */ + || !strcmp (ix86_tune_string, "native")) + { + if (TARGET_64BIT) + ix86_tune_string = "generic64"; + else + ix86_tune_string = "generic32"; + } + /* If this call is for setting the option attribute, allow the + generic32/generic64 that was previously set. */ + else if (!main_args_p + && (!strcmp (ix86_tune_string, "generic32") + || !strcmp (ix86_tune_string, "generic64"))) + ; + else if (!strncmp (ix86_tune_string, "generic", 7)) + error ("bad value (%s) for %stune=%s %s", + ix86_tune_string, prefix, suffix, sw); + else if (!strcmp (ix86_tune_string, "x86-64")) + warning (OPT_Wdeprecated, "%stune=x86-64%s is deprecated; use " + "%stune=k8%s or %stune=generic%s instead as appropriate", + prefix, suffix, prefix, suffix, prefix, suffix); + } + else + { + if (ix86_arch_string) + ix86_tune_string = ix86_arch_string; + if (!ix86_tune_string) + { + ix86_tune_string = cpu_names[TARGET_CPU_DEFAULT]; + ix86_tune_defaulted = 1; + } + + /* ix86_tune_string is set to ix86_arch_string or defaulted. We + need to use a sensible tune option. 
*/ + if (!strcmp (ix86_tune_string, "generic") + || !strcmp (ix86_tune_string, "x86-64") + || !strcmp (ix86_tune_string, "i686")) + { + if (TARGET_64BIT) + ix86_tune_string = "generic64"; + else + ix86_tune_string = "generic32"; + } + } + + if (ix86_stringop_string) + { + if (!strcmp (ix86_stringop_string, "rep_byte")) + stringop_alg = rep_prefix_1_byte; + else if (!strcmp (ix86_stringop_string, "libcall")) + stringop_alg = libcall; + else if (!strcmp (ix86_stringop_string, "rep_4byte")) + stringop_alg = rep_prefix_4_byte; + else if (!strcmp (ix86_stringop_string, "rep_8byte") + && TARGET_64BIT) + /* rep; movq isn't available in 32-bit code. */ + stringop_alg = rep_prefix_8_byte; + else if (!strcmp (ix86_stringop_string, "byte_loop")) + stringop_alg = loop_1_byte; + else if (!strcmp (ix86_stringop_string, "loop")) + stringop_alg = loop; + else if (!strcmp (ix86_stringop_string, "unrolled_loop")) + stringop_alg = unrolled_loop; + else + error ("bad value (%s) for %sstringop-strategy=%s %s", + ix86_stringop_string, prefix, suffix, sw); + } + + if (!ix86_arch_string) + ix86_arch_string = TARGET_64BIT ? "x86-64" : SUBTARGET32_DEFAULT_CPU; + else + ix86_arch_specified = 1; + + /* Validate -mabi= value. */ + if (ix86_abi_string) + { + if (strcmp (ix86_abi_string, "sysv") == 0) + ix86_abi = SYSV_ABI; + else if (strcmp (ix86_abi_string, "ms") == 0) + ix86_abi = MS_ABI; + else + error ("unknown ABI (%s) for %sabi=%s %s", + ix86_abi_string, prefix, suffix, sw); + } + else + ix86_abi = DEFAULT_ABI; + + if (ix86_cmodel_string != 0) + { + if (!strcmp (ix86_cmodel_string, "small")) + ix86_cmodel = flag_pic ? CM_SMALL_PIC : CM_SMALL; + else if (!strcmp (ix86_cmodel_string, "medium")) + ix86_cmodel = flag_pic ? CM_MEDIUM_PIC : CM_MEDIUM; + else if (!strcmp (ix86_cmodel_string, "large")) + ix86_cmodel = flag_pic ? CM_LARGE_PIC : CM_LARGE; + else if (flag_pic) + error ("code model %s does not support PIC mode", ix86_cmodel_string); + else if (!strcmp (ix86_cmodel_string, "32")) + ix86_cmodel = CM_32; + else if (!strcmp (ix86_cmodel_string, "kernel") && !flag_pic) + ix86_cmodel = CM_KERNEL; + else + error ("bad value (%s) for %scmodel=%s %s", + ix86_cmodel_string, prefix, suffix, sw); + } + else + { + /* For TARGET_64BIT and MS_ABI, force pic on, in order to enable the + use of rip-relative addressing. This eliminates fixups that + would otherwise be needed if this object is to be placed in a + DLL, and is essentially just as efficient as direct addressing. */ + if (TARGET_64BIT && DEFAULT_ABI == MS_ABI) + ix86_cmodel = CM_SMALL_PIC, flag_pic = 1; + else if (TARGET_64BIT) + ix86_cmodel = flag_pic ? CM_SMALL_PIC : CM_SMALL; + else + ix86_cmodel = CM_32; + } + if (ix86_asm_string != 0) + { + if (! TARGET_MACHO + && !strcmp (ix86_asm_string, "intel")) + ix86_asm_dialect = ASM_INTEL; + else if (!strcmp (ix86_asm_string, "att")) + ix86_asm_dialect = ASM_ATT; + else + error ("bad value (%s) for %sasm=%s %s", + ix86_asm_string, prefix, suffix, sw); + } + if ((TARGET_64BIT == 0) != (ix86_cmodel == CM_32)) + error ("code model %qs not supported in the %s bit mode", + ix86_cmodel_string, TARGET_64BIT ? "64" : "32"); + if ((TARGET_64BIT != 0) != ((ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0)) + sorry ("%i-bit mode not compiled in", + (ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32); + + for (i = 0; i < pta_size; i++) + if (! 
strcmp (ix86_arch_string, processor_alias_table[i].name)) + { + ix86_schedule = processor_alias_table[i].schedule; + ix86_arch = processor_alias_table[i].processor; + /* Default cpu tuning to the architecture. */ + ix86_tune = ix86_arch; + + if (TARGET_64BIT && !(processor_alias_table[i].flags & PTA_64BIT)) + error ("CPU you selected does not support x86-64 " + "instruction set"); + + if (processor_alias_table[i].flags & PTA_MMX + && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX)) + ix86_isa_flags |= OPTION_MASK_ISA_MMX; + if (processor_alias_table[i].flags & PTA_3DNOW + && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW)) + ix86_isa_flags |= OPTION_MASK_ISA_3DNOW; + if (processor_alias_table[i].flags & PTA_3DNOW_A + && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A)) + ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A; + if (processor_alias_table[i].flags & PTA_SSE + && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE)) + ix86_isa_flags |= OPTION_MASK_ISA_SSE; + if (processor_alias_table[i].flags & PTA_SSE2 + && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2)) + ix86_isa_flags |= OPTION_MASK_ISA_SSE2; + if (processor_alias_table[i].flags & PTA_SSE3 + && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3)) + ix86_isa_flags |= OPTION_MASK_ISA_SSE3; + if (processor_alias_table[i].flags & PTA_SSSE3 + && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3)) + ix86_isa_flags |= OPTION_MASK_ISA_SSSE3; + if (processor_alias_table[i].flags & PTA_SSE4_1 + && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1)) + ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1; + if (processor_alias_table[i].flags & PTA_SSE4_2 + && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2)) + ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2; + if (processor_alias_table[i].flags & PTA_AVX + && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX)) + ix86_isa_flags |= OPTION_MASK_ISA_AVX; + if (processor_alias_table[i].flags & PTA_FMA + && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA)) + ix86_isa_flags |= OPTION_MASK_ISA_FMA; + if (processor_alias_table[i].flags & PTA_SSE4A + && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A)) + ix86_isa_flags |= OPTION_MASK_ISA_SSE4A; + if (processor_alias_table[i].flags & PTA_FMA4 + && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4)) + ix86_isa_flags |= OPTION_MASK_ISA_FMA4; + if (processor_alias_table[i].flags & PTA_XOP + && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_XOP)) + ix86_isa_flags |= OPTION_MASK_ISA_XOP; + if (processor_alias_table[i].flags & PTA_LWP + && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_LWP)) + ix86_isa_flags |= OPTION_MASK_ISA_LWP; + if (processor_alias_table[i].flags & PTA_ABM + && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM)) + ix86_isa_flags |= OPTION_MASK_ISA_ABM; + if (processor_alias_table[i].flags & PTA_BMI + && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI)) + ix86_isa_flags |= OPTION_MASK_ISA_BMI; + if (processor_alias_table[i].flags & PTA_TBM + && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM)) + ix86_isa_flags |= OPTION_MASK_ISA_TBM; + if (processor_alias_table[i].flags & PTA_CX16 + && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_CX16)) + ix86_isa_flags |= OPTION_MASK_ISA_CX16; + if (processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM) + && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_POPCNT)) + ix86_isa_flags |= OPTION_MASK_ISA_POPCNT; + if (!(TARGET_64BIT && (processor_alias_table[i].flags & PTA_NO_SAHF)) + && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SAHF)) + ix86_isa_flags |= OPTION_MASK_ISA_SAHF; + if (processor_alias_table[i].flags & PTA_MOVBE + && 
!(ix86_isa_flags_explicit & OPTION_MASK_ISA_MOVBE)) + ix86_isa_flags |= OPTION_MASK_ISA_MOVBE; + if (processor_alias_table[i].flags & PTA_AES + && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AES)) + ix86_isa_flags |= OPTION_MASK_ISA_AES; + if (processor_alias_table[i].flags & PTA_PCLMUL + && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_PCLMUL)) + ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL; + if (processor_alias_table[i].flags & PTA_FSGSBASE + && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FSGSBASE)) + ix86_isa_flags |= OPTION_MASK_ISA_FSGSBASE; + if (processor_alias_table[i].flags & PTA_RDRND + && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_RDRND)) + ix86_isa_flags |= OPTION_MASK_ISA_RDRND; + if (processor_alias_table[i].flags & PTA_F16C + && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_F16C)) + ix86_isa_flags |= OPTION_MASK_ISA_F16C; + if (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE)) + x86_prefetch_sse = true; + + break; + } + + if (!strcmp (ix86_arch_string, "generic")) + error ("generic CPU can be used only for %stune=%s %s", + prefix, suffix, sw); + else if (!strncmp (ix86_arch_string, "generic", 7) || i == pta_size) + error ("bad value (%s) for %sarch=%s %s", + ix86_arch_string, prefix, suffix, sw); + + ix86_arch_mask = 1u << ix86_arch; + for (i = 0; i < X86_ARCH_LAST; ++i) + ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & ix86_arch_mask); + + for (i = 0; i < pta_size; i++) + if (! strcmp (ix86_tune_string, processor_alias_table[i].name)) + { + ix86_schedule = processor_alias_table[i].schedule; + ix86_tune = processor_alias_table[i].processor; + if (TARGET_64BIT) + { + if (!(processor_alias_table[i].flags & PTA_64BIT)) + { + if (ix86_tune_defaulted) + { + ix86_tune_string = "x86-64"; + for (i = 0; i < pta_size; i++) + if (! strcmp (ix86_tune_string, + processor_alias_table[i].name)) + break; + ix86_schedule = processor_alias_table[i].schedule; + ix86_tune = processor_alias_table[i].processor; + } + else + error ("CPU you selected does not support x86-64 " + "instruction set"); + } + } + else + { + /* Adjust tuning when compiling for 32-bit ABI. */ + switch (ix86_tune) + { + case PROCESSOR_GENERIC64: + ix86_tune = PROCESSOR_GENERIC32; + ix86_schedule = CPU_PENTIUMPRO; + break; + + case PROCESSOR_CORE2_64: + ix86_tune = PROCESSOR_CORE2_32; + break; + + case PROCESSOR_COREI7_64: + ix86_tune = PROCESSOR_COREI7_32; + break; + + default: + break; + } + } + /* Intel CPUs have always interpreted SSE prefetch instructions as + NOPs; so, we can enable SSE prefetch instructions even when + -mtune (rather than -march) points us to a processor that has them. + However, the VIA C3 gives a SIGILL, so we only do that for i686 and + higher processors. */ + if (TARGET_CMOV + && (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE))) + x86_prefetch_sse = true; + break; + } + + if (ix86_tune_specified && i == pta_size) + error ("bad value (%s) for %stune=%s %s", + ix86_tune_string, prefix, suffix, sw); + + ix86_tune_mask = 1u << ix86_tune; + for (i = 0; i < X86_TUNE_LAST; ++i) + ix86_tune_features[i] = !!(initial_ix86_tune_features[i] & ix86_tune_mask); + +#ifndef USE_IX86_FRAME_POINTER +#define USE_IX86_FRAME_POINTER 0 +#endif + +#ifndef USE_X86_64_FRAME_POINTER +#define USE_X86_64_FRAME_POINTER 0 +#endif + + /* Set the default values for switches whose default depends on TARGET_64BIT + in case they weren't overwritten by command line options. 
*/ + if (TARGET_64BIT) + { + if (optimize > 1 && !global_options_set.x_flag_zee) + flag_zee = 1; + if (optimize >= 1 && !global_options_set.x_flag_omit_frame_pointer) + flag_omit_frame_pointer = !USE_X86_64_FRAME_POINTER; + if (flag_asynchronous_unwind_tables == 2) + flag_unwind_tables = flag_asynchronous_unwind_tables = 1; + if (flag_pcc_struct_return == 2) + flag_pcc_struct_return = 0; + } + else + { + if (optimize >= 1 && !global_options_set.x_flag_omit_frame_pointer) + flag_omit_frame_pointer = !(USE_IX86_FRAME_POINTER || optimize_size); + if (flag_asynchronous_unwind_tables == 2) + flag_asynchronous_unwind_tables = !USE_IX86_FRAME_POINTER; + if (flag_pcc_struct_return == 2) + flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN; + } + + if (optimize_size) + ix86_cost = &ix86_size_cost; + else + ix86_cost = processor_target_table[ix86_tune].cost; + + /* Arrange to set up i386_stack_locals for all functions. */ + init_machine_status = ix86_init_machine_status; + + /* Validate -mregparm= value. */ + if (ix86_regparm_string) + { + if (TARGET_64BIT) + warning (0, "%sregparm%s is ignored in 64-bit mode", prefix, suffix); + i = atoi (ix86_regparm_string); + if (i < 0 || i > REGPARM_MAX) + error ("%sregparm=%d%s is not between 0 and %d", + prefix, i, suffix, REGPARM_MAX); + else + ix86_regparm = i; + } + if (TARGET_64BIT) + ix86_regparm = REGPARM_MAX; + + /* If the user has provided any of the -malign-* options, + warn and use that value only if -falign-* is not set. + Remove this code in GCC 3.2 or later. */ + if (ix86_align_loops_string) + { + warning (0, "%salign-loops%s is obsolete, use -falign-loops%s", + prefix, suffix, suffix); + if (align_loops == 0) + { + i = atoi (ix86_align_loops_string); + if (i < 0 || i > MAX_CODE_ALIGN) + error ("%salign-loops=%d%s is not between 0 and %d", + prefix, i, suffix, MAX_CODE_ALIGN); + else + align_loops = 1 << i; + } + } + + if (ix86_align_jumps_string) + { + warning (0, "%salign-jumps%s is obsolete, use -falign-jumps%s", + prefix, suffix, suffix); + if (align_jumps == 0) + { + i = atoi (ix86_align_jumps_string); + if (i < 0 || i > MAX_CODE_ALIGN) + error ("%salign-loops=%d%s is not between 0 and %d", + prefix, i, suffix, MAX_CODE_ALIGN); + else + align_jumps = 1 << i; + } + } + + if (ix86_align_funcs_string) + { + warning (0, "%salign-functions%s is obsolete, use -falign-functions%s", + prefix, suffix, suffix); + if (align_functions == 0) + { + i = atoi (ix86_align_funcs_string); + if (i < 0 || i > MAX_CODE_ALIGN) + error ("%salign-loops=%d%s is not between 0 and %d", + prefix, i, suffix, MAX_CODE_ALIGN); + else + align_functions = 1 << i; + } + } + + /* Default align_* from the processor table. */ + if (align_loops == 0) + { + align_loops = processor_target_table[ix86_tune].align_loop; + align_loops_max_skip = processor_target_table[ix86_tune].align_loop_max_skip; + } + if (align_jumps == 0) + { + align_jumps = processor_target_table[ix86_tune].align_jump; + align_jumps_max_skip = processor_target_table[ix86_tune].align_jump_max_skip; + } + if (align_functions == 0) + { + align_functions = processor_target_table[ix86_tune].align_func; + } + + /* Validate -mbranch-cost= value, or provide default. 
*/ + ix86_branch_cost = ix86_cost->branch_cost; + if (ix86_branch_cost_string) + { + i = atoi (ix86_branch_cost_string); + if (i < 0 || i > 5) + error ("%sbranch-cost=%d%s is not between 0 and 5", prefix, i, suffix); + else + ix86_branch_cost = i; + } + if (ix86_section_threshold_string) + { + i = atoi (ix86_section_threshold_string); + if (i < 0) + error ("%slarge-data-threshold=%d%s is negative", prefix, i, suffix); + else + ix86_section_threshold = i; + } + + if (ix86_tls_dialect_string) + { + if (strcmp (ix86_tls_dialect_string, "gnu") == 0) + ix86_tls_dialect = TLS_DIALECT_GNU; + else if (strcmp (ix86_tls_dialect_string, "gnu2") == 0) + ix86_tls_dialect = TLS_DIALECT_GNU2; + else + error ("bad value (%s) for %stls-dialect=%s %s", + ix86_tls_dialect_string, prefix, suffix, sw); + } + + if (ix87_precision_string) + { + i = atoi (ix87_precision_string); + if (i != 32 && i != 64 && i != 80) + error ("pc%d is not valid precision setting (32, 64 or 80)", i); + } + + if (TARGET_64BIT) + { + target_flags |= TARGET_SUBTARGET64_DEFAULT & ~target_flags_explicit; + + /* Enable by default the SSE and MMX builtins. Do allow the user to + explicitly disable any of these. In particular, disabling SSE and + MMX for kernel code is extremely useful. */ + if (!ix86_arch_specified) + ix86_isa_flags + |= ((OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX + | TARGET_SUBTARGET64_ISA_DEFAULT) & ~ix86_isa_flags_explicit); + + if (TARGET_RTD) + warning (0, "%srtd%s is ignored in 64bit mode", prefix, suffix); + } + else + { + target_flags |= TARGET_SUBTARGET32_DEFAULT & ~target_flags_explicit; + + if (!ix86_arch_specified) + ix86_isa_flags + |= TARGET_SUBTARGET32_ISA_DEFAULT & ~ix86_isa_flags_explicit; + + /* i386 ABI does not specify red zone. It still makes sense to use it + when programmer takes care to stack from being destroyed. */ + if (!(target_flags_explicit & MASK_NO_RED_ZONE)) + target_flags |= MASK_NO_RED_ZONE; + } + + /* Keep nonleaf frame pointers. */ + if (flag_omit_frame_pointer) + target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER; + else if (TARGET_OMIT_LEAF_FRAME_POINTER) + flag_omit_frame_pointer = 1; + + /* If we're doing fast math, we don't care about comparison order + wrt NaNs. This lets us use a shorter comparison sequence. */ + if (flag_finite_math_only) + target_flags &= ~MASK_IEEE_FP; + + /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387, + since the insns won't need emulation. */ + if (x86_arch_always_fancy_math_387 & ix86_arch_mask) + target_flags &= ~MASK_NO_FANCY_MATH_387; + + /* Likewise, if the target doesn't have a 387, or we've specified + software floating point, don't use 387 inline intrinsics. */ + if (!TARGET_80387) + target_flags |= MASK_NO_FANCY_MATH_387; + + /* Turn on MMX builtins for -msse. */ + if (TARGET_SSE) + { + ix86_isa_flags |= OPTION_MASK_ISA_MMX & ~ix86_isa_flags_explicit; + x86_prefetch_sse = true; + } + + /* Turn on popcnt instruction for -msse4.2 or -mabm. */ + if (TARGET_SSE4_2 || TARGET_ABM) + ix86_isa_flags |= OPTION_MASK_ISA_POPCNT & ~ix86_isa_flags_explicit; + + /* Validate -mpreferred-stack-boundary= value or default it to + PREFERRED_STACK_BOUNDARY_DEFAULT. */ + ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT; + if (ix86_preferred_stack_boundary_string) + { + int min = (TARGET_64BIT ? 4 : 2); + int max = (TARGET_SEH ? 
4 : 12); + + i = atoi (ix86_preferred_stack_boundary_string); + if (i < min || i > max) + { + if (min == max) + error ("%spreferred-stack-boundary%s is not supported " + "for this target", prefix, suffix); + else + error ("%spreferred-stack-boundary=%d%s is not between %d and %d", + prefix, i, suffix, min, max); + } + else + ix86_preferred_stack_boundary = (1 << i) * BITS_PER_UNIT; + } + + /* Set the default value for -mstackrealign. */ + if (ix86_force_align_arg_pointer == -1) + ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT; + + ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY; + + /* Validate -mincoming-stack-boundary= value or default it to + MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY. */ + ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary; + if (ix86_incoming_stack_boundary_string) + { + i = atoi (ix86_incoming_stack_boundary_string); + if (i < (TARGET_64BIT ? 4 : 2) || i > 12) + error ("-mincoming-stack-boundary=%d is not between %d and 12", + i, TARGET_64BIT ? 4 : 2); + else + { + ix86_user_incoming_stack_boundary = (1 << i) * BITS_PER_UNIT; + ix86_incoming_stack_boundary + = ix86_user_incoming_stack_boundary; + } + } + + /* Accept -msseregparm only if at least SSE support is enabled. */ + if (TARGET_SSEREGPARM + && ! TARGET_SSE) + error ("%ssseregparm%s used without SSE enabled", prefix, suffix); + + ix86_fpmath = TARGET_FPMATH_DEFAULT; + if (ix86_fpmath_string != 0) + { + if (! strcmp (ix86_fpmath_string, "387")) + ix86_fpmath = FPMATH_387; + else if (! strcmp (ix86_fpmath_string, "sse")) + { + if (!TARGET_SSE) + { + warning (0, "SSE instruction set disabled, using 387 arithmetics"); + ix86_fpmath = FPMATH_387; + } + else + ix86_fpmath = FPMATH_SSE; + } + else if (! strcmp (ix86_fpmath_string, "387,sse") + || ! strcmp (ix86_fpmath_string, "387+sse") + || ! strcmp (ix86_fpmath_string, "sse,387") + || ! strcmp (ix86_fpmath_string, "sse+387") + || ! strcmp (ix86_fpmath_string, "both")) + { + if (!TARGET_SSE) + { + warning (0, "SSE instruction set disabled, using 387 arithmetics"); + ix86_fpmath = FPMATH_387; + } + else if (!TARGET_80387) + { + warning (0, "387 instruction set disabled, using SSE arithmetics"); + ix86_fpmath = FPMATH_SSE; + } + else + ix86_fpmath = (enum fpmath_unit) (FPMATH_SSE | FPMATH_387); + } + else + error ("bad value (%s) for %sfpmath=%s %s", + ix86_fpmath_string, prefix, suffix, sw); + } + + /* If the i387 is disabled, then do not return values in it. */ + if (!TARGET_80387) + target_flags &= ~MASK_FLOAT_RETURNS; + + /* Use external vectorized library in vectorizing intrinsics. */ + if (ix86_veclibabi_string) + { + if (strcmp (ix86_veclibabi_string, "svml") == 0) + ix86_veclib_handler = ix86_veclibabi_svml; + else if (strcmp (ix86_veclibabi_string, "acml") == 0) + ix86_veclib_handler = ix86_veclibabi_acml; + else + error ("unknown vectorization library ABI type (%s) for " + "%sveclibabi=%s %s", ix86_veclibabi_string, + prefix, suffix, sw); + } + + if ((!USE_IX86_FRAME_POINTER + || (x86_accumulate_outgoing_args & ix86_tune_mask)) + && !(target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS) + && !optimize_size) + target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS; + + /* ??? Unwind info is not correct around the CFG unless either a frame + pointer is present or M_A_O_A is set. Fixing this requires rewriting + unwind info generation to be aware of the CFG and propagating states + around edges. 
*/ + if ((flag_unwind_tables || flag_asynchronous_unwind_tables + || flag_exceptions || flag_non_call_exceptions) + && flag_omit_frame_pointer + && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)) + { + if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS) + warning (0, "unwind tables currently require either a frame pointer " + "or %saccumulate-outgoing-args%s for correctness", + prefix, suffix); + target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS; + } + + /* If stack probes are required, the space used for large function + arguments on the stack must also be probed, so enable + -maccumulate-outgoing-args so this happens in the prologue. */ + if (TARGET_STACK_PROBE + && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)) + { + if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS) + warning (0, "stack probing requires %saccumulate-outgoing-args%s " + "for correctness", prefix, suffix); + target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS; + } + + /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. */ + { + char *p; + ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0); + p = strchr (internal_label_prefix, 'X'); + internal_label_prefix_len = p - internal_label_prefix; + *p = '\0'; + } + + /* When scheduling description is not available, disable scheduler pass + so it won't slow down the compilation and make x87 code slower. */ + if (!TARGET_SCHEDULE) + flag_schedule_insns_after_reload = flag_schedule_insns = 0; + + maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES, + ix86_cost->simultaneous_prefetches, + global_options.x_param_values, + global_options_set.x_param_values); + maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE, ix86_cost->prefetch_block, + global_options.x_param_values, + global_options_set.x_param_values); + maybe_set_param_value (PARAM_L1_CACHE_SIZE, ix86_cost->l1_cache_size, + global_options.x_param_values, + global_options_set.x_param_values); + maybe_set_param_value (PARAM_L2_CACHE_SIZE, ix86_cost->l2_cache_size, + global_options.x_param_values, + global_options_set.x_param_values); + + /* Enable sw prefetching at -O3 for CPUS that prefetching is helpful. */ + if (flag_prefetch_loop_arrays < 0 + && HAVE_prefetch + && optimize >= 3 + && software_prefetching_beneficial_p ()) + flag_prefetch_loop_arrays = 1; + + /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0) + can be optimized to ap = __builtin_next_arg (0). */ + if (!TARGET_64BIT && !flag_split_stack) + targetm.expand_builtin_va_start = NULL; + + if (TARGET_64BIT) + { + ix86_gen_leave = gen_leave_rex64; + ix86_gen_add3 = gen_adddi3; + ix86_gen_sub3 = gen_subdi3; + ix86_gen_sub3_carry = gen_subdi3_carry; + ix86_gen_one_cmpl2 = gen_one_cmpldi2; + ix86_gen_monitor = gen_sse3_monitor64; + ix86_gen_andsp = gen_anddi3; + ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_di; + ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probedi; + ix86_gen_probe_stack_range = gen_probe_stack_rangedi; + } + else + { + ix86_gen_leave = gen_leave; + ix86_gen_add3 = gen_addsi3; + ix86_gen_sub3 = gen_subsi3; + ix86_gen_sub3_carry = gen_subsi3_carry; + ix86_gen_one_cmpl2 = gen_one_cmplsi2; + ix86_gen_monitor = gen_sse3_monitor; + ix86_gen_andsp = gen_andsi3; + ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_si; + ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probesi; + ix86_gen_probe_stack_range = gen_probe_stack_rangesi; + } + +#ifdef USE_IX86_CLD + /* Use -mcld by default for 32-bit code if configured with --enable-cld. 
*/ + if (!TARGET_64BIT) + target_flags |= MASK_CLD & ~target_flags_explicit; +#endif + + if (!TARGET_64BIT && flag_pic) + { + if (flag_fentry > 0) + sorry ("-mfentry isn%'t supported for 32-bit in combination " + "with -fpic"); + flag_fentry = 0; + } + else if (TARGET_SEH) + { + if (flag_fentry == 0) + sorry ("-mno-fentry isn%'t compatible with SEH"); + flag_fentry = 1; + } + else if (flag_fentry < 0) + { +#if defined(PROFILE_BEFORE_PROLOGUE) + flag_fentry = 1; +#else + flag_fentry = 0; +#endif + } + + /* Save the initial options in case the user does function specific options */ + if (main_args_p) + target_option_default_node = target_option_current_node + = build_target_option_node (); + + if (TARGET_AVX) + { + /* When not optimize for size, enable vzeroupper optimization for + TARGET_AVX with -fexpensive-optimizations and split 32-byte + AVX unaligned load/store. */ + if (!optimize_size) + { + if (flag_expensive_optimizations + && !(target_flags_explicit & MASK_VZEROUPPER)) + target_flags |= MASK_VZEROUPPER; + if ((x86_avx256_split_unaligned_load & ix86_tune_mask) + && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_LOAD)) + target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD; + if ((x86_avx256_split_unaligned_store & ix86_tune_mask) + && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_STORE)) + target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE; + /* Enable 128-bit AVX instruction generation for the auto-vectorizer. */ + if (TARGET_AVX128_OPTIMAL && !(target_flags_explicit & MASK_PREFER_AVX128)) + target_flags |= MASK_PREFER_AVX128; + } + } + else + { + /* Disable vzeroupper pass if TARGET_AVX is disabled. */ + target_flags &= ~MASK_VZEROUPPER; + } +} + +/* Return TRUE if VAL is passed in register with 256bit AVX modes. */ + +static bool +function_pass_avx256_p (const_rtx val) +{ + if (!val) + return false; + + if (REG_P (val) && VALID_AVX256_REG_MODE (GET_MODE (val))) + return true; + + if (GET_CODE (val) == PARALLEL) + { + int i; + rtx r; + + for (i = XVECLEN (val, 0) - 1; i >= 0; i--) + { + r = XVECEXP (val, 0, i); + if (GET_CODE (r) == EXPR_LIST + && XEXP (r, 0) + && REG_P (XEXP (r, 0)) + && (GET_MODE (XEXP (r, 0)) == OImode + || VALID_AVX256_REG_MODE (GET_MODE (XEXP (r, 0))))) + return true; + } + } + + return false; +} + +/* Implement the TARGET_OPTION_OVERRIDE hook. */ + +static void +ix86_option_override (void) +{ + ix86_option_override_internal (true); +} + +/* Update register usage after having seen the compiler flags. */ + +static void +ix86_conditional_register_usage (void) +{ + int i; + unsigned int j; + + for (i = 0; i < FIRST_PSEUDO_REGISTER; i++) + { + if (fixed_regs[i] > 1) + fixed_regs[i] = (fixed_regs[i] == (TARGET_64BIT ? 3 : 2)); + if (call_used_regs[i] > 1) + call_used_regs[i] = (call_used_regs[i] == (TARGET_64BIT ? 3 : 2)); + } + + /* The PIC register, if it exists, is fixed. */ + j = PIC_OFFSET_TABLE_REGNUM; + if (j != INVALID_REGNUM) + fixed_regs[j] = call_used_regs[j] = 1; + + /* The MS_ABI changes the set of call-used registers. */ + if (TARGET_64BIT && ix86_cfun_abi () == MS_ABI) + { + call_used_regs[SI_REG] = 0; + call_used_regs[DI_REG] = 0; + call_used_regs[XMM6_REG] = 0; + call_used_regs[XMM7_REG] = 0; + for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++) + call_used_regs[i] = 0; + } + + /* The default setting of CLOBBERED_REGS is for 32-bit; add in the + other call-clobbered regs for 64-bit. 
*/ + if (TARGET_64BIT) + { + CLEAR_HARD_REG_SET (reg_class_contents[(int)CLOBBERED_REGS]); + + for (i = 0; i < FIRST_PSEUDO_REGISTER; i++) + if (TEST_HARD_REG_BIT (reg_class_contents[(int)GENERAL_REGS], i) + && call_used_regs[i]) + SET_HARD_REG_BIT (reg_class_contents[(int)CLOBBERED_REGS], i); + } + + /* If MMX is disabled, squash the registers. */ + if (! TARGET_MMX) + for (i = 0; i < FIRST_PSEUDO_REGISTER; i++) + if (TEST_HARD_REG_BIT (reg_class_contents[(int)MMX_REGS], i)) + fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = ""; + + /* If SSE is disabled, squash the registers. */ + if (! TARGET_SSE) + for (i = 0; i < FIRST_PSEUDO_REGISTER; i++) + if (TEST_HARD_REG_BIT (reg_class_contents[(int)SSE_REGS], i)) + fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = ""; + + /* If the FPU is disabled, squash the registers. */ + if (! (TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387)) + for (i = 0; i < FIRST_PSEUDO_REGISTER; i++) + if (TEST_HARD_REG_BIT (reg_class_contents[(int)FLOAT_REGS], i)) + fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = ""; + + /* If 32-bit, squash the 64-bit registers. */ + if (! TARGET_64BIT) + { + for (i = FIRST_REX_INT_REG; i <= LAST_REX_INT_REG; i++) + reg_names[i] = ""; + for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++) + reg_names[i] = ""; + } +} + + +/* Save the current options */ + +static void +ix86_function_specific_save (struct cl_target_option *ptr) +{ + ptr->arch = ix86_arch; + ptr->schedule = ix86_schedule; + ptr->tune = ix86_tune; + ptr->fpmath = ix86_fpmath; + ptr->branch_cost = ix86_branch_cost; + ptr->tune_defaulted = ix86_tune_defaulted; + ptr->arch_specified = ix86_arch_specified; + ptr->ix86_isa_flags_explicit = ix86_isa_flags_explicit; + ptr->ix86_target_flags_explicit = target_flags_explicit; + + /* The fields are char but the variables are not; make sure the + values fit in the fields. 
*/ + gcc_assert (ptr->arch == ix86_arch); + gcc_assert (ptr->schedule == ix86_schedule); + gcc_assert (ptr->tune == ix86_tune); + gcc_assert (ptr->fpmath == ix86_fpmath); + gcc_assert (ptr->branch_cost == ix86_branch_cost); +} + +/* Restore the current options */ + +static void +ix86_function_specific_restore (struct cl_target_option *ptr) +{ + enum processor_type old_tune = ix86_tune; + enum processor_type old_arch = ix86_arch; + unsigned int ix86_arch_mask, ix86_tune_mask; + int i; + + ix86_arch = (enum processor_type) ptr->arch; + ix86_schedule = (enum attr_cpu) ptr->schedule; + ix86_tune = (enum processor_type) ptr->tune; + ix86_fpmath = (enum fpmath_unit) ptr->fpmath; + ix86_branch_cost = ptr->branch_cost; + ix86_tune_defaulted = ptr->tune_defaulted; + ix86_arch_specified = ptr->arch_specified; + ix86_isa_flags_explicit = ptr->ix86_isa_flags_explicit; + target_flags_explicit = ptr->ix86_target_flags_explicit; + + /* Recreate the arch feature tests if the arch changed */ + if (old_arch != ix86_arch) + { + ix86_arch_mask = 1u << ix86_arch; + for (i = 0; i < X86_ARCH_LAST; ++i) + ix86_arch_features[i] + = !!(initial_ix86_arch_features[i] & ix86_arch_mask); + } + + /* Recreate the tune optimization tests */ + if (old_tune != ix86_tune) + { + ix86_tune_mask = 1u << ix86_tune; + for (i = 0; i < X86_TUNE_LAST; ++i) + ix86_tune_features[i] + = !!(initial_ix86_tune_features[i] & ix86_tune_mask); + } +} + +/* Print the current options */ + +static void +ix86_function_specific_print (FILE *file, int indent, + struct cl_target_option *ptr) +{ + char *target_string + = ix86_target_string (ptr->x_ix86_isa_flags, ptr->x_target_flags, + NULL, NULL, NULL, false); + + fprintf (file, "%*sarch = %d (%s)\n", + indent, "", + ptr->arch, + ((ptr->arch < TARGET_CPU_DEFAULT_max) + ? cpu_names[ptr->arch] + : "")); + + fprintf (file, "%*stune = %d (%s)\n", + indent, "", + ptr->tune, + ((ptr->tune < TARGET_CPU_DEFAULT_max) + ? cpu_names[ptr->tune] + : "")); + + fprintf (file, "%*sfpmath = %d%s%s\n", indent, "", ptr->fpmath, + (ptr->fpmath & FPMATH_387) ? ", 387" : "", + (ptr->fpmath & FPMATH_SSE) ? ", sse" : ""); + fprintf (file, "%*sbranch_cost = %d\n", indent, "", ptr->branch_cost); + + if (target_string) + { + fprintf (file, "%*s%s\n", indent, "", target_string); + free (target_string); + } +} + + +/* Inner function to process the attribute((target(...))), take an argument and + set the current options from the argument. If we have a list, recursively go + over the list. 
*/ + +static bool +ix86_valid_target_attribute_inner_p (tree args, char *p_strings[]) +{ + char *next_optstr; + bool ret = true; + +#define IX86_ATTR_ISA(S,O) { S, sizeof (S)-1, ix86_opt_isa, O, 0 } +#define IX86_ATTR_STR(S,O) { S, sizeof (S)-1, ix86_opt_str, O, 0 } +#define IX86_ATTR_YES(S,O,M) { S, sizeof (S)-1, ix86_opt_yes, O, M } +#define IX86_ATTR_NO(S,O,M) { S, sizeof (S)-1, ix86_opt_no, O, M } + + enum ix86_opt_type + { + ix86_opt_unknown, + ix86_opt_yes, + ix86_opt_no, + ix86_opt_str, + ix86_opt_isa + }; + + static const struct + { + const char *string; + size_t len; + enum ix86_opt_type type; + int opt; + int mask; + } attrs[] = { + /* isa options */ + IX86_ATTR_ISA ("3dnow", OPT_m3dnow), + IX86_ATTR_ISA ("abm", OPT_mabm), + IX86_ATTR_ISA ("bmi", OPT_mbmi), + IX86_ATTR_ISA ("tbm", OPT_mtbm), + IX86_ATTR_ISA ("aes", OPT_maes), + IX86_ATTR_ISA ("avx", OPT_mavx), + IX86_ATTR_ISA ("mmx", OPT_mmmx), + IX86_ATTR_ISA ("pclmul", OPT_mpclmul), + IX86_ATTR_ISA ("popcnt", OPT_mpopcnt), + IX86_ATTR_ISA ("sse", OPT_msse), + IX86_ATTR_ISA ("sse2", OPT_msse2), + IX86_ATTR_ISA ("sse3", OPT_msse3), + IX86_ATTR_ISA ("sse4", OPT_msse4), + IX86_ATTR_ISA ("sse4.1", OPT_msse4_1), + IX86_ATTR_ISA ("sse4.2", OPT_msse4_2), + IX86_ATTR_ISA ("sse4a", OPT_msse4a), + IX86_ATTR_ISA ("ssse3", OPT_mssse3), + IX86_ATTR_ISA ("fma4", OPT_mfma4), + IX86_ATTR_ISA ("xop", OPT_mxop), + IX86_ATTR_ISA ("lwp", OPT_mlwp), + IX86_ATTR_ISA ("fsgsbase", OPT_mfsgsbase), + IX86_ATTR_ISA ("rdrnd", OPT_mrdrnd), + IX86_ATTR_ISA ("f16c", OPT_mf16c), + + /* string options */ + IX86_ATTR_STR ("arch=", IX86_FUNCTION_SPECIFIC_ARCH), + IX86_ATTR_STR ("fpmath=", IX86_FUNCTION_SPECIFIC_FPMATH), + IX86_ATTR_STR ("tune=", IX86_FUNCTION_SPECIFIC_TUNE), + + /* flag options */ + IX86_ATTR_YES ("cld", + OPT_mcld, + MASK_CLD), + + IX86_ATTR_NO ("fancy-math-387", + OPT_mfancy_math_387, + MASK_NO_FANCY_MATH_387), + + IX86_ATTR_YES ("ieee-fp", + OPT_mieee_fp, + MASK_IEEE_FP), + + IX86_ATTR_YES ("inline-all-stringops", + OPT_minline_all_stringops, + MASK_INLINE_ALL_STRINGOPS), + + IX86_ATTR_YES ("inline-stringops-dynamically", + OPT_minline_stringops_dynamically, + MASK_INLINE_STRINGOPS_DYNAMICALLY), + + IX86_ATTR_NO ("align-stringops", + OPT_mno_align_stringops, + MASK_NO_ALIGN_STRINGOPS), + + IX86_ATTR_YES ("recip", + OPT_mrecip, + MASK_RECIP), + + }; + + /* If this is a list, recurse to get the options. */ + if (TREE_CODE (args) == TREE_LIST) + { + bool ret = true; + + for (; args; args = TREE_CHAIN (args)) + if (TREE_VALUE (args) + && !ix86_valid_target_attribute_inner_p (TREE_VALUE (args), p_strings)) + ret = false; + + return ret; + } + + else if (TREE_CODE (args) != STRING_CST) + gcc_unreachable (); + + /* Handle multiple arguments separated by commas. */ + next_optstr = ASTRDUP (TREE_STRING_POINTER (args)); + + while (next_optstr && *next_optstr != '\0') + { + char *p = next_optstr; + char *orig_p = p; + char *comma = strchr (next_optstr, ','); + const char *opt_string; + size_t len, opt_len; + int opt; + bool opt_set_p; + char ch; + unsigned i; + enum ix86_opt_type type = ix86_opt_unknown; + int mask = 0; + + if (comma) + { + *comma = '\0'; + len = comma - next_optstr; + next_optstr = comma + 1; + } + else + { + len = strlen (p); + next_optstr = NULL; + } + + /* Recognize no-xxx. */ + if (len > 3 && p[0] == 'n' && p[1] == 'o' && p[2] == '-') + { + opt_set_p = false; + p += 3; + len -= 3; + } + else + opt_set_p = true; + + /* Find the option. 
*/ + ch = *p; + opt = N_OPTS; + for (i = 0; i < ARRAY_SIZE (attrs); i++) + { + type = attrs[i].type; + opt_len = attrs[i].len; + if (ch == attrs[i].string[0] + && ((type != ix86_opt_str) ? len == opt_len : len > opt_len) + && memcmp (p, attrs[i].string, opt_len) == 0) + { + opt = attrs[i].opt; + mask = attrs[i].mask; + opt_string = attrs[i].string; + break; + } + } + + /* Process the option. */ + if (opt == N_OPTS) + { + error ("attribute(target(\"%s\")) is unknown", orig_p); + ret = false; + } + + else if (type == ix86_opt_isa) + ix86_handle_option (opt, p, opt_set_p); + + else if (type == ix86_opt_yes || type == ix86_opt_no) + { + if (type == ix86_opt_no) + opt_set_p = !opt_set_p; + + if (opt_set_p) + target_flags |= mask; + else + target_flags &= ~mask; + } + + else if (type == ix86_opt_str) + { + if (p_strings[opt]) + { + error ("option(\"%s\") was already specified", opt_string); + ret = false; + } + else + p_strings[opt] = xstrdup (p + opt_len); + } + + else + gcc_unreachable (); + } + + return ret; +} + +/* Return a TARGET_OPTION_NODE tree of the target options listed or NULL. */ + +tree +ix86_valid_target_attribute_tree (tree args) +{ + const char *orig_arch_string = ix86_arch_string; + const char *orig_tune_string = ix86_tune_string; + const char *orig_fpmath_string = ix86_fpmath_string; + int orig_tune_defaulted = ix86_tune_defaulted; + int orig_arch_specified = ix86_arch_specified; + char *option_strings[IX86_FUNCTION_SPECIFIC_MAX] = { NULL, NULL, NULL }; + tree t = NULL_TREE; + int i; + struct cl_target_option *def + = TREE_TARGET_OPTION (target_option_default_node); + + /* Process each of the options on the chain. */ + if (! ix86_valid_target_attribute_inner_p (args, option_strings)) + return NULL_TREE; + + /* If the changed options are different from the default, rerun + ix86_option_override_internal, and then save the options away. + The string options are are attribute options, and will be undone + when we copy the save structure. */ + if (ix86_isa_flags != def->x_ix86_isa_flags + || target_flags != def->x_target_flags + || option_strings[IX86_FUNCTION_SPECIFIC_ARCH] + || option_strings[IX86_FUNCTION_SPECIFIC_TUNE] + || option_strings[IX86_FUNCTION_SPECIFIC_FPMATH]) + { + /* If we are using the default tune= or arch=, undo the string assigned, + and use the default. */ + if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH]) + ix86_arch_string = option_strings[IX86_FUNCTION_SPECIFIC_ARCH]; + else if (!orig_arch_specified) + ix86_arch_string = NULL; + + if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE]) + ix86_tune_string = option_strings[IX86_FUNCTION_SPECIFIC_TUNE]; + else if (orig_tune_defaulted) + ix86_tune_string = NULL; + + /* If fpmath= is not set, and we now have sse2 on 32-bit, use it. */ + if (option_strings[IX86_FUNCTION_SPECIFIC_FPMATH]) + ix86_fpmath_string = option_strings[IX86_FUNCTION_SPECIFIC_FPMATH]; + else if (!TARGET_64BIT && TARGET_SSE) + ix86_fpmath_string = "sse,387"; + + /* Do any overrides, such as arch=xxx, or tune=xxx support. */ + ix86_option_override_internal (false); + + /* Add any builtin functions with the new isa if any. */ + ix86_add_new_builtins (ix86_isa_flags); + + /* Save the current options unless we are validating options for + #pragma. 
*/ + t = build_target_option_node (); + + ix86_arch_string = orig_arch_string; + ix86_tune_string = orig_tune_string; + ix86_fpmath_string = orig_fpmath_string; + + /* Free up memory allocated to hold the strings */ + for (i = 0; i < IX86_FUNCTION_SPECIFIC_MAX; i++) + if (option_strings[i]) + free (option_strings[i]); + } + + return t; +} + +/* Hook to validate attribute((target("string"))). */ + +static bool +ix86_valid_target_attribute_p (tree fndecl, + tree ARG_UNUSED (name), + tree args, + int ARG_UNUSED (flags)) +{ + struct cl_target_option cur_target; + bool ret = true; + tree old_optimize = build_optimization_node (); + tree new_target, new_optimize; + tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl); + + /* If the function changed the optimization levels as well as setting target + options, start with the optimizations specified. */ + if (func_optimize && func_optimize != old_optimize) + cl_optimization_restore (&global_options, + TREE_OPTIMIZATION (func_optimize)); + + /* The target attributes may also change some optimization flags, so update + the optimization options if necessary. */ + cl_target_option_save (&cur_target, &global_options); + new_target = ix86_valid_target_attribute_tree (args); + new_optimize = build_optimization_node (); + + if (!new_target) + ret = false; + + else if (fndecl) + { + DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target; + + if (old_optimize != new_optimize) + DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize; + } + + cl_target_option_restore (&global_options, &cur_target); + + if (old_optimize != new_optimize) + cl_optimization_restore (&global_options, + TREE_OPTIMIZATION (old_optimize)); + + return ret; +} + + +/* Hook to determine if one function can safely inline another. */ + +static bool +ix86_can_inline_p (tree caller, tree callee) +{ + bool ret = false; + tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller); + tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee); + + /* If callee has no option attributes, then it is ok to inline. */ + if (!callee_tree) + ret = true; + + /* If caller has no option attributes, but callee does then it is not ok to + inline. */ + else if (!caller_tree) + ret = false; + + else + { + struct cl_target_option *caller_opts = TREE_TARGET_OPTION (caller_tree); + struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree); + + /* Callee's isa options should a subset of the caller's, i.e. a SSE4 function + can inline a SSE2 function but a SSE2 function can't inline a SSE4 + function. */ + if ((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags) + != callee_opts->x_ix86_isa_flags) + ret = false; + + /* See if we have the same non-isa options. */ + else if (caller_opts->x_target_flags != callee_opts->x_target_flags) + ret = false; + + /* See if arch, tune, etc. are the same. */ + else if (caller_opts->arch != callee_opts->arch) + ret = false; + + else if (caller_opts->tune != callee_opts->tune) + ret = false; + + else if (caller_opts->fpmath != callee_opts->fpmath) + ret = false; + + else if (caller_opts->branch_cost != callee_opts->branch_cost) + ret = false; + + else + ret = true; + } + + return ret; +} + + +/* Remember the last target of ix86_set_current_function. */ +static GTY(()) tree ix86_previous_fndecl; + +/* Establish appropriate back-end context for processing the function + FNDECL. The argument might be NULL to indicate processing at top + level, outside of any function scope. 
*/ +static void +ix86_set_current_function (tree fndecl) +{ + /* Only change the context if the function changes. This hook is called + several times in the course of compiling a function, and we don't want to + slow things down too much or call target_reinit when it isn't safe. */ + if (fndecl && fndecl != ix86_previous_fndecl) + { + tree old_tree = (ix86_previous_fndecl + ? DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl) + : NULL_TREE); + + tree new_tree = (fndecl + ? DECL_FUNCTION_SPECIFIC_TARGET (fndecl) + : NULL_TREE); + + ix86_previous_fndecl = fndecl; + if (old_tree == new_tree) + ; + + else if (new_tree) + { + cl_target_option_restore (&global_options, + TREE_TARGET_OPTION (new_tree)); + target_reinit (); + } + + else if (old_tree) + { + struct cl_target_option *def + = TREE_TARGET_OPTION (target_option_current_node); + + cl_target_option_restore (&global_options, def); + target_reinit (); + } + } +} + + +/* Return true if this goes in large data/bss. */ + +static bool +ix86_in_large_data_p (tree exp) +{ + if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC) + return false; + + /* Functions are never large data. */ + if (TREE_CODE (exp) == FUNCTION_DECL) + return false; + + if (TREE_CODE (exp) == VAR_DECL && DECL_SECTION_NAME (exp)) + { + const char *section = TREE_STRING_POINTER (DECL_SECTION_NAME (exp)); + if (strcmp (section, ".ldata") == 0 + || strcmp (section, ".lbss") == 0) + return true; + return false; + } + else + { + HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp)); + + /* If this is an incomplete type with size 0, then we can't put it + in data because it might be too big when completed. */ + if (!size || size > ix86_section_threshold) + return true; + } + + return false; +} + +/* Switch to the appropriate section for output of DECL. + DECL is either a `VAR_DECL' node or a constant of some sort. + RELOC indicates whether forming the initial value of DECL requires + link-time relocations. */ + +static section * x86_64_elf_select_section (tree, int, unsigned HOST_WIDE_INT) + ATTRIBUTE_UNUSED; + +static section * +x86_64_elf_select_section (tree decl, int reloc, + unsigned HOST_WIDE_INT align) +{ + if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC) + && ix86_in_large_data_p (decl)) + { + const char *sname = NULL; + unsigned int flags = SECTION_WRITE; + switch (categorize_decl_for_section (decl, reloc)) + { + case SECCAT_DATA: + sname = ".ldata"; + break; + case SECCAT_DATA_REL: + sname = ".ldata.rel"; + break; + case SECCAT_DATA_REL_LOCAL: + sname = ".ldata.rel.local"; + break; + case SECCAT_DATA_REL_RO: + sname = ".ldata.rel.ro"; + break; + case SECCAT_DATA_REL_RO_LOCAL: + sname = ".ldata.rel.ro.local"; + break; + case SECCAT_BSS: + sname = ".lbss"; + flags |= SECTION_BSS; + break; + case SECCAT_RODATA: + case SECCAT_RODATA_MERGE_STR: + case SECCAT_RODATA_MERGE_STR_INIT: + case SECCAT_RODATA_MERGE_CONST: + sname = ".lrodata"; + flags = 0; + break; + case SECCAT_SRODATA: + case SECCAT_SDATA: + case SECCAT_SBSS: + gcc_unreachable (); + case SECCAT_TEXT: + case SECCAT_TDATA: + case SECCAT_TBSS: + /* We don't split these for medium model. Place them into + default sections and hope for best. */ + break; + } + if (sname) + { + /* We might get called with string constants, but get_named_section + doesn't like them as they are not DECLs. Also, we need to set + flags in that case. 
*/ + if (!DECL_P (decl)) + return get_section (sname, flags, NULL); + return get_named_section (decl, sname, reloc); + } + } + return default_elf_select_section (decl, reloc, align); +} + +/* Build up a unique section name, expressed as a + STRING_CST node, and assign it to DECL_SECTION_NAME (decl). + RELOC indicates whether the initial value of EXP requires + link-time relocations. */ + +static void ATTRIBUTE_UNUSED +x86_64_elf_unique_section (tree decl, int reloc) +{ + if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC) + && ix86_in_large_data_p (decl)) + { + const char *prefix = NULL; + /* We only need to use .gnu.linkonce if we don't have COMDAT groups. */ + bool one_only = DECL_ONE_ONLY (decl) && !HAVE_COMDAT_GROUP; + + switch (categorize_decl_for_section (decl, reloc)) + { + case SECCAT_DATA: + case SECCAT_DATA_REL: + case SECCAT_DATA_REL_LOCAL: + case SECCAT_DATA_REL_RO: + case SECCAT_DATA_REL_RO_LOCAL: + prefix = one_only ? ".ld" : ".ldata"; + break; + case SECCAT_BSS: + prefix = one_only ? ".lb" : ".lbss"; + break; + case SECCAT_RODATA: + case SECCAT_RODATA_MERGE_STR: + case SECCAT_RODATA_MERGE_STR_INIT: + case SECCAT_RODATA_MERGE_CONST: + prefix = one_only ? ".lr" : ".lrodata"; + break; + case SECCAT_SRODATA: + case SECCAT_SDATA: + case SECCAT_SBSS: + gcc_unreachable (); + case SECCAT_TEXT: + case SECCAT_TDATA: + case SECCAT_TBSS: + /* We don't split these for medium model. Place them into + default sections and hope for best. */ + break; + } + if (prefix) + { + const char *name, *linkonce; + char *string; + + name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)); + name = targetm.strip_name_encoding (name); + + /* If we're using one_only, then there needs to be a .gnu.linkonce + prefix to the section name. */ + linkonce = one_only ? ".gnu.linkonce" : ""; + + string = ACONCAT ((linkonce, prefix, ".", name, NULL)); + + DECL_SECTION_NAME (decl) = build_string (strlen (string), string); + return; + } + } + default_unique_section (decl, reloc); +} + +#ifdef COMMON_ASM_OP +/* This says how to output assembler code to declare an + uninitialized external linkage data object. + + For medium model x86-64 we need to use .largecomm opcode for + large objects. */ +void +x86_elf_aligned_common (FILE *file, + const char *name, unsigned HOST_WIDE_INT size, + int align) +{ + if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC) + && size > (unsigned int)ix86_section_threshold) + fputs (".largecomm\t", file); + else + fputs (COMMON_ASM_OP, file); + assemble_name (file, name); + fprintf (file, "," HOST_WIDE_INT_PRINT_UNSIGNED ",%u\n", + size, align / BITS_PER_UNIT); +} +#endif + +/* Utility function for targets to use in implementing + ASM_OUTPUT_ALIGNED_BSS. */ + +void +x86_output_aligned_bss (FILE *file, tree decl ATTRIBUTE_UNUSED, + const char *name, unsigned HOST_WIDE_INT size, + int align) +{ + if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC) + && size > (unsigned int)ix86_section_threshold) + switch_to_section (get_named_section (decl, ".lbss", 0)); + else + switch_to_section (bss_section); + ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT)); +#ifdef ASM_DECLARE_OBJECT_NAME + last_assemble_variable_decl = decl; + ASM_DECLARE_OBJECT_NAME (file, name, decl); +#else + /* Standard thing is just output label for the object. */ + ASM_OUTPUT_LABEL (file, name); +#endif /* ASM_DECLARE_OBJECT_NAME */ + ASM_OUTPUT_SKIP (file, size ? 
size : 1); +} + +static const struct default_options ix86_option_optimization_table[] = + { + /* Turn off -fschedule-insns by default. It tends to make the + problem with not enough registers even worse. */ +#ifdef INSN_SCHEDULING + { OPT_LEVELS_ALL, OPT_fschedule_insns, NULL, 0 }, +#endif + +#ifdef SUBTARGET_OPTIMIZATION_OPTIONS + SUBTARGET_OPTIMIZATION_OPTIONS, +#endif + { OPT_LEVELS_NONE, 0, NULL, 0 } + }; + +/* Implement TARGET_OPTION_INIT_STRUCT. */ + +static void +ix86_option_init_struct (struct gcc_options *opts) +{ + if (TARGET_MACHO) + /* The Darwin libraries never set errno, so we might as well + avoid calling them when that's the only reason we would. */ + opts->x_flag_errno_math = 0; + + opts->x_flag_pcc_struct_return = 2; + opts->x_flag_asynchronous_unwind_tables = 2; + opts->x_flag_vect_cost_model = 1; +} + +/* Decide whether we must probe the stack before any space allocation + on this target. It's essentially TARGET_STACK_PROBE except when + -fstack-check causes the stack to be already probed differently. */ + +bool +ix86_target_stack_probe (void) +{ + /* Do not probe the stack twice if static stack checking is enabled. */ + if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK) + return false; + + return TARGET_STACK_PROBE; +} + +/* Decide whether we can make a sibling call to a function. DECL is the + declaration of the function being targeted by the call and EXP is the + CALL_EXPR representing the call. */ + +static bool +ix86_function_ok_for_sibcall (tree decl, tree exp) +{ + tree type, decl_or_type; + rtx a, b; + + /* If we are generating position-independent code, we cannot sibcall + optimize any indirect call, or a direct call to a global function, + as the PLT requires %ebx be live. (Darwin does not have a PLT.) */ + if (!TARGET_MACHO + && !TARGET_64BIT + && flag_pic + && (!decl || !targetm.binds_local_p (decl))) + return false; + + /* If we need to align the outgoing stack, then sibcalling would + unalign the stack, which may break the called function. */ + if (ix86_minimum_incoming_stack_boundary (true) + < PREFERRED_STACK_BOUNDARY) + return false; + + if (decl) + { + decl_or_type = decl; + type = TREE_TYPE (decl); + } + else + { + /* We're looking at the CALL_EXPR, we need the type of the function. */ + type = CALL_EXPR_FN (exp); /* pointer expression */ + type = TREE_TYPE (type); /* pointer type */ + type = TREE_TYPE (type); /* function type */ + decl_or_type = type; + } + + /* Check that the return value locations are the same. Like + if we are returning floats on the 80387 register stack, we cannot + make a sibcall from a function that doesn't return a float to a + function that does or, conversely, from a function that does return + a float to a function that doesn't; the necessary stack adjustment + would not be executed. This is also the place we notice + differences in the return value ABI. Note that it is ok for one + of the functions to have void return type as long as the return + value of the other is passed in a register. */ + a = ix86_function_value (TREE_TYPE (exp), decl_or_type, false); + b = ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)), + cfun->decl, false); + if (STACK_REG_P (a) || STACK_REG_P (b)) + { + if (!rtx_equal_p (a, b)) + return false; + } + else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl)))) + { + /* Disable sibcall if we need to generate vzeroupper after + callee returns. 
*/ + if (TARGET_VZEROUPPER + && cfun->machine->callee_return_avx256_p + && !cfun->machine->caller_return_avx256_p) + return false; + } + else if (!rtx_equal_p (a, b)) + return false; + + if (TARGET_64BIT) + { + /* The SYSV ABI has more call-clobbered registers; + disallow sibcalls from MS to SYSV. */ + if (cfun->machine->call_abi == MS_ABI + && ix86_function_type_abi (type) == SYSV_ABI) + return false; + } + else + { + /* If this call is indirect, we'll need to be able to use a + call-clobbered register for the address of the target function. + Make sure that all such registers are not used for passing + parameters. Note that DLLIMPORT functions are indirect. */ + if (!decl + || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && DECL_DLLIMPORT_P (decl))) + { + if (ix86_function_regparm (type, NULL) >= 3) + { + /* ??? Need to count the actual number of registers to be used, + not the possible number of registers. Fix later. */ + return false; + } + } + } + + /* Otherwise okay. That also includes certain types of indirect calls. */ + return true; +} + +/* Handle "cdecl", "stdcall", "fastcall", "regparm", "thiscall", + and "sseregparm" calling convention attributes; + arguments as in struct attribute_spec.handler. */ + +static tree +ix86_handle_cconv_attribute (tree *node, tree name, + tree args, + int flags ATTRIBUTE_UNUSED, + bool *no_add_attrs) +{ + if (TREE_CODE (*node) != FUNCTION_TYPE + && TREE_CODE (*node) != METHOD_TYPE + && TREE_CODE (*node) != FIELD_DECL + && TREE_CODE (*node) != TYPE_DECL) + { + warning (OPT_Wattributes, "%qE attribute only applies to functions", + name); + *no_add_attrs = true; + return NULL_TREE; + } + + /* Can combine regparm with all attributes but fastcall. */ + if (is_attribute_p ("regparm", name)) + { + tree cst; + + if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node))) + { + error ("fastcall and regparm attributes are not compatible"); + } + + if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node))) + { + error ("regparam and thiscall attributes are not compatible"); + } + + cst = TREE_VALUE (args); + if (TREE_CODE (cst) != INTEGER_CST) + { + warning (OPT_Wattributes, + "%qE attribute requires an integer constant argument", + name); + *no_add_attrs = true; + } + else if (compare_tree_int (cst, REGPARM_MAX) > 0) + { + warning (OPT_Wattributes, "argument to %qE attribute larger than %d", + name, REGPARM_MAX); + *no_add_attrs = true; + } + + return NULL_TREE; + } + + if (TARGET_64BIT) + { + /* Do not warn when emulating the MS ABI. */ + if ((TREE_CODE (*node) != FUNCTION_TYPE + && TREE_CODE (*node) != METHOD_TYPE) + || ix86_function_type_abi (*node) != MS_ABI) + warning (OPT_Wattributes, "%qE attribute ignored", + name); + *no_add_attrs = true; + return NULL_TREE; + } + + /* Can combine fastcall with stdcall (redundant) and sseregparm. */ + if (is_attribute_p ("fastcall", name)) + { + if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node))) + { + error ("fastcall and cdecl attributes are not compatible"); + } + if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node))) + { + error ("fastcall and stdcall attributes are not compatible"); + } + if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node))) + { + error ("fastcall and regparm attributes are not compatible"); + } + if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node))) + { + error ("fastcall and thiscall attributes are not compatible"); + } + } + + /* Can combine stdcall with fastcall (redundant), regparm and + sseregparm. 
*/ + else if (is_attribute_p ("stdcall", name)) + { + if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node))) + { + error ("stdcall and cdecl attributes are not compatible"); + } + if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node))) + { + error ("stdcall and fastcall attributes are not compatible"); + } + if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node))) + { + error ("stdcall and thiscall attributes are not compatible"); + } + } + + /* Can combine cdecl with regparm and sseregparm. */ + else if (is_attribute_p ("cdecl", name)) + { + if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node))) + { + error ("stdcall and cdecl attributes are not compatible"); + } + if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node))) + { + error ("fastcall and cdecl attributes are not compatible"); + } + if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node))) + { + error ("cdecl and thiscall attributes are not compatible"); + } + } + else if (is_attribute_p ("thiscall", name)) + { + if (TREE_CODE (*node) != METHOD_TYPE && pedantic) + warning (OPT_Wattributes, "%qE attribute is used for none class-method", + name); + if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node))) + { + error ("stdcall and thiscall attributes are not compatible"); + } + if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node))) + { + error ("fastcall and thiscall attributes are not compatible"); + } + if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node))) + { + error ("cdecl and thiscall attributes are not compatible"); + } + } + + /* Can combine sseregparm with all attributes. */ + + return NULL_TREE; +} + +/* Return 0 if the attributes for two types are incompatible, 1 if they + are compatible, and 2 if they are nearly compatible (which causes a + warning to be generated). */ + +static int +ix86_comp_type_attributes (const_tree type1, const_tree type2) +{ + /* Check for mismatch of non-default calling convention. */ + const char *const rtdstr = TARGET_RTD ? "cdecl" : "stdcall"; + + if (TREE_CODE (type1) != FUNCTION_TYPE + && TREE_CODE (type1) != METHOD_TYPE) + return 1; + + /* Check for mismatched fastcall/regparm types. */ + if ((!lookup_attribute ("fastcall", TYPE_ATTRIBUTES (type1)) + != !lookup_attribute ("fastcall", TYPE_ATTRIBUTES (type2))) + || (ix86_function_regparm (type1, NULL) + != ix86_function_regparm (type2, NULL))) + return 0; + + /* Check for mismatched sseregparm types. */ + if (!lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type1)) + != !lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type2))) + return 0; + + /* Check for mismatched thiscall types. */ + if (!lookup_attribute ("thiscall", TYPE_ATTRIBUTES (type1)) + != !lookup_attribute ("thiscall", TYPE_ATTRIBUTES (type2))) + return 0; + + /* Check for mismatched return types (cdecl vs stdcall). */ + if (!lookup_attribute (rtdstr, TYPE_ATTRIBUTES (type1)) + != !lookup_attribute (rtdstr, TYPE_ATTRIBUTES (type2))) + return 0; + + return 1; +} + +/* Return the regparm value for a function with the indicated TYPE and DECL. + DECL may be NULL when calling function indirectly + or considering a libcall. */ + +static int +ix86_function_regparm (const_tree type, const_tree decl) +{ + tree attr; + int regparm; + + if (TARGET_64BIT) + return (ix86_function_type_abi (type) == SYSV_ABI + ? 
X86_64_REGPARM_MAX : X86_64_MS_REGPARM_MAX); + + regparm = ix86_regparm; + attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type)); + if (attr) + { + regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))); + return regparm; + } + + if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (type))) + return 2; + + if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (type))) + return 1; + + /* Use register calling convention for local functions when possible. */ + if (decl + && TREE_CODE (decl) == FUNCTION_DECL + && optimize + && !(profile_flag && !flag_fentry)) + { + /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */ + struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE (decl)); + if (i && i->local && i->can_change_signature) + { + int local_regparm, globals = 0, regno; + + /* Make sure no regparm register is taken by a + fixed register variable. */ + for (local_regparm = 0; local_regparm < REGPARM_MAX; local_regparm++) + if (fixed_regs[local_regparm]) + break; + + /* We don't want to use regparm(3) for nested functions as + these use a static chain pointer in the third argument. */ + if (local_regparm == 3 && DECL_STATIC_CHAIN (decl)) + local_regparm = 2; + + /* In 32-bit mode save a register for the split stack. */ + if (!TARGET_64BIT && local_regparm == 3 && flag_split_stack) + local_regparm = 2; + + /* Each fixed register usage increases register pressure, + so less registers should be used for argument passing. + This functionality can be overriden by an explicit + regparm value. */ + for (regno = 0; regno <= DI_REG; regno++) + if (fixed_regs[regno]) + globals++; + + local_regparm + = globals < local_regparm ? local_regparm - globals : 0; + + if (local_regparm > regparm) + regparm = local_regparm; + } + } + + return regparm; +} + +/* Return 1 or 2, if we can pass up to SSE_REGPARM_MAX SFmode (1) and + DFmode (2) arguments in SSE registers for a function with the + indicated TYPE and DECL. DECL may be NULL when calling function + indirectly or considering a libcall. Otherwise return 0. */ + +static int +ix86_function_sseregparm (const_tree type, const_tree decl, bool warn) +{ + gcc_assert (!TARGET_64BIT); + + /* Use SSE registers to pass SFmode and DFmode arguments if requested + by the sseregparm attribute. */ + if (TARGET_SSEREGPARM + || (type && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type)))) + { + if (!TARGET_SSE) + { + if (warn) + { + if (decl) + error ("calling %qD with attribute sseregparm without " + "SSE/SSE2 enabled", decl); + else + error ("calling %qT with attribute sseregparm without " + "SSE/SSE2 enabled", type); + } + return 0; + } + + return 2; + } + + /* For local functions, pass up to SSE_REGPARM_MAX SFmode + (and DFmode for SSE2) arguments in SSE registers. */ + if (decl && TARGET_SSE_MATH && optimize + && !(profile_flag && !flag_fentry)) + { + /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */ + struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE(decl)); + if (i && i->local && i->can_change_signature) + return TARGET_SSE2 ? 2 : 1; + } + + return 0; +} + +/* Return true if EAX is live at the start of the function. Used by + ix86_expand_prologue to determine if we need special help before + calling allocate_stack_worker. */ + +static bool +ix86_eax_live_at_start_p (void) +{ + /* Cheat. Don't bother working forward from ix86_function_regparm + to the function type to whether an actual argument is located in + eax. Instead just look at cfg info, which is still close enough + to correct at this point. 
This gives false positives for broken + functions that might use uninitialized data that happens to be + allocated in eax, but who cares? */ + return REGNO_REG_SET_P (df_get_live_out (ENTRY_BLOCK_PTR), 0); +} + +static bool +ix86_keep_aggregate_return_pointer (tree fntype) +{ + tree attr; + + attr = lookup_attribute ("callee_pop_aggregate_return", + TYPE_ATTRIBUTES (fntype)); + if (attr) + return (TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))) == 0); + + return KEEP_AGGREGATE_RETURN_POINTER != 0; +} + +/* Value is the number of bytes of arguments automatically + popped when returning from a subroutine call. + FUNDECL is the declaration node of the function (as a tree), + FUNTYPE is the data type of the function (as a tree), + or for a library call it is an identifier node for the subroutine name. + SIZE is the number of bytes of arguments passed on the stack. + + On the 80386, the RTD insn may be used to pop them if the number + of args is fixed, but if the number is variable then the caller + must pop them all. RTD can't be used for library calls now + because the library is compiled with the Unix compiler. + Use of RTD is a selectable option, since it is incompatible with + standard Unix calling sequences. If the option is not selected, + the caller must always pop the args. + + The attribute stdcall is equivalent to RTD on a per module basis. */ + +static int +ix86_return_pops_args (tree fundecl, tree funtype, int size) +{ + int rtd; + + /* None of the 64-bit ABIs pop arguments. */ + if (TARGET_64BIT) + return 0; + + rtd = TARGET_RTD && (!fundecl || TREE_CODE (fundecl) != IDENTIFIER_NODE); + + /* Cdecl functions override -mrtd, and never pop the stack. */ + if (! lookup_attribute ("cdecl", TYPE_ATTRIBUTES (funtype))) + { + /* Stdcall and fastcall functions will pop the stack if not + variable args. */ + if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (funtype)) + || lookup_attribute ("fastcall", TYPE_ATTRIBUTES (funtype)) + || lookup_attribute ("thiscall", TYPE_ATTRIBUTES (funtype))) + rtd = 1; + + if (rtd && ! stdarg_p (funtype)) + return size; + } + + /* Lose any fake structure return argument if it is passed on the stack. */ + if (aggregate_value_p (TREE_TYPE (funtype), fundecl) + && !ix86_keep_aggregate_return_pointer (funtype)) + { + int nregs = ix86_function_regparm (funtype, fundecl); + if (nregs == 0) + return GET_MODE_SIZE (Pmode); + } + + return 0; +} + +/* Argument support functions. */ + +/* Return true when register may be used to pass function parameters. */ +bool +ix86_function_arg_regno_p (int regno) +{ + int i; + const int *parm_regs; + + if (!TARGET_64BIT) + { + if (TARGET_MACHO) + return (regno < REGPARM_MAX + || (TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno])); + else + return (regno < REGPARM_MAX + || (TARGET_MMX && MMX_REGNO_P (regno) + && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX)) + || (TARGET_SSE && SSE_REGNO_P (regno) + && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))); + } + + if (TARGET_MACHO) + { + if (SSE_REGNO_P (regno) && TARGET_SSE) + return true; + } + else + { + if (TARGET_SSE && SSE_REGNO_P (regno) + && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)) + return true; + } + + /* TODO: The function should depend on current function ABI but + builtins.c would need updating then. Therefore we use the + default ABI. */ + + /* RAX is used as hidden argument to va_arg functions. 
*/ + if (ix86_abi == SYSV_ABI && regno == AX_REG) + return true; + + if (ix86_abi == MS_ABI) + parm_regs = x86_64_ms_abi_int_parameter_registers; + else + parm_regs = x86_64_int_parameter_registers; + for (i = 0; i < (ix86_abi == MS_ABI + ? X86_64_MS_REGPARM_MAX : X86_64_REGPARM_MAX); i++) + if (regno == parm_regs[i]) + return true; + return false; +} + +/* Return if we do not know how to pass TYPE solely in registers. */ + +static bool +ix86_must_pass_in_stack (enum machine_mode mode, const_tree type) +{ + if (must_pass_in_stack_var_size_or_pad (mode, type)) + return true; + + /* For 32-bit, we want TImode aggregates to go on the stack. But watch out! + The layout_type routine is crafty and tries to trick us into passing + currently unsupported vector types on the stack by using TImode. */ + return (!TARGET_64BIT && mode == TImode + && type && TREE_CODE (type) != VECTOR_TYPE); +} + +/* It returns the size, in bytes, of the area reserved for arguments passed + in registers for the function represented by fndecl dependent to the used + abi format. */ +int +ix86_reg_parm_stack_space (const_tree fndecl) +{ + enum calling_abi call_abi = SYSV_ABI; + if (fndecl != NULL_TREE && TREE_CODE (fndecl) == FUNCTION_DECL) + call_abi = ix86_function_abi (fndecl); + else + call_abi = ix86_function_type_abi (fndecl); + if (call_abi == MS_ABI) + return 32; + return 0; +} + +/* Returns value SYSV_ABI, MS_ABI dependent on fntype, specifying the + call abi used. */ +enum calling_abi +ix86_function_type_abi (const_tree fntype) +{ + if (TARGET_64BIT && fntype != NULL) + { + enum calling_abi abi = ix86_abi; + if (abi == SYSV_ABI) + { + if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (fntype))) + abi = MS_ABI; + } + else if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (fntype))) + abi = SYSV_ABI; + return abi; + } + return ix86_abi; +} + +static bool +ix86_function_ms_hook_prologue (const_tree fn) +{ + if (fn && lookup_attribute ("ms_hook_prologue", DECL_ATTRIBUTES (fn))) + { + if (decl_function_context (fn) != NULL_TREE) + error_at (DECL_SOURCE_LOCATION (fn), + "ms_hook_prologue is not compatible with nested function"); + else + return true; + } + return false; +} + +static enum calling_abi +ix86_function_abi (const_tree fndecl) +{ + if (! fndecl) + return ix86_abi; + return ix86_function_type_abi (TREE_TYPE (fndecl)); +} + +/* Returns value SYSV_ABI, MS_ABI dependent on cfun, specifying the + call abi used. */ +enum calling_abi +ix86_cfun_abi (void) +{ + if (! cfun || ! TARGET_64BIT) + return ix86_abi; + return cfun->machine->call_abi; +} + +/* Write the extra assembler code needed to declare a function properly. */ + +void +ix86_asm_output_function_label (FILE *asm_out_file, const char *fname, + tree decl) +{ + bool is_ms_hook = ix86_function_ms_hook_prologue (decl); + + if (is_ms_hook) + { + int i, filler_count = (TARGET_64BIT ? 32 : 16); + unsigned int filler_cc = 0xcccccccc; + + for (i = 0; i < filler_count; i += 4) + fprintf (asm_out_file, ASM_LONG " %#x\n", filler_cc); + } + +#ifdef SUBTARGET_ASM_UNWIND_INIT + SUBTARGET_ASM_UNWIND_INIT (asm_out_file); +#endif + + ASM_OUTPUT_LABEL (asm_out_file, fname); + + /* Output magic byte marker, if hot-patch attribute is set. 
*/ + if (is_ms_hook) + { + if (TARGET_64BIT) + { + /* leaq [%rsp + 0], %rsp */ + asm_fprintf (asm_out_file, ASM_BYTE + "0x48, 0x8d, 0xa4, 0x24, 0x00, 0x00, 0x00, 0x00\n"); + } + else + { + /* movl.s %edi, %edi + push %ebp + movl.s %esp, %ebp */ + asm_fprintf (asm_out_file, ASM_BYTE + "0x8b, 0xff, 0x55, 0x8b, 0xec\n"); + } + } +} + +/* regclass.c */ +extern void init_regs (void); + +/* Implementation of call abi switching target hook. Specific to FNDECL + the specific call register sets are set. See also + ix86_conditional_register_usage for more details. */ +void +ix86_call_abi_override (const_tree fndecl) +{ + if (fndecl == NULL_TREE) + cfun->machine->call_abi = ix86_abi; + else + cfun->machine->call_abi = ix86_function_type_abi (TREE_TYPE (fndecl)); +} + +/* MS and SYSV ABI have different set of call used registers. Avoid expensive + re-initialization of init_regs each time we switch function context since + this is needed only during RTL expansion. */ +static void +ix86_maybe_switch_abi (void) +{ + if (TARGET_64BIT && + call_used_regs[SI_REG] == (cfun->machine->call_abi == MS_ABI)) + reinit_regs (); +} + +/* Initialize a variable CUM of type CUMULATIVE_ARGS + for a call to a function whose data type is FNTYPE. + For a library call, FNTYPE is 0. */ + +void +init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */ + tree fntype, /* tree ptr for function decl */ + rtx libname, /* SYMBOL_REF of library name or 0 */ + tree fndecl, + int caller) +{ + struct cgraph_local_info *i; + tree fnret_type; + + memset (cum, 0, sizeof (*cum)); + + /* Initialize for the current callee. */ + if (caller) + { + cfun->machine->callee_pass_avx256_p = false; + cfun->machine->callee_return_avx256_p = false; + } + + if (fndecl) + { + i = cgraph_local_info (fndecl); + cum->call_abi = ix86_function_abi (fndecl); + fnret_type = TREE_TYPE (TREE_TYPE (fndecl)); + } + else + { + i = NULL; + cum->call_abi = ix86_function_type_abi (fntype); + if (fntype) + fnret_type = TREE_TYPE (fntype); + else + fnret_type = NULL; + } + + if (TARGET_VZEROUPPER && fnret_type) + { + rtx fnret_value = ix86_function_value (fnret_type, fntype, + false); + if (function_pass_avx256_p (fnret_value)) + { + /* The return value of this function uses 256bit AVX modes. */ + if (caller) + { + cfun->machine->callee_return_avx256_p = true; + cum->callee_return_avx256_p = true; + } + else + cfun->machine->caller_return_avx256_p = true; + } + } + + cum->caller = caller; + + /* Set up the number of registers to use for passing arguments. */ + + if (cum->call_abi == MS_ABI && !ACCUMULATE_OUTGOING_ARGS) + sorry ("ms_abi attribute requires -maccumulate-outgoing-args " + "or subtarget optimization implying it"); + cum->nregs = ix86_regparm; + if (TARGET_64BIT) + { + cum->nregs = (cum->call_abi == SYSV_ABI + ? X86_64_REGPARM_MAX + : X86_64_MS_REGPARM_MAX); + } + if (TARGET_SSE) + { + cum->sse_nregs = SSE_REGPARM_MAX; + if (TARGET_64BIT) + { + cum->sse_nregs = (cum->call_abi == SYSV_ABI + ? X86_64_SSE_REGPARM_MAX + : X86_64_MS_SSE_REGPARM_MAX); + } + } + if (TARGET_MMX) + cum->mmx_nregs = MMX_REGPARM_MAX; + cum->warn_avx = true; + cum->warn_sse = true; + cum->warn_mmx = true; + + /* Because type might mismatch in between caller and callee, we need to + use actual type of function for local calls. + FIXME: cgraph_analyze can be told to actually record if function uses + va_start so for local functions maybe_vaarg can be made aggressive + helping K&R code. + FIXME: once typesytem is fixed, we won't need this code anymore. 
*/ + if (i && i->local && i->can_change_signature) + fntype = TREE_TYPE (fndecl); + cum->maybe_vaarg = (fntype + ? (!prototype_p (fntype) || stdarg_p (fntype)) + : !libname); + + if (!TARGET_64BIT) + { + /* If there are variable arguments, then we won't pass anything + in registers in 32-bit mode. */ + if (stdarg_p (fntype)) + { + cum->nregs = 0; + cum->sse_nregs = 0; + cum->mmx_nregs = 0; + cum->warn_avx = 0; + cum->warn_sse = 0; + cum->warn_mmx = 0; + return; + } + + /* Use ecx and edx registers if function has fastcall attribute, + else look for regparm information. */ + if (fntype) + { + if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (fntype))) + { + cum->nregs = 1; + cum->fastcall = 1; /* Same first register as in fastcall. */ + } + else if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype))) + { + cum->nregs = 2; + cum->fastcall = 1; + } + else + cum->nregs = ix86_function_regparm (fntype, fndecl); + } + + /* Set up the number of SSE registers used for passing SFmode + and DFmode arguments. Warn for mismatching ABI. */ + cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl, true); + } +} + +/* Return the "natural" mode for TYPE. In most cases, this is just TYPE_MODE. + But in the case of vector types, it is some vector mode. + + When we have only some of our vector isa extensions enabled, then there + are some modes for which vector_mode_supported_p is false. For these + modes, the generic vector support in gcc will choose some non-vector mode + in order to implement the type. By computing the natural mode, we'll + select the proper ABI location for the operand and not depend on whatever + the middle-end decides to do with these vector types. + + The midde-end can't deal with the vector types > 16 bytes. In this + case, we return the original mode and warn ABI change if CUM isn't + NULL. */ + +static enum machine_mode +type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum) +{ + enum machine_mode mode = TYPE_MODE (type); + + if (TREE_CODE (type) == VECTOR_TYPE && !VECTOR_MODE_P (mode)) + { + HOST_WIDE_INT size = int_size_in_bytes (type); + if ((size == 8 || size == 16 || size == 32) + /* ??? Generic code allows us to create width 1 vectors. Ignore. */ + && TYPE_VECTOR_SUBPARTS (type) > 1) + { + enum machine_mode innermode = TYPE_MODE (TREE_TYPE (type)); + + if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE) + mode = MIN_MODE_VECTOR_FLOAT; + else + mode = MIN_MODE_VECTOR_INT; + + /* Get the mode which has this inner mode and number of units. */ + for (; mode != VOIDmode; mode = GET_MODE_WIDER_MODE (mode)) + if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type) + && GET_MODE_INNER (mode) == innermode) + { + if (size == 32 && !TARGET_AVX) + { + static bool warnedavx; + + if (cum + && !warnedavx + && cum->warn_avx) + { + warnedavx = true; + warning (0, "AVX vector argument without AVX " + "enabled changes the ABI"); + } + return TYPE_MODE (type); + } + else + return mode; + } + + gcc_unreachable (); + } + } + + return mode; +} + +/* We want to pass a value in REGNO whose "natural" mode is MODE. However, + this may not agree with the mode that the type system has chosen for the + register, which is ORIG_MODE. If ORIG_MODE is not BLKmode, then we can + go ahead and use it. Otherwise we have to build a PARALLEL instead. 
*/ + +static rtx +gen_reg_or_parallel (enum machine_mode mode, enum machine_mode orig_mode, + unsigned int regno) +{ + rtx tmp; + + if (orig_mode != BLKmode) + tmp = gen_rtx_REG (orig_mode, regno); + else + { + tmp = gen_rtx_REG (mode, regno); + tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, const0_rtx); + tmp = gen_rtx_PARALLEL (orig_mode, gen_rtvec (1, tmp)); + } + + return tmp; +} + +/* x86-64 register passing implementation. See x86-64 ABI for details. Goal + of this code is to classify each 8bytes of incoming argument by the register + class and assign registers accordingly. */ + +/* Return the union class of CLASS1 and CLASS2. + See the x86-64 PS ABI for details. */ + +static enum x86_64_reg_class +merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2) +{ + /* Rule #1: If both classes are equal, this is the resulting class. */ + if (class1 == class2) + return class1; + + /* Rule #2: If one of the classes is NO_CLASS, the resulting class is + the other class. */ + if (class1 == X86_64_NO_CLASS) + return class2; + if (class2 == X86_64_NO_CLASS) + return class1; + + /* Rule #3: If one of the classes is MEMORY, the result is MEMORY. */ + if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS) + return X86_64_MEMORY_CLASS; + + /* Rule #4: If one of the classes is INTEGER, the result is INTEGER. */ + if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS) + || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS)) + return X86_64_INTEGERSI_CLASS; + if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS + || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS) + return X86_64_INTEGER_CLASS; + + /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class, + MEMORY is used. */ + if (class1 == X86_64_X87_CLASS + || class1 == X86_64_X87UP_CLASS + || class1 == X86_64_COMPLEX_X87_CLASS + || class2 == X86_64_X87_CLASS + || class2 == X86_64_X87UP_CLASS + || class2 == X86_64_COMPLEX_X87_CLASS) + return X86_64_MEMORY_CLASS; + + /* Rule #6: Otherwise class SSE is used. */ + return X86_64_SSE_CLASS; +} + +/* Classify the argument of type TYPE and mode MODE. + CLASSES will be filled by the register class used to pass each word + of the operand. The number of words is returned. In case the parameter + should be passed in memory, 0 is returned. As a special case for zero + sized containers, classes[0] will be NO_CLASS and 1 is returned. + + BIT_OFFSET is used internally for handling records and specifies offset + of the offset in bits modulo 256 to avoid overflow cases. + + See the x86-64 PS ABI for details. +*/ + +static int +classify_argument (enum machine_mode mode, const_tree type, + enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset) +{ + HOST_WIDE_INT bytes = + (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode); + int words = (bytes + (bit_offset % 64) / 8 + UNITS_PER_WORD - 1) / UNITS_PER_WORD; + + /* Variable sized entities are always passed/returned in memory. */ + if (bytes < 0) + return 0; + + if (mode != VOIDmode + && targetm.calls.must_pass_in_stack (mode, type)) + return 0; + + if (type && AGGREGATE_TYPE_P (type)) + { + int i; + tree field; + enum x86_64_reg_class subclasses[MAX_CLASSES]; + + /* On x86-64 we pass structures larger than 32 bytes on the stack. */ + if (bytes > 32) + return 0; + + for (i = 0; i < words; i++) + classes[i] = X86_64_NO_CLASS; + + /* Zero sized arrays or structures are NO_CLASS. 
We return 0 to + signalize memory class, so handle it as special case. */ + if (!words) + { + classes[0] = X86_64_NO_CLASS; + return 1; + } + + /* Classify each field of record and merge classes. */ + switch (TREE_CODE (type)) + { + case RECORD_TYPE: + /* And now merge the fields of structure. */ + for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field)) + { + if (TREE_CODE (field) == FIELD_DECL) + { + int num; + + if (TREE_TYPE (field) == error_mark_node) + continue; + + /* Bitfields are always classified as integer. Handle them + early, since later code would consider them to be + misaligned integers. */ + if (DECL_BIT_FIELD (field)) + { + for (i = (int_bit_position (field) + (bit_offset % 64)) / 8 / 8; + i < ((int_bit_position (field) + (bit_offset % 64)) + + tree_low_cst (DECL_SIZE (field), 0) + + 63) / 8 / 8; i++) + classes[i] = + merge_classes (X86_64_INTEGER_CLASS, + classes[i]); + } + else + { + int pos; + + type = TREE_TYPE (field); + + /* Flexible array member is ignored. */ + if (TYPE_MODE (type) == BLKmode + && TREE_CODE (type) == ARRAY_TYPE + && TYPE_SIZE (type) == NULL_TREE + && TYPE_DOMAIN (type) != NULL_TREE + && (TYPE_MAX_VALUE (TYPE_DOMAIN (type)) + == NULL_TREE)) + { + static bool warned; + + if (!warned && warn_psabi) + { + warned = true; + inform (input_location, + "the ABI of passing struct with" + " a flexible array member has" + " changed in GCC 4.4"); + } + continue; + } + num = classify_argument (TYPE_MODE (type), type, + subclasses, + (int_bit_position (field) + + bit_offset) % 256); + if (!num) + return 0; + pos = (int_bit_position (field) + (bit_offset % 64)) / 8 / 8; + for (i = 0; i < num && (i + pos) < words; i++) + classes[i + pos] = + merge_classes (subclasses[i], classes[i + pos]); + } + } + } + break; + + case ARRAY_TYPE: + /* Arrays are handled as small records. */ + { + int num; + num = classify_argument (TYPE_MODE (TREE_TYPE (type)), + TREE_TYPE (type), subclasses, bit_offset); + if (!num) + return 0; + + /* The partial classes are now full classes. */ + if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4) + subclasses[0] = X86_64_SSE_CLASS; + if (subclasses[0] == X86_64_INTEGERSI_CLASS + && !((bit_offset % 64) == 0 && bytes == 4)) + subclasses[0] = X86_64_INTEGER_CLASS; + + for (i = 0; i < words; i++) + classes[i] = subclasses[i % num]; + + break; + } + case UNION_TYPE: + case QUAL_UNION_TYPE: + /* Unions are similar to RECORD_TYPE but offset is always 0. + */ + for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field)) + { + if (TREE_CODE (field) == FIELD_DECL) + { + int num; + + if (TREE_TYPE (field) == error_mark_node) + continue; + + num = classify_argument (TYPE_MODE (TREE_TYPE (field)), + TREE_TYPE (field), subclasses, + bit_offset); + if (!num) + return 0; + for (i = 0; i < num; i++) + classes[i] = merge_classes (subclasses[i], classes[i]); + } + } + break; + + default: + gcc_unreachable (); + } + + if (words > 2) + { + /* When size > 16 bytes, if the first one isn't + X86_64_SSE_CLASS or any other ones aren't + X86_64_SSEUP_CLASS, everything should be passed in + memory. */ + if (classes[0] != X86_64_SSE_CLASS) + return 0; + + for (i = 1; i < words; i++) + if (classes[i] != X86_64_SSEUP_CLASS) + return 0; + } + + /* Final merger cleanup. */ + for (i = 0; i < words; i++) + { + /* If one class is MEMORY, everything should be passed in + memory. */ + if (classes[i] == X86_64_MEMORY_CLASS) + return 0; + + /* The X86_64_SSEUP_CLASS should be always preceded by + X86_64_SSE_CLASS or X86_64_SSEUP_CLASS. 
*/
+ if (classes[i] == X86_64_SSEUP_CLASS
+ && classes[i - 1] != X86_64_SSE_CLASS
+ && classes[i - 1] != X86_64_SSEUP_CLASS)
+ {
+ /* The first one should never be X86_64_SSEUP_CLASS. */
+ gcc_assert (i != 0);
+ classes[i] = X86_64_SSE_CLASS;
+ }
+
+ /* If X86_64_X87UP_CLASS isn't preceded by X86_64_X87_CLASS,
+ everything should be passed in memory. */
+ if (classes[i] == X86_64_X87UP_CLASS
+ && (classes[i - 1] != X86_64_X87_CLASS))
+ {
+ static bool warned;
+
+ /* The first one should never be X86_64_X87UP_CLASS. */
+ gcc_assert (i != 0);
+ if (!warned && warn_psabi)
+ {
+ warned = true;
+ inform (input_location,
+ "the ABI of passing union with long double"
+ " has changed in GCC 4.4");
+ }
+ return 0;
+ }
+ }
+ return words;
+ }
+
+ /* Compute alignment needed. We align all types to natural boundaries with
+ the exception of XFmode, which is aligned to 64 bits. */
+ if (mode != VOIDmode && mode != BLKmode)
+ {
+ int mode_alignment = GET_MODE_BITSIZE (mode);
+
+ if (mode == XFmode)
+ mode_alignment = 128;
+ else if (mode == XCmode)
+ mode_alignment = 256;
+ if (COMPLEX_MODE_P (mode))
+ mode_alignment /= 2;
+ /* Misaligned fields are always returned in memory. */
+ if (bit_offset % mode_alignment)
+ return 0;
+ }
+
+ /* For V1xx modes, just use the base mode. */
+ if (VECTOR_MODE_P (mode) && mode != V1DImode && mode != V1TImode
+ && GET_MODE_SIZE (GET_MODE_INNER (mode)) == bytes)
+ mode = GET_MODE_INNER (mode);
+
+ /* Classification of atomic types. */
+ switch (mode)
+ {
+ case SDmode:
+ case DDmode:
+ classes[0] = X86_64_SSE_CLASS;
+ return 1;
+ case TDmode:
+ classes[0] = X86_64_SSE_CLASS;
+ classes[1] = X86_64_SSEUP_CLASS;
+ return 2;
+ case DImode:
+ case SImode:
+ case HImode:
+ case QImode:
+ case CSImode:
+ case CHImode:
+ case CQImode:
+ {
+ int size = (bit_offset % 64) + (int) GET_MODE_BITSIZE (mode);
+
+ if (size <= 32)
+ {
+ classes[0] = X86_64_INTEGERSI_CLASS;
+ return 1;
+ }
+ else if (size <= 64)
+ {
+ classes[0] = X86_64_INTEGER_CLASS;
+ return 1;
+ }
+ else if (size <= 64+32)
+ {
+ classes[0] = X86_64_INTEGER_CLASS;
+ classes[1] = X86_64_INTEGERSI_CLASS;
+ return 2;
+ }
+ else if (size <= 64+64)
+ {
+ classes[0] = classes[1] = X86_64_INTEGER_CLASS;
+ return 2;
+ }
+ else
+ gcc_unreachable ();
+ }
+ case CDImode:
+ case TImode:
+ classes[0] = classes[1] = X86_64_INTEGER_CLASS;
+ return 2;
+ case COImode:
+ case OImode:
+ /* OImode shouldn't be used directly. */
+ gcc_unreachable ();
+ case CTImode:
+ return 0;
+ case SFmode:
+ if (!(bit_offset % 64))
+ classes[0] = X86_64_SSESF_CLASS;
+ else
+ classes[0] = X86_64_SSE_CLASS;
+ return 1;
+ case DFmode:
+ classes[0] = X86_64_SSEDF_CLASS;
+ return 1;
+ case XFmode:
+ classes[0] = X86_64_X87_CLASS;
+ classes[1] = X86_64_X87UP_CLASS;
+ return 2;
+ case TFmode:
+ classes[0] = X86_64_SSE_CLASS;
+ classes[1] = X86_64_SSEUP_CLASS;
+ return 2;
+ case SCmode:
+ classes[0] = X86_64_SSE_CLASS;
+ if (!(bit_offset % 64))
+ return 1;
+ else
+ {
+ static bool warned;
+
+ if (!warned && warn_psabi)
+ {
+ warned = true;
+ inform (input_location,
+ "the ABI of passing structure with complex float"
+ " member has changed in GCC 4.4");
+ }
+ classes[1] = X86_64_SSESF_CLASS;
+ return 2;
+ }
+ case DCmode:
+ classes[0] = X86_64_SSEDF_CLASS;
+ classes[1] = X86_64_SSEDF_CLASS;
+ return 2;
+ case XCmode:
+ classes[0] = X86_64_COMPLEX_X87_CLASS;
+ return 1;
+ case TCmode:
+ /* This mode is larger than 16 bytes.
*/ + return 0; + case V8SFmode: + case V8SImode: + case V32QImode: + case V16HImode: + case V4DFmode: + case V4DImode: + classes[0] = X86_64_SSE_CLASS; + classes[1] = X86_64_SSEUP_CLASS; + classes[2] = X86_64_SSEUP_CLASS; + classes[3] = X86_64_SSEUP_CLASS; + return 4; + case V4SFmode: + case V4SImode: + case V16QImode: + case V8HImode: + case V2DFmode: + case V2DImode: + classes[0] = X86_64_SSE_CLASS; + classes[1] = X86_64_SSEUP_CLASS; + return 2; + case V1TImode: + case V1DImode: + case V2SFmode: + case V2SImode: + case V4HImode: + case V8QImode: + classes[0] = X86_64_SSE_CLASS; + return 1; + case BLKmode: + case VOIDmode: + return 0; + default: + gcc_assert (VECTOR_MODE_P (mode)); + + if (bytes > 16) + return 0; + + gcc_assert (GET_MODE_CLASS (GET_MODE_INNER (mode)) == MODE_INT); + + if (bit_offset + GET_MODE_BITSIZE (mode) <= 32) + classes[0] = X86_64_INTEGERSI_CLASS; + else + classes[0] = X86_64_INTEGER_CLASS; + classes[1] = X86_64_INTEGER_CLASS; + return 1 + (bytes > 8); + } +} + +/* Examine the argument and return set number of register required in each + class. Return 0 iff parameter should be passed in memory. */ +static int +examine_argument (enum machine_mode mode, const_tree type, int in_return, + int *int_nregs, int *sse_nregs) +{ + enum x86_64_reg_class regclass[MAX_CLASSES]; + int n = classify_argument (mode, type, regclass, 0); + + *int_nregs = 0; + *sse_nregs = 0; + if (!n) + return 0; + for (n--; n >= 0; n--) + switch (regclass[n]) + { + case X86_64_INTEGER_CLASS: + case X86_64_INTEGERSI_CLASS: + (*int_nregs)++; + break; + case X86_64_SSE_CLASS: + case X86_64_SSESF_CLASS: + case X86_64_SSEDF_CLASS: + (*sse_nregs)++; + break; + case X86_64_NO_CLASS: + case X86_64_SSEUP_CLASS: + break; + case X86_64_X87_CLASS: + case X86_64_X87UP_CLASS: + if (!in_return) + return 0; + break; + case X86_64_COMPLEX_X87_CLASS: + return in_return ? 2 : 0; + case X86_64_MEMORY_CLASS: + gcc_unreachable (); + } + return 1; +} + +/* Construct container for the argument used by GCC interface. See + FUNCTION_ARG for the detailed description. */ + +static rtx +construct_container (enum machine_mode mode, enum machine_mode orig_mode, + const_tree type, int in_return, int nintregs, int nsseregs, + const int *intreg, int sse_regno) +{ + /* The following variables hold the static issued_error state. */ + static bool issued_sse_arg_error; + static bool issued_sse_ret_error; + static bool issued_x87_ret_error; + + enum machine_mode tmpmode; + int bytes = + (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode); + enum x86_64_reg_class regclass[MAX_CLASSES]; + int n; + int i; + int nexps = 0; + int needed_sseregs, needed_intregs; + rtx exp[MAX_CLASSES]; + rtx ret; + + n = classify_argument (mode, type, regclass, 0); + if (!n) + return NULL; + if (!examine_argument (mode, type, in_return, &needed_intregs, + &needed_sseregs)) + return NULL; + if (needed_intregs > nintregs || needed_sseregs > nsseregs) + return NULL; + + /* We allowed the user to turn off SSE for kernel mode. Don't crash if + some less clueful developer tries to use floating-point anyway. 
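+ For example, code compiled with -mno-sse that passes or returns a double
+ now gets the "SSE register argument with SSE disabled" or "SSE register
+ return with SSE disabled" error below instead of crashing the compiler.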
*/ + if (needed_sseregs && !TARGET_SSE) + { + if (in_return) + { + if (!issued_sse_ret_error) + { + error ("SSE register return with SSE disabled"); + issued_sse_ret_error = true; + } + } + else if (!issued_sse_arg_error) + { + error ("SSE register argument with SSE disabled"); + issued_sse_arg_error = true; + } + return NULL; + } + + /* Likewise, error if the ABI requires us to return values in the + x87 registers and the user specified -mno-80387. */ + if (!TARGET_80387 && in_return) + for (i = 0; i < n; i++) + if (regclass[i] == X86_64_X87_CLASS + || regclass[i] == X86_64_X87UP_CLASS + || regclass[i] == X86_64_COMPLEX_X87_CLASS) + { + if (!issued_x87_ret_error) + { + error ("x87 register return with x87 disabled"); + issued_x87_ret_error = true; + } + return NULL; + } + + /* First construct simple cases. Avoid SCmode, since we want to use + single register to pass this type. */ + if (n == 1 && mode != SCmode) + switch (regclass[0]) + { + case X86_64_INTEGER_CLASS: + case X86_64_INTEGERSI_CLASS: + return gen_rtx_REG (mode, intreg[0]); + case X86_64_SSE_CLASS: + case X86_64_SSESF_CLASS: + case X86_64_SSEDF_CLASS: + if (mode != BLKmode) + return gen_reg_or_parallel (mode, orig_mode, + SSE_REGNO (sse_regno)); + break; + case X86_64_X87_CLASS: + case X86_64_COMPLEX_X87_CLASS: + return gen_rtx_REG (mode, FIRST_STACK_REG); + case X86_64_NO_CLASS: + /* Zero sized array, struct or class. */ + return NULL; + default: + gcc_unreachable (); + } + if (n == 2 && regclass[0] == X86_64_SSE_CLASS + && regclass[1] == X86_64_SSEUP_CLASS && mode != BLKmode) + return gen_rtx_REG (mode, SSE_REGNO (sse_regno)); + if (n == 4 + && regclass[0] == X86_64_SSE_CLASS + && regclass[1] == X86_64_SSEUP_CLASS + && regclass[2] == X86_64_SSEUP_CLASS + && regclass[3] == X86_64_SSEUP_CLASS + && mode != BLKmode) + return gen_rtx_REG (mode, SSE_REGNO (sse_regno)); + + if (n == 2 + && regclass[0] == X86_64_X87_CLASS && regclass[1] == X86_64_X87UP_CLASS) + return gen_rtx_REG (XFmode, FIRST_STACK_REG); + if (n == 2 && regclass[0] == X86_64_INTEGER_CLASS + && regclass[1] == X86_64_INTEGER_CLASS + && (mode == CDImode || mode == TImode || mode == TFmode) + && intreg[0] + 1 == intreg[1]) + return gen_rtx_REG (mode, intreg[0]); + + /* Otherwise figure out the entries of the PARALLEL. */ + for (i = 0; i < n; i++) + { + int pos; + + switch (regclass[i]) + { + case X86_64_NO_CLASS: + break; + case X86_64_INTEGER_CLASS: + case X86_64_INTEGERSI_CLASS: + /* Merge TImodes on aligned occasions here too. */ + if (i * 8 + 8 > bytes) + tmpmode = mode_for_size ((bytes - i * 8) * BITS_PER_UNIT, MODE_INT, 0); + else if (regclass[i] == X86_64_INTEGERSI_CLASS) + tmpmode = SImode; + else + tmpmode = DImode; + /* We've requested 24 bytes we don't have mode for. Use DImode. 
*/ + if (tmpmode == BLKmode) + tmpmode = DImode; + exp [nexps++] = gen_rtx_EXPR_LIST (VOIDmode, + gen_rtx_REG (tmpmode, *intreg), + GEN_INT (i*8)); + intreg++; + break; + case X86_64_SSESF_CLASS: + exp [nexps++] = gen_rtx_EXPR_LIST (VOIDmode, + gen_rtx_REG (SFmode, + SSE_REGNO (sse_regno)), + GEN_INT (i*8)); + sse_regno++; + break; + case X86_64_SSEDF_CLASS: + exp [nexps++] = gen_rtx_EXPR_LIST (VOIDmode, + gen_rtx_REG (DFmode, + SSE_REGNO (sse_regno)), + GEN_INT (i*8)); + sse_regno++; + break; + case X86_64_SSE_CLASS: + pos = i; + switch (n) + { + case 1: + tmpmode = DImode; + break; + case 2: + if (i == 0 && regclass[1] == X86_64_SSEUP_CLASS) + { + tmpmode = TImode; + i++; + } + else + tmpmode = DImode; + break; + case 4: + gcc_assert (i == 0 + && regclass[1] == X86_64_SSEUP_CLASS + && regclass[2] == X86_64_SSEUP_CLASS + && regclass[3] == X86_64_SSEUP_CLASS); + tmpmode = OImode; + i += 3; + break; + default: + gcc_unreachable (); + } + exp [nexps++] = gen_rtx_EXPR_LIST (VOIDmode, + gen_rtx_REG (tmpmode, + SSE_REGNO (sse_regno)), + GEN_INT (pos*8)); + sse_regno++; + break; + default: + gcc_unreachable (); + } + } + + /* Empty aligned struct, union or class. */ + if (nexps == 0) + return NULL; + + ret = gen_rtx_PARALLEL (mode, rtvec_alloc (nexps)); + for (i = 0; i < nexps; i++) + XVECEXP (ret, 0, i) = exp [i]; + return ret; +} + +/* Update the data in CUM to advance over an argument of mode MODE + and data type TYPE. (TYPE is null for libcalls where that information + may not be available.) */ + +static void +function_arg_advance_32 (CUMULATIVE_ARGS *cum, enum machine_mode mode, + const_tree type, HOST_WIDE_INT bytes, + HOST_WIDE_INT words) +{ + switch (mode) + { + default: + break; + + case BLKmode: + if (bytes < 0) + break; + /* FALLTHRU */ + + case DImode: + case SImode: + case HImode: + case QImode: + cum->words += words; + cum->nregs -= words; + cum->regno += words; + + if (cum->nregs <= 0) + { + cum->nregs = 0; + cum->regno = 0; + } + break; + + case OImode: + /* OImode shouldn't be used directly. */ + gcc_unreachable (); + + case DFmode: + if (cum->float_in_sse < 2) + break; + case SFmode: + if (cum->float_in_sse < 1) + break; + /* FALLTHRU */ + + case V8SFmode: + case V8SImode: + case V32QImode: + case V16HImode: + case V4DFmode: + case V4DImode: + case TImode: + case V16QImode: + case V8HImode: + case V4SImode: + case V2DImode: + case V4SFmode: + case V2DFmode: + if (!type || !AGGREGATE_TYPE_P (type)) + { + cum->sse_words += words; + cum->sse_nregs -= 1; + cum->sse_regno += 1; + if (cum->sse_nregs <= 0) + { + cum->sse_nregs = 0; + cum->sse_regno = 0; + } + } + break; + + case V8QImode: + case V4HImode: + case V2SImode: + case V2SFmode: + case V1TImode: + case V1DImode: + if (!type || !AGGREGATE_TYPE_P (type)) + { + cum->mmx_words += words; + cum->mmx_nregs -= 1; + cum->mmx_regno += 1; + if (cum->mmx_nregs <= 0) + { + cum->mmx_nregs = 0; + cum->mmx_regno = 0; + } + } + break; + } +} + +static void +function_arg_advance_64 (CUMULATIVE_ARGS *cum, enum machine_mode mode, + const_tree type, HOST_WIDE_INT words, bool named) +{ + int int_nregs, sse_nregs; + + /* Unnamed 256bit vector mode parameters are passed on stack. 
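+ (For example, a __m256 argument matching the ellipsis of a variadic
+ prototype does not consume any SSE register here.)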
*/ + if (!named && VALID_AVX256_REG_MODE (mode)) + return; + + if (examine_argument (mode, type, 0, &int_nregs, &sse_nregs) + && sse_nregs <= cum->sse_nregs && int_nregs <= cum->nregs) + { + cum->nregs -= int_nregs; + cum->sse_nregs -= sse_nregs; + cum->regno += int_nregs; + cum->sse_regno += sse_nregs; + } + else + { + int align = ix86_function_arg_boundary (mode, type) / BITS_PER_WORD; + cum->words = (cum->words + align - 1) & ~(align - 1); + cum->words += words; + } +} + +static void +function_arg_advance_ms_64 (CUMULATIVE_ARGS *cum, HOST_WIDE_INT bytes, + HOST_WIDE_INT words) +{ + /* Otherwise, this should be passed indirect. */ + gcc_assert (bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8); + + cum->words += words; + if (cum->nregs > 0) + { + cum->nregs -= 1; + cum->regno += 1; + } +} + +/* Update the data in CUM to advance over an argument of mode MODE and + data type TYPE. (TYPE is null for libcalls where that information + may not be available.) */ + +static void +ix86_function_arg_advance (CUMULATIVE_ARGS *cum, enum machine_mode mode, + const_tree type, bool named) +{ + HOST_WIDE_INT bytes, words; + + if (mode == BLKmode) + bytes = int_size_in_bytes (type); + else + bytes = GET_MODE_SIZE (mode); + words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD; + + if (type) + mode = type_natural_mode (type, NULL); + + if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI) + function_arg_advance_ms_64 (cum, bytes, words); + else if (TARGET_64BIT) + function_arg_advance_64 (cum, mode, type, words, named); + else + function_arg_advance_32 (cum, mode, type, bytes, words); +} + +/* Define where to put the arguments to a function. + Value is zero to push the argument on the stack, + or a hard register in which to store the argument. + + MODE is the argument's machine mode. + TYPE is the data type of the argument (as a tree). + This is null for libcalls where that information may + not be available. + CUM is a variable of type CUMULATIVE_ARGS which gives info about + the preceding args and about the function being called. + NAMED is nonzero if this argument is a named parameter + (otherwise it is an extra parameter matching an ellipsis). */ + +static rtx +function_arg_32 (const CUMULATIVE_ARGS *cum, enum machine_mode mode, + enum machine_mode orig_mode, const_tree type, + HOST_WIDE_INT bytes, HOST_WIDE_INT words) +{ + static bool warnedsse, warnedmmx; + + /* Avoid the AL settings for the Unix64 ABI. */ + if (mode == VOIDmode) + return constm1_rtx; + + switch (mode) + { + default: + break; + + case BLKmode: + if (bytes < 0) + break; + /* FALLTHRU */ + case DImode: + case SImode: + case HImode: + case QImode: + if (words <= cum->nregs) + { + int regno = cum->regno; + + /* Fastcall allocates the first two DWORD (SImode) or + smaller arguments to ECX and EDX if it isn't an + aggregate type . */ + if (cum->fastcall) + { + if (mode == BLKmode + || mode == DImode + || (type && AGGREGATE_TYPE_P (type))) + break; + + /* ECX not EAX is the first allocated register. */ + if (regno == AX_REG) + regno = CX_REG; + } + return gen_rtx_REG (mode, regno); + } + break; + + case DFmode: + if (cum->float_in_sse < 2) + break; + case SFmode: + if (cum->float_in_sse < 1) + break; + /* FALLTHRU */ + case TImode: + /* In 32bit, we pass TImode in xmm registers. 
*/ + case V16QImode: + case V8HImode: + case V4SImode: + case V2DImode: + case V4SFmode: + case V2DFmode: + if (!type || !AGGREGATE_TYPE_P (type)) + { + if (!TARGET_SSE && !warnedsse && cum->warn_sse) + { + warnedsse = true; + warning (0, "SSE vector argument without SSE enabled " + "changes the ABI"); + } + if (cum->sse_nregs) + return gen_reg_or_parallel (mode, orig_mode, + cum->sse_regno + FIRST_SSE_REG); + } + break; + + case OImode: + /* OImode shouldn't be used directly. */ + gcc_unreachable (); + + case V8SFmode: + case V8SImode: + case V32QImode: + case V16HImode: + case V4DFmode: + case V4DImode: + if (!type || !AGGREGATE_TYPE_P (type)) + { + if (cum->sse_nregs) + return gen_reg_or_parallel (mode, orig_mode, + cum->sse_regno + FIRST_SSE_REG); + } + break; + + case V8QImode: + case V4HImode: + case V2SImode: + case V2SFmode: + case V1TImode: + case V1DImode: + if (!type || !AGGREGATE_TYPE_P (type)) + { + if (!TARGET_MMX && !warnedmmx && cum->warn_mmx) + { + warnedmmx = true; + warning (0, "MMX vector argument without MMX enabled " + "changes the ABI"); + } + if (cum->mmx_nregs) + return gen_reg_or_parallel (mode, orig_mode, + cum->mmx_regno + FIRST_MMX_REG); + } + break; + } + + return NULL_RTX; +} + +static rtx +function_arg_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode, + enum machine_mode orig_mode, const_tree type, bool named) +{ + /* Handle a hidden AL argument containing number of registers + for varargs x86-64 functions. */ + if (mode == VOIDmode) + return GEN_INT (cum->maybe_vaarg + ? (cum->sse_nregs < 0 + ? X86_64_SSE_REGPARM_MAX + : cum->sse_regno) + : -1); + + switch (mode) + { + default: + break; + + case V8SFmode: + case V8SImode: + case V32QImode: + case V16HImode: + case V4DFmode: + case V4DImode: + /* Unnamed 256bit vector mode parameters are passed on stack. */ + if (!named) + return NULL; + break; + } + + return construct_container (mode, orig_mode, type, 0, cum->nregs, + cum->sse_nregs, + &x86_64_int_parameter_registers [cum->regno], + cum->sse_regno); +} + +static rtx +function_arg_ms_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode, + enum machine_mode orig_mode, bool named, + HOST_WIDE_INT bytes) +{ + unsigned int regno; + + /* We need to add clobber for MS_ABI->SYSV ABI calls in expand_call. + We use value of -2 to specify that current function call is MSABI. */ + if (mode == VOIDmode) + return GEN_INT (-2); + + /* If we've run out of registers, it goes on the stack. */ + if (cum->nregs == 0) + return NULL_RTX; + + regno = x86_64_ms_abi_int_parameter_registers[cum->regno]; + + /* Only floating point modes are passed in anything but integer regs. */ + if (TARGET_SSE && (mode == SFmode || mode == DFmode)) + { + if (named) + regno = cum->regno + FIRST_SSE_REG; + else + { + rtx t1, t2; + + /* Unnamed floating parameters are passed in both the + SSE and integer registers. */ + t1 = gen_rtx_REG (mode, cum->regno + FIRST_SSE_REG); + t2 = gen_rtx_REG (mode, regno); + t1 = gen_rtx_EXPR_LIST (VOIDmode, t1, const0_rtx); + t2 = gen_rtx_EXPR_LIST (VOIDmode, t2, const0_rtx); + return gen_rtx_PARALLEL (mode, gen_rtvec (2, t1, t2)); + } + } + /* Handle aggregated types passed in register. */ + if (orig_mode == BLKmode) + { + if (bytes > 0 && bytes <= 8) + mode = (bytes > 4 ? DImode : SImode); + if (mode == BLKmode) + mode = DImode; + } + + return gen_reg_or_parallel (mode, orig_mode, regno); +} + +/* Return where to put the arguments to a function. + Return zero to push the argument on the stack, or a hard register in which to store the argument. 
+ + MODE is the argument's machine mode. TYPE is the data type of the + argument. It is null for libcalls where that information may not be + available. CUM gives information about the preceding args and about + the function being called. NAMED is nonzero if this argument is a + named parameter (otherwise it is an extra parameter matching an + ellipsis). */ + +static rtx +ix86_function_arg (CUMULATIVE_ARGS *cum, enum machine_mode omode, + const_tree type, bool named) +{ + enum machine_mode mode = omode; + HOST_WIDE_INT bytes, words; + rtx arg; + + if (mode == BLKmode) + bytes = int_size_in_bytes (type); + else + bytes = GET_MODE_SIZE (mode); + words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD; + + /* To simplify the code below, represent vector types with a vector mode + even if MMX/SSE are not active. */ + if (type && TREE_CODE (type) == VECTOR_TYPE) + mode = type_natural_mode (type, cum); + + if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI) + arg = function_arg_ms_64 (cum, mode, omode, named, bytes); + else if (TARGET_64BIT) + arg = function_arg_64 (cum, mode, omode, type, named); + else + arg = function_arg_32 (cum, mode, omode, type, bytes, words); + + if (TARGET_VZEROUPPER && function_pass_avx256_p (arg)) + { + /* This argument uses 256bit AVX modes. */ + if (cum->caller) + cum->callee_pass_avx256_p = true; + else + cfun->machine->caller_pass_avx256_p = true; + } + + if (cum->caller && mode == VOIDmode) + { + /* This function is called with MODE == VOIDmode immediately + before the call instruction is emitted. We copy callee 256bit + AVX info from the current CUM here. */ + cfun->machine->callee_return_avx256_p = cum->callee_return_avx256_p; + cfun->machine->callee_pass_avx256_p = cum->callee_pass_avx256_p; + } + + return arg; +} + +/* A C expression that indicates when an argument must be passed by + reference. If nonzero for an argument, a copy of that argument is + made in memory and a pointer to the argument is passed instead of + the argument itself. The pointer is passed in whatever way is + appropriate for passing a pointer to that type. */ + +static bool +ix86_pass_by_reference (CUMULATIVE_ARGS *cum ATTRIBUTE_UNUSED, + enum machine_mode mode ATTRIBUTE_UNUSED, + const_tree type, bool named ATTRIBUTE_UNUSED) +{ + /* See Windows x64 Software Convention. */ + if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI) + { + int msize = (int) GET_MODE_SIZE (mode); + if (type) + { + /* Arrays are passed by reference. */ + if (TREE_CODE (type) == ARRAY_TYPE) + return true; + + if (AGGREGATE_TYPE_P (type)) + { + /* Structs/unions of sizes other than 8, 16, 32, or 64 bits + are passed by reference. */ + msize = int_size_in_bytes (type); + } + } + + /* __m128 is passed by reference. */ + switch (msize) { + case 1: case 2: case 4: case 8: + break; + default: + return true; + } + } + else if (TARGET_64BIT && type && int_size_in_bytes (type) == -1) + return 1; + + return 0; +} + +/* Return true when TYPE should be 128bit aligned for 32bit argument + passing ABI. XXX: This function is obsolete and is only used for + checking psABI compatibility with previous versions of GCC. 
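+ For instance, a 16-byte vector type such as __m128 is normally treated as
+ 128-bit aligned here when SSE is enabled, as are TDmode (_Decimal128) and
+ TFmode (__float128) values.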
*/ + +static bool +ix86_compat_aligned_value_p (const_tree type) +{ + enum machine_mode mode = TYPE_MODE (type); + if (((TARGET_SSE && SSE_REG_MODE_P (mode)) + || mode == TDmode + || mode == TFmode + || mode == TCmode) + && (!TYPE_USER_ALIGN (type) || TYPE_ALIGN (type) > 128)) + return true; + if (TYPE_ALIGN (type) < 128) + return false; + + if (AGGREGATE_TYPE_P (type)) + { + /* Walk the aggregates recursively. */ + switch (TREE_CODE (type)) + { + case RECORD_TYPE: + case UNION_TYPE: + case QUAL_UNION_TYPE: + { + tree field; + + /* Walk all the structure fields. */ + for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field)) + { + if (TREE_CODE (field) == FIELD_DECL + && ix86_compat_aligned_value_p (TREE_TYPE (field))) + return true; + } + break; + } + + case ARRAY_TYPE: + /* Just for use if some languages passes arrays by value. */ + if (ix86_compat_aligned_value_p (TREE_TYPE (type))) + return true; + break; + + default: + gcc_unreachable (); + } + } + return false; +} + +/* Return the alignment boundary for MODE and TYPE with alignment ALIGN. + XXX: This function is obsolete and is only used for checking psABI + compatibility with previous versions of GCC. */ + +static unsigned int +ix86_compat_function_arg_boundary (enum machine_mode mode, + const_tree type, unsigned int align) +{ + /* In 32bit, only _Decimal128 and __float128 are aligned to their + natural boundaries. */ + if (!TARGET_64BIT && mode != TDmode && mode != TFmode) + { + /* i386 ABI defines all arguments to be 4 byte aligned. We have to + make an exception for SSE modes since these require 128bit + alignment. + + The handling here differs from field_alignment. ICC aligns MMX + arguments to 4 byte boundaries, while structure fields are aligned + to 8 byte boundaries. */ + if (!type) + { + if (!(TARGET_SSE && SSE_REG_MODE_P (mode))) + align = PARM_BOUNDARY; + } + else + { + if (!ix86_compat_aligned_value_p (type)) + align = PARM_BOUNDARY; + } + } + if (align > BIGGEST_ALIGNMENT) + align = BIGGEST_ALIGNMENT; + return align; +} + +/* Return true when TYPE should be 128bit aligned for 32bit argument + passing ABI. */ + +static bool +ix86_contains_aligned_value_p (const_tree type) +{ + enum machine_mode mode = TYPE_MODE (type); + + if (mode == XFmode || mode == XCmode) + return false; + + if (TYPE_ALIGN (type) < 128) + return false; + + if (AGGREGATE_TYPE_P (type)) + { + /* Walk the aggregates recursively. */ + switch (TREE_CODE (type)) + { + case RECORD_TYPE: + case UNION_TYPE: + case QUAL_UNION_TYPE: + { + tree field; + + /* Walk all the structure fields. */ + for (field = TYPE_FIELDS (type); + field; + field = DECL_CHAIN (field)) + { + if (TREE_CODE (field) == FIELD_DECL + && ix86_contains_aligned_value_p (TREE_TYPE (field))) + return true; + } + break; + } + + case ARRAY_TYPE: + /* Just for use if some languages passes arrays by value. */ + if (ix86_contains_aligned_value_p (TREE_TYPE (type))) + return true; + break; + + default: + gcc_unreachable (); + } + } + else + return TYPE_ALIGN (type) >= 128; + + return false; +} + +/* Gives the alignment boundary, in bits, of an argument with the + specified mode and type. */ + +static unsigned int +ix86_function_arg_boundary (enum machine_mode mode, const_tree type) +{ + unsigned int align; + if (type) + { + /* Since the main variant type is used for call, we convert it to + the main variant type. 
*/ + type = TYPE_MAIN_VARIANT (type); + align = TYPE_ALIGN (type); + } + else + align = GET_MODE_ALIGNMENT (mode); + if (align < PARM_BOUNDARY) + align = PARM_BOUNDARY; + else + { + static bool warned; + unsigned int saved_align = align; + + if (!TARGET_64BIT) + { + /* i386 ABI defines XFmode arguments to be 4 byte aligned. */ + if (!type) + { + if (mode == XFmode || mode == XCmode) + align = PARM_BOUNDARY; + } + else if (!ix86_contains_aligned_value_p (type)) + align = PARM_BOUNDARY; + + if (align < 128) + align = PARM_BOUNDARY; + } + + if (warn_psabi + && !warned + && align != ix86_compat_function_arg_boundary (mode, type, + saved_align)) + { + warned = true; + inform (input_location, + "The ABI for passing parameters with %d-byte" + " alignment has changed in GCC 4.6", + align / BITS_PER_UNIT); + } + } + + return align; +} + +/* Return true if N is a possible register number of function value. */ + +static bool +ix86_function_value_regno_p (const unsigned int regno) +{ + switch (regno) + { + case 0: + return true; + + case FIRST_FLOAT_REG: + /* TODO: The function should depend on current function ABI but + builtins.c would need updating then. Therefore we use the + default ABI. */ + if (TARGET_64BIT && ix86_abi == MS_ABI) + return false; + return TARGET_FLOAT_RETURNS_IN_80387; + + case FIRST_SSE_REG: + return TARGET_SSE; + + case FIRST_MMX_REG: + if (TARGET_MACHO || TARGET_64BIT) + return false; + return TARGET_MMX; + } + + return false; +} + +/* Define how to find the value returned by a function. + VALTYPE is the data type of the value (as a tree). + If the precise function being called is known, FUNC is its FUNCTION_DECL; + otherwise, FUNC is 0. */ + +static rtx +function_value_32 (enum machine_mode orig_mode, enum machine_mode mode, + const_tree fntype, const_tree fn) +{ + unsigned int regno; + + /* 8-byte vector modes in %mm0. See ix86_return_in_memory for where + we normally prevent this case when mmx is not available. However + some ABIs may require the result to be returned like DImode. */ + if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8) + regno = TARGET_MMX ? FIRST_MMX_REG : 0; + + /* 16-byte vector modes in %xmm0. See ix86_return_in_memory for where + we prevent this case when sse is not available. However some ABIs + may require the result to be returned like integer TImode. */ + else if (mode == TImode + || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16)) + regno = TARGET_SSE ? FIRST_SSE_REG : 0; + + /* 32-byte vector modes in %ymm0. */ + else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 32) + regno = TARGET_AVX ? FIRST_SSE_REG : 0; + + /* Floating point return values in %st(0) (unless -mno-fp-ret-in-387). */ + else if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387) + regno = FIRST_FLOAT_REG; + else + /* Most things go in %eax. */ + regno = AX_REG; + + /* Override FP return register with %xmm0 for local functions when + SSE math is enabled or for functions with sseregparm attribute. */ + if ((fn || fntype) && (mode == SFmode || mode == DFmode)) + { + int sse_level = ix86_function_sseregparm (fntype, fn, false); + if ((sse_level >= 1 && mode == SFmode) + || (sse_level == 2 && mode == DFmode)) + regno = FIRST_SSE_REG; + } + + /* OImode shouldn't be used directly. */ + gcc_assert (mode != OImode); + + return gen_rtx_REG (orig_mode, regno); +} + +static rtx +function_value_64 (enum machine_mode orig_mode, enum machine_mode mode, + const_tree valtype) +{ + rtx ret; + + /* Handle libcalls, which don't provide a type node. 
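+ For example, a DFmode or TFmode libcall result comes back in %xmm0, an
+ XFmode result in %st(0), and plain integer modes in %rax.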
*/
+ if (valtype == NULL)
+ {
+ switch (mode)
+ {
+ case SFmode:
+ case SCmode:
+ case DFmode:
+ case DCmode:
+ case TFmode:
+ case SDmode:
+ case DDmode:
+ case TDmode:
+ return gen_rtx_REG (mode, FIRST_SSE_REG);
+ case XFmode:
+ case XCmode:
+ return gen_rtx_REG (mode, FIRST_FLOAT_REG);
+ case TCmode:
+ return NULL;
+ default:
+ return gen_rtx_REG (mode, AX_REG);
+ }
+ }
+
+ ret = construct_container (mode, orig_mode, valtype, 1,
+ X86_64_REGPARM_MAX, X86_64_SSE_REGPARM_MAX,
+ x86_64_int_return_registers, 0);
+
+ /* For zero sized structures, construct_container returns NULL, but we
+ need to keep the rest of the compiler happy by returning a meaningful
+ value. */
+ if (!ret)
+ ret = gen_rtx_REG (orig_mode, AX_REG);
+
+ return ret;
+}
+
+static rtx
+function_value_ms_64 (enum machine_mode orig_mode, enum machine_mode mode)
+{
+ unsigned int regno = AX_REG;
+
+ if (TARGET_SSE)
+ {
+ switch (GET_MODE_SIZE (mode))
+ {
+ case 16:
+ if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
+ && !COMPLEX_MODE_P (mode))
+ regno = FIRST_SSE_REG;
+ break;
+ case 8:
+ case 4:
+ if (mode == SFmode || mode == DFmode)
+ regno = FIRST_SSE_REG;
+ break;
+ default:
+ break;
+ }
+ }
+ return gen_rtx_REG (orig_mode, regno);
+}
+
+static rtx
+ix86_function_value_1 (const_tree valtype, const_tree fntype_or_decl,
+ enum machine_mode orig_mode, enum machine_mode mode)
+{
+ const_tree fn, fntype;
+
+ fn = NULL_TREE;
+ if (fntype_or_decl && DECL_P (fntype_or_decl))
+ fn = fntype_or_decl;
+ fntype = fn ? TREE_TYPE (fn) : fntype_or_decl;
+
+ if (TARGET_64BIT && ix86_function_type_abi (fntype) == MS_ABI)
+ return function_value_ms_64 (orig_mode, mode);
+ else if (TARGET_64BIT)
+ return function_value_64 (orig_mode, mode, valtype);
+ else
+ return function_value_32 (orig_mode, mode, fntype, fn);
+}
+
+static rtx
+ix86_function_value (const_tree valtype, const_tree fntype_or_decl,
+ bool outgoing ATTRIBUTE_UNUSED)
+{
+ enum machine_mode mode, orig_mode;
+
+ orig_mode = TYPE_MODE (valtype);
+ mode = type_natural_mode (valtype, NULL);
+ return ix86_function_value_1 (valtype, fntype_or_decl, orig_mode, mode);
+}
+
+rtx
+ix86_libcall_value (enum machine_mode mode)
+{
+ return ix86_function_value_1 (NULL, NULL, mode, mode);
+}
+
+/* Return true iff type is returned in memory. */
+
+static bool ATTRIBUTE_UNUSED
+return_in_memory_32 (const_tree type, enum machine_mode mode)
+{
+ HOST_WIDE_INT size;
+
+ if (mode == BLKmode)
+ return true;
+
+ size = int_size_in_bytes (type);
+
+ if (MS_AGGREGATE_RETURN && AGGREGATE_TYPE_P (type) && size <= 8)
+ return false;
+
+ if (VECTOR_MODE_P (mode) || mode == TImode)
+ {
+ /* User-created vectors small enough to fit in EAX. */
+ if (size < 8)
+ return false;
+
+ /* MMX/3dNow values are returned in MM0,
+ except when it doesn't exist or the ABI prescribes otherwise. */
+ if (size == 8)
+ return !TARGET_MMX || TARGET_VECT8_RETURNS;
+
+ /* SSE values are returned in XMM0, except when it doesn't exist. */
+ if (size == 16)
+ return !TARGET_SSE;
+
+ /* AVX values are returned in YMM0, except when it doesn't exist. */
+ if (size == 32)
+ return !TARGET_AVX;
+ }
+
+ if (mode == XFmode)
+ return false;
+
+ if (size > 12)
+ return true;
+
+ /* OImode shouldn't be used directly.
*/
+ gcc_assert (mode != OImode);
+
+ return false;
+}
+
+static bool ATTRIBUTE_UNUSED
+return_in_memory_64 (const_tree type, enum machine_mode mode)
+{
+ int needed_intregs, needed_sseregs;
+ return !examine_argument (mode, type, 1, &needed_intregs, &needed_sseregs);
+}
+
+static bool ATTRIBUTE_UNUSED
+return_in_memory_ms_64 (const_tree type, enum machine_mode mode)
+{
+ HOST_WIDE_INT size = int_size_in_bytes (type);
+
+ /* __m128 is returned in xmm0. */
+ if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
+ && !COMPLEX_MODE_P (mode) && (GET_MODE_SIZE (mode) == 16 || size == 16))
+ return false;
+
+ /* Otherwise, the size must be exactly in [1248]. */
+ return size != 1 && size != 2 && size != 4 && size != 8;
+}
+
+static bool
+ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED)
+{
+#ifdef SUBTARGET_RETURN_IN_MEMORY
+ return SUBTARGET_RETURN_IN_MEMORY (type, fntype);
+#else
+ const enum machine_mode mode = type_natural_mode (type, NULL);
+
+ if (TARGET_64BIT)
+ {
+ if (ix86_function_type_abi (fntype) == MS_ABI)
+ return return_in_memory_ms_64 (type, mode);
+ else
+ return return_in_memory_64 (type, mode);
+ }
+ else
+ return return_in_memory_32 (type, mode);
+#endif
+}
+
+/* When returning SSE vector types, we have a choice of either
+ (1) being abi incompatible with a -march switch, or
+ (2) generating an error.
+ Given no good solution, I think the safest thing is one warning.
+ The user won't be able to use -Werror, but....
+
+ Choose the STRUCT_VALUE_RTX hook because that's (at present) only
+ called in response to actually generating a caller or callee that
+ uses such a type. As opposed to TARGET_RETURN_IN_MEMORY, which is called
+ via aggregate_value_p for general type probing from tree-ssa. */
+
+static rtx
+ix86_struct_value_rtx (tree type, int incoming ATTRIBUTE_UNUSED)
+{
+ static bool warnedsse, warnedmmx;
+
+ if (!TARGET_64BIT && type)
+ {
+ /* Look at the return type of the function, not the function type. */
+ enum machine_mode mode = TYPE_MODE (TREE_TYPE (type));
+
+ if (!TARGET_SSE && !warnedsse)
+ {
+ if (mode == TImode
+ || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
+ {
+ warnedsse = true;
+ warning (0, "SSE vector return without SSE enabled "
+ "changes the ABI");
+ }
+ }
+
+ if (!TARGET_MMX && !warnedmmx)
+ {
+ if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
+ {
+ warnedmmx = true;
+ warning (0, "MMX vector return without MMX enabled "
+ "changes the ABI");
+ }
+ }
+ }
+
+ return NULL;
+}
+
+
+/* Create the va_list data type. */
+
+/* Returns the calling convention specific va_list data type.
+ The argument ABI can be DEFAULT_ABI, MS_ABI, or SYSV_ABI. */
+
+static tree
+ix86_build_builtin_va_list_abi (enum calling_abi abi)
+{
+ tree f_gpr, f_fpr, f_ovf, f_sav, record, type_decl;
+
+ /* For i386 we use a plain pointer to the argument area.
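+ For the 64-bit SysV ABI, va_list is instead a one-element array of a
+ record with gp_offset, fp_offset, overflow_arg_area and reg_save_area
+ fields, built just below.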
*/ + if (!TARGET_64BIT || abi == MS_ABI) + return build_pointer_type (char_type_node); + + record = lang_hooks.types.make_type (RECORD_TYPE); + type_decl = build_decl (BUILTINS_LOCATION, + TYPE_DECL, get_identifier ("__va_list_tag"), record); + + f_gpr = build_decl (BUILTINS_LOCATION, + FIELD_DECL, get_identifier ("gp_offset"), + unsigned_type_node); + f_fpr = build_decl (BUILTINS_LOCATION, + FIELD_DECL, get_identifier ("fp_offset"), + unsigned_type_node); + f_ovf = build_decl (BUILTINS_LOCATION, + FIELD_DECL, get_identifier ("overflow_arg_area"), + ptr_type_node); + f_sav = build_decl (BUILTINS_LOCATION, + FIELD_DECL, get_identifier ("reg_save_area"), + ptr_type_node); + + va_list_gpr_counter_field = f_gpr; + va_list_fpr_counter_field = f_fpr; + + DECL_FIELD_CONTEXT (f_gpr) = record; + DECL_FIELD_CONTEXT (f_fpr) = record; + DECL_FIELD_CONTEXT (f_ovf) = record; + DECL_FIELD_CONTEXT (f_sav) = record; + + TYPE_STUB_DECL (record) = type_decl; + TYPE_NAME (record) = type_decl; + TYPE_FIELDS (record) = f_gpr; + DECL_CHAIN (f_gpr) = f_fpr; + DECL_CHAIN (f_fpr) = f_ovf; + DECL_CHAIN (f_ovf) = f_sav; + + layout_type (record); + + /* The correct type is an array type of one element. */ + return build_array_type (record, build_index_type (size_zero_node)); +} + +/* Setup the builtin va_list data type and for 64-bit the additional + calling convention specific va_list data types. */ + +static tree +ix86_build_builtin_va_list (void) +{ + tree ret = ix86_build_builtin_va_list_abi (ix86_abi); + + /* Initialize abi specific va_list builtin types. */ + if (TARGET_64BIT) + { + tree t; + if (ix86_abi == MS_ABI) + { + t = ix86_build_builtin_va_list_abi (SYSV_ABI); + if (TREE_CODE (t) != RECORD_TYPE) + t = build_variant_type_copy (t); + sysv_va_list_type_node = t; + } + else + { + t = ret; + if (TREE_CODE (t) != RECORD_TYPE) + t = build_variant_type_copy (t); + sysv_va_list_type_node = t; + } + if (ix86_abi != MS_ABI) + { + t = ix86_build_builtin_va_list_abi (MS_ABI); + if (TREE_CODE (t) != RECORD_TYPE) + t = build_variant_type_copy (t); + ms_va_list_type_node = t; + } + else + { + t = ret; + if (TREE_CODE (t) != RECORD_TYPE) + t = build_variant_type_copy (t); + ms_va_list_type_node = t; + } + } + + return ret; +} + +/* Worker function for TARGET_SETUP_INCOMING_VARARGS. */ + +static void +setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum) +{ + rtx save_area, mem; + alias_set_type set; + int i, max; + + /* GPR size of varargs save area. */ + if (cfun->va_list_gpr_size) + ix86_varargs_gpr_size = X86_64_REGPARM_MAX * UNITS_PER_WORD; + else + ix86_varargs_gpr_size = 0; + + /* FPR size of varargs save area. We don't need it if we don't pass + anything in SSE registers. */ + if (TARGET_SSE && cfun->va_list_fpr_size) + ix86_varargs_fpr_size = X86_64_SSE_REGPARM_MAX * 16; + else + ix86_varargs_fpr_size = 0; + + if (! ix86_varargs_gpr_size && ! ix86_varargs_fpr_size) + return; + + save_area = frame_pointer_rtx; + set = get_varargs_alias_set (); + + max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD; + if (max > X86_64_REGPARM_MAX) + max = X86_64_REGPARM_MAX; + + for (i = cum->regno; i < max; i++) + { + mem = gen_rtx_MEM (Pmode, + plus_constant (save_area, i * UNITS_PER_WORD)); + MEM_NOTRAP_P (mem) = 1; + set_mem_alias_set (mem, set); + emit_move_insn (mem, gen_rtx_REG (Pmode, + x86_64_int_parameter_registers[i])); + } + + if (ix86_varargs_fpr_size) + { + enum machine_mode smode; + rtx label, test; + + /* Now emit code to save SSE registers. 
The AX parameter contains number + of SSE parameter registers used to call this function, though all we + actually check here is the zero/non-zero status. */ + + label = gen_label_rtx (); + test = gen_rtx_EQ (VOIDmode, gen_rtx_REG (QImode, AX_REG), const0_rtx); + emit_jump_insn (gen_cbranchqi4 (test, XEXP (test, 0), XEXP (test, 1), + label)); + + /* ??? If !TARGET_SSE_TYPELESS_STORES, would we perform better if + we used movdqa (i.e. TImode) instead? Perhaps even better would + be if we could determine the real mode of the data, via a hook + into pass_stdarg. Ignore all that for now. */ + smode = V4SFmode; + if (crtl->stack_alignment_needed < GET_MODE_ALIGNMENT (smode)) + crtl->stack_alignment_needed = GET_MODE_ALIGNMENT (smode); + + max = cum->sse_regno + cfun->va_list_fpr_size / 16; + if (max > X86_64_SSE_REGPARM_MAX) + max = X86_64_SSE_REGPARM_MAX; + + for (i = cum->sse_regno; i < max; ++i) + { + mem = plus_constant (save_area, i * 16 + ix86_varargs_gpr_size); + mem = gen_rtx_MEM (smode, mem); + MEM_NOTRAP_P (mem) = 1; + set_mem_alias_set (mem, set); + set_mem_align (mem, GET_MODE_ALIGNMENT (smode)); + + emit_move_insn (mem, gen_rtx_REG (smode, SSE_REGNO (i))); + } + + emit_label (label); + } +} + +static void +setup_incoming_varargs_ms_64 (CUMULATIVE_ARGS *cum) +{ + alias_set_type set = get_varargs_alias_set (); + int i; + + for (i = cum->regno; i < X86_64_MS_REGPARM_MAX; i++) + { + rtx reg, mem; + + mem = gen_rtx_MEM (Pmode, + plus_constant (virtual_incoming_args_rtx, + i * UNITS_PER_WORD)); + MEM_NOTRAP_P (mem) = 1; + set_mem_alias_set (mem, set); + + reg = gen_rtx_REG (Pmode, x86_64_ms_abi_int_parameter_registers[i]); + emit_move_insn (mem, reg); + } +} + +static void +ix86_setup_incoming_varargs (CUMULATIVE_ARGS *cum, enum machine_mode mode, + tree type, int *pretend_size ATTRIBUTE_UNUSED, + int no_rtl) +{ + CUMULATIVE_ARGS next_cum; + tree fntype; + + /* This argument doesn't appear to be used anymore. Which is good, + because the old code here didn't suppress rtl generation. */ + gcc_assert (!no_rtl); + + if (!TARGET_64BIT) + return; + + fntype = TREE_TYPE (current_function_decl); + + /* For varargs, we do not want to skip the dummy va_dcl argument. + For stdargs, we do want to skip the last named argument. */ + next_cum = *cum; + if (stdarg_p (fntype)) + ix86_function_arg_advance (&next_cum, mode, type, true); + + if (cum->call_abi == MS_ABI) + setup_incoming_varargs_ms_64 (&next_cum); + else + setup_incoming_varargs_64 (&next_cum); +} + +/* Checks if TYPE is of kind va_list char *. */ + +static bool +is_va_list_char_pointer (tree type) +{ + tree canonic; + + /* For 32-bit it is always true. */ + if (!TARGET_64BIT) + return true; + canonic = ix86_canonical_va_list_type (type); + return (canonic == ms_va_list_type_node + || (ix86_abi == MS_ABI && canonic == va_list_type_node)); +} + +/* Implement va_start. */ + +static void +ix86_va_start (tree valist, rtx nextarg) +{ + HOST_WIDE_INT words, n_gpr, n_fpr; + tree f_gpr, f_fpr, f_ovf, f_sav; + tree gpr, fpr, ovf, sav, t; + tree type; + rtx ovf_rtx; + + if (flag_split_stack + && cfun->machine->split_stack_varargs_pointer == NULL_RTX) + { + unsigned int scratch_regno; + + /* When we are splitting the stack, we can't refer to the stack + arguments using internal_arg_pointer, because they may be on + the old stack. The split stack prologue will arrange to + leave a pointer to the old stack arguments in a scratch + register, which we here copy to a pseudo-register. 
The split + stack prologue can't set the pseudo-register directly because + it (the prologue) runs before any registers have been saved. */ + + scratch_regno = split_stack_prologue_scratch_regno (); + if (scratch_regno != INVALID_REGNUM) + { + rtx reg, seq; + + reg = gen_reg_rtx (Pmode); + cfun->machine->split_stack_varargs_pointer = reg; + + start_sequence (); + emit_move_insn (reg, gen_rtx_REG (Pmode, scratch_regno)); + seq = get_insns (); + end_sequence (); + + push_topmost_sequence (); + emit_insn_after (seq, entry_of_function ()); + pop_topmost_sequence (); + } + } + + /* Only 64bit target needs something special. */ + if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist))) + { + if (cfun->machine->split_stack_varargs_pointer == NULL_RTX) + std_expand_builtin_va_start (valist, nextarg); + else + { + rtx va_r, next; + + va_r = expand_expr (valist, NULL_RTX, VOIDmode, EXPAND_WRITE); + next = expand_binop (ptr_mode, add_optab, + cfun->machine->split_stack_varargs_pointer, + crtl->args.arg_offset_rtx, + NULL_RTX, 0, OPTAB_LIB_WIDEN); + convert_move (va_r, next, 0); + } + return; + } + + f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node)); + f_fpr = DECL_CHAIN (f_gpr); + f_ovf = DECL_CHAIN (f_fpr); + f_sav = DECL_CHAIN (f_ovf); + + valist = build_simple_mem_ref (valist); + TREE_TYPE (valist) = TREE_TYPE (sysv_va_list_type_node); + /* The following should be folded into the MEM_REF offset. */ + gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), unshare_expr (valist), + f_gpr, NULL_TREE); + fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), unshare_expr (valist), + f_fpr, NULL_TREE); + ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), unshare_expr (valist), + f_ovf, NULL_TREE); + sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), unshare_expr (valist), + f_sav, NULL_TREE); + + /* Count number of gp and fp argument registers used. */ + words = crtl->args.info.words; + n_gpr = crtl->args.info.regno; + n_fpr = crtl->args.info.sse_regno; + + if (cfun->va_list_gpr_size) + { + type = TREE_TYPE (gpr); + t = build2 (MODIFY_EXPR, type, + gpr, build_int_cst (type, n_gpr * 8)); + TREE_SIDE_EFFECTS (t) = 1; + expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL); + } + + if (TARGET_SSE && cfun->va_list_fpr_size) + { + type = TREE_TYPE (fpr); + t = build2 (MODIFY_EXPR, type, fpr, + build_int_cst (type, n_fpr * 16 + 8*X86_64_REGPARM_MAX)); + TREE_SIDE_EFFECTS (t) = 1; + expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL); + } + + /* Find the overflow area. */ + type = TREE_TYPE (ovf); + if (cfun->machine->split_stack_varargs_pointer == NULL_RTX) + ovf_rtx = crtl->args.internal_arg_pointer; + else + ovf_rtx = cfun->machine->split_stack_varargs_pointer; + t = make_tree (type, ovf_rtx); + if (words != 0) + t = build2 (POINTER_PLUS_EXPR, type, t, + size_int (words * UNITS_PER_WORD)); + t = build2 (MODIFY_EXPR, type, ovf, t); + TREE_SIDE_EFFECTS (t) = 1; + expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL); + + if (ix86_varargs_gpr_size || ix86_varargs_fpr_size) + { + /* Find the register save area. + Prologue of the function save it right above stack frame. */ + type = TREE_TYPE (sav); + t = make_tree (type, frame_pointer_rtx); + if (!ix86_varargs_gpr_size) + t = build2 (POINTER_PLUS_EXPR, type, t, + size_int (-8 * X86_64_REGPARM_MAX)); + t = build2 (MODIFY_EXPR, type, sav, t); + TREE_SIDE_EFFECTS (t) = 1; + expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL); + } +} + +/* Implement va_arg. 
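+ For the 64-bit SysV case this expands, roughly, to: compare gp_offset /
+ fp_offset against the end of the register save area, load the value from
+ reg_save_area if it still fits in registers, and otherwise take it from
+ overflow_arg_area, bumping the corresponding counter or pointer.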
*/ + +static tree +ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p, + gimple_seq *post_p) +{ + static const int intreg[6] = { 0, 1, 2, 3, 4, 5 }; + tree f_gpr, f_fpr, f_ovf, f_sav; + tree gpr, fpr, ovf, sav, t; + int size, rsize; + tree lab_false, lab_over = NULL_TREE; + tree addr, t2; + rtx container; + int indirect_p = 0; + tree ptrtype; + enum machine_mode nat_mode; + unsigned int arg_boundary; + + /* Only 64bit target needs something special. */ + if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist))) + return std_gimplify_va_arg_expr (valist, type, pre_p, post_p); + + f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node)); + f_fpr = DECL_CHAIN (f_gpr); + f_ovf = DECL_CHAIN (f_fpr); + f_sav = DECL_CHAIN (f_ovf); + + gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), + build_va_arg_indirect_ref (valist), f_gpr, NULL_TREE); + valist = build_va_arg_indirect_ref (valist); + fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE); + ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE); + sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE); + + indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false); + if (indirect_p) + type = build_pointer_type (type); + size = int_size_in_bytes (type); + rsize = (size + UNITS_PER_WORD - 1) / UNITS_PER_WORD; + + nat_mode = type_natural_mode (type, NULL); + switch (nat_mode) + { + case V8SFmode: + case V8SImode: + case V32QImode: + case V16HImode: + case V4DFmode: + case V4DImode: + /* Unnamed 256bit vector mode parameters are passed on stack. */ + if (ix86_cfun_abi () == SYSV_ABI) + { + container = NULL; + break; + } + + default: + container = construct_container (nat_mode, TYPE_MODE (type), + type, 0, X86_64_REGPARM_MAX, + X86_64_SSE_REGPARM_MAX, intreg, + 0); + break; + } + + /* Pull the value out of the saved registers. */ + + addr = create_tmp_var (ptr_type_node, "addr"); + + if (container) + { + int needed_intregs, needed_sseregs; + bool need_temp; + tree int_addr, sse_addr; + + lab_false = create_artificial_label (UNKNOWN_LOCATION); + lab_over = create_artificial_label (UNKNOWN_LOCATION); + + examine_argument (nat_mode, type, 0, &needed_intregs, &needed_sseregs); + + need_temp = (!REG_P (container) + && ((needed_intregs && TYPE_ALIGN (type) > 64) + || TYPE_ALIGN (type) > 128)); + + /* In case we are passing structure, verify that it is consecutive block + on the register save area. If not we need to do moves. */ + if (!need_temp && !REG_P (container)) + { + /* Verify that all registers are strictly consecutive */ + if (SSE_REGNO_P (REGNO (XEXP (XVECEXP (container, 0, 0), 0)))) + { + int i; + + for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++) + { + rtx slot = XVECEXP (container, 0, i); + if (REGNO (XEXP (slot, 0)) != FIRST_SSE_REG + (unsigned int) i + || INTVAL (XEXP (slot, 1)) != i * 16) + need_temp = 1; + } + } + else + { + int i; + + for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++) + { + rtx slot = XVECEXP (container, 0, i); + if (REGNO (XEXP (slot, 0)) != (unsigned int) i + || INTVAL (XEXP (slot, 1)) != i * 8) + need_temp = 1; + } + } + } + if (!need_temp) + { + int_addr = addr; + sse_addr = addr; + } + else + { + int_addr = create_tmp_var (ptr_type_node, "int_addr"); + sse_addr = create_tmp_var (ptr_type_node, "sse_addr"); + } + + /* First ensure that we fit completely in registers. 
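+ That is, branch to lab_false (the stack fallback) when gp_offset or
+ fp_offset is already past the last slot that could still hold all of the
+ needed_intregs or needed_sseregs registers.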
*/ + if (needed_intregs) + { + t = build_int_cst (TREE_TYPE (gpr), + (X86_64_REGPARM_MAX - needed_intregs + 1) * 8); + t = build2 (GE_EXPR, boolean_type_node, gpr, t); + t2 = build1 (GOTO_EXPR, void_type_node, lab_false); + t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE); + gimplify_and_add (t, pre_p); + } + if (needed_sseregs) + { + t = build_int_cst (TREE_TYPE (fpr), + (X86_64_SSE_REGPARM_MAX - needed_sseregs + 1) * 16 + + X86_64_REGPARM_MAX * 8); + t = build2 (GE_EXPR, boolean_type_node, fpr, t); + t2 = build1 (GOTO_EXPR, void_type_node, lab_false); + t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE); + gimplify_and_add (t, pre_p); + } + + /* Compute index to start of area used for integer regs. */ + if (needed_intregs) + { + /* int_addr = gpr + sav; */ + t = fold_convert (sizetype, gpr); + t = build2 (POINTER_PLUS_EXPR, ptr_type_node, sav, t); + gimplify_assign (int_addr, t, pre_p); + } + if (needed_sseregs) + { + /* sse_addr = fpr + sav; */ + t = fold_convert (sizetype, fpr); + t = build2 (POINTER_PLUS_EXPR, ptr_type_node, sav, t); + gimplify_assign (sse_addr, t, pre_p); + } + if (need_temp) + { + int i, prev_size = 0; + tree temp = create_tmp_var (type, "va_arg_tmp"); + + /* addr = &temp; */ + t = build1 (ADDR_EXPR, build_pointer_type (type), temp); + gimplify_assign (addr, t, pre_p); + + for (i = 0; i < XVECLEN (container, 0); i++) + { + rtx slot = XVECEXP (container, 0, i); + rtx reg = XEXP (slot, 0); + enum machine_mode mode = GET_MODE (reg); + tree piece_type; + tree addr_type; + tree daddr_type; + tree src_addr, src; + int src_offset; + tree dest_addr, dest; + int cur_size = GET_MODE_SIZE (mode); + + gcc_assert (prev_size <= INTVAL (XEXP (slot, 1))); + prev_size = INTVAL (XEXP (slot, 1)); + if (prev_size + cur_size > size) + { + cur_size = size - prev_size; + mode = mode_for_size (cur_size * BITS_PER_UNIT, MODE_INT, 1); + if (mode == BLKmode) + mode = QImode; + } + piece_type = lang_hooks.types.type_for_mode (mode, 1); + if (mode == GET_MODE (reg)) + addr_type = build_pointer_type (piece_type); + else + addr_type = build_pointer_type_for_mode (piece_type, ptr_mode, + true); + daddr_type = build_pointer_type_for_mode (piece_type, ptr_mode, + true); + + if (SSE_REGNO_P (REGNO (reg))) + { + src_addr = sse_addr; + src_offset = (REGNO (reg) - FIRST_SSE_REG) * 16; + } + else + { + src_addr = int_addr; + src_offset = REGNO (reg) * 8; + } + src_addr = fold_convert (addr_type, src_addr); + src_addr = fold_build2 (POINTER_PLUS_EXPR, addr_type, src_addr, + size_int (src_offset)); + + dest_addr = fold_convert (daddr_type, addr); + dest_addr = fold_build2 (POINTER_PLUS_EXPR, daddr_type, dest_addr, + size_int (prev_size)); + if (cur_size == GET_MODE_SIZE (mode)) + { + src = build_va_arg_indirect_ref (src_addr); + dest = build_va_arg_indirect_ref (dest_addr); + + gimplify_assign (dest, src, pre_p); + } + else + { + tree copy + = build_call_expr (implicit_built_in_decls[BUILT_IN_MEMCPY], + 3, dest_addr, src_addr, + size_int (cur_size)); + gimplify_and_add (copy, pre_p); + } + prev_size += cur_size; + } + } + + if (needed_intregs) + { + t = build2 (PLUS_EXPR, TREE_TYPE (gpr), gpr, + build_int_cst (TREE_TYPE (gpr), needed_intregs * 8)); + gimplify_assign (gpr, t, pre_p); + } + + if (needed_sseregs) + { + t = build2 (PLUS_EXPR, TREE_TYPE (fpr), fpr, + build_int_cst (TREE_TYPE (fpr), needed_sseregs * 16)); + gimplify_assign (fpr, t, pre_p); + } + + gimple_seq_add_stmt (pre_p, gimple_build_goto (lab_over)); + + gimple_seq_add_stmt (pre_p, gimple_build_label (lab_false)); + } + + /* 
... otherwise out of the overflow area. */ + + /* When we align parameter on stack for caller, if the parameter + alignment is beyond MAX_SUPPORTED_STACK_ALIGNMENT, it will be + aligned at MAX_SUPPORTED_STACK_ALIGNMENT. We will match callee + here with caller. */ + arg_boundary = ix86_function_arg_boundary (VOIDmode, type); + if ((unsigned int) arg_boundary > MAX_SUPPORTED_STACK_ALIGNMENT) + arg_boundary = MAX_SUPPORTED_STACK_ALIGNMENT; + + /* Care for on-stack alignment if needed. */ + if (arg_boundary <= 64 || size == 0) + t = ovf; + else + { + HOST_WIDE_INT align = arg_boundary / 8; + t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (ovf), ovf, + size_int (align - 1)); + t = fold_convert (sizetype, t); + t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t, + size_int (-align)); + t = fold_convert (TREE_TYPE (ovf), t); + } + + gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue); + gimplify_assign (addr, t, pre_p); + + t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (t), t, + size_int (rsize * UNITS_PER_WORD)); + gimplify_assign (unshare_expr (ovf), t, pre_p); + + if (container) + gimple_seq_add_stmt (pre_p, gimple_build_label (lab_over)); + + ptrtype = build_pointer_type_for_mode (type, ptr_mode, true); + addr = fold_convert (ptrtype, addr); + + if (indirect_p) + addr = build_va_arg_indirect_ref (addr); + return build_va_arg_indirect_ref (addr); +} + +/* Return true if OPNUM's MEM should be matched + in movabs* patterns. */ + +bool +ix86_check_movabs (rtx insn, int opnum) +{ + rtx set, mem; + + set = PATTERN (insn); + if (GET_CODE (set) == PARALLEL) + set = XVECEXP (set, 0, 0); + gcc_assert (GET_CODE (set) == SET); + mem = XEXP (set, opnum); + while (GET_CODE (mem) == SUBREG) + mem = SUBREG_REG (mem); + gcc_assert (MEM_P (mem)); + return volatile_ok || !MEM_VOLATILE_P (mem); +} + +/* Initialize the table of extra 80387 mathematical constants. */ + +static void +init_ext_80387_constants (void) +{ + static const char * cst[5] = + { + "0.3010299956639811952256464283594894482", /* 0: fldlg2 */ + "0.6931471805599453094286904741849753009", /* 1: fldln2 */ + "1.4426950408889634073876517827983434472", /* 2: fldl2e */ + "3.3219280948873623478083405569094566090", /* 3: fldl2t */ + "3.1415926535897932385128089594061862044", /* 4: fldpi */ + }; + int i; + + for (i = 0; i < 5; i++) + { + real_from_string (&ext_80387_constants_table[i], cst[i]); + /* Ensure each constant is rounded to XFmode precision. */ + real_convert (&ext_80387_constants_table[i], + XFmode, &ext_80387_constants_table[i]); + } + + ext_80387_constants_init = 1; +} + +/* Return non-zero if the constant is something that + can be loaded with a special instruction. */ + +int +standard_80387_constant_p (rtx x) +{ + enum machine_mode mode = GET_MODE (x); + + REAL_VALUE_TYPE r; + + if (!(X87_FLOAT_MODE_P (mode) && (GET_CODE (x) == CONST_DOUBLE))) + return -1; + + if (x == CONST0_RTX (mode)) + return 1; + if (x == CONST1_RTX (mode)) + return 2; + + REAL_VALUE_FROM_CONST_DOUBLE (r, x); + + /* For XFmode constants, try to find a special 80387 instruction when + optimizing for size or on those CPUs that benefit from them. */ + if (mode == XFmode + && (optimize_function_for_size_p (cfun) || TARGET_EXT_80387_CONSTANTS)) + { + int i; + + if (! ext_80387_constants_init) + init_ext_80387_constants (); + + for (i = 0; i < 5; i++) + if (real_identical (&r, &ext_80387_constants_table[i])) + return i + 3; + } + + /* Load of the constant -0.0 or -1.0 will be split as + fldz;fchs or fld1;fchs sequence. 
*/ + if (real_isnegzero (&r)) + return 8; + if (real_identical (&r, &dconstm1)) + return 9; + + return 0; +} + +/* Return the opcode of the special instruction to be used to load + the constant X. */ + +const char * +standard_80387_constant_opcode (rtx x) +{ + switch (standard_80387_constant_p (x)) + { + case 1: + return "fldz"; + case 2: + return "fld1"; + case 3: + return "fldlg2"; + case 4: + return "fldln2"; + case 5: + return "fldl2e"; + case 6: + return "fldl2t"; + case 7: + return "fldpi"; + case 8: + case 9: + return "#"; + default: + gcc_unreachable (); + } +} + +/* Return the CONST_DOUBLE representing the 80387 constant that is + loaded by the specified special instruction. The argument IDX + matches the return value from standard_80387_constant_p. */ + +rtx +standard_80387_constant_rtx (int idx) +{ + int i; + + if (! ext_80387_constants_init) + init_ext_80387_constants (); + + switch (idx) + { + case 3: + case 4: + case 5: + case 6: + case 7: + i = idx - 3; + break; + + default: + gcc_unreachable (); + } + + return CONST_DOUBLE_FROM_REAL_VALUE (ext_80387_constants_table[i], + XFmode); +} + +/* Return 1 if X is all 0s and 2 if x is all 1s + in supported SSE vector mode. */ + +int +standard_sse_constant_p (rtx x) +{ + enum machine_mode mode = GET_MODE (x); + + if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x))) + return 1; + if (vector_all_ones_operand (x, mode)) + switch (mode) + { + case V16QImode: + case V8HImode: + case V4SImode: + case V2DImode: + if (TARGET_SSE2) + return 2; + default: + break; + } + + return 0; +} + +/* Return the opcode of the special instruction to be used to load + the constant X. */ + +const char * +standard_sse_constant_opcode (rtx insn, rtx x) +{ + switch (standard_sse_constant_p (x)) + { + case 1: + switch (get_attr_mode (insn)) + { + case MODE_V4SF: + return TARGET_AVX ? "vxorps\t%0, %0, %0" : "xorps\t%0, %0"; + case MODE_V2DF: + if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL) + return TARGET_AVX ? "vxorps\t%0, %0, %0" : "xorps\t%0, %0"; + else + return TARGET_AVX ? "vxorpd\t%0, %0, %0" : "xorpd\t%0, %0"; + case MODE_TI: + if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL) + return TARGET_AVX ? "vxorps\t%0, %0, %0" : "xorps\t%0, %0"; + else + return TARGET_AVX ? "vpxor\t%0, %0, %0" : "pxor\t%0, %0"; + case MODE_V8SF: + return "vxorps\t%x0, %x0, %x0"; + case MODE_V4DF: + if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL) + return "vxorps\t%x0, %x0, %x0"; + else + return "vxorpd\t%x0, %x0, %x0"; + case MODE_OI: + if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL) + return "vxorps\t%x0, %x0, %x0"; + else + return "vpxor\t%x0, %x0, %x0"; + default: + break; + } + case 2: + return TARGET_AVX ? "vpcmpeqd\t%0, %0, %0" : "pcmpeqd\t%0, %0"; + default: + break; + } + gcc_unreachable (); +} + +/* Returns true if OP contains a symbol reference */ + +bool +symbolic_reference_mentioned_p (rtx op) +{ + const char *fmt; + int i; + + if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF) + return true; + + fmt = GET_RTX_FORMAT (GET_CODE (op)); + for (i = GET_RTX_LENGTH (GET_CODE (op)) - 1; i >= 0; i--) + { + if (fmt[i] == 'E') + { + int j; + + for (j = XVECLEN (op, i) - 1; j >= 0; j--) + if (symbolic_reference_mentioned_p (XVECEXP (op, i, j))) + return true; + } + + else if (fmt[i] == 'e' && symbolic_reference_mentioned_p (XEXP (op, i))) + return true; + } + + return false; +} + +/* Return true if it is appropriate to emit `ret' instructions in the + body of a function. Do this only if the epilogue is simple, needing a + couple of insns. 
Prior to reloading, we can't tell how many registers + must be saved, so return false then. Return false if there is no frame + marker to de-allocate. */ + +bool +ix86_can_use_return_insn_p (void) +{ + struct ix86_frame frame; + + if (! reload_completed || frame_pointer_needed) + return 0; + + /* Don't allow more than 32k pop, since that's all we can do + with one instruction. */ + if (crtl->args.pops_args && crtl->args.size >= 32768) + return 0; + + ix86_compute_frame_layout (&frame); + return (frame.stack_pointer_offset == UNITS_PER_WORD + && (frame.nregs + frame.nsseregs) == 0); +} + +/* Value should be nonzero if functions must have frame pointers. + Zero means the frame pointer need not be set up (and parms may + be accessed via the stack pointer) in functions that seem suitable. */ + +static bool +ix86_frame_pointer_required (void) +{ + /* If we accessed previous frames, then the generated code expects + to be able to access the saved ebp value in our frame. */ + if (cfun->machine->accesses_prev_frame) + return true; + + /* Several x86 os'es need a frame pointer for other reasons, + usually pertaining to setjmp. */ + if (SUBTARGET_FRAME_POINTER_REQUIRED) + return true; + + /* In ix86_option_override_internal, TARGET_OMIT_LEAF_FRAME_POINTER + turns off the frame pointer by default. Turn it back on now if + we've not got a leaf function. */ + if (TARGET_OMIT_LEAF_FRAME_POINTER + && (!current_function_is_leaf + || ix86_current_function_calls_tls_descriptor)) + return true; + + if (crtl->profile && !flag_fentry) + return true; + + return false; +} + +/* Record that the current function accesses previous call frames. */ + +void +ix86_setup_frame_addresses (void) +{ + cfun->machine->accesses_prev_frame = 1; +} + +#ifndef USE_HIDDEN_LINKONCE +# if (defined(HAVE_GAS_HIDDEN) && (SUPPORTS_ONE_ONLY - 0)) || TARGET_MACHO +# define USE_HIDDEN_LINKONCE 1 +# else +# define USE_HIDDEN_LINKONCE 0 +# endif +#endif + +static int pic_labels_used; + +/* Fills in the label name that should be used for a pc thunk for + the given register. */ + +static void +get_pc_thunk_name (char name[32], unsigned int regno) +{ + gcc_assert (!TARGET_64BIT); + + if (USE_HIDDEN_LINKONCE) + sprintf (name, "__i686.get_pc_thunk.%s", reg_names[regno]); + else + ASM_GENERATE_INTERNAL_LABEL (name, "LPR", regno); +} + + +/* This function generates code for -fpic that loads %ebx with + the return address of the caller and then returns. 
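Illustrative aside: a minimal standalone sketch of the thunk naming convention used just above; format_pc_thunk_name and its register-name string argument are hypothetical stand-ins, not GCC interfaces.

#include <stdio.h>

/* Hypothetical helper mirroring the "__i686.get_pc_thunk.%s" pattern;
   REG is the low-level register name such as "bx" or "cx".  */
static void
format_pc_thunk_name (char name[32], const char *reg)
{
  snprintf (name, 32, "__i686.get_pc_thunk.%s", reg);
}

int
main (void)
{
  char buf[32];

  format_pc_thunk_name (buf, "bx");
  puts (buf);   /* prints __i686.get_pc_thunk.bx */
  return 0;
}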
*/ + +static void +ix86_code_end (void) +{ + rtx xops[2]; + int regno; + + for (regno = AX_REG; regno <= SP_REG; regno++) + { + char name[32]; + tree decl; + + if (!(pic_labels_used & (1 << regno))) + continue; + + get_pc_thunk_name (name, regno); + + decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL, + get_identifier (name), + build_function_type (void_type_node, void_list_node)); + DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL, + NULL_TREE, void_type_node); + TREE_PUBLIC (decl) = 1; + TREE_STATIC (decl) = 1; + +#if TARGET_MACHO + if (TARGET_MACHO) + { + switch_to_section (darwin_sections[text_coal_section]); + fputs ("\t.weak_definition\t", asm_out_file); + assemble_name (asm_out_file, name); + fputs ("\n\t.private_extern\t", asm_out_file); + assemble_name (asm_out_file, name); + putc ('\n', asm_out_file); + ASM_OUTPUT_LABEL (asm_out_file, name); + DECL_WEAK (decl) = 1; + } + else +#endif + if (USE_HIDDEN_LINKONCE) + { + DECL_COMDAT_GROUP (decl) = DECL_ASSEMBLER_NAME (decl); + + targetm.asm_out.unique_section (decl, 0); + switch_to_section (get_named_section (decl, NULL, 0)); + + targetm.asm_out.globalize_label (asm_out_file, name); + fputs ("\t.hidden\t", asm_out_file); + assemble_name (asm_out_file, name); + putc ('\n', asm_out_file); + ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl); + } + else + { + switch_to_section (text_section); + ASM_OUTPUT_LABEL (asm_out_file, name); + } + + DECL_INITIAL (decl) = make_node (BLOCK); + current_function_decl = decl; + init_function_start (decl); + first_function_block_is_cold = false; + /* Make sure unwind info is emitted for the thunk if needed. */ + final_start_function (emit_barrier (), asm_out_file, 1); + + /* Pad stack IP move with 4 instructions (two NOPs count + as one instruction). */ + if (TARGET_PAD_SHORT_FUNCTION) + { + int i = 8; + + while (i--) + fputs ("\tnop\n", asm_out_file); + } + + xops[0] = gen_rtx_REG (Pmode, regno); + xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx); + output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops); + fputs ("\tret\n", asm_out_file); + final_end_function (); + init_insn_lengths (); + free_after_compilation (cfun); + set_cfun (NULL); + current_function_decl = NULL; + } + + if (flag_split_stack) + file_end_indicate_split_stack (); +} + +/* Emit code for the SET_GOT patterns. */ + +const char * +output_set_got (rtx dest, rtx label ATTRIBUTE_UNUSED) +{ + rtx xops[3]; + + xops[0] = dest; + + if (TARGET_VXWORKS_RTP && flag_pic) + { + /* Load (*VXWORKS_GOTT_BASE) into the PIC register. */ + xops[2] = gen_rtx_MEM (Pmode, + gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE)); + output_asm_insn ("mov{l}\t{%2, %0|%0, %2}", xops); + + /* Load (*VXWORKS_GOTT_BASE)[VXWORKS_GOTT_INDEX] into the PIC register. + Use %P and a local symbol in order to print VXWORKS_GOTT_INDEX as + an unadorned address. */ + xops[2] = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_INDEX); + SYMBOL_REF_FLAGS (xops[2]) |= SYMBOL_FLAG_LOCAL; + output_asm_insn ("mov{l}\t{%P2(%0), %0|%0, DWORD PTR %P2[%0]}", xops); + return ""; + } + + xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME); + + if (! TARGET_DEEP_BRANCH_PREDICTION || !flag_pic) + { + xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ()); + + if (!flag_pic) + output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops); + else + { + output_asm_insn ("call\t%a2", xops); +#ifdef DWARF2_UNWIND_INFO + /* The call to next label acts as a push. 
*/ + if (dwarf2out_do_frame ()) + { + rtx insn; + start_sequence (); + insn = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx, + gen_rtx_PLUS (Pmode, + stack_pointer_rtx, + GEN_INT (-4)))); + RTX_FRAME_RELATED_P (insn) = 1; + dwarf2out_frame_debug (insn, true); + end_sequence (); + } +#endif + } + +#if TARGET_MACHO + /* Output the Mach-O "canonical" label name ("Lxx$pb") here too. This + is what will be referenced by the Mach-O PIC subsystem. */ + if (!label) + ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME); +#endif + + targetm.asm_out.internal_label (asm_out_file, "L", + CODE_LABEL_NUMBER (XEXP (xops[2], 0))); + + if (flag_pic) + { + output_asm_insn ("pop%z0\t%0", xops); +#ifdef DWARF2_UNWIND_INFO + /* The pop is a pop and clobbers dest, but doesn't restore it + for unwind info purposes. */ + if (dwarf2out_do_frame ()) + { + rtx insn; + start_sequence (); + insn = emit_insn (gen_rtx_SET (VOIDmode, dest, const0_rtx)); + dwarf2out_frame_debug (insn, true); + insn = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx, + gen_rtx_PLUS (Pmode, + stack_pointer_rtx, + GEN_INT (4)))); + RTX_FRAME_RELATED_P (insn) = 1; + dwarf2out_frame_debug (insn, true); + end_sequence (); + } +#endif + } + } + else + { + char name[32]; + get_pc_thunk_name (name, REGNO (dest)); + pic_labels_used |= 1 << REGNO (dest); + +#ifdef DWARF2_UNWIND_INFO + /* Ensure all queued register saves are flushed before the + call. */ + if (dwarf2out_do_frame ()) + dwarf2out_flush_queued_reg_saves (); +#endif + xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name)); + xops[2] = gen_rtx_MEM (QImode, xops[2]); + output_asm_insn ("call\t%X2", xops); + /* Output the Mach-O "canonical" label name ("Lxx$pb") here too. This + is what will be referenced by the Mach-O PIC subsystem. */ +#if TARGET_MACHO + if (!label) + ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME); + else + targetm.asm_out.internal_label (asm_out_file, "L", + CODE_LABEL_NUMBER (label)); +#endif + } + + if (TARGET_MACHO) + return ""; + + if (!flag_pic || TARGET_DEEP_BRANCH_PREDICTION) + output_asm_insn ("add%z0\t{%1, %0|%0, %1}", xops); + else + output_asm_insn ("add%z0\t{%1+[.-%a2], %0|%0, %1+(.-%a2)}", xops); + + return ""; +} + +/* Generate an "push" pattern for input ARG. */ + +static rtx +gen_push (rtx arg) +{ + struct machine_function *m = cfun->machine; + + if (m->fs.cfa_reg == stack_pointer_rtx) + m->fs.cfa_offset += UNITS_PER_WORD; + m->fs.sp_offset += UNITS_PER_WORD; + + return gen_rtx_SET (VOIDmode, + gen_rtx_MEM (Pmode, + gen_rtx_PRE_DEC (Pmode, + stack_pointer_rtx)), + arg); +} + +/* Generate an "pop" pattern for input ARG. */ + +static rtx +gen_pop (rtx arg) +{ + return gen_rtx_SET (VOIDmode, + arg, + gen_rtx_MEM (Pmode, + gen_rtx_POST_INC (Pmode, + stack_pointer_rtx))); +} + +/* Return >= 0 if there is an unused call-clobbered register available + for the entire function. */ + +static unsigned int +ix86_select_alt_pic_regnum (void) +{ + if (current_function_is_leaf + && !crtl->profile + && !ix86_current_function_calls_tls_descriptor) + { + int i, drap; + /* Can't use the same register for both PIC and DRAP. */ + if (crtl->drap_reg) + drap = REGNO (crtl->drap_reg); + else + drap = -1; + for (i = 2; i >= 0; --i) + if (i != drap && !df_regs_ever_live_p (i)) + return i; + } + + return INVALID_REGNUM; +} + +/* Return 1 if we need to save REGNO. 
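Illustrative aside: stripped of the PIC, EH-return and DRAP special cases handled below, the core predicate is "the register is written somewhere, it is callee-saved rather than call-clobbered or fixed, and it is not the hard frame pointer when the prologue already maintains one". A standalone C model, with hypothetical tables standing in for the dataflow and target information:

#include <stdbool.h>

#define NREGS 8   /* illustrative: just the eight 32-bit integer registers */

/* Hypothetical stand-ins for df_regs_ever_live_p, call_used_regs and
   fixed_regs; in the compiler these come from dataflow and target tables.
   Illustrative order: ax dx cx bx sp si di bp.  */
static bool regs_ever_live[NREGS];
static const bool call_used[NREGS] = { 1, 1, 1, 0, 1, 0, 0, 0 };
static const bool fixed[NREGS]     = { 0, 0, 0, 0, 1, 0, 0, 0 };

static bool
must_save_reg (int regno, int hard_frame_pointer, bool frame_pointer_needed)
{
  return regs_ever_live[regno]
         && !call_used[regno]
         && !fixed[regno]
         && (regno != hard_frame_pointer || !frame_pointer_needed);
}

int
main (void)
{
  regs_ever_live[3] = true;                  /* pretend ebx is written */
  return must_save_reg (3, 7, true) ? 0 : 1; /* callee-saved, so save it */
}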
*/ +static int +ix86_save_reg (unsigned int regno, int maybe_eh_return) +{ + if (pic_offset_table_rtx + && regno == REAL_PIC_OFFSET_TABLE_REGNUM + && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM) + || crtl->profile + || crtl->calls_eh_return + || crtl->uses_const_pool)) + { + if (ix86_select_alt_pic_regnum () != INVALID_REGNUM) + return 0; + return 1; + } + + if (crtl->calls_eh_return && maybe_eh_return) + { + unsigned i; + for (i = 0; ; i++) + { + unsigned test = EH_RETURN_DATA_REGNO (i); + if (test == INVALID_REGNUM) + break; + if (test == regno) + return 1; + } + } + + if (crtl->drap_reg && regno == REGNO (crtl->drap_reg)) + return 1; + + return (df_regs_ever_live_p (regno) + && !call_used_regs[regno] + && !fixed_regs[regno] + && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed)); +} + +/* Return number of saved general prupose registers. */ + +static int +ix86_nsaved_regs (void) +{ + int nregs = 0; + int regno; + + for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) + if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true)) + nregs ++; + return nregs; +} + +/* Return number of saved SSE registrers. */ + +static int +ix86_nsaved_sseregs (void) +{ + int nregs = 0; + int regno; + + if (ix86_cfun_abi () != MS_ABI) + return 0; + for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) + if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true)) + nregs ++; + return nregs; +} + +/* Given FROM and TO register numbers, say whether this elimination is + allowed. If stack alignment is needed, we can only replace argument + pointer with hard frame pointer, or replace frame pointer with stack + pointer. Otherwise, frame pointer elimination is automatically + handled and all other eliminations are valid. */ + +static bool +ix86_can_eliminate (const int from, const int to) +{ + if (stack_realign_fp) + return ((from == ARG_POINTER_REGNUM + && to == HARD_FRAME_POINTER_REGNUM) + || (from == FRAME_POINTER_REGNUM + && to == STACK_POINTER_REGNUM)); + else + return to == STACK_POINTER_REGNUM ? !frame_pointer_needed : true; +} + +/* Return the offset between two registers, one to be eliminated, and the other + its replacement, at the start of a routine. */ + +HOST_WIDE_INT +ix86_initial_elimination_offset (int from, int to) +{ + struct ix86_frame frame; + ix86_compute_frame_layout (&frame); + + if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM) + return frame.hard_frame_pointer_offset; + else if (from == FRAME_POINTER_REGNUM + && to == HARD_FRAME_POINTER_REGNUM) + return frame.hard_frame_pointer_offset - frame.frame_pointer_offset; + else + { + gcc_assert (to == STACK_POINTER_REGNUM); + + if (from == ARG_POINTER_REGNUM) + return frame.stack_pointer_offset; + + gcc_assert (from == FRAME_POINTER_REGNUM); + return frame.stack_pointer_offset - frame.frame_pointer_offset; + } +} + +/* In a dynamically-aligned function, we can't know the offset from + stack pointer to frame pointer, so we must ensure that setjmp + eliminates fp against the hard fp (%ebp) rather than trying to + index from %esp up to the top of the frame across a gap that is + of unknown (at compile-time) size. */ +static rtx +ix86_builtin_setjmp_frame_value (void) +{ + return stack_realign_fp ? hard_frame_pointer_rtx : virtual_stack_vars_rtx; +} + +/* On the x86 -fsplit-stack and -fstack-protector both use the same + field in the TCB, so they can not be used together. 
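Illustrative aside: the elimination offsets above are plain differences between the anchors that ix86_compute_frame_layout fills in later. A standalone sketch with made-up example numbers (not real compiler output):

#include <stdio.h>

/* Made-up example layout, in bytes from the argument pointer: return
   address, saved %ebp, three saved registers, 32 bytes of locals.  */
struct frame_sketch
{
  long hard_frame_pointer_offset;  /* where the saved %ebp ends up      */
  long frame_pointer_offset;       /* soft frame pointer (locals start) */
  long stack_pointer_offset;       /* final %esp after the prologue     */
};

static long
elimination_offset (const struct frame_sketch *f, int from_arg, int to_hfp)
{
  if (from_arg && to_hfp)
    return f->hard_frame_pointer_offset;
  if (!from_arg && to_hfp)
    return f->hard_frame_pointer_offset - f->frame_pointer_offset;
  if (from_arg)                    /* arg pointer -> stack pointer */
    return f->stack_pointer_offset;
  return f->stack_pointer_offset - f->frame_pointer_offset;
}

int
main (void)
{
  struct frame_sketch f = { 8, 20, 52 };

  printf ("argp -> hfp : %ld\n", elimination_offset (&f, 1, 1));  /* 8   */
  printf ("fp   -> hfp : %ld\n", elimination_offset (&f, 0, 1));  /* -12 */
  printf ("argp -> sp  : %ld\n", elimination_offset (&f, 1, 0));  /* 52  */
  printf ("fp   -> sp  : %ld\n", elimination_offset (&f, 0, 0));  /* 32  */
  return 0;
}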
*/ + +static bool +ix86_supports_split_stack (bool report ATTRIBUTE_UNUSED, + struct gcc_options *opts ATTRIBUTE_UNUSED) +{ + bool ret = true; + +#ifndef TARGET_THREAD_SPLIT_STACK_OFFSET + if (report) + error ("%<-fsplit-stack%> currently only supported on GNU/Linux"); + ret = false; +#else + if (!HAVE_GAS_CFI_PERSONALITY_DIRECTIVE) + { + if (report) + error ("%<-fsplit-stack%> requires " + "assembler support for CFI directives"); + ret = false; + } +#endif + + return ret; +} + +/* When using -fsplit-stack, the allocation routines set a field in + the TCB to the bottom of the stack plus this much space, measured + in bytes. */ + +#define SPLIT_STACK_AVAILABLE 256 + +/* Fill structure ix86_frame about frame of currently computed function. */ + +static void +ix86_compute_frame_layout (struct ix86_frame *frame) +{ + unsigned int stack_alignment_needed; + HOST_WIDE_INT offset; + unsigned int preferred_alignment; + HOST_WIDE_INT size = get_frame_size (); + HOST_WIDE_INT to_allocate; + + frame->nregs = ix86_nsaved_regs (); + frame->nsseregs = ix86_nsaved_sseregs (); + + stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT; + preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT; + + /* MS ABI seem to require stack alignment to be always 16 except for function + prologues and leaf. */ + if ((ix86_cfun_abi () == MS_ABI && preferred_alignment < 16) + && (!current_function_is_leaf || cfun->calls_alloca != 0 + || ix86_current_function_calls_tls_descriptor)) + { + preferred_alignment = 16; + stack_alignment_needed = 16; + crtl->preferred_stack_boundary = 128; + crtl->stack_alignment_needed = 128; + } + + gcc_assert (!size || stack_alignment_needed); + gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT); + gcc_assert (preferred_alignment <= stack_alignment_needed); + + /* For SEH we have to limit the amount of code movement into the prologue. + At present we do this via a BLOCKAGE, at which point there's very little + scheduling that can be done, which means that there's very little point + in doing anything except PUSHs. */ + if (TARGET_SEH) + cfun->machine->use_fast_prologue_epilogue = false; + + /* During reload iteration the amount of registers saved can change. + Recompute the value as needed. Do not recompute when amount of registers + didn't change as reload does multiple calls to the function and does not + expect the decision to change within single iteration. */ + else if (!optimize_function_for_size_p (cfun) + && cfun->machine->use_fast_prologue_epilogue_nregs != frame->nregs) + { + int count = frame->nregs; + struct cgraph_node *node = cgraph_node (current_function_decl); + + cfun->machine->use_fast_prologue_epilogue_nregs = count; + + /* The fast prologue uses move instead of push to save registers. This + is significantly longer, but also executes faster as modern hardware + can execute the moves in parallel, but can't do that for push/pop. + + Be careful about choosing what prologue to emit: When function takes + many instructions to execute we may use slow version as well as in + case function is known to be outside hot spot (this is known with + feedback only). Weight the size of function by number of registers + to save as it is cheap to use one or two push instructions but very + slow to use many of them. 
*/ + if (count) + count = (count - 1) * FAST_PROLOGUE_INSN_COUNT; + if (node->frequency < NODE_FREQUENCY_NORMAL + || (flag_branch_probabilities + && node->frequency < NODE_FREQUENCY_HOT)) + cfun->machine->use_fast_prologue_epilogue = false; + else + cfun->machine->use_fast_prologue_epilogue + = !expensive_function_p (count); + } + if (TARGET_PROLOGUE_USING_MOVE + && cfun->machine->use_fast_prologue_epilogue) + frame->save_regs_using_mov = true; + else + frame->save_regs_using_mov = false; + + /* If static stack checking is enabled and done with probes, the registers + need to be saved before allocating the frame. */ + if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK) + frame->save_regs_using_mov = false; + + /* Skip return address. */ + offset = UNITS_PER_WORD; + + /* Skip pushed static chain. */ + if (ix86_static_chain_on_stack) + offset += UNITS_PER_WORD; + + /* Skip saved base pointer. */ + if (frame_pointer_needed) + offset += UNITS_PER_WORD; + frame->hfp_save_offset = offset; + + /* The traditional frame pointer location is at the top of the frame. */ + frame->hard_frame_pointer_offset = offset; + + /* Register save area */ + offset += frame->nregs * UNITS_PER_WORD; + frame->reg_save_offset = offset; + + /* Align and set SSE register save area. */ + if (frame->nsseregs) + { + /* The only ABI that has saved SSE registers (Win64) also has a + 16-byte aligned default stack, and thus we don't need to be + within the re-aligned local stack frame to save them. */ + gcc_assert (INCOMING_STACK_BOUNDARY >= 128); + offset = (offset + 16 - 1) & -16; + offset += frame->nsseregs * 16; + } + frame->sse_reg_save_offset = offset; + + /* The re-aligned stack starts here. Values before this point are not + directly comparable with values below this point. In order to make + sure that no value happens to be the same before and after, force + the alignment computation below to add a non-zero value. */ + if (stack_realign_fp) + offset = (offset + stack_alignment_needed) & -stack_alignment_needed; + + /* Va-arg area */ + frame->va_arg_size = ix86_varargs_gpr_size + ix86_varargs_fpr_size; + offset += frame->va_arg_size; + + /* Align start of frame for local function. */ + if (stack_realign_fp + || offset != frame->sse_reg_save_offset + || size != 0 + || !current_function_is_leaf + || cfun->calls_alloca + || ix86_current_function_calls_tls_descriptor) + offset = (offset + stack_alignment_needed - 1) & -stack_alignment_needed; + + /* Frame pointer points here. */ + frame->frame_pointer_offset = offset; + + offset += size; + + /* Add outgoing arguments area. Can be skipped if we eliminated + all the function calls as dead code. + Skipping is however impossible when function calls alloca. Alloca + expander assumes that last crtl->outgoing_args_size + of stack frame are unused. */ + if (ACCUMULATE_OUTGOING_ARGS + && (!current_function_is_leaf || cfun->calls_alloca + || ix86_current_function_calls_tls_descriptor)) + { + offset += crtl->outgoing_args_size; + frame->outgoing_arguments_size = crtl->outgoing_args_size; + } + else + frame->outgoing_arguments_size = 0; + + /* Align stack boundary. Only needed if we're calling another function + or using alloca. */ + if (!current_function_is_leaf || cfun->calls_alloca + || ix86_current_function_calls_tls_descriptor) + offset = (offset + preferred_alignment - 1) & -preferred_alignment; + + /* We've reached end of stack frame. */ + frame->stack_pointer_offset = offset; + + /* Size prologue needs to allocate. 
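Illustrative aside: the layout code below repeatedly rounds an offset up with the power-of-two idiom (offset + align - 1) & -align. A minimal standalone sketch, assuming the alignment is a power of two:

#include <assert.h>
#include <stdio.h>

/* Round OFFSET up to the next multiple of ALIGN, which must be a power
   of two; -ALIGN is a mask with the low log2(ALIGN) bits clear.  */
static long
align_up (long offset, long align)
{
  assert (align > 0 && (align & (align - 1)) == 0);
  return (offset + align - 1) & -align;
}

int
main (void)
{
  printf ("%ld\n", align_up (36, 16));   /* 48 */
  printf ("%ld\n", align_up (48, 16));   /* 48: already aligned */
  return 0;
}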
*/ + to_allocate = offset - frame->sse_reg_save_offset; + + if ((!to_allocate && frame->nregs <= 1) + || (TARGET_64BIT && to_allocate >= (HOST_WIDE_INT) 0x80000000)) + frame->save_regs_using_mov = false; + + if (ix86_using_red_zone () + && current_function_sp_is_unchanging + && current_function_is_leaf + && !ix86_current_function_calls_tls_descriptor) + { + frame->red_zone_size = to_allocate; + if (frame->save_regs_using_mov) + frame->red_zone_size += frame->nregs * UNITS_PER_WORD; + if (frame->red_zone_size > RED_ZONE_SIZE - RED_ZONE_RESERVE) + frame->red_zone_size = RED_ZONE_SIZE - RED_ZONE_RESERVE; + } + else + frame->red_zone_size = 0; + frame->stack_pointer_offset -= frame->red_zone_size; + + /* The SEH frame pointer location is near the bottom of the frame. + This is enforced by the fact that the difference between the + stack pointer and the frame pointer is limited to 240 bytes in + the unwind data structure. */ + if (TARGET_SEH) + { + HOST_WIDE_INT diff; + + /* If we can leave the frame pointer where it is, do so. */ + diff = frame->stack_pointer_offset - frame->hard_frame_pointer_offset; + if (diff > 240 || (diff & 15) != 0) + { + /* Ideally we'd determine what portion of the local stack frame + (within the constraint of the lowest 240) is most heavily used. + But without that complication, simply bias the frame pointer + by 128 bytes so as to maximize the amount of the local stack + frame that is addressable with 8-bit offsets. */ + frame->hard_frame_pointer_offset = frame->stack_pointer_offset - 128; + } + } +} + +/* This is semi-inlined memory_address_length, but simplified + since we know that we're always dealing with reg+offset, and + to avoid having to create and discard all that rtl. */ + +static inline int +choose_baseaddr_len (unsigned int regno, HOST_WIDE_INT offset) +{ + int len = 4; + + if (offset == 0) + { + /* EBP and R13 cannot be encoded without an offset. */ + len = (regno == BP_REG || regno == R13_REG); + } + else if (IN_RANGE (offset, -128, 127)) + len = 1; + + /* ESP and R12 must be encoded with a SIB byte. */ + if (regno == SP_REG || regno == R12_REG) + len++; + + return len; +} + +/* Return an RTX that points to CFA_OFFSET within the stack frame. + The valid base registers are taken from CFUN->MACHINE->FS. */ + +static rtx +choose_baseaddr (HOST_WIDE_INT cfa_offset) +{ + const struct machine_function *m = cfun->machine; + rtx base_reg = NULL; + HOST_WIDE_INT base_offset = 0; + + if (m->use_fast_prologue_epilogue) + { + /* Choose the base register most likely to allow the most scheduling + opportunities. Generally FP is valid througout the function, + while DRAP must be reloaded within the epilogue. But choose either + over the SP due to increased encoding size. */ + + if (m->fs.fp_valid) + { + base_reg = hard_frame_pointer_rtx; + base_offset = m->fs.fp_offset - cfa_offset; + } + else if (m->fs.drap_valid) + { + base_reg = crtl->drap_reg; + base_offset = 0 - cfa_offset; + } + else if (m->fs.sp_valid) + { + base_reg = stack_pointer_rtx; + base_offset = m->fs.sp_offset - cfa_offset; + } + } + else + { + HOST_WIDE_INT toffset; + int len = 16, tlen; + + /* Choose the base register with the smallest address encoding. + With a tie, choose FP > DRAP > SP. 
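Illustrative aside: the length heuristic follows the x86 addressing-mode encoding rules. A standalone restatement in plain C, taking flags instead of hard register numbers:

#include <stdio.h>

/* Approximate encoding cost in bytes of a reg+offset address: no
   displacement when the offset is zero (except via EBP/R13, which always
   need one byte), one byte for signed 8-bit offsets, four bytes otherwise;
   ESP/R12 additionally force a SIB byte.  */
static int
baseaddr_encoding_len (int is_bp_or_r13, int is_sp_or_r12, long offset)
{
  int len = 4;

  if (offset == 0)
    len = is_bp_or_r13 ? 1 : 0;
  else if (offset >= -128 && offset <= 127)
    len = 1;

  if (is_sp_or_r12)
    len++;

  return len;
}

int
main (void)
{
  printf ("%d\n", baseaddr_encoding_len (0, 0, 0));    /* 0 */
  printf ("%d\n", baseaddr_encoding_len (1, 0, 0));    /* 1 */
  printf ("%d\n", baseaddr_encoding_len (0, 1, 64));   /* 2 */
  printf ("%d\n", baseaddr_encoding_len (0, 0, 4096)); /* 4 */
  return 0;
}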
*/ + if (m->fs.sp_valid) + { + base_reg = stack_pointer_rtx; + base_offset = m->fs.sp_offset - cfa_offset; + len = choose_baseaddr_len (STACK_POINTER_REGNUM, base_offset); + } + if (m->fs.drap_valid) + { + toffset = 0 - cfa_offset; + tlen = choose_baseaddr_len (REGNO (crtl->drap_reg), toffset); + if (tlen <= len) + { + base_reg = crtl->drap_reg; + base_offset = toffset; + len = tlen; + } + } + if (m->fs.fp_valid) + { + toffset = m->fs.fp_offset - cfa_offset; + tlen = choose_baseaddr_len (HARD_FRAME_POINTER_REGNUM, toffset); + if (tlen <= len) + { + base_reg = hard_frame_pointer_rtx; + base_offset = toffset; + len = tlen; + } + } + } + gcc_assert (base_reg != NULL); + + return plus_constant (base_reg, base_offset); +} + +/* Emit code to save registers in the prologue. */ + +static void +ix86_emit_save_regs (void) +{ + unsigned int regno; + rtx insn; + + for (regno = FIRST_PSEUDO_REGISTER - 1; regno-- > 0; ) + if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true)) + { + insn = emit_insn (gen_push (gen_rtx_REG (Pmode, regno))); + RTX_FRAME_RELATED_P (insn) = 1; + } +} + +/* Emit a single register save at CFA - CFA_OFFSET. */ + +static void +ix86_emit_save_reg_using_mov (enum machine_mode mode, unsigned int regno, + HOST_WIDE_INT cfa_offset) +{ + struct machine_function *m = cfun->machine; + rtx reg = gen_rtx_REG (mode, regno); + rtx mem, addr, base, insn; + + addr = choose_baseaddr (cfa_offset); + mem = gen_frame_mem (mode, addr); + + /* For SSE saves, we need to indicate the 128-bit alignment. */ + set_mem_align (mem, GET_MODE_ALIGNMENT (mode)); + + insn = emit_move_insn (mem, reg); + RTX_FRAME_RELATED_P (insn) = 1; + + base = addr; + if (GET_CODE (base) == PLUS) + base = XEXP (base, 0); + gcc_checking_assert (REG_P (base)); + + /* When saving registers into a re-aligned local stack frame, avoid + any tricky guessing by dwarf2out. */ + if (m->fs.realigned) + { + gcc_checking_assert (stack_realign_drap); + + if (regno == REGNO (crtl->drap_reg)) + { + /* A bit of a hack. We force the DRAP register to be saved in + the re-aligned stack frame, which provides us with a copy + of the CFA that will last past the prologue. Install it. */ + gcc_checking_assert (cfun->machine->fs.fp_valid); + addr = plus_constant (hard_frame_pointer_rtx, + cfun->machine->fs.fp_offset - cfa_offset); + mem = gen_rtx_MEM (mode, addr); + add_reg_note (insn, REG_CFA_DEF_CFA, mem); + } + else + { + /* The frame pointer is a stable reference within the + aligned frame. Use it. */ + gcc_checking_assert (cfun->machine->fs.fp_valid); + addr = plus_constant (hard_frame_pointer_rtx, + cfun->machine->fs.fp_offset - cfa_offset); + mem = gen_rtx_MEM (mode, addr); + add_reg_note (insn, REG_CFA_EXPRESSION, + gen_rtx_SET (VOIDmode, mem, reg)); + } + } + + /* The memory may not be relative to the current CFA register, + which means that we may need to generate a new pattern for + use by the unwind info. */ + else if (base != m->fs.cfa_reg) + { + addr = plus_constant (m->fs.cfa_reg, m->fs.cfa_offset - cfa_offset); + mem = gen_rtx_MEM (mode, addr); + add_reg_note (insn, REG_CFA_OFFSET, gen_rtx_SET (VOIDmode, mem, reg)); + } +} + +/* Emit code to save registers using MOV insns. + First register is stored at CFA - CFA_OFFSET. 
*/ +static void +ix86_emit_save_regs_using_mov (HOST_WIDE_INT cfa_offset) +{ + unsigned int regno; + + for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) + if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true)) + { + ix86_emit_save_reg_using_mov (Pmode, regno, cfa_offset); + cfa_offset -= UNITS_PER_WORD; + } +} + +/* Emit code to save SSE registers using MOV insns. + First register is stored at CFA - CFA_OFFSET. */ +static void +ix86_emit_save_sse_regs_using_mov (HOST_WIDE_INT cfa_offset) +{ + unsigned int regno; + + for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) + if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true)) + { + ix86_emit_save_reg_using_mov (V4SFmode, regno, cfa_offset); + cfa_offset -= 16; + } +} + +static GTY(()) rtx queued_cfa_restores; + +/* Add a REG_CFA_RESTORE REG note to INSN or queue them until next stack + manipulation insn. The value is on the stack at CFA - CFA_OFFSET. + Don't add the note if the previously saved value will be left untouched + within stack red-zone till return, as unwinders can find the same value + in the register and on the stack. */ + +static void +ix86_add_cfa_restore_note (rtx insn, rtx reg, HOST_WIDE_INT cfa_offset) +{ + if (cfa_offset <= cfun->machine->fs.red_zone_offset) + return; + + if (insn) + { + add_reg_note (insn, REG_CFA_RESTORE, reg); + RTX_FRAME_RELATED_P (insn) = 1; + } + else + queued_cfa_restores + = alloc_reg_note (REG_CFA_RESTORE, reg, queued_cfa_restores); +} + +/* Add queued REG_CFA_RESTORE notes if any to INSN. */ + +static void +ix86_add_queued_cfa_restore_notes (rtx insn) +{ + rtx last; + if (!queued_cfa_restores) + return; + for (last = queued_cfa_restores; XEXP (last, 1); last = XEXP (last, 1)) + ; + XEXP (last, 1) = REG_NOTES (insn); + REG_NOTES (insn) = queued_cfa_restores; + queued_cfa_restores = NULL_RTX; + RTX_FRAME_RELATED_P (insn) = 1; +} + +/* Expand prologue or epilogue stack adjustment. + The pattern exist to put a dependency on all ebp-based memory accesses. + STYLE should be negative if instructions should be marked as frame related, + zero if %r11 register is live and cannot be freely used and positive + otherwise. */ + +static void +pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset, + int style, bool set_cfa) +{ + struct machine_function *m = cfun->machine; + rtx insn; + bool add_frame_related_expr = false; + + if (! TARGET_64BIT) + insn = gen_pro_epilogue_adjust_stack_si_add (dest, src, offset); + else if (x86_64_immediate_operand (offset, DImode)) + insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, offset); + else + { + rtx tmp; + /* r11 is used by indirect sibcall return as well, set before the + epilogue and used after the epilogue. 
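Illustrative aside: the scratch-register branch that follows is only taken when the adjustment cannot be expressed as a sign-extended 32-bit immediate. A standalone sketch of that fit test (not the real x86_64_immediate_operand predicate, which also accepts symbolic operands):

#include <stdio.h>

/* True if V can be used directly as a sign-extended 32-bit immediate.  */
static int
fits_signed_imm32 (long long v)
{
  return v >= -2147483648LL && v <= 2147483647LL;
}

int
main (void)
{
  printf ("%d\n", fits_signed_imm32 (-4096));       /* 1: plain sub suffices */
  printf ("%d\n", fits_signed_imm32 (1LL << 32));   /* 0: needs a scratch reg */
  return 0;
}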
*/ + if (style) + tmp = gen_rtx_REG (DImode, R11_REG); + else + { + gcc_assert (src != hard_frame_pointer_rtx + && dest != hard_frame_pointer_rtx); + tmp = hard_frame_pointer_rtx; + } + insn = emit_insn (gen_rtx_SET (DImode, tmp, offset)); + if (style < 0) + add_frame_related_expr = true; + + insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, tmp); + } + + insn = emit_insn (insn); + if (style >= 0) + ix86_add_queued_cfa_restore_notes (insn); + + if (set_cfa) + { + rtx r; + + gcc_assert (m->fs.cfa_reg == src); + m->fs.cfa_offset += INTVAL (offset); + m->fs.cfa_reg = dest; + + r = gen_rtx_PLUS (Pmode, src, offset); + r = gen_rtx_SET (VOIDmode, dest, r); + add_reg_note (insn, REG_CFA_ADJUST_CFA, r); + RTX_FRAME_RELATED_P (insn) = 1; + } + else if (style < 0) + { + RTX_FRAME_RELATED_P (insn) = 1; + if (add_frame_related_expr) + { + rtx r = gen_rtx_PLUS (Pmode, src, offset); + r = gen_rtx_SET (VOIDmode, dest, r); + add_reg_note (insn, REG_FRAME_RELATED_EXPR, r); + } + } + + if (dest == stack_pointer_rtx) + { + HOST_WIDE_INT ooffset = m->fs.sp_offset; + bool valid = m->fs.sp_valid; + + if (src == hard_frame_pointer_rtx) + { + valid = m->fs.fp_valid; + ooffset = m->fs.fp_offset; + } + else if (src == crtl->drap_reg) + { + valid = m->fs.drap_valid; + ooffset = 0; + } + else + { + /* Else there are two possibilities: SP itself, which we set + up as the default above. Or EH_RETURN_STACKADJ_RTX, which is + taken care of this by hand along the eh_return path. */ + gcc_checking_assert (src == stack_pointer_rtx + || offset == const0_rtx); + } + + m->fs.sp_offset = ooffset - INTVAL (offset); + m->fs.sp_valid = valid; + } +} + +/* Find an available register to be used as dynamic realign argument + pointer regsiter. Such a register will be written in prologue and + used in begin of body, so it must not be + 1. parameter passing register. + 2. GOT pointer. + We reuse static-chain register if it is available. Otherwise, we + use DI for i386 and R13 for x86-64. We chose R13 since it has + shorter encoding. + + Return: the regno of chosen register. */ + +static unsigned int +find_drap_reg (void) +{ + tree decl = cfun->decl; + + if (TARGET_64BIT) + { + /* Use R13 for nested function or function need static chain. + Since function with tail call may use any caller-saved + registers in epilogue, DRAP must not use caller-saved + register in such case. */ + if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit) + return R13_REG; + + return R10_REG; + } + else + { + /* Use DI for nested function or function need static chain. + Since function with tail call may use any caller-saved + registers in epilogue, DRAP must not use caller-saved + register in such case. */ + if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit) + return DI_REG; + + /* Reuse static chain register if it isn't used for parameter + passing. */ + if (ix86_function_regparm (TREE_TYPE (decl), decl) <= 2 + && !lookup_attribute ("fastcall", + TYPE_ATTRIBUTES (TREE_TYPE (decl))) + && !lookup_attribute ("thiscall", + TYPE_ATTRIBUTES (TREE_TYPE (decl)))) + return CX_REG; + else + return DI_REG; + } +} + +/* Return minimum incoming stack alignment. */ + +static unsigned int +ix86_minimum_incoming_stack_boundary (bool sibcall) +{ + unsigned int incoming_stack_boundary; + + /* Prefer the one specified at command line. 
*/ + if (ix86_user_incoming_stack_boundary) + incoming_stack_boundary = ix86_user_incoming_stack_boundary; + /* In 32bit, use MIN_STACK_BOUNDARY for incoming stack boundary + if -mstackrealign is used, it isn't used for sibcall check and + estimated stack alignment is 128bit. */ + else if (!sibcall + && !TARGET_64BIT + && ix86_force_align_arg_pointer + && crtl->stack_alignment_estimated == 128) + incoming_stack_boundary = MIN_STACK_BOUNDARY; + else + incoming_stack_boundary = ix86_default_incoming_stack_boundary; + + /* Incoming stack alignment can be changed on individual functions + via force_align_arg_pointer attribute. We use the smallest + incoming stack boundary. */ + if (incoming_stack_boundary > MIN_STACK_BOUNDARY + && lookup_attribute (ix86_force_align_arg_pointer_string, + TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl)))) + incoming_stack_boundary = MIN_STACK_BOUNDARY; + + /* The incoming stack frame has to be aligned at least at + parm_stack_boundary. */ + if (incoming_stack_boundary < crtl->parm_stack_boundary) + incoming_stack_boundary = crtl->parm_stack_boundary; + + /* Stack at entrance of main is aligned by runtime. We use the + smallest incoming stack boundary. */ + if (incoming_stack_boundary > MAIN_STACK_BOUNDARY + && DECL_NAME (current_function_decl) + && MAIN_NAME_P (DECL_NAME (current_function_decl)) + && DECL_FILE_SCOPE_P (current_function_decl)) + incoming_stack_boundary = MAIN_STACK_BOUNDARY; + + return incoming_stack_boundary; +} + +/* Update incoming stack boundary and estimated stack alignment. */ + +static void +ix86_update_stack_boundary (void) +{ + ix86_incoming_stack_boundary + = ix86_minimum_incoming_stack_boundary (false); + + /* x86_64 vararg needs 16byte stack alignment for register save + area. */ + if (TARGET_64BIT + && cfun->stdarg + && crtl->stack_alignment_estimated < 128) + crtl->stack_alignment_estimated = 128; +} + +/* Handle the TARGET_GET_DRAP_RTX hook. Return NULL if no DRAP is + needed or an rtx for DRAP otherwise. */ + +static rtx +ix86_get_drap_rtx (void) +{ + if (ix86_force_drap || !ACCUMULATE_OUTGOING_ARGS) + crtl->need_drap = true; + + if (stack_realign_drap) + { + /* Assign DRAP to vDRAP and returns vDRAP */ + unsigned int regno = find_drap_reg (); + rtx drap_vreg; + rtx arg_ptr; + rtx seq, insn; + + arg_ptr = gen_rtx_REG (Pmode, regno); + crtl->drap_reg = arg_ptr; + + start_sequence (); + drap_vreg = copy_to_reg (arg_ptr); + seq = get_insns (); + end_sequence (); + + insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ())); + if (!optimize) + { + add_reg_note (insn, REG_CFA_SET_VDRAP, drap_vreg); + RTX_FRAME_RELATED_P (insn) = 1; + } + return drap_vreg; + } + else + return NULL; +} + +/* Handle the TARGET_INTERNAL_ARG_POINTER hook. */ + +static rtx +ix86_internal_arg_pointer (void) +{ + return virtual_incoming_args_rtx; +} + +struct scratch_reg { + rtx reg; + bool saved; +}; + +/* Return a short-lived scratch register for use on function entry. + In 32-bit mode, it is valid only after the registers are saved + in the prologue. This register must be released by means of + release_scratch_register_on_entry once it is dead. */ + +static void +get_scratch_register_on_entry (struct scratch_reg *sr) +{ + int regno; + + sr->saved = false; + + if (TARGET_64BIT) + { + /* We always use R11 in 64-bit mode. 
*/ + regno = R11_REG; + } + else + { + tree decl = current_function_decl, fntype = TREE_TYPE (decl); + bool fastcall_p + = lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE; + bool static_chain_p = DECL_STATIC_CHAIN (decl); + int regparm = ix86_function_regparm (fntype, decl); + int drap_regno + = crtl->drap_reg ? REGNO (crtl->drap_reg) : INVALID_REGNUM; + + /* 'fastcall' sets regparm to 2, uses ecx/edx for arguments and eax + for the static chain register. */ + if ((regparm < 1 || (fastcall_p && !static_chain_p)) + && drap_regno != AX_REG) + regno = AX_REG; + else if (regparm < 2 && drap_regno != DX_REG) + regno = DX_REG; + /* ecx is the static chain register. */ + else if (regparm < 3 && !fastcall_p && !static_chain_p + && drap_regno != CX_REG) + regno = CX_REG; + else if (ix86_save_reg (BX_REG, true)) + regno = BX_REG; + /* esi is the static chain register. */ + else if (!(regparm == 3 && static_chain_p) + && ix86_save_reg (SI_REG, true)) + regno = SI_REG; + else if (ix86_save_reg (DI_REG, true)) + regno = DI_REG; + else + { + regno = (drap_regno == AX_REG ? DX_REG : AX_REG); + sr->saved = true; + } + } + + sr->reg = gen_rtx_REG (Pmode, regno); + if (sr->saved) + { + rtx insn = emit_insn (gen_push (sr->reg)); + RTX_FRAME_RELATED_P (insn) = 1; + } +} + +/* Release a scratch register obtained from the preceding function. */ + +static void +release_scratch_register_on_entry (struct scratch_reg *sr) +{ + if (sr->saved) + { + rtx x, insn = emit_insn (gen_pop (sr->reg)); + + /* The RTX_FRAME_RELATED_P mechanism doesn't know about pop. */ + RTX_FRAME_RELATED_P (insn) = 1; + x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (UNITS_PER_WORD)); + x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x); + add_reg_note (insn, REG_FRAME_RELATED_EXPR, x); + } +} + +#define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP) + +/* Emit code to adjust the stack pointer by SIZE bytes while probing it. */ + +static void +ix86_adjust_stack_and_probe (const HOST_WIDE_INT size) +{ + /* We skip the probe for the first interval + a small dope of 4 words and + probe that many bytes past the specified size to maintain a protection + area at the botton of the stack. */ + const int dope = 4 * UNITS_PER_WORD; + rtx size_rtx = GEN_INT (size), last; + + /* See if we have a constant small number of probes to generate. If so, + that's the easy case. The run-time loop is made up of 11 insns in the + generic case while the compile-time loop is made up of 3+2*(n-1) insns + for n # of intervals. */ + if (size <= 5 * PROBE_INTERVAL) + { + HOST_WIDE_INT i, adjust; + bool first_probe = true; + + /* Adjust SP and probe at PROBE_INTERVAL + N * PROBE_INTERVAL for + values of N from 1 until it exceeds SIZE. If only one probe is + needed, this will not generate any code. Then adjust and probe + to PROBE_INTERVAL + SIZE. */ + for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL) + { + if (first_probe) + { + adjust = 2 * PROBE_INTERVAL + dope; + first_probe = false; + } + else + adjust = PROBE_INTERVAL; + + emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx, + plus_constant (stack_pointer_rtx, -adjust))); + emit_stack_probe (stack_pointer_rtx); + } + + if (first_probe) + adjust = size + PROBE_INTERVAL + dope; + else + adjust = size + PROBE_INTERVAL - i; + + emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx, + plus_constant (stack_pointer_rtx, -adjust))); + emit_stack_probe (stack_pointer_rtx); + + /* Adjust back to account for the additional first interval. 
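Illustrative aside: the constant-size case above is easier to follow as a printed schedule. A standalone C model with a made-up size, a stand-in interval and the 64-bit word size; it is not the insn-emitting code itself.

#include <stdio.h>

#define INTERVAL 4096                 /* stand-in for PROBE_INTERVAL      */
#define DOPE     (4 * 8)              /* four words of slack, 64-bit case */

int
main (void)
{
  long size = 10000;                  /* example allocation size */
  long i, adjust;
  int first_probe = 1;

  for (i = INTERVAL; i < size; i += INTERVAL)
    {
      adjust = first_probe ? 2 * INTERVAL + DOPE : INTERVAL;
      first_probe = 0;
      printf ("sp -= %ld; probe (sp);\n", adjust);
    }

  adjust = first_probe ? size + INTERVAL + DOPE : size + INTERVAL - i;
  printf ("sp -= %ld; probe (sp);\n", adjust);

  /* Undo the extra first interval and the dope, which were allocated to
     keep a protection area at the bottom of the stack.  */
  printf ("sp += %d;\n", INTERVAL + DOPE);
  return 0;
}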
*/ + last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx, + plus_constant (stack_pointer_rtx, + PROBE_INTERVAL + dope))); + } + + /* Otherwise, do the same as above, but in a loop. Note that we must be + extra careful with variables wrapping around because we might be at + the very top (or the very bottom) of the address space and we have + to be able to handle this case properly; in particular, we use an + equality test for the loop condition. */ + else + { + HOST_WIDE_INT rounded_size; + struct scratch_reg sr; + + get_scratch_register_on_entry (&sr); + + + /* Step 1: round SIZE to the previous multiple of the interval. */ + + rounded_size = size & -PROBE_INTERVAL; + + + /* Step 2: compute initial and final value of the loop counter. */ + + /* SP = SP_0 + PROBE_INTERVAL. */ + emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx, + plus_constant (stack_pointer_rtx, + - (PROBE_INTERVAL + dope)))); + + /* LAST_ADDR = SP_0 + PROBE_INTERVAL + ROUNDED_SIZE. */ + emit_move_insn (sr.reg, GEN_INT (-rounded_size)); + emit_insn (gen_rtx_SET (VOIDmode, sr.reg, + gen_rtx_PLUS (Pmode, sr.reg, + stack_pointer_rtx))); + + + /* Step 3: the loop + + while (SP != LAST_ADDR) + { + SP = SP + PROBE_INTERVAL + probe at SP + } + + adjusts SP and probes to PROBE_INTERVAL + N * PROBE_INTERVAL for + values of N from 1 until it is equal to ROUNDED_SIZE. */ + + emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg, size_rtx)); + + + /* Step 4: adjust SP and probe at PROBE_INTERVAL + SIZE if we cannot + assert at compile-time that SIZE is equal to ROUNDED_SIZE. */ + + if (size != rounded_size) + { + emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx, + plus_constant (stack_pointer_rtx, + rounded_size - size))); + emit_stack_probe (stack_pointer_rtx); + } + + /* Adjust back to account for the additional first interval. */ + last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx, + plus_constant (stack_pointer_rtx, + PROBE_INTERVAL + dope))); + + release_scratch_register_on_entry (&sr); + } + + gcc_assert (cfun->machine->fs.cfa_reg != stack_pointer_rtx); + + /* Even if the stack pointer isn't the CFA register, we need to correctly + describe the adjustments made to it, in particular differentiate the + frame-related ones from the frame-unrelated ones. */ + if (size > 0) + { + rtx expr = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (2)); + XVECEXP (expr, 0, 0) + = gen_rtx_SET (VOIDmode, stack_pointer_rtx, + plus_constant (stack_pointer_rtx, -size)); + XVECEXP (expr, 0, 1) + = gen_rtx_SET (VOIDmode, stack_pointer_rtx, + plus_constant (stack_pointer_rtx, + PROBE_INTERVAL + dope + size)); + add_reg_note (last, REG_FRAME_RELATED_EXPR, expr); + RTX_FRAME_RELATED_P (last) = 1; + + cfun->machine->fs.sp_offset += size; + } + + /* Make sure nothing is scheduled before we are done. */ + emit_insn (gen_blockage ()); +} + +/* Adjust the stack pointer up to REG while probing it. */ + +const char * +output_adjust_stack_and_probe (rtx reg) +{ + static int labelno = 0; + char loop_lab[32], end_lab[32]; + rtx xops[2]; + + ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno); + ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++); + + ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab); + + /* Jump to END_LAB if SP == LAST_ADDR. */ + xops[0] = stack_pointer_rtx; + xops[1] = reg; + output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops); + fputs ("\tje\t", asm_out_file); + assemble_name_raw (asm_out_file, end_lab); + fputc ('\n', asm_out_file); + + /* SP = SP + PROBE_INTERVAL. 
*/ + xops[1] = GEN_INT (PROBE_INTERVAL); + output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops); + + /* Probe at SP. */ + xops[1] = const0_rtx; + output_asm_insn ("or%z0\t{%1, (%0)|DWORD PTR [%0], %1}", xops); + + fprintf (asm_out_file, "\tjmp\t"); + assemble_name_raw (asm_out_file, loop_lab); + fputc ('\n', asm_out_file); + + ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab); + + return ""; +} + +/* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE, + inclusive. These are offsets from the current stack pointer. */ + +static void +ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size) +{ + /* See if we have a constant small number of probes to generate. If so, + that's the easy case. The run-time loop is made up of 7 insns in the + generic case while the compile-time loop is made up of n insns for n # + of intervals. */ + if (size <= 7 * PROBE_INTERVAL) + { + HOST_WIDE_INT i; + + /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 1 until + it exceeds SIZE. If only one probe is needed, this will not + generate any code. Then probe at FIRST + SIZE. */ + for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL) + emit_stack_probe (plus_constant (stack_pointer_rtx, -(first + i))); + + emit_stack_probe (plus_constant (stack_pointer_rtx, -(first + size))); + } + + /* Otherwise, do the same as above, but in a loop. Note that we must be + extra careful with variables wrapping around because we might be at + the very top (or the very bottom) of the address space and we have + to be able to handle this case properly; in particular, we use an + equality test for the loop condition. */ + else + { + HOST_WIDE_INT rounded_size, last; + struct scratch_reg sr; + + get_scratch_register_on_entry (&sr); + + + /* Step 1: round SIZE to the previous multiple of the interval. */ + + rounded_size = size & -PROBE_INTERVAL; + + + /* Step 2: compute initial and final value of the loop counter. */ + + /* TEST_OFFSET = FIRST. */ + emit_move_insn (sr.reg, GEN_INT (-first)); + + /* LAST_OFFSET = FIRST + ROUNDED_SIZE. */ + last = first + rounded_size; + + + /* Step 3: the loop + + while (TEST_ADDR != LAST_ADDR) + { + TEST_ADDR = TEST_ADDR + PROBE_INTERVAL + probe at TEST_ADDR + } + + probes at FIRST + N * PROBE_INTERVAL for values of N from 1 + until it is equal to ROUNDED_SIZE. */ + + emit_insn (ix86_gen_probe_stack_range (sr.reg, sr.reg, GEN_INT (-last))); + + + /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time + that SIZE is equal to ROUNDED_SIZE. */ + + if (size != rounded_size) + emit_stack_probe (plus_constant (gen_rtx_PLUS (Pmode, + stack_pointer_rtx, + sr.reg), + rounded_size - size)); + + release_scratch_register_on_entry (&sr); + } + + /* Make sure nothing is scheduled before we are done. */ + emit_insn (gen_blockage ()); +} + +/* Probe a range of stack addresses from REG to END, inclusive. These are + offsets from the current stack pointer. */ + +const char * +output_probe_stack_range (rtx reg, rtx end) +{ + static int labelno = 0; + char loop_lab[32], end_lab[32]; + rtx xops[3]; + + ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno); + ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++); + + ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab); + + /* Jump to END_LAB if TEST_ADDR == LAST_ADDR. */ + xops[0] = reg; + xops[1] = end; + output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops); + fputs ("\tje\t", asm_out_file); + assemble_name_raw (asm_out_file, end_lab); + fputc ('\n', asm_out_file); + + /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. 
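Illustrative aside: the compile-time and run-time variants walk the same schedule. A minimal standalone model of the probing scheme (round the size down to a multiple of the interval, touch one word per interval, then probe the tail), with made-up numbers:

#include <stdio.h>

#define INTERVAL 4096      /* stand-in for PROBE_INTERVAL */

static void
probe_range (long first, long size)
{
  long rounded = size & -INTERVAL;   /* largest multiple of the interval */
  long offset;

  for (offset = INTERVAL; offset <= rounded; offset += INTERVAL)
    printf ("probe sp - %ld\n", first + offset);

  if (size != rounded)
    printf ("probe sp - %ld\n", first + size);   /* tail probe */
}

int
main (void)
{
  probe_range (0, 10000);   /* probes at 4096, 8192 and 10000 */
  return 0;
}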
*/ + xops[1] = GEN_INT (PROBE_INTERVAL); + output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops); + + /* Probe at TEST_ADDR. */ + xops[0] = stack_pointer_rtx; + xops[1] = reg; + xops[2] = const0_rtx; + output_asm_insn ("or%z0\t{%2, (%0,%1)|DWORD PTR [%0+%1], %2}", xops); + + fprintf (asm_out_file, "\tjmp\t"); + assemble_name_raw (asm_out_file, loop_lab); + fputc ('\n', asm_out_file); + + ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab); + + return ""; +} + +/* Finalize stack_realign_needed flag, which will guide prologue/epilogue + to be generated in correct form. */ +static void +ix86_finalize_stack_realign_flags (void) +{ + /* Check if stack realign is really needed after reload, and + stores result in cfun */ + unsigned int incoming_stack_boundary + = (crtl->parm_stack_boundary > ix86_incoming_stack_boundary + ? crtl->parm_stack_boundary : ix86_incoming_stack_boundary); + unsigned int stack_realign = (incoming_stack_boundary + < (current_function_is_leaf + ? crtl->max_used_stack_slot_alignment + : crtl->stack_alignment_needed)); + + if (crtl->stack_realign_finalized) + { + /* After stack_realign_needed is finalized, we can't no longer + change it. */ + gcc_assert (crtl->stack_realign_needed == stack_realign); + } + else + { + crtl->stack_realign_needed = stack_realign; + crtl->stack_realign_finalized = true; + } +} + +/* Expand the prologue into a bunch of separate insns. */ + +void +ix86_expand_prologue (void) +{ + struct machine_function *m = cfun->machine; + rtx insn, t; + bool pic_reg_used; + struct ix86_frame frame; + HOST_WIDE_INT allocate; + bool int_registers_saved; + + ix86_finalize_stack_realign_flags (); + + /* DRAP should not coexist with stack_realign_fp */ + gcc_assert (!(crtl->drap_reg && stack_realign_fp)); + + memset (&m->fs, 0, sizeof (m->fs)); + + /* Initialize CFA state for before the prologue. */ + m->fs.cfa_reg = stack_pointer_rtx; + m->fs.cfa_offset = INCOMING_FRAME_SP_OFFSET; + + /* Track SP offset to the CFA. We continue tracking this after we've + swapped the CFA register away from SP. In the case of re-alignment + this is fudged; we're interested to offsets within the local frame. */ + m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET; + m->fs.sp_valid = true; + + ix86_compute_frame_layout (&frame); + + if (!TARGET_64BIT && ix86_function_ms_hook_prologue (current_function_decl)) + { + /* We should have already generated an error for any use of + ms_hook on a nested function. */ + gcc_checking_assert (!ix86_static_chain_on_stack); + + /* Check if profiling is active and we shall use profiling before + prologue variant. If so sorry. */ + if (crtl->profile && flag_fentry != 0) + sorry ("ms_hook_prologue attribute isn%'t compatible " + "with -mfentry for 32-bit"); + + /* In ix86_asm_output_function_label we emitted: + 8b ff movl.s %edi,%edi + 55 push %ebp + 8b ec movl.s %esp,%ebp + + This matches the hookable function prologue in Win32 API + functions in Microsoft Windows XP Service Pack 2 and newer. + Wine uses this to enable Windows apps to hook the Win32 API + functions provided by Wine. + + What that means is that we've already set up the frame pointer. */ + + if (frame_pointer_needed + && !(crtl->drap_reg && crtl->stack_realign_needed)) + { + rtx push, mov; + + /* We've decided to use the frame pointer already set up. + Describe this to the unwinder by pretending that both + push and mov insns happen right here. 
+ + Putting the unwind info here at the end of the ms_hook + is done so that we can make absolutely certain we get + the required byte sequence at the start of the function, + rather than relying on an assembler that can produce + the exact encoding required. + + However it does mean (in the unpatched case) that we have + a 1 insn window where the asynchronous unwind info is + incorrect. However, if we placed the unwind info at + its correct location we would have incorrect unwind info + in the patched case. Which is probably all moot since + I don't expect Wine generates dwarf2 unwind info for the + system libraries that use this feature. */ + + insn = emit_insn (gen_blockage ()); + + push = gen_push (hard_frame_pointer_rtx); + mov = gen_rtx_SET (VOIDmode, hard_frame_pointer_rtx, + stack_pointer_rtx); + RTX_FRAME_RELATED_P (push) = 1; + RTX_FRAME_RELATED_P (mov) = 1; + + RTX_FRAME_RELATED_P (insn) = 1; + add_reg_note (insn, REG_FRAME_RELATED_EXPR, + gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, push, mov))); + + /* Note that gen_push incremented m->fs.cfa_offset, even + though we didn't emit the push insn here. */ + m->fs.cfa_reg = hard_frame_pointer_rtx; + m->fs.fp_offset = m->fs.cfa_offset; + m->fs.fp_valid = true; + } + else + { + /* The frame pointer is not needed so pop %ebp again. + This leaves us with a pristine state. */ + emit_insn (gen_pop (hard_frame_pointer_rtx)); + } + } + + /* The first insn of a function that accepts its static chain on the + stack is to push the register that would be filled in by a direct + call. This insn will be skipped by the trampoline. */ + else if (ix86_static_chain_on_stack) + { + insn = emit_insn (gen_push (ix86_static_chain (cfun->decl, false))); + emit_insn (gen_blockage ()); + + /* We don't want to interpret this push insn as a register save, + only as a stack adjustment. The real copy of the register as + a save will be done later, if needed. */ + t = plus_constant (stack_pointer_rtx, -UNITS_PER_WORD); + t = gen_rtx_SET (VOIDmode, stack_pointer_rtx, t); + add_reg_note (insn, REG_CFA_ADJUST_CFA, t); + RTX_FRAME_RELATED_P (insn) = 1; + } + + /* Emit prologue code to adjust stack alignment and setup DRAP, in case + of DRAP is needed and stack realignment is really needed after reload */ + if (stack_realign_drap) + { + int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT; + + /* Only need to push parameter pointer reg if it is caller saved. */ + if (!call_used_regs[REGNO (crtl->drap_reg)]) + { + /* Push arg pointer reg */ + insn = emit_insn (gen_push (crtl->drap_reg)); + RTX_FRAME_RELATED_P (insn) = 1; + } + + /* Grab the argument pointer. */ + t = plus_constant (stack_pointer_rtx, m->fs.sp_offset); + insn = emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t)); + RTX_FRAME_RELATED_P (insn) = 1; + m->fs.cfa_reg = crtl->drap_reg; + m->fs.cfa_offset = 0; + + /* Align the stack. */ + insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx, + stack_pointer_rtx, + GEN_INT (-align_bytes))); + RTX_FRAME_RELATED_P (insn) = 1; + + /* Replicate the return address on the stack so that return + address can be reached via (argp - 1) slot. This is needed + to implement macro RETURN_ADDR_RTX and intrinsic function + expand_builtin_return_addr etc. */ + t = plus_constant (crtl->drap_reg, -UNITS_PER_WORD); + t = gen_frame_mem (Pmode, t); + insn = emit_insn (gen_push (t)); + RTX_FRAME_RELATED_P (insn) = 1; + + /* For the purposes of frame and register save area addressing, + we've started over with a new frame. 
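Illustrative aside: the realignment performed here clears the low bits of the stack pointer, i.e. rounds it down to the requested power-of-two boundary. A standalone sketch of that operation:

#include <stdint.h>
#include <stdio.h>

/* Round SP down to a multiple of ALIGN_BYTES, a power of two; the mask
   ~(align_bytes - 1) is the same value as -align_bytes.  */
static uintptr_t
align_down (uintptr_t sp, uintptr_t align_bytes)
{
  return sp & ~(align_bytes - 1);
}

int
main (void)
{
  printf ("%#lx\n", (unsigned long) align_down (0xbffff7ac, 32)); /* 0xbffff7a0 */
  return 0;
}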
*/ + m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET; + m->fs.realigned = true; + } + + if (frame_pointer_needed && !m->fs.fp_valid) + { + /* Note: AT&T enter does NOT have reversed args. Enter is probably + slower on all targets. Also sdb doesn't like it. */ + insn = emit_insn (gen_push (hard_frame_pointer_rtx)); + RTX_FRAME_RELATED_P (insn) = 1; + + if (m->fs.sp_offset == frame.hard_frame_pointer_offset) + { + insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx); + RTX_FRAME_RELATED_P (insn) = 1; + + if (m->fs.cfa_reg == stack_pointer_rtx) + m->fs.cfa_reg = hard_frame_pointer_rtx; + m->fs.fp_offset = m->fs.sp_offset; + m->fs.fp_valid = true; + } + } + + int_registers_saved = (frame.nregs == 0); + + if (!int_registers_saved) + { + /* If saving registers via PUSH, do so now. */ + if (!frame.save_regs_using_mov) + { + ix86_emit_save_regs (); + int_registers_saved = true; + gcc_assert (m->fs.sp_offset == frame.reg_save_offset); + } + + /* When using red zone we may start register saving before allocating + the stack frame saving one cycle of the prologue. However, avoid + doing this if we have to probe the stack; at least on x86_64 the + stack probe can turn into a call that clobbers a red zone location. */ + else if (ix86_using_red_zone () + && (! TARGET_STACK_PROBE + || frame.stack_pointer_offset < CHECK_STACK_LIMIT)) + { + ix86_emit_save_regs_using_mov (frame.reg_save_offset); + int_registers_saved = true; + } + } + + if (stack_realign_fp) + { + int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT; + gcc_assert (align_bytes > MIN_STACK_BOUNDARY / BITS_PER_UNIT); + + /* The computation of the size of the re-aligned stack frame means + that we must allocate the size of the register save area before + performing the actual alignment. Otherwise we cannot guarantee + that there's enough storage above the realignment point. */ + if (m->fs.sp_offset != frame.sse_reg_save_offset) + pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, + GEN_INT (m->fs.sp_offset + - frame.sse_reg_save_offset), + -1, false); + + /* Align the stack. */ + insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx, + stack_pointer_rtx, + GEN_INT (-align_bytes))); + + /* For the purposes of register save area addressing, the stack + pointer is no longer valid. As for the value of sp_offset, + see ix86_compute_frame_layout, which we need to match in order + to pass verification of stack_pointer_offset at the end. */ + m->fs.sp_offset = (m->fs.sp_offset + align_bytes) & -align_bytes; + m->fs.sp_valid = false; + } + + allocate = frame.stack_pointer_offset - m->fs.sp_offset; + + if (flag_stack_usage) + { + /* We start to count from ARG_POINTER. */ + HOST_WIDE_INT stack_size = frame.stack_pointer_offset; + + /* If it was realigned, take into account the fake frame. */ + if (stack_realign_drap) + { + if (ix86_static_chain_on_stack) + stack_size += UNITS_PER_WORD; + + if (!call_used_regs[REGNO (crtl->drap_reg)]) + stack_size += UNITS_PER_WORD; + + /* This over-estimates by 1 minimal-stack-alignment-unit but + mitigates that by counting in the new return address slot. */ + current_function_dynamic_stack_size + += crtl->stack_alignment_needed / BITS_PER_UNIT; + } + + current_function_static_stack_size = stack_size; + } + + /* The stack has already been decremented by the instruction calling us + so probe if the size is non-negative to preserve the protection area. */ + if (allocate >= 0 && flag_stack_check == STATIC_BUILTIN_STACK_CHECK) + { + /* We expect the registers to be saved when probes are used. 
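Illustrative aside: the probe path below clamps very large static frames, presumably so the probe offsets stay within a signed 32-bit range. A standalone sketch of the clamp, with a stand-in value for the guard size (STACK_CHECK_PROTECT in the real code):

#include <stdio.h>

#define GUARD_SIZE_SKETCH (12 * 1024)   /* stand-in for STACK_CHECK_PROTECT */

/* Limit the probed size to just under 2GB minus the guard area.  */
static long long
clamp_probe_size (long long size)
{
  if (size >= 0x80000000LL)
    return 0x80000000LL - GUARD_SIZE_SKETCH - 1;
  return size;
}

int
main (void)
{
  printf ("%lld\n", clamp_probe_size (4096));        /* unchanged */
  printf ("%lld\n", clamp_probe_size (1LL << 33));   /* clamped   */
  return 0;
}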
*/ + gcc_assert (int_registers_saved); + + if (STACK_CHECK_MOVING_SP) + { + ix86_adjust_stack_and_probe (allocate); + allocate = 0; + } + else + { + HOST_WIDE_INT size = allocate; + + if (TARGET_64BIT && size >= (HOST_WIDE_INT) 0x80000000) + size = 0x80000000 - STACK_CHECK_PROTECT - 1; + + if (TARGET_STACK_PROBE) + ix86_emit_probe_stack_range (0, size + STACK_CHECK_PROTECT); + else + ix86_emit_probe_stack_range (STACK_CHECK_PROTECT, size); + } + } + + if (allocate == 0) + ; + else if (!ix86_target_stack_probe () + || frame.stack_pointer_offset < CHECK_STACK_LIMIT) + { + pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, + GEN_INT (-allocate), -1, + m->fs.cfa_reg == stack_pointer_rtx); + } + else + { + rtx eax = gen_rtx_REG (Pmode, AX_REG); + rtx r10 = NULL; + rtx (*adjust_stack_insn)(rtx, rtx, rtx); + + bool eax_live = false; + bool r10_live = false; + + if (TARGET_64BIT) + r10_live = (DECL_STATIC_CHAIN (current_function_decl) != 0); + if (!TARGET_64BIT_MS_ABI) + eax_live = ix86_eax_live_at_start_p (); + + if (eax_live) + { + emit_insn (gen_push (eax)); + allocate -= UNITS_PER_WORD; + } + if (r10_live) + { + r10 = gen_rtx_REG (Pmode, R10_REG); + emit_insn (gen_push (r10)); + allocate -= UNITS_PER_WORD; + } + + emit_move_insn (eax, GEN_INT (allocate)); + emit_insn (ix86_gen_allocate_stack_worker (eax, eax)); + + /* Use the fact that AX still contains ALLOCATE. */ + adjust_stack_insn = (TARGET_64BIT + ? gen_pro_epilogue_adjust_stack_di_sub + : gen_pro_epilogue_adjust_stack_si_sub); + + insn = emit_insn (adjust_stack_insn (stack_pointer_rtx, + stack_pointer_rtx, eax)); + + /* Note that SEH directives need to continue tracking the stack + pointer even after the frame pointer has been set up. */ + if (m->fs.cfa_reg == stack_pointer_rtx || TARGET_SEH) + { + if (m->fs.cfa_reg == stack_pointer_rtx) + m->fs.cfa_offset += allocate; + + RTX_FRAME_RELATED_P (insn) = 1; + add_reg_note (insn, REG_FRAME_RELATED_EXPR, + gen_rtx_SET (VOIDmode, stack_pointer_rtx, + plus_constant (stack_pointer_rtx, + -allocate))); + } + m->fs.sp_offset += allocate; + + if (r10_live && eax_live) + { + t = choose_baseaddr (m->fs.sp_offset - allocate); + emit_move_insn (r10, gen_frame_mem (Pmode, t)); + t = choose_baseaddr (m->fs.sp_offset - allocate - UNITS_PER_WORD); + emit_move_insn (eax, gen_frame_mem (Pmode, t)); + } + else if (eax_live || r10_live) + { + t = choose_baseaddr (m->fs.sp_offset - allocate); + emit_move_insn ((eax_live ? eax : r10), gen_frame_mem (Pmode, t)); + } + } + gcc_assert (m->fs.sp_offset == frame.stack_pointer_offset); + + /* If we havn't already set up the frame pointer, do so now. 
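+   For example, with frame.stack_pointer_offset == 136 and
+   frame.hard_frame_pointer_offset == 16 (illustrative values only), the
+   insn emitted below is in effect "lea 120(%rsp), %rbp": the frame
+   pointer is derived from the fully-allocated stack pointer instead of
+   being established with a push/mov pair at the top of the prologue.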
*/ + if (frame_pointer_needed && !m->fs.fp_valid) + { + insn = ix86_gen_add3 (hard_frame_pointer_rtx, stack_pointer_rtx, + GEN_INT (frame.stack_pointer_offset + - frame.hard_frame_pointer_offset)); + insn = emit_insn (insn); + RTX_FRAME_RELATED_P (insn) = 1; + add_reg_note (insn, REG_CFA_ADJUST_CFA, NULL); + + if (m->fs.cfa_reg == stack_pointer_rtx) + m->fs.cfa_reg = hard_frame_pointer_rtx; + m->fs.fp_offset = frame.hard_frame_pointer_offset; + m->fs.fp_valid = true; + } + + if (!int_registers_saved) + ix86_emit_save_regs_using_mov (frame.reg_save_offset); + if (frame.nsseregs) + ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset); + + pic_reg_used = false; + if (pic_offset_table_rtx + && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM) + || crtl->profile)) + { + unsigned int alt_pic_reg_used = ix86_select_alt_pic_regnum (); + + if (alt_pic_reg_used != INVALID_REGNUM) + SET_REGNO (pic_offset_table_rtx, alt_pic_reg_used); + + pic_reg_used = true; + } + + if (pic_reg_used) + { + if (TARGET_64BIT) + { + if (ix86_cmodel == CM_LARGE_PIC) + { + rtx tmp_reg = gen_rtx_REG (DImode, R11_REG); + rtx label = gen_label_rtx (); + emit_label (label); + LABEL_PRESERVE_P (label) = 1; + gcc_assert (REGNO (pic_offset_table_rtx) != REGNO (tmp_reg)); + insn = emit_insn (gen_set_rip_rex64 (pic_offset_table_rtx, label)); + insn = emit_insn (gen_set_got_offset_rex64 (tmp_reg, label)); + insn = emit_insn (gen_adddi3 (pic_offset_table_rtx, + pic_offset_table_rtx, tmp_reg)); + } + else + insn = emit_insn (gen_set_got_rex64 (pic_offset_table_rtx)); + } + else + insn = emit_insn (gen_set_got (pic_offset_table_rtx)); + } + + /* In the pic_reg_used case, make sure that the got load isn't deleted + when mcount needs it. Blockage to avoid call movement across mcount + call is emitted in generic code after the NOTE_INSN_PROLOGUE_END + note. */ + if (crtl->profile && !flag_fentry && pic_reg_used) + emit_insn (gen_prologue_use (pic_offset_table_rtx)); + + if (crtl->drap_reg && !crtl->stack_realign_needed) + { + /* vDRAP is setup but after reload it turns out stack realign + isn't necessary, here we will emit prologue to setup DRAP + without stack realign adjustment */ + t = choose_baseaddr (0); + emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t)); + } + + /* Prevent instructions from being scheduled into register save push + sequence when access to the redzone area is done through frame pointer. + The offset between the frame pointer and the stack pointer is calculated + relative to the value of the stack pointer at the end of the function + prologue, and moving instructions that access redzone area via frame + pointer inside push sequence violates this assumption. */ + if (frame_pointer_needed && frame.red_zone_size) + emit_insn (gen_memory_blockage ()); + + /* Emit cld instruction if stringops are used in the function. */ + if (TARGET_CLD && ix86_current_function_needs_cld) + emit_insn (gen_cld ()); + + /* SEH requires that the prologue end within 256 bytes of the start of + the function. Prevent instruction schedules that would extend that. */ + if (TARGET_SEH) + emit_insn (gen_blockage ()); +} + +/* Emit code to restore REG using a POP insn. 
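+   Besides restoring REG, each pop implicitly adds UNITS_PER_WORD to the
+   stack pointer (e.g. on ia32 "popl %ebx" both reloads %ebx and shrinks
+   the frame by 4 bytes), which is why the bookkeeping below decrements
+   sp_offset by one word and, when the popped register was itself acting
+   as the CFA, attaches the matching CFA note to the insn.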
*/ + +static void +ix86_emit_restore_reg_using_pop (rtx reg) +{ + struct machine_function *m = cfun->machine; + rtx insn = emit_insn (gen_pop (reg)); + + ix86_add_cfa_restore_note (insn, reg, m->fs.sp_offset); + m->fs.sp_offset -= UNITS_PER_WORD; + + if (m->fs.cfa_reg == crtl->drap_reg + && REGNO (reg) == REGNO (crtl->drap_reg)) + { + /* Previously we'd represented the CFA as an expression + like *(%ebp - 8). We've just popped that value from + the stack, which means we need to reset the CFA to + the drap register. This will remain until we restore + the stack pointer. */ + add_reg_note (insn, REG_CFA_DEF_CFA, reg); + RTX_FRAME_RELATED_P (insn) = 1; + + /* This means that the DRAP register is valid for addressing too. */ + m->fs.drap_valid = true; + return; + } + + if (m->fs.cfa_reg == stack_pointer_rtx) + { + rtx x = plus_constant (stack_pointer_rtx, UNITS_PER_WORD); + x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x); + add_reg_note (insn, REG_CFA_ADJUST_CFA, x); + RTX_FRAME_RELATED_P (insn) = 1; + + m->fs.cfa_offset -= UNITS_PER_WORD; + } + + /* When the frame pointer is the CFA, and we pop it, we are + swapping back to the stack pointer as the CFA. This happens + for stack frames that don't allocate other data, so we assume + the stack pointer is now pointing at the return address, i.e. + the function entry state, which makes the offset be 1 word. */ + if (reg == hard_frame_pointer_rtx) + { + m->fs.fp_valid = false; + if (m->fs.cfa_reg == hard_frame_pointer_rtx) + { + m->fs.cfa_reg = stack_pointer_rtx; + m->fs.cfa_offset -= UNITS_PER_WORD; + + add_reg_note (insn, REG_CFA_DEF_CFA, + gen_rtx_PLUS (Pmode, stack_pointer_rtx, + GEN_INT (m->fs.cfa_offset))); + RTX_FRAME_RELATED_P (insn) = 1; + } + } +} + +/* Emit code to restore saved registers using POP insns. */ + +static void +ix86_emit_restore_regs_using_pop (void) +{ + unsigned int regno; + + for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) + if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, false)) + ix86_emit_restore_reg_using_pop (gen_rtx_REG (Pmode, regno)); +} + +/* Emit code and notes for the LEAVE instruction. */ + +static void +ix86_emit_leave (void) +{ + struct machine_function *m = cfun->machine; + rtx insn = emit_insn (ix86_gen_leave ()); + + ix86_add_queued_cfa_restore_notes (insn); + + gcc_assert (m->fs.fp_valid); + m->fs.sp_valid = true; + m->fs.sp_offset = m->fs.fp_offset - UNITS_PER_WORD; + m->fs.fp_valid = false; + + if (m->fs.cfa_reg == hard_frame_pointer_rtx) + { + m->fs.cfa_reg = stack_pointer_rtx; + m->fs.cfa_offset = m->fs.sp_offset; + + add_reg_note (insn, REG_CFA_DEF_CFA, + plus_constant (stack_pointer_rtx, m->fs.sp_offset)); + RTX_FRAME_RELATED_P (insn) = 1; + ix86_add_cfa_restore_note (insn, hard_frame_pointer_rtx, + m->fs.fp_offset); + } +} + +/* Emit code to restore saved registers using MOV insns. + First register is restored from CFA - CFA_OFFSET. */ +static void +ix86_emit_restore_regs_using_mov (HOST_WIDE_INT cfa_offset, + int maybe_eh_return) +{ + struct machine_function *m = cfun->machine; + unsigned int regno; + + for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) + if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return)) + { + rtx reg = gen_rtx_REG (Pmode, regno); + rtx insn, mem; + + mem = choose_baseaddr (cfa_offset); + mem = gen_frame_mem (Pmode, mem); + insn = emit_move_insn (reg, mem); + + if (m->fs.cfa_reg == crtl->drap_reg && regno == REGNO (crtl->drap_reg)) + { + /* Previously we'd represented the CFA as an expression + like *(%ebp - 8). 
We've just popped that value from + the stack, which means we need to reset the CFA to + the drap register. This will remain until we restore + the stack pointer. */ + add_reg_note (insn, REG_CFA_DEF_CFA, reg); + RTX_FRAME_RELATED_P (insn) = 1; + + /* This means that the DRAP register is valid for addressing. */ + m->fs.drap_valid = true; + } + else + ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset); + + cfa_offset -= UNITS_PER_WORD; + } +} + +/* Emit code to restore saved registers using MOV insns. + First register is restored from CFA - CFA_OFFSET. */ +static void +ix86_emit_restore_sse_regs_using_mov (HOST_WIDE_INT cfa_offset, + int maybe_eh_return) +{ + unsigned int regno; + + for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) + if (SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return)) + { + rtx reg = gen_rtx_REG (V4SFmode, regno); + rtx mem; + + mem = choose_baseaddr (cfa_offset); + mem = gen_rtx_MEM (V4SFmode, mem); + set_mem_align (mem, 128); + emit_move_insn (reg, mem); + + ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset); + + cfa_offset -= 16; + } +} + +/* Restore function stack, frame, and registers. */ + +void +ix86_expand_epilogue (int style) +{ + struct machine_function *m = cfun->machine; + struct machine_frame_state frame_state_save = m->fs; + struct ix86_frame frame; + bool restore_regs_via_mov; + bool using_drap; + + ix86_finalize_stack_realign_flags (); + ix86_compute_frame_layout (&frame); + + m->fs.sp_valid = (!frame_pointer_needed + || (current_function_sp_is_unchanging + && !stack_realign_fp)); + gcc_assert (!m->fs.sp_valid + || m->fs.sp_offset == frame.stack_pointer_offset); + + /* The FP must be valid if the frame pointer is present. */ + gcc_assert (frame_pointer_needed == m->fs.fp_valid); + gcc_assert (!m->fs.fp_valid + || m->fs.fp_offset == frame.hard_frame_pointer_offset); + + /* We must have *some* valid pointer to the stack frame. */ + gcc_assert (m->fs.sp_valid || m->fs.fp_valid); + + /* The DRAP is never valid at this point. */ + gcc_assert (!m->fs.drap_valid); + + /* See the comment about red zone and frame + pointer usage in ix86_expand_prologue. */ + if (frame_pointer_needed && frame.red_zone_size) + emit_insn (gen_memory_blockage ()); + + using_drap = crtl->drap_reg && crtl->stack_realign_needed; + gcc_assert (!using_drap || m->fs.cfa_reg == crtl->drap_reg); + + /* Determine the CFA offset of the end of the red-zone. */ + m->fs.red_zone_offset = 0; + if (ix86_using_red_zone () && crtl->args.pops_args < 65536) + { + /* The red-zone begins below the return address. */ + m->fs.red_zone_offset = RED_ZONE_SIZE + UNITS_PER_WORD; + + /* When the register save area is in the aligned portion of + the stack, determine the maximum runtime displacement that + matches up with the aligned frame. */ + if (stack_realign_drap) + m->fs.red_zone_offset -= (crtl->stack_alignment_needed / BITS_PER_UNIT + + UNITS_PER_WORD); + } + + /* Special care must be taken for the normal return case of a function + using eh_return: the eax and edx registers are marked as saved, but + not restored along this path. Adjust the save location to match. */ + if (crtl->calls_eh_return && style != 2) + frame.reg_save_offset -= 2 * UNITS_PER_WORD; + + /* EH_RETURN requires the use of moves to function properly. */ + if (crtl->calls_eh_return) + restore_regs_via_mov = true; + /* SEH requires the use of pops to identify the epilogue. 
*/ + else if (TARGET_SEH) + restore_regs_via_mov = false; + /* If we're only restoring one register and sp is not valid then + using a move instruction to restore the register since it's + less work than reloading sp and popping the register. */ + else if (!m->fs.sp_valid && frame.nregs <= 1) + restore_regs_via_mov = true; + else if (TARGET_EPILOGUE_USING_MOVE + && cfun->machine->use_fast_prologue_epilogue + && (frame.nregs > 1 + || m->fs.sp_offset != frame.reg_save_offset)) + restore_regs_via_mov = true; + else if (frame_pointer_needed + && !frame.nregs + && m->fs.sp_offset != frame.reg_save_offset) + restore_regs_via_mov = true; + else if (frame_pointer_needed + && TARGET_USE_LEAVE + && cfun->machine->use_fast_prologue_epilogue + && frame.nregs == 1) + restore_regs_via_mov = true; + else + restore_regs_via_mov = false; + + if (restore_regs_via_mov || frame.nsseregs) + { + /* Ensure that the entire register save area is addressable via + the stack pointer, if we will restore via sp. */ + if (TARGET_64BIT + && m->fs.sp_offset > 0x7fffffff + && !(m->fs.fp_valid || m->fs.drap_valid) + && (frame.nsseregs + frame.nregs) != 0) + { + pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, + GEN_INT (m->fs.sp_offset + - frame.sse_reg_save_offset), + style, + m->fs.cfa_reg == stack_pointer_rtx); + } + } + + /* If there are any SSE registers to restore, then we have to do it + via moves, since there's obviously no pop for SSE regs. */ + if (frame.nsseregs) + ix86_emit_restore_sse_regs_using_mov (frame.sse_reg_save_offset, + style == 2); + + if (restore_regs_via_mov) + { + rtx t; + + if (frame.nregs) + ix86_emit_restore_regs_using_mov (frame.reg_save_offset, style == 2); + + /* eh_return epilogues need %ecx added to the stack pointer. */ + if (style == 2) + { + rtx insn, sa = EH_RETURN_STACKADJ_RTX; + + /* Stack align doesn't work with eh_return. */ + gcc_assert (!stack_realign_drap); + /* Neither does regparm nested functions. */ + gcc_assert (!ix86_static_chain_on_stack); + + if (frame_pointer_needed) + { + t = gen_rtx_PLUS (Pmode, hard_frame_pointer_rtx, sa); + t = plus_constant (t, m->fs.fp_offset - UNITS_PER_WORD); + emit_insn (gen_rtx_SET (VOIDmode, sa, t)); + + t = gen_frame_mem (Pmode, hard_frame_pointer_rtx); + insn = emit_move_insn (hard_frame_pointer_rtx, t); + + /* Note that we use SA as a temporary CFA, as the return + address is at the proper place relative to it. We + pretend this happens at the FP restore insn because + prior to this insn the FP would be stored at the wrong + offset relative to SA, and after this insn we have no + other reasonable register to use for the CFA. We don't + bother resetting the CFA to the SP for the duration of + the return insn. 
*/ + add_reg_note (insn, REG_CFA_DEF_CFA, + plus_constant (sa, UNITS_PER_WORD)); + ix86_add_queued_cfa_restore_notes (insn); + add_reg_note (insn, REG_CFA_RESTORE, hard_frame_pointer_rtx); + RTX_FRAME_RELATED_P (insn) = 1; + + m->fs.cfa_reg = sa; + m->fs.cfa_offset = UNITS_PER_WORD; + m->fs.fp_valid = false; + + pro_epilogue_adjust_stack (stack_pointer_rtx, sa, + const0_rtx, style, false); + } + else + { + t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa); + t = plus_constant (t, m->fs.sp_offset - UNITS_PER_WORD); + insn = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx, t)); + ix86_add_queued_cfa_restore_notes (insn); + + gcc_assert (m->fs.cfa_reg == stack_pointer_rtx); + if (m->fs.cfa_offset != UNITS_PER_WORD) + { + m->fs.cfa_offset = UNITS_PER_WORD; + add_reg_note (insn, REG_CFA_DEF_CFA, + plus_constant (stack_pointer_rtx, + UNITS_PER_WORD)); + RTX_FRAME_RELATED_P (insn) = 1; + } + } + m->fs.sp_offset = UNITS_PER_WORD; + m->fs.sp_valid = true; + } + } + else + { + /* SEH requires that the function end with (1) a stack adjustment + if necessary, (2) a sequence of pops, and (3) a return or + jump instruction. Prevent insns from the function body from + being scheduled into this sequence. */ + if (TARGET_SEH) + { + /* Prevent a catch region from being adjacent to the standard + epilogue sequence. Unfortuantely crtl->uses_eh_lsda nor + several other flags that would be interesting to test are + not yet set up. */ + if (flag_non_call_exceptions) + emit_insn (gen_nops (const1_rtx)); + else + emit_insn (gen_blockage ()); + } + + /* First step is to deallocate the stack frame so that we can + pop the registers. */ + if (!m->fs.sp_valid) + { + pro_epilogue_adjust_stack (stack_pointer_rtx, hard_frame_pointer_rtx, + GEN_INT (m->fs.fp_offset + - frame.reg_save_offset), + style, false); + } + else if (m->fs.sp_offset != frame.reg_save_offset) + { + pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, + GEN_INT (m->fs.sp_offset + - frame.reg_save_offset), + style, + m->fs.cfa_reg == stack_pointer_rtx); + } + + ix86_emit_restore_regs_using_pop (); + } + + /* If we used a stack pointer and haven't already got rid of it, + then do so now. */ + if (m->fs.fp_valid) + { + /* If the stack pointer is valid and pointing at the frame + pointer store address, then we only need a pop. */ + if (m->fs.sp_valid && m->fs.sp_offset == frame.hfp_save_offset) + ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx); + /* Leave results in shorter dependency chains on CPUs that are + able to grok it fast. 
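+	 ("leave" is architecturally equivalent to the two-insn sequence
+	 "mov %ebp, %esp; pop %ebp", or the %rbp/%rsp forms in 64-bit
+	 mode, so the choice made here is essentially a code-size versus
+	 dependency-chain trade-off.)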
*/ + else if (TARGET_USE_LEAVE + || optimize_function_for_size_p (cfun) + || !cfun->machine->use_fast_prologue_epilogue) + ix86_emit_leave (); + else + { + pro_epilogue_adjust_stack (stack_pointer_rtx, + hard_frame_pointer_rtx, + const0_rtx, style, !using_drap); + ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx); + } + } + + if (using_drap) + { + int param_ptr_offset = UNITS_PER_WORD; + rtx insn; + + gcc_assert (stack_realign_drap); + + if (ix86_static_chain_on_stack) + param_ptr_offset += UNITS_PER_WORD; + if (!call_used_regs[REGNO (crtl->drap_reg)]) + param_ptr_offset += UNITS_PER_WORD; + + insn = emit_insn (gen_rtx_SET + (VOIDmode, stack_pointer_rtx, + gen_rtx_PLUS (Pmode, + crtl->drap_reg, + GEN_INT (-param_ptr_offset)))); + m->fs.cfa_reg = stack_pointer_rtx; + m->fs.cfa_offset = param_ptr_offset; + m->fs.sp_offset = param_ptr_offset; + m->fs.realigned = false; + + add_reg_note (insn, REG_CFA_DEF_CFA, + gen_rtx_PLUS (Pmode, stack_pointer_rtx, + GEN_INT (param_ptr_offset))); + RTX_FRAME_RELATED_P (insn) = 1; + + if (!call_used_regs[REGNO (crtl->drap_reg)]) + ix86_emit_restore_reg_using_pop (crtl->drap_reg); + } + + /* At this point the stack pointer must be valid, and we must have + restored all of the registers. We may not have deallocated the + entire stack frame. We've delayed this until now because it may + be possible to merge the local stack deallocation with the + deallocation forced by ix86_static_chain_on_stack. */ + gcc_assert (m->fs.sp_valid); + gcc_assert (!m->fs.fp_valid); + gcc_assert (!m->fs.realigned); + if (m->fs.sp_offset != UNITS_PER_WORD) + { + pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, + GEN_INT (m->fs.sp_offset - UNITS_PER_WORD), + style, true); + } + + /* Sibcall epilogues don't want a return instruction. */ + if (style == 0) + { + m->fs = frame_state_save; + return; + } + + /* Emit vzeroupper if needed. */ + if (TARGET_VZEROUPPER + && !TREE_THIS_VOLATILE (cfun->decl) + && !cfun->machine->caller_return_avx256_p) + emit_insn (gen_avx_vzeroupper (GEN_INT (call_no_avx256))); + + if (crtl->args.pops_args && crtl->args.size) + { + rtx popc = GEN_INT (crtl->args.pops_args); + + /* i386 can only pop 64K bytes. If asked to pop more, pop return + address, do explicit add, and jump indirectly to the caller. */ + + if (crtl->args.pops_args >= 65536) + { + rtx ecx = gen_rtx_REG (SImode, CX_REG); + rtx insn; + + /* There is no "pascal" calling convention in any 64bit ABI. */ + gcc_assert (!TARGET_64BIT); + + insn = emit_insn (gen_pop (ecx)); + m->fs.cfa_offset -= UNITS_PER_WORD; + m->fs.sp_offset -= UNITS_PER_WORD; + + add_reg_note (insn, REG_CFA_ADJUST_CFA, + copy_rtx (XVECEXP (PATTERN (insn), 0, 1))); + add_reg_note (insn, REG_CFA_REGISTER, + gen_rtx_SET (VOIDmode, ecx, pc_rtx)); + RTX_FRAME_RELATED_P (insn) = 1; + + pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, + popc, -1, true); + emit_jump_insn (gen_return_indirect_internal (ecx)); + } + else + emit_jump_insn (gen_return_pop_internal (popc)); + } + else + emit_jump_insn (gen_return_internal ()); + + /* Restore the state back to the state from the prologue, + so that it's correct for the next epilogue. */ + m->fs = frame_state_save; +} + +/* Reset from the function's potential modifications. 
*/ + +static void +ix86_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED, + HOST_WIDE_INT size ATTRIBUTE_UNUSED) +{ + if (pic_offset_table_rtx) + SET_REGNO (pic_offset_table_rtx, REAL_PIC_OFFSET_TABLE_REGNUM); +#if TARGET_MACHO + /* Mach-O doesn't support labels at the end of objects, so if + it looks like we might want one, insert a NOP. */ + { + rtx insn = get_last_insn (); + while (insn + && NOTE_P (insn) + && NOTE_KIND (insn) != NOTE_INSN_DELETED_LABEL) + insn = PREV_INSN (insn); + if (insn + && (LABEL_P (insn) + || (NOTE_P (insn) + && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL))) + fputs ("\tnop\n", file); + } +#endif + +} + +/* Return a scratch register to use in the split stack prologue. The + split stack prologue is used for -fsplit-stack. It is the first + instructions in the function, even before the regular prologue. + The scratch register can be any caller-saved register which is not + used for parameters or for the static chain. */ + +static unsigned int +split_stack_prologue_scratch_regno (void) +{ + if (TARGET_64BIT) + return R11_REG; + else + { + bool is_fastcall; + int regparm; + + is_fastcall = (lookup_attribute ("fastcall", + TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl))) + != NULL); + regparm = ix86_function_regparm (TREE_TYPE (cfun->decl), cfun->decl); + + if (is_fastcall) + { + if (DECL_STATIC_CHAIN (cfun->decl)) + { + sorry ("-fsplit-stack does not support fastcall with " + "nested function"); + return INVALID_REGNUM; + } + return AX_REG; + } + else if (regparm < 3) + { + if (!DECL_STATIC_CHAIN (cfun->decl)) + return CX_REG; + else + { + if (regparm >= 2) + { + sorry ("-fsplit-stack does not support 2 register " + " parameters for a nested function"); + return INVALID_REGNUM; + } + return DX_REG; + } + } + else + { + /* FIXME: We could make this work by pushing a register + around the addition and comparison. */ + sorry ("-fsplit-stack does not support 3 register parameters"); + return INVALID_REGNUM; + } + } +} + +/* A SYMBOL_REF for the function which allocates new stackspace for + -fsplit-stack. */ + +static GTY(()) rtx split_stack_fn; + +/* A SYMBOL_REF for the more stack function when using the large + model. */ + +static GTY(()) rtx split_stack_fn_large; + +/* Handle -fsplit-stack. These are the first instructions in the + function, even before the regular prologue. */ + +void +ix86_expand_split_stack_prologue (void) +{ + struct ix86_frame frame; + HOST_WIDE_INT allocate; + unsigned HOST_WIDE_INT args_size; + rtx label, limit, current, jump_insn, allocate_rtx, call_insn, call_fusage; + rtx scratch_reg = NULL_RTX; + rtx varargs_label = NULL_RTX; + rtx fn; + + gcc_assert (flag_split_stack && reload_completed); + + ix86_finalize_stack_realign_flags (); + ix86_compute_frame_layout (&frame); + allocate = frame.stack_pointer_offset - INCOMING_FRAME_SP_OFFSET; + + /* This is the label we will branch to if we have enough stack + space. We expect the basic block reordering pass to reverse this + branch if optimizing, so that we branch in the unlikely case. */ + label = gen_label_rtx (); + + /* We need to compare the stack pointer minus the frame size with + the stack boundary in the TCB. The stack boundary always gives + us SPLIT_STACK_AVAILABLE bytes, so if we need less than that we + can compare directly. Otherwise we need to do an addition. 
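+   Schematically the generated check is
+
+	 if (sp - frame_size >= stack_limit_in_TCB)
+	   goto enough_space;
+	 call __morestack;
+
+   where the UNSPEC_STACK_CHECK memory reference built below stands for
+   the TCB field holding the limit, and the subtraction disappears
+   entirely when the frame fits within SPLIT_STACK_AVAILABLE.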
*/ + + limit = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx), + UNSPEC_STACK_CHECK); + limit = gen_rtx_CONST (Pmode, limit); + limit = gen_rtx_MEM (Pmode, limit); + if (allocate < SPLIT_STACK_AVAILABLE) + current = stack_pointer_rtx; + else + { + unsigned int scratch_regno; + rtx offset; + + /* We need a scratch register to hold the stack pointer minus + the required frame size. Since this is the very start of the + function, the scratch register can be any caller-saved + register which is not used for parameters. */ + offset = GEN_INT (- allocate); + scratch_regno = split_stack_prologue_scratch_regno (); + if (scratch_regno == INVALID_REGNUM) + return; + scratch_reg = gen_rtx_REG (Pmode, scratch_regno); + if (!TARGET_64BIT || x86_64_immediate_operand (offset, Pmode)) + { + /* We don't use ix86_gen_add3 in this case because it will + want to split to lea, but when not optimizing the insn + will not be split after this point. */ + emit_insn (gen_rtx_SET (VOIDmode, scratch_reg, + gen_rtx_PLUS (Pmode, stack_pointer_rtx, + offset))); + } + else + { + emit_move_insn (scratch_reg, offset); + emit_insn (gen_adddi3 (scratch_reg, scratch_reg, + stack_pointer_rtx)); + } + current = scratch_reg; + } + + ix86_expand_branch (GEU, current, limit, label); + jump_insn = get_last_insn (); + JUMP_LABEL (jump_insn) = label; + + /* Mark the jump as very likely to be taken. */ + add_reg_note (jump_insn, REG_BR_PROB, + GEN_INT (REG_BR_PROB_BASE - REG_BR_PROB_BASE / 100)); + + if (split_stack_fn == NULL_RTX) + split_stack_fn = gen_rtx_SYMBOL_REF (Pmode, "__morestack"); + fn = split_stack_fn; + + /* Get more stack space. We pass in the desired stack space and the + size of the arguments to copy to the new stack. In 32-bit mode + we push the parameters; __morestack will return on a new stack + anyhow. In 64-bit mode we pass the parameters in r10 and + r11. */ + allocate_rtx = GEN_INT (allocate); + args_size = crtl->args.size >= 0 ? crtl->args.size : 0; + call_fusage = NULL_RTX; + if (TARGET_64BIT) + { + rtx reg10, reg11; + + reg10 = gen_rtx_REG (Pmode, R10_REG); + reg11 = gen_rtx_REG (Pmode, R11_REG); + + /* If this function uses a static chain, it will be in %r10. + Preserve it across the call to __morestack. */ + if (DECL_STATIC_CHAIN (cfun->decl)) + { + rtx rax; + + rax = gen_rtx_REG (Pmode, AX_REG); + emit_move_insn (rax, reg10); + use_reg (&call_fusage, rax); + } + + if (ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC) + { + HOST_WIDE_INT argval; + + /* When using the large model we need to load the address + into a register, and we've run out of registers. So we + switch to a different calling convention, and we call a + different function: __morestack_large. We pass the + argument size in the upper 32 bits of r10 and pass the + frame size in the lower 32 bits. 
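+	 As a purely illustrative example, args_size == 24 and
+	 allocate == 0x1000 are packed as
+	 argval == (24 << 32) + 0x1000 == 0x0000001800001000
+	 (written as ((args_size << 16) << 16) + allocate in the code
+	 below), from which the callee recovers the two 32-bit halves
+	 again.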
*/ + gcc_assert ((allocate & (HOST_WIDE_INT) 0xffffffff) == allocate); + gcc_assert ((args_size & 0xffffffff) == args_size); + + if (split_stack_fn_large == NULL_RTX) + split_stack_fn_large = + gen_rtx_SYMBOL_REF (Pmode, "__morestack_large_model"); + + if (ix86_cmodel == CM_LARGE_PIC) + { + rtx label, x; + + label = gen_label_rtx (); + emit_label (label); + LABEL_PRESERVE_P (label) = 1; + emit_insn (gen_set_rip_rex64 (reg10, label)); + emit_insn (gen_set_got_offset_rex64 (reg11, label)); + emit_insn (gen_adddi3 (reg10, reg10, reg11)); + x = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, split_stack_fn_large), + UNSPEC_GOT); + x = gen_rtx_CONST (Pmode, x); + emit_move_insn (reg11, x); + x = gen_rtx_PLUS (Pmode, reg10, reg11); + x = gen_const_mem (Pmode, x); + emit_move_insn (reg11, x); + } + else + emit_move_insn (reg11, split_stack_fn_large); + + fn = reg11; + + argval = ((args_size << 16) << 16) + allocate; + emit_move_insn (reg10, GEN_INT (argval)); + } + else + { + emit_move_insn (reg10, allocate_rtx); + emit_move_insn (reg11, GEN_INT (args_size)); + use_reg (&call_fusage, reg11); + } + + use_reg (&call_fusage, reg10); + } + else + { + emit_insn (gen_push (GEN_INT (args_size))); + emit_insn (gen_push (allocate_rtx)); + } + call_insn = ix86_expand_call (NULL_RTX, gen_rtx_MEM (QImode, fn), + GEN_INT (UNITS_PER_WORD), constm1_rtx, + NULL_RTX, 0); + add_function_usage_to (call_insn, call_fusage); + + /* In order to make call/return prediction work right, we now need + to execute a return instruction. See + libgcc/config/i386/morestack.S for the details on how this works. + + For flow purposes gcc must not see this as a return + instruction--we need control flow to continue at the subsequent + label. Therefore, we use an unspec. */ + gcc_assert (crtl->args.pops_args < 65536); + emit_insn (gen_split_stack_return (GEN_INT (crtl->args.pops_args))); + + /* If we are in 64-bit mode and this function uses a static chain, + we saved %r10 in %rax before calling _morestack. */ + if (TARGET_64BIT && DECL_STATIC_CHAIN (cfun->decl)) + emit_move_insn (gen_rtx_REG (Pmode, R10_REG), + gen_rtx_REG (Pmode, AX_REG)); + + /* If this function calls va_start, we need to store a pointer to + the arguments on the old stack, because they may not have been + all copied to the new stack. At this point the old stack can be + found at the frame pointer value used by __morestack, because + __morestack has set that up before calling back to us. Here we + store that pointer in a scratch register, and in + ix86_expand_prologue we store the scratch register in a stack + slot. */ + if (cfun->machine->split_stack_varargs_pointer != NULL_RTX) + { + unsigned int scratch_regno; + rtx frame_reg; + int words; + + scratch_regno = split_stack_prologue_scratch_regno (); + scratch_reg = gen_rtx_REG (Pmode, scratch_regno); + frame_reg = gen_rtx_REG (Pmode, BP_REG); + + /* 64-bit: + fp -> old fp value + return address within this function + return address of caller of this function + stack arguments + So we add three words to get to the stack arguments. + + 32-bit: + fp -> old fp value + return address within this function + first argument to __morestack + second argument to __morestack + return address of caller of this function + stack arguments + So we add five words to get to the stack arguments. + */ + words = TARGET_64BIT ? 
3 : 5; + emit_insn (gen_rtx_SET (VOIDmode, scratch_reg, + gen_rtx_PLUS (Pmode, frame_reg, + GEN_INT (words * UNITS_PER_WORD)))); + + varargs_label = gen_label_rtx (); + emit_jump_insn (gen_jump (varargs_label)); + JUMP_LABEL (get_last_insn ()) = varargs_label; + + emit_barrier (); + } + + emit_label (label); + LABEL_NUSES (label) = 1; + + /* If this function calls va_start, we now have to set the scratch + register for the case where we do not call __morestack. In this + case we need to set it based on the stack pointer. */ + if (cfun->machine->split_stack_varargs_pointer != NULL_RTX) + { + emit_insn (gen_rtx_SET (VOIDmode, scratch_reg, + gen_rtx_PLUS (Pmode, stack_pointer_rtx, + GEN_INT (UNITS_PER_WORD)))); + + emit_label (varargs_label); + LABEL_NUSES (varargs_label) = 1; + } +} + +/* We may have to tell the dataflow pass that the split stack prologue + is initializing a scratch register. */ + +static void +ix86_live_on_entry (bitmap regs) +{ + if (cfun->machine->split_stack_varargs_pointer != NULL_RTX) + { + gcc_assert (flag_split_stack); + bitmap_set_bit (regs, split_stack_prologue_scratch_regno ()); + } +} + +/* Extract the parts of an RTL expression that is a valid memory address + for an instruction. Return 0 if the structure of the address is + grossly off. Return -1 if the address contains ASHIFT, so it is not + strictly valid, but still used for computing length of lea instruction. */ + +int +ix86_decompose_address (rtx addr, struct ix86_address *out) +{ + rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX; + rtx base_reg, index_reg; + HOST_WIDE_INT scale = 1; + rtx scale_rtx = NULL_RTX; + rtx tmp; + int retval = 1; + enum ix86_address_seg seg = SEG_DEFAULT; + + if (REG_P (addr) || GET_CODE (addr) == SUBREG) + base = addr; + else if (GET_CODE (addr) == PLUS) + { + rtx addends[4], op; + int n = 0, i; + + op = addr; + do + { + if (n >= 4) + return 0; + addends[n++] = XEXP (op, 1); + op = XEXP (op, 0); + } + while (GET_CODE (op) == PLUS); + if (n >= 4) + return 0; + addends[n] = op; + + for (i = n; i >= 0; --i) + { + op = addends[i]; + switch (GET_CODE (op)) + { + case MULT: + if (index) + return 0; + index = XEXP (op, 0); + scale_rtx = XEXP (op, 1); + break; + + case ASHIFT: + if (index) + return 0; + index = XEXP (op, 0); + tmp = XEXP (op, 1); + if (!CONST_INT_P (tmp)) + return 0; + scale = INTVAL (tmp); + if ((unsigned HOST_WIDE_INT) scale > 3) + return 0; + scale = 1 << scale; + break; + + case UNSPEC: + if (XINT (op, 1) == UNSPEC_TP + && TARGET_TLS_DIRECT_SEG_REFS + && seg == SEG_DEFAULT) + seg = TARGET_64BIT ? SEG_FS : SEG_GS; + else + return 0; + break; + + case REG: + case SUBREG: + if (!base) + base = op; + else if (!index) + index = op; + else + return 0; + break; + + case CONST: + case CONST_INT: + case SYMBOL_REF: + case LABEL_REF: + if (disp) + return 0; + disp = op; + break; + + default: + return 0; + } + } + } + else if (GET_CODE (addr) == MULT) + { + index = XEXP (addr, 0); /* index*scale */ + scale_rtx = XEXP (addr, 1); + } + else if (GET_CODE (addr) == ASHIFT) + { + /* We're called for lea too, which implements ashift on occasion. */ + index = XEXP (addr, 0); + tmp = XEXP (addr, 1); + if (!CONST_INT_P (tmp)) + return 0; + scale = INTVAL (tmp); + if ((unsigned HOST_WIDE_INT) scale > 3) + return 0; + scale = 1 << scale; + retval = -1; + } + else + disp = addr; /* displacement */ + + /* Extract the integral value of scale. 
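+     (For example, the address 8(%eax,%ebx,4) decomposes into
+     base = %eax, index = %ebx, scale = 4 and disp = 8; when the index
+     arrived wrapped in an ASHIFT rather than a MULT, the shift count
+     was already converted above via scale = 1 << count.)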
*/ + if (scale_rtx) + { + if (!CONST_INT_P (scale_rtx)) + return 0; + scale = INTVAL (scale_rtx); + } + + base_reg = base && GET_CODE (base) == SUBREG ? SUBREG_REG (base) : base; + index_reg = index && GET_CODE (index) == SUBREG ? SUBREG_REG (index) : index; + + /* Avoid useless 0 displacement. */ + if (disp == const0_rtx && (base || index)) + disp = NULL_RTX; + + /* Allow arg pointer and stack pointer as index if there is not scaling. */ + if (base_reg && index_reg && scale == 1 + && (index_reg == arg_pointer_rtx + || index_reg == frame_pointer_rtx + || (REG_P (index_reg) && REGNO (index_reg) == STACK_POINTER_REGNUM))) + { + rtx tmp; + tmp = base, base = index, index = tmp; + tmp = base_reg, base_reg = index_reg, index_reg = tmp; + } + + /* Special case: %ebp cannot be encoded as a base without a displacement. + Similarly %r13. */ + if (!disp + && base_reg + && (base_reg == hard_frame_pointer_rtx + || base_reg == frame_pointer_rtx + || base_reg == arg_pointer_rtx + || (REG_P (base_reg) + && (REGNO (base_reg) == HARD_FRAME_POINTER_REGNUM + || REGNO (base_reg) == R13_REG)))) + disp = const0_rtx; + + /* Special case: on K6, [%esi] makes the instruction vector decoded. + Avoid this by transforming to [%esi+0]. + Reload calls address legitimization without cfun defined, so we need + to test cfun for being non-NULL. */ + if (TARGET_K6 && cfun && optimize_function_for_speed_p (cfun) + && base_reg && !index_reg && !disp + && REG_P (base_reg) && REGNO (base_reg) == SI_REG) + disp = const0_rtx; + + /* Special case: encode reg+reg instead of reg*2. */ + if (!base && index && scale == 2) + base = index, base_reg = index_reg, scale = 1; + + /* Special case: scaling cannot be encoded without base or displacement. */ + if (!base && !disp && index && scale != 1) + disp = const0_rtx; + + out->base = base; + out->index = index; + out->disp = disp; + out->scale = scale; + out->seg = seg; + + return retval; +} + +/* Return cost of the memory address x. + For i386, it is better to use a complex address than let gcc copy + the address into a reg and make a new pseudo. But not if the address + requires to two regs - that would mean more pseudos with longer + lifetimes. */ +static int +ix86_address_cost (rtx x, bool speed ATTRIBUTE_UNUSED) +{ + struct ix86_address parts; + int cost = 1; + int ok = ix86_decompose_address (x, &parts); + + gcc_assert (ok); + + if (parts.base && GET_CODE (parts.base) == SUBREG) + parts.base = SUBREG_REG (parts.base); + if (parts.index && GET_CODE (parts.index) == SUBREG) + parts.index = SUBREG_REG (parts.index); + + /* Attempt to minimize number of registers in the address. */ + if ((parts.base + && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER)) + || (parts.index + && (!REG_P (parts.index) + || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER))) + cost++; + + if (parts.base + && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER) + && parts.index + && (!REG_P (parts.index) || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER) + && parts.base != parts.index) + cost++; + + /* AMD-K6 don't like addresses with ModR/M set to 00_xxx_100b, + since it's predecode logic can't detect the length of instructions + and it degenerates to vector decoded. Increase cost of such + addresses here. The penalty is minimally 2 cycles. It may be worthwhile + to split such addresses or even refuse such addresses at all. 
+ + Following addressing modes are affected: + [base+scale*index] + [scale*index+disp] + [base+index] + + The first and last case may be avoidable by explicitly coding the zero in + memory address, but I don't have AMD-K6 machine handy to check this + theory. */ + + if (TARGET_K6 + && ((!parts.disp && parts.base && parts.index && parts.scale != 1) + || (parts.disp && !parts.base && parts.index && parts.scale != 1) + || (!parts.disp && parts.base && parts.index && parts.scale == 1))) + cost += 10; + + return cost; +} + +/* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O as + this is used for to form addresses to local data when -fPIC is in + use. */ + +static bool +darwin_local_data_pic (rtx disp) +{ + return (GET_CODE (disp) == UNSPEC + && XINT (disp, 1) == UNSPEC_MACHOPIC_OFFSET); +} + +/* Determine if a given RTX is a valid constant. We already know this + satisfies CONSTANT_P. */ + +bool +legitimate_constant_p (rtx x) +{ + switch (GET_CODE (x)) + { + case CONST: + x = XEXP (x, 0); + + if (GET_CODE (x) == PLUS) + { + if (!CONST_INT_P (XEXP (x, 1))) + return false; + x = XEXP (x, 0); + } + + if (TARGET_MACHO && darwin_local_data_pic (x)) + return true; + + /* Only some unspecs are valid as "constants". */ + if (GET_CODE (x) == UNSPEC) + switch (XINT (x, 1)) + { + case UNSPEC_GOT: + case UNSPEC_GOTOFF: + case UNSPEC_PLTOFF: + return TARGET_64BIT; + case UNSPEC_TPOFF: + case UNSPEC_NTPOFF: + x = XVECEXP (x, 0, 0); + return (GET_CODE (x) == SYMBOL_REF + && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC); + case UNSPEC_DTPOFF: + x = XVECEXP (x, 0, 0); + return (GET_CODE (x) == SYMBOL_REF + && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC); + default: + return false; + } + + /* We must have drilled down to a symbol. */ + if (GET_CODE (x) == LABEL_REF) + return true; + if (GET_CODE (x) != SYMBOL_REF) + return false; + /* FALLTHRU */ + + case SYMBOL_REF: + /* TLS symbols are never valid. */ + if (SYMBOL_REF_TLS_MODEL (x)) + return false; + + /* DLLIMPORT symbols are never valid. */ + if (TARGET_DLLIMPORT_DECL_ATTRIBUTES + && SYMBOL_REF_DLLIMPORT_P (x)) + return false; + +#if TARGET_MACHO + /* mdynamic-no-pic */ + if (MACHO_DYNAMIC_NO_PIC_P) + return machopic_symbol_defined_p (x); +#endif + break; + + case CONST_DOUBLE: + if (GET_MODE (x) == TImode + && x != CONST0_RTX (TImode) + && !TARGET_64BIT) + return false; + break; + + case CONST_VECTOR: + if (!standard_sse_constant_p (x)) + return false; + + default: + break; + } + + /* Otherwise we handle everything else in the move patterns. */ + return true; +} + +/* Determine if it's legal to put X into the constant pool. This + is not possible for the address of thread-local symbols, which + is checked above. */ + +static bool +ix86_cannot_force_const_mem (rtx x) +{ + /* We can always put integral constants and vectors in memory. */ + switch (GET_CODE (x)) + { + case CONST_INT: + case CONST_DOUBLE: + case CONST_VECTOR: + return false; + + default: + break; + } + return !legitimate_constant_p (x); +} + + +/* Nonzero if the constant value X is a legitimate general operand + when generating PIC code. It is given that flag_pic is on and + that X satisfies CONSTANT_P or is a CONST_DOUBLE. */ + +bool +legitimate_pic_operand_p (rtx x) +{ + rtx inner; + + switch (GET_CODE (x)) + { + case CONST: + inner = XEXP (x, 0); + if (GET_CODE (inner) == PLUS + && CONST_INT_P (XEXP (inner, 1))) + inner = XEXP (inner, 0); + + /* Only some unspecs are valid as "constants". 
*/ + if (GET_CODE (inner) == UNSPEC) + switch (XINT (inner, 1)) + { + case UNSPEC_GOT: + case UNSPEC_GOTOFF: + case UNSPEC_PLTOFF: + return TARGET_64BIT; + case UNSPEC_TPOFF: + x = XVECEXP (inner, 0, 0); + return (GET_CODE (x) == SYMBOL_REF + && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC); + case UNSPEC_MACHOPIC_OFFSET: + return legitimate_pic_address_disp_p (x); + default: + return false; + } + /* FALLTHRU */ + + case SYMBOL_REF: + case LABEL_REF: + return legitimate_pic_address_disp_p (x); + + default: + return true; + } +} + +/* Determine if a given CONST RTX is a valid memory displacement + in PIC mode. */ + +bool +legitimate_pic_address_disp_p (rtx disp) +{ + bool saw_plus; + + /* In 64bit mode we can allow direct addresses of symbols and labels + when they are not dynamic symbols. */ + if (TARGET_64BIT) + { + rtx op0 = disp, op1; + + switch (GET_CODE (disp)) + { + case LABEL_REF: + return true; + + case CONST: + if (GET_CODE (XEXP (disp, 0)) != PLUS) + break; + op0 = XEXP (XEXP (disp, 0), 0); + op1 = XEXP (XEXP (disp, 0), 1); + if (!CONST_INT_P (op1) + || INTVAL (op1) >= 16*1024*1024 + || INTVAL (op1) < -16*1024*1024) + break; + if (GET_CODE (op0) == LABEL_REF) + return true; + if (GET_CODE (op0) != SYMBOL_REF) + break; + /* FALLTHRU */ + + case SYMBOL_REF: + /* TLS references should always be enclosed in UNSPEC. */ + if (SYMBOL_REF_TLS_MODEL (op0)) + return false; + if (!SYMBOL_REF_FAR_ADDR_P (op0) && SYMBOL_REF_LOCAL_P (op0) + && ix86_cmodel != CM_LARGE_PIC) + return true; + break; + + default: + break; + } + } + if (GET_CODE (disp) != CONST) + return false; + disp = XEXP (disp, 0); + + if (TARGET_64BIT) + { + /* We are unsafe to allow PLUS expressions. This limit allowed distance + of GOT tables. We should not need these anyway. */ + if (GET_CODE (disp) != UNSPEC + || (XINT (disp, 1) != UNSPEC_GOTPCREL + && XINT (disp, 1) != UNSPEC_GOTOFF + && XINT (disp, 1) != UNSPEC_PCREL + && XINT (disp, 1) != UNSPEC_PLTOFF)) + return false; + + if (GET_CODE (XVECEXP (disp, 0, 0)) != SYMBOL_REF + && GET_CODE (XVECEXP (disp, 0, 0)) != LABEL_REF) + return false; + return true; + } + + saw_plus = false; + if (GET_CODE (disp) == PLUS) + { + if (!CONST_INT_P (XEXP (disp, 1))) + return false; + disp = XEXP (disp, 0); + saw_plus = true; + } + + if (TARGET_MACHO && darwin_local_data_pic (disp)) + return true; + + if (GET_CODE (disp) != UNSPEC) + return false; + + switch (XINT (disp, 1)) + { + case UNSPEC_GOT: + if (saw_plus) + return false; + /* We need to check for both symbols and labels because VxWorks loads + text labels with @GOT rather than @GOTOFF. See gotoff_operand for + details. */ + return (GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF + || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF); + case UNSPEC_GOTOFF: + /* Refuse GOTOFF in 64bit mode since it is always 64bit when used. + While ABI specify also 32bit relocation but we don't produce it in + small PIC model at all. 
*/ + if ((GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF + || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF) + && !TARGET_64BIT) + return gotoff_operand (XVECEXP (disp, 0, 0), Pmode); + return false; + case UNSPEC_GOTTPOFF: + case UNSPEC_GOTNTPOFF: + case UNSPEC_INDNTPOFF: + if (saw_plus) + return false; + disp = XVECEXP (disp, 0, 0); + return (GET_CODE (disp) == SYMBOL_REF + && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_INITIAL_EXEC); + case UNSPEC_NTPOFF: + disp = XVECEXP (disp, 0, 0); + return (GET_CODE (disp) == SYMBOL_REF + && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_EXEC); + case UNSPEC_DTPOFF: + disp = XVECEXP (disp, 0, 0); + return (GET_CODE (disp) == SYMBOL_REF + && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC); + } + + return false; +} + +/* Our implementation of LEGITIMIZE_RELOAD_ADDRESS. Returns a value to + replace the input X, or the original X if no replacement is called for. + The output parameter *WIN is 1 if the calling macro should goto WIN, + 0 if it should not. */ + +bool +ix86_legitimize_reload_address (rtx x, + enum machine_mode mode ATTRIBUTE_UNUSED, + int opnum, int type, + int ind_levels ATTRIBUTE_UNUSED) +{ + /* Reload can generate: + + (plus:DI (plus:DI (unspec:DI [(const_int 0 [0])] UNSPEC_TP) + (reg:DI 97)) + (reg:DI 2 cx)) + + This RTX is rejected from ix86_legitimate_address_p due to + non-strictness of base register 97. Following this rejection, + reload pushes all three components into separate registers, + creating invalid memory address RTX. + + Following code reloads only the invalid part of the + memory address RTX. */ + + if (GET_CODE (x) == PLUS + && REG_P (XEXP (x, 1)) + && GET_CODE (XEXP (x, 0)) == PLUS + && REG_P (XEXP (XEXP (x, 0), 1))) + { + rtx base, index; + bool something_reloaded = false; + + base = XEXP (XEXP (x, 0), 1); + if (!REG_OK_FOR_BASE_STRICT_P (base)) + { + push_reload (base, NULL_RTX, &XEXP (XEXP (x, 0), 1), NULL, + BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0, + opnum, (enum reload_type)type); + something_reloaded = true; + } + + index = XEXP (x, 1); + if (!REG_OK_FOR_INDEX_STRICT_P (index)) + { + push_reload (index, NULL_RTX, &XEXP (x, 1), NULL, + INDEX_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0, + opnum, (enum reload_type)type); + something_reloaded = true; + } + + gcc_assert (something_reloaded); + return true; + } + + return false; +} + +/* Recognizes RTL expressions that are valid memory addresses for an + instruction. The MODE argument is the machine mode for the MEM + expression that wants to use this address. + + It only recognizes address in canonical form. LEGITIMIZE_ADDRESS should + convert common non-canonical forms to canonical form so that they will + be recognized. */ + +static bool +ix86_legitimate_address_p (enum machine_mode mode ATTRIBUTE_UNUSED, + rtx addr, bool strict) +{ + struct ix86_address parts; + rtx base, index, disp; + HOST_WIDE_INT scale; + + if (ix86_decompose_address (addr, &parts) <= 0) + /* Decomposition failed. */ + return false; + + base = parts.base; + index = parts.index; + disp = parts.disp; + scale = parts.scale; + + /* Validate base register. + + Don't allow SUBREG's that span more than a word here. It can lead to spill + failures when the base is one word out of a two word structure, which is + represented internally as a DImode int. 
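+     (For instance, on ia32 a SUBREG of a DImode pseudo is rejected as a
+     base by the size check below because the inner register is two
+     words wide, while a SUBREG of an SImode or HImode register is
+     accepted.)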
*/ + + if (base) + { + rtx reg; + + if (REG_P (base)) + reg = base; + else if (GET_CODE (base) == SUBREG + && REG_P (SUBREG_REG (base)) + && GET_MODE_SIZE (GET_MODE (SUBREG_REG (base))) + <= UNITS_PER_WORD) + reg = SUBREG_REG (base); + else + /* Base is not a register. */ + return false; + + if (GET_MODE (base) != Pmode) + /* Base is not in Pmode. */ + return false; + + if ((strict && ! REG_OK_FOR_BASE_STRICT_P (reg)) + || (! strict && ! REG_OK_FOR_BASE_NONSTRICT_P (reg))) + /* Base is not valid. */ + return false; + } + + /* Validate index register. + + Don't allow SUBREG's that span more than a word here -- same as above. */ + + if (index) + { + rtx reg; + + if (REG_P (index)) + reg = index; + else if (GET_CODE (index) == SUBREG + && REG_P (SUBREG_REG (index)) + && GET_MODE_SIZE (GET_MODE (SUBREG_REG (index))) + <= UNITS_PER_WORD) + reg = SUBREG_REG (index); + else + /* Index is not a register. */ + return false; + + if (GET_MODE (index) != Pmode) + /* Index is not in Pmode. */ + return false; + + if ((strict && ! REG_OK_FOR_INDEX_STRICT_P (reg)) + || (! strict && ! REG_OK_FOR_INDEX_NONSTRICT_P (reg))) + /* Index is not valid. */ + return false; + } + + /* Validate scale factor. */ + if (scale != 1) + { + if (!index) + /* Scale without index. */ + return false; + + if (scale != 2 && scale != 4 && scale != 8) + /* Scale is not a valid multiplier. */ + return false; + } + + /* Validate displacement. */ + if (disp) + { + if (GET_CODE (disp) == CONST + && GET_CODE (XEXP (disp, 0)) == UNSPEC + && XINT (XEXP (disp, 0), 1) != UNSPEC_MACHOPIC_OFFSET) + switch (XINT (XEXP (disp, 0), 1)) + { + /* Refuse GOTOFF and GOT in 64bit mode since it is always 64bit when + used. While ABI specify also 32bit relocations, we don't produce + them at all and use IP relative instead. */ + case UNSPEC_GOT: + case UNSPEC_GOTOFF: + gcc_assert (flag_pic); + if (!TARGET_64BIT) + goto is_legitimate_pic; + + /* 64bit address unspec. */ + return false; + + case UNSPEC_GOTPCREL: + case UNSPEC_PCREL: + gcc_assert (flag_pic); + goto is_legitimate_pic; + + case UNSPEC_GOTTPOFF: + case UNSPEC_GOTNTPOFF: + case UNSPEC_INDNTPOFF: + case UNSPEC_NTPOFF: + case UNSPEC_DTPOFF: + break; + + case UNSPEC_STACK_CHECK: + gcc_assert (flag_split_stack); + break; + + default: + /* Invalid address unspec. */ + return false; + } + + else if (SYMBOLIC_CONST (disp) + && (flag_pic + || (TARGET_MACHO +#if TARGET_MACHO + && MACHOPIC_INDIRECT + && !machopic_operand_p (disp) +#endif + ))) + { + + is_legitimate_pic: + if (TARGET_64BIT && (index || base)) + { + /* foo@dtpoff(%rX) is ok. */ + if (GET_CODE (disp) != CONST + || GET_CODE (XEXP (disp, 0)) != PLUS + || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC + || !CONST_INT_P (XEXP (XEXP (disp, 0), 1)) + || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF + && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF)) + /* Non-constant pic memory reference. */ + return false; + } + else if ((!TARGET_MACHO || flag_pic) + && ! legitimate_pic_address_disp_p (disp)) + /* Displacement is an invalid pic construct. */ + return false; +#if TARGET_MACHO + else if (MACHO_DYNAMIC_NO_PIC_P && !legitimate_constant_p (disp)) + /* displacment must be referenced via non_lazy_pointer */ + return false; +#endif + + /* This code used to verify that a symbolic pic displacement + includes the pic_offset_table_rtx register. 
+ + While this is good idea, unfortunately these constructs may + be created by "adds using lea" optimization for incorrect + code like: + + int a; + int foo(int i) + { + return *(&a+i); + } + + This code is nonsensical, but results in addressing + GOT table with pic_offset_table_rtx base. We can't + just refuse it easily, since it gets matched by + "addsi3" pattern, that later gets split to lea in the + case output register differs from input. While this + can be handled by separate addsi pattern for this case + that never results in lea, this seems to be easier and + correct fix for crash to disable this test. */ + } + else if (GET_CODE (disp) != LABEL_REF + && !CONST_INT_P (disp) + && (GET_CODE (disp) != CONST + || !legitimate_constant_p (disp)) + && (GET_CODE (disp) != SYMBOL_REF + || !legitimate_constant_p (disp))) + /* Displacement is not constant. */ + return false; + else if (TARGET_64BIT + && !x86_64_immediate_operand (disp, VOIDmode)) + /* Displacement is out of range. */ + return false; + } + + /* Everything looks valid. */ + return true; +} + +/* Determine if a given RTX is a valid constant address. */ + +bool +constant_address_p (rtx x) +{ + return CONSTANT_P (x) && ix86_legitimate_address_p (Pmode, x, 1); +} + +/* Return a unique alias set for the GOT. */ + +static alias_set_type +ix86_GOT_alias_set (void) +{ + static alias_set_type set = -1; + if (set == -1) + set = new_alias_set (); + return set; +} + +/* Return a legitimate reference for ORIG (an address) using the + register REG. If REG is 0, a new pseudo is generated. + + There are two types of references that must be handled: + + 1. Global data references must load the address from the GOT, via + the PIC reg. An insn is emitted to do this load, and the reg is + returned. + + 2. Static data references, constant pool addresses, and code labels + compute the address as an offset from the GOT, whose base is in + the PIC reg. Static data objects have SYMBOL_FLAG_LOCAL set to + differentiate them from global data objects. The returned + address is the PIC reg + an unspec constant. + + TARGET_LEGITIMATE_ADDRESS_P rejects symbolic references unless the PIC + reg also appears in the address. */ + +static rtx +legitimize_pic_address (rtx orig, rtx reg) +{ + rtx addr = orig; + rtx new_rtx = orig; + rtx base; + +#if TARGET_MACHO + if (TARGET_MACHO && !TARGET_64BIT) + { + if (reg == 0) + reg = gen_reg_rtx (Pmode); + /* Use the generic Mach-O PIC machinery. */ + return machopic_legitimize_pic_address (orig, GET_MODE (orig), reg); + } +#endif + + if (TARGET_64BIT && legitimate_pic_address_disp_p (addr)) + new_rtx = addr; + else if (TARGET_64BIT + && ix86_cmodel != CM_SMALL_PIC + && gotoff_operand (addr, Pmode)) + { + rtx tmpreg; + /* This symbol may be referenced via a displacement from the PIC + base address (@GOTOFF). 
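+     That is, the address is formed as the PIC base register plus the
+     link-time-constant sym@GOTOFF offset, so no load from a GOT slot is
+     required for symbols that bind locally.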
*/ + + if (reload_in_progress) + df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true); + if (GET_CODE (addr) == CONST) + addr = XEXP (addr, 0); + if (GET_CODE (addr) == PLUS) + { + new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)), + UNSPEC_GOTOFF); + new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1)); + } + else + new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF); + new_rtx = gen_rtx_CONST (Pmode, new_rtx); + if (!reg) + tmpreg = gen_reg_rtx (Pmode); + else + tmpreg = reg; + emit_move_insn (tmpreg, new_rtx); + + if (reg != 0) + { + new_rtx = expand_simple_binop (Pmode, PLUS, reg, pic_offset_table_rtx, + tmpreg, 1, OPTAB_DIRECT); + new_rtx = reg; + } + else new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, tmpreg); + } + else if (!TARGET_64BIT && gotoff_operand (addr, Pmode)) + { + /* This symbol may be referenced via a displacement from the PIC + base address (@GOTOFF). */ + + if (reload_in_progress) + df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true); + if (GET_CODE (addr) == CONST) + addr = XEXP (addr, 0); + if (GET_CODE (addr) == PLUS) + { + new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)), + UNSPEC_GOTOFF); + new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1)); + } + else + new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF); + new_rtx = gen_rtx_CONST (Pmode, new_rtx); + new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx); + + if (reg != 0) + { + emit_move_insn (reg, new_rtx); + new_rtx = reg; + } + } + else if ((GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0) + /* We can't use @GOTOFF for text labels on VxWorks; + see gotoff_operand. */ + || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF)) + { + if (TARGET_DLLIMPORT_DECL_ATTRIBUTES) + { + if (GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (addr)) + return legitimize_dllimport_symbol (addr, true); + if (GET_CODE (addr) == CONST && GET_CODE (XEXP (addr, 0)) == PLUS + && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF + && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (addr, 0), 0))) + { + rtx t = legitimize_dllimport_symbol (XEXP (XEXP (addr, 0), 0), true); + return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1)); + } + } + + /* For x64 PE-COFF there is no GOT table. So we use address + directly. */ + if (TARGET_64BIT && DEFAULT_ABI == MS_ABI) + { + new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_PCREL); + new_rtx = gen_rtx_CONST (Pmode, new_rtx); + + if (reg == 0) + reg = gen_reg_rtx (Pmode); + emit_move_insn (reg, new_rtx); + new_rtx = reg; + } + else if (TARGET_64BIT && ix86_cmodel != CM_LARGE_PIC) + { + new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTPCREL); + new_rtx = gen_rtx_CONST (Pmode, new_rtx); + new_rtx = gen_const_mem (Pmode, new_rtx); + set_mem_alias_set (new_rtx, ix86_GOT_alias_set ()); + + if (reg == 0) + reg = gen_reg_rtx (Pmode); + /* Use directly gen_movsi, otherwise the address is loaded + into register for CSE. We don't want to CSE this addresses, + instead we CSE addresses from the GOT table, so skip this. */ + emit_insn (gen_movsi (reg, new_rtx)); + new_rtx = reg; + } + else + { + /* This symbol must be referenced via a load from the + Global Offset Table (@GOT). 
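+     Concretely, the code below builds pic_reg + sym@GOT, marks the
+     resulting memory reference with the GOT alias set, and loads the
+     symbol's address from that slot (on ia32 the classic
+     "movl sym@GOT(%ebx), %reg" pattern).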
*/ + + if (reload_in_progress) + df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true); + new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOT); + new_rtx = gen_rtx_CONST (Pmode, new_rtx); + if (TARGET_64BIT) + new_rtx = force_reg (Pmode, new_rtx); + new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx); + new_rtx = gen_const_mem (Pmode, new_rtx); + set_mem_alias_set (new_rtx, ix86_GOT_alias_set ()); + + if (reg == 0) + reg = gen_reg_rtx (Pmode); + emit_move_insn (reg, new_rtx); + new_rtx = reg; + } + } + else + { + if (CONST_INT_P (addr) + && !x86_64_immediate_operand (addr, VOIDmode)) + { + if (reg) + { + emit_move_insn (reg, addr); + new_rtx = reg; + } + else + new_rtx = force_reg (Pmode, addr); + } + else if (GET_CODE (addr) == CONST) + { + addr = XEXP (addr, 0); + + /* We must match stuff we generate before. Assume the only + unspecs that can get here are ours. Not that we could do + anything with them anyway.... */ + if (GET_CODE (addr) == UNSPEC + || (GET_CODE (addr) == PLUS + && GET_CODE (XEXP (addr, 0)) == UNSPEC)) + return orig; + gcc_assert (GET_CODE (addr) == PLUS); + } + if (GET_CODE (addr) == PLUS) + { + rtx op0 = XEXP (addr, 0), op1 = XEXP (addr, 1); + + /* Check first to see if this is a constant offset from a @GOTOFF + symbol reference. */ + if (gotoff_operand (op0, Pmode) + && CONST_INT_P (op1)) + { + if (!TARGET_64BIT) + { + if (reload_in_progress) + df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true); + new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op0), + UNSPEC_GOTOFF); + new_rtx = gen_rtx_PLUS (Pmode, new_rtx, op1); + new_rtx = gen_rtx_CONST (Pmode, new_rtx); + new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx); + + if (reg != 0) + { + emit_move_insn (reg, new_rtx); + new_rtx = reg; + } + } + else + { + if (INTVAL (op1) < -16*1024*1024 + || INTVAL (op1) >= 16*1024*1024) + { + if (!x86_64_immediate_operand (op1, Pmode)) + op1 = force_reg (Pmode, op1); + new_rtx = gen_rtx_PLUS (Pmode, force_reg (Pmode, op0), op1); + } + } + } + else + { + base = legitimize_pic_address (XEXP (addr, 0), reg); + new_rtx = legitimize_pic_address (XEXP (addr, 1), + base == reg ? NULL_RTX : reg); + + if (CONST_INT_P (new_rtx)) + new_rtx = plus_constant (base, INTVAL (new_rtx)); + else + { + if (GET_CODE (new_rtx) == PLUS && CONSTANT_P (XEXP (new_rtx, 1))) + { + base = gen_rtx_PLUS (Pmode, base, XEXP (new_rtx, 0)); + new_rtx = XEXP (new_rtx, 1); + } + new_rtx = gen_rtx_PLUS (Pmode, base, new_rtx); + } + } + } + } + return new_rtx; +} + +/* Load the thread pointer. If TO_REG is true, force it into a register. */ + +static rtx +get_thread_pointer (int to_reg) +{ + rtx tp, reg, insn; + + tp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx), UNSPEC_TP); + if (!to_reg) + return tp; + + reg = gen_reg_rtx (Pmode); + insn = gen_rtx_SET (VOIDmode, reg, tp); + insn = emit_insn (insn); + + return reg; +} + +/* A subroutine of ix86_legitimize_address and ix86_expand_move. FOR_MOV is + false if we expect this to be used for a memory address and true if + we expect to load the address into a register. */ + +static rtx +legitimize_tls_address (rtx x, enum tls_model model, int for_mov) +{ + rtx dest, base, off, pic, tp; + int type; + + switch (model) + { + case TLS_MODEL_GLOBAL_DYNAMIC: + dest = gen_reg_rtx (Pmode); + tp = TARGET_GNU2_TLS ? get_thread_pointer (1) : 0; + + if (TARGET_64BIT && ! 
TARGET_GNU2_TLS) + { + rtx rax = gen_rtx_REG (Pmode, AX_REG), insns; + + start_sequence (); + emit_call_insn (gen_tls_global_dynamic_64 (rax, x)); + insns = get_insns (); + end_sequence (); + + RTL_CONST_CALL_P (insns) = 1; + emit_libcall_block (insns, dest, rax, x); + } + else if (TARGET_64BIT && TARGET_GNU2_TLS) + emit_insn (gen_tls_global_dynamic_64 (dest, x)); + else + emit_insn (gen_tls_global_dynamic_32 (dest, x)); + + if (TARGET_GNU2_TLS) + { + dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, dest)); + + set_unique_reg_note (get_last_insn (), REG_EQUAL, x); + } + break; + + case TLS_MODEL_LOCAL_DYNAMIC: + base = gen_reg_rtx (Pmode); + tp = TARGET_GNU2_TLS ? get_thread_pointer (1) : 0; + + if (TARGET_64BIT && ! TARGET_GNU2_TLS) + { + rtx rax = gen_rtx_REG (Pmode, AX_REG), insns, note; + + start_sequence (); + emit_call_insn (gen_tls_local_dynamic_base_64 (rax)); + insns = get_insns (); + end_sequence (); + + note = gen_rtx_EXPR_LIST (VOIDmode, const0_rtx, NULL); + note = gen_rtx_EXPR_LIST (VOIDmode, ix86_tls_get_addr (), note); + RTL_CONST_CALL_P (insns) = 1; + emit_libcall_block (insns, base, rax, note); + } + else if (TARGET_64BIT && TARGET_GNU2_TLS) + emit_insn (gen_tls_local_dynamic_base_64 (base)); + else + emit_insn (gen_tls_local_dynamic_base_32 (base)); + + if (TARGET_GNU2_TLS) + { + rtx x = ix86_tls_module_base (); + + set_unique_reg_note (get_last_insn (), REG_EQUAL, + gen_rtx_MINUS (Pmode, x, tp)); + } + + off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), UNSPEC_DTPOFF); + off = gen_rtx_CONST (Pmode, off); + + dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, base, off)); + + if (TARGET_GNU2_TLS) + { + dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, dest, tp)); + + set_unique_reg_note (get_last_insn (), REG_EQUAL, x); + } + + break; + + case TLS_MODEL_INITIAL_EXEC: + if (TARGET_64BIT) + { + if (TARGET_SUN_TLS) + { + /* The Sun linker took the AMD64 TLS spec literally + and can only handle %rax as destination of the + initial executable code sequence. */ + + dest = gen_reg_rtx (Pmode); + emit_insn (gen_tls_initial_exec_64_sun (dest, x)); + return dest; + } + + pic = NULL; + type = UNSPEC_GOTNTPOFF; + } + else if (flag_pic) + { + if (reload_in_progress) + df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true); + pic = pic_offset_table_rtx; + type = TARGET_ANY_GNU_TLS ? UNSPEC_GOTNTPOFF : UNSPEC_GOTTPOFF; + } + else if (!TARGET_ANY_GNU_TLS) + { + pic = gen_reg_rtx (Pmode); + emit_insn (gen_set_got (pic)); + type = UNSPEC_GOTTPOFF; + } + else + { + pic = NULL; + type = UNSPEC_INDNTPOFF; + } + + off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), type); + off = gen_rtx_CONST (Pmode, off); + if (pic) + off = gen_rtx_PLUS (Pmode, pic, off); + off = gen_const_mem (Pmode, off); + set_mem_alias_set (off, ix86_GOT_alias_set ()); + + if (TARGET_64BIT || TARGET_ANY_GNU_TLS) + { + base = get_thread_pointer (for_mov || !TARGET_TLS_DIRECT_SEG_REFS); + off = force_reg (Pmode, off); + return gen_rtx_PLUS (Pmode, base, off); + } + else + { + base = get_thread_pointer (true); + dest = gen_reg_rtx (Pmode); + emit_insn (gen_subsi3 (dest, base, off)); + } + break; + + case TLS_MODEL_LOCAL_EXEC: + off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), + (TARGET_64BIT || TARGET_ANY_GNU_TLS) + ? 
UNSPEC_NTPOFF : UNSPEC_TPOFF); + off = gen_rtx_CONST (Pmode, off); + + if (TARGET_64BIT || TARGET_ANY_GNU_TLS) + { + base = get_thread_pointer (for_mov || !TARGET_TLS_DIRECT_SEG_REFS); + return gen_rtx_PLUS (Pmode, base, off); + } + else + { + base = get_thread_pointer (true); + dest = gen_reg_rtx (Pmode); + emit_insn (gen_subsi3 (dest, base, off)); + } + break; + + default: + gcc_unreachable (); + } + + return dest; +} + +/* Create or return the unique __imp_DECL dllimport symbol corresponding + to symbol DECL. */ + +static GTY((if_marked ("tree_map_marked_p"), param_is (struct tree_map))) + htab_t dllimport_map; + +static tree +get_dllimport_decl (tree decl) +{ + struct tree_map *h, in; + void **loc; + const char *name; + const char *prefix; + size_t namelen, prefixlen; + char *imp_name; + tree to; + rtx rtl; + + if (!dllimport_map) + dllimport_map = htab_create_ggc (512, tree_map_hash, tree_map_eq, 0); + + in.hash = htab_hash_pointer (decl); + in.base.from = decl; + loc = htab_find_slot_with_hash (dllimport_map, &in, in.hash, INSERT); + h = (struct tree_map *) *loc; + if (h) + return h->to; + + *loc = h = ggc_alloc_tree_map (); + h->hash = in.hash; + h->base.from = decl; + h->to = to = build_decl (DECL_SOURCE_LOCATION (decl), + VAR_DECL, NULL, ptr_type_node); + DECL_ARTIFICIAL (to) = 1; + DECL_IGNORED_P (to) = 1; + DECL_EXTERNAL (to) = 1; + TREE_READONLY (to) = 1; + + name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)); + name = targetm.strip_name_encoding (name); + prefix = name[0] == FASTCALL_PREFIX || user_label_prefix[0] == 0 + ? "*__imp_" : "*__imp__"; + namelen = strlen (name); + prefixlen = strlen (prefix); + imp_name = (char *) alloca (namelen + prefixlen + 1); + memcpy (imp_name, prefix, prefixlen); + memcpy (imp_name + prefixlen, name, namelen + 1); + + name = ggc_alloc_string (imp_name, namelen + prefixlen); + rtl = gen_rtx_SYMBOL_REF (Pmode, name); + SET_SYMBOL_REF_DECL (rtl, to); + SYMBOL_REF_FLAGS (rtl) = SYMBOL_FLAG_LOCAL; + + rtl = gen_const_mem (Pmode, rtl); + set_mem_alias_set (rtl, ix86_GOT_alias_set ()); + + SET_DECL_RTL (to, rtl); + SET_DECL_ASSEMBLER_NAME (to, get_identifier (name)); + + return to; +} + +/* Expand SYMBOL into its corresponding dllimport symbol. WANT_REG is + true if we require the result be a register. */ + +static rtx +legitimize_dllimport_symbol (rtx symbol, bool want_reg) +{ + tree imp_decl; + rtx x; + + gcc_assert (SYMBOL_REF_DECL (symbol)); + imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol)); + + x = DECL_RTL (imp_decl); + if (want_reg) + x = force_reg (Pmode, x); + return x; +} + +/* Try machine-dependent ways of modifying an illegitimate address + to be legitimate. If we find one, return the new, valid address. + This macro is used in only one place: `memory_address' in explow.c. + + OLDX is the address as it was before break_out_memory_refs was called. + In some cases it is useful to look at this to decide what needs to be done. + + It is always safe for this macro to do nothing. It exists to recognize + opportunities to optimize the output. + + For the 80386, we handle X+REG by loading X into a register R and + using R+REG. R will go in a general reg and indexing will be used. + However, if REG is a broken-out memory address or multiplication, + nothing needs to be done because REG can certainly go in a general reg. + + When -fpic is used, special handling is needed for symbolic references. + See comments by legitimize_pic_address in i386.c for details. 
*/ + +static rtx +ix86_legitimize_address (rtx x, rtx oldx ATTRIBUTE_UNUSED, + enum machine_mode mode) +{ + int changed = 0; + unsigned log; + + log = GET_CODE (x) == SYMBOL_REF ? SYMBOL_REF_TLS_MODEL (x) : 0; + if (log) + return legitimize_tls_address (x, (enum tls_model) log, false); + if (GET_CODE (x) == CONST + && GET_CODE (XEXP (x, 0)) == PLUS + && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF + && (log = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (x, 0), 0)))) + { + rtx t = legitimize_tls_address (XEXP (XEXP (x, 0), 0), + (enum tls_model) log, false); + return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1)); + } + + if (TARGET_DLLIMPORT_DECL_ATTRIBUTES) + { + if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (x)) + return legitimize_dllimport_symbol (x, true); + if (GET_CODE (x) == CONST + && GET_CODE (XEXP (x, 0)) == PLUS + && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF + && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (x, 0), 0))) + { + rtx t = legitimize_dllimport_symbol (XEXP (XEXP (x, 0), 0), true); + return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1)); + } + } + + if (flag_pic && SYMBOLIC_CONST (x)) + return legitimize_pic_address (x, 0); + +#if TARGET_MACHO + if (MACHO_DYNAMIC_NO_PIC_P && SYMBOLIC_CONST (x)) + return machopic_indirect_data_reference (x, 0); +#endif + + /* Canonicalize shifts by 0, 1, 2, 3 into multiply */ + if (GET_CODE (x) == ASHIFT + && CONST_INT_P (XEXP (x, 1)) + && (unsigned HOST_WIDE_INT) INTVAL (XEXP (x, 1)) < 4) + { + changed = 1; + log = INTVAL (XEXP (x, 1)); + x = gen_rtx_MULT (Pmode, force_reg (Pmode, XEXP (x, 0)), + GEN_INT (1 << log)); + } + + if (GET_CODE (x) == PLUS) + { + /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */ + + if (GET_CODE (XEXP (x, 0)) == ASHIFT + && CONST_INT_P (XEXP (XEXP (x, 0), 1)) + && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) < 4) + { + changed = 1; + log = INTVAL (XEXP (XEXP (x, 0), 1)); + XEXP (x, 0) = gen_rtx_MULT (Pmode, + force_reg (Pmode, XEXP (XEXP (x, 0), 0)), + GEN_INT (1 << log)); + } + + if (GET_CODE (XEXP (x, 1)) == ASHIFT + && CONST_INT_P (XEXP (XEXP (x, 1), 1)) + && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 1), 1)) < 4) + { + changed = 1; + log = INTVAL (XEXP (XEXP (x, 1), 1)); + XEXP (x, 1) = gen_rtx_MULT (Pmode, + force_reg (Pmode, XEXP (XEXP (x, 1), 0)), + GEN_INT (1 << log)); + } + + /* Put multiply first if it isn't already. */ + if (GET_CODE (XEXP (x, 1)) == MULT) + { + rtx tmp = XEXP (x, 0); + XEXP (x, 0) = XEXP (x, 1); + XEXP (x, 1) = tmp; + changed = 1; + } + + /* Canonicalize (plus (mult (reg) (const)) (plus (reg) (const))) + into (plus (plus (mult (reg) (const)) (reg)) (const)). This can be + created by virtual register instantiation, register elimination, and + similar optimizations. */ + if (GET_CODE (XEXP (x, 0)) == MULT && GET_CODE (XEXP (x, 1)) == PLUS) + { + changed = 1; + x = gen_rtx_PLUS (Pmode, + gen_rtx_PLUS (Pmode, XEXP (x, 0), + XEXP (XEXP (x, 1), 0)), + XEXP (XEXP (x, 1), 1)); + } + + /* Canonicalize + (plus (plus (mult (reg) (const)) (plus (reg) (const))) const) + into (plus (plus (mult (reg) (const)) (reg)) (const)). 
*/ + else if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 0)) == PLUS + && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT + && GET_CODE (XEXP (XEXP (x, 0), 1)) == PLUS + && CONSTANT_P (XEXP (x, 1))) + { + rtx constant; + rtx other = NULL_RTX; + + if (CONST_INT_P (XEXP (x, 1))) + { + constant = XEXP (x, 1); + other = XEXP (XEXP (XEXP (x, 0), 1), 1); + } + else if (CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 1), 1))) + { + constant = XEXP (XEXP (XEXP (x, 0), 1), 1); + other = XEXP (x, 1); + } + else + constant = 0; + + if (constant) + { + changed = 1; + x = gen_rtx_PLUS (Pmode, + gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 0), + XEXP (XEXP (XEXP (x, 0), 1), 0)), + plus_constant (other, INTVAL (constant))); + } + } + + if (changed && ix86_legitimate_address_p (mode, x, false)) + return x; + + if (GET_CODE (XEXP (x, 0)) == MULT) + { + changed = 1; + XEXP (x, 0) = force_operand (XEXP (x, 0), 0); + } + + if (GET_CODE (XEXP (x, 1)) == MULT) + { + changed = 1; + XEXP (x, 1) = force_operand (XEXP (x, 1), 0); + } + + if (changed + && REG_P (XEXP (x, 1)) + && REG_P (XEXP (x, 0))) + return x; + + if (flag_pic && SYMBOLIC_CONST (XEXP (x, 1))) + { + changed = 1; + x = legitimize_pic_address (x, 0); + } + + if (changed && ix86_legitimate_address_p (mode, x, false)) + return x; + + if (REG_P (XEXP (x, 0))) + { + rtx temp = gen_reg_rtx (Pmode); + rtx val = force_operand (XEXP (x, 1), temp); + if (val != temp) + emit_move_insn (temp, val); + + XEXP (x, 1) = temp; + return x; + } + + else if (REG_P (XEXP (x, 1))) + { + rtx temp = gen_reg_rtx (Pmode); + rtx val = force_operand (XEXP (x, 0), temp); + if (val != temp) + emit_move_insn (temp, val); + + XEXP (x, 0) = temp; + return x; + } + } + + return x; +} + +/* Print an integer constant expression in assembler syntax. Addition + and subtraction are the only arithmetic that may appear in these + expressions. FILE is the stdio stream to write to, X is the rtx, and + CODE is the operand print code from the output string. */ + +static void +output_pic_addr_const (FILE *file, rtx x, int code) +{ + char buf[256]; + + switch (GET_CODE (x)) + { + case PC: + gcc_assert (flag_pic); + putc ('.', file); + break; + + case SYMBOL_REF: + if (TARGET_64BIT || ! TARGET_MACHO_BRANCH_ISLANDS) + output_addr_const (file, x); + else + { + const char *name = XSTR (x, 0); + + /* Mark the decl as referenced so that cgraph will + output the function. */ + if (SYMBOL_REF_DECL (x)) + mark_decl_referenced (SYMBOL_REF_DECL (x)); + +#if TARGET_MACHO + if (MACHOPIC_INDIRECT + && machopic_classify_symbol (x) == MACHOPIC_UNDEFINED_FUNCTION) + name = machopic_indirection_name (x, /*stub_p=*/true); +#endif + assemble_name (file, name); + } + if (!TARGET_MACHO && !(TARGET_64BIT && DEFAULT_ABI == MS_ABI) + && code == 'P' && ! SYMBOL_REF_LOCAL_P (x)) + fputs ("@PLT", file); + break; + + case LABEL_REF: + x = XEXP (x, 0); + /* FALLTHRU */ + case CODE_LABEL: + ASM_GENERATE_INTERNAL_LABEL (buf, "L", CODE_LABEL_NUMBER (x)); + assemble_name (asm_out_file, buf); + break; + + case CONST_INT: + fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x)); + break; + + case CONST: + /* This used to output parentheses around the expression, + but that does not work on the 386 (either ATT or BSD assembler). */ + output_pic_addr_const (file, XEXP (x, 0), code); + break; + + case CONST_DOUBLE: + if (GET_MODE (x) == VOIDmode) + { + /* We can use %d if the number is <32 bits and positive. 
*/ + if (CONST_DOUBLE_HIGH (x) || CONST_DOUBLE_LOW (x) < 0) + fprintf (file, "0x%lx%08lx", + (unsigned long) CONST_DOUBLE_HIGH (x), + (unsigned long) CONST_DOUBLE_LOW (x)); + else + fprintf (file, HOST_WIDE_INT_PRINT_DEC, CONST_DOUBLE_LOW (x)); + } + else + /* We can't handle floating point constants; + TARGET_PRINT_OPERAND must handle them. */ + output_operand_lossage ("floating constant misused"); + break; + + case PLUS: + /* Some assemblers need integer constants to appear first. */ + if (CONST_INT_P (XEXP (x, 0))) + { + output_pic_addr_const (file, XEXP (x, 0), code); + putc ('+', file); + output_pic_addr_const (file, XEXP (x, 1), code); + } + else + { + gcc_assert (CONST_INT_P (XEXP (x, 1))); + output_pic_addr_const (file, XEXP (x, 1), code); + putc ('+', file); + output_pic_addr_const (file, XEXP (x, 0), code); + } + break; + + case MINUS: + if (!TARGET_MACHO) + putc (ASSEMBLER_DIALECT == ASM_INTEL ? '(' : '[', file); + output_pic_addr_const (file, XEXP (x, 0), code); + putc ('-', file); + output_pic_addr_const (file, XEXP (x, 1), code); + if (!TARGET_MACHO) + putc (ASSEMBLER_DIALECT == ASM_INTEL ? ')' : ']', file); + break; + + case UNSPEC: + if (XINT (x, 1) == UNSPEC_STACK_CHECK) + { + bool f = i386_asm_output_addr_const_extra (file, x); + gcc_assert (f); + break; + } + + gcc_assert (XVECLEN (x, 0) == 1); + output_pic_addr_const (file, XVECEXP (x, 0, 0), code); + switch (XINT (x, 1)) + { + case UNSPEC_GOT: + fputs ("@GOT", file); + break; + case UNSPEC_GOTOFF: + fputs ("@GOTOFF", file); + break; + case UNSPEC_PLTOFF: + fputs ("@PLTOFF", file); + break; + case UNSPEC_PCREL: + fputs (ASSEMBLER_DIALECT == ASM_ATT ? + "(%rip)" : "[rip]", file); + break; + case UNSPEC_GOTPCREL: + fputs (ASSEMBLER_DIALECT == ASM_ATT ? + "@GOTPCREL(%rip)" : "@GOTPCREL[rip]", file); + break; + case UNSPEC_GOTTPOFF: + /* FIXME: This might be @TPOFF in Sun ld too. */ + fputs ("@gottpoff", file); + break; + case UNSPEC_TPOFF: + fputs ("@tpoff", file); + break; + case UNSPEC_NTPOFF: + if (TARGET_64BIT) + fputs ("@tpoff", file); + else + fputs ("@ntpoff", file); + break; + case UNSPEC_DTPOFF: + fputs ("@dtpoff", file); + break; + case UNSPEC_GOTNTPOFF: + if (TARGET_64BIT) + fputs (ASSEMBLER_DIALECT == ASM_ATT ? + "@gottpoff(%rip)": "@gottpoff[rip]", file); + else + fputs ("@gotntpoff", file); + break; + case UNSPEC_INDNTPOFF: + fputs ("@indntpoff", file); + break; +#if TARGET_MACHO + case UNSPEC_MACHOPIC_OFFSET: + putc ('-', file); + machopic_output_function_base_name (file); + break; +#endif + default: + output_operand_lossage ("invalid UNSPEC as operand"); + break; + } + break; + + default: + output_operand_lossage ("invalid expression as operand"); + } +} + +/* This is called from dwarf2out.c via TARGET_ASM_OUTPUT_DWARF_DTPREL. + We need to emit DTP-relative relocations. */ + +static void ATTRIBUTE_UNUSED +i386_output_dwarf_dtprel (FILE *file, int size, rtx x) +{ + fputs (ASM_LONG, file); + output_addr_const (file, x); + fputs ("@dtpoff", file); + switch (size) + { + case 4: + break; + case 8: + fputs (", 0", file); + break; + default: + gcc_unreachable (); + } +} + +/* Return true if X is a representation of the PIC register. This copes + with calls from ix86_find_base_term, where the register might have + been replaced by a cselib value. 
*/ + +static bool +ix86_pic_register_p (rtx x) +{ + if (GET_CODE (x) == VALUE && CSELIB_VAL_PTR (x)) + return (pic_offset_table_rtx + && rtx_equal_for_cselib_p (x, pic_offset_table_rtx)); + else + return REG_P (x) && REGNO (x) == PIC_OFFSET_TABLE_REGNUM; +} + +/* Helper function for ix86_delegitimize_address. + Attempt to delegitimize TLS local-exec accesses. */ + +static rtx +ix86_delegitimize_tls_address (rtx orig_x) +{ + rtx x = orig_x, unspec; + struct ix86_address addr; + + if (!TARGET_TLS_DIRECT_SEG_REFS) + return orig_x; + if (MEM_P (x)) + x = XEXP (x, 0); + if (GET_CODE (x) != PLUS || GET_MODE (x) != Pmode) + return orig_x; + if (ix86_decompose_address (x, &addr) == 0 + || addr.seg != (TARGET_64BIT ? SEG_FS : SEG_GS) + || addr.disp == NULL_RTX + || GET_CODE (addr.disp) != CONST) + return orig_x; + unspec = XEXP (addr.disp, 0); + if (GET_CODE (unspec) == PLUS && CONST_INT_P (XEXP (unspec, 1))) + unspec = XEXP (unspec, 0); + if (GET_CODE (unspec) != UNSPEC || XINT (unspec, 1) != UNSPEC_NTPOFF) + return orig_x; + x = XVECEXP (unspec, 0, 0); + gcc_assert (GET_CODE (x) == SYMBOL_REF); + if (unspec != XEXP (addr.disp, 0)) + x = gen_rtx_PLUS (Pmode, x, XEXP (XEXP (addr.disp, 0), 1)); + if (addr.index) + { + rtx idx = addr.index; + if (addr.scale != 1) + idx = gen_rtx_MULT (Pmode, idx, GEN_INT (addr.scale)); + x = gen_rtx_PLUS (Pmode, idx, x); + } + if (addr.base) + x = gen_rtx_PLUS (Pmode, addr.base, x); + if (MEM_P (orig_x)) + x = replace_equiv_address_nv (orig_x, x); + return x; +} + +/* In the name of slightly smaller debug output, and to cater to + general assembler lossage, recognize PIC+GOTOFF and turn it back + into a direct symbol reference. + + On Darwin, this is necessary to avoid a crash, because Darwin + has a different PIC label for each routine but the DWARF debugging + information is not associated with any particular routine, so it's + necessary to remove references to the PIC label from RTL stored by + the DWARF output code. */ + +static rtx +ix86_delegitimize_address (rtx x) +{ + rtx orig_x = delegitimize_mem_from_attrs (x); + /* addend is NULL or some rtx if x is something+GOTOFF where + something doesn't include the PIC register. */ + rtx addend = NULL_RTX; + /* reg_addend is NULL or a multiple of some register. */ + rtx reg_addend = NULL_RTX; + /* const_addend is NULL or a const_int. */ + rtx const_addend = NULL_RTX; + /* This is the result, or NULL. 
*/ + rtx result = NULL_RTX; + + x = orig_x; + + if (MEM_P (x)) + x = XEXP (x, 0); + + if (TARGET_64BIT) + { + if (GET_CODE (x) != CONST + || GET_CODE (XEXP (x, 0)) != UNSPEC + || (XINT (XEXP (x, 0), 1) != UNSPEC_GOTPCREL + && XINT (XEXP (x, 0), 1) != UNSPEC_PCREL) + || !MEM_P (orig_x)) + return ix86_delegitimize_tls_address (orig_x); + x = XVECEXP (XEXP (x, 0), 0, 0); + if (GET_MODE (orig_x) != Pmode) + { + x = simplify_gen_subreg (GET_MODE (orig_x), x, Pmode, 0); + if (x == NULL_RTX) + return orig_x; + } + return x; + } + + if (GET_CODE (x) != PLUS + || GET_CODE (XEXP (x, 1)) != CONST) + return ix86_delegitimize_tls_address (orig_x); + + if (ix86_pic_register_p (XEXP (x, 0))) + /* %ebx + GOT/GOTOFF */ + ; + else if (GET_CODE (XEXP (x, 0)) == PLUS) + { + /* %ebx + %reg * scale + GOT/GOTOFF */ + reg_addend = XEXP (x, 0); + if (ix86_pic_register_p (XEXP (reg_addend, 0))) + reg_addend = XEXP (reg_addend, 1); + else if (ix86_pic_register_p (XEXP (reg_addend, 1))) + reg_addend = XEXP (reg_addend, 0); + else + { + reg_addend = NULL_RTX; + addend = XEXP (x, 0); + } + } + else + addend = XEXP (x, 0); + + x = XEXP (XEXP (x, 1), 0); + if (GET_CODE (x) == PLUS + && CONST_INT_P (XEXP (x, 1))) + { + const_addend = XEXP (x, 1); + x = XEXP (x, 0); + } + + if (GET_CODE (x) == UNSPEC + && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x) && !addend) + || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x)))) + result = XVECEXP (x, 0, 0); + + if (TARGET_MACHO && darwin_local_data_pic (x) + && !MEM_P (orig_x)) + result = XVECEXP (x, 0, 0); + + if (! result) + return ix86_delegitimize_tls_address (orig_x); + + if (const_addend) + result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend)); + if (reg_addend) + result = gen_rtx_PLUS (Pmode, reg_addend, result); + if (addend) + { + /* If the rest of original X doesn't involve the PIC register, add + addend and subtract pic_offset_table_rtx. This can happen e.g. + for code like: + leal (%ebx, %ecx, 4), %ecx + ... + movl foo@GOTOFF(%ecx), %edx + in which case we return (%ecx - %ebx) + foo. */ + if (pic_offset_table_rtx) + result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend), + pic_offset_table_rtx), + result); + else + return orig_x; + } + if (GET_MODE (orig_x) != Pmode && MEM_P (orig_x)) + { + result = simplify_gen_subreg (GET_MODE (orig_x), result, Pmode, 0); + if (result == NULL_RTX) + return orig_x; + } + return result; +} + +/* If X is a machine specific address (i.e. a symbol or label being + referenced as a displacement from the GOT implemented using an + UNSPEC), then return the base term. Otherwise return X. 
*/ + +rtx +ix86_find_base_term (rtx x) +{ + rtx term; + + if (TARGET_64BIT) + { + if (GET_CODE (x) != CONST) + return x; + term = XEXP (x, 0); + if (GET_CODE (term) == PLUS + && (CONST_INT_P (XEXP (term, 1)) + || GET_CODE (XEXP (term, 1)) == CONST_DOUBLE)) + term = XEXP (term, 0); + if (GET_CODE (term) != UNSPEC + || (XINT (term, 1) != UNSPEC_GOTPCREL + && XINT (term, 1) != UNSPEC_PCREL)) + return x; + + return XVECEXP (term, 0, 0); + } + + return ix86_delegitimize_address (x); +} + +static void +put_condition_code (enum rtx_code code, enum machine_mode mode, int reverse, + int fp, FILE *file) +{ + const char *suffix; + + if (mode == CCFPmode || mode == CCFPUmode) + { + code = ix86_fp_compare_code_to_integer (code); + mode = CCmode; + } + if (reverse) + code = reverse_condition (code); + + switch (code) + { + case EQ: + switch (mode) + { + case CCAmode: + suffix = "a"; + break; + + case CCCmode: + suffix = "c"; + break; + + case CCOmode: + suffix = "o"; + break; + + case CCSmode: + suffix = "s"; + break; + + default: + suffix = "e"; + } + break; + case NE: + switch (mode) + { + case CCAmode: + suffix = "na"; + break; + + case CCCmode: + suffix = "nc"; + break; + + case CCOmode: + suffix = "no"; + break; + + case CCSmode: + suffix = "ns"; + break; + + default: + suffix = "ne"; + } + break; + case GT: + gcc_assert (mode == CCmode || mode == CCNOmode || mode == CCGCmode); + suffix = "g"; + break; + case GTU: + /* ??? Use "nbe" instead of "a" for fcmov lossage on some assemblers. + Those same assemblers have the same but opposite lossage on cmov. */ + if (mode == CCmode) + suffix = fp ? "nbe" : "a"; + else if (mode == CCCmode) + suffix = "b"; + else + gcc_unreachable (); + break; + case LT: + switch (mode) + { + case CCNOmode: + case CCGOCmode: + suffix = "s"; + break; + + case CCmode: + case CCGCmode: + suffix = "l"; + break; + + default: + gcc_unreachable (); + } + break; + case LTU: + gcc_assert (mode == CCmode || mode == CCCmode); + suffix = "b"; + break; + case GE: + switch (mode) + { + case CCNOmode: + case CCGOCmode: + suffix = "ns"; + break; + + case CCmode: + case CCGCmode: + suffix = "ge"; + break; + + default: + gcc_unreachable (); + } + break; + case GEU: + /* ??? As above. */ + gcc_assert (mode == CCmode || mode == CCCmode); + suffix = fp ? "nb" : "ae"; + break; + case LE: + gcc_assert (mode == CCmode || mode == CCGCmode || mode == CCNOmode); + suffix = "le"; + break; + case LEU: + /* ??? As above. */ + if (mode == CCmode) + suffix = "be"; + else if (mode == CCCmode) + suffix = fp ? "nb" : "ae"; + else + gcc_unreachable (); + break; + case UNORDERED: + suffix = fp ? "u" : "p"; + break; + case ORDERED: + suffix = fp ? "nu" : "np"; + break; + default: + gcc_unreachable (); + } + fputs (suffix, file); +} + +/* Print the name of register X to FILE based on its machine mode and number. + If CODE is 'w', pretend the mode is HImode. + If CODE is 'b', pretend the mode is QImode. + If CODE is 'k', pretend the mode is SImode. + If CODE is 'q', pretend the mode is DImode. + If CODE is 'x', pretend the mode is V4SFmode. + If CODE is 't', pretend the mode is V8SFmode. + If CODE is 'h', pretend the reg is the 'high' byte register. + If CODE is 'y', print "st(0)" instead of "st", if the reg is stack op. + If CODE is 'd', duplicate the operand for AVX instruction. 
+ */ + +void +print_reg (rtx x, int code, FILE *file) +{ + const char *reg; + bool duplicated = code == 'd' && TARGET_AVX; + + gcc_assert (x == pc_rtx + || (REGNO (x) != ARG_POINTER_REGNUM + && REGNO (x) != FRAME_POINTER_REGNUM + && REGNO (x) != FLAGS_REG + && REGNO (x) != FPSR_REG + && REGNO (x) != FPCR_REG)); + + if (ASSEMBLER_DIALECT == ASM_ATT) + putc ('%', file); + + if (x == pc_rtx) + { + gcc_assert (TARGET_64BIT); + fputs ("rip", file); + return; + } + + if (code == 'w' || MMX_REG_P (x)) + code = 2; + else if (code == 'b') + code = 1; + else if (code == 'k') + code = 4; + else if (code == 'q') + code = 8; + else if (code == 'y') + code = 3; + else if (code == 'h') + code = 0; + else if (code == 'x') + code = 16; + else if (code == 't') + code = 32; + else + code = GET_MODE_SIZE (GET_MODE (x)); + + /* Irritatingly, AMD extended registers use different naming convention + from the normal registers. */ + if (REX_INT_REG_P (x)) + { + gcc_assert (TARGET_64BIT); + switch (code) + { + case 0: + error ("extended registers have no high halves"); + break; + case 1: + fprintf (file, "r%ib", REGNO (x) - FIRST_REX_INT_REG + 8); + break; + case 2: + fprintf (file, "r%iw", REGNO (x) - FIRST_REX_INT_REG + 8); + break; + case 4: + fprintf (file, "r%id", REGNO (x) - FIRST_REX_INT_REG + 8); + break; + case 8: + fprintf (file, "r%i", REGNO (x) - FIRST_REX_INT_REG + 8); + break; + default: + error ("unsupported operand size for extended register"); + break; + } + return; + } + + reg = NULL; + switch (code) + { + case 3: + if (STACK_TOP_P (x)) + { + reg = "st(0)"; + break; + } + /* FALLTHRU */ + case 8: + case 4: + case 12: + if (! ANY_FP_REG_P (x)) + putc (code == 8 && TARGET_64BIT ? 'r' : 'e', file); + /* FALLTHRU */ + case 16: + case 2: + normal: + reg = hi_reg_name[REGNO (x)]; + break; + case 1: + if (REGNO (x) >= ARRAY_SIZE (qi_reg_name)) + goto normal; + reg = qi_reg_name[REGNO (x)]; + break; + case 0: + if (REGNO (x) >= ARRAY_SIZE (qi_high_reg_name)) + goto normal; + reg = qi_high_reg_name[REGNO (x)]; + break; + case 32: + if (SSE_REG_P (x)) + { + gcc_assert (!duplicated); + putc ('y', file); + fputs (hi_reg_name[REGNO (x)] + 1, file); + return; + } + break; + default: + gcc_unreachable (); + } + + fputs (reg, file); + if (duplicated) + { + if (ASSEMBLER_DIALECT == ASM_ATT) + fprintf (file, ", %%%s", reg); + else + fprintf (file, ", %s", reg); + } +} + +/* Locate some local-dynamic symbol still in use by this function + so that we can print its name in some tls_local_dynamic_base + pattern. */ + +static int +get_some_local_dynamic_name_1 (rtx *px, void *data ATTRIBUTE_UNUSED) +{ + rtx x = *px; + + if (GET_CODE (x) == SYMBOL_REF + && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC) + { + cfun->machine->some_ld_name = XSTR (x, 0); + return 1; + } + + return 0; +} + +static const char * +get_some_local_dynamic_name (void) +{ + rtx insn; + + if (cfun->machine->some_ld_name) + return cfun->machine->some_ld_name; + + for (insn = get_insns (); insn ; insn = NEXT_INSN (insn)) + if (NONDEBUG_INSN_P (insn) + && for_each_rtx (&PATTERN (insn), get_some_local_dynamic_name_1, 0)) + return cfun->machine->some_ld_name; + + return NULL; +} + +/* Meaning of CODE: + L,W,B,Q,S,T -- print the opcode suffix for specified size of operand. + C -- print opcode suffix for set/cmov insn. + c -- like C, but print reversed condition + F,f -- likewise, but for floating-point. + O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.", + otherwise nothing + R -- print the prefix for register names. 
+ z -- print the opcode suffix for the size of the current operand. + Z -- likewise, with special suffixes for x87 instructions. + * -- print a star (in certain assembler syntax) + A -- print an absolute memory reference. + w -- print the operand as if it's a "word" (HImode) even if it isn't. + s -- print a shift double count, followed by the assemblers argument + delimiter. + b -- print the QImode name of the register for the indicated operand. + %b0 would print %al if operands[0] is reg 0. + w -- likewise, print the HImode name of the register. + k -- likewise, print the SImode name of the register. + q -- likewise, print the DImode name of the register. + x -- likewise, print the V4SFmode name of the register. + t -- likewise, print the V8SFmode name of the register. + h -- print the QImode name for a "high" register, either ah, bh, ch or dh. + y -- print "st(0)" instead of "st" as a register. + d -- print duplicated register operand for AVX instruction. + D -- print condition for SSE cmp instruction. + P -- if PIC, print an @PLT suffix. + X -- don't print any sort of PIC '@' suffix for a symbol. + & -- print some in-use local-dynamic symbol name. + H -- print a memory address offset by 8; used for sse high-parts + Y -- print condition for XOP pcom* instruction. + + -- print a branch hint as 'cs' or 'ds' prefix + ; -- print a semicolon (after prefixes due to bug in older gas). + @ -- print a segment register of thread base pointer load + */ + +void +ix86_print_operand (FILE *file, rtx x, int code) +{ + if (code) + { + switch (code) + { + case '*': + if (ASSEMBLER_DIALECT == ASM_ATT) + putc ('*', file); + return; + + case '&': + { + const char *name = get_some_local_dynamic_name (); + if (name == NULL) + output_operand_lossage ("'%%&' used without any " + "local dynamic TLS references"); + else + assemble_name (file, name); + return; + } + + case 'A': + switch (ASSEMBLER_DIALECT) + { + case ASM_ATT: + putc ('*', file); + break; + + case ASM_INTEL: + /* Intel syntax. For absolute addresses, registers should not + be surrounded by braces. */ + if (!REG_P (x)) + { + putc ('[', file); + ix86_print_operand (file, x, 0); + putc (']', file); + return; + } + break; + + default: + gcc_unreachable (); + } + + ix86_print_operand (file, x, 0); + return; + + + case 'L': + if (ASSEMBLER_DIALECT == ASM_ATT) + putc ('l', file); + return; + + case 'W': + if (ASSEMBLER_DIALECT == ASM_ATT) + putc ('w', file); + return; + + case 'B': + if (ASSEMBLER_DIALECT == ASM_ATT) + putc ('b', file); + return; + + case 'Q': + if (ASSEMBLER_DIALECT == ASM_ATT) + putc ('l', file); + return; + + case 'S': + if (ASSEMBLER_DIALECT == ASM_ATT) + putc ('s', file); + return; + + case 'T': + if (ASSEMBLER_DIALECT == ASM_ATT) + putc ('t', file); + return; + + case 'z': + if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT) + { + /* Opcodes don't get size suffixes if using Intel opcodes. */ + if (ASSEMBLER_DIALECT == ASM_INTEL) + return; + + switch (GET_MODE_SIZE (GET_MODE (x))) + { + case 1: + putc ('b', file); + return; + + case 2: + putc ('w', file); + return; + + case 4: + putc ('l', file); + return; + + case 8: + putc ('q', file); + return; + + default: + output_operand_lossage + ("invalid operand size for operand code '%c'", code); + return; + } + } + + if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT) + warning + (0, "non-integer operand used with operand code '%c'", code); + /* FALLTHRU */ + + case 'Z': + /* 387 opcodes don't get size suffixes if using Intel opcodes. 
*/ + if (ASSEMBLER_DIALECT == ASM_INTEL) + return; + + if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT) + { + switch (GET_MODE_SIZE (GET_MODE (x))) + { + case 2: +#ifdef HAVE_AS_IX86_FILDS + putc ('s', file); +#endif + return; + + case 4: + putc ('l', file); + return; + + case 8: +#ifdef HAVE_AS_IX86_FILDQ + putc ('q', file); +#else + fputs ("ll", file); +#endif + return; + + default: + break; + } + } + else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT) + { + /* 387 opcodes don't get size suffixes + if the operands are registers. */ + if (STACK_REG_P (x)) + return; + + switch (GET_MODE_SIZE (GET_MODE (x))) + { + case 4: + putc ('s', file); + return; + + case 8: + putc ('l', file); + return; + + case 12: + case 16: + putc ('t', file); + return; + + default: + break; + } + } + else + { + output_operand_lossage + ("invalid operand type used with operand code '%c'", code); + return; + } + + output_operand_lossage + ("invalid operand size for operand code '%c'", code); + return; + + case 'd': + case 'b': + case 'w': + case 'k': + case 'q': + case 'h': + case 't': + case 'y': + case 'x': + case 'X': + case 'P': + break; + + case 's': + if (CONST_INT_P (x) || ! SHIFT_DOUBLE_OMITS_COUNT) + { + ix86_print_operand (file, x, 0); + fputs (", ", file); + } + return; + + case 'D': + /* Little bit of braindamage here. The SSE compare instructions + does use completely different names for the comparisons that the + fp conditional moves. */ + if (TARGET_AVX) + { + switch (GET_CODE (x)) + { + case EQ: + fputs ("eq", file); + break; + case UNEQ: + fputs ("eq_us", file); + break; + case LT: + fputs ("lt", file); + break; + case UNLT: + fputs ("nge", file); + break; + case LE: + fputs ("le", file); + break; + case UNLE: + fputs ("ngt", file); + break; + case UNORDERED: + fputs ("unord", file); + break; + case NE: + fputs ("neq", file); + break; + case LTGT: + fputs ("neq_oq", file); + break; + case GE: + fputs ("ge", file); + break; + case UNGE: + fputs ("nlt", file); + break; + case GT: + fputs ("gt", file); + break; + case UNGT: + fputs ("nle", file); + break; + case ORDERED: + fputs ("ord", file); + break; + default: + output_operand_lossage ("operand is not a condition code, " + "invalid operand code 'D'"); + return; + } + } + else + { + switch (GET_CODE (x)) + { + case EQ: + case UNEQ: + fputs ("eq", file); + break; + case LT: + case UNLT: + fputs ("lt", file); + break; + case LE: + case UNLE: + fputs ("le", file); + break; + case UNORDERED: + fputs ("unord", file); + break; + case NE: + case LTGT: + fputs ("neq", file); + break; + case UNGE: + case GE: + fputs ("nlt", file); + break; + case UNGT: + case GT: + fputs ("nle", file); + break; + case ORDERED: + fputs ("ord", file); + break; + default: + output_operand_lossage ("operand is not a condition code, " + "invalid operand code 'D'"); + return; + } + } + return; + case 'O': +#ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX + if (ASSEMBLER_DIALECT == ASM_ATT) + { + switch (GET_MODE (x)) + { + case HImode: putc ('w', file); break; + case SImode: + case SFmode: putc ('l', file); break; + case DImode: + case DFmode: putc ('q', file); break; + default: gcc_unreachable (); + } + putc ('.', file); + } +#endif + return; + case 'C': + if (!COMPARISON_P (x)) + { + output_operand_lossage ("operand is neither a constant nor a " + "condition code, invalid operand code " + "'C'"); + return; + } + put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), 0, 0, file); + return; + case 'F': + if (!COMPARISON_P (x)) + { + output_operand_lossage ("operand is neither a constant nor 
a " + "condition code, invalid operand code " + "'F'"); + return; + } +#ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX + if (ASSEMBLER_DIALECT == ASM_ATT) + putc ('.', file); +#endif + put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), 0, 1, file); + return; + + /* Like above, but reverse condition */ + case 'c': + /* Check to see if argument to %c is really a constant + and not a condition code which needs to be reversed. */ + if (!COMPARISON_P (x)) + { + output_operand_lossage ("operand is neither a constant nor a " + "condition code, invalid operand " + "code 'c'"); + return; + } + put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), 1, 0, file); + return; + case 'f': + if (!COMPARISON_P (x)) + { + output_operand_lossage ("operand is neither a constant nor a " + "condition code, invalid operand " + "code 'f'"); + return; + } +#ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX + if (ASSEMBLER_DIALECT == ASM_ATT) + putc ('.', file); +#endif + put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), 1, 1, file); + return; + + case 'H': + if (!offsettable_memref_p (x)) + { + output_operand_lossage ("operand is not an offsettable memory " + "reference, invalid operand " + "code 'H'"); + return; + } + /* It doesn't actually matter what mode we use here, as we're + only going to use this for printing. */ + x = adjust_address_nv (x, DImode, 8); + break; + + case '+': + { + rtx x; + + if (!optimize + || optimize_function_for_size_p (cfun) || !TARGET_BRANCH_PREDICTION_HINTS) + return; + + x = find_reg_note (current_output_insn, REG_BR_PROB, 0); + if (x) + { + int pred_val = INTVAL (XEXP (x, 0)); + + if (pred_val < REG_BR_PROB_BASE * 45 / 100 + || pred_val > REG_BR_PROB_BASE * 55 / 100) + { + int taken = pred_val > REG_BR_PROB_BASE / 2; + int cputaken = final_forward_branch_p (current_output_insn) == 0; + + /* Emit hints only in the case default branch prediction + heuristics would fail. */ + if (taken != cputaken) + { + /* We use 3e (DS) prefix for taken branches and + 2e (CS) prefix for not taken branches. */ + if (taken) + fputs ("ds ; ", file); + else + fputs ("cs ; ", file); + } + } + } + return; + } + + case 'Y': + switch (GET_CODE (x)) + { + case NE: + fputs ("neq", file); + break; + case EQ: + fputs ("eq", file); + break; + case GE: + case GEU: + fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "ge" : "unlt", file); + break; + case GT: + case GTU: + fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "gt" : "unle", file); + break; + case LE: + case LEU: + fputs ("le", file); + break; + case LT: + case LTU: + fputs ("lt", file); + break; + case UNORDERED: + fputs ("unord", file); + break; + case ORDERED: + fputs ("ord", file); + break; + case UNEQ: + fputs ("ueq", file); + break; + case UNGE: + fputs ("nlt", file); + break; + case UNGT: + fputs ("nle", file); + break; + case UNLE: + fputs ("ule", file); + break; + case UNLT: + fputs ("ult", file); + break; + case LTGT: + fputs ("une", file); + break; + default: + output_operand_lossage ("operand is not a condition code, " + "invalid operand code 'Y'"); + return; + } + return; + + case ';': +#ifndef HAVE_AS_IX86_REP_LOCK_PREFIX + putc (';', file); +#endif + return; + + case '@': + if (ASSEMBLER_DIALECT == ASM_ATT) + putc ('%', file); + + /* The kernel uses a different segment register for performance + reasons; a system call would not have to trash the userspace + segment register, which would be expensive. 
*/ + if (TARGET_64BIT && ix86_cmodel != CM_KERNEL) + fputs ("fs", file); + else + fputs ("gs", file); + return; + + default: + output_operand_lossage ("invalid operand code '%c'", code); + } + } + + if (REG_P (x)) + print_reg (x, code, file); + + else if (MEM_P (x)) + { + /* No `byte ptr' prefix for call instructions or BLKmode operands. */ + if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P' + && GET_MODE (x) != BLKmode) + { + const char * size; + switch (GET_MODE_SIZE (GET_MODE (x))) + { + case 1: size = "BYTE"; break; + case 2: size = "WORD"; break; + case 4: size = "DWORD"; break; + case 8: size = "QWORD"; break; + case 12: size = "TBYTE"; break; + case 16: + if (GET_MODE (x) == XFmode) + size = "TBYTE"; + else + size = "XMMWORD"; + break; + case 32: size = "YMMWORD"; break; + default: + gcc_unreachable (); + } + + /* Check for explicit size override (codes 'b', 'w' and 'k') */ + if (code == 'b') + size = "BYTE"; + else if (code == 'w') + size = "WORD"; + else if (code == 'k') + size = "DWORD"; + + fputs (size, file); + fputs (" PTR ", file); + } + + x = XEXP (x, 0); + /* Avoid (%rip) for call operands. */ + if (CONSTANT_ADDRESS_P (x) && code == 'P' + && !CONST_INT_P (x)) + output_addr_const (file, x); + else if (this_is_asm_operands && ! address_operand (x, VOIDmode)) + output_operand_lossage ("invalid constraints for operand"); + else + output_address (x); + } + + else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == SFmode) + { + REAL_VALUE_TYPE r; + long l; + + REAL_VALUE_FROM_CONST_DOUBLE (r, x); + REAL_VALUE_TO_TARGET_SINGLE (r, l); + + if (ASSEMBLER_DIALECT == ASM_ATT) + putc ('$', file); + /* Sign extend 32bit SFmode immediate to 8 bytes. */ + if (code == 'q') + fprintf (file, "0x%08llx", (unsigned long long) (int) l); + else + fprintf (file, "0x%08x", (unsigned int) l); + } + + else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == DFmode) + { + REAL_VALUE_TYPE r; + long l[2]; + + REAL_VALUE_FROM_CONST_DOUBLE (r, x); + REAL_VALUE_TO_TARGET_DOUBLE (r, l); + + if (ASSEMBLER_DIALECT == ASM_ATT) + putc ('$', file); + fprintf (file, "0x%lx%08lx", l[1] & 0xffffffff, l[0] & 0xffffffff); + } + + /* These float cases don't actually occur as immediate operands. */ + else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == XFmode) + { + char dstr[30]; + + real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1); + fputs (dstr, file); + } + + else + { + /* We have patterns that allow zero sets of memory, for instance. + In 64-bit mode, we should probably support all 8-byte vectors, + since we can in fact encode that into an immediate. */ + if (GET_CODE (x) == CONST_VECTOR) + { + gcc_assert (x == CONST0_RTX (GET_MODE (x))); + x = const0_rtx; + } + + if (code != 'P') + { + if (CONST_INT_P (x) || GET_CODE (x) == CONST_DOUBLE) + { + if (ASSEMBLER_DIALECT == ASM_ATT) + putc ('$', file); + } + else if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF + || GET_CODE (x) == LABEL_REF) + { + if (ASSEMBLER_DIALECT == ASM_ATT) + putc ('$', file); + else + fputs ("OFFSET FLAT:", file); + } + } + if (CONST_INT_P (x)) + fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x)); + else if (flag_pic || MACHOPIC_INDIRECT) + output_pic_addr_const (file, x, code); + else + output_addr_const (file, x); + } +} + +static bool +ix86_print_operand_punct_valid_p (unsigned char code) +{ + return (code == '@' || code == '*' || code == '+' + || code == '&' || code == ';'); +} + +/* Print a memory operand whose address is ADDR. 
*/ + +static void +ix86_print_operand_address (FILE *file, rtx addr) +{ + struct ix86_address parts; + rtx base, index, disp; + int scale; + int ok = ix86_decompose_address (addr, &parts); + + gcc_assert (ok); + + base = parts.base; + index = parts.index; + disp = parts.disp; + scale = parts.scale; + + switch (parts.seg) + { + case SEG_DEFAULT: + break; + case SEG_FS: + case SEG_GS: + if (ASSEMBLER_DIALECT == ASM_ATT) + putc ('%', file); + fputs ((parts.seg == SEG_FS ? "fs:" : "gs:"), file); + break; + default: + gcc_unreachable (); + } + + /* Use one byte shorter RIP relative addressing for 64bit mode. */ + if (TARGET_64BIT && !base && !index) + { + rtx symbol = disp; + + if (GET_CODE (disp) == CONST + && GET_CODE (XEXP (disp, 0)) == PLUS + && CONST_INT_P (XEXP (XEXP (disp, 0), 1))) + symbol = XEXP (XEXP (disp, 0), 0); + + if (GET_CODE (symbol) == LABEL_REF + || (GET_CODE (symbol) == SYMBOL_REF + && SYMBOL_REF_TLS_MODEL (symbol) == 0)) + base = pc_rtx; + } + if (!base && !index) + { + /* Displacement only requires special attention. */ + + if (CONST_INT_P (disp)) + { + if (ASSEMBLER_DIALECT == ASM_INTEL && parts.seg == SEG_DEFAULT) + fputs ("ds:", file); + fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (disp)); + } + else if (flag_pic) + output_pic_addr_const (file, disp, 0); + else + output_addr_const (file, disp); + } + else + { + if (ASSEMBLER_DIALECT == ASM_ATT) + { + if (disp) + { + if (flag_pic) + output_pic_addr_const (file, disp, 0); + else if (GET_CODE (disp) == LABEL_REF) + output_asm_label (disp); + else + output_addr_const (file, disp); + } + + putc ('(', file); + if (base) + print_reg (base, 0, file); + if (index) + { + putc (',', file); + print_reg (index, 0, file); + if (scale != 1) + fprintf (file, ",%d", scale); + } + putc (')', file); + } + else + { + rtx offset = NULL_RTX; + + if (disp) + { + /* Pull out the offset of a symbol; print any symbol itself. */ + if (GET_CODE (disp) == CONST + && GET_CODE (XEXP (disp, 0)) == PLUS + && CONST_INT_P (XEXP (XEXP (disp, 0), 1))) + { + offset = XEXP (XEXP (disp, 0), 1); + disp = gen_rtx_CONST (VOIDmode, + XEXP (XEXP (disp, 0), 0)); + } + + if (flag_pic) + output_pic_addr_const (file, disp, 0); + else if (GET_CODE (disp) == LABEL_REF) + output_asm_label (disp); + else if (CONST_INT_P (disp)) + offset = disp; + else + output_addr_const (file, disp); + } + + putc ('[', file); + if (base) + { + print_reg (base, 0, file); + if (offset) + { + if (INTVAL (offset) >= 0) + putc ('+', file); + fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset)); + } + } + else if (offset) + fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset)); + else + putc ('0', file); + + if (index) + { + putc ('+', file); + print_reg (index, 0, file); + if (scale != 1) + fprintf (file, "*%d", scale); + } + putc (']', file); + } + } +} + +/* Implementation of TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */ + +static bool +i386_asm_output_addr_const_extra (FILE *file, rtx x) +{ + rtx op; + + if (GET_CODE (x) != UNSPEC) + return false; + + op = XVECEXP (x, 0, 0); + switch (XINT (x, 1)) + { + case UNSPEC_GOTTPOFF: + output_addr_const (file, op); + /* FIXME: This might be @TPOFF in Sun ld. 
*/ + fputs ("@gottpoff", file); + break; + case UNSPEC_TPOFF: + output_addr_const (file, op); + fputs ("@tpoff", file); + break; + case UNSPEC_NTPOFF: + output_addr_const (file, op); + if (TARGET_64BIT) + fputs ("@tpoff", file); + else + fputs ("@ntpoff", file); + break; + case UNSPEC_DTPOFF: + output_addr_const (file, op); + fputs ("@dtpoff", file); + break; + case UNSPEC_GOTNTPOFF: + output_addr_const (file, op); + if (TARGET_64BIT) + fputs (ASSEMBLER_DIALECT == ASM_ATT ? + "@gottpoff(%rip)" : "@gottpoff[rip]", file); + else + fputs ("@gotntpoff", file); + break; + case UNSPEC_INDNTPOFF: + output_addr_const (file, op); + fputs ("@indntpoff", file); + break; +#if TARGET_MACHO + case UNSPEC_MACHOPIC_OFFSET: + output_addr_const (file, op); + putc ('-', file); + machopic_output_function_base_name (file); + break; +#endif + + case UNSPEC_STACK_CHECK: + { + int offset; + + gcc_assert (flag_split_stack); + +#ifdef TARGET_THREAD_SPLIT_STACK_OFFSET + offset = TARGET_THREAD_SPLIT_STACK_OFFSET; +#else + gcc_unreachable (); +#endif + + fprintf (file, "%s:%d", TARGET_64BIT ? "%fs" : "%gs", offset); + } + break; + + default: + return false; + } + + return true; +} + +/* Split one or more double-mode RTL references into pairs of half-mode + references. The RTL can be REG, offsettable MEM, integer constant, or + CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to + split and "num" is its length. lo_half and hi_half are output arrays + that parallel "operands". */ + +void +split_double_mode (enum machine_mode mode, rtx operands[], + int num, rtx lo_half[], rtx hi_half[]) +{ + enum machine_mode half_mode; + unsigned int byte; + + switch (mode) + { + case TImode: + half_mode = DImode; + break; + case DImode: + half_mode = SImode; + break; + default: + gcc_unreachable (); + } + + byte = GET_MODE_SIZE (half_mode); + + while (num--) + { + rtx op = operands[num]; + + /* simplify_subreg refuse to split volatile memory addresses, + but we still have to handle it. */ + if (MEM_P (op)) + { + lo_half[num] = adjust_address (op, half_mode, 0); + hi_half[num] = adjust_address (op, half_mode, byte); + } + else + { + lo_half[num] = simplify_gen_subreg (half_mode, op, + GET_MODE (op) == VOIDmode + ? mode : GET_MODE (op), 0); + hi_half[num] = simplify_gen_subreg (half_mode, op, + GET_MODE (op) == VOIDmode + ? mode : GET_MODE (op), byte); + } + } +} + +/* Output code to perform a 387 binary operation in INSN, one of PLUS, + MINUS, MULT or DIV. OPERANDS are the insn operands, where operands[3] + is the expression of the binary operation. The output may either be + emitted here, or returned to the caller, like all output_* functions. + + There is no guarantee that the operands are the same mode, as they + might be within FLOAT or FLOAT_EXTEND expressions. */ + +#ifndef SYSV386_COMPAT +/* Set to 1 for compatibility with brain-damaged assemblers. No-one + wants to fix the assemblers because that causes incompatibility + with gcc. No-one wants to fix gcc because that causes + incompatibility with assemblers... You can use the option of + -DSYSV386_COMPAT=0 if you recompile both gcc and gas this way. */ +#define SYSV386_COMPAT 1 +#endif + +const char * +output_387_binary_op (rtx insn, rtx *operands) +{ + static char buf[40]; + const char *p; + const char *ssep; + int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2]); + +#ifdef ENABLE_CHECKING + /* Even if we do not want to check the inputs, this documents input + constraints. 
Which helps in understanding the following code. */ + if (STACK_REG_P (operands[0]) + && ((REG_P (operands[1]) + && REGNO (operands[0]) == REGNO (operands[1]) + && (STACK_REG_P (operands[2]) || MEM_P (operands[2]))) + || (REG_P (operands[2]) + && REGNO (operands[0]) == REGNO (operands[2]) + && (STACK_REG_P (operands[1]) || MEM_P (operands[1])))) + && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2]))) + ; /* ok */ + else + gcc_assert (is_sse); +#endif + + switch (GET_CODE (operands[3])) + { + case PLUS: + if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT + || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT) + p = "fiadd"; + else + p = "fadd"; + ssep = "vadd"; + break; + + case MINUS: + if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT + || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT) + p = "fisub"; + else + p = "fsub"; + ssep = "vsub"; + break; + + case MULT: + if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT + || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT) + p = "fimul"; + else + p = "fmul"; + ssep = "vmul"; + break; + + case DIV: + if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT + || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT) + p = "fidiv"; + else + p = "fdiv"; + ssep = "vdiv"; + break; + + default: + gcc_unreachable (); + } + + if (is_sse) + { + if (TARGET_AVX) + { + strcpy (buf, ssep); + if (GET_MODE (operands[0]) == SFmode) + strcat (buf, "ss\t{%2, %1, %0|%0, %1, %2}"); + else + strcat (buf, "sd\t{%2, %1, %0|%0, %1, %2}"); + } + else + { + strcpy (buf, ssep + 1); + if (GET_MODE (operands[0]) == SFmode) + strcat (buf, "ss\t{%2, %0|%0, %2}"); + else + strcat (buf, "sd\t{%2, %0|%0, %2}"); + } + return buf; + } + strcpy (buf, p); + + switch (GET_CODE (operands[3])) + { + case MULT: + case PLUS: + if (REG_P (operands[2]) && REGNO (operands[0]) == REGNO (operands[2])) + { + rtx temp = operands[2]; + operands[2] = operands[1]; + operands[1] = temp; + } + + /* know operands[0] == operands[1]. */ + + if (MEM_P (operands[2])) + { + p = "%Z2\t%2"; + break; + } + + if (find_regno_note (insn, REG_DEAD, REGNO (operands[2]))) + { + if (STACK_TOP_P (operands[0])) + /* How is it that we are storing to a dead operand[2]? + Well, presumably operands[1] is dead too. We can't + store the result to st(0) as st(0) gets popped on this + instruction. Instead store to operands[2] (which I + think has to be st(1)). st(1) will be popped later. + gcc <= 2.8.1 didn't have this check and generated + assembly code that the Unixware assembler rejected. */ + p = "p\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */ + else + p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */ + break; + } + + if (STACK_TOP_P (operands[0])) + p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */ + else + p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */ + break; + + case MINUS: + case DIV: + if (MEM_P (operands[1])) + { + p = "r%Z1\t%1"; + break; + } + + if (MEM_P (operands[2])) + { + p = "%Z2\t%2"; + break; + } + + if (find_regno_note (insn, REG_DEAD, REGNO (operands[2]))) + { +#if SYSV386_COMPAT + /* The SystemV/386 SVR3.2 assembler, and probably all AT&T + derived assemblers, confusingly reverse the direction of + the operation for fsub{r} and fdiv{r} when the + destination register is not st(0). The Intel assembler + doesn't have this brain damage. Read !SYSV386_COMPAT to + figure out what the hardware really does. 
*/ + if (STACK_TOP_P (operands[0])) + p = "{p\t%0, %2|rp\t%2, %0}"; + else + p = "{rp\t%2, %0|p\t%0, %2}"; +#else + if (STACK_TOP_P (operands[0])) + /* As above for fmul/fadd, we can't store to st(0). */ + p = "rp\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */ + else + p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */ +#endif + break; + } + + if (find_regno_note (insn, REG_DEAD, REGNO (operands[1]))) + { +#if SYSV386_COMPAT + if (STACK_TOP_P (operands[0])) + p = "{rp\t%0, %1|p\t%1, %0}"; + else + p = "{p\t%1, %0|rp\t%0, %1}"; +#else + if (STACK_TOP_P (operands[0])) + p = "p\t{%0, %1|%1, %0}"; /* st(1) = st(1) op st(0); pop */ + else + p = "rp\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2); pop */ +#endif + break; + } + + if (STACK_TOP_P (operands[0])) + { + if (STACK_TOP_P (operands[1])) + p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */ + else + p = "r\t{%y1, %0|%0, %y1}"; /* st(0) = st(r1) op st(0) */ + break; + } + else if (STACK_TOP_P (operands[1])) + { +#if SYSV386_COMPAT + p = "{\t%1, %0|r\t%0, %1}"; +#else + p = "r\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2) */ +#endif + } + else + { +#if SYSV386_COMPAT + p = "{r\t%2, %0|\t%0, %2}"; +#else + p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */ +#endif + } + break; + + default: + gcc_unreachable (); + } + + strcat (buf, p); + return buf; +} + +/* Return needed mode for entity in optimize_mode_switching pass. */ + +int +ix86_mode_needed (int entity, rtx insn) +{ + enum attr_i387_cw mode; + + /* The mode UNINITIALIZED is used to store control word after a + function call or ASM pattern. The mode ANY specify that function + has no requirements on the control word and make no changes in the + bits we are interested in. */ + + if (CALL_P (insn) + || (NONJUMP_INSN_P (insn) + && (asm_noperands (PATTERN (insn)) >= 0 + || GET_CODE (PATTERN (insn)) == ASM_INPUT))) + return I387_CW_UNINITIALIZED; + + if (recog_memoized (insn) < 0) + return I387_CW_ANY; + + mode = get_attr_i387_cw (insn); + + switch (entity) + { + case I387_TRUNC: + if (mode == I387_CW_TRUNC) + return mode; + break; + + case I387_FLOOR: + if (mode == I387_CW_FLOOR) + return mode; + break; + + case I387_CEIL: + if (mode == I387_CW_CEIL) + return mode; + break; + + case I387_MASK_PM: + if (mode == I387_CW_MASK_PM) + return mode; + break; + + default: + gcc_unreachable (); + } + + return I387_CW_ANY; +} + +/* Output code to initialize control word copies used by trunc?f?i and + rounding patterns. CURRENT_MODE is set to current control word, + while NEW_MODE is set to new control word. 
*/ + +void +emit_i387_cw_initialization (int mode) +{ + rtx stored_mode = assign_386_stack_local (HImode, SLOT_CW_STORED); + rtx new_mode; + + enum ix86_stack_slot slot; + + rtx reg = gen_reg_rtx (HImode); + + emit_insn (gen_x86_fnstcw_1 (stored_mode)); + emit_move_insn (reg, copy_rtx (stored_mode)); + + if (TARGET_64BIT || TARGET_PARTIAL_REG_STALL + || optimize_function_for_size_p (cfun)) + { + switch (mode) + { + case I387_CW_TRUNC: + /* round toward zero (truncate) */ + emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0c00))); + slot = SLOT_CW_TRUNC; + break; + + case I387_CW_FLOOR: + /* round down toward -oo */ + emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00))); + emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0400))); + slot = SLOT_CW_FLOOR; + break; + + case I387_CW_CEIL: + /* round up toward +oo */ + emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00))); + emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0800))); + slot = SLOT_CW_CEIL; + break; + + case I387_CW_MASK_PM: + /* mask precision exception for nearbyint() */ + emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020))); + slot = SLOT_CW_MASK_PM; + break; + + default: + gcc_unreachable (); + } + } + else + { + switch (mode) + { + case I387_CW_TRUNC: + /* round toward zero (truncate) */ + emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0xc))); + slot = SLOT_CW_TRUNC; + break; + + case I387_CW_FLOOR: + /* round down toward -oo */ + emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x4))); + slot = SLOT_CW_FLOOR; + break; + + case I387_CW_CEIL: + /* round up toward +oo */ + emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x8))); + slot = SLOT_CW_CEIL; + break; + + case I387_CW_MASK_PM: + /* mask precision exception for nearbyint() */ + emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020))); + slot = SLOT_CW_MASK_PM; + break; + + default: + gcc_unreachable (); + } + } + + gcc_assert (slot < MAX_386_STACK_LOCALS); + + new_mode = assign_386_stack_local (HImode, slot); + emit_move_insn (new_mode, reg); +} + +/* Output code for INSN to convert a float to a signed int. OPERANDS + are the insn operands. The output may be [HSD]Imode and the input + operand may be [SDX]Fmode. */ + +const char * +output_fix_trunc (rtx insn, rtx *operands, int fisttp) +{ + int stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0; + int dimode_p = GET_MODE (operands[0]) == DImode; + int round_mode = get_attr_i387_cw (insn); + + /* Jump through a hoop or two for DImode, since the hardware has no + non-popping instruction. We used to do this a different way, but + that was somewhat fragile and broke with post-reload splitters. */ + if ((dimode_p || fisttp) && !stack_top_dies) + output_asm_insn ("fld\t%y1", operands); + + gcc_assert (STACK_TOP_P (operands[1])); + gcc_assert (MEM_P (operands[0])); + gcc_assert (GET_MODE (operands[1]) != TFmode); + + if (fisttp) + output_asm_insn ("fisttp%Z0\t%0", operands); + else + { + if (round_mode != I387_CW_ANY) + output_asm_insn ("fldcw\t%3", operands); + if (stack_top_dies || dimode_p) + output_asm_insn ("fistp%Z0\t%0", operands); + else + output_asm_insn ("fist%Z0\t%0", operands); + if (round_mode != I387_CW_ANY) + output_asm_insn ("fldcw\t%2", operands); + } + + return ""; +} + +/* Output code for x87 ffreep insn. The OPNO argument, which may only + have the values zero or one, indicates the ffreep insn's operand + from the OPERANDS array. */ + +static const char * +output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno) +{ + if (TARGET_USE_FFREEP) +#ifdef HAVE_AS_IX86_FFREEP + return opno ? 
"ffreep\t%y1" : "ffreep\t%y0"; +#else + { + static char retval[32]; + int regno = REGNO (operands[opno]); + + gcc_assert (FP_REGNO_P (regno)); + + regno -= FIRST_STACK_REG; + + snprintf (retval, sizeof (retval), ASM_SHORT "0xc%ddf", regno); + return retval; + } +#endif + + return opno ? "fstp\t%y1" : "fstp\t%y0"; +} + + +/* Output code for INSN to compare OPERANDS. EFLAGS_P is 1 when fcomi + should be used. UNORDERED_P is true when fucom should be used. */ + +const char * +output_fp_compare (rtx insn, rtx *operands, int eflags_p, int unordered_p) +{ + int stack_top_dies; + rtx cmp_op0, cmp_op1; + int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]); + + if (eflags_p) + { + cmp_op0 = operands[0]; + cmp_op1 = operands[1]; + } + else + { + cmp_op0 = operands[1]; + cmp_op1 = operands[2]; + } + + if (is_sse) + { + static const char ucomiss[] = "vucomiss\t{%1, %0|%0, %1}"; + static const char ucomisd[] = "vucomisd\t{%1, %0|%0, %1}"; + static const char comiss[] = "vcomiss\t{%1, %0|%0, %1}"; + static const char comisd[] = "vcomisd\t{%1, %0|%0, %1}"; + + if (GET_MODE (operands[0]) == SFmode) + if (unordered_p) + return &ucomiss[TARGET_AVX ? 0 : 1]; + else + return &comiss[TARGET_AVX ? 0 : 1]; + else + if (unordered_p) + return &ucomisd[TARGET_AVX ? 0 : 1]; + else + return &comisd[TARGET_AVX ? 0 : 1]; + } + + gcc_assert (STACK_TOP_P (cmp_op0)); + + stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0; + + if (cmp_op1 == CONST0_RTX (GET_MODE (cmp_op1))) + { + if (stack_top_dies) + { + output_asm_insn ("ftst\n\tfnstsw\t%0", operands); + return output_387_ffreep (operands, 1); + } + else + return "ftst\n\tfnstsw\t%0"; + } + + if (STACK_REG_P (cmp_op1) + && stack_top_dies + && find_regno_note (insn, REG_DEAD, REGNO (cmp_op1)) + && REGNO (cmp_op1) != FIRST_STACK_REG) + { + /* If both the top of the 387 stack dies, and the other operand + is also a stack register that dies, then this must be a + `fcompp' float compare */ + + if (eflags_p) + { + /* There is no double popping fcomi variant. Fortunately, + eflags is immune from the fstp's cc clobbering. */ + if (unordered_p) + output_asm_insn ("fucomip\t{%y1, %0|%0, %y1}", operands); + else + output_asm_insn ("fcomip\t{%y1, %0|%0, %y1}", operands); + return output_387_ffreep (operands, 0); + } + else + { + if (unordered_p) + return "fucompp\n\tfnstsw\t%0"; + else + return "fcompp\n\tfnstsw\t%0"; + } + } + else + { + /* Encoded here as eflags_p | intmode | unordered_p | stack_top_dies. 
*/ + + static const char * const alt[16] = + { + "fcom%Z2\t%y2\n\tfnstsw\t%0", + "fcomp%Z2\t%y2\n\tfnstsw\t%0", + "fucom%Z2\t%y2\n\tfnstsw\t%0", + "fucomp%Z2\t%y2\n\tfnstsw\t%0", + + "ficom%Z2\t%y2\n\tfnstsw\t%0", + "ficomp%Z2\t%y2\n\tfnstsw\t%0", + NULL, + NULL, + + "fcomi\t{%y1, %0|%0, %y1}", + "fcomip\t{%y1, %0|%0, %y1}", + "fucomi\t{%y1, %0|%0, %y1}", + "fucomip\t{%y1, %0|%0, %y1}", + + NULL, + NULL, + NULL, + NULL + }; + + int mask; + const char *ret; + + mask = eflags_p << 3; + mask |= (GET_MODE_CLASS (GET_MODE (cmp_op1)) == MODE_INT) << 2; + mask |= unordered_p << 1; + mask |= stack_top_dies; + + gcc_assert (mask < 16); + ret = alt[mask]; + gcc_assert (ret); + + return ret; + } +} + +void +ix86_output_addr_vec_elt (FILE *file, int value) +{ + const char *directive = ASM_LONG; + +#ifdef ASM_QUAD + if (TARGET_64BIT) + directive = ASM_QUAD; +#else + gcc_assert (!TARGET_64BIT); +#endif + + fprintf (file, "%s%s%d\n", directive, LPREFIX, value); +} + +void +ix86_output_addr_diff_elt (FILE *file, int value, int rel) +{ + const char *directive = ASM_LONG; + +#ifdef ASM_QUAD + if (TARGET_64BIT && CASE_VECTOR_MODE == DImode) + directive = ASM_QUAD; +#else + gcc_assert (!TARGET_64BIT); +#endif + /* We can't use @GOTOFF for text labels on VxWorks; see gotoff_operand. */ + if (TARGET_64BIT || TARGET_VXWORKS_RTP) + fprintf (file, "%s%s%d-%s%d\n", + directive, LPREFIX, value, LPREFIX, rel); + else if (HAVE_AS_GOTOFF_IN_DATA) + fprintf (file, ASM_LONG "%s%d@GOTOFF\n", LPREFIX, value); +#if TARGET_MACHO + else if (TARGET_MACHO) + { + fprintf (file, ASM_LONG "%s%d-", LPREFIX, value); + machopic_output_function_base_name (file); + putc ('\n', file); + } +#endif + else + asm_fprintf (file, ASM_LONG "%U%s+[.-%s%d]\n", + GOT_SYMBOL_NAME, LPREFIX, value); +} + +/* Generate either "mov $0, reg" or "xor reg, reg", as appropriate + for the target. */ + +void +ix86_expand_clear (rtx dest) +{ + rtx tmp; + + /* We play register width games, which are only valid after reload. */ + gcc_assert (reload_completed); + + /* Avoid HImode and its attendant prefix byte. */ + if (GET_MODE_SIZE (GET_MODE (dest)) < 4) + dest = gen_rtx_REG (SImode, REGNO (dest)); + tmp = gen_rtx_SET (VOIDmode, dest, const0_rtx); + + /* This predicate should match that for movsi_xor and movdi_xor_rex64. */ + if (!TARGET_USE_MOV0 || optimize_insn_for_speed_p ()) + { + rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG)); + tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob)); + } + + emit_insn (tmp); +} + +/* X is an unchanging MEM. If it is a constant pool reference, return + the constant pool rtx, else NULL. 
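+   For example, a MEM that refers to the pool entry holding 1.0 comes back
+   as the CONST_DOUBLE rtx for 1.0, so callers can look through
+   constant-pool loads.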
*/ + +rtx +maybe_get_pool_constant (rtx x) +{ + x = ix86_delegitimize_address (XEXP (x, 0)); + + if (GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)) + return get_pool_constant (x); + + return NULL_RTX; +} + +void +ix86_expand_move (enum machine_mode mode, rtx operands[]) +{ + rtx op0, op1; + enum tls_model model; + + op0 = operands[0]; + op1 = operands[1]; + + if (GET_CODE (op1) == SYMBOL_REF) + { + model = SYMBOL_REF_TLS_MODEL (op1); + if (model) + { + op1 = legitimize_tls_address (op1, model, true); + op1 = force_operand (op1, op0); + if (op1 == op0) + return; + } + else if (TARGET_DLLIMPORT_DECL_ATTRIBUTES + && SYMBOL_REF_DLLIMPORT_P (op1)) + op1 = legitimize_dllimport_symbol (op1, false); + } + else if (GET_CODE (op1) == CONST + && GET_CODE (XEXP (op1, 0)) == PLUS + && GET_CODE (XEXP (XEXP (op1, 0), 0)) == SYMBOL_REF) + { + rtx addend = XEXP (XEXP (op1, 0), 1); + rtx symbol = XEXP (XEXP (op1, 0), 0); + rtx tmp = NULL; + + model = SYMBOL_REF_TLS_MODEL (symbol); + if (model) + tmp = legitimize_tls_address (symbol, model, true); + else if (TARGET_DLLIMPORT_DECL_ATTRIBUTES + && SYMBOL_REF_DLLIMPORT_P (symbol)) + tmp = legitimize_dllimport_symbol (symbol, true); + + if (tmp) + { + tmp = force_operand (tmp, NULL); + op1 = expand_simple_binop (Pmode, PLUS, tmp, addend, + op0, 1, OPTAB_DIRECT); + if (op1 == op0) + return; + } + } + + if ((flag_pic || MACHOPIC_INDIRECT) + && mode == Pmode && symbolic_operand (op1, Pmode)) + { + if (TARGET_MACHO && !TARGET_64BIT) + { +#if TARGET_MACHO + /* dynamic-no-pic */ + if (MACHOPIC_INDIRECT) + { + rtx temp = ((reload_in_progress + || ((op0 && REG_P (op0)) + && mode == Pmode)) + ? op0 : gen_reg_rtx (Pmode)); + op1 = machopic_indirect_data_reference (op1, temp); + if (MACHOPIC_PURE) + op1 = machopic_legitimize_pic_address (op1, mode, + temp == op1 ? 0 : temp); + } + if (op0 != op1 && GET_CODE (op0) != MEM) + { + rtx insn = gen_rtx_SET (VOIDmode, op0, op1); + emit_insn (insn); + return; + } + if (GET_CODE (op0) == MEM) + op1 = force_reg (Pmode, op1); + else + { + rtx temp = op0; + if (GET_CODE (temp) != REG) + temp = gen_reg_rtx (Pmode); + temp = legitimize_pic_address (op1, temp); + if (temp == op0) + return; + op1 = temp; + } + /* dynamic-no-pic */ +#endif + } + else + { + if (MEM_P (op0)) + op1 = force_reg (Pmode, op1); + else if (!TARGET_64BIT || !x86_64_movabs_operand (op1, Pmode)) + { + rtx reg = can_create_pseudo_p () ? NULL_RTX : op0; + op1 = legitimize_pic_address (op1, reg); + if (op0 == op1) + return; + } + } + } + else + { + if (MEM_P (op0) + && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode) + || !push_operand (op0, mode)) + && MEM_P (op1)) + op1 = force_reg (mode, op1); + + if (push_operand (op0, mode) + && ! general_no_elim_operand (op1, mode)) + op1 = copy_to_mode_reg (mode, op1); + + /* Force large constants in 64bit compilation into register + to get them CSEed. */ + if (can_create_pseudo_p () + && (mode == DImode) && TARGET_64BIT + && immediate_operand (op1, mode) + && !x86_64_zext_immediate_operand (op1, VOIDmode) + && !register_operand (op0, mode) + && optimize) + op1 = copy_to_mode_reg (mode, op1); + + if (can_create_pseudo_p () + && FLOAT_MODE_P (mode) + && GET_CODE (op1) == CONST_DOUBLE) + { + /* If we are loading a floating point constant to a register, + force the value to memory now, since we'll get better code + out the back end. 
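+
+   E.g. loading 1.5 into an SSE register is then emitted as a
+   constant-pool reference along the lines of
+
+     movsd  .LC0(%rip), %xmm0
+
+   which the back end handles much better than a bare CONST_DOUBLE
+   immediate.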
*/ + + op1 = validize_mem (force_const_mem (mode, op1)); + if (!register_operand (op0, mode)) + { + rtx temp = gen_reg_rtx (mode); + emit_insn (gen_rtx_SET (VOIDmode, temp, op1)); + emit_move_insn (op0, temp); + return; + } + } + } + + emit_insn (gen_rtx_SET (VOIDmode, op0, op1)); +} + +void +ix86_expand_vector_move (enum machine_mode mode, rtx operands[]) +{ + rtx op0 = operands[0], op1 = operands[1]; + unsigned int align = GET_MODE_ALIGNMENT (mode); + + /* Force constants other than zero into memory. We do not know how + the instructions used to build constants modify the upper 64 bits + of the register, once we have that information we may be able + to handle some of them more efficiently. */ + if (can_create_pseudo_p () + && register_operand (op0, mode) + && (CONSTANT_P (op1) + || (GET_CODE (op1) == SUBREG + && CONSTANT_P (SUBREG_REG (op1)))) + && !standard_sse_constant_p (op1)) + op1 = validize_mem (force_const_mem (mode, op1)); + + /* We need to check memory alignment for SSE mode since attribute + can make operands unaligned. */ + if (can_create_pseudo_p () + && SSE_REG_MODE_P (mode) + && ((MEM_P (op0) && (MEM_ALIGN (op0) < align)) + || (MEM_P (op1) && (MEM_ALIGN (op1) < align)))) + { + rtx tmp[2]; + + /* ix86_expand_vector_move_misalign() does not like constants ... */ + if (CONSTANT_P (op1) + || (GET_CODE (op1) == SUBREG + && CONSTANT_P (SUBREG_REG (op1)))) + op1 = validize_mem (force_const_mem (mode, op1)); + + /* ... nor both arguments in memory. */ + if (!register_operand (op0, mode) + && !register_operand (op1, mode)) + op1 = force_reg (mode, op1); + + tmp[0] = op0; tmp[1] = op1; + ix86_expand_vector_move_misalign (mode, tmp); + return; + } + + /* Make operand1 a register if it isn't already. */ + if (can_create_pseudo_p () + && !register_operand (op0, mode) + && !register_operand (op1, mode)) + { + emit_move_insn (op0, force_reg (GET_MODE (op0), op1)); + return; + } + + emit_insn (gen_rtx_SET (VOIDmode, op0, op1)); +} + +/* Split 32-byte AVX unaligned load and store if needed. */ + +static void +ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1) +{ + rtx m; + rtx (*extract) (rtx, rtx, rtx); + rtx (*move_unaligned) (rtx, rtx); + enum machine_mode mode; + + switch (GET_MODE (op0)) + { + default: + gcc_unreachable (); + case V32QImode: + extract = gen_avx_vextractf128v32qi; + move_unaligned = gen_avx_movdqu256; + mode = V16QImode; + break; + case V8SFmode: + extract = gen_avx_vextractf128v8sf; + move_unaligned = gen_avx_movups256; + mode = V4SFmode; + break; + case V4DFmode: + extract = gen_avx_vextractf128v4df; + move_unaligned = gen_avx_movupd256; + mode = V2DFmode; + break; + } + + if (MEM_P (op1) && TARGET_AVX256_SPLIT_UNALIGNED_LOAD) + { + rtx r = gen_reg_rtx (mode); + m = adjust_address (op1, mode, 0); + emit_move_insn (r, m); + m = adjust_address (op1, mode, 16); + r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m); + emit_move_insn (op0, r); + } + else if (MEM_P (op0) && TARGET_AVX256_SPLIT_UNALIGNED_STORE) + { + m = adjust_address (op0, mode, 0); + emit_insn (extract (m, op1, const0_rtx)); + m = adjust_address (op0, mode, 16); + emit_insn (extract (m, op1, const1_rtx)); + } + else + emit_insn (move_unaligned (op0, op1)); +} + +/* Implement the movmisalign patterns for SSE. Non-SSE modes go + straight to ix86_expand_vector_move. 
*/ +/* Code generation for scalar reg-reg moves of single and double precision data: + if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true) + movaps reg, reg + else + movss reg, reg + if (x86_sse_partial_reg_dependency == true) + movapd reg, reg + else + movsd reg, reg + + Code generation for scalar loads of double precision data: + if (x86_sse_split_regs == true) + movlpd mem, reg (gas syntax) + else + movsd mem, reg + + Code generation for unaligned packed loads of single precision data + (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency): + if (x86_sse_unaligned_move_optimal) + movups mem, reg + + if (x86_sse_partial_reg_dependency == true) + { + xorps reg, reg + movlps mem, reg + movhps mem+8, reg + } + else + { + movlps mem, reg + movhps mem+8, reg + } + + Code generation for unaligned packed loads of double precision data + (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs): + if (x86_sse_unaligned_move_optimal) + movupd mem, reg + + if (x86_sse_split_regs == true) + { + movlpd mem, reg + movhpd mem+8, reg + } + else + { + movsd mem, reg + movhpd mem+8, reg + } + */ + +void +ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[]) +{ + rtx op0, op1, m; + + op0 = operands[0]; + op1 = operands[1]; + + if (TARGET_AVX) + { + switch (GET_MODE_CLASS (mode)) + { + case MODE_VECTOR_INT: + case MODE_INT: + switch (GET_MODE_SIZE (mode)) + { + case 16: + /* If we're optimizing for size, movups is the smallest. */ + if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL) + { + op0 = gen_lowpart (V4SFmode, op0); + op1 = gen_lowpart (V4SFmode, op1); + emit_insn (gen_avx_movups (op0, op1)); + return; + } + op0 = gen_lowpart (V16QImode, op0); + op1 = gen_lowpart (V16QImode, op1); + emit_insn (gen_avx_movdqu (op0, op1)); + break; + case 32: + op0 = gen_lowpart (V32QImode, op0); + op1 = gen_lowpart (V32QImode, op1); + ix86_avx256_split_vector_move_misalign (op0, op1); + break; + default: + gcc_unreachable (); + } + break; + case MODE_VECTOR_FLOAT: + op0 = gen_lowpart (mode, op0); + op1 = gen_lowpart (mode, op1); + + switch (mode) + { + case V4SFmode: + emit_insn (gen_avx_movups (op0, op1)); + break; + case V8SFmode: + ix86_avx256_split_vector_move_misalign (op0, op1); + break; + case V2DFmode: + if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL) + { + op0 = gen_lowpart (V4SFmode, op0); + op1 = gen_lowpart (V4SFmode, op1); + emit_insn (gen_avx_movups (op0, op1)); + return; + } + emit_insn (gen_avx_movupd (op0, op1)); + break; + case V4DFmode: + ix86_avx256_split_vector_move_misalign (op0, op1); + break; + default: + gcc_unreachable (); + } + break; + + default: + gcc_unreachable (); + } + + return; + } + + if (MEM_P (op1)) + { + /* If we're optimizing for size, movups is the smallest. */ + if (optimize_insn_for_size_p () + || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL) + { + op0 = gen_lowpart (V4SFmode, op0); + op1 = gen_lowpart (V4SFmode, op1); + emit_insn (gen_sse_movups (op0, op1)); + return; + } + + /* ??? If we have typed data, then it would appear that using + movdqu is the only way to get unaligned data loaded with + integer type. 
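+   (So an unaligned V4SImode load takes a single movdqu here, while the
+   float cases below may be split into a movlps/movhps pair.)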
*/ + if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT) + { + op0 = gen_lowpart (V16QImode, op0); + op1 = gen_lowpart (V16QImode, op1); + emit_insn (gen_sse2_movdqu (op0, op1)); + return; + } + + if (TARGET_SSE2 && mode == V2DFmode) + { + rtx zero; + + if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL) + { + op0 = gen_lowpart (V2DFmode, op0); + op1 = gen_lowpart (V2DFmode, op1); + emit_insn (gen_sse2_movupd (op0, op1)); + return; + } + + /* When SSE registers are split into halves, we can avoid + writing to the top half twice. */ + if (TARGET_SSE_SPLIT_REGS) + { + emit_clobber (op0); + zero = op0; + } + else + { + /* ??? Not sure about the best option for the Intel chips. + The following would seem to satisfy; the register is + entirely cleared, breaking the dependency chain. We + then store to the upper half, with a dependency depth + of one. A rumor has it that Intel recommends two movsd + followed by an unpacklpd, but this is unconfirmed. And + given that the dependency depth of the unpacklpd would + still be one, I'm not sure why this would be better. */ + zero = CONST0_RTX (V2DFmode); + } + + m = adjust_address (op1, DFmode, 0); + emit_insn (gen_sse2_loadlpd (op0, zero, m)); + m = adjust_address (op1, DFmode, 8); + emit_insn (gen_sse2_loadhpd (op0, op0, m)); + } + else + { + if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL) + { + op0 = gen_lowpart (V4SFmode, op0); + op1 = gen_lowpart (V4SFmode, op1); + emit_insn (gen_sse_movups (op0, op1)); + return; + } + + if (TARGET_SSE_PARTIAL_REG_DEPENDENCY) + emit_move_insn (op0, CONST0_RTX (mode)); + else + emit_clobber (op0); + + if (mode != V4SFmode) + op0 = gen_lowpart (V4SFmode, op0); + m = adjust_address (op1, V2SFmode, 0); + emit_insn (gen_sse_loadlps (op0, op0, m)); + m = adjust_address (op1, V2SFmode, 8); + emit_insn (gen_sse_loadhps (op0, op0, m)); + } + } + else if (MEM_P (op0)) + { + /* If we're optimizing for size, movups is the smallest. */ + if (optimize_insn_for_size_p () + || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL) + { + op0 = gen_lowpart (V4SFmode, op0); + op1 = gen_lowpart (V4SFmode, op1); + emit_insn (gen_sse_movups (op0, op1)); + return; + } + + /* ??? Similar to above, only less clear because of quote + typeless stores unquote. */ + if (TARGET_SSE2 && !TARGET_SSE_TYPELESS_STORES + && GET_MODE_CLASS (mode) == MODE_VECTOR_INT) + { + op0 = gen_lowpart (V16QImode, op0); + op1 = gen_lowpart (V16QImode, op1); + emit_insn (gen_sse2_movdqu (op0, op1)); + return; + } + + if (TARGET_SSE2 && mode == V2DFmode) + { + if (TARGET_SSE_UNALIGNED_STORE_OPTIMAL) + { + op0 = gen_lowpart (V2DFmode, op0); + op1 = gen_lowpart (V2DFmode, op1); + emit_insn (gen_sse2_movupd (op0, op1)); + } + else + { + m = adjust_address (op0, DFmode, 0); + emit_insn (gen_sse2_storelpd (m, op1)); + m = adjust_address (op0, DFmode, 8); + emit_insn (gen_sse2_storehpd (m, op1)); + } + } + else + { + if (mode != V4SFmode) + op1 = gen_lowpart (V4SFmode, op1); + + if (TARGET_SSE_UNALIGNED_STORE_OPTIMAL) + { + op0 = gen_lowpart (V4SFmode, op0); + emit_insn (gen_sse_movups (op0, op1)); + } + else + { + m = adjust_address (op0, V2SFmode, 0); + emit_insn (gen_sse_storelps (m, op1)); + m = adjust_address (op0, V2SFmode, 8); + emit_insn (gen_sse_storehps (m, op1)); + } + } + } + else + gcc_unreachable (); +} + +/* Expand a push in MODE. This is some mode for which we do not support + proper push instructions, at least from the registers that we expect + the value to live in. 
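+
+   The expansion is just an explicit stack-pointer adjustment followed by
+   a store, e.g. for a 16-byte vector pushed in 32-bit code roughly
+
+     sub    $16, %esp
+     movups %xmm0, (%esp)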
*/ + +void +ix86_expand_push (enum machine_mode mode, rtx x) +{ + rtx tmp; + + tmp = expand_simple_binop (Pmode, PLUS, stack_pointer_rtx, + GEN_INT (-GET_MODE_SIZE (mode)), + stack_pointer_rtx, 1, OPTAB_DIRECT); + if (tmp != stack_pointer_rtx) + emit_move_insn (stack_pointer_rtx, tmp); + + tmp = gen_rtx_MEM (mode, stack_pointer_rtx); + + /* When we push an operand onto stack, it has to be aligned at least + at the function argument boundary. However since we don't have + the argument type, we can't determine the actual argument + boundary. */ + emit_move_insn (tmp, x); +} + +/* Helper function of ix86_fixup_binary_operands to canonicalize + operand order. Returns true if the operands should be swapped. */ + +static bool +ix86_swap_binary_operands_p (enum rtx_code code, enum machine_mode mode, + rtx operands[]) +{ + rtx dst = operands[0]; + rtx src1 = operands[1]; + rtx src2 = operands[2]; + + /* If the operation is not commutative, we can't do anything. */ + if (GET_RTX_CLASS (code) != RTX_COMM_ARITH) + return false; + + /* Highest priority is that src1 should match dst. */ + if (rtx_equal_p (dst, src1)) + return false; + if (rtx_equal_p (dst, src2)) + return true; + + /* Next highest priority is that immediate constants come second. */ + if (immediate_operand (src2, mode)) + return false; + if (immediate_operand (src1, mode)) + return true; + + /* Lowest priority is that memory references should come second. */ + if (MEM_P (src2)) + return false; + if (MEM_P (src1)) + return true; + + return false; +} + + +/* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the + destination to use for the operation. If different from the true + destination in operands[0], a copy operation will be required. */ + +rtx +ix86_fixup_binary_operands (enum rtx_code code, enum machine_mode mode, + rtx operands[]) +{ + rtx dst = operands[0]; + rtx src1 = operands[1]; + rtx src2 = operands[2]; + + /* Canonicalize operand order. */ + if (ix86_swap_binary_operands_p (code, mode, operands)) + { + rtx temp; + + /* It is invalid to swap operands of different modes. */ + gcc_assert (GET_MODE (src1) == GET_MODE (src2)); + + temp = src1; + src1 = src2; + src2 = temp; + } + + /* Both source operands cannot be in memory. */ + if (MEM_P (src1) && MEM_P (src2)) + { + /* Optimization: Only read from memory once. */ + if (rtx_equal_p (src1, src2)) + { + src2 = force_reg (mode, src2); + src1 = src2; + } + else + src2 = force_reg (mode, src2); + } + + /* If the destination is memory, and we do not have matching source + operands, do things in registers. */ + if (MEM_P (dst) && !rtx_equal_p (dst, src1)) + dst = gen_reg_rtx (mode); + + /* Source 1 cannot be a constant. */ + if (CONSTANT_P (src1)) + src1 = force_reg (mode, src1); + + /* Source 1 cannot be a non-matching memory. */ + if (MEM_P (src1) && !rtx_equal_p (dst, src1)) + src1 = force_reg (mode, src1); + + operands[1] = src1; + operands[2] = src2; + return dst; +} + +/* Similarly, but assume that the destination has already been + set up properly. */ + +void +ix86_fixup_binary_operands_no_copy (enum rtx_code code, + enum machine_mode mode, rtx operands[]) +{ + rtx dst = ix86_fixup_binary_operands (code, mode, operands); + gcc_assert (dst == operands[0]); +} + +/* Attempt to expand a binary operator. Make the expansion closer to the + actual machine, then just general_operand, which will allow 3 separate + memory references (one output, two input) in a single insn. 
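+
+   For example, "mem1 = mem2 + mem1" is first canonicalized (PLUS is
+   commutative) so that the source equal to the destination comes first,
+   and the remaining memory input is pulled into a register:
+
+     reg  = mem2
+     mem1 = mem1 + reg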
*/ + +void +ix86_expand_binary_operator (enum rtx_code code, enum machine_mode mode, + rtx operands[]) +{ + rtx src1, src2, dst, op, clob; + + dst = ix86_fixup_binary_operands (code, mode, operands); + src1 = operands[1]; + src2 = operands[2]; + + /* Emit the instruction. */ + + op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, src1, src2)); + if (reload_in_progress) + { + /* Reload doesn't know about the flags register, and doesn't know that + it doesn't want to clobber it. We can only do this with PLUS. */ + gcc_assert (code == PLUS); + emit_insn (op); + } + else if (reload_completed + && code == PLUS + && !rtx_equal_p (dst, src1)) + { + /* This is going to be an LEA; avoid splitting it later. */ + emit_insn (op); + } + else + { + clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG)); + emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob))); + } + + /* Fix up the destination if needed. */ + if (dst != operands[0]) + emit_move_insn (operands[0], dst); +} + +/* Return TRUE or FALSE depending on whether the binary operator meets the + appropriate constraints. */ + +bool +ix86_binary_operator_ok (enum rtx_code code, enum machine_mode mode, + rtx operands[3]) +{ + rtx dst = operands[0]; + rtx src1 = operands[1]; + rtx src2 = operands[2]; + + /* Both source operands cannot be in memory. */ + if (MEM_P (src1) && MEM_P (src2)) + return false; + + /* Canonicalize operand order for commutative operators. */ + if (ix86_swap_binary_operands_p (code, mode, operands)) + { + rtx temp = src1; + src1 = src2; + src2 = temp; + } + + /* If the destination is memory, we must have a matching source operand. */ + if (MEM_P (dst) && !rtx_equal_p (dst, src1)) + return false; + + /* Source 1 cannot be a constant. */ + if (CONSTANT_P (src1)) + return false; + + /* Source 1 cannot be a non-matching memory. */ + if (MEM_P (src1) && !rtx_equal_p (dst, src1)) + { + /* Support "andhi/andsi/anddi" as a zero-extending move. */ + return (code == AND + && (mode == HImode + || mode == SImode + || (TARGET_64BIT && mode == DImode)) + && CONST_INT_P (src2) + && (INTVAL (src2) == 0xff + || INTVAL (src2) == 0xffff)); + } + + return true; +} + +/* Attempt to expand a unary operator. Make the expansion closer to the + actual machine, then just general_operand, which will allow 2 separate + memory references (one output, one input) in a single insn. */ + +void +ix86_expand_unary_operator (enum rtx_code code, enum machine_mode mode, + rtx operands[]) +{ + int matching_memory; + rtx src, dst, op, clob; + + dst = operands[0]; + src = operands[1]; + + /* If the destination is memory, and we do not have matching source + operands, do things in registers. */ + matching_memory = 0; + if (MEM_P (dst)) + { + if (rtx_equal_p (dst, src)) + matching_memory = 1; + else + dst = gen_reg_rtx (mode); + } + + /* When source operand is memory, destination must match. */ + if (MEM_P (src) && !matching_memory) + src = force_reg (mode, src); + + /* Emit the instruction. */ + + op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_e (code, mode, src)); + if (reload_in_progress || code == NOT) + { + /* Reload doesn't know about the flags register, and doesn't know that + it doesn't want to clobber it. */ + gcc_assert (code == NOT); + emit_insn (op); + } + else + { + clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG)); + emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob))); + } + + /* Fix up the destination if needed. 
*/ + if (dst != operands[0]) + emit_move_insn (operands[0], dst); +} + +/* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and + divisor are within the the range [0-255]. */ + +void +ix86_split_idivmod (enum machine_mode mode, rtx operands[], + bool signed_p) +{ + rtx end_label, qimode_label; + rtx insn, div, mod; + rtx scratch, tmp0, tmp1, tmp2; + rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx); + rtx (*gen_zero_extend) (rtx, rtx); + rtx (*gen_test_ccno_1) (rtx, rtx); + + switch (mode) + { + case SImode: + gen_divmod4_1 = signed_p ? gen_divmodsi4_1 : gen_udivmodsi4_1; + gen_test_ccno_1 = gen_testsi_ccno_1; + gen_zero_extend = gen_zero_extendqisi2; + break; + case DImode: + gen_divmod4_1 = signed_p ? gen_divmoddi4_1 : gen_udivmoddi4_1; + gen_test_ccno_1 = gen_testdi_ccno_1; + gen_zero_extend = gen_zero_extendqidi2; + break; + default: + gcc_unreachable (); + } + + end_label = gen_label_rtx (); + qimode_label = gen_label_rtx (); + + scratch = gen_reg_rtx (mode); + + /* Use 8bit unsigned divimod if dividend and divisor are within the + the range [0-255]. */ + emit_move_insn (scratch, operands[2]); + scratch = expand_simple_binop (mode, IOR, scratch, operands[3], + scratch, 1, OPTAB_DIRECT); + emit_insn (gen_test_ccno_1 (scratch, GEN_INT (-0x100))); + tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG); + tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx); + tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0, + gen_rtx_LABEL_REF (VOIDmode, qimode_label), + pc_rtx); + insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp0)); + predict_jump (REG_BR_PROB_BASE * 50 / 100); + JUMP_LABEL (insn) = qimode_label; + + /* Generate original signed/unsigned divimod. */ + div = gen_divmod4_1 (operands[0], operands[1], + operands[2], operands[3]); + emit_insn (div); + + /* Branch to the end. */ + emit_jump_insn (gen_jump (end_label)); + emit_barrier (); + + /* Generate 8bit unsigned divide. */ + emit_label (qimode_label); + /* Don't use operands[0] for result of 8bit divide since not all + registers support QImode ZERO_EXTRACT. */ + tmp0 = simplify_gen_subreg (HImode, scratch, mode, 0); + tmp1 = simplify_gen_subreg (HImode, operands[2], mode, 0); + tmp2 = simplify_gen_subreg (QImode, operands[3], mode, 0); + emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2)); + + if (signed_p) + { + div = gen_rtx_DIV (SImode, operands[2], operands[3]); + mod = gen_rtx_MOD (SImode, operands[2], operands[3]); + } + else + { + div = gen_rtx_UDIV (SImode, operands[2], operands[3]); + mod = gen_rtx_UMOD (SImode, operands[2], operands[3]); + } + + /* Extract remainder from AH. */ + tmp1 = gen_rtx_ZERO_EXTRACT (mode, tmp0, GEN_INT (8), GEN_INT (8)); + if (REG_P (operands[1])) + insn = emit_move_insn (operands[1], tmp1); + else + { + /* Need a new scratch register since the old one has result + of 8bit divide. */ + scratch = gen_reg_rtx (mode); + emit_move_insn (scratch, tmp1); + insn = emit_move_insn (operands[1], scratch); + } + set_unique_reg_note (insn, REG_EQUAL, mod); + + /* Zero extend quotient from AL. */ + tmp1 = gen_lowpart (QImode, tmp0); + insn = emit_insn (gen_zero_extend (operands[0], tmp1)); + set_unique_reg_note (insn, REG_EQUAL, div); + + emit_label (end_label); +} + +#define LEA_SEARCH_THRESHOLD 12 + +/* Search backward for non-agu definition of register number REGNO1 + or register number REGNO2 in INSN's basic block until + 1. Pass LEA_SEARCH_THRESHOLD instructions, or + 2. Reach BB boundary, or + 3. Reach agu definition. + Returns the distance between the non-agu definition point and INSN. 
+ If no definition point, returns -1. */ + +static int +distance_non_agu_define (unsigned int regno1, unsigned int regno2, + rtx insn) +{ + basic_block bb = BLOCK_FOR_INSN (insn); + int distance = 0; + df_ref *def_rec; + + if (insn != BB_HEAD (bb)) + { + rtx prev = PREV_INSN (insn); + while (prev && distance < LEA_SEARCH_THRESHOLD) + { + if (NONDEBUG_INSN_P (prev)) + { + distance++; + for (def_rec = DF_INSN_DEFS (prev); *def_rec; def_rec++) + if (DF_REF_TYPE (*def_rec) == DF_REF_REG_DEF + && !DF_REF_IS_ARTIFICIAL (*def_rec) + && (regno1 == DF_REF_REGNO (*def_rec) + || regno2 == DF_REF_REGNO (*def_rec))) + { + if (recog_memoized (prev) < 0 + || get_attr_type (prev) != TYPE_LEA) + goto done; + } + } + if (prev == BB_HEAD (bb)) + break; + prev = PREV_INSN (prev); + } + } + + if (distance < LEA_SEARCH_THRESHOLD) + { + edge e; + edge_iterator ei; + bool simple_loop = false; + + FOR_EACH_EDGE (e, ei, bb->preds) + if (e->src == bb) + { + simple_loop = true; + break; + } + + if (simple_loop) + { + rtx prev = BB_END (bb); + while (prev + && prev != insn + && distance < LEA_SEARCH_THRESHOLD) + { + if (NONDEBUG_INSN_P (prev)) + { + distance++; + for (def_rec = DF_INSN_DEFS (prev); *def_rec; def_rec++) + if (DF_REF_TYPE (*def_rec) == DF_REF_REG_DEF + && !DF_REF_IS_ARTIFICIAL (*def_rec) + && (regno1 == DF_REF_REGNO (*def_rec) + || regno2 == DF_REF_REGNO (*def_rec))) + { + if (recog_memoized (prev) < 0 + || get_attr_type (prev) != TYPE_LEA) + goto done; + } + } + prev = PREV_INSN (prev); + } + } + } + + distance = -1; + +done: + /* get_attr_type may modify recog data. We want to make sure + that recog data is valid for instruction INSN, on which + distance_non_agu_define is called. INSN is unchanged here. */ + extract_insn_cached (insn); + return distance; +} + +/* Return the distance between INSN and the next insn that uses + register number REGNO0 in memory address. Return -1 if no such + a use is found within LEA_SEARCH_THRESHOLD or REGNO0 is set. */ + +static int +distance_agu_use (unsigned int regno0, rtx insn) +{ + basic_block bb = BLOCK_FOR_INSN (insn); + int distance = 0; + df_ref *def_rec; + df_ref *use_rec; + + if (insn != BB_END (bb)) + { + rtx next = NEXT_INSN (insn); + while (next && distance < LEA_SEARCH_THRESHOLD) + { + if (NONDEBUG_INSN_P (next)) + { + distance++; + + for (use_rec = DF_INSN_USES (next); *use_rec; use_rec++) + if ((DF_REF_TYPE (*use_rec) == DF_REF_REG_MEM_LOAD + || DF_REF_TYPE (*use_rec) == DF_REF_REG_MEM_STORE) + && regno0 == DF_REF_REGNO (*use_rec)) + { + /* Return DISTANCE if OP0 is used in memory + address in NEXT. */ + return distance; + } + + for (def_rec = DF_INSN_DEFS (next); *def_rec; def_rec++) + if (DF_REF_TYPE (*def_rec) == DF_REF_REG_DEF + && !DF_REF_IS_ARTIFICIAL (*def_rec) + && regno0 == DF_REF_REGNO (*def_rec)) + { + /* Return -1 if OP0 is set in NEXT. 
*/ + return -1; + } + } + if (next == BB_END (bb)) + break; + next = NEXT_INSN (next); + } + } + + if (distance < LEA_SEARCH_THRESHOLD) + { + edge e; + edge_iterator ei; + bool simple_loop = false; + + FOR_EACH_EDGE (e, ei, bb->succs) + if (e->dest == bb) + { + simple_loop = true; + break; + } + + if (simple_loop) + { + rtx next = BB_HEAD (bb); + while (next + && next != insn + && distance < LEA_SEARCH_THRESHOLD) + { + if (NONDEBUG_INSN_P (next)) + { + distance++; + + for (use_rec = DF_INSN_USES (next); *use_rec; use_rec++) + if ((DF_REF_TYPE (*use_rec) == DF_REF_REG_MEM_LOAD + || DF_REF_TYPE (*use_rec) == DF_REF_REG_MEM_STORE) + && regno0 == DF_REF_REGNO (*use_rec)) + { + /* Return DISTANCE if OP0 is used in memory + address in NEXT. */ + return distance; + } + + for (def_rec = DF_INSN_DEFS (next); *def_rec; def_rec++) + if (DF_REF_TYPE (*def_rec) == DF_REF_REG_DEF + && !DF_REF_IS_ARTIFICIAL (*def_rec) + && regno0 == DF_REF_REGNO (*def_rec)) + { + /* Return -1 if OP0 is set in NEXT. */ + return -1; + } + + } + next = NEXT_INSN (next); + } + } + } + + return -1; +} + +/* Define this macro to tune LEA priority vs ADD, it take effect when + there is a dilemma of choicing LEA or ADD + Negative value: ADD is more preferred than LEA + Zero: Netrual + Positive value: LEA is more preferred than ADD*/ +#define IX86_LEA_PRIORITY 2 + +/* Return true if it is ok to optimize an ADD operation to LEA + operation to avoid flag register consumation. For most processors, + ADD is faster than LEA. For the processors like ATOM, if the + destination register of LEA holds an actual address which will be + used soon, LEA is better and otherwise ADD is better. */ + +bool +ix86_lea_for_add_ok (rtx insn, rtx operands[]) +{ + unsigned int regno0 = true_regnum (operands[0]); + unsigned int regno1 = true_regnum (operands[1]); + unsigned int regno2 = true_regnum (operands[2]); + + /* If a = b + c, (a!=b && a!=c), must use lea form. */ + if (regno0 != regno1 && regno0 != regno2) + return true; + + if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun)) + return false; + else + { + int dist_define, dist_use; + + /* Return false if REGNO0 isn't used in memory address. */ + dist_use = distance_agu_use (regno0, insn); + if (dist_use <= 0) + return false; + + dist_define = distance_non_agu_define (regno1, regno2, insn); + if (dist_define <= 0) + return true; + + /* If this insn has both backward non-agu dependence and forward + agu dependence, the one with short distance take effect. */ + if ((dist_define + IX86_LEA_PRIORITY) < dist_use) + return false; + + return true; + } +} + +/* Return true if destination reg of SET_BODY is shift count of + USE_BODY. */ + +static bool +ix86_dep_by_shift_count_body (const_rtx set_body, const_rtx use_body) +{ + rtx set_dest; + rtx shift_rtx; + int i; + + /* Retrieve destination of SET_BODY. */ + switch (GET_CODE (set_body)) + { + case SET: + set_dest = SET_DEST (set_body); + if (!set_dest || !REG_P (set_dest)) + return false; + break; + case PARALLEL: + for (i = XVECLEN (set_body, 0) - 1; i >= 0; i--) + if (ix86_dep_by_shift_count_body (XVECEXP (set_body, 0, i), + use_body)) + return true; + default: + return false; + break; + } + + /* Retrieve shift count of USE_BODY. 
*/ + switch (GET_CODE (use_body)) + { + case SET: + shift_rtx = XEXP (use_body, 1); + break; + case PARALLEL: + for (i = XVECLEN (use_body, 0) - 1; i >= 0; i--) + if (ix86_dep_by_shift_count_body (set_body, + XVECEXP (use_body, 0, i))) + return true; + default: + return false; + break; + } + + if (shift_rtx + && (GET_CODE (shift_rtx) == ASHIFT + || GET_CODE (shift_rtx) == LSHIFTRT + || GET_CODE (shift_rtx) == ASHIFTRT + || GET_CODE (shift_rtx) == ROTATE + || GET_CODE (shift_rtx) == ROTATERT)) + { + rtx shift_count = XEXP (shift_rtx, 1); + + /* Return true if shift count is dest of SET_BODY. */ + if (REG_P (shift_count) + && true_regnum (set_dest) == true_regnum (shift_count)) + return true; + } + + return false; +} + +/* Return true if destination reg of SET_INSN is shift count of + USE_INSN. */ + +bool +ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn) +{ + return ix86_dep_by_shift_count_body (PATTERN (set_insn), + PATTERN (use_insn)); +} + +/* Return TRUE or FALSE depending on whether the unary operator meets the + appropriate constraints. */ + +bool +ix86_unary_operator_ok (enum rtx_code code ATTRIBUTE_UNUSED, + enum machine_mode mode ATTRIBUTE_UNUSED, + rtx operands[2] ATTRIBUTE_UNUSED) +{ + /* If one of operands is memory, source and destination must match. */ + if ((MEM_P (operands[0]) + || MEM_P (operands[1])) + && ! rtx_equal_p (operands[0], operands[1])) + return false; + return true; +} + +/* Return TRUE if the operands to a vec_interleave_{high,low}v2df + are ok, keeping in mind the possible movddup alternative. */ + +bool +ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high) +{ + if (MEM_P (operands[0])) + return rtx_equal_p (operands[0], operands[1 + high]); + if (MEM_P (operands[1]) && MEM_P (operands[2])) + return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]); + return true; +} + +/* Post-reload splitter for converting an SF or DFmode value in an + SSE register into an unsigned SImode. */ + +void +ix86_split_convert_uns_si_sse (rtx operands[]) +{ + enum machine_mode vecmode; + rtx value, large, zero_or_two31, input, two31, x; + + large = operands[1]; + zero_or_two31 = operands[2]; + input = operands[3]; + two31 = operands[4]; + vecmode = GET_MODE (large); + value = gen_rtx_REG (vecmode, REGNO (operands[0])); + + /* Load up the value into the low element. We must ensure that the other + elements are valid floats -- zero is the easiest such value. */ + if (MEM_P (input)) + { + if (vecmode == V4SFmode) + emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input)); + else + emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input)); + } + else + { + input = gen_rtx_REG (vecmode, REGNO (input)); + emit_move_insn (value, CONST0_RTX (vecmode)); + if (vecmode == V4SFmode) + emit_insn (gen_sse_movss (value, value, input)); + else + emit_insn (gen_sse2_movsd (value, value, input)); + } + + emit_move_insn (large, two31); + emit_move_insn (zero_or_two31, MEM_P (two31) ? 
large : two31); + + x = gen_rtx_fmt_ee (LE, vecmode, large, value); + emit_insn (gen_rtx_SET (VOIDmode, large, x)); + + x = gen_rtx_AND (vecmode, zero_or_two31, large); + emit_insn (gen_rtx_SET (VOIDmode, zero_or_two31, x)); + + x = gen_rtx_MINUS (vecmode, value, zero_or_two31); + emit_insn (gen_rtx_SET (VOIDmode, value, x)); + + large = gen_rtx_REG (V4SImode, REGNO (large)); + emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31))); + + x = gen_rtx_REG (V4SImode, REGNO (value)); + if (vecmode == V4SFmode) + emit_insn (gen_sse2_cvttps2dq (x, value)); + else + emit_insn (gen_sse2_cvttpd2dq (x, value)); + value = x; + + emit_insn (gen_xorv4si3 (value, value, large)); +} + +/* Convert an unsigned DImode value into a DFmode, using only SSE. + Expects the 64-bit DImode to be supplied in a pair of integral + registers. Requires SSE2; will use SSE3 if available. For x86_32, + -mfpmath=sse, !optimize_size only. */ + +void +ix86_expand_convert_uns_didf_sse (rtx target, rtx input) +{ + REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt; + rtx int_xmm, fp_xmm; + rtx biases, exponents; + rtx x; + + int_xmm = gen_reg_rtx (V4SImode); + if (TARGET_INTER_UNIT_MOVES) + emit_insn (gen_movdi_to_sse (int_xmm, input)); + else if (TARGET_SSE_SPLIT_REGS) + { + emit_clobber (int_xmm); + emit_move_insn (gen_lowpart (DImode, int_xmm), input); + } + else + { + x = gen_reg_rtx (V2DImode); + ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0); + emit_move_insn (int_xmm, gen_lowpart (V4SImode, x)); + } + + x = gen_rtx_CONST_VECTOR (V4SImode, + gen_rtvec (4, GEN_INT (0x43300000UL), + GEN_INT (0x45300000UL), + const0_rtx, const0_rtx)); + exponents = validize_mem (force_const_mem (V4SImode, x)); + + /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */ + emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents)); + + /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm) + yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)). + Similarly (0x45300000UL ## fp_value_hi_xmm) yields + (0x1.0p84 + double(fp_value_hi_xmm)). + Note these exponents differ by 32. */ + + fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm)); + + /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values + in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */ + real_ldexp (&bias_lo_rvt, &dconst1, 52); + real_ldexp (&bias_hi_rvt, &dconst1, 84); + biases = const_double_from_real_value (bias_lo_rvt, DFmode); + x = const_double_from_real_value (bias_hi_rvt, DFmode); + biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x)); + biases = validize_mem (force_const_mem (V2DFmode, biases)); + emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases)); + + /* Add the upper and lower DFmode values together. */ + if (TARGET_SSE3) + emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm)); + else + { + x = copy_to_mode_reg (V2DFmode, fp_xmm); + emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm)); + emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x)); + } + + ix86_expand_vector_extract (false, target, fp_xmm, 0); +} + +/* Not used, but eases macroization of patterns. */ +void +ix86_expand_convert_uns_sixf_sse (rtx target ATTRIBUTE_UNUSED, + rtx input ATTRIBUTE_UNUSED) +{ + gcc_unreachable (); +} + +/* Convert an unsigned SImode value into a DFmode. Only currently used + for SSE, but applicable anywhere. 
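+
+   The sequence below flips the sign bit (adds -2^31 with wrap-around),
+   converts as a signed value, and then adds 2^31 back as a double.
+   E.g. for input 0xffffffff: 0xffffffff + 0x80000000 = 0x7fffffff, which
+   converts to 2147483647.0, and 2147483647.0 + 2147483648.0 gives the
+   expected 4294967295.0.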
*/ + +void +ix86_expand_convert_uns_sidf_sse (rtx target, rtx input) +{ + REAL_VALUE_TYPE TWO31r; + rtx x, fp; + + x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1), + NULL, 1, OPTAB_DIRECT); + + fp = gen_reg_rtx (DFmode); + emit_insn (gen_floatsidf2 (fp, x)); + + real_ldexp (&TWO31r, &dconst1, 31); + x = const_double_from_real_value (TWO31r, DFmode); + + x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT); + if (x != target) + emit_move_insn (target, x); +} + +/* Convert a signed DImode value into a DFmode. Only used for SSE in + 32-bit mode; otherwise we have a direct convert instruction. */ + +void +ix86_expand_convert_sign_didf_sse (rtx target, rtx input) +{ + REAL_VALUE_TYPE TWO32r; + rtx fp_lo, fp_hi, x; + + fp_lo = gen_reg_rtx (DFmode); + fp_hi = gen_reg_rtx (DFmode); + + emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input))); + + real_ldexp (&TWO32r, &dconst1, 32); + x = const_double_from_real_value (TWO32r, DFmode); + fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT); + + ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input)); + + x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target, + 0, OPTAB_DIRECT); + if (x != target) + emit_move_insn (target, x); +} + +/* Convert an unsigned SImode value into a SFmode, using only SSE. + For x86_32, -mfpmath=sse, !optimize_size only. */ +void +ix86_expand_convert_uns_sisf_sse (rtx target, rtx input) +{ + REAL_VALUE_TYPE ONE16r; + rtx fp_hi, fp_lo, int_hi, int_lo, x; + + real_ldexp (&ONE16r, &dconst1, 16); + x = const_double_from_real_value (ONE16r, SFmode); + int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff), + NULL, 0, OPTAB_DIRECT); + int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16), + NULL, 0, OPTAB_DIRECT); + fp_hi = gen_reg_rtx (SFmode); + fp_lo = gen_reg_rtx (SFmode); + emit_insn (gen_floatsisf2 (fp_hi, int_hi)); + emit_insn (gen_floatsisf2 (fp_lo, int_lo)); + fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi, + 0, OPTAB_DIRECT); + fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target, + 0, OPTAB_DIRECT); + if (!rtx_equal_p (target, fp_hi)) + emit_move_insn (target, fp_hi); +} + +/* A subroutine of ix86_build_signbit_mask. If VECT is true, + then replicate the value for all elements of the vector + register. 
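+   Otherwise the value goes into element 0 and the remaining elements are
+   zero, e.g. { value, 0.0, 0.0, 0.0 } for V4SFmode.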
*/ + +rtx +ix86_build_const_vector (enum machine_mode mode, bool vect, rtx value) +{ + rtvec v; + switch (mode) + { + case V4SImode: + gcc_assert (vect); + v = gen_rtvec (4, value, value, value, value); + return gen_rtx_CONST_VECTOR (V4SImode, v); + + case V2DImode: + gcc_assert (vect); + v = gen_rtvec (2, value, value); + return gen_rtx_CONST_VECTOR (V2DImode, v); + + case V8SFmode: + if (vect) + v = gen_rtvec (8, value, value, value, value, + value, value, value, value); + else + v = gen_rtvec (8, value, CONST0_RTX (SFmode), + CONST0_RTX (SFmode), CONST0_RTX (SFmode), + CONST0_RTX (SFmode), CONST0_RTX (SFmode), + CONST0_RTX (SFmode), CONST0_RTX (SFmode)); + return gen_rtx_CONST_VECTOR (V8SFmode, v); + + case V4SFmode: + if (vect) + v = gen_rtvec (4, value, value, value, value); + else + v = gen_rtvec (4, value, CONST0_RTX (SFmode), + CONST0_RTX (SFmode), CONST0_RTX (SFmode)); + return gen_rtx_CONST_VECTOR (V4SFmode, v); + + case V4DFmode: + if (vect) + v = gen_rtvec (4, value, value, value, value); + else + v = gen_rtvec (4, value, CONST0_RTX (DFmode), + CONST0_RTX (DFmode), CONST0_RTX (DFmode)); + return gen_rtx_CONST_VECTOR (V4DFmode, v); + + case V2DFmode: + if (vect) + v = gen_rtvec (2, value, value); + else + v = gen_rtvec (2, value, CONST0_RTX (DFmode)); + return gen_rtx_CONST_VECTOR (V2DFmode, v); + + default: + gcc_unreachable (); + } +} + +/* A subroutine of ix86_expand_fp_absneg_operator, copysign expanders + and ix86_expand_int_vcond. Create a mask for the sign bit in MODE + for an SSE register. If VECT is true, then replicate the mask for + all elements of the vector register. If INVERT is true, then create + a mask excluding the sign bit. */ + +rtx +ix86_build_signbit_mask (enum machine_mode mode, bool vect, bool invert) +{ + enum machine_mode vec_mode, imode; + HOST_WIDE_INT hi, lo; + int shift = 63; + rtx v; + rtx mask; + + /* Find the sign bit, sign extended to 2*HWI. */ + switch (mode) + { + case V4SImode: + case V8SFmode: + case V4SFmode: + vec_mode = mode; + mode = GET_MODE_INNER (mode); + imode = SImode; + lo = 0x80000000, hi = lo < 0; + break; + + case V2DImode: + case V4DFmode: + case V2DFmode: + vec_mode = mode; + mode = GET_MODE_INNER (mode); + imode = DImode; + if (HOST_BITS_PER_WIDE_INT >= 64) + lo = (HOST_WIDE_INT)1 << shift, hi = -1; + else + lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT); + break; + + case TImode: + case TFmode: + vec_mode = VOIDmode; + if (HOST_BITS_PER_WIDE_INT >= 64) + { + imode = TImode; + lo = 0, hi = (HOST_WIDE_INT)1 << shift; + } + else + { + rtvec vec; + + imode = DImode; + lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT); + + if (invert) + { + lo = ~lo, hi = ~hi; + v = constm1_rtx; + } + else + v = const0_rtx; + + mask = immed_double_const (lo, hi, imode); + + vec = gen_rtvec (2, v, mask); + v = gen_rtx_CONST_VECTOR (V2DImode, vec); + v = copy_to_mode_reg (mode, gen_lowpart (mode, v)); + + return v; + } + break; + + default: + gcc_unreachable (); + } + + if (invert) + lo = ~lo, hi = ~hi; + + /* Force this value into the low part of a fp vector constant. */ + mask = immed_double_const (lo, hi, imode); + mask = gen_lowpart (mode, mask); + + if (vec_mode == VOIDmode) + return force_reg (mode, mask); + + v = ix86_build_const_vector (vec_mode, vect, mask); + return force_reg (vec_mode, v); +} + +/* Generate code for floating point ABS or NEG. 
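+
+   With SSE math these become bitwise operations on the sign bit, e.g. for
+   SFmode roughly
+
+     neg:  xorps with { 0x80000000, ... }
+     abs:  andps with { 0x7fffffff, ... }    (the inverted mask)
+
+   which is why a sign-bit mask is built up front.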
*/ + +void +ix86_expand_fp_absneg_operator (enum rtx_code code, enum machine_mode mode, + rtx operands[]) +{ + rtx mask, set, dst, src; + bool use_sse = false; + bool vector_mode = VECTOR_MODE_P (mode); + enum machine_mode vmode = mode; + + if (vector_mode) + use_sse = true; + else if (mode == TFmode) + use_sse = true; + else if (TARGET_SSE_MATH) + { + use_sse = SSE_FLOAT_MODE_P (mode); + if (mode == SFmode) + vmode = V4SFmode; + else if (mode == DFmode) + vmode = V2DFmode; + } + + /* NEG and ABS performed with SSE use bitwise mask operations. + Create the appropriate mask now. */ + if (use_sse) + mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS); + else + mask = NULL_RTX; + + dst = operands[0]; + src = operands[1]; + + set = gen_rtx_fmt_e (code, mode, src); + set = gen_rtx_SET (VOIDmode, dst, set); + + if (mask) + { + rtx use, clob; + rtvec par; + + use = gen_rtx_USE (VOIDmode, mask); + if (vector_mode) + par = gen_rtvec (2, set, use); + else + { + clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG)); + par = gen_rtvec (3, set, use, clob); + } + emit_insn (gen_rtx_PARALLEL (VOIDmode, par)); + } + else + emit_insn (set); +} + +/* Expand a copysign operation. Special case operand 0 being a constant. */ + +void +ix86_expand_copysign (rtx operands[]) +{ + enum machine_mode mode, vmode; + rtx dest, op0, op1, mask, nmask; + + dest = operands[0]; + op0 = operands[1]; + op1 = operands[2]; + + mode = GET_MODE (dest); + + if (mode == SFmode) + vmode = V4SFmode; + else if (mode == DFmode) + vmode = V2DFmode; + else + vmode = mode; + + if (GET_CODE (op0) == CONST_DOUBLE) + { + rtx (*copysign_insn)(rtx, rtx, rtx, rtx); + + if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0))) + op0 = simplify_unary_operation (ABS, mode, op0, mode); + + if (mode == SFmode || mode == DFmode) + { + if (op0 == CONST0_RTX (mode)) + op0 = CONST0_RTX (vmode); + else + { + rtx v = ix86_build_const_vector (vmode, false, op0); + + op0 = force_reg (vmode, v); + } + } + else if (op0 != CONST0_RTX (mode)) + op0 = force_reg (mode, op0); + + mask = ix86_build_signbit_mask (vmode, 0, 0); + + if (mode == SFmode) + copysign_insn = gen_copysignsf3_const; + else if (mode == DFmode) + copysign_insn = gen_copysigndf3_const; + else + copysign_insn = gen_copysigntf3_const; + + emit_insn (copysign_insn (dest, op0, op1, mask)); + } + else + { + rtx (*copysign_insn)(rtx, rtx, rtx, rtx, rtx, rtx); + + nmask = ix86_build_signbit_mask (vmode, 0, 1); + mask = ix86_build_signbit_mask (vmode, 0, 0); + + if (mode == SFmode) + copysign_insn = gen_copysignsf3_var; + else if (mode == DFmode) + copysign_insn = gen_copysigndf3_var; + else + copysign_insn = gen_copysigntf3_var; + + emit_insn (copysign_insn (dest, NULL_RTX, op0, op1, nmask, mask)); + } +} + +/* Deconstruct a copysign operation into bit masks. Operand 0 is known to + be a constant, and so has already been expanded into a vector constant. */ + +void +ix86_split_copysign_const (rtx operands[]) +{ + enum machine_mode mode, vmode; + rtx dest, op0, mask, x; + + dest = operands[0]; + op0 = operands[1]; + mask = operands[3]; + + mode = GET_MODE (dest); + vmode = GET_MODE (mask); + + dest = simplify_gen_subreg (vmode, dest, mode, 0); + x = gen_rtx_AND (vmode, dest, mask); + emit_insn (gen_rtx_SET (VOIDmode, dest, x)); + + if (op0 != CONST0_RTX (vmode)) + { + x = gen_rtx_IOR (vmode, dest, op0); + emit_insn (gen_rtx_SET (VOIDmode, dest, x)); + } +} + +/* Deconstruct a copysign operation into bit masks. Operand 0 is variable, + so we have to do two masks. 
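+
+   The net effect, for SFmode, is
+
+     result = (op0 & 0x7fffffff) | (op1 & 0x80000000)
+
+   i.e. the magnitude of op0 combined with the sign of op1.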
*/ + +void +ix86_split_copysign_var (rtx operands[]) +{ + enum machine_mode mode, vmode; + rtx dest, scratch, op0, op1, mask, nmask, x; + + dest = operands[0]; + scratch = operands[1]; + op0 = operands[2]; + op1 = operands[3]; + nmask = operands[4]; + mask = operands[5]; + + mode = GET_MODE (dest); + vmode = GET_MODE (mask); + + if (rtx_equal_p (op0, op1)) + { + /* Shouldn't happen often (it's useless, obviously), but when it does + we'd generate incorrect code if we continue below. */ + emit_move_insn (dest, op0); + return; + } + + if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */ + { + gcc_assert (REGNO (op1) == REGNO (scratch)); + + x = gen_rtx_AND (vmode, scratch, mask); + emit_insn (gen_rtx_SET (VOIDmode, scratch, x)); + + dest = mask; + op0 = simplify_gen_subreg (vmode, op0, mode, 0); + x = gen_rtx_NOT (vmode, dest); + x = gen_rtx_AND (vmode, x, op0); + emit_insn (gen_rtx_SET (VOIDmode, dest, x)); + } + else + { + if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */ + { + x = gen_rtx_AND (vmode, scratch, mask); + } + else /* alternative 2,4 */ + { + gcc_assert (REGNO (mask) == REGNO (scratch)); + op1 = simplify_gen_subreg (vmode, op1, mode, 0); + x = gen_rtx_AND (vmode, scratch, op1); + } + emit_insn (gen_rtx_SET (VOIDmode, scratch, x)); + + if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */ + { + dest = simplify_gen_subreg (vmode, op0, mode, 0); + x = gen_rtx_AND (vmode, dest, nmask); + } + else /* alternative 3,4 */ + { + gcc_assert (REGNO (nmask) == REGNO (dest)); + dest = nmask; + op0 = simplify_gen_subreg (vmode, op0, mode, 0); + x = gen_rtx_AND (vmode, dest, op0); + } + emit_insn (gen_rtx_SET (VOIDmode, dest, x)); + } + + x = gen_rtx_IOR (vmode, dest, scratch); + emit_insn (gen_rtx_SET (VOIDmode, dest, x)); +} + +/* Return TRUE or FALSE depending on whether the first SET in INSN + has source and destination with matching CC modes, and that the + CC mode is at least as constrained as REQ_MODE. */ + +bool +ix86_match_ccmode (rtx insn, enum machine_mode req_mode) +{ + rtx set; + enum machine_mode set_mode; + + set = PATTERN (insn); + if (GET_CODE (set) == PARALLEL) + set = XVECEXP (set, 0, 0); + gcc_assert (GET_CODE (set) == SET); + gcc_assert (GET_CODE (SET_SRC (set)) == COMPARE); + + set_mode = GET_MODE (SET_DEST (set)); + switch (set_mode) + { + case CCNOmode: + if (req_mode != CCNOmode + && (req_mode != CCmode + || XEXP (SET_SRC (set), 1) != const0_rtx)) + return false; + break; + case CCmode: + if (req_mode == CCGCmode) + return false; + /* FALLTHRU */ + case CCGCmode: + if (req_mode == CCGOCmode || req_mode == CCNOmode) + return false; + /* FALLTHRU */ + case CCGOCmode: + if (req_mode == CCZmode) + return false; + /* FALLTHRU */ + case CCZmode: + break; + + case CCAmode: + case CCCmode: + case CCOmode: + case CCSmode: + if (set_mode != req_mode) + return false; + break; + + default: + gcc_unreachable (); + } + + return GET_MODE (SET_SRC (set)) == set_mode; +} + +/* Generate insn patterns to do an integer compare of OPERANDS. */ + +static rtx +ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1) +{ + enum machine_mode cmpmode; + rtx tmp, flags; + + cmpmode = SELECT_CC_MODE (code, op0, op1); + flags = gen_rtx_REG (cmpmode, FLAGS_REG); + + /* This is very simple, but making the interface the same as in the + FP case makes the rest of the code easier. */ + tmp = gen_rtx_COMPARE (cmpmode, op0, op1); + emit_insn (gen_rtx_SET (VOIDmode, flags, tmp)); + + /* Return the test that should be put into the flags user, i.e. 
+ the bcc, scc, or cmov instruction. */ + return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx); +} + +/* Figure out whether to use ordered or unordered fp comparisons. + Return the appropriate mode to use. */ + +enum machine_mode +ix86_fp_compare_mode (enum rtx_code code ATTRIBUTE_UNUSED) +{ + /* ??? In order to make all comparisons reversible, we do all comparisons + non-trapping when compiling for IEEE. Once gcc is able to distinguish + all forms trapping and nontrapping comparisons, we can make inequality + comparisons trapping again, since it results in better code when using + FCOM based compares. */ + return TARGET_IEEE_FP ? CCFPUmode : CCFPmode; +} + +enum machine_mode +ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1) +{ + enum machine_mode mode = GET_MODE (op0); + + if (SCALAR_FLOAT_MODE_P (mode)) + { + gcc_assert (!DECIMAL_FLOAT_MODE_P (mode)); + return ix86_fp_compare_mode (code); + } + + switch (code) + { + /* Only zero flag is needed. */ + case EQ: /* ZF=0 */ + case NE: /* ZF!=0 */ + return CCZmode; + /* Codes needing carry flag. */ + case GEU: /* CF=0 */ + case LTU: /* CF=1 */ + /* Detect overflow checks. They need just the carry flag. */ + if (GET_CODE (op0) == PLUS + && rtx_equal_p (op1, XEXP (op0, 0))) + return CCCmode; + else + return CCmode; + case GTU: /* CF=0 & ZF=0 */ + case LEU: /* CF=1 | ZF=1 */ + /* Detect overflow checks. They need just the carry flag. */ + if (GET_CODE (op0) == MINUS + && rtx_equal_p (op1, XEXP (op0, 0))) + return CCCmode; + else + return CCmode; + /* Codes possibly doable only with sign flag when + comparing against zero. */ + case GE: /* SF=OF or SF=0 */ + case LT: /* SF<>OF or SF=1 */ + if (op1 == const0_rtx) + return CCGOCmode; + else + /* For other cases Carry flag is not required. */ + return CCGCmode; + /* Codes doable only with sign flag when comparing + against zero, but we miss jump instruction for it + so we need to use relational tests against overflow + that thus needs to be zero. */ + case GT: /* ZF=0 & SF=OF */ + case LE: /* ZF=1 | SF<>OF */ + if (op1 == const0_rtx) + return CCNOmode; + else + return CCGCmode; + /* strcmp pattern do (use flags) and combine may ask us for proper + mode. */ + case USE: + return CCmode; + default: + gcc_unreachable (); + } +} + +/* Return the fixed registers used for condition codes. */ + +static bool +ix86_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2) +{ + *p1 = FLAGS_REG; + *p2 = FPSR_REG; + return true; +} + +/* If two condition code modes are compatible, return a condition code + mode which is compatible with both. Otherwise, return + VOIDmode. */ + +static enum machine_mode +ix86_cc_modes_compatible (enum machine_mode m1, enum machine_mode m2) +{ + if (m1 == m2) + return m1; + + if (GET_MODE_CLASS (m1) != MODE_CC || GET_MODE_CLASS (m2) != MODE_CC) + return VOIDmode; + + if ((m1 == CCGCmode && m2 == CCGOCmode) + || (m1 == CCGOCmode && m2 == CCGCmode)) + return CCGCmode; + + switch (m1) + { + default: + gcc_unreachable (); + + case CCmode: + case CCGCmode: + case CCGOCmode: + case CCNOmode: + case CCAmode: + case CCCmode: + case CCOmode: + case CCSmode: + case CCZmode: + switch (m2) + { + default: + return VOIDmode; + + case CCmode: + case CCGCmode: + case CCGOCmode: + case CCNOmode: + case CCAmode: + case CCCmode: + case CCOmode: + case CCSmode: + case CCZmode: + return CCmode; + } + + case CCFPmode: + case CCFPUmode: + /* These are only compatible with themselves, which we already + checked above. 
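+   (Mixing a floating-point CC mode with any integer CC mode therefore
+   yields VOIDmode, i.e. "not compatible", while two different integer CC
+   modes collapse to plain CCmode unless a more specific combination was
+   caught earlier.)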
*/ + return VOIDmode; + } +} + + +/* Return a comparison we can do and that it is equivalent to + swap_condition (code) apart possibly from orderedness. + But, never change orderedness if TARGET_IEEE_FP, returning + UNKNOWN in that case if necessary. */ + +static enum rtx_code +ix86_fp_swap_condition (enum rtx_code code) +{ + switch (code) + { + case GT: /* GTU - CF=0 & ZF=0 */ + return TARGET_IEEE_FP ? UNKNOWN : UNLT; + case GE: /* GEU - CF=0 */ + return TARGET_IEEE_FP ? UNKNOWN : UNLE; + case UNLT: /* LTU - CF=1 */ + return TARGET_IEEE_FP ? UNKNOWN : GT; + case UNLE: /* LEU - CF=1 | ZF=1 */ + return TARGET_IEEE_FP ? UNKNOWN : GE; + default: + return swap_condition (code); + } +} + +/* Return cost of comparison CODE using the best strategy for performance. + All following functions do use number of instructions as a cost metrics. + In future this should be tweaked to compute bytes for optimize_size and + take into account performance of various instructions on various CPUs. */ + +static int +ix86_fp_comparison_cost (enum rtx_code code) +{ + int arith_cost; + + /* The cost of code using bit-twiddling on %ah. */ + switch (code) + { + case UNLE: + case UNLT: + case LTGT: + case GT: + case GE: + case UNORDERED: + case ORDERED: + case UNEQ: + arith_cost = 4; + break; + case LT: + case NE: + case EQ: + case UNGE: + arith_cost = TARGET_IEEE_FP ? 5 : 4; + break; + case LE: + case UNGT: + arith_cost = TARGET_IEEE_FP ? 6 : 4; + break; + default: + gcc_unreachable (); + } + + switch (ix86_fp_comparison_strategy (code)) + { + case IX86_FPCMP_COMI: + return arith_cost > 4 ? 3 : 2; + case IX86_FPCMP_SAHF: + return arith_cost > 4 ? 4 : 3; + default: + return arith_cost; + } +} + +/* Return strategy to use for floating-point. We assume that fcomi is always + preferrable where available, since that is also true when looking at size + (2 bytes, vs. 3 for fnstsw+sahf and at least 5 for fnstsw+test). */ + +enum ix86_fpcmp_strategy +ix86_fp_comparison_strategy (enum rtx_code code ATTRIBUTE_UNUSED) +{ + /* Do fcomi/sahf based test when profitable. */ + + if (TARGET_CMOVE) + return IX86_FPCMP_COMI; + + if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_function_for_size_p (cfun))) + return IX86_FPCMP_SAHF; + + return IX86_FPCMP_ARITH; +} + +/* Swap, force into registers, or otherwise massage the two operands + to a fp comparison. The operands are updated in place; the new + comparison code is returned. */ + +static enum rtx_code +ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1) +{ + enum machine_mode fpcmp_mode = ix86_fp_compare_mode (code); + rtx op0 = *pop0, op1 = *pop1; + enum machine_mode op_mode = GET_MODE (op0); + int is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode); + + /* All of the unordered compare instructions only work on registers. + The same is true of the fcomi compare instructions. The XFmode + compare instructions require registers except when comparing + against zero or when converting operand 1 from fixed point to + floating point. */ + + if (!is_sse + && (fpcmp_mode == CCFPUmode + || (op_mode == XFmode + && ! (standard_80387_constant_p (op0) == 1 + || standard_80387_constant_p (op1) == 1) + && GET_CODE (op1) != FLOAT) + || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI)) + { + op0 = force_reg (op_mode, op0); + op1 = force_reg (op_mode, op1); + } + else + { + /* %%% We only allow op1 in memory; op0 must be st(0). So swap + things around if they appear profitable, otherwise force op0 + into a register. 
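+ Note that ix86_fp_swap_condition above refuses to change the
+ orderedness of the comparison under TARGET_IEEE_FP and returns
+ UNKNOWN instead; in that case the operands are left in place and
+ op0 is simply forced into a register below.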
*/ + + if (standard_80387_constant_p (op0) == 0 + || (MEM_P (op0) + && ! (standard_80387_constant_p (op1) == 0 + || MEM_P (op1)))) + { + enum rtx_code new_code = ix86_fp_swap_condition (code); + if (new_code != UNKNOWN) + { + rtx tmp; + tmp = op0, op0 = op1, op1 = tmp; + code = new_code; + } + } + + if (!REG_P (op0)) + op0 = force_reg (op_mode, op0); + + if (CONSTANT_P (op1)) + { + int tmp = standard_80387_constant_p (op1); + if (tmp == 0) + op1 = validize_mem (force_const_mem (op_mode, op1)); + else if (tmp == 1) + { + if (TARGET_CMOVE) + op1 = force_reg (op_mode, op1); + } + else + op1 = force_reg (op_mode, op1); + } + } + + /* Try to rearrange the comparison to make it cheaper. */ + if (ix86_fp_comparison_cost (code) + > ix86_fp_comparison_cost (swap_condition (code)) + && (REG_P (op1) || can_create_pseudo_p ())) + { + rtx tmp; + tmp = op0, op0 = op1, op1 = tmp; + code = swap_condition (code); + if (!REG_P (op0)) + op0 = force_reg (op_mode, op0); + } + + *pop0 = op0; + *pop1 = op1; + return code; +} + +/* Convert comparison codes we use to represent FP comparison to integer + code that will result in proper branch. Return UNKNOWN if no such code + is available. */ + +enum rtx_code +ix86_fp_compare_code_to_integer (enum rtx_code code) +{ + switch (code) + { + case GT: + return GTU; + case GE: + return GEU; + case ORDERED: + case UNORDERED: + return code; + break; + case UNEQ: + return EQ; + break; + case UNLT: + return LTU; + break; + case UNLE: + return LEU; + break; + case LTGT: + return NE; + break; + default: + return UNKNOWN; + } +} + +/* Generate insn patterns to do a floating point compare of OPERANDS. */ + +static rtx +ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1, rtx scratch) +{ + enum machine_mode fpcmp_mode, intcmp_mode; + rtx tmp, tmp2; + + fpcmp_mode = ix86_fp_compare_mode (code); + code = ix86_prepare_fp_compare_args (code, &op0, &op1); + + /* Do fcomi/sahf based test when profitable. */ + switch (ix86_fp_comparison_strategy (code)) + { + case IX86_FPCMP_COMI: + intcmp_mode = fpcmp_mode; + tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1); + tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG), + tmp); + emit_insn (tmp); + break; + + case IX86_FPCMP_SAHF: + intcmp_mode = fpcmp_mode; + tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1); + tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG), + tmp); + + if (!scratch) + scratch = gen_reg_rtx (HImode); + tmp2 = gen_rtx_CLOBBER (VOIDmode, scratch); + emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, tmp2))); + break; + + case IX86_FPCMP_ARITH: + /* Sadness wrt reg-stack pops killing fpsr -- gotta get fnstsw first. */ + tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1); + tmp2 = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW); + if (!scratch) + scratch = gen_reg_rtx (HImode); + emit_insn (gen_rtx_SET (VOIDmode, scratch, tmp2)); + + /* In the unordered case, we have to check C2 for NaN's, which + doesn't happen to work out to anything nice combination-wise. + So do some bit twiddling on the value we've got in AH to come + up with an appropriate set of condition codes. 
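+ After fnstsw the x87 status flags land in AH as C0 = 0x01,
+ C2 = 0x04 and C3 = 0x40, so the 0x45 masks below test C3|C2|C0
+ at once, 0x44 tests C3|C2, 0x05 tests C2|C0, and 0x40, 0x04 and
+ 0x01 test the individual flags.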
*/ + + intcmp_mode = CCNOmode; + switch (code) + { + case GT: + case UNGT: + if (code == GT || !TARGET_IEEE_FP) + { + emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45))); + code = EQ; + } + else + { + emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45))); + emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx)); + emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44))); + intcmp_mode = CCmode; + code = GEU; + } + break; + case LT: + case UNLT: + if (code == LT && TARGET_IEEE_FP) + { + emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45))); + emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx)); + intcmp_mode = CCmode; + code = EQ; + } + else + { + emit_insn (gen_testqi_ext_ccno_0 (scratch, const1_rtx)); + code = NE; + } + break; + case GE: + case UNGE: + if (code == GE || !TARGET_IEEE_FP) + { + emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x05))); + code = EQ; + } + else + { + emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45))); + emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch, const1_rtx)); + code = NE; + } + break; + case LE: + case UNLE: + if (code == LE && TARGET_IEEE_FP) + { + emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45))); + emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx)); + emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40))); + intcmp_mode = CCmode; + code = LTU; + } + else + { + emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45))); + code = NE; + } + break; + case EQ: + case UNEQ: + if (code == EQ && TARGET_IEEE_FP) + { + emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45))); + emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40))); + intcmp_mode = CCmode; + code = EQ; + } + else + { + emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40))); + code = NE; + } + break; + case NE: + case LTGT: + if (code == NE && TARGET_IEEE_FP) + { + emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45))); + emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch, + GEN_INT (0x40))); + code = NE; + } + else + { + emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40))); + code = EQ; + } + break; + + case UNORDERED: + emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04))); + code = NE; + break; + case ORDERED: + emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04))); + code = EQ; + break; + + default: + gcc_unreachable (); + } + break; + + default: + gcc_unreachable(); + } + + /* Return the test that should be put into the flags user, i.e. + the bcc, scc, or cmov instruction. 
*/ + return gen_rtx_fmt_ee (code, VOIDmode, + gen_rtx_REG (intcmp_mode, FLAGS_REG), + const0_rtx); +} + +static rtx +ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1) +{ + rtx ret; + + if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC) + ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1); + + else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0))) + { + gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0))); + ret = ix86_expand_fp_compare (code, op0, op1, NULL_RTX); + } + else + ret = ix86_expand_int_compare (code, op0, op1); + + return ret; +} + +void +ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label) +{ + enum machine_mode mode = GET_MODE (op0); + rtx tmp; + + switch (mode) + { + case SFmode: + case DFmode: + case XFmode: + case QImode: + case HImode: + case SImode: + simple: + tmp = ix86_expand_compare (code, op0, op1); + tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp, + gen_rtx_LABEL_REF (VOIDmode, label), + pc_rtx); + emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp)); + return; + + case DImode: + if (TARGET_64BIT) + goto simple; + case TImode: + /* Expand DImode branch into multiple compare+branch. */ + { + rtx lo[2], hi[2], label2; + enum rtx_code code1, code2, code3; + enum machine_mode submode; + + if (CONSTANT_P (op0) && !CONSTANT_P (op1)) + { + tmp = op0, op0 = op1, op1 = tmp; + code = swap_condition (code); + } + + split_double_mode (mode, &op0, 1, lo+0, hi+0); + split_double_mode (mode, &op1, 1, lo+1, hi+1); + + submode = mode == DImode ? SImode : DImode; + + /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to + avoid two branches. This costs one extra insn, so disable when + optimizing for size. */ + + if ((code == EQ || code == NE) + && (!optimize_insn_for_size_p () + || hi[1] == const0_rtx || lo[1] == const0_rtx)) + { + rtx xor0, xor1; + + xor1 = hi[0]; + if (hi[1] != const0_rtx) + xor1 = expand_binop (submode, xor_optab, xor1, hi[1], + NULL_RTX, 0, OPTAB_WIDEN); + + xor0 = lo[0]; + if (lo[1] != const0_rtx) + xor0 = expand_binop (submode, xor_optab, xor0, lo[1], + NULL_RTX, 0, OPTAB_WIDEN); + + tmp = expand_binop (submode, ior_optab, xor1, xor0, + NULL_RTX, 0, OPTAB_WIDEN); + + ix86_expand_branch (code, tmp, const0_rtx, label); + return; + } + + /* Otherwise, if we are doing less-than or greater-or-equal-than, + op1 is a constant and the low word is zero, then we can just + examine the high word. Similarly for low word -1 and + less-or-equal-than or greater-than. */ + + if (CONST_INT_P (hi[1])) + switch (code) + { + case LT: case LTU: case GE: case GEU: + if (lo[1] == const0_rtx) + { + ix86_expand_branch (code, hi[0], hi[1], label); + return; + } + break; + case LE: case LEU: case GT: case GTU: + if (lo[1] == constm1_rtx) + { + ix86_expand_branch (code, hi[0], hi[1], label); + return; + } + break; + default: + break; + } + + /* Otherwise, we need two or three jumps. 
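+ code1 and code2 are applied to the high words (either may be
+ UNKNOWN, in which case that branch is omitted, as happens for EQ
+ and NE), while code3 is the unsigned variant of the comparison
+ used on the low words.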
*/ + + label2 = gen_label_rtx (); + + code1 = code; + code2 = swap_condition (code); + code3 = unsigned_condition (code); + + switch (code) + { + case LT: case GT: case LTU: case GTU: + break; + + case LE: code1 = LT; code2 = GT; break; + case GE: code1 = GT; code2 = LT; break; + case LEU: code1 = LTU; code2 = GTU; break; + case GEU: code1 = GTU; code2 = LTU; break; + + case EQ: code1 = UNKNOWN; code2 = NE; break; + case NE: code2 = UNKNOWN; break; + + default: + gcc_unreachable (); + } + + /* + * a < b => + * if (hi(a) < hi(b)) goto true; + * if (hi(a) > hi(b)) goto false; + * if (lo(a) < lo(b)) goto true; + * false: + */ + + if (code1 != UNKNOWN) + ix86_expand_branch (code1, hi[0], hi[1], label); + if (code2 != UNKNOWN) + ix86_expand_branch (code2, hi[0], hi[1], label2); + + ix86_expand_branch (code3, lo[0], lo[1], label); + + if (code2 != UNKNOWN) + emit_label (label2); + return; + } + + default: + gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC); + goto simple; + } +} + +/* Split branch based on floating point condition. */ +void +ix86_split_fp_branch (enum rtx_code code, rtx op1, rtx op2, + rtx target1, rtx target2, rtx tmp, rtx pushed) +{ + rtx condition; + rtx i; + + if (target2 != pc_rtx) + { + rtx tmp = target2; + code = reverse_condition_maybe_unordered (code); + target2 = target1; + target1 = tmp; + } + + condition = ix86_expand_fp_compare (code, op1, op2, + tmp); + + /* Remove pushed operand from stack. */ + if (pushed) + ix86_free_from_memory (GET_MODE (pushed)); + + i = emit_jump_insn (gen_rtx_SET + (VOIDmode, pc_rtx, + gen_rtx_IF_THEN_ELSE (VOIDmode, + condition, target1, target2))); + if (split_branch_probability >= 0) + add_reg_note (i, REG_BR_PROB, GEN_INT (split_branch_probability)); +} + +void +ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1) +{ + rtx ret; + + gcc_assert (GET_MODE (dest) == QImode); + + ret = ix86_expand_compare (code, op0, op1); + PUT_MODE (ret, QImode); + emit_insn (gen_rtx_SET (VOIDmode, dest, ret)); +} + +/* Expand comparison setting or clearing carry flag. Return true when + successful and set pop for the operation. */ +static bool +ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop) +{ + enum machine_mode mode = + GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1); + + /* Do not handle double-mode compares that go through special path. */ + if (mode == (TARGET_64BIT ? TImode : DImode)) + return false; + + if (SCALAR_FLOAT_MODE_P (mode)) + { + rtx compare_op, compare_seq; + + gcc_assert (!DECIMAL_FLOAT_MODE_P (mode)); + + /* Shortcut: following common codes never translate + into carry flag compares. */ + if (code == EQ || code == NE || code == UNEQ || code == LTGT + || code == ORDERED || code == UNORDERED) + return false; + + /* These comparisons require zero flag; swap operands so they won't. */ + if ((code == GT || code == UNLE || code == LE || code == UNGT) + && !TARGET_IEEE_FP) + { + rtx tmp = op0; + op0 = op1; + op1 = tmp; + code = swap_condition (code); + } + + /* Try to expand the comparison and verify that we end up with + carry flag based comparison. This fails to be true only when + we decide to expand comparison using arithmetic that is not + too common scenario. 
*/ + start_sequence (); + compare_op = ix86_expand_fp_compare (code, op0, op1, NULL_RTX); + compare_seq = get_insns (); + end_sequence (); + + if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode + || GET_MODE (XEXP (compare_op, 0)) == CCFPUmode) + code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op)); + else + code = GET_CODE (compare_op); + + if (code != LTU && code != GEU) + return false; + + emit_insn (compare_seq); + *pop = compare_op; + return true; + } + + if (!INTEGRAL_MODE_P (mode)) + return false; + + switch (code) + { + case LTU: + case GEU: + break; + + /* Convert a==0 into (unsigned)a<1. */ + case EQ: + case NE: + if (op1 != const0_rtx) + return false; + op1 = const1_rtx; + code = (code == EQ ? LTU : GEU); + break; + + /* Convert a>b into b=b-1. */ + case GTU: + case LEU: + if (CONST_INT_P (op1)) + { + op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0)); + /* Bail out on overflow. We still can swap operands but that + would force loading of the constant into register. */ + if (op1 == const0_rtx + || !x86_64_immediate_operand (op1, GET_MODE (op1))) + return false; + code = (code == GTU ? GEU : LTU); + } + else + { + rtx tmp = op1; + op1 = op0; + op0 = tmp; + code = (code == GTU ? LTU : GEU); + } + break; + + /* Convert a>=0 into (unsigned)a<0x80000000. */ + case LT: + case GE: + if (mode == DImode || op1 != const0_rtx) + return false; + op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode); + code = (code == LT ? GEU : LTU); + break; + case LE: + case GT: + if (mode == DImode || op1 != constm1_rtx) + return false; + op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode); + code = (code == LE ? GEU : LTU); + break; + + default: + return false; + } + /* Swapping operands may cause constant to appear as first operand. */ + if (!nonimmediate_operand (op0, VOIDmode)) + { + if (!can_create_pseudo_p ()) + return false; + op0 = force_reg (mode, op0); + } + *pop = ix86_expand_compare (code, op0, op1); + gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU); + return true; +} + +bool +ix86_expand_int_movcc (rtx operands[]) +{ + enum rtx_code code = GET_CODE (operands[1]), compare_code; + rtx compare_seq, compare_op; + enum machine_mode mode = GET_MODE (operands[0]); + bool sign_bit_compare_p = false; + rtx op0 = XEXP (operands[1], 0); + rtx op1 = XEXP (operands[1], 1); + + start_sequence (); + compare_op = ix86_expand_compare (code, op0, op1); + compare_seq = get_insns (); + end_sequence (); + + compare_code = GET_CODE (compare_op); + + if ((op1 == const0_rtx && (code == GE || code == LT)) + || (op1 == constm1_rtx && (code == GT || code == LE))) + sign_bit_compare_p = true; + + /* Don't attempt mode expansion here -- if we had to expand 5 or 6 + HImode insns, we'd be swallowed in word prefix ops. */ + + if ((mode != HImode || TARGET_FAST_PREFIX) + && (mode != (TARGET_64BIT ? TImode : DImode)) + && CONST_INT_P (operands[2]) + && CONST_INT_P (operands[3])) + { + rtx out = operands[0]; + HOST_WIDE_INT ct = INTVAL (operands[2]); + HOST_WIDE_INT cf = INTVAL (operands[3]); + HOST_WIDE_INT diff; + + diff = ct - cf; + /* Sign bit compares are better done using shifts than we do by using + sbb. */ + if (sign_bit_compare_p + || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op)) + { + /* Detect overlap between destination and compare sources. 
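+ When the comparison can be expressed through the carry flag, the
+ code below first materializes it as a 0 / -1 mask with a cmp/sbb
+ pair (see the instruction sketches in the comments further down)
+ and then rebuilds ct and cf from that mask using add/or/not/and;
+ a fresh temporary is used whenever OUT overlaps one of the
+ compare operands.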
*/ + rtx tmp = out; + + if (!sign_bit_compare_p) + { + rtx flags; + bool fpcmp = false; + + compare_code = GET_CODE (compare_op); + + flags = XEXP (compare_op, 0); + + if (GET_MODE (flags) == CCFPmode + || GET_MODE (flags) == CCFPUmode) + { + fpcmp = true; + compare_code + = ix86_fp_compare_code_to_integer (compare_code); + } + + /* To simplify rest of code, restrict to the GEU case. */ + if (compare_code == LTU) + { + HOST_WIDE_INT tmp = ct; + ct = cf; + cf = tmp; + compare_code = reverse_condition (compare_code); + code = reverse_condition (code); + } + else + { + if (fpcmp) + PUT_CODE (compare_op, + reverse_condition_maybe_unordered + (GET_CODE (compare_op))); + else + PUT_CODE (compare_op, + reverse_condition (GET_CODE (compare_op))); + } + diff = ct - cf; + + if (reg_overlap_mentioned_p (out, op0) + || reg_overlap_mentioned_p (out, op1)) + tmp = gen_reg_rtx (mode); + + if (mode == DImode) + emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op)); + else + emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp), + flags, compare_op)); + } + else + { + if (code == GT || code == GE) + code = reverse_condition (code); + else + { + HOST_WIDE_INT tmp = ct; + ct = cf; + cf = tmp; + diff = ct - cf; + } + tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1); + } + + if (diff == 1) + { + /* + * cmpl op0,op1 + * sbbl dest,dest + * [addl dest, ct] + * + * Size 5 - 8. + */ + if (ct) + tmp = expand_simple_binop (mode, PLUS, + tmp, GEN_INT (ct), + copy_rtx (tmp), 1, OPTAB_DIRECT); + } + else if (cf == -1) + { + /* + * cmpl op0,op1 + * sbbl dest,dest + * orl $ct, dest + * + * Size 8. + */ + tmp = expand_simple_binop (mode, IOR, + tmp, GEN_INT (ct), + copy_rtx (tmp), 1, OPTAB_DIRECT); + } + else if (diff == -1 && ct) + { + /* + * cmpl op0,op1 + * sbbl dest,dest + * notl dest + * [addl dest, cf] + * + * Size 8 - 11. + */ + tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1); + if (cf) + tmp = expand_simple_binop (mode, PLUS, + copy_rtx (tmp), GEN_INT (cf), + copy_rtx (tmp), 1, OPTAB_DIRECT); + } + else + { + /* + * cmpl op0,op1 + * sbbl dest,dest + * [notl dest] + * andl cf - ct, dest + * [addl dest, ct] + * + * Size 8 - 11. + */ + + if (cf == 0) + { + cf = ct; + ct = 0; + tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1); + } + + tmp = expand_simple_binop (mode, AND, + copy_rtx (tmp), + gen_int_mode (cf - ct, mode), + copy_rtx (tmp), 1, OPTAB_DIRECT); + if (ct) + tmp = expand_simple_binop (mode, PLUS, + copy_rtx (tmp), GEN_INT (ct), + copy_rtx (tmp), 1, OPTAB_DIRECT); + } + + if (!rtx_equal_p (tmp, out)) + emit_move_insn (copy_rtx (out), copy_rtx (tmp)); + + return true; + } + + if (diff < 0) + { + enum machine_mode cmp_mode = GET_MODE (op0); + + HOST_WIDE_INT tmp; + tmp = ct, ct = cf, cf = tmp; + diff = -diff; + + if (SCALAR_FLOAT_MODE_P (cmp_mode)) + { + gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode)); + + /* We may be reversing unordered compare to normal compare, that + is not valid in general (we may convert non-trapping condition + to trapping one), however on i386 we currently emit all + comparisons unordered. 
*/ + compare_code = reverse_condition_maybe_unordered (compare_code); + code = reverse_condition_maybe_unordered (code); + } + else + { + compare_code = reverse_condition (compare_code); + code = reverse_condition (code); + } + } + + compare_code = UNKNOWN; + if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT + && CONST_INT_P (op1)) + { + if (op1 == const0_rtx + && (code == LT || code == GE)) + compare_code = code; + else if (op1 == constm1_rtx) + { + if (code == LE) + compare_code = LT; + else if (code == GT) + compare_code = GE; + } + } + + /* Optimize dest = (op0 < 0) ? -1 : cf. */ + if (compare_code != UNKNOWN + && GET_MODE (op0) == GET_MODE (out) + && (cf == -1 || ct == -1)) + { + /* If lea code below could be used, only optimize + if it results in a 2 insn sequence. */ + + if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8 + || diff == 3 || diff == 5 || diff == 9) + || (compare_code == LT && ct == -1) + || (compare_code == GE && cf == -1)) + { + /* + * notl op1 (if necessary) + * sarl $31, op1 + * orl cf, op1 + */ + if (ct != -1) + { + cf = ct; + ct = -1; + code = reverse_condition (code); + } + + out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1); + + out = expand_simple_binop (mode, IOR, + out, GEN_INT (cf), + out, 1, OPTAB_DIRECT); + if (out != operands[0]) + emit_move_insn (operands[0], out); + + return true; + } + } + + + if ((diff == 1 || diff == 2 || diff == 4 || diff == 8 + || diff == 3 || diff == 5 || diff == 9) + && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL) + && (mode != DImode + || x86_64_immediate_operand (GEN_INT (cf), VOIDmode))) + { + /* + * xorl dest,dest + * cmpl op1,op2 + * setcc dest + * lea cf(dest*(ct-cf)),dest + * + * Size 14. + * + * This also catches the degenerate setcc-only case. + */ + + rtx tmp; + int nops; + + out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1); + + nops = 0; + /* On x86_64 the lea instruction operates on Pmode, so we need + to get arithmetics done in proper mode to match. */ + if (diff == 1) + tmp = copy_rtx (out); + else + { + rtx out1; + out1 = copy_rtx (out); + tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1)); + nops++; + if (diff & 1) + { + tmp = gen_rtx_PLUS (mode, tmp, out1); + nops++; + } + } + if (cf != 0) + { + tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf)); + nops++; + } + if (!rtx_equal_p (tmp, out)) + { + if (nops == 1) + out = force_operand (tmp, copy_rtx (out)); + else + emit_insn (gen_rtx_SET (VOIDmode, copy_rtx (out), copy_rtx (tmp))); + } + if (!rtx_equal_p (out, operands[0])) + emit_move_insn (operands[0], copy_rtx (out)); + + return true; + } + + /* + * General case: Jumpful: + * xorl dest,dest cmpl op1, op2 + * cmpl op1, op2 movl ct, dest + * setcc dest jcc 1f + * decl dest movl cf, dest + * andl (cf-ct),dest 1: + * addl ct,dest + * + * Size 20. Size 14. + * + * This is reasonably steep, but branch mispredict costs are + * high on modern cpus, so consider failing only if optimizing + * for space. + */ + + if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL)) + && BRANCH_COST (optimize_insn_for_speed_p (), + false) >= 2) + { + if (cf == 0) + { + enum machine_mode cmp_mode = GET_MODE (op0); + + cf = ct; + ct = 0; + + if (SCALAR_FLOAT_MODE_P (cmp_mode)) + { + gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode)); + + /* We may be reversing unordered compare to normal compare, + that is not valid in general (we may convert non-trapping + condition to trapping one), however on i386 we currently + emit all comparisons unordered. 
*/ + code = reverse_condition_maybe_unordered (code); + } + else + { + code = reverse_condition (code); + if (compare_code != UNKNOWN) + compare_code = reverse_condition (compare_code); + } + } + + if (compare_code != UNKNOWN) + { + /* notl op1 (if needed) + sarl $31, op1 + andl (cf-ct), op1 + addl ct, op1 + + For x < 0 (resp. x <= -1) there will be no notl, + so if possible swap the constants to get rid of the + complement. + True/false will be -1/0 while code below (store flag + followed by decrement) is 0/-1, so the constants need + to be exchanged once more. */ + + if (compare_code == GE || !cf) + { + code = reverse_condition (code); + compare_code = LT; + } + else + { + HOST_WIDE_INT tmp = cf; + cf = ct; + ct = tmp; + } + + out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1); + } + else + { + out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1); + + out = expand_simple_binop (mode, PLUS, copy_rtx (out), + constm1_rtx, + copy_rtx (out), 1, OPTAB_DIRECT); + } + + out = expand_simple_binop (mode, AND, copy_rtx (out), + gen_int_mode (cf - ct, mode), + copy_rtx (out), 1, OPTAB_DIRECT); + if (ct) + out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct), + copy_rtx (out), 1, OPTAB_DIRECT); + if (!rtx_equal_p (out, operands[0])) + emit_move_insn (operands[0], copy_rtx (out)); + + return true; + } + } + + if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL)) + { + /* Try a few things more with specific constants and a variable. */ + + optab op; + rtx var, orig_out, out, tmp; + + if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2) + return false; + + /* If one of the two operands is an interesting constant, load a + constant with the above and mask it in with a logical operation. */ + + if (CONST_INT_P (operands[2])) + { + var = operands[3]; + if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx) + operands[3] = constm1_rtx, op = and_optab; + else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx) + operands[3] = const0_rtx, op = ior_optab; + else + return false; + } + else if (CONST_INT_P (operands[3])) + { + var = operands[2]; + if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx) + operands[2] = constm1_rtx, op = and_optab; + else if (INTVAL (operands[3]) == -1 && operands[3] != const0_rtx) + operands[2] = const0_rtx, op = ior_optab; + else + return false; + } + else + return false; + + orig_out = operands[0]; + tmp = gen_reg_rtx (mode); + operands[0] = tmp; + + /* Recurse to get the constant loaded. */ + if (ix86_expand_int_movcc (operands) == 0) + return false; + + /* Mask in the interesting variable. */ + out = expand_binop (mode, op, var, tmp, orig_out, 0, + OPTAB_WIDEN); + if (!rtx_equal_p (out, orig_out)) + emit_move_insn (copy_rtx (orig_out), copy_rtx (out)); + + return true; + } + + /* + * For comparison with above, + * + * movl cf,dest + * movl ct,tmp + * cmpl op1,op2 + * cmovcc tmp,dest + * + * Size 15. + */ + + if (! nonimmediate_operand (operands[2], mode)) + operands[2] = force_reg (mode, operands[2]); + if (! nonimmediate_operand (operands[3], mode)) + operands[3] = force_reg (mode, operands[3]); + + if (! register_operand (operands[2], VOIDmode) + && (mode == QImode + || ! register_operand (operands[3], VOIDmode))) + operands[2] = force_reg (mode, operands[2]); + + if (mode == QImode + && ! 
register_operand (operands[3], VOIDmode)) + operands[3] = force_reg (mode, operands[3]); + + emit_insn (compare_seq); + emit_insn (gen_rtx_SET (VOIDmode, operands[0], + gen_rtx_IF_THEN_ELSE (mode, + compare_op, operands[2], + operands[3]))); + return true; +} + +/* Swap, force into registers, or otherwise massage the two operands + to an sse comparison with a mask result. Thus we differ a bit from + ix86_prepare_fp_compare_args which expects to produce a flags result. + + The DEST operand exists to help determine whether to commute commutative + operators. The POP0/POP1 operands are updated in place. The new + comparison code is returned, or UNKNOWN if not implementable. */ + +static enum rtx_code +ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code, + rtx *pop0, rtx *pop1) +{ + rtx tmp; + + /* AVX supports all the needed comparisons, no need to swap arguments + nor help reload. */ + if (TARGET_AVX) + return code; + + switch (code) + { + case LTGT: + case UNEQ: + /* We have no LTGT as an operator. We could implement it with + NE & ORDERED, but this requires an extra temporary. It's + not clear that it's worth it. */ + return UNKNOWN; + + case LT: + case LE: + case UNGT: + case UNGE: + /* These are supported directly. */ + break; + + case EQ: + case NE: + case UNORDERED: + case ORDERED: + /* For commutative operators, try to canonicalize the destination + operand to be first in the comparison - this helps reload to + avoid extra moves. */ + if (!dest || !rtx_equal_p (dest, *pop1)) + break; + /* FALLTHRU */ + + case GE: + case GT: + case UNLE: + case UNLT: + /* These are not supported directly. Swap the comparison operands + to transform into something that is supported. */ + tmp = *pop0; + *pop0 = *pop1; + *pop1 = tmp; + code = swap_condition (code); + break; + + default: + gcc_unreachable (); + } + + return code; +} + +/* Detect conditional moves that exactly match min/max operational + semantics. Note that this is IEEE safe, as long as we don't + interchange the operands. + + Returns FALSE if this conditional move doesn't match a MIN/MAX, + and TRUE if the operation is successful and instructions are emitted. */ + +static bool +ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0, + rtx cmp_op1, rtx if_true, rtx if_false) +{ + enum machine_mode mode; + bool is_min; + rtx tmp; + + if (code == LT) + ; + else if (code == UNGE) + { + tmp = if_true; + if_true = if_false; + if_false = tmp; + } + else + return false; + + if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false)) + is_min = true; + else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false)) + is_min = false; + else + return false; + + mode = GET_MODE (dest); + + /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here, + but MODE may be a vector mode and thus not appropriate. */ + if (!flag_finite_math_only || !flag_unsafe_math_optimizations) + { + int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX; + rtvec v; + + if_true = force_reg (mode, if_true); + v = gen_rtvec (2, if_true, if_false); + tmp = gen_rtx_UNSPEC (mode, v, u); + } + else + { + code = is_min ? SMIN : SMAX; + tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false); + } + + emit_insn (gen_rtx_SET (VOIDmode, dest, tmp)); + return true; +} + +/* Expand an sse vector comparison. Return the register with the result. 
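+ Each element of the result is either all-ones or all-zeros, which
+ is what the AND/ANDN/IOR blending in ix86_expand_sse_movcc below
+ relies on.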
*/ + +static rtx +ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1, + rtx op_true, rtx op_false) +{ + enum machine_mode mode = GET_MODE (dest); + rtx x; + + cmp_op0 = force_reg (mode, cmp_op0); + if (!nonimmediate_operand (cmp_op1, mode)) + cmp_op1 = force_reg (mode, cmp_op1); + + if (optimize + || reg_overlap_mentioned_p (dest, op_true) + || reg_overlap_mentioned_p (dest, op_false)) + dest = gen_reg_rtx (mode); + + x = gen_rtx_fmt_ee (code, mode, cmp_op0, cmp_op1); + emit_insn (gen_rtx_SET (VOIDmode, dest, x)); + + return dest; +} + +/* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical + operations. This is used for both scalar and vector conditional moves. */ + +static void +ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false) +{ + enum machine_mode mode = GET_MODE (dest); + rtx t2, t3, x; + + if (op_false == CONST0_RTX (mode)) + { + op_true = force_reg (mode, op_true); + x = gen_rtx_AND (mode, cmp, op_true); + emit_insn (gen_rtx_SET (VOIDmode, dest, x)); + } + else if (op_true == CONST0_RTX (mode)) + { + op_false = force_reg (mode, op_false); + x = gen_rtx_NOT (mode, cmp); + x = gen_rtx_AND (mode, x, op_false); + emit_insn (gen_rtx_SET (VOIDmode, dest, x)); + } + else if (TARGET_XOP) + { + op_true = force_reg (mode, op_true); + + if (!nonimmediate_operand (op_false, mode)) + op_false = force_reg (mode, op_false); + + emit_insn (gen_rtx_SET (mode, dest, + gen_rtx_IF_THEN_ELSE (mode, cmp, + op_true, + op_false))); + } + else + { + op_true = force_reg (mode, op_true); + op_false = force_reg (mode, op_false); + + t2 = gen_reg_rtx (mode); + if (optimize) + t3 = gen_reg_rtx (mode); + else + t3 = dest; + + x = gen_rtx_AND (mode, op_true, cmp); + emit_insn (gen_rtx_SET (VOIDmode, t2, x)); + + x = gen_rtx_NOT (mode, cmp); + x = gen_rtx_AND (mode, x, op_false); + emit_insn (gen_rtx_SET (VOIDmode, t3, x)); + + x = gen_rtx_IOR (mode, t3, t2); + emit_insn (gen_rtx_SET (VOIDmode, dest, x)); + } +} + +/* Expand a floating-point conditional move. Return true if successful. */ + +bool +ix86_expand_fp_movcc (rtx operands[]) +{ + enum machine_mode mode = GET_MODE (operands[0]); + enum rtx_code code = GET_CODE (operands[1]); + rtx tmp, compare_op; + rtx op0 = XEXP (operands[1], 0); + rtx op1 = XEXP (operands[1], 1); + + if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode)) + { + enum machine_mode cmode; + + /* Since we've no cmove for sse registers, don't force bad register + allocation just to gain access to it. Deny movcc when the + comparison mode doesn't match the move mode. */ + cmode = GET_MODE (op0); + if (cmode == VOIDmode) + cmode = GET_MODE (op1); + if (cmode != mode) + return false; + + code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1); + if (code == UNKNOWN) + return false; + + if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1, + operands[2], operands[3])) + return true; + + tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1, + operands[2], operands[3]); + ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]); + return true; + } + + /* The floating point conditional move instructions don't directly + support conditions resulting from a signed integer comparison. 
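+ In that case the comparison is first reduced to a 0/1 value in a
+ QImode register via ix86_expand_setcc, and the fcmov is then
+ conditioned on that value being non-zero.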
*/ + + compare_op = ix86_expand_compare (code, op0, op1); + if (!fcmov_comparison_operator (compare_op, VOIDmode)) + { + tmp = gen_reg_rtx (QImode); + ix86_expand_setcc (tmp, code, op0, op1); + + compare_op = ix86_expand_compare (NE, tmp, const0_rtx); + } + + emit_insn (gen_rtx_SET (VOIDmode, operands[0], + gen_rtx_IF_THEN_ELSE (mode, compare_op, + operands[2], operands[3]))); + + return true; +} + +/* Expand a floating-point vector conditional move; a vcond operation + rather than a movcc operation. */ + +bool +ix86_expand_fp_vcond (rtx operands[]) +{ + enum rtx_code code = GET_CODE (operands[3]); + rtx cmp; + + code = ix86_prepare_sse_fp_compare_args (operands[0], code, + &operands[4], &operands[5]); + if (code == UNKNOWN) + { + rtx temp; + switch (GET_CODE (operands[3])) + { + case LTGT: + temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4], + operands[5], operands[0], operands[0]); + cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4], + operands[5], operands[1], operands[2]); + code = AND; + break; + case UNEQ: + temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4], + operands[5], operands[0], operands[0]); + cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4], + operands[5], operands[1], operands[2]); + code = IOR; + break; + default: + gcc_unreachable (); + } + cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1, + OPTAB_DIRECT); + ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]); + return true; + } + + if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4], + operands[5], operands[1], operands[2])) + return true; + + cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5], + operands[1], operands[2]); + ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]); + return true; +} + +/* Expand a signed/unsigned integral vector conditional move. */ + +bool +ix86_expand_int_vcond (rtx operands[]) +{ + enum machine_mode mode = GET_MODE (operands[0]); + enum rtx_code code = GET_CODE (operands[3]); + bool negate = false; + rtx x, cop0, cop1; + + cop0 = operands[4]; + cop1 = operands[5]; + + /* XOP supports all of the comparisons on all vector int types. */ + if (!TARGET_XOP) + { + /* Canonicalize the comparison to EQ, GT, GTU. */ + switch (code) + { + case EQ: + case GT: + case GTU: + break; + + case NE: + case LE: + case LEU: + code = reverse_condition (code); + negate = true; + break; + + case GE: + case GEU: + code = reverse_condition (code); + negate = true; + /* FALLTHRU */ + + case LT: + case LTU: + code = swap_condition (code); + x = cop0, cop0 = cop1, cop1 = x; + break; + + default: + gcc_unreachable (); + } + + /* Only SSE4.1/SSE4.2 supports V2DImode. */ + if (mode == V2DImode) + { + switch (code) + { + case EQ: + /* SSE4.1 supports EQ. */ + if (!TARGET_SSE4_1) + return false; + break; + + case GT: + case GTU: + /* SSE4.2 supports GT/GTU. */ + if (!TARGET_SSE4_2) + return false; + break; + + default: + gcc_unreachable (); + } + } + + /* Unsigned parallel compare is not supported by the hardware. + Play some tricks to turn this into a signed comparison + against 0. */ + if (code == GTU) + { + cop0 = force_reg (mode, cop0); + + switch (mode) + { + case V4SImode: + case V2DImode: + { + rtx t1, t2, mask; + rtx (*gen_sub3) (rtx, rtx, rtx); + + /* Subtract (-(INT MAX) - 1) from both operands to make + them signed. */ + mask = ix86_build_signbit_mask (mode, true, false); + gen_sub3 = (mode == V4SImode + ? 
gen_subv4si3 : gen_subv2di3); + t1 = gen_reg_rtx (mode); + emit_insn (gen_sub3 (t1, cop0, mask)); + + t2 = gen_reg_rtx (mode); + emit_insn (gen_sub3 (t2, cop1, mask)); + + cop0 = t1; + cop1 = t2; + code = GT; + } + break; + + case V16QImode: + case V8HImode: + /* Perform a parallel unsigned saturating subtraction. */ + x = gen_reg_rtx (mode); + emit_insn (gen_rtx_SET (VOIDmode, x, + gen_rtx_US_MINUS (mode, cop0, cop1))); + + cop0 = x; + cop1 = CONST0_RTX (mode); + code = EQ; + negate = !negate; + break; + + default: + gcc_unreachable (); + } + } + } + + x = ix86_expand_sse_cmp (operands[0], code, cop0, cop1, + operands[1+negate], operands[2-negate]); + + ix86_expand_sse_movcc (operands[0], x, operands[1+negate], + operands[2-negate]); + return true; +} + +/* Unpack OP[1] into the next wider integer vector type. UNSIGNED_P is + true if we should do zero extension, else sign extension. HIGH_P is + true if we want the N/2 high elements, else the low elements. */ + +void +ix86_expand_sse_unpack (rtx operands[2], bool unsigned_p, bool high_p) +{ + enum machine_mode imode = GET_MODE (operands[1]); + rtx (*unpack)(rtx, rtx, rtx); + rtx se, dest; + + switch (imode) + { + case V16QImode: + if (high_p) + unpack = gen_vec_interleave_highv16qi; + else + unpack = gen_vec_interleave_lowv16qi; + break; + case V8HImode: + if (high_p) + unpack = gen_vec_interleave_highv8hi; + else + unpack = gen_vec_interleave_lowv8hi; + break; + case V4SImode: + if (high_p) + unpack = gen_vec_interleave_highv4si; + else + unpack = gen_vec_interleave_lowv4si; + break; + default: + gcc_unreachable (); + } + + dest = gen_lowpart (imode, operands[0]); + + if (unsigned_p) + se = force_reg (imode, CONST0_RTX (imode)); + else + se = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode), + operands[1], pc_rtx, pc_rtx); + + emit_insn (unpack (dest, operands[1], se)); +} + +/* This function performs the same task as ix86_expand_sse_unpack, + but with SSE4.1 instructions. */ + +void +ix86_expand_sse4_unpack (rtx operands[2], bool unsigned_p, bool high_p) +{ + enum machine_mode imode = GET_MODE (operands[1]); + rtx (*unpack)(rtx, rtx); + rtx src, dest; + + switch (imode) + { + case V16QImode: + if (unsigned_p) + unpack = gen_sse4_1_zero_extendv8qiv8hi2; + else + unpack = gen_sse4_1_sign_extendv8qiv8hi2; + break; + case V8HImode: + if (unsigned_p) + unpack = gen_sse4_1_zero_extendv4hiv4si2; + else + unpack = gen_sse4_1_sign_extendv4hiv4si2; + break; + case V4SImode: + if (unsigned_p) + unpack = gen_sse4_1_zero_extendv2siv2di2; + else + unpack = gen_sse4_1_sign_extendv2siv2di2; + break; + default: + gcc_unreachable (); + } + + dest = operands[0]; + if (high_p) + { + /* Shift higher 8 bytes to lower 8 bytes. */ + src = gen_reg_rtx (imode); + emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, src), + gen_lowpart (V1TImode, operands[1]), + GEN_INT (64))); + } + else + src = operands[1]; + + emit_insn (unpack (dest, src)); +} + +/* Expand conditional increment or decrement using adb/sbb instructions. + The default case using setcc followed by the conditional move can be + done by generic code. 
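+ Only an increment/decrement by 1 (operands[3] of +1 or -1) is
+ handled, and only when ix86_expand_carry_flag_compare can express
+ the condition through the carry flag; the operation then becomes
+ a single adc or sbb with a 0 or -1 immediate.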
*/ +bool +ix86_expand_int_addcc (rtx operands[]) +{ + enum rtx_code code = GET_CODE (operands[1]); + rtx flags; + rtx (*insn)(rtx, rtx, rtx, rtx, rtx); + rtx compare_op; + rtx val = const0_rtx; + bool fpcmp = false; + enum machine_mode mode; + rtx op0 = XEXP (operands[1], 0); + rtx op1 = XEXP (operands[1], 1); + + if (operands[3] != const1_rtx + && operands[3] != constm1_rtx) + return false; + if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op)) + return false; + code = GET_CODE (compare_op); + + flags = XEXP (compare_op, 0); + + if (GET_MODE (flags) == CCFPmode + || GET_MODE (flags) == CCFPUmode) + { + fpcmp = true; + code = ix86_fp_compare_code_to_integer (code); + } + + if (code != LTU) + { + val = constm1_rtx; + if (fpcmp) + PUT_CODE (compare_op, + reverse_condition_maybe_unordered + (GET_CODE (compare_op))); + else + PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op))); + } + + mode = GET_MODE (operands[0]); + + /* Construct either adc or sbb insn. */ + if ((code == LTU) == (operands[3] == constm1_rtx)) + { + switch (mode) + { + case QImode: + insn = gen_subqi3_carry; + break; + case HImode: + insn = gen_subhi3_carry; + break; + case SImode: + insn = gen_subsi3_carry; + break; + case DImode: + insn = gen_subdi3_carry; + break; + default: + gcc_unreachable (); + } + } + else + { + switch (mode) + { + case QImode: + insn = gen_addqi3_carry; + break; + case HImode: + insn = gen_addhi3_carry; + break; + case SImode: + insn = gen_addsi3_carry; + break; + case DImode: + insn = gen_adddi3_carry; + break; + default: + gcc_unreachable (); + } + } + emit_insn (insn (operands[0], operands[2], val, flags, compare_op)); + + return true; +} + + +/* Split operands 0 and 1 into half-mode parts. Similar to split_double_mode, + but works for floating pointer parameters and nonoffsetable memories. + For pushes, it returns just stack offsets; the values will be saved + in the right order. Maximally three parts are generated. */ + +static int +ix86_split_to_parts (rtx operand, rtx *parts, enum machine_mode mode) +{ + int size; + + if (!TARGET_64BIT) + size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4; + else + size = (GET_MODE_SIZE (mode) + 4) / 8; + + gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand))); + gcc_assert (size >= 2 && size <= 4); + + /* Optimize constant pool reference to immediates. This is used by fp + moves, that force all constants to memory to allow combining. */ + if (MEM_P (operand) && MEM_READONLY_P (operand)) + { + rtx tmp = maybe_get_pool_constant (operand); + if (tmp) + operand = tmp; + } + + if (MEM_P (operand) && !offsettable_memref_p (operand)) + { + /* The only non-offsetable memories we handle are pushes. */ + int ok = push_operand (operand, VOIDmode); + + gcc_assert (ok); + + operand = copy_rtx (operand); + PUT_MODE (operand, Pmode); + parts[0] = parts[1] = parts[2] = parts[3] = operand; + return size; + } + + if (GET_CODE (operand) == CONST_VECTOR) + { + enum machine_mode imode = int_mode_for_mode (mode); + /* Caution: if we looked through a constant pool memory above, + the operand may actually have a different mode now. That's + ok, since we want to pun this all the way back to an integer. 
*/ + operand = simplify_subreg (imode, operand, GET_MODE (operand), 0); + gcc_assert (operand != NULL); + mode = imode; + } + + if (!TARGET_64BIT) + { + if (mode == DImode) + split_double_mode (mode, &operand, 1, &parts[0], &parts[1]); + else + { + int i; + + if (REG_P (operand)) + { + gcc_assert (reload_completed); + for (i = 0; i < size; i++) + parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i); + } + else if (offsettable_memref_p (operand)) + { + operand = adjust_address (operand, SImode, 0); + parts[0] = operand; + for (i = 1; i < size; i++) + parts[i] = adjust_address (operand, SImode, 4 * i); + } + else if (GET_CODE (operand) == CONST_DOUBLE) + { + REAL_VALUE_TYPE r; + long l[4]; + + REAL_VALUE_FROM_CONST_DOUBLE (r, operand); + switch (mode) + { + case TFmode: + real_to_target (l, &r, mode); + parts[3] = gen_int_mode (l[3], SImode); + parts[2] = gen_int_mode (l[2], SImode); + break; + case XFmode: + REAL_VALUE_TO_TARGET_LONG_DOUBLE (r, l); + parts[2] = gen_int_mode (l[2], SImode); + break; + case DFmode: + REAL_VALUE_TO_TARGET_DOUBLE (r, l); + break; + default: + gcc_unreachable (); + } + parts[1] = gen_int_mode (l[1], SImode); + parts[0] = gen_int_mode (l[0], SImode); + } + else + gcc_unreachable (); + } + } + else + { + if (mode == TImode) + split_double_mode (mode, &operand, 1, &parts[0], &parts[1]); + if (mode == XFmode || mode == TFmode) + { + enum machine_mode upper_mode = mode==XFmode ? SImode : DImode; + if (REG_P (operand)) + { + gcc_assert (reload_completed); + parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0); + parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1); + } + else if (offsettable_memref_p (operand)) + { + operand = adjust_address (operand, DImode, 0); + parts[0] = operand; + parts[1] = adjust_address (operand, upper_mode, 8); + } + else if (GET_CODE (operand) == CONST_DOUBLE) + { + REAL_VALUE_TYPE r; + long l[4]; + + REAL_VALUE_FROM_CONST_DOUBLE (r, operand); + real_to_target (l, &r, mode); + + /* Do not use shift by 32 to avoid warning on 32bit systems. */ + if (HOST_BITS_PER_WIDE_INT >= 64) + parts[0] + = gen_int_mode + ((l[0] & (((HOST_WIDE_INT) 2 << 31) - 1)) + + ((((HOST_WIDE_INT) l[1]) << 31) << 1), + DImode); + else + parts[0] = immed_double_const (l[0], l[1], DImode); + + if (upper_mode == SImode) + parts[1] = gen_int_mode (l[2], SImode); + else if (HOST_BITS_PER_WIDE_INT >= 64) + parts[1] + = gen_int_mode + ((l[2] & (((HOST_WIDE_INT) 2 << 31) - 1)) + + ((((HOST_WIDE_INT) l[3]) << 31) << 1), + DImode); + else + parts[1] = immed_double_const (l[2], l[3], DImode); + } + else + gcc_unreachable (); + } + } + + return size; +} + +/* Emit insns to perform a move or push of DI, DF, XF, and TF values. + Return false when normal moves are needed; true when all required + insns have been emitted. Operands 2-4 contain the input values + int the correct order; operands 5-7 contain the output values. */ + +void +ix86_split_long_move (rtx operands[]) +{ + rtx part[2][4]; + int nparts, i, j; + int push = 0; + int collisions = 0; + enum machine_mode mode = GET_MODE (operands[0]); + bool collisionparts[4]; + + /* The DFmode expanders may ask us to move double. + For 64bit target this is single move. By hiding the fact + here we simplify i386.md splitters. */ + if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8) + { + /* Optimize constant pool reference to immediates. This is used by + fp moves, that force all constants to memory to allow combining. 
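+ get_pool_constant retrieves the original constant here, which is
+ then punned to DImode and moved directly.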
*/ + + if (MEM_P (operands[1]) + && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF + && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0))) + operands[1] = get_pool_constant (XEXP (operands[1], 0)); + if (push_operand (operands[0], VOIDmode)) + { + operands[0] = copy_rtx (operands[0]); + PUT_MODE (operands[0], Pmode); + } + else + operands[0] = gen_lowpart (DImode, operands[0]); + operands[1] = gen_lowpart (DImode, operands[1]); + emit_move_insn (operands[0], operands[1]); + return; + } + + /* The only non-offsettable memory we handle is push. */ + if (push_operand (operands[0], VOIDmode)) + push = 1; + else + gcc_assert (!MEM_P (operands[0]) + || offsettable_memref_p (operands[0])); + + nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0])); + ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0])); + + /* When emitting push, take care for source operands on the stack. */ + if (push && MEM_P (operands[1]) + && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1])) + { + rtx src_base = XEXP (part[1][nparts - 1], 0); + + /* Compensate for the stack decrement by 4. */ + if (!TARGET_64BIT && nparts == 3 + && mode == XFmode && TARGET_128BIT_LONG_DOUBLE) + src_base = plus_constant (src_base, 4); + + /* src_base refers to the stack pointer and is + automatically decreased by emitted push. */ + for (i = 0; i < nparts; i++) + part[1][i] = change_address (part[1][i], + GET_MODE (part[1][i]), src_base); + } + + /* We need to do copy in the right order in case an address register + of the source overlaps the destination. */ + if (REG_P (part[0][0]) && MEM_P (part[1][0])) + { + rtx tmp; + + for (i = 0; i < nparts; i++) + { + collisionparts[i] + = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0)); + if (collisionparts[i]) + collisions++; + } + + /* Collision in the middle part can be handled by reordering. */ + if (collisions == 1 && nparts == 3 && collisionparts [1]) + { + tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp; + tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp; + } + else if (collisions == 1 + && nparts == 4 + && (collisionparts [1] || collisionparts [2])) + { + if (collisionparts [1]) + { + tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp; + tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp; + } + else + { + tmp = part[0][2]; part[0][2] = part[0][3]; part[0][3] = tmp; + tmp = part[1][2]; part[1][2] = part[1][3]; part[1][3] = tmp; + } + } + + /* If there are more collisions, we can't handle it by reordering. + Do an lea to the last part and use only one colliding move. */ + else if (collisions > 1) + { + rtx base; + + collisions = 1; + + base = part[0][nparts - 1]; + + /* Handle the case when the last part isn't valid for lea. + Happens in 64-bit mode storing the 12-byte XFmode. 
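+ The SImode upper part is simply re-declared as a Pmode register
+ so that the address computation below is well-formed.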
*/ + if (GET_MODE (base) != Pmode) + base = gen_rtx_REG (Pmode, REGNO (base)); + + emit_insn (gen_rtx_SET (VOIDmode, base, XEXP (part[1][0], 0))); + part[1][0] = replace_equiv_address (part[1][0], base); + for (i = 1; i < nparts; i++) + { + tmp = plus_constant (base, UNITS_PER_WORD * i); + part[1][i] = replace_equiv_address (part[1][i], tmp); + } + } + } + + if (push) + { + if (!TARGET_64BIT) + { + if (nparts == 3) + { + if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode) + emit_insn (gen_addsi3 (stack_pointer_rtx, + stack_pointer_rtx, GEN_INT (-4))); + emit_move_insn (part[0][2], part[1][2]); + } + else if (nparts == 4) + { + emit_move_insn (part[0][3], part[1][3]); + emit_move_insn (part[0][2], part[1][2]); + } + } + else + { + /* In 64bit mode we don't have 32bit push available. In case this is + register, it is OK - we will just use larger counterpart. We also + retype memory - these comes from attempt to avoid REX prefix on + moving of second half of TFmode value. */ + if (GET_MODE (part[1][1]) == SImode) + { + switch (GET_CODE (part[1][1])) + { + case MEM: + part[1][1] = adjust_address (part[1][1], DImode, 0); + break; + + case REG: + part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1])); + break; + + default: + gcc_unreachable (); + } + + if (GET_MODE (part[1][0]) == SImode) + part[1][0] = part[1][1]; + } + } + emit_move_insn (part[0][1], part[1][1]); + emit_move_insn (part[0][0], part[1][0]); + return; + } + + /* Choose correct order to not overwrite the source before it is copied. */ + if ((REG_P (part[0][0]) + && REG_P (part[1][1]) + && (REGNO (part[0][0]) == REGNO (part[1][1]) + || (nparts == 3 + && REGNO (part[0][0]) == REGNO (part[1][2])) + || (nparts == 4 + && REGNO (part[0][0]) == REGNO (part[1][3])))) + || (collisions > 0 + && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0)))) + { + for (i = 0, j = nparts - 1; i < nparts; i++, j--) + { + operands[2 + i] = part[0][j]; + operands[6 + i] = part[1][j]; + } + } + else + { + for (i = 0; i < nparts; i++) + { + operands[2 + i] = part[0][i]; + operands[6 + i] = part[1][i]; + } + } + + /* If optimizing for size, attempt to locally unCSE nonzero constants. */ + if (optimize_insn_for_size_p ()) + { + for (j = 0; j < nparts - 1; j++) + if (CONST_INT_P (operands[6 + j]) + && operands[6 + j] != const0_rtx + && REG_P (operands[2 + j])) + for (i = j; i < nparts - 1; i++) + if (CONST_INT_P (operands[7 + i]) + && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j])) + operands[7 + i] = operands[2 + j]; + } + + for (i = 0; i < nparts; i++) + emit_move_insn (operands[2 + i], operands[6 + i]); + + return; +} + +/* Helper function of ix86_split_ashl used to generate an SImode/DImode + left shift by a constant, either using a single shift or + a sequence of add instructions. */ + +static void +ix86_expand_ashl_const (rtx operand, int count, enum machine_mode mode) +{ + rtx (*insn)(rtx, rtx, rtx); + + if (count == 1 + || (count * ix86_cost->add <= ix86_cost->shift_const + && !optimize_insn_for_size_p ())) + { + insn = mode == DImode ? gen_addsi3 : gen_adddi3; + while (count-- > 0) + emit_insn (insn (operand, operand, operand)); + } + else + { + insn = mode == DImode ? 
gen_ashlsi3 : gen_ashldi3; + emit_insn (insn (operand, operand, GEN_INT (count))); + } +} + +void +ix86_split_ashl (rtx *operands, rtx scratch, enum machine_mode mode) +{ + rtx (*gen_ashl3)(rtx, rtx, rtx); + rtx (*gen_shld)(rtx, rtx, rtx); + int half_width = GET_MODE_BITSIZE (mode) >> 1; + + rtx low[2], high[2]; + int count; + + if (CONST_INT_P (operands[2])) + { + split_double_mode (mode, operands, 2, low, high); + count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1); + + if (count >= half_width) + { + emit_move_insn (high[0], low[1]); + emit_move_insn (low[0], const0_rtx); + + if (count > half_width) + ix86_expand_ashl_const (high[0], count - half_width, mode); + } + else + { + gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld; + + if (!rtx_equal_p (operands[0], operands[1])) + emit_move_insn (operands[0], operands[1]); + + emit_insn (gen_shld (high[0], low[0], GEN_INT (count))); + ix86_expand_ashl_const (low[0], count, mode); + } + return; + } + + split_double_mode (mode, operands, 1, low, high); + + gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3; + + if (operands[1] == const1_rtx) + { + /* Assuming we've chosen a QImode capable registers, then 1 << N + can be done with two 32/64-bit shifts, no branches, no cmoves. */ + if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0])) + { + rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG); + + ix86_expand_clear (low[0]); + ix86_expand_clear (high[0]); + emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width))); + + d = gen_lowpart (QImode, low[0]); + d = gen_rtx_STRICT_LOW_PART (VOIDmode, d); + s = gen_rtx_EQ (QImode, flags, const0_rtx); + emit_insn (gen_rtx_SET (VOIDmode, d, s)); + + d = gen_lowpart (QImode, high[0]); + d = gen_rtx_STRICT_LOW_PART (VOIDmode, d); + s = gen_rtx_NE (QImode, flags, const0_rtx); + emit_insn (gen_rtx_SET (VOIDmode, d, s)); + } + + /* Otherwise, we can get the same results by manually performing + a bit extract operation on bit 5/6, and then performing the two + shifts. The two methods of getting 0/1 into low/high are exactly + the same size. Avoiding the shift in the bit extract case helps + pentium4 a bit; no one else seems to care much either way. */ + else + { + enum machine_mode half_mode; + rtx (*gen_lshr3)(rtx, rtx, rtx); + rtx (*gen_and3)(rtx, rtx, rtx); + rtx (*gen_xor3)(rtx, rtx, rtx); + HOST_WIDE_INT bits; + rtx x; + + if (mode == DImode) + { + half_mode = SImode; + gen_lshr3 = gen_lshrsi3; + gen_and3 = gen_andsi3; + gen_xor3 = gen_xorsi3; + bits = 5; + } + else + { + half_mode = DImode; + gen_lshr3 = gen_lshrdi3; + gen_and3 = gen_anddi3; + gen_xor3 = gen_xordi3; + bits = 6; + } + + if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ()) + x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]); + else + x = gen_lowpart (half_mode, operands[2]); + emit_insn (gen_rtx_SET (VOIDmode, high[0], x)); + + emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits))); + emit_insn (gen_and3 (high[0], high[0], const1_rtx)); + emit_move_insn (low[0], high[0]); + emit_insn (gen_xor3 (low[0], low[0], const1_rtx)); + } + + emit_insn (gen_ashl3 (low[0], low[0], operands[2])); + emit_insn (gen_ashl3 (high[0], high[0], operands[2])); + return; + } + + if (operands[1] == constm1_rtx) + { + /* For -1 << N, we can avoid the shld instruction, because we + know that we're shifting 0...31/63 ones into a -1. 
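+ So the high half can simply start out as -1 as well (copied from
+ the low half when optimizing for size); only the low half gets
+ the variable shift, and the shift-adjust sequence at the end
+ handles counts of half_width or more.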
*/ + emit_move_insn (low[0], constm1_rtx); + if (optimize_insn_for_size_p ()) + emit_move_insn (high[0], low[0]); + else + emit_move_insn (high[0], constm1_rtx); + } + else + { + gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld; + + if (!rtx_equal_p (operands[0], operands[1])) + emit_move_insn (operands[0], operands[1]); + + split_double_mode (mode, operands, 1, low, high); + emit_insn (gen_shld (high[0], low[0], operands[2])); + } + + emit_insn (gen_ashl3 (low[0], low[0], operands[2])); + + if (TARGET_CMOVE && scratch) + { + rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx) + = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1; + + ix86_expand_clear (scratch); + emit_insn (gen_x86_shift_adj_1 (high[0], low[0], operands[2], scratch)); + } + else + { + rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx) + = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2; + + emit_insn (gen_x86_shift_adj_2 (high[0], low[0], operands[2])); + } +} + +void +ix86_split_ashr (rtx *operands, rtx scratch, enum machine_mode mode) +{ + rtx (*gen_ashr3)(rtx, rtx, rtx) + = mode == DImode ? gen_ashrsi3 : gen_ashrdi3; + rtx (*gen_shrd)(rtx, rtx, rtx); + int half_width = GET_MODE_BITSIZE (mode) >> 1; + + rtx low[2], high[2]; + int count; + + if (CONST_INT_P (operands[2])) + { + split_double_mode (mode, operands, 2, low, high); + count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1); + + if (count == GET_MODE_BITSIZE (mode) - 1) + { + emit_move_insn (high[0], high[1]); + emit_insn (gen_ashr3 (high[0], high[0], + GEN_INT (half_width - 1))); + emit_move_insn (low[0], high[0]); + + } + else if (count >= half_width) + { + emit_move_insn (low[0], high[1]); + emit_move_insn (high[0], low[0]); + emit_insn (gen_ashr3 (high[0], high[0], + GEN_INT (half_width - 1))); + + if (count > half_width) + emit_insn (gen_ashr3 (low[0], low[0], + GEN_INT (count - half_width))); + } + else + { + gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd; + + if (!rtx_equal_p (operands[0], operands[1])) + emit_move_insn (operands[0], operands[1]); + + emit_insn (gen_shrd (low[0], high[0], GEN_INT (count))); + emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count))); + } + } + else + { + gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd; + + if (!rtx_equal_p (operands[0], operands[1])) + emit_move_insn (operands[0], operands[1]); + + split_double_mode (mode, operands, 1, low, high); + + emit_insn (gen_shrd (low[0], high[0], operands[2])); + emit_insn (gen_ashr3 (high[0], high[0], operands[2])); + + if (TARGET_CMOVE && scratch) + { + rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx) + = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1; + + emit_move_insn (scratch, high[0]); + emit_insn (gen_ashr3 (scratch, scratch, + GEN_INT (half_width - 1))); + emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2], + scratch)); + } + else + { + rtx (*gen_x86_shift_adj_3)(rtx, rtx, rtx) + = mode == DImode ? gen_x86_shiftsi_adj_3 : gen_x86_shiftdi_adj_3; + + emit_insn (gen_x86_shift_adj_3 (low[0], high[0], operands[2])); + } + } +} + +void +ix86_split_lshr (rtx *operands, rtx scratch, enum machine_mode mode) +{ + rtx (*gen_lshr3)(rtx, rtx, rtx) + = mode == DImode ? 
gen_lshrsi3 : gen_lshrdi3; + rtx (*gen_shrd)(rtx, rtx, rtx); + int half_width = GET_MODE_BITSIZE (mode) >> 1; + + rtx low[2], high[2]; + int count; + + if (CONST_INT_P (operands[2])) + { + split_double_mode (mode, operands, 2, low, high); + count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1); + + if (count >= half_width) + { + emit_move_insn (low[0], high[1]); + ix86_expand_clear (high[0]); + + if (count > half_width) + emit_insn (gen_lshr3 (low[0], low[0], + GEN_INT (count - half_width))); + } + else + { + gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd; + + if (!rtx_equal_p (operands[0], operands[1])) + emit_move_insn (operands[0], operands[1]); + + emit_insn (gen_shrd (low[0], high[0], GEN_INT (count))); + emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count))); + } + } + else + { + gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd; + + if (!rtx_equal_p (operands[0], operands[1])) + emit_move_insn (operands[0], operands[1]); + + split_double_mode (mode, operands, 1, low, high); + + emit_insn (gen_shrd (low[0], high[0], operands[2])); + emit_insn (gen_lshr3 (high[0], high[0], operands[2])); + + if (TARGET_CMOVE && scratch) + { + rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx) + = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1; + + ix86_expand_clear (scratch); + emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2], + scratch)); + } + else + { + rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx) + = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2; + + emit_insn (gen_x86_shift_adj_2 (low[0], high[0], operands[2])); + } + } +} + +/* Predict just emitted jump instruction to be taken with probability PROB. */ +static void +predict_jump (int prob) +{ + rtx insn = get_last_insn (); + gcc_assert (JUMP_P (insn)); + add_reg_note (insn, REG_BR_PROB, GEN_INT (prob)); +} + +/* Helper function for the string operations below. Dest VARIABLE whether + it is aligned to VALUE bytes. If true, jump to the label. */ +static rtx +ix86_expand_aligntest (rtx variable, int value, bool epilogue) +{ + rtx label = gen_label_rtx (); + rtx tmpcount = gen_reg_rtx (GET_MODE (variable)); + if (GET_MODE (variable) == DImode) + emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value))); + else + emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value))); + emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable), + 1, label); + if (epilogue) + predict_jump (REG_BR_PROB_BASE * 50 / 100); + else + predict_jump (REG_BR_PROB_BASE * 90 / 100); + return label; +} + +/* Adjust COUNTER by the VALUE. */ +static void +ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value) +{ + rtx (*gen_add)(rtx, rtx, rtx) + = GET_MODE (countreg) == DImode ? gen_adddi3 : gen_addsi3; + + emit_insn (gen_add (countreg, countreg, GEN_INT (-value))); +} + +/* Zero extend possibly SImode EXP to Pmode register. */ +rtx +ix86_zero_extend_to_Pmode (rtx exp) +{ + rtx r; + if (GET_MODE (exp) == VOIDmode) + return force_reg (Pmode, exp); + if (GET_MODE (exp) == Pmode) + return copy_to_mode_reg (Pmode, exp); + r = gen_reg_rtx (Pmode); + emit_insn (gen_zero_extendsidi2 (r, exp)); + return r; +} + +/* Divide COUNTREG by SCALE. 
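+ SCALE is the chunk size moved per iteration and is expected to be a power of two, so a run-time count is divided with a logical right shift (e.g. dividing by 4 becomes a shift by 2), while a constant count is divided directly.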
*/ +static rtx +scale_counter (rtx countreg, int scale) +{ + rtx sc; + + if (scale == 1) + return countreg; + if (CONST_INT_P (countreg)) + return GEN_INT (INTVAL (countreg) / scale); + gcc_assert (REG_P (countreg)); + + sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg, + GEN_INT (exact_log2 (scale)), + NULL, 1, OPTAB_DIRECT); + return sc; +} + +/* Return mode for the memcpy/memset loop counter. Prefer SImode over + DImode for constant loop counts. */ + +static enum machine_mode +counter_mode (rtx count_exp) +{ + if (GET_MODE (count_exp) != VOIDmode) + return GET_MODE (count_exp); + if (!CONST_INT_P (count_exp)) + return Pmode; + if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff)) + return DImode; + return SImode; +} + +/* When SRCPTR is non-NULL, output simple loop to move memory + pointer to SRCPTR to DESTPTR via chunks of MODE unrolled UNROLL times, + overall size is COUNT specified in bytes. When SRCPTR is NULL, output the + equivalent loop to set memory by VALUE (supposed to be in MODE). + + The size is rounded down to whole number of chunk size moved at once. + SRCMEM and DESTMEM provide MEMrtx to feed proper aliasing info. */ + + +static void +expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem, + rtx destptr, rtx srcptr, rtx value, + rtx count, enum machine_mode mode, int unroll, + int expected_size) +{ + rtx out_label, top_label, iter, tmp; + enum machine_mode iter_mode = counter_mode (count); + rtx piece_size = GEN_INT (GET_MODE_SIZE (mode) * unroll); + rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1)); + rtx size; + rtx x_addr; + rtx y_addr; + int i; + + top_label = gen_label_rtx (); + out_label = gen_label_rtx (); + iter = gen_reg_rtx (iter_mode); + + size = expand_simple_binop (iter_mode, AND, count, piece_size_mask, + NULL, 1, OPTAB_DIRECT); + /* Those two should combine. */ + if (piece_size == const1_rtx) + { + emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode, + true, out_label); + predict_jump (REG_BR_PROB_BASE * 10 / 100); + } + emit_move_insn (iter, const0_rtx); + + emit_label (top_label); + + tmp = convert_modes (Pmode, iter_mode, iter, true); + x_addr = gen_rtx_PLUS (Pmode, destptr, tmp); + destmem = change_address (destmem, mode, x_addr); + + if (srcmem) + { + y_addr = gen_rtx_PLUS (Pmode, srcptr, copy_rtx (tmp)); + srcmem = change_address (srcmem, mode, y_addr); + + /* When unrolling for chips that reorder memory reads and writes, + we can save registers by using single temporary. + Also using 4 temporaries is overkill in 32bit mode. 
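+ (That single-temporary variant is currently disabled; the enabled path below loads up to four chunks into temporaries first and only then stores them, so the loads do not depend on the preceding stores.)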
*/ + if (!TARGET_64BIT && 0) + { + for (i = 0; i < unroll; i++) + { + if (i) + { + destmem = + adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode)); + srcmem = + adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode)); + } + emit_move_insn (destmem, srcmem); + } + } + else + { + rtx tmpreg[4]; + gcc_assert (unroll <= 4); + for (i = 0; i < unroll; i++) + { + tmpreg[i] = gen_reg_rtx (mode); + if (i) + { + srcmem = + adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode)); + } + emit_move_insn (tmpreg[i], srcmem); + } + for (i = 0; i < unroll; i++) + { + if (i) + { + destmem = + adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode)); + } + emit_move_insn (destmem, tmpreg[i]); + } + } + } + else + for (i = 0; i < unroll; i++) + { + if (i) + destmem = + adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode)); + emit_move_insn (destmem, value); + } + + tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter, + true, OPTAB_LIB_WIDEN); + if (tmp != iter) + emit_move_insn (iter, tmp); + + emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode, + true, top_label); + if (expected_size != -1) + { + expected_size /= GET_MODE_SIZE (mode) * unroll; + if (expected_size == 0) + predict_jump (0); + else if (expected_size > REG_BR_PROB_BASE) + predict_jump (REG_BR_PROB_BASE - 1); + else + predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) / expected_size); + } + else + predict_jump (REG_BR_PROB_BASE * 80 / 100); + iter = ix86_zero_extend_to_Pmode (iter); + tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr, + true, OPTAB_LIB_WIDEN); + if (tmp != destptr) + emit_move_insn (destptr, tmp); + if (srcptr) + { + tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr, + true, OPTAB_LIB_WIDEN); + if (tmp != srcptr) + emit_move_insn (srcptr, tmp); + } + emit_label (out_label); +} + +/* Output "rep; mov" instruction. + Arguments have same meaning as for previous function */ +static void +expand_movmem_via_rep_mov (rtx destmem, rtx srcmem, + rtx destptr, rtx srcptr, + rtx count, + enum machine_mode mode) +{ + rtx destexp; + rtx srcexp; + rtx countreg; + + /* If the size is known, it is shorter to use rep movs. 
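+ When a byte-wise copy has a constant count that is a multiple of four it is therefore promoted to SImode below, e.g. a known 12 byte copy becomes three 4 byte iterations of rep movs instead of twelve 1 byte ones.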
*/ + if (mode == QImode && CONST_INT_P (count) + && !(INTVAL (count) & 3)) + mode = SImode; + + if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode) + destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0); + if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode) + srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0); + countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode))); + if (mode != QImode) + { + destexp = gen_rtx_ASHIFT (Pmode, countreg, + GEN_INT (exact_log2 (GET_MODE_SIZE (mode)))); + destexp = gen_rtx_PLUS (Pmode, destexp, destptr); + srcexp = gen_rtx_ASHIFT (Pmode, countreg, + GEN_INT (exact_log2 (GET_MODE_SIZE (mode)))); + srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr); + } + else + { + destexp = gen_rtx_PLUS (Pmode, destptr, countreg); + srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg); + } + if (CONST_INT_P (count)) + { + count = GEN_INT (INTVAL (count) + & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1)); + destmem = shallow_copy_rtx (destmem); + srcmem = shallow_copy_rtx (srcmem); + set_mem_size (destmem, count); + set_mem_size (srcmem, count); + } + else + { + if (MEM_SIZE (destmem)) + set_mem_size (destmem, NULL_RTX); + if (MEM_SIZE (srcmem)) + set_mem_size (srcmem, NULL_RTX); + } + emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg, + destexp, srcexp)); +} + +/* Output "rep; stos" instruction. + Arguments have same meaning as for previous function */ +static void +expand_setmem_via_rep_stos (rtx destmem, rtx destptr, rtx value, + rtx count, enum machine_mode mode, + rtx orig_value) +{ + rtx destexp; + rtx countreg; + + if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode) + destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0); + value = force_reg (mode, gen_lowpart (mode, value)); + countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode))); + if (mode != QImode) + { + destexp = gen_rtx_ASHIFT (Pmode, countreg, + GEN_INT (exact_log2 (GET_MODE_SIZE (mode)))); + destexp = gen_rtx_PLUS (Pmode, destexp, destptr); + } + else + destexp = gen_rtx_PLUS (Pmode, destptr, countreg); + if (orig_value == const0_rtx && CONST_INT_P (count)) + { + count = GEN_INT (INTVAL (count) + & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1)); + destmem = shallow_copy_rtx (destmem); + set_mem_size (destmem, count); + } + else if (MEM_SIZE (destmem)) + set_mem_size (destmem, NULL_RTX); + emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp)); +} + +static void +emit_strmov (rtx destmem, rtx srcmem, + rtx destptr, rtx srcptr, enum machine_mode mode, int offset) +{ + rtx src = adjust_automodify_address_nv (srcmem, mode, srcptr, offset); + rtx dest = adjust_automodify_address_nv (destmem, mode, destptr, offset); + emit_insn (gen_strmov (destptr, dest, srcptr, src)); +} + +/* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. 
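+ For a constant count the tail is copied by a fixed sequence selected by the low bits of the count (16, 8, 4, 2 and 1 byte pieces), e.g. a remainder of 7 becomes one 4 byte, one 2 byte and one 1 byte move; for a variable count, bit tests on the count with forward branches are used instead.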
*/ +static void +expand_movmem_epilogue (rtx destmem, rtx srcmem, + rtx destptr, rtx srcptr, rtx count, int max_size) +{ + rtx src, dest; + if (CONST_INT_P (count)) + { + HOST_WIDE_INT countval = INTVAL (count); + int offset = 0; + + if ((countval & 0x10) && max_size > 16) + { + if (TARGET_64BIT) + { + emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset); + emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset + 8); + } + else + gcc_unreachable (); + offset += 16; + } + if ((countval & 0x08) && max_size > 8) + { + if (TARGET_64BIT) + emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset); + else + { + emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset); + emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset + 4); + } + offset += 8; + } + if ((countval & 0x04) && max_size > 4) + { + emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset); + offset += 4; + } + if ((countval & 0x02) && max_size > 2) + { + emit_strmov (destmem, srcmem, destptr, srcptr, HImode, offset); + offset += 2; + } + if ((countval & 0x01) && max_size > 1) + { + emit_strmov (destmem, srcmem, destptr, srcptr, QImode, offset); + offset += 1; + } + return; + } + if (max_size > 8) + { + count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1), + count, 1, OPTAB_DIRECT); + expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL, + count, QImode, 1, 4); + return; + } + + /* When there are stringops, we can cheaply increase dest and src pointers. + Otherwise we save code size by maintaining offset (zero is readily + available from preceding rep operation) and using x86 addressing modes. + */ + if (TARGET_SINGLE_STRINGOP) + { + if (max_size > 4) + { + rtx label = ix86_expand_aligntest (count, 4, true); + src = change_address (srcmem, SImode, srcptr); + dest = change_address (destmem, SImode, destptr); + emit_insn (gen_strmov (destptr, dest, srcptr, src)); + emit_label (label); + LABEL_NUSES (label) = 1; + } + if (max_size > 2) + { + rtx label = ix86_expand_aligntest (count, 2, true); + src = change_address (srcmem, HImode, srcptr); + dest = change_address (destmem, HImode, destptr); + emit_insn (gen_strmov (destptr, dest, srcptr, src)); + emit_label (label); + LABEL_NUSES (label) = 1; + } + if (max_size > 1) + { + rtx label = ix86_expand_aligntest (count, 1, true); + src = change_address (srcmem, QImode, srcptr); + dest = change_address (destmem, QImode, destptr); + emit_insn (gen_strmov (destptr, dest, srcptr, src)); + emit_label (label); + LABEL_NUSES (label) = 1; + } + } + else + { + rtx offset = force_reg (Pmode, const0_rtx); + rtx tmp; + + if (max_size > 4) + { + rtx label = ix86_expand_aligntest (count, 4, true); + src = change_address (srcmem, SImode, srcptr); + dest = change_address (destmem, SImode, destptr); + emit_move_insn (dest, src); + tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL, + true, OPTAB_LIB_WIDEN); + if (tmp != offset) + emit_move_insn (offset, tmp); + emit_label (label); + LABEL_NUSES (label) = 1; + } + if (max_size > 2) + { + rtx label = ix86_expand_aligntest (count, 2, true); + tmp = gen_rtx_PLUS (Pmode, srcptr, offset); + src = change_address (srcmem, HImode, tmp); + tmp = gen_rtx_PLUS (Pmode, destptr, offset); + dest = change_address (destmem, HImode, tmp); + emit_move_insn (dest, src); + tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp, + true, OPTAB_LIB_WIDEN); + if (tmp != offset) + emit_move_insn (offset, tmp); + emit_label (label); + LABEL_NUSES (label) = 1; + } + if 
(max_size > 1) + { + rtx label = ix86_expand_aligntest (count, 1, true); + tmp = gen_rtx_PLUS (Pmode, srcptr, offset); + src = change_address (srcmem, QImode, tmp); + tmp = gen_rtx_PLUS (Pmode, destptr, offset); + dest = change_address (destmem, QImode, tmp); + emit_move_insn (dest, src); + emit_label (label); + LABEL_NUSES (label) = 1; + } + } +} + +/* Output code to set at most count & (max_size - 1) bytes starting by DEST. */ +static void +expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value, + rtx count, int max_size) +{ + count = + expand_simple_binop (counter_mode (count), AND, count, + GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT); + expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL, + gen_lowpart (QImode, value), count, QImode, + 1, max_size / 2); +} + +/* Output code to set at most count & (max_size - 1) bytes starting by DEST. */ +static void +expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx count, int max_size) +{ + rtx dest; + + if (CONST_INT_P (count)) + { + HOST_WIDE_INT countval = INTVAL (count); + int offset = 0; + + if ((countval & 0x10) && max_size > 16) + { + if (TARGET_64BIT) + { + dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset); + emit_insn (gen_strset (destptr, dest, value)); + dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset + 8); + emit_insn (gen_strset (destptr, dest, value)); + } + else + gcc_unreachable (); + offset += 16; + } + if ((countval & 0x08) && max_size > 8) + { + if (TARGET_64BIT) + { + dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset); + emit_insn (gen_strset (destptr, dest, value)); + } + else + { + dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset); + emit_insn (gen_strset (destptr, dest, value)); + dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset + 4); + emit_insn (gen_strset (destptr, dest, value)); + } + offset += 8; + } + if ((countval & 0x04) && max_size > 4) + { + dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset); + emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value))); + offset += 4; + } + if ((countval & 0x02) && max_size > 2) + { + dest = adjust_automodify_address_nv (destmem, HImode, destptr, offset); + emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value))); + offset += 2; + } + if ((countval & 0x01) && max_size > 1) + { + dest = adjust_automodify_address_nv (destmem, QImode, destptr, offset); + emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value))); + offset += 1; + } + return; + } + if (max_size > 32) + { + expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size); + return; + } + if (max_size > 16) + { + rtx label = ix86_expand_aligntest (count, 16, true); + if (TARGET_64BIT) + { + dest = change_address (destmem, DImode, destptr); + emit_insn (gen_strset (destptr, dest, value)); + emit_insn (gen_strset (destptr, dest, value)); + } + else + { + dest = change_address (destmem, SImode, destptr); + emit_insn (gen_strset (destptr, dest, value)); + emit_insn (gen_strset (destptr, dest, value)); + emit_insn (gen_strset (destptr, dest, value)); + emit_insn (gen_strset (destptr, dest, value)); + } + emit_label (label); + LABEL_NUSES (label) = 1; + } + if (max_size > 8) + { + rtx label = ix86_expand_aligntest (count, 8, true); + if (TARGET_64BIT) + { + dest = change_address (destmem, DImode, destptr); + emit_insn (gen_strset (destptr, dest, value)); + } + else + { + dest = change_address (destmem, SImode, destptr); + 
emit_insn (gen_strset (destptr, dest, value)); + emit_insn (gen_strset (destptr, dest, value)); + } + emit_label (label); + LABEL_NUSES (label) = 1; + } + if (max_size > 4) + { + rtx label = ix86_expand_aligntest (count, 4, true); + dest = change_address (destmem, SImode, destptr); + emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value))); + emit_label (label); + LABEL_NUSES (label) = 1; + } + if (max_size > 2) + { + rtx label = ix86_expand_aligntest (count, 2, true); + dest = change_address (destmem, HImode, destptr); + emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value))); + emit_label (label); + LABEL_NUSES (label) = 1; + } + if (max_size > 1) + { + rtx label = ix86_expand_aligntest (count, 1, true); + dest = change_address (destmem, QImode, destptr); + emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value))); + emit_label (label); + LABEL_NUSES (label) = 1; + } +} + +/* Copy enough from DEST to SRC to align DEST known to by aligned by ALIGN to + DESIRED_ALIGNMENT. */ +static void +expand_movmem_prologue (rtx destmem, rtx srcmem, + rtx destptr, rtx srcptr, rtx count, + int align, int desired_alignment) +{ + if (align <= 1 && desired_alignment > 1) + { + rtx label = ix86_expand_aligntest (destptr, 1, false); + srcmem = change_address (srcmem, QImode, srcptr); + destmem = change_address (destmem, QImode, destptr); + emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem)); + ix86_adjust_counter (count, 1); + emit_label (label); + LABEL_NUSES (label) = 1; + } + if (align <= 2 && desired_alignment > 2) + { + rtx label = ix86_expand_aligntest (destptr, 2, false); + srcmem = change_address (srcmem, HImode, srcptr); + destmem = change_address (destmem, HImode, destptr); + emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem)); + ix86_adjust_counter (count, 2); + emit_label (label); + LABEL_NUSES (label) = 1; + } + if (align <= 4 && desired_alignment > 4) + { + rtx label = ix86_expand_aligntest (destptr, 4, false); + srcmem = change_address (srcmem, SImode, srcptr); + destmem = change_address (destmem, SImode, destptr); + emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem)); + ix86_adjust_counter (count, 4); + emit_label (label); + LABEL_NUSES (label) = 1; + } + gcc_assert (desired_alignment <= 8); +} + +/* Copy enough from DST to SRC to align DST known to DESIRED_ALIGN. + ALIGN_BYTES is how many bytes need to be copied. 
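+ The low bits of ALIGN_BYTES select the same 1, 2 and 4 byte steps, e.g. aligning to 8 bytes a destination that sits 3 bytes past an 8 byte boundary copies ALIGN_BYTES = 5 bytes as one byte move followed by one 4 byte move, updating the recorded MEM alignment along the way.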
*/ +static rtx +expand_constant_movmem_prologue (rtx dst, rtx *srcp, rtx destreg, rtx srcreg, + int desired_align, int align_bytes) +{ + rtx src = *srcp; + rtx src_size, dst_size; + int off = 0; + int src_align_bytes = get_mem_align_offset (src, desired_align * BITS_PER_UNIT); + if (src_align_bytes >= 0) + src_align_bytes = desired_align - src_align_bytes; + src_size = MEM_SIZE (src); + dst_size = MEM_SIZE (dst); + if (align_bytes & 1) + { + dst = adjust_automodify_address_nv (dst, QImode, destreg, 0); + src = adjust_automodify_address_nv (src, QImode, srcreg, 0); + off = 1; + emit_insn (gen_strmov (destreg, dst, srcreg, src)); + } + if (align_bytes & 2) + { + dst = adjust_automodify_address_nv (dst, HImode, destreg, off); + src = adjust_automodify_address_nv (src, HImode, srcreg, off); + if (MEM_ALIGN (dst) < 2 * BITS_PER_UNIT) + set_mem_align (dst, 2 * BITS_PER_UNIT); + if (src_align_bytes >= 0 + && (src_align_bytes & 1) == (align_bytes & 1) + && MEM_ALIGN (src) < 2 * BITS_PER_UNIT) + set_mem_align (src, 2 * BITS_PER_UNIT); + off = 2; + emit_insn (gen_strmov (destreg, dst, srcreg, src)); + } + if (align_bytes & 4) + { + dst = adjust_automodify_address_nv (dst, SImode, destreg, off); + src = adjust_automodify_address_nv (src, SImode, srcreg, off); + if (MEM_ALIGN (dst) < 4 * BITS_PER_UNIT) + set_mem_align (dst, 4 * BITS_PER_UNIT); + if (src_align_bytes >= 0) + { + unsigned int src_align = 0; + if ((src_align_bytes & 3) == (align_bytes & 3)) + src_align = 4; + else if ((src_align_bytes & 1) == (align_bytes & 1)) + src_align = 2; + if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT) + set_mem_align (src, src_align * BITS_PER_UNIT); + } + off = 4; + emit_insn (gen_strmov (destreg, dst, srcreg, src)); + } + dst = adjust_automodify_address_nv (dst, BLKmode, destreg, off); + src = adjust_automodify_address_nv (src, BLKmode, srcreg, off); + if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT) + set_mem_align (dst, desired_align * BITS_PER_UNIT); + if (src_align_bytes >= 0) + { + unsigned int src_align = 0; + if ((src_align_bytes & 7) == (align_bytes & 7)) + src_align = 8; + else if ((src_align_bytes & 3) == (align_bytes & 3)) + src_align = 4; + else if ((src_align_bytes & 1) == (align_bytes & 1)) + src_align = 2; + if (src_align > (unsigned int) desired_align) + src_align = desired_align; + if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT) + set_mem_align (src, src_align * BITS_PER_UNIT); + } + if (dst_size) + set_mem_size (dst, GEN_INT (INTVAL (dst_size) - align_bytes)); + if (src_size) + set_mem_size (dst, GEN_INT (INTVAL (src_size) - align_bytes)); + *srcp = src; + return dst; +} + +/* Set enough from DEST to align DEST known to by aligned by ALIGN to + DESIRED_ALIGNMENT. 
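+ Each step stores a 1, 2 or 4 byte chunk of VALUE behind an alignment test on DESTPTR and decrements COUNT by the number of bytes stored.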
*/ +static void +expand_setmem_prologue (rtx destmem, rtx destptr, rtx value, rtx count, + int align, int desired_alignment) +{ + if (align <= 1 && desired_alignment > 1) + { + rtx label = ix86_expand_aligntest (destptr, 1, false); + destmem = change_address (destmem, QImode, destptr); + emit_insn (gen_strset (destptr, destmem, gen_lowpart (QImode, value))); + ix86_adjust_counter (count, 1); + emit_label (label); + LABEL_NUSES (label) = 1; + } + if (align <= 2 && desired_alignment > 2) + { + rtx label = ix86_expand_aligntest (destptr, 2, false); + destmem = change_address (destmem, HImode, destptr); + emit_insn (gen_strset (destptr, destmem, gen_lowpart (HImode, value))); + ix86_adjust_counter (count, 2); + emit_label (label); + LABEL_NUSES (label) = 1; + } + if (align <= 4 && desired_alignment > 4) + { + rtx label = ix86_expand_aligntest (destptr, 4, false); + destmem = change_address (destmem, SImode, destptr); + emit_insn (gen_strset (destptr, destmem, gen_lowpart (SImode, value))); + ix86_adjust_counter (count, 4); + emit_label (label); + LABEL_NUSES (label) = 1; + } + gcc_assert (desired_alignment <= 8); +} + +/* Set enough from DST to align DST known to by aligned by ALIGN to + DESIRED_ALIGN. ALIGN_BYTES is how many bytes need to be stored. */ +static rtx +expand_constant_setmem_prologue (rtx dst, rtx destreg, rtx value, + int desired_align, int align_bytes) +{ + int off = 0; + rtx dst_size = MEM_SIZE (dst); + if (align_bytes & 1) + { + dst = adjust_automodify_address_nv (dst, QImode, destreg, 0); + off = 1; + emit_insn (gen_strset (destreg, dst, + gen_lowpart (QImode, value))); + } + if (align_bytes & 2) + { + dst = adjust_automodify_address_nv (dst, HImode, destreg, off); + if (MEM_ALIGN (dst) < 2 * BITS_PER_UNIT) + set_mem_align (dst, 2 * BITS_PER_UNIT); + off = 2; + emit_insn (gen_strset (destreg, dst, + gen_lowpart (HImode, value))); + } + if (align_bytes & 4) + { + dst = adjust_automodify_address_nv (dst, SImode, destreg, off); + if (MEM_ALIGN (dst) < 4 * BITS_PER_UNIT) + set_mem_align (dst, 4 * BITS_PER_UNIT); + off = 4; + emit_insn (gen_strset (destreg, dst, + gen_lowpart (SImode, value))); + } + dst = adjust_automodify_address_nv (dst, BLKmode, destreg, off); + if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT) + set_mem_align (dst, desired_align * BITS_PER_UNIT); + if (dst_size) + set_mem_size (dst, GEN_INT (INTVAL (dst_size) - align_bytes)); + return dst; +} + +/* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */ +static enum stringop_alg +decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, bool memset, + int *dynamic_check) +{ + const struct stringop_algs * algs; + bool optimize_for_speed; + /* Algorithms using the rep prefix want at least edi and ecx; + additionally, memset wants eax and memcpy wants esi. Don't + consider such algorithms if the user has appropriated those + registers for their own purposes. */ + bool rep_prefix_usable = !(fixed_regs[CX_REG] || fixed_regs[DI_REG] + || (memset + ? fixed_regs[AX_REG] : fixed_regs[SI_REG])); + +#define ALG_USABLE_P(alg) (rep_prefix_usable \ + || (alg != rep_prefix_1_byte \ + && alg != rep_prefix_4_byte \ + && alg != rep_prefix_8_byte)) + const struct processor_costs *cost; + + /* Even if the string operation call is cold, we still might spend a lot + of time processing large blocks. 
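+ So unless the whole function is optimized for size, the speed costs are still used for a cold call whose expected block size is unknown or at least 256 bytes.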
*/ + if (optimize_function_for_size_p (cfun) + || (optimize_insn_for_size_p () + && expected_size != -1 && expected_size < 256)) + optimize_for_speed = false; + else + optimize_for_speed = true; + + cost = optimize_for_speed ? ix86_cost : &ix86_size_cost; + + *dynamic_check = -1; + if (memset) + algs = &cost->memset[TARGET_64BIT != 0]; + else + algs = &cost->memcpy[TARGET_64BIT != 0]; + if (stringop_alg != no_stringop && ALG_USABLE_P (stringop_alg)) + return stringop_alg; + /* rep; movq or rep; movl is the smallest variant. */ + else if (!optimize_for_speed) + { + if (!count || (count & 3)) + return rep_prefix_usable ? rep_prefix_1_byte : loop_1_byte; + else + return rep_prefix_usable ? rep_prefix_4_byte : loop; + } + /* Very tiny blocks are best handled via the loop, REP is expensive to setup. + */ + else if (expected_size != -1 && expected_size < 4) + return loop_1_byte; + else if (expected_size != -1) + { + unsigned int i; + enum stringop_alg alg = libcall; + for (i = 0; i < MAX_STRINGOP_ALGS; i++) + { + /* We get here if the algorithms that were not libcall-based + were rep-prefix based and we are unable to use rep prefixes + based on global register usage. Break out of the loop and + use the heuristic below. */ + if (algs->size[i].max == 0) + break; + if (algs->size[i].max >= expected_size || algs->size[i].max == -1) + { + enum stringop_alg candidate = algs->size[i].alg; + + if (candidate != libcall && ALG_USABLE_P (candidate)) + alg = candidate; + /* Honor TARGET_INLINE_ALL_STRINGOPS by picking + last non-libcall inline algorithm. */ + if (TARGET_INLINE_ALL_STRINGOPS) + { + /* When the current size is best to be copied by a libcall, + but we are still forced to inline, run the heuristic below + that will pick code for medium sized blocks. */ + if (alg != libcall) + return alg; + break; + } + else if (ALG_USABLE_P (candidate)) + return candidate; + } + } + gcc_assert (TARGET_INLINE_ALL_STRINGOPS || !rep_prefix_usable); + } + /* When asked to inline the call anyway, try to pick meaningful choice. + We look for maximal size of block that is faster to copy by hand and + take blocks of at most of that size guessing that average size will + be roughly half of the block. + + If this turns out to be bad, we might simply specify the preferred + choice in ix86_costs. */ + if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY) + && (algs->unknown_size == libcall || !ALG_USABLE_P (algs->unknown_size))) + { + int max = -1; + enum stringop_alg alg; + int i; + bool any_alg_usable_p = true; + + for (i = 0; i < MAX_STRINGOP_ALGS; i++) + { + enum stringop_alg candidate = algs->size[i].alg; + any_alg_usable_p = any_alg_usable_p && ALG_USABLE_P (candidate); + + if (candidate != libcall && candidate + && ALG_USABLE_P (candidate)) + max = algs->size[i].max; + } + /* If there aren't any usable algorithms, then recursing on + smaller sizes isn't going to find anything. Just return the + simple byte-at-a-time copy loop. */ + if (!any_alg_usable_p) + { + /* Pick something reasonable. */ + if (TARGET_INLINE_STRINGOPS_DYNAMICALLY) + *dynamic_check = 128; + return loop_1_byte; + } + if (max == -1) + max = 4096; + alg = decide_alg (count, max / 2, memset, dynamic_check); + gcc_assert (*dynamic_check == -1); + gcc_assert (alg != libcall); + if (TARGET_INLINE_STRINGOPS_DYNAMICALLY) + *dynamic_check = max; + return alg; + } + return ALG_USABLE_P (algs->unknown_size) ? algs->unknown_size : libcall; +#undef ALG_USABLE_P +} + +/* Decide on alignment. 
We know that the operand is already aligned to ALIGN + (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */ +static int +decide_alignment (int align, + enum stringop_alg alg, + int expected_size) +{ + int desired_align = 0; + switch (alg) + { + case no_stringop: + gcc_unreachable (); + case loop: + case unrolled_loop: + desired_align = GET_MODE_SIZE (Pmode); + break; + case rep_prefix_8_byte: + desired_align = 8; + break; + case rep_prefix_4_byte: + /* PentiumPro has special logic triggering for 8 byte aligned blocks. + copying whole cacheline at once. */ + if (TARGET_PENTIUMPRO) + desired_align = 8; + else + desired_align = 4; + break; + case rep_prefix_1_byte: + /* PentiumPro has special logic triggering for 8 byte aligned blocks. + copying whole cacheline at once. */ + if (TARGET_PENTIUMPRO) + desired_align = 8; + else + desired_align = 1; + break; + case loop_1_byte: + desired_align = 1; + break; + case libcall: + return 0; + } + + if (optimize_size) + desired_align = 1; + if (desired_align < align) + desired_align = align; + if (expected_size != -1 && expected_size < 4) + desired_align = align; + return desired_align; +} + +/* Return the smallest power of 2 greater than VAL. */ +static int +smallest_pow2_greater_than (int val) +{ + int ret = 1; + while (ret <= val) + ret <<= 1; + return ret; +} + +/* Expand string move (memcpy) operation. Use i386 string operations when + profitable. expand_setmem contains similar code. The code depends upon + architecture, block size and alignment, but always has the same + overall structure: + + 1) Prologue guard: Conditional that jumps up to epilogues for small + blocks that can be handled by epilogue alone. This is faster but + also needed for correctness, since prologue assume the block is larger + than the desired alignment. + + Optional dynamic check for size and libcall for large + blocks is emitted here too, with -minline-stringops-dynamically. + + 2) Prologue: copy first few bytes in order to get destination aligned + to DESIRED_ALIGN. It is emitted only when ALIGN is less than + DESIRED_ALIGN and and up to DESIRED_ALIGN - ALIGN bytes can be copied. + We emit either a jump tree on power of two sized blocks, or a byte loop. + + 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks + with specified algorithm. + + 4) Epilogue: code copying tail of the block that is too small to be + handled by main body (or up to size guarded by prologue guard). */ + +bool +ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp, + rtx expected_align_exp, rtx expected_size_exp) +{ + rtx destreg; + rtx srcreg; + rtx label = NULL; + rtx tmp; + rtx jump_around_label = NULL; + HOST_WIDE_INT align = 1; + unsigned HOST_WIDE_INT count = 0; + HOST_WIDE_INT expected_size = -1; + int size_needed = 0, epilogue_size_needed; + int desired_align = 0, align_bytes = 0; + enum stringop_alg alg; + int dynamic_check; + bool need_zero_guard = false; + + if (CONST_INT_P (align_exp)) + align = INTVAL (align_exp); + /* i386 can do misaligned access on reasonably increased cost. */ + if (CONST_INT_P (expected_align_exp) + && INTVAL (expected_align_exp) > align) + align = INTVAL (expected_align_exp); + /* ALIGN is the minimum of destination and source alignment, but we care here + just about destination alignment. 
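+ When the destination MEM itself carries a larger known alignment than the arguments promise, that larger value is used instead.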
*/ + else if (MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT) + align = MEM_ALIGN (dst) / BITS_PER_UNIT; + + if (CONST_INT_P (count_exp)) + count = expected_size = INTVAL (count_exp); + if (CONST_INT_P (expected_size_exp) && count == 0) + expected_size = INTVAL (expected_size_exp); + + /* Make sure we don't need to care about overflow later on. */ + if (count > ((unsigned HOST_WIDE_INT) 1 << 30)) + return false; + + /* Step 0: Decide on preferred algorithm, desired alignment and + size of chunks to be copied by main loop. */ + + alg = decide_alg (count, expected_size, false, &dynamic_check); + desired_align = decide_alignment (align, alg, expected_size); + + if (!TARGET_ALIGN_STRINGOPS) + align = desired_align; + + if (alg == libcall) + return false; + gcc_assert (alg != no_stringop); + if (!count) + count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp); + destreg = copy_to_mode_reg (Pmode, XEXP (dst, 0)); + srcreg = copy_to_mode_reg (Pmode, XEXP (src, 0)); + switch (alg) + { + case libcall: + case no_stringop: + gcc_unreachable (); + case loop: + need_zero_guard = true; + size_needed = GET_MODE_SIZE (Pmode); + break; + case unrolled_loop: + need_zero_guard = true; + size_needed = GET_MODE_SIZE (Pmode) * (TARGET_64BIT ? 4 : 2); + break; + case rep_prefix_8_byte: + size_needed = 8; + break; + case rep_prefix_4_byte: + size_needed = 4; + break; + case rep_prefix_1_byte: + size_needed = 1; + break; + case loop_1_byte: + need_zero_guard = true; + size_needed = 1; + break; + } + + epilogue_size_needed = size_needed; + + /* Step 1: Prologue guard. */ + + /* Alignment code needs count to be in register. */ + if (CONST_INT_P (count_exp) && desired_align > align) + { + if (INTVAL (count_exp) > desired_align + && INTVAL (count_exp) > size_needed) + { + align_bytes + = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT); + if (align_bytes <= 0) + align_bytes = 0; + else + align_bytes = desired_align - align_bytes; + } + if (align_bytes == 0) + count_exp = force_reg (counter_mode (count_exp), count_exp); + } + gcc_assert (desired_align >= 1 && align >= 1); + + /* Ensure that alignment prologue won't copy past end of block. */ + if (size_needed > 1 || (desired_align > 1 && desired_align > align)) + { + epilogue_size_needed = MAX (size_needed - 1, desired_align - align); + /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes. + Make sure it is power of 2. */ + epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed); + + if (count) + { + if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed) + { + /* If main algorithm works on QImode, no epilogue is needed. + For small sizes just don't align anything. */ + if (size_needed == 1) + desired_align = align; + else + goto epilogue; + } + } + else + { + label = gen_label_rtx (); + emit_cmp_and_jump_insns (count_exp, + GEN_INT (epilogue_size_needed), + LTU, 0, counter_mode (count_exp), 1, label); + if (expected_size == -1 || expected_size < epilogue_size_needed) + predict_jump (REG_BR_PROB_BASE * 60 / 100); + else + predict_jump (REG_BR_PROB_BASE * 20 / 100); + } + } + + /* Emit code to decide on runtime whether library call or inline should be + used. 
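+ With a constant count the decision is made at compile time; otherwise the count is compared against the DYNAMIC_CHECK threshold and blocks at least that large branch off to the memcpy libcall, keeping the inline expansion for the smaller, presumably common, case.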
*/ + if (dynamic_check != -1) + { + if (CONST_INT_P (count_exp)) + { + if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check) + { + emit_block_move_via_libcall (dst, src, count_exp, false); + count_exp = const0_rtx; + goto epilogue; + } + } + else + { + rtx hot_label = gen_label_rtx (); + jump_around_label = gen_label_rtx (); + emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1), + LEU, 0, GET_MODE (count_exp), 1, hot_label); + predict_jump (REG_BR_PROB_BASE * 90 / 100); + emit_block_move_via_libcall (dst, src, count_exp, false); + emit_jump (jump_around_label); + emit_label (hot_label); + } + } + + /* Step 2: Alignment prologue. */ + + if (desired_align > align) + { + if (align_bytes == 0) + { + /* Except for the first move in epilogue, we no longer know + constant offset in aliasing info. It don't seems to worth + the pain to maintain it for the first move, so throw away + the info early. */ + src = change_address (src, BLKmode, srcreg); + dst = change_address (dst, BLKmode, destreg); + expand_movmem_prologue (dst, src, destreg, srcreg, count_exp, align, + desired_align); + } + else + { + /* If we know how many bytes need to be stored before dst is + sufficiently aligned, maintain aliasing info accurately. */ + dst = expand_constant_movmem_prologue (dst, &src, destreg, srcreg, + desired_align, align_bytes); + count_exp = plus_constant (count_exp, -align_bytes); + count -= align_bytes; + } + if (need_zero_guard + && (count < (unsigned HOST_WIDE_INT) size_needed + || (align_bytes == 0 + && count < ((unsigned HOST_WIDE_INT) size_needed + + desired_align - align)))) + { + /* It is possible that we copied enough so the main loop will not + execute. */ + gcc_assert (size_needed > 1); + if (label == NULL_RTX) + label = gen_label_rtx (); + emit_cmp_and_jump_insns (count_exp, + GEN_INT (size_needed), + LTU, 0, counter_mode (count_exp), 1, label); + if (expected_size == -1 + || expected_size < (desired_align - align) / 2 + size_needed) + predict_jump (REG_BR_PROB_BASE * 20 / 100); + else + predict_jump (REG_BR_PROB_BASE * 60 / 100); + } + } + if (label && size_needed == 1) + { + emit_label (label); + LABEL_NUSES (label) = 1; + label = NULL; + epilogue_size_needed = 1; + } + else if (label == NULL_RTX) + epilogue_size_needed = size_needed; + + /* Step 3: Main loop. */ + + switch (alg) + { + case libcall: + case no_stringop: + gcc_unreachable (); + case loop_1_byte: + expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL, + count_exp, QImode, 1, expected_size); + break; + case loop: + expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL, + count_exp, Pmode, 1, expected_size); + break; + case unrolled_loop: + /* Unroll only by factor of 2 in 32bit mode, since we don't have enough + registers for 4 temporaries anyway. */ + expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL, + count_exp, Pmode, TARGET_64BIT ? 4 : 2, + expected_size); + break; + case rep_prefix_8_byte: + expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp, + DImode); + break; + case rep_prefix_4_byte: + expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp, + SImode); + break; + case rep_prefix_1_byte: + expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp, + QImode); + break; + } + /* Adjust properly the offset of src and dest memory for aliasing. 
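+ After the main loop the pointers have advanced by a multiple of SIZE_NEEDED, so with a constant count the MEMs are re-based at (count / size_needed) * size_needed; with a variable count the alias information is simply reset to the bare pointer registers.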
*/ + if (CONST_INT_P (count_exp)) + { + src = adjust_automodify_address_nv (src, BLKmode, srcreg, + (count / size_needed) * size_needed); + dst = adjust_automodify_address_nv (dst, BLKmode, destreg, + (count / size_needed) * size_needed); + } + else + { + src = change_address (src, BLKmode, srcreg); + dst = change_address (dst, BLKmode, destreg); + } + + /* Step 4: Epilogue to copy the remaining bytes. */ + epilogue: + if (label) + { + /* When the main loop is done, COUNT_EXP might hold original count, + while we want to copy only COUNT_EXP & SIZE_NEEDED bytes. + Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED + bytes. Compensate if needed. */ + + if (size_needed < epilogue_size_needed) + { + tmp = + expand_simple_binop (counter_mode (count_exp), AND, count_exp, + GEN_INT (size_needed - 1), count_exp, 1, + OPTAB_DIRECT); + if (tmp != count_exp) + emit_move_insn (count_exp, tmp); + } + emit_label (label); + LABEL_NUSES (label) = 1; + } + + if (count_exp != const0_rtx && epilogue_size_needed > 1) + expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp, + epilogue_size_needed); + if (jump_around_label) + emit_label (jump_around_label); + return true; +} + +/* Helper function for memcpy. For QImode value 0xXY produce + 0xXYXYXYXY of wide specified by MODE. This is essentially + a * 0x10101010, but we can do slightly better than + synth_mult by unwinding the sequence by hand on CPUs with + slow multiply. */ +static rtx +promote_duplicated_reg (enum machine_mode mode, rtx val) +{ + enum machine_mode valmode = GET_MODE (val); + rtx tmp; + int nops = mode == DImode ? 3 : 2; + + gcc_assert (mode == SImode || mode == DImode); + if (val == const0_rtx) + return copy_to_mode_reg (mode, const0_rtx); + if (CONST_INT_P (val)) + { + HOST_WIDE_INT v = INTVAL (val) & 255; + + v |= v << 8; + v |= v << 16; + if (mode == DImode) + v |= (v << 16) << 16; + return copy_to_mode_reg (mode, gen_int_mode (v, mode)); + } + + if (valmode == VOIDmode) + valmode = QImode; + if (valmode != QImode) + val = gen_lowpart (QImode, val); + if (mode == QImode) + return val; + if (!TARGET_PARTIAL_REG_STALL) + nops--; + if (ix86_cost->mult_init[mode == DImode ? 3 : 2] + + ix86_cost->mult_bit * (mode == DImode ? 8 : 4) + <= (ix86_cost->shift_const + ix86_cost->add) * nops + + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0))) + { + rtx reg = convert_modes (mode, QImode, val, true); + tmp = promote_duplicated_reg (mode, const1_rtx); + return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1, + OPTAB_DIRECT); + } + else + { + rtx reg = convert_modes (mode, QImode, val, true); + + if (!TARGET_PARTIAL_REG_STALL) + if (mode == SImode) + emit_insn (gen_movsi_insv_1 (reg, reg)); + else + emit_insn (gen_movdi_insv_1 (reg, reg)); + else + { + tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8), + NULL, 1, OPTAB_DIRECT); + reg = + expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT); + } + tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16), + NULL, 1, OPTAB_DIRECT); + reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT); + if (mode == SImode) + return reg; + tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32), + NULL, 1, OPTAB_DIRECT); + reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT); + return reg; + } +} + +/* Duplicate value VAL using promote_duplicated_reg into maximal size that will + be needed by main loop copying SIZE_NEEDED chunks and prologue getting + alignment from ALIGN to DESIRED_ALIGN. 
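+ E.g. on a 64-bit target a main loop storing 8 byte chunks gets VAL replicated into a DImode register, while an expansion that only ever stores 2 byte pieces is satisfied by an HImode copy.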
*/ +static rtx +promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align, int align) +{ + rtx promoted_val; + + if (TARGET_64BIT + && (size_needed > 4 || (desired_align > align && desired_align > 4))) + promoted_val = promote_duplicated_reg (DImode, val); + else if (size_needed > 2 || (desired_align > align && desired_align > 2)) + promoted_val = promote_duplicated_reg (SImode, val); + else if (size_needed > 1 || (desired_align > align && desired_align > 1)) + promoted_val = promote_duplicated_reg (HImode, val); + else + promoted_val = val; + + return promoted_val; +} + +/* Expand string clear operation (bzero). Use i386 string operations when + profitable. See expand_movmem comment for explanation of individual + steps performed. */ +bool +ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp, + rtx expected_align_exp, rtx expected_size_exp) +{ + rtx destreg; + rtx label = NULL; + rtx tmp; + rtx jump_around_label = NULL; + HOST_WIDE_INT align = 1; + unsigned HOST_WIDE_INT count = 0; + HOST_WIDE_INT expected_size = -1; + int size_needed = 0, epilogue_size_needed; + int desired_align = 0, align_bytes = 0; + enum stringop_alg alg; + rtx promoted_val = NULL; + bool force_loopy_epilogue = false; + int dynamic_check; + bool need_zero_guard = false; + + if (CONST_INT_P (align_exp)) + align = INTVAL (align_exp); + /* i386 can do misaligned access on reasonably increased cost. */ + if (CONST_INT_P (expected_align_exp) + && INTVAL (expected_align_exp) > align) + align = INTVAL (expected_align_exp); + if (CONST_INT_P (count_exp)) + count = expected_size = INTVAL (count_exp); + if (CONST_INT_P (expected_size_exp) && count == 0) + expected_size = INTVAL (expected_size_exp); + + /* Make sure we don't need to care about overflow later on. */ + if (count > ((unsigned HOST_WIDE_INT) 1 << 30)) + return false; + + /* Step 0: Decide on preferred algorithm, desired alignment and + size of chunks to be copied by main loop. */ + + alg = decide_alg (count, expected_size, true, &dynamic_check); + desired_align = decide_alignment (align, alg, expected_size); + + if (!TARGET_ALIGN_STRINGOPS) + align = desired_align; + + if (alg == libcall) + return false; + gcc_assert (alg != no_stringop); + if (!count) + count_exp = copy_to_mode_reg (counter_mode (count_exp), count_exp); + destreg = copy_to_mode_reg (Pmode, XEXP (dst, 0)); + switch (alg) + { + case libcall: + case no_stringop: + gcc_unreachable (); + case loop: + need_zero_guard = true; + size_needed = GET_MODE_SIZE (Pmode); + break; + case unrolled_loop: + need_zero_guard = true; + size_needed = GET_MODE_SIZE (Pmode) * 4; + break; + case rep_prefix_8_byte: + size_needed = 8; + break; + case rep_prefix_4_byte: + size_needed = 4; + break; + case rep_prefix_1_byte: + size_needed = 1; + break; + case loop_1_byte: + need_zero_guard = true; + size_needed = 1; + break; + } + epilogue_size_needed = size_needed; + + /* Step 1: Prologue guard. */ + + /* Alignment code needs count to be in register. 
*/ + if (CONST_INT_P (count_exp) && desired_align > align) + { + if (INTVAL (count_exp) > desired_align + && INTVAL (count_exp) > size_needed) + { + align_bytes + = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT); + if (align_bytes <= 0) + align_bytes = 0; + else + align_bytes = desired_align - align_bytes; + } + if (align_bytes == 0) + { + enum machine_mode mode = SImode; + if (TARGET_64BIT && (count & ~0xffffffff)) + mode = DImode; + count_exp = force_reg (mode, count_exp); + } + } + /* Do the cheap promotion to allow better CSE across the + main loop and epilogue (ie one load of the big constant in the + front of all code. */ + if (CONST_INT_P (val_exp)) + promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed, + desired_align, align); + /* Ensure that alignment prologue won't copy past end of block. */ + if (size_needed > 1 || (desired_align > 1 && desired_align > align)) + { + epilogue_size_needed = MAX (size_needed - 1, desired_align - align); + /* Epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes. + Make sure it is power of 2. */ + epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed); + + /* To improve performance of small blocks, we jump around the VAL + promoting mode. This mean that if the promoted VAL is not constant, + we might not use it in the epilogue and have to use byte + loop variant. */ + if (epilogue_size_needed > 2 && !promoted_val) + force_loopy_epilogue = true; + if (count) + { + if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed) + { + /* If main algorithm works on QImode, no epilogue is needed. + For small sizes just don't align anything. */ + if (size_needed == 1) + desired_align = align; + else + goto epilogue; + } + } + else + { + label = gen_label_rtx (); + emit_cmp_and_jump_insns (count_exp, + GEN_INT (epilogue_size_needed), + LTU, 0, counter_mode (count_exp), 1, label); + if (expected_size == -1 || expected_size <= epilogue_size_needed) + predict_jump (REG_BR_PROB_BASE * 60 / 100); + else + predict_jump (REG_BR_PROB_BASE * 20 / 100); + } + } + if (dynamic_check != -1) + { + rtx hot_label = gen_label_rtx (); + jump_around_label = gen_label_rtx (); + emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1), + LEU, 0, counter_mode (count_exp), 1, hot_label); + predict_jump (REG_BR_PROB_BASE * 90 / 100); + set_storage_via_libcall (dst, count_exp, val_exp, false); + emit_jump (jump_around_label); + emit_label (hot_label); + } + + /* Step 2: Alignment prologue. */ + + /* Do the expensive promotion once we branched off the small blocks. */ + if (!promoted_val) + promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed, + desired_align, align); + gcc_assert (desired_align >= 1 && align >= 1); + + if (desired_align > align) + { + if (align_bytes == 0) + { + /* Except for the first move in epilogue, we no longer know + constant offset in aliasing info. It don't seems to worth + the pain to maintain it for the first move, so throw away + the info early. */ + dst = change_address (dst, BLKmode, destreg); + expand_setmem_prologue (dst, destreg, promoted_val, count_exp, align, + desired_align); + } + else + { + /* If we know how many bytes need to be stored before dst is + sufficiently aligned, maintain aliasing info accurately. 
*/ + dst = expand_constant_setmem_prologue (dst, destreg, promoted_val, + desired_align, align_bytes); + count_exp = plus_constant (count_exp, -align_bytes); + count -= align_bytes; + } + if (need_zero_guard + && (count < (unsigned HOST_WIDE_INT) size_needed + || (align_bytes == 0 + && count < ((unsigned HOST_WIDE_INT) size_needed + + desired_align - align)))) + { + /* It is possible that we copied enough so the main loop will not + execute. */ + gcc_assert (size_needed > 1); + if (label == NULL_RTX) + label = gen_label_rtx (); + emit_cmp_and_jump_insns (count_exp, + GEN_INT (size_needed), + LTU, 0, counter_mode (count_exp), 1, label); + if (expected_size == -1 + || expected_size < (desired_align - align) / 2 + size_needed) + predict_jump (REG_BR_PROB_BASE * 20 / 100); + else + predict_jump (REG_BR_PROB_BASE * 60 / 100); + } + } + if (label && size_needed == 1) + { + emit_label (label); + LABEL_NUSES (label) = 1; + label = NULL; + promoted_val = val_exp; + epilogue_size_needed = 1; + } + else if (label == NULL_RTX) + epilogue_size_needed = size_needed; + + /* Step 3: Main loop. */ + + switch (alg) + { + case libcall: + case no_stringop: + gcc_unreachable (); + case loop_1_byte: + expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val, + count_exp, QImode, 1, expected_size); + break; + case loop: + expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val, + count_exp, Pmode, 1, expected_size); + break; + case unrolled_loop: + expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val, + count_exp, Pmode, 4, expected_size); + break; + case rep_prefix_8_byte: + expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp, + DImode, val_exp); + break; + case rep_prefix_4_byte: + expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp, + SImode, val_exp); + break; + case rep_prefix_1_byte: + expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp, + QImode, val_exp); + break; + } + /* Adjust properly the offset of src and dest memory for aliasing. */ + if (CONST_INT_P (count_exp)) + dst = adjust_automodify_address_nv (dst, BLKmode, destreg, + (count / size_needed) * size_needed); + else + dst = change_address (dst, BLKmode, destreg); + + /* Step 4: Epilogue to copy the remaining bytes. */ + + if (label) + { + /* When the main loop is done, COUNT_EXP might hold original count, + while we want to copy only COUNT_EXP & SIZE_NEEDED bytes. + Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED + bytes. Compensate if needed. */ + + if (size_needed < epilogue_size_needed) + { + tmp = + expand_simple_binop (counter_mode (count_exp), AND, count_exp, + GEN_INT (size_needed - 1), count_exp, 1, + OPTAB_DIRECT); + if (tmp != count_exp) + emit_move_insn (count_exp, tmp); + } + emit_label (label); + LABEL_NUSES (label) = 1; + } + epilogue: + if (count_exp != const0_rtx && epilogue_size_needed > 1) + { + if (force_loopy_epilogue) + expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp, + epilogue_size_needed); + else + expand_setmem_epilogue (dst, destreg, promoted_val, count_exp, + epilogue_size_needed); + } + if (jump_around_label) + emit_label (jump_around_label); + return true; +} + +/* Expand the appropriate insns for doing strlen if not just doing + repnz; scasb + + out = result, initialized with the start address + align_rtx = alignment of the address. + scratch = scratch register, initialized with the startaddress when + not aligned, otherwise undefined + + This is just the body. 
It needs the initializations mentioned above and + some address computing at the end. These things are done in i386.md. */ + +static void +ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx) +{ + int align; + rtx tmp; + rtx align_2_label = NULL_RTX; + rtx align_3_label = NULL_RTX; + rtx align_4_label = gen_label_rtx (); + rtx end_0_label = gen_label_rtx (); + rtx mem; + rtx tmpreg = gen_reg_rtx (SImode); + rtx scratch = gen_reg_rtx (SImode); + rtx cmp; + + align = 0; + if (CONST_INT_P (align_rtx)) + align = INTVAL (align_rtx); + + /* Loop to check 1..3 bytes for null to get an aligned pointer. */ + + /* Is there a known alignment and is it less than 4? */ + if (align < 4) + { + rtx scratch1 = gen_reg_rtx (Pmode); + emit_move_insn (scratch1, out); + /* Is there a known alignment and is it not 2? */ + if (align != 2) + { + align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */ + align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */ + + /* Leave just the 3 lower bits. */ + align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3), + NULL_RTX, 0, OPTAB_WIDEN); + + emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL, + Pmode, 1, align_4_label); + emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL, + Pmode, 1, align_2_label); + emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL, + Pmode, 1, align_3_label); + } + else + { + /* Since the alignment is 2, we have to check 2 or 0 bytes; + check if is aligned to 4 - byte. */ + + align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx, + NULL_RTX, 0, OPTAB_WIDEN); + + emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL, + Pmode, 1, align_4_label); + } + + mem = change_address (src, QImode, out); + + /* Now compare the bytes. */ + + /* Compare the first n unaligned byte on a byte per byte basis. */ + emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, + QImode, 1, end_0_label); + + /* Increment the address. */ + emit_insn (ix86_gen_add3 (out, out, const1_rtx)); + + /* Not needed with an alignment of 2 */ + if (align != 2) + { + emit_label (align_2_label); + + emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1, + end_0_label); + + emit_insn (ix86_gen_add3 (out, out, const1_rtx)); + + emit_label (align_3_label); + } + + emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1, + end_0_label); + + emit_insn (ix86_gen_add3 (out, out, const1_rtx)); + } + + /* Generate loop to check 4 bytes at a time. It is not a good idea to + align this loop. It gives only huge programs, but does not help to + speed up. */ + emit_label (align_4_label); + + mem = change_address (src, SImode, out); + emit_move_insn (scratch, mem); + emit_insn (ix86_gen_add3 (out, out, GEN_INT (4))); + + /* This formula yields a nonzero result iff one of the bytes is zero. + This saves three branches inside loop and many cycles. */ + + emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101))); + emit_insn (gen_one_cmplsi2 (scratch, scratch)); + emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch)); + emit_insn (gen_andsi3 (tmpreg, tmpreg, + gen_int_mode (0x80808080, SImode))); + emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1, + align_4_label); + + if (TARGET_CMOVE) + { + rtx reg = gen_reg_rtx (SImode); + rtx reg2 = gen_reg_rtx (Pmode); + emit_move_insn (reg, tmpreg); + emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16))); + + /* If zero is not in the first two bytes, move two bytes forward. 
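+ After the masking above, TMPREG & 0x8080 is nonzero exactly when one of the two low bytes of the scanned word was zero, so this test decides whether the terminating byte lies in the low half of the word.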
*/ + emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080))); + tmp = gen_rtx_REG (CCNOmode, FLAGS_REG); + tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx); + emit_insn (gen_rtx_SET (VOIDmode, tmpreg, + gen_rtx_IF_THEN_ELSE (SImode, tmp, + reg, + tmpreg))); + /* Emit lea manually to avoid clobbering of flags. */ + emit_insn (gen_rtx_SET (SImode, reg2, + gen_rtx_PLUS (Pmode, out, const2_rtx))); + + tmp = gen_rtx_REG (CCNOmode, FLAGS_REG); + tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx); + emit_insn (gen_rtx_SET (VOIDmode, out, + gen_rtx_IF_THEN_ELSE (Pmode, tmp, + reg2, + out))); + } + else + { + rtx end_2_label = gen_label_rtx (); + /* Is zero in the first two bytes? */ + + emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080))); + tmp = gen_rtx_REG (CCNOmode, FLAGS_REG); + tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx); + tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp, + gen_rtx_LABEL_REF (VOIDmode, end_2_label), + pc_rtx); + tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp)); + JUMP_LABEL (tmp) = end_2_label; + + /* Not in the first two. Move two bytes forward. */ + emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16))); + emit_insn (ix86_gen_add3 (out, out, const2_rtx)); + + emit_label (end_2_label); + + } + + /* Avoid branch in fixing the byte. */ + tmpreg = gen_lowpart (QImode, tmpreg); + emit_insn (gen_addqi3_cc (tmpreg, tmpreg, tmpreg)); + tmp = gen_rtx_REG (CCmode, FLAGS_REG); + cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx); + emit_insn (ix86_gen_sub3_carry (out, out, GEN_INT (3), tmp, cmp)); + + emit_label (end_0_label); +} + +/* Expand strlen. */ + +bool +ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align) +{ + rtx addr, scratch1, scratch2, scratch3, scratch4; + + /* The generic case of strlen expander is long. Avoid it's + expanding unless TARGET_INLINE_ALL_STRINGOPS. */ + + if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1 + && !TARGET_INLINE_ALL_STRINGOPS + && !optimize_insn_for_size_p () + && (!CONST_INT_P (align) || INTVAL (align) < 4)) + return false; + + addr = force_reg (Pmode, XEXP (src, 0)); + scratch1 = gen_reg_rtx (Pmode); + + if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1 + && !optimize_insn_for_size_p ()) + { + /* Well it seems that some optimizer does not combine a call like + foo(strlen(bar), strlen(bar)); + when the move and the subtraction is done here. It does calculate + the length just once when these instructions are done inside of + output_strlen_unroll(). But I think since &bar[strlen(bar)] is + often used and I use one fewer register for the lifetime of + output_strlen_unroll() this is better. */ + + emit_move_insn (out, addr); + + ix86_expand_strlensi_unroll_1 (out, src, align); + + /* strlensi_unroll_1 returns the address of the zero at the end of + the string, like memchr(), so compute the length by subtracting + the start address. */ + emit_insn (ix86_gen_sub3 (out, out, addr)); + } + else + { + rtx unspec; + + /* Can't use this if the user has appropriated eax, ecx, or edi. */ + if (fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG]) + return false; + + scratch2 = gen_reg_rtx (Pmode); + scratch3 = gen_reg_rtx (Pmode); + scratch4 = force_reg (Pmode, constm1_rtx); + + emit_move_insn (scratch3, addr); + eoschar = force_reg (QImode, eoschar); + + src = replace_equiv_address_nv (src, scratch3); + + /* If .md starts supporting :P, this can be done in .md. 
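In the repnz scasb fallback that follows, ECX starts at -1 and is decremented once per byte scanned, including the terminating zero, so the one's complement and the final add of -1 recover the length as ~ecx - 1. A small arithmetic check of that identity (a sketch of the bookkeeping only, not the expander):

#include <assert.h>

/* Model the ECX accounting of the repnz scasb sequence: ECX starts at -1
   and is decremented for every byte examined, including the NUL.  */
static long
strlen_from_ecx (long n)          /* n = actual string length (hypothetical) */
{
  long ecx = -1 - (n + 1);        /* value left in ECX after the scan */
  return ~ecx - 1;                /* one_cmpl followed by an add of -1 */
}

int
main (void)
{
  assert (strlen_from_ecx (0) == 0);
  assert (strlen_from_ecx (5) == 5);
  assert (strlen_from_ecx (4096) == 4096);
  return 0;
}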
*/ + unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (4, src, eoschar, align, + scratch4), UNSPEC_SCAS); + emit_insn (gen_strlenqi_1 (scratch1, scratch3, unspec)); + emit_insn (ix86_gen_one_cmpl2 (scratch2, scratch1)); + emit_insn (ix86_gen_add3 (out, scratch2, constm1_rtx)); + } + return true; +} + +/* For given symbol (function) construct code to compute address of it's PLT + entry in large x86-64 PIC model. */ +rtx +construct_plt_address (rtx symbol) +{ + rtx tmp = gen_reg_rtx (Pmode); + rtx unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF); + + gcc_assert (GET_CODE (symbol) == SYMBOL_REF); + gcc_assert (ix86_cmodel == CM_LARGE_PIC); + + emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec)); + emit_insn (gen_adddi3 (tmp, tmp, pic_offset_table_rtx)); + return tmp; +} + +rtx +ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1, + rtx callarg2, + rtx pop, int sibcall) +{ + rtx use = NULL, call; + + if (pop == const0_rtx) + pop = NULL; + gcc_assert (!TARGET_64BIT || !pop); + + if (TARGET_MACHO && !TARGET_64BIT) + { +#if TARGET_MACHO + if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF) + fnaddr = machopic_indirect_call_target (fnaddr); +#endif + } + else + { + /* Static functions and indirect calls don't need the pic register. */ + if (flag_pic && (!TARGET_64BIT || ix86_cmodel == CM_LARGE_PIC) + && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF + && ! SYMBOL_REF_LOCAL_P (XEXP (fnaddr, 0))) + use_reg (&use, pic_offset_table_rtx); + } + + if (TARGET_64BIT && INTVAL (callarg2) >= 0) + { + rtx al = gen_rtx_REG (QImode, AX_REG); + emit_move_insn (al, callarg2); + use_reg (&use, al); + } + + if (ix86_cmodel == CM_LARGE_PIC + && MEM_P (fnaddr) + && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF + && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode)) + fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0))); + else if (sibcall + ? !sibcall_insn_operand (XEXP (fnaddr, 0), Pmode) + : !call_insn_operand (XEXP (fnaddr, 0), Pmode)) + { + fnaddr = copy_to_mode_reg (Pmode, XEXP (fnaddr, 0)); + fnaddr = gen_rtx_MEM (QImode, fnaddr); + } + + call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1); + if (retval) + call = gen_rtx_SET (VOIDmode, retval, call); + if (pop) + { + pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop); + pop = gen_rtx_SET (VOIDmode, stack_pointer_rtx, pop); + call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, call, pop)); + } + if (TARGET_64BIT + && ix86_cfun_abi () == MS_ABI + && (!callarg2 || INTVAL (callarg2) != -2)) + { + /* We need to represent that SI and DI registers are clobbered + by SYSV calls. */ + static int clobbered_registers[] = { + XMM6_REG, XMM7_REG, XMM8_REG, + XMM9_REG, XMM10_REG, XMM11_REG, + XMM12_REG, XMM13_REG, XMM14_REG, + XMM15_REG, SI_REG, DI_REG + }; + unsigned int i; + rtx vec[ARRAY_SIZE (clobbered_registers) + 2]; + rtx unspec = gen_rtx_UNSPEC (VOIDmode, gen_rtvec (1, const0_rtx), + UNSPEC_MS_TO_SYSV_CALL); + + vec[0] = call; + vec[1] = unspec; + for (i = 0; i < ARRAY_SIZE (clobbered_registers); i++) + vec[i + 2] = gen_rtx_CLOBBER (SSE_REGNO_P (clobbered_registers[i]) + ? TImode : DImode, + gen_rtx_REG + (SSE_REGNO_P (clobbered_registers[i]) + ? TImode : DImode, + clobbered_registers[i])); + + call = gen_rtx_PARALLEL (VOIDmode, + gen_rtvec_v (ARRAY_SIZE (clobbered_registers) + + 2, vec)); + } + + /* Add UNSPEC_CALL_NEEDS_VZEROUPPER decoration. 
*/ + if (TARGET_VZEROUPPER) + { + rtx unspec; + int avx256; + + if (cfun->machine->callee_pass_avx256_p) + { + if (cfun->machine->callee_return_avx256_p) + avx256 = callee_return_pass_avx256; + else + avx256 = callee_pass_avx256; + } + else if (cfun->machine->callee_return_avx256_p) + avx256 = callee_return_avx256; + else + avx256 = call_no_avx256; + + if (reload_completed) + emit_insn (gen_avx_vzeroupper (GEN_INT (avx256))); + else + { + unspec = gen_rtx_UNSPEC (VOIDmode, + gen_rtvec (1, GEN_INT (avx256)), + UNSPEC_CALL_NEEDS_VZEROUPPER); + call = gen_rtx_PARALLEL (VOIDmode, + gen_rtvec (2, call, unspec)); + } + } + + call = emit_call_insn (call); + if (use) + CALL_INSN_FUNCTION_USAGE (call) = use; + + return call; +} + +void +ix86_split_call_vzeroupper (rtx insn, rtx vzeroupper) +{ + rtx call = XVECEXP (PATTERN (insn), 0, 0); + emit_insn (gen_avx_vzeroupper (vzeroupper)); + emit_call_insn (call); +} + +/* Output the assembly for a call instruction. */ + +const char * +ix86_output_call_insn (rtx insn, rtx call_op, int addr_op) +{ + bool direct_p = constant_call_address_operand (call_op, Pmode); + bool seh_nop_p = false; + + gcc_assert (addr_op == 0 || addr_op == 1); + + if (SIBLING_CALL_P (insn)) + { + if (direct_p) + return addr_op ? "jmp\t%P1" : "jmp\t%P0"; + /* SEH epilogue detection requires the indirect branch case + to include REX.W. */ + else if (TARGET_SEH) + return addr_op ? "rex.W jmp %A1" : "rex.W jmp %A0"; + else + return addr_op ? "jmp\t%A1" : "jmp\t%A0"; + } + + /* SEH unwinding can require an extra nop to be emitted in several + circumstances. Determine if we have one of those. */ + if (TARGET_SEH) + { + rtx i; + + for (i = NEXT_INSN (insn); i ; i = NEXT_INSN (i)) + { + /* If we get to another real insn, we don't need the nop. */ + if (INSN_P (i)) + break; + + /* If we get to the epilogue note, prevent a catch region from + being adjacent to the standard epilogue sequence. If non- + call-exceptions, we'll have done this during epilogue emission. */ + if (NOTE_P (i) && NOTE_KIND (i) == NOTE_INSN_EPILOGUE_BEG + && !flag_non_call_exceptions + && !can_throw_internal (insn)) + { + seh_nop_p = true; + break; + } + } + + /* If we didn't find a real insn following the call, prevent the + unwinder from looking into the next function. */ + if (i == NULL) + seh_nop_p = true; + } + + if (direct_p) + { + if (seh_nop_p) + return addr_op ? "call\t%P1\n\tnop" : "call\t%P0\n\tnop"; + else + return addr_op ? "call\t%P1" : "call\t%P0"; + } + else + { + if (seh_nop_p) + return addr_op ? "call\t%A1\n\tnop" : "call\t%A0\n\tnop"; + else + return addr_op ? "call\t%A1" : "call\t%A0"; + } +} + +/* Clear stack slot assignments remembered from previous functions. + This is called from INIT_EXPANDERS once before RTL is emitted for each + function. */ + +static struct machine_function * +ix86_init_machine_status (void) +{ + struct machine_function *f; + + f = ggc_alloc_cleared_machine_function (); + f->use_fast_prologue_epilogue_nregs = -1; + f->tls_descriptor_call_expanded_p = 0; + f->call_abi = ix86_abi; + + return f; +} + +/* Return a MEM corresponding to a stack slot with mode MODE. + Allocate a new slot if necessary. + + The RTL for a function can have several slots available: N is + which slot to use. */ + +rtx +assign_386_stack_local (enum machine_mode mode, enum ix86_stack_slot n) +{ + struct stack_local_entry *s; + + gcc_assert (n < MAX_386_STACK_LOCALS); + + /* Virtual slot is valid only before vregs are instantiated. 
*/ + gcc_assert ((n == SLOT_VIRTUAL) == !virtuals_instantiated); + + for (s = ix86_stack_locals; s; s = s->next) + if (s->mode == mode && s->n == n) + return validize_mem (copy_rtx (s->rtl)); + + s = ggc_alloc_stack_local_entry (); + s->n = n; + s->mode = mode; + s->rtl = assign_stack_local (mode, GET_MODE_SIZE (mode), 0); + + s->next = ix86_stack_locals; + ix86_stack_locals = s; + return validize_mem (s->rtl); +} + +/* Construct the SYMBOL_REF for the tls_get_addr function. */ + +static GTY(()) rtx ix86_tls_symbol; +rtx +ix86_tls_get_addr (void) +{ + + if (!ix86_tls_symbol) + { + ix86_tls_symbol = gen_rtx_SYMBOL_REF (Pmode, + (TARGET_ANY_GNU_TLS + && !TARGET_64BIT) + ? "___tls_get_addr" + : "__tls_get_addr"); + } + + return ix86_tls_symbol; +} + +/* Construct the SYMBOL_REF for the _TLS_MODULE_BASE_ symbol. */ + +static GTY(()) rtx ix86_tls_module_base_symbol; +rtx +ix86_tls_module_base (void) +{ + + if (!ix86_tls_module_base_symbol) + { + ix86_tls_module_base_symbol = gen_rtx_SYMBOL_REF (Pmode, + "_TLS_MODULE_BASE_"); + SYMBOL_REF_FLAGS (ix86_tls_module_base_symbol) + |= TLS_MODEL_GLOBAL_DYNAMIC << SYMBOL_FLAG_TLS_SHIFT; + } + + return ix86_tls_module_base_symbol; +} + +/* Calculate the length of the memory address in the instruction + encoding. Does not include the one-byte modrm, opcode, or prefix. */ + +int +memory_address_length (rtx addr) +{ + struct ix86_address parts; + rtx base, index, disp; + int len; + int ok; + + if (GET_CODE (addr) == PRE_DEC + || GET_CODE (addr) == POST_INC + || GET_CODE (addr) == PRE_MODIFY + || GET_CODE (addr) == POST_MODIFY) + return 0; + + ok = ix86_decompose_address (addr, &parts); + gcc_assert (ok); + + if (parts.base && GET_CODE (parts.base) == SUBREG) + parts.base = SUBREG_REG (parts.base); + if (parts.index && GET_CODE (parts.index) == SUBREG) + parts.index = SUBREG_REG (parts.index); + + base = parts.base; + index = parts.index; + disp = parts.disp; + len = 0; + + /* Rule of thumb: + - esp as the base always wants an index, + - ebp as the base always wants a displacement, + - r12 as the base always wants an index, + - r13 as the base always wants a displacement. */ + + /* Register Indirect. */ + if (base && !index && !disp) + { + /* esp (for its index) and ebp (for its displacement) need + the two-byte modrm form. Similarly for r12 and r13 in 64-bit + code. */ + if (REG_P (addr) + && (addr == arg_pointer_rtx + || addr == frame_pointer_rtx + || REGNO (addr) == SP_REG + || REGNO (addr) == BP_REG + || REGNO (addr) == R12_REG + || REGNO (addr) == R13_REG)) + len = 1; + } + + /* Direct Addressing. In 64-bit mode mod 00 r/m 5 + is not disp32, but disp32(%rip), so for disp32 + SIB byte is needed, unless print_operand_address + optimizes it into disp32(%rip) or (%rip) is implied + by UNSPEC. */ + else if (disp && !base && !index) + { + len = 4; + if (TARGET_64BIT) + { + rtx symbol = disp; + + if (GET_CODE (disp) == CONST) + symbol = XEXP (disp, 0); + if (GET_CODE (symbol) == PLUS + && CONST_INT_P (XEXP (symbol, 1))) + symbol = XEXP (symbol, 0); + + if (GET_CODE (symbol) != LABEL_REF + && (GET_CODE (symbol) != SYMBOL_REF + || SYMBOL_REF_TLS_MODEL (symbol) != 0) + && (GET_CODE (symbol) != UNSPEC + || (XINT (symbol, 1) != UNSPEC_GOTPCREL + && XINT (symbol, 1) != UNSPEC_PCREL + && XINT (symbol, 1) != UNSPEC_GOTNTPOFF))) + len += 1; + } + } + + else + { + /* Find the length of the displacement constant. */ + if (disp) + { + if (base && satisfies_constraint_K (disp)) + len = 1; + else + len = 4; + } + /* ebp always wants a displacement. Similarly r13. 
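The rule-of-thumb comments above compress the ModRM/SIB encoding rules. A standalone sketch of the same accounting for the base/index/displacement case; the boolean parameters and the signed-8-bit displacement test are simplifications of ix86_decompose_address and constraint K, not the real machinery:

#include <stdio.h>
#include <stdbool.h>

/* Rough model of the tail of memory_address_length: extra bytes needed
   beyond modrm/opcode/prefixes for a base+index+disp operand.  */
static int
addr_extra_bytes (bool has_base, bool base_is_sp_or_r12,
                  bool base_is_bp_or_r13, bool has_index, long disp)
{
  int len = 0;

  if (disp != 0)
    len = (has_base && disp >= -128 && disp <= 127) ? 1 : 4;  /* disp8 vs disp32 */
  else if (has_base && base_is_bp_or_r13)
    len = 1;                       /* ebp/r13 always want a displacement */

  if (has_index || (has_base && base_is_sp_or_r12))
    len += 1;                      /* SIB byte: index, or esp/r12 as base */

  return len;
}

int
main (void)
{
  printf ("(%%eax): %d\n", addr_extra_bytes (true, false, false, false, 0));   /* 0 */
  printf ("(%%ebp): %d\n", addr_extra_bytes (true, false, true,  false, 0));   /* 1 */
  printf ("8(%%esp): %d\n", addr_extra_bytes (true, true,  false, false, 8));  /* 2 */
  printf ("0x100(%%eax,%%ebx): %d\n",
          addr_extra_bytes (true, false, false, true, 0x100));                 /* 5 */
  return 0;
}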
*/ + else if (base && REG_P (base) + && (REGNO (base) == BP_REG || REGNO (base) == R13_REG)) + len = 1; + + /* An index requires the two-byte modrm form.... */ + if (index + /* ...like esp (or r12), which always wants an index. */ + || base == arg_pointer_rtx + || base == frame_pointer_rtx + || (base && REG_P (base) + && (REGNO (base) == SP_REG || REGNO (base) == R12_REG))) + len += 1; + } + + switch (parts.seg) + { + case SEG_FS: + case SEG_GS: + len += 1; + break; + default: + break; + } + + return len; +} + +/* Compute default value for "length_immediate" attribute. When SHORTFORM + is set, expect that insn have 8bit immediate alternative. */ +int +ix86_attr_length_immediate_default (rtx insn, int shortform) +{ + int len = 0; + int i; + extract_insn_cached (insn); + for (i = recog_data.n_operands - 1; i >= 0; --i) + if (CONSTANT_P (recog_data.operand[i])) + { + enum attr_mode mode = get_attr_mode (insn); + + gcc_assert (!len); + if (shortform && CONST_INT_P (recog_data.operand[i])) + { + HOST_WIDE_INT ival = INTVAL (recog_data.operand[i]); + switch (mode) + { + case MODE_QI: + len = 1; + continue; + case MODE_HI: + ival = trunc_int_for_mode (ival, HImode); + break; + case MODE_SI: + ival = trunc_int_for_mode (ival, SImode); + break; + default: + break; + } + if (IN_RANGE (ival, -128, 127)) + { + len = 1; + continue; + } + } + switch (mode) + { + case MODE_QI: + len = 1; + break; + case MODE_HI: + len = 2; + break; + case MODE_SI: + len = 4; + break; + /* Immediates for DImode instructions are encoded as 32bit sign extended values. */ + case MODE_DI: + len = 4; + break; + default: + fatal_insn ("unknown insn mode", insn); + } + } + return len; +} +/* Compute default value for "length_address" attribute. */ +int +ix86_attr_length_address_default (rtx insn) +{ + int i; + + if (get_attr_type (insn) == TYPE_LEA) + { + rtx set = PATTERN (insn), addr; + + if (GET_CODE (set) == PARALLEL) + set = XVECEXP (set, 0, 0); + + gcc_assert (GET_CODE (set) == SET); + + addr = SET_SRC (set); + if (TARGET_64BIT && get_attr_mode (insn) == MODE_SI) + { + if (GET_CODE (addr) == ZERO_EXTEND) + addr = XEXP (addr, 0); + if (GET_CODE (addr) == SUBREG) + addr = SUBREG_REG (addr); + } + + return memory_address_length (addr); + } + + extract_insn_cached (insn); + for (i = recog_data.n_operands - 1; i >= 0; --i) + if (MEM_P (recog_data.operand[i])) + { + constrain_operands_cached (reload_completed); + if (which_alternative != -1) + { + const char *constraints = recog_data.constraints[i]; + int alt = which_alternative; + + while (*constraints == '=' || *constraints == '+') + constraints++; + while (alt-- > 0) + while (*constraints++ != ',') + ; + /* Skip ignored operands. */ + if (*constraints == 'X') + continue; + } + return memory_address_length (XEXP (recog_data.operand[i], 0)); + } + return 0; +} + +/* Compute default value for "length_vex" attribute. It includes + 2 or 3 byte VEX prefix and 1 opcode byte. */ + +int +ix86_attr_length_vex_default (rtx insn, int has_0f_opcode, + int has_vex_w) +{ + int i; + + /* Only 0f opcode can use 2 byte VEX prefix and VEX W bit uses 3 + byte VEX prefix. */ + if (!has_0f_opcode || has_vex_w) + return 3 + 1; + + /* We can always use 2 byte VEX prefix in 32bit. */ + if (!TARGET_64BIT) + return 2 + 1; + + extract_insn_cached (insn); + + for (i = recog_data.n_operands - 1; i >= 0; --i) + if (REG_P (recog_data.operand[i])) + { + /* REX.W bit uses 3 byte VEX prefix. 
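The immediate-size rule implemented above can be restated compactly; in this sketch the imm_mode values stand in for the MODE_* attribute values, and the truncation of HImode/SImode constants to their mode is omitted for brevity:

#include <stdio.h>
#include <stdbool.h>

enum imm_mode { IMM_QI, IMM_HI, IMM_SI, IMM_DI };

/* Bytes of immediate encoding: with a short-form alternative, a value in
   [-128, 127] takes a single byte; otherwise QI=1, HI=2, SI=4, and DImode
   immediates are encoded as 32-bit sign-extended values.  */
static int
imm_length (enum imm_mode mode, long long ival, bool shortform)
{
  if (shortform && ival >= -128 && ival <= 127)
    return 1;
  switch (mode)
    {
    case IMM_QI: return 1;
    case IMM_HI: return 2;
    case IMM_SI: return 4;
    case IMM_DI: return 4;
    }
  return 0;
}

int
main (void)
{
  printf ("add $1, %%rax    -> imm bytes: %d\n", imm_length (IMM_DI, 1, true));     /* 1 */
  printf ("add $1000, %%rax -> imm bytes: %d\n", imm_length (IMM_DI, 1000, true));  /* 4 */
  printf ("mov $5, %%cx     -> imm bytes: %d\n", imm_length (IMM_HI, 5, false));    /* 2 */
  return 0;
}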
*/ + if (GET_MODE (recog_data.operand[i]) == DImode + && GENERAL_REG_P (recog_data.operand[i])) + return 3 + 1; + } + else + { + /* REX.X or REX.B bits use 3 byte VEX prefix. */ + if (MEM_P (recog_data.operand[i]) + && x86_extended_reg_mentioned_p (recog_data.operand[i])) + return 3 + 1; + } + + return 2 + 1; +} + +/* Return the maximum number of instructions a cpu can issue. */ + +static int +ix86_issue_rate (void) +{ + switch (ix86_tune) + { + case PROCESSOR_PENTIUM: + case PROCESSOR_ATOM: + case PROCESSOR_K6: + return 2; + + case PROCESSOR_PENTIUMPRO: + case PROCESSOR_PENTIUM4: + case PROCESSOR_CORE2_32: + case PROCESSOR_CORE2_64: + case PROCESSOR_COREI7_32: + case PROCESSOR_COREI7_64: + case PROCESSOR_ATHLON: + case PROCESSOR_K8: + case PROCESSOR_AMDFAM10: + case PROCESSOR_NOCONA: + case PROCESSOR_GENERIC32: + case PROCESSOR_GENERIC64: + case PROCESSOR_BDVER1: + case PROCESSOR_BTVER1: + return 3; + + default: + return 1; + } +} + +/* A subroutine of ix86_adjust_cost -- return true iff INSN reads flags set + by DEP_INSN and nothing set by DEP_INSN. */ + +static int +ix86_flags_dependent (rtx insn, rtx dep_insn, enum attr_type insn_type) +{ + rtx set, set2; + + /* Simplify the test for uninteresting insns. */ + if (insn_type != TYPE_SETCC + && insn_type != TYPE_ICMOV + && insn_type != TYPE_FCMOV + && insn_type != TYPE_IBR) + return 0; + + if ((set = single_set (dep_insn)) != 0) + { + set = SET_DEST (set); + set2 = NULL_RTX; + } + else if (GET_CODE (PATTERN (dep_insn)) == PARALLEL + && XVECLEN (PATTERN (dep_insn), 0) == 2 + && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 0)) == SET + && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 1)) == SET) + { + set = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0)); + set2 = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0)); + } + else + return 0; + + if (!REG_P (set) || REGNO (set) != FLAGS_REG) + return 0; + + /* This test is true if the dependent insn reads the flags but + not any other potentially set register. */ + if (!reg_overlap_mentioned_p (set, PATTERN (insn))) + return 0; + + if (set2 && reg_overlap_mentioned_p (set2, PATTERN (insn))) + return 0; + + return 1; +} + +/* Return true iff USE_INSN has a memory address with operands set by + SET_INSN. */ + +bool +ix86_agi_dependent (rtx set_insn, rtx use_insn) +{ + int i; + extract_insn_cached (use_insn); + for (i = recog_data.n_operands - 1; i >= 0; --i) + if (MEM_P (recog_data.operand[i])) + { + rtx addr = XEXP (recog_data.operand[i], 0); + return modified_in_p (addr, set_insn) != 0; + } + return false; +} + +static int +ix86_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost) +{ + enum attr_type insn_type, dep_insn_type; + enum attr_memory memory; + rtx set, set2; + int dep_insn_code_number; + + /* Anti and output dependencies have zero cost on all CPUs. */ + if (REG_NOTE_KIND (link) != 0) + return 0; + + dep_insn_code_number = recog_memoized (dep_insn); + + /* If we can't recognize the insns, we can't really do anything. */ + if (dep_insn_code_number < 0 || recog_memoized (insn) < 0) + return cost; + + insn_type = get_attr_type (insn); + dep_insn_type = get_attr_type (dep_insn); + + switch (ix86_tune) + { + case PROCESSOR_PENTIUM: + /* Address Generation Interlock adds a cycle of latency. 
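The Pentium case below charges one extra cycle when an Address Generation Interlock occurs, i.e. when the producing insn writes a register that the consuming insn's memory address reads (the check ix86_agi_dependent performs above). A toy model of that adjustment, with register sets reduced to bitmasks for illustration:

#include <stdio.h>

/* If the producer writes any register the consumer's address uses,
   add one cycle of latency; otherwise leave the cost alone.  */
static int
adjust_cost_for_agi (int cost, unsigned producer_defs, unsigned consumer_addr_uses)
{
  if (producer_defs & consumer_addr_uses)
    return cost + 1;
  return cost;
}

int
main (void)
{
  /* add %ebx, %eax  then  mov (%eax), %ecx : the address depends on %eax.  */
  printf ("%d\n", adjust_cost_for_agi (1, 1u << 0 /* eax */, 1u << 0));  /* 2 */
  /* add %ebx, %edx  then  mov (%eax), %ecx : no interlock.  */
  printf ("%d\n", adjust_cost_for_agi (1, 1u << 2 /* edx */, 1u << 0));  /* 1 */
  return 0;
}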
*/ + if (insn_type == TYPE_LEA) + { + rtx addr = PATTERN (insn); + + if (GET_CODE (addr) == PARALLEL) + addr = XVECEXP (addr, 0, 0); + + gcc_assert (GET_CODE (addr) == SET); + + addr = SET_SRC (addr); + if (modified_in_p (addr, dep_insn)) + cost += 1; + } + else if (ix86_agi_dependent (dep_insn, insn)) + cost += 1; + + /* ??? Compares pair with jump/setcc. */ + if (ix86_flags_dependent (insn, dep_insn, insn_type)) + cost = 0; + + /* Floating point stores require value to be ready one cycle earlier. */ + if (insn_type == TYPE_FMOV + && get_attr_memory (insn) == MEMORY_STORE + && !ix86_agi_dependent (dep_insn, insn)) + cost += 1; + break; + + case PROCESSOR_PENTIUMPRO: + memory = get_attr_memory (insn); + + /* INT->FP conversion is expensive. */ + if (get_attr_fp_int_src (dep_insn)) + cost += 5; + + /* There is one cycle extra latency between an FP op and a store. */ + if (insn_type == TYPE_FMOV + && (set = single_set (dep_insn)) != NULL_RTX + && (set2 = single_set (insn)) != NULL_RTX + && rtx_equal_p (SET_DEST (set), SET_SRC (set2)) + && MEM_P (SET_DEST (set2))) + cost += 1; + + /* Show ability of reorder buffer to hide latency of load by executing + in parallel with previous instruction in case + previous instruction is not needed to compute the address. */ + if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH) + && !ix86_agi_dependent (dep_insn, insn)) + { + /* Claim moves to take one cycle, as core can issue one load + at time and the next load can start cycle later. */ + if (dep_insn_type == TYPE_IMOV + || dep_insn_type == TYPE_FMOV) + cost = 1; + else if (cost > 1) + cost--; + } + break; + + case PROCESSOR_K6: + memory = get_attr_memory (insn); + + /* The esp dependency is resolved before the instruction is really + finished. */ + if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP) + && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP)) + return 1; + + /* INT->FP conversion is expensive. */ + if (get_attr_fp_int_src (dep_insn)) + cost += 5; + + /* Show ability of reorder buffer to hide latency of load by executing + in parallel with previous instruction in case + previous instruction is not needed to compute the address. */ + if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH) + && !ix86_agi_dependent (dep_insn, insn)) + { + /* Claim moves to take one cycle, as core can issue one load + at time and the next load can start cycle later. */ + if (dep_insn_type == TYPE_IMOV + || dep_insn_type == TYPE_FMOV) + cost = 1; + else if (cost > 2) + cost -= 2; + else + cost = 1; + } + break; + + case PROCESSOR_ATHLON: + case PROCESSOR_K8: + case PROCESSOR_AMDFAM10: + case PROCESSOR_BDVER1: + case PROCESSOR_BTVER1: + case PROCESSOR_ATOM: + case PROCESSOR_GENERIC32: + case PROCESSOR_GENERIC64: + memory = get_attr_memory (insn); + + /* Show ability of reorder buffer to hide latency of load by executing + in parallel with previous instruction in case + previous instruction is not needed to compute the address. */ + if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH) + && !ix86_agi_dependent (dep_insn, insn)) + { + enum attr_unit unit = get_attr_unit (insn); + int loadcost = 3; + + /* Because of the difference between the length of integer and + floating unit pipeline preparation stages, the memory operands + for floating point are cheaper. + + ??? For Athlon it the difference is most probably 2. */ + if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN) + loadcost = 3; + else + loadcost = TARGET_ATHLON ? 
2 : 0; + + if (cost >= loadcost) + cost -= loadcost; + else + cost = 0; + } + + default: + break; + } + + return cost; +} + +/* How many alternative schedules to try. This should be as wide as the + scheduling freedom in the DFA, but no wider. Making this value too + large results extra work for the scheduler. */ + +static int +ia32_multipass_dfa_lookahead (void) +{ + switch (ix86_tune) + { + case PROCESSOR_PENTIUM: + return 2; + + case PROCESSOR_PENTIUMPRO: + case PROCESSOR_K6: + return 1; + + case PROCESSOR_CORE2_32: + case PROCESSOR_CORE2_64: + case PROCESSOR_COREI7_32: + case PROCESSOR_COREI7_64: + /* Generally, we want haifa-sched:max_issue() to look ahead as far + as many instructions can be executed on a cycle, i.e., + issue_rate. I wonder why tuning for many CPUs does not do this. */ + return ix86_issue_rate (); + + default: + return 0; + } +} + + + +/* Model decoder of Core 2/i7. + Below hooks for multipass scheduling (see haifa-sched.c:max_issue) + track the instruction fetch block boundaries and make sure that long + (9+ bytes) instructions are assigned to D0. */ + +/* Maximum length of an insn that can be handled by + a secondary decoder unit. '8' for Core 2/i7. */ +static int core2i7_secondary_decoder_max_insn_size; + +/* Ifetch block size, i.e., number of bytes decoder reads per cycle. + '16' for Core 2/i7. */ +static int core2i7_ifetch_block_size; + +/* Maximum number of instructions decoder can handle per cycle. + '6' for Core 2/i7. */ +static int core2i7_ifetch_block_max_insns; + +typedef struct ix86_first_cycle_multipass_data_ * + ix86_first_cycle_multipass_data_t; +typedef const struct ix86_first_cycle_multipass_data_ * + const_ix86_first_cycle_multipass_data_t; + +/* A variable to store target state across calls to max_issue within + one cycle. */ +static struct ix86_first_cycle_multipass_data_ _ix86_first_cycle_multipass_data, + *ix86_first_cycle_multipass_data = &_ix86_first_cycle_multipass_data; + +/* Initialize DATA. */ +static void +core2i7_first_cycle_multipass_init (void *_data) +{ + ix86_first_cycle_multipass_data_t data + = (ix86_first_cycle_multipass_data_t) _data; + + data->ifetch_block_len = 0; + data->ifetch_block_n_insns = 0; + data->ready_try_change = NULL; + data->ready_try_change_size = 0; +} + +/* Advancing the cycle; reset ifetch block counts. */ +static void +core2i7_dfa_post_advance_cycle (void) +{ + ix86_first_cycle_multipass_data_t data = ix86_first_cycle_multipass_data; + + gcc_assert (data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns); + + data->ifetch_block_len = 0; + data->ifetch_block_n_insns = 0; +} + +static int min_insn_size (rtx); + +/* Filter out insns from ready_try that the core will not be able to issue + on current cycle due to decoder. */ +static void +core2i7_first_cycle_multipass_filter_ready_try +(const_ix86_first_cycle_multipass_data_t data, + char *ready_try, int n_ready, bool first_cycle_insn_p) +{ + while (n_ready--) + { + rtx insn; + int insn_size; + + if (ready_try[n_ready]) + continue; + + insn = get_ready_element (n_ready); + insn_size = min_insn_size (insn); + + if (/* If this is a too long an insn for a secondary decoder ... */ + (!first_cycle_insn_p + && insn_size > core2i7_secondary_decoder_max_insn_size) + /* ... or it would not fit into the ifetch block ... */ + || data->ifetch_block_len + insn_size > core2i7_ifetch_block_size + /* ... or the decoder is full already ... */ + || data->ifetch_block_n_insns + 1 > core2i7_ifetch_block_max_insns) + /* ... mask the insn out. 
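The three tests above (secondary-decoder insn size, remaining room in the ifetch block, decoder slots left) are all the model tracks. With the Core 2/i7 parameters installed later in ix86_sched_init_global (8, 16 and 6), a standalone sketch of the same admission check:

#include <stdio.h>
#include <stdbool.h>

/* Decoder-model parameters used for Core 2/i7 below.  */
#define SECONDARY_DECODER_MAX_INSN_SIZE 8
#define IFETCH_BLOCK_SIZE               16
#define IFETCH_BLOCK_MAX_INSNS          6

struct decode_state
{
  int ifetch_block_len;      /* bytes already consumed in this fetch block */
  int ifetch_block_n_insns;  /* insns already decoded this cycle */
};

/* Mirror of the filter above: can an insn of INSN_SIZE bytes still issue
   this cycle?  The model assumes that long (9+ byte) insns go to the first
   decoder, so only the first insn of a cycle may exceed the secondary
   decoder limit.  */
static bool
can_issue (const struct decode_state *s, int insn_size, bool first_cycle_insn_p)
{
  if (!first_cycle_insn_p && insn_size > SECONDARY_DECODER_MAX_INSN_SIZE)
    return false;
  if (s->ifetch_block_len + insn_size > IFETCH_BLOCK_SIZE)
    return false;
  if (s->ifetch_block_n_insns + 1 > IFETCH_BLOCK_MAX_INSNS)
    return false;
  return true;
}

int
main (void)
{
  struct decode_state s = { 10, 2 };
  printf ("%d\n", can_issue (&s, 9, false));  /* 0: too long for a secondary decoder */
  printf ("%d\n", can_issue (&s, 7, false));  /* 0: would overflow the 16-byte block */
  printf ("%d\n", can_issue (&s, 5, false));  /* 1: fits */
  return 0;
}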
*/ + { + ready_try[n_ready] = 1; + + if (data->ready_try_change) + SET_BIT (data->ready_try_change, n_ready); + } + } +} + +/* Prepare for a new round of multipass lookahead scheduling. */ +static void +core2i7_first_cycle_multipass_begin (void *_data, char *ready_try, int n_ready, + bool first_cycle_insn_p) +{ + ix86_first_cycle_multipass_data_t data + = (ix86_first_cycle_multipass_data_t) _data; + const_ix86_first_cycle_multipass_data_t prev_data + = ix86_first_cycle_multipass_data; + + /* Restore the state from the end of the previous round. */ + data->ifetch_block_len = prev_data->ifetch_block_len; + data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns; + + /* Filter instructions that cannot be issued on current cycle due to + decoder restrictions. */ + core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready, + first_cycle_insn_p); +} + +/* INSN is being issued in current solution. Account for its impact on + the decoder model. */ +static void +core2i7_first_cycle_multipass_issue (void *_data, char *ready_try, int n_ready, + rtx insn, const void *_prev_data) +{ + ix86_first_cycle_multipass_data_t data + = (ix86_first_cycle_multipass_data_t) _data; + const_ix86_first_cycle_multipass_data_t prev_data + = (const_ix86_first_cycle_multipass_data_t) _prev_data; + + int insn_size = min_insn_size (insn); + + data->ifetch_block_len = prev_data->ifetch_block_len + insn_size; + data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns + 1; + gcc_assert (data->ifetch_block_len <= core2i7_ifetch_block_size + && data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns); + + /* Allocate or resize the bitmap for storing INSN's effect on ready_try. */ + if (!data->ready_try_change) + { + data->ready_try_change = sbitmap_alloc (n_ready); + data->ready_try_change_size = n_ready; + } + else if (data->ready_try_change_size < n_ready) + { + data->ready_try_change = sbitmap_resize (data->ready_try_change, + n_ready, 0); + data->ready_try_change_size = n_ready; + } + sbitmap_zero (data->ready_try_change); + + /* Filter out insns from ready_try that the core will not be able to issue + on current cycle due to decoder. */ + core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready, + false); +} + +/* Revert the effect on ready_try. */ +static void +core2i7_first_cycle_multipass_backtrack (const void *_data, + char *ready_try, + int n_ready ATTRIBUTE_UNUSED) +{ + const_ix86_first_cycle_multipass_data_t data + = (const_ix86_first_cycle_multipass_data_t) _data; + unsigned int i = 0; + sbitmap_iterator sbi; + + gcc_assert (sbitmap_last_set_bit (data->ready_try_change) < n_ready); + EXECUTE_IF_SET_IN_SBITMAP (data->ready_try_change, 0, i, sbi) + { + ready_try[i] = 0; + } +} + +/* Save the result of multipass lookahead scheduling for the next round. */ +static void +core2i7_first_cycle_multipass_end (const void *_data) +{ + const_ix86_first_cycle_multipass_data_t data + = (const_ix86_first_cycle_multipass_data_t) _data; + ix86_first_cycle_multipass_data_t next_data + = ix86_first_cycle_multipass_data; + + if (data != NULL) + { + next_data->ifetch_block_len = data->ifetch_block_len; + next_data->ifetch_block_n_insns = data->ifetch_block_n_insns; + } +} + +/* Deallocate target data. 
*/ +static void +core2i7_first_cycle_multipass_fini (void *_data) +{ + ix86_first_cycle_multipass_data_t data + = (ix86_first_cycle_multipass_data_t) _data; + + if (data->ready_try_change) + { + sbitmap_free (data->ready_try_change); + data->ready_try_change = NULL; + data->ready_try_change_size = 0; + } +} + +/* Prepare for scheduling pass. */ +static void +ix86_sched_init_global (FILE *dump ATTRIBUTE_UNUSED, + int verbose ATTRIBUTE_UNUSED, + int max_uid ATTRIBUTE_UNUSED) +{ + /* Install scheduling hooks for current CPU. Some of these hooks are used + in time-critical parts of the scheduler, so we only set them up when + they are actually used. */ + switch (ix86_tune) + { + case PROCESSOR_CORE2_32: + case PROCESSOR_CORE2_64: + case PROCESSOR_COREI7_32: + case PROCESSOR_COREI7_64: + targetm.sched.dfa_post_advance_cycle + = core2i7_dfa_post_advance_cycle; + targetm.sched.first_cycle_multipass_init + = core2i7_first_cycle_multipass_init; + targetm.sched.first_cycle_multipass_begin + = core2i7_first_cycle_multipass_begin; + targetm.sched.first_cycle_multipass_issue + = core2i7_first_cycle_multipass_issue; + targetm.sched.first_cycle_multipass_backtrack + = core2i7_first_cycle_multipass_backtrack; + targetm.sched.first_cycle_multipass_end + = core2i7_first_cycle_multipass_end; + targetm.sched.first_cycle_multipass_fini + = core2i7_first_cycle_multipass_fini; + + /* Set decoder parameters. */ + core2i7_secondary_decoder_max_insn_size = 8; + core2i7_ifetch_block_size = 16; + core2i7_ifetch_block_max_insns = 6; + break; + + default: + targetm.sched.dfa_post_advance_cycle = NULL; + targetm.sched.first_cycle_multipass_init = NULL; + targetm.sched.first_cycle_multipass_begin = NULL; + targetm.sched.first_cycle_multipass_issue = NULL; + targetm.sched.first_cycle_multipass_backtrack = NULL; + targetm.sched.first_cycle_multipass_end = NULL; + targetm.sched.first_cycle_multipass_fini = NULL; + break; + } +} + + +/* Compute the alignment given to a constant that is being placed in memory. + EXP is the constant and ALIGN is the alignment that the object would + ordinarily have. + The value of this function is used instead of that alignment to align + the object. */ + +int +ix86_constant_alignment (tree exp, int align) +{ + if (TREE_CODE (exp) == REAL_CST || TREE_CODE (exp) == VECTOR_CST + || TREE_CODE (exp) == INTEGER_CST) + { + if (TYPE_MODE (TREE_TYPE (exp)) == DFmode && align < 64) + return 64; + else if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (exp))) && align < 128) + return 128; + } + else if (!optimize_size && TREE_CODE (exp) == STRING_CST + && TREE_STRING_LENGTH (exp) >= 31 && align < BITS_PER_WORD) + return BITS_PER_WORD; + + return align; +} + +/* Compute the alignment for a static variable. + TYPE is the data type, and ALIGN is the alignment that + the object would ordinarily have. The value of this function is used + instead of that alignment to align the object. */ + +int +ix86_data_alignment (tree type, int align) +{ + int max_align = optimize_size ? BITS_PER_WORD : MIN (256, MAX_OFILE_ALIGNMENT); + + if (AGGREGATE_TYPE_P (type) + && TYPE_SIZE (type) + && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST + && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= (unsigned) max_align + || TREE_INT_CST_HIGH (TYPE_SIZE (type))) + && align < max_align) + align = max_align; + + /* x86-64 ABI requires arrays greater than 16 bytes to be aligned + to 16byte boundary. 
*/ + if (TARGET_64BIT) + { + if (AGGREGATE_TYPE_P (type) + && TYPE_SIZE (type) + && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST + && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 128 + || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128) + return 128; + } + + if (TREE_CODE (type) == ARRAY_TYPE) + { + if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64) + return 64; + if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128) + return 128; + } + else if (TREE_CODE (type) == COMPLEX_TYPE) + { + + if (TYPE_MODE (type) == DCmode && align < 64) + return 64; + if ((TYPE_MODE (type) == XCmode + || TYPE_MODE (type) == TCmode) && align < 128) + return 128; + } + else if ((TREE_CODE (type) == RECORD_TYPE + || TREE_CODE (type) == UNION_TYPE + || TREE_CODE (type) == QUAL_UNION_TYPE) + && TYPE_FIELDS (type)) + { + if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64) + return 64; + if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128) + return 128; + } + else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE + || TREE_CODE (type) == INTEGER_TYPE) + { + if (TYPE_MODE (type) == DFmode && align < 64) + return 64; + if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128) + return 128; + } + + return align; +} + +/* Compute the alignment for a local variable or a stack slot. EXP is + the data type or decl itself, MODE is the widest mode available and + ALIGN is the alignment that the object would ordinarily have. The + value of this macro is used instead of that alignment to align the + object. */ + +unsigned int +ix86_local_alignment (tree exp, enum machine_mode mode, + unsigned int align) +{ + tree type, decl; + + if (exp && DECL_P (exp)) + { + type = TREE_TYPE (exp); + decl = exp; + } + else + { + type = exp; + decl = NULL; + } + + /* Don't do dynamic stack realignment for long long objects with + -mpreferred-stack-boundary=2. */ + if (!TARGET_64BIT + && align == 64 + && ix86_preferred_stack_boundary < 64 + && (mode == DImode || (type && TYPE_MODE (type) == DImode)) + && (!type || !TYPE_USER_ALIGN (type)) + && (!decl || !DECL_USER_ALIGN (decl))) + align = 32; + + /* If TYPE is NULL, we are allocating a stack slot for caller-save + register in MODE. We will return the largest alignment of XF + and DF. */ + if (!type) + { + if (mode == XFmode && align < GET_MODE_ALIGNMENT (DFmode)) + align = GET_MODE_ALIGNMENT (DFmode); + return align; + } + + /* x86-64 ABI requires arrays greater than 16 bytes to be aligned + to 16byte boundary. Exact wording is: + + An array uses the same alignment as its elements, except that a local or + global array variable of length at least 16 bytes or + a C99 variable-length array variable always has alignment of at least 16 bytes. + + This was added to allow use of aligned SSE instructions at arrays. This + rule is meant for static storage (where compiler can not do the analysis + by itself). We follow it for automatic variables only when convenient. + We fully control everything in the function compiled and functions from + other unit can not rely on the alignment. + + Exclude va_list type. It is the common case of local array where + we can not benefit from the alignment. 
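The ABI rule quoted above amounts to: keep the element alignment, but bump any array or other aggregate of 16 bytes or more to at least 16-byte alignment so aligned SSE accesses become possible. A minimal sketch of that bump, with alignments in bits as in the functions above:

#include <stdio.h>

/* Illustrative version of the x86-64 size-based bump applied above:
   aggregates of at least 16 bytes get at least 128-bit alignment;
   smaller objects keep the alignment they already had.  */
static int
bump_alignment (unsigned long size_in_bytes, int align_in_bits)
{
  if (size_in_bytes >= 16 && align_in_bits < 128)
    return 128;
  return align_in_bits;
}

int
main (void)
{
  printf ("char[8]:   %d bits\n", bump_alignment (8, 8));    /* stays 8 */
  printf ("char[16]:  %d bits\n", bump_alignment (16, 8));   /* bumped to 128 */
  printf ("double[4]: %d bits\n", bump_alignment (32, 64));  /* bumped to 128 */
  return 0;
}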
*/ + if (TARGET_64BIT && optimize_function_for_speed_p (cfun) + && TARGET_SSE) + { + if (AGGREGATE_TYPE_P (type) + && (va_list_type_node == NULL_TREE + || (TYPE_MAIN_VARIANT (type) + != TYPE_MAIN_VARIANT (va_list_type_node))) + && TYPE_SIZE (type) + && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST + && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 16 + || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128) + return 128; + } + if (TREE_CODE (type) == ARRAY_TYPE) + { + if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64) + return 64; + if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128) + return 128; + } + else if (TREE_CODE (type) == COMPLEX_TYPE) + { + if (TYPE_MODE (type) == DCmode && align < 64) + return 64; + if ((TYPE_MODE (type) == XCmode + || TYPE_MODE (type) == TCmode) && align < 128) + return 128; + } + else if ((TREE_CODE (type) == RECORD_TYPE + || TREE_CODE (type) == UNION_TYPE + || TREE_CODE (type) == QUAL_UNION_TYPE) + && TYPE_FIELDS (type)) + { + if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64) + return 64; + if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128) + return 128; + } + else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE + || TREE_CODE (type) == INTEGER_TYPE) + { + + if (TYPE_MODE (type) == DFmode && align < 64) + return 64; + if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128) + return 128; + } + return align; +} + +/* Compute the minimum required alignment for dynamic stack realignment + purposes for a local variable, parameter or a stack slot. EXP is + the data type or decl itself, MODE is its mode and ALIGN is the + alignment that the object would ordinarily have. */ + +unsigned int +ix86_minimum_alignment (tree exp, enum machine_mode mode, + unsigned int align) +{ + tree type, decl; + + if (exp && DECL_P (exp)) + { + type = TREE_TYPE (exp); + decl = exp; + } + else + { + type = exp; + decl = NULL; + } + + if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64) + return align; + + /* Don't do dynamic stack realignment for long long objects with + -mpreferred-stack-boundary=2. */ + if ((mode == DImode || (type && TYPE_MODE (type) == DImode)) + && (!type || !TYPE_USER_ALIGN (type)) + && (!decl || !DECL_USER_ALIGN (decl))) + return 32; + + return align; +} + +/* Find a location for the static chain incoming to a nested function. + This is a register, unless all free registers are used by arguments. */ + +static rtx +ix86_static_chain (const_tree fndecl, bool incoming_p) +{ + unsigned regno; + + if (!DECL_STATIC_CHAIN (fndecl)) + return NULL; + + if (TARGET_64BIT) + { + /* We always use R10 in 64-bit mode. */ + regno = R10_REG; + } + else + { + tree fntype; + /* By default in 32-bit mode we use ECX to pass the static chain. */ + regno = CX_REG; + + fntype = TREE_TYPE (fndecl); + if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype))) + { + /* Fastcall functions use ecx/edx for arguments, which leaves + us with EAX for the static chain. */ + regno = AX_REG; + } + else if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (fntype))) + { + /* Thiscall functions use ecx for arguments, which leaves + us with EAX for the static chain. */ + regno = AX_REG; + } + else if (ix86_function_regparm (fntype, fndecl) == 3) + { + /* For regparm 3, we have no free call-clobbered registers in + which to store the static chain. In order to implement this, + we have the trampoline push the static chain to the stack. 
+ However, we can't push a value below the return address when + we call the nested function directly, so we have to use an + alternate entry point. For this we use ESI, and have the + alternate entry point push ESI, so that things appear the + same once we're executing the nested function. */ + if (incoming_p) + { + if (fndecl == current_function_decl) + ix86_static_chain_on_stack = true; + return gen_frame_mem (SImode, + plus_constant (arg_pointer_rtx, -8)); + } + regno = SI_REG; + } + } + + return gen_rtx_REG (Pmode, regno); +} + +/* Emit RTL insns to initialize the variable parts of a trampoline. + FNDECL is the decl of the target address; M_TRAMP is a MEM for + the trampoline, and CHAIN_VALUE is an RTX for the static chain + to be passed to the target function. */ + +static void +ix86_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value) +{ + rtx mem, fnaddr; + + fnaddr = XEXP (DECL_RTL (fndecl), 0); + + if (!TARGET_64BIT) + { + rtx disp, chain; + int opcode; + + /* Depending on the static chain location, either load a register + with a constant, or push the constant to the stack. All of the + instructions are the same size. */ + chain = ix86_static_chain (fndecl, true); + if (REG_P (chain)) + { + if (REGNO (chain) == CX_REG) + opcode = 0xb9; + else if (REGNO (chain) == AX_REG) + opcode = 0xb8; + else + gcc_unreachable (); + } + else + opcode = 0x68; + + mem = adjust_address (m_tramp, QImode, 0); + emit_move_insn (mem, gen_int_mode (opcode, QImode)); + + mem = adjust_address (m_tramp, SImode, 1); + emit_move_insn (mem, chain_value); + + /* Compute offset from the end of the jmp to the target function. + In the case in which the trampoline stores the static chain on + the stack, we need to skip the first insn which pushes the + (call-saved) register static chain; this push is 1 byte. */ + disp = expand_binop (SImode, sub_optab, fnaddr, + plus_constant (XEXP (m_tramp, 0), + MEM_P (chain) ? 9 : 10), + NULL_RTX, 1, OPTAB_DIRECT); + + mem = adjust_address (m_tramp, QImode, 5); + emit_move_insn (mem, gen_int_mode (0xe9, QImode)); + + mem = adjust_address (m_tramp, SImode, 6); + emit_move_insn (mem, disp); + } + else + { + int offset = 0; + + /* Load the function address to r11. Try to load address using + the shorter movl instead of movabs. We may want to support + movq for kernel mode, but kernel does not use trampolines at + the moment. */ + if (x86_64_zext_immediate_operand (fnaddr, VOIDmode)) + { + fnaddr = copy_to_mode_reg (DImode, fnaddr); + + mem = adjust_address (m_tramp, HImode, offset); + emit_move_insn (mem, gen_int_mode (0xbb41, HImode)); + + mem = adjust_address (m_tramp, SImode, offset + 2); + emit_move_insn (mem, gen_lowpart (SImode, fnaddr)); + offset += 6; + } + else + { + mem = adjust_address (m_tramp, HImode, offset); + emit_move_insn (mem, gen_int_mode (0xbb49, HImode)); + + mem = adjust_address (m_tramp, DImode, offset + 2); + emit_move_insn (mem, fnaddr); + offset += 10; + } + + /* Load static chain using movabs to r10. */ + mem = adjust_address (m_tramp, HImode, offset); + emit_move_insn (mem, gen_int_mode (0xba49, HImode)); + + mem = adjust_address (m_tramp, DImode, offset + 2); + emit_move_insn (mem, chain_value); + offset += 10; + + /* Jump to r11; the last (unused) byte is a nop, only there to + pad the write out to a single 32-bit store. 
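For the 64-bit movabs path built above (and finished by the rex.W jmp / nop store just below), the 24-byte trampoline is laid out as follows. The byte values come from the constants in the code; the helper itself is only an illustration and assumes a little-endian host, as on x86:

#include <stdint.h>
#include <string.h>
#include <stdio.h>

/* Byte image of the 64-bit trampoline when the target address does not fit
   a zero-extended 32-bit immediate:
     49 bb <8-byte target>   movabs $target, %r11
     49 ba <8-byte chain>    movabs $chain,  %r10
     49 ff e3                rex.W jmp *%r11
     90                      nop (pads the final store to 4 bytes)  */
static void
fill_trampoline (uint8_t tramp[24], uint64_t target, uint64_t chain)
{
  static const uint8_t movabs_r11[2]  = { 0x49, 0xbb };
  static const uint8_t movabs_r10[2]  = { 0x49, 0xba };
  static const uint8_t jmp_r11_nop[4] = { 0x49, 0xff, 0xe3, 0x90 };

  memcpy (tramp +  0, movabs_r11, 2);
  memcpy (tramp +  2, &target, 8);        /* little-endian immediate */
  memcpy (tramp + 10, movabs_r10, 2);
  memcpy (tramp + 12, &chain, 8);
  memcpy (tramp + 20, jmp_r11_nop, 4);
}

int
main (void)
{
  uint8_t buf[24];
  fill_trampoline (buf, 0x1122334455667788ull, 0xaabbccddeeff0011ull);
  for (int i = 0; i < 24; i++)
    printf ("%02x%c", buf[i], i == 23 ? '\n' : ' ');
  return 0;
}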
*/ + mem = adjust_address (m_tramp, SImode, offset); + emit_move_insn (mem, gen_int_mode (0x90e3ff49, SImode)); + offset += 4; + + gcc_assert (offset <= TRAMPOLINE_SIZE); + } + +#ifdef ENABLE_EXECUTE_STACK +#ifdef CHECK_EXECUTE_STACK_ENABLED + if (CHECK_EXECUTE_STACK_ENABLED) +#endif + emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__enable_execute_stack"), + LCT_NORMAL, VOIDmode, 1, XEXP (m_tramp, 0), Pmode); +#endif +} + +/* The following file contains several enumerations and data structures + built from the definitions in i386-builtin-types.def. */ + +#include "i386-builtin-types.inc" + +/* Table for the ix86 builtin non-function types. */ +static GTY(()) tree ix86_builtin_type_tab[(int) IX86_BT_LAST_CPTR + 1]; + +/* Retrieve an element from the above table, building some of + the types lazily. */ + +static tree +ix86_get_builtin_type (enum ix86_builtin_type tcode) +{ + unsigned int index; + tree type, itype; + + gcc_assert ((unsigned)tcode < ARRAY_SIZE(ix86_builtin_type_tab)); + + type = ix86_builtin_type_tab[(int) tcode]; + if (type != NULL) + return type; + + gcc_assert (tcode > IX86_BT_LAST_PRIM); + if (tcode <= IX86_BT_LAST_VECT) + { + enum machine_mode mode; + + index = tcode - IX86_BT_LAST_PRIM - 1; + itype = ix86_get_builtin_type (ix86_builtin_type_vect_base[index]); + mode = ix86_builtin_type_vect_mode[index]; + + type = build_vector_type_for_mode (itype, mode); + } + else + { + int quals; + + index = tcode - IX86_BT_LAST_VECT - 1; + if (tcode <= IX86_BT_LAST_PTR) + quals = TYPE_UNQUALIFIED; + else + quals = TYPE_QUAL_CONST; + + itype = ix86_get_builtin_type (ix86_builtin_type_ptr_base[index]); + if (quals != TYPE_UNQUALIFIED) + itype = build_qualified_type (itype, quals); + + type = build_pointer_type (itype); + } + + ix86_builtin_type_tab[(int) tcode] = type; + return type; +} + +/* Table for the ix86 builtin function types. */ +static GTY(()) tree ix86_builtin_func_type_tab[(int) IX86_BT_LAST_ALIAS + 1]; + +/* Retrieve an element from the above table, building some of + the types lazily. */ + +static tree +ix86_get_builtin_func_type (enum ix86_builtin_func_type tcode) +{ + tree type; + + gcc_assert ((unsigned)tcode < ARRAY_SIZE (ix86_builtin_func_type_tab)); + + type = ix86_builtin_func_type_tab[(int) tcode]; + if (type != NULL) + return type; + + if (tcode <= IX86_BT_LAST_FUNC) + { + unsigned start = ix86_builtin_func_start[(int) tcode]; + unsigned after = ix86_builtin_func_start[(int) tcode + 1]; + tree rtype, atype, args = void_list_node; + unsigned i; + + rtype = ix86_get_builtin_type (ix86_builtin_func_args[start]); + for (i = after - 1; i > start; --i) + { + atype = ix86_get_builtin_type (ix86_builtin_func_args[i]); + args = tree_cons (NULL, atype, args); + } + + type = build_function_type (rtype, args); + } + else + { + unsigned index = tcode - IX86_BT_LAST_FUNC - 1; + enum ix86_builtin_func_type icode; + + icode = ix86_builtin_func_alias_base[index]; + type = ix86_get_builtin_func_type (icode); + } + + ix86_builtin_func_type_tab[(int) tcode] = type; + return type; +} + + +/* Codes for all the SSE/MMX builtins. 
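Both lookup routines above follow the same memoization pattern: a GC-rooted table indexed by the enum, filled in on first use and returned unchanged afterwards. A generic sketch of that pattern; the payload type and the table contents are placeholders, not the real tree machinery:

#include <stdio.h>

enum demo_type { DT_INT, DT_INT_PTR, DT_LAST };

struct type_info { const char *name; };

/* Lazily-built table, analogous to ix86_builtin_type_tab: entries start
   out NULL and are constructed the first time they are requested.  */
static struct type_info *demo_type_tab[DT_LAST];

static struct type_info *
get_demo_type (enum demo_type tcode)
{
  static struct type_info types[DT_LAST] = { { "int" }, { "int *" } };

  if (demo_type_tab[tcode] != NULL)
    return demo_type_tab[tcode];           /* already built: reuse it */

  demo_type_tab[tcode] = &types[tcode];    /* "build" and memoize */
  return demo_type_tab[tcode];
}

int
main (void)
{
  printf ("%s\n", get_demo_type (DT_INT_PTR)->name);
  printf ("%s\n", get_demo_type (DT_INT_PTR)->name);  /* second call hits the cache */
  return 0;
}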
*/ +enum ix86_builtins +{ + IX86_BUILTIN_ADDPS, + IX86_BUILTIN_ADDSS, + IX86_BUILTIN_DIVPS, + IX86_BUILTIN_DIVSS, + IX86_BUILTIN_MULPS, + IX86_BUILTIN_MULSS, + IX86_BUILTIN_SUBPS, + IX86_BUILTIN_SUBSS, + + IX86_BUILTIN_CMPEQPS, + IX86_BUILTIN_CMPLTPS, + IX86_BUILTIN_CMPLEPS, + IX86_BUILTIN_CMPGTPS, + IX86_BUILTIN_CMPGEPS, + IX86_BUILTIN_CMPNEQPS, + IX86_BUILTIN_CMPNLTPS, + IX86_BUILTIN_CMPNLEPS, + IX86_BUILTIN_CMPNGTPS, + IX86_BUILTIN_CMPNGEPS, + IX86_BUILTIN_CMPORDPS, + IX86_BUILTIN_CMPUNORDPS, + IX86_BUILTIN_CMPEQSS, + IX86_BUILTIN_CMPLTSS, + IX86_BUILTIN_CMPLESS, + IX86_BUILTIN_CMPNEQSS, + IX86_BUILTIN_CMPNLTSS, + IX86_BUILTIN_CMPNLESS, + IX86_BUILTIN_CMPNGTSS, + IX86_BUILTIN_CMPNGESS, + IX86_BUILTIN_CMPORDSS, + IX86_BUILTIN_CMPUNORDSS, + + IX86_BUILTIN_COMIEQSS, + IX86_BUILTIN_COMILTSS, + IX86_BUILTIN_COMILESS, + IX86_BUILTIN_COMIGTSS, + IX86_BUILTIN_COMIGESS, + IX86_BUILTIN_COMINEQSS, + IX86_BUILTIN_UCOMIEQSS, + IX86_BUILTIN_UCOMILTSS, + IX86_BUILTIN_UCOMILESS, + IX86_BUILTIN_UCOMIGTSS, + IX86_BUILTIN_UCOMIGESS, + IX86_BUILTIN_UCOMINEQSS, + + IX86_BUILTIN_CVTPI2PS, + IX86_BUILTIN_CVTPS2PI, + IX86_BUILTIN_CVTSI2SS, + IX86_BUILTIN_CVTSI642SS, + IX86_BUILTIN_CVTSS2SI, + IX86_BUILTIN_CVTSS2SI64, + IX86_BUILTIN_CVTTPS2PI, + IX86_BUILTIN_CVTTSS2SI, + IX86_BUILTIN_CVTTSS2SI64, + + IX86_BUILTIN_MAXPS, + IX86_BUILTIN_MAXSS, + IX86_BUILTIN_MINPS, + IX86_BUILTIN_MINSS, + + IX86_BUILTIN_LOADUPS, + IX86_BUILTIN_STOREUPS, + IX86_BUILTIN_MOVSS, + + IX86_BUILTIN_MOVHLPS, + IX86_BUILTIN_MOVLHPS, + IX86_BUILTIN_LOADHPS, + IX86_BUILTIN_LOADLPS, + IX86_BUILTIN_STOREHPS, + IX86_BUILTIN_STORELPS, + + IX86_BUILTIN_MASKMOVQ, + IX86_BUILTIN_MOVMSKPS, + IX86_BUILTIN_PMOVMSKB, + + IX86_BUILTIN_MOVNTPS, + IX86_BUILTIN_MOVNTQ, + + IX86_BUILTIN_LOADDQU, + IX86_BUILTIN_STOREDQU, + + IX86_BUILTIN_PACKSSWB, + IX86_BUILTIN_PACKSSDW, + IX86_BUILTIN_PACKUSWB, + + IX86_BUILTIN_PADDB, + IX86_BUILTIN_PADDW, + IX86_BUILTIN_PADDD, + IX86_BUILTIN_PADDQ, + IX86_BUILTIN_PADDSB, + IX86_BUILTIN_PADDSW, + IX86_BUILTIN_PADDUSB, + IX86_BUILTIN_PADDUSW, + IX86_BUILTIN_PSUBB, + IX86_BUILTIN_PSUBW, + IX86_BUILTIN_PSUBD, + IX86_BUILTIN_PSUBQ, + IX86_BUILTIN_PSUBSB, + IX86_BUILTIN_PSUBSW, + IX86_BUILTIN_PSUBUSB, + IX86_BUILTIN_PSUBUSW, + + IX86_BUILTIN_PAND, + IX86_BUILTIN_PANDN, + IX86_BUILTIN_POR, + IX86_BUILTIN_PXOR, + + IX86_BUILTIN_PAVGB, + IX86_BUILTIN_PAVGW, + + IX86_BUILTIN_PCMPEQB, + IX86_BUILTIN_PCMPEQW, + IX86_BUILTIN_PCMPEQD, + IX86_BUILTIN_PCMPGTB, + IX86_BUILTIN_PCMPGTW, + IX86_BUILTIN_PCMPGTD, + + IX86_BUILTIN_PMADDWD, + + IX86_BUILTIN_PMAXSW, + IX86_BUILTIN_PMAXUB, + IX86_BUILTIN_PMINSW, + IX86_BUILTIN_PMINUB, + + IX86_BUILTIN_PMULHUW, + IX86_BUILTIN_PMULHW, + IX86_BUILTIN_PMULLW, + + IX86_BUILTIN_PSADBW, + IX86_BUILTIN_PSHUFW, + + IX86_BUILTIN_PSLLW, + IX86_BUILTIN_PSLLD, + IX86_BUILTIN_PSLLQ, + IX86_BUILTIN_PSRAW, + IX86_BUILTIN_PSRAD, + IX86_BUILTIN_PSRLW, + IX86_BUILTIN_PSRLD, + IX86_BUILTIN_PSRLQ, + IX86_BUILTIN_PSLLWI, + IX86_BUILTIN_PSLLDI, + IX86_BUILTIN_PSLLQI, + IX86_BUILTIN_PSRAWI, + IX86_BUILTIN_PSRADI, + IX86_BUILTIN_PSRLWI, + IX86_BUILTIN_PSRLDI, + IX86_BUILTIN_PSRLQI, + + IX86_BUILTIN_PUNPCKHBW, + IX86_BUILTIN_PUNPCKHWD, + IX86_BUILTIN_PUNPCKHDQ, + IX86_BUILTIN_PUNPCKLBW, + IX86_BUILTIN_PUNPCKLWD, + IX86_BUILTIN_PUNPCKLDQ, + + IX86_BUILTIN_SHUFPS, + + IX86_BUILTIN_RCPPS, + IX86_BUILTIN_RCPSS, + IX86_BUILTIN_RSQRTPS, + IX86_BUILTIN_RSQRTPS_NR, + IX86_BUILTIN_RSQRTSS, + IX86_BUILTIN_RSQRTF, + IX86_BUILTIN_SQRTPS, + IX86_BUILTIN_SQRTPS_NR, + IX86_BUILTIN_SQRTSS, + + IX86_BUILTIN_UNPCKHPS, + 
IX86_BUILTIN_UNPCKLPS, + + IX86_BUILTIN_ANDPS, + IX86_BUILTIN_ANDNPS, + IX86_BUILTIN_ORPS, + IX86_BUILTIN_XORPS, + + IX86_BUILTIN_EMMS, + IX86_BUILTIN_LDMXCSR, + IX86_BUILTIN_STMXCSR, + IX86_BUILTIN_SFENCE, + + /* 3DNow! Original */ + IX86_BUILTIN_FEMMS, + IX86_BUILTIN_PAVGUSB, + IX86_BUILTIN_PF2ID, + IX86_BUILTIN_PFACC, + IX86_BUILTIN_PFADD, + IX86_BUILTIN_PFCMPEQ, + IX86_BUILTIN_PFCMPGE, + IX86_BUILTIN_PFCMPGT, + IX86_BUILTIN_PFMAX, + IX86_BUILTIN_PFMIN, + IX86_BUILTIN_PFMUL, + IX86_BUILTIN_PFRCP, + IX86_BUILTIN_PFRCPIT1, + IX86_BUILTIN_PFRCPIT2, + IX86_BUILTIN_PFRSQIT1, + IX86_BUILTIN_PFRSQRT, + IX86_BUILTIN_PFSUB, + IX86_BUILTIN_PFSUBR, + IX86_BUILTIN_PI2FD, + IX86_BUILTIN_PMULHRW, + + /* 3DNow! Athlon Extensions */ + IX86_BUILTIN_PF2IW, + IX86_BUILTIN_PFNACC, + IX86_BUILTIN_PFPNACC, + IX86_BUILTIN_PI2FW, + IX86_BUILTIN_PSWAPDSI, + IX86_BUILTIN_PSWAPDSF, + + /* SSE2 */ + IX86_BUILTIN_ADDPD, + IX86_BUILTIN_ADDSD, + IX86_BUILTIN_DIVPD, + IX86_BUILTIN_DIVSD, + IX86_BUILTIN_MULPD, + IX86_BUILTIN_MULSD, + IX86_BUILTIN_SUBPD, + IX86_BUILTIN_SUBSD, + + IX86_BUILTIN_CMPEQPD, + IX86_BUILTIN_CMPLTPD, + IX86_BUILTIN_CMPLEPD, + IX86_BUILTIN_CMPGTPD, + IX86_BUILTIN_CMPGEPD, + IX86_BUILTIN_CMPNEQPD, + IX86_BUILTIN_CMPNLTPD, + IX86_BUILTIN_CMPNLEPD, + IX86_BUILTIN_CMPNGTPD, + IX86_BUILTIN_CMPNGEPD, + IX86_BUILTIN_CMPORDPD, + IX86_BUILTIN_CMPUNORDPD, + IX86_BUILTIN_CMPEQSD, + IX86_BUILTIN_CMPLTSD, + IX86_BUILTIN_CMPLESD, + IX86_BUILTIN_CMPNEQSD, + IX86_BUILTIN_CMPNLTSD, + IX86_BUILTIN_CMPNLESD, + IX86_BUILTIN_CMPORDSD, + IX86_BUILTIN_CMPUNORDSD, + + IX86_BUILTIN_COMIEQSD, + IX86_BUILTIN_COMILTSD, + IX86_BUILTIN_COMILESD, + IX86_BUILTIN_COMIGTSD, + IX86_BUILTIN_COMIGESD, + IX86_BUILTIN_COMINEQSD, + IX86_BUILTIN_UCOMIEQSD, + IX86_BUILTIN_UCOMILTSD, + IX86_BUILTIN_UCOMILESD, + IX86_BUILTIN_UCOMIGTSD, + IX86_BUILTIN_UCOMIGESD, + IX86_BUILTIN_UCOMINEQSD, + + IX86_BUILTIN_MAXPD, + IX86_BUILTIN_MAXSD, + IX86_BUILTIN_MINPD, + IX86_BUILTIN_MINSD, + + IX86_BUILTIN_ANDPD, + IX86_BUILTIN_ANDNPD, + IX86_BUILTIN_ORPD, + IX86_BUILTIN_XORPD, + + IX86_BUILTIN_SQRTPD, + IX86_BUILTIN_SQRTSD, + + IX86_BUILTIN_UNPCKHPD, + IX86_BUILTIN_UNPCKLPD, + + IX86_BUILTIN_SHUFPD, + + IX86_BUILTIN_LOADUPD, + IX86_BUILTIN_STOREUPD, + IX86_BUILTIN_MOVSD, + + IX86_BUILTIN_LOADHPD, + IX86_BUILTIN_LOADLPD, + + IX86_BUILTIN_CVTDQ2PD, + IX86_BUILTIN_CVTDQ2PS, + + IX86_BUILTIN_CVTPD2DQ, + IX86_BUILTIN_CVTPD2PI, + IX86_BUILTIN_CVTPD2PS, + IX86_BUILTIN_CVTTPD2DQ, + IX86_BUILTIN_CVTTPD2PI, + + IX86_BUILTIN_CVTPI2PD, + IX86_BUILTIN_CVTSI2SD, + IX86_BUILTIN_CVTSI642SD, + + IX86_BUILTIN_CVTSD2SI, + IX86_BUILTIN_CVTSD2SI64, + IX86_BUILTIN_CVTSD2SS, + IX86_BUILTIN_CVTSS2SD, + IX86_BUILTIN_CVTTSD2SI, + IX86_BUILTIN_CVTTSD2SI64, + + IX86_BUILTIN_CVTPS2DQ, + IX86_BUILTIN_CVTPS2PD, + IX86_BUILTIN_CVTTPS2DQ, + + IX86_BUILTIN_MOVNTI, + IX86_BUILTIN_MOVNTPD, + IX86_BUILTIN_MOVNTDQ, + + IX86_BUILTIN_MOVQ128, + + /* SSE2 MMX */ + IX86_BUILTIN_MASKMOVDQU, + IX86_BUILTIN_MOVMSKPD, + IX86_BUILTIN_PMOVMSKB128, + + IX86_BUILTIN_PACKSSWB128, + IX86_BUILTIN_PACKSSDW128, + IX86_BUILTIN_PACKUSWB128, + + IX86_BUILTIN_PADDB128, + IX86_BUILTIN_PADDW128, + IX86_BUILTIN_PADDD128, + IX86_BUILTIN_PADDQ128, + IX86_BUILTIN_PADDSB128, + IX86_BUILTIN_PADDSW128, + IX86_BUILTIN_PADDUSB128, + IX86_BUILTIN_PADDUSW128, + IX86_BUILTIN_PSUBB128, + IX86_BUILTIN_PSUBW128, + IX86_BUILTIN_PSUBD128, + IX86_BUILTIN_PSUBQ128, + IX86_BUILTIN_PSUBSB128, + IX86_BUILTIN_PSUBSW128, + IX86_BUILTIN_PSUBUSB128, + IX86_BUILTIN_PSUBUSW128, + + IX86_BUILTIN_PAND128, + IX86_BUILTIN_PANDN128, + 
IX86_BUILTIN_POR128, + IX86_BUILTIN_PXOR128, + + IX86_BUILTIN_PAVGB128, + IX86_BUILTIN_PAVGW128, + + IX86_BUILTIN_PCMPEQB128, + IX86_BUILTIN_PCMPEQW128, + IX86_BUILTIN_PCMPEQD128, + IX86_BUILTIN_PCMPGTB128, + IX86_BUILTIN_PCMPGTW128, + IX86_BUILTIN_PCMPGTD128, + + IX86_BUILTIN_PMADDWD128, + + IX86_BUILTIN_PMAXSW128, + IX86_BUILTIN_PMAXUB128, + IX86_BUILTIN_PMINSW128, + IX86_BUILTIN_PMINUB128, + + IX86_BUILTIN_PMULUDQ, + IX86_BUILTIN_PMULUDQ128, + IX86_BUILTIN_PMULHUW128, + IX86_BUILTIN_PMULHW128, + IX86_BUILTIN_PMULLW128, + + IX86_BUILTIN_PSADBW128, + IX86_BUILTIN_PSHUFHW, + IX86_BUILTIN_PSHUFLW, + IX86_BUILTIN_PSHUFD, + + IX86_BUILTIN_PSLLDQI128, + IX86_BUILTIN_PSLLWI128, + IX86_BUILTIN_PSLLDI128, + IX86_BUILTIN_PSLLQI128, + IX86_BUILTIN_PSRAWI128, + IX86_BUILTIN_PSRADI128, + IX86_BUILTIN_PSRLDQI128, + IX86_BUILTIN_PSRLWI128, + IX86_BUILTIN_PSRLDI128, + IX86_BUILTIN_PSRLQI128, + + IX86_BUILTIN_PSLLDQ128, + IX86_BUILTIN_PSLLW128, + IX86_BUILTIN_PSLLD128, + IX86_BUILTIN_PSLLQ128, + IX86_BUILTIN_PSRAW128, + IX86_BUILTIN_PSRAD128, + IX86_BUILTIN_PSRLW128, + IX86_BUILTIN_PSRLD128, + IX86_BUILTIN_PSRLQ128, + + IX86_BUILTIN_PUNPCKHBW128, + IX86_BUILTIN_PUNPCKHWD128, + IX86_BUILTIN_PUNPCKHDQ128, + IX86_BUILTIN_PUNPCKHQDQ128, + IX86_BUILTIN_PUNPCKLBW128, + IX86_BUILTIN_PUNPCKLWD128, + IX86_BUILTIN_PUNPCKLDQ128, + IX86_BUILTIN_PUNPCKLQDQ128, + + IX86_BUILTIN_CLFLUSH, + IX86_BUILTIN_MFENCE, + IX86_BUILTIN_LFENCE, + + IX86_BUILTIN_BSRSI, + IX86_BUILTIN_BSRDI, + IX86_BUILTIN_RDPMC, + IX86_BUILTIN_RDTSC, + IX86_BUILTIN_RDTSCP, + IX86_BUILTIN_ROLQI, + IX86_BUILTIN_ROLHI, + IX86_BUILTIN_RORQI, + IX86_BUILTIN_RORHI, + + /* SSE3. */ + IX86_BUILTIN_ADDSUBPS, + IX86_BUILTIN_HADDPS, + IX86_BUILTIN_HSUBPS, + IX86_BUILTIN_MOVSHDUP, + IX86_BUILTIN_MOVSLDUP, + IX86_BUILTIN_ADDSUBPD, + IX86_BUILTIN_HADDPD, + IX86_BUILTIN_HSUBPD, + IX86_BUILTIN_LDDQU, + + IX86_BUILTIN_MONITOR, + IX86_BUILTIN_MWAIT, + + /* SSSE3. */ + IX86_BUILTIN_PHADDW, + IX86_BUILTIN_PHADDD, + IX86_BUILTIN_PHADDSW, + IX86_BUILTIN_PHSUBW, + IX86_BUILTIN_PHSUBD, + IX86_BUILTIN_PHSUBSW, + IX86_BUILTIN_PMADDUBSW, + IX86_BUILTIN_PMULHRSW, + IX86_BUILTIN_PSHUFB, + IX86_BUILTIN_PSIGNB, + IX86_BUILTIN_PSIGNW, + IX86_BUILTIN_PSIGND, + IX86_BUILTIN_PALIGNR, + IX86_BUILTIN_PABSB, + IX86_BUILTIN_PABSW, + IX86_BUILTIN_PABSD, + + IX86_BUILTIN_PHADDW128, + IX86_BUILTIN_PHADDD128, + IX86_BUILTIN_PHADDSW128, + IX86_BUILTIN_PHSUBW128, + IX86_BUILTIN_PHSUBD128, + IX86_BUILTIN_PHSUBSW128, + IX86_BUILTIN_PMADDUBSW128, + IX86_BUILTIN_PMULHRSW128, + IX86_BUILTIN_PSHUFB128, + IX86_BUILTIN_PSIGNB128, + IX86_BUILTIN_PSIGNW128, + IX86_BUILTIN_PSIGND128, + IX86_BUILTIN_PALIGNR128, + IX86_BUILTIN_PABSB128, + IX86_BUILTIN_PABSW128, + IX86_BUILTIN_PABSD128, + + /* AMDFAM10 - SSE4A New Instructions. */ + IX86_BUILTIN_MOVNTSD, + IX86_BUILTIN_MOVNTSS, + IX86_BUILTIN_EXTRQI, + IX86_BUILTIN_EXTRQ, + IX86_BUILTIN_INSERTQI, + IX86_BUILTIN_INSERTQ, + + /* SSE4.1. 
*/ + IX86_BUILTIN_BLENDPD, + IX86_BUILTIN_BLENDPS, + IX86_BUILTIN_BLENDVPD, + IX86_BUILTIN_BLENDVPS, + IX86_BUILTIN_PBLENDVB128, + IX86_BUILTIN_PBLENDW128, + + IX86_BUILTIN_DPPD, + IX86_BUILTIN_DPPS, + + IX86_BUILTIN_INSERTPS128, + + IX86_BUILTIN_MOVNTDQA, + IX86_BUILTIN_MPSADBW128, + IX86_BUILTIN_PACKUSDW128, + IX86_BUILTIN_PCMPEQQ, + IX86_BUILTIN_PHMINPOSUW128, + + IX86_BUILTIN_PMAXSB128, + IX86_BUILTIN_PMAXSD128, + IX86_BUILTIN_PMAXUD128, + IX86_BUILTIN_PMAXUW128, + + IX86_BUILTIN_PMINSB128, + IX86_BUILTIN_PMINSD128, + IX86_BUILTIN_PMINUD128, + IX86_BUILTIN_PMINUW128, + + IX86_BUILTIN_PMOVSXBW128, + IX86_BUILTIN_PMOVSXBD128, + IX86_BUILTIN_PMOVSXBQ128, + IX86_BUILTIN_PMOVSXWD128, + IX86_BUILTIN_PMOVSXWQ128, + IX86_BUILTIN_PMOVSXDQ128, + + IX86_BUILTIN_PMOVZXBW128, + IX86_BUILTIN_PMOVZXBD128, + IX86_BUILTIN_PMOVZXBQ128, + IX86_BUILTIN_PMOVZXWD128, + IX86_BUILTIN_PMOVZXWQ128, + IX86_BUILTIN_PMOVZXDQ128, + + IX86_BUILTIN_PMULDQ128, + IX86_BUILTIN_PMULLD128, + + IX86_BUILTIN_ROUNDPD, + IX86_BUILTIN_ROUNDPS, + IX86_BUILTIN_ROUNDSD, + IX86_BUILTIN_ROUNDSS, + + IX86_BUILTIN_PTESTZ, + IX86_BUILTIN_PTESTC, + IX86_BUILTIN_PTESTNZC, + + IX86_BUILTIN_VEC_INIT_V2SI, + IX86_BUILTIN_VEC_INIT_V4HI, + IX86_BUILTIN_VEC_INIT_V8QI, + IX86_BUILTIN_VEC_EXT_V2DF, + IX86_BUILTIN_VEC_EXT_V2DI, + IX86_BUILTIN_VEC_EXT_V4SF, + IX86_BUILTIN_VEC_EXT_V4SI, + IX86_BUILTIN_VEC_EXT_V8HI, + IX86_BUILTIN_VEC_EXT_V2SI, + IX86_BUILTIN_VEC_EXT_V4HI, + IX86_BUILTIN_VEC_EXT_V16QI, + IX86_BUILTIN_VEC_SET_V2DI, + IX86_BUILTIN_VEC_SET_V4SF, + IX86_BUILTIN_VEC_SET_V4SI, + IX86_BUILTIN_VEC_SET_V8HI, + IX86_BUILTIN_VEC_SET_V4HI, + IX86_BUILTIN_VEC_SET_V16QI, + + IX86_BUILTIN_VEC_PACK_SFIX, + + /* SSE4.2. */ + IX86_BUILTIN_CRC32QI, + IX86_BUILTIN_CRC32HI, + IX86_BUILTIN_CRC32SI, + IX86_BUILTIN_CRC32DI, + + IX86_BUILTIN_PCMPESTRI128, + IX86_BUILTIN_PCMPESTRM128, + IX86_BUILTIN_PCMPESTRA128, + IX86_BUILTIN_PCMPESTRC128, + IX86_BUILTIN_PCMPESTRO128, + IX86_BUILTIN_PCMPESTRS128, + IX86_BUILTIN_PCMPESTRZ128, + IX86_BUILTIN_PCMPISTRI128, + IX86_BUILTIN_PCMPISTRM128, + IX86_BUILTIN_PCMPISTRA128, + IX86_BUILTIN_PCMPISTRC128, + IX86_BUILTIN_PCMPISTRO128, + IX86_BUILTIN_PCMPISTRS128, + IX86_BUILTIN_PCMPISTRZ128, + + IX86_BUILTIN_PCMPGTQ, + + /* AES instructions */ + IX86_BUILTIN_AESENC128, + IX86_BUILTIN_AESENCLAST128, + IX86_BUILTIN_AESDEC128, + IX86_BUILTIN_AESDECLAST128, + IX86_BUILTIN_AESIMC128, + IX86_BUILTIN_AESKEYGENASSIST128, + + /* PCLMUL instruction */ + IX86_BUILTIN_PCLMULQDQ128, + + /* AVX */ + IX86_BUILTIN_ADDPD256, + IX86_BUILTIN_ADDPS256, + IX86_BUILTIN_ADDSUBPD256, + IX86_BUILTIN_ADDSUBPS256, + IX86_BUILTIN_ANDPD256, + IX86_BUILTIN_ANDPS256, + IX86_BUILTIN_ANDNPD256, + IX86_BUILTIN_ANDNPS256, + IX86_BUILTIN_BLENDPD256, + IX86_BUILTIN_BLENDPS256, + IX86_BUILTIN_BLENDVPD256, + IX86_BUILTIN_BLENDVPS256, + IX86_BUILTIN_DIVPD256, + IX86_BUILTIN_DIVPS256, + IX86_BUILTIN_DPPS256, + IX86_BUILTIN_HADDPD256, + IX86_BUILTIN_HADDPS256, + IX86_BUILTIN_HSUBPD256, + IX86_BUILTIN_HSUBPS256, + IX86_BUILTIN_MAXPD256, + IX86_BUILTIN_MAXPS256, + IX86_BUILTIN_MINPD256, + IX86_BUILTIN_MINPS256, + IX86_BUILTIN_MULPD256, + IX86_BUILTIN_MULPS256, + IX86_BUILTIN_ORPD256, + IX86_BUILTIN_ORPS256, + IX86_BUILTIN_SHUFPD256, + IX86_BUILTIN_SHUFPS256, + IX86_BUILTIN_SUBPD256, + IX86_BUILTIN_SUBPS256, + IX86_BUILTIN_XORPD256, + IX86_BUILTIN_XORPS256, + IX86_BUILTIN_CMPSD, + IX86_BUILTIN_CMPSS, + IX86_BUILTIN_CMPPD, + IX86_BUILTIN_CMPPS, + IX86_BUILTIN_CMPPD256, + IX86_BUILTIN_CMPPS256, + IX86_BUILTIN_CVTDQ2PD256, + IX86_BUILTIN_CVTDQ2PS256, + 
IX86_BUILTIN_CVTPD2PS256, + IX86_BUILTIN_CVTPS2DQ256, + IX86_BUILTIN_CVTPS2PD256, + IX86_BUILTIN_CVTTPD2DQ256, + IX86_BUILTIN_CVTPD2DQ256, + IX86_BUILTIN_CVTTPS2DQ256, + IX86_BUILTIN_EXTRACTF128PD256, + IX86_BUILTIN_EXTRACTF128PS256, + IX86_BUILTIN_EXTRACTF128SI256, + IX86_BUILTIN_VZEROALL, + IX86_BUILTIN_VZEROUPPER, + IX86_BUILTIN_VPERMILVARPD, + IX86_BUILTIN_VPERMILVARPS, + IX86_BUILTIN_VPERMILVARPD256, + IX86_BUILTIN_VPERMILVARPS256, + IX86_BUILTIN_VPERMILPD, + IX86_BUILTIN_VPERMILPS, + IX86_BUILTIN_VPERMILPD256, + IX86_BUILTIN_VPERMILPS256, + IX86_BUILTIN_VPERMIL2PD, + IX86_BUILTIN_VPERMIL2PS, + IX86_BUILTIN_VPERMIL2PD256, + IX86_BUILTIN_VPERMIL2PS256, + IX86_BUILTIN_VPERM2F128PD256, + IX86_BUILTIN_VPERM2F128PS256, + IX86_BUILTIN_VPERM2F128SI256, + IX86_BUILTIN_VBROADCASTSS, + IX86_BUILTIN_VBROADCASTSD256, + IX86_BUILTIN_VBROADCASTSS256, + IX86_BUILTIN_VBROADCASTPD256, + IX86_BUILTIN_VBROADCASTPS256, + IX86_BUILTIN_VINSERTF128PD256, + IX86_BUILTIN_VINSERTF128PS256, + IX86_BUILTIN_VINSERTF128SI256, + IX86_BUILTIN_LOADUPD256, + IX86_BUILTIN_LOADUPS256, + IX86_BUILTIN_STOREUPD256, + IX86_BUILTIN_STOREUPS256, + IX86_BUILTIN_LDDQU256, + IX86_BUILTIN_MOVNTDQ256, + IX86_BUILTIN_MOVNTPD256, + IX86_BUILTIN_MOVNTPS256, + IX86_BUILTIN_LOADDQU256, + IX86_BUILTIN_STOREDQU256, + IX86_BUILTIN_MASKLOADPD, + IX86_BUILTIN_MASKLOADPS, + IX86_BUILTIN_MASKSTOREPD, + IX86_BUILTIN_MASKSTOREPS, + IX86_BUILTIN_MASKLOADPD256, + IX86_BUILTIN_MASKLOADPS256, + IX86_BUILTIN_MASKSTOREPD256, + IX86_BUILTIN_MASKSTOREPS256, + IX86_BUILTIN_MOVSHDUP256, + IX86_BUILTIN_MOVSLDUP256, + IX86_BUILTIN_MOVDDUP256, + + IX86_BUILTIN_SQRTPD256, + IX86_BUILTIN_SQRTPS256, + IX86_BUILTIN_SQRTPS_NR256, + IX86_BUILTIN_RSQRTPS256, + IX86_BUILTIN_RSQRTPS_NR256, + + IX86_BUILTIN_RCPPS256, + + IX86_BUILTIN_ROUNDPD256, + IX86_BUILTIN_ROUNDPS256, + + IX86_BUILTIN_UNPCKHPD256, + IX86_BUILTIN_UNPCKLPD256, + IX86_BUILTIN_UNPCKHPS256, + IX86_BUILTIN_UNPCKLPS256, + + IX86_BUILTIN_SI256_SI, + IX86_BUILTIN_PS256_PS, + IX86_BUILTIN_PD256_PD, + IX86_BUILTIN_SI_SI256, + IX86_BUILTIN_PS_PS256, + IX86_BUILTIN_PD_PD256, + + IX86_BUILTIN_VTESTZPD, + IX86_BUILTIN_VTESTCPD, + IX86_BUILTIN_VTESTNZCPD, + IX86_BUILTIN_VTESTZPS, + IX86_BUILTIN_VTESTCPS, + IX86_BUILTIN_VTESTNZCPS, + IX86_BUILTIN_VTESTZPD256, + IX86_BUILTIN_VTESTCPD256, + IX86_BUILTIN_VTESTNZCPD256, + IX86_BUILTIN_VTESTZPS256, + IX86_BUILTIN_VTESTCPS256, + IX86_BUILTIN_VTESTNZCPS256, + IX86_BUILTIN_PTESTZ256, + IX86_BUILTIN_PTESTC256, + IX86_BUILTIN_PTESTNZC256, + + IX86_BUILTIN_MOVMSKPD256, + IX86_BUILTIN_MOVMSKPS256, + + /* TFmode support builtins. */ + IX86_BUILTIN_INFQ, + IX86_BUILTIN_HUGE_VALQ, + IX86_BUILTIN_FABSQ, + IX86_BUILTIN_COPYSIGNQ, + + /* Vectorizer support builtins. */ + IX86_BUILTIN_CPYSGNPS, + IX86_BUILTIN_CPYSGNPD, + IX86_BUILTIN_CPYSGNPS256, + IX86_BUILTIN_CPYSGNPD256, + + IX86_BUILTIN_CVTUDQ2PS, + + IX86_BUILTIN_VEC_PERM_V2DF, + IX86_BUILTIN_VEC_PERM_V4SF, + IX86_BUILTIN_VEC_PERM_V2DI, + IX86_BUILTIN_VEC_PERM_V4SI, + IX86_BUILTIN_VEC_PERM_V8HI, + IX86_BUILTIN_VEC_PERM_V16QI, + IX86_BUILTIN_VEC_PERM_V2DI_U, + IX86_BUILTIN_VEC_PERM_V4SI_U, + IX86_BUILTIN_VEC_PERM_V8HI_U, + IX86_BUILTIN_VEC_PERM_V16QI_U, + IX86_BUILTIN_VEC_PERM_V4DF, + IX86_BUILTIN_VEC_PERM_V8SF, + + /* FMA4 and XOP instructions. 
*/ + IX86_BUILTIN_VFMADDSS, + IX86_BUILTIN_VFMADDSD, + IX86_BUILTIN_VFMADDPS, + IX86_BUILTIN_VFMADDPD, + IX86_BUILTIN_VFMADDPS256, + IX86_BUILTIN_VFMADDPD256, + IX86_BUILTIN_VFMADDSUBPS, + IX86_BUILTIN_VFMADDSUBPD, + IX86_BUILTIN_VFMADDSUBPS256, + IX86_BUILTIN_VFMADDSUBPD256, + + IX86_BUILTIN_VPCMOV, + IX86_BUILTIN_VPCMOV_V2DI, + IX86_BUILTIN_VPCMOV_V4SI, + IX86_BUILTIN_VPCMOV_V8HI, + IX86_BUILTIN_VPCMOV_V16QI, + IX86_BUILTIN_VPCMOV_V4SF, + IX86_BUILTIN_VPCMOV_V2DF, + IX86_BUILTIN_VPCMOV256, + IX86_BUILTIN_VPCMOV_V4DI256, + IX86_BUILTIN_VPCMOV_V8SI256, + IX86_BUILTIN_VPCMOV_V16HI256, + IX86_BUILTIN_VPCMOV_V32QI256, + IX86_BUILTIN_VPCMOV_V8SF256, + IX86_BUILTIN_VPCMOV_V4DF256, + + IX86_BUILTIN_VPPERM, + + IX86_BUILTIN_VPMACSSWW, + IX86_BUILTIN_VPMACSWW, + IX86_BUILTIN_VPMACSSWD, + IX86_BUILTIN_VPMACSWD, + IX86_BUILTIN_VPMACSSDD, + IX86_BUILTIN_VPMACSDD, + IX86_BUILTIN_VPMACSSDQL, + IX86_BUILTIN_VPMACSSDQH, + IX86_BUILTIN_VPMACSDQL, + IX86_BUILTIN_VPMACSDQH, + IX86_BUILTIN_VPMADCSSWD, + IX86_BUILTIN_VPMADCSWD, + + IX86_BUILTIN_VPHADDBW, + IX86_BUILTIN_VPHADDBD, + IX86_BUILTIN_VPHADDBQ, + IX86_BUILTIN_VPHADDWD, + IX86_BUILTIN_VPHADDWQ, + IX86_BUILTIN_VPHADDDQ, + IX86_BUILTIN_VPHADDUBW, + IX86_BUILTIN_VPHADDUBD, + IX86_BUILTIN_VPHADDUBQ, + IX86_BUILTIN_VPHADDUWD, + IX86_BUILTIN_VPHADDUWQ, + IX86_BUILTIN_VPHADDUDQ, + IX86_BUILTIN_VPHSUBBW, + IX86_BUILTIN_VPHSUBWD, + IX86_BUILTIN_VPHSUBDQ, + + IX86_BUILTIN_VPROTB, + IX86_BUILTIN_VPROTW, + IX86_BUILTIN_VPROTD, + IX86_BUILTIN_VPROTQ, + IX86_BUILTIN_VPROTB_IMM, + IX86_BUILTIN_VPROTW_IMM, + IX86_BUILTIN_VPROTD_IMM, + IX86_BUILTIN_VPROTQ_IMM, + + IX86_BUILTIN_VPSHLB, + IX86_BUILTIN_VPSHLW, + IX86_BUILTIN_VPSHLD, + IX86_BUILTIN_VPSHLQ, + IX86_BUILTIN_VPSHAB, + IX86_BUILTIN_VPSHAW, + IX86_BUILTIN_VPSHAD, + IX86_BUILTIN_VPSHAQ, + + IX86_BUILTIN_VFRCZSS, + IX86_BUILTIN_VFRCZSD, + IX86_BUILTIN_VFRCZPS, + IX86_BUILTIN_VFRCZPD, + IX86_BUILTIN_VFRCZPS256, + IX86_BUILTIN_VFRCZPD256, + + IX86_BUILTIN_VPCOMEQUB, + IX86_BUILTIN_VPCOMNEUB, + IX86_BUILTIN_VPCOMLTUB, + IX86_BUILTIN_VPCOMLEUB, + IX86_BUILTIN_VPCOMGTUB, + IX86_BUILTIN_VPCOMGEUB, + IX86_BUILTIN_VPCOMFALSEUB, + IX86_BUILTIN_VPCOMTRUEUB, + + IX86_BUILTIN_VPCOMEQUW, + IX86_BUILTIN_VPCOMNEUW, + IX86_BUILTIN_VPCOMLTUW, + IX86_BUILTIN_VPCOMLEUW, + IX86_BUILTIN_VPCOMGTUW, + IX86_BUILTIN_VPCOMGEUW, + IX86_BUILTIN_VPCOMFALSEUW, + IX86_BUILTIN_VPCOMTRUEUW, + + IX86_BUILTIN_VPCOMEQUD, + IX86_BUILTIN_VPCOMNEUD, + IX86_BUILTIN_VPCOMLTUD, + IX86_BUILTIN_VPCOMLEUD, + IX86_BUILTIN_VPCOMGTUD, + IX86_BUILTIN_VPCOMGEUD, + IX86_BUILTIN_VPCOMFALSEUD, + IX86_BUILTIN_VPCOMTRUEUD, + + IX86_BUILTIN_VPCOMEQUQ, + IX86_BUILTIN_VPCOMNEUQ, + IX86_BUILTIN_VPCOMLTUQ, + IX86_BUILTIN_VPCOMLEUQ, + IX86_BUILTIN_VPCOMGTUQ, + IX86_BUILTIN_VPCOMGEUQ, + IX86_BUILTIN_VPCOMFALSEUQ, + IX86_BUILTIN_VPCOMTRUEUQ, + + IX86_BUILTIN_VPCOMEQB, + IX86_BUILTIN_VPCOMNEB, + IX86_BUILTIN_VPCOMLTB, + IX86_BUILTIN_VPCOMLEB, + IX86_BUILTIN_VPCOMGTB, + IX86_BUILTIN_VPCOMGEB, + IX86_BUILTIN_VPCOMFALSEB, + IX86_BUILTIN_VPCOMTRUEB, + + IX86_BUILTIN_VPCOMEQW, + IX86_BUILTIN_VPCOMNEW, + IX86_BUILTIN_VPCOMLTW, + IX86_BUILTIN_VPCOMLEW, + IX86_BUILTIN_VPCOMGTW, + IX86_BUILTIN_VPCOMGEW, + IX86_BUILTIN_VPCOMFALSEW, + IX86_BUILTIN_VPCOMTRUEW, + + IX86_BUILTIN_VPCOMEQD, + IX86_BUILTIN_VPCOMNED, + IX86_BUILTIN_VPCOMLTD, + IX86_BUILTIN_VPCOMLED, + IX86_BUILTIN_VPCOMGTD, + IX86_BUILTIN_VPCOMGED, + IX86_BUILTIN_VPCOMFALSED, + IX86_BUILTIN_VPCOMTRUED, + + IX86_BUILTIN_VPCOMEQQ, + IX86_BUILTIN_VPCOMNEQ, + IX86_BUILTIN_VPCOMLTQ, + IX86_BUILTIN_VPCOMLEQ, + IX86_BUILTIN_VPCOMGTQ, 
+  IX86_BUILTIN_VPCOMGEQ,
+  IX86_BUILTIN_VPCOMFALSEQ,
+  IX86_BUILTIN_VPCOMTRUEQ,
+
+  /* LWP instructions. */
+  IX86_BUILTIN_LLWPCB,
+  IX86_BUILTIN_SLWPCB,
+  IX86_BUILTIN_LWPVAL32,
+  IX86_BUILTIN_LWPVAL64,
+  IX86_BUILTIN_LWPINS32,
+  IX86_BUILTIN_LWPINS64,
+
+  IX86_BUILTIN_CLZS,
+
+  /* BMI instructions. */
+  IX86_BUILTIN_BEXTR32,
+  IX86_BUILTIN_BEXTR64,
+  IX86_BUILTIN_CTZS,
+
+  /* TBM instructions. */
+  IX86_BUILTIN_BEXTRI32,
+  IX86_BUILTIN_BEXTRI64,
+
+
+  /* FSGSBASE instructions. */
+  IX86_BUILTIN_RDFSBASE32,
+  IX86_BUILTIN_RDFSBASE64,
+  IX86_BUILTIN_RDGSBASE32,
+  IX86_BUILTIN_RDGSBASE64,
+  IX86_BUILTIN_WRFSBASE32,
+  IX86_BUILTIN_WRFSBASE64,
+  IX86_BUILTIN_WRGSBASE32,
+  IX86_BUILTIN_WRGSBASE64,
+
+  /* RDRND instructions. */
+  IX86_BUILTIN_RDRAND16_STEP,
+  IX86_BUILTIN_RDRAND32_STEP,
+  IX86_BUILTIN_RDRAND64_STEP,
+
+  /* F16C instructions. */
+  IX86_BUILTIN_CVTPH2PS,
+  IX86_BUILTIN_CVTPH2PS256,
+  IX86_BUILTIN_CVTPS2PH,
+  IX86_BUILTIN_CVTPS2PH256,
+
+  /* CFString built-in for darwin */
+  IX86_BUILTIN_CFSTRING,
+
+  IX86_BUILTIN_MAX
+};
+
+/* Table for the ix86 builtin decls. */
+static GTY(()) tree ix86_builtins[(int) IX86_BUILTIN_MAX];
+
+/* Table of all of the builtin functions that are possible with different ISA's
+   but are waiting to be built until a function is declared to use that
+   ISA. */
+struct builtin_isa {
+  const char *name;  /* function name */
+  enum ix86_builtin_func_type tcode;  /* type to use in the declaration */
+  int isa;  /* isa_flags this builtin is defined for */
+  bool const_p;  /* true if the declaration is constant */
+  bool set_and_not_built_p;
+};
+
+static struct builtin_isa ix86_builtins_isa[(int) IX86_BUILTIN_MAX];
+
+
+/* Add an ix86 target builtin function with CODE, NAME and TYPE. Save the MASK
+   of which isa_flags to use in the ix86_builtins_isa array. Stores the
+   function decl in the ix86_builtins array. Returns the function decl or
+   NULL_TREE, if the builtin was not added.
+
+   If the front end has a special hook for builtin functions, delay adding
+   builtin functions that aren't in the current ISA until the ISA is changed
+   with function specific optimization. Doing so, can save about 300K for the
+   default compiler. When the builtin is expanded, check at that time whether
+   it is valid.
+
+   If the front end doesn't have a special hook, record all builtins, even if
+   it isn't an instruction set in the current ISA in case the user uses
+   function specific options for a different ISA, so that we don't get scope
+   errors if a builtin is added in the middle of a function scope. */
+
+static inline tree
+def_builtin (int mask, const char *name, enum ix86_builtin_func_type tcode,
+             enum ix86_builtins code)
+{
+  tree decl = NULL_TREE;
+
+  if (!(mask & OPTION_MASK_ISA_64BIT) || TARGET_64BIT)
+    {
+      ix86_builtins_isa[(int) code].isa = mask;
+
+      mask &= ~OPTION_MASK_ISA_64BIT;
+      if (mask == 0
+          || (mask & ix86_isa_flags) != 0
+          || (lang_hooks.builtin_function
+              == lang_hooks.builtin_function_ext_scope))
+
+        {
+          tree type = ix86_get_builtin_func_type (tcode);
+          decl = add_builtin_function (name, type, code, BUILT_IN_MD,
+                                       NULL, NULL_TREE);
+          ix86_builtins[(int) code] = decl;
+          ix86_builtins_isa[(int) code].set_and_not_built_p = false;
+        }
+      else
+        {
+          ix86_builtins[(int) code] = NULL_TREE;
+          ix86_builtins_isa[(int) code].tcode = tcode;
+          ix86_builtins_isa[(int) code].name = name;
+          ix86_builtins_isa[(int) code].const_p = false;
+          ix86_builtins_isa[(int) code].set_and_not_built_p = true;
+        }
+    }
+
+  return decl;
+}
+
+/* Like def_builtin, but also marks the function decl "const". */
+
+static inline tree
+def_builtin_const (int mask, const char *name,
+                   enum ix86_builtin_func_type tcode, enum ix86_builtins code)
+{
+  tree decl = def_builtin (mask, name, tcode, code);
+  if (decl)
+    TREE_READONLY (decl) = 1;
+  else
+    ix86_builtins_isa[(int) code].const_p = true;
+
+  return decl;
+}
+
+/* Add any new builtin functions for a given ISA that may not have been
+   declared. This saves a bit of space compared to adding all of the
+   declarations to the tree, even if we didn't use them. */
+
+static void
+ix86_add_new_builtins (int isa)
+{
+  int i;
+
+  for (i = 0; i < (int)IX86_BUILTIN_MAX; i++)
+    {
+      if ((ix86_builtins_isa[i].isa & isa) != 0
+          && ix86_builtins_isa[i].set_and_not_built_p)
+        {
+          tree decl, type;
+
+          /* Don't define the builtin again. */
+          ix86_builtins_isa[i].set_and_not_built_p = false;
+
+          type = ix86_get_builtin_func_type (ix86_builtins_isa[i].tcode);
+          decl = add_builtin_function_ext_scope (ix86_builtins_isa[i].name,
+                                                 type, i, BUILT_IN_MD, NULL,
+                                                 NULL_TREE);
+
+          ix86_builtins[i] = decl;
+          if (ix86_builtins_isa[i].const_p)
+            TREE_READONLY (decl) = 1;
+        }
+    }
+}
+
+/* Bits for builtin_description.flag. */
+
+/* Set when we don't support the comparison natively, and should
+   swap_comparison in order to support it. */
+#define BUILTIN_DESC_SWAP_OPERANDS 1
+
+struct builtin_description
+{
+  const unsigned int mask;
+  const enum insn_code icode;
+  const char *const name;
+  const enum ix86_builtins code;
+  const enum rtx_code comparison;
+  const int flag;
+};
+
+static const struct builtin_description bdesc_comi[] =
+{
+  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comieq", IX86_BUILTIN_COMIEQSS, UNEQ, 0 },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comilt", IX86_BUILTIN_COMILTSS, UNLT, 0 },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comile", IX86_BUILTIN_COMILESS, UNLE, 0 },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comigt", IX86_BUILTIN_COMIGTSS, GT, 0 },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comige", IX86_BUILTIN_COMIGESS, GE, 0 },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comineq", IX86_BUILTIN_COMINEQSS, LTGT, 0 },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomieq", IX86_BUILTIN_UCOMIEQSS, UNEQ, 0 },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomilt", IX86_BUILTIN_UCOMILTSS, UNLT, 0 },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomile", IX86_BUILTIN_UCOMILESS, UNLE, 0 },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomigt", IX86_BUILTIN_UCOMIGTSS, GT, 0 },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomige", IX86_BUILTIN_UCOMIGESS, GE, 0 },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomineq", IX86_BUILTIN_UCOMINEQSS, LTGT, 0 },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdeq", IX86_BUILTIN_COMIEQSD, UNEQ, 0 },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdlt", IX86_BUILTIN_COMILTSD, UNLT, 0 },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdle", IX86_BUILTIN_COMILESD, UNLE, 0 },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdgt", IX86_BUILTIN_COMIGTSD, GT, 0 },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdge", IX86_BUILTIN_COMIGESD, GE, 0 },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdneq", IX86_BUILTIN_COMINEQSD, LTGT, 0 },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdeq", IX86_BUILTIN_UCOMIEQSD, UNEQ, 0 },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdlt", IX86_BUILTIN_UCOMILTSD, UNLT, 0 },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdle", IX86_BUILTIN_UCOMILESD, UNLE, 0 },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdgt", IX86_BUILTIN_UCOMIGTSD, GT, 0 },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdge", IX86_BUILTIN_UCOMIGESD, GE, 0 },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdneq", IX86_BUILTIN_UCOMINEQSD, LTGT, 0 },
+};
+
+static const struct builtin_description bdesc_pcmpestr[] =
+{
+  /* SSE4.2 */
+  { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestri128", IX86_BUILTIN_PCMPESTRI128, UNKNOWN, 0 },
+  { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrm128", IX86_BUILTIN_PCMPESTRM128, UNKNOWN, 0 },
+  { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestria128", IX86_BUILTIN_PCMPESTRA128, UNKNOWN, (int) CCAmode },
+  { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestric128", IX86_BUILTIN_PCMPESTRC128, UNKNOWN, (int) CCCmode },
+  { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrio128", IX86_BUILTIN_PCMPESTRO128, UNKNOWN, (int) CCOmode },
+  { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestris128", IX86_BUILTIN_PCMPESTRS128, UNKNOWN, (int) CCSmode },
+  { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestriz128", IX86_BUILTIN_PCMPESTRZ128, UNKNOWN, (int) CCZmode },
+};
+
+static const struct builtin_description bdesc_pcmpistr[] =
+{
+  /* SSE4.2 */
+  { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistri128", IX86_BUILTIN_PCMPISTRI128, UNKNOWN, 0 },
+  { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrm128", IX86_BUILTIN_PCMPISTRM128, UNKNOWN, 0 },
+  { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistria128", IX86_BUILTIN_PCMPISTRA128, UNKNOWN, (int) CCAmode },
+  { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistric128", IX86_BUILTIN_PCMPISTRC128, UNKNOWN, (int) CCCmode },
+  { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrio128", IX86_BUILTIN_PCMPISTRO128, UNKNOWN, (int) CCOmode },
+  { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistris128", IX86_BUILTIN_PCMPISTRS128, UNKNOWN, (int) CCSmode },
+  { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistriz128", IX86_BUILTIN_PCMPISTRZ128, UNKNOWN, (int) CCZmode },
+};
+
+/* Special builtins with variable number of arguments. */
+static const struct builtin_description bdesc_special_args[] =
+{
+  { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rdtsc, "__builtin_ia32_rdtsc", IX86_BUILTIN_RDTSC, UNKNOWN, (int) UINT64_FTYPE_VOID },
+  { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rdtscp, "__builtin_ia32_rdtscp", IX86_BUILTIN_RDTSCP, UNKNOWN, (int) UINT64_FTYPE_PUNSIGNED },
+
+  /* MMX */
+  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_emms, "__builtin_ia32_emms", IX86_BUILTIN_EMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
+
+  /* 3DNow! */
+  { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_femms, "__builtin_ia32_femms", IX86_BUILTIN_FEMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
+
+  /* SSE */
+  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movups, "__builtin_ia32_storeups", IX86_BUILTIN_STOREUPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movntv4sf, "__builtin_ia32_movntps", IX86_BUILTIN_MOVNTPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movups, "__builtin_ia32_loadups", IX86_BUILTIN_LOADUPS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
+
+  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadhps_exp, "__builtin_ia32_loadhps", IX86_BUILTIN_LOADHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadlps_exp, "__builtin_ia32_loadlps", IX86_BUILTIN_LOADLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storehps, "__builtin_ia32_storehps", IX86_BUILTIN_STOREHPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storelps, "__builtin_ia32_storelps", IX86_BUILTIN_STORELPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
+
+  /* SSE or 3DNow!A */
+  { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_sfence, "__builtin_ia32_sfence", IX86_BUILTIN_SFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
+  { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_movntdi, "__builtin_ia32_movntq", IX86_BUILTIN_MOVNTQ, UNKNOWN, (int) VOID_FTYPE_PULONGLONG_ULONGLONG },
+
+  /* SSE2 */
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lfence, "__builtin_ia32_lfence", IX86_BUILTIN_LFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_mfence, 0, IX86_BUILTIN_MFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movupd, "__builtin_ia32_storeupd", IX86_BUILTIN_STOREUPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movdqu, "__builtin_ia32_storedqu", IX86_BUILTIN_STOREDQU, UNKNOWN, (int) VOID_FTYPE_PCHAR_V16QI },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2df, "__builtin_ia32_movntpd", IX86_BUILTIN_MOVNTPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2di, "__builtin_ia32_movntdq", IX86_BUILTIN_MOVNTDQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntsi, "__builtin_ia32_movnti", IX86_BUILTIN_MOVNTI, UNKNOWN, (int) VOID_FTYPE_PINT_INT },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movupd, "__builtin_ia32_loadupd", IX86_BUILTIN_LOADUPD, UNKNOWN, (int) V2DF_FTYPE_PCDOUBLE },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movdqu, "__builtin_ia32_loaddqu", IX86_BUILTIN_LOADDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
+
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadhpd_exp, "__builtin_ia32_loadhpd", IX86_BUILTIN_LOADHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadlpd_exp, "__builtin_ia32_loadlpd", IX86_BUILTIN_LOADLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
+
+  /* SSE3 */
+  { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_lddqu, "__builtin_ia32_lddqu", IX86_BUILTIN_LDDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
+
+  /* SSE4.1 */
+  { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_movntdqa, "__builtin_ia32_movntdqa", IX86_BUILTIN_MOVNTDQA, UNKNOWN, (int) V2DI_FTYPE_PV2DI },
+
+  /* SSE4A */
+  { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv2df, "__builtin_ia32_movntsd", IX86_BUILTIN_MOVNTSD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
+  { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv4sf, "__builtin_ia32_movntss", IX86_BUILTIN_MOVNTSS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
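/* [Editor's note -- illustrative sketch, not part of the upstream patch.]
   The bdesc_* tables in this file are data only; each entry still has to be
   fed through def_builtin ()/def_builtin_const () so that its decl is either
   created immediately or parked in ix86_builtins_isa[] until
   ix86_add_new_builtins () enables the corresponding ISA.  The loop below is
   a simplified sketch of how such a table might be walked; the function name
   example_register_special_builtins and the ARRAY_SIZE macro are assumptions
   made for illustration, and the code is kept under #if 0 so it stays inert.  */
#if 0
static void
example_register_special_builtins (void)
{
  size_t i;
  const struct builtin_description *d;

  for (i = 0, d = bdesc_special_args; i < ARRAY_SIZE (bdesc_special_args);
       i++, d++)
    if (d->name != 0)
      /* For these entries d->flag carries the ix86_builtin_func_type;
	 d->mask decides whether the decl is built now or deferred.  */
      def_builtin (d->mask, d->name,
		   (enum ix86_builtin_func_type) d->flag, d->code);
}
#endif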
+ + /* AVX */ + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroall, "__builtin_ia32_vzeroall", IX86_BUILTIN_VZEROALL, UNKNOWN, (int) VOID_FTYPE_VOID }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroupper, "__builtin_ia32_vzeroupper", IX86_BUILTIN_VZEROUPPER, UNKNOWN, (int) VOID_FTYPE_VOID }, + + { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4sf, "__builtin_ia32_vbroadcastss", IX86_BUILTIN_VBROADCASTSS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT }, + { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4df, "__builtin_ia32_vbroadcastsd256", IX86_BUILTIN_VBROADCASTSD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE }, + { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv8sf, "__builtin_ia32_vbroadcastss256", IX86_BUILTIN_VBROADCASTSS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v4df, "__builtin_ia32_vbroadcastf128_pd256", IX86_BUILTIN_VBROADCASTPD256, UNKNOWN, (int) V4DF_FTYPE_PCV2DF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v8sf, "__builtin_ia32_vbroadcastf128_ps256", IX86_BUILTIN_VBROADCASTPS256, UNKNOWN, (int) V8SF_FTYPE_PCV4SF }, + + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movupd256, "__builtin_ia32_loadupd256", IX86_BUILTIN_LOADUPD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movups256, "__builtin_ia32_loadups256", IX86_BUILTIN_LOADUPS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movupd256, "__builtin_ia32_storeupd256", IX86_BUILTIN_STOREUPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movups256, "__builtin_ia32_storeups256", IX86_BUILTIN_STOREUPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movdqu256, "__builtin_ia32_loaddqu256", IX86_BUILTIN_LOADDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movdqu256, "__builtin_ia32_storedqu256", IX86_BUILTIN_STOREDQU256, UNKNOWN, (int) VOID_FTYPE_PCHAR_V32QI }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_lddqu256, "__builtin_ia32_lddqu256", IX86_BUILTIN_LDDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR }, + + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4di, "__builtin_ia32_movntdq256", IX86_BUILTIN_MOVNTDQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4df, "__builtin_ia32_movntpd256", IX86_BUILTIN_MOVNTPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv8sf, "__builtin_ia32_movntps256", IX86_BUILTIN_MOVNTPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF }, + + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd, "__builtin_ia32_maskloadpd", IX86_BUILTIN_MASKLOADPD, UNKNOWN, (int) V2DF_FTYPE_PCV2DF_V2DI }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps, "__builtin_ia32_maskloadps", IX86_BUILTIN_MASKLOADPS, UNKNOWN, (int) V4SF_FTYPE_PCV4SF_V4SI }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd256, "__builtin_ia32_maskloadpd256", IX86_BUILTIN_MASKLOADPD256, UNKNOWN, (int) V4DF_FTYPE_PCV4DF_V4DI }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps256, "__builtin_ia32_maskloadps256", IX86_BUILTIN_MASKLOADPS256, UNKNOWN, (int) V8SF_FTYPE_PCV8SF_V8SI }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd, "__builtin_ia32_maskstorepd", IX86_BUILTIN_MASKSTOREPD, UNKNOWN, (int) VOID_FTYPE_PV2DF_V2DI_V2DF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps, "__builtin_ia32_maskstoreps", IX86_BUILTIN_MASKSTOREPS, UNKNOWN, (int) VOID_FTYPE_PV4SF_V4SI_V4SF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd256, "__builtin_ia32_maskstorepd256", IX86_BUILTIN_MASKSTOREPD256, UNKNOWN, (int) VOID_FTYPE_PV4DF_V4DI_V4DF }, + { 
OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps256, "__builtin_ia32_maskstoreps256", IX86_BUILTIN_MASKSTOREPS256, UNKNOWN, (int) VOID_FTYPE_PV8SF_V8SI_V8SF }, + + { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_llwpcb, "__builtin_ia32_llwpcb", IX86_BUILTIN_LLWPCB, UNKNOWN, (int) VOID_FTYPE_PVOID }, + { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_slwpcb, "__builtin_ia32_slwpcb", IX86_BUILTIN_SLWPCB, UNKNOWN, (int) PVOID_FTYPE_VOID }, + { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvalsi3, "__builtin_ia32_lwpval32", IX86_BUILTIN_LWPVAL32, UNKNOWN, (int) VOID_FTYPE_UINT_UINT_UINT }, + { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvaldi3, "__builtin_ia32_lwpval64", IX86_BUILTIN_LWPVAL64, UNKNOWN, (int) VOID_FTYPE_UINT64_UINT_UINT }, + { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinssi3, "__builtin_ia32_lwpins32", IX86_BUILTIN_LWPINS32, UNKNOWN, (int) UCHAR_FTYPE_UINT_UINT_UINT }, + { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinsdi3, "__builtin_ia32_lwpins64", IX86_BUILTIN_LWPINS64, UNKNOWN, (int) UCHAR_FTYPE_UINT64_UINT_UINT }, + + /* FSGSBASE */ + { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasesi, "__builtin_ia32_rdfsbase32", IX86_BUILTIN_RDFSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID }, + { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasedi, "__builtin_ia32_rdfsbase64", IX86_BUILTIN_RDFSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID }, + { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasesi, "__builtin_ia32_rdgsbase32", IX86_BUILTIN_RDGSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID }, + { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasedi, "__builtin_ia32_rdgsbase64", IX86_BUILTIN_RDGSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID }, + { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasesi, "__builtin_ia32_wrfsbase32", IX86_BUILTIN_WRFSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED }, + { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasedi, "__builtin_ia32_wrfsbase64", IX86_BUILTIN_WRFSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 }, + { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasesi, "__builtin_ia32_wrgsbase32", IX86_BUILTIN_WRGSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED }, + { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasedi, "__builtin_ia32_wrgsbase64", IX86_BUILTIN_WRGSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 }, +}; + +/* Builtins with variable number of arguments. 
*/ +static const struct builtin_description bdesc_args[] = +{ + { ~OPTION_MASK_ISA_64BIT, CODE_FOR_bsr, "__builtin_ia32_bsrsi", IX86_BUILTIN_BSRSI, UNKNOWN, (int) INT_FTYPE_INT }, + { OPTION_MASK_ISA_64BIT, CODE_FOR_bsr_rex64, "__builtin_ia32_bsrdi", IX86_BUILTIN_BSRDI, UNKNOWN, (int) INT64_FTYPE_INT64 }, + { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rdpmc, "__builtin_ia32_rdpmc", IX86_BUILTIN_RDPMC, UNKNOWN, (int) UINT64_FTYPE_INT }, + { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlqi3, "__builtin_ia32_rolqi", IX86_BUILTIN_ROLQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT }, + { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlhi3, "__builtin_ia32_rolhi", IX86_BUILTIN_ROLHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT }, + { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrqi3, "__builtin_ia32_rorqi", IX86_BUILTIN_RORQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT }, + { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrhi3, "__builtin_ia32_rorhi", IX86_BUILTIN_RORHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT }, + + /* MMX */ + { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv8qi3, "__builtin_ia32_paddb", IX86_BUILTIN_PADDB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI }, + { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv4hi3, "__builtin_ia32_paddw", IX86_BUILTIN_PADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI }, + { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv2si3, "__builtin_ia32_paddd", IX86_BUILTIN_PADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI }, + { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv8qi3, "__builtin_ia32_psubb", IX86_BUILTIN_PSUBB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI }, + { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv4hi3, "__builtin_ia32_psubw", IX86_BUILTIN_PSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI }, + { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv2si3, "__builtin_ia32_psubd", IX86_BUILTIN_PSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI }, + + { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv8qi3, "__builtin_ia32_paddsb", IX86_BUILTIN_PADDSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI }, + { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv4hi3, "__builtin_ia32_paddsw", IX86_BUILTIN_PADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI }, + { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv8qi3, "__builtin_ia32_psubsb", IX86_BUILTIN_PSUBSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI }, + { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv4hi3, "__builtin_ia32_psubsw", IX86_BUILTIN_PSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI }, + { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv8qi3, "__builtin_ia32_paddusb", IX86_BUILTIN_PADDUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI }, + { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv4hi3, "__builtin_ia32_paddusw", IX86_BUILTIN_PADDUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI }, + { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv8qi3, "__builtin_ia32_psubusb", IX86_BUILTIN_PSUBUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI }, + { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv4hi3, "__builtin_ia32_psubusw", IX86_BUILTIN_PSUBUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI }, + + { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_mulv4hi3, "__builtin_ia32_pmullw", IX86_BUILTIN_PMULLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI }, + { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_smulv4hi3_highpart, "__builtin_ia32_pmulhw", IX86_BUILTIN_PMULHW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI }, + + { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andv2si3, "__builtin_ia32_pand", IX86_BUILTIN_PAND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI }, + { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andnotv2si3, "__builtin_ia32_pandn", IX86_BUILTIN_PANDN, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI }, + { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_iorv2si3, "__builtin_ia32_por", IX86_BUILTIN_POR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI }, + { OPTION_MASK_ISA_MMX, 
CODE_FOR_mmx_xorv2si3, "__builtin_ia32_pxor", IX86_BUILTIN_PXOR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI }, + + { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv8qi3, "__builtin_ia32_pcmpeqb", IX86_BUILTIN_PCMPEQB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI }, + { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv4hi3, "__builtin_ia32_pcmpeqw", IX86_BUILTIN_PCMPEQW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI }, + { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv2si3, "__builtin_ia32_pcmpeqd", IX86_BUILTIN_PCMPEQD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI }, + { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv8qi3, "__builtin_ia32_pcmpgtb", IX86_BUILTIN_PCMPGTB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI }, + { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv4hi3, "__builtin_ia32_pcmpgtw", IX86_BUILTIN_PCMPGTW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI }, + { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv2si3, "__builtin_ia32_pcmpgtd", IX86_BUILTIN_PCMPGTD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI }, + + { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhbw, "__builtin_ia32_punpckhbw", IX86_BUILTIN_PUNPCKHBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI }, + { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhwd, "__builtin_ia32_punpckhwd", IX86_BUILTIN_PUNPCKHWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI }, + { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhdq, "__builtin_ia32_punpckhdq", IX86_BUILTIN_PUNPCKHDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI }, + { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklbw, "__builtin_ia32_punpcklbw", IX86_BUILTIN_PUNPCKLBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI }, + { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklwd, "__builtin_ia32_punpcklwd", IX86_BUILTIN_PUNPCKLWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI}, + { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckldq, "__builtin_ia32_punpckldq", IX86_BUILTIN_PUNPCKLDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI}, + + { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packsswb, "__builtin_ia32_packsswb", IX86_BUILTIN_PACKSSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI }, + { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packssdw, "__builtin_ia32_packssdw", IX86_BUILTIN_PACKSSDW, UNKNOWN, (int) V4HI_FTYPE_V2SI_V2SI }, + { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packuswb, "__builtin_ia32_packuswb", IX86_BUILTIN_PACKUSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI }, + + { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_pmaddwd, "__builtin_ia32_pmaddwd", IX86_BUILTIN_PMADDWD, UNKNOWN, (int) V2SI_FTYPE_V4HI_V4HI }, + + { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllwi", IX86_BUILTIN_PSLLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT }, + { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslldi", IX86_BUILTIN_PSLLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT }, + { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllqi", IX86_BUILTIN_PSLLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT }, + { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllw", IX86_BUILTIN_PSLLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT }, + { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslld", IX86_BUILTIN_PSLLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT }, + { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllq", IX86_BUILTIN_PSLLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT }, + + { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlwi", IX86_BUILTIN_PSRLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT }, + { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrldi", IX86_BUILTIN_PSRLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT }, + { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlqi", IX86_BUILTIN_PSRLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT }, + { 
OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlw", IX86_BUILTIN_PSRLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT }, + { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrld", IX86_BUILTIN_PSRLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT }, + { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlq", IX86_BUILTIN_PSRLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT }, + + { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psrawi", IX86_BUILTIN_PSRAWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT }, + { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psradi", IX86_BUILTIN_PSRADI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT }, + { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psraw", IX86_BUILTIN_PSRAW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT }, + { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psrad", IX86_BUILTIN_PSRAD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT }, + + /* 3DNow! */ + { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pf2id, "__builtin_ia32_pf2id", IX86_BUILTIN_PF2ID, UNKNOWN, (int) V2SI_FTYPE_V2SF }, + { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_floatv2si2, "__builtin_ia32_pi2fd", IX86_BUILTIN_PI2FD, UNKNOWN, (int) V2SF_FTYPE_V2SI }, + { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpv2sf2, "__builtin_ia32_pfrcp", IX86_BUILTIN_PFRCP, UNKNOWN, (int) V2SF_FTYPE_V2SF }, + { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqrtv2sf2, "__builtin_ia32_pfrsqrt", IX86_BUILTIN_PFRSQRT, UNKNOWN, (int) V2SF_FTYPE_V2SF }, + + { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgusb", IX86_BUILTIN_PAVGUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI }, + { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_haddv2sf3, "__builtin_ia32_pfacc", IX86_BUILTIN_PFACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF }, + { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_addv2sf3, "__builtin_ia32_pfadd", IX86_BUILTIN_PFADD, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF }, + { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_eqv2sf3, "__builtin_ia32_pfcmpeq", IX86_BUILTIN_PFCMPEQ, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF }, + { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gev2sf3, "__builtin_ia32_pfcmpge", IX86_BUILTIN_PFCMPGE, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF }, + { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gtv2sf3, "__builtin_ia32_pfcmpgt", IX86_BUILTIN_PFCMPGT, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF }, + { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_smaxv2sf3, "__builtin_ia32_pfmax", IX86_BUILTIN_PFMAX, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF }, + { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_sminv2sf3, "__builtin_ia32_pfmin", IX86_BUILTIN_PFMIN, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF }, + { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_mulv2sf3, "__builtin_ia32_pfmul", IX86_BUILTIN_PFMUL, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF }, + { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit1v2sf3, "__builtin_ia32_pfrcpit1", IX86_BUILTIN_PFRCPIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF }, + { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit2v2sf3, "__builtin_ia32_pfrcpit2", IX86_BUILTIN_PFRCPIT2, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF }, + { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqit1v2sf3, "__builtin_ia32_pfrsqit1", IX86_BUILTIN_PFRSQIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF }, + { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subv2sf3, "__builtin_ia32_pfsub", IX86_BUILTIN_PFSUB, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF }, + { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subrv2sf3, "__builtin_ia32_pfsubr", IX86_BUILTIN_PFSUBR, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF }, + { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pmulhrwv4hi3, "__builtin_ia32_pmulhrw", IX86_BUILTIN_PMULHRW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI }, + + 
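/* [Editor's note -- illustrative sketch, not part of the upstream patch.]
   User code normally reaches the MMX entries above through the wrappers in
   mmintrin.h; for instance __builtin_ia32_paddb (IX86_BUILTIN_PADDB, pattern
   mmx_addv8qi3 in the table) underlies the _mm_add_pi8 intrinsic.  A minimal
   usage example, assuming <mmintrin.h> and compilation with -mmmx; the
   function name example_add_bytes is hypothetical.  */
#if 0
#include <mmintrin.h>

__m64
example_add_bytes (__m64 a, __m64 b)
{
  /* Lowered to the IX86_BUILTIN_PADDB builtin listed in bdesc_args.  */
  return _mm_add_pi8 (a, b);
}
#endif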
/* 3DNow!A */ + { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pf2iw, "__builtin_ia32_pf2iw", IX86_BUILTIN_PF2IW, UNKNOWN, (int) V2SI_FTYPE_V2SF }, + { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pi2fw, "__builtin_ia32_pi2fw", IX86_BUILTIN_PI2FW, UNKNOWN, (int) V2SF_FTYPE_V2SI }, + { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2si2, "__builtin_ia32_pswapdsi", IX86_BUILTIN_PSWAPDSI, UNKNOWN, (int) V2SI_FTYPE_V2SI }, + { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2sf2, "__builtin_ia32_pswapdsf", IX86_BUILTIN_PSWAPDSF, UNKNOWN, (int) V2SF_FTYPE_V2SF }, + { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_hsubv2sf3, "__builtin_ia32_pfnacc", IX86_BUILTIN_PFNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF }, + { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_addsubv2sf3, "__builtin_ia32_pfpnacc", IX86_BUILTIN_PFPNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF }, + + /* SSE */ + { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movmskps, "__builtin_ia32_movmskps", IX86_BUILTIN_MOVMSKPS, UNKNOWN, (int) INT_FTYPE_V4SF }, + { OPTION_MASK_ISA_SSE, CODE_FOR_sse_sqrtv4sf2, "__builtin_ia32_sqrtps", IX86_BUILTIN_SQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF }, + { OPTION_MASK_ISA_SSE, CODE_FOR_sqrtv4sf2, "__builtin_ia32_sqrtps_nr", IX86_BUILTIN_SQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF }, + { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rsqrtv4sf2, "__builtin_ia32_rsqrtps", IX86_BUILTIN_RSQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF }, + { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtv4sf2, "__builtin_ia32_rsqrtps_nr", IX86_BUILTIN_RSQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF }, + { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rcpv4sf2, "__builtin_ia32_rcpps", IX86_BUILTIN_RCPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF }, + { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtps2pi, "__builtin_ia32_cvtps2pi", IX86_BUILTIN_CVTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF }, + { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtss2si, "__builtin_ia32_cvtss2si", IX86_BUILTIN_CVTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF }, + { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtss2siq, "__builtin_ia32_cvtss2si64", IX86_BUILTIN_CVTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF }, + { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttps2pi, "__builtin_ia32_cvttps2pi", IX86_BUILTIN_CVTTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF }, + { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttss2si, "__builtin_ia32_cvttss2si", IX86_BUILTIN_CVTTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF }, + { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvttss2siq, "__builtin_ia32_cvttss2si64", IX86_BUILTIN_CVTTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF }, + + { OPTION_MASK_ISA_SSE, CODE_FOR_sse_shufps, "__builtin_ia32_shufps", IX86_BUILTIN_SHUFPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT }, + + { OPTION_MASK_ISA_SSE, CODE_FOR_addv4sf3, "__builtin_ia32_addps", IX86_BUILTIN_ADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF }, + { OPTION_MASK_ISA_SSE, CODE_FOR_subv4sf3, "__builtin_ia32_subps", IX86_BUILTIN_SUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF }, + { OPTION_MASK_ISA_SSE, CODE_FOR_mulv4sf3, "__builtin_ia32_mulps", IX86_BUILTIN_MULPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF }, + { OPTION_MASK_ISA_SSE, CODE_FOR_sse_divv4sf3, "__builtin_ia32_divps", IX86_BUILTIN_DIVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF }, + { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmaddv4sf3, "__builtin_ia32_addss", IX86_BUILTIN_ADDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF }, + { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsubv4sf3, "__builtin_ia32_subss", IX86_BUILTIN_SUBSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF }, + { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmulv4sf3, "__builtin_ia32_mulss", IX86_BUILTIN_MULSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF }, + { 
OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmdivv4sf3, "__builtin_ia32_divss", IX86_BUILTIN_DIVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF }, + + { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpeqps", IX86_BUILTIN_CMPEQPS, EQ, (int) V4SF_FTYPE_V4SF_V4SF }, + { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpltps", IX86_BUILTIN_CMPLTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF }, + { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpleps", IX86_BUILTIN_CMPLEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF }, + { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgtps", IX86_BUILTIN_CMPGTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP }, + { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgeps", IX86_BUILTIN_CMPGEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP }, + { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpunordps", IX86_BUILTIN_CMPUNORDPS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF }, + { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpneqps", IX86_BUILTIN_CMPNEQPS, NE, (int) V4SF_FTYPE_V4SF_V4SF }, + { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnltps", IX86_BUILTIN_CMPNLTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF }, + { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnleps", IX86_BUILTIN_CMPNLEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF }, + { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngtps", IX86_BUILTIN_CMPNGTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP }, + { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngeps", IX86_BUILTIN_CMPNGEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP}, + { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpordps", IX86_BUILTIN_CMPORDPS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF }, + { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpeqss", IX86_BUILTIN_CMPEQSS, EQ, (int) V4SF_FTYPE_V4SF_V4SF }, + { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpltss", IX86_BUILTIN_CMPLTSS, LT, (int) V4SF_FTYPE_V4SF_V4SF }, + { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpless", IX86_BUILTIN_CMPLESS, LE, (int) V4SF_FTYPE_V4SF_V4SF }, + { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpunordss", IX86_BUILTIN_CMPUNORDSS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF }, + { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpneqss", IX86_BUILTIN_CMPNEQSS, NE, (int) V4SF_FTYPE_V4SF_V4SF }, + { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnltss", IX86_BUILTIN_CMPNLTSS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF }, + { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnless", IX86_BUILTIN_CMPNLESS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF }, + { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpngtss", IX86_BUILTIN_CMPNGTSS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP }, + { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpngess", IX86_BUILTIN_CMPNGESS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP }, + { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpordss", IX86_BUILTIN_CMPORDSS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF }, + + { OPTION_MASK_ISA_SSE, CODE_FOR_sminv4sf3, "__builtin_ia32_minps", IX86_BUILTIN_MINPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF }, + { OPTION_MASK_ISA_SSE, CODE_FOR_smaxv4sf3, "__builtin_ia32_maxps", IX86_BUILTIN_MAXPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF }, + { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsminv4sf3, "__builtin_ia32_minss", 
IX86_BUILTIN_MINSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF }, + { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsmaxv4sf3, "__builtin_ia32_maxss", IX86_BUILTIN_MAXSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF }, + + { OPTION_MASK_ISA_SSE, CODE_FOR_andv4sf3, "__builtin_ia32_andps", IX86_BUILTIN_ANDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF }, + { OPTION_MASK_ISA_SSE, CODE_FOR_sse_andnotv4sf3, "__builtin_ia32_andnps", IX86_BUILTIN_ANDNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF }, + { OPTION_MASK_ISA_SSE, CODE_FOR_iorv4sf3, "__builtin_ia32_orps", IX86_BUILTIN_ORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF }, + { OPTION_MASK_ISA_SSE, CODE_FOR_xorv4sf3, "__builtin_ia32_xorps", IX86_BUILTIN_XORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF }, + + { OPTION_MASK_ISA_SSE, CODE_FOR_copysignv4sf3, "__builtin_ia32_copysignps", IX86_BUILTIN_CPYSGNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF }, + + { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movss, "__builtin_ia32_movss", IX86_BUILTIN_MOVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF }, + { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movhlps_exp, "__builtin_ia32_movhlps", IX86_BUILTIN_MOVHLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF }, + { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movlhps_exp, "__builtin_ia32_movlhps", IX86_BUILTIN_MOVLHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF }, + { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_highv4sf, "__builtin_ia32_unpckhps", IX86_BUILTIN_UNPCKHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF }, + { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_lowv4sf, "__builtin_ia32_unpcklps", IX86_BUILTIN_UNPCKLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF }, + + { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtpi2ps, "__builtin_ia32_cvtpi2ps", IX86_BUILTIN_CVTPI2PS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2SI }, + { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtsi2ss, "__builtin_ia32_cvtsi2ss", IX86_BUILTIN_CVTSI2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_SI }, + { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtsi2ssq, "__builtin_ia32_cvtsi642ss", IX86_BUILTIN_CVTSI642SS, UNKNOWN, V4SF_FTYPE_V4SF_DI }, + + { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtsf2, "__builtin_ia32_rsqrtf", IX86_BUILTIN_RSQRTF, UNKNOWN, (int) FLOAT_FTYPE_FLOAT }, + + { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsqrtv4sf2, "__builtin_ia32_sqrtss", IX86_BUILTIN_SQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE }, + { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrsqrtv4sf2, "__builtin_ia32_rsqrtss", IX86_BUILTIN_RSQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE }, + { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrcpv4sf2, "__builtin_ia32_rcpss", IX86_BUILTIN_RCPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE }, + + /* SSE MMX or 3Dnow!A */ + { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgb", IX86_BUILTIN_PAVGB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI }, + { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv4hi3, "__builtin_ia32_pavgw", IX86_BUILTIN_PAVGW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI }, + { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umulv4hi3_highpart, "__builtin_ia32_pmulhuw", IX86_BUILTIN_PMULHUW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI }, + + { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umaxv8qi3, "__builtin_ia32_pmaxub", IX86_BUILTIN_PMAXUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI }, + { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_smaxv4hi3, "__builtin_ia32_pmaxsw", IX86_BUILTIN_PMAXSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI }, + { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uminv8qi3, "__builtin_ia32_pminub", IX86_BUILTIN_PMINUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI }, + { 
OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_sminv4hi3, "__builtin_ia32_pminsw", IX86_BUILTIN_PMINSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI }, + + { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_psadbw, "__builtin_ia32_psadbw", IX86_BUILTIN_PSADBW, UNKNOWN, (int) V1DI_FTYPE_V8QI_V8QI }, + { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pmovmskb, "__builtin_ia32_pmovmskb", IX86_BUILTIN_PMOVMSKB, UNKNOWN, (int) INT_FTYPE_V8QI }, + + { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pshufw, "__builtin_ia32_pshufw", IX86_BUILTIN_PSHUFW, UNKNOWN, (int) V4HI_FTYPE_V4HI_INT }, + + /* SSE2 */ + { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_shufpd, "__builtin_ia32_shufpd", IX86_BUILTIN_SHUFPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT }, + + { OPTION_MASK_ISA_SSE2, CODE_FOR_nothing, "__builtin_ia32_vec_perm_v2df", IX86_BUILTIN_VEC_PERM_V2DF, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_V2DI }, + { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin_ia32_vec_perm_v4sf", IX86_BUILTIN_VEC_PERM_V4SF, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_V4SI }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_nothing, "__builtin_ia32_vec_perm_v2di", IX86_BUILTIN_VEC_PERM_V2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_V2DI }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_nothing, "__builtin_ia32_vec_perm_v4si", IX86_BUILTIN_VEC_PERM_V4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_V4SI }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_nothing, "__builtin_ia32_vec_perm_v8hi", IX86_BUILTIN_VEC_PERM_V8HI, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_V8HI }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_nothing, "__builtin_ia32_vec_perm_v16qi", IX86_BUILTIN_VEC_PERM_V16QI, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_V16QI }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_nothing, "__builtin_ia32_vec_perm_v2di_u", IX86_BUILTIN_VEC_PERM_V2DI_U, UNKNOWN, (int) V2UDI_FTYPE_V2UDI_V2UDI_V2UDI }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_nothing, "__builtin_ia32_vec_perm_v4si_u", IX86_BUILTIN_VEC_PERM_V4SI_U, UNKNOWN, (int) V4USI_FTYPE_V4USI_V4USI_V4USI }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_nothing, "__builtin_ia32_vec_perm_v8hi_u", IX86_BUILTIN_VEC_PERM_V8HI_U, UNKNOWN, (int) V8UHI_FTYPE_V8UHI_V8UHI_V8UHI }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_nothing, "__builtin_ia32_vec_perm_v16qi_u", IX86_BUILTIN_VEC_PERM_V16QI_U, UNKNOWN, (int) V16UQI_FTYPE_V16UQI_V16UQI_V16UQI }, + { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin_ia32_vec_perm_v4df", IX86_BUILTIN_VEC_PERM_V4DF, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_V4DI }, + { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin_ia32_vec_perm_v8sf", IX86_BUILTIN_VEC_PERM_V8SF, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_V8SI }, + + { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movmskpd, "__builtin_ia32_movmskpd", IX86_BUILTIN_MOVMSKPD, UNKNOWN, (int) INT_FTYPE_V2DF }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmovmskb, "__builtin_ia32_pmovmskb128", IX86_BUILTIN_PMOVMSKB128, UNKNOWN, (int) INT_FTYPE_V16QI }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_sqrtv2df2, "__builtin_ia32_sqrtpd", IX86_BUILTIN_SQRTPD, UNKNOWN, (int) V2DF_FTYPE_V2DF }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtdq2pd, "__builtin_ia32_cvtdq2pd", IX86_BUILTIN_CVTDQ2PD, UNKNOWN, (int) V2DF_FTYPE_V4SI }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtdq2ps, "__builtin_ia32_cvtdq2ps", IX86_BUILTIN_CVTDQ2PS, UNKNOWN, (int) V4SF_FTYPE_V4SI }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtudq2ps, "__builtin_ia32_cvtudq2ps", IX86_BUILTIN_CVTUDQ2PS, UNKNOWN, (int) V4SF_FTYPE_V4SI }, + + { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2dq, "__builtin_ia32_cvtpd2dq", IX86_BUILTIN_CVTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF }, + { 
OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2pi, "__builtin_ia32_cvtpd2pi", IX86_BUILTIN_CVTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2ps, "__builtin_ia32_cvtpd2ps", IX86_BUILTIN_CVTPD2PS, UNKNOWN, (int) V4SF_FTYPE_V2DF }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2dq, "__builtin_ia32_cvttpd2dq", IX86_BUILTIN_CVTTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2pi, "__builtin_ia32_cvttpd2pi", IX86_BUILTIN_CVTTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF }, + + { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpi2pd, "__builtin_ia32_cvtpi2pd", IX86_BUILTIN_CVTPI2PD, UNKNOWN, (int) V2DF_FTYPE_V2SI }, + + { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2si, "__builtin_ia32_cvtsd2si", IX86_BUILTIN_CVTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttsd2si, "__builtin_ia32_cvttsd2si", IX86_BUILTIN_CVTTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF }, + { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsd2siq, "__builtin_ia32_cvtsd2si64", IX86_BUILTIN_CVTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF }, + { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvttsd2siq, "__builtin_ia32_cvttsd2si64", IX86_BUILTIN_CVTTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF }, + + { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2dq, "__builtin_ia32_cvtps2dq", IX86_BUILTIN_CVTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2pd, "__builtin_ia32_cvtps2pd", IX86_BUILTIN_CVTPS2PD, UNKNOWN, (int) V2DF_FTYPE_V4SF }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttps2dq, "__builtin_ia32_cvttps2dq", IX86_BUILTIN_CVTTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF }, + + { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2df3, "__builtin_ia32_addpd", IX86_BUILTIN_ADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2df3, "__builtin_ia32_subpd", IX86_BUILTIN_SUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv2df3, "__builtin_ia32_mulpd", IX86_BUILTIN_MULPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_divv2df3, "__builtin_ia32_divpd", IX86_BUILTIN_DIVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmaddv2df3, "__builtin_ia32_addsd", IX86_BUILTIN_ADDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsubv2df3, "__builtin_ia32_subsd", IX86_BUILTIN_SUBSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmulv2df3, "__builtin_ia32_mulsd", IX86_BUILTIN_MULSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmdivv2df3, "__builtin_ia32_divsd", IX86_BUILTIN_DIVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF }, + + { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpeqpd", IX86_BUILTIN_CMPEQPD, EQ, (int) V2DF_FTYPE_V2DF_V2DF }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpltpd", IX86_BUILTIN_CMPLTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmplepd", IX86_BUILTIN_CMPLEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgtpd", IX86_BUILTIN_CMPGTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgepd", IX86_BUILTIN_CMPGEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP}, + { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpunordpd", IX86_BUILTIN_CMPUNORDPD, UNORDERED, (int) 
V2DF_FTYPE_V2DF_V2DF }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpneqpd", IX86_BUILTIN_CMPNEQPD, NE, (int) V2DF_FTYPE_V2DF_V2DF }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnltpd", IX86_BUILTIN_CMPNLTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnlepd", IX86_BUILTIN_CMPNLEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngtpd", IX86_BUILTIN_CMPNGTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngepd", IX86_BUILTIN_CMPNGEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpordpd", IX86_BUILTIN_CMPORDPD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpeqsd", IX86_BUILTIN_CMPEQSD, EQ, (int) V2DF_FTYPE_V2DF_V2DF }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpltsd", IX86_BUILTIN_CMPLTSD, LT, (int) V2DF_FTYPE_V2DF_V2DF }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmplesd", IX86_BUILTIN_CMPLESD, LE, (int) V2DF_FTYPE_V2DF_V2DF }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpunordsd", IX86_BUILTIN_CMPUNORDSD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpneqsd", IX86_BUILTIN_CMPNEQSD, NE, (int) V2DF_FTYPE_V2DF_V2DF }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnltsd", IX86_BUILTIN_CMPNLTSD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnlesd", IX86_BUILTIN_CMPNLESD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpordsd", IX86_BUILTIN_CMPORDSD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF }, + + { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv2df3, "__builtin_ia32_minpd", IX86_BUILTIN_MINPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv2df3, "__builtin_ia32_maxpd", IX86_BUILTIN_MAXPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsminv2df3, "__builtin_ia32_minsd", IX86_BUILTIN_MINSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsmaxv2df3, "__builtin_ia32_maxsd", IX86_BUILTIN_MAXSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF }, + + { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2df3, "__builtin_ia32_andpd", IX86_BUILTIN_ANDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2df3, "__builtin_ia32_andnpd", IX86_BUILTIN_ANDNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2df3, "__builtin_ia32_orpd", IX86_BUILTIN_ORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2df3, "__builtin_ia32_xorpd", IX86_BUILTIN_XORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF }, + + { OPTION_MASK_ISA_SSE2, CODE_FOR_copysignv2df3, "__builtin_ia32_copysignpd", IX86_BUILTIN_CPYSGNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF }, + + { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movsd, "__builtin_ia32_movsd", IX86_BUILTIN_MOVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2df, "__builtin_ia32_unpckhpd", IX86_BUILTIN_UNPCKHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2df, 
"__builtin_ia32_unpcklpd", IX86_BUILTIN_UNPCKLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF }, + + { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_pack_sfix_v2df, "__builtin_ia32_vec_pack_sfix", IX86_BUILTIN_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF }, + + { OPTION_MASK_ISA_SSE2, CODE_FOR_addv16qi3, "__builtin_ia32_paddb128", IX86_BUILTIN_PADDB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_addv8hi3, "__builtin_ia32_paddw128", IX86_BUILTIN_PADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_addv4si3, "__builtin_ia32_paddd128", IX86_BUILTIN_PADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2di3, "__builtin_ia32_paddq128", IX86_BUILTIN_PADDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_subv16qi3, "__builtin_ia32_psubb128", IX86_BUILTIN_PSUBB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_subv8hi3, "__builtin_ia32_psubw128", IX86_BUILTIN_PSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_subv4si3, "__builtin_ia32_psubd128", IX86_BUILTIN_PSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2di3, "__builtin_ia32_psubq128", IX86_BUILTIN_PSUBQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI }, + + { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv16qi3, "__builtin_ia32_paddsb128", IX86_BUILTIN_PADDSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv8hi3, "__builtin_ia32_paddsw128", IX86_BUILTIN_PADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv16qi3, "__builtin_ia32_psubsb128", IX86_BUILTIN_PSUBSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv8hi3, "__builtin_ia32_psubsw128", IX86_BUILTIN_PSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv16qi3, "__builtin_ia32_paddusb128", IX86_BUILTIN_PADDUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv8hi3, "__builtin_ia32_paddusw128", IX86_BUILTIN_PADDUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv16qi3, "__builtin_ia32_psubusb128", IX86_BUILTIN_PSUBUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv8hi3, "__builtin_ia32_psubusw128", IX86_BUILTIN_PSUBUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI }, + + { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv8hi3, "__builtin_ia32_pmullw128", IX86_BUILTIN_PMULLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_smulv8hi3_highpart, "__builtin_ia32_pmulhw128", IX86_BUILTIN_PMULHW128, UNKNOWN,(int) V8HI_FTYPE_V8HI_V8HI }, + + { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2di3, "__builtin_ia32_pand128", IX86_BUILTIN_PAND128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2di3, "__builtin_ia32_pandn128", IX86_BUILTIN_PANDN128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2di3, "__builtin_ia32_por128", IX86_BUILTIN_POR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2di3, "__builtin_ia32_pxor128", IX86_BUILTIN_PXOR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI }, + + { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv16qi3, "__builtin_ia32_pavgb128", IX86_BUILTIN_PAVGB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv8hi3, "__builtin_ia32_pavgw128", 
IX86_BUILTIN_PAVGW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI }, + + { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv16qi3, "__builtin_ia32_pcmpeqb128", IX86_BUILTIN_PCMPEQB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv8hi3, "__builtin_ia32_pcmpeqw128", IX86_BUILTIN_PCMPEQW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv4si3, "__builtin_ia32_pcmpeqd128", IX86_BUILTIN_PCMPEQD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv16qi3, "__builtin_ia32_pcmpgtb128", IX86_BUILTIN_PCMPGTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv8hi3, "__builtin_ia32_pcmpgtw128", IX86_BUILTIN_PCMPGTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv4si3, "__builtin_ia32_pcmpgtd128", IX86_BUILTIN_PCMPGTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI }, + + { OPTION_MASK_ISA_SSE2, CODE_FOR_umaxv16qi3, "__builtin_ia32_pmaxub128", IX86_BUILTIN_PMAXUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv8hi3, "__builtin_ia32_pmaxsw128", IX86_BUILTIN_PMAXSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_uminv16qi3, "__builtin_ia32_pminub128", IX86_BUILTIN_PMINUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv8hi3, "__builtin_ia32_pminsw128", IX86_BUILTIN_PMINSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI }, + + { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv16qi, "__builtin_ia32_punpckhbw128", IX86_BUILTIN_PUNPCKHBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv8hi, "__builtin_ia32_punpckhwd128", IX86_BUILTIN_PUNPCKHWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv4si, "__builtin_ia32_punpckhdq128", IX86_BUILTIN_PUNPCKHDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2di, "__builtin_ia32_punpckhqdq128", IX86_BUILTIN_PUNPCKHQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv16qi, "__builtin_ia32_punpcklbw128", IX86_BUILTIN_PUNPCKLBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv8hi, "__builtin_ia32_punpcklwd128", IX86_BUILTIN_PUNPCKLWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv4si, "__builtin_ia32_punpckldq128", IX86_BUILTIN_PUNPCKLDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2di, "__builtin_ia32_punpcklqdq128", IX86_BUILTIN_PUNPCKLQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI }, + + { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packsswb, "__builtin_ia32_packsswb128", IX86_BUILTIN_PACKSSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packssdw, "__builtin_ia32_packssdw128", IX86_BUILTIN_PACKSSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packuswb, "__builtin_ia32_packuswb128", IX86_BUILTIN_PACKUSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI }, + + { OPTION_MASK_ISA_SSE2, CODE_FOR_umulv8hi3_highpart, "__builtin_ia32_pmulhuw128", IX86_BUILTIN_PMULHUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_psadbw, "__builtin_ia32_psadbw128", IX86_BUILTIN_PSADBW128, UNKNOWN, (int) V2DI_FTYPE_V16QI_V16QI }, + + { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv1siv1di3, 
"__builtin_ia32_pmuludq", IX86_BUILTIN_PMULUDQ, UNKNOWN, (int) V1DI_FTYPE_V2SI_V2SI }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv2siv2di3, "__builtin_ia32_pmuludq128", IX86_BUILTIN_PMULUDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI }, + + { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmaddwd, "__builtin_ia32_pmaddwd128", IX86_BUILTIN_PMADDWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI_V8HI }, + + { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsi2sd, "__builtin_ia32_cvtsi2sd", IX86_BUILTIN_CVTSI2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_SI }, + { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsi2sdq, "__builtin_ia32_cvtsi642sd", IX86_BUILTIN_CVTSI642SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_DI }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2ss, "__builtin_ia32_cvtsd2ss", IX86_BUILTIN_CVTSD2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2DF }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtss2sd, "__builtin_ia32_cvtss2sd", IX86_BUILTIN_CVTSS2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V4SF }, + + { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ashlv1ti3, "__builtin_ia32_pslldqi128", IX86_BUILTIN_PSLLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllwi128", IX86_BUILTIN_PSLLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslldi128", IX86_BUILTIN_PSLLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllqi128", IX86_BUILTIN_PSLLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllw128", IX86_BUILTIN_PSLLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslld128", IX86_BUILTIN_PSLLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllq128", IX86_BUILTIN_PSLLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT }, + + { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lshrv1ti3, "__builtin_ia32_psrldqi128", IX86_BUILTIN_PSRLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlwi128", IX86_BUILTIN_PSRLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrldi128", IX86_BUILTIN_PSRLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlqi128", IX86_BUILTIN_PSRLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlw128", IX86_BUILTIN_PSRLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrld128", IX86_BUILTIN_PSRLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlq128", IX86_BUILTIN_PSRLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT }, + + { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psrawi128", IX86_BUILTIN_PSRAWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psradi128", IX86_BUILTIN_PSRADI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psraw128", IX86_BUILTIN_PSRAW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psrad128", IX86_BUILTIN_PSRAD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT }, + + { OPTION_MASK_ISA_SSE2, 
CODE_FOR_sse2_pshufd, "__builtin_ia32_pshufd", IX86_BUILTIN_PSHUFD, UNKNOWN, (int) V4SI_FTYPE_V4SI_INT }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshuflw, "__builtin_ia32_pshuflw", IX86_BUILTIN_PSHUFLW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufhw, "__builtin_ia32_pshufhw", IX86_BUILTIN_PSHUFHW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT }, + + { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsqrtv2df2, "__builtin_ia32_sqrtsd", IX86_BUILTIN_SQRTSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_VEC_MERGE }, + + { OPTION_MASK_ISA_SSE2, CODE_FOR_abstf2, 0, IX86_BUILTIN_FABSQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128 }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_copysigntf3, 0, IX86_BUILTIN_COPYSIGNQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128_FLOAT128 }, + + { OPTION_MASK_ISA_SSE, CODE_FOR_sse2_movq128, "__builtin_ia32_movq128", IX86_BUILTIN_MOVQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI }, + + /* SSE2 MMX */ + { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_addv1di3, "__builtin_ia32_paddq", IX86_BUILTIN_PADDQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_subv1di3, "__builtin_ia32_psubq", IX86_BUILTIN_PSUBQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI }, + + /* SSE3 */ + { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movshdup, "__builtin_ia32_movshdup", IX86_BUILTIN_MOVSHDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF}, + { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movsldup, "__builtin_ia32_movsldup", IX86_BUILTIN_MOVSLDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF }, + + { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv4sf3, "__builtin_ia32_addsubps", IX86_BUILTIN_ADDSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF }, + { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv2df3, "__builtin_ia32_addsubpd", IX86_BUILTIN_ADDSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF }, + { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv4sf3, "__builtin_ia32_haddps", IX86_BUILTIN_HADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF }, + { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv2df3, "__builtin_ia32_haddpd", IX86_BUILTIN_HADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF }, + { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv4sf3, "__builtin_ia32_hsubps", IX86_BUILTIN_HSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF }, + { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv2df3, "__builtin_ia32_hsubpd", IX86_BUILTIN_HSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF }, + + /* SSSE3 */ + { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv16qi2, "__builtin_ia32_pabsb128", IX86_BUILTIN_PABSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI }, + { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8qi2, "__builtin_ia32_pabsb", IX86_BUILTIN_PABSB, UNKNOWN, (int) V8QI_FTYPE_V8QI }, + { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8hi2, "__builtin_ia32_pabsw128", IX86_BUILTIN_PABSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI }, + { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4hi2, "__builtin_ia32_pabsw", IX86_BUILTIN_PABSW, UNKNOWN, (int) V4HI_FTYPE_V4HI }, + { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4si2, "__builtin_ia32_pabsd128", IX86_BUILTIN_PABSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI }, + { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv2si2, "__builtin_ia32_pabsd", IX86_BUILTIN_PABSD, UNKNOWN, (int) V2SI_FTYPE_V2SI }, + + { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv8hi3, "__builtin_ia32_phaddw128", IX86_BUILTIN_PHADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI }, + { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv4hi3, "__builtin_ia32_phaddw", IX86_BUILTIN_PHADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI }, + { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv4si3, "__builtin_ia32_phaddd128", IX86_BUILTIN_PHADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI }, + { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv2si3, 
"__builtin_ia32_phaddd", IX86_BUILTIN_PHADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI }, + { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv8hi3, "__builtin_ia32_phaddsw128", IX86_BUILTIN_PHADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI }, + { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv4hi3, "__builtin_ia32_phaddsw", IX86_BUILTIN_PHADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI }, + { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv8hi3, "__builtin_ia32_phsubw128", IX86_BUILTIN_PHSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI }, + { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv4hi3, "__builtin_ia32_phsubw", IX86_BUILTIN_PHSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI }, + { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv4si3, "__builtin_ia32_phsubd128", IX86_BUILTIN_PHSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI }, + { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv2si3, "__builtin_ia32_phsubd", IX86_BUILTIN_PHSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI }, + { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv8hi3, "__builtin_ia32_phsubsw128", IX86_BUILTIN_PHSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI }, + { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv4hi3, "__builtin_ia32_phsubsw", IX86_BUILTIN_PHSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI }, + { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw128, "__builtin_ia32_pmaddubsw128", IX86_BUILTIN_PMADDUBSW128, UNKNOWN, (int) V8HI_FTYPE_V16QI_V16QI }, + { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw, "__builtin_ia32_pmaddubsw", IX86_BUILTIN_PMADDUBSW, UNKNOWN, (int) V4HI_FTYPE_V8QI_V8QI }, + { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv8hi3, "__builtin_ia32_pmulhrsw128", IX86_BUILTIN_PMULHRSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI }, + { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv4hi3, "__builtin_ia32_pmulhrsw", IX86_BUILTIN_PMULHRSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI }, + { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv16qi3, "__builtin_ia32_pshufb128", IX86_BUILTIN_PSHUFB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI }, + { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv8qi3, "__builtin_ia32_pshufb", IX86_BUILTIN_PSHUFB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI }, + { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv16qi3, "__builtin_ia32_psignb128", IX86_BUILTIN_PSIGNB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI }, + { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8qi3, "__builtin_ia32_psignb", IX86_BUILTIN_PSIGNB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI }, + { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8hi3, "__builtin_ia32_psignw128", IX86_BUILTIN_PSIGNW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI }, + { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4hi3, "__builtin_ia32_psignw", IX86_BUILTIN_PSIGNW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI }, + { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4si3, "__builtin_ia32_psignd128", IX86_BUILTIN_PSIGND128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI }, + { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv2si3, "__builtin_ia32_psignd", IX86_BUILTIN_PSIGND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI }, + + /* SSSE3. 
*/ + { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrti, "__builtin_ia32_palignr128", IX86_BUILTIN_PALIGNR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT_CONVERT }, + { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrdi, "__builtin_ia32_palignr", IX86_BUILTIN_PALIGNR, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_INT_CONVERT }, + + /* SSE4.1 */ + { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendpd, "__builtin_ia32_blendpd", IX86_BUILTIN_BLENDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT }, + { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendps, "__builtin_ia32_blendps", IX86_BUILTIN_BLENDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT }, + { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvpd, "__builtin_ia32_blendvpd", IX86_BUILTIN_BLENDVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_V2DF }, + { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvps, "__builtin_ia32_blendvps", IX86_BUILTIN_BLENDVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_V4SF }, + { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dppd, "__builtin_ia32_dppd", IX86_BUILTIN_DPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT }, + { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dpps, "__builtin_ia32_dpps", IX86_BUILTIN_DPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT }, + { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_insertps, "__builtin_ia32_insertps128", IX86_BUILTIN_INSERTPS128, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT }, + { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mpsadbw, "__builtin_ia32_mpsadbw128", IX86_BUILTIN_MPSADBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_INT }, + { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendvb, "__builtin_ia32_pblendvb128", IX86_BUILTIN_PBLENDVB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_V16QI }, + { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendw, "__builtin_ia32_pblendw128", IX86_BUILTIN_PBLENDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_INT }, + + { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv8qiv8hi2, "__builtin_ia32_pmovsxbw128", IX86_BUILTIN_PMOVSXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI }, + { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4qiv4si2, "__builtin_ia32_pmovsxbd128", IX86_BUILTIN_PMOVSXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI }, + { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2qiv2di2, "__builtin_ia32_pmovsxbq128", IX86_BUILTIN_PMOVSXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI }, + { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4hiv4si2, "__builtin_ia32_pmovsxwd128", IX86_BUILTIN_PMOVSXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI }, + { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2hiv2di2, "__builtin_ia32_pmovsxwq128", IX86_BUILTIN_PMOVSXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI }, + { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2siv2di2, "__builtin_ia32_pmovsxdq128", IX86_BUILTIN_PMOVSXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI }, + { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv8qiv8hi2, "__builtin_ia32_pmovzxbw128", IX86_BUILTIN_PMOVZXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI }, + { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4qiv4si2, "__builtin_ia32_pmovzxbd128", IX86_BUILTIN_PMOVZXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI }, + { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2qiv2di2, "__builtin_ia32_pmovzxbq128", IX86_BUILTIN_PMOVZXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI }, + { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4hiv4si2, "__builtin_ia32_pmovzxwd128", IX86_BUILTIN_PMOVZXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI }, + { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2hiv2di2, "__builtin_ia32_pmovzxwq128", IX86_BUILTIN_PMOVZXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI }, + { OPTION_MASK_ISA_SSE4_1, 
CODE_FOR_sse4_1_zero_extendv2siv2di2, "__builtin_ia32_pmovzxdq128", IX86_BUILTIN_PMOVZXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI }, + { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_phminposuw, "__builtin_ia32_phminposuw128", IX86_BUILTIN_PHMINPOSUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI }, + + { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_packusdw, "__builtin_ia32_packusdw128", IX86_BUILTIN_PACKUSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI }, + { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_eqv2di3, "__builtin_ia32_pcmpeqq", IX86_BUILTIN_PCMPEQQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI }, + { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv16qi3, "__builtin_ia32_pmaxsb128", IX86_BUILTIN_PMAXSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI }, + { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv4si3, "__builtin_ia32_pmaxsd128", IX86_BUILTIN_PMAXSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI }, + { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv4si3, "__builtin_ia32_pmaxud128", IX86_BUILTIN_PMAXUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI }, + { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv8hi3, "__builtin_ia32_pmaxuw128", IX86_BUILTIN_PMAXUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI }, + { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv16qi3, "__builtin_ia32_pminsb128", IX86_BUILTIN_PMINSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI }, + { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv4si3, "__builtin_ia32_pminsd128", IX86_BUILTIN_PMINSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI }, + { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv4si3, "__builtin_ia32_pminud128", IX86_BUILTIN_PMINUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI }, + { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv8hi3, "__builtin_ia32_pminuw128", IX86_BUILTIN_PMINUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI }, + { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mulv2siv2di3, "__builtin_ia32_pmuldq128", IX86_BUILTIN_PMULDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI }, + { OPTION_MASK_ISA_SSE4_1, CODE_FOR_mulv4si3, "__builtin_ia32_pmulld128", IX86_BUILTIN_PMULLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI }, + + /* SSE4.1 */ + { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_roundpd", IX86_BUILTIN_ROUNDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT }, + { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_roundps", IX86_BUILTIN_ROUNDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT }, + { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundsd, "__builtin_ia32_roundsd", IX86_BUILTIN_ROUNDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT }, + { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundss, "__builtin_ia32_roundss", IX86_BUILTIN_ROUNDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT }, + + { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestz128", IX86_BUILTIN_PTESTZ, EQ, (int) INT_FTYPE_V2DI_V2DI_PTEST }, + { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestc128", IX86_BUILTIN_PTESTC, LTU, (int) INT_FTYPE_V2DI_V2DI_PTEST }, + { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestnzc128", IX86_BUILTIN_PTESTNZC, GTU, (int) INT_FTYPE_V2DI_V2DI_PTEST }, + + /* SSE4.2 */ + { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_gtv2di3, "__builtin_ia32_pcmpgtq", IX86_BUILTIN_PCMPGTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI }, + { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32qi, "__builtin_ia32_crc32qi", IX86_BUILTIN_CRC32QI, UNKNOWN, (int) UINT_FTYPE_UINT_UCHAR }, + { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32hi, "__builtin_ia32_crc32hi", IX86_BUILTIN_CRC32HI, UNKNOWN, (int) UINT_FTYPE_UINT_USHORT }, + { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32si, 
"__builtin_ia32_crc32si", IX86_BUILTIN_CRC32SI, UNKNOWN, (int) UINT_FTYPE_UINT_UINT }, + { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse4_2_crc32di, "__builtin_ia32_crc32di", IX86_BUILTIN_CRC32DI, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 }, + + /* SSE4A */ + { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrqi, "__builtin_ia32_extrqi", IX86_BUILTIN_EXTRQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_UINT_UINT }, + { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrq, "__builtin_ia32_extrq", IX86_BUILTIN_EXTRQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V16QI }, + { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertqi, "__builtin_ia32_insertqi", IX86_BUILTIN_INSERTQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_UINT_UINT }, + { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertq, "__builtin_ia32_insertq", IX86_BUILTIN_INSERTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI }, + + /* AES */ + { OPTION_MASK_ISA_SSE2, CODE_FOR_aeskeygenassist, 0, IX86_BUILTIN_AESKEYGENASSIST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_aesimc, 0, IX86_BUILTIN_AESIMC128, UNKNOWN, (int) V2DI_FTYPE_V2DI }, + + { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenc, 0, IX86_BUILTIN_AESENC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenclast, 0, IX86_BUILTIN_AESENCLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdec, 0, IX86_BUILTIN_AESDEC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdeclast, 0, IX86_BUILTIN_AESDECLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI }, + + /* PCLMUL */ + { OPTION_MASK_ISA_SSE2, CODE_FOR_pclmulqdq, 0, IX86_BUILTIN_PCLMULQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT }, + + /* AVX */ + { OPTION_MASK_ISA_AVX, CODE_FOR_addv4df3, "__builtin_ia32_addpd256", IX86_BUILTIN_ADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_addv8sf3, "__builtin_ia32_addps256", IX86_BUILTIN_ADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv4df3, "__builtin_ia32_addsubpd256", IX86_BUILTIN_ADDSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv8sf3, "__builtin_ia32_addsubps256", IX86_BUILTIN_ADDSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_andv4df3, "__builtin_ia32_andpd256", IX86_BUILTIN_ANDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_andv8sf3, "__builtin_ia32_andps256", IX86_BUILTIN_ANDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv4df3, "__builtin_ia32_andnpd256", IX86_BUILTIN_ANDNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv8sf3, "__builtin_ia32_andnps256", IX86_BUILTIN_ANDNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_divv4df3, "__builtin_ia32_divpd256", IX86_BUILTIN_DIVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_divv8sf3, "__builtin_ia32_divps256", IX86_BUILTIN_DIVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv4df3, "__builtin_ia32_haddpd256", IX86_BUILTIN_HADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv8sf3, "__builtin_ia32_hsubps256", IX86_BUILTIN_HSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv4df3, "__builtin_ia32_hsubpd256", IX86_BUILTIN_HSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv8sf3, 
"__builtin_ia32_haddps256", IX86_BUILTIN_HADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv4df3, "__builtin_ia32_maxpd256", IX86_BUILTIN_MAXPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv8sf3, "__builtin_ia32_maxps256", IX86_BUILTIN_MAXPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_sminv4df3, "__builtin_ia32_minpd256", IX86_BUILTIN_MINPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_sminv8sf3, "__builtin_ia32_minps256", IX86_BUILTIN_MINPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_mulv4df3, "__builtin_ia32_mulpd256", IX86_BUILTIN_MULPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_mulv8sf3, "__builtin_ia32_mulps256", IX86_BUILTIN_MULPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_iorv4df3, "__builtin_ia32_orpd256", IX86_BUILTIN_ORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_iorv8sf3, "__builtin_ia32_orps256", IX86_BUILTIN_ORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_subv4df3, "__builtin_ia32_subpd256", IX86_BUILTIN_SUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_subv8sf3, "__builtin_ia32_subps256", IX86_BUILTIN_SUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_xorv4df3, "__builtin_ia32_xorpd256", IX86_BUILTIN_XORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_xorv8sf3, "__builtin_ia32_xorps256", IX86_BUILTIN_XORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF }, + + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv2df3, "__builtin_ia32_vpermilvarpd", IX86_BUILTIN_VPERMILVARPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DI }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4sf3, "__builtin_ia32_vpermilvarps", IX86_BUILTIN_VPERMILVARPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SI }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4df3, "__builtin_ia32_vpermilvarpd256", IX86_BUILTIN_VPERMILVARPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DI }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv8sf3, "__builtin_ia32_vpermilvarps256", IX86_BUILTIN_VPERMILVARPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SI }, + + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendpd256, "__builtin_ia32_blendpd256", IX86_BUILTIN_BLENDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendps256, "__builtin_ia32_blendps256", IX86_BUILTIN_BLENDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvpd256, "__builtin_ia32_blendvpd256", IX86_BUILTIN_BLENDVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_V4DF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvps256, "__builtin_ia32_blendvps256", IX86_BUILTIN_BLENDVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_V8SF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_dpps256, "__builtin_ia32_dpps256", IX86_BUILTIN_DPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufpd256, "__builtin_ia32_shufpd256", IX86_BUILTIN_SHUFPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufps256, "__builtin_ia32_shufps256", IX86_BUILTIN_SHUFPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpsdv2df3, "__builtin_ia32_cmpsd", IX86_BUILTIN_CMPSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpssv4sf3, "__builtin_ia32_cmpss", 
IX86_BUILTIN_CMPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmppdv2df3, "__builtin_ia32_cmppd", IX86_BUILTIN_CMPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmppsv4sf3, "__builtin_ia32_cmpps", IX86_BUILTIN_CMPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmppdv4df3, "__builtin_ia32_cmppd256", IX86_BUILTIN_CMPPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmppsv8sf3, "__builtin_ia32_cmpps256", IX86_BUILTIN_CMPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v4df, "__builtin_ia32_vextractf128_pd256", IX86_BUILTIN_EXTRACTF128PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF_INT }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8sf, "__builtin_ia32_vextractf128_ps256", IX86_BUILTIN_EXTRACTF128PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF_INT }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8si, "__builtin_ia32_vextractf128_si256", IX86_BUILTIN_EXTRACTF128SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI_INT }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtdq2pd256, "__builtin_ia32_cvtdq2pd256", IX86_BUILTIN_CVTDQ2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SI }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtdq2ps256, "__builtin_ia32_cvtdq2ps256", IX86_BUILTIN_CVTDQ2PS256, UNKNOWN, (int) V8SF_FTYPE_V8SI }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2ps256, "__builtin_ia32_cvtpd2ps256", IX86_BUILTIN_CVTPD2PS256, UNKNOWN, (int) V4SF_FTYPE_V4DF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2dq256, "__builtin_ia32_cvtps2dq256", IX86_BUILTIN_CVTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2pd256, "__builtin_ia32_cvtps2pd256", IX86_BUILTIN_CVTPS2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvttpd2dq256, "__builtin_ia32_cvttpd2dq256", IX86_BUILTIN_CVTTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2dq256, "__builtin_ia32_cvtpd2dq256", IX86_BUILTIN_CVTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvttps2dq256, "__builtin_ia32_cvttps2dq256", IX86_BUILTIN_CVTTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v4df3, "__builtin_ia32_vperm2f128_pd256", IX86_BUILTIN_VPERM2F128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8sf3, "__builtin_ia32_vperm2f128_ps256", IX86_BUILTIN_VPERM2F128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8si3, "__builtin_ia32_vperm2f128_si256", IX86_BUILTIN_VPERM2F128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv2df, "__builtin_ia32_vpermilpd", IX86_BUILTIN_VPERMILPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4sf, "__builtin_ia32_vpermilps", IX86_BUILTIN_VPERMILPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4df, "__builtin_ia32_vpermilpd256", IX86_BUILTIN_VPERMILPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv8sf, "__builtin_ia32_vpermilps256", IX86_BUILTIN_VPERMILPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v4df, "__builtin_ia32_vinsertf128_pd256", IX86_BUILTIN_VINSERTF128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V2DF_INT }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8sf, 
"__builtin_ia32_vinsertf128_ps256", IX86_BUILTIN_VINSERTF128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V4SF_INT }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8si, "__builtin_ia32_vinsertf128_si256", IX86_BUILTIN_VINSERTF128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_INT }, + + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movshdup256, "__builtin_ia32_movshdup256", IX86_BUILTIN_MOVSHDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movsldup256, "__builtin_ia32_movsldup256", IX86_BUILTIN_MOVSLDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movddup256, "__builtin_ia32_movddup256", IX86_BUILTIN_MOVDDUP256, UNKNOWN, (int) V4DF_FTYPE_V4DF }, + + { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv4df2, "__builtin_ia32_sqrtpd256", IX86_BUILTIN_SQRTPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_sqrtv8sf2, "__builtin_ia32_sqrtps256", IX86_BUILTIN_SQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv8sf2, "__builtin_ia32_sqrtps_nr256", IX86_BUILTIN_SQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rsqrtv8sf2, "__builtin_ia32_rsqrtps256", IX86_BUILTIN_RSQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_rsqrtv8sf2, "__builtin_ia32_rsqrtps_nr256", IX86_BUILTIN_RSQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF }, + + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rcpv8sf2, "__builtin_ia32_rcpps256", IX86_BUILTIN_RCPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF }, + + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_roundpd256", IX86_BUILTIN_ROUNDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_roundps256", IX86_BUILTIN_ROUNDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT }, + + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhpd256, "__builtin_ia32_unpckhpd256", IX86_BUILTIN_UNPCKHPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklpd256, "__builtin_ia32_unpcklpd256", IX86_BUILTIN_UNPCKLPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhps256, "__builtin_ia32_unpckhps256", IX86_BUILTIN_UNPCKHPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklps256, "__builtin_ia32_unpcklps256", IX86_BUILTIN_UNPCKLPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF }, + + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_si256_si, "__builtin_ia32_si256_si", IX86_BUILTIN_SI256_SI, UNKNOWN, (int) V8SI_FTYPE_V4SI }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ps256_ps, "__builtin_ia32_ps256_ps", IX86_BUILTIN_PS256_PS, UNKNOWN, (int) V8SF_FTYPE_V4SF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_pd256_pd, "__builtin_ia32_pd256_pd", IX86_BUILTIN_PD256_PD, UNKNOWN, (int) V4DF_FTYPE_V2DF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8si, "__builtin_ia32_si_si256", IX86_BUILTIN_SI_SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI }, + { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8sf, "__builtin_ia32_ps_ps256", IX86_BUILTIN_PS_PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v4df, "__builtin_ia32_pd_pd256", IX86_BUILTIN_PD_PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF }, + + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestzpd", IX86_BUILTIN_VTESTZPD, EQ, (int) INT_FTYPE_V2DF_V2DF_PTEST }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestcpd", IX86_BUILTIN_VTESTCPD, LTU, (int) INT_FTYPE_V2DF_V2DF_PTEST }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestnzcpd", 
IX86_BUILTIN_VTESTNZCPD, GTU, (int) INT_FTYPE_V2DF_V2DF_PTEST }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestzps", IX86_BUILTIN_VTESTZPS, EQ, (int) INT_FTYPE_V4SF_V4SF_PTEST }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestcps", IX86_BUILTIN_VTESTCPS, LTU, (int) INT_FTYPE_V4SF_V4SF_PTEST }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestnzcps", IX86_BUILTIN_VTESTNZCPS, GTU, (int) INT_FTYPE_V4SF_V4SF_PTEST }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestzpd256", IX86_BUILTIN_VTESTZPD256, EQ, (int) INT_FTYPE_V4DF_V4DF_PTEST }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestcpd256", IX86_BUILTIN_VTESTCPD256, LTU, (int) INT_FTYPE_V4DF_V4DF_PTEST }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestnzcpd256", IX86_BUILTIN_VTESTNZCPD256, GTU, (int) INT_FTYPE_V4DF_V4DF_PTEST }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestzps256", IX86_BUILTIN_VTESTZPS256, EQ, (int) INT_FTYPE_V8SF_V8SF_PTEST }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestcps256", IX86_BUILTIN_VTESTCPS256, LTU, (int) INT_FTYPE_V8SF_V8SF_PTEST }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestnzcps256", IX86_BUILTIN_VTESTNZCPS256, GTU, (int) INT_FTYPE_V8SF_V8SF_PTEST }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestz256", IX86_BUILTIN_PTESTZ256, EQ, (int) INT_FTYPE_V4DI_V4DI_PTEST }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestc256", IX86_BUILTIN_PTESTC256, LTU, (int) INT_FTYPE_V4DI_V4DI_PTEST }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestnzc256", IX86_BUILTIN_PTESTNZC256, GTU, (int) INT_FTYPE_V4DI_V4DI_PTEST }, + + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskpd256, "__builtin_ia32_movmskpd256", IX86_BUILTIN_MOVMSKPD256, UNKNOWN, (int) INT_FTYPE_V4DF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskps256, "__builtin_ia32_movmskps256", IX86_BUILTIN_MOVMSKPS256, UNKNOWN, (int) INT_FTYPE_V8SF }, + + { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv8sf3, "__builtin_ia32_copysignps256", IX86_BUILTIN_CPYSGNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv4df3, "__builtin_ia32_copysignpd256", IX86_BUILTIN_CPYSGNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF }, + + { OPTION_MASK_ISA_ABM, CODE_FOR_clzhi2_abm, "__builtin_clzs", IX86_BUILTIN_CLZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 }, + + /* BMI */ + { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_si, "__builtin_ia32_bextr_u32", IX86_BUILTIN_BEXTR32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT }, + { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_di, "__builtin_ia32_bextr_u64", IX86_BUILTIN_BEXTR64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 }, + { OPTION_MASK_ISA_BMI, CODE_FOR_ctzhi2, "__builtin_ctzs", IX86_BUILTIN_CTZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 }, + + /* TBM */ + { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_si, "__builtin_ia32_bextri_u32", IX86_BUILTIN_BEXTRI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT }, + { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_di, "__builtin_ia32_bextri_u64", IX86_BUILTIN_BEXTRI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 }, + + /* F16C */ + { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps, "__builtin_ia32_vcvtph2ps", IX86_BUILTIN_CVTPH2PS, UNKNOWN, (int) V4SF_FTYPE_V8HI }, + { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps256, "__builtin_ia32_vcvtph2ps256", IX86_BUILTIN_CVTPH2PS256, UNKNOWN, (int) V8SF_FTYPE_V8HI }, + { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph, "__builtin_ia32_vcvtps2ph", 
IX86_BUILTIN_CVTPS2PH, UNKNOWN, (int) V8HI_FTYPE_V4SF_INT }, + { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph256, "__builtin_ia32_vcvtps2ph256", IX86_BUILTIN_CVTPS2PH256, UNKNOWN, (int) V8HI_FTYPE_V8SF_INT }, +}; + +/* FMA4 and XOP. */ +#define MULTI_ARG_4_DF2_DI_I V2DF_FTYPE_V2DF_V2DF_V2DI_INT +#define MULTI_ARG_4_DF2_DI_I1 V4DF_FTYPE_V4DF_V4DF_V4DI_INT +#define MULTI_ARG_4_SF2_SI_I V4SF_FTYPE_V4SF_V4SF_V4SI_INT +#define MULTI_ARG_4_SF2_SI_I1 V8SF_FTYPE_V8SF_V8SF_V8SI_INT +#define MULTI_ARG_3_SF V4SF_FTYPE_V4SF_V4SF_V4SF +#define MULTI_ARG_3_DF V2DF_FTYPE_V2DF_V2DF_V2DF +#define MULTI_ARG_3_SF2 V8SF_FTYPE_V8SF_V8SF_V8SF +#define MULTI_ARG_3_DF2 V4DF_FTYPE_V4DF_V4DF_V4DF +#define MULTI_ARG_3_DI V2DI_FTYPE_V2DI_V2DI_V2DI +#define MULTI_ARG_3_SI V4SI_FTYPE_V4SI_V4SI_V4SI +#define MULTI_ARG_3_SI_DI V4SI_FTYPE_V4SI_V4SI_V2DI +#define MULTI_ARG_3_HI V8HI_FTYPE_V8HI_V8HI_V8HI +#define MULTI_ARG_3_HI_SI V8HI_FTYPE_V8HI_V8HI_V4SI +#define MULTI_ARG_3_QI V16QI_FTYPE_V16QI_V16QI_V16QI +#define MULTI_ARG_3_DI2 V4DI_FTYPE_V4DI_V4DI_V4DI +#define MULTI_ARG_3_SI2 V8SI_FTYPE_V8SI_V8SI_V8SI +#define MULTI_ARG_3_HI2 V16HI_FTYPE_V16HI_V16HI_V16HI +#define MULTI_ARG_3_QI2 V32QI_FTYPE_V32QI_V32QI_V32QI +#define MULTI_ARG_2_SF V4SF_FTYPE_V4SF_V4SF +#define MULTI_ARG_2_DF V2DF_FTYPE_V2DF_V2DF +#define MULTI_ARG_2_DI V2DI_FTYPE_V2DI_V2DI +#define MULTI_ARG_2_SI V4SI_FTYPE_V4SI_V4SI +#define MULTI_ARG_2_HI V8HI_FTYPE_V8HI_V8HI +#define MULTI_ARG_2_QI V16QI_FTYPE_V16QI_V16QI +#define MULTI_ARG_2_DI_IMM V2DI_FTYPE_V2DI_SI +#define MULTI_ARG_2_SI_IMM V4SI_FTYPE_V4SI_SI +#define MULTI_ARG_2_HI_IMM V8HI_FTYPE_V8HI_SI +#define MULTI_ARG_2_QI_IMM V16QI_FTYPE_V16QI_SI +#define MULTI_ARG_2_DI_CMP V2DI_FTYPE_V2DI_V2DI_CMP +#define MULTI_ARG_2_SI_CMP V4SI_FTYPE_V4SI_V4SI_CMP +#define MULTI_ARG_2_HI_CMP V8HI_FTYPE_V8HI_V8HI_CMP +#define MULTI_ARG_2_QI_CMP V16QI_FTYPE_V16QI_V16QI_CMP +#define MULTI_ARG_2_SF_TF V4SF_FTYPE_V4SF_V4SF_TF +#define MULTI_ARG_2_DF_TF V2DF_FTYPE_V2DF_V2DF_TF +#define MULTI_ARG_2_DI_TF V2DI_FTYPE_V2DI_V2DI_TF +#define MULTI_ARG_2_SI_TF V4SI_FTYPE_V4SI_V4SI_TF +#define MULTI_ARG_2_HI_TF V8HI_FTYPE_V8HI_V8HI_TF +#define MULTI_ARG_2_QI_TF V16QI_FTYPE_V16QI_V16QI_TF +#define MULTI_ARG_1_SF V4SF_FTYPE_V4SF +#define MULTI_ARG_1_DF V2DF_FTYPE_V2DF +#define MULTI_ARG_1_SF2 V8SF_FTYPE_V8SF +#define MULTI_ARG_1_DF2 V4DF_FTYPE_V4DF +#define MULTI_ARG_1_DI V2DI_FTYPE_V2DI +#define MULTI_ARG_1_SI V4SI_FTYPE_V4SI +#define MULTI_ARG_1_HI V8HI_FTYPE_V8HI +#define MULTI_ARG_1_QI V16QI_FTYPE_V16QI +#define MULTI_ARG_1_SI_DI V2DI_FTYPE_V4SI +#define MULTI_ARG_1_HI_DI V2DI_FTYPE_V8HI +#define MULTI_ARG_1_HI_SI V4SI_FTYPE_V8HI +#define MULTI_ARG_1_QI_DI V2DI_FTYPE_V16QI +#define MULTI_ARG_1_QI_SI V4SI_FTYPE_V16QI +#define MULTI_ARG_1_QI_HI V8HI_FTYPE_V16QI + +static const struct builtin_description bdesc_multi_arg[] = +{ + { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v4sf, + "__builtin_ia32_vfmaddss", IX86_BUILTIN_VFMADDSS, + UNKNOWN, (int)MULTI_ARG_3_SF }, + { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v2df, + "__builtin_ia32_vfmaddsd", IX86_BUILTIN_VFMADDSD, + UNKNOWN, (int)MULTI_ARG_3_DF }, + + { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4sf, + "__builtin_ia32_vfmaddps", IX86_BUILTIN_VFMADDPS, + UNKNOWN, (int)MULTI_ARG_3_SF }, + { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v2df, + "__builtin_ia32_vfmaddpd", IX86_BUILTIN_VFMADDPD, + UNKNOWN, (int)MULTI_ARG_3_DF }, + { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v8sf, + "__builtin_ia32_vfmaddps256", 
IX86_BUILTIN_VFMADDPS256, + UNKNOWN, (int)MULTI_ARG_3_SF2 }, + { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4df, + "__builtin_ia32_vfmaddpd256", IX86_BUILTIN_VFMADDPD256, + UNKNOWN, (int)MULTI_ARG_3_DF2 }, + + { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4sf, + "__builtin_ia32_vfmaddsubps", IX86_BUILTIN_VFMADDSUBPS, + UNKNOWN, (int)MULTI_ARG_3_SF }, + { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v2df, + "__builtin_ia32_vfmaddsubpd", IX86_BUILTIN_VFMADDSUBPD, + UNKNOWN, (int)MULTI_ARG_3_DF }, + { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v8sf, + "__builtin_ia32_vfmaddsubps256", IX86_BUILTIN_VFMADDSUBPS256, + UNKNOWN, (int)MULTI_ARG_3_SF2 }, + { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4df, + "__builtin_ia32_vfmaddsubpd256", IX86_BUILTIN_VFMADDSUBPD256, + UNKNOWN, (int)MULTI_ARG_3_DF2 }, + + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di, "__builtin_ia32_vpcmov", IX86_BUILTIN_VPCMOV, UNKNOWN, (int)MULTI_ARG_3_DI }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di, "__builtin_ia32_vpcmov_v2di", IX86_BUILTIN_VPCMOV_V2DI, UNKNOWN, (int)MULTI_ARG_3_DI }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4si, "__builtin_ia32_vpcmov_v4si", IX86_BUILTIN_VPCMOV_V4SI, UNKNOWN, (int)MULTI_ARG_3_SI }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8hi, "__builtin_ia32_vpcmov_v8hi", IX86_BUILTIN_VPCMOV_V8HI, UNKNOWN, (int)MULTI_ARG_3_HI }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16qi, "__builtin_ia32_vpcmov_v16qi",IX86_BUILTIN_VPCMOV_V16QI,UNKNOWN, (int)MULTI_ARG_3_QI }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2df, "__builtin_ia32_vpcmov_v2df", IX86_BUILTIN_VPCMOV_V2DF, UNKNOWN, (int)MULTI_ARG_3_DF }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4sf, "__builtin_ia32_vpcmov_v4sf", IX86_BUILTIN_VPCMOV_V4SF, UNKNOWN, (int)MULTI_ARG_3_SF }, + + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256, "__builtin_ia32_vpcmov256", IX86_BUILTIN_VPCMOV256, UNKNOWN, (int)MULTI_ARG_3_DI2 }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256, "__builtin_ia32_vpcmov_v4di256", IX86_BUILTIN_VPCMOV_V4DI256, UNKNOWN, (int)MULTI_ARG_3_DI2 }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8si256, "__builtin_ia32_vpcmov_v8si256", IX86_BUILTIN_VPCMOV_V8SI256, UNKNOWN, (int)MULTI_ARG_3_SI2 }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16hi256, "__builtin_ia32_vpcmov_v16hi256", IX86_BUILTIN_VPCMOV_V16HI256, UNKNOWN, (int)MULTI_ARG_3_HI2 }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v32qi256, "__builtin_ia32_vpcmov_v32qi256", IX86_BUILTIN_VPCMOV_V32QI256, UNKNOWN, (int)MULTI_ARG_3_QI2 }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4df256, "__builtin_ia32_vpcmov_v4df256", IX86_BUILTIN_VPCMOV_V4DF256, UNKNOWN, (int)MULTI_ARG_3_DF2 }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8sf256, "__builtin_ia32_vpcmov_v8sf256", IX86_BUILTIN_VPCMOV_V8SF256, UNKNOWN, (int)MULTI_ARG_3_SF2 }, + + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pperm, "__builtin_ia32_vpperm", IX86_BUILTIN_VPPERM, UNKNOWN, (int)MULTI_ARG_3_QI }, + + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssww, "__builtin_ia32_vpmacssww", IX86_BUILTIN_VPMACSSWW, UNKNOWN, (int)MULTI_ARG_3_HI }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsww, "__builtin_ia32_vpmacsww", IX86_BUILTIN_VPMACSWW, UNKNOWN, (int)MULTI_ARG_3_HI }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsswd, "__builtin_ia32_vpmacsswd", IX86_BUILTIN_VPMACSSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacswd, "__builtin_ia32_vpmacswd", IX86_BUILTIN_VPMACSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI }, + { 
OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdd, "__builtin_ia32_vpmacssdd", IX86_BUILTIN_VPMACSSDD, UNKNOWN, (int)MULTI_ARG_3_SI }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdd, "__builtin_ia32_vpmacsdd", IX86_BUILTIN_VPMACSDD, UNKNOWN, (int)MULTI_ARG_3_SI }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdql, "__builtin_ia32_vpmacssdql", IX86_BUILTIN_VPMACSSDQL, UNKNOWN, (int)MULTI_ARG_3_SI_DI }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdqh, "__builtin_ia32_vpmacssdqh", IX86_BUILTIN_VPMACSSDQH, UNKNOWN, (int)MULTI_ARG_3_SI_DI }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdql, "__builtin_ia32_vpmacsdql", IX86_BUILTIN_VPMACSDQL, UNKNOWN, (int)MULTI_ARG_3_SI_DI }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdqh, "__builtin_ia32_vpmacsdqh", IX86_BUILTIN_VPMACSDQH, UNKNOWN, (int)MULTI_ARG_3_SI_DI }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcsswd, "__builtin_ia32_vpmadcsswd", IX86_BUILTIN_VPMADCSSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcswd, "__builtin_ia32_vpmadcswd", IX86_BUILTIN_VPMADCSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI }, + + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv2di3, "__builtin_ia32_vprotq", IX86_BUILTIN_VPROTQ, UNKNOWN, (int)MULTI_ARG_2_DI }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv4si3, "__builtin_ia32_vprotd", IX86_BUILTIN_VPROTD, UNKNOWN, (int)MULTI_ARG_2_SI }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv8hi3, "__builtin_ia32_vprotw", IX86_BUILTIN_VPROTW, UNKNOWN, (int)MULTI_ARG_2_HI }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv16qi3, "__builtin_ia32_vprotb", IX86_BUILTIN_VPROTB, UNKNOWN, (int)MULTI_ARG_2_QI }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv2di3, "__builtin_ia32_vprotqi", IX86_BUILTIN_VPROTQ_IMM, UNKNOWN, (int)MULTI_ARG_2_DI_IMM }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv4si3, "__builtin_ia32_vprotdi", IX86_BUILTIN_VPROTD_IMM, UNKNOWN, (int)MULTI_ARG_2_SI_IMM }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv8hi3, "__builtin_ia32_vprotwi", IX86_BUILTIN_VPROTW_IMM, UNKNOWN, (int)MULTI_ARG_2_HI_IMM }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv16qi3, "__builtin_ia32_vprotbi", IX86_BUILTIN_VPROTB_IMM, UNKNOWN, (int)MULTI_ARG_2_QI_IMM }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_ashlv2di3, "__builtin_ia32_vpshaq", IX86_BUILTIN_VPSHAQ, UNKNOWN, (int)MULTI_ARG_2_DI }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_ashlv4si3, "__builtin_ia32_vpshad", IX86_BUILTIN_VPSHAD, UNKNOWN, (int)MULTI_ARG_2_SI }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_ashlv8hi3, "__builtin_ia32_vpshaw", IX86_BUILTIN_VPSHAW, UNKNOWN, (int)MULTI_ARG_2_HI }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_ashlv16qi3, "__builtin_ia32_vpshab", IX86_BUILTIN_VPSHAB, UNKNOWN, (int)MULTI_ARG_2_QI }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_lshlv2di3, "__builtin_ia32_vpshlq", IX86_BUILTIN_VPSHLQ, UNKNOWN, (int)MULTI_ARG_2_DI }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_lshlv4si3, "__builtin_ia32_vpshld", IX86_BUILTIN_VPSHLD, UNKNOWN, (int)MULTI_ARG_2_SI }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_lshlv8hi3, "__builtin_ia32_vpshlw", IX86_BUILTIN_VPSHLW, UNKNOWN, (int)MULTI_ARG_2_HI }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_lshlv16qi3, "__builtin_ia32_vpshlb", IX86_BUILTIN_VPSHLB, UNKNOWN, (int)MULTI_ARG_2_QI }, + + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv4sf2, "__builtin_ia32_vfrczss", IX86_BUILTIN_VFRCZSS, UNKNOWN, (int)MULTI_ARG_2_SF }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv2df2, "__builtin_ia32_vfrczsd", IX86_BUILTIN_VFRCZSD, UNKNOWN, (int)MULTI_ARG_2_DF }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4sf2, "__builtin_ia32_vfrczps", IX86_BUILTIN_VFRCZPS, UNKNOWN, (int)MULTI_ARG_1_SF }, + { 
OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv2df2, "__builtin_ia32_vfrczpd", IX86_BUILTIN_VFRCZPD, UNKNOWN, (int)MULTI_ARG_1_DF }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv8sf2, "__builtin_ia32_vfrczps256", IX86_BUILTIN_VFRCZPS256, UNKNOWN, (int)MULTI_ARG_1_SF2 }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4df2, "__builtin_ia32_vfrczpd256", IX86_BUILTIN_VFRCZPD256, UNKNOWN, (int)MULTI_ARG_1_DF2 }, + + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbw, "__builtin_ia32_vphaddbw", IX86_BUILTIN_VPHADDBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbd, "__builtin_ia32_vphaddbd", IX86_BUILTIN_VPHADDBD, UNKNOWN, (int)MULTI_ARG_1_QI_SI }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbq, "__builtin_ia32_vphaddbq", IX86_BUILTIN_VPHADDBQ, UNKNOWN, (int)MULTI_ARG_1_QI_DI }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwd, "__builtin_ia32_vphaddwd", IX86_BUILTIN_VPHADDWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwq, "__builtin_ia32_vphaddwq", IX86_BUILTIN_VPHADDWQ, UNKNOWN, (int)MULTI_ARG_1_HI_DI }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadddq, "__builtin_ia32_vphadddq", IX86_BUILTIN_VPHADDDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubw, "__builtin_ia32_vphaddubw", IX86_BUILTIN_VPHADDUBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubd, "__builtin_ia32_vphaddubd", IX86_BUILTIN_VPHADDUBD, UNKNOWN, (int)MULTI_ARG_1_QI_SI }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubq, "__builtin_ia32_vphaddubq", IX86_BUILTIN_VPHADDUBQ, UNKNOWN, (int)MULTI_ARG_1_QI_DI }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwd, "__builtin_ia32_vphadduwd", IX86_BUILTIN_VPHADDUWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwq, "__builtin_ia32_vphadduwq", IX86_BUILTIN_VPHADDUWQ, UNKNOWN, (int)MULTI_ARG_1_HI_DI }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddudq, "__builtin_ia32_vphaddudq", IX86_BUILTIN_VPHADDUDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubbw, "__builtin_ia32_vphsubbw", IX86_BUILTIN_VPHSUBBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubwd, "__builtin_ia32_vphsubwd", IX86_BUILTIN_VPHSUBWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubdq, "__builtin_ia32_vphsubdq", IX86_BUILTIN_VPHSUBDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI }, + + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomeqb", IX86_BUILTIN_VPCOMEQB, EQ, (int)MULTI_ARG_2_QI_CMP }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomneb", IX86_BUILTIN_VPCOMNEB, NE, (int)MULTI_ARG_2_QI_CMP }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomneqb", IX86_BUILTIN_VPCOMNEB, NE, (int)MULTI_ARG_2_QI_CMP }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomltb", IX86_BUILTIN_VPCOMLTB, LT, (int)MULTI_ARG_2_QI_CMP }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomleb", IX86_BUILTIN_VPCOMLEB, LE, (int)MULTI_ARG_2_QI_CMP }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomgtb", IX86_BUILTIN_VPCOMGTB, GT, (int)MULTI_ARG_2_QI_CMP }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomgeb", IX86_BUILTIN_VPCOMGEB, GE, (int)MULTI_ARG_2_QI_CMP }, + + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomeqw", IX86_BUILTIN_VPCOMEQW, EQ, (int)MULTI_ARG_2_HI_CMP }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomnew", 
IX86_BUILTIN_VPCOMNEW, NE, (int)MULTI_ARG_2_HI_CMP }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomneqw", IX86_BUILTIN_VPCOMNEW, NE, (int)MULTI_ARG_2_HI_CMP }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomltw", IX86_BUILTIN_VPCOMLTW, LT, (int)MULTI_ARG_2_HI_CMP }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomlew", IX86_BUILTIN_VPCOMLEW, LE, (int)MULTI_ARG_2_HI_CMP }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomgtw", IX86_BUILTIN_VPCOMGTW, GT, (int)MULTI_ARG_2_HI_CMP }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomgew", IX86_BUILTIN_VPCOMGEW, GE, (int)MULTI_ARG_2_HI_CMP }, + + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomeqd", IX86_BUILTIN_VPCOMEQD, EQ, (int)MULTI_ARG_2_SI_CMP }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomned", IX86_BUILTIN_VPCOMNED, NE, (int)MULTI_ARG_2_SI_CMP }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomneqd", IX86_BUILTIN_VPCOMNED, NE, (int)MULTI_ARG_2_SI_CMP }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomltd", IX86_BUILTIN_VPCOMLTD, LT, (int)MULTI_ARG_2_SI_CMP }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomled", IX86_BUILTIN_VPCOMLED, LE, (int)MULTI_ARG_2_SI_CMP }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomgtd", IX86_BUILTIN_VPCOMGTD, GT, (int)MULTI_ARG_2_SI_CMP }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomged", IX86_BUILTIN_VPCOMGED, GE, (int)MULTI_ARG_2_SI_CMP }, + + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomeqq", IX86_BUILTIN_VPCOMEQQ, EQ, (int)MULTI_ARG_2_DI_CMP }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomneq", IX86_BUILTIN_VPCOMNEQ, NE, (int)MULTI_ARG_2_DI_CMP }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomneqq", IX86_BUILTIN_VPCOMNEQ, NE, (int)MULTI_ARG_2_DI_CMP }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomltq", IX86_BUILTIN_VPCOMLTQ, LT, (int)MULTI_ARG_2_DI_CMP }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomleq", IX86_BUILTIN_VPCOMLEQ, LE, (int)MULTI_ARG_2_DI_CMP }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomgtq", IX86_BUILTIN_VPCOMGTQ, GT, (int)MULTI_ARG_2_DI_CMP }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomgeq", IX86_BUILTIN_VPCOMGEQ, GE, (int)MULTI_ARG_2_DI_CMP }, + + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomequb", IX86_BUILTIN_VPCOMEQUB, EQ, (int)MULTI_ARG_2_QI_CMP }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomneub", IX86_BUILTIN_VPCOMNEUB, NE, (int)MULTI_ARG_2_QI_CMP }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomnequb", IX86_BUILTIN_VPCOMNEUB, NE, (int)MULTI_ARG_2_QI_CMP }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomltub", IX86_BUILTIN_VPCOMLTUB, LTU, (int)MULTI_ARG_2_QI_CMP }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomleub", IX86_BUILTIN_VPCOMLEUB, LEU, (int)MULTI_ARG_2_QI_CMP }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgtub", IX86_BUILTIN_VPCOMGTUB, GTU, (int)MULTI_ARG_2_QI_CMP }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgeub", IX86_BUILTIN_VPCOMGEUB, GEU, 
(int)MULTI_ARG_2_QI_CMP }, + + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomequw", IX86_BUILTIN_VPCOMEQUW, EQ, (int)MULTI_ARG_2_HI_CMP }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomneuw", IX86_BUILTIN_VPCOMNEUW, NE, (int)MULTI_ARG_2_HI_CMP }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomnequw", IX86_BUILTIN_VPCOMNEUW, NE, (int)MULTI_ARG_2_HI_CMP }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomltuw", IX86_BUILTIN_VPCOMLTUW, LTU, (int)MULTI_ARG_2_HI_CMP }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomleuw", IX86_BUILTIN_VPCOMLEUW, LEU, (int)MULTI_ARG_2_HI_CMP }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomgtuw", IX86_BUILTIN_VPCOMGTUW, GTU, (int)MULTI_ARG_2_HI_CMP }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomgeuw", IX86_BUILTIN_VPCOMGEUW, GEU, (int)MULTI_ARG_2_HI_CMP }, + + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomequd", IX86_BUILTIN_VPCOMEQUD, EQ, (int)MULTI_ARG_2_SI_CMP }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomneud", IX86_BUILTIN_VPCOMNEUD, NE, (int)MULTI_ARG_2_SI_CMP }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomnequd", IX86_BUILTIN_VPCOMNEUD, NE, (int)MULTI_ARG_2_SI_CMP }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomltud", IX86_BUILTIN_VPCOMLTUD, LTU, (int)MULTI_ARG_2_SI_CMP }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomleud", IX86_BUILTIN_VPCOMLEUD, LEU, (int)MULTI_ARG_2_SI_CMP }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomgtud", IX86_BUILTIN_VPCOMGTUD, GTU, (int)MULTI_ARG_2_SI_CMP }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomgeud", IX86_BUILTIN_VPCOMGEUD, GEU, (int)MULTI_ARG_2_SI_CMP }, + + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomequq", IX86_BUILTIN_VPCOMEQUQ, EQ, (int)MULTI_ARG_2_DI_CMP }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomneuq", IX86_BUILTIN_VPCOMNEUQ, NE, (int)MULTI_ARG_2_DI_CMP }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomnequq", IX86_BUILTIN_VPCOMNEUQ, NE, (int)MULTI_ARG_2_DI_CMP }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomltuq", IX86_BUILTIN_VPCOMLTUQ, LTU, (int)MULTI_ARG_2_DI_CMP }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomleuq", IX86_BUILTIN_VPCOMLEUQ, LEU, (int)MULTI_ARG_2_DI_CMP }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomgtuq", IX86_BUILTIN_VPCOMGTUQ, GTU, (int)MULTI_ARG_2_DI_CMP }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomgeuq", IX86_BUILTIN_VPCOMGEUQ, GEU, (int)MULTI_ARG_2_DI_CMP }, + + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomfalseb", IX86_BUILTIN_VPCOMFALSEB, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_QI_TF }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomfalsew", IX86_BUILTIN_VPCOMFALSEW, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_HI_TF }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomfalsed", IX86_BUILTIN_VPCOMFALSED, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_SI_TF }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomfalseq", 
IX86_BUILTIN_VPCOMFALSEQ, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_DI_TF }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomfalseub",IX86_BUILTIN_VPCOMFALSEUB,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_QI_TF }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomfalseuw",IX86_BUILTIN_VPCOMFALSEUW,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_HI_TF }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomfalseud",IX86_BUILTIN_VPCOMFALSEUD,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_SI_TF }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomfalseuq",IX86_BUILTIN_VPCOMFALSEUQ,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_DI_TF }, + + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomtrueb", IX86_BUILTIN_VPCOMTRUEB, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_QI_TF }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomtruew", IX86_BUILTIN_VPCOMTRUEW, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_HI_TF }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrued", IX86_BUILTIN_VPCOMTRUED, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueq", IX86_BUILTIN_VPCOMTRUEQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomtrueub", IX86_BUILTIN_VPCOMTRUEUB, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_QI_TF }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomtrueuw", IX86_BUILTIN_VPCOMTRUEUW, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_HI_TF }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrueud", IX86_BUILTIN_VPCOMTRUEUD, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueuq", IX86_BUILTIN_VPCOMTRUEUQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF }, + + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v2df3, "__builtin_ia32_vpermil2pd", IX86_BUILTIN_VPERMIL2PD, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4sf3, "__builtin_ia32_vpermil2ps", IX86_BUILTIN_VPERMIL2PS, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4df3, "__builtin_ia32_vpermil2pd256", IX86_BUILTIN_VPERMIL2PD256, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I1 }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v8sf3, "__builtin_ia32_vpermil2ps256", IX86_BUILTIN_VPERMIL2PS256, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I1 }, + +}; + +/* Set up all the MMX/SSE builtins, even builtins for instructions that are not + in the current target ISA to allow the user to compile particular modules + with different target specific options that differ from the command line + options. */ +static void +ix86_init_mmx_sse_builtins (void) +{ + const struct builtin_description * d; + enum ix86_builtin_func_type ftype; + size_t i; + + /* Add all special builtins with variable number of operands. */ + for (i = 0, d = bdesc_special_args; + i < ARRAY_SIZE (bdesc_special_args); + i++, d++) + { + if (d->name == 0) + continue; + + ftype = (enum ix86_builtin_func_type) d->flag; + def_builtin (d->mask, d->name, ftype, d->code); + } + + /* Add all builtins with variable number of operands. 
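   Each row of these tables just pairs a builtin name with an ISA mask,
   an insn pattern and a function-type enum, so the loop below boils down
   to one def_builtin_const call per row.  As an illustrative sketch (the
   row shown is only an example of the shape, not quoted from the table),
   an entry along the lines of

     { OPTION_MASK_ISA_SSE2, CODE_FOR_addv8hi3,
       "__builtin_ia32_paddw128", IX86_BUILTIN_PADDW128,
       UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI }

   is registered as

     def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_paddw128",
                        V8HI_FTYPE_V8HI_V8HI, IX86_BUILTIN_PADDW128);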
*/ + for (i = 0, d = bdesc_args; + i < ARRAY_SIZE (bdesc_args); + i++, d++) + { + if (d->name == 0) + continue; + + ftype = (enum ix86_builtin_func_type) d->flag; + def_builtin_const (d->mask, d->name, ftype, d->code); + } + + /* pcmpestr[im] insns. */ + for (i = 0, d = bdesc_pcmpestr; + i < ARRAY_SIZE (bdesc_pcmpestr); + i++, d++) + { + if (d->code == IX86_BUILTIN_PCMPESTRM128) + ftype = V16QI_FTYPE_V16QI_INT_V16QI_INT_INT; + else + ftype = INT_FTYPE_V16QI_INT_V16QI_INT_INT; + def_builtin_const (d->mask, d->name, ftype, d->code); + } + + /* pcmpistr[im] insns. */ + for (i = 0, d = bdesc_pcmpistr; + i < ARRAY_SIZE (bdesc_pcmpistr); + i++, d++) + { + if (d->code == IX86_BUILTIN_PCMPISTRM128) + ftype = V16QI_FTYPE_V16QI_V16QI_INT; + else + ftype = INT_FTYPE_V16QI_V16QI_INT; + def_builtin_const (d->mask, d->name, ftype, d->code); + } + + /* comi/ucomi insns. */ + for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++) + { + if (d->mask == OPTION_MASK_ISA_SSE2) + ftype = INT_FTYPE_V2DF_V2DF; + else + ftype = INT_FTYPE_V4SF_V4SF; + def_builtin_const (d->mask, d->name, ftype, d->code); + } + + /* SSE */ + def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_ldmxcsr", + VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR); + def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_stmxcsr", + UNSIGNED_FTYPE_VOID, IX86_BUILTIN_STMXCSR); + + /* SSE or 3DNow!A */ + def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, + "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR, + IX86_BUILTIN_MASKMOVQ); + + /* SSE2 */ + def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_maskmovdqu", + VOID_FTYPE_V16QI_V16QI_PCHAR, IX86_BUILTIN_MASKMOVDQU); + + def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_clflush", + VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSH); + x86_mfence = def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_mfence", + VOID_FTYPE_VOID, IX86_BUILTIN_MFENCE); + + /* SSE3. */ + def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_monitor", + VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITOR); + def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_mwait", + VOID_FTYPE_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAIT); + + /* AES */ + def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenc128", + V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENC128); + def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenclast128", + V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENCLAST128); + def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdec128", + V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDEC128); + def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdeclast128", + V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDECLAST128); + def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesimc128", + V2DI_FTYPE_V2DI, IX86_BUILTIN_AESIMC128); + def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aeskeygenassist128", + V2DI_FTYPE_V2DI_INT, IX86_BUILTIN_AESKEYGENASSIST128); + + /* PCLMUL */ + def_builtin_const (OPTION_MASK_ISA_PCLMUL, "__builtin_ia32_pclmulqdq128", + V2DI_FTYPE_V2DI_V2DI_INT, IX86_BUILTIN_PCLMULQDQ128); + + /* RDRND */ + def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand16_step", + INT_FTYPE_PUSHORT, IX86_BUILTIN_RDRAND16_STEP); + def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand32_step", + INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDRAND32_STEP); + def_builtin (OPTION_MASK_ISA_RDRND | OPTION_MASK_ISA_64BIT, + "__builtin_ia32_rdrand64_step", INT_FTYPE_PULONGLONG, + IX86_BUILTIN_RDRAND64_STEP); + + /* MMX access to the vec_init patterns. 
*/ + def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v2si", + V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI); + + def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v4hi", + V4HI_FTYPE_HI_HI_HI_HI, + IX86_BUILTIN_VEC_INIT_V4HI); + + def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v8qi", + V8QI_FTYPE_QI_QI_QI_QI_QI_QI_QI_QI, + IX86_BUILTIN_VEC_INIT_V8QI); + + /* Access to the vec_extract patterns. */ + def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2df", + DOUBLE_FTYPE_V2DF_INT, IX86_BUILTIN_VEC_EXT_V2DF); + def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2di", + DI_FTYPE_V2DI_INT, IX86_BUILTIN_VEC_EXT_V2DI); + def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_vec_ext_v4sf", + FLOAT_FTYPE_V4SF_INT, IX86_BUILTIN_VEC_EXT_V4SF); + def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v4si", + SI_FTYPE_V4SI_INT, IX86_BUILTIN_VEC_EXT_V4SI); + def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v8hi", + HI_FTYPE_V8HI_INT, IX86_BUILTIN_VEC_EXT_V8HI); + + def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, + "__builtin_ia32_vec_ext_v4hi", + HI_FTYPE_V4HI_INT, IX86_BUILTIN_VEC_EXT_V4HI); + + def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_ext_v2si", + SI_FTYPE_V2SI_INT, IX86_BUILTIN_VEC_EXT_V2SI); + + def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v16qi", + QI_FTYPE_V16QI_INT, IX86_BUILTIN_VEC_EXT_V16QI); + + /* Access to the vec_set patterns. */ + def_builtin_const (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_64BIT, + "__builtin_ia32_vec_set_v2di", + V2DI_FTYPE_V2DI_DI_INT, IX86_BUILTIN_VEC_SET_V2DI); + + def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4sf", + V4SF_FTYPE_V4SF_FLOAT_INT, IX86_BUILTIN_VEC_SET_V4SF); + + def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4si", + V4SI_FTYPE_V4SI_SI_INT, IX86_BUILTIN_VEC_SET_V4SI); + + def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_set_v8hi", + V8HI_FTYPE_V8HI_HI_INT, IX86_BUILTIN_VEC_SET_V8HI); + + def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, + "__builtin_ia32_vec_set_v4hi", + V4HI_FTYPE_V4HI_HI_INT, IX86_BUILTIN_VEC_SET_V4HI); + + def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v16qi", + V16QI_FTYPE_V16QI_QI_INT, IX86_BUILTIN_VEC_SET_V16QI); + + /* Add FMA4 multi-arg argument instructions */ + for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++) + { + if (d->name == 0) + continue; + + ftype = (enum ix86_builtin_func_type) d->flag; + def_builtin_const (d->mask, d->name, ftype, d->code); + } +} + +/* Internal method for ix86_init_builtins. 
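   This registers the 64-bit __builtin_ms_va_* and __builtin_sysv_va_*
   builtins with the matching calling-convention attributes.  A minimal
   usage sketch (illustrative only; assumes a 64-bit target and that
   __builtin_va_arg is used for the actual argument fetch):

     int __attribute__ ((ms_abi))
     first_vararg (int count, ...)
     {
       __builtin_ms_va_list ap;
       int v;
       __builtin_ms_va_start (ap, count);
       v = __builtin_va_arg (ap, int);
       __builtin_ms_va_end (ap);
       return v;
     }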
*/ + +static void +ix86_init_builtins_va_builtins_abi (void) +{ + tree ms_va_ref, sysv_va_ref; + tree fnvoid_va_end_ms, fnvoid_va_end_sysv; + tree fnvoid_va_start_ms, fnvoid_va_start_sysv; + tree fnvoid_va_copy_ms, fnvoid_va_copy_sysv; + tree fnattr_ms = NULL_TREE, fnattr_sysv = NULL_TREE; + + if (!TARGET_64BIT) + return; + fnattr_ms = build_tree_list (get_identifier ("ms_abi"), NULL_TREE); + fnattr_sysv = build_tree_list (get_identifier ("sysv_abi"), NULL_TREE); + ms_va_ref = build_reference_type (ms_va_list_type_node); + sysv_va_ref = + build_pointer_type (TREE_TYPE (sysv_va_list_type_node)); + + fnvoid_va_end_ms = + build_function_type_list (void_type_node, ms_va_ref, NULL_TREE); + fnvoid_va_start_ms = + build_varargs_function_type_list (void_type_node, ms_va_ref, NULL_TREE); + fnvoid_va_end_sysv = + build_function_type_list (void_type_node, sysv_va_ref, NULL_TREE); + fnvoid_va_start_sysv = + build_varargs_function_type_list (void_type_node, sysv_va_ref, + NULL_TREE); + fnvoid_va_copy_ms = + build_function_type_list (void_type_node, ms_va_ref, ms_va_list_type_node, + NULL_TREE); + fnvoid_va_copy_sysv = + build_function_type_list (void_type_node, sysv_va_ref, + sysv_va_ref, NULL_TREE); + + add_builtin_function ("__builtin_ms_va_start", fnvoid_va_start_ms, + BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_ms); + add_builtin_function ("__builtin_ms_va_end", fnvoid_va_end_ms, + BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_ms); + add_builtin_function ("__builtin_ms_va_copy", fnvoid_va_copy_ms, + BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_ms); + add_builtin_function ("__builtin_sysv_va_start", fnvoid_va_start_sysv, + BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_sysv); + add_builtin_function ("__builtin_sysv_va_end", fnvoid_va_end_sysv, + BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_sysv); + add_builtin_function ("__builtin_sysv_va_copy", fnvoid_va_copy_sysv, + BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_sysv); +} + +static void +ix86_init_builtin_types (void) +{ + tree float128_type_node, float80_type_node; + + /* The __float80 type. */ + float80_type_node = long_double_type_node; + if (TYPE_MODE (float80_type_node) != XFmode) + { + /* The __float80 type. */ + float80_type_node = make_node (REAL_TYPE); + + TYPE_PRECISION (float80_type_node) = 80; + layout_type (float80_type_node); + } + lang_hooks.types.register_builtin_type (float80_type_node, "__float80"); + + /* The __float128 type. */ + float128_type_node = make_node (REAL_TYPE); + TYPE_PRECISION (float128_type_node) = 128; + layout_type (float128_type_node); + lang_hooks.types.register_builtin_type (float128_type_node, "__float128"); + + /* This macro is built by i386-builtin-types.awk. */ + DEFINE_BUILTIN_PRIMITIVE_TYPES; +} + +static void +ix86_init_builtins (void) +{ + tree t; + + ix86_init_builtin_types (); + + /* TFmode support builtins. */ + def_builtin_const (0, "__builtin_infq", + FLOAT128_FTYPE_VOID, IX86_BUILTIN_INFQ); + def_builtin_const (0, "__builtin_huge_valq", + FLOAT128_FTYPE_VOID, IX86_BUILTIN_HUGE_VALQ); + + /* We will expand them to normal call if SSE2 isn't available since + they are used by libgcc. 
*/ + t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128); + t = add_builtin_function ("__builtin_fabsq", t, IX86_BUILTIN_FABSQ, + BUILT_IN_MD, "__fabstf2", NULL_TREE); + TREE_READONLY (t) = 1; + ix86_builtins[(int) IX86_BUILTIN_FABSQ] = t; + + t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128_FLOAT128); + t = add_builtin_function ("__builtin_copysignq", t, IX86_BUILTIN_COPYSIGNQ, + BUILT_IN_MD, "__copysigntf3", NULL_TREE); + TREE_READONLY (t) = 1; + ix86_builtins[(int) IX86_BUILTIN_COPYSIGNQ] = t; + + ix86_init_mmx_sse_builtins (); + + if (TARGET_64BIT) + ix86_init_builtins_va_builtins_abi (); + +#ifdef SUBTARGET_INIT_BUILTINS + SUBTARGET_INIT_BUILTINS; +#endif +} + +/* Return the ix86 builtin for CODE. */ + +static tree +ix86_builtin_decl (unsigned code, bool initialize_p ATTRIBUTE_UNUSED) +{ + if (code >= IX86_BUILTIN_MAX) + return error_mark_node; + + return ix86_builtins[code]; +} + +/* Errors in the source file can cause expand_expr to return const0_rtx + where we expect a vector. To avoid crashing, use one of the vector + clear instructions. */ +static rtx +safe_vector_operand (rtx x, enum machine_mode mode) +{ + if (x == const0_rtx) + x = CONST0_RTX (mode); + return x; +} + +/* Subroutine of ix86_expand_builtin to take care of binop insns. */ + +static rtx +ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target) +{ + rtx pat; + tree arg0 = CALL_EXPR_ARG (exp, 0); + tree arg1 = CALL_EXPR_ARG (exp, 1); + rtx op0 = expand_normal (arg0); + rtx op1 = expand_normal (arg1); + enum machine_mode tmode = insn_data[icode].operand[0].mode; + enum machine_mode mode0 = insn_data[icode].operand[1].mode; + enum machine_mode mode1 = insn_data[icode].operand[2].mode; + + if (VECTOR_MODE_P (mode0)) + op0 = safe_vector_operand (op0, mode0); + if (VECTOR_MODE_P (mode1)) + op1 = safe_vector_operand (op1, mode1); + + if (optimize || !target + || GET_MODE (target) != tmode + || !insn_data[icode].operand[0].predicate (target, tmode)) + target = gen_reg_rtx (tmode); + + if (GET_MODE (op1) == SImode && mode1 == TImode) + { + rtx x = gen_reg_rtx (V4SImode); + emit_insn (gen_sse2_loadd (x, op1)); + op1 = gen_lowpart (TImode, x); + } + + if (!insn_data[icode].operand[1].predicate (op0, mode0)) + op0 = copy_to_mode_reg (mode0, op0); + if (!insn_data[icode].operand[2].predicate (op1, mode1)) + op1 = copy_to_mode_reg (mode1, op1); + + pat = GEN_FCN (icode) (target, op0, op1); + if (! pat) + return 0; + + emit_insn (pat); + + return target; +} + +/* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. 
*/ + +static rtx +ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target, + enum ix86_builtin_func_type m_type, + enum rtx_code sub_code) +{ + rtx pat; + int i; + int nargs; + bool comparison_p = false; + bool tf_p = false; + bool last_arg_constant = false; + int num_memory = 0; + struct { + rtx op; + enum machine_mode mode; + } args[4]; + + enum machine_mode tmode = insn_data[icode].operand[0].mode; + + switch (m_type) + { + case MULTI_ARG_4_DF2_DI_I: + case MULTI_ARG_4_DF2_DI_I1: + case MULTI_ARG_4_SF2_SI_I: + case MULTI_ARG_4_SF2_SI_I1: + nargs = 4; + last_arg_constant = true; + break; + + case MULTI_ARG_3_SF: + case MULTI_ARG_3_DF: + case MULTI_ARG_3_SF2: + case MULTI_ARG_3_DF2: + case MULTI_ARG_3_DI: + case MULTI_ARG_3_SI: + case MULTI_ARG_3_SI_DI: + case MULTI_ARG_3_HI: + case MULTI_ARG_3_HI_SI: + case MULTI_ARG_3_QI: + case MULTI_ARG_3_DI2: + case MULTI_ARG_3_SI2: + case MULTI_ARG_3_HI2: + case MULTI_ARG_3_QI2: + nargs = 3; + break; + + case MULTI_ARG_2_SF: + case MULTI_ARG_2_DF: + case MULTI_ARG_2_DI: + case MULTI_ARG_2_SI: + case MULTI_ARG_2_HI: + case MULTI_ARG_2_QI: + nargs = 2; + break; + + case MULTI_ARG_2_DI_IMM: + case MULTI_ARG_2_SI_IMM: + case MULTI_ARG_2_HI_IMM: + case MULTI_ARG_2_QI_IMM: + nargs = 2; + last_arg_constant = true; + break; + + case MULTI_ARG_1_SF: + case MULTI_ARG_1_DF: + case MULTI_ARG_1_SF2: + case MULTI_ARG_1_DF2: + case MULTI_ARG_1_DI: + case MULTI_ARG_1_SI: + case MULTI_ARG_1_HI: + case MULTI_ARG_1_QI: + case MULTI_ARG_1_SI_DI: + case MULTI_ARG_1_HI_DI: + case MULTI_ARG_1_HI_SI: + case MULTI_ARG_1_QI_DI: + case MULTI_ARG_1_QI_SI: + case MULTI_ARG_1_QI_HI: + nargs = 1; + break; + + case MULTI_ARG_2_DI_CMP: + case MULTI_ARG_2_SI_CMP: + case MULTI_ARG_2_HI_CMP: + case MULTI_ARG_2_QI_CMP: + nargs = 2; + comparison_p = true; + break; + + case MULTI_ARG_2_SF_TF: + case MULTI_ARG_2_DF_TF: + case MULTI_ARG_2_DI_TF: + case MULTI_ARG_2_SI_TF: + case MULTI_ARG_2_HI_TF: + case MULTI_ARG_2_QI_TF: + nargs = 2; + tf_p = true; + break; + + default: + gcc_unreachable (); + } + + if (optimize || !target + || GET_MODE (target) != tmode + || !insn_data[icode].operand[0].predicate (target, tmode)) + target = gen_reg_rtx (tmode); + + gcc_assert (nargs <= 4); + + for (i = 0; i < nargs; i++) + { + tree arg = CALL_EXPR_ARG (exp, i); + rtx op = expand_normal (arg); + int adjust = (comparison_p) ? 
1 : 0; + enum machine_mode mode = insn_data[icode].operand[i+adjust+1].mode; + + if (last_arg_constant && i == nargs - 1) + { + if (!insn_data[icode].operand[i + 1].predicate (op, mode)) + { + enum insn_code new_icode = icode; + switch (icode) + { + case CODE_FOR_xop_vpermil2v2df3: + case CODE_FOR_xop_vpermil2v4sf3: + case CODE_FOR_xop_vpermil2v4df3: + case CODE_FOR_xop_vpermil2v8sf3: + error ("the last argument must be a 2-bit immediate"); + return gen_reg_rtx (tmode); + case CODE_FOR_xop_rotlv2di3: + new_icode = CODE_FOR_rotlv2di3; + goto xop_rotl; + case CODE_FOR_xop_rotlv4si3: + new_icode = CODE_FOR_rotlv4si3; + goto xop_rotl; + case CODE_FOR_xop_rotlv8hi3: + new_icode = CODE_FOR_rotlv8hi3; + goto xop_rotl; + case CODE_FOR_xop_rotlv16qi3: + new_icode = CODE_FOR_rotlv16qi3; + xop_rotl: + if (CONST_INT_P (op)) + { + int mask = GET_MODE_BITSIZE (GET_MODE_INNER (tmode)) - 1; + op = GEN_INT (INTVAL (op) & mask); + gcc_checking_assert + (insn_data[icode].operand[i + 1].predicate (op, mode)); + } + else + { + gcc_checking_assert + (nargs == 2 + && insn_data[new_icode].operand[0].mode == tmode + && insn_data[new_icode].operand[1].mode == tmode + && insn_data[new_icode].operand[2].mode == mode + && insn_data[new_icode].operand[0].predicate + == insn_data[icode].operand[0].predicate + && insn_data[new_icode].operand[1].predicate + == insn_data[icode].operand[1].predicate); + icode = new_icode; + goto non_constant; + } + break; + default: + gcc_unreachable (); + } + } + } + else + { + non_constant: + if (VECTOR_MODE_P (mode)) + op = safe_vector_operand (op, mode); + + /* If we aren't optimizing, only allow one memory operand to be + generated. */ + if (memory_operand (op, mode)) + num_memory++; + + gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode); + + if (optimize + || !insn_data[icode].operand[i+adjust+1].predicate (op, mode) + || num_memory > 1) + op = force_reg (mode, op); + } + + args[i].op = op; + args[i].mode = mode; + } + + switch (nargs) + { + case 1: + pat = GEN_FCN (icode) (target, args[0].op); + break; + + case 2: + if (tf_p) + pat = GEN_FCN (icode) (target, args[0].op, args[1].op, + GEN_INT ((int)sub_code)); + else if (! comparison_p) + pat = GEN_FCN (icode) (target, args[0].op, args[1].op); + else + { + rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target), + args[0].op, + args[1].op); + + pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op); + } + break; + + case 3: + pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op); + break; + + case 4: + pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op); + break; + + default: + gcc_unreachable (); + } + + if (! pat) + return 0; + + emit_insn (pat); + return target; +} + +/* Subroutine of ix86_expand_args_builtin to take care of scalar unop + insns with vec_merge. 
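   Only element 0 is actually computed; the remaining elements are merged
   back from the input, which is why op1 below is simply a copy of op0.
   Illustrative use of one such builtin with the V4SF_FTYPE_V4SF_VEC_MERGE
   signature (a sketch, needs -msse):

     typedef float __v4sf __attribute__ ((__vector_size__ (16)));

     __v4sf recip_low (__v4sf a)
     {
       return __builtin_ia32_rcpss (a);
     }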
*/ + +static rtx +ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp, + rtx target) +{ + rtx pat; + tree arg0 = CALL_EXPR_ARG (exp, 0); + rtx op1, op0 = expand_normal (arg0); + enum machine_mode tmode = insn_data[icode].operand[0].mode; + enum machine_mode mode0 = insn_data[icode].operand[1].mode; + + if (optimize || !target + || GET_MODE (target) != tmode + || !insn_data[icode].operand[0].predicate (target, tmode)) + target = gen_reg_rtx (tmode); + + if (VECTOR_MODE_P (mode0)) + op0 = safe_vector_operand (op0, mode0); + + if ((optimize && !register_operand (op0, mode0)) + || !insn_data[icode].operand[1].predicate (op0, mode0)) + op0 = copy_to_mode_reg (mode0, op0); + + op1 = op0; + if (!insn_data[icode].operand[2].predicate (op1, mode0)) + op1 = copy_to_mode_reg (mode0, op1); + + pat = GEN_FCN (icode) (target, op0, op1); + if (! pat) + return 0; + emit_insn (pat); + return target; +} + +/* Subroutine of ix86_expand_builtin to take care of comparison insns. */ + +static rtx +ix86_expand_sse_compare (const struct builtin_description *d, + tree exp, rtx target, bool swap) +{ + rtx pat; + tree arg0 = CALL_EXPR_ARG (exp, 0); + tree arg1 = CALL_EXPR_ARG (exp, 1); + rtx op0 = expand_normal (arg0); + rtx op1 = expand_normal (arg1); + rtx op2; + enum machine_mode tmode = insn_data[d->icode].operand[0].mode; + enum machine_mode mode0 = insn_data[d->icode].operand[1].mode; + enum machine_mode mode1 = insn_data[d->icode].operand[2].mode; + enum rtx_code comparison = d->comparison; + + if (VECTOR_MODE_P (mode0)) + op0 = safe_vector_operand (op0, mode0); + if (VECTOR_MODE_P (mode1)) + op1 = safe_vector_operand (op1, mode1); + + /* Swap operands if we have a comparison that isn't available in + hardware. */ + if (swap) + { + rtx tmp = gen_reg_rtx (mode1); + emit_move_insn (tmp, op1); + op1 = op0; + op0 = tmp; + } + + if (optimize || !target + || GET_MODE (target) != tmode + || !insn_data[d->icode].operand[0].predicate (target, tmode)) + target = gen_reg_rtx (tmode); + + if ((optimize && !register_operand (op0, mode0)) + || !insn_data[d->icode].operand[1].predicate (op0, mode0)) + op0 = copy_to_mode_reg (mode0, op0); + if ((optimize && !register_operand (op1, mode1)) + || !insn_data[d->icode].operand[2].predicate (op1, mode1)) + op1 = copy_to_mode_reg (mode1, op1); + + op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1); + pat = GEN_FCN (d->icode) (target, op0, op1, op2); + if (! pat) + return 0; + emit_insn (pat); + return target; +} + +/* Subroutine of ix86_expand_builtin to take care of comi insns. */ + +static rtx +ix86_expand_sse_comi (const struct builtin_description *d, tree exp, + rtx target) +{ + rtx pat; + tree arg0 = CALL_EXPR_ARG (exp, 0); + tree arg1 = CALL_EXPR_ARG (exp, 1); + rtx op0 = expand_normal (arg0); + rtx op1 = expand_normal (arg1); + enum machine_mode mode0 = insn_data[d->icode].operand[0].mode; + enum machine_mode mode1 = insn_data[d->icode].operand[1].mode; + enum rtx_code comparison = d->comparison; + + if (VECTOR_MODE_P (mode0)) + op0 = safe_vector_operand (op0, mode0); + if (VECTOR_MODE_P (mode1)) + op1 = safe_vector_operand (op1, mode1); + + /* Swap operands if we have a comparison that isn't available in + hardware. 
*/ + if (d->flag & BUILTIN_DESC_SWAP_OPERANDS) + { + rtx tmp = op1; + op1 = op0; + op0 = tmp; + } + + target = gen_reg_rtx (SImode); + emit_move_insn (target, const0_rtx); + target = gen_rtx_SUBREG (QImode, target, 0); + + if ((optimize && !register_operand (op0, mode0)) + || !insn_data[d->icode].operand[0].predicate (op0, mode0)) + op0 = copy_to_mode_reg (mode0, op0); + if ((optimize && !register_operand (op1, mode1)) + || !insn_data[d->icode].operand[1].predicate (op1, mode1)) + op1 = copy_to_mode_reg (mode1, op1); + + pat = GEN_FCN (d->icode) (op0, op1); + if (! pat) + return 0; + emit_insn (pat); + emit_insn (gen_rtx_SET (VOIDmode, + gen_rtx_STRICT_LOW_PART (VOIDmode, target), + gen_rtx_fmt_ee (comparison, QImode, + SET_DEST (pat), + const0_rtx))); + + return SUBREG_REG (target); +} + +/* Subroutine of ix86_expand_builtin to take care of ptest insns. */ + +static rtx +ix86_expand_sse_ptest (const struct builtin_description *d, tree exp, + rtx target) +{ + rtx pat; + tree arg0 = CALL_EXPR_ARG (exp, 0); + tree arg1 = CALL_EXPR_ARG (exp, 1); + rtx op0 = expand_normal (arg0); + rtx op1 = expand_normal (arg1); + enum machine_mode mode0 = insn_data[d->icode].operand[0].mode; + enum machine_mode mode1 = insn_data[d->icode].operand[1].mode; + enum rtx_code comparison = d->comparison; + + if (VECTOR_MODE_P (mode0)) + op0 = safe_vector_operand (op0, mode0); + if (VECTOR_MODE_P (mode1)) + op1 = safe_vector_operand (op1, mode1); + + target = gen_reg_rtx (SImode); + emit_move_insn (target, const0_rtx); + target = gen_rtx_SUBREG (QImode, target, 0); + + if ((optimize && !register_operand (op0, mode0)) + || !insn_data[d->icode].operand[0].predicate (op0, mode0)) + op0 = copy_to_mode_reg (mode0, op0); + if ((optimize && !register_operand (op1, mode1)) + || !insn_data[d->icode].operand[1].predicate (op1, mode1)) + op1 = copy_to_mode_reg (mode1, op1); + + pat = GEN_FCN (d->icode) (op0, op1); + if (! pat) + return 0; + emit_insn (pat); + emit_insn (gen_rtx_SET (VOIDmode, + gen_rtx_STRICT_LOW_PART (VOIDmode, target), + gen_rtx_fmt_ee (comparison, QImode, + SET_DEST (pat), + const0_rtx))); + + return SUBREG_REG (target); +} + +/* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. 
*/ + +static rtx +ix86_expand_sse_pcmpestr (const struct builtin_description *d, + tree exp, rtx target) +{ + rtx pat; + tree arg0 = CALL_EXPR_ARG (exp, 0); + tree arg1 = CALL_EXPR_ARG (exp, 1); + tree arg2 = CALL_EXPR_ARG (exp, 2); + tree arg3 = CALL_EXPR_ARG (exp, 3); + tree arg4 = CALL_EXPR_ARG (exp, 4); + rtx scratch0, scratch1; + rtx op0 = expand_normal (arg0); + rtx op1 = expand_normal (arg1); + rtx op2 = expand_normal (arg2); + rtx op3 = expand_normal (arg3); + rtx op4 = expand_normal (arg4); + enum machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm; + + tmode0 = insn_data[d->icode].operand[0].mode; + tmode1 = insn_data[d->icode].operand[1].mode; + modev2 = insn_data[d->icode].operand[2].mode; + modei3 = insn_data[d->icode].operand[3].mode; + modev4 = insn_data[d->icode].operand[4].mode; + modei5 = insn_data[d->icode].operand[5].mode; + modeimm = insn_data[d->icode].operand[6].mode; + + if (VECTOR_MODE_P (modev2)) + op0 = safe_vector_operand (op0, modev2); + if (VECTOR_MODE_P (modev4)) + op2 = safe_vector_operand (op2, modev4); + + if (!insn_data[d->icode].operand[2].predicate (op0, modev2)) + op0 = copy_to_mode_reg (modev2, op0); + if (!insn_data[d->icode].operand[3].predicate (op1, modei3)) + op1 = copy_to_mode_reg (modei3, op1); + if ((optimize && !register_operand (op2, modev4)) + || !insn_data[d->icode].operand[4].predicate (op2, modev4)) + op2 = copy_to_mode_reg (modev4, op2); + if (!insn_data[d->icode].operand[5].predicate (op3, modei5)) + op3 = copy_to_mode_reg (modei5, op3); + + if (!insn_data[d->icode].operand[6].predicate (op4, modeimm)) + { + error ("the fifth argument must be an 8-bit immediate"); + return const0_rtx; + } + + if (d->code == IX86_BUILTIN_PCMPESTRI128) + { + if (optimize || !target + || GET_MODE (target) != tmode0 + || !insn_data[d->icode].operand[0].predicate (target, tmode0)) + target = gen_reg_rtx (tmode0); + + scratch1 = gen_reg_rtx (tmode1); + + pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4); + } + else if (d->code == IX86_BUILTIN_PCMPESTRM128) + { + if (optimize || !target + || GET_MODE (target) != tmode1 + || !insn_data[d->icode].operand[1].predicate (target, tmode1)) + target = gen_reg_rtx (tmode1); + + scratch0 = gen_reg_rtx (tmode0); + + pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4); + } + else + { + gcc_assert (d->flag); + + scratch0 = gen_reg_rtx (tmode0); + scratch1 = gen_reg_rtx (tmode1); + + pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4); + } + + if (! pat) + return 0; + + emit_insn (pat); + + if (d->flag) + { + target = gen_reg_rtx (SImode); + emit_move_insn (target, const0_rtx); + target = gen_rtx_SUBREG (QImode, target, 0); + + emit_insn + (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target), + gen_rtx_fmt_ee (EQ, QImode, + gen_rtx_REG ((enum machine_mode) d->flag, + FLAGS_REG), + const0_rtx))); + return SUBREG_REG (target); + } + else + return target; +} + + +/* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. 
*/ + +static rtx +ix86_expand_sse_pcmpistr (const struct builtin_description *d, + tree exp, rtx target) +{ + rtx pat; + tree arg0 = CALL_EXPR_ARG (exp, 0); + tree arg1 = CALL_EXPR_ARG (exp, 1); + tree arg2 = CALL_EXPR_ARG (exp, 2); + rtx scratch0, scratch1; + rtx op0 = expand_normal (arg0); + rtx op1 = expand_normal (arg1); + rtx op2 = expand_normal (arg2); + enum machine_mode tmode0, tmode1, modev2, modev3, modeimm; + + tmode0 = insn_data[d->icode].operand[0].mode; + tmode1 = insn_data[d->icode].operand[1].mode; + modev2 = insn_data[d->icode].operand[2].mode; + modev3 = insn_data[d->icode].operand[3].mode; + modeimm = insn_data[d->icode].operand[4].mode; + + if (VECTOR_MODE_P (modev2)) + op0 = safe_vector_operand (op0, modev2); + if (VECTOR_MODE_P (modev3)) + op1 = safe_vector_operand (op1, modev3); + + if (!insn_data[d->icode].operand[2].predicate (op0, modev2)) + op0 = copy_to_mode_reg (modev2, op0); + if ((optimize && !register_operand (op1, modev3)) + || !insn_data[d->icode].operand[3].predicate (op1, modev3)) + op1 = copy_to_mode_reg (modev3, op1); + + if (!insn_data[d->icode].operand[4].predicate (op2, modeimm)) + { + error ("the third argument must be an 8-bit immediate"); + return const0_rtx; + } + + if (d->code == IX86_BUILTIN_PCMPISTRI128) + { + if (optimize || !target + || GET_MODE (target) != tmode0 + || !insn_data[d->icode].operand[0].predicate (target, tmode0)) + target = gen_reg_rtx (tmode0); + + scratch1 = gen_reg_rtx (tmode1); + + pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2); + } + else if (d->code == IX86_BUILTIN_PCMPISTRM128) + { + if (optimize || !target + || GET_MODE (target) != tmode1 + || !insn_data[d->icode].operand[1].predicate (target, tmode1)) + target = gen_reg_rtx (tmode1); + + scratch0 = gen_reg_rtx (tmode0); + + pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2); + } + else + { + gcc_assert (d->flag); + + scratch0 = gen_reg_rtx (tmode0); + scratch1 = gen_reg_rtx (tmode1); + + pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2); + } + + if (! pat) + return 0; + + emit_insn (pat); + + if (d->flag) + { + target = gen_reg_rtx (SImode); + emit_move_insn (target, const0_rtx); + target = gen_rtx_SUBREG (QImode, target, 0); + + emit_insn + (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target), + gen_rtx_fmt_ee (EQ, QImode, + gen_rtx_REG ((enum machine_mode) d->flag, + FLAGS_REG), + const0_rtx))); + return SUBREG_REG (target); + } + else + return target; +} + +/* Subroutine of ix86_expand_builtin to take care of insns with + variable number of operands. 
*/ + +static rtx +ix86_expand_args_builtin (const struct builtin_description *d, + tree exp, rtx target) +{ + rtx pat, real_target; + unsigned int i, nargs; + unsigned int nargs_constant = 0; + int num_memory = 0; + struct + { + rtx op; + enum machine_mode mode; + } args[4]; + bool last_arg_count = false; + enum insn_code icode = d->icode; + const struct insn_data_d *insn_p = &insn_data[icode]; + enum machine_mode tmode = insn_p->operand[0].mode; + enum machine_mode rmode = VOIDmode; + bool swap = false; + enum rtx_code comparison = d->comparison; + + switch ((enum ix86_builtin_func_type) d->flag) + { + case INT_FTYPE_V8SF_V8SF_PTEST: + case INT_FTYPE_V4DI_V4DI_PTEST: + case INT_FTYPE_V4DF_V4DF_PTEST: + case INT_FTYPE_V4SF_V4SF_PTEST: + case INT_FTYPE_V2DI_V2DI_PTEST: + case INT_FTYPE_V2DF_V2DF_PTEST: + return ix86_expand_sse_ptest (d, exp, target); + case FLOAT128_FTYPE_FLOAT128: + case FLOAT_FTYPE_FLOAT: + case INT_FTYPE_INT: + case UINT64_FTYPE_INT: + case UINT16_FTYPE_UINT16: + case INT64_FTYPE_INT64: + case INT64_FTYPE_V4SF: + case INT64_FTYPE_V2DF: + case INT_FTYPE_V16QI: + case INT_FTYPE_V8QI: + case INT_FTYPE_V8SF: + case INT_FTYPE_V4DF: + case INT_FTYPE_V4SF: + case INT_FTYPE_V2DF: + case V16QI_FTYPE_V16QI: + case V8SI_FTYPE_V8SF: + case V8SI_FTYPE_V4SI: + case V8HI_FTYPE_V8HI: + case V8HI_FTYPE_V16QI: + case V8QI_FTYPE_V8QI: + case V8SF_FTYPE_V8SF: + case V8SF_FTYPE_V8SI: + case V8SF_FTYPE_V4SF: + case V8SF_FTYPE_V8HI: + case V4SI_FTYPE_V4SI: + case V4SI_FTYPE_V16QI: + case V4SI_FTYPE_V4SF: + case V4SI_FTYPE_V8SI: + case V4SI_FTYPE_V8HI: + case V4SI_FTYPE_V4DF: + case V4SI_FTYPE_V2DF: + case V4HI_FTYPE_V4HI: + case V4DF_FTYPE_V4DF: + case V4DF_FTYPE_V4SI: + case V4DF_FTYPE_V4SF: + case V4DF_FTYPE_V2DF: + case V4SF_FTYPE_V4SF: + case V4SF_FTYPE_V4SI: + case V4SF_FTYPE_V8SF: + case V4SF_FTYPE_V4DF: + case V4SF_FTYPE_V8HI: + case V4SF_FTYPE_V2DF: + case V2DI_FTYPE_V2DI: + case V2DI_FTYPE_V16QI: + case V2DI_FTYPE_V8HI: + case V2DI_FTYPE_V4SI: + case V2DF_FTYPE_V2DF: + case V2DF_FTYPE_V4SI: + case V2DF_FTYPE_V4DF: + case V2DF_FTYPE_V4SF: + case V2DF_FTYPE_V2SI: + case V2SI_FTYPE_V2SI: + case V2SI_FTYPE_V4SF: + case V2SI_FTYPE_V2SF: + case V2SI_FTYPE_V2DF: + case V2SF_FTYPE_V2SF: + case V2SF_FTYPE_V2SI: + nargs = 1; + break; + case V4SF_FTYPE_V4SF_VEC_MERGE: + case V2DF_FTYPE_V2DF_VEC_MERGE: + return ix86_expand_unop_vec_merge_builtin (icode, exp, target); + case FLOAT128_FTYPE_FLOAT128_FLOAT128: + case V16QI_FTYPE_V16QI_V16QI: + case V16QI_FTYPE_V8HI_V8HI: + case V8QI_FTYPE_V8QI_V8QI: + case V8QI_FTYPE_V4HI_V4HI: + case V8HI_FTYPE_V8HI_V8HI: + case V8HI_FTYPE_V16QI_V16QI: + case V8HI_FTYPE_V4SI_V4SI: + case V8SF_FTYPE_V8SF_V8SF: + case V8SF_FTYPE_V8SF_V8SI: + case V4SI_FTYPE_V4SI_V4SI: + case V4SI_FTYPE_V8HI_V8HI: + case V4SI_FTYPE_V4SF_V4SF: + case V4SI_FTYPE_V2DF_V2DF: + case V4HI_FTYPE_V4HI_V4HI: + case V4HI_FTYPE_V8QI_V8QI: + case V4HI_FTYPE_V2SI_V2SI: + case V4DF_FTYPE_V4DF_V4DF: + case V4DF_FTYPE_V4DF_V4DI: + case V4SF_FTYPE_V4SF_V4SF: + case V4SF_FTYPE_V4SF_V4SI: + case V4SF_FTYPE_V4SF_V2SI: + case V4SF_FTYPE_V4SF_V2DF: + case V4SF_FTYPE_V4SF_DI: + case V4SF_FTYPE_V4SF_SI: + case V2DI_FTYPE_V2DI_V2DI: + case V2DI_FTYPE_V16QI_V16QI: + case V2DI_FTYPE_V4SI_V4SI: + case V2DI_FTYPE_V2DI_V16QI: + case V2DI_FTYPE_V2DF_V2DF: + case V2SI_FTYPE_V2SI_V2SI: + case V2SI_FTYPE_V4HI_V4HI: + case V2SI_FTYPE_V2SF_V2SF: + case V2DF_FTYPE_V2DF_V2DF: + case V2DF_FTYPE_V2DF_V4SF: + case V2DF_FTYPE_V2DF_V2DI: + case V2DF_FTYPE_V2DF_DI: + case V2DF_FTYPE_V2DF_SI: + case V2SF_FTYPE_V2SF_V2SF: + case 
V1DI_FTYPE_V1DI_V1DI: + case V1DI_FTYPE_V8QI_V8QI: + case V1DI_FTYPE_V2SI_V2SI: + if (comparison == UNKNOWN) + return ix86_expand_binop_builtin (icode, exp, target); + nargs = 2; + break; + case V4SF_FTYPE_V4SF_V4SF_SWAP: + case V2DF_FTYPE_V2DF_V2DF_SWAP: + gcc_assert (comparison != UNKNOWN); + nargs = 2; + swap = true; + break; + case V8HI_FTYPE_V8HI_V8HI_COUNT: + case V8HI_FTYPE_V8HI_SI_COUNT: + case V4SI_FTYPE_V4SI_V4SI_COUNT: + case V4SI_FTYPE_V4SI_SI_COUNT: + case V4HI_FTYPE_V4HI_V4HI_COUNT: + case V4HI_FTYPE_V4HI_SI_COUNT: + case V2DI_FTYPE_V2DI_V2DI_COUNT: + case V2DI_FTYPE_V2DI_SI_COUNT: + case V2SI_FTYPE_V2SI_V2SI_COUNT: + case V2SI_FTYPE_V2SI_SI_COUNT: + case V1DI_FTYPE_V1DI_V1DI_COUNT: + case V1DI_FTYPE_V1DI_SI_COUNT: + nargs = 2; + last_arg_count = true; + break; + case UINT64_FTYPE_UINT64_UINT64: + case UINT_FTYPE_UINT_UINT: + case UINT_FTYPE_UINT_USHORT: + case UINT_FTYPE_UINT_UCHAR: + case UINT16_FTYPE_UINT16_INT: + case UINT8_FTYPE_UINT8_INT: + nargs = 2; + break; + case V2DI_FTYPE_V2DI_INT_CONVERT: + nargs = 2; + rmode = V1TImode; + nargs_constant = 1; + break; + case V8HI_FTYPE_V8HI_INT: + case V8HI_FTYPE_V8SF_INT: + case V8HI_FTYPE_V4SF_INT: + case V8SF_FTYPE_V8SF_INT: + case V4SI_FTYPE_V4SI_INT: + case V4SI_FTYPE_V8SI_INT: + case V4HI_FTYPE_V4HI_INT: + case V4DF_FTYPE_V4DF_INT: + case V4SF_FTYPE_V4SF_INT: + case V4SF_FTYPE_V8SF_INT: + case V2DI_FTYPE_V2DI_INT: + case V2DF_FTYPE_V2DF_INT: + case V2DF_FTYPE_V4DF_INT: + nargs = 2; + nargs_constant = 1; + break; + case V16QI_FTYPE_V16QI_V16QI_V16QI: + case V8SF_FTYPE_V8SF_V8SF_V8SF: + case V4DF_FTYPE_V4DF_V4DF_V4DF: + case V4SF_FTYPE_V4SF_V4SF_V4SF: + case V2DF_FTYPE_V2DF_V2DF_V2DF: + nargs = 3; + break; + case V16QI_FTYPE_V16QI_V16QI_INT: + case V8HI_FTYPE_V8HI_V8HI_INT: + case V8SI_FTYPE_V8SI_V8SI_INT: + case V8SI_FTYPE_V8SI_V4SI_INT: + case V8SF_FTYPE_V8SF_V8SF_INT: + case V8SF_FTYPE_V8SF_V4SF_INT: + case V4SI_FTYPE_V4SI_V4SI_INT: + case V4DF_FTYPE_V4DF_V4DF_INT: + case V4DF_FTYPE_V4DF_V2DF_INT: + case V4SF_FTYPE_V4SF_V4SF_INT: + case V2DI_FTYPE_V2DI_V2DI_INT: + case V2DF_FTYPE_V2DF_V2DF_INT: + nargs = 3; + nargs_constant = 1; + break; + case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT: + nargs = 3; + rmode = V2DImode; + nargs_constant = 1; + break; + case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT: + nargs = 3; + rmode = DImode; + nargs_constant = 1; + break; + case V2DI_FTYPE_V2DI_UINT_UINT: + nargs = 3; + nargs_constant = 2; + break; + case V2DF_FTYPE_V2DF_V2DF_V2DI_INT: + case V4DF_FTYPE_V4DF_V4DF_V4DI_INT: + case V4SF_FTYPE_V4SF_V4SF_V4SI_INT: + case V8SF_FTYPE_V8SF_V8SF_V8SI_INT: + nargs = 4; + nargs_constant = 1; + break; + case V2DI_FTYPE_V2DI_V2DI_UINT_UINT: + nargs = 4; + nargs_constant = 2; + break; + default: + gcc_unreachable (); + } + + gcc_assert (nargs <= ARRAY_SIZE (args)); + + if (comparison != UNKNOWN) + { + gcc_assert (nargs == 2); + return ix86_expand_sse_compare (d, exp, target, swap); + } + + if (rmode == VOIDmode || rmode == tmode) + { + if (optimize + || target == 0 + || GET_MODE (target) != tmode + || !insn_p->operand[0].predicate (target, tmode)) + target = gen_reg_rtx (tmode); + real_target = target; + } + else + { + target = gen_reg_rtx (rmode); + real_target = simplify_gen_subreg (tmode, target, rmode, 0); + } + + for (i = 0; i < nargs; i++) + { + tree arg = CALL_EXPR_ARG (exp, i); + rtx op = expand_normal (arg); + enum machine_mode mode = insn_p->operand[i + 1].mode; + bool match = insn_p->operand[i + 1].predicate (op, mode); + + if (last_arg_count && (i + 1) == nargs) + { + /* SIMD shift insns take either an 
8-bit immediate or + register as count. But builtin functions take int as + count. If count doesn't match, we put it in register. */ + if (!match) + { + op = simplify_gen_subreg (SImode, op, GET_MODE (op), 0); + if (!insn_p->operand[i + 1].predicate (op, mode)) + op = copy_to_reg (op); + } + } + else if ((nargs - i) <= nargs_constant) + { + if (!match) + switch (icode) + { + case CODE_FOR_sse4_1_roundpd: + case CODE_FOR_sse4_1_roundps: + case CODE_FOR_sse4_1_roundsd: + case CODE_FOR_sse4_1_roundss: + case CODE_FOR_sse4_1_blendps: + case CODE_FOR_avx_blendpd256: + case CODE_FOR_avx_vpermilv4df: + case CODE_FOR_avx_roundpd256: + case CODE_FOR_avx_roundps256: + error ("the last argument must be a 4-bit immediate"); + return const0_rtx; + + case CODE_FOR_sse4_1_blendpd: + case CODE_FOR_avx_vpermilv2df: + case CODE_FOR_xop_vpermil2v2df3: + case CODE_FOR_xop_vpermil2v4sf3: + case CODE_FOR_xop_vpermil2v4df3: + case CODE_FOR_xop_vpermil2v8sf3: + error ("the last argument must be a 2-bit immediate"); + return const0_rtx; + + case CODE_FOR_avx_vextractf128v4df: + case CODE_FOR_avx_vextractf128v8sf: + case CODE_FOR_avx_vextractf128v8si: + case CODE_FOR_avx_vinsertf128v4df: + case CODE_FOR_avx_vinsertf128v8sf: + case CODE_FOR_avx_vinsertf128v8si: + error ("the last argument must be a 1-bit immediate"); + return const0_rtx; + + case CODE_FOR_avx_cmpsdv2df3: + case CODE_FOR_avx_cmpssv4sf3: + case CODE_FOR_avx_cmppdv2df3: + case CODE_FOR_avx_cmppsv4sf3: + case CODE_FOR_avx_cmppdv4df3: + case CODE_FOR_avx_cmppsv8sf3: + error ("the last argument must be a 5-bit immediate"); + return const0_rtx; + + default: + switch (nargs_constant) + { + case 2: + if ((nargs - i) == nargs_constant) + { + error ("the next to last argument must be an 8-bit immediate"); + break; + } + case 1: + error ("the last argument must be an 8-bit immediate"); + break; + default: + gcc_unreachable (); + } + return const0_rtx; + } + } + else + { + if (VECTOR_MODE_P (mode)) + op = safe_vector_operand (op, mode); + + /* If we aren't optimizing, only allow one memory operand to + be generated. */ + if (memory_operand (op, mode)) + num_memory++; + + if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode) + { + if (optimize || !match || num_memory > 1) + op = copy_to_mode_reg (mode, op); + } + else + { + op = copy_to_reg (op); + op = simplify_gen_subreg (mode, op, GET_MODE (op), 0); + } + } + + args[i].op = op; + args[i].mode = mode; + } + + switch (nargs) + { + case 1: + pat = GEN_FCN (icode) (real_target, args[0].op); + break; + case 2: + pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op); + break; + case 3: + pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op, + args[2].op); + break; + case 4: + pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op, + args[2].op, args[3].op); + break; + default: + gcc_unreachable (); + } + + if (! pat) + return 0; + + emit_insn (pat); + return target; +} + +/* Subroutine of ix86_expand_builtin to take care of special insns + with variable number of operands. 
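   These are the load/store style builtins from bdesc_special_args, where
   one operand is a memory reference rather than a register.  A store
   class example of the VOID_FTYPE_PFLOAT_V4SF shape (illustrative
   sketch, needs -msse):

     typedef float __v4sf __attribute__ ((__vector_size__ (16)));

     void stream_out (float *p, __v4sf v)
     {
       __builtin_ia32_movntps (p, v);
     }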
*/ + +static rtx +ix86_expand_special_args_builtin (const struct builtin_description *d, + tree exp, rtx target) +{ + tree arg; + rtx pat, op; + unsigned int i, nargs, arg_adjust, memory; + struct + { + rtx op; + enum machine_mode mode; + } args[3]; + enum insn_code icode = d->icode; + bool last_arg_constant = false; + const struct insn_data_d *insn_p = &insn_data[icode]; + enum machine_mode tmode = insn_p->operand[0].mode; + enum { load, store } klass; + + switch ((enum ix86_builtin_func_type) d->flag) + { + case VOID_FTYPE_VOID: + if (icode == CODE_FOR_avx_vzeroupper) + target = GEN_INT (vzeroupper_intrinsic); + emit_insn (GEN_FCN (icode) (target)); + return 0; + case VOID_FTYPE_UINT64: + case VOID_FTYPE_UNSIGNED: + nargs = 0; + klass = store; + memory = 0; + break; + break; + case UINT64_FTYPE_VOID: + case UNSIGNED_FTYPE_VOID: + nargs = 0; + klass = load; + memory = 0; + break; + case UINT64_FTYPE_PUNSIGNED: + case V2DI_FTYPE_PV2DI: + case V32QI_FTYPE_PCCHAR: + case V16QI_FTYPE_PCCHAR: + case V8SF_FTYPE_PCV4SF: + case V8SF_FTYPE_PCFLOAT: + case V4SF_FTYPE_PCFLOAT: + case V4DF_FTYPE_PCV2DF: + case V4DF_FTYPE_PCDOUBLE: + case V2DF_FTYPE_PCDOUBLE: + case VOID_FTYPE_PVOID: + nargs = 1; + klass = load; + memory = 0; + break; + case VOID_FTYPE_PV2SF_V4SF: + case VOID_FTYPE_PV4DI_V4DI: + case VOID_FTYPE_PV2DI_V2DI: + case VOID_FTYPE_PCHAR_V32QI: + case VOID_FTYPE_PCHAR_V16QI: + case VOID_FTYPE_PFLOAT_V8SF: + case VOID_FTYPE_PFLOAT_V4SF: + case VOID_FTYPE_PDOUBLE_V4DF: + case VOID_FTYPE_PDOUBLE_V2DF: + case VOID_FTYPE_PULONGLONG_ULONGLONG: + case VOID_FTYPE_PINT_INT: + nargs = 1; + klass = store; + /* Reserve memory operand for target. */ + memory = ARRAY_SIZE (args); + break; + case V4SF_FTYPE_V4SF_PCV2SF: + case V2DF_FTYPE_V2DF_PCDOUBLE: + nargs = 2; + klass = load; + memory = 1; + break; + case V8SF_FTYPE_PCV8SF_V8SI: + case V4DF_FTYPE_PCV4DF_V4DI: + case V4SF_FTYPE_PCV4SF_V4SI: + case V2DF_FTYPE_PCV2DF_V2DI: + nargs = 2; + klass = load; + memory = 0; + break; + case VOID_FTYPE_PV8SF_V8SI_V8SF: + case VOID_FTYPE_PV4DF_V4DI_V4DF: + case VOID_FTYPE_PV4SF_V4SI_V4SF: + case VOID_FTYPE_PV2DF_V2DI_V2DF: + nargs = 2; + klass = store; + /* Reserve memory operand for target. 
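   The destination memory reference is argument 0 of the call and becomes
   the insn's operand 0 (the target below), so memory is set past the end
   of args[] and the remaining operands are forced into registers.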
*/ + memory = ARRAY_SIZE (args); + break; + case VOID_FTYPE_UINT_UINT_UINT: + case VOID_FTYPE_UINT64_UINT_UINT: + case UCHAR_FTYPE_UINT_UINT_UINT: + case UCHAR_FTYPE_UINT64_UINT_UINT: + nargs = 3; + klass = load; + memory = ARRAY_SIZE (args); + last_arg_constant = true; + break; + default: + gcc_unreachable (); + } + + gcc_assert (nargs <= ARRAY_SIZE (args)); + + if (klass == store) + { + arg = CALL_EXPR_ARG (exp, 0); + op = expand_normal (arg); + gcc_assert (target == 0); + if (memory) + target = gen_rtx_MEM (tmode, copy_to_mode_reg (Pmode, op)); + else + target = force_reg (tmode, op); + arg_adjust = 1; + } + else + { + arg_adjust = 0; + if (optimize + || target == 0 + || !register_operand (target, tmode) + || GET_MODE (target) != tmode) + target = gen_reg_rtx (tmode); + } + + for (i = 0; i < nargs; i++) + { + enum machine_mode mode = insn_p->operand[i + 1].mode; + bool match; + + arg = CALL_EXPR_ARG (exp, i + arg_adjust); + op = expand_normal (arg); + match = insn_p->operand[i + 1].predicate (op, mode); + + if (last_arg_constant && (i + 1) == nargs) + { + if (!match) + { + if (icode == CODE_FOR_lwp_lwpvalsi3 + || icode == CODE_FOR_lwp_lwpinssi3 + || icode == CODE_FOR_lwp_lwpvaldi3 + || icode == CODE_FOR_lwp_lwpinsdi3) + error ("the last argument must be a 32-bit immediate"); + else + error ("the last argument must be an 8-bit immediate"); + return const0_rtx; + } + } + else + { + if (i == memory) + { + /* This must be the memory operand. */ + op = gen_rtx_MEM (mode, copy_to_mode_reg (Pmode, op)); + gcc_assert (GET_MODE (op) == mode + || GET_MODE (op) == VOIDmode); + } + else + { + /* This must be register. */ + if (VECTOR_MODE_P (mode)) + op = safe_vector_operand (op, mode); + + gcc_assert (GET_MODE (op) == mode + || GET_MODE (op) == VOIDmode); + op = copy_to_mode_reg (mode, op); + } + } + + args[i].op = op; + args[i].mode = mode; + } + + switch (nargs) + { + case 0: + pat = GEN_FCN (icode) (target); + break; + case 1: + pat = GEN_FCN (icode) (target, args[0].op); + break; + case 2: + pat = GEN_FCN (icode) (target, args[0].op, args[1].op); + break; + case 3: + pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op); + break; + default: + gcc_unreachable (); + } + + if (! pat) + return 0; + emit_insn (pat); + return klass == store ? 0 : target; +} + +/* Return the integer constant in ARG. Constrain it to be in the range + of the subparts of VEC_TYPE; issue an error if not. */ + +static int +get_element_number (tree vec_type, tree arg) +{ + unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1; + + if (!host_integerp (arg, 1) + || (elt = tree_low_cst (arg, 1), elt > max)) + { + error ("selector must be an integer constant in the range 0..%wi", max); + return 0; + } + + return elt; +} + +/* A subroutine of ix86_expand_builtin. These builtins are a wrapper around + ix86_expand_vector_init. We DO have language-level syntax for this, in + the form of (type){ init-list }. Except that since we can't place emms + instructions from inside the compiler, we can't allow the use of MMX + registers unless the user explicitly asks for it. So we do *not* define + vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead + we have builtins invoked by mmintrin.h that gives us license to emit + these sorts of instructions. 
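   For example, mmintrin.h wraps the V2SI initializer roughly like this
   (a sketch of the wrapper, not a verbatim copy of the header):

     extern __inline __m64
     __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
     _mm_set_pi32 (int __i1, int __i0)
     {
       return (__m64) __builtin_ia32_vec_init_v2si (__i0, __i1);
     }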
*/ + +static rtx +ix86_expand_vec_init_builtin (tree type, tree exp, rtx target) +{ + enum machine_mode tmode = TYPE_MODE (type); + enum machine_mode inner_mode = GET_MODE_INNER (tmode); + int i, n_elt = GET_MODE_NUNITS (tmode); + rtvec v = rtvec_alloc (n_elt); + + gcc_assert (VECTOR_MODE_P (tmode)); + gcc_assert (call_expr_nargs (exp) == n_elt); + + for (i = 0; i < n_elt; ++i) + { + rtx x = expand_normal (CALL_EXPR_ARG (exp, i)); + RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x); + } + + if (!target || !register_operand (target, tmode)) + target = gen_reg_rtx (tmode); + + ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v)); + return target; +} + +/* A subroutine of ix86_expand_builtin. These builtins are a wrapper around + ix86_expand_vector_extract. They would be redundant (for non-MMX) if we + had a language-level syntax for referencing vector elements. */ + +static rtx +ix86_expand_vec_ext_builtin (tree exp, rtx target) +{ + enum machine_mode tmode, mode0; + tree arg0, arg1; + int elt; + rtx op0; + + arg0 = CALL_EXPR_ARG (exp, 0); + arg1 = CALL_EXPR_ARG (exp, 1); + + op0 = expand_normal (arg0); + elt = get_element_number (TREE_TYPE (arg0), arg1); + + tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0))); + mode0 = TYPE_MODE (TREE_TYPE (arg0)); + gcc_assert (VECTOR_MODE_P (mode0)); + + op0 = force_reg (mode0, op0); + + if (optimize || !target || !register_operand (target, tmode)) + target = gen_reg_rtx (tmode); + + ix86_expand_vector_extract (true, target, op0, elt); + + return target; +} + +/* A subroutine of ix86_expand_builtin. These builtins are a wrapper around + ix86_expand_vector_set. They would be redundant (for non-MMX) if we had + a language-level syntax for referencing vector elements. */ + +static rtx +ix86_expand_vec_set_builtin (tree exp) +{ + enum machine_mode tmode, mode1; + tree arg0, arg1, arg2; + int elt; + rtx op0, op1, target; + + arg0 = CALL_EXPR_ARG (exp, 0); + arg1 = CALL_EXPR_ARG (exp, 1); + arg2 = CALL_EXPR_ARG (exp, 2); + + tmode = TYPE_MODE (TREE_TYPE (arg0)); + mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0))); + gcc_assert (VECTOR_MODE_P (tmode)); + + op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL); + op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL); + elt = get_element_number (TREE_TYPE (arg0), arg2); + + if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode) + op1 = convert_modes (mode1, GET_MODE (op1), op1, true); + + op0 = force_reg (tmode, op0); + op1 = force_reg (mode1, op1); + + /* OP0 is the source of these builtin functions and shouldn't be + modified. Create a copy, use it and return it as target. */ + target = gen_reg_rtx (tmode); + emit_move_insn (target, op0); + ix86_expand_vector_set (true, target, op1, elt); + + return target; +} + +/* Expand an expression EXP that calls a built-in function, + with result going to TARGET if that's convenient + (and in mode MODE if that's convenient). + SUBTARGET may be used as the target for computing one of EXP's operands. + IGNORE is nonzero if the value is to be ignored. 
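   Expansion is also where the per-function ISA check happens: the decls
   exist even when their ISA is disabled, so a use such as the sketch
   below, compiled without -maes and without a matching target attribute,
   is rejected here with the needs-isa-option error rather than at
   declaration time:

     typedef long long __v2di __attribute__ ((__vector_size__ (16)));

     __v2di aes_round (__v2di a, __v2di b)
     {
       return __builtin_ia32_aesenc128 (a, b);
     }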
*/ + +static rtx +ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED, + enum machine_mode mode ATTRIBUTE_UNUSED, + int ignore ATTRIBUTE_UNUSED) +{ + const struct builtin_description *d; + size_t i; + enum insn_code icode; + tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0); + tree arg0, arg1, arg2; + rtx op0, op1, op2, pat; + enum machine_mode mode0, mode1, mode2; + unsigned int fcode = DECL_FUNCTION_CODE (fndecl); + + /* Determine whether the builtin function is available under the current ISA. + Originally the builtin was not created if it wasn't applicable to the + current ISA based on the command line switches. With function specific + options, we need to check in the context of the function making the call + whether it is supported. */ + if (ix86_builtins_isa[fcode].isa + && !(ix86_builtins_isa[fcode].isa & ix86_isa_flags)) + { + char *opts = ix86_target_string (ix86_builtins_isa[fcode].isa, 0, NULL, + NULL, NULL, false); + + if (!opts) + error ("%qE needs unknown isa option", fndecl); + else + { + gcc_assert (opts != NULL); + error ("%qE needs isa option %s", fndecl, opts); + free (opts); + } + return const0_rtx; + } + + switch (fcode) + { + case IX86_BUILTIN_MASKMOVQ: + case IX86_BUILTIN_MASKMOVDQU: + icode = (fcode == IX86_BUILTIN_MASKMOVQ + ? CODE_FOR_mmx_maskmovq + : CODE_FOR_sse2_maskmovdqu); + /* Note the arg order is different from the operand order. */ + arg1 = CALL_EXPR_ARG (exp, 0); + arg2 = CALL_EXPR_ARG (exp, 1); + arg0 = CALL_EXPR_ARG (exp, 2); + op0 = expand_normal (arg0); + op1 = expand_normal (arg1); + op2 = expand_normal (arg2); + mode0 = insn_data[icode].operand[0].mode; + mode1 = insn_data[icode].operand[1].mode; + mode2 = insn_data[icode].operand[2].mode; + + op0 = force_reg (Pmode, op0); + op0 = gen_rtx_MEM (mode1, op0); + + if (!insn_data[icode].operand[0].predicate (op0, mode0)) + op0 = copy_to_mode_reg (mode0, op0); + if (!insn_data[icode].operand[1].predicate (op1, mode1)) + op1 = copy_to_mode_reg (mode1, op1); + if (!insn_data[icode].operand[2].predicate (op2, mode2)) + op2 = copy_to_mode_reg (mode2, op2); + pat = GEN_FCN (icode) (op0, op1, op2); + if (! 
pat) + return 0; + emit_insn (pat); + return 0; + + case IX86_BUILTIN_LDMXCSR: + op0 = expand_normal (CALL_EXPR_ARG (exp, 0)); + target = assign_386_stack_local (SImode, SLOT_VIRTUAL); + emit_move_insn (target, op0); + emit_insn (gen_sse_ldmxcsr (target)); + return 0; + + case IX86_BUILTIN_STMXCSR: + target = assign_386_stack_local (SImode, SLOT_VIRTUAL); + emit_insn (gen_sse_stmxcsr (target)); + return copy_to_mode_reg (SImode, target); + + case IX86_BUILTIN_CLFLUSH: + arg0 = CALL_EXPR_ARG (exp, 0); + op0 = expand_normal (arg0); + icode = CODE_FOR_sse2_clflush; + if (!insn_data[icode].operand[0].predicate (op0, Pmode)) + op0 = copy_to_mode_reg (Pmode, op0); + + emit_insn (gen_sse2_clflush (op0)); + return 0; + + case IX86_BUILTIN_MONITOR: + arg0 = CALL_EXPR_ARG (exp, 0); + arg1 = CALL_EXPR_ARG (exp, 1); + arg2 = CALL_EXPR_ARG (exp, 2); + op0 = expand_normal (arg0); + op1 = expand_normal (arg1); + op2 = expand_normal (arg2); + if (!REG_P (op0)) + op0 = copy_to_mode_reg (Pmode, op0); + if (!REG_P (op1)) + op1 = copy_to_mode_reg (SImode, op1); + if (!REG_P (op2)) + op2 = copy_to_mode_reg (SImode, op2); + emit_insn (ix86_gen_monitor (op0, op1, op2)); + return 0; + + case IX86_BUILTIN_MWAIT: + arg0 = CALL_EXPR_ARG (exp, 0); + arg1 = CALL_EXPR_ARG (exp, 1); + op0 = expand_normal (arg0); + op1 = expand_normal (arg1); + if (!REG_P (op0)) + op0 = copy_to_mode_reg (SImode, op0); + if (!REG_P (op1)) + op1 = copy_to_mode_reg (SImode, op1); + emit_insn (gen_sse3_mwait (op0, op1)); + return 0; + + case IX86_BUILTIN_VEC_INIT_V2SI: + case IX86_BUILTIN_VEC_INIT_V4HI: + case IX86_BUILTIN_VEC_INIT_V8QI: + return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target); + + case IX86_BUILTIN_VEC_EXT_V2DF: + case IX86_BUILTIN_VEC_EXT_V2DI: + case IX86_BUILTIN_VEC_EXT_V4SF: + case IX86_BUILTIN_VEC_EXT_V4SI: + case IX86_BUILTIN_VEC_EXT_V8HI: + case IX86_BUILTIN_VEC_EXT_V2SI: + case IX86_BUILTIN_VEC_EXT_V4HI: + case IX86_BUILTIN_VEC_EXT_V16QI: + return ix86_expand_vec_ext_builtin (exp, target); + + case IX86_BUILTIN_VEC_SET_V2DI: + case IX86_BUILTIN_VEC_SET_V4SF: + case IX86_BUILTIN_VEC_SET_V4SI: + case IX86_BUILTIN_VEC_SET_V8HI: + case IX86_BUILTIN_VEC_SET_V4HI: + case IX86_BUILTIN_VEC_SET_V16QI: + return ix86_expand_vec_set_builtin (exp); + + case IX86_BUILTIN_VEC_PERM_V2DF: + case IX86_BUILTIN_VEC_PERM_V4SF: + case IX86_BUILTIN_VEC_PERM_V2DI: + case IX86_BUILTIN_VEC_PERM_V4SI: + case IX86_BUILTIN_VEC_PERM_V8HI: + case IX86_BUILTIN_VEC_PERM_V16QI: + case IX86_BUILTIN_VEC_PERM_V2DI_U: + case IX86_BUILTIN_VEC_PERM_V4SI_U: + case IX86_BUILTIN_VEC_PERM_V8HI_U: + case IX86_BUILTIN_VEC_PERM_V16QI_U: + case IX86_BUILTIN_VEC_PERM_V4DF: + case IX86_BUILTIN_VEC_PERM_V8SF: + return ix86_expand_vec_perm_builtin (exp); + + case IX86_BUILTIN_INFQ: + case IX86_BUILTIN_HUGE_VALQ: + { + REAL_VALUE_TYPE inf; + rtx tmp; + + real_inf (&inf); + tmp = CONST_DOUBLE_FROM_REAL_VALUE (inf, mode); + + tmp = validize_mem (force_const_mem (mode, tmp)); + + if (target == 0) + target = gen_reg_rtx (mode); + + emit_move_insn (target, tmp); + return target; + } + + case IX86_BUILTIN_LLWPCB: + arg0 = CALL_EXPR_ARG (exp, 0); + op0 = expand_normal (arg0); + icode = CODE_FOR_lwp_llwpcb; + if (!insn_data[icode].operand[0].predicate (op0, Pmode)) + op0 = copy_to_mode_reg (Pmode, op0); + emit_insn (gen_lwp_llwpcb (op0)); + return 0; + + case IX86_BUILTIN_SLWPCB: + icode = CODE_FOR_lwp_slwpcb; + if (!target + || !insn_data[icode].operand[0].predicate (target, Pmode)) + target = gen_reg_rtx (Pmode); + emit_insn (gen_lwp_slwpcb (target)); + return 
target; + + case IX86_BUILTIN_BEXTRI32: + case IX86_BUILTIN_BEXTRI64: + arg0 = CALL_EXPR_ARG (exp, 0); + arg1 = CALL_EXPR_ARG (exp, 1); + op0 = expand_normal (arg0); + op1 = expand_normal (arg1); + icode = (fcode == IX86_BUILTIN_BEXTRI32 + ? CODE_FOR_tbm_bextri_si + : CODE_FOR_tbm_bextri_di); + if (!CONST_INT_P (op1)) + { + error ("last argument must be an immediate"); + return const0_rtx; + } + else + { + unsigned char length = (INTVAL (op1) >> 8) & 0xFF; + unsigned char lsb_index = INTVAL (op1) & 0xFF; + op1 = GEN_INT (length); + op2 = GEN_INT (lsb_index); + pat = GEN_FCN (icode) (target, op0, op1, op2); + if (pat) + emit_insn (pat); + return target; + } + + case IX86_BUILTIN_RDRAND16_STEP: + icode = CODE_FOR_rdrandhi_1; + mode0 = HImode; + goto rdrand_step; + + case IX86_BUILTIN_RDRAND32_STEP: + icode = CODE_FOR_rdrandsi_1; + mode0 = SImode; + goto rdrand_step; + + case IX86_BUILTIN_RDRAND64_STEP: + icode = CODE_FOR_rdranddi_1; + mode0 = DImode; + +rdrand_step: + op0 = gen_reg_rtx (mode0); + emit_insn (GEN_FCN (icode) (op0)); + + arg0 = CALL_EXPR_ARG (exp, 0); + op1 = expand_normal (arg0); + if (!address_operand (op1, VOIDmode)) + op1 = copy_addr_to_reg (op1); + emit_move_insn (gen_rtx_MEM (mode0, op1), op0); + + op1 = gen_reg_rtx (SImode); + emit_move_insn (op1, CONST1_RTX (SImode)); + + /* Emit SImode conditional move. */ + if (mode0 == HImode) + { + op2 = gen_reg_rtx (SImode); + emit_insn (gen_zero_extendhisi2 (op2, op0)); + } + else if (mode0 == SImode) + op2 = op0; + else + op2 = gen_rtx_SUBREG (SImode, op0, 0); + + if (target == 0) + target = gen_reg_rtx (SImode); + + pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG), + const0_rtx); + emit_insn (gen_rtx_SET (VOIDmode, target, + gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1))); + return target; + + default: + break; + } + + for (i = 0, d = bdesc_special_args; + i < ARRAY_SIZE (bdesc_special_args); + i++, d++) + if (d->code == fcode) + return ix86_expand_special_args_builtin (d, exp, target); + + for (i = 0, d = bdesc_args; + i < ARRAY_SIZE (bdesc_args); + i++, d++) + if (d->code == fcode) + switch (fcode) + { + case IX86_BUILTIN_FABSQ: + case IX86_BUILTIN_COPYSIGNQ: + if (!TARGET_SSE2) + /* Emit a normal call if SSE2 isn't available. */ + return expand_call (exp, target, ignore); + default: + return ix86_expand_args_builtin (d, exp, target); + } + + for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++) + if (d->code == fcode) + return ix86_expand_sse_comi (d, exp, target); + + for (i = 0, d = bdesc_pcmpestr; + i < ARRAY_SIZE (bdesc_pcmpestr); + i++, d++) + if (d->code == fcode) + return ix86_expand_sse_pcmpestr (d, exp, target); + + for (i = 0, d = bdesc_pcmpistr; + i < ARRAY_SIZE (bdesc_pcmpistr); + i++, d++) + if (d->code == fcode) + return ix86_expand_sse_pcmpistr (d, exp, target); + + for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++) + if (d->code == fcode) + return ix86_expand_multi_arg_builtin (d->icode, exp, target, + (enum ix86_builtin_func_type) + d->flag, d->comparison); + + gcc_unreachable (); +} + +/* Returns a function decl for a vectorized version of the builtin function + with builtin function code FN and the result vector type TYPE, or NULL_TREE + if it is not available. 
*/ + +static tree +ix86_builtin_vectorized_function (tree fndecl, tree type_out, + tree type_in) +{ + enum machine_mode in_mode, out_mode; + int in_n, out_n; + enum built_in_function fn = DECL_FUNCTION_CODE (fndecl); + + if (TREE_CODE (type_out) != VECTOR_TYPE + || TREE_CODE (type_in) != VECTOR_TYPE + || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL) + return NULL_TREE; + + out_mode = TYPE_MODE (TREE_TYPE (type_out)); + out_n = TYPE_VECTOR_SUBPARTS (type_out); + in_mode = TYPE_MODE (TREE_TYPE (type_in)); + in_n = TYPE_VECTOR_SUBPARTS (type_in); + + switch (fn) + { + case BUILT_IN_SQRT: + if (out_mode == DFmode && in_mode == DFmode) + { + if (out_n == 2 && in_n == 2) + return ix86_builtins[IX86_BUILTIN_SQRTPD]; + else if (out_n == 4 && in_n == 4) + return ix86_builtins[IX86_BUILTIN_SQRTPD256]; + } + break; + + case BUILT_IN_SQRTF: + if (out_mode == SFmode && in_mode == SFmode) + { + if (out_n == 4 && in_n == 4) + return ix86_builtins[IX86_BUILTIN_SQRTPS_NR]; + else if (out_n == 8 && in_n == 8) + return ix86_builtins[IX86_BUILTIN_SQRTPS_NR256]; + } + break; + + case BUILT_IN_LRINT: + if (out_mode == SImode && out_n == 4 + && in_mode == DFmode && in_n == 2) + return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX]; + break; + + case BUILT_IN_LRINTF: + if (out_mode == SImode && in_mode == SFmode) + { + if (out_n == 4 && in_n == 4) + return ix86_builtins[IX86_BUILTIN_CVTPS2DQ]; + else if (out_n == 8 && in_n == 8) + return ix86_builtins[IX86_BUILTIN_CVTPS2DQ256]; + } + break; + + case BUILT_IN_COPYSIGN: + if (out_mode == DFmode && in_mode == DFmode) + { + if (out_n == 2 && in_n == 2) + return ix86_builtins[IX86_BUILTIN_CPYSGNPD]; + else if (out_n == 4 && in_n == 4) + return ix86_builtins[IX86_BUILTIN_CPYSGNPD256]; + } + break; + + case BUILT_IN_COPYSIGNF: + if (out_mode == SFmode && in_mode == SFmode) + { + if (out_n == 4 && in_n == 4) + return ix86_builtins[IX86_BUILTIN_CPYSGNPS]; + else if (out_n == 8 && in_n == 8) + return ix86_builtins[IX86_BUILTIN_CPYSGNPS256]; + } + break; + + case BUILT_IN_FMA: + if (out_mode == DFmode && in_mode == DFmode) + { + if (out_n == 2 && in_n == 2) + return ix86_builtins[IX86_BUILTIN_VFMADDPD]; + if (out_n == 4 && in_n == 4) + return ix86_builtins[IX86_BUILTIN_VFMADDPD256]; + } + break; + + case BUILT_IN_FMAF: + if (out_mode == SFmode && in_mode == SFmode) + { + if (out_n == 4 && in_n == 4) + return ix86_builtins[IX86_BUILTIN_VFMADDPS]; + if (out_n == 8 && in_n == 8) + return ix86_builtins[IX86_BUILTIN_VFMADDPS256]; + } + break; + + default: + break; + } + + /* Dispatch to a handler for a vectorization library. */ + if (ix86_veclib_handler) + return ix86_veclib_handler ((enum built_in_function) fn, type_out, + type_in); + + return NULL_TREE; +} + +/* Handler for an SVML-style interface to + a library with vectorized intrinsics. */ + +static tree +ix86_veclibabi_svml (enum built_in_function fn, tree type_out, tree type_in) +{ + char name[20]; + tree fntype, new_fndecl, args; + unsigned arity; + const char *bname; + enum machine_mode el_mode, in_mode; + int n, in_n; + + /* The SVML is suitable for unsafe math only. 
*/ + if (!flag_unsafe_math_optimizations) + return NULL_TREE; + + el_mode = TYPE_MODE (TREE_TYPE (type_out)); + n = TYPE_VECTOR_SUBPARTS (type_out); + in_mode = TYPE_MODE (TREE_TYPE (type_in)); + in_n = TYPE_VECTOR_SUBPARTS (type_in); + if (el_mode != in_mode + || n != in_n) + return NULL_TREE; + + switch (fn) + { + case BUILT_IN_EXP: + case BUILT_IN_LOG: + case BUILT_IN_LOG10: + case BUILT_IN_POW: + case BUILT_IN_TANH: + case BUILT_IN_TAN: + case BUILT_IN_ATAN: + case BUILT_IN_ATAN2: + case BUILT_IN_ATANH: + case BUILT_IN_CBRT: + case BUILT_IN_SINH: + case BUILT_IN_SIN: + case BUILT_IN_ASINH: + case BUILT_IN_ASIN: + case BUILT_IN_COSH: + case BUILT_IN_COS: + case BUILT_IN_ACOSH: + case BUILT_IN_ACOS: + if (el_mode != DFmode || n != 2) + return NULL_TREE; + break; + + case BUILT_IN_EXPF: + case BUILT_IN_LOGF: + case BUILT_IN_LOG10F: + case BUILT_IN_POWF: + case BUILT_IN_TANHF: + case BUILT_IN_TANF: + case BUILT_IN_ATANF: + case BUILT_IN_ATAN2F: + case BUILT_IN_ATANHF: + case BUILT_IN_CBRTF: + case BUILT_IN_SINHF: + case BUILT_IN_SINF: + case BUILT_IN_ASINHF: + case BUILT_IN_ASINF: + case BUILT_IN_COSHF: + case BUILT_IN_COSF: + case BUILT_IN_ACOSHF: + case BUILT_IN_ACOSF: + if (el_mode != SFmode || n != 4) + return NULL_TREE; + break; + + default: + return NULL_TREE; + } + + bname = IDENTIFIER_POINTER (DECL_NAME (implicit_built_in_decls[fn])); + + if (fn == BUILT_IN_LOGF) + strcpy (name, "vmlsLn4"); + else if (fn == BUILT_IN_LOG) + strcpy (name, "vmldLn2"); + else if (n == 4) + { + sprintf (name, "vmls%s", bname+10); + name[strlen (name)-1] = '4'; + } + else + sprintf (name, "vmld%s2", bname+10); + + /* Convert to uppercase. */ + name[4] &= ~0x20; + + arity = 0; + for (args = DECL_ARGUMENTS (implicit_built_in_decls[fn]); args; + args = TREE_CHAIN (args)) + arity++; + + if (arity == 1) + fntype = build_function_type_list (type_out, type_in, NULL); + else + fntype = build_function_type_list (type_out, type_in, type_in, NULL); + + /* Build a function declaration for the vectorized function. */ + new_fndecl = build_decl (BUILTINS_LOCATION, + FUNCTION_DECL, get_identifier (name), fntype); + TREE_PUBLIC (new_fndecl) = 1; + DECL_EXTERNAL (new_fndecl) = 1; + DECL_IS_NOVOPS (new_fndecl) = 1; + TREE_READONLY (new_fndecl) = 1; + + return new_fndecl; +} + +/* Handler for an ACML-style interface to + a library with vectorized intrinsics. */ + +static tree +ix86_veclibabi_acml (enum built_in_function fn, tree type_out, tree type_in) +{ + char name[20] = "__vr.._"; + tree fntype, new_fndecl, args; + unsigned arity; + const char *bname; + enum machine_mode el_mode, in_mode; + int n, in_n; + + /* The ACML is 64bits only and suitable for unsafe math only as + it does not correctly support parts of IEEE with the required + precision such as denormals. 
*/ + if (!TARGET_64BIT + || !flag_unsafe_math_optimizations) + return NULL_TREE; + + el_mode = TYPE_MODE (TREE_TYPE (type_out)); + n = TYPE_VECTOR_SUBPARTS (type_out); + in_mode = TYPE_MODE (TREE_TYPE (type_in)); + in_n = TYPE_VECTOR_SUBPARTS (type_in); + if (el_mode != in_mode + || n != in_n) + return NULL_TREE; + + switch (fn) + { + case BUILT_IN_SIN: + case BUILT_IN_COS: + case BUILT_IN_EXP: + case BUILT_IN_LOG: + case BUILT_IN_LOG2: + case BUILT_IN_LOG10: + name[4] = 'd'; + name[5] = '2'; + if (el_mode != DFmode + || n != 2) + return NULL_TREE; + break; + + case BUILT_IN_SINF: + case BUILT_IN_COSF: + case BUILT_IN_EXPF: + case BUILT_IN_POWF: + case BUILT_IN_LOGF: + case BUILT_IN_LOG2F: + case BUILT_IN_LOG10F: + name[4] = 's'; + name[5] = '4'; + if (el_mode != SFmode + || n != 4) + return NULL_TREE; + break; + + default: + return NULL_TREE; + } + + bname = IDENTIFIER_POINTER (DECL_NAME (implicit_built_in_decls[fn])); + sprintf (name + 7, "%s", bname+10); + + arity = 0; + for (args = DECL_ARGUMENTS (implicit_built_in_decls[fn]); args; + args = TREE_CHAIN (args)) + arity++; + + if (arity == 1) + fntype = build_function_type_list (type_out, type_in, NULL); + else + fntype = build_function_type_list (type_out, type_in, type_in, NULL); + + /* Build a function declaration for the vectorized function. */ + new_fndecl = build_decl (BUILTINS_LOCATION, + FUNCTION_DECL, get_identifier (name), fntype); + TREE_PUBLIC (new_fndecl) = 1; + DECL_EXTERNAL (new_fndecl) = 1; + DECL_IS_NOVOPS (new_fndecl) = 1; + TREE_READONLY (new_fndecl) = 1; + + return new_fndecl; +} + + +/* Returns a decl of a function that implements conversion of an integer vector + into a floating-point vector, or vice-versa. DEST_TYPE and SRC_TYPE + are the types involved when converting according to CODE. + Return NULL_TREE if it is not available. */ + +static tree +ix86_vectorize_builtin_conversion (unsigned int code, + tree dest_type, tree src_type) +{ + if (! TARGET_SSE2) + return NULL_TREE; + + switch (code) + { + case FLOAT_EXPR: + switch (TYPE_MODE (src_type)) + { + case V4SImode: + switch (TYPE_MODE (dest_type)) + { + case V4SFmode: + return (TYPE_UNSIGNED (src_type) + ? ix86_builtins[IX86_BUILTIN_CVTUDQ2PS] + : ix86_builtins[IX86_BUILTIN_CVTDQ2PS]); + case V4DFmode: + return (TYPE_UNSIGNED (src_type) + ? NULL_TREE + : ix86_builtins[IX86_BUILTIN_CVTDQ2PD256]); + default: + return NULL_TREE; + } + break; + case V8SImode: + switch (TYPE_MODE (dest_type)) + { + case V8SFmode: + return (TYPE_UNSIGNED (src_type) + ? NULL_TREE + : ix86_builtins[IX86_BUILTIN_CVTDQ2PS256]); + default: + return NULL_TREE; + } + break; + default: + return NULL_TREE; + } + + case FIX_TRUNC_EXPR: + switch (TYPE_MODE (dest_type)) + { + case V4SImode: + switch (TYPE_MODE (src_type)) + { + case V4SFmode: + return (TYPE_UNSIGNED (dest_type) + ? NULL_TREE + : ix86_builtins[IX86_BUILTIN_CVTTPS2DQ]); + case V4DFmode: + return (TYPE_UNSIGNED (dest_type) + ? NULL_TREE + : ix86_builtins[IX86_BUILTIN_CVTTPD2DQ256]); + default: + return NULL_TREE; + } + break; + + case V8SImode: + switch (TYPE_MODE (src_type)) + { + case V8SFmode: + return (TYPE_UNSIGNED (dest_type) + ? NULL_TREE + : ix86_builtins[IX86_BUILTIN_CVTTPS2DQ256]); + default: + return NULL_TREE; + } + break; + + default: + return NULL_TREE; + } + + default: + return NULL_TREE; + } + + return NULL_TREE; +} + +/* Returns a code for a target-specific builtin that implements + reciprocal of the function, or NULL_TREE if not available. 
*/ + +static tree +ix86_builtin_reciprocal (unsigned int fn, bool md_fn, + bool sqrt ATTRIBUTE_UNUSED) +{ + if (! (TARGET_SSE_MATH && !optimize_insn_for_size_p () + && flag_finite_math_only && !flag_trapping_math + && flag_unsafe_math_optimizations)) + return NULL_TREE; + + if (md_fn) + /* Machine dependent builtins. */ + switch (fn) + { + /* Vectorized version of sqrt to rsqrt conversion. */ + case IX86_BUILTIN_SQRTPS_NR: + return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR]; + + case IX86_BUILTIN_SQRTPS_NR256: + return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR256]; + + default: + return NULL_TREE; + } + else + /* Normal builtins. */ + switch (fn) + { + /* Sqrt to rsqrt conversion. */ + case BUILT_IN_SQRTF: + return ix86_builtins[IX86_BUILTIN_RSQRTF]; + + default: + return NULL_TREE; + } +} + +/* Helper for avx_vpermilps256_operand et al. This is also used by + the expansion functions to turn the parallel back into a mask. + The return value is 0 for no match and the imm8+1 for a match. */ + +int +avx_vpermilp_parallel (rtx par, enum machine_mode mode) +{ + unsigned i, nelt = GET_MODE_NUNITS (mode); + unsigned mask = 0; + unsigned char ipar[8]; + + if (XVECLEN (par, 0) != (int) nelt) + return 0; + + /* Validate that all of the elements are constants, and not totally + out of range. Copy the data into an integral array to make the + subsequent checks easier. */ + for (i = 0; i < nelt; ++i) + { + rtx er = XVECEXP (par, 0, i); + unsigned HOST_WIDE_INT ei; + + if (!CONST_INT_P (er)) + return 0; + ei = INTVAL (er); + if (ei >= nelt) + return 0; + ipar[i] = ei; + } + + switch (mode) + { + case V4DFmode: + /* In the 256-bit DFmode case, we can only move elements within + a 128-bit lane. */ + for (i = 0; i < 2; ++i) + { + if (ipar[i] >= 2) + return 0; + mask |= ipar[i] << i; + } + for (i = 2; i < 4; ++i) + { + if (ipar[i] < 2) + return 0; + mask |= (ipar[i] - 2) << i; + } + break; + + case V8SFmode: + /* In the 256-bit SFmode case, we have full freedom of movement + within the low 128-bit lane, but the high 128-bit lane must + mirror the exact same pattern. */ + for (i = 0; i < 4; ++i) + if (ipar[i] + 4 != ipar[i + 4]) + return 0; + nelt = 4; + /* FALLTHRU */ + + case V2DFmode: + case V4SFmode: + /* In the 128-bit case, we've full freedom in the placement of + the elements from the source operand. */ + for (i = 0; i < nelt; ++i) + mask |= ipar[i] << (i * (nelt / 2)); + break; + + default: + gcc_unreachable (); + } + + /* Make sure success has a non-zero value by adding one. */ + return mask + 1; +} + +/* Helper for avx_vperm2f128_v4df_operand et al. This is also used by + the expansion functions to turn the parallel back into a mask. + The return value is 0 for no match and the imm8+1 for a match. */ + +int +avx_vperm2f128_parallel (rtx par, enum machine_mode mode) +{ + unsigned i, nelt = GET_MODE_NUNITS (mode), nelt2 = nelt / 2; + unsigned mask = 0; + unsigned char ipar[8]; + + if (XVECLEN (par, 0) != (int) nelt) + return 0; + + /* Validate that all of the elements are constants, and not totally + out of range. Copy the data into an integral array to make the + subsequent checks easier. */ + for (i = 0; i < nelt; ++i) + { + rtx er = XVECEXP (par, 0, i); + unsigned HOST_WIDE_INT ei; + + if (!CONST_INT_P (er)) + return 0; + ei = INTVAL (er); + if (ei >= 2 * nelt) + return 0; + ipar[i] = ei; + } + + /* Validate that the halves of the permute are halves. 
*/ + for (i = 0; i < nelt2 - 1; ++i) + if (ipar[i] + 1 != ipar[i + 1]) + return 0; + for (i = nelt2; i < nelt - 1; ++i) + if (ipar[i] + 1 != ipar[i + 1]) + return 0; + + /* Reconstruct the mask. */ + for (i = 0; i < 2; ++i) + { + unsigned e = ipar[i * nelt2]; + if (e % nelt2) + return 0; + e /= nelt2; + mask |= e << (i * 4); + } + + /* Make sure success has a non-zero value by adding one. */ + return mask + 1; +} + + +/* Store OPERAND to the memory after reload is completed. This means + that we can't easily use assign_stack_local. */ +rtx +ix86_force_to_memory (enum machine_mode mode, rtx operand) +{ + rtx result; + + gcc_assert (reload_completed); + if (ix86_using_red_zone ()) + { + result = gen_rtx_MEM (mode, + gen_rtx_PLUS (Pmode, + stack_pointer_rtx, + GEN_INT (-RED_ZONE_SIZE))); + emit_move_insn (result, operand); + } + else if (TARGET_64BIT) + { + switch (mode) + { + case HImode: + case SImode: + operand = gen_lowpart (DImode, operand); + /* FALLTHRU */ + case DImode: + emit_insn ( + gen_rtx_SET (VOIDmode, + gen_rtx_MEM (DImode, + gen_rtx_PRE_DEC (DImode, + stack_pointer_rtx)), + operand)); + break; + default: + gcc_unreachable (); + } + result = gen_rtx_MEM (mode, stack_pointer_rtx); + } + else + { + switch (mode) + { + case DImode: + { + rtx operands[2]; + split_double_mode (mode, &operand, 1, operands, operands + 1); + emit_insn ( + gen_rtx_SET (VOIDmode, + gen_rtx_MEM (SImode, + gen_rtx_PRE_DEC (Pmode, + stack_pointer_rtx)), + operands[1])); + emit_insn ( + gen_rtx_SET (VOIDmode, + gen_rtx_MEM (SImode, + gen_rtx_PRE_DEC (Pmode, + stack_pointer_rtx)), + operands[0])); + } + break; + case HImode: + /* Store HImodes as SImodes. */ + operand = gen_lowpart (SImode, operand); + /* FALLTHRU */ + case SImode: + emit_insn ( + gen_rtx_SET (VOIDmode, + gen_rtx_MEM (GET_MODE (operand), + gen_rtx_PRE_DEC (SImode, + stack_pointer_rtx)), + operand)); + break; + default: + gcc_unreachable (); + } + result = gen_rtx_MEM (mode, stack_pointer_rtx); + } + return result; +} + +/* Free operand from the memory. */ +void +ix86_free_from_memory (enum machine_mode mode) +{ + if (!ix86_using_red_zone ()) + { + int size; + + if (mode == DImode || TARGET_64BIT) + size = 8; + else + size = 4; + /* Use LEA to deallocate stack space. In peephole2 it will be converted + to pop or add instruction if registers are available. */ + emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx, + gen_rtx_PLUS (Pmode, stack_pointer_rtx, + GEN_INT (size)))); + } +} + +/* Implement TARGET_IRA_COVER_CLASSES. If -mfpmath=sse, we prefer + SSE_REGS to FLOAT_REGS if their costs for a pseudo are the + same. */ +static const reg_class_t * +i386_ira_cover_classes (void) +{ + static const reg_class_t sse_fpmath_classes[] = { + GENERAL_REGS, SSE_REGS, MMX_REGS, FLOAT_REGS, LIM_REG_CLASSES + }; + static const reg_class_t no_sse_fpmath_classes[] = { + GENERAL_REGS, FLOAT_REGS, MMX_REGS, SSE_REGS, LIM_REG_CLASSES + }; + + return TARGET_SSE_MATH ? sse_fpmath_classes : no_sse_fpmath_classes; +} + +/* Implement TARGET_PREFERRED_RELOAD_CLASS. + + Put float CONST_DOUBLE in the constant pool instead of fp regs. + QImode must go into class Q_REGS. + Narrow ALL_REGS to GENERAL_REGS. This supports allowing movsf and + movdf to do mem-to-mem moves through integer regs. */ + +static reg_class_t +ix86_preferred_reload_class (rtx x, reg_class_t regclass) +{ + enum machine_mode mode = GET_MODE (x); + + /* We're only allowed to return a subclass of CLASS. Many of the + following checks fail for NO_REGS, so eliminate that early. 
*/ + if (regclass == NO_REGS) + return NO_REGS; + + /* All classes can load zeros. */ + if (x == CONST0_RTX (mode)) + return regclass; + + /* Force constants into memory if we are loading a (nonzero) constant into + an MMX or SSE register. This is because there are no MMX/SSE instructions + to load from a constant. */ + if (CONSTANT_P (x) + && (MAYBE_MMX_CLASS_P (regclass) || MAYBE_SSE_CLASS_P (regclass))) + return NO_REGS; + + /* Prefer SSE regs only, if we can use them for math. */ + if (TARGET_SSE_MATH && !TARGET_MIX_SSE_I387 && SSE_FLOAT_MODE_P (mode)) + return SSE_CLASS_P (regclass) ? regclass : NO_REGS; + + /* Floating-point constants need more complex checks. */ + if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) != VOIDmode) + { + /* General regs can load everything. */ + if (reg_class_subset_p (regclass, GENERAL_REGS)) + return regclass; + + /* Floats can load 0 and 1 plus some others. Note that we eliminated + zero above. We only want to wind up preferring 80387 registers if + we plan on doing computation with them. */ + if (TARGET_80387 + && standard_80387_constant_p (x) > 0) + { + /* Limit class to non-sse. */ + if (regclass == FLOAT_SSE_REGS) + return FLOAT_REGS; + if (regclass == FP_TOP_SSE_REGS) + return FP_TOP_REG; + if (regclass == FP_SECOND_SSE_REGS) + return FP_SECOND_REG; + if (regclass == FLOAT_INT_REGS || regclass == FLOAT_REGS) + return regclass; + } + + return NO_REGS; + } + + /* Generally when we see PLUS here, it's the function invariant + (plus soft-fp const_int). Which can only be computed into general + regs. */ + if (GET_CODE (x) == PLUS) + return reg_class_subset_p (regclass, GENERAL_REGS) ? regclass : NO_REGS; + + /* QImode constants are easy to load, but non-constant QImode data + must go into Q_REGS. */ + if (GET_MODE (x) == QImode && !CONSTANT_P (x)) + { + if (reg_class_subset_p (regclass, Q_REGS)) + return regclass; + if (reg_class_subset_p (Q_REGS, regclass)) + return Q_REGS; + return NO_REGS; + } + + return regclass; +} + +/* Discourage putting floating-point values in SSE registers unless + SSE math is being used, and likewise for the 387 registers. */ +static reg_class_t +ix86_preferred_output_reload_class (rtx x, reg_class_t regclass) +{ + enum machine_mode mode = GET_MODE (x); + + /* Restrict the output reload class to the register bank that we are doing + math on. If we would like not to return a subset of CLASS, reject this + alternative: if reload cannot do this, it will still use its choice. */ + mode = GET_MODE (x); + if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode)) + return MAYBE_SSE_CLASS_P (regclass) ? SSE_REGS : NO_REGS; + + if (X87_FLOAT_MODE_P (mode)) + { + if (regclass == FP_TOP_SSE_REGS) + return FP_TOP_REG; + else if (regclass == FP_SECOND_SSE_REGS) + return FP_SECOND_REG; + else + return FLOAT_CLASS_P (regclass) ? regclass : NO_REGS; + } + + return regclass; +} + +static reg_class_t +ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass, + enum machine_mode mode, + secondary_reload_info *sri ATTRIBUTE_UNUSED) +{ + /* QImode spills from non-QI registers require + intermediate register on 32bit targets. */ + if (!TARGET_64BIT + && !in_p && mode == QImode + && (rclass == GENERAL_REGS + || rclass == LEGACY_REGS + || rclass == INDEX_REGS)) + { + int regno; + + if (REG_P (x)) + regno = REGNO (x); + else + regno = -1; + + if (regno >= FIRST_PSEUDO_REGISTER || GET_CODE (x) == SUBREG) + regno = true_regnum (x); + + /* Return Q_REGS if the operand is in memory. 
*/ + if (regno == -1) + return Q_REGS; + } + + /* This condition handles corner case where an expression involving + pointers gets vectorized. We're trying to use the address of a + stack slot as a vector initializer. + + (set (reg:V2DI 74 [ vect_cst_.2 ]) + (vec_duplicate:V2DI (reg/f:DI 20 frame))) + + Eventually frame gets turned into sp+offset like this: + + (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74]) + (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp) + (const_int 392 [0x188])))) + + That later gets turned into: + + (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74]) + (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp) + (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64])))) + + We'll have the following reload recorded: + + Reload 0: reload_in (DI) = + (plus:DI (reg/f:DI 7 sp) + (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64])) + reload_out (V2DI) = (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74]) + SSE_REGS, RELOAD_OTHER (opnum = 0), can't combine + reload_in_reg: (plus:DI (reg/f:DI 7 sp) (const_int 392 [0x188])) + reload_out_reg: (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74]) + reload_reg_rtx: (reg:V2DI 22 xmm1) + + Which isn't going to work since SSE instructions can't handle scalar + additions. Returning GENERAL_REGS forces the addition into integer + register and reload can handle subsequent reloads without problems. */ + + if (in_p && GET_CODE (x) == PLUS + && SSE_CLASS_P (rclass) + && SCALAR_INT_MODE_P (mode)) + return GENERAL_REGS; + + return NO_REGS; +} + +/* Implement TARGET_CLASS_LIKELY_SPILLED_P. */ + +static bool +ix86_class_likely_spilled_p (reg_class_t rclass) +{ + switch (rclass) + { + case AREG: + case DREG: + case CREG: + case BREG: + case AD_REGS: + case SIREG: + case DIREG: + case SSE_FIRST_REG: + case FP_TOP_REG: + case FP_SECOND_REG: + return true; + + default: + break; + } + + return false; +} + +/* If we are copying between general and FP registers, we need a memory + location. The same is true for SSE and MMX registers. + + To optimize register_move_cost performance, allow inline variant. + + The macro can't work reliably when one of the CLASSES is class containing + registers from multiple units (SSE, MMX, integer). We avoid this by never + combining those units in single alternative in the machine description. + Ensure that this constraint holds to avoid unexpected surprises. + + When STRICT is false, we are being called from REGISTER_MOVE_COST, so do not + enforce these sanity checks. */ + +static inline bool +inline_secondary_memory_needed (enum reg_class class1, enum reg_class class2, + enum machine_mode mode, int strict) +{ + if (MAYBE_FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class1) + || MAYBE_FLOAT_CLASS_P (class2) != FLOAT_CLASS_P (class2) + || MAYBE_SSE_CLASS_P (class1) != SSE_CLASS_P (class1) + || MAYBE_SSE_CLASS_P (class2) != SSE_CLASS_P (class2) + || MAYBE_MMX_CLASS_P (class1) != MMX_CLASS_P (class1) + || MAYBE_MMX_CLASS_P (class2) != MMX_CLASS_P (class2)) + { + gcc_assert (!strict); + return true; + } + + if (FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class2)) + return true; + + /* ??? This is a lie. We do have moves between mmx/general, and for + mmx/sse2. But by saying we need secondary memory we discourage the + register allocator from using the mmx registers unless needed. */ + if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2)) + return true; + + if (SSE_CLASS_P (class1) != SSE_CLASS_P (class2)) + { + /* SSE1 doesn't have any direct moves from other classes. 
*/ + if (!TARGET_SSE2) + return true; + + /* If the target says that inter-unit moves are more expensive + than moving through memory, then don't generate them. */ + if (!TARGET_INTER_UNIT_MOVES) + return true; + + /* Between SSE and general, we have moves no larger than word size. */ + if (GET_MODE_SIZE (mode) > UNITS_PER_WORD) + return true; + } + + return false; +} + +bool +ix86_secondary_memory_needed (enum reg_class class1, enum reg_class class2, + enum machine_mode mode, int strict) +{ + return inline_secondary_memory_needed (class1, class2, mode, strict); +} + +/* Return true if the registers in CLASS cannot represent the change from + modes FROM to TO. */ + +bool +ix86_cannot_change_mode_class (enum machine_mode from, enum machine_mode to, + enum reg_class regclass) +{ + if (from == to) + return false; + + /* x87 registers can't do subreg at all, as all values are reformatted + to extended precision. */ + if (MAYBE_FLOAT_CLASS_P (regclass)) + return true; + + if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass)) + { + /* Vector registers do not support QI or HImode loads. If we don't + disallow a change to these modes, reload will assume it's ok to + drop the subreg from (subreg:SI (reg:HI 100) 0). This affects + the vec_dupv4hi pattern. */ + if (GET_MODE_SIZE (from) < 4) + return true; + + /* Vector registers do not support subreg with nonzero offsets, which + are otherwise valid for integer registers. Since we can't see + whether we have a nonzero offset from here, prohibit all + nonparadoxical subregs changing size. */ + if (GET_MODE_SIZE (to) < GET_MODE_SIZE (from)) + return true; + } + + return false; +} + +/* Return the cost of moving data of mode M between a + register and memory. A value of 2 is the default; this cost is + relative to those in `REGISTER_MOVE_COST'. + + This function is used extensively by register_move_cost that is used to + build tables at startup. Make it inline in this case. + When IN is 2, return maximum of in and out move cost. + + If moving between registers and memory is more expensive than + between two registers, you should define this macro to express the + relative cost. + + Model also increased moving costs of QImode registers in non + Q_REGS classes. + */ +static inline int +inline_memory_move_cost (enum machine_mode mode, enum reg_class regclass, + int in) +{ + int cost; + if (FLOAT_CLASS_P (regclass)) + { + int index; + switch (mode) + { + case SFmode: + index = 0; + break; + case DFmode: + index = 1; + break; + case XFmode: + index = 2; + break; + default: + return 100; + } + if (in == 2) + return MAX (ix86_cost->fp_load [index], ix86_cost->fp_store [index]); + return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index]; + } + if (SSE_CLASS_P (regclass)) + { + int index; + switch (GET_MODE_SIZE (mode)) + { + case 4: + index = 0; + break; + case 8: + index = 1; + break; + case 16: + index = 2; + break; + default: + return 100; + } + if (in == 2) + return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]); + return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index]; + } + if (MMX_CLASS_P (regclass)) + { + int index; + switch (GET_MODE_SIZE (mode)) + { + case 4: + index = 0; + break; + case 8: + index = 1; + break; + default: + return 100; + } + if (in) + return MAX (ix86_cost->mmx_load [index], ix86_cost->mmx_store [index]); + return in ? 
ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index]; + } + switch (GET_MODE_SIZE (mode)) + { + case 1: + if (Q_CLASS_P (regclass) || TARGET_64BIT) + { + if (!in) + return ix86_cost->int_store[0]; + if (TARGET_PARTIAL_REG_DEPENDENCY + && optimize_function_for_speed_p (cfun)) + cost = ix86_cost->movzbl_load; + else + cost = ix86_cost->int_load[0]; + if (in == 2) + return MAX (cost, ix86_cost->int_store[0]); + return cost; + } + else + { + if (in == 2) + return MAX (ix86_cost->movzbl_load, ix86_cost->int_store[0] + 4); + if (in) + return ix86_cost->movzbl_load; + else + return ix86_cost->int_store[0] + 4; + } + break; + case 2: + if (in == 2) + return MAX (ix86_cost->int_load[1], ix86_cost->int_store[1]); + return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1]; + default: + /* Compute number of 32bit moves needed. TFmode is moved as XFmode. */ + if (mode == TFmode) + mode = XFmode; + if (in == 2) + cost = MAX (ix86_cost->int_load[2] , ix86_cost->int_store[2]); + else if (in) + cost = ix86_cost->int_load[2]; + else + cost = ix86_cost->int_store[2]; + return (cost * (((int) GET_MODE_SIZE (mode) + + UNITS_PER_WORD - 1) / UNITS_PER_WORD)); + } +} + +static int +ix86_memory_move_cost (enum machine_mode mode, reg_class_t regclass, + bool in) +{ + return inline_memory_move_cost (mode, (enum reg_class) regclass, in ? 1 : 0); +} + + +/* Return the cost of moving data from a register in class CLASS1 to + one in class CLASS2. + + It is not required that the cost always equal 2 when FROM is the same as TO; + on some machines it is expensive to move between registers if they are not + general registers. */ + +static int +ix86_register_move_cost (enum machine_mode mode, reg_class_t class1_i, + reg_class_t class2_i) +{ + enum reg_class class1 = (enum reg_class) class1_i; + enum reg_class class2 = (enum reg_class) class2_i; + + /* In case we require secondary memory, compute cost of the store followed + by load. In order to avoid bad register allocation choices, we need + for this to be *at least* as high as the symmetric MEMORY_MOVE_COST. */ + + if (inline_secondary_memory_needed (class1, class2, mode, 0)) + { + int cost = 1; + + cost += inline_memory_move_cost (mode, class1, 2); + cost += inline_memory_move_cost (mode, class2, 2); + + /* In case of copying from general_purpose_register we may emit multiple + stores followed by single load causing memory size mismatch stall. + Count this as arbitrarily high cost of 20. */ + if (CLASS_MAX_NREGS (class1, mode) > CLASS_MAX_NREGS (class2, mode)) + cost += 20; + + /* In the case of FP/MMX moves, the registers actually overlap, and we + have to switch modes in order to treat them differently. */ + if ((MMX_CLASS_P (class1) && MAYBE_FLOAT_CLASS_P (class2)) + || (MMX_CLASS_P (class2) && MAYBE_FLOAT_CLASS_P (class1))) + cost += 20; + + return cost; + } + + /* Moves between SSE/MMX and integer unit are expensive. */ + if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2) + || SSE_CLASS_P (class1) != SSE_CLASS_P (class2)) + + /* ??? By keeping returned value relatively high, we limit the number + of moves between integer and MMX/SSE registers for all targets. + Additionally, high value prevents problem with x86_modes_tieable_p(), + where integer modes in MMX/SSE registers are not tieable + because of missing QImode and HImode moves to, from or between + MMX/SSE registers. 
*/ + return MAX (8, ix86_cost->mmxsse_to_integer); + + if (MAYBE_FLOAT_CLASS_P (class1)) + return ix86_cost->fp_move; + if (MAYBE_SSE_CLASS_P (class1)) + return ix86_cost->sse_move; + if (MAYBE_MMX_CLASS_P (class1)) + return ix86_cost->mmx_move; + return 2; +} + +/* Return 1 if hard register REGNO can hold a value of machine-mode MODE. */ + +bool +ix86_hard_regno_mode_ok (int regno, enum machine_mode mode) +{ + /* Flags and only flags can only hold CCmode values. */ + if (CC_REGNO_P (regno)) + return GET_MODE_CLASS (mode) == MODE_CC; + if (GET_MODE_CLASS (mode) == MODE_CC + || GET_MODE_CLASS (mode) == MODE_RANDOM + || GET_MODE_CLASS (mode) == MODE_PARTIAL_INT) + return 0; + if (FP_REGNO_P (regno)) + return VALID_FP_MODE_P (mode); + if (SSE_REGNO_P (regno)) + { + /* We implement the move patterns for all vector modes into and + out of SSE registers, even when no operation instructions + are available. OImode move is available only when AVX is + enabled. */ + return ((TARGET_AVX && mode == OImode) + || VALID_AVX256_REG_MODE (mode) + || VALID_SSE_REG_MODE (mode) + || VALID_SSE2_REG_MODE (mode) + || VALID_MMX_REG_MODE (mode) + || VALID_MMX_REG_MODE_3DNOW (mode)); + } + if (MMX_REGNO_P (regno)) + { + /* We implement the move patterns for 3DNOW modes even in MMX mode, + so if the register is available at all, then we can move data of + the given mode into or out of it. */ + return (VALID_MMX_REG_MODE (mode) + || VALID_MMX_REG_MODE_3DNOW (mode)); + } + + if (mode == QImode) + { + /* Take care for QImode values - they can be in non-QI regs, + but then they do cause partial register stalls. */ + if (regno <= BX_REG || TARGET_64BIT) + return 1; + if (!TARGET_PARTIAL_REG_STALL) + return 1; + return reload_in_progress || reload_completed; + } + /* We handle both integer and floats in the general purpose registers. */ + else if (VALID_INT_MODE_P (mode)) + return 1; + else if (VALID_FP_MODE_P (mode)) + return 1; + else if (VALID_DFP_MODE_P (mode)) + return 1; + /* Lots of MMX code casts 8 byte vector modes to DImode. If we then go + on to use that value in smaller contexts, this can easily force a + pseudo to be allocated to GENERAL_REGS. Since this is no worse than + supporting DImode, allow it. */ + else if (VALID_MMX_REG_MODE_3DNOW (mode) || VALID_MMX_REG_MODE (mode)) + return 1; + + return 0; +} + +/* A subroutine of ix86_modes_tieable_p. Return true if MODE is a + tieable integer mode. */ + +static bool +ix86_tieable_integer_mode_p (enum machine_mode mode) +{ + switch (mode) + { + case HImode: + case SImode: + return true; + + case QImode: + return TARGET_64BIT || !TARGET_PARTIAL_REG_STALL; + + case DImode: + return TARGET_64BIT; + + default: + return false; + } +} + +/* Return true if MODE1 is accessible in a register that can hold MODE2 + without copying. That is, all register classes that can hold MODE2 + can also hold MODE1. */ + +bool +ix86_modes_tieable_p (enum machine_mode mode1, enum machine_mode mode2) +{ + if (mode1 == mode2) + return true; + + if (ix86_tieable_integer_mode_p (mode1) + && ix86_tieable_integer_mode_p (mode2)) + return true; + + /* MODE2 being XFmode implies fp stack or general regs, which means we + can tie any smaller floating point modes to it. Note that we do not + tie this with TFmode. */ + if (mode2 == XFmode) + return mode1 == SFmode || mode1 == DFmode; + + /* MODE2 being DFmode implies fp stack, general or sse regs, which means + that we can tie it with SFmode. 
*/ + if (mode2 == DFmode) + return mode1 == SFmode; + + /* If MODE2 is only appropriate for an SSE register, then tie with + any other mode acceptable to SSE registers. */ + if (GET_MODE_SIZE (mode2) == 16 + && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2)) + return (GET_MODE_SIZE (mode1) == 16 + && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1)); + + /* If MODE2 is appropriate for an MMX register, then tie + with any other mode acceptable to MMX registers. */ + if (GET_MODE_SIZE (mode2) == 8 + && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode2)) + return (GET_MODE_SIZE (mode1) == 8 + && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode1)); + + return false; +} + +/* Compute a (partial) cost for rtx X. Return true if the complete + cost has been computed, and false if subexpressions should be + scanned. In either case, *TOTAL contains the cost result. */ + +static bool +ix86_rtx_costs (rtx x, int code, int outer_code_i, int *total, bool speed) +{ + enum rtx_code outer_code = (enum rtx_code) outer_code_i; + enum machine_mode mode = GET_MODE (x); + const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost; + + switch (code) + { + case CONST_INT: + case CONST: + case LABEL_REF: + case SYMBOL_REF: + if (TARGET_64BIT && !x86_64_immediate_operand (x, VOIDmode)) + *total = 3; + else if (TARGET_64BIT && !x86_64_zext_immediate_operand (x, VOIDmode)) + *total = 2; + else if (flag_pic && SYMBOLIC_CONST (x) + && (!TARGET_64BIT + || (!GET_CODE (x) != LABEL_REF + && (GET_CODE (x) != SYMBOL_REF + || !SYMBOL_REF_LOCAL_P (x))))) + *total = 1; + else + *total = 0; + return true; + + case CONST_DOUBLE: + if (mode == VOIDmode) + *total = 0; + else + switch (standard_80387_constant_p (x)) + { + case 1: /* 0.0 */ + *total = 1; + break; + default: /* Other constants */ + *total = 2; + break; + case 0: + case -1: + /* Start with (MEM (SYMBOL_REF)), since that's where + it'll probably end up. Add a penalty for size. */ + *total = (COSTS_N_INSNS (1) + + (flag_pic != 0 && !TARGET_64BIT) + + (mode == SFmode ? 0 : mode == DFmode ? 1 : 2)); + break; + } + return true; + + case ZERO_EXTEND: + /* The zero extensions is often completely free on x86_64, so make + it as cheap as possible. */ + if (TARGET_64BIT && mode == DImode + && GET_MODE (XEXP (x, 0)) == SImode) + *total = 1; + else if (TARGET_ZERO_EXTEND_WITH_AND) + *total = cost->add; + else + *total = cost->movzx; + return false; + + case SIGN_EXTEND: + *total = cost->movsx; + return false; + + case ASHIFT: + if (CONST_INT_P (XEXP (x, 1)) + && (GET_MODE (XEXP (x, 0)) != DImode || TARGET_64BIT)) + { + HOST_WIDE_INT value = INTVAL (XEXP (x, 1)); + if (value == 1) + { + *total = cost->add; + return false; + } + if ((value == 2 || value == 3) + && cost->lea <= cost->shift_const) + { + *total = cost->lea; + return false; + } + } + /* FALLTHRU */ + + case ROTATE: + case ASHIFTRT: + case LSHIFTRT: + case ROTATERT: + if (!TARGET_64BIT && GET_MODE (XEXP (x, 0)) == DImode) + { + if (CONST_INT_P (XEXP (x, 1))) + { + if (INTVAL (XEXP (x, 1)) > 32) + *total = cost->shift_const + COSTS_N_INSNS (2); + else + *total = cost->shift_const * 2; + } + else + { + if (GET_CODE (XEXP (x, 1)) == AND) + *total = cost->shift_var * 2; + else + *total = cost->shift_var * 6 + COSTS_N_INSNS (2); + } + } + else + { + if (CONST_INT_P (XEXP (x, 1))) + *total = cost->shift_const; + else if (GET_CODE (XEXP (x, 1)) == SUBREG + && GET_CODE (XEXP (XEXP (x, 1), 0)) == AND) + { + /* Return the cost after shift-and truncation. 
*/ + *total = cost->shift_var; + return true; + } + else + *total = cost->shift_var; + } + return false; + + case FMA: + { + rtx sub; + + gcc_assert (FLOAT_MODE_P (mode)); + gcc_assert (TARGET_FMA || TARGET_FMA4); + + /* ??? SSE scalar/vector cost should be used here. */ + /* ??? Bald assumption that fma has the same cost as fmul. */ + *total = cost->fmul; + *total += rtx_cost (XEXP (x, 1), FMA, speed); + + /* Negate in op0 or op2 is free: FMS, FNMA, FNMS. */ + sub = XEXP (x, 0); + if (GET_CODE (sub) == NEG) + sub = XEXP (sub, 0); + *total += rtx_cost (sub, FMA, speed); + + sub = XEXP (x, 2); + if (GET_CODE (sub) == NEG) + sub = XEXP (sub, 0); + *total += rtx_cost (sub, FMA, speed); + return true; + } + + case MULT: + if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) + { + /* ??? SSE scalar cost should be used here. */ + *total = cost->fmul; + return false; + } + else if (X87_FLOAT_MODE_P (mode)) + { + *total = cost->fmul; + return false; + } + else if (FLOAT_MODE_P (mode)) + { + /* ??? SSE vector cost should be used here. */ + *total = cost->fmul; + return false; + } + else + { + rtx op0 = XEXP (x, 0); + rtx op1 = XEXP (x, 1); + int nbits; + if (CONST_INT_P (XEXP (x, 1))) + { + unsigned HOST_WIDE_INT value = INTVAL (XEXP (x, 1)); + for (nbits = 0; value != 0; value &= value - 1) + nbits++; + } + else + /* This is arbitrary. */ + nbits = 7; + + /* Compute costs correctly for widening multiplication. */ + if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND) + && GET_MODE_SIZE (GET_MODE (XEXP (op0, 0))) * 2 + == GET_MODE_SIZE (mode)) + { + int is_mulwiden = 0; + enum machine_mode inner_mode = GET_MODE (op0); + + if (GET_CODE (op0) == GET_CODE (op1)) + is_mulwiden = 1, op1 = XEXP (op1, 0); + else if (CONST_INT_P (op1)) + { + if (GET_CODE (op0) == SIGN_EXTEND) + is_mulwiden = trunc_int_for_mode (INTVAL (op1), inner_mode) + == INTVAL (op1); + else + is_mulwiden = !(INTVAL (op1) & ~GET_MODE_MASK (inner_mode)); + } + + if (is_mulwiden) + op0 = XEXP (op0, 0), mode = GET_MODE (op0); + } + + *total = (cost->mult_init[MODE_INDEX (mode)] + + nbits * cost->mult_bit + + rtx_cost (op0, outer_code, speed) + rtx_cost (op1, outer_code, speed)); + + return true; + } + + case DIV: + case UDIV: + case MOD: + case UMOD: + if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) + /* ??? SSE cost should be used here. */ + *total = cost->fdiv; + else if (X87_FLOAT_MODE_P (mode)) + *total = cost->fdiv; + else if (FLOAT_MODE_P (mode)) + /* ??? SSE vector cost should be used here. 
*/ + *total = cost->fdiv; + else + *total = cost->divide[MODE_INDEX (mode)]; + return false; + + case PLUS: + if (GET_MODE_CLASS (mode) == MODE_INT + && GET_MODE_BITSIZE (mode) <= GET_MODE_BITSIZE (Pmode)) + { + if (GET_CODE (XEXP (x, 0)) == PLUS + && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT + && CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 0), 1)) + && CONSTANT_P (XEXP (x, 1))) + { + HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (x, 0), 0), 1)); + if (val == 2 || val == 4 || val == 8) + { + *total = cost->lea; + *total += rtx_cost (XEXP (XEXP (x, 0), 1), outer_code, speed); + *total += rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0), + outer_code, speed); + *total += rtx_cost (XEXP (x, 1), outer_code, speed); + return true; + } + } + else if (GET_CODE (XEXP (x, 0)) == MULT + && CONST_INT_P (XEXP (XEXP (x, 0), 1))) + { + HOST_WIDE_INT val = INTVAL (XEXP (XEXP (x, 0), 1)); + if (val == 2 || val == 4 || val == 8) + { + *total = cost->lea; + *total += rtx_cost (XEXP (XEXP (x, 0), 0), outer_code, speed); + *total += rtx_cost (XEXP (x, 1), outer_code, speed); + return true; + } + } + else if (GET_CODE (XEXP (x, 0)) == PLUS) + { + *total = cost->lea; + *total += rtx_cost (XEXP (XEXP (x, 0), 0), outer_code, speed); + *total += rtx_cost (XEXP (XEXP (x, 0), 1), outer_code, speed); + *total += rtx_cost (XEXP (x, 1), outer_code, speed); + return true; + } + } + /* FALLTHRU */ + + case MINUS: + if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) + { + /* ??? SSE cost should be used here. */ + *total = cost->fadd; + return false; + } + else if (X87_FLOAT_MODE_P (mode)) + { + *total = cost->fadd; + return false; + } + else if (FLOAT_MODE_P (mode)) + { + /* ??? SSE vector cost should be used here. */ + *total = cost->fadd; + return false; + } + /* FALLTHRU */ + + case AND: + case IOR: + case XOR: + if (!TARGET_64BIT && mode == DImode) + { + *total = (cost->add * 2 + + (rtx_cost (XEXP (x, 0), outer_code, speed) + << (GET_MODE (XEXP (x, 0)) != DImode)) + + (rtx_cost (XEXP (x, 1), outer_code, speed) + << (GET_MODE (XEXP (x, 1)) != DImode))); + return true; + } + /* FALLTHRU */ + + case NEG: + if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) + { + /* ??? SSE cost should be used here. */ + *total = cost->fchs; + return false; + } + else if (X87_FLOAT_MODE_P (mode)) + { + *total = cost->fchs; + return false; + } + else if (FLOAT_MODE_P (mode)) + { + /* ??? SSE vector cost should be used here. */ + *total = cost->fchs; + return false; + } + /* FALLTHRU */ + + case NOT: + if (!TARGET_64BIT && mode == DImode) + *total = cost->add * 2; + else + *total = cost->add; + return false; + + case COMPARE: + if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT + && XEXP (XEXP (x, 0), 1) == const1_rtx + && CONST_INT_P (XEXP (XEXP (x, 0), 2)) + && XEXP (x, 1) == const0_rtx) + { + /* This kind of construct is implemented using test[bwl]. + Treat it as if we had an AND. */ + *total = (cost->add + + rtx_cost (XEXP (XEXP (x, 0), 0), outer_code, speed) + + rtx_cost (const1_rtx, outer_code, speed)); + return true; + } + return false; + + case FLOAT_EXTEND: + if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)) + *total = 0; + return false; + + case ABS: + if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) + /* ??? SSE cost should be used here. */ + *total = cost->fabs; + else if (X87_FLOAT_MODE_P (mode)) + *total = cost->fabs; + else if (FLOAT_MODE_P (mode)) + /* ??? SSE vector cost should be used here. */ + *total = cost->fabs; + return false; + + case SQRT: + if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) + /* ??? SSE cost should be used here. 
*/ + *total = cost->fsqrt; + else if (X87_FLOAT_MODE_P (mode)) + *total = cost->fsqrt; + else if (FLOAT_MODE_P (mode)) + /* ??? SSE vector cost should be used here. */ + *total = cost->fsqrt; + return false; + + case UNSPEC: + if (XINT (x, 1) == UNSPEC_TP) + *total = 0; + return false; + + case VEC_SELECT: + case VEC_CONCAT: + case VEC_MERGE: + case VEC_DUPLICATE: + /* ??? Assume all of these vector manipulation patterns are + recognizable. In which case they all pretty much have the + same cost. */ + *total = COSTS_N_INSNS (1); + return true; + + default: + return false; + } +} + +#if TARGET_MACHO + +static int current_machopic_label_num; + +/* Given a symbol name and its associated stub, write out the + definition of the stub. */ + +void +machopic_output_stub (FILE *file, const char *symb, const char *stub) +{ + unsigned int length; + char *binder_name, *symbol_name, lazy_ptr_name[32]; + int label = ++current_machopic_label_num; + + /* For 64-bit we shouldn't get here. */ + gcc_assert (!TARGET_64BIT); + + /* Lose our funky encoding stuff so it doesn't contaminate the stub. */ + symb = targetm.strip_name_encoding (symb); + + length = strlen (stub); + binder_name = XALLOCAVEC (char, length + 32); + GEN_BINDER_NAME_FOR_STUB (binder_name, stub, length); + + length = strlen (symb); + symbol_name = XALLOCAVEC (char, length + 32); + GEN_SYMBOL_NAME_FOR_SYMBOL (symbol_name, symb, length); + + sprintf (lazy_ptr_name, "L%d$lz", label); + + if (MACHOPIC_ATT_STUB) + switch_to_section (darwin_sections[machopic_picsymbol_stub3_section]); + else if (MACHOPIC_PURE) + { + if (TARGET_DEEP_BRANCH_PREDICTION) + switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]); + else + switch_to_section (darwin_sections[machopic_picsymbol_stub_section]); + } + else + switch_to_section (darwin_sections[machopic_symbol_stub_section]); + + fprintf (file, "%s:\n", stub); + fprintf (file, "\t.indirect_symbol %s\n", symbol_name); + + if (MACHOPIC_ATT_STUB) + { + fprintf (file, "\thlt ; hlt ; hlt ; hlt ; hlt\n"); + } + else if (MACHOPIC_PURE) + { + /* PIC stub. */ + if (TARGET_DEEP_BRANCH_PREDICTION) + { + /* 25-byte PIC stub using "CALL get_pc_thunk". */ + rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */); + output_set_got (tmp, NULL_RTX); /* "CALL ___.get_pc_thunk.cx". */ + fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n", label, lazy_ptr_name, label); + } + else + { + /* 26-byte PIC stub using inline picbase: "CALL L42 ! L42: pop %eax". */ + fprintf (file, "\tcall LPC$%d\nLPC$%d:\tpopl %%ecx\n", label, label); + fprintf (file, "\tmovl %s-LPC$%d(%%ecx),%%ecx\n", lazy_ptr_name, label); + } + fprintf (file, "\tjmp\t*%%ecx\n"); + } + else + fprintf (file, "\tjmp\t*%s\n", lazy_ptr_name); + + /* The AT&T-style ("self-modifying") stub is not lazily bound, thus + it needs no stub-binding-helper. */ + if (MACHOPIC_ATT_STUB) + return; + + fprintf (file, "%s:\n", binder_name); + + if (MACHOPIC_PURE) + { + fprintf (file, "\tlea\t%s-%s(%%ecx),%%ecx\n", lazy_ptr_name, binder_name); + fprintf (file, "\tpushl\t%%ecx\n"); + } + else + fprintf (file, "\tpushl\t$%s\n", lazy_ptr_name); + + fputs ("\tjmp\tdyld_stub_binding_helper\n", file); + + /* N.B. Keep the correspondence of these + 'symbol_ptr/symbol_ptr2/symbol_ptr3' sections consistent with the + old-pic/new-pic/non-pic stubs; altering this will break + compatibility with existing dylibs. */ + if (MACHOPIC_PURE) + { + /* PIC stubs. */ + if (TARGET_DEEP_BRANCH_PREDICTION) + /* 25-byte PIC stub using "CALL get_pc_thunk". 
*/ + switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]); + else + /* 26-byte PIC stub using inline picbase: "CALL L42 ! L42: pop %ebx". */ + switch_to_section (darwin_sections[machopic_lazy_symbol_ptr_section]); + } + else + /* 16-byte -mdynamic-no-pic stub. */ + switch_to_section(darwin_sections[machopic_lazy_symbol_ptr3_section]); + + fprintf (file, "%s:\n", lazy_ptr_name); + fprintf (file, "\t.indirect_symbol %s\n", symbol_name); + fprintf (file, ASM_LONG "%s\n", binder_name); +} +#endif /* TARGET_MACHO */ + +/* Order the registers for register allocator. */ + +void +x86_order_regs_for_local_alloc (void) +{ + int pos = 0; + int i; + + /* First allocate the local general purpose registers. */ + for (i = 0; i < FIRST_PSEUDO_REGISTER; i++) + if (GENERAL_REGNO_P (i) && call_used_regs[i]) + reg_alloc_order [pos++] = i; + + /* Global general purpose registers. */ + for (i = 0; i < FIRST_PSEUDO_REGISTER; i++) + if (GENERAL_REGNO_P (i) && !call_used_regs[i]) + reg_alloc_order [pos++] = i; + + /* x87 registers come first in case we are doing FP math + using them. */ + if (!TARGET_SSE_MATH) + for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++) + reg_alloc_order [pos++] = i; + + /* SSE registers. */ + for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++) + reg_alloc_order [pos++] = i; + for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++) + reg_alloc_order [pos++] = i; + + /* x87 registers. */ + if (TARGET_SSE_MATH) + for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++) + reg_alloc_order [pos++] = i; + + for (i = FIRST_MMX_REG; i <= LAST_MMX_REG; i++) + reg_alloc_order [pos++] = i; + + /* Initialize the rest of array as we do not allocate some registers + at all. */ + while (pos < FIRST_PSEUDO_REGISTER) + reg_alloc_order [pos++] = 0; +} + +/* Handle a "callee_pop_aggregate_return" attribute; arguments as + in struct attribute_spec handler. */ +static tree +ix86_handle_callee_pop_aggregate_return (tree *node, tree name, + tree args, + int flags ATTRIBUTE_UNUSED, + bool *no_add_attrs) +{ + if (TREE_CODE (*node) != FUNCTION_TYPE + && TREE_CODE (*node) != METHOD_TYPE + && TREE_CODE (*node) != FIELD_DECL + && TREE_CODE (*node) != TYPE_DECL) + { + warning (OPT_Wattributes, "%qE attribute only applies to functions", + name); + *no_add_attrs = true; + return NULL_TREE; + } + if (TARGET_64BIT) + { + warning (OPT_Wattributes, "%qE attribute only available for 32-bit", + name); + *no_add_attrs = true; + return NULL_TREE; + } + if (is_attribute_p ("callee_pop_aggregate_return", name)) + { + tree cst; + + cst = TREE_VALUE (args); + if (TREE_CODE (cst) != INTEGER_CST) + { + warning (OPT_Wattributes, + "%qE attribute requires an integer constant argument", + name); + *no_add_attrs = true; + } + else if (compare_tree_int (cst, 0) != 0 + && compare_tree_int (cst, 1) != 0) + { + warning (OPT_Wattributes, + "argument to %qE attribute is neither zero, nor one", + name); + *no_add_attrs = true; + } + + return NULL_TREE; + } + + return NULL_TREE; +} + +/* Handle a "ms_abi" or "sysv" attribute; arguments as in + struct attribute_spec.handler. 
*/ +static tree +ix86_handle_abi_attribute (tree *node, tree name, + tree args ATTRIBUTE_UNUSED, + int flags ATTRIBUTE_UNUSED, bool *no_add_attrs) +{ + if (TREE_CODE (*node) != FUNCTION_TYPE + && TREE_CODE (*node) != METHOD_TYPE + && TREE_CODE (*node) != FIELD_DECL + && TREE_CODE (*node) != TYPE_DECL) + { + warning (OPT_Wattributes, "%qE attribute only applies to functions", + name); + *no_add_attrs = true; + return NULL_TREE; + } + if (!TARGET_64BIT) + { + warning (OPT_Wattributes, "%qE attribute only available for 64-bit", + name); + *no_add_attrs = true; + return NULL_TREE; + } + + /* Can combine regparm with all attributes but fastcall. */ + if (is_attribute_p ("ms_abi", name)) + { + if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (*node))) + { + error ("ms_abi and sysv_abi attributes are not compatible"); + } + + return NULL_TREE; + } + else if (is_attribute_p ("sysv_abi", name)) + { + if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (*node))) + { + error ("ms_abi and sysv_abi attributes are not compatible"); + } + + return NULL_TREE; + } + + return NULL_TREE; +} + +/* Handle a "ms_struct" or "gcc_struct" attribute; arguments as in + struct attribute_spec.handler. */ +static tree +ix86_handle_struct_attribute (tree *node, tree name, + tree args ATTRIBUTE_UNUSED, + int flags ATTRIBUTE_UNUSED, bool *no_add_attrs) +{ + tree *type = NULL; + if (DECL_P (*node)) + { + if (TREE_CODE (*node) == TYPE_DECL) + type = &TREE_TYPE (*node); + } + else + type = node; + + if (!(type && (TREE_CODE (*type) == RECORD_TYPE + || TREE_CODE (*type) == UNION_TYPE))) + { + warning (OPT_Wattributes, "%qE attribute ignored", + name); + *no_add_attrs = true; + } + + else if ((is_attribute_p ("ms_struct", name) + && lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (*type))) + || ((is_attribute_p ("gcc_struct", name) + && lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (*type))))) + { + warning (OPT_Wattributes, "%qE incompatible attribute ignored", + name); + *no_add_attrs = true; + } + + return NULL_TREE; +} + +static tree +ix86_handle_fndecl_attribute (tree *node, tree name, + tree args ATTRIBUTE_UNUSED, + int flags ATTRIBUTE_UNUSED, bool *no_add_attrs) +{ + if (TREE_CODE (*node) != FUNCTION_DECL) + { + warning (OPT_Wattributes, "%qE attribute only applies to functions", + name); + *no_add_attrs = true; + } + return NULL_TREE; +} + +static bool +ix86_ms_bitfield_layout_p (const_tree record_type) +{ + return ((TARGET_MS_BITFIELD_LAYOUT + && !lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (record_type))) + || lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (record_type))); +} + +/* Returns an expression indicating where the this parameter is + located on entry to the FUNCTION. */ + +static rtx +x86_this_parameter (tree function) +{ + tree type = TREE_TYPE (function); + bool aggr = aggregate_value_p (TREE_TYPE (type), type) != 0; + int nregs; + + if (TARGET_64BIT) + { + const int *parm_regs; + + if (ix86_function_type_abi (type) == MS_ABI) + parm_regs = x86_64_ms_abi_int_parameter_registers; + else + parm_regs = x86_64_int_parameter_registers; + return gen_rtx_REG (DImode, parm_regs[aggr]); + } + + nregs = ix86_function_regparm (type, function); + + if (nregs > 0 && !stdarg_p (type)) + { + int regno; + + if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (type))) + regno = aggr ? 
DX_REG : CX_REG; + else if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (type))) + { + regno = CX_REG; + if (aggr) + return gen_rtx_MEM (SImode, + plus_constant (stack_pointer_rtx, 4)); + } + else + { + regno = AX_REG; + if (aggr) + { + regno = DX_REG; + if (nregs == 1) + return gen_rtx_MEM (SImode, + plus_constant (stack_pointer_rtx, 4)); + } + } + return gen_rtx_REG (SImode, regno); + } + + return gen_rtx_MEM (SImode, plus_constant (stack_pointer_rtx, aggr ? 8 : 4)); +} + +/* Determine whether x86_output_mi_thunk can succeed. */ + +static bool +x86_can_output_mi_thunk (const_tree thunk ATTRIBUTE_UNUSED, + HOST_WIDE_INT delta ATTRIBUTE_UNUSED, + HOST_WIDE_INT vcall_offset, const_tree function) +{ + /* 64-bit can handle anything. */ + if (TARGET_64BIT) + return true; + + /* For 32-bit, everything's fine if we have one free register. */ + if (ix86_function_regparm (TREE_TYPE (function), function) < 3) + return true; + + /* Need a free register for vcall_offset. */ + if (vcall_offset) + return false; + + /* Need a free register for GOT references. */ + if (flag_pic && !targetm.binds_local_p (function)) + return false; + + /* Otherwise ok. */ + return true; +} + +/* Output the assembler code for a thunk function. THUNK_DECL is the + declaration for the thunk function itself, FUNCTION is the decl for + the target function. DELTA is an immediate constant offset to be + added to THIS. If VCALL_OFFSET is nonzero, the word at + *(*this + vcall_offset) should be added to THIS. */ + +static void +x86_output_mi_thunk (FILE *file, + tree thunk ATTRIBUTE_UNUSED, HOST_WIDE_INT delta, + HOST_WIDE_INT vcall_offset, tree function) +{ + rtx xops[3]; + rtx this_param = x86_this_parameter (function); + rtx this_reg, tmp; + + /* Make sure unwind info is emitted for the thunk if needed. */ + final_start_function (emit_barrier (), file, 1); + + /* If VCALL_OFFSET, we'll need THIS in a register. Might as well + pull it in now and let DELTA benefit. */ + if (REG_P (this_param)) + this_reg = this_param; + else if (vcall_offset) + { + /* Put the this parameter into %eax. */ + xops[0] = this_param; + xops[1] = this_reg = gen_rtx_REG (Pmode, AX_REG); + output_asm_insn ("mov%z1\t{%0, %1|%1, %0}", xops); + } + else + this_reg = NULL_RTX; + + /* Adjust the this parameter by a fixed constant. */ + if (delta) + { + xops[0] = GEN_INT (delta); + xops[1] = this_reg ? this_reg : this_param; + if (TARGET_64BIT) + { + if (!x86_64_general_operand (xops[0], DImode)) + { + tmp = gen_rtx_REG (DImode, R10_REG); + xops[1] = tmp; + output_asm_insn ("mov{q}\t{%1, %0|%0, %1}", xops); + xops[0] = tmp; + xops[1] = this_param; + } + if (x86_maybe_negate_const_int (&xops[0], DImode)) + output_asm_insn ("sub{q}\t{%0, %1|%1, %0}", xops); + else + output_asm_insn ("add{q}\t{%0, %1|%1, %0}", xops); + } + else if (x86_maybe_negate_const_int (&xops[0], SImode)) + output_asm_insn ("sub{l}\t{%0, %1|%1, %0}", xops); + else + output_asm_insn ("add{l}\t{%0, %1|%1, %0}", xops); + } + + /* Adjust the this parameter by a value stored in the vtable. */ + if (vcall_offset) + { + if (TARGET_64BIT) + tmp = gen_rtx_REG (DImode, R10_REG); + else + { + int tmp_regno = CX_REG; + if (lookup_attribute ("fastcall", + TYPE_ATTRIBUTES (TREE_TYPE (function))) + || lookup_attribute ("thiscall", + TYPE_ATTRIBUTES (TREE_TYPE (function)))) + tmp_regno = AX_REG; + tmp = gen_rtx_REG (SImode, tmp_regno); + } + + xops[0] = gen_rtx_MEM (Pmode, this_reg); + xops[1] = tmp; + output_asm_insn ("mov%z1\t{%0, %1|%1, %0}", xops); + + /* Adjust the this parameter. 
*/ + xops[0] = gen_rtx_MEM (Pmode, plus_constant (tmp, vcall_offset)); + if (TARGET_64BIT && !memory_operand (xops[0], Pmode)) + { + rtx tmp2 = gen_rtx_REG (DImode, R11_REG); + xops[0] = GEN_INT (vcall_offset); + xops[1] = tmp2; + output_asm_insn ("mov{q}\t{%0, %1|%1, %0}", xops); + xops[0] = gen_rtx_MEM (Pmode, gen_rtx_PLUS (Pmode, tmp, tmp2)); + } + xops[1] = this_reg; + output_asm_insn ("add%z1\t{%0, %1|%1, %0}", xops); + } + + /* If necessary, drop THIS back to its stack slot. */ + if (this_reg && this_reg != this_param) + { + xops[0] = this_reg; + xops[1] = this_param; + output_asm_insn ("mov%z1\t{%0, %1|%1, %0}", xops); + } + + xops[0] = XEXP (DECL_RTL (function), 0); + if (TARGET_64BIT) + { + if (!flag_pic || targetm.binds_local_p (function) + || DEFAULT_ABI == MS_ABI) + output_asm_insn ("jmp\t%P0", xops); + /* All thunks should be in the same object as their target, + and thus binds_local_p should be true. */ + else if (TARGET_64BIT && cfun->machine->call_abi == MS_ABI) + gcc_unreachable (); + else + { + tmp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, xops[0]), UNSPEC_GOTPCREL); + tmp = gen_rtx_CONST (Pmode, tmp); + tmp = gen_rtx_MEM (QImode, tmp); + xops[0] = tmp; + output_asm_insn ("jmp\t%A0", xops); + } + } + else + { + if (!flag_pic || targetm.binds_local_p (function)) + output_asm_insn ("jmp\t%P0", xops); + else +#if TARGET_MACHO + if (TARGET_MACHO) + { + rtx sym_ref = XEXP (DECL_RTL (function), 0); + if (TARGET_MACHO_BRANCH_ISLANDS) + sym_ref = (gen_rtx_SYMBOL_REF + (Pmode, + machopic_indirection_name (sym_ref, /*stub_p=*/true))); + tmp = gen_rtx_MEM (QImode, sym_ref); + xops[0] = tmp; + output_asm_insn ("jmp\t%0", xops); + } + else +#endif /* TARGET_MACHO */ + { + tmp = gen_rtx_REG (SImode, CX_REG); + output_set_got (tmp, NULL_RTX); + + xops[1] = tmp; + output_asm_insn ("mov{l}\t{%0@GOT(%1), %1|%1, %0@GOT[%1]}", xops); + output_asm_insn ("jmp\t{*}%1", xops); + } + } + final_end_function (); +} + +static void +x86_file_start (void) +{ + default_file_start (); +#if TARGET_MACHO + darwin_file_start (); +#endif + if (X86_FILE_START_VERSION_DIRECTIVE) + fputs ("\t.version\t\"01.01\"\n", asm_out_file); + if (X86_FILE_START_FLTUSED) + fputs ("\t.global\t__fltused\n", asm_out_file); + if (ix86_asm_dialect == ASM_INTEL) + fputs ("\t.intel_syntax noprefix\n", asm_out_file); +} + +int +x86_field_alignment (tree field, int computed) +{ + enum machine_mode mode; + tree type = TREE_TYPE (field); + + if (TARGET_64BIT || TARGET_ALIGN_DOUBLE) + return computed; + mode = TYPE_MODE (strip_array_types (type)); + if (mode == DFmode || mode == DCmode + || GET_MODE_CLASS (mode) == MODE_INT + || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT) + return MIN (32, computed); + return computed; +} + +/* Output assembler code to FILE to increment profiler label # LABELNO + for profiling a function entry. */ +void +x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED) +{ + const char *mcount_name = (flag_fentry ? 
MCOUNT_NAME_BEFORE_PROLOGUE + : MCOUNT_NAME); + + if (TARGET_64BIT) + { +#ifndef NO_PROFILE_COUNTERS + fprintf (file, "\tleaq\t%sP%d(%%rip),%%r11\n", LPREFIX, labelno); +#endif + + if (DEFAULT_ABI == SYSV_ABI && flag_pic) + fprintf (file, "\tcall\t*%s@GOTPCREL(%%rip)\n", mcount_name); + else + fprintf (file, "\tcall\t%s\n", mcount_name); + } + else if (flag_pic) + { +#ifndef NO_PROFILE_COUNTERS + fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%" PROFILE_COUNT_REGISTER "\n", + LPREFIX, labelno); +#endif + fprintf (file, "\tcall\t*%s@GOT(%%ebx)\n", mcount_name); + } + else + { +#ifndef NO_PROFILE_COUNTERS + fprintf (file, "\tmovl\t$%sP%d,%%" PROFILE_COUNT_REGISTER "\n", + LPREFIX, labelno); +#endif + fprintf (file, "\tcall\t%s\n", mcount_name); + } +} + +/* We don't have exact information about the insn sizes, but we may assume + quite safely that we are informed about all 1 byte insns and memory + address sizes. This is enough to eliminate unnecessary padding in + 99% of cases. */ + +static int +min_insn_size (rtx insn) +{ + int l = 0, len; + + if (!INSN_P (insn) || !active_insn_p (insn)) + return 0; + + /* Discard alignments we've emit and jump instructions. */ + if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE + && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN) + return 0; + if (JUMP_TABLE_DATA_P (insn)) + return 0; + + /* Important case - calls are always 5 bytes. + It is common to have many calls in the row. */ + if (CALL_P (insn) + && symbolic_reference_mentioned_p (PATTERN (insn)) + && !SIBLING_CALL_P (insn)) + return 5; + len = get_attr_length (insn); + if (len <= 1) + return 1; + + /* For normal instructions we rely on get_attr_length being exact, + with a few exceptions. */ + if (!JUMP_P (insn)) + { + enum attr_type type = get_attr_type (insn); + + switch (type) + { + case TYPE_MULTI: + if (GET_CODE (PATTERN (insn)) == ASM_INPUT + || asm_noperands (PATTERN (insn)) >= 0) + return 0; + break; + case TYPE_OTHER: + case TYPE_FCMP: + break; + default: + /* Otherwise trust get_attr_length. */ + return len; + } + + l = get_attr_length_address (insn); + if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn))) + l = 4; + } + if (l) + return 1+l; + else + return 2; +} + +#ifdef ASM_OUTPUT_MAX_SKIP_PAD + +/* AMD K8 core mispredicts jumps when there are more than 3 jumps in 16 byte + window. */ + +static void +ix86_avoid_jump_mispredicts (void) +{ + rtx insn, start = get_insns (); + int nbytes = 0, njumps = 0; + int isjump = 0; + + /* Look for all minimal intervals of instructions containing 4 jumps. + The intervals are bounded by START and INSN. NBYTES is the total + size of instructions in the interval including INSN and not including + START. When the NBYTES is smaller than 16 bytes, it is possible + that the end of START and INSN ends up in the same 16byte page. + + The smallest offset in the page INSN can start is the case where START + ends on the offset 0. Offset of INSN is then NBYTES - sizeof (INSN). + We add p2align to 16byte window with maxskip 15 - NBYTES + sizeof (INSN). + */ + for (insn = start; insn; insn = NEXT_INSN (insn)) + { + int min_size; + + if (LABEL_P (insn)) + { + int align = label_to_alignment (insn); + int max_skip = label_to_max_skip (insn); + + if (max_skip > 15) + max_skip = 15; + /* If align > 3, only up to 16 - max_skip - 1 bytes can be + already in the current 16 byte page, because otherwise + ASM_OUTPUT_MAX_SKIP_ALIGN could skip max_skip or fewer + bytes to reach 16 byte boundary. 
*/ + if (align <= 0 + || (align <= 3 && max_skip != (1 << align) - 1)) + max_skip = 0; + if (dump_file) + fprintf (dump_file, "Label %i with max_skip %i\n", + INSN_UID (insn), max_skip); + if (max_skip) + { + while (nbytes + max_skip >= 16) + { + start = NEXT_INSN (start); + if ((JUMP_P (start) + && GET_CODE (PATTERN (start)) != ADDR_VEC + && GET_CODE (PATTERN (start)) != ADDR_DIFF_VEC) + || CALL_P (start)) + njumps--, isjump = 1; + else + isjump = 0; + nbytes -= min_insn_size (start); + } + } + continue; + } + + min_size = min_insn_size (insn); + nbytes += min_size; + if (dump_file) + fprintf (dump_file, "Insn %i estimated to %i bytes\n", + INSN_UID (insn), min_size); + if ((JUMP_P (insn) + && GET_CODE (PATTERN (insn)) != ADDR_VEC + && GET_CODE (PATTERN (insn)) != ADDR_DIFF_VEC) + || CALL_P (insn)) + njumps++; + else + continue; + + while (njumps > 3) + { + start = NEXT_INSN (start); + if ((JUMP_P (start) + && GET_CODE (PATTERN (start)) != ADDR_VEC + && GET_CODE (PATTERN (start)) != ADDR_DIFF_VEC) + || CALL_P (start)) + njumps--, isjump = 1; + else + isjump = 0; + nbytes -= min_insn_size (start); + } + gcc_assert (njumps >= 0); + if (dump_file) + fprintf (dump_file, "Interval %i to %i has %i bytes\n", + INSN_UID (start), INSN_UID (insn), nbytes); + + if (njumps == 3 && isjump && nbytes < 16) + { + int padsize = 15 - nbytes + min_insn_size (insn); + + if (dump_file) + fprintf (dump_file, "Padding insn %i by %i bytes!\n", + INSN_UID (insn), padsize); + emit_insn_before (gen_pad (GEN_INT (padsize)), insn); + } + } +} +#endif + +/* AMD Athlon works faster + when RET is not destination of conditional jump or directly preceded + by other jump instruction. We avoid the penalty by inserting NOP just + before the RET instructions in such cases. */ +static void +ix86_pad_returns (void) +{ + edge e; + edge_iterator ei; + + FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds) + { + basic_block bb = e->src; + rtx ret = BB_END (bb); + rtx prev; + bool replace = false; + + if (!JUMP_P (ret) || GET_CODE (PATTERN (ret)) != RETURN + || optimize_bb_for_size_p (bb)) + continue; + for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev)) + if (active_insn_p (prev) || LABEL_P (prev)) + break; + if (prev && LABEL_P (prev)) + { + edge e; + edge_iterator ei; + + FOR_EACH_EDGE (e, ei, bb->preds) + if (EDGE_FREQUENCY (e) && e->src->index >= 0 + && !(e->flags & EDGE_FALLTHRU)) + replace = true; + } + if (!replace) + { + prev = prev_active_insn (ret); + if (prev + && ((JUMP_P (prev) && any_condjump_p (prev)) + || CALL_P (prev))) + replace = true; + /* Empty functions get branch mispredict even when + the jump destination is not visible to us. */ + if (!prev && !optimize_function_for_size_p (cfun)) + replace = true; + } + if (replace) + { + emit_jump_insn_before (gen_return_internal_long (), ret); + delete_insn (ret); + } + } +} + +/* Count the minimum number of instructions in BB. Return 4 if the + number of instructions >= 4. */ + +static int +ix86_count_insn_bb (basic_block bb) +{ + rtx insn; + int insn_count = 0; + + /* Count number of instructions in this block. Return 4 if the number + of instructions >= 4. */ + FOR_BB_INSNS (bb, insn) + { + /* Only happen in exit blocks. */ + if (JUMP_P (insn) + && GET_CODE (PATTERN (insn)) == RETURN) + break; + + if (NONDEBUG_INSN_P (insn) + && GET_CODE (PATTERN (insn)) != USE + && GET_CODE (PATTERN (insn)) != CLOBBER) + { + insn_count++; + if (insn_count >= 4) + return insn_count; + } + } + + return insn_count; +} + + +/* Count the minimum number of instructions in code path in BB. 
+ Return 4 if the number of instructions >= 4. */ + +static int +ix86_count_insn (basic_block bb) +{ + edge e; + edge_iterator ei; + int min_prev_count; + + /* Only bother counting instructions along paths with no + more than 2 basic blocks between entry and exit. Given + that BB has an edge to exit, determine if a predecessor + of BB has an edge from entry. If so, compute the number + of instructions in the predecessor block. If there + happen to be multiple such blocks, compute the minimum. */ + min_prev_count = 4; + FOR_EACH_EDGE (e, ei, bb->preds) + { + edge prev_e; + edge_iterator prev_ei; + + if (e->src == ENTRY_BLOCK_PTR) + { + min_prev_count = 0; + break; + } + FOR_EACH_EDGE (prev_e, prev_ei, e->src->preds) + { + if (prev_e->src == ENTRY_BLOCK_PTR) + { + int count = ix86_count_insn_bb (e->src); + if (count < min_prev_count) + min_prev_count = count; + break; + } + } + } + + if (min_prev_count < 4) + min_prev_count += ix86_count_insn_bb (bb); + + return min_prev_count; +} + +/* Pad short funtion to 4 instructions. */ + +static void +ix86_pad_short_function (void) +{ + edge e; + edge_iterator ei; + + FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds) + { + rtx ret = BB_END (e->src); + if (JUMP_P (ret) && GET_CODE (PATTERN (ret)) == RETURN) + { + int insn_count = ix86_count_insn (e->src); + + /* Pad short function. */ + if (insn_count < 4) + { + rtx insn = ret; + + /* Find epilogue. */ + while (insn + && (!NOTE_P (insn) + || NOTE_KIND (insn) != NOTE_INSN_EPILOGUE_BEG)) + insn = PREV_INSN (insn); + + if (!insn) + insn = ret; + + /* Two NOPs count as one instruction. */ + insn_count = 2 * (4 - insn_count); + emit_insn_before (gen_nops (GEN_INT (insn_count)), insn); + } + } + } +} + +/* Implement machine specific optimizations. We implement padding of returns + for K8 CPUs and pass to avoid 4 jumps in the single 16 byte window. */ +static void +ix86_reorg (void) +{ + /* We are freeing block_for_insn in the toplev to keep compatibility + with old MDEP_REORGS that are not CFG based. Recompute it now. */ + compute_bb_for_insn (); + + if (optimize && optimize_function_for_speed_p (cfun)) + { + if (TARGET_PAD_SHORT_FUNCTION) + ix86_pad_short_function (); + else if (TARGET_PAD_RETURNS) + ix86_pad_returns (); +#ifdef ASM_OUTPUT_MAX_SKIP_PAD + if (TARGET_FOUR_JUMP_LIMIT) + ix86_avoid_jump_mispredicts (); +#endif + } + + /* Run the vzeroupper optimization if needed. */ + if (TARGET_VZEROUPPER) + move_or_delete_vzeroupper (); +} + +/* Return nonzero when QImode register that must be represented via REX prefix + is used. */ +bool +x86_extended_QIreg_mentioned_p (rtx insn) +{ + int i; + extract_insn_cached (insn); + for (i = 0; i < recog_data.n_operands; i++) + if (REG_P (recog_data.operand[i]) + && REGNO (recog_data.operand[i]) > BX_REG) + return true; + return false; +} + +/* Return nonzero when P points to register encoded via REX prefix. + Called via for_each_rtx. */ +static int +extended_reg_mentioned_1 (rtx *p, void *data ATTRIBUTE_UNUSED) +{ + unsigned int regno; + if (!REG_P (*p)) + return 0; + regno = REGNO (*p); + return REX_INT_REGNO_P (regno) || REX_SSE_REGNO_P (regno); +} + +/* Return true when INSN mentions register that must be encoded using REX + prefix. */ +bool +x86_extended_reg_mentioned_p (rtx insn) +{ + return for_each_rtx (INSN_P (insn) ? &PATTERN (insn) : &insn, + extended_reg_mentioned_1, NULL); +} + +/* If profitable, negate (without causing overflow) integer constant + of mode MODE at location LOC. Return true in this case. 
*/ +bool +x86_maybe_negate_const_int (rtx *loc, enum machine_mode mode) +{ + HOST_WIDE_INT val; + + if (!CONST_INT_P (*loc)) + return false; + + switch (mode) + { + case DImode: + /* DImode x86_64 constants must fit in 32 bits. */ + gcc_assert (x86_64_immediate_operand (*loc, mode)); + + mode = SImode; + break; + + case SImode: + case HImode: + case QImode: + break; + + default: + gcc_unreachable (); + } + + /* Avoid overflows. */ + if (mode_signbit_p (mode, *loc)) + return false; + + val = INTVAL (*loc); + + /* Make things pretty and `subl $4,%eax' rather than `addl $-4,%eax'. + Exceptions: -128 encodes smaller than 128, so swap sign and op. */ + if ((val < 0 && val != -128) + || val == 128) + { + *loc = GEN_INT (-val); + return true; + } + + return false; +} + +/* Generate an unsigned DImode/SImode to FP conversion. This is the same code + optabs would emit if we didn't have TFmode patterns. */ + +void +x86_emit_floatuns (rtx operands[2]) +{ + rtx neglab, donelab, i0, i1, f0, in, out; + enum machine_mode mode, inmode; + + inmode = GET_MODE (operands[1]); + gcc_assert (inmode == SImode || inmode == DImode); + + out = operands[0]; + in = force_reg (inmode, operands[1]); + mode = GET_MODE (out); + neglab = gen_label_rtx (); + donelab = gen_label_rtx (); + f0 = gen_reg_rtx (mode); + + emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab); + + expand_float (out, in, 0); + + emit_jump_insn (gen_jump (donelab)); + emit_barrier (); + + emit_label (neglab); + + i0 = expand_simple_binop (inmode, LSHIFTRT, in, const1_rtx, NULL, + 1, OPTAB_DIRECT); + i1 = expand_simple_binop (inmode, AND, in, const1_rtx, NULL, + 1, OPTAB_DIRECT); + i0 = expand_simple_binop (inmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT); + + expand_float (f0, i0, 0); + + emit_insn (gen_rtx_SET (VOIDmode, out, gen_rtx_PLUS (mode, f0, f0))); + + emit_label (donelab); +} + +/* AVX does not support 32-byte integer vector operations, + thus the longest vector we are faced with is V16QImode. */ +#define MAX_VECT_LEN 16 + +struct expand_vec_perm_d +{ + rtx target, op0, op1; + unsigned char perm[MAX_VECT_LEN]; + enum machine_mode vmode; + unsigned char nelt; + bool testing_p; +}; + +static bool expand_vec_perm_1 (struct expand_vec_perm_d *d); +static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d); + +/* Get a vector mode of the same size as the original but with elements + twice as wide. This is only guaranteed to apply to integral vectors. */ + +static inline enum machine_mode +get_mode_wider_vector (enum machine_mode o) +{ + /* ??? Rely on the ordering that genmodes.c gives to vectors. */ + enum machine_mode n = GET_MODE_WIDER_MODE (o); + gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2); + gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n)); + return n; +} + +/* A subroutine of ix86_expand_vector_init. Store into TARGET a vector + with all elements equal to VAR. Return true if successful. */ + +static bool +ix86_expand_vector_init_duplicate (bool mmx_ok, enum machine_mode mode, + rtx target, rtx val) +{ + bool ok; + + switch (mode) + { + case V2SImode: + case V2SFmode: + if (!mmx_ok) + return false; + /* FALLTHRU */ + + case V4DFmode: + case V4DImode: + case V8SFmode: + case V8SImode: + case V2DFmode: + case V2DImode: + case V4SFmode: + case V4SImode: + { + rtx insn, dup; + + /* First attempt to recognize VAL as-is. 
*/ + dup = gen_rtx_VEC_DUPLICATE (mode, val); + insn = emit_insn (gen_rtx_SET (VOIDmode, target, dup)); + if (recog_memoized (insn) < 0) + { + rtx seq; + /* If that fails, force VAL into a register. */ + + start_sequence (); + XEXP (dup, 0) = force_reg (GET_MODE_INNER (mode), val); + seq = get_insns (); + end_sequence (); + if (seq) + emit_insn_before (seq, insn); + + ok = recog_memoized (insn) >= 0; + gcc_assert (ok); + } + } + return true; + + case V4HImode: + if (!mmx_ok) + return false; + if (TARGET_SSE || TARGET_3DNOW_A) + { + rtx x; + + val = gen_lowpart (SImode, val); + x = gen_rtx_TRUNCATE (HImode, val); + x = gen_rtx_VEC_DUPLICATE (mode, x); + emit_insn (gen_rtx_SET (VOIDmode, target, x)); + return true; + } + goto widen; + + case V8QImode: + if (!mmx_ok) + return false; + goto widen; + + case V8HImode: + if (TARGET_SSE2) + { + struct expand_vec_perm_d dperm; + rtx tmp1, tmp2; + + permute: + memset (&dperm, 0, sizeof (dperm)); + dperm.target = target; + dperm.vmode = mode; + dperm.nelt = GET_MODE_NUNITS (mode); + dperm.op0 = dperm.op1 = gen_reg_rtx (mode); + + /* Extend to SImode using a paradoxical SUBREG. */ + tmp1 = gen_reg_rtx (SImode); + emit_move_insn (tmp1, gen_lowpart (SImode, val)); + + /* Insert the SImode value as low element of a V4SImode vector. */ + tmp2 = gen_lowpart (V4SImode, dperm.op0); + emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1)); + + ok = (expand_vec_perm_1 (&dperm) + || expand_vec_perm_broadcast_1 (&dperm)); + gcc_assert (ok); + return ok; + } + goto widen; + + case V16QImode: + if (TARGET_SSE2) + goto permute; + goto widen; + + widen: + /* Replicate the value once into the next wider mode and recurse. */ + { + enum machine_mode smode, wsmode, wvmode; + rtx x; + + smode = GET_MODE_INNER (mode); + wvmode = get_mode_wider_vector (mode); + wsmode = GET_MODE_INNER (wvmode); + + val = convert_modes (wsmode, smode, val, true); + x = expand_simple_binop (wsmode, ASHIFT, val, + GEN_INT (GET_MODE_BITSIZE (smode)), + NULL_RTX, 1, OPTAB_LIB_WIDEN); + val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN); + + x = gen_lowpart (wvmode, target); + ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val); + gcc_assert (ok); + return ok; + } + + case V16HImode: + case V32QImode: + { + enum machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode); + rtx x = gen_reg_rtx (hvmode); + + ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val); + gcc_assert (ok); + + x = gen_rtx_VEC_CONCAT (mode, x, x); + emit_insn (gen_rtx_SET (VOIDmode, target, x)); + } + return true; + + default: + return false; + } +} + +/* A subroutine of ix86_expand_vector_init. Store into TARGET a vector + whose ONE_VAR element is VAR, and other elements are zero. Return true + if successful. */ + +static bool +ix86_expand_vector_init_one_nonzero (bool mmx_ok, enum machine_mode mode, + rtx target, rtx var, int one_var) +{ + enum machine_mode vsimode; + rtx new_target; + rtx x, tmp; + bool use_vector_set = false; + + switch (mode) + { + case V2DImode: + /* For SSE4.1, we normally use vector set. But if the second + element is zero and inter-unit moves are OK, we use movq + instead. 
*/ + use_vector_set = (TARGET_64BIT + && TARGET_SSE4_1 + && !(TARGET_INTER_UNIT_MOVES + && one_var == 0)); + break; + case V16QImode: + case V4SImode: + case V4SFmode: + use_vector_set = TARGET_SSE4_1; + break; + case V8HImode: + use_vector_set = TARGET_SSE2; + break; + case V4HImode: + use_vector_set = TARGET_SSE || TARGET_3DNOW_A; + break; + case V32QImode: + case V16HImode: + case V8SImode: + case V8SFmode: + case V4DFmode: + use_vector_set = TARGET_AVX; + break; + case V4DImode: + /* Use ix86_expand_vector_set in 64bit mode only. */ + use_vector_set = TARGET_AVX && TARGET_64BIT; + break; + default: + break; + } + + if (use_vector_set) + { + emit_insn (gen_rtx_SET (VOIDmode, target, CONST0_RTX (mode))); + var = force_reg (GET_MODE_INNER (mode), var); + ix86_expand_vector_set (mmx_ok, target, var, one_var); + return true; + } + + switch (mode) + { + case V2SFmode: + case V2SImode: + if (!mmx_ok) + return false; + /* FALLTHRU */ + + case V2DFmode: + case V2DImode: + if (one_var != 0) + return false; + var = force_reg (GET_MODE_INNER (mode), var); + x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode))); + emit_insn (gen_rtx_SET (VOIDmode, target, x)); + return true; + + case V4SFmode: + case V4SImode: + if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER) + new_target = gen_reg_rtx (mode); + else + new_target = target; + var = force_reg (GET_MODE_INNER (mode), var); + x = gen_rtx_VEC_DUPLICATE (mode, var); + x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx); + emit_insn (gen_rtx_SET (VOIDmode, new_target, x)); + if (one_var != 0) + { + /* We need to shuffle the value to the correct position, so + create a new pseudo to store the intermediate result. */ + + /* With SSE2, we can use the integer shuffle insns. */ + if (mode != V4SFmode && TARGET_SSE2) + { + emit_insn (gen_sse2_pshufd_1 (new_target, new_target, + const1_rtx, + GEN_INT (one_var == 1 ? 0 : 1), + GEN_INT (one_var == 2 ? 0 : 1), + GEN_INT (one_var == 3 ? 0 : 1))); + if (target != new_target) + emit_move_insn (target, new_target); + return true; + } + + /* Otherwise convert the intermediate result to V4SFmode and + use the SSE1 shuffle instructions. */ + if (mode != V4SFmode) + { + tmp = gen_reg_rtx (V4SFmode); + emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target)); + } + else + tmp = new_target; + + emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp, + const1_rtx, + GEN_INT (one_var == 1 ? 0 : 1), + GEN_INT (one_var == 2 ? 0+4 : 1+4), + GEN_INT (one_var == 3 ? 0+4 : 1+4))); + + if (mode != V4SFmode) + emit_move_insn (target, gen_lowpart (V4SImode, tmp)); + else if (tmp != target) + emit_move_insn (target, tmp); + } + else if (target != new_target) + emit_move_insn (target, new_target); + return true; + + case V8HImode: + case V16QImode: + vsimode = V4SImode; + goto widen; + case V4HImode: + case V8QImode: + if (!mmx_ok) + return false; + vsimode = V2SImode; + goto widen; + widen: + if (one_var != 0) + return false; + + /* Zero extend the variable element to SImode and recurse. */ + var = convert_modes (SImode, GET_MODE_INNER (mode), var, true); + + x = gen_reg_rtx (vsimode); + if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x, + var, one_var)) + gcc_unreachable (); + + emit_move_insn (target, gen_lowpart (mode, x)); + return true; + + default: + return false; + } +} + +/* A subroutine of ix86_expand_vector_init. Store into TARGET a vector + consisting of the values in VALS. It is known that all elements + except ONE_VAR are constants. Return true if successful. 
*/ + +static bool +ix86_expand_vector_init_one_var (bool mmx_ok, enum machine_mode mode, + rtx target, rtx vals, int one_var) +{ + rtx var = XVECEXP (vals, 0, one_var); + enum machine_mode wmode; + rtx const_vec, x; + + const_vec = copy_rtx (vals); + XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode)); + const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0)); + + switch (mode) + { + case V2DFmode: + case V2DImode: + case V2SFmode: + case V2SImode: + /* For the two element vectors, it's just as easy to use + the general case. */ + return false; + + case V4DImode: + /* Use ix86_expand_vector_set in 64bit mode only. */ + if (!TARGET_64BIT) + return false; + case V4DFmode: + case V8SFmode: + case V8SImode: + case V16HImode: + case V32QImode: + case V4SFmode: + case V4SImode: + case V8HImode: + case V4HImode: + break; + + case V16QImode: + if (TARGET_SSE4_1) + break; + wmode = V8HImode; + goto widen; + case V8QImode: + wmode = V4HImode; + goto widen; + widen: + /* There's no way to set one QImode entry easily. Combine + the variable value with its adjacent constant value, and + promote to an HImode set. */ + x = XVECEXP (vals, 0, one_var ^ 1); + if (one_var & 1) + { + var = convert_modes (HImode, QImode, var, true); + var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8), + NULL_RTX, 1, OPTAB_LIB_WIDEN); + x = GEN_INT (INTVAL (x) & 0xff); + } + else + { + var = convert_modes (HImode, QImode, var, true); + x = gen_int_mode (INTVAL (x) << 8, HImode); + } + if (x != const0_rtx) + var = expand_simple_binop (HImode, IOR, var, x, var, + 1, OPTAB_LIB_WIDEN); + + x = gen_reg_rtx (wmode); + emit_move_insn (x, gen_lowpart (wmode, const_vec)); + ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1); + + emit_move_insn (target, gen_lowpart (mode, x)); + return true; + + default: + return false; + } + + emit_move_insn (target, const_vec); + ix86_expand_vector_set (mmx_ok, target, var, one_var); + return true; +} + +/* A subroutine of ix86_expand_vector_init_general. Use vector + concatenate to handle the most general case: all values variable, + and none identical. 
*/ + +static void +ix86_expand_vector_init_concat (enum machine_mode mode, + rtx target, rtx *ops, int n) +{ + enum machine_mode cmode, hmode = VOIDmode; + rtx first[8], second[4]; + rtvec v; + int i, j; + + switch (n) + { + case 2: + switch (mode) + { + case V8SImode: + cmode = V4SImode; + break; + case V8SFmode: + cmode = V4SFmode; + break; + case V4DImode: + cmode = V2DImode; + break; + case V4DFmode: + cmode = V2DFmode; + break; + case V4SImode: + cmode = V2SImode; + break; + case V4SFmode: + cmode = V2SFmode; + break; + case V2DImode: + cmode = DImode; + break; + case V2SImode: + cmode = SImode; + break; + case V2DFmode: + cmode = DFmode; + break; + case V2SFmode: + cmode = SFmode; + break; + default: + gcc_unreachable (); + } + + if (!register_operand (ops[1], cmode)) + ops[1] = force_reg (cmode, ops[1]); + if (!register_operand (ops[0], cmode)) + ops[0] = force_reg (cmode, ops[0]); + emit_insn (gen_rtx_SET (VOIDmode, target, + gen_rtx_VEC_CONCAT (mode, ops[0], + ops[1]))); + break; + + case 4: + switch (mode) + { + case V4DImode: + cmode = V2DImode; + break; + case V4DFmode: + cmode = V2DFmode; + break; + case V4SImode: + cmode = V2SImode; + break; + case V4SFmode: + cmode = V2SFmode; + break; + default: + gcc_unreachable (); + } + goto half; + + case 8: + switch (mode) + { + case V8SImode: + cmode = V2SImode; + hmode = V4SImode; + break; + case V8SFmode: + cmode = V2SFmode; + hmode = V4SFmode; + break; + default: + gcc_unreachable (); + } + goto half; + +half: + /* FIXME: We process inputs backward to help RA. PR 36222. */ + i = n - 1; + j = (n >> 1) - 1; + for (; i > 0; i -= 2, j--) + { + first[j] = gen_reg_rtx (cmode); + v = gen_rtvec (2, ops[i - 1], ops[i]); + ix86_expand_vector_init (false, first[j], + gen_rtx_PARALLEL (cmode, v)); + } + + n >>= 1; + if (n > 2) + { + gcc_assert (hmode != VOIDmode); + for (i = j = 0; i < n; i += 2, j++) + { + second[j] = gen_reg_rtx (hmode); + ix86_expand_vector_init_concat (hmode, second [j], + &first [i], 2); + } + n >>= 1; + ix86_expand_vector_init_concat (mode, target, second, n); + } + else + ix86_expand_vector_init_concat (mode, target, first, n); + break; + + default: + gcc_unreachable (); + } +} + +/* A subroutine of ix86_expand_vector_init_general. Use vector + interleave to handle the most general case: all values variable, + and none identical. */ + +static void +ix86_expand_vector_init_interleave (enum machine_mode mode, + rtx target, rtx *ops, int n) +{ + enum machine_mode first_imode, second_imode, third_imode, inner_mode; + int i, j; + rtx op0, op1; + rtx (*gen_load_even) (rtx, rtx, rtx); + rtx (*gen_interleave_first_low) (rtx, rtx, rtx); + rtx (*gen_interleave_second_low) (rtx, rtx, rtx); + + switch (mode) + { + case V8HImode: + gen_load_even = gen_vec_setv8hi; + gen_interleave_first_low = gen_vec_interleave_lowv4si; + gen_interleave_second_low = gen_vec_interleave_lowv2di; + inner_mode = HImode; + first_imode = V4SImode; + second_imode = V2DImode; + third_imode = VOIDmode; + break; + case V16QImode: + gen_load_even = gen_vec_setv16qi; + gen_interleave_first_low = gen_vec_interleave_lowv8hi; + gen_interleave_second_low = gen_vec_interleave_lowv4si; + inner_mode = QImode; + first_imode = V8HImode; + second_imode = V4SImode; + third_imode = V2DImode; + break; + default: + gcc_unreachable (); + } + + for (i = 0; i < n; i++) + { + /* Extend the odd elment to SImode using a paradoxical SUBREG. 
*/ + op0 = gen_reg_rtx (SImode); + emit_move_insn (op0, gen_lowpart (SImode, ops [i + i])); + + /* Insert the SImode value as low element of V4SImode vector. */ + op1 = gen_reg_rtx (V4SImode); + op0 = gen_rtx_VEC_MERGE (V4SImode, + gen_rtx_VEC_DUPLICATE (V4SImode, + op0), + CONST0_RTX (V4SImode), + const1_rtx); + emit_insn (gen_rtx_SET (VOIDmode, op1, op0)); + + /* Cast the V4SImode vector back to a vector in orignal mode. */ + op0 = gen_reg_rtx (mode); + emit_move_insn (op0, gen_lowpart (mode, op1)); + + /* Load even elements into the second positon. */ + emit_insn (gen_load_even (op0, + force_reg (inner_mode, + ops [i + i + 1]), + const1_rtx)); + + /* Cast vector to FIRST_IMODE vector. */ + ops[i] = gen_reg_rtx (first_imode); + emit_move_insn (ops[i], gen_lowpart (first_imode, op0)); + } + + /* Interleave low FIRST_IMODE vectors. */ + for (i = j = 0; i < n; i += 2, j++) + { + op0 = gen_reg_rtx (first_imode); + emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1])); + + /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */ + ops[j] = gen_reg_rtx (second_imode); + emit_move_insn (ops[j], gen_lowpart (second_imode, op0)); + } + + /* Interleave low SECOND_IMODE vectors. */ + switch (second_imode) + { + case V4SImode: + for (i = j = 0; i < n / 2; i += 2, j++) + { + op0 = gen_reg_rtx (second_imode); + emit_insn (gen_interleave_second_low (op0, ops[i], + ops[i + 1])); + + /* Cast the SECOND_IMODE vector to the THIRD_IMODE + vector. */ + ops[j] = gen_reg_rtx (third_imode); + emit_move_insn (ops[j], gen_lowpart (third_imode, op0)); + } + second_imode = V2DImode; + gen_interleave_second_low = gen_vec_interleave_lowv2di; + /* FALLTHRU */ + + case V2DImode: + op0 = gen_reg_rtx (second_imode); + emit_insn (gen_interleave_second_low (op0, ops[0], + ops[1])); + + /* Cast the SECOND_IMODE vector back to a vector on original + mode. */ + emit_insn (gen_rtx_SET (VOIDmode, target, + gen_lowpart (mode, op0))); + break; + + default: + gcc_unreachable (); + } +} + +/* A subroutine of ix86_expand_vector_init. Handle the most general case: + all values variable, and none identical. */ + +static void +ix86_expand_vector_init_general (bool mmx_ok, enum machine_mode mode, + rtx target, rtx vals) +{ + rtx ops[32], op0, op1; + enum machine_mode half_mode = VOIDmode; + int n, i; + + switch (mode) + { + case V2SFmode: + case V2SImode: + if (!mmx_ok && !TARGET_SSE) + break; + /* FALLTHRU */ + + case V8SFmode: + case V8SImode: + case V4DFmode: + case V4DImode: + case V4SFmode: + case V4SImode: + case V2DFmode: + case V2DImode: + n = GET_MODE_NUNITS (mode); + for (i = 0; i < n; i++) + ops[i] = XVECEXP (vals, 0, i); + ix86_expand_vector_init_concat (mode, target, ops, n); + return; + + case V32QImode: + half_mode = V16QImode; + goto half; + + case V16HImode: + half_mode = V8HImode; + goto half; + +half: + n = GET_MODE_NUNITS (mode); + for (i = 0; i < n; i++) + ops[i] = XVECEXP (vals, 0, i); + op0 = gen_reg_rtx (half_mode); + op1 = gen_reg_rtx (half_mode); + ix86_expand_vector_init_interleave (half_mode, op0, ops, + n >> 2); + ix86_expand_vector_init_interleave (half_mode, op1, + &ops [n >> 1], n >> 2); + emit_insn (gen_rtx_SET (VOIDmode, target, + gen_rtx_VEC_CONCAT (mode, op0, op1))); + return; + + case V16QImode: + if (!TARGET_SSE4_1) + break; + /* FALLTHRU */ + + case V8HImode: + if (!TARGET_SSE2) + break; + + /* Don't use ix86_expand_vector_init_interleave if we can't + move from GPR to SSE register directly. 
*/ + if (!TARGET_INTER_UNIT_MOVES) + break; + + n = GET_MODE_NUNITS (mode); + for (i = 0; i < n; i++) + ops[i] = XVECEXP (vals, 0, i); + ix86_expand_vector_init_interleave (mode, target, ops, n >> 1); + return; + + case V4HImode: + case V8QImode: + break; + + default: + gcc_unreachable (); + } + + { + int i, j, n_elts, n_words, n_elt_per_word; + enum machine_mode inner_mode; + rtx words[4], shift; + + inner_mode = GET_MODE_INNER (mode); + n_elts = GET_MODE_NUNITS (mode); + n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD; + n_elt_per_word = n_elts / n_words; + shift = GEN_INT (GET_MODE_BITSIZE (inner_mode)); + + for (i = 0; i < n_words; ++i) + { + rtx word = NULL_RTX; + + for (j = 0; j < n_elt_per_word; ++j) + { + rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1); + elt = convert_modes (word_mode, inner_mode, elt, true); + + if (j == 0) + word = elt; + else + { + word = expand_simple_binop (word_mode, ASHIFT, word, shift, + word, 1, OPTAB_LIB_WIDEN); + word = expand_simple_binop (word_mode, IOR, word, elt, + word, 1, OPTAB_LIB_WIDEN); + } + } + + words[i] = word; + } + + if (n_words == 1) + emit_move_insn (target, gen_lowpart (mode, words[0])); + else if (n_words == 2) + { + rtx tmp = gen_reg_rtx (mode); + emit_clobber (tmp); + emit_move_insn (gen_lowpart (word_mode, tmp), words[0]); + emit_move_insn (gen_highpart (word_mode, tmp), words[1]); + emit_move_insn (target, tmp); + } + else if (n_words == 4) + { + rtx tmp = gen_reg_rtx (V4SImode); + gcc_assert (word_mode == SImode); + vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words)); + ix86_expand_vector_init_general (false, V4SImode, tmp, vals); + emit_move_insn (target, gen_lowpart (mode, tmp)); + } + else + gcc_unreachable (); + } +} + +/* Initialize vector TARGET via VALS. Suppress the use of MMX + instructions unless MMX_OK is true. */ + +void +ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals) +{ + enum machine_mode mode = GET_MODE (target); + enum machine_mode inner_mode = GET_MODE_INNER (mode); + int n_elts = GET_MODE_NUNITS (mode); + int n_var = 0, one_var = -1; + bool all_same = true, all_const_zero = true; + int i; + rtx x; + + for (i = 0; i < n_elts; ++i) + { + x = XVECEXP (vals, 0, i); + if (!(CONST_INT_P (x) + || GET_CODE (x) == CONST_DOUBLE + || GET_CODE (x) == CONST_FIXED)) + n_var++, one_var = i; + else if (x != CONST0_RTX (inner_mode)) + all_const_zero = false; + if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0))) + all_same = false; + } + + /* Constants are best loaded from the constant pool. */ + if (n_var == 0) + { + emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0))); + return; + } + + /* If all values are identical, broadcast the value. */ + if (all_same + && ix86_expand_vector_init_duplicate (mmx_ok, mode, target, + XVECEXP (vals, 0, 0))) + return; + + /* Values where only one field is non-constant are best loaded from + the pool and overwritten via move later. 
*/ + if (n_var == 1) + { + if (all_const_zero + && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target, + XVECEXP (vals, 0, one_var), + one_var)) + return; + + if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var)) + return; + } + + ix86_expand_vector_init_general (mmx_ok, mode, target, vals); +} + +void +ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt) +{ + enum machine_mode mode = GET_MODE (target); + enum machine_mode inner_mode = GET_MODE_INNER (mode); + enum machine_mode half_mode; + bool use_vec_merge = false; + rtx tmp; + static rtx (*gen_extract[6][2]) (rtx, rtx) + = { + { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi }, + { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi }, + { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si }, + { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di }, + { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf }, + { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df } + }; + static rtx (*gen_insert[6][2]) (rtx, rtx, rtx) + = { + { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi }, + { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi }, + { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si }, + { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di }, + { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf }, + { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df } + }; + int i, j, n; + + switch (mode) + { + case V2SFmode: + case V2SImode: + if (mmx_ok) + { + tmp = gen_reg_rtx (GET_MODE_INNER (mode)); + ix86_expand_vector_extract (true, tmp, target, 1 - elt); + if (elt == 0) + tmp = gen_rtx_VEC_CONCAT (mode, val, tmp); + else + tmp = gen_rtx_VEC_CONCAT (mode, tmp, val); + emit_insn (gen_rtx_SET (VOIDmode, target, tmp)); + return; + } + break; + + case V2DImode: + use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT; + if (use_vec_merge) + break; + + tmp = gen_reg_rtx (GET_MODE_INNER (mode)); + ix86_expand_vector_extract (false, tmp, target, 1 - elt); + if (elt == 0) + tmp = gen_rtx_VEC_CONCAT (mode, val, tmp); + else + tmp = gen_rtx_VEC_CONCAT (mode, tmp, val); + emit_insn (gen_rtx_SET (VOIDmode, target, tmp)); + return; + + case V2DFmode: + { + rtx op0, op1; + + /* For the two element vectors, we implement a VEC_CONCAT with + the extraction of the other element. 
*/ + + tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt))); + tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp); + + if (elt == 0) + op0 = val, op1 = tmp; + else + op0 = tmp, op1 = val; + + tmp = gen_rtx_VEC_CONCAT (mode, op0, op1); + emit_insn (gen_rtx_SET (VOIDmode, target, tmp)); + } + return; + + case V4SFmode: + use_vec_merge = TARGET_SSE4_1; + if (use_vec_merge) + break; + + switch (elt) + { + case 0: + use_vec_merge = true; + break; + + case 1: + /* tmp = target = A B C D */ + tmp = copy_to_reg (target); + /* target = A A B B */ + emit_insn (gen_vec_interleave_lowv4sf (target, target, target)); + /* target = X A B B */ + ix86_expand_vector_set (false, target, val, 0); + /* target = A X C D */ + emit_insn (gen_sse_shufps_v4sf (target, target, tmp, + const1_rtx, const0_rtx, + GEN_INT (2+4), GEN_INT (3+4))); + return; + + case 2: + /* tmp = target = A B C D */ + tmp = copy_to_reg (target); + /* tmp = X B C D */ + ix86_expand_vector_set (false, tmp, val, 0); + /* target = A B X D */ + emit_insn (gen_sse_shufps_v4sf (target, target, tmp, + const0_rtx, const1_rtx, + GEN_INT (0+4), GEN_INT (3+4))); + return; + + case 3: + /* tmp = target = A B C D */ + tmp = copy_to_reg (target); + /* tmp = X B C D */ + ix86_expand_vector_set (false, tmp, val, 0); + /* target = A B X D */ + emit_insn (gen_sse_shufps_v4sf (target, target, tmp, + const0_rtx, const1_rtx, + GEN_INT (2+4), GEN_INT (0+4))); + return; + + default: + gcc_unreachable (); + } + break; + + case V4SImode: + use_vec_merge = TARGET_SSE4_1; + if (use_vec_merge) + break; + + /* Element 0 handled by vec_merge below. */ + if (elt == 0) + { + use_vec_merge = true; + break; + } + + if (TARGET_SSE2) + { + /* With SSE2, use integer shuffles to swap element 0 and ELT, + store into element 0, then shuffle them back. */ + + rtx order[4]; + + order[0] = GEN_INT (elt); + order[1] = const1_rtx; + order[2] = const2_rtx; + order[3] = GEN_INT (3); + order[elt] = const0_rtx; + + emit_insn (gen_sse2_pshufd_1 (target, target, order[0], + order[1], order[2], order[3])); + + ix86_expand_vector_set (false, target, val, 0); + + emit_insn (gen_sse2_pshufd_1 (target, target, order[0], + order[1], order[2], order[3])); + } + else + { + /* For SSE1, we have to reuse the V4SF code. */ + ix86_expand_vector_set (false, gen_lowpart (V4SFmode, target), + gen_lowpart (SFmode, val), elt); + } + return; + + case V8HImode: + use_vec_merge = TARGET_SSE2; + break; + case V4HImode: + use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A); + break; + + case V16QImode: + use_vec_merge = TARGET_SSE4_1; + break; + + case V8QImode: + break; + + case V32QImode: + half_mode = V16QImode; + j = 0; + n = 16; + goto half; + + case V16HImode: + half_mode = V8HImode; + j = 1; + n = 8; + goto half; + + case V8SImode: + half_mode = V4SImode; + j = 2; + n = 4; + goto half; + + case V4DImode: + half_mode = V2DImode; + j = 3; + n = 2; + goto half; + + case V8SFmode: + half_mode = V4SFmode; + j = 4; + n = 4; + goto half; + + case V4DFmode: + half_mode = V2DFmode; + j = 5; + n = 2; + goto half; + +half: + /* Compute offset. */ + i = elt / n; + elt %= n; + + gcc_assert (i <= 1); + + /* Extract the half. */ + tmp = gen_reg_rtx (half_mode); + emit_insn (gen_extract[j][i] (tmp, target)); + + /* Put val in tmp at elt. */ + ix86_expand_vector_set (false, tmp, val, elt); + + /* Put it back. 
*/ + emit_insn (gen_insert[j][i] (target, target, tmp)); + return; + + default: + break; + } + + if (use_vec_merge) + { + tmp = gen_rtx_VEC_DUPLICATE (mode, val); + tmp = gen_rtx_VEC_MERGE (mode, tmp, target, GEN_INT (1 << elt)); + emit_insn (gen_rtx_SET (VOIDmode, target, tmp)); + } + else + { + rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode), false); + + emit_move_insn (mem, target); + + tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode)); + emit_move_insn (tmp, val); + + emit_move_insn (target, mem); + } +} + +void +ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt) +{ + enum machine_mode mode = GET_MODE (vec); + enum machine_mode inner_mode = GET_MODE_INNER (mode); + bool use_vec_extr = false; + rtx tmp; + + switch (mode) + { + case V2SImode: + case V2SFmode: + if (!mmx_ok) + break; + /* FALLTHRU */ + + case V2DFmode: + case V2DImode: + use_vec_extr = true; + break; + + case V4SFmode: + use_vec_extr = TARGET_SSE4_1; + if (use_vec_extr) + break; + + switch (elt) + { + case 0: + tmp = vec; + break; + + case 1: + case 3: + tmp = gen_reg_rtx (mode); + emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec, + GEN_INT (elt), GEN_INT (elt), + GEN_INT (elt+4), GEN_INT (elt+4))); + break; + + case 2: + tmp = gen_reg_rtx (mode); + emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec)); + break; + + default: + gcc_unreachable (); + } + vec = tmp; + use_vec_extr = true; + elt = 0; + break; + + case V4SImode: + use_vec_extr = TARGET_SSE4_1; + if (use_vec_extr) + break; + + if (TARGET_SSE2) + { + switch (elt) + { + case 0: + tmp = vec; + break; + + case 1: + case 3: + tmp = gen_reg_rtx (mode); + emit_insn (gen_sse2_pshufd_1 (tmp, vec, + GEN_INT (elt), GEN_INT (elt), + GEN_INT (elt), GEN_INT (elt))); + break; + + case 2: + tmp = gen_reg_rtx (mode); + emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec)); + break; + + default: + gcc_unreachable (); + } + vec = tmp; + use_vec_extr = true; + elt = 0; + } + else + { + /* For SSE1, we have to reuse the V4SF code. */ + ix86_expand_vector_extract (false, gen_lowpart (SFmode, target), + gen_lowpart (V4SFmode, vec), elt); + return; + } + break; + + case V8HImode: + use_vec_extr = TARGET_SSE2; + break; + case V4HImode: + use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A); + break; + + case V16QImode: + use_vec_extr = TARGET_SSE4_1; + break; + + case V8QImode: + /* ??? Could extract the appropriate HImode element and shift. */ + default: + break; + } + + if (use_vec_extr) + { + tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt))); + tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp); + + /* Let the rtl optimizers know about the zero extension performed. */ + if (inner_mode == QImode || inner_mode == HImode) + { + tmp = gen_rtx_ZERO_EXTEND (SImode, tmp); + target = gen_lowpart (SImode, target); + } + + emit_insn (gen_rtx_SET (VOIDmode, target, tmp)); + } + else + { + rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode), false); + + emit_move_insn (mem, vec); + + tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode)); + emit_move_insn (target, tmp); + } +} + +/* Expand a vector reduction on V4SFmode for SSE1. FN is the binary + pattern to reduce; DEST is the destination; IN is the input vector. 
*/ + +void +ix86_expand_reduc_v4sf (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in) +{ + rtx tmp1, tmp2, tmp3; + + tmp1 = gen_reg_rtx (V4SFmode); + tmp2 = gen_reg_rtx (V4SFmode); + tmp3 = gen_reg_rtx (V4SFmode); + + emit_insn (gen_sse_movhlps (tmp1, in, in)); + emit_insn (fn (tmp2, tmp1, in)); + + emit_insn (gen_sse_shufps_v4sf (tmp3, tmp2, tmp2, + const1_rtx, const1_rtx, + GEN_INT (1+4), GEN_INT (1+4))); + emit_insn (fn (dest, tmp2, tmp3)); +} + +/* Target hook for scalar_mode_supported_p. */ +static bool +ix86_scalar_mode_supported_p (enum machine_mode mode) +{ + if (DECIMAL_FLOAT_MODE_P (mode)) + return default_decimal_float_supported_p (); + else if (mode == TFmode) + return true; + else + return default_scalar_mode_supported_p (mode); +} + +/* Implements target hook vector_mode_supported_p. */ +static bool +ix86_vector_mode_supported_p (enum machine_mode mode) +{ + if (TARGET_SSE && VALID_SSE_REG_MODE (mode)) + return true; + if (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode)) + return true; + if (TARGET_AVX && VALID_AVX256_REG_MODE (mode)) + return true; + if (TARGET_MMX && VALID_MMX_REG_MODE (mode)) + return true; + if (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode)) + return true; + return false; +} + +/* Target hook for c_mode_for_suffix. */ +static enum machine_mode +ix86_c_mode_for_suffix (char suffix) +{ + if (suffix == 'q') + return TFmode; + if (suffix == 'w') + return XFmode; + + return VOIDmode; +} + +/* Worker function for TARGET_MD_ASM_CLOBBERS. + + We do this in the new i386 backend to maintain source compatibility + with the old cc0-based compiler. */ + +static tree +ix86_md_asm_clobbers (tree outputs ATTRIBUTE_UNUSED, + tree inputs ATTRIBUTE_UNUSED, + tree clobbers) +{ + clobbers = tree_cons (NULL_TREE, build_string (5, "flags"), + clobbers); + clobbers = tree_cons (NULL_TREE, build_string (4, "fpsr"), + clobbers); + return clobbers; +} + +/* Implements target vector targetm.asm.encode_section_info. This + is not used by netware. */ + +static void ATTRIBUTE_UNUSED +ix86_encode_section_info (tree decl, rtx rtl, int first) +{ + default_encode_section_info (decl, rtl, first); + + if (TREE_CODE (decl) == VAR_DECL + && (TREE_STATIC (decl) || DECL_EXTERNAL (decl)) + && ix86_in_large_data_p (decl)) + SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_FAR_ADDR; +} + +/* Worker function for REVERSE_CONDITION. */ + +enum rtx_code +ix86_reverse_condition (enum rtx_code code, enum machine_mode mode) +{ + return (mode != CCFPmode && mode != CCFPUmode + ? reverse_condition (code) + : reverse_condition_maybe_unordered (code)); +} + +/* Output code to perform an x87 FP register move, from OPERANDS[1] + to OPERANDS[0]. */ + +const char * +output_387_reg_move (rtx insn, rtx *operands) +{ + if (REG_P (operands[0])) + { + if (REG_P (operands[1]) + && find_regno_note (insn, REG_DEAD, REGNO (operands[1]))) + { + if (REGNO (operands[0]) == FIRST_STACK_REG) + return output_387_ffreep (operands, 0); + return "fstp\t%y0"; + } + if (STACK_TOP_P (operands[0])) + return "fld%Z1\t%y1"; + return "fst\t%y0"; + } + else if (MEM_P (operands[0])) + { + gcc_assert (REG_P (operands[1])); + if (find_regno_note (insn, REG_DEAD, REGNO (operands[1]))) + return "fstp%Z0\t%y0"; + else + { + /* There is no non-popping store to memory for XFmode. + So if we need one, follow the store with a load. 
*/ + if (GET_MODE (operands[0]) == XFmode) + return "fstp%Z0\t%y0\n\tfld%Z0\t%y0"; + else + return "fst%Z0\t%y0"; + } + } + else + gcc_unreachable(); +} + +/* Output code to perform a conditional jump to LABEL, if C2 flag in + FP status register is set. */ + +void +ix86_emit_fp_unordered_jump (rtx label) +{ + rtx reg = gen_reg_rtx (HImode); + rtx temp; + + emit_insn (gen_x86_fnstsw_1 (reg)); + + if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ())) + { + emit_insn (gen_x86_sahf_1 (reg)); + + temp = gen_rtx_REG (CCmode, FLAGS_REG); + temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx); + } + else + { + emit_insn (gen_testqi_ext_ccno_0 (reg, GEN_INT (0x04))); + + temp = gen_rtx_REG (CCNOmode, FLAGS_REG); + temp = gen_rtx_NE (VOIDmode, temp, const0_rtx); + } + + temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp, + gen_rtx_LABEL_REF (VOIDmode, label), + pc_rtx); + temp = gen_rtx_SET (VOIDmode, pc_rtx, temp); + + emit_jump_insn (temp); + predict_jump (REG_BR_PROB_BASE * 10 / 100); +} + +/* Output code to perform a log1p XFmode calculation. */ + +void ix86_emit_i387_log1p (rtx op0, rtx op1) +{ + rtx label1 = gen_label_rtx (); + rtx label2 = gen_label_rtx (); + + rtx tmp = gen_reg_rtx (XFmode); + rtx tmp2 = gen_reg_rtx (XFmode); + rtx test; + + emit_insn (gen_absxf2 (tmp, op1)); + test = gen_rtx_GE (VOIDmode, tmp, + CONST_DOUBLE_FROM_REAL_VALUE ( + REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode), + XFmode)); + emit_jump_insn (gen_cbranchxf4 (test, XEXP (test, 0), XEXP (test, 1), label1)); + + emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */ + emit_insn (gen_fyl2xp1xf3_i387 (op0, op1, tmp2)); + emit_jump (label2); + + emit_label (label1); + emit_move_insn (tmp, CONST1_RTX (XFmode)); + emit_insn (gen_addxf3 (tmp, op1, tmp)); + emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */ + emit_insn (gen_fyl2xxf3_i387 (op0, tmp, tmp2)); + + emit_label (label2); +} + +/* Output code to perform a Newton-Rhapson approximation of a single precision + floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */ + +void ix86_emit_swdivsf (rtx res, rtx a, rtx b, enum machine_mode mode) +{ + rtx x0, x1, e0, e1, two; + + x0 = gen_reg_rtx (mode); + e0 = gen_reg_rtx (mode); + e1 = gen_reg_rtx (mode); + x1 = gen_reg_rtx (mode); + + two = CONST_DOUBLE_FROM_REAL_VALUE (dconst2, SFmode); + + if (VECTOR_MODE_P (mode)) + two = ix86_build_const_vector (mode, true, two); + + two = force_reg (mode, two); + + /* a / b = a * rcp(b) * (2.0 - b * rcp(b)) */ + + /* x0 = rcp(b) estimate */ + emit_insn (gen_rtx_SET (VOIDmode, x0, + gen_rtx_UNSPEC (mode, gen_rtvec (1, b), + UNSPEC_RCP))); + /* e0 = x0 * a */ + emit_insn (gen_rtx_SET (VOIDmode, e0, + gen_rtx_MULT (mode, x0, a))); + /* e1 = x0 * b */ + emit_insn (gen_rtx_SET (VOIDmode, e1, + gen_rtx_MULT (mode, x0, b))); + /* x1 = 2. - e1 */ + emit_insn (gen_rtx_SET (VOIDmode, x1, + gen_rtx_MINUS (mode, two, e1))); + /* res = e0 * x1 */ + emit_insn (gen_rtx_SET (VOIDmode, res, + gen_rtx_MULT (mode, e0, x1))); +} + +/* Output code to perform a Newton-Rhapson approximation of a + single precision floating point [reciprocal] square root. 
*/ + +void ix86_emit_swsqrtsf (rtx res, rtx a, enum machine_mode mode, + bool recip) +{ + rtx x0, e0, e1, e2, e3, mthree, mhalf; + REAL_VALUE_TYPE r; + + x0 = gen_reg_rtx (mode); + e0 = gen_reg_rtx (mode); + e1 = gen_reg_rtx (mode); + e2 = gen_reg_rtx (mode); + e3 = gen_reg_rtx (mode); + + real_from_integer (&r, VOIDmode, -3, -1, 0); + mthree = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode); + + real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL); + mhalf = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode); + + if (VECTOR_MODE_P (mode)) + { + mthree = ix86_build_const_vector (mode, true, mthree); + mhalf = ix86_build_const_vector (mode, true, mhalf); + } + + /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) + rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */ + + /* x0 = rsqrt(a) estimate */ + emit_insn (gen_rtx_SET (VOIDmode, x0, + gen_rtx_UNSPEC (mode, gen_rtvec (1, a), + UNSPEC_RSQRT))); + + /* If (a == 0.0) Filter out infinity to prevent NaN for sqrt(0.0). */ + if (!recip) + { + rtx zero, mask; + + zero = gen_reg_rtx (mode); + mask = gen_reg_rtx (mode); + + zero = force_reg (mode, CONST0_RTX(mode)); + emit_insn (gen_rtx_SET (VOIDmode, mask, + gen_rtx_NE (mode, zero, a))); + + emit_insn (gen_rtx_SET (VOIDmode, x0, + gen_rtx_AND (mode, x0, mask))); + } + + /* e0 = x0 * a */ + emit_insn (gen_rtx_SET (VOIDmode, e0, + gen_rtx_MULT (mode, x0, a))); + /* e1 = e0 * x0 */ + emit_insn (gen_rtx_SET (VOIDmode, e1, + gen_rtx_MULT (mode, e0, x0))); + + /* e2 = e1 - 3. */ + mthree = force_reg (mode, mthree); + emit_insn (gen_rtx_SET (VOIDmode, e2, + gen_rtx_PLUS (mode, e1, mthree))); + + mhalf = force_reg (mode, mhalf); + if (recip) + /* e3 = -.5 * x0 */ + emit_insn (gen_rtx_SET (VOIDmode, e3, + gen_rtx_MULT (mode, x0, mhalf))); + else + /* e3 = -.5 * e0 */ + emit_insn (gen_rtx_SET (VOIDmode, e3, + gen_rtx_MULT (mode, e0, mhalf))); + /* ret = e2 * e3 */ + emit_insn (gen_rtx_SET (VOIDmode, res, + gen_rtx_MULT (mode, e2, e3))); +} + +/* Solaris implementation of TARGET_ASM_NAMED_SECTION. */ + +static void ATTRIBUTE_UNUSED +i386_solaris_elf_named_section (const char *name, unsigned int flags, + tree decl) +{ + /* With Binutils 2.15, the "@unwind" marker must be specified on + every occurrence of the ".eh_frame" section, not just the first + one. */ + if (TARGET_64BIT + && strcmp (name, ".eh_frame") == 0) + { + fprintf (asm_out_file, "\t.section\t%s,\"%s\",@unwind\n", name, + flags & SECTION_WRITE ? "aw" : "a"); + return; + } + default_elf_asm_named_section (name, flags, decl); +} + +/* Return the mangling of TYPE if it is an extended fundamental type. */ + +static const char * +ix86_mangle_type (const_tree type) +{ + type = TYPE_MAIN_VARIANT (type); + + if (TREE_CODE (type) != VOID_TYPE && TREE_CODE (type) != BOOLEAN_TYPE + && TREE_CODE (type) != INTEGER_TYPE && TREE_CODE (type) != REAL_TYPE) + return NULL; + + switch (TYPE_MODE (type)) + { + case TFmode: + /* __float128 is "g". */ + return "g"; + case XFmode: + /* "long double" or __float80 is "e". */ + return "e"; + default: + return NULL; + } +} + +/* For 32-bit code we can save PIC register setup by using + __stack_chk_fail_local hidden function instead of calling + __stack_chk_fail directly. 64-bit code doesn't need to setup any PIC + register, so it is better to call __stack_chk_fail directly. */ + +static tree +ix86_stack_protect_fail (void) +{ + return TARGET_64BIT + ? 
default_external_stack_protect_fail () + : default_hidden_stack_protect_fail (); +} + +/* Select a format to encode pointers in exception handling data. CODE + is 0 for data, 1 for code labels, 2 for function pointers. GLOBAL is + true if the symbol may be affected by dynamic relocations. + + ??? All x86 object file formats are capable of representing this. + After all, the relocation needed is the same as for the call insn. + Whether or not a particular assembler allows us to enter such, I + guess we'll have to see. */ +int +asm_preferred_eh_data_format (int code, int global) +{ + if (flag_pic) + { + int type = DW_EH_PE_sdata8; + if (!TARGET_64BIT + || ix86_cmodel == CM_SMALL_PIC + || (ix86_cmodel == CM_MEDIUM_PIC && (global || code))) + type = DW_EH_PE_sdata4; + return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type; + } + if (ix86_cmodel == CM_SMALL + || (ix86_cmodel == CM_MEDIUM && code)) + return DW_EH_PE_udata4; + return DW_EH_PE_absptr; +} + +/* Expand copysign from SIGN to the positive value ABS_VALUE + storing in RESULT. If MASK is non-null, it shall be a mask to mask out + the sign-bit. */ +static void +ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask) +{ + enum machine_mode mode = GET_MODE (sign); + rtx sgn = gen_reg_rtx (mode); + if (mask == NULL_RTX) + { + enum machine_mode vmode; + + if (mode == SFmode) + vmode = V4SFmode; + else if (mode == DFmode) + vmode = V2DFmode; + else + vmode = mode; + + mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false); + if (!VECTOR_MODE_P (mode)) + { + /* We need to generate a scalar mode mask in this case. */ + rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx)); + tmp = gen_rtx_VEC_SELECT (mode, mask, tmp); + mask = gen_reg_rtx (mode); + emit_insn (gen_rtx_SET (VOIDmode, mask, tmp)); + } + } + else + mask = gen_rtx_NOT (mode, mask); + emit_insn (gen_rtx_SET (VOIDmode, sgn, + gen_rtx_AND (mode, mask, sign))); + emit_insn (gen_rtx_SET (VOIDmode, result, + gen_rtx_IOR (mode, abs_value, sgn))); +} + +/* Expand fabs (OP0) and return a new rtx that holds the result. The + mask for masking out the sign-bit is stored in *SMASK, if that is + non-null. */ +static rtx +ix86_expand_sse_fabs (rtx op0, rtx *smask) +{ + enum machine_mode vmode, mode = GET_MODE (op0); + rtx xa, mask; + + xa = gen_reg_rtx (mode); + if (mode == SFmode) + vmode = V4SFmode; + else if (mode == DFmode) + vmode = V2DFmode; + else + vmode = mode; + mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true); + if (!VECTOR_MODE_P (mode)) + { + /* We need to generate a scalar mode mask in this case. */ + rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx)); + tmp = gen_rtx_VEC_SELECT (mode, mask, tmp); + mask = gen_reg_rtx (mode); + emit_insn (gen_rtx_SET (VOIDmode, mask, tmp)); + } + emit_insn (gen_rtx_SET (VOIDmode, xa, + gen_rtx_AND (mode, op0, mask))); + + if (smask) + *smask = mask; + + return xa; +} + +/* Expands a comparison of OP0 with OP1 using comparison code CODE, + swapping the operands if SWAP_OPERANDS is true. The expanded + code is a forward jump to a newly created label in case the + comparison is true. The generated label rtx is returned. 
*/ +static rtx +ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1, + bool swap_operands) +{ + rtx label, tmp; + + if (swap_operands) + { + tmp = op0; + op0 = op1; + op1 = tmp; + } + + label = gen_label_rtx (); + tmp = gen_rtx_REG (CCFPUmode, FLAGS_REG); + emit_insn (gen_rtx_SET (VOIDmode, tmp, + gen_rtx_COMPARE (CCFPUmode, op0, op1))); + tmp = gen_rtx_fmt_ee (code, VOIDmode, tmp, const0_rtx); + tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp, + gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx); + tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp)); + JUMP_LABEL (tmp) = label; + + return label; +} + +/* Expand a mask generating SSE comparison instruction comparing OP0 with OP1 + using comparison code CODE. Operands are swapped for the comparison if + SWAP_OPERANDS is true. Returns a rtx for the generated mask. */ +static rtx +ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1, + bool swap_operands) +{ + enum machine_mode mode = GET_MODE (op0); + rtx mask = gen_reg_rtx (mode); + + if (swap_operands) + { + rtx tmp = op0; + op0 = op1; + op1 = tmp; + } + + if (mode == DFmode) + emit_insn (gen_sse2_maskcmpdf3 (mask, op0, op1, + gen_rtx_fmt_ee (code, mode, op0, op1))); + else + emit_insn (gen_sse_maskcmpsf3 (mask, op0, op1, + gen_rtx_fmt_ee (code, mode, op0, op1))); + + return mask; +} + +/* Generate and return a rtx of mode MODE for 2**n where n is the number + of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */ +static rtx +ix86_gen_TWO52 (enum machine_mode mode) +{ + REAL_VALUE_TYPE TWO52r; + rtx TWO52; + + real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23); + TWO52 = const_double_from_real_value (TWO52r, mode); + TWO52 = force_reg (mode, TWO52); + + return TWO52; +} + +/* Expand SSE sequence for computing lround from OP1 storing + into OP0. */ +void +ix86_expand_lround (rtx op0, rtx op1) +{ + /* C code for the stuff we're doing below: + tmp = op1 + copysign (nextafter (0.5, 0.0), op1) + return (long)tmp; + */ + enum machine_mode mode = GET_MODE (op1); + const struct real_format *fmt; + REAL_VALUE_TYPE pred_half, half_minus_pred_half; + rtx adj; + + /* load nextafter (0.5, 0.0) */ + fmt = REAL_MODE_FORMAT (mode); + real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode); + REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half); + + /* adj = copysign (0.5, op1) */ + adj = force_reg (mode, const_double_from_real_value (pred_half, mode)); + ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX); + + /* adj = op1 + adj */ + adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT); + + /* op0 = (imode)adj */ + expand_fix (op0, adj, 0); +} + +/* Expand SSE2 sequence for computing lfloor or lceil from OPERAND1 storing + into OPERAND0. */ +void +ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor) +{ + /* C code for the stuff we're doing below (for do_floor): + xi = (long)op1; + xi -= (double)xi > op1 ? 1 : 0; + return xi; + */ + enum machine_mode fmode = GET_MODE (op1); + enum machine_mode imode = GET_MODE (op0); + rtx ireg, freg, label, tmp; + + /* reg = (long)op1 */ + ireg = gen_reg_rtx (imode); + expand_fix (ireg, op1, 0); + + /* freg = (double)reg */ + freg = gen_reg_rtx (fmode); + expand_float (freg, ireg, 0); + + /* ireg = (freg > op1) ? ireg - 1 : ireg */ + label = ix86_expand_sse_compare_and_jump (UNLE, + freg, op1, !do_floor); + tmp = expand_simple_binop (imode, do_floor ? 
MINUS : PLUS, + ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT); + emit_move_insn (ireg, tmp); + + emit_label (label); + LABEL_NUSES (label) = 1; + + emit_move_insn (op0, ireg); +} + +/* Expand rint (IEEE round to nearest) rounding OPERAND1 and storing the + result in OPERAND0. */ +void +ix86_expand_rint (rtx operand0, rtx operand1) +{ + /* C code for the stuff we're doing below: + xa = fabs (operand1); + if (!isless (xa, 2**52)) + return operand1; + xa = xa + 2**52 - 2**52; + return copysign (xa, operand1); + */ + enum machine_mode mode = GET_MODE (operand0); + rtx res, xa, label, TWO52, mask; + + res = gen_reg_rtx (mode); + emit_move_insn (res, operand1); + + /* xa = abs (operand1) */ + xa = ix86_expand_sse_fabs (res, &mask); + + /* if (!isless (xa, TWO52)) goto label; */ + TWO52 = ix86_gen_TWO52 (mode); + label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false); + + xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT); + xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT); + + ix86_sse_copysign_to_positive (res, xa, res, mask); + + emit_label (label); + LABEL_NUSES (label) = 1; + + emit_move_insn (operand0, res); +} + +/* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing + into OPERAND0. */ +void +ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor) +{ + /* C code for the stuff we expand below. + double xa = fabs (x), x2; + if (!isless (xa, TWO52)) + return x; + xa = xa + TWO52 - TWO52; + x2 = copysign (xa, x); + Compensate. Floor: + if (x2 > x) + x2 -= 1; + Compensate. Ceil: + if (x2 < x) + x2 -= -1; + return x2; + */ + enum machine_mode mode = GET_MODE (operand0); + rtx xa, TWO52, tmp, label, one, res, mask; + + TWO52 = ix86_gen_TWO52 (mode); + + /* Temporary for holding the result, initialized to the input + operand to ease control flow. */ + res = gen_reg_rtx (mode); + emit_move_insn (res, operand1); + + /* xa = abs (operand1) */ + xa = ix86_expand_sse_fabs (res, &mask); + + /* if (!isless (xa, TWO52)) goto label; */ + label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false); + + /* xa = xa + TWO52 - TWO52; */ + xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT); + xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT); + + /* xa = copysign (xa, operand1) */ + ix86_sse_copysign_to_positive (xa, xa, res, mask); + + /* generate 1.0 or -1.0 */ + one = force_reg (mode, + const_double_from_real_value (do_floor + ? dconst1 : dconstm1, mode)); + + /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */ + tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor); + emit_insn (gen_rtx_SET (VOIDmode, tmp, + gen_rtx_AND (mode, one, tmp))); + /* We always need to subtract here to preserve signed zero. */ + tmp = expand_simple_binop (mode, MINUS, + xa, tmp, NULL_RTX, 0, OPTAB_DIRECT); + emit_move_insn (res, tmp); + + emit_label (label); + LABEL_NUSES (label) = 1; + + emit_move_insn (operand0, res); +} + +/* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing + into OPERAND0. */ +void +ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor) +{ + /* C code for the stuff we expand below. + double xa = fabs (x), x2; + if (!isless (xa, TWO52)) + return x; + x2 = (double)(long)x; + Compensate. Floor: + if (x2 > x) + x2 -= 1; + Compensate. 
Ceil: + if (x2 < x) + x2 += 1; + if (HONOR_SIGNED_ZEROS (mode)) + return copysign (x2, x); + return x2; + */ + enum machine_mode mode = GET_MODE (operand0); + rtx xa, xi, TWO52, tmp, label, one, res, mask; + + TWO52 = ix86_gen_TWO52 (mode); + + /* Temporary for holding the result, initialized to the input + operand to ease control flow. */ + res = gen_reg_rtx (mode); + emit_move_insn (res, operand1); + + /* xa = abs (operand1) */ + xa = ix86_expand_sse_fabs (res, &mask); + + /* if (!isless (xa, TWO52)) goto label; */ + label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false); + + /* xa = (double)(long)x */ + xi = gen_reg_rtx (mode == DFmode ? DImode : SImode); + expand_fix (xi, res, 0); + expand_float (xa, xi, 0); + + /* generate 1.0 */ + one = force_reg (mode, const_double_from_real_value (dconst1, mode)); + + /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */ + tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor); + emit_insn (gen_rtx_SET (VOIDmode, tmp, + gen_rtx_AND (mode, one, tmp))); + tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS, + xa, tmp, NULL_RTX, 0, OPTAB_DIRECT); + emit_move_insn (res, tmp); + + if (HONOR_SIGNED_ZEROS (mode)) + ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask); + + emit_label (label); + LABEL_NUSES (label) = 1; + + emit_move_insn (operand0, res); +} + +/* Expand SSE sequence for computing round from OPERAND1 storing + into OPERAND0. Sequence that works without relying on DImode truncation + via cvttsd2siq that is only available on 64bit targets. */ +void +ix86_expand_rounddf_32 (rtx operand0, rtx operand1) +{ + /* C code for the stuff we expand below. + double xa = fabs (x), xa2, x2; + if (!isless (xa, TWO52)) + return x; + Using the absolute value and copying back sign makes + -0.0 -> -0.0 correct. + xa2 = xa + TWO52 - TWO52; + Compensate. + dxa = xa2 - xa; + if (dxa <= -0.5) + xa2 += 1; + else if (dxa > 0.5) + xa2 -= 1; + x2 = copysign (xa2, x); + return x2; + */ + enum machine_mode mode = GET_MODE (operand0); + rtx xa, xa2, dxa, TWO52, tmp, label, half, mhalf, one, res, mask; + + TWO52 = ix86_gen_TWO52 (mode); + + /* Temporary for holding the result, initialized to the input + operand to ease control flow. */ + res = gen_reg_rtx (mode); + emit_move_insn (res, operand1); + + /* xa = abs (operand1) */ + xa = ix86_expand_sse_fabs (res, &mask); + + /* if (!isless (xa, TWO52)) goto label; */ + label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false); + + /* xa2 = xa + TWO52 - TWO52; */ + xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT); + xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT); + + /* dxa = xa2 - xa; */ + dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT); + + /* generate 0.5, 1.0 and -0.5 */ + half = force_reg (mode, const_double_from_real_value (dconsthalf, mode)); + one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT); + mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX, + 0, OPTAB_DIRECT); + + /* Compensate. */ + tmp = gen_reg_rtx (mode); + /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */ + tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false); + emit_insn (gen_rtx_SET (VOIDmode, tmp, + gen_rtx_AND (mode, one, tmp))); + xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT); + /* xa2 = xa2 + (dxa <= -0.5 ? 
1 : 0) */ + tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false); + emit_insn (gen_rtx_SET (VOIDmode, tmp, + gen_rtx_AND (mode, one, tmp))); + xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT); + + /* res = copysign (xa2, operand1) */ + ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask); + + emit_label (label); + LABEL_NUSES (label) = 1; + + emit_move_insn (operand0, res); +} + +/* Expand SSE sequence for computing trunc from OPERAND1 storing + into OPERAND0. */ +void +ix86_expand_trunc (rtx operand0, rtx operand1) +{ + /* C code for SSE variant we expand below. + double xa = fabs (x), x2; + if (!isless (xa, TWO52)) + return x; + x2 = (double)(long)x; + if (HONOR_SIGNED_ZEROS (mode)) + return copysign (x2, x); + return x2; + */ + enum machine_mode mode = GET_MODE (operand0); + rtx xa, xi, TWO52, label, res, mask; + + TWO52 = ix86_gen_TWO52 (mode); + + /* Temporary for holding the result, initialized to the input + operand to ease control flow. */ + res = gen_reg_rtx (mode); + emit_move_insn (res, operand1); + + /* xa = abs (operand1) */ + xa = ix86_expand_sse_fabs (res, &mask); + + /* if (!isless (xa, TWO52)) goto label; */ + label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false); + + /* x = (double)(long)x */ + xi = gen_reg_rtx (mode == DFmode ? DImode : SImode); + expand_fix (xi, res, 0); + expand_float (res, xi, 0); + + if (HONOR_SIGNED_ZEROS (mode)) + ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask); + + emit_label (label); + LABEL_NUSES (label) = 1; + + emit_move_insn (operand0, res); +} + +/* Expand SSE sequence for computing trunc from OPERAND1 storing + into OPERAND0. */ +void +ix86_expand_truncdf_32 (rtx operand0, rtx operand1) +{ + enum machine_mode mode = GET_MODE (operand0); + rtx xa, mask, TWO52, label, one, res, smask, tmp; + + /* C code for SSE variant we expand below. + double xa = fabs (x), x2; + if (!isless (xa, TWO52)) + return x; + xa2 = xa + TWO52 - TWO52; + Compensate: + if (xa2 > xa) + xa2 -= 1.0; + x2 = copysign (xa2, x); + return x2; + */ + + TWO52 = ix86_gen_TWO52 (mode); + + /* Temporary for holding the result, initialized to the input + operand to ease control flow. */ + res = gen_reg_rtx (mode); + emit_move_insn (res, operand1); + + /* xa = abs (operand1) */ + xa = ix86_expand_sse_fabs (res, &smask); + + /* if (!isless (xa, TWO52)) goto label; */ + label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false); + + /* res = xa + TWO52 - TWO52; */ + tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT); + tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT); + emit_move_insn (res, tmp); + + /* generate 1.0 */ + one = force_reg (mode, const_double_from_real_value (dconst1, mode)); + + /* Compensate: res = xa2 - (res > xa ? 1 : 0) */ + mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false); + emit_insn (gen_rtx_SET (VOIDmode, mask, + gen_rtx_AND (mode, mask, one))); + tmp = expand_simple_binop (mode, MINUS, + res, mask, NULL_RTX, 0, OPTAB_DIRECT); + emit_move_insn (res, tmp); + + /* res = copysign (res, operand1) */ + ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask); + + emit_label (label); + LABEL_NUSES (label) = 1; + + emit_move_insn (operand0, res); +} + +/* Expand SSE sequence for computing round from OPERAND1 storing + into OPERAND0. 
*/ +void +ix86_expand_round (rtx operand0, rtx operand1) +{ + /* C code for the stuff we're doing below: + double xa = fabs (x); + if (!isless (xa, TWO52)) + return x; + xa = (double)(long)(xa + nextafter (0.5, 0.0)); + return copysign (xa, x); + */ + enum machine_mode mode = GET_MODE (operand0); + rtx res, TWO52, xa, label, xi, half, mask; + const struct real_format *fmt; + REAL_VALUE_TYPE pred_half, half_minus_pred_half; + + /* Temporary for holding the result, initialized to the input + operand to ease control flow. */ + res = gen_reg_rtx (mode); + emit_move_insn (res, operand1); + + TWO52 = ix86_gen_TWO52 (mode); + xa = ix86_expand_sse_fabs (res, &mask); + label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false); + + /* load nextafter (0.5, 0.0) */ + fmt = REAL_MODE_FORMAT (mode); + real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode); + REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half); + + /* xa = xa + 0.5 */ + half = force_reg (mode, const_double_from_real_value (pred_half, mode)); + xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT); + + /* xa = (double)(int64_t)xa */ + xi = gen_reg_rtx (mode == DFmode ? DImode : SImode); + expand_fix (xi, xa, 0); + expand_float (xa, xi, 0); + + /* res = copysign (xa, operand1) */ + ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask); + + emit_label (label); + LABEL_NUSES (label) = 1; + + emit_move_insn (operand0, res); +} + + +/* Table of valid machine attributes. */ +static const struct attribute_spec ix86_attribute_table[] = +{ + /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler } */ + /* Stdcall attribute says callee is responsible for popping arguments + if they are not variable. */ + { "stdcall", 0, 0, false, true, true, ix86_handle_cconv_attribute }, + /* Fastcall attribute says callee is responsible for popping arguments + if they are not variable. */ + { "fastcall", 0, 0, false, true, true, ix86_handle_cconv_attribute }, + /* Thiscall attribute says callee is responsible for popping arguments + if they are not variable. */ + { "thiscall", 0, 0, false, true, true, ix86_handle_cconv_attribute }, + /* Cdecl attribute says the callee is a normal C declaration */ + { "cdecl", 0, 0, false, true, true, ix86_handle_cconv_attribute }, + /* Regparm attribute specifies how many integer arguments are to be + passed in registers. */ + { "regparm", 1, 1, false, true, true, ix86_handle_cconv_attribute }, + /* Sseregparm attribute says we are using x86_64 calling conventions + for FP arguments. */ + { "sseregparm", 0, 0, false, true, true, ix86_handle_cconv_attribute }, + /* force_align_arg_pointer says this function realigns the stack at entry. */ + { (const char *)&ix86_force_align_arg_pointer_string, 0, 0, + false, true, true, ix86_handle_cconv_attribute }, +#if TARGET_DLLIMPORT_DECL_ATTRIBUTES + { "dllimport", 0, 0, false, false, false, handle_dll_attribute }, + { "dllexport", 0, 0, false, false, false, handle_dll_attribute }, + { "shared", 0, 0, true, false, false, ix86_handle_shared_attribute }, +#endif + { "ms_struct", 0, 0, false, false, false, ix86_handle_struct_attribute }, + { "gcc_struct", 0, 0, false, false, false, ix86_handle_struct_attribute }, +#ifdef SUBTARGET_ATTRIBUTE_TABLE + SUBTARGET_ATTRIBUTE_TABLE, +#endif + /* ms_abi and sysv_abi calling convention function attributes. 
*/ + { "ms_abi", 0, 0, false, true, true, ix86_handle_abi_attribute }, + { "sysv_abi", 0, 0, false, true, true, ix86_handle_abi_attribute }, + { "ms_hook_prologue", 0, 0, true, false, false, ix86_handle_fndecl_attribute }, + { "callee_pop_aggregate_return", 1, 1, false, true, true, + ix86_handle_callee_pop_aggregate_return }, + /* End element. */ + { NULL, 0, 0, false, false, false, NULL } +}; + +/* Implement targetm.vectorize.builtin_vectorization_cost. */ +static int +ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, + tree vectype ATTRIBUTE_UNUSED, + int misalign ATTRIBUTE_UNUSED) +{ + switch (type_of_cost) + { + case scalar_stmt: + return ix86_cost->scalar_stmt_cost; + + case scalar_load: + return ix86_cost->scalar_load_cost; + + case scalar_store: + return ix86_cost->scalar_store_cost; + + case vector_stmt: + return ix86_cost->vec_stmt_cost; + + case vector_load: + return ix86_cost->vec_align_load_cost; + + case vector_store: + return ix86_cost->vec_store_cost; + + case vec_to_scalar: + return ix86_cost->vec_to_scalar_cost; + + case scalar_to_vec: + return ix86_cost->scalar_to_vec_cost; + + case unaligned_load: + case unaligned_store: + return ix86_cost->vec_unalign_load_cost; + + case cond_branch_taken: + return ix86_cost->cond_taken_branch_cost; + + case cond_branch_not_taken: + return ix86_cost->cond_not_taken_branch_cost; + + case vec_perm: + case vec_promote_demote: + return ix86_cost->vec_stmt_cost; + + default: + gcc_unreachable (); + } +} + + +/* Implement targetm.vectorize.builtin_vec_perm. */ + +static tree +ix86_vectorize_builtin_vec_perm (tree vec_type, tree *mask_type) +{ + tree itype = TREE_TYPE (vec_type); + bool u = TYPE_UNSIGNED (itype); + enum machine_mode vmode = TYPE_MODE (vec_type); + enum ix86_builtins fcode; + bool ok = TARGET_SSE2; + + switch (vmode) + { + case V4DFmode: + ok = TARGET_AVX; + fcode = IX86_BUILTIN_VEC_PERM_V4DF; + goto get_di; + case V2DFmode: + fcode = IX86_BUILTIN_VEC_PERM_V2DF; + get_di: + itype = ix86_get_builtin_type (IX86_BT_DI); + break; + + case V8SFmode: + ok = TARGET_AVX; + fcode = IX86_BUILTIN_VEC_PERM_V8SF; + goto get_si; + case V4SFmode: + ok = TARGET_SSE; + fcode = IX86_BUILTIN_VEC_PERM_V4SF; + get_si: + itype = ix86_get_builtin_type (IX86_BT_SI); + break; + + case V2DImode: + fcode = u ? IX86_BUILTIN_VEC_PERM_V2DI_U : IX86_BUILTIN_VEC_PERM_V2DI; + break; + case V4SImode: + fcode = u ? IX86_BUILTIN_VEC_PERM_V4SI_U : IX86_BUILTIN_VEC_PERM_V4SI; + break; + case V8HImode: + fcode = u ? IX86_BUILTIN_VEC_PERM_V8HI_U : IX86_BUILTIN_VEC_PERM_V8HI; + break; + case V16QImode: + fcode = u ? IX86_BUILTIN_VEC_PERM_V16QI_U : IX86_BUILTIN_VEC_PERM_V16QI; + break; + default: + ok = false; + break; + } + + if (!ok) + return NULL_TREE; + + *mask_type = itype; + return ix86_builtins[(int) fcode]; +} + +/* Return a vector mode with twice as many elements as VMODE. */ +/* ??? Consider moving this to a table generated by genmodes.c. 
*/ + +static enum machine_mode +doublesize_vector_mode (enum machine_mode vmode) +{ + switch (vmode) + { + case V2SFmode: return V4SFmode; + case V1DImode: return V2DImode; + case V2SImode: return V4SImode; + case V4HImode: return V8HImode; + case V8QImode: return V16QImode; + + case V2DFmode: return V4DFmode; + case V4SFmode: return V8SFmode; + case V2DImode: return V4DImode; + case V4SImode: return V8SImode; + case V8HImode: return V16HImode; + case V16QImode: return V32QImode; + + case V4DFmode: return V8DFmode; + case V8SFmode: return V16SFmode; + case V4DImode: return V8DImode; + case V8SImode: return V16SImode; + case V16HImode: return V32HImode; + case V32QImode: return V64QImode; + + default: + gcc_unreachable (); + } +} + +/* Construct (set target (vec_select op0 (parallel perm))) and + return true if that's a valid instruction in the active ISA. */ + +static bool +expand_vselect (rtx target, rtx op0, const unsigned char *perm, unsigned nelt) +{ + rtx rperm[MAX_VECT_LEN], x; + unsigned i; + + for (i = 0; i < nelt; ++i) + rperm[i] = GEN_INT (perm[i]); + + x = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (nelt, rperm)); + x = gen_rtx_VEC_SELECT (GET_MODE (target), op0, x); + x = gen_rtx_SET (VOIDmode, target, x); + + x = emit_insn (x); + if (recog_memoized (x) < 0) + { + remove_insn (x); + return false; + } + return true; +} + +/* Similar, but generate a vec_concat from op0 and op1 as well. */ + +static bool +expand_vselect_vconcat (rtx target, rtx op0, rtx op1, + const unsigned char *perm, unsigned nelt) +{ + enum machine_mode v2mode; + rtx x; + + v2mode = doublesize_vector_mode (GET_MODE (op0)); + x = gen_rtx_VEC_CONCAT (v2mode, op0, op1); + return expand_vselect (target, x, perm, nelt); +} + +/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D + in terms of blendp[sd] / pblendw / pblendvb. */ + +static bool +expand_vec_perm_blend (struct expand_vec_perm_d *d) +{ + enum machine_mode vmode = d->vmode; + unsigned i, mask, nelt = d->nelt; + rtx target, op0, op1, x; + + if (!TARGET_SSE4_1 || d->op0 == d->op1) + return false; + if (!(GET_MODE_SIZE (vmode) == 16 || vmode == V4DFmode || vmode == V8SFmode)) + return false; + + /* This is a blend, not a permute. Elements must stay in their + respective lanes. */ + for (i = 0; i < nelt; ++i) + { + unsigned e = d->perm[i]; + if (!(e == i || e == i + nelt)) + return false; + } + + if (d->testing_p) + return true; + + /* ??? Without SSE4.1, we could implement this with and/andn/or. This + decision should be extracted elsewhere, so that we only try that + sequence once all budget==3 options have been tried. */ + + /* For bytes, see if bytes move in pairs so we can use pblendw with + an immediate argument, rather than pblendvb with a vector argument. */ + if (vmode == V16QImode) + { + bool pblendw_ok = true; + for (i = 0; i < 16 && pblendw_ok; i += 2) + pblendw_ok = (d->perm[i] + 1 == d->perm[i + 1]); + + if (!pblendw_ok) + { + rtx rperm[16], vperm; + + for (i = 0; i < nelt; ++i) + rperm[i] = (d->perm[i] < nelt ? 
const0_rtx : constm1_rtx); + + vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm)); + vperm = force_reg (V16QImode, vperm); + + emit_insn (gen_sse4_1_pblendvb (d->target, d->op0, d->op1, vperm)); + return true; + } + } + + target = d->target; + op0 = d->op0; + op1 = d->op1; + mask = 0; + + switch (vmode) + { + case V4DFmode: + case V8SFmode: + case V2DFmode: + case V4SFmode: + case V8HImode: + for (i = 0; i < nelt; ++i) + mask |= (d->perm[i] >= nelt) << i; + break; + + case V2DImode: + for (i = 0; i < 2; ++i) + mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4); + goto do_subreg; + + case V4SImode: + for (i = 0; i < 4; ++i) + mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2); + goto do_subreg; + + case V16QImode: + for (i = 0; i < 8; ++i) + mask |= (d->perm[i * 2] >= 16) << i; + + do_subreg: + vmode = V8HImode; + target = gen_lowpart (vmode, target); + op0 = gen_lowpart (vmode, op0); + op1 = gen_lowpart (vmode, op1); + break; + + default: + gcc_unreachable (); + } + + /* This matches five different patterns with the different modes. */ + x = gen_rtx_VEC_MERGE (vmode, op1, op0, GEN_INT (mask)); + x = gen_rtx_SET (VOIDmode, target, x); + emit_insn (x); + + return true; +} + +/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D + in terms of the variable form of vpermilps. + + Note that we will have already failed the immediate input vpermilps, + which requires that the high and low part shuffle be identical; the + variable form doesn't require that. */ + +static bool +expand_vec_perm_vpermil (struct expand_vec_perm_d *d) +{ + rtx rperm[8], vperm; + unsigned i; + + if (!TARGET_AVX || d->vmode != V8SFmode || d->op0 != d->op1) + return false; + + /* We can only permute within the 128-bit lane. */ + for (i = 0; i < 8; ++i) + { + unsigned e = d->perm[i]; + if (i < 4 ? e >= 4 : e < 4) + return false; + } + + if (d->testing_p) + return true; + + for (i = 0; i < 8; ++i) + { + unsigned e = d->perm[i]; + + /* Within each 128-bit lane, the elements of op0 are numbered + from 0 and the elements of op1 are numbered from 4. */ + if (e >= 8 + 4) + e -= 8; + else if (e >= 4) + e -= 4; + + rperm[i] = GEN_INT (e); + } + + vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm)); + vperm = force_reg (V8SImode, vperm); + emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm)); + + return true; +} + +/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D + in terms of pshufb or vpperm. */ + +static bool +expand_vec_perm_pshufb (struct expand_vec_perm_d *d) +{ + unsigned i, nelt, eltsz; + rtx rperm[16], vperm, target, op0, op1; + + if (!(d->op0 == d->op1 ? TARGET_SSSE3 : TARGET_XOP)) + return false; + if (GET_MODE_SIZE (d->vmode) != 16) + return false; + + if (d->testing_p) + return true; + + nelt = d->nelt; + eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode)); + + for (i = 0; i < nelt; ++i) + { + unsigned j, e = d->perm[i]; + for (j = 0; j < eltsz; ++j) + rperm[i * eltsz + j] = GEN_INT (e * eltsz + j); + } + + vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm)); + vperm = force_reg (V16QImode, vperm); + + target = gen_lowpart (V16QImode, d->target); + op0 = gen_lowpart (V16QImode, d->op0); + if (d->op0 == d->op1) + emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm)); + else + { + op1 = gen_lowpart (V16QImode, d->op1); + emit_insn (gen_xop_pperm (target, op0, op1, vperm)); + } + + return true; +} + +/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D + in a single instruction. 
*/ + +static bool +expand_vec_perm_1 (struct expand_vec_perm_d *d) +{ + unsigned i, nelt = d->nelt; + unsigned char perm2[MAX_VECT_LEN]; + + /* Check plain VEC_SELECT first, because AVX has instructions that could + match both SEL and SEL+CONCAT, but the plain SEL will allow a memory + input where SEL+CONCAT may not. */ + if (d->op0 == d->op1) + { + int mask = nelt - 1; + + for (i = 0; i < nelt; i++) + perm2[i] = d->perm[i] & mask; + + if (expand_vselect (d->target, d->op0, perm2, nelt)) + return true; + + /* There are plenty of patterns in sse.md that are written for + SEL+CONCAT and are not replicated for a single op. Perhaps + that should be changed, to avoid the nastiness here. */ + + /* Recognize interleave style patterns, which means incrementing + every other permutation operand. */ + for (i = 0; i < nelt; i += 2) + { + perm2[i] = d->perm[i] & mask; + perm2[i + 1] = (d->perm[i + 1] & mask) + nelt; + } + if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt)) + return true; + + /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */ + if (nelt >= 4) + { + for (i = 0; i < nelt; i += 4) + { + perm2[i + 0] = d->perm[i + 0] & mask; + perm2[i + 1] = d->perm[i + 1] & mask; + perm2[i + 2] = (d->perm[i + 2] & mask) + nelt; + perm2[i + 3] = (d->perm[i + 3] & mask) + nelt; + } + + if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt)) + return true; + } + } + + /* Finally, try the fully general two operand permute. */ + if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt)) + return true; + + /* Recognize interleave style patterns with reversed operands. */ + if (d->op0 != d->op1) + { + for (i = 0; i < nelt; ++i) + { + unsigned e = d->perm[i]; + if (e >= nelt) + e -= nelt; + else + e += nelt; + perm2[i] = e; + } + + if (expand_vselect_vconcat (d->target, d->op1, d->op0, perm2, nelt)) + return true; + } + + /* Try the SSE4.1 blend variable merge instructions. */ + if (expand_vec_perm_blend (d)) + return true; + + /* Try one of the AVX vpermil variable permutations. */ + if (expand_vec_perm_vpermil (d)) + return true; + + /* Try the SSSE3 pshufb or XOP vpperm variable permutation. */ + if (expand_vec_perm_pshufb (d)) + return true; + + return false; +} + +/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D + in terms of a pair of pshuflw + pshufhw instructions. */ + +static bool +expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d) +{ + unsigned char perm2[MAX_VECT_LEN]; + unsigned i; + bool ok; + + if (d->vmode != V8HImode || d->op0 != d->op1) + return false; + + /* The two permutations only operate in 64-bit lanes. */ + for (i = 0; i < 4; ++i) + if (d->perm[i] >= 4) + return false; + for (i = 4; i < 8; ++i) + if (d->perm[i] < 4) + return false; + + if (d->testing_p) + return true; + + /* Emit the pshuflw. */ + memcpy (perm2, d->perm, 4); + for (i = 4; i < 8; ++i) + perm2[i] = i; + ok = expand_vselect (d->target, d->op0, perm2, 8); + gcc_assert (ok); + + /* Emit the pshufhw. */ + memcpy (perm2 + 4, d->perm + 4, 4); + for (i = 0; i < 4; ++i) + perm2[i] = i; + ok = expand_vselect (d->target, d->target, perm2, 8); + gcc_assert (ok); + + return true; +} + +/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify + the permutation using the SSSE3 palignr instruction. This succeeds + when all of the elements in PERM fit within one vector and we merely + need to shift them down so that a single vector permutation has a + chance to succeed. 
*/ + +static bool +expand_vec_perm_palignr (struct expand_vec_perm_d *d) +{ + unsigned i, nelt = d->nelt; + unsigned min, max; + bool in_order, ok; + rtx shift; + + /* Even with AVX, palignr only operates on 128-bit vectors. */ + if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16) + return false; + + min = nelt, max = 0; + for (i = 0; i < nelt; ++i) + { + unsigned e = d->perm[i]; + if (e < min) + min = e; + if (e > max) + max = e; + } + if (min == 0 || max - min >= nelt) + return false; + + /* Given that we have SSSE3, we know we'll be able to implement the + single operand permutation after the palignr with pshufb. */ + if (d->testing_p) + return true; + + shift = GEN_INT (min * GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode))); + emit_insn (gen_ssse3_palignrti (gen_lowpart (TImode, d->target), + gen_lowpart (TImode, d->op1), + gen_lowpart (TImode, d->op0), shift)); + + d->op0 = d->op1 = d->target; + + in_order = true; + for (i = 0; i < nelt; ++i) + { + unsigned e = d->perm[i] - min; + if (e != i) + in_order = false; + d->perm[i] = e; + } + + /* Test for the degenerate case where the alignment by itself + produces the desired permutation. */ + if (in_order) + return true; + + ok = expand_vec_perm_1 (d); + gcc_assert (ok); + + return ok; +} + +/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify + a two vector permutation into a single vector permutation by using + an interleave operation to merge the vectors. */ + +static bool +expand_vec_perm_interleave2 (struct expand_vec_perm_d *d) +{ + struct expand_vec_perm_d dremap, dfinal; + unsigned i, nelt = d->nelt, nelt2 = nelt / 2; + unsigned contents, h1, h2, h3, h4; + unsigned char remap[2 * MAX_VECT_LEN]; + rtx seq; + bool ok; + + if (d->op0 == d->op1) + return false; + + /* The 256-bit unpck[lh]p[sd] instructions only operate within the 128-bit + lanes. We can use similar techniques with the vperm2f128 instruction, + but it requires slightly different logic. */ + if (GET_MODE_SIZE (d->vmode) != 16) + return false; + + /* Examine from whence the elements come. */ + contents = 0; + for (i = 0; i < nelt; ++i) + contents |= 1u << d->perm[i]; + + /* Split the two input vectors into 4 halves. */ + h1 = (1u << nelt2) - 1; + h2 = h1 << nelt2; + h3 = h2 << nelt2; + h4 = h3 << nelt2; + + memset (remap, 0xff, sizeof (remap)); + dremap = *d; + + /* If the elements from the low halves use interleave low, and similarly + for interleave high. If the elements are from mis-matched halves, we + can use shufps for V4SF/V4SI or do a DImode shuffle. 
*/ + if ((contents & (h1 | h3)) == contents) + { + for (i = 0; i < nelt2; ++i) + { + remap[i] = i * 2; + remap[i + nelt] = i * 2 + 1; + dremap.perm[i * 2] = i; + dremap.perm[i * 2 + 1] = i + nelt; + } + } + else if ((contents & (h2 | h4)) == contents) + { + for (i = 0; i < nelt2; ++i) + { + remap[i + nelt2] = i * 2; + remap[i + nelt + nelt2] = i * 2 + 1; + dremap.perm[i * 2] = i + nelt2; + dremap.perm[i * 2 + 1] = i + nelt + nelt2; + } + } + else if ((contents & (h1 | h4)) == contents) + { + for (i = 0; i < nelt2; ++i) + { + remap[i] = i; + remap[i + nelt + nelt2] = i + nelt2; + dremap.perm[i] = i; + dremap.perm[i + nelt2] = i + nelt + nelt2; + } + if (nelt != 4) + { + dremap.vmode = V2DImode; + dremap.nelt = 2; + dremap.perm[0] = 0; + dremap.perm[1] = 3; + } + } + else if ((contents & (h2 | h3)) == contents) + { + for (i = 0; i < nelt2; ++i) + { + remap[i + nelt2] = i; + remap[i + nelt] = i + nelt2; + dremap.perm[i] = i + nelt2; + dremap.perm[i + nelt2] = i + nelt; + } + if (nelt != 4) + { + dremap.vmode = V2DImode; + dremap.nelt = 2; + dremap.perm[0] = 1; + dremap.perm[1] = 2; + } + } + else + return false; + + /* Use the remapping array set up above to move the elements from their + swizzled locations into their final destinations. */ + dfinal = *d; + for (i = 0; i < nelt; ++i) + { + unsigned e = remap[d->perm[i]]; + gcc_assert (e < nelt); + dfinal.perm[i] = e; + } + dfinal.op0 = gen_reg_rtx (dfinal.vmode); + dfinal.op1 = dfinal.op0; + dremap.target = dfinal.op0; + + /* Test if the final remap can be done with a single insn. For V4SFmode or + V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */ + start_sequence (); + ok = expand_vec_perm_1 (&dfinal); + seq = get_insns (); + end_sequence (); + + if (!ok) + return false; + + if (dremap.vmode != dfinal.vmode) + { + dremap.target = gen_lowpart (dremap.vmode, dremap.target); + dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0); + dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1); + } + + ok = expand_vec_perm_1 (&dremap); + gcc_assert (ok); + + emit_insn (seq); + return true; +} + +/* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word + permutation with two pshufb insns and an ior. We should have already + failed all two instruction sequences. */ + +static bool +expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d) +{ + rtx rperm[2][16], vperm, l, h, op, m128; + unsigned int i, nelt, eltsz; + + if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16) + return false; + gcc_assert (d->op0 != d->op1); + + nelt = d->nelt; + eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode)); + + /* Generate two permutation masks. If the required element is within + the given vector it is shuffled into the proper lane. If the required + element is in the other vector, force a zero into the lane by setting + bit 7 in the permutation mask. 
*/ + m128 = GEN_INT (-128); + for (i = 0; i < nelt; ++i) + { + unsigned j, e = d->perm[i]; + unsigned which = (e >= nelt); + if (e >= nelt) + e -= nelt; + + for (j = 0; j < eltsz; ++j) + { + rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j); + rperm[1-which][i*eltsz + j] = m128; + } + } + + vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0])); + vperm = force_reg (V16QImode, vperm); + + l = gen_reg_rtx (V16QImode); + op = gen_lowpart (V16QImode, d->op0); + emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm)); + + vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1])); + vperm = force_reg (V16QImode, vperm); + + h = gen_reg_rtx (V16QImode); + op = gen_lowpart (V16QImode, d->op1); + emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm)); + + op = gen_lowpart (V16QImode, d->target); + emit_insn (gen_iorv16qi3 (op, l, h)); + + return true; +} + +/* A subroutine of ix86_expand_vec_perm_builtin_1. Implement extract-even + and extract-odd permutations. */ + +static bool +expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd) +{ + rtx t1, t2, t3; + + switch (d->vmode) + { + case V4DFmode: + t1 = gen_reg_rtx (V4DFmode); + t2 = gen_reg_rtx (V4DFmode); + + /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */ + emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20))); + emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31))); + + /* Now an unpck[lh]pd will produce the result required. */ + if (odd) + t3 = gen_avx_unpckhpd256 (d->target, t1, t2); + else + t3 = gen_avx_unpcklpd256 (d->target, t1, t2); + emit_insn (t3); + break; + + case V8SFmode: + { + int mask = odd ? 0xdd : 0x88; + + t1 = gen_reg_rtx (V8SFmode); + t2 = gen_reg_rtx (V8SFmode); + t3 = gen_reg_rtx (V8SFmode); + + /* Shuffle within the 128-bit lanes to produce: + { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */ + emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1, + GEN_INT (mask))); + + /* Shuffle the lanes around to produce: + { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */ + emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1, + GEN_INT (0x3))); + + /* Shuffle within the 128-bit lanes to produce: + { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */ + emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44))); + + /* Shuffle within the 128-bit lanes to produce: + { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */ + emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee))); + + /* Shuffle the lanes around to produce: + { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */ + emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2, + GEN_INT (0x20))); + } + break; + + case V2DFmode: + case V4SFmode: + case V2DImode: + case V4SImode: + /* These are always directly implementable by expand_vec_perm_1. */ + gcc_unreachable (); + + case V8HImode: + if (TARGET_SSSE3) + return expand_vec_perm_pshufb2 (d); + else + { + /* We need 2*log2(N)-1 operations to achieve odd/even + with interleave. 
*/ + t1 = gen_reg_rtx (V8HImode); + t2 = gen_reg_rtx (V8HImode); + emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1)); + emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1)); + emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1)); + emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1)); + if (odd) + t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2); + else + t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2); + emit_insn (t3); + } + break; + + case V16QImode: + if (TARGET_SSSE3) + return expand_vec_perm_pshufb2 (d); + else + { + t1 = gen_reg_rtx (V16QImode); + t2 = gen_reg_rtx (V16QImode); + t3 = gen_reg_rtx (V16QImode); + emit_insn (gen_vec_interleave_highv16qi (t1, d->op0, d->op1)); + emit_insn (gen_vec_interleave_lowv16qi (d->target, d->op0, d->op1)); + emit_insn (gen_vec_interleave_highv16qi (t2, d->target, t1)); + emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t1)); + emit_insn (gen_vec_interleave_highv16qi (t3, d->target, t2)); + emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t2)); + if (odd) + t3 = gen_vec_interleave_highv16qi (d->target, d->target, t3); + else + t3 = gen_vec_interleave_lowv16qi (d->target, d->target, t3); + emit_insn (t3); + } + break; + + default: + gcc_unreachable (); + } + + return true; +} + +/* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match + extract-even and extract-odd permutations. */ + +static bool +expand_vec_perm_even_odd (struct expand_vec_perm_d *d) +{ + unsigned i, odd, nelt = d->nelt; + + odd = d->perm[0]; + if (odd != 0 && odd != 1) + return false; + + for (i = 1; i < nelt; ++i) + if (d->perm[i] != 2 * i + odd) + return false; + + return expand_vec_perm_even_odd_1 (d, odd); +} + +/* A subroutine of ix86_expand_vec_perm_builtin_1. Implement broadcast + permutations. We assume that expand_vec_perm_1 has already failed. */ + +static bool +expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d) +{ + unsigned elt = d->perm[0], nelt2 = d->nelt / 2; + enum machine_mode vmode = d->vmode; + unsigned char perm2[4]; + rtx op0 = d->op0; + bool ok; + + switch (vmode) + { + case V4DFmode: + case V8SFmode: + /* These are special-cased in sse.md so that we can optionally + use the vbroadcast instruction. They expand to two insns + if the input happens to be in a register. */ + gcc_unreachable (); + + case V2DFmode: + case V2DImode: + case V4SFmode: + case V4SImode: + /* These are always implementable using standard shuffle patterns. */ + gcc_unreachable (); + + case V8HImode: + case V16QImode: + /* These can be implemented via interleave. We save one insn by + stopping once we have promoted to V4SImode and then use pshufd. */ + do + { + optab otab = vec_interleave_low_optab; + + if (elt >= nelt2) + { + otab = vec_interleave_high_optab; + elt -= nelt2; + } + nelt2 /= 2; + + op0 = expand_binop (vmode, otab, op0, op0, NULL, 0, OPTAB_DIRECT); + vmode = get_mode_wider_vector (vmode); + op0 = gen_lowpart (vmode, op0); + } + while (vmode != V4SImode); + + memset (perm2, elt, 4); + ok = expand_vselect (gen_lowpart (V4SImode, d->target), op0, perm2, 4); + gcc_assert (ok); + return true; + + default: + gcc_unreachable (); + } +} + +/* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match + broadcast permutations. 
*/ + +static bool +expand_vec_perm_broadcast (struct expand_vec_perm_d *d) +{ + unsigned i, elt, nelt = d->nelt; + + if (d->op0 != d->op1) + return false; + + elt = d->perm[0]; + for (i = 1; i < nelt; ++i) + if (d->perm[i] != elt) + return false; + + return expand_vec_perm_broadcast_1 (d); +} + +/* The guts of ix86_expand_vec_perm_builtin, also used by the ok hook. + With all of the interface bits taken care of, perform the expansion + in D and return true on success. */ + +static bool +ix86_expand_vec_perm_builtin_1 (struct expand_vec_perm_d *d) +{ + /* Try a single instruction expansion. */ + if (expand_vec_perm_1 (d)) + return true; + + /* Try sequences of two instructions. */ + + if (expand_vec_perm_pshuflw_pshufhw (d)) + return true; + + if (expand_vec_perm_palignr (d)) + return true; + + if (expand_vec_perm_interleave2 (d)) + return true; + + if (expand_vec_perm_broadcast (d)) + return true; + + /* Try sequences of three instructions. */ + + if (expand_vec_perm_pshufb2 (d)) + return true; + + /* ??? Look for narrow permutations whose element orderings would + allow the promotion to a wider mode. */ + + /* ??? Look for sequences of interleave or a wider permute that place + the data into the correct lanes for a half-vector shuffle like + pshuf[lh]w or vpermilps. */ + + /* ??? Look for sequences of interleave that produce the desired results. + The combinatorics of punpck[lh] get pretty ugly... */ + + if (expand_vec_perm_even_odd (d)) + return true; + + return false; +} + +/* Extract the values from the vector CST into the permutation array in D. + Return 0 on error, 1 if all values from the permutation come from the + first vector, 2 if all values from the second vector, and 3 otherwise. */ + +static int +extract_vec_perm_cst (struct expand_vec_perm_d *d, tree cst) +{ + tree list = TREE_VECTOR_CST_ELTS (cst); + unsigned i, nelt = d->nelt; + int ret = 0; + + for (i = 0; i < nelt; ++i, list = TREE_CHAIN (list)) + { + unsigned HOST_WIDE_INT e; + + if (!host_integerp (TREE_VALUE (list), 1)) + return 0; + e = tree_low_cst (TREE_VALUE (list), 1); + if (e >= 2 * nelt) + return 0; + + ret |= (e < nelt ? 1 : 2); + d->perm[i] = e; + } + gcc_assert (list == NULL); + + /* For all elements from second vector, fold the elements to first. */ + if (ret == 2) + for (i = 0; i < nelt; ++i) + d->perm[i] -= nelt; + + return ret; +} + +static rtx +ix86_expand_vec_perm_builtin (tree exp) +{ + struct expand_vec_perm_d d; + tree arg0, arg1, arg2; + + arg0 = CALL_EXPR_ARG (exp, 0); + arg1 = CALL_EXPR_ARG (exp, 1); + arg2 = CALL_EXPR_ARG (exp, 2); + + d.vmode = TYPE_MODE (TREE_TYPE (arg0)); + d.nelt = GET_MODE_NUNITS (d.vmode); + d.testing_p = false; + gcc_assert (VECTOR_MODE_P (d.vmode)); + + if (TREE_CODE (arg2) != VECTOR_CST) + { + error_at (EXPR_LOCATION (exp), + "vector permutation requires vector constant"); + goto exit_error; + } + + switch (extract_vec_perm_cst (&d, arg2)) + { + default: + gcc_unreachable(); + + case 0: + error_at (EXPR_LOCATION (exp), "invalid vector permutation constant"); + goto exit_error; + + case 3: + if (!operand_equal_p (arg0, arg1, 0)) + { + d.op0 = expand_expr (arg0, NULL_RTX, d.vmode, EXPAND_NORMAL); + d.op0 = force_reg (d.vmode, d.op0); + d.op1 = expand_expr (arg1, NULL_RTX, d.vmode, EXPAND_NORMAL); + d.op1 = force_reg (d.vmode, d.op1); + break; + } + + /* The elements of PERM do not suggest that only the first operand + is used, but both operands are identical. Allow easier matching + of the permutation by folding the permutation into the single + input vector. 
*/ + { + unsigned i, nelt = d.nelt; + for (i = 0; i < nelt; ++i) + if (d.perm[i] >= nelt) + d.perm[i] -= nelt; + } + /* FALLTHRU */ + + case 1: + d.op0 = expand_expr (arg0, NULL_RTX, d.vmode, EXPAND_NORMAL); + d.op0 = force_reg (d.vmode, d.op0); + d.op1 = d.op0; + break; + + case 2: + d.op0 = expand_expr (arg1, NULL_RTX, d.vmode, EXPAND_NORMAL); + d.op0 = force_reg (d.vmode, d.op0); + d.op1 = d.op0; + break; + } + + d.target = gen_reg_rtx (d.vmode); + if (ix86_expand_vec_perm_builtin_1 (&d)) + return d.target; + + /* For compiler generated permutations, we should never get here, because + the compiler should also be checking the ok hook. But since this is a + builtin the user has access to, don't abort. */ + switch (d.nelt) + { + case 2: + sorry ("vector permutation (%d %d)", d.perm[0], d.perm[1]); + break; + case 4: + sorry ("vector permutation (%d %d %d %d)", + d.perm[0], d.perm[1], d.perm[2], d.perm[3]); + break; + case 8: + sorry ("vector permutation (%d %d %d %d %d %d %d %d)", + d.perm[0], d.perm[1], d.perm[2], d.perm[3], + d.perm[4], d.perm[5], d.perm[6], d.perm[7]); + break; + case 16: + sorry ("vector permutation " + "(%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d)", + d.perm[0], d.perm[1], d.perm[2], d.perm[3], + d.perm[4], d.perm[5], d.perm[6], d.perm[7], + d.perm[8], d.perm[9], d.perm[10], d.perm[11], + d.perm[12], d.perm[13], d.perm[14], d.perm[15]); + break; + default: + gcc_unreachable (); + } + exit_error: + return CONST0_RTX (d.vmode); +} + +/* Implement targetm.vectorize.builtin_vec_perm_ok. */ + +static bool +ix86_vectorize_builtin_vec_perm_ok (tree vec_type, tree mask) +{ + struct expand_vec_perm_d d; + int vec_mask; + bool ret, one_vec; + + d.vmode = TYPE_MODE (vec_type); + d.nelt = GET_MODE_NUNITS (d.vmode); + d.testing_p = true; + + /* Given sufficient ISA support we can just return true here + for selected vector modes. */ + if (GET_MODE_SIZE (d.vmode) == 16) + { + /* All implementable with a single vpperm insn. */ + if (TARGET_XOP) + return true; + /* All implementable with 2 pshufb + 1 ior. */ + if (TARGET_SSSE3) + return true; + /* All implementable with shufpd or unpck[lh]pd. */ + if (d.nelt == 2) + return true; + } + + vec_mask = extract_vec_perm_cst (&d, mask); + + /* This hook cannot be called in response to something that the + user does (unlike the builtin expander) so we shouldn't ever see + an error generated from the extract. */ + gcc_assert (vec_mask > 0 && vec_mask <= 3); + one_vec = (vec_mask != 3); + + /* Implementable with shufps or pshufd. */ + if (one_vec && (d.vmode == V4SFmode || d.vmode == V4SImode)) + return true; + + /* Otherwise we have to go through the motions and see if we can + figure out how to generate the requested permutation. */ + d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1); + d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2); + if (!one_vec) + d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3); + + start_sequence (); + ret = ix86_expand_vec_perm_builtin_1 (&d); + end_sequence (); + + return ret; +} + +void +ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd) +{ + struct expand_vec_perm_d d; + unsigned i, nelt; + + d.target = targ; + d.op0 = op0; + d.op1 = op1; + d.vmode = GET_MODE (targ); + d.nelt = nelt = GET_MODE_NUNITS (d.vmode); + d.testing_p = false; + + for (i = 0; i < nelt; ++i) + d.perm[i] = i * 2 + odd; + + /* We'll either be able to implement the permutation directly... */ + if (expand_vec_perm_1 (&d)) + return; + + /* ... 
or we use the special-case patterns. */ + expand_vec_perm_even_odd_1 (&d, odd); +} + +/* This function returns the calling abi specific va_list type node. + It returns the FNDECL specific va_list type. */ + +static tree +ix86_fn_abi_va_list (tree fndecl) +{ + if (!TARGET_64BIT) + return va_list_type_node; + gcc_assert (fndecl != NULL_TREE); + + if (ix86_function_abi ((const_tree) fndecl) == MS_ABI) + return ms_va_list_type_node; + else + return sysv_va_list_type_node; +} + +/* Returns the canonical va_list type specified by TYPE. If there + is no valid TYPE provided, it return NULL_TREE. */ + +static tree +ix86_canonical_va_list_type (tree type) +{ + tree wtype, htype; + + /* Resolve references and pointers to va_list type. */ + if (TREE_CODE (type) == MEM_REF) + type = TREE_TYPE (type); + else if (POINTER_TYPE_P (type) && POINTER_TYPE_P (TREE_TYPE(type))) + type = TREE_TYPE (type); + else if (POINTER_TYPE_P (type) && TREE_CODE (TREE_TYPE (type)) == ARRAY_TYPE) + type = TREE_TYPE (type); + + if (TARGET_64BIT && va_list_type_node != NULL_TREE) + { + wtype = va_list_type_node; + gcc_assert (wtype != NULL_TREE); + htype = type; + if (TREE_CODE (wtype) == ARRAY_TYPE) + { + /* If va_list is an array type, the argument may have decayed + to a pointer type, e.g. by being passed to another function. + In that case, unwrap both types so that we can compare the + underlying records. */ + if (TREE_CODE (htype) == ARRAY_TYPE + || POINTER_TYPE_P (htype)) + { + wtype = TREE_TYPE (wtype); + htype = TREE_TYPE (htype); + } + } + if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype)) + return va_list_type_node; + wtype = sysv_va_list_type_node; + gcc_assert (wtype != NULL_TREE); + htype = type; + if (TREE_CODE (wtype) == ARRAY_TYPE) + { + /* If va_list is an array type, the argument may have decayed + to a pointer type, e.g. by being passed to another function. + In that case, unwrap both types so that we can compare the + underlying records. */ + if (TREE_CODE (htype) == ARRAY_TYPE + || POINTER_TYPE_P (htype)) + { + wtype = TREE_TYPE (wtype); + htype = TREE_TYPE (htype); + } + } + if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype)) + return sysv_va_list_type_node; + wtype = ms_va_list_type_node; + gcc_assert (wtype != NULL_TREE); + htype = type; + if (TREE_CODE (wtype) == ARRAY_TYPE) + { + /* If va_list is an array type, the argument may have decayed + to a pointer type, e.g. by being passed to another function. + In that case, unwrap both types so that we can compare the + underlying records. */ + if (TREE_CODE (htype) == ARRAY_TYPE + || POINTER_TYPE_P (htype)) + { + wtype = TREE_TYPE (wtype); + htype = TREE_TYPE (htype); + } + } + if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype)) + return ms_va_list_type_node; + return NULL_TREE; + } + return std_canonical_va_list_type (type); +} + +/* Iterate through the target-specific builtin types for va_list. + IDX denotes the iterator, *PTREE is set to the result type of + the va_list builtin, and *PNAME to its internal type. + Returns zero if there is no element for this index, otherwise + IDX should be increased upon the next call. + Note, do not iterate a base builtin's name like __builtin_va_list. + Used from c_common_nodes_and_builtins. 
*/ + +static int +ix86_enum_va_list (int idx, const char **pname, tree *ptree) +{ + if (TARGET_64BIT) + { + switch (idx) + { + default: + break; + + case 0: + *ptree = ms_va_list_type_node; + *pname = "__builtin_ms_va_list"; + return 1; + + case 1: + *ptree = sysv_va_list_type_node; + *pname = "__builtin_sysv_va_list"; + return 1; + } + } + + return 0; +} + +#undef TARGET_SCHED_DISPATCH +#define TARGET_SCHED_DISPATCH has_dispatch +#undef TARGET_SCHED_DISPATCH_DO +#define TARGET_SCHED_DISPATCH_DO do_dispatch + +/* The size of the dispatch window is the total number of bytes of + object code allowed in a window. */ +#define DISPATCH_WINDOW_SIZE 16 + +/* Number of dispatch windows considered for scheduling. */ +#define MAX_DISPATCH_WINDOWS 3 + +/* Maximum number of instructions in a window. */ +#define MAX_INSN 4 + +/* Maximum number of immediate operands in a window. */ +#define MAX_IMM 4 + +/* Maximum number of immediate bits allowed in a window. */ +#define MAX_IMM_SIZE 128 + +/* Maximum number of 32 bit immediates allowed in a window. */ +#define MAX_IMM_32 4 + +/* Maximum number of 64 bit immediates allowed in a window. */ +#define MAX_IMM_64 2 + +/* Maximum total of loads or prefetches allowed in a window. */ +#define MAX_LOAD 2 + +/* Maximum total of stores allowed in a window. */ +#define MAX_STORE 1 + +#undef BIG +#define BIG 100 + + +/* Dispatch groups. Istructions that affect the mix in a dispatch window. */ +enum dispatch_group { + disp_no_group = 0, + disp_load, + disp_store, + disp_load_store, + disp_prefetch, + disp_imm, + disp_imm_32, + disp_imm_64, + disp_branch, + disp_cmp, + disp_jcc, + disp_last +}; + +/* Number of allowable groups in a dispatch window. It is an array + indexed by dispatch_group enum. 100 is used as a big number, + because the number of these kind of operations does not have any + effect in dispatch window, but we need them for other reasons in + the table. */ +static unsigned int num_allowable_groups[disp_last] = { + 0, 2, 1, 1, 2, 4, 4, 2, 1, BIG, BIG +}; + +char group_name[disp_last + 1][16] = { + "disp_no_group", "disp_load", "disp_store", "disp_load_store", + "disp_prefetch", "disp_imm", "disp_imm_32", "disp_imm_64", + "disp_branch", "disp_cmp", "disp_jcc", "disp_last" +}; + +/* Instruction path. */ +enum insn_path { + no_path = 0, + path_single, /* Single micro op. */ + path_double, /* Double micro op. */ + path_multi, /* Instructions with more than 2 micro op.. */ + last_path +}; + +/* sched_insn_info defines a window to the instructions scheduled in + the basic block. It contains a pointer to the insn_info table and + the instruction scheduled. + + Windows are allocated for each basic block and are linked + together. */ +typedef struct sched_insn_info_s { + rtx insn; + enum dispatch_group group; + enum insn_path path; + int byte_len; + int imm_bytes; +} sched_insn_info; + +/* Linked list of dispatch windows. This is a two way list of + dispatch windows of a basic block. It contains information about + the number of uops in the window and the total number of + instructions and of bytes in the object code for this dispatch + window. */ +typedef struct dispatch_windows_s { + int num_insn; /* Number of insn in the window. */ + int num_uops; /* Number of uops in the window. */ + int window_size; /* Number of bytes in the window. */ + int window_num; /* Window number between 0 or 1. */ + int num_imm; /* Number of immediates in an insn. */ + int num_imm_32; /* Number of 32 bit immediates in an insn. 
*/ + int num_imm_64; /* Number of 64 bit immediates in an insn. */ + int imm_size; /* Total immediates in the window. */ + int num_loads; /* Total memory loads in the window. */ + int num_stores; /* Total memory stores in the window. */ + int violation; /* Violation exists in window. */ + sched_insn_info *window; /* Pointer to the window. */ + struct dispatch_windows_s *next; + struct dispatch_windows_s *prev; +} dispatch_windows; + +/* Immediate valuse used in an insn. */ +typedef struct imm_info_s + { + int imm; + int imm32; + int imm64; + } imm_info; + +static dispatch_windows *dispatch_window_list; +static dispatch_windows *dispatch_window_list1; + +/* Get dispatch group of insn. */ + +static enum dispatch_group +get_mem_group (rtx insn) +{ + enum attr_memory memory; + + if (INSN_CODE (insn) < 0) + return disp_no_group; + memory = get_attr_memory (insn); + if (memory == MEMORY_STORE) + return disp_store; + + if (memory == MEMORY_LOAD) + return disp_load; + + if (memory == MEMORY_BOTH) + return disp_load_store; + + return disp_no_group; +} + +/* Return true if insn is a compare instruction. */ + +static bool +is_cmp (rtx insn) +{ + enum attr_type type; + + type = get_attr_type (insn); + return (type == TYPE_TEST + || type == TYPE_ICMP + || type == TYPE_FCMP + || GET_CODE (PATTERN (insn)) == COMPARE); +} + +/* Return true if a dispatch violation encountered. */ + +static bool +dispatch_violation (void) +{ + if (dispatch_window_list->next) + return dispatch_window_list->next->violation; + return dispatch_window_list->violation; +} + +/* Return true if insn is a branch instruction. */ + +static bool +is_branch (rtx insn) +{ + return (CALL_P (insn) || JUMP_P (insn)); +} + +/* Return true if insn is a prefetch instruction. */ + +static bool +is_prefetch (rtx insn) +{ + return NONJUMP_INSN_P (insn) && GET_CODE (PATTERN (insn)) == PREFETCH; +} + +/* This function initializes a dispatch window and the list container holding a + pointer to the window. */ + +static void +init_window (int window_num) +{ + int i; + dispatch_windows *new_list; + + if (window_num == 0) + new_list = dispatch_window_list; + else + new_list = dispatch_window_list1; + + new_list->num_insn = 0; + new_list->num_uops = 0; + new_list->window_size = 0; + new_list->next = NULL; + new_list->prev = NULL; + new_list->window_num = window_num; + new_list->num_imm = 0; + new_list->num_imm_32 = 0; + new_list->num_imm_64 = 0; + new_list->imm_size = 0; + new_list->num_loads = 0; + new_list->num_stores = 0; + new_list->violation = false; + + for (i = 0; i < MAX_INSN; i++) + { + new_list->window[i].insn = NULL; + new_list->window[i].group = disp_no_group; + new_list->window[i].path = no_path; + new_list->window[i].byte_len = 0; + new_list->window[i].imm_bytes = 0; + } + return; +} + +/* This function allocates and initializes a dispatch window and the + list container holding a pointer to the window. */ + +static dispatch_windows * +allocate_window (void) +{ + dispatch_windows *new_list = XNEW (struct dispatch_windows_s); + new_list->window = XNEWVEC (struct sched_insn_info_s, MAX_INSN + 1); + + return new_list; +} + +/* This routine initializes the dispatch scheduling information. It + initiates building dispatch scheduler tables and constructs the + first dispatch window. */ + +static void +init_dispatch_sched (void) +{ + /* Allocate a dispatch list and a window. 
*/ + dispatch_window_list = allocate_window (); + dispatch_window_list1 = allocate_window (); + init_window (0); + init_window (1); +} + +/* This function returns true if a branch is detected. End of a basic block + does not have to be a branch, but here we assume only branches end a + window. */ + +static bool +is_end_basic_block (enum dispatch_group group) +{ + return group == disp_branch; +} + +/* This function is called when the end of a window processing is reached. */ + +static void +process_end_window (void) +{ + gcc_assert (dispatch_window_list->num_insn <= MAX_INSN); + if (dispatch_window_list->next) + { + gcc_assert (dispatch_window_list1->num_insn <= MAX_INSN); + gcc_assert (dispatch_window_list->window_size + + dispatch_window_list1->window_size <= 48); + init_window (1); + } + init_window (0); +} + +/* Allocates a new dispatch window and adds it to WINDOW_LIST. + WINDOW_NUM is either 0 or 1. A maximum of two windows are generated + for 48 bytes of instructions. Note that these windows are not dispatch + windows that their sizes are DISPATCH_WINDOW_SIZE. */ + +static dispatch_windows * +allocate_next_window (int window_num) +{ + if (window_num == 0) + { + if (dispatch_window_list->next) + init_window (1); + init_window (0); + return dispatch_window_list; + } + + dispatch_window_list->next = dispatch_window_list1; + dispatch_window_list1->prev = dispatch_window_list; + + return dispatch_window_list1; +} + +/* Increment the number of immediate operands of an instruction. */ + +static int +find_constant_1 (rtx *in_rtx, imm_info *imm_values) +{ + if (*in_rtx == 0) + return 0; + + switch ( GET_CODE (*in_rtx)) + { + case CONST: + case SYMBOL_REF: + case CONST_INT: + (imm_values->imm)++; + if (x86_64_immediate_operand (*in_rtx, SImode)) + (imm_values->imm32)++; + else + (imm_values->imm64)++; + break; + + case CONST_DOUBLE: + (imm_values->imm)++; + (imm_values->imm64)++; + break; + + case CODE_LABEL: + if (LABEL_KIND (*in_rtx) == LABEL_NORMAL) + { + (imm_values->imm)++; + (imm_values->imm32)++; + } + break; + + default: + break; + } + + return 0; +} + +/* Compute number of immediate operands of an instruction. */ + +static void +find_constant (rtx in_rtx, imm_info *imm_values) +{ + for_each_rtx (INSN_P (in_rtx) ? &PATTERN (in_rtx) : &in_rtx, + (rtx_function) find_constant_1, (void *) imm_values); +} + +/* Return total size of immediate operands of an instruction along with number + of corresponding immediate-operands. It initializes its parameters to zero + befor calling FIND_CONSTANT. + INSN is the input instruction. IMM is the total of immediates. + IMM32 is the number of 32 bit immediates. IMM64 is the number of 64 + bit immediates. */ + +static int +get_num_immediates (rtx insn, int *imm, int *imm32, int *imm64) +{ + imm_info imm_values = {0, 0, 0}; + + find_constant (insn, &imm_values); + *imm = imm_values.imm; + *imm32 = imm_values.imm32; + *imm64 = imm_values.imm64; + return imm_values.imm32 * 4 + imm_values.imm64 * 8; +} + +/* This function indicates if an operand of an instruction is an + immediate. */ + +static bool +has_immediate (rtx insn) +{ + int num_imm_operand; + int num_imm32_operand; + int num_imm64_operand; + + if (insn) + return get_num_immediates (insn, &num_imm_operand, &num_imm32_operand, + &num_imm64_operand); + return false; +} + +/* Return single or double path for instructions. 
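/* Illustrative sketch, not part of the imported file: how the immediate
   counters above combine.  The helper name is hypothetical; it simply wraps
   get_num_immediates as defined above.  */

static int
example_imm_bytes (rtx insn)
{
  int imm, imm32, imm64;
  int size = get_num_immediates (insn, &imm, &imm32, &imm64);

  /* For an insn carrying one 32-bit and one 64-bit immediate the counts
     come back as imm == 2, imm32 == 1, imm64 == 1, and the returned size
     is 1*4 + 1*8 == 12 bytes.  */
  return size;
}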
*/ + +static enum insn_path +get_insn_path (rtx insn) +{ + enum attr_amdfam10_decode path = get_attr_amdfam10_decode (insn); + + if ((int)path == 0) + return path_single; + + if ((int)path == 1) + return path_double; + + return path_multi; +} + +/* Return insn dispatch group. */ + +static enum dispatch_group +get_insn_group (rtx insn) +{ + enum dispatch_group group = get_mem_group (insn); + if (group) + return group; + + if (is_branch (insn)) + return disp_branch; + + if (is_cmp (insn)) + return disp_cmp; + + if (has_immediate (insn)) + return disp_imm; + + if (is_prefetch (insn)) + return disp_prefetch; + + return disp_no_group; +} + +/* Count number of GROUP restricted instructions in a dispatch + window WINDOW_LIST. */ + +static int +count_num_restricted (rtx insn, dispatch_windows *window_list) +{ + enum dispatch_group group = get_insn_group (insn); + int imm_size; + int num_imm_operand; + int num_imm32_operand; + int num_imm64_operand; + + if (group == disp_no_group) + return 0; + + if (group == disp_imm) + { + imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand, + &num_imm64_operand); + if (window_list->imm_size + imm_size > MAX_IMM_SIZE + || num_imm_operand + window_list->num_imm > MAX_IMM + || (num_imm32_operand > 0 + && (window_list->num_imm_32 + num_imm32_operand > MAX_IMM_32 + || window_list->num_imm_64 * 2 + num_imm32_operand > MAX_IMM_32)) + || (num_imm64_operand > 0 + && (window_list->num_imm_64 + num_imm64_operand > MAX_IMM_64 + || window_list->num_imm_32 + num_imm64_operand * 2 > MAX_IMM_32)) + || (window_list->imm_size + imm_size == MAX_IMM_SIZE + && num_imm64_operand > 0 + && ((window_list->num_imm_64 > 0 + && window_list->num_insn >= 2) + || window_list->num_insn >= 3))) + return BIG; + + return 1; + } + + if ((group == disp_load_store + && (window_list->num_loads >= MAX_LOAD + || window_list->num_stores >= MAX_STORE)) + || ((group == disp_load + || group == disp_prefetch) + && window_list->num_loads >= MAX_LOAD) + || (group == disp_store + && window_list->num_stores >= MAX_STORE)) + return BIG; + + return 1; +} + +/* This function returns true if insn satisfies dispatch rules on the + last window scheduled. */ + +static bool +fits_dispatch_window (rtx insn) +{ + dispatch_windows *window_list = dispatch_window_list; + dispatch_windows *window_list_next = dispatch_window_list->next; + unsigned int num_restrict; + enum dispatch_group group = get_insn_group (insn); + enum insn_path path = get_insn_path (insn); + int sum; + + /* Make disp_cmp and disp_jcc get scheduled at the latest. These + instructions should be given the lowest priority in the + scheduling process in Haifa scheduler to make sure they will be + scheduled in the same dispatch window as the refrence to them. */ + if (group == disp_jcc || group == disp_cmp) + return false; + + /* Check nonrestricted. */ + if (group == disp_no_group || group == disp_branch) + return true; + + /* Get last dispatch window. */ + if (window_list_next) + window_list = window_list_next; + + if (window_list->window_num == 1) + { + sum = window_list->prev->window_size + window_list->window_size; + + if (sum == 32 + || (min_insn_size (insn) + sum) >= 48) + /* Window 1 is full. Go for next window. */ + return true; + } + + num_restrict = count_num_restricted (insn, window_list); + + if (num_restrict > num_allowable_groups[group]) + return false; + + /* See if it fits in the first window. */ + if (window_list->window_num == 0) + { + /* The first widow should have only single and double path + uops. 
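/* Worked reading of the immediate budget enforced by count_num_restricted
   above, using the limits defined earlier (MAX_IMM == 4, MAX_IMM_32 == 4,
   MAX_IMM_64 == 2): a 64-bit immediate is charged as two 32-bit slots, so a
   window already holding one 64-bit immediate still accepts an insn that
   brings two 32-bit immediates (1*2 + 2 == 4), while an insn bringing three
   trips the check (1*2 + 3 > MAX_IMM_32), count_num_restricted returns BIG,
   and fits_dispatch_window rejects the insn.  (Illustration only, not part
   of the imported file.)  */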
*/ + if (path == path_double + && (window_list->num_uops + 2) > MAX_INSN) + return false; + else if (path != path_single) + return false; + } + return true; +} + +/* Add an instruction INSN with NUM_UOPS micro-operations to the + dispatch window WINDOW_LIST. */ + +static void +add_insn_window (rtx insn, dispatch_windows *window_list, int num_uops) +{ + int byte_len = min_insn_size (insn); + int num_insn = window_list->num_insn; + int imm_size; + sched_insn_info *window = window_list->window; + enum dispatch_group group = get_insn_group (insn); + enum insn_path path = get_insn_path (insn); + int num_imm_operand; + int num_imm32_operand; + int num_imm64_operand; + + if (!window_list->violation && group != disp_cmp + && !fits_dispatch_window (insn)) + window_list->violation = true; + + imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand, + &num_imm64_operand); + + /* Initialize window with new instruction. */ + window[num_insn].insn = insn; + window[num_insn].byte_len = byte_len; + window[num_insn].group = group; + window[num_insn].path = path; + window[num_insn].imm_bytes = imm_size; + + window_list->window_size += byte_len; + window_list->num_insn = num_insn + 1; + window_list->num_uops = window_list->num_uops + num_uops; + window_list->imm_size += imm_size; + window_list->num_imm += num_imm_operand; + window_list->num_imm_32 += num_imm32_operand; + window_list->num_imm_64 += num_imm64_operand; + + if (group == disp_store) + window_list->num_stores += 1; + else if (group == disp_load + || group == disp_prefetch) + window_list->num_loads += 1; + else if (group == disp_load_store) + { + window_list->num_stores += 1; + window_list->num_loads += 1; + } +} + +/* Adds a scheduled instruction, INSN, to the current dispatch window. + If the total bytes of instructions or the number of instructions in + the window exceed allowable, it allocates a new window. */ + +static void +add_to_dispatch_window (rtx insn) +{ + int byte_len; + dispatch_windows *window_list; + dispatch_windows *next_list; + dispatch_windows *window0_list; + enum insn_path path; + enum dispatch_group insn_group; + bool insn_fits; + int num_insn; + int num_uops; + int window_num; + int insn_num_uops; + int sum; + + if (INSN_CODE (insn) < 0) + return; + + byte_len = min_insn_size (insn); + window_list = dispatch_window_list; + next_list = window_list->next; + path = get_insn_path (insn); + insn_group = get_insn_group (insn); + + /* Get the last dispatch window. */ + if (next_list) + window_list = dispatch_window_list->next; + + if (path == path_single) + insn_num_uops = 1; + else if (path == path_double) + insn_num_uops = 2; + else + insn_num_uops = (int) path; + + /* If current window is full, get a new window. + Window number zero is full, if MAX_INSN uops are scheduled in it. + Window number one is full, if window zero's bytes plus window + one's bytes is 32, or if the bytes of the new instruction added + to the total makes it greater than 48, or it has already MAX_INSN + instructions in it. 
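/* Illustrative restatement of the fullness rule described above, not part
   of the imported file; the helper name is hypothetical and the real test
   is open-coded just below.  */

static bool
example_window1_full_p (int w0_bytes, int w1_bytes, int insn_bytes)
{
  int sum = w0_bytes + w1_bytes;

  /* The window pair is closed either when the two windows already hold
     32 bytes between them, or when adding the incoming insn would bring
     the total to 48 bytes or more.  */
  return sum == 32 || insn_bytes + sum >= 48;
}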
*/ + num_insn = window_list->num_insn; + num_uops = window_list->num_uops; + window_num = window_list->window_num; + insn_fits = fits_dispatch_window (insn); + + if (num_insn >= MAX_INSN + || num_uops + insn_num_uops > MAX_INSN + || !(insn_fits)) + { + window_num = ~window_num & 1; + window_list = allocate_next_window (window_num); + } + + if (window_num == 0) + { + add_insn_window (insn, window_list, insn_num_uops); + if (window_list->num_insn >= MAX_INSN + && insn_group == disp_branch) + { + process_end_window (); + return; + } + } + else if (window_num == 1) + { + window0_list = window_list->prev; + sum = window0_list->window_size + window_list->window_size; + if (sum == 32 + || (byte_len + sum) >= 48) + { + process_end_window (); + window_list = dispatch_window_list; + } + + add_insn_window (insn, window_list, insn_num_uops); + } + else + gcc_unreachable (); + + if (is_end_basic_block (insn_group)) + { + /* End of basic block is reached do end-basic-block process. */ + process_end_window (); + return; + } +} + +/* Print the dispatch window, WINDOW_NUM, to FILE. */ + +DEBUG_FUNCTION static void +debug_dispatch_window_file (FILE *file, int window_num) +{ + dispatch_windows *list; + int i; + + if (window_num == 0) + list = dispatch_window_list; + else + list = dispatch_window_list1; + + fprintf (file, "Window #%d:\n", list->window_num); + fprintf (file, " num_insn = %d, num_uops = %d, window_size = %d\n", + list->num_insn, list->num_uops, list->window_size); + fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n", + list->num_imm, list->num_imm_32, list->num_imm_64, list->imm_size); + + fprintf (file, " num_loads = %d, num_stores = %d\n", list->num_loads, + list->num_stores); + fprintf (file, " insn info:\n"); + + for (i = 0; i < MAX_INSN; i++) + { + if (!list->window[i].insn) + break; + fprintf (file, " group[%d] = %s, insn[%d] = %p, path[%d] = %d byte_len[%d] = %d, imm_bytes[%d] = %d\n", + i, group_name[list->window[i].group], + i, (void *)list->window[i].insn, + i, list->window[i].path, + i, list->window[i].byte_len, + i, list->window[i].imm_bytes); + } +} + +/* Print to stdout a dispatch window. */ + +DEBUG_FUNCTION void +debug_dispatch_window (int window_num) +{ + debug_dispatch_window_file (stdout, window_num); +} + +/* Print INSN dispatch information to FILE. */ + +DEBUG_FUNCTION static void +debug_insn_dispatch_info_file (FILE *file, rtx insn) +{ + int byte_len; + enum insn_path path; + enum dispatch_group group; + int imm_size; + int num_imm_operand; + int num_imm32_operand; + int num_imm64_operand; + + if (INSN_CODE (insn) < 0) + return; + + byte_len = min_insn_size (insn); + path = get_insn_path (insn); + group = get_insn_group (insn); + imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand, + &num_imm64_operand); + + fprintf (file, " insn info:\n"); + fprintf (file, " group = %s, path = %d, byte_len = %d\n", + group_name[group], path, byte_len); + fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n", + num_imm_operand, num_imm32_operand, num_imm64_operand, imm_size); +} + +/* Print to STDERR the status of the ready list with respect to + dispatch windows. */ + +DEBUG_FUNCTION void +debug_ready_dispatch (void) +{ + int i; + int no_ready = number_in_ready (); + + fprintf (stdout, "Number of ready: %d\n", no_ready); + + for (i = 0; i < no_ready; i++) + debug_insn_dispatch_info_file (stdout, get_ready_element (i)); +} + +/* This routine is the driver of the dispatch scheduler. 
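/* Usage note (an assumption about intended use, not taken from the patch):
   the DEBUG_FUNCTION entry points above are meant to be called by hand from
   a debugger while stepping through the scheduler, for example

       (gdb) call debug_dispatch_window (0)
       (gdb) call debug_ready_dispatch ()

   to dump the current window contents and the classification of the ready
   list.  */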
*/ + +static void +do_dispatch (rtx insn, int mode) +{ + if (mode == DISPATCH_INIT) + init_dispatch_sched (); + else if (mode == ADD_TO_DISPATCH_WINDOW) + add_to_dispatch_window (insn); +} + +/* Return TRUE if Dispatch Scheduling is supported. */ + +static bool +has_dispatch (rtx insn, int action) +{ + if (ix86_tune == PROCESSOR_BDVER1 && flag_dispatch_scheduler) + switch (action) + { + default: + return false; + + case IS_DISPATCH_ON: + return true; + break; + + case IS_CMP: + return is_cmp (insn); + + case DISPATCH_VIOLATION: + return dispatch_violation (); + + case FITS_DISPATCH_WINDOW: + return fits_dispatch_window (insn); + } + + return false; +} + +/* ??? No autovectorization into MMX or 3DNOW until we can reliably + place emms and femms instructions. */ + +static enum machine_mode +ix86_preferred_simd_mode (enum machine_mode mode) +{ + /* Disable double precision vectorizer if needed. */ + if (mode == DFmode && !TARGET_VECTORIZE_DOUBLE) + return word_mode; + + if (!TARGET_AVX && !TARGET_SSE) + return word_mode; + + switch (mode) + { + case SFmode: + return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V8SFmode : V4SFmode; + case DFmode: + return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V4DFmode : V2DFmode; + case DImode: + return V2DImode; + case SImode: + return V4SImode; + case HImode: + return V8HImode; + case QImode: + return V16QImode; + + default:; + } + + return word_mode; +} + +/* If AVX is enabled then try vectorizing with both 256bit and 128bit + vectors. */ + +static unsigned int +ix86_autovectorize_vector_sizes (void) +{ + return (TARGET_AVX && !TARGET_PREFER_AVX128) ? 32 | 16 : 0; +} + +/* Initialize the GCC target structure. */ +#undef TARGET_RETURN_IN_MEMORY +#define TARGET_RETURN_IN_MEMORY ix86_return_in_memory + +#undef TARGET_LEGITIMIZE_ADDRESS +#define TARGET_LEGITIMIZE_ADDRESS ix86_legitimize_address + +#undef TARGET_ATTRIBUTE_TABLE +#define TARGET_ATTRIBUTE_TABLE ix86_attribute_table +#if TARGET_DLLIMPORT_DECL_ATTRIBUTES +# undef TARGET_MERGE_DECL_ATTRIBUTES +# define TARGET_MERGE_DECL_ATTRIBUTES merge_dllimport_decl_attributes +#endif + +#undef TARGET_COMP_TYPE_ATTRIBUTES +#define TARGET_COMP_TYPE_ATTRIBUTES ix86_comp_type_attributes + +#undef TARGET_INIT_BUILTINS +#define TARGET_INIT_BUILTINS ix86_init_builtins +#undef TARGET_BUILTIN_DECL +#define TARGET_BUILTIN_DECL ix86_builtin_decl +#undef TARGET_EXPAND_BUILTIN +#define TARGET_EXPAND_BUILTIN ix86_expand_builtin + +#undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION +#define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \ + ix86_builtin_vectorized_function + +#undef TARGET_VECTORIZE_BUILTIN_CONVERSION +#define TARGET_VECTORIZE_BUILTIN_CONVERSION ix86_vectorize_builtin_conversion + +#undef TARGET_BUILTIN_RECIPROCAL +#define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal + +#undef TARGET_ASM_FUNCTION_EPILOGUE +#define TARGET_ASM_FUNCTION_EPILOGUE ix86_output_function_epilogue + +#undef TARGET_ENCODE_SECTION_INFO +#ifndef SUBTARGET_ENCODE_SECTION_INFO +#define TARGET_ENCODE_SECTION_INFO ix86_encode_section_info +#else +#define TARGET_ENCODE_SECTION_INFO SUBTARGET_ENCODE_SECTION_INFO +#endif + +#undef TARGET_ASM_OPEN_PAREN +#define TARGET_ASM_OPEN_PAREN "" +#undef TARGET_ASM_CLOSE_PAREN +#define TARGET_ASM_CLOSE_PAREN "" + +#undef TARGET_ASM_BYTE_OP +#define TARGET_ASM_BYTE_OP ASM_BYTE + +#undef TARGET_ASM_ALIGNED_HI_OP +#define TARGET_ASM_ALIGNED_HI_OP ASM_SHORT +#undef TARGET_ASM_ALIGNED_SI_OP +#define TARGET_ASM_ALIGNED_SI_OP ASM_LONG +#ifdef ASM_QUAD +#undef TARGET_ASM_ALIGNED_DI_OP +#define 
TARGET_ASM_ALIGNED_DI_OP ASM_QUAD +#endif + +#undef TARGET_PROFILE_BEFORE_PROLOGUE +#define TARGET_PROFILE_BEFORE_PROLOGUE ix86_profile_before_prologue + +#undef TARGET_ASM_UNALIGNED_HI_OP +#define TARGET_ASM_UNALIGNED_HI_OP TARGET_ASM_ALIGNED_HI_OP +#undef TARGET_ASM_UNALIGNED_SI_OP +#define TARGET_ASM_UNALIGNED_SI_OP TARGET_ASM_ALIGNED_SI_OP +#undef TARGET_ASM_UNALIGNED_DI_OP +#define TARGET_ASM_UNALIGNED_DI_OP TARGET_ASM_ALIGNED_DI_OP + +#undef TARGET_PRINT_OPERAND +#define TARGET_PRINT_OPERAND ix86_print_operand +#undef TARGET_PRINT_OPERAND_ADDRESS +#define TARGET_PRINT_OPERAND_ADDRESS ix86_print_operand_address +#undef TARGET_PRINT_OPERAND_PUNCT_VALID_P +#define TARGET_PRINT_OPERAND_PUNCT_VALID_P ix86_print_operand_punct_valid_p +#undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA +#define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA i386_asm_output_addr_const_extra + +#undef TARGET_SCHED_INIT_GLOBAL +#define TARGET_SCHED_INIT_GLOBAL ix86_sched_init_global +#undef TARGET_SCHED_ADJUST_COST +#define TARGET_SCHED_ADJUST_COST ix86_adjust_cost +#undef TARGET_SCHED_ISSUE_RATE +#define TARGET_SCHED_ISSUE_RATE ix86_issue_rate +#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD +#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \ + ia32_multipass_dfa_lookahead + +#undef TARGET_FUNCTION_OK_FOR_SIBCALL +#define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall + +#ifdef HAVE_AS_TLS +#undef TARGET_HAVE_TLS +#define TARGET_HAVE_TLS true +#endif +#undef TARGET_CANNOT_FORCE_CONST_MEM +#define TARGET_CANNOT_FORCE_CONST_MEM ix86_cannot_force_const_mem +#undef TARGET_USE_BLOCKS_FOR_CONSTANT_P +#define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true + +#undef TARGET_DELEGITIMIZE_ADDRESS +#define TARGET_DELEGITIMIZE_ADDRESS ix86_delegitimize_address + +#undef TARGET_MS_BITFIELD_LAYOUT_P +#define TARGET_MS_BITFIELD_LAYOUT_P ix86_ms_bitfield_layout_p + +#if TARGET_MACHO +#undef TARGET_BINDS_LOCAL_P +#define TARGET_BINDS_LOCAL_P darwin_binds_local_p +#endif +#if TARGET_DLLIMPORT_DECL_ATTRIBUTES +#undef TARGET_BINDS_LOCAL_P +#define TARGET_BINDS_LOCAL_P i386_pe_binds_local_p +#endif + +#undef TARGET_ASM_OUTPUT_MI_THUNK +#define TARGET_ASM_OUTPUT_MI_THUNK x86_output_mi_thunk +#undef TARGET_ASM_CAN_OUTPUT_MI_THUNK +#define TARGET_ASM_CAN_OUTPUT_MI_THUNK x86_can_output_mi_thunk + +#undef TARGET_ASM_FILE_START +#define TARGET_ASM_FILE_START x86_file_start + +#undef TARGET_DEFAULT_TARGET_FLAGS +#define TARGET_DEFAULT_TARGET_FLAGS \ + (TARGET_DEFAULT \ + | TARGET_SUBTARGET_DEFAULT \ + | TARGET_TLS_DIRECT_SEG_REFS_DEFAULT) + +#undef TARGET_HANDLE_OPTION +#define TARGET_HANDLE_OPTION ix86_handle_option + +#undef TARGET_OPTION_OVERRIDE +#define TARGET_OPTION_OVERRIDE ix86_option_override +#undef TARGET_OPTION_OPTIMIZATION_TABLE +#define TARGET_OPTION_OPTIMIZATION_TABLE ix86_option_optimization_table +#undef TARGET_OPTION_INIT_STRUCT +#define TARGET_OPTION_INIT_STRUCT ix86_option_init_struct + +#undef TARGET_REGISTER_MOVE_COST +#define TARGET_REGISTER_MOVE_COST ix86_register_move_cost +#undef TARGET_MEMORY_MOVE_COST +#define TARGET_MEMORY_MOVE_COST ix86_memory_move_cost +#undef TARGET_RTX_COSTS +#define TARGET_RTX_COSTS ix86_rtx_costs +#undef TARGET_ADDRESS_COST +#define TARGET_ADDRESS_COST ix86_address_cost + +#undef TARGET_FIXED_CONDITION_CODE_REGS +#define TARGET_FIXED_CONDITION_CODE_REGS ix86_fixed_condition_code_regs +#undef TARGET_CC_MODES_COMPATIBLE +#define TARGET_CC_MODES_COMPATIBLE ix86_cc_modes_compatible + +#undef TARGET_MACHINE_DEPENDENT_REORG +#define TARGET_MACHINE_DEPENDENT_REORG 
ix86_reorg + +#undef TARGET_BUILTIN_SETJMP_FRAME_VALUE +#define TARGET_BUILTIN_SETJMP_FRAME_VALUE ix86_builtin_setjmp_frame_value + +#undef TARGET_BUILD_BUILTIN_VA_LIST +#define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list + +#undef TARGET_ENUM_VA_LIST_P +#define TARGET_ENUM_VA_LIST_P ix86_enum_va_list + +#undef TARGET_FN_ABI_VA_LIST +#define TARGET_FN_ABI_VA_LIST ix86_fn_abi_va_list + +#undef TARGET_CANONICAL_VA_LIST_TYPE +#define TARGET_CANONICAL_VA_LIST_TYPE ix86_canonical_va_list_type + +#undef TARGET_EXPAND_BUILTIN_VA_START +#define TARGET_EXPAND_BUILTIN_VA_START ix86_va_start + +#undef TARGET_MD_ASM_CLOBBERS +#define TARGET_MD_ASM_CLOBBERS ix86_md_asm_clobbers + +#undef TARGET_PROMOTE_PROTOTYPES +#define TARGET_PROMOTE_PROTOTYPES hook_bool_const_tree_true +#undef TARGET_STRUCT_VALUE_RTX +#define TARGET_STRUCT_VALUE_RTX ix86_struct_value_rtx +#undef TARGET_SETUP_INCOMING_VARARGS +#define TARGET_SETUP_INCOMING_VARARGS ix86_setup_incoming_varargs +#undef TARGET_MUST_PASS_IN_STACK +#define TARGET_MUST_PASS_IN_STACK ix86_must_pass_in_stack +#undef TARGET_FUNCTION_ARG_ADVANCE +#define TARGET_FUNCTION_ARG_ADVANCE ix86_function_arg_advance +#undef TARGET_FUNCTION_ARG +#define TARGET_FUNCTION_ARG ix86_function_arg +#undef TARGET_FUNCTION_ARG_BOUNDARY +#define TARGET_FUNCTION_ARG_BOUNDARY ix86_function_arg_boundary +#undef TARGET_PASS_BY_REFERENCE +#define TARGET_PASS_BY_REFERENCE ix86_pass_by_reference +#undef TARGET_INTERNAL_ARG_POINTER +#define TARGET_INTERNAL_ARG_POINTER ix86_internal_arg_pointer +#undef TARGET_UPDATE_STACK_BOUNDARY +#define TARGET_UPDATE_STACK_BOUNDARY ix86_update_stack_boundary +#undef TARGET_GET_DRAP_RTX +#define TARGET_GET_DRAP_RTX ix86_get_drap_rtx +#undef TARGET_STRICT_ARGUMENT_NAMING +#define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true +#undef TARGET_STATIC_CHAIN +#define TARGET_STATIC_CHAIN ix86_static_chain +#undef TARGET_TRAMPOLINE_INIT +#define TARGET_TRAMPOLINE_INIT ix86_trampoline_init +#undef TARGET_RETURN_POPS_ARGS +#define TARGET_RETURN_POPS_ARGS ix86_return_pops_args + +#undef TARGET_GIMPLIFY_VA_ARG_EXPR +#define TARGET_GIMPLIFY_VA_ARG_EXPR ix86_gimplify_va_arg + +#undef TARGET_SCALAR_MODE_SUPPORTED_P +#define TARGET_SCALAR_MODE_SUPPORTED_P ix86_scalar_mode_supported_p + +#undef TARGET_VECTOR_MODE_SUPPORTED_P +#define TARGET_VECTOR_MODE_SUPPORTED_P ix86_vector_mode_supported_p + +#undef TARGET_C_MODE_FOR_SUFFIX +#define TARGET_C_MODE_FOR_SUFFIX ix86_c_mode_for_suffix + +#ifdef HAVE_AS_TLS +#undef TARGET_ASM_OUTPUT_DWARF_DTPREL +#define TARGET_ASM_OUTPUT_DWARF_DTPREL i386_output_dwarf_dtprel +#endif + +#ifdef SUBTARGET_INSERT_ATTRIBUTES +#undef TARGET_INSERT_ATTRIBUTES +#define TARGET_INSERT_ATTRIBUTES SUBTARGET_INSERT_ATTRIBUTES +#endif + +#undef TARGET_MANGLE_TYPE +#define TARGET_MANGLE_TYPE ix86_mangle_type + +#undef TARGET_STACK_PROTECT_FAIL +#define TARGET_STACK_PROTECT_FAIL ix86_stack_protect_fail + +#undef TARGET_SUPPORTS_SPLIT_STACK +#define TARGET_SUPPORTS_SPLIT_STACK ix86_supports_split_stack + +#undef TARGET_FUNCTION_VALUE +#define TARGET_FUNCTION_VALUE ix86_function_value + +#undef TARGET_FUNCTION_VALUE_REGNO_P +#define TARGET_FUNCTION_VALUE_REGNO_P ix86_function_value_regno_p + +#undef TARGET_SECONDARY_RELOAD +#define TARGET_SECONDARY_RELOAD ix86_secondary_reload + +#undef TARGET_PREFERRED_RELOAD_CLASS +#define TARGET_PREFERRED_RELOAD_CLASS ix86_preferred_reload_class +#undef TARGET_PREFERRED_OUTPUT_RELOAD_CLASS +#define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class +#undef 
TARGET_CLASS_LIKELY_SPILLED_P +#define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p + +#undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST +#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \ + ix86_builtin_vectorization_cost +#undef TARGET_VECTORIZE_BUILTIN_VEC_PERM +#define TARGET_VECTORIZE_BUILTIN_VEC_PERM \ + ix86_vectorize_builtin_vec_perm +#undef TARGET_VECTORIZE_BUILTIN_VEC_PERM_OK +#define TARGET_VECTORIZE_BUILTIN_VEC_PERM_OK \ + ix86_vectorize_builtin_vec_perm_ok +#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE +#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \ + ix86_preferred_simd_mode +#undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES +#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \ + ix86_autovectorize_vector_sizes + +#undef TARGET_SET_CURRENT_FUNCTION +#define TARGET_SET_CURRENT_FUNCTION ix86_set_current_function + +#undef TARGET_OPTION_VALID_ATTRIBUTE_P +#define TARGET_OPTION_VALID_ATTRIBUTE_P ix86_valid_target_attribute_p + +#undef TARGET_OPTION_SAVE +#define TARGET_OPTION_SAVE ix86_function_specific_save + +#undef TARGET_OPTION_RESTORE +#define TARGET_OPTION_RESTORE ix86_function_specific_restore + +#undef TARGET_OPTION_PRINT +#define TARGET_OPTION_PRINT ix86_function_specific_print + +#undef TARGET_CAN_INLINE_P +#define TARGET_CAN_INLINE_P ix86_can_inline_p + +#undef TARGET_EXPAND_TO_RTL_HOOK +#define TARGET_EXPAND_TO_RTL_HOOK ix86_maybe_switch_abi + +#undef TARGET_LEGITIMATE_ADDRESS_P +#define TARGET_LEGITIMATE_ADDRESS_P ix86_legitimate_address_p + +#undef TARGET_IRA_COVER_CLASSES +#define TARGET_IRA_COVER_CLASSES i386_ira_cover_classes + +#undef TARGET_FRAME_POINTER_REQUIRED +#define TARGET_FRAME_POINTER_REQUIRED ix86_frame_pointer_required + +#undef TARGET_CAN_ELIMINATE +#define TARGET_CAN_ELIMINATE ix86_can_eliminate + +#undef TARGET_EXTRA_LIVE_ON_ENTRY +#define TARGET_EXTRA_LIVE_ON_ENTRY ix86_live_on_entry + +#undef TARGET_ASM_CODE_END +#define TARGET_ASM_CODE_END ix86_code_end + +#undef TARGET_CONDITIONAL_REGISTER_USAGE +#define TARGET_CONDITIONAL_REGISTER_USAGE ix86_conditional_register_usage + +#if TARGET_MACHO +#undef TARGET_INIT_LIBFUNCS +#define TARGET_INIT_LIBFUNCS darwin_rename_builtins +#endif + +struct gcc_target targetm = TARGET_INITIALIZER; + +#include "gt-i386.h" diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h new file mode 100644 index 000000000..bb23674d7 --- /dev/null +++ b/gcc/config/i386/i386.h @@ -0,0 +1,2400 @@ +/* Definitions of target machine for GCC for IA-32. + Copyright (C) 1988, 1992, 1994, 1995, 1996, 1997, 1998, 1999, 2000, + 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 + Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +Under Section 7 of GPL version 3, you are granted additional +permissions described in the GCC Runtime Library Exception, version +3.1, as published by the Free Software Foundation. + +You should have received a copy of the GNU General Public License and +a copy of the GCC Runtime Library Exception along with this program; +see the files COPYING3 and COPYING.RUNTIME respectively. If not, see +. 
*/ + +/* The purpose of this file is to define the characteristics of the i386, + independent of assembler syntax or operating system. + + Three other files build on this one to describe a specific assembler syntax: + bsd386.h, att386.h, and sun386.h. + + The actual tm.h file for a particular system should include + this file, and then the file for the appropriate assembler syntax. + + Many macros that specify assembler syntax are omitted entirely from + this file because they really belong in the files for particular + assemblers. These include RP, IP, LPREFIX, PUT_OP_SIZE, USE_STAR, + ADDR_BEG, ADDR_END, PRINT_IREG, PRINT_SCALE, PRINT_B_I_S, and many + that start with ASM_ or end in ASM_OP. */ + +/* Redefines for option macros. */ + +#define TARGET_64BIT OPTION_ISA_64BIT +#define TARGET_MMX OPTION_ISA_MMX +#define TARGET_3DNOW OPTION_ISA_3DNOW +#define TARGET_3DNOW_A OPTION_ISA_3DNOW_A +#define TARGET_SSE OPTION_ISA_SSE +#define TARGET_SSE2 OPTION_ISA_SSE2 +#define TARGET_SSE3 OPTION_ISA_SSE3 +#define TARGET_SSSE3 OPTION_ISA_SSSE3 +#define TARGET_SSE4_1 OPTION_ISA_SSE4_1 +#define TARGET_SSE4_2 OPTION_ISA_SSE4_2 +#define TARGET_AVX OPTION_ISA_AVX +#define TARGET_FMA OPTION_ISA_FMA +#define TARGET_SSE4A OPTION_ISA_SSE4A +#define TARGET_FMA4 OPTION_ISA_FMA4 +#define TARGET_XOP OPTION_ISA_XOP +#define TARGET_LWP OPTION_ISA_LWP +#define TARGET_ROUND OPTION_ISA_ROUND +#define TARGET_ABM OPTION_ISA_ABM +#define TARGET_BMI OPTION_ISA_BMI +#define TARGET_TBM OPTION_ISA_TBM +#define TARGET_POPCNT OPTION_ISA_POPCNT +#define TARGET_SAHF OPTION_ISA_SAHF +#define TARGET_MOVBE OPTION_ISA_MOVBE +#define TARGET_CRC32 OPTION_ISA_CRC32 +#define TARGET_AES OPTION_ISA_AES +#define TARGET_PCLMUL OPTION_ISA_PCLMUL +#define TARGET_CMPXCHG16B OPTION_ISA_CX16 +#define TARGET_FSGSBASE OPTION_ISA_FSGSBASE +#define TARGET_RDRND OPTION_ISA_RDRND +#define TARGET_F16C OPTION_ISA_F16C + + +/* SSE4.1 defines round instructions */ +#define OPTION_MASK_ISA_ROUND OPTION_MASK_ISA_SSE4_1 +#define OPTION_ISA_ROUND ((ix86_isa_flags & OPTION_MASK_ISA_ROUND) != 0) + +#include "config/vxworks-dummy.h" + +/* Algorithm to expand string function with. */ +enum stringop_alg +{ + no_stringop, + libcall, + rep_prefix_1_byte, + rep_prefix_4_byte, + rep_prefix_8_byte, + loop_1_byte, + loop, + unrolled_loop +}; + +#define MAX_STRINGOP_ALGS 4 + +/* Specify what algorithm to use for stringops on known size. + When size is unknown, the UNKNOWN_SIZE alg is used. When size is + known at compile time or estimated via feedback, the SIZE array + is walked in order until MAX is greater then the estimate (or -1 + means infinity). Corresponding ALG is used then. + For example initializer: + {{256, loop}, {-1, rep_prefix_4_byte}} + will use loop for blocks smaller or equal to 256 bytes, rep prefix will + be used otherwise. 
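/* Illustrative sketch, not part of the imported header: a strategy entry in
   the spirit of the example initializer quoted in the comment above, written
   against the struct stringop_algs layout defined just below.  The variable
   name is hypothetical.  */

static const struct stringop_algs example_memcpy_strategy = {
  libcall,                    /* fall back to a libcall when the size is unknown */
  {{256, loop},               /* blocks of up to 256 bytes use a simple loop */
   {-1, rep_prefix_4_byte}}   /* anything larger uses the rep prefix */
};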
*/ +struct stringop_algs +{ + const enum stringop_alg unknown_size; + const struct stringop_strategy { + const int max; + const enum stringop_alg alg; + } size [MAX_STRINGOP_ALGS]; +}; + +/* Define the specific costs for a given cpu */ + +struct processor_costs { + const int add; /* cost of an add instruction */ + const int lea; /* cost of a lea instruction */ + const int shift_var; /* variable shift costs */ + const int shift_const; /* constant shift costs */ + const int mult_init[5]; /* cost of starting a multiply + in QImode, HImode, SImode, DImode, TImode*/ + const int mult_bit; /* cost of multiply per each bit set */ + const int divide[5]; /* cost of a divide/mod + in QImode, HImode, SImode, DImode, TImode*/ + int movsx; /* The cost of movsx operation. */ + int movzx; /* The cost of movzx operation. */ + const int large_insn; /* insns larger than this cost more */ + const int move_ratio; /* The threshold of number of scalar + memory-to-memory move insns. */ + const int movzbl_load; /* cost of loading using movzbl */ + const int int_load[3]; /* cost of loading integer registers + in QImode, HImode and SImode relative + to reg-reg move (2). */ + const int int_store[3]; /* cost of storing integer register + in QImode, HImode and SImode */ + const int fp_move; /* cost of reg,reg fld/fst */ + const int fp_load[3]; /* cost of loading FP register + in SFmode, DFmode and XFmode */ + const int fp_store[3]; /* cost of storing FP register + in SFmode, DFmode and XFmode */ + const int mmx_move; /* cost of moving MMX register. */ + const int mmx_load[2]; /* cost of loading MMX register + in SImode and DImode */ + const int mmx_store[2]; /* cost of storing MMX register + in SImode and DImode */ + const int sse_move; /* cost of moving SSE register. */ + const int sse_load[3]; /* cost of loading SSE register + in SImode, DImode and TImode*/ + const int sse_store[3]; /* cost of storing SSE register + in SImode, DImode and TImode*/ + const int mmxsse_to_integer; /* cost of moving mmxsse register to + integer and vice versa. */ + const int l1_cache_size; /* size of l1 cache, in kilobytes. */ + const int l2_cache_size; /* size of l2 cache, in kilobytes. */ + const int prefetch_block; /* bytes moved to cache for prefetch. */ + const int simultaneous_prefetches; /* number of parallel prefetch + operations. */ + const int branch_cost; /* Default value for BRANCH_COST. */ + const int fadd; /* cost of FADD and FSUB instructions. */ + const int fmul; /* cost of FMUL instruction. */ + const int fdiv; /* cost of FDIV instruction. */ + const int fabs; /* cost of FABS instruction. */ + const int fchs; /* cost of FCHS instruction. */ + const int fsqrt; /* cost of FSQRT instruction. */ + /* Specify what algorithm + to use for stringops on unknown size. */ + struct stringop_algs memcpy[2], memset[2]; + const int scalar_stmt_cost; /* Cost of any scalar operation, excluding + load and store. */ + const int scalar_load_cost; /* Cost of scalar load. */ + const int scalar_store_cost; /* Cost of scalar store. */ + const int vec_stmt_cost; /* Cost of any vector operation, excluding + load, store, vector-to-scalar and + scalar-to-vector operation. */ + const int vec_to_scalar_cost; /* Cost of vect-to-scalar operation. */ + const int scalar_to_vec_cost; /* Cost of scalar-to-vector operation. */ + const int vec_align_load_cost; /* Cost of aligned vector load. */ + const int vec_unalign_load_cost; /* Cost of unaligned vector load. */ + const int vec_store_cost; /* Cost of vector store. 
*/ + const int cond_taken_branch_cost; /* Cost of taken branch for vectorizer + cost model. */ + const int cond_not_taken_branch_cost;/* Cost of not taken branch for + vectorizer cost model. */ +}; + +extern const struct processor_costs *ix86_cost; +extern const struct processor_costs ix86_size_cost; + +#define ix86_cur_cost() \ + (optimize_insn_for_size_p () ? &ix86_size_cost: ix86_cost) + +/* Macros used in the machine description to test the flags. */ + +/* configure can arrange to make this 2, to force a 486. */ + +#ifndef TARGET_CPU_DEFAULT +#define TARGET_CPU_DEFAULT TARGET_CPU_DEFAULT_generic +#endif + +#ifndef TARGET_FPMATH_DEFAULT +#define TARGET_FPMATH_DEFAULT \ + (TARGET_64BIT && TARGET_SSE ? FPMATH_SSE : FPMATH_387) +#endif + +#define TARGET_FLOAT_RETURNS_IN_80387 TARGET_FLOAT_RETURNS + +/* 64bit Sledgehammer mode. For libgcc2 we make sure this is a + compile-time constant. */ +#ifdef IN_LIBGCC2 +#undef TARGET_64BIT +#ifdef __x86_64__ +#define TARGET_64BIT 1 +#else +#define TARGET_64BIT 0 +#endif +#else +#ifndef TARGET_BI_ARCH +#undef TARGET_64BIT +#if TARGET_64BIT_DEFAULT +#define TARGET_64BIT 1 +#else +#define TARGET_64BIT 0 +#endif +#endif +#endif + +#define HAS_LONG_COND_BRANCH 1 +#define HAS_LONG_UNCOND_BRANCH 1 + +#define TARGET_386 (ix86_tune == PROCESSOR_I386) +#define TARGET_486 (ix86_tune == PROCESSOR_I486) +#define TARGET_PENTIUM (ix86_tune == PROCESSOR_PENTIUM) +#define TARGET_PENTIUMPRO (ix86_tune == PROCESSOR_PENTIUMPRO) +#define TARGET_GEODE (ix86_tune == PROCESSOR_GEODE) +#define TARGET_K6 (ix86_tune == PROCESSOR_K6) +#define TARGET_ATHLON (ix86_tune == PROCESSOR_ATHLON) +#define TARGET_PENTIUM4 (ix86_tune == PROCESSOR_PENTIUM4) +#define TARGET_K8 (ix86_tune == PROCESSOR_K8) +#define TARGET_ATHLON_K8 (TARGET_K8 || TARGET_ATHLON) +#define TARGET_NOCONA (ix86_tune == PROCESSOR_NOCONA) +#define TARGET_CORE2_32 (ix86_tune == PROCESSOR_CORE2_32) +#define TARGET_CORE2_64 (ix86_tune == PROCESSOR_CORE2_64) +#define TARGET_CORE2 (TARGET_CORE2_32 || TARGET_CORE2_64) +#define TARGET_COREI7_32 (ix86_tune == PROCESSOR_COREI7_32) +#define TARGET_COREI7_64 (ix86_tune == PROCESSOR_COREI7_64) +#define TARGET_COREI7 (TARGET_COREI7_32 || TARGET_COREI7_64) +#define TARGET_GENERIC32 (ix86_tune == PROCESSOR_GENERIC32) +#define TARGET_GENERIC64 (ix86_tune == PROCESSOR_GENERIC64) +#define TARGET_GENERIC (TARGET_GENERIC32 || TARGET_GENERIC64) +#define TARGET_AMDFAM10 (ix86_tune == PROCESSOR_AMDFAM10) +#define TARGET_BDVER1 (ix86_tune == PROCESSOR_BDVER1) +#define TARGET_BTVER1 (ix86_tune == PROCESSOR_BTVER1) +#define TARGET_ATOM (ix86_tune == PROCESSOR_ATOM) + +/* Feature tests against the various tunings. 
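/* Illustrative sketch, not part of the imported header: how the
   ix86_cur_cost selector defined above is typically consumed.  The helper
   name is hypothetical.  */

static int
example_add_cost (void)
{
  /* Resolves to ix86_size_cost.add when the current insn is being optimized
     for size, and to the active -mtune cost table otherwise.  */
  return ix86_cur_cost ()->add;
}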
*/ +enum ix86_tune_indices { + X86_TUNE_USE_LEAVE, + X86_TUNE_PUSH_MEMORY, + X86_TUNE_ZERO_EXTEND_WITH_AND, + X86_TUNE_UNROLL_STRLEN, + X86_TUNE_DEEP_BRANCH_PREDICTION, + X86_TUNE_BRANCH_PREDICTION_HINTS, + X86_TUNE_DOUBLE_WITH_ADD, + X86_TUNE_USE_SAHF, + X86_TUNE_MOVX, + X86_TUNE_PARTIAL_REG_STALL, + X86_TUNE_PARTIAL_FLAG_REG_STALL, + X86_TUNE_USE_HIMODE_FIOP, + X86_TUNE_USE_SIMODE_FIOP, + X86_TUNE_USE_MOV0, + X86_TUNE_USE_CLTD, + X86_TUNE_USE_XCHGB, + X86_TUNE_SPLIT_LONG_MOVES, + X86_TUNE_READ_MODIFY_WRITE, + X86_TUNE_READ_MODIFY, + X86_TUNE_PROMOTE_QIMODE, + X86_TUNE_FAST_PREFIX, + X86_TUNE_SINGLE_STRINGOP, + X86_TUNE_QIMODE_MATH, + X86_TUNE_HIMODE_MATH, + X86_TUNE_PROMOTE_QI_REGS, + X86_TUNE_PROMOTE_HI_REGS, + X86_TUNE_SINGLE_POP, + X86_TUNE_DOUBLE_POP, + X86_TUNE_SINGLE_PUSH, + X86_TUNE_DOUBLE_PUSH, + X86_TUNE_INTEGER_DFMODE_MOVES, + X86_TUNE_PARTIAL_REG_DEPENDENCY, + X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY, + X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL, + X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL, + X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL, + X86_TUNE_SSE_SPLIT_REGS, + X86_TUNE_SSE_TYPELESS_STORES, + X86_TUNE_SSE_LOAD0_BY_PXOR, + X86_TUNE_MEMORY_MISMATCH_STALL, + X86_TUNE_PROLOGUE_USING_MOVE, + X86_TUNE_EPILOGUE_USING_MOVE, + X86_TUNE_SHIFT1, + X86_TUNE_USE_FFREEP, + X86_TUNE_INTER_UNIT_MOVES, + X86_TUNE_INTER_UNIT_CONVERSIONS, + X86_TUNE_FOUR_JUMP_LIMIT, + X86_TUNE_SCHEDULE, + X86_TUNE_USE_BT, + X86_TUNE_USE_INCDEC, + X86_TUNE_PAD_RETURNS, + X86_TUNE_PAD_SHORT_FUNCTION, + X86_TUNE_EXT_80387_CONSTANTS, + X86_TUNE_SHORTEN_X87_SSE, + X86_TUNE_AVOID_VECTOR_DECODE, + X86_TUNE_PROMOTE_HIMODE_IMUL, + X86_TUNE_SLOW_IMUL_IMM32_MEM, + X86_TUNE_SLOW_IMUL_IMM8, + X86_TUNE_MOVE_M1_VIA_OR, + X86_TUNE_NOT_UNPAIRABLE, + X86_TUNE_NOT_VECTORMODE, + X86_TUNE_USE_VECTOR_FP_CONVERTS, + X86_TUNE_USE_VECTOR_CONVERTS, + X86_TUNE_FUSE_CMP_AND_BRANCH, + X86_TUNE_OPT_AGU, + X86_TUNE_VECTORIZE_DOUBLE, + X86_TUNE_AVX128_OPTIMAL, + + X86_TUNE_LAST +}; + +extern unsigned char ix86_tune_features[X86_TUNE_LAST]; + +#define TARGET_USE_LEAVE ix86_tune_features[X86_TUNE_USE_LEAVE] +#define TARGET_PUSH_MEMORY ix86_tune_features[X86_TUNE_PUSH_MEMORY] +#define TARGET_ZERO_EXTEND_WITH_AND \ + ix86_tune_features[X86_TUNE_ZERO_EXTEND_WITH_AND] +#define TARGET_UNROLL_STRLEN ix86_tune_features[X86_TUNE_UNROLL_STRLEN] +#define TARGET_DEEP_BRANCH_PREDICTION \ + ix86_tune_features[X86_TUNE_DEEP_BRANCH_PREDICTION] +#define TARGET_BRANCH_PREDICTION_HINTS \ + ix86_tune_features[X86_TUNE_BRANCH_PREDICTION_HINTS] +#define TARGET_DOUBLE_WITH_ADD ix86_tune_features[X86_TUNE_DOUBLE_WITH_ADD] +#define TARGET_USE_SAHF ix86_tune_features[X86_TUNE_USE_SAHF] +#define TARGET_MOVX ix86_tune_features[X86_TUNE_MOVX] +#define TARGET_PARTIAL_REG_STALL ix86_tune_features[X86_TUNE_PARTIAL_REG_STALL] +#define TARGET_PARTIAL_FLAG_REG_STALL \ + ix86_tune_features[X86_TUNE_PARTIAL_FLAG_REG_STALL] +#define TARGET_USE_HIMODE_FIOP ix86_tune_features[X86_TUNE_USE_HIMODE_FIOP] +#define TARGET_USE_SIMODE_FIOP ix86_tune_features[X86_TUNE_USE_SIMODE_FIOP] +#define TARGET_USE_MOV0 ix86_tune_features[X86_TUNE_USE_MOV0] +#define TARGET_USE_CLTD ix86_tune_features[X86_TUNE_USE_CLTD] +#define TARGET_USE_XCHGB ix86_tune_features[X86_TUNE_USE_XCHGB] +#define TARGET_SPLIT_LONG_MOVES ix86_tune_features[X86_TUNE_SPLIT_LONG_MOVES] +#define TARGET_READ_MODIFY_WRITE ix86_tune_features[X86_TUNE_READ_MODIFY_WRITE] +#define TARGET_READ_MODIFY ix86_tune_features[X86_TUNE_READ_MODIFY] +#define TARGET_PROMOTE_QImode ix86_tune_features[X86_TUNE_PROMOTE_QIMODE] +#define TARGET_FAST_PREFIX 
ix86_tune_features[X86_TUNE_FAST_PREFIX] +#define TARGET_SINGLE_STRINGOP ix86_tune_features[X86_TUNE_SINGLE_STRINGOP] +#define TARGET_QIMODE_MATH ix86_tune_features[X86_TUNE_QIMODE_MATH] +#define TARGET_HIMODE_MATH ix86_tune_features[X86_TUNE_HIMODE_MATH] +#define TARGET_PROMOTE_QI_REGS ix86_tune_features[X86_TUNE_PROMOTE_QI_REGS] +#define TARGET_PROMOTE_HI_REGS ix86_tune_features[X86_TUNE_PROMOTE_HI_REGS] +#define TARGET_SINGLE_POP ix86_tune_features[X86_TUNE_SINGLE_POP] +#define TARGET_DOUBLE_POP ix86_tune_features[X86_TUNE_DOUBLE_POP] +#define TARGET_SINGLE_PUSH ix86_tune_features[X86_TUNE_SINGLE_PUSH] +#define TARGET_DOUBLE_PUSH ix86_tune_features[X86_TUNE_DOUBLE_PUSH] +#define TARGET_INTEGER_DFMODE_MOVES \ + ix86_tune_features[X86_TUNE_INTEGER_DFMODE_MOVES] +#define TARGET_PARTIAL_REG_DEPENDENCY \ + ix86_tune_features[X86_TUNE_PARTIAL_REG_DEPENDENCY] +#define TARGET_SSE_PARTIAL_REG_DEPENDENCY \ + ix86_tune_features[X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY] +#define TARGET_SSE_UNALIGNED_LOAD_OPTIMAL \ + ix86_tune_features[X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL] +#define TARGET_SSE_UNALIGNED_STORE_OPTIMAL \ + ix86_tune_features[X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL] +#define TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL \ + ix86_tune_features[X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL] +#define TARGET_SSE_SPLIT_REGS ix86_tune_features[X86_TUNE_SSE_SPLIT_REGS] +#define TARGET_SSE_TYPELESS_STORES \ + ix86_tune_features[X86_TUNE_SSE_TYPELESS_STORES] +#define TARGET_SSE_LOAD0_BY_PXOR ix86_tune_features[X86_TUNE_SSE_LOAD0_BY_PXOR] +#define TARGET_MEMORY_MISMATCH_STALL \ + ix86_tune_features[X86_TUNE_MEMORY_MISMATCH_STALL] +#define TARGET_PROLOGUE_USING_MOVE \ + ix86_tune_features[X86_TUNE_PROLOGUE_USING_MOVE] +#define TARGET_EPILOGUE_USING_MOVE \ + ix86_tune_features[X86_TUNE_EPILOGUE_USING_MOVE] +#define TARGET_SHIFT1 ix86_tune_features[X86_TUNE_SHIFT1] +#define TARGET_USE_FFREEP ix86_tune_features[X86_TUNE_USE_FFREEP] +#define TARGET_INTER_UNIT_MOVES ix86_tune_features[X86_TUNE_INTER_UNIT_MOVES] +#define TARGET_INTER_UNIT_CONVERSIONS\ + ix86_tune_features[X86_TUNE_INTER_UNIT_CONVERSIONS] +#define TARGET_FOUR_JUMP_LIMIT ix86_tune_features[X86_TUNE_FOUR_JUMP_LIMIT] +#define TARGET_SCHEDULE ix86_tune_features[X86_TUNE_SCHEDULE] +#define TARGET_USE_BT ix86_tune_features[X86_TUNE_USE_BT] +#define TARGET_USE_INCDEC ix86_tune_features[X86_TUNE_USE_INCDEC] +#define TARGET_PAD_RETURNS ix86_tune_features[X86_TUNE_PAD_RETURNS] +#define TARGET_PAD_SHORT_FUNCTION \ + ix86_tune_features[X86_TUNE_PAD_SHORT_FUNCTION] +#define TARGET_EXT_80387_CONSTANTS \ + ix86_tune_features[X86_TUNE_EXT_80387_CONSTANTS] +#define TARGET_SHORTEN_X87_SSE ix86_tune_features[X86_TUNE_SHORTEN_X87_SSE] +#define TARGET_AVOID_VECTOR_DECODE \ + ix86_tune_features[X86_TUNE_AVOID_VECTOR_DECODE] +#define TARGET_TUNE_PROMOTE_HIMODE_IMUL \ + ix86_tune_features[X86_TUNE_PROMOTE_HIMODE_IMUL] +#define TARGET_SLOW_IMUL_IMM32_MEM \ + ix86_tune_features[X86_TUNE_SLOW_IMUL_IMM32_MEM] +#define TARGET_SLOW_IMUL_IMM8 ix86_tune_features[X86_TUNE_SLOW_IMUL_IMM8] +#define TARGET_MOVE_M1_VIA_OR ix86_tune_features[X86_TUNE_MOVE_M1_VIA_OR] +#define TARGET_NOT_UNPAIRABLE ix86_tune_features[X86_TUNE_NOT_UNPAIRABLE] +#define TARGET_NOT_VECTORMODE ix86_tune_features[X86_TUNE_NOT_VECTORMODE] +#define TARGET_USE_VECTOR_FP_CONVERTS \ + ix86_tune_features[X86_TUNE_USE_VECTOR_FP_CONVERTS] +#define TARGET_USE_VECTOR_CONVERTS \ + ix86_tune_features[X86_TUNE_USE_VECTOR_CONVERTS] +#define TARGET_FUSE_CMP_AND_BRANCH \ + ix86_tune_features[X86_TUNE_FUSE_CMP_AND_BRANCH] +#define 
TARGET_OPT_AGU ix86_tune_features[X86_TUNE_OPT_AGU] +#define TARGET_VECTORIZE_DOUBLE \ + ix86_tune_features[X86_TUNE_VECTORIZE_DOUBLE] +#define TARGET_AVX128_OPTIMAL \ + ix86_tune_features[X86_TUNE_AVX128_OPTIMAL] + +/* Feature tests against the various architecture variations. */ +enum ix86_arch_indices { + X86_ARCH_CMOV, + X86_ARCH_CMPXCHG, + X86_ARCH_CMPXCHG8B, + X86_ARCH_XADD, + X86_ARCH_BSWAP, + + X86_ARCH_LAST +}; + +extern unsigned char ix86_arch_features[X86_ARCH_LAST]; + +#define TARGET_CMOV ix86_arch_features[X86_ARCH_CMOV] +#define TARGET_CMPXCHG ix86_arch_features[X86_ARCH_CMPXCHG] +#define TARGET_CMPXCHG8B ix86_arch_features[X86_ARCH_CMPXCHG8B] +#define TARGET_XADD ix86_arch_features[X86_ARCH_XADD] +#define TARGET_BSWAP ix86_arch_features[X86_ARCH_BSWAP] + +/* For sane SSE instruction set generation we need fcomi instruction. + It is safe to enable all CMOVE instructions. Also, RDRAND intrinsic + expands to a sequence that includes conditional move. */ +#define TARGET_CMOVE (TARGET_CMOV || TARGET_SSE || TARGET_RDRND) + +#define TARGET_FISTTP (TARGET_SSE3 && TARGET_80387) + +extern int x86_prefetch_sse; + +#define TARGET_PREFETCH_SSE x86_prefetch_sse + +#define ASSEMBLER_DIALECT (ix86_asm_dialect) + +#define TARGET_SSE_MATH ((ix86_fpmath & FPMATH_SSE) != 0) +#define TARGET_MIX_SSE_I387 \ + ((ix86_fpmath & (FPMATH_SSE | FPMATH_387)) == (FPMATH_SSE | FPMATH_387)) + +#define TARGET_GNU_TLS (ix86_tls_dialect == TLS_DIALECT_GNU) +#define TARGET_GNU2_TLS (ix86_tls_dialect == TLS_DIALECT_GNU2) +#define TARGET_ANY_GNU_TLS (TARGET_GNU_TLS || TARGET_GNU2_TLS) +#define TARGET_SUN_TLS 0 + +#ifndef TARGET_64BIT_DEFAULT +#define TARGET_64BIT_DEFAULT 0 +#endif +#ifndef TARGET_TLS_DIRECT_SEG_REFS_DEFAULT +#define TARGET_TLS_DIRECT_SEG_REFS_DEFAULT 0 +#endif + +/* Fence to use after loop using storent. */ + +extern tree x86_mfence; +#define FENCE_FOLLOWING_MOVNT x86_mfence + +/* Once GDB has been enhanced to deal with functions without frame + pointers, we can change this to allow for elimination of + the frame pointer in leaf functions. */ +#define TARGET_DEFAULT 0 + +/* Extra bits to force. */ +#define TARGET_SUBTARGET_DEFAULT 0 +#define TARGET_SUBTARGET_ISA_DEFAULT 0 + +/* Extra bits to force on w/ 32-bit mode. */ +#define TARGET_SUBTARGET32_DEFAULT 0 +#define TARGET_SUBTARGET32_ISA_DEFAULT 0 + +/* Extra bits to force on w/ 64-bit mode. */ +#define TARGET_SUBTARGET64_DEFAULT 0 +#define TARGET_SUBTARGET64_ISA_DEFAULT 0 + +/* Replace MACH-O, ifdefs by in-line tests, where possible. + (a) Macros defined in config/i386/darwin.h */ +#define TARGET_MACHO 0 +#define TARGET_MACHO_BRANCH_ISLANDS 0 +#define MACHOPIC_ATT_STUB 0 +/* (b) Macros defined in config/darwin.h */ +#define MACHO_DYNAMIC_NO_PIC_P 0 +#define MACHOPIC_INDIRECT 0 +#define MACHOPIC_PURE 0 + +/* For the Windows 64-bit ABI. */ +#define TARGET_64BIT_MS_ABI (TARGET_64BIT && ix86_cfun_abi () == MS_ABI) + +/* This is re-defined by cygming.h. */ +#define TARGET_SEH 0 + +/* Available call abi. */ +enum calling_abi +{ + SYSV_ABI = 0, + MS_ABI = 1 +}; + +/* The abi used by target. */ +extern enum calling_abi ix86_abi; + +/* The default abi used by target. */ +#define DEFAULT_ABI SYSV_ABI + +/* Subtargets may reset this to 1 in order to enable 96-bit long double + with the rounding mode forced to 53 bits. */ +#define TARGET_96_ROUND_53_LONG_DOUBLE 0 + +/* -march=native handling only makes sense with compiler running on + an x86 or x86_64 chip. If changing this condition, also change + the condition in driver-i386.c. 
*/ +#if defined(__i386__) || defined(__x86_64__) +/* In driver-i386.c. */ +extern const char *host_detect_local_cpu (int argc, const char **argv); +#define EXTRA_SPEC_FUNCTIONS \ + { "local_cpu_detect", host_detect_local_cpu }, +#define HAVE_LOCAL_CPU_DETECT +#endif + +#if TARGET_64BIT_DEFAULT +#define OPT_ARCH64 "!m32" +#define OPT_ARCH32 "m32" +#else +#define OPT_ARCH64 "m64" +#define OPT_ARCH32 "!m64" +#endif + +/* Support for configure-time defaults of some command line options. + The order here is important so that -march doesn't squash the + tune or cpu values. */ +#define OPTION_DEFAULT_SPECS \ + {"tune", "%{!mtune=*:%{!mcpu=*:%{!march=*:-mtune=%(VALUE)}}}" }, \ + {"tune_32", "%{" OPT_ARCH32 ":%{!mtune=*:%{!mcpu=*:%{!march=*:-mtune=%(VALUE)}}}}" }, \ + {"tune_64", "%{" OPT_ARCH64 ":%{!mtune=*:%{!mcpu=*:%{!march=*:-mtune=%(VALUE)}}}}" }, \ + {"cpu", "%{!mtune=*:%{!mcpu=*:%{!march=*:-mtune=%(VALUE)}}}" }, \ + {"cpu_32", "%{" OPT_ARCH32 ":%{!mtune=*:%{!mcpu=*:%{!march=*:-mtune=%(VALUE)}}}}" }, \ + {"cpu_64", "%{" OPT_ARCH64 ":%{!mtune=*:%{!mcpu=*:%{!march=*:-mtune=%(VALUE)}}}}" }, \ + {"arch", "%{!march=*:-march=%(VALUE)}"}, \ + {"arch_32", "%{" OPT_ARCH32 ":%{!march=*:-march=%(VALUE)}}"}, \ + {"arch_64", "%{" OPT_ARCH64 ":%{!march=*:-march=%(VALUE)}}"}, + +/* Specs for the compiler proper */ + +#ifndef CC1_CPU_SPEC +#define CC1_CPU_SPEC_1 "" + +#ifndef HAVE_LOCAL_CPU_DETECT +#define CC1_CPU_SPEC CC1_CPU_SPEC_1 +#else +#define CC1_CPU_SPEC CC1_CPU_SPEC_1 \ +"%{march=native:%>march=native %:local_cpu_detect(arch) \ + %{!mtune=*:%>mtune=native %:local_cpu_detect(tune)}} \ +%{mtune=native:%>mtune=native %:local_cpu_detect(tune)}" +#endif +#endif + +/* Target CPU builtins. */ +#define TARGET_CPU_CPP_BUILTINS() ix86_target_macros () + +/* Target Pragmas. */ +#define REGISTER_TARGET_PRAGMAS() ix86_register_pragmas () + +enum target_cpu_default +{ + TARGET_CPU_DEFAULT_generic = 0, + + TARGET_CPU_DEFAULT_i386, + TARGET_CPU_DEFAULT_i486, + TARGET_CPU_DEFAULT_pentium, + TARGET_CPU_DEFAULT_pentium_mmx, + TARGET_CPU_DEFAULT_pentiumpro, + TARGET_CPU_DEFAULT_pentium2, + TARGET_CPU_DEFAULT_pentium3, + TARGET_CPU_DEFAULT_pentium4, + TARGET_CPU_DEFAULT_pentium_m, + TARGET_CPU_DEFAULT_prescott, + TARGET_CPU_DEFAULT_nocona, + TARGET_CPU_DEFAULT_core2, + TARGET_CPU_DEFAULT_corei7, + TARGET_CPU_DEFAULT_atom, + + TARGET_CPU_DEFAULT_geode, + TARGET_CPU_DEFAULT_k6, + TARGET_CPU_DEFAULT_k6_2, + TARGET_CPU_DEFAULT_k6_3, + TARGET_CPU_DEFAULT_athlon, + TARGET_CPU_DEFAULT_athlon_sse, + TARGET_CPU_DEFAULT_k8, + TARGET_CPU_DEFAULT_amdfam10, + TARGET_CPU_DEFAULT_bdver1, + TARGET_CPU_DEFAULT_btver1, + + TARGET_CPU_DEFAULT_max +}; + +#ifndef CC1_SPEC +#define CC1_SPEC "%(cc1_cpu) " +#endif + +/* This macro defines names of additional specifications to put in the + specs that can be used in various specifications like CC1_SPEC. Its + definition is an initializer with a subgrouping for each command option. + + Each subgrouping contains a string constant, that defines the + specification name, and a string constant that used by the GCC driver + program. + + Do not define this macro if it does not need to do anything. */ + +#ifndef SUBTARGET_EXTRA_SPECS +#define SUBTARGET_EXTRA_SPECS +#endif + +#define EXTRA_SPECS \ + { "cc1_cpu", CC1_CPU_SPEC }, \ + SUBTARGET_EXTRA_SPECS + + +/* Set the value of FLT_EVAL_METHOD in float.h. 
When using only the + FPU, assume that the fpcw is set to extended precision; when using + only SSE, rounding is correct; when using both SSE and the FPU, + the rounding precision is indeterminate, since either may be chosen + apparently at random. */ +#define TARGET_FLT_EVAL_METHOD \ + (TARGET_MIX_SSE_I387 ? -1 : TARGET_SSE_MATH ? 0 : 2) + +/* Whether to allow x87 floating-point arithmetic on MODE (one of + SFmode, DFmode and XFmode) in the current excess precision + configuration. */ +#define X87_ENABLE_ARITH(MODE) \ + (flag_excess_precision == EXCESS_PRECISION_FAST || (MODE) == XFmode) + +/* Likewise, whether to allow direct conversions from integer mode + IMODE (HImode, SImode or DImode) to MODE. */ +#define X87_ENABLE_FLOAT(MODE, IMODE) \ + (flag_excess_precision == EXCESS_PRECISION_FAST \ + || (MODE) == XFmode \ + || ((MODE) == DFmode && (IMODE) == SImode) \ + || (IMODE) == HImode) + +/* target machine storage layout */ + +#define SHORT_TYPE_SIZE 16 +#define INT_TYPE_SIZE 32 +#define LONG_LONG_TYPE_SIZE 64 +#define FLOAT_TYPE_SIZE 32 +#define DOUBLE_TYPE_SIZE 64 +#define LONG_DOUBLE_TYPE_SIZE 80 + +#define WIDEST_HARDWARE_FP_SIZE LONG_DOUBLE_TYPE_SIZE + +#if defined (TARGET_BI_ARCH) || TARGET_64BIT_DEFAULT +#define MAX_BITS_PER_WORD 64 +#else +#define MAX_BITS_PER_WORD 32 +#endif + +/* Define this if most significant byte of a word is the lowest numbered. */ +/* That is true on the 80386. */ + +#define BITS_BIG_ENDIAN 0 + +/* Define this if most significant byte of a word is the lowest numbered. */ +/* That is not true on the 80386. */ +#define BYTES_BIG_ENDIAN 0 + +/* Define this if most significant word of a multiword number is the lowest + numbered. */ +/* Not true for 80386 */ +#define WORDS_BIG_ENDIAN 0 + +/* Width of a word, in units (bytes). */ +#define UNITS_PER_WORD (TARGET_64BIT ? 8 : 4) + +#ifndef IN_LIBGCC2 +#define MIN_UNITS_PER_WORD 4 +#endif + +/* Allocation boundary (in *bits*) for storing arguments in argument list. */ +#define PARM_BOUNDARY BITS_PER_WORD + +/* Boundary (in *bits*) on which stack pointer should be aligned. */ +#define STACK_BOUNDARY \ + (TARGET_64BIT && ix86_abi == MS_ABI ? 128 : BITS_PER_WORD) + +/* Stack boundary of the main function guaranteed by OS. */ +#define MAIN_STACK_BOUNDARY (TARGET_64BIT ? 128 : 32) + +/* Minimum stack boundary. */ +#define MIN_STACK_BOUNDARY (TARGET_64BIT ? 128 : 32) + +/* Boundary (in *bits*) on which the stack pointer prefers to be + aligned; the compiler cannot rely on having this alignment. */ +#define PREFERRED_STACK_BOUNDARY ix86_preferred_stack_boundary + +/* It should be MIN_STACK_BOUNDARY. But we set it to 128 bits for + both 32bit and 64bit, to support codes that need 128 bit stack + alignment for SSE instructions, but can't realign the stack. */ +#define PREFERRED_STACK_BOUNDARY_DEFAULT 128 + +/* 1 if -mstackrealign should be turned on by default. It will + generate an alternate prologue and epilogue that realigns the + runtime stack if nessary. This supports mixing codes that keep a + 4-byte aligned stack, as specified by i386 psABI, with codes that + need a 16-byte aligned stack, as required by SSE instructions. */ +#define STACK_REALIGN_DEFAULT 0 + +/* Boundary (in *bits*) on which the incoming stack is aligned. */ +#define INCOMING_STACK_BOUNDARY ix86_incoming_stack_boundary + +/* Target OS keeps a vector-aligned (128-bit, 16-byte) stack. This is + mandatory for the 64-bit ABI, and may or may not be true for other + operating systems. 
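/* The *_BOUNDARY macros above are all given in bits.  A small sketch of
   how the 128-bit preferred stack boundary turns into the byte mask a
   prologue would use to realign the stack pointer downwards; the values
   are hard-coded here purely for illustration.  */
#include <stdint.h>
#include <stdio.h>

int
main (void)
{
  unsigned preferred_stack_boundary = 128;          /* bits */
  uintptr_t align = preferred_stack_boundary / 8;   /* 16 bytes */
  uintptr_t sp = 0x7fffd96c;                        /* arbitrary example */
  uintptr_t aligned_sp = sp & ~(align - 1);         /* 0x7fffd960 */
  printf ("%#lx -> %#lx (multiple of %lu bytes)\n",
          (unsigned long) sp, (unsigned long) aligned_sp, (unsigned long) align);
  return 0;
}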
*/ +#define TARGET_KEEPS_VECTOR_ALIGNED_STACK TARGET_64BIT + +/* Minimum allocation boundary for the code of a function. */ +#define FUNCTION_BOUNDARY 8 + +/* C++ stores the virtual bit in the lowest bit of function pointers. */ +#define TARGET_PTRMEMFUNC_VBIT_LOCATION ptrmemfunc_vbit_in_pfn + +/* Minimum size in bits of the largest boundary to which any + and all fundamental data types supported by the hardware + might need to be aligned. No data type wants to be aligned + rounder than this. + + Pentium+ prefers DFmode values to be aligned to 64 bit boundary + and Pentium Pro XFmode values at 128 bit boundaries. */ + +#define BIGGEST_ALIGNMENT (TARGET_AVX ? 256 : 128) + +/* Maximum stack alignment. */ +#define MAX_STACK_ALIGNMENT MAX_OFILE_ALIGNMENT + +/* Alignment value for attribute ((aligned)). It is a constant since + it is the part of the ABI. We shouldn't change it with -mavx. */ +#define ATTRIBUTE_ALIGNED_VALUE 128 + +/* Decide whether a variable of mode MODE should be 128 bit aligned. */ +#define ALIGN_MODE_128(MODE) \ + ((MODE) == XFmode || SSE_REG_MODE_P (MODE)) + +/* The published ABIs say that doubles should be aligned on word + boundaries, so lower the alignment for structure fields unless + -malign-double is set. */ + +/* ??? Blah -- this macro is used directly by libobjc. Since it + supports no vector modes, cut out the complexity and fall back + on BIGGEST_FIELD_ALIGNMENT. */ +#ifdef IN_TARGET_LIBS +#ifdef __x86_64__ +#define BIGGEST_FIELD_ALIGNMENT 128 +#else +#define BIGGEST_FIELD_ALIGNMENT 32 +#endif +#else +#define ADJUST_FIELD_ALIGN(FIELD, COMPUTED) \ + x86_field_alignment (FIELD, COMPUTED) +#endif + +/* If defined, a C expression to compute the alignment given to a + constant that is being placed in memory. EXP is the constant + and ALIGN is the alignment that the object would ordinarily have. + The value of this macro is used instead of that alignment to align + the object. + + If this macro is not defined, then ALIGN is used. + + The typical use of this macro is to increase alignment for string + constants to be word aligned so that `strcpy' calls that copy + constants can be done inline. */ + +#define CONSTANT_ALIGNMENT(EXP, ALIGN) ix86_constant_alignment ((EXP), (ALIGN)) + +/* If defined, a C expression to compute the alignment for a static + variable. TYPE is the data type, and ALIGN is the alignment that + the object would ordinarily have. The value of this macro is used + instead of that alignment to align the object. + + If this macro is not defined, then ALIGN is used. + + One use of this macro is to increase alignment of medium-size + data to make it all fit in fewer cache lines. Another is to + cause character arrays to be word-aligned so that `strcpy' calls + that copy constants to character arrays can be done inline. */ + +#define DATA_ALIGNMENT(TYPE, ALIGN) ix86_data_alignment ((TYPE), (ALIGN)) + +/* If defined, a C expression to compute the alignment for a local + variable. TYPE is the data type, and ALIGN is the alignment that + the object would ordinarily have. The value of this macro is used + instead of that alignment to align the object. + + If this macro is not defined, then ALIGN is used. + + One use of this macro is to increase alignment of medium-size + data to make it all fit in fewer cache lines. */ + +#define LOCAL_ALIGNMENT(TYPE, ALIGN) \ + ix86_local_alignment ((TYPE), VOIDmode, (ALIGN)) + +/* If defined, a C expression to compute the alignment for stack slot. 
+ TYPE is the data type, MODE is the widest mode available, and ALIGN + is the alignment that the slot would ordinarily have. The value of + this macro is used instead of that alignment to align the slot. + + If this macro is not defined, then ALIGN is used when TYPE is NULL, + Otherwise, LOCAL_ALIGNMENT will be used. + + One use of this macro is to set alignment of stack slot to the + maximum alignment of all possible modes which the slot may have. */ + +#define STACK_SLOT_ALIGNMENT(TYPE, MODE, ALIGN) \ + ix86_local_alignment ((TYPE), (MODE), (ALIGN)) + +/* If defined, a C expression to compute the alignment for a local + variable DECL. + + If this macro is not defined, then + LOCAL_ALIGNMENT (TREE_TYPE (DECL), DECL_ALIGN (DECL)) will be used. + + One use of this macro is to increase alignment of medium-size + data to make it all fit in fewer cache lines. */ + +#define LOCAL_DECL_ALIGNMENT(DECL) \ + ix86_local_alignment ((DECL), VOIDmode, DECL_ALIGN (DECL)) + +/* If defined, a C expression to compute the minimum required alignment + for dynamic stack realignment purposes for EXP (a TYPE or DECL), + MODE, assuming normal alignment ALIGN. + + If this macro is not defined, then (ALIGN) will be used. */ + +#define MINIMUM_ALIGNMENT(EXP, MODE, ALIGN) \ + ix86_minimum_alignment (EXP, MODE, ALIGN) + + +/* Set this nonzero if move instructions will actually fail to work + when given unaligned data. */ +#define STRICT_ALIGNMENT 0 + +/* If bit field type is int, don't let it cross an int, + and give entire struct the alignment of an int. */ +/* Required on the 386 since it doesn't have bit-field insns. */ +#define PCC_BITFIELD_TYPE_MATTERS 1 + +/* Standard register usage. */ + +/* This processor has special stack-like registers. See reg-stack.c + for details. */ + +#define STACK_REGS + +#define IS_STACK_MODE(MODE) \ + (((MODE) == SFmode && !(TARGET_SSE && TARGET_SSE_MATH)) \ + || ((MODE) == DFmode && !(TARGET_SSE2 && TARGET_SSE_MATH)) \ + || (MODE) == XFmode) + +/* Cover class containing the stack registers. */ +#define STACK_REG_COVER_CLASS FLOAT_REGS + +/* Number of actual hardware registers. + The hardware registers are assigned numbers for the compiler + from 0 to just below FIRST_PSEUDO_REGISTER. + All registers that the compiler knows about must be given numbers, + even those that are not normally considered general registers. + + In the 80386 we give the 8 general purpose registers the numbers 0-7. + We number the floating point registers 8-15. + Note that registers 0-7 can be accessed as a short or int, + while only 0-3 may be used with byte `mov' instructions. + + Reg 16 does not correspond to any hardware register, but instead + appears in the RTL as an argument pointer prior to reload, and is + eliminated during reloading in favor of either the stack or frame + pointer. */ + +#define FIRST_PSEUDO_REGISTER 53 + +/* Number of hardware registers that go into the DWARF-2 unwind info. + If not defined, equals FIRST_PSEUDO_REGISTER. */ + +#define DWARF_FRAME_REGISTERS 17 + +/* 1 for registers that have pervasive standard uses + and are not available for the register allocator. + On the 80386, the stack pointer is such, as is the arg pointer. + + The value is zero if the register is not fixed on either 32 or + 64 bit targets, one if the register if fixed on both 32 and 64 + bit targets, two if it is only fixed on 32bit targets and three + if its only fixed on 64bit targets. + Proper values are computed in TARGET_CONDITIONAL_REGISTER_USAGE. 
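/* A small sketch of how the 0..3 encoding described above would be
   decoded for a given word size.  The helper is hypothetical; the real
   decision is made in TARGET_CONDITIONAL_REGISTER_USAGE.  */
static int
reg_fixed_p (int code, int target_64bit)
{
  switch (code)
    {
    case 0: return 0;                /* never fixed */
    case 1: return 1;                /* fixed on both 32- and 64-bit */
    case 2: return !target_64bit;    /* fixed only on 32-bit targets */
    case 3: return target_64bit;     /* fixed only on 64-bit targets */
    default: return 0;
    }
}
/* For example, the value 2 used for r8..r15 in the table below means
   those registers are unavailable when compiling 32-bit code.  */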
+ */ +#define FIXED_REGISTERS \ +/*ax,dx,cx,bx,si,di,bp,sp,st,st1,st2,st3,st4,st5,st6,st7*/ \ +{ 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, \ +/*arg,flags,fpsr,fpcr,frame*/ \ + 1, 1, 1, 1, 1, \ +/*xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7*/ \ + 0, 0, 0, 0, 0, 0, 0, 0, \ +/* mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7*/ \ + 0, 0, 0, 0, 0, 0, 0, 0, \ +/* r8, r9, r10, r11, r12, r13, r14, r15*/ \ + 2, 2, 2, 2, 2, 2, 2, 2, \ +/*xmm8,xmm9,xmm10,xmm11,xmm12,xmm13,xmm14,xmm15*/ \ + 2, 2, 2, 2, 2, 2, 2, 2 } + + +/* 1 for registers not available across function calls. + These must include the FIXED_REGISTERS and also any + registers that can be used without being saved. + The latter must include the registers where values are returned + and the register where structure-value addresses are passed. + Aside from that, you can include as many other registers as you like. + + The value is zero if the register is not call used on either 32 or + 64 bit targets, one if the register if call used on both 32 and 64 + bit targets, two if it is only call used on 32bit targets and three + if its only call used on 64bit targets. + Proper values are computed in TARGET_CONDITIONAL_REGISTER_USAGE. +*/ +#define CALL_USED_REGISTERS \ +/*ax,dx,cx,bx,si,di,bp,sp,st,st1,st2,st3,st4,st5,st6,st7*/ \ +{ 1, 1, 1, 0, 3, 3, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, \ +/*arg,flags,fpsr,fpcr,frame*/ \ + 1, 1, 1, 1, 1, \ +/*xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7*/ \ + 1, 1, 1, 1, 1, 1, 1, 1, \ +/* mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7*/ \ + 1, 1, 1, 1, 1, 1, 1, 1, \ +/* r8, r9, r10, r11, r12, r13, r14, r15*/ \ + 1, 1, 1, 1, 2, 2, 2, 2, \ +/*xmm8,xmm9,xmm10,xmm11,xmm12,xmm13,xmm14,xmm15*/ \ + 1, 1, 1, 1, 1, 1, 1, 1 } + +/* Order in which to allocate registers. Each register must be + listed once, even those in FIXED_REGISTERS. List frame pointer + late and fixed registers last. Note that, in general, we prefer + registers listed in CALL_USED_REGISTERS, keeping the others + available for storage of persistent values. + + The ADJUST_REG_ALLOC_ORDER actually overwrite the order, + so this is just empty initializer for array. */ + +#define REG_ALLOC_ORDER \ +{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,\ + 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, \ + 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, \ + 48, 49, 50, 51, 52 } + +/* ADJUST_REG_ALLOC_ORDER is a macro which permits reg_alloc_order + to be rearranged based on a particular function. When using sse math, + we want to allocate SSE before x87 registers and vice versa. */ + +#define ADJUST_REG_ALLOC_ORDER x86_order_regs_for_local_alloc () + + +#define OVERRIDE_ABI_FORMAT(FNDECL) ix86_call_abi_override (FNDECL) + +/* Return number of consecutive hard regs needed starting at reg REGNO + to hold something of mode MODE. + This is ordinarily the length in words of a value of mode MODE + but can be less for certain modes in special long registers. + + Actually there are no two word move instructions for consecutive + registers. And only registers 0-3 may have mov byte instructions + applied to them. */ + +#define HARD_REGNO_NREGS(REGNO, MODE) \ + (FP_REGNO_P (REGNO) || SSE_REGNO_P (REGNO) || MMX_REGNO_P (REGNO) \ + ? (COMPLEX_MODE_P (MODE) ? 2 : 1) \ + : ((MODE) == XFmode \ + ? (TARGET_64BIT ? 2 : 3) \ + : (MODE) == XCmode \ + ? (TARGET_64BIT ? 4 : 6) \ + : ((GET_MODE_SIZE (MODE) + UNITS_PER_WORD - 1) / UNITS_PER_WORD))) + +#define HARD_REGNO_NREGS_HAS_PADDING(REGNO, MODE) \ + ((TARGET_128BIT_LONG_DOUBLE && !TARGET_64BIT) \ + ? 
(FP_REGNO_P (REGNO) || SSE_REGNO_P (REGNO) || MMX_REGNO_P (REGNO) \ + ? 0 \ + : ((MODE) == XFmode || (MODE) == XCmode)) \ + : 0) + +#define HARD_REGNO_NREGS_WITH_PADDING(REGNO, MODE) ((MODE) == XFmode ? 4 : 8) + +#define VALID_AVX256_REG_MODE(MODE) \ + ((MODE) == V32QImode || (MODE) == V16HImode || (MODE) == V8SImode \ + || (MODE) == V4DImode || (MODE) == V8SFmode || (MODE) == V4DFmode) + +#define VALID_SSE2_REG_MODE(MODE) \ + ((MODE) == V16QImode || (MODE) == V8HImode || (MODE) == V2DFmode \ + || (MODE) == V2DImode || (MODE) == DFmode) + +#define VALID_SSE_REG_MODE(MODE) \ + ((MODE) == V1TImode || (MODE) == TImode \ + || (MODE) == V4SFmode || (MODE) == V4SImode \ + || (MODE) == SFmode || (MODE) == TFmode) + +#define VALID_MMX_REG_MODE_3DNOW(MODE) \ + ((MODE) == V2SFmode || (MODE) == SFmode) + +#define VALID_MMX_REG_MODE(MODE) \ + ((MODE == V1DImode) || (MODE) == DImode \ + || (MODE) == V2SImode || (MODE) == SImode \ + || (MODE) == V4HImode || (MODE) == V8QImode) + +#define VALID_DFP_MODE_P(MODE) \ + ((MODE) == SDmode || (MODE) == DDmode || (MODE) == TDmode) + +#define VALID_FP_MODE_P(MODE) \ + ((MODE) == SFmode || (MODE) == DFmode || (MODE) == XFmode \ + || (MODE) == SCmode || (MODE) == DCmode || (MODE) == XCmode) \ + +#define VALID_INT_MODE_P(MODE) \ + ((MODE) == QImode || (MODE) == HImode || (MODE) == SImode \ + || (MODE) == DImode \ + || (MODE) == CQImode || (MODE) == CHImode || (MODE) == CSImode \ + || (MODE) == CDImode \ + || (TARGET_64BIT && ((MODE) == TImode || (MODE) == CTImode \ + || (MODE) == TFmode || (MODE) == TCmode))) + +/* Return true for modes passed in SSE registers. */ +#define SSE_REG_MODE_P(MODE) \ + ((MODE) == V1TImode || (MODE) == TImode || (MODE) == V16QImode \ + || (MODE) == TFmode || (MODE) == V8HImode || (MODE) == V2DFmode \ + || (MODE) == V2DImode || (MODE) == V4SFmode || (MODE) == V4SImode \ + || (MODE) == V32QImode || (MODE) == V16HImode || (MODE) == V8SImode \ + || (MODE) == V4DImode || (MODE) == V8SFmode || (MODE) == V4DFmode) + +/* Value is 1 if hard register REGNO can hold a value of machine-mode MODE. */ + +#define HARD_REGNO_MODE_OK(REGNO, MODE) \ + ix86_hard_regno_mode_ok ((REGNO), (MODE)) + +/* Value is 1 if it is a good idea to tie two pseudo registers + when one has mode MODE1 and one has mode MODE2. + If HARD_REGNO_MODE_OK could produce different values for MODE1 and MODE2, + for any hard reg, then this must be 0 for correct output. */ + +#define MODES_TIEABLE_P(MODE1, MODE2) ix86_modes_tieable_p (MODE1, MODE2) + +/* It is possible to write patterns to move flags; but until someone + does it, */ +#define AVOID_CCMODE_COPIES + +/* Specify the modes required to caller save a given hard regno. + We do this on i386 to prevent flags from being saved at all. + + Kill any attempts to combine saving of modes. */ + +#define HARD_REGNO_CALLER_SAVE_MODE(REGNO, NREGS, MODE) \ + (CC_REGNO_P (REGNO) ? VOIDmode \ + : (MODE) == VOIDmode && (NREGS) != 1 ? VOIDmode \ + : (MODE) == VOIDmode ? choose_hard_reg_mode ((REGNO), (NREGS), false) \ + : (MODE) == HImode && !TARGET_PARTIAL_REG_STALL ? SImode \ + : (MODE) == QImode && (REGNO) > BX_REG && !TARGET_64BIT ? SImode \ + : (MODE)) + +/* The only ABI that saves SSE registers across calls is Win64 (thus no + need to check the current ABI here), and with AVX enabled Win64 only + guarantees that the low 16 bytes are saved. */ +#define HARD_REGNO_CALL_PART_CLOBBERED(REGNO, MODE) \ + (SSE_REGNO_P (REGNO) && GET_MODE_SIZE (MODE) > 16) + +/* Specify the registers used for certain standard purposes. 
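/* For general registers, HARD_REGNO_NREGS above is just a round-up
   division by the word size (XFmode and XCmode are special-cased).  A
   short worked example with the two UNITS_PER_WORD values this port
   uses:  */
#include <stdio.h>

static int
regs_needed (int mode_size, int units_per_word)
{
  return (mode_size + units_per_word - 1) / units_per_word;
}

int
main (void)
{
  printf ("DImode (8 bytes), 32-bit: %d regs\n", regs_needed (8, 4));   /* 2 */
  printf ("DImode (8 bytes), 64-bit: %d reg\n", regs_needed (8, 8));    /* 1 */
  printf ("TImode (16 bytes), 64-bit: %d regs\n", regs_needed (16, 8)); /* 2 */
  return 0;
}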
+ The values of these macros are register numbers. */ + +/* on the 386 the pc register is %eip, and is not usable as a general + register. The ordinary mov instructions won't work */ +/* #define PC_REGNUM */ + +/* Register to use for pushing function arguments. */ +#define STACK_POINTER_REGNUM 7 + +/* Base register for access to local variables of the function. */ +#define HARD_FRAME_POINTER_REGNUM 6 + +/* Base register for access to local variables of the function. */ +#define FRAME_POINTER_REGNUM 20 + +/* First floating point reg */ +#define FIRST_FLOAT_REG 8 + +/* First & last stack-like regs */ +#define FIRST_STACK_REG FIRST_FLOAT_REG +#define LAST_STACK_REG (FIRST_FLOAT_REG + 7) + +#define FIRST_SSE_REG (FRAME_POINTER_REGNUM + 1) +#define LAST_SSE_REG (FIRST_SSE_REG + 7) + +#define FIRST_MMX_REG (LAST_SSE_REG + 1) +#define LAST_MMX_REG (FIRST_MMX_REG + 7) + +#define FIRST_REX_INT_REG (LAST_MMX_REG + 1) +#define LAST_REX_INT_REG (FIRST_REX_INT_REG + 7) + +#define FIRST_REX_SSE_REG (LAST_REX_INT_REG + 1) +#define LAST_REX_SSE_REG (FIRST_REX_SSE_REG + 7) + +/* Override this in other tm.h files to cope with various OS lossage + requiring a frame pointer. */ +#ifndef SUBTARGET_FRAME_POINTER_REQUIRED +#define SUBTARGET_FRAME_POINTER_REQUIRED 0 +#endif + +/* Make sure we can access arbitrary call frames. */ +#define SETUP_FRAME_ADDRESSES() ix86_setup_frame_addresses () + +/* Base register for access to arguments of the function. */ +#define ARG_POINTER_REGNUM 16 + +/* Register to hold the addressing base for position independent + code access to data items. We don't use PIC pointer for 64bit + mode. Define the regnum to dummy value to prevent gcc from + pessimizing code dealing with EBX. + + To avoid clobbering a call-saved register unnecessarily, we renumber + the pic register when possible. The change is visible after the + prologue has been emitted. */ + +#define REAL_PIC_OFFSET_TABLE_REGNUM BX_REG + +#define PIC_OFFSET_TABLE_REGNUM \ + ((TARGET_64BIT && ix86_cmodel == CM_SMALL_PIC) \ + || !flag_pic ? INVALID_REGNUM \ + : reload_completed ? REGNO (pic_offset_table_rtx) \ + : REAL_PIC_OFFSET_TABLE_REGNUM) + +#define GOT_SYMBOL_NAME "_GLOBAL_OFFSET_TABLE_" + +/* This is overridden by . */ +#define MS_AGGREGATE_RETURN 0 + +/* This is overridden by . */ +#define KEEP_AGGREGATE_RETURN_POINTER 0 + +/* Define the classes of registers for register constraints in the + machine description. Also define ranges of constants. + + One of the classes must always be named ALL_REGS and include all hard regs. + If there is more than one class, another class must be named NO_REGS + and contain no registers. + + The name GENERAL_REGS must be the name of a class (or an alias for + another name such as ALL_REGS). This is the class of registers + that is allowed by "g" or "r" in a register constraint. + Also, registers outside this class are allocated only when + instructions express preferences for them. + + The classes must be numbered in nondecreasing order; that is, + a larger-numbered class must never be contained completely + in a smaller-numbered class. + + For any two classes, it is very desirable that there be another + class that represents their union. + + It might seem that class BREG is unnecessary, since no useful 386 + opcode needs reg %ebx. But some systems pass args to the OS in ebx, + and the "b" register constraint is useful in asms for syscalls. + + The flags, fpsr and fpcr registers are in no class. 
*/ + +enum reg_class +{ + NO_REGS, + AREG, DREG, CREG, BREG, SIREG, DIREG, + AD_REGS, /* %eax/%edx for DImode */ + CLOBBERED_REGS, /* call-clobbered integers */ + Q_REGS, /* %eax %ebx %ecx %edx */ + NON_Q_REGS, /* %esi %edi %ebp %esp */ + INDEX_REGS, /* %eax %ebx %ecx %edx %esi %edi %ebp */ + LEGACY_REGS, /* %eax %ebx %ecx %edx %esi %edi %ebp %esp */ + GENERAL_REGS, /* %eax %ebx %ecx %edx %esi %edi %ebp %esp + %r8 %r9 %r10 %r11 %r12 %r13 %r14 %r15 */ + FP_TOP_REG, FP_SECOND_REG, /* %st(0) %st(1) */ + FLOAT_REGS, + SSE_FIRST_REG, + SSE_REGS, + MMX_REGS, + FP_TOP_SSE_REGS, + FP_SECOND_SSE_REGS, + FLOAT_SSE_REGS, + FLOAT_INT_REGS, + INT_SSE_REGS, + FLOAT_INT_SSE_REGS, + ALL_REGS, LIM_REG_CLASSES +}; + +#define N_REG_CLASSES ((int) LIM_REG_CLASSES) + +#define INTEGER_CLASS_P(CLASS) \ + reg_class_subset_p ((CLASS), GENERAL_REGS) +#define FLOAT_CLASS_P(CLASS) \ + reg_class_subset_p ((CLASS), FLOAT_REGS) +#define SSE_CLASS_P(CLASS) \ + reg_class_subset_p ((CLASS), SSE_REGS) +#define MMX_CLASS_P(CLASS) \ + ((CLASS) == MMX_REGS) +#define MAYBE_INTEGER_CLASS_P(CLASS) \ + reg_classes_intersect_p ((CLASS), GENERAL_REGS) +#define MAYBE_FLOAT_CLASS_P(CLASS) \ + reg_classes_intersect_p ((CLASS), FLOAT_REGS) +#define MAYBE_SSE_CLASS_P(CLASS) \ + reg_classes_intersect_p (SSE_REGS, (CLASS)) +#define MAYBE_MMX_CLASS_P(CLASS) \ + reg_classes_intersect_p (MMX_REGS, (CLASS)) + +#define Q_CLASS_P(CLASS) \ + reg_class_subset_p ((CLASS), Q_REGS) + +/* Give names of register classes as strings for dump file. */ + +#define REG_CLASS_NAMES \ +{ "NO_REGS", \ + "AREG", "DREG", "CREG", "BREG", \ + "SIREG", "DIREG", \ + "AD_REGS", \ + "CLOBBERED_REGS", \ + "Q_REGS", "NON_Q_REGS", \ + "INDEX_REGS", \ + "LEGACY_REGS", \ + "GENERAL_REGS", \ + "FP_TOP_REG", "FP_SECOND_REG", \ + "FLOAT_REGS", \ + "SSE_FIRST_REG", \ + "SSE_REGS", \ + "MMX_REGS", \ + "FP_TOP_SSE_REGS", \ + "FP_SECOND_SSE_REGS", \ + "FLOAT_SSE_REGS", \ + "FLOAT_INT_REGS", \ + "INT_SSE_REGS", \ + "FLOAT_INT_SSE_REGS", \ + "ALL_REGS" } + +/* Define which registers fit in which classes. This is an initializer + for a vector of HARD_REG_SET of length N_REG_CLASSES. + + Note that the default setting of CLOBBERED_REGS is for 32-bit; this + is adjusted by TARGET_CONDITIONAL_REGISTER_USAGE for the 64-bit ABI + in effect. */ + +#define REG_CLASS_CONTENTS \ +{ { 0x00, 0x0 }, \ + { 0x01, 0x0 }, { 0x02, 0x0 }, /* AREG, DREG */ \ + { 0x04, 0x0 }, { 0x08, 0x0 }, /* CREG, BREG */ \ + { 0x10, 0x0 }, { 0x20, 0x0 }, /* SIREG, DIREG */ \ + { 0x03, 0x0 }, /* AD_REGS */ \ + { 0x07, 0x0 }, /* CLOBBERED_REGS */ \ + { 0x0f, 0x0 }, /* Q_REGS */ \ + { 0x1100f0, 0x1fe0 }, /* NON_Q_REGS */ \ + { 0x7f, 0x1fe0 }, /* INDEX_REGS */ \ + { 0x1100ff, 0x0 }, /* LEGACY_REGS */ \ + { 0x1100ff, 0x1fe0 }, /* GENERAL_REGS */ \ + { 0x100, 0x0 }, { 0x0200, 0x0 },/* FP_TOP_REG, FP_SECOND_REG */\ + { 0xff00, 0x0 }, /* FLOAT_REGS */ \ + { 0x200000, 0x0 }, /* SSE_FIRST_REG */ \ +{ 0x1fe00000,0x1fe000 }, /* SSE_REGS */ \ +{ 0xe0000000, 0x1f }, /* MMX_REGS */ \ +{ 0x1fe00100,0x1fe000 }, /* FP_TOP_SSE_REG */ \ +{ 0x1fe00200,0x1fe000 }, /* FP_SECOND_SSE_REG */ \ +{ 0x1fe0ff00,0x1fe000 }, /* FLOAT_SSE_REGS */ \ + { 0x1ffff, 0x1fe0 }, /* FLOAT_INT_REGS */ \ +{ 0x1fe100ff,0x1fffe0 }, /* INT_SSE_REGS */ \ +{ 0x1fe1ffff,0x1fffe0 }, /* FLOAT_INT_SSE_REGS */ \ +{ 0xffffffff,0x1fffff } \ +} + +/* The same information, inverted: + Return the class number of the smallest class containing + reg number REGNO. This could be a conditional expression + or could index an array. 
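/* Each REG_CLASS_CONTENTS entry above is written as two 32-bit words
   forming a bitmap over the hard registers: bit N of word N/32 stands
   for hard register N.  A small membership-test sketch using masks
   copied from the table:  */
#include <stdio.h>

static int
class_contains (const unsigned contents[2], int regno)
{
  return (contents[regno / 32] >> (regno % 32)) & 1;
}

int
main (void)
{
  const unsigned ad_regs[2]    = { 0x03, 0x0 };   /* %eax (0), %edx (1) */
  const unsigned float_regs[2] = { 0xff00, 0x0 }; /* st..st(7), regs 8..15 */
  printf ("AD_REGS has dx (1): %d\n", class_contains (ad_regs, 1));       /* 1 */
  printf ("AD_REGS has cx (2): %d\n", class_contains (ad_regs, 2));       /* 0 */
  printf ("FLOAT_REGS has st (8): %d\n", class_contains (float_regs, 8)); /* 1 */
  return 0;
}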
*/ + +#define REGNO_REG_CLASS(REGNO) (regclass_map[REGNO]) + +/* When this hook returns true for MODE, the compiler allows + registers explicitly used in the rtl to be used as spill registers + but prevents the compiler from extending the lifetime of these + registers. */ +#define TARGET_SMALL_REGISTER_CLASSES_FOR_MODE_P hook_bool_mode_true + +#define QI_REG_P(X) (REG_P (X) && REGNO (X) <= BX_REG) + +#define GENERAL_REGNO_P(N) \ + ((N) <= STACK_POINTER_REGNUM || REX_INT_REGNO_P (N)) + +#define GENERAL_REG_P(X) \ + (REG_P (X) && GENERAL_REGNO_P (REGNO (X))) + +#define ANY_QI_REG_P(X) (TARGET_64BIT ? GENERAL_REG_P(X) : QI_REG_P (X)) + +#define REX_INT_REGNO_P(N) \ + IN_RANGE ((N), FIRST_REX_INT_REG, LAST_REX_INT_REG) +#define REX_INT_REG_P(X) (REG_P (X) && REX_INT_REGNO_P (REGNO (X))) + +#define FP_REG_P(X) (REG_P (X) && FP_REGNO_P (REGNO (X))) +#define FP_REGNO_P(N) IN_RANGE ((N), FIRST_STACK_REG, LAST_STACK_REG) +#define ANY_FP_REG_P(X) (REG_P (X) && ANY_FP_REGNO_P (REGNO (X))) +#define ANY_FP_REGNO_P(N) (FP_REGNO_P (N) || SSE_REGNO_P (N)) + +#define X87_FLOAT_MODE_P(MODE) \ + (TARGET_80387 && ((MODE) == SFmode || (MODE) == DFmode || (MODE) == XFmode)) + +#define SSE_REG_P(N) (REG_P (N) && SSE_REGNO_P (REGNO (N))) +#define SSE_REGNO_P(N) \ + (IN_RANGE ((N), FIRST_SSE_REG, LAST_SSE_REG) \ + || REX_SSE_REGNO_P (N)) + +#define REX_SSE_REGNO_P(N) \ + IN_RANGE ((N), FIRST_REX_SSE_REG, LAST_REX_SSE_REG) + +#define SSE_REGNO(N) \ + ((N) < 8 ? FIRST_SSE_REG + (N) : FIRST_REX_SSE_REG + (N) - 8) + +#define SSE_FLOAT_MODE_P(MODE) \ + ((TARGET_SSE && (MODE) == SFmode) || (TARGET_SSE2 && (MODE) == DFmode)) + +#define SSE_VEC_FLOAT_MODE_P(MODE) \ + ((TARGET_SSE && (MODE) == V4SFmode) || (TARGET_SSE2 && (MODE) == V2DFmode)) + +#define AVX_FLOAT_MODE_P(MODE) \ + (TARGET_AVX && ((MODE) == SFmode || (MODE) == DFmode)) + +#define AVX128_VEC_FLOAT_MODE_P(MODE) \ + (TARGET_AVX && ((MODE) == V4SFmode || (MODE) == V2DFmode)) + +#define AVX256_VEC_FLOAT_MODE_P(MODE) \ + (TARGET_AVX && ((MODE) == V8SFmode || (MODE) == V4DFmode)) + +#define AVX_VEC_FLOAT_MODE_P(MODE) \ + (TARGET_AVX && ((MODE) == V4SFmode || (MODE) == V2DFmode \ + || (MODE) == V8SFmode || (MODE) == V4DFmode)) + +#define FMA4_VEC_FLOAT_MODE_P(MODE) \ + (TARGET_FMA4 && ((MODE) == V4SFmode || (MODE) == V2DFmode \ + || (MODE) == V8SFmode || (MODE) == V4DFmode)) + +#define MMX_REG_P(XOP) (REG_P (XOP) && MMX_REGNO_P (REGNO (XOP))) +#define MMX_REGNO_P(N) IN_RANGE ((N), FIRST_MMX_REG, LAST_MMX_REG) + +#define STACK_REG_P(XOP) (REG_P (XOP) && STACK_REGNO_P (REGNO (XOP))) +#define STACK_REGNO_P(N) IN_RANGE ((N), FIRST_STACK_REG, LAST_STACK_REG) + +#define STACK_TOP_P(XOP) (REG_P (XOP) && REGNO (XOP) == FIRST_STACK_REG) + +#define CC_REG_P(X) (REG_P (X) && CC_REGNO_P (REGNO (X))) +#define CC_REGNO_P(X) ((X) == FLAGS_REG || (X) == FPSR_REG) + +/* The class value for index registers, and the one for base regs. */ + +#define INDEX_REG_CLASS INDEX_REGS +#define BASE_REG_CLASS GENERAL_REGS + +/* Place additional restrictions on the register class to use when it + is necessary to be able to hold a value of mode MODE in a reload + register for which class CLASS would ordinarily be used. */ + +#define LIMIT_RELOAD_CLASS(MODE, CLASS) \ + ((MODE) == QImode && !TARGET_64BIT \ + && ((CLASS) == ALL_REGS || (CLASS) == GENERAL_REGS \ + || (CLASS) == LEGACY_REGS || (CLASS) == INDEX_REGS) \ + ? Q_REGS : (CLASS)) + +/* If we are copying between general and FP registers, we need a memory + location. The same is true for SSE and MMX registers. 
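/* SSE_REGNO above maps the logical xmm index 0..15 onto the
   non-contiguous hard register numbering of this port: xmm0..xmm7 sit
   right after the frame pointer (21..28) and xmm8..xmm15 follow the
   REX integer registers (45..52).  A quick sketch with those constants
   folded in:  */
#include <stdio.h>

#define SKETCH_FIRST_SSE_REG     21  /* FRAME_POINTER_REGNUM + 1 */
#define SKETCH_FIRST_REX_SSE_REG 45  /* LAST_REX_INT_REG + 1 */
#define SKETCH_SSE_REGNO(N) \
  ((N) < 8 ? SKETCH_FIRST_SSE_REG + (N) : SKETCH_FIRST_REX_SSE_REG + (N) - 8)

int
main (void)
{
  for (int i = 0; i < 16; i++)
    printf ("xmm%-2d -> hard reg %d\n", i, SKETCH_SSE_REGNO (i));
  return 0;
}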
*/ +#define SECONDARY_MEMORY_NEEDED(CLASS1, CLASS2, MODE) \ + ix86_secondary_memory_needed ((CLASS1), (CLASS2), (MODE), 1) + +/* Get_secondary_mem widens integral modes to BITS_PER_WORD. + There is no need to emit full 64 bit move on 64 bit targets + for integral modes that can be moved using 32 bit move. */ +#define SECONDARY_MEMORY_NEEDED_MODE(MODE) \ + (GET_MODE_BITSIZE (MODE) < 32 && INTEGRAL_MODE_P (MODE) \ + ? mode_for_size (32, GET_MODE_CLASS (MODE), 0) \ + : MODE) + +/* Return the maximum number of consecutive registers + needed to represent mode MODE in a register of class CLASS. */ +/* On the 80386, this is the size of MODE in words, + except in the FP regs, where a single reg is always enough. */ +#define CLASS_MAX_NREGS(CLASS, MODE) \ + (MAYBE_INTEGER_CLASS_P (CLASS) \ + ? ((MODE) == XFmode \ + ? (TARGET_64BIT ? 2 : 3) \ + : (MODE) == XCmode \ + ? (TARGET_64BIT ? 4 : 6) \ + : ((GET_MODE_SIZE (MODE) + UNITS_PER_WORD - 1) / UNITS_PER_WORD)) \ + : (COMPLEX_MODE_P (MODE) ? 2 : 1)) + +/* Return a class of registers that cannot change FROM mode to TO mode. */ + +#define CANNOT_CHANGE_MODE_CLASS(FROM, TO, CLASS) \ + ix86_cannot_change_mode_class (FROM, TO, CLASS) + +/* Stack layout; function entry, exit and calling. */ + +/* Define this if pushing a word on the stack + makes the stack pointer a smaller address. */ +#define STACK_GROWS_DOWNWARD + +/* Define this to nonzero if the nominal address of the stack frame + is at the high-address end of the local variables; + that is, each additional local variable allocated + goes at a more negative offset in the frame. */ +#define FRAME_GROWS_DOWNWARD 1 + +/* Offset within stack frame to start allocating local variables at. + If FRAME_GROWS_DOWNWARD, this is the offset to the END of the + first local allocated. Otherwise, it is the offset to the BEGINNING + of the first local allocated. */ +#define STARTING_FRAME_OFFSET 0 + +/* If we generate an insn to push BYTES bytes, this says how many the stack + pointer really advances by. On 386, we have pushw instruction that + decrements by exactly 2 no matter what the position was, there is no pushb. + + But as CIE data alignment factor on this arch is -4 for 32bit targets + and -8 for 64bit targets, we need to make sure all stack pointer adjustments + are in multiple of 4 for 32bit targets and 8 for 64bit targets. */ + +#define PUSH_ROUNDING(BYTES) \ + (((BYTES) + UNITS_PER_WORD - 1) & -UNITS_PER_WORD) + +/* If defined, the maximum amount of space required for outgoing arguments + will be computed and placed into the variable `crtl->outgoing_args_size'. + No space will be pushed onto the stack for each call; instead, the + function prologue should increase the stack frame size by this amount. + + MS ABI seem to require 16 byte alignment everywhere except for function + prologue and apilogue. This is not possible without + ACCUMULATE_OUTGOING_ARGS. */ + +#define ACCUMULATE_OUTGOING_ARGS \ + (TARGET_ACCUMULATE_OUTGOING_ARGS || ix86_cfun_abi () == MS_ABI) + +/* If defined, a C expression whose value is nonzero when we want to use PUSH + instructions to pass outgoing arguments. */ + +#define PUSH_ARGS (TARGET_PUSH_ARGS && !ACCUMULATE_OUTGOING_ARGS) + +/* We want the stack and args grow in opposite directions, even if + PUSH_ARGS is 0. */ +#define PUSH_ARGS_REVERSED 1 + +/* Offset of first parameter from the argument pointer register value. 
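/* PUSH_ROUNDING above rounds every pushed size up to a word multiple so
   stack adjustments stay a multiple of the CIE data alignment factor.
   A short worked example of that expression:  */
#include <stdio.h>

static int
push_rounding (int bytes, int units_per_word)
{
  return (bytes + units_per_word - 1) & -units_per_word;
}

int
main (void)
{
  /* 32-bit target, UNITS_PER_WORD == 4: pushing 1..4 bytes moves the
     stack pointer by 4.  */
  printf ("%d %d %d\n", push_rounding (1, 4), push_rounding (3, 4),
          push_rounding (6, 4));                                   /* 4 4 8 */
  /* 64-bit target, UNITS_PER_WORD == 8.  */
  printf ("%d %d\n", push_rounding (2, 8), push_rounding (10, 8)); /* 8 16 */
  return 0;
}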
*/ +#define FIRST_PARM_OFFSET(FNDECL) 0 + +/* Define this macro if functions should assume that stack space has been + allocated for arguments even when their values are passed in registers. + + The value of this macro is the size, in bytes, of the area reserved for + arguments passed in registers for the function represented by FNDECL. + + This space can be allocated by the caller, or be a part of the + machine-dependent stack frame: `OUTGOING_REG_PARM_STACK_SPACE' says + which. */ +#define REG_PARM_STACK_SPACE(FNDECL) ix86_reg_parm_stack_space (FNDECL) + +#define OUTGOING_REG_PARM_STACK_SPACE(FNTYPE) \ + (ix86_function_type_abi (FNTYPE) == MS_ABI) + +/* Define how to find the value returned by a library function + assuming the value has mode MODE. */ + +#define LIBCALL_VALUE(MODE) ix86_libcall_value (MODE) + +/* Define the size of the result block used for communication between + untyped_call and untyped_return. The block contains a DImode value + followed by the block used by fnsave and frstor. */ + +#define APPLY_RESULT_SIZE (8+108) + +/* 1 if N is a possible register number for function argument passing. */ +#define FUNCTION_ARG_REGNO_P(N) ix86_function_arg_regno_p (N) + +#ifndef USED_FOR_TARGET +/* Define a data type for recording info about an argument list + during the scan of that argument list. This data type should + hold all necessary information about the function itself + and about the args processed so far, enough to enable macros + such as FUNCTION_ARG to determine where the next arg should go. */ + +typedef struct ix86_args { + int words; /* # words passed so far */ + int nregs; /* # registers available for passing */ + int regno; /* next available register number */ + int fastcall; /* fastcall or thiscall calling convention + is used */ + int sse_words; /* # sse words passed so far */ + int sse_nregs; /* # sse registers available for passing */ + int warn_avx; /* True when we want to warn about AVX ABI. */ + int warn_sse; /* True when we want to warn about SSE ABI. */ + int warn_mmx; /* True when we want to warn about MMX ABI. */ + int sse_regno; /* next available sse register number */ + int mmx_words; /* # mmx words passed so far */ + int mmx_nregs; /* # mmx registers available for passing */ + int mmx_regno; /* next available mmx register number */ + int maybe_vaarg; /* true for calls to possibly vardic fncts. */ + int caller; /* true if it is caller. */ + int float_in_sse; /* Set to 1 or 2 for 32bit targets if + SFmode/DFmode arguments should be passed + in SSE registers. Otherwise 0. */ + enum calling_abi call_abi; /* Set to SYSV_ABI for sysv abi. Otherwise + MS_ABI for ms abi. */ + /* Nonzero if it passes 256bit AVX modes. */ + BOOL_BITFIELD callee_pass_avx256_p : 1; + /* Nonzero if it returns 256bit AVX modes. */ + BOOL_BITFIELD callee_return_avx256_p : 1; +} CUMULATIVE_ARGS; +#endif + +/* Initialize a variable CUM of type CUMULATIVE_ARGS + for a call to a function whose data type is FNTYPE. + For a library call, FNTYPE is 0. */ + +#define INIT_CUMULATIVE_ARGS(CUM, FNTYPE, LIBNAME, FNDECL, N_NAMED_ARGS) \ + init_cumulative_args (&(CUM), (FNTYPE), (LIBNAME), (FNDECL), \ + (N_NAMED_ARGS) != -1) + +/* Output assembler code to FILE to increment profiler label # LABELNO + for profiling a function entry. 
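/* A much reduced sketch of how a cursor like CUMULATIVE_ARGS above gets
   consumed: integer arguments occupy registers while nregs lasts and
   spill to stack words afterwards.  The helper and its fields are
   hypothetical simplifications, not the real init_cumulative_args /
   function_arg machinery.  */
struct arg_cursor
{
  int nregs;   /* integer registers still available */
  int regno;   /* next integer register number */
  int words;   /* stack words used so far */
};

/* Return the register number used, or -1 if the argument (SIZE_WORDS
   words long) goes on the stack.  */
static int
take_int_arg (struct arg_cursor *c, int size_words)
{
  if (c->nregs >= size_words)
    {
      int reg = c->regno;
      c->nregs -= size_words;
      c->regno += size_words;
      return reg;
    }
  c->words += size_words;
  return -1;
}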
*/ + +#define FUNCTION_PROFILER(FILE, LABELNO) x86_function_profiler (FILE, LABELNO) + +#define MCOUNT_NAME "_mcount" + +#define MCOUNT_NAME_BEFORE_PROLOGUE "__fentry__" + +#define PROFILE_COUNT_REGISTER "edx" + +/* EXIT_IGNORE_STACK should be nonzero if, when returning from a function, + the stack pointer does not matter. The value is tested only in + functions that have frame pointers. + No definition is equivalent to always zero. */ +/* Note on the 386 it might be more efficient not to define this since + we have to restore it ourselves from the frame pointer, in order to + use pop */ + +#define EXIT_IGNORE_STACK 1 + +/* Output assembler code for a block containing the constant parts + of a trampoline, leaving space for the variable parts. */ + +/* On the 386, the trampoline contains two instructions: + mov #STATIC,ecx + jmp FUNCTION + The trampoline is generated entirely at runtime. The operand of JMP + is the address of FUNCTION relative to the instruction following the + JMP (which is 5 bytes long). */ + +/* Length in units of the trampoline for entering a nested function. */ + +#define TRAMPOLINE_SIZE (TARGET_64BIT ? 24 : 10) + +/* Definitions for register eliminations. + + This is an array of structures. Each structure initializes one pair + of eliminable registers. The "from" register number is given first, + followed by "to". Eliminations of the same "from" register are listed + in order of preference. + + There are two registers that can always be eliminated on the i386. + The frame pointer and the arg pointer can be replaced by either the + hard frame pointer or to the stack pointer, depending upon the + circumstances. The hard frame pointer is not used before reload and + so it is not eligible for elimination. */ + +#define ELIMINABLE_REGS \ +{{ ARG_POINTER_REGNUM, STACK_POINTER_REGNUM}, \ + { ARG_POINTER_REGNUM, HARD_FRAME_POINTER_REGNUM}, \ + { FRAME_POINTER_REGNUM, STACK_POINTER_REGNUM}, \ + { FRAME_POINTER_REGNUM, HARD_FRAME_POINTER_REGNUM}} \ + +/* Define the offset between two registers, one to be eliminated, and the other + its replacement, at the start of a routine. */ + +#define INITIAL_ELIMINATION_OFFSET(FROM, TO, OFFSET) \ + ((OFFSET) = ix86_initial_elimination_offset ((FROM), (TO))) + +/* Addressing modes, and classification of registers for them. */ + +/* Macros to check register numbers against specific register classes. */ + +/* These assume that REGNO is a hard or pseudo reg number. + They give nonzero only if REGNO is a hard reg of the suitable class + or a pseudo reg currently allocated to a suitable hard reg. + Since they use reg_renumber, they are safe only once reg_renumber + has been allocated, which happens in local-alloc.c. */ + +#define REGNO_OK_FOR_INDEX_P(REGNO) \ + ((REGNO) < STACK_POINTER_REGNUM \ + || REX_INT_REGNO_P (REGNO) \ + || (unsigned) reg_renumber[(REGNO)] < STACK_POINTER_REGNUM \ + || REX_INT_REGNO_P ((unsigned) reg_renumber[(REGNO)])) + +#define REGNO_OK_FOR_BASE_P(REGNO) \ + (GENERAL_REGNO_P (REGNO) \ + || (REGNO) == ARG_POINTER_REGNUM \ + || (REGNO) == FRAME_POINTER_REGNUM \ + || GENERAL_REGNO_P ((unsigned) reg_renumber[(REGNO)])) + +/* The macros REG_OK_FOR..._P assume that the arg is a REG rtx + and check its validity for a certain class. + We have two alternate definitions for each of them. + The usual definition accepts all pseudo regs; the other rejects + them unless they have been allocated suitable hard regs. + The symbol REG_OK_STRICT causes the latter definition to be used. 
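/* A rough sketch of the 10-byte 32-bit trampoline described above.  The
   opcode bytes are the usual encodings of "mov $imm32, %ecx" (0xb9) and
   "jmp rel32" (0xe9); treat them as illustrative, since the real bytes
   are emitted elsewhere in the port.  The displacement is relative to
   the instruction following the 5-byte jmp, i.e. to TRAMP + 10.  */
#include <stdint.h>
#include <string.h>

static void
build_trampoline_32 (unsigned char tramp[10], uint32_t static_chain,
                     uint32_t tramp_addr, uint32_t func_addr)
{
  uint32_t disp = func_addr - (tramp_addr + 10);
  tramp[0] = 0xb9;                    /* mov $static_chain, %ecx */
  memcpy (tramp + 1, &static_chain, 4);
  tramp[5] = 0xe9;                    /* jmp func (relative) */
  memcpy (tramp + 6, &disp, 4);
}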
+ + Most source files want to accept pseudo regs in the hope that + they will get allocated to the class that the insn wants them to be in. + Source files for reload pass need to be strict. + After reload, it makes no difference, since pseudo regs have + been eliminated by then. */ + + +/* Non strict versions, pseudos are ok. */ +#define REG_OK_FOR_INDEX_NONSTRICT_P(X) \ + (REGNO (X) < STACK_POINTER_REGNUM \ + || REX_INT_REGNO_P (REGNO (X)) \ + || REGNO (X) >= FIRST_PSEUDO_REGISTER) + +#define REG_OK_FOR_BASE_NONSTRICT_P(X) \ + (GENERAL_REGNO_P (REGNO (X)) \ + || REGNO (X) == ARG_POINTER_REGNUM \ + || REGNO (X) == FRAME_POINTER_REGNUM \ + || REGNO (X) >= FIRST_PSEUDO_REGISTER) + +/* Strict versions, hard registers only */ +#define REG_OK_FOR_INDEX_STRICT_P(X) REGNO_OK_FOR_INDEX_P (REGNO (X)) +#define REG_OK_FOR_BASE_STRICT_P(X) REGNO_OK_FOR_BASE_P (REGNO (X)) + +#ifndef REG_OK_STRICT +#define REG_OK_FOR_INDEX_P(X) REG_OK_FOR_INDEX_NONSTRICT_P (X) +#define REG_OK_FOR_BASE_P(X) REG_OK_FOR_BASE_NONSTRICT_P (X) + +#else +#define REG_OK_FOR_INDEX_P(X) REG_OK_FOR_INDEX_STRICT_P (X) +#define REG_OK_FOR_BASE_P(X) REG_OK_FOR_BASE_STRICT_P (X) +#endif + +/* TARGET_LEGITIMATE_ADDRESS_P recognizes an RTL expression + that is a valid memory address for an instruction. + The MODE argument is the machine mode for the MEM expression + that wants to use this address. + + The other macros defined here are used only in TARGET_LEGITIMATE_ADDRESS_P, + except for CONSTANT_ADDRESS_P which is usually machine-independent. + + See legitimize_pic_address in i386.c for details as to what + constitutes a legitimate address when -fpic is used. */ + +#define MAX_REGS_PER_ADDRESS 2 + +#define CONSTANT_ADDRESS_P(X) constant_address_p (X) + +/* Nonzero if the constant value X is a legitimate general operand. + It is given that X satisfies CONSTANT_P or is a CONST_DOUBLE. */ + +#define LEGITIMATE_CONSTANT_P(X) legitimate_constant_p (X) + +/* Try a machine-dependent way of reloading an illegitimate address + operand. If we find one, push the reload and jump to WIN. This + macro is used in only one place: `find_reloads_address' in reload.c. */ + +#define LEGITIMIZE_RELOAD_ADDRESS(X, MODE, OPNUM, TYPE, INDL, WIN) \ +do { \ + if (ix86_legitimize_reload_address ((X), (MODE), (OPNUM), \ + (int)(TYPE), (INDL))) \ + goto WIN; \ +} while (0) + +/* If defined, a C expression to determine the base term of address X. + This macro is used in only one place: `find_base_term' in alias.c. + + It is always safe for this macro to not be defined. It exists so + that alias analysis can understand machine-dependent addresses. + + The typical use of this macro is to handle addresses containing + a label_ref or symbol_ref within an UNSPEC. */ + +#define FIND_BASE_TERM(X) ix86_find_base_term (X) + +/* Nonzero if the constant value X is a legitimate general operand + when generating PIC code. It is given that flag_pic is on and + that X satisfies CONSTANT_P or is a CONST_DOUBLE. */ + +#define LEGITIMATE_PIC_OPERAND_P(X) legitimate_pic_operand_p (X) + +#define SYMBOLIC_CONST(X) \ + (GET_CODE (X) == SYMBOL_REF \ + || GET_CODE (X) == LABEL_REF \ + || (GET_CODE (X) == CONST && symbolic_reference_mentioned_p (X))) + +/* Max number of args passed in registers. If this is more than 3, we will + have problems with ebx (register #4), since it is a caller save register and + is also used as the pic register in ELF. So for now, don't allow more than + 3 registers to be passed in registers. 
*/ + +/* Abi specific values for REGPARM_MAX and SSE_REGPARM_MAX */ +#define X86_64_REGPARM_MAX 6 +#define X86_64_MS_REGPARM_MAX 4 + +#define X86_32_REGPARM_MAX 3 + +#define REGPARM_MAX \ + (TARGET_64BIT \ + ? (TARGET_64BIT_MS_ABI \ + ? X86_64_MS_REGPARM_MAX \ + : X86_64_REGPARM_MAX) \ + : X86_32_REGPARM_MAX) + +#define X86_64_SSE_REGPARM_MAX 8 +#define X86_64_MS_SSE_REGPARM_MAX 4 + +#define X86_32_SSE_REGPARM_MAX (TARGET_SSE ? (TARGET_MACHO ? 4 : 3) : 0) + +#define SSE_REGPARM_MAX \ + (TARGET_64BIT \ + ? (TARGET_64BIT_MS_ABI \ + ? X86_64_MS_SSE_REGPARM_MAX \ + : X86_64_SSE_REGPARM_MAX) \ + : X86_32_SSE_REGPARM_MAX) + +#define MMX_REGPARM_MAX (TARGET_64BIT ? 0 : (TARGET_MMX ? 3 : 0)) + +/* Specify the machine mode that this machine uses + for the index in the tablejump instruction. */ +#define CASE_VECTOR_MODE \ + (!TARGET_64BIT || (flag_pic && ix86_cmodel != CM_LARGE_PIC) ? SImode : DImode) + +/* Define this as 1 if `char' should by default be signed; else as 0. */ +#define DEFAULT_SIGNED_CHAR 1 + +/* Max number of bytes we can move from memory to memory + in one reasonably fast instruction. */ +#define MOVE_MAX 16 + +/* MOVE_MAX_PIECES is the number of bytes at a time which we can + move efficiently, as opposed to MOVE_MAX which is the maximum + number of bytes we can move with a single instruction. */ +#define MOVE_MAX_PIECES UNITS_PER_WORD + +/* If a memory-to-memory move would take MOVE_RATIO or more simple + move-instruction pairs, we will do a movmem or libcall instead. + Increasing the value will always make code faster, but eventually + incurs high cost in increased code size. + + If you don't define this, a reasonable default is used. */ + +#define MOVE_RATIO(speed) ((speed) ? ix86_cost->move_ratio : 3) + +/* If a clear memory operation would take CLEAR_RATIO or more simple + move-instruction sequences, we will do a clrmem or libcall instead. */ + +#define CLEAR_RATIO(speed) ((speed) ? MIN (6, ix86_cost->move_ratio) : 2) + +/* Define if shifts truncate the shift count which implies one can + omit a sign-extension or zero-extension of a shift count. + + On i386, shifts do truncate the count. But bit test instructions + take the modulo of the bit offset operand. */ + +/* #define SHIFT_COUNT_TRUNCATED */ + +/* Value is 1 if truncating an integer of INPREC bits to OUTPREC bits + is done just by pretending it is already truncated. */ +#define TRULY_NOOP_TRUNCATION(OUTPREC, INPREC) 1 + +/* A macro to update M and UNSIGNEDP when an object whose type is + TYPE and which has the specified mode and signedness is to be + stored in a register. This macro is only called when TYPE is a + scalar type. + + On i386 it is sometimes useful to promote HImode and QImode + quantities to SImode. The choice depends on target type. */ + +#define PROMOTE_MODE(MODE, UNSIGNEDP, TYPE) \ +do { \ + if (((MODE) == HImode && TARGET_PROMOTE_HI_REGS) \ + || ((MODE) == QImode && TARGET_PROMOTE_QI_REGS)) \ + (MODE) = SImode; \ +} while (0) + +/* Specify the machine mode that pointers have. + After generation of rtl, the compiler makes no further distinction + between pointers and any other objects of this machine mode. */ +#define Pmode (TARGET_64BIT ? DImode : SImode) + +/* A function address in a call instruction + is a byte address (for indexing purposes) + so give the MEM rtx a byte's mode. */ +#define FUNCTION_MODE QImode + + +/* A C expression for the cost of a branch instruction. A value of 1 + is the default; other values are interpreted relative to that. 
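/* The REGPARM_MAX / SSE_REGPARM_MAX selections above reduce to a
   three-way choice.  A compact sketch of the resulting limits (the
   Darwin-specific 32-bit SSE count is ignored here for brevity):  */
struct regparm_limits { int gpr; int sse; };

static struct regparm_limits
regparm_limits (int target_64bit, int ms_abi, int have_sse)
{
  struct regparm_limits l;
  if (target_64bit && ms_abi)
    { l.gpr = 4; l.sse = 4; }                /* X86_64_MS_*_REGPARM_MAX */
  else if (target_64bit)
    { l.gpr = 6; l.sse = 8; }                /* X86_64_*_REGPARM_MAX */
  else
    { l.gpr = 3; l.sse = have_sse ? 3 : 0; } /* 32-bit defaults */
  return l;
}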
*/ + +#define BRANCH_COST(speed_p, predictable_p) \ + (!(speed_p) ? 2 : (predictable_p) ? 0 : ix86_branch_cost) + +/* Define this macro as a C expression which is nonzero if accessing + less than a word of memory (i.e. a `char' or a `short') is no + faster than accessing a word of memory, i.e., if such access + require more than one instruction or if there is no difference in + cost between byte and (aligned) word loads. + + When this macro is not defined, the compiler will access a field by + finding the smallest containing object; when it is defined, a + fullword load will be used if alignment permits. Unless bytes + accesses are faster than word accesses, using word accesses is + preferable since it may eliminate subsequent memory access if + subsequent accesses occur to other fields in the same word of the + structure, but to different bytes. */ + +#define SLOW_BYTE_ACCESS 0 + +/* Nonzero if access to memory by shorts is slow and undesirable. */ +#define SLOW_SHORT_ACCESS 0 + +/* Define this macro to be the value 1 if unaligned accesses have a + cost many times greater than aligned accesses, for example if they + are emulated in a trap handler. + + When this macro is nonzero, the compiler will act as if + `STRICT_ALIGNMENT' were nonzero when generating code for block + moves. This can cause significantly more instructions to be + produced. Therefore, do not set this macro nonzero if unaligned + accesses only add a cycle or two to the time for a memory access. + + If the value of this macro is always zero, it need not be defined. */ + +/* #define SLOW_UNALIGNED_ACCESS(MODE, ALIGN) 0 */ + +/* Define this macro if it is as good or better to call a constant + function address than to call an address kept in a register. + + Desirable on the 386 because a CALL with a constant address is + faster than one with a register address. */ + +#define NO_FUNCTION_CSE + +/* Given a comparison code (EQ, NE, etc.) and the first operand of a COMPARE, + return the mode to be used for the comparison. + + For floating-point equality comparisons, CCFPEQmode should be used. + VOIDmode should be used in all other cases. + + For integer comparisons against zero, reduce to CCNOmode or CCZmode if + possible, to allow for more combinations. */ + +#define SELECT_CC_MODE(OP, X, Y) ix86_cc_mode ((OP), (X), (Y)) + +/* Return nonzero if MODE implies a floating point inequality can be + reversed. */ + +#define REVERSIBLE_CC_MODE(MODE) 1 + +/* A C expression whose value is reversed condition code of the CODE for + comparison done in CC_MODE mode. */ +#define REVERSE_CONDITION(CODE, MODE) ix86_reverse_condition ((CODE), (MODE)) + + +/* Control the assembler format that we output, to the extent + this does not vary between assemblers. */ + +/* How to refer to registers in assembler output. + This sequence is indexed by compiler's hard-register-number (see above). */ + +/* In order to refer to the first 8 regs as 32-bit regs, prefix an "e". + For non floating point regs, the following are the HImode names. + + For float regs, the stack top is sometimes referred to as "%st(0)" + instead of just "%st". TARGET_PRINT_OPERAND handles this with the + "y" code. 
*/ + +#define HI_REGISTER_NAMES \ +{"ax","dx","cx","bx","si","di","bp","sp", \ + "st","st(1)","st(2)","st(3)","st(4)","st(5)","st(6)","st(7)", \ + "argp", "flags", "fpsr", "fpcr", "frame", \ + "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7", \ + "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7", \ + "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", \ + "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15"} + +#define REGISTER_NAMES HI_REGISTER_NAMES + +/* Table of additional register names to use in user input. */ + +#define ADDITIONAL_REGISTER_NAMES \ +{ { "eax", 0 }, { "edx", 1 }, { "ecx", 2 }, { "ebx", 3 }, \ + { "esi", 4 }, { "edi", 5 }, { "ebp", 6 }, { "esp", 7 }, \ + { "rax", 0 }, { "rdx", 1 }, { "rcx", 2 }, { "rbx", 3 }, \ + { "rsi", 4 }, { "rdi", 5 }, { "rbp", 6 }, { "rsp", 7 }, \ + { "al", 0 }, { "dl", 1 }, { "cl", 2 }, { "bl", 3 }, \ + { "ah", 0 }, { "dh", 1 }, { "ch", 2 }, { "bh", 3 } } + +/* Note we are omitting these since currently I don't know how +to get gcc to use these, since they want the same but different +number as al, and ax. +*/ + +#define QI_REGISTER_NAMES \ +{"al", "dl", "cl", "bl", "sil", "dil", "bpl", "spl",} + +/* These parallel the array above, and can be used to access bits 8:15 + of regs 0 through 3. */ + +#define QI_HIGH_REGISTER_NAMES \ +{"ah", "dh", "ch", "bh", } + +/* How to renumber registers for dbx and gdb. */ + +#define DBX_REGISTER_NUMBER(N) \ + (TARGET_64BIT ? dbx64_register_map[(N)] : dbx_register_map[(N)]) + +extern int const dbx_register_map[FIRST_PSEUDO_REGISTER]; +extern int const dbx64_register_map[FIRST_PSEUDO_REGISTER]; +extern int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER]; + +/* Before the prologue, RA is at 0(%esp). */ +#define INCOMING_RETURN_ADDR_RTX \ + gen_rtx_MEM (VOIDmode, gen_rtx_REG (VOIDmode, STACK_POINTER_REGNUM)) + +/* After the prologue, RA is at -4(AP) in the current frame. */ +#define RETURN_ADDR_RTX(COUNT, FRAME) \ + ((COUNT) == 0 \ + ? gen_rtx_MEM (Pmode, plus_constant (arg_pointer_rtx, -UNITS_PER_WORD)) \ + : gen_rtx_MEM (Pmode, plus_constant (FRAME, UNITS_PER_WORD))) + +/* PC is dbx register 8; let's use that column for RA. */ +#define DWARF_FRAME_RETURN_COLUMN (TARGET_64BIT ? 16 : 8) + +/* Before the prologue, the top of the frame is at 4(%esp). */ +#define INCOMING_FRAME_SP_OFFSET UNITS_PER_WORD + +/* Describe how we implement __builtin_eh_return. */ +#define EH_RETURN_DATA_REGNO(N) ((N) <= DX_REG ? (N) : INVALID_REGNUM) +#define EH_RETURN_STACKADJ_RTX gen_rtx_REG (Pmode, CX_REG) + + +/* Select a format to encode pointers in exception handling data. CODE + is 0 for data, 1 for code labels, 2 for function pointers. GLOBAL is + true if the symbol may be affected by dynamic relocations. + + ??? All x86 object file formats are capable of representing this. + After all, the relocation needed is the same as for the call insn. + Whether or not a particular assembler allows us to enter such, I + guess we'll have to see. */ +#define ASM_PREFERRED_EH_DATA_FORMAT(CODE, GLOBAL) \ + asm_preferred_eh_data_format ((CODE), (GLOBAL)) + +/* This is how to output an insn to push a register on the stack. + It need not be very fast code. */ + +#define ASM_OUTPUT_REG_PUSH(FILE, REGNO) \ +do { \ + if (TARGET_64BIT) \ + asm_fprintf ((FILE), "\tpush{q}\t%%r%s\n", \ + reg_names[(REGNO)] + (REX_INT_REGNO_P (REGNO) != 0)); \ + else \ + asm_fprintf ((FILE), "\tpush{l}\t%%e%s\n", reg_names[(REGNO)]); \ +} while (0) + +/* This is how to output an insn to pop a register from the stack. 
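/* The "+ (REX_INT_REGNO_P (REGNO) != 0)" in ASM_OUTPUT_REG_PUSH above
   skips the leading 'r' of the REX register names, because the format
   string already supplies "%r".  A tiny sketch of the effect:  */
#include <stdio.h>

int
main (void)
{
  const char *ax = "ax";  /* legacy name from HI_REGISTER_NAMES */
  const char *r8 = "r8";  /* REX name from HI_REGISTER_NAMES */
  printf ("\tpush{q}\t%%r%s\n", ax);      /* push{q} %rax */
  printf ("\tpush{q}\t%%r%s\n", r8 + 1);  /* push{q} %r8, not %rr8 */
  return 0;
}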
+ It need not be very fast code. */ + +#define ASM_OUTPUT_REG_POP(FILE, REGNO) \ +do { \ + if (TARGET_64BIT) \ + asm_fprintf ((FILE), "\tpop{q}\t%%r%s\n", \ + reg_names[(REGNO)] + (REX_INT_REGNO_P (REGNO) != 0)); \ + else \ + asm_fprintf ((FILE), "\tpop{l}\t%%e%s\n", reg_names[(REGNO)]); \ +} while (0) + +/* This is how to output an element of a case-vector that is absolute. */ + +#define ASM_OUTPUT_ADDR_VEC_ELT(FILE, VALUE) \ + ix86_output_addr_vec_elt ((FILE), (VALUE)) + +/* This is how to output an element of a case-vector that is relative. */ + +#define ASM_OUTPUT_ADDR_DIFF_ELT(FILE, BODY, VALUE, REL) \ + ix86_output_addr_diff_elt ((FILE), (VALUE), (REL)) + +/* When we see %v, we will print the 'v' prefix if TARGET_AVX is true. */ + +#define ASM_OUTPUT_AVX_PREFIX(STREAM, PTR) \ +{ \ + if ((PTR)[0] == '%' && (PTR)[1] == 'v') \ + (PTR) += TARGET_AVX ? 1 : 2; \ +} + +/* A C statement or statements which output an assembler instruction + opcode to the stdio stream STREAM. The macro-operand PTR is a + variable of type `char *' which points to the opcode name in + its "internal" form--the form that is written in the machine + description. */ + +#define ASM_OUTPUT_OPCODE(STREAM, PTR) \ + ASM_OUTPUT_AVX_PREFIX ((STREAM), (PTR)) + +/* A C statement to output to the stdio stream FILE an assembler + command to pad the location counter to a multiple of 1<machine->stack_locals) +#define ix86_varargs_gpr_size (cfun->machine->varargs_gpr_size) +#define ix86_varargs_fpr_size (cfun->machine->varargs_fpr_size) +#define ix86_optimize_mode_switching (cfun->machine->optimize_mode_switching) +#define ix86_current_function_needs_cld (cfun->machine->needs_cld) +#define ix86_tls_descriptor_calls_expanded_in_cfun \ + (cfun->machine->tls_descriptor_call_expanded_p) +/* Since tls_descriptor_call_expanded is not cleared, even if all TLS + calls are optimized away, we try to detect cases in which it was + optimized away. Since such instructions (use (reg REG_SP)), we can + verify whether there's any such instruction live by testing that + REG_SP is live. */ +#define ix86_current_function_calls_tls_descriptor \ + (ix86_tls_descriptor_calls_expanded_in_cfun && df_regs_ever_live_p (SP_REG)) +#define ix86_static_chain_on_stack (cfun->machine->static_chain_on_stack) + +/* Control behavior of x86_file_start. */ +#define X86_FILE_START_VERSION_DIRECTIVE false +#define X86_FILE_START_FLTUSED false + +/* Flag to mark data that is in the large address area. */ +#define SYMBOL_FLAG_FAR_ADDR (SYMBOL_FLAG_MACH_DEP << 0) +#define SYMBOL_REF_FAR_ADDR_P(X) \ + ((SYMBOL_REF_FLAGS (X) & SYMBOL_FLAG_FAR_ADDR) != 0) + +/* Flags to mark dllimport/dllexport. Used by PE ports, but handy to + have defined always, to avoid ifdefing. */ +#define SYMBOL_FLAG_DLLIMPORT (SYMBOL_FLAG_MACH_DEP << 1) +#define SYMBOL_REF_DLLIMPORT_P(X) \ + ((SYMBOL_REF_FLAGS (X) & SYMBOL_FLAG_DLLIMPORT) != 0) + +#define SYMBOL_FLAG_DLLEXPORT (SYMBOL_FLAG_MACH_DEP << 2) +#define SYMBOL_REF_DLLEXPORT_P(X) \ + ((SYMBOL_REF_FLAGS (X) & SYMBOL_FLAG_DLLEXPORT) != 0) + +extern void debug_ready_dispatch (void); +extern void debug_dispatch_window (int); + +/* The value at zero is only defined for the BMI instructions + LZCNT and TZCNT, not the BSR/BSF insns in the original isa. 
*/ +#define CTZ_DEFINED_VALUE_AT_ZERO(MODE, VALUE) \ + ((VALUE) = GET_MODE_BITSIZE (MODE), TARGET_BMI) +#define CLZ_DEFINED_VALUE_AT_ZERO(MODE, VALUE) \ + ((VALUE) = GET_MODE_BITSIZE (MODE), TARGET_BMI) + + +/* +Local variables: +version-control: t +End: +*/ diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md new file mode 100644 index 000000000..3a27ca46c --- /dev/null +++ b/gcc/config/i386/i386.md @@ -0,0 +1,18347 @@ +;; GCC machine description for IA-32 and x86-64. +;; Copyright (C) 1988, 1994, 1995, 1996, 1997, 1998, 1999, 2000, +;; 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 +;; Free Software Foundation, Inc. +;; Mostly by William Schelter. +;; x86_64 support added by Jan Hubicka +;; +;; This file is part of GCC. +;; +;; GCC is free software; you can redistribute it and/or modify +;; it under the terms of the GNU General Public License as published by +;; the Free Software Foundation; either version 3, or (at your option) +;; any later version. +;; +;; GCC is distributed in the hope that it will be useful, +;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;; GNU General Public License for more details. +;; +;; You should have received a copy of the GNU General Public License +;; along with GCC; see the file COPYING3. If not see +;; . */ +;; +;; The original PO technology requires these to be ordered by speed, +;; so that assigner will pick the fastest. +;; +;; See file "rtl.def" for documentation on define_insn, match_*, et. al. +;; +;; The special asm out single letter directives following a '%' are: +;; L,W,B,Q,S,T -- print the opcode suffix for specified size of operand. +;; C -- print opcode suffix for set/cmov insn. +;; c -- like C, but print reversed condition +;; F,f -- likewise, but for floating-point. +;; O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.", +;; otherwise nothing +;; R -- print the prefix for register names. +;; z -- print the opcode suffix for the size of the current operand. +;; Z -- likewise, with special suffixes for x87 instructions. +;; * -- print a star (in certain assembler syntax) +;; A -- print an absolute memory reference. +;; w -- print the operand as if it's a "word" (HImode) even if it isn't. +;; s -- print a shift double count, followed by the assemblers argument +;; delimiter. +;; b -- print the QImode name of the register for the indicated operand. +;; %b0 would print %al if operands[0] is reg 0. +;; w -- likewise, print the HImode name of the register. +;; k -- likewise, print the SImode name of the register. +;; q -- likewise, print the DImode name of the register. +;; x -- likewise, print the V4SFmode name of the register. +;; t -- likewise, print the V8SFmode name of the register. +;; h -- print the QImode name for a "high" register, either ah, bh, ch or dh. +;; y -- print "st(0)" instead of "st" as a register. +;; d -- print duplicated register operand for AVX instruction. +;; D -- print condition for SSE cmp instruction. +;; P -- if PIC, print an @PLT suffix. +;; X -- don't print any sort of PIC '@' suffix for a symbol. +;; & -- print some in-use local-dynamic symbol name. +;; H -- print a memory address offset by 8; used for sse high-parts +;; Y -- print condition for XOP pcom* instruction. +;; + -- print a branch hint as 'cs' or 'ds' prefix +;; ; -- print a semicolon (after prefixes due to bug in older gas). 
+;; @ -- print a segment register of thread base pointer load + +;; UNSPEC usage: + +(define_c_enum "unspec" [ + ;; Relocation specifiers + UNSPEC_GOT + UNSPEC_GOTOFF + UNSPEC_GOTPCREL + UNSPEC_GOTTPOFF + UNSPEC_TPOFF + UNSPEC_NTPOFF + UNSPEC_DTPOFF + UNSPEC_GOTNTPOFF + UNSPEC_INDNTPOFF + UNSPEC_PLTOFF + UNSPEC_MACHOPIC_OFFSET + UNSPEC_PCREL + + ;; Prologue support + UNSPEC_STACK_ALLOC + UNSPEC_SET_GOT + UNSPEC_REG_SAVE + UNSPEC_DEF_CFA + UNSPEC_SET_RIP + UNSPEC_SET_GOT_OFFSET + UNSPEC_MEMORY_BLOCKAGE + UNSPEC_STACK_CHECK + + ;; TLS support + UNSPEC_TP + UNSPEC_TLS_GD + UNSPEC_TLS_LD_BASE + UNSPEC_TLSDESC + UNSPEC_TLS_IE_SUN + + ;; Other random patterns + UNSPEC_SCAS + UNSPEC_FNSTSW + UNSPEC_SAHF + UNSPEC_PARITY + UNSPEC_FSTCW + UNSPEC_ADD_CARRY + UNSPEC_FLDCW + UNSPEC_REP + UNSPEC_LD_MPIC ; load_macho_picbase + UNSPEC_TRUNC_NOOP + UNSPEC_DIV_ALREADY_SPLIT + UNSPEC_CALL_NEEDS_VZEROUPPER + + ;; For SSE/MMX support: + UNSPEC_FIX_NOTRUNC + UNSPEC_MASKMOV + UNSPEC_MOVMSK + UNSPEC_MOVNT + UNSPEC_MOVU + UNSPEC_RCP + UNSPEC_RSQRT + UNSPEC_SFENCE + UNSPEC_PFRCP + UNSPEC_PFRCPIT1 + UNSPEC_PFRCPIT2 + UNSPEC_PFRSQRT + UNSPEC_PFRSQIT1 + UNSPEC_MFENCE + UNSPEC_LFENCE + UNSPEC_PSADBW + UNSPEC_LDDQU + UNSPEC_MS_TO_SYSV_CALL + + ;; Generic math support + UNSPEC_COPYSIGN + UNSPEC_IEEE_MIN ; not commutative + UNSPEC_IEEE_MAX ; not commutative + + ;; x87 Floating point + UNSPEC_SIN + UNSPEC_COS + UNSPEC_FPATAN + UNSPEC_FYL2X + UNSPEC_FYL2XP1 + UNSPEC_FRNDINT + UNSPEC_FIST + UNSPEC_F2XM1 + UNSPEC_TAN + UNSPEC_FXAM + + ;; x87 Rounding + UNSPEC_FRNDINT_FLOOR + UNSPEC_FRNDINT_CEIL + UNSPEC_FRNDINT_TRUNC + UNSPEC_FRNDINT_MASK_PM + UNSPEC_FIST_FLOOR + UNSPEC_FIST_CEIL + + ;; x87 Double output FP + UNSPEC_SINCOS_COS + UNSPEC_SINCOS_SIN + UNSPEC_XTRACT_FRACT + UNSPEC_XTRACT_EXP + UNSPEC_FSCALE_FRACT + UNSPEC_FSCALE_EXP + UNSPEC_FPREM_F + UNSPEC_FPREM_U + UNSPEC_FPREM1_F + UNSPEC_FPREM1_U + + UNSPEC_C2_FLAG + UNSPEC_FXAM_MEM + + ;; SSP patterns + UNSPEC_SP_SET + UNSPEC_SP_TEST + UNSPEC_SP_TLS_SET + UNSPEC_SP_TLS_TEST + + ;; SSSE3 + UNSPEC_PSHUFB + UNSPEC_PSIGN + UNSPEC_PALIGNR + + ;; For SSE4A support + UNSPEC_EXTRQI + UNSPEC_EXTRQ + UNSPEC_INSERTQI + UNSPEC_INSERTQ + + ;; For SSE4.1 support + UNSPEC_BLENDV + UNSPEC_INSERTPS + UNSPEC_DP + UNSPEC_MOVNTDQA + UNSPEC_MPSADBW + UNSPEC_PHMINPOSUW + UNSPEC_PTEST + UNSPEC_ROUND + + ;; For SSE4.2 support + UNSPEC_CRC32 + UNSPEC_PCMPESTR + UNSPEC_PCMPISTR + + ;; For FMA4 support + UNSPEC_FMADDSUB + UNSPEC_XOP_UNSIGNED_CMP + UNSPEC_XOP_TRUEFALSE + UNSPEC_XOP_PERMUTE + UNSPEC_FRCZ + + ;; For AES support + UNSPEC_AESENC + UNSPEC_AESENCLAST + UNSPEC_AESDEC + UNSPEC_AESDECLAST + UNSPEC_AESIMC + UNSPEC_AESKEYGENASSIST + + ;; For PCLMUL support + UNSPEC_PCLMUL + + ;; For AVX support + UNSPEC_PCMP + UNSPEC_VPERMIL + UNSPEC_VPERMIL2 + UNSPEC_VPERMIL2F128 + UNSPEC_MASKLOAD + UNSPEC_MASKSTORE + UNSPEC_CAST + UNSPEC_VTESTP + UNSPEC_VCVTPH2PS + UNSPEC_VCVTPS2PH + + ;; For BMI support + UNSPEC_BEXTR +]) + +(define_c_enum "unspecv" [ + UNSPECV_BLOCKAGE + UNSPECV_STACK_PROBE + UNSPECV_PROBE_STACK_RANGE + UNSPECV_EMMS + UNSPECV_LDMXCSR + UNSPECV_STMXCSR + UNSPECV_FEMMS + UNSPECV_CLFLUSH + UNSPECV_ALIGN + UNSPECV_MONITOR + UNSPECV_MWAIT + UNSPECV_CMPXCHG + UNSPECV_XCHG + UNSPECV_LOCK + UNSPECV_PROLOGUE_USE + UNSPECV_CLD + UNSPECV_NOPS + UNSPECV_VZEROALL + UNSPECV_VZEROUPPER + UNSPECV_RDTSC + UNSPECV_RDTSCP + UNSPECV_RDPMC + UNSPECV_LLWP_INTRINSIC + UNSPECV_SLWP_INTRINSIC + UNSPECV_LWPVAL_INTRINSIC + UNSPECV_LWPINS_INTRINSIC + UNSPECV_RDFSBASE + UNSPECV_RDGSBASE + UNSPECV_WRFSBASE + 
UNSPECV_WRGSBASE + UNSPECV_SPLIT_STACK_RETURN + + ;; For RDRAND support + UNSPECV_RDRAND +]) + +;; Constants to represent pcomtrue/pcomfalse variants +(define_constants + [(PCOM_FALSE 0) + (PCOM_TRUE 1) + (COM_FALSE_S 2) + (COM_FALSE_P 3) + (COM_TRUE_S 4) + (COM_TRUE_P 5) + ]) + +;; Constants used in the XOP pperm instruction +(define_constants + [(PPERM_SRC 0x00) /* copy source */ + (PPERM_INVERT 0x20) /* invert source */ + (PPERM_REVERSE 0x40) /* bit reverse source */ + (PPERM_REV_INV 0x60) /* bit reverse & invert src */ + (PPERM_ZERO 0x80) /* all 0's */ + (PPERM_ONES 0xa0) /* all 1's */ + (PPERM_SIGN 0xc0) /* propagate sign bit */ + (PPERM_INV_SIGN 0xe0) /* invert & propagate sign */ + (PPERM_SRC1 0x00) /* use first source byte */ + (PPERM_SRC2 0x10) /* use second source byte */ + ]) + +;; Registers by name. +(define_constants + [(AX_REG 0) + (DX_REG 1) + (CX_REG 2) + (BX_REG 3) + (SI_REG 4) + (DI_REG 5) + (BP_REG 6) + (SP_REG 7) + (ST0_REG 8) + (ST1_REG 9) + (ST2_REG 10) + (ST3_REG 11) + (ST4_REG 12) + (ST5_REG 13) + (ST6_REG 14) + (ST7_REG 15) + (FLAGS_REG 17) + (FPSR_REG 18) + (FPCR_REG 19) + (XMM0_REG 21) + (XMM1_REG 22) + (XMM2_REG 23) + (XMM3_REG 24) + (XMM4_REG 25) + (XMM5_REG 26) + (XMM6_REG 27) + (XMM7_REG 28) + (MM0_REG 29) + (MM1_REG 30) + (MM2_REG 31) + (MM3_REG 32) + (MM4_REG 33) + (MM5_REG 34) + (MM6_REG 35) + (MM7_REG 36) + (R8_REG 37) + (R9_REG 38) + (R10_REG 39) + (R11_REG 40) + (R12_REG 41) + (R13_REG 42) + (XMM8_REG 45) + (XMM9_REG 46) + (XMM10_REG 47) + (XMM11_REG 48) + (XMM12_REG 49) + (XMM13_REG 50) + (XMM14_REG 51) + (XMM15_REG 52) + ]) + +;; Insns whose names begin with "x86_" are emitted by gen_FOO calls +;; from i386.c. + +;; In C guard expressions, put expressions which may be compile-time +;; constants first. This allows for better optimization. For +;; example, write "TARGET_64BIT && reload_completed", not +;; "reload_completed && TARGET_64BIT". + + +;; Processor type. +(define_attr "cpu" "none,pentium,pentiumpro,geode,k6,athlon,k8,core2,corei7, + atom,generic64,amdfam10,bdver1,btver1" + (const (symbol_ref "ix86_schedule"))) + +;; A basic instruction type. Refinements due to arguments to be +;; provided in other attributes. +(define_attr "type" + "other,multi, + alu,alu1,negnot,imov,imovx,lea, + incdec,ishift,ishift1,rotate,rotate1,imul,idiv, + icmp,test,ibr,setcc,icmov, + push,pop,call,callv,leave, + str,bitmanip, + fmov,fop,fsgn,fmul,fdiv,fpspc,fcmov,fcmp,fxch,fistp,fisttp,frndint, + sselog,sselog1,sseiadd,sseiadd1,sseishft,sseishft1,sseimul, + sse,ssemov,sseadd,ssemul,ssecmp,ssecomi,ssecvt,ssecvt1,sseicvt,ssediv,sseins, + ssemuladd,sse4arg,lwp, + mmx,mmxmov,mmxadd,mmxmul,mmxcmp,mmxcvt,mmxshft" + (const_string "other")) + +;; Main data type used by the insn +(define_attr "mode" + "unknown,none,QI,HI,SI,DI,TI,OI,SF,DF,XF,TF,V8SF,V4DF,V4SF,V2DF,V2SF,V1DF" + (const_string "unknown")) + +;; The CPU unit operations uses. +(define_attr "unit" "integer,i387,sse,mmx,unknown" + (cond [(eq_attr "type" "fmov,fop,fsgn,fmul,fdiv,fpspc,fcmov,fcmp,fxch,fistp,fisttp,frndint") + (const_string "i387") + (eq_attr "type" "sselog,sselog1,sseiadd,sseiadd1,sseishft,sseishft1,sseimul, + sse,ssemov,sseadd,ssemul,ssecmp,ssecomi,ssecvt, + ssecvt1,sseicvt,ssediv,sseins,ssemuladd,sse4arg") + (const_string "sse") + (eq_attr "type" "mmx,mmxmov,mmxadd,mmxmul,mmxcmp,mmxcvt,mmxshft") + (const_string "mmx") + (eq_attr "type" "other") + (const_string "unknown")] + (const_string "integer"))) + +;; The (bounding maximum) length of an instruction immediate. 
+(define_attr "length_immediate" "" + (cond [(eq_attr "type" "incdec,setcc,icmov,str,lea,other,multi,idiv,leave, + bitmanip") + (const_int 0) + (eq_attr "unit" "i387,sse,mmx") + (const_int 0) + (eq_attr "type" "alu,alu1,negnot,imovx,ishift,rotate,ishift1,rotate1, + imul,icmp,push,pop") + (symbol_ref "ix86_attr_length_immediate_default(insn,1)") + (eq_attr "type" "imov,test") + (symbol_ref "ix86_attr_length_immediate_default(insn,0)") + (eq_attr "type" "call") + (if_then_else (match_operand 0 "constant_call_address_operand" "") + (const_int 4) + (const_int 0)) + (eq_attr "type" "callv") + (if_then_else (match_operand 1 "constant_call_address_operand" "") + (const_int 4) + (const_int 0)) + ;; We don't know the size before shorten_branches. Expect + ;; the instruction to fit for better scheduling. + (eq_attr "type" "ibr") + (const_int 1) + ] + (symbol_ref "/* Update immediate_length and other attributes! */ + gcc_unreachable (),1"))) + +;; The (bounding maximum) length of an instruction address. +(define_attr "length_address" "" + (cond [(eq_attr "type" "str,other,multi,fxch") + (const_int 0) + (and (eq_attr "type" "call") + (match_operand 0 "constant_call_address_operand" "")) + (const_int 0) + (and (eq_attr "type" "callv") + (match_operand 1 "constant_call_address_operand" "")) + (const_int 0) + ] + (symbol_ref "ix86_attr_length_address_default (insn)"))) + +;; Set when length prefix is used. +(define_attr "prefix_data16" "" + (cond [(eq_attr "type" "ssemuladd,sse4arg,sseiadd1,ssecvt1") + (const_int 0) + (eq_attr "mode" "HI") + (const_int 1) + (and (eq_attr "unit" "sse") (eq_attr "mode" "V2DF,TI")) + (const_int 1) + ] + (const_int 0))) + +;; Set when string REP prefix is used. +(define_attr "prefix_rep" "" + (cond [(eq_attr "type" "ssemuladd,sse4arg,sseiadd1,ssecvt1") + (const_int 0) + (and (eq_attr "unit" "sse") (eq_attr "mode" "SF,DF")) + (const_int 1) + ] + (const_int 0))) + +;; Set when 0f opcode prefix is used. +(define_attr "prefix_0f" "" + (if_then_else + (ior (eq_attr "type" "imovx,setcc,icmov,bitmanip") + (eq_attr "unit" "sse,mmx")) + (const_int 1) + (const_int 0))) + +;; Set when REX opcode prefix is used. +(define_attr "prefix_rex" "" + (cond [(eq (symbol_ref "TARGET_64BIT") (const_int 0)) + (const_int 0) + (and (eq_attr "mode" "DI") + (and (eq_attr "type" "!push,pop,call,callv,leave,ibr") + (eq_attr "unit" "!mmx"))) + (const_int 1) + (and (eq_attr "mode" "QI") + (ne (symbol_ref "x86_extended_QIreg_mentioned_p (insn)") + (const_int 0))) + (const_int 1) + (ne (symbol_ref "x86_extended_reg_mentioned_p (insn)") + (const_int 0)) + (const_int 1) + (and (eq_attr "type" "imovx") + (match_operand:QI 1 "ext_QIreg_operand" "")) + (const_int 1) + ] + (const_int 0))) + +;; There are also additional prefixes in 3DNOW, SSSE3. +;; ssemuladd,sse4arg default to 0f24/0f25 and DREX byte, +;; sseiadd1,ssecvt1 to 0f7a with no DREX byte. +;; 3DNOW has 0f0f prefix, SSSE3 and SSE4_{1,2} 0f38/0f3a. +(define_attr "prefix_extra" "" + (cond [(eq_attr "type" "ssemuladd,sse4arg") + (const_int 2) + (eq_attr "type" "sseiadd1,ssecvt1") + (const_int 1) + ] + (const_int 0))) + +;; Prefix used: original, VEX or maybe VEX. +(define_attr "prefix" "orig,vex,maybe_vex" + (if_then_else (eq_attr "mode" "OI,V8SF,V4DF") + (const_string "vex") + (const_string "orig"))) + +;; VEX W bit is used. +(define_attr "prefix_vex_w" "" (const_int 0)) + +;; The length of VEX prefix +;; Only instructions with 0f prefix can have 2 byte VEX prefix, +;; 0f38/0f3a prefixes can't. 
In i386.md 0f3[8a] is +;; still prefix_0f 1, with prefix_extra 1. +(define_attr "length_vex" "" + (if_then_else (and (eq_attr "prefix_0f" "1") + (eq_attr "prefix_extra" "0")) + (if_then_else (eq_attr "prefix_vex_w" "1") + (symbol_ref "ix86_attr_length_vex_default (insn, 1, 1)") + (symbol_ref "ix86_attr_length_vex_default (insn, 1, 0)")) + (if_then_else (eq_attr "prefix_vex_w" "1") + (symbol_ref "ix86_attr_length_vex_default (insn, 0, 1)") + (symbol_ref "ix86_attr_length_vex_default (insn, 0, 0)")))) + +;; Set when modrm byte is used. +(define_attr "modrm" "" + (cond [(eq_attr "type" "str,leave") + (const_int 0) + (eq_attr "unit" "i387") + (const_int 0) + (and (eq_attr "type" "incdec") + (and (eq (symbol_ref "TARGET_64BIT") (const_int 0)) + (ior (match_operand:SI 1 "register_operand" "") + (match_operand:HI 1 "register_operand" "")))) + (const_int 0) + (and (eq_attr "type" "push") + (not (match_operand 1 "memory_operand" ""))) + (const_int 0) + (and (eq_attr "type" "pop") + (not (match_operand 0 "memory_operand" ""))) + (const_int 0) + (and (eq_attr "type" "imov") + (and (not (eq_attr "mode" "DI")) + (ior (and (match_operand 0 "register_operand" "") + (match_operand 1 "immediate_operand" "")) + (ior (and (match_operand 0 "ax_reg_operand" "") + (match_operand 1 "memory_displacement_only_operand" "")) + (and (match_operand 0 "memory_displacement_only_operand" "") + (match_operand 1 "ax_reg_operand" "")))))) + (const_int 0) + (and (eq_attr "type" "call") + (match_operand 0 "constant_call_address_operand" "")) + (const_int 0) + (and (eq_attr "type" "callv") + (match_operand 1 "constant_call_address_operand" "")) + (const_int 0) + (and (eq_attr "type" "alu,alu1,icmp,test") + (match_operand 0 "ax_reg_operand" "")) + (symbol_ref "(get_attr_length_immediate (insn) <= (get_attr_mode (insn) != MODE_QI))") + ] + (const_int 1))) + +;; The (bounding maximum) length of an instruction in bytes. +;; ??? fistp and frndint are in fact fldcw/{fistp,frndint}/fldcw sequences. +;; Later we may want to split them and compute proper length as for +;; other insns. +(define_attr "length" "" + (cond [(eq_attr "type" "other,multi,fistp,frndint") + (const_int 16) + (eq_attr "type" "fcmp") + (const_int 4) + (eq_attr "unit" "i387") + (plus (const_int 2) + (plus (attr "prefix_data16") + (attr "length_address"))) + (ior (eq_attr "prefix" "vex") + (and (eq_attr "prefix" "maybe_vex") + (ne (symbol_ref "TARGET_AVX") (const_int 0)))) + (plus (attr "length_vex") + (plus (attr "length_immediate") + (plus (attr "modrm") + (attr "length_address"))))] + (plus (plus (attr "modrm") + (plus (attr "prefix_0f") + (plus (attr "prefix_rex") + (plus (attr "prefix_extra") + (const_int 1))))) + (plus (attr "prefix_rep") + (plus (attr "prefix_data16") + (plus (attr "length_immediate") + (attr "length_address"))))))) + +;; The `memory' attribute is `none' if no memory is referenced, `load' or +;; `store' if there is a simple memory reference therein, or `unknown' +;; if the instruction is complex. 
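+;; For example, in the definition below a "pop" into a register is
+;; classified as "load", a "push" of a register as "store", and a "push"
+;; whose source operand is itself in memory as "both".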
+ +(define_attr "memory" "none,load,store,both,unknown" + (cond [(eq_attr "type" "other,multi,str,lwp") + (const_string "unknown") + (eq_attr "type" "lea,fcmov,fpspc") + (const_string "none") + (eq_attr "type" "fistp,leave") + (const_string "both") + (eq_attr "type" "frndint") + (const_string "load") + (eq_attr "type" "push") + (if_then_else (match_operand 1 "memory_operand" "") + (const_string "both") + (const_string "store")) + (eq_attr "type" "pop") + (if_then_else (match_operand 0 "memory_operand" "") + (const_string "both") + (const_string "load")) + (eq_attr "type" "setcc") + (if_then_else (match_operand 0 "memory_operand" "") + (const_string "store") + (const_string "none")) + (eq_attr "type" "icmp,test,ssecmp,ssecomi,mmxcmp,fcmp") + (if_then_else (ior (match_operand 0 "memory_operand" "") + (match_operand 1 "memory_operand" "")) + (const_string "load") + (const_string "none")) + (eq_attr "type" "ibr") + (if_then_else (match_operand 0 "memory_operand" "") + (const_string "load") + (const_string "none")) + (eq_attr "type" "call") + (if_then_else (match_operand 0 "constant_call_address_operand" "") + (const_string "none") + (const_string "load")) + (eq_attr "type" "callv") + (if_then_else (match_operand 1 "constant_call_address_operand" "") + (const_string "none") + (const_string "load")) + (and (eq_attr "type" "alu1,negnot,ishift1,sselog1") + (match_operand 1 "memory_operand" "")) + (const_string "both") + (and (match_operand 0 "memory_operand" "") + (match_operand 1 "memory_operand" "")) + (const_string "both") + (match_operand 0 "memory_operand" "") + (const_string "store") + (match_operand 1 "memory_operand" "") + (const_string "load") + (and (eq_attr "type" + "!alu1,negnot,ishift1, + imov,imovx,icmp,test,bitmanip, + fmov,fcmp,fsgn, + sse,ssemov,ssecmp,ssecomi,ssecvt,ssecvt1,sseicvt,sselog1, + sseiadd1,mmx,mmxmov,mmxcmp,mmxcvt") + (match_operand 2 "memory_operand" "")) + (const_string "load") + (and (eq_attr "type" "icmov,ssemuladd,sse4arg") + (match_operand 3 "memory_operand" "")) + (const_string "load") + ] + (const_string "none"))) + +;; Indicates if an instruction has both an immediate and a displacement. + +(define_attr "imm_disp" "false,true,unknown" + (cond [(eq_attr "type" "other,multi") + (const_string "unknown") + (and (eq_attr "type" "icmp,test,imov,alu1,ishift1,rotate1") + (and (match_operand 0 "memory_displacement_operand" "") + (match_operand 1 "immediate_operand" ""))) + (const_string "true") + (and (eq_attr "type" "alu,ishift,rotate,imul,idiv") + (and (match_operand 0 "memory_displacement_operand" "") + (match_operand 2 "immediate_operand" ""))) + (const_string "true") + ] + (const_string "false"))) + +;; Indicates if an FP operation has an integer source. + +(define_attr "fp_int_src" "false,true" + (const_string "false")) + +;; Defines rounding mode of an FP operation. + +(define_attr "i387_cw" "trunc,floor,ceil,mask_pm,uninitialized,any" + (const_string "any")) + +;; Define attribute to classify add/sub insns that consumes carry flag (CF) +(define_attr "use_carry" "0,1" (const_string "0")) + +;; Define attribute to indicate unaligned ssemov insns +(define_attr "movu" "0,1" (const_string "0")) + +;; Describe a user's asm statement. 
+(define_asm_attributes + [(set_attr "length" "128") + (set_attr "type" "multi")]) + +(define_code_iterator plusminus [plus minus]) + +(define_code_iterator sat_plusminus [ss_plus us_plus ss_minus us_minus]) + +;; Base name for define_insn +(define_code_attr plusminus_insn + [(plus "add") (ss_plus "ssadd") (us_plus "usadd") + (minus "sub") (ss_minus "sssub") (us_minus "ussub")]) + +;; Base name for insn mnemonic. +(define_code_attr plusminus_mnemonic + [(plus "add") (ss_plus "adds") (us_plus "addus") + (minus "sub") (ss_minus "subs") (us_minus "subus")]) +(define_code_attr plusminus_carry_mnemonic + [(plus "adc") (minus "sbb")]) + +;; Mark commutative operators as such in constraints. +(define_code_attr comm [(plus "%") (ss_plus "%") (us_plus "%") + (minus "") (ss_minus "") (us_minus "")]) + +;; Mapping of signed max and min +(define_code_iterator smaxmin [smax smin]) + +;; Mapping of unsigned max and min +(define_code_iterator umaxmin [umax umin]) + +;; Base name for integer and FP insn mnemonic +(define_code_attr maxmin_int [(smax "maxs") (smin "mins") + (umax "maxu") (umin "minu")]) +(define_code_attr maxmin_float [(smax "max") (smin "min")]) + +;; Mapping of logic operators +(define_code_iterator any_logic [and ior xor]) +(define_code_iterator any_or [ior xor]) + +;; Base name for insn mnemonic. +(define_code_attr logic [(and "and") (ior "or") (xor "xor")]) + +;; Mapping of shift-right operators +(define_code_iterator any_shiftrt [lshiftrt ashiftrt]) + +;; Base name for define_insn +(define_code_attr shiftrt_insn [(lshiftrt "lshr") (ashiftrt "ashr")]) + +;; Base name for insn mnemonic. +(define_code_attr shiftrt [(lshiftrt "shr") (ashiftrt "sar")]) + +;; Mapping of rotate operators +(define_code_iterator any_rotate [rotate rotatert]) + +;; Base name for define_insn +(define_code_attr rotate_insn [(rotate "rotl") (rotatert "rotr")]) + +;; Base name for insn mnemonic. +(define_code_attr rotate [(rotate "rol") (rotatert "ror")]) + +;; Mapping of abs neg operators +(define_code_iterator absneg [abs neg]) + +;; Base name for x87 insn mnemonic. +(define_code_attr absneg_mnemonic [(abs "abs") (neg "chs")]) + +;; Used in signed and unsigned widening multiplications. +(define_code_iterator any_extend [sign_extend zero_extend]) + +;; Various insn prefixes for signed and unsigned operations. +(define_code_attr u [(sign_extend "") (zero_extend "u") + (div "") (udiv "u")]) +(define_code_attr s [(sign_extend "s") (zero_extend "u")]) + +;; Used in signed and unsigned divisions. +(define_code_iterator any_div [div udiv]) + +;; Instruction prefix for signed and unsigned operations. +(define_code_attr sgnprefix [(sign_extend "i") (zero_extend "") + (div "i") (udiv "")]) + +;; 64bit single word integer modes. +(define_mode_iterator SWI1248x [QI HI SI DI]) + +;; 64bit single word integer modes without QImode and HImode. +(define_mode_iterator SWI48x [SI DI]) + +;; Single word integer modes. +(define_mode_iterator SWI [QI HI SI (DI "TARGET_64BIT")]) + +;; Single word integer modes without SImode and DImode. +(define_mode_iterator SWI12 [QI HI]) + +;; Single word integer modes without DImode. +(define_mode_iterator SWI124 [QI HI SI]) + +;; Single word integer modes without QImode and DImode. +(define_mode_iterator SWI24 [HI SI]) + +;; Single word integer modes without QImode. +(define_mode_iterator SWI248 [HI SI (DI "TARGET_64BIT")]) + +;; Single word integer modes without QImode and HImode. +(define_mode_iterator SWI48 [SI (DI "TARGET_64BIT")]) + +;; All math-dependant single and double word integer modes. 
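+;; For example, SDWIM below yields QImode, HImode, SImode and DImode
+;; variants (TImode as well on 64-bit targets), with the QImode and HImode
+;; variants additionally conditional on TARGET_QIMODE_MATH and
+;; TARGET_HIMODE_MATH.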
+(define_mode_iterator SDWIM [(QI "TARGET_QIMODE_MATH") + (HI "TARGET_HIMODE_MATH") + SI DI (TI "TARGET_64BIT")]) + +;; Math-dependant single word integer modes. +(define_mode_iterator SWIM [(QI "TARGET_QIMODE_MATH") + (HI "TARGET_HIMODE_MATH") + SI (DI "TARGET_64BIT")]) + +;; Math-dependant single word integer modes without DImode. +(define_mode_iterator SWIM124 [(QI "TARGET_QIMODE_MATH") + (HI "TARGET_HIMODE_MATH") + SI]) + +;; Math-dependant single word integer modes without QImode. +(define_mode_iterator SWIM248 [(HI "TARGET_HIMODE_MATH") + SI (DI "TARGET_64BIT")]) + +;; Double word integer modes. +(define_mode_iterator DWI [(DI "!TARGET_64BIT") + (TI "TARGET_64BIT")]) + +;; Double word integer modes as mode attribute. +(define_mode_attr DWI [(SI "DI") (DI "TI")]) +(define_mode_attr dwi [(SI "di") (DI "ti")]) + +;; Half mode for double word integer modes. +(define_mode_iterator DWIH [(SI "!TARGET_64BIT") + (DI "TARGET_64BIT")]) + +;; Instruction suffix for integer modes. +(define_mode_attr imodesuffix [(QI "b") (HI "w") (SI "l") (DI "q")]) + +;; Pointer size prefix for integer modes (Intel asm dialect) +(define_mode_attr iptrsize [(QI "BYTE") + (HI "WORD") + (SI "DWORD") + (DI "QWORD")]) + +;; Register class for integer modes. +(define_mode_attr r [(QI "q") (HI "r") (SI "r") (DI "r")]) + +;; Immediate operand constraint for integer modes. +(define_mode_attr i [(QI "n") (HI "n") (SI "i") (DI "e")]) + +;; General operand constraint for word modes. +(define_mode_attr g [(QI "qmn") (HI "rmn") (SI "g") (DI "rme")]) + +;; Immediate operand constraint for double integer modes. +(define_mode_attr di [(SI "iF") (DI "e")]) + +;; Immediate operand constraint for shifts. +(define_mode_attr S [(QI "I") (HI "I") (SI "I") (DI "J") (TI "O")]) + +;; General operand predicate for integer modes. +(define_mode_attr general_operand + [(QI "general_operand") + (HI "general_operand") + (SI "general_operand") + (DI "x86_64_general_operand") + (TI "x86_64_general_operand")]) + +;; General sign/zero extend operand predicate for integer modes. +(define_mode_attr general_szext_operand + [(QI "general_operand") + (HI "general_operand") + (SI "general_operand") + (DI "x86_64_szext_general_operand")]) + +;; Immediate operand predicate for integer modes. +(define_mode_attr immediate_operand + [(QI "immediate_operand") + (HI "immediate_operand") + (SI "immediate_operand") + (DI "x86_64_immediate_operand")]) + +;; Nonmemory operand predicate for integer modes. +(define_mode_attr nonmemory_operand + [(QI "nonmemory_operand") + (HI "nonmemory_operand") + (SI "nonmemory_operand") + (DI "x86_64_nonmemory_operand")]) + +;; Operand predicate for shifts. +(define_mode_attr shift_operand + [(QI "nonimmediate_operand") + (HI "nonimmediate_operand") + (SI "nonimmediate_operand") + (DI "shiftdi_operand") + (TI "register_operand")]) + +;; Operand predicate for shift argument. +(define_mode_attr shift_immediate_operand + [(QI "const_1_to_31_operand") + (HI "const_1_to_31_operand") + (SI "const_1_to_31_operand") + (DI "const_1_to_63_operand")]) + +;; Input operand predicate for arithmetic left shifts. +(define_mode_attr ashl_input_operand + [(QI "nonimmediate_operand") + (HI "nonimmediate_operand") + (SI "nonimmediate_operand") + (DI "ashldi_input_operand") + (TI "reg_or_pm1_operand")]) + +;; SSE and x87 SFmode and DFmode floating point modes +(define_mode_iterator MODEF [SF DF]) + +;; All x87 floating point modes +(define_mode_iterator X87MODEF [SF DF XF]) + +;; All integer modes handled by x87 fisttp operator. 
+(define_mode_iterator X87MODEI [HI SI DI]) + +;; All integer modes handled by integer x87 operators. +(define_mode_iterator X87MODEI12 [HI SI]) + +;; All integer modes handled by SSE cvtts?2si* operators. +(define_mode_iterator SSEMODEI24 [SI DI]) + +;; SSE asm suffix for floating point modes +(define_mode_attr ssemodefsuffix [(SF "s") (DF "d")]) + +;; SSE vector mode corresponding to a scalar mode +(define_mode_attr ssevecmode + [(QI "V16QI") (HI "V8HI") (SI "V4SI") (DI "V2DI") (SF "V4SF") (DF "V2DF")]) + +;; Instruction suffix for REX 64bit operators. +(define_mode_attr rex64suffix [(SI "") (DI "{q}")]) + +;; This mode iterator allows :P to be used for patterns that operate on +;; pointer-sized quantities. Exactly one of the two alternatives will match. +(define_mode_iterator P [(SI "Pmode == SImode") (DI "Pmode == DImode")]) + +;; Scheduling descriptions + +(include "pentium.md") +(include "ppro.md") +(include "k6.md") +(include "athlon.md") +(include "bdver1.md") +(include "geode.md") +(include "atom.md") +(include "core2.md") + + +;; Operand and operator predicates and constraints + +(include "predicates.md") +(include "constraints.md") + + +;; Compare and branch/compare and store instructions. + +(define_expand "cbranch4" + [(set (reg:CC FLAGS_REG) + (compare:CC (match_operand:SDWIM 1 "nonimmediate_operand" "") + (match_operand:SDWIM 2 "" ""))) + (set (pc) (if_then_else + (match_operator 0 "ordered_comparison_operator" + [(reg:CC FLAGS_REG) (const_int 0)]) + (label_ref (match_operand 3 "" "")) + (pc)))] + "" +{ + if (MEM_P (operands[1]) && MEM_P (operands[2])) + operands[1] = force_reg (mode, operands[1]); + ix86_expand_branch (GET_CODE (operands[0]), + operands[1], operands[2], operands[3]); + DONE; +}) + +(define_expand "cstore4" + [(set (reg:CC FLAGS_REG) + (compare:CC (match_operand:SWIM 2 "nonimmediate_operand" "") + (match_operand:SWIM 3 "" ""))) + (set (match_operand:QI 0 "register_operand" "") + (match_operator 1 "ordered_comparison_operator" + [(reg:CC FLAGS_REG) (const_int 0)]))] + "" +{ + if (MEM_P (operands[2]) && MEM_P (operands[3])) + operands[2] = force_reg (mode, operands[2]); + ix86_expand_setcc (operands[0], GET_CODE (operands[1]), + operands[2], operands[3]); + DONE; +}) + +(define_expand "cmp_1" + [(set (reg:CC FLAGS_REG) + (compare:CC (match_operand:SWI48 0 "nonimmediate_operand" "") + (match_operand:SWI48 1 "" "")))]) + +(define_insn "*cmp_ccno_1" + [(set (reg FLAGS_REG) + (compare (match_operand:SWI 0 "nonimmediate_operand" ",?m") + (match_operand:SWI 1 "const0_operand" "")))] + "ix86_match_ccmode (insn, CCNOmode)" + "@ + test{}\t%0, %0 + cmp{}\t{%1, %0|%0, %1}" + [(set_attr "type" "test,icmp") + (set_attr "length_immediate" "0,1") + (set_attr "mode" "")]) + +(define_insn "*cmp_1" + [(set (reg FLAGS_REG) + (compare (match_operand:SWI 0 "nonimmediate_operand" "m,") + (match_operand:SWI 1 "" ",m")))] + "ix86_match_ccmode (insn, CCmode)" + "cmp{}\t{%1, %0|%0, %1}" + [(set_attr "type" "icmp") + (set_attr "mode" "")]) + +(define_insn "*cmp_minus_1" + [(set (reg FLAGS_REG) + (compare + (minus:SWI (match_operand:SWI 0 "nonimmediate_operand" "m,") + (match_operand:SWI 1 "" ",m")) + (const_int 0)))] + "ix86_match_ccmode (insn, CCGOCmode)" + "cmp{}\t{%1, %0|%0, %1}" + [(set_attr "type" "icmp") + (set_attr "mode" "")]) + +(define_insn "*cmpqi_ext_1" + [(set (reg FLAGS_REG) + (compare + (match_operand:QI 0 "general_operand" "Qm") + (subreg:QI + (zero_extract:SI + (match_operand 1 "ext_register_operand" "Q") + (const_int 8) + (const_int 8)) 0)))] + "!TARGET_64BIT && 
ix86_match_ccmode (insn, CCmode)" + "cmp{b}\t{%h1, %0|%0, %h1}" + [(set_attr "type" "icmp") + (set_attr "mode" "QI")]) + +(define_insn "*cmpqi_ext_1_rex64" + [(set (reg FLAGS_REG) + (compare + (match_operand:QI 0 "register_operand" "Q") + (subreg:QI + (zero_extract:SI + (match_operand 1 "ext_register_operand" "Q") + (const_int 8) + (const_int 8)) 0)))] + "TARGET_64BIT && ix86_match_ccmode (insn, CCmode)" + "cmp{b}\t{%h1, %0|%0, %h1}" + [(set_attr "type" "icmp") + (set_attr "mode" "QI")]) + +(define_insn "*cmpqi_ext_2" + [(set (reg FLAGS_REG) + (compare + (subreg:QI + (zero_extract:SI + (match_operand 0 "ext_register_operand" "Q") + (const_int 8) + (const_int 8)) 0) + (match_operand:QI 1 "const0_operand" "")))] + "ix86_match_ccmode (insn, CCNOmode)" + "test{b}\t%h0, %h0" + [(set_attr "type" "test") + (set_attr "length_immediate" "0") + (set_attr "mode" "QI")]) + +(define_expand "cmpqi_ext_3" + [(set (reg:CC FLAGS_REG) + (compare:CC + (subreg:QI + (zero_extract:SI + (match_operand 0 "ext_register_operand" "") + (const_int 8) + (const_int 8)) 0) + (match_operand:QI 1 "immediate_operand" "")))]) + +(define_insn "*cmpqi_ext_3_insn" + [(set (reg FLAGS_REG) + (compare + (subreg:QI + (zero_extract:SI + (match_operand 0 "ext_register_operand" "Q") + (const_int 8) + (const_int 8)) 0) + (match_operand:QI 1 "general_operand" "Qmn")))] + "!TARGET_64BIT && ix86_match_ccmode (insn, CCmode)" + "cmp{b}\t{%1, %h0|%h0, %1}" + [(set_attr "type" "icmp") + (set_attr "modrm" "1") + (set_attr "mode" "QI")]) + +(define_insn "*cmpqi_ext_3_insn_rex64" + [(set (reg FLAGS_REG) + (compare + (subreg:QI + (zero_extract:SI + (match_operand 0 "ext_register_operand" "Q") + (const_int 8) + (const_int 8)) 0) + (match_operand:QI 1 "nonmemory_operand" "Qn")))] + "TARGET_64BIT && ix86_match_ccmode (insn, CCmode)" + "cmp{b}\t{%1, %h0|%h0, %1}" + [(set_attr "type" "icmp") + (set_attr "modrm" "1") + (set_attr "mode" "QI")]) + +(define_insn "*cmpqi_ext_4" + [(set (reg FLAGS_REG) + (compare + (subreg:QI + (zero_extract:SI + (match_operand 0 "ext_register_operand" "Q") + (const_int 8) + (const_int 8)) 0) + (subreg:QI + (zero_extract:SI + (match_operand 1 "ext_register_operand" "Q") + (const_int 8) + (const_int 8)) 0)))] + "ix86_match_ccmode (insn, CCmode)" + "cmp{b}\t{%h1, %h0|%h0, %h1}" + [(set_attr "type" "icmp") + (set_attr "mode" "QI")]) + +;; These implement float point compares. +;; %%% See if we can get away with VOIDmode operands on the actual insns, +;; which would allow mix and match FP modes on the compares. Which is what +;; the old patterns did, but with many more of them. 
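+;; The XFmode expanders below require TARGET_80387, while the MODEF
+;; (SFmode/DFmode) expanders also allow SSE scalar math, i.e.
+;; SSE_FLOAT_MODE_P together with TARGET_SSE_MATH.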
+ +(define_expand "cbranchxf4" + [(set (reg:CC FLAGS_REG) + (compare:CC (match_operand:XF 1 "nonmemory_operand" "") + (match_operand:XF 2 "nonmemory_operand" ""))) + (set (pc) (if_then_else + (match_operator 0 "ix86_fp_comparison_operator" + [(reg:CC FLAGS_REG) + (const_int 0)]) + (label_ref (match_operand 3 "" "")) + (pc)))] + "TARGET_80387" +{ + ix86_expand_branch (GET_CODE (operands[0]), + operands[1], operands[2], operands[3]); + DONE; +}) + +(define_expand "cstorexf4" + [(set (reg:CC FLAGS_REG) + (compare:CC (match_operand:XF 2 "nonmemory_operand" "") + (match_operand:XF 3 "nonmemory_operand" ""))) + (set (match_operand:QI 0 "register_operand" "") + (match_operator 1 "ix86_fp_comparison_operator" + [(reg:CC FLAGS_REG) + (const_int 0)]))] + "TARGET_80387" +{ + ix86_expand_setcc (operands[0], GET_CODE (operands[1]), + operands[2], operands[3]); + DONE; +}) + +(define_expand "cbranch4" + [(set (reg:CC FLAGS_REG) + (compare:CC (match_operand:MODEF 1 "cmp_fp_expander_operand" "") + (match_operand:MODEF 2 "cmp_fp_expander_operand" ""))) + (set (pc) (if_then_else + (match_operator 0 "ix86_fp_comparison_operator" + [(reg:CC FLAGS_REG) + (const_int 0)]) + (label_ref (match_operand 3 "" "")) + (pc)))] + "TARGET_80387 || (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)" +{ + ix86_expand_branch (GET_CODE (operands[0]), + operands[1], operands[2], operands[3]); + DONE; +}) + +(define_expand "cstore4" + [(set (reg:CC FLAGS_REG) + (compare:CC (match_operand:MODEF 2 "cmp_fp_expander_operand" "") + (match_operand:MODEF 3 "cmp_fp_expander_operand" ""))) + (set (match_operand:QI 0 "register_operand" "") + (match_operator 1 "ix86_fp_comparison_operator" + [(reg:CC FLAGS_REG) + (const_int 0)]))] + "TARGET_80387 || (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)" +{ + ix86_expand_setcc (operands[0], GET_CODE (operands[1]), + operands[2], operands[3]); + DONE; +}) + +(define_expand "cbranchcc4" + [(set (pc) (if_then_else + (match_operator 0 "comparison_operator" + [(match_operand 1 "flags_reg_operand" "") + (match_operand 2 "const0_operand" "")]) + (label_ref (match_operand 3 "" "")) + (pc)))] + "" +{ + ix86_expand_branch (GET_CODE (operands[0]), + operands[1], operands[2], operands[3]); + DONE; +}) + +(define_expand "cstorecc4" + [(set (match_operand:QI 0 "register_operand" "") + (match_operator 1 "comparison_operator" + [(match_operand 2 "flags_reg_operand" "") + (match_operand 3 "const0_operand" "")]))] + "" +{ + ix86_expand_setcc (operands[0], GET_CODE (operands[1]), + operands[2], operands[3]); + DONE; +}) + + +;; FP compares, step 1: +;; Set the FP condition codes. +;; +;; CCFPmode compare with exceptions +;; CCFPUmode compare with no exceptions + +;; We may not use "#" to split and emit these, since the REG_DEAD notes +;; used to manage the reg stack popping would not be preserved. 
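+;; When fcomi is not available but SAHF is (the TARGET_SAHF && !TARGET_CMOVE
+;; conditions below), the sequence is roughly: an x87 compare, then
+;; "fnstsw %ax" (step 2) to copy the FP status word to %ax, then "sahf"
+;; (step 3) to load the flags from %ah.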
+ +(define_insn "*cmpfp_0" + [(set (match_operand:HI 0 "register_operand" "=a") + (unspec:HI + [(compare:CCFP + (match_operand 1 "register_operand" "f") + (match_operand 2 "const0_operand" ""))] + UNSPEC_FNSTSW))] + "X87_FLOAT_MODE_P (GET_MODE (operands[1])) + && GET_MODE (operands[1]) == GET_MODE (operands[2])" + "* return output_fp_compare (insn, operands, 0, 0);" + [(set_attr "type" "multi") + (set_attr "unit" "i387") + (set (attr "mode") + (cond [(match_operand:SF 1 "" "") + (const_string "SF") + (match_operand:DF 1 "" "") + (const_string "DF") + ] + (const_string "XF")))]) + +(define_insn_and_split "*cmpfp_0_cc" + [(set (reg:CCFP FLAGS_REG) + (compare:CCFP + (match_operand 1 "register_operand" "f") + (match_operand 2 "const0_operand" ""))) + (clobber (match_operand:HI 0 "register_operand" "=a"))] + "X87_FLOAT_MODE_P (GET_MODE (operands[1])) + && TARGET_SAHF && !TARGET_CMOVE + && GET_MODE (operands[1]) == GET_MODE (operands[2])" + "#" + "&& reload_completed" + [(set (match_dup 0) + (unspec:HI + [(compare:CCFP (match_dup 1)(match_dup 2))] + UNSPEC_FNSTSW)) + (set (reg:CC FLAGS_REG) + (unspec:CC [(match_dup 0)] UNSPEC_SAHF))] + "" + [(set_attr "type" "multi") + (set_attr "unit" "i387") + (set (attr "mode") + (cond [(match_operand:SF 1 "" "") + (const_string "SF") + (match_operand:DF 1 "" "") + (const_string "DF") + ] + (const_string "XF")))]) + +(define_insn "*cmpfp_xf" + [(set (match_operand:HI 0 "register_operand" "=a") + (unspec:HI + [(compare:CCFP + (match_operand:XF 1 "register_operand" "f") + (match_operand:XF 2 "register_operand" "f"))] + UNSPEC_FNSTSW))] + "TARGET_80387" + "* return output_fp_compare (insn, operands, 0, 0);" + [(set_attr "type" "multi") + (set_attr "unit" "i387") + (set_attr "mode" "XF")]) + +(define_insn_and_split "*cmpfp_xf_cc" + [(set (reg:CCFP FLAGS_REG) + (compare:CCFP + (match_operand:XF 1 "register_operand" "f") + (match_operand:XF 2 "register_operand" "f"))) + (clobber (match_operand:HI 0 "register_operand" "=a"))] + "TARGET_80387 + && TARGET_SAHF && !TARGET_CMOVE" + "#" + "&& reload_completed" + [(set (match_dup 0) + (unspec:HI + [(compare:CCFP (match_dup 1)(match_dup 2))] + UNSPEC_FNSTSW)) + (set (reg:CC FLAGS_REG) + (unspec:CC [(match_dup 0)] UNSPEC_SAHF))] + "" + [(set_attr "type" "multi") + (set_attr "unit" "i387") + (set_attr "mode" "XF")]) + +(define_insn "*cmpfp_" + [(set (match_operand:HI 0 "register_operand" "=a") + (unspec:HI + [(compare:CCFP + (match_operand:MODEF 1 "register_operand" "f") + (match_operand:MODEF 2 "nonimmediate_operand" "fm"))] + UNSPEC_FNSTSW))] + "TARGET_80387" + "* return output_fp_compare (insn, operands, 0, 0);" + [(set_attr "type" "multi") + (set_attr "unit" "i387") + (set_attr "mode" "")]) + +(define_insn_and_split "*cmpfp__cc" + [(set (reg:CCFP FLAGS_REG) + (compare:CCFP + (match_operand:MODEF 1 "register_operand" "f") + (match_operand:MODEF 2 "nonimmediate_operand" "fm"))) + (clobber (match_operand:HI 0 "register_operand" "=a"))] + "TARGET_80387 + && TARGET_SAHF && !TARGET_CMOVE" + "#" + "&& reload_completed" + [(set (match_dup 0) + (unspec:HI + [(compare:CCFP (match_dup 1)(match_dup 2))] + UNSPEC_FNSTSW)) + (set (reg:CC FLAGS_REG) + (unspec:CC [(match_dup 0)] UNSPEC_SAHF))] + "" + [(set_attr "type" "multi") + (set_attr "unit" "i387") + (set_attr "mode" "")]) + +(define_insn "*cmpfp_u" + [(set (match_operand:HI 0 "register_operand" "=a") + (unspec:HI + [(compare:CCFPU + (match_operand 1 "register_operand" "f") + (match_operand 2 "register_operand" "f"))] + UNSPEC_FNSTSW))] + "X87_FLOAT_MODE_P (GET_MODE (operands[1])) 
+ && GET_MODE (operands[1]) == GET_MODE (operands[2])" + "* return output_fp_compare (insn, operands, 0, 1);" + [(set_attr "type" "multi") + (set_attr "unit" "i387") + (set (attr "mode") + (cond [(match_operand:SF 1 "" "") + (const_string "SF") + (match_operand:DF 1 "" "") + (const_string "DF") + ] + (const_string "XF")))]) + +(define_insn_and_split "*cmpfp_u_cc" + [(set (reg:CCFPU FLAGS_REG) + (compare:CCFPU + (match_operand 1 "register_operand" "f") + (match_operand 2 "register_operand" "f"))) + (clobber (match_operand:HI 0 "register_operand" "=a"))] + "X87_FLOAT_MODE_P (GET_MODE (operands[1])) + && TARGET_SAHF && !TARGET_CMOVE + && GET_MODE (operands[1]) == GET_MODE (operands[2])" + "#" + "&& reload_completed" + [(set (match_dup 0) + (unspec:HI + [(compare:CCFPU (match_dup 1)(match_dup 2))] + UNSPEC_FNSTSW)) + (set (reg:CC FLAGS_REG) + (unspec:CC [(match_dup 0)] UNSPEC_SAHF))] + "" + [(set_attr "type" "multi") + (set_attr "unit" "i387") + (set (attr "mode") + (cond [(match_operand:SF 1 "" "") + (const_string "SF") + (match_operand:DF 1 "" "") + (const_string "DF") + ] + (const_string "XF")))]) + +(define_insn "*cmpfp_" + [(set (match_operand:HI 0 "register_operand" "=a") + (unspec:HI + [(compare:CCFP + (match_operand 1 "register_operand" "f") + (match_operator 3 "float_operator" + [(match_operand:X87MODEI12 2 "memory_operand" "m")]))] + UNSPEC_FNSTSW))] + "X87_FLOAT_MODE_P (GET_MODE (operands[1])) + && (TARGET_USE_MODE_FIOP || optimize_function_for_size_p (cfun)) + && (GET_MODE (operands [3]) == GET_MODE (operands[1]))" + "* return output_fp_compare (insn, operands, 0, 0);" + [(set_attr "type" "multi") + (set_attr "unit" "i387") + (set_attr "fp_int_src" "true") + (set_attr "mode" "")]) + +(define_insn_and_split "*cmpfp__cc" + [(set (reg:CCFP FLAGS_REG) + (compare:CCFP + (match_operand 1 "register_operand" "f") + (match_operator 3 "float_operator" + [(match_operand:X87MODEI12 2 "memory_operand" "m")]))) + (clobber (match_operand:HI 0 "register_operand" "=a"))] + "X87_FLOAT_MODE_P (GET_MODE (operands[1])) + && TARGET_SAHF && !TARGET_CMOVE + && (TARGET_USE_MODE_FIOP || optimize_function_for_size_p (cfun)) + && (GET_MODE (operands [3]) == GET_MODE (operands[1]))" + "#" + "&& reload_completed" + [(set (match_dup 0) + (unspec:HI + [(compare:CCFP + (match_dup 1) + (match_op_dup 3 [(match_dup 2)]))] + UNSPEC_FNSTSW)) + (set (reg:CC FLAGS_REG) + (unspec:CC [(match_dup 0)] UNSPEC_SAHF))] + "" + [(set_attr "type" "multi") + (set_attr "unit" "i387") + (set_attr "fp_int_src" "true") + (set_attr "mode" "")]) + +;; FP compares, step 2 +;; Move the fpsw to ax. + +(define_insn "x86_fnstsw_1" + [(set (match_operand:HI 0 "register_operand" "=a") + (unspec:HI [(reg:CCFP FPSR_REG)] UNSPEC_FNSTSW))] + "TARGET_80387" + "fnstsw\t%0" + [(set (attr "length") (symbol_ref "ix86_attr_length_address_default (insn) + 2")) + (set_attr "mode" "SI") + (set_attr "unit" "i387")]) + +;; FP compares, step 3 +;; Get ax into flags, general case. + +(define_insn "x86_sahf_1" + [(set (reg:CC FLAGS_REG) + (unspec:CC [(match_operand:HI 0 "register_operand" "a")] + UNSPEC_SAHF))] + "TARGET_SAHF" +{ +#ifndef HAVE_AS_IX86_SAHF + if (TARGET_64BIT) + return ASM_BYTE "0x9e"; + else +#endif + return "sahf"; +} + [(set_attr "length" "1") + (set_attr "athlon_decode" "vector") + (set_attr "amdfam10_decode" "direct") + (set_attr "bdver1_decode" "direct") + (set_attr "mode" "SI")]) + +;; Pentium Pro can do steps 1 through 3 in one go. 
+;; comi*, ucomi*, fcomi*, ficomi*,fucomi* (i387 instructions set condition codes) +(define_insn "*cmpfp_i_mixed" + [(set (reg:CCFP FLAGS_REG) + (compare:CCFP (match_operand 0 "register_operand" "f,x") + (match_operand 1 "nonimmediate_operand" "f,xm")))] + "TARGET_MIX_SSE_I387 + && SSE_FLOAT_MODE_P (GET_MODE (operands[0])) + && GET_MODE (operands[0]) == GET_MODE (operands[1])" + "* return output_fp_compare (insn, operands, 1, 0);" + [(set_attr "type" "fcmp,ssecomi") + (set_attr "prefix" "orig,maybe_vex") + (set (attr "mode") + (if_then_else (match_operand:SF 1 "" "") + (const_string "SF") + (const_string "DF"))) + (set (attr "prefix_rep") + (if_then_else (eq_attr "type" "ssecomi") + (const_string "0") + (const_string "*"))) + (set (attr "prefix_data16") + (cond [(eq_attr "type" "fcmp") + (const_string "*") + (eq_attr "mode" "DF") + (const_string "1") + ] + (const_string "0"))) + (set_attr "athlon_decode" "vector") + (set_attr "amdfam10_decode" "direct") + (set_attr "bdver1_decode" "double")]) + +(define_insn "*cmpfp_i_sse" + [(set (reg:CCFP FLAGS_REG) + (compare:CCFP (match_operand 0 "register_operand" "x") + (match_operand 1 "nonimmediate_operand" "xm")))] + "TARGET_SSE_MATH + && SSE_FLOAT_MODE_P (GET_MODE (operands[0])) + && GET_MODE (operands[0]) == GET_MODE (operands[1])" + "* return output_fp_compare (insn, operands, 1, 0);" + [(set_attr "type" "ssecomi") + (set_attr "prefix" "maybe_vex") + (set (attr "mode") + (if_then_else (match_operand:SF 1 "" "") + (const_string "SF") + (const_string "DF"))) + (set_attr "prefix_rep" "0") + (set (attr "prefix_data16") + (if_then_else (eq_attr "mode" "DF") + (const_string "1") + (const_string "0"))) + (set_attr "athlon_decode" "vector") + (set_attr "amdfam10_decode" "direct") + (set_attr "bdver1_decode" "double")]) + +(define_insn "*cmpfp_i_i387" + [(set (reg:CCFP FLAGS_REG) + (compare:CCFP (match_operand 0 "register_operand" "f") + (match_operand 1 "register_operand" "f")))] + "X87_FLOAT_MODE_P (GET_MODE (operands[0])) + && TARGET_CMOVE + && !(SSE_FLOAT_MODE_P (GET_MODE (operands[0])) && TARGET_SSE_MATH) + && GET_MODE (operands[0]) == GET_MODE (operands[1])" + "* return output_fp_compare (insn, operands, 1, 0);" + [(set_attr "type" "fcmp") + (set (attr "mode") + (cond [(match_operand:SF 1 "" "") + (const_string "SF") + (match_operand:DF 1 "" "") + (const_string "DF") + ] + (const_string "XF"))) + (set_attr "athlon_decode" "vector") + (set_attr "amdfam10_decode" "direct") + (set_attr "bdver1_decode" "double")]) + +(define_insn "*cmpfp_iu_mixed" + [(set (reg:CCFPU FLAGS_REG) + (compare:CCFPU (match_operand 0 "register_operand" "f,x") + (match_operand 1 "nonimmediate_operand" "f,xm")))] + "TARGET_MIX_SSE_I387 + && SSE_FLOAT_MODE_P (GET_MODE (operands[0])) + && GET_MODE (operands[0]) == GET_MODE (operands[1])" + "* return output_fp_compare (insn, operands, 1, 1);" + [(set_attr "type" "fcmp,ssecomi") + (set_attr "prefix" "orig,maybe_vex") + (set (attr "mode") + (if_then_else (match_operand:SF 1 "" "") + (const_string "SF") + (const_string "DF"))) + (set (attr "prefix_rep") + (if_then_else (eq_attr "type" "ssecomi") + (const_string "0") + (const_string "*"))) + (set (attr "prefix_data16") + (cond [(eq_attr "type" "fcmp") + (const_string "*") + (eq_attr "mode" "DF") + (const_string "1") + ] + (const_string "0"))) + (set_attr "athlon_decode" "vector") + (set_attr "amdfam10_decode" "direct") + (set_attr "bdver1_decode" "double")]) + +(define_insn "*cmpfp_iu_sse" + [(set (reg:CCFPU FLAGS_REG) + (compare:CCFPU (match_operand 0 "register_operand" "x") + 
(match_operand 1 "nonimmediate_operand" "xm")))] + "TARGET_SSE_MATH + && SSE_FLOAT_MODE_P (GET_MODE (operands[0])) + && GET_MODE (operands[0]) == GET_MODE (operands[1])" + "* return output_fp_compare (insn, operands, 1, 1);" + [(set_attr "type" "ssecomi") + (set_attr "prefix" "maybe_vex") + (set (attr "mode") + (if_then_else (match_operand:SF 1 "" "") + (const_string "SF") + (const_string "DF"))) + (set_attr "prefix_rep" "0") + (set (attr "prefix_data16") + (if_then_else (eq_attr "mode" "DF") + (const_string "1") + (const_string "0"))) + (set_attr "athlon_decode" "vector") + (set_attr "amdfam10_decode" "direct") + (set_attr "bdver1_decode" "double")]) + +(define_insn "*cmpfp_iu_387" + [(set (reg:CCFPU FLAGS_REG) + (compare:CCFPU (match_operand 0 "register_operand" "f") + (match_operand 1 "register_operand" "f")))] + "X87_FLOAT_MODE_P (GET_MODE (operands[0])) + && TARGET_CMOVE + && !(SSE_FLOAT_MODE_P (GET_MODE (operands[0])) && TARGET_SSE_MATH) + && GET_MODE (operands[0]) == GET_MODE (operands[1])" + "* return output_fp_compare (insn, operands, 1, 1);" + [(set_attr "type" "fcmp") + (set (attr "mode") + (cond [(match_operand:SF 1 "" "") + (const_string "SF") + (match_operand:DF 1 "" "") + (const_string "DF") + ] + (const_string "XF"))) + (set_attr "athlon_decode" "vector") + (set_attr "amdfam10_decode" "direct") + (set_attr "bdver1_decode" "direct")]) + +;; Push/pop instructions. + +(define_insn "*push2" + [(set (match_operand:DWI 0 "push_operand" "=<") + (match_operand:DWI 1 "general_no_elim_operand" "riF*m"))] + "" + "#") + +(define_split + [(set (match_operand:TI 0 "push_operand" "") + (match_operand:TI 1 "general_operand" ""))] + "TARGET_64BIT && reload_completed + && !SSE_REG_P (operands[1])" + [(const_int 0)] + "ix86_split_long_move (operands); DONE;") + +(define_insn "*pushdi2_rex64" + [(set (match_operand:DI 0 "push_operand" "=<,!<") + (match_operand:DI 1 "general_no_elim_operand" "re*m,n"))] + "TARGET_64BIT" + "@ + push{q}\t%1 + #" + [(set_attr "type" "push,multi") + (set_attr "mode" "DI")]) + +;; Convert impossible pushes of immediate to existing instructions. +;; First try to get scratch register and go through it. In case this +;; fails, push sign extended lower part first and then overwrite +;; upper part by 32bit move. +(define_peephole2 + [(match_scratch:DI 2 "r") + (set (match_operand:DI 0 "push_operand" "") + (match_operand:DI 1 "immediate_operand" ""))] + "TARGET_64BIT && !symbolic_operand (operands[1], DImode) + && !x86_64_immediate_operand (operands[1], DImode)" + [(set (match_dup 2) (match_dup 1)) + (set (match_dup 0) (match_dup 2))]) + +;; We need to define this as both peepholer and splitter for case +;; peephole2 pass is not run. +;; "&& 1" is needed to keep it from matching the previous pattern. +(define_peephole2 + [(set (match_operand:DI 0 "push_operand" "") + (match_operand:DI 1 "immediate_operand" ""))] + "TARGET_64BIT && !symbolic_operand (operands[1], DImode) + && !x86_64_immediate_operand (operands[1], DImode) && 1" + [(set (match_dup 0) (match_dup 1)) + (set (match_dup 2) (match_dup 3))] +{ + split_double_mode (DImode, &operands[1], 1, &operands[2], &operands[3]); + + operands[1] = gen_lowpart (DImode, operands[2]); + operands[2] = gen_rtx_MEM (SImode, gen_rtx_PLUS (DImode, stack_pointer_rtx, + GEN_INT (4))); +}) + +(define_split + [(set (match_operand:DI 0 "push_operand" "") + (match_operand:DI 1 "immediate_operand" ""))] + "TARGET_64BIT && ((optimize > 0 && flag_peephole2) + ? 
epilogue_completed : reload_completed) + && !symbolic_operand (operands[1], DImode) + && !x86_64_immediate_operand (operands[1], DImode)" + [(set (match_dup 0) (match_dup 1)) + (set (match_dup 2) (match_dup 3))] +{ + split_double_mode (DImode, &operands[1], 1, &operands[2], &operands[3]); + + operands[1] = gen_lowpart (DImode, operands[2]); + operands[2] = gen_rtx_MEM (SImode, gen_rtx_PLUS (DImode, stack_pointer_rtx, + GEN_INT (4))); +}) + +(define_split + [(set (match_operand:DI 0 "push_operand" "") + (match_operand:DI 1 "general_operand" ""))] + "!TARGET_64BIT && reload_completed + && !(MMX_REG_P (operands[1]) || SSE_REG_P (operands[1]))" + [(const_int 0)] + "ix86_split_long_move (operands); DONE;") + +(define_insn "*pushsi2" + [(set (match_operand:SI 0 "push_operand" "=<") + (match_operand:SI 1 "general_no_elim_operand" "ri*m"))] + "!TARGET_64BIT" + "push{l}\t%1" + [(set_attr "type" "push") + (set_attr "mode" "SI")]) + +;; emit_push_insn when it calls move_by_pieces requires an insn to +;; "push a byte/word". But actually we use pushl, which has the effect +;; of rounding the amount pushed up to a word. + +;; For TARGET_64BIT we always round up to 8 bytes. +(define_insn "*push2_rex64" + [(set (match_operand:SWI124 0 "push_operand" "=X") + (match_operand:SWI124 1 "nonmemory_no_elim_operand" "r"))] + "TARGET_64BIT" + "push{q}\t%q1" + [(set_attr "type" "push") + (set_attr "mode" "DI")]) + +(define_insn "*push2" + [(set (match_operand:SWI12 0 "push_operand" "=X") + (match_operand:SWI12 1 "nonmemory_no_elim_operand" "rn"))] + "!TARGET_64BIT" + "push{l}\t%k1" + [(set_attr "type" "push") + (set_attr "mode" "SI")]) + +(define_insn "*push2_prologue" + [(set (match_operand:P 0 "push_operand" "=<") + (match_operand:P 1 "general_no_elim_operand" "r*m")) + (clobber (mem:BLK (scratch)))] + "" + "push{}\t%1" + [(set_attr "type" "push") + (set_attr "mode" "")]) + +(define_insn "*pop1" + [(set (match_operand:P 0 "nonimmediate_operand" "=r*m") + (match_operand:P 1 "pop_operand" ">"))] + "" + "pop{}\t%0" + [(set_attr "type" "pop") + (set_attr "mode" "")]) + +(define_insn "*pop1_epilogue" + [(set (match_operand:P 0 "nonimmediate_operand" "=r*m") + (match_operand:P 1 "pop_operand" ">")) + (clobber (mem:BLK (scratch)))] + "" + "pop{}\t%0" + [(set_attr "type" "pop") + (set_attr "mode" "")]) + +;; Move instructions. + +(define_expand "movoi" + [(set (match_operand:OI 0 "nonimmediate_operand" "") + (match_operand:OI 1 "general_operand" ""))] + "TARGET_AVX" + "ix86_expand_move (OImode, operands); DONE;") + +(define_expand "movti" + [(set (match_operand:TI 0 "nonimmediate_operand" "") + (match_operand:TI 1 "nonimmediate_operand" ""))] + "TARGET_64BIT || TARGET_SSE" +{ + if (TARGET_64BIT) + ix86_expand_move (TImode, operands); + else if (push_operand (operands[0], TImode)) + ix86_expand_push (TImode, operands[1]); + else + ix86_expand_vector_move (TImode, operands); + DONE; +}) + +;; This expands to what emit_move_complex would generate if we didn't +;; have a movti pattern. Having this avoids problems with reload on +;; 32-bit targets when SSE is present, but doesn't seem to be harmful +;; to have around all the time. 
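+;; (CDImode is the complex integer mode made of two DImode parts; pushes go
+;; through emit_move_complex_push and other moves through
+;; emit_move_complex_parts, as in the expander below.)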
+(define_expand "movcdi" + [(set (match_operand:CDI 0 "nonimmediate_operand" "") + (match_operand:CDI 1 "general_operand" ""))] + "" +{ + if (push_operand (operands[0], CDImode)) + emit_move_complex_push (CDImode, operands[0], operands[1]); + else + emit_move_complex_parts (operands[0], operands[1]); + DONE; +}) + +(define_expand "mov" + [(set (match_operand:SWI1248x 0 "nonimmediate_operand" "") + (match_operand:SWI1248x 1 "general_operand" ""))] + "" + "ix86_expand_move (mode, operands); DONE;") + +(define_insn "*mov_xor" + [(set (match_operand:SWI48 0 "register_operand" "=r") + (match_operand:SWI48 1 "const0_operand" "")) + (clobber (reg:CC FLAGS_REG))] + "reload_completed" + "xor{l}\t%k0, %k0" + [(set_attr "type" "alu1") + (set_attr "mode" "SI") + (set_attr "length_immediate" "0")]) + +(define_insn "*mov_or" + [(set (match_operand:SWI48 0 "register_operand" "=r") + (match_operand:SWI48 1 "const_int_operand" "")) + (clobber (reg:CC FLAGS_REG))] + "reload_completed + && operands[1] == constm1_rtx" + "or{}\t{%1, %0|%0, %1}" + [(set_attr "type" "alu1") + (set_attr "mode" "") + (set_attr "length_immediate" "1")]) + +(define_insn "*movoi_internal_avx" + [(set (match_operand:OI 0 "nonimmediate_operand" "=x,x,m") + (match_operand:OI 1 "vector_move_operand" "C,xm,x"))] + "TARGET_AVX && !(MEM_P (operands[0]) && MEM_P (operands[1]))" +{ + switch (which_alternative) + { + case 0: + return "vxorps\t%0, %0, %0"; + case 1: + case 2: + if (misaligned_operand (operands[0], OImode) + || misaligned_operand (operands[1], OImode)) + return "vmovdqu\t{%1, %0|%0, %1}"; + else + return "vmovdqa\t{%1, %0|%0, %1}"; + default: + gcc_unreachable (); + } +} + [(set_attr "type" "sselog1,ssemov,ssemov") + (set_attr "prefix" "vex") + (set_attr "mode" "OI")]) + +(define_insn "*movti_internal_rex64" + [(set (match_operand:TI 0 "nonimmediate_operand" "=!r,o,x,x,xm") + (match_operand:TI 1 "general_operand" "riFo,riF,C,xm,x"))] + "TARGET_64BIT && !(MEM_P (operands[0]) && MEM_P (operands[1]))" +{ + switch (which_alternative) + { + case 0: + case 1: + return "#"; + case 2: + if (get_attr_mode (insn) == MODE_V4SF) + return "%vxorps\t%0, %d0"; + else + return "%vpxor\t%0, %d0"; + case 3: + case 4: + /* TDmode values are passed as TImode on the stack. Moving them + to stack may result in unaligned memory access. 
*/ + if (misaligned_operand (operands[0], TImode) + || misaligned_operand (operands[1], TImode)) + { + if (get_attr_mode (insn) == MODE_V4SF) + return "%vmovups\t{%1, %0|%0, %1}"; + else + return "%vmovdqu\t{%1, %0|%0, %1}"; + } + else + { + if (get_attr_mode (insn) == MODE_V4SF) + return "%vmovaps\t{%1, %0|%0, %1}"; + else + return "%vmovdqa\t{%1, %0|%0, %1}"; + } + default: + gcc_unreachable (); + } +} + [(set_attr "type" "*,*,sselog1,ssemov,ssemov") + (set_attr "prefix" "*,*,maybe_vex,maybe_vex,maybe_vex") + (set (attr "mode") + (cond [(eq_attr "alternative" "2,3") + (if_then_else + (ne (symbol_ref "optimize_function_for_size_p (cfun)") + (const_int 0)) + (const_string "V4SF") + (const_string "TI")) + (eq_attr "alternative" "4") + (if_then_else + (ior (ne (symbol_ref "TARGET_SSE_TYPELESS_STORES") + (const_int 0)) + (ne (symbol_ref "optimize_function_for_size_p (cfun)") + (const_int 0))) + (const_string "V4SF") + (const_string "TI"))] + (const_string "DI")))]) + +(define_split + [(set (match_operand:TI 0 "nonimmediate_operand" "") + (match_operand:TI 1 "general_operand" ""))] + "reload_completed + && !SSE_REG_P (operands[0]) && !SSE_REG_P (operands[1])" + [(const_int 0)] + "ix86_split_long_move (operands); DONE;") + +(define_insn "*movti_internal_sse" + [(set (match_operand:TI 0 "nonimmediate_operand" "=x,x,m") + (match_operand:TI 1 "vector_move_operand" "C,xm,x"))] + "TARGET_SSE && !TARGET_64BIT + && !(MEM_P (operands[0]) && MEM_P (operands[1]))" +{ + switch (which_alternative) + { + case 0: + if (get_attr_mode (insn) == MODE_V4SF) + return "%vxorps\t%0, %d0"; + else + return "%vpxor\t%0, %d0"; + case 1: + case 2: + /* TDmode values are passed as TImode on the stack. Moving them + to stack may result in unaligned memory access. */ + if (misaligned_operand (operands[0], TImode) + || misaligned_operand (operands[1], TImode)) + { + if (get_attr_mode (insn) == MODE_V4SF) + return "%vmovups\t{%1, %0|%0, %1}"; + else + return "%vmovdqu\t{%1, %0|%0, %1}"; + } + else + { + if (get_attr_mode (insn) == MODE_V4SF) + return "%vmovaps\t{%1, %0|%0, %1}"; + else + return "%vmovdqa\t{%1, %0|%0, %1}"; + } + default: + gcc_unreachable (); + } +} + [(set_attr "type" "sselog1,ssemov,ssemov") + (set_attr "prefix" "maybe_vex") + (set (attr "mode") + (cond [(ior (eq (symbol_ref "TARGET_SSE2") (const_int 0)) + (ne (symbol_ref "optimize_function_for_size_p (cfun)") + (const_int 0))) + (const_string "V4SF") + (and (eq_attr "alternative" "2") + (ne (symbol_ref "TARGET_SSE_TYPELESS_STORES") + (const_int 0))) + (const_string "V4SF")] + (const_string "TI")))]) + +(define_insn "*movdi_internal_rex64" + [(set (match_operand:DI 0 "nonimmediate_operand" + "=r,r ,r,m ,!o,*y,*y,?r ,m ,?*Ym,?*y,*x,*x,?r ,m,?*Yi,*x,?*x,?*Ym") + (match_operand:DI 1 "general_operand" + "Z ,rem,i,re,n ,C ,*y,*Ym,*y,r ,m ,C ,*x,*Yi,*x,r ,m ,*Ym,*x"))] + "TARGET_64BIT && !(MEM_P (operands[0]) && MEM_P (operands[1]))" +{ + switch (get_attr_type (insn)) + { + case TYPE_SSECVT: + if (SSE_REG_P (operands[0])) + return "movq2dq\t{%1, %0|%0, %1}"; + else + return "movdq2q\t{%1, %0|%0, %1}"; + + case TYPE_SSEMOV: + if (get_attr_mode (insn) == MODE_TI) + return "%vmovdqa\t{%1, %0|%0, %1}"; + /* Handle broken assemblers that require movd instead of movq. */ + if (GENERAL_REG_P (operands[0]) || GENERAL_REG_P (operands[1])) + return "%vmovd\t{%1, %0|%0, %1}"; + return "%vmovq\t{%1, %0|%0, %1}"; + + case TYPE_MMXMOV: + /* Handle broken assemblers that require movd instead of movq. 
*/ + if (GENERAL_REG_P (operands[0]) || GENERAL_REG_P (operands[1])) + return "movd\t{%1, %0|%0, %1}"; + return "movq\t{%1, %0|%0, %1}"; + + case TYPE_SSELOG1: + return "%vpxor\t%0, %d0"; + + case TYPE_MMX: + return "pxor\t%0, %0"; + + case TYPE_MULTI: + return "#"; + + case TYPE_LEA: + return "lea{q}\t{%a1, %0|%0, %a1}"; + + default: + gcc_assert (!flag_pic || LEGITIMATE_PIC_OPERAND_P (operands[1])); + if (get_attr_mode (insn) == MODE_SI) + return "mov{l}\t{%k1, %k0|%k0, %k1}"; + else if (which_alternative == 2) + return "movabs{q}\t{%1, %0|%0, %1}"; + else + return "mov{q}\t{%1, %0|%0, %1}"; + } +} + [(set (attr "type") + (cond [(eq_attr "alternative" "5") + (const_string "mmx") + (eq_attr "alternative" "6,7,8,9,10") + (const_string "mmxmov") + (eq_attr "alternative" "11") + (const_string "sselog1") + (eq_attr "alternative" "12,13,14,15,16") + (const_string "ssemov") + (eq_attr "alternative" "17,18") + (const_string "ssecvt") + (eq_attr "alternative" "4") + (const_string "multi") + (match_operand:DI 1 "pic_32bit_operand" "") + (const_string "lea") + ] + (const_string "imov"))) + (set (attr "modrm") + (if_then_else + (and (eq_attr "alternative" "2") (eq_attr "type" "imov")) + (const_string "0") + (const_string "*"))) + (set (attr "length_immediate") + (if_then_else + (and (eq_attr "alternative" "2") (eq_attr "type" "imov")) + (const_string "8") + (const_string "*"))) + (set_attr "prefix_rex" "*,*,*,*,*,*,*,1,*,1,*,*,*,*,*,*,*,*,*") + (set_attr "prefix_data16" "*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,1,*,*,*") + (set (attr "prefix") + (if_then_else (eq_attr "alternative" "11,12,13,14,15,16") + (const_string "maybe_vex") + (const_string "orig"))) + (set_attr "mode" "SI,DI,DI,DI,SI,DI,DI,DI,DI,DI,DI,TI,TI,DI,DI,DI,DI,DI,DI")]) + +;; Convert impossible stores of immediate to existing instructions. +;; First try to get scratch register and go through it. In case this +;; fails, move by 32bit parts. +(define_peephole2 + [(match_scratch:DI 2 "r") + (set (match_operand:DI 0 "memory_operand" "") + (match_operand:DI 1 "immediate_operand" ""))] + "TARGET_64BIT && !symbolic_operand (operands[1], DImode) + && !x86_64_immediate_operand (operands[1], DImode)" + [(set (match_dup 2) (match_dup 1)) + (set (match_dup 0) (match_dup 2))]) + +;; We need to define this as both peepholer and splitter for case +;; peephole2 pass is not run. +;; "&& 1" is needed to keep it from matching the previous pattern. +(define_peephole2 + [(set (match_operand:DI 0 "memory_operand" "") + (match_operand:DI 1 "immediate_operand" ""))] + "TARGET_64BIT && !symbolic_operand (operands[1], DImode) + && !x86_64_immediate_operand (operands[1], DImode) && 1" + [(set (match_dup 2) (match_dup 3)) + (set (match_dup 4) (match_dup 5))] + "split_double_mode (DImode, &operands[0], 2, &operands[2], &operands[4]);") + +(define_split + [(set (match_operand:DI 0 "memory_operand" "") + (match_operand:DI 1 "immediate_operand" ""))] + "TARGET_64BIT && ((optimize > 0 && flag_peephole2) + ? 
epilogue_completed : reload_completed) + && !symbolic_operand (operands[1], DImode) + && !x86_64_immediate_operand (operands[1], DImode)" + [(set (match_dup 2) (match_dup 3)) + (set (match_dup 4) (match_dup 5))] + "split_double_mode (DImode, &operands[0], 2, &operands[2], &operands[4]);") + +(define_insn "*movdi_internal" + [(set (match_operand:DI 0 "nonimmediate_operand" + "=r ,o ,*y,m*y,*y,*Y2,m ,*Y2,*Y2,*x,m ,*x,*x") + (match_operand:DI 1 "general_operand" + "riFo,riF,C ,*y ,m ,C ,*Y2,*Y2,m ,C ,*x,*x,m "))] + "!TARGET_64BIT && !(MEM_P (operands[0]) && MEM_P (operands[1]))" + "@ + # + # + pxor\t%0, %0 + movq\t{%1, %0|%0, %1} + movq\t{%1, %0|%0, %1} + %vpxor\t%0, %d0 + %vmovq\t{%1, %0|%0, %1} + %vmovdqa\t{%1, %0|%0, %1} + %vmovq\t{%1, %0|%0, %1} + xorps\t%0, %0 + movlps\t{%1, %0|%0, %1} + movaps\t{%1, %0|%0, %1} + movlps\t{%1, %0|%0, %1}" + [(set_attr "type" "*,*,mmx,mmxmov,mmxmov,sselog1,ssemov,ssemov,ssemov,sselog1,ssemov,ssemov,ssemov") + (set (attr "prefix") + (if_then_else (eq_attr "alternative" "5,6,7,8") + (const_string "maybe_vex") + (const_string "orig"))) + (set_attr "mode" "DI,DI,DI,DI,DI,TI,DI,TI,DI,V4SF,V2SF,V4SF,V2SF")]) + +(define_split + [(set (match_operand:DI 0 "nonimmediate_operand" "") + (match_operand:DI 1 "general_operand" ""))] + "!TARGET_64BIT && reload_completed + && !(MMX_REG_P (operands[0]) || SSE_REG_P (operands[0])) + && !(MMX_REG_P (operands[1]) || SSE_REG_P (operands[1]))" + [(const_int 0)] + "ix86_split_long_move (operands); DONE;") + +(define_insn "*movsi_internal" + [(set (match_operand:SI 0 "nonimmediate_operand" + "=r,m ,*y,*y,?rm,?*y,*x,*x,?r ,m ,?*Yi,*x") + (match_operand:SI 1 "general_operand" + "g ,ri,C ,*y,*y ,rm ,C ,*x,*Yi,*x,r ,m "))] + "!(MEM_P (operands[0]) && MEM_P (operands[1]))" +{ + switch (get_attr_type (insn)) + { + case TYPE_SSELOG1: + if (get_attr_mode (insn) == MODE_TI) + return "%vpxor\t%0, %d0"; + return "%vxorps\t%0, %d0"; + + case TYPE_SSEMOV: + switch (get_attr_mode (insn)) + { + case MODE_TI: + return "%vmovdqa\t{%1, %0|%0, %1}"; + case MODE_V4SF: + return "%vmovaps\t{%1, %0|%0, %1}"; + case MODE_SI: + return "%vmovd\t{%1, %0|%0, %1}"; + case MODE_SF: + return "%vmovss\t{%1, %0|%0, %1}"; + default: + gcc_unreachable (); + } + + case TYPE_MMX: + return "pxor\t%0, %0"; + + case TYPE_MMXMOV: + if (get_attr_mode (insn) == MODE_DI) + return "movq\t{%1, %0|%0, %1}"; + return "movd\t{%1, %0|%0, %1}"; + + case TYPE_LEA: + return "lea{l}\t{%a1, %0|%0, %a1}"; + + default: + gcc_assert (!flag_pic || LEGITIMATE_PIC_OPERAND_P (operands[1])); + return "mov{l}\t{%1, %0|%0, %1}"; + } +} + [(set (attr "type") + (cond [(eq_attr "alternative" "2") + (const_string "mmx") + (eq_attr "alternative" "3,4,5") + (const_string "mmxmov") + (eq_attr "alternative" "6") + (const_string "sselog1") + (eq_attr "alternative" "7,8,9,10,11") + (const_string "ssemov") + (match_operand:DI 1 "pic_32bit_operand" "") + (const_string "lea") + ] + (const_string "imov"))) + (set (attr "prefix") + (if_then_else (eq_attr "alternative" "0,1,2,3,4,5") + (const_string "orig") + (const_string "maybe_vex"))) + (set (attr "prefix_data16") + (if_then_else (and (eq_attr "type" "ssemov") (eq_attr "mode" "SI")) + (const_string "1") + (const_string "*"))) + (set (attr "mode") + (cond [(eq_attr "alternative" "2,3") + (const_string "DI") + (eq_attr "alternative" "6,7") + (if_then_else + (eq (symbol_ref "TARGET_SSE2") (const_int 0)) + (const_string "V4SF") + (const_string "TI")) + (and (eq_attr "alternative" "8,9,10,11") + (eq (symbol_ref "TARGET_SSE2") (const_int 0))) + (const_string 
"SF") + ] + (const_string "SI")))]) + +(define_insn "*movhi_internal" + [(set (match_operand:HI 0 "nonimmediate_operand" "=r,r,r,m") + (match_operand:HI 1 "general_operand" "r,rn,rm,rn"))] + "!(MEM_P (operands[0]) && MEM_P (operands[1]))" +{ + switch (get_attr_type (insn)) + { + case TYPE_IMOVX: + /* movzwl is faster than movw on p2 due to partial word stalls, + though not as fast as an aligned movl. */ + return "movz{wl|x}\t{%1, %k0|%k0, %1}"; + default: + if (get_attr_mode (insn) == MODE_SI) + return "mov{l}\t{%k1, %k0|%k0, %k1}"; + else + return "mov{w}\t{%1, %0|%0, %1}"; + } +} + [(set (attr "type") + (cond [(ne (symbol_ref "optimize_function_for_size_p (cfun)") + (const_int 0)) + (const_string "imov") + (and (eq_attr "alternative" "0") + (ior (eq (symbol_ref "TARGET_PARTIAL_REG_STALL") + (const_int 0)) + (eq (symbol_ref "TARGET_HIMODE_MATH") + (const_int 0)))) + (const_string "imov") + (and (eq_attr "alternative" "1,2") + (match_operand:HI 1 "aligned_operand" "")) + (const_string "imov") + (and (ne (symbol_ref "TARGET_MOVX") + (const_int 0)) + (eq_attr "alternative" "0,2")) + (const_string "imovx") + ] + (const_string "imov"))) + (set (attr "mode") + (cond [(eq_attr "type" "imovx") + (const_string "SI") + (and (eq_attr "alternative" "1,2") + (match_operand:HI 1 "aligned_operand" "")) + (const_string "SI") + (and (eq_attr "alternative" "0") + (ior (eq (symbol_ref "TARGET_PARTIAL_REG_STALL") + (const_int 0)) + (eq (symbol_ref "TARGET_HIMODE_MATH") + (const_int 0)))) + (const_string "SI") + ] + (const_string "HI")))]) + +;; Situation is quite tricky about when to choose full sized (SImode) move +;; over QImode moves. For Q_REG -> Q_REG move we use full size only for +;; partial register dependency machines (such as AMD Athlon), where QImode +;; moves issue extra dependency and for partial register stalls machines +;; that don't use QImode patterns (and QImode move cause stall on the next +;; instruction). +;; +;; For loads of Q_REG to NONQ_REG we use full sized moves except for partial +;; register stall machines with, where we use QImode instructions, since +;; partial register stall can be caused there. Then we use movzx. 
+(define_insn "*movqi_internal" + [(set (match_operand:QI 0 "nonimmediate_operand" "=q,q ,q ,r,r ,?r,m") + (match_operand:QI 1 "general_operand" " q,qn,qm,q,rn,qm,qn"))] + "!(MEM_P (operands[0]) && MEM_P (operands[1]))" +{ + switch (get_attr_type (insn)) + { + case TYPE_IMOVX: + gcc_assert (ANY_QI_REG_P (operands[1]) || MEM_P (operands[1])); + return "movz{bl|x}\t{%1, %k0|%k0, %1}"; + default: + if (get_attr_mode (insn) == MODE_SI) + return "mov{l}\t{%k1, %k0|%k0, %k1}"; + else + return "mov{b}\t{%1, %0|%0, %1}"; + } +} + [(set (attr "type") + (cond [(and (eq_attr "alternative" "5") + (not (match_operand:QI 1 "aligned_operand" ""))) + (const_string "imovx") + (ne (symbol_ref "optimize_function_for_size_p (cfun)") + (const_int 0)) + (const_string "imov") + (and (eq_attr "alternative" "3") + (ior (eq (symbol_ref "TARGET_PARTIAL_REG_STALL") + (const_int 0)) + (eq (symbol_ref "TARGET_QIMODE_MATH") + (const_int 0)))) + (const_string "imov") + (eq_attr "alternative" "3,5") + (const_string "imovx") + (and (ne (symbol_ref "TARGET_MOVX") + (const_int 0)) + (eq_attr "alternative" "2")) + (const_string "imovx") + ] + (const_string "imov"))) + (set (attr "mode") + (cond [(eq_attr "alternative" "3,4,5") + (const_string "SI") + (eq_attr "alternative" "6") + (const_string "QI") + (eq_attr "type" "imovx") + (const_string "SI") + (and (eq_attr "type" "imov") + (and (eq_attr "alternative" "0,1") + (and (ne (symbol_ref "TARGET_PARTIAL_REG_DEPENDENCY") + (const_int 0)) + (and (eq (symbol_ref "optimize_function_for_size_p (cfun)") + (const_int 0)) + (eq (symbol_ref "TARGET_PARTIAL_REG_STALL") + (const_int 0)))))) + (const_string "SI") + ;; Avoid partial register stalls when not using QImode arithmetic + (and (eq_attr "type" "imov") + (and (eq_attr "alternative" "0,1") + (and (ne (symbol_ref "TARGET_PARTIAL_REG_STALL") + (const_int 0)) + (eq (symbol_ref "TARGET_QIMODE_MATH") + (const_int 0))))) + (const_string "SI") + ] + (const_string "QI")))]) + +;; Stores and loads of ax to arbitrary constant address. 
+;; We fake an second form of instruction to force reload to load address +;; into register when rax is not available +(define_insn "*movabs_1" + [(set (mem:SWI1248x (match_operand:DI 0 "x86_64_movabs_operand" "i,r")) + (match_operand:SWI1248x 1 "nonmemory_operand" "a,er"))] + "TARGET_64BIT && ix86_check_movabs (insn, 0)" + "@ + movabs{}\t{%1, %P0|[%P0], %1} + mov{}\t{%1, %a0|%a0, %1}" + [(set_attr "type" "imov") + (set_attr "modrm" "0,*") + (set_attr "length_address" "8,0") + (set_attr "length_immediate" "0,*") + (set_attr "memory" "store") + (set_attr "mode" "")]) + +(define_insn "*movabs_2" + [(set (match_operand:SWI1248x 0 "register_operand" "=a,r") + (mem:SWI1248x (match_operand:DI 1 "x86_64_movabs_operand" "i,r")))] + "TARGET_64BIT && ix86_check_movabs (insn, 1)" + "@ + movabs{}\t{%P1, %0|%0, [%P1]} + mov{}\t{%a1, %0|%0, %a1}" + [(set_attr "type" "imov") + (set_attr "modrm" "0,*") + (set_attr "length_address" "8,0") + (set_attr "length_immediate" "0") + (set_attr "memory" "load") + (set_attr "mode" "")]) + +(define_insn "*swap" + [(set (match_operand:SWI48 0 "register_operand" "+r") + (match_operand:SWI48 1 "register_operand" "+r")) + (set (match_dup 1) + (match_dup 0))] + "" + "xchg{}\t%1, %0" + [(set_attr "type" "imov") + (set_attr "mode" "") + (set_attr "pent_pair" "np") + (set_attr "athlon_decode" "vector") + (set_attr "amdfam10_decode" "double") + (set_attr "bdver1_decode" "double")]) + +(define_insn "*swap_1" + [(set (match_operand:SWI12 0 "register_operand" "+r") + (match_operand:SWI12 1 "register_operand" "+r")) + (set (match_dup 1) + (match_dup 0))] + "!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun)" + "xchg{l}\t%k1, %k0" + [(set_attr "type" "imov") + (set_attr "mode" "SI") + (set_attr "pent_pair" "np") + (set_attr "athlon_decode" "vector") + (set_attr "amdfam10_decode" "double") + (set_attr "bdver1_decode" "double")]) + +;; Not added amdfam10_decode since TARGET_PARTIAL_REG_STALL +;; is disabled for AMDFAM10 +(define_insn "*swap_2" + [(set (match_operand:SWI12 0 "register_operand" "+") + (match_operand:SWI12 1 "register_operand" "+")) + (set (match_dup 1) + (match_dup 0))] + "TARGET_PARTIAL_REG_STALL" + "xchg{}\t%1, %0" + [(set_attr "type" "imov") + (set_attr "mode" "") + (set_attr "pent_pair" "np") + (set_attr "athlon_decode" "vector")]) + +(define_expand "movstrict" + [(set (strict_low_part (match_operand:SWI12 0 "nonimmediate_operand" "")) + (match_operand:SWI12 1 "general_operand" ""))] + "" +{ + if (TARGET_PARTIAL_REG_STALL && optimize_function_for_speed_p (cfun)) + FAIL; + if (GET_CODE (operands[0]) == SUBREG + && GET_MODE_CLASS (GET_MODE (SUBREG_REG (operands[0]))) != MODE_INT) + FAIL; + /* Don't generate memory->memory moves, go through a register */ + if (MEM_P (operands[0]) && MEM_P (operands[1])) + operands[1] = force_reg (mode, operands[1]); +}) + +(define_insn "*movstrict_1" + [(set (strict_low_part + (match_operand:SWI12 0 "nonimmediate_operand" "+m,")) + (match_operand:SWI12 1 "general_operand" "n,m"))] + "(!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun)) + && !(MEM_P (operands[0]) && MEM_P (operands[1]))" + "mov{}\t{%1, %0|%0, %1}" + [(set_attr "type" "imov") + (set_attr "mode" "")]) + +(define_insn "*movstrict_xor" + [(set (strict_low_part (match_operand:SWI12 0 "register_operand" "+")) + (match_operand:SWI12 1 "const0_operand" "")) + (clobber (reg:CC FLAGS_REG))] + "reload_completed" + "xor{}\t%0, %0" + [(set_attr "type" "alu1") + (set_attr "mode" "") + (set_attr "length_immediate" "0")]) + +(define_insn "*mov_extv_1" + 
[(set (match_operand:SWI24 0 "register_operand" "=R") + (sign_extract:SWI24 (match_operand 1 "ext_register_operand" "Q") + (const_int 8) + (const_int 8)))] + "" + "movs{bl|x}\t{%h1, %k0|%k0, %h1}" + [(set_attr "type" "imovx") + (set_attr "mode" "SI")]) + +(define_insn "*movqi_extv_1_rex64" + [(set (match_operand:QI 0 "register_operand" "=Q,?R") + (sign_extract:QI (match_operand 1 "ext_register_operand" "Q,Q") + (const_int 8) + (const_int 8)))] + "TARGET_64BIT" +{ + switch (get_attr_type (insn)) + { + case TYPE_IMOVX: + return "movs{bl|x}\t{%h1, %k0|%k0, %h1}"; + default: + return "mov{b}\t{%h1, %0|%0, %h1}"; + } +} + [(set (attr "type") + (if_then_else (and (match_operand:QI 0 "register_operand" "") + (ior (not (match_operand:QI 0 "q_regs_operand" "")) + (ne (symbol_ref "TARGET_MOVX") + (const_int 0)))) + (const_string "imovx") + (const_string "imov"))) + (set (attr "mode") + (if_then_else (eq_attr "type" "imovx") + (const_string "SI") + (const_string "QI")))]) + +(define_insn "*movqi_extv_1" + [(set (match_operand:QI 0 "nonimmediate_operand" "=Qm,?r") + (sign_extract:QI (match_operand 1 "ext_register_operand" "Q,Q") + (const_int 8) + (const_int 8)))] + "!TARGET_64BIT" +{ + switch (get_attr_type (insn)) + { + case TYPE_IMOVX: + return "movs{bl|x}\t{%h1, %k0|%k0, %h1}"; + default: + return "mov{b}\t{%h1, %0|%0, %h1}"; + } +} + [(set (attr "type") + (if_then_else (and (match_operand:QI 0 "register_operand" "") + (ior (not (match_operand:QI 0 "q_regs_operand" "")) + (ne (symbol_ref "TARGET_MOVX") + (const_int 0)))) + (const_string "imovx") + (const_string "imov"))) + (set (attr "mode") + (if_then_else (eq_attr "type" "imovx") + (const_string "SI") + (const_string "QI")))]) + +(define_insn "*mov_extzv_1" + [(set (match_operand:SWI48 0 "register_operand" "=R") + (zero_extract:SWI48 (match_operand 1 "ext_register_operand" "Q") + (const_int 8) + (const_int 8)))] + "" + "movz{bl|x}\t{%h1, %k0|%k0, %h1}" + [(set_attr "type" "imovx") + (set_attr "mode" "SI")]) + +(define_insn "*movqi_extzv_2_rex64" + [(set (match_operand:QI 0 "register_operand" "=Q,?R") + (subreg:QI + (zero_extract:SI (match_operand 1 "ext_register_operand" "Q,Q") + (const_int 8) + (const_int 8)) 0))] + "TARGET_64BIT" +{ + switch (get_attr_type (insn)) + { + case TYPE_IMOVX: + return "movz{bl|x}\t{%h1, %k0|%k0, %h1}"; + default: + return "mov{b}\t{%h1, %0|%0, %h1}"; + } +} + [(set (attr "type") + (if_then_else (ior (not (match_operand:QI 0 "q_regs_operand" "")) + (ne (symbol_ref "TARGET_MOVX") + (const_int 0))) + (const_string "imovx") + (const_string "imov"))) + (set (attr "mode") + (if_then_else (eq_attr "type" "imovx") + (const_string "SI") + (const_string "QI")))]) + +(define_insn "*movqi_extzv_2" + [(set (match_operand:QI 0 "nonimmediate_operand" "=Qm,?R") + (subreg:QI + (zero_extract:SI (match_operand 1 "ext_register_operand" "Q,Q") + (const_int 8) + (const_int 8)) 0))] + "!TARGET_64BIT" +{ + switch (get_attr_type (insn)) + { + case TYPE_IMOVX: + return "movz{bl|x}\t{%h1, %k0|%k0, %h1}"; + default: + return "mov{b}\t{%h1, %0|%0, %h1}"; + } +} + [(set (attr "type") + (if_then_else (and (match_operand:QI 0 "register_operand" "") + (ior (not (match_operand:QI 0 "q_regs_operand" "")) + (ne (symbol_ref "TARGET_MOVX") + (const_int 0)))) + (const_string "imovx") + (const_string "imov"))) + (set (attr "mode") + (if_then_else (eq_attr "type" "imovx") + (const_string "SI") + (const_string "QI")))]) + +(define_expand "mov_insv_1" + [(set (zero_extract:SWI48 (match_operand 0 "ext_register_operand" "") + (const_int 8) + (const_int 8)) + 
(match_operand:SWI48 1 "nonmemory_operand" ""))]) + +(define_insn "*mov_insv_1_rex64" + [(set (zero_extract:SWI48x (match_operand 0 "ext_register_operand" "+Q") + (const_int 8) + (const_int 8)) + (match_operand:SWI48x 1 "nonmemory_operand" "Qn"))] + "TARGET_64BIT" + "mov{b}\t{%b1, %h0|%h0, %b1}" + [(set_attr "type" "imov") + (set_attr "mode" "QI")]) + +(define_insn "*movsi_insv_1" + [(set (zero_extract:SI (match_operand 0 "ext_register_operand" "+Q") + (const_int 8) + (const_int 8)) + (match_operand:SI 1 "general_operand" "Qmn"))] + "!TARGET_64BIT" + "mov{b}\t{%b1, %h0|%h0, %b1}" + [(set_attr "type" "imov") + (set_attr "mode" "QI")]) + +(define_insn "*movqi_insv_2" + [(set (zero_extract:SI (match_operand 0 "ext_register_operand" "+Q") + (const_int 8) + (const_int 8)) + (lshiftrt:SI (match_operand:SI 1 "register_operand" "Q") + (const_int 8)))] + "" + "mov{b}\t{%h1, %h0|%h0, %h1}" + [(set_attr "type" "imov") + (set_attr "mode" "QI")]) + +;; Floating point push instructions. + +(define_insn "*pushtf" + [(set (match_operand:TF 0 "push_operand" "=<,<,<") + (match_operand:TF 1 "general_no_elim_operand" "x,Fo,*r"))] + "TARGET_SSE2" +{ + /* This insn should be already split before reg-stack. */ + gcc_unreachable (); +} + [(set_attr "type" "multi") + (set_attr "unit" "sse,*,*") + (set_attr "mode" "TF,SI,SI")]) + +(define_split + [(set (match_operand:TF 0 "push_operand" "") + (match_operand:TF 1 "sse_reg_operand" ""))] + "TARGET_SSE2 && reload_completed" + [(set (reg:P SP_REG) (plus:P (reg:P SP_REG) (const_int -16))) + (set (mem:TF (reg:P SP_REG)) (match_dup 1))]) + +(define_split + [(set (match_operand:TF 0 "push_operand" "") + (match_operand:TF 1 "general_operand" ""))] + "TARGET_SSE2 && reload_completed + && !SSE_REG_P (operands[1])" + [(const_int 0)] + "ix86_split_long_move (operands); DONE;") + +(define_insn "*pushxf" + [(set (match_operand:XF 0 "push_operand" "=<,<") + (match_operand:XF 1 "general_no_elim_operand" "f,ro"))] + "optimize_function_for_speed_p (cfun)" +{ + /* This insn should be already split before reg-stack. */ + gcc_unreachable (); +} + [(set_attr "type" "multi") + (set_attr "unit" "i387,*") + (set_attr "mode" "XF,SI")]) + +;; Size of pushxf is 3 (for sub) + 2 (for fstp) + memory operand size. +;; Size of pushxf using integer instructions is 3+3*memory operand size +;; Pushing using integer instructions is longer except for constants +;; and direct memory references (assuming that any given constant is pushed +;; only once, but this ought to be handled elsewhere). + +(define_insn "*pushxf_nointeger" + [(set (match_operand:XF 0 "push_operand" "=X,X,X") + (match_operand:XF 1 "general_no_elim_operand" "f,Fo,*r"))] + "optimize_function_for_size_p (cfun)" +{ + /* This insn should be already split before reg-stack. 
*/ + gcc_unreachable (); +} + [(set_attr "type" "multi") + (set_attr "unit" "i387,*,*") + (set_attr "mode" "XF,SI,SI")]) + +(define_split + [(set (match_operand:XF 0 "push_operand" "") + (match_operand:XF 1 "fp_register_operand" ""))] + "reload_completed" + [(set (reg:P SP_REG) (plus:P (reg:P SP_REG) (match_dup 2))) + (set (mem:XF (reg:P SP_REG)) (match_dup 1))] + "operands[2] = GEN_INT (-GET_MODE_SIZE (XFmode));") + +(define_split + [(set (match_operand:XF 0 "push_operand" "") + (match_operand:XF 1 "general_operand" ""))] + "reload_completed + && !FP_REG_P (operands[1])" + [(const_int 0)] + "ix86_split_long_move (operands); DONE;") + +(define_insn "*pushdf" + [(set (match_operand:DF 0 "push_operand" "=<,<,<") + (match_operand:DF 1 "general_no_elim_operand" "f,rFo,Y2"))] + "TARGET_64BIT || TARGET_INTEGER_DFMODE_MOVES" +{ + /* This insn should be already split before reg-stack. */ + gcc_unreachable (); +} + [(set_attr "type" "multi") + (set_attr "unit" "i387,*,*") + (set_attr "mode" "DF,SI,DF")]) + +;; Size of pushdf is 3 (for sub) + 2 (for fstp) + memory operand size. +;; Size of pushdf using integer instructions is 2+2*memory operand size +;; On the average, pushdf using integers can be still shorter. Allow this +;; pattern for optimize_size too. + +(define_insn "*pushdf_nointeger" + [(set (match_operand:DF 0 "push_operand" "=<,<,<,<") + (match_operand:DF 1 "general_no_elim_operand" "f,Fo,*r,Y2"))] + "!(TARGET_64BIT || TARGET_INTEGER_DFMODE_MOVES)" +{ + /* This insn should be already split before reg-stack. */ + gcc_unreachable (); +} + [(set_attr "type" "multi") + (set_attr "unit" "i387,*,*,*") + (set_attr "mode" "DF,SI,SI,DF")]) + +;; %%% Kill this when call knows how to work this out. +(define_split + [(set (match_operand:DF 0 "push_operand" "") + (match_operand:DF 1 "any_fp_register_operand" ""))] + "reload_completed" + [(set (reg:P SP_REG) (plus:P (reg:P SP_REG) (const_int -8))) + (set (mem:DF (reg:P SP_REG)) (match_dup 1))]) + +(define_split + [(set (match_operand:DF 0 "push_operand" "") + (match_operand:DF 1 "general_operand" ""))] + "reload_completed + && !ANY_FP_REG_P (operands[1])" + [(const_int 0)] + "ix86_split_long_move (operands); DONE;") + +(define_insn "*pushsf_rex64" + [(set (match_operand:SF 0 "push_operand" "=X,X,X") + (match_operand:SF 1 "nonmemory_no_elim_operand" "f,rF,x"))] + "TARGET_64BIT" +{ + /* Anything else should be already split before reg-stack. */ + gcc_assert (which_alternative == 1); + return "push{q}\t%q1"; +} + [(set_attr "type" "multi,push,multi") + (set_attr "unit" "i387,*,*") + (set_attr "mode" "SF,DI,SF")]) + +(define_insn "*pushsf" + [(set (match_operand:SF 0 "push_operand" "=<,<,<") + (match_operand:SF 1 "general_no_elim_operand" "f,rFm,x"))] + "!TARGET_64BIT" +{ + /* Anything else should be already split before reg-stack. */ + gcc_assert (which_alternative == 1); + return "push{l}\t%1"; +} + [(set_attr "type" "multi,push,multi") + (set_attr "unit" "i387,*,*") + (set_attr "mode" "SF,SI,SF")]) + +(define_split + [(set (match_operand:SF 0 "push_operand" "") + (match_operand:SF 1 "memory_operand" ""))] + "reload_completed + && MEM_P (operands[1]) + && (operands[2] = find_constant_src (insn))" + [(set (match_dup 0) + (match_dup 2))]) + +;; %%% Kill this when call knows how to work this out. 
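+;;
+;; After reload the split below turns the push of an FP register into an
+;; explicit stack adjustment followed by a plain store, roughly:
+;;
+;;	subl	$4, %esp
+;;	movss	%xmm0, (%esp)	; or an i387 store such as fstps (%esp)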
+(define_split + [(set (match_operand:SF 0 "push_operand" "") + (match_operand:SF 1 "any_fp_register_operand" ""))] + "reload_completed" + [(set (reg:P SP_REG) (plus:P (reg:P SP_REG) (match_dup 2))) + (set (mem:SF (reg:P SP_REG)) (match_dup 1))] + "operands[2] = GEN_INT (-GET_MODE_SIZE (mode));") + +;; Floating point move instructions. + +(define_expand "movtf" + [(set (match_operand:TF 0 "nonimmediate_operand" "") + (match_operand:TF 1 "nonimmediate_operand" ""))] + "TARGET_SSE2" +{ + ix86_expand_move (TFmode, operands); + DONE; +}) + +(define_expand "mov" + [(set (match_operand:X87MODEF 0 "nonimmediate_operand" "") + (match_operand:X87MODEF 1 "general_operand" ""))] + "" + "ix86_expand_move (mode, operands); DONE;") + +(define_insn "*movtf_internal" + [(set (match_operand:TF 0 "nonimmediate_operand" "=x,m,x,?r,?o") + (match_operand:TF 1 "general_operand" "xm,x,C,roF,Fr"))] + "TARGET_SSE2 + && !(MEM_P (operands[0]) && MEM_P (operands[1]))" +{ + switch (which_alternative) + { + case 0: + case 1: + if (get_attr_mode (insn) == MODE_V4SF) + return "%vmovaps\t{%1, %0|%0, %1}"; + else + return "%vmovdqa\t{%1, %0|%0, %1}"; + case 2: + if (get_attr_mode (insn) == MODE_V4SF) + return "%vxorps\t%0, %d0"; + else + return "%vpxor\t%0, %d0"; + case 3: + case 4: + return "#"; + default: + gcc_unreachable (); + } +} + [(set_attr "type" "ssemov,ssemov,sselog1,*,*") + (set_attr "prefix" "maybe_vex,maybe_vex,maybe_vex,*,*") + (set (attr "mode") + (cond [(eq_attr "alternative" "0,2") + (if_then_else + (ne (symbol_ref "optimize_function_for_size_p (cfun)") + (const_int 0)) + (const_string "V4SF") + (const_string "TI")) + (eq_attr "alternative" "1") + (if_then_else + (ior (ne (symbol_ref "TARGET_SSE_TYPELESS_STORES") + (const_int 0)) + (ne (symbol_ref "optimize_function_for_size_p (cfun)") + (const_int 0))) + (const_string "V4SF") + (const_string "TI"))] + (const_string "DI")))]) + +(define_split + [(set (match_operand:TF 0 "nonimmediate_operand" "") + (match_operand:TF 1 "general_operand" ""))] + "reload_completed + && !(SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]))" + [(const_int 0)] + "ix86_split_long_move (operands); DONE;") + +(define_insn "*movxf_internal" + [(set (match_operand:XF 0 "nonimmediate_operand" "=f,m,f,r,o") + (match_operand:XF 1 "general_operand" "fm,f,G,roF,Fr"))] + "optimize_function_for_speed_p (cfun) + && !(MEM_P (operands[0]) && MEM_P (operands[1])) + && (reload_in_progress || reload_completed + || (ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_LARGE) + || GET_CODE (operands[1]) != CONST_DOUBLE + || memory_operand (operands[0], XFmode))" +{ + switch (which_alternative) + { + case 0: + case 1: + return output_387_reg_move (insn, operands); + + case 2: + return standard_80387_constant_opcode (operands[1]); + + case 3: case 4: + return "#"; + + default: + gcc_unreachable (); + } +} + [(set_attr "type" "fmov,fmov,fmov,multi,multi") + (set_attr "mode" "XF,XF,XF,SI,SI")]) + +;; Do not use integer registers when optimizing for size +(define_insn "*movxf_internal_nointeger" + [(set (match_operand:XF 0 "nonimmediate_operand" "=f,m,f,*r,o") + (match_operand:XF 1 "general_operand" "fm,f,G,*roF,F*r"))] + "optimize_function_for_size_p (cfun) + && !(MEM_P (operands[0]) && MEM_P (operands[1])) + && (reload_in_progress || reload_completed + || (ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_LARGE) + || standard_80387_constant_p (operands[1]) > 0 + || GET_CODE (operands[1]) != CONST_DOUBLE + || memory_operand (operands[0], XFmode))" +{ + switch (which_alternative) + { + case 0: + case 1: + return 
output_387_reg_move (insn, operands); + + case 2: + return standard_80387_constant_opcode (operands[1]); + + case 3: case 4: + return "#"; + default: + gcc_unreachable (); + } +} + [(set_attr "type" "fmov,fmov,fmov,multi,multi") + (set_attr "mode" "XF,XF,XF,SI,SI")]) + +(define_split + [(set (match_operand:XF 0 "nonimmediate_operand" "") + (match_operand:XF 1 "general_operand" ""))] + "reload_completed + && !(MEM_P (operands[0]) && MEM_P (operands[1])) + && ! (FP_REG_P (operands[0]) || + (GET_CODE (operands[0]) == SUBREG + && FP_REG_P (SUBREG_REG (operands[0])))) + && ! (FP_REG_P (operands[1]) || + (GET_CODE (operands[1]) == SUBREG + && FP_REG_P (SUBREG_REG (operands[1]))))" + [(const_int 0)] + "ix86_split_long_move (operands); DONE;") + +(define_insn "*movdf_internal_rex64" + [(set (match_operand:DF 0 "nonimmediate_operand" + "=f,m,f,r ,m,!r,!o,Y2*x,Y2*x,Y2*x,m ,Yi,r ") + (match_operand:DF 1 "general_operand" + "fm,f,G,rm,r,F ,F ,C ,Y2*x,m ,Y2*x,r ,Yi"))] + "TARGET_64BIT && !(MEM_P (operands[0]) && MEM_P (operands[1])) + && (reload_in_progress || reload_completed + || (ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_LARGE) + || (!(TARGET_SSE2 && TARGET_SSE_MATH) + && optimize_function_for_size_p (cfun) + && standard_80387_constant_p (operands[1]) > 0) + || GET_CODE (operands[1]) != CONST_DOUBLE + || memory_operand (operands[0], DFmode))" +{ + switch (which_alternative) + { + case 0: + case 1: + return output_387_reg_move (insn, operands); + + case 2: + return standard_80387_constant_opcode (operands[1]); + + case 3: + case 4: + return "mov{q}\t{%1, %0|%0, %1}"; + + case 5: + return "movabs{q}\t{%1, %0|%0, %1}"; + + case 6: + return "#"; + + case 7: + switch (get_attr_mode (insn)) + { + case MODE_V4SF: + return "%vxorps\t%0, %d0"; + case MODE_V2DF: + if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL) + return "%vxorps\t%0, %d0"; + else + return "%vxorpd\t%0, %d0"; + case MODE_TI: + if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL) + return "%vxorps\t%0, %d0"; + else + return "%vpxor\t%0, %d0"; + default: + gcc_unreachable (); + } + case 8: + case 9: + case 10: + switch (get_attr_mode (insn)) + { + case MODE_V4SF: + return "%vmovaps\t{%1, %0|%0, %1}"; + case MODE_V2DF: + if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL) + return "%vmovaps\t{%1, %0|%0, %1}"; + else + return "%vmovapd\t{%1, %0|%0, %1}"; + case MODE_TI: + if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL) + return "%vmovaps\t{%1, %0|%0, %1}"; + else + return "%vmovdqa\t{%1, %0|%0, %1}"; + case MODE_DI: + return "%vmovq\t{%1, %0|%0, %1}"; + case MODE_DF: + if (TARGET_AVX) + { + if (REG_P (operands[0]) && REG_P (operands[1])) + return "vmovsd\t{%1, %0, %0|%0, %0, %1}"; + else + return "vmovsd\t{%1, %0|%0, %1}"; + } + else + return "movsd\t{%1, %0|%0, %1}"; + case MODE_V1DF: + return "%vmovlpd\t{%1, %d0|%d0, %1}"; + case MODE_V2SF: + return "%vmovlps\t{%1, %d0|%d0, %1}"; + default: + gcc_unreachable (); + } + + case 11: + case 12: + /* Handle broken assemblers that require movd instead of movq. 
*/ + return "%vmovd\t{%1, %0|%0, %1}"; + + default: + gcc_unreachable(); + } +} + [(set_attr "type" "fmov,fmov,fmov,imov,imov,imov,multi,sselog1,ssemov,ssemov,ssemov,ssemov,ssemov") + (set (attr "modrm") + (if_then_else + (and (eq_attr "alternative" "5") (eq_attr "type" "imov")) + (const_string "0") + (const_string "*"))) + (set (attr "length_immediate") + (if_then_else + (and (eq_attr "alternative" "5") (eq_attr "type" "imov")) + (const_string "8") + (const_string "*"))) + (set (attr "prefix") + (if_then_else (eq_attr "alternative" "0,1,2,3,4,5,6") + (const_string "orig") + (const_string "maybe_vex"))) + (set (attr "prefix_data16") + (if_then_else (eq_attr "mode" "V1DF") + (const_string "1") + (const_string "*"))) + (set (attr "mode") + (cond [(eq_attr "alternative" "0,1,2") + (const_string "DF") + (eq_attr "alternative" "3,4,5,6,11,12") + (const_string "DI") + + /* For SSE1, we have many fewer alternatives. */ + (eq (symbol_ref "TARGET_SSE2") (const_int 0)) + (cond [(eq_attr "alternative" "7,8") + (const_string "V4SF") + ] + (const_string "V2SF")) + + /* xorps is one byte shorter. */ + (eq_attr "alternative" "7") + (cond [(ne (symbol_ref "optimize_function_for_size_p (cfun)") + (const_int 0)) + (const_string "V4SF") + (ne (symbol_ref "TARGET_SSE_LOAD0_BY_PXOR") + (const_int 0)) + (const_string "TI") + ] + (const_string "V2DF")) + + /* For architectures resolving dependencies on + whole SSE registers use APD move to break dependency + chains, otherwise use short move to avoid extra work. + + movaps encodes one byte shorter. */ + (eq_attr "alternative" "8") + (cond + [(ne (symbol_ref "optimize_function_for_size_p (cfun)") + (const_int 0)) + (const_string "V4SF") + (ne (symbol_ref "TARGET_SSE_PARTIAL_REG_DEPENDENCY") + (const_int 0)) + (const_string "V2DF") + ] + (const_string "DF")) + /* For architectures resolving dependencies on register + parts we may avoid extra work to zero out upper part + of register. 
*/ + (eq_attr "alternative" "9") + (if_then_else + (ne (symbol_ref "TARGET_SSE_SPLIT_REGS") + (const_int 0)) + (const_string "V1DF") + (const_string "DF")) + ] + (const_string "DF")))]) + +(define_insn "*movdf_internal" + [(set (match_operand:DF 0 "nonimmediate_operand" + "=f,m,f,r ,o ,Y2*x,Y2*x,Y2*x,m ") + (match_operand:DF 1 "general_operand" + "fm,f,G,roF,Fr,C ,Y2*x,m ,Y2*x"))] + "!TARGET_64BIT && !(MEM_P (operands[0]) && MEM_P (operands[1])) + && optimize_function_for_speed_p (cfun) + && TARGET_INTEGER_DFMODE_MOVES + && (reload_in_progress || reload_completed + || (ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_LARGE) + || GET_CODE (operands[1]) != CONST_DOUBLE + || memory_operand (operands[0], DFmode))" +{ + switch (which_alternative) + { + case 0: + case 1: + return output_387_reg_move (insn, operands); + + case 2: + return standard_80387_constant_opcode (operands[1]); + + case 3: + case 4: + return "#"; + + case 5: + switch (get_attr_mode (insn)) + { + case MODE_V4SF: + return "%vxorps\t%0, %d0"; + case MODE_V2DF: + if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL) + return "%vxorps\t%0, %d0"; + else + return "%vxorpd\t%0, %d0"; + case MODE_TI: + if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL) + return "%vxorps\t%0, %d0"; + else + return "%vpxor\t%0, %d0"; + default: + gcc_unreachable (); + } + case 6: + case 7: + case 8: + switch (get_attr_mode (insn)) + { + case MODE_V4SF: + return "%vmovaps\t{%1, %0|%0, %1}"; + case MODE_V2DF: + if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL) + return "%vmovaps\t{%1, %0|%0, %1}"; + else + return "%vmovapd\t{%1, %0|%0, %1}"; + case MODE_TI: + if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL) + return "%vmovaps\t{%1, %0|%0, %1}"; + else + return "%vmovdqa\t{%1, %0|%0, %1}"; + case MODE_DI: + return "%vmovq\t{%1, %0|%0, %1}"; + case MODE_DF: + if (TARGET_AVX) + { + if (REG_P (operands[0]) && REG_P (operands[1])) + return "vmovsd\t{%1, %0, %0|%0, %0, %1}"; + else + return "vmovsd\t{%1, %0|%0, %1}"; + } + else + return "movsd\t{%1, %0|%0, %1}"; + case MODE_V1DF: + if (TARGET_AVX) + { + if (REG_P (operands[0])) + return "vmovlpd\t{%1, %0, %0|%0, %0, %1}"; + else + return "vmovlpd\t{%1, %0|%0, %1}"; + } + else + return "movlpd\t{%1, %0|%0, %1}"; + case MODE_V2SF: + if (TARGET_AVX) + { + if (REG_P (operands[0])) + return "vmovlps\t{%1, %0, %0|%0, %0, %1}"; + else + return "vmovlps\t{%1, %0|%0, %1}"; + } + else + return "movlps\t{%1, %0|%0, %1}"; + default: + gcc_unreachable (); + } + + default: + gcc_unreachable (); + } +} + [(set_attr "type" "fmov,fmov,fmov,multi,multi,sselog1,ssemov,ssemov,ssemov") + (set (attr "prefix") + (if_then_else (eq_attr "alternative" "0,1,2,3,4") + (const_string "orig") + (const_string "maybe_vex"))) + (set (attr "prefix_data16") + (if_then_else (eq_attr "mode" "V1DF") + (const_string "1") + (const_string "*"))) + (set (attr "mode") + (cond [(eq_attr "alternative" "0,1,2") + (const_string "DF") + (eq_attr "alternative" "3,4") + (const_string "SI") + + /* For SSE1, we have many fewer alternatives. */ + (eq (symbol_ref "TARGET_SSE2") (const_int 0)) + (cond [(eq_attr "alternative" "5,6") + (const_string "V4SF") + ] + (const_string "V2SF")) + + /* xorps is one byte shorter. 
*/ + (eq_attr "alternative" "5") + (cond [(ne (symbol_ref "optimize_function_for_size_p (cfun)") + (const_int 0)) + (const_string "V4SF") + (ne (symbol_ref "TARGET_SSE_LOAD0_BY_PXOR") + (const_int 0)) + (const_string "TI") + ] + (const_string "V2DF")) + + /* For architectures resolving dependencies on + whole SSE registers use APD move to break dependency + chains, otherwise use short move to avoid extra work. + + movaps encodes one byte shorter. */ + (eq_attr "alternative" "6") + (cond + [(ne (symbol_ref "optimize_function_for_size_p (cfun)") + (const_int 0)) + (const_string "V4SF") + (ne (symbol_ref "TARGET_SSE_PARTIAL_REG_DEPENDENCY") + (const_int 0)) + (const_string "V2DF") + ] + (const_string "DF")) + /* For architectures resolving dependencies on register + parts we may avoid extra work to zero out upper part + of register. */ + (eq_attr "alternative" "7") + (if_then_else + (ne (symbol_ref "TARGET_SSE_SPLIT_REGS") + (const_int 0)) + (const_string "V1DF") + (const_string "DF")) + ] + (const_string "DF")))]) + +;; Moving is usually shorter when only FP registers are used. This separate +;; movdf pattern avoids the use of integer registers for FP operations +;; when optimizing for size. + +(define_insn "*movdf_internal_nointeger" + [(set (match_operand:DF 0 "nonimmediate_operand" + "=f,m,f,*r ,o ,Y2*x,Y2*x,Y2*x ,m ") + (match_operand:DF 1 "general_operand" + "fm,f,G,*roF,F*r,C ,Y2*x,mY2*x,Y2*x"))] + "!TARGET_64BIT && !(MEM_P (operands[0]) && MEM_P (operands[1])) + && (optimize_function_for_size_p (cfun) + || !TARGET_INTEGER_DFMODE_MOVES) + && (reload_in_progress || reload_completed + || (ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_LARGE) + || (!(TARGET_SSE2 && TARGET_SSE_MATH) + && optimize_function_for_size_p (cfun) + && !memory_operand (operands[0], DFmode) + && standard_80387_constant_p (operands[1]) > 0) + || GET_CODE (operands[1]) != CONST_DOUBLE + || ((optimize_function_for_size_p (cfun) + || !TARGET_MEMORY_MISMATCH_STALL) + && memory_operand (operands[0], DFmode)))" +{ + switch (which_alternative) + { + case 0: + case 1: + return output_387_reg_move (insn, operands); + + case 2: + return standard_80387_constant_opcode (operands[1]); + + case 3: + case 4: + return "#"; + + case 5: + switch (get_attr_mode (insn)) + { + case MODE_V4SF: + return "%vxorps\t%0, %d0"; + case MODE_V2DF: + if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL) + return "%vxorps\t%0, %d0"; + else + return "%vxorpd\t%0, %d0"; + case MODE_TI: + if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL) + return "%vxorps\t%0, %d0"; + else + return "%vpxor\t%0, %d0"; + default: + gcc_unreachable (); + } + case 6: + case 7: + case 8: + switch (get_attr_mode (insn)) + { + case MODE_V4SF: + return "%vmovaps\t{%1, %0|%0, %1}"; + case MODE_V2DF: + if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL) + return "%vmovaps\t{%1, %0|%0, %1}"; + else + return "%vmovapd\t{%1, %0|%0, %1}"; + case MODE_TI: + if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL) + return "%vmovaps\t{%1, %0|%0, %1}"; + else + return "%vmovdqa\t{%1, %0|%0, %1}"; + case MODE_DI: + return "%vmovq\t{%1, %0|%0, %1}"; + case MODE_DF: + if (TARGET_AVX) + { + if (REG_P (operands[0]) && REG_P (operands[1])) + return "vmovsd\t{%1, %0, %0|%0, %0, %1}"; + else + return "vmovsd\t{%1, %0|%0, %1}"; + } + else + return "movsd\t{%1, %0|%0, %1}"; + case MODE_V1DF: + if (TARGET_AVX) + { + if (REG_P (operands[0])) + return "vmovlpd\t{%1, %0, %0|%0, %0, %1}"; + else + return "vmovlpd\t{%1, %0|%0, %1}"; + } + else + return "movlpd\t{%1, %0|%0, %1}"; + case MODE_V2SF: + if (TARGET_AVX) + { + if (REG_P 
(operands[0])) + return "vmovlps\t{%1, %0, %0|%0, %0, %1}"; + else + return "vmovlps\t{%1, %0|%0, %1}"; + } + else + return "movlps\t{%1, %0|%0, %1}"; + default: + gcc_unreachable (); + } + + default: + gcc_unreachable (); + } +} + [(set_attr "type" "fmov,fmov,fmov,multi,multi,sselog1,ssemov,ssemov,ssemov") + (set (attr "prefix") + (if_then_else (eq_attr "alternative" "0,1,2,3,4") + (const_string "orig") + (const_string "maybe_vex"))) + (set (attr "prefix_data16") + (if_then_else (eq_attr "mode" "V1DF") + (const_string "1") + (const_string "*"))) + (set (attr "mode") + (cond [(eq_attr "alternative" "0,1,2") + (const_string "DF") + (eq_attr "alternative" "3,4") + (const_string "SI") + + /* For SSE1, we have many fewer alternatives. */ + (eq (symbol_ref "TARGET_SSE2") (const_int 0)) + (cond [(eq_attr "alternative" "5,6") + (const_string "V4SF") + ] + (const_string "V2SF")) + + /* xorps is one byte shorter. */ + (eq_attr "alternative" "5") + (cond [(ne (symbol_ref "optimize_function_for_size_p (cfun)") + (const_int 0)) + (const_string "V4SF") + (ne (symbol_ref "TARGET_SSE_LOAD0_BY_PXOR") + (const_int 0)) + (const_string "TI") + ] + (const_string "V2DF")) + + /* For architectures resolving dependencies on + whole SSE registers use APD move to break dependency + chains, otherwise use short move to avoid extra work. + + movaps encodes one byte shorter. */ + (eq_attr "alternative" "6") + (cond + [(ne (symbol_ref "optimize_function_for_size_p (cfun)") + (const_int 0)) + (const_string "V4SF") + (ne (symbol_ref "TARGET_SSE_PARTIAL_REG_DEPENDENCY") + (const_int 0)) + (const_string "V2DF") + ] + (const_string "DF")) + /* For architectures resolving dependencies on register + parts we may avoid extra work to zero out upper part + of register. */ + (eq_attr "alternative" "7") + (if_then_else + (ne (symbol_ref "TARGET_SSE_SPLIT_REGS") + (const_int 0)) + (const_string "V1DF") + (const_string "DF")) + ] + (const_string "DF")))]) + +(define_split + [(set (match_operand:DF 0 "nonimmediate_operand" "") + (match_operand:DF 1 "general_operand" ""))] + "reload_completed + && !(MEM_P (operands[0]) && MEM_P (operands[1])) + && ! (ANY_FP_REG_P (operands[0]) || + (GET_CODE (operands[0]) == SUBREG + && ANY_FP_REG_P (SUBREG_REG (operands[0])))) + && ! 
(ANY_FP_REG_P (operands[1]) || + (GET_CODE (operands[1]) == SUBREG + && ANY_FP_REG_P (SUBREG_REG (operands[1]))))" + [(const_int 0)] + "ix86_split_long_move (operands); DONE;") + +(define_insn "*movsf_internal" + [(set (match_operand:SF 0 "nonimmediate_operand" + "=f,m,f,r ,m ,x,x,x ,m,!*y,!m,!*y,?Yi,?r,!*Ym,!r") + (match_operand:SF 1 "general_operand" + "fm,f,G,rmF,Fr,C,x,xm,x,m ,*y,*y ,r ,Yi,r ,*Ym"))] + "!(MEM_P (operands[0]) && MEM_P (operands[1])) + && (reload_in_progress || reload_completed + || (ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_LARGE) + || (!TARGET_SSE_MATH && optimize_function_for_size_p (cfun) + && standard_80387_constant_p (operands[1]) > 0) + || GET_CODE (operands[1]) != CONST_DOUBLE + || memory_operand (operands[0], SFmode))" +{ + switch (which_alternative) + { + case 0: + case 1: + return output_387_reg_move (insn, operands); + + case 2: + return standard_80387_constant_opcode (operands[1]); + + case 3: + case 4: + return "mov{l}\t{%1, %0|%0, %1}"; + case 5: + if (get_attr_mode (insn) == MODE_TI) + return "%vpxor\t%0, %d0"; + else + return "%vxorps\t%0, %d0"; + case 6: + if (get_attr_mode (insn) == MODE_V4SF) + return "%vmovaps\t{%1, %0|%0, %1}"; + else + return "%vmovss\t{%1, %d0|%d0, %1}"; + case 7: + if (TARGET_AVX) + return REG_P (operands[1]) ? "vmovss\t{%1, %0, %0|%0, %0, %1}" + : "vmovss\t{%1, %0|%0, %1}"; + else + return "movss\t{%1, %0|%0, %1}"; + case 8: + return "%vmovss\t{%1, %0|%0, %1}"; + + case 9: case 10: case 14: case 15: + return "movd\t{%1, %0|%0, %1}"; + + case 11: + return "movq\t{%1, %0|%0, %1}"; + + case 12: case 13: + return "%vmovd\t{%1, %0|%0, %1}"; + + default: + gcc_unreachable (); + } +} + [(set_attr "type" "fmov,fmov,fmov,imov,imov,sselog1,ssemov,ssemov,ssemov,mmxmov,mmxmov,mmxmov,ssemov,ssemov,mmxmov,mmxmov") + (set (attr "prefix") + (if_then_else (eq_attr "alternative" "5,6,7,8,12,13") + (const_string "maybe_vex") + (const_string "orig"))) + (set (attr "mode") + (cond [(eq_attr "alternative" "3,4,9,10") + (const_string "SI") + (eq_attr "alternative" "5") + (if_then_else + (and (and (ne (symbol_ref "TARGET_SSE_LOAD0_BY_PXOR") + (const_int 0)) + (ne (symbol_ref "TARGET_SSE2") + (const_int 0))) + (eq (symbol_ref "optimize_function_for_size_p (cfun)") + (const_int 0))) + (const_string "TI") + (const_string "V4SF")) + /* For architectures resolving dependencies on + whole SSE registers use APS move to break dependency + chains, otherwise use short move to avoid extra work. + + Do the same for architectures resolving dependencies on + the parts. While in DF mode it is better to always handle + just register parts, the SF mode is different due to lack + of instructions to load just part of the register. It is + better to maintain the whole registers in single format + to avoid problems on using packed logical operations. 
*/ + (eq_attr "alternative" "6") + (if_then_else + (ior (ne (symbol_ref "TARGET_SSE_PARTIAL_REG_DEPENDENCY") + (const_int 0)) + (ne (symbol_ref "TARGET_SSE_SPLIT_REGS") + (const_int 0))) + (const_string "V4SF") + (const_string "SF")) + (eq_attr "alternative" "11") + (const_string "DI")] + (const_string "SF")))]) + +(define_split + [(set (match_operand 0 "register_operand" "") + (match_operand 1 "memory_operand" ""))] + "reload_completed + && MEM_P (operands[1]) + && (GET_MODE (operands[0]) == TFmode + || GET_MODE (operands[0]) == XFmode + || GET_MODE (operands[0]) == DFmode + || GET_MODE (operands[0]) == SFmode) + && (operands[2] = find_constant_src (insn))" + [(set (match_dup 0) (match_dup 2))] +{ + rtx c = operands[2]; + rtx r = operands[0]; + + if (GET_CODE (r) == SUBREG) + r = SUBREG_REG (r); + + if (SSE_REG_P (r)) + { + if (!standard_sse_constant_p (c)) + FAIL; + } + else if (FP_REG_P (r)) + { + if (standard_80387_constant_p (c) < 1) + FAIL; + } + else if (MMX_REG_P (r)) + FAIL; +}) + +(define_split + [(set (match_operand 0 "register_operand" "") + (float_extend (match_operand 1 "memory_operand" "")))] + "reload_completed + && MEM_P (operands[1]) + && (GET_MODE (operands[0]) == TFmode + || GET_MODE (operands[0]) == XFmode + || GET_MODE (operands[0]) == DFmode + || GET_MODE (operands[0]) == SFmode) + && (operands[2] = find_constant_src (insn))" + [(set (match_dup 0) (match_dup 2))] +{ + rtx c = operands[2]; + rtx r = operands[0]; + + if (GET_CODE (r) == SUBREG) + r = SUBREG_REG (r); + + if (SSE_REG_P (r)) + { + if (!standard_sse_constant_p (c)) + FAIL; + } + else if (FP_REG_P (r)) + { + if (standard_80387_constant_p (c) < 1) + FAIL; + } + else if (MMX_REG_P (r)) + FAIL; +}) + +;; Split the load of -0.0 or -1.0 into fldz;fchs or fld1;fchs sequence +(define_split + [(set (match_operand:X87MODEF 0 "register_operand" "") + (match_operand:X87MODEF 1 "immediate_operand" ""))] + "reload_completed && FP_REGNO_P (REGNO (operands[0])) + && (standard_80387_constant_p (operands[1]) == 8 + || standard_80387_constant_p (operands[1]) == 9)" + [(set (match_dup 0)(match_dup 1)) + (set (match_dup 0) + (neg:X87MODEF (match_dup 0)))] +{ + REAL_VALUE_TYPE r; + + REAL_VALUE_FROM_CONST_DOUBLE (r, operands[1]); + if (real_isnegzero (&r)) + operands[1] = CONST0_RTX (mode); + else + operands[1] = CONST1_RTX (mode); +}) + +(define_insn "swapxf" + [(set (match_operand:XF 0 "register_operand" "+f") + (match_operand:XF 1 "register_operand" "+f")) + (set (match_dup 1) + (match_dup 0))] + "TARGET_80387" +{ + if (STACK_TOP_P (operands[0])) + return "fxch\t%1"; + else + return "fxch\t%0"; +} + [(set_attr "type" "fxch") + (set_attr "mode" "XF")]) + +(define_insn "*swap" + [(set (match_operand:MODEF 0 "fp_register_operand" "+f") + (match_operand:MODEF 1 "fp_register_operand" "+f")) + (set (match_dup 1) + (match_dup 0))] + "TARGET_80387 || reload_completed" +{ + if (STACK_TOP_P (operands[0])) + return "fxch\t%1"; + else + return "fxch\t%0"; +} + [(set_attr "type" "fxch") + (set_attr "mode" "")]) + +;; Zero extension instructions + +(define_expand "zero_extendsidi2" + [(set (match_operand:DI 0 "nonimmediate_operand" "") + (zero_extend:DI (match_operand:SI 1 "nonimmediate_operand" "")))] + "" +{ + if (!TARGET_64BIT) + { + emit_insn (gen_zero_extendsidi2_1 (operands[0], operands[1])); + DONE; + } +}) + +(define_insn "*zero_extendsidi2_rex64" + [(set (match_operand:DI 0 "nonimmediate_operand" "=r,o,?*Ym,?*y,?*Yi,*Y2") + (zero_extend:DI + (match_operand:SI 1 "nonimmediate_operand" "rm,0,r ,m ,r ,m")))] + "TARGET_64BIT" + "@ + 
mov{l}\t{%1, %k0|%k0, %1} + # + movd\t{%1, %0|%0, %1} + movd\t{%1, %0|%0, %1} + %vmovd\t{%1, %0|%0, %1} + %vmovd\t{%1, %0|%0, %1}" + [(set_attr "type" "imovx,imov,mmxmov,mmxmov,ssemov,ssemov") + (set_attr "prefix" "orig,*,orig,orig,maybe_vex,maybe_vex") + (set_attr "prefix_0f" "0,*,*,*,*,*") + (set_attr "mode" "SI,DI,DI,DI,TI,TI")]) + +(define_split + [(set (match_operand:DI 0 "memory_operand" "") + (zero_extend:DI (match_dup 0)))] + "TARGET_64BIT" + [(set (match_dup 4) (const_int 0))] + "split_double_mode (DImode, &operands[0], 1, &operands[3], &operands[4]);") + +;; %%% Kill me once multi-word ops are sane. +(define_insn "zero_extendsidi2_1" + [(set (match_operand:DI 0 "nonimmediate_operand" "=r,?r,?o,?*Ym,?*y,?*Yi,*Y2") + (zero_extend:DI + (match_operand:SI 1 "nonimmediate_operand" "0,rm,r ,r ,m ,r ,m"))) + (clobber (reg:CC FLAGS_REG))] + "!TARGET_64BIT" + "@ + # + # + # + movd\t{%1, %0|%0, %1} + movd\t{%1, %0|%0, %1} + %vmovd\t{%1, %0|%0, %1} + %vmovd\t{%1, %0|%0, %1}" + [(set_attr "type" "multi,multi,multi,mmxmov,mmxmov,ssemov,ssemov") + (set_attr "prefix" "*,*,*,orig,orig,maybe_vex,maybe_vex") + (set_attr "mode" "SI,SI,SI,DI,DI,TI,TI")]) + +(define_split + [(set (match_operand:DI 0 "register_operand" "") + (zero_extend:DI (match_operand:SI 1 "register_operand" ""))) + (clobber (reg:CC FLAGS_REG))] + "!TARGET_64BIT && reload_completed + && true_regnum (operands[0]) == true_regnum (operands[1])" + [(set (match_dup 4) (const_int 0))] + "split_double_mode (DImode, &operands[0], 1, &operands[3], &operands[4]);") + +(define_split + [(set (match_operand:DI 0 "nonimmediate_operand" "") + (zero_extend:DI (match_operand:SI 1 "general_operand" ""))) + (clobber (reg:CC FLAGS_REG))] + "!TARGET_64BIT && reload_completed + && !(MMX_REG_P (operands[0]) || SSE_REG_P (operands[0]))" + [(set (match_dup 3) (match_dup 1)) + (set (match_dup 4) (const_int 0))] + "split_double_mode (DImode, &operands[0], 1, &operands[3], &operands[4]);") + +(define_insn "zero_extenddi2" + [(set (match_operand:DI 0 "register_operand" "=r") + (zero_extend:DI + (match_operand:SWI12 1 "nonimmediate_operand" "m")))] + "TARGET_64BIT" + "movz{l|x}\t{%1, %k0|%k0, %1}" + [(set_attr "type" "imovx") + (set_attr "mode" "SI")]) + +(define_expand "zero_extendhisi2" + [(set (match_operand:SI 0 "register_operand" "") + (zero_extend:SI (match_operand:HI 1 "nonimmediate_operand" "")))] + "" +{ + if (TARGET_ZERO_EXTEND_WITH_AND && optimize_function_for_speed_p (cfun)) + { + operands[1] = force_reg (HImode, operands[1]); + emit_insn (gen_zero_extendhisi2_and (operands[0], operands[1])); + DONE; + } +}) + +(define_insn_and_split "zero_extendhisi2_and" + [(set (match_operand:SI 0 "register_operand" "=r") + (zero_extend:SI (match_operand:HI 1 "register_operand" "0"))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_ZERO_EXTEND_WITH_AND && optimize_function_for_speed_p (cfun)" + "#" + "&& reload_completed" + [(parallel [(set (match_dup 0) (and:SI (match_dup 0) (const_int 65535))) + (clobber (reg:CC FLAGS_REG))])] + "" + [(set_attr "type" "alu1") + (set_attr "mode" "SI")]) + +(define_insn "*zero_extendhisi2_movzwl" + [(set (match_operand:SI 0 "register_operand" "=r") + (zero_extend:SI (match_operand:HI 1 "nonimmediate_operand" "rm")))] + "!TARGET_ZERO_EXTEND_WITH_AND + || optimize_function_for_size_p (cfun)" + "movz{wl|x}\t{%1, %0|%0, %1}" + [(set_attr "type" "imovx") + (set_attr "mode" "SI")]) + +(define_expand "zero_extendqi2" + [(parallel + [(set (match_operand:SWI24 0 "register_operand" "") + (zero_extend:SWI24 (match_operand:QI 1 
"nonimmediate_operand" ""))) + (clobber (reg:CC FLAGS_REG))])]) + +(define_insn "*zero_extendqi2_and" + [(set (match_operand:SWI24 0 "register_operand" "=r,?&q") + (zero_extend:SWI24 (match_operand:QI 1 "nonimmediate_operand" "0,qm"))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_ZERO_EXTEND_WITH_AND && optimize_function_for_speed_p (cfun)" + "#" + [(set_attr "type" "alu1") + (set_attr "mode" "")]) + +;; When source and destination does not overlap, clear destination +;; first and then do the movb +(define_split + [(set (match_operand:SWI24 0 "register_operand" "") + (zero_extend:SWI24 (match_operand:QI 1 "nonimmediate_operand" ""))) + (clobber (reg:CC FLAGS_REG))] + "reload_completed + && (TARGET_ZERO_EXTEND_WITH_AND && optimize_function_for_speed_p (cfun)) + && ANY_QI_REG_P (operands[0]) + && (ANY_QI_REG_P (operands[1]) || MEM_P (operands[1])) + && !reg_overlap_mentioned_p (operands[0], operands[1])" + [(set (strict_low_part (match_dup 2)) (match_dup 1))] +{ + operands[2] = gen_lowpart (QImode, operands[0]); + ix86_expand_clear (operands[0]); +}) + +(define_insn "*zero_extendqi2_movzbl_and" + [(set (match_operand:SWI24 0 "register_operand" "=r,r") + (zero_extend:SWI24 (match_operand:QI 1 "nonimmediate_operand" "qm,0"))) + (clobber (reg:CC FLAGS_REG))] + "!TARGET_ZERO_EXTEND_WITH_AND || optimize_function_for_size_p (cfun)" + "#" + [(set_attr "type" "imovx,alu1") + (set_attr "mode" "")]) + +;; For the movzbl case strip only the clobber +(define_split + [(set (match_operand:SWI24 0 "register_operand" "") + (zero_extend:SWI24 (match_operand:QI 1 "nonimmediate_operand" ""))) + (clobber (reg:CC FLAGS_REG))] + "reload_completed + && (!TARGET_ZERO_EXTEND_WITH_AND || optimize_function_for_size_p (cfun)) + && (!REG_P (operands[1]) || ANY_QI_REG_P (operands[1]))" + [(set (match_dup 0) + (zero_extend:SWI24 (match_dup 1)))]) + +; zero extend to SImode to avoid partial register stalls +(define_insn "*zero_extendqi2_movzbl" + [(set (match_operand:SWI24 0 "register_operand" "=r") + (zero_extend:SWI24 (match_operand:QI 1 "nonimmediate_operand" "qm")))] + "reload_completed + && (!TARGET_ZERO_EXTEND_WITH_AND || optimize_function_for_size_p (cfun))" + "movz{bl|x}\t{%1, %k0|%k0, %1}" + [(set_attr "type" "imovx") + (set_attr "mode" "SI")]) + +;; Rest is handled by single and. 
+(define_split + [(set (match_operand:SWI24 0 "register_operand" "") + (zero_extend:SWI24 (match_operand:QI 1 "register_operand" ""))) + (clobber (reg:CC FLAGS_REG))] + "reload_completed + && true_regnum (operands[0]) == true_regnum (operands[1])" + [(parallel [(set (match_dup 0) (and:SWI24 (match_dup 0) (const_int 255))) + (clobber (reg:CC FLAGS_REG))])]) + +;; Sign extension instructions + +(define_expand "extendsidi2" + [(set (match_operand:DI 0 "register_operand" "") + (sign_extend:DI (match_operand:SI 1 "register_operand" "")))] + "" +{ + if (!TARGET_64BIT) + { + emit_insn (gen_extendsidi2_1 (operands[0], operands[1])); + DONE; + } +}) + +(define_insn "*extendsidi2_rex64" + [(set (match_operand:DI 0 "register_operand" "=*a,r") + (sign_extend:DI (match_operand:SI 1 "nonimmediate_operand" "*0,rm")))] + "TARGET_64BIT" + "@ + {cltq|cdqe} + movs{lq|x}\t{%1, %0|%0, %1}" + [(set_attr "type" "imovx") + (set_attr "mode" "DI") + (set_attr "prefix_0f" "0") + (set_attr "modrm" "0,1")]) + +(define_insn "extendsidi2_1" + [(set (match_operand:DI 0 "nonimmediate_operand" "=*A,r,?r,?*o") + (sign_extend:DI (match_operand:SI 1 "register_operand" "0,0,r,r"))) + (clobber (reg:CC FLAGS_REG)) + (clobber (match_scratch:SI 2 "=X,X,X,&r"))] + "!TARGET_64BIT" + "#") + +;; Extend to memory case when source register does die. +(define_split + [(set (match_operand:DI 0 "memory_operand" "") + (sign_extend:DI (match_operand:SI 1 "register_operand" ""))) + (clobber (reg:CC FLAGS_REG)) + (clobber (match_operand:SI 2 "register_operand" ""))] + "(reload_completed + && dead_or_set_p (insn, operands[1]) + && !reg_mentioned_p (operands[1], operands[0]))" + [(set (match_dup 3) (match_dup 1)) + (parallel [(set (match_dup 1) (ashiftrt:SI (match_dup 1) (const_int 31))) + (clobber (reg:CC FLAGS_REG))]) + (set (match_dup 4) (match_dup 1))] + "split_double_mode (DImode, &operands[0], 1, &operands[3], &operands[4]);") + +;; Extend to memory case when source register does not die. +(define_split + [(set (match_operand:DI 0 "memory_operand" "") + (sign_extend:DI (match_operand:SI 1 "register_operand" ""))) + (clobber (reg:CC FLAGS_REG)) + (clobber (match_operand:SI 2 "register_operand" ""))] + "reload_completed" + [(const_int 0)] +{ + split_double_mode (DImode, &operands[0], 1, &operands[3], &operands[4]); + + emit_move_insn (operands[3], operands[1]); + + /* Generate a cltd if possible and doing so it profitable. */ + if ((optimize_function_for_size_p (cfun) || TARGET_USE_CLTD) + && true_regnum (operands[1]) == AX_REG + && true_regnum (operands[2]) == DX_REG) + { + emit_insn (gen_ashrsi3_cvt (operands[2], operands[1], GEN_INT (31))); + } + else + { + emit_move_insn (operands[2], operands[1]); + emit_insn (gen_ashrsi3_cvt (operands[2], operands[2], GEN_INT (31))); + } + emit_move_insn (operands[4], operands[2]); + DONE; +}) + +;; Extend to register case. Optimize case where source and destination +;; registers match and cases where we can use cltd. +(define_split + [(set (match_operand:DI 0 "register_operand" "") + (sign_extend:DI (match_operand:SI 1 "register_operand" ""))) + (clobber (reg:CC FLAGS_REG)) + (clobber (match_scratch:SI 2 ""))] + "reload_completed" + [(const_int 0)] +{ + split_double_mode (DImode, &operands[0], 1, &operands[3], &operands[4]); + + if (true_regnum (operands[3]) != true_regnum (operands[1])) + emit_move_insn (operands[3], operands[1]); + + /* Generate a cltd if possible and doing so it profitable. 
*/ + if ((optimize_function_for_size_p (cfun) || TARGET_USE_CLTD) + && true_regnum (operands[3]) == AX_REG + && true_regnum (operands[4]) == DX_REG) + { + emit_insn (gen_ashrsi3_cvt (operands[4], operands[3], GEN_INT (31))); + DONE; + } + + if (true_regnum (operands[4]) != true_regnum (operands[1])) + emit_move_insn (operands[4], operands[1]); + + emit_insn (gen_ashrsi3_cvt (operands[4], operands[4], GEN_INT (31))); + DONE; +}) + +(define_insn "extenddi2" + [(set (match_operand:DI 0 "register_operand" "=r") + (sign_extend:DI + (match_operand:SWI12 1 "nonimmediate_operand" "m")))] + "TARGET_64BIT" + "movs{q|x}\t{%1, %0|%0, %1}" + [(set_attr "type" "imovx") + (set_attr "mode" "DI")]) + +(define_insn "extendhisi2" + [(set (match_operand:SI 0 "register_operand" "=*a,r") + (sign_extend:SI (match_operand:HI 1 "nonimmediate_operand" "*0,rm")))] + "" +{ + switch (get_attr_prefix_0f (insn)) + { + case 0: + return "{cwtl|cwde}"; + default: + return "movs{wl|x}\t{%1, %0|%0, %1}"; + } +} + [(set_attr "type" "imovx") + (set_attr "mode" "SI") + (set (attr "prefix_0f") + ;; movsx is short decodable while cwtl is vector decoded. + (if_then_else (and (eq_attr "cpu" "!k6") + (eq_attr "alternative" "0")) + (const_string "0") + (const_string "1"))) + (set (attr "modrm") + (if_then_else (eq_attr "prefix_0f" "0") + (const_string "0") + (const_string "1")))]) + +(define_insn "*extendhisi2_zext" + [(set (match_operand:DI 0 "register_operand" "=*a,r") + (zero_extend:DI + (sign_extend:SI + (match_operand:HI 1 "nonimmediate_operand" "*0,rm"))))] + "TARGET_64BIT" +{ + switch (get_attr_prefix_0f (insn)) + { + case 0: + return "{cwtl|cwde}"; + default: + return "movs{wl|x}\t{%1, %k0|%k0, %1}"; + } +} + [(set_attr "type" "imovx") + (set_attr "mode" "SI") + (set (attr "prefix_0f") + ;; movsx is short decodable while cwtl is vector decoded. + (if_then_else (and (eq_attr "cpu" "!k6") + (eq_attr "alternative" "0")) + (const_string "0") + (const_string "1"))) + (set (attr "modrm") + (if_then_else (eq_attr "prefix_0f" "0") + (const_string "0") + (const_string "1")))]) + +(define_insn "extendqisi2" + [(set (match_operand:SI 0 "register_operand" "=r") + (sign_extend:SI (match_operand:QI 1 "nonimmediate_operand" "qm")))] + "" + "movs{bl|x}\t{%1, %0|%0, %1}" + [(set_attr "type" "imovx") + (set_attr "mode" "SI")]) + +(define_insn "*extendqisi2_zext" + [(set (match_operand:DI 0 "register_operand" "=r") + (zero_extend:DI + (sign_extend:SI (match_operand:QI 1 "nonimmediate_operand" "qm"))))] + "TARGET_64BIT" + "movs{bl|x}\t{%1, %k0|%k0, %1}" + [(set_attr "type" "imovx") + (set_attr "mode" "SI")]) + +(define_insn "extendqihi2" + [(set (match_operand:HI 0 "register_operand" "=*a,r") + (sign_extend:HI (match_operand:QI 1 "nonimmediate_operand" "*0,qm")))] + "" +{ + switch (get_attr_prefix_0f (insn)) + { + case 0: + return "{cbtw|cbw}"; + default: + return "movs{bw|x}\t{%1, %0|%0, %1}"; + } +} + [(set_attr "type" "imovx") + (set_attr "mode" "HI") + (set (attr "prefix_0f") + ;; movsx is short decodable while cwtl is vector decoded. + (if_then_else (and (eq_attr "cpu" "!k6") + (eq_attr "alternative" "0")) + (const_string "0") + (const_string "1"))) + (set (attr "modrm") + (if_then_else (eq_attr "prefix_0f" "0") + (const_string "0") + (const_string "1")))]) + +;; Conversions between float and double. + +;; These are all no-ops in the model used for the 80387. +;; So just emit moves. + +;; %%% Kill these when call knows how to work out a DFmode push earlier. 
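+;;
+;; The splits below fold the extension into the push: because an 80387
+;; register already holds the value in extended precision, pushing the
+;; float_extend of it is just a stack adjustment plus a wider store,
+;; roughly:
+;;
+;;	subl	$8, %esp
+;;	fstpl	(%esp)		; store %st(0) as a double; the extension is free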
+(define_split + [(set (match_operand:DF 0 "push_operand" "") + (float_extend:DF (match_operand:SF 1 "fp_register_operand" "")))] + "reload_completed" + [(set (reg:P SP_REG) (plus:P (reg:P SP_REG) (const_int -8))) + (set (mem:DF (reg:P SP_REG)) (float_extend:DF (match_dup 1)))]) + +(define_split + [(set (match_operand:XF 0 "push_operand" "") + (float_extend:XF (match_operand:MODEF 1 "fp_register_operand" "")))] + "reload_completed" + [(set (reg:P SP_REG) (plus:P (reg:P SP_REG) (match_dup 2))) + (set (mem:XF (reg:P SP_REG)) (float_extend:XF (match_dup 1)))] + "operands[2] = GEN_INT (-GET_MODE_SIZE (XFmode));") + +(define_expand "extendsfdf2" + [(set (match_operand:DF 0 "nonimmediate_operand" "") + (float_extend:DF (match_operand:SF 1 "general_operand" "")))] + "TARGET_80387 || (TARGET_SSE2 && TARGET_SSE_MATH)" +{ + /* ??? Needed for compress_float_constant since all fp constants + are LEGITIMATE_CONSTANT_P. */ + if (GET_CODE (operands[1]) == CONST_DOUBLE) + { + if ((!TARGET_SSE2 || TARGET_MIX_SSE_I387) + && standard_80387_constant_p (operands[1]) > 0) + { + operands[1] = simplify_const_unary_operation + (FLOAT_EXTEND, DFmode, operands[1], SFmode); + emit_move_insn_1 (operands[0], operands[1]); + DONE; + } + operands[1] = validize_mem (force_const_mem (SFmode, operands[1])); + } +}) + +/* For converting SF(xmm2) to DF(xmm1), use the following code instead of + cvtss2sd: + unpcklps xmm2,xmm2 ; packed conversion might crash on signaling NaNs + cvtps2pd xmm2,xmm1 + We do the conversion post reload to avoid producing of 128bit spills + that might lead to ICE on 32bit target. The sequence unlikely combine + anyway. */ +(define_split + [(set (match_operand:DF 0 "register_operand" "") + (float_extend:DF + (match_operand:SF 1 "nonimmediate_operand" "")))] + "TARGET_USE_VECTOR_FP_CONVERTS + && optimize_insn_for_speed_p () + && reload_completed && SSE_REG_P (operands[0])" + [(set (match_dup 2) + (float_extend:V2DF + (vec_select:V2SF + (match_dup 3) + (parallel [(const_int 0) (const_int 1)]))))] +{ + operands[2] = simplify_gen_subreg (V2DFmode, operands[0], DFmode, 0); + operands[3] = simplify_gen_subreg (V4SFmode, operands[0], DFmode, 0); + /* Use movss for loading from memory, unpcklps reg, reg for registers. + Try to avoid move when unpacking can be done in source. */ + if (REG_P (operands[1])) + { + /* If it is unsafe to overwrite upper half of source, we need + to move to destination and unpack there. 
*/ + if ((ORIGINAL_REGNO (operands[1]) < FIRST_PSEUDO_REGISTER + || PSEUDO_REGNO_BYTES (ORIGINAL_REGNO (operands[1])) > 4) + && true_regnum (operands[0]) != true_regnum (operands[1])) + { + rtx tmp = gen_rtx_REG (SFmode, true_regnum (operands[0])); + emit_move_insn (tmp, operands[1]); + } + else + operands[3] = simplify_gen_subreg (V4SFmode, operands[1], SFmode, 0); + emit_insn (gen_vec_interleave_lowv4sf (operands[3], operands[3], + operands[3])); + } + else + emit_insn (gen_vec_setv4sf_0 (operands[3], + CONST0_RTX (V4SFmode), operands[1])); +}) + +(define_insn "*extendsfdf2_mixed" + [(set (match_operand:DF 0 "nonimmediate_operand" "=f,m,x") + (float_extend:DF + (match_operand:SF 1 "nonimmediate_operand" "fm,f,xm")))] + "TARGET_SSE2 && TARGET_MIX_SSE_I387" +{ + switch (which_alternative) + { + case 0: + case 1: + return output_387_reg_move (insn, operands); + + case 2: + return "%vcvtss2sd\t{%1, %d0|%d0, %1}"; + + default: + gcc_unreachable (); + } +} + [(set_attr "type" "fmov,fmov,ssecvt") + (set_attr "prefix" "orig,orig,maybe_vex") + (set_attr "mode" "SF,XF,DF")]) + +(define_insn "*extendsfdf2_sse" + [(set (match_operand:DF 0 "nonimmediate_operand" "=x") + (float_extend:DF (match_operand:SF 1 "nonimmediate_operand" "xm")))] + "TARGET_SSE2 && TARGET_SSE_MATH" + "%vcvtss2sd\t{%1, %d0|%d0, %1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "maybe_vex") + (set_attr "mode" "DF")]) + +(define_insn "*extendsfdf2_i387" + [(set (match_operand:DF 0 "nonimmediate_operand" "=f,m") + (float_extend:DF (match_operand:SF 1 "nonimmediate_operand" "fm,f")))] + "TARGET_80387" + "* return output_387_reg_move (insn, operands);" + [(set_attr "type" "fmov") + (set_attr "mode" "SF,XF")]) + +(define_expand "extendxf2" + [(set (match_operand:XF 0 "nonimmediate_operand" "") + (float_extend:XF (match_operand:MODEF 1 "general_operand" "")))] + "TARGET_80387" +{ + /* ??? Needed for compress_float_constant since all fp constants + are LEGITIMATE_CONSTANT_P. */ + if (GET_CODE (operands[1]) == CONST_DOUBLE) + { + if (standard_80387_constant_p (operands[1]) > 0) + { + operands[1] = simplify_const_unary_operation + (FLOAT_EXTEND, XFmode, operands[1], mode); + emit_move_insn_1 (operands[0], operands[1]); + DONE; + } + operands[1] = validize_mem (force_const_mem (mode, operands[1])); + } +}) + +(define_insn "*extendxf2_i387" + [(set (match_operand:XF 0 "nonimmediate_operand" "=f,m") + (float_extend:XF + (match_operand:MODEF 1 "nonimmediate_operand" "fm,f")))] + "TARGET_80387" + "* return output_387_reg_move (insn, operands);" + [(set_attr "type" "fmov") + (set_attr "mode" ",XF")]) + +;; %%% This seems bad bad news. +;; This cannot output into an f-reg because there is no way to be sure +;; of truncating in that case. Otherwise this is just like a simple move +;; insn. So we pretend we can output to a reg in order to get better +;; register preferencing, but we really use a stack slot. + +;; Conversion from DFmode to SFmode. + +(define_expand "truncdfsf2" + [(set (match_operand:SF 0 "nonimmediate_operand" "") + (float_truncate:SF + (match_operand:DF 1 "nonimmediate_operand" "")))] + "TARGET_80387 || (TARGET_SSE2 && TARGET_SSE_MATH)" +{ + if (TARGET_SSE2 && TARGET_SSE_MATH && !TARGET_MIX_SSE_I387) + ; + else if (flag_unsafe_math_optimizations) + ; + else + { + enum ix86_stack_slot slot = (virtuals_instantiated + ? 
SLOT_TEMP + : SLOT_VIRTUAL); + rtx temp = assign_386_stack_local (SFmode, slot); + emit_insn (gen_truncdfsf2_with_temp (operands[0], operands[1], temp)); + DONE; + } +}) + +/* For converting DF(xmm2) to SF(xmm1), use the following code instead of + cvtsd2ss: + unpcklpd xmm2,xmm2 ; packed conversion might crash on signaling NaNs + cvtpd2ps xmm2,xmm1 + We do the conversion post reload to avoid producing of 128bit spills + that might lead to ICE on 32bit target. The sequence unlikely combine + anyway. */ +(define_split + [(set (match_operand:SF 0 "register_operand" "") + (float_truncate:SF + (match_operand:DF 1 "nonimmediate_operand" "")))] + "TARGET_USE_VECTOR_FP_CONVERTS + && optimize_insn_for_speed_p () + && reload_completed && SSE_REG_P (operands[0])" + [(set (match_dup 2) + (vec_concat:V4SF + (float_truncate:V2SF + (match_dup 4)) + (match_dup 3)))] +{ + operands[2] = simplify_gen_subreg (V4SFmode, operands[0], SFmode, 0); + operands[3] = CONST0_RTX (V2SFmode); + operands[4] = simplify_gen_subreg (V2DFmode, operands[0], SFmode, 0); + /* Use movsd for loading from memory, unpcklpd for registers. + Try to avoid move when unpacking can be done in source, or SSE3 + movddup is available. */ + if (REG_P (operands[1])) + { + if (!TARGET_SSE3 + && true_regnum (operands[0]) != true_regnum (operands[1]) + && (ORIGINAL_REGNO (operands[1]) < FIRST_PSEUDO_REGISTER + || PSEUDO_REGNO_BYTES (ORIGINAL_REGNO (operands[1])) > 8)) + { + rtx tmp = simplify_gen_subreg (DFmode, operands[0], SFmode, 0); + emit_move_insn (tmp, operands[1]); + operands[1] = tmp; + } + else if (!TARGET_SSE3) + operands[4] = simplify_gen_subreg (V2DFmode, operands[1], DFmode, 0); + emit_insn (gen_vec_dupv2df (operands[4], operands[1])); + } + else + emit_insn (gen_sse2_loadlpd (operands[4], + CONST0_RTX (V2DFmode), operands[1])); +}) + +(define_expand "truncdfsf2_with_temp" + [(parallel [(set (match_operand:SF 0 "" "") + (float_truncate:SF (match_operand:DF 1 "" ""))) + (clobber (match_operand:SF 2 "" ""))])]) + +(define_insn "*truncdfsf_fast_mixed" + [(set (match_operand:SF 0 "nonimmediate_operand" "=fm,x") + (float_truncate:SF + (match_operand:DF 1 "nonimmediate_operand" "f ,xm")))] + "TARGET_SSE2 && TARGET_MIX_SSE_I387 && flag_unsafe_math_optimizations" +{ + switch (which_alternative) + { + case 0: + return output_387_reg_move (insn, operands); + case 1: + return "%vcvtsd2ss\t{%1, %d0|%d0, %1}"; + default: + gcc_unreachable (); + } +} + [(set_attr "type" "fmov,ssecvt") + (set_attr "prefix" "orig,maybe_vex") + (set_attr "mode" "SF")]) + +;; Yes, this one doesn't depend on flag_unsafe_math_optimizations, +;; because nothing we do here is unsafe. 
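On TARGET_USE_VECTOR_FP_CONVERTS targets the splits above replace the scalar cvtss2sd/cvtsd2ss with an unpack followed by a packed conversion. A small host-side sketch using SSE2 intrinsics is shown below; the function names are illustrative, and it only demonstrates that both routes yield the same scalar value, it is not the code the splitter emits.

#include <emmintrin.h>
#include <stdio.h>

/* Scalar route: cvtsd2ss directly.  */
static float via_cvtsd2ss (double d)
{
  __m128d x = _mm_set_sd (d);
  return _mm_cvtss_f32 (_mm_cvtsd_ss (_mm_setzero_ps (), x));
}

/* Vector route used by the split: duplicate the lane (unpcklpd),
   then do a packed DF->SF conversion (cvtpd2ps).  */
static float via_packed_route (double d)
{
  __m128d x = _mm_set_sd (d);
  x = _mm_unpacklo_pd (x, x);
  return _mm_cvtss_f32 (_mm_cvtpd_ps (x));
}

int main (void)
{
  double d = 1.0 / 3.0;
  printf ("%a\n%a\n", via_cvtsd2ss (d), via_packed_route (d));
  return 0;
}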
+(define_insn "*truncdfsf_fast_sse" + [(set (match_operand:SF 0 "nonimmediate_operand" "=x") + (float_truncate:SF + (match_operand:DF 1 "nonimmediate_operand" "xm")))] + "TARGET_SSE2 && TARGET_SSE_MATH" + "%vcvtsd2ss\t{%1, %d0|%d0, %1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "maybe_vex") + (set_attr "mode" "SF")]) + +(define_insn "*truncdfsf_fast_i387" + [(set (match_operand:SF 0 "nonimmediate_operand" "=fm") + (float_truncate:SF + (match_operand:DF 1 "nonimmediate_operand" "f")))] + "TARGET_80387 && flag_unsafe_math_optimizations" + "* return output_387_reg_move (insn, operands);" + [(set_attr "type" "fmov") + (set_attr "mode" "SF")]) + +(define_insn "*truncdfsf_mixed" + [(set (match_operand:SF 0 "nonimmediate_operand" "=m,Y2 ,?f,?x,?*r") + (float_truncate:SF + (match_operand:DF 1 "nonimmediate_operand" "f ,Y2m,f ,f ,f"))) + (clobber (match_operand:SF 2 "memory_operand" "=X,X ,m ,m ,m"))] + "TARGET_MIX_SSE_I387" +{ + switch (which_alternative) + { + case 0: + return output_387_reg_move (insn, operands); + case 1: + return "%vcvtsd2ss\t{%1, %d0|%d0, %1}"; + + default: + return "#"; + } +} + [(set_attr "type" "fmov,ssecvt,multi,multi,multi") + (set_attr "unit" "*,*,i387,i387,i387") + (set_attr "prefix" "orig,maybe_vex,orig,orig,orig") + (set_attr "mode" "SF")]) + +(define_insn "*truncdfsf_i387" + [(set (match_operand:SF 0 "nonimmediate_operand" "=m,?f,?x,?*r") + (float_truncate:SF + (match_operand:DF 1 "nonimmediate_operand" "f ,f ,f ,f"))) + (clobber (match_operand:SF 2 "memory_operand" "=X,m ,m ,m"))] + "TARGET_80387" +{ + switch (which_alternative) + { + case 0: + return output_387_reg_move (insn, operands); + + default: + return "#"; + } +} + [(set_attr "type" "fmov,multi,multi,multi") + (set_attr "unit" "*,i387,i387,i387") + (set_attr "mode" "SF")]) + +(define_insn "*truncdfsf2_i387_1" + [(set (match_operand:SF 0 "memory_operand" "=m") + (float_truncate:SF + (match_operand:DF 1 "register_operand" "f")))] + "TARGET_80387 + && !(TARGET_SSE2 && TARGET_SSE_MATH) + && !TARGET_MIX_SSE_I387" + "* return output_387_reg_move (insn, operands);" + [(set_attr "type" "fmov") + (set_attr "mode" "SF")]) + +(define_split + [(set (match_operand:SF 0 "register_operand" "") + (float_truncate:SF + (match_operand:DF 1 "fp_register_operand" ""))) + (clobber (match_operand 2 "" ""))] + "reload_completed" + [(set (match_dup 2) (match_dup 1)) + (set (match_dup 0) (match_dup 2))] + "operands[1] = gen_rtx_REG (SFmode, true_regnum (operands[1]));") + +;; Conversion from XFmode to {SF,DF}mode + +(define_expand "truncxf2" + [(parallel [(set (match_operand:MODEF 0 "nonimmediate_operand" "") + (float_truncate:MODEF + (match_operand:XF 1 "register_operand" ""))) + (clobber (match_dup 2))])] + "TARGET_80387" +{ + if (flag_unsafe_math_optimizations) + { + rtx reg = REG_P (operands[0]) ? operands[0] : gen_reg_rtx (mode); + emit_insn (gen_truncxf2_i387_noop (reg, operands[1])); + if (reg != operands[0]) + emit_move_insn (operands[0], reg); + DONE; + } + else + { + enum ix86_stack_slot slot = (virtuals_instantiated + ? 
SLOT_TEMP + : SLOT_VIRTUAL); + operands[2] = assign_386_stack_local (mode, slot); + } +}) + +(define_insn "*truncxfsf2_mixed" + [(set (match_operand:SF 0 "nonimmediate_operand" "=m,?f,?x,?*r") + (float_truncate:SF + (match_operand:XF 1 "register_operand" "f ,f ,f ,f"))) + (clobber (match_operand:SF 2 "memory_operand" "=X,m ,m ,m"))] + "TARGET_80387" +{ + gcc_assert (!which_alternative); + return output_387_reg_move (insn, operands); +} + [(set_attr "type" "fmov,multi,multi,multi") + (set_attr "unit" "*,i387,i387,i387") + (set_attr "mode" "SF")]) + +(define_insn "*truncxfdf2_mixed" + [(set (match_operand:DF 0 "nonimmediate_operand" "=m,?f,?Y2,?*r") + (float_truncate:DF + (match_operand:XF 1 "register_operand" "f ,f ,f ,f"))) + (clobber (match_operand:DF 2 "memory_operand" "=X,m ,m ,m"))] + "TARGET_80387" +{ + gcc_assert (!which_alternative); + return output_387_reg_move (insn, operands); +} + [(set_attr "type" "fmov,multi,multi,multi") + (set_attr "unit" "*,i387,i387,i387") + (set_attr "mode" "DF")]) + +(define_insn "truncxf2_i387_noop" + [(set (match_operand:MODEF 0 "register_operand" "=f") + (float_truncate:MODEF + (match_operand:XF 1 "register_operand" "f")))] + "TARGET_80387 && flag_unsafe_math_optimizations" + "* return output_387_reg_move (insn, operands);" + [(set_attr "type" "fmov") + (set_attr "mode" "")]) + +(define_insn "*truncxf2_i387" + [(set (match_operand:MODEF 0 "memory_operand" "=m") + (float_truncate:MODEF + (match_operand:XF 1 "register_operand" "f")))] + "TARGET_80387" + "* return output_387_reg_move (insn, operands);" + [(set_attr "type" "fmov") + (set_attr "mode" "")]) + +(define_split + [(set (match_operand:MODEF 0 "register_operand" "") + (float_truncate:MODEF + (match_operand:XF 1 "register_operand" ""))) + (clobber (match_operand:MODEF 2 "memory_operand" ""))] + "TARGET_80387 && reload_completed" + [(set (match_dup 2) (float_truncate:MODEF (match_dup 1))) + (set (match_dup 0) (match_dup 2))]) + +(define_split + [(set (match_operand:MODEF 0 "memory_operand" "") + (float_truncate:MODEF + (match_operand:XF 1 "register_operand" ""))) + (clobber (match_operand:MODEF 2 "memory_operand" ""))] + "TARGET_80387" + [(set (match_dup 0) (float_truncate:MODEF (match_dup 1)))]) + +;; Signed conversion to DImode. + +(define_expand "fix_truncxfdi2" + [(parallel [(set (match_operand:DI 0 "nonimmediate_operand" "") + (fix:DI (match_operand:XF 1 "register_operand" ""))) + (clobber (reg:CC FLAGS_REG))])] + "TARGET_80387" +{ + if (TARGET_FISTTP) + { + emit_insn (gen_fix_truncdi_fisttp_i387_1 (operands[0], operands[1])); + DONE; + } +}) + +(define_expand "fix_truncdi2" + [(parallel [(set (match_operand:DI 0 "nonimmediate_operand" "") + (fix:DI (match_operand:MODEF 1 "register_operand" ""))) + (clobber (reg:CC FLAGS_REG))])] + "TARGET_80387 || (TARGET_64BIT && SSE_FLOAT_MODE_P (mode))" +{ + if (TARGET_FISTTP + && !(TARGET_64BIT && SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)) + { + emit_insn (gen_fix_truncdi_fisttp_i387_1 (operands[0], operands[1])); + DONE; + } + if (TARGET_64BIT && SSE_FLOAT_MODE_P (mode)) + { + rtx out = REG_P (operands[0]) ? operands[0] : gen_reg_rtx (DImode); + emit_insn (gen_fix_truncdi_sse (out, operands[1])); + if (out != operands[0]) + emit_move_insn (operands[0], out); + DONE; + } +}) + +;; Signed conversion to SImode. 
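The fix_trunc expanders above prefer the SSE3 fisttp path when TARGET_FISTTP because plain fist/fistp round according to the FPU control word (round-to-nearest by default), while C conversions must truncate toward zero; without fisttp the control word has to be switched, which is what the I387_TRUNC mode-switching machinery further down handles. A small host-side C sketch of that semantic gap (illustration only; link with -lm):

#include <stdio.h>
#include <fenv.h>
#include <math.h>

int main (void)
{
  double d = 2.75;

  fesetround (FE_TONEAREST);   /* the x87/SSE default rounding mode */

  /* The C cast truncates toward zero...  */
  printf ("(int) %.2f = %d\n", d, (int) d);                   /* 2 */
  /* ...while rint honours the current rounding mode, as fistp does.  */
  printf ("rint (%.2f) to nearest = %.0f\n", d, rint (d));    /* 3 */

  return 0;
}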
+ +(define_expand "fix_truncxfsi2" + [(parallel [(set (match_operand:SI 0 "nonimmediate_operand" "") + (fix:SI (match_operand:XF 1 "register_operand" ""))) + (clobber (reg:CC FLAGS_REG))])] + "TARGET_80387" +{ + if (TARGET_FISTTP) + { + emit_insn (gen_fix_truncsi_fisttp_i387_1 (operands[0], operands[1])); + DONE; + } +}) + +(define_expand "fix_truncsi2" + [(parallel [(set (match_operand:SI 0 "nonimmediate_operand" "") + (fix:SI (match_operand:MODEF 1 "register_operand" ""))) + (clobber (reg:CC FLAGS_REG))])] + "TARGET_80387 || SSE_FLOAT_MODE_P (mode)" +{ + if (TARGET_FISTTP + && !(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)) + { + emit_insn (gen_fix_truncsi_fisttp_i387_1 (operands[0], operands[1])); + DONE; + } + if (SSE_FLOAT_MODE_P (mode)) + { + rtx out = REG_P (operands[0]) ? operands[0] : gen_reg_rtx (SImode); + emit_insn (gen_fix_truncsi_sse (out, operands[1])); + if (out != operands[0]) + emit_move_insn (operands[0], out); + DONE; + } +}) + +;; Signed conversion to HImode. + +(define_expand "fix_trunchi2" + [(parallel [(set (match_operand:HI 0 "nonimmediate_operand" "") + (fix:HI (match_operand:X87MODEF 1 "register_operand" ""))) + (clobber (reg:CC FLAGS_REG))])] + "TARGET_80387 + && !(SSE_FLOAT_MODE_P (mode) && (!TARGET_FISTTP || TARGET_SSE_MATH))" +{ + if (TARGET_FISTTP) + { + emit_insn (gen_fix_trunchi_fisttp_i387_1 (operands[0], operands[1])); + DONE; + } +}) + +;; Unsigned conversion to SImode. + +(define_expand "fixuns_truncsi2" + [(parallel + [(set (match_operand:SI 0 "register_operand" "") + (unsigned_fix:SI + (match_operand:MODEF 1 "nonimmediate_operand" ""))) + (use (match_dup 2)) + (clobber (match_scratch: 3 "")) + (clobber (match_scratch: 4 ""))])] + "!TARGET_64BIT && TARGET_SSE2 && TARGET_SSE_MATH" +{ + enum machine_mode mode = mode; + enum machine_mode vecmode = mode; + REAL_VALUE_TYPE TWO31r; + rtx two31; + + if (optimize_insn_for_size_p ()) + FAIL; + + real_ldexp (&TWO31r, &dconst1, 31); + two31 = const_double_from_real_value (TWO31r, mode); + two31 = ix86_build_const_vector (vecmode, true, two31); + operands[2] = force_reg (vecmode, two31); +}) + +(define_insn_and_split "*fixuns_trunc_1" + [(set (match_operand:SI 0 "register_operand" "=&x,&x") + (unsigned_fix:SI + (match_operand:MODEF 3 "nonimmediate_operand" "xm,xm"))) + (use (match_operand: 4 "nonimmediate_operand" "m,x")) + (clobber (match_scratch: 1 "=x,&x")) + (clobber (match_scratch: 2 "=x,x"))] + "!TARGET_64BIT && TARGET_SSE2 && TARGET_SSE_MATH + && optimize_function_for_speed_p (cfun)" + "#" + "&& reload_completed" + [(const_int 0)] +{ + ix86_split_convert_uns_si_sse (operands); + DONE; +}) + +;; Unsigned conversion to HImode. +;; Without these patterns, we'll try the unsigned SI conversion which +;; is complex for SSE, rather than the signed SI conversion, which isn't. + +(define_expand "fixuns_trunchi2" + [(set (match_dup 2) + (fix:SI (match_operand:MODEF 1 "nonimmediate_operand" ""))) + (set (match_operand:HI 0 "nonimmediate_operand" "") + (subreg:HI (match_dup 2) 0))] + "SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH" + "operands[2] = gen_reg_rtx (SImode);") + +;; When SSE is available, it is always faster to use it! 
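cvttss2si/cvttsd2si exist only in signed forms, so fixuns_truncsi2 above materialises the vector constant 2**31 and lets ix86_split_convert_uns_si_sse bias large inputs before the signed conversion. Roughly, in scalar form (the generated code is branchless and works on XMM registers; the helper name is illustrative):

#include <stdint.h>
#include <assert.h>

static uint32_t fixuns_trunc_sketch (double x)
{
  const double two31 = 2147483648.0;            /* the TWO31r constant */

  if (x < two31)
    return (uint32_t) (int32_t) x;              /* plain signed cvtt */

  /* Large values: subtract 2**31, convert signed, add the bit back.  */
  return (uint32_t) (int32_t) (x - two31) + 0x80000000u;
}

int main (void)
{
  assert (fixuns_trunc_sketch (7.9) == 7);
  assert (fixuns_trunc_sketch (3000000000.0) == 3000000000u);
  assert (fixuns_trunc_sketch (4294967295.0) == 4294967295u);
  return 0;
}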
+(define_insn "fix_truncdi_sse" + [(set (match_operand:DI 0 "register_operand" "=r,r") + (fix:DI (match_operand:MODEF 1 "nonimmediate_operand" "x,m")))] + "TARGET_64BIT && SSE_FLOAT_MODE_P (mode) + && (!TARGET_FISTTP || TARGET_SSE_MATH)" + "%vcvtts2si{q}\t{%1, %0|%0, %1}" + [(set_attr "type" "sseicvt") + (set_attr "prefix" "maybe_vex") + (set_attr "prefix_rex" "1") + (set_attr "mode" "") + (set_attr "athlon_decode" "double,vector") + (set_attr "amdfam10_decode" "double,double") + (set_attr "bdver1_decode" "double,double")]) + +(define_insn "fix_truncsi_sse" + [(set (match_operand:SI 0 "register_operand" "=r,r") + (fix:SI (match_operand:MODEF 1 "nonimmediate_operand" "x,m")))] + "SSE_FLOAT_MODE_P (mode) + && (!TARGET_FISTTP || TARGET_SSE_MATH)" + "%vcvtts2si\t{%1, %0|%0, %1}" + [(set_attr "type" "sseicvt") + (set_attr "prefix" "maybe_vex") + (set_attr "mode" "") + (set_attr "athlon_decode" "double,vector") + (set_attr "amdfam10_decode" "double,double") + (set_attr "bdver1_decode" "double,double")]) + +;; Shorten x87->SSE reload sequences of fix_trunc?f?i_sse patterns. +(define_peephole2 + [(set (match_operand:MODEF 0 "register_operand" "") + (match_operand:MODEF 1 "memory_operand" "")) + (set (match_operand:SSEMODEI24 2 "register_operand" "") + (fix:SSEMODEI24 (match_dup 0)))] + "TARGET_SHORTEN_X87_SSE + && !(TARGET_AVOID_VECTOR_DECODE && optimize_insn_for_speed_p ()) + && peep2_reg_dead_p (2, operands[0])" + [(set (match_dup 2) (fix:SSEMODEI24 (match_dup 1)))]) + +;; Avoid vector decoded forms of the instruction. +(define_peephole2 + [(match_scratch:DF 2 "Y2") + (set (match_operand:SSEMODEI24 0 "register_operand" "") + (fix:SSEMODEI24 (match_operand:DF 1 "memory_operand" "")))] + "TARGET_AVOID_VECTOR_DECODE && optimize_insn_for_speed_p ()" + [(set (match_dup 2) (match_dup 1)) + (set (match_dup 0) (fix:SSEMODEI24 (match_dup 2)))]) + +(define_peephole2 + [(match_scratch:SF 2 "x") + (set (match_operand:SSEMODEI24 0 "register_operand" "") + (fix:SSEMODEI24 (match_operand:SF 1 "memory_operand" "")))] + "TARGET_AVOID_VECTOR_DECODE && optimize_insn_for_speed_p ()" + [(set (match_dup 2) (match_dup 1)) + (set (match_dup 0) (fix:SSEMODEI24 (match_dup 2)))]) + +(define_insn_and_split "fix_trunc_fisttp_i387_1" + [(set (match_operand:X87MODEI 0 "nonimmediate_operand" "") + (fix:X87MODEI (match_operand 1 "register_operand" "")))] + "X87_FLOAT_MODE_P (GET_MODE (operands[1])) + && TARGET_FISTTP + && !((SSE_FLOAT_MODE_P (GET_MODE (operands[1])) + && (TARGET_64BIT || mode != DImode)) + && TARGET_SSE_MATH) + && can_create_pseudo_p ()" + "#" + "&& 1" + [(const_int 0)] +{ + if (memory_operand (operands[0], VOIDmode)) + emit_insn (gen_fix_trunc_i387_fisttp (operands[0], operands[1])); + else + { + operands[2] = assign_386_stack_local (mode, SLOT_TEMP); + emit_insn (gen_fix_trunc_i387_fisttp_with_temp (operands[0], + operands[1], + operands[2])); + } + DONE; +} + [(set_attr "type" "fisttp") + (set_attr "mode" "")]) + +(define_insn "fix_trunc_i387_fisttp" + [(set (match_operand:X87MODEI 0 "memory_operand" "=m") + (fix:X87MODEI (match_operand 1 "register_operand" "f"))) + (clobber (match_scratch:XF 2 "=&1f"))] + "X87_FLOAT_MODE_P (GET_MODE (operands[1])) + && TARGET_FISTTP + && !((SSE_FLOAT_MODE_P (GET_MODE (operands[1])) + && (TARGET_64BIT || mode != DImode)) + && TARGET_SSE_MATH)" + "* return output_fix_trunc (insn, operands, 1);" + [(set_attr "type" "fisttp") + (set_attr "mode" "")]) + +(define_insn "fix_trunc_i387_fisttp_with_temp" + [(set (match_operand:X87MODEI 0 "nonimmediate_operand" "=m,?r") + 
(fix:X87MODEI (match_operand 1 "register_operand" "f,f"))) + (clobber (match_operand:X87MODEI 2 "memory_operand" "=X,m")) + (clobber (match_scratch:XF 3 "=&1f,&1f"))] + "X87_FLOAT_MODE_P (GET_MODE (operands[1])) + && TARGET_FISTTP + && !((SSE_FLOAT_MODE_P (GET_MODE (operands[1])) + && (TARGET_64BIT || mode != DImode)) + && TARGET_SSE_MATH)" + "#" + [(set_attr "type" "fisttp") + (set_attr "mode" "")]) + +(define_split + [(set (match_operand:X87MODEI 0 "register_operand" "") + (fix:X87MODEI (match_operand 1 "register_operand" ""))) + (clobber (match_operand:X87MODEI 2 "memory_operand" "")) + (clobber (match_scratch 3 ""))] + "reload_completed" + [(parallel [(set (match_dup 2) (fix:X87MODEI (match_dup 1))) + (clobber (match_dup 3))]) + (set (match_dup 0) (match_dup 2))]) + +(define_split + [(set (match_operand:X87MODEI 0 "memory_operand" "") + (fix:X87MODEI (match_operand 1 "register_operand" ""))) + (clobber (match_operand:X87MODEI 2 "memory_operand" "")) + (clobber (match_scratch 3 ""))] + "reload_completed" + [(parallel [(set (match_dup 0) (fix:X87MODEI (match_dup 1))) + (clobber (match_dup 3))])]) + +;; See the comments in i386.h near OPTIMIZE_MODE_SWITCHING for the description +;; of the machinery. Please note the clobber of FLAGS_REG. In i387 control +;; word calculation (inserted by LCM in mode switching pass) a FLAGS_REG +;; clobbering insns can be used. Look at emit_i387_cw_initialization () +;; function in i386.c. +(define_insn_and_split "*fix_trunc_i387_1" + [(set (match_operand:X87MODEI 0 "nonimmediate_operand" "") + (fix:X87MODEI (match_operand 1 "register_operand" ""))) + (clobber (reg:CC FLAGS_REG))] + "X87_FLOAT_MODE_P (GET_MODE (operands[1])) + && !TARGET_FISTTP + && !(SSE_FLOAT_MODE_P (GET_MODE (operands[1])) + && (TARGET_64BIT || mode != DImode)) + && can_create_pseudo_p ()" + "#" + "&& 1" + [(const_int 0)] +{ + ix86_optimize_mode_switching[I387_TRUNC] = 1; + + operands[2] = assign_386_stack_local (HImode, SLOT_CW_STORED); + operands[3] = assign_386_stack_local (HImode, SLOT_CW_TRUNC); + if (memory_operand (operands[0], VOIDmode)) + emit_insn (gen_fix_trunc_i387 (operands[0], operands[1], + operands[2], operands[3])); + else + { + operands[4] = assign_386_stack_local (mode, SLOT_TEMP); + emit_insn (gen_fix_trunc_i387_with_temp (operands[0], operands[1], + operands[2], operands[3], + operands[4])); + } + DONE; +} + [(set_attr "type" "fistp") + (set_attr "i387_cw" "trunc") + (set_attr "mode" "")]) + +(define_insn "fix_truncdi_i387" + [(set (match_operand:DI 0 "memory_operand" "=m") + (fix:DI (match_operand 1 "register_operand" "f"))) + (use (match_operand:HI 2 "memory_operand" "m")) + (use (match_operand:HI 3 "memory_operand" "m")) + (clobber (match_scratch:XF 4 "=&1f"))] + "X87_FLOAT_MODE_P (GET_MODE (operands[1])) + && !TARGET_FISTTP + && !(TARGET_64BIT && SSE_FLOAT_MODE_P (GET_MODE (operands[1])))" + "* return output_fix_trunc (insn, operands, 0);" + [(set_attr "type" "fistp") + (set_attr "i387_cw" "trunc") + (set_attr "mode" "DI")]) + +(define_insn "fix_truncdi_i387_with_temp" + [(set (match_operand:DI 0 "nonimmediate_operand" "=m,?r") + (fix:DI (match_operand 1 "register_operand" "f,f"))) + (use (match_operand:HI 2 "memory_operand" "m,m")) + (use (match_operand:HI 3 "memory_operand" "m,m")) + (clobber (match_operand:DI 4 "memory_operand" "=X,m")) + (clobber (match_scratch:XF 5 "=&1f,&1f"))] + "X87_FLOAT_MODE_P (GET_MODE (operands[1])) + && !TARGET_FISTTP + && !(TARGET_64BIT && SSE_FLOAT_MODE_P (GET_MODE (operands[1])))" + "#" + [(set_attr "type" "fistp") + (set_attr 
"i387_cw" "trunc") + (set_attr "mode" "DI")]) + +(define_split + [(set (match_operand:DI 0 "register_operand" "") + (fix:DI (match_operand 1 "register_operand" ""))) + (use (match_operand:HI 2 "memory_operand" "")) + (use (match_operand:HI 3 "memory_operand" "")) + (clobber (match_operand:DI 4 "memory_operand" "")) + (clobber (match_scratch 5 ""))] + "reload_completed" + [(parallel [(set (match_dup 4) (fix:DI (match_dup 1))) + (use (match_dup 2)) + (use (match_dup 3)) + (clobber (match_dup 5))]) + (set (match_dup 0) (match_dup 4))]) + +(define_split + [(set (match_operand:DI 0 "memory_operand" "") + (fix:DI (match_operand 1 "register_operand" ""))) + (use (match_operand:HI 2 "memory_operand" "")) + (use (match_operand:HI 3 "memory_operand" "")) + (clobber (match_operand:DI 4 "memory_operand" "")) + (clobber (match_scratch 5 ""))] + "reload_completed" + [(parallel [(set (match_dup 0) (fix:DI (match_dup 1))) + (use (match_dup 2)) + (use (match_dup 3)) + (clobber (match_dup 5))])]) + +(define_insn "fix_trunc_i387" + [(set (match_operand:X87MODEI12 0 "memory_operand" "=m") + (fix:X87MODEI12 (match_operand 1 "register_operand" "f"))) + (use (match_operand:HI 2 "memory_operand" "m")) + (use (match_operand:HI 3 "memory_operand" "m"))] + "X87_FLOAT_MODE_P (GET_MODE (operands[1])) + && !TARGET_FISTTP + && !SSE_FLOAT_MODE_P (GET_MODE (operands[1]))" + "* return output_fix_trunc (insn, operands, 0);" + [(set_attr "type" "fistp") + (set_attr "i387_cw" "trunc") + (set_attr "mode" "")]) + +(define_insn "fix_trunc_i387_with_temp" + [(set (match_operand:X87MODEI12 0 "nonimmediate_operand" "=m,?r") + (fix:X87MODEI12 (match_operand 1 "register_operand" "f,f"))) + (use (match_operand:HI 2 "memory_operand" "m,m")) + (use (match_operand:HI 3 "memory_operand" "m,m")) + (clobber (match_operand:X87MODEI12 4 "memory_operand" "=X,m"))] + "X87_FLOAT_MODE_P (GET_MODE (operands[1])) + && !TARGET_FISTTP + && !SSE_FLOAT_MODE_P (GET_MODE (operands[1]))" + "#" + [(set_attr "type" "fistp") + (set_attr "i387_cw" "trunc") + (set_attr "mode" "")]) + +(define_split + [(set (match_operand:X87MODEI12 0 "register_operand" "") + (fix:X87MODEI12 (match_operand 1 "register_operand" ""))) + (use (match_operand:HI 2 "memory_operand" "")) + (use (match_operand:HI 3 "memory_operand" "")) + (clobber (match_operand:X87MODEI12 4 "memory_operand" ""))] + "reload_completed" + [(parallel [(set (match_dup 4) (fix:X87MODEI12 (match_dup 1))) + (use (match_dup 2)) + (use (match_dup 3))]) + (set (match_dup 0) (match_dup 4))]) + +(define_split + [(set (match_operand:X87MODEI12 0 "memory_operand" "") + (fix:X87MODEI12 (match_operand 1 "register_operand" ""))) + (use (match_operand:HI 2 "memory_operand" "")) + (use (match_operand:HI 3 "memory_operand" "")) + (clobber (match_operand:X87MODEI12 4 "memory_operand" ""))] + "reload_completed" + [(parallel [(set (match_dup 0) (fix:X87MODEI12 (match_dup 1))) + (use (match_dup 2)) + (use (match_dup 3))])]) + +(define_insn "x86_fnstcw_1" + [(set (match_operand:HI 0 "memory_operand" "=m") + (unspec:HI [(reg:HI FPCR_REG)] UNSPEC_FSTCW))] + "TARGET_80387" + "fnstcw\t%0" + [(set (attr "length") + (symbol_ref "ix86_attr_length_address_default (insn) + 2")) + (set_attr "mode" "HI") + (set_attr "unit" "i387") + (set_attr "bdver1_decode" "vector")]) + +(define_insn "x86_fldcw_1" + [(set (reg:HI FPCR_REG) + (unspec:HI [(match_operand:HI 0 "memory_operand" "m")] UNSPEC_FLDCW))] + "TARGET_80387" + "fldcw\t%0" + [(set (attr "length") + (symbol_ref "ix86_attr_length_address_default (insn) + 2")) + (set_attr "mode" "HI") + 
(set_attr "unit" "i387") + (set_attr "athlon_decode" "vector") + (set_attr "amdfam10_decode" "vector") + (set_attr "bdver1_decode" "vector")]) + +;; Conversion between fixed point and floating point. + +;; Even though we only accept memory inputs, the backend _really_ +;; wants to be able to do this between registers. + +(define_expand "floathi2" + [(set (match_operand:X87MODEF 0 "register_operand" "") + (float:X87MODEF (match_operand:HI 1 "nonimmediate_operand" "")))] + "TARGET_80387 + && (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) + || TARGET_MIX_SSE_I387)") + +;; Pre-reload splitter to add memory clobber to the pattern. +(define_insn_and_split "*floathi2_1" + [(set (match_operand:X87MODEF 0 "register_operand" "") + (float:X87MODEF (match_operand:HI 1 "register_operand" "")))] + "TARGET_80387 + && (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) + || TARGET_MIX_SSE_I387) + && can_create_pseudo_p ()" + "#" + "&& 1" + [(parallel [(set (match_dup 0) + (float:X87MODEF (match_dup 1))) + (clobber (match_dup 2))])] + "operands[2] = assign_386_stack_local (HImode, SLOT_TEMP);") + +(define_insn "*floathi2_i387_with_temp" + [(set (match_operand:X87MODEF 0 "register_operand" "=f,f") + (float:X87MODEF (match_operand:HI 1 "nonimmediate_operand" "m,?r"))) + (clobber (match_operand:HI 2 "memory_operand" "=m,m"))] + "TARGET_80387 + && (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) + || TARGET_MIX_SSE_I387)" + "#" + [(set_attr "type" "fmov,multi") + (set_attr "mode" "") + (set_attr "unit" "*,i387") + (set_attr "fp_int_src" "true")]) + +(define_insn "*floathi2_i387" + [(set (match_operand:X87MODEF 0 "register_operand" "=f") + (float:X87MODEF (match_operand:HI 1 "memory_operand" "m")))] + "TARGET_80387 + && (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) + || TARGET_MIX_SSE_I387)" + "fild%Z1\t%1" + [(set_attr "type" "fmov") + (set_attr "mode" "") + (set_attr "fp_int_src" "true")]) + +(define_split + [(set (match_operand:X87MODEF 0 "register_operand" "") + (float:X87MODEF (match_operand:HI 1 "register_operand" ""))) + (clobber (match_operand:HI 2 "memory_operand" ""))] + "TARGET_80387 + && (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) + || TARGET_MIX_SSE_I387) + && reload_completed" + [(set (match_dup 2) (match_dup 1)) + (set (match_dup 0) (float:X87MODEF (match_dup 2)))]) + +(define_split + [(set (match_operand:X87MODEF 0 "register_operand" "") + (float:X87MODEF (match_operand:HI 1 "memory_operand" ""))) + (clobber (match_operand:HI 2 "memory_operand" ""))] + "TARGET_80387 + && (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) + || TARGET_MIX_SSE_I387) + && reload_completed" + [(set (match_dup 0) (float:X87MODEF (match_dup 1)))]) + +(define_expand "float2" + [(set (match_operand:X87MODEF 0 "register_operand" "") + (float:X87MODEF + (match_operand:SSEMODEI24 1 "nonimmediate_operand" "")))] + "TARGET_80387 + || ((mode != DImode || TARGET_64BIT) + && SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)" +{ + if (!((mode != DImode || TARGET_64BIT) + && SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) + && !X87_ENABLE_FLOAT (mode, mode)) + { + rtx reg = gen_reg_rtx (XFmode); + rtx insn; + + emit_insn (gen_floatxf2 (reg, operands[1])); + + if (mode == SFmode) + insn = gen_truncxfsf2 (operands[0], reg); + else if (mode == DFmode) + insn = gen_truncxfdf2 (operands[0], reg); + else + gcc_unreachable (); + + emit_insn (insn); + DONE; + } +}) + +;; Pre-reload splitter to add memory clobber to the pattern. 
+(define_insn_and_split "*float2_1" + [(set (match_operand:X87MODEF 0 "register_operand" "") + (float:X87MODEF (match_operand:SSEMODEI24 1 "register_operand" "")))] + "((TARGET_80387 + && X87_ENABLE_FLOAT (mode, mode) + && (!((mode != DImode || TARGET_64BIT) + && SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) + || TARGET_MIX_SSE_I387)) + || ((mode != DImode || TARGET_64BIT) + && SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH + && ((mode == SImode + && TARGET_SSE2 && TARGET_USE_VECTOR_CONVERTS + && optimize_function_for_speed_p (cfun) + && flag_trapping_math) + || !(TARGET_INTER_UNIT_CONVERSIONS + || optimize_function_for_size_p (cfun))))) + && can_create_pseudo_p ()" + "#" + "&& 1" + [(parallel [(set (match_dup 0) (float:X87MODEF (match_dup 1))) + (clobber (match_dup 2))])] +{ + operands[2] = assign_386_stack_local (mode, SLOT_TEMP); + + /* Avoid store forwarding (partial memory) stall penalty + by passing DImode value through XMM registers. */ + if (mode == DImode && !TARGET_64BIT + && TARGET_80387 && TARGET_SSE2 && TARGET_INTER_UNIT_MOVES + && optimize_function_for_speed_p (cfun)) + { + emit_insn (gen_floatdi2_i387_with_xmm (operands[0], + operands[1], + operands[2])); + DONE; + } +}) + +(define_insn "*floatsi2_vector_mixed_with_temp" + [(set (match_operand:MODEF 0 "register_operand" "=f,f,x,x,x") + (float:MODEF + (match_operand:SI 1 "nonimmediate_operand" "m,?r,r,m,!x"))) + (clobber (match_operand:SI 2 "memory_operand" "=X,m,m,X,m"))] + "TARGET_SSE2 && TARGET_MIX_SSE_I387 + && TARGET_USE_VECTOR_CONVERTS && optimize_function_for_speed_p (cfun)" + "#" + [(set_attr "type" "fmov,multi,sseicvt,sseicvt,sseicvt") + (set_attr "mode" ",,,,") + (set_attr "unit" "*,i387,*,*,*") + (set_attr "athlon_decode" "*,*,double,direct,double") + (set_attr "amdfam10_decode" "*,*,vector,double,double") + (set_attr "bdver1_decode" "*,*,double,direct,double") + (set_attr "fp_int_src" "true")]) + +(define_insn "*floatsi2_vector_mixed" + [(set (match_operand:MODEF 0 "register_operand" "=f,x") + (float:MODEF (match_operand:SI 1 "memory_operand" "m,m")))] + "TARGET_SSE2 && TARGET_MIX_SSE_I387 + && TARGET_USE_VECTOR_CONVERTS && optimize_function_for_speed_p (cfun)" + "@ + fild%Z1\t%1 + #" + [(set_attr "type" "fmov,sseicvt") + (set_attr "mode" ",") + (set_attr "unit" "i387,*") + (set_attr "athlon_decode" "*,direct") + (set_attr "amdfam10_decode" "*,double") + (set_attr "bdver1_decode" "*,direct") + (set_attr "fp_int_src" "true")]) + +(define_insn "*float2_mixed_with_temp" + [(set (match_operand:MODEF 0 "register_operand" "=f,f,x,x") + (float:MODEF + (match_operand:SSEMODEI24 1 "nonimmediate_operand" "m,?r,r,m"))) + (clobber (match_operand:SSEMODEI24 2 "memory_operand" "=X,m,m,X"))] + "(mode != DImode || TARGET_64BIT) + && SSE_FLOAT_MODE_P (mode) && TARGET_MIX_SSE_I387" + "#" + [(set_attr "type" "fmov,multi,sseicvt,sseicvt") + (set_attr "mode" "") + (set_attr "unit" "*,i387,*,*") + (set_attr "athlon_decode" "*,*,double,direct") + (set_attr "amdfam10_decode" "*,*,vector,double") + (set_attr "bdver1_decode" "*,*,double,direct") + (set_attr "fp_int_src" "true")]) + +(define_split + [(set (match_operand:MODEF 0 "register_operand" "") + (float:MODEF (match_operand:SSEMODEI24 1 "register_operand" ""))) + (clobber (match_operand:SSEMODEI24 2 "memory_operand" ""))] + "(mode != DImode || TARGET_64BIT) + && SSE_FLOAT_MODE_P (mode) && TARGET_MIX_SSE_I387 + && TARGET_INTER_UNIT_CONVERSIONS + && reload_completed + && (SSE_REG_P (operands[0]) + || (GET_CODE (operands[0]) == SUBREG + && SSE_REG_P (SUBREG_REG (operands[0]))))" + [(set 
(match_dup 0) (float:MODEF (match_dup 1)))]) + +(define_split + [(set (match_operand:MODEF 0 "register_operand" "") + (float:MODEF (match_operand:SSEMODEI24 1 "register_operand" ""))) + (clobber (match_operand:SSEMODEI24 2 "memory_operand" ""))] + "(mode != DImode || TARGET_64BIT) + && SSE_FLOAT_MODE_P (mode) && TARGET_MIX_SSE_I387 + && !(TARGET_INTER_UNIT_CONVERSIONS || optimize_function_for_size_p (cfun)) + && reload_completed + && (SSE_REG_P (operands[0]) + || (GET_CODE (operands[0]) == SUBREG + && SSE_REG_P (SUBREG_REG (operands[0]))))" + [(set (match_dup 2) (match_dup 1)) + (set (match_dup 0) (float:MODEF (match_dup 2)))]) + +(define_insn "*float2_mixed_interunit" + [(set (match_operand:MODEF 0 "register_operand" "=f,x,x") + (float:MODEF + (match_operand:SSEMODEI24 1 "nonimmediate_operand" "m,r,m")))] + "(mode != DImode || TARGET_64BIT) + && SSE_FLOAT_MODE_P (mode) && TARGET_MIX_SSE_I387 + && (TARGET_INTER_UNIT_CONVERSIONS || optimize_function_for_size_p (cfun))" + "@ + fild%Z1\t%1 + %vcvtsi2s\t{%1, %d0|%d0, %1} + %vcvtsi2s\t{%1, %d0|%d0, %1}" + [(set_attr "type" "fmov,sseicvt,sseicvt") + (set_attr "prefix" "orig,maybe_vex,maybe_vex") + (set_attr "mode" "") + (set (attr "prefix_rex") + (if_then_else + (and (eq_attr "prefix" "maybe_vex") + (ne (symbol_ref "mode == DImode") (const_int 0))) + (const_string "1") + (const_string "*"))) + (set_attr "unit" "i387,*,*") + (set_attr "athlon_decode" "*,double,direct") + (set_attr "amdfam10_decode" "*,vector,double") + (set_attr "bdver1_decode" "*,double,direct") + (set_attr "fp_int_src" "true")]) + +(define_insn "*float2_mixed_nointerunit" + [(set (match_operand:MODEF 0 "register_operand" "=f,x") + (float:MODEF + (match_operand:SSEMODEI24 1 "memory_operand" "m,m")))] + "(mode != DImode || TARGET_64BIT) + && SSE_FLOAT_MODE_P (mode) && TARGET_MIX_SSE_I387 + && !(TARGET_INTER_UNIT_CONVERSIONS || optimize_function_for_size_p (cfun))" + "@ + fild%Z1\t%1 + %vcvtsi2s\t{%1, %d0|%d0, %1}" + [(set_attr "type" "fmov,sseicvt") + (set_attr "prefix" "orig,maybe_vex") + (set_attr "mode" "") + (set (attr "prefix_rex") + (if_then_else + (and (eq_attr "prefix" "maybe_vex") + (ne (symbol_ref "mode == DImode") (const_int 0))) + (const_string "1") + (const_string "*"))) + (set_attr "athlon_decode" "*,direct") + (set_attr "amdfam10_decode" "*,double") + (set_attr "bdver1_decode" "*,direct") + (set_attr "fp_int_src" "true")]) + +(define_insn "*floatsi2_vector_sse_with_temp" + [(set (match_operand:MODEF 0 "register_operand" "=x,x,x") + (float:MODEF + (match_operand:SI 1 "nonimmediate_operand" "r,m,!x"))) + (clobber (match_operand:SI 2 "memory_operand" "=m,X,m"))] + "TARGET_SSE2 && TARGET_SSE_MATH + && TARGET_USE_VECTOR_CONVERTS && optimize_function_for_speed_p (cfun)" + "#" + [(set_attr "type" "sseicvt") + (set_attr "mode" ",,") + (set_attr "athlon_decode" "double,direct,double") + (set_attr "amdfam10_decode" "vector,double,double") + (set_attr "bdver1_decode" "double,direct,double") + (set_attr "fp_int_src" "true")]) + +(define_insn "*floatsi2_vector_sse" + [(set (match_operand:MODEF 0 "register_operand" "=x") + (float:MODEF (match_operand:SI 1 "memory_operand" "m")))] + "TARGET_SSE2 && TARGET_SSE_MATH + && TARGET_USE_VECTOR_CONVERTS && optimize_function_for_speed_p (cfun)" + "#" + [(set_attr "type" "sseicvt") + (set_attr "mode" "") + (set_attr "athlon_decode" "direct") + (set_attr "amdfam10_decode" "double") + (set_attr "bdver1_decode" "direct") + (set_attr "fp_int_src" "true")]) + +(define_split + [(set (match_operand:MODEF 0 "register_operand" "") + (float:MODEF 
(match_operand:SI 1 "register_operand" ""))) + (clobber (match_operand:SI 2 "memory_operand" ""))] + "TARGET_SSE2 && TARGET_SSE_MATH + && TARGET_USE_VECTOR_CONVERTS && optimize_function_for_speed_p (cfun) + && reload_completed + && (SSE_REG_P (operands[0]) + || (GET_CODE (operands[0]) == SUBREG + && SSE_REG_P (SUBREG_REG (operands[0]))))" + [(const_int 0)] +{ + rtx op1 = operands[1]; + + operands[3] = simplify_gen_subreg (mode, operands[0], + mode, 0); + if (GET_CODE (op1) == SUBREG) + op1 = SUBREG_REG (op1); + + if (GENERAL_REG_P (op1) && TARGET_INTER_UNIT_MOVES) + { + operands[4] = simplify_gen_subreg (V4SImode, operands[0], mode, 0); + emit_insn (gen_sse2_loadld (operands[4], + CONST0_RTX (V4SImode), operands[1])); + } + /* We can ignore possible trapping value in the + high part of SSE register for non-trapping math. */ + else if (SSE_REG_P (op1) && !flag_trapping_math) + operands[4] = simplify_gen_subreg (V4SImode, operands[1], SImode, 0); + else + { + operands[4] = simplify_gen_subreg (V4SImode, operands[0], mode, 0); + emit_move_insn (operands[2], operands[1]); + emit_insn (gen_sse2_loadld (operands[4], + CONST0_RTX (V4SImode), operands[2])); + } + emit_insn + (gen_sse2_cvtdq2p (operands[3], operands[4])); + DONE; +}) + +(define_split + [(set (match_operand:MODEF 0 "register_operand" "") + (float:MODEF (match_operand:SI 1 "memory_operand" ""))) + (clobber (match_operand:SI 2 "memory_operand" ""))] + "TARGET_SSE2 && TARGET_SSE_MATH + && TARGET_USE_VECTOR_CONVERTS && optimize_function_for_speed_p (cfun) + && reload_completed + && (SSE_REG_P (operands[0]) + || (GET_CODE (operands[0]) == SUBREG + && SSE_REG_P (SUBREG_REG (operands[0]))))" + [(const_int 0)] +{ + operands[3] = simplify_gen_subreg (mode, operands[0], + mode, 0); + operands[4] = simplify_gen_subreg (V4SImode, operands[0], mode, 0); + + emit_insn (gen_sse2_loadld (operands[4], + CONST0_RTX (V4SImode), operands[1])); + emit_insn + (gen_sse2_cvtdq2p (operands[3], operands[4])); + DONE; +}) + +(define_split + [(set (match_operand:MODEF 0 "register_operand" "") + (float:MODEF (match_operand:SI 1 "register_operand" "")))] + "TARGET_SSE2 && TARGET_SSE_MATH + && TARGET_USE_VECTOR_CONVERTS && optimize_function_for_speed_p (cfun) + && reload_completed + && (SSE_REG_P (operands[0]) + || (GET_CODE (operands[0]) == SUBREG + && SSE_REG_P (SUBREG_REG (operands[0]))))" + [(const_int 0)] +{ + rtx op1 = operands[1]; + + operands[3] = simplify_gen_subreg (mode, operands[0], + mode, 0); + if (GET_CODE (op1) == SUBREG) + op1 = SUBREG_REG (op1); + + if (GENERAL_REG_P (op1)) + { + operands[4] = simplify_gen_subreg (V4SImode, operands[0], mode, 0); + if (TARGET_INTER_UNIT_MOVES) + emit_insn (gen_sse2_loadld (operands[4], + CONST0_RTX (V4SImode), operands[1])); + else + { + operands[5] = ix86_force_to_memory (GET_MODE (operands[1]), + operands[1]); + emit_insn (gen_sse2_loadld (operands[4], + CONST0_RTX (V4SImode), operands[5])); + ix86_free_from_memory (GET_MODE (operands[1])); + } + } + /* We can ignore possible trapping value in the + high part of SSE register for non-trapping math. 
*/ + else if (SSE_REG_P (op1) && !flag_trapping_math) + operands[4] = simplify_gen_subreg (V4SImode, operands[1], SImode, 0); + else + gcc_unreachable (); + emit_insn + (gen_sse2_cvtdq2p (operands[3], operands[4])); + DONE; +}) + +(define_split + [(set (match_operand:MODEF 0 "register_operand" "") + (float:MODEF (match_operand:SI 1 "memory_operand" "")))] + "TARGET_SSE2 && TARGET_SSE_MATH + && TARGET_USE_VECTOR_CONVERTS && optimize_function_for_speed_p (cfun) + && reload_completed + && (SSE_REG_P (operands[0]) + || (GET_CODE (operands[0]) == SUBREG + && SSE_REG_P (SUBREG_REG (operands[0]))))" + [(const_int 0)] +{ + operands[3] = simplify_gen_subreg (mode, operands[0], + mode, 0); + operands[4] = simplify_gen_subreg (V4SImode, operands[0], mode, 0); + + emit_insn (gen_sse2_loadld (operands[4], + CONST0_RTX (V4SImode), operands[1])); + emit_insn + (gen_sse2_cvtdq2p (operands[3], operands[4])); + DONE; +}) + +(define_insn "*float2_sse_with_temp" + [(set (match_operand:MODEF 0 "register_operand" "=x,x") + (float:MODEF + (match_operand:SSEMODEI24 1 "nonimmediate_operand" "r,m"))) + (clobber (match_operand:SSEMODEI24 2 "memory_operand" "=m,X"))] + "(mode != DImode || TARGET_64BIT) + && SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH" + "#" + [(set_attr "type" "sseicvt") + (set_attr "mode" "") + (set_attr "athlon_decode" "double,direct") + (set_attr "amdfam10_decode" "vector,double") + (set_attr "bdver1_decode" "double,direct") + (set_attr "fp_int_src" "true")]) + +(define_insn "*float2_sse_interunit" + [(set (match_operand:MODEF 0 "register_operand" "=x,x") + (float:MODEF + (match_operand:SSEMODEI24 1 "nonimmediate_operand" "r,m")))] + "(mode != DImode || TARGET_64BIT) + && SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH + && (TARGET_INTER_UNIT_CONVERSIONS || optimize_function_for_size_p (cfun))" + "%vcvtsi2s\t{%1, %d0|%d0, %1}" + [(set_attr "type" "sseicvt") + (set_attr "prefix" "maybe_vex") + (set_attr "mode" "") + (set (attr "prefix_rex") + (if_then_else + (and (eq_attr "prefix" "maybe_vex") + (ne (symbol_ref "mode == DImode") (const_int 0))) + (const_string "1") + (const_string "*"))) + (set_attr "athlon_decode" "double,direct") + (set_attr "amdfam10_decode" "vector,double") + (set_attr "bdver1_decode" "double,direct") + (set_attr "fp_int_src" "true")]) + +(define_split + [(set (match_operand:MODEF 0 "register_operand" "") + (float:MODEF (match_operand:SSEMODEI24 1 "nonimmediate_operand" ""))) + (clobber (match_operand:SSEMODEI24 2 "memory_operand" ""))] + "(mode != DImode || TARGET_64BIT) + && SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH + && (TARGET_INTER_UNIT_CONVERSIONS || optimize_function_for_size_p (cfun)) + && reload_completed + && (SSE_REG_P (operands[0]) + || (GET_CODE (operands[0]) == SUBREG + && SSE_REG_P (SUBREG_REG (operands[0]))))" + [(set (match_dup 0) (float:MODEF (match_dup 1)))]) + +(define_insn "*float2_sse_nointerunit" + [(set (match_operand:MODEF 0 "register_operand" "=x") + (float:MODEF + (match_operand:SSEMODEI24 1 "memory_operand" "m")))] + "(mode != DImode || TARGET_64BIT) + && SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH + && !(TARGET_INTER_UNIT_CONVERSIONS || optimize_function_for_size_p (cfun))" + "%vcvtsi2s\t{%1, %d0|%d0, %1}" + [(set_attr "type" "sseicvt") + (set_attr "prefix" "maybe_vex") + (set_attr "mode" "") + (set (attr "prefix_rex") + (if_then_else + (and (eq_attr "prefix" "maybe_vex") + (ne (symbol_ref "mode == DImode") (const_int 0))) + (const_string "1") + (const_string "*"))) + (set_attr "athlon_decode" "direct") + (set_attr "amdfam10_decode" "double") + (set_attr 
"bdver1_decode" "direct") + (set_attr "fp_int_src" "true")]) + +(define_split + [(set (match_operand:MODEF 0 "register_operand" "") + (float:MODEF (match_operand:SSEMODEI24 1 "register_operand" ""))) + (clobber (match_operand:SSEMODEI24 2 "memory_operand" ""))] + "(mode != DImode || TARGET_64BIT) + && SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH + && !(TARGET_INTER_UNIT_CONVERSIONS || optimize_function_for_size_p (cfun)) + && reload_completed + && (SSE_REG_P (operands[0]) + || (GET_CODE (operands[0]) == SUBREG + && SSE_REG_P (SUBREG_REG (operands[0]))))" + [(set (match_dup 2) (match_dup 1)) + (set (match_dup 0) (float:MODEF (match_dup 2)))]) + +(define_split + [(set (match_operand:MODEF 0 "register_operand" "") + (float:MODEF (match_operand:SSEMODEI24 1 "memory_operand" ""))) + (clobber (match_operand:SSEMODEI24 2 "memory_operand" ""))] + "(mode != DImode || TARGET_64BIT) + && SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH + && reload_completed + && (SSE_REG_P (operands[0]) + || (GET_CODE (operands[0]) == SUBREG + && SSE_REG_P (SUBREG_REG (operands[0]))))" + [(set (match_dup 0) (float:MODEF (match_dup 1)))]) + +(define_insn "*float2_i387_with_temp" + [(set (match_operand:X87MODEF 0 "register_operand" "=f,f") + (float:X87MODEF + (match_operand:SSEMODEI24 1 "nonimmediate_operand" "m,?r"))) + (clobber (match_operand:SSEMODEI24 2 "memory_operand" "=X,m"))] + "TARGET_80387 + && X87_ENABLE_FLOAT (mode, mode)" + "@ + fild%Z1\t%1 + #" + [(set_attr "type" "fmov,multi") + (set_attr "mode" "") + (set_attr "unit" "*,i387") + (set_attr "fp_int_src" "true")]) + +(define_insn "*float2_i387" + [(set (match_operand:X87MODEF 0 "register_operand" "=f") + (float:X87MODEF + (match_operand:SSEMODEI24 1 "memory_operand" "m")))] + "TARGET_80387 + && X87_ENABLE_FLOAT (mode, mode)" + "fild%Z1\t%1" + [(set_attr "type" "fmov") + (set_attr "mode" "") + (set_attr "fp_int_src" "true")]) + +(define_split + [(set (match_operand:X87MODEF 0 "register_operand" "") + (float:X87MODEF (match_operand:SSEMODEI24 1 "register_operand" ""))) + (clobber (match_operand:SSEMODEI24 2 "memory_operand" ""))] + "TARGET_80387 + && X87_ENABLE_FLOAT (mode, mode) + && reload_completed + && FP_REG_P (operands[0])" + [(set (match_dup 2) (match_dup 1)) + (set (match_dup 0) (float:X87MODEF (match_dup 2)))]) + +(define_split + [(set (match_operand:X87MODEF 0 "register_operand" "") + (float:X87MODEF (match_operand:SSEMODEI24 1 "memory_operand" ""))) + (clobber (match_operand:SSEMODEI24 2 "memory_operand" ""))] + "TARGET_80387 + && X87_ENABLE_FLOAT (mode, mode) + && reload_completed + && FP_REG_P (operands[0])" + [(set (match_dup 0) (float:X87MODEF (match_dup 1)))]) + +;; Avoid store forwarding (partial memory) stall penalty +;; by passing DImode value through XMM registers. 
*/ + +(define_insn "floatdi2_i387_with_xmm" + [(set (match_operand:X87MODEF 0 "register_operand" "=f,f") + (float:X87MODEF + (match_operand:DI 1 "nonimmediate_operand" "m,?r"))) + (clobber (match_scratch:V4SI 3 "=X,x")) + (clobber (match_scratch:V4SI 4 "=X,x")) + (clobber (match_operand:DI 2 "memory_operand" "=X,m"))] + "TARGET_80387 && X87_ENABLE_FLOAT (mode, DImode) + && TARGET_SSE2 && TARGET_INTER_UNIT_MOVES + && !TARGET_64BIT && optimize_function_for_speed_p (cfun)" + "#" + [(set_attr "type" "multi") + (set_attr "mode" "") + (set_attr "unit" "i387") + (set_attr "fp_int_src" "true")]) + +(define_split + [(set (match_operand:X87MODEF 0 "register_operand" "") + (float:X87MODEF (match_operand:DI 1 "register_operand" ""))) + (clobber (match_scratch:V4SI 3 "")) + (clobber (match_scratch:V4SI 4 "")) + (clobber (match_operand:DI 2 "memory_operand" ""))] + "TARGET_80387 && X87_ENABLE_FLOAT (mode, DImode) + && TARGET_SSE2 && TARGET_INTER_UNIT_MOVES + && !TARGET_64BIT && optimize_function_for_speed_p (cfun) + && reload_completed + && FP_REG_P (operands[0])" + [(set (match_dup 2) (match_dup 3)) + (set (match_dup 0) (float:X87MODEF (match_dup 2)))] +{ + /* The DImode arrived in a pair of integral registers (e.g. %edx:%eax). + Assemble the 64-bit DImode value in an xmm register. */ + emit_insn (gen_sse2_loadld (operands[3], CONST0_RTX (V4SImode), + gen_rtx_SUBREG (SImode, operands[1], 0))); + emit_insn (gen_sse2_loadld (operands[4], CONST0_RTX (V4SImode), + gen_rtx_SUBREG (SImode, operands[1], 4))); + emit_insn (gen_vec_interleave_lowv4si (operands[3], operands[3], + operands[4])); + + operands[3] = gen_rtx_REG (DImode, REGNO (operands[3])); +}) + +(define_split + [(set (match_operand:X87MODEF 0 "register_operand" "") + (float:X87MODEF (match_operand:DI 1 "memory_operand" ""))) + (clobber (match_scratch:V4SI 3 "")) + (clobber (match_scratch:V4SI 4 "")) + (clobber (match_operand:DI 2 "memory_operand" ""))] + "TARGET_80387 && X87_ENABLE_FLOAT (mode, DImode) + && TARGET_SSE2 && TARGET_INTER_UNIT_MOVES + && !TARGET_64BIT && optimize_function_for_speed_p (cfun) + && reload_completed + && FP_REG_P (operands[0])" + [(set (match_dup 0) (float:X87MODEF (match_dup 1)))]) + +;; Avoid store forwarding (partial memory) stall penalty by extending +;; SImode value to DImode through XMM register instead of pushing two +;; SImode values to stack. Note that even !TARGET_INTER_UNIT_MOVES +;; targets benefit from this optimization. Also note that fild +;; loads from memory only. 
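The floatdi*_i387_with_xmm splitter above assembles the DImode value that arrived in %edx:%eax inside an XMM register -- two movd loads interleaved with punpckldq -- so a single 64-bit store feeds the later fild instead of two 32-bit stores that would hit the store-forwarding stall. An SSE2 intrinsics sketch of just that assembly step (illustrative names; not the emitted RTL):

#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

static __m128i assemble_di_in_xmm (uint32_t lo, uint32_t hi)
{
  __m128i vlo = _mm_cvtsi32_si128 ((int) lo);   /* movd: sse2_loadld   */
  __m128i vhi = _mm_cvtsi32_si128 ((int) hi);   /* movd: sse2_loadld   */
  return _mm_unpacklo_epi32 (vlo, vhi);         /* punpckldq:
                                                   vec_interleave_lowv4si */
}

int main (void)
{
  union { __m128i v; uint64_t u[2]; } r;
  r.v = assemble_di_in_xmm (0x89abcdefu, 0x01234567u);
  printf ("%#llx\n", (unsigned long long) r.u[0]); /* 0x123456789abcdef */
  return 0;
}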
+ +(define_insn "*floatunssi2_1" + [(set (match_operand:X87MODEF 0 "register_operand" "=f,f") + (unsigned_float:X87MODEF + (match_operand:SI 1 "nonimmediate_operand" "x,m"))) + (clobber (match_operand:DI 2 "memory_operand" "=m,m")) + (clobber (match_scratch:SI 3 "=X,x"))] + "!TARGET_64BIT + && TARGET_80387 && X87_ENABLE_FLOAT (mode, DImode) + && TARGET_SSE" + "#" + [(set_attr "type" "multi") + (set_attr "mode" "")]) + +(define_split + [(set (match_operand:X87MODEF 0 "register_operand" "") + (unsigned_float:X87MODEF + (match_operand:SI 1 "register_operand" ""))) + (clobber (match_operand:DI 2 "memory_operand" "")) + (clobber (match_scratch:SI 3 ""))] + "!TARGET_64BIT + && TARGET_80387 && X87_ENABLE_FLOAT (mode, DImode) + && TARGET_SSE + && reload_completed" + [(set (match_dup 2) (match_dup 1)) + (set (match_dup 0) + (float:X87MODEF (match_dup 2)))] + "operands[1] = simplify_gen_subreg (DImode, operands[1], SImode, 0);") + +(define_split + [(set (match_operand:X87MODEF 0 "register_operand" "") + (unsigned_float:X87MODEF + (match_operand:SI 1 "memory_operand" ""))) + (clobber (match_operand:DI 2 "memory_operand" "")) + (clobber (match_scratch:SI 3 ""))] + "!TARGET_64BIT + && TARGET_80387 && X87_ENABLE_FLOAT (mode, DImode) + && TARGET_SSE + && reload_completed" + [(set (match_dup 2) (match_dup 3)) + (set (match_dup 0) + (float:X87MODEF (match_dup 2)))] +{ + emit_move_insn (operands[3], operands[1]); + operands[3] = simplify_gen_subreg (DImode, operands[3], SImode, 0); +}) + +(define_expand "floatunssi2" + [(parallel + [(set (match_operand:X87MODEF 0 "register_operand" "") + (unsigned_float:X87MODEF + (match_operand:SI 1 "nonimmediate_operand" ""))) + (clobber (match_dup 2)) + (clobber (match_scratch:SI 3 ""))])] + "!TARGET_64BIT + && ((TARGET_80387 && X87_ENABLE_FLOAT (mode, DImode) + && TARGET_SSE) + || (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))" +{ + if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) + { + ix86_expand_convert_uns_si_sse (operands[0], operands[1]); + DONE; + } + else + { + enum ix86_stack_slot slot = (virtuals_instantiated + ? 
SLOT_TEMP + : SLOT_VIRTUAL); + operands[2] = assign_386_stack_local (DImode, slot); + } +}) + +(define_expand "floatunsdisf2" + [(use (match_operand:SF 0 "register_operand" "")) + (use (match_operand:DI 1 "nonimmediate_operand" ""))] + "TARGET_64BIT && TARGET_SSE_MATH" + "x86_emit_floatuns (operands); DONE;") + +(define_expand "floatunsdidf2" + [(use (match_operand:DF 0 "register_operand" "")) + (use (match_operand:DI 1 "nonimmediate_operand" ""))] + "(TARGET_64BIT || TARGET_KEEPS_VECTOR_ALIGNED_STACK) + && TARGET_SSE2 && TARGET_SSE_MATH" +{ + if (TARGET_64BIT) + x86_emit_floatuns (operands); + else + ix86_expand_convert_uns_didf_sse (operands[0], operands[1]); + DONE; +}) + +;; Add instructions + +(define_expand "add3" + [(set (match_operand:SDWIM 0 "nonimmediate_operand" "") + (plus:SDWIM (match_operand:SDWIM 1 "nonimmediate_operand" "") + (match_operand:SDWIM 2 "" "")))] + "" + "ix86_expand_binary_operator (PLUS, mode, operands); DONE;") + +(define_insn_and_split "*add3_doubleword" + [(set (match_operand: 0 "nonimmediate_operand" "=r,o") + (plus: + (match_operand: 1 "nonimmediate_operand" "%0,0") + (match_operand: 2 "" "ro,r"))) + (clobber (reg:CC FLAGS_REG))] + "ix86_binary_operator_ok (PLUS, mode, operands)" + "#" + "reload_completed" + [(parallel [(set (reg:CC FLAGS_REG) + (unspec:CC [(match_dup 1) (match_dup 2)] + UNSPEC_ADD_CARRY)) + (set (match_dup 0) + (plus:DWIH (match_dup 1) (match_dup 2)))]) + (parallel [(set (match_dup 3) + (plus:DWIH + (match_dup 4) + (plus:DWIH + (ltu:DWIH (reg:CC FLAGS_REG) (const_int 0)) + (match_dup 5)))) + (clobber (reg:CC FLAGS_REG))])] + "split_double_mode (mode, &operands[0], 3, &operands[0], &operands[3]);") + +(define_insn "*add3_cc" + [(set (reg:CC FLAGS_REG) + (unspec:CC + [(match_operand:SWI48 1 "nonimmediate_operand" "%0,0") + (match_operand:SWI48 2 "" "r,rm")] + UNSPEC_ADD_CARRY)) + (set (match_operand:SWI48 0 "nonimmediate_operand" "=rm,r") + (plus:SWI48 (match_dup 1) (match_dup 2)))] + "ix86_binary_operator_ok (PLUS, mode, operands)" + "add{}\t{%2, %0|%0, %2}" + [(set_attr "type" "alu") + (set_attr "mode" "")]) + +(define_insn "addqi3_cc" + [(set (reg:CC FLAGS_REG) + (unspec:CC + [(match_operand:QI 1 "nonimmediate_operand" "%0,0") + (match_operand:QI 2 "general_operand" "qn,qm")] + UNSPEC_ADD_CARRY)) + (set (match_operand:QI 0 "nonimmediate_operand" "=qm,q") + (plus:QI (match_dup 1) (match_dup 2)))] + "ix86_binary_operator_ok (PLUS, QImode, operands)" + "add{b}\t{%2, %0|%0, %2}" + [(set_attr "type" "alu") + (set_attr "mode" "QI")]) + +(define_insn "*lea_1" + [(set (match_operand:P 0 "register_operand" "=r") + (match_operand:P 1 "no_seg_address_operand" "p"))] + "" + "lea{}\t{%a1, %0|%0, %a1}" + [(set_attr "type" "lea") + (set_attr "mode" "")]) + +(define_insn "*lea_2" + [(set (match_operand:SI 0 "register_operand" "=r") + (subreg:SI (match_operand:DI 1 "no_seg_address_operand" "p") 0))] + "TARGET_64BIT" + "lea{l}\t{%a1, %0|%0, %a1}" + [(set_attr "type" "lea") + (set_attr "mode" "SI")]) + +(define_insn "*lea_2_zext" + [(set (match_operand:DI 0 "register_operand" "=r") + (zero_extend:DI + (subreg:SI (match_operand:DI 1 "no_seg_address_operand" "p") 0)))] + "TARGET_64BIT" + "lea{l}\t{%a1, %k0|%k0, %a1}" + [(set_attr "type" "lea") + (set_attr "mode" "SI")]) + +(define_insn "*add_1" + [(set (match_operand:SWI48 0 "nonimmediate_operand" "=r,rm,r,r") + (plus:SWI48 + (match_operand:SWI48 1 "nonimmediate_operand" "%0,0,r,r") + (match_operand:SWI48 2 "" ",r,0,l"))) + (clobber (reg:CC FLAGS_REG))] + "ix86_binary_operator_ok (PLUS, mode, operands)" +{ + 
switch (get_attr_type (insn)) + { + case TYPE_LEA: + return "#"; + + case TYPE_INCDEC: + gcc_assert (rtx_equal_p (operands[0], operands[1])); + if (operands[2] == const1_rtx) + return "inc{}\t%0"; + else + { + gcc_assert (operands[2] == constm1_rtx); + return "dec{}\t%0"; + } + + default: + /* For most processors, ADD is faster than LEA. This alternative + was added to use ADD as much as possible. */ + if (which_alternative == 2) + { + rtx tmp; + tmp = operands[1], operands[1] = operands[2], operands[2] = tmp; + } + + gcc_assert (rtx_equal_p (operands[0], operands[1])); + if (x86_maybe_negate_const_int (&operands[2], mode)) + return "sub{}\t{%2, %0|%0, %2}"; + + return "add{}\t{%2, %0|%0, %2}"; + } +} + [(set (attr "type") + (cond [(eq_attr "alternative" "3") + (const_string "lea") + (match_operand:SWI48 2 "incdec_operand" "") + (const_string "incdec") + ] + (const_string "alu"))) + (set (attr "length_immediate") + (if_then_else + (and (eq_attr "type" "alu") (match_operand 2 "const128_operand" "")) + (const_string "1") + (const_string "*"))) + (set_attr "mode" "")]) + +;; It may seem that nonimmediate operand is proper one for operand 1. +;; The addsi_1 pattern allows nonimmediate operand at that place and +;; we take care in ix86_binary_operator_ok to not allow two memory +;; operands so proper swapping will be done in reload. This allow +;; patterns constructed from addsi_1 to match. + +(define_insn "*addsi_1_zext" + [(set (match_operand:DI 0 "register_operand" "=r,r,r") + (zero_extend:DI + (plus:SI (match_operand:SI 1 "nonimmediate_operand" "%0,r,r") + (match_operand:SI 2 "general_operand" "g,0,li")))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_64BIT && ix86_binary_operator_ok (PLUS, SImode, operands)" +{ + switch (get_attr_type (insn)) + { + case TYPE_LEA: + return "#"; + + case TYPE_INCDEC: + if (operands[2] == const1_rtx) + return "inc{l}\t%k0"; + else + { + gcc_assert (operands[2] == constm1_rtx); + return "dec{l}\t%k0"; + } + + default: + /* For most processors, ADD is faster than LEA. This alternative + was added to use ADD as much as possible. 
*/ + if (which_alternative == 1) + { + rtx tmp; + tmp = operands[1], operands[1] = operands[2], operands[2] = tmp; + } + + if (x86_maybe_negate_const_int (&operands[2], SImode)) + return "sub{l}\t{%2, %k0|%k0, %2}"; + + return "add{l}\t{%2, %k0|%k0, %2}"; + } +} + [(set (attr "type") + (cond [(eq_attr "alternative" "2") + (const_string "lea") + (match_operand:SI 2 "incdec_operand" "") + (const_string "incdec") + ] + (const_string "alu"))) + (set (attr "length_immediate") + (if_then_else + (and (eq_attr "type" "alu") (match_operand 2 "const128_operand" "")) + (const_string "1") + (const_string "*"))) + (set_attr "mode" "SI")]) + +(define_insn "*addhi_1" + [(set (match_operand:HI 0 "nonimmediate_operand" "=rm,r") + (plus:HI (match_operand:HI 1 "nonimmediate_operand" "%0,0") + (match_operand:HI 2 "general_operand" "rn,rm"))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_PARTIAL_REG_STALL + && ix86_binary_operator_ok (PLUS, HImode, operands)" +{ + switch (get_attr_type (insn)) + { + case TYPE_INCDEC: + if (operands[2] == const1_rtx) + return "inc{w}\t%0"; + else + { + gcc_assert (operands[2] == constm1_rtx); + return "dec{w}\t%0"; + } + + default: + if (x86_maybe_negate_const_int (&operands[2], HImode)) + return "sub{w}\t{%2, %0|%0, %2}"; + + return "add{w}\t{%2, %0|%0, %2}"; + } +} + [(set (attr "type") + (if_then_else (match_operand:HI 2 "incdec_operand" "") + (const_string "incdec") + (const_string "alu"))) + (set (attr "length_immediate") + (if_then_else + (and (eq_attr "type" "alu") (match_operand 2 "const128_operand" "")) + (const_string "1") + (const_string "*"))) + (set_attr "mode" "HI")]) + +(define_insn "*addhi_1_lea" + [(set (match_operand:HI 0 "nonimmediate_operand" "=r,rm,r,r") + (plus:HI (match_operand:HI 1 "nonimmediate_operand" "%0,0,r,r") + (match_operand:HI 2 "general_operand" "rmn,rn,0,ln"))) + (clobber (reg:CC FLAGS_REG))] + "!TARGET_PARTIAL_REG_STALL + && ix86_binary_operator_ok (PLUS, HImode, operands)" +{ + switch (get_attr_type (insn)) + { + case TYPE_LEA: + return "#"; + + case TYPE_INCDEC: + gcc_assert (rtx_equal_p (operands[0], operands[1])); + if (operands[2] == const1_rtx) + return "inc{w}\t%0"; + else + { + gcc_assert (operands[2] == constm1_rtx); + return "dec{w}\t%0"; + } + + default: + /* For most processors, ADD is faster than LEA. This alternative + was added to use ADD as much as possible. */ + if (which_alternative == 2) + { + rtx tmp; + tmp = operands[1], operands[1] = operands[2], operands[2] = tmp; + } + + gcc_assert (rtx_equal_p (operands[0], operands[1])); + if (x86_maybe_negate_const_int (&operands[2], HImode)) + return "sub{w}\t{%2, %0|%0, %2}"; + + return "add{w}\t{%2, %0|%0, %2}"; + } +} + [(set (attr "type") + (cond [(eq_attr "alternative" "3") + (const_string "lea") + (match_operand:HI 2 "incdec_operand" "") + (const_string "incdec") + ] + (const_string "alu"))) + (set (attr "length_immediate") + (if_then_else + (and (eq_attr "type" "alu") (match_operand 2 "const128_operand" "")) + (const_string "1") + (const_string "*"))) + (set_attr "mode" "HI,HI,HI,SI")]) + +;; %%% Potential partial reg stall on alternative 2. What to do? 
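+;; Roughly, the alternative selection in the add patterns above and in the
+;; QImode variants below comes down to this (register names are purely
+;; illustrative): a constant 1 or -1 becomes "incl %eax" / "decl %eax", a
+;; constant 128 is emitted as "subl $-128, %eax" so the immediate still
+;; fits the sign-extended 8-bit form, and the alternative whose destination
+;; differs from both sources is split into an lea such as
+;; "leal (%eax,%ecx), %edx".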
+(define_insn "*addqi_1" + [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,q,r") + (plus:QI (match_operand:QI 1 "nonimmediate_operand" "%0,0,0") + (match_operand:QI 2 "general_operand" "qn,qmn,rn"))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_PARTIAL_REG_STALL + && ix86_binary_operator_ok (PLUS, QImode, operands)" +{ + int widen = (which_alternative == 2); + switch (get_attr_type (insn)) + { + case TYPE_INCDEC: + if (operands[2] == const1_rtx) + return widen ? "inc{l}\t%k0" : "inc{b}\t%0"; + else + { + gcc_assert (operands[2] == constm1_rtx); + return widen ? "dec{l}\t%k0" : "dec{b}\t%0"; + } + + default: + if (x86_maybe_negate_const_int (&operands[2], QImode)) + { + if (widen) + return "sub{l}\t{%2, %k0|%k0, %2}"; + else + return "sub{b}\t{%2, %0|%0, %2}"; + } + if (widen) + return "add{l}\t{%k2, %k0|%k0, %k2}"; + else + return "add{b}\t{%2, %0|%0, %2}"; + } +} + [(set (attr "type") + (if_then_else (match_operand:QI 2 "incdec_operand" "") + (const_string "incdec") + (const_string "alu"))) + (set (attr "length_immediate") + (if_then_else + (and (eq_attr "type" "alu") (match_operand 2 "const128_operand" "")) + (const_string "1") + (const_string "*"))) + (set_attr "mode" "QI,QI,SI")]) + +;; %%% Potential partial reg stall on alternatives 3 and 4. What to do? +(define_insn "*addqi_1_lea" + [(set (match_operand:QI 0 "nonimmediate_operand" "=q,qm,q,r,r,r") + (plus:QI (match_operand:QI 1 "nonimmediate_operand" "%0,0,q,0,r,r") + (match_operand:QI 2 "general_operand" "qmn,qn,0,rn,0,ln"))) + (clobber (reg:CC FLAGS_REG))] + "!TARGET_PARTIAL_REG_STALL + && ix86_binary_operator_ok (PLUS, QImode, operands)" +{ + int widen = (which_alternative == 3 || which_alternative == 4); + + switch (get_attr_type (insn)) + { + case TYPE_LEA: + return "#"; + + case TYPE_INCDEC: + gcc_assert (rtx_equal_p (operands[0], operands[1])); + if (operands[2] == const1_rtx) + return widen ? "inc{l}\t%k0" : "inc{b}\t%0"; + else + { + gcc_assert (operands[2] == constm1_rtx); + return widen ? "dec{l}\t%k0" : "dec{b}\t%0"; + } + + default: + /* For most processors, ADD is faster than LEA. These alternatives + were added to use ADD as much as possible. */ + if (which_alternative == 2 || which_alternative == 4) + { + rtx tmp; + tmp = operands[1], operands[1] = operands[2], operands[2] = tmp; + } + + gcc_assert (rtx_equal_p (operands[0], operands[1])); + if (x86_maybe_negate_const_int (&operands[2], QImode)) + { + if (widen) + return "sub{l}\t{%2, %k0|%k0, %2}"; + else + return "sub{b}\t{%2, %0|%0, %2}"; + } + if (widen) + return "add{l}\t{%k2, %k0|%k0, %k2}"; + else + return "add{b}\t{%2, %0|%0, %2}"; + } +} + [(set (attr "type") + (cond [(eq_attr "alternative" "5") + (const_string "lea") + (match_operand:QI 2 "incdec_operand" "") + (const_string "incdec") + ] + (const_string "alu"))) + (set (attr "length_immediate") + (if_then_else + (and (eq_attr "type" "alu") (match_operand 2 "const128_operand" "")) + (const_string "1") + (const_string "*"))) + (set_attr "mode" "QI,QI,QI,SI,SI,SI")]) + +(define_insn "*addqi_1_slp" + [(set (strict_low_part (match_operand:QI 0 "nonimmediate_operand" "+qm,q")) + (plus:QI (match_dup 0) + (match_operand:QI 1 "general_operand" "qn,qnm"))) + (clobber (reg:CC FLAGS_REG))] + "(! 
TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun)) + && !(MEM_P (operands[0]) && MEM_P (operands[1]))" +{ + switch (get_attr_type (insn)) + { + case TYPE_INCDEC: + if (operands[1] == const1_rtx) + return "inc{b}\t%0"; + else + { + gcc_assert (operands[1] == constm1_rtx); + return "dec{b}\t%0"; + } + + default: + if (x86_maybe_negate_const_int (&operands[1], QImode)) + return "sub{b}\t{%1, %0|%0, %1}"; + + return "add{b}\t{%1, %0|%0, %1}"; + } +} + [(set (attr "type") + (if_then_else (match_operand:QI 1 "incdec_operand" "") + (const_string "incdec") + (const_string "alu1"))) + (set (attr "memory") + (if_then_else (match_operand 1 "memory_operand" "") + (const_string "load") + (const_string "none"))) + (set_attr "mode" "QI")]) + +;; Convert lea to the lea pattern to avoid flags dependency. +(define_split + [(set (match_operand 0 "register_operand" "") + (plus (match_operand 1 "register_operand" "") + (match_operand 2 "nonmemory_operand" ""))) + (clobber (reg:CC FLAGS_REG))] + "reload_completed && ix86_lea_for_add_ok (insn, operands)" + [(const_int 0)] +{ + rtx pat; + enum machine_mode mode = GET_MODE (operands[0]); + + /* In -fPIC mode the constructs like (const (unspec [symbol_ref])) + may confuse gen_lowpart. */ + if (mode != Pmode) + { + operands[1] = gen_lowpart (Pmode, operands[1]); + operands[2] = gen_lowpart (Pmode, operands[2]); + } + + pat = gen_rtx_PLUS (Pmode, operands[1], operands[2]); + + if (GET_MODE_SIZE (mode) < GET_MODE_SIZE (SImode)) + operands[0] = gen_lowpart (SImode, operands[0]); + + if (TARGET_64BIT && mode != Pmode) + pat = gen_rtx_SUBREG (SImode, pat, 0); + + emit_insn (gen_rtx_SET (VOIDmode, operands[0], pat)); + DONE; +}) + +;; Convert lea to the lea pattern to avoid flags dependency. +;; ??? This pattern handles immediate operands that do not satisfy immediate +;; operand predicate (LEGITIMATE_CONSTANT_P) in the previous pattern. +(define_split + [(set (match_operand:DI 0 "register_operand" "") + (plus:DI (match_operand:DI 1 "register_operand" "") + (match_operand:DI 2 "x86_64_immediate_operand" ""))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_64BIT && reload_completed + && true_regnum (operands[0]) != true_regnum (operands[1])" + [(set (match_dup 0) + (plus:DI (match_dup 1) (match_dup 2)))]) + +;; Convert lea to the lea pattern to avoid flags dependency. 
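+;; (The point of these splits: unlike add, lea does not write FLAGS_REG,
+;; so rewriting e.g. (set (reg:SI dx) (plus:SI (reg:SI ax) (const_int 4)))
+;; as "leal 4(%eax), %edx" after reload removes a false dependency on the
+;; flags and avoids a separate mov when the destination differs from the
+;; source.  Register names here are illustrative only.)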
+(define_split + [(set (match_operand:DI 0 "register_operand" "") + (zero_extend:DI + (plus:SI (match_operand:SI 1 "register_operand" "") + (match_operand:SI 2 "nonmemory_operand" "")))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_64BIT && reload_completed + && ix86_lea_for_add_ok (insn, operands)" + [(set (match_dup 0) + (zero_extend:DI (subreg:SI (plus:DI (match_dup 1) (match_dup 2)) 0)))] +{ + operands[1] = gen_lowpart (DImode, operands[1]); + operands[2] = gen_lowpart (DImode, operands[2]); +}) + +(define_insn "*add_2" + [(set (reg FLAGS_REG) + (compare + (plus:SWI + (match_operand:SWI 1 "nonimmediate_operand" "%0,0") + (match_operand:SWI 2 "" ",")) + (const_int 0))) + (set (match_operand:SWI 0 "nonimmediate_operand" "=,m") + (plus:SWI (match_dup 1) (match_dup 2)))] + "ix86_match_ccmode (insn, CCGOCmode) + && ix86_binary_operator_ok (PLUS, mode, operands)" +{ + switch (get_attr_type (insn)) + { + case TYPE_INCDEC: + if (operands[2] == const1_rtx) + return "inc{}\t%0"; + else + { + gcc_assert (operands[2] == constm1_rtx); + return "dec{}\t%0"; + } + + default: + if (x86_maybe_negate_const_int (&operands[2], mode)) + return "sub{}\t{%2, %0|%0, %2}"; + + return "add{}\t{%2, %0|%0, %2}"; + } +} + [(set (attr "type") + (if_then_else (match_operand:SWI 2 "incdec_operand" "") + (const_string "incdec") + (const_string "alu"))) + (set (attr "length_immediate") + (if_then_else + (and (eq_attr "type" "alu") (match_operand 2 "const128_operand" "")) + (const_string "1") + (const_string "*"))) + (set_attr "mode" "")]) + +;; See comment for addsi_1_zext why we do use nonimmediate_operand +(define_insn "*addsi_2_zext" + [(set (reg FLAGS_REG) + (compare + (plus:SI (match_operand:SI 1 "nonimmediate_operand" "%0") + (match_operand:SI 2 "general_operand" "g")) + (const_int 0))) + (set (match_operand:DI 0 "register_operand" "=r") + (zero_extend:DI (plus:SI (match_dup 1) (match_dup 2))))] + "TARGET_64BIT && ix86_match_ccmode (insn, CCGOCmode) + && ix86_binary_operator_ok (PLUS, SImode, operands)" +{ + switch (get_attr_type (insn)) + { + case TYPE_INCDEC: + if (operands[2] == const1_rtx) + return "inc{l}\t%k0"; + else + { + gcc_assert (operands[2] == constm1_rtx); + return "dec{l}\t%k0"; + } + + default: + if (x86_maybe_negate_const_int (&operands[2], SImode)) + return "sub{l}\t{%2, %k0|%k0, %2}"; + + return "add{l}\t{%2, %k0|%k0, %2}"; + } +} + [(set (attr "type") + (if_then_else (match_operand:SI 2 "incdec_operand" "") + (const_string "incdec") + (const_string "alu"))) + (set (attr "length_immediate") + (if_then_else + (and (eq_attr "type" "alu") (match_operand 2 "const128_operand" "")) + (const_string "1") + (const_string "*"))) + (set_attr "mode" "SI")]) + +(define_insn "*add_3" + [(set (reg FLAGS_REG) + (compare + (neg:SWI (match_operand:SWI 2 "" "")) + (match_operand:SWI 1 "nonimmediate_operand" "%0"))) + (clobber (match_scratch:SWI 0 "="))] + "ix86_match_ccmode (insn, CCZmode) + && !(MEM_P (operands[1]) && MEM_P (operands[2]))" +{ + switch (get_attr_type (insn)) + { + case TYPE_INCDEC: + if (operands[2] == const1_rtx) + return "inc{}\t%0"; + else + { + gcc_assert (operands[2] == constm1_rtx); + return "dec{}\t%0"; + } + + default: + if (x86_maybe_negate_const_int (&operands[2], mode)) + return "sub{}\t{%2, %0|%0, %2}"; + + return "add{}\t{%2, %0|%0, %2}"; + } +} + [(set (attr "type") + (if_then_else (match_operand:SWI 2 "incdec_operand" "") + (const_string "incdec") + (const_string "alu"))) + (set (attr "length_immediate") + (if_then_else + (and (eq_attr "type" "alu") (match_operand 2 "const128_operand" 
"")) + (const_string "1") + (const_string "*"))) + (set_attr "mode" "")]) + +;; See comment for addsi_1_zext why we do use nonimmediate_operand +(define_insn "*addsi_3_zext" + [(set (reg FLAGS_REG) + (compare + (neg:SI (match_operand:SI 2 "general_operand" "g")) + (match_operand:SI 1 "nonimmediate_operand" "%0"))) + (set (match_operand:DI 0 "register_operand" "=r") + (zero_extend:DI (plus:SI (match_dup 1) (match_dup 2))))] + "TARGET_64BIT && ix86_match_ccmode (insn, CCZmode) + && ix86_binary_operator_ok (PLUS, SImode, operands)" +{ + switch (get_attr_type (insn)) + { + case TYPE_INCDEC: + if (operands[2] == const1_rtx) + return "inc{l}\t%k0"; + else + { + gcc_assert (operands[2] == constm1_rtx); + return "dec{l}\t%k0"; + } + + default: + if (x86_maybe_negate_const_int (&operands[2], SImode)) + return "sub{l}\t{%2, %k0|%k0, %2}"; + + return "add{l}\t{%2, %k0|%k0, %2}"; + } +} + [(set (attr "type") + (if_then_else (match_operand:SI 2 "incdec_operand" "") + (const_string "incdec") + (const_string "alu"))) + (set (attr "length_immediate") + (if_then_else + (and (eq_attr "type" "alu") (match_operand 2 "const128_operand" "")) + (const_string "1") + (const_string "*"))) + (set_attr "mode" "SI")]) + +; For comparisons against 1, -1 and 128, we may generate better code +; by converting cmp to add, inc or dec as done by peephole2. This pattern +; is matched then. We can't accept general immediate, because for +; case of overflows, the result is messed up. +; Also carry flag is reversed compared to cmp, so this conversion is valid +; only for comparisons not depending on it. + +(define_insn "*adddi_4" + [(set (reg FLAGS_REG) + (compare + (match_operand:DI 1 "nonimmediate_operand" "0") + (match_operand:DI 2 "x86_64_immediate_operand" "e"))) + (clobber (match_scratch:DI 0 "=rm"))] + "TARGET_64BIT + && ix86_match_ccmode (insn, CCGCmode)" +{ + switch (get_attr_type (insn)) + { + case TYPE_INCDEC: + if (operands[2] == constm1_rtx) + return "inc{q}\t%0"; + else + { + gcc_assert (operands[2] == const1_rtx); + return "dec{q}\t%0"; + } + + default: + if (x86_maybe_negate_const_int (&operands[2], DImode)) + return "add{q}\t{%2, %0|%0, %2}"; + + return "sub{q}\t{%2, %0|%0, %2}"; + } +} + [(set (attr "type") + (if_then_else (match_operand:DI 2 "incdec_operand" "") + (const_string "incdec") + (const_string "alu"))) + (set (attr "length_immediate") + (if_then_else + (and (eq_attr "type" "alu") (match_operand 2 "const128_operand" "")) + (const_string "1") + (const_string "*"))) + (set_attr "mode" "DI")]) + +; For comparisons against 1, -1 and 128, we may generate better code +; by converting cmp to add, inc or dec as done by peephole2. This pattern +; is matched then. We can't accept general immediate, because for +; case of overflows, the result is messed up. +; Also carry flag is reversed compared to cmp, so this conversion is valid +; only for comparisons not depending on it. 
+ +(define_insn "*add_4" + [(set (reg FLAGS_REG) + (compare + (match_operand:SWI124 1 "nonimmediate_operand" "0") + (match_operand:SWI124 2 "const_int_operand" "n"))) + (clobber (match_scratch:SWI124 0 "=m"))] + "ix86_match_ccmode (insn, CCGCmode)" +{ + switch (get_attr_type (insn)) + { + case TYPE_INCDEC: + if (operands[2] == constm1_rtx) + return "inc{}\t%0"; + else + { + gcc_assert (operands[2] == const1_rtx); + return "dec{}\t%0"; + } + + default: + if (x86_maybe_negate_const_int (&operands[2], mode)) + return "add{}\t{%2, %0|%0, %2}"; + + return "sub{}\t{%2, %0|%0, %2}"; + } +} + [(set (attr "type") + (if_then_else (match_operand: 2 "incdec_operand" "") + (const_string "incdec") + (const_string "alu"))) + (set (attr "length_immediate") + (if_then_else + (and (eq_attr "type" "alu") (match_operand 2 "const128_operand" "")) + (const_string "1") + (const_string "*"))) + (set_attr "mode" "")]) + +(define_insn "*add_5" + [(set (reg FLAGS_REG) + (compare + (plus:SWI + (match_operand:SWI 1 "nonimmediate_operand" "%0") + (match_operand:SWI 2 "" "")) + (const_int 0))) + (clobber (match_scratch:SWI 0 "="))] + "ix86_match_ccmode (insn, CCGOCmode) + && !(MEM_P (operands[1]) && MEM_P (operands[2]))" +{ + switch (get_attr_type (insn)) + { + case TYPE_INCDEC: + if (operands[2] == const1_rtx) + return "inc{}\t%0"; + else + { + gcc_assert (operands[2] == constm1_rtx); + return "dec{}\t%0"; + } + + default: + if (x86_maybe_negate_const_int (&operands[2], mode)) + return "sub{}\t{%2, %0|%0, %2}"; + + return "add{}\t{%2, %0|%0, %2}"; + } +} + [(set (attr "type") + (if_then_else (match_operand:SWI 2 "incdec_operand" "") + (const_string "incdec") + (const_string "alu"))) + (set (attr "length_immediate") + (if_then_else + (and (eq_attr "type" "alu") (match_operand 2 "const128_operand" "")) + (const_string "1") + (const_string "*"))) + (set_attr "mode" "")]) + +(define_insn "*addqi_ext_1_rex64" + [(set (zero_extract:SI (match_operand 0 "ext_register_operand" "=Q") + (const_int 8) + (const_int 8)) + (plus:SI + (zero_extract:SI + (match_operand 1 "ext_register_operand" "0") + (const_int 8) + (const_int 8)) + (match_operand:QI 2 "nonmemory_operand" "Qn"))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_64BIT" +{ + switch (get_attr_type (insn)) + { + case TYPE_INCDEC: + if (operands[2] == const1_rtx) + return "inc{b}\t%h0"; + else + { + gcc_assert (operands[2] == constm1_rtx); + return "dec{b}\t%h0"; + } + + default: + return "add{b}\t{%2, %h0|%h0, %2}"; + } +} + [(set (attr "type") + (if_then_else (match_operand:QI 2 "incdec_operand" "") + (const_string "incdec") + (const_string "alu"))) + (set_attr "modrm" "1") + (set_attr "mode" "QI")]) + +(define_insn "addqi_ext_1" + [(set (zero_extract:SI (match_operand 0 "ext_register_operand" "=Q") + (const_int 8) + (const_int 8)) + (plus:SI + (zero_extract:SI + (match_operand 1 "ext_register_operand" "0") + (const_int 8) + (const_int 8)) + (match_operand:QI 2 "general_operand" "Qmn"))) + (clobber (reg:CC FLAGS_REG))] + "!TARGET_64BIT" +{ + switch (get_attr_type (insn)) + { + case TYPE_INCDEC: + if (operands[2] == const1_rtx) + return "inc{b}\t%h0"; + else + { + gcc_assert (operands[2] == constm1_rtx); + return "dec{b}\t%h0"; + } + + default: + return "add{b}\t{%2, %h0|%h0, %2}"; + } +} + [(set (attr "type") + (if_then_else (match_operand:QI 2 "incdec_operand" "") + (const_string "incdec") + (const_string "alu"))) + (set_attr "modrm" "1") + (set_attr "mode" "QI")]) + +(define_insn "*addqi_ext_2" + [(set (zero_extract:SI (match_operand 0 "ext_register_operand" "=Q") + (const_int 
8) + (const_int 8)) + (plus:SI + (zero_extract:SI + (match_operand 1 "ext_register_operand" "%0") + (const_int 8) + (const_int 8)) + (zero_extract:SI + (match_operand 2 "ext_register_operand" "Q") + (const_int 8) + (const_int 8)))) + (clobber (reg:CC FLAGS_REG))] + "" + "add{b}\t{%h2, %h0|%h0, %h2}" + [(set_attr "type" "alu") + (set_attr "mode" "QI")]) + +;; The lea patterns for non-Pmodes needs to be matched by +;; several insns converted to real lea by splitters. + +(define_insn_and_split "*lea_general_1" + [(set (match_operand 0 "register_operand" "=r") + (plus (plus (match_operand 1 "index_register_operand" "l") + (match_operand 2 "register_operand" "r")) + (match_operand 3 "immediate_operand" "i")))] + "(GET_MODE (operands[0]) == QImode || GET_MODE (operands[0]) == HImode + || (TARGET_64BIT && GET_MODE (operands[0]) == SImode)) + && (!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun)) + && GET_MODE (operands[0]) == GET_MODE (operands[1]) + && GET_MODE (operands[0]) == GET_MODE (operands[2]) + && (GET_MODE (operands[0]) == GET_MODE (operands[3]) + || GET_MODE (operands[3]) == VOIDmode)" + "#" + "&& reload_completed" + [(const_int 0)] +{ + rtx pat; + operands[0] = gen_lowpart (SImode, operands[0]); + operands[1] = gen_lowpart (Pmode, operands[1]); + operands[2] = gen_lowpart (Pmode, operands[2]); + operands[3] = gen_lowpart (Pmode, operands[3]); + pat = gen_rtx_PLUS (Pmode, gen_rtx_PLUS (Pmode, operands[1], operands[2]), + operands[3]); + if (Pmode != SImode) + pat = gen_rtx_SUBREG (SImode, pat, 0); + emit_insn (gen_rtx_SET (VOIDmode, operands[0], pat)); + DONE; +} + [(set_attr "type" "lea") + (set_attr "mode" "SI")]) + +(define_insn_and_split "*lea_general_1_zext" + [(set (match_operand:DI 0 "register_operand" "=r") + (zero_extend:DI + (plus:SI (plus:SI + (match_operand:SI 1 "index_register_operand" "l") + (match_operand:SI 2 "register_operand" "r")) + (match_operand:SI 3 "immediate_operand" "i"))))] + "TARGET_64BIT" + "#" + "&& reload_completed" + [(set (match_dup 0) + (zero_extend:DI (subreg:SI (plus:DI (plus:DI (match_dup 1) + (match_dup 2)) + (match_dup 3)) 0)))] +{ + operands[1] = gen_lowpart (Pmode, operands[1]); + operands[2] = gen_lowpart (Pmode, operands[2]); + operands[3] = gen_lowpart (Pmode, operands[3]); +} + [(set_attr "type" "lea") + (set_attr "mode" "SI")]) + +(define_insn_and_split "*lea_general_2" + [(set (match_operand 0 "register_operand" "=r") + (plus (mult (match_operand 1 "index_register_operand" "l") + (match_operand 2 "const248_operand" "i")) + (match_operand 3 "nonmemory_operand" "ri")))] + "(GET_MODE (operands[0]) == QImode || GET_MODE (operands[0]) == HImode + || (TARGET_64BIT && GET_MODE (operands[0]) == SImode)) + && (!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun)) + && GET_MODE (operands[0]) == GET_MODE (operands[1]) + && (GET_MODE (operands[0]) == GET_MODE (operands[3]) + || GET_MODE (operands[3]) == VOIDmode)" + "#" + "&& reload_completed" + [(const_int 0)] +{ + rtx pat; + operands[0] = gen_lowpart (SImode, operands[0]); + operands[1] = gen_lowpart (Pmode, operands[1]); + operands[3] = gen_lowpart (Pmode, operands[3]); + pat = gen_rtx_PLUS (Pmode, gen_rtx_MULT (Pmode, operands[1], operands[2]), + operands[3]); + if (Pmode != SImode) + pat = gen_rtx_SUBREG (SImode, pat, 0); + emit_insn (gen_rtx_SET (VOIDmode, operands[0], pat)); + DONE; +} + [(set_attr "type" "lea") + (set_attr "mode" "SI")]) + +(define_insn_and_split "*lea_general_2_zext" + [(set (match_operand:DI 0 "register_operand" "=r") + (zero_extend:DI + (plus:SI 
(mult:SI + (match_operand:SI 1 "index_register_operand" "l") + (match_operand:SI 2 "const248_operand" "n")) + (match_operand:SI 3 "nonmemory_operand" "ri"))))] + "TARGET_64BIT" + "#" + "&& reload_completed" + [(set (match_dup 0) + (zero_extend:DI (subreg:SI (plus:DI (mult:DI (match_dup 1) + (match_dup 2)) + (match_dup 3)) 0)))] +{ + operands[1] = gen_lowpart (Pmode, operands[1]); + operands[3] = gen_lowpart (Pmode, operands[3]); +} + [(set_attr "type" "lea") + (set_attr "mode" "SI")]) + +(define_insn_and_split "*lea_general_3" + [(set (match_operand 0 "register_operand" "=r") + (plus (plus (mult (match_operand 1 "index_register_operand" "l") + (match_operand 2 "const248_operand" "i")) + (match_operand 3 "register_operand" "r")) + (match_operand 4 "immediate_operand" "i")))] + "(GET_MODE (operands[0]) == QImode || GET_MODE (operands[0]) == HImode + || (TARGET_64BIT && GET_MODE (operands[0]) == SImode)) + && (!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun)) + && GET_MODE (operands[0]) == GET_MODE (operands[1]) + && GET_MODE (operands[0]) == GET_MODE (operands[3])" + "#" + "&& reload_completed" + [(const_int 0)] +{ + rtx pat; + operands[0] = gen_lowpart (SImode, operands[0]); + operands[1] = gen_lowpart (Pmode, operands[1]); + operands[3] = gen_lowpart (Pmode, operands[3]); + operands[4] = gen_lowpart (Pmode, operands[4]); + pat = gen_rtx_PLUS (Pmode, + gen_rtx_PLUS (Pmode, gen_rtx_MULT (Pmode, operands[1], + operands[2]), + operands[3]), + operands[4]); + if (Pmode != SImode) + pat = gen_rtx_SUBREG (SImode, pat, 0); + emit_insn (gen_rtx_SET (VOIDmode, operands[0], pat)); + DONE; +} + [(set_attr "type" "lea") + (set_attr "mode" "SI")]) + +(define_insn_and_split "*lea_general_3_zext" + [(set (match_operand:DI 0 "register_operand" "=r") + (zero_extend:DI + (plus:SI (plus:SI + (mult:SI + (match_operand:SI 1 "index_register_operand" "l") + (match_operand:SI 2 "const248_operand" "n")) + (match_operand:SI 3 "register_operand" "r")) + (match_operand:SI 4 "immediate_operand" "i"))))] + "TARGET_64BIT" + "#" + "&& reload_completed" + [(set (match_dup 0) + (zero_extend:DI (subreg:SI (plus:DI (plus:DI (mult:DI (match_dup 1) + (match_dup 2)) + (match_dup 3)) + (match_dup 4)) 0)))] +{ + operands[1] = gen_lowpart (Pmode, operands[1]); + operands[3] = gen_lowpart (Pmode, operands[3]); + operands[4] = gen_lowpart (Pmode, operands[4]); +} + [(set_attr "type" "lea") + (set_attr "mode" "SI")]) + +;; Subtract instructions + +(define_expand "sub3" + [(set (match_operand:SDWIM 0 "nonimmediate_operand" "") + (minus:SDWIM (match_operand:SDWIM 1 "nonimmediate_operand" "") + (match_operand:SDWIM 2 "" "")))] + "" + "ix86_expand_binary_operator (MINUS, mode, operands); DONE;") + +(define_insn_and_split "*sub3_doubleword" + [(set (match_operand: 0 "nonimmediate_operand" "=r,o") + (minus: + (match_operand: 1 "nonimmediate_operand" "0,0") + (match_operand: 2 "" "ro,r"))) + (clobber (reg:CC FLAGS_REG))] + "ix86_binary_operator_ok (MINUS, mode, operands)" + "#" + "reload_completed" + [(parallel [(set (reg:CC FLAGS_REG) + (compare:CC (match_dup 1) (match_dup 2))) + (set (match_dup 0) + (minus:DWIH (match_dup 1) (match_dup 2)))]) + (parallel [(set (match_dup 3) + (minus:DWIH + (match_dup 4) + (plus:DWIH + (ltu:DWIH (reg:CC FLAGS_REG) (const_int 0)) + (match_dup 5)))) + (clobber (reg:CC FLAGS_REG))])] + "split_double_mode (mode, &operands[0], 3, &operands[0], &operands[3]);") + +(define_insn "*sub_1" + [(set (match_operand:SWI 0 "nonimmediate_operand" "=m,") + (minus:SWI + (match_operand:SWI 1 
"nonimmediate_operand" "0,0") + (match_operand:SWI 2 "" ",m"))) + (clobber (reg:CC FLAGS_REG))] + "ix86_binary_operator_ok (MINUS, mode, operands)" + "sub{}\t{%2, %0|%0, %2}" + [(set_attr "type" "alu") + (set_attr "mode" "")]) + +(define_insn "*subsi_1_zext" + [(set (match_operand:DI 0 "register_operand" "=r") + (zero_extend:DI + (minus:SI (match_operand:SI 1 "register_operand" "0") + (match_operand:SI 2 "general_operand" "g")))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_64BIT && ix86_binary_operator_ok (MINUS, SImode, operands)" + "sub{l}\t{%2, %k0|%k0, %2}" + [(set_attr "type" "alu") + (set_attr "mode" "SI")]) + +(define_insn "*subqi_1_slp" + [(set (strict_low_part (match_operand:QI 0 "nonimmediate_operand" "+qm,q")) + (minus:QI (match_dup 0) + (match_operand:QI 1 "general_operand" "qn,qm"))) + (clobber (reg:CC FLAGS_REG))] + "(! TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun)) + && !(MEM_P (operands[0]) && MEM_P (operands[1]))" + "sub{b}\t{%1, %0|%0, %1}" + [(set_attr "type" "alu1") + (set_attr "mode" "QI")]) + +(define_insn "*sub_2" + [(set (reg FLAGS_REG) + (compare + (minus:SWI + (match_operand:SWI 1 "nonimmediate_operand" "0,0") + (match_operand:SWI 2 "" ",m")) + (const_int 0))) + (set (match_operand:SWI 0 "nonimmediate_operand" "=m,") + (minus:SWI (match_dup 1) (match_dup 2)))] + "ix86_match_ccmode (insn, CCGOCmode) + && ix86_binary_operator_ok (MINUS, mode, operands)" + "sub{}\t{%2, %0|%0, %2}" + [(set_attr "type" "alu") + (set_attr "mode" "")]) + +(define_insn "*subsi_2_zext" + [(set (reg FLAGS_REG) + (compare + (minus:SI (match_operand:SI 1 "register_operand" "0") + (match_operand:SI 2 "general_operand" "g")) + (const_int 0))) + (set (match_operand:DI 0 "register_operand" "=r") + (zero_extend:DI + (minus:SI (match_dup 1) + (match_dup 2))))] + "TARGET_64BIT && ix86_match_ccmode (insn, CCGOCmode) + && ix86_binary_operator_ok (MINUS, SImode, operands)" + "sub{l}\t{%2, %k0|%k0, %2}" + [(set_attr "type" "alu") + (set_attr "mode" "SI")]) + +(define_insn "*sub_3" + [(set (reg FLAGS_REG) + (compare (match_operand:SWI 1 "nonimmediate_operand" "0,0") + (match_operand:SWI 2 "" ",m"))) + (set (match_operand:SWI 0 "nonimmediate_operand" "=m,") + (minus:SWI (match_dup 1) (match_dup 2)))] + "ix86_match_ccmode (insn, CCmode) + && ix86_binary_operator_ok (MINUS, mode, operands)" + "sub{}\t{%2, %0|%0, %2}" + [(set_attr "type" "alu") + (set_attr "mode" "")]) + +(define_insn "*subsi_3_zext" + [(set (reg FLAGS_REG) + (compare (match_operand:SI 1 "register_operand" "0") + (match_operand:SI 2 "general_operand" "g"))) + (set (match_operand:DI 0 "register_operand" "=r") + (zero_extend:DI + (minus:SI (match_dup 1) + (match_dup 2))))] + "TARGET_64BIT && ix86_match_ccmode (insn, CCmode) + && ix86_binary_operator_ok (MINUS, SImode, operands)" + "sub{l}\t{%2, %1|%1, %2}" + [(set_attr "type" "alu") + (set_attr "mode" "SI")]) + +;; Add with carry and subtract with borrow + +(define_expand "3_carry" + [(parallel + [(set (match_operand:SWI 0 "nonimmediate_operand" "") + (plusminus:SWI + (match_operand:SWI 1 "nonimmediate_operand" "") + (plus:SWI (match_operator:SWI 4 "ix86_carry_flag_operator" + [(match_operand 3 "flags_reg_operand" "") + (const_int 0)]) + (match_operand:SWI 2 "" "")))) + (clobber (reg:CC FLAGS_REG))])] + "ix86_binary_operator_ok (, mode, operands)") + +(define_insn "*3_carry" + [(set (match_operand:SWI 0 "nonimmediate_operand" "=m,") + (plusminus:SWI + (match_operand:SWI 1 "nonimmediate_operand" "0,0") + (plus:SWI + (match_operator 3 "ix86_carry_flag_operator" + [(reg FLAGS_REG) 
(const_int 0)]) + (match_operand:SWI 2 "" ",m")))) + (clobber (reg:CC FLAGS_REG))] + "ix86_binary_operator_ok (PLUS, mode, operands)" + "{}\t{%2, %0|%0, %2}" + [(set_attr "type" "alu") + (set_attr "use_carry" "1") + (set_attr "pent_pair" "pu") + (set_attr "mode" "")]) + +(define_insn "*addsi3_carry_zext" + [(set (match_operand:DI 0 "register_operand" "=r") + (zero_extend:DI + (plus:SI (match_operand:SI 1 "nonimmediate_operand" "%0") + (plus:SI (match_operator 3 "ix86_carry_flag_operator" + [(reg FLAGS_REG) (const_int 0)]) + (match_operand:SI 2 "general_operand" "g"))))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_64BIT && ix86_binary_operator_ok (PLUS, SImode, operands)" + "adc{l}\t{%2, %k0|%k0, %2}" + [(set_attr "type" "alu") + (set_attr "use_carry" "1") + (set_attr "pent_pair" "pu") + (set_attr "mode" "SI")]) + +(define_insn "*subsi3_carry_zext" + [(set (match_operand:DI 0 "register_operand" "=r") + (zero_extend:DI + (minus:SI (match_operand:SI 1 "register_operand" "0") + (plus:SI (match_operator 3 "ix86_carry_flag_operator" + [(reg FLAGS_REG) (const_int 0)]) + (match_operand:SI 2 "general_operand" "g"))))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_64BIT && ix86_binary_operator_ok (MINUS, SImode, operands)" + "sbb{l}\t{%2, %k0|%k0, %2}" + [(set_attr "type" "alu") + (set_attr "pent_pair" "pu") + (set_attr "mode" "SI")]) + +;; Overflow setting add and subtract instructions + +(define_insn "*add3_cconly_overflow" + [(set (reg:CCC FLAGS_REG) + (compare:CCC + (plus:SWI + (match_operand:SWI 1 "nonimmediate_operand" "%0") + (match_operand:SWI 2 "" "")) + (match_dup 1))) + (clobber (match_scratch:SWI 0 "="))] + "!(MEM_P (operands[1]) && MEM_P (operands[2]))" + "add{}\t{%2, %0|%0, %2}" + [(set_attr "type" "alu") + (set_attr "mode" "")]) + +(define_insn "*sub3_cconly_overflow" + [(set (reg:CCC FLAGS_REG) + (compare:CCC + (minus:SWI + (match_operand:SWI 0 "nonimmediate_operand" "m,") + (match_operand:SWI 1 "" ",m")) + (match_dup 0)))] + "" + "cmp{}\t{%1, %0|%0, %1}" + [(set_attr "type" "icmp") + (set_attr "mode" "")]) + +(define_insn "*3_cc_overflow" + [(set (reg:CCC FLAGS_REG) + (compare:CCC + (plusminus:SWI + (match_operand:SWI 1 "nonimmediate_operand" "0,0") + (match_operand:SWI 2 "" ",m")) + (match_dup 1))) + (set (match_operand:SWI 0 "nonimmediate_operand" "=m,") + (plusminus:SWI (match_dup 1) (match_dup 2)))] + "ix86_binary_operator_ok (, mode, operands)" + "{}\t{%2, %0|%0, %2}" + [(set_attr "type" "alu") + (set_attr "mode" "")]) + +(define_insn "*si3_zext_cc_overflow" + [(set (reg:CCC FLAGS_REG) + (compare:CCC + (plusminus:SI + (match_operand:SI 1 "nonimmediate_operand" "0") + (match_operand:SI 2 "general_operand" "g")) + (match_dup 1))) + (set (match_operand:DI 0 "register_operand" "=r") + (zero_extend:DI (plusminus:SI (match_dup 1) (match_dup 2))))] + "TARGET_64BIT && ix86_binary_operator_ok (, SImode, operands)" + "{l}\t{%2, %k0|%k0, %2}" + [(set_attr "type" "alu") + (set_attr "mode" "SI")]) + +;; The patterns that match these are at the end of this file. 
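+;; (For reference, the carry patterns above are what the doubleword add/sub
+;; splitters earlier in this section produce; e.g. a DImode addition on
+;; ia32 with the halves in %eax/%edx and %ecx/%ebx ends up as
+;;	addl	%ecx, %eax
+;;	adcl	%ebx, %edx
+;; and the subtraction case uses subl/sbbl the same way.  Registers here
+;; are illustrative only.)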
+ +(define_expand "xf3" + [(set (match_operand:XF 0 "register_operand" "") + (plusminus:XF + (match_operand:XF 1 "register_operand" "") + (match_operand:XF 2 "register_operand" "")))] + "TARGET_80387") + +(define_expand "3" + [(set (match_operand:MODEF 0 "register_operand" "") + (plusminus:MODEF + (match_operand:MODEF 1 "register_operand" "") + (match_operand:MODEF 2 "nonimmediate_operand" "")))] + "(TARGET_80387 && X87_ENABLE_ARITH (mode)) + || (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)") + +;; Multiply instructions + +(define_expand "mul3" + [(parallel [(set (match_operand:SWIM248 0 "register_operand" "") + (mult:SWIM248 + (match_operand:SWIM248 1 "register_operand" "") + (match_operand:SWIM248 2 "" ""))) + (clobber (reg:CC FLAGS_REG))])]) + +(define_expand "mulqi3" + [(parallel [(set (match_operand:QI 0 "register_operand" "") + (mult:QI + (match_operand:QI 1 "register_operand" "") + (match_operand:QI 2 "nonimmediate_operand" ""))) + (clobber (reg:CC FLAGS_REG))])] + "TARGET_QIMODE_MATH") + +;; On AMDFAM10 +;; IMUL reg32/64, reg32/64, imm8 Direct +;; IMUL reg32/64, mem32/64, imm8 VectorPath +;; IMUL reg32/64, reg32/64, imm32 Direct +;; IMUL reg32/64, mem32/64, imm32 VectorPath +;; IMUL reg32/64, reg32/64 Direct +;; IMUL reg32/64, mem32/64 Direct +;; +;; On BDVER1, all above IMULs use DirectPath + +(define_insn "*mul3_1" + [(set (match_operand:SWI48 0 "register_operand" "=r,r,r") + (mult:SWI48 + (match_operand:SWI48 1 "nonimmediate_operand" "%rm,rm,0") + (match_operand:SWI48 2 "" "K,,mr"))) + (clobber (reg:CC FLAGS_REG))] + "!(MEM_P (operands[1]) && MEM_P (operands[2]))" + "@ + imul{}\t{%2, %1, %0|%0, %1, %2} + imul{}\t{%2, %1, %0|%0, %1, %2} + imul{}\t{%2, %0|%0, %2}" + [(set_attr "type" "imul") + (set_attr "prefix_0f" "0,0,1") + (set (attr "athlon_decode") + (cond [(eq_attr "cpu" "athlon") + (const_string "vector") + (eq_attr "alternative" "1") + (const_string "vector") + (and (eq_attr "alternative" "2") + (match_operand 1 "memory_operand" "")) + (const_string "vector")] + (const_string "direct"))) + (set (attr "amdfam10_decode") + (cond [(and (eq_attr "alternative" "0,1") + (match_operand 1 "memory_operand" "")) + (const_string "vector")] + (const_string "direct"))) + (set_attr "bdver1_decode" "direct") + (set_attr "mode" "")]) + +(define_insn "*mulsi3_1_zext" + [(set (match_operand:DI 0 "register_operand" "=r,r,r") + (zero_extend:DI + (mult:SI (match_operand:SI 1 "nonimmediate_operand" "%rm,rm,0") + (match_operand:SI 2 "general_operand" "K,i,mr")))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_64BIT + && !(MEM_P (operands[1]) && MEM_P (operands[2]))" + "@ + imul{l}\t{%2, %1, %k0|%k0, %1, %2} + imul{l}\t{%2, %1, %k0|%k0, %1, %2} + imul{l}\t{%2, %k0|%k0, %2}" + [(set_attr "type" "imul") + (set_attr "prefix_0f" "0,0,1") + (set (attr "athlon_decode") + (cond [(eq_attr "cpu" "athlon") + (const_string "vector") + (eq_attr "alternative" "1") + (const_string "vector") + (and (eq_attr "alternative" "2") + (match_operand 1 "memory_operand" "")) + (const_string "vector")] + (const_string "direct"))) + (set (attr "amdfam10_decode") + (cond [(and (eq_attr "alternative" "0,1") + (match_operand 1 "memory_operand" "")) + (const_string "vector")] + (const_string "direct"))) + (set_attr "bdver1_decode" "direct") + (set_attr "mode" "SI")]) + +;; On AMDFAM10 +;; IMUL reg16, reg16, imm8 VectorPath +;; IMUL reg16, mem16, imm8 VectorPath +;; IMUL reg16, reg16, imm16 VectorPath +;; IMUL reg16, mem16, imm16 VectorPath +;; IMUL reg16, reg16 Direct +;; IMUL reg16, mem16 Direct +;; +;; On BDVER1, all HI MULs use 
DoublePath + +(define_insn "*mulhi3_1" + [(set (match_operand:HI 0 "register_operand" "=r,r,r") + (mult:HI (match_operand:HI 1 "nonimmediate_operand" "%rm,rm,0") + (match_operand:HI 2 "general_operand" "K,n,mr"))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_HIMODE_MATH + && !(MEM_P (operands[1]) && MEM_P (operands[2]))" + "@ + imul{w}\t{%2, %1, %0|%0, %1, %2} + imul{w}\t{%2, %1, %0|%0, %1, %2} + imul{w}\t{%2, %0|%0, %2}" + [(set_attr "type" "imul") + (set_attr "prefix_0f" "0,0,1") + (set (attr "athlon_decode") + (cond [(eq_attr "cpu" "athlon") + (const_string "vector") + (eq_attr "alternative" "1,2") + (const_string "vector")] + (const_string "direct"))) + (set (attr "amdfam10_decode") + (cond [(eq_attr "alternative" "0,1") + (const_string "vector")] + (const_string "direct"))) + (set_attr "bdver1_decode" "double") + (set_attr "mode" "HI")]) + +;;On AMDFAM10 and BDVER1 +;; MUL reg8 Direct +;; MUL mem8 Direct + +(define_insn "*mulqi3_1" + [(set (match_operand:QI 0 "register_operand" "=a") + (mult:QI (match_operand:QI 1 "nonimmediate_operand" "%0") + (match_operand:QI 2 "nonimmediate_operand" "qm"))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_QIMODE_MATH + && !(MEM_P (operands[1]) && MEM_P (operands[2]))" + "mul{b}\t%2" + [(set_attr "type" "imul") + (set_attr "length_immediate" "0") + (set (attr "athlon_decode") + (if_then_else (eq_attr "cpu" "athlon") + (const_string "vector") + (const_string "direct"))) + (set_attr "amdfam10_decode" "direct") + (set_attr "bdver1_decode" "direct") + (set_attr "mode" "QI")]) + +(define_expand "mul3" + [(parallel [(set (match_operand: 0 "register_operand" "") + (mult: + (any_extend: + (match_operand:DWIH 1 "nonimmediate_operand" "")) + (any_extend: + (match_operand:DWIH 2 "register_operand" "")))) + (clobber (reg:CC FLAGS_REG))])]) + +(define_expand "mulqihi3" + [(parallel [(set (match_operand:HI 0 "register_operand" "") + (mult:HI + (any_extend:HI + (match_operand:QI 1 "nonimmediate_operand" "")) + (any_extend:HI + (match_operand:QI 2 "register_operand" "")))) + (clobber (reg:CC FLAGS_REG))])] + "TARGET_QIMODE_MATH") + +(define_insn "*mul3_1" + [(set (match_operand: 0 "register_operand" "=A") + (mult: + (any_extend: + (match_operand:DWIH 1 "nonimmediate_operand" "%0")) + (any_extend: + (match_operand:DWIH 2 "nonimmediate_operand" "rm")))) + (clobber (reg:CC FLAGS_REG))] + "!(MEM_P (operands[1]) && MEM_P (operands[2]))" + "mul{}\t%2" + [(set_attr "type" "imul") + (set_attr "length_immediate" "0") + (set (attr "athlon_decode") + (if_then_else (eq_attr "cpu" "athlon") + (const_string "vector") + (const_string "double"))) + (set_attr "amdfam10_decode" "double") + (set_attr "bdver1_decode" "direct") + (set_attr "mode" "")]) + +(define_insn "*mulqihi3_1" + [(set (match_operand:HI 0 "register_operand" "=a") + (mult:HI + (any_extend:HI + (match_operand:QI 1 "nonimmediate_operand" "%0")) + (any_extend:HI + (match_operand:QI 2 "nonimmediate_operand" "qm")))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_QIMODE_MATH + && !(MEM_P (operands[1]) && MEM_P (operands[2]))" + "mul{b}\t%2" + [(set_attr "type" "imul") + (set_attr "length_immediate" "0") + (set (attr "athlon_decode") + (if_then_else (eq_attr "cpu" "athlon") + (const_string "vector") + (const_string "direct"))) + (set_attr "amdfam10_decode" "direct") + (set_attr "bdver1_decode" "direct") + (set_attr "mode" "QI")]) + +(define_expand "mul3_highpart" + [(parallel [(set (match_operand:SWI48 0 "register_operand" "") + (truncate:SWI48 + (lshiftrt: + (mult: + (any_extend: + (match_operand:SWI48 1 "nonimmediate_operand" "")) + 
(any_extend: + (match_operand:SWI48 2 "register_operand" ""))) + (match_dup 4)))) + (clobber (match_scratch:SWI48 3 "")) + (clobber (reg:CC FLAGS_REG))])] + "" + "operands[4] = GEN_INT (GET_MODE_BITSIZE (mode));") + +(define_insn "*muldi3_highpart_1" + [(set (match_operand:DI 0 "register_operand" "=d") + (truncate:DI + (lshiftrt:TI + (mult:TI + (any_extend:TI + (match_operand:DI 1 "nonimmediate_operand" "%a")) + (any_extend:TI + (match_operand:DI 2 "nonimmediate_operand" "rm"))) + (const_int 64)))) + (clobber (match_scratch:DI 3 "=1")) + (clobber (reg:CC FLAGS_REG))] + "TARGET_64BIT + && !(MEM_P (operands[1]) && MEM_P (operands[2]))" + "mul{q}\t%2" + [(set_attr "type" "imul") + (set_attr "length_immediate" "0") + (set (attr "athlon_decode") + (if_then_else (eq_attr "cpu" "athlon") + (const_string "vector") + (const_string "double"))) + (set_attr "amdfam10_decode" "double") + (set_attr "bdver1_decode" "direct") + (set_attr "mode" "DI")]) + +(define_insn "*mulsi3_highpart_1" + [(set (match_operand:SI 0 "register_operand" "=d") + (truncate:SI + (lshiftrt:DI + (mult:DI + (any_extend:DI + (match_operand:SI 1 "nonimmediate_operand" "%a")) + (any_extend:DI + (match_operand:SI 2 "nonimmediate_operand" "rm"))) + (const_int 32)))) + (clobber (match_scratch:SI 3 "=1")) + (clobber (reg:CC FLAGS_REG))] + "!(MEM_P (operands[1]) && MEM_P (operands[2]))" + "mul{l}\t%2" + [(set_attr "type" "imul") + (set_attr "length_immediate" "0") + (set (attr "athlon_decode") + (if_then_else (eq_attr "cpu" "athlon") + (const_string "vector") + (const_string "double"))) + (set_attr "amdfam10_decode" "double") + (set_attr "bdver1_decode" "direct") + (set_attr "mode" "SI")]) + +(define_insn "*mulsi3_highpart_zext" + [(set (match_operand:DI 0 "register_operand" "=d") + (zero_extend:DI (truncate:SI + (lshiftrt:DI + (mult:DI (any_extend:DI + (match_operand:SI 1 "nonimmediate_operand" "%a")) + (any_extend:DI + (match_operand:SI 2 "nonimmediate_operand" "rm"))) + (const_int 32))))) + (clobber (match_scratch:SI 3 "=1")) + (clobber (reg:CC FLAGS_REG))] + "TARGET_64BIT + && !(MEM_P (operands[1]) && MEM_P (operands[2]))" + "mul{l}\t%2" + [(set_attr "type" "imul") + (set_attr "length_immediate" "0") + (set (attr "athlon_decode") + (if_then_else (eq_attr "cpu" "athlon") + (const_string "vector") + (const_string "double"))) + (set_attr "amdfam10_decode" "double") + (set_attr "bdver1_decode" "direct") + (set_attr "mode" "SI")]) + +;; The patterns that match these are at the end of this file. + +(define_expand "mulxf3" + [(set (match_operand:XF 0 "register_operand" "") + (mult:XF (match_operand:XF 1 "register_operand" "") + (match_operand:XF 2 "register_operand" "")))] + "TARGET_80387") + +(define_expand "mul3" + [(set (match_operand:MODEF 0 "register_operand" "") + (mult:MODEF (match_operand:MODEF 1 "register_operand" "") + (match_operand:MODEF 2 "nonimmediate_operand" "")))] + "(TARGET_80387 && X87_ENABLE_ARITH (mode)) + || (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)") + +;; Divide instructions + +;; The patterns that match these are at the end of this file. 
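+;; (Register conventions for the integer divide patterns further below:
+;; the one-operand div/idiv instructions divide %edx:%eax (or %rdx:%rax)
+;; by the given operand and leave the quotient in %eax and the remainder
+;; in %edx, which is why the divmod patterns tie the dividend to "a" and
+;; the remainder to "d", and first widen the dividend, either with
+;; cltd/cqto or a mov plus arithmetic shift for the signed case, or by
+;; zeroing %edx for the unsigned case.)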
+ +(define_expand "divxf3" + [(set (match_operand:XF 0 "register_operand" "") + (div:XF (match_operand:XF 1 "register_operand" "") + (match_operand:XF 2 "register_operand" "")))] + "TARGET_80387") + +(define_expand "divdf3" + [(set (match_operand:DF 0 "register_operand" "") + (div:DF (match_operand:DF 1 "register_operand" "") + (match_operand:DF 2 "nonimmediate_operand" "")))] + "(TARGET_80387 && X87_ENABLE_ARITH (DFmode)) + || (TARGET_SSE2 && TARGET_SSE_MATH)") + +(define_expand "divsf3" + [(set (match_operand:SF 0 "register_operand" "") + (div:SF (match_operand:SF 1 "register_operand" "") + (match_operand:SF 2 "nonimmediate_operand" "")))] + "(TARGET_80387 && X87_ENABLE_ARITH (SFmode)) + || TARGET_SSE_MATH" +{ + if (TARGET_SSE_MATH && TARGET_RECIP && optimize_insn_for_speed_p () + && flag_finite_math_only && !flag_trapping_math + && flag_unsafe_math_optimizations) + { + ix86_emit_swdivsf (operands[0], operands[1], + operands[2], SFmode); + DONE; + } +}) + +;; Divmod instructions. + +(define_expand "divmod4" + [(parallel [(set (match_operand:SWIM248 0 "register_operand" "") + (div:SWIM248 + (match_operand:SWIM248 1 "register_operand" "") + (match_operand:SWIM248 2 "nonimmediate_operand" ""))) + (set (match_operand:SWIM248 3 "register_operand" "") + (mod:SWIM248 (match_dup 1) (match_dup 2))) + (clobber (reg:CC FLAGS_REG))])]) + +;; Split with 8bit unsigned divide: +;; if (dividend an divisor are in [0-255]) +;; use 8bit unsigned integer divide +;; else +;; use original integer divide +(define_split + [(set (match_operand:SWI48 0 "register_operand" "") + (div:SWI48 (match_operand:SWI48 2 "register_operand" "") + (match_operand:SWI48 3 "nonimmediate_operand" ""))) + (set (match_operand:SWI48 1 "register_operand" "") + (mod:SWI48 (match_dup 2) (match_dup 3))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_USE_8BIT_IDIV + && TARGET_QIMODE_MATH + && can_create_pseudo_p () + && !optimize_insn_for_size_p ()" + [(const_int 0)] + "ix86_split_idivmod (mode, operands, true); DONE;") + +(define_insn_and_split "divmod4_1" + [(set (match_operand:SWI48 0 "register_operand" "=a") + (div:SWI48 (match_operand:SWI48 2 "register_operand" "0") + (match_operand:SWI48 3 "nonimmediate_operand" "rm"))) + (set (match_operand:SWI48 1 "register_operand" "=&d") + (mod:SWI48 (match_dup 2) (match_dup 3))) + (unspec [(const_int 0)] UNSPEC_DIV_ALREADY_SPLIT) + (clobber (reg:CC FLAGS_REG))] + "" + "#" + "reload_completed" + [(parallel [(set (match_dup 1) + (ashiftrt:SWI48 (match_dup 4) (match_dup 5))) + (clobber (reg:CC FLAGS_REG))]) + (parallel [(set (match_dup 0) + (div:SWI48 (match_dup 2) (match_dup 3))) + (set (match_dup 1) + (mod:SWI48 (match_dup 2) (match_dup 3))) + (use (match_dup 1)) + (clobber (reg:CC FLAGS_REG))])] +{ + operands[5] = GEN_INT (GET_MODE_BITSIZE (mode)-1); + + if (optimize_function_for_size_p (cfun) || TARGET_USE_CLTD) + operands[4] = operands[2]; + else + { + /* Avoid use of cltd in favor of a mov+shift. 
*/ + emit_move_insn (operands[1], operands[2]); + operands[4] = operands[1]; + } +} + [(set_attr "type" "multi") + (set_attr "mode" "")]) + +(define_insn_and_split "*divmod4" + [(set (match_operand:SWIM248 0 "register_operand" "=a") + (div:SWIM248 (match_operand:SWIM248 2 "register_operand" "0") + (match_operand:SWIM248 3 "nonimmediate_operand" "rm"))) + (set (match_operand:SWIM248 1 "register_operand" "=&d") + (mod:SWIM248 (match_dup 2) (match_dup 3))) + (clobber (reg:CC FLAGS_REG))] + "" + "#" + "reload_completed" + [(parallel [(set (match_dup 1) + (ashiftrt:SWIM248 (match_dup 4) (match_dup 5))) + (clobber (reg:CC FLAGS_REG))]) + (parallel [(set (match_dup 0) + (div:SWIM248 (match_dup 2) (match_dup 3))) + (set (match_dup 1) + (mod:SWIM248 (match_dup 2) (match_dup 3))) + (use (match_dup 1)) + (clobber (reg:CC FLAGS_REG))])] +{ + operands[5] = GEN_INT (GET_MODE_BITSIZE (mode)-1); + + if (mode != HImode + && (optimize_function_for_size_p (cfun) || TARGET_USE_CLTD)) + operands[4] = operands[2]; + else + { + /* Avoid use of cltd in favor of a mov+shift. */ + emit_move_insn (operands[1], operands[2]); + operands[4] = operands[1]; + } +} + [(set_attr "type" "multi") + (set_attr "mode" "")]) + +(define_insn "*divmod4_noext" + [(set (match_operand:SWIM248 0 "register_operand" "=a") + (div:SWIM248 (match_operand:SWIM248 2 "register_operand" "0") + (match_operand:SWIM248 3 "nonimmediate_operand" "rm"))) + (set (match_operand:SWIM248 1 "register_operand" "=d") + (mod:SWIM248 (match_dup 2) (match_dup 3))) + (use (match_operand:SWIM248 4 "register_operand" "1")) + (clobber (reg:CC FLAGS_REG))] + "" + "idiv{}\t%3" + [(set_attr "type" "idiv") + (set_attr "mode" "")]) + +(define_expand "divmodqi4" + [(parallel [(set (match_operand:QI 0 "register_operand" "") + (div:QI + (match_operand:QI 1 "register_operand" "") + (match_operand:QI 2 "nonimmediate_operand" ""))) + (set (match_operand:QI 3 "register_operand" "") + (mod:QI (match_dup 1) (match_dup 2))) + (clobber (reg:CC FLAGS_REG))])] + "TARGET_QIMODE_MATH" +{ + rtx div, mod, insn; + rtx tmp0, tmp1; + + tmp0 = gen_reg_rtx (HImode); + tmp1 = gen_reg_rtx (HImode); + + /* Extend operands[1] to HImode. Generate 8bit divide. Result is + in AX. */ + emit_insn (gen_extendqihi2 (tmp1, operands[1])); + emit_insn (gen_divmodhiqi3 (tmp0, tmp1, operands[2])); + + /* Extract remainder from AH. */ + tmp1 = gen_rtx_SIGN_EXTRACT (QImode, tmp0, GEN_INT (8), GEN_INT (8)); + insn = emit_move_insn (operands[3], tmp1); + + mod = gen_rtx_MOD (QImode, operands[1], operands[2]); + set_unique_reg_note (insn, REG_EQUAL, mod); + + /* Extract quotient from AL. */ + insn = emit_move_insn (operands[0], gen_lowpart (QImode, tmp0)); + + div = gen_rtx_DIV (QImode, operands[1], operands[2]); + set_unique_reg_note (insn, REG_EQUAL, div); + + DONE; +}) + +;; Divide AX by r/m8, with result stored in +;; AL <- Quotient +;; AH <- Remainder +;; Change div/mod to HImode and extend the second argument to HImode +;; so that mode of div/mod matches with mode of arguments. Otherwise +;; combine may fail. 
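+;; A QImode division therefore expands to something like
+;;	movsbw	%cl, %ax
+;;	idivb	%bl
+;; with the quotient in %al and the remainder in %ah; the ior/ashift form
+;; below is just an HImode description of that packed %ah:%al result
+;; (register choices here are illustrative only).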
+(define_insn "divmodhiqi3" + [(set (match_operand:HI 0 "register_operand" "=a") + (ior:HI + (ashift:HI + (zero_extend:HI + (truncate:QI + (mod:HI (match_operand:HI 1 "register_operand" "0") + (sign_extend:HI + (match_operand:QI 2 "nonimmediate_operand" "qm"))))) + (const_int 8)) + (zero_extend:HI + (truncate:QI + (div:HI (match_dup 1) (sign_extend:HI (match_dup 2))))))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_QIMODE_MATH" + "idiv{b}\t%2" + [(set_attr "type" "idiv") + (set_attr "mode" "QI")]) + +(define_expand "udivmod4" + [(parallel [(set (match_operand:SWIM248 0 "register_operand" "") + (udiv:SWIM248 + (match_operand:SWIM248 1 "register_operand" "") + (match_operand:SWIM248 2 "nonimmediate_operand" ""))) + (set (match_operand:SWIM248 3 "register_operand" "") + (umod:SWIM248 (match_dup 1) (match_dup 2))) + (clobber (reg:CC FLAGS_REG))])]) + +;; Split with 8bit unsigned divide: +;; if (dividend an divisor are in [0-255]) +;; use 8bit unsigned integer divide +;; else +;; use original integer divide +(define_split + [(set (match_operand:SWI48 0 "register_operand" "") + (udiv:SWI48 (match_operand:SWI48 2 "register_operand" "") + (match_operand:SWI48 3 "nonimmediate_operand" ""))) + (set (match_operand:SWI48 1 "register_operand" "") + (umod:SWI48 (match_dup 2) (match_dup 3))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_USE_8BIT_IDIV + && TARGET_QIMODE_MATH + && can_create_pseudo_p () + && !optimize_insn_for_size_p ()" + [(const_int 0)] + "ix86_split_idivmod (mode, operands, false); DONE;") + +(define_insn_and_split "udivmod4_1" + [(set (match_operand:SWI48 0 "register_operand" "=a") + (udiv:SWI48 (match_operand:SWI48 2 "register_operand" "0") + (match_operand:SWI48 3 "nonimmediate_operand" "rm"))) + (set (match_operand:SWI48 1 "register_operand" "=&d") + (umod:SWI48 (match_dup 2) (match_dup 3))) + (unspec [(const_int 0)] UNSPEC_DIV_ALREADY_SPLIT) + (clobber (reg:CC FLAGS_REG))] + "" + "#" + "reload_completed" + [(set (match_dup 1) (const_int 0)) + (parallel [(set (match_dup 0) + (udiv:SWI48 (match_dup 2) (match_dup 3))) + (set (match_dup 1) + (umod:SWI48 (match_dup 2) (match_dup 3))) + (use (match_dup 1)) + (clobber (reg:CC FLAGS_REG))])] + "" + [(set_attr "type" "multi") + (set_attr "mode" "")]) + +(define_insn_and_split "*udivmod4" + [(set (match_operand:SWIM248 0 "register_operand" "=a") + (udiv:SWIM248 (match_operand:SWIM248 2 "register_operand" "0") + (match_operand:SWIM248 3 "nonimmediate_operand" "rm"))) + (set (match_operand:SWIM248 1 "register_operand" "=&d") + (umod:SWIM248 (match_dup 2) (match_dup 3))) + (clobber (reg:CC FLAGS_REG))] + "" + "#" + "reload_completed" + [(set (match_dup 1) (const_int 0)) + (parallel [(set (match_dup 0) + (udiv:SWIM248 (match_dup 2) (match_dup 3))) + (set (match_dup 1) + (umod:SWIM248 (match_dup 2) (match_dup 3))) + (use (match_dup 1)) + (clobber (reg:CC FLAGS_REG))])] + "" + [(set_attr "type" "multi") + (set_attr "mode" "")]) + +(define_insn "*udivmod4_noext" + [(set (match_operand:SWIM248 0 "register_operand" "=a") + (udiv:SWIM248 (match_operand:SWIM248 2 "register_operand" "0") + (match_operand:SWIM248 3 "nonimmediate_operand" "rm"))) + (set (match_operand:SWIM248 1 "register_operand" "=d") + (umod:SWIM248 (match_dup 2) (match_dup 3))) + (use (match_operand:SWIM248 4 "register_operand" "1")) + (clobber (reg:CC FLAGS_REG))] + "" + "div{}\t%3" + [(set_attr "type" "idiv") + (set_attr "mode" "")]) + +(define_expand "udivmodqi4" + [(parallel [(set (match_operand:QI 0 "register_operand" "") + (udiv:QI + (match_operand:QI 1 "register_operand" "") + 
(match_operand:QI 2 "nonimmediate_operand" ""))) + (set (match_operand:QI 3 "register_operand" "") + (umod:QI (match_dup 1) (match_dup 2))) + (clobber (reg:CC FLAGS_REG))])] + "TARGET_QIMODE_MATH" +{ + rtx div, mod, insn; + rtx tmp0, tmp1; + + tmp0 = gen_reg_rtx (HImode); + tmp1 = gen_reg_rtx (HImode); + + /* Extend operands[1] to HImode. Generate 8bit divide. Result is + in AX. */ + emit_insn (gen_zero_extendqihi2 (tmp1, operands[1])); + emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, operands[2])); + + /* Extract remainder from AH. */ + tmp1 = gen_rtx_ZERO_EXTRACT (SImode, tmp0, GEN_INT (8), GEN_INT (8)); + tmp1 = simplify_gen_subreg (QImode, tmp1, SImode, 0); + insn = emit_move_insn (operands[3], tmp1); + + mod = gen_rtx_UMOD (QImode, operands[1], operands[2]); + set_unique_reg_note (insn, REG_EQUAL, mod); + + /* Extract quotient from AL. */ + insn = emit_move_insn (operands[0], gen_lowpart (QImode, tmp0)); + + div = gen_rtx_UDIV (QImode, operands[1], operands[2]); + set_unique_reg_note (insn, REG_EQUAL, div); + + DONE; +}) + +(define_insn "udivmodhiqi3" + [(set (match_operand:HI 0 "register_operand" "=a") + (ior:HI + (ashift:HI + (zero_extend:HI + (truncate:QI + (mod:HI (match_operand:HI 1 "register_operand" "0") + (zero_extend:HI + (match_operand:QI 2 "nonimmediate_operand" "qm"))))) + (const_int 8)) + (zero_extend:HI + (truncate:QI + (div:HI (match_dup 1) (zero_extend:HI (match_dup 2))))))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_QIMODE_MATH" + "div{b}\t%2" + [(set_attr "type" "idiv") + (set_attr "mode" "QI")]) + +;; We cannot use div/idiv for double division, because it causes +;; "division by zero" on the overflow and that's not what we expect +;; from truncate. Because true (non truncating) double division is +;; never generated, we can't create this insn anyway. +; +;(define_insn "" +; [(set (match_operand:SI 0 "register_operand" "=a") +; (truncate:SI +; (udiv:DI (match_operand:DI 1 "register_operand" "A") +; (zero_extend:DI +; (match_operand:SI 2 "nonimmediate_operand" "rm"))))) +; (set (match_operand:SI 3 "register_operand" "=d") +; (truncate:SI +; (umod:DI (match_dup 1) (zero_extend:DI (match_dup 2))))) +; (clobber (reg:CC FLAGS_REG))] +; "" +; "div{l}\t{%2, %0|%0, %2}" +; [(set_attr "type" "idiv")]) + +;;- Logical AND instructions + +;; On Pentium, "test imm, reg" is pairable only with eax, ax, and al. +;; Note that this excludes ah. 
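+;; (Only %al, %ax and %eax have the ModRM-less A8/A9 encodings of
+;; "test $imm, reg"; %ah always needs the F6 /0 form.  That is why the
+;; accumulator alternatives in the test patterns below are the ones
+;; marked with "modrm" "0".)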
+ +(define_expand "testsi_ccno_1" + [(set (reg:CCNO FLAGS_REG) + (compare:CCNO + (and:SI (match_operand:SI 0 "nonimmediate_operand" "") + (match_operand:SI 1 "nonmemory_operand" "")) + (const_int 0)))]) + +(define_expand "testqi_ccz_1" + [(set (reg:CCZ FLAGS_REG) + (compare:CCZ (and:QI (match_operand:QI 0 "nonimmediate_operand" "") + (match_operand:QI 1 "nonmemory_operand" "")) + (const_int 0)))]) + +(define_expand "testdi_ccno_1" + [(set (reg:CCNO FLAGS_REG) + (compare:CCNO + (and:DI (match_operand:DI 0 "nonimmediate_operand" "") + (match_operand:DI 1 "x86_64_szext_general_operand" "")) + (const_int 0)))] + "TARGET_64BIT && !(MEM_P (operands[0]) && MEM_P (operands[1]))") + +(define_insn "*testdi_1" + [(set (reg FLAGS_REG) + (compare + (and:DI + (match_operand:DI 0 "nonimmediate_operand" "%!*a,r,!*a,r,rm") + (match_operand:DI 1 "x86_64_szext_general_operand" "Z,Z,e,e,re")) + (const_int 0)))] + "TARGET_64BIT && ix86_match_ccmode (insn, CCNOmode) + && !(MEM_P (operands[0]) && MEM_P (operands[1]))" + "@ + test{l}\t{%k1, %k0|%k0, %k1} + test{l}\t{%k1, %k0|%k0, %k1} + test{q}\t{%1, %0|%0, %1} + test{q}\t{%1, %0|%0, %1} + test{q}\t{%1, %0|%0, %1}" + [(set_attr "type" "test") + (set_attr "modrm" "0,1,0,1,1") + (set_attr "mode" "SI,SI,DI,DI,DI")]) + +(define_insn "*testqi_1_maybe_si" + [(set (reg FLAGS_REG) + (compare + (and:QI + (match_operand:QI 0 "nonimmediate_operand" "%!*a,q,qm,r") + (match_operand:QI 1 "general_operand" "n,n,qn,n")) + (const_int 0)))] + "!(MEM_P (operands[0]) && MEM_P (operands[1])) + && ix86_match_ccmode (insn, + CONST_INT_P (operands[1]) + && INTVAL (operands[1]) >= 0 ? CCNOmode : CCZmode)" +{ + if (which_alternative == 3) + { + if (CONST_INT_P (operands[1]) && INTVAL (operands[1]) < 0) + operands[1] = GEN_INT (INTVAL (operands[1]) & 0xff); + return "test{l}\t{%1, %k0|%k0, %1}"; + } + return "test{b}\t{%1, %0|%0, %1}"; +} + [(set_attr "type" "test") + (set_attr "modrm" "0,1,1,1") + (set_attr "mode" "QI,QI,QI,SI") + (set_attr "pent_pair" "uv,np,uv,np")]) + +(define_insn "*test_1" + [(set (reg FLAGS_REG) + (compare + (and:SWI124 + (match_operand:SWI124 0 "nonimmediate_operand" "%!*a,,m") + (match_operand:SWI124 1 "general_operand" ",,")) + (const_int 0)))] + "ix86_match_ccmode (insn, CCNOmode) + && !(MEM_P (operands[0]) && MEM_P (operands[1]))" + "test{}\t{%1, %0|%0, %1}" + [(set_attr "type" "test") + (set_attr "modrm" "0,1,1") + (set_attr "mode" "") + (set_attr "pent_pair" "uv,np,uv")]) + +(define_expand "testqi_ext_ccno_0" + [(set (reg:CCNO FLAGS_REG) + (compare:CCNO + (and:SI + (zero_extract:SI + (match_operand 0 "ext_register_operand" "") + (const_int 8) + (const_int 8)) + (match_operand 1 "const_int_operand" "")) + (const_int 0)))]) + +(define_insn "*testqi_ext_0" + [(set (reg FLAGS_REG) + (compare + (and:SI + (zero_extract:SI + (match_operand 0 "ext_register_operand" "Q") + (const_int 8) + (const_int 8)) + (match_operand 1 "const_int_operand" "n")) + (const_int 0)))] + "ix86_match_ccmode (insn, CCNOmode)" + "test{b}\t{%1, %h0|%h0, %1}" + [(set_attr "type" "test") + (set_attr "mode" "QI") + (set_attr "length_immediate" "1") + (set_attr "modrm" "1") + (set_attr "pent_pair" "np")]) + +(define_insn "*testqi_ext_1_rex64" + [(set (reg FLAGS_REG) + (compare + (and:SI + (zero_extract:SI + (match_operand 0 "ext_register_operand" "Q") + (const_int 8) + (const_int 8)) + (zero_extend:SI + (match_operand:QI 1 "register_operand" "Q"))) + (const_int 0)))] + "TARGET_64BIT && ix86_match_ccmode (insn, CCNOmode)" + "test{b}\t{%1, %h0|%h0, %1}" + [(set_attr "type" "test") + (set_attr 
"mode" "QI")]) + +(define_insn "*testqi_ext_1" + [(set (reg FLAGS_REG) + (compare + (and:SI + (zero_extract:SI + (match_operand 0 "ext_register_operand" "Q") + (const_int 8) + (const_int 8)) + (zero_extend:SI + (match_operand:QI 1 "general_operand" "Qm"))) + (const_int 0)))] + "!TARGET_64BIT && ix86_match_ccmode (insn, CCNOmode)" + "test{b}\t{%1, %h0|%h0, %1}" + [(set_attr "type" "test") + (set_attr "mode" "QI")]) + +(define_insn "*testqi_ext_2" + [(set (reg FLAGS_REG) + (compare + (and:SI + (zero_extract:SI + (match_operand 0 "ext_register_operand" "Q") + (const_int 8) + (const_int 8)) + (zero_extract:SI + (match_operand 1 "ext_register_operand" "Q") + (const_int 8) + (const_int 8))) + (const_int 0)))] + "ix86_match_ccmode (insn, CCNOmode)" + "test{b}\t{%h1, %h0|%h0, %h1}" + [(set_attr "type" "test") + (set_attr "mode" "QI")]) + +(define_insn "*testqi_ext_3_rex64" + [(set (reg FLAGS_REG) + (compare (zero_extract:DI + (match_operand 0 "nonimmediate_operand" "rm") + (match_operand:DI 1 "const_int_operand" "") + (match_operand:DI 2 "const_int_operand" "")) + (const_int 0)))] + "TARGET_64BIT + && ix86_match_ccmode (insn, CCNOmode) + && INTVAL (operands[1]) > 0 + && INTVAL (operands[2]) >= 0 + /* Ensure that resulting mask is zero or sign extended operand. */ + && (INTVAL (operands[1]) + INTVAL (operands[2]) <= 32 + || (INTVAL (operands[1]) + INTVAL (operands[2]) == 64 + && INTVAL (operands[1]) > 32)) + && (GET_MODE (operands[0]) == SImode + || GET_MODE (operands[0]) == DImode + || GET_MODE (operands[0]) == HImode + || GET_MODE (operands[0]) == QImode)" + "#") + +;; Combine likes to form bit extractions for some tests. Humor it. +(define_insn "*testqi_ext_3" + [(set (reg FLAGS_REG) + (compare (zero_extract:SI + (match_operand 0 "nonimmediate_operand" "rm") + (match_operand:SI 1 "const_int_operand" "") + (match_operand:SI 2 "const_int_operand" "")) + (const_int 0)))] + "ix86_match_ccmode (insn, CCNOmode) + && INTVAL (operands[1]) > 0 + && INTVAL (operands[2]) >= 0 + && INTVAL (operands[1]) + INTVAL (operands[2]) <= 32 + && (GET_MODE (operands[0]) == SImode + || (TARGET_64BIT && GET_MODE (operands[0]) == DImode) + || GET_MODE (operands[0]) == HImode + || GET_MODE (operands[0]) == QImode)" + "#") + +(define_split + [(set (match_operand 0 "flags_reg_operand" "") + (match_operator 1 "compare_operator" + [(zero_extract + (match_operand 2 "nonimmediate_operand" "") + (match_operand 3 "const_int_operand" "") + (match_operand 4 "const_int_operand" "")) + (const_int 0)]))] + "ix86_match_ccmode (insn, CCNOmode)" + [(set (match_dup 0) (match_op_dup 1 [(match_dup 2) (const_int 0)]))] +{ + rtx val = operands[2]; + HOST_WIDE_INT len = INTVAL (operands[3]); + HOST_WIDE_INT pos = INTVAL (operands[4]); + HOST_WIDE_INT mask; + enum machine_mode mode, submode; + + mode = GET_MODE (val); + if (MEM_P (val)) + { + /* ??? Combine likes to put non-volatile mem extractions in QImode + no matter the size of the test. So find a mode that works. */ + if (! MEM_VOLATILE_P (val)) + { + mode = smallest_mode_for_size (pos + len, MODE_INT); + val = adjust_address (val, mode, 0); + } + } + else if (GET_CODE (val) == SUBREG + && (submode = GET_MODE (SUBREG_REG (val)), + GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (submode)) + && pos + len <= GET_MODE_BITSIZE (submode) + && GET_MODE_CLASS (submode) == MODE_INT) + { + /* Narrow a paradoxical subreg to prevent partial register stalls. 
*/ + mode = submode; + val = SUBREG_REG (val); + } + else if (mode == HImode && pos + len <= 8) + { + /* Small HImode tests can be converted to QImode. */ + mode = QImode; + val = gen_lowpart (QImode, val); + } + + if (len == HOST_BITS_PER_WIDE_INT) + mask = -1; + else + mask = ((HOST_WIDE_INT)1 << len) - 1; + mask <<= pos; + + operands[2] = gen_rtx_AND (mode, val, gen_int_mode (mask, mode)); +}) + +;; Convert HImode/SImode test instructions with immediate to QImode ones. +;; i386 does not allow to encode test with 8bit sign extended immediate, so +;; this is relatively important trick. +;; Do the conversion only post-reload to avoid limiting of the register class +;; to QI regs. +(define_split + [(set (match_operand 0 "flags_reg_operand" "") + (match_operator 1 "compare_operator" + [(and (match_operand 2 "register_operand" "") + (match_operand 3 "const_int_operand" "")) + (const_int 0)]))] + "reload_completed + && QI_REG_P (operands[2]) + && GET_MODE (operands[2]) != QImode + && ((ix86_match_ccmode (insn, CCZmode) + && !(INTVAL (operands[3]) & ~(255 << 8))) + || (ix86_match_ccmode (insn, CCNOmode) + && !(INTVAL (operands[3]) & ~(127 << 8))))" + [(set (match_dup 0) + (match_op_dup 1 + [(and:SI (zero_extract:SI (match_dup 2) (const_int 8) (const_int 8)) + (match_dup 3)) + (const_int 0)]))] + "operands[2] = gen_lowpart (SImode, operands[2]); + operands[3] = gen_int_mode (INTVAL (operands[3]) >> 8, SImode);") + +(define_split + [(set (match_operand 0 "flags_reg_operand" "") + (match_operator 1 "compare_operator" + [(and (match_operand 2 "nonimmediate_operand" "") + (match_operand 3 "const_int_operand" "")) + (const_int 0)]))] + "reload_completed + && GET_MODE (operands[2]) != QImode + && (!REG_P (operands[2]) || ANY_QI_REG_P (operands[2])) + && ((ix86_match_ccmode (insn, CCZmode) + && !(INTVAL (operands[3]) & ~255)) + || (ix86_match_ccmode (insn, CCNOmode) + && !(INTVAL (operands[3]) & ~127)))" + [(set (match_dup 0) + (match_op_dup 1 [(and:QI (match_dup 2) (match_dup 3)) + (const_int 0)]))] + "operands[2] = gen_lowpart (QImode, operands[2]); + operands[3] = gen_lowpart (QImode, operands[3]);") + +;; %%% This used to optimize known byte-wide and operations to memory, +;; and sometimes to QImode registers. If this is considered useful, +;; it should be done with splitters. 
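;; To make the size argument behind the two test-narrowing splits above
;; concrete, a hedged sketch (invented function; byte counts are for the
;; common encodings with the value in %eax):

    int second_byte_bit_set (unsigned int x)
    {
      /* without the splits:  testl $2048, %eax   -- 5 bytes; TEST has no
                                                     sign-extended imm8 form
         with the splits:     testb $8, %ah       -- 3 bytes               */
      return (x & 0x800) != 0;
    }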
+ +(define_expand "and3" + [(set (match_operand:SWIM 0 "nonimmediate_operand" "") + (and:SWIM (match_operand:SWIM 1 "nonimmediate_operand" "") + (match_operand:SWIM 2 "" "")))] + "" + "ix86_expand_binary_operator (AND, mode, operands); DONE;") + +(define_insn "*anddi_1" + [(set (match_operand:DI 0 "nonimmediate_operand" "=r,rm,r,r") + (and:DI + (match_operand:DI 1 "nonimmediate_operand" "%0,0,0,qm") + (match_operand:DI 2 "x86_64_szext_general_operand" "Z,re,rm,L"))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_64BIT && ix86_binary_operator_ok (AND, DImode, operands)" +{ + switch (get_attr_type (insn)) + { + case TYPE_IMOVX: + { + enum machine_mode mode; + + gcc_assert (CONST_INT_P (operands[2])); + if (INTVAL (operands[2]) == 0xff) + mode = QImode; + else + { + gcc_assert (INTVAL (operands[2]) == 0xffff); + mode = HImode; + } + + operands[1] = gen_lowpart (mode, operands[1]); + if (mode == QImode) + return "movz{bl|x}\t{%1, %k0|%k0, %1}"; + else + return "movz{wl|x}\t{%1, %k0|%k0, %1}"; + } + + default: + gcc_assert (rtx_equal_p (operands[0], operands[1])); + if (get_attr_mode (insn) == MODE_SI) + return "and{l}\t{%k2, %k0|%k0, %k2}"; + else + return "and{q}\t{%2, %0|%0, %2}"; + } +} + [(set_attr "type" "alu,alu,alu,imovx") + (set_attr "length_immediate" "*,*,*,0") + (set (attr "prefix_rex") + (if_then_else + (and (eq_attr "type" "imovx") + (and (ne (symbol_ref "INTVAL (operands[2]) == 0xff") (const_int 0)) + (match_operand 1 "ext_QIreg_nomode_operand" ""))) + (const_string "1") + (const_string "*"))) + (set_attr "mode" "SI,DI,DI,SI")]) + +(define_insn "*andsi_1" + [(set (match_operand:SI 0 "nonimmediate_operand" "=rm,r,r") + (and:SI (match_operand:SI 1 "nonimmediate_operand" "%0,0,qm") + (match_operand:SI 2 "general_operand" "ri,rm,L"))) + (clobber (reg:CC FLAGS_REG))] + "ix86_binary_operator_ok (AND, SImode, operands)" +{ + switch (get_attr_type (insn)) + { + case TYPE_IMOVX: + { + enum machine_mode mode; + + gcc_assert (CONST_INT_P (operands[2])); + if (INTVAL (operands[2]) == 0xff) + mode = QImode; + else + { + gcc_assert (INTVAL (operands[2]) == 0xffff); + mode = HImode; + } + + operands[1] = gen_lowpart (mode, operands[1]); + if (mode == QImode) + return "movz{bl|x}\t{%1, %0|%0, %1}"; + else + return "movz{wl|x}\t{%1, %0|%0, %1}"; + } + + default: + gcc_assert (rtx_equal_p (operands[0], operands[1])); + return "and{l}\t{%2, %0|%0, %2}"; + } +} + [(set_attr "type" "alu,alu,imovx") + (set (attr "prefix_rex") + (if_then_else + (and (eq_attr "type" "imovx") + (and (ne (symbol_ref "INTVAL (operands[2]) == 0xff") (const_int 0)) + (match_operand 1 "ext_QIreg_nomode_operand" ""))) + (const_string "1") + (const_string "*"))) + (set_attr "length_immediate" "*,*,0") + (set_attr "mode" "SI")]) + +;; See comment for addsi_1_zext why we do use nonimmediate_operand +(define_insn "*andsi_1_zext" + [(set (match_operand:DI 0 "register_operand" "=r") + (zero_extend:DI + (and:SI (match_operand:SI 1 "nonimmediate_operand" "%0") + (match_operand:SI 2 "general_operand" "g")))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_64BIT && ix86_binary_operator_ok (AND, SImode, operands)" + "and{l}\t{%2, %k0|%k0, %2}" + [(set_attr "type" "alu") + (set_attr "mode" "SI")]) + +(define_insn "*andhi_1" + [(set (match_operand:HI 0 "nonimmediate_operand" "=rm,r,r") + (and:HI (match_operand:HI 1 "nonimmediate_operand" "%0,0,qm") + (match_operand:HI 2 "general_operand" "rn,rm,L"))) + (clobber (reg:CC FLAGS_REG))] + "ix86_binary_operator_ok (AND, HImode, operands)" +{ + switch (get_attr_type (insn)) + { + case TYPE_IMOVX: + 
gcc_assert (CONST_INT_P (operands[2])); + gcc_assert (INTVAL (operands[2]) == 0xff); + return "movz{bl|x}\t{%b1, %k0|%k0, %b1}"; + + default: + gcc_assert (rtx_equal_p (operands[0], operands[1])); + + return "and{w}\t{%2, %0|%0, %2}"; + } +} + [(set_attr "type" "alu,alu,imovx") + (set_attr "length_immediate" "*,*,0") + (set (attr "prefix_rex") + (if_then_else + (and (eq_attr "type" "imovx") + (match_operand 1 "ext_QIreg_nomode_operand" "")) + (const_string "1") + (const_string "*"))) + (set_attr "mode" "HI,HI,SI")]) + +;; %%% Potential partial reg stall on alternative 2. What to do? +(define_insn "*andqi_1" + [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,q,r") + (and:QI (match_operand:QI 1 "nonimmediate_operand" "%0,0,0") + (match_operand:QI 2 "general_operand" "qn,qmn,rn"))) + (clobber (reg:CC FLAGS_REG))] + "ix86_binary_operator_ok (AND, QImode, operands)" + "@ + and{b}\t{%2, %0|%0, %2} + and{b}\t{%2, %0|%0, %2} + and{l}\t{%k2, %k0|%k0, %k2}" + [(set_attr "type" "alu") + (set_attr "mode" "QI,QI,SI")]) + +(define_insn "*andqi_1_slp" + [(set (strict_low_part (match_operand:QI 0 "nonimmediate_operand" "+qm,q")) + (and:QI (match_dup 0) + (match_operand:QI 1 "general_operand" "qn,qmn"))) + (clobber (reg:CC FLAGS_REG))] + "(!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun)) + && !(MEM_P (operands[0]) && MEM_P (operands[1]))" + "and{b}\t{%1, %0|%0, %1}" + [(set_attr "type" "alu1") + (set_attr "mode" "QI")]) + +(define_split + [(set (match_operand 0 "register_operand" "") + (and (match_dup 0) + (const_int -65536))) + (clobber (reg:CC FLAGS_REG))] + "(TARGET_FAST_PREFIX && !TARGET_PARTIAL_REG_STALL) + || optimize_function_for_size_p (cfun)" + [(set (strict_low_part (match_dup 1)) (const_int 0))] + "operands[1] = gen_lowpart (HImode, operands[0]);") + +(define_split + [(set (match_operand 0 "ext_register_operand" "") + (and (match_dup 0) + (const_int -256))) + (clobber (reg:CC FLAGS_REG))] + "(!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun)) + && reload_completed" + [(set (strict_low_part (match_dup 1)) (const_int 0))] + "operands[1] = gen_lowpart (QImode, operands[0]);") + +(define_split + [(set (match_operand 0 "ext_register_operand" "") + (and (match_dup 0) + (const_int -65281))) + (clobber (reg:CC FLAGS_REG))] + "(!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun)) + && reload_completed" + [(parallel [(set (zero_extract:SI (match_dup 0) + (const_int 8) + (const_int 8)) + (xor:SI + (zero_extract:SI (match_dup 0) + (const_int 8) + (const_int 8)) + (zero_extract:SI (match_dup 0) + (const_int 8) + (const_int 8)))) + (clobber (reg:CC FLAGS_REG))])] + "operands[0] = gen_lowpart (SImode, operands[0]);") + +(define_insn "*anddi_2" + [(set (reg FLAGS_REG) + (compare + (and:DI + (match_operand:DI 1 "nonimmediate_operand" "%0,0,0") + (match_operand:DI 2 "x86_64_szext_general_operand" "Z,rem,re")) + (const_int 0))) + (set (match_operand:DI 0 "nonimmediate_operand" "=r,r,rm") + (and:DI (match_dup 1) (match_dup 2)))] + "TARGET_64BIT && ix86_match_ccmode (insn, CCNOmode) + && ix86_binary_operator_ok (AND, DImode, operands)" + "@ + and{l}\t{%k2, %k0|%k0, %k2} + and{q}\t{%2, %0|%0, %2} + and{q}\t{%2, %0|%0, %2}" + [(set_attr "type" "alu") + (set_attr "mode" "SI,DI,DI")]) + +(define_insn "*andqi_2_maybe_si" + [(set (reg FLAGS_REG) + (compare (and:QI + (match_operand:QI 1 "nonimmediate_operand" "%0,0,0") + (match_operand:QI 2 "general_operand" "qmn,qn,n")) + (const_int 0))) + (set (match_operand:QI 0 "nonimmediate_operand" "=q,qm,*r") + (and:QI (match_dup 1) 
(match_dup 2)))] + "ix86_binary_operator_ok (AND, QImode, operands) + && ix86_match_ccmode (insn, + CONST_INT_P (operands[2]) + && INTVAL (operands[2]) >= 0 ? CCNOmode : CCZmode)" +{ + if (which_alternative == 2) + { + if (CONST_INT_P (operands[2]) && INTVAL (operands[2]) < 0) + operands[2] = GEN_INT (INTVAL (operands[2]) & 0xff); + return "and{l}\t{%2, %k0|%k0, %2}"; + } + return "and{b}\t{%2, %0|%0, %2}"; +} + [(set_attr "type" "alu") + (set_attr "mode" "QI,QI,SI")]) + +(define_insn "*and_2" + [(set (reg FLAGS_REG) + (compare (and:SWI124 + (match_operand:SWI124 1 "nonimmediate_operand" "%0,0") + (match_operand:SWI124 2 "general_operand" ",")) + (const_int 0))) + (set (match_operand:SWI124 0 "nonimmediate_operand" "=,m") + (and:SWI124 (match_dup 1) (match_dup 2)))] + "ix86_match_ccmode (insn, CCNOmode) + && ix86_binary_operator_ok (AND, mode, operands)" + "and{}\t{%2, %0|%0, %2}" + [(set_attr "type" "alu") + (set_attr "mode" "")]) + +;; See comment for addsi_1_zext why we do use nonimmediate_operand +(define_insn "*andsi_2_zext" + [(set (reg FLAGS_REG) + (compare (and:SI + (match_operand:SI 1 "nonimmediate_operand" "%0") + (match_operand:SI 2 "general_operand" "g")) + (const_int 0))) + (set (match_operand:DI 0 "register_operand" "=r") + (zero_extend:DI (and:SI (match_dup 1) (match_dup 2))))] + "TARGET_64BIT && ix86_match_ccmode (insn, CCNOmode) + && ix86_binary_operator_ok (AND, SImode, operands)" + "and{l}\t{%2, %k0|%k0, %2}" + [(set_attr "type" "alu") + (set_attr "mode" "SI")]) + +(define_insn "*andqi_2_slp" + [(set (reg FLAGS_REG) + (compare (and:QI + (match_operand:QI 0 "nonimmediate_operand" "+q,qm") + (match_operand:QI 1 "nonimmediate_operand" "qmn,qn")) + (const_int 0))) + (set (strict_low_part (match_dup 0)) + (and:QI (match_dup 0) (match_dup 1)))] + "(!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun)) + && ix86_match_ccmode (insn, CCNOmode) + && !(MEM_P (operands[0]) && MEM_P (operands[1]))" + "and{b}\t{%1, %0|%0, %1}" + [(set_attr "type" "alu1") + (set_attr "mode" "QI")]) + +;; ??? A bug in recog prevents it from recognizing a const_int as an +;; operand to zero_extend in andqi_ext_1. It was checking explicitly +;; for a QImode operand, which of course failed. +(define_insn "andqi_ext_0" + [(set (zero_extract:SI (match_operand 0 "ext_register_operand" "=Q") + (const_int 8) + (const_int 8)) + (and:SI + (zero_extract:SI + (match_operand 1 "ext_register_operand" "0") + (const_int 8) + (const_int 8)) + (match_operand 2 "const_int_operand" "n"))) + (clobber (reg:CC FLAGS_REG))] + "" + "and{b}\t{%2, %h0|%h0, %2}" + [(set_attr "type" "alu") + (set_attr "length_immediate" "1") + (set_attr "modrm" "1") + (set_attr "mode" "QI")]) + +;; Generated by peephole translating test to and. This shows up +;; often in fp comparisons. 
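;; For orientation, a sketch of the kind of masking the andqi_ext patterns
;; here cover (invented function): a mask confined to bits 8..15 can be
;; applied to the high byte register directly, which is also where the i387
;; status word lands after fnstsw %ax in the classic FP compare sequence
;; (masks such as $0x45 on %ah), hence the peephole mentioned above.

    unsigned int clear_bits_14_and_15 (unsigned int x)
    {
      /* with x in %eax:   andb $0x3f, %ah
         instead of:       andl $0xffff3fff, %eax                          */
      return x & 0xffff3fffu;
    }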
+(define_insn "*andqi_ext_0_cc" + [(set (reg FLAGS_REG) + (compare + (and:SI + (zero_extract:SI + (match_operand 1 "ext_register_operand" "0") + (const_int 8) + (const_int 8)) + (match_operand 2 "const_int_operand" "n")) + (const_int 0))) + (set (zero_extract:SI (match_operand 0 "ext_register_operand" "=Q") + (const_int 8) + (const_int 8)) + (and:SI + (zero_extract:SI + (match_dup 1) + (const_int 8) + (const_int 8)) + (match_dup 2)))] + "ix86_match_ccmode (insn, CCNOmode)" + "and{b}\t{%2, %h0|%h0, %2}" + [(set_attr "type" "alu") + (set_attr "length_immediate" "1") + (set_attr "modrm" "1") + (set_attr "mode" "QI")]) + +(define_insn "*andqi_ext_1_rex64" + [(set (zero_extract:SI (match_operand 0 "ext_register_operand" "=Q") + (const_int 8) + (const_int 8)) + (and:SI + (zero_extract:SI + (match_operand 1 "ext_register_operand" "0") + (const_int 8) + (const_int 8)) + (zero_extend:SI + (match_operand 2 "ext_register_operand" "Q")))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_64BIT" + "and{b}\t{%2, %h0|%h0, %2}" + [(set_attr "type" "alu") + (set_attr "length_immediate" "0") + (set_attr "mode" "QI")]) + +(define_insn "*andqi_ext_1" + [(set (zero_extract:SI (match_operand 0 "ext_register_operand" "=Q") + (const_int 8) + (const_int 8)) + (and:SI + (zero_extract:SI + (match_operand 1 "ext_register_operand" "0") + (const_int 8) + (const_int 8)) + (zero_extend:SI + (match_operand:QI 2 "general_operand" "Qm")))) + (clobber (reg:CC FLAGS_REG))] + "!TARGET_64BIT" + "and{b}\t{%2, %h0|%h0, %2}" + [(set_attr "type" "alu") + (set_attr "length_immediate" "0") + (set_attr "mode" "QI")]) + +(define_insn "*andqi_ext_2" + [(set (zero_extract:SI (match_operand 0 "ext_register_operand" "=Q") + (const_int 8) + (const_int 8)) + (and:SI + (zero_extract:SI + (match_operand 1 "ext_register_operand" "%0") + (const_int 8) + (const_int 8)) + (zero_extract:SI + (match_operand 2 "ext_register_operand" "Q") + (const_int 8) + (const_int 8)))) + (clobber (reg:CC FLAGS_REG))] + "" + "and{b}\t{%h2, %h0|%h0, %h2}" + [(set_attr "type" "alu") + (set_attr "length_immediate" "0") + (set_attr "mode" "QI")]) + +;; Convert wide AND instructions with immediate operand to shorter QImode +;; equivalents when possible. +;; Don't do the splitting with memory operands, since it introduces risk +;; of memory mismatch stalls. We may want to do the splitting for optimizing +;; for size, but that can (should?) be handled by generic code instead. +(define_split + [(set (match_operand 0 "register_operand" "") + (and (match_operand 1 "register_operand" "") + (match_operand 2 "const_int_operand" ""))) + (clobber (reg:CC FLAGS_REG))] + "reload_completed + && QI_REG_P (operands[0]) + && (!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun)) + && !(~INTVAL (operands[2]) & ~(255 << 8)) + && GET_MODE (operands[0]) != QImode" + [(parallel [(set (zero_extract:SI (match_dup 0) (const_int 8) (const_int 8)) + (and:SI (zero_extract:SI (match_dup 1) + (const_int 8) (const_int 8)) + (match_dup 2))) + (clobber (reg:CC FLAGS_REG))])] + "operands[0] = gen_lowpart (SImode, operands[0]); + operands[1] = gen_lowpart (SImode, operands[1]); + operands[2] = gen_int_mode ((INTVAL (operands[2]) >> 8) & 0xff, SImode);") + +;; Since AND can be encoded with sign extended immediate, this is only +;; profitable when 7th bit is not set. 
+(define_split + [(set (match_operand 0 "register_operand" "") + (and (match_operand 1 "general_operand" "") + (match_operand 2 "const_int_operand" ""))) + (clobber (reg:CC FLAGS_REG))] + "reload_completed + && ANY_QI_REG_P (operands[0]) + && (!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun)) + && !(~INTVAL (operands[2]) & ~255) + && !(INTVAL (operands[2]) & 128) + && GET_MODE (operands[0]) != QImode" + [(parallel [(set (strict_low_part (match_dup 0)) + (and:QI (match_dup 1) + (match_dup 2))) + (clobber (reg:CC FLAGS_REG))])] + "operands[0] = gen_lowpart (QImode, operands[0]); + operands[1] = gen_lowpart (QImode, operands[1]); + operands[2] = gen_lowpart (QImode, operands[2]);") + +;; Logical inclusive and exclusive OR instructions + +;; %%% This used to optimize known byte-wide and operations to memory. +;; If this is considered useful, it should be done with splitters. + +(define_expand "3" + [(set (match_operand:SWIM 0 "nonimmediate_operand" "") + (any_or:SWIM (match_operand:SWIM 1 "nonimmediate_operand" "") + (match_operand:SWIM 2 "" "")))] + "" + "ix86_expand_binary_operator (, mode, operands); DONE;") + +(define_insn "*_1" + [(set (match_operand:SWI248 0 "nonimmediate_operand" "=r,rm") + (any_or:SWI248 + (match_operand:SWI248 1 "nonimmediate_operand" "%0,0") + (match_operand:SWI248 2 "" ",r"))) + (clobber (reg:CC FLAGS_REG))] + "ix86_binary_operator_ok (, mode, operands)" + "{}\t{%2, %0|%0, %2}" + [(set_attr "type" "alu") + (set_attr "mode" "")]) + +;; %%% Potential partial reg stall on alternative 2. What to do? +(define_insn "*qi_1" + [(set (match_operand:QI 0 "nonimmediate_operand" "=q,m,r") + (any_or:QI (match_operand:QI 1 "nonimmediate_operand" "%0,0,0") + (match_operand:QI 2 "general_operand" "qmn,qn,rn"))) + (clobber (reg:CC FLAGS_REG))] + "ix86_binary_operator_ok (, QImode, operands)" + "@ + {b}\t{%2, %0|%0, %2} + {b}\t{%2, %0|%0, %2} + {l}\t{%k2, %k0|%k0, %k2}" + [(set_attr "type" "alu") + (set_attr "mode" "QI,QI,SI")]) + +;; See comment for addsi_1_zext why we do use nonimmediate_operand +(define_insn "*si_1_zext" + [(set (match_operand:DI 0 "register_operand" "=r") + (zero_extend:DI + (any_or:SI (match_operand:SI 1 "nonimmediate_operand" "%0") + (match_operand:SI 2 "general_operand" "g")))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_64BIT && ix86_binary_operator_ok (, SImode, operands)" + "{l}\t{%2, %k0|%k0, %2}" + [(set_attr "type" "alu") + (set_attr "mode" "SI")]) + +(define_insn "*si_1_zext_imm" + [(set (match_operand:DI 0 "register_operand" "=r") + (any_or:DI + (zero_extend:DI (match_operand:SI 1 "register_operand" "%0")) + (match_operand:DI 2 "x86_64_zext_immediate_operand" "Z"))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_64BIT && ix86_binary_operator_ok (, SImode, operands)" + "{l}\t{%2, %k0|%k0, %2}" + [(set_attr "type" "alu") + (set_attr "mode" "SI")]) + +(define_insn "*qi_1_slp" + [(set (strict_low_part (match_operand:QI 0 "nonimmediate_operand" "+q,m")) + (any_or:QI (match_dup 0) + (match_operand:QI 1 "general_operand" "qmn,qn"))) + (clobber (reg:CC FLAGS_REG))] + "(!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun)) + && !(MEM_P (operands[0]) && MEM_P (operands[1]))" + "{b}\t{%1, %0|%0, %1}" + [(set_attr "type" "alu1") + (set_attr "mode" "QI")]) + +(define_insn "*_2" + [(set (reg FLAGS_REG) + (compare (any_or:SWI + (match_operand:SWI 1 "nonimmediate_operand" "%0,0") + (match_operand:SWI 2 "" ",")) + (const_int 0))) + (set (match_operand:SWI 0 "nonimmediate_operand" "=,m") + (any_or:SWI (match_dup 1) (match_dup 2)))] + 
"ix86_match_ccmode (insn, CCNOmode) + && ix86_binary_operator_ok (, mode, operands)" + "{}\t{%2, %0|%0, %2}" + [(set_attr "type" "alu") + (set_attr "mode" "")]) + +;; See comment for addsi_1_zext why we do use nonimmediate_operand +;; ??? Special case for immediate operand is missing - it is tricky. +(define_insn "*si_2_zext" + [(set (reg FLAGS_REG) + (compare (any_or:SI (match_operand:SI 1 "nonimmediate_operand" "%0") + (match_operand:SI 2 "general_operand" "g")) + (const_int 0))) + (set (match_operand:DI 0 "register_operand" "=r") + (zero_extend:DI (any_or:SI (match_dup 1) (match_dup 2))))] + "TARGET_64BIT && ix86_match_ccmode (insn, CCNOmode) + && ix86_binary_operator_ok (, SImode, operands)" + "{l}\t{%2, %k0|%k0, %2}" + [(set_attr "type" "alu") + (set_attr "mode" "SI")]) + +(define_insn "*si_2_zext_imm" + [(set (reg FLAGS_REG) + (compare (any_or:SI + (match_operand:SI 1 "nonimmediate_operand" "%0") + (match_operand:SI 2 "x86_64_zext_immediate_operand" "Z")) + (const_int 0))) + (set (match_operand:DI 0 "register_operand" "=r") + (any_or:DI (zero_extend:DI (match_dup 1)) (match_dup 2)))] + "TARGET_64BIT && ix86_match_ccmode (insn, CCNOmode) + && ix86_binary_operator_ok (, SImode, operands)" + "{l}\t{%2, %k0|%k0, %2}" + [(set_attr "type" "alu") + (set_attr "mode" "SI")]) + +(define_insn "*qi_2_slp" + [(set (reg FLAGS_REG) + (compare (any_or:QI (match_operand:QI 0 "nonimmediate_operand" "+q,qm") + (match_operand:QI 1 "general_operand" "qmn,qn")) + (const_int 0))) + (set (strict_low_part (match_dup 0)) + (any_or:QI (match_dup 0) (match_dup 1)))] + "(!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun)) + && ix86_match_ccmode (insn, CCNOmode) + && !(MEM_P (operands[0]) && MEM_P (operands[1]))" + "{b}\t{%1, %0|%0, %1}" + [(set_attr "type" "alu1") + (set_attr "mode" "QI")]) + +(define_insn "*_3" + [(set (reg FLAGS_REG) + (compare (any_or:SWI + (match_operand:SWI 1 "nonimmediate_operand" "%0") + (match_operand:SWI 2 "" "")) + (const_int 0))) + (clobber (match_scratch:SWI 0 "="))] + "ix86_match_ccmode (insn, CCNOmode) + && !(MEM_P (operands[1]) && MEM_P (operands[2]))" + "{}\t{%2, %0|%0, %2}" + [(set_attr "type" "alu") + (set_attr "mode" "")]) + +(define_insn "*qi_ext_0" + [(set (zero_extract:SI (match_operand 0 "ext_register_operand" "=Q") + (const_int 8) + (const_int 8)) + (any_or:SI + (zero_extract:SI + (match_operand 1 "ext_register_operand" "0") + (const_int 8) + (const_int 8)) + (match_operand 2 "const_int_operand" "n"))) + (clobber (reg:CC FLAGS_REG))] + "!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun)" + "{b}\t{%2, %h0|%h0, %2}" + [(set_attr "type" "alu") + (set_attr "length_immediate" "1") + (set_attr "modrm" "1") + (set_attr "mode" "QI")]) + +(define_insn "*qi_ext_1_rex64" + [(set (zero_extract:SI (match_operand 0 "ext_register_operand" "=Q") + (const_int 8) + (const_int 8)) + (any_or:SI + (zero_extract:SI + (match_operand 1 "ext_register_operand" "0") + (const_int 8) + (const_int 8)) + (zero_extend:SI + (match_operand 2 "ext_register_operand" "Q")))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_64BIT + && (!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun))" + "{b}\t{%2, %h0|%h0, %2}" + [(set_attr "type" "alu") + (set_attr "length_immediate" "0") + (set_attr "mode" "QI")]) + +(define_insn "*qi_ext_1" + [(set (zero_extract:SI (match_operand 0 "ext_register_operand" "=Q") + (const_int 8) + (const_int 8)) + (any_or:SI + (zero_extract:SI + (match_operand 1 "ext_register_operand" "0") + (const_int 8) + (const_int 8)) + (zero_extend:SI + 
(match_operand:QI 2 "general_operand" "Qm")))) + (clobber (reg:CC FLAGS_REG))] + "!TARGET_64BIT + && (!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun))" + "{b}\t{%2, %h0|%h0, %2}" + [(set_attr "type" "alu") + (set_attr "length_immediate" "0") + (set_attr "mode" "QI")]) + +(define_insn "*qi_ext_2" + [(set (zero_extract:SI (match_operand 0 "ext_register_operand" "=Q") + (const_int 8) + (const_int 8)) + (any_or:SI + (zero_extract:SI (match_operand 1 "ext_register_operand" "0") + (const_int 8) + (const_int 8)) + (zero_extract:SI (match_operand 2 "ext_register_operand" "Q") + (const_int 8) + (const_int 8)))) + (clobber (reg:CC FLAGS_REG))] + "!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun)" + "{b}\t{%h2, %h0|%h0, %h2}" + [(set_attr "type" "alu") + (set_attr "length_immediate" "0") + (set_attr "mode" "QI")]) + +(define_split + [(set (match_operand 0 "register_operand" "") + (any_or (match_operand 1 "register_operand" "") + (match_operand 2 "const_int_operand" ""))) + (clobber (reg:CC FLAGS_REG))] + "reload_completed + && QI_REG_P (operands[0]) + && (!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun)) + && !(INTVAL (operands[2]) & ~(255 << 8)) + && GET_MODE (operands[0]) != QImode" + [(parallel [(set (zero_extract:SI (match_dup 0) (const_int 8) (const_int 8)) + (any_or:SI (zero_extract:SI (match_dup 1) + (const_int 8) (const_int 8)) + (match_dup 2))) + (clobber (reg:CC FLAGS_REG))])] + "operands[0] = gen_lowpart (SImode, operands[0]); + operands[1] = gen_lowpart (SImode, operands[1]); + operands[2] = gen_int_mode ((INTVAL (operands[2]) >> 8) & 0xff, SImode);") + +;; Since OR can be encoded with sign extended immediate, this is only +;; profitable when 7th bit is set. +(define_split + [(set (match_operand 0 "register_operand" "") + (any_or (match_operand 1 "general_operand" "") + (match_operand 2 "const_int_operand" ""))) + (clobber (reg:CC FLAGS_REG))] + "reload_completed + && ANY_QI_REG_P (operands[0]) + && (!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun)) + && !(INTVAL (operands[2]) & ~255) + && (INTVAL (operands[2]) & 128) + && GET_MODE (operands[0]) != QImode" + [(parallel [(set (strict_low_part (match_dup 0)) + (any_or:QI (match_dup 1) + (match_dup 2))) + (clobber (reg:CC FLAGS_REG))])] + "operands[0] = gen_lowpart (QImode, operands[0]); + operands[1] = gen_lowpart (QImode, operands[1]); + operands[2] = gen_lowpart (QImode, operands[2]);") + +(define_expand "xorqi_cc_ext_1" + [(parallel [ + (set (reg:CCNO FLAGS_REG) + (compare:CCNO + (xor:SI + (zero_extract:SI + (match_operand 1 "ext_register_operand" "") + (const_int 8) + (const_int 8)) + (match_operand:QI 2 "general_operand" "")) + (const_int 0))) + (set (zero_extract:SI (match_operand 0 "ext_register_operand" "") + (const_int 8) + (const_int 8)) + (xor:SI + (zero_extract:SI + (match_dup 1) + (const_int 8) + (const_int 8)) + (match_dup 2)))])]) + +(define_insn "*xorqi_cc_ext_1_rex64" + [(set (reg FLAGS_REG) + (compare + (xor:SI + (zero_extract:SI + (match_operand 1 "ext_register_operand" "0") + (const_int 8) + (const_int 8)) + (match_operand:QI 2 "nonmemory_operand" "Qn")) + (const_int 0))) + (set (zero_extract:SI (match_operand 0 "ext_register_operand" "=Q") + (const_int 8) + (const_int 8)) + (xor:SI + (zero_extract:SI + (match_dup 1) + (const_int 8) + (const_int 8)) + (match_dup 2)))] + "TARGET_64BIT && ix86_match_ccmode (insn, CCNOmode)" + "xor{b}\t{%2, %h0|%h0, %2}" + [(set_attr "type" "alu") + (set_attr "modrm" "1") + (set_attr "mode" "QI")]) + +(define_insn 
"*xorqi_cc_ext_1" + [(set (reg FLAGS_REG) + (compare + (xor:SI + (zero_extract:SI + (match_operand 1 "ext_register_operand" "0") + (const_int 8) + (const_int 8)) + (match_operand:QI 2 "general_operand" "qmn")) + (const_int 0))) + (set (zero_extract:SI (match_operand 0 "ext_register_operand" "=q") + (const_int 8) + (const_int 8)) + (xor:SI + (zero_extract:SI + (match_dup 1) + (const_int 8) + (const_int 8)) + (match_dup 2)))] + "!TARGET_64BIT && ix86_match_ccmode (insn, CCNOmode)" + "xor{b}\t{%2, %h0|%h0, %2}" + [(set_attr "type" "alu") + (set_attr "modrm" "1") + (set_attr "mode" "QI")]) + +;; Negation instructions + +(define_expand "neg2" + [(set (match_operand:SDWIM 0 "nonimmediate_operand" "") + (neg:SDWIM (match_operand:SDWIM 1 "nonimmediate_operand" "")))] + "" + "ix86_expand_unary_operator (NEG, mode, operands); DONE;") + +(define_insn_and_split "*neg2_doubleword" + [(set (match_operand: 0 "nonimmediate_operand" "=ro") + (neg: (match_operand: 1 "nonimmediate_operand" "0"))) + (clobber (reg:CC FLAGS_REG))] + "ix86_unary_operator_ok (NEG, mode, operands)" + "#" + "reload_completed" + [(parallel + [(set (reg:CCZ FLAGS_REG) + (compare:CCZ (neg:DWIH (match_dup 1)) (const_int 0))) + (set (match_dup 0) (neg:DWIH (match_dup 1)))]) + (parallel + [(set (match_dup 2) + (plus:DWIH (match_dup 3) + (plus:DWIH (ltu:DWIH (reg:CC FLAGS_REG) (const_int 0)) + (const_int 0)))) + (clobber (reg:CC FLAGS_REG))]) + (parallel + [(set (match_dup 2) + (neg:DWIH (match_dup 2))) + (clobber (reg:CC FLAGS_REG))])] + "split_double_mode (mode, &operands[0], 2, &operands[0], &operands[2]);") + +(define_insn "*neg2_1" + [(set (match_operand:SWI 0 "nonimmediate_operand" "=m") + (neg:SWI (match_operand:SWI 1 "nonimmediate_operand" "0"))) + (clobber (reg:CC FLAGS_REG))] + "ix86_unary_operator_ok (NEG, mode, operands)" + "neg{}\t%0" + [(set_attr "type" "negnot") + (set_attr "mode" "")]) + +;; Combine is quite creative about this pattern. +(define_insn "*negsi2_1_zext" + [(set (match_operand:DI 0 "register_operand" "=r") + (lshiftrt:DI + (neg:DI (ashift:DI (match_operand:DI 1 "register_operand" "0") + (const_int 32))) + (const_int 32))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_64BIT && ix86_unary_operator_ok (NEG, SImode, operands)" + "neg{l}\t%k0" + [(set_attr "type" "negnot") + (set_attr "mode" "SI")]) + +;; The problem with neg is that it does not perform (compare x 0), +;; it really performs (compare 0 x), which leaves us with the zero +;; flag being the only useful item. + +(define_insn "*neg2_cmpz" + [(set (reg:CCZ FLAGS_REG) + (compare:CCZ + (neg:SWI (match_operand:SWI 1 "nonimmediate_operand" "0")) + (const_int 0))) + (set (match_operand:SWI 0 "nonimmediate_operand" "=m") + (neg:SWI (match_dup 1)))] + "ix86_unary_operator_ok (NEG, mode, operands)" + "neg{}\t%0" + [(set_attr "type" "negnot") + (set_attr "mode" "")]) + +(define_insn "*negsi2_cmpz_zext" + [(set (reg:CCZ FLAGS_REG) + (compare:CCZ + (lshiftrt:DI + (neg:DI (ashift:DI + (match_operand:DI 1 "register_operand" "0") + (const_int 32))) + (const_int 32)) + (const_int 0))) + (set (match_operand:DI 0 "register_operand" "=r") + (lshiftrt:DI (neg:DI (ashift:DI (match_dup 1) + (const_int 32))) + (const_int 32)))] + "TARGET_64BIT && ix86_unary_operator_ok (NEG, SImode, operands)" + "neg{l}\t%k0" + [(set_attr "type" "negnot") + (set_attr "mode" "SI")]) + +;; Changing of sign for FP values is doable using integer unit too. 
+ +(define_expand "2" + [(set (match_operand:X87MODEF 0 "register_operand" "") + (absneg:X87MODEF (match_operand:X87MODEF 1 "register_operand" "")))] + "TARGET_80387 || (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)" + "ix86_expand_fp_absneg_operator (, mode, operands); DONE;") + +(define_insn "*absneg2_mixed" + [(set (match_operand:MODEF 0 "register_operand" "=x,x,f,!r") + (match_operator:MODEF 3 "absneg_operator" + [(match_operand:MODEF 1 "register_operand" "0,x,0,0")])) + (use (match_operand: 2 "nonimmediate_operand" "xm,0,X,X")) + (clobber (reg:CC FLAGS_REG))] + "TARGET_MIX_SSE_I387 && SSE_FLOAT_MODE_P (mode)" + "#") + +(define_insn "*absneg2_sse" + [(set (match_operand:MODEF 0 "register_operand" "=x,x,!r") + (match_operator:MODEF 3 "absneg_operator" + [(match_operand:MODEF 1 "register_operand" "0 ,x,0")])) + (use (match_operand: 2 "register_operand" "xm,0,X")) + (clobber (reg:CC FLAGS_REG))] + "SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH" + "#") + +(define_insn "*absneg2_i387" + [(set (match_operand:X87MODEF 0 "register_operand" "=f,!r") + (match_operator:X87MODEF 3 "absneg_operator" + [(match_operand:X87MODEF 1 "register_operand" "0,0")])) + (use (match_operand 2 "" "")) + (clobber (reg:CC FLAGS_REG))] + "TARGET_80387 && !(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)" + "#") + +(define_expand "tf2" + [(set (match_operand:TF 0 "register_operand" "") + (absneg:TF (match_operand:TF 1 "register_operand" "")))] + "TARGET_SSE2" + "ix86_expand_fp_absneg_operator (, TFmode, operands); DONE;") + +(define_insn "*absnegtf2_sse" + [(set (match_operand:TF 0 "register_operand" "=x,x") + (match_operator:TF 3 "absneg_operator" + [(match_operand:TF 1 "register_operand" "0,x")])) + (use (match_operand:TF 2 "nonimmediate_operand" "xm,0")) + (clobber (reg:CC FLAGS_REG))] + "TARGET_SSE2" + "#") + +;; Splitters for fp abs and neg. 
+ +(define_split + [(set (match_operand 0 "fp_register_operand" "") + (match_operator 1 "absneg_operator" [(match_dup 0)])) + (use (match_operand 2 "" "")) + (clobber (reg:CC FLAGS_REG))] + "reload_completed" + [(set (match_dup 0) (match_op_dup 1 [(match_dup 0)]))]) + +(define_split + [(set (match_operand 0 "register_operand" "") + (match_operator 3 "absneg_operator" + [(match_operand 1 "register_operand" "")])) + (use (match_operand 2 "nonimmediate_operand" "")) + (clobber (reg:CC FLAGS_REG))] + "reload_completed && SSE_REG_P (operands[0])" + [(set (match_dup 0) (match_dup 3))] +{ + enum machine_mode mode = GET_MODE (operands[0]); + enum machine_mode vmode = GET_MODE (operands[2]); + rtx tmp; + + operands[0] = simplify_gen_subreg (vmode, operands[0], mode, 0); + operands[1] = simplify_gen_subreg (vmode, operands[1], mode, 0); + if (operands_match_p (operands[0], operands[2])) + { + tmp = operands[1]; + operands[1] = operands[2]; + operands[2] = tmp; + } + if (GET_CODE (operands[3]) == ABS) + tmp = gen_rtx_AND (vmode, operands[1], operands[2]); + else + tmp = gen_rtx_XOR (vmode, operands[1], operands[2]); + operands[3] = tmp; +}) + +(define_split + [(set (match_operand:SF 0 "register_operand" "") + (match_operator:SF 1 "absneg_operator" [(match_dup 0)])) + (use (match_operand:V4SF 2 "" "")) + (clobber (reg:CC FLAGS_REG))] + "reload_completed" + [(parallel [(set (match_dup 0) (match_dup 1)) + (clobber (reg:CC FLAGS_REG))])] +{ + rtx tmp; + operands[0] = gen_lowpart (SImode, operands[0]); + if (GET_CODE (operands[1]) == ABS) + { + tmp = gen_int_mode (0x7fffffff, SImode); + tmp = gen_rtx_AND (SImode, operands[0], tmp); + } + else + { + tmp = gen_int_mode (0x80000000, SImode); + tmp = gen_rtx_XOR (SImode, operands[0], tmp); + } + operands[1] = tmp; +}) + +(define_split + [(set (match_operand:DF 0 "register_operand" "") + (match_operator:DF 1 "absneg_operator" [(match_dup 0)])) + (use (match_operand 2 "" "")) + (clobber (reg:CC FLAGS_REG))] + "reload_completed" + [(parallel [(set (match_dup 0) (match_dup 1)) + (clobber (reg:CC FLAGS_REG))])] +{ + rtx tmp; + if (TARGET_64BIT) + { + tmp = gen_lowpart (DImode, operands[0]); + tmp = gen_rtx_ZERO_EXTRACT (DImode, tmp, const1_rtx, GEN_INT (63)); + operands[0] = tmp; + + if (GET_CODE (operands[1]) == ABS) + tmp = const0_rtx; + else + tmp = gen_rtx_NOT (DImode, tmp); + } + else + { + operands[0] = gen_highpart (SImode, operands[0]); + if (GET_CODE (operands[1]) == ABS) + { + tmp = gen_int_mode (0x7fffffff, SImode); + tmp = gen_rtx_AND (SImode, operands[0], tmp); + } + else + { + tmp = gen_int_mode (0x80000000, SImode); + tmp = gen_rtx_XOR (SImode, operands[0], tmp); + } + } + operands[1] = tmp; +}) + +(define_split + [(set (match_operand:XF 0 "register_operand" "") + (match_operator:XF 1 "absneg_operator" [(match_dup 0)])) + (use (match_operand 2 "" "")) + (clobber (reg:CC FLAGS_REG))] + "reload_completed" + [(parallel [(set (match_dup 0) (match_dup 1)) + (clobber (reg:CC FLAGS_REG))])] +{ + rtx tmp; + operands[0] = gen_rtx_REG (SImode, + true_regnum (operands[0]) + + (TARGET_64BIT ? 1 : 2)); + if (GET_CODE (operands[1]) == ABS) + { + tmp = GEN_INT (0x7fff); + tmp = gen_rtx_AND (SImode, operands[0], tmp); + } + else + { + tmp = GEN_INT (0x8000); + tmp = gen_rtx_XOR (SImode, operands[0], tmp); + } + operands[1] = tmp; +}) + +;; Conditionalize these after reload. If they match before reload, we +;; lose the clobber and ability to use integer instructions. 
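;; To make the XFmode case above concrete, a sketch under the usual x86
;; long double layout (sign and exponent in the 16 bits at byte offset 8;
;; the helper is invented and works on the in-memory image, whereas the
;; splitter works on the register words):

    #include <string.h>

    long double xf_negate (long double x)
    {
      unsigned char img[sizeof (long double)];  /* 12 or 16 bytes incl. padding */
      memcpy (img, &x, sizeof img);
      img[9] ^= 0x80;         /* top bit of the sign/exponent word at offset 8 */
      memcpy (&x, img, sizeof img);
      return x;
    }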
+ +(define_insn "*2_1" + [(set (match_operand:X87MODEF 0 "register_operand" "=f") + (absneg:X87MODEF (match_operand:X87MODEF 1 "register_operand" "0")))] + "TARGET_80387 + && (reload_completed + || !(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))" + "f" + [(set_attr "type" "fsgn") + (set_attr "mode" "")]) + +(define_insn "*extendsfdf2" + [(set (match_operand:DF 0 "register_operand" "=f") + (absneg:DF (float_extend:DF + (match_operand:SF 1 "register_operand" "0"))))] + "TARGET_80387 && (!TARGET_SSE_MATH || TARGET_MIX_SSE_I387)" + "f" + [(set_attr "type" "fsgn") + (set_attr "mode" "DF")]) + +(define_insn "*extendsfxf2" + [(set (match_operand:XF 0 "register_operand" "=f") + (absneg:XF (float_extend:XF + (match_operand:SF 1 "register_operand" "0"))))] + "TARGET_80387" + "f" + [(set_attr "type" "fsgn") + (set_attr "mode" "XF")]) + +(define_insn "*extenddfxf2" + [(set (match_operand:XF 0 "register_operand" "=f") + (absneg:XF (float_extend:XF + (match_operand:DF 1 "register_operand" "0"))))] + "TARGET_80387" + "f" + [(set_attr "type" "fsgn") + (set_attr "mode" "XF")]) + +;; Copysign instructions + +(define_mode_iterator CSGNMODE [SF DF TF]) +(define_mode_attr CSGNVMODE [(SF "V4SF") (DF "V2DF") (TF "TF")]) + +(define_expand "copysign3" + [(match_operand:CSGNMODE 0 "register_operand" "") + (match_operand:CSGNMODE 1 "nonmemory_operand" "") + (match_operand:CSGNMODE 2 "register_operand" "")] + "(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) + || (TARGET_SSE2 && (mode == TFmode))" + "ix86_expand_copysign (operands); DONE;") + +(define_insn_and_split "copysign3_const" + [(set (match_operand:CSGNMODE 0 "register_operand" "=x") + (unspec:CSGNMODE + [(match_operand: 1 "vector_move_operand" "xmC") + (match_operand:CSGNMODE 2 "register_operand" "0") + (match_operand: 3 "nonimmediate_operand" "xm")] + UNSPEC_COPYSIGN))] + "(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) + || (TARGET_SSE2 && (mode == TFmode))" + "#" + "&& reload_completed" + [(const_int 0)] + "ix86_split_copysign_const (operands); DONE;") + +(define_insn "copysign3_var" + [(set (match_operand:CSGNMODE 0 "register_operand" "=x,x,x,x,x") + (unspec:CSGNMODE + [(match_operand:CSGNMODE 2 "register_operand" "x,0,0,x,x") + (match_operand:CSGNMODE 3 "register_operand" "1,1,x,1,x") + (match_operand: 4 "nonimmediate_operand" "X,xm,xm,0,0") + (match_operand: 5 "nonimmediate_operand" "0,xm,1,xm,1")] + UNSPEC_COPYSIGN)) + (clobber (match_scratch: 1 "=x,x,x,x,x"))] + "(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) + || (TARGET_SSE2 && (mode == TFmode))" + "#") + +(define_split + [(set (match_operand:CSGNMODE 0 "register_operand" "") + (unspec:CSGNMODE + [(match_operand:CSGNMODE 2 "register_operand" "") + (match_operand:CSGNMODE 3 "register_operand" "") + (match_operand: 4 "" "") + (match_operand: 5 "" "")] + UNSPEC_COPYSIGN)) + (clobber (match_scratch: 1 ""))] + "((SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) + || (TARGET_SSE2 && (mode == TFmode))) + && reload_completed" + [(const_int 0)] + "ix86_split_copysign_var (operands); DONE;") + +;; One complement instructions + +(define_expand "one_cmpl2" + [(set (match_operand:SWIM 0 "nonimmediate_operand" "") + (not:SWIM (match_operand:SWIM 1 "nonimmediate_operand" "")))] + "" + "ix86_expand_unary_operator (NOT, mode, operands); DONE;") + +(define_insn "*one_cmpl2_1" + [(set (match_operand:SWI248 0 "nonimmediate_operand" "=rm") + (not:SWI248 (match_operand:SWI248 1 "nonimmediate_operand" "0")))] + "ix86_unary_operator_ok (NOT, mode, operands)" + "not{}\t%0" + [(set_attr "type" "negnot") + (set_attr "mode" "")]) + +;; %%% 
Potential partial reg stall on alternative 1. What to do? +(define_insn "*one_cmplqi2_1" + [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,r") + (not:QI (match_operand:QI 1 "nonimmediate_operand" "0,0")))] + "ix86_unary_operator_ok (NOT, QImode, operands)" + "@ + not{b}\t%0 + not{l}\t%k0" + [(set_attr "type" "negnot") + (set_attr "mode" "QI,SI")]) + +;; ??? Currently never generated - xor is used instead. +(define_insn "*one_cmplsi2_1_zext" + [(set (match_operand:DI 0 "register_operand" "=r") + (zero_extend:DI + (not:SI (match_operand:SI 1 "register_operand" "0"))))] + "TARGET_64BIT && ix86_unary_operator_ok (NOT, SImode, operands)" + "not{l}\t%k0" + [(set_attr "type" "negnot") + (set_attr "mode" "SI")]) + +(define_insn "*one_cmpl2_2" + [(set (reg FLAGS_REG) + (compare (not:SWI (match_operand:SWI 1 "nonimmediate_operand" "0")) + (const_int 0))) + (set (match_operand:SWI 0 "nonimmediate_operand" "=m") + (not:SWI (match_dup 1)))] + "ix86_match_ccmode (insn, CCNOmode) + && ix86_unary_operator_ok (NOT, mode, operands)" + "#" + [(set_attr "type" "alu1") + (set_attr "mode" "")]) + +(define_split + [(set (match_operand 0 "flags_reg_operand" "") + (match_operator 2 "compare_operator" + [(not:SWI (match_operand:SWI 3 "nonimmediate_operand" "")) + (const_int 0)])) + (set (match_operand:SWI 1 "nonimmediate_operand" "") + (not:SWI (match_dup 3)))] + "ix86_match_ccmode (insn, CCNOmode)" + [(parallel [(set (match_dup 0) + (match_op_dup 2 [(xor:SWI (match_dup 3) (const_int -1)) + (const_int 0)])) + (set (match_dup 1) + (xor:SWI (match_dup 3) (const_int -1)))])]) + +;; ??? Currently never generated - xor is used instead. +(define_insn "*one_cmplsi2_2_zext" + [(set (reg FLAGS_REG) + (compare (not:SI (match_operand:SI 1 "register_operand" "0")) + (const_int 0))) + (set (match_operand:DI 0 "register_operand" "=r") + (zero_extend:DI (not:SI (match_dup 1))))] + "TARGET_64BIT && ix86_match_ccmode (insn, CCNOmode) + && ix86_unary_operator_ok (NOT, SImode, operands)" + "#" + [(set_attr "type" "alu1") + (set_attr "mode" "SI")]) + +(define_split + [(set (match_operand 0 "flags_reg_operand" "") + (match_operator 2 "compare_operator" + [(not:SI (match_operand:SI 3 "register_operand" "")) + (const_int 0)])) + (set (match_operand:DI 1 "register_operand" "") + (zero_extend:DI (not:SI (match_dup 3))))] + "ix86_match_ccmode (insn, CCNOmode)" + [(parallel [(set (match_dup 0) + (match_op_dup 2 [(xor:SI (match_dup 3) (const_int -1)) + (const_int 0)])) + (set (match_dup 1) + (zero_extend:DI (xor:SI (match_dup 3) (const_int -1))))])]) + +;; Shift instructions + +;; DImode shifts are implemented using the i386 "shift double" opcode, +;; which is written as "sh[lr]d[lw] imm,reg,reg/mem". If the shift count +;; is variable, then the count is in %cl and the "imm" operand is dropped +;; from the assembler input. +;; +;; This instruction shifts the target reg/mem as usual, but instead of +;; shifting in zeros, bits are shifted in from reg operand. If the insn +;; is a left shift double, bits are taken from the high order bits of +;; reg, else if the insn is a shift right double, bits are taken from the +;; low order bits of reg. So if %eax is "1234" and %edx is "5678", +;; "shldl $8,%edx,%eax" leaves %edx unchanged and sets %eax to "2345". +;; +;; Since sh[lr]d does not change the `reg' operand, that is done +;; separately, making all shifts emit pairs of shift double and normal +;; shift. 
Since sh[lr]d does not shift more than 31 bits, and we wish to +;; support a 63 bit shift, each shift where the count is in a reg expands +;; to a pair of shifts, a branch, a shift by 32 and a label. +;; +;; If the shift count is a constant, we need never emit more than one +;; shift pair, instead using moves and sign extension for counts greater +;; than 31. + +(define_expand "ashl3" + [(set (match_operand:SDWIM 0 "" "") + (ashift:SDWIM (match_operand:SDWIM 1 "" "") + (match_operand:QI 2 "nonmemory_operand" "")))] + "" + "ix86_expand_binary_operator (ASHIFT, mode, operands); DONE;") + +(define_insn "*ashl3_doubleword" + [(set (match_operand:DWI 0 "register_operand" "=&r,r") + (ashift:DWI (match_operand:DWI 1 "reg_or_pm1_operand" "n,0") + (match_operand:QI 2 "nonmemory_operand" "c,c"))) + (clobber (reg:CC FLAGS_REG))] + "" + "#" + [(set_attr "type" "multi")]) + +(define_split + [(set (match_operand:DWI 0 "register_operand" "") + (ashift:DWI (match_operand:DWI 1 "nonmemory_operand" "") + (match_operand:QI 2 "nonmemory_operand" ""))) + (clobber (reg:CC FLAGS_REG))] + "(optimize && flag_peephole2) ? epilogue_completed : reload_completed" + [(const_int 0)] + "ix86_split_ashl (operands, NULL_RTX, mode); DONE;") + +;; By default we don't ask for a scratch register, because when DWImode +;; values are manipulated, registers are already at a premium. But if +;; we have one handy, we won't turn it away. + +(define_peephole2 + [(match_scratch:DWIH 3 "r") + (parallel [(set (match_operand: 0 "register_operand" "") + (ashift: + (match_operand: 1 "nonmemory_operand" "") + (match_operand:QI 2 "nonmemory_operand" ""))) + (clobber (reg:CC FLAGS_REG))]) + (match_dup 3)] + "TARGET_CMOVE" + [(const_int 0)] + "ix86_split_ashl (operands, operands[3], mode); DONE;") + +(define_insn "x86_64_shld" + [(set (match_operand:DI 0 "nonimmediate_operand" "+r*m") + (ior:DI (ashift:DI (match_dup 0) + (match_operand:QI 2 "nonmemory_operand" "Jc")) + (lshiftrt:DI (match_operand:DI 1 "register_operand" "r") + (minus:QI (const_int 64) (match_dup 2))))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_64BIT" + "shld{q}\t{%s2%1, %0|%0, %1, %2}" + [(set_attr "type" "ishift") + (set_attr "prefix_0f" "1") + (set_attr "mode" "DI") + (set_attr "athlon_decode" "vector") + (set_attr "amdfam10_decode" "vector") + (set_attr "bdver1_decode" "vector")]) + +(define_insn "x86_shld" + [(set (match_operand:SI 0 "nonimmediate_operand" "+r*m") + (ior:SI (ashift:SI (match_dup 0) + (match_operand:QI 2 "nonmemory_operand" "Ic")) + (lshiftrt:SI (match_operand:SI 1 "register_operand" "r") + (minus:QI (const_int 32) (match_dup 2))))) + (clobber (reg:CC FLAGS_REG))] + "" + "shld{l}\t{%s2%1, %0|%0, %1, %2}" + [(set_attr "type" "ishift") + (set_attr "prefix_0f" "1") + (set_attr "mode" "SI") + (set_attr "pent_pair" "np") + (set_attr "athlon_decode" "vector") + (set_attr "amdfam10_decode" "vector") + (set_attr "bdver1_decode" "vector")]) + +(define_expand "x86_shift_adj_1" + [(set (reg:CCZ FLAGS_REG) + (compare:CCZ (and:QI (match_operand:QI 2 "register_operand" "") + (match_dup 4)) + (const_int 0))) + (set (match_operand:SWI48 0 "register_operand" "") + (if_then_else:SWI48 (ne (reg:CCZ FLAGS_REG) (const_int 0)) + (match_operand:SWI48 1 "register_operand" "") + (match_dup 0))) + (set (match_dup 1) + (if_then_else:SWI48 (ne (reg:CCZ FLAGS_REG) (const_int 0)) + (match_operand:SWI48 3 "register_operand" "") + (match_dup 1)))] + "TARGET_CMOVE" + "operands[4] = GEN_INT (GET_MODE_BITSIZE (mode));") + +(define_expand "x86_shift_adj_2" + [(use (match_operand:SWI48 0 
"register_operand" "")) + (use (match_operand:SWI48 1 "register_operand" "")) + (use (match_operand:QI 2 "register_operand" ""))] + "" +{ + rtx label = gen_label_rtx (); + rtx tmp; + + emit_insn (gen_testqi_ccz_1 (operands[2], + GEN_INT (GET_MODE_BITSIZE (mode)))); + + tmp = gen_rtx_REG (CCZmode, FLAGS_REG); + tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx); + tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp, + gen_rtx_LABEL_REF (VOIDmode, label), + pc_rtx); + tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp)); + JUMP_LABEL (tmp) = label; + + emit_move_insn (operands[0], operands[1]); + ix86_expand_clear (operands[1]); + + emit_label (label); + LABEL_NUSES (label) = 1; + + DONE; +}) + +;; Avoid useless masking of count operand. +(define_insn "*ashl3_mask" + [(set (match_operand:SWI48 0 "nonimmediate_operand" "=rm") + (ashift:SWI48 + (match_operand:SWI48 1 "nonimmediate_operand" "0") + (subreg:QI + (and:SI + (match_operand:SI 2 "register_operand" "c") + (match_operand:SI 3 "const_int_operand" "n")) 0))) + (clobber (reg:CC FLAGS_REG))] + "ix86_binary_operator_ok (ASHIFT, mode, operands) + && (INTVAL (operands[3]) & (GET_MODE_BITSIZE (mode)-1)) + == GET_MODE_BITSIZE (mode)-1" +{ + return "sal{}\t{%b2, %0|%0, %b2}"; +} + [(set_attr "type" "ishift") + (set_attr "mode" "")]) + +(define_insn "*ashl3_1" + [(set (match_operand:SWI48 0 "nonimmediate_operand" "=rm,r") + (ashift:SWI48 (match_operand:SWI48 1 "nonimmediate_operand" "0,l") + (match_operand:QI 2 "nonmemory_operand" "c,M"))) + (clobber (reg:CC FLAGS_REG))] + "ix86_binary_operator_ok (ASHIFT, mode, operands)" +{ + switch (get_attr_type (insn)) + { + case TYPE_LEA: + return "#"; + + case TYPE_ALU: + gcc_assert (operands[2] == const1_rtx); + gcc_assert (rtx_equal_p (operands[0], operands[1])); + return "add{}\t%0, %0"; + + default: + if (operands[2] == const1_rtx + && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun))) + return "sal{}\t%0"; + else + return "sal{}\t{%2, %0|%0, %2}"; + } +} + [(set (attr "type") + (cond [(eq_attr "alternative" "1") + (const_string "lea") + (and (and (ne (symbol_ref "TARGET_DOUBLE_WITH_ADD") + (const_int 0)) + (match_operand 0 "register_operand" "")) + (match_operand 2 "const1_operand" "")) + (const_string "alu") + ] + (const_string "ishift"))) + (set (attr "length_immediate") + (if_then_else + (ior (eq_attr "type" "alu") + (and (eq_attr "type" "ishift") + (and (match_operand 2 "const1_operand" "") + (ne (symbol_ref "TARGET_SHIFT1 || optimize_function_for_size_p (cfun)") + (const_int 0))))) + (const_string "0") + (const_string "*"))) + (set_attr "mode" "")]) + +(define_insn "*ashlsi3_1_zext" + [(set (match_operand:DI 0 "register_operand" "=r,r") + (zero_extend:DI + (ashift:SI (match_operand:SI 1 "register_operand" "0,l") + (match_operand:QI 2 "nonmemory_operand" "cI,M")))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_64BIT && ix86_binary_operator_ok (ASHIFT, SImode, operands)" +{ + switch (get_attr_type (insn)) + { + case TYPE_LEA: + return "#"; + + case TYPE_ALU: + gcc_assert (operands[2] == const1_rtx); + return "add{l}\t%k0, %k0"; + + default: + if (operands[2] == const1_rtx + && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun))) + return "sal{l}\t%k0"; + else + return "sal{l}\t{%2, %k0|%k0, %2}"; + } +} + [(set (attr "type") + (cond [(eq_attr "alternative" "1") + (const_string "lea") + (and (ne (symbol_ref "TARGET_DOUBLE_WITH_ADD") + (const_int 0)) + (match_operand 2 "const1_operand" "")) + (const_string "alu") + ] + (const_string "ishift"))) + (set (attr "length_immediate") + (if_then_else + (ior 
(eq_attr "type" "alu") + (and (eq_attr "type" "ishift") + (and (match_operand 2 "const1_operand" "") + (ne (symbol_ref "TARGET_SHIFT1 || optimize_function_for_size_p (cfun)") + (const_int 0))))) + (const_string "0") + (const_string "*"))) + (set_attr "mode" "SI")]) + +(define_insn "*ashlhi3_1" + [(set (match_operand:HI 0 "nonimmediate_operand" "=rm") + (ashift:HI (match_operand:HI 1 "nonimmediate_operand" "0") + (match_operand:QI 2 "nonmemory_operand" "cI"))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_PARTIAL_REG_STALL + && ix86_binary_operator_ok (ASHIFT, HImode, operands)" +{ + switch (get_attr_type (insn)) + { + case TYPE_ALU: + gcc_assert (operands[2] == const1_rtx); + return "add{w}\t%0, %0"; + + default: + if (operands[2] == const1_rtx + && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun))) + return "sal{w}\t%0"; + else + return "sal{w}\t{%2, %0|%0, %2}"; + } +} + [(set (attr "type") + (cond [(and (and (ne (symbol_ref "TARGET_DOUBLE_WITH_ADD") + (const_int 0)) + (match_operand 0 "register_operand" "")) + (match_operand 2 "const1_operand" "")) + (const_string "alu") + ] + (const_string "ishift"))) + (set (attr "length_immediate") + (if_then_else + (ior (eq_attr "type" "alu") + (and (eq_attr "type" "ishift") + (and (match_operand 2 "const1_operand" "") + (ne (symbol_ref "TARGET_SHIFT1 || optimize_function_for_size_p (cfun)") + (const_int 0))))) + (const_string "0") + (const_string "*"))) + (set_attr "mode" "HI")]) + +(define_insn "*ashlhi3_1_lea" + [(set (match_operand:HI 0 "nonimmediate_operand" "=rm,r") + (ashift:HI (match_operand:HI 1 "nonimmediate_operand" "0,l") + (match_operand:QI 2 "nonmemory_operand" "cI,M"))) + (clobber (reg:CC FLAGS_REG))] + "!TARGET_PARTIAL_REG_STALL + && ix86_binary_operator_ok (ASHIFT, HImode, operands)" +{ + switch (get_attr_type (insn)) + { + case TYPE_LEA: + return "#"; + + case TYPE_ALU: + gcc_assert (operands[2] == const1_rtx); + return "add{w}\t%0, %0"; + + default: + if (operands[2] == const1_rtx + && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun))) + return "sal{w}\t%0"; + else + return "sal{w}\t{%2, %0|%0, %2}"; + } +} + [(set (attr "type") + (cond [(eq_attr "alternative" "1") + (const_string "lea") + (and (and (ne (symbol_ref "TARGET_DOUBLE_WITH_ADD") + (const_int 0)) + (match_operand 0 "register_operand" "")) + (match_operand 2 "const1_operand" "")) + (const_string "alu") + ] + (const_string "ishift"))) + (set (attr "length_immediate") + (if_then_else + (ior (eq_attr "type" "alu") + (and (eq_attr "type" "ishift") + (and (match_operand 2 "const1_operand" "") + (ne (symbol_ref "TARGET_SHIFT1 || optimize_function_for_size_p (cfun)") + (const_int 0))))) + (const_string "0") + (const_string "*"))) + (set_attr "mode" "HI,SI")]) + +(define_insn "*ashlqi3_1" + [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,r") + (ashift:QI (match_operand:QI 1 "nonimmediate_operand" "0,0") + (match_operand:QI 2 "nonmemory_operand" "cI,cI"))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_PARTIAL_REG_STALL + && ix86_binary_operator_ok (ASHIFT, QImode, operands)" +{ + switch (get_attr_type (insn)) + { + case TYPE_ALU: + gcc_assert (operands[2] == const1_rtx); + if (REG_P (operands[1]) && !ANY_QI_REG_P (operands[1])) + return "add{l}\t%k0, %k0"; + else + return "add{b}\t%0, %0"; + + default: + if (operands[2] == const1_rtx + && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun))) + { + if (get_attr_mode (insn) == MODE_SI) + return "sal{l}\t%k0"; + else + return "sal{b}\t%0"; + } + else + { + if (get_attr_mode (insn) == MODE_SI) + return "sal{l}\t{%2, 
%k0|%k0, %2}"; + else + return "sal{b}\t{%2, %0|%0, %2}"; + } + } +} + [(set (attr "type") + (cond [(and (and (ne (symbol_ref "TARGET_DOUBLE_WITH_ADD") + (const_int 0)) + (match_operand 0 "register_operand" "")) + (match_operand 2 "const1_operand" "")) + (const_string "alu") + ] + (const_string "ishift"))) + (set (attr "length_immediate") + (if_then_else + (ior (eq_attr "type" "alu") + (and (eq_attr "type" "ishift") + (and (match_operand 2 "const1_operand" "") + (ne (symbol_ref "TARGET_SHIFT1 || optimize_function_for_size_p (cfun)") + (const_int 0))))) + (const_string "0") + (const_string "*"))) + (set_attr "mode" "QI,SI")]) + +;; %%% Potential partial reg stall on alternative 2. What to do? +(define_insn "*ashlqi3_1_lea" + [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,r,r") + (ashift:QI (match_operand:QI 1 "nonimmediate_operand" "0,0,l") + (match_operand:QI 2 "nonmemory_operand" "cI,cI,M"))) + (clobber (reg:CC FLAGS_REG))] + "!TARGET_PARTIAL_REG_STALL + && ix86_binary_operator_ok (ASHIFT, QImode, operands)" +{ + switch (get_attr_type (insn)) + { + case TYPE_LEA: + return "#"; + + case TYPE_ALU: + gcc_assert (operands[2] == const1_rtx); + if (REG_P (operands[1]) && !ANY_QI_REG_P (operands[1])) + return "add{l}\t%k0, %k0"; + else + return "add{b}\t%0, %0"; + + default: + if (operands[2] == const1_rtx + && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun))) + { + if (get_attr_mode (insn) == MODE_SI) + return "sal{l}\t%k0"; + else + return "sal{b}\t%0"; + } + else + { + if (get_attr_mode (insn) == MODE_SI) + return "sal{l}\t{%2, %k0|%k0, %2}"; + else + return "sal{b}\t{%2, %0|%0, %2}"; + } + } +} + [(set (attr "type") + (cond [(eq_attr "alternative" "2") + (const_string "lea") + (and (and (ne (symbol_ref "TARGET_DOUBLE_WITH_ADD") + (const_int 0)) + (match_operand 0 "register_operand" "")) + (match_operand 2 "const1_operand" "")) + (const_string "alu") + ] + (const_string "ishift"))) + (set (attr "length_immediate") + (if_then_else + (ior (eq_attr "type" "alu") + (and (eq_attr "type" "ishift") + (and (match_operand 2 "const1_operand" "") + (ne (symbol_ref "TARGET_SHIFT1 || optimize_function_for_size_p (cfun)") + (const_int 0))))) + (const_string "0") + (const_string "*"))) + (set_attr "mode" "QI,SI,SI")]) + +(define_insn "*ashlqi3_1_slp" + [(set (strict_low_part (match_operand:QI 0 "nonimmediate_operand" "+qm")) + (ashift:QI (match_dup 0) + (match_operand:QI 1 "nonmemory_operand" "cI"))) + (clobber (reg:CC FLAGS_REG))] + "(optimize_function_for_size_p (cfun) + || !TARGET_PARTIAL_FLAG_REG_STALL + || (operands[1] == const1_rtx + && (TARGET_SHIFT1 + || (TARGET_DOUBLE_WITH_ADD && REG_P (operands[0])))))" +{ + switch (get_attr_type (insn)) + { + case TYPE_ALU: + gcc_assert (operands[1] == const1_rtx); + return "add{b}\t%0, %0"; + + default: + if (operands[1] == const1_rtx + && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun))) + return "sal{b}\t%0"; + else + return "sal{b}\t{%1, %0|%0, %1}"; + } +} + [(set (attr "type") + (cond [(and (and (ne (symbol_ref "TARGET_DOUBLE_WITH_ADD") + (const_int 0)) + (match_operand 0 "register_operand" "")) + (match_operand 1 "const1_operand" "")) + (const_string "alu") + ] + (const_string "ishift1"))) + (set (attr "length_immediate") + (if_then_else + (ior (eq_attr "type" "alu") + (and (eq_attr "type" "ishift1") + (and (match_operand 1 "const1_operand" "") + (ne (symbol_ref "TARGET_SHIFT1 || optimize_function_for_size_p (cfun)") + (const_int 0))))) + (const_string "0") + (const_string "*"))) + (set_attr "mode" "QI")]) + +;; Convert lea to the lea 
pattern to avoid flags dependency. +(define_split + [(set (match_operand 0 "register_operand" "") + (ashift (match_operand 1 "index_register_operand" "") + (match_operand:QI 2 "const_int_operand" ""))) + (clobber (reg:CC FLAGS_REG))] + "reload_completed + && true_regnum (operands[0]) != true_regnum (operands[1])" + [(const_int 0)] +{ + rtx pat; + enum machine_mode mode = GET_MODE (operands[0]); + + if (mode != Pmode) + operands[1] = gen_lowpart (Pmode, operands[1]); + operands[2] = gen_int_mode (1 << INTVAL (operands[2]), Pmode); + + pat = gen_rtx_MULT (Pmode, operands[1], operands[2]); + + if (GET_MODE_SIZE (mode) < GET_MODE_SIZE (SImode)) + operands[0] = gen_lowpart (SImode, operands[0]); + + if (TARGET_64BIT && mode != Pmode) + pat = gen_rtx_SUBREG (SImode, pat, 0); + + emit_insn (gen_rtx_SET (VOIDmode, operands[0], pat)); + DONE; +}) + +;; Convert lea to the lea pattern to avoid flags dependency. +(define_split + [(set (match_operand:DI 0 "register_operand" "") + (zero_extend:DI + (ashift:SI (match_operand:SI 1 "index_register_operand" "") + (match_operand:QI 2 "const_int_operand" "")))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_64BIT && reload_completed + && true_regnum (operands[0]) != true_regnum (operands[1])" + [(set (match_dup 0) + (zero_extend:DI (subreg:SI (mult:DI (match_dup 1) (match_dup 2)) 0)))] +{ + operands[1] = gen_lowpart (DImode, operands[1]); + operands[2] = gen_int_mode (1 << INTVAL (operands[2]), DImode); +}) + +;; This pattern can't accept a variable shift count, since shifts by +;; zero don't affect the flags. We assume that shifts by constant +;; zero are optimized away. +(define_insn "*ashl3_cmp" + [(set (reg FLAGS_REG) + (compare + (ashift:SWI (match_operand:SWI 1 "nonimmediate_operand" "0") + (match_operand:QI 2 "" "")) + (const_int 0))) + (set (match_operand:SWI 0 "nonimmediate_operand" "=m") + (ashift:SWI (match_dup 1) (match_dup 2)))] + "(optimize_function_for_size_p (cfun) + || !TARGET_PARTIAL_FLAG_REG_STALL + || (operands[2] == const1_rtx + && (TARGET_SHIFT1 + || (TARGET_DOUBLE_WITH_ADD && REG_P (operands[0]))))) + && ix86_match_ccmode (insn, CCGOCmode) + && ix86_binary_operator_ok (ASHIFT, mode, operands)" +{ + switch (get_attr_type (insn)) + { + case TYPE_ALU: + gcc_assert (operands[2] == const1_rtx); + return "add{}\t%0, %0"; + + default: + if (operands[2] == const1_rtx + && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun))) + return "sal{}\t%0"; + else + return "sal{}\t{%2, %0|%0, %2}"; + } +} + [(set (attr "type") + (cond [(and (and (ne (symbol_ref "TARGET_DOUBLE_WITH_ADD") + (const_int 0)) + (match_operand 0 "register_operand" "")) + (match_operand 2 "const1_operand" "")) + (const_string "alu") + ] + (const_string "ishift"))) + (set (attr "length_immediate") + (if_then_else + (ior (eq_attr "type" "alu") + (and (eq_attr "type" "ishift") + (and (match_operand 2 "const1_operand" "") + (ne (symbol_ref "TARGET_SHIFT1 || optimize_function_for_size_p (cfun)") + (const_int 0))))) + (const_string "0") + (const_string "*"))) + (set_attr "mode" "")]) + +(define_insn "*ashlsi3_cmp_zext" + [(set (reg FLAGS_REG) + (compare + (ashift:SI (match_operand:SI 1 "register_operand" "0") + (match_operand:QI 2 "const_1_to_31_operand" "I")) + (const_int 0))) + (set (match_operand:DI 0 "register_operand" "=r") + (zero_extend:DI (ashift:SI (match_dup 1) (match_dup 2))))] + "TARGET_64BIT + && (optimize_function_for_size_p (cfun) + || !TARGET_PARTIAL_FLAG_REG_STALL + || (operands[2] == const1_rtx + && (TARGET_SHIFT1 + || TARGET_DOUBLE_WITH_ADD))) + && ix86_match_ccmode 
(insn, CCGOCmode) + && ix86_binary_operator_ok (ASHIFT, SImode, operands)" +{ + switch (get_attr_type (insn)) + { + case TYPE_ALU: + gcc_assert (operands[2] == const1_rtx); + return "add{l}\t%k0, %k0"; + + default: + if (operands[2] == const1_rtx + && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun))) + return "sal{l}\t%k0"; + else + return "sal{l}\t{%2, %k0|%k0, %2}"; + } +} + [(set (attr "type") + (cond [(and (ne (symbol_ref "TARGET_DOUBLE_WITH_ADD") + (const_int 0)) + (match_operand 2 "const1_operand" "")) + (const_string "alu") + ] + (const_string "ishift"))) + (set (attr "length_immediate") + (if_then_else + (ior (eq_attr "type" "alu") + (and (eq_attr "type" "ishift") + (and (match_operand 2 "const1_operand" "") + (ne (symbol_ref "TARGET_SHIFT1 || optimize_function_for_size_p (cfun)") + (const_int 0))))) + (const_string "0") + (const_string "*"))) + (set_attr "mode" "SI")]) + +(define_insn "*ashl3_cconly" + [(set (reg FLAGS_REG) + (compare + (ashift:SWI (match_operand:SWI 1 "register_operand" "0") + (match_operand:QI 2 "" "")) + (const_int 0))) + (clobber (match_scratch:SWI 0 "="))] + "(optimize_function_for_size_p (cfun) + || !TARGET_PARTIAL_FLAG_REG_STALL + || (operands[2] == const1_rtx + && (TARGET_SHIFT1 + || TARGET_DOUBLE_WITH_ADD))) + && ix86_match_ccmode (insn, CCGOCmode)" +{ + switch (get_attr_type (insn)) + { + case TYPE_ALU: + gcc_assert (operands[2] == const1_rtx); + return "add{}\t%0, %0"; + + default: + if (operands[2] == const1_rtx + && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun))) + return "sal{}\t%0"; + else + return "sal{}\t{%2, %0|%0, %2}"; + } +} + [(set (attr "type") + (cond [(and (and (ne (symbol_ref "TARGET_DOUBLE_WITH_ADD") + (const_int 0)) + (match_operand 0 "register_operand" "")) + (match_operand 2 "const1_operand" "")) + (const_string "alu") + ] + (const_string "ishift"))) + (set (attr "length_immediate") + (if_then_else + (ior (eq_attr "type" "alu") + (and (eq_attr "type" "ishift") + (and (match_operand 2 "const1_operand" "") + (ne (symbol_ref "TARGET_SHIFT1 || optimize_function_for_size_p (cfun)") + (const_int 0))))) + (const_string "0") + (const_string "*"))) + (set_attr "mode" "")]) + +;; See comment above `ashl3' about how this works. + +(define_expand "3" + [(set (match_operand:SDWIM 0 "" "") + (any_shiftrt:SDWIM (match_operand:SDWIM 1 "" "") + (match_operand:QI 2 "nonmemory_operand" "")))] + "" + "ix86_expand_binary_operator (, mode, operands); DONE;") + +;; Avoid useless masking of count operand. +(define_insn "*3_mask" + [(set (match_operand:SWI48 0 "nonimmediate_operand" "=rm") + (any_shiftrt:SWI48 + (match_operand:SWI48 1 "nonimmediate_operand" "0") + (subreg:QI + (and:SI + (match_operand:SI 2 "register_operand" "c") + (match_operand:SI 3 "const_int_operand" "n")) 0))) + (clobber (reg:CC FLAGS_REG))] + "ix86_binary_operator_ok (, mode, operands) + && (INTVAL (operands[3]) & (GET_MODE_BITSIZE (mode)-1)) + == GET_MODE_BITSIZE (mode)-1" +{ + return "{}\t{%b2, %0|%0, %b2}"; +} + [(set_attr "type" "ishift") + (set_attr "mode" "")]) + +(define_insn_and_split "*3_doubleword" + [(set (match_operand:DWI 0 "register_operand" "=r") + (any_shiftrt:DWI (match_operand:DWI 1 "register_operand" "0") + (match_operand:QI 2 "nonmemory_operand" "c"))) + (clobber (reg:CC FLAGS_REG))] + "" + "#" + "(optimize && flag_peephole2) ? 
epilogue_completed : reload_completed" + [(const_int 0)] + "ix86_split_ (operands, NULL_RTX, mode); DONE;" + [(set_attr "type" "multi")]) + +;; By default we don't ask for a scratch register, because when DWImode +;; values are manipulated, registers are already at a premium. But if +;; we have one handy, we won't turn it away. + +(define_peephole2 + [(match_scratch:DWIH 3 "r") + (parallel [(set (match_operand: 0 "register_operand" "") + (any_shiftrt: + (match_operand: 1 "register_operand" "") + (match_operand:QI 2 "nonmemory_operand" ""))) + (clobber (reg:CC FLAGS_REG))]) + (match_dup 3)] + "TARGET_CMOVE" + [(const_int 0)] + "ix86_split_ (operands, operands[3], mode); DONE;") + +(define_insn "x86_64_shrd" + [(set (match_operand:DI 0 "nonimmediate_operand" "+r*m") + (ior:DI (ashiftrt:DI (match_dup 0) + (match_operand:QI 2 "nonmemory_operand" "Jc")) + (ashift:DI (match_operand:DI 1 "register_operand" "r") + (minus:QI (const_int 64) (match_dup 2))))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_64BIT" + "shrd{q}\t{%s2%1, %0|%0, %1, %2}" + [(set_attr "type" "ishift") + (set_attr "prefix_0f" "1") + (set_attr "mode" "DI") + (set_attr "athlon_decode" "vector") + (set_attr "amdfam10_decode" "vector") + (set_attr "bdver1_decode" "vector")]) + +(define_insn "x86_shrd" + [(set (match_operand:SI 0 "nonimmediate_operand" "+r*m") + (ior:SI (ashiftrt:SI (match_dup 0) + (match_operand:QI 2 "nonmemory_operand" "Ic")) + (ashift:SI (match_operand:SI 1 "register_operand" "r") + (minus:QI (const_int 32) (match_dup 2))))) + (clobber (reg:CC FLAGS_REG))] + "" + "shrd{l}\t{%s2%1, %0|%0, %1, %2}" + [(set_attr "type" "ishift") + (set_attr "prefix_0f" "1") + (set_attr "mode" "SI") + (set_attr "pent_pair" "np") + (set_attr "athlon_decode" "vector") + (set_attr "amdfam10_decode" "vector") + (set_attr "bdver1_decode" "vector")]) + +(define_insn "ashrdi3_cvt" + [(set (match_operand:DI 0 "nonimmediate_operand" "=*d,rm") + (ashiftrt:DI (match_operand:DI 1 "nonimmediate_operand" "*a,0") + (match_operand:QI 2 "const_int_operand" ""))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_64BIT && INTVAL (operands[2]) == 63 + && (TARGET_USE_CLTD || optimize_function_for_size_p (cfun)) + && ix86_binary_operator_ok (ASHIFTRT, DImode, operands)" + "@ + {cqto|cqo} + sar{q}\t{%2, %0|%0, %2}" + [(set_attr "type" "imovx,ishift") + (set_attr "prefix_0f" "0,*") + (set_attr "length_immediate" "0,*") + (set_attr "modrm" "0,1") + (set_attr "mode" "DI")]) + +(define_insn "ashrsi3_cvt" + [(set (match_operand:SI 0 "nonimmediate_operand" "=*d,rm") + (ashiftrt:SI (match_operand:SI 1 "nonimmediate_operand" "*a,0") + (match_operand:QI 2 "const_int_operand" ""))) + (clobber (reg:CC FLAGS_REG))] + "INTVAL (operands[2]) == 31 + && (TARGET_USE_CLTD || optimize_function_for_size_p (cfun)) + && ix86_binary_operator_ok (ASHIFTRT, SImode, operands)" + "@ + {cltd|cdq} + sar{l}\t{%2, %0|%0, %2}" + [(set_attr "type" "imovx,ishift") + (set_attr "prefix_0f" "0,*") + (set_attr "length_immediate" "0,*") + (set_attr "modrm" "0,1") + (set_attr "mode" "SI")]) + +(define_insn "*ashrsi3_cvt_zext" + [(set (match_operand:DI 0 "register_operand" "=*d,r") + (zero_extend:DI + (ashiftrt:SI (match_operand:SI 1 "register_operand" "*a,0") + (match_operand:QI 2 "const_int_operand" "")))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_64BIT && INTVAL (operands[2]) == 31 + && (TARGET_USE_CLTD || optimize_function_for_size_p (cfun)) + && ix86_binary_operator_ok (ASHIFTRT, SImode, operands)" + "@ + {cltd|cdq} + sar{l}\t{%2, %k0|%k0, %2}" + [(set_attr "type" "imovx,ishift") + (set_attr 
"prefix_0f" "0,*") + (set_attr "length_immediate" "0,*") + (set_attr "modrm" "0,1") + (set_attr "mode" "SI")]) + +(define_expand "x86_shift_adj_3" + [(use (match_operand:SWI48 0 "register_operand" "")) + (use (match_operand:SWI48 1 "register_operand" "")) + (use (match_operand:QI 2 "register_operand" ""))] + "" +{ + rtx label = gen_label_rtx (); + rtx tmp; + + emit_insn (gen_testqi_ccz_1 (operands[2], + GEN_INT (GET_MODE_BITSIZE (mode)))); + + tmp = gen_rtx_REG (CCZmode, FLAGS_REG); + tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx); + tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp, + gen_rtx_LABEL_REF (VOIDmode, label), + pc_rtx); + tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp)); + JUMP_LABEL (tmp) = label; + + emit_move_insn (operands[0], operands[1]); + emit_insn (gen_ashr3_cvt (operands[1], operands[1], + GEN_INT (GET_MODE_BITSIZE (mode)-1))); + emit_label (label); + LABEL_NUSES (label) = 1; + + DONE; +}) + +(define_insn "*3_1" + [(set (match_operand:SWI 0 "nonimmediate_operand" "=m") + (any_shiftrt:SWI (match_operand:SWI 1 "nonimmediate_operand" "0") + (match_operand:QI 2 "nonmemory_operand" "c"))) + (clobber (reg:CC FLAGS_REG))] + "ix86_binary_operator_ok (, mode, operands)" +{ + if (operands[2] == const1_rtx + && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun))) + return "{}\t%0"; + else + return "{}\t{%2, %0|%0, %2}"; +} + [(set_attr "type" "ishift") + (set (attr "length_immediate") + (if_then_else + (and (match_operand 2 "const1_operand" "") + (ne (symbol_ref "TARGET_SHIFT1 || optimize_function_for_size_p (cfun)") + (const_int 0))) + (const_string "0") + (const_string "*"))) + (set_attr "mode" "")]) + +(define_insn "*si3_1_zext" + [(set (match_operand:DI 0 "register_operand" "=r") + (zero_extend:DI + (any_shiftrt:SI (match_operand:SI 1 "register_operand" "0") + (match_operand:QI 2 "nonmemory_operand" "cI")))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_64BIT && ix86_binary_operator_ok (, SImode, operands)" +{ + if (operands[2] == const1_rtx + && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun))) + return "{l}\t%k0"; + else + return "{l}\t{%2, %k0|%k0, %2}"; +} + [(set_attr "type" "ishift") + (set (attr "length_immediate") + (if_then_else + (and (match_operand 2 "const1_operand" "") + (ne (symbol_ref "TARGET_SHIFT1 || optimize_function_for_size_p (cfun)") + (const_int 0))) + (const_string "0") + (const_string "*"))) + (set_attr "mode" "SI")]) + +(define_insn "*qi3_1_slp" + [(set (strict_low_part (match_operand:QI 0 "nonimmediate_operand" "+qm")) + (any_shiftrt:QI (match_dup 0) + (match_operand:QI 1 "nonmemory_operand" "cI"))) + (clobber (reg:CC FLAGS_REG))] + "(optimize_function_for_size_p (cfun) + || !TARGET_PARTIAL_REG_STALL + || (operands[1] == const1_rtx + && TARGET_SHIFT1))" +{ + if (operands[1] == const1_rtx + && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun))) + return "{b}\t%0"; + else + return "{b}\t{%1, %0|%0, %1}"; +} + [(set_attr "type" "ishift1") + (set (attr "length_immediate") + (if_then_else + (and (match_operand 1 "const1_operand" "") + (ne (symbol_ref "TARGET_SHIFT1 || optimize_function_for_size_p (cfun)") + (const_int 0))) + (const_string "0") + (const_string "*"))) + (set_attr "mode" "QI")]) + +;; This pattern can't accept a variable shift count, since shifts by +;; zero don't affect the flags. We assume that shifts by constant +;; zero are optimized away. 
+(define_insn "*3_cmp" + [(set (reg FLAGS_REG) + (compare + (any_shiftrt:SWI + (match_operand:SWI 1 "nonimmediate_operand" "0") + (match_operand:QI 2 "" "")) + (const_int 0))) + (set (match_operand:SWI 0 "nonimmediate_operand" "=m") + (any_shiftrt:SWI (match_dup 1) (match_dup 2)))] + "(optimize_function_for_size_p (cfun) + || !TARGET_PARTIAL_FLAG_REG_STALL + || (operands[2] == const1_rtx + && TARGET_SHIFT1)) + && ix86_match_ccmode (insn, CCGOCmode) + && ix86_binary_operator_ok (, mode, operands)" +{ + if (operands[2] == const1_rtx + && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun))) + return "{}\t%0"; + else + return "{}\t{%2, %0|%0, %2}"; +} + [(set_attr "type" "ishift") + (set (attr "length_immediate") + (if_then_else + (and (match_operand 2 "const1_operand" "") + (ne (symbol_ref "TARGET_SHIFT1 || optimize_function_for_size_p (cfun)") + (const_int 0))) + (const_string "0") + (const_string "*"))) + (set_attr "mode" "")]) + +(define_insn "*si3_cmp_zext" + [(set (reg FLAGS_REG) + (compare + (any_shiftrt:SI (match_operand:SI 1 "register_operand" "0") + (match_operand:QI 2 "const_1_to_31_operand" "I")) + (const_int 0))) + (set (match_operand:DI 0 "register_operand" "=r") + (zero_extend:DI (any_shiftrt:SI (match_dup 1) (match_dup 2))))] + "TARGET_64BIT + && (optimize_function_for_size_p (cfun) + || !TARGET_PARTIAL_FLAG_REG_STALL + || (operands[2] == const1_rtx + && TARGET_SHIFT1)) + && ix86_match_ccmode (insn, CCGOCmode) + && ix86_binary_operator_ok (, SImode, operands)" +{ + if (operands[2] == const1_rtx + && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun))) + return "{l}\t%k0"; + else + return "{l}\t{%2, %k0|%k0, %2}"; +} + [(set_attr "type" "ishift") + (set (attr "length_immediate") + (if_then_else + (and (match_operand 2 "const1_operand" "") + (ne (symbol_ref "TARGET_SHIFT1 || optimize_function_for_size_p (cfun)") + (const_int 0))) + (const_string "0") + (const_string "*"))) + (set_attr "mode" "SI")]) + +(define_insn "*3_cconly" + [(set (reg FLAGS_REG) + (compare + (any_shiftrt:SWI + (match_operand:SWI 1 "register_operand" "0") + (match_operand:QI 2 "" "")) + (const_int 0))) + (clobber (match_scratch:SWI 0 "="))] + "(optimize_function_for_size_p (cfun) + || !TARGET_PARTIAL_FLAG_REG_STALL + || (operands[2] == const1_rtx + && TARGET_SHIFT1)) + && ix86_match_ccmode (insn, CCGOCmode)" +{ + if (operands[2] == const1_rtx + && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun))) + return "{}\t%0"; + else + return "{}\t{%2, %0|%0, %2}"; +} + [(set_attr "type" "ishift") + (set (attr "length_immediate") + (if_then_else + (and (match_operand 2 "const1_operand" "") + (ne (symbol_ref "TARGET_SHIFT1 || optimize_function_for_size_p (cfun)") + (const_int 0))) + (const_string "0") + (const_string "*"))) + (set_attr "mode" "")]) + +;; Rotate instructions + +(define_expand "ti3" + [(set (match_operand:TI 0 "register_operand" "") + (any_rotate:TI (match_operand:TI 1 "register_operand" "") + (match_operand:QI 2 "nonmemory_operand" "")))] + "TARGET_64BIT" +{ + if (const_1_to_63_operand (operands[2], VOIDmode)) + emit_insn (gen_ix86_ti3_doubleword + (operands[0], operands[1], operands[2])); + else + FAIL; + + DONE; +}) + +(define_expand "di3" + [(set (match_operand:DI 0 "shiftdi_operand" "") + (any_rotate:DI (match_operand:DI 1 "shiftdi_operand" "") + (match_operand:QI 2 "nonmemory_operand" "")))] + "" +{ + if (TARGET_64BIT) + ix86_expand_binary_operator (, DImode, operands); + else if (const_1_to_31_operand (operands[2], VOIDmode)) + emit_insn (gen_ix86_di3_doubleword + (operands[0], 
operands[1], operands[2])); + else + FAIL; + + DONE; +}) + +(define_expand "3" + [(set (match_operand:SWIM124 0 "nonimmediate_operand" "") + (any_rotate:SWIM124 (match_operand:SWIM124 1 "nonimmediate_operand" "") + (match_operand:QI 2 "nonmemory_operand" "")))] + "" + "ix86_expand_binary_operator (, mode, operands); DONE;") + +;; Avoid useless masking of count operand. +(define_insn "*3_mask" + [(set (match_operand:SWI48 0 "nonimmediate_operand" "=rm") + (any_rotate:SWI48 + (match_operand:SWI48 1 "nonimmediate_operand" "0") + (subreg:QI + (and:SI + (match_operand:SI 2 "register_operand" "c") + (match_operand:SI 3 "const_int_operand" "n")) 0))) + (clobber (reg:CC FLAGS_REG))] + "ix86_binary_operator_ok (, mode, operands) + && (INTVAL (operands[3]) & (GET_MODE_BITSIZE (mode)-1)) + == GET_MODE_BITSIZE (mode)-1" +{ + return "{}\t{%b2, %0|%0, %b2}"; +} + [(set_attr "type" "rotate") + (set_attr "mode" "")]) + +;; Implement rotation using two double-precision +;; shift instructions and a scratch register. + +(define_insn_and_split "ix86_rotl3_doubleword" + [(set (match_operand: 0 "register_operand" "=r") + (rotate: (match_operand: 1 "register_operand" "0") + (match_operand:QI 2 "" ""))) + (clobber (reg:CC FLAGS_REG)) + (clobber (match_scratch:DWIH 3 "=&r"))] + "" + "#" + "reload_completed" + [(set (match_dup 3) (match_dup 4)) + (parallel + [(set (match_dup 4) + (ior:DWIH (ashift:DWIH (match_dup 4) (match_dup 2)) + (lshiftrt:DWIH (match_dup 5) + (minus:QI (match_dup 6) (match_dup 2))))) + (clobber (reg:CC FLAGS_REG))]) + (parallel + [(set (match_dup 5) + (ior:DWIH (ashift:DWIH (match_dup 5) (match_dup 2)) + (lshiftrt:DWIH (match_dup 3) + (minus:QI (match_dup 6) (match_dup 2))))) + (clobber (reg:CC FLAGS_REG))])] +{ + operands[6] = GEN_INT (GET_MODE_BITSIZE (mode)); + + split_double_mode (mode, &operands[0], 1, &operands[4], &operands[5]); +}) + +(define_insn_and_split "ix86_rotr3_doubleword" + [(set (match_operand: 0 "register_operand" "=r") + (rotatert: (match_operand: 1 "register_operand" "0") + (match_operand:QI 2 "" ""))) + (clobber (reg:CC FLAGS_REG)) + (clobber (match_scratch:DWIH 3 "=&r"))] + "" + "#" + "reload_completed" + [(set (match_dup 3) (match_dup 4)) + (parallel + [(set (match_dup 4) + (ior:DWIH (ashiftrt:DWIH (match_dup 4) (match_dup 2)) + (ashift:DWIH (match_dup 5) + (minus:QI (match_dup 6) (match_dup 2))))) + (clobber (reg:CC FLAGS_REG))]) + (parallel + [(set (match_dup 5) + (ior:DWIH (ashiftrt:DWIH (match_dup 5) (match_dup 2)) + (ashift:DWIH (match_dup 3) + (minus:QI (match_dup 6) (match_dup 2))))) + (clobber (reg:CC FLAGS_REG))])] +{ + operands[6] = GEN_INT (GET_MODE_BITSIZE (mode)); + + split_double_mode (mode, &operands[0], 1, &operands[4], &operands[5]); +}) + +(define_insn "*3_1" + [(set (match_operand:SWI 0 "nonimmediate_operand" "=m") + (any_rotate:SWI (match_operand:SWI 1 "nonimmediate_operand" "0") + (match_operand:QI 2 "nonmemory_operand" "c"))) + (clobber (reg:CC FLAGS_REG))] + "ix86_binary_operator_ok (, mode, operands)" +{ + if (operands[2] == const1_rtx + && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun))) + return "{}\t%0"; + else + return "{}\t{%2, %0|%0, %2}"; +} + [(set_attr "type" "rotate") + (set (attr "length_immediate") + (if_then_else + (and (match_operand 2 "const1_operand" "") + (ne (symbol_ref "TARGET_SHIFT1 || optimize_function_for_size_p (cfun)") + (const_int 0))) + (const_string "0") + (const_string "*"))) + (set_attr "mode" "")]) + +(define_insn "*si3_1_zext" + [(set (match_operand:DI 0 "register_operand" "=r") + (zero_extend:DI + 
(any_rotate:SI (match_operand:SI 1 "register_operand" "0") + (match_operand:QI 2 "nonmemory_operand" "cI")))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_64BIT && ix86_binary_operator_ok (, SImode, operands)" +{ + if (operands[2] == const1_rtx + && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun))) + return "{l}\t%k0"; + else + return "{l}\t{%2, %k0|%k0, %2}"; +} + [(set_attr "type" "rotate") + (set (attr "length_immediate") + (if_then_else + (and (match_operand 2 "const1_operand" "") + (ne (symbol_ref "TARGET_SHIFT1 || optimize_function_for_size_p (cfun)") + (const_int 0))) + (const_string "0") + (const_string "*"))) + (set_attr "mode" "SI")]) + +(define_insn "*qi3_1_slp" + [(set (strict_low_part (match_operand:QI 0 "nonimmediate_operand" "+qm")) + (any_rotate:QI (match_dup 0) + (match_operand:QI 1 "nonmemory_operand" "cI"))) + (clobber (reg:CC FLAGS_REG))] + "(optimize_function_for_size_p (cfun) + || !TARGET_PARTIAL_REG_STALL + || (operands[1] == const1_rtx + && TARGET_SHIFT1))" +{ + if (operands[1] == const1_rtx + && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun))) + return "{b}\t%0"; + else + return "{b}\t{%1, %0|%0, %1}"; +} + [(set_attr "type" "rotate1") + (set (attr "length_immediate") + (if_then_else + (and (match_operand 1 "const1_operand" "") + (ne (symbol_ref "TARGET_SHIFT1 || optimize_function_for_size_p (cfun)") + (const_int 0))) + (const_string "0") + (const_string "*"))) + (set_attr "mode" "QI")]) + +(define_split + [(set (match_operand:HI 0 "register_operand" "") + (any_rotate:HI (match_dup 0) (const_int 8))) + (clobber (reg:CC FLAGS_REG))] + "reload_completed + && (TARGET_USE_XCHGB || optimize_function_for_size_p (cfun))" + [(parallel [(set (strict_low_part (match_dup 0)) + (bswap:HI (match_dup 0))) + (clobber (reg:CC FLAGS_REG))])]) + +;; Bit set / bit test instructions + +(define_expand "extv" + [(set (match_operand:SI 0 "register_operand" "") + (sign_extract:SI (match_operand:SI 1 "register_operand" "") + (match_operand:SI 2 "const8_operand" "") + (match_operand:SI 3 "const8_operand" "")))] + "" +{ + /* Handle extractions from %ah et al. */ + if (INTVAL (operands[2]) != 8 || INTVAL (operands[3]) != 8) + FAIL; + + /* From mips.md: extract_bit_field doesn't verify that our source + matches the predicate, so check it again here. */ + if (! ext_register_operand (operands[1], VOIDmode)) + FAIL; +}) + +(define_expand "extzv" + [(set (match_operand:SI 0 "register_operand" "") + (zero_extract:SI (match_operand 1 "ext_register_operand" "") + (match_operand:SI 2 "const8_operand" "") + (match_operand:SI 3 "const8_operand" "")))] + "" +{ + /* Handle extractions from %ah et al. */ + if (INTVAL (operands[2]) != 8 || INTVAL (operands[3]) != 8) + FAIL; + + /* From mips.md: extract_bit_field doesn't verify that our source + matches the predicate, so check it again here. */ + if (! ext_register_operand (operands[1], VOIDmode)) + FAIL; +}) + +(define_expand "insv" + [(set (zero_extract (match_operand 0 "ext_register_operand" "") + (match_operand 1 "const8_operand" "") + (match_operand 2 "const8_operand" "")) + (match_operand 3 "register_operand" ""))] + "" +{ + rtx (*gen_mov_insv_1) (rtx, rtx); + + /* Handle insertions to %ah et al. */ + if (INTVAL (operands[1]) != 8 || INTVAL (operands[2]) != 8) + FAIL; + + /* From mips.md: insert_bit_field doesn't verify that our source + matches the predicate, so check it again here. */ + if (! ext_register_operand (operands[0], VOIDmode)) + FAIL; + + gen_mov_insv_1 = (TARGET_64BIT + ? 
gen_movdi_insv_1 : gen_movsi_insv_1); + + emit_insn (gen_mov_insv_1 (operands[0], operands[3])); + DONE; +}) + +;; %%% bts, btr, btc, bt. +;; In general these instructions are *slow* when applied to memory, +;; since they enforce atomic operation. When applied to registers, +;; it depends on the cpu implementation. They're never faster than +;; the corresponding and/ior/xor operations, so with 32-bit there's +;; no point. But in 64-bit, we can't hold the relevant immediates +;; within the instruction itself, so operating on bits in the high +;; 32-bits of a register becomes easier. +;; +;; These are slow on Nocona, but fast on Athlon64. We do require the use +;; of btrq and btcq for corner cases of post-reload expansion of absdf and +;; negdf respectively, so they can never be disabled entirely. + +(define_insn "*btsq" + [(set (zero_extract:DI (match_operand:DI 0 "register_operand" "+r") + (const_int 1) + (match_operand:DI 1 "const_0_to_63_operand" "")) + (const_int 1)) + (clobber (reg:CC FLAGS_REG))] + "TARGET_64BIT && (TARGET_USE_BT || reload_completed)" + "bts{q}\t{%1, %0|%0, %1}" + [(set_attr "type" "alu1") + (set_attr "prefix_0f" "1") + (set_attr "mode" "DI")]) + +(define_insn "*btrq" + [(set (zero_extract:DI (match_operand:DI 0 "register_operand" "+r") + (const_int 1) + (match_operand:DI 1 "const_0_to_63_operand" "")) + (const_int 0)) + (clobber (reg:CC FLAGS_REG))] + "TARGET_64BIT && (TARGET_USE_BT || reload_completed)" + "btr{q}\t{%1, %0|%0, %1}" + [(set_attr "type" "alu1") + (set_attr "prefix_0f" "1") + (set_attr "mode" "DI")]) + +(define_insn "*btcq" + [(set (zero_extract:DI (match_operand:DI 0 "register_operand" "+r") + (const_int 1) + (match_operand:DI 1 "const_0_to_63_operand" "")) + (not:DI (zero_extract:DI (match_dup 0) (const_int 1) (match_dup 1)))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_64BIT && (TARGET_USE_BT || reload_completed)" + "btc{q}\t{%1, %0|%0, %1}" + [(set_attr "type" "alu1") + (set_attr "prefix_0f" "1") + (set_attr "mode" "DI")]) + +;; Allow Nocona to avoid these instructions if a register is available. 
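The three peephole2 patterns that follow rewrite a 64-bit single-bit set, reset and complement into ior/and/xor with a mask; in C terms they correspond to the usual bit-mask idioms (a sketch, assuming the bit index is in range):

    #include <stdint.h>

    /* bts: set bit i.  */
    static inline uint64_t bit_set (uint64_t x, unsigned i)
    {
      return x | (UINT64_C (1) << i);
    }

    /* btr: reset bit i.  */
    static inline uint64_t bit_reset (uint64_t x, unsigned i)
    {
      return x & ~(UINT64_C (1) << i);
    }

    /* btc: complement bit i.  */
    static inline uint64_t bit_complement (uint64_t x, unsigned i)
    {
      return x ^ (UINT64_C (1) << i);
    }

When the mask does not fit a sign-extended 32-bit immediate, the peepholes first move the 64-bit constant into the matched scratch register, which is why each of them starts with (match_scratch:DI 2 "r").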
+ +(define_peephole2 + [(match_scratch:DI 2 "r") + (parallel [(set (zero_extract:DI + (match_operand:DI 0 "register_operand" "") + (const_int 1) + (match_operand:DI 1 "const_0_to_63_operand" "")) + (const_int 1)) + (clobber (reg:CC FLAGS_REG))])] + "TARGET_64BIT && !TARGET_USE_BT" + [(const_int 0)] +{ + HOST_WIDE_INT i = INTVAL (operands[1]), hi, lo; + rtx op1; + + if (HOST_BITS_PER_WIDE_INT >= 64) + lo = (HOST_WIDE_INT)1 << i, hi = 0; + else if (i < HOST_BITS_PER_WIDE_INT) + lo = (HOST_WIDE_INT)1 << i, hi = 0; + else + lo = 0, hi = (HOST_WIDE_INT)1 << (i - HOST_BITS_PER_WIDE_INT); + + op1 = immed_double_const (lo, hi, DImode); + if (i >= 31) + { + emit_move_insn (operands[2], op1); + op1 = operands[2]; + } + + emit_insn (gen_iordi3 (operands[0], operands[0], op1)); + DONE; +}) + +(define_peephole2 + [(match_scratch:DI 2 "r") + (parallel [(set (zero_extract:DI + (match_operand:DI 0 "register_operand" "") + (const_int 1) + (match_operand:DI 1 "const_0_to_63_operand" "")) + (const_int 0)) + (clobber (reg:CC FLAGS_REG))])] + "TARGET_64BIT && !TARGET_USE_BT" + [(const_int 0)] +{ + HOST_WIDE_INT i = INTVAL (operands[1]), hi, lo; + rtx op1; + + if (HOST_BITS_PER_WIDE_INT >= 64) + lo = (HOST_WIDE_INT)1 << i, hi = 0; + else if (i < HOST_BITS_PER_WIDE_INT) + lo = (HOST_WIDE_INT)1 << i, hi = 0; + else + lo = 0, hi = (HOST_WIDE_INT)1 << (i - HOST_BITS_PER_WIDE_INT); + + op1 = immed_double_const (~lo, ~hi, DImode); + if (i >= 32) + { + emit_move_insn (operands[2], op1); + op1 = operands[2]; + } + + emit_insn (gen_anddi3 (operands[0], operands[0], op1)); + DONE; +}) + +(define_peephole2 + [(match_scratch:DI 2 "r") + (parallel [(set (zero_extract:DI + (match_operand:DI 0 "register_operand" "") + (const_int 1) + (match_operand:DI 1 "const_0_to_63_operand" "")) + (not:DI (zero_extract:DI + (match_dup 0) (const_int 1) (match_dup 1)))) + (clobber (reg:CC FLAGS_REG))])] + "TARGET_64BIT && !TARGET_USE_BT" + [(const_int 0)] +{ + HOST_WIDE_INT i = INTVAL (operands[1]), hi, lo; + rtx op1; + + if (HOST_BITS_PER_WIDE_INT >= 64) + lo = (HOST_WIDE_INT)1 << i, hi = 0; + else if (i < HOST_BITS_PER_WIDE_INT) + lo = (HOST_WIDE_INT)1 << i, hi = 0; + else + lo = 0, hi = (HOST_WIDE_INT)1 << (i - HOST_BITS_PER_WIDE_INT); + + op1 = immed_double_const (lo, hi, DImode); + if (i >= 31) + { + emit_move_insn (operands[2], op1); + op1 = operands[2]; + } + + emit_insn (gen_xordi3 (operands[0], operands[0], op1)); + DONE; +}) + +(define_insn "*bt" + [(set (reg:CCC FLAGS_REG) + (compare:CCC + (zero_extract:SWI48 + (match_operand:SWI48 0 "register_operand" "r") + (const_int 1) + (match_operand:SWI48 1 "nonmemory_operand" "rN")) + (const_int 0)))] + "TARGET_USE_BT || optimize_function_for_size_p (cfun)" + "bt{}\t{%1, %0|%0, %1}" + [(set_attr "type" "alu1") + (set_attr "prefix_0f" "1") + (set_attr "mode" "")]) + +;; Store-flag instructions. + +;; For all sCOND expanders, also expand the compare or test insn that +;; generates cc0. Generate an equality comparison if `seq' or `sne'. 
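A minimal C example of the kind of source that flows into the store-flag patterns below (a sketch; the register choices and the zero-extension form vary with tuning flags such as TARGET_ZERO_EXTEND_WITH_AND):

    int is_less (int a, int b)
    {
      /* Typically: cmp ; setl %al ; movzbl %al, %eax, or an
         xor-before-setcc sequence on tunings that prefer it.  */
      return a < b;
    }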
+ +(define_insn_and_split "*setcc_di_1" + [(set (match_operand:DI 0 "register_operand" "=q") + (match_operator:DI 1 "ix86_comparison_operator" + [(reg FLAGS_REG) (const_int 0)]))] + "TARGET_64BIT && !TARGET_PARTIAL_REG_STALL" + "#" + "&& reload_completed" + [(set (match_dup 2) (match_dup 1)) + (set (match_dup 0) (zero_extend:DI (match_dup 2)))] +{ + PUT_MODE (operands[1], QImode); + operands[2] = gen_lowpart (QImode, operands[0]); +}) + +(define_insn_and_split "*setcc_si_1_and" + [(set (match_operand:SI 0 "register_operand" "=q") + (match_operator:SI 1 "ix86_comparison_operator" + [(reg FLAGS_REG) (const_int 0)])) + (clobber (reg:CC FLAGS_REG))] + "!TARGET_PARTIAL_REG_STALL + && TARGET_ZERO_EXTEND_WITH_AND && optimize_function_for_speed_p (cfun)" + "#" + "&& reload_completed" + [(set (match_dup 2) (match_dup 1)) + (parallel [(set (match_dup 0) (zero_extend:SI (match_dup 2))) + (clobber (reg:CC FLAGS_REG))])] +{ + PUT_MODE (operands[1], QImode); + operands[2] = gen_lowpart (QImode, operands[0]); +}) + +(define_insn_and_split "*setcc_si_1_movzbl" + [(set (match_operand:SI 0 "register_operand" "=q") + (match_operator:SI 1 "ix86_comparison_operator" + [(reg FLAGS_REG) (const_int 0)]))] + "!TARGET_PARTIAL_REG_STALL + && (!TARGET_ZERO_EXTEND_WITH_AND || optimize_function_for_size_p (cfun))" + "#" + "&& reload_completed" + [(set (match_dup 2) (match_dup 1)) + (set (match_dup 0) (zero_extend:SI (match_dup 2)))] +{ + PUT_MODE (operands[1], QImode); + operands[2] = gen_lowpart (QImode, operands[0]); +}) + +(define_insn "*setcc_qi" + [(set (match_operand:QI 0 "nonimmediate_operand" "=qm") + (match_operator:QI 1 "ix86_comparison_operator" + [(reg FLAGS_REG) (const_int 0)]))] + "" + "set%C1\t%0" + [(set_attr "type" "setcc") + (set_attr "mode" "QI")]) + +(define_insn "*setcc_qi_slp" + [(set (strict_low_part (match_operand:QI 0 "nonimmediate_operand" "+qm")) + (match_operator:QI 1 "ix86_comparison_operator" + [(reg FLAGS_REG) (const_int 0)]))] + "" + "set%C1\t%0" + [(set_attr "type" "setcc") + (set_attr "mode" "QI")]) + +;; In general it is not safe to assume too much about CCmode registers, +;; so simplify-rtx stops when it sees a second one. Under certain +;; conditions this is safe on x86, so help combine not create +;; +;; seta %al +;; testb %al, %al +;; sete %al + +(define_split + [(set (match_operand:QI 0 "nonimmediate_operand" "") + (ne:QI (match_operator 1 "ix86_comparison_operator" + [(reg FLAGS_REG) (const_int 0)]) + (const_int 0)))] + "" + [(set (match_dup 0) (match_dup 1))] + "PUT_MODE (operands[1], QImode);") + +(define_split + [(set (strict_low_part (match_operand:QI 0 "nonimmediate_operand" "")) + (ne:QI (match_operator 1 "ix86_comparison_operator" + [(reg FLAGS_REG) (const_int 0)]) + (const_int 0)))] + "" + [(set (match_dup 0) (match_dup 1))] + "PUT_MODE (operands[1], QImode);") + +(define_split + [(set (match_operand:QI 0 "nonimmediate_operand" "") + (eq:QI (match_operator 1 "ix86_comparison_operator" + [(reg FLAGS_REG) (const_int 0)]) + (const_int 0)))] + "" + [(set (match_dup 0) (match_dup 1))] +{ + rtx new_op1 = copy_rtx (operands[1]); + operands[1] = new_op1; + PUT_MODE (new_op1, QImode); + PUT_CODE (new_op1, ix86_reverse_condition (GET_CODE (new_op1), + GET_MODE (XEXP (new_op1, 0)))); + + /* Make sure that (a) the CCmode we have for the flags is strong + enough for the reversed compare or (b) we have a valid FP compare. */ + if (! 
ix86_comparison_operator (new_op1, VOIDmode)) + FAIL; +}) + +(define_split + [(set (strict_low_part (match_operand:QI 0 "nonimmediate_operand" "")) + (eq:QI (match_operator 1 "ix86_comparison_operator" + [(reg FLAGS_REG) (const_int 0)]) + (const_int 0)))] + "" + [(set (match_dup 0) (match_dup 1))] +{ + rtx new_op1 = copy_rtx (operands[1]); + operands[1] = new_op1; + PUT_MODE (new_op1, QImode); + PUT_CODE (new_op1, ix86_reverse_condition (GET_CODE (new_op1), + GET_MODE (XEXP (new_op1, 0)))); + + /* Make sure that (a) the CCmode we have for the flags is strong + enough for the reversed compare or (b) we have a valid FP compare. */ + if (! ix86_comparison_operator (new_op1, VOIDmode)) + FAIL; +}) + +;; The SSE store flag instructions saves 0 or 0xffffffff to the result. +;; subsequent logical operations are used to imitate conditional moves. +;; 0xffffffff is NaN, but not in normalized form, so we can't represent +;; it directly. + +(define_insn "*avx_setcc" + [(set (match_operand:MODEF 0 "register_operand" "=x") + (match_operator:MODEF 1 "avx_comparison_float_operator" + [(match_operand:MODEF 2 "register_operand" "x") + (match_operand:MODEF 3 "nonimmediate_operand" "xm")]))] + "TARGET_AVX" + "vcmp%D1s\t{%3, %2, %0|%0, %2, %3}" + [(set_attr "type" "ssecmp") + (set_attr "prefix" "vex") + (set_attr "length_immediate" "1") + (set_attr "mode" "")]) + +(define_insn "*sse_setcc" + [(set (match_operand:MODEF 0 "register_operand" "=x") + (match_operator:MODEF 1 "sse_comparison_operator" + [(match_operand:MODEF 2 "register_operand" "0") + (match_operand:MODEF 3 "nonimmediate_operand" "xm")]))] + "SSE_FLOAT_MODE_P (mode)" + "cmp%D1s\t{%3, %0|%0, %3}" + [(set_attr "type" "ssecmp") + (set_attr "length_immediate" "1") + (set_attr "mode" "")]) + +;; Basic conditional jump instructions. +;; We ignore the overflow flag for signed branch instructions. + +(define_insn "*jcc_1" + [(set (pc) + (if_then_else (match_operator 1 "ix86_comparison_operator" + [(reg FLAGS_REG) (const_int 0)]) + (label_ref (match_operand 0 "" "")) + (pc)))] + "" + "%+j%C1\t%l0" + [(set_attr "type" "ibr") + (set_attr "modrm" "0") + (set (attr "length") + (if_then_else (and (ge (minus (match_dup 0) (pc)) + (const_int -126)) + (lt (minus (match_dup 0) (pc)) + (const_int 128))) + (const_int 2) + (const_int 6)))]) + +(define_insn "*jcc_2" + [(set (pc) + (if_then_else (match_operator 1 "ix86_comparison_operator" + [(reg FLAGS_REG) (const_int 0)]) + (pc) + (label_ref (match_operand 0 "" ""))))] + "" + "%+j%c1\t%l0" + [(set_attr "type" "ibr") + (set_attr "modrm" "0") + (set (attr "length") + (if_then_else (and (ge (minus (match_dup 0) (pc)) + (const_int -126)) + (lt (minus (match_dup 0) (pc)) + (const_int 128))) + (const_int 2) + (const_int 6)))]) + +;; In general it is not safe to assume too much about CCmode registers, +;; so simplify-rtx stops when it sees a second one. 
Under certain +;; conditions this is safe on x86, so help combine not create +;; +;; seta %al +;; testb %al, %al +;; je Lfoo + +(define_split + [(set (pc) + (if_then_else (ne (match_operator 0 "ix86_comparison_operator" + [(reg FLAGS_REG) (const_int 0)]) + (const_int 0)) + (label_ref (match_operand 1 "" "")) + (pc)))] + "" + [(set (pc) + (if_then_else (match_dup 0) + (label_ref (match_dup 1)) + (pc)))] + "PUT_MODE (operands[0], VOIDmode);") + +(define_split + [(set (pc) + (if_then_else (eq (match_operator 0 "ix86_comparison_operator" + [(reg FLAGS_REG) (const_int 0)]) + (const_int 0)) + (label_ref (match_operand 1 "" "")) + (pc)))] + "" + [(set (pc) + (if_then_else (match_dup 0) + (label_ref (match_dup 1)) + (pc)))] +{ + rtx new_op0 = copy_rtx (operands[0]); + operands[0] = new_op0; + PUT_MODE (new_op0, VOIDmode); + PUT_CODE (new_op0, ix86_reverse_condition (GET_CODE (new_op0), + GET_MODE (XEXP (new_op0, 0)))); + + /* Make sure that (a) the CCmode we have for the flags is strong + enough for the reversed compare or (b) we have a valid FP compare. */ + if (! ix86_comparison_operator (new_op0, VOIDmode)) + FAIL; +}) + +;; zero_extend in SImode is correct also for DImode, since this is what combine +;; pass generates from shift insn with QImode operand. Actually, the mode +;; of operand 2 (bit offset operand) doesn't matter since bt insn takes +;; appropriate modulo of the bit offset value. + +(define_insn_and_split "*jcc_bt" + [(set (pc) + (if_then_else (match_operator 0 "bt_comparison_operator" + [(zero_extract:SWI48 + (match_operand:SWI48 1 "register_operand" "r") + (const_int 1) + (zero_extend:SI + (match_operand:QI 2 "register_operand" "r"))) + (const_int 0)]) + (label_ref (match_operand 3 "" "")) + (pc))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_USE_BT || optimize_function_for_size_p (cfun)" + "#" + "&& 1" + [(set (reg:CCC FLAGS_REG) + (compare:CCC + (zero_extract:SWI48 + (match_dup 1) + (const_int 1) + (match_dup 2)) + (const_int 0))) + (set (pc) + (if_then_else (match_op_dup 0 [(reg:CCC FLAGS_REG) (const_int 0)]) + (label_ref (match_dup 3)) + (pc)))] +{ + operands[2] = simplify_gen_subreg (mode, operands[2], QImode, 0); + + PUT_CODE (operands[0], reverse_condition (GET_CODE (operands[0]))); +}) + +;; Avoid useless masking of bit offset operand. "and" in SImode is correct +;; also for DImode, this is what combine produces. 
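A hypothetical C fragment that produces the masked bit test matched by the *_mask patterns below (a sketch); the explicit "& 31" is redundant for the bt instruction, which already reduces the offset modulo the operand width, so the split simply drops it:

    extern void hit (void);

    void test_bit (unsigned x, unsigned n)
    {
      /* Ideally: btl %esi, %edi ; jc -- no separate "andl $31" needed.  */
      if ((x >> (n & 31)) & 1)
        hit ();
    }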
+(define_insn_and_split "*jcc_bt_mask" + [(set (pc) + (if_then_else (match_operator 0 "bt_comparison_operator" + [(zero_extract:SWI48 + (match_operand:SWI48 1 "register_operand" "r") + (const_int 1) + (and:SI + (match_operand:SI 2 "register_operand" "r") + (match_operand:SI 3 "const_int_operand" "n")))]) + (label_ref (match_operand 4 "" "")) + (pc))) + (clobber (reg:CC FLAGS_REG))] + "(TARGET_USE_BT || optimize_function_for_size_p (cfun)) + && (INTVAL (operands[3]) & (GET_MODE_BITSIZE (mode)-1)) + == GET_MODE_BITSIZE (mode)-1" + "#" + "&& 1" + [(set (reg:CCC FLAGS_REG) + (compare:CCC + (zero_extract:SWI48 + (match_dup 1) + (const_int 1) + (match_dup 2)) + (const_int 0))) + (set (pc) + (if_then_else (match_op_dup 0 [(reg:CCC FLAGS_REG) (const_int 0)]) + (label_ref (match_dup 4)) + (pc)))] +{ + operands[2] = simplify_gen_subreg (mode, operands[2], SImode, 0); + + PUT_CODE (operands[0], reverse_condition (GET_CODE (operands[0]))); +}) + +(define_insn_and_split "*jcc_btsi_1" + [(set (pc) + (if_then_else (match_operator 0 "bt_comparison_operator" + [(and:SI + (lshiftrt:SI + (match_operand:SI 1 "register_operand" "r") + (match_operand:QI 2 "register_operand" "r")) + (const_int 1)) + (const_int 0)]) + (label_ref (match_operand 3 "" "")) + (pc))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_USE_BT || optimize_function_for_size_p (cfun)" + "#" + "&& 1" + [(set (reg:CCC FLAGS_REG) + (compare:CCC + (zero_extract:SI + (match_dup 1) + (const_int 1) + (match_dup 2)) + (const_int 0))) + (set (pc) + (if_then_else (match_op_dup 0 [(reg:CCC FLAGS_REG) (const_int 0)]) + (label_ref (match_dup 3)) + (pc)))] +{ + operands[2] = simplify_gen_subreg (SImode, operands[2], QImode, 0); + + PUT_CODE (operands[0], reverse_condition (GET_CODE (operands[0]))); +}) + +;; avoid useless masking of bit offset operand +(define_insn_and_split "*jcc_btsi_mask_1" + [(set (pc) + (if_then_else + (match_operator 0 "bt_comparison_operator" + [(and:SI + (lshiftrt:SI + (match_operand:SI 1 "register_operand" "r") + (subreg:QI + (and:SI + (match_operand:SI 2 "register_operand" "r") + (match_operand:SI 3 "const_int_operand" "n")) 0)) + (const_int 1)) + (const_int 0)]) + (label_ref (match_operand 4 "" "")) + (pc))) + (clobber (reg:CC FLAGS_REG))] + "(TARGET_USE_BT || optimize_function_for_size_p (cfun)) + && (INTVAL (operands[3]) & 0x1f) == 0x1f" + "#" + "&& 1" + [(set (reg:CCC FLAGS_REG) + (compare:CCC + (zero_extract:SI + (match_dup 1) + (const_int 1) + (match_dup 2)) + (const_int 0))) + (set (pc) + (if_then_else (match_op_dup 0 [(reg:CCC FLAGS_REG) (const_int 0)]) + (label_ref (match_dup 4)) + (pc)))] + "PUT_CODE (operands[0], reverse_condition (GET_CODE (operands[0])));") + +;; Define combination compare-and-branch fp compare instructions to help +;; combine. 
+ +(define_insn "*fp_jcc_1_387" + [(set (pc) + (if_then_else (match_operator 0 "ix86_fp_comparison_operator" + [(match_operand 1 "register_operand" "f") + (match_operand 2 "nonimmediate_operand" "fm")]) + (label_ref (match_operand 3 "" "")) + (pc))) + (clobber (reg:CCFP FPSR_REG)) + (clobber (reg:CCFP FLAGS_REG)) + (clobber (match_scratch:HI 4 "=a"))] + "TARGET_80387 + && (GET_MODE (operands[1]) == SFmode || GET_MODE (operands[1]) == DFmode) + && GET_MODE (operands[1]) == GET_MODE (operands[2]) + && SELECT_CC_MODE (GET_CODE (operands[0]), + operands[1], operands[2]) == CCFPmode + && !TARGET_CMOVE" + "#") + +(define_insn "*fp_jcc_1r_387" + [(set (pc) + (if_then_else (match_operator 0 "ix86_fp_comparison_operator" + [(match_operand 1 "register_operand" "f") + (match_operand 2 "nonimmediate_operand" "fm")]) + (pc) + (label_ref (match_operand 3 "" "")))) + (clobber (reg:CCFP FPSR_REG)) + (clobber (reg:CCFP FLAGS_REG)) + (clobber (match_scratch:HI 4 "=a"))] + "TARGET_80387 + && (GET_MODE (operands[1]) == SFmode || GET_MODE (operands[1]) == DFmode) + && GET_MODE (operands[1]) == GET_MODE (operands[2]) + && SELECT_CC_MODE (GET_CODE (operands[0]), + operands[1], operands[2]) == CCFPmode + && !TARGET_CMOVE" + "#") + +(define_insn "*fp_jcc_2_387" + [(set (pc) + (if_then_else (match_operator 0 "ix86_fp_comparison_operator" + [(match_operand 1 "register_operand" "f") + (match_operand 2 "register_operand" "f")]) + (label_ref (match_operand 3 "" "")) + (pc))) + (clobber (reg:CCFP FPSR_REG)) + (clobber (reg:CCFP FLAGS_REG)) + (clobber (match_scratch:HI 4 "=a"))] + "X87_FLOAT_MODE_P (GET_MODE (operands[1])) + && GET_MODE (operands[1]) == GET_MODE (operands[2]) + && !TARGET_CMOVE" + "#") + +(define_insn "*fp_jcc_2r_387" + [(set (pc) + (if_then_else (match_operator 0 "ix86_fp_comparison_operator" + [(match_operand 1 "register_operand" "f") + (match_operand 2 "register_operand" "f")]) + (pc) + (label_ref (match_operand 3 "" "")))) + (clobber (reg:CCFP FPSR_REG)) + (clobber (reg:CCFP FLAGS_REG)) + (clobber (match_scratch:HI 4 "=a"))] + "X87_FLOAT_MODE_P (GET_MODE (operands[1])) + && GET_MODE (operands[1]) == GET_MODE (operands[2]) + && !TARGET_CMOVE" + "#") + +(define_insn "*fp_jcc_3_387" + [(set (pc) + (if_then_else (match_operator 0 "ix86_fp_comparison_operator" + [(match_operand 1 "register_operand" "f") + (match_operand 2 "const0_operand" "")]) + (label_ref (match_operand 3 "" "")) + (pc))) + (clobber (reg:CCFP FPSR_REG)) + (clobber (reg:CCFP FLAGS_REG)) + (clobber (match_scratch:HI 4 "=a"))] + "X87_FLOAT_MODE_P (GET_MODE (operands[1])) + && GET_MODE (operands[1]) == GET_MODE (operands[2]) + && SELECT_CC_MODE (GET_CODE (operands[0]), + operands[1], operands[2]) == CCFPmode + && !TARGET_CMOVE" + "#") + +(define_split + [(set (pc) + (if_then_else (match_operator 0 "ix86_fp_comparison_operator" + [(match_operand 1 "register_operand" "") + (match_operand 2 "nonimmediate_operand" "")]) + (match_operand 3 "" "") + (match_operand 4 "" ""))) + (clobber (reg:CCFP FPSR_REG)) + (clobber (reg:CCFP FLAGS_REG))] + "reload_completed" + [(const_int 0)] +{ + ix86_split_fp_branch (GET_CODE (operands[0]), operands[1], operands[2], + operands[3], operands[4], NULL_RTX, NULL_RTX); + DONE; +}) + +(define_split + [(set (pc) + (if_then_else (match_operator 0 "ix86_fp_comparison_operator" + [(match_operand 1 "register_operand" "") + (match_operand 2 "general_operand" "")]) + (match_operand 3 "" "") + (match_operand 4 "" ""))) + (clobber (reg:CCFP FPSR_REG)) + (clobber (reg:CCFP FLAGS_REG)) + (clobber (match_scratch:HI 5 "=a"))] + 
"reload_completed" + [(const_int 0)] +{ + ix86_split_fp_branch (GET_CODE (operands[0]), operands[1], operands[2], + operands[3], operands[4], operands[5], NULL_RTX); + DONE; +}) + +;; The order of operands in *fp_jcc_4_387 is forced by combine in +;; simplify_comparison () function. Float operator is treated as RTX_OBJ +;; with a precedence over other operators and is always put in the first +;; place. Swap condition and operands to match ficom instruction. + +(define_insn "*fp_jcc_4__387" + [(set (pc) + (if_then_else + (match_operator 0 "ix86_swapped_fp_comparison_operator" + [(match_operator 1 "float_operator" + [(match_operand:X87MODEI12 2 "nonimmediate_operand" "m,?r")]) + (match_operand 3 "register_operand" "f,f")]) + (label_ref (match_operand 4 "" "")) + (pc))) + (clobber (reg:CCFP FPSR_REG)) + (clobber (reg:CCFP FLAGS_REG)) + (clobber (match_scratch:HI 5 "=a,a"))] + "X87_FLOAT_MODE_P (GET_MODE (operands[3])) + && (TARGET_USE_MODE_FIOP || optimize_function_for_size_p (cfun)) + && GET_MODE (operands[1]) == GET_MODE (operands[3]) + && ix86_fp_compare_mode (swap_condition (GET_CODE (operands[0]))) == CCFPmode + && !TARGET_CMOVE" + "#") + +(define_split + [(set (pc) + (if_then_else + (match_operator 0 "ix86_swapped_fp_comparison_operator" + [(match_operator 1 "float_operator" + [(match_operand:X87MODEI12 2 "memory_operand" "")]) + (match_operand 3 "register_operand" "")]) + (match_operand 4 "" "") + (match_operand 5 "" ""))) + (clobber (reg:CCFP FPSR_REG)) + (clobber (reg:CCFP FLAGS_REG)) + (clobber (match_scratch:HI 6 "=a"))] + "reload_completed" + [(const_int 0)] +{ + operands[7] = gen_rtx_FLOAT (GET_MODE (operands[1]), operands[2]); + + ix86_split_fp_branch (swap_condition (GET_CODE (operands[0])), + operands[3], operands[7], + operands[4], operands[5], operands[6], NULL_RTX); + DONE; +}) + +;; %%% Kill this when reload knows how to do it. +(define_split + [(set (pc) + (if_then_else + (match_operator 0 "ix86_swapped_fp_comparison_operator" + [(match_operator 1 "float_operator" + [(match_operand:X87MODEI12 2 "register_operand" "")]) + (match_operand 3 "register_operand" "")]) + (match_operand 4 "" "") + (match_operand 5 "" ""))) + (clobber (reg:CCFP FPSR_REG)) + (clobber (reg:CCFP FLAGS_REG)) + (clobber (match_scratch:HI 6 "=a"))] + "reload_completed" + [(const_int 0)] +{ + operands[7] = ix86_force_to_memory (GET_MODE (operands[2]), operands[2]); + operands[7] = gen_rtx_FLOAT (GET_MODE (operands[1]), operands[7]); + + ix86_split_fp_branch (swap_condition (GET_CODE (operands[0])), + operands[3], operands[7], + operands[4], operands[5], operands[6], operands[2]); + DONE; +}) + +;; Unconditional and other jump instructions + +(define_insn "jump" + [(set (pc) + (label_ref (match_operand 0 "" "")))] + "" + "jmp\t%l0" + [(set_attr "type" "ibr") + (set (attr "length") + (if_then_else (and (ge (minus (match_dup 0) (pc)) + (const_int -126)) + (lt (minus (match_dup 0) (pc)) + (const_int 128))) + (const_int 2) + (const_int 5))) + (set_attr "modrm" "0")]) + +(define_expand "indirect_jump" + [(set (pc) (match_operand 0 "nonimmediate_operand" ""))] + "" + "") + +(define_insn "*indirect_jump" + [(set (pc) (match_operand:P 0 "nonimmediate_operand" "rm"))] + "" + "jmp\t%A0" + [(set_attr "type" "ibr") + (set_attr "length_immediate" "0")]) + +(define_expand "tablejump" + [(parallel [(set (pc) (match_operand 0 "nonimmediate_operand" "")) + (use (label_ref (match_operand 1 "" "")))])] + "" +{ + /* In PIC mode, the table entries are stored GOT (32-bit) or PC (64-bit) + relative. 
Convert the relative address to an absolute address. */ + if (flag_pic) + { + rtx op0, op1; + enum rtx_code code; + + /* We can't use @GOTOFF for text labels on VxWorks; + see gotoff_operand. */ + if (TARGET_64BIT || TARGET_VXWORKS_RTP) + { + code = PLUS; + op0 = operands[0]; + op1 = gen_rtx_LABEL_REF (Pmode, operands[1]); + } + else if (TARGET_MACHO || HAVE_AS_GOTOFF_IN_DATA) + { + code = PLUS; + op0 = operands[0]; + op1 = pic_offset_table_rtx; + } + else + { + code = MINUS; + op0 = pic_offset_table_rtx; + op1 = operands[0]; + } + + operands[0] = expand_simple_binop (Pmode, code, op0, op1, NULL_RTX, 0, + OPTAB_DIRECT); + } +}) + +(define_insn "*tablejump_1" + [(set (pc) (match_operand:P 0 "nonimmediate_operand" "rm")) + (use (label_ref (match_operand 1 "" "")))] + "" + "jmp\t%A0" + [(set_attr "type" "ibr") + (set_attr "length_immediate" "0")]) + +;; Convert setcc + movzbl to xor + setcc if operands don't overlap. + +(define_peephole2 + [(set (reg FLAGS_REG) (match_operand 0 "" "")) + (set (match_operand:QI 1 "register_operand" "") + (match_operator:QI 2 "ix86_comparison_operator" + [(reg FLAGS_REG) (const_int 0)])) + (set (match_operand 3 "q_regs_operand" "") + (zero_extend (match_dup 1)))] + "(peep2_reg_dead_p (3, operands[1]) + || operands_match_p (operands[1], operands[3])) + && ! reg_overlap_mentioned_p (operands[3], operands[0])" + [(set (match_dup 4) (match_dup 0)) + (set (strict_low_part (match_dup 5)) + (match_dup 2))] +{ + operands[4] = gen_rtx_REG (GET_MODE (operands[0]), FLAGS_REG); + operands[5] = gen_lowpart (QImode, operands[3]); + ix86_expand_clear (operands[3]); +}) + +;; Similar, but match zero_extendhisi2_and, which adds a clobber. + +(define_peephole2 + [(set (reg FLAGS_REG) (match_operand 0 "" "")) + (set (match_operand:QI 1 "register_operand" "") + (match_operator:QI 2 "ix86_comparison_operator" + [(reg FLAGS_REG) (const_int 0)])) + (parallel [(set (match_operand 3 "q_regs_operand" "") + (zero_extend (match_dup 1))) + (clobber (reg:CC FLAGS_REG))])] + "(peep2_reg_dead_p (3, operands[1]) + || operands_match_p (operands[1], operands[3])) + && ! reg_overlap_mentioned_p (operands[3], operands[0])" + [(set (match_dup 4) (match_dup 0)) + (set (strict_low_part (match_dup 5)) + (match_dup 2))] +{ + operands[4] = gen_rtx_REG (GET_MODE (operands[0]), FLAGS_REG); + operands[5] = gen_lowpart (QImode, operands[3]); + ix86_expand_clear (operands[3]); +}) + +;; Call instructions. + +;; The predicates normally associated with named expanders are not properly +;; checked for calls. This is a bug in the generic code, but it isn't that +;; easy to fix. Ignore it for now and be prepared to fix things up. + +;; P6 processors will jump to the address after the decrement when %esp +;; is used as a call operand, so they will execute return address as a code. +;; See Pentium Pro errata 70, Pentium 2 errata A33 and Pentium 3 errata E17. + +;; Call subroutine returning no value. 
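The call_pop and call_value_pop expanders that follow describe 32-bit calls to functions that pop their own stack arguments (stdcall, or everything under -mrtd); the stack-pointer adjustment done by the callee's "ret $N" is therefore expressed as part of the call parallel. A small hypothetical example (a sketch):

    extern int __attribute__ ((stdcall)) callee (int a, int b);

    int caller (void)
    {
      /* callee returns with "ret $8", so the call insn carries a
         (set (reg:SI SP_REG) (plus:SI (reg:SI SP_REG) (const_int 8)))
         alongside the call itself.  */
      return callee (1, 2);
    }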
+ +(define_expand "call_pop" + [(parallel [(call (match_operand:QI 0 "" "") + (match_operand:SI 1 "" "")) + (set (reg:SI SP_REG) + (plus:SI (reg:SI SP_REG) + (match_operand:SI 3 "" "")))])] + "!TARGET_64BIT" +{ + ix86_expand_call (NULL, operands[0], operands[1], + operands[2], operands[3], 0); + DONE; +}) + +(define_insn_and_split "*call_pop_0_vzeroupper" + [(parallel + [(call (mem:QI (match_operand:SI 0 "constant_call_address_operand" "")) + (match_operand:SI 1 "" "")) + (set (reg:SI SP_REG) + (plus:SI (reg:SI SP_REG) + (match_operand:SI 2 "immediate_operand" "")))]) + (unspec [(match_operand 3 "const_int_operand" "")] + UNSPEC_CALL_NEEDS_VZEROUPPER)] + "TARGET_VZEROUPPER && !TARGET_64BIT" + "#" + "&& reload_completed" + [(const_int 0)] + "ix86_split_call_vzeroupper (curr_insn, operands[3]); DONE;" + [(set_attr "type" "call")]) + +(define_insn "*call_pop_0" + [(call (mem:QI (match_operand:SI 0 "constant_call_address_operand" "")) + (match_operand:SI 1 "" "")) + (set (reg:SI SP_REG) + (plus:SI (reg:SI SP_REG) + (match_operand:SI 2 "immediate_operand" "")))] + "!TARGET_64BIT" +{ + if (SIBLING_CALL_P (insn)) + return "jmp\t%P0"; + else + return "call\t%P0"; +} + [(set_attr "type" "call")]) + +(define_insn_and_split "*call_pop_1_vzeroupper" + [(parallel + [(call (mem:QI (match_operand:SI 0 "call_insn_operand" "lzm")) + (match_operand:SI 1 "" "")) + (set (reg:SI SP_REG) + (plus:SI (reg:SI SP_REG) + (match_operand:SI 2 "immediate_operand" "i")))]) + (unspec [(match_operand 3 "const_int_operand" "")] + UNSPEC_CALL_NEEDS_VZEROUPPER)] + "TARGET_VZEROUPPER && !TARGET_64BIT && !SIBLING_CALL_P (insn)" + "#" + "&& reload_completed" + [(const_int 0)] + "ix86_split_call_vzeroupper (curr_insn, operands[3]); DONE;" + [(set_attr "type" "call")]) + +(define_insn "*call_pop_1" + [(call (mem:QI (match_operand:SI 0 "call_insn_operand" "lzm")) + (match_operand:SI 1 "" "")) + (set (reg:SI SP_REG) + (plus:SI (reg:SI SP_REG) + (match_operand:SI 2 "immediate_operand" "i")))] + "!TARGET_64BIT && !SIBLING_CALL_P (insn)" +{ + if (constant_call_address_operand (operands[0], Pmode)) + return "call\t%P0"; + return "call\t%A0"; +} + [(set_attr "type" "call")]) + +(define_insn_and_split "*sibcall_pop_1_vzeroupper" + [(parallel + [(call (mem:QI (match_operand:SI 0 "sibcall_insn_operand" "z,U")) + (match_operand:SI 1 "" "")) + (set (reg:SI SP_REG) + (plus:SI (reg:SI SP_REG) + (match_operand:SI 2 "immediate_operand" "i,i")))]) + (unspec [(match_operand 3 "const_int_operand" "")] + UNSPEC_CALL_NEEDS_VZEROUPPER)] + "TARGET_VZEROUPPER && !TARGET_64BIT && SIBLING_CALL_P (insn)" + "#" + "&& reload_completed" + [(const_int 0)] + "ix86_split_call_vzeroupper (curr_insn, operands[3]); DONE;" + [(set_attr "type" "call")]) + +(define_insn "*sibcall_pop_1" + [(call (mem:QI (match_operand:SI 0 "sibcall_insn_operand" "z,U")) + (match_operand:SI 1 "" "")) + (set (reg:SI SP_REG) + (plus:SI (reg:SI SP_REG) + (match_operand:SI 2 "immediate_operand" "i,i")))] + "!TARGET_64BIT && SIBLING_CALL_P (insn)" + "@ + jmp\t%P0 + jmp\t%A0" + [(set_attr "type" "call")]) + +(define_expand "call" + [(call (match_operand:QI 0 "" "") + (match_operand 1 "" "")) + (use (match_operand 2 "" ""))] + "" +{ + ix86_expand_call (NULL, operands[0], operands[1], operands[2], NULL, 0); + DONE; +}) + +(define_expand "sibcall" + [(call (match_operand:QI 0 "" "") + (match_operand 1 "" "")) + (use (match_operand 2 "" ""))] + "" +{ + ix86_expand_call (NULL, operands[0], operands[1], operands[2], NULL, 1); + DONE; +}) + +(define_insn_and_split "*call_0_vzeroupper" + [(call 
(mem:QI (match_operand 0 "constant_call_address_operand" "")) + (match_operand 1 "" "")) + (unspec [(match_operand 2 "const_int_operand" "")] + UNSPEC_CALL_NEEDS_VZEROUPPER)] + "TARGET_VZEROUPPER" + "#" + "&& reload_completed" + [(const_int 0)] + "ix86_split_call_vzeroupper (curr_insn, operands[2]); DONE;" + [(set_attr "type" "call")]) + +(define_insn "*call_0" + [(call (mem:QI (match_operand 0 "constant_call_address_operand" "")) + (match_operand 1 "" ""))] + "" + { return ix86_output_call_insn (insn, operands[0], 0); } + [(set_attr "type" "call")]) + +(define_insn_and_split "*call_1_vzeroupper" + [(call (mem:QI (match_operand:SI 0 "call_insn_operand" "lzm")) + (match_operand 1 "" "")) + (unspec [(match_operand 2 "const_int_operand" "")] + UNSPEC_CALL_NEEDS_VZEROUPPER)] + "TARGET_VZEROUPPER && !TARGET_64BIT && !SIBLING_CALL_P (insn)" + "#" + "&& reload_completed" + [(const_int 0)] + "ix86_split_call_vzeroupper (curr_insn, operands[2]); DONE;" + [(set_attr "type" "call")]) + +(define_insn "*call_1" + [(call (mem:QI (match_operand:SI 0 "call_insn_operand" "lzm")) + (match_operand 1 "" ""))] + "!TARGET_64BIT && !SIBLING_CALL_P (insn)" + { return ix86_output_call_insn (insn, operands[0], 0); } + [(set_attr "type" "call")]) + +(define_insn_and_split "*sibcall_1_vzeroupper" + [(call (mem:QI (match_operand:SI 0 "sibcall_insn_operand" "z,U")) + (match_operand 1 "" "")) + (unspec [(match_operand 2 "const_int_operand" "")] + UNSPEC_CALL_NEEDS_VZEROUPPER)] + "TARGET_VZEROUPPER && !TARGET_64BIT && SIBLING_CALL_P (insn)" + "#" + "&& reload_completed" + [(const_int 0)] + "ix86_split_call_vzeroupper (curr_insn, operands[2]); DONE;" + [(set_attr "type" "call")]) + +(define_insn "*sibcall_1" + [(call (mem:QI (match_operand:SI 0 "sibcall_insn_operand" "z,U")) + (match_operand 1 "" ""))] + "!TARGET_64BIT && SIBLING_CALL_P (insn)" + { return ix86_output_call_insn (insn, operands[0], 0); } + [(set_attr "type" "call")]) + +(define_insn_and_split "*call_1_rex64_vzeroupper" + [(call (mem:QI (match_operand:DI 0 "call_insn_operand" "rzm")) + (match_operand 1 "" "")) + (unspec [(match_operand 2 "const_int_operand" "")] + UNSPEC_CALL_NEEDS_VZEROUPPER)] + "TARGET_VZEROUPPER && TARGET_64BIT && !SIBLING_CALL_P (insn) + && ix86_cmodel != CM_LARGE && ix86_cmodel != CM_LARGE_PIC" + "#" + "&& reload_completed" + [(const_int 0)] + "ix86_split_call_vzeroupper (curr_insn, operands[2]); DONE;" + [(set_attr "type" "call")]) + +(define_insn "*call_1_rex64" + [(call (mem:QI (match_operand:DI 0 "call_insn_operand" "rzm")) + (match_operand 1 "" ""))] + "TARGET_64BIT && !SIBLING_CALL_P (insn) + && ix86_cmodel != CM_LARGE && ix86_cmodel != CM_LARGE_PIC" + { return ix86_output_call_insn (insn, operands[0], 0); } + [(set_attr "type" "call")]) + +(define_insn_and_split "*call_1_rex64_ms_sysv_vzeroupper" + [(parallel + [(call (mem:QI (match_operand:DI 0 "call_insn_operand" "rzm")) + (match_operand 1 "" "")) + (unspec [(const_int 0)] UNSPEC_MS_TO_SYSV_CALL) + (clobber (reg:TI XMM6_REG)) + (clobber (reg:TI XMM7_REG)) + (clobber (reg:TI XMM8_REG)) + (clobber (reg:TI XMM9_REG)) + (clobber (reg:TI XMM10_REG)) + (clobber (reg:TI XMM11_REG)) + (clobber (reg:TI XMM12_REG)) + (clobber (reg:TI XMM13_REG)) + (clobber (reg:TI XMM14_REG)) + (clobber (reg:TI XMM15_REG)) + (clobber (reg:DI SI_REG)) + (clobber (reg:DI DI_REG))]) + (unspec [(match_operand 2 "const_int_operand" "")] + UNSPEC_CALL_NEEDS_VZEROUPPER)] + "TARGET_VZEROUPPER && TARGET_64BIT && !SIBLING_CALL_P (insn)" + "#" + "&& reload_completed" + [(const_int 0)] + "ix86_split_call_vzeroupper 
(curr_insn, operands[2]); DONE;" + [(set_attr "type" "call")]) + +(define_insn "*call_1_rex64_ms_sysv" + [(call (mem:QI (match_operand:DI 0 "call_insn_operand" "rzm")) + (match_operand 1 "" "")) + (unspec [(const_int 0)] UNSPEC_MS_TO_SYSV_CALL) + (clobber (reg:TI XMM6_REG)) + (clobber (reg:TI XMM7_REG)) + (clobber (reg:TI XMM8_REG)) + (clobber (reg:TI XMM9_REG)) + (clobber (reg:TI XMM10_REG)) + (clobber (reg:TI XMM11_REG)) + (clobber (reg:TI XMM12_REG)) + (clobber (reg:TI XMM13_REG)) + (clobber (reg:TI XMM14_REG)) + (clobber (reg:TI XMM15_REG)) + (clobber (reg:DI SI_REG)) + (clobber (reg:DI DI_REG))] + "TARGET_64BIT && !SIBLING_CALL_P (insn)" + { return ix86_output_call_insn (insn, operands[0], 0); } + [(set_attr "type" "call")]) + +(define_insn_and_split "*call_1_rex64_large_vzeroupper" + [(call (mem:QI (match_operand:DI 0 "call_insn_operand" "rm")) + (match_operand 1 "" "")) + (unspec [(match_operand 2 "const_int_operand" "")] + UNSPEC_CALL_NEEDS_VZEROUPPER)] + "TARGET_VZEROUPPER && TARGET_64BIT && !SIBLING_CALL_P (insn)" + "#" + "&& reload_completed" + [(const_int 0)] + "ix86_split_call_vzeroupper (curr_insn, operands[2]); DONE;" + [(set_attr "type" "call")]) + +(define_insn "*call_1_rex64_large" + [(call (mem:QI (match_operand:DI 0 "call_insn_operand" "rm")) + (match_operand 1 "" ""))] + "TARGET_64BIT && !SIBLING_CALL_P (insn)" + { return ix86_output_call_insn (insn, operands[0], 0); } + [(set_attr "type" "call")]) + +(define_insn_and_split "*sibcall_1_rex64_vzeroupper" + [(call (mem:QI (match_operand:DI 0 "sibcall_insn_operand" "z,U")) + (match_operand 1 "" "")) + (unspec [(match_operand 2 "const_int_operand" "")] + UNSPEC_CALL_NEEDS_VZEROUPPER)] + "TARGET_VZEROUPPER && TARGET_64BIT && SIBLING_CALL_P (insn)" + "#" + "&& reload_completed" + [(const_int 0)] + "ix86_split_call_vzeroupper (curr_insn, operands[2]); DONE;" + [(set_attr "type" "call")]) + +(define_insn "*sibcall_1_rex64" + [(call (mem:QI (match_operand:DI 0 "sibcall_insn_operand" "z,U")) + (match_operand 1 "" ""))] + "TARGET_64BIT && SIBLING_CALL_P (insn)" + { return ix86_output_call_insn (insn, operands[0], 0); } + [(set_attr "type" "call")]) + +;; Call subroutine, returning value in operand 0 +(define_expand "call_value_pop" + [(parallel [(set (match_operand 0 "" "") + (call (match_operand:QI 1 "" "") + (match_operand:SI 2 "" ""))) + (set (reg:SI SP_REG) + (plus:SI (reg:SI SP_REG) + (match_operand:SI 4 "" "")))])] + "!TARGET_64BIT" +{ + ix86_expand_call (operands[0], operands[1], operands[2], + operands[3], operands[4], 0); + DONE; +}) + +(define_expand "call_value" + [(set (match_operand 0 "" "") + (call (match_operand:QI 1 "" "") + (match_operand:SI 2 "" ""))) + (use (match_operand:SI 3 "" ""))] + ;; Operand 3 is not used on the i386. + "" +{ + ix86_expand_call (operands[0], operands[1], operands[2], + operands[3], NULL, 0); + DONE; +}) + +(define_expand "sibcall_value" + [(set (match_operand 0 "" "") + (call (match_operand:QI 1 "" "") + (match_operand:SI 2 "" ""))) + (use (match_operand:SI 3 "" ""))] + ;; Operand 3 is not used on the i386. + "" +{ + ix86_expand_call (operands[0], operands[1], operands[2], + operands[3], NULL, 1); + DONE; +}) + +;; Call subroutine returning any type. 
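The UNSPEC_MS_TO_SYSV_CALL variants just above exist because a 64-bit Microsoft-ABI caller cannot assume a System V ABI callee preserves what the MS ABI treats as non-volatile registers: xmm6-xmm15, rsi and rdi are call-saved under the MS ABI but call-clobbered under the SysV ABI, hence the long clobber list. A minimal sketch of source that takes this path (assuming an x86-64 target that accepts the ms_abi/sysv_abi attributes, e.g. MinGW-w64; the names are illustrative):

    /* Cross-ABI call: the caller must assume xmm6-xmm15, rsi and rdi
       are destroyed, which is what the clobber list above models.  */
    extern void __attribute__ ((sysv_abi)) sysv_callee (void);

    void __attribute__ ((ms_abi))
    ms_caller (void)
    {
      sysv_callee ();
    }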
+ +(define_expand "untyped_call" + [(parallel [(call (match_operand 0 "" "") + (const_int 0)) + (match_operand 1 "" "") + (match_operand 2 "" "")])] + "" +{ + int i; + + /* In order to give reg-stack an easier job in validating two + coprocessor registers as containing a possible return value, + simply pretend the untyped call returns a complex long double + value. + + We can't use SSE_REGPARM_MAX here since callee is unprototyped + and should have the default ABI. */ + + ix86_expand_call ((TARGET_FLOAT_RETURNS_IN_80387 + ? gen_rtx_REG (XCmode, FIRST_FLOAT_REG) : NULL), + operands[0], const0_rtx, + GEN_INT ((TARGET_64BIT + ? (ix86_abi == SYSV_ABI + ? X86_64_SSE_REGPARM_MAX + : X86_64_MS_SSE_REGPARM_MAX) + : X86_32_SSE_REGPARM_MAX) + - 1), + NULL, 0); + + for (i = 0; i < XVECLEN (operands[2], 0); i++) + { + rtx set = XVECEXP (operands[2], 0, i); + emit_move_insn (SET_DEST (set), SET_SRC (set)); + } + + /* The optimizer does not know that the call sets the function value + registers we stored in the result block. We avoid problems by + claiming that all hard registers are used and clobbered at this + point. */ + emit_insn (gen_blockage ()); + + DONE; +}) + +;; Prologue and epilogue instructions + +;; UNSPEC_VOLATILE is considered to use and clobber all hard registers and +;; all of memory. This blocks insns from being moved across this point. + +(define_insn "blockage" + [(unspec_volatile [(const_int 0)] UNSPECV_BLOCKAGE)] + "" + "" + [(set_attr "length" "0")]) + +;; Do not schedule instructions accessing memory across this point. + +(define_expand "memory_blockage" + [(set (match_dup 0) + (unspec:BLK [(match_dup 0)] UNSPEC_MEMORY_BLOCKAGE))] + "" +{ + operands[0] = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (Pmode)); + MEM_VOLATILE_P (operands[0]) = 1; +}) + +(define_insn "*memory_blockage" + [(set (match_operand:BLK 0 "" "") + (unspec:BLK [(match_dup 0)] UNSPEC_MEMORY_BLOCKAGE))] + "" + "" + [(set_attr "length" "0")]) + +;; As USE insns aren't meaningful after reload, this is used instead +;; to prevent deleting instructions setting registers for PIC code +(define_insn "prologue_use" + [(unspec_volatile [(match_operand 0 "" "")] UNSPECV_PROLOGUE_USE)] + "" + "" + [(set_attr "length" "0")]) + +;; Insn emitted into the body of a function to return from a function. +;; This is only done if the function's epilogue is known to be simple. +;; See comments for ix86_can_use_return_insn_p in i386.c. + +(define_expand "return" + [(return)] + "ix86_can_use_return_insn_p ()" +{ + if (crtl->args.pops_args) + { + rtx popc = GEN_INT (crtl->args.pops_args); + emit_jump_insn (gen_return_pop_internal (popc)); + DONE; + } +}) + +(define_insn "return_internal" + [(return)] + "reload_completed" + "ret" + [(set_attr "length" "1") + (set_attr "atom_unit" "jeu") + (set_attr "length_immediate" "0") + (set_attr "modrm" "0")]) + +;; Used by x86_machine_dependent_reorg to avoid penalty on single byte RET +;; instruction Athlon and K8 have. 
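One case the return expander above handles specially is a function that pops its own arguments (crtl->args.pops_args non-zero); it is emitted as return_pop_internal, i.e. a ret with an immediate pop count. A small sketch of source that produces this (assuming -m32; stdcall callees pop their own stack arguments):

    /* Two 4-byte arguments, so the epilogue ends in "ret $8"
       instead of a plain "ret".  Illustrative only.  */
    __attribute__ ((stdcall)) int
    add2 (int a, int b)
    {
      return a + b;
    }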
+ +(define_insn "return_internal_long" + [(return) + (unspec [(const_int 0)] UNSPEC_REP)] + "reload_completed" + "rep\;ret" + [(set_attr "length" "2") + (set_attr "atom_unit" "jeu") + (set_attr "length_immediate" "0") + (set_attr "prefix_rep" "1") + (set_attr "modrm" "0")]) + +(define_insn "return_pop_internal" + [(return) + (use (match_operand:SI 0 "const_int_operand" ""))] + "reload_completed" + "ret\t%0" + [(set_attr "length" "3") + (set_attr "atom_unit" "jeu") + (set_attr "length_immediate" "2") + (set_attr "modrm" "0")]) + +(define_insn "return_indirect_internal" + [(return) + (use (match_operand:SI 0 "register_operand" "r"))] + "reload_completed" + "jmp\t%A0" + [(set_attr "type" "ibr") + (set_attr "length_immediate" "0")]) + +(define_insn "nop" + [(const_int 0)] + "" + "nop" + [(set_attr "length" "1") + (set_attr "length_immediate" "0") + (set_attr "modrm" "0")]) + +;; Generate nops. Operand 0 is the number of nops, up to 8. +(define_insn "nops" + [(unspec_volatile [(match_operand 0 "const_int_operand" "")] + UNSPECV_NOPS)] + "reload_completed" +{ + int num = INTVAL (operands[0]); + + gcc_assert (num >= 1 && num <= 8); + + while (num--) + fputs ("\tnop\n", asm_out_file); + + return ""; +} + [(set (attr "length") (symbol_ref "INTVAL (operands[0])")) + (set_attr "length_immediate" "0") + (set_attr "modrm" "0")]) + +;; Pad to 16-byte boundary, max skip in op0. Used to avoid +;; branch prediction penalty for the third jump in a 16-byte +;; block on K8. + +(define_insn "pad" + [(unspec_volatile [(match_operand 0 "" "")] UNSPECV_ALIGN)] + "" +{ +#ifdef ASM_OUTPUT_MAX_SKIP_PAD + ASM_OUTPUT_MAX_SKIP_PAD (asm_out_file, 4, (int)INTVAL (operands[0])); +#else + /* It is tempting to use ASM_OUTPUT_ALIGN here, but we don't want to do that. + The align insn is used to avoid 3 jump instructions in the row to improve + branch prediction and the benefits hardly outweigh the cost of extra 8 + nops on the average inserted by full alignment pseudo operation. 
*/ +#endif + return ""; +} + [(set_attr "length" "16")]) + +(define_expand "prologue" + [(const_int 0)] + "" + "ix86_expand_prologue (); DONE;") + +(define_insn "set_got" + [(set (match_operand:SI 0 "register_operand" "=r") + (unspec:SI [(const_int 0)] UNSPEC_SET_GOT)) + (clobber (reg:CC FLAGS_REG))] + "!TARGET_64BIT" + "* return output_set_got (operands[0], NULL_RTX);" + [(set_attr "type" "multi") + (set_attr "length" "12")]) + +(define_insn "set_got_labelled" + [(set (match_operand:SI 0 "register_operand" "=r") + (unspec:SI [(label_ref (match_operand 1 "" ""))] + UNSPEC_SET_GOT)) + (clobber (reg:CC FLAGS_REG))] + "!TARGET_64BIT" + "* return output_set_got (operands[0], operands[1]);" + [(set_attr "type" "multi") + (set_attr "length" "12")]) + +(define_insn "set_got_rex64" + [(set (match_operand:DI 0 "register_operand" "=r") + (unspec:DI [(const_int 0)] UNSPEC_SET_GOT))] + "TARGET_64BIT" + "lea{q}\t{_GLOBAL_OFFSET_TABLE_(%%rip), %0|%0, _GLOBAL_OFFSET_TABLE_[rip]}" + [(set_attr "type" "lea") + (set_attr "length_address" "4") + (set_attr "mode" "DI")]) + +(define_insn "set_rip_rex64" + [(set (match_operand:DI 0 "register_operand" "=r") + (unspec:DI [(label_ref (match_operand 1 "" ""))] UNSPEC_SET_RIP))] + "TARGET_64BIT" + "lea{q}\t{%l1(%%rip), %0|%0, %l1[rip]}" + [(set_attr "type" "lea") + (set_attr "length_address" "4") + (set_attr "mode" "DI")]) + +(define_insn "set_got_offset_rex64" + [(set (match_operand:DI 0 "register_operand" "=r") + (unspec:DI + [(label_ref (match_operand 1 "" ""))] + UNSPEC_SET_GOT_OFFSET))] + "TARGET_64BIT" + "movabs{q}\t{$_GLOBAL_OFFSET_TABLE_-%l1, %0|%0, OFFSET FLAT:_GLOBAL_OFFSET_TABLE_-%l1}" + [(set_attr "type" "imov") + (set_attr "length_immediate" "0") + (set_attr "length_address" "8") + (set_attr "mode" "DI")]) + +(define_expand "epilogue" + [(const_int 0)] + "" + "ix86_expand_epilogue (1); DONE;") + +(define_expand "sibcall_epilogue" + [(const_int 0)] + "" + "ix86_expand_epilogue (0); DONE;") + +(define_expand "eh_return" + [(use (match_operand 0 "register_operand" ""))] + "" +{ + rtx tmp, sa = EH_RETURN_STACKADJ_RTX, ra = operands[0]; + + /* Tricky bit: we write the address of the handler to which we will + be returning into someone else's stack frame, one word below the + stack address we wish to restore. */ + tmp = gen_rtx_PLUS (Pmode, arg_pointer_rtx, sa); + tmp = plus_constant (tmp, -UNITS_PER_WORD); + tmp = gen_rtx_MEM (Pmode, tmp); + emit_move_insn (tmp, ra); + + emit_jump_insn (gen_eh_return_internal ()); + emit_barrier (); + DONE; +}) + +(define_insn_and_split "eh_return_internal" + [(eh_return)] + "" + "#" + "epilogue_completed" + [(const_int 0)] + "ix86_expand_epilogue (2); DONE;") + +(define_insn "leave" + [(set (reg:SI SP_REG) (plus:SI (reg:SI BP_REG) (const_int 4))) + (set (reg:SI BP_REG) (mem:SI (reg:SI BP_REG))) + (clobber (mem:BLK (scratch)))] + "!TARGET_64BIT" + "leave" + [(set_attr "type" "leave")]) + +(define_insn "leave_rex64" + [(set (reg:DI SP_REG) (plus:DI (reg:DI BP_REG) (const_int 8))) + (set (reg:DI BP_REG) (mem:DI (reg:DI BP_REG))) + (clobber (mem:BLK (scratch)))] + "TARGET_64BIT" + "leave" + [(set_attr "type" "leave")]) + +;; Handle -fsplit-stack. + +(define_expand "split_stack_prologue" + [(const_int 0)] + "" +{ + ix86_expand_split_stack_prologue (); + DONE; +}) + +;; In order to support the call/return predictor, we use a return +;; instruction which the middle-end doesn't see. 
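The set_got patterns above are what materialize the PIC register on 32-bit targets; output_set_got emits the call/pop (or thunk) sequence that leaves the address of _GLOBAL_OFFSET_TABLE_ in the chosen register, while the 64-bit variants use a simple RIP-relative lea. A hedged sketch of code whose prologue needs that sequence (assuming -m32 -fPIC; the exact instruction sequence emitted depends on how GCC is configured):

    /* With -m32 -fPIC, referencing a preemptible global makes the
       function set up the GOT pointer via the set_got pattern above
       before performing the GOT-relative load.  */
    extern int shared_counter;

    int
    read_counter (void)
    {
      return shared_counter;
    }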
+(define_insn "split_stack_return" + [(unspec_volatile [(match_operand:SI 0 "const_int_operand" "")] + UNSPECV_SPLIT_STACK_RETURN)] + "" +{ + if (operands[0] == const0_rtx) + return "ret"; + else + return "ret\t%0"; +} + [(set_attr "atom_unit" "jeu") + (set_attr "modrm" "0") + (set (attr "length") + (if_then_else (match_operand:SI 0 "const0_operand" "") + (const_int 1) + (const_int 3))) + (set (attr "length_immediate") + (if_then_else (match_operand:SI 0 "const0_operand" "") + (const_int 0) + (const_int 2)))]) + +;; If there are operand 0 bytes available on the stack, jump to +;; operand 1. + +(define_expand "split_stack_space_check" + [(set (pc) (if_then_else + (ltu (minus (reg SP_REG) + (match_operand 0 "register_operand" "")) + (unspec [(const_int 0)] UNSPEC_STACK_CHECK)) + (label_ref (match_operand 1 "" "")) + (pc)))] + "" +{ + rtx reg, size, limit; + + reg = gen_reg_rtx (Pmode); + size = force_reg (Pmode, operands[0]); + emit_insn (gen_sub3_insn (reg, stack_pointer_rtx, size)); + limit = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx), + UNSPEC_STACK_CHECK); + limit = gen_rtx_MEM (Pmode, gen_rtx_CONST (Pmode, limit)); + ix86_expand_branch (GEU, reg, limit, operands[1]); + + DONE; +}) + +;; Bit manipulation instructions. + +(define_expand "ffs2" + [(set (match_dup 2) (const_int -1)) + (parallel [(set (reg:CCZ FLAGS_REG) + (compare:CCZ + (match_operand:SWI48 1 "nonimmediate_operand" "") + (const_int 0))) + (set (match_operand:SWI48 0 "register_operand" "") + (ctz:SWI48 (match_dup 1)))]) + (set (match_dup 0) (if_then_else:SWI48 + (eq (reg:CCZ FLAGS_REG) (const_int 0)) + (match_dup 2) + (match_dup 0))) + (parallel [(set (match_dup 0) (plus:SWI48 (match_dup 0) (const_int 1))) + (clobber (reg:CC FLAGS_REG))])] + "" +{ + if (mode == SImode && !TARGET_CMOVE) + { + emit_insn (gen_ffssi2_no_cmove (operands[0], operands [1])); + DONE; + } + operands[2] = gen_reg_rtx (mode); +}) + +(define_insn_and_split "ffssi2_no_cmove" + [(set (match_operand:SI 0 "register_operand" "=r") + (ffs:SI (match_operand:SI 1 "nonimmediate_operand" "rm"))) + (clobber (match_scratch:SI 2 "=&q")) + (clobber (reg:CC FLAGS_REG))] + "!TARGET_CMOVE" + "#" + "&& reload_completed" + [(parallel [(set (reg:CCZ FLAGS_REG) + (compare:CCZ (match_dup 1) (const_int 0))) + (set (match_dup 0) (ctz:SI (match_dup 1)))]) + (set (strict_low_part (match_dup 3)) + (eq:QI (reg:CCZ FLAGS_REG) (const_int 0))) + (parallel [(set (match_dup 2) (neg:SI (match_dup 2))) + (clobber (reg:CC FLAGS_REG))]) + (parallel [(set (match_dup 0) (ior:SI (match_dup 0) (match_dup 2))) + (clobber (reg:CC FLAGS_REG))]) + (parallel [(set (match_dup 0) (plus:SI (match_dup 0) (const_int 1))) + (clobber (reg:CC FLAGS_REG))])] +{ + operands[3] = gen_lowpart (QImode, operands[2]); + ix86_expand_clear (operands[2]); +}) + +(define_insn "*ffs_1" + [(set (reg:CCZ FLAGS_REG) + (compare:CCZ (match_operand:SWI48 1 "nonimmediate_operand" "rm") + (const_int 0))) + (set (match_operand:SWI48 0 "register_operand" "=r") + (ctz:SWI48 (match_dup 1)))] + "" + "bsf{}\t{%1, %0|%0, %1}" + [(set_attr "type" "alu1") + (set_attr "prefix_0f" "1") + (set_attr "mode" "")]) + +(define_insn "ctz2" + [(set (match_operand:SWI248 0 "register_operand" "=r") + (ctz:SWI248 (match_operand:SWI248 1 "nonimmediate_operand" "rm"))) + (clobber (reg:CC FLAGS_REG))] + "" +{ + if (TARGET_BMI) + return "tzcnt{}\t{%1, %0|%0, %1}"; + else + return "bsf{}\t{%1, %0|%0, %1}"; +} + [(set_attr "type" "alu1") + (set_attr "prefix_0f" "1") + (set (attr "prefix_rep") (symbol_ref "TARGET_BMI")) + (set_attr "mode" "")]) + 
+(define_expand "clz2" + [(parallel + [(set (match_operand:SWI248 0 "register_operand" "") + (minus:SWI248 + (match_dup 2) + (clz:SWI248 (match_operand:SWI248 1 "nonimmediate_operand" "")))) + (clobber (reg:CC FLAGS_REG))]) + (parallel + [(set (match_dup 0) (xor:SWI248 (match_dup 0) (match_dup 2))) + (clobber (reg:CC FLAGS_REG))])] + "" +{ + if (TARGET_ABM) + { + emit_insn (gen_clz2_abm (operands[0], operands[1])); + DONE; + } + operands[2] = GEN_INT (GET_MODE_BITSIZE (mode)-1); +}) + +(define_insn "clz2_abm" + [(set (match_operand:SWI248 0 "register_operand" "=r") + (clz:SWI248 (match_operand:SWI248 1 "nonimmediate_operand" "rm"))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_ABM || TARGET_BMI" + "lzcnt{}\t{%1, %0|%0, %1}" + [(set_attr "prefix_rep" "1") + (set_attr "type" "bitmanip") + (set_attr "mode" "")]) + +;; BMI instructions. +(define_insn "*bmi_andn_" + [(set (match_operand:SWI48 0 "register_operand" "=r") + (and:SWI48 + (not:SWI48 + (match_operand:SWI48 1 "register_operand" "r")) + (match_operand:SWI48 2 "nonimmediate_operand" "rm"))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_BMI" + "andn\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "bitmanip") + (set_attr "mode" "")]) + +(define_insn "bmi_bextr_" + [(set (match_operand:SWI48 0 "register_operand" "=r") + (unspec:SWI48 [(match_operand:SWI48 1 "nonimmediate_operand" "rm") + (match_operand:SWI48 2 "register_operand" "r")] + UNSPEC_BEXTR)) + (clobber (reg:CC FLAGS_REG))] + "TARGET_BMI" + "bextr\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "bitmanip") + (set_attr "mode" "")]) + +(define_insn "*bmi_blsi_" + [(set (match_operand:SWI48 0 "register_operand" "=r") + (and:SWI48 + (neg:SWI48 + (match_operand:SWI48 1 "nonimmediate_operand" "rm")) + (match_dup 1))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_BMI" + "blsi\t{%1, %0|%0, %1}" + [(set_attr "type" "bitmanip") + (set_attr "mode" "")]) + +(define_insn "*bmi_blsmsk_" + [(set (match_operand:SWI48 0 "register_operand" "=r") + (xor:SWI48 + (plus:SWI48 + (match_operand:SWI48 1 "nonimmediate_operand" "rm") + (const_int -1)) + (match_dup 1))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_BMI" + "blsmsk\t{%1, %0|%0, %1}" + [(set_attr "type" "bitmanip") + (set_attr "mode" "")]) + +(define_insn "*bmi_blsr_" + [(set (match_operand:SWI48 0 "register_operand" "=r") + (and:SWI48 + (plus:SWI48 + (match_operand:SWI48 1 "nonimmediate_operand" "rm") + (const_int -1)) + (match_dup 1))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_BMI" + "blsr\t{%1, %0|%0, %1}" + [(set_attr "type" "bitmanip") + (set_attr "mode" "")]) + +;; TBM instructions. 
+(define_insn "tbm_bextri_" + [(set (match_operand:SWI48 0 "register_operand" "=r") + (zero_extract:SWI48 + (match_operand:SWI48 1 "nonimmediate_operand" "rm") + (match_operand:SWI48 2 "const_0_to_255_operand" "n") + (match_operand:SWI48 3 "const_0_to_255_operand" "n"))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_TBM" +{ + operands[2] = GEN_INT (INTVAL (operands[2]) << 8 | INTVAL (operands[3])); + return "bextr\t{%2, %1, %0|%0, %1, %2}"; +} + [(set_attr "type" "bitmanip") + (set_attr "mode" "")]) + +(define_insn "*tbm_blcfill_" + [(set (match_operand:SWI48 0 "register_operand" "=r") + (and:SWI48 + (plus:SWI48 + (match_operand:SWI48 1 "nonimmediate_operand" "rm") + (const_int 1)) + (match_dup 1))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_TBM" + "blcfill\t{%1, %0|%0, %1}" + [(set_attr "type" "bitmanip") + (set_attr "mode" "")]) + +(define_insn "*tbm_blci_" + [(set (match_operand:SWI48 0 "register_operand" "=r") + (ior:SWI48 + (not:SWI48 + (plus:SWI48 + (match_operand:SWI48 1 "nonimmediate_operand" "rm") + (const_int 1))) + (match_dup 1))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_TBM" + "blci\t{%1, %0|%0, %1}" + [(set_attr "type" "bitmanip") + (set_attr "mode" "")]) + +(define_insn "*tbm_blcic_" + [(set (match_operand:SWI48 0 "register_operand" "=r") + (and:SWI48 + (plus:SWI48 + (match_operand:SWI48 1 "nonimmediate_operand" "rm") + (const_int 1)) + (not:SWI48 + (match_dup 1)))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_TBM" + "blcic\t{%1, %0|%0, %1}" + [(set_attr "type" "bitmanip") + (set_attr "mode" "")]) + +(define_insn "*tbm_blcmsk_" + [(set (match_operand:SWI48 0 "register_operand" "=r") + (xor:SWI48 + (plus:SWI48 + (match_operand:SWI48 1 "nonimmediate_operand" "rm") + (const_int 1)) + (match_dup 1))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_TBM" + "blcmsk\t{%1, %0|%0, %1}" + [(set_attr "type" "bitmanip") + (set_attr "mode" "")]) + +(define_insn "*tbm_blcs_" + [(set (match_operand:SWI48 0 "register_operand" "=r") + (ior:SWI48 + (plus:SWI48 + (match_operand:SWI48 1 "nonimmediate_operand" "rm") + (const_int 1)) + (match_dup 1))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_TBM" + "blcs\t{%1, %0|%0, %1}" + [(set_attr "type" "bitmanip") + (set_attr "mode" "")]) + +(define_insn "*tbm_blsfill_" + [(set (match_operand:SWI48 0 "register_operand" "=r") + (ior:SWI48 + (plus:SWI48 + (match_operand:SWI48 1 "nonimmediate_operand" "rm") + (const_int -1)) + (match_dup 1))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_TBM" + "blsfill\t{%1, %0|%0, %1}" + [(set_attr "type" "bitmanip") + (set_attr "mode" "")]) + +(define_insn "*tbm_blsic_" + [(set (match_operand:SWI48 0 "register_operand" "=r") + (ior:SWI48 + (plus:SWI48 + (match_operand:SWI48 1 "nonimmediate_operand" "rm") + (const_int -1)) + (not:SWI48 + (match_dup 1)))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_TBM" + "blsic\t{%1, %0|%0, %1}" + [(set_attr "type" "bitmanip") + (set_attr "mode" "")]) + +(define_insn "*tbm_t1mskc_" + [(set (match_operand:SWI48 0 "register_operand" "=r") + (ior:SWI48 + (plus:SWI48 + (match_operand:SWI48 1 "nonimmediate_operand" "rm") + (const_int 1)) + (not:SWI48 + (match_dup 1)))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_TBM" + "t1mskc\t{%1, %0|%0, %1}" + [(set_attr "type" "bitmanip") + (set_attr "mode" "")]) + +(define_insn "*tbm_tzmsk_" + [(set (match_operand:SWI48 0 "register_operand" "=r") + (and:SWI48 + (plus:SWI48 + (match_operand:SWI48 1 "nonimmediate_operand" "rm") + (const_int -1)) + (not:SWI48 + (match_dup 1)))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_TBM" + "tzmsk\t{%1, %0|%0, %1}" + [(set_attr "type" "bitmanip") 
+ (set_attr "mode" "")]) + +(define_insn "bsr_rex64" + [(set (match_operand:DI 0 "register_operand" "=r") + (minus:DI (const_int 63) + (clz:DI (match_operand:DI 1 "nonimmediate_operand" "rm")))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_64BIT" + "bsr{q}\t{%1, %0|%0, %1}" + [(set_attr "type" "alu1") + (set_attr "prefix_0f" "1") + (set_attr "mode" "DI")]) + +(define_insn "bsr" + [(set (match_operand:SI 0 "register_operand" "=r") + (minus:SI (const_int 31) + (clz:SI (match_operand:SI 1 "nonimmediate_operand" "rm")))) + (clobber (reg:CC FLAGS_REG))] + "" + "bsr{l}\t{%1, %0|%0, %1}" + [(set_attr "type" "alu1") + (set_attr "prefix_0f" "1") + (set_attr "mode" "SI")]) + +(define_insn "*bsrhi" + [(set (match_operand:HI 0 "register_operand" "=r") + (minus:HI (const_int 15) + (clz:HI (match_operand:HI 1 "nonimmediate_operand" "rm")))) + (clobber (reg:CC FLAGS_REG))] + "" + "bsr{w}\t{%1, %0|%0, %1}" + [(set_attr "type" "alu1") + (set_attr "prefix_0f" "1") + (set_attr "mode" "HI")]) + +(define_insn "popcount2" + [(set (match_operand:SWI248 0 "register_operand" "=r") + (popcount:SWI248 + (match_operand:SWI248 1 "nonimmediate_operand" "rm"))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_POPCNT" +{ +#if TARGET_MACHO + return "popcnt\t{%1, %0|%0, %1}"; +#else + return "popcnt{}\t{%1, %0|%0, %1}"; +#endif +} + [(set_attr "prefix_rep" "1") + (set_attr "type" "bitmanip") + (set_attr "mode" "")]) + +(define_insn "*popcount2_cmp" + [(set (reg FLAGS_REG) + (compare + (popcount:SWI248 + (match_operand:SWI248 1 "nonimmediate_operand" "rm")) + (const_int 0))) + (set (match_operand:SWI248 0 "register_operand" "=r") + (popcount:SWI248 (match_dup 1)))] + "TARGET_POPCNT && ix86_match_ccmode (insn, CCZmode)" +{ +#if TARGET_MACHO + return "popcnt\t{%1, %0|%0, %1}"; +#else + return "popcnt{}\t{%1, %0|%0, %1}"; +#endif +} + [(set_attr "prefix_rep" "1") + (set_attr "type" "bitmanip") + (set_attr "mode" "")]) + +(define_insn "*popcountsi2_cmp_zext" + [(set (reg FLAGS_REG) + (compare + (popcount:SI (match_operand:SI 1 "nonimmediate_operand" "rm")) + (const_int 0))) + (set (match_operand:DI 0 "register_operand" "=r") + (zero_extend:DI(popcount:SI (match_dup 1))))] + "TARGET_64BIT && TARGET_POPCNT && ix86_match_ccmode (insn, CCZmode)" +{ +#if TARGET_MACHO + return "popcnt\t{%1, %0|%0, %1}"; +#else + return "popcnt{l}\t{%1, %0|%0, %1}"; +#endif +} + [(set_attr "prefix_rep" "1") + (set_attr "type" "bitmanip") + (set_attr "mode" "SI")]) + +(define_expand "bswap2" + [(set (match_operand:SWI48 0 "register_operand" "") + (bswap:SWI48 (match_operand:SWI48 1 "register_operand" "")))] + "" +{ + if (mode == SImode && !(TARGET_BSWAP || TARGET_MOVBE)) + { + rtx x = operands[0]; + + emit_move_insn (x, operands[1]); + emit_insn (gen_bswaphi_lowpart (gen_lowpart (HImode, x))); + emit_insn (gen_rotlsi3 (x, x, GEN_INT (16))); + emit_insn (gen_bswaphi_lowpart (gen_lowpart (HImode, x))); + DONE; + } +}) + +(define_insn "*bswap2_movbe" + [(set (match_operand:SWI48 0 "nonimmediate_operand" "=r,r,m") + (bswap:SWI48 (match_operand:SWI48 1 "nonimmediate_operand" "0,m,r")))] + "TARGET_MOVBE + && !(MEM_P (operands[0]) && MEM_P (operands[1]))" + "@ + bswap\t%0 + movbe\t{%1, %0|%0, %1} + movbe\t{%1, %0|%0, %1}" + [(set_attr "type" "bitmanip,imov,imov") + (set_attr "modrm" "0,1,1") + (set_attr "prefix_0f" "*,1,1") + (set_attr "prefix_extra" "*,1,1") + (set_attr "mode" "")]) + +(define_insn "*bswap2_1" + [(set (match_operand:SWI48 0 "register_operand" "=r") + (bswap:SWI48 (match_operand:SWI48 1 "register_operand" "0")))] + "TARGET_BSWAP" + "bswap\t%0" + 
[(set_attr "type" "bitmanip") + (set_attr "modrm" "0") + (set_attr "mode" "")]) + +(define_insn "*bswaphi_lowpart_1" + [(set (strict_low_part (match_operand:HI 0 "register_operand" "+Q,r")) + (bswap:HI (match_dup 0))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_USE_XCHGB || optimize_function_for_size_p (cfun)" + "@ + xchg{b}\t{%h0, %b0|%b0, %h0} + rol{w}\t{$8, %0|%0, 8}" + [(set_attr "length" "2,4") + (set_attr "mode" "QI,HI")]) + +(define_insn "bswaphi_lowpart" + [(set (strict_low_part (match_operand:HI 0 "register_operand" "+r")) + (bswap:HI (match_dup 0))) + (clobber (reg:CC FLAGS_REG))] + "" + "rol{w}\t{$8, %0|%0, 8}" + [(set_attr "length" "4") + (set_attr "mode" "HI")]) + +(define_expand "paritydi2" + [(set (match_operand:DI 0 "register_operand" "") + (parity:DI (match_operand:DI 1 "register_operand" "")))] + "! TARGET_POPCNT" +{ + rtx scratch = gen_reg_rtx (QImode); + rtx cond; + + emit_insn (gen_paritydi2_cmp (NULL_RTX, NULL_RTX, + NULL_RTX, operands[1])); + + cond = gen_rtx_fmt_ee (ORDERED, QImode, + gen_rtx_REG (CCmode, FLAGS_REG), + const0_rtx); + emit_insn (gen_rtx_SET (VOIDmode, scratch, cond)); + + if (TARGET_64BIT) + emit_insn (gen_zero_extendqidi2 (operands[0], scratch)); + else + { + rtx tmp = gen_reg_rtx (SImode); + + emit_insn (gen_zero_extendqisi2 (tmp, scratch)); + emit_insn (gen_zero_extendsidi2 (operands[0], tmp)); + } + DONE; +}) + +(define_expand "paritysi2" + [(set (match_operand:SI 0 "register_operand" "") + (parity:SI (match_operand:SI 1 "register_operand" "")))] + "! TARGET_POPCNT" +{ + rtx scratch = gen_reg_rtx (QImode); + rtx cond; + + emit_insn (gen_paritysi2_cmp (NULL_RTX, NULL_RTX, operands[1])); + + cond = gen_rtx_fmt_ee (ORDERED, QImode, + gen_rtx_REG (CCmode, FLAGS_REG), + const0_rtx); + emit_insn (gen_rtx_SET (VOIDmode, scratch, cond)); + + emit_insn (gen_zero_extendqisi2 (operands[0], scratch)); + DONE; +}) + +(define_insn_and_split "paritydi2_cmp" + [(set (reg:CC FLAGS_REG) + (unspec:CC [(match_operand:DI 3 "register_operand" "0")] + UNSPEC_PARITY)) + (clobber (match_scratch:DI 0 "=r")) + (clobber (match_scratch:SI 1 "=&r")) + (clobber (match_scratch:HI 2 "=Q"))] + "! TARGET_POPCNT" + "#" + "&& reload_completed" + [(parallel + [(set (match_dup 1) + (xor:SI (match_dup 1) (match_dup 4))) + (clobber (reg:CC FLAGS_REG))]) + (parallel + [(set (reg:CC FLAGS_REG) + (unspec:CC [(match_dup 1)] UNSPEC_PARITY)) + (clobber (match_dup 1)) + (clobber (match_dup 2))])] +{ + operands[4] = gen_lowpart (SImode, operands[3]); + + if (TARGET_64BIT) + { + emit_move_insn (operands[1], gen_lowpart (SImode, operands[3])); + emit_insn (gen_lshrdi3 (operands[3], operands[3], GEN_INT (32))); + } + else + operands[1] = gen_highpart (SImode, operands[3]); +}) + +(define_insn_and_split "paritysi2_cmp" + [(set (reg:CC FLAGS_REG) + (unspec:CC [(match_operand:SI 2 "register_operand" "0")] + UNSPEC_PARITY)) + (clobber (match_scratch:SI 0 "=r")) + (clobber (match_scratch:HI 1 "=&Q"))] + "! 
TARGET_POPCNT" + "#" + "&& reload_completed" + [(parallel + [(set (match_dup 1) + (xor:HI (match_dup 1) (match_dup 3))) + (clobber (reg:CC FLAGS_REG))]) + (parallel + [(set (reg:CC FLAGS_REG) + (unspec:CC [(match_dup 1)] UNSPEC_PARITY)) + (clobber (match_dup 1))])] +{ + operands[3] = gen_lowpart (HImode, operands[2]); + + emit_move_insn (operands[1], gen_lowpart (HImode, operands[2])); + emit_insn (gen_lshrsi3 (operands[2], operands[2], GEN_INT (16))); +}) + +(define_insn "*parityhi2_cmp" + [(set (reg:CC FLAGS_REG) + (unspec:CC [(match_operand:HI 1 "register_operand" "0")] + UNSPEC_PARITY)) + (clobber (match_scratch:HI 0 "=Q"))] + "! TARGET_POPCNT" + "xor{b}\t{%h0, %b0|%b0, %h0}" + [(set_attr "length" "2") + (set_attr "mode" "HI")]) + +;; Thread-local storage patterns for ELF. +;; +;; Note that these code sequences must appear exactly as shown +;; in order to allow linker relaxation. + +(define_insn "*tls_global_dynamic_32_gnu" + [(set (match_operand:SI 0 "register_operand" "=a") + (unspec:SI [(match_operand:SI 1 "register_operand" "b") + (match_operand:SI 2 "tls_symbolic_operand" "") + (match_operand:SI 3 "call_insn_operand" "")] + UNSPEC_TLS_GD)) + (clobber (match_scratch:SI 4 "=d")) + (clobber (match_scratch:SI 5 "=c")) + (clobber (reg:CC FLAGS_REG))] + "!TARGET_64BIT && TARGET_GNU_TLS" + "lea{l}\t{%a2@tlsgd(,%1,1), %0|%0, %a2@tlsgd[%1*1]}\;call\t%P3" + [(set_attr "type" "multi") + (set_attr "length" "12")]) + +(define_expand "tls_global_dynamic_32" + [(parallel [(set (match_operand:SI 0 "register_operand" "") + (unspec:SI + [(match_dup 2) + (match_operand:SI 1 "tls_symbolic_operand" "") + (match_dup 3)] + UNSPEC_TLS_GD)) + (clobber (match_scratch:SI 4 "")) + (clobber (match_scratch:SI 5 "")) + (clobber (reg:CC FLAGS_REG))])] + "" +{ + if (flag_pic) + operands[2] = pic_offset_table_rtx; + else + { + operands[2] = gen_reg_rtx (Pmode); + emit_insn (gen_set_got (operands[2])); + } + if (TARGET_GNU2_TLS) + { + emit_insn (gen_tls_dynamic_gnu2_32 + (operands[0], operands[1], operands[2])); + DONE; + } + operands[3] = ix86_tls_get_addr (); +}) + +(define_insn "*tls_global_dynamic_64" + [(set (match_operand:DI 0 "register_operand" "=a") + (call:DI (mem:QI (match_operand:DI 2 "call_insn_operand" "")) + (match_operand:DI 3 "" ""))) + (unspec:DI [(match_operand:DI 1 "tls_symbolic_operand" "")] + UNSPEC_TLS_GD)] + "TARGET_64BIT" + { return ASM_BYTE "0x66\n\tlea{q}\t{%a1@tlsgd(%%rip), %%rdi|rdi, %a1@tlsgd[rip]}\n" ASM_SHORT "0x6666\n\trex64\n\tcall\t%P2"; } + [(set_attr "type" "multi") + (set_attr "length" "16")]) + +(define_expand "tls_global_dynamic_64" + [(parallel [(set (match_operand:DI 0 "register_operand" "") + (call:DI (mem:QI (match_dup 2)) (const_int 0))) + (unspec:DI [(match_operand:DI 1 "tls_symbolic_operand" "")] + UNSPEC_TLS_GD)])] + "" +{ + if (TARGET_GNU2_TLS) + { + emit_insn (gen_tls_dynamic_gnu2_64 + (operands[0], operands[1])); + DONE; + } + operands[2] = ix86_tls_get_addr (); +}) + +(define_insn "*tls_local_dynamic_base_32_gnu" + [(set (match_operand:SI 0 "register_operand" "=a") + (unspec:SI [(match_operand:SI 1 "register_operand" "b") + (match_operand:SI 2 "call_insn_operand" "")] + UNSPEC_TLS_LD_BASE)) + (clobber (match_scratch:SI 3 "=d")) + (clobber (match_scratch:SI 4 "=c")) + (clobber (reg:CC FLAGS_REG))] + "!TARGET_64BIT && TARGET_GNU_TLS" + "lea{l}\t{%&@tlsldm(%1), %0|%0, %&@tlsldm[%1]}\;call\t%P2" + [(set_attr "type" "multi") + (set_attr "length" "11")]) + +(define_expand "tls_local_dynamic_base_32" + [(parallel [(set (match_operand:SI 0 "register_operand" "") + 
(unspec:SI [(match_dup 1) (match_dup 2)] + UNSPEC_TLS_LD_BASE)) + (clobber (match_scratch:SI 3 "")) + (clobber (match_scratch:SI 4 "")) + (clobber (reg:CC FLAGS_REG))])] + "" +{ + if (flag_pic) + operands[1] = pic_offset_table_rtx; + else + { + operands[1] = gen_reg_rtx (Pmode); + emit_insn (gen_set_got (operands[1])); + } + if (TARGET_GNU2_TLS) + { + emit_insn (gen_tls_dynamic_gnu2_32 + (operands[0], ix86_tls_module_base (), operands[1])); + DONE; + } + operands[2] = ix86_tls_get_addr (); +}) + +(define_insn "*tls_local_dynamic_base_64" + [(set (match_operand:DI 0 "register_operand" "=a") + (call:DI (mem:QI (match_operand:DI 1 "call_insn_operand" "")) + (match_operand:DI 2 "" ""))) + (unspec:DI [(const_int 0)] UNSPEC_TLS_LD_BASE)] + "TARGET_64BIT" + "lea{q}\t{%&@tlsld(%%rip), %%rdi|rdi, %&@tlsld[rip]}\;call\t%P1" + [(set_attr "type" "multi") + (set_attr "length" "12")]) + +(define_expand "tls_local_dynamic_base_64" + [(parallel [(set (match_operand:DI 0 "register_operand" "") + (call:DI (mem:QI (match_dup 1)) (const_int 0))) + (unspec:DI [(const_int 0)] UNSPEC_TLS_LD_BASE)])] + "" +{ + if (TARGET_GNU2_TLS) + { + emit_insn (gen_tls_dynamic_gnu2_64 + (operands[0], ix86_tls_module_base ())); + DONE; + } + operands[1] = ix86_tls_get_addr (); +}) + +;; Local dynamic of a single variable is a lose. Show combine how +;; to convert that back to global dynamic. + +(define_insn_and_split "*tls_local_dynamic_32_once" + [(set (match_operand:SI 0 "register_operand" "=a") + (plus:SI (unspec:SI [(match_operand:SI 1 "register_operand" "b") + (match_operand:SI 2 "call_insn_operand" "")] + UNSPEC_TLS_LD_BASE) + (const:SI (unspec:SI + [(match_operand:SI 3 "tls_symbolic_operand" "")] + UNSPEC_DTPOFF)))) + (clobber (match_scratch:SI 4 "=d")) + (clobber (match_scratch:SI 5 "=c")) + (clobber (reg:CC FLAGS_REG))] + "" + "#" + "" + [(parallel [(set (match_dup 0) + (unspec:SI [(match_dup 1) (match_dup 3) (match_dup 2)] + UNSPEC_TLS_GD)) + (clobber (match_dup 4)) + (clobber (match_dup 5)) + (clobber (reg:CC FLAGS_REG))])]) + +;; Segment register for the thread base ptr load +(define_mode_attr tp_seg [(SI "gs") (DI "fs")]) + +;; Load and add the thread base pointer from %gs:0. +(define_insn "*load_tp_" + [(set (match_operand:P 0 "register_operand" "=r") + (unspec:P [(const_int 0)] UNSPEC_TP))] + "" + "mov{}\t{%%:0, %0|%0, PTR :0}" + [(set_attr "type" "imov") + (set_attr "modrm" "0") + (set_attr "length" "7") + (set_attr "memory" "load") + (set_attr "imm_disp" "false")]) + +(define_insn "*add_tp_" + [(set (match_operand:P 0 "register_operand" "=r") + (plus:P (unspec:P [(const_int 0)] UNSPEC_TP) + (match_operand:P 1 "register_operand" "0"))) + (clobber (reg:CC FLAGS_REG))] + "" + "add{}\t{%%:0, %0|%0, PTR :0}" + [(set_attr "type" "alu") + (set_attr "modrm" "0") + (set_attr "length" "7") + (set_attr "memory" "load") + (set_attr "imm_disp" "false")]) + +;; The Sun linker took the AMD64 TLS spec literally and can only handle +;; %rax as destination of the initial executable code sequence. +(define_insn "tls_initial_exec_64_sun" + [(set (match_operand:DI 0 "register_operand" "=a") + (unspec:DI + [(match_operand:DI 1 "tls_symbolic_operand" "")] + UNSPEC_TLS_IE_SUN)) + (clobber (reg:CC FLAGS_REG))] + "TARGET_64BIT && TARGET_SUN_TLS" + "mov{q}\t{%%fs:0, %0|%0, QWORD PTR fs:0}\n\tadd{q}\t{%a1@gottpoff(%%rip), %0|%0, %a1@gottpoff[rip]}" + [(set_attr "type" "multi")]) + +;; GNU2 TLS patterns can be split. 
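The load_tp and add_tp patterns above read the thread base pointer from %gs:0 on 32-bit and %fs:0 on 64-bit targets, and the surrounding global-dynamic, local-dynamic and initial-exec patterns cover the different TLS access models. A hedged example of code that exercises them (which model is chosen depends on -fPIC, symbol visibility and later linker relaxation):

    /* Accessing a thread-local variable.  In a -fPIC shared library
       this typically goes through the global-dynamic patterns (a call
       to the TLS runtime helper); in an executable it can relax to
       initial-exec or local-exec, i.e. a load relative to %fs:0/%gs:0.  */
    __thread long counter;

    long
    bump (void)
    {
      return ++counter;
    }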
+ +(define_expand "tls_dynamic_gnu2_32" + [(set (match_dup 3) + (plus:SI (match_operand:SI 2 "register_operand" "") + (const:SI + (unspec:SI [(match_operand:SI 1 "tls_symbolic_operand" "")] + UNSPEC_TLSDESC)))) + (parallel + [(set (match_operand:SI 0 "register_operand" "") + (unspec:SI [(match_dup 1) (match_dup 3) + (match_dup 2) (reg:SI SP_REG)] + UNSPEC_TLSDESC)) + (clobber (reg:CC FLAGS_REG))])] + "!TARGET_64BIT && TARGET_GNU2_TLS" +{ + operands[3] = can_create_pseudo_p () ? gen_reg_rtx (Pmode) : operands[0]; + ix86_tls_descriptor_calls_expanded_in_cfun = true; +}) + +(define_insn "*tls_dynamic_lea_32" + [(set (match_operand:SI 0 "register_operand" "=r") + (plus:SI (match_operand:SI 1 "register_operand" "b") + (const:SI + (unspec:SI [(match_operand:SI 2 "tls_symbolic_operand" "")] + UNSPEC_TLSDESC))))] + "!TARGET_64BIT && TARGET_GNU2_TLS" + "lea{l}\t{%a2@TLSDESC(%1), %0|%0, %a2@TLSDESC[%1]}" + [(set_attr "type" "lea") + (set_attr "mode" "SI") + (set_attr "length" "6") + (set_attr "length_address" "4")]) + +(define_insn "*tls_dynamic_call_32" + [(set (match_operand:SI 0 "register_operand" "=a") + (unspec:SI [(match_operand:SI 1 "tls_symbolic_operand" "") + (match_operand:SI 2 "register_operand" "0") + ;; we have to make sure %ebx still points to the GOT + (match_operand:SI 3 "register_operand" "b") + (reg:SI SP_REG)] + UNSPEC_TLSDESC)) + (clobber (reg:CC FLAGS_REG))] + "!TARGET_64BIT && TARGET_GNU2_TLS" + "call\t{*%a1@TLSCALL(%2)|[DWORD PTR [%2+%a1@TLSCALL]]}" + [(set_attr "type" "call") + (set_attr "length" "2") + (set_attr "length_address" "0")]) + +(define_insn_and_split "*tls_dynamic_gnu2_combine_32" + [(set (match_operand:SI 0 "register_operand" "=&a") + (plus:SI + (unspec:SI [(match_operand:SI 3 "tls_modbase_operand" "") + (match_operand:SI 4 "" "") + (match_operand:SI 2 "register_operand" "b") + (reg:SI SP_REG)] + UNSPEC_TLSDESC) + (const:SI (unspec:SI + [(match_operand:SI 1 "tls_symbolic_operand" "")] + UNSPEC_DTPOFF)))) + (clobber (reg:CC FLAGS_REG))] + "!TARGET_64BIT && TARGET_GNU2_TLS" + "#" + "" + [(set (match_dup 0) (match_dup 5))] +{ + operands[5] = can_create_pseudo_p () ? gen_reg_rtx (Pmode) : operands[0]; + emit_insn (gen_tls_dynamic_gnu2_32 (operands[5], operands[1], operands[2])); +}) + +(define_expand "tls_dynamic_gnu2_64" + [(set (match_dup 2) + (unspec:DI [(match_operand:DI 1 "tls_symbolic_operand" "")] + UNSPEC_TLSDESC)) + (parallel + [(set (match_operand:DI 0 "register_operand" "") + (unspec:DI [(match_dup 1) (match_dup 2) (reg:DI SP_REG)] + UNSPEC_TLSDESC)) + (clobber (reg:CC FLAGS_REG))])] + "TARGET_64BIT && TARGET_GNU2_TLS" +{ + operands[2] = can_create_pseudo_p () ? 
gen_reg_rtx (Pmode) : operands[0]; + ix86_tls_descriptor_calls_expanded_in_cfun = true; +}) + +(define_insn "*tls_dynamic_lea_64" + [(set (match_operand:DI 0 "register_operand" "=r") + (unspec:DI [(match_operand:DI 1 "tls_symbolic_operand" "")] + UNSPEC_TLSDESC))] + "TARGET_64BIT && TARGET_GNU2_TLS" + "lea{q}\t{%a1@TLSDESC(%%rip), %0|%0, %a1@TLSDESC[rip]}" + [(set_attr "type" "lea") + (set_attr "mode" "DI") + (set_attr "length" "7") + (set_attr "length_address" "4")]) + +(define_insn "*tls_dynamic_call_64" + [(set (match_operand:DI 0 "register_operand" "=a") + (unspec:DI [(match_operand:DI 1 "tls_symbolic_operand" "") + (match_operand:DI 2 "register_operand" "0") + (reg:DI SP_REG)] + UNSPEC_TLSDESC)) + (clobber (reg:CC FLAGS_REG))] + "TARGET_64BIT && TARGET_GNU2_TLS" + "call\t{*%a1@TLSCALL(%2)|[QWORD PTR [%2+%a1@TLSCALL]]}" + [(set_attr "type" "call") + (set_attr "length" "2") + (set_attr "length_address" "0")]) + +(define_insn_and_split "*tls_dynamic_gnu2_combine_64" + [(set (match_operand:DI 0 "register_operand" "=&a") + (plus:DI + (unspec:DI [(match_operand:DI 2 "tls_modbase_operand" "") + (match_operand:DI 3 "" "") + (reg:DI SP_REG)] + UNSPEC_TLSDESC) + (const:DI (unspec:DI + [(match_operand:DI 1 "tls_symbolic_operand" "")] + UNSPEC_DTPOFF)))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_64BIT && TARGET_GNU2_TLS" + "#" + "" + [(set (match_dup 0) (match_dup 4))] +{ + operands[4] = can_create_pseudo_p () ? gen_reg_rtx (Pmode) : operands[0]; + emit_insn (gen_tls_dynamic_gnu2_64 (operands[4], operands[1])); +}) + +;; These patterns match the binary 387 instructions for addM3, subM3, +;; mulM3 and divM3. There are three patterns for each of DFmode and +;; SFmode. The first is the normal insn, the second the same insn but +;; with one operand a conversion, and the third the same insn but with +;; the other operand a conversion. The conversion may be SFmode or +;; SImode if the target mode DFmode, but only SImode if the target mode +;; is SFmode. + +;; Gcc is slightly more smart about handling normal two address instructions +;; so use special patterns for add and mull. 
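As the comment above says, each x87 binary operator gets one plain pattern plus variants in which one operand arrives through a conversion; the integer-conversion forms let the fiadd/fisub/fimul/fidiv family consume a 16- or 32-bit integer directly from memory when the pattern's tuning condition allows it. A rough example of source the converted-operand patterns are meant for (whether the fiaddl form is actually chosen depends on -mfpmath, the tuning flags and register allocation):

    /* Mixed float/int arithmetic.  With 387 math and a tuning that
       enables the FIOP forms, the int operand can feed fiaddl from
       memory instead of going through a separate fild conversion.  */
    double
    scale (double acc, int n)
    {
      return acc + (double) n;
    }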
+ +(define_insn "*fop__comm_mixed_avx" + [(set (match_operand:MODEF 0 "register_operand" "=f,x") + (match_operator:MODEF 3 "binary_fp_operator" + [(match_operand:MODEF 1 "nonimmediate_operand" "%0,x") + (match_operand:MODEF 2 "nonimmediate_operand" "fm,xm")]))] + "AVX_FLOAT_MODE_P (mode) && TARGET_MIX_SSE_I387 + && COMMUTATIVE_ARITH_P (operands[3]) + && !(MEM_P (operands[1]) && MEM_P (operands[2]))" + "* return output_387_binary_op (insn, operands);" + [(set (attr "type") + (if_then_else (eq_attr "alternative" "1") + (if_then_else (match_operand:MODEF 3 "mult_operator" "") + (const_string "ssemul") + (const_string "sseadd")) + (if_then_else (match_operand:MODEF 3 "mult_operator" "") + (const_string "fmul") + (const_string "fop")))) + (set_attr "prefix" "orig,maybe_vex") + (set_attr "mode" "")]) + +(define_insn "*fop__comm_mixed" + [(set (match_operand:MODEF 0 "register_operand" "=f,x") + (match_operator:MODEF 3 "binary_fp_operator" + [(match_operand:MODEF 1 "nonimmediate_operand" "%0,0") + (match_operand:MODEF 2 "nonimmediate_operand" "fm,xm")]))] + "SSE_FLOAT_MODE_P (mode) && TARGET_MIX_SSE_I387 + && COMMUTATIVE_ARITH_P (operands[3]) + && !(MEM_P (operands[1]) && MEM_P (operands[2]))" + "* return output_387_binary_op (insn, operands);" + [(set (attr "type") + (if_then_else (eq_attr "alternative" "1") + (if_then_else (match_operand:MODEF 3 "mult_operator" "") + (const_string "ssemul") + (const_string "sseadd")) + (if_then_else (match_operand:MODEF 3 "mult_operator" "") + (const_string "fmul") + (const_string "fop")))) + (set_attr "mode" "")]) + +(define_insn "*fop__comm_avx" + [(set (match_operand:MODEF 0 "register_operand" "=x") + (match_operator:MODEF 3 "binary_fp_operator" + [(match_operand:MODEF 1 "nonimmediate_operand" "%x") + (match_operand:MODEF 2 "nonimmediate_operand" "xm")]))] + "AVX_FLOAT_MODE_P (mode) && TARGET_SSE_MATH + && COMMUTATIVE_ARITH_P (operands[3]) + && !(MEM_P (operands[1]) && MEM_P (operands[2]))" + "* return output_387_binary_op (insn, operands);" + [(set (attr "type") + (if_then_else (match_operand:MODEF 3 "mult_operator" "") + (const_string "ssemul") + (const_string "sseadd"))) + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + +(define_insn "*fop__comm_sse" + [(set (match_operand:MODEF 0 "register_operand" "=x") + (match_operator:MODEF 3 "binary_fp_operator" + [(match_operand:MODEF 1 "nonimmediate_operand" "%0") + (match_operand:MODEF 2 "nonimmediate_operand" "xm")]))] + "SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH + && COMMUTATIVE_ARITH_P (operands[3]) + && !(MEM_P (operands[1]) && MEM_P (operands[2]))" + "* return output_387_binary_op (insn, operands);" + [(set (attr "type") + (if_then_else (match_operand:MODEF 3 "mult_operator" "") + (const_string "ssemul") + (const_string "sseadd"))) + (set_attr "mode" "")]) + +(define_insn "*fop__comm_i387" + [(set (match_operand:MODEF 0 "register_operand" "=f") + (match_operator:MODEF 3 "binary_fp_operator" + [(match_operand:MODEF 1 "nonimmediate_operand" "%0") + (match_operand:MODEF 2 "nonimmediate_operand" "fm")]))] + "TARGET_80387 && X87_ENABLE_ARITH (mode) + && COMMUTATIVE_ARITH_P (operands[3]) + && !(MEM_P (operands[1]) && MEM_P (operands[2]))" + "* return output_387_binary_op (insn, operands);" + [(set (attr "type") + (if_then_else (match_operand:MODEF 3 "mult_operator" "") + (const_string "fmul") + (const_string "fop"))) + (set_attr "mode" "")]) + +(define_insn "*fop__1_mixed_avx" + [(set (match_operand:MODEF 0 "register_operand" "=f,f,x") + (match_operator:MODEF 3 "binary_fp_operator" + [(match_operand:MODEF 1 
"nonimmediate_operand" "0,fm,x") + (match_operand:MODEF 2 "nonimmediate_operand" "fm,0,xm")]))] + "AVX_FLOAT_MODE_P (mode) && TARGET_MIX_SSE_I387 + && !COMMUTATIVE_ARITH_P (operands[3]) + && !(MEM_P (operands[1]) && MEM_P (operands[2]))" + "* return output_387_binary_op (insn, operands);" + [(set (attr "type") + (cond [(and (eq_attr "alternative" "2") + (match_operand:MODEF 3 "mult_operator" "")) + (const_string "ssemul") + (and (eq_attr "alternative" "2") + (match_operand:MODEF 3 "div_operator" "")) + (const_string "ssediv") + (eq_attr "alternative" "2") + (const_string "sseadd") + (match_operand:MODEF 3 "mult_operator" "") + (const_string "fmul") + (match_operand:MODEF 3 "div_operator" "") + (const_string "fdiv") + ] + (const_string "fop"))) + (set_attr "prefix" "orig,orig,maybe_vex") + (set_attr "mode" "")]) + +(define_insn "*fop__1_mixed" + [(set (match_operand:MODEF 0 "register_operand" "=f,f,x") + (match_operator:MODEF 3 "binary_fp_operator" + [(match_operand:MODEF 1 "nonimmediate_operand" "0,fm,0") + (match_operand:MODEF 2 "nonimmediate_operand" "fm,0,xm")]))] + "SSE_FLOAT_MODE_P (mode) && TARGET_MIX_SSE_I387 + && !COMMUTATIVE_ARITH_P (operands[3]) + && !(MEM_P (operands[1]) && MEM_P (operands[2]))" + "* return output_387_binary_op (insn, operands);" + [(set (attr "type") + (cond [(and (eq_attr "alternative" "2") + (match_operand:MODEF 3 "mult_operator" "")) + (const_string "ssemul") + (and (eq_attr "alternative" "2") + (match_operand:MODEF 3 "div_operator" "")) + (const_string "ssediv") + (eq_attr "alternative" "2") + (const_string "sseadd") + (match_operand:MODEF 3 "mult_operator" "") + (const_string "fmul") + (match_operand:MODEF 3 "div_operator" "") + (const_string "fdiv") + ] + (const_string "fop"))) + (set_attr "mode" "")]) + +(define_insn "*rcpsf2_sse" + [(set (match_operand:SF 0 "register_operand" "=x") + (unspec:SF [(match_operand:SF 1 "nonimmediate_operand" "xm")] + UNSPEC_RCP))] + "TARGET_SSE_MATH" + "%vrcpss\t{%1, %d0|%d0, %1}" + [(set_attr "type" "sse") + (set_attr "atom_sse_attr" "rcp") + (set_attr "prefix" "maybe_vex") + (set_attr "mode" "SF")]) + +(define_insn "*fop__1_avx" + [(set (match_operand:MODEF 0 "register_operand" "=x") + (match_operator:MODEF 3 "binary_fp_operator" + [(match_operand:MODEF 1 "register_operand" "x") + (match_operand:MODEF 2 "nonimmediate_operand" "xm")]))] + "AVX_FLOAT_MODE_P (mode) && TARGET_SSE_MATH + && !COMMUTATIVE_ARITH_P (operands[3])" + "* return output_387_binary_op (insn, operands);" + [(set (attr "type") + (cond [(match_operand:MODEF 3 "mult_operator" "") + (const_string "ssemul") + (match_operand:MODEF 3 "div_operator" "") + (const_string "ssediv") + ] + (const_string "sseadd"))) + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + +(define_insn "*fop__1_sse" + [(set (match_operand:MODEF 0 "register_operand" "=x") + (match_operator:MODEF 3 "binary_fp_operator" + [(match_operand:MODEF 1 "register_operand" "0") + (match_operand:MODEF 2 "nonimmediate_operand" "xm")]))] + "SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH + && !COMMUTATIVE_ARITH_P (operands[3])" + "* return output_387_binary_op (insn, operands);" + [(set (attr "type") + (cond [(match_operand:MODEF 3 "mult_operator" "") + (const_string "ssemul") + (match_operand:MODEF 3 "div_operator" "") + (const_string "ssediv") + ] + (const_string "sseadd"))) + (set_attr "mode" "")]) + +;; This pattern is not fully shadowed by the pattern above. 
+(define_insn "*fop__1_i387" + [(set (match_operand:MODEF 0 "register_operand" "=f,f") + (match_operator:MODEF 3 "binary_fp_operator" + [(match_operand:MODEF 1 "nonimmediate_operand" "0,fm") + (match_operand:MODEF 2 "nonimmediate_operand" "fm,0")]))] + "TARGET_80387 && X87_ENABLE_ARITH (mode) + && !(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) + && !COMMUTATIVE_ARITH_P (operands[3]) + && !(MEM_P (operands[1]) && MEM_P (operands[2]))" + "* return output_387_binary_op (insn, operands);" + [(set (attr "type") + (cond [(match_operand:MODEF 3 "mult_operator" "") + (const_string "fmul") + (match_operand:MODEF 3 "div_operator" "") + (const_string "fdiv") + ] + (const_string "fop"))) + (set_attr "mode" "")]) + +;; ??? Add SSE splitters for these! +(define_insn "*fop__2_i387" + [(set (match_operand:MODEF 0 "register_operand" "=f,f") + (match_operator:MODEF 3 "binary_fp_operator" + [(float:MODEF + (match_operand:X87MODEI12 1 "nonimmediate_operand" "m,?r")) + (match_operand:MODEF 2 "register_operand" "0,0")]))] + "TARGET_80387 && X87_ENABLE_FLOAT (mode, mode) + && !(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) + && (TARGET_USE_MODE_FIOP || optimize_function_for_size_p (cfun))" + "* return which_alternative ? \"#\" : output_387_binary_op (insn, operands);" + [(set (attr "type") + (cond [(match_operand:MODEF 3 "mult_operator" "") + (const_string "fmul") + (match_operand:MODEF 3 "div_operator" "") + (const_string "fdiv") + ] + (const_string "fop"))) + (set_attr "fp_int_src" "true") + (set_attr "mode" "")]) + +(define_insn "*fop__3_i387" + [(set (match_operand:MODEF 0 "register_operand" "=f,f") + (match_operator:MODEF 3 "binary_fp_operator" + [(match_operand:MODEF 1 "register_operand" "0,0") + (float:MODEF + (match_operand:X87MODEI12 2 "nonimmediate_operand" "m,?r"))]))] + "TARGET_80387 && X87_ENABLE_FLOAT (mode, mode) + && !(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) + && (TARGET_USE_MODE_FIOP || optimize_function_for_size_p (cfun))" + "* return which_alternative ? 
\"#\" : output_387_binary_op (insn, operands);" + [(set (attr "type") + (cond [(match_operand:MODEF 3 "mult_operator" "") + (const_string "fmul") + (match_operand:MODEF 3 "div_operator" "") + (const_string "fdiv") + ] + (const_string "fop"))) + (set_attr "fp_int_src" "true") + (set_attr "mode" "")]) + +(define_insn "*fop_df_4_i387" + [(set (match_operand:DF 0 "register_operand" "=f,f") + (match_operator:DF 3 "binary_fp_operator" + [(float_extend:DF + (match_operand:SF 1 "nonimmediate_operand" "fm,0")) + (match_operand:DF 2 "register_operand" "0,f")]))] + "TARGET_80387 && X87_ENABLE_ARITH (DFmode) + && !(TARGET_SSE2 && TARGET_SSE_MATH) + && !(MEM_P (operands[1]) && MEM_P (operands[2]))" + "* return output_387_binary_op (insn, operands);" + [(set (attr "type") + (cond [(match_operand:DF 3 "mult_operator" "") + (const_string "fmul") + (match_operand:DF 3 "div_operator" "") + (const_string "fdiv") + ] + (const_string "fop"))) + (set_attr "mode" "SF")]) + +(define_insn "*fop_df_5_i387" + [(set (match_operand:DF 0 "register_operand" "=f,f") + (match_operator:DF 3 "binary_fp_operator" + [(match_operand:DF 1 "register_operand" "0,f") + (float_extend:DF + (match_operand:SF 2 "nonimmediate_operand" "fm,0"))]))] + "TARGET_80387 && X87_ENABLE_ARITH (DFmode) + && !(TARGET_SSE2 && TARGET_SSE_MATH)" + "* return output_387_binary_op (insn, operands);" + [(set (attr "type") + (cond [(match_operand:DF 3 "mult_operator" "") + (const_string "fmul") + (match_operand:DF 3 "div_operator" "") + (const_string "fdiv") + ] + (const_string "fop"))) + (set_attr "mode" "SF")]) + +(define_insn "*fop_df_6_i387" + [(set (match_operand:DF 0 "register_operand" "=f,f") + (match_operator:DF 3 "binary_fp_operator" + [(float_extend:DF + (match_operand:SF 1 "register_operand" "0,f")) + (float_extend:DF + (match_operand:SF 2 "nonimmediate_operand" "fm,0"))]))] + "TARGET_80387 && X87_ENABLE_ARITH (DFmode) + && !(TARGET_SSE2 && TARGET_SSE_MATH)" + "* return output_387_binary_op (insn, operands);" + [(set (attr "type") + (cond [(match_operand:DF 3 "mult_operator" "") + (const_string "fmul") + (match_operand:DF 3 "div_operator" "") + (const_string "fdiv") + ] + (const_string "fop"))) + (set_attr "mode" "SF")]) + +(define_insn "*fop_xf_comm_i387" + [(set (match_operand:XF 0 "register_operand" "=f") + (match_operator:XF 3 "binary_fp_operator" + [(match_operand:XF 1 "register_operand" "%0") + (match_operand:XF 2 "register_operand" "f")]))] + "TARGET_80387 + && COMMUTATIVE_ARITH_P (operands[3])" + "* return output_387_binary_op (insn, operands);" + [(set (attr "type") + (if_then_else (match_operand:XF 3 "mult_operator" "") + (const_string "fmul") + (const_string "fop"))) + (set_attr "mode" "XF")]) + +(define_insn "*fop_xf_1_i387" + [(set (match_operand:XF 0 "register_operand" "=f,f") + (match_operator:XF 3 "binary_fp_operator" + [(match_operand:XF 1 "register_operand" "0,f") + (match_operand:XF 2 "register_operand" "f,0")]))] + "TARGET_80387 + && !COMMUTATIVE_ARITH_P (operands[3])" + "* return output_387_binary_op (insn, operands);" + [(set (attr "type") + (cond [(match_operand:XF 3 "mult_operator" "") + (const_string "fmul") + (match_operand:XF 3 "div_operator" "") + (const_string "fdiv") + ] + (const_string "fop"))) + (set_attr "mode" "XF")]) + +(define_insn "*fop_xf_2_i387" + [(set (match_operand:XF 0 "register_operand" "=f,f") + (match_operator:XF 3 "binary_fp_operator" + [(float:XF + (match_operand:X87MODEI12 1 "nonimmediate_operand" "m,?r")) + (match_operand:XF 2 "register_operand" "0,0")]))] + "TARGET_80387 && 
(TARGET_USE_MODE_FIOP || optimize_function_for_size_p (cfun))" + "* return which_alternative ? \"#\" : output_387_binary_op (insn, operands);" + [(set (attr "type") + (cond [(match_operand:XF 3 "mult_operator" "") + (const_string "fmul") + (match_operand:XF 3 "div_operator" "") + (const_string "fdiv") + ] + (const_string "fop"))) + (set_attr "fp_int_src" "true") + (set_attr "mode" "")]) + +(define_insn "*fop_xf_3_i387" + [(set (match_operand:XF 0 "register_operand" "=f,f") + (match_operator:XF 3 "binary_fp_operator" + [(match_operand:XF 1 "register_operand" "0,0") + (float:XF + (match_operand:X87MODEI12 2 "nonimmediate_operand" "m,?r"))]))] + "TARGET_80387 && (TARGET_USE_MODE_FIOP || optimize_function_for_size_p (cfun))" + "* return which_alternative ? \"#\" : output_387_binary_op (insn, operands);" + [(set (attr "type") + (cond [(match_operand:XF 3 "mult_operator" "") + (const_string "fmul") + (match_operand:XF 3 "div_operator" "") + (const_string "fdiv") + ] + (const_string "fop"))) + (set_attr "fp_int_src" "true") + (set_attr "mode" "")]) + +(define_insn "*fop_xf_4_i387" + [(set (match_operand:XF 0 "register_operand" "=f,f") + (match_operator:XF 3 "binary_fp_operator" + [(float_extend:XF + (match_operand:MODEF 1 "nonimmediate_operand" "fm,0")) + (match_operand:XF 2 "register_operand" "0,f")]))] + "TARGET_80387" + "* return output_387_binary_op (insn, operands);" + [(set (attr "type") + (cond [(match_operand:XF 3 "mult_operator" "") + (const_string "fmul") + (match_operand:XF 3 "div_operator" "") + (const_string "fdiv") + ] + (const_string "fop"))) + (set_attr "mode" "")]) + +(define_insn "*fop_xf_5_i387" + [(set (match_operand:XF 0 "register_operand" "=f,f") + (match_operator:XF 3 "binary_fp_operator" + [(match_operand:XF 1 "register_operand" "0,f") + (float_extend:XF + (match_operand:MODEF 2 "nonimmediate_operand" "fm,0"))]))] + "TARGET_80387" + "* return output_387_binary_op (insn, operands);" + [(set (attr "type") + (cond [(match_operand:XF 3 "mult_operator" "") + (const_string "fmul") + (match_operand:XF 3 "div_operator" "") + (const_string "fdiv") + ] + (const_string "fop"))) + (set_attr "mode" "")]) + +(define_insn "*fop_xf_6_i387" + [(set (match_operand:XF 0 "register_operand" "=f,f") + (match_operator:XF 3 "binary_fp_operator" + [(float_extend:XF + (match_operand:MODEF 1 "register_operand" "0,f")) + (float_extend:XF + (match_operand:MODEF 2 "nonimmediate_operand" "fm,0"))]))] + "TARGET_80387" + "* return output_387_binary_op (insn, operands);" + [(set (attr "type") + (cond [(match_operand:XF 3 "mult_operator" "") + (const_string "fmul") + (match_operand:XF 3 "div_operator" "") + (const_string "fdiv") + ] + (const_string "fop"))) + (set_attr "mode" "")]) + +(define_split + [(set (match_operand 0 "register_operand" "") + (match_operator 3 "binary_fp_operator" + [(float (match_operand:X87MODEI12 1 "register_operand" "")) + (match_operand 2 "register_operand" "")]))] + "reload_completed + && X87_FLOAT_MODE_P (GET_MODE (operands[0])) + && X87_ENABLE_FLOAT (GET_MODE (operands[0]), GET_MODE (operands[1]))" + [(const_int 0)] +{ + operands[4] = ix86_force_to_memory (GET_MODE (operands[1]), operands[1]); + operands[4] = gen_rtx_FLOAT (GET_MODE (operands[0]), operands[4]); + emit_insn (gen_rtx_SET (VOIDmode, operands[0], + gen_rtx_fmt_ee (GET_CODE (operands[3]), + GET_MODE (operands[3]), + operands[4], + operands[2]))); + ix86_free_from_memory (GET_MODE (operands[1])); + DONE; +}) + +(define_split + [(set (match_operand 0 "register_operand" "") + (match_operator 3 "binary_fp_operator" + 
[(match_operand 1 "register_operand" "") + (float (match_operand:X87MODEI12 2 "register_operand" ""))]))] + "reload_completed + && X87_FLOAT_MODE_P (GET_MODE (operands[0])) + && X87_ENABLE_FLOAT (GET_MODE (operands[0]), GET_MODE (operands[2]))" + [(const_int 0)] +{ + operands[4] = ix86_force_to_memory (GET_MODE (operands[2]), operands[2]); + operands[4] = gen_rtx_FLOAT (GET_MODE (operands[0]), operands[4]); + emit_insn (gen_rtx_SET (VOIDmode, operands[0], + gen_rtx_fmt_ee (GET_CODE (operands[3]), + GET_MODE (operands[3]), + operands[1], + operands[4]))); + ix86_free_from_memory (GET_MODE (operands[2])); + DONE; +}) + +;; FPU special functions. + +;; This pattern implements a no-op XFmode truncation for +;; all fancy i386 XFmode math functions. + +(define_insn "truncxf2_i387_noop_unspec" + [(set (match_operand:MODEF 0 "register_operand" "=f") + (unspec:MODEF [(match_operand:XF 1 "register_operand" "f")] + UNSPEC_TRUNC_NOOP))] + "TARGET_USE_FANCY_MATH_387" + "* return output_387_reg_move (insn, operands);" + [(set_attr "type" "fmov") + (set_attr "mode" "")]) + +(define_insn "sqrtxf2" + [(set (match_operand:XF 0 "register_operand" "=f") + (sqrt:XF (match_operand:XF 1 "register_operand" "0")))] + "TARGET_USE_FANCY_MATH_387" + "fsqrt" + [(set_attr "type" "fpspc") + (set_attr "mode" "XF") + (set_attr "athlon_decode" "direct") + (set_attr "amdfam10_decode" "direct") + (set_attr "bdver1_decode" "direct")]) + +(define_insn "sqrt_extendxf2_i387" + [(set (match_operand:XF 0 "register_operand" "=f") + (sqrt:XF + (float_extend:XF + (match_operand:MODEF 1 "register_operand" "0"))))] + "TARGET_USE_FANCY_MATH_387" + "fsqrt" + [(set_attr "type" "fpspc") + (set_attr "mode" "XF") + (set_attr "athlon_decode" "direct") + (set_attr "amdfam10_decode" "direct") + (set_attr "bdver1_decode" "direct")]) + +(define_insn "*rsqrtsf2_sse" + [(set (match_operand:SF 0 "register_operand" "=x") + (unspec:SF [(match_operand:SF 1 "nonimmediate_operand" "xm")] + UNSPEC_RSQRT))] + "TARGET_SSE_MATH" + "%vrsqrtss\t{%1, %d0|%d0, %1}" + [(set_attr "type" "sse") + (set_attr "atom_sse_attr" "rcp") + (set_attr "prefix" "maybe_vex") + (set_attr "mode" "SF")]) + +(define_expand "rsqrtsf2" + [(set (match_operand:SF 0 "register_operand" "") + (unspec:SF [(match_operand:SF 1 "nonimmediate_operand" "")] + UNSPEC_RSQRT))] + "TARGET_SSE_MATH" +{ + ix86_emit_swsqrtsf (operands[0], operands[1], SFmode, 1); + DONE; +}) + +(define_insn "*sqrt2_sse" + [(set (match_operand:MODEF 0 "register_operand" "=x") + (sqrt:MODEF + (match_operand:MODEF 1 "nonimmediate_operand" "xm")))] + "SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH" + "%vsqrts\t{%1, %d0|%d0, %1}" + [(set_attr "type" "sse") + (set_attr "atom_sse_attr" "sqrt") + (set_attr "prefix" "maybe_vex") + (set_attr "mode" "") + (set_attr "athlon_decode" "*") + (set_attr "amdfam10_decode" "*") + (set_attr "bdver1_decode" "*")]) + +(define_expand "sqrt2" + [(set (match_operand:MODEF 0 "register_operand" "") + (sqrt:MODEF + (match_operand:MODEF 1 "nonimmediate_operand" "")))] + "(TARGET_USE_FANCY_MATH_387 && X87_ENABLE_ARITH (mode)) + || (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)" +{ + if (mode == SFmode + && TARGET_SSE_MATH && TARGET_RECIP && !optimize_function_for_size_p (cfun) + && flag_finite_math_only && !flag_trapping_math + && flag_unsafe_math_optimizations) + { + ix86_emit_swsqrtsf (operands[0], operands[1], SFmode, 0); + DONE; + } + + if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)) + { + rtx op0 = gen_reg_rtx (XFmode); + rtx op1 = force_reg (mode, operands[1]); + + emit_insn 
(gen_sqrt_extendxf2_i387 (op0, op1)); + emit_insn (gen_truncxf2_i387_noop_unspec (operands[0], op0)); + DONE; + } +}) + +(define_insn "fpremxf4_i387" + [(set (match_operand:XF 0 "register_operand" "=f") + (unspec:XF [(match_operand:XF 2 "register_operand" "0") + (match_operand:XF 3 "register_operand" "1")] + UNSPEC_FPREM_F)) + (set (match_operand:XF 1 "register_operand" "=u") + (unspec:XF [(match_dup 2) (match_dup 3)] + UNSPEC_FPREM_U)) + (set (reg:CCFP FPSR_REG) + (unspec:CCFP [(match_dup 2) (match_dup 3)] + UNSPEC_C2_FLAG))] + "TARGET_USE_FANCY_MATH_387" + "fprem" + [(set_attr "type" "fpspc") + (set_attr "mode" "XF")]) + +(define_expand "fmodxf3" + [(use (match_operand:XF 0 "register_operand" "")) + (use (match_operand:XF 1 "general_operand" "")) + (use (match_operand:XF 2 "general_operand" ""))] + "TARGET_USE_FANCY_MATH_387" +{ + rtx label = gen_label_rtx (); + + rtx op1 = gen_reg_rtx (XFmode); + rtx op2 = gen_reg_rtx (XFmode); + + emit_move_insn (op2, operands[2]); + emit_move_insn (op1, operands[1]); + + emit_label (label); + emit_insn (gen_fpremxf4_i387 (op1, op2, op1, op2)); + ix86_emit_fp_unordered_jump (label); + LABEL_NUSES (label) = 1; + + emit_move_insn (operands[0], op1); + DONE; +}) + +(define_expand "fmod3" + [(use (match_operand:MODEF 0 "register_operand" "")) + (use (match_operand:MODEF 1 "general_operand" "")) + (use (match_operand:MODEF 2 "general_operand" ""))] + "TARGET_USE_FANCY_MATH_387" +{ + rtx (*gen_truncxf) (rtx, rtx); + + rtx label = gen_label_rtx (); + + rtx op1 = gen_reg_rtx (XFmode); + rtx op2 = gen_reg_rtx (XFmode); + + emit_insn (gen_extendxf2 (op2, operands[2])); + emit_insn (gen_extendxf2 (op1, operands[1])); + + emit_label (label); + emit_insn (gen_fpremxf4_i387 (op1, op2, op1, op2)); + ix86_emit_fp_unordered_jump (label); + LABEL_NUSES (label) = 1; + + /* Truncate the result properly for strict SSE math. 
*/ + if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH + && !TARGET_MIX_SSE_I387) + gen_truncxf = gen_truncxf2; + else + gen_truncxf = gen_truncxf2_i387_noop_unspec; + + emit_insn (gen_truncxf (operands[0], op1)); + DONE; +}) + +(define_insn "fprem1xf4_i387" + [(set (match_operand:XF 0 "register_operand" "=f") + (unspec:XF [(match_operand:XF 2 "register_operand" "0") + (match_operand:XF 3 "register_operand" "1")] + UNSPEC_FPREM1_F)) + (set (match_operand:XF 1 "register_operand" "=u") + (unspec:XF [(match_dup 2) (match_dup 3)] + UNSPEC_FPREM1_U)) + (set (reg:CCFP FPSR_REG) + (unspec:CCFP [(match_dup 2) (match_dup 3)] + UNSPEC_C2_FLAG))] + "TARGET_USE_FANCY_MATH_387" + "fprem1" + [(set_attr "type" "fpspc") + (set_attr "mode" "XF")]) + +(define_expand "remainderxf3" + [(use (match_operand:XF 0 "register_operand" "")) + (use (match_operand:XF 1 "general_operand" "")) + (use (match_operand:XF 2 "general_operand" ""))] + "TARGET_USE_FANCY_MATH_387" +{ + rtx label = gen_label_rtx (); + + rtx op1 = gen_reg_rtx (XFmode); + rtx op2 = gen_reg_rtx (XFmode); + + emit_move_insn (op2, operands[2]); + emit_move_insn (op1, operands[1]); + + emit_label (label); + emit_insn (gen_fprem1xf4_i387 (op1, op2, op1, op2)); + ix86_emit_fp_unordered_jump (label); + LABEL_NUSES (label) = 1; + + emit_move_insn (operands[0], op1); + DONE; +}) + +(define_expand "remainder3" + [(use (match_operand:MODEF 0 "register_operand" "")) + (use (match_operand:MODEF 1 "general_operand" "")) + (use (match_operand:MODEF 2 "general_operand" ""))] + "TARGET_USE_FANCY_MATH_387" +{ + rtx (*gen_truncxf) (rtx, rtx); + + rtx label = gen_label_rtx (); + + rtx op1 = gen_reg_rtx (XFmode); + rtx op2 = gen_reg_rtx (XFmode); + + emit_insn (gen_extendxf2 (op2, operands[2])); + emit_insn (gen_extendxf2 (op1, operands[1])); + + emit_label (label); + + emit_insn (gen_fprem1xf4_i387 (op1, op2, op1, op2)); + ix86_emit_fp_unordered_jump (label); + LABEL_NUSES (label) = 1; + + /* Truncate the result properly for strict SSE math. 
*/ + if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH + && !TARGET_MIX_SSE_I387) + gen_truncxf = gen_truncxf2; + else + gen_truncxf = gen_truncxf2_i387_noop_unspec; + + emit_insn (gen_truncxf (operands[0], op1)); + DONE; +}) + +(define_insn "*sinxf2_i387" + [(set (match_operand:XF 0 "register_operand" "=f") + (unspec:XF [(match_operand:XF 1 "register_operand" "0")] UNSPEC_SIN))] + "TARGET_USE_FANCY_MATH_387 + && flag_unsafe_math_optimizations" + "fsin" + [(set_attr "type" "fpspc") + (set_attr "mode" "XF")]) + +(define_insn "*sin_extendxf2_i387" + [(set (match_operand:XF 0 "register_operand" "=f") + (unspec:XF [(float_extend:XF + (match_operand:MODEF 1 "register_operand" "0"))] + UNSPEC_SIN))] + "TARGET_USE_FANCY_MATH_387 + && (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) + || TARGET_MIX_SSE_I387) + && flag_unsafe_math_optimizations" + "fsin" + [(set_attr "type" "fpspc") + (set_attr "mode" "XF")]) + +(define_insn "*cosxf2_i387" + [(set (match_operand:XF 0 "register_operand" "=f") + (unspec:XF [(match_operand:XF 1 "register_operand" "0")] UNSPEC_COS))] + "TARGET_USE_FANCY_MATH_387 + && flag_unsafe_math_optimizations" + "fcos" + [(set_attr "type" "fpspc") + (set_attr "mode" "XF")]) + +(define_insn "*cos_extendxf2_i387" + [(set (match_operand:XF 0 "register_operand" "=f") + (unspec:XF [(float_extend:XF + (match_operand:MODEF 1 "register_operand" "0"))] + UNSPEC_COS))] + "TARGET_USE_FANCY_MATH_387 + && (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) + || TARGET_MIX_SSE_I387) + && flag_unsafe_math_optimizations" + "fcos" + [(set_attr "type" "fpspc") + (set_attr "mode" "XF")]) + +;; When sincos pattern is defined, sin and cos builtin functions will be +;; expanded to sincos pattern with one of its outputs left unused. +;; CSE pass will figure out if two sincos patterns can be combined, +;; otherwise sincos pattern will be split back to sin or cos pattern, +;; depending on the unused output. 
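As a concrete illustration of the comment above (an editorial sketch, not part of the upstream patch): with x87 math and -funsafe-math-optimizations, each of the two calls below expands to the sincos pattern with one result left unused, and CSE is then expected to merge the pair into a single fsincos.

#include <math.h>

/* Illustrative only: sin and cos of the same argument, so the two
   half-used sincos patterns can be combined by CSE.  */
void
polar_to_cart (double r, double theta, double *x, double *y)
{
  *x = r * cos (theta);
  *y = r * sin (theta);
}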
+ +(define_insn "sincosxf3" + [(set (match_operand:XF 0 "register_operand" "=f") + (unspec:XF [(match_operand:XF 2 "register_operand" "0")] + UNSPEC_SINCOS_COS)) + (set (match_operand:XF 1 "register_operand" "=u") + (unspec:XF [(match_dup 2)] UNSPEC_SINCOS_SIN))] + "TARGET_USE_FANCY_MATH_387 + && flag_unsafe_math_optimizations" + "fsincos" + [(set_attr "type" "fpspc") + (set_attr "mode" "XF")]) + +(define_split + [(set (match_operand:XF 0 "register_operand" "") + (unspec:XF [(match_operand:XF 2 "register_operand" "")] + UNSPEC_SINCOS_COS)) + (set (match_operand:XF 1 "register_operand" "") + (unspec:XF [(match_dup 2)] UNSPEC_SINCOS_SIN))] + "find_regno_note (insn, REG_UNUSED, REGNO (operands[0])) + && !(reload_completed || reload_in_progress)" + [(set (match_dup 1) (unspec:XF [(match_dup 2)] UNSPEC_SIN))]) + +(define_split + [(set (match_operand:XF 0 "register_operand" "") + (unspec:XF [(match_operand:XF 2 "register_operand" "")] + UNSPEC_SINCOS_COS)) + (set (match_operand:XF 1 "register_operand" "") + (unspec:XF [(match_dup 2)] UNSPEC_SINCOS_SIN))] + "find_regno_note (insn, REG_UNUSED, REGNO (operands[1])) + && !(reload_completed || reload_in_progress)" + [(set (match_dup 0) (unspec:XF [(match_dup 2)] UNSPEC_COS))]) + +(define_insn "sincos_extendxf3_i387" + [(set (match_operand:XF 0 "register_operand" "=f") + (unspec:XF [(float_extend:XF + (match_operand:MODEF 2 "register_operand" "0"))] + UNSPEC_SINCOS_COS)) + (set (match_operand:XF 1 "register_operand" "=u") + (unspec:XF [(float_extend:XF (match_dup 2))] UNSPEC_SINCOS_SIN))] + "TARGET_USE_FANCY_MATH_387 + && (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) + || TARGET_MIX_SSE_I387) + && flag_unsafe_math_optimizations" + "fsincos" + [(set_attr "type" "fpspc") + (set_attr "mode" "XF")]) + +(define_split + [(set (match_operand:XF 0 "register_operand" "") + (unspec:XF [(float_extend:XF + (match_operand:MODEF 2 "register_operand" ""))] + UNSPEC_SINCOS_COS)) + (set (match_operand:XF 1 "register_operand" "") + (unspec:XF [(float_extend:XF (match_dup 2))] UNSPEC_SINCOS_SIN))] + "find_regno_note (insn, REG_UNUSED, REGNO (operands[0])) + && !(reload_completed || reload_in_progress)" + [(set (match_dup 1) + (unspec:XF [(float_extend:XF (match_dup 2))] UNSPEC_SIN))]) + +(define_split + [(set (match_operand:XF 0 "register_operand" "") + (unspec:XF [(float_extend:XF + (match_operand:MODEF 2 "register_operand" ""))] + UNSPEC_SINCOS_COS)) + (set (match_operand:XF 1 "register_operand" "") + (unspec:XF [(float_extend:XF (match_dup 2))] UNSPEC_SINCOS_SIN))] + "find_regno_note (insn, REG_UNUSED, REGNO (operands[1])) + && !(reload_completed || reload_in_progress)" + [(set (match_dup 0) + (unspec:XF [(float_extend:XF (match_dup 2))] UNSPEC_COS))]) + +(define_expand "sincos3" + [(use (match_operand:MODEF 0 "register_operand" "")) + (use (match_operand:MODEF 1 "register_operand" "")) + (use (match_operand:MODEF 2 "register_operand" ""))] + "TARGET_USE_FANCY_MATH_387 + && (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) + || TARGET_MIX_SSE_I387) + && flag_unsafe_math_optimizations" +{ + rtx op0 = gen_reg_rtx (XFmode); + rtx op1 = gen_reg_rtx (XFmode); + + emit_insn (gen_sincos_extendxf3_i387 (op0, op1, operands[2])); + emit_insn (gen_truncxf2_i387_noop (operands[0], op0)); + emit_insn (gen_truncxf2_i387_noop (operands[1], op1)); + DONE; +}) + +(define_insn "fptanxf4_i387" + [(set (match_operand:XF 0 "register_operand" "=f") + (match_operand:XF 3 "const_double_operand" "F")) + (set (match_operand:XF 1 "register_operand" "=u") + (unspec:XF [(match_operand:XF 2 
"register_operand" "0")] + UNSPEC_TAN))] + "TARGET_USE_FANCY_MATH_387 + && flag_unsafe_math_optimizations + && standard_80387_constant_p (operands[3]) == 2" + "fptan" + [(set_attr "type" "fpspc") + (set_attr "mode" "XF")]) + +(define_insn "fptan_extendxf4_i387" + [(set (match_operand:MODEF 0 "register_operand" "=f") + (match_operand:MODEF 3 "const_double_operand" "F")) + (set (match_operand:XF 1 "register_operand" "=u") + (unspec:XF [(float_extend:XF + (match_operand:MODEF 2 "register_operand" "0"))] + UNSPEC_TAN))] + "TARGET_USE_FANCY_MATH_387 + && (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) + || TARGET_MIX_SSE_I387) + && flag_unsafe_math_optimizations + && standard_80387_constant_p (operands[3]) == 2" + "fptan" + [(set_attr "type" "fpspc") + (set_attr "mode" "XF")]) + +(define_expand "tanxf2" + [(use (match_operand:XF 0 "register_operand" "")) + (use (match_operand:XF 1 "register_operand" ""))] + "TARGET_USE_FANCY_MATH_387 + && flag_unsafe_math_optimizations" +{ + rtx one = gen_reg_rtx (XFmode); + rtx op2 = CONST1_RTX (XFmode); /* fld1 */ + + emit_insn (gen_fptanxf4_i387 (one, operands[0], operands[1], op2)); + DONE; +}) + +(define_expand "tan2" + [(use (match_operand:MODEF 0 "register_operand" "")) + (use (match_operand:MODEF 1 "register_operand" ""))] + "TARGET_USE_FANCY_MATH_387 + && (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) + || TARGET_MIX_SSE_I387) + && flag_unsafe_math_optimizations" +{ + rtx op0 = gen_reg_rtx (XFmode); + + rtx one = gen_reg_rtx (mode); + rtx op2 = CONST1_RTX (mode); /* fld1 */ + + emit_insn (gen_fptan_extendxf4_i387 (one, op0, + operands[1], op2)); + emit_insn (gen_truncxf2_i387_noop (operands[0], op0)); + DONE; +}) + +(define_insn "*fpatanxf3_i387" + [(set (match_operand:XF 0 "register_operand" "=f") + (unspec:XF [(match_operand:XF 1 "register_operand" "0") + (match_operand:XF 2 "register_operand" "u")] + UNSPEC_FPATAN)) + (clobber (match_scratch:XF 3 "=2"))] + "TARGET_USE_FANCY_MATH_387 + && flag_unsafe_math_optimizations" + "fpatan" + [(set_attr "type" "fpspc") + (set_attr "mode" "XF")]) + +(define_insn "fpatan_extendxf3_i387" + [(set (match_operand:XF 0 "register_operand" "=f") + (unspec:XF [(float_extend:XF + (match_operand:MODEF 1 "register_operand" "0")) + (float_extend:XF + (match_operand:MODEF 2 "register_operand" "u"))] + UNSPEC_FPATAN)) + (clobber (match_scratch:XF 3 "=2"))] + "TARGET_USE_FANCY_MATH_387 + && (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) + || TARGET_MIX_SSE_I387) + && flag_unsafe_math_optimizations" + "fpatan" + [(set_attr "type" "fpspc") + (set_attr "mode" "XF")]) + +(define_expand "atan2xf3" + [(parallel [(set (match_operand:XF 0 "register_operand" "") + (unspec:XF [(match_operand:XF 2 "register_operand" "") + (match_operand:XF 1 "register_operand" "")] + UNSPEC_FPATAN)) + (clobber (match_scratch:XF 3 ""))])] + "TARGET_USE_FANCY_MATH_387 + && flag_unsafe_math_optimizations") + +(define_expand "atan23" + [(use (match_operand:MODEF 0 "register_operand" "")) + (use (match_operand:MODEF 1 "register_operand" "")) + (use (match_operand:MODEF 2 "register_operand" ""))] + "TARGET_USE_FANCY_MATH_387 + && (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) + || TARGET_MIX_SSE_I387) + && flag_unsafe_math_optimizations" +{ + rtx op0 = gen_reg_rtx (XFmode); + + emit_insn (gen_fpatan_extendxf3_i387 (op0, operands[2], operands[1])); + emit_insn (gen_truncxf2_i387_noop (operands[0], op0)); + DONE; +}) + +(define_expand "atanxf2" + [(parallel [(set (match_operand:XF 0 "register_operand" "") + (unspec:XF [(match_dup 2) + (match_operand:XF 1 
"register_operand" "")] + UNSPEC_FPATAN)) + (clobber (match_scratch:XF 3 ""))])] + "TARGET_USE_FANCY_MATH_387 + && flag_unsafe_math_optimizations" +{ + operands[2] = gen_reg_rtx (XFmode); + emit_move_insn (operands[2], CONST1_RTX (XFmode)); /* fld1 */ +}) + +(define_expand "atan2" + [(use (match_operand:MODEF 0 "register_operand" "")) + (use (match_operand:MODEF 1 "register_operand" ""))] + "TARGET_USE_FANCY_MATH_387 + && (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) + || TARGET_MIX_SSE_I387) + && flag_unsafe_math_optimizations" +{ + rtx op0 = gen_reg_rtx (XFmode); + + rtx op2 = gen_reg_rtx (mode); + emit_move_insn (op2, CONST1_RTX (mode)); /* fld1 */ + + emit_insn (gen_fpatan_extendxf3_i387 (op0, op2, operands[1])); + emit_insn (gen_truncxf2_i387_noop (operands[0], op0)); + DONE; +}) + +(define_expand "asinxf2" + [(set (match_dup 2) + (mult:XF (match_operand:XF 1 "register_operand" "") + (match_dup 1))) + (set (match_dup 4) (minus:XF (match_dup 3) (match_dup 2))) + (set (match_dup 5) (sqrt:XF (match_dup 4))) + (parallel [(set (match_operand:XF 0 "register_operand" "") + (unspec:XF [(match_dup 5) (match_dup 1)] + UNSPEC_FPATAN)) + (clobber (match_scratch:XF 6 ""))])] + "TARGET_USE_FANCY_MATH_387 + && flag_unsafe_math_optimizations" +{ + int i; + + if (optimize_insn_for_size_p ()) + FAIL; + + for (i = 2; i < 6; i++) + operands[i] = gen_reg_rtx (XFmode); + + emit_move_insn (operands[3], CONST1_RTX (XFmode)); /* fld1 */ +}) + +(define_expand "asin2" + [(use (match_operand:MODEF 0 "register_operand" "")) + (use (match_operand:MODEF 1 "general_operand" ""))] + "TARGET_USE_FANCY_MATH_387 + && (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) + || TARGET_MIX_SSE_I387) + && flag_unsafe_math_optimizations" +{ + rtx op0 = gen_reg_rtx (XFmode); + rtx op1 = gen_reg_rtx (XFmode); + + if (optimize_insn_for_size_p ()) + FAIL; + + emit_insn (gen_extendxf2 (op1, operands[1])); + emit_insn (gen_asinxf2 (op0, op1)); + emit_insn (gen_truncxf2_i387_noop (operands[0], op0)); + DONE; +}) + +(define_expand "acosxf2" + [(set (match_dup 2) + (mult:XF (match_operand:XF 1 "register_operand" "") + (match_dup 1))) + (set (match_dup 4) (minus:XF (match_dup 3) (match_dup 2))) + (set (match_dup 5) (sqrt:XF (match_dup 4))) + (parallel [(set (match_operand:XF 0 "register_operand" "") + (unspec:XF [(match_dup 1) (match_dup 5)] + UNSPEC_FPATAN)) + (clobber (match_scratch:XF 6 ""))])] + "TARGET_USE_FANCY_MATH_387 + && flag_unsafe_math_optimizations" +{ + int i; + + if (optimize_insn_for_size_p ()) + FAIL; + + for (i = 2; i < 6; i++) + operands[i] = gen_reg_rtx (XFmode); + + emit_move_insn (operands[3], CONST1_RTX (XFmode)); /* fld1 */ +}) + +(define_expand "acos2" + [(use (match_operand:MODEF 0 "register_operand" "")) + (use (match_operand:MODEF 1 "general_operand" ""))] + "TARGET_USE_FANCY_MATH_387 + && (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) + || TARGET_MIX_SSE_I387) + && flag_unsafe_math_optimizations" +{ + rtx op0 = gen_reg_rtx (XFmode); + rtx op1 = gen_reg_rtx (XFmode); + + if (optimize_insn_for_size_p ()) + FAIL; + + emit_insn (gen_extendxf2 (op1, operands[1])); + emit_insn (gen_acosxf2 (op0, op1)); + emit_insn (gen_truncxf2_i387_noop (operands[0], op0)); + DONE; +}) + +(define_insn "fyl2xxf3_i387" + [(set (match_operand:XF 0 "register_operand" "=f") + (unspec:XF [(match_operand:XF 1 "register_operand" "0") + (match_operand:XF 2 "register_operand" "u")] + UNSPEC_FYL2X)) + (clobber (match_scratch:XF 3 "=2"))] + "TARGET_USE_FANCY_MATH_387 + && flag_unsafe_math_optimizations" + "fyl2x" + [(set_attr "type" "fpspc") + 
(set_attr "mode" "XF")]) + +(define_insn "fyl2x_extendxf3_i387" + [(set (match_operand:XF 0 "register_operand" "=f") + (unspec:XF [(float_extend:XF + (match_operand:MODEF 1 "register_operand" "0")) + (match_operand:XF 2 "register_operand" "u")] + UNSPEC_FYL2X)) + (clobber (match_scratch:XF 3 "=2"))] + "TARGET_USE_FANCY_MATH_387 + && (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) + || TARGET_MIX_SSE_I387) + && flag_unsafe_math_optimizations" + "fyl2x" + [(set_attr "type" "fpspc") + (set_attr "mode" "XF")]) + +(define_expand "logxf2" + [(parallel [(set (match_operand:XF 0 "register_operand" "") + (unspec:XF [(match_operand:XF 1 "register_operand" "") + (match_dup 2)] UNSPEC_FYL2X)) + (clobber (match_scratch:XF 3 ""))])] + "TARGET_USE_FANCY_MATH_387 + && flag_unsafe_math_optimizations" +{ + operands[2] = gen_reg_rtx (XFmode); + emit_move_insn (operands[2], standard_80387_constant_rtx (4)); /* fldln2 */ +}) + +(define_expand "log2" + [(use (match_operand:MODEF 0 "register_operand" "")) + (use (match_operand:MODEF 1 "register_operand" ""))] + "TARGET_USE_FANCY_MATH_387 + && (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) + || TARGET_MIX_SSE_I387) + && flag_unsafe_math_optimizations" +{ + rtx op0 = gen_reg_rtx (XFmode); + + rtx op2 = gen_reg_rtx (XFmode); + emit_move_insn (op2, standard_80387_constant_rtx (4)); /* fldln2 */ + + emit_insn (gen_fyl2x_extendxf3_i387 (op0, operands[1], op2)); + emit_insn (gen_truncxf2_i387_noop (operands[0], op0)); + DONE; +}) + +(define_expand "log10xf2" + [(parallel [(set (match_operand:XF 0 "register_operand" "") + (unspec:XF [(match_operand:XF 1 "register_operand" "") + (match_dup 2)] UNSPEC_FYL2X)) + (clobber (match_scratch:XF 3 ""))])] + "TARGET_USE_FANCY_MATH_387 + && flag_unsafe_math_optimizations" +{ + operands[2] = gen_reg_rtx (XFmode); + emit_move_insn (operands[2], standard_80387_constant_rtx (3)); /* fldlg2 */ +}) + +(define_expand "log102" + [(use (match_operand:MODEF 0 "register_operand" "")) + (use (match_operand:MODEF 1 "register_operand" ""))] + "TARGET_USE_FANCY_MATH_387 + && (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) + || TARGET_MIX_SSE_I387) + && flag_unsafe_math_optimizations" +{ + rtx op0 = gen_reg_rtx (XFmode); + + rtx op2 = gen_reg_rtx (XFmode); + emit_move_insn (op2, standard_80387_constant_rtx (3)); /* fldlg2 */ + + emit_insn (gen_fyl2x_extendxf3_i387 (op0, operands[1], op2)); + emit_insn (gen_truncxf2_i387_noop (operands[0], op0)); + DONE; +}) + +(define_expand "log2xf2" + [(parallel [(set (match_operand:XF 0 "register_operand" "") + (unspec:XF [(match_operand:XF 1 "register_operand" "") + (match_dup 2)] UNSPEC_FYL2X)) + (clobber (match_scratch:XF 3 ""))])] + "TARGET_USE_FANCY_MATH_387 + && flag_unsafe_math_optimizations" +{ + operands[2] = gen_reg_rtx (XFmode); + emit_move_insn (operands[2], CONST1_RTX (XFmode)); /* fld1 */ +}) + +(define_expand "log22" + [(use (match_operand:MODEF 0 "register_operand" "")) + (use (match_operand:MODEF 1 "register_operand" ""))] + "TARGET_USE_FANCY_MATH_387 + && (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) + || TARGET_MIX_SSE_I387) + && flag_unsafe_math_optimizations" +{ + rtx op0 = gen_reg_rtx (XFmode); + + rtx op2 = gen_reg_rtx (XFmode); + emit_move_insn (op2, CONST1_RTX (XFmode)); /* fld1 */ + + emit_insn (gen_fyl2x_extendxf3_i387 (op0, operands[1], op2)); + emit_insn (gen_truncxf2_i387_noop (operands[0], op0)); + DONE; +}) + +(define_insn "fyl2xp1xf3_i387" + [(set (match_operand:XF 0 "register_operand" "=f") + (unspec:XF [(match_operand:XF 1 "register_operand" "0") + (match_operand:XF 2 
"register_operand" "u")] + UNSPEC_FYL2XP1)) + (clobber (match_scratch:XF 3 "=2"))] + "TARGET_USE_FANCY_MATH_387 + && flag_unsafe_math_optimizations" + "fyl2xp1" + [(set_attr "type" "fpspc") + (set_attr "mode" "XF")]) + +(define_insn "fyl2xp1_extendxf3_i387" + [(set (match_operand:XF 0 "register_operand" "=f") + (unspec:XF [(float_extend:XF + (match_operand:MODEF 1 "register_operand" "0")) + (match_operand:XF 2 "register_operand" "u")] + UNSPEC_FYL2XP1)) + (clobber (match_scratch:XF 3 "=2"))] + "TARGET_USE_FANCY_MATH_387 + && (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) + || TARGET_MIX_SSE_I387) + && flag_unsafe_math_optimizations" + "fyl2xp1" + [(set_attr "type" "fpspc") + (set_attr "mode" "XF")]) + +(define_expand "log1pxf2" + [(use (match_operand:XF 0 "register_operand" "")) + (use (match_operand:XF 1 "register_operand" ""))] + "TARGET_USE_FANCY_MATH_387 + && flag_unsafe_math_optimizations" +{ + if (optimize_insn_for_size_p ()) + FAIL; + + ix86_emit_i387_log1p (operands[0], operands[1]); + DONE; +}) + +(define_expand "log1p2" + [(use (match_operand:MODEF 0 "register_operand" "")) + (use (match_operand:MODEF 1 "register_operand" ""))] + "TARGET_USE_FANCY_MATH_387 + && (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) + || TARGET_MIX_SSE_I387) + && flag_unsafe_math_optimizations" +{ + rtx op0; + + if (optimize_insn_for_size_p ()) + FAIL; + + op0 = gen_reg_rtx (XFmode); + + operands[1] = gen_rtx_FLOAT_EXTEND (XFmode, operands[1]); + + ix86_emit_i387_log1p (op0, operands[1]); + emit_insn (gen_truncxf2_i387_noop (operands[0], op0)); + DONE; +}) + +(define_insn "fxtractxf3_i387" + [(set (match_operand:XF 0 "register_operand" "=f") + (unspec:XF [(match_operand:XF 2 "register_operand" "0")] + UNSPEC_XTRACT_FRACT)) + (set (match_operand:XF 1 "register_operand" "=u") + (unspec:XF [(match_dup 2)] UNSPEC_XTRACT_EXP))] + "TARGET_USE_FANCY_MATH_387 + && flag_unsafe_math_optimizations" + "fxtract" + [(set_attr "type" "fpspc") + (set_attr "mode" "XF")]) + +(define_insn "fxtract_extendxf3_i387" + [(set (match_operand:XF 0 "register_operand" "=f") + (unspec:XF [(float_extend:XF + (match_operand:MODEF 2 "register_operand" "0"))] + UNSPEC_XTRACT_FRACT)) + (set (match_operand:XF 1 "register_operand" "=u") + (unspec:XF [(float_extend:XF (match_dup 2))] UNSPEC_XTRACT_EXP))] + "TARGET_USE_FANCY_MATH_387 + && (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) + || TARGET_MIX_SSE_I387) + && flag_unsafe_math_optimizations" + "fxtract" + [(set_attr "type" "fpspc") + (set_attr "mode" "XF")]) + +(define_expand "logbxf2" + [(parallel [(set (match_dup 2) + (unspec:XF [(match_operand:XF 1 "register_operand" "")] + UNSPEC_XTRACT_FRACT)) + (set (match_operand:XF 0 "register_operand" "") + (unspec:XF [(match_dup 1)] UNSPEC_XTRACT_EXP))])] + "TARGET_USE_FANCY_MATH_387 + && flag_unsafe_math_optimizations" + "operands[2] = gen_reg_rtx (XFmode);") + +(define_expand "logb2" + [(use (match_operand:MODEF 0 "register_operand" "")) + (use (match_operand:MODEF 1 "register_operand" ""))] + "TARGET_USE_FANCY_MATH_387 + && (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) + || TARGET_MIX_SSE_I387) + && flag_unsafe_math_optimizations" +{ + rtx op0 = gen_reg_rtx (XFmode); + rtx op1 = gen_reg_rtx (XFmode); + + emit_insn (gen_fxtract_extendxf3_i387 (op0, op1, operands[1])); + emit_insn (gen_truncxf2_i387_noop (operands[0], op1)); + DONE; +}) + +(define_expand "ilogbxf2" + [(use (match_operand:SI 0 "register_operand" "")) + (use (match_operand:XF 1 "register_operand" ""))] + "TARGET_USE_FANCY_MATH_387 + && flag_unsafe_math_optimizations" +{ + rtx 
op0, op1; + + if (optimize_insn_for_size_p ()) + FAIL; + + op0 = gen_reg_rtx (XFmode); + op1 = gen_reg_rtx (XFmode); + + emit_insn (gen_fxtractxf3_i387 (op0, op1, operands[1])); + emit_insn (gen_fix_truncxfsi2 (operands[0], op1)); + DONE; +}) + +(define_expand "ilogb2" + [(use (match_operand:SI 0 "register_operand" "")) + (use (match_operand:MODEF 1 "register_operand" ""))] + "TARGET_USE_FANCY_MATH_387 + && (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) + || TARGET_MIX_SSE_I387) + && flag_unsafe_math_optimizations" +{ + rtx op0, op1; + + if (optimize_insn_for_size_p ()) + FAIL; + + op0 = gen_reg_rtx (XFmode); + op1 = gen_reg_rtx (XFmode); + + emit_insn (gen_fxtract_extendxf3_i387 (op0, op1, operands[1])); + emit_insn (gen_fix_truncxfsi2 (operands[0], op1)); + DONE; +}) + +(define_insn "*f2xm1xf2_i387" + [(set (match_operand:XF 0 "register_operand" "=f") + (unspec:XF [(match_operand:XF 1 "register_operand" "0")] + UNSPEC_F2XM1))] + "TARGET_USE_FANCY_MATH_387 + && flag_unsafe_math_optimizations" + "f2xm1" + [(set_attr "type" "fpspc") + (set_attr "mode" "XF")]) + +(define_insn "*fscalexf4_i387" + [(set (match_operand:XF 0 "register_operand" "=f") + (unspec:XF [(match_operand:XF 2 "register_operand" "0") + (match_operand:XF 3 "register_operand" "1")] + UNSPEC_FSCALE_FRACT)) + (set (match_operand:XF 1 "register_operand" "=u") + (unspec:XF [(match_dup 2) (match_dup 3)] + UNSPEC_FSCALE_EXP))] + "TARGET_USE_FANCY_MATH_387 + && flag_unsafe_math_optimizations" + "fscale" + [(set_attr "type" "fpspc") + (set_attr "mode" "XF")]) + +(define_expand "expNcorexf3" + [(set (match_dup 3) (mult:XF (match_operand:XF 1 "register_operand" "") + (match_operand:XF 2 "register_operand" ""))) + (set (match_dup 4) (unspec:XF [(match_dup 3)] UNSPEC_FRNDINT)) + (set (match_dup 5) (minus:XF (match_dup 3) (match_dup 4))) + (set (match_dup 6) (unspec:XF [(match_dup 5)] UNSPEC_F2XM1)) + (set (match_dup 8) (plus:XF (match_dup 6) (match_dup 7))) + (parallel [(set (match_operand:XF 0 "register_operand" "") + (unspec:XF [(match_dup 8) (match_dup 4)] + UNSPEC_FSCALE_FRACT)) + (set (match_dup 9) + (unspec:XF [(match_dup 8) (match_dup 4)] + UNSPEC_FSCALE_EXP))])] + "TARGET_USE_FANCY_MATH_387 + && flag_unsafe_math_optimizations" +{ + int i; + + if (optimize_insn_for_size_p ()) + FAIL; + + for (i = 3; i < 10; i++) + operands[i] = gen_reg_rtx (XFmode); + + emit_move_insn (operands[7], CONST1_RTX (XFmode)); /* fld1 */ +}) + +(define_expand "expxf2" + [(use (match_operand:XF 0 "register_operand" "")) + (use (match_operand:XF 1 "register_operand" ""))] + "TARGET_USE_FANCY_MATH_387 + && flag_unsafe_math_optimizations" +{ + rtx op2; + + if (optimize_insn_for_size_p ()) + FAIL; + + op2 = gen_reg_rtx (XFmode); + emit_move_insn (op2, standard_80387_constant_rtx (5)); /* fldl2e */ + + emit_insn (gen_expNcorexf3 (operands[0], operands[1], op2)); + DONE; +}) + +(define_expand "exp2" + [(use (match_operand:MODEF 0 "register_operand" "")) + (use (match_operand:MODEF 1 "general_operand" ""))] + "TARGET_USE_FANCY_MATH_387 + && (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) + || TARGET_MIX_SSE_I387) + && flag_unsafe_math_optimizations" +{ + rtx op0, op1; + + if (optimize_insn_for_size_p ()) + FAIL; + + op0 = gen_reg_rtx (XFmode); + op1 = gen_reg_rtx (XFmode); + + emit_insn (gen_extendxf2 (op1, operands[1])); + emit_insn (gen_expxf2 (op0, op1)); + emit_insn (gen_truncxf2_i387_noop (operands[0], op0)); + DONE; +}) + +(define_expand "exp10xf2" + [(use (match_operand:XF 0 "register_operand" "")) + (use (match_operand:XF 1 "register_operand" ""))] + 
"TARGET_USE_FANCY_MATH_387 + && flag_unsafe_math_optimizations" +{ + rtx op2; + + if (optimize_insn_for_size_p ()) + FAIL; + + op2 = gen_reg_rtx (XFmode); + emit_move_insn (op2, standard_80387_constant_rtx (6)); /* fldl2t */ + + emit_insn (gen_expNcorexf3 (operands[0], operands[1], op2)); + DONE; +}) + +(define_expand "exp102" + [(use (match_operand:MODEF 0 "register_operand" "")) + (use (match_operand:MODEF 1 "general_operand" ""))] + "TARGET_USE_FANCY_MATH_387 + && (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) + || TARGET_MIX_SSE_I387) + && flag_unsafe_math_optimizations" +{ + rtx op0, op1; + + if (optimize_insn_for_size_p ()) + FAIL; + + op0 = gen_reg_rtx (XFmode); + op1 = gen_reg_rtx (XFmode); + + emit_insn (gen_extendxf2 (op1, operands[1])); + emit_insn (gen_exp10xf2 (op0, op1)); + emit_insn (gen_truncxf2_i387_noop (operands[0], op0)); + DONE; +}) + +(define_expand "exp2xf2" + [(use (match_operand:XF 0 "register_operand" "")) + (use (match_operand:XF 1 "register_operand" ""))] + "TARGET_USE_FANCY_MATH_387 + && flag_unsafe_math_optimizations" +{ + rtx op2; + + if (optimize_insn_for_size_p ()) + FAIL; + + op2 = gen_reg_rtx (XFmode); + emit_move_insn (op2, CONST1_RTX (XFmode)); /* fld1 */ + + emit_insn (gen_expNcorexf3 (operands[0], operands[1], op2)); + DONE; +}) + +(define_expand "exp22" + [(use (match_operand:MODEF 0 "register_operand" "")) + (use (match_operand:MODEF 1 "general_operand" ""))] + "TARGET_USE_FANCY_MATH_387 + && (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) + || TARGET_MIX_SSE_I387) + && flag_unsafe_math_optimizations" +{ + rtx op0, op1; + + if (optimize_insn_for_size_p ()) + FAIL; + + op0 = gen_reg_rtx (XFmode); + op1 = gen_reg_rtx (XFmode); + + emit_insn (gen_extendxf2 (op1, operands[1])); + emit_insn (gen_exp2xf2 (op0, op1)); + emit_insn (gen_truncxf2_i387_noop (operands[0], op0)); + DONE; +}) + +(define_expand "expm1xf2" + [(set (match_dup 3) (mult:XF (match_operand:XF 1 "register_operand" "") + (match_dup 2))) + (set (match_dup 4) (unspec:XF [(match_dup 3)] UNSPEC_FRNDINT)) + (set (match_dup 5) (minus:XF (match_dup 3) (match_dup 4))) + (set (match_dup 9) (float_extend:XF (match_dup 13))) + (set (match_dup 6) (unspec:XF [(match_dup 5)] UNSPEC_F2XM1)) + (parallel [(set (match_dup 7) + (unspec:XF [(match_dup 6) (match_dup 4)] + UNSPEC_FSCALE_FRACT)) + (set (match_dup 8) + (unspec:XF [(match_dup 6) (match_dup 4)] + UNSPEC_FSCALE_EXP))]) + (parallel [(set (match_dup 10) + (unspec:XF [(match_dup 9) (match_dup 8)] + UNSPEC_FSCALE_FRACT)) + (set (match_dup 11) + (unspec:XF [(match_dup 9) (match_dup 8)] + UNSPEC_FSCALE_EXP))]) + (set (match_dup 12) (minus:XF (match_dup 10) + (float_extend:XF (match_dup 13)))) + (set (match_operand:XF 0 "register_operand" "") + (plus:XF (match_dup 12) (match_dup 7)))] + "TARGET_USE_FANCY_MATH_387 + && flag_unsafe_math_optimizations" +{ + int i; + + if (optimize_insn_for_size_p ()) + FAIL; + + for (i = 2; i < 13; i++) + operands[i] = gen_reg_rtx (XFmode); + + operands[13] + = validize_mem (force_const_mem (SFmode, CONST1_RTX (SFmode))); /* fld1 */ + + emit_move_insn (operands[2], standard_80387_constant_rtx (5)); /* fldl2e */ +}) + +(define_expand "expm12" + [(use (match_operand:MODEF 0 "register_operand" "")) + (use (match_operand:MODEF 1 "general_operand" ""))] + "TARGET_USE_FANCY_MATH_387 + && (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) + || TARGET_MIX_SSE_I387) + && flag_unsafe_math_optimizations" +{ + rtx op0, op1; + + if (optimize_insn_for_size_p ()) + FAIL; + + op0 = gen_reg_rtx (XFmode); + op1 = gen_reg_rtx (XFmode); + + 
emit_insn (gen_extendxf2 (op1, operands[1])); + emit_insn (gen_expm1xf2 (op0, op1)); + emit_insn (gen_truncxf2_i387_noop (operands[0], op0)); + DONE; +}) + +(define_expand "ldexpxf3" + [(set (match_dup 3) + (float:XF (match_operand:SI 2 "register_operand" ""))) + (parallel [(set (match_operand:XF 0 " register_operand" "") + (unspec:XF [(match_operand:XF 1 "register_operand" "") + (match_dup 3)] + UNSPEC_FSCALE_FRACT)) + (set (match_dup 4) + (unspec:XF [(match_dup 1) (match_dup 3)] + UNSPEC_FSCALE_EXP))])] + "TARGET_USE_FANCY_MATH_387 + && flag_unsafe_math_optimizations" +{ + if (optimize_insn_for_size_p ()) + FAIL; + + operands[3] = gen_reg_rtx (XFmode); + operands[4] = gen_reg_rtx (XFmode); +}) + +(define_expand "ldexp3" + [(use (match_operand:MODEF 0 "register_operand" "")) + (use (match_operand:MODEF 1 "general_operand" "")) + (use (match_operand:SI 2 "register_operand" ""))] + "TARGET_USE_FANCY_MATH_387 + && (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) + || TARGET_MIX_SSE_I387) + && flag_unsafe_math_optimizations" +{ + rtx op0, op1; + + if (optimize_insn_for_size_p ()) + FAIL; + + op0 = gen_reg_rtx (XFmode); + op1 = gen_reg_rtx (XFmode); + + emit_insn (gen_extendxf2 (op1, operands[1])); + emit_insn (gen_ldexpxf3 (op0, op1, operands[2])); + emit_insn (gen_truncxf2_i387_noop (operands[0], op0)); + DONE; +}) + +(define_expand "scalbxf3" + [(parallel [(set (match_operand:XF 0 " register_operand" "") + (unspec:XF [(match_operand:XF 1 "register_operand" "") + (match_operand:XF 2 "register_operand" "")] + UNSPEC_FSCALE_FRACT)) + (set (match_dup 3) + (unspec:XF [(match_dup 1) (match_dup 2)] + UNSPEC_FSCALE_EXP))])] + "TARGET_USE_FANCY_MATH_387 + && flag_unsafe_math_optimizations" +{ + if (optimize_insn_for_size_p ()) + FAIL; + + operands[3] = gen_reg_rtx (XFmode); +}) + +(define_expand "scalb3" + [(use (match_operand:MODEF 0 "register_operand" "")) + (use (match_operand:MODEF 1 "general_operand" "")) + (use (match_operand:MODEF 2 "general_operand" ""))] + "TARGET_USE_FANCY_MATH_387 + && (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) + || TARGET_MIX_SSE_I387) + && flag_unsafe_math_optimizations" +{ + rtx op0, op1, op2; + + if (optimize_insn_for_size_p ()) + FAIL; + + op0 = gen_reg_rtx (XFmode); + op1 = gen_reg_rtx (XFmode); + op2 = gen_reg_rtx (XFmode); + + emit_insn (gen_extendxf2 (op1, operands[1])); + emit_insn (gen_extendxf2 (op2, operands[2])); + emit_insn (gen_scalbxf3 (op0, op1, op2)); + emit_insn (gen_truncxf2_i387_noop (operands[0], op0)); + DONE; +}) + +(define_expand "significandxf2" + [(parallel [(set (match_operand:XF 0 "register_operand" "") + (unspec:XF [(match_operand:XF 1 "register_operand" "")] + UNSPEC_XTRACT_FRACT)) + (set (match_dup 2) + (unspec:XF [(match_dup 1)] UNSPEC_XTRACT_EXP))])] + "TARGET_USE_FANCY_MATH_387 + && flag_unsafe_math_optimizations" + "operands[2] = gen_reg_rtx (XFmode);") + +(define_expand "significand2" + [(use (match_operand:MODEF 0 "register_operand" "")) + (use (match_operand:MODEF 1 "register_operand" ""))] + "TARGET_USE_FANCY_MATH_387 + && (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) + || TARGET_MIX_SSE_I387) + && flag_unsafe_math_optimizations" +{ + rtx op0 = gen_reg_rtx (XFmode); + rtx op1 = gen_reg_rtx (XFmode); + + emit_insn (gen_fxtract_extendxf3_i387 (op0, op1, operands[1])); + emit_insn (gen_truncxf2_i387_noop (operands[0], op0)); + DONE; +}) + + +(define_insn "sse4_1_round2" + [(set (match_operand:MODEF 0 "register_operand" "=x") + (unspec:MODEF [(match_operand:MODEF 1 "register_operand" "x") + (match_operand:SI 2 
"const_0_to_15_operand" "n")] + UNSPEC_ROUND))] + "TARGET_ROUND" + "%vrounds\t{%2, %1, %d0|%d0, %1, %2}" + [(set_attr "type" "ssecvt") + (set_attr "prefix_extra" "1") + (set_attr "prefix" "maybe_vex") + (set_attr "mode" "")]) + +(define_insn "rintxf2" + [(set (match_operand:XF 0 "register_operand" "=f") + (unspec:XF [(match_operand:XF 1 "register_operand" "0")] + UNSPEC_FRNDINT))] + "TARGET_USE_FANCY_MATH_387 + && flag_unsafe_math_optimizations" + "frndint" + [(set_attr "type" "fpspc") + (set_attr "mode" "XF")]) + +(define_expand "rint2" + [(use (match_operand:MODEF 0 "register_operand" "")) + (use (match_operand:MODEF 1 "register_operand" ""))] + "(TARGET_USE_FANCY_MATH_387 + && (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) + || TARGET_MIX_SSE_I387) + && flag_unsafe_math_optimizations) + || (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH + && !flag_trapping_math)" +{ + if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH + && !flag_trapping_math) + { + if (!TARGET_ROUND && optimize_insn_for_size_p ()) + FAIL; + if (TARGET_ROUND) + emit_insn (gen_sse4_1_round2 + (operands[0], operands[1], GEN_INT (0x04))); + else + ix86_expand_rint (operands[0], operands[1]); + } + else + { + rtx op0 = gen_reg_rtx (XFmode); + rtx op1 = gen_reg_rtx (XFmode); + + emit_insn (gen_extendxf2 (op1, operands[1])); + emit_insn (gen_rintxf2 (op0, op1)); + + emit_insn (gen_truncxf2_i387_noop (operands[0], op0)); + } + DONE; +}) + +(define_expand "round2" + [(match_operand:MODEF 0 "register_operand" "") + (match_operand:MODEF 1 "nonimmediate_operand" "")] + "SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH + && !flag_trapping_math && !flag_rounding_math" +{ + if (optimize_insn_for_size_p ()) + FAIL; + if (TARGET_64BIT || (mode != DFmode)) + ix86_expand_round (operands[0], operands[1]); + else + ix86_expand_rounddf_32 (operands[0], operands[1]); + DONE; +}) + +(define_insn_and_split "*fistdi2_1" + [(set (match_operand:DI 0 "nonimmediate_operand" "") + (unspec:DI [(match_operand:XF 1 "register_operand" "")] + UNSPEC_FIST))] + "TARGET_USE_FANCY_MATH_387 + && can_create_pseudo_p ()" + "#" + "&& 1" + [(const_int 0)] +{ + if (memory_operand (operands[0], VOIDmode)) + emit_insn (gen_fistdi2 (operands[0], operands[1])); + else + { + operands[2] = assign_386_stack_local (DImode, SLOT_TEMP); + emit_insn (gen_fistdi2_with_temp (operands[0], operands[1], + operands[2])); + } + DONE; +} + [(set_attr "type" "fpspc") + (set_attr "mode" "DI")]) + +(define_insn "fistdi2" + [(set (match_operand:DI 0 "memory_operand" "=m") + (unspec:DI [(match_operand:XF 1 "register_operand" "f")] + UNSPEC_FIST)) + (clobber (match_scratch:XF 2 "=&1f"))] + "TARGET_USE_FANCY_MATH_387" + "* return output_fix_trunc (insn, operands, 0);" + [(set_attr "type" "fpspc") + (set_attr "mode" "DI")]) + +(define_insn "fistdi2_with_temp" + [(set (match_operand:DI 0 "nonimmediate_operand" "=m,?r") + (unspec:DI [(match_operand:XF 1 "register_operand" "f,f")] + UNSPEC_FIST)) + (clobber (match_operand:DI 2 "memory_operand" "=X,m")) + (clobber (match_scratch:XF 3 "=&1f,&1f"))] + "TARGET_USE_FANCY_MATH_387" + "#" + [(set_attr "type" "fpspc") + (set_attr "mode" "DI")]) + +(define_split + [(set (match_operand:DI 0 "register_operand" "") + (unspec:DI [(match_operand:XF 1 "register_operand" "")] + UNSPEC_FIST)) + (clobber (match_operand:DI 2 "memory_operand" "")) + (clobber (match_scratch 3 ""))] + "reload_completed" + [(parallel [(set (match_dup 2) (unspec:DI [(match_dup 1)] UNSPEC_FIST)) + (clobber (match_dup 3))]) + (set (match_dup 0) (match_dup 2))]) + +(define_split + [(set 
(match_operand:DI 0 "memory_operand" "") + (unspec:DI [(match_operand:XF 1 "register_operand" "")] + UNSPEC_FIST)) + (clobber (match_operand:DI 2 "memory_operand" "")) + (clobber (match_scratch 3 ""))] + "reload_completed" + [(parallel [(set (match_dup 0) (unspec:DI [(match_dup 1)] UNSPEC_FIST)) + (clobber (match_dup 3))])]) + +(define_insn_and_split "*fist2_1" + [(set (match_operand:X87MODEI12 0 "register_operand" "") + (unspec:X87MODEI12 [(match_operand:XF 1 "register_operand" "")] + UNSPEC_FIST))] + "TARGET_USE_FANCY_MATH_387 + && can_create_pseudo_p ()" + "#" + "&& 1" + [(const_int 0)] +{ + operands[2] = assign_386_stack_local (mode, SLOT_TEMP); + emit_insn (gen_fist2_with_temp (operands[0], operands[1], + operands[2])); + DONE; +} + [(set_attr "type" "fpspc") + (set_attr "mode" "")]) + +(define_insn "fist2" + [(set (match_operand:X87MODEI12 0 "memory_operand" "=m") + (unspec:X87MODEI12 [(match_operand:XF 1 "register_operand" "f")] + UNSPEC_FIST))] + "TARGET_USE_FANCY_MATH_387" + "* return output_fix_trunc (insn, operands, 0);" + [(set_attr "type" "fpspc") + (set_attr "mode" "")]) + +(define_insn "fist2_with_temp" + [(set (match_operand:X87MODEI12 0 "register_operand" "=r") + (unspec:X87MODEI12 [(match_operand:XF 1 "register_operand" "f")] + UNSPEC_FIST)) + (clobber (match_operand:X87MODEI12 2 "memory_operand" "=m"))] + "TARGET_USE_FANCY_MATH_387" + "#" + [(set_attr "type" "fpspc") + (set_attr "mode" "")]) + +(define_split + [(set (match_operand:X87MODEI12 0 "register_operand" "") + (unspec:X87MODEI12 [(match_operand:XF 1 "register_operand" "")] + UNSPEC_FIST)) + (clobber (match_operand:X87MODEI12 2 "memory_operand" ""))] + "reload_completed" + [(set (match_dup 2) (unspec:X87MODEI12 [(match_dup 1)] UNSPEC_FIST)) + (set (match_dup 0) (match_dup 2))]) + +(define_split + [(set (match_operand:X87MODEI12 0 "memory_operand" "") + (unspec:X87MODEI12 [(match_operand:XF 1 "register_operand" "")] + UNSPEC_FIST)) + (clobber (match_operand:X87MODEI12 2 "memory_operand" ""))] + "reload_completed" + [(set (match_dup 0) (unspec:X87MODEI12 [(match_dup 1)] UNSPEC_FIST))]) + +(define_expand "lrintxf2" + [(set (match_operand:X87MODEI 0 "nonimmediate_operand" "") + (unspec:X87MODEI [(match_operand:XF 1 "register_operand" "")] + UNSPEC_FIST))] + "TARGET_USE_FANCY_MATH_387") + +(define_expand "lrint2" + [(set (match_operand:SSEMODEI24 0 "nonimmediate_operand" "") + (unspec:SSEMODEI24 [(match_operand:MODEF 1 "register_operand" "")] + UNSPEC_FIX_NOTRUNC))] + "SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH + && ((mode != DImode) || TARGET_64BIT)") + +(define_expand "lround2" + [(match_operand:SSEMODEI24 0 "nonimmediate_operand" "") + (match_operand:MODEF 1 "register_operand" "")] + "SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH + && ((mode != DImode) || TARGET_64BIT) + && !flag_trapping_math && !flag_rounding_math" +{ + if (optimize_insn_for_size_p ()) + FAIL; + ix86_expand_lround (operands[0], operands[1]); + DONE; +}) + +;; Rounding mode control word calculation could clobber FLAGS_REG. 
+(define_insn_and_split "frndintxf2_floor" + [(set (match_operand:XF 0 "register_operand" "") + (unspec:XF [(match_operand:XF 1 "register_operand" "")] + UNSPEC_FRNDINT_FLOOR)) + (clobber (reg:CC FLAGS_REG))] + "TARGET_USE_FANCY_MATH_387 + && flag_unsafe_math_optimizations + && can_create_pseudo_p ()" + "#" + "&& 1" + [(const_int 0)] +{ + ix86_optimize_mode_switching[I387_FLOOR] = 1; + + operands[2] = assign_386_stack_local (HImode, SLOT_CW_STORED); + operands[3] = assign_386_stack_local (HImode, SLOT_CW_FLOOR); + + emit_insn (gen_frndintxf2_floor_i387 (operands[0], operands[1], + operands[2], operands[3])); + DONE; +} + [(set_attr "type" "frndint") + (set_attr "i387_cw" "floor") + (set_attr "mode" "XF")]) + +(define_insn "frndintxf2_floor_i387" + [(set (match_operand:XF 0 "register_operand" "=f") + (unspec:XF [(match_operand:XF 1 "register_operand" "0")] + UNSPEC_FRNDINT_FLOOR)) + (use (match_operand:HI 2 "memory_operand" "m")) + (use (match_operand:HI 3 "memory_operand" "m"))] + "TARGET_USE_FANCY_MATH_387 + && flag_unsafe_math_optimizations" + "fldcw\t%3\n\tfrndint\n\tfldcw\t%2" + [(set_attr "type" "frndint") + (set_attr "i387_cw" "floor") + (set_attr "mode" "XF")]) + +(define_expand "floorxf2" + [(use (match_operand:XF 0 "register_operand" "")) + (use (match_operand:XF 1 "register_operand" ""))] + "TARGET_USE_FANCY_MATH_387 + && flag_unsafe_math_optimizations" +{ + if (optimize_insn_for_size_p ()) + FAIL; + emit_insn (gen_frndintxf2_floor (operands[0], operands[1])); + DONE; +}) + +(define_expand "floor2" + [(use (match_operand:MODEF 0 "register_operand" "")) + (use (match_operand:MODEF 1 "register_operand" ""))] + "(TARGET_USE_FANCY_MATH_387 + && (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) + || TARGET_MIX_SSE_I387) + && flag_unsafe_math_optimizations) + || (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH + && !flag_trapping_math)" +{ + if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH + && !flag_trapping_math + && (TARGET_ROUND || optimize_insn_for_speed_p ())) + { + if (!TARGET_ROUND && optimize_insn_for_size_p ()) + FAIL; + if (TARGET_ROUND) + emit_insn (gen_sse4_1_round2 + (operands[0], operands[1], GEN_INT (0x01))); + else if (TARGET_64BIT || (mode != DFmode)) + ix86_expand_floorceil (operands[0], operands[1], true); + else + ix86_expand_floorceildf_32 (operands[0], operands[1], true); + } + else + { + rtx op0, op1; + + if (optimize_insn_for_size_p ()) + FAIL; + + op0 = gen_reg_rtx (XFmode); + op1 = gen_reg_rtx (XFmode); + emit_insn (gen_extendxf2 (op1, operands[1])); + emit_insn (gen_frndintxf2_floor (op0, op1)); + + emit_insn (gen_truncxf2_i387_noop (operands[0], op0)); + } + DONE; +}) + +(define_insn_and_split "*fist2_floor_1" + [(set (match_operand:X87MODEI 0 "nonimmediate_operand" "") + (unspec:X87MODEI [(match_operand:XF 1 "register_operand" "")] + UNSPEC_FIST_FLOOR)) + (clobber (reg:CC FLAGS_REG))] + "TARGET_USE_FANCY_MATH_387 + && flag_unsafe_math_optimizations + && can_create_pseudo_p ()" + "#" + "&& 1" + [(const_int 0)] +{ + ix86_optimize_mode_switching[I387_FLOOR] = 1; + + operands[2] = assign_386_stack_local (HImode, SLOT_CW_STORED); + operands[3] = assign_386_stack_local (HImode, SLOT_CW_FLOOR); + if (memory_operand (operands[0], VOIDmode)) + emit_insn (gen_fist2_floor (operands[0], operands[1], + operands[2], operands[3])); + else + { + operands[4] = assign_386_stack_local (mode, SLOT_TEMP); + emit_insn (gen_fist2_floor_with_temp (operands[0], operands[1], + operands[2], operands[3], + operands[4])); + } + DONE; +} + [(set_attr "type" "fistp") + (set_attr "i387_cw" 
"floor") + (set_attr "mode" "")]) + +(define_insn "fistdi2_floor" + [(set (match_operand:DI 0 "memory_operand" "=m") + (unspec:DI [(match_operand:XF 1 "register_operand" "f")] + UNSPEC_FIST_FLOOR)) + (use (match_operand:HI 2 "memory_operand" "m")) + (use (match_operand:HI 3 "memory_operand" "m")) + (clobber (match_scratch:XF 4 "=&1f"))] + "TARGET_USE_FANCY_MATH_387 + && flag_unsafe_math_optimizations" + "* return output_fix_trunc (insn, operands, 0);" + [(set_attr "type" "fistp") + (set_attr "i387_cw" "floor") + (set_attr "mode" "DI")]) + +(define_insn "fistdi2_floor_with_temp" + [(set (match_operand:DI 0 "nonimmediate_operand" "=m,?r") + (unspec:DI [(match_operand:XF 1 "register_operand" "f,f")] + UNSPEC_FIST_FLOOR)) + (use (match_operand:HI 2 "memory_operand" "m,m")) + (use (match_operand:HI 3 "memory_operand" "m,m")) + (clobber (match_operand:DI 4 "memory_operand" "=X,m")) + (clobber (match_scratch:XF 5 "=&1f,&1f"))] + "TARGET_USE_FANCY_MATH_387 + && flag_unsafe_math_optimizations" + "#" + [(set_attr "type" "fistp") + (set_attr "i387_cw" "floor") + (set_attr "mode" "DI")]) + +(define_split + [(set (match_operand:DI 0 "register_operand" "") + (unspec:DI [(match_operand:XF 1 "register_operand" "")] + UNSPEC_FIST_FLOOR)) + (use (match_operand:HI 2 "memory_operand" "")) + (use (match_operand:HI 3 "memory_operand" "")) + (clobber (match_operand:DI 4 "memory_operand" "")) + (clobber (match_scratch 5 ""))] + "reload_completed" + [(parallel [(set (match_dup 4) (unspec:DI [(match_dup 1)] UNSPEC_FIST_FLOOR)) + (use (match_dup 2)) + (use (match_dup 3)) + (clobber (match_dup 5))]) + (set (match_dup 0) (match_dup 4))]) + +(define_split + [(set (match_operand:DI 0 "memory_operand" "") + (unspec:DI [(match_operand:XF 1 "register_operand" "")] + UNSPEC_FIST_FLOOR)) + (use (match_operand:HI 2 "memory_operand" "")) + (use (match_operand:HI 3 "memory_operand" "")) + (clobber (match_operand:DI 4 "memory_operand" "")) + (clobber (match_scratch 5 ""))] + "reload_completed" + [(parallel [(set (match_dup 0) (unspec:DI [(match_dup 1)] UNSPEC_FIST_FLOOR)) + (use (match_dup 2)) + (use (match_dup 3)) + (clobber (match_dup 5))])]) + +(define_insn "fist2_floor" + [(set (match_operand:X87MODEI12 0 "memory_operand" "=m") + (unspec:X87MODEI12 [(match_operand:XF 1 "register_operand" "f")] + UNSPEC_FIST_FLOOR)) + (use (match_operand:HI 2 "memory_operand" "m")) + (use (match_operand:HI 3 "memory_operand" "m"))] + "TARGET_USE_FANCY_MATH_387 + && flag_unsafe_math_optimizations" + "* return output_fix_trunc (insn, operands, 0);" + [(set_attr "type" "fistp") + (set_attr "i387_cw" "floor") + (set_attr "mode" "")]) + +(define_insn "fist2_floor_with_temp" + [(set (match_operand:X87MODEI12 0 "nonimmediate_operand" "=m,?r") + (unspec:X87MODEI12 [(match_operand:XF 1 "register_operand" "f,f")] + UNSPEC_FIST_FLOOR)) + (use (match_operand:HI 2 "memory_operand" "m,m")) + (use (match_operand:HI 3 "memory_operand" "m,m")) + (clobber (match_operand:X87MODEI12 4 "memory_operand" "=X,m"))] + "TARGET_USE_FANCY_MATH_387 + && flag_unsafe_math_optimizations" + "#" + [(set_attr "type" "fistp") + (set_attr "i387_cw" "floor") + (set_attr "mode" "")]) + +(define_split + [(set (match_operand:X87MODEI12 0 "register_operand" "") + (unspec:X87MODEI12 [(match_operand:XF 1 "register_operand" "")] + UNSPEC_FIST_FLOOR)) + (use (match_operand:HI 2 "memory_operand" "")) + (use (match_operand:HI 3 "memory_operand" "")) + (clobber (match_operand:X87MODEI12 4 "memory_operand" ""))] + "reload_completed" + [(parallel [(set (match_dup 4) (unspec:X87MODEI12 
[(match_dup 1)] + UNSPEC_FIST_FLOOR)) + (use (match_dup 2)) + (use (match_dup 3))]) + (set (match_dup 0) (match_dup 4))]) + +(define_split + [(set (match_operand:X87MODEI12 0 "memory_operand" "") + (unspec:X87MODEI12 [(match_operand:XF 1 "register_operand" "")] + UNSPEC_FIST_FLOOR)) + (use (match_operand:HI 2 "memory_operand" "")) + (use (match_operand:HI 3 "memory_operand" "")) + (clobber (match_operand:X87MODEI12 4 "memory_operand" ""))] + "reload_completed" + [(parallel [(set (match_dup 0) (unspec:X87MODEI12 [(match_dup 1)] + UNSPEC_FIST_FLOOR)) + (use (match_dup 2)) + (use (match_dup 3))])]) + +(define_expand "lfloorxf2" + [(parallel [(set (match_operand:X87MODEI 0 "nonimmediate_operand" "") + (unspec:X87MODEI [(match_operand:XF 1 "register_operand" "")] + UNSPEC_FIST_FLOOR)) + (clobber (reg:CC FLAGS_REG))])] + "TARGET_USE_FANCY_MATH_387 + && (!TARGET_SSE_MATH || TARGET_MIX_SSE_I387) + && flag_unsafe_math_optimizations") + +(define_expand "lfloor2" + [(match_operand:SWI48 0 "nonimmediate_operand" "") + (match_operand:MODEF 1 "register_operand" "")] + "SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH + && !flag_trapping_math" +{ + if (TARGET_64BIT && optimize_insn_for_size_p ()) + FAIL; + ix86_expand_lfloorceil (operands[0], operands[1], true); + DONE; +}) + +;; Rounding mode control word calculation could clobber FLAGS_REG. +(define_insn_and_split "frndintxf2_ceil" + [(set (match_operand:XF 0 "register_operand" "") + (unspec:XF [(match_operand:XF 1 "register_operand" "")] + UNSPEC_FRNDINT_CEIL)) + (clobber (reg:CC FLAGS_REG))] + "TARGET_USE_FANCY_MATH_387 + && flag_unsafe_math_optimizations + && can_create_pseudo_p ()" + "#" + "&& 1" + [(const_int 0)] +{ + ix86_optimize_mode_switching[I387_CEIL] = 1; + + operands[2] = assign_386_stack_local (HImode, SLOT_CW_STORED); + operands[3] = assign_386_stack_local (HImode, SLOT_CW_CEIL); + + emit_insn (gen_frndintxf2_ceil_i387 (operands[0], operands[1], + operands[2], operands[3])); + DONE; +} + [(set_attr "type" "frndint") + (set_attr "i387_cw" "ceil") + (set_attr "mode" "XF")]) + +(define_insn "frndintxf2_ceil_i387" + [(set (match_operand:XF 0 "register_operand" "=f") + (unspec:XF [(match_operand:XF 1 "register_operand" "0")] + UNSPEC_FRNDINT_CEIL)) + (use (match_operand:HI 2 "memory_operand" "m")) + (use (match_operand:HI 3 "memory_operand" "m"))] + "TARGET_USE_FANCY_MATH_387 + && flag_unsafe_math_optimizations" + "fldcw\t%3\n\tfrndint\n\tfldcw\t%2" + [(set_attr "type" "frndint") + (set_attr "i387_cw" "ceil") + (set_attr "mode" "XF")]) + +(define_expand "ceilxf2" + [(use (match_operand:XF 0 "register_operand" "")) + (use (match_operand:XF 1 "register_operand" ""))] + "TARGET_USE_FANCY_MATH_387 + && flag_unsafe_math_optimizations" +{ + if (optimize_insn_for_size_p ()) + FAIL; + emit_insn (gen_frndintxf2_ceil (operands[0], operands[1])); + DONE; +}) + +(define_expand "ceil2" + [(use (match_operand:MODEF 0 "register_operand" "")) + (use (match_operand:MODEF 1 "register_operand" ""))] + "(TARGET_USE_FANCY_MATH_387 + && (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) + || TARGET_MIX_SSE_I387) + && flag_unsafe_math_optimizations) + || (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH + && !flag_trapping_math)" +{ + if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH + && !flag_trapping_math + && (TARGET_ROUND || optimize_insn_for_speed_p ())) + { + if (TARGET_ROUND) + emit_insn (gen_sse4_1_round2 + (operands[0], operands[1], GEN_INT (0x02))); + else if (optimize_insn_for_size_p ()) + FAIL; + else if (TARGET_64BIT || (mode != DFmode)) + ix86_expand_floorceil 
(operands[0], operands[1], false); + else + ix86_expand_floorceildf_32 (operands[0], operands[1], false); + } + else + { + rtx op0, op1; + + if (optimize_insn_for_size_p ()) + FAIL; + + op0 = gen_reg_rtx (XFmode); + op1 = gen_reg_rtx (XFmode); + emit_insn (gen_extendxf2 (op1, operands[1])); + emit_insn (gen_frndintxf2_ceil (op0, op1)); + + emit_insn (gen_truncxf2_i387_noop (operands[0], op0)); + } + DONE; +}) + +(define_insn_and_split "*fist2_ceil_1" + [(set (match_operand:X87MODEI 0 "nonimmediate_operand" "") + (unspec:X87MODEI [(match_operand:XF 1 "register_operand" "")] + UNSPEC_FIST_CEIL)) + (clobber (reg:CC FLAGS_REG))] + "TARGET_USE_FANCY_MATH_387 + && flag_unsafe_math_optimizations + && can_create_pseudo_p ()" + "#" + "&& 1" + [(const_int 0)] +{ + ix86_optimize_mode_switching[I387_CEIL] = 1; + + operands[2] = assign_386_stack_local (HImode, SLOT_CW_STORED); + operands[3] = assign_386_stack_local (HImode, SLOT_CW_CEIL); + if (memory_operand (operands[0], VOIDmode)) + emit_insn (gen_fist2_ceil (operands[0], operands[1], + operands[2], operands[3])); + else + { + operands[4] = assign_386_stack_local (mode, SLOT_TEMP); + emit_insn (gen_fist2_ceil_with_temp (operands[0], operands[1], + operands[2], operands[3], + operands[4])); + } + DONE; +} + [(set_attr "type" "fistp") + (set_attr "i387_cw" "ceil") + (set_attr "mode" "")]) + +(define_insn "fistdi2_ceil" + [(set (match_operand:DI 0 "memory_operand" "=m") + (unspec:DI [(match_operand:XF 1 "register_operand" "f")] + UNSPEC_FIST_CEIL)) + (use (match_operand:HI 2 "memory_operand" "m")) + (use (match_operand:HI 3 "memory_operand" "m")) + (clobber (match_scratch:XF 4 "=&1f"))] + "TARGET_USE_FANCY_MATH_387 + && flag_unsafe_math_optimizations" + "* return output_fix_trunc (insn, operands, 0);" + [(set_attr "type" "fistp") + (set_attr "i387_cw" "ceil") + (set_attr "mode" "DI")]) + +(define_insn "fistdi2_ceil_with_temp" + [(set (match_operand:DI 0 "nonimmediate_operand" "=m,?r") + (unspec:DI [(match_operand:XF 1 "register_operand" "f,f")] + UNSPEC_FIST_CEIL)) + (use (match_operand:HI 2 "memory_operand" "m,m")) + (use (match_operand:HI 3 "memory_operand" "m,m")) + (clobber (match_operand:DI 4 "memory_operand" "=X,m")) + (clobber (match_scratch:XF 5 "=&1f,&1f"))] + "TARGET_USE_FANCY_MATH_387 + && flag_unsafe_math_optimizations" + "#" + [(set_attr "type" "fistp") + (set_attr "i387_cw" "ceil") + (set_attr "mode" "DI")]) + +(define_split + [(set (match_operand:DI 0 "register_operand" "") + (unspec:DI [(match_operand:XF 1 "register_operand" "")] + UNSPEC_FIST_CEIL)) + (use (match_operand:HI 2 "memory_operand" "")) + (use (match_operand:HI 3 "memory_operand" "")) + (clobber (match_operand:DI 4 "memory_operand" "")) + (clobber (match_scratch 5 ""))] + "reload_completed" + [(parallel [(set (match_dup 4) (unspec:DI [(match_dup 1)] UNSPEC_FIST_CEIL)) + (use (match_dup 2)) + (use (match_dup 3)) + (clobber (match_dup 5))]) + (set (match_dup 0) (match_dup 4))]) + +(define_split + [(set (match_operand:DI 0 "memory_operand" "") + (unspec:DI [(match_operand:XF 1 "register_operand" "")] + UNSPEC_FIST_CEIL)) + (use (match_operand:HI 2 "memory_operand" "")) + (use (match_operand:HI 3 "memory_operand" "")) + (clobber (match_operand:DI 4 "memory_operand" "")) + (clobber (match_scratch 5 ""))] + "reload_completed" + [(parallel [(set (match_dup 0) (unspec:DI [(match_dup 1)] UNSPEC_FIST_CEIL)) + (use (match_dup 2)) + (use (match_dup 3)) + (clobber (match_dup 5))])]) + +(define_insn "fist2_ceil" + [(set (match_operand:X87MODEI12 0 "memory_operand" "=m") + 
(unspec:X87MODEI12 [(match_operand:XF 1 "register_operand" "f")] + UNSPEC_FIST_CEIL)) + (use (match_operand:HI 2 "memory_operand" "m")) + (use (match_operand:HI 3 "memory_operand" "m"))] + "TARGET_USE_FANCY_MATH_387 + && flag_unsafe_math_optimizations" + "* return output_fix_trunc (insn, operands, 0);" + [(set_attr "type" "fistp") + (set_attr "i387_cw" "ceil") + (set_attr "mode" "")]) + +(define_insn "fist2_ceil_with_temp" + [(set (match_operand:X87MODEI12 0 "nonimmediate_operand" "=m,?r") + (unspec:X87MODEI12 [(match_operand:XF 1 "register_operand" "f,f")] + UNSPEC_FIST_CEIL)) + (use (match_operand:HI 2 "memory_operand" "m,m")) + (use (match_operand:HI 3 "memory_operand" "m,m")) + (clobber (match_operand:X87MODEI12 4 "memory_operand" "=X,m"))] + "TARGET_USE_FANCY_MATH_387 + && flag_unsafe_math_optimizations" + "#" + [(set_attr "type" "fistp") + (set_attr "i387_cw" "ceil") + (set_attr "mode" "")]) + +(define_split + [(set (match_operand:X87MODEI12 0 "register_operand" "") + (unspec:X87MODEI12 [(match_operand:XF 1 "register_operand" "")] + UNSPEC_FIST_CEIL)) + (use (match_operand:HI 2 "memory_operand" "")) + (use (match_operand:HI 3 "memory_operand" "")) + (clobber (match_operand:X87MODEI12 4 "memory_operand" ""))] + "reload_completed" + [(parallel [(set (match_dup 4) (unspec:X87MODEI12 [(match_dup 1)] + UNSPEC_FIST_CEIL)) + (use (match_dup 2)) + (use (match_dup 3))]) + (set (match_dup 0) (match_dup 4))]) + +(define_split + [(set (match_operand:X87MODEI12 0 "memory_operand" "") + (unspec:X87MODEI12 [(match_operand:XF 1 "register_operand" "")] + UNSPEC_FIST_CEIL)) + (use (match_operand:HI 2 "memory_operand" "")) + (use (match_operand:HI 3 "memory_operand" "")) + (clobber (match_operand:X87MODEI12 4 "memory_operand" ""))] + "reload_completed" + [(parallel [(set (match_dup 0) (unspec:X87MODEI12 [(match_dup 1)] + UNSPEC_FIST_CEIL)) + (use (match_dup 2)) + (use (match_dup 3))])]) + +(define_expand "lceilxf2" + [(parallel [(set (match_operand:X87MODEI 0 "nonimmediate_operand" "") + (unspec:X87MODEI [(match_operand:XF 1 "register_operand" "")] + UNSPEC_FIST_CEIL)) + (clobber (reg:CC FLAGS_REG))])] + "TARGET_USE_FANCY_MATH_387 + && (!TARGET_SSE_MATH || TARGET_MIX_SSE_I387) + && flag_unsafe_math_optimizations") + +(define_expand "lceil2" + [(match_operand:SWI48 0 "nonimmediate_operand" "") + (match_operand:MODEF 1 "register_operand" "")] + "SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH + && !flag_trapping_math" +{ + ix86_expand_lfloorceil (operands[0], operands[1], false); + DONE; +}) + +;; Rounding mode control word calculation could clobber FLAGS_REG. 
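When TARGET_ROUND is available, the floor/ceil/trunc/rint expanders in this section skip the x87 control-word dance and emit the sse4_1_round pattern with an immediate that selects the rounding mode: 0x01 round down, 0x02 round up, 0x03 toward zero, 0x04 current MXCSR mode. An illustrative intrinsics equivalent of the trunc case (assumes -msse4.1; not part of the patch):

#include <smmintrin.h>

/* Illustration only: ROUNDSD with immediate 0x03 (_MM_FROUND_TO_ZERO),
   the same operation the trunc expander emits when TARGET_ROUND is set.  */
double
trunc_sse41 (double x)
{
  __m128d v = _mm_set_sd (x);
  v = _mm_round_sd (v, v, _MM_FROUND_TO_ZERO);
  return _mm_cvtsd_f64 (v);
}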
+(define_insn_and_split "frndintxf2_trunc" + [(set (match_operand:XF 0 "register_operand" "") + (unspec:XF [(match_operand:XF 1 "register_operand" "")] + UNSPEC_FRNDINT_TRUNC)) + (clobber (reg:CC FLAGS_REG))] + "TARGET_USE_FANCY_MATH_387 + && flag_unsafe_math_optimizations + && can_create_pseudo_p ()" + "#" + "&& 1" + [(const_int 0)] +{ + ix86_optimize_mode_switching[I387_TRUNC] = 1; + + operands[2] = assign_386_stack_local (HImode, SLOT_CW_STORED); + operands[3] = assign_386_stack_local (HImode, SLOT_CW_TRUNC); + + emit_insn (gen_frndintxf2_trunc_i387 (operands[0], operands[1], + operands[2], operands[3])); + DONE; +} + [(set_attr "type" "frndint") + (set_attr "i387_cw" "trunc") + (set_attr "mode" "XF")]) + +(define_insn "frndintxf2_trunc_i387" + [(set (match_operand:XF 0 "register_operand" "=f") + (unspec:XF [(match_operand:XF 1 "register_operand" "0")] + UNSPEC_FRNDINT_TRUNC)) + (use (match_operand:HI 2 "memory_operand" "m")) + (use (match_operand:HI 3 "memory_operand" "m"))] + "TARGET_USE_FANCY_MATH_387 + && flag_unsafe_math_optimizations" + "fldcw\t%3\n\tfrndint\n\tfldcw\t%2" + [(set_attr "type" "frndint") + (set_attr "i387_cw" "trunc") + (set_attr "mode" "XF")]) + +(define_expand "btruncxf2" + [(use (match_operand:XF 0 "register_operand" "")) + (use (match_operand:XF 1 "register_operand" ""))] + "TARGET_USE_FANCY_MATH_387 + && flag_unsafe_math_optimizations" +{ + if (optimize_insn_for_size_p ()) + FAIL; + emit_insn (gen_frndintxf2_trunc (operands[0], operands[1])); + DONE; +}) + +(define_expand "btrunc2" + [(use (match_operand:MODEF 0 "register_operand" "")) + (use (match_operand:MODEF 1 "register_operand" ""))] + "(TARGET_USE_FANCY_MATH_387 + && (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) + || TARGET_MIX_SSE_I387) + && flag_unsafe_math_optimizations) + || (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH + && !flag_trapping_math)" +{ + if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH + && !flag_trapping_math + && (TARGET_ROUND || optimize_insn_for_speed_p ())) + { + if (TARGET_ROUND) + emit_insn (gen_sse4_1_round2 + (operands[0], operands[1], GEN_INT (0x03))); + else if (optimize_insn_for_size_p ()) + FAIL; + else if (TARGET_64BIT || (mode != DFmode)) + ix86_expand_trunc (operands[0], operands[1]); + else + ix86_expand_truncdf_32 (operands[0], operands[1]); + } + else + { + rtx op0, op1; + + if (optimize_insn_for_size_p ()) + FAIL; + + op0 = gen_reg_rtx (XFmode); + op1 = gen_reg_rtx (XFmode); + emit_insn (gen_extendxf2 (op1, operands[1])); + emit_insn (gen_frndintxf2_trunc (op0, op1)); + + emit_insn (gen_truncxf2_i387_noop (operands[0], op0)); + } + DONE; +}) + +;; Rounding mode control word calculation could clobber FLAGS_REG. 
+(define_insn_and_split "frndintxf2_mask_pm" + [(set (match_operand:XF 0 "register_operand" "") + (unspec:XF [(match_operand:XF 1 "register_operand" "")] + UNSPEC_FRNDINT_MASK_PM)) + (clobber (reg:CC FLAGS_REG))] + "TARGET_USE_FANCY_MATH_387 + && flag_unsafe_math_optimizations + && can_create_pseudo_p ()" + "#" + "&& 1" + [(const_int 0)] +{ + ix86_optimize_mode_switching[I387_MASK_PM] = 1; + + operands[2] = assign_386_stack_local (HImode, SLOT_CW_STORED); + operands[3] = assign_386_stack_local (HImode, SLOT_CW_MASK_PM); + + emit_insn (gen_frndintxf2_mask_pm_i387 (operands[0], operands[1], + operands[2], operands[3])); + DONE; +} + [(set_attr "type" "frndint") + (set_attr "i387_cw" "mask_pm") + (set_attr "mode" "XF")]) + +(define_insn "frndintxf2_mask_pm_i387" + [(set (match_operand:XF 0 "register_operand" "=f") + (unspec:XF [(match_operand:XF 1 "register_operand" "0")] + UNSPEC_FRNDINT_MASK_PM)) + (use (match_operand:HI 2 "memory_operand" "m")) + (use (match_operand:HI 3 "memory_operand" "m"))] + "TARGET_USE_FANCY_MATH_387 + && flag_unsafe_math_optimizations" + "fldcw\t%3\n\tfrndint\n\tfclex\n\tfldcw\t%2" + [(set_attr "type" "frndint") + (set_attr "i387_cw" "mask_pm") + (set_attr "mode" "XF")]) + +(define_expand "nearbyintxf2" + [(use (match_operand:XF 0 "register_operand" "")) + (use (match_operand:XF 1 "register_operand" ""))] + "TARGET_USE_FANCY_MATH_387 + && flag_unsafe_math_optimizations" +{ + emit_insn (gen_frndintxf2_mask_pm (operands[0], operands[1])); + DONE; +}) + +(define_expand "nearbyint2" + [(use (match_operand:MODEF 0 "register_operand" "")) + (use (match_operand:MODEF 1 "register_operand" ""))] + "TARGET_USE_FANCY_MATH_387 + && (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) + || TARGET_MIX_SSE_I387) + && flag_unsafe_math_optimizations" +{ + rtx op0 = gen_reg_rtx (XFmode); + rtx op1 = gen_reg_rtx (XFmode); + + emit_insn (gen_extendxf2 (op1, operands[1])); + emit_insn (gen_frndintxf2_mask_pm (op0, op1)); + + emit_insn (gen_truncxf2_i387_noop (operands[0], op0)); + DONE; +}) + +(define_insn "fxam2_i387" + [(set (match_operand:HI 0 "register_operand" "=a") + (unspec:HI + [(match_operand:X87MODEF 1 "register_operand" "f")] + UNSPEC_FXAM))] + "TARGET_USE_FANCY_MATH_387" + "fxam\n\tfnstsw\t%0" + [(set_attr "type" "multi") + (set_attr "length" "4") + (set_attr "unit" "i387") + (set_attr "mode" "")]) + +(define_insn_and_split "fxam2_i387_with_temp" + [(set (match_operand:HI 0 "register_operand" "") + (unspec:HI + [(match_operand:MODEF 1 "memory_operand" "")] + UNSPEC_FXAM_MEM))] + "TARGET_USE_FANCY_MATH_387 + && can_create_pseudo_p ()" + "#" + "&& 1" + [(set (match_dup 2)(match_dup 1)) + (set (match_dup 0) + (unspec:HI [(match_dup 2)] UNSPEC_FXAM))] +{ + operands[2] = gen_reg_rtx (mode); + + MEM_VOLATILE_P (operands[1]) = 1; +} + [(set_attr "type" "multi") + (set_attr "unit" "i387") + (set_attr "mode" "")]) + +(define_expand "isinfxf2" + [(use (match_operand:SI 0 "register_operand" "")) + (use (match_operand:XF 1 "register_operand" ""))] + "TARGET_USE_FANCY_MATH_387 + && TARGET_C99_FUNCTIONS" +{ + rtx mask = GEN_INT (0x45); + rtx val = GEN_INT (0x05); + + rtx cond; + + rtx scratch = gen_reg_rtx (HImode); + rtx res = gen_reg_rtx (QImode); + + emit_insn (gen_fxamxf2_i387 (scratch, operands[1])); + + emit_insn (gen_andqi_ext_0 (scratch, scratch, mask)); + emit_insn (gen_cmpqi_ext_3 (scratch, val)); + cond = gen_rtx_fmt_ee (EQ, QImode, + gen_rtx_REG (CCmode, FLAGS_REG), + const0_rtx); + emit_insn (gen_rtx_SET (VOIDmode, res, cond)); + emit_insn (gen_zero_extendqisi2 (operands[0], res)); 
+ DONE; +}) + +(define_expand "isinf2" + [(use (match_operand:SI 0 "register_operand" "")) + (use (match_operand:MODEF 1 "nonimmediate_operand" ""))] + "TARGET_USE_FANCY_MATH_387 + && TARGET_C99_FUNCTIONS + && !(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)" +{ + rtx mask = GEN_INT (0x45); + rtx val = GEN_INT (0x05); + + rtx cond; + + rtx scratch = gen_reg_rtx (HImode); + rtx res = gen_reg_rtx (QImode); + + /* Remove excess precision by forcing value through memory. */ + if (memory_operand (operands[1], VOIDmode)) + emit_insn (gen_fxam2_i387_with_temp (scratch, operands[1])); + else + { + enum ix86_stack_slot slot = (virtuals_instantiated + ? SLOT_TEMP + : SLOT_VIRTUAL); + rtx temp = assign_386_stack_local (mode, slot); + + emit_move_insn (temp, operands[1]); + emit_insn (gen_fxam2_i387_with_temp (scratch, temp)); + } + + emit_insn (gen_andqi_ext_0 (scratch, scratch, mask)); + emit_insn (gen_cmpqi_ext_3 (scratch, val)); + cond = gen_rtx_fmt_ee (EQ, QImode, + gen_rtx_REG (CCmode, FLAGS_REG), + const0_rtx); + emit_insn (gen_rtx_SET (VOIDmode, res, cond)); + emit_insn (gen_zero_extendqisi2 (operands[0], res)); + DONE; +}) + +(define_expand "signbitxf2" + [(use (match_operand:SI 0 "register_operand" "")) + (use (match_operand:XF 1 "register_operand" ""))] + "TARGET_USE_FANCY_MATH_387" +{ + rtx scratch = gen_reg_rtx (HImode); + + emit_insn (gen_fxamxf2_i387 (scratch, operands[1])); + emit_insn (gen_andsi3 (operands[0], + gen_lowpart (SImode, scratch), GEN_INT (0x200))); + DONE; +}) + +(define_insn "movmsk_df" + [(set (match_operand:SI 0 "register_operand" "=r") + (unspec:SI + [(match_operand:DF 1 "register_operand" "x")] + UNSPEC_MOVMSK))] + "SSE_FLOAT_MODE_P (DFmode) && TARGET_SSE_MATH" + "%vmovmskpd\t{%1, %0|%0, %1}" + [(set_attr "type" "ssemov") + (set_attr "prefix" "maybe_vex") + (set_attr "mode" "DF")]) + +;; Use movmskpd in SSE mode to avoid store forwarding stall +;; for 32bit targets and movq+shrq sequence for 64bit targets. 
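+;;
+;; For illustration (assumed example, not taken from this file): the
+;; movmskpd trick described above, and the integer extraction it is meant
+;; to replace, look roughly like this at the intrinsics level:
+;;
+;;   #include <emmintrin.h>
+;;   #include <string.h>
+;;
+;;   int signbit_via_movmsk (double x)        /* SSE2 path                 */
+;;   {
+;;     return _mm_movemask_pd (_mm_set_sd (x)) & 1;
+;;   }
+;;
+;;   int signbit_via_integer (double x)       /* the path being avoided    */
+;;   {
+;;     unsigned long long bits;
+;;     memcpy (&bits, &x, sizeof bits);       /* store/reload or movq      */
+;;     return (int) (bits >> 63);
+;;   }
+;;
+;; The second form is the one that goes through memory (store forwarding
+;; stall) on 32-bit targets or needs movq+shrq on 64-bit targets; without
+;; SSE math the expander below falls back to fxam on the 387 instead.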
+(define_expand "signbitdf2" + [(use (match_operand:SI 0 "register_operand" "")) + (use (match_operand:DF 1 "register_operand" ""))] + "TARGET_USE_FANCY_MATH_387 + || (SSE_FLOAT_MODE_P (DFmode) && TARGET_SSE_MATH)" +{ + if (SSE_FLOAT_MODE_P (DFmode) && TARGET_SSE_MATH) + { + emit_insn (gen_movmsk_df (operands[0], operands[1])); + emit_insn (gen_andsi3 (operands[0], operands[0], const1_rtx)); + } + else + { + rtx scratch = gen_reg_rtx (HImode); + + emit_insn (gen_fxamdf2_i387 (scratch, operands[1])); + emit_insn (gen_andsi3 (operands[0], + gen_lowpart (SImode, scratch), GEN_INT (0x200))); + } + DONE; +}) + +(define_expand "signbitsf2" + [(use (match_operand:SI 0 "register_operand" "")) + (use (match_operand:SF 1 "register_operand" ""))] + "TARGET_USE_FANCY_MATH_387 + && !(SSE_FLOAT_MODE_P (SFmode) && TARGET_SSE_MATH)" +{ + rtx scratch = gen_reg_rtx (HImode); + + emit_insn (gen_fxamsf2_i387 (scratch, operands[1])); + emit_insn (gen_andsi3 (operands[0], + gen_lowpart (SImode, scratch), GEN_INT (0x200))); + DONE; +}) + +;; Block operation instructions + +(define_insn "cld" + [(unspec_volatile [(const_int 0)] UNSPECV_CLD)] + "" + "cld" + [(set_attr "length" "1") + (set_attr "length_immediate" "0") + (set_attr "modrm" "0")]) + +(define_expand "movmem" + [(use (match_operand:BLK 0 "memory_operand" "")) + (use (match_operand:BLK 1 "memory_operand" "")) + (use (match_operand:SWI48 2 "nonmemory_operand" "")) + (use (match_operand:SWI48 3 "const_int_operand" "")) + (use (match_operand:SI 4 "const_int_operand" "")) + (use (match_operand:SI 5 "const_int_operand" ""))] + "" +{ + if (ix86_expand_movmem (operands[0], operands[1], operands[2], operands[3], + operands[4], operands[5])) + DONE; + else + FAIL; +}) + +;; Most CPUs don't like single string operations +;; Handle this case here to simplify previous expander. + +(define_expand "strmov" + [(set (match_dup 4) (match_operand 3 "memory_operand" "")) + (set (match_operand 1 "memory_operand" "") (match_dup 4)) + (parallel [(set (match_operand 0 "register_operand" "") (match_dup 5)) + (clobber (reg:CC FLAGS_REG))]) + (parallel [(set (match_operand 2 "register_operand" "") (match_dup 6)) + (clobber (reg:CC FLAGS_REG))])] + "" +{ + rtx adjust = GEN_INT (GET_MODE_SIZE (GET_MODE (operands[1]))); + + /* If .md ever supports :P for Pmode, these can be directly + in the pattern above. */ + operands[5] = gen_rtx_PLUS (Pmode, operands[0], adjust); + operands[6] = gen_rtx_PLUS (Pmode, operands[2], adjust); + + /* Can't use this if the user has appropriated esi or edi. 
*/ + if ((TARGET_SINGLE_STRINGOP || optimize_insn_for_size_p ()) + && !(fixed_regs[SI_REG] || fixed_regs[DI_REG])) + { + emit_insn (gen_strmov_singleop (operands[0], operands[1], + operands[2], operands[3], + operands[5], operands[6])); + DONE; + } + + operands[4] = gen_reg_rtx (GET_MODE (operands[1])); +}) + +(define_expand "strmov_singleop" + [(parallel [(set (match_operand 1 "memory_operand" "") + (match_operand 3 "memory_operand" "")) + (set (match_operand 0 "register_operand" "") + (match_operand 4 "" "")) + (set (match_operand 2 "register_operand" "") + (match_operand 5 "" ""))])] + "" + "ix86_current_function_needs_cld = 1;") + +(define_insn "*strmovdi_rex_1" + [(set (mem:DI (match_operand:DI 2 "register_operand" "0")) + (mem:DI (match_operand:DI 3 "register_operand" "1"))) + (set (match_operand:DI 0 "register_operand" "=D") + (plus:DI (match_dup 2) + (const_int 8))) + (set (match_operand:DI 1 "register_operand" "=S") + (plus:DI (match_dup 3) + (const_int 8)))] + "TARGET_64BIT + && !(fixed_regs[SI_REG] || fixed_regs[DI_REG])" + "movsq" + [(set_attr "type" "str") + (set_attr "memory" "both") + (set_attr "mode" "DI")]) + +(define_insn "*strmovsi_1" + [(set (mem:SI (match_operand:P 2 "register_operand" "0")) + (mem:SI (match_operand:P 3 "register_operand" "1"))) + (set (match_operand:P 0 "register_operand" "=D") + (plus:P (match_dup 2) + (const_int 4))) + (set (match_operand:P 1 "register_operand" "=S") + (plus:P (match_dup 3) + (const_int 4)))] + "!(fixed_regs[SI_REG] || fixed_regs[DI_REG])" + "movs{l|d}" + [(set_attr "type" "str") + (set_attr "memory" "both") + (set_attr "mode" "SI")]) + +(define_insn "*strmovhi_1" + [(set (mem:HI (match_operand:P 2 "register_operand" "0")) + (mem:HI (match_operand:P 3 "register_operand" "1"))) + (set (match_operand:P 0 "register_operand" "=D") + (plus:P (match_dup 2) + (const_int 2))) + (set (match_operand:P 1 "register_operand" "=S") + (plus:P (match_dup 3) + (const_int 2)))] + "!(fixed_regs[SI_REG] || fixed_regs[DI_REG])" + "movsw" + [(set_attr "type" "str") + (set_attr "memory" "both") + (set_attr "mode" "HI")]) + +(define_insn "*strmovqi_1" + [(set (mem:QI (match_operand:P 2 "register_operand" "0")) + (mem:QI (match_operand:P 3 "register_operand" "1"))) + (set (match_operand:P 0 "register_operand" "=D") + (plus:P (match_dup 2) + (const_int 1))) + (set (match_operand:P 1 "register_operand" "=S") + (plus:P (match_dup 3) + (const_int 1)))] + "!(fixed_regs[SI_REG] || fixed_regs[DI_REG])" + "movsb" + [(set_attr "type" "str") + (set_attr "memory" "both") + (set (attr "prefix_rex") + (if_then_else + (ne (symbol_ref "mode == DImode") (const_int 0)) + (const_string "0") + (const_string "*"))) + (set_attr "mode" "QI")]) + +(define_expand "rep_mov" + [(parallel [(set (match_operand 4 "register_operand" "") (const_int 0)) + (set (match_operand 0 "register_operand" "") + (match_operand 5 "" "")) + (set (match_operand 2 "register_operand" "") + (match_operand 6 "" "")) + (set (match_operand 1 "memory_operand" "") + (match_operand 3 "memory_operand" "")) + (use (match_dup 4))])] + "" + "ix86_current_function_needs_cld = 1;") + +(define_insn "*rep_movdi_rex64" + [(set (match_operand:DI 2 "register_operand" "=c") (const_int 0)) + (set (match_operand:DI 0 "register_operand" "=D") + (plus:DI (ashift:DI (match_operand:DI 5 "register_operand" "2") + (const_int 3)) + (match_operand:DI 3 "register_operand" "0"))) + (set (match_operand:DI 1 "register_operand" "=S") + (plus:DI (ashift:DI (match_dup 5) (const_int 3)) + (match_operand:DI 4 "register_operand" "1"))) + (set 
(mem:BLK (match_dup 3)) + (mem:BLK (match_dup 4))) + (use (match_dup 5))] + "TARGET_64BIT + && !(fixed_regs[CX_REG] || fixed_regs[SI_REG] || fixed_regs[DI_REG])" + "rep{%;} movsq" + [(set_attr "type" "str") + (set_attr "prefix_rep" "1") + (set_attr "memory" "both") + (set_attr "mode" "DI")]) + +(define_insn "*rep_movsi" + [(set (match_operand:P 2 "register_operand" "=c") (const_int 0)) + (set (match_operand:P 0 "register_operand" "=D") + (plus:P (ashift:P (match_operand:P 5 "register_operand" "2") + (const_int 2)) + (match_operand:P 3 "register_operand" "0"))) + (set (match_operand:P 1 "register_operand" "=S") + (plus:P (ashift:P (match_dup 5) (const_int 2)) + (match_operand:P 4 "register_operand" "1"))) + (set (mem:BLK (match_dup 3)) + (mem:BLK (match_dup 4))) + (use (match_dup 5))] + "!(fixed_regs[CX_REG] || fixed_regs[SI_REG] || fixed_regs[DI_REG])" + "rep{%;} movs{l|d}" + [(set_attr "type" "str") + (set_attr "prefix_rep" "1") + (set_attr "memory" "both") + (set_attr "mode" "SI")]) + +(define_insn "*rep_movqi" + [(set (match_operand:P 2 "register_operand" "=c") (const_int 0)) + (set (match_operand:P 0 "register_operand" "=D") + (plus:P (match_operand:P 3 "register_operand" "0") + (match_operand:P 5 "register_operand" "2"))) + (set (match_operand:P 1 "register_operand" "=S") + (plus:P (match_operand:P 4 "register_operand" "1") (match_dup 5))) + (set (mem:BLK (match_dup 3)) + (mem:BLK (match_dup 4))) + (use (match_dup 5))] + "!(fixed_regs[CX_REG] || fixed_regs[SI_REG] || fixed_regs[DI_REG])" + "rep{%;} movsb" + [(set_attr "type" "str") + (set_attr "prefix_rep" "1") + (set_attr "memory" "both") + (set_attr "mode" "QI")]) + +(define_expand "setmem" + [(use (match_operand:BLK 0 "memory_operand" "")) + (use (match_operand:SWI48 1 "nonmemory_operand" "")) + (use (match_operand 2 "const_int_operand" "")) + (use (match_operand 3 "const_int_operand" "")) + (use (match_operand:SI 4 "const_int_operand" "")) + (use (match_operand:SI 5 "const_int_operand" ""))] + "" +{ + if (ix86_expand_setmem (operands[0], operands[1], + operands[2], operands[3], + operands[4], operands[5])) + DONE; + else + FAIL; +}) + +;; Most CPUs don't like single string operations +;; Handle this case here to simplify previous expander. + +(define_expand "strset" + [(set (match_operand 1 "memory_operand" "") + (match_operand 2 "register_operand" "")) + (parallel [(set (match_operand 0 "register_operand" "") + (match_dup 3)) + (clobber (reg:CC FLAGS_REG))])] + "" +{ + if (GET_MODE (operands[1]) != GET_MODE (operands[2])) + operands[1] = adjust_address_nv (operands[1], GET_MODE (operands[2]), 0); + + /* If .md ever supports :P for Pmode, this can be directly + in the pattern above. */ + operands[3] = gen_rtx_PLUS (Pmode, operands[0], + GEN_INT (GET_MODE_SIZE (GET_MODE + (operands[2])))); + /* Can't use this if the user has appropriated eax or edi. 
*/ + if ((TARGET_SINGLE_STRINGOP || optimize_insn_for_size_p ()) + && !(fixed_regs[AX_REG] || fixed_regs[DI_REG])) + { + emit_insn (gen_strset_singleop (operands[0], operands[1], operands[2], + operands[3])); + DONE; + } +}) + +(define_expand "strset_singleop" + [(parallel [(set (match_operand 1 "memory_operand" "") + (match_operand 2 "register_operand" "")) + (set (match_operand 0 "register_operand" "") + (match_operand 3 "" ""))])] + "" + "ix86_current_function_needs_cld = 1;") + +(define_insn "*strsetdi_rex_1" + [(set (mem:DI (match_operand:DI 1 "register_operand" "0")) + (match_operand:DI 2 "register_operand" "a")) + (set (match_operand:DI 0 "register_operand" "=D") + (plus:DI (match_dup 1) + (const_int 8)))] + "TARGET_64BIT + && !(fixed_regs[AX_REG] || fixed_regs[DI_REG])" + "stosq" + [(set_attr "type" "str") + (set_attr "memory" "store") + (set_attr "mode" "DI")]) + +(define_insn "*strsetsi_1" + [(set (mem:SI (match_operand:P 1 "register_operand" "0")) + (match_operand:SI 2 "register_operand" "a")) + (set (match_operand:P 0 "register_operand" "=D") + (plus:P (match_dup 1) + (const_int 4)))] + "!(fixed_regs[AX_REG] || fixed_regs[DI_REG])" + "stos{l|d}" + [(set_attr "type" "str") + (set_attr "memory" "store") + (set_attr "mode" "SI")]) + +(define_insn "*strsethi_1" + [(set (mem:HI (match_operand:P 1 "register_operand" "0")) + (match_operand:HI 2 "register_operand" "a")) + (set (match_operand:P 0 "register_operand" "=D") + (plus:P (match_dup 1) + (const_int 2)))] + "!(fixed_regs[AX_REG] || fixed_regs[DI_REG])" + "stosw" + [(set_attr "type" "str") + (set_attr "memory" "store") + (set_attr "mode" "HI")]) + +(define_insn "*strsetqi_1" + [(set (mem:QI (match_operand:P 1 "register_operand" "0")) + (match_operand:QI 2 "register_operand" "a")) + (set (match_operand:P 0 "register_operand" "=D") + (plus:P (match_dup 1) + (const_int 1)))] + "!(fixed_regs[AX_REG] || fixed_regs[DI_REG])" + "stosb" + [(set_attr "type" "str") + (set_attr "memory" "store") + (set (attr "prefix_rex") + (if_then_else + (ne (symbol_ref "mode == DImode") (const_int 0)) + (const_string "0") + (const_string "*"))) + (set_attr "mode" "QI")]) + +(define_expand "rep_stos" + [(parallel [(set (match_operand 1 "register_operand" "") (const_int 0)) + (set (match_operand 0 "register_operand" "") + (match_operand 4 "" "")) + (set (match_operand 2 "memory_operand" "") (const_int 0)) + (use (match_operand 3 "register_operand" "")) + (use (match_dup 1))])] + "" + "ix86_current_function_needs_cld = 1;") + +(define_insn "*rep_stosdi_rex64" + [(set (match_operand:DI 1 "register_operand" "=c") (const_int 0)) + (set (match_operand:DI 0 "register_operand" "=D") + (plus:DI (ashift:DI (match_operand:DI 4 "register_operand" "1") + (const_int 3)) + (match_operand:DI 3 "register_operand" "0"))) + (set (mem:BLK (match_dup 3)) + (const_int 0)) + (use (match_operand:DI 2 "register_operand" "a")) + (use (match_dup 4))] + "TARGET_64BIT + && !(fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG])" + "rep{%;} stosq" + [(set_attr "type" "str") + (set_attr "prefix_rep" "1") + (set_attr "memory" "store") + (set_attr "mode" "DI")]) + +(define_insn "*rep_stossi" + [(set (match_operand:P 1 "register_operand" "=c") (const_int 0)) + (set (match_operand:P 0 "register_operand" "=D") + (plus:P (ashift:P (match_operand:P 4 "register_operand" "1") + (const_int 2)) + (match_operand:P 3 "register_operand" "0"))) + (set (mem:BLK (match_dup 3)) + (const_int 0)) + (use (match_operand:SI 2 "register_operand" "a")) + (use (match_dup 4))] + "!(fixed_regs[AX_REG] 
|| fixed_regs[CX_REG] || fixed_regs[DI_REG])" + "rep{%;} stos{l|d}" + [(set_attr "type" "str") + (set_attr "prefix_rep" "1") + (set_attr "memory" "store") + (set_attr "mode" "SI")]) + +(define_insn "*rep_stosqi" + [(set (match_operand:P 1 "register_operand" "=c") (const_int 0)) + (set (match_operand:P 0 "register_operand" "=D") + (plus:P (match_operand:P 3 "register_operand" "0") + (match_operand:P 4 "register_operand" "1"))) + (set (mem:BLK (match_dup 3)) + (const_int 0)) + (use (match_operand:QI 2 "register_operand" "a")) + (use (match_dup 4))] + "!(fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG])" + "rep{%;} stosb" + [(set_attr "type" "str") + (set_attr "prefix_rep" "1") + (set_attr "memory" "store") + (set (attr "prefix_rex") + (if_then_else + (ne (symbol_ref "mode == DImode") (const_int 0)) + (const_string "0") + (const_string "*"))) + (set_attr "mode" "QI")]) + +(define_expand "cmpstrnsi" + [(set (match_operand:SI 0 "register_operand" "") + (compare:SI (match_operand:BLK 1 "general_operand" "") + (match_operand:BLK 2 "general_operand" ""))) + (use (match_operand 3 "general_operand" "")) + (use (match_operand 4 "immediate_operand" ""))] + "" +{ + rtx addr1, addr2, out, outlow, count, countreg, align; + + if (optimize_insn_for_size_p () && !TARGET_INLINE_ALL_STRINGOPS) + FAIL; + + /* Can't use this if the user has appropriated ecx, esi or edi. */ + if (fixed_regs[CX_REG] || fixed_regs[SI_REG] || fixed_regs[DI_REG]) + FAIL; + + out = operands[0]; + if (!REG_P (out)) + out = gen_reg_rtx (SImode); + + addr1 = copy_to_mode_reg (Pmode, XEXP (operands[1], 0)); + addr2 = copy_to_mode_reg (Pmode, XEXP (operands[2], 0)); + if (addr1 != XEXP (operands[1], 0)) + operands[1] = replace_equiv_address_nv (operands[1], addr1); + if (addr2 != XEXP (operands[2], 0)) + operands[2] = replace_equiv_address_nv (operands[2], addr2); + + count = operands[3]; + countreg = ix86_zero_extend_to_Pmode (count); + + /* %%% Iff we are testing strict equality, we can use known alignment + to good advantage. This may be possible with combine, particularly + once cc0 is dead. */ + align = operands[4]; + + if (CONST_INT_P (count)) + { + if (INTVAL (count) == 0) + { + emit_move_insn (operands[0], const0_rtx); + DONE; + } + emit_insn (gen_cmpstrnqi_nz_1 (addr1, addr2, countreg, align, + operands[1], operands[2])); + } + else + { + rtx (*gen_cmp) (rtx, rtx); + + gen_cmp = (TARGET_64BIT + ? gen_cmpdi_1 : gen_cmpsi_1); + + emit_insn (gen_cmp (countreg, countreg)); + emit_insn (gen_cmpstrnqi_1 (addr1, addr2, countreg, align, + operands[1], operands[2])); + } + + outlow = gen_lowpart (QImode, out); + emit_insn (gen_cmpintqi (outlow)); + emit_move_insn (out, gen_rtx_SIGN_EXTEND (SImode, outlow)); + + if (operands[0] != out) + emit_move_insn (operands[0], out); + + DONE; +}) + +;; Produce a tri-state integer (-1, 0, 1) from condition codes. + +(define_expand "cmpintqi" + [(set (match_dup 1) + (gtu:QI (reg:CC FLAGS_REG) (const_int 0))) + (set (match_dup 2) + (ltu:QI (reg:CC FLAGS_REG) (const_int 0))) + (parallel [(set (match_operand:QI 0 "register_operand" "") + (minus:QI (match_dup 1) + (match_dup 2))) + (clobber (reg:CC FLAGS_REG))])] + "" +{ + operands[1] = gen_reg_rtx (QImode); + operands[2] = gen_reg_rtx (QImode); +}) + +;; memcmp recognizers. The `cmpsb' opcode does nothing if the count is +;; zero. Emit extra code to make sure that a zero-length compare is EQ. 
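+;;
+;; Rough C model of what the cmpstrnsi expansion above computes
+;; (illustrative sketch only; the generated code uses repz cmpsb and reads
+;; the flags directly, with cmpintqi building the tri-state result from the
+;; seta/setb pair):
+;;
+;;   #include <stddef.h>
+;;
+;;   int cmpstrn_sketch (const unsigned char *s1, const unsigned char *s2,
+;;                       size_t n)
+;;   {
+;;     for (; n != 0; s1++, s2++, n--)
+;;       if (*s1 != *s2)
+;;         return (*s1 > *s2) - (*s1 < *s2); /* gtu minus ltu, as in cmpintqi */
+;;     return 0;  /* a zero count must compare equal, hence the extra code */
+;;   }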
+ +(define_expand "cmpstrnqi_nz_1" + [(parallel [(set (reg:CC FLAGS_REG) + (compare:CC (match_operand 4 "memory_operand" "") + (match_operand 5 "memory_operand" ""))) + (use (match_operand 2 "register_operand" "")) + (use (match_operand:SI 3 "immediate_operand" "")) + (clobber (match_operand 0 "register_operand" "")) + (clobber (match_operand 1 "register_operand" "")) + (clobber (match_dup 2))])] + "" + "ix86_current_function_needs_cld = 1;") + +(define_insn "*cmpstrnqi_nz_1" + [(set (reg:CC FLAGS_REG) + (compare:CC (mem:BLK (match_operand:P 4 "register_operand" "0")) + (mem:BLK (match_operand:P 5 "register_operand" "1")))) + (use (match_operand:P 6 "register_operand" "2")) + (use (match_operand:SI 3 "immediate_operand" "i")) + (clobber (match_operand:P 0 "register_operand" "=S")) + (clobber (match_operand:P 1 "register_operand" "=D")) + (clobber (match_operand:P 2 "register_operand" "=c"))] + "!(fixed_regs[CX_REG] || fixed_regs[SI_REG] || fixed_regs[DI_REG])" + "repz{%;} cmpsb" + [(set_attr "type" "str") + (set_attr "mode" "QI") + (set (attr "prefix_rex") + (if_then_else + (ne (symbol_ref "mode == DImode") (const_int 0)) + (const_string "0") + (const_string "*"))) + (set_attr "prefix_rep" "1")]) + +;; The same, but the count is not known to not be zero. + +(define_expand "cmpstrnqi_1" + [(parallel [(set (reg:CC FLAGS_REG) + (if_then_else:CC (ne (match_operand 2 "register_operand" "") + (const_int 0)) + (compare:CC (match_operand 4 "memory_operand" "") + (match_operand 5 "memory_operand" "")) + (const_int 0))) + (use (match_operand:SI 3 "immediate_operand" "")) + (use (reg:CC FLAGS_REG)) + (clobber (match_operand 0 "register_operand" "")) + (clobber (match_operand 1 "register_operand" "")) + (clobber (match_dup 2))])] + "" + "ix86_current_function_needs_cld = 1;") + +(define_insn "*cmpstrnqi_1" + [(set (reg:CC FLAGS_REG) + (if_then_else:CC (ne (match_operand:P 6 "register_operand" "2") + (const_int 0)) + (compare:CC (mem:BLK (match_operand:P 4 "register_operand" "0")) + (mem:BLK (match_operand:P 5 "register_operand" "1"))) + (const_int 0))) + (use (match_operand:SI 3 "immediate_operand" "i")) + (use (reg:CC FLAGS_REG)) + (clobber (match_operand:P 0 "register_operand" "=S")) + (clobber (match_operand:P 1 "register_operand" "=D")) + (clobber (match_operand:P 2 "register_operand" "=c"))] + "!(fixed_regs[CX_REG] || fixed_regs[SI_REG] || fixed_regs[DI_REG])" + "repz{%;} cmpsb" + [(set_attr "type" "str") + (set_attr "mode" "QI") + (set (attr "prefix_rex") + (if_then_else + (ne (symbol_ref "mode == DImode") (const_int 0)) + (const_string "0") + (const_string "*"))) + (set_attr "prefix_rep" "1")]) + +(define_expand "strlen" + [(set (match_operand:P 0 "register_operand" "") + (unspec:P [(match_operand:BLK 1 "general_operand" "") + (match_operand:QI 2 "immediate_operand" "") + (match_operand 3 "immediate_operand" "")] + UNSPEC_SCAS))] + "" +{ + if (ix86_expand_strlen (operands[0], operands[1], operands[2], operands[3])) + DONE; + else + FAIL; +}) + +(define_expand "strlenqi_1" + [(parallel [(set (match_operand 0 "register_operand" "") + (match_operand 2 "" "")) + (clobber (match_operand 1 "register_operand" "")) + (clobber (reg:CC FLAGS_REG))])] + "" + "ix86_current_function_needs_cld = 1;") + +(define_insn "*strlenqi_1" + [(set (match_operand:P 0 "register_operand" "=&c") + (unspec:P [(mem:BLK (match_operand:P 5 "register_operand" "1")) + (match_operand:QI 2 "register_operand" "a") + (match_operand:P 3 "immediate_operand" "i") + (match_operand:P 4 "register_operand" "0")] UNSPEC_SCAS)) + (clobber 
(match_operand:P 1 "register_operand" "=D")) + (clobber (reg:CC FLAGS_REG))] + "!(fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG])" + "repnz{%;} scasb" + [(set_attr "type" "str") + (set_attr "mode" "QI") + (set (attr "prefix_rex") + (if_then_else + (ne (symbol_ref "mode == DImode") (const_int 0)) + (const_string "0") + (const_string "*"))) + (set_attr "prefix_rep" "1")]) + +;; Peephole optimizations to clean up after cmpstrn*. This should be +;; handled in combine, but it is not currently up to the task. +;; When used for their truth value, the cmpstrn* expanders generate +;; code like this: +;; +;; repz cmpsb +;; seta %al +;; setb %dl +;; cmpb %al, %dl +;; jcc label +;; +;; The intermediate three instructions are unnecessary. + +;; This one handles cmpstrn*_nz_1... +(define_peephole2 + [(parallel[ + (set (reg:CC FLAGS_REG) + (compare:CC (mem:BLK (match_operand 4 "register_operand" "")) + (mem:BLK (match_operand 5 "register_operand" "")))) + (use (match_operand 6 "register_operand" "")) + (use (match_operand:SI 3 "immediate_operand" "")) + (clobber (match_operand 0 "register_operand" "")) + (clobber (match_operand 1 "register_operand" "")) + (clobber (match_operand 2 "register_operand" ""))]) + (set (match_operand:QI 7 "register_operand" "") + (gtu:QI (reg:CC FLAGS_REG) (const_int 0))) + (set (match_operand:QI 8 "register_operand" "") + (ltu:QI (reg:CC FLAGS_REG) (const_int 0))) + (set (reg FLAGS_REG) + (compare (match_dup 7) (match_dup 8))) + ] + "peep2_reg_dead_p (4, operands[7]) && peep2_reg_dead_p (4, operands[8])" + [(parallel[ + (set (reg:CC FLAGS_REG) + (compare:CC (mem:BLK (match_dup 4)) + (mem:BLK (match_dup 5)))) + (use (match_dup 6)) + (use (match_dup 3)) + (clobber (match_dup 0)) + (clobber (match_dup 1)) + (clobber (match_dup 2))])]) + +;; ...and this one handles cmpstrn*_1. +(define_peephole2 + [(parallel[ + (set (reg:CC FLAGS_REG) + (if_then_else:CC (ne (match_operand 6 "register_operand" "") + (const_int 0)) + (compare:CC (mem:BLK (match_operand 4 "register_operand" "")) + (mem:BLK (match_operand 5 "register_operand" ""))) + (const_int 0))) + (use (match_operand:SI 3 "immediate_operand" "")) + (use (reg:CC FLAGS_REG)) + (clobber (match_operand 0 "register_operand" "")) + (clobber (match_operand 1 "register_operand" "")) + (clobber (match_operand 2 "register_operand" ""))]) + (set (match_operand:QI 7 "register_operand" "") + (gtu:QI (reg:CC FLAGS_REG) (const_int 0))) + (set (match_operand:QI 8 "register_operand" "") + (ltu:QI (reg:CC FLAGS_REG) (const_int 0))) + (set (reg FLAGS_REG) + (compare (match_dup 7) (match_dup 8))) + ] + "peep2_reg_dead_p (4, operands[7]) && peep2_reg_dead_p (4, operands[8])" + [(parallel[ + (set (reg:CC FLAGS_REG) + (if_then_else:CC (ne (match_dup 6) + (const_int 0)) + (compare:CC (mem:BLK (match_dup 4)) + (mem:BLK (match_dup 5))) + (const_int 0))) + (use (match_dup 3)) + (use (reg:CC FLAGS_REG)) + (clobber (match_dup 0)) + (clobber (match_dup 1)) + (clobber (match_dup 2))])]) + +;; Conditional move instructions. + +(define_expand "movcc" + [(set (match_operand:SWIM 0 "register_operand" "") + (if_then_else:SWIM (match_operand 1 "ordered_comparison_operator" "") + (match_operand:SWIM 2 "general_operand" "") + (match_operand:SWIM 3 "general_operand" "")))] + "" + "if (ix86_expand_int_movcc (operands)) DONE; else FAIL;") + +;; Data flow gets confused by our desire for `sbbl reg,reg', and clearing +;; the register first winds up with `sbbl $0,reg', which is also weird. +;; So just document what we're doing explicitly. 
+ +(define_expand "x86_movcc_0_m1" + [(parallel + [(set (match_operand:SWI48 0 "register_operand" "") + (if_then_else:SWI48 + (match_operator:SWI48 2 "ix86_carry_flag_operator" + [(match_operand 1 "flags_reg_operand" "") + (const_int 0)]) + (const_int -1) + (const_int 0))) + (clobber (reg:CC FLAGS_REG))])]) + +(define_insn "*x86_movcc_0_m1" + [(set (match_operand:SWI48 0 "register_operand" "=r") + (if_then_else:SWI48 (match_operator 1 "ix86_carry_flag_operator" + [(reg FLAGS_REG) (const_int 0)]) + (const_int -1) + (const_int 0))) + (clobber (reg:CC FLAGS_REG))] + "" + "sbb{}\t%0, %0" + ; Since we don't have the proper number of operands for an alu insn, + ; fill in all the blanks. + [(set_attr "type" "alu") + (set_attr "use_carry" "1") + (set_attr "pent_pair" "pu") + (set_attr "memory" "none") + (set_attr "imm_disp" "false") + (set_attr "mode" "") + (set_attr "length_immediate" "0")]) + +(define_insn "*x86_movcc_0_m1_se" + [(set (match_operand:SWI48 0 "register_operand" "=r") + (sign_extract:SWI48 (match_operator 1 "ix86_carry_flag_operator" + [(reg FLAGS_REG) (const_int 0)]) + (const_int 1) + (const_int 0))) + (clobber (reg:CC FLAGS_REG))] + "" + "sbb{}\t%0, %0" + [(set_attr "type" "alu") + (set_attr "use_carry" "1") + (set_attr "pent_pair" "pu") + (set_attr "memory" "none") + (set_attr "imm_disp" "false") + (set_attr "mode" "") + (set_attr "length_immediate" "0")]) + +(define_insn "*x86_movcc_0_m1_neg" + [(set (match_operand:SWI48 0 "register_operand" "=r") + (neg:SWI48 (match_operator 1 "ix86_carry_flag_operator" + [(reg FLAGS_REG) (const_int 0)]))) + (clobber (reg:CC FLAGS_REG))] + "" + "sbb{}\t%0, %0" + [(set_attr "type" "alu") + (set_attr "use_carry" "1") + (set_attr "pent_pair" "pu") + (set_attr "memory" "none") + (set_attr "imm_disp" "false") + (set_attr "mode" "") + (set_attr "length_immediate" "0")]) + +(define_insn "*movcc_noc" + [(set (match_operand:SWI248 0 "register_operand" "=r,r") + (if_then_else:SWI248 (match_operator 1 "ix86_comparison_operator" + [(reg FLAGS_REG) (const_int 0)]) + (match_operand:SWI248 2 "nonimmediate_operand" "rm,0") + (match_operand:SWI248 3 "nonimmediate_operand" "0,rm")))] + "TARGET_CMOVE && !(MEM_P (operands[2]) && MEM_P (operands[3]))" + "@ + cmov%O2%C1\t{%2, %0|%0, %2} + cmov%O2%c1\t{%3, %0|%0, %3}" + [(set_attr "type" "icmov") + (set_attr "mode" "")]) + +(define_insn_and_split "*movqicc_noc" + [(set (match_operand:QI 0 "register_operand" "=r,r") + (if_then_else:QI (match_operator 1 "ix86_comparison_operator" + [(match_operand 4 "flags_reg_operand" "") + (const_int 0)]) + (match_operand:QI 2 "register_operand" "r,0") + (match_operand:QI 3 "register_operand" "0,r")))] + "TARGET_CMOVE && !TARGET_PARTIAL_REG_STALL" + "#" + "&& reload_completed" + [(set (match_dup 0) + (if_then_else:SI (match_op_dup 1 [(match_dup 4) (const_int 0)]) + (match_dup 2) + (match_dup 3)))] + "operands[0] = gen_lowpart (SImode, operands[0]); + operands[2] = gen_lowpart (SImode, operands[2]); + operands[3] = gen_lowpart (SImode, operands[3]);" + [(set_attr "type" "icmov") + (set_attr "mode" "SI")]) + +(define_expand "movcc" + [(set (match_operand:X87MODEF 0 "register_operand" "") + (if_then_else:X87MODEF + (match_operand 1 "ix86_fp_comparison_operator" "") + (match_operand:X87MODEF 2 "register_operand" "") + (match_operand:X87MODEF 3 "register_operand" "")))] + "(TARGET_80387 && TARGET_CMOVE) + || (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)" + "if (ix86_expand_fp_movcc (operands)) DONE; else FAIL;") + +(define_insn "*movxfcc_1" + [(set (match_operand:XF 0 "register_operand" 
"=f,f") + (if_then_else:XF (match_operator 1 "fcmov_comparison_operator" + [(reg FLAGS_REG) (const_int 0)]) + (match_operand:XF 2 "register_operand" "f,0") + (match_operand:XF 3 "register_operand" "0,f")))] + "TARGET_80387 && TARGET_CMOVE" + "@ + fcmov%F1\t{%2, %0|%0, %2} + fcmov%f1\t{%3, %0|%0, %3}" + [(set_attr "type" "fcmov") + (set_attr "mode" "XF")]) + +(define_insn "*movdfcc_1_rex64" + [(set (match_operand:DF 0 "register_operand" "=f,f,r,r") + (if_then_else:DF (match_operator 1 "fcmov_comparison_operator" + [(reg FLAGS_REG) (const_int 0)]) + (match_operand:DF 2 "nonimmediate_operand" "f,0,rm,0") + (match_operand:DF 3 "nonimmediate_operand" "0,f,0,rm")))] + "TARGET_64BIT && TARGET_80387 && TARGET_CMOVE + && !(MEM_P (operands[2]) && MEM_P (operands[3]))" + "@ + fcmov%F1\t{%2, %0|%0, %2} + fcmov%f1\t{%3, %0|%0, %3} + cmov%O2%C1\t{%2, %0|%0, %2} + cmov%O2%c1\t{%3, %0|%0, %3}" + [(set_attr "type" "fcmov,fcmov,icmov,icmov") + (set_attr "mode" "DF,DF,DI,DI")]) + +(define_insn "*movdfcc_1" + [(set (match_operand:DF 0 "register_operand" "=f,f,&r,&r") + (if_then_else:DF (match_operator 1 "fcmov_comparison_operator" + [(reg FLAGS_REG) (const_int 0)]) + (match_operand:DF 2 "nonimmediate_operand" "f,0,rm,0") + (match_operand:DF 3 "nonimmediate_operand" "0,f,0,rm")))] + "!TARGET_64BIT && TARGET_80387 && TARGET_CMOVE + && !(MEM_P (operands[2]) && MEM_P (operands[3]))" + "@ + fcmov%F1\t{%2, %0|%0, %2} + fcmov%f1\t{%3, %0|%0, %3} + # + #" + [(set_attr "type" "fcmov,fcmov,multi,multi") + (set_attr "mode" "DF,DF,DI,DI")]) + +(define_split + [(set (match_operand:DF 0 "register_and_not_any_fp_reg_operand" "") + (if_then_else:DF (match_operator 1 "fcmov_comparison_operator" + [(match_operand 4 "flags_reg_operand" "") + (const_int 0)]) + (match_operand:DF 2 "nonimmediate_operand" "") + (match_operand:DF 3 "nonimmediate_operand" "")))] + "!TARGET_64BIT && reload_completed" + [(set (match_dup 2) + (if_then_else:SI (match_op_dup 1 [(match_dup 4) (const_int 0)]) + (match_dup 5) + (match_dup 6))) + (set (match_dup 3) + (if_then_else:SI (match_op_dup 1 [(match_dup 4) (const_int 0)]) + (match_dup 7) + (match_dup 8)))] +{ + split_double_mode (DImode, &operands[2], 2, &operands[5], &operands[7]); + split_double_mode (DImode, &operands[0], 1, &operands[2], &operands[3]); +}) + +(define_insn "*movsfcc_1_387" + [(set (match_operand:SF 0 "register_operand" "=f,f,r,r") + (if_then_else:SF (match_operator 1 "fcmov_comparison_operator" + [(reg FLAGS_REG) (const_int 0)]) + (match_operand:SF 2 "nonimmediate_operand" "f,0,rm,0") + (match_operand:SF 3 "nonimmediate_operand" "0,f,0,rm")))] + "TARGET_80387 && TARGET_CMOVE + && !(MEM_P (operands[2]) && MEM_P (operands[3]))" + "@ + fcmov%F1\t{%2, %0|%0, %2} + fcmov%f1\t{%3, %0|%0, %3} + cmov%O2%C1\t{%2, %0|%0, %2} + cmov%O2%c1\t{%3, %0|%0, %3}" + [(set_attr "type" "fcmov,fcmov,icmov,icmov") + (set_attr "mode" "SF,SF,SI,SI")]) + +;; All moves in XOP pcmov instructions are 128 bits and hence we restrict +;; the scalar versions to have only XMM registers as operands. + +;; XOP conditional move +(define_insn "*xop_pcmov_" + [(set (match_operand:MODEF 0 "register_operand" "=x") + (if_then_else:MODEF + (match_operand:MODEF 1 "register_operand" "x") + (match_operand:MODEF 2 "register_operand" "x") + (match_operand:MODEF 3 "register_operand" "x")))] + "TARGET_XOP" + "vpcmov\t{%1, %3, %2, %0|%0, %2, %3, %1}" + [(set_attr "type" "sse4arg")]) + +;; These versions of the min/max patterns are intentionally ignorant of +;; their behavior wrt -0.0 and NaN (via the commutative operand mark). 
+;; Since both the tree-level MAX_EXPR and the rtl-level SMAX operator +;; are undefined in this condition, we're certain this is correct. + +(define_insn "*avx_3" + [(set (match_operand:MODEF 0 "register_operand" "=x") + (smaxmin:MODEF + (match_operand:MODEF 1 "nonimmediate_operand" "%x") + (match_operand:MODEF 2 "nonimmediate_operand" "xm")))] + "AVX_FLOAT_MODE_P (mode) && TARGET_SSE_MATH" + "vs\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseadd") + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + +(define_insn "3" + [(set (match_operand:MODEF 0 "register_operand" "=x") + (smaxmin:MODEF + (match_operand:MODEF 1 "nonimmediate_operand" "%0") + (match_operand:MODEF 2 "nonimmediate_operand" "xm")))] + "SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH" + "s\t{%2, %0|%0, %2}" + [(set_attr "type" "sseadd") + (set_attr "mode" "")]) + +;; These versions of the min/max patterns implement exactly the operations +;; min = (op1 < op2 ? op1 : op2) +;; max = (!(op1 < op2) ? op1 : op2) +;; Their operands are not commutative, and thus they may be used in the +;; presence of -0.0 and NaN. + +(define_insn "*avx_ieee_smin3" + [(set (match_operand:MODEF 0 "register_operand" "=x") + (unspec:MODEF + [(match_operand:MODEF 1 "register_operand" "x") + (match_operand:MODEF 2 "nonimmediate_operand" "xm")] + UNSPEC_IEEE_MIN))] + "AVX_FLOAT_MODE_P (mode) && TARGET_SSE_MATH" + "vmins\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseadd") + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + +(define_insn "*ieee_smin3" + [(set (match_operand:MODEF 0 "register_operand" "=x") + (unspec:MODEF + [(match_operand:MODEF 1 "register_operand" "0") + (match_operand:MODEF 2 "nonimmediate_operand" "xm")] + UNSPEC_IEEE_MIN))] + "SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH" + "mins\t{%2, %0|%0, %2}" + [(set_attr "type" "sseadd") + (set_attr "mode" "")]) + +(define_insn "*avx_ieee_smax3" + [(set (match_operand:MODEF 0 "register_operand" "=x") + (unspec:MODEF + [(match_operand:MODEF 1 "register_operand" "0") + (match_operand:MODEF 2 "nonimmediate_operand" "xm")] + UNSPEC_IEEE_MAX))] + "AVX_FLOAT_MODE_P (mode) && TARGET_SSE_MATH" + "vmaxs\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseadd") + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + +(define_insn "*ieee_smax3" + [(set (match_operand:MODEF 0 "register_operand" "=x") + (unspec:MODEF + [(match_operand:MODEF 1 "register_operand" "0") + (match_operand:MODEF 2 "nonimmediate_operand" "xm")] + UNSPEC_IEEE_MAX))] + "SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH" + "maxs\t{%2, %0|%0, %2}" + [(set_attr "type" "sseadd") + (set_attr "mode" "")]) + +;; Make two stack loads independent: +;; fld aa fld aa +;; fld %st(0) -> fld bb +;; fmul bb fmul %st(1), %st +;; +;; Actually we only match the last two instructions for simplicity. +(define_peephole2 + [(set (match_operand 0 "fp_register_operand" "") + (match_operand 1 "fp_register_operand" "")) + (set (match_dup 0) + (match_operator 2 "binary_fp_operator" + [(match_dup 0) + (match_operand 3 "memory_operand" "")]))] + "REGNO (operands[0]) != REGNO (operands[1])" + [(set (match_dup 0) (match_dup 3)) + (set (match_dup 0) (match_dup 4))] + + ;; The % modifier is not operational anymore in peephole2's, so we have to + ;; swap the operands manually in the case of addition and multiplication. 
+ "if (COMMUTATIVE_ARITH_P (operands[2])) + operands[4] = gen_rtx_fmt_ee (GET_CODE (operands[2]), + GET_MODE (operands[2]), + operands[0], operands[1]); + else + operands[4] = gen_rtx_fmt_ee (GET_CODE (operands[2]), + GET_MODE (operands[2]), + operands[1], operands[0]);") + +;; Conditional addition patterns +(define_expand "addcc" + [(match_operand:SWI 0 "register_operand" "") + (match_operand 1 "ordered_comparison_operator" "") + (match_operand:SWI 2 "register_operand" "") + (match_operand:SWI 3 "const_int_operand" "")] + "" + "if (ix86_expand_int_addcc (operands)) DONE; else FAIL;") + +;; Misc patterns (?) + +;; This pattern exists to put a dependency on all ebp-based memory accesses. +;; Otherwise there will be nothing to keep +;; +;; [(set (reg ebp) (reg esp))] +;; [(set (reg esp) (plus (reg esp) (const_int -160000))) +;; (clobber (eflags)] +;; [(set (mem (plus (reg ebp) (const_int -160000))) (const_int 0))] +;; +;; in proper program order. + +(define_insn "pro_epilogue_adjust_stack__add" + [(set (match_operand:P 0 "register_operand" "=r,r") + (plus:P (match_operand:P 1 "register_operand" "0,r") + (match_operand:P 2 "" "r,l"))) + (clobber (reg:CC FLAGS_REG)) + (clobber (mem:BLK (scratch)))] + "" +{ + switch (get_attr_type (insn)) + { + case TYPE_IMOV: + return "mov{}\t{%1, %0|%0, %1}"; + + case TYPE_ALU: + gcc_assert (rtx_equal_p (operands[0], operands[1])); + if (x86_maybe_negate_const_int (&operands[2], mode)) + return "sub{}\t{%2, %0|%0, %2}"; + + return "add{}\t{%2, %0|%0, %2}"; + + default: + operands[2] = SET_SRC (XVECEXP (PATTERN (insn), 0, 0)); + return "lea{}\t{%a2, %0|%0, %a2}"; + } +} + [(set (attr "type") + (cond [(and (eq_attr "alternative" "0") + (eq (symbol_ref "TARGET_OPT_AGU") (const_int 0))) + (const_string "alu") + (match_operand: 2 "const0_operand" "") + (const_string "imov") + ] + (const_string "lea"))) + (set (attr "length_immediate") + (cond [(eq_attr "type" "imov") + (const_string "0") + (and (eq_attr "type" "alu") + (match_operand 2 "const128_operand" "")) + (const_string "1") + ] + (const_string "*"))) + (set_attr "mode" "")]) + +(define_insn "pro_epilogue_adjust_stack__sub" + [(set (match_operand:P 0 "register_operand" "=r") + (minus:P (match_operand:P 1 "register_operand" "0") + (match_operand:P 2 "register_operand" "r"))) + (clobber (reg:CC FLAGS_REG)) + (clobber (mem:BLK (scratch)))] + "" + "sub{}\t{%2, %0|%0, %2}" + [(set_attr "type" "alu") + (set_attr "mode" "")]) + +(define_insn "allocate_stack_worker_probe_" + [(set (match_operand:P 0 "register_operand" "=a") + (unspec_volatile:P [(match_operand:P 1 "register_operand" "0")] + UNSPECV_STACK_PROBE)) + (clobber (reg:CC FLAGS_REG))] + "ix86_target_stack_probe ()" + "call\t___chkstk_ms" + [(set_attr "type" "multi") + (set_attr "length" "5")]) + +(define_expand "allocate_stack" + [(match_operand 0 "register_operand" "") + (match_operand 1 "general_operand" "")] + "ix86_target_stack_probe ()" +{ + rtx x; + +#ifndef CHECK_STACK_LIMIT +#define CHECK_STACK_LIMIT 0 +#endif + + if (CHECK_STACK_LIMIT && CONST_INT_P (operands[1]) + && INTVAL (operands[1]) < CHECK_STACK_LIMIT) + { + x = expand_simple_binop (Pmode, MINUS, stack_pointer_rtx, operands[1], + stack_pointer_rtx, 0, OPTAB_DIRECT); + if (x != stack_pointer_rtx) + emit_move_insn (stack_pointer_rtx, x); + } + else + { + x = copy_to_mode_reg (Pmode, operands[1]); + if (TARGET_64BIT) + emit_insn (gen_allocate_stack_worker_probe_di (x, x)); + else + emit_insn (gen_allocate_stack_worker_probe_si (x, x)); + x = expand_simple_binop (Pmode, MINUS, stack_pointer_rtx, 
x, + stack_pointer_rtx, 0, OPTAB_DIRECT); + if (x != stack_pointer_rtx) + emit_move_insn (stack_pointer_rtx, x); + } + + emit_move_insn (operands[0], virtual_stack_dynamic_rtx); + DONE; +}) + +;; Use IOR for stack probes, this is shorter. +(define_expand "probe_stack" + [(match_operand 0 "memory_operand" "")] + "" +{ + rtx (*gen_ior3) (rtx, rtx, rtx); + + gen_ior3 = (GET_MODE (operands[0]) == DImode + ? gen_iordi3 : gen_iorsi3); + + emit_insn (gen_ior3 (operands[0], operands[0], const0_rtx)); + DONE; +}) + +(define_insn "adjust_stack_and_probe" + [(set (match_operand:P 0 "register_operand" "=r") + (unspec_volatile:P [(match_operand:P 1 "register_operand" "0")] + UNSPECV_PROBE_STACK_RANGE)) + (set (reg:P SP_REG) + (minus:P (reg:P SP_REG) (match_operand:P 2 "const_int_operand" "n"))) + (clobber (reg:CC FLAGS_REG)) + (clobber (mem:BLK (scratch)))] + "" + "* return output_adjust_stack_and_probe (operands[0]);" + [(set_attr "type" "multi")]) + +(define_insn "probe_stack_range" + [(set (match_operand:P 0 "register_operand" "=r") + (unspec_volatile:P [(match_operand:P 1 "register_operand" "0") + (match_operand:P 2 "const_int_operand" "n")] + UNSPECV_PROBE_STACK_RANGE)) + (clobber (reg:CC FLAGS_REG))] + "" + "* return output_probe_stack_range (operands[0], operands[2]);" + [(set_attr "type" "multi")]) + +(define_expand "builtin_setjmp_receiver" + [(label_ref (match_operand 0 "" ""))] + "!TARGET_64BIT && flag_pic" +{ +#if TARGET_MACHO + if (TARGET_MACHO) + { + rtx xops[3]; + rtx picreg = gen_rtx_REG (Pmode, PIC_OFFSET_TABLE_REGNUM); + rtx label_rtx = gen_label_rtx (); + emit_insn (gen_set_got_labelled (pic_offset_table_rtx, label_rtx)); + xops[0] = xops[1] = picreg; + xops[2] = machopic_gen_offset (gen_rtx_LABEL_REF (SImode, label_rtx)); + ix86_expand_binary_operator (MINUS, SImode, xops); + } + else +#endif + emit_insn (gen_set_got (pic_offset_table_rtx)); + DONE; +}) + +;; Avoid redundant prefixes by splitting HImode arithmetic to SImode. + +(define_split + [(set (match_operand 0 "register_operand" "") + (match_operator 3 "promotable_binary_operator" + [(match_operand 1 "register_operand" "") + (match_operand 2 "aligned_operand" "")])) + (clobber (reg:CC FLAGS_REG))] + "! TARGET_PARTIAL_REG_STALL && reload_completed + && ((GET_MODE (operands[0]) == HImode + && ((optimize_function_for_speed_p (cfun) && !TARGET_FAST_PREFIX) + /* ??? next two lines just !satisfies_constraint_K (...) */ + || !CONST_INT_P (operands[2]) + || satisfies_constraint_K (operands[2]))) + || (GET_MODE (operands[0]) == QImode + && (TARGET_PROMOTE_QImode || optimize_function_for_size_p (cfun))))" + [(parallel [(set (match_dup 0) + (match_op_dup 3 [(match_dup 1) (match_dup 2)])) + (clobber (reg:CC FLAGS_REG))])] + "operands[0] = gen_lowpart (SImode, operands[0]); + operands[1] = gen_lowpart (SImode, operands[1]); + if (GET_CODE (operands[3]) != ASHIFT) + operands[2] = gen_lowpart (SImode, operands[2]); + PUT_MODE (operands[3], SImode);") + +; Promote the QImode tests, as i386 has encoding of the AND +; instruction with 32-bit sign-extended immediate and thus the +; instruction size is unchanged, except in the %eax case for +; which it is increased by one byte, hence the ! optimize_size. +(define_split + [(set (match_operand 0 "flags_reg_operand" "") + (match_operator 2 "compare_operator" + [(and (match_operand 3 "aligned_operand" "") + (match_operand 4 "const_int_operand" "")) + (const_int 0)])) + (set (match_operand 1 "register_operand" "") + (and (match_dup 3) (match_dup 4)))] + "! 
TARGET_PARTIAL_REG_STALL && reload_completed + && optimize_insn_for_speed_p () + && ((GET_MODE (operands[1]) == HImode && ! TARGET_FAST_PREFIX) + || (GET_MODE (operands[1]) == QImode && TARGET_PROMOTE_QImode)) + /* Ensure that the operand will remain sign-extended immediate. */ + && ix86_match_ccmode (insn, INTVAL (operands[4]) >= 0 ? CCNOmode : CCZmode)" + [(parallel [(set (match_dup 0) + (match_op_dup 2 [(and:SI (match_dup 3) (match_dup 4)) + (const_int 0)])) + (set (match_dup 1) + (and:SI (match_dup 3) (match_dup 4)))])] +{ + operands[4] + = gen_int_mode (INTVAL (operands[4]) + & GET_MODE_MASK (GET_MODE (operands[1])), SImode); + operands[1] = gen_lowpart (SImode, operands[1]); + operands[3] = gen_lowpart (SImode, operands[3]); +}) + +; Don't promote the QImode tests, as i386 doesn't have encoding of +; the TEST instruction with 32-bit sign-extended immediate and thus +; the instruction size would at least double, which is not what we +; want even with ! optimize_size. +(define_split + [(set (match_operand 0 "flags_reg_operand" "") + (match_operator 1 "compare_operator" + [(and (match_operand:HI 2 "aligned_operand" "") + (match_operand:HI 3 "const_int_operand" "")) + (const_int 0)]))] + "! TARGET_PARTIAL_REG_STALL && reload_completed + && ! TARGET_FAST_PREFIX + && optimize_insn_for_speed_p () + /* Ensure that the operand will remain sign-extended immediate. */ + && ix86_match_ccmode (insn, INTVAL (operands[3]) >= 0 ? CCNOmode : CCZmode)" + [(set (match_dup 0) + (match_op_dup 1 [(and:SI (match_dup 2) (match_dup 3)) + (const_int 0)]))] +{ + operands[3] + = gen_int_mode (INTVAL (operands[3]) + & GET_MODE_MASK (GET_MODE (operands[2])), SImode); + operands[2] = gen_lowpart (SImode, operands[2]); +}) + +(define_split + [(set (match_operand 0 "register_operand" "") + (neg (match_operand 1 "register_operand" ""))) + (clobber (reg:CC FLAGS_REG))] + "! TARGET_PARTIAL_REG_STALL && reload_completed + && (GET_MODE (operands[0]) == HImode + || (GET_MODE (operands[0]) == QImode + && (TARGET_PROMOTE_QImode + || optimize_insn_for_size_p ())))" + [(parallel [(set (match_dup 0) + (neg:SI (match_dup 1))) + (clobber (reg:CC FLAGS_REG))])] + "operands[0] = gen_lowpart (SImode, operands[0]); + operands[1] = gen_lowpart (SImode, operands[1]);") + +(define_split + [(set (match_operand 0 "register_operand" "") + (not (match_operand 1 "register_operand" "")))] + "! TARGET_PARTIAL_REG_STALL && reload_completed + && (GET_MODE (operands[0]) == HImode + || (GET_MODE (operands[0]) == QImode + && (TARGET_PROMOTE_QImode + || optimize_insn_for_size_p ())))" + [(set (match_dup 0) + (not:SI (match_dup 1)))] + "operands[0] = gen_lowpart (SImode, operands[0]); + operands[1] = gen_lowpart (SImode, operands[1]);") + +(define_split + [(set (match_operand 0 "register_operand" "") + (if_then_else (match_operator 1 "ordered_comparison_operator" + [(reg FLAGS_REG) (const_int 0)]) + (match_operand 2 "register_operand" "") + (match_operand 3 "register_operand" "")))] + "! TARGET_PARTIAL_REG_STALL && TARGET_CMOVE + && (GET_MODE (operands[0]) == HImode + || (GET_MODE (operands[0]) == QImode + && (TARGET_PROMOTE_QImode + || optimize_insn_for_size_p ())))" + [(set (match_dup 0) + (if_then_else:SI (match_dup 1) (match_dup 2) (match_dup 3)))] + "operands[0] = gen_lowpart (SImode, operands[0]); + operands[2] = gen_lowpart (SImode, operands[2]); + operands[3] = gen_lowpart (SImode, operands[3]);") + +;; RTL Peephole optimizations, run before sched2. 
These primarily look to +;; transform a complex memory operation into two memory to register operations. + +;; Don't push memory operands +(define_peephole2 + [(set (match_operand:SWI 0 "push_operand" "") + (match_operand:SWI 1 "memory_operand" "")) + (match_scratch:SWI 2 "")] + "optimize_insn_for_speed_p () && !TARGET_PUSH_MEMORY + && !RTX_FRAME_RELATED_P (peep2_next_insn (0))" + [(set (match_dup 2) (match_dup 1)) + (set (match_dup 0) (match_dup 2))]) + +;; We need to handle SFmode only, because DFmode and XFmode are split to +;; SImode pushes. +(define_peephole2 + [(set (match_operand:SF 0 "push_operand" "") + (match_operand:SF 1 "memory_operand" "")) + (match_scratch:SF 2 "r")] + "optimize_insn_for_speed_p () && !TARGET_PUSH_MEMORY + && !RTX_FRAME_RELATED_P (peep2_next_insn (0))" + [(set (match_dup 2) (match_dup 1)) + (set (match_dup 0) (match_dup 2))]) + +;; Don't move an immediate directly to memory when the instruction +;; gets too big. +(define_peephole2 + [(match_scratch:SWI124 1 "") + (set (match_operand:SWI124 0 "memory_operand" "") + (const_int 0))] + "optimize_insn_for_speed_p () + && !TARGET_USE_MOV0 + && TARGET_SPLIT_LONG_MOVES + && get_attr_length (insn) >= ix86_cur_cost ()->large_insn + && peep2_regno_dead_p (0, FLAGS_REG)" + [(parallel [(set (match_dup 2) (const_int 0)) + (clobber (reg:CC FLAGS_REG))]) + (set (match_dup 0) (match_dup 1))] + "operands[2] = gen_lowpart (SImode, operands[1]);") + +(define_peephole2 + [(match_scratch:SWI124 2 "") + (set (match_operand:SWI124 0 "memory_operand" "") + (match_operand:SWI124 1 "immediate_operand" ""))] + "optimize_insn_for_speed_p () + && TARGET_SPLIT_LONG_MOVES + && get_attr_length (insn) >= ix86_cur_cost ()->large_insn" + [(set (match_dup 2) (match_dup 1)) + (set (match_dup 0) (match_dup 2))]) + +;; Don't compare memory with zero, load and use a test instead. +(define_peephole2 + [(set (match_operand 0 "flags_reg_operand" "") + (match_operator 1 "compare_operator" + [(match_operand:SI 2 "memory_operand" "") + (const_int 0)])) + (match_scratch:SI 3 "r")] + "optimize_insn_for_speed_p () && ix86_match_ccmode (insn, CCNOmode)" + [(set (match_dup 3) (match_dup 2)) + (set (match_dup 0) (match_op_dup 1 [(match_dup 3) (const_int 0)]))]) + +;; NOT is not pairable on Pentium, while XOR is, but one byte longer. +;; Don't split NOTs with a displacement operand, because resulting XOR +;; will not be pairable anyway. +;; +;; On AMD K6, NOT is vector decoded with memory operand that cannot be +;; represented using a modRM byte. The XOR replacement is long decoded, +;; so this split helps here as well. +;; +;; Note: Can't do this as a regular split because we can't get proper +;; lifetime information then. + +(define_peephole2 + [(set (match_operand:SWI124 0 "nonimmediate_operand" "") + (not:SWI124 (match_operand:SWI124 1 "nonimmediate_operand" "")))] + "optimize_insn_for_speed_p () + && ((TARGET_NOT_UNPAIRABLE + && (!MEM_P (operands[0]) + || !memory_displacement_operand (operands[0], mode))) + || (TARGET_NOT_VECTORMODE + && long_memory_operand (operands[0], mode))) + && peep2_regno_dead_p (0, FLAGS_REG)" + [(parallel [(set (match_dup 0) + (xor:SWI124 (match_dup 1) (const_int -1))) + (clobber (reg:CC FLAGS_REG))])]) + +;; Non pairable "test imm, reg" instructions can be translated to +;; "and imm, reg" if reg dies. The "and" form is also shorter (one +;; byte opcode instead of two, have a short form for byte operands), +;; so do it for other CPUs as well. Given that the value was dead, +;; this should not create any new dependencies. 
Pass on the sub-word +;; versions if we're concerned about partial register stalls. + +(define_peephole2 + [(set (match_operand 0 "flags_reg_operand" "") + (match_operator 1 "compare_operator" + [(and:SI (match_operand:SI 2 "register_operand" "") + (match_operand:SI 3 "immediate_operand" "")) + (const_int 0)]))] + "ix86_match_ccmode (insn, CCNOmode) + && (true_regnum (operands[2]) != AX_REG + || satisfies_constraint_K (operands[3])) + && peep2_reg_dead_p (1, operands[2])" + [(parallel + [(set (match_dup 0) + (match_op_dup 1 [(and:SI (match_dup 2) (match_dup 3)) + (const_int 0)])) + (set (match_dup 2) + (and:SI (match_dup 2) (match_dup 3)))])]) + +;; We don't need to handle HImode case, because it will be promoted to SImode +;; on ! TARGET_PARTIAL_REG_STALL + +(define_peephole2 + [(set (match_operand 0 "flags_reg_operand" "") + (match_operator 1 "compare_operator" + [(and:QI (match_operand:QI 2 "register_operand" "") + (match_operand:QI 3 "immediate_operand" "")) + (const_int 0)]))] + "! TARGET_PARTIAL_REG_STALL + && ix86_match_ccmode (insn, CCNOmode) + && true_regnum (operands[2]) != AX_REG + && peep2_reg_dead_p (1, operands[2])" + [(parallel + [(set (match_dup 0) + (match_op_dup 1 [(and:QI (match_dup 2) (match_dup 3)) + (const_int 0)])) + (set (match_dup 2) + (and:QI (match_dup 2) (match_dup 3)))])]) + +(define_peephole2 + [(set (match_operand 0 "flags_reg_operand" "") + (match_operator 1 "compare_operator" + [(and:SI + (zero_extract:SI + (match_operand 2 "ext_register_operand" "") + (const_int 8) + (const_int 8)) + (match_operand 3 "const_int_operand" "")) + (const_int 0)]))] + "! TARGET_PARTIAL_REG_STALL + && ix86_match_ccmode (insn, CCNOmode) + && true_regnum (operands[2]) != AX_REG + && peep2_reg_dead_p (1, operands[2])" + [(parallel [(set (match_dup 0) + (match_op_dup 1 + [(and:SI + (zero_extract:SI + (match_dup 2) + (const_int 8) + (const_int 8)) + (match_dup 3)) + (const_int 0)])) + (set (zero_extract:SI (match_dup 2) + (const_int 8) + (const_int 8)) + (and:SI + (zero_extract:SI + (match_dup 2) + (const_int 8) + (const_int 8)) + (match_dup 3)))])]) + +;; Don't do logical operations with memory inputs. +(define_peephole2 + [(match_scratch:SI 2 "r") + (parallel [(set (match_operand:SI 0 "register_operand" "") + (match_operator:SI 3 "arith_or_logical_operator" + [(match_dup 0) + (match_operand:SI 1 "memory_operand" "")])) + (clobber (reg:CC FLAGS_REG))])] + "optimize_insn_for_speed_p () && ! TARGET_READ_MODIFY" + [(set (match_dup 2) (match_dup 1)) + (parallel [(set (match_dup 0) + (match_op_dup 3 [(match_dup 0) (match_dup 2)])) + (clobber (reg:CC FLAGS_REG))])]) + +(define_peephole2 + [(match_scratch:SI 2 "r") + (parallel [(set (match_operand:SI 0 "register_operand" "") + (match_operator:SI 3 "arith_or_logical_operator" + [(match_operand:SI 1 "memory_operand" "") + (match_dup 0)])) + (clobber (reg:CC FLAGS_REG))])] + "optimize_insn_for_speed_p () && ! TARGET_READ_MODIFY" + [(set (match_dup 2) (match_dup 1)) + (parallel [(set (match_dup 0) + (match_op_dup 3 [(match_dup 2) (match_dup 0)])) + (clobber (reg:CC FLAGS_REG))])]) + +;; Prefer Load+RegOp to Mov+MemOp. Watch out for cases when the memory address +;; refers to the destination of the load! 
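+;;
+;; Illustration of that hazard (assumed C example, not from this file): once
+;; the destination register has been overwritten by the load, an address
+;; that used it must be recomputed from the copied source register, which is
+;; what the replace_rtx call in the first peephole below takes care of.
+;;
+;;   long load_first_sketch (long r1, const long *table)
+;;   {
+;;     long r0 = r1;              /* mov  %r1, %r0                          */
+;;     r0 += table[r0 & 7];       /* add with a memory address using r0     */
+;;     /* The load-first form must index with r1, the old value of r0:
+;;          r0 = table[r1 & 7];  r0 += r1;                                  */
+;;     return r0;
+;;   }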
+ +(define_peephole2 + [(set (match_operand:SI 0 "register_operand" "") + (match_operand:SI 1 "register_operand" "")) + (parallel [(set (match_dup 0) + (match_operator:SI 3 "commutative_operator" + [(match_dup 0) + (match_operand:SI 2 "memory_operand" "")])) + (clobber (reg:CC FLAGS_REG))])] + "REGNO (operands[0]) != REGNO (operands[1]) + && GENERAL_REGNO_P (REGNO (operands[0])) + && GENERAL_REGNO_P (REGNO (operands[1]))" + [(set (match_dup 0) (match_dup 4)) + (parallel [(set (match_dup 0) + (match_op_dup 3 [(match_dup 0) (match_dup 1)])) + (clobber (reg:CC FLAGS_REG))])] + "operands[4] = replace_rtx (operands[2], operands[0], operands[1]);") + +(define_peephole2 + [(set (match_operand 0 "register_operand" "") + (match_operand 1 "register_operand" "")) + (set (match_dup 0) + (match_operator 3 "commutative_operator" + [(match_dup 0) + (match_operand 2 "memory_operand" "")]))] + "REGNO (operands[0]) != REGNO (operands[1]) + && ((MMX_REG_P (operands[0]) && MMX_REG_P (operands[1])) + || (SSE_REG_P (operands[0]) && SSE_REG_P (operands[1])))" + [(set (match_dup 0) (match_dup 2)) + (set (match_dup 0) + (match_op_dup 3 [(match_dup 0) (match_dup 1)]))]) + +; Don't do logical operations with memory outputs +; +; These two don't make sense for PPro/PII -- we're expanding a 4-uop +; instruction into two 1-uop insns plus a 2-uop insn. That last has +; the same decoder scheduling characteristics as the original. + +(define_peephole2 + [(match_scratch:SI 2 "r") + (parallel [(set (match_operand:SI 0 "memory_operand" "") + (match_operator:SI 3 "arith_or_logical_operator" + [(match_dup 0) + (match_operand:SI 1 "nonmemory_operand" "")])) + (clobber (reg:CC FLAGS_REG))])] + "optimize_insn_for_speed_p () && ! TARGET_READ_MODIFY_WRITE + /* Do not split stack checking probes. */ + && GET_CODE (operands[3]) != IOR && operands[1] != const0_rtx" + [(set (match_dup 2) (match_dup 0)) + (parallel [(set (match_dup 2) + (match_op_dup 3 [(match_dup 2) (match_dup 1)])) + (clobber (reg:CC FLAGS_REG))]) + (set (match_dup 0) (match_dup 2))]) + +(define_peephole2 + [(match_scratch:SI 2 "r") + (parallel [(set (match_operand:SI 0 "memory_operand" "") + (match_operator:SI 3 "arith_or_logical_operator" + [(match_operand:SI 1 "nonmemory_operand" "") + (match_dup 0)])) + (clobber (reg:CC FLAGS_REG))])] + "optimize_insn_for_speed_p () && ! TARGET_READ_MODIFY_WRITE + /* Do not split stack checking probes. */ + && GET_CODE (operands[3]) != IOR && operands[1] != const0_rtx" + [(set (match_dup 2) (match_dup 0)) + (parallel [(set (match_dup 2) + (match_op_dup 3 [(match_dup 1) (match_dup 2)])) + (clobber (reg:CC FLAGS_REG))]) + (set (match_dup 0) (match_dup 2))]) + +;; Attempt to always use XOR for zeroing registers. +(define_peephole2 + [(set (match_operand 0 "register_operand" "") + (match_operand 1 "const0_operand" ""))] + "GET_MODE_SIZE (GET_MODE (operands[0])) <= UNITS_PER_WORD + && (! TARGET_USE_MOV0 || optimize_insn_for_size_p ()) + && GENERAL_REG_P (operands[0]) + && peep2_regno_dead_p (0, FLAGS_REG)" + [(parallel [(set (match_dup 0) (const_int 0)) + (clobber (reg:CC FLAGS_REG))])] + "operands[0] = gen_lowpart (word_mode, operands[0]);") + +(define_peephole2 + [(set (strict_low_part (match_operand 0 "register_operand" "")) + (const_int 0))] + "(GET_MODE (operands[0]) == QImode + || GET_MODE (operands[0]) == HImode) + && (! 
TARGET_USE_MOV0 || optimize_insn_for_size_p ()) + && peep2_regno_dead_p (0, FLAGS_REG)" + [(parallel [(set (strict_low_part (match_dup 0)) (const_int 0)) + (clobber (reg:CC FLAGS_REG))])]) + +;; For HI, SI and DI modes, or $-1,reg is smaller than mov $-1,reg. +(define_peephole2 + [(set (match_operand:SWI248 0 "register_operand" "") + (const_int -1))] + "(optimize_insn_for_size_p () || TARGET_MOVE_M1_VIA_OR) + && peep2_regno_dead_p (0, FLAGS_REG)" + [(parallel [(set (match_dup 0) (const_int -1)) + (clobber (reg:CC FLAGS_REG))])] +{ + if (GET_MODE_SIZE (mode) < GET_MODE_SIZE (SImode)) + operands[0] = gen_lowpart (SImode, operands[0]); +}) + +;; Attempt to convert simple lea to add/shift. +;; These can be created by move expanders. + +(define_peephole2 + [(set (match_operand:SWI48 0 "register_operand" "") + (plus:SWI48 (match_dup 0) + (match_operand:SWI48 1 "" "")))] + "peep2_regno_dead_p (0, FLAGS_REG)" + [(parallel [(set (match_dup 0) (plus:SWI48 (match_dup 0) (match_dup 1))) + (clobber (reg:CC FLAGS_REG))])]) + +(define_peephole2 + [(set (match_operand:SI 0 "register_operand" "") + (subreg:SI (plus:DI (match_operand:DI 1 "register_operand" "") + (match_operand:DI 2 "nonmemory_operand" "")) 0))] + "TARGET_64BIT + && peep2_regno_dead_p (0, FLAGS_REG) + && REGNO (operands[0]) == REGNO (operands[1])" + [(parallel [(set (match_dup 0) (plus:SI (match_dup 0) (match_dup 2))) + (clobber (reg:CC FLAGS_REG))])] + "operands[2] = gen_lowpart (SImode, operands[2]);") + +(define_peephole2 + [(set (match_operand:SWI48 0 "register_operand" "") + (mult:SWI48 (match_dup 0) + (match_operand:SWI48 1 "const_int_operand" "")))] + "exact_log2 (INTVAL (operands[1])) >= 0 + && peep2_regno_dead_p (0, FLAGS_REG)" + [(parallel [(set (match_dup 0) (ashift:SWI48 (match_dup 0) (match_dup 2))) + (clobber (reg:CC FLAGS_REG))])] + "operands[2] = GEN_INT (exact_log2 (INTVAL (operands[1])));") + +(define_peephole2 + [(set (match_operand:SI 0 "register_operand" "") + (subreg:SI (mult:DI (match_operand:DI 1 "register_operand" "") + (match_operand:DI 2 "const_int_operand" "")) 0))] + "TARGET_64BIT + && exact_log2 (INTVAL (operands[2])) >= 0 + && REGNO (operands[0]) == REGNO (operands[1]) + && peep2_regno_dead_p (0, FLAGS_REG)" + [(parallel [(set (match_dup 0) (ashift:SI (match_dup 0) (match_dup 2))) + (clobber (reg:CC FLAGS_REG))])] + "operands[2] = GEN_INT (exact_log2 (INTVAL (operands[2])));") + +;; The ESP adjustments can be done by the push and pop instructions. Resulting +;; code is shorter, since push is only 1 byte, while add imm, %esp is 3 bytes. +;; On many CPUs it is also faster, since special hardware to avoid esp +;; dependencies is present. + +;; While some of these conversions may be done using splitters, we use +;; peepholes in order to allow combine_stack_adjustments pass to see +;; nonobfuscated RTL. + +;; Convert prologue esp subtractions to push. +;; We need register to push. In order to keep verify_flow_info happy we have +;; two choices +;; - use scratch and clobber it in order to avoid dependencies +;; - use already live register +;; We can't use the second way right now, since there is no reliable way how to +;; verify that given register is live. First choice will also most likely in +;; fewer dependencies. On the place of esp adjustments it is very likely that +;; call clobbered registers are dead. We may want to use base pointer as an +;; alternative when no register is available later. 
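Concretely, the stack-adjustment peepholes that follow trade explicit arithmetic on the stack pointer for push/pop instructions whose data traffic is dead. A sketch of the intended rewrites and a minimal plain-C model of why they are safe, assuming 32-bit mode where GET_MODE_SIZE (Pmode) is 4 and a dead call-clobbered register is available as the scratch; the exact register choice is up to the allocator.

/* Illustrative only; the asm is the shape of the rewrite, not promised
   output.  Prologue allocation of one word:

       subl  $4, %esp          # 3 bytes (83 EC 04), ties up the flags
   becomes
       pushl %eax              # 1 byte (50); the stored value is never read

   and the epilogue release "addl $4, %esp" likewise becomes "popl %ecx"
   with the loaded value discarded.  Two-word adjustments use two pushes,
   or two pops into two different dead registers to avoid a back-to-back
   dependence on one register.  */
#include <assert.h>

int
main (void)
{
  unsigned char stack[64];
  unsigned char *sp = stack + sizeof stack;     /* simulated %esp */

  unsigned char *after_sub  = sp - 4;           /* subl $4, %esp */
  unsigned char *after_push = sp - 4;           /* pushl %eax, value never read */
  assert (after_sub == after_push);             /* same final stack pointer */

  unsigned char *after_add = after_sub + 4;     /* addl $4, %esp */
  unsigned char *after_pop = after_push + 4;    /* popl %ecx, value discarded */
  assert (after_add == after_pop);
  return 0;
}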
+ +(define_peephole2 + [(match_scratch:P 1 "r") + (parallel [(set (reg:P SP_REG) + (plus:P (reg:P SP_REG) + (match_operand:P 0 "const_int_operand" ""))) + (clobber (reg:CC FLAGS_REG)) + (clobber (mem:BLK (scratch)))])] + "(TARGET_SINGLE_PUSH || optimize_insn_for_size_p ()) + && INTVAL (operands[0]) == -GET_MODE_SIZE (Pmode)" + [(clobber (match_dup 1)) + (parallel [(set (mem:P (pre_dec:P (reg:P SP_REG))) (match_dup 1)) + (clobber (mem:BLK (scratch)))])]) + +(define_peephole2 + [(match_scratch:P 1 "r") + (parallel [(set (reg:P SP_REG) + (plus:P (reg:P SP_REG) + (match_operand:P 0 "const_int_operand" ""))) + (clobber (reg:CC FLAGS_REG)) + (clobber (mem:BLK (scratch)))])] + "(TARGET_DOUBLE_PUSH || optimize_insn_for_size_p ()) + && INTVAL (operands[0]) == -2*GET_MODE_SIZE (Pmode)" + [(clobber (match_dup 1)) + (set (mem:P (pre_dec:P (reg:P SP_REG))) (match_dup 1)) + (parallel [(set (mem:P (pre_dec:P (reg:P SP_REG))) (match_dup 1)) + (clobber (mem:BLK (scratch)))])]) + +;; Convert esp subtractions to push. +(define_peephole2 + [(match_scratch:P 1 "r") + (parallel [(set (reg:P SP_REG) + (plus:P (reg:P SP_REG) + (match_operand:P 0 "const_int_operand" ""))) + (clobber (reg:CC FLAGS_REG))])] + "(TARGET_SINGLE_PUSH || optimize_insn_for_size_p ()) + && INTVAL (operands[0]) == -GET_MODE_SIZE (Pmode)" + [(clobber (match_dup 1)) + (set (mem:P (pre_dec:P (reg:P SP_REG))) (match_dup 1))]) + +(define_peephole2 + [(match_scratch:P 1 "r") + (parallel [(set (reg:P SP_REG) + (plus:P (reg:P SP_REG) + (match_operand:P 0 "const_int_operand" ""))) + (clobber (reg:CC FLAGS_REG))])] + "(TARGET_DOUBLE_PUSH || optimize_insn_for_size_p ()) + && INTVAL (operands[0]) == -2*GET_MODE_SIZE (Pmode)" + [(clobber (match_dup 1)) + (set (mem:P (pre_dec:P (reg:P SP_REG))) (match_dup 1)) + (set (mem:P (pre_dec:P (reg:P SP_REG))) (match_dup 1))]) + +;; Convert epilogue deallocator to pop. +(define_peephole2 + [(match_scratch:P 1 "r") + (parallel [(set (reg:P SP_REG) + (plus:P (reg:P SP_REG) + (match_operand:P 0 "const_int_operand" ""))) + (clobber (reg:CC FLAGS_REG)) + (clobber (mem:BLK (scratch)))])] + "(TARGET_SINGLE_POP || optimize_insn_for_size_p ()) + && INTVAL (operands[0]) == GET_MODE_SIZE (Pmode)" + [(parallel [(set (match_dup 1) (mem:P (post_inc:P (reg:P SP_REG)))) + (clobber (mem:BLK (scratch)))])]) + +;; Two pops case is tricky, since pop causes dependency +;; on destination register. We use two registers if available. +(define_peephole2 + [(match_scratch:P 1 "r") + (match_scratch:P 2 "r") + (parallel [(set (reg:P SP_REG) + (plus:P (reg:P SP_REG) + (match_operand:P 0 "const_int_operand" ""))) + (clobber (reg:CC FLAGS_REG)) + (clobber (mem:BLK (scratch)))])] + "(TARGET_DOUBLE_POP || optimize_insn_for_size_p ()) + && INTVAL (operands[0]) == 2*GET_MODE_SIZE (Pmode)" + [(parallel [(set (match_dup 1) (mem:P (post_inc:P (reg:P SP_REG)))) + (clobber (mem:BLK (scratch)))]) + (set (match_dup 2) (mem:P (post_inc:P (reg:P SP_REG))))]) + +(define_peephole2 + [(match_scratch:P 1 "r") + (parallel [(set (reg:P SP_REG) + (plus:P (reg:P SP_REG) + (match_operand:P 0 "const_int_operand" ""))) + (clobber (reg:CC FLAGS_REG)) + (clobber (mem:BLK (scratch)))])] + "optimize_insn_for_size_p () + && INTVAL (operands[0]) == 2*GET_MODE_SIZE (Pmode)" + [(parallel [(set (match_dup 1) (mem:P (post_inc:P (reg:P SP_REG)))) + (clobber (mem:BLK (scratch)))]) + (set (match_dup 1) (mem:P (post_inc:P (reg:P SP_REG))))]) + +;; Convert esp additions to pop. 
+(define_peephole2 + [(match_scratch:P 1 "r") + (parallel [(set (reg:P SP_REG) + (plus:P (reg:P SP_REG) + (match_operand:P 0 "const_int_operand" ""))) + (clobber (reg:CC FLAGS_REG))])] + "INTVAL (operands[0]) == GET_MODE_SIZE (Pmode)" + [(set (match_dup 1) (mem:P (post_inc:P (reg:P SP_REG))))]) + +;; Two pops case is tricky, since pop causes dependency +;; on destination register. We use two registers if available. +(define_peephole2 + [(match_scratch:P 1 "r") + (match_scratch:P 2 "r") + (parallel [(set (reg:P SP_REG) + (plus:P (reg:P SP_REG) + (match_operand:P 0 "const_int_operand" ""))) + (clobber (reg:CC FLAGS_REG))])] + "INTVAL (operands[0]) == 2*GET_MODE_SIZE (Pmode)" + [(set (match_dup 1) (mem:P (post_inc:P (reg:P SP_REG)))) + (set (match_dup 2) (mem:P (post_inc:P (reg:P SP_REG))))]) + +(define_peephole2 + [(match_scratch:P 1 "r") + (parallel [(set (reg:P SP_REG) + (plus:P (reg:P SP_REG) + (match_operand:P 0 "const_int_operand" ""))) + (clobber (reg:CC FLAGS_REG))])] + "optimize_insn_for_size_p () + && INTVAL (operands[0]) == 2*GET_MODE_SIZE (Pmode)" + [(set (match_dup 1) (mem:P (post_inc:P (reg:P SP_REG)))) + (set (match_dup 1) (mem:P (post_inc:P (reg:P SP_REG))))]) + +;; Convert compares with 1 to shorter inc/dec operations when CF is not +;; required and register dies. Similarly for 128 to -128. +(define_peephole2 + [(set (match_operand 0 "flags_reg_operand" "") + (match_operator 1 "compare_operator" + [(match_operand 2 "register_operand" "") + (match_operand 3 "const_int_operand" "")]))] + "(((!TARGET_FUSE_CMP_AND_BRANCH || optimize_insn_for_size_p ()) + && incdec_operand (operands[3], GET_MODE (operands[3]))) + || (!TARGET_FUSE_CMP_AND_BRANCH + && INTVAL (operands[3]) == 128)) + && ix86_match_ccmode (insn, CCGCmode) + && peep2_reg_dead_p (1, operands[2])" + [(parallel [(set (match_dup 0) + (match_op_dup 1 [(match_dup 2) (match_dup 3)])) + (clobber (match_dup 2))])]) + +;; Convert imul by three, five and nine into lea +(define_peephole2 + [(parallel + [(set (match_operand:SWI48 0 "register_operand" "") + (mult:SWI48 (match_operand:SWI48 1 "register_operand" "") + (match_operand:SWI48 2 "const_int_operand" ""))) + (clobber (reg:CC FLAGS_REG))])] + "INTVAL (operands[2]) == 3 + || INTVAL (operands[2]) == 5 + || INTVAL (operands[2]) == 9" + [(set (match_dup 0) + (plus:SWI48 (mult:SWI48 (match_dup 1) (match_dup 2)) + (match_dup 1)))] + "operands[2] = GEN_INT (INTVAL (operands[2]) - 1);") + +(define_peephole2 + [(parallel + [(set (match_operand:SWI48 0 "register_operand" "") + (mult:SWI48 (match_operand:SWI48 1 "nonimmediate_operand" "") + (match_operand:SWI48 2 "const_int_operand" ""))) + (clobber (reg:CC FLAGS_REG))])] + "optimize_insn_for_speed_p () + && (INTVAL (operands[2]) == 3 + || INTVAL (operands[2]) == 5 + || INTVAL (operands[2]) == 9)" + [(set (match_dup 0) (match_dup 1)) + (set (match_dup 0) + (plus:SWI48 (mult:SWI48 (match_dup 0) (match_dup 2)) + (match_dup 0)))] + "operands[2] = GEN_INT (INTVAL (operands[2]) - 1);") + +;; imul $32bit_imm, mem, reg is vector decoded, while +;; imul $32bit_imm, reg, reg is direct decoded. 
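The multiply-by-3/5/9 peepholes just above rewrite the product as x + x*2, x + x*4 or x + x*8, which is exactly the base + index*scale form that lea can compute, so the multiply costs neither an imul nor a flags clobber. A minimal illustration, assuming -O2 on ia32 or x86-64; the actual instruction selection still depends on tuning flags.

/* Illustrative only.  A multiply by five

       imull $5, %eax, %eax

   can be done through the address-generation unit instead:

       leal  (%eax,%eax,4), %eax     # eax + eax*4 == eax*5

   (scale 2 covers *3, scale 8 covers *9).  */

int
times_five (int x)
{
  return x * 5;
}

The patterns that follow deal with the remaining imul-by-immediate cases, loading the memory operand or the immediate into a register first when the direct form is slow to decode on the target.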
+(define_peephole2 + [(match_scratch:SWI48 3 "r") + (parallel [(set (match_operand:SWI48 0 "register_operand" "") + (mult:SWI48 (match_operand:SWI48 1 "memory_operand" "") + (match_operand:SWI48 2 "immediate_operand" ""))) + (clobber (reg:CC FLAGS_REG))])] + "TARGET_SLOW_IMUL_IMM32_MEM && optimize_insn_for_speed_p () + && !satisfies_constraint_K (operands[2])" + [(set (match_dup 3) (match_dup 1)) + (parallel [(set (match_dup 0) (mult:SWI48 (match_dup 3) (match_dup 2))) + (clobber (reg:CC FLAGS_REG))])]) + +(define_peephole2 + [(match_scratch:SI 3 "r") + (parallel [(set (match_operand:DI 0 "register_operand" "") + (zero_extend:DI + (mult:SI (match_operand:SI 1 "memory_operand" "") + (match_operand:SI 2 "immediate_operand" "")))) + (clobber (reg:CC FLAGS_REG))])] + "TARGET_64BIT + && TARGET_SLOW_IMUL_IMM32_MEM && optimize_insn_for_speed_p () + && !satisfies_constraint_K (operands[2])" + [(set (match_dup 3) (match_dup 1)) + (parallel [(set (match_dup 0) + (zero_extend:DI (mult:SI (match_dup 3) (match_dup 2)))) + (clobber (reg:CC FLAGS_REG))])]) + +;; imul $8/16bit_imm, regmem, reg is vector decoded. +;; Convert it into imul reg, reg +;; It would be better to force assembler to encode instruction using long +;; immediate, but there is apparently no way to do so. +(define_peephole2 + [(parallel [(set (match_operand:SWI248 0 "register_operand" "") + (mult:SWI248 + (match_operand:SWI248 1 "nonimmediate_operand" "") + (match_operand:SWI248 2 "const_int_operand" ""))) + (clobber (reg:CC FLAGS_REG))]) + (match_scratch:SWI248 3 "r")] + "TARGET_SLOW_IMUL_IMM8 && optimize_insn_for_speed_p () + && satisfies_constraint_K (operands[2])" + [(set (match_dup 3) (match_dup 2)) + (parallel [(set (match_dup 0) (mult:SWI248 (match_dup 0) (match_dup 3))) + (clobber (reg:CC FLAGS_REG))])] +{ + if (!rtx_equal_p (operands[0], operands[1])) + emit_move_insn (operands[0], operands[1]); +}) + +;; After splitting up read-modify operations, array accesses with memory +;; operands might end up in form: +;; sall $2, %eax +;; movl 4(%esp), %edx +;; addl %edx, %eax +;; instead of pre-splitting: +;; sall $2, %eax +;; addl 4(%esp), %eax +;; Turn it into: +;; movl 4(%esp), %edx +;; leal (%edx,%eax,4), %eax + +(define_peephole2 + [(match_scratch:P 5 "r") + (parallel [(set (match_operand 0 "register_operand" "") + (ashift (match_operand 1 "register_operand" "") + (match_operand 2 "const_int_operand" ""))) + (clobber (reg:CC FLAGS_REG))]) + (parallel [(set (match_operand 3 "register_operand" "") + (plus (match_dup 0) + (match_operand 4 "x86_64_general_operand" ""))) + (clobber (reg:CC FLAGS_REG))])] + "IN_RANGE (INTVAL (operands[2]), 1, 3) + /* Validate MODE for lea. */ + && ((!TARGET_PARTIAL_REG_STALL + && (GET_MODE (operands[0]) == QImode + || GET_MODE (operands[0]) == HImode)) + || GET_MODE (operands[0]) == SImode + || (TARGET_64BIT && GET_MODE (operands[0]) == DImode)) + && (rtx_equal_p (operands[0], operands[3]) + || peep2_reg_dead_p (2, operands[0])) + /* We reorder load and the shift. */ + && !reg_overlap_mentioned_p (operands[0], operands[4])" + [(set (match_dup 5) (match_dup 4)) + (set (match_dup 0) (match_dup 1))] +{ + enum machine_mode op1mode = GET_MODE (operands[1]); + enum machine_mode mode = op1mode == DImode ? 
DImode : SImode; + int scale = 1 << INTVAL (operands[2]); + rtx index = gen_lowpart (Pmode, operands[1]); + rtx base = gen_lowpart (Pmode, operands[5]); + rtx dest = gen_lowpart (mode, operands[3]); + + operands[1] = gen_rtx_PLUS (Pmode, base, + gen_rtx_MULT (Pmode, index, GEN_INT (scale))); + operands[5] = base; + if (mode != Pmode) + operands[1] = gen_rtx_SUBREG (mode, operands[1], 0); + if (op1mode != Pmode) + operands[5] = gen_rtx_SUBREG (op1mode, operands[5], 0); + operands[0] = dest; +}) + +;; Call-value patterns last so that the wildcard operand does not +;; disrupt insn-recog's switch tables. + +(define_insn_and_split "*call_value_pop_0_vzeroupper" + [(parallel + [(set (match_operand 0 "" "") + (call (mem:QI (match_operand:SI 1 "constant_call_address_operand" "")) + (match_operand:SI 2 "" ""))) + (set (reg:SI SP_REG) + (plus:SI (reg:SI SP_REG) + (match_operand:SI 3 "immediate_operand" "")))]) + (unspec [(match_operand 4 "const_int_operand" "")] + UNSPEC_CALL_NEEDS_VZEROUPPER)] + "TARGET_VZEROUPPER && !TARGET_64BIT" + "#" + "&& reload_completed" + [(const_int 0)] + "ix86_split_call_vzeroupper (curr_insn, operands[4]); DONE;" + [(set_attr "type" "callv")]) + +(define_insn "*call_value_pop_0" + [(set (match_operand 0 "" "") + (call (mem:QI (match_operand:SI 1 "constant_call_address_operand" "")) + (match_operand:SI 2 "" ""))) + (set (reg:SI SP_REG) + (plus:SI (reg:SI SP_REG) + (match_operand:SI 3 "immediate_operand" "")))] + "!TARGET_64BIT" + { return ix86_output_call_insn (insn, operands[1], 1); } + [(set_attr "type" "callv")]) + +(define_insn_and_split "*call_value_pop_1_vzeroupper" + [(parallel + [(set (match_operand 0 "" "") + (call (mem:QI (match_operand:SI 1 "call_insn_operand" "lzm")) + (match_operand:SI 2 "" ""))) + (set (reg:SI SP_REG) + (plus:SI (reg:SI SP_REG) + (match_operand:SI 3 "immediate_operand" "i")))]) + (unspec [(match_operand 4 "const_int_operand" "")] + UNSPEC_CALL_NEEDS_VZEROUPPER)] + "TARGET_VZEROUPPER && !TARGET_64BIT && !SIBLING_CALL_P (insn)" + "#" + "&& reload_completed" + [(const_int 0)] + "ix86_split_call_vzeroupper (curr_insn, operands[4]); DONE;" + [(set_attr "type" "callv")]) + +(define_insn "*call_value_pop_1" + [(set (match_operand 0 "" "") + (call (mem:QI (match_operand:SI 1 "call_insn_operand" "lzm")) + (match_operand:SI 2 "" ""))) + (set (reg:SI SP_REG) + (plus:SI (reg:SI SP_REG) + (match_operand:SI 3 "immediate_operand" "i")))] + "!TARGET_64BIT && !SIBLING_CALL_P (insn)" + { return ix86_output_call_insn (insn, operands[1], 1); } + [(set_attr "type" "callv")]) + +(define_insn_and_split "*sibcall_value_pop_1_vzeroupper" + [(parallel + [(set (match_operand 0 "" "") + (call (mem:QI (match_operand:SI 1 "sibcall_insn_operand" "z,U")) + (match_operand:SI 2 "" ""))) + (set (reg:SI SP_REG) + (plus:SI (reg:SI SP_REG) + (match_operand:SI 3 "immediate_operand" "i,i")))]) + (unspec [(match_operand 4 "const_int_operand" "")] + UNSPEC_CALL_NEEDS_VZEROUPPER)] + "TARGET_VZEROUPPER && !TARGET_64BIT && SIBLING_CALL_P (insn)" + "#" + "&& reload_completed" + [(const_int 0)] + "ix86_split_call_vzeroupper (curr_insn, operands[4]); DONE;" + [(set_attr "type" "callv")]) + +(define_insn "*sibcall_value_pop_1" + [(set (match_operand 0 "" "") + (call (mem:QI (match_operand:SI 1 "sibcall_insn_operand" "z,U")) + (match_operand:SI 2 "" ""))) + (set (reg:SI SP_REG) + (plus:SI (reg:SI SP_REG) + (match_operand:SI 3 "immediate_operand" "i,i")))] + "!TARGET_64BIT && SIBLING_CALL_P (insn)" + { return ix86_output_call_insn (insn, operands[1], 1); } + [(set_attr "type" "callv")]) + 
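The *_pop variants above fuse the call with a stack-pointer adjustment, i.e. calling conventions where the callee pops its own arguments (such as stdcall on 32-bit targets), while the *sibcall_value_* variants cover calls in tail position, which are emitted as jumps rather than call/ret pairs. A small sketch of the source shape that can reach the sibcall patterns, assuming optimization is enabled and nothing (ABI mismatch, escaping locals, etc.) blocks sibling-call optimization; callee is an arbitrary external function.

/* Illustrative only.  In tail position the call needs no code after it
   in caller, so it can be emitted as "jmp callee" through the sibcall
   patterns instead of a call followed by ret.  */

extern int callee (int);

int
caller (int x)
{
  return callee (x + 1);
}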
+(define_insn_and_split "*call_value_0_vzeroupper" + [(set (match_operand 0 "" "") + (call (mem:QI (match_operand:SI 1 "constant_call_address_operand" "")) + (match_operand:SI 2 "" ""))) + (unspec [(match_operand 3 "const_int_operand" "")] + UNSPEC_CALL_NEEDS_VZEROUPPER)] + "TARGET_VZEROUPPER && !TARGET_64BIT" + "#" + "&& reload_completed" + [(const_int 0)] + "ix86_split_call_vzeroupper (curr_insn, operands[3]); DONE;" + [(set_attr "type" "callv")]) + +(define_insn "*call_value_0" + [(set (match_operand 0 "" "") + (call (mem:QI (match_operand:SI 1 "constant_call_address_operand" "")) + (match_operand:SI 2 "" "")))] + "!TARGET_64BIT" + { return ix86_output_call_insn (insn, operands[1], 1); } + [(set_attr "type" "callv")]) + +(define_insn_and_split "*call_value_0_rex64_vzeroupper" + [(set (match_operand 0 "" "") + (call (mem:QI (match_operand:DI 1 "constant_call_address_operand" "")) + (match_operand:DI 2 "const_int_operand" ""))) + (unspec [(match_operand 3 "const_int_operand" "")] + UNSPEC_CALL_NEEDS_VZEROUPPER)] + "TARGET_VZEROUPPER && TARGET_64BIT" + "#" + "&& reload_completed" + [(const_int 0)] + "ix86_split_call_vzeroupper (curr_insn, operands[3]); DONE;" + [(set_attr "type" "callv")]) + +(define_insn "*call_value_0_rex64" + [(set (match_operand 0 "" "") + (call (mem:QI (match_operand:DI 1 "constant_call_address_operand" "")) + (match_operand:DI 2 "const_int_operand" "")))] + "TARGET_64BIT" + { return ix86_output_call_insn (insn, operands[1], 1); } + [(set_attr "type" "callv")]) + +(define_insn_and_split "*call_value_0_rex64_ms_sysv_vzeroupper" + [(parallel + [(set (match_operand 0 "" "") + (call (mem:QI (match_operand:DI 1 "constant_call_address_operand" "")) + (match_operand:DI 2 "const_int_operand" ""))) + (unspec [(const_int 0)] UNSPEC_MS_TO_SYSV_CALL) + (clobber (reg:TI XMM6_REG)) + (clobber (reg:TI XMM7_REG)) + (clobber (reg:TI XMM8_REG)) + (clobber (reg:TI XMM9_REG)) + (clobber (reg:TI XMM10_REG)) + (clobber (reg:TI XMM11_REG)) + (clobber (reg:TI XMM12_REG)) + (clobber (reg:TI XMM13_REG)) + (clobber (reg:TI XMM14_REG)) + (clobber (reg:TI XMM15_REG)) + (clobber (reg:DI SI_REG)) + (clobber (reg:DI DI_REG))]) + (unspec [(match_operand 3 "const_int_operand" "")] + UNSPEC_CALL_NEEDS_VZEROUPPER)] + "TARGET_VZEROUPPER && TARGET_64BIT && !SIBLING_CALL_P (insn)" + "#" + "&& reload_completed" + [(const_int 0)] + "ix86_split_call_vzeroupper (curr_insn, operands[3]); DONE;" + [(set_attr "type" "callv")]) + +(define_insn "*call_value_0_rex64_ms_sysv" + [(set (match_operand 0 "" "") + (call (mem:QI (match_operand:DI 1 "constant_call_address_operand" "")) + (match_operand:DI 2 "const_int_operand" ""))) + (unspec [(const_int 0)] UNSPEC_MS_TO_SYSV_CALL) + (clobber (reg:TI XMM6_REG)) + (clobber (reg:TI XMM7_REG)) + (clobber (reg:TI XMM8_REG)) + (clobber (reg:TI XMM9_REG)) + (clobber (reg:TI XMM10_REG)) + (clobber (reg:TI XMM11_REG)) + (clobber (reg:TI XMM12_REG)) + (clobber (reg:TI XMM13_REG)) + (clobber (reg:TI XMM14_REG)) + (clobber (reg:TI XMM15_REG)) + (clobber (reg:DI SI_REG)) + (clobber (reg:DI DI_REG))] + "TARGET_64BIT && !SIBLING_CALL_P (insn)" + { return ix86_output_call_insn (insn, operands[1], 1); } + [(set_attr "type" "callv")]) + +(define_insn_and_split "*call_value_1_vzeroupper" + [(set (match_operand 0 "" "") + (call (mem:QI (match_operand:SI 1 "call_insn_operand" "lzm")) + (match_operand:SI 2 "" ""))) + (unspec [(match_operand 3 "const_int_operand" "")] + UNSPEC_CALL_NEEDS_VZEROUPPER)] + "TARGET_VZEROUPPER && !TARGET_64BIT && !SIBLING_CALL_P (insn)" + "#" + "&& reload_completed" + 
[(const_int 0)] + "ix86_split_call_vzeroupper (curr_insn, operands[3]); DONE;" + [(set_attr "type" "callv")]) + +(define_insn "*call_value_1" + [(set (match_operand 0 "" "") + (call (mem:QI (match_operand:SI 1 "call_insn_operand" "lzm")) + (match_operand:SI 2 "" "")))] + "!TARGET_64BIT && !SIBLING_CALL_P (insn)" + { return ix86_output_call_insn (insn, operands[1], 1); } + [(set_attr "type" "callv")]) + +(define_insn_and_split "*sibcall_value_1_vzeroupper" + [(set (match_operand 0 "" "") + (call (mem:QI (match_operand:SI 1 "sibcall_insn_operand" "z,U")) + (match_operand:SI 2 "" ""))) + (unspec [(match_operand 3 "const_int_operand" "")] + UNSPEC_CALL_NEEDS_VZEROUPPER)] + "TARGET_VZEROUPPER && !TARGET_64BIT && SIBLING_CALL_P (insn)" + "#" + "&& reload_completed" + [(const_int 0)] + "ix86_split_call_vzeroupper (curr_insn, operands[3]); DONE;" + [(set_attr "type" "callv")]) + +(define_insn "*sibcall_value_1" + [(set (match_operand 0 "" "") + (call (mem:QI (match_operand:SI 1 "sibcall_insn_operand" "z,U")) + (match_operand:SI 2 "" "")))] + "!TARGET_64BIT && SIBLING_CALL_P (insn)" + { return ix86_output_call_insn (insn, operands[1], 1); } + [(set_attr "type" "callv")]) + +(define_insn_and_split "*call_value_1_rex64_vzeroupper" + [(set (match_operand 0 "" "") + (call (mem:QI (match_operand:DI 1 "call_insn_operand" "rzm")) + (match_operand:DI 2 "" ""))) + (unspec [(match_operand 3 "const_int_operand" "")] + UNSPEC_CALL_NEEDS_VZEROUPPER)] + "TARGET_VZEROUPPER && TARGET_64BIT && !SIBLING_CALL_P (insn) + && ix86_cmodel != CM_LARGE && ix86_cmodel != CM_LARGE_PIC" + "#" + "&& reload_completed" + [(const_int 0)] + "ix86_split_call_vzeroupper (curr_insn, operands[3]); DONE;" + [(set_attr "type" "callv")]) + +(define_insn "*call_value_1_rex64" + [(set (match_operand 0 "" "") + (call (mem:QI (match_operand:DI 1 "call_insn_operand" "rzm")) + (match_operand:DI 2 "" "")))] + "TARGET_64BIT && !SIBLING_CALL_P (insn) + && ix86_cmodel != CM_LARGE && ix86_cmodel != CM_LARGE_PIC" + { return ix86_output_call_insn (insn, operands[1], 1); } + [(set_attr "type" "callv")]) + +(define_insn_and_split "*call_value_1_rex64_ms_sysv_vzeroupper" + [(parallel + [(set (match_operand 0 "" "") + (call (mem:QI (match_operand:DI 1 "call_insn_operand" "rzm")) + (match_operand:DI 2 "" ""))) + (unspec [(const_int 0)] UNSPEC_MS_TO_SYSV_CALL) + (clobber (reg:TI XMM6_REG)) + (clobber (reg:TI XMM7_REG)) + (clobber (reg:TI XMM8_REG)) + (clobber (reg:TI XMM9_REG)) + (clobber (reg:TI XMM10_REG)) + (clobber (reg:TI XMM11_REG)) + (clobber (reg:TI XMM12_REG)) + (clobber (reg:TI XMM13_REG)) + (clobber (reg:TI XMM14_REG)) + (clobber (reg:TI XMM15_REG)) + (clobber (reg:DI SI_REG)) + (clobber (reg:DI DI_REG))]) + (unspec [(match_operand 3 "const_int_operand" "")] + UNSPEC_CALL_NEEDS_VZEROUPPER)] + "TARGET_VZEROUPPER && TARGET_64BIT && !SIBLING_CALL_P (insn)" + "#" + "&& reload_completed" + [(const_int 0)] + "ix86_split_call_vzeroupper (curr_insn, operands[3]); DONE;" + [(set_attr "type" "callv")]) + +(define_insn "*call_value_1_rex64_ms_sysv" + [(set (match_operand 0 "" "") + (call (mem:QI (match_operand:DI 1 "call_insn_operand" "rzm")) + (match_operand:DI 2 "" ""))) + (unspec [(const_int 0)] UNSPEC_MS_TO_SYSV_CALL) + (clobber (reg:TI XMM6_REG)) + (clobber (reg:TI XMM7_REG)) + (clobber (reg:TI XMM8_REG)) + (clobber (reg:TI XMM9_REG)) + (clobber (reg:TI XMM10_REG)) + (clobber (reg:TI XMM11_REG)) + (clobber (reg:TI XMM12_REG)) + (clobber (reg:TI XMM13_REG)) + (clobber (reg:TI XMM14_REG)) + (clobber (reg:TI XMM15_REG)) + (clobber (reg:DI SI_REG)) + 
(clobber (reg:DI DI_REG))] + "TARGET_64BIT && !SIBLING_CALL_P (insn)" + { return ix86_output_call_insn (insn, operands[1], 1); } + [(set_attr "type" "callv")]) + +(define_insn_and_split "*call_value_1_rex64_large_vzeroupper" + [(set (match_operand 0 "" "") + (call (mem:QI (match_operand:DI 1 "call_insn_operand" "rm")) + (match_operand:DI 2 "" ""))) + (unspec [(match_operand 3 "const_int_operand" "")] + UNSPEC_CALL_NEEDS_VZEROUPPER)] + "TARGET_VZEROUPPER && TARGET_64BIT && !SIBLING_CALL_P (insn)" + "#" + "&& reload_completed" + [(const_int 0)] + "ix86_split_call_vzeroupper (curr_insn, operands[3]); DONE;" + [(set_attr "type" "callv")]) + +(define_insn "*call_value_1_rex64_large" + [(set (match_operand 0 "" "") + (call (mem:QI (match_operand:DI 1 "call_insn_operand" "rm")) + (match_operand:DI 2 "" "")))] + "TARGET_64BIT && !SIBLING_CALL_P (insn)" + { return ix86_output_call_insn (insn, operands[1], 1); } + [(set_attr "type" "callv")]) + +(define_insn_and_split "*sibcall_value_1_rex64_vzeroupper" + [(set (match_operand 0 "" "") + (call (mem:QI (match_operand:DI 1 "sibcall_insn_operand" "z,U")) + (match_operand:DI 2 "" ""))) + (unspec [(match_operand 3 "const_int_operand" "")] + UNSPEC_CALL_NEEDS_VZEROUPPER)] + "TARGET_VZEROUPPER && TARGET_64BIT && SIBLING_CALL_P (insn)" + "#" + "&& reload_completed" + [(const_int 0)] + "ix86_split_call_vzeroupper (curr_insn, operands[3]); DONE;" + [(set_attr "type" "callv")]) + +(define_insn "*sibcall_value_1_rex64" + [(set (match_operand 0 "" "") + (call (mem:QI (match_operand:DI 1 "sibcall_insn_operand" "z,U")) + (match_operand:DI 2 "" "")))] + "TARGET_64BIT && SIBLING_CALL_P (insn)" + { return ix86_output_call_insn (insn, operands[1], 1); } + [(set_attr "type" "callv")]) + +;; We used to use "int $5", in honor of #BR which maps to interrupt vector 5. +;; That, however, is usually mapped by the OS to SIGSEGV, which is often +;; caught for use by garbage collectors and the like. Using an insn that +;; maps to SIGILL makes it more likely the program will rightfully die. +;; Keeping with tradition, "6" is in honor of #UD. +(define_insn "trap" + [(trap_if (const_int 1) (const_int 6))] + "" + { return ASM_SHORT "0x0b0f"; } + [(set_attr "length" "2")]) + +(define_expand "prefetch" + [(prefetch (match_operand 0 "address_operand" "") + (match_operand:SI 1 "const_int_operand" "") + (match_operand:SI 2 "const_int_operand" ""))] + "TARGET_PREFETCH_SSE || TARGET_3DNOW" +{ + int rw = INTVAL (operands[1]); + int locality = INTVAL (operands[2]); + + gcc_assert (rw == 0 || rw == 1); + gcc_assert (locality >= 0 && locality <= 3); + gcc_assert (GET_MODE (operands[0]) == Pmode + || GET_MODE (operands[0]) == VOIDmode); + + /* Use 3dNOW prefetch in case we are asking for write prefetch not + supported by SSE counterpart or the SSE prefetch is not available + (K6 machines). Otherwise use SSE prefetch as it allows specifying + of locality. 
*/ + if (TARGET_3DNOW && (!TARGET_PREFETCH_SSE || rw)) + operands[2] = GEN_INT (3); + else + operands[1] = const0_rtx; +}) + +(define_insn "*prefetch_sse_" + [(prefetch (match_operand:P 0 "address_operand" "p") + (const_int 0) + (match_operand:SI 1 "const_int_operand" ""))] + "TARGET_PREFETCH_SSE" +{ + static const char * const patterns[4] = { + "prefetchnta\t%a0", "prefetcht2\t%a0", "prefetcht1\t%a0", "prefetcht0\t%a0" + }; + + int locality = INTVAL (operands[1]); + gcc_assert (locality >= 0 && locality <= 3); + + return patterns[locality]; +} + [(set_attr "type" "sse") + (set_attr "atom_sse_attr" "prefetch") + (set (attr "length_address") + (symbol_ref "memory_address_length (operands[0])")) + (set_attr "memory" "none")]) + +(define_insn "*prefetch_3dnow_" + [(prefetch (match_operand:P 0 "address_operand" "p") + (match_operand:SI 1 "const_int_operand" "n") + (const_int 3))] + "TARGET_3DNOW" +{ + if (INTVAL (operands[1]) == 0) + return "prefetch\t%a0"; + else + return "prefetchw\t%a0"; +} + [(set_attr "type" "mmx") + (set (attr "length_address") + (symbol_ref "memory_address_length (operands[0])")) + (set_attr "memory" "none")]) + +(define_expand "stack_protect_set" + [(match_operand 0 "memory_operand" "") + (match_operand 1 "memory_operand" "")] + "" +{ + rtx (*insn)(rtx, rtx); + +#ifdef TARGET_THREAD_SSP_OFFSET + operands[1] = GEN_INT (TARGET_THREAD_SSP_OFFSET); + insn = (TARGET_64BIT + ? gen_stack_tls_protect_set_di + : gen_stack_tls_protect_set_si); +#else + insn = (TARGET_64BIT + ? gen_stack_protect_set_di + : gen_stack_protect_set_si); +#endif + + emit_insn (insn (operands[0], operands[1])); + DONE; +}) + +(define_insn "stack_protect_set_" + [(set (match_operand:P 0 "memory_operand" "=m") + (unspec:P [(match_operand:P 1 "memory_operand" "m")] UNSPEC_SP_SET)) + (set (match_scratch:P 2 "=&r") (const_int 0)) + (clobber (reg:CC FLAGS_REG))] + "" + "mov{}\t{%1, %2|%2, %1}\;mov{}\t{%2, %0|%0, %2}\;xor{l}\t%k2, %k2" + [(set_attr "type" "multi")]) + +(define_insn "stack_tls_protect_set_" + [(set (match_operand:P 0 "memory_operand" "=m") + (unspec:P [(match_operand:P 1 "const_int_operand" "i")] + UNSPEC_SP_TLS_SET)) + (set (match_scratch:P 2 "=&r") (const_int 0)) + (clobber (reg:CC FLAGS_REG))] + "" + "mov{}\t{%@:%P1, %2|%2, PTR %@:%P1}\;mov{}\t{%2, %0|%0, %2}\;xor{l}\t%k2, %k2" + [(set_attr "type" "multi")]) + +(define_expand "stack_protect_test" + [(match_operand 0 "memory_operand" "") + (match_operand 1 "memory_operand" "") + (match_operand 2 "" "")] + "" +{ + rtx flags = gen_rtx_REG (CCZmode, FLAGS_REG); + + rtx (*insn)(rtx, rtx, rtx); + +#ifdef TARGET_THREAD_SSP_OFFSET + operands[1] = GEN_INT (TARGET_THREAD_SSP_OFFSET); + insn = (TARGET_64BIT + ? gen_stack_tls_protect_test_di + : gen_stack_tls_protect_test_si); +#else + insn = (TARGET_64BIT + ? 
gen_stack_protect_test_di + : gen_stack_protect_test_si); +#endif + + emit_insn (insn (flags, operands[0], operands[1])); + + emit_jump_insn (gen_cbranchcc4 (gen_rtx_EQ (VOIDmode, flags, const0_rtx), + flags, const0_rtx, operands[2])); + DONE; +}) + +(define_insn "stack_protect_test_" + [(set (match_operand:CCZ 0 "flags_reg_operand" "") + (unspec:CCZ [(match_operand:P 1 "memory_operand" "m") + (match_operand:P 2 "memory_operand" "m")] + UNSPEC_SP_TEST)) + (clobber (match_scratch:P 3 "=&r"))] + "" + "mov{}\t{%1, %3|%3, %1}\;xor{}\t{%2, %3|%3, %2}" + [(set_attr "type" "multi")]) + +(define_insn "stack_tls_protect_test_" + [(set (match_operand:CCZ 0 "flags_reg_operand" "") + (unspec:CCZ [(match_operand:P 1 "memory_operand" "m") + (match_operand:P 2 "const_int_operand" "i")] + UNSPEC_SP_TLS_TEST)) + (clobber (match_scratch:P 3 "=r"))] + "" + "mov{}\t{%1, %3|%3, %1}\;xor{}\t{%@:%P2, %3|%3, PTR %@:%P2}" + [(set_attr "type" "multi")]) + +(define_insn "sse4_2_crc32" + [(set (match_operand:SI 0 "register_operand" "=r") + (unspec:SI + [(match_operand:SI 1 "register_operand" "0") + (match_operand:SWI124 2 "nonimmediate_operand" "m")] + UNSPEC_CRC32))] + "TARGET_SSE4_2 || TARGET_CRC32" + "crc32{}\t{%2, %0|%0, %2}" + [(set_attr "type" "sselog1") + (set_attr "prefix_rep" "1") + (set_attr "prefix_extra" "1") + (set (attr "prefix_data16") + (if_then_else (match_operand:HI 2 "" "") + (const_string "1") + (const_string "*"))) + (set (attr "prefix_rex") + (if_then_else (match_operand:QI 2 "ext_QIreg_operand" "") + (const_string "1") + (const_string "*"))) + (set_attr "mode" "SI")]) + +(define_insn "sse4_2_crc32di" + [(set (match_operand:DI 0 "register_operand" "=r") + (unspec:DI + [(match_operand:DI 1 "register_operand" "0") + (match_operand:DI 2 "nonimmediate_operand" "rm")] + UNSPEC_CRC32))] + "TARGET_64BIT && (TARGET_SSE4_2 || TARGET_CRC32)" + "crc32{q}\t{%2, %0|%0, %2}" + [(set_attr "type" "sselog1") + (set_attr "prefix_rep" "1") + (set_attr "prefix_extra" "1") + (set_attr "mode" "DI")]) + +(define_expand "rdpmc" + [(match_operand:DI 0 "register_operand" "") + (match_operand:SI 1 "register_operand" "")] + "" +{ + rtx reg = gen_reg_rtx (DImode); + rtx si; + + /* Force operand 1 into ECX. 
*/ + rtx ecx = gen_rtx_REG (SImode, CX_REG); + emit_insn (gen_rtx_SET (VOIDmode, ecx, operands[1])); + si = gen_rtx_UNSPEC_VOLATILE (DImode, gen_rtvec (1, ecx), + UNSPECV_RDPMC); + + if (TARGET_64BIT) + { + rtvec vec = rtvec_alloc (2); + rtx load = gen_rtx_PARALLEL (VOIDmode, vec); + rtx upper = gen_reg_rtx (DImode); + rtx di = gen_rtx_UNSPEC_VOLATILE (DImode, + gen_rtvec (1, const0_rtx), + UNSPECV_RDPMC); + RTVEC_ELT (vec, 0) = gen_rtx_SET (VOIDmode, reg, si); + RTVEC_ELT (vec, 1) = gen_rtx_SET (VOIDmode, upper, di); + emit_insn (load); + upper = expand_simple_binop (DImode, ASHIFT, upper, GEN_INT (32), + NULL, 1, OPTAB_DIRECT); + reg = expand_simple_binop (DImode, IOR, reg, upper, reg, 1, + OPTAB_DIRECT); + } + else + emit_insn (gen_rtx_SET (VOIDmode, reg, si)); + emit_insn (gen_rtx_SET (VOIDmode, operands[0], reg)); + DONE; +}) + +(define_insn "*rdpmc" + [(set (match_operand:DI 0 "register_operand" "=A") + (unspec_volatile:DI [(match_operand:SI 1 "register_operand" "c")] + UNSPECV_RDPMC))] + "!TARGET_64BIT" + "rdpmc" + [(set_attr "type" "other") + (set_attr "length" "2")]) + +(define_insn "*rdpmc_rex64" + [(set (match_operand:DI 0 "register_operand" "=a") + (unspec_volatile:DI [(match_operand:SI 2 "register_operand" "c")] + UNSPECV_RDPMC)) + (set (match_operand:DI 1 "register_operand" "=d") + (unspec_volatile:DI [(const_int 0)] UNSPECV_RDPMC))] + "TARGET_64BIT" + "rdpmc" + [(set_attr "type" "other") + (set_attr "length" "2")]) + +(define_expand "rdtsc" + [(set (match_operand:DI 0 "register_operand" "") + (unspec_volatile:DI [(const_int 0)] UNSPECV_RDTSC))] + "" +{ + if (TARGET_64BIT) + { + rtvec vec = rtvec_alloc (2); + rtx load = gen_rtx_PARALLEL (VOIDmode, vec); + rtx upper = gen_reg_rtx (DImode); + rtx lower = gen_reg_rtx (DImode); + rtx src = gen_rtx_UNSPEC_VOLATILE (DImode, + gen_rtvec (1, const0_rtx), + UNSPECV_RDTSC); + RTVEC_ELT (vec, 0) = gen_rtx_SET (VOIDmode, lower, src); + RTVEC_ELT (vec, 1) = gen_rtx_SET (VOIDmode, upper, src); + emit_insn (load); + upper = expand_simple_binop (DImode, ASHIFT, upper, GEN_INT (32), + NULL, 1, OPTAB_DIRECT); + lower = expand_simple_binop (DImode, IOR, lower, upper, lower, 1, + OPTAB_DIRECT); + emit_insn (gen_rtx_SET (VOIDmode, operands[0], lower)); + DONE; + } +}) + +(define_insn "*rdtsc" + [(set (match_operand:DI 0 "register_operand" "=A") + (unspec_volatile:DI [(const_int 0)] UNSPECV_RDTSC))] + "!TARGET_64BIT" + "rdtsc" + [(set_attr "type" "other") + (set_attr "length" "2")]) + +(define_insn "*rdtsc_rex64" + [(set (match_operand:DI 0 "register_operand" "=a") + (unspec_volatile:DI [(const_int 0)] UNSPECV_RDTSC)) + (set (match_operand:DI 1 "register_operand" "=d") + (unspec_volatile:DI [(const_int 0)] UNSPECV_RDTSC))] + "TARGET_64BIT" + "rdtsc" + [(set_attr "type" "other") + (set_attr "length" "2")]) + +(define_expand "rdtscp" + [(match_operand:DI 0 "register_operand" "") + (match_operand:SI 1 "memory_operand" "")] + "" +{ + rtx di = gen_rtx_UNSPEC_VOLATILE (DImode, + gen_rtvec (1, const0_rtx), + UNSPECV_RDTSCP); + rtx si = gen_rtx_UNSPEC_VOLATILE (SImode, + gen_rtvec (1, const0_rtx), + UNSPECV_RDTSCP); + rtx reg = gen_reg_rtx (DImode); + rtx tmp = gen_reg_rtx (SImode); + + if (TARGET_64BIT) + { + rtvec vec = rtvec_alloc (3); + rtx load = gen_rtx_PARALLEL (VOIDmode, vec); + rtx upper = gen_reg_rtx (DImode); + RTVEC_ELT (vec, 0) = gen_rtx_SET (VOIDmode, reg, di); + RTVEC_ELT (vec, 1) = gen_rtx_SET (VOIDmode, upper, di); + RTVEC_ELT (vec, 2) = gen_rtx_SET (VOIDmode, tmp, si); + emit_insn (load); + upper = expand_simple_binop (DImode, ASHIFT, 
upper, GEN_INT (32), + NULL, 1, OPTAB_DIRECT); + reg = expand_simple_binop (DImode, IOR, reg, upper, reg, 1, + OPTAB_DIRECT); + } + else + { + rtvec vec = rtvec_alloc (2); + rtx load = gen_rtx_PARALLEL (VOIDmode, vec); + RTVEC_ELT (vec, 0) = gen_rtx_SET (VOIDmode, reg, di); + RTVEC_ELT (vec, 1) = gen_rtx_SET (VOIDmode, tmp, si); + emit_insn (load); + } + emit_insn (gen_rtx_SET (VOIDmode, operands[0], reg)); + emit_insn (gen_rtx_SET (VOIDmode, operands[1], tmp)); + DONE; +}) + +(define_insn "*rdtscp" + [(set (match_operand:DI 0 "register_operand" "=A") + (unspec_volatile:DI [(const_int 0)] UNSPECV_RDTSCP)) + (set (match_operand:SI 1 "register_operand" "=c") + (unspec_volatile:SI [(const_int 0)] UNSPECV_RDTSCP))] + "!TARGET_64BIT" + "rdtscp" + [(set_attr "type" "other") + (set_attr "length" "3")]) + +(define_insn "*rdtscp_rex64" + [(set (match_operand:DI 0 "register_operand" "=a") + (unspec_volatile:DI [(const_int 0)] UNSPECV_RDTSCP)) + (set (match_operand:DI 1 "register_operand" "=d") + (unspec_volatile:DI [(const_int 0)] UNSPECV_RDTSCP)) + (set (match_operand:SI 2 "register_operand" "=c") + (unspec_volatile:SI [(const_int 0)] UNSPECV_RDTSCP))] + "TARGET_64BIT" + "rdtscp" + [(set_attr "type" "other") + (set_attr "length" "3")]) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; +;; LWP instructions +;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(define_expand "lwp_llwpcb" + [(unspec_volatile [(match_operand 0 "register_operand" "r")] + UNSPECV_LLWP_INTRINSIC)] + "TARGET_LWP") + +(define_insn "*lwp_llwpcb1" + [(unspec_volatile [(match_operand:P 0 "register_operand" "r")] + UNSPECV_LLWP_INTRINSIC)] + "TARGET_LWP" + "llwpcb\t%0" + [(set_attr "type" "lwp") + (set_attr "mode" "") + (set_attr "length" "5")]) + +(define_expand "lwp_slwpcb" + [(set (match_operand 0 "register_operand" "=r") + (unspec_volatile [(const_int 0)] UNSPECV_SLWP_INTRINSIC))] + "TARGET_LWP" +{ + if (TARGET_64BIT) + emit_insn (gen_lwp_slwpcbdi (operands[0])); + else + emit_insn (gen_lwp_slwpcbsi (operands[0])); + DONE; +}) + +(define_insn "lwp_slwpcb" + [(set (match_operand:P 0 "register_operand" "=r") + (unspec_volatile:P [(const_int 0)] UNSPECV_SLWP_INTRINSIC))] + "TARGET_LWP" + "slwpcb\t%0" + [(set_attr "type" "lwp") + (set_attr "mode" "") + (set_attr "length" "5")]) + +(define_expand "lwp_lwpval3" + [(unspec_volatile [(match_operand:SWI48 1 "register_operand" "r") + (match_operand:SI 2 "nonimmediate_operand" "rm") + (match_operand:SI 3 "const_int_operand" "i")] + UNSPECV_LWPVAL_INTRINSIC)] + "TARGET_LWP" + ;; Avoid unused variable warning. 
+ "(void) operands[0];") + +(define_insn "*lwp_lwpval3_1" + [(unspec_volatile [(match_operand:SWI48 0 "register_operand" "r") + (match_operand:SI 1 "nonimmediate_operand" "rm") + (match_operand:SI 2 "const_int_operand" "i")] + UNSPECV_LWPVAL_INTRINSIC)] + "TARGET_LWP" + "lwpval\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "lwp") + (set_attr "mode" "") + (set (attr "length") + (symbol_ref "ix86_attr_length_address_default (insn) + 9"))]) + +(define_expand "lwp_lwpins3" + [(set (reg:CCC FLAGS_REG) + (unspec_volatile:CCC [(match_operand:SWI48 1 "register_operand" "r") + (match_operand:SI 2 "nonimmediate_operand" "rm") + (match_operand:SI 3 "const_int_operand" "i")] + UNSPECV_LWPINS_INTRINSIC)) + (set (match_operand:QI 0 "nonimmediate_operand" "=qm") + (eq:QI (reg:CCC FLAGS_REG) (const_int 0)))] + "TARGET_LWP") + +(define_insn "*lwp_lwpins3_1" + [(set (reg:CCC FLAGS_REG) + (unspec_volatile:CCC [(match_operand:SWI48 0 "register_operand" "r") + (match_operand:SI 1 "nonimmediate_operand" "rm") + (match_operand:SI 2 "const_int_operand" "i")] + UNSPECV_LWPINS_INTRINSIC))] + "TARGET_LWP" + "lwpins\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "lwp") + (set_attr "mode" "") + (set (attr "length") + (symbol_ref "ix86_attr_length_address_default (insn) + 9"))]) + +(define_insn "rdfsbase" + [(set (match_operand:SWI48 0 "register_operand" "=r") + (unspec_volatile:SWI48 [(const_int 0)] UNSPECV_RDFSBASE))] + "TARGET_64BIT && TARGET_FSGSBASE" + "rdfsbase %0" + [(set_attr "type" "other") + (set_attr "prefix_extra" "2")]) + +(define_insn "rdgsbase" + [(set (match_operand:SWI48 0 "register_operand" "=r") + (unspec_volatile:SWI48 [(const_int 0)] UNSPECV_RDGSBASE))] + "TARGET_64BIT && TARGET_FSGSBASE" + "rdgsbase %0" + [(set_attr "type" "other") + (set_attr "prefix_extra" "2")]) + +(define_insn "wrfsbase" + [(unspec_volatile [(match_operand:SWI48 0 "register_operand" "r")] + UNSPECV_WRFSBASE)] + "TARGET_64BIT && TARGET_FSGSBASE" + "wrfsbase %0" + [(set_attr "type" "other") + (set_attr "prefix_extra" "2")]) + +(define_insn "wrgsbase" + [(unspec_volatile [(match_operand:SWI48 0 "register_operand" "r")] + UNSPECV_WRGSBASE)] + "TARGET_64BIT && TARGET_FSGSBASE" + "wrgsbase %0" + [(set_attr "type" "other") + (set_attr "prefix_extra" "2")]) + +(define_insn "rdrand_1" + [(set (match_operand:SWI248 0 "register_operand" "=r") + (unspec_volatile:SWI248 [(const_int 0)] UNSPECV_RDRAND)) + (set (reg:CCC FLAGS_REG) + (unspec_volatile:CCC [(const_int 0)] UNSPECV_RDRAND))] + "TARGET_RDRND" + "rdrand\t%0" + [(set_attr "type" "other") + (set_attr "prefix_extra" "1")]) + +(include "mmx.md") +(include "sse.md") +(include "sync.md") diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt new file mode 100644 index 000000000..fe5949f3b --- /dev/null +++ b/gcc/config/i386/i386.opt @@ -0,0 +1,425 @@ +; Options for the IA-32 and AMD64 ports of the compiler. + +; Copyright (C) 2005, 2006, 2007, 2008, 2009, +; 2010, 2011 Free Software Foundation, Inc. +; +; This file is part of GCC. +; +; GCC is free software; you can redistribute it and/or modify it under +; the terms of the GNU General Public License as published by the Free +; Software Foundation; either version 3, or (at your option) any later +; version. +; +; GCC is distributed in the hope that it will be useful, but WITHOUT ANY +; WARRANTY; without even the implied warranty of MERCHANTABILITY or +; FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +; for more details. 
+; +; You should have received a copy of the GNU General Public License +; along with GCC; see the file COPYING3. If not see +; . + +; Bit flags that specify the ISA we are compiling for. +Variable +int ix86_isa_flags = TARGET_64BIT_DEFAULT | TARGET_SUBTARGET_ISA_DEFAULT + +;; Definitions to add to the cl_target_option structure +;; -march= processor +TargetSave +unsigned char arch + +;; -mtune= processor +TargetSave +unsigned char tune + +;; -mfpath= +TargetSave +unsigned char fpmath + +;; CPU schedule model +TargetSave +unsigned char schedule + +;; branch cost +TargetSave +unsigned char branch_cost + +;; which flags were passed by the user +TargetSave +int ix86_isa_flags_explicit + +;; which flags were passed by the user +TargetSave +int ix86_target_flags_explicit + +;; whether -mtune was not specified +TargetSave +unsigned char tune_defaulted + +;; whether -march was specified +TargetSave +unsigned char arch_specified + +;; x86 options +m128bit-long-double +Target RejectNegative Report Mask(128BIT_LONG_DOUBLE) Save +sizeof(long double) is 16 + +m80387 +Target Report Mask(80387) Save +Use hardware fp + +m96bit-long-double +Target RejectNegative Report InverseMask(128BIT_LONG_DOUBLE) Save +sizeof(long double) is 12 + +maccumulate-outgoing-args +Target Report Mask(ACCUMULATE_OUTGOING_ARGS) Save +Reserve space for outgoing arguments in the function prologue + +malign-double +Target Report Mask(ALIGN_DOUBLE) Save +Align some doubles on dword boundary + +malign-functions= +Target RejectNegative Joined Var(ix86_align_funcs_string) +Function starts are aligned to this power of 2 + +malign-jumps= +Target RejectNegative Joined Var(ix86_align_jumps_string) +Jump targets are aligned to this power of 2 + +malign-loops= +Target RejectNegative Joined Var(ix86_align_loops_string) +Loop code aligned to this power of 2 + +malign-stringops +Target RejectNegative Report InverseMask(NO_ALIGN_STRINGOPS, ALIGN_STRINGOPS) Save +Align destination of the string operations + +march= +Target RejectNegative Joined Var(ix86_arch_string) +Generate code for given CPU + +masm= +Target RejectNegative Joined Var(ix86_asm_string) +Use given assembler dialect + +mbranch-cost= +Target RejectNegative Joined Var(ix86_branch_cost_string) +Branches are this expensive (1-5, arbitrary units) + +mlarge-data-threshold= +Target RejectNegative Joined Var(ix86_section_threshold_string) +Data greater than given threshold will go into .ldata section in x86-64 medium model + +mcmodel= +Target RejectNegative Joined Var(ix86_cmodel_string) +Use given x86-64 code model + +mcpu= +Target RejectNegative Joined Undocumented Alias(mtune=) Warn(%<-mcpu=%> is deprecated; use %<-mtune=%> or %<-march=%> instead) + +mfancy-math-387 +Target RejectNegative Report InverseMask(NO_FANCY_MATH_387, USE_FANCY_MATH_387) Save +Generate sin, cos, sqrt for FPU + +mforce-drap +Target Report Var(ix86_force_drap) +Always use Dynamic Realigned Argument Pointer (DRAP) to realign stack + +mfp-ret-in-387 +Target Report Mask(FLOAT_RETURNS) Save +Return values of functions in FPU registers + +mfpmath= +Target RejectNegative Joined Var(ix86_fpmath_string) +Generate floating point mathematics using given instruction set + +mhard-float +Target RejectNegative Mask(80387) MaskExists Save +Use hardware fp + +mieee-fp +Target Report Mask(IEEE_FP) Save +Use IEEE math for fp comparisons + +minline-all-stringops +Target Report Mask(INLINE_ALL_STRINGOPS) Save +Inline all known string operations + +minline-stringops-dynamically +Target Report Mask(INLINE_STRINGOPS_DYNAMICALLY) Save 
+Inline memset/memcpy string operations, but perform inline version only for small blocks + +mintel-syntax +Target Undocumented Alias(masm=, intel, att) Warn(%<-mintel-syntax%> and %<-mno-intel-syntax%> are deprecated; use %<-masm=intel%> and %<-masm=att%> instead) +;; Deprecated + +mms-bitfields +Target Report Mask(MS_BITFIELD_LAYOUT) Save +Use native (MS) bitfield layout + +mno-align-stringops +Target RejectNegative Report Mask(NO_ALIGN_STRINGOPS) Undocumented Save + +mno-fancy-math-387 +Target RejectNegative Report Mask(NO_FANCY_MATH_387) Undocumented Save + +mno-push-args +Target RejectNegative Report Mask(NO_PUSH_ARGS) Undocumented Save + +mno-red-zone +Target RejectNegative Report Mask(NO_RED_ZONE) Undocumented Save + +momit-leaf-frame-pointer +Target Report Mask(OMIT_LEAF_FRAME_POINTER) Save +Omit the frame pointer in leaf functions + +mpc +Target RejectNegative Report Joined Var(ix87_precision_string) +Set 80387 floating-point precision (-mpc32, -mpc64, -mpc80) + +mpreferred-stack-boundary= +Target RejectNegative Joined Var(ix86_preferred_stack_boundary_string) +Attempt to keep stack aligned to this power of 2 + +mincoming-stack-boundary= +Target RejectNegative Joined Var(ix86_incoming_stack_boundary_string) +Assume incoming stack aligned to this power of 2 + +mpush-args +Target Report InverseMask(NO_PUSH_ARGS, PUSH_ARGS) Save +Use push instructions to save outgoing arguments + +mred-zone +Target RejectNegative Report InverseMask(NO_RED_ZONE, RED_ZONE) Save +Use red-zone in the x86-64 code + +mregparm= +Target RejectNegative Joined Var(ix86_regparm_string) +Number of registers used to pass integer arguments + +mrtd +Target Report Mask(RTD) Save +Alternate calling convention + +msoft-float +Target InverseMask(80387) Save +Do not use hardware fp + +msseregparm +Target RejectNegative Mask(SSEREGPARM) Save +Use SSE register passing conventions for SF and DF mode + +mstackrealign +Target Report Var(ix86_force_align_arg_pointer) Init(-1) +Realign stack in prologue + +mstack-arg-probe +Target Report Mask(STACK_PROBE) Save +Enable stack probing + +mstringop-strategy= +Target RejectNegative Joined Var(ix86_stringop_string) +Chose strategy to generate stringop using + +mtls-dialect= +Target RejectNegative Joined Var(ix86_tls_dialect_string) +Use given thread-local storage dialect + +mtls-direct-seg-refs +Target Report Mask(TLS_DIRECT_SEG_REFS) +Use direct references against %gs when accessing tls data + +mtune= +Target RejectNegative Joined Var(ix86_tune_string) +Schedule code for given CPU + +mabi= +Target RejectNegative Joined Var(ix86_abi_string) +Generate code that conforms to the given ABI + +mveclibabi= +Target RejectNegative Joined Var(ix86_veclibabi_string) +Vector library ABI to use + +mvect8-ret-in-mem +Target Report Mask(VECT8_RETURNS) Save +Return 8-byte vectors in memory + +mrecip +Target Report Mask(RECIP) Save +Generate reciprocals instead of divss and sqrtss. + +mcld +Target Report Mask(CLD) Save +Generate cld instruction in the function prologue. + +mvzeroupper +Target Report Mask(VZEROUPPER) Save +Generate vzeroupper instruction before a transfer of control flow out of +the function. + +mdispatch-scheduler +Target RejectNegative Var(flag_dispatch_scheduler) +Do dispatch scheduling if processor is bdver1 and Haifa scheduling +is selected. + +mprefer-avx128 +Target Report Mask(PREFER_AVX128) SAVE +Use 128-bit AVX instructions instead of 256-bit AVX instructions in the auto-vectorizer. 
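Of the tuning options above, -mvzeroupper addresses the AVX-to-SSE transition cost: leaving the upper halves of the ymm registers dirty when control passes to legacy SSE code is expensive on several microarchitectures, so with this option the compiler emits vzeroupper before transfers of control out of AVX-using functions. A usage sketch, assuming compilation with something like -O2 -mavx -mvzeroupper; legacy_sse_helper is a placeholder for code built without AVX.

#include <immintrin.h>

extern void legacy_sse_helper (const float *);   /* assumed: non-AVX code */

void
scale_and_hand_off (float *dst, const float *src)
{
  __m256 v = _mm256_loadu_ps (src);      /* 256-bit AVX work */
  v = _mm256_add_ps (v, v);
  _mm256_storeu_ps (dst, v);
  legacy_sse_helper (dst);               /* vzeroupper is emitted before
                                            calls and returns that leave
                                            this AVX-using function */
}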
+ +;; ISA support + +m32 +Target RejectNegative Negative(m64) Report InverseMask(ISA_64BIT) Var(ix86_isa_flags) Save +Generate 32bit i386 code + +m64 +Target RejectNegative Negative(m32) Report Mask(ISA_64BIT) Var(ix86_isa_flags) Save +Generate 64bit x86-64 code + +mmmx +Target Report Mask(ISA_MMX) Var(ix86_isa_flags) Save +Support MMX built-in functions + +m3dnow +Target Report Mask(ISA_3DNOW) Var(ix86_isa_flags) Save +Support 3DNow! built-in functions + +m3dnowa +Target Undocumented Mask(ISA_3DNOW_A) Var(ix86_isa_flags) Save +Support Athlon 3Dnow! built-in functions + +msse +Target Report Mask(ISA_SSE) Var(ix86_isa_flags) Save +Support MMX and SSE built-in functions and code generation + +msse2 +Target Report Mask(ISA_SSE2) Var(ix86_isa_flags) Save +Support MMX, SSE and SSE2 built-in functions and code generation + +msse3 +Target Report Mask(ISA_SSE3) Var(ix86_isa_flags) Save +Support MMX, SSE, SSE2 and SSE3 built-in functions and code generation + +mssse3 +Target Report Mask(ISA_SSSE3) Var(ix86_isa_flags) Save +Support MMX, SSE, SSE2, SSE3 and SSSE3 built-in functions and code generation + +msse4.1 +Target Report Mask(ISA_SSE4_1) Var(ix86_isa_flags) Save +Support MMX, SSE, SSE2, SSE3, SSSE3 and SSE4.1 built-in functions and code generation + +msse4.2 +Target Report Mask(ISA_SSE4_2) Var(ix86_isa_flags) Save +Support MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1 and SSE4.2 built-in functions and code generation + +msse4 +Target RejectNegative Report Mask(ISA_SSE4_2) MaskExists Var(ix86_isa_flags) Save +Support MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1 and SSE4.2 built-in functions and code generation + +mno-sse4 +Target RejectNegative Report InverseMask(ISA_SSE4_1) MaskExists Var(ix86_isa_flags) Save +Do not support SSE4.1 and SSE4.2 built-in functions and code generation + +msse5 +Target Undocumented Alias(mavx) Warn(%<-msse5%> was removed) +;; Deprecated + +mavx +Target Report Mask(ISA_AVX) Var(ix86_isa_flags) Save +Support MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2 and AVX built-in functions and code generation + +mfma +Target Report Mask(ISA_FMA) Var(ix86_isa_flags) Save +Support MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX and FMA built-in functions and code generation + +msse4a +Target Report Mask(ISA_SSE4A) Var(ix86_isa_flags) Save +Support MMX, SSE, SSE2, SSE3 and SSE4A built-in functions and code generation + +mfma4 +Target Report Mask(ISA_FMA4) Var(ix86_isa_flags) Save +Support FMA4 built-in functions and code generation + +mxop +Target Report Mask(ISA_XOP) Var(ix86_isa_flags) Save +Support XOP built-in functions and code generation + +mlwp +Target Report Mask(ISA_LWP) Var(ix86_isa_flags) Save +Support LWP built-in functions and code generation + +mabm +Target Report Mask(ISA_ABM) Var(ix86_isa_flags) Save +Support code generation of Advanced Bit Manipulation (ABM) instructions. + +mpopcnt +Target Report Mask(ISA_POPCNT) Var(ix86_isa_flags) Save +Support code generation of popcnt instruction. + +mbmi +Target Report Mask(ISA_BMI) Var(ix86_isa_flags) Save +Support BMI built-in functions and code generation + +mtbm +Target Report Mask(ISA_TBM) Var(ix86_isa_flags) Save +Support TBM built-in functions and code generation + +mcx16 +Target Report Mask(ISA_CX16) Var(ix86_isa_flags) Save +Support code generation of cmpxchg16b instruction. + +msahf +Target Report Mask(ISA_SAHF) Var(ix86_isa_flags) Save +Support code generation of sahf instruction in 64bit x86-64 code. + +mmovbe +Target Report Mask(ISA_MOVBE) Var(ix86_isa_flags) Save +Support code generation of movbe instruction. 
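Besides enabling the corresponding built-ins and instruction patterns, each ISA flag is advertised to the preprocessor (for example __POPCNT__, __SSE4_2__, __AVX__), so source code can pick an implementation at compile time. A small sketch of that convention, assuming __POPCNT__ is defined whenever -mpopcnt, or an option that implies it such as -msse4.2, is in effect.

/* Sketch only.  With -mpopcnt in effect, __builtin_popcount expands to a
   single popcnt instruction; otherwise the portable fallback below runs. */
static inline int
portable_popcount (unsigned int x)
{
#ifdef __POPCNT__
  return __builtin_popcount (x);
#else
  int n = 0;
  while (x)
    {
      x &= x - 1;    /* clear the lowest set bit */
      n++;
    }
  return n;
#endif
}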
+ +mcrc32 +Target Report Mask(ISA_CRC32) Var(ix86_isa_flags) Save +Support code generation of crc32 instruction. + +maes +Target Report Mask(ISA_AES) Var(ix86_isa_flags) Save +Support AES built-in functions and code generation + +mpclmul +Target Report Mask(ISA_PCLMUL) Var(ix86_isa_flags) Save +Support PCLMUL built-in functions and code generation + +msse2avx +Target Report Var(ix86_sse2avx) +Encode SSE instructions with VEX prefix + +mfsgsbase +Target Report Mask(ISA_FSGSBASE) Var(ix86_isa_flags) Save +Support FSGSBASE built-in functions and code generation + +mrdrnd +Target Report Mask(ISA_RDRND) Var(ix86_isa_flags) Save +Support RDRND built-in functions and code generation + +mf16c +Target Report Mask(ISA_F16C) Var(ix86_isa_flags) Save +Support F16C built-in functions and code generation + +mfentry +Target Report Var(flag_fentry) Init(-1) +Emit profiling counter call at function entry before prologue. + +m8bit-idiv +Target Report Mask(USE_8BIT_IDIV) Save +Expand 32bit/64bit integer divide into 8bit unsigned integer divide with run-time check + +mavx256-split-unaligned-load +Target Report Mask(AVX256_SPLIT_UNALIGNED_LOAD) Save +Split 32-byte AVX unaligned load + +mavx256-split-unaligned-store +Target Report Mask(AVX256_SPLIT_UNALIGNED_STORE) Save +Split 32-byte AVX unaligned store diff --git a/gcc/config/i386/i386elf.h b/gcc/config/i386/i386elf.h new file mode 100644 index 000000000..79d7b9585 --- /dev/null +++ b/gcc/config/i386/i386elf.h @@ -0,0 +1,125 @@ +/* Target definitions for GCC for Intel 80386 using ELF + Copyright (C) 1988, 1991, 1995, 2000, 2001, 2002, 2007, 2008, 2010 + Free Software Foundation, Inc. + + Derived from sysv4.h written by Ron Guilmette (rfg@netcom.com). + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. */ + +/* Use stabs instead of DWARF debug format. */ +#undef PREFERRED_DEBUGGING_TYPE +#define PREFERRED_DEBUGGING_TYPE DBX_DEBUG + +#define TARGET_VERSION fprintf (stderr, " (i386 bare ELF target)"); + +/* The ELF ABI for the i386 says that records and unions are returned + in memory. */ + +#define SUBTARGET_RETURN_IN_MEMORY(TYPE, FNTYPE) \ + (TYPE_MODE (TYPE) == BLKmode \ + || (VECTOR_MODE_P (TYPE_MODE (TYPE)) && int_size_in_bytes (TYPE) == 8)) + +#undef CPP_SPEC +#define CPP_SPEC "" + +#define ENDFILE_SPEC "crtend.o%s" + +#define STARTFILE_SPEC "%{!shared: \ + %{!symbolic: \ + %{pg:gcrt0.o%s}%{!pg:%{p:mcrt0.o%s}%{!p:crt0.o%s}}}}\ + crtbegin.o%s" + +#undef DBX_REGISTER_NUMBER +#define DBX_REGISTER_NUMBER(n) \ + (TARGET_64BIT ? dbx64_register_map[n] : svr4_dbx_register_map[n]) + +/* The routine used to output sequences of byte values. We use a special + version of this for most svr4 targets because doing so makes the + generated assembly code more compact (and thus faster to assemble) + as well as more readable. Note that if we find subparts of the + character sequence which end with NUL (and which are shorter than + STRING_LIMIT) we output those using ASM_OUTPUT_LIMITED_STRING. 
*/ + +#undef ASM_OUTPUT_ASCII +#define ASM_OUTPUT_ASCII(FILE, STR, LENGTH) \ + do \ + { \ + const unsigned char *_ascii_bytes = \ + (const unsigned char *) (STR); \ + const unsigned char *limit = _ascii_bytes + (LENGTH); \ + unsigned bytes_in_chunk = 0; \ + for (; _ascii_bytes < limit; _ascii_bytes++) \ + { \ + const unsigned char *p; \ + if (bytes_in_chunk >= 64) \ + { \ + fputc ('\n', (FILE)); \ + bytes_in_chunk = 0; \ + } \ + for (p = _ascii_bytes; p < limit && *p != '\0'; p++) \ + continue; \ + if (p < limit && (p - _ascii_bytes) <= (long) STRING_LIMIT) \ + { \ + if (bytes_in_chunk > 0) \ + { \ + fputc ('\n', (FILE)); \ + bytes_in_chunk = 0; \ + } \ + ASM_OUTPUT_LIMITED_STRING ((FILE), _ascii_bytes); \ + _ascii_bytes = p; \ + } \ + else \ + { \ + if (bytes_in_chunk == 0) \ + fputs (ASM_BYTE, (FILE)); \ + else \ + fputc (',', (FILE)); \ + fprintf ((FILE), "0x%02x", *_ascii_bytes); \ + bytes_in_chunk += 5; \ + } \ + } \ + if (bytes_in_chunk > 0) \ + fputc ('\n', (FILE)); \ + } \ + while (0) + +#define LOCAL_LABEL_PREFIX "." + +/* Switch into a generic section. */ +#define TARGET_ASM_NAMED_SECTION default_elf_asm_named_section + +/* If defined, a C expression whose value is a string containing the + assembler operation to identify the following data as + uninitialized global data. If not defined, and neither + `ASM_OUTPUT_BSS' nor `ASM_OUTPUT_ALIGNED_BSS' are defined, + uninitialized global data will be output in the data section if + `-fno-common' is passed, otherwise `ASM_OUTPUT_COMMON' will be + used. */ +#undef BSS_SECTION_ASM_OP +#define BSS_SECTION_ASM_OP "\t.section\t.bss" + +/* Like `ASM_OUTPUT_BSS' except takes the required alignment as a + separate, explicit argument. If you define this macro, it is used + in place of `ASM_OUTPUT_BSS', and gives you more flexibility in + handling the required alignment of the variable. The alignment is + specified as the number of bits. + + Try to use function `asm_output_aligned_bss' defined in file + `varasm.c' when defining this macro. */ +#undef ASM_OUTPUT_ALIGNED_BSS +#define ASM_OUTPUT_ALIGNED_BSS(FILE, DECL, NAME, SIZE, ALIGN) \ + asm_output_aligned_bss (FILE, DECL, NAME, SIZE, ALIGN) diff --git a/gcc/config/i386/ia32intrin.h b/gcc/config/i386/ia32intrin.h new file mode 100644 index 000000000..76c20a69a --- /dev/null +++ b/gcc/config/i386/ia32intrin.h @@ -0,0 +1,234 @@ +/* Copyright (C) 2009, 2010 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +#ifndef _X86INTRIN_H_INCLUDED +# error "Never use directly; include instead." 
+#endif + +/* 32bit bsf */ +extern __inline int +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__bsfd (int __X) +{ + return __builtin_ctz (__X); +} + +/* 32bit bsr */ +extern __inline int +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__bsrd (int __X) +{ + return __builtin_ia32_bsrsi (__X); +} + +/* 32bit bswap */ +extern __inline int +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__bswapd (int __X) +{ + return __builtin_bswap32 (__X); +} + +#ifdef __SSE4_2__ +/* 32bit accumulate CRC32 (polynomial 0x11EDC6F41) value. */ +extern __inline unsigned int +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__crc32b (unsigned int __C, unsigned char __V) +{ + return __builtin_ia32_crc32qi (__C, __V); +} + +extern __inline unsigned int +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__crc32w (unsigned int __C, unsigned short __V) +{ + return __builtin_ia32_crc32hi (__C, __V); +} + +extern __inline unsigned int +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__crc32d (unsigned int __C, unsigned int __V) +{ + return __builtin_ia32_crc32si (__C, __V); +} +#endif /* SSE4.2 */ + +/* 32bit popcnt */ +extern __inline int +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__popcntd (unsigned int __X) +{ + return __builtin_popcount (__X); +} + +/* rdpmc */ +extern __inline unsigned long long +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__rdpmc (int __S) +{ + return __builtin_ia32_rdpmc (__S); +} + +/* rdtsc */ +extern __inline unsigned long long +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__rdtsc (void) +{ + return __builtin_ia32_rdtsc (); +} + +/* rdtscp */ +extern __inline unsigned long long +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__rdtscp (unsigned int *__A) +{ + return __builtin_ia32_rdtscp (__A); +} + +/* 8bit rol */ +extern __inline unsigned char +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__rolb (unsigned char __X, int __C) +{ + return __builtin_ia32_rolqi (__X, __C); +} + +/* 16bit rol */ +extern __inline unsigned short +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__rolw (unsigned short __X, int __C) +{ + return __builtin_ia32_rolhi (__X, __C); +} + +/* 32bit rol */ +extern __inline unsigned int +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__rold (unsigned int __X, int __C) +{ + return (__X << __C) | (__X >> (32 - __C)); +} + +/* 8bit ror */ +extern __inline unsigned char +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__rorb (unsigned char __X, int __C) +{ + return __builtin_ia32_rorqi (__X, __C); +} + +/* 16bit ror */ +extern __inline unsigned short +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__rorw (unsigned short __X, int __C) +{ + return __builtin_ia32_rorhi (__X, __C); +} + +/* 32bit ror */ +extern __inline unsigned int +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__rord (unsigned int __X, int __C) +{ + return (__X >> __C) | (__X << (32 - __C)); +} + +#ifdef __x86_64__ +/* 64bit bsf */ +extern __inline int +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__bsfq (long long __X) +{ + return __builtin_ctzll (__X); +} + +/* 64bit bsr */ +extern __inline int +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__bsrq (long long __X) +{ + return __builtin_ia32_bsrdi (__X); +} + +/* 64bit bswap */ +extern 
__inline long long +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__bswapq (long long __X) +{ + return __builtin_bswap64 (__X); +} + +#ifdef __SSE4_2__ +/* 64bit accumulate CRC32 (polynomial 0x11EDC6F41) value. */ +extern __inline unsigned long long +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__crc32q (unsigned long long __C, unsigned long long __V) +{ + return __builtin_ia32_crc32di (__C, __V); +} +#endif + +/* 64bit popcnt */ +extern __inline long long +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__popcntq (unsigned long long __X) +{ + return __builtin_popcountll (__X); +} + +/* 64bit rol */ +extern __inline unsigned long long +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__rolq (unsigned long long __X, int __C) +{ + return (__X << __C) | (__X >> (64 - __C)); +} + +/* 64bit ror */ +extern __inline unsigned long long +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__rorq (unsigned long long __X, int __C) +{ + return (__X >> __C) | (__X << (64 - __C)); +} + +#define _bswap64(a) __bswapq(a) +#define _popcnt64(a) __popcntq(a) +#define _lrotl(a,b) __rolq((a), (b)) +#define _lrotr(a,b) __rorq((a), (b)) +#else +#define _lrotl(a,b) __rold((a), (b)) +#define _lrotr(a,b) __rord((a), (b)) +#endif + +#define _bit_scan_forward(a) __bsfd(a) +#define _bit_scan_reverse(a) __bsrd(a) +#define _bswap(a) __bswapd(a) +#define _popcnt32(a) __popcntd(a) +#define _rdpmc(a) __rdpmc(a) +#define _rdtsc() __rdtsc() +#define _rdtscp(a) __rdtscp(a) +#define _rotwl(a,b) __rolw((a), (b)) +#define _rotwr(a,b) __rorw((a), (b)) +#define _rotl(a,b) __rold((a), (b)) +#define _rotr(a,b) __rord((a), (b)) diff --git a/gcc/config/i386/immintrin.h b/gcc/config/i386/immintrin.h new file mode 100644 index 000000000..11a1a4e0c --- /dev/null +++ b/gcc/config/i386/immintrin.h @@ -0,0 +1,203 @@ +/* Copyright (C) 2008, 2009, 2010 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . 
*/ + +#ifndef _IMMINTRIN_H_INCLUDED +#define _IMMINTRIN_H_INCLUDED + +#ifdef __MMX__ +#include +#endif + +#ifdef __SSE__ +#include +#endif + +#ifdef __SSE2__ +#include +#endif + +#ifdef __SSE3__ +#include +#endif + +#ifdef __SSSE3__ +#include +#endif + +#if defined (__SSE4_2__) || defined (__SSE4_1__) +#include +#endif + +#if defined (__AES__) || defined (__PCLMUL__) +#include +#endif + +#ifdef __AVX__ +#include +#endif + +#ifdef __RDRND__ +extern __inline int +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_rdrand16_step (unsigned short *__P) +{ + return __builtin_ia32_rdrand16_step (__P); +} + +extern __inline int +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_rdrand32_step (unsigned int *__P) +{ + return __builtin_ia32_rdrand32_step (__P); +} +#endif /* __RDRND__ */ + +#ifdef __x86_64__ +#ifdef __FSGSBASE__ +extern __inline unsigned int +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_readfsbase_u32 (void) +{ + return __builtin_ia32_rdfsbase32 (); +} + +extern __inline unsigned long long +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_readfsbase_u64 (void) +{ + return __builtin_ia32_rdfsbase64 (); +} + +extern __inline unsigned int +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_readgsbase_u32 (void) +{ + return __builtin_ia32_rdgsbase32 (); +} + +extern __inline unsigned long long +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_readgsbase_u64 (void) +{ + return __builtin_ia32_rdgsbase64 (); +} + +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_writefsbase_u32 (unsigned int __B) +{ + __builtin_ia32_wrfsbase32 (__B); +} + +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_writefsbase_u64 (unsigned long long __B) +{ + __builtin_ia32_wrfsbase64 (__B); +} + +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_writegsbase_u32 (unsigned int __B) +{ + __builtin_ia32_wrgsbase32 (__B); +} + +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_writegsbase_u64 (unsigned long long __B) +{ + __builtin_ia32_wrgsbase64 (__B); +} +#endif /* __FSGSBASE__ */ + +#ifdef __RDRND__ +extern __inline int +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_rdrand64_step (unsigned long long *__P) +{ + return __builtin_ia32_rdrand64_step (__P); +} +#endif /* __RDRND__ */ +#endif /* __x86_64__ */ + +#ifdef __F16C__ +extern __inline float __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_cvtsh_ss (unsigned short __S) +{ + __v8hi __H = __extension__ (__v8hi){ __S, 0, 0, 0, 0, 0, 0, 0 }; + __v4sf __A = __builtin_ia32_vcvtph2ps (__H); + return __builtin_ia32_vec_ext_v4sf (__A, 0); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtph_ps (__m128i __A) +{ + return (__m128) __builtin_ia32_vcvtph2ps ((__v8hi) __A); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtph_ps (__m128i __A) +{ + return (__m256) __builtin_ia32_vcvtph2ps256 ((__v8hi) __A); +} + +#ifdef __OPTIMIZE__ +extern __inline unsigned short __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_cvtss_sh (float __F, const int __I) +{ + __v4sf __A = __extension__ (__v4sf){ __F, 0, 0, 0 }; + __v8hi __H = __builtin_ia32_vcvtps2ph (__A, __I); + return (unsigned short) __builtin_ia32_vec_ext_v8hi (__H, 0); +} + +extern 
__inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtps_ph (__m128 __A, const int __I) +{ + return (__m128i) __builtin_ia32_vcvtps2ph ((__v4sf) __A, __I); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtps_ph (__m256 __A, const int __I) +{ + return (__m128i) __builtin_ia32_vcvtps2ph256 ((__v8sf) __A, __I); +} +#else +#define _cvtss_sh(__F, __I) \ + (__extension__ \ + ({ \ + __v4sf __A = __extension__ (__v4sf){ __F, 0, 0, 0 }; \ + __v8hi __H = __builtin_ia32_vcvtps2ph (__A, __I); \ + (unsigned short) __builtin_ia32_vec_ext_v8hi (__H, 0); \ + })) + +#define _mm_cvtps_ph(A, I) \ + ((__m128i) __builtin_ia32_vcvtps2ph ((__v4sf)(__m128) A, (int) (I))) + +#define _mm256_cvtps_ph(A, I) \ + ((__m128i) __builtin_ia32_vcvtps2ph256 ((__v8sf)(__m256) A, (int) (I))) +#endif + +#endif /* __F16C__ */ + +#endif /* _IMMINTRIN_H_INCLUDED */ diff --git a/gcc/config/i386/k6.md b/gcc/config/i386/k6.md new file mode 100644 index 000000000..030bc26a6 --- /dev/null +++ b/gcc/config/i386/k6.md @@ -0,0 +1,267 @@ +;; AMD K6/K6-2 Scheduling +;; Copyright (C) 2002, 2004, 2007 +;; Free Software Foundation, Inc. +;; +;; This file is part of GCC. +;; +;; GCC is free software; you can redistribute it and/or modify +;; it under the terms of the GNU General Public License as published by +;; the Free Software Foundation; either version 3, or (at your option) +;; any later version. +;; +;; GCC is distributed in the hope that it will be useful, +;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;; GNU General Public License for more details. +;; +;; You should have received a copy of the GNU General Public License +;; along with GCC; see the file COPYING3. If not see +;; . +;; +;; The K6 architecture is quite similar to PPro. Important difference is +;; that there are only two decoders and they seems to be much slower than +;; any of the execution units. So we have to pay much more attention to +;; proper scheduling for the decoders. +;; FIXME: We don't do that right now. A good start would be to sort the +;; instructions based on length. +;; +;; This description is based on data from the following documents: +;; +;; "AMD-K6 Processor Data Sheet (Preliminary information)" +;; Advanced Micro Devices, Inc., 1998. +;; +;; "AMD-K6 Processor Code Optimization Application Note" +;; Advanced Micro Devices, Inc., 2000. +;; +;; CPU execution units of the K6: +;; +;; store describes the Store unit. This unit is not modelled +;; completely and it is only used to model lea operation. +;; Otherwise it lies outside of any critical path. +;; load describes the Load unit +;; alux describes the Integer X unit +;; mm describes the Multimedia unit, which shares a pipe +;; with the Integer X unit. This unit is used for MMX, +;; which is not implemented for K6. +;; aluy describes the Integer Y unit +;; fpu describes the FPU unit +;; branch describes the Branch unit +;; +;; The fp unit is not pipelined, and it can only do one operation per two +;; cycles, including fxcg. +;; +;; Generally this is a very poor description, but at least no worse than +;; the old description, and a lot easier to extend to something more +;; reasonable if anyone still cares enough about this architecture in 2004. +;; +;; ??? fxch isn't handled; not an issue until sched3 after reg-stack is real. 
+ +(define_automaton "k6_decoder,k6_load_unit,k6_store_unit,k6_integer_units,k6_fpu_unit,k6_branch_unit") + +;; The K6 instruction decoding begins before the on-chip instruction cache is +;; filled. Depending on the length of the instruction, two simple instructions +;; can be decoded in two parallel short decoders, or one complex instruction can +;; be decoded in either the long or the vector decoder. For all practical +;; purposes, the long and vector decoder can be modelled as one decoder. +(define_cpu_unit "k6_decode_short0" "k6_decoder") +(define_cpu_unit "k6_decode_short1" "k6_decoder") +(define_cpu_unit "k6_decode_long" "k6_decoder") +(exclusion_set "k6_decode_long" "k6_decode_short0,k6_decode_short1") +(define_reservation "k6_decode_short" "k6_decode_short0|k6_decode_short1") +(define_reservation "k6_decode_vector" "k6_decode_long") + +(define_cpu_unit "k6_store" "k6_store_unit") +(define_cpu_unit "k6_load" "k6_load_unit") +(define_cpu_unit "k6_alux,k6_aluy" "k6_integer_units") +(define_cpu_unit "k6_fpu" "k6_fpu_unit") +(define_cpu_unit "k6_branch" "k6_branch_unit") + +;; Shift instructions and certain arithmetic are issued only on Integer X. +(define_insn_reservation "k6_alux_only" 1 + (and (eq_attr "cpu" "k6") + (and (eq_attr "type" "ishift,ishift1,rotate,rotate1,alu1,negnot") + (eq_attr "memory" "none"))) + "k6_decode_short,k6_alux") + +(define_insn_reservation "k6_alux_only_load" 3 + (and (eq_attr "cpu" "k6") + (and (eq_attr "type" "ishift,ishift1,rotate,rotate1,alu1,negnot") + (eq_attr "memory" "load"))) + "k6_decode_short,k6_load,k6_alux") + +(define_insn_reservation "k6_alux_only_store" 3 + (and (eq_attr "cpu" "k6") + (and (eq_attr "type" "ishift,ishift1,rotate,rotate1,alu1,negnot") + (eq_attr "memory" "store,both,unknown"))) + "k6_decode_long,k6_load,k6_alux,k6_store") + +;; Integer divide and multiply can only be issued on Integer X, too. +(define_insn_reservation "k6_alu_imul" 2 + (and (eq_attr "cpu" "k6") + (eq_attr "type" "imul")) + "k6_decode_vector,k6_alux*3") + +(define_insn_reservation "k6_alu_imul_load" 4 + (and (eq_attr "cpu" "k6") + (and (eq_attr "type" "imul") + (eq_attr "memory" "load"))) + "k6_decode_vector,k6_load,k6_alux*3") + +(define_insn_reservation "k6_alu_imul_store" 4 + (and (eq_attr "cpu" "k6") + (and (eq_attr "type" "imul") + (eq_attr "memory" "store,both,unknown"))) + "k6_decode_vector,k6_load,k6_alux*3,k6_store") + +;; ??? Guessed latencies based on the old pipeline description. +(define_insn_reservation "k6_alu_idiv" 17 + (and (eq_attr "cpu" "k6") + (and (eq_attr "type" "idiv") + (eq_attr "memory" "none"))) + "k6_decode_vector,k6_alux*17") + +(define_insn_reservation "k6_alu_idiv_mem" 19 + (and (eq_attr "cpu" "k6") + (and (eq_attr "type" "idiv") + (eq_attr "memory" "!none"))) + "k6_decode_vector,k6_load,k6_alux*17") + +;; Basic word and doubleword ALU ops can be issued on both Integer units. 
+(define_insn_reservation "k6_alu" 1 + (and (eq_attr "cpu" "k6") + (and (eq_attr "type" "alu,alu1,negnot,icmp,test,imovx,incdec,setcc") + (eq_attr "memory" "none"))) + "k6_decode_short,k6_alux|k6_aluy") + +(define_insn_reservation "k6_alu_load" 3 + (and (eq_attr "cpu" "k6") + (and (eq_attr "type" "alu,alu1,negnot,icmp,test,imovx,incdec,setcc") + (eq_attr "memory" "load"))) + "k6_decode_short,k6_load,k6_alux|k6_aluy") + +(define_insn_reservation "k6_alu_store" 3 + (and (eq_attr "cpu" "k6") + (and (eq_attr "type" "alu,alu1,negnot,icmp,test,imovx,incdec,setcc") + (eq_attr "memory" "store,both,unknown"))) + "k6_decode_long,k6_load,k6_alux|k6_aluy,k6_store") + +;; A "load immediate" operation does not require execution at all, +;; it is available immediately after decoding. Special-case this. +(define_insn_reservation "k6_alu_imov" 1 + (and (eq_attr "cpu" "k6") + (and (eq_attr "type" "imov") + (and (eq_attr "memory" "none") + (match_operand 1 "nonimmediate_operand")))) + "k6_decode_short,k6_alux|k6_aluy") + +(define_insn_reservation "k6_alu_imov_imm" 0 + (and (eq_attr "cpu" "k6") + (and (eq_attr "type" "imov") + (and (eq_attr "memory" "none") + (match_operand 1 "immediate_operand")))) + "k6_decode_short") + +(define_insn_reservation "k6_alu_imov_load" 2 + (and (eq_attr "cpu" "k6") + (and (eq_attr "type" "imov") + (eq_attr "memory" "load"))) + "k6_decode_short,k6_load") + +(define_insn_reservation "k6_alu_imov_store" 1 + (and (eq_attr "cpu" "k6") + (and (eq_attr "type" "imov") + (eq_attr "memory" "store"))) + "k6_decode_short,k6_store") + +(define_insn_reservation "k6_alu_imov_both" 2 + (and (eq_attr "cpu" "k6") + (and (eq_attr "type" "imov") + (eq_attr "memory" "both,unknown"))) + "k6_decode_long,k6_load,k6_alux|k6_aluy") + +;; The branch unit. +(define_insn_reservation "k6_branch_call" 1 + (and (eq_attr "cpu" "k6") + (eq_attr "type" "call,callv")) + "k6_decode_vector,k6_branch") + +(define_insn_reservation "k6_branch_branch" 1 + (and (eq_attr "cpu" "k6") + (eq_attr "type" "ibr")) + "k6_decode_short,k6_branch") + +;; The load and units have two pipeline stages. The load latency is +;; two cycles. +(define_insn_reservation "k6_load_pop" 3 + (and (eq_attr "cpu" "k6") + (ior (eq_attr "type" "pop") + (eq_attr "memory" "load,both"))) + "k6_decode_short,k6_load") + +(define_insn_reservation "k6_load_leave" 5 + (and (eq_attr "cpu" "k6") + (eq_attr "type" "leave")) + "k6_decode_long,k6_load,(k6_alux|k6_aluy)*2") + +;; ??? From the old pipeline description. Egad! +;; ??? Apparently we take care of this reservation in adjust_cost. +(define_insn_reservation "k6_load_str" 10 + (and (eq_attr "cpu" "k6") + (and (eq_attr "type" "str") + (eq_attr "memory" "load,both"))) + "k6_decode_vector,k6_load*10") + +;; The store unit handles lea and push. It is otherwise unmodelled. +(define_insn_reservation "k6_store_lea" 2 + (and (eq_attr "cpu" "k6") + (eq_attr "type" "lea")) + "k6_decode_short,k6_store,k6_alux|k6_aluy") + +(define_insn_reservation "k6_store_push" 2 + (and (eq_attr "cpu" "k6") + (ior (eq_attr "type" "push") + (eq_attr "memory" "store,both"))) + "k6_decode_short,k6_store") + +(define_insn_reservation "k6_store_str" 10 + (and (eq_attr "cpu" "k6") + (eq_attr "type" "str")) + "k6_store*10") + +;; Most FPU instructions have latency 2 and throughput 2. 
+(define_insn_reservation "k6_fpu" 2 + (and (eq_attr "cpu" "k6") + (and (eq_attr "type" "fop,fmov,fcmp,fistp") + (eq_attr "memory" "none"))) + "k6_decode_vector,k6_fpu*2") + +(define_insn_reservation "k6_fpu_load" 6 + (and (eq_attr "cpu" "k6") + (and (eq_attr "type" "fop,fmov,fcmp,fistp") + (eq_attr "memory" "load,both"))) + "k6_decode_short,k6_load,k6_fpu*2") + +(define_insn_reservation "k6_fpu_store" 6 + (and (eq_attr "cpu" "k6") + (and (eq_attr "type" "fop,fmov,fcmp,fistp") + (eq_attr "memory" "store"))) + "k6_decode_short,k6_store,k6_fpu*2") + +(define_insn_reservation "k6_fpu_fmul" 2 + (and (eq_attr "cpu" "k6") + (and (eq_attr "type" "fmul") + (eq_attr "memory" "none"))) + "k6_decode_short,k6_fpu*2") + +(define_insn_reservation "k6_fpu_fmul_load" 2 + (and (eq_attr "cpu" "k6") + (and (eq_attr "type" "fmul") + (eq_attr "memory" "load,both"))) + "k6_decode_short,k6_load,k6_fpu*2") + +;; ??? Guessed latencies from the old pipeline description. +(define_insn_reservation "k6_fpu_expensive" 56 + (and (eq_attr "cpu" "k6") + (eq_attr "type" "fdiv,fpspc")) + "k6_decode_short,k6_fpu*56") + diff --git a/gcc/config/i386/kfreebsd-gnu.h b/gcc/config/i386/kfreebsd-gnu.h new file mode 100644 index 000000000..b5fb2ba29 --- /dev/null +++ b/gcc/config/i386/kfreebsd-gnu.h @@ -0,0 +1,25 @@ +/* Definitions for Intel 386 running kFreeBSD-based GNU systems with ELF format + Copyright (C) 2004, 2007 + Free Software Foundation, Inc. + Contributed by Robert Millan. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. */ + +#undef LINK_EMULATION +#define LINK_EMULATION "elf_i386_fbsd" +#undef REG_NAME +#define REG_NAME(reg) sc_ ## reg diff --git a/gcc/config/i386/knetbsd-gnu.h b/gcc/config/i386/knetbsd-gnu.h new file mode 100644 index 000000000..54f5a6920 --- /dev/null +++ b/gcc/config/i386/knetbsd-gnu.h @@ -0,0 +1,23 @@ +/* Definitions for Intel 386 running kNetBSD-based GNU systems with ELF format + Copyright (C) 2004, 2007 + Free Software Foundation, Inc. + Contributed by Robert Millan. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. 
*/ + +#undef REG_NAME +#define REG_NAME(reg) sc_ ## reg diff --git a/gcc/config/i386/kopensolaris-gnu.h b/gcc/config/i386/kopensolaris-gnu.h new file mode 100644 index 000000000..3e315b83f --- /dev/null +++ b/gcc/config/i386/kopensolaris-gnu.h @@ -0,0 +1,22 @@ +/* Definitions for Intel 386 running kOpenSolaris-based GNU systems with ELF format + Copyright (C) 2009 + Free Software Foundation, Inc. + Contributed by Robert Millan. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. */ + +#undef MD_UNWIND_SUPPORT diff --git a/gcc/config/i386/libgcc-glibc.ver b/gcc/config/i386/libgcc-glibc.ver new file mode 100644 index 000000000..e79d3267f --- /dev/null +++ b/gcc/config/i386/libgcc-glibc.ver @@ -0,0 +1,186 @@ +# Copyright (C) 2008, 2010 Free Software Foundation, Inc. +# +# This file is part of GCC. +# +# GCC is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# GCC is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with GCC; see the file COPYING3. If not see +# . + +# In order to work around the very problems that force us to now generally +# create a libgcc.so, glibc reexported a number of routines from libgcc.a. +# By now choosing the same version tags for these specific routines, we +# maintain enough binary compatibility to allow future versions of glibc +# to defer implementation of these routines to libgcc.so via DT_AUXILIARY. + +%ifndef __x86_64__ +%exclude { + __divdi3 + __moddi3 + __udivdi3 + __umoddi3 + __register_frame + __register_frame_table + __deregister_frame + __register_frame_info + __deregister_frame_info + __frame_state_for + __register_frame_info_table +} + +%inherit GCC_3.0 GLIBC_2.0 +GLIBC_2.0 { + # Sampling of DImode arithmetic used by (at least) i386 and m68k. + __divdi3 + __moddi3 + __udivdi3 + __umoddi3 + + # Exception handling support functions used by most everyone. + __register_frame + __register_frame_table + __deregister_frame + __register_frame_info + __deregister_frame_info + __frame_state_for + __register_frame_info_table +} +%endif + +# 128 bit long double support was introduced with GCC 4.3.0 to 64bit +# and with GCC 4.4.0 to 32bit. These lines make the symbols to get +# a @@GCC_4.3.0 or @@GCC_4.4.0 attached. 
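
To make the versioning concrete: any object doing 128-bit long double or __float128 arithmetic pulls the matching soft-fp routine out of libgcc, and the blocks below are what attach the version tag to that routine, e.g. __addtf3@@GCC_4.3.0 on x86-64 and __addtf3@@GCC_4.4.0 on 32-bit. A minimal sketch follows; the __float128 spelling is a GCC extension assumed here, not something defined by this version script.

/* Compiling this unit creates an undefined reference to __addtf3, which is
   resolved against the versioned export from libgcc described below.  */
__float128
add_q (__float128 a, __float128 b)
{
  return a + b;   /* no hardware TFmode add on x86, so this is a libgcc call */
}
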
+ +%exclude { + __addtf3 + __divtc3 + __divtf3 + __eqtf2 + __extenddftf2 + __extendsftf2 + __extendxftf2 + __fixtfdi + __fixtfsi + __fixtfti + __fixunstfdi + __fixunstfsi + __fixunstfti + __floatditf + __floatsitf + __floattitf + __floatunditf + __floatunsitf + __floatuntitf + __getf2 + __gttf2 + __letf2 + __lttf2 + __multc3 + __multf3 + __negtf2 + __netf2 + __powitf2 + __subtf3 + __trunctfdf2 + __trunctfsf2 + __trunctfxf2 + __unordtf2 +} + +%ifdef __x86_64__ +# Those symbols had improper versions when they were added to gcc 4.3.0. +# We corrected the default version to GCC_4.3.0. But we keep the old +# version for backward binary compatibility. +GCC_3.0 { + __gttf2 + __lttf2 + __netf2 +} + +GCC_4.0.0 { + __divtc3 + __multc3 + __powitf2 +} + +GCC_4.3.0 { + __addtf3 + __divtc3 + __divtf3 + __eqtf2 + __extenddftf2 + __extendsftf2 + __extendxftf2 + __fixtfdi + __fixtfsi + __fixtfti + __fixunstfdi + __fixunstfsi + __fixunstfti + __floatditf + __floatsitf + __floattitf + __floatunditf + __floatunsitf + __floatuntitf + __getf2 + __gttf2 + __letf2 + __lttf2 + __multc3 + __multf3 + __negtf2 + __netf2 + __powitf2 + __subtf3 + __trunctfdf2 + __trunctfsf2 + __trunctfxf2 + __unordtf2 +} +%else +GCC_4.4.0 { + __addtf3 + __copysigntf3 + __divtc3 + __divtf3 + __eqtf2 + __extenddftf2 + __extendsftf2 + __fabstf2 + __fixtfdi + __fixtfsi + __fixunstfdi + __fixunstfsi + __floatditf + __floatsitf + __floatunditf + __floatunsitf + __getf2 + __gttf2 + __letf2 + __lttf2 + __multc3 + __multf3 + __negtf2 + __netf2 + __powitf2 + __subtf3 + __trunctfdf2 + __trunctfsf2 + __trunctfxf2 + __unordtf2 +} +GCC_4.5.0 { + __extendxftf2 +} +%endif diff --git a/gcc/config/i386/linux-unwind.h b/gcc/config/i386/linux-unwind.h new file mode 100644 index 000000000..9e4be8010 --- /dev/null +++ b/gcc/config/i386/linux-unwind.h @@ -0,0 +1,197 @@ +/* DWARF2 EH unwinding support for AMD x86-64 and x86. + Copyright (C) 2004, 2005, 2006, 2009, 2010, 2012 Free Software Foundation, + Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +Under Section 7 of GPL version 3, you are granted additional +permissions described in the GCC Runtime Library Exception, version +3.1, as published by the Free Software Foundation. + +You should have received a copy of the GNU General Public License and +a copy of the GCC Runtime Library Exception along with this program; +see the files COPYING3 and COPYING.RUNTIME respectively. If not, see +. */ + +/* Do code reading to identify a signal frame, and set the frame + state data appropriately. See unwind-dw2.c for the structs. + Don't use this at all if inhibit_libc is used. 
*/ + +#ifndef inhibit_libc + +#ifdef __x86_64__ + +#include +#include + +#define MD_FALLBACK_FRAME_STATE_FOR x86_64_fallback_frame_state + +static _Unwind_Reason_Code +x86_64_fallback_frame_state (struct _Unwind_Context *context, + _Unwind_FrameState *fs) +{ + unsigned char *pc = context->ra; + struct sigcontext *sc; + long new_cfa; + + /* movq __NR_rt_sigreturn, %rax ; syscall */ + if (*(unsigned char *)(pc+0) == 0x48 + && *(unsigned long *)(pc+1) == 0x050f0000000fc0c7) + { + struct ucontext *uc_ = context->cfa; + /* The void * cast is necessary to avoid an aliasing warning. + The aliasing warning is correct, but should not be a problem + because it does not alias anything. */ + sc = (struct sigcontext *) (void *) &uc_->uc_mcontext; + } + else + return _URC_END_OF_STACK; + + new_cfa = sc->rsp; + fs->regs.cfa_how = CFA_REG_OFFSET; + /* Register 7 is rsp */ + fs->regs.cfa_reg = 7; + fs->regs.cfa_offset = new_cfa - (long) context->cfa; + + /* The SVR4 register numbering macros aren't usable in libgcc. */ + fs->regs.reg[0].how = REG_SAVED_OFFSET; + fs->regs.reg[0].loc.offset = (long)&sc->rax - new_cfa; + fs->regs.reg[1].how = REG_SAVED_OFFSET; + fs->regs.reg[1].loc.offset = (long)&sc->rdx - new_cfa; + fs->regs.reg[2].how = REG_SAVED_OFFSET; + fs->regs.reg[2].loc.offset = (long)&sc->rcx - new_cfa; + fs->regs.reg[3].how = REG_SAVED_OFFSET; + fs->regs.reg[3].loc.offset = (long)&sc->rbx - new_cfa; + fs->regs.reg[4].how = REG_SAVED_OFFSET; + fs->regs.reg[4].loc.offset = (long)&sc->rsi - new_cfa; + fs->regs.reg[5].how = REG_SAVED_OFFSET; + fs->regs.reg[5].loc.offset = (long)&sc->rdi - new_cfa; + fs->regs.reg[6].how = REG_SAVED_OFFSET; + fs->regs.reg[6].loc.offset = (long)&sc->rbp - new_cfa; + fs->regs.reg[8].how = REG_SAVED_OFFSET; + fs->regs.reg[8].loc.offset = (long)&sc->r8 - new_cfa; + fs->regs.reg[9].how = REG_SAVED_OFFSET; + fs->regs.reg[9].loc.offset = (long)&sc->r9 - new_cfa; + fs->regs.reg[10].how = REG_SAVED_OFFSET; + fs->regs.reg[10].loc.offset = (long)&sc->r10 - new_cfa; + fs->regs.reg[11].how = REG_SAVED_OFFSET; + fs->regs.reg[11].loc.offset = (long)&sc->r11 - new_cfa; + fs->regs.reg[12].how = REG_SAVED_OFFSET; + fs->regs.reg[12].loc.offset = (long)&sc->r12 - new_cfa; + fs->regs.reg[13].how = REG_SAVED_OFFSET; + fs->regs.reg[13].loc.offset = (long)&sc->r13 - new_cfa; + fs->regs.reg[14].how = REG_SAVED_OFFSET; + fs->regs.reg[14].loc.offset = (long)&sc->r14 - new_cfa; + fs->regs.reg[15].how = REG_SAVED_OFFSET; + fs->regs.reg[15].loc.offset = (long)&sc->r15 - new_cfa; + fs->regs.reg[16].how = REG_SAVED_OFFSET; + fs->regs.reg[16].loc.offset = (long)&sc->rip - new_cfa; + fs->retaddr_column = 16; + fs->signal_frame = 1; + return _URC_NO_REASON; +} + +#else /* ifdef __x86_64__ */ + +/* There's no sys/ucontext.h for glibc 2.0, so no + signal-turned-exceptions for them. There's also no configure-run for + the target, so we can't check on (e.g.) HAVE_SYS_UCONTEXT_H. Using the + target libc version macro should be enough. 
*/ +#if defined __GLIBC__ && !(__GLIBC__ == 2 && __GLIBC_MINOR__ == 0) + +#include +#include + +#define MD_FALLBACK_FRAME_STATE_FOR x86_fallback_frame_state + +static _Unwind_Reason_Code +x86_fallback_frame_state (struct _Unwind_Context *context, + _Unwind_FrameState *fs) +{ + unsigned char *pc = context->ra; + struct sigcontext *sc; + long new_cfa; + + /* popl %eax ; movl $__NR_sigreturn,%eax ; int $0x80 */ + if (*(unsigned short *)(pc+0) == 0xb858 + && *(unsigned int *)(pc+2) == 119 + && *(unsigned short *)(pc+6) == 0x80cd) + sc = context->cfa + 4; + /* movl $__NR_rt_sigreturn,%eax ; int $0x80 */ + else if (*(unsigned char *)(pc+0) == 0xb8 + && *(unsigned int *)(pc+1) == 173 + && *(unsigned short *)(pc+5) == 0x80cd) + { + struct rt_sigframe { + int sig; + siginfo_t *pinfo; + void *puc; + siginfo_t info; + struct ucontext uc; + } *rt_ = context->cfa; + /* The void * cast is necessary to avoid an aliasing warning. + The aliasing warning is correct, but should not be a problem + because it does not alias anything. */ + sc = (struct sigcontext *) (void *) &rt_->uc.uc_mcontext; + } + else + return _URC_END_OF_STACK; + + new_cfa = sc->REG_NAME(esp); + fs->regs.cfa_how = CFA_REG_OFFSET; + fs->regs.cfa_reg = 4; + fs->regs.cfa_offset = new_cfa - (long) context->cfa; + + /* The SVR4 register numbering macros aren't usable in libgcc. */ + fs->regs.reg[0].how = REG_SAVED_OFFSET; + fs->regs.reg[0].loc.offset = (long)&sc->REG_NAME(eax) - new_cfa; + fs->regs.reg[3].how = REG_SAVED_OFFSET; + fs->regs.reg[3].loc.offset = (long)&sc->REG_NAME(ebx) - new_cfa; + fs->regs.reg[1].how = REG_SAVED_OFFSET; + fs->regs.reg[1].loc.offset = (long)&sc->REG_NAME(ecx) - new_cfa; + fs->regs.reg[2].how = REG_SAVED_OFFSET; + fs->regs.reg[2].loc.offset = (long)&sc->REG_NAME(edx) - new_cfa; + fs->regs.reg[6].how = REG_SAVED_OFFSET; + fs->regs.reg[6].loc.offset = (long)&sc->REG_NAME(esi) - new_cfa; + fs->regs.reg[7].how = REG_SAVED_OFFSET; + fs->regs.reg[7].loc.offset = (long)&sc->REG_NAME(edi) - new_cfa; + fs->regs.reg[5].how = REG_SAVED_OFFSET; + fs->regs.reg[5].loc.offset = (long)&sc->REG_NAME(ebp) - new_cfa; + fs->regs.reg[8].how = REG_SAVED_OFFSET; + fs->regs.reg[8].loc.offset = (long)&sc->REG_NAME(eip) - new_cfa; + fs->retaddr_column = 8; + fs->signal_frame = 1; + return _URC_NO_REASON; +} + +#define MD_FROB_UPDATE_CONTEXT x86_frob_update_context + +/* Fix up for kernels that have vDSO, but don't have S flag in it. */ + +static void +x86_frob_update_context (struct _Unwind_Context *context, + _Unwind_FrameState *fs ATTRIBUTE_UNUSED) +{ + unsigned char *pc = context->ra; + + /* movl $__NR_rt_sigreturn,%eax ; {int $0x80 | syscall} */ + if (*(unsigned char *)(pc+0) == 0xb8 + && *(unsigned int *)(pc+1) == 173 + && (*(unsigned short *)(pc+5) == 0x80cd + || *(unsigned short *)(pc+5) == 0x050f)) + _Unwind_SetSignalFrame (context, 1); +} + +#endif /* not glibc 2.0 */ +#endif /* ifdef __x86_64__ */ +#endif /* ifdef inhibit_libc */ diff --git a/gcc/config/i386/linux.h b/gcc/config/i386/linux.h new file mode 100644 index 000000000..0084c8313 --- /dev/null +++ b/gcc/config/i386/linux.h @@ -0,0 +1,215 @@ +/* Definitions for Intel 386 running Linux-based GNU systems with ELF format. + Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2001, 2002, 2004, 2005, + 2006, 2007, 2008, 2009, 2010, 2011 Free Software Foundation, Inc. + Contributed by Eric Youngdale. + Modified for stabs-in-ELF by H.J. Lu. + +This file is part of GCC. 
+ +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. */ + +/* Output at beginning of assembler file. */ +/* The .file command should always begin the output. */ +#define TARGET_ASM_FILE_START_FILE_DIRECTIVE true + +#define TARGET_VERSION fprintf (stderr, " (i386 Linux/ELF)"); + +/* The svr4 ABI for the i386 says that records and unions are returned + in memory. */ +#undef DEFAULT_PCC_STRUCT_RETURN +#define DEFAULT_PCC_STRUCT_RETURN 1 + +/* We arrange for the whole %gs segment to map the tls area. */ +#undef TARGET_TLS_DIRECT_SEG_REFS_DEFAULT +#define TARGET_TLS_DIRECT_SEG_REFS_DEFAULT MASK_TLS_DIRECT_SEG_REFS + +#undef ASM_COMMENT_START +#define ASM_COMMENT_START "#" + +#undef DBX_REGISTER_NUMBER +#define DBX_REGISTER_NUMBER(n) \ + (TARGET_64BIT ? dbx64_register_map[n] : svr4_dbx_register_map[n]) + +/* Output assembler code to FILE to call the profiler. + To the best of my knowledge, no Linux libc has required the label + argument to mcount. */ + +#define NO_PROFILE_COUNTERS 1 + +#undef MCOUNT_NAME +#define MCOUNT_NAME "mcount" + +/* The GLIBC version of mcount for the x86 assumes that there is a + frame, so we cannot allow profiling without a frame pointer. */ + +#undef SUBTARGET_FRAME_POINTER_REQUIRED +#define SUBTARGET_FRAME_POINTER_REQUIRED crtl->profile + +#undef SIZE_TYPE +#define SIZE_TYPE "unsigned int" + +#undef PTRDIFF_TYPE +#define PTRDIFF_TYPE "int" + +#undef WCHAR_TYPE +#define WCHAR_TYPE "long int" + +#undef WCHAR_TYPE_SIZE +#define WCHAR_TYPE_SIZE BITS_PER_WORD + +#define TARGET_OS_CPP_BUILTINS() \ + do \ + { \ + LINUX_TARGET_OS_CPP_BUILTINS(); \ + } \ + while (0) + +#undef CPP_SPEC +#define CPP_SPEC "%{posix:-D_POSIX_SOURCE} %{pthread:-D_REENTRANT}" + +#undef CC1_SPEC +#define CC1_SPEC "%(cc1_cpu) %{profile:-p}" + +/* Provide a LINK_SPEC appropriate for Linux. Here we provide support + for the special GCC options -static and -shared, which allow us to + link things in one of these three modes by applying the appropriate + combinations of options at link-time. + + When the -shared link option is used a final link is not being + done. */ + +/* These macros may be overridden in k*bsd-gnu.h and i386/k*bsd-gnu.h. */ +#define LINK_EMULATION "elf_i386" +#define GLIBC_DYNAMIC_LINKER "/lib/ld-linux.so.2" + +#undef ASM_SPEC +#define ASM_SPEC \ + "--32 %{!mno-sse2avx:%{mavx:-msse2avx}} %{msse2avx:%{!mavx:-msse2avx}}" + +#undef SUBTARGET_EXTRA_SPECS +#define SUBTARGET_EXTRA_SPECS \ + { "link_emulation", LINK_EMULATION },\ + { "dynamic_linker", LINUX_DYNAMIC_LINKER } + +#undef LINK_SPEC +#define LINK_SPEC "-m %(link_emulation) %{shared:-shared} \ + %{!shared: \ + %{!static: \ + %{rdynamic:-export-dynamic} \ + -dynamic-linker %(dynamic_linker)} \ + %{static:-static}}" + +/* Similar to standard Linux, but adding -ffast-math support. 
*/ +#undef ENDFILE_SPEC +#define ENDFILE_SPEC \ + "%{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s} \ + %{mpc32:crtprec32.o%s} \ + %{mpc64:crtprec64.o%s} \ + %{mpc80:crtprec80.o%s} \ + %{shared|pie:crtendS.o%s;:crtend.o%s} crtn.o%s" + +/* A C statement (sans semicolon) to output to the stdio stream + FILE the assembler definition of uninitialized global DECL named + NAME whose size is SIZE bytes and alignment is ALIGN bytes. + Try to use asm_output_aligned_bss to implement this macro. */ + +#define ASM_OUTPUT_ALIGNED_BSS(FILE, DECL, NAME, SIZE, ALIGN) \ + asm_output_aligned_bss (FILE, DECL, NAME, SIZE, ALIGN) + +/* A C statement to output to the stdio stream FILE an assembler + command to advance the location counter to a multiple of 1< 8 byte \ + alignment is preferred. */ \ + if ((LOG) > 3 \ + && (1 << (LOG)) > ((MAX_SKIP) + 1) \ + && (MAX_SKIP) >= 7) \ + fputs ("\t.p2align 3\n", (FILE)); \ + } \ + } \ + } while (0) +#endif + +/* Handle special EH pointer encodings. Absolute, pc-relative, and + indirect are handled automatically. */ +#define ASM_MAYBE_OUTPUT_ENCODED_ADDR_RTX(FILE, ENCODING, SIZE, ADDR, DONE) \ + do { \ + if ((SIZE) == 4 && ((ENCODING) & 0x70) == DW_EH_PE_datarel) \ + { \ + fputs (ASM_LONG, FILE); \ + assemble_name (FILE, XSTR (ADDR, 0)); \ + fputs (((ENCODING) & DW_EH_PE_indirect ? "@GOT" : "@GOTOFF"), FILE); \ + goto DONE; \ + } \ + } while (0) + +/* Used by crtstuff.c to initialize the base of data-relative relocations. + These are GOT relative on x86, so return the pic register. */ +#ifdef __PIC__ +#define CRT_GET_RFIB_DATA(BASE) \ + { \ + register void *ebx_ __asm__("ebx"); \ + BASE = ebx_; \ + } +#else +#define CRT_GET_RFIB_DATA(BASE) \ + __asm__ ("call\t.LPR%=\n" \ + ".LPR%=:\n\t" \ + "pop{l}\t%0\n\t" \ + /* Due to a GAS bug, this cannot use EAX. That encodes \ + smaller than the traditional EBX, which results in the \ + offset being off by one. */ \ + "add{l}\t{$_GLOBAL_OFFSET_TABLE_+[.-.LPR%=],%0" \ + "|%0,_GLOBAL_OFFSET_TABLE_+(.-.LPR%=)}" \ + : "=d"(BASE)) +#endif + +/* Put all *tf routines in libgcc. */ +#undef LIBGCC2_HAS_TF_MODE +#define LIBGCC2_HAS_TF_MODE 1 +#define LIBGCC2_TF_CEXT q +#define TF_SIZE 113 + +#define TARGET_ASM_FILE_END file_end_indicate_exec_stack + +#define MD_UNWIND_SUPPORT "config/i386/linux-unwind.h" + +/* The stack pointer needs to be moved while checking the stack. */ +#define STACK_CHECK_MOVING_SP 1 + +/* Static stack checking is supported by means of probes. */ +#define STACK_CHECK_STATIC_BUILTIN 1 + +/* This macro may be overridden in i386/k*bsd-gnu.h. */ +#define REG_NAME(reg) reg + +#ifdef TARGET_LIBC_PROVIDES_SSP +/* i386 glibc provides __stack_chk_guard in %gs:0x14. */ +#define TARGET_THREAD_SSP_OFFSET 0x14 + +/* We steal the last transactional memory word. */ +#define TARGET_CAN_SPLIT_STACK +#define TARGET_THREAD_SPLIT_STACK_OFFSET 0x30 +#endif diff --git a/gcc/config/i386/linux64.h b/gcc/config/i386/linux64.h new file mode 100644 index 000000000..103ab0c99 --- /dev/null +++ b/gcc/config/i386/linux64.h @@ -0,0 +1,132 @@ +/* Definitions for AMD x86-64 running Linux-based GNU systems with ELF format. + Copyright (C) 2001, 2002, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 + Free Software Foundation, Inc. + Contributed by Jan Hubicka , based on linux.h. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. 
+ +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +Under Section 7 of GPL version 3, you are granted additional +permissions described in the GCC Runtime Library Exception, version +3.1, as published by the Free Software Foundation. + +You should have received a copy of the GNU General Public License and +a copy of the GCC Runtime Library Exception along with this program; +see the files COPYING3 and COPYING.RUNTIME respectively. If not, see +. */ + +#if TARGET_64BIT_DEFAULT +#define TARGET_VERSION fprintf (stderr, " (x86-64 Linux/ELF)"); +#else +#define TARGET_VERSION fprintf (stderr, " (i386 Linux/ELF)"); +#endif + +#define TARGET_OS_CPP_BUILTINS() \ + do \ + { \ + LINUX_TARGET_OS_CPP_BUILTINS(); \ + } \ + while (0) + +#undef CPP_SPEC +#define CPP_SPEC "%{posix:-D_POSIX_SOURCE} %{pthread:-D_REENTRANT}" + +#undef CC1_SPEC +#define CC1_SPEC "%(cc1_cpu) %{profile:-p}" + +/* The svr4 ABI for the i386 says that records and unions are returned + in memory. In the 64bit compilation we will turn this flag off in + ix86_option_override_internal, as we never do pcc_struct_return + scheme on this target. */ +#undef DEFAULT_PCC_STRUCT_RETURN +#define DEFAULT_PCC_STRUCT_RETURN 1 + +/* We arrange for the whole %fs segment to map the tls area. */ +#undef TARGET_TLS_DIRECT_SEG_REFS_DEFAULT +#define TARGET_TLS_DIRECT_SEG_REFS_DEFAULT MASK_TLS_DIRECT_SEG_REFS + +/* Provide a LINK_SPEC. Here we provide support for the special GCC + options -static and -shared, which allow us to link things in one + of these three modes by applying the appropriate combinations of + options at link-time. + + When the -shared link option is used a final link is not being + done. */ + +#define GLIBC_DYNAMIC_LINKER32 "/lib/ld-linux.so.2" +#define GLIBC_DYNAMIC_LINKER64 "/lib64/ld-linux-x86-64.so.2" + +#if TARGET_64BIT_DEFAULT +#define SPEC_32 "m32" +#define SPEC_64 "!m32" +#else +#define SPEC_32 "!m64" +#define SPEC_64 "m64" +#endif + +#undef ASM_SPEC +#define ASM_SPEC "%{" SPEC_32 ":--32} %{" SPEC_64 ":--64} \ + %{!mno-sse2avx:%{mavx:-msse2avx}} %{msse2avx:%{!mavx:-msse2avx}}" + +#undef LINK_SPEC +#define LINK_SPEC "%{" SPEC_64 ":-m elf_x86_64} %{" SPEC_32 ":-m elf_i386} \ + %{shared:-shared} \ + %{!shared: \ + %{!static: \ + %{rdynamic:-export-dynamic} \ + %{" SPEC_32 ":-dynamic-linker " LINUX_DYNAMIC_LINKER32 "} \ + %{" SPEC_64 ":-dynamic-linker " LINUX_DYNAMIC_LINKER64 "}} \ + %{static:-static}}" + +/* Similar to standard Linux, but adding -ffast-math support. */ +#undef ENDFILE_SPEC +#define ENDFILE_SPEC \ + "%{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s} \ + %{mpc32:crtprec32.o%s} \ + %{mpc64:crtprec64.o%s} \ + %{mpc80:crtprec80.o%s} \ + %{shared|pie:crtendS.o%s;:crtend.o%s} crtn.o%s" + +#if TARGET_64BIT_DEFAULT +#define MULTILIB_DEFAULTS { "m64" } +#else +#define MULTILIB_DEFAULTS { "m32" } +#endif + +/* Put all *tf routines in libgcc. */ +#undef LIBGCC2_HAS_TF_MODE +#define LIBGCC2_HAS_TF_MODE 1 +#define LIBGCC2_TF_CEXT q +#define TF_SIZE 113 + +#define TARGET_ASM_FILE_END file_end_indicate_exec_stack + +#define MD_UNWIND_SUPPORT "config/i386/linux-unwind.h" + +/* The stack pointer needs to be moved while checking the stack. */ +#define STACK_CHECK_MOVING_SP 1 + +/* Static stack checking is supported by means of probes. */ +#define STACK_CHECK_STATIC_BUILTIN 1 + +/* This macro may be overridden in i386/k*bsd-gnu.h. 
*/ +#define REG_NAME(reg) reg + +#ifdef TARGET_LIBC_PROVIDES_SSP +/* i386 glibc provides __stack_chk_guard in %gs:0x14, + x86_64 glibc provides it in %fs:0x28. */ +#define TARGET_THREAD_SSP_OFFSET (TARGET_64BIT ? 0x28 : 0x14) + +/* We steal the last transactional memory word. */ +#define TARGET_CAN_SPLIT_STACK +#define TARGET_THREAD_SPLIT_STACK_OFFSET (TARGET_64BIT ? 0x70 : 0x30) +#endif diff --git a/gcc/config/i386/lwpintrin.h b/gcc/config/i386/lwpintrin.h new file mode 100644 index 000000000..954b039e5 --- /dev/null +++ b/gcc/config/i386/lwpintrin.h @@ -0,0 +1,100 @@ +/* Copyright (C) 2007, 2008, 2009 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +#ifndef _X86INTRIN_H_INCLUDED +# error "Never use directly; include instead." +#endif + +#ifndef _LWPINTRIN_H_INCLUDED +#define _LWPINTRIN_H_INCLUDED + +#ifndef __LWP__ +# error "LWP instruction set not enabled" +#else + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__llwpcb (void *pcbAddress) +{ + __builtin_ia32_llwpcb (pcbAddress); +} + +extern __inline void * __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__slwpcb (void) +{ + return __builtin_ia32_slwpcb (); +} + +#ifdef __OPTIMIZE__ +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__lwpval32 (unsigned int data2, unsigned int data1, unsigned int flags) +{ + __builtin_ia32_lwpval32 (data2, data1, flags); +} + +#ifdef __x86_64__ +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__lwpval64 (unsigned long long data2, unsigned int data1, unsigned int flags) +{ + __builtin_ia32_lwpval64 (data2, data1, flags); +} +#endif +#else +#define __lwpval32(D2, D1, F) \ + (__builtin_ia32_lwpval32 ((unsigned int) (D2), (unsigned int) (D1), \ + (unsigned int) (F))) +#ifdef __x86_64__ +#define __lwpval64(D2, D1, F) \ + (__builtin_ia32_lwpval64 ((unsigned long long) (D2), (unsigned int) (D1), \ + (unsigned int) (F))) +#endif +#endif + + +#ifdef __OPTIMIZE__ +extern __inline unsigned char __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__lwpins32 (unsigned int data2, unsigned int data1, unsigned int flags) +{ + return __builtin_ia32_lwpins32 (data2, data1, flags); +} + +#ifdef __x86_64__ +extern __inline unsigned char __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__lwpins64 (unsigned long long data2, unsigned int data1, unsigned int flags) +{ + return __builtin_ia32_lwpins64 (data2, data1, flags); +} +#endif +#else +#define __lwpins32(D2, D1, F) \ + (__builtin_ia32_lwpins32 ((unsigned int) (D2), (unsigned int) (D1), \ + (unsigned int) (F))) +#ifdef 
__x86_64__ +#define __lwpins64(D2, D1, F) \ + (__builtin_ia32_lwpins64 ((unsigned long long) (D2), (unsigned int) (D1), \ + (unsigned int) (F))) +#endif +#endif + +#endif /* __LWP__ */ + +#endif /* _LWPINTRIN_H_INCLUDED */ diff --git a/gcc/config/i386/lynx.h b/gcc/config/i386/lynx.h new file mode 100644 index 000000000..df73e9b85 --- /dev/null +++ b/gcc/config/i386/lynx.h @@ -0,0 +1,90 @@ +/* Definitions for LynxOS on i386. + Copyright (C) 1993, 1995, 1996, 2002, 2004, 2005, 2007, 2010 + Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. */ + +#define TARGET_VERSION fputs (" (i386/LynxOS)", stderr); + +#define TARGET_OS_CPP_BUILTINS() \ + do \ + { \ + builtin_define ("__LITTLE_ENDIAN__"); \ + builtin_define ("__x86__"); \ + } \ + while (0) + +/* The svr4 ABI for the i386 says that records and unions are returned + in memory. */ + +#define DEFAULT_PCC_STRUCT_RETURN 1 + +/* BSS_SECTION_ASM_OP gets defined i386/unix.h. */ + +#define ASM_OUTPUT_ALIGNED_BSS(FILE, DECL, NAME, SIZE, ALIGN) \ + asm_output_aligned_bss (FILE, DECL, NAME, SIZE, ALIGN) + +/* LynxOS's GDB counts the floating point registers from 16. */ + +#undef DBX_REGISTER_NUMBER +#define DBX_REGISTER_NUMBER(n) \ + (TARGET_64BIT ? dbx64_register_map[n] \ + : (n) == 0 ? 0 \ + : (n) == 1 ? 2 \ + : (n) == 2 ? 1 \ + : (n) == 3 ? 3 \ + : (n) == 4 ? 6 \ + : (n) == 5 ? 7 \ + : (n) == 6 ? 5 \ + : (n) == 7 ? 4 \ + : ((n) >= FIRST_STACK_REG && (n) <= LAST_STACK_REG) ? (int) (n) + 8 \ + : (-1)) + +/* A C statement to output to the stdio stream FILE an assembler + command to advance the location counter to a multiple of 1< types on systems using mingw. + Copyright (C) 2009 Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. 
*/ + +#define SIG_ATOMIC_TYPE "int" + +#define INT8_TYPE "signed char" +#define INT16_TYPE "short int" +#define INT32_TYPE "int" +#define INT64_TYPE "long long int" +#define UINT8_TYPE "unsigned char" +#define UINT16_TYPE "short unsigned int" +#define UINT32_TYPE "unsigned int" +#define UINT64_TYPE "long long unsigned int" + +#define INT_LEAST8_TYPE "signed char" +#define INT_LEAST16_TYPE "short int" +#define INT_LEAST32_TYPE "int" +#define INT_LEAST64_TYPE "long long int" +#define UINT_LEAST8_TYPE "unsigned char" +#define UINT_LEAST16_TYPE "short unsigned int" +#define UINT_LEAST32_TYPE "unsigned int" +#define UINT_LEAST64_TYPE "long long unsigned int" + +#define INT_FAST8_TYPE "signed char" +#define INT_FAST16_TYPE "short int" +#define INT_FAST32_TYPE "int" +#define INT_FAST64_TYPE "long long int" +#define UINT_FAST8_TYPE "unsigned char" +#define UINT_FAST16_TYPE "short unsigned int" +#define UINT_FAST32_TYPE "unsigned int" +#define UINT_FAST64_TYPE "long long unsigned int" + +#define INTPTR_TYPE (TARGET_64BIT ? "long long int" : "int") +#define UINTPTR_TYPE (TARGET_64BIT ? "long long unsigned int" : "unsigned int") diff --git a/gcc/config/i386/mingw-w64.h b/gcc/config/i386/mingw-w64.h new file mode 100644 index 000000000..f84434a3f --- /dev/null +++ b/gcc/config/i386/mingw-w64.h @@ -0,0 +1,79 @@ +/* Operating system specific defines to be used when targeting GCC for + hosting on Windows 32/64 via mingw-w64 runtime, using GNU tools and + the Windows API Library. + Copyright (C) 2009, + 2010 Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. */ + +/* Enable -municode feature. */ + +#undef CPP_SPEC +#define CPP_SPEC "%{posix:-D_POSIX_SOURCE} %{mthreads:-D_MT} \ + %{municode:-DUNICODE}" + +#undef STARTFILE_SPEC +#define STARTFILE_SPEC "%{shared|mdll:dllcrt2%O%s} \ + %{!shared:%{!mdll:%{!municode:crt2%O%s}}} \ + %{!shared:%{!mdll:%{municode:crt2u%O%s}}} \ + %{pg:gcrt2%O%s} \ + crtbegin.o%s" + +/* Enable multilib. 
*/ + +#undef ASM_SPEC +#define ASM_SPEC "%{m32:--32} %{m64:--64}" + +#undef SPEC_32 +#undef SPEC_64 +#if TARGET_64BIT_DEFAULT +#define SPEC_32 "m32" +#define SPEC_64 "!m32" +#else +#define SPEC_32 "!m64" +#define SPEC_64 "m64" +#endif + +#undef SUB_LINK_ENTRY32 +#undef SUB_LINK_ENTRY64 +#define SUB_LINK_ENTRY32 "-e _DllMainCRTStartup@12" +#if defined(USE_MINGW64_LEADING_UNDERSCORES) +#define SUB_LINK_ENTRY64 "-e _DllMainCRTStartup" +#else +#define SUB_LINK_ENTRY64 "-e DllMainCRTStartup" +#endif + +#undef SUB_LINK_SPEC +#undef SUB_LINK_ENTRY +#define SUB_LINK_SPEC "%{" SPEC_64 ":-m i386pep} %{" SPEC_32 ":-m i386pe}" +#define SUB_LINK_ENTRY "%{" SPEC_64 ":" SUB_LINK_ENTRY64 "} %{" SPEC_32 ":" SUB_LINK_ENTRY32 "}" + +#undef MULTILIB_DEFAULTS +#if TARGET_64BIT_DEFAULT +#define MULTILIB_DEFAULTS { "m64" } +#else +#define MULTILIB_DEFAULTS { "m32" } +#endif + +#undef LINK_SPEC +#define LINK_SPEC SUB_LINK_SPEC " %{mwindows:--subsystem windows} \ + %{mconsole:--subsystem console} \ + %{shared: %{mdll: %eshared and mdll are not compatible}} \ + %{shared: --shared} %{mdll:--dll} \ + %{static:-Bstatic} %{!static:-Bdynamic} \ + %{shared|mdll: " SUB_LINK_ENTRY " --enable-auto-image-base} \ + %(shared_libgcc_undefs)" diff --git a/gcc/config/i386/mingw-w64.opt b/gcc/config/i386/mingw-w64.opt new file mode 100644 index 000000000..965f4c0ce --- /dev/null +++ b/gcc/config/i386/mingw-w64.opt @@ -0,0 +1,23 @@ +; MinGW-w64-specific options. + +; Copyright (C) 2009 Free Software Foundation, Inc. +; +; This file is part of GCC. +; +; GCC is free software; you can redistribute it and/or modify it under +; the terms of the GNU General Public License as published by the Free +; Software Foundation; either version 3, or (at your option) any later +; version. +; +; GCC is distributed in the hope that it will be useful, but WITHOUT ANY +; WARRANTY; without even the implied warranty of MERCHANTABILITY or +; FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +; for more details. +; +; You should have received a copy of the GNU General Public License +; along with GCC; see the file COPYING3. If not see +; . + +municode +Target +Use unicode startup and define UNICODE macro diff --git a/gcc/config/i386/mingw.opt b/gcc/config/i386/mingw.opt new file mode 100644 index 000000000..bd9a4b630 --- /dev/null +++ b/gcc/config/i386/mingw.opt @@ -0,0 +1,27 @@ +; MinGW-specific options. + +; Copyright (C) 2008 Free Software Foundation, Inc. +; +; This file is part of GCC. +; +; GCC is free software; you can redistribute it and/or modify it under +; the terms of the GNU General Public License as published by the Free +; Software Foundation; either version 3, or (at your option) any later +; version. +; +; GCC is distributed in the hope that it will be useful, but WITHOUT ANY +; WARRANTY; without even the implied warranty of MERCHANTABILITY or +; FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +; for more details. +; +; You should have received a copy of the GNU General Public License +; along with GCC; see the file COPYING3. If not see +; . + +Wpedantic-ms-format +C ObjC C++ ObjC++ Var(warn_pedantic_ms_format) Init(1) Warning +Warn about none ISO msvcrt scanf/printf width extensions + +fset-stack-executable +Common Report Var(flag_setstackexecutable) Init(1) Optimization +For nested functions on stack executable permission is set. 
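
The -fset-stack-executable flag declared just above exists because GNU C nested functions are implemented with trampolines that GCC writes into the enclosing function's stack frame; the Windows stack is not executable by default, so mingw32.h (the next file) routes ENABLE_EXECUTE_STACK to a VirtualProtect-based __enable_execute_stack and gates it on flag_setstackexecutable. A minimal sketch of the kind of code that exercises this path follows; the helper names apply_twice and add_step are illustrative only and do not come from the patch. Build with the MinGW GCC, e.g. gcc -O2 demo.c.

#include <stdio.h>

/* apply_twice is a made-up helper for this sketch: it just calls FN twice.  */
static int
apply_twice (int (*fn) (int), int x)
{
  return fn (fn (x));
}

int
main (void)
{
  int step = 3;

  /* GNU C nested function capturing STEP from main's frame.  Taking its
     address below makes GCC build a trampoline on the stack, which is the
     memory that needs the executable permission controlled by
     -fset-stack-executable.  */
  int add_step (int v) { return v + step; }

  printf ("%d\n", apply_twice (add_step, 10));  /* prints 16 */
  return 0;
}

Because the option is declared with Init(1), the behaviour is on by default; compiling with -fno-set-stack-executable would presumably leave CHECK_EXECUTE_STACK_ENABLED false and skip the VirtualProtect call.
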
diff --git a/gcc/config/i386/mingw32.h b/gcc/config/i386/mingw32.h new file mode 100644 index 000000000..27da92b4e --- /dev/null +++ b/gcc/config/i386/mingw32.h @@ -0,0 +1,247 @@ +/* Operating system specific defines to be used when targeting GCC for + hosting on Windows32, using GNU tools and the Windows32 API Library. + Copyright (C) 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2007, 2008, + 2009, 2010 Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. */ + +#undef TARGET_VERSION +#if TARGET_64BIT_DEFAULT +#define TARGET_VERSION fprintf (stderr,"(x86_64 MinGW"); +#else +#define TARGET_VERSION fprintf (stderr," (x86 MinGW)"); +#endif + +/* See i386/crtdll.h for an alternative definition. _INTEGRAL_MAX_BITS + is for compatibility with native compiler. */ +#define EXTRA_OS_CPP_BUILTINS() \ + do \ + { \ + builtin_define ("__MSVCRT__"); \ + builtin_define ("__MINGW32__"); \ + builtin_define ("_WIN32"); \ + builtin_define_std ("WIN32"); \ + builtin_define_std ("WINNT"); \ + builtin_define_with_int_value ("_INTEGRAL_MAX_BITS", \ + TYPE_PRECISION (intmax_type_node));\ + if (TARGET_64BIT && ix86_abi == MS_ABI) \ + { \ + builtin_define ("__MINGW64__"); \ + builtin_define_std ("WIN64"); \ + builtin_define ("_WIN64"); \ + } \ + } \ + while (0) + +#undef SUB_LINK_ENTRY32 +#undef SUB_LINK_ENTRY64 +#define SUB_LINK_ENTRY32 "-e _DllMainCRTStartup@12" +#if defined(USE_MINGW64_LEADING_UNDERSCORES) +#define SUB_LINK_ENTRY64 "-e _DllMainCRTStartup" +#else +#define SUB_LINK_ENTRY64 "-e DllMainCRTStartup" +#endif + +#undef SUB_LINK_ENTRY +#if TARGET_64BIT_DEFAULT +#define SUB_LINK_ENTRY SUB_LINK_ENTRY64 +#else +#define SUB_LINK_ENTRY SUB_LINK_ENTRY32 +#endif + +/* Override the standard choice of /usr/include as the default prefix + to try when searching for header files. */ +#undef STANDARD_INCLUDE_DIR +#define STANDARD_INCLUDE_DIR "/mingw/include" +#undef STANDARD_INCLUDE_COMPONENT +#define STANDARD_INCLUDE_COMPONENT "MINGW" + +#undef CPP_SPEC +#define CPP_SPEC "%{posix:-D_POSIX_SOURCE} %{mthreads:-D_MT}" + +/* For Windows applications, include more libraries, but always include + kernel32. */ +#undef LIB_SPEC +#define LIB_SPEC "%{pg:-lgmon} %{mwindows:-lgdi32 -lcomdlg32} \ + -ladvapi32 -lshell32 -luser32 -lkernel32" + +/* Weak symbols do not get resolved if using a Windows dll import lib. + Make the unwind registration references strong undefs. */ +#if DWARF2_UNWIND_INFO +/* DW2-unwind is just available for 32-bit mode. */ +#if TARGET_64BIT_DEFAULT +#error DW2 unwind is not available for 64-bit. 
+#endif +#define SHARED_LIBGCC_UNDEFS_SPEC \ + "%{shared-libgcc: -u ___register_frame_info -u ___deregister_frame_info}" +#else +#define SHARED_LIBGCC_UNDEFS_SPEC "" +#endif + +#undef SUBTARGET_EXTRA_SPECS +#define SUBTARGET_EXTRA_SPECS \ + { "shared_libgcc_undefs", SHARED_LIBGCC_UNDEFS_SPEC } + +#define LINK_SPEC "%{mwindows:--subsystem windows} \ + %{mconsole:--subsystem console} \ + %{shared: %{mdll: %eshared and mdll are not compatible}} \ + %{shared: --shared} %{mdll:--dll} \ + %{static:-Bstatic} %{!static:-Bdynamic} \ + %{shared|mdll: " SUB_LINK_ENTRY " --enable-auto-image-base} \ + %(shared_libgcc_undefs)" + +/* Include in the mingw32 libraries with libgcc */ +#ifdef ENABLE_SHARED_LIBGCC +#define SHARED_LIBGCC_SPEC "%{shared-libgcc:-lgcc_s} %{!shared-libgcc:-lgcc_eh}" +#else +#define SHARED_LIBGCC_SPEC /*empty*/ +#endif +#undef REAL_LIBGCC_SPEC +#define REAL_LIBGCC_SPEC \ + "%{mthreads:-lmingwthrd} -lmingw32 \ + "SHARED_LIBGCC_SPEC" \ + -lgcc \ + -lmoldname -lmingwex -lmsvcrt" + +#undef STARTFILE_SPEC +#define STARTFILE_SPEC "%{shared|mdll:dllcrt2%O%s} \ + %{!shared:%{!mdll:crt2%O%s}} %{pg:gcrt2%O%s} \ + crtbegin.o%s" + +#undef ENDFILE_SPEC +#define ENDFILE_SPEC \ + "%{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s} \ + crtend.o%s" + +/* Override startfile prefix defaults. */ +#ifndef STANDARD_STARTFILE_PREFIX_1 +#define STANDARD_STARTFILE_PREFIX_1 "/mingw/lib/" +#endif +#ifndef STANDARD_STARTFILE_PREFIX_2 +#define STANDARD_STARTFILE_PREFIX_2 "" +#endif + +/* Output STRING, a string representing a filename, to FILE. + We canonicalize it to be in Unix format (backslashes are replaced + forward slashes. */ +#undef OUTPUT_QUOTED_STRING +#define OUTPUT_QUOTED_STRING(FILE, STRING) \ +do { \ + char c; \ + \ + putc ('\"', asm_file); \ + \ + while ((c = *string++) != 0) \ + { \ + if (c == '\\') \ + c = '/'; \ + \ + if (ISPRINT (c)) \ + { \ + if (c == '\"') \ + putc ('\\', asm_file); \ + putc (c, asm_file); \ + } \ + else \ + fprintf (asm_file, "\\%03o", (unsigned char) c); \ + } \ + \ + putc ('\"', asm_file); \ +} while (0) + +/* Define as short unsigned for compatibility with MS runtime. */ +#undef WINT_TYPE +#define WINT_TYPE "short unsigned int" + +/* mingw32 uses the -mthreads option to enable thread support. */ +#undef GOMP_SELF_SPECS +#define GOMP_SELF_SPECS "%{fopenmp: -mthreads}" + +/* mingw32 atexit function is safe to use in shared libraries. Use it + to register C++ static destructors. */ +#define TARGET_CXX_USE_ATEXIT_FOR_CXA_ATEXIT hook_bool_void_true + +/* Contains a pointer to type target_ovr_attr defining the target specific + overrides of format attributes. See c-format.h for structure + definition. */ +#undef TARGET_OVERRIDES_FORMAT_ATTRIBUTES +#define TARGET_OVERRIDES_FORMAT_ATTRIBUTES mingw_format_attribute_overrides + +/* Specify the count of elements in TARGET_OVERRIDES_ATTRIBUTE. */ +#undef TARGET_OVERRIDES_FORMAT_ATTRIBUTES_COUNT +#define TARGET_OVERRIDES_FORMAT_ATTRIBUTES_COUNT 3 + +/* Custom initialization for warning -Wpedantic-ms-format for c-format. */ +#undef TARGET_OVERRIDES_FORMAT_INIT +#define TARGET_OVERRIDES_FORMAT_INIT msformat_init + +/* MS specific format attributes for ms_printf, ms_scanf, ms_strftime. */ +#undef TARGET_FORMAT_TYPES +#define TARGET_FORMAT_TYPES mingw_format_attributes + +#undef TARGET_N_FORMAT_TYPES +#define TARGET_N_FORMAT_TYPES 3 + +/* Let defaults.h definition of TARGET_USE_JCR_SECTION apply. 
*/ +#undef TARGET_USE_JCR_SECTION + +#undef MINGW_ENABLE_EXECUTE_STACK +#define MINGW_ENABLE_EXECUTE_STACK \ +extern void __enable_execute_stack (void *); \ +void \ +__enable_execute_stack (void *addr) \ +{ \ + MEMORY_BASIC_INFORMATION b; \ + if (!VirtualQuery (addr, &b, sizeof(b))) \ + abort (); \ + VirtualProtect (b.BaseAddress, b.RegionSize, PAGE_EXECUTE_READWRITE, \ + &b.Protect); \ +} + +#undef ENABLE_EXECUTE_STACK +#define ENABLE_EXECUTE_STACK MINGW_ENABLE_EXECUTE_STACK +#undef CHECK_EXECUTE_STACK_ENABLED +#define CHECK_EXECUTE_STACK_ENABLED flag_setstackexecutable + +#ifdef IN_LIBGCC2 +#include +#endif + +/* For 64-bit Windows we can't use DW2 unwind info. Also for multilib + builds we can't use it, too. */ +#if !TARGET_64BIT_DEFAULT && !defined (TARGET_BI_ARCH) +#define MD_UNWIND_SUPPORT "config/i386/w32-unwind.h" +#endif + +/* This matches SHLIB_SONAME and SHLIB_SOVERSION in t-cygming. */ +/* This matches SHLIB_SONAME and SHLIB_SOVERSION in t-cygwin. */ +#if DWARF2_UNWIND_INFO +#define LIBGCC_EH_EXTN "_dw2" +#else +#define LIBGCC_EH_EXTN "_sjlj" +#endif +#define LIBGCC_SONAME "libgcc_s" LIBGCC_EH_EXTN "-1.dll" + +/* We should find a way to not have to update this manually. */ +#define LIBGCJ_SONAME "libgcj" /*LIBGCC_EH_EXTN*/ "-12.dll" + +/* For 32-bit Windows we need valid frame-pointer for function using + setjmp. */ +#undef SUBTARGET_FRAME_POINTER_REQUIRED +#define SUBTARGET_FRAME_POINTER_REQUIRED \ + (!TARGET_64BIT && cfun->calls_setjmp) + diff --git a/gcc/config/i386/mm3dnow.h b/gcc/config/i386/mm3dnow.h new file mode 100644 index 000000000..0d0735c9a --- /dev/null +++ b/gcc/config/i386/mm3dnow.h @@ -0,0 +1,215 @@ +/* Copyright (C) 2004, 2007, 2008, 2009 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +/* Implemented from the mm3dnow.h (of supposedly AMD origin) included with + MSVC 7.1. 
*/ + +#ifndef _MM3DNOW_H_INCLUDED +#define _MM3DNOW_H_INCLUDED + +#ifdef __3dNOW__ + +#include + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_femms (void) +{ + __builtin_ia32_femms(); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pavgusb (__m64 __A, __m64 __B) +{ + return (__m64)__builtin_ia32_pavgusb ((__v8qi)__A, (__v8qi)__B); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pf2id (__m64 __A) +{ + return (__m64)__builtin_ia32_pf2id ((__v2sf)__A); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pfacc (__m64 __A, __m64 __B) +{ + return (__m64)__builtin_ia32_pfacc ((__v2sf)__A, (__v2sf)__B); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pfadd (__m64 __A, __m64 __B) +{ + return (__m64)__builtin_ia32_pfadd ((__v2sf)__A, (__v2sf)__B); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pfcmpeq (__m64 __A, __m64 __B) +{ + return (__m64)__builtin_ia32_pfcmpeq ((__v2sf)__A, (__v2sf)__B); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pfcmpge (__m64 __A, __m64 __B) +{ + return (__m64)__builtin_ia32_pfcmpge ((__v2sf)__A, (__v2sf)__B); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pfcmpgt (__m64 __A, __m64 __B) +{ + return (__m64)__builtin_ia32_pfcmpgt ((__v2sf)__A, (__v2sf)__B); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pfmax (__m64 __A, __m64 __B) +{ + return (__m64)__builtin_ia32_pfmax ((__v2sf)__A, (__v2sf)__B); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pfmin (__m64 __A, __m64 __B) +{ + return (__m64)__builtin_ia32_pfmin ((__v2sf)__A, (__v2sf)__B); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pfmul (__m64 __A, __m64 __B) +{ + return (__m64)__builtin_ia32_pfmul ((__v2sf)__A, (__v2sf)__B); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pfrcp (__m64 __A) +{ + return (__m64)__builtin_ia32_pfrcp ((__v2sf)__A); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pfrcpit1 (__m64 __A, __m64 __B) +{ + return (__m64)__builtin_ia32_pfrcpit1 ((__v2sf)__A, (__v2sf)__B); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pfrcpit2 (__m64 __A, __m64 __B) +{ + return (__m64)__builtin_ia32_pfrcpit2 ((__v2sf)__A, (__v2sf)__B); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pfrsqrt (__m64 __A) +{ + return (__m64)__builtin_ia32_pfrsqrt ((__v2sf)__A); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pfrsqit1 (__m64 __A, __m64 __B) +{ + return (__m64)__builtin_ia32_pfrsqit1 ((__v2sf)__A, (__v2sf)__B); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pfsub (__m64 __A, __m64 __B) +{ + return (__m64)__builtin_ia32_pfsub ((__v2sf)__A, (__v2sf)__B); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pfsubr (__m64 __A, __m64 __B) +{ + return (__m64)__builtin_ia32_pfsubr ((__v2sf)__A, (__v2sf)__B); +} + +extern __inline 
__m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pi2fd (__m64 __A) +{ + return (__m64)__builtin_ia32_pi2fd ((__v2si)__A); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pmulhrw (__m64 __A, __m64 __B) +{ + return (__m64)__builtin_ia32_pmulhrw ((__v4hi)__A, (__v4hi)__B); +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_prefetch (void *__P) +{ + __builtin_prefetch (__P, 0, 3 /* _MM_HINT_T0 */); +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_prefetchw (void *__P) +{ + __builtin_prefetch (__P, 1, 3 /* _MM_HINT_T0 */); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_from_float (float __A) +{ + return __extension__ (__m64)(__v2sf){ __A, 0.0f }; +} + +extern __inline float __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_to_float (__m64 __A) +{ + union { __v2sf v; float a[2]; } __tmp; + __tmp.v = (__v2sf)__A; + return __tmp.a[0]; +} + +#ifdef __3dNOW_A__ + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pf2iw (__m64 __A) +{ + return (__m64)__builtin_ia32_pf2iw ((__v2sf)__A); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pfnacc (__m64 __A, __m64 __B) +{ + return (__m64)__builtin_ia32_pfnacc ((__v2sf)__A, (__v2sf)__B); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pfpnacc (__m64 __A, __m64 __B) +{ + return (__m64)__builtin_ia32_pfpnacc ((__v2sf)__A, (__v2sf)__B); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pi2fw (__m64 __A) +{ + return (__m64)__builtin_ia32_pi2fw ((__v2si)__A); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pswapd (__m64 __A) +{ + return (__m64)__builtin_ia32_pswapdsf ((__v2sf)__A); +} + +#endif /* __3dNOW_A__ */ +#endif /* __3dNOW__ */ + +#endif /* _MM3DNOW_H_INCLUDED */ diff --git a/gcc/config/i386/mmintrin.h b/gcc/config/i386/mmintrin.h new file mode 100644 index 000000000..497e22edd --- /dev/null +++ b/gcc/config/i386/mmintrin.h @@ -0,0 +1,921 @@ +/* Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009 + Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +/* Implemented from the specification included in the Intel C++ Compiler + User Guide and Reference, version 9.0. 
*/ + +#ifndef _MMINTRIN_H_INCLUDED +#define _MMINTRIN_H_INCLUDED + +#ifndef __MMX__ +# error "MMX instruction set not enabled" +#else +/* The Intel API is flexible enough that we must allow aliasing with other + vector types, and their scalar components. */ +typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__)); + +/* Internal data types for implementing the intrinsics. */ +typedef int __v2si __attribute__ ((__vector_size__ (8))); +typedef short __v4hi __attribute__ ((__vector_size__ (8))); +typedef char __v8qi __attribute__ ((__vector_size__ (8))); +typedef long long __v1di __attribute__ ((__vector_size__ (8))); +typedef float __v2sf __attribute__ ((__vector_size__ (8))); + +/* Empty the multimedia state. */ +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_empty (void) +{ + __builtin_ia32_emms (); +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_empty (void) +{ + _mm_empty (); +} + +/* Convert I to a __m64 object. The integer is zero-extended to 64-bits. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsi32_si64 (int __i) +{ + return (__m64) __builtin_ia32_vec_init_v2si (__i, 0); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_from_int (int __i) +{ + return _mm_cvtsi32_si64 (__i); +} + +#ifdef __x86_64__ +/* Convert I to a __m64 object. */ + +/* Intel intrinsic. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_from_int64 (long long __i) +{ + return (__m64) __i; +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsi64_m64 (long long __i) +{ + return (__m64) __i; +} + +/* Microsoft intrinsic. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsi64x_si64 (long long __i) +{ + return (__m64) __i; +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set_pi64x (long long __i) +{ + return (__m64) __i; +} +#endif + +/* Convert the lower 32 bits of the __m64 object into an integer. */ +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsi64_si32 (__m64 __i) +{ + return __builtin_ia32_vec_ext_v2si ((__v2si)__i, 0); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_to_int (__m64 __i) +{ + return _mm_cvtsi64_si32 (__i); +} + +#ifdef __x86_64__ +/* Convert the __m64 object to a 64bit integer. */ + +/* Intel intrinsic. */ +extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_to_int64 (__m64 __i) +{ + return (long long)__i; +} + +extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtm64_si64 (__m64 __i) +{ + return (long long)__i; +} + +/* Microsoft intrinsic. */ +extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsi64_si64x (__m64 __i) +{ + return (long long)__i; +} +#endif + +/* Pack the four 16-bit values from M1 into the lower four 8-bit values of + the result, and the four 16-bit values from M2 into the upper four 8-bit + values of the result, all with signed saturation. 
*/ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_packs_pi16 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_packsswb ((__v4hi)__m1, (__v4hi)__m2); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_packsswb (__m64 __m1, __m64 __m2) +{ + return _mm_packs_pi16 (__m1, __m2); +} + +/* Pack the two 32-bit values from M1 in to the lower two 16-bit values of + the result, and the two 32-bit values from M2 into the upper two 16-bit + values of the result, all with signed saturation. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_packs_pi32 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_packssdw ((__v2si)__m1, (__v2si)__m2); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_packssdw (__m64 __m1, __m64 __m2) +{ + return _mm_packs_pi32 (__m1, __m2); +} + +/* Pack the four 16-bit values from M1 into the lower four 8-bit values of + the result, and the four 16-bit values from M2 into the upper four 8-bit + values of the result, all with unsigned saturation. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_packs_pu16 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_packuswb ((__v4hi)__m1, (__v4hi)__m2); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_packuswb (__m64 __m1, __m64 __m2) +{ + return _mm_packs_pu16 (__m1, __m2); +} + +/* Interleave the four 8-bit values from the high half of M1 with the four + 8-bit values from the high half of M2. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_unpackhi_pi8 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_punpckhbw ((__v8qi)__m1, (__v8qi)__m2); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_punpckhbw (__m64 __m1, __m64 __m2) +{ + return _mm_unpackhi_pi8 (__m1, __m2); +} + +/* Interleave the two 16-bit values from the high half of M1 with the two + 16-bit values from the high half of M2. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_unpackhi_pi16 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_punpckhwd ((__v4hi)__m1, (__v4hi)__m2); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_punpckhwd (__m64 __m1, __m64 __m2) +{ + return _mm_unpackhi_pi16 (__m1, __m2); +} + +/* Interleave the 32-bit value from the high half of M1 with the 32-bit + value from the high half of M2. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_unpackhi_pi32 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_punpckhdq ((__v2si)__m1, (__v2si)__m2); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_punpckhdq (__m64 __m1, __m64 __m2) +{ + return _mm_unpackhi_pi32 (__m1, __m2); +} + +/* Interleave the four 8-bit values from the low half of M1 with the four + 8-bit values from the low half of M2. 
*/ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_unpacklo_pi8 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_punpcklbw ((__v8qi)__m1, (__v8qi)__m2); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_punpcklbw (__m64 __m1, __m64 __m2) +{ + return _mm_unpacklo_pi8 (__m1, __m2); +} + +/* Interleave the two 16-bit values from the low half of M1 with the two + 16-bit values from the low half of M2. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_unpacklo_pi16 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_punpcklwd ((__v4hi)__m1, (__v4hi)__m2); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_punpcklwd (__m64 __m1, __m64 __m2) +{ + return _mm_unpacklo_pi16 (__m1, __m2); +} + +/* Interleave the 32-bit value from the low half of M1 with the 32-bit + value from the low half of M2. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_unpacklo_pi32 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_punpckldq ((__v2si)__m1, (__v2si)__m2); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_punpckldq (__m64 __m1, __m64 __m2) +{ + return _mm_unpacklo_pi32 (__m1, __m2); +} + +/* Add the 8-bit values in M1 to the 8-bit values in M2. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_add_pi8 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_paddb ((__v8qi)__m1, (__v8qi)__m2); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_paddb (__m64 __m1, __m64 __m2) +{ + return _mm_add_pi8 (__m1, __m2); +} + +/* Add the 16-bit values in M1 to the 16-bit values in M2. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_add_pi16 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_paddw ((__v4hi)__m1, (__v4hi)__m2); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_paddw (__m64 __m1, __m64 __m2) +{ + return _mm_add_pi16 (__m1, __m2); +} + +/* Add the 32-bit values in M1 to the 32-bit values in M2. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_add_pi32 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_paddd ((__v2si)__m1, (__v2si)__m2); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_paddd (__m64 __m1, __m64 __m2) +{ + return _mm_add_pi32 (__m1, __m2); +} + +/* Add the 64-bit values in M1 to the 64-bit values in M2. */ +#ifdef __SSE2__ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_add_si64 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_paddq ((__v1di)__m1, (__v1di)__m2); +} +#endif + +/* Add the 8-bit values in M1 to the 8-bit values in M2 using signed + saturated arithmetic. 
*/ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_adds_pi8 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_paddsb ((__v8qi)__m1, (__v8qi)__m2); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_paddsb (__m64 __m1, __m64 __m2) +{ + return _mm_adds_pi8 (__m1, __m2); +} + +/* Add the 16-bit values in M1 to the 16-bit values in M2 using signed + saturated arithmetic. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_adds_pi16 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_paddsw ((__v4hi)__m1, (__v4hi)__m2); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_paddsw (__m64 __m1, __m64 __m2) +{ + return _mm_adds_pi16 (__m1, __m2); +} + +/* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned + saturated arithmetic. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_adds_pu8 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_paddusb ((__v8qi)__m1, (__v8qi)__m2); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_paddusb (__m64 __m1, __m64 __m2) +{ + return _mm_adds_pu8 (__m1, __m2); +} + +/* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned + saturated arithmetic. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_adds_pu16 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_paddusw ((__v4hi)__m1, (__v4hi)__m2); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_paddusw (__m64 __m1, __m64 __m2) +{ + return _mm_adds_pu16 (__m1, __m2); +} + +/* Subtract the 8-bit values in M2 from the 8-bit values in M1. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sub_pi8 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_psubb ((__v8qi)__m1, (__v8qi)__m2); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_psubb (__m64 __m1, __m64 __m2) +{ + return _mm_sub_pi8 (__m1, __m2); +} + +/* Subtract the 16-bit values in M2 from the 16-bit values in M1. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sub_pi16 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_psubw ((__v4hi)__m1, (__v4hi)__m2); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_psubw (__m64 __m1, __m64 __m2) +{ + return _mm_sub_pi16 (__m1, __m2); +} + +/* Subtract the 32-bit values in M2 from the 32-bit values in M1. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sub_pi32 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_psubd ((__v2si)__m1, (__v2si)__m2); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_psubd (__m64 __m1, __m64 __m2) +{ + return _mm_sub_pi32 (__m1, __m2); +} + +/* Add the 64-bit values in M1 to the 64-bit values in M2. */ +#ifdef __SSE2__ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sub_si64 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_psubq ((__v1di)__m1, (__v1di)__m2); +} +#endif + +/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed + saturating arithmetic. 
*/ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_subs_pi8 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_psubsb ((__v8qi)__m1, (__v8qi)__m2); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_psubsb (__m64 __m1, __m64 __m2) +{ + return _mm_subs_pi8 (__m1, __m2); +} + +/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using + signed saturating arithmetic. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_subs_pi16 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_psubsw ((__v4hi)__m1, (__v4hi)__m2); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_psubsw (__m64 __m1, __m64 __m2) +{ + return _mm_subs_pi16 (__m1, __m2); +} + +/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using + unsigned saturating arithmetic. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_subs_pu8 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_psubusb ((__v8qi)__m1, (__v8qi)__m2); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_psubusb (__m64 __m1, __m64 __m2) +{ + return _mm_subs_pu8 (__m1, __m2); +} + +/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using + unsigned saturating arithmetic. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_subs_pu16 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_psubusw ((__v4hi)__m1, (__v4hi)__m2); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_psubusw (__m64 __m1, __m64 __m2) +{ + return _mm_subs_pu16 (__m1, __m2); +} + +/* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing + four 32-bit intermediate results, which are then summed by pairs to + produce two 32-bit results. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_madd_pi16 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_pmaddwd ((__v4hi)__m1, (__v4hi)__m2); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pmaddwd (__m64 __m1, __m64 __m2) +{ + return _mm_madd_pi16 (__m1, __m2); +} + +/* Multiply four signed 16-bit values in M1 by four signed 16-bit values in + M2 and produce the high 16 bits of the 32-bit results. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mulhi_pi16 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_pmulhw ((__v4hi)__m1, (__v4hi)__m2); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pmulhw (__m64 __m1, __m64 __m2) +{ + return _mm_mulhi_pi16 (__m1, __m2); +} + +/* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce + the low 16 bits of the results. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mullo_pi16 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_pmullw ((__v4hi)__m1, (__v4hi)__m2); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pmullw (__m64 __m1, __m64 __m2) +{ + return _mm_mullo_pi16 (__m1, __m2); +} + +/* Shift four 16-bit values in M left by COUNT. 
*/ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sll_pi16 (__m64 __m, __m64 __count) +{ + return (__m64) __builtin_ia32_psllw ((__v4hi)__m, (__v4hi)__count); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_psllw (__m64 __m, __m64 __count) +{ + return _mm_sll_pi16 (__m, __count); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_slli_pi16 (__m64 __m, int __count) +{ + return (__m64) __builtin_ia32_psllwi ((__v4hi)__m, __count); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_psllwi (__m64 __m, int __count) +{ + return _mm_slli_pi16 (__m, __count); +} + +/* Shift two 32-bit values in M left by COUNT. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sll_pi32 (__m64 __m, __m64 __count) +{ + return (__m64) __builtin_ia32_pslld ((__v2si)__m, (__v2si)__count); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pslld (__m64 __m, __m64 __count) +{ + return _mm_sll_pi32 (__m, __count); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_slli_pi32 (__m64 __m, int __count) +{ + return (__m64) __builtin_ia32_pslldi ((__v2si)__m, __count); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pslldi (__m64 __m, int __count) +{ + return _mm_slli_pi32 (__m, __count); +} + +/* Shift the 64-bit value in M left by COUNT. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sll_si64 (__m64 __m, __m64 __count) +{ + return (__m64) __builtin_ia32_psllq ((__v1di)__m, (__v1di)__count); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_psllq (__m64 __m, __m64 __count) +{ + return _mm_sll_si64 (__m, __count); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_slli_si64 (__m64 __m, int __count) +{ + return (__m64) __builtin_ia32_psllqi ((__v1di)__m, __count); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_psllqi (__m64 __m, int __count) +{ + return _mm_slli_si64 (__m, __count); +} + +/* Shift four 16-bit values in M right by COUNT; shift in the sign bit. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sra_pi16 (__m64 __m, __m64 __count) +{ + return (__m64) __builtin_ia32_psraw ((__v4hi)__m, (__v4hi)__count); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_psraw (__m64 __m, __m64 __count) +{ + return _mm_sra_pi16 (__m, __count); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_srai_pi16 (__m64 __m, int __count) +{ + return (__m64) __builtin_ia32_psrawi ((__v4hi)__m, __count); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_psrawi (__m64 __m, int __count) +{ + return _mm_srai_pi16 (__m, __count); +} + +/* Shift two 32-bit values in M right by COUNT; shift in the sign bit. 
*/ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sra_pi32 (__m64 __m, __m64 __count) +{ + return (__m64) __builtin_ia32_psrad ((__v2si)__m, (__v2si)__count); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_psrad (__m64 __m, __m64 __count) +{ + return _mm_sra_pi32 (__m, __count); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_srai_pi32 (__m64 __m, int __count) +{ + return (__m64) __builtin_ia32_psradi ((__v2si)__m, __count); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_psradi (__m64 __m, int __count) +{ + return _mm_srai_pi32 (__m, __count); +} + +/* Shift four 16-bit values in M right by COUNT; shift in zeros. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_srl_pi16 (__m64 __m, __m64 __count) +{ + return (__m64) __builtin_ia32_psrlw ((__v4hi)__m, (__v4hi)__count); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_psrlw (__m64 __m, __m64 __count) +{ + return _mm_srl_pi16 (__m, __count); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_srli_pi16 (__m64 __m, int __count) +{ + return (__m64) __builtin_ia32_psrlwi ((__v4hi)__m, __count); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_psrlwi (__m64 __m, int __count) +{ + return _mm_srli_pi16 (__m, __count); +} + +/* Shift two 32-bit values in M right by COUNT; shift in zeros. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_srl_pi32 (__m64 __m, __m64 __count) +{ + return (__m64) __builtin_ia32_psrld ((__v2si)__m, (__v2si)__count); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_psrld (__m64 __m, __m64 __count) +{ + return _mm_srl_pi32 (__m, __count); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_srli_pi32 (__m64 __m, int __count) +{ + return (__m64) __builtin_ia32_psrldi ((__v2si)__m, __count); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_psrldi (__m64 __m, int __count) +{ + return _mm_srli_pi32 (__m, __count); +} + +/* Shift the 64-bit value in M left by COUNT; shift in zeros. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_srl_si64 (__m64 __m, __m64 __count) +{ + return (__m64) __builtin_ia32_psrlq ((__v1di)__m, (__v1di)__count); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_psrlq (__m64 __m, __m64 __count) +{ + return _mm_srl_si64 (__m, __count); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_srli_si64 (__m64 __m, int __count) +{ + return (__m64) __builtin_ia32_psrlqi ((__v1di)__m, __count); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_psrlqi (__m64 __m, int __count) +{ + return _mm_srli_si64 (__m, __count); +} + +/* Bit-wise AND the 64-bit values in M1 and M2. 
*/ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_and_si64 (__m64 __m1, __m64 __m2) +{ + return __builtin_ia32_pand (__m1, __m2); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pand (__m64 __m1, __m64 __m2) +{ + return _mm_and_si64 (__m1, __m2); +} + +/* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the + 64-bit value in M2. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_andnot_si64 (__m64 __m1, __m64 __m2) +{ + return __builtin_ia32_pandn (__m1, __m2); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pandn (__m64 __m1, __m64 __m2) +{ + return _mm_andnot_si64 (__m1, __m2); +} + +/* Bit-wise inclusive OR the 64-bit values in M1 and M2. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_or_si64 (__m64 __m1, __m64 __m2) +{ + return __builtin_ia32_por (__m1, __m2); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_por (__m64 __m1, __m64 __m2) +{ + return _mm_or_si64 (__m1, __m2); +} + +/* Bit-wise exclusive OR the 64-bit values in M1 and M2. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_xor_si64 (__m64 __m1, __m64 __m2) +{ + return __builtin_ia32_pxor (__m1, __m2); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pxor (__m64 __m1, __m64 __m2) +{ + return _mm_xor_si64 (__m1, __m2); +} + +/* Compare eight 8-bit values. The result of the comparison is 0xFF if the + test is true and zero if false. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpeq_pi8 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_pcmpeqb ((__v8qi)__m1, (__v8qi)__m2); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pcmpeqb (__m64 __m1, __m64 __m2) +{ + return _mm_cmpeq_pi8 (__m1, __m2); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpgt_pi8 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_pcmpgtb ((__v8qi)__m1, (__v8qi)__m2); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pcmpgtb (__m64 __m1, __m64 __m2) +{ + return _mm_cmpgt_pi8 (__m1, __m2); +} + +/* Compare four 16-bit values. The result of the comparison is 0xFFFF if + the test is true and zero if false. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpeq_pi16 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_pcmpeqw ((__v4hi)__m1, (__v4hi)__m2); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pcmpeqw (__m64 __m1, __m64 __m2) +{ + return _mm_cmpeq_pi16 (__m1, __m2); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpgt_pi16 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_pcmpgtw ((__v4hi)__m1, (__v4hi)__m2); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pcmpgtw (__m64 __m1, __m64 __m2) +{ + return _mm_cmpgt_pi16 (__m1, __m2); +} + +/* Compare two 32-bit values. The result of the comparison is 0xFFFFFFFF if + the test is true and zero if false. 
*/ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpeq_pi32 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_pcmpeqd ((__v2si)__m1, (__v2si)__m2); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pcmpeqd (__m64 __m1, __m64 __m2) +{ + return _mm_cmpeq_pi32 (__m1, __m2); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpgt_pi32 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_pcmpgtd ((__v2si)__m1, (__v2si)__m2); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pcmpgtd (__m64 __m1, __m64 __m2) +{ + return _mm_cmpgt_pi32 (__m1, __m2); +} + +/* Creates a 64-bit zero. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_setzero_si64 (void) +{ + return (__m64)0LL; +} + +/* Creates a vector of two 32-bit values; I0 is least significant. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set_pi32 (int __i1, int __i0) +{ + return (__m64) __builtin_ia32_vec_init_v2si (__i0, __i1); +} + +/* Creates a vector of four 16-bit values; W0 is least significant. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set_pi16 (short __w3, short __w2, short __w1, short __w0) +{ + return (__m64) __builtin_ia32_vec_init_v4hi (__w0, __w1, __w2, __w3); +} + +/* Creates a vector of eight 8-bit values; B0 is least significant. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set_pi8 (char __b7, char __b6, char __b5, char __b4, + char __b3, char __b2, char __b1, char __b0) +{ + return (__m64) __builtin_ia32_vec_init_v8qi (__b0, __b1, __b2, __b3, + __b4, __b5, __b6, __b7); +} + +/* Similar, but with the arguments in reverse order. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_setr_pi32 (int __i0, int __i1) +{ + return _mm_set_pi32 (__i1, __i0); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_setr_pi16 (short __w0, short __w1, short __w2, short __w3) +{ + return _mm_set_pi16 (__w3, __w2, __w1, __w0); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_setr_pi8 (char __b0, char __b1, char __b2, char __b3, + char __b4, char __b5, char __b6, char __b7) +{ + return _mm_set_pi8 (__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0); +} + +/* Creates a vector of two 32-bit values, both elements containing I. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set1_pi32 (int __i) +{ + return _mm_set_pi32 (__i, __i); +} + +/* Creates a vector of four 16-bit values, all elements containing W. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set1_pi16 (short __w) +{ + return _mm_set_pi16 (__w, __w, __w, __w); +} + +/* Creates a vector of eight 8-bit values, all elements containing B. 
*/ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set1_pi8 (char __b) +{ + return _mm_set_pi8 (__b, __b, __b, __b, __b, __b, __b, __b); +} + +#endif /* __MMX__ */ +#endif /* _MMINTRIN_H_INCLUDED */ diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md new file mode 100644 index 000000000..ca3762286 --- /dev/null +++ b/gcc/config/i386/mmx.md @@ -0,0 +1,1716 @@ +;; GCC machine description for MMX and 3dNOW! instructions +;; Copyright (C) 2005, 2007, 2008, 2009, 2010 +;; Free Software Foundation, Inc. +;; +;; This file is part of GCC. +;; +;; GCC is free software; you can redistribute it and/or modify +;; it under the terms of the GNU General Public License as published by +;; the Free Software Foundation; either version 3, or (at your option) +;; any later version. +;; +;; GCC is distributed in the hope that it will be useful, +;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;; GNU General Public License for more details. +;; +;; You should have received a copy of the GNU General Public License +;; along with GCC; see the file COPYING3. If not see +;; . + +;; The MMX and 3dNOW! patterns are in the same file because they use +;; the same register file, and 3dNOW! adds a number of extensions to +;; the base integer MMX isa. + +;; Note! Except for the basic move instructions, *all* of these +;; patterns are outside the normal optabs namespace. This is because +;; use of these registers requires the insertion of emms or femms +;; instructions to return to normal fpu mode. The compiler doesn't +;; know how to do that itself, which means it's up to the user. Which +;; means that we should never use any of these patterns except at the +;; direction of the user via a builtin. + +;; 8 byte integral modes handled by MMX (and by extension, SSE) +(define_mode_iterator MMXMODEI [V8QI V4HI V2SI]) +(define_mode_iterator MMXMODEI8 [V8QI V4HI V2SI V1DI]) + +;; All 8-byte vector modes handled by MMX +(define_mode_iterator MMXMODE [V8QI V4HI V2SI V1DI V2SF]) + +;; Mix-n-match +(define_mode_iterator MMXMODE12 [V8QI V4HI]) +(define_mode_iterator MMXMODE24 [V4HI V2SI]) +(define_mode_iterator MMXMODE248 [V4HI V2SI V1DI]) + +;; Mapping from integer vector mode to mnemonic suffix +(define_mode_attr mmxvecsize [(V8QI "b") (V4HI "w") (V2SI "d") (V1DI "q")]) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; +;; Move patterns +;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; All of these patterns are enabled for MMX as well as 3dNOW. +;; This is essential for maintaining stable calling conventions. + +(define_expand "mov" + [(set (match_operand:MMXMODEI8 0 "nonimmediate_operand" "") + (match_operand:MMXMODEI8 1 "nonimmediate_operand" ""))] + "TARGET_MMX" +{ + ix86_expand_vector_move (mode, operands); + DONE; +}) + +;; movd instead of movq is required to handle broken assemblers. 
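
As the note above stresses, these patterns are only reached through the builtins behind mmintrin.h (earlier in this patch), and it is the caller's job to leave MMX mode with emms before touching x87 floating point again. A short usage sketch under those assumptions; the file name and the result-extraction union are illustrative, everything else (_mm_set_pi16, _mm_adds_pi16, _mm_empty) is defined in the header above. Build with something like gcc -mmmx demo.c.

#include <mmintrin.h>
#include <stdio.h>

int
main (void)
{
  /* Element 0 of _mm_set_pi16 is the least significant lane.  */
  __m64 a = _mm_set_pi16 (4, 3, 2, 32767);
  __m64 b = _mm_set_pi16 (1, 1, 1, 10);

  /* Signed saturating add: lane 0 is 32767 + 10, which saturates to 32767.  */
  union { __m64 v; short s[4]; } r;
  r.v = _mm_adds_pi16 (a, b);

  /* Leave MMX mode (emms) before any code that might use x87 floating
     point -- as noted above, the compiler will not insert this for us.  */
  _mm_empty ();

  printf ("%d %d %d %d\n", r.s[0], r.s[1], r.s[2], r.s[3]);  /* 32767 3 4 5 */
  return 0;
}

Reading the lanes back through a union relies on __m64 carrying the __may_alias__ attribute, which mmintrin.h above applies precisely so that such scalar access is legal.
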
+(define_insn "*mov_internal_rex64" + [(set (match_operand:MMXMODEI8 0 "nonimmediate_operand" + "=rm,r,!?y,!y,!?y,m ,!y ,*Y2,x,x ,m,r ,Yi") + (match_operand:MMXMODEI8 1 "vector_move_operand" + "Cr ,m,C ,!y,m ,!?y,*Y2,!y ,C,xm,x,Yi,r"))] + "TARGET_64BIT && TARGET_MMX + && !(MEM_P (operands[0]) && MEM_P (operands[1]))" + "@ + mov{q}\t{%1, %0|%0, %1} + mov{q}\t{%1, %0|%0, %1} + pxor\t%0, %0 + movq\t{%1, %0|%0, %1} + movq\t{%1, %0|%0, %1} + movq\t{%1, %0|%0, %1} + movdq2q\t{%1, %0|%0, %1} + movq2dq\t{%1, %0|%0, %1} + %vpxor\t%0, %d0 + %vmovq\t{%1, %0|%0, %1} + %vmovq\t{%1, %0|%0, %1} + %vmovd\t{%1, %0|%0, %1} + %vmovd\t{%1, %0|%0, %1}" + [(set_attr "type" "imov,imov,mmx,mmxmov,mmxmov,mmxmov,ssecvt,ssecvt,sselog1,ssemov,ssemov,ssemov,ssemov") + (set_attr "unit" "*,*,*,*,*,*,mmx,mmx,*,*,*,*,*") + (set_attr "prefix_rep" "*,*,*,*,*,*,1,1,*,1,*,*,*") + (set_attr "prefix_data16" "*,*,*,*,*,*,*,*,*,*,1,1,1") + (set (attr "prefix_rex") + (if_then_else (eq_attr "alternative" "9,10") + (symbol_ref "x86_extended_reg_mentioned_p (insn)") + (const_string "*"))) + (set (attr "prefix") + (if_then_else (eq_attr "alternative" "8,9,10,11,12") + (const_string "maybe_vex") + (const_string "orig"))) + (set_attr "mode" "DI")]) + +(define_insn "*mov_internal_avx" + [(set (match_operand:MMXMODEI8 0 "nonimmediate_operand" + "=!?y,!y,!?y,m ,!y ,*Y2,*Y2,*Y2 ,m ,r ,m") + (match_operand:MMXMODEI8 1 "vector_move_operand" + "C ,!y,m ,!?y,*Y2,!y ,C ,*Y2m,*Y2,irm,r"))] + "TARGET_AVX + && !(MEM_P (operands[0]) && MEM_P (operands[1]))" + "@ + pxor\t%0, %0 + movq\t{%1, %0|%0, %1} + movq\t{%1, %0|%0, %1} + movq\t{%1, %0|%0, %1} + movdq2q\t{%1, %0|%0, %1} + movq2dq\t{%1, %0|%0, %1} + vpxor\t%0, %0, %0 + vmovq\t{%1, %0|%0, %1} + vmovq\t{%1, %0|%0, %1} + # + #" + [(set_attr "type" "mmx,mmxmov,mmxmov,mmxmov,ssecvt,ssecvt,sselog1,ssemov,ssemov,*,*") + (set_attr "unit" "*,*,*,*,mmx,mmx,*,*,*,*,*") + (set_attr "prefix_rep" "*,*,*,*,1,1,*,*,*,*,*") + (set (attr "prefix") + (if_then_else (eq_attr "alternative" "6,7,8") + (const_string "vex") + (const_string "orig"))) + (set_attr "mode" "DI,DI,DI,DI,DI,DI,TI,DI,DI,DI,DI")]) + +(define_insn "*mov_internal" + [(set (match_operand:MMXMODEI8 0 "nonimmediate_operand" + "=!?y,!y,!?y,m ,!y ,*Y2,*Y2,*Y2 ,m ,*x,*x,*x,m ,r ,m") + (match_operand:MMXMODEI8 1 "vector_move_operand" + "C ,!y,m ,!?y,*Y2,!y ,C ,*Y2m,*Y2,C ,*x,m ,*x,irm,r"))] + "TARGET_MMX + && !(MEM_P (operands[0]) && MEM_P (operands[1]))" + "@ + pxor\t%0, %0 + movq\t{%1, %0|%0, %1} + movq\t{%1, %0|%0, %1} + movq\t{%1, %0|%0, %1} + movdq2q\t{%1, %0|%0, %1} + movq2dq\t{%1, %0|%0, %1} + pxor\t%0, %0 + movq\t{%1, %0|%0, %1} + movq\t{%1, %0|%0, %1} + xorps\t%0, %0 + movaps\t{%1, %0|%0, %1} + movlps\t{%1, %0|%0, %1} + movlps\t{%1, %0|%0, %1} + # + #" + [(set_attr "type" "mmx,mmxmov,mmxmov,mmxmov,ssecvt,ssecvt,sselog1,ssemov,ssemov,sselog1,ssemov,ssemov,ssemov,*,*") + (set_attr "unit" "*,*,*,*,mmx,mmx,*,*,*,*,*,*,*,*,*") + (set_attr "prefix_rep" "*,*,*,*,1,1,*,1,*,*,*,*,*,*,*") + (set_attr "prefix_data16" "*,*,*,*,*,*,*,*,1,*,*,*,*,*,*") + (set_attr "mode" "DI,DI,DI,DI,DI,DI,TI,DI,DI,V4SF,V4SF,V2SF,V2SF,DI,DI")]) + +(define_expand "movv2sf" + [(set (match_operand:V2SF 0 "nonimmediate_operand" "") + (match_operand:V2SF 1 "nonimmediate_operand" ""))] + "TARGET_MMX" +{ + ix86_expand_vector_move (V2SFmode, operands); + DONE; +}) + +(define_insn "*movv2sf_internal_rex64_avx" + [(set (match_operand:V2SF 0 "nonimmediate_operand" + "=rm,r,!?y,!y,!?y,m ,!y,Y2,x,x,x,m,r,x") + (match_operand:V2SF 1 "vector_move_operand" + "Cr ,m,C ,!y,m 
,!?y,Y2,!y,C,x,m,x,x,r"))] + "TARGET_64BIT && TARGET_AVX + && !(MEM_P (operands[0]) && MEM_P (operands[1]))" + "@ + mov{q}\t{%1, %0|%0, %1} + mov{q}\t{%1, %0|%0, %1} + pxor\t%0, %0 + movq\t{%1, %0|%0, %1} + movq\t{%1, %0|%0, %1} + movq\t{%1, %0|%0, %1} + movdq2q\t{%1, %0|%0, %1} + movq2dq\t{%1, %0|%0, %1} + vxorps\t%0, %0, %0 + vmovaps\t{%1, %0|%0, %1} + vmovlps\t{%1, %0, %0|%0, %0, %1} + vmovlps\t{%1, %0|%0, %1} + vmovq\t{%1, %0|%0, %1} + vmovq\t{%1, %0|%0, %1}" + [(set_attr "type" "imov,imov,mmx,mmxmov,mmxmov,mmxmov,ssecvt,ssecvt,ssemov,sselog1,ssemov,ssemov,ssemov,ssemov") + (set_attr "unit" "*,*,*,*,*,*,mmx,mmx,*,*,*,*,*,*") + (set_attr "prefix_rep" "*,*,*,*,*,*,1,1,*,*,*,*,*,*") + (set_attr "length_vex" "*,*,*,*,*,*,*,*,*,*,*,*,4,4") + (set (attr "prefix") + (if_then_else (eq_attr "alternative" "8,9,10,11,12,13") + (const_string "vex") + (const_string "orig"))) + (set_attr "mode" "DI,DI,DI,DI,DI,DI,DI,DI,V4SF,V4SF,V2SF,V2SF,DI,DI")]) + +;; movd instead of movq is required to handle broken assemblers. +(define_insn "*movv2sf_internal_rex64" + [(set (match_operand:V2SF 0 "nonimmediate_operand" + "=rm,r,!?y,!y,!?y,m ,!y ,*Y2,x,x,x,m,r ,Yi") + (match_operand:V2SF 1 "vector_move_operand" + "Cr ,m,C ,!y,m ,!?y,*Y2,!y ,C,x,m,x,Yi,r"))] + "TARGET_64BIT && TARGET_MMX + && !(MEM_P (operands[0]) && MEM_P (operands[1]))" + "@ + mov{q}\t{%1, %0|%0, %1} + mov{q}\t{%1, %0|%0, %1} + pxor\t%0, %0 + movq\t{%1, %0|%0, %1} + movq\t{%1, %0|%0, %1} + movq\t{%1, %0|%0, %1} + movdq2q\t{%1, %0|%0, %1} + movq2dq\t{%1, %0|%0, %1} + xorps\t%0, %0 + movaps\t{%1, %0|%0, %1} + movlps\t{%1, %0|%0, %1} + movlps\t{%1, %0|%0, %1} + movd\t{%1, %0|%0, %1} + movd\t{%1, %0|%0, %1}" + [(set_attr "type" "imov,imov,mmx,mmxmov,mmxmov,mmxmov,ssecvt,ssecvt,ssemov,sselog1,ssemov,ssemov,ssemov,ssemov") + (set_attr "unit" "*,*,*,*,*,*,mmx,mmx,*,*,*,*,*,*") + (set_attr "prefix_rep" "*,*,*,*,*,*,1,1,*,*,*,*,*,*") + (set_attr "mode" "DI,DI,DI,DI,DI,DI,DI,DI,V4SF,V4SF,V2SF,V2SF,DI,DI")]) + +(define_insn "*movv2sf_internal_avx" + [(set (match_operand:V2SF 0 "nonimmediate_operand" + "=!?y,!y,!?y,m ,!y ,*Y2,*x,*x,*x,m ,r ,m") + (match_operand:V2SF 1 "vector_move_operand" + "C ,!y,m ,!?y,*Y2,!y ,C ,*x,m ,*x,irm,r"))] + "TARGET_AVX + && !(MEM_P (operands[0]) && MEM_P (operands[1]))" + "@ + pxor\t%0, %0 + movq\t{%1, %0|%0, %1} + movq\t{%1, %0|%0, %1} + movq\t{%1, %0|%0, %1} + movdq2q\t{%1, %0|%0, %1} + movq2dq\t{%1, %0|%0, %1} + vxorps\t%0, %0, %0 + vmovaps\t{%1, %0|%0, %1} + vmovlps\t{%1, %0, %0|%0, %0, %1} + vmovlps\t{%1, %0|%0, %1} + # + #" + [(set_attr "type" "mmx,mmxmov,mmxmov,mmxmov,ssecvt,ssecvt,sselog1,ssemov,ssemov,ssemov,*,*") + (set_attr "unit" "*,*,*,*,mmx,mmx,*,*,*,*,*,*") + (set_attr "prefix_rep" "*,*,*,*,1,1,*,*,*,*,*,*") + (set (attr "prefix") + (if_then_else (eq_attr "alternative" "6,7,8,9") + (const_string "vex") + (const_string "orig"))) + (set_attr "mode" "DI,DI,DI,DI,DI,DI,V4SF,V4SF,V2SF,V2SF,DI,DI")]) + +(define_insn "*movv2sf_internal" + [(set (match_operand:V2SF 0 "nonimmediate_operand" + "=!?y,!y,!?y,m ,!y ,*Y2,*x,*x,*x,m ,r ,m") + (match_operand:V2SF 1 "vector_move_operand" + "C ,!y,m ,!?y,*Y2,!y ,C ,*x,m ,*x,irm,r"))] + "TARGET_MMX + && !(MEM_P (operands[0]) && MEM_P (operands[1]))" + "@ + pxor\t%0, %0 + movq\t{%1, %0|%0, %1} + movq\t{%1, %0|%0, %1} + movq\t{%1, %0|%0, %1} + movdq2q\t{%1, %0|%0, %1} + movq2dq\t{%1, %0|%0, %1} + xorps\t%0, %0 + movaps\t{%1, %0|%0, %1} + movlps\t{%1, %0|%0, %1} + movlps\t{%1, %0|%0, %1} + # + #" + [(set_attr "type" 
"mmx,mmxmov,mmxmov,mmxmov,ssecvt,ssecvt,sselog1,ssemov,ssemov,ssemov,*,*") + (set_attr "unit" "*,*,*,*,mmx,mmx,*,*,*,*,*,*") + (set_attr "prefix_rep" "*,*,*,*,1,1,*,*,*,*,*,*") + (set_attr "mode" "DI,DI,DI,DI,DI,DI,V4SF,V4SF,V2SF,V2SF,DI,DI")]) + +;; %%% This multiword shite has got to go. +(define_split + [(set (match_operand:MMXMODE 0 "nonimmediate_operand" "") + (match_operand:MMXMODE 1 "general_operand" ""))] + "!TARGET_64BIT && reload_completed + && (!MMX_REG_P (operands[0]) && !SSE_REG_P (operands[0])) + && (!MMX_REG_P (operands[1]) && !SSE_REG_P (operands[1]))" + [(const_int 0)] + "ix86_split_long_move (operands); DONE;") + +(define_expand "push1" + [(match_operand:MMXMODE 0 "register_operand" "")] + "TARGET_MMX" +{ + ix86_expand_push (mode, operands[0]); + DONE; +}) + +(define_expand "movmisalign" + [(set (match_operand:MMXMODE 0 "nonimmediate_operand" "") + (match_operand:MMXMODE 1 "nonimmediate_operand" ""))] + "TARGET_MMX" +{ + ix86_expand_vector_move (mode, operands); + DONE; +}) + +(define_insn "sse_movntdi" + [(set (match_operand:DI 0 "memory_operand" "=m") + (unspec:DI [(match_operand:DI 1 "register_operand" "y")] + UNSPEC_MOVNT))] + "TARGET_SSE || TARGET_3DNOW_A" + "movntq\t{%1, %0|%0, %1}" + [(set_attr "type" "mmxmov") + (set_attr "mode" "DI")]) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; +;; Parallel single-precision floating point arithmetic +;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(define_expand "mmx_addv2sf3" + [(set (match_operand:V2SF 0 "register_operand" "") + (plus:V2SF + (match_operand:V2SF 1 "nonimmediate_operand" "") + (match_operand:V2SF 2 "nonimmediate_operand" "")))] + "TARGET_3DNOW" + "ix86_fixup_binary_operands_no_copy (PLUS, V2SFmode, operands);") + +(define_insn "*mmx_addv2sf3" + [(set (match_operand:V2SF 0 "register_operand" "=y") + (plus:V2SF (match_operand:V2SF 1 "nonimmediate_operand" "%0") + (match_operand:V2SF 2 "nonimmediate_operand" "ym")))] + "TARGET_3DNOW && ix86_binary_operator_ok (PLUS, V2SFmode, operands)" + "pfadd\t{%2, %0|%0, %2}" + [(set_attr "type" "mmxadd") + (set_attr "prefix_extra" "1") + (set_attr "mode" "V2SF")]) + +(define_expand "mmx_subv2sf3" + [(set (match_operand:V2SF 0 "register_operand" "") + (minus:V2SF (match_operand:V2SF 1 "register_operand" "") + (match_operand:V2SF 2 "nonimmediate_operand" "")))] + "TARGET_3DNOW") + +(define_expand "mmx_subrv2sf3" + [(set (match_operand:V2SF 0 "register_operand" "") + (minus:V2SF (match_operand:V2SF 2 "register_operand" "") + (match_operand:V2SF 1 "nonimmediate_operand" "")))] + "TARGET_3DNOW") + +(define_insn "*mmx_subv2sf3" + [(set (match_operand:V2SF 0 "register_operand" "=y,y") + (minus:V2SF (match_operand:V2SF 1 "nonimmediate_operand" "0,ym") + (match_operand:V2SF 2 "nonimmediate_operand" "ym,0")))] + "TARGET_3DNOW && !(MEM_P (operands[0]) && MEM_P (operands[1]))" + "@ + pfsub\t{%2, %0|%0, %2} + pfsubr\t{%1, %0|%0, %1}" + [(set_attr "type" "mmxadd") + (set_attr "prefix_extra" "1") + (set_attr "mode" "V2SF")]) + +(define_expand "mmx_mulv2sf3" + [(set (match_operand:V2SF 0 "register_operand" "") + (mult:V2SF (match_operand:V2SF 1 "nonimmediate_operand" "") + (match_operand:V2SF 2 "nonimmediate_operand" "")))] + "TARGET_3DNOW" + "ix86_fixup_binary_operands_no_copy (MULT, V2SFmode, operands);") + +(define_insn "*mmx_mulv2sf3" + [(set (match_operand:V2SF 0 "register_operand" "=y") + (mult:V2SF (match_operand:V2SF 1 "nonimmediate_operand" "%0") + (match_operand:V2SF 2 "nonimmediate_operand" "ym")))] + "TARGET_3DNOW && 
ix86_binary_operator_ok (MULT, V2SFmode, operands)" + "pfmul\t{%2, %0|%0, %2}" + [(set_attr "type" "mmxmul") + (set_attr "prefix_extra" "1") + (set_attr "mode" "V2SF")]) + +;; ??? For !flag_finite_math_only, the representation with SMIN/SMAX +;; isn't really correct, as those rtl operators aren't defined when +;; applied to NaNs. Hopefully the optimizers won't get too smart on us. + +(define_expand "mmx_v2sf3" + [(set (match_operand:V2SF 0 "register_operand" "") + (smaxmin:V2SF + (match_operand:V2SF 1 "nonimmediate_operand" "") + (match_operand:V2SF 2 "nonimmediate_operand" "")))] + "TARGET_3DNOW" +{ + if (!flag_finite_math_only) + operands[1] = force_reg (V2SFmode, operands[1]); + ix86_fixup_binary_operands_no_copy (, V2SFmode, operands); +}) + +(define_insn "*mmx_v2sf3_finite" + [(set (match_operand:V2SF 0 "register_operand" "=y") + (smaxmin:V2SF + (match_operand:V2SF 1 "nonimmediate_operand" "%0") + (match_operand:V2SF 2 "nonimmediate_operand" "ym")))] + "TARGET_3DNOW && flag_finite_math_only + && ix86_binary_operator_ok (, V2SFmode, operands)" + "pf\t{%2, %0|%0, %2}" + [(set_attr "type" "mmxadd") + (set_attr "prefix_extra" "1") + (set_attr "mode" "V2SF")]) + +(define_insn "*mmx_v2sf3" + [(set (match_operand:V2SF 0 "register_operand" "=y") + (smaxmin:V2SF + (match_operand:V2SF 1 "register_operand" "0") + (match_operand:V2SF 2 "nonimmediate_operand" "ym")))] + "TARGET_3DNOW" + "pf\t{%2, %0|%0, %2}" + [(set_attr "type" "mmxadd") + (set_attr "prefix_extra" "1") + (set_attr "mode" "V2SF")]) + +(define_insn "mmx_rcpv2sf2" + [(set (match_operand:V2SF 0 "register_operand" "=y") + (unspec:V2SF [(match_operand:V2SF 1 "nonimmediate_operand" "ym")] + UNSPEC_PFRCP))] + "TARGET_3DNOW" + "pfrcp\t{%1, %0|%0, %1}" + [(set_attr "type" "mmx") + (set_attr "prefix_extra" "1") + (set_attr "mode" "V2SF")]) + +(define_insn "mmx_rcpit1v2sf3" + [(set (match_operand:V2SF 0 "register_operand" "=y") + (unspec:V2SF [(match_operand:V2SF 1 "register_operand" "0") + (match_operand:V2SF 2 "nonimmediate_operand" "ym")] + UNSPEC_PFRCPIT1))] + "TARGET_3DNOW" + "pfrcpit1\t{%2, %0|%0, %2}" + [(set_attr "type" "mmx") + (set_attr "prefix_extra" "1") + (set_attr "mode" "V2SF")]) + +(define_insn "mmx_rcpit2v2sf3" + [(set (match_operand:V2SF 0 "register_operand" "=y") + (unspec:V2SF [(match_operand:V2SF 1 "register_operand" "0") + (match_operand:V2SF 2 "nonimmediate_operand" "ym")] + UNSPEC_PFRCPIT2))] + "TARGET_3DNOW" + "pfrcpit2\t{%2, %0|%0, %2}" + [(set_attr "type" "mmx") + (set_attr "prefix_extra" "1") + (set_attr "mode" "V2SF")]) + +(define_insn "mmx_rsqrtv2sf2" + [(set (match_operand:V2SF 0 "register_operand" "=y") + (unspec:V2SF [(match_operand:V2SF 1 "nonimmediate_operand" "ym")] + UNSPEC_PFRSQRT))] + "TARGET_3DNOW" + "pfrsqrt\t{%1, %0|%0, %1}" + [(set_attr "type" "mmx") + (set_attr "prefix_extra" "1") + (set_attr "mode" "V2SF")]) + +(define_insn "mmx_rsqit1v2sf3" + [(set (match_operand:V2SF 0 "register_operand" "=y") + (unspec:V2SF [(match_operand:V2SF 1 "register_operand" "0") + (match_operand:V2SF 2 "nonimmediate_operand" "ym")] + UNSPEC_PFRSQIT1))] + "TARGET_3DNOW" + "pfrsqit1\t{%2, %0|%0, %2}" + [(set_attr "type" "mmx") + (set_attr "prefix_extra" "1") + (set_attr "mode" "V2SF")]) + +(define_insn "mmx_haddv2sf3" + [(set (match_operand:V2SF 0 "register_operand" "=y") + (vec_concat:V2SF + (plus:SF + (vec_select:SF + (match_operand:V2SF 1 "register_operand" "0") + (parallel [(const_int 0)])) + (vec_select:SF (match_dup 1) (parallel [(const_int 1)]))) + (plus:SF + (vec_select:SF + (match_operand:V2SF 2 
"nonimmediate_operand" "ym") + (parallel [(const_int 0)])) + (vec_select:SF (match_dup 2) (parallel [(const_int 1)])))))] + "TARGET_3DNOW" + "pfacc\t{%2, %0|%0, %2}" + [(set_attr "type" "mmxadd") + (set_attr "prefix_extra" "1") + (set_attr "mode" "V2SF")]) + +(define_insn "mmx_hsubv2sf3" + [(set (match_operand:V2SF 0 "register_operand" "=y") + (vec_concat:V2SF + (minus:SF + (vec_select:SF + (match_operand:V2SF 1 "register_operand" "0") + (parallel [(const_int 0)])) + (vec_select:SF (match_dup 1) (parallel [(const_int 1)]))) + (minus:SF + (vec_select:SF + (match_operand:V2SF 2 "nonimmediate_operand" "ym") + (parallel [(const_int 0)])) + (vec_select:SF (match_dup 2) (parallel [(const_int 1)])))))] + "TARGET_3DNOW_A" + "pfnacc\t{%2, %0|%0, %2}" + [(set_attr "type" "mmxadd") + (set_attr "prefix_extra" "1") + (set_attr "mode" "V2SF")]) + +(define_insn "mmx_addsubv2sf3" + [(set (match_operand:V2SF 0 "register_operand" "=y") + (vec_merge:V2SF + (plus:V2SF + (match_operand:V2SF 1 "register_operand" "0") + (match_operand:V2SF 2 "nonimmediate_operand" "ym")) + (minus:V2SF (match_dup 1) (match_dup 2)) + (const_int 1)))] + "TARGET_3DNOW_A" + "pfpnacc\t{%2, %0|%0, %2}" + [(set_attr "type" "mmxadd") + (set_attr "prefix_extra" "1") + (set_attr "mode" "V2SF")]) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; +;; Parallel single-precision floating point comparisons +;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(define_expand "mmx_eqv2sf3" + [(set (match_operand:V2SI 0 "register_operand" "") + (eq:V2SI (match_operand:V2SF 1 "nonimmediate_operand" "") + (match_operand:V2SF 2 "nonimmediate_operand" "")))] + "TARGET_3DNOW" + "ix86_fixup_binary_operands_no_copy (EQ, V2SFmode, operands);") + +(define_insn "*mmx_eqv2sf3" + [(set (match_operand:V2SI 0 "register_operand" "=y") + (eq:V2SI (match_operand:V2SF 1 "nonimmediate_operand" "%0") + (match_operand:V2SF 2 "nonimmediate_operand" "ym")))] + "TARGET_3DNOW && ix86_binary_operator_ok (EQ, V2SFmode, operands)" + "pfcmpeq\t{%2, %0|%0, %2}" + [(set_attr "type" "mmxcmp") + (set_attr "prefix_extra" "1") + (set_attr "mode" "V2SF")]) + +(define_insn "mmx_gtv2sf3" + [(set (match_operand:V2SI 0 "register_operand" "=y") + (gt:V2SI (match_operand:V2SF 1 "register_operand" "0") + (match_operand:V2SF 2 "nonimmediate_operand" "ym")))] + "TARGET_3DNOW" + "pfcmpgt\t{%2, %0|%0, %2}" + [(set_attr "type" "mmxcmp") + (set_attr "prefix_extra" "1") + (set_attr "mode" "V2SF")]) + +(define_insn "mmx_gev2sf3" + [(set (match_operand:V2SI 0 "register_operand" "=y") + (ge:V2SI (match_operand:V2SF 1 "register_operand" "0") + (match_operand:V2SF 2 "nonimmediate_operand" "ym")))] + "TARGET_3DNOW" + "pfcmpge\t{%2, %0|%0, %2}" + [(set_attr "type" "mmxcmp") + (set_attr "prefix_extra" "1") + (set_attr "mode" "V2SF")]) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; +;; Parallel single-precision floating point conversion operations +;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(define_insn "mmx_pf2id" + [(set (match_operand:V2SI 0 "register_operand" "=y") + (fix:V2SI (match_operand:V2SF 1 "nonimmediate_operand" "ym")))] + "TARGET_3DNOW" + "pf2id\t{%1, %0|%0, %1}" + [(set_attr "type" "mmxcvt") + (set_attr "prefix_extra" "1") + (set_attr "mode" "V2SF")]) + +(define_insn "mmx_pf2iw" + [(set (match_operand:V2SI 0 "register_operand" "=y") + (sign_extend:V2SI + (ss_truncate:V2HI + (fix:V2SI + (match_operand:V2SF 1 "nonimmediate_operand" "ym")))))] + "TARGET_3DNOW_A" + "pf2iw\t{%1, 
%0|%0, %1}" + [(set_attr "type" "mmxcvt") + (set_attr "prefix_extra" "1") + (set_attr "mode" "V2SF")]) + +(define_insn "mmx_pi2fw" + [(set (match_operand:V2SF 0 "register_operand" "=y") + (float:V2SF + (sign_extend:V2SI + (truncate:V2HI + (match_operand:V2SI 1 "nonimmediate_operand" "ym")))))] + "TARGET_3DNOW_A" + "pi2fw\t{%1, %0|%0, %1}" + [(set_attr "type" "mmxcvt") + (set_attr "prefix_extra" "1") + (set_attr "mode" "V2SF")]) + +(define_insn "mmx_floatv2si2" + [(set (match_operand:V2SF 0 "register_operand" "=y") + (float:V2SF (match_operand:V2SI 1 "nonimmediate_operand" "ym")))] + "TARGET_3DNOW" + "pi2fd\t{%1, %0|%0, %1}" + [(set_attr "type" "mmxcvt") + (set_attr "prefix_extra" "1") + (set_attr "mode" "V2SF")]) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; +;; Parallel single-precision floating point element swizzling +;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(define_insn "mmx_pswapdv2sf2" + [(set (match_operand:V2SF 0 "register_operand" "=y") + (vec_select:V2SF (match_operand:V2SF 1 "nonimmediate_operand" "ym") + (parallel [(const_int 1) (const_int 0)])))] + "TARGET_3DNOW_A" + "pswapd\t{%1, %0|%0, %1}" + [(set_attr "type" "mmxcvt") + (set_attr "prefix_extra" "1") + (set_attr "mode" "V2SF")]) + +(define_insn "*vec_dupv2sf" + [(set (match_operand:V2SF 0 "register_operand" "=y") + (vec_duplicate:V2SF + (match_operand:SF 1 "register_operand" "0")))] + "TARGET_MMX" + "punpckldq\t%0, %0" + [(set_attr "type" "mmxcvt") + (set_attr "mode" "DI")]) + +(define_insn "*mmx_concatv2sf" + [(set (match_operand:V2SF 0 "register_operand" "=y,y") + (vec_concat:V2SF + (match_operand:SF 1 "nonimmediate_operand" " 0,rm") + (match_operand:SF 2 "vector_move_operand" "ym,C")))] + "TARGET_MMX && !TARGET_SSE" + "@ + punpckldq\t{%2, %0|%0, %2} + movd\t{%1, %0|%0, %1}" + [(set_attr "type" "mmxcvt,mmxmov") + (set_attr "mode" "DI")]) + +(define_expand "vec_setv2sf" + [(match_operand:V2SF 0 "register_operand" "") + (match_operand:SF 1 "register_operand" "") + (match_operand 2 "const_int_operand" "")] + "TARGET_MMX" +{ + ix86_expand_vector_set (false, operands[0], operands[1], + INTVAL (operands[2])); + DONE; +}) + +;; Avoid combining registers from different units in a single alternative, +;; see comment above inline_secondary_memory_needed function in i386.c +(define_insn_and_split "*vec_extractv2sf_0" + [(set (match_operand:SF 0 "nonimmediate_operand" "=x, m,y ,m,f,r") + (vec_select:SF + (match_operand:V2SF 1 "nonimmediate_operand" " xm,x,ym,y,m,m") + (parallel [(const_int 0)])))] + "TARGET_MMX && !(MEM_P (operands[0]) && MEM_P (operands[1]))" + "#" + "&& reload_completed" + [(const_int 0)] +{ + rtx op1 = operands[1]; + if (REG_P (op1)) + op1 = gen_rtx_REG (SFmode, REGNO (op1)); + else + op1 = gen_lowpart (SFmode, op1); + emit_move_insn (operands[0], op1); + DONE; +}) + +;; Avoid combining registers from different units in a single alternative, +;; see comment above inline_secondary_memory_needed function in i386.c +(define_insn "*vec_extractv2sf_1" + [(set (match_operand:SF 0 "nonimmediate_operand" "=y,x,y,x,f,r") + (vec_select:SF + (match_operand:V2SF 1 "nonimmediate_operand" " 0,0,o,o,o,o") + (parallel [(const_int 1)])))] + "TARGET_MMX && !(MEM_P (operands[0]) && MEM_P (operands[1]))" + "@ + punpckhdq\t%0, %0 + unpckhps\t%0, %0 + # + # + # + #" + [(set_attr "type" "mmxcvt,sselog1,mmxmov,ssemov,fmov,imov") + (set_attr "mode" "DI,V4SF,SF,SF,SF,SF")]) + +(define_split + [(set (match_operand:SF 0 "register_operand" "") + (vec_select:SF + 
(match_operand:V2SF 1 "memory_operand" "") + (parallel [(const_int 1)])))] + "TARGET_MMX && reload_completed" + [(const_int 0)] +{ + operands[1] = adjust_address (operands[1], SFmode, 4); + emit_move_insn (operands[0], operands[1]); + DONE; +}) + +(define_expand "vec_extractv2sf" + [(match_operand:SF 0 "register_operand" "") + (match_operand:V2SF 1 "register_operand" "") + (match_operand 2 "const_int_operand" "")] + "TARGET_MMX" +{ + ix86_expand_vector_extract (false, operands[0], operands[1], + INTVAL (operands[2])); + DONE; +}) + +(define_expand "vec_initv2sf" + [(match_operand:V2SF 0 "register_operand" "") + (match_operand 1 "" "")] + "TARGET_SSE" +{ + ix86_expand_vector_init (false, operands[0], operands[1]); + DONE; +}) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; +;; Parallel integral arithmetic +;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(define_expand "mmx_3" + [(set (match_operand:MMXMODEI8 0 "register_operand" "") + (plusminus:MMXMODEI8 + (match_operand:MMXMODEI8 1 "nonimmediate_operand" "") + (match_operand:MMXMODEI8 2 "nonimmediate_operand" "")))] + "TARGET_MMX || (TARGET_SSE2 && mode == V1DImode)" + "ix86_fixup_binary_operands_no_copy (, mode, operands);") + +(define_insn "*mmx_3" + [(set (match_operand:MMXMODEI8 0 "register_operand" "=y") + (plusminus:MMXMODEI8 + (match_operand:MMXMODEI8 1 "nonimmediate_operand" "0") + (match_operand:MMXMODEI8 2 "nonimmediate_operand" "ym")))] + "(TARGET_MMX || (TARGET_SSE2 && mode == V1DImode)) + && ix86_binary_operator_ok (, mode, operands)" + "p\t{%2, %0|%0, %2}" + [(set_attr "type" "mmxadd") + (set_attr "mode" "DI")]) + +(define_expand "mmx_3" + [(set (match_operand:MMXMODE12 0 "register_operand" "") + (sat_plusminus:MMXMODE12 + (match_operand:MMXMODE12 1 "nonimmediate_operand" "") + (match_operand:MMXMODE12 2 "nonimmediate_operand" "")))] + "TARGET_MMX" + "ix86_fixup_binary_operands_no_copy (, mode, operands);") + +(define_insn "*mmx_3" + [(set (match_operand:MMXMODE12 0 "register_operand" "=y") + (sat_plusminus:MMXMODE12 + (match_operand:MMXMODE12 1 "nonimmediate_operand" "0") + (match_operand:MMXMODE12 2 "nonimmediate_operand" "ym")))] + "TARGET_MMX && ix86_binary_operator_ok (, mode, operands)" + "p\t{%2, %0|%0, %2}" + [(set_attr "type" "mmxadd") + (set_attr "mode" "DI")]) + +(define_expand "mmx_mulv4hi3" + [(set (match_operand:V4HI 0 "register_operand" "") + (mult:V4HI (match_operand:V4HI 1 "nonimmediate_operand" "") + (match_operand:V4HI 2 "nonimmediate_operand" "")))] + "TARGET_MMX" + "ix86_fixup_binary_operands_no_copy (MULT, V4HImode, operands);") + +(define_insn "*mmx_mulv4hi3" + [(set (match_operand:V4HI 0 "register_operand" "=y") + (mult:V4HI (match_operand:V4HI 1 "nonimmediate_operand" "%0") + (match_operand:V4HI 2 "nonimmediate_operand" "ym")))] + "TARGET_MMX && ix86_binary_operator_ok (MULT, V4HImode, operands)" + "pmullw\t{%2, %0|%0, %2}" + [(set_attr "type" "mmxmul") + (set_attr "mode" "DI")]) + +(define_expand "mmx_smulv4hi3_highpart" + [(set (match_operand:V4HI 0 "register_operand" "") + (truncate:V4HI + (lshiftrt:V4SI + (mult:V4SI + (sign_extend:V4SI + (match_operand:V4HI 1 "nonimmediate_operand" "")) + (sign_extend:V4SI + (match_operand:V4HI 2 "nonimmediate_operand" ""))) + (const_int 16))))] + "TARGET_MMX" + "ix86_fixup_binary_operands_no_copy (MULT, V4HImode, operands);") + +(define_insn "*mmx_smulv4hi3_highpart" + [(set (match_operand:V4HI 0 "register_operand" "=y") + (truncate:V4HI + (lshiftrt:V4SI + (mult:V4SI + (sign_extend:V4SI + 
(match_operand:V4HI 1 "nonimmediate_operand" "%0")) + (sign_extend:V4SI + (match_operand:V4HI 2 "nonimmediate_operand" "ym"))) + (const_int 16))))] + "TARGET_MMX && ix86_binary_operator_ok (MULT, V4HImode, operands)" + "pmulhw\t{%2, %0|%0, %2}" + [(set_attr "type" "mmxmul") + (set_attr "mode" "DI")]) + +(define_expand "mmx_umulv4hi3_highpart" + [(set (match_operand:V4HI 0 "register_operand" "") + (truncate:V4HI + (lshiftrt:V4SI + (mult:V4SI + (zero_extend:V4SI + (match_operand:V4HI 1 "nonimmediate_operand" "")) + (zero_extend:V4SI + (match_operand:V4HI 2 "nonimmediate_operand" ""))) + (const_int 16))))] + "TARGET_SSE || TARGET_3DNOW_A" + "ix86_fixup_binary_operands_no_copy (MULT, V4HImode, operands);") + +(define_insn "*mmx_umulv4hi3_highpart" + [(set (match_operand:V4HI 0 "register_operand" "=y") + (truncate:V4HI + (lshiftrt:V4SI + (mult:V4SI + (zero_extend:V4SI + (match_operand:V4HI 1 "nonimmediate_operand" "%0")) + (zero_extend:V4SI + (match_operand:V4HI 2 "nonimmediate_operand" "ym"))) + (const_int 16))))] + "(TARGET_SSE || TARGET_3DNOW_A) + && ix86_binary_operator_ok (MULT, V4HImode, operands)" + "pmulhuw\t{%2, %0|%0, %2}" + [(set_attr "type" "mmxmul") + (set_attr "mode" "DI")]) + +(define_expand "mmx_pmaddwd" + [(set (match_operand:V2SI 0 "register_operand" "") + (plus:V2SI + (mult:V2SI + (sign_extend:V2SI + (vec_select:V2HI + (match_operand:V4HI 1 "nonimmediate_operand" "") + (parallel [(const_int 0) (const_int 2)]))) + (sign_extend:V2SI + (vec_select:V2HI + (match_operand:V4HI 2 "nonimmediate_operand" "") + (parallel [(const_int 0) (const_int 2)])))) + (mult:V2SI + (sign_extend:V2SI + (vec_select:V2HI (match_dup 1) + (parallel [(const_int 1) (const_int 3)]))) + (sign_extend:V2SI + (vec_select:V2HI (match_dup 2) + (parallel [(const_int 1) (const_int 3)]))))))] + "TARGET_MMX" + "ix86_fixup_binary_operands_no_copy (MULT, V4HImode, operands);") + +(define_insn "*mmx_pmaddwd" + [(set (match_operand:V2SI 0 "register_operand" "=y") + (plus:V2SI + (mult:V2SI + (sign_extend:V2SI + (vec_select:V2HI + (match_operand:V4HI 1 "nonimmediate_operand" "%0") + (parallel [(const_int 0) (const_int 2)]))) + (sign_extend:V2SI + (vec_select:V2HI + (match_operand:V4HI 2 "nonimmediate_operand" "ym") + (parallel [(const_int 0) (const_int 2)])))) + (mult:V2SI + (sign_extend:V2SI + (vec_select:V2HI (match_dup 1) + (parallel [(const_int 1) (const_int 3)]))) + (sign_extend:V2SI + (vec_select:V2HI (match_dup 2) + (parallel [(const_int 1) (const_int 3)]))))))] + "TARGET_MMX && ix86_binary_operator_ok (MULT, V4HImode, operands)" + "pmaddwd\t{%2, %0|%0, %2}" + [(set_attr "type" "mmxmul") + (set_attr "mode" "DI")]) + +(define_expand "mmx_pmulhrwv4hi3" + [(set (match_operand:V4HI 0 "register_operand" "") + (truncate:V4HI + (lshiftrt:V4SI + (plus:V4SI + (mult:V4SI + (sign_extend:V4SI + (match_operand:V4HI 1 "nonimmediate_operand" "")) + (sign_extend:V4SI + (match_operand:V4HI 2 "nonimmediate_operand" ""))) + (const_vector:V4SI [(const_int 32768) (const_int 32768) + (const_int 32768) (const_int 32768)])) + (const_int 16))))] + "TARGET_3DNOW" + "ix86_fixup_binary_operands_no_copy (MULT, V4HImode, operands);") + +(define_insn "*mmx_pmulhrwv4hi3" + [(set (match_operand:V4HI 0 "register_operand" "=y") + (truncate:V4HI + (lshiftrt:V4SI + (plus:V4SI + (mult:V4SI + (sign_extend:V4SI + (match_operand:V4HI 1 "nonimmediate_operand" "%0")) + (sign_extend:V4SI + (match_operand:V4HI 2 "nonimmediate_operand" "ym"))) + (const_vector:V4SI [(const_int 32768) (const_int 32768) + (const_int 32768) (const_int 32768)])) + (const_int 
16))))] + "TARGET_3DNOW && ix86_binary_operator_ok (MULT, V4HImode, operands)" + "pmulhrw\t{%2, %0|%0, %2}" + [(set_attr "type" "mmxmul") + (set_attr "prefix_extra" "1") + (set_attr "mode" "DI")]) + +(define_expand "sse2_umulv1siv1di3" + [(set (match_operand:V1DI 0 "register_operand" "") + (mult:V1DI + (zero_extend:V1DI + (vec_select:V1SI + (match_operand:V2SI 1 "nonimmediate_operand" "") + (parallel [(const_int 0)]))) + (zero_extend:V1DI + (vec_select:V1SI + (match_operand:V2SI 2 "nonimmediate_operand" "") + (parallel [(const_int 0)])))))] + "TARGET_SSE2" + "ix86_fixup_binary_operands_no_copy (MULT, V2SImode, operands);") + +(define_insn "*sse2_umulv1siv1di3" + [(set (match_operand:V1DI 0 "register_operand" "=y") + (mult:V1DI + (zero_extend:V1DI + (vec_select:V1SI + (match_operand:V2SI 1 "nonimmediate_operand" "%0") + (parallel [(const_int 0)]))) + (zero_extend:V1DI + (vec_select:V1SI + (match_operand:V2SI 2 "nonimmediate_operand" "ym") + (parallel [(const_int 0)])))))] + "TARGET_SSE2 && ix86_binary_operator_ok (MULT, V2SImode, operands)" + "pmuludq\t{%2, %0|%0, %2}" + [(set_attr "type" "mmxmul") + (set_attr "mode" "DI")]) + +(define_expand "mmx_v4hi3" + [(set (match_operand:V4HI 0 "register_operand" "") + (smaxmin:V4HI + (match_operand:V4HI 1 "nonimmediate_operand" "") + (match_operand:V4HI 2 "nonimmediate_operand" "")))] + "TARGET_SSE || TARGET_3DNOW_A" + "ix86_fixup_binary_operands_no_copy (, V4HImode, operands);") + +(define_insn "*mmx_v4hi3" + [(set (match_operand:V4HI 0 "register_operand" "=y") + (smaxmin:V4HI + (match_operand:V4HI 1 "nonimmediate_operand" "%0") + (match_operand:V4HI 2 "nonimmediate_operand" "ym")))] + "(TARGET_SSE || TARGET_3DNOW_A) + && ix86_binary_operator_ok (, V4HImode, operands)" + "pw\t{%2, %0|%0, %2}" + [(set_attr "type" "mmxadd") + (set_attr "mode" "DI")]) + +(define_expand "mmx_v8qi3" + [(set (match_operand:V8QI 0 "register_operand" "") + (umaxmin:V8QI + (match_operand:V8QI 1 "nonimmediate_operand" "") + (match_operand:V8QI 2 "nonimmediate_operand" "")))] + "TARGET_SSE || TARGET_3DNOW_A" + "ix86_fixup_binary_operands_no_copy (, V8QImode, operands);") + +(define_insn "*mmx_v8qi3" + [(set (match_operand:V8QI 0 "register_operand" "=y") + (umaxmin:V8QI + (match_operand:V8QI 1 "nonimmediate_operand" "%0") + (match_operand:V8QI 2 "nonimmediate_operand" "ym")))] + "(TARGET_SSE || TARGET_3DNOW_A) + && ix86_binary_operator_ok (, V8QImode, operands)" + "pb\t{%2, %0|%0, %2}" + [(set_attr "type" "mmxadd") + (set_attr "mode" "DI")]) + +(define_insn "mmx_ashr3" + [(set (match_operand:MMXMODE24 0 "register_operand" "=y") + (ashiftrt:MMXMODE24 + (match_operand:MMXMODE24 1 "register_operand" "0") + (match_operand:SI 2 "nonmemory_operand" "yN")))] + "TARGET_MMX" + "psra\t{%2, %0|%0, %2}" + [(set_attr "type" "mmxshft") + (set (attr "length_immediate") + (if_then_else (match_operand 2 "const_int_operand" "") + (const_string "1") + (const_string "0"))) + (set_attr "mode" "DI")]) + +(define_insn "mmx_lshr3" + [(set (match_operand:MMXMODE248 0 "register_operand" "=y") + (lshiftrt:MMXMODE248 + (match_operand:MMXMODE248 1 "register_operand" "0") + (match_operand:SI 2 "nonmemory_operand" "yN")))] + "TARGET_MMX" + "psrl\t{%2, %0|%0, %2}" + [(set_attr "type" "mmxshft") + (set (attr "length_immediate") + (if_then_else (match_operand 2 "const_int_operand" "") + (const_string "1") + (const_string "0"))) + (set_attr "mode" "DI")]) + +(define_insn "mmx_ashl3" + [(set (match_operand:MMXMODE248 0 "register_operand" "=y") + (ashift:MMXMODE248 + (match_operand:MMXMODE248 1 "register_operand" 
"0") + (match_operand:SI 2 "nonmemory_operand" "yN")))] + "TARGET_MMX" + "psll\t{%2, %0|%0, %2}" + [(set_attr "type" "mmxshft") + (set (attr "length_immediate") + (if_then_else (match_operand 2 "const_int_operand" "") + (const_string "1") + (const_string "0"))) + (set_attr "mode" "DI")]) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; +;; Parallel integral comparisons +;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(define_expand "mmx_eq3" + [(set (match_operand:MMXMODEI 0 "register_operand" "") + (eq:MMXMODEI + (match_operand:MMXMODEI 1 "nonimmediate_operand" "") + (match_operand:MMXMODEI 2 "nonimmediate_operand" "")))] + "TARGET_MMX" + "ix86_fixup_binary_operands_no_copy (EQ, mode, operands);") + +(define_insn "*mmx_eq3" + [(set (match_operand:MMXMODEI 0 "register_operand" "=y") + (eq:MMXMODEI + (match_operand:MMXMODEI 1 "nonimmediate_operand" "%0") + (match_operand:MMXMODEI 2 "nonimmediate_operand" "ym")))] + "TARGET_MMX && ix86_binary_operator_ok (EQ, mode, operands)" + "pcmpeq\t{%2, %0|%0, %2}" + [(set_attr "type" "mmxcmp") + (set_attr "mode" "DI")]) + +(define_insn "mmx_gt3" + [(set (match_operand:MMXMODEI 0 "register_operand" "=y") + (gt:MMXMODEI + (match_operand:MMXMODEI 1 "register_operand" "0") + (match_operand:MMXMODEI 2 "nonimmediate_operand" "ym")))] + "TARGET_MMX" + "pcmpgt\t{%2, %0|%0, %2}" + [(set_attr "type" "mmxcmp") + (set_attr "mode" "DI")]) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; +;; Parallel integral logical operations +;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(define_insn "mmx_andnot3" + [(set (match_operand:MMXMODEI 0 "register_operand" "=y") + (and:MMXMODEI + (not:MMXMODEI (match_operand:MMXMODEI 1 "register_operand" "0")) + (match_operand:MMXMODEI 2 "nonimmediate_operand" "ym")))] + "TARGET_MMX" + "pandn\t{%2, %0|%0, %2}" + [(set_attr "type" "mmxadd") + (set_attr "mode" "DI")]) + +(define_expand "mmx_3" + [(set (match_operand:MMXMODEI 0 "register_operand" "") + (any_logic:MMXMODEI + (match_operand:MMXMODEI 1 "nonimmediate_operand" "") + (match_operand:MMXMODEI 2 "nonimmediate_operand" "")))] + "TARGET_MMX" + "ix86_fixup_binary_operands_no_copy (, mode, operands);") + +(define_insn "*mmx_3" + [(set (match_operand:MMXMODEI 0 "register_operand" "=y") + (any_logic:MMXMODEI + (match_operand:MMXMODEI 1 "nonimmediate_operand" "%0") + (match_operand:MMXMODEI 2 "nonimmediate_operand" "ym")))] + "TARGET_MMX && ix86_binary_operator_ok (, mode, operands)" + "p\t{%2, %0|%0, %2}" + [(set_attr "type" "mmxadd") + (set_attr "mode" "DI")]) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; +;; Parallel integral element swizzling +;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(define_insn "mmx_packsswb" + [(set (match_operand:V8QI 0 "register_operand" "=y") + (vec_concat:V8QI + (ss_truncate:V4QI + (match_operand:V4HI 1 "register_operand" "0")) + (ss_truncate:V4QI + (match_operand:V4HI 2 "nonimmediate_operand" "ym"))))] + "TARGET_MMX" + "packsswb\t{%2, %0|%0, %2}" + [(set_attr "type" "mmxshft") + (set_attr "mode" "DI")]) + +(define_insn "mmx_packssdw" + [(set (match_operand:V4HI 0 "register_operand" "=y") + (vec_concat:V4HI + (ss_truncate:V2HI + (match_operand:V2SI 1 "register_operand" "0")) + (ss_truncate:V2HI + (match_operand:V2SI 2 "nonimmediate_operand" "ym"))))] + "TARGET_MMX" + "packssdw\t{%2, %0|%0, %2}" + [(set_attr "type" "mmxshft") + (set_attr "mode" "DI")]) + +(define_insn "mmx_packuswb" 
+ [(set (match_operand:V8QI 0 "register_operand" "=y") + (vec_concat:V8QI + (us_truncate:V4QI + (match_operand:V4HI 1 "register_operand" "0")) + (us_truncate:V4QI + (match_operand:V4HI 2 "nonimmediate_operand" "ym"))))] + "TARGET_MMX" + "packuswb\t{%2, %0|%0, %2}" + [(set_attr "type" "mmxshft") + (set_attr "mode" "DI")]) + +(define_insn "mmx_punpckhbw" + [(set (match_operand:V8QI 0 "register_operand" "=y") + (vec_select:V8QI + (vec_concat:V16QI + (match_operand:V8QI 1 "register_operand" "0") + (match_operand:V8QI 2 "nonimmediate_operand" "ym")) + (parallel [(const_int 4) (const_int 12) + (const_int 5) (const_int 13) + (const_int 6) (const_int 14) + (const_int 7) (const_int 15)])))] + "TARGET_MMX" + "punpckhbw\t{%2, %0|%0, %2}" + [(set_attr "type" "mmxcvt") + (set_attr "mode" "DI")]) + +(define_insn "mmx_punpcklbw" + [(set (match_operand:V8QI 0 "register_operand" "=y") + (vec_select:V8QI + (vec_concat:V16QI + (match_operand:V8QI 1 "register_operand" "0") + (match_operand:V8QI 2 "nonimmediate_operand" "ym")) + (parallel [(const_int 0) (const_int 8) + (const_int 1) (const_int 9) + (const_int 2) (const_int 10) + (const_int 3) (const_int 11)])))] + "TARGET_MMX" + "punpcklbw\t{%2, %0|%0, %2}" + [(set_attr "type" "mmxcvt") + (set_attr "mode" "DI")]) + +(define_insn "mmx_punpckhwd" + [(set (match_operand:V4HI 0 "register_operand" "=y") + (vec_select:V4HI + (vec_concat:V8HI + (match_operand:V4HI 1 "register_operand" "0") + (match_operand:V4HI 2 "nonimmediate_operand" "ym")) + (parallel [(const_int 2) (const_int 6) + (const_int 3) (const_int 7)])))] + "TARGET_MMX" + "punpckhwd\t{%2, %0|%0, %2}" + [(set_attr "type" "mmxcvt") + (set_attr "mode" "DI")]) + +(define_insn "mmx_punpcklwd" + [(set (match_operand:V4HI 0 "register_operand" "=y") + (vec_select:V4HI + (vec_concat:V8HI + (match_operand:V4HI 1 "register_operand" "0") + (match_operand:V4HI 2 "nonimmediate_operand" "ym")) + (parallel [(const_int 0) (const_int 4) + (const_int 1) (const_int 5)])))] + "TARGET_MMX" + "punpcklwd\t{%2, %0|%0, %2}" + [(set_attr "type" "mmxcvt") + (set_attr "mode" "DI")]) + +(define_insn "mmx_punpckhdq" + [(set (match_operand:V2SI 0 "register_operand" "=y") + (vec_select:V2SI + (vec_concat:V4SI + (match_operand:V2SI 1 "register_operand" "0") + (match_operand:V2SI 2 "nonimmediate_operand" "ym")) + (parallel [(const_int 1) + (const_int 3)])))] + "TARGET_MMX" + "punpckhdq\t{%2, %0|%0, %2}" + [(set_attr "type" "mmxcvt") + (set_attr "mode" "DI")]) + +(define_insn "mmx_punpckldq" + [(set (match_operand:V2SI 0 "register_operand" "=y") + (vec_select:V2SI + (vec_concat:V4SI + (match_operand:V2SI 1 "register_operand" "0") + (match_operand:V2SI 2 "nonimmediate_operand" "ym")) + (parallel [(const_int 0) + (const_int 2)])))] + "TARGET_MMX" + "punpckldq\t{%2, %0|%0, %2}" + [(set_attr "type" "mmxcvt") + (set_attr "mode" "DI")]) + +(define_expand "mmx_pinsrw" + [(set (match_operand:V4HI 0 "register_operand" "") + (vec_merge:V4HI + (vec_duplicate:V4HI + (match_operand:SI 2 "nonimmediate_operand" "")) + (match_operand:V4HI 1 "register_operand" "") + (match_operand:SI 3 "const_0_to_3_operand" "")))] + "TARGET_SSE || TARGET_3DNOW_A" +{ + operands[2] = gen_lowpart (HImode, operands[2]); + operands[3] = GEN_INT (1 << INTVAL (operands[3])); +}) + +(define_insn "*mmx_pinsrw" + [(set (match_operand:V4HI 0 "register_operand" "=y") + (vec_merge:V4HI + (vec_duplicate:V4HI + (match_operand:HI 2 "nonimmediate_operand" "rm")) + (match_operand:V4HI 1 "register_operand" "0") + (match_operand:SI 3 "const_pow2_1_to_8_operand" "n")))] + "TARGET_SSE || 
TARGET_3DNOW_A" +{ + operands[3] = GEN_INT (exact_log2 (INTVAL (operands[3]))); + if (MEM_P (operands[2])) + return "pinsrw\t{%3, %2, %0|%0, %2, %3}"; + else + return "pinsrw\t{%3, %k2, %0|%0, %k2, %3}"; +} + [(set_attr "type" "mmxcvt") + (set_attr "length_immediate" "1") + (set_attr "mode" "DI")]) + +(define_insn "mmx_pextrw" + [(set (match_operand:SI 0 "register_operand" "=r") + (zero_extend:SI + (vec_select:HI + (match_operand:V4HI 1 "register_operand" "y") + (parallel [(match_operand:SI 2 "const_0_to_3_operand" "n")]))))] + "TARGET_SSE || TARGET_3DNOW_A" + "pextrw\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "mmxcvt") + (set_attr "length_immediate" "1") + (set_attr "mode" "DI")]) + +(define_expand "mmx_pshufw" + [(match_operand:V4HI 0 "register_operand" "") + (match_operand:V4HI 1 "nonimmediate_operand" "") + (match_operand:SI 2 "const_int_operand" "")] + "TARGET_SSE || TARGET_3DNOW_A" +{ + int mask = INTVAL (operands[2]); + emit_insn (gen_mmx_pshufw_1 (operands[0], operands[1], + GEN_INT ((mask >> 0) & 3), + GEN_INT ((mask >> 2) & 3), + GEN_INT ((mask >> 4) & 3), + GEN_INT ((mask >> 6) & 3))); + DONE; +}) + +(define_insn "mmx_pshufw_1" + [(set (match_operand:V4HI 0 "register_operand" "=y") + (vec_select:V4HI + (match_operand:V4HI 1 "nonimmediate_operand" "ym") + (parallel [(match_operand 2 "const_0_to_3_operand" "") + (match_operand 3 "const_0_to_3_operand" "") + (match_operand 4 "const_0_to_3_operand" "") + (match_operand 5 "const_0_to_3_operand" "")])))] + "TARGET_SSE || TARGET_3DNOW_A" +{ + int mask = 0; + mask |= INTVAL (operands[2]) << 0; + mask |= INTVAL (operands[3]) << 2; + mask |= INTVAL (operands[4]) << 4; + mask |= INTVAL (operands[5]) << 6; + operands[2] = GEN_INT (mask); + + return "pshufw\t{%2, %1, %0|%0, %1, %2}"; +} + [(set_attr "type" "mmxcvt") + (set_attr "length_immediate" "1") + (set_attr "mode" "DI")]) + +(define_insn "mmx_pswapdv2si2" + [(set (match_operand:V2SI 0 "register_operand" "=y") + (vec_select:V2SI + (match_operand:V2SI 1 "nonimmediate_operand" "ym") + (parallel [(const_int 1) (const_int 0)])))] + "TARGET_3DNOW_A" + "pswapd\t{%1, %0|%0, %1}" + [(set_attr "type" "mmxcvt") + (set_attr "prefix_extra" "1") + (set_attr "mode" "DI")]) + +(define_insn "*vec_dupv4hi" + [(set (match_operand:V4HI 0 "register_operand" "=y") + (vec_duplicate:V4HI + (truncate:HI + (match_operand:SI 1 "register_operand" "0"))))] + "TARGET_SSE || TARGET_3DNOW_A" + "pshufw\t{$0, %0, %0|%0, %0, 0}" + [(set_attr "type" "mmxcvt") + (set_attr "length_immediate" "1") + (set_attr "mode" "DI")]) + +(define_insn "*vec_dupv2si" + [(set (match_operand:V2SI 0 "register_operand" "=y") + (vec_duplicate:V2SI + (match_operand:SI 1 "register_operand" "0")))] + "TARGET_MMX" + "punpckldq\t%0, %0" + [(set_attr "type" "mmxcvt") + (set_attr "mode" "DI")]) + +(define_insn "*mmx_concatv2si" + [(set (match_operand:V2SI 0 "register_operand" "=y,y") + (vec_concat:V2SI + (match_operand:SI 1 "nonimmediate_operand" " 0,rm") + (match_operand:SI 2 "vector_move_operand" "ym,C")))] + "TARGET_MMX && !TARGET_SSE" + "@ + punpckldq\t{%2, %0|%0, %2} + movd\t{%1, %0|%0, %1}" + [(set_attr "type" "mmxcvt,mmxmov") + (set_attr "mode" "DI")]) + +(define_expand "vec_setv2si" + [(match_operand:V2SI 0 "register_operand" "") + (match_operand:SI 1 "register_operand" "") + (match_operand 2 "const_int_operand" "")] + "TARGET_MMX" +{ + ix86_expand_vector_set (false, operands[0], operands[1], + INTVAL (operands[2])); + DONE; +}) + +;; Avoid combining registers from different units in a single alternative, +;; see comment above 
inline_secondary_memory_needed function in i386.c +(define_insn_and_split "*vec_extractv2si_0" + [(set (match_operand:SI 0 "nonimmediate_operand" "=x,m,y, m,r") + (vec_select:SI + (match_operand:V2SI 1 "nonimmediate_operand" "xm,x,ym,y,m") + (parallel [(const_int 0)])))] + "TARGET_MMX && !(MEM_P (operands[0]) && MEM_P (operands[1]))" + "#" + "&& reload_completed" + [(const_int 0)] +{ + rtx op1 = operands[1]; + if (REG_P (op1)) + op1 = gen_rtx_REG (SImode, REGNO (op1)); + else + op1 = gen_lowpart (SImode, op1); + emit_move_insn (operands[0], op1); + DONE; +}) + +;; Avoid combining registers from different units in a single alternative, +;; see comment above inline_secondary_memory_needed function in i386.c +(define_insn "*vec_extractv2si_1" + [(set (match_operand:SI 0 "nonimmediate_operand" "=y,Y2,Y2,x,y,x,r") + (vec_select:SI + (match_operand:V2SI 1 "nonimmediate_operand" " 0,0 ,Y2,0,o,o,o") + (parallel [(const_int 1)])))] + "TARGET_MMX && !(MEM_P (operands[0]) && MEM_P (operands[1]))" + "@ + punpckhdq\t%0, %0 + punpckhdq\t%0, %0 + pshufd\t{$85, %1, %0|%0, %1, 85} + unpckhps\t%0, %0 + # + # + #" + [(set_attr "type" "mmxcvt,sselog1,sselog1,sselog1,mmxmov,ssemov,imov") + (set_attr "length_immediate" "*,*,1,*,*,*,*") + (set_attr "mode" "DI,TI,TI,V4SF,SI,SI,SI")]) + +(define_split + [(set (match_operand:SI 0 "register_operand" "") + (vec_select:SI + (match_operand:V2SI 1 "memory_operand" "") + (parallel [(const_int 1)])))] + "TARGET_MMX && reload_completed" + [(const_int 0)] +{ + operands[1] = adjust_address (operands[1], SImode, 4); + emit_move_insn (operands[0], operands[1]); + DONE; +}) + +(define_expand "vec_extractv2si" + [(match_operand:SI 0 "register_operand" "") + (match_operand:V2SI 1 "register_operand" "") + (match_operand 2 "const_int_operand" "")] + "TARGET_MMX" +{ + ix86_expand_vector_extract (false, operands[0], operands[1], + INTVAL (operands[2])); + DONE; +}) + +(define_expand "vec_initv2si" + [(match_operand:V2SI 0 "register_operand" "") + (match_operand 1 "" "")] + "TARGET_SSE" +{ + ix86_expand_vector_init (false, operands[0], operands[1]); + DONE; +}) + +(define_expand "vec_setv4hi" + [(match_operand:V4HI 0 "register_operand" "") + (match_operand:HI 1 "register_operand" "") + (match_operand 2 "const_int_operand" "")] + "TARGET_MMX" +{ + ix86_expand_vector_set (false, operands[0], operands[1], + INTVAL (operands[2])); + DONE; +}) + +(define_expand "vec_extractv4hi" + [(match_operand:HI 0 "register_operand" "") + (match_operand:V4HI 1 "register_operand" "") + (match_operand 2 "const_int_operand" "")] + "TARGET_MMX" +{ + ix86_expand_vector_extract (false, operands[0], operands[1], + INTVAL (operands[2])); + DONE; +}) + +(define_expand "vec_initv4hi" + [(match_operand:V4HI 0 "register_operand" "") + (match_operand 1 "" "")] + "TARGET_SSE" +{ + ix86_expand_vector_init (false, operands[0], operands[1]); + DONE; +}) + +(define_expand "vec_setv8qi" + [(match_operand:V8QI 0 "register_operand" "") + (match_operand:QI 1 "register_operand" "") + (match_operand 2 "const_int_operand" "")] + "TARGET_MMX" +{ + ix86_expand_vector_set (false, operands[0], operands[1], + INTVAL (operands[2])); + DONE; +}) + +(define_expand "vec_extractv8qi" + [(match_operand:QI 0 "register_operand" "") + (match_operand:V8QI 1 "register_operand" "") + (match_operand 2 "const_int_operand" "")] + "TARGET_MMX" +{ + ix86_expand_vector_extract (false, operands[0], operands[1], + INTVAL (operands[2])); + DONE; +}) + +(define_expand "vec_initv8qi" + [(match_operand:V8QI 0 "register_operand" "") + (match_operand 1 "" 
"")] + "TARGET_SSE" +{ + ix86_expand_vector_init (false, operands[0], operands[1]); + DONE; +}) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; +;; Miscellaneous +;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(define_expand "mmx_uavgv8qi3" + [(set (match_operand:V8QI 0 "register_operand" "") + (truncate:V8QI + (lshiftrt:V8HI + (plus:V8HI + (plus:V8HI + (zero_extend:V8HI + (match_operand:V8QI 1 "nonimmediate_operand" "")) + (zero_extend:V8HI + (match_operand:V8QI 2 "nonimmediate_operand" ""))) + (const_vector:V8HI [(const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1)])) + (const_int 1))))] + "TARGET_SSE || TARGET_3DNOW" + "ix86_fixup_binary_operands_no_copy (PLUS, V8QImode, operands);") + +(define_insn "*mmx_uavgv8qi3" + [(set (match_operand:V8QI 0 "register_operand" "=y") + (truncate:V8QI + (lshiftrt:V8HI + (plus:V8HI + (plus:V8HI + (zero_extend:V8HI + (match_operand:V8QI 1 "nonimmediate_operand" "%0")) + (zero_extend:V8HI + (match_operand:V8QI 2 "nonimmediate_operand" "ym"))) + (const_vector:V8HI [(const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1)])) + (const_int 1))))] + "(TARGET_SSE || TARGET_3DNOW) + && ix86_binary_operator_ok (PLUS, V8QImode, operands)" +{ + /* These two instructions have the same operation, but their encoding + is different. Prefer the one that is de facto standard. */ + if (TARGET_SSE || TARGET_3DNOW_A) + return "pavgb\t{%2, %0|%0, %2}"; + else + return "pavgusb\t{%2, %0|%0, %2}"; +} + [(set_attr "type" "mmxshft") + (set (attr "prefix_extra") + (if_then_else + (eq (symbol_ref "(TARGET_SSE || TARGET_3DNOW_A)") (const_int 0)) + (const_string "1") + (const_string "*"))) + (set_attr "mode" "DI")]) + +(define_expand "mmx_uavgv4hi3" + [(set (match_operand:V4HI 0 "register_operand" "") + (truncate:V4HI + (lshiftrt:V4SI + (plus:V4SI + (plus:V4SI + (zero_extend:V4SI + (match_operand:V4HI 1 "nonimmediate_operand" "")) + (zero_extend:V4SI + (match_operand:V4HI 2 "nonimmediate_operand" ""))) + (const_vector:V4SI [(const_int 1) (const_int 1) + (const_int 1) (const_int 1)])) + (const_int 1))))] + "TARGET_SSE || TARGET_3DNOW_A" + "ix86_fixup_binary_operands_no_copy (PLUS, V4HImode, operands);") + +(define_insn "*mmx_uavgv4hi3" + [(set (match_operand:V4HI 0 "register_operand" "=y") + (truncate:V4HI + (lshiftrt:V4SI + (plus:V4SI + (plus:V4SI + (zero_extend:V4SI + (match_operand:V4HI 1 "nonimmediate_operand" "%0")) + (zero_extend:V4SI + (match_operand:V4HI 2 "nonimmediate_operand" "ym"))) + (const_vector:V4SI [(const_int 1) (const_int 1) + (const_int 1) (const_int 1)])) + (const_int 1))))] + "(TARGET_SSE || TARGET_3DNOW_A) + && ix86_binary_operator_ok (PLUS, V4HImode, operands)" + "pavgw\t{%2, %0|%0, %2}" + [(set_attr "type" "mmxshft") + (set_attr "mode" "DI")]) + +(define_insn "mmx_psadbw" + [(set (match_operand:V1DI 0 "register_operand" "=y") + (unspec:V1DI [(match_operand:V8QI 1 "register_operand" "0") + (match_operand:V8QI 2 "nonimmediate_operand" "ym")] + UNSPEC_PSADBW))] + "TARGET_SSE || TARGET_3DNOW_A" + "psadbw\t{%2, %0|%0, %2}" + [(set_attr "type" "mmxshft") + (set_attr "mode" "DI")]) + +(define_insn "mmx_pmovmskb" + [(set (match_operand:SI 0 "register_operand" "=r") + (unspec:SI [(match_operand:V8QI 1 "register_operand" "y")] + UNSPEC_MOVMSK))] + "TARGET_SSE || TARGET_3DNOW_A" + "pmovmskb\t{%1, %0|%0, %1}" + [(set_attr "type" "mmxcvt") + (set_attr "mode" "DI")]) + +(define_expand 
"mmx_maskmovq" + [(set (match_operand:V8QI 0 "memory_operand" "") + (unspec:V8QI [(match_operand:V8QI 1 "register_operand" "") + (match_operand:V8QI 2 "register_operand" "") + (match_dup 0)] + UNSPEC_MASKMOV))] + "TARGET_SSE || TARGET_3DNOW_A") + +(define_insn "*mmx_maskmovq" + [(set (mem:V8QI (match_operand:SI 0 "register_operand" "D")) + (unspec:V8QI [(match_operand:V8QI 1 "register_operand" "y") + (match_operand:V8QI 2 "register_operand" "y") + (mem:V8QI (match_dup 0))] + UNSPEC_MASKMOV))] + "(TARGET_SSE || TARGET_3DNOW_A) && !TARGET_64BIT" + ;; @@@ check ordering of operands in intel/nonintel syntax + "maskmovq\t{%2, %1|%1, %2}" + [(set_attr "type" "mmxcvt") + (set_attr "mode" "DI")]) + +(define_insn "*mmx_maskmovq_rex" + [(set (mem:V8QI (match_operand:DI 0 "register_operand" "D")) + (unspec:V8QI [(match_operand:V8QI 1 "register_operand" "y") + (match_operand:V8QI 2 "register_operand" "y") + (mem:V8QI (match_dup 0))] + UNSPEC_MASKMOV))] + "(TARGET_SSE || TARGET_3DNOW_A) && TARGET_64BIT" + ;; @@@ check ordering of operands in intel/nonintel syntax + "maskmovq\t{%2, %1|%1, %2}" + [(set_attr "type" "mmxcvt") + (set_attr "mode" "DI")]) + +(define_expand "mmx_emms" + [(match_par_dup 0 [(const_int 0)])] + "TARGET_MMX" +{ + int regno; + + operands[0] = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (17)); + + XVECEXP (operands[0], 0, 0) + = gen_rtx_UNSPEC_VOLATILE (VOIDmode, gen_rtvec (1, const0_rtx), + UNSPECV_EMMS); + + for (regno = 0; regno < 8; regno++) + { + XVECEXP (operands[0], 0, regno + 1) + = gen_rtx_CLOBBER (VOIDmode, + gen_rtx_REG (XFmode, FIRST_STACK_REG + regno)); + + XVECEXP (operands[0], 0, regno + 9) + = gen_rtx_CLOBBER (VOIDmode, + gen_rtx_REG (DImode, FIRST_MMX_REG + regno)); + } +}) + +(define_insn "*mmx_emms" + [(match_parallel 0 "emms_operation" + [(unspec_volatile [(const_int 0)] UNSPECV_EMMS)])] + "TARGET_MMX" + "emms" + [(set_attr "type" "mmx") + (set_attr "modrm" "0") + (set_attr "memory" "none")]) + +(define_expand "mmx_femms" + [(match_par_dup 0 [(const_int 0)])] + "TARGET_3DNOW" +{ + int regno; + + operands[0] = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (17)); + + XVECEXP (operands[0], 0, 0) + = gen_rtx_UNSPEC_VOLATILE (VOIDmode, gen_rtvec (1, const0_rtx), + UNSPECV_FEMMS); + + for (regno = 0; regno < 8; regno++) + { + XVECEXP (operands[0], 0, regno + 1) + = gen_rtx_CLOBBER (VOIDmode, + gen_rtx_REG (XFmode, FIRST_STACK_REG + regno)); + + XVECEXP (operands[0], 0, regno + 9) + = gen_rtx_CLOBBER (VOIDmode, + gen_rtx_REG (DImode, FIRST_MMX_REG + regno)); + } +}) + +(define_insn "*mmx_femms" + [(match_parallel 0 "emms_operation" + [(unspec_volatile [(const_int 0)] UNSPECV_FEMMS)])] + "TARGET_3DNOW" + "femms" + [(set_attr "type" "mmx") + (set_attr "modrm" "0") + (set_attr "memory" "none")]) diff --git a/gcc/config/i386/msformat-c.c b/gcc/config/i386/msformat-c.c new file mode 100644 index 000000000..513952e86 --- /dev/null +++ b/gcc/config/i386/msformat-c.c @@ -0,0 +1,197 @@ +/* Check calls to formatted I/O functions (-Wformat). + Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, + 2001, 2002, 2003, 2004, 2005, 2007, 2008, 2010 + Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 3, or (at your option) any later +version. 
+ +GCC is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. */ + +#include "config.h" +#include "system.h" +#include "coretypes.h" +#include "tm.h" +#include "tree.h" +#include "flags.h" +#include "c-family/c-common.h" +#include "intl.h" +#include "diagnostic.h" +#include "langhooks.h" +#include "c-family/c-format.h" +#include "alloc-pool.h" + +/* Mingw specific format attributes ms_printf, ms_scanf, and ms_strftime. */ + +static format_length_info ms_printf_length_specs[] = +{ + { "h", FMT_LEN_h, STD_C89, NULL, FMT_LEN_none, STD_C89, 0 }, + { "l", FMT_LEN_l, STD_C89, NULL, FMT_LEN_none, STD_C89, 0 }, + { "I32", FMT_LEN_l, STD_EXT, NULL, FMT_LEN_none, STD_C89, 1 }, + { "I64", FMT_LEN_ll, STD_EXT, NULL, FMT_LEN_none, STD_C89, 1 }, + { "I", FMT_LEN_L, STD_EXT, NULL, FMT_LEN_none, STD_C89, 1 }, + { NULL, FMT_LEN_none, STD_C89, NULL, FMT_LEN_none, STD_C89, 0 } +}; + +static const format_flag_spec ms_printf_flag_specs[] = +{ + { ' ', 0, 0, N_("' ' flag"), N_("the ' ' printf flag"), STD_C89 }, + { '+', 0, 0, N_("'+' flag"), N_("the '+' printf flag"), STD_C89 }, + { '#', 0, 0, N_("'#' flag"), N_("the '#' printf flag"), STD_C89 }, + { '0', 0, 0, N_("'0' flag"), N_("the '0' printf flag"), STD_C89 }, + { '-', 0, 0, N_("'-' flag"), N_("the '-' printf flag"), STD_C89 }, + { '\'', 0, 0, N_("''' flag"), N_("the ''' printf flag"), STD_EXT }, + { 'w', 0, 0, N_("field width"), N_("field width in printf format"), STD_C89 }, + { 'p', 0, 0, N_("precision"), N_("precision in printf format"), STD_C89 }, + { 'L', 0, 0, N_("length modifier"), N_("length modifier in printf format"), STD_C89 }, + { 0, 0, 0, NULL, NULL, STD_C89 } +}; + +static const format_flag_pair ms_printf_flag_pairs[] = +{ + { ' ', '+', 1, 0 }, + { '0', '-', 1, 0 }, { '0', 'p', 1, 'i' }, + { 0, 0, 0, 0 } +}; + +static const format_flag_spec ms_scanf_flag_specs[] = +{ + { '*', 0, 0, N_("assignment suppression"), N_("the assignment suppression scanf feature"), STD_C89 }, + { 'a', 0, 0, N_("'a' flag"), N_("the 'a' scanf flag"), STD_EXT }, + { 'w', 0, 0, N_("field width"), N_("field width in scanf format"), STD_C89 }, + { 'L', 0, 0, N_("length modifier"), N_("length modifier in scanf format"), STD_C89 }, + { '\'', 0, 0, N_("''' flag"), N_("the ''' scanf flag"), STD_EXT }, + { 0, 0, 0, NULL, NULL, STD_C89 } +}; + +static const format_flag_pair ms_scanf_flag_pairs[] = +{ + { '*', 'L', 0, 0 }, + { 0, 0, 0, 0 } +}; + +static const format_flag_spec ms_strftime_flag_specs[] = +{ + { '#', 0, 0, N_("'#' flag"), N_("the '#' strftime flag"), STD_EXT }, + { 0, 0, 0, NULL, NULL, STD_C89 } +}; + +static const format_flag_pair ms_strftime_flag_pairs[] = +{ + { 0, 0, 0, 0 } +}; + +static const format_char_info ms_print_char_table[] = +{ + /* C89 conversion specifiers. 
*/ + { "di", 0, STD_C89, { T89_I, BADLEN, T89_S, T89_L, T9L_LL, T99_SST, BADLEN, BADLEN, BADLEN, BADLEN, BADLEN, BADLEN }, "-wp0 +'", "i", NULL }, + { "oxX", 0, STD_C89, { T89_UI, BADLEN, T89_US, T89_UL, T9L_ULL, T99_ST, BADLEN, BADLEN, BADLEN, BADLEN, BADLEN, BADLEN }, "-wp0#", "i", NULL }, + { "u", 0, STD_C89, { T89_UI, BADLEN, T89_US, T89_UL, T9L_ULL, T99_ST, BADLEN, BADLEN, BADLEN, BADLEN, BADLEN, BADLEN }, "-wp0'", "i", NULL }, + { "fgG", 0, STD_C89, { T89_D, BADLEN, BADLEN, T99_D, BADLEN, BADLEN, BADLEN, BADLEN, BADLEN, BADLEN, BADLEN, BADLEN }, "-wp0 +#'", "", NULL }, + { "eE", 0, STD_C89, { T89_D, BADLEN, BADLEN, T99_D, BADLEN, BADLEN, BADLEN, BADLEN, BADLEN, BADLEN, BADLEN, BADLEN }, "-wp0 +#", "", NULL }, + { "c", 0, STD_C89, { T89_I, BADLEN, T89_S, T94_WI, BADLEN, BADLEN, BADLEN, BADLEN, BADLEN, BADLEN, BADLEN, BADLEN }, "-w", "", NULL }, + { "s", 1, STD_C89, { T89_C, BADLEN, T89_S, T94_W, BADLEN, BADLEN, BADLEN, BADLEN, BADLEN, BADLEN, BADLEN, BADLEN }, "-wp", "cR", NULL }, + { "p", 1, STD_C89, { T89_V, BADLEN, BADLEN, BADLEN, BADLEN, BADLEN, BADLEN, BADLEN, BADLEN, BADLEN, BADLEN, BADLEN }, "-w", "c", NULL }, + { "n", 1, STD_C89, { T89_I, BADLEN, T89_S, T89_L, T9L_LL, BADLEN, BADLEN, BADLEN, T99_IM, BADLEN, BADLEN, BADLEN }, "", "W", NULL }, + /* X/Open conversion specifiers. */ + { "C", 0, STD_EXT, { TEX_WI, BADLEN, T89_S, BADLEN, BADLEN, BADLEN, BADLEN, BADLEN, BADLEN, BADLEN, BADLEN, BADLEN }, "-w", "", NULL }, + { "S", 1, STD_EXT, { TEX_W, BADLEN, T89_S, BADLEN, BADLEN, BADLEN, BADLEN, BADLEN, BADLEN, BADLEN, BADLEN, BADLEN }, "-wp", "R", NULL }, + { NULL, 0, STD_C89, NOLENGTHS, NULL, NULL, NULL } +}; + +static const format_char_info ms_scan_char_table[] = +{ + /* C89 conversion specifiers. */ + { "di", 1, STD_C89, { T89_I, BADLEN, T89_S, T89_L, T9L_LL, T99_SST, BADLEN, BADLEN, BADLEN, BADLEN, BADLEN, BADLEN }, "*w'", "W", NULL }, + { "u", 1, STD_C89, { T89_UI, BADLEN, T89_US, T89_UL, T9L_ULL, T99_ST, BADLEN, BADLEN, BADLEN, BADLEN, BADLEN, BADLEN }, "*w'", "W", NULL }, + { "oxX", 1, STD_C89, { T89_UI, BADLEN, T89_US, T89_UL, T9L_ULL, T99_ST, BADLEN, BADLEN, BADLEN, BADLEN, BADLEN, BADLEN }, "*w", "W", NULL }, + { "efgEG", 1, STD_C89, { T89_F, BADLEN, BADLEN, T89_D, BADLEN, BADLEN, BADLEN, BADLEN, BADLEN, BADLEN, BADLEN, BADLEN }, "*w'", "W", NULL }, + { "c", 1, STD_C89, { T89_C, BADLEN, T89_S, T94_W, BADLEN, BADLEN, BADLEN, BADLEN, BADLEN, BADLEN, BADLEN, BADLEN }, "*w", "cW", NULL }, + { "s", 1, STD_C89, { T89_C, BADLEN, T89_S, T94_W, BADLEN, BADLEN, BADLEN, BADLEN, BADLEN, BADLEN, BADLEN, BADLEN }, "*aw", "cW", NULL }, + { "[", 1, STD_C89, { T89_C, BADLEN, BADLEN, T94_W, BADLEN, BADLEN, BADLEN, BADLEN, BADLEN, BADLEN, BADLEN, BADLEN }, "*aw", "cW[", NULL }, + { "p", 2, STD_C89, { T89_V, BADLEN, BADLEN, BADLEN, BADLEN, BADLEN, BADLEN, BADLEN, BADLEN, BADLEN, BADLEN, BADLEN }, "*w", "W", NULL }, + { "n", 1, STD_C89, { T89_I, BADLEN, T89_S, T89_L, T9L_LL, BADLEN, BADLEN, BADLEN, BADLEN, BADLEN, BADLEN, BADLEN }, "", "W", NULL }, + /* X/Open conversion specifiers. */ + { "C", 1, STD_EXT, { TEX_W, BADLEN, T89_S, BADLEN, BADLEN, BADLEN, BADLEN, BADLEN, BADLEN, BADLEN, BADLEN, BADLEN }, "*w", "W", NULL }, + { "S", 1, STD_EXT, { TEX_W, BADLEN, T89_S, BADLEN, BADLEN, BADLEN, BADLEN, BADLEN, BADLEN, BADLEN, BADLEN, BADLEN }, "*aw", "W", NULL }, + { NULL, 0, STD_C89, NOLENGTHS, NULL, NULL, NULL } +}; + +static const format_char_info ms_time_char_table[] = +{ + /* C89 conversion specifiers. 
*/ + { "ABZab", 0, STD_C89, NOLENGTHS, "#", "", NULL }, + { "cx", 0, STD_C89, NOLENGTHS, "#", "3", NULL }, + { "HIMSUWdmw", 0, STD_C89, NOLENGTHS, "#", "", NULL }, + { "j", 0, STD_C89, NOLENGTHS, "#", "", NULL }, + { "p", 0, STD_C89, NOLENGTHS, "#", "", NULL }, + { "X", 0, STD_C89, NOLENGTHS, "#", "", NULL }, + { "y", 0, STD_C89, NOLENGTHS, "#", "4", NULL }, + { "Y", 0, STD_C89, NOLENGTHS, "#", "", NULL }, + { "%", 0, STD_C89, NOLENGTHS, "", "", NULL }, + /* C99 conversion specifiers. */ + { "z", 0, STD_C99, NOLENGTHS, "#", "", NULL }, + { NULL, 0, STD_C89, NOLENGTHS, NULL, NULL, NULL } +}; + +EXPORTED_CONST format_kind_info mingw_format_attributes[3] = +{ + { "ms_printf", ms_printf_length_specs, ms_print_char_table, " +#0-'", NULL, + ms_printf_flag_specs, ms_printf_flag_pairs, + FMT_FLAG_ARG_CONVERT|FMT_FLAG_DOLLAR_MULTIPLE|FMT_FLAG_USE_DOLLAR|FMT_FLAG_EMPTY_PREC_OK, + 'w', 0, 'p', 0, 'L', 0, + &integer_type_node, &integer_type_node + }, + { "ms_scanf", ms_printf_length_specs, ms_scan_char_table, "*'", NULL, + ms_scanf_flag_specs, ms_scanf_flag_pairs, + FMT_FLAG_ARG_CONVERT|FMT_FLAG_SCANF_A_KLUDGE|FMT_FLAG_USE_DOLLAR|FMT_FLAG_ZERO_WIDTH_BAD|FMT_FLAG_DOLLAR_GAP_POINTER_OK, + 'w', 0, 0, '*', 'L', 0, + NULL, NULL + }, + { "ms_strftime", NULL, ms_time_char_table, "", "#", + ms_strftime_flag_specs, ms_strftime_flag_pairs, + FMT_FLAG_FANCY_PERCENT_OK, 0, 0, 0, 0, 0, 0, + NULL, NULL + } +}; + +/* Default overrides for printf, scanf and strftime. */ +EXPORTED_CONST target_ovr_attr mingw_format_attribute_overrides[4] = +{ + { "ms_printf", "printf" }, + { "ms_scanf", "scanf" }, + { "ms_strftime", "strftime" } +}; + +/* Setup for option Wpedantic-ms-format. */ + +#ifdef TARGET_OVERRIDES_FORMAT_INIT + +/* Make sure TARGET_OVERRIDES_FORMAT_INIT is prototyped. */ +extern void TARGET_OVERRIDES_FORMAT_INIT (void); + +/* Helper. */ +#define C89_OR_EXT (warn_pedantic_ms_format ? STD_EXT : STD_C89) + +void +TARGET_OVERRIDES_FORMAT_INIT (void) +{ + ms_printf_length_specs[2].std = C89_OR_EXT; /* I32 */ + ms_printf_length_specs[3].std = C89_OR_EXT; /* I64 */ + ms_printf_length_specs[4].std = C89_OR_EXT; /* I */ +} + +#undef C89_OR_EXT + +#endif diff --git a/gcc/config/i386/netbsd-elf.h b/gcc/config/i386/netbsd-elf.h new file mode 100644 index 000000000..264d290a3 --- /dev/null +++ b/gcc/config/i386/netbsd-elf.h @@ -0,0 +1,124 @@ +/* Definitions of target machine for GCC, + for i386/ELF NetBSD systems. + Copyright (C) 2001, 2002, 2004, 2007 Free Software Foundation, Inc. + Contributed by matthew green + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. */ + +#define TARGET_OS_CPP_BUILTINS() \ + do \ + { \ + NETBSD_OS_CPP_BUILTINS_ELF(); \ + } \ + while (0) + + +/* Extra specs needed for NetBSD/i386 ELF. */ + +#undef SUBTARGET_EXTRA_SPECS +#define SUBTARGET_EXTRA_SPECS \ + { "netbsd_cpp_spec", NETBSD_CPP_SPEC }, \ + { "netbsd_entry_point", NETBSD_ENTRY_POINT }, + + +/* Provide a LINK_SPEC appropriate for a NetBSD/i386 ELF target. 
*/ + +#undef LINK_SPEC +#define LINK_SPEC NETBSD_LINK_SPEC_ELF + +#define NETBSD_ENTRY_POINT "__start" + + +/* Provide a CPP_SPEC appropriate for NetBSD. */ + +#undef CPP_SPEC +#define CPP_SPEC "%(netbsd_cpp_spec)" + + +/* Make gcc agree with */ + +#undef SIZE_TYPE +#define SIZE_TYPE "unsigned int" + +#undef PTRDIFF_TYPE +#define PTRDIFF_TYPE "int" + +#undef ASM_APP_ON +#define ASM_APP_ON "#APP\n" + +#undef ASM_APP_OFF +#define ASM_APP_OFF "#NO_APP\n" + +#undef ASM_COMMENT_START +#define ASM_COMMENT_START "#" + +#undef DBX_REGISTER_NUMBER +#define DBX_REGISTER_NUMBER(n) svr4_dbx_register_map[n] + + +/* Output assembler code to FILE to call the profiler. */ + +#undef NO_PROFILE_COUNTERS +#define NO_PROFILE_COUNTERS 1 + +#undef FUNCTION_PROFILER +#define FUNCTION_PROFILER(FILE, LABELNO) \ +{ \ + if (flag_pic) \ + fprintf (FILE, "\tcall __mcount@PLT\n"); \ + else \ + fprintf (FILE, "\tcall __mcount\n"); \ +} + + +#undef HAS_INIT_SECTION + +/* This is how we tell the assembler that two symbols have the same value. */ + +#define ASM_OUTPUT_DEF(FILE,NAME1,NAME2) \ + do { assemble_name(FILE, NAME1); \ + fputs(" = ", FILE); \ + assemble_name(FILE, NAME2); \ + fputc('\n', FILE); } while (0) + +/* A C statement to output to the stdio stream FILE an assembler + command to advance the location counter to a multiple of 1<. */ + +#define TARGET_OS_CPP_BUILTINS() \ + do \ + { \ + NETBSD_OS_CPP_BUILTINS_AOUT(); \ + } \ + while (0) + +#define TARGET_VERSION fprintf (stderr, " (NetBSD/i386 a.out)"); + +/* This goes away when the math-emulator is fixed */ +#undef TARGET_SUBTARGET_DEFAULT +#define TARGET_SUBTARGET_DEFAULT \ + (MASK_80387 | MASK_IEEE_FP | MASK_FLOAT_RETURNS | MASK_NO_FANCY_MATH_387) + +#undef SUBTARGET_EXTRA_SPECS +#define SUBTARGET_EXTRA_SPECS \ + { "netbsd_cpp_spec", NETBSD_CPP_SPEC }, + +#undef CPP_SPEC +#define CPP_SPEC "%(netbsd_cpp_spec)" + + +#undef SIZE_TYPE +#define SIZE_TYPE "unsigned int" + +#undef PTRDIFF_TYPE +#define PTRDIFF_TYPE "int" + +#undef ASM_APP_ON +#define ASM_APP_ON "#APP\n" + +#undef ASM_APP_OFF +#define ASM_APP_OFF "#NO_APP\n" + +/* Don't default to pcc-struct-return, because gcc is the only compiler, and + we want to retain compatibility with older gcc versions. */ +#define DEFAULT_PCC_STRUCT_RETURN 0 + +/* i386 netbsd still uses old binutils that don't insert nops by default + when the .align directive demands to insert extra space in the text + segment. */ +#undef ASM_OUTPUT_ALIGN +#define ASM_OUTPUT_ALIGN(FILE,LOG) \ + if ((LOG)!=0) fprintf ((FILE), "\t.align %d,0x90\n", (LOG)) + +/* Profiling routines, partially copied from i386/osfrose.h. */ + +/* Redefine this to use %eax instead of %edx. */ +#undef FUNCTION_PROFILER +#define FUNCTION_PROFILER(FILE, LABELNO) \ +{ \ + if (flag_pic) \ + { \ + fprintf (FILE, "\tcall mcount@PLT\n"); \ + } \ + else \ + { \ + fprintf (FILE, "\tcall mcount\n"); \ + } \ +} + +/* Until they use ELF or something that handles dwarf2 unwinds + and initialization stuff better. */ +#define DWARF2_UNWIND_INFO 0 + +/* Redefine this so that it becomes "_GLOBAL_OFFSET_TABLE_" when the label + prefix is added. */ +#undef GOT_SYMBOL_NAME +#define GOT_SYMBOL_NAME "GLOBAL_OFFSET_TABLE_" + +/* Attempt to enable execute permissions on the stack. 
*/ +#define ENABLE_EXECUTE_STACK NETBSD_ENABLE_EXECUTE_STACK diff --git a/gcc/config/i386/netbsd64.h b/gcc/config/i386/netbsd64.h new file mode 100644 index 000000000..5add1032c --- /dev/null +++ b/gcc/config/i386/netbsd64.h @@ -0,0 +1,72 @@ +/* Definitions of target machine for GCC, + for x86-64/ELF NetBSD systems. + Copyright (C) 2002, 2004, 2007 Free Software Foundation, Inc. + Contributed by Wasabi Systems, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. */ + +#define TARGET_OS_CPP_BUILTINS() \ + do \ + { \ + NETBSD_OS_CPP_BUILTINS_ELF(); \ + } \ + while (0) + + +/* Extra specs needed for NetBSD/x86-64 ELF. */ + +#undef SUBTARGET_EXTRA_SPECS +#define SUBTARGET_EXTRA_SPECS \ + { "netbsd_cpp_spec", NETBSD_CPP_SPEC }, \ + { "netbsd_link_spec", NETBSD_LINK_SPEC_ELF }, \ + { "netbsd_entry_point", NETBSD_ENTRY_POINT }, + + +/* Provide a LINK_SPEC appropriate for a NetBSD/x86-64 ELF target. */ + +#undef LINK_SPEC +#define LINK_SPEC \ + "%{m32:-m elf_i386} \ + %{m64:-m elf_x86_64} \ + %(netbsd_link_spec)" + +#define NETBSD_ENTRY_POINT "_start" + + +/* Provide a CPP_SPEC appropriate for NetBSD. */ + +#undef CPP_SPEC +#define CPP_SPEC "%(netbsd_cpp_spec)" + + +/* Output assembler code to FILE to call the profiler. */ + +#undef FUNCTION_PROFILER +#define FUNCTION_PROFILER(FILE, LABELNO) \ +{ \ + if (TARGET_64BIT && flag_pic) \ + fprintf (FILE, "\tcall *__mcount@PLT\n"); \ + else if (flag_pic) \ + fprintf (FILE, "\tcall *__mcount@PLT\n"); \ + else \ + fprintf (FILE, "\tcall __mcount\n"); \ +} + +/* Attempt to enable execute permissions on the stack. */ +#define ENABLE_EXECUTE_STACK NETBSD_ENABLE_EXECUTE_STACK + +#define TARGET_VERSION fprintf (stderr, " (NetBSD/x86_64 ELF)"); diff --git a/gcc/config/i386/netware-crt0.c b/gcc/config/i386/netware-crt0.c new file mode 100644 index 000000000..03141ab99 --- /dev/null +++ b/gcc/config/i386/netware-crt0.c @@ -0,0 +1,79 @@ +/* Startup routines for NetWare. + Contributed by Jan Beulich (jbeulich@novell.com) + Copyright (C) 2004, 2007 Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. 
*/ + +#include +#include +#include "unwind-dw2-fde.h" + +int __init_environment (void *); +int __deinit_environment (void *); + + +#define SECTION_DECL(name, decl) decl __attribute__((__section__(name))) + +SECTION_DECL(".ctors", void(*const __CTOR_LIST__)(void)) + = (void(*)(void))(intptr_t)-1; +SECTION_DECL(".ctors$_", void(*const __CTOR_END__)(void)) = NULL; + +SECTION_DECL(".dtors", void(*const __DTOR_LIST__)(void)) + = (void(*)(void))(intptr_t)-1; +SECTION_DECL(".dtors$_", void(*const __DTOR_END__)(void)) = NULL; + +/* No need to use the __[de]register_frame_info_bases functions since + for us the bases are NULL always anyway. */ +void __register_frame_info (const void *, struct object *) + __attribute__((__weak__)); +void *__deregister_frame_info (const void *) __attribute__((__weak__)); + +SECTION_DECL(".eh_frame", /*const*/ uint32_t __EH_FRAME_BEGIN__[]) = { }; +SECTION_DECL(".eh_frame$_", /*const*/ uint32_t __EH_FRAME_END__[]) = {0}; + +int +__init_environment (void *unused __attribute__((__unused__))) +{ + void (* const * pctor)(void); + static struct object object; + + if (__register_frame_info) + __register_frame_info (__EH_FRAME_BEGIN__, &object); + + for (pctor = &__CTOR_END__ - 1; pctor > &__CTOR_LIST__; --pctor) + if (*pctor != NULL) + (*pctor)(); + + return 0; +} + +int +__deinit_environment (void *unused __attribute__((__unused__))) +{ + /* This should be static to prevent calling the same destructor + twice (just in case where we get here multiple times). */ + static void (* const * pdtor)(void) = &__DTOR_LIST__ + 1; + + while (pdtor < &__DTOR_END__) + if (*pdtor++ != NULL) + pdtor[-1] (); + + if (__deregister_frame_info) + __deregister_frame_info(__EH_FRAME_BEGIN__); + + return 0; +} diff --git a/gcc/config/i386/netware-libgcc.c b/gcc/config/i386/netware-libgcc.c new file mode 100644 index 000000000..0925d872a --- /dev/null +++ b/gcc/config/i386/netware-libgcc.c @@ -0,0 +1,58 @@ +/* Startup code for libgcc_s.nlm, necessary because we can't allow + libgcc_s to use libc's malloc & Co., which associate allocations + with the NLM owning the current (application) thread. + Contributed by Jan Beulich (jbeulich@novell.com) + Copyright (C) 2004, 2007 Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. */ + +#include +#include +#include +#include + +static rtag_t allocRTag; + +BOOL +DllMain (HINSTANCE libraryId __attribute__ ((__unused__)), + DWORD reason, void *hModule) +{ + switch (reason) + { + case DLL_NLM_STARTUP: + allocRTag = AllocateResourceTag (hModule, + "libgcc memory", AllocSignature); + return allocRTag != NULL; + case DLL_NLM_SHUTDOWN: + /* This does not recover resources associated with the tag... 
+ ReturnResourceTag (allocRTag, 0); */ + break; + } + return 1; +} + +void * +malloc (size_t size) +{ + return AllocSleepOK (size, allocRTag, NULL); +} + +void +free (void *ptr) +{ + Free (ptr); +} diff --git a/gcc/config/i386/netware-libgcc.def b/gcc/config/i386/netware-libgcc.def new file mode 100644 index 000000000..a545631b1 --- /dev/null +++ b/gcc/config/i386/netware-libgcc.def @@ -0,0 +1,2 @@ +description "gcc runtime and intrinsics support" +copyright "Copyright (C) 1989-2005 Free Software Foundation, Inc." diff --git a/gcc/config/i386/netware-libgcc.exp b/gcc/config/i386/netware-libgcc.exp new file mode 100644 index 000000000..309cf7549 --- /dev/null +++ b/gcc/config/i386/netware-libgcc.exp @@ -0,0 +1,83 @@ +# libgcc_s.nlm exports + (libgcc2), + __absvdi2, + __absvsi2, + __addvdi3, + __addvsi3, +# __ashldi3, +# __ashrdi3, + __bswapdi2, + __bswapsi2, + __clzdi2, + __clzsi2, + __ctzdi2, + __ctzsi2, + __deregister_frame, + __deregister_frame_info, + __deregister_frame_info_bases, + __divdc3, +# __divdi3, + __divsc3, +# __divtc3, + __divxc3, + __emutls_get_address, + __emutls_register_common, + __ffsdi2, + __ffssi2, + __fixunsdfdi, + __fixunssfdi, +# __fixunstfdi, + __fixunsxfdi, + __floatundisf, + __floatundidf, +# __floatunditf, + __floatundixf, + __gcc_bcmp, + __gcc_personality_v0, +# __lshrdi3, +# __moddi3, + __muldc3, +# __muldi3, + __mulsc3, +# __multc3, + __mulvdi3, + __mulvsi3, + __mulxc3, + __negvdi2, + __negvsi2, + __paritydi2, + __paritysi2, + __popcountdi2, + __popcountsi2, + __powidf2 + __powisf2 +# __powitf2 + __powixf2 + __register_frame, + __register_frame_info, + __register_frame_info_bases, + __register_frame_info_table, + __register_frame_info_table_bases, + __register_frame_table, + __subvdi3, + __subvsi3, +# __umoddi3, +# __udivdi3, + _Unwind_Backtrace, + _Unwind_DeleteException, + _Unwind_FindEnclosingFunction, + _Unwind_Find_FDE, + _Unwind_ForcedUnwind, + _Unwind_GetCFA, + _Unwind_GetDataRelBase, + _Unwind_GetGR, + _Unwind_GetIP, + _Unwind_GetIPInfo, + _Unwind_GetLanguageSpecificData, + _Unwind_GetRegionStart, + _Unwind_GetTextRelBase, + _Unwind_RaiseException, + _Unwind_Resume, + _Unwind_Resume_or_Rethrow, + _Unwind_SetGR, + _Unwind_SetIP diff --git a/gcc/config/i386/netware.c b/gcc/config/i386/netware.c new file mode 100644 index 000000000..2232dbf6e --- /dev/null +++ b/gcc/config/i386/netware.c @@ -0,0 +1,229 @@ +/* Subroutines for insn-output.c for NetWare. + Contributed by Jan Beulich (jbeulich@novell.com) + Copyright (C) 2004, 2005, 2007, 2008, 2009, 2010 + Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. 
*/ + +#include "config.h" +#include "system.h" +#include "coretypes.h" +#include "tm.h" +#include "rtl.h" +#include "regs.h" +#include "hard-reg-set.h" +#include "output.h" +#include "tree.h" +#include "flags.h" +#include "tm_p.h" +#include "diagnostic-core.h" +#include "langhooks.h" +#include "ggc.h" + +/* Return string which is the function name, identified by ID, modified + with PREFIX and a suffix consisting of an atsign (@) followed by the + number of bytes of arguments. If ID is NULL use the DECL_NAME as base. + Return NULL if no change required. */ + +static tree +gen_stdcall_or_fastcall_decoration (tree decl, tree id, char prefix) +{ + unsigned HOST_WIDE_INT total = 0; + const char *old_str = IDENTIFIER_POINTER (id != NULL_TREE ? id : DECL_NAME (decl)); + char *new_str; + tree type = TREE_TYPE (decl); + + if (prototype_p (type)) + { + tree arg; + function_args_iterator args_iter; + + /* This attribute is ignored for variadic functions. */ + if (stdarg_p (type)) + return NULL_TREE; + + /* Quit if we hit an incomplete type. Error is reported + by convert_arguments in c-typeck.c or cp/typeck.c. */ + FOREACH_FUNCTION_ARGS(type, arg, args_iter) + { + HOST_WIDE_INT parm_size; + unsigned HOST_WIDE_INT parm_boundary_bytes; + + if (! COMPLETE_TYPE_P (arg)) + break; + + parm_size = int_size_in_bytes (arg); + if (parm_size < 0) + break; + + parm_boundary_bytes = PARM_BOUNDARY / BITS_PER_UNIT; + + /* Must round up to include padding. This is done the same + way as in store_one_arg. */ + total += (parm_size + parm_boundary_bytes - 1) + / parm_boundary_bytes * parm_boundary_bytes; + } + } + + new_str = XALLOCAVEC (char, 1 + strlen (old_str) + 1 + 10 + 1); + sprintf (new_str, "%c%s@" HOST_WIDE_INT_PRINT_UNSIGNED, + prefix, old_str, total); + + return get_identifier (new_str); +} + +/* Return string which is the function name, identified by ID, modified + with an _n@ prefix (where n represents the number of arguments passed in + registers). If ID is NULL use the DECL_NAME as base. + Return NULL if no change required. */ + +static tree +gen_regparm_prefix (tree decl, tree id, unsigned int nregs) +{ + unsigned HOST_WIDE_INT total = 0; + const char *old_str = IDENTIFIER_POINTER (id != NULL_TREE ? id : DECL_NAME (decl)); + char *new_str; + tree type = TREE_TYPE (decl); + + if (prototype_p (type)) + { + tree arg; + function_args_iterator args_iter; + + /* This attribute is ignored for variadic functions. */ + if (stdarg_p (type)) + return NULL_TREE; + + /* Quit if we hit an incomplete type. Error is reported + by convert_arguments in c-typeck.c or cp/typeck.c. */ + FOREACH_FUNCTION_ARGS(type, arg, args_iter) + { + HOST_WIDE_INT parm_size; + unsigned HOST_WIDE_INT parm_boundary_bytes; + + if (! COMPLETE_TYPE_P (arg)) + break; + + parm_size = int_size_in_bytes (arg); + if (parm_size < 0) + break; + + parm_boundary_bytes = PARM_BOUNDARY / BITS_PER_UNIT; + + /* Must round up to include padding. This is done the same + way as in store_one_arg. */ + total += (parm_size + parm_boundary_bytes - 1) + / parm_boundary_bytes * parm_boundary_bytes; + } + } + + if (nregs > total / UNITS_PER_WORD) + nregs = total / UNITS_PER_WORD; + gcc_assert (nregs <= 9); + new_str = XALLOCAVEC (char, 3 + strlen (old_str) + 1); + sprintf (new_str, "_%u@%s", nregs, old_str); + + return get_identifier (new_str); +} + +/* Maybe decorate and get a new identifier for the DECL of a stdcall or + fastcall function. The original identifier is supplied in ID. 
*/ + +static tree +i386_nlm_maybe_mangle_decl_assembler_name (tree decl, tree id) +{ + tree type_attributes = TYPE_ATTRIBUTES (TREE_TYPE (decl)); + tree new_id; + + if (lookup_attribute ("stdcall", type_attributes)) + new_id = gen_stdcall_or_fastcall_decoration (decl, id, '_'); + else if (lookup_attribute ("fastcall", type_attributes)) + new_id = gen_stdcall_or_fastcall_decoration (decl, id, FASTCALL_PREFIX); + else if ((new_id = lookup_attribute ("regparm", type_attributes))) + new_id = gen_regparm_prefix (decl, id, + TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (new_id)))); + else + new_id = NULL_TREE; + + return new_id; +} + +/* This is used as a target hook to modify the DECL_ASSEMBLER_NAME + in the language-independent default hook + langhooks.c:lhd_set_decl_assembler_name () + and in cp/mangle.c:mangle_decl (). */ +tree +i386_nlm_mangle_decl_assembler_name (tree decl, tree id) +{ + tree new_id = TREE_CODE (decl) == FUNCTION_DECL + ? i386_nlm_maybe_mangle_decl_assembler_name (decl, id) + : NULL_TREE; + + return (new_id ? new_id : id); +} + +void +i386_nlm_encode_section_info (tree decl, rtx rtl, int first) +{ + default_encode_section_info (decl, rtl, first); + + if (TREE_CODE (decl) == FUNCTION_DECL + /* Do not change the identifier if a verbatim asmspec + or if stdcall suffix already added. */ + && *IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)) != '*' + && !strchr (IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)), '@') + /* FIXME: Imported stdcall names are not modified by the Ada frontend. + Check and decorate the RTL name now. */ + && strcmp (lang_hooks.name, "GNU Ada") == 0) + { + rtx symbol = XEXP (rtl, 0); + tree new_id; + tree old_id = DECL_ASSEMBLER_NAME (decl); + + gcc_assert (GET_CODE (symbol) == SYMBOL_REF); + + if ((new_id = i386_nlm_maybe_mangle_decl_assembler_name (decl, old_id))) + XSTR (symbol, 0) = IDENTIFIER_POINTER (new_id); + } +} + +/* Strip the stdcall/fastcall/regparm pre-/suffix. */ + +const char * +i386_nlm_strip_name_encoding (const char *str) +{ + const char *name = default_strip_name_encoding (str); + + if (*str != '*' && (*name == '_' || *name == '@')) + { + const char *p = strchr (name + 1, '@'); + + if (p) + { + ++name; + if (ISDIGIT (p[1])) + name = ggc_alloc_string (name, p - name); + else + { + gcc_assert (ISDIGIT (*name)); + name++; + gcc_assert (name == p); + } + } + } + return name; +} diff --git a/gcc/config/i386/netware.h b/gcc/config/i386/netware.h new file mode 100644 index 000000000..7f63f4518 --- /dev/null +++ b/gcc/config/i386/netware.h @@ -0,0 +1,177 @@ +/* Core target definitions for GCC for Intel 80x86 running Netware. + and using dwarf for the debugging format. + Copyright (C) 1993, 1994, 2004, 2007, 2008, 2009, 2010, 2011 + Free Software Foundation, Inc. + + Written by David V. Henkel-Wallace (gumby@cygnus.com) + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. 
*/ + +#define TARGET_VERSION fprintf (stderr, " (x86 NetWare)"); + +#undef CPP_SPEC +#define CPP_SPEC "%{posix:-D_POSIX_SOURCE} %{pthread:-D_REENTRANT}" + +#undef ASM_SPEC +#define ASM_SPEC "" + +#undef LIB_SPEC +#define LIB_SPEC "" + +/* Kinda useless, but what the hell */ +#undef LINK_SPEC +#define LINK_SPEC "%{h*} %{v:-V}" + +#undef STARTFILE_SPEC +#define STARTFILE_SPEC "" + +#undef ENDFILE_SPEC +#define ENDFILE_SPEC "" + +#undef RELATIVE_PREFIX_NOT_LINKDIR +#undef LIBGCC_SPEC + +#define TARGET_OS_CPP_BUILTINS() \ + do \ + { \ + builtin_define_std ("IAPX386"); \ + builtin_define ("_M_IX86=300"); \ + builtin_define ("__netware__"); \ + builtin_assert ("system=netware"); \ + builtin_define ("__ELF__"); \ + builtin_define ("__cdecl=__attribute__((__cdecl__))"); \ + builtin_define ("__stdcall=__attribute__((__stdcall__))"); \ + builtin_define ("__fastcall=__attribute__((__fastcall__))"); \ + if (!flag_iso) \ + { \ + builtin_define ("_cdecl=__attribute__((__cdecl__))"); \ + builtin_define ("_stdcall=__attribute__((__stdcall__))"); \ + builtin_define ("_fastcall=__attribute__((__fastcall__))"); \ + } \ + } \ + while (0) + +#undef TARGET_CPU_DEFAULT +#define TARGET_CPU_DEFAULT TARGET_CPU_DEFAULT_pentium4 + +/* By default, target has a 80387, uses IEEE compatible arithmetic, + returns float values in the 387, and uses MSVC bit field layout. */ +#undef TARGET_SUBTARGET_DEFAULT +#define TARGET_SUBTARGET_DEFAULT (MASK_80387 | MASK_IEEE_FP | \ + MASK_FLOAT_RETURNS | MASK_ALIGN_DOUBLE | MASK_MS_BITFIELD_LAYOUT) + +/* Don't allow flag_pic to propagate since invalid relocations will + result otherwise. */ +#define SUBTARGET_OVERRIDE_OPTIONS \ +do { \ + if (flag_pic) \ + { \ + error ("-fPIC and -fpic are not supported for this target"); \ + flag_pic = 0; \ + } \ +} while (0) + +#undef MATH_LIBRARY +#define MATH_LIBRARY "" + +/* Align doubles and long-longs in structures on qword boundaries. */ +#undef BIGGEST_FIELD_ALIGNMENT +#define BIGGEST_FIELD_ALIGNMENT 64 + +#undef DEFAULT_PCC_STRUCT_RETURN +#define DEFAULT_PCC_STRUCT_RETURN 0 + +/* Implicit arguments pointing to aggregate return values are to be + removed by the caller. */ +#undef KEEP_AGGREGATE_RETURN_POINTER +#define KEEP_AGGREGATE_RETURN_POINTER 1 + +#undef ASM_COMMENT_START +#define ASM_COMMENT_START "#" + +#undef DBX_REGISTER_NUMBER +#define DBX_REGISTER_NUMBER(n) (svr4_dbx_register_map[n]) + +/* Default structure packing is 1-byte. */ +#define TARGET_DEFAULT_PACK_STRUCT 1 + +#undef SIZE_TYPE +#define SIZE_TYPE "unsigned int" + +#undef PTRDIFF_TYPE +#define PTRDIFF_TYPE "int" + +#undef WCHAR_TYPE +#define WCHAR_TYPE "short unsigned int" + +#undef WCHAR_TYPE_SIZE +#define WCHAR_TYPE_SIZE 16 + +#undef WINT_TYPE +#define WINT_TYPE "int" + +/* A C statement (sans semicolon) to output to the stdio stream + FILE the assembler definition of uninitialized global DECL named + NAME whose size is SIZE bytes and alignment is ALIGN bytes. + Try to use asm_output_aligned_bss to implement this macro. */ + +#define ASM_OUTPUT_ALIGNED_BSS(FILE, DECL, NAME, SIZE, ALIGN) \ + asm_output_aligned_bss (FILE, DECL, NAME, SIZE, ALIGN) + +/* Handle special EH pointer encodings. Absolute, pc-relative, and + indirect are handled automatically. */ +#define ASM_MAYBE_OUTPUT_ENCODED_ADDR_RTX(FILE, ENCODING, SIZE, ADDR, DONE) \ + do { \ + if ((SIZE) == 4 && ((ENCODING) & 0x70) == DW_EH_PE_datarel) \ + { \ + fputs (ASM_LONG, FILE); \ + assemble_name (FILE, XSTR (ADDR, 0)); \ + fputs (((ENCODING) & DW_EH_PE_indirect ? 
"@GOT" : "@GOTOFF"), FILE); \ + goto DONE; \ + } \ + } while (0) + +/* there is no TLS support in NLMs/on NetWare */ +#undef HAVE_AS_TLS + +#define HAS_INIT_SECTION +#undef INIT_SECTION_ASM_OP + +#define CTOR_LISTS_DEFINED_EXTERNALLY + +#undef READONLY_DATA_SECTION_ASM_OP +#define READONLY_DATA_SECTION_ASM_OP ".section\t.rodata" + +/* Define this macro if references to a symbol must be treated + differently depending on something about the variable or + function named by the symbol (such as what section it is in). + + On i386 running NetWare, modify the assembler name with an underscore (_) + or atsign (@) prefix and a suffix consisting of an atsign (@) followed by + a string of digits that represents the number of bytes of arguments passed + to the function, if it has the attribute STDCALL. Alternatively, if it has + the REGPARM attribute, prefix it with an underscore (_), a digit + representing the number of registers used, and an atsign (@). */ +void i386_nlm_encode_section_info (tree, rtx, int); +extern tree i386_nlm_mangle_decl_assembler_name (tree, tree); +const char *i386_nlm_strip_name_encoding (const char *); +#define SUBTARGET_ENCODE_SECTION_INFO i386_nlm_encode_section_info +#define TARGET_MANGLE_DECL_ASSEMBLER_NAME i386_nlm_mangle_decl_assembler_name +#undef TARGET_STRIP_NAME_ENCODING +#define TARGET_STRIP_NAME_ENCODING i386_nlm_strip_name_encoding + +#define TARGET_POSIX_IO diff --git a/gcc/config/i386/netware.opt b/gcc/config/i386/netware.opt new file mode 100644 index 000000000..e1d903a2f --- /dev/null +++ b/gcc/config/i386/netware.opt @@ -0,0 +1,33 @@ +; Netware options. + +; Copyright (C) 2011 +; Free Software Foundation, Inc. +; +; This file is part of GCC. +; +; GCC is free software; you can redistribute it and/or modify it under +; the terms of the GNU General Public License as published by the Free +; Software Foundation; either version 3, or (at your option) any later +; version. +; +; GCC is distributed in the hope that it will be useful, but WITHOUT ANY +; WARRANTY; without even the implied warranty of MERCHANTABILITY or +; FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +; for more details. +; +; You should have received a copy of the GNU General Public License +; along with GCC; see the file COPYING3. If not see +; . + +; See the GCC internals manual (options.texi) for a description of +; this file's format. + +; Please try to keep this file in ASCII collating order. + +posix +Driver + +pthread +Driver + +; This comment is to ensure we retain the blank line above. diff --git a/gcc/config/i386/nmmintrin.h b/gcc/config/i386/nmmintrin.h new file mode 100644 index 000000000..2a2d264c6 --- /dev/null +++ b/gcc/config/i386/nmmintrin.h @@ -0,0 +1,37 @@ +/* Copyright (C) 2007, 2009 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. 
+ + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . + +/* Implemented from the specification included in the Intel C++ Compiler + User Guide and Reference, version 10.0. */ + +#ifndef _NMMINTRIN_H_INCLUDED +#define _NMMINTRIN_H_INCLUDED + +#ifndef __SSE4_2__ +# error "SSE4.2 instruction set not enabled" +#else +/* We just include SSE4.1 header file. */ +#include +#endif /* __SSE4_2__ */ + +#endif /* _NMMINTRIN_H_INCLUDED */ diff --git a/gcc/config/i386/nto.h b/gcc/config/i386/nto.h new file mode 100644 index 000000000..0a54ce02c --- /dev/null +++ b/gcc/config/i386/nto.h @@ -0,0 +1,108 @@ +/* Definitions for Intel 386 running QNX/Neutrino. + Copyright (C) 2002, 2003, 2007, 2010 Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. */ + +#undef DEFAULT_PCC_STRUCT_RETURN +#define DEFAULT_PCC_STRUCT_RETURN 1 + +#undef TARGET_VERSION +#define TARGET_VERSION fprintf (stderr, " (QNX/Neutrino/i386 ELF)"); + +#undef TARGET_OS_CPP_BUILTINS +#define TARGET_OS_CPP_BUILTINS() \ + do \ + { \ + builtin_define ("__X86__"); \ + builtin_define ("__QNXNTO__"); \ + builtin_define ("__QNX__"); \ + builtin_define ("__ELF__"); \ + builtin_define ("__LITTLEENDIAN__"); \ + builtin_assert ("system=qnx"); \ + builtin_assert ("system=qnxnto"); \ + builtin_assert ("system=nto"); \ + builtin_assert ("system=unix"); \ + } \ + while (0) + +#undef THREAD_MODEL_SPEC +#define THREAD_MODEL_SPEC "posix" + +#ifdef CROSS_DIRECTORY_STRUCTURE +#define SYSROOT_SUFFIX_SPEC "x86" +#endif + +#ifndef CROSS_DIRECTORY_STRUCTURE +#undef MD_EXEC_PREFIX +#define MD_EXEC_PREFIX "/usr/ccs/bin/" + +#undef MD_STARTFILE_PREFIX +#define MD_STARTFILE_PREFIX "/usr/ccs/lib/" +#endif + +#undef STARTFILE_SPEC +#define STARTFILE_SPEC \ +"%{!shared: \ + %{!symbolic: \ + %{pg:mcrt1.o%s} \ + %{!pg:%{p:mcrt1.o%s} \ + %{!p:crt1.o%s}}}} \ +crti.o%s \ +%{fexceptions: crtbegin.o%s} \ +%{!fexceptions: %R/lib/crtbegin.o}" + +#undef ENDFILE_SPEC +#define ENDFILE_SPEC \ + "crtend.o%s crtn.o%s" + +#undef LINK_SPEC +#define LINK_SPEC \ + "%{h*} %{v:-V} \ + %{static:-dn -Bstatic} \ + %{shared:-G -dy -z text} \ + %{symbolic:-Bsymbolic -G -dy -z text} \ + %{G:-G} \ + %{YP,*} \ + %{!YP,*:%{p:-Y P,%R/lib} \ + %{!p:-Y P,%R/lib}} \ + %{Qy:} %{!Qn:-Qy} \ + -m i386nto \ + %{!shared: --dynamic-linker /usr/lib/ldqnx.so.2}" + +#undef LIB_SPEC +#define LIB_SPEC "%{!shared:%{!symbolic:-lc}}" + +#undef ASM_SPEC +#define ASM_SPEC "" + +#undef SIZE_TYPE +#define SIZE_TYPE "unsigned int" + +#undef PTRDIFF_TYPE +#define PTRDIFF_TYPE "int" + +#undef WCHAR_TYPE +#define WCHAR_TYPE "long unsigned int" + +#undef WCHAR_TYPE_SIZE +#define WCHAR_TYPE_SIZE BITS_PER_WORD + +#define NO_IMPLICIT_EXTERN_C 1 + +#define TARGET_POSIX_IO + +#undef DBX_REGISTER_NUMBER diff --git a/gcc/config/i386/nto.opt b/gcc/config/i386/nto.opt new file mode 100644 index 
000000000..ddfaa90c9 --- /dev/null +++ b/gcc/config/i386/nto.opt @@ -0,0 +1,33 @@ +; QNX options. + +; Copyright (C) 2011 +; Free Software Foundation, Inc. +; +; This file is part of GCC. +; +; GCC is free software; you can redistribute it and/or modify it under +; the terms of the GNU General Public License as published by the Free +; Software Foundation; either version 3, or (at your option) any later +; version. +; +; GCC is distributed in the hope that it will be useful, but WITHOUT ANY +; WARRANTY; without even the implied warranty of MERCHANTABILITY or +; FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +; for more details. +; +; You should have received a copy of the GNU General Public License +; along with GCC; see the file COPYING3. If not see +; . + +; See the GCC internals manual (options.texi) for a description of +; this file's format. + +; Please try to keep this file in ASCII collating order. + +G +Driver + +YP, +Driver Joined + +; This comment is to ensure we retain the blank line above. diff --git a/gcc/config/i386/nwld.c b/gcc/config/i386/nwld.c new file mode 100644 index 000000000..05d1a92d1 --- /dev/null +++ b/gcc/config/i386/nwld.c @@ -0,0 +1,73 @@ +/* Subroutines for insn-output.c for NetWare. + Contributed by Jan Beulich (jbeulich@novell.com) + Copyright (C) 2004, 2007, 2010 Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. */ + +#include "config.h" +#include "system.h" +#include "coretypes.h" +#include "tm.h" +#include "rtl.h" +#include "regs.h" +#include "hard-reg-set.h" +#include "output.h" +#include "tree.h" +#include "flags.h" +#include "tm_p.h" +#include "diagnostic-core.h" + +void +nwld_named_section_asm_out_constructor (rtx symbol, int priority) +{ +#if !SUPPORTS_INIT_PRIORITY + const char section[] = ".ctors"TARGET_SUB_SECTION_SEPARATOR; +#else + char section[20]; + + sprintf (section, + ".ctors"TARGET_SUB_SECTION_SEPARATOR"%.5u", + /* Invert the numbering so the linker puts us in the proper + order; constructors are run from right to left, and the + linker sorts in increasing order. */ + MAX_INIT_PRIORITY - priority); +#endif + + switch_to_section (get_section (section, 0, NULL)); + assemble_align (POINTER_SIZE); + assemble_integer (symbol, POINTER_SIZE / BITS_PER_UNIT, POINTER_SIZE, 1); +} + +void +nwld_named_section_asm_out_destructor (rtx symbol, int priority) +{ +#if !SUPPORTS_INIT_PRIORITY + const char section[] = ".dtors"TARGET_SUB_SECTION_SEPARATOR; +#else + char section[20]; + + sprintf (section, ".dtors"TARGET_SUB_SECTION_SEPARATOR"%.5u", + /* Invert the numbering so the linker puts us in the proper + order; destructors are run from left to right, and the + linker sorts in increasing order. 
*/ + MAX_INIT_PRIORITY - priority); +#endif + + switch_to_section (get_section (section, 0, NULL)); + assemble_align (POINTER_SIZE); + assemble_integer (symbol, POINTER_SIZE / BITS_PER_UNIT, POINTER_SIZE, 1); +} diff --git a/gcc/config/i386/nwld.h b/gcc/config/i386/nwld.h new file mode 100644 index 000000000..6d8e54ff9 --- /dev/null +++ b/gcc/config/i386/nwld.h @@ -0,0 +1,69 @@ +/* nwld.h -- defines to be used when targeting GCC for some generic NetWare + system while using the Novell linker. + Copyright (C) 2004, 2007, 2010, 2011 Free Software Foundation, Inc. + + Written by Jan Beulich (jbeulich@novell.com) + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. */ + +#undef LIB_SPEC +#define LIB_SPEC "-lc --def-file libc.def%s" + +#undef LIBGCC_SPEC +#define LIBGCC_SPEC "-lgcc %{!static-libgcc:--def-file libgcc.def%s}" + +#undef LINKER_NAME +#define LINKER_NAME "nwld" + +#undef LINK_SPEC +#define LINK_SPEC "--format:NLM --extensions:GNU" \ + " %{static:%{!nostdlib:%{!nodefaultlibs:%estatic linking is not supported\n}}}" + +#undef LINK_GCC_C_SEQUENCE_SPEC +#define LINK_GCC_C_SEQUENCE_SPEC "%L %G" + +/* In order to permit the linker to derive the output filename from the first + input file, put the common startup code as the last object. */ +#undef STARTFILE_SPEC +#define STARTFILE_SPEC "" + +#undef ENDFILE_SPEC +#define ENDFILE_SPEC "crt0%O%s ../imports/%{!posix:libc}%{posix:posix}pre.gcc%O%s" \ + " --def-file %{!posix:libc}%{posix:posix}pre.def%s" + +#define DRIVER_SELF_SPECS "%{!static-libgcc:-shared-libgcc}" + +#define TARGET_SUB_SECTION_SEPARATOR "$" + +void nwld_named_section_asm_out_constructor (rtx, int); +void nwld_named_section_asm_out_destructor (rtx, int); + +#define TARGET_ASM_CONSTRUCTOR nwld_named_section_asm_out_constructor +#define TARGET_ASM_DESTRUCTOR nwld_named_section_asm_out_destructor + +#define SUBSUBTARGET_OVERRIDE_OPTIONS \ +do { \ + /* XXX This can be enabled once gas output meets nwld's needs. */ \ + /* if (!flag_unwind_tables && !flag_exceptions) */ \ + flag_dwarf2_cfi_asm = 0; \ +} while (0) + +#undef EH_FRAME_SECTION_NAME +#define EH_FRAME_SECTION_NAME ".eh_frame"TARGET_SUB_SECTION_SEPARATOR + +/* nwld does not currently support stabs debug info */ +#undef DBX_DEBUGGING_INFO diff --git a/gcc/config/i386/openbsd.h b/gcc/config/i386/openbsd.h new file mode 100644 index 000000000..d64f15907 --- /dev/null +++ b/gcc/config/i386/openbsd.h @@ -0,0 +1,101 @@ +/* Configuration for an OpenBSD i386 target. + Copyright (C) 1999, 2000, 2002, 2004, 2007 Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. */ + + +#define TARGET_VERSION fprintf (stderr, " (OpenBSD/i386)"); + +/* This goes away when the math-emulator is fixed */ +#undef TARGET_SUBTARGET_DEFAULT +#define TARGET_SUBTARGET_DEFAULT \ + (MASK_80387 | MASK_IEEE_FP | MASK_FLOAT_RETURNS | MASK_NO_FANCY_MATH_387) + +#define TARGET_OS_CPP_BUILTINS() \ + do \ + { \ + builtin_define ("__unix__"); \ + builtin_define ("__OpenBSD__"); \ + builtin_assert ("system=unix"); \ + builtin_assert ("system=bsd"); \ + builtin_assert ("system=OpenBSD"); \ + } \ + while (0) + +/* Layout of source language data types. */ + +/* This must agree with */ +#undef SIZE_TYPE +#define SIZE_TYPE "unsigned int" + +#undef PTRDIFF_TYPE +#define PTRDIFF_TYPE "int" + +#undef WCHAR_TYPE +#define WCHAR_TYPE "int" + +#undef WCHAR_TYPE_SIZE +#define WCHAR_TYPE_SIZE 32 + +/* Assembler format: overall framework. */ + +#undef ASM_APP_ON +#define ASM_APP_ON "#APP\n" + +#undef ASM_APP_OFF +#define ASM_APP_OFF "#NO_APP\n" + +/* Stack & calling: aggregate returns. */ + +/* Don't default to pcc-struct-return, because gcc is the only compiler, and + we want to retain compatibility with older gcc versions. */ +#define DEFAULT_PCC_STRUCT_RETURN 0 + +/* Assembler format: alignment output. */ + +/* Kludgy test: when gas is upgraded, it will have p2align, and no problems + with nops. */ +#ifndef HAVE_GAS_MAX_SKIP_P2ALIGN +/* i386 OpenBSD still uses an older gas that doesn't insert nops by default + when the .align directive demands to insert extra space in the text + segment. */ +#undef ASM_OUTPUT_ALIGN +#define ASM_OUTPUT_ALIGN(FILE,LOG) \ + if ((LOG)!=0) fprintf ((FILE), "\t.align %d,0x90\n", (LOG)) +#endif + +/* Stack & calling: profiling. */ + +/* OpenBSD's profiler recovers all information from the stack pointer. + The icky part is not here, but in machine/profile.h. */ +#undef FUNCTION_PROFILER +#define FUNCTION_PROFILER(FILE, LABELNO) \ + fputs (flag_pic ? "\tcall mcount@PLT\n": "\tcall mcount\n", FILE); + +/* Assembler format: exception region output. */ + +/* All configurations that don't use elf must be explicit about not using + dwarf unwind information. */ +#define DWARF2_UNWIND_INFO 0 + +#undef ASM_PREFERRED_EH_DATA_FORMAT + +#undef ASM_COMMENT_START +#define ASM_COMMENT_START ";#" + +/* OpenBSD gas currently does not support quad, so do not use it. */ +#undef ASM_QUAD diff --git a/gcc/config/i386/openbsdelf.h b/gcc/config/i386/openbsdelf.h new file mode 100644 index 000000000..53949e8ab --- /dev/null +++ b/gcc/config/i386/openbsdelf.h @@ -0,0 +1,134 @@ +/* Configuration for an OpenBSD i386 target. + + Copyright (C) 2005, 2007, 2009, 2010 Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. */ + +/* This keeps us from using libraries compiled with the native cc, so + undef it. 
*/ +#undef NO_DOLLAR_IN_LABEL + +/* Override the default comment-starter of "/". */ +#undef ASM_COMMENT_START +#define ASM_COMMENT_START "#" + +#undef DBX_REGISTER_NUMBER +#define DBX_REGISTER_NUMBER(n) svr4_dbx_register_map[n] + +/* This goes away when the math-emulator is fixed */ +#undef TARGET_DEFAULT +#define TARGET_DEFAULT \ + (MASK_80387 | MASK_IEEE_FP | MASK_FLOAT_RETURNS | MASK_NO_FANCY_MATH_387) + +/* Run-time target specifications */ + +#define TARGET_OS_CPP_BUILTINS() \ + do \ + { \ + OPENBSD_OS_CPP_BUILTINS(); \ + } \ + while (0) + +/* As an elf system, we need crtbegin/crtend stuff. */ +#undef STARTFILE_SPEC +#define STARTFILE_SPEC "\ + %{!shared: %{pg:gcrt0%O%s} %{!pg:%{p:gcrt0%O%s} %{!p:crt0%O%s}} \ + crtbegin%O%s} %{shared:crtbeginS%O%s}" +#undef ENDFILE_SPEC +#define ENDFILE_SPEC "%{!shared:crtend%O%s} %{shared:crtendS%O%s}" + +/* Layout of source language data types. */ + +/* This must agree with */ +#undef SIZE_TYPE +#define SIZE_TYPE "long unsigned int" + +#undef PTRDIFF_TYPE +#define PTRDIFF_TYPE "long int" + +#undef WCHAR_TYPE +#define WCHAR_TYPE "int" + +#undef WCHAR_TYPE_SIZE +#define WCHAR_TYPE_SIZE BITS_PER_WORD + +#undef WINT_TYPE +#define WINT_TYPE "int" + +/* Assembler format: overall framework. */ + +#undef ASM_APP_ON +#define ASM_APP_ON "#APP\n" + +#undef ASM_APP_OFF +#define ASM_APP_OFF "#NO_APP\n" + +#undef SET_ASM_OP +#define SET_ASM_OP "\t.set\t" + +/* The following macros were originally stolen from i386v4.h. + These have to be defined to get PIC code correct. */ + +/* Assembler format: dispatch tables. */ + +/* Assembler format: sections. */ + +/* Stack & calling: aggregate returns. */ + +/* Don't default to pcc-struct-return, because gcc is the only compiler, and + we want to retain compatibility with older gcc versions. */ +#define DEFAULT_PCC_STRUCT_RETURN 0 + +/* Assembler format: alignment output. */ + +#ifdef HAVE_GAS_MAX_SKIP_P2ALIGN +#define ASM_OUTPUT_MAX_SKIP_ALIGN(FILE,LOG,MAX_SKIP) \ + if ((LOG) != 0) {\ + if ((MAX_SKIP) == 0) fprintf ((FILE), "\t.p2align %d\n", (LOG)); \ + else fprintf ((FILE), "\t.p2align %d,,%d\n", (LOG), (MAX_SKIP)); \ + } +#endif + +/* Stack & calling: profiling. */ + +/* OpenBSD's profiler recovers all information from the stack pointer. + The icky part is not here, but in machine/profile.h. */ +#undef FUNCTION_PROFILER +#define FUNCTION_PROFILER(FILE, LABELNO) \ + fputs (flag_pic ? "\tcall __mcount@PLT\n": "\tcall __mcount\n", FILE); + +/* Assembler format: exception region output. */ + +/* our configuration still doesn't handle dwarf2 correctly */ +#define DWARF2_UNWIND_INFO 0 + +/* Assembler format: alignment output. */ + +/* Note that we pick up ASM_OUTPUT_MAX_SKIP_ALIGN from i386/gas.h */ + +/* Note that we pick up ASM_OUTPUT_MI_THUNK from unix.h. */ + +#undef LINK_SPEC +#define LINK_SPEC \ + "%{!shared:%{!nostdlib:%{!r:%{!e*:-e __start}}}} \ + %{shared:-shared} %{R*} \ + %{static:-Bstatic} \ + %{!static:-Bdynamic} \ + %{assert*} \ + -dynamic-linker /usr/libexec/ld.so" + +#define OBSD_HAS_CORRECT_SPECS diff --git a/gcc/config/i386/pentium.md b/gcc/config/i386/pentium.md new file mode 100644 index 000000000..c6c5bd55f --- /dev/null +++ b/gcc/config/i386/pentium.md @@ -0,0 +1,306 @@ +;; Pentium Scheduling +;; Copyright (C) 2002, 2007 Free Software Foundation, Inc. +;; +;; This file is part of GCC. 
+;; +;; GCC is free software; you can redistribute it and/or modify +;; it under the terms of the GNU General Public License as published by +;; the Free Software Foundation; either version 3, or (at your option) +;; any later version. +;; +;; GCC is distributed in the hope that it will be useful, +;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;; GNU General Public License for more details. +;; +;; You should have received a copy of the GNU General Public License +;; along with GCC; see the file COPYING3. If not see +;; . */ +;; +;; The Pentium is an in-order core with two integer pipelines. + +;; True for insns that behave like prefixed insns on the Pentium. +(define_attr "pent_prefix" "false,true" + (if_then_else (ior (eq_attr "prefix_0f" "1") + (ior (eq_attr "prefix_data16" "1") + (eq_attr "prefix_rep" "1"))) + (const_string "true") + (const_string "false"))) + +;; Categorize how an instruction slots. + +;; The non-MMX Pentium slots an instruction with prefixes on U pipe only, +;; while MMX Pentium can slot it on either U or V. Model non-MMX Pentium +;; rules, because it results in noticeably better code on non-MMX Pentium +;; and doesn't hurt much on MMX. (Prefixed instructions are not very +;; common, so the scheduler usually has a non-prefixed insn to pair). + +(define_attr "pent_pair" "uv,pu,pv,np" + (cond [(eq_attr "imm_disp" "true") + (const_string "np") + (ior (eq_attr "type" "alu1,alu,imov,icmp,test,lea,incdec") + (and (eq_attr "type" "pop,push") + (eq_attr "memory" "!both"))) + (if_then_else (eq_attr "pent_prefix" "true") + (const_string "pu") + (const_string "uv")) + (eq_attr "type" "ibr") + (const_string "pv") + (and (eq_attr "type" "ishift") + (match_operand 2 "const_int_operand" "")) + (const_string "pu") + (and (eq_attr "type" "rotate") + (match_operand 2 "const1_operand" "")) + (const_string "pu") + (and (eq_attr "type" "ishift1") + (match_operand 1 "const_int_operand" "")) + (const_string "pu") + (and (eq_attr "type" "rotate1") + (match_operand 1 "const1_operand" "")) + (const_string "pu") + (and (eq_attr "type" "call") + (match_operand 0 "constant_call_address_operand" "")) + (const_string "pv") + (and (eq_attr "type" "callv") + (match_operand 1 "constant_call_address_operand" "")) + (const_string "pv") + ] + (const_string "np"))) + +(define_automaton "pentium,pentium_fpu") + +;; Pentium do have U and V pipes. Instruction to both pipes +;; are always issued together, much like on VLIW. +;; +;; predecode +;; / \ +;; decodeu decodev +;; / | | +;; fpu executeu executev +;; | | | +;; fpu retire retire +;; | +;; fpu +;; We add dummy "port" pipes allocated only first cycle of +;; instruction to specify this behavior. + +(define_cpu_unit "pentium-portu,pentium-portv" "pentium") +(define_cpu_unit "pentium-u,pentium-v" "pentium") +(absence_set "pentium-portu" "pentium-u,pentium-v") +(presence_set "pentium-portv" "pentium-portu") + +;; Floating point instructions can overlap with new issue of integer +;; instructions. We model only first cycle of FP pipeline, as it is +;; fully pipelined. +(define_cpu_unit "pentium-fp" "pentium_fpu") + +;; There is non-pipelined multiplier unit used for complex operations. 
+(define_cpu_unit "pentium-fmul" "pentium_fpu") + +;; Pentium preserves memory ordering, so when load-execute-store +;; instruction is executed together with other instruction loading +;; data, the execution of the other instruction is delayed to very +;; last cycle of first instruction, when data are bypassed. +;; We model this by allocating "memory" unit when store is pending +;; and using conflicting load units together. + +(define_cpu_unit "pentium-memory" "pentium") +(define_cpu_unit "pentium-load0" "pentium") +(define_cpu_unit "pentium-load1" "pentium") +(absence_set "pentium-load0,pentium-load1" "pentium-memory") + +(define_reservation "pentium-load" "(pentium-load0 | pentium-load1)") +(define_reservation "pentium-np" "(pentium-u + pentium-v)") +(define_reservation "pentium-uv" "(pentium-u | pentium-v)") +(define_reservation "pentium-portuv" "(pentium-portu | pentium-portv)") +(define_reservation "pentium-firstu" "(pentium-u + pentium-portu)") +(define_reservation "pentium-firstv" "(pentium-v + pentium-portuv)") +(define_reservation "pentium-firstuv" "(pentium-uv + pentium-portuv)") +(define_reservation "pentium-firstuload" "(pentium-load + pentium-firstu)") +(define_reservation "pentium-firstvload" "(pentium-load + pentium-firstv)") +(define_reservation "pentium-firstuvload" "(pentium-load + pentium-firstuv) + | (pentium-firstv,pentium-v, + (pentium-load+pentium-firstv))") +(define_reservation "pentium-firstuboth" "(pentium-load + pentium-firstu + + pentium-memory)") +(define_reservation "pentium-firstvboth" "(pentium-load + pentium-firstv + + pentium-memory)") +(define_reservation "pentium-firstuvboth" "(pentium-load + pentium-firstuv + + pentium-memory) + | (pentium-firstv,pentium-v, + (pentium-load+pentium-firstv))") + +;; Few common long latency instructions +(define_insn_reservation "pent_mul" 11 + (and (eq_attr "cpu" "pentium") + (eq_attr "type" "imul")) + "pentium-np*11") + +(define_insn_reservation "pent_str" 12 + (and (eq_attr "cpu" "pentium") + (eq_attr "type" "str")) + "pentium-np*12") + +;; Integer division and some other long latency instruction block all +;; units, including the FP pipe. There is no value in modeling the +;; latency of these instructions and not modeling the latency +;; decreases the size of the DFA. +(define_insn_reservation "pent_block" 1 + (and (eq_attr "cpu" "pentium") + (eq_attr "type" "idiv")) + "pentium-np+pentium-fp") + +;; Moves usually have one cycle penalty, but there are exceptions. +(define_insn_reservation "pent_fmov" 1 + (and (eq_attr "cpu" "pentium") + (and (eq_attr "type" "fmov") + (eq_attr "memory" "none,load"))) + "(pentium-fp+pentium-np)") + +(define_insn_reservation "pent_fpmovxf" 3 + (and (eq_attr "cpu" "pentium") + (and (eq_attr "type" "fmov") + (and (eq_attr "memory" "load,store") + (eq_attr "mode" "XF")))) + "(pentium-fp+pentium-np)*3") + +(define_insn_reservation "pent_fpstore" 2 + (and (eq_attr "cpu" "pentium") + (and (eq_attr "type" "fmov") + (ior (match_operand 1 "immediate_operand" "") + (eq_attr "memory" "store")))) + "(pentium-fp+pentium-np)*2") + +(define_insn_reservation "pent_imov" 1 + (and (eq_attr "cpu" "pentium") + (eq_attr "type" "imov")) + "pentium-firstuv") + +;; Push and pop instructions have 1 cycle latency and special +;; hardware bypass allows them to be paired with other push,pop +;; and call instructions. 
+(define_bypass 0 "pent_push,pent_pop" "pent_push,pent_pop,pent_call") +(define_insn_reservation "pent_push" 1 + (and (eq_attr "cpu" "pentium") + (and (eq_attr "type" "push") + (eq_attr "memory" "store"))) + "pentium-firstuv") + +(define_insn_reservation "pent_pop" 1 + (and (eq_attr "cpu" "pentium") + (eq_attr "type" "pop,leave")) + "pentium-firstuv") + +;; Call and branch instruction can execute in either pipe, but +;; they are only pairable when in the v pipe. +(define_insn_reservation "pent_call" 10 + (and (eq_attr "cpu" "pentium") + (eq_attr "type" "call,callv")) + "pentium-firstv,pentium-v*9") + +(define_insn_reservation "pent_branch" 1 + (and (eq_attr "cpu" "pentium") + (eq_attr "type" "ibr")) + "pentium-firstv") + +;; Floating point instruction dispatch in U pipe, but continue +;; in FP pipeline allowing other instructions to be executed. +(define_insn_reservation "pent_fp" 3 + (and (eq_attr "cpu" "pentium") + (eq_attr "type" "fop,fistp")) + "(pentium-firstu+pentium-fp),nothing,nothing") + +;; First two cycles of fmul are not pipelined. +(define_insn_reservation "pent_fmul" 3 + (and (eq_attr "cpu" "pentium") + (eq_attr "type" "fmul")) + "(pentium-firstuv+pentium-fp+pentium-fmul),pentium-fmul,nothing") + +;; Long latency FP instructions overlap with integer instructions, +;; but only last 2 cycles with FP ones. +(define_insn_reservation "pent_fdiv" 39 + (and (eq_attr "cpu" "pentium") + (eq_attr "type" "fdiv")) + "(pentium-np+pentium-fp+pentium-fmul), + (pentium-fp+pentium-fmul)*36,pentium-fmul*2") + +(define_insn_reservation "pent_fpspc" 70 + (and (eq_attr "cpu" "pentium") + (eq_attr "type" "fpspc")) + "(pentium-np+pentium-fp+pentium-fmul), + (pentium-fp+pentium-fmul)*67,pentium-fmul*2") + +;; Integer instructions. Load/execute/store takes 3 cycles, +;; load/execute 2 cycles and execute only one cycle. 
+(define_insn_reservation "pent_uv_both" 3 + (and (eq_attr "cpu" "pentium") + (and (eq_attr "pent_pair" "uv") + (eq_attr "memory" "both"))) + "pentium-firstuvboth,pentium-uv+pentium-memory,pentium-uv") + +(define_insn_reservation "pent_u_both" 3 + (and (eq_attr "cpu" "pentium") + (and (eq_attr "pent_pair" "pu") + (eq_attr "memory" "both"))) + "pentium-firstuboth,pentium-u+pentium-memory,pentium-u") + +(define_insn_reservation "pent_v_both" 3 + (and (eq_attr "cpu" "pentium") + (and (eq_attr "pent_pair" "pv") + (eq_attr "memory" "both"))) + "pentium-firstvboth,pentium-v+pentium-memory,pentium-v") + +(define_insn_reservation "pent_np_both" 3 + (and (eq_attr "cpu" "pentium") + (and (eq_attr "pent_pair" "np") + (eq_attr "memory" "both"))) + "pentium-np,pentium-np,pentium-np") + +(define_insn_reservation "pent_uv_load" 2 + (and (eq_attr "cpu" "pentium") + (and (eq_attr "pent_pair" "uv") + (eq_attr "memory" "load"))) + "pentium-firstuvload,pentium-uv") + +(define_insn_reservation "pent_u_load" 2 + (and (eq_attr "cpu" "pentium") + (and (eq_attr "pent_pair" "pu") + (eq_attr "memory" "load"))) + "pentium-firstuload,pentium-u") + +(define_insn_reservation "pent_v_load" 2 + (and (eq_attr "cpu" "pentium") + (and (eq_attr "pent_pair" "pv") + (eq_attr "memory" "load"))) + "pentium-firstvload,pentium-v") + +(define_insn_reservation "pent_np_load" 2 + (and (eq_attr "cpu" "pentium") + (and (eq_attr "pent_pair" "np") + (eq_attr "memory" "load"))) + "pentium-np,pentium-np") + +(define_insn_reservation "pent_uv" 1 + (and (eq_attr "cpu" "pentium") + (and (eq_attr "pent_pair" "uv") + (eq_attr "memory" "none"))) + "pentium-firstuv") + +(define_insn_reservation "pent_u" 1 + (and (eq_attr "cpu" "pentium") + (and (eq_attr "pent_pair" "pu") + (eq_attr "memory" "none"))) + "pentium-firstu") + +(define_insn_reservation "pent_v" 1 + (and (eq_attr "cpu" "pentium") + (and (eq_attr "pent_pair" "pv") + (eq_attr "memory" "none"))) + "pentium-firstv") + +(define_insn_reservation "pent_np" 1 + (and (eq_attr "cpu" "pentium") + (and (eq_attr "pent_pair" "np") + (eq_attr "memory" "none"))) + "pentium-np") + diff --git a/gcc/config/i386/pmm_malloc.h b/gcc/config/i386/pmm_malloc.h new file mode 100644 index 000000000..0a9f2e227 --- /dev/null +++ b/gcc/config/i386/pmm_malloc.h @@ -0,0 +1,57 @@ +/* Copyright (C) 2004, 2006, 2009 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +#ifndef _MM_MALLOC_H_INCLUDED +#define _MM_MALLOC_H_INCLUDED + +#include + +/* We can't depend on since the prototype of posix_memalign + may not be visible. 
*/ +#ifndef __cplusplus +extern int posix_memalign (void **, size_t, size_t); +#else +extern "C" int posix_memalign (void **, size_t, size_t) throw (); +#endif + +static __inline void * +_mm_malloc (size_t size, size_t alignment) +{ + void *ptr; + if (alignment == 1) + return malloc (size); + if (alignment == 2 || (sizeof (void *) == 8 && alignment == 4)) + alignment = sizeof (void *); + if (posix_memalign (&ptr, alignment, size) == 0) + return ptr; + else + return NULL; +} + +static __inline void +_mm_free (void * ptr) +{ + free (ptr); +} + +#endif /* _MM_MALLOC_H_INCLUDED */ diff --git a/gcc/config/i386/pmmintrin.h b/gcc/config/i386/pmmintrin.h new file mode 100644 index 000000000..c5c9ae27c --- /dev/null +++ b/gcc/config/i386/pmmintrin.h @@ -0,0 +1,128 @@ +/* Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009 + Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +/* Implemented from the specification included in the Intel C++ Compiler + User Guide and Reference, version 9.0. */ + +#ifndef _PMMINTRIN_H_INCLUDED +#define _PMMINTRIN_H_INCLUDED + +#ifndef __SSE3__ +# error "SSE3 instruction set not enabled" +#else + +/* We need definitions from the SSE2 and SSE header files*/ +#include + +/* Additional bits in the MXCSR. 
*/ +#define _MM_DENORMALS_ZERO_MASK 0x0040 +#define _MM_DENORMALS_ZERO_ON 0x0040 +#define _MM_DENORMALS_ZERO_OFF 0x0000 + +#define _MM_SET_DENORMALS_ZERO_MODE(mode) \ + _mm_setcsr ((_mm_getcsr () & ~_MM_DENORMALS_ZERO_MASK) | (mode)) +#define _MM_GET_DENORMALS_ZERO_MODE() \ + (_mm_getcsr() & _MM_DENORMALS_ZERO_MASK) + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_addsub_ps (__m128 __X, __m128 __Y) +{ + return (__m128) __builtin_ia32_addsubps ((__v4sf)__X, (__v4sf)__Y); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_hadd_ps (__m128 __X, __m128 __Y) +{ + return (__m128) __builtin_ia32_haddps ((__v4sf)__X, (__v4sf)__Y); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_hsub_ps (__m128 __X, __m128 __Y) +{ + return (__m128) __builtin_ia32_hsubps ((__v4sf)__X, (__v4sf)__Y); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_movehdup_ps (__m128 __X) +{ + return (__m128) __builtin_ia32_movshdup ((__v4sf)__X); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_moveldup_ps (__m128 __X) +{ + return (__m128) __builtin_ia32_movsldup ((__v4sf)__X); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_addsub_pd (__m128d __X, __m128d __Y) +{ + return (__m128d) __builtin_ia32_addsubpd ((__v2df)__X, (__v2df)__Y); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_hadd_pd (__m128d __X, __m128d __Y) +{ + return (__m128d) __builtin_ia32_haddpd ((__v2df)__X, (__v2df)__Y); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_hsub_pd (__m128d __X, __m128d __Y) +{ + return (__m128d) __builtin_ia32_hsubpd ((__v2df)__X, (__v2df)__Y); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_loaddup_pd (double const *__P) +{ + return _mm_load1_pd (__P); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_movedup_pd (__m128d __X) +{ + return _mm_shuffle_pd (__X, __X, _MM_SHUFFLE2 (0,0)); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_lddqu_si128 (__m128i const *__P) +{ + return (__m128i) __builtin_ia32_lddqu ((char const *)__P); +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_monitor (void const * __P, unsigned int __E, unsigned int __H) +{ + __builtin_ia32_monitor (__P, __E, __H); +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mwait (unsigned int __E, unsigned int __H) +{ + __builtin_ia32_mwait (__E, __H); +} + +#endif /* __SSE3__ */ + +#endif /* _PMMINTRIN_H_INCLUDED */ diff --git a/gcc/config/i386/popcntintrin.h b/gcc/config/i386/popcntintrin.h new file mode 100644 index 000000000..8d4d6571d --- /dev/null +++ b/gcc/config/i386/popcntintrin.h @@ -0,0 +1,46 @@ +/* Copyright (C) 2009 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. 
+ + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +#ifndef __POPCNT__ +# error "POPCNT instruction set not enabled" +#endif /* __POPCNT__ */ + +#ifndef _POPCNTINTRIN_H_INCLUDED +#define _POPCNTINTRIN_H_INCLUDED + +/* Calculate a number of bits set to 1. */ +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_popcnt_u32 (unsigned int __X) +{ + return __builtin_popcount (__X); +} + +#ifdef __x86_64__ +extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_popcnt_u64 (unsigned long long __X) +{ + return __builtin_popcountll (__X); +} +#endif + +#endif /* _POPCNTINTRIN_H_INCLUDED */ diff --git a/gcc/config/i386/ppro.md b/gcc/config/i386/ppro.md new file mode 100644 index 000000000..bc1cb59d1 --- /dev/null +++ b/gcc/config/i386/ppro.md @@ -0,0 +1,758 @@ +;; Scheduling for the Intel P6 family of processors +;; Copyright (C) 2004, 2005, 2007, 2008, 2010 Free Software Foundation, Inc. +;; +;; This file is part of GCC. +;; +;; GCC is free software; you can redistribute it and/or modify +;; it under the terms of the GNU General Public License as published by +;; the Free Software Foundation; either version 3, or (at your option) +;; any later version. +;; +;; GCC is distributed in the hope that it will be useful, +;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;; GNU General Public License for more details. +;; +;; You should have received a copy of the GNU General Public License +;; along with GCC; see the file COPYING3. If not see +;; . */ + +;; The P6 family includes the Pentium Pro, Pentium II, Pentium III, Celeron +;; and Xeon lines of CPUs. The DFA scheduler description in this file is +;; based on information that can be found in the following three documents: +;; +;; "P6 Family of Processors Hardware Developer's Manual", +;; Intel, September 1999. +;; +;; "Intel Architecture Optimization Manual", +;; Intel, 1999 (Order Number: 245127-001). +;; +;; "How to optimize for the Pentium family of microprocessors", +;; by Agner Fog, PhD. +;; +;; The P6 pipeline has three major components: +;; 1) the FETCH/DECODE unit, an in-order issue front-end +;; 2) the DISPATCH/EXECUTE unit, which is the out-of-order core +;; 3) the RETIRE unit, an in-order retirement unit +;; +;; So, the P6 CPUs have out-of-order cores, but the instruction decoder and +;; retirement unit are naturally in-order. +;; +;; BUS INTERFACE UNIT +;; / \ +;; L1 ICACHE L1 DCACHE +;; / | \ | \ +;; DECODER0 DECODER1 DECODER2 DISP/EXEC RETIRE +;; \ | / | | +;; INSTRUCTION POOL __________|_______/ +;; (inc. reorder buffer) +;; +;; Since the P6 CPUs execute instructions out-of-order, the most important +;; consideration in performance tuning is making sure enough micro-ops are +;; ready for execution in the out-of-order core, while not stalling the +;; decoder. 
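Before continuing with the scheduler description below, here is a minimal usage sketch (not part of the imported sources) tying together the headers added above: the _mm_malloc/_mm_free wrappers, the SSE3 intrinsics from pmmintrin.h, and the popcount intrinsics from popcntintrin.h. It assumes compilation with gcc -msse3 -mpopcnt so that __SSE3__ and __POPCNT__ are defined; the helper hsum_ps and the driver are illustrative names only, not anything defined by these headers.

#include <mm_malloc.h>
#include <pmmintrin.h>
#include <popcntintrin.h>

/* Sum the four float lanes of V using the SSE3 horizontal add:
   two haddps steps leave the total broadcast into every lane.  */
static float
hsum_ps (__m128 v)
{
  v = _mm_hadd_ps (v, v);
  v = _mm_hadd_ps (v, v);
  return _mm_cvtss_f32 (v);
}

int
main (void)
{
  /* 16-byte aligned block, as required by _mm_load_ps.  */
  float *p = _mm_malloc (4 * sizeof (float), 16);
  float sum;
  unsigned int bits;

  if (!p)
    return 1;

  p[0] = 1.0f; p[1] = 2.0f; p[2] = 3.0f; p[3] = 4.0f;
  sum = hsum_ps (_mm_load_ps (p));      /* 1+2+3+4 == 10.0f */
  bits = _mm_popcnt_u32 (0xf0u);        /* four bits set */

  _mm_free (p);
  return (sum == 10.0f && bits == 4) ? 0 : 1;
}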
+;; +;; TODO: +;; - Find a less crude way to model complex instructions, in +;; particular how many cycles they take to be decoded. +;; - Include decoder latencies in the total reservation latencies. +;; This isn't necessary right now because we assume for every +;; instruction that it never blocks a decoder. +;; - Figure out where the p0 and p1 reservations come from. These +;; appear not to be in the manual +;; - Lots more because I'm sure this is still far from optimal :-) + +;; The ppro_idiv and ppro_fdiv automata are used to model issue +;; latencies of idiv and fdiv type insns. +(define_automaton "ppro_decoder,ppro_core,ppro_idiv,ppro_fdiv,ppro_load,ppro_store") + +;; Simple instructions of the register-register form have only one uop. +;; Load instructions are also only one uop. Store instructions decode to +;; two uops, and simple read-modify instructions also take two uops. +;; Simple instructions of the register-memory form have two to three uops. +;; Simple read-modify-write instructions have four uops. The rules for +;; the decoder are simple: +;; - an instruction with 1 uop can be decoded by any of the three +;; decoders in one cycle. +;; - an instruction with 1 to 4 uops can be decoded only by decoder 0 +;; but still in only one cycle. +;; - a complex (microcode) instruction can also only be decoded by +;; decoder 0, and this takes an unspecified number of cycles. +;; +;; The goal is to schedule such that we have a few-one-one uops sequence +;; in each cycle, to decode as many instructions per cycle as possible. +(define_cpu_unit "decoder0" "ppro_decoder") +(define_cpu_unit "decoder1" "ppro_decoder") +(define_cpu_unit "decoder2" "ppro_decoder") + +;; We first wish to find an instruction for decoder0, so exclude +;; decoder1 and decoder2 from being reserved until decoder 0 is +;; reserved. +(presence_set "decoder1" "decoder0") +(presence_set "decoder2" "decoder0") + +;; Most instructions can be decoded on any of the three decoders. +(define_reservation "decodern" "(decoder0|decoder1|decoder2)") + +;; The out-of-order core has five pipelines. During each cycle, the core +;; may dispatch zero or one uop on the port of any of the five pipelines +;; so the maximum number of dispatched uops per cycle is 5. In practicer, +;; 3 uops per cycle is more realistic. +;; +;; Two of the five pipelines contain several execution units: +;; +;; Port 0 Port 1 Port 2 Port 3 Port 4 +;; ALU ALU LOAD SAC SDA +;; FPU JUE +;; AGU MMX +;; MMX P3FPU +;; P3FPU +;; +;; (SAC=Store Address Calculation, SDA=Store Data Unit, P3FPU = SSE unit, +;; JUE = Jump Execution Unit, AGU = Address Generation Unit) +;; +(define_cpu_unit "p0,p1" "ppro_core") +(define_cpu_unit "p2" "ppro_load") +(define_cpu_unit "p3,p4" "ppro_store") +(define_cpu_unit "idiv" "ppro_idiv") +(define_cpu_unit "fdiv" "ppro_fdiv") + +;; Only the irregular instructions have to be modeled here. A load +;; increases the latency by 2 or 3, or by nothing if the manual gives +;; a latency already. Store latencies are not accounted for. +;; +;; The simple instructions follow a very regular pattern of 1 uop per +;; reg-reg operation, 1 uop per load on port 2. and 2 uops per store +;; on port 4 and port 3. These instructions are modelled at the bottom +;; of this file. +;; +;; For microcoded instructions we don't know how many uops are produced. +;; These instructions are the "complex" ones in the Intel manuals. All +;; we _do_ know is that they typically produce four or more uops, so +;; they can only be decoded on decoder0. 
Modelling their latencies +;; doesn't make sense because we don't know how these instructions are +;; executed in the core. So we just model that they can only be decoded +;; on decoder 0, and say that it takes a little while before the result +;; is available. +(define_insn_reservation "ppro_complex_insn" 6 + (and (eq_attr "cpu" "pentiumpro") + (eq_attr "type" "other,multi,call,callv,str")) + "decoder0") + +;; imov with memory operands does not use the integer units. +(define_insn_reservation "ppro_imov" 1 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "none") + (eq_attr "type" "imov"))) + "decodern,(p0|p1)") + +(define_insn_reservation "ppro_imov_load" 4 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "load") + (eq_attr "type" "imov"))) + "decodern,p2") + +(define_insn_reservation "ppro_imov_store" 1 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "store") + (eq_attr "type" "imov"))) + "decoder0,p4+p3") + +;; imovx always decodes to one uop, and also doesn't use the integer +;; units if it has memory operands. +(define_insn_reservation "ppro_imovx" 1 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "none") + (eq_attr "type" "imovx"))) + "decodern,(p0|p1)") + +(define_insn_reservation "ppro_imovx_load" 4 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "load") + (eq_attr "type" "imovx"))) + "decodern,p2") + +;; lea executes on port 0 with latency one and throughput 1. +(define_insn_reservation "ppro_lea" 1 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "none") + (eq_attr "type" "lea"))) + "decodern,p0") + +;; Shift and rotate execute on port 0 with latency and throughput 1. +;; The load and store units need to be reserved when memory operands +;; are involved. +(define_insn_reservation "ppro_shift_rotate" 1 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "none") + (eq_attr "type" "ishift,ishift1,rotate,rotate1"))) + "decodern,p0") + +(define_insn_reservation "ppro_shift_rotate_mem" 4 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "!none") + (eq_attr "type" "ishift,ishift1,rotate,rotate1"))) + "decoder0,p2+p0,p4+p3") + + +;; The P6 has a sophisticated branch prediction mechanism to minimize +;; latencies due to branching. In particular, it has a fast way to +;; execute branches that are taken multiple times (such as in loops). +;; Branches not taken suffer no penalty, and correctly predicted +;; branches cost only one fetch cycle. Mispredicted branches are very +;; costly: typically 15 cycles and possibly as many as 26 cycles. +;; +;; Unfortunately all this makes it quite difficult to properly model +;; the latencies for the compiler. Here I've made the choice to be +;; optimistic and assume branches are often predicted correctly, so +;; they have latency 1, and the decoders are not blocked. +;; +;; In addition, the model assumes a branch always decodes to only 1 uop, +;; which is not exactly true because there are a few instructions that +;; decode to 2 uops or microcode. But this probably gives the best +;; results because we can assume these instructions can decode on all +;; decoders. +(define_insn_reservation "ppro_branch" 1 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "none") + (eq_attr "type" "ibr"))) + "decodern,p1") + +;; ??? Indirect branches probably have worse latency than this. 
+(define_insn_reservation "ppro_indirect_branch" 6 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "!none") + (eq_attr "type" "ibr"))) + "decoder0,p2+p1") + +(define_insn_reservation "ppro_leave" 4 + (and (eq_attr "cpu" "pentiumpro") + (eq_attr "type" "leave")) + "decoder0,p2+(p0|p1),(p0|p1)") + +;; imul has throughput one, but latency 4, and can only execute on port 0. +(define_insn_reservation "ppro_imul" 4 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "none") + (eq_attr "type" "imul"))) + "decodern,p0") + +(define_insn_reservation "ppro_imul_mem" 4 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "!none") + (eq_attr "type" "imul"))) + "decoder0,p2+p0") + +;; div and idiv are very similar, so we model them the same. +;; QI, HI, and SI have issue latency 12, 21, and 37, respectively. +;; These issue latencies are modelled via the ppro_div automaton. +(define_insn_reservation "ppro_idiv_QI" 19 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "QI") + (eq_attr "type" "idiv")))) + "decoder0,(p0+idiv)*2,(p0|p1)+idiv,idiv*9") + +(define_insn_reservation "ppro_idiv_QI_load" 19 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "QI") + (eq_attr "type" "idiv")))) + "decoder0,p2+p0+idiv,p0+idiv,(p0|p1)+idiv,idiv*9") + +(define_insn_reservation "ppro_idiv_HI" 23 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "HI") + (eq_attr "type" "idiv")))) + "decoder0,(p0+idiv)*3,(p0|p1)+idiv,idiv*17") + +(define_insn_reservation "ppro_idiv_HI_load" 23 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "HI") + (eq_attr "type" "idiv")))) + "decoder0,p2+p0+idiv,p0+idiv,(p0|p1)+idiv,idiv*18") + +(define_insn_reservation "ppro_idiv_SI" 39 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "SI") + (eq_attr "type" "idiv")))) + "decoder0,(p0+idiv)*3,(p0|p1)+idiv,idiv*33") + +(define_insn_reservation "ppro_idiv_SI_load" 39 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "SI") + (eq_attr "type" "idiv")))) + "decoder0,p2+p0+idiv,p0+idiv,(p0|p1)+idiv,idiv*34") + +;; Floating point operations always execute on port 0. +;; ??? where do these latencies come from? fadd has latency 3 and +;; has throughput "1/cycle (align with FADD)". What do they +;; mean and how can we model that? 
+(define_insn_reservation "ppro_fop" 3 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "none,unknown") + (eq_attr "type" "fop"))) + "decodern,p0") + +(define_insn_reservation "ppro_fop_load" 5 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "load") + (eq_attr "type" "fop"))) + "decoder0,p2+p0,p0") + +(define_insn_reservation "ppro_fop_store" 3 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "store") + (eq_attr "type" "fop"))) + "decoder0,p0,p0,p0+p4+p3") + +(define_insn_reservation "ppro_fop_both" 5 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "both") + (eq_attr "type" "fop"))) + "decoder0,p2+p0,p0+p4+p3") + +(define_insn_reservation "ppro_fsgn" 1 + (and (eq_attr "cpu" "pentiumpro") + (eq_attr "type" "fsgn")) + "decodern,p0") + +(define_insn_reservation "ppro_fistp" 5 + (and (eq_attr "cpu" "pentiumpro") + (eq_attr "type" "fistp")) + "decoder0,p0*2,p4+p3") + +(define_insn_reservation "ppro_fcmov" 2 + (and (eq_attr "cpu" "pentiumpro") + (eq_attr "type" "fcmov")) + "decoder0,p0*2") + +(define_insn_reservation "ppro_fcmp" 1 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "none") + (eq_attr "type" "fcmp"))) + "decodern,p0") + +(define_insn_reservation "ppro_fcmp_load" 4 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "load") + (eq_attr "type" "fcmp"))) + "decoder0,p2+p0") + +(define_insn_reservation "ppro_fmov" 1 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "none") + (eq_attr "type" "fmov"))) + "decodern,p0") + +(define_insn_reservation "ppro_fmov_load" 1 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "!XF") + (eq_attr "type" "fmov")))) + "decodern,p2") + +(define_insn_reservation "ppro_fmov_XF_load" 3 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "XF") + (eq_attr "type" "fmov")))) + "decoder0,(p2+p0)*2") + +(define_insn_reservation "ppro_fmov_store" 1 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "store") + (and (eq_attr "mode" "!XF") + (eq_attr "type" "fmov")))) + "decodern,p0") + +(define_insn_reservation "ppro_fmov_XF_store" 3 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "store") + (and (eq_attr "mode" "XF") + (eq_attr "type" "fmov")))) + "decoder0,(p0+p4),(p0+p3)") + +;; fmul executes on port 0 with latency 5. It has issue latency 2, +;; but we don't model this. +(define_insn_reservation "ppro_fmul" 5 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "none") + (eq_attr "type" "fmul"))) + "decoder0,p0*2") + +(define_insn_reservation "ppro_fmul_load" 6 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "load") + (eq_attr "type" "fmul"))) + "decoder0,p2+p0,p0") + +;; fdiv latencies depend on the mode of the operands. XFmode gives +;; a latency of 38 cycles, DFmode gives 32, and SFmode gives latency 18. +;; Division by a power of 2 takes only 9 cycles, but we cannot model +;; that. Throughput is equal to latency - 1, which we model using the +;; ppro_div automaton. 
+(define_insn_reservation "ppro_fdiv_SF" 18 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "SF") + (eq_attr "type" "fdiv,fpspc")))) + "decodern,p0+fdiv,fdiv*16") + +(define_insn_reservation "ppro_fdiv_SF_load" 19 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "SF") + (eq_attr "type" "fdiv,fpspc")))) + "decoder0,p2+p0+fdiv,fdiv*16") + +(define_insn_reservation "ppro_fdiv_DF" 32 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "DF") + (eq_attr "type" "fdiv,fpspc")))) + "decodern,p0+fdiv,fdiv*30") + +(define_insn_reservation "ppro_fdiv_DF_load" 33 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "DF") + (eq_attr "type" "fdiv,fpspc")))) + "decoder0,p2+p0+fdiv,fdiv*30") + +(define_insn_reservation "ppro_fdiv_XF" 38 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "XF") + (eq_attr "type" "fdiv,fpspc")))) + "decodern,p0+fdiv,fdiv*36") + +(define_insn_reservation "ppro_fdiv_XF_load" 39 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "XF") + (eq_attr "type" "fdiv,fpspc")))) + "decoder0,p2+p0+fdiv,fdiv*36") + +;; MMX instructions can execute on either port 0 or port 1 with a +;; throughput of 1/cycle. +;; on port 0: - ALU (latency 1) +;; - Multiplier Unit (latency 3) +;; on port 1: - ALU (latency 1) +;; - Shift Unit (latency 1) +;; +;; MMX instructions are either of the type reg-reg, or read-modify, and +;; except for mmxshft and mmxmul they can execute on port 0 or port 1, +;; so they behave as "simple" instructions that need no special modelling. +;; We only have to model mmxshft and mmxmul. +(define_insn_reservation "ppro_mmx_shft" 1 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "none") + (eq_attr "type" "mmxshft"))) + "decodern,p1") + +(define_insn_reservation "ppro_mmx_shft_load" 2 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "none") + (eq_attr "type" "mmxshft"))) + "decoder0,p2+p1") + +(define_insn_reservation "ppro_mmx_mul" 3 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "none") + (eq_attr "type" "mmxmul"))) + "decodern,p0") + +(define_insn_reservation "ppro_mmx_mul_load" 3 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "none") + (eq_attr "type" "mmxmul"))) + "decoder0,p2+p0") + +(define_insn_reservation "ppro_sse_mmxcvt" 4 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "mode" "DI") + (eq_attr "type" "mmxcvt"))) + "decodern,p1") + +;; FIXME: These are Pentium III only, but we cannot tell here if +;; we're generating code for PentiumPro/Pentium II or Pentium III +;; (define_insn_reservation "ppro_sse_mmxshft" 2 +;; (and (eq_attr "cpu" "pentiumpro") +;; (and (eq_attr "mode" "DI") +;; (eq_attr "type" "mmxshft"))) +;; "decodern,p0") + +;; SSE is very complicated, and takes a bit more effort. +;; ??? I assumed that all SSE instructions decode on decoder0, +;; but is this correct? + +;; The sfence instruction. +(define_insn_reservation "ppro_sse_sfence" 3 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "unknown") + (eq_attr "type" "sse"))) + "decoder0,p4+p3") + +;; FIXME: This reservation is all wrong when we're scheduling sqrtss. 
+(define_insn_reservation "ppro_sse_SF" 3 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "mode" "SF") + (eq_attr "type" "sse"))) + "decodern,p0") + +(define_insn_reservation "ppro_sse_add_SF" 3 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "SF") + (eq_attr "type" "sseadd")))) + "decodern,p1") + +(define_insn_reservation "ppro_sse_add_SF_load" 3 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "SF") + (eq_attr "type" "sseadd")))) + "decoder0,p2+p1") + +(define_insn_reservation "ppro_sse_cmp_SF" 3 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "SF") + (eq_attr "type" "ssecmp")))) + "decoder0,p1") + +(define_insn_reservation "ppro_sse_cmp_SF_load" 3 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "SF") + (eq_attr "type" "ssecmp")))) + "decoder0,p2+p1") + +(define_insn_reservation "ppro_sse_comi_SF" 1 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "SF") + (eq_attr "type" "ssecomi")))) + "decodern,p0") + +(define_insn_reservation "ppro_sse_comi_SF_load" 1 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "SF") + (eq_attr "type" "ssecomi")))) + "decoder0,p2+p0") + +(define_insn_reservation "ppro_sse_mul_SF" 4 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "SF") + (eq_attr "type" "ssemul")))) + "decodern,p0") + +(define_insn_reservation "ppro_sse_mul_SF_load" 4 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "SF") + (eq_attr "type" "ssemul")))) + "decoder0,p2+p0") + +;; FIXME: ssediv doesn't close p0 for 17 cycles, surely??? 
+(define_insn_reservation "ppro_sse_div_SF" 18 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "SF") + (eq_attr "type" "ssediv")))) + "decoder0,p0*17") + +(define_insn_reservation "ppro_sse_div_SF_load" 18 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "SF") + (eq_attr "type" "ssediv")))) + "decoder0,(p2+p0),p0*16") + +(define_insn_reservation "ppro_sse_icvt_SF" 4 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "mode" "SF") + (eq_attr "type" "sseicvt"))) + "decoder0,(p2+p1)*2") + +(define_insn_reservation "ppro_sse_icvt_SI" 3 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "mode" "SI") + (eq_attr "type" "sseicvt"))) + "decoder0,(p2+p1)") + +(define_insn_reservation "ppro_sse_mov_SF" 3 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "SF") + (eq_attr "type" "ssemov")))) + "decoder0,(p0|p1)") + +(define_insn_reservation "ppro_sse_mov_SF_load" 3 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "SF") + (eq_attr "type" "ssemov")))) + "decoder0,p2+(p0|p1)") + +(define_insn_reservation "ppro_sse_mov_SF_store" 3 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "store") + (and (eq_attr "mode" "SF") + (eq_attr "type" "ssemov")))) + "decoder0,p4+p3") + +(define_insn_reservation "ppro_sse_V4SF" 4 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "mode" "V4SF") + (eq_attr "type" "sse"))) + "decoder0,p1*2") + +(define_insn_reservation "ppro_sse_add_V4SF" 3 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "V4SF") + (eq_attr "type" "sseadd")))) + "decoder0,p1*2") + +(define_insn_reservation "ppro_sse_add_V4SF_load" 3 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "V4SF") + (eq_attr "type" "sseadd")))) + "decoder0,(p2+p1)*2") + +(define_insn_reservation "ppro_sse_cmp_V4SF" 3 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "V4SF") + (eq_attr "type" "ssecmp")))) + "decoder0,p1*2") + +(define_insn_reservation "ppro_sse_cmp_V4SF_load" 3 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "V4SF") + (eq_attr "type" "ssecmp")))) + "decoder0,(p2+p1)*2") + +(define_insn_reservation "ppro_sse_cvt_V4SF" 3 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "none,unknown") + (and (eq_attr "mode" "V4SF") + (eq_attr "type" "ssecvt")))) + "decoder0,p1*2") + +(define_insn_reservation "ppro_sse_cvt_V4SF_other" 4 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "!none,unknown") + (and (eq_attr "mode" "V4SF") + (eq_attr "type" "ssecmp")))) + "decoder0,p1,p4+p3") + +(define_insn_reservation "ppro_sse_mul_V4SF" 5 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "V4SF") + (eq_attr "type" "ssemul")))) + "decoder0,p0*2") + +(define_insn_reservation "ppro_sse_mul_V4SF_load" 5 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "V4SF") + (eq_attr "type" "ssemul")))) + "decoder0,(p2+p0)*2") + +;; FIXME: p0 really closed this long??? 
+(define_insn_reservation "ppro_sse_div_V4SF" 48 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "V4SF") + (eq_attr "type" "ssediv")))) + "decoder0,p0*34") + +(define_insn_reservation "ppro_sse_div_V4SF_load" 48 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "V4SF") + (eq_attr "type" "ssediv")))) + "decoder0,(p2+p0)*2,p0*32") + +(define_insn_reservation "ppro_sse_log_V4SF" 2 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "V4SF") + (eq_attr "type" "sselog,sselog1")))) + "decodern,p1") + +(define_insn_reservation "ppro_sse_log_V4SF_load" 2 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "V4SF") + (eq_attr "type" "sselog,sselog1")))) + "decoder0,(p2+p1)") + +(define_insn_reservation "ppro_sse_mov_V4SF" 1 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "V4SF") + (eq_attr "type" "ssemov")))) + "decoder0,(p0|p1)*2") + +(define_insn_reservation "ppro_sse_mov_V4SF_load" 2 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "V4SF") + (eq_attr "type" "ssemov")))) + "decoder0,p2*2") + +(define_insn_reservation "ppro_sse_mov_V4SF_store" 3 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "store") + (and (eq_attr "mode" "V4SF") + (eq_attr "type" "ssemov")))) + "decoder0,(p4+p3)*2") + +;; All other instructions are modelled as simple instructions. +;; We have already modelled all i387 floating point instructions, so all +;; other instructions execute on either port 0 or port 1. This includes +;; the ALU units, and the MMX units. +;; +;; reg-reg instructions produce 1 uop so they can be decoded on any of +;; the three decoders. +(define_insn_reservation "ppro_insn" 1 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "none,unknown") + (eq_attr "type" "alu,alu1,negnot,incdec,icmp,test,setcc,icmov,push,pop,fxch,sseiadd,sseishft,sseishft1,sseimul,mmx,mmxadd,mmxcmp"))) + "decodern,(p0|p1)") + +;; read-modify and register-memory instructions have 2 or three uops, +;; so they have to be decoded on decoder0. +(define_insn_reservation "ppro_insn_load" 3 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "load") + (eq_attr "type" "alu,alu1,negnot,incdec,icmp,test,setcc,icmov,push,pop,fxch,sseiadd,sseishft,sseishft1,sseimul,mmx,mmxadd,mmxcmp"))) + "decoder0,p2+(p0|p1)") + +(define_insn_reservation "ppro_insn_store" 1 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "store") + (eq_attr "type" "alu,alu1,negnot,incdec,icmp,test,setcc,icmov,push,pop,fxch,sseiadd,sseishft,sseishft1,sseimul,mmx,mmxadd,mmxcmp"))) + "decoder0,(p0|p1),p4+p3") + +;; read-modify-store instructions produce 4 uops so they have to be +;; decoded on decoder0 as well. +(define_insn_reservation "ppro_insn_both" 4 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "both") + (eq_attr "type" "alu,alu1,negnot,incdec,icmp,test,setcc,icmov,push,pop,fxch,sseiadd,sseishft,sseishft1,sseimul,mmx,mmxadd,mmxcmp"))) + "decoder0,p2+(p0|p1),p4+p3") + diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md new file mode 100644 index 000000000..7cce9d4ad --- /dev/null +++ b/gcc/config/i386/predicates.md @@ -0,0 +1,1226 @@ +;; Predicate definitions for IA-32 and x86-64. +;; Copyright (C) 2004, 2005, 2006, 2007, 2008, 2009, 2010 +;; Free Software Foundation, Inc. +;; +;; This file is part of GCC. 
+;; +;; GCC is free software; you can redistribute it and/or modify +;; it under the terms of the GNU General Public License as published by +;; the Free Software Foundation; either version 3, or (at your option) +;; any later version. +;; +;; GCC is distributed in the hope that it will be useful, +;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;; GNU General Public License for more details. +;; +;; You should have received a copy of the GNU General Public License +;; along with GCC; see the file COPYING3. If not see +;; . + +;; Return true if OP is either a i387 or SSE fp register. +(define_predicate "any_fp_register_operand" + (and (match_code "reg") + (match_test "ANY_FP_REGNO_P (REGNO (op))"))) + +;; Return true if OP is an i387 fp register. +(define_predicate "fp_register_operand" + (and (match_code "reg") + (match_test "FP_REGNO_P (REGNO (op))"))) + +;; Return true if OP is a non-fp register_operand. +(define_predicate "register_and_not_any_fp_reg_operand" + (and (match_code "reg") + (not (match_test "ANY_FP_REGNO_P (REGNO (op))")))) + +;; Return true if OP is a register operand other than an i387 fp register. +(define_predicate "register_and_not_fp_reg_operand" + (and (match_code "reg") + (not (match_test "FP_REGNO_P (REGNO (op))")))) + +;; True if the operand is an MMX register. +(define_predicate "mmx_reg_operand" + (and (match_code "reg") + (match_test "MMX_REGNO_P (REGNO (op))"))) + +;; True if the operand is an SSE register. +(define_predicate "sse_reg_operand" + (and (match_code "reg") + (match_test "SSE_REGNO_P (REGNO (op))"))) + +;; True if the operand is a Q_REGS class register. +(define_predicate "q_regs_operand" + (match_operand 0 "register_operand") +{ + if (GET_CODE (op) == SUBREG) + op = SUBREG_REG (op); + return ANY_QI_REG_P (op); +}) + +;; Match an SI or HImode register for a zero_extract. +(define_special_predicate "ext_register_operand" + (match_operand 0 "register_operand") +{ + if ((!TARGET_64BIT || GET_MODE (op) != DImode) + && GET_MODE (op) != SImode && GET_MODE (op) != HImode) + return false; + if (GET_CODE (op) == SUBREG) + op = SUBREG_REG (op); + + /* Be careful to accept only registers having upper parts. */ + return (REG_P (op) + && (REGNO (op) > LAST_VIRTUAL_REGISTER || REGNO (op) <= BX_REG)); +}) + +;; Return true if op is the AX register. +(define_predicate "ax_reg_operand" + (and (match_code "reg") + (match_test "REGNO (op) == AX_REG"))) + +;; Return true if op is the flags register. +(define_predicate "flags_reg_operand" + (and (match_code "reg") + (match_test "REGNO (op) == FLAGS_REG"))) + +;; Return true if op is a QImode register operand other than +;; %[abcd][hl]. +(define_predicate "ext_QIreg_operand" + (and (match_code "reg") + (match_test "TARGET_64BIT + && GET_MODE (op) == QImode + && REGNO (op) > BX_REG"))) + +;; Similarly, but don't check mode of the operand. +(define_predicate "ext_QIreg_nomode_operand" + (and (match_code "reg") + (match_test "TARGET_64BIT + && REGNO (op) > BX_REG"))) + +;; Return true if op is not xmm0 register. +(define_predicate "reg_not_xmm0_operand" + (match_operand 0 "register_operand") +{ + if (GET_CODE (op) == SUBREG) + op = SUBREG_REG (op); + + return !REG_P (op) || REGNO (op) != FIRST_SSE_REG; +}) + +;; As above, but allow nonimmediate operands. 
+(define_predicate "nonimm_not_xmm0_operand" + (ior (match_operand 0 "memory_operand") + (match_operand 0 "reg_not_xmm0_operand"))) + +;; Return true if VALUE can be stored in a sign extended immediate field. +(define_predicate "x86_64_immediate_operand" + (match_code "const_int,symbol_ref,label_ref,const") +{ + if (!TARGET_64BIT) + return immediate_operand (op, mode); + + switch (GET_CODE (op)) + { + case CONST_INT: + /* CONST_DOUBLES never match, since HOST_BITS_PER_WIDE_INT is known + to be at least 32 and this all acceptable constants are + represented as CONST_INT. */ + if (HOST_BITS_PER_WIDE_INT == 32) + return true; + else + { + HOST_WIDE_INT val = trunc_int_for_mode (INTVAL (op), DImode); + return trunc_int_for_mode (val, SImode) == val; + } + break; + + case SYMBOL_REF: + /* For certain code models, the symbolic references are known to fit. + in CM_SMALL_PIC model we know it fits if it is local to the shared + library. Don't count TLS SYMBOL_REFs here, since they should fit + only if inside of UNSPEC handled below. */ + /* TLS symbols are not constant. */ + if (SYMBOL_REF_TLS_MODEL (op)) + return false; + return (ix86_cmodel == CM_SMALL || ix86_cmodel == CM_KERNEL + || (ix86_cmodel == CM_MEDIUM && !SYMBOL_REF_FAR_ADDR_P (op))); + + case LABEL_REF: + /* For certain code models, the code is near as well. */ + return (ix86_cmodel == CM_SMALL || ix86_cmodel == CM_MEDIUM + || ix86_cmodel == CM_KERNEL); + + case CONST: + /* We also may accept the offsetted memory references in certain + special cases. */ + if (GET_CODE (XEXP (op, 0)) == UNSPEC) + switch (XINT (XEXP (op, 0), 1)) + { + case UNSPEC_GOTPCREL: + case UNSPEC_DTPOFF: + case UNSPEC_GOTNTPOFF: + case UNSPEC_NTPOFF: + return true; + default: + break; + } + + if (GET_CODE (XEXP (op, 0)) == PLUS) + { + rtx op1 = XEXP (XEXP (op, 0), 0); + rtx op2 = XEXP (XEXP (op, 0), 1); + HOST_WIDE_INT offset; + + if (ix86_cmodel == CM_LARGE) + return false; + if (!CONST_INT_P (op2)) + return false; + offset = trunc_int_for_mode (INTVAL (op2), DImode); + switch (GET_CODE (op1)) + { + case SYMBOL_REF: + /* TLS symbols are not constant. */ + if (SYMBOL_REF_TLS_MODEL (op1)) + return false; + /* For CM_SMALL assume that latest object is 16MB before + end of 31bits boundary. We may also accept pretty + large negative constants knowing that all objects are + in the positive half of address space. */ + if ((ix86_cmodel == CM_SMALL + || (ix86_cmodel == CM_MEDIUM + && !SYMBOL_REF_FAR_ADDR_P (op1))) + && offset < 16*1024*1024 + && trunc_int_for_mode (offset, SImode) == offset) + return true; + /* For CM_KERNEL we know that all object resist in the + negative half of 32bits address space. We may not + accept negative offsets, since they may be just off + and we may accept pretty large positive ones. */ + if (ix86_cmodel == CM_KERNEL + && offset > 0 + && trunc_int_for_mode (offset, SImode) == offset) + return true; + break; + + case LABEL_REF: + /* These conditions are similar to SYMBOL_REF ones, just the + constraints for code models differ. 
*/ + if ((ix86_cmodel == CM_SMALL || ix86_cmodel == CM_MEDIUM) + && offset < 16*1024*1024 + && trunc_int_for_mode (offset, SImode) == offset) + return true; + if (ix86_cmodel == CM_KERNEL + && offset > 0 + && trunc_int_for_mode (offset, SImode) == offset) + return true; + break; + + case UNSPEC: + switch (XINT (op1, 1)) + { + case UNSPEC_DTPOFF: + case UNSPEC_NTPOFF: + if (offset > 0 + && trunc_int_for_mode (offset, SImode) == offset) + return true; + } + break; + + default: + break; + } + } + break; + + default: + gcc_unreachable (); + } + + return false; +}) + +;; Return true if VALUE can be stored in the zero extended immediate field. +(define_predicate "x86_64_zext_immediate_operand" + (match_code "const_double,const_int,symbol_ref,label_ref,const") +{ + switch (GET_CODE (op)) + { + case CONST_DOUBLE: + if (HOST_BITS_PER_WIDE_INT == 32) + return (GET_MODE (op) == VOIDmode && !CONST_DOUBLE_HIGH (op)); + else + return false; + + case CONST_INT: + if (HOST_BITS_PER_WIDE_INT == 32) + return INTVAL (op) >= 0; + else + return !(INTVAL (op) & ~(HOST_WIDE_INT) 0xffffffff); + + case SYMBOL_REF: + /* For certain code models, the symbolic references are known to fit. */ + /* TLS symbols are not constant. */ + if (SYMBOL_REF_TLS_MODEL (op)) + return false; + return (ix86_cmodel == CM_SMALL + || (ix86_cmodel == CM_MEDIUM + && !SYMBOL_REF_FAR_ADDR_P (op))); + + case LABEL_REF: + /* For certain code models, the code is near as well. */ + return ix86_cmodel == CM_SMALL || ix86_cmodel == CM_MEDIUM; + + case CONST: + /* We also may accept the offsetted memory references in certain + special cases. */ + if (GET_CODE (XEXP (op, 0)) == PLUS) + { + rtx op1 = XEXP (XEXP (op, 0), 0); + rtx op2 = XEXP (XEXP (op, 0), 1); + + if (ix86_cmodel == CM_LARGE) + return false; + switch (GET_CODE (op1)) + { + case SYMBOL_REF: + /* TLS symbols are not constant. */ + if (SYMBOL_REF_TLS_MODEL (op1)) + return false; + /* For small code model we may accept pretty large positive + offsets, since one bit is available for free. Negative + offsets are limited by the size of NULL pointer area + specified by the ABI. */ + if ((ix86_cmodel == CM_SMALL + || (ix86_cmodel == CM_MEDIUM + && !SYMBOL_REF_FAR_ADDR_P (op1))) + && CONST_INT_P (op2) + && trunc_int_for_mode (INTVAL (op2), DImode) > -0x10000 + && trunc_int_for_mode (INTVAL (op2), SImode) == INTVAL (op2)) + return true; + /* ??? For the kernel, we may accept adjustment of + -0x10000000, since we know that it will just convert + negative address space to positive, but perhaps this + is not worthwhile. */ + break; + + case LABEL_REF: + /* These conditions are similar to SYMBOL_REF ones, just the + constraints for code models differ. */ + if ((ix86_cmodel == CM_SMALL || ix86_cmodel == CM_MEDIUM) + && CONST_INT_P (op2) + && trunc_int_for_mode (INTVAL (op2), DImode) > -0x10000 + && trunc_int_for_mode (INTVAL (op2), SImode) == INTVAL (op2)) + return true; + break; + + default: + return false; + } + } + break; + + default: + gcc_unreachable (); + } + return false; +}) + +;; Return true if OP is general operand representable on x86_64. +(define_predicate "x86_64_general_operand" + (if_then_else (match_test "TARGET_64BIT") + (ior (match_operand 0 "nonimmediate_operand") + (match_operand 0 "x86_64_immediate_operand")) + (match_operand 0 "general_operand"))) + +;; Return true if OP is general operand representable on x86_64 +;; as either sign extended or zero extended constant. 
+(define_predicate "x86_64_szext_general_operand" + (if_then_else (match_test "TARGET_64BIT") + (ior (match_operand 0 "nonimmediate_operand") + (match_operand 0 "x86_64_immediate_operand") + (match_operand 0 "x86_64_zext_immediate_operand")) + (match_operand 0 "general_operand"))) + +;; Return true if OP is nonmemory operand representable on x86_64. +(define_predicate "x86_64_nonmemory_operand" + (if_then_else (match_test "TARGET_64BIT") + (ior (match_operand 0 "register_operand") + (match_operand 0 "x86_64_immediate_operand")) + (match_operand 0 "nonmemory_operand"))) + +;; Return true if OP is nonmemory operand representable on x86_64. +(define_predicate "x86_64_szext_nonmemory_operand" + (if_then_else (match_test "TARGET_64BIT") + (ior (match_operand 0 "register_operand") + (match_operand 0 "x86_64_immediate_operand") + (match_operand 0 "x86_64_zext_immediate_operand")) + (match_operand 0 "nonmemory_operand"))) + +;; Return true when operand is PIC expression that can be computed by lea +;; operation. +(define_predicate "pic_32bit_operand" + (match_code "const,symbol_ref,label_ref") +{ + if (!flag_pic) + return false; + /* Rule out relocations that translate into 64bit constants. */ + if (TARGET_64BIT && GET_CODE (op) == CONST) + { + op = XEXP (op, 0); + if (GET_CODE (op) == PLUS && CONST_INT_P (XEXP (op, 1))) + op = XEXP (op, 0); + if (GET_CODE (op) == UNSPEC + && (XINT (op, 1) == UNSPEC_GOTOFF + || XINT (op, 1) == UNSPEC_GOT)) + return false; + } + return symbolic_operand (op, mode); +}) + + +;; Return true if OP is nonmemory operand acceptable by movabs patterns. +(define_predicate "x86_64_movabs_operand" + (if_then_else (match_test "!TARGET_64BIT || !flag_pic") + (match_operand 0 "nonmemory_operand") + (ior (match_operand 0 "register_operand") + (and (match_operand 0 "const_double_operand") + (match_test "GET_MODE_SIZE (mode) <= 8"))))) + +;; Return true if OP is either a symbol reference or a sum of a symbol +;; reference and a constant. +(define_predicate "symbolic_operand" + (match_code "symbol_ref,label_ref,const") +{ + switch (GET_CODE (op)) + { + case SYMBOL_REF: + case LABEL_REF: + return true; + + case CONST: + op = XEXP (op, 0); + if (GET_CODE (op) == SYMBOL_REF + || GET_CODE (op) == LABEL_REF + || (GET_CODE (op) == UNSPEC + && (XINT (op, 1) == UNSPEC_GOT + || XINT (op, 1) == UNSPEC_GOTOFF + || XINT (op, 1) == UNSPEC_GOTPCREL))) + return true; + if (GET_CODE (op) != PLUS + || !CONST_INT_P (XEXP (op, 1))) + return false; + + op = XEXP (op, 0); + if (GET_CODE (op) == SYMBOL_REF + || GET_CODE (op) == LABEL_REF) + return true; + /* Only @GOTOFF gets offsets. */ + if (GET_CODE (op) != UNSPEC + || XINT (op, 1) != UNSPEC_GOTOFF) + return false; + + op = XVECEXP (op, 0, 0); + if (GET_CODE (op) == SYMBOL_REF + || GET_CODE (op) == LABEL_REF) + return true; + return false; + + default: + gcc_unreachable (); + } +}) + +;; Return true if OP is a symbolic operand that resolves locally. 
+(define_predicate "local_symbolic_operand" + (match_code "const,label_ref,symbol_ref") +{ + if (GET_CODE (op) == CONST + && GET_CODE (XEXP (op, 0)) == PLUS + && CONST_INT_P (XEXP (XEXP (op, 0), 1))) + op = XEXP (XEXP (op, 0), 0); + + if (GET_CODE (op) == LABEL_REF) + return true; + + if (GET_CODE (op) != SYMBOL_REF) + return false; + + if (SYMBOL_REF_TLS_MODEL (op)) + return false; + + if (SYMBOL_REF_LOCAL_P (op)) + return true; + + /* There is, however, a not insubstantial body of code in the rest of + the compiler that assumes it can just stick the results of + ASM_GENERATE_INTERNAL_LABEL in a symbol_ref and have done. */ + /* ??? This is a hack. Should update the body of the compiler to + always create a DECL an invoke targetm.encode_section_info. */ + if (strncmp (XSTR (op, 0), internal_label_prefix, + internal_label_prefix_len) == 0) + return true; + + return false; +}) + +;; Test for a legitimate @GOTOFF operand. +;; +;; VxWorks does not impose a fixed gap between segments; the run-time +;; gap can be different from the object-file gap. We therefore can't +;; use @GOTOFF unless we are absolutely sure that the symbol is in the +;; same segment as the GOT. Unfortunately, the flexibility of linker +;; scripts means that we can't be sure of that in general, so assume +;; that @GOTOFF is never valid on VxWorks. +(define_predicate "gotoff_operand" + (and (match_test "!TARGET_VXWORKS_RTP") + (match_operand 0 "local_symbolic_operand"))) + +;; Test for various thread-local symbols. +(define_predicate "tls_symbolic_operand" + (and (match_code "symbol_ref") + (match_test "SYMBOL_REF_TLS_MODEL (op)"))) + +(define_predicate "tls_modbase_operand" + (and (match_code "symbol_ref") + (match_test "op == ix86_tls_module_base ()"))) + +(define_predicate "tp_or_register_operand" + (ior (match_operand 0 "register_operand") + (and (match_code "unspec") + (match_test "XINT (op, 1) == UNSPEC_TP")))) + +;; Test for a pc-relative call operand +(define_predicate "constant_call_address_operand" + (match_code "symbol_ref") +{ + if (ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC) + return false; + if (TARGET_DLLIMPORT_DECL_ATTRIBUTES && SYMBOL_REF_DLLIMPORT_P (op)) + return false; + return true; +}) + +;; P6 processors will jump to the address after the decrement when %esp +;; is used as a call operand, so they will execute return address as a code. +;; See Pentium Pro errata 70, Pentium 2 errata A33 and Pentium 3 errata E17. + +(define_predicate "call_register_no_elim_operand" + (match_operand 0 "register_operand") +{ + if (GET_CODE (op) == SUBREG) + op = SUBREG_REG (op); + + if (!TARGET_64BIT && op == stack_pointer_rtx) + return false; + + return register_no_elim_operand (op, mode); +}) + +;; True for any non-virtual or eliminable register. Used in places where +;; instantiation of such a register may cause the pattern to not be recognized. +(define_predicate "register_no_elim_operand" + (match_operand 0 "register_operand") +{ + if (GET_CODE (op) == SUBREG) + op = SUBREG_REG (op); + return !(op == arg_pointer_rtx + || op == frame_pointer_rtx + || IN_RANGE (REGNO (op), + FIRST_PSEUDO_REGISTER, LAST_VIRTUAL_REGISTER)); +}) + +;; Similarly, but include the stack pointer. This is used to prevent esp +;; from being used as an index reg. 
+(define_predicate "index_register_operand" + (match_operand 0 "register_operand") +{ + if (GET_CODE (op) == SUBREG) + op = SUBREG_REG (op); + if (reload_in_progress || reload_completed) + return REG_OK_FOR_INDEX_STRICT_P (op); + else + return REG_OK_FOR_INDEX_NONSTRICT_P (op); +}) + +;; Return false if this is any eliminable register. Otherwise general_operand. +(define_predicate "general_no_elim_operand" + (if_then_else (match_code "reg,subreg") + (match_operand 0 "register_no_elim_operand") + (match_operand 0 "general_operand"))) + +;; Return false if this is any eliminable register. Otherwise +;; register_operand or a constant. +(define_predicate "nonmemory_no_elim_operand" + (ior (match_operand 0 "register_no_elim_operand") + (match_operand 0 "immediate_operand"))) + +;; Test for a valid operand for a call instruction. +(define_predicate "call_insn_operand" + (ior (match_operand 0 "constant_call_address_operand") + (match_operand 0 "call_register_no_elim_operand") + (match_operand 0 "memory_operand"))) + +;; Similarly, but for tail calls, in which we cannot allow memory references. +(define_predicate "sibcall_insn_operand" + (ior (match_operand 0 "constant_call_address_operand") + (match_operand 0 "register_no_elim_operand"))) + +;; Match exactly zero. +(define_predicate "const0_operand" + (match_code "const_int,const_double,const_vector") +{ + if (mode == VOIDmode) + mode = GET_MODE (op); + return op == CONST0_RTX (mode); +}) + +;; Match exactly one. +(define_predicate "const1_operand" + (and (match_code "const_int") + (match_test "op == const1_rtx"))) + +;; Match exactly eight. +(define_predicate "const8_operand" + (and (match_code "const_int") + (match_test "INTVAL (op) == 8"))) + +;; Match exactly 128. +(define_predicate "const128_operand" + (and (match_code "const_int") + (match_test "INTVAL (op) == 128"))) + +;; Match 2, 4, or 8. Used for leal multiplicands. +(define_predicate "const248_operand" + (match_code "const_int") +{ + HOST_WIDE_INT i = INTVAL (op); + return i == 2 || i == 4 || i == 8; +}) + +;; Match 0 or 1. +(define_predicate "const_0_to_1_operand" + (and (match_code "const_int") + (match_test "op == const0_rtx || op == const1_rtx"))) + +;; Match 0 to 3. +(define_predicate "const_0_to_3_operand" + (and (match_code "const_int") + (match_test "IN_RANGE (INTVAL (op), 0, 3)"))) + +;; Match 0 to 7. +(define_predicate "const_0_to_7_operand" + (and (match_code "const_int") + (match_test "IN_RANGE (INTVAL (op), 0, 7)"))) + +;; Match 0 to 15. +(define_predicate "const_0_to_15_operand" + (and (match_code "const_int") + (match_test "IN_RANGE (INTVAL (op), 0, 15)"))) + +;; Match 0 to 31. +(define_predicate "const_0_to_31_operand" + (and (match_code "const_int") + (match_test "IN_RANGE (INTVAL (op), 0, 31)"))) + +;; Match 0 to 63. +(define_predicate "const_0_to_63_operand" + (and (match_code "const_int") + (match_test "IN_RANGE (INTVAL (op), 0, 63)"))) + +;; Match 0 to 255. +(define_predicate "const_0_to_255_operand" + (and (match_code "const_int") + (match_test "IN_RANGE (INTVAL (op), 0, 255)"))) + +;; Match (0 to 255) * 8 +(define_predicate "const_0_to_255_mul_8_operand" + (match_code "const_int") +{ + unsigned HOST_WIDE_INT val = INTVAL (op); + return val <= 255*8 && val % 8 == 0; +}) + +;; Return true if OP is CONST_INT >= 1 and <= 31 (a valid operand +;; for shift & compare patterns, as shifting by 0 does not change flags). 
+(define_predicate "const_1_to_31_operand" + (and (match_code "const_int") + (match_test "IN_RANGE (INTVAL (op), 1, 31)"))) + +;; Return true if OP is CONST_INT >= 1 and <= 63 (a valid operand +;; for 64bit shift & compare patterns, as shifting by 0 does not change flags). +(define_predicate "const_1_to_63_operand" + (and (match_code "const_int") + (match_test "IN_RANGE (INTVAL (op), 1, 63)"))) + +;; Match 2 or 3. +(define_predicate "const_2_to_3_operand" + (and (match_code "const_int") + (match_test "IN_RANGE (INTVAL (op), 2, 3)"))) + +;; Match 4 to 5. +(define_predicate "const_4_to_5_operand" + (and (match_code "const_int") + (match_test "IN_RANGE (INTVAL (op), 4, 5)"))) + +;; Match 4 to 7. +(define_predicate "const_4_to_7_operand" + (and (match_code "const_int") + (match_test "IN_RANGE (INTVAL (op), 4, 7)"))) + +;; Match 6 to 7. +(define_predicate "const_6_to_7_operand" + (and (match_code "const_int") + (match_test "IN_RANGE (INTVAL (op), 6, 7)"))) + +;; Match 8 to 11. +(define_predicate "const_8_to_11_operand" + (and (match_code "const_int") + (match_test "IN_RANGE (INTVAL (op), 8, 11)"))) + +;; Match 12 to 15. +(define_predicate "const_12_to_15_operand" + (and (match_code "const_int") + (match_test "IN_RANGE (INTVAL (op), 12, 15)"))) + +;; Match exactly one bit in 2-bit mask. +(define_predicate "const_pow2_1_to_2_operand" + (and (match_code "const_int") + (match_test "INTVAL (op) == 1 || INTVAL (op) == 2"))) + +;; Match exactly one bit in 4-bit mask. +(define_predicate "const_pow2_1_to_8_operand" + (match_code "const_int") +{ + unsigned int log = exact_log2 (INTVAL (op)); + return log <= 3; +}) + +;; Match exactly one bit in 8-bit mask. +(define_predicate "const_pow2_1_to_128_operand" + (match_code "const_int") +{ + unsigned int log = exact_log2 (INTVAL (op)); + return log <= 7; +}) + +;; Match exactly one bit in 16-bit mask. +(define_predicate "const_pow2_1_to_32768_operand" + (match_code "const_int") +{ + unsigned int log = exact_log2 (INTVAL (op)); + return log <= 15; +}) + +;; True if this is a constant appropriate for an increment or decrement. +(define_predicate "incdec_operand" + (match_code "const_int") +{ + /* On Pentium4, the inc and dec operations causes extra dependency on flag + registers, since carry flag is not set. */ + if (!TARGET_USE_INCDEC && !optimize_insn_for_size_p ()) + return false; + return op == const1_rtx || op == constm1_rtx; +}) + +;; True for registers, or 1 or -1. Used to optimize double-word shifts. +(define_predicate "reg_or_pm1_operand" + (ior (match_operand 0 "register_operand") + (and (match_code "const_int") + (match_test "op == const1_rtx || op == constm1_rtx")))) + +;; True if OP is acceptable as operand of DImode shift expander. +(define_predicate "shiftdi_operand" + (if_then_else (match_test "TARGET_64BIT") + (match_operand 0 "nonimmediate_operand") + (match_operand 0 "register_operand"))) + +(define_predicate "ashldi_input_operand" + (if_then_else (match_test "TARGET_64BIT") + (match_operand 0 "nonimmediate_operand") + (match_operand 0 "reg_or_pm1_operand"))) + +;; Return true if OP is a vector load from the constant pool with just +;; the first element nonzero. 
+(define_predicate "zero_extended_scalar_load_operand" + (match_code "mem") +{ + unsigned n_elts; + op = maybe_get_pool_constant (op); + + if (!(op && GET_CODE (op) == CONST_VECTOR)) + return false; + + n_elts = CONST_VECTOR_NUNITS (op); + + for (n_elts--; n_elts > 0; n_elts--) + { + rtx elt = CONST_VECTOR_ELT (op, n_elts); + if (elt != CONST0_RTX (GET_MODE_INNER (GET_MODE (op)))) + return false; + } + return true; +}) + +/* Return true if operand is a vector constant that is all ones. */ +(define_predicate "vector_all_ones_operand" + (match_code "const_vector") +{ + int nunits = GET_MODE_NUNITS (mode); + + if (GET_CODE (op) == CONST_VECTOR + && CONST_VECTOR_NUNITS (op) == nunits) + { + int i; + for (i = 0; i < nunits; ++i) + { + rtx x = CONST_VECTOR_ELT (op, i); + if (x != constm1_rtx) + return false; + } + return true; + } + + return false; +}) + +; Return true when OP is operand acceptable for standard SSE move. +(define_predicate "vector_move_operand" + (ior (match_operand 0 "nonimmediate_operand") + (match_operand 0 "const0_operand"))) + +;; Return true when OP is nonimmediate or standard SSE constant. +(define_predicate "nonimmediate_or_sse_const_operand" + (match_operand 0 "general_operand") +{ + if (nonimmediate_operand (op, mode)) + return true; + if (standard_sse_constant_p (op) > 0) + return true; + return false; +}) + +;; Return true if OP is a register or a zero. +(define_predicate "reg_or_0_operand" + (ior (match_operand 0 "register_operand") + (match_operand 0 "const0_operand"))) + +;; Return true if op if a valid address, and does not contain +;; a segment override. +(define_special_predicate "no_seg_address_operand" + (match_operand 0 "address_operand") +{ + struct ix86_address parts; + int ok; + + ok = ix86_decompose_address (op, &parts); + gcc_assert (ok); + return parts.seg == SEG_DEFAULT; +}) + +;; Return true if the rtx is known to be at least 32 bits aligned. +(define_predicate "aligned_operand" + (match_operand 0 "general_operand") +{ + struct ix86_address parts; + int ok; + + /* Registers and immediate operands are always "aligned". */ + if (!MEM_P (op)) + return true; + + /* All patterns using aligned_operand on memory operands ends up + in promoting memory operand to 64bit and thus causing memory mismatch. */ + if (TARGET_MEMORY_MISMATCH_STALL && !optimize_insn_for_size_p ()) + return false; + + /* Don't even try to do any aligned optimizations with volatiles. */ + if (MEM_VOLATILE_P (op)) + return false; + + if (MEM_ALIGN (op) >= 32) + return true; + + op = XEXP (op, 0); + + /* Pushes and pops are only valid on the stack pointer. */ + if (GET_CODE (op) == PRE_DEC + || GET_CODE (op) == POST_INC) + return true; + + /* Decode the address. */ + ok = ix86_decompose_address (op, &parts); + gcc_assert (ok); + + /* Look for some component that isn't known to be aligned. */ + if (parts.index) + { + if (REGNO_POINTER_ALIGN (REGNO (parts.index)) * parts.scale < 32) + return false; + } + if (parts.base) + { + if (REGNO_POINTER_ALIGN (REGNO (parts.base)) < 32) + return false; + } + if (parts.disp) + { + if (!CONST_INT_P (parts.disp) + || (INTVAL (parts.disp) & 3)) + return false; + } + + /* Didn't find one -- this must be an aligned address. */ + return true; +}) + +;; Return true if OP is memory operand with a displacement. 
+(define_predicate "memory_displacement_operand" + (match_operand 0 "memory_operand") +{ + struct ix86_address parts; + int ok; + + ok = ix86_decompose_address (XEXP (op, 0), &parts); + gcc_assert (ok); + return parts.disp != NULL_RTX; +}) + +;; Return true if OP is memory operand with a displacement only. +(define_predicate "memory_displacement_only_operand" + (match_operand 0 "memory_operand") +{ + struct ix86_address parts; + int ok; + + if (TARGET_64BIT) + return false; + + ok = ix86_decompose_address (XEXP (op, 0), &parts); + gcc_assert (ok); + + if (parts.base || parts.index) + return false; + + return parts.disp != NULL_RTX; +}) + +;; Return true if OP is memory operand which will need zero or +;; one register at most, not counting stack pointer or frame pointer. +(define_predicate "cmpxchg8b_pic_memory_operand" + (match_operand 0 "memory_operand") +{ + struct ix86_address parts; + int ok; + + ok = ix86_decompose_address (XEXP (op, 0), &parts); + gcc_assert (ok); + if (parts.base == NULL_RTX + || parts.base == arg_pointer_rtx + || parts.base == frame_pointer_rtx + || parts.base == hard_frame_pointer_rtx + || parts.base == stack_pointer_rtx) + return true; + + if (parts.index == NULL_RTX + || parts.index == arg_pointer_rtx + || parts.index == frame_pointer_rtx + || parts.index == hard_frame_pointer_rtx + || parts.index == stack_pointer_rtx) + return true; + + return false; +}) + + +;; Return true if OP is memory operand that cannot be represented +;; by the modRM array. +(define_predicate "long_memory_operand" + (and (match_operand 0 "memory_operand") + (match_test "memory_address_length (op)"))) + +;; Return true if OP is a comparison operator that can be issued by fcmov. +(define_predicate "fcmov_comparison_operator" + (match_operand 0 "comparison_operator") +{ + enum machine_mode inmode = GET_MODE (XEXP (op, 0)); + enum rtx_code code = GET_CODE (op); + + if (inmode == CCFPmode || inmode == CCFPUmode) + { + if (!ix86_trivial_fp_comparison_operator (op, mode)) + return false; + code = ix86_fp_compare_code_to_integer (code); + } + /* i387 supports just limited amount of conditional codes. */ + switch (code) + { + case LTU: case GTU: case LEU: case GEU: + if (inmode == CCmode || inmode == CCFPmode || inmode == CCFPUmode + || inmode == CCCmode) + return true; + return false; + case ORDERED: case UNORDERED: + case EQ: case NE: + return true; + default: + return false; + } +}) + +;; Return true if OP is a comparison that can be used in the CMPSS/CMPPS insns. +;; The first set are supported directly; the second set can't be done with +;; full IEEE support, i.e. NaNs. + +(define_predicate "sse_comparison_operator" + (match_code "eq,lt,le,unordered,ne,unge,ungt,ordered")) + +;; Return true if OP is a comparison operator that can be issued by +;; avx predicate generation instructions +(define_predicate "avx_comparison_float_operator" + (match_code "ne,eq,ge,gt,le,lt,unordered,ordered,uneq,unge,ungt,unle,unlt,ltgt")) + +(define_predicate "ix86_comparison_int_operator" + (match_code "ne,eq,ge,gt,le,lt")) + +(define_predicate "ix86_comparison_uns_operator" + (match_code "ne,eq,geu,gtu,leu,ltu")) + +(define_predicate "bt_comparison_operator" + (match_code "ne,eq")) + +;; Return true if OP is a valid comparison operator in valid mode. 
+(define_predicate "ix86_comparison_operator" + (match_operand 0 "comparison_operator") +{ + enum machine_mode inmode = GET_MODE (XEXP (op, 0)); + enum rtx_code code = GET_CODE (op); + + if (inmode == CCFPmode || inmode == CCFPUmode) + return ix86_trivial_fp_comparison_operator (op, mode); + + switch (code) + { + case EQ: case NE: + return true; + case LT: case GE: + if (inmode == CCmode || inmode == CCGCmode + || inmode == CCGOCmode || inmode == CCNOmode) + return true; + return false; + case LTU: case GTU: case LEU: case GEU: + if (inmode == CCmode || inmode == CCCmode) + return true; + return false; + case ORDERED: case UNORDERED: + if (inmode == CCmode) + return true; + return false; + case GT: case LE: + if (inmode == CCmode || inmode == CCGCmode || inmode == CCNOmode) + return true; + return false; + default: + return false; + } +}) + +;; Return true if OP is a valid comparison operator +;; testing carry flag to be set. +(define_predicate "ix86_carry_flag_operator" + (match_code "ltu,lt,unlt,gtu,gt,ungt,le,unle,ge,unge,ltgt,uneq") +{ + enum machine_mode inmode = GET_MODE (XEXP (op, 0)); + enum rtx_code code = GET_CODE (op); + + if (inmode == CCFPmode || inmode == CCFPUmode) + { + if (!ix86_trivial_fp_comparison_operator (op, mode)) + return false; + code = ix86_fp_compare_code_to_integer (code); + } + else if (inmode == CCCmode) + return code == LTU || code == GTU; + else if (inmode != CCmode) + return false; + + return code == LTU; +}) + +;; Return true if this comparison only requires testing one flag bit. +(define_predicate "ix86_trivial_fp_comparison_operator" + (match_code "gt,ge,unlt,unle,uneq,ltgt,ordered,unordered")) + +;; Return true if we know how to do this comparison. Others require +;; testing more than one flag bit, and we let the generic middle-end +;; code do that. +(define_predicate "ix86_fp_comparison_operator" + (if_then_else (match_test "ix86_fp_comparison_strategy (GET_CODE (op)) + == IX86_FPCMP_ARITH") + (match_operand 0 "comparison_operator") + (match_operand 0 "ix86_trivial_fp_comparison_operator"))) + +;; Same as above, but for swapped comparison used in fp_jcc_4_387. +(define_predicate "ix86_swapped_fp_comparison_operator" + (match_operand 0 "comparison_operator") +{ + enum rtx_code code = GET_CODE (op); + bool ret; + + PUT_CODE (op, swap_condition (code)); + ret = ix86_fp_comparison_operator (op, mode); + PUT_CODE (op, code); + return ret; +}) + +;; Nearly general operand, but accept any const_double, since we wish +;; to be able to drop them into memory rather than have them get pulled +;; into registers. +(define_predicate "cmp_fp_expander_operand" + (ior (match_code "const_double") + (match_operand 0 "general_operand"))) + +;; Return true if this is a valid binary floating-point operation. +(define_predicate "binary_fp_operator" + (match_code "plus,minus,mult,div")) + +;; Return true if this is a multiply operation. +(define_predicate "mult_operator" + (match_code "mult")) + +;; Return true if this is a division operation. +(define_predicate "div_operator" + (match_code "div")) + +;; Return true if this is a float extend operation. +(define_predicate "float_operator" + (match_code "float")) + +;; Return true for ARITHMETIC_P. +(define_predicate "arith_or_logical_operator" + (match_code "plus,mult,and,ior,xor,smin,smax,umin,umax,compare,minus,div, + mod,udiv,umod,ashift,rotate,ashiftrt,lshiftrt,rotatert")) + +;; Return true for COMMUTATIVE_P. 
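The ix86_carry_flag_operator predicate above accepts LTU (and LTU/GTU in CCCmode) because unsigned below-comparisons map directly onto the carry flag. A user-level illustration of source that produces such a comparison is sketched below; names are illustrative and the generated code depends on target and optimization options.

    /* The "s < a" test is an unsigned LTU compare of the sum against an
       input, which is exactly the carry out of the addition; on x86 it
       can be read straight from CF (for example with setc or adc).  */
    static inline unsigned
    add_with_carry_out (unsigned a, unsigned b, unsigned *carry)
    {
      unsigned s = a + b;
      *carry = (s < a);
      return s;
    }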
+(define_predicate "commutative_operator" + (match_code "plus,mult,and,ior,xor,smin,smax,umin,umax")) + +;; Return true if OP is a binary operator that can be promoted to wider mode. +(define_predicate "promotable_binary_operator" + (ior (match_code "plus,and,ior,xor,ashift") + (and (match_code "mult") + (match_test "TARGET_TUNE_PROMOTE_HIMODE_IMUL")))) + +(define_predicate "compare_operator" + (match_code "compare")) + +(define_predicate "absneg_operator" + (match_code "abs,neg")) + +;; Return true if OP is misaligned memory operand +(define_predicate "misaligned_operand" + (and (match_code "mem") + (match_test "MEM_ALIGN (op) < GET_MODE_ALIGNMENT (mode)"))) + +;; Return true if OP is a emms operation, known to be a PARALLEL. +(define_predicate "emms_operation" + (match_code "parallel") +{ + unsigned i; + + if (XVECLEN (op, 0) != 17) + return false; + + for (i = 0; i < 8; i++) + { + rtx elt = XVECEXP (op, 0, i+1); + + if (GET_CODE (elt) != CLOBBER + || GET_CODE (SET_DEST (elt)) != REG + || GET_MODE (SET_DEST (elt)) != XFmode + || REGNO (SET_DEST (elt)) != FIRST_STACK_REG + i) + return false; + + elt = XVECEXP (op, 0, i+9); + + if (GET_CODE (elt) != CLOBBER + || GET_CODE (SET_DEST (elt)) != REG + || GET_MODE (SET_DEST (elt)) != DImode + || REGNO (SET_DEST (elt)) != FIRST_MMX_REG + i) + return false; + } + return true; +}) + +;; Return true if OP is a vzeroall operation, known to be a PARALLEL. +(define_predicate "vzeroall_operation" + (match_code "parallel") +{ + unsigned i, nregs = TARGET_64BIT ? 16 : 8; + + if ((unsigned) XVECLEN (op, 0) != 1 + nregs) + return false; + + for (i = 0; i < nregs; i++) + { + rtx elt = XVECEXP (op, 0, i+1); + + if (GET_CODE (elt) != SET + || GET_CODE (SET_DEST (elt)) != REG + || GET_MODE (SET_DEST (elt)) != V8SImode + || REGNO (SET_DEST (elt)) != SSE_REGNO (i) + || SET_SRC (elt) != CONST0_RTX (V8SImode)) + return false; + } + return true; +}) + +;; Return true if OP is a parallel for a vpermilp[ds] permute. +;; ??? It would be much easier if the PARALLEL for a VEC_SELECT +;; had a mode, but it doesn't. So we have 4 copies and install +;; the mode by hand. + +(define_predicate "avx_vpermilp_v8sf_operand" + (and (match_code "parallel") + (match_test "avx_vpermilp_parallel (op, V8SFmode)"))) + +(define_predicate "avx_vpermilp_v4df_operand" + (and (match_code "parallel") + (match_test "avx_vpermilp_parallel (op, V4DFmode)"))) + +(define_predicate "avx_vpermilp_v4sf_operand" + (and (match_code "parallel") + (match_test "avx_vpermilp_parallel (op, V4SFmode)"))) + +(define_predicate "avx_vpermilp_v2df_operand" + (and (match_code "parallel") + (match_test "avx_vpermilp_parallel (op, V2DFmode)"))) + +;; Return true if OP is a parallel for a vperm2f128 permute. + +(define_predicate "avx_vperm2f128_v8sf_operand" + (and (match_code "parallel") + (match_test "avx_vperm2f128_parallel (op, V8SFmode)"))) + +(define_predicate "avx_vperm2f128_v8si_operand" + (and (match_code "parallel") + (match_test "avx_vperm2f128_parallel (op, V8SImode)"))) + +(define_predicate "avx_vperm2f128_v4df_operand" + (and (match_code "parallel") + (match_test "avx_vperm2f128_parallel (op, V4DFmode)"))) + +;; Return true if OP is a parallel for a vbroadcast permute. + +(define_predicate "avx_vbroadcast_operand" + (and (match_code "parallel") + (match_code "const_int" "a")) +{ + rtx elt = XVECEXP (op, 0, 0); + int i, nelt = XVECLEN (op, 0); + + /* Don't bother checking there are the right number of operands, + merely that they're all identical. 
*/ + for (i = 1; i < nelt; ++i) + if (XVECEXP (op, 0, i) != elt) + return false; + return true; +}) diff --git a/gcc/config/i386/rtemself.h b/gcc/config/i386/rtemself.h new file mode 100644 index 000000000..ac492ec35 --- /dev/null +++ b/gcc/config/i386/rtemself.h @@ -0,0 +1,32 @@ +/* Definitions for rtems targeting an ix86 using ELF. + Copyright (C) 1996, 1997, 2000, 2001, 2002, 2007 Free Software Foundation, Inc. + Contributed by Joel Sherrill (joel@OARcorp.com). + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. */ + +/* Specify predefined symbols in preprocessor. */ + +#define TARGET_OS_CPP_BUILTINS() \ + do \ + { \ + builtin_define ("__rtems__"); \ + builtin_define ("__USE_INIT_FINI__"); \ + builtin_assert ("system=rtems"); \ + if (!TARGET_80387) \ + builtin_define ("_SOFT_FLOAT"); \ + } \ + while (0) diff --git a/gcc/config/i386/sfp-machine.h b/gcc/config/i386/sfp-machine.h new file mode 100644 index 000000000..f2df86965 --- /dev/null +++ b/gcc/config/i386/sfp-machine.h @@ -0,0 +1,5 @@ +#ifdef __x86_64__ +#include "config/i386/64/sfp-machine.h" +#else +#include "config/i386/32/sfp-machine.h" +#endif diff --git a/gcc/config/i386/smmintrin.h b/gcc/config/i386/smmintrin.h new file mode 100644 index 000000000..e12c56a17 --- /dev/null +++ b/gcc/config/i386/smmintrin.h @@ -0,0 +1,831 @@ +/* Copyright (C) 2007, 2008, 2009, 2010 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . + + +/* Implemented from the specification included in the Intel C++ Compiler + User Guide and Reference, version 10.0. */ + +#ifndef _SMMINTRIN_H_INCLUDED +#define _SMMINTRIN_H_INCLUDED + +#ifndef __SSE4_1__ +# error "SSE4.1 instruction set not enabled" +#else + +/* We need definitions from the SSSE3, SSE3, SSE2 and SSE header + files. */ +#include + +/* Rounding mode macros. 
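The #error guard above means this header is only usable when SSE4.1 code generation is enabled. A minimal user translation unit might look like the sketch below, assuming a GCC invocation such as "gcc -O2 -msse4.1 -c foo.c" (or an -march setting that implies SSE4.1) so that __SSE4_1__ is predefined; file and function names are illustrative.

    /* foo.c */
    #include <smmintrin.h>

    __m128i
    zero128 (void)
    {
      return _mm_setzero_si128 ();
    }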
*/ +#define _MM_FROUND_TO_NEAREST_INT 0x00 +#define _MM_FROUND_TO_NEG_INF 0x01 +#define _MM_FROUND_TO_POS_INF 0x02 +#define _MM_FROUND_TO_ZERO 0x03 +#define _MM_FROUND_CUR_DIRECTION 0x04 + +#define _MM_FROUND_RAISE_EXC 0x00 +#define _MM_FROUND_NO_EXC 0x08 + +#define _MM_FROUND_NINT \ + (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC) +#define _MM_FROUND_FLOOR \ + (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC) +#define _MM_FROUND_CEIL \ + (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC) +#define _MM_FROUND_TRUNC \ + (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC) +#define _MM_FROUND_RINT \ + (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC) +#define _MM_FROUND_NEARBYINT \ + (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC) + +/* Test Instruction */ +/* Packed integer 128-bit bitwise comparison. Return 1 if + (__V & __M) == 0. */ +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_testz_si128 (__m128i __M, __m128i __V) +{ + return __builtin_ia32_ptestz128 ((__v2di)__M, (__v2di)__V); +} + +/* Packed integer 128-bit bitwise comparison. Return 1 if + (__V & ~__M) == 0. */ +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_testc_si128 (__m128i __M, __m128i __V) +{ + return __builtin_ia32_ptestc128 ((__v2di)__M, (__v2di)__V); +} + +/* Packed integer 128-bit bitwise comparison. Return 1 if + (__V & __M) != 0 && (__V & ~__M) != 0. */ +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_testnzc_si128 (__m128i __M, __m128i __V) +{ + return __builtin_ia32_ptestnzc128 ((__v2di)__M, (__v2di)__V); +} + +/* Macros for packed integer 128-bit comparison intrinsics. */ +#define _mm_test_all_zeros(M, V) _mm_testz_si128 ((M), (V)) + +#define _mm_test_all_ones(V) \ + _mm_testc_si128 ((V), _mm_cmpeq_epi32 ((V), (V))) + +#define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128 ((M), (V)) + +/* Packed/scalar double precision floating point rounding. */ + +#ifdef __OPTIMIZE__ +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_round_pd (__m128d __V, const int __M) +{ + return (__m128d) __builtin_ia32_roundpd ((__v2df)__V, __M); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_round_sd(__m128d __D, __m128d __V, const int __M) +{ + return (__m128d) __builtin_ia32_roundsd ((__v2df)__D, + (__v2df)__V, + __M); +} +#else +#define _mm_round_pd(V, M) \ + ((__m128d) __builtin_ia32_roundpd ((__v2df)(__m128d)(V), (int)(M))) + +#define _mm_round_sd(D, V, M) \ + ((__m128d) __builtin_ia32_roundsd ((__v2df)(__m128d)(D), \ + (__v2df)(__m128d)(V), (int)(M))) +#endif + +/* Packed/scalar single precision floating point rounding. */ + +#ifdef __OPTIMIZE__ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_round_ps (__m128 __V, const int __M) +{ + return (__m128) __builtin_ia32_roundps ((__v4sf)__V, __M); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_round_ss (__m128 __D, __m128 __V, const int __M) +{ + return (__m128) __builtin_ia32_roundss ((__v4sf)__D, + (__v4sf)__V, + __M); +} +#else +#define _mm_round_ps(V, M) \ + ((__m128) __builtin_ia32_roundps ((__v4sf)(__m128)(V), (int)(M))) + +#define _mm_round_ss(D, V, M) \ + ((__m128) __builtin_ia32_roundss ((__v4sf)(__m128)(D), \ + (__v4sf)(__m128)(V), (int)(M))) +#endif + +/* Macros for ceil/floor intrinsics. 
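A short illustration of the PTEST-based tests and the explicit rounding control defined above (user-level code, not part of the header; function names are illustrative).

    #include <smmintrin.h>

    /* Nonzero iff v and mask have no set bits in common (PTEST sets ZF).  */
    static inline int
    bits_disjoint (__m128i v, __m128i mask)
    {
      return _mm_testz_si128 (v, mask);
    }

    /* Round every lane toward -infinity without raising the inexact
       exception; with _MM_FROUND_RAISE_EXC instead of _MM_FROUND_NO_EXC
       this is what the _mm_floor_ps macro below expands to.  */
    static inline __m128
    round_down_quiet (__m128 v)
    {
      return _mm_round_ps (v, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
    }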
*/ +#define _mm_ceil_pd(V) _mm_round_pd ((V), _MM_FROUND_CEIL) +#define _mm_ceil_sd(D, V) _mm_round_sd ((D), (V), _MM_FROUND_CEIL) + +#define _mm_floor_pd(V) _mm_round_pd((V), _MM_FROUND_FLOOR) +#define _mm_floor_sd(D, V) _mm_round_sd ((D), (V), _MM_FROUND_FLOOR) + +#define _mm_ceil_ps(V) _mm_round_ps ((V), _MM_FROUND_CEIL) +#define _mm_ceil_ss(D, V) _mm_round_ss ((D), (V), _MM_FROUND_CEIL) + +#define _mm_floor_ps(V) _mm_round_ps ((V), _MM_FROUND_FLOOR) +#define _mm_floor_ss(D, V) _mm_round_ss ((D), (V), _MM_FROUND_FLOOR) + +/* SSE4.1 */ + +/* Integer blend instructions - select data from 2 sources using + constant/variable mask. */ + +#ifdef __OPTIMIZE__ +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_blend_epi16 (__m128i __X, __m128i __Y, const int __M) +{ + return (__m128i) __builtin_ia32_pblendw128 ((__v8hi)__X, + (__v8hi)__Y, + __M); +} +#else +#define _mm_blend_epi16(X, Y, M) \ + ((__m128i) __builtin_ia32_pblendw128 ((__v8hi)(__m128i)(X), \ + (__v8hi)(__m128i)(Y), (int)(M))) +#endif + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_blendv_epi8 (__m128i __X, __m128i __Y, __m128i __M) +{ + return (__m128i) __builtin_ia32_pblendvb128 ((__v16qi)__X, + (__v16qi)__Y, + (__v16qi)__M); +} + +/* Single precision floating point blend instructions - select data + from 2 sources using constant/variable mask. */ + +#ifdef __OPTIMIZE__ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_blend_ps (__m128 __X, __m128 __Y, const int __M) +{ + return (__m128) __builtin_ia32_blendps ((__v4sf)__X, + (__v4sf)__Y, + __M); +} +#else +#define _mm_blend_ps(X, Y, M) \ + ((__m128) __builtin_ia32_blendps ((__v4sf)(__m128)(X), \ + (__v4sf)(__m128)(Y), (int)(M))) +#endif + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_blendv_ps (__m128 __X, __m128 __Y, __m128 __M) +{ + return (__m128) __builtin_ia32_blendvps ((__v4sf)__X, + (__v4sf)__Y, + (__v4sf)__M); +} + +/* Double precision floating point blend instructions - select data + from 2 sources using constant/variable mask. */ + +#ifdef __OPTIMIZE__ +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_blend_pd (__m128d __X, __m128d __Y, const int __M) +{ + return (__m128d) __builtin_ia32_blendpd ((__v2df)__X, + (__v2df)__Y, + __M); +} +#else +#define _mm_blend_pd(X, Y, M) \ + ((__m128d) __builtin_ia32_blendpd ((__v2df)(__m128d)(X), \ + (__v2df)(__m128d)(Y), (int)(M))) +#endif + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_blendv_pd (__m128d __X, __m128d __Y, __m128d __M) +{ + return (__m128d) __builtin_ia32_blendvpd ((__v2df)__X, + (__v2df)__Y, + (__v2df)__M); +} + +/* Dot product instructions with mask-defined summing and zeroing parts + of result. 
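The blend forms above come in an immediate-mask variant, where lane selection is fixed at compile time, and a variable-mask variant keyed off each mask lane's sign bit. Two short examples (illustrative user code, not part of the header):

    #include <smmintrin.h>

    /* Immediate mask: bit i of the constant selects lane i from y (here
       lanes 0 and 2); the other lanes come from x.  */
    static inline __m128
    mix_even_from_y (__m128 x, __m128 y)
    {
      return _mm_blend_ps (x, y, 0x5);
    }

    /* Variable mask: where the sign bit of a mask lane is set, take the
       lane from b, otherwise from a (BLENDVPS semantics).  */
    static inline __m128
    select_lanes (__m128 mask, __m128 a, __m128 b)
    {
      return _mm_blendv_ps (a, b, mask);
    }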
*/ + +#ifdef __OPTIMIZE__ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_dp_ps (__m128 __X, __m128 __Y, const int __M) +{ + return (__m128) __builtin_ia32_dpps ((__v4sf)__X, + (__v4sf)__Y, + __M); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_dp_pd (__m128d __X, __m128d __Y, const int __M) +{ + return (__m128d) __builtin_ia32_dppd ((__v2df)__X, + (__v2df)__Y, + __M); +} +#else +#define _mm_dp_ps(X, Y, M) \ + ((__m128) __builtin_ia32_dpps ((__v4sf)(__m128)(X), \ + (__v4sf)(__m128)(Y), (int)(M))) + +#define _mm_dp_pd(X, Y, M) \ + ((__m128d) __builtin_ia32_dppd ((__v2df)(__m128d)(X), \ + (__v2df)(__m128d)(Y), (int)(M))) +#endif + +/* Packed integer 64-bit comparison, zeroing or filling with ones + corresponding parts of result. */ +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpeq_epi64 (__m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_pcmpeqq ((__v2di)__X, (__v2di)__Y); +} + +/* Min/max packed integer instructions. */ + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_min_epi8 (__m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_pminsb128 ((__v16qi)__X, (__v16qi)__Y); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_max_epi8 (__m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_pmaxsb128 ((__v16qi)__X, (__v16qi)__Y); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_min_epu16 (__m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_pminuw128 ((__v8hi)__X, (__v8hi)__Y); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_max_epu16 (__m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_pmaxuw128 ((__v8hi)__X, (__v8hi)__Y); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_min_epi32 (__m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_pminsd128 ((__v4si)__X, (__v4si)__Y); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_max_epi32 (__m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_pmaxsd128 ((__v4si)__X, (__v4si)__Y); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_min_epu32 (__m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_pminud128 ((__v4si)__X, (__v4si)__Y); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_max_epu32 (__m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_pmaxud128 ((__v4si)__X, (__v4si)__Y); +} + +/* Packed integer 32-bit multiplication with truncation of upper + halves of results. */ +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mullo_epi32 (__m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_pmulld128 ((__v4si)__X, (__v4si)__Y); +} + +/* Packed integer 32-bit multiplication of 2 pairs of operands + with two 64-bit results. */ +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mul_epi32 (__m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_pmuldq128 ((__v4si)__X, (__v4si)__Y); +} + +/* Insert single precision float into packed single precision array + element selected by index N. 
The bits [7-6] of N define S + index, the bits [5-4] define D index, and bits [3-0] define + zeroing mask for D. */ + +#ifdef __OPTIMIZE__ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_insert_ps (__m128 __D, __m128 __S, const int __N) +{ + return (__m128) __builtin_ia32_insertps128 ((__v4sf)__D, + (__v4sf)__S, + __N); +} +#else +#define _mm_insert_ps(D, S, N) \ + ((__m128) __builtin_ia32_insertps128 ((__v4sf)(__m128)(D), \ + (__v4sf)(__m128)(S), (int)(N))) +#endif + +/* Helper macro to create the N value for _mm_insert_ps. */ +#define _MM_MK_INSERTPS_NDX(S, D, M) (((S) << 6) | ((D) << 4) | (M)) + +/* Extract binary representation of single precision float from packed + single precision array element of X selected by index N. */ + +#ifdef __OPTIMIZE__ +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_extract_ps (__m128 __X, const int __N) +{ + union { int i; float f; } __tmp; + __tmp.f = __builtin_ia32_vec_ext_v4sf ((__v4sf)__X, __N); + return __tmp.i; +} +#else +#define _mm_extract_ps(X, N) \ + (__extension__ \ + ({ \ + union { int i; float f; } __tmp; \ + __tmp.f = __builtin_ia32_vec_ext_v4sf ((__v4sf)(__m128)(X), (int)(N)); \ + __tmp.i; \ + })) +#endif + +/* Extract binary representation of single precision float into + D from packed single precision array element of S selected + by index N. */ +#define _MM_EXTRACT_FLOAT(D, S, N) \ + { (D) = __builtin_ia32_vec_ext_v4sf ((__v4sf)(S), (N)); } + +/* Extract specified single precision float element into the lower + part of __m128. */ +#define _MM_PICK_OUT_PS(X, N) \ + _mm_insert_ps (_mm_setzero_ps (), (X), \ + _MM_MK_INSERTPS_NDX ((N), 0, 0x0e)) + +/* Insert integer, S, into packed integer array element of D + selected by index N. */ + +#ifdef __OPTIMIZE__ +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_insert_epi8 (__m128i __D, int __S, const int __N) +{ + return (__m128i) __builtin_ia32_vec_set_v16qi ((__v16qi)__D, + __S, __N); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_insert_epi32 (__m128i __D, int __S, const int __N) +{ + return (__m128i) __builtin_ia32_vec_set_v4si ((__v4si)__D, + __S, __N); +} + +#ifdef __x86_64__ +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_insert_epi64 (__m128i __D, long long __S, const int __N) +{ + return (__m128i) __builtin_ia32_vec_set_v2di ((__v2di)__D, + __S, __N); +} +#endif +#else +#define _mm_insert_epi8(D, S, N) \ + ((__m128i) __builtin_ia32_vec_set_v16qi ((__v16qi)(__m128i)(D), \ + (int)(S), (int)(N))) + +#define _mm_insert_epi32(D, S, N) \ + ((__m128i) __builtin_ia32_vec_set_v4si ((__v4si)(__m128i)(D), \ + (int)(S), (int)(N))) + +#ifdef __x86_64__ +#define _mm_insert_epi64(D, S, N) \ + ((__m128i) __builtin_ia32_vec_set_v2di ((__v2di)(__m128i)(D), \ + (long long)(S), (int)(N))) +#endif +#endif + +/* Extract integer from packed integer array element of X selected by + index N. 
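For the INSERTPS control byte built by _MM_MK_INSERTPS_NDX above, bits 7:6 pick the source lane, bits 5:4 the destination lane, and bits 3:0 zero lanes of the result. A small illustration of the insertion forms just defined (user code, illustrative names):

    #include <smmintrin.h>

    /* Copy lane 2 of s into lane 0 of d and zero lane 3 of the result:
       N = (2 << 6) | (0 << 4) | 0x8.  */
    static inline __m128
    insert_example (__m128 d, __m128 s)
    {
      return _mm_insert_ps (d, s, _MM_MK_INSERTPS_NDX (2, 0, 0x8));
    }

    /* Replace 32-bit integer lane 1 of v with x (PINSRD).  */
    static inline __m128i
    set_int_lane1 (__m128i v, int x)
    {
      return _mm_insert_epi32 (v, x, 1);
    }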
*/ + +#ifdef __OPTIMIZE__ +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_extract_epi8 (__m128i __X, const int __N) +{ + return (unsigned char) __builtin_ia32_vec_ext_v16qi ((__v16qi)__X, __N); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_extract_epi32 (__m128i __X, const int __N) +{ + return __builtin_ia32_vec_ext_v4si ((__v4si)__X, __N); +} + +#ifdef __x86_64__ +extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_extract_epi64 (__m128i __X, const int __N) +{ + return __builtin_ia32_vec_ext_v2di ((__v2di)__X, __N); +} +#endif +#else +#define _mm_extract_epi8(X, N) \ + ((int) (unsigned char) __builtin_ia32_vec_ext_v16qi ((__v16qi)(__m128i)(X), (int)(N))) +#define _mm_extract_epi32(X, N) \ + ((int) __builtin_ia32_vec_ext_v4si ((__v4si)(__m128i)(X), (int)(N))) + +#ifdef __x86_64__ +#define _mm_extract_epi64(X, N) \ + ((long long) __builtin_ia32_vec_ext_v2di ((__v2di)(__m128i)(X), (int)(N))) +#endif +#endif + +/* Return horizontal packed word minimum and its index in bits [15:0] + and bits [18:16] respectively. */ +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_minpos_epu16 (__m128i __X) +{ + return (__m128i) __builtin_ia32_phminposuw128 ((__v8hi)__X); +} + +/* Packed integer sign-extension. */ + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepi8_epi32 (__m128i __X) +{ + return (__m128i) __builtin_ia32_pmovsxbd128 ((__v16qi)__X); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepi16_epi32 (__m128i __X) +{ + return (__m128i) __builtin_ia32_pmovsxwd128 ((__v8hi)__X); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepi8_epi64 (__m128i __X) +{ + return (__m128i) __builtin_ia32_pmovsxbq128 ((__v16qi)__X); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepi32_epi64 (__m128i __X) +{ + return (__m128i) __builtin_ia32_pmovsxdq128 ((__v4si)__X); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepi16_epi64 (__m128i __X) +{ + return (__m128i) __builtin_ia32_pmovsxwq128 ((__v8hi)__X); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepi8_epi16 (__m128i __X) +{ + return (__m128i) __builtin_ia32_pmovsxbw128 ((__v16qi)__X); +} + +/* Packed integer zero-extension. 
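The _mm_minpos_epu16 intrinsic above packs both of its results into one vector: the minimum lands in lane 0 and its index in the low three bits of lane 1. A small user-level illustration (function name is illustrative):

    #include <smmintrin.h>

    static inline void
    horizontal_min_u16 (__m128i v, unsigned *min, unsigned *index)
    {
      __m128i r = _mm_minpos_epu16 (v);
      *min   = (unsigned) _mm_extract_epi16 (r, 0);        /* bits [15:0]  */
      *index = (unsigned) _mm_extract_epi16 (r, 1) & 0x7;  /* bits [18:16] */
    }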
*/ + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepu8_epi32 (__m128i __X) +{ + return (__m128i) __builtin_ia32_pmovzxbd128 ((__v16qi)__X); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepu16_epi32 (__m128i __X) +{ + return (__m128i) __builtin_ia32_pmovzxwd128 ((__v8hi)__X); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepu8_epi64 (__m128i __X) +{ + return (__m128i) __builtin_ia32_pmovzxbq128 ((__v16qi)__X); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepu32_epi64 (__m128i __X) +{ + return (__m128i) __builtin_ia32_pmovzxdq128 ((__v4si)__X); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepu16_epi64 (__m128i __X) +{ + return (__m128i) __builtin_ia32_pmovzxwq128 ((__v8hi)__X); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepu8_epi16 (__m128i __X) +{ + return (__m128i) __builtin_ia32_pmovzxbw128 ((__v16qi)__X); +} + +/* Pack 8 double words from 2 operands into 8 words of result with + unsigned saturation. */ +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_packus_epi32 (__m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_packusdw128 ((__v4si)__X, (__v4si)__Y); +} + +/* Sum absolute 8-bit integer difference of adjacent groups of 4 + byte integers in the first 2 operands. Starting offsets within + operands are determined by the 3rd mask operand. */ + +#ifdef __OPTIMIZE__ +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mpsadbw_epu8 (__m128i __X, __m128i __Y, const int __M) +{ + return (__m128i) __builtin_ia32_mpsadbw128 ((__v16qi)__X, + (__v16qi)__Y, __M); +} +#else +#define _mm_mpsadbw_epu8(X, Y, M) \ + ((__m128i) __builtin_ia32_mpsadbw128 ((__v16qi)(__m128i)(X), \ + (__v16qi)(__m128i)(Y), (int)(M))) +#endif + +/* Load double quadword using non-temporal aligned hint. */ +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_stream_load_si128 (__m128i *__X) +{ + return (__m128i) __builtin_ia32_movntdqa ((__v2di *) __X); +} + +#ifdef __SSE4_2__ + +/* These macros specify the source data format. */ +#define _SIDD_UBYTE_OPS 0x00 +#define _SIDD_UWORD_OPS 0x01 +#define _SIDD_SBYTE_OPS 0x02 +#define _SIDD_SWORD_OPS 0x03 + +/* These macros specify the comparison operation. */ +#define _SIDD_CMP_EQUAL_ANY 0x00 +#define _SIDD_CMP_RANGES 0x04 +#define _SIDD_CMP_EQUAL_EACH 0x08 +#define _SIDD_CMP_EQUAL_ORDERED 0x0c + +/* These macros specify the the polarity. */ +#define _SIDD_POSITIVE_POLARITY 0x00 +#define _SIDD_NEGATIVE_POLARITY 0x10 +#define _SIDD_MASKED_POSITIVE_POLARITY 0x20 +#define _SIDD_MASKED_NEGATIVE_POLARITY 0x30 + +/* These macros specify the output selection in _mm_cmpXstri (). */ +#define _SIDD_LEAST_SIGNIFICANT 0x00 +#define _SIDD_MOST_SIGNIFICANT 0x40 + +/* These macros specify the output selection in _mm_cmpXstrm (). */ +#define _SIDD_BIT_MASK 0x00 +#define _SIDD_UNIT_MASK 0x40 + +/* Intrinsics for text/string processing. 
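The widening conversions above and _mm_packus_epi32 are natural round-trip partners for 8/16/32-bit pixel-style data; for example (illustrative user code):

    #include <smmintrin.h>

    /* Zero-extend the four low bytes of v into four 32-bit lanes
       (PMOVZXBD).  */
    static inline __m128i
    widen_low_bytes (__m128i v)
    {
      return _mm_cvtepu8_epi32 (v);
    }

    /* Narrow eight 32-bit values back to 16-bit words with unsigned
       saturation (PACKUSDW).  */
    static inline __m128i
    narrow_to_u16 (__m128i a, __m128i b)
    {
      return _mm_packus_epi32 (a, b);
    }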
*/ + +#ifdef __OPTIMIZE__ +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpistrm (__m128i __X, __m128i __Y, const int __M) +{ + return (__m128i) __builtin_ia32_pcmpistrm128 ((__v16qi)__X, + (__v16qi)__Y, + __M); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpistri (__m128i __X, __m128i __Y, const int __M) +{ + return __builtin_ia32_pcmpistri128 ((__v16qi)__X, + (__v16qi)__Y, + __M); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpestrm (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M) +{ + return (__m128i) __builtin_ia32_pcmpestrm128 ((__v16qi)__X, __LX, + (__v16qi)__Y, __LY, + __M); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpestri (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M) +{ + return __builtin_ia32_pcmpestri128 ((__v16qi)__X, __LX, + (__v16qi)__Y, __LY, + __M); +} +#else +#define _mm_cmpistrm(X, Y, M) \ + ((__m128i) __builtin_ia32_pcmpistrm128 ((__v16qi)(__m128i)(X), \ + (__v16qi)(__m128i)(Y), (int)(M))) +#define _mm_cmpistri(X, Y, M) \ + ((int) __builtin_ia32_pcmpistri128 ((__v16qi)(__m128i)(X), \ + (__v16qi)(__m128i)(Y), (int)(M))) + +#define _mm_cmpestrm(X, LX, Y, LY, M) \ + ((__m128i) __builtin_ia32_pcmpestrm128 ((__v16qi)(__m128i)(X), \ + (int)(LX), (__v16qi)(__m128i)(Y), \ + (int)(LY), (int)(M))) +#define _mm_cmpestri(X, LX, Y, LY, M) \ + ((int) __builtin_ia32_pcmpestri128 ((__v16qi)(__m128i)(X), (int)(LX), \ + (__v16qi)(__m128i)(Y), (int)(LY), \ + (int)(M))) +#endif + +/* Intrinsics for text/string processing and reading values of + EFlags. */ + +#ifdef __OPTIMIZE__ +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpistra (__m128i __X, __m128i __Y, const int __M) +{ + return __builtin_ia32_pcmpistria128 ((__v16qi)__X, + (__v16qi)__Y, + __M); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpistrc (__m128i __X, __m128i __Y, const int __M) +{ + return __builtin_ia32_pcmpistric128 ((__v16qi)__X, + (__v16qi)__Y, + __M); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpistro (__m128i __X, __m128i __Y, const int __M) +{ + return __builtin_ia32_pcmpistrio128 ((__v16qi)__X, + (__v16qi)__Y, + __M); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpistrs (__m128i __X, __m128i __Y, const int __M) +{ + return __builtin_ia32_pcmpistris128 ((__v16qi)__X, + (__v16qi)__Y, + __M); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpistrz (__m128i __X, __m128i __Y, const int __M) +{ + return __builtin_ia32_pcmpistriz128 ((__v16qi)__X, + (__v16qi)__Y, + __M); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpestra (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M) +{ + return __builtin_ia32_pcmpestria128 ((__v16qi)__X, __LX, + (__v16qi)__Y, __LY, + __M); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpestrc (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M) +{ + return __builtin_ia32_pcmpestric128 ((__v16qi)__X, __LX, + (__v16qi)__Y, __LY, + __M); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpestro (__m128i __X, int __LX, __m128i 
__Y, int __LY, const int __M) +{ + return __builtin_ia32_pcmpestrio128 ((__v16qi)__X, __LX, + (__v16qi)__Y, __LY, + __M); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpestrs (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M) +{ + return __builtin_ia32_pcmpestris128 ((__v16qi)__X, __LX, + (__v16qi)__Y, __LY, + __M); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpestrz (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M) +{ + return __builtin_ia32_pcmpestriz128 ((__v16qi)__X, __LX, + (__v16qi)__Y, __LY, + __M); +} +#else +#define _mm_cmpistra(X, Y, M) \ + ((int) __builtin_ia32_pcmpistria128 ((__v16qi)(__m128i)(X), \ + (__v16qi)(__m128i)(Y), (int)(M))) +#define _mm_cmpistrc(X, Y, M) \ + ((int) __builtin_ia32_pcmpistric128 ((__v16qi)(__m128i)(X), \ + (__v16qi)(__m128i)(Y), (int)(M))) +#define _mm_cmpistro(X, Y, M) \ + ((int) __builtin_ia32_pcmpistrio128 ((__v16qi)(__m128i)(X), \ + (__v16qi)(__m128i)(Y), (int)(M))) +#define _mm_cmpistrs(X, Y, M) \ + ((int) __builtin_ia32_pcmpistris128 ((__v16qi)(__m128i)(X), \ + (__v16qi)(__m128i)(Y), (int)(M))) +#define _mm_cmpistrz(X, Y, M) \ + ((int) __builtin_ia32_pcmpistriz128 ((__v16qi)(__m128i)(X), \ + (__v16qi)(__m128i)(Y), (int)(M))) + +#define _mm_cmpestra(X, LX, Y, LY, M) \ + ((int) __builtin_ia32_pcmpestria128 ((__v16qi)(__m128i)(X), (int)(LX), \ + (__v16qi)(__m128i)(Y), (int)(LY), \ + (int)(M))) +#define _mm_cmpestrc(X, LX, Y, LY, M) \ + ((int) __builtin_ia32_pcmpestric128 ((__v16qi)(__m128i)(X), (int)(LX), \ + (__v16qi)(__m128i)(Y), (int)(LY), \ + (int)(M))) +#define _mm_cmpestro(X, LX, Y, LY, M) \ + ((int) __builtin_ia32_pcmpestrio128 ((__v16qi)(__m128i)(X), (int)(LX), \ + (__v16qi)(__m128i)(Y), (int)(LY), \ + (int)(M))) +#define _mm_cmpestrs(X, LX, Y, LY, M) \ + ((int) __builtin_ia32_pcmpestris128 ((__v16qi)(__m128i)(X), (int)(LX), \ + (__v16qi)(__m128i)(Y), (int)(LY), \ + (int)(M))) +#define _mm_cmpestrz(X, LX, Y, LY, M) \ + ((int) __builtin_ia32_pcmpestriz128 ((__v16qi)(__m128i)(X), (int)(LX), \ + (__v16qi)(__m128i)(Y), (int)(LY), \ + (int)(M))) +#endif + +/* Packed integer 64-bit comparison, zeroing or filling with ones + corresponding parts of result. */ +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpgt_epi64 (__m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_pcmpgtq ((__v2di)__X, (__v2di)__Y); +} + +#ifdef __POPCNT__ +#include +#endif + +/* Accumulate CRC32 (polynomial 0x11EDC6F41) value. 
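The pcmpistr/pcmpestr control byte is assembled from the _SIDD_* macros above, and the CRC32 accumulators declared just below chain a running value through successive inputs. Two short illustrations (user code compiled with SSE4.2 enabled; function names and the CRC-32C init/final inversion convention are assumptions of the example, not something the intrinsics impose):

    #include <smmintrin.h>

    /* Index of the first byte of text that matches any byte of set
       (16 when there is no match).  */
    static inline int
    find_first_of (__m128i set, __m128i text)
    {
      return _mm_cmpistri (set, text,
                           _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY
                           | _SIDD_POSITIVE_POLARITY | _SIDD_LEAST_SIGNIFICANT);
    }

    /* CRC-32C of a byte buffer, one byte at a time.  */
    static unsigned int
    crc32c_buffer (const unsigned char *p, unsigned long n)
    {
      unsigned int crc = 0xffffffffu;
      while (n--)
        crc = _mm_crc32_u8 (crc, *p++);
      return crc ^ 0xffffffffu;
    }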
*/ +extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_crc32_u8 (unsigned int __C, unsigned char __V) +{ + return __builtin_ia32_crc32qi (__C, __V); +} + +extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_crc32_u16 (unsigned int __C, unsigned short __V) +{ + return __builtin_ia32_crc32hi (__C, __V); +} + +extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_crc32_u32 (unsigned int __C, unsigned int __V) +{ + return __builtin_ia32_crc32si (__C, __V); +} + +#ifdef __x86_64__ +extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_crc32_u64 (unsigned long long __C, unsigned long long __V) +{ + return __builtin_ia32_crc32di (__C, __V); +} +#endif + +#endif /* __SSE4_2__ */ + +#endif /* __SSE4_1__ */ + +#endif /* _SMMINTRIN_H_INCLUDED */ diff --git a/gcc/config/i386/sol2-10.h b/gcc/config/i386/sol2-10.h new file mode 100644 index 000000000..c3decd2ef --- /dev/null +++ b/gcc/config/i386/sol2-10.h @@ -0,0 +1,138 @@ +/* Solaris 10 configuration. + Copyright (C) 2004, 2006, 2007, 2008, 2009, 2010, 2011 + Free Software Foundation, Inc. + Contributed by CodeSourcery, LLC. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. */ + +#undef ASM_COMMENT_START +#define ASM_COMMENT_START "/" + +/* binutils' GNU as understands --32 and --64, but the native Solaris + assembler requires -xarch=generic or -xarch=generic64 instead. */ +#undef ASM_SPEC +#ifdef USE_GAS +#define ASM_SPEC "%{m32:--32} %{m64:--64} -s %(asm_cpu)" +#else +#define ASM_SPEC "%{v:-V} %{Qy:} %{!Qn:-Qy} %{Ym,*} " \ + "%{m32:-xarch=generic} %{m64:-xarch=generic64} " \ + "-s %(asm_cpu)" +#endif + +/* The native Solaris assembler can't calculate the difference between + symbols in different sections, which causes problems for -fPIC jump + tables in .rodata. */ +#ifndef HAVE_AS_IX86_DIFF_SECT_DELTA +#undef JUMP_TABLES_IN_TEXT_SECTION +#define JUMP_TABLES_IN_TEXT_SECTION 1 + +/* The native Solaris assembler cannot handle the SYMBOL-. syntax, but + requires SYMBOL@rel/@rel64 instead. */ +#define ASM_OUTPUT_DWARF_PCREL(FILE, SIZE, LABEL) \ + do { \ + fputs (integer_asm_op (SIZE, FALSE), FILE); \ + assemble_name (FILE, LABEL); \ + fputs (SIZE == 8 ? "@rel64" : "@rel", FILE); \ + } while (0) +#endif + +/* As in sol2.h, override the default from i386/x86-64.h to work around + Sun as TLS bug. */ +#undef ASM_OUTPUT_ALIGNED_COMMON +#define ASM_OUTPUT_ALIGNED_COMMON(FILE, NAME, SIZE, ALIGN) \ + do \ + { \ + if (TARGET_SUN_TLS \ + && in_section \ + && ((in_section->common.flags & SECTION_TLS) == SECTION_TLS)) \ + switch_to_section (bss_section); \ + x86_elf_aligned_common (FILE, NAME, SIZE, ALIGN); \ + } \ + while (0) + +#undef NO_PROFILE_COUNTERS + +#undef MCOUNT_NAME +#define MCOUNT_NAME "_mcount" + +#undef WCHAR_TYPE +#define WCHAR_TYPE (TARGET_64BIT ? 
"int" : "long int") +#undef WCHAR_TYPE_SIZE +#define WCHAR_TYPE_SIZE 32 + +#undef WINT_TYPE +#define WINT_TYPE (TARGET_64BIT ? "int" : "long int") +#undef WINT_TYPE_SIZE +#define WINT_TYPE_SIZE 32 + +#define USE_IX86_FRAME_POINTER 1 +#define USE_X86_64_FRAME_POINTER 1 + +/* Override i386/sol2.h version: return 8-byte vectors in MMX registers if + possible, matching Sun Studio 12 Update 1+ compilers and other x86 + targets. */ +#undef TARGET_SUBTARGET_DEFAULT +#define TARGET_SUBTARGET_DEFAULT \ + (MASK_80387 | MASK_IEEE_FP | MASK_FLOAT_RETURNS) + +#define SUBTARGET_OPTIMIZATION_OPTIONS \ + { OPT_LEVELS_1_PLUS, OPT_momit_leaf_frame_pointer, NULL, 1 } + +#define MULTILIB_DEFAULTS { "m32" } + +#undef LINK_ARCH64_SPEC_BASE +#define LINK_ARCH64_SPEC_BASE \ + "%{G:-G} \ + %{YP,*} \ + %{R*} \ + %{compat-bsd: \ + %{!YP,*:%{p|pg:-Y P,/usr/ucblib/64:/usr/lib/libp/64:/lib/64:/usr/lib/64} \ + %{!p:%{!pg:-Y P,/usr/ucblib/64:/lib:/usr/lib/64}}} \ + -R /usr/ucblib/64} \ + %{!compat-bsd: \ + %{!YP,*:%{p|pg:-Y P,/usr/lib/libp/64:/lib/64:/usr/lib/64} \ + %{!p:%{!pg:-Y P,/lib/64:/usr/lib/64}}}}" + +#undef LINK_ARCH64_SPEC +#define LINK_ARCH64_SPEC LINK_ARCH64_SPEC_BASE + +#ifdef TARGET_GNU_LD +/* Since binutils 2.21, GNU ld supports new *_sol2 emulations to strictly + follow the Solaris 2 ABI. Prefer them if present. */ +#ifdef HAVE_LD_SOL2_EMULATION +#define I386_EMULATION "elf_i386_sol2" +#define X86_64_EMULATION "elf_x86_64_sol2" +#else +#define I386_EMULATION "elf_i386" +#define X86_64_EMULATION "elf_x86_64" +#endif + +#define TARGET_LD_EMULATION "%{m64:-m " X86_64_EMULATION "}" \ + "%{!m64:-m " I386_EMULATION "} " +#else +#define TARGET_LD_EMULATION "" +#endif + +#undef LINK_ARCH_SPEC +#define LINK_ARCH_SPEC TARGET_LD_EMULATION \ + "%{m64:" LINK_ARCH64_SPEC "}%{!m64:" LINK_ARCH32_SPEC "}" + +/* We do not need to search a special directory for startup files. */ +#undef MD_STARTFILE_PREFIX + +#undef TARGET_ASM_NAMED_SECTION +#define TARGET_ASM_NAMED_SECTION i386_solaris_elf_named_section diff --git a/gcc/config/i386/sol2-c1.asm b/gcc/config/i386/sol2-c1.asm new file mode 100644 index 000000000..4a89530cc --- /dev/null +++ b/gcc/config/i386/sol2-c1.asm @@ -0,0 +1,151 @@ +! crt1.s for Solaris 2, x86 + +! Copyright (C) 1993, 1998, 2008, 2009 Free Software Foundation, Inc. +! Written By Fred Fish, Nov 1992 +! +! This file is free software; you can redistribute it and/or modify it +! under the terms of the GNU General Public License as published by the +! Free Software Foundation; either version 3, or (at your option) any +! later version. +! +! This file is distributed in the hope that it will be useful, but +! WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +! General Public License for more details. +! +! Under Section 7 of GPL version 3, you are granted additional +! permissions described in the GCC Runtime Library Exception, version +! 3.1, as published by the Free Software Foundation. +! +! You should have received a copy of the GNU General Public License and +! a copy of the GCC Runtime Library Exception along with this program; +! see the files COPYING3 and COPYING.RUNTIME respectively. If not, see +! . + + +! This file takes control of the process from the kernel, as specified +! in section 3 of the System V Application Binary Interface, Intel386 +! Processor Supplement. It has been constructed from information obtained +! from the ABI, information obtained from single stepping existing +! 
Solaris executables through their startup code with gdb, and from +! information obtained by single stepping executables on other i386 SVR4 +! implementations. This file is the first thing linked into any executable. + + .ident "GNU C crt1.s" + .weak _cleanup + .weak _DYNAMIC + .text + +! Start creating the initial frame by pushing a NULL value for the return +! address of the initial frame, and mark the end of the stack frame chain +! (the innermost stack frame) with a NULL value, per page 3-32 of the ABI. +! Initialize the first stack frame pointer in %ebp (the contents of which +! are unspecified at process initialization). + + .globl _start +_start: + pushl $0x0 + pushl $0x0 + movl %esp,%ebp + +! As specified per page 3-32 of the ABI, %edx contains a function +! pointer that should be registered with atexit(), for proper +! shared object termination. Just push it onto the stack for now +! to preserve it. We want to register _cleanup() first. + + pushl %edx + +! Check to see if there is an _cleanup() function linked in, and if +! so, register it with atexit() as the last thing to be run by +! atexit(). + + movl $_cleanup,%eax + testl %eax,%eax + je .L1 + pushl $_cleanup + call atexit + addl $0x4,%esp +.L1: + +! Now check to see if we have an _DYNAMIC table, and if so then +! we need to register the function pointer previously in %edx, but +! now conveniently saved on the stack as the argument to pass to +! atexit(). + + movl $_DYNAMIC,%eax + testl %eax,%eax + je .L2 + call atexit +.L2: + +! Register _fini() with atexit(). We will take care of calling _init() +! directly. + + pushl $_fini + call atexit + +! Compute the address of the environment vector on the stack and load +! it into the global variable _environ. Currently argc is at 8 off +! the frame pointer. Fetch the argument count into %eax, scale by the +! size of each arg (4 bytes) and compute the address of the environment +! vector which is 16 bytes (the two zero words we pushed, plus argc, +! plus the null word terminating the arg vector) further up the stack, +! off the frame pointer (whew!). + + movl 8(%ebp),%eax + leal 16(%ebp,%eax,4),%edx + movl %edx,_environ + +! Push the environment vector pointer, the argument vector pointer, +! and the argument count on to the stack to set up the arguments +! for _init(), _fpstart(), and main(). Note that the environment +! vector pointer and the arg count were previously loaded into +! %edx and %eax respectively. The only new value we need to compute +! is the argument vector pointer, which is at a fixed address off +! the initial frame pointer. + +! +! Make sure the stack is properly aligned. +! + andl $0xfffffff0,%esp + subl $4,%esp + + pushl %edx + leal 12(%ebp),%edx + pushl %edx + pushl %eax + +! Call _init(argc, argv, environ), _fpstart(argc, argv, environ), and +! main(argc, argv, environ). + + call _init + call __fpstart + call main + +! Pop the argc, argv, and environ arguments off the stack, push the +! value returned from main(), and call exit(). + + addl $12,%esp + pushl %eax + call exit + +! An inline equivalent of _exit, as specified in Figure 3-26 of the ABI. + + pushl $0x0 + movl $0x1,%eax + lcall $7,$0 + +! If all else fails, just try a halt! + + hlt + .type _start,@function + .size _start,.-_start + +! A dummy profiling support routine for non-profiling executables, +! in case we link in some objects that have been compiled for profiling. 
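For readers following the startup sequence above, here is a heavily simplified C rendering of what _start does once the stack is set up. It is illustrative only and will not link on its own: the real ordering and the register hand-offs live in the assembly above, and the actual work has to happen before any C runtime exists.

    #include <stdlib.h>

    extern char **_environ;
    extern int main (int, char **, char **);
    extern void _init (void), _fini (void), __fpstart (void);
    extern void _cleanup (void) __attribute__ ((weak));

    static void
    start_sketch (int argc, char **argv, char **envp,
                  void (*rtld_cleanup) (void))
    {
      if (_cleanup)          /* weak: only if something linked it in */
        atexit (_cleanup);
      if (rtld_cleanup)      /* the %edx hand-off, only when _DYNAMIC exists */
        atexit (rtld_cleanup);
      atexit (_fini);
      _environ = envp;
      _init ();
      __fpstart ();
      exit (main (argc, argv, envp));
    }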
+ + .weak _mcount +_mcount: + ret + .type _mcount,@function + .size _mcount,.-_mcount diff --git a/gcc/config/i386/sol2-ci.asm b/gcc/config/i386/sol2-ci.asm new file mode 100644 index 000000000..f2ff2025d --- /dev/null +++ b/gcc/config/i386/sol2-ci.asm @@ -0,0 +1,40 @@ +! crti.s for Solaris 2, x86. + +! Copyright (C) 1993, 2008, 2009 Free Software Foundation, Inc. +! Written By Fred Fish, Nov 1992 +! +! This file is free software; you can redistribute it and/or modify it +! under the terms of the GNU General Public License as published by the +! Free Software Foundation; either version 3, or (at your option) any +! later version. +! +! This file is distributed in the hope that it will be useful, but +! WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +! General Public License for more details. +! +! Under Section 7 of GPL version 3, you are granted additional +! permissions described in the GCC Runtime Library Exception, version +! 3.1, as published by the Free Software Foundation. +! +! You should have received a copy of the GNU General Public License and +! a copy of the GCC Runtime Library Exception along with this program; +! see the files COPYING3 and COPYING.RUNTIME respectively. If not, see +! . + + +! This file just supplies labeled starting points for the .init and .fini +! sections. It is linked in before the values-Xx.o files and also before +! crtbegin.o. + + .ident "GNU C crti.s" + + .section .init + .globl _init + .type _init,@function +_init: + + .section .fini + .globl _fini + .type _fini,@function +_fini: diff --git a/gcc/config/i386/sol2-cn.asm b/gcc/config/i386/sol2-cn.asm new file mode 100644 index 000000000..217f04091 --- /dev/null +++ b/gcc/config/i386/sol2-cn.asm @@ -0,0 +1,35 @@ +! crtn.s for Solaris 2, x86. + +! Copyright (C) 1993, 2008, 2009 Free Software Foundation, Inc. +! Written By Fred Fish, Nov 1992 +! +! This file is free software; you can redistribute it and/or modify it +! under the terms of the GNU General Public License as published by the +! Free Software Foundation; either version 3, or (at your option) any +! later version. +! +! This file is distributed in the hope that it will be useful, but +! WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +! General Public License for more details. +! +! Under Section 7 of GPL version 3, you are granted additional +! permissions described in the GCC Runtime Library Exception, version +! 3.1, as published by the Free Software Foundation. +! +! You should have received a copy of the GNU General Public License and +! a copy of the GCC Runtime Library Exception along with this program; +! see the files COPYING3 and COPYING.RUNTIME respectively. If not, see +! . + + +! This file just supplies returns for the .init and .fini sections. It is +! linked in after all other files. + + .ident "GNU C crtn.o" + + .section .init + ret $0x0 + + .section .fini + ret $0x0 diff --git a/gcc/config/i386/sol2-gas.h b/gcc/config/i386/sol2-gas.h new file mode 100644 index 000000000..8d15b9d11 --- /dev/null +++ b/gcc/config/i386/sol2-gas.h @@ -0,0 +1,31 @@ +/* Definitions of target machine for GCC, for x86 running Solaris 2 + using the GNU assembler. + +Copyright (C) 2010 Free Software Foundation, Inc. + +This file is part of GCC. 
+ +GCC is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 3, or (at your option) any later +version. + +GCC is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +Under Section 7 of GPL version 3, you are granted additional +permissions described in the GCC Runtime Library Exception, version +3.1, as published by the Free Software Foundation. + +You should have received a copy of the GNU General Public License and +a copy of the GCC Runtime Library Exception along with this program; +see the files COPYING3 and COPYING.RUNTIME respectively. If not, see +. */ + +/* Undefine this so that BNSYM/ENSYM pairs are emitted by STABS+. */ +#undef NO_DBX_BNSYM_ENSYM + +/* Restore default; gas doesn't understand Sun as .tcomm. */ +#undef TLS_COMMON_ASM_OP diff --git a/gcc/config/i386/sol2-gc1.asm b/gcc/config/i386/sol2-gc1.asm new file mode 100644 index 000000000..8cb989a9c --- /dev/null +++ b/gcc/config/i386/sol2-gc1.asm @@ -0,0 +1,155 @@ +! gcrt1.s for Solaris 2, x86 + +! Copyright (C) 1993, 2008, 2009 Free Software Foundation, Inc. +! Written By Fred Fish, Nov 1992 +! +! This file is free software; you can redistribute it and/or modify it +! under the terms of the GNU General Public License as published by the +! Free Software Foundation; either version 3, or (at your option) any +! later version. +! +! This file is distributed in the hope that it will be useful, but +! WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +! General Public License for more details. +! +! Under Section 7 of GPL version 3, you are granted additional +! permissions described in the GCC Runtime Library Exception, version +! 3.1, as published by the Free Software Foundation. +! +! You should have received a copy of the GNU General Public License and +! a copy of the GCC Runtime Library Exception along with this program; +! see the files COPYING3 and COPYING.RUNTIME respectively. If not, see +! . + + +! This file takes control of the process from the kernel, as specified +! in section 3 of the System V Application Binary Interface, Intel386 +! Processor Supplement. It has been constructed from information obtained +! from the ABI, information obtained from single stepping existing +! Solaris executables through their startup code with gdb, and from +! information obtained by single stepping executables on other i386 SVR4 +! implementations. This file is the first thing linked into any executable. + +! This is a modified crt1.s by J.W.Hawtin 15/8/96, +! to allow program profiling, by calling monstartup on entry and _mcleanup +! on exit + + .ident "GNU C gcrt1.s" + .weak _DYNAMIC + .text + +! Start creating the initial frame by pushing a NULL value for the return +! address of the initial frame, and mark the end of the stack frame chain +! (the innermost stack frame) with a NULL value, per page 3-32 of the ABI. +! Initialize the first stack frame pointer in %ebp (the contents of which +! are unspecified at process initialization). + + .globl _start +_start: + pushl $0x0 + pushl $0x0 + movl %esp,%ebp + +! As specified per page 3-32 of the ABI, %edx contains a function +! pointer that should be registered with atexit(), for proper +! shared object termination. 
Just push it onto the stack for now +! to preserve it. We want to register _cleanup() first. + + pushl %edx + +! Check to see if there is an _cleanup() function linked in, and if +! so, register it with atexit() as the last thing to be run by +! atexit(). + + movl $_mcleanup,%eax + testl %eax,%eax + je .L1 + pushl $_mcleanup + call atexit + addl $0x4,%esp +.L1: + +! Now check to see if we have an _DYNAMIC table, and if so then +! we need to register the function pointer previously in %edx, but +! now conveniently saved on the stack as the argument to pass to +! atexit(). + + movl $_DYNAMIC,%eax + testl %eax,%eax + je .L2 + call atexit +.L2: + +! Register _fini() with atexit(). We will take care of calling _init() +! directly. + + pushl $_fini + call atexit + +! Start profiling + + pushl %ebp + movl %esp,%ebp + pushl $_etext + pushl $_start + call monstartup + addl $8,%esp + popl %ebp + +! Compute the address of the environment vector on the stack and load +! it into the global variable _environ. Currently argc is at 8 off +! the frame pointer. Fetch the argument count into %eax, scale by the +! size of each arg (4 bytes) and compute the address of the environment +! vector which is 16 bytes (the two zero words we pushed, plus argc, +! plus the null word terminating the arg vector) further up the stack, +! off the frame pointer (whew!). + + movl 8(%ebp),%eax + leal 16(%ebp,%eax,4),%edx + movl %edx,_environ + +! Push the environment vector pointer, the argument vector pointer, +! and the argument count on to the stack to set up the arguments +! for _init(), _fpstart(), and main(). Note that the environment +! vector pointer and the arg count were previously loaded into +! %edx and %eax respectively. The only new value we need to compute +! is the argument vector pointer, which is at a fixed address off +! the initial frame pointer. + +! +! Make sure the stack is properly aligned. +! + andl $0xfffffff0,%esp + subl $4,%esp + + pushl %edx + leal 12(%ebp),%edx + pushl %edx + pushl %eax + +! Call _init(argc, argv, environ), _fpstart(argc, argv, environ), and +! main(argc, argv, environ). + + call _init + call __fpstart + call main + +! Pop the argc, argv, and environ arguments off the stack, push the +! value returned from main(), and call exit(). + + addl $12,%esp + pushl %eax + call exit + +! An inline equivalent of _exit, as specified in Figure 3-26 of the ABI. + + pushl $0x0 + movl $0x1,%eax + lcall $7,$0 + +! If all else fails, just try a halt! + + hlt + .type _start,@function + .size _start,.-_start diff --git a/gcc/config/i386/sol2-unwind.h b/gcc/config/i386/sol2-unwind.h new file mode 100644 index 000000000..d93b60c78 --- /dev/null +++ b/gcc/config/i386/sol2-unwind.h @@ -0,0 +1,289 @@ +/* DWARF2 EH unwinding support for AMD x86-64 and x86. + Copyright (C) 2009, 2010 Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +Under Section 7 of GPL version 3, you are granted additional +permissions described in the GCC Runtime Library Exception, version +3.1, as published by the Free Software Foundation. 
+ +You should have received a copy of the GNU General Public License and +a copy of the GCC Runtime Library Exception along with this program; +see the files COPYING3 and COPYING.RUNTIME respectively. If not, see +. */ + +/* Do code reading to identify a signal frame, and set the frame + state data appropriately. See unwind-dw2.c for the structs. */ + +#include +#include + +#ifdef __x86_64__ + +#define MD_FALLBACK_FRAME_STATE_FOR x86_64_fallback_frame_state + +static _Unwind_Reason_Code +x86_64_fallback_frame_state (struct _Unwind_Context *context, + _Unwind_FrameState *fs) +{ + unsigned char *pc = context->ra; + mcontext_t *mctx; + long new_cfa; + + if (/* Solaris 10+ + ------------ + <__sighndlr+0>: push %rbp + <__sighndlr+1>: mov %rsp,%rbp + <__sighndlr+4>: callq *%rcx + <__sighndlr+6>: leaveq <--- PC + <__sighndlr+7>: retq */ + *(unsigned long *)(pc - 6) == 0xc3c9d1ffe5894855) + + /* We need to move up three frames: + + <-- context->cfa + __sighndlr + call_user_handler + sigacthandler + + + context->cfa points into the frame after the saved frame pointer and + saved pc (struct frame). + + The ucontext_t structure is in the kernel frame after the signal + number and a siginfo_t *. Since the frame sizes vary even within + Solaris 10 updates, we need to walk the stack to get there. */ + { + struct frame *fp = (struct frame *) context->cfa - 1; + struct handler_args { + int signo; + siginfo_t *sip; + ucontext_t ucontext; + } *handler_args; + ucontext_t *ucp; + + /* Next frame: __sighndlr frame pointer. */ + fp = (struct frame *) fp->fr_savfp; + /* call_user_handler frame pointer. */ + fp = (struct frame *) fp->fr_savfp; + /* sigacthandler frame pointer. */ + fp = (struct frame *) fp->fr_savfp; + + /* The argument area precedes the struct frame. */ + handler_args = (struct handler_args *) (fp + 1); + ucp = &handler_args->ucontext; + mctx = &ucp->uc_mcontext; + } + else + return _URC_END_OF_STACK; + + new_cfa = mctx->gregs[REG_RSP]; + + fs->regs.cfa_how = CFA_REG_OFFSET; + fs->regs.cfa_reg = 7; + fs->regs.cfa_offset = new_cfa - (long) context->cfa; + + /* The SVR4 register numbering macros aren't usable in libgcc. 
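The hard-coded column numbers that follow are the x86-64 psABI DWARF register numbers, which is why they look shuffled relative to the encoder's register order. For reference, a small illustrative table (not part of the file):

    /* DWARF register numbering on x86-64 (psABI):
       0 rax, 1 rdx, 2 rcx, 3 rbx, 4 rsi, 5 rdi, 6 rbp, 7 rsp,
       8..15 r8..r15, 16 return address (rip).  */
    static const char *const x86_64_dwarf_reg_names[17] = {
      "rax", "rdx", "rcx", "rbx", "rsi", "rdi", "rbp", "rsp",
      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "rip"
    };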
*/ + fs->regs.reg[0].how = REG_SAVED_OFFSET; + fs->regs.reg[0].loc.offset = (long)&mctx->gregs[REG_RAX] - new_cfa; + fs->regs.reg[1].how = REG_SAVED_OFFSET; + fs->regs.reg[1].loc.offset = (long)&mctx->gregs[REG_RDX] - new_cfa; + fs->regs.reg[2].how = REG_SAVED_OFFSET; + fs->regs.reg[2].loc.offset = (long)&mctx->gregs[REG_RCX] - new_cfa; + fs->regs.reg[3].how = REG_SAVED_OFFSET; + fs->regs.reg[3].loc.offset = (long)&mctx->gregs[REG_RBX] - new_cfa; + fs->regs.reg[4].how = REG_SAVED_OFFSET; + fs->regs.reg[4].loc.offset = (long)&mctx->gregs[REG_RSI] - new_cfa; + fs->regs.reg[5].how = REG_SAVED_OFFSET; + fs->regs.reg[5].loc.offset = (long)&mctx->gregs[REG_RDI] - new_cfa; + fs->regs.reg[6].how = REG_SAVED_OFFSET; + fs->regs.reg[6].loc.offset = (long)&mctx->gregs[REG_RBP] - new_cfa; + fs->regs.reg[8].how = REG_SAVED_OFFSET; + fs->regs.reg[8].loc.offset = (long)&mctx->gregs[REG_R8] - new_cfa; + fs->regs.reg[9].how = REG_SAVED_OFFSET; + fs->regs.reg[9].loc.offset = (long)&mctx->gregs[REG_R9] - new_cfa; + fs->regs.reg[10].how = REG_SAVED_OFFSET; + fs->regs.reg[10].loc.offset = (long)&mctx->gregs[REG_R10] - new_cfa; + fs->regs.reg[11].how = REG_SAVED_OFFSET; + fs->regs.reg[11].loc.offset = (long)&mctx->gregs[REG_R11] - new_cfa; + fs->regs.reg[12].how = REG_SAVED_OFFSET; + fs->regs.reg[12].loc.offset = (long)&mctx->gregs[REG_R12] - new_cfa; + fs->regs.reg[13].how = REG_SAVED_OFFSET; + fs->regs.reg[13].loc.offset = (long)&mctx->gregs[REG_R13] - new_cfa; + fs->regs.reg[14].how = REG_SAVED_OFFSET; + fs->regs.reg[14].loc.offset = (long)&mctx->gregs[REG_R14] - new_cfa; + fs->regs.reg[15].how = REG_SAVED_OFFSET; + fs->regs.reg[15].loc.offset = (long)&mctx->gregs[REG_R15] - new_cfa; + fs->regs.reg[16].how = REG_SAVED_OFFSET; + fs->regs.reg[16].loc.offset = (long)&mctx->gregs[REG_RIP] - new_cfa; + fs->retaddr_column = 16; + fs->signal_frame = 1; + + return _URC_NO_REASON; +} + +#else + +#define MD_FALLBACK_FRAME_STATE_FOR x86_fallback_frame_state + +static _Unwind_Reason_Code +x86_fallback_frame_state (struct _Unwind_Context *context, + _Unwind_FrameState *fs) +{ + unsigned char *pc = context->ra; + mcontext_t *mctx; + long new_cfa; + + if (/* Solaris 8 - single-threaded + ---------------------------- + : mov 0x10(%ebp),%esi + : push %esi + : pushl 0xc(%ebp) + : mov 0x8(%ebp),%ecx + : push %ecx + : mov offset(%ebx),%eax + : call *(%eax,%ecx,4) + : add $0xc,%esp <--- PC + : push %esi ... 
*/ + (*(unsigned long *)(pc - 20) == 0x5610758b + && *(unsigned long *)(pc - 16) == 0x8b0c75ff + && *(unsigned long *)(pc - 12) == 0x8b51084d + && *(unsigned char *)(pc - 8) == 0x83 + && *(unsigned long *)(pc - 4) == 0x8814ff00 + && *(unsigned long *)(pc - 0) == 0x560cc483) + + || /* Solaris 8 - multi-threaded + --------------------------- + <__sighndlr+0>: push %ebp + <__sighndlr+1>: mov %esp,%ebp + <__sighndlr+3>: pushl 0x10(%ebp) + <__sighndlr+6>: pushl 0xc(%ebp) + <__sighndlr+9>: pushl 0x8(%ebp) + <__sighndlr+12>: call *0x14(%ebp) + <__sighndlr+15>: leave <--- PC */ + (*(unsigned long *)(pc - 15) == 0xffec8b55 + && *(unsigned long *)(pc - 11) == 0x75ff1075 + && *(unsigned long *)(pc - 7) == 0x0875ff0c + && *(unsigned long *)(pc - 3) == 0xc91455ff) + + || /* Solaris 9 - single-threaded + ---------------------------- + : mov 0x244(%ebx),%ecx + : mov 0x8(%ebp),%eax + : mov (%ecx,%eax,4),%ecx + : pushl 0x10(%ebp) + : pushl 0xc(%ebp) + : push %eax + : call *%ecx + : add $0xc,%esp <--- PC + : pushl 0x10(%ebp) */ + (*(unsigned long *)(pc - 21) == 0x2448b8b + && *(unsigned long *)(pc - 17) == 0x458b0000 + && *(unsigned long *)(pc - 13) == 0x810c8b08 + && *(unsigned long *)(pc - 9) == 0xff1075ff + && *(unsigned long *)(pc - 5) == 0xff500c75 + && *(unsigned long *)(pc - 1) == 0xcc483d1) + + || /* Solaris 9 - multi-threaded, Solaris 10 + --------------------------------------- + <__sighndlr+0>: push %ebp + <__sighndlr+1>: mov %esp,%ebp + <__sighndlr+3>: pushl 0x10(%ebp) + <__sighndlr+6>: pushl 0xc(%ebp) + <__sighndlr+9>: pushl 0x8(%ebp) + <__sighndlr+12>: call *0x14(%ebp) + <__sighndlr+15>: add $0xc,%esp <--- PC + <__sighndlr+18>: leave + <__sighndlr+19>: ret */ + (*(unsigned long *)(pc - 15) == 0xffec8b55 + && *(unsigned long *)(pc - 11) == 0x75ff1075 + && *(unsigned long *)(pc - 7) == 0x0875ff0c + && *(unsigned long *)(pc - 3) == 0x831455ff + && *(unsigned long *)(pc + 1) == 0xc3c90cc4) + + || /* Solaris 11 before snv_125 + -------------------------- + <__sighndlr+0> push %ebp + <__sighndlr+1> mov %esp,%ebp + <__sighndlr+4> pushl 0x10(%ebp) + <__sighndlr+6> pushl 0xc(%ebp) + <__sighndlr+9> pushl 0x8(%ebp) + <__sighndlr+12> call *0x14(%ebp) + <__sighndlr+15> add $0xc,%esp + <__sighndlr+18> leave <--- PC + <__sighndlr+19> ret */ + (*(unsigned long *)(pc - 18) == 0xffec8b55 + && *(unsigned long *)(pc - 14) == 0x7fff107f + && *(unsigned long *)(pc - 10) == 0x0875ff0c + && *(unsigned long *)(pc - 6) == 0x83145fff + && *(unsigned long *)(pc - 1) == 0xc3c90cc4) + + || /* Solaris 11 since snv_125 + ------------------------- + <__sighndlr+0> push %ebp + <__sighndlr+1> mov %esp,%ebp + <__sighndlr+3> and $0xfffffff0,%esp + <__sighndlr+6> sub $0x4,%esp + <__sighndlr+9> pushl 0x10(%ebp) + <__sighndlr+12> pushl 0xc(%ebp) + <__sighndlr+15> pushl 0x8(%ebp) + <__sighndlr+18> call *0x14(%ebp) + <__sighndlr+21> leave <--- PC + <__sighndlr+22> ret */ + (*(unsigned long *)(pc - 21) == 0x83ec8b55 + && *(unsigned long *)(pc - 17) == 0xec83f0e4 + && *(unsigned long *)(pc - 13) == 0x1075ff04 + && *(unsigned long *)(pc - 9) == 0xff0c75ff + && *(unsigned long *)(pc - 5) == 0x55ff0875 + && (*(unsigned long *)(pc - 1) & 0x00ffffff) == 0x00c3c914)) + { + struct handler_args { + int signo; + siginfo_t *sip; + ucontext_t *ucontext; + } *handler_args = context->cfa; + mctx = &handler_args->ucontext->uc_mcontext; + } + else + return _URC_END_OF_STACK; + + new_cfa = mctx->gregs[UESP]; + + fs->regs.cfa_how = CFA_REG_OFFSET; + fs->regs.cfa_reg = 4; + fs->regs.cfa_offset = new_cfa - (long) context->cfa; + + /* The SVR4 register 
numbering macros aren't usable in libgcc. */ + fs->regs.reg[0].how = REG_SAVED_OFFSET; + fs->regs.reg[0].loc.offset = (long)&mctx->gregs[EAX] - new_cfa; + fs->regs.reg[3].how = REG_SAVED_OFFSET; + fs->regs.reg[3].loc.offset = (long)&mctx->gregs[EBX] - new_cfa; + fs->regs.reg[1].how = REG_SAVED_OFFSET; + fs->regs.reg[1].loc.offset = (long)&mctx->gregs[ECX] - new_cfa; + fs->regs.reg[2].how = REG_SAVED_OFFSET; + fs->regs.reg[2].loc.offset = (long)&mctx->gregs[EDX] - new_cfa; + fs->regs.reg[6].how = REG_SAVED_OFFSET; + fs->regs.reg[6].loc.offset = (long)&mctx->gregs[ESI] - new_cfa; + fs->regs.reg[7].how = REG_SAVED_OFFSET; + fs->regs.reg[7].loc.offset = (long)&mctx->gregs[EDI] - new_cfa; + fs->regs.reg[5].how = REG_SAVED_OFFSET; + fs->regs.reg[5].loc.offset = (long)&mctx->gregs[EBP] - new_cfa; + fs->regs.reg[8].how = REG_SAVED_OFFSET; + fs->regs.reg[8].loc.offset = (long)&mctx->gregs[EIP] - new_cfa; + fs->retaddr_column = 8; + fs->signal_frame = 1; + + return _URC_NO_REASON; +} + +#endif diff --git a/gcc/config/i386/sol2.h b/gcc/config/i386/sol2.h new file mode 100644 index 000000000..baddbb0b9 --- /dev/null +++ b/gcc/config/i386/sol2.h @@ -0,0 +1,182 @@ +/* Target definitions for GCC for Intel 80386 running Solaris 2 + Copyright (C) 1993, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, + 2004, 2007, 2008, 2009, 2010, 2011 Free Software Foundation, Inc. + Contributed by Fred Fish (fnf@cygnus.com). + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. */ + +/* The Solaris 2.0 x86 linker botches alignment of code sections. + It tries to align to a 16 byte boundary by padding with 0x00000090 + ints, rather than 0x90 bytes (nop). This generates trash in the + ".init" section since the contribution from crtbegin.o is only 7 + bytes. The linker pads it to 16 bytes with a single 0x90 byte, and + two 0x00000090 ints, which generates a segmentation violation when + executed. This macro forces the assembler to do the padding, since + it knows what it is doing. */ +#define FORCE_CODE_SECTION_ALIGN asm(ALIGN_ASM_OP "16"); + +/* Old versions of the Solaris assembler can not handle the difference of + labels in different sections, so force DW_EH_PE_datarel. */ +#undef ASM_PREFERRED_EH_DATA_FORMAT +#define ASM_PREFERRED_EH_DATA_FORMAT(CODE,GLOBAL) \ + (flag_pic ? ((GLOBAL ? DW_EH_PE_indirect : 0) \ + | (TARGET_64BIT ? DW_EH_PE_pcrel | DW_EH_PE_sdata4 \ + : DW_EH_PE_datarel)) \ + : DW_EH_PE_absptr) + +/* The Solaris linker will not merge a read-only .eh_frame section + with a read-write .eh_frame section. None of the encodings used + with non-PIC code require runtime relocations. In 64-bit mode, + since there is no backwards compatibility issue, we use a read-only + section for .eh_frame. In 32-bit mode, we use a writable .eh_frame + section in order to be compatible with G++ for Solaris x86. */ +#undef EH_TABLES_CAN_BE_READ_ONLY +#define EH_TABLES_CAN_BE_READ_ONLY (TARGET_64BIT) + +/* Solaris 2/Intel as chokes on #line directives. 
*/ +#undef CPP_SPEC +#define CPP_SPEC "%{,assembler-with-cpp:-P} %(cpp_subtarget)" + +/* FIXME: Removed -K PIC from generic Solaris 2 ASM_SPEC: the native assembler + gives many warnings: R_386_32 relocation is used for symbol ".text". */ +#undef ASM_SPEC +#define ASM_SPEC "%{v:-V} %{Qy:} %{!Qn:-Qy} %{Ym,*} -s %(asm_cpu)" + +#define ASM_CPU_SPEC "" + +#undef SUBTARGET_EXTRA_SPECS +#define SUBTARGET_EXTRA_SPECS \ + { "cpp_subtarget", CPP_SUBTARGET_SPEC }, \ + { "asm_cpu", ASM_CPU_SPEC }, \ + { "startfile_arch", STARTFILE_ARCH_SPEC }, \ + { "link_arch", LINK_ARCH_SPEC } + +#undef LOCAL_LABEL_PREFIX +#define LOCAL_LABEL_PREFIX "." + +/* The 32-bit Solaris assembler does not support .quad. Do not use it. */ +#ifndef HAVE_AS_IX86_QUAD +#undef ASM_QUAD +#endif + +/* The Solaris assembler wants a .local for non-exported aliases. */ +#define ASM_OUTPUT_DEF_FROM_DECLS(FILE, DECL, TARGET) \ + do { \ + const char *declname = \ + IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (DECL)); \ + ASM_OUTPUT_DEF ((FILE), declname, \ + IDENTIFIER_POINTER (TARGET)); \ + if (! TREE_PUBLIC (DECL)) \ + { \ + fprintf ((FILE), "%s", LOCAL_ASM_OP); \ + assemble_name ((FILE), declname); \ + fprintf ((FILE), "\n"); \ + } \ + } while (0) + +/* Follow Sun requirements for TLS code sequences and use Sun assembler TLS + syntax. */ +#undef TARGET_SUN_TLS +#define TARGET_SUN_TLS 1 + +/* The Sun assembler uses .tcomm for TLS common sections. */ +#define TLS_COMMON_ASM_OP ".tcomm" + +/* Similar to the Sun assembler on SPARC, the native assembler requires + TLS objects to be declared as @tls_obj (not @tls_object). Unlike SPARC, + gas doesn't understand this variant. */ +#ifndef USE_GAS +#undef ASM_DECLARE_OBJECT_NAME +#define ASM_DECLARE_OBJECT_NAME(FILE, NAME, DECL) \ + do \ + { \ + HOST_WIDE_INT size; \ + \ + if (targetm.have_tls && DECL_THREAD_LOCAL_P (DECL)) \ + ASM_OUTPUT_TYPE_DIRECTIVE (FILE, NAME, "tls_obj"); \ + else \ + ASM_OUTPUT_TYPE_DIRECTIVE (FILE, NAME, "object"); \ + \ + size_directive_output = 0; \ + if (!flag_inhibit_size_directive \ + && (DECL) && DECL_SIZE (DECL)) \ + { \ + size_directive_output = 1; \ + size = int_size_in_bytes (TREE_TYPE (DECL)); \ + ASM_OUTPUT_SIZE_DIRECTIVE (FILE, NAME, size); \ + } \ + \ + ASM_OUTPUT_LABEL (FILE, NAME); \ + } \ + while (0) +#endif + +/* The Solaris assembler cannot grok .stabd directives. */ +#undef NO_DBX_BNSYM_ENSYM +#define NO_DBX_BNSYM_ENSYM 1 + +/* Solaris-specific #pragmas are implemented on top of attributes. Hook in + the bits from config/sol2.c. */ +#define SUBTARGET_INSERT_ATTRIBUTES solaris_insert_attributes +#define SUBTARGET_ATTRIBUTE_TABLE SOLARIS_ATTRIBUTE_TABLE + +/* Register the Solaris-specific #pragma directives. */ +#define REGISTER_SUBTARGET_PRAGMAS() solaris_register_pragmas () + +/* Augment i386/unix.h version to return 8-byte vectors in memory, matching + Sun Studio compilers until version 12, the only ones supported on + Solaris 8 and 9. */ +#undef TARGET_SUBTARGET_DEFAULT +#define TARGET_SUBTARGET_DEFAULT \ + (MASK_80387 | MASK_IEEE_FP | MASK_FLOAT_RETURNS | MASK_VECT8_RETURNS) + +/* Output a simple call for .init/.fini. */ +#define ASM_OUTPUT_CALL(FILE, FN) \ + do \ + { \ + fprintf (FILE, "\tcall\t"); \ + ix86_print_operand (FILE, XEXP (DECL_RTL (FN), 0), 'P'); \ + fprintf (FILE, "\n"); \ + } \ + while (0) + +/* We do not need NT_VERSION notes. */ +#undef X86_FILE_START_VERSION_DIRECTIVE +#define X86_FILE_START_VERSION_DIRECTIVE false + +/* Static stack checking is supported by means of probes. 
*/ +#define STACK_CHECK_STATIC_BUILTIN 1 + +/* Only recent versions of Solaris 11 ld properly support hidden .gnu.linkonce + sections, so don't use them. */ +#ifndef TARGET_GNU_LD +#define USE_HIDDEN_LINKONCE 0 +#endif + +/* Put all *tf routines in libgcc. */ +#undef LIBGCC2_HAS_TF_MODE +#define LIBGCC2_HAS_TF_MODE 1 +#define LIBGCC2_TF_CEXT q +#define TF_SIZE 113 + +#undef SIZE_TYPE +#define SIZE_TYPE "unsigned int" + +#undef PTRDIFF_TYPE +#define PTRDIFF_TYPE "int" + +#define MD_UNWIND_SUPPORT "config/i386/sol2-unwind.h" diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md new file mode 100644 index 000000000..2463985f8 --- /dev/null +++ b/gcc/config/i386/sse.md @@ -0,0 +1,12125 @@ +;; GCC machine description for SSE instructions +;; Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010, 2011 +;; Free Software Foundation, Inc. +;; +;; This file is part of GCC. +;; +;; GCC is free software; you can redistribute it and/or modify +;; it under the terms of the GNU General Public License as published by +;; the Free Software Foundation; either version 3, or (at your option) +;; any later version. +;; +;; GCC is distributed in the hope that it will be useful, +;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;; GNU General Public License for more details. +;; +;; You should have received a copy of the GNU General Public License +;; along with GCC; see the file COPYING3. If not see +;; . + + +;; Instruction suffix for sign and zero extensions. +(define_code_attr extsuffix [(sign_extend "sx") (zero_extend "zx")]) + +;; 16 byte integral modes handled by SSE +(define_mode_iterator SSEMODEI [V16QI V8HI V4SI V2DI]) + +;; All 16-byte vector modes handled by SSE +(define_mode_iterator SSEMODE [V16QI V8HI V4SI V2DI V4SF V2DF]) +(define_mode_iterator SSEMODE16 [V16QI V8HI V4SI V2DI V1TI V4SF V2DF]) + +;; 32 byte integral vector modes handled by AVX +(define_mode_iterator AVX256MODEI [V32QI V16HI V8SI V4DI]) + +;; All 32-byte vector modes handled by AVX +(define_mode_iterator AVX256MODE [V32QI V16HI V8SI V4DI V8SF V4DF]) + +;; All QI vector modes handled by AVX +(define_mode_iterator AVXMODEQI [V32QI V16QI]) + +;; All DI vector modes handled by AVX +(define_mode_iterator AVXMODEDI [V4DI V2DI]) + +;; All vector modes handled by AVX +(define_mode_iterator AVXMODE + [V16QI V8HI V4SI V2DI V4SF V2DF V32QI V16HI V8SI V4DI V8SF V4DF]) +(define_mode_iterator AVXMODE16 + [V16QI V8HI V4SI V2DI V1TI V4SF V2DF V32QI V16HI V8SI V4DI V8SF V4DF]) + +;; Mix-n-match +(define_mode_iterator SSEMODE12 [V16QI V8HI]) +(define_mode_iterator SSEMODE24 [V8HI V4SI]) +(define_mode_iterator SSEMODE14 [V16QI V4SI]) +(define_mode_iterator SSEMODE124 [V16QI V8HI V4SI]) +(define_mode_iterator SSEMODE248 [V8HI V4SI V2DI]) +(define_mode_iterator SSEMODE1248 [V16QI V8HI V4SI V2DI]) +(define_mode_iterator SSEMODEF4 [SF DF V4SF V2DF]) +(define_mode_iterator SSEMODEF2P [V4SF V2DF]) + +(define_mode_iterator AVX256MODEF2P [V8SF V4DF]) +(define_mode_iterator AVX256MODE2P [V8SI V8SF V4DF]) +(define_mode_iterator AVX256MODE24P [V8SI V8SF V4DI V4DF]) +(define_mode_iterator AVX256MODE4P [V4DI V4DF]) +(define_mode_iterator AVX256MODE8P [V8SI V8SF]) +(define_mode_iterator AVXMODEF2P [V4SF V2DF V8SF V4DF]) +(define_mode_iterator AVXMODEF4P [V4SF V4DF]) +(define_mode_iterator AVXMODEFDP [V2DF V4DF]) +(define_mode_iterator AVXMODEFSP [V4SF V8SF]) +(define_mode_iterator AVXMODEDCVTDQ2PS [V4SF V8SF]) +(define_mode_iterator AVXMODEDCVTPS2DQ [V4SI V8SI]) + 
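A rough illustration, not part of the imported sources: the machine modes enumerated by the iterators above correspond to the vector types visible from GNU C. The typedef and function names below are examples only; building with -msse2 (16-byte modes) or -mavx (32-byte modes) is what lets GCC instantiate the patterns defined in this file for them.

    typedef char      v16qi __attribute__ ((vector_size (16)));  /* V16QI */
    typedef short     v8hi  __attribute__ ((vector_size (16)));  /* V8HI  */
    typedef int       v4si  __attribute__ ((vector_size (16)));  /* V4SI  */
    typedef long long v2di  __attribute__ ((vector_size (16)));  /* V2DI  */
    typedef float     v4sf  __attribute__ ((vector_size (16)));  /* V4SF  */
    typedef double    v2df  __attribute__ ((vector_size (16)));  /* V2DF  */
    typedef float     v8sf  __attribute__ ((vector_size (32)));  /* V8SF (AVX) */
    typedef double    v4df  __attribute__ ((vector_size (32)));  /* V4DF (AVX) */

    /* A plain vector addition like this is what the plusminus patterns
       further down ultimately implement (addps for v4sf, vaddps for v8sf).  */
    static inline v4sf add_v4sf (v4sf a, v4sf b) { return a + b; }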
+(define_mode_iterator FMAMODE [SF DF V4SF V2DF V8SF V4DF]) + +;; Int-float size matches +(define_mode_iterator SSEMODE4S [V4SF V4SI]) +(define_mode_iterator SSEMODE2D [V2DF V2DI]) + +;; Modes handled by integer vcond pattern +(define_mode_iterator SSEMODE124C8 [V16QI V8HI V4SI + (V2DI "TARGET_SSE4_2")]) + +;; Modes handled by vec_extract_even/odd pattern. +(define_mode_iterator SSEMODE_EO + [(V4SF "TARGET_SSE") + (V2DF "TARGET_SSE2") + (V2DI "TARGET_SSE2") (V4SI "TARGET_SSE2") + (V8HI "TARGET_SSE2") (V16QI "TARGET_SSE2") + (V4DF "TARGET_AVX") (V8SF "TARGET_AVX")]) + +;; Modes handled by storent patterns. +(define_mode_iterator STORENT_MODE + [(SF "TARGET_SSE4A") (DF "TARGET_SSE4A") + (SI "TARGET_SSE2") (V2DI "TARGET_SSE2") (V2DF "TARGET_SSE2") + (V4SF "TARGET_SSE") + (V4DF "TARGET_AVX") (V8SF "TARGET_AVX")]) + +;; Modes handled by vector float patterns. +(define_mode_iterator VEC_FLOAT_MODE + [(V2DF "TARGET_SSE2") (V4SF "TARGET_SSE") + (V4DF "TARGET_AVX") (V8SF "TARGET_AVX")]) + +;; Modes handled by vector extract patterns. +(define_mode_iterator VEC_EXTRACT_MODE + [(V2DI "TARGET_SSE") (V4SI "TARGET_SSE") + (V8HI "TARGET_SSE") (V16QI "TARGET_SSE") + (V2DF "TARGET_SSE") (V4SF "TARGET_SSE") + (V4DF "TARGET_AVX") (V8SF "TARGET_AVX")]) + +;; Mapping from float mode to required SSE level +(define_mode_attr sse [(SF "sse") (DF "sse2") (V4SF "sse") (V2DF "sse2")]) + +;; Mapping from integer vector mode to mnemonic suffix +(define_mode_attr ssevecsize [(V16QI "b") (V8HI "w") (V4SI "d") (V2DI "q")]) + +;; Mapping of the insn mnemonic suffix +(define_mode_attr ssemodesuffix + [(SF "ss") (DF "sd") (V4SF "ps") (V2DF "pd") (V8SF "ps") (V4DF "pd") + (V8SI "ps") (V4DI "pd")]) +(define_mode_attr ssescalarmodesuffix + [(SF "ss") (DF "sd") (V4SF "ss") (V2DF "sd") (V8SF "ss") (V8SI "ss") + (V4DF "sd") (V4SI "d") (V4DI "sd")]) + +;; Mapping of the max integer size for xop rotate immediate constraint +(define_mode_attr sserotatemax [(V16QI "7") (V8HI "15") (V4SI "31") (V2DI "63")]) + +;; Mapping of vector modes back to the scalar modes +(define_mode_attr ssescalarmode [(V4SF "SF") (V2DF "DF") + (V16QI "QI") (V8HI "HI") + (V4SI "SI") (V2DI "DI")]) + +;; Mapping of vector modes to a vector mode of double size +(define_mode_attr ssedoublesizemode + [(V2DF "V4DF") (V2DI "V4DI") (V4SF "V8SF") (V4SI "V8SI") + (V8HI "V16HI") (V16QI "V32QI") + (V4DF "V8DF") (V8SF "V16SF") + (V4DI "V8DI") (V8SI "V16SI") (V16HI "V32HI") (V32QI "V64QI")]) + +;; Number of scalar elements in each vector type +(define_mode_attr ssescalarnum + [(V4SF "4") (V2DF "2") (V16QI "16") (V8HI "8") (V4SI "4") (V2DI "2") + (V8SF "8") (V4DF "4") (V32QI "32") (V16HI "16") (V8SI "8") (V4DI "4")]) + +;; Mapping for AVX +(define_mode_attr avxvecmode + [(V16QI "TI") (V8HI "TI") (V4SI "TI") (V2DI "TI") (V1TI "TI") + (V4SF "V4SF") (V8SF "V8SF") (V2DF "V2DF") (V4DF "V4DF") + (V32QI "OI") (V16HI "OI") (V8SI "OI") (V4DI "OI")]) +(define_mode_attr avxvecpsmode + [(V16QI "V4SF") (V8HI "V4SF") (V4SI "V4SF") (V2DI "V4SF") + (V32QI "V8SF") (V16HI "V8SF") (V8SI "V8SF") (V4DI "V8SF")]) +(define_mode_attr avxhalfvecmode + [(V32QI "V16QI") (V16HI "V8HI") (V8SI "V4SI") (V4DI "V2DI") + (V8SF "V4SF") (V4DF "V2DF") + (V16QI "V8QI") (V8HI "V4HI") (V4SI "V2SI") (V4SF "V2SF")]) +(define_mode_attr avxscalarmode + [(V16QI "QI") (V8HI "HI") (V4SI "SI") (V2DI "DI") (V4SF "SF") (V2DF "DF") + (V32QI "QI") (V16HI "HI") (V8SI "SI") (V4DI "DI") (V8SF "SF") (V4DF "DF")]) +(define_mode_attr avxcvtvecmode + [(V4SF "V4SI") (V8SF "V8SI") (V4SI "V4SF") (V8SI "V8SF")]) +(define_mode_attr 
avxpermvecmode + [(V2DF "V2DI") (V4SF "V4SI") (V4DF "V4DI") (V8SF "V8SI")]) +(define_mode_attr avxmodesuffixp + [(V2DF "pd") (V4SI "si") (V4SF "ps") (V8SF "ps") (V8SI "si") + (V4DF "pd")]) +(define_mode_attr avxmodesuffix + [(V16QI "") (V32QI "256") (V4SI "") (V4SF "") (V2DF "") + (V8SI "256") (V8SF "256") (V4DF "256")]) + +;; Mapping of immediate bits for blend instructions +(define_mode_attr blendbits + [(V8SF "255") (V4SF "15") (V4DF "15") (V2DF "3")]) + +;; Mapping of immediate bits for pinsr instructions +(define_mode_attr pinsrbits [(V16QI "32768") (V8HI "128") (V4SI "8")]) + +;; Patterns whose name begins with "sse{,2,3}_" are invoked by intrinsics. + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; +;; Move patterns +;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(define_expand "mov" + [(set (match_operand:AVX256MODE 0 "nonimmediate_operand" "") + (match_operand:AVX256MODE 1 "nonimmediate_operand" ""))] + "TARGET_AVX" +{ + ix86_expand_vector_move (mode, operands); + DONE; +}) + +(define_insn "*avx_mov_internal" + [(set (match_operand:AVXMODE16 0 "nonimmediate_operand" "=x,x ,m") + (match_operand:AVXMODE16 1 "nonimmediate_or_sse_const_operand" "C ,xm,x"))] + "TARGET_AVX + && (register_operand (operands[0], mode) + || register_operand (operands[1], mode))" +{ + switch (which_alternative) + { + case 0: + return standard_sse_constant_opcode (insn, operands[1]); + case 1: + case 2: + switch (get_attr_mode (insn)) + { + case MODE_V8SF: + case MODE_V4SF: + if (misaligned_operand (operands[0], mode) + || misaligned_operand (operands[1], mode)) + return "vmovups\t{%1, %0|%0, %1}"; + else + return "vmovaps\t{%1, %0|%0, %1}"; + case MODE_V4DF: + case MODE_V2DF: + if (misaligned_operand (operands[0], mode) + || misaligned_operand (operands[1], mode)) + return "vmovupd\t{%1, %0|%0, %1}"; + else if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL) + return "vmovaps\t{%1, %0|%0, %1}"; + else + return "vmovapd\t{%1, %0|%0, %1}"; + default: + if (misaligned_operand (operands[0], mode) + || misaligned_operand (operands[1], mode)) + return "vmovdqu\t{%1, %0|%0, %1}"; + else if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL) + return "vmovaps\t{%1, %0|%0, %1}"; + else + return "vmovdqa\t{%1, %0|%0, %1}"; + } + default: + gcc_unreachable (); + } +} + [(set_attr "type" "sselog1,ssemov,ssemov") + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + +;; All of these patterns are enabled for SSE1 as well as SSE2. +;; This is essential for maintaining stable calling conventions. 
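A minimal sketch, not part of the imported sources: the aligned/unaligned alternatives chosen in the move patterns above are the same distinction the usual load/store intrinsics expose -- movaps/vmovaps when the operand is known to be 16-byte aligned, movups/vmovups otherwise. The function and parameter names below are illustrative.

    #include <xmmintrin.h>

    void copy4 (float *dst, const float *src_aligned, const float *src_unaligned)
    {
      __m128 a = _mm_load_ps (src_aligned);     /* movaps: needs 16-byte alignment */
      __m128 u = _mm_loadu_ps (src_unaligned);  /* movups: no alignment requirement */
      _mm_store_ps (dst, _mm_add_ps (a, u));    /* addps, then an aligned store
                                                   (dst assumed 16-byte aligned) */
    }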
+ +(define_expand "mov" + [(set (match_operand:SSEMODE16 0 "nonimmediate_operand" "") + (match_operand:SSEMODE16 1 "nonimmediate_operand" ""))] + "TARGET_SSE" +{ + ix86_expand_vector_move (mode, operands); + DONE; +}) + +(define_insn "*mov_internal" + [(set (match_operand:SSEMODE16 0 "nonimmediate_operand" "=x,x ,m") + (match_operand:SSEMODE16 1 "nonimmediate_or_sse_const_operand" "C ,xm,x"))] + "TARGET_SSE + && (register_operand (operands[0], mode) + || register_operand (operands[1], mode))" +{ + switch (which_alternative) + { + case 0: + return standard_sse_constant_opcode (insn, operands[1]); + case 1: + case 2: + switch (get_attr_mode (insn)) + { + case MODE_V4SF: + return "movaps\t{%1, %0|%0, %1}"; + case MODE_V2DF: + if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL) + return "movaps\t{%1, %0|%0, %1}"; + else + return "movapd\t{%1, %0|%0, %1}"; + default: + if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL) + return "movaps\t{%1, %0|%0, %1}"; + else + return "movdqa\t{%1, %0|%0, %1}"; + } + default: + gcc_unreachable (); + } +} + [(set_attr "type" "sselog1,ssemov,ssemov") + (set (attr "mode") + (cond [(ior (ior (ne (symbol_ref "optimize_function_for_size_p (cfun)") (const_int 0)) + (eq (symbol_ref "TARGET_SSE2") (const_int 0))) + (and (eq_attr "alternative" "2") + (ne (symbol_ref "TARGET_SSE_TYPELESS_STORES") + (const_int 0)))) + (const_string "V4SF") + (eq (const_string "mode") (const_string "V4SFmode")) + (const_string "V4SF") + (eq (const_string "mode") (const_string "V2DFmode")) + (const_string "V2DF") + ] + (const_string "TI")))]) + +;; Move a DI from a 32-bit register pair (e.g. %edx:%eax) to an xmm. +;; We'd rather avoid this entirely; if the 32-bit reg pair was loaded +;; from memory, we'd prefer to load the memory directly into the %xmm +;; register. To facilitate this happy circumstance, this pattern won't +;; split until after register allocation. If the 64-bit value didn't +;; come from memory, this is the best we can do. This is much better +;; than storing %edx:%eax into a stack temporary and loading an %xmm +;; from there. + +(define_insn_and_split "movdi_to_sse" + [(parallel + [(set (match_operand:V4SI 0 "register_operand" "=?x,x") + (subreg:V4SI (match_operand:DI 1 "nonimmediate_operand" "r,m") 0)) + (clobber (match_scratch:V4SI 2 "=&x,X"))])] + "!TARGET_64BIT && TARGET_SSE2 && TARGET_INTER_UNIT_MOVES" + "#" + "&& reload_completed" + [(const_int 0)] +{ + if (register_operand (operands[1], DImode)) + { + /* The DImode arrived in a pair of integral registers (e.g. %edx:%eax). + Assemble the 64-bit DImode value in an xmm register. 
*/ + emit_insn (gen_sse2_loadld (operands[0], CONST0_RTX (V4SImode), + gen_rtx_SUBREG (SImode, operands[1], 0))); + emit_insn (gen_sse2_loadld (operands[2], CONST0_RTX (V4SImode), + gen_rtx_SUBREG (SImode, operands[1], 4))); + emit_insn (gen_vec_interleave_lowv4si (operands[0], operands[0], + operands[2])); + } + else if (memory_operand (operands[1], DImode)) + emit_insn (gen_vec_concatv2di (gen_lowpart (V2DImode, operands[0]), + operands[1], const0_rtx)); + else + gcc_unreachable (); +}) + +(define_split + [(set (match_operand:V4SF 0 "register_operand" "") + (match_operand:V4SF 1 "zero_extended_scalar_load_operand" ""))] + "TARGET_SSE && reload_completed" + [(set (match_dup 0) + (vec_merge:V4SF + (vec_duplicate:V4SF (match_dup 1)) + (match_dup 2) + (const_int 1)))] +{ + operands[1] = simplify_gen_subreg (SFmode, operands[1], V4SFmode, 0); + operands[2] = CONST0_RTX (V4SFmode); +}) + +(define_split + [(set (match_operand:V2DF 0 "register_operand" "") + (match_operand:V2DF 1 "zero_extended_scalar_load_operand" ""))] + "TARGET_SSE2 && reload_completed" + [(set (match_dup 0) (vec_concat:V2DF (match_dup 1) (match_dup 2)))] +{ + operands[1] = simplify_gen_subreg (DFmode, operands[1], V2DFmode, 0); + operands[2] = CONST0_RTX (DFmode); +}) + +(define_expand "push1" + [(match_operand:AVX256MODE 0 "register_operand" "")] + "TARGET_AVX" +{ + ix86_expand_push (mode, operands[0]); + DONE; +}) + +(define_expand "push1" + [(match_operand:SSEMODE16 0 "register_operand" "")] + "TARGET_SSE" +{ + ix86_expand_push (mode, operands[0]); + DONE; +}) + +(define_expand "movmisalign" + [(set (match_operand:AVX256MODE 0 "nonimmediate_operand" "") + (match_operand:AVX256MODE 1 "nonimmediate_operand" ""))] + "TARGET_AVX" +{ + ix86_expand_vector_move_misalign (mode, operands); + DONE; +}) + +(define_expand "movmisalign" + [(set (match_operand:SSEMODE16 0 "nonimmediate_operand" "") + (match_operand:SSEMODE16 1 "nonimmediate_operand" ""))] + "TARGET_SSE" +{ + ix86_expand_vector_move_misalign (mode, operands); + DONE; +}) + +(define_insn "avx_movu" + [(set (match_operand:AVXMODEF2P 0 "nonimmediate_operand" "=x,m") + (unspec:AVXMODEF2P + [(match_operand:AVXMODEF2P 1 "nonimmediate_operand" "xm,x")] + UNSPEC_MOVU))] + "AVX_VEC_FLOAT_MODE_P (mode) + && !(MEM_P (operands[0]) && MEM_P (operands[1]))" + "vmovu\t{%1, %0|%0, %1}" + [(set_attr "type" "ssemov") + (set_attr "movu" "1") + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + +(define_insn "sse2_movq128" + [(set (match_operand:V2DI 0 "register_operand" "=x") + (vec_concat:V2DI + (vec_select:DI + (match_operand:V2DI 1 "nonimmediate_operand" "xm") + (parallel [(const_int 0)])) + (const_int 0)))] + "TARGET_SSE2" + "%vmovq\t{%1, %0|%0, %1}" + [(set_attr "type" "ssemov") + (set_attr "prefix" "maybe_vex") + (set_attr "mode" "TI")]) + +(define_insn "_movu" + [(set (match_operand:SSEMODEF2P 0 "nonimmediate_operand" "=x,m") + (unspec:SSEMODEF2P + [(match_operand:SSEMODEF2P 1 "nonimmediate_operand" "xm,x")] + UNSPEC_MOVU))] + "SSE_VEC_FLOAT_MODE_P (mode) + && !(MEM_P (operands[0]) && MEM_P (operands[1]))" + "movu\t{%1, %0|%0, %1}" + [(set_attr "type" "ssemov") + (set_attr "movu" "1") + (set_attr "mode" "")]) + +(define_insn "avx_movdqu" + [(set (match_operand:AVXMODEQI 0 "nonimmediate_operand" "=x,m") + (unspec:AVXMODEQI + [(match_operand:AVXMODEQI 1 "nonimmediate_operand" "xm,x")] + UNSPEC_MOVU))] + "TARGET_AVX && !(MEM_P (operands[0]) && MEM_P (operands[1]))" + "vmovdqu\t{%1, %0|%0, %1}" + [(set_attr "type" "ssemov") + (set_attr "movu" "1") + (set_attr "prefix" "vex") + 
(set_attr "mode" "")]) + +(define_insn "sse2_movdqu" + [(set (match_operand:V16QI 0 "nonimmediate_operand" "=x,m") + (unspec:V16QI [(match_operand:V16QI 1 "nonimmediate_operand" "xm,x")] + UNSPEC_MOVU))] + "TARGET_SSE2 && !(MEM_P (operands[0]) && MEM_P (operands[1]))" + "movdqu\t{%1, %0|%0, %1}" + [(set_attr "type" "ssemov") + (set_attr "movu" "1") + (set_attr "prefix_data16" "1") + (set_attr "mode" "TI")]) + +(define_insn "avx_movnt" + [(set (match_operand:AVXMODEF2P 0 "memory_operand" "=m") + (unspec:AVXMODEF2P + [(match_operand:AVXMODEF2P 1 "register_operand" "x")] + UNSPEC_MOVNT))] + "AVX_VEC_FLOAT_MODE_P (mode)" + "vmovnt\t{%1, %0|%0, %1}" + [(set_attr "type" "ssemov") + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + +(define_insn "_movnt" + [(set (match_operand:SSEMODEF2P 0 "memory_operand" "=m") + (unspec:SSEMODEF2P + [(match_operand:SSEMODEF2P 1 "register_operand" "x")] + UNSPEC_MOVNT))] + "SSE_VEC_FLOAT_MODE_P (mode)" + "movnt\t{%1, %0|%0, %1}" + [(set_attr "type" "ssemov") + (set_attr "mode" "")]) + +(define_insn "avx_movnt" + [(set (match_operand:AVXMODEDI 0 "memory_operand" "=m") + (unspec:AVXMODEDI + [(match_operand:AVXMODEDI 1 "register_operand" "x")] + UNSPEC_MOVNT))] + "TARGET_AVX" + "vmovntdq\t{%1, %0|%0, %1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + +(define_insn "sse2_movntv2di" + [(set (match_operand:V2DI 0 "memory_operand" "=m") + (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "x")] + UNSPEC_MOVNT))] + "TARGET_SSE2" + "movntdq\t{%1, %0|%0, %1}" + [(set_attr "type" "ssemov") + (set_attr "prefix_data16" "1") + (set_attr "mode" "TI")]) + +(define_insn "sse2_movntsi" + [(set (match_operand:SI 0 "memory_operand" "=m") + (unspec:SI [(match_operand:SI 1 "register_operand" "r")] + UNSPEC_MOVNT))] + "TARGET_SSE2" + "movnti\t{%1, %0|%0, %1}" + [(set_attr "type" "ssemov") + (set_attr "prefix_data16" "0") + (set_attr "mode" "V2DF")]) + +(define_insn "avx_lddqu" + [(set (match_operand:AVXMODEQI 0 "register_operand" "=x") + (unspec:AVXMODEQI + [(match_operand:AVXMODEQI 1 "memory_operand" "m")] + UNSPEC_LDDQU))] + "TARGET_AVX" + "vlddqu\t{%1, %0|%0, %1}" + [(set_attr "type" "ssecvt") + (set_attr "movu" "1") + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + +(define_insn "sse3_lddqu" + [(set (match_operand:V16QI 0 "register_operand" "=x") + (unspec:V16QI [(match_operand:V16QI 1 "memory_operand" "m")] + UNSPEC_LDDQU))] + "TARGET_SSE3" + "lddqu\t{%1, %0|%0, %1}" + [(set_attr "type" "ssemov") + (set_attr "movu" "1") + (set_attr "prefix_data16" "0") + (set_attr "prefix_rep" "1") + (set_attr "mode" "TI")]) + +; Expand patterns for non-temporal stores. At the moment, only those +; that directly map to insns are defined; it would be possible to +; define patterns for other modes that would expand to several insns. 
+ +(define_expand "storent" + [(set (match_operand:STORENT_MODE 0 "memory_operand" "") + (unspec:STORENT_MODE + [(match_operand:STORENT_MODE 1 "register_operand" "")] + UNSPEC_MOVNT))]) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; +;; Parallel floating point arithmetic +;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(define_expand "2" + [(set (match_operand:VEC_FLOAT_MODE 0 "register_operand" "") + (absneg:VEC_FLOAT_MODE + (match_operand:VEC_FLOAT_MODE 1 "register_operand" "")))] + "" + "ix86_expand_fp_absneg_operator (, mode, operands); DONE;") + +(define_insn_and_split "*avx_absneg2" + [(set (match_operand:AVXMODEF2P 0 "register_operand" "=x,x") + (match_operator:AVXMODEF2P 3 "absneg_operator" + [(match_operand:AVXMODEF2P 1 "nonimmediate_operand" "x,m")])) + (use (match_operand:AVXMODEF2P 2 "nonimmediate_operand" "xm,x"))] + "AVX_VEC_FLOAT_MODE_P (mode)" + "#" + "&& reload_completed" + [(const_int 0)] +{ + rtx t; + + if (MEM_P (operands[1])) + t = gen_rtx_fmt_ee (GET_CODE (operands[3]) == NEG ? XOR : AND, + mode, operands[2], operands[1]); + else + t = gen_rtx_fmt_ee (GET_CODE (operands[3]) == NEG ? XOR : AND, + mode, operands[1], operands[2]); + t = gen_rtx_SET (VOIDmode, operands[0], t); + emit_insn (t); + DONE; +}) + +(define_insn_and_split "*sse_absneg2" + [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x,x") + (match_operator:SSEMODEF2P 3 "absneg_operator" + [(match_operand:SSEMODEF2P 1 "nonimmediate_operand" "0,xm")])) + (use (match_operand:SSEMODEF2P 2 "nonimmediate_operand" "xm,0"))] + "SSE_VEC_FLOAT_MODE_P (mode)" + "#" + "&& reload_completed" + [(const_int 0)] +{ + rtx t; + + t = operands[rtx_equal_p (operands[0], operands[1]) ? 2 : 1]; + t = gen_rtx_fmt_ee (GET_CODE (operands[3]) == NEG ? 
XOR : AND, + mode, operands[0], t); + t = gen_rtx_SET (VOIDmode, operands[0], t); + emit_insn (t); + DONE; +}) + +(define_expand "3" + [(set (match_operand:AVX256MODEF2P 0 "register_operand" "") + (plusminus:AVX256MODEF2P + (match_operand:AVX256MODEF2P 1 "nonimmediate_operand" "") + (match_operand:AVX256MODEF2P 2 "nonimmediate_operand" "")))] + "AVX256_VEC_FLOAT_MODE_P (mode)" + "ix86_fixup_binary_operands_no_copy (, mode, operands);") + +(define_insn "*avx_3" + [(set (match_operand:AVXMODEF2P 0 "register_operand" "=x") + (plusminus:AVXMODEF2P + (match_operand:AVXMODEF2P 1 "nonimmediate_operand" "x") + (match_operand:AVXMODEF2P 2 "nonimmediate_operand" "xm")))] + "AVX_VEC_FLOAT_MODE_P (mode) + && ix86_binary_operator_ok (, mode, operands)" + "v\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseadd") + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + +(define_expand "3" + [(set (match_operand:SSEMODEF2P 0 "register_operand" "") + (plusminus:SSEMODEF2P + (match_operand:SSEMODEF2P 1 "nonimmediate_operand" "") + (match_operand:SSEMODEF2P 2 "nonimmediate_operand" "")))] + "SSE_VEC_FLOAT_MODE_P (mode)" + "ix86_fixup_binary_operands_no_copy (, mode, operands);") + +(define_insn "*3" + [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x") + (plusminus:SSEMODEF2P + (match_operand:SSEMODEF2P 1 "nonimmediate_operand" "0") + (match_operand:SSEMODEF2P 2 "nonimmediate_operand" "xm")))] + "SSE_VEC_FLOAT_MODE_P (mode) + && ix86_binary_operator_ok (, mode, operands)" + "\t{%2, %0|%0, %2}" + [(set_attr "type" "sseadd") + (set_attr "mode" "")]) + +(define_insn "*avx_vm3" + [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x") + (vec_merge:SSEMODEF2P + (plusminus:SSEMODEF2P + (match_operand:SSEMODEF2P 1 "register_operand" "x") + (match_operand:SSEMODEF2P 2 "nonimmediate_operand" "xm")) + (match_dup 1) + (const_int 1)))] + "AVX128_VEC_FLOAT_MODE_P (mode)" + "v\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseadd") + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + +(define_insn "_vm3" + [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x") + (vec_merge:SSEMODEF2P + (plusminus:SSEMODEF2P + (match_operand:SSEMODEF2P 1 "register_operand" "0") + (match_operand:SSEMODEF2P 2 "nonimmediate_operand" "xm")) + (match_dup 1) + (const_int 1)))] + "SSE_VEC_FLOAT_MODE_P (mode)" + "\t{%2, %0|%0, %2}" + [(set_attr "type" "sseadd") + (set_attr "mode" "")]) + +(define_expand "mul3" + [(set (match_operand:AVX256MODEF2P 0 "register_operand" "") + (mult:AVX256MODEF2P + (match_operand:AVX256MODEF2P 1 "nonimmediate_operand" "") + (match_operand:AVX256MODEF2P 2 "nonimmediate_operand" "")))] + "AVX256_VEC_FLOAT_MODE_P (mode)" + "ix86_fixup_binary_operands_no_copy (MULT, mode, operands);") + +(define_insn "*avx_mul3" + [(set (match_operand:AVXMODEF2P 0 "register_operand" "=x") + (mult:AVXMODEF2P + (match_operand:AVXMODEF2P 1 "nonimmediate_operand" "%x") + (match_operand:AVXMODEF2P 2 "nonimmediate_operand" "xm")))] + "AVX_VEC_FLOAT_MODE_P (mode) + && ix86_binary_operator_ok (MULT, mode, operands)" + "vmul\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "ssemul") + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + +(define_expand "mul3" + [(set (match_operand:SSEMODEF2P 0 "register_operand" "") + (mult:SSEMODEF2P + (match_operand:SSEMODEF2P 1 "nonimmediate_operand" "") + (match_operand:SSEMODEF2P 2 "nonimmediate_operand" "")))] + "SSE_VEC_FLOAT_MODE_P (mode)" + "ix86_fixup_binary_operands_no_copy (MULT, mode, operands);") + +(define_insn "*mul3" + [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x") + 
(mult:SSEMODEF2P + (match_operand:SSEMODEF2P 1 "nonimmediate_operand" "%0") + (match_operand:SSEMODEF2P 2 "nonimmediate_operand" "xm")))] + "SSE_VEC_FLOAT_MODE_P (mode) + && ix86_binary_operator_ok (MULT, mode, operands)" + "mul\t{%2, %0|%0, %2}" + [(set_attr "type" "ssemul") + (set_attr "mode" "")]) + +(define_insn "*avx_vmmul3" + [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x") + (vec_merge:SSEMODEF2P + (mult:SSEMODEF2P + (match_operand:SSEMODEF2P 1 "register_operand" "x") + (match_operand:SSEMODEF2P 2 "nonimmediate_operand" "xm")) + (match_dup 1) + (const_int 1)))] + "AVX_VEC_FLOAT_MODE_P (mode)" + "vmul\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "ssemul") + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + +(define_insn "_vmmul3" + [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x") + (vec_merge:SSEMODEF2P + (mult:SSEMODEF2P + (match_operand:SSEMODEF2P 1 "register_operand" "0") + (match_operand:SSEMODEF2P 2 "nonimmediate_operand" "xm")) + (match_dup 1) + (const_int 1)))] + "SSE_VEC_FLOAT_MODE_P (mode)" + "mul\t{%2, %0|%0, %2}" + [(set_attr "type" "ssemul") + (set_attr "mode" "")]) + +(define_expand "divv8sf3" + [(set (match_operand:V8SF 0 "register_operand" "") + (div:V8SF (match_operand:V8SF 1 "register_operand" "") + (match_operand:V8SF 2 "nonimmediate_operand" "")))] + "TARGET_AVX" +{ + ix86_fixup_binary_operands_no_copy (DIV, V8SFmode, operands); + + if (TARGET_SSE_MATH && TARGET_RECIP && !optimize_insn_for_size_p () + && flag_finite_math_only && !flag_trapping_math + && flag_unsafe_math_optimizations) + { + ix86_emit_swdivsf (operands[0], operands[1], + operands[2], V8SFmode); + DONE; + } +}) + +(define_expand "divv4df3" + [(set (match_operand:V4DF 0 "register_operand" "") + (div:V4DF (match_operand:V4DF 1 "register_operand" "") + (match_operand:V4DF 2 "nonimmediate_operand" "")))] + "TARGET_AVX" + "ix86_fixup_binary_operands_no_copy (DIV, V4DFmode, operands);") + +(define_insn "avx_div3" + [(set (match_operand:AVXMODEF2P 0 "register_operand" "=x") + (div:AVXMODEF2P + (match_operand:AVXMODEF2P 1 "register_operand" "x") + (match_operand:AVXMODEF2P 2 "nonimmediate_operand" "xm")))] + "AVX_VEC_FLOAT_MODE_P (mode)" + "vdiv\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "ssediv") + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + +(define_expand "divv4sf3" + [(set (match_operand:V4SF 0 "register_operand" "") + (div:V4SF (match_operand:V4SF 1 "register_operand" "") + (match_operand:V4SF 2 "nonimmediate_operand" "")))] + "TARGET_SSE" +{ + if (TARGET_SSE_MATH && TARGET_RECIP && optimize_insn_for_speed_p () + && flag_finite_math_only && !flag_trapping_math + && flag_unsafe_math_optimizations) + { + ix86_emit_swdivsf (operands[0], operands[1], + operands[2], V4SFmode); + DONE; + } +}) + +(define_expand "divv2df3" + [(set (match_operand:V2DF 0 "register_operand" "") + (div:V2DF (match_operand:V2DF 1 "register_operand" "") + (match_operand:V2DF 2 "nonimmediate_operand" "")))] + "TARGET_SSE2") + +(define_insn "*avx_div3" + [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x") + (div:SSEMODEF2P + (match_operand:SSEMODEF2P 1 "register_operand" "x") + (match_operand:SSEMODEF2P 2 "nonimmediate_operand" "xm")))] + "AVX128_VEC_FLOAT_MODE_P (mode)" + "vdiv\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "ssediv") + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + +(define_insn "_div3" + [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x") + (div:SSEMODEF2P + (match_operand:SSEMODEF2P 1 "register_operand" "0") + (match_operand:SSEMODEF2P 2 "nonimmediate_operand" 
"xm")))] + "SSE_VEC_FLOAT_MODE_P (mode)" + "div\t{%2, %0|%0, %2}" + [(set_attr "type" "ssediv") + (set_attr "mode" "")]) + +(define_insn "*avx_vmdiv3" + [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x") + (vec_merge:SSEMODEF2P + (div:SSEMODEF2P + (match_operand:SSEMODEF2P 1 "register_operand" "x") + (match_operand:SSEMODEF2P 2 "nonimmediate_operand" "xm")) + (match_dup 1) + (const_int 1)))] + "AVX128_VEC_FLOAT_MODE_P (mode)" + "vdiv\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "ssediv") + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + +(define_insn "_vmdiv3" + [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x") + (vec_merge:SSEMODEF2P + (div:SSEMODEF2P + (match_operand:SSEMODEF2P 1 "register_operand" "0") + (match_operand:SSEMODEF2P 2 "nonimmediate_operand" "xm")) + (match_dup 1) + (const_int 1)))] + "SSE_VEC_FLOAT_MODE_P (mode)" + "div\t{%2, %0|%0, %2}" + [(set_attr "type" "ssediv") + (set_attr "mode" "")]) + +(define_insn "avx_rcpv8sf2" + [(set (match_operand:V8SF 0 "register_operand" "=x") + (unspec:V8SF + [(match_operand:V8SF 1 "nonimmediate_operand" "xm")] UNSPEC_RCP))] + "TARGET_AVX" + "vrcpps\t{%1, %0|%0, %1}" + [(set_attr "type" "sse") + (set_attr "prefix" "vex") + (set_attr "mode" "V8SF")]) + +(define_insn "sse_rcpv4sf2" + [(set (match_operand:V4SF 0 "register_operand" "=x") + (unspec:V4SF + [(match_operand:V4SF 1 "nonimmediate_operand" "xm")] UNSPEC_RCP))] + "TARGET_SSE" + "%vrcpps\t{%1, %0|%0, %1}" + [(set_attr "type" "sse") + (set_attr "atom_sse_attr" "rcp") + (set_attr "prefix" "maybe_vex") + (set_attr "mode" "V4SF")]) + +(define_insn "*avx_vmrcpv4sf2" + [(set (match_operand:V4SF 0 "register_operand" "=x") + (vec_merge:V4SF + (unspec:V4SF [(match_operand:V4SF 1 "nonimmediate_operand" "xm")] + UNSPEC_RCP) + (match_operand:V4SF 2 "register_operand" "x") + (const_int 1)))] + "TARGET_AVX" + "vrcpss\t{%1, %2, %0|%0, %2, %1}" + [(set_attr "type" "sse") + (set_attr "prefix" "vex") + (set_attr "mode" "SF")]) + +(define_insn "sse_vmrcpv4sf2" + [(set (match_operand:V4SF 0 "register_operand" "=x") + (vec_merge:V4SF + (unspec:V4SF [(match_operand:V4SF 1 "nonimmediate_operand" "xm")] + UNSPEC_RCP) + (match_operand:V4SF 2 "register_operand" "0") + (const_int 1)))] + "TARGET_SSE" + "rcpss\t{%1, %0|%0, %1}" + [(set_attr "type" "sse") + (set_attr "atom_sse_attr" "rcp") + (set_attr "mode" "SF")]) + +(define_expand "sqrtv8sf2" + [(set (match_operand:V8SF 0 "register_operand" "") + (sqrt:V8SF (match_operand:V8SF 1 "nonimmediate_operand" "")))] + "TARGET_AVX" +{ + if (TARGET_SSE_MATH && TARGET_RECIP && !optimize_insn_for_size_p () + && flag_finite_math_only && !flag_trapping_math + && flag_unsafe_math_optimizations) + { + ix86_emit_swsqrtsf (operands[0], operands[1], V8SFmode, 0); + DONE; + } +}) + +(define_insn "avx_sqrtv8sf2" + [(set (match_operand:V8SF 0 "register_operand" "=x") + (sqrt:V8SF (match_operand:V8SF 1 "nonimmediate_operand" "xm")))] + "TARGET_AVX" + "vsqrtps\t{%1, %0|%0, %1}" + [(set_attr "type" "sse") + (set_attr "prefix" "vex") + (set_attr "mode" "V8SF")]) + +(define_expand "sqrtv4sf2" + [(set (match_operand:V4SF 0 "register_operand" "") + (sqrt:V4SF (match_operand:V4SF 1 "nonimmediate_operand" "")))] + "TARGET_SSE" +{ + if (TARGET_SSE_MATH && TARGET_RECIP && optimize_insn_for_speed_p () + && flag_finite_math_only && !flag_trapping_math + && flag_unsafe_math_optimizations) + { + ix86_emit_swsqrtsf (operands[0], operands[1], V4SFmode, 0); + DONE; + } +}) + +(define_insn "sse_sqrtv4sf2" + [(set (match_operand:V4SF 0 "register_operand" "=x") + (sqrt:V4SF 
(match_operand:V4SF 1 "nonimmediate_operand" "xm")))] + "TARGET_SSE" + "%vsqrtps\t{%1, %0|%0, %1}" + [(set_attr "type" "sse") + (set_attr "atom_sse_attr" "sqrt") + (set_attr "prefix" "maybe_vex") + (set_attr "mode" "V4SF")]) + +(define_insn "sqrtv4df2" + [(set (match_operand:V4DF 0 "register_operand" "=x") + (sqrt:V4DF (match_operand:V4DF 1 "nonimmediate_operand" "xm")))] + "TARGET_AVX" + "vsqrtpd\t{%1, %0|%0, %1}" + [(set_attr "type" "sse") + (set_attr "prefix" "vex") + (set_attr "mode" "V4DF")]) + +(define_insn "sqrtv2df2" + [(set (match_operand:V2DF 0 "register_operand" "=x") + (sqrt:V2DF (match_operand:V2DF 1 "nonimmediate_operand" "xm")))] + "TARGET_SSE2" + "%vsqrtpd\t{%1, %0|%0, %1}" + [(set_attr "type" "sse") + (set_attr "prefix" "maybe_vex") + (set_attr "mode" "V2DF")]) + +(define_insn "*avx_vmsqrt2" + [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x") + (vec_merge:SSEMODEF2P + (sqrt:SSEMODEF2P + (match_operand:SSEMODEF2P 1 "nonimmediate_operand" "xm")) + (match_operand:SSEMODEF2P 2 "register_operand" "x") + (const_int 1)))] + "AVX_VEC_FLOAT_MODE_P (mode)" + "vsqrt\t{%1, %2, %0|%0, %2, %1}" + [(set_attr "type" "sse") + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + +(define_insn "_vmsqrt2" + [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x") + (vec_merge:SSEMODEF2P + (sqrt:SSEMODEF2P + (match_operand:SSEMODEF2P 1 "nonimmediate_operand" "xm")) + (match_operand:SSEMODEF2P 2 "register_operand" "0") + (const_int 1)))] + "SSE_VEC_FLOAT_MODE_P (mode)" + "sqrt\t{%1, %0|%0, %1}" + [(set_attr "type" "sse") + (set_attr "atom_sse_attr" "sqrt") + (set_attr "mode" "")]) + +(define_expand "rsqrtv8sf2" + [(set (match_operand:V8SF 0 "register_operand" "") + (unspec:V8SF + [(match_operand:V8SF 1 "nonimmediate_operand" "")] UNSPEC_RSQRT))] + "TARGET_AVX && TARGET_SSE_MATH" +{ + ix86_emit_swsqrtsf (operands[0], operands[1], V8SFmode, 1); + DONE; +}) + +(define_insn "avx_rsqrtv8sf2" + [(set (match_operand:V8SF 0 "register_operand" "=x") + (unspec:V8SF + [(match_operand:V8SF 1 "nonimmediate_operand" "xm")] UNSPEC_RSQRT))] + "TARGET_AVX" + "vrsqrtps\t{%1, %0|%0, %1}" + [(set_attr "type" "sse") + (set_attr "prefix" "vex") + (set_attr "mode" "V8SF")]) + +(define_expand "rsqrtv4sf2" + [(set (match_operand:V4SF 0 "register_operand" "") + (unspec:V4SF + [(match_operand:V4SF 1 "nonimmediate_operand" "")] UNSPEC_RSQRT))] + "TARGET_SSE_MATH" +{ + ix86_emit_swsqrtsf (operands[0], operands[1], V4SFmode, 1); + DONE; +}) + +(define_insn "sse_rsqrtv4sf2" + [(set (match_operand:V4SF 0 "register_operand" "=x") + (unspec:V4SF + [(match_operand:V4SF 1 "nonimmediate_operand" "xm")] UNSPEC_RSQRT))] + "TARGET_SSE" + "%vrsqrtps\t{%1, %0|%0, %1}" + [(set_attr "type" "sse") + (set_attr "prefix" "maybe_vex") + (set_attr "mode" "V4SF")]) + +(define_insn "*avx_vmrsqrtv4sf2" + [(set (match_operand:V4SF 0 "register_operand" "=x") + (vec_merge:V4SF + (unspec:V4SF [(match_operand:V4SF 1 "nonimmediate_operand" "xm")] + UNSPEC_RSQRT) + (match_operand:V4SF 2 "register_operand" "x") + (const_int 1)))] + "TARGET_AVX" + "vrsqrtss\t{%1, %2, %0|%0, %2, %1}" + [(set_attr "type" "sse") + (set_attr "prefix" "vex") + (set_attr "mode" "SF")]) + +(define_insn "sse_vmrsqrtv4sf2" + [(set (match_operand:V4SF 0 "register_operand" "=x") + (vec_merge:V4SF + (unspec:V4SF [(match_operand:V4SF 1 "nonimmediate_operand" "xm")] + UNSPEC_RSQRT) + (match_operand:V4SF 2 "register_operand" "0") + (const_int 1)))] + "TARGET_SSE" + "rsqrtss\t{%1, %0|%0, %1}" + [(set_attr "type" "sse") + (set_attr "mode" "SF")]) + +;; ??? 
For !flag_finite_math_only, the representation with SMIN/SMAX +;; isn't really correct, as those rtl operators aren't defined when +;; applied to NaNs. Hopefully the optimizers won't get too smart on us. + +(define_expand "3" + [(set (match_operand:AVX256MODEF2P 0 "register_operand" "") + (smaxmin:AVX256MODEF2P + (match_operand:AVX256MODEF2P 1 "nonimmediate_operand" "") + (match_operand:AVX256MODEF2P 2 "nonimmediate_operand" "")))] + "AVX256_VEC_FLOAT_MODE_P (mode)" +{ + if (!flag_finite_math_only) + operands[1] = force_reg (mode, operands[1]); + ix86_fixup_binary_operands_no_copy (, mode, operands); +}) + +(define_expand "3" + [(set (match_operand:SSEMODEF2P 0 "register_operand" "") + (smaxmin:SSEMODEF2P + (match_operand:SSEMODEF2P 1 "nonimmediate_operand" "") + (match_operand:SSEMODEF2P 2 "nonimmediate_operand" "")))] + "SSE_VEC_FLOAT_MODE_P (mode)" +{ + if (!flag_finite_math_only) + operands[1] = force_reg (mode, operands[1]); + ix86_fixup_binary_operands_no_copy (, mode, operands); +}) + +(define_insn "*avx_3_finite" + [(set (match_operand:AVXMODEF2P 0 "register_operand" "=x") + (smaxmin:AVXMODEF2P + (match_operand:AVXMODEF2P 1 "nonimmediate_operand" "%x") + (match_operand:AVXMODEF2P 2 "nonimmediate_operand" "xm")))] + "AVX_VEC_FLOAT_MODE_P (mode) && flag_finite_math_only + && ix86_binary_operator_ok (, mode, operands)" + "v\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseadd") + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + +(define_insn "*3_finite" + [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x") + (smaxmin:SSEMODEF2P + (match_operand:SSEMODEF2P 1 "nonimmediate_operand" "%0") + (match_operand:SSEMODEF2P 2 "nonimmediate_operand" "xm")))] + "SSE_VEC_FLOAT_MODE_P (mode) && flag_finite_math_only + && ix86_binary_operator_ok (, mode, operands)" + "\t{%2, %0|%0, %2}" + [(set_attr "type" "sseadd") + (set_attr "mode" "")]) + +(define_insn "*avx_3" + [(set (match_operand:AVXMODEF2P 0 "register_operand" "=x") + (smaxmin:AVXMODEF2P + (match_operand:AVXMODEF2P 1 "nonimmediate_operand" "%x") + (match_operand:AVXMODEF2P 2 "nonimmediate_operand" "xm")))] + "AVX_VEC_FLOAT_MODE_P (mode)" + "v\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseadd") + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + +(define_insn "*3" + [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x") + (smaxmin:SSEMODEF2P + (match_operand:SSEMODEF2P 1 "register_operand" "0") + (match_operand:SSEMODEF2P 2 "nonimmediate_operand" "xm")))] + "SSE_VEC_FLOAT_MODE_P (mode)" + "\t{%2, %0|%0, %2}" + [(set_attr "type" "sseadd") + (set_attr "mode" "")]) + +(define_insn "*avx_vm3" + [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x") + (vec_merge:SSEMODEF2P + (smaxmin:SSEMODEF2P + (match_operand:SSEMODEF2P 1 "register_operand" "x") + (match_operand:SSEMODEF2P 2 "nonimmediate_operand" "xm")) + (match_dup 1) + (const_int 1)))] + "AVX128_VEC_FLOAT_MODE_P (mode)" + "v\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sse") + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + +(define_insn "_vm3" + [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x") + (vec_merge:SSEMODEF2P + (smaxmin:SSEMODEF2P + (match_operand:SSEMODEF2P 1 "register_operand" "0") + (match_operand:SSEMODEF2P 2 "nonimmediate_operand" "xm")) + (match_dup 1) + (const_int 1)))] + "SSE_VEC_FLOAT_MODE_P (mode)" + "\t{%2, %0|%0, %2}" + [(set_attr "type" "sseadd") + (set_attr "mode" "")]) + +;; These versions of the min/max patterns implement exactly the operations +;; min = (op1 < op2 ? op1 : op2) +;; max = (!(op1 < op2) ? 
op1 : op2) +;; Their operands are not commutative, and thus they may be used in the +;; presence of -0.0 and NaN. + +(define_insn "*avx_ieee_smin3" + [(set (match_operand:AVXMODEF2P 0 "register_operand" "=x") + (unspec:AVXMODEF2P + [(match_operand:AVXMODEF2P 1 "register_operand" "x") + (match_operand:AVXMODEF2P 2 "nonimmediate_operand" "xm")] + UNSPEC_IEEE_MIN))] + "AVX_VEC_FLOAT_MODE_P (mode)" + "vmin\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseadd") + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + +(define_insn "*avx_ieee_smax3" + [(set (match_operand:AVXMODEF2P 0 "register_operand" "=x") + (unspec:AVXMODEF2P + [(match_operand:AVXMODEF2P 1 "register_operand" "x") + (match_operand:AVXMODEF2P 2 "nonimmediate_operand" "xm")] + UNSPEC_IEEE_MAX))] + "AVX_VEC_FLOAT_MODE_P (mode)" + "vmax\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseadd") + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + +(define_insn "*ieee_smin3" + [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x") + (unspec:SSEMODEF2P + [(match_operand:SSEMODEF2P 1 "register_operand" "0") + (match_operand:SSEMODEF2P 2 "nonimmediate_operand" "xm")] + UNSPEC_IEEE_MIN))] + "SSE_VEC_FLOAT_MODE_P (mode)" + "min\t{%2, %0|%0, %2}" + [(set_attr "type" "sseadd") + (set_attr "mode" "")]) + +(define_insn "*ieee_smax3" + [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x") + (unspec:SSEMODEF2P + [(match_operand:SSEMODEF2P 1 "register_operand" "0") + (match_operand:SSEMODEF2P 2 "nonimmediate_operand" "xm")] + UNSPEC_IEEE_MAX))] + "SSE_VEC_FLOAT_MODE_P (mode)" + "max\t{%2, %0|%0, %2}" + [(set_attr "type" "sseadd") + (set_attr "mode" "")]) + +(define_insn "avx_addsubv8sf3" + [(set (match_operand:V8SF 0 "register_operand" "=x") + (vec_merge:V8SF + (plus:V8SF + (match_operand:V8SF 1 "register_operand" "x") + (match_operand:V8SF 2 "nonimmediate_operand" "xm")) + (minus:V8SF (match_dup 1) (match_dup 2)) + (const_int 170)))] + "TARGET_AVX" + "vaddsubps\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseadd") + (set_attr "prefix" "vex") + (set_attr "mode" "V8SF")]) + +(define_insn "avx_addsubv4df3" + [(set (match_operand:V4DF 0 "register_operand" "=x") + (vec_merge:V4DF + (plus:V4DF + (match_operand:V4DF 1 "register_operand" "x") + (match_operand:V4DF 2 "nonimmediate_operand" "xm")) + (minus:V4DF (match_dup 1) (match_dup 2)) + (const_int 10)))] + "TARGET_AVX" + "vaddsubpd\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseadd") + (set_attr "prefix" "vex") + (set_attr "mode" "V4DF")]) + +(define_insn "*avx_addsubv4sf3" + [(set (match_operand:V4SF 0 "register_operand" "=x") + (vec_merge:V4SF + (plus:V4SF + (match_operand:V4SF 1 "register_operand" "x") + (match_operand:V4SF 2 "nonimmediate_operand" "xm")) + (minus:V4SF (match_dup 1) (match_dup 2)) + (const_int 10)))] + "TARGET_AVX" + "vaddsubps\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseadd") + (set_attr "prefix" "vex") + (set_attr "mode" "V4SF")]) + +(define_insn "sse3_addsubv4sf3" + [(set (match_operand:V4SF 0 "register_operand" "=x") + (vec_merge:V4SF + (plus:V4SF + (match_operand:V4SF 1 "register_operand" "0") + (match_operand:V4SF 2 "nonimmediate_operand" "xm")) + (minus:V4SF (match_dup 1) (match_dup 2)) + (const_int 10)))] + "TARGET_SSE3" + "addsubps\t{%2, %0|%0, %2}" + [(set_attr "type" "sseadd") + (set_attr "prefix_rep" "1") + (set_attr "mode" "V4SF")]) + +(define_insn "*avx_addsubv2df3" + [(set (match_operand:V2DF 0 "register_operand" "=x") + (vec_merge:V2DF + (plus:V2DF + (match_operand:V2DF 1 "register_operand" "x") + (match_operand:V2DF 2 "nonimmediate_operand" 
"xm")) + (minus:V2DF (match_dup 1) (match_dup 2)) + (const_int 2)))] + "TARGET_AVX" + "vaddsubpd\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseadd") + (set_attr "prefix" "vex") + (set_attr "mode" "V2DF")]) + +(define_insn "sse3_addsubv2df3" + [(set (match_operand:V2DF 0 "register_operand" "=x") + (vec_merge:V2DF + (plus:V2DF + (match_operand:V2DF 1 "register_operand" "0") + (match_operand:V2DF 2 "nonimmediate_operand" "xm")) + (minus:V2DF (match_dup 1) (match_dup 2)) + (const_int 2)))] + "TARGET_SSE3" + "addsubpd\t{%2, %0|%0, %2}" + [(set_attr "type" "sseadd") + (set_attr "atom_unit" "complex") + (set_attr "mode" "V2DF")]) + +(define_insn "avx_hv4df3" + [(set (match_operand:V4DF 0 "register_operand" "=x") + (vec_concat:V4DF + (vec_concat:V2DF + (plusminus:DF + (vec_select:DF + (match_operand:V4DF 1 "register_operand" "x") + (parallel [(const_int 0)])) + (vec_select:DF (match_dup 1) (parallel [(const_int 1)]))) + (plusminus:DF + (vec_select:DF + (match_operand:V4DF 2 "nonimmediate_operand" "xm") + (parallel [(const_int 0)])) + (vec_select:DF (match_dup 2) (parallel [(const_int 1)])))) + (vec_concat:V2DF + (plusminus:DF + (vec_select:DF (match_dup 1) (parallel [(const_int 2)])) + (vec_select:DF (match_dup 1) (parallel [(const_int 3)]))) + (plusminus:DF + (vec_select:DF (match_dup 2) (parallel [(const_int 2)])) + (vec_select:DF (match_dup 2) (parallel [(const_int 3)]))))))] + "TARGET_AVX" + "vhpd\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseadd") + (set_attr "prefix" "vex") + (set_attr "mode" "V4DF")]) + +(define_insn "avx_hv8sf3" + [(set (match_operand:V8SF 0 "register_operand" "=x") + (vec_concat:V8SF + (vec_concat:V4SF + (vec_concat:V2SF + (plusminus:SF + (vec_select:SF + (match_operand:V8SF 1 "register_operand" "x") + (parallel [(const_int 0)])) + (vec_select:SF (match_dup 1) (parallel [(const_int 1)]))) + (plusminus:SF + (vec_select:SF (match_dup 1) (parallel [(const_int 2)])) + (vec_select:SF (match_dup 1) (parallel [(const_int 3)])))) + (vec_concat:V2SF + (plusminus:SF + (vec_select:SF + (match_operand:V8SF 2 "nonimmediate_operand" "xm") + (parallel [(const_int 0)])) + (vec_select:SF (match_dup 2) (parallel [(const_int 1)]))) + (plusminus:SF + (vec_select:SF (match_dup 2) (parallel [(const_int 2)])) + (vec_select:SF (match_dup 2) (parallel [(const_int 3)]))))) + (vec_concat:V4SF + (vec_concat:V2SF + (plusminus:SF + (vec_select:SF (match_dup 1) (parallel [(const_int 4)])) + (vec_select:SF (match_dup 1) (parallel [(const_int 5)]))) + (plusminus:SF + (vec_select:SF (match_dup 1) (parallel [(const_int 6)])) + (vec_select:SF (match_dup 1) (parallel [(const_int 7)])))) + (vec_concat:V2SF + (plusminus:SF + (vec_select:SF (match_dup 2) (parallel [(const_int 4)])) + (vec_select:SF (match_dup 2) (parallel [(const_int 5)]))) + (plusminus:SF + (vec_select:SF (match_dup 2) (parallel [(const_int 6)])) + (vec_select:SF (match_dup 2) (parallel [(const_int 7)])))))))] + "TARGET_AVX" + "vhps\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseadd") + (set_attr "prefix" "vex") + (set_attr "mode" "V8SF")]) + +(define_insn "*avx_hv4sf3" + [(set (match_operand:V4SF 0 "register_operand" "=x") + (vec_concat:V4SF + (vec_concat:V2SF + (plusminus:SF + (vec_select:SF + (match_operand:V4SF 1 "register_operand" "x") + (parallel [(const_int 0)])) + (vec_select:SF (match_dup 1) (parallel [(const_int 1)]))) + (plusminus:SF + (vec_select:SF (match_dup 1) (parallel [(const_int 2)])) + (vec_select:SF (match_dup 1) (parallel [(const_int 3)])))) + (vec_concat:V2SF + (plusminus:SF + (vec_select:SF + 
(match_operand:V4SF 2 "nonimmediate_operand" "xm") + (parallel [(const_int 0)])) + (vec_select:SF (match_dup 2) (parallel [(const_int 1)]))) + (plusminus:SF + (vec_select:SF (match_dup 2) (parallel [(const_int 2)])) + (vec_select:SF (match_dup 2) (parallel [(const_int 3)]))))))] + "TARGET_AVX" + "vhps\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseadd") + (set_attr "prefix" "vex") + (set_attr "mode" "V4SF")]) + +(define_insn "sse3_hv4sf3" + [(set (match_operand:V4SF 0 "register_operand" "=x") + (vec_concat:V4SF + (vec_concat:V2SF + (plusminus:SF + (vec_select:SF + (match_operand:V4SF 1 "register_operand" "0") + (parallel [(const_int 0)])) + (vec_select:SF (match_dup 1) (parallel [(const_int 1)]))) + (plusminus:SF + (vec_select:SF (match_dup 1) (parallel [(const_int 2)])) + (vec_select:SF (match_dup 1) (parallel [(const_int 3)])))) + (vec_concat:V2SF + (plusminus:SF + (vec_select:SF + (match_operand:V4SF 2 "nonimmediate_operand" "xm") + (parallel [(const_int 0)])) + (vec_select:SF (match_dup 2) (parallel [(const_int 1)]))) + (plusminus:SF + (vec_select:SF (match_dup 2) (parallel [(const_int 2)])) + (vec_select:SF (match_dup 2) (parallel [(const_int 3)]))))))] + "TARGET_SSE3" + "hps\t{%2, %0|%0, %2}" + [(set_attr "type" "sseadd") + (set_attr "atom_unit" "complex") + (set_attr "prefix_rep" "1") + (set_attr "mode" "V4SF")]) + +(define_insn "*avx_hv2df3" + [(set (match_operand:V2DF 0 "register_operand" "=x") + (vec_concat:V2DF + (plusminus:DF + (vec_select:DF + (match_operand:V2DF 1 "register_operand" "x") + (parallel [(const_int 0)])) + (vec_select:DF (match_dup 1) (parallel [(const_int 1)]))) + (plusminus:DF + (vec_select:DF + (match_operand:V2DF 2 "nonimmediate_operand" "xm") + (parallel [(const_int 0)])) + (vec_select:DF (match_dup 2) (parallel [(const_int 1)])))))] + "TARGET_AVX" + "vhpd\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseadd") + (set_attr "prefix" "vex") + (set_attr "mode" "V2DF")]) + +(define_insn "sse3_hv2df3" + [(set (match_operand:V2DF 0 "register_operand" "=x") + (vec_concat:V2DF + (plusminus:DF + (vec_select:DF + (match_operand:V2DF 1 "register_operand" "0") + (parallel [(const_int 0)])) + (vec_select:DF (match_dup 1) (parallel [(const_int 1)]))) + (plusminus:DF + (vec_select:DF + (match_operand:V2DF 2 "nonimmediate_operand" "xm") + (parallel [(const_int 0)])) + (vec_select:DF (match_dup 2) (parallel [(const_int 1)])))))] + "TARGET_SSE3" + "hpd\t{%2, %0|%0, %2}" + [(set_attr "type" "sseadd") + (set_attr "mode" "V2DF")]) + +(define_expand "reduc_splus_v8sf" + [(match_operand:V8SF 0 "register_operand" "") + (match_operand:V8SF 1 "register_operand" "")] + "TARGET_AVX" +{ + rtx tmp = gen_reg_rtx (V8SFmode); + rtx tmp2 = gen_reg_rtx (V8SFmode); + emit_insn (gen_avx_haddv8sf3 (tmp, operands[1], operands[1])); + emit_insn (gen_avx_haddv8sf3 (tmp2, tmp, tmp)); + emit_insn (gen_avx_vperm2f128v8sf3 (tmp, tmp2, tmp2, GEN_INT (1))); + emit_insn (gen_addv8sf3 (operands[0], tmp, tmp2)); + DONE; +}) + +(define_expand "reduc_splus_v4sf" + [(match_operand:V4SF 0 "register_operand" "") + (match_operand:V4SF 1 "register_operand" "")] + "TARGET_SSE" +{ + if (TARGET_SSE3) + { + rtx tmp = gen_reg_rtx (V4SFmode); + emit_insn (gen_sse3_haddv4sf3 (tmp, operands[1], operands[1])); + emit_insn (gen_sse3_haddv4sf3 (operands[0], tmp, tmp)); + } + else + ix86_expand_reduc_v4sf (gen_addv4sf3, operands[0], operands[1]); + DONE; +}) + +(define_expand "reduc_splus_v4df" + [(match_operand:V4DF 0 "register_operand" "") + (match_operand:V4DF 1 "register_operand" "")] + "TARGET_AVX" +{ + rtx tmp = 
gen_reg_rtx (V4DFmode); + rtx tmp2 = gen_reg_rtx (V4DFmode); + emit_insn (gen_avx_haddv4df3 (tmp, operands[1], operands[1])); + emit_insn (gen_avx_vperm2f128v4df3 (tmp2, tmp, tmp, GEN_INT (1))); + emit_insn (gen_addv4df3 (operands[0], tmp, tmp2)); + DONE; +}) + +(define_expand "reduc_splus_v2df" + [(match_operand:V2DF 0 "register_operand" "") + (match_operand:V2DF 1 "register_operand" "")] + "TARGET_SSE3" +{ + emit_insn (gen_sse3_haddv2df3 (operands[0], operands[1], operands[1])); + DONE; +}) + +(define_expand "reduc_smax_v4sf" + [(match_operand:V4SF 0 "register_operand" "") + (match_operand:V4SF 1 "register_operand" "")] + "TARGET_SSE" +{ + ix86_expand_reduc_v4sf (gen_smaxv4sf3, operands[0], operands[1]); + DONE; +}) + +(define_expand "reduc_smin_v4sf" + [(match_operand:V4SF 0 "register_operand" "") + (match_operand:V4SF 1 "register_operand" "")] + "TARGET_SSE" +{ + ix86_expand_reduc_v4sf (gen_sminv4sf3, operands[0], operands[1]); + DONE; +}) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; +;; Parallel floating point comparisons +;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(define_insn "avx_cmp3" + [(set (match_operand:AVXMODEF2P 0 "register_operand" "=x") + (unspec:AVXMODEF2P + [(match_operand:AVXMODEF2P 1 "register_operand" "x") + (match_operand:AVXMODEF2P 2 "nonimmediate_operand" "xm") + (match_operand:SI 3 "const_0_to_31_operand" "n")] + UNSPEC_PCMP))] + "TARGET_AVX" + "vcmp\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssecmp") + (set_attr "length_immediate" "1") + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + +(define_insn "avx_cmp3" + [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x") + (vec_merge:SSEMODEF2P + (unspec:SSEMODEF2P + [(match_operand:SSEMODEF2P 1 "register_operand" "x") + (match_operand:SSEMODEF2P 2 "nonimmediate_operand" "xm") + (match_operand:SI 3 "const_0_to_31_operand" "n")] + UNSPEC_PCMP) + (match_dup 1) + (const_int 1)))] + "TARGET_AVX" + "vcmp\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssecmp") + (set_attr "length_immediate" "1") + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + +;; We don't promote 128bit vector compare intrinsics. But vectorizer +;; may generate 256bit vector compare instructions. 
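The avx_cmp patterns above take a five-bit immediate (const_0_to_31_operand) that selects one of the 32 vcmpps/vcmppd predicates. At the source level that immediate comes straight from the AVX compare intrinsics; a minimal sketch of the 256-bit form (assuming the usual _CMP_* predicate macros from <immintrin.h> and a build with -mavx):

#include <immintrin.h>
#include <stdio.h>

int
main (void)
{
  __m256 a = _mm256_set1_ps (1.0f);
  __m256 b = _mm256_set1_ps (2.0f);
  /* The predicate macro supplies the const_0_to_31 immediate (operand 3
     of the pattern); the result is a per-element all-ones/all-zeros mask.  */
  __m256 m = _mm256_cmp_ps (a, b, _CMP_LT_OS);
  printf ("%#x\n", _mm256_movemask_ps (m));  /* 0xff: 1.0 < 2.0 in every element */
  return 0;
}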
+(define_insn "*avx_maskcmp3" + [(set (match_operand:AVXMODEF2P 0 "register_operand" "=x") + (match_operator:AVXMODEF2P 3 "avx_comparison_float_operator" + [(match_operand:AVXMODEF2P 1 "register_operand" "x") + (match_operand:AVXMODEF2P 2 "nonimmediate_operand" "xm")]))] + "AVX_VEC_FLOAT_MODE_P (mode)" + "vcmp%D3\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "ssecmp") + (set_attr "prefix" "vex") + (set_attr "length_immediate" "1") + (set_attr "mode" "")]) + +(define_insn "_maskcmp3" + [(set (match_operand:SSEMODEF4 0 "register_operand" "=x") + (match_operator:SSEMODEF4 3 "sse_comparison_operator" + [(match_operand:SSEMODEF4 1 "register_operand" "0") + (match_operand:SSEMODEF4 2 "nonimmediate_operand" "xm")]))] + "!TARGET_XOP + && (SSE_FLOAT_MODE_P (mode) || SSE_VEC_FLOAT_MODE_P (mode))" + "cmp%D3\t{%2, %0|%0, %2}" + [(set_attr "type" "ssecmp") + (set_attr "length_immediate" "1") + (set_attr "mode" "")]) + +(define_insn "*avx_vmmaskcmp3" + [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x") + (vec_merge:SSEMODEF2P + (match_operator:SSEMODEF2P 3 "sse_comparison_operator" + [(match_operand:SSEMODEF2P 1 "register_operand" "x") + (match_operand:SSEMODEF2P 2 "nonimmediate_operand" "xm")]) + (match_dup 1) + (const_int 1)))] + "AVX_VEC_FLOAT_MODE_P (mode)" + "vcmp%D3\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "ssecmp") + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + +(define_insn "_vmmaskcmp3" + [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x") + (vec_merge:SSEMODEF2P + (match_operator:SSEMODEF2P 3 "sse_comparison_operator" + [(match_operand:SSEMODEF2P 1 "register_operand" "0") + (match_operand:SSEMODEF2P 2 "nonimmediate_operand" "xm")]) + (match_dup 1) + (const_int 1)))] + "SSE_VEC_FLOAT_MODE_P (mode)" + "cmp%D3\t{%2, %0|%0, %2}" + [(set_attr "type" "ssecmp") + (set_attr "length_immediate" "1") + (set_attr "mode" "")]) + +(define_insn "_comi" + [(set (reg:CCFP FLAGS_REG) + (compare:CCFP + (vec_select:MODEF + (match_operand: 0 "register_operand" "x") + (parallel [(const_int 0)])) + (vec_select:MODEF + (match_operand: 1 "nonimmediate_operand" "xm") + (parallel [(const_int 0)]))))] + "SSE_FLOAT_MODE_P (mode)" + "%vcomis\t{%1, %0|%0, %1}" + [(set_attr "type" "ssecomi") + (set_attr "prefix" "maybe_vex") + (set_attr "prefix_rep" "0") + (set (attr "prefix_data16") + (if_then_else (eq_attr "mode" "DF") + (const_string "1") + (const_string "0"))) + (set_attr "mode" "")]) + +(define_insn "_ucomi" + [(set (reg:CCFPU FLAGS_REG) + (compare:CCFPU + (vec_select:MODEF + (match_operand: 0 "register_operand" "x") + (parallel [(const_int 0)])) + (vec_select:MODEF + (match_operand: 1 "nonimmediate_operand" "xm") + (parallel [(const_int 0)]))))] + "SSE_FLOAT_MODE_P (mode)" + "%vucomis\t{%1, %0|%0, %1}" + [(set_attr "type" "ssecomi") + (set_attr "prefix" "maybe_vex") + (set_attr "prefix_rep" "0") + (set (attr "prefix_data16") + (if_then_else (eq_attr "mode" "DF") + (const_string "1") + (const_string "0"))) + (set_attr "mode" "")]) + +(define_expand "vcond" + [(set (match_operand:AVXMODEF2P 0 "register_operand" "") + (if_then_else:AVXMODEF2P + (match_operator 3 "" + [(match_operand:AVXMODEF2P 4 "nonimmediate_operand" "") + (match_operand:AVXMODEF2P 5 "nonimmediate_operand" "")]) + (match_operand:AVXMODEF2P 1 "general_operand" "") + (match_operand:AVXMODEF2P 2 "general_operand" "")))] + "(SSE_VEC_FLOAT_MODE_P (mode) + || AVX_VEC_FLOAT_MODE_P (mode))" +{ + bool ok = ix86_expand_fp_vcond (operands); + gcc_assert (ok); + DONE; +}) + 
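The vcond expander above defers to ix86_expand_fp_vcond. On SSE targets without a blend instruction, the classic lowering of (a < b) ? c : d is a compare that yields an all-ones/all-zeros mask followed by and/andnot/or, which is the shape the maskcmp patterns in this section provide. A hedged intrinsics sketch of that lowering (illustrative only, not the exact sequence ix86_expand_fp_vcond emits):

#include <xmmintrin.h>

/* (a < b) ? c : d, element-wise, using only SSE1 operations.  */
static __m128
select_lt (__m128 a, __m128 b, __m128 c, __m128 d)
{
  __m128 mask = _mm_cmplt_ps (a, b);      /* cmpltps: all-ones where a < b  */
  __m128 t = _mm_and_ps (mask, c);        /* keep c where the mask is set   */
  __m128 f = _mm_andnot_ps (mask, d);     /* keep d where the mask is clear */
  return _mm_or_ps (t, f);                /* merge the two halves           */
}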
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; +;; Parallel floating point logical operations +;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(define_insn "avx_andnot3" + [(set (match_operand:AVXMODEF2P 0 "register_operand" "=x") + (and:AVXMODEF2P + (not:AVXMODEF2P + (match_operand:AVXMODEF2P 1 "register_operand" "x")) + (match_operand:AVXMODEF2P 2 "nonimmediate_operand" "xm")))] + "AVX_VEC_FLOAT_MODE_P (mode)" + "vandn\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + +(define_insn "_andnot3" + [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x") + (and:SSEMODEF2P + (not:SSEMODEF2P + (match_operand:SSEMODEF2P 1 "register_operand" "0")) + (match_operand:SSEMODEF2P 2 "nonimmediate_operand" "xm")))] + "SSE_VEC_FLOAT_MODE_P (mode)" + "andn\t{%2, %0|%0, %2}" + [(set_attr "type" "sselog") + (set_attr "mode" "")]) + +(define_expand "3" + [(set (match_operand:AVX256MODEF2P 0 "register_operand" "") + (any_logic:AVX256MODEF2P + (match_operand:AVX256MODEF2P 1 "nonimmediate_operand" "") + (match_operand:AVX256MODEF2P 2 "nonimmediate_operand" "")))] + "AVX256_VEC_FLOAT_MODE_P (mode)" + "ix86_fixup_binary_operands_no_copy (, mode, operands);") + +(define_insn "*avx_3" + [(set (match_operand:AVXMODEF2P 0 "register_operand" "=x") + (any_logic:AVXMODEF2P + (match_operand:AVXMODEF2P 1 "nonimmediate_operand" "%x") + (match_operand:AVXMODEF2P 2 "nonimmediate_operand" "xm")))] + "AVX_VEC_FLOAT_MODE_P (mode) + && ix86_binary_operator_ok (, mode, operands)" +{ + if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL) + return "vps\t{%2, %1, %0|%0, %1, %2}"; + else + return "v\t{%2, %1, %0|%0, %1, %2}"; +} + [(set_attr "type" "sselog") + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + +(define_expand "3" + [(set (match_operand:SSEMODEF2P 0 "register_operand" "") + (any_logic:SSEMODEF2P + (match_operand:SSEMODEF2P 1 "nonimmediate_operand" "") + (match_operand:SSEMODEF2P 2 "nonimmediate_operand" "")))] + "SSE_VEC_FLOAT_MODE_P (mode)" + "ix86_fixup_binary_operands_no_copy (, mode, operands);") + +(define_insn "*3" + [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x") + (any_logic:SSEMODEF2P + (match_operand:SSEMODEF2P 1 "nonimmediate_operand" "%0") + (match_operand:SSEMODEF2P 2 "nonimmediate_operand" "xm")))] + "SSE_VEC_FLOAT_MODE_P (mode) + && ix86_binary_operator_ok (, mode, operands)" +{ + if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL) + return "ps\t{%2, %0|%0, %2}"; + else + return "\t{%2, %0|%0, %2}"; +} + [(set_attr "type" "sselog") + (set_attr "mode" "")]) + +(define_expand "copysign3" + [(set (match_dup 4) + (and:VEC_FLOAT_MODE + (not:VEC_FLOAT_MODE (match_dup 3)) + (match_operand:VEC_FLOAT_MODE 1 "nonimmediate_operand" ""))) + (set (match_dup 5) + (and:VEC_FLOAT_MODE (match_dup 3) + (match_operand:VEC_FLOAT_MODE 2 "nonimmediate_operand" ""))) + (set (match_operand:VEC_FLOAT_MODE 0 "register_operand" "") + (ior:VEC_FLOAT_MODE (match_dup 4) (match_dup 5)))] + "" +{ + operands[3] = ix86_build_signbit_mask (mode, 1, 0); + + operands[4] = gen_reg_rtx (mode); + operands[5] = gen_reg_rtx (mode); +}) + +;; Also define scalar versions. These are used for abs, neg, and +;; conditional move. Using subregs into vector modes causes register +;; allocation lossage. These patterns do not allow memory operands +;; because the native instructions read the full 128-bits. 
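The scalar logical patterns that follow reuse the same and/andnot/ior building blocks that the copysign expander above strings together: the result is (op1 & ~signmask) | (op2 & signmask), with ix86_build_signbit_mask supplying the constant. The same trick written out in scalar C for single precision (an illustration only; copysign_bits is a made-up name, not a GCC helper):

#include <stdint.h>
#include <string.h>
#include <stdio.h>

/* copysign for float, done with the same mask arithmetic as the expander:
   clear the sign bit of x, take only the sign bit of y, OR the two.  */
static float
copysign_bits (float x, float y)
{
  uint32_t xb, yb, sign = UINT32_C (0x80000000);
  memcpy (&xb, &x, sizeof xb);
  memcpy (&yb, &y, sizeof yb);
  uint32_t rb = (xb & ~sign) | (yb & sign);
  float r;
  memcpy (&r, &rb, sizeof r);
  return r;
}

int
main (void)
{
  printf ("%f\n", copysign_bits (3.0f, -1.0f));   /* -3.000000 */
  return 0;
}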
+ +(define_insn "*avx_andnot3" + [(set (match_operand:MODEF 0 "register_operand" "=x") + (and:MODEF + (not:MODEF + (match_operand:MODEF 1 "register_operand" "x")) + (match_operand:MODEF 2 "register_operand" "x")))] + "AVX_FLOAT_MODE_P (mode)" + "vandnp\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + +(define_insn "*andnot3" + [(set (match_operand:MODEF 0 "register_operand" "=x") + (and:MODEF + (not:MODEF + (match_operand:MODEF 1 "register_operand" "0")) + (match_operand:MODEF 2 "register_operand" "x")))] + "SSE_FLOAT_MODE_P (mode)" + "andnp\t{%2, %0|%0, %2}" + [(set_attr "type" "sselog") + (set_attr "mode" "")]) + +(define_insn "*avx_3" + [(set (match_operand:MODEF 0 "register_operand" "=x") + (any_logic:MODEF + (match_operand:MODEF 1 "register_operand" "x") + (match_operand:MODEF 2 "register_operand" "x")))] + "AVX_FLOAT_MODE_P (mode)" +{ + if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL) + return "vps\t{%2, %1, %0|%0, %1, %2}"; + else + return "vp\t{%2, %1, %0|%0, %1, %2}"; +} + [(set_attr "type" "sselog") + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + +(define_insn "*3" + [(set (match_operand:MODEF 0 "register_operand" "=x") + (any_logic:MODEF + (match_operand:MODEF 1 "register_operand" "0") + (match_operand:MODEF 2 "register_operand" "x")))] + "SSE_FLOAT_MODE_P (mode)" +{ + if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL) + return "ps\t{%2, %0|%0, %2}"; + else + return "p\t{%2, %0|%0, %2}"; +} + [(set_attr "type" "sselog") + (set_attr "mode" "")]) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; +;; FMA4 floating point multiply/accumulate instructions. This +;; includes the scalar version of the instructions as well as the +;; vector. +;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; In order to match (*a * *b) + *c, particularly when vectorizing, allow +;; combine to generate a multiply/add with two memory references. We then +;; split this insn, into loading up the destination register with one of the +;; memory operations. If we don't manage to split the insn, reload will +;; generate the appropriate moves. The reason this is needed, is that combine +;; has already folded one of the memory references into both the multiply and +;; add insns, and it can't generate a new pseudo. I.e.: +;; (set (reg1) (mem (addr1))) +;; (set (reg2) (mult (reg1) (mem (addr2)))) +;; (set (reg3) (plus (reg2) (mem (addr3)))) +;; +;; ??? This is historic, pre-dating the gimple fma transformation. +;; We could now properly represent that only one memory operand is +;; allowed and not be penalized during optimization. + +;; Intrinsic FMA operations. + +;; The standard names for fma is only available with SSE math enabled. 
+(define_expand "fma4" + [(set (match_operand:FMAMODE 0 "register_operand") + (fma:FMAMODE + (match_operand:FMAMODE 1 "nonimmediate_operand") + (match_operand:FMAMODE 2 "nonimmediate_operand") + (match_operand:FMAMODE 3 "nonimmediate_operand")))] + "(TARGET_FMA || TARGET_FMA4) && TARGET_SSE_MATH" + "") + +(define_expand "fms4" + [(set (match_operand:FMAMODE 0 "register_operand") + (fma:FMAMODE + (match_operand:FMAMODE 1 "nonimmediate_operand") + (match_operand:FMAMODE 2 "nonimmediate_operand") + (neg:FMAMODE (match_operand:FMAMODE 3 "nonimmediate_operand"))))] + "(TARGET_FMA || TARGET_FMA4) && TARGET_SSE_MATH" + "") + +(define_expand "fnma4" + [(set (match_operand:FMAMODE 0 "register_operand") + (fma:FMAMODE + (neg:FMAMODE (match_operand:FMAMODE 1 "nonimmediate_operand")) + (match_operand:FMAMODE 2 "nonimmediate_operand") + (match_operand:FMAMODE 3 "nonimmediate_operand")))] + "(TARGET_FMA || TARGET_FMA4) && TARGET_SSE_MATH" + "") + +(define_expand "fnms4" + [(set (match_operand:FMAMODE 0 "register_operand") + (fma:FMAMODE + (neg:FMAMODE (match_operand:FMAMODE 1 "nonimmediate_operand")) + (match_operand:FMAMODE 2 "nonimmediate_operand") + (neg:FMAMODE (match_operand:FMAMODE 3 "nonimmediate_operand"))))] + "(TARGET_FMA || TARGET_FMA4) && TARGET_SSE_MATH" + "") + +;; The builtin for fma4intrin.h is not constrained by SSE math enabled. +(define_expand "fma4i_fmadd_" + [(set (match_operand:FMAMODE 0 "register_operand") + (fma:FMAMODE + (match_operand:FMAMODE 1 "nonimmediate_operand") + (match_operand:FMAMODE 2 "nonimmediate_operand") + (match_operand:FMAMODE 3 "nonimmediate_operand")))] + "TARGET_FMA || TARGET_FMA4" + "") + +(define_insn "*fma4i_fmadd_" + [(set (match_operand:FMAMODE 0 "register_operand" "=x,x") + (fma:FMAMODE + (match_operand:FMAMODE 1 "nonimmediate_operand" "%x,x") + (match_operand:FMAMODE 2 "nonimmediate_operand" " x,m") + (match_operand:FMAMODE 3 "nonimmediate_operand" "xm,x")))] + "TARGET_FMA4" + "vfmadd\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "")]) + +(define_insn "*fma4i_fmsub_" + [(set (match_operand:FMAMODE 0 "register_operand" "=x,x") + (fma:FMAMODE + (match_operand:FMAMODE 1 "nonimmediate_operand" "%x,x") + (match_operand:FMAMODE 2 "nonimmediate_operand" " x,m") + (neg:FMAMODE + (match_operand:FMAMODE 3 "nonimmediate_operand" "xm,x"))))] + "TARGET_FMA4" + "vfmsub\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "")]) + +(define_insn "*fma4i_fnmadd_" + [(set (match_operand:FMAMODE 0 "register_operand" "=x,x") + (fma:FMAMODE + (neg:FMAMODE + (match_operand:FMAMODE 1 "nonimmediate_operand" "%x,x")) + (match_operand:FMAMODE 2 "nonimmediate_operand" " x,m") + (match_operand:FMAMODE 3 "nonimmediate_operand" "xm,x")))] + "TARGET_FMA4" + "vfnmadd\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "")]) + +(define_insn "*fma4i_fnmsub_" + [(set (match_operand:FMAMODE 0 "register_operand" "=x,x") + (fma:FMAMODE + (neg:FMAMODE + (match_operand:FMAMODE 1 "nonimmediate_operand" "%x,x")) + (match_operand:FMAMODE 2 "nonimmediate_operand" " x,m") + (neg:FMAMODE + (match_operand:FMAMODE 3 "nonimmediate_operand" "xm,x"))))] + "TARGET_FMA4" + "vfnmsub\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "")]) + +;; Scalar versions of the above. Unlike ADDSS et al, these write the +;; entire destination register, with the high-order elements zeroed. 
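In the scalar (vm) FMA4 patterns that follow, the vec_merge against a const0_operand is what encodes the "high-order elements zeroed" behaviour described in the comment above. For the V4SF case, the element-wise effect looks like this (a reference sketch of the semantics, not GCC code):

#include <math.h>

/* Reference semantics of the scalar FMA4 patterns for V4SF:
   element 0 is the fused multiply-add, elements 1..3 are zeroed,
   matching the vec_merge against a zero vector in the RTL.  */
static void
fma4_scalar_ref (const float a[4], const float b[4],
                 const float c[4], float out[4])
{
  out[0] = fmaf (a[0], b[0], c[0]);
  out[1] = out[2] = out[3] = 0.0f;
}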
+ +(define_expand "fma4i_vmfmadd_" + [(set (match_operand:SSEMODEF2P 0 "register_operand") + (vec_merge:SSEMODEF2P + (fma:SSEMODEF2P + (match_operand:SSEMODEF2P 1 "nonimmediate_operand") + (match_operand:SSEMODEF2P 2 "nonimmediate_operand") + (match_operand:SSEMODEF2P 3 "nonimmediate_operand")) + (match_dup 4) + (const_int 1)))] + "TARGET_FMA4" +{ + operands[4] = CONST0_RTX (mode); +}) + +(define_insn "*fma4i_vmfmadd_" + [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x,x") + (vec_merge:SSEMODEF2P + (fma:SSEMODEF2P + (match_operand:SSEMODEF2P 1 "nonimmediate_operand" "%x,x") + (match_operand:SSEMODEF2P 2 "nonimmediate_operand" " x,m") + (match_operand:SSEMODEF2P 3 "nonimmediate_operand" "xm,x")) + (match_operand:SSEMODEF2P 4 "const0_operand" "") + (const_int 1)))] + "TARGET_FMA4" + "vfmadd\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "")]) + +(define_insn "*fma4i_vmfmsub_" + [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x,x") + (vec_merge:SSEMODEF2P + (fma:SSEMODEF2P + (match_operand:SSEMODEF2P 1 "nonimmediate_operand" "%x,x") + (match_operand:SSEMODEF2P 2 "nonimmediate_operand" " x,m") + (neg:SSEMODEF2P + (match_operand:SSEMODEF2P 3 "nonimmediate_operand" "xm,x"))) + (match_operand:SSEMODEF2P 4 "const0_operand" "") + (const_int 1)))] + "TARGET_FMA4" + "vfmsub\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "")]) + +(define_insn "*fma4i_vmfnmadd_" + [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x,x") + (vec_merge:SSEMODEF2P + (fma:SSEMODEF2P + (neg:SSEMODEF2P + (match_operand:SSEMODEF2P 1 "nonimmediate_operand" "%x,x")) + (match_operand:SSEMODEF2P 2 "nonimmediate_operand" " x,m") + (match_operand:SSEMODEF2P 3 "nonimmediate_operand" "xm,x")) + (match_operand:SSEMODEF2P 4 "const0_operand" "") + (const_int 1)))] + "TARGET_FMA4" + "vfnmadd\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "")]) + +(define_insn "*fma4i_vmfnmsub_" + [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x,x") + (vec_merge:SSEMODEF2P + (fma:SSEMODEF2P + (neg:SSEMODEF2P + (match_operand:SSEMODEF2P 1 "nonimmediate_operand" "%x,x")) + (match_operand:SSEMODEF2P 2 "nonimmediate_operand" " x,m") + (neg:SSEMODEF2P + (match_operand:SSEMODEF2P 3 "nonimmediate_operand" "xm,x"))) + (match_operand:SSEMODEF2P 4 "const0_operand" "") + (const_int 1)))] + "TARGET_FMA4" + "vfnmsub\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "")]) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; +;; FMA4 Parallel floating point multiply addsub and subadd operations. +;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; It would be possible to represent these without the UNSPEC as +;; +;; (vec_merge +;; (fma op1 op2 op3) +;; (fma op1 op2 (neg op3)) +;; (merge-const)) +;; +;; But this doesn't seem useful in practice. 
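UNSPEC_FMADDSUB alternates the sign of the addend across elements: even-numbered elements compute a*b - c and odd-numbered elements a*b + c, which is the (vec_merge (fma ...) (fma ... (neg ...)) (merge-const)) form the comment above sketches. A scalar reference loop for the semantics (illustrative only):

#include <math.h>

/* Reference semantics of UNSPEC_FMADDSUB: even elements are fused
   multiply-subtract, odd elements are fused multiply-add.  */
static void
fmaddsub_ref (const float *a, const float *b, const float *c,
              float *out, int n)
{
  for (int i = 0; i < n; i++)
    out[i] = (i & 1) ? fmaf (a[i], b[i], c[i])    /* odd:  a*b + c */
                     : fmaf (a[i], b[i], -c[i]);  /* even: a*b - c */
}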
+ +(define_expand "fmaddsub_" + [(set (match_operand:AVXMODEF2P 0 "register_operand") + (unspec:AVXMODEF2P + [(match_operand:AVXMODEF2P 1 "nonimmediate_operand") + (match_operand:AVXMODEF2P 2 "nonimmediate_operand") + (match_operand:AVXMODEF2P 3 "nonimmediate_operand")] + UNSPEC_FMADDSUB))] + "TARGET_FMA || TARGET_FMA4" + "") + +(define_insn "*fma4_fmaddsub_" + [(set (match_operand:AVXMODEF2P 0 "register_operand" "=x,x") + (unspec:AVXMODEF2P + [(match_operand:AVXMODEF2P 1 "nonimmediate_operand" "%x,x") + (match_operand:AVXMODEF2P 2 "nonimmediate_operand" " x,m") + (match_operand:AVXMODEF2P 3 "nonimmediate_operand" "xm,x")] + UNSPEC_FMADDSUB))] + "TARGET_FMA4" + "vfmaddsub\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "")]) + +(define_insn "*fma4_fmsubadd_" + [(set (match_operand:AVXMODEF2P 0 "register_operand" "=x,x") + (unspec:AVXMODEF2P + [(match_operand:AVXMODEF2P 1 "nonimmediate_operand" "%x,x") + (match_operand:AVXMODEF2P 2 "nonimmediate_operand" " x,m") + (neg:AVXMODEF2P + (match_operand:AVXMODEF2P 3 "nonimmediate_operand" "xm,x"))] + UNSPEC_FMADDSUB))] + "TARGET_FMA4" + "vfmsubadd\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "")]) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; +;; FMA3 floating point multiply/accumulate instructions. +;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(define_insn "*fma_fmadd_" + [(set (match_operand:FMAMODE 0 "register_operand" "=x,x,x") + (fma:FMAMODE + (match_operand:FMAMODE 1 "nonimmediate_operand" "%0, 0,x") + (match_operand:FMAMODE 2 "nonimmediate_operand" "xm, x,xm") + (match_operand:FMAMODE 3 "nonimmediate_operand" " x,xm,0")))] + "TARGET_FMA" + "@ + vfmadd132\t{%2, %3, %0|%0, %3, %2} + vfmadd312\t{%3, %2, %0|%0, %2, %3} + vfmadd231\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "")]) + +(define_insn "*fma_fmsub_" + [(set (match_operand:FMAMODE 0 "register_operand" "=x,x,x") + (fma:FMAMODE + (match_operand:FMAMODE 1 "nonimmediate_operand" "%0, 0,x") + (match_operand:FMAMODE 2 "nonimmediate_operand" "xm, x,xm") + (neg:FMAMODE + (match_operand:FMAMODE 3 "nonimmediate_operand" " x,xm,0"))))] + "TARGET_FMA" + "@ + vfmsub132\t{%2, %3, %0|%0, %3, %2} + vfmsub312\t{%3, %2, %0|%0, %2, %3} + vfmsub231\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "")]) + +(define_insn "*fma_fnmadd_" + [(set (match_operand:FMAMODE 0 "register_operand" "=x,x,x") + (fma:FMAMODE + (neg:FMAMODE + (match_operand:FMAMODE 1 "nonimmediate_operand" "%0, 0,x")) + (match_operand:FMAMODE 2 "nonimmediate_operand" "xm, x,xm") + (match_operand:FMAMODE 3 "nonimmediate_operand" " x,xm,0")))] + "TARGET_FMA" + "@ + vfnmadd132\t{%2, %3, %0|%0, %3, %2} + vfnmadd312\t{%3, %2, %0|%0, %2, %3} + vfnmadd231\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "")]) + +(define_insn "*fma_fnmsub_" + [(set (match_operand:FMAMODE 0 "register_operand" "=x,x,x") + (fma:FMAMODE + (neg:FMAMODE + (match_operand:FMAMODE 1 "nonimmediate_operand" "%0, 0,x")) + (match_operand:FMAMODE 2 "nonimmediate_operand" "xm, x,xm") + (neg:FMAMODE + (match_operand:FMAMODE 3 "nonimmediate_operand" " x,xm,0"))))] + "TARGET_FMA" + "@ + vfnmsub132\t{%2, %3, %0|%0, %3, %2} + vfnmsub312\t{%3, %2, %0|%0, %2, %3} + vfnmsub231\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "")]) + +(define_insn "*fma_fmaddsub_" + [(set (match_operand:AVXMODEF2P 0 "register_operand" "=x,x,x") + 
(unspec:AVXMODEF2P + [(match_operand:AVXMODEF2P 1 "nonimmediate_operand" "%0, 0,x") + (match_operand:AVXMODEF2P 2 "nonimmediate_operand" "xm, x,xm") + (match_operand:AVXMODEF2P 3 "nonimmediate_operand" " x,xm,0")] + UNSPEC_FMADDSUB))] + "TARGET_FMA" + "@ + vfmaddsub132\t{%2, %3, %0|%0, %3, %2} + vfmaddsub213\t{%3, %2, %0|%0, %2, %3} + vfmaddsub231\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "")]) + +(define_insn "*fma_fmsubadd_" + [(set (match_operand:AVXMODEF2P 0 "register_operand" "=x,x,x") + (unspec:AVXMODEF2P + [(match_operand:AVXMODEF2P 1 "nonimmediate_operand" "%0, 0,x") + (match_operand:AVXMODEF2P 2 "nonimmediate_operand" "xm, x,xm") + (neg:AVXMODEF2P + (match_operand:AVXMODEF2P 3 "nonimmediate_operand" " x,xm,0"))] + UNSPEC_FMADDSUB))] + "TARGET_FMA" + "@ + vfmsubadd132\t{%2, %3, %0|%0, %3, %2} + vfmsubadd213\t{%3, %2, %0|%0, %2, %3} + vfmsubadd231\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "")]) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; +;; Parallel single-precision floating point conversion operations +;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(define_insn "sse_cvtpi2ps" + [(set (match_operand:V4SF 0 "register_operand" "=x") + (vec_merge:V4SF + (vec_duplicate:V4SF + (float:V2SF (match_operand:V2SI 2 "nonimmediate_operand" "ym"))) + (match_operand:V4SF 1 "register_operand" "0") + (const_int 3)))] + "TARGET_SSE" + "cvtpi2ps\t{%2, %0|%0, %2}" + [(set_attr "type" "ssecvt") + (set_attr "mode" "V4SF")]) + +(define_insn "sse_cvtps2pi" + [(set (match_operand:V2SI 0 "register_operand" "=y") + (vec_select:V2SI + (unspec:V4SI [(match_operand:V4SF 1 "nonimmediate_operand" "xm")] + UNSPEC_FIX_NOTRUNC) + (parallel [(const_int 0) (const_int 1)])))] + "TARGET_SSE" + "cvtps2pi\t{%1, %0|%0, %1}" + [(set_attr "type" "ssecvt") + (set_attr "unit" "mmx") + (set_attr "mode" "DI")]) + +(define_insn "sse_cvttps2pi" + [(set (match_operand:V2SI 0 "register_operand" "=y") + (vec_select:V2SI + (fix:V4SI (match_operand:V4SF 1 "nonimmediate_operand" "xm")) + (parallel [(const_int 0) (const_int 1)])))] + "TARGET_SSE" + "cvttps2pi\t{%1, %0|%0, %1}" + [(set_attr "type" "ssecvt") + (set_attr "unit" "mmx") + (set_attr "prefix_rep" "0") + (set_attr "mode" "SF")]) + +(define_insn "*avx_cvtsi2ss" + [(set (match_operand:V4SF 0 "register_operand" "=x") + (vec_merge:V4SF + (vec_duplicate:V4SF + (float:SF (match_operand:SI 2 "nonimmediate_operand" "rm"))) + (match_operand:V4SF 1 "register_operand" "x") + (const_int 1)))] + "TARGET_AVX" + "vcvtsi2ss\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseicvt") + (set_attr "prefix" "vex") + (set_attr "mode" "SF")]) + +(define_insn "sse_cvtsi2ss" + [(set (match_operand:V4SF 0 "register_operand" "=x,x") + (vec_merge:V4SF + (vec_duplicate:V4SF + (float:SF (match_operand:SI 2 "nonimmediate_operand" "r,m"))) + (match_operand:V4SF 1 "register_operand" "0,0") + (const_int 1)))] + "TARGET_SSE" + "cvtsi2ss\t{%2, %0|%0, %2}" + [(set_attr "type" "sseicvt") + (set_attr "athlon_decode" "vector,double") + (set_attr "amdfam10_decode" "vector,double") + (set_attr "bdver1_decode" "double,direct") + (set_attr "mode" "SF")]) + +(define_insn "*avx_cvtsi2ssq" + [(set (match_operand:V4SF 0 "register_operand" "=x") + (vec_merge:V4SF + (vec_duplicate:V4SF + (float:SF (match_operand:DI 2 "nonimmediate_operand" "rm"))) + (match_operand:V4SF 1 "register_operand" "x") + (const_int 1)))] + "TARGET_AVX && TARGET_64BIT" + "vcvtsi2ssq\t{%2, %1, %0|%0, %1, %2}" + [(set_attr 
"type" "sseicvt") + (set_attr "length_vex" "4") + (set_attr "prefix" "vex") + (set_attr "mode" "SF")]) + +(define_insn "sse_cvtsi2ssq" + [(set (match_operand:V4SF 0 "register_operand" "=x,x") + (vec_merge:V4SF + (vec_duplicate:V4SF + (float:SF (match_operand:DI 2 "nonimmediate_operand" "r,rm"))) + (match_operand:V4SF 1 "register_operand" "0,0") + (const_int 1)))] + "TARGET_SSE && TARGET_64BIT" + "cvtsi2ssq\t{%2, %0|%0, %2}" + [(set_attr "type" "sseicvt") + (set_attr "prefix_rex" "1") + (set_attr "athlon_decode" "vector,double") + (set_attr "amdfam10_decode" "vector,double") + (set_attr "bdver1_decode" "double,direct") + (set_attr "mode" "SF")]) + +(define_insn "sse_cvtss2si" + [(set (match_operand:SI 0 "register_operand" "=r,r") + (unspec:SI + [(vec_select:SF + (match_operand:V4SF 1 "nonimmediate_operand" "x,m") + (parallel [(const_int 0)]))] + UNSPEC_FIX_NOTRUNC))] + "TARGET_SSE" + "%vcvtss2si\t{%1, %0|%0, %1}" + [(set_attr "type" "sseicvt") + (set_attr "athlon_decode" "double,vector") + (set_attr "bdver1_decode" "double,double") + (set_attr "prefix_rep" "1") + (set_attr "prefix" "maybe_vex") + (set_attr "mode" "SI")]) + +(define_insn "sse_cvtss2si_2" + [(set (match_operand:SI 0 "register_operand" "=r,r") + (unspec:SI [(match_operand:SF 1 "nonimmediate_operand" "x,m")] + UNSPEC_FIX_NOTRUNC))] + "TARGET_SSE" + "%vcvtss2si\t{%1, %0|%0, %1}" + [(set_attr "type" "sseicvt") + (set_attr "athlon_decode" "double,vector") + (set_attr "amdfam10_decode" "double,double") + (set_attr "bdver1_decode" "double,double") + (set_attr "prefix_rep" "1") + (set_attr "prefix" "maybe_vex") + (set_attr "mode" "SI")]) + +(define_insn "sse_cvtss2siq" + [(set (match_operand:DI 0 "register_operand" "=r,r") + (unspec:DI + [(vec_select:SF + (match_operand:V4SF 1 "nonimmediate_operand" "x,m") + (parallel [(const_int 0)]))] + UNSPEC_FIX_NOTRUNC))] + "TARGET_SSE && TARGET_64BIT" + "%vcvtss2si{q}\t{%1, %0|%0, %1}" + [(set_attr "type" "sseicvt") + (set_attr "athlon_decode" "double,vector") + (set_attr "bdver1_decode" "double,double") + (set_attr "prefix_rep" "1") + (set_attr "prefix" "maybe_vex") + (set_attr "mode" "DI")]) + +(define_insn "sse_cvtss2siq_2" + [(set (match_operand:DI 0 "register_operand" "=r,r") + (unspec:DI [(match_operand:SF 1 "nonimmediate_operand" "x,m")] + UNSPEC_FIX_NOTRUNC))] + "TARGET_SSE && TARGET_64BIT" + "%vcvtss2si{q}\t{%1, %0|%0, %1}" + [(set_attr "type" "sseicvt") + (set_attr "athlon_decode" "double,vector") + (set_attr "amdfam10_decode" "double,double") + (set_attr "bdver1_decode" "double,double") + (set_attr "prefix_rep" "1") + (set_attr "prefix" "maybe_vex") + (set_attr "mode" "DI")]) + +(define_insn "sse_cvttss2si" + [(set (match_operand:SI 0 "register_operand" "=r,r") + (fix:SI + (vec_select:SF + (match_operand:V4SF 1 "nonimmediate_operand" "x,m") + (parallel [(const_int 0)]))))] + "TARGET_SSE" + "%vcvttss2si\t{%1, %0|%0, %1}" + [(set_attr "type" "sseicvt") + (set_attr "athlon_decode" "double,vector") + (set_attr "amdfam10_decode" "double,double") + (set_attr "bdver1_decode" "double,double") + (set_attr "prefix_rep" "1") + (set_attr "prefix" "maybe_vex") + (set_attr "mode" "SI")]) + +(define_insn "sse_cvttss2siq" + [(set (match_operand:DI 0 "register_operand" "=r,r") + (fix:DI + (vec_select:SF + (match_operand:V4SF 1 "nonimmediate_operand" "x,m") + (parallel [(const_int 0)]))))] + "TARGET_SSE && TARGET_64BIT" + "%vcvttss2si{q}\t{%1, %0|%0, %1}" + [(set_attr "type" "sseicvt") + (set_attr "athlon_decode" "double,vector") + (set_attr "amdfam10_decode" "double,double") + (set_attr 
"bdver1_decode" "double,double") + (set_attr "prefix_rep" "1") + (set_attr "prefix" "maybe_vex") + (set_attr "mode" "DI")]) + +(define_insn "avx_cvtdq2ps" + [(set (match_operand:AVXMODEDCVTDQ2PS 0 "register_operand" "=x") + (float:AVXMODEDCVTDQ2PS + (match_operand: 1 "nonimmediate_operand" "xm")))] + "TARGET_AVX" + "vcvtdq2ps\t{%1, %0|%0, %1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + +(define_insn "sse2_cvtdq2ps" + [(set (match_operand:V4SF 0 "register_operand" "=x") + (float:V4SF (match_operand:V4SI 1 "nonimmediate_operand" "xm")))] + "TARGET_SSE2" + "cvtdq2ps\t{%1, %0|%0, %1}" + [(set_attr "type" "ssecvt") + (set_attr "mode" "V4SF")]) + +(define_expand "sse2_cvtudq2ps" + [(set (match_dup 5) + (float:V4SF (match_operand:V4SI 1 "nonimmediate_operand" ""))) + (set (match_dup 6) + (lt:V4SF (match_dup 5) (match_dup 3))) + (set (match_dup 7) + (and:V4SF (match_dup 6) (match_dup 4))) + (set (match_operand:V4SF 0 "register_operand" "") + (plus:V4SF (match_dup 5) (match_dup 7)))] + "TARGET_SSE2" +{ + REAL_VALUE_TYPE TWO32r; + rtx x; + int i; + + real_ldexp (&TWO32r, &dconst1, 32); + x = const_double_from_real_value (TWO32r, SFmode); + + operands[3] = force_reg (V4SFmode, CONST0_RTX (V4SFmode)); + operands[4] = force_reg (V4SFmode, + ix86_build_const_vector (V4SFmode, 1, x)); + + for (i = 5; i < 8; i++) + operands[i] = gen_reg_rtx (V4SFmode); +}) + +(define_insn "avx_cvtps2dq" + [(set (match_operand:AVXMODEDCVTPS2DQ 0 "register_operand" "=x") + (unspec:AVXMODEDCVTPS2DQ + [(match_operand: 1 "nonimmediate_operand" "xm")] + UNSPEC_FIX_NOTRUNC))] + "TARGET_AVX" + "vcvtps2dq\t{%1, %0|%0, %1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + +(define_insn "sse2_cvtps2dq" + [(set (match_operand:V4SI 0 "register_operand" "=x") + (unspec:V4SI [(match_operand:V4SF 1 "nonimmediate_operand" "xm")] + UNSPEC_FIX_NOTRUNC))] + "TARGET_SSE2" + "cvtps2dq\t{%1, %0|%0, %1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix_data16" "1") + (set_attr "mode" "TI")]) + +(define_insn "avx_cvttps2dq" + [(set (match_operand:AVXMODEDCVTPS2DQ 0 "register_operand" "=x") + (fix:AVXMODEDCVTPS2DQ + (match_operand: 1 "nonimmediate_operand" "xm")))] + "TARGET_AVX" + "vcvttps2dq\t{%1, %0|%0, %1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + +(define_insn "sse2_cvttps2dq" + [(set (match_operand:V4SI 0 "register_operand" "=x") + (fix:V4SI (match_operand:V4SF 1 "nonimmediate_operand" "xm")))] + "TARGET_SSE2" + "cvttps2dq\t{%1, %0|%0, %1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix_rep" "1") + (set_attr "prefix_data16" "0") + (set_attr "mode" "TI")]) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; +;; Parallel double-precision floating point conversion operations +;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(define_insn "sse2_cvtpi2pd" + [(set (match_operand:V2DF 0 "register_operand" "=x,x") + (float:V2DF (match_operand:V2SI 1 "nonimmediate_operand" "y,m")))] + "TARGET_SSE2" + "cvtpi2pd\t{%1, %0|%0, %1}" + [(set_attr "type" "ssecvt") + (set_attr "unit" "mmx,*") + (set_attr "prefix_data16" "1,*") + (set_attr "mode" "V2DF")]) + +(define_insn "sse2_cvtpd2pi" + [(set (match_operand:V2SI 0 "register_operand" "=y") + (unspec:V2SI [(match_operand:V2DF 1 "nonimmediate_operand" "xm")] + UNSPEC_FIX_NOTRUNC))] + "TARGET_SSE2" + "cvtpd2pi\t{%1, %0|%0, %1}" + [(set_attr "type" "ssecvt") + (set_attr "unit" "mmx") + (set_attr "prefix_data16" "1") + (set_attr "mode" 
"DI") + (set_attr "bdver1_decode" "double")]) + +(define_insn "sse2_cvttpd2pi" + [(set (match_operand:V2SI 0 "register_operand" "=y") + (fix:V2SI (match_operand:V2DF 1 "nonimmediate_operand" "xm")))] + "TARGET_SSE2" + "cvttpd2pi\t{%1, %0|%0, %1}" + [(set_attr "type" "ssecvt") + (set_attr "unit" "mmx") + (set_attr "prefix_data16" "1") + (set_attr "mode" "TI") + (set_attr "bdver1_decode" "double")]) + +(define_insn "*avx_cvtsi2sd" + [(set (match_operand:V2DF 0 "register_operand" "=x") + (vec_merge:V2DF + (vec_duplicate:V2DF + (float:DF (match_operand:SI 2 "nonimmediate_operand" "rm"))) + (match_operand:V2DF 1 "register_operand" "x") + (const_int 1)))] + "TARGET_AVX" + "vcvtsi2sd\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseicvt") + (set_attr "prefix" "vex") + (set_attr "mode" "DF")]) + +(define_insn "sse2_cvtsi2sd" + [(set (match_operand:V2DF 0 "register_operand" "=x,x") + (vec_merge:V2DF + (vec_duplicate:V2DF + (float:DF (match_operand:SI 2 "nonimmediate_operand" "r,m"))) + (match_operand:V2DF 1 "register_operand" "0,0") + (const_int 1)))] + "TARGET_SSE2" + "cvtsi2sd\t{%2, %0|%0, %2}" + [(set_attr "type" "sseicvt") + (set_attr "mode" "DF") + (set_attr "athlon_decode" "double,direct") + (set_attr "amdfam10_decode" "vector,double") + (set_attr "bdver1_decode" "double,direct")]) + +(define_insn "*avx_cvtsi2sdq" + [(set (match_operand:V2DF 0 "register_operand" "=x") + (vec_merge:V2DF + (vec_duplicate:V2DF + (float:DF (match_operand:DI 2 "nonimmediate_operand" "rm"))) + (match_operand:V2DF 1 "register_operand" "x") + (const_int 1)))] + "TARGET_AVX && TARGET_64BIT" + "vcvtsi2sdq\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseicvt") + (set_attr "length_vex" "4") + (set_attr "prefix" "vex") + (set_attr "mode" "DF")]) + +(define_insn "sse2_cvtsi2sdq" + [(set (match_operand:V2DF 0 "register_operand" "=x,x") + (vec_merge:V2DF + (vec_duplicate:V2DF + (float:DF (match_operand:DI 2 "nonimmediate_operand" "r,m"))) + (match_operand:V2DF 1 "register_operand" "0,0") + (const_int 1)))] + "TARGET_SSE2 && TARGET_64BIT" + "cvtsi2sdq\t{%2, %0|%0, %2}" + [(set_attr "type" "sseicvt") + (set_attr "prefix_rex" "1") + (set_attr "mode" "DF") + (set_attr "athlon_decode" "double,direct") + (set_attr "amdfam10_decode" "vector,double") + (set_attr "bdver1_decode" "double,direct")]) + +(define_insn "sse2_cvtsd2si" + [(set (match_operand:SI 0 "register_operand" "=r,r") + (unspec:SI + [(vec_select:DF + (match_operand:V2DF 1 "nonimmediate_operand" "x,m") + (parallel [(const_int 0)]))] + UNSPEC_FIX_NOTRUNC))] + "TARGET_SSE2" + "%vcvtsd2si\t{%1, %0|%0, %1}" + [(set_attr "type" "sseicvt") + (set_attr "athlon_decode" "double,vector") + (set_attr "bdver1_decode" "double,double") + (set_attr "prefix_rep" "1") + (set_attr "prefix" "maybe_vex") + (set_attr "mode" "SI")]) + +(define_insn "sse2_cvtsd2si_2" + [(set (match_operand:SI 0 "register_operand" "=r,r") + (unspec:SI [(match_operand:DF 1 "nonimmediate_operand" "x,m")] + UNSPEC_FIX_NOTRUNC))] + "TARGET_SSE2" + "%vcvtsd2si\t{%1, %0|%0, %1}" + [(set_attr "type" "sseicvt") + (set_attr "athlon_decode" "double,vector") + (set_attr "amdfam10_decode" "double,double") + (set_attr "bdver1_decode" "double,double") + (set_attr "prefix_rep" "1") + (set_attr "prefix" "maybe_vex") + (set_attr "mode" "SI")]) + +(define_insn "sse2_cvtsd2siq" + [(set (match_operand:DI 0 "register_operand" "=r,r") + (unspec:DI + [(vec_select:DF + (match_operand:V2DF 1 "nonimmediate_operand" "x,m") + (parallel [(const_int 0)]))] + UNSPEC_FIX_NOTRUNC))] + "TARGET_SSE2 && TARGET_64BIT" + "%vcvtsd2siq\t{%1, 
%0|%0, %1}" + [(set_attr "type" "sseicvt") + (set_attr "athlon_decode" "double,vector") + (set_attr "bdver1_decode" "double,double") + (set_attr "prefix_rep" "1") + (set_attr "prefix" "maybe_vex") + (set_attr "mode" "DI")]) + +(define_insn "sse2_cvtsd2siq_2" + [(set (match_operand:DI 0 "register_operand" "=r,r") + (unspec:DI [(match_operand:DF 1 "nonimmediate_operand" "x,m")] + UNSPEC_FIX_NOTRUNC))] + "TARGET_SSE2 && TARGET_64BIT" + "%vcvtsd2siq\t{%1, %0|%0, %1}" + [(set_attr "type" "sseicvt") + (set_attr "athlon_decode" "double,vector") + (set_attr "amdfam10_decode" "double,double") + (set_attr "bdver1_decode" "double,double") + (set_attr "prefix_rep" "1") + (set_attr "prefix" "maybe_vex") + (set_attr "mode" "DI")]) + +(define_insn "sse2_cvttsd2si" + [(set (match_operand:SI 0 "register_operand" "=r,r") + (fix:SI + (vec_select:DF + (match_operand:V2DF 1 "nonimmediate_operand" "x,m") + (parallel [(const_int 0)]))))] + "TARGET_SSE2" + "%vcvttsd2si\t{%1, %0|%0, %1}" + [(set_attr "type" "sseicvt") + (set_attr "prefix_rep" "1") + (set_attr "prefix" "maybe_vex") + (set_attr "mode" "SI") + (set_attr "athlon_decode" "double,vector") + (set_attr "amdfam10_decode" "double,double") + (set_attr "bdver1_decode" "double,double")]) + +(define_insn "sse2_cvttsd2siq" + [(set (match_operand:DI 0 "register_operand" "=r,r") + (fix:DI + (vec_select:DF + (match_operand:V2DF 1 "nonimmediate_operand" "x,m") + (parallel [(const_int 0)]))))] + "TARGET_SSE2 && TARGET_64BIT" + "%vcvttsd2siq\t{%1, %0|%0, %1}" + [(set_attr "type" "sseicvt") + (set_attr "prefix_rep" "1") + (set_attr "prefix" "maybe_vex") + (set_attr "mode" "DI") + (set_attr "athlon_decode" "double,vector") + (set_attr "amdfam10_decode" "double,double") + (set_attr "bdver1_decode" "double,double")]) + +(define_insn "avx_cvtdq2pd256" + [(set (match_operand:V4DF 0 "register_operand" "=x") + (float:V4DF (match_operand:V4SI 1 "nonimmediate_operand" "xm")))] + "TARGET_AVX" + "vcvtdq2pd\t{%1, %0|%0, %1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "vex") + (set_attr "mode" "V4DF")]) + +(define_insn "*avx_cvtdq2pd256_2" + [(set (match_operand:V4DF 0 "register_operand" "=x") + (float:V4DF + (vec_select:V4SI + (match_operand:V8SI 1 "nonimmediate_operand" "xm") + (parallel [(const_int 0) (const_int 1) (const_int 2) (const_int 3)]))))] + "TARGET_AVX" + "vcvtdq2pd\t{%x1, %0|%0, %x1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "vex") + (set_attr "mode" "V4DF")]) + +(define_insn "sse2_cvtdq2pd" + [(set (match_operand:V2DF 0 "register_operand" "=x") + (float:V2DF + (vec_select:V2SI + (match_operand:V4SI 1 "nonimmediate_operand" "xm") + (parallel [(const_int 0) (const_int 1)]))))] + "TARGET_SSE2" + "%vcvtdq2pd\t{%1, %0|%0, %1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "maybe_vex") + (set_attr "mode" "V2DF")]) + +(define_insn "avx_cvtpd2dq256" + [(set (match_operand:V4SI 0 "register_operand" "=x") + (unspec:V4SI [(match_operand:V4DF 1 "nonimmediate_operand" "xm")] + UNSPEC_FIX_NOTRUNC))] + "TARGET_AVX" + "vcvtpd2dq{y}\t{%1, %0|%0, %1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "vex") + (set_attr "mode" "OI")]) + +(define_expand "sse2_cvtpd2dq" + [(set (match_operand:V4SI 0 "register_operand" "") + (vec_concat:V4SI + (unspec:V2SI [(match_operand:V2DF 1 "nonimmediate_operand" "")] + UNSPEC_FIX_NOTRUNC) + (match_dup 2)))] + "TARGET_SSE2" + "operands[2] = CONST0_RTX (V2SImode);") + +(define_insn "*sse2_cvtpd2dq" + [(set (match_operand:V4SI 0 "register_operand" "=x") + (vec_concat:V4SI + (unspec:V2SI [(match_operand:V2DF 1 
"nonimmediate_operand" "xm")] + UNSPEC_FIX_NOTRUNC) + (match_operand:V2SI 2 "const0_operand" "")))] + "TARGET_SSE2" + "* return TARGET_AVX ? \"vcvtpd2dq{x}\t{%1, %0|%0, %1}\" + : \"cvtpd2dq\t{%1, %0|%0, %1}\";" + [(set_attr "type" "ssecvt") + (set_attr "prefix_rep" "1") + (set_attr "prefix_data16" "0") + (set_attr "prefix" "maybe_vex") + (set_attr "mode" "TI") + (set_attr "amdfam10_decode" "double") + (set_attr "athlon_decode" "vector") + (set_attr "bdver1_decode" "double")]) + +(define_insn "avx_cvttpd2dq256" + [(set (match_operand:V4SI 0 "register_operand" "=x") + (fix:V4SI (match_operand:V4DF 1 "nonimmediate_operand" "xm")))] + "TARGET_AVX" + "vcvttpd2dq{y}\t{%1, %0|%0, %1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "vex") + (set_attr "mode" "OI")]) + +(define_expand "sse2_cvttpd2dq" + [(set (match_operand:V4SI 0 "register_operand" "") + (vec_concat:V4SI + (fix:V2SI (match_operand:V2DF 1 "nonimmediate_operand" "")) + (match_dup 2)))] + "TARGET_SSE2" + "operands[2] = CONST0_RTX (V2SImode);") + +(define_insn "*sse2_cvttpd2dq" + [(set (match_operand:V4SI 0 "register_operand" "=x") + (vec_concat:V4SI + (fix:V2SI (match_operand:V2DF 1 "nonimmediate_operand" "xm")) + (match_operand:V2SI 2 "const0_operand" "")))] + "TARGET_SSE2" + "* return TARGET_AVX ? \"vcvttpd2dq{x}\t{%1, %0|%0, %1}\" + : \"cvttpd2dq\t{%1, %0|%0, %1}\";" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "maybe_vex") + (set_attr "mode" "TI") + (set_attr "amdfam10_decode" "double") + (set_attr "athlon_decode" "vector") + (set_attr "bdver1_decode" "double")]) + +(define_insn "*avx_cvtsd2ss" + [(set (match_operand:V4SF 0 "register_operand" "=x") + (vec_merge:V4SF + (vec_duplicate:V4SF + (float_truncate:V2SF + (match_operand:V2DF 2 "nonimmediate_operand" "xm"))) + (match_operand:V4SF 1 "register_operand" "x") + (const_int 1)))] + "TARGET_AVX" + "vcvtsd2ss\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "vex") + (set_attr "mode" "SF")]) + +(define_insn "sse2_cvtsd2ss" + [(set (match_operand:V4SF 0 "register_operand" "=x,x") + (vec_merge:V4SF + (vec_duplicate:V4SF + (float_truncate:V2SF + (match_operand:V2DF 2 "nonimmediate_operand" "x,m"))) + (match_operand:V4SF 1 "register_operand" "0,0") + (const_int 1)))] + "TARGET_SSE2" + "cvtsd2ss\t{%2, %0|%0, %2}" + [(set_attr "type" "ssecvt") + (set_attr "athlon_decode" "vector,double") + (set_attr "amdfam10_decode" "vector,double") + (set_attr "bdver1_decode" "direct,direct") + (set_attr "mode" "SF")]) + +(define_insn "*avx_cvtss2sd" + [(set (match_operand:V2DF 0 "register_operand" "=x") + (vec_merge:V2DF + (float_extend:V2DF + (vec_select:V2SF + (match_operand:V4SF 2 "nonimmediate_operand" "xm") + (parallel [(const_int 0) (const_int 1)]))) + (match_operand:V2DF 1 "register_operand" "x") + (const_int 1)))] + "TARGET_AVX" + "vcvtss2sd\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "vex") + (set_attr "mode" "DF")]) + +(define_insn "sse2_cvtss2sd" + [(set (match_operand:V2DF 0 "register_operand" "=x,x") + (vec_merge:V2DF + (float_extend:V2DF + (vec_select:V2SF + (match_operand:V4SF 2 "nonimmediate_operand" "x,m") + (parallel [(const_int 0) (const_int 1)]))) + (match_operand:V2DF 1 "register_operand" "0,0") + (const_int 1)))] + "TARGET_SSE2" + "cvtss2sd\t{%2, %0|%0, %2}" + [(set_attr "type" "ssecvt") + (set_attr "amdfam10_decode" "vector,double") + (set_attr "athlon_decode" "direct,direct") + (set_attr "bdver1_decode" "direct,direct") + (set_attr "mode" "DF")]) + +(define_insn "avx_cvtpd2ps256" + [(set (match_operand:V4SF 0 
"register_operand" "=x") + (float_truncate:V4SF + (match_operand:V4DF 1 "nonimmediate_operand" "xm")))] + "TARGET_AVX" + "vcvtpd2ps{y}\t{%1, %0|%0, %1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "vex") + (set_attr "mode" "V4SF")]) + +(define_expand "sse2_cvtpd2ps" + [(set (match_operand:V4SF 0 "register_operand" "") + (vec_concat:V4SF + (float_truncate:V2SF + (match_operand:V2DF 1 "nonimmediate_operand" "")) + (match_dup 2)))] + "TARGET_SSE2" + "operands[2] = CONST0_RTX (V2SFmode);") + +(define_insn "*sse2_cvtpd2ps" + [(set (match_operand:V4SF 0 "register_operand" "=x") + (vec_concat:V4SF + (float_truncate:V2SF + (match_operand:V2DF 1 "nonimmediate_operand" "xm")) + (match_operand:V2SF 2 "const0_operand" "")))] + "TARGET_SSE2" + "* return TARGET_AVX ? \"vcvtpd2ps{x}\t{%1, %0|%0, %1}\" + : \"cvtpd2ps\t{%1, %0|%0, %1}\";" + [(set_attr "type" "ssecvt") + (set_attr "prefix_data16" "1") + (set_attr "prefix" "maybe_vex") + (set_attr "mode" "V4SF") + (set_attr "amdfam10_decode" "double") + (set_attr "athlon_decode" "vector") + (set_attr "bdver1_decode" "double")]) + +(define_insn "avx_cvtps2pd256" + [(set (match_operand:V4DF 0 "register_operand" "=x") + (float_extend:V4DF + (match_operand:V4SF 1 "nonimmediate_operand" "xm")))] + "TARGET_AVX" + "vcvtps2pd\t{%1, %0|%0, %1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "vex") + (set_attr "mode" "V4DF")]) + +(define_insn "*avx_cvtps2pd256_2" + [(set (match_operand:V4DF 0 "register_operand" "=x") + (float_extend:V4DF + (vec_select:V4SF + (match_operand:V8SF 1 "nonimmediate_operand" "xm") + (parallel [(const_int 0) (const_int 1) (const_int 2) (const_int 3)]))))] + "TARGET_AVX" + "vcvtps2pd\t{%x1, %0|%0, %x1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "vex") + (set_attr "mode" "V4DF")]) + +(define_insn "sse2_cvtps2pd" + [(set (match_operand:V2DF 0 "register_operand" "=x") + (float_extend:V2DF + (vec_select:V2SF + (match_operand:V4SF 1 "nonimmediate_operand" "xm") + (parallel [(const_int 0) (const_int 1)]))))] + "TARGET_SSE2" + "%vcvtps2pd\t{%1, %0|%0, %1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "maybe_vex") + (set_attr "mode" "V2DF") + (set_attr "prefix_data16" "0") + (set_attr "amdfam10_decode" "direct") + (set_attr "athlon_decode" "double") + (set_attr "bdver1_decode" "double")]) + +(define_expand "vec_unpacks_hi_v4sf" + [(set (match_dup 2) + (vec_select:V4SF + (vec_concat:V8SF + (match_dup 2) + (match_operand:V4SF 1 "nonimmediate_operand" "")) + (parallel [(const_int 6) + (const_int 7) + (const_int 2) + (const_int 3)]))) + (set (match_operand:V2DF 0 "register_operand" "") + (float_extend:V2DF + (vec_select:V2SF + (match_dup 2) + (parallel [(const_int 0) (const_int 1)]))))] + "TARGET_SSE2" + "operands[2] = gen_reg_rtx (V4SFmode);") + +(define_expand "vec_unpacks_hi_v8sf" + [(set (match_dup 2) + (vec_select:V4SF + (match_operand:V8SF 1 "nonimmediate_operand" "") + (parallel [(const_int 4) + (const_int 5) + (const_int 6) + (const_int 7)]))) + (set (match_operand:V4DF 0 "register_operand" "") + (float_extend:V4DF + (match_dup 2)))] + "TARGET_AVX" +{ + operands[2] = gen_reg_rtx (V4SFmode); +}) + +(define_expand "vec_unpacks_lo_v4sf" + [(set (match_operand:V2DF 0 "register_operand" "") + (float_extend:V2DF + (vec_select:V2SF + (match_operand:V4SF 1 "nonimmediate_operand" "") + (parallel [(const_int 0) (const_int 1)]))))] + "TARGET_SSE2") + +(define_expand "vec_unpacks_lo_v8sf" + [(set (match_operand:V4DF 0 "register_operand" "") + (float_extend:V4DF + (vec_select:V4SF + (match_operand:V8SF 1 "nonimmediate_operand" "") + 
(parallel [(const_int 0) (const_int 1) (const_int 2) (const_int 3)]))))] + "TARGET_AVX") + +(define_expand "vec_unpacks_float_hi_v8hi" + [(match_operand:V4SF 0 "register_operand" "") + (match_operand:V8HI 1 "register_operand" "")] + "TARGET_SSE2" +{ + rtx tmp = gen_reg_rtx (V4SImode); + + emit_insn (gen_vec_unpacks_hi_v8hi (tmp, operands[1])); + emit_insn (gen_sse2_cvtdq2ps (operands[0], tmp)); + DONE; +}) + +(define_expand "vec_unpacks_float_lo_v8hi" + [(match_operand:V4SF 0 "register_operand" "") + (match_operand:V8HI 1 "register_operand" "")] + "TARGET_SSE2" +{ + rtx tmp = gen_reg_rtx (V4SImode); + + emit_insn (gen_vec_unpacks_lo_v8hi (tmp, operands[1])); + emit_insn (gen_sse2_cvtdq2ps (operands[0], tmp)); + DONE; +}) + +(define_expand "vec_unpacku_float_hi_v8hi" + [(match_operand:V4SF 0 "register_operand" "") + (match_operand:V8HI 1 "register_operand" "")] + "TARGET_SSE2" +{ + rtx tmp = gen_reg_rtx (V4SImode); + + emit_insn (gen_vec_unpacku_hi_v8hi (tmp, operands[1])); + emit_insn (gen_sse2_cvtdq2ps (operands[0], tmp)); + DONE; +}) + +(define_expand "vec_unpacku_float_lo_v8hi" + [(match_operand:V4SF 0 "register_operand" "") + (match_operand:V8HI 1 "register_operand" "")] + "TARGET_SSE2" +{ + rtx tmp = gen_reg_rtx (V4SImode); + + emit_insn (gen_vec_unpacku_lo_v8hi (tmp, operands[1])); + emit_insn (gen_sse2_cvtdq2ps (operands[0], tmp)); + DONE; +}) + +(define_expand "vec_unpacks_float_hi_v4si" + [(set (match_dup 2) + (vec_select:V4SI + (match_operand:V4SI 1 "nonimmediate_operand" "") + (parallel [(const_int 2) + (const_int 3) + (const_int 2) + (const_int 3)]))) + (set (match_operand:V2DF 0 "register_operand" "") + (float:V2DF + (vec_select:V2SI + (match_dup 2) + (parallel [(const_int 0) (const_int 1)]))))] + "TARGET_SSE2" + "operands[2] = gen_reg_rtx (V4SImode);") + +(define_expand "vec_unpacks_float_lo_v4si" + [(set (match_operand:V2DF 0 "register_operand" "") + (float:V2DF + (vec_select:V2SI + (match_operand:V4SI 1 "nonimmediate_operand" "") + (parallel [(const_int 0) (const_int 1)]))))] + "TARGET_SSE2") + +(define_expand "vec_unpacks_float_hi_v8si" + [(set (match_dup 2) + (vec_select:V4SI + (match_operand:V8SI 1 "nonimmediate_operand" "") + (parallel [(const_int 4) + (const_int 5) + (const_int 6) + (const_int 7)]))) + (set (match_operand:V4DF 0 "register_operand" "") + (float:V4DF + (match_dup 2)))] + "TARGET_AVX" + "operands[2] = gen_reg_rtx (V4SImode);") + +(define_expand "vec_unpacks_float_lo_v8si" + [(set (match_operand:V4DF 0 "register_operand" "") + (float:V4DF + (vec_select:V4SI + (match_operand:V8SI 1 "nonimmediate_operand" "") + (parallel [(const_int 0) (const_int 1) (const_int 2) (const_int 3)]))))] + "TARGET_AVX") + +(define_expand "vec_unpacku_float_hi_v4si" + [(set (match_dup 5) + (vec_select:V4SI + (match_operand:V4SI 1 "nonimmediate_operand" "") + (parallel [(const_int 2) + (const_int 3) + (const_int 2) + (const_int 3)]))) + (set (match_dup 6) + (float:V2DF + (vec_select:V2SI + (match_dup 5) + (parallel [(const_int 0) (const_int 1)])))) + (set (match_dup 7) + (lt:V2DF (match_dup 6) (match_dup 3))) + (set (match_dup 8) + (and:V2DF (match_dup 7) (match_dup 4))) + (set (match_operand:V2DF 0 "register_operand" "") + (plus:V2DF (match_dup 6) (match_dup 8)))] + "TARGET_SSE2" +{ + REAL_VALUE_TYPE TWO32r; + rtx x; + int i; + + real_ldexp (&TWO32r, &dconst1, 32); + x = const_double_from_real_value (TWO32r, DFmode); + + operands[3] = force_reg (V2DFmode, CONST0_RTX (V2DFmode)); + operands[4] = force_reg (V2DFmode, + ix86_build_const_vector (V2DFmode, 1, x)); + + operands[5] = 
gen_reg_rtx (V4SImode); + + for (i = 6; i < 9; i++) + operands[i] = gen_reg_rtx (V2DFmode); +}) + +(define_expand "vec_unpacku_float_lo_v4si" + [(set (match_dup 5) + (float:V2DF + (vec_select:V2SI + (match_operand:V4SI 1 "nonimmediate_operand" "") + (parallel [(const_int 0) (const_int 1)])))) + (set (match_dup 6) + (lt:V2DF (match_dup 5) (match_dup 3))) + (set (match_dup 7) + (and:V2DF (match_dup 6) (match_dup 4))) + (set (match_operand:V2DF 0 "register_operand" "") + (plus:V2DF (match_dup 5) (match_dup 7)))] + "TARGET_SSE2" +{ + REAL_VALUE_TYPE TWO32r; + rtx x; + int i; + + real_ldexp (&TWO32r, &dconst1, 32); + x = const_double_from_real_value (TWO32r, DFmode); + + operands[3] = force_reg (V2DFmode, CONST0_RTX (V2DFmode)); + operands[4] = force_reg (V2DFmode, + ix86_build_const_vector (V2DFmode, 1, x)); + + for (i = 5; i < 8; i++) + operands[i] = gen_reg_rtx (V2DFmode); +}) + +(define_expand "vec_pack_trunc_v4df" + [(set (match_dup 3) + (float_truncate:V4SF + (match_operand:V4DF 1 "nonimmediate_operand" ""))) + (set (match_dup 4) + (float_truncate:V4SF + (match_operand:V4DF 2 "nonimmediate_operand" ""))) + (set (match_operand:V8SF 0 "register_operand" "") + (vec_concat:V8SF + (match_dup 3) + (match_dup 4)))] + "TARGET_AVX" +{ + operands[3] = gen_reg_rtx (V4SFmode); + operands[4] = gen_reg_rtx (V4SFmode); +}) + +(define_expand "vec_pack_trunc_v2df" + [(match_operand:V4SF 0 "register_operand" "") + (match_operand:V2DF 1 "nonimmediate_operand" "") + (match_operand:V2DF 2 "nonimmediate_operand" "")] + "TARGET_SSE2" +{ + rtx r1, r2; + + r1 = gen_reg_rtx (V4SFmode); + r2 = gen_reg_rtx (V4SFmode); + + emit_insn (gen_sse2_cvtpd2ps (r1, operands[1])); + emit_insn (gen_sse2_cvtpd2ps (r2, operands[2])); + emit_insn (gen_sse_movlhps (operands[0], r1, r2)); + DONE; +}) + +(define_expand "vec_pack_sfix_trunc_v2df" + [(match_operand:V4SI 0 "register_operand" "") + (match_operand:V2DF 1 "nonimmediate_operand" "") + (match_operand:V2DF 2 "nonimmediate_operand" "")] + "TARGET_SSE2" +{ + rtx r1, r2; + + r1 = gen_reg_rtx (V4SImode); + r2 = gen_reg_rtx (V4SImode); + + emit_insn (gen_sse2_cvttpd2dq (r1, operands[1])); + emit_insn (gen_sse2_cvttpd2dq (r2, operands[2])); + emit_insn (gen_vec_interleave_lowv2di (gen_lowpart (V2DImode, operands[0]), + gen_lowpart (V2DImode, r1), + gen_lowpart (V2DImode, r2))); + DONE; +}) + +(define_expand "vec_pack_sfix_v2df" + [(match_operand:V4SI 0 "register_operand" "") + (match_operand:V2DF 1 "nonimmediate_operand" "") + (match_operand:V2DF 2 "nonimmediate_operand" "")] + "TARGET_SSE2" +{ + rtx r1, r2; + + r1 = gen_reg_rtx (V4SImode); + r2 = gen_reg_rtx (V4SImode); + + emit_insn (gen_sse2_cvtpd2dq (r1, operands[1])); + emit_insn (gen_sse2_cvtpd2dq (r2, operands[2])); + emit_insn (gen_vec_interleave_lowv2di (gen_lowpart (V2DImode, operands[0]), + gen_lowpart (V2DImode, r1), + gen_lowpart (V2DImode, r2))); + DONE; +}) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; +;; Parallel single-precision floating point element swizzling +;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(define_expand "sse_movhlps_exp" + [(set (match_operand:V4SF 0 "nonimmediate_operand" "") + (vec_select:V4SF + (vec_concat:V8SF + (match_operand:V4SF 1 "nonimmediate_operand" "") + (match_operand:V4SF 2 "nonimmediate_operand" "")) + (parallel [(const_int 6) + (const_int 7) + (const_int 2) + (const_int 3)])))] + "TARGET_SSE" +{ + rtx dst = ix86_fixup_binary_operands (UNKNOWN, V4SFmode, operands); + + emit_insn (gen_sse_movhlps (dst, operands[1], 
operands[2])); + + /* Fix up the destination if needed. */ + if (dst != operands[0]) + emit_move_insn (operands[0], dst); + + DONE; +}) + +(define_insn "*avx_movhlps" + [(set (match_operand:V4SF 0 "nonimmediate_operand" "=x,x,m") + (vec_select:V4SF + (vec_concat:V8SF + (match_operand:V4SF 1 "nonimmediate_operand" " x,x,0") + (match_operand:V4SF 2 "nonimmediate_operand" " x,o,x")) + (parallel [(const_int 6) + (const_int 7) + (const_int 2) + (const_int 3)])))] + "TARGET_AVX && !(MEM_P (operands[1]) && MEM_P (operands[2]))" + "@ + vmovhlps\t{%2, %1, %0|%0, %1, %2} + vmovlps\t{%H2, %1, %0|%0, %1, %H2} + vmovhps\t{%2, %0|%0, %2}" + [(set_attr "type" "ssemov") + (set_attr "prefix" "vex") + (set_attr "mode" "V4SF,V2SF,V2SF")]) + +(define_insn "sse_movhlps" + [(set (match_operand:V4SF 0 "nonimmediate_operand" "=x,x,m") + (vec_select:V4SF + (vec_concat:V8SF + (match_operand:V4SF 1 "nonimmediate_operand" " 0,0,0") + (match_operand:V4SF 2 "nonimmediate_operand" " x,o,x")) + (parallel [(const_int 6) + (const_int 7) + (const_int 2) + (const_int 3)])))] + "TARGET_SSE && !(MEM_P (operands[1]) && MEM_P (operands[2]))" + "@ + movhlps\t{%2, %0|%0, %2} + movlps\t{%H2, %0|%0, %H2} + movhps\t{%2, %0|%0, %2}" + [(set_attr "type" "ssemov") + (set_attr "mode" "V4SF,V2SF,V2SF")]) + +(define_expand "sse_movlhps_exp" + [(set (match_operand:V4SF 0 "nonimmediate_operand" "") + (vec_select:V4SF + (vec_concat:V8SF + (match_operand:V4SF 1 "nonimmediate_operand" "") + (match_operand:V4SF 2 "nonimmediate_operand" "")) + (parallel [(const_int 0) + (const_int 1) + (const_int 4) + (const_int 5)])))] + "TARGET_SSE" +{ + rtx dst = ix86_fixup_binary_operands (UNKNOWN, V4SFmode, operands); + + emit_insn (gen_sse_movlhps (dst, operands[1], operands[2])); + + /* Fix up the destination if needed. */ + if (dst != operands[0]) + emit_move_insn (operands[0], dst); + + DONE; +}) + +(define_insn "*avx_movlhps" + [(set (match_operand:V4SF 0 "nonimmediate_operand" "=x,x,o") + (vec_select:V4SF + (vec_concat:V8SF + (match_operand:V4SF 1 "nonimmediate_operand" " x,x,0") + (match_operand:V4SF 2 "nonimmediate_operand" " x,m,x")) + (parallel [(const_int 0) + (const_int 1) + (const_int 4) + (const_int 5)])))] + "TARGET_AVX && ix86_binary_operator_ok (UNKNOWN, V4SFmode, operands)" + "@ + vmovlhps\t{%2, %1, %0|%0, %1, %2} + vmovhps\t{%2, %1, %0|%0, %1, %2} + vmovlps\t{%2, %H0|%H0, %2}" + [(set_attr "type" "ssemov") + (set_attr "prefix" "vex") + (set_attr "mode" "V4SF,V2SF,V2SF")]) + +(define_insn "sse_movlhps" + [(set (match_operand:V4SF 0 "nonimmediate_operand" "=x,x,o") + (vec_select:V4SF + (vec_concat:V8SF + (match_operand:V4SF 1 "nonimmediate_operand" " 0,0,0") + (match_operand:V4SF 2 "nonimmediate_operand" " x,m,x")) + (parallel [(const_int 0) + (const_int 1) + (const_int 4) + (const_int 5)])))] + "TARGET_SSE && ix86_binary_operator_ok (UNKNOWN, V4SFmode, operands)" + "@ + movlhps\t{%2, %0|%0, %2} + movhps\t{%2, %0|%0, %2} + movlps\t{%2, %H0|%H0, %2}" + [(set_attr "type" "ssemov") + (set_attr "mode" "V4SF,V2SF,V2SF")]) + +;; Recall that the 256-bit unpck insns only shuffle within their lanes. 
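The lane restriction matters for the patterns that follow: on 256-bit operands the unpck forms interleave within each 128-bit half independently. As a rough C model of the selector used by avx_unpckhps256 just below (the function name and plain-array representation are inventions for illustration, not part of the port):

/* Illustration only: vunpckhps on two 8-float vectors.  Indices 0-7 of
   the (vec_concat ...) refer to operand 1, indices 8-15 to operand 2;
   the constant parallel below is {2,10,3,11,6,14,7,15}.  */
void
unpckhps256_model (const float *a, const float *b, float *dst)
{
  static const int sel[8] = { 2, 10, 3, 11, 6, 14, 7, 15 };
  float cat[16];
  int i;

  for (i = 0; i < 8; i++)
    {
      cat[i] = a[i];        /* low half of the concatenation  */
      cat[i + 8] = b[i];    /* high half of the concatenation  */
    }
  for (i = 0; i < 8; i++)
    dst[i] = cat[sel[i]];   /* vec_select with the constant parallel  */
}

Note that the result keeps the two 128-bit lanes separate: {a2,b2,a3,b3} in the low lane and {a6,b6,a7,b7} in the high lane, rather than a full-width interleave.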
+(define_insn "avx_unpckhps256" + [(set (match_operand:V8SF 0 "register_operand" "=x") + (vec_select:V8SF + (vec_concat:V16SF + (match_operand:V8SF 1 "register_operand" "x") + (match_operand:V8SF 2 "nonimmediate_operand" "xm")) + (parallel [(const_int 2) (const_int 10) + (const_int 3) (const_int 11) + (const_int 6) (const_int 14) + (const_int 7) (const_int 15)])))] + "TARGET_AVX" + "vunpckhps\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix" "vex") + (set_attr "mode" "V8SF")]) + +(define_insn "*avx_interleave_highv4sf" + [(set (match_operand:V4SF 0 "register_operand" "=x") + (vec_select:V4SF + (vec_concat:V8SF + (match_operand:V4SF 1 "register_operand" "x") + (match_operand:V4SF 2 "nonimmediate_operand" "xm")) + (parallel [(const_int 2) (const_int 6) + (const_int 3) (const_int 7)])))] + "TARGET_AVX" + "vunpckhps\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix" "vex") + (set_attr "mode" "V4SF")]) + +(define_expand "vec_interleave_highv8sf" + [(set (match_dup 3) + (vec_select:V8SF + (vec_concat:V16SF + (match_operand:V8SF 1 "register_operand" "x") + (match_operand:V8SF 2 "nonimmediate_operand" "xm")) + (parallel [(const_int 0) (const_int 8) + (const_int 1) (const_int 9) + (const_int 4) (const_int 12) + (const_int 5) (const_int 13)]))) + (set (match_dup 4) + (vec_select:V8SF + (vec_concat:V16SF + (match_dup 1) + (match_dup 2)) + (parallel [(const_int 2) (const_int 10) + (const_int 3) (const_int 11) + (const_int 6) (const_int 14) + (const_int 7) (const_int 15)]))) + (set (match_operand:V8SF 0 "register_operand" "") + (vec_select:V8SF + (vec_concat:V16SF + (match_dup 3) + (match_dup 4)) + (parallel [(const_int 4) (const_int 5) + (const_int 6) (const_int 7) + (const_int 12) (const_int 13) + (const_int 14) (const_int 15)])))] + "TARGET_AVX" +{ + operands[3] = gen_reg_rtx (V8SFmode); + operands[4] = gen_reg_rtx (V8SFmode); +}) + +(define_insn "vec_interleave_highv4sf" + [(set (match_operand:V4SF 0 "register_operand" "=x") + (vec_select:V4SF + (vec_concat:V8SF + (match_operand:V4SF 1 "register_operand" "0") + (match_operand:V4SF 2 "nonimmediate_operand" "xm")) + (parallel [(const_int 2) (const_int 6) + (const_int 3) (const_int 7)])))] + "TARGET_SSE" + "unpckhps\t{%2, %0|%0, %2}" + [(set_attr "type" "sselog") + (set_attr "mode" "V4SF")]) + +;; Recall that the 256-bit unpck insns only shuffle within their lanes. 
+(define_insn "avx_unpcklps256" + [(set (match_operand:V8SF 0 "register_operand" "=x") + (vec_select:V8SF + (vec_concat:V16SF + (match_operand:V8SF 1 "register_operand" "x") + (match_operand:V8SF 2 "nonimmediate_operand" "xm")) + (parallel [(const_int 0) (const_int 8) + (const_int 1) (const_int 9) + (const_int 4) (const_int 12) + (const_int 5) (const_int 13)])))] + "TARGET_AVX" + "vunpcklps\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix" "vex") + (set_attr "mode" "V8SF")]) + +(define_insn "*avx_interleave_lowv4sf" + [(set (match_operand:V4SF 0 "register_operand" "=x") + (vec_select:V4SF + (vec_concat:V8SF + (match_operand:V4SF 1 "register_operand" "x") + (match_operand:V4SF 2 "nonimmediate_operand" "xm")) + (parallel [(const_int 0) (const_int 4) + (const_int 1) (const_int 5)])))] + "TARGET_AVX" + "vunpcklps\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix" "vex") + (set_attr "mode" "V4SF")]) + +(define_expand "vec_interleave_lowv8sf" + [(set (match_dup 3) + (vec_select:V8SF + (vec_concat:V16SF + (match_operand:V8SF 1 "register_operand" "x") + (match_operand:V8SF 2 "nonimmediate_operand" "xm")) + (parallel [(const_int 0) (const_int 8) + (const_int 1) (const_int 9) + (const_int 4) (const_int 12) + (const_int 5) (const_int 13)]))) + (set (match_dup 4) + (vec_select:V8SF + (vec_concat:V16SF + (match_dup 1) + (match_dup 2)) + (parallel [(const_int 2) (const_int 10) + (const_int 3) (const_int 11) + (const_int 6) (const_int 14) + (const_int 7) (const_int 15)]))) + (set (match_operand:V8SF 0 "register_operand" "") + (vec_select:V8SF + (vec_concat:V16SF + (match_dup 3) + (match_dup 4)) + (parallel [(const_int 0) (const_int 1) + (const_int 2) (const_int 3) + (const_int 8) (const_int 9) + (const_int 10) (const_int 11)])))] + "TARGET_AVX" +{ + operands[3] = gen_reg_rtx (V8SFmode); + operands[4] = gen_reg_rtx (V8SFmode); +}) + +(define_insn "vec_interleave_lowv4sf" + [(set (match_operand:V4SF 0 "register_operand" "=x") + (vec_select:V4SF + (vec_concat:V8SF + (match_operand:V4SF 1 "register_operand" "0") + (match_operand:V4SF 2 "nonimmediate_operand" "xm")) + (parallel [(const_int 0) (const_int 4) + (const_int 1) (const_int 5)])))] + "TARGET_SSE" + "unpcklps\t{%2, %0|%0, %2}" + [(set_attr "type" "sselog") + (set_attr "mode" "V4SF")]) + +;; These are modeled with the same vec_concat as the others so that we +;; capture users of shufps that can use the new instructions +(define_insn "avx_movshdup256" + [(set (match_operand:V8SF 0 "register_operand" "=x") + (vec_select:V8SF + (vec_concat:V16SF + (match_operand:V8SF 1 "nonimmediate_operand" "xm") + (match_dup 1)) + (parallel [(const_int 1) (const_int 1) + (const_int 3) (const_int 3) + (const_int 5) (const_int 5) + (const_int 7) (const_int 7)])))] + "TARGET_AVX" + "vmovshdup\t{%1, %0|%0, %1}" + [(set_attr "type" "sse") + (set_attr "prefix" "vex") + (set_attr "mode" "V8SF")]) + +(define_insn "sse3_movshdup" + [(set (match_operand:V4SF 0 "register_operand" "=x") + (vec_select:V4SF + (vec_concat:V8SF + (match_operand:V4SF 1 "nonimmediate_operand" "xm") + (match_dup 1)) + (parallel [(const_int 1) + (const_int 1) + (const_int 7) + (const_int 7)])))] + "TARGET_SSE3" + "%vmovshdup\t{%1, %0|%0, %1}" + [(set_attr "type" "sse") + (set_attr "prefix_rep" "1") + (set_attr "prefix" "maybe_vex") + (set_attr "mode" "V4SF")]) + +(define_insn "avx_movsldup256" + [(set (match_operand:V8SF 0 "register_operand" "=x") + (vec_select:V8SF + (vec_concat:V16SF + (match_operand:V8SF 1 "nonimmediate_operand" "xm") + (match_dup 1)) 
+ (parallel [(const_int 0) (const_int 0) + (const_int 2) (const_int 2) + (const_int 4) (const_int 4) + (const_int 6) (const_int 6)])))] + "TARGET_AVX" + "vmovsldup\t{%1, %0|%0, %1}" + [(set_attr "type" "sse") + (set_attr "prefix" "vex") + (set_attr "mode" "V8SF")]) + +(define_insn "sse3_movsldup" + [(set (match_operand:V4SF 0 "register_operand" "=x") + (vec_select:V4SF + (vec_concat:V8SF + (match_operand:V4SF 1 "nonimmediate_operand" "xm") + (match_dup 1)) + (parallel [(const_int 0) + (const_int 0) + (const_int 6) + (const_int 6)])))] + "TARGET_SSE3" + "%vmovsldup\t{%1, %0|%0, %1}" + [(set_attr "type" "sse") + (set_attr "prefix_rep" "1") + (set_attr "prefix" "maybe_vex") + (set_attr "mode" "V4SF")]) + +(define_expand "avx_shufps256" + [(match_operand:V8SF 0 "register_operand" "") + (match_operand:V8SF 1 "register_operand" "") + (match_operand:V8SF 2 "nonimmediate_operand" "") + (match_operand:SI 3 "const_int_operand" "")] + "TARGET_AVX" +{ + int mask = INTVAL (operands[3]); + emit_insn (gen_avx_shufps256_1 (operands[0], operands[1], operands[2], + GEN_INT ((mask >> 0) & 3), + GEN_INT ((mask >> 2) & 3), + GEN_INT (((mask >> 4) & 3) + 8), + GEN_INT (((mask >> 6) & 3) + 8), + GEN_INT (((mask >> 0) & 3) + 4), + GEN_INT (((mask >> 2) & 3) + 4), + GEN_INT (((mask >> 4) & 3) + 12), + GEN_INT (((mask >> 6) & 3) + 12))); + DONE; +}) + +;; One bit in mask selects 2 elements. +(define_insn "avx_shufps256_1" + [(set (match_operand:V8SF 0 "register_operand" "=x") + (vec_select:V8SF + (vec_concat:V16SF + (match_operand:V8SF 1 "register_operand" "x") + (match_operand:V8SF 2 "nonimmediate_operand" "xm")) + (parallel [(match_operand 3 "const_0_to_3_operand" "") + (match_operand 4 "const_0_to_3_operand" "") + (match_operand 5 "const_8_to_11_operand" "") + (match_operand 6 "const_8_to_11_operand" "") + (match_operand 7 "const_4_to_7_operand" "") + (match_operand 8 "const_4_to_7_operand" "") + (match_operand 9 "const_12_to_15_operand" "") + (match_operand 10 "const_12_to_15_operand" "")])))] + "TARGET_AVX + && (INTVAL (operands[3]) == (INTVAL (operands[7]) - 4) + && INTVAL (operands[4]) == (INTVAL (operands[8]) - 4) + && INTVAL (operands[5]) == (INTVAL (operands[9]) - 4) + && INTVAL (operands[6]) == (INTVAL (operands[10]) - 4))" +{ + int mask; + mask = INTVAL (operands[3]); + mask |= INTVAL (operands[4]) << 2; + mask |= (INTVAL (operands[5]) - 8) << 4; + mask |= (INTVAL (operands[6]) - 8) << 6; + operands[3] = GEN_INT (mask); + + return "vshufps\t{%3, %2, %1, %0|%0, %1, %2, %3}"; +} + [(set_attr "type" "sselog") + (set_attr "length_immediate" "1") + (set_attr "prefix" "vex") + (set_attr "mode" "V8SF")]) + +(define_expand "sse_shufps" + [(match_operand:V4SF 0 "register_operand" "") + (match_operand:V4SF 1 "register_operand" "") + (match_operand:V4SF 2 "nonimmediate_operand" "") + (match_operand:SI 3 "const_int_operand" "")] + "TARGET_SSE" +{ + int mask = INTVAL (operands[3]); + emit_insn (gen_sse_shufps_v4sf (operands[0], operands[1], operands[2], + GEN_INT ((mask >> 0) & 3), + GEN_INT ((mask >> 2) & 3), + GEN_INT (((mask >> 4) & 3) + 4), + GEN_INT (((mask >> 6) & 3) + 4))); + DONE; +}) + +(define_insn "*avx_shufps_" + [(set (match_operand:SSEMODE4S 0 "register_operand" "=x") + (vec_select:SSEMODE4S + (vec_concat: + (match_operand:SSEMODE4S 1 "register_operand" "x") + (match_operand:SSEMODE4S 2 "nonimmediate_operand" "xm")) + (parallel [(match_operand 3 "const_0_to_3_operand" "") + (match_operand 4 "const_0_to_3_operand" "") + (match_operand 5 "const_4_to_7_operand" "") + (match_operand 6 
"const_4_to_7_operand" "")])))] + "TARGET_AVX" +{ + int mask = 0; + mask |= INTVAL (operands[3]) << 0; + mask |= INTVAL (operands[4]) << 2; + mask |= (INTVAL (operands[5]) - 4) << 4; + mask |= (INTVAL (operands[6]) - 4) << 6; + operands[3] = GEN_INT (mask); + + return "vshufps\t{%3, %2, %1, %0|%0, %1, %2, %3}"; +} + [(set_attr "type" "sselog") + (set_attr "length_immediate" "1") + (set_attr "prefix" "vex") + (set_attr "mode" "V4SF")]) + +(define_insn "sse_shufps_" + [(set (match_operand:SSEMODE4S 0 "register_operand" "=x") + (vec_select:SSEMODE4S + (vec_concat: + (match_operand:SSEMODE4S 1 "register_operand" "0") + (match_operand:SSEMODE4S 2 "nonimmediate_operand" "xm")) + (parallel [(match_operand 3 "const_0_to_3_operand" "") + (match_operand 4 "const_0_to_3_operand" "") + (match_operand 5 "const_4_to_7_operand" "") + (match_operand 6 "const_4_to_7_operand" "")])))] + "TARGET_SSE" +{ + int mask = 0; + mask |= INTVAL (operands[3]) << 0; + mask |= INTVAL (operands[4]) << 2; + mask |= (INTVAL (operands[5]) - 4) << 4; + mask |= (INTVAL (operands[6]) - 4) << 6; + operands[3] = GEN_INT (mask); + + return "shufps\t{%3, %2, %0|%0, %2, %3}"; +} + [(set_attr "type" "sselog") + (set_attr "length_immediate" "1") + (set_attr "mode" "V4SF")]) + +(define_insn "sse_storehps" + [(set (match_operand:V2SF 0 "nonimmediate_operand" "=m,x,x") + (vec_select:V2SF + (match_operand:V4SF 1 "nonimmediate_operand" "x,x,o") + (parallel [(const_int 2) (const_int 3)])))] + "TARGET_SSE" + "@ + %vmovhps\t{%1, %0|%0, %1} + %vmovhlps\t{%1, %d0|%d0, %1} + %vmovlps\t{%H1, %d0|%d0, %H1}" + [(set_attr "type" "ssemov") + (set_attr "prefix" "maybe_vex") + (set_attr "mode" "V2SF,V4SF,V2SF")]) + +(define_expand "sse_loadhps_exp" + [(set (match_operand:V4SF 0 "nonimmediate_operand" "") + (vec_concat:V4SF + (vec_select:V2SF + (match_operand:V4SF 1 "nonimmediate_operand" "") + (parallel [(const_int 0) (const_int 1)])) + (match_operand:V2SF 2 "nonimmediate_operand" "")))] + "TARGET_SSE" +{ + rtx dst = ix86_fixup_binary_operands (UNKNOWN, V4SFmode, operands); + + emit_insn (gen_sse_loadhps (dst, operands[1], operands[2])); + + /* Fix up the destination if needed. 
*/ + if (dst != operands[0]) + emit_move_insn (operands[0], dst); + + DONE; +}) + +(define_insn "*avx_loadhps" + [(set (match_operand:V4SF 0 "nonimmediate_operand" "=x,x,o") + (vec_concat:V4SF + (vec_select:V2SF + (match_operand:V4SF 1 "nonimmediate_operand" "x,x,0") + (parallel [(const_int 0) (const_int 1)])) + (match_operand:V2SF 2 "nonimmediate_operand" "m,x,x")))] + "TARGET_AVX" + "@ + vmovhps\t{%2, %1, %0|%0, %1, %2} + vmovlhps\t{%2, %1, %0|%0, %1, %2} + vmovlps\t{%2, %H0|%H0, %2}" + [(set_attr "type" "ssemov") + (set_attr "prefix" "vex") + (set_attr "mode" "V2SF,V4SF,V2SF")]) + +(define_insn "sse_loadhps" + [(set (match_operand:V4SF 0 "nonimmediate_operand" "=x,x,o") + (vec_concat:V4SF + (vec_select:V2SF + (match_operand:V4SF 1 "nonimmediate_operand" "0,0,0") + (parallel [(const_int 0) (const_int 1)])) + (match_operand:V2SF 2 "nonimmediate_operand" "m,x,x")))] + "TARGET_SSE" + "@ + movhps\t{%2, %0|%0, %2} + movlhps\t{%2, %0|%0, %2} + movlps\t{%2, %H0|%H0, %2}" + [(set_attr "type" "ssemov") + (set_attr "mode" "V2SF,V4SF,V2SF")]) + +(define_insn "*avx_storelps" + [(set (match_operand:V2SF 0 "nonimmediate_operand" "=m,x,x") + (vec_select:V2SF + (match_operand:V4SF 1 "nonimmediate_operand" "x,x,m") + (parallel [(const_int 0) (const_int 1)])))] + "TARGET_AVX" + "@ + vmovlps\t{%1, %0|%0, %1} + vmovaps\t{%1, %0|%0, %1} + vmovlps\t{%1, %0, %0|%0, %0, %1}" + [(set_attr "type" "ssemov") + (set_attr "prefix" "vex") + (set_attr "mode" "V2SF,V2DF,V2SF")]) + +(define_insn "sse_storelps" + [(set (match_operand:V2SF 0 "nonimmediate_operand" "=m,x,x") + (vec_select:V2SF + (match_operand:V4SF 1 "nonimmediate_operand" "x,x,m") + (parallel [(const_int 0) (const_int 1)])))] + "TARGET_SSE" + "@ + movlps\t{%1, %0|%0, %1} + movaps\t{%1, %0|%0, %1} + movlps\t{%1, %0|%0, %1}" + [(set_attr "type" "ssemov") + (set_attr "mode" "V2SF,V4SF,V2SF")]) + +(define_expand "sse_loadlps_exp" + [(set (match_operand:V4SF 0 "nonimmediate_operand" "") + (vec_concat:V4SF + (match_operand:V2SF 2 "nonimmediate_operand" "") + (vec_select:V2SF + (match_operand:V4SF 1 "nonimmediate_operand" "") + (parallel [(const_int 2) (const_int 3)]))))] + "TARGET_SSE" +{ + rtx dst = ix86_fixup_binary_operands (UNKNOWN, V4SFmode, operands); + + emit_insn (gen_sse_loadlps (dst, operands[1], operands[2])); + + /* Fix up the destination if needed. 
*/ + if (dst != operands[0]) + emit_move_insn (operands[0], dst); + + DONE; +}) + +(define_insn "*avx_loadlps" + [(set (match_operand:V4SF 0 "nonimmediate_operand" "=x,x,m") + (vec_concat:V4SF + (match_operand:V2SF 2 "nonimmediate_operand" "x,m,x") + (vec_select:V2SF + (match_operand:V4SF 1 "nonimmediate_operand" "x,x,0") + (parallel [(const_int 2) (const_int 3)]))))] + "TARGET_AVX" + "@ + shufps\t{$0xe4, %1, %2, %0|%0, %2, %1, 0xe4} + vmovlps\t{%2, %1, %0|%0, %1, %2} + vmovlps\t{%2, %0|%0, %2}" + [(set_attr "type" "sselog,ssemov,ssemov") + (set_attr "length_immediate" "1,*,*") + (set_attr "prefix" "vex") + (set_attr "mode" "V4SF,V2SF,V2SF")]) + +(define_insn "sse_loadlps" + [(set (match_operand:V4SF 0 "nonimmediate_operand" "=x,x,m") + (vec_concat:V4SF + (match_operand:V2SF 2 "nonimmediate_operand" "0,m,x") + (vec_select:V2SF + (match_operand:V4SF 1 "nonimmediate_operand" "x,0,0") + (parallel [(const_int 2) (const_int 3)]))))] + "TARGET_SSE" + "@ + shufps\t{$0xe4, %1, %0|%0, %1, 0xe4} + movlps\t{%2, %0|%0, %2} + movlps\t{%2, %0|%0, %2}" + [(set_attr "type" "sselog,ssemov,ssemov") + (set_attr "length_immediate" "1,*,*") + (set_attr "mode" "V4SF,V2SF,V2SF")]) + +(define_insn "*avx_movss" + [(set (match_operand:V4SF 0 "register_operand" "=x") + (vec_merge:V4SF + (match_operand:V4SF 2 "register_operand" "x") + (match_operand:V4SF 1 "register_operand" "x") + (const_int 1)))] + "TARGET_AVX" + "vmovss\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "ssemov") + (set_attr "prefix" "vex") + (set_attr "mode" "SF")]) + +(define_insn "sse_movss" + [(set (match_operand:V4SF 0 "register_operand" "=x") + (vec_merge:V4SF + (match_operand:V4SF 2 "register_operand" "x") + (match_operand:V4SF 1 "register_operand" "0") + (const_int 1)))] + "TARGET_SSE" + "movss\t{%2, %0|%0, %2}" + [(set_attr "type" "ssemov") + (set_attr "mode" "SF")]) + +(define_expand "vec_dupv4sf" + [(set (match_operand:V4SF 0 "register_operand" "") + (vec_duplicate:V4SF + (match_operand:SF 1 "nonimmediate_operand" "")))] + "TARGET_SSE" +{ + if (!TARGET_AVX) + operands[1] = force_reg (SFmode, operands[1]); +}) + +(define_insn "*vec_dupv4sf_avx" + [(set (match_operand:V4SF 0 "register_operand" "=x,x") + (vec_duplicate:V4SF + (match_operand:SF 1 "nonimmediate_operand" "x,m")))] + "TARGET_AVX" + "@ + vshufps\t{$0, %1, %1, %0|%0, %1, %1, 0} + vbroadcastss\t{%1, %0|%0, %1}" + [(set_attr "type" "sselog1,ssemov") + (set_attr "length_immediate" "1,0") + (set_attr "prefix_extra" "0,1") + (set_attr "prefix" "vex") + (set_attr "mode" "V4SF")]) + +(define_insn "*vec_dupv4sf" + [(set (match_operand:V4SF 0 "register_operand" "=x") + (vec_duplicate:V4SF + (match_operand:SF 1 "register_operand" "0")))] + "TARGET_SSE" + "shufps\t{$0, %0, %0|%0, %0, 0}" + [(set_attr "type" "sselog1") + (set_attr "length_immediate" "1") + (set_attr "mode" "V4SF")]) + +(define_insn "*vec_concatv2sf_avx" + [(set (match_operand:V2SF 0 "register_operand" "=x,x,x,*y ,*y") + (vec_concat:V2SF + (match_operand:SF 1 "nonimmediate_operand" " x,x,m, 0 , m") + (match_operand:SF 2 "vector_move_operand" " x,m,C,*ym, C")))] + "TARGET_AVX" + "@ + vunpcklps\t{%2, %1, %0|%0, %1, %2} + vinsertps\t{$0x10, %2, %1, %0|%0, %1, %2, 0x10} + vmovss\t{%1, %0|%0, %1} + punpckldq\t{%2, %0|%0, %2} + movd\t{%1, %0|%0, %1}" + [(set_attr "type" "sselog,sselog,ssemov,mmxcvt,mmxmov") + (set_attr "length_immediate" "*,1,*,*,*") + (set_attr "prefix_extra" "*,1,*,*,*") + (set (attr "prefix") + (if_then_else (eq_attr "alternative" "3,4") + (const_string "orig") + (const_string "vex"))) + (set_attr "mode" 
"V4SF,V4SF,SF,DI,DI")]) + +;; Although insertps takes register source, we prefer +;; unpcklps with register source since it is shorter. +(define_insn "*vec_concatv2sf_sse4_1" + [(set (match_operand:V2SF 0 "register_operand" "=x,x,x,*y ,*y") + (vec_concat:V2SF + (match_operand:SF 1 "nonimmediate_operand" " 0,0,m, 0 , m") + (match_operand:SF 2 "vector_move_operand" " x,m,C,*ym, C")))] + "TARGET_SSE4_1" + "@ + unpcklps\t{%2, %0|%0, %2} + insertps\t{$0x10, %2, %0|%0, %2, 0x10} + movss\t{%1, %0|%0, %1} + punpckldq\t{%2, %0|%0, %2} + movd\t{%1, %0|%0, %1}" + [(set_attr "type" "sselog,sselog,ssemov,mmxcvt,mmxmov") + (set_attr "prefix_data16" "*,1,*,*,*") + (set_attr "prefix_extra" "*,1,*,*,*") + (set_attr "length_immediate" "*,1,*,*,*") + (set_attr "mode" "V4SF,V4SF,SF,DI,DI")]) + +;; ??? In theory we can match memory for the MMX alternative, but allowing +;; nonimmediate_operand for operand 2 and *not* allowing memory for the SSE +;; alternatives pretty much forces the MMX alternative to be chosen. +(define_insn "*vec_concatv2sf_sse" + [(set (match_operand:V2SF 0 "register_operand" "=x,x,*y,*y") + (vec_concat:V2SF + (match_operand:SF 1 "nonimmediate_operand" " 0,m, 0, m") + (match_operand:SF 2 "reg_or_0_operand" " x,C,*y, C")))] + "TARGET_SSE" + "@ + unpcklps\t{%2, %0|%0, %2} + movss\t{%1, %0|%0, %1} + punpckldq\t{%2, %0|%0, %2} + movd\t{%1, %0|%0, %1}" + [(set_attr "type" "sselog,ssemov,mmxcvt,mmxmov") + (set_attr "mode" "V4SF,SF,DI,DI")]) + +(define_insn "*vec_concatv4sf_avx" + [(set (match_operand:V4SF 0 "register_operand" "=x,x") + (vec_concat:V4SF + (match_operand:V2SF 1 "register_operand" " x,x") + (match_operand:V2SF 2 "nonimmediate_operand" " x,m")))] + "TARGET_AVX" + "@ + vmovlhps\t{%2, %1, %0|%0, %1, %2} + vmovhps\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "ssemov") + (set_attr "prefix" "vex") + (set_attr "mode" "V4SF,V2SF")]) + +(define_insn "*vec_concatv4sf_sse" + [(set (match_operand:V4SF 0 "register_operand" "=x,x") + (vec_concat:V4SF + (match_operand:V2SF 1 "register_operand" " 0,0") + (match_operand:V2SF 2 "nonimmediate_operand" " x,m")))] + "TARGET_SSE" + "@ + movlhps\t{%2, %0|%0, %2} + movhps\t{%2, %0|%0, %2}" + [(set_attr "type" "ssemov") + (set_attr "mode" "V4SF,V2SF")]) + +(define_expand "vec_init" + [(match_operand:SSEMODE 0 "register_operand" "") + (match_operand 1 "" "")] + "TARGET_SSE" +{ + ix86_expand_vector_init (false, operands[0], operands[1]); + DONE; +}) + +;; Avoid combining registers from different units in a single alternative, +;; see comment above inline_secondary_memory_needed function in i386.c +(define_insn "*vec_set_0_avx" + [(set (match_operand:SSEMODE4S 0 "nonimmediate_operand" "=x,x, x,x, x,m, m,m") + (vec_merge:SSEMODE4S + (vec_duplicate:SSEMODE4S + (match_operand: 2 + "general_operand" " x,m,*r,x,*rm,x,*r,fF")) + (match_operand:SSEMODE4S 1 "vector_move_operand" " C,C, C,x, x,0, 0,0") + (const_int 1)))] + "TARGET_AVX" + "@ + vinsertps\t{$0xe, %2, %2, %0|%0, %2, %2, 0xe} + vmov\t{%2, %0|%0, %2} + vmovd\t{%2, %0|%0, %2} + vmovss\t{%2, %1, %0|%0, %1, %2} + vpinsrd\t{$0, %2, %1, %0|%0, %1, %2, 0} + # + # + #" + [(set_attr "type" "sselog,ssemov,ssemov,ssemov,sselog,*,*,*") + (set_attr "prefix_extra" "*,*,*,*,1,*,*,*") + (set_attr "length_immediate" "*,*,*,*,1,*,*,*") + (set_attr "prefix" "vex") + (set_attr "mode" "SF,,SI,SF,TI,*,*,*")]) + +;; Avoid combining registers from different units in a single alternative, +;; see comment above inline_secondary_memory_needed function in i386.c +(define_insn "*vec_set_0_sse4_1" + [(set (match_operand:SSEMODE4S 0 
"nonimmediate_operand" "=x,x, x,x, x, m,m") + (vec_merge:SSEMODE4S + (vec_duplicate:SSEMODE4S + (match_operand: 2 + "general_operand" " x,m,*r,x,*rm,*r,fF")) + (match_operand:SSEMODE4S 1 "vector_move_operand" " C,C, C,0, 0, 0,0") + (const_int 1)))] + "TARGET_SSE4_1" + "@ + insertps\t{$0xe, %2, %0|%0, %2, 0xe} + mov\t{%2, %0|%0, %2} + movd\t{%2, %0|%0, %2} + movss\t{%2, %0|%0, %2} + pinsrd\t{$0, %2, %0|%0, %2, 0} + # + #" + [(set_attr "type" "sselog,ssemov,ssemov,ssemov,sselog,*,*") + (set_attr "prefix_extra" "*,*,*,*,1,*,*") + (set_attr "length_immediate" "*,*,*,*,1,*,*") + (set_attr "mode" "SF,,SI,SF,TI,*,*")]) + +;; Avoid combining registers from different units in a single alternative, +;; see comment above inline_secondary_memory_needed function in i386.c +(define_insn "*vec_set_0_sse2" + [(set (match_operand:SSEMODE4S 0 "nonimmediate_operand" "=x, x,x,m, m,m") + (vec_merge:SSEMODE4S + (vec_duplicate:SSEMODE4S + (match_operand: 2 + "general_operand" " m,*r,x,x,*r,fF")) + (match_operand:SSEMODE4S 1 "vector_move_operand" " C, C,0,0, 0,0") + (const_int 1)))] + "TARGET_SSE2" + "@ + mov\t{%2, %0|%0, %2} + movd\t{%2, %0|%0, %2} + movss\t{%2, %0|%0, %2} + # + # + #" + [(set_attr "type" "ssemov") + (set_attr "mode" ",SI,SF,*,*,*")]) + +;; Avoid combining registers from different units in a single alternative, +;; see comment above inline_secondary_memory_needed function in i386.c +(define_insn "vec_set_0" + [(set (match_operand:SSEMODE4S 0 "nonimmediate_operand" "=x,x,m, m,m") + (vec_merge:SSEMODE4S + (vec_duplicate:SSEMODE4S + (match_operand: 2 + "general_operand" " m,x,x,*r,fF")) + (match_operand:SSEMODE4S 1 "vector_move_operand" " C,0,0, 0,0") + (const_int 1)))] + "TARGET_SSE" + "@ + movss\t{%2, %0|%0, %2} + movss\t{%2, %0|%0, %2} + # + # + #" + [(set_attr "type" "ssemov") + (set_attr "mode" "SF,SF,*,*,*")]) + +;; A subset is vec_setv4sf. 
+(define_insn "*vec_setv4sf_avx" + [(set (match_operand:V4SF 0 "register_operand" "=x") + (vec_merge:V4SF + (vec_duplicate:V4SF + (match_operand:SF 2 "nonimmediate_operand" "xm")) + (match_operand:V4SF 1 "register_operand" "x") + (match_operand:SI 3 "const_pow2_1_to_8_operand" "n")))] + "TARGET_AVX" +{ + operands[3] = GEN_INT (exact_log2 (INTVAL (operands[3])) << 4); + return "vinsertps\t{%3, %2, %1, %0|%0, %1, %2, %3}"; +} + [(set_attr "type" "sselog") + (set_attr "prefix_extra" "1") + (set_attr "length_immediate" "1") + (set_attr "prefix" "vex") + (set_attr "mode" "V4SF")]) + +(define_insn "*vec_setv4sf_sse4_1" + [(set (match_operand:V4SF 0 "register_operand" "=x") + (vec_merge:V4SF + (vec_duplicate:V4SF + (match_operand:SF 2 "nonimmediate_operand" "xm")) + (match_operand:V4SF 1 "register_operand" "0") + (match_operand:SI 3 "const_pow2_1_to_8_operand" "n")))] + "TARGET_SSE4_1" +{ + operands[3] = GEN_INT (exact_log2 (INTVAL (operands[3])) << 4); + return "insertps\t{%3, %2, %0|%0, %2, %3}"; +} + [(set_attr "type" "sselog") + (set_attr "prefix_data16" "1") + (set_attr "prefix_extra" "1") + (set_attr "length_immediate" "1") + (set_attr "mode" "V4SF")]) + +(define_insn "*avx_insertps" + [(set (match_operand:V4SF 0 "register_operand" "=x") + (unspec:V4SF [(match_operand:V4SF 2 "nonimmediate_operand" "xm") + (match_operand:V4SF 1 "register_operand" "x") + (match_operand:SI 3 "const_0_to_255_operand" "n")] + UNSPEC_INSERTPS))] + "TARGET_AVX" +{ + if (MEM_P (operands[2])) + { + unsigned count_s = INTVAL (operands[3]) >> 6; + if (count_s) + operands[3] = GEN_INT (INTVAL (operands[3]) & 0x3f); + operands[2] = adjust_address_nv (operands[2], SFmode, count_s * 4); + } + return "vinsertps\t{%3, %2, %1, %0|%0, %1, %2, %3}"; +} + [(set_attr "type" "sselog") + (set_attr "prefix" "vex") + (set_attr "prefix_extra" "1") + (set_attr "length_immediate" "1") + (set_attr "mode" "V4SF")]) + +(define_insn "sse4_1_insertps" + [(set (match_operand:V4SF 0 "register_operand" "=x") + (unspec:V4SF [(match_operand:V4SF 2 "nonimmediate_operand" "xm") + (match_operand:V4SF 1 "register_operand" "0") + (match_operand:SI 3 "const_0_to_255_operand" "n")] + UNSPEC_INSERTPS))] + "TARGET_SSE4_1" +{ + if (MEM_P (operands[2])) + { + unsigned count_s = INTVAL (operands[3]) >> 6; + if (count_s) + operands[3] = GEN_INT (INTVAL (operands[3]) & 0x3f); + operands[2] = adjust_address_nv (operands[2], SFmode, count_s * 4); + } + return "insertps\t{%3, %2, %0|%0, %2, %3}"; +} + [(set_attr "type" "sselog") + (set_attr "prefix_data16" "1") + (set_attr "prefix_extra" "1") + (set_attr "length_immediate" "1") + (set_attr "mode" "V4SF")]) + +(define_split + [(set (match_operand:SSEMODE4S 0 "memory_operand" "") + (vec_merge:SSEMODE4S + (vec_duplicate:SSEMODE4S + (match_operand: 1 "nonmemory_operand" "")) + (match_dup 0) + (const_int 1)))] + "TARGET_SSE && reload_completed" + [(const_int 0)] +{ + emit_move_insn (adjust_address (operands[0], mode, 0), + operands[1]); + DONE; +}) + +(define_expand "vec_set" + [(match_operand:SSEMODE 0 "register_operand" "") + (match_operand: 1 "register_operand" "") + (match_operand 2 "const_int_operand" "")] + "TARGET_SSE" +{ + ix86_expand_vector_set (false, operands[0], operands[1], + INTVAL (operands[2])); + DONE; +}) + +(define_insn_and_split "*vec_extractv4sf_0" + [(set (match_operand:SF 0 "nonimmediate_operand" "=x,m,f,r") + (vec_select:SF + (match_operand:V4SF 1 "nonimmediate_operand" "xm,x,m,m") + (parallel [(const_int 0)])))] + "TARGET_SSE && !(MEM_P (operands[0]) && MEM_P (operands[1]))" + "#" + "&& 
reload_completed" + [(const_int 0)] +{ + rtx op1 = operands[1]; + if (REG_P (op1)) + op1 = gen_rtx_REG (SFmode, REGNO (op1)); + else + op1 = gen_lowpart (SFmode, op1); + emit_move_insn (operands[0], op1); + DONE; +}) + +(define_expand "avx_vextractf128" + [(match_operand: 0 "nonimmediate_operand" "") + (match_operand:AVX256MODE 1 "register_operand" "") + (match_operand:SI 2 "const_0_to_1_operand" "")] + "TARGET_AVX" +{ + switch (INTVAL (operands[2])) + { + case 0: + emit_insn (gen_vec_extract_lo_ (operands[0], operands[1])); + break; + case 1: + emit_insn (gen_vec_extract_hi_ (operands[0], operands[1])); + break; + default: + gcc_unreachable (); + } + DONE; +}) + +(define_insn_and_split "vec_extract_lo_" + [(set (match_operand: 0 "nonimmediate_operand" "=x,m") + (vec_select: + (match_operand:AVX256MODE4P 1 "nonimmediate_operand" "xm,x") + (parallel [(const_int 0) (const_int 1)])))] + "TARGET_AVX && !(MEM_P (operands[0]) && MEM_P (operands[1]))" + "#" + "&& reload_completed" + [(const_int 0)] +{ + rtx op1 = operands[1]; + if (REG_P (op1)) + op1 = gen_rtx_REG (mode, REGNO (op1)); + else + op1 = gen_lowpart (mode, op1); + emit_move_insn (operands[0], op1); + DONE; +}) + +(define_insn "vec_extract_hi_" + [(set (match_operand: 0 "nonimmediate_operand" "=x,m") + (vec_select: + (match_operand:AVX256MODE4P 1 "register_operand" "x,x") + (parallel [(const_int 2) (const_int 3)])))] + "TARGET_AVX" + "vextractf128\t{$0x1, %1, %0|%0, %1, 0x1}" + [(set_attr "type" "sselog") + (set_attr "prefix_extra" "1") + (set_attr "length_immediate" "1") + (set_attr "memory" "none,store") + (set_attr "prefix" "vex") + (set_attr "mode" "V8SF")]) + +(define_insn_and_split "vec_extract_lo_" + [(set (match_operand: 0 "nonimmediate_operand" "=x,m") + (vec_select: + (match_operand:AVX256MODE8P 1 "nonimmediate_operand" "xm,x") + (parallel [(const_int 0) (const_int 1) + (const_int 2) (const_int 3)])))] + "TARGET_AVX && !(MEM_P (operands[0]) && MEM_P (operands[1]))" + "#" + "&& reload_completed" + [(const_int 0)] +{ + rtx op1 = operands[1]; + if (REG_P (op1)) + op1 = gen_rtx_REG (mode, REGNO (op1)); + else + op1 = gen_lowpart (mode, op1); + emit_move_insn (operands[0], op1); + DONE; +}) + +(define_insn "vec_extract_hi_" + [(set (match_operand: 0 "nonimmediate_operand" "=x,m") + (vec_select: + (match_operand:AVX256MODE8P 1 "register_operand" "x,x") + (parallel [(const_int 4) (const_int 5) + (const_int 6) (const_int 7)])))] + "TARGET_AVX" + "vextractf128\t{$0x1, %1, %0|%0, %1, 0x1}" + [(set_attr "type" "sselog") + (set_attr "prefix_extra" "1") + (set_attr "length_immediate" "1") + (set_attr "memory" "none,store") + (set_attr "prefix" "vex") + (set_attr "mode" "V8SF")]) + +(define_insn_and_split "vec_extract_lo_v16hi" + [(set (match_operand:V8HI 0 "nonimmediate_operand" "=x,m") + (vec_select:V8HI + (match_operand:V16HI 1 "nonimmediate_operand" "xm,x") + (parallel [(const_int 0) (const_int 1) + (const_int 2) (const_int 3) + (const_int 4) (const_int 5) + (const_int 6) (const_int 7)])))] + "TARGET_AVX && !(MEM_P (operands[0]) && MEM_P (operands[1]))" + "#" + "&& reload_completed" + [(const_int 0)] +{ + rtx op1 = operands[1]; + if (REG_P (op1)) + op1 = gen_rtx_REG (V8HImode, REGNO (op1)); + else + op1 = gen_lowpart (V8HImode, op1); + emit_move_insn (operands[0], op1); + DONE; +}) + +(define_insn "vec_extract_hi_v16hi" + [(set (match_operand:V8HI 0 "nonimmediate_operand" "=x,m") + (vec_select:V8HI + (match_operand:V16HI 1 "register_operand" "x,x") + (parallel [(const_int 8) (const_int 9) + (const_int 10) (const_int 11) + (const_int 
12) (const_int 13) + (const_int 14) (const_int 15)])))] + "TARGET_AVX" + "vextractf128\t{$0x1, %1, %0|%0, %1, 0x1}" + [(set_attr "type" "sselog") + (set_attr "prefix_extra" "1") + (set_attr "length_immediate" "1") + (set_attr "memory" "none,store") + (set_attr "prefix" "vex") + (set_attr "mode" "V8SF")]) + +(define_insn_and_split "vec_extract_lo_v32qi" + [(set (match_operand:V16QI 0 "nonimmediate_operand" "=x,m") + (vec_select:V16QI + (match_operand:V32QI 1 "nonimmediate_operand" "xm,x") + (parallel [(const_int 0) (const_int 1) + (const_int 2) (const_int 3) + (const_int 4) (const_int 5) + (const_int 6) (const_int 7) + (const_int 8) (const_int 9) + (const_int 10) (const_int 11) + (const_int 12) (const_int 13) + (const_int 14) (const_int 15)])))] + "TARGET_AVX && !(MEM_P (operands[0]) && MEM_P (operands[1]))" + "#" + "&& reload_completed" + [(const_int 0)] +{ + rtx op1 = operands[1]; + if (REG_P (op1)) + op1 = gen_rtx_REG (V16QImode, REGNO (op1)); + else + op1 = gen_lowpart (V16QImode, op1); + emit_move_insn (operands[0], op1); + DONE; +}) + +(define_insn "vec_extract_hi_v32qi" + [(set (match_operand:V16QI 0 "nonimmediate_operand" "=x,m") + (vec_select:V16QI + (match_operand:V32QI 1 "register_operand" "x,x") + (parallel [(const_int 16) (const_int 17) + (const_int 18) (const_int 19) + (const_int 20) (const_int 21) + (const_int 22) (const_int 23) + (const_int 24) (const_int 25) + (const_int 26) (const_int 27) + (const_int 28) (const_int 29) + (const_int 30) (const_int 31)])))] + "TARGET_AVX" + "vextractf128\t{$0x1, %1, %0|%0, %1, 0x1}" + [(set_attr "type" "sselog") + (set_attr "prefix_extra" "1") + (set_attr "length_immediate" "1") + (set_attr "memory" "none,store") + (set_attr "prefix" "vex") + (set_attr "mode" "V8SF")]) + +(define_insn "*sse4_1_extractps" + [(set (match_operand:SF 0 "nonimmediate_operand" "=rm") + (vec_select:SF + (match_operand:V4SF 1 "register_operand" "x") + (parallel [(match_operand:SI 2 "const_0_to_3_operand" "n")])))] + "TARGET_SSE4_1" + "%vextractps\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix_data16" "1") + (set_attr "prefix_extra" "1") + (set_attr "length_immediate" "1") + (set_attr "prefix" "maybe_vex") + (set_attr "mode" "V4SF")]) + +(define_insn_and_split "*vec_extract_v4sf_mem" + [(set (match_operand:SF 0 "register_operand" "=x*rf") + (vec_select:SF + (match_operand:V4SF 1 "memory_operand" "o") + (parallel [(match_operand 2 "const_0_to_3_operand" "n")])))] + "TARGET_SSE" + "#" + "&& reload_completed" + [(const_int 0)] +{ + int i = INTVAL (operands[2]); + + emit_move_insn (operands[0], adjust_address (operands[1], SFmode, i*4)); + DONE; +}) + +(define_expand "vec_extract" + [(match_operand: 0 "register_operand" "") + (match_operand:VEC_EXTRACT_MODE 1 "register_operand" "") + (match_operand 2 "const_int_operand" "")] + "TARGET_SSE" +{ + ix86_expand_vector_extract (false, operands[0], operands[1], + INTVAL (operands[2])); + DONE; +}) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; +;; Parallel double-precision floating point element swizzling +;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Recall that the 256-bit unpck insns only shuffle within their lanes. 
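A note on the shufpd immediates used by the sse2_shufpd and avx_shufpd256 expanders later in this section: each result element is selected by one mask bit, and the 256-bit form simply repeats the two-bit 128-bit scheme for the high lane. A scalar C sketch (function names invented, illustration only):

/* Illustration only: element choices encoded by the shufpd immediates.  */

/* 128-bit shufpd: bit 0 picks dst[0] from a, bit 1 picks dst[1] from b.  */
void
shufpd_sel (const double *a, const double *b, int mask, double *dst)
{
  dst[0] = a[mask & 1];
  dst[1] = b[(mask >> 1) & 1];
}

/* 256-bit vshufpd: the same scheme repeated for the high 128-bit lane.  */
void
shufpd256_sel (const double *a, const double *b, int mask, double *dst)
{
  dst[0] = a[mask & 1];
  dst[1] = b[(mask >> 1) & 1];
  dst[2] = a[2 + ((mask >> 2) & 1)];
  dst[3] = b[2 + ((mask >> 3) & 1)];
}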
+(define_insn "avx_unpckhpd256" + [(set (match_operand:V4DF 0 "register_operand" "=x") + (vec_select:V4DF + (vec_concat:V8DF + (match_operand:V4DF 1 "register_operand" "x") + (match_operand:V4DF 2 "nonimmediate_operand" "xm")) + (parallel [(const_int 1) (const_int 5) + (const_int 3) (const_int 7)])))] + "TARGET_AVX" + "vunpckhpd\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix" "vex") + (set_attr "mode" "V4DF")]) + +(define_expand "vec_interleave_highv4df" + [(set (match_dup 3) + (vec_select:V4DF + (vec_concat:V8DF + (match_operand:V4DF 1 "register_operand" "x") + (match_operand:V4DF 2 "nonimmediate_operand" "xm")) + (parallel [(const_int 0) (const_int 4) + (const_int 2) (const_int 6)]))) + (set (match_dup 4) + (vec_select:V4DF + (vec_concat:V8DF + (match_dup 1) + (match_dup 2)) + (parallel [(const_int 1) (const_int 5) + (const_int 3) (const_int 7)]))) + (set (match_operand:V4DF 0 "register_operand" "") + (vec_select:V4DF + (vec_concat:V8DF + (match_dup 3) + (match_dup 4)) + (parallel [(const_int 2) (const_int 3) + (const_int 6) (const_int 7)])))] + "TARGET_AVX" +{ + operands[3] = gen_reg_rtx (V4DFmode); + operands[4] = gen_reg_rtx (V4DFmode); +}) + + +(define_expand "vec_interleave_highv2df" + [(set (match_operand:V2DF 0 "register_operand" "") + (vec_select:V2DF + (vec_concat:V4DF + (match_operand:V2DF 1 "nonimmediate_operand" "") + (match_operand:V2DF 2 "nonimmediate_operand" "")) + (parallel [(const_int 1) + (const_int 3)])))] + "TARGET_SSE2" +{ + if (!ix86_vec_interleave_v2df_operator_ok (operands, 1)) + operands[2] = force_reg (V2DFmode, operands[2]); +}) + +(define_insn "*avx_interleave_highv2df" + [(set (match_operand:V2DF 0 "nonimmediate_operand" "=x,x,x,m") + (vec_select:V2DF + (vec_concat:V4DF + (match_operand:V2DF 1 "nonimmediate_operand" " x,o,o,x") + (match_operand:V2DF 2 "nonimmediate_operand" " x,1,x,0")) + (parallel [(const_int 1) + (const_int 3)])))] + "TARGET_AVX && ix86_vec_interleave_v2df_operator_ok (operands, 1)" + "@ + vunpckhpd\t{%2, %1, %0|%0, %1, %2} + vmovddup\t{%H1, %0|%0, %H1} + vmovlpd\t{%H1, %2, %0|%0, %2, %H1} + vmovhpd\t{%1, %0|%0, %1}" + [(set_attr "type" "sselog,sselog,ssemov,ssemov") + (set_attr "prefix" "vex") + (set_attr "mode" "V2DF,V2DF,V1DF,V1DF")]) + +(define_insn "*sse3_interleave_highv2df" + [(set (match_operand:V2DF 0 "nonimmediate_operand" "=x,x,x,m") + (vec_select:V2DF + (vec_concat:V4DF + (match_operand:V2DF 1 "nonimmediate_operand" " 0,o,o,x") + (match_operand:V2DF 2 "nonimmediate_operand" " x,1,0,0")) + (parallel [(const_int 1) + (const_int 3)])))] + "TARGET_SSE3 && ix86_vec_interleave_v2df_operator_ok (operands, 1)" + "@ + unpckhpd\t{%2, %0|%0, %2} + movddup\t{%H1, %0|%0, %H1} + movlpd\t{%H1, %0|%0, %H1} + movhpd\t{%1, %0|%0, %1}" + [(set_attr "type" "sselog,sselog,ssemov,ssemov") + (set_attr "prefix_data16" "*,*,1,1") + (set_attr "mode" "V2DF,V2DF,V1DF,V1DF")]) + +(define_insn "*sse2_interleave_highv2df" + [(set (match_operand:V2DF 0 "nonimmediate_operand" "=x,x,m") + (vec_select:V2DF + (vec_concat:V4DF + (match_operand:V2DF 1 "nonimmediate_operand" " 0,o,x") + (match_operand:V2DF 2 "nonimmediate_operand" " x,0,0")) + (parallel [(const_int 1) + (const_int 3)])))] + "TARGET_SSE2 && ix86_vec_interleave_v2df_operator_ok (operands, 1)" + "@ + unpckhpd\t{%2, %0|%0, %2} + movlpd\t{%H1, %0|%0, %H1} + movhpd\t{%1, %0|%0, %1}" + [(set_attr "type" "sselog,ssemov,ssemov") + (set_attr "prefix_data16" "*,1,1") + (set_attr "mode" "V2DF,V1DF,V1DF")]) + +;; Recall that the 256-bit unpck insns only shuffle within their lanes. 
+(define_expand "avx_movddup256" + [(set (match_operand:V4DF 0 "register_operand" "") + (vec_select:V4DF + (vec_concat:V8DF + (match_operand:V4DF 1 "nonimmediate_operand" "") + (match_dup 1)) + (parallel [(const_int 0) (const_int 4) + (const_int 2) (const_int 6)])))] + "TARGET_AVX") + +(define_expand "avx_unpcklpd256" + [(set (match_operand:V4DF 0 "register_operand" "") + (vec_select:V4DF + (vec_concat:V8DF + (match_operand:V4DF 1 "register_operand" "") + (match_operand:V4DF 2 "nonimmediate_operand" "")) + (parallel [(const_int 0) (const_int 4) + (const_int 2) (const_int 6)])))] + "TARGET_AVX") + +(define_insn "*avx_unpcklpd256" + [(set (match_operand:V4DF 0 "register_operand" "=x,x") + (vec_select:V4DF + (vec_concat:V8DF + (match_operand:V4DF 1 "nonimmediate_operand" " x,m") + (match_operand:V4DF 2 "nonimmediate_operand" "xm,1")) + (parallel [(const_int 0) (const_int 4) + (const_int 2) (const_int 6)])))] + "TARGET_AVX" + "@ + vunpcklpd\t{%2, %1, %0|%0, %1, %2} + vmovddup\t{%1, %0|%0, %1}" + [(set_attr "type" "sselog") + (set_attr "prefix" "vex") + (set_attr "mode" "V4DF")]) + +(define_expand "vec_interleave_lowv4df" + [(set (match_dup 3) + (vec_select:V4DF + (vec_concat:V8DF + (match_operand:V4DF 1 "register_operand" "x") + (match_operand:V4DF 2 "nonimmediate_operand" "xm")) + (parallel [(const_int 0) (const_int 4) + (const_int 2) (const_int 6)]))) + (set (match_dup 4) + (vec_select:V4DF + (vec_concat:V8DF + (match_dup 1) + (match_dup 2)) + (parallel [(const_int 1) (const_int 5) + (const_int 3) (const_int 7)]))) + (set (match_operand:V4DF 0 "register_operand" "") + (vec_select:V4DF + (vec_concat:V8DF + (match_dup 3) + (match_dup 4)) + (parallel [(const_int 0) (const_int 1) + (const_int 4) (const_int 5)])))] + "TARGET_AVX" +{ + operands[3] = gen_reg_rtx (V4DFmode); + operands[4] = gen_reg_rtx (V4DFmode); +}) + +(define_expand "vec_interleave_lowv2df" + [(set (match_operand:V2DF 0 "register_operand" "") + (vec_select:V2DF + (vec_concat:V4DF + (match_operand:V2DF 1 "nonimmediate_operand" "") + (match_operand:V2DF 2 "nonimmediate_operand" "")) + (parallel [(const_int 0) + (const_int 2)])))] + "TARGET_SSE2" +{ + if (!ix86_vec_interleave_v2df_operator_ok (operands, 0)) + operands[1] = force_reg (V2DFmode, operands[1]); +}) + +(define_insn "*avx_interleave_lowv2df" + [(set (match_operand:V2DF 0 "nonimmediate_operand" "=x,x,x,o") + (vec_select:V2DF + (vec_concat:V4DF + (match_operand:V2DF 1 "nonimmediate_operand" " x,m,x,0") + (match_operand:V2DF 2 "nonimmediate_operand" " x,1,m,x")) + (parallel [(const_int 0) + (const_int 2)])))] + "TARGET_AVX && ix86_vec_interleave_v2df_operator_ok (operands, 0)" + "@ + vunpcklpd\t{%2, %1, %0|%0, %1, %2} + vmovddup\t{%1, %0|%0, %1} + vmovhpd\t{%2, %1, %0|%0, %1, %2} + vmovlpd\t{%2, %H0|%H0, %2}" + [(set_attr "type" "sselog,sselog,ssemov,ssemov") + (set_attr "prefix" "vex") + (set_attr "mode" "V2DF,V2DF,V1DF,V1DF")]) + +(define_insn "*sse3_interleave_lowv2df" + [(set (match_operand:V2DF 0 "nonimmediate_operand" "=x,x,x,o") + (vec_select:V2DF + (vec_concat:V4DF + (match_operand:V2DF 1 "nonimmediate_operand" " 0,m,0,0") + (match_operand:V2DF 2 "nonimmediate_operand" " x,1,m,x")) + (parallel [(const_int 0) + (const_int 2)])))] + "TARGET_SSE3 && ix86_vec_interleave_v2df_operator_ok (operands, 0)" + "@ + unpcklpd\t{%2, %0|%0, %2} + movddup\t{%1, %0|%0, %1} + movhpd\t{%2, %0|%0, %2} + movlpd\t{%2, %H0|%H0, %2}" + [(set_attr "type" "sselog,sselog,ssemov,ssemov") + (set_attr "prefix_data16" "*,*,1,1") + (set_attr "mode" "V2DF,V2DF,V1DF,V1DF")]) + +(define_insn 
"*sse2_interleave_lowv2df" + [(set (match_operand:V2DF 0 "nonimmediate_operand" "=x,x,o") + (vec_select:V2DF + (vec_concat:V4DF + (match_operand:V2DF 1 "nonimmediate_operand" " 0,0,0") + (match_operand:V2DF 2 "nonimmediate_operand" " x,m,x")) + (parallel [(const_int 0) + (const_int 2)])))] + "TARGET_SSE2 && ix86_vec_interleave_v2df_operator_ok (operands, 0)" + "@ + unpcklpd\t{%2, %0|%0, %2} + movhpd\t{%2, %0|%0, %2} + movlpd\t{%2, %H0|%H0, %2}" + [(set_attr "type" "sselog,ssemov,ssemov") + (set_attr "prefix_data16" "*,1,1") + (set_attr "mode" "V2DF,V1DF,V1DF")]) + +(define_split + [(set (match_operand:V2DF 0 "memory_operand" "") + (vec_select:V2DF + (vec_concat:V4DF + (match_operand:V2DF 1 "register_operand" "") + (match_dup 1)) + (parallel [(const_int 0) + (const_int 2)])))] + "TARGET_SSE3 && reload_completed" + [(const_int 0)] +{ + rtx low = gen_rtx_REG (DFmode, REGNO (operands[1])); + emit_move_insn (adjust_address (operands[0], DFmode, 0), low); + emit_move_insn (adjust_address (operands[0], DFmode, 8), low); + DONE; +}) + +(define_split + [(set (match_operand:V2DF 0 "register_operand" "") + (vec_select:V2DF + (vec_concat:V4DF + (match_operand:V2DF 1 "memory_operand" "") + (match_dup 1)) + (parallel [(match_operand:SI 2 "const_0_to_1_operand" "") + (match_operand:SI 3 "const_int_operand" "")])))] + "TARGET_SSE3 && INTVAL (operands[2]) + 2 == INTVAL (operands[3])" + [(set (match_dup 0) (vec_duplicate:V2DF (match_dup 1)))] +{ + operands[1] = adjust_address (operands[1], DFmode, INTVAL (operands[2]) * 8); +}) + +(define_expand "avx_shufpd256" + [(match_operand:V4DF 0 "register_operand" "") + (match_operand:V4DF 1 "register_operand" "") + (match_operand:V4DF 2 "nonimmediate_operand" "") + (match_operand:SI 3 "const_int_operand" "")] + "TARGET_AVX" +{ + int mask = INTVAL (operands[3]); + emit_insn (gen_avx_shufpd256_1 (operands[0], operands[1], operands[2], + GEN_INT (mask & 1), + GEN_INT (mask & 2 ? 5 : 4), + GEN_INT (mask & 4 ? 3 : 2), + GEN_INT (mask & 8 ? 7 : 6))); + DONE; +}) + +(define_insn "avx_shufpd256_1" + [(set (match_operand:V4DF 0 "register_operand" "=x") + (vec_select:V4DF + (vec_concat:V8DF + (match_operand:V4DF 1 "register_operand" "x") + (match_operand:V4DF 2 "nonimmediate_operand" "xm")) + (parallel [(match_operand 3 "const_0_to_1_operand" "") + (match_operand 4 "const_4_to_5_operand" "") + (match_operand 5 "const_2_to_3_operand" "") + (match_operand 6 "const_6_to_7_operand" "")])))] + "TARGET_AVX" +{ + int mask; + mask = INTVAL (operands[3]); + mask |= (INTVAL (operands[4]) - 4) << 1; + mask |= (INTVAL (operands[5]) - 2) << 2; + mask |= (INTVAL (operands[6]) - 6) << 3; + operands[3] = GEN_INT (mask); + + return "vshufpd\t{%3, %2, %1, %0|%0, %1, %2, %3}"; +} + [(set_attr "type" "sselog") + (set_attr "length_immediate" "1") + (set_attr "prefix" "vex") + (set_attr "mode" "V4DF")]) + +(define_expand "sse2_shufpd" + [(match_operand:V2DF 0 "register_operand" "") + (match_operand:V2DF 1 "register_operand" "") + (match_operand:V2DF 2 "nonimmediate_operand" "") + (match_operand:SI 3 "const_int_operand" "")] + "TARGET_SSE2" +{ + int mask = INTVAL (operands[3]); + emit_insn (gen_sse2_shufpd_v2df (operands[0], operands[1], operands[2], + GEN_INT (mask & 1), + GEN_INT (mask & 2 ? 
3 : 2))); + DONE; +}) + +(define_expand "vec_extract_even" + [(match_operand:SSEMODE_EO 0 "register_operand" "") + (match_operand:SSEMODE_EO 1 "register_operand" "") + (match_operand:SSEMODE_EO 2 "register_operand" "")] + "" +{ + ix86_expand_vec_extract_even_odd (operands[0], operands[1], operands[2], 0); + DONE; +}) + +(define_expand "vec_extract_odd" + [(match_operand:SSEMODE_EO 0 "register_operand" "") + (match_operand:SSEMODE_EO 1 "register_operand" "") + (match_operand:SSEMODE_EO 2 "register_operand" "")] + "" +{ + ix86_expand_vec_extract_even_odd (operands[0], operands[1], operands[2], 1); + DONE; +}) + +;; punpcklqdq and punpckhqdq are shorter than shufpd. +(define_insn "*avx_interleave_highv2di" + [(set (match_operand:V2DI 0 "register_operand" "=x") + (vec_select:V2DI + (vec_concat:V4DI + (match_operand:V2DI 1 "register_operand" "x") + (match_operand:V2DI 2 "nonimmediate_operand" "xm")) + (parallel [(const_int 1) + (const_int 3)])))] + "TARGET_AVX" + "vpunpckhqdq\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + +(define_insn "vec_interleave_highv2di" + [(set (match_operand:V2DI 0 "register_operand" "=x") + (vec_select:V2DI + (vec_concat:V4DI + (match_operand:V2DI 1 "register_operand" "0") + (match_operand:V2DI 2 "nonimmediate_operand" "xm")) + (parallel [(const_int 1) + (const_int 3)])))] + "TARGET_SSE2" + "punpckhqdq\t{%2, %0|%0, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix_data16" "1") + (set_attr "mode" "TI")]) + +(define_insn "*avx_interleave_lowv2di" + [(set (match_operand:V2DI 0 "register_operand" "=x") + (vec_select:V2DI + (vec_concat:V4DI + (match_operand:V2DI 1 "register_operand" "x") + (match_operand:V2DI 2 "nonimmediate_operand" "xm")) + (parallel [(const_int 0) + (const_int 2)])))] + "TARGET_AVX" + "vpunpcklqdq\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + +(define_insn "vec_interleave_lowv2di" + [(set (match_operand:V2DI 0 "register_operand" "=x") + (vec_select:V2DI + (vec_concat:V4DI + (match_operand:V2DI 1 "register_operand" "0") + (match_operand:V2DI 2 "nonimmediate_operand" "xm")) + (parallel [(const_int 0) + (const_int 2)])))] + "TARGET_SSE2" + "punpcklqdq\t{%2, %0|%0, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix_data16" "1") + (set_attr "mode" "TI")]) + +(define_insn "*avx_shufpd_" + [(set (match_operand:SSEMODE2D 0 "register_operand" "=x") + (vec_select:SSEMODE2D + (vec_concat: + (match_operand:SSEMODE2D 1 "register_operand" "x") + (match_operand:SSEMODE2D 2 "nonimmediate_operand" "xm")) + (parallel [(match_operand 3 "const_0_to_1_operand" "") + (match_operand 4 "const_2_to_3_operand" "")])))] + "TARGET_AVX" +{ + int mask; + mask = INTVAL (operands[3]); + mask |= (INTVAL (operands[4]) - 2) << 1; + operands[3] = GEN_INT (mask); + + return "vshufpd\t{%3, %2, %1, %0|%0, %1, %2, %3}"; +} + [(set_attr "type" "sselog") + (set_attr "length_immediate" "1") + (set_attr "prefix" "vex") + (set_attr "mode" "V2DF")]) + +(define_insn "sse2_shufpd_" + [(set (match_operand:SSEMODE2D 0 "register_operand" "=x") + (vec_select:SSEMODE2D + (vec_concat: + (match_operand:SSEMODE2D 1 "register_operand" "0") + (match_operand:SSEMODE2D 2 "nonimmediate_operand" "xm")) + (parallel [(match_operand 3 "const_0_to_1_operand" "") + (match_operand 4 "const_2_to_3_operand" "")])))] + "TARGET_SSE2" +{ + int mask; + mask = INTVAL (operands[3]); + mask |= (INTVAL (operands[4]) - 2) << 1; + operands[3] = GEN_INT (mask); + + return "shufpd\t{%3, %2, 
%0|%0, %2, %3}"; +} + [(set_attr "type" "sselog") + (set_attr "length_immediate" "1") + (set_attr "mode" "V2DF")]) + +;; Avoid combining registers from different units in a single alternative, +;; see comment above inline_secondary_memory_needed function in i386.c +(define_insn "*avx_storehpd" + [(set (match_operand:DF 0 "nonimmediate_operand" "=m,x,x,*f,r") + (vec_select:DF + (match_operand:V2DF 1 "nonimmediate_operand" " x,x,o,o,o") + (parallel [(const_int 1)])))] + "TARGET_AVX && !(MEM_P (operands[0]) && MEM_P (operands[1]))" + "@ + vmovhpd\t{%1, %0|%0, %1} + vunpckhpd\t{%1, %1, %0|%0, %1, %1} + # + # + #" + [(set_attr "type" "ssemov,sselog1,ssemov,fmov,imov") + (set_attr "prefix" "vex") + (set_attr "mode" "V1DF,V2DF,DF,DF,DF")]) + +;; Avoid combining registers from different units in a single alternative, +;; see comment above inline_secondary_memory_needed function in i386.c +(define_insn "sse2_storehpd" + [(set (match_operand:DF 0 "nonimmediate_operand" "=m,x,x,*f,r") + (vec_select:DF + (match_operand:V2DF 1 "nonimmediate_operand" " x,0,o,o,o") + (parallel [(const_int 1)])))] + "TARGET_SSE2 && !(MEM_P (operands[0]) && MEM_P (operands[1]))" + "@ + movhpd\t{%1, %0|%0, %1} + unpckhpd\t%0, %0 + # + # + #" + [(set_attr "type" "ssemov,sselog1,ssemov,fmov,imov") + (set_attr "prefix_data16" "1,*,*,*,*") + (set_attr "mode" "V1DF,V2DF,DF,DF,DF")]) + +(define_split + [(set (match_operand:DF 0 "register_operand" "") + (vec_select:DF + (match_operand:V2DF 1 "memory_operand" "") + (parallel [(const_int 1)])))] + "TARGET_SSE2 && reload_completed" + [(set (match_dup 0) (match_dup 1))] + "operands[1] = adjust_address (operands[1], DFmode, 8);") + +;; Avoid combining registers from different units in a single alternative, +;; see comment above inline_secondary_memory_needed function in i386.c +(define_insn "sse2_storelpd" + [(set (match_operand:DF 0 "nonimmediate_operand" "=m,x,x,*f,r") + (vec_select:DF + (match_operand:V2DF 1 "nonimmediate_operand" " x,x,m,m,m") + (parallel [(const_int 0)])))] + "TARGET_SSE2 && !(MEM_P (operands[0]) && MEM_P (operands[1]))" + "@ + %vmovlpd\t{%1, %0|%0, %1} + # + # + # + #" + [(set_attr "type" "ssemov,ssemov,ssemov,fmov,imov") + (set_attr "prefix_data16" "1,*,*,*,*") + (set_attr "prefix" "maybe_vex") + (set_attr "mode" "V1DF,DF,DF,DF,DF")]) + +(define_split + [(set (match_operand:DF 0 "register_operand" "") + (vec_select:DF + (match_operand:V2DF 1 "nonimmediate_operand" "") + (parallel [(const_int 0)])))] + "TARGET_SSE2 && reload_completed" + [(const_int 0)] +{ + rtx op1 = operands[1]; + if (REG_P (op1)) + op1 = gen_rtx_REG (DFmode, REGNO (op1)); + else + op1 = gen_lowpart (DFmode, op1); + emit_move_insn (operands[0], op1); + DONE; +}) + +(define_expand "sse2_loadhpd_exp" + [(set (match_operand:V2DF 0 "nonimmediate_operand" "") + (vec_concat:V2DF + (vec_select:DF + (match_operand:V2DF 1 "nonimmediate_operand" "") + (parallel [(const_int 0)])) + (match_operand:DF 2 "nonimmediate_operand" "")))] + "TARGET_SSE2" +{ + rtx dst = ix86_fixup_binary_operands (UNKNOWN, V2DFmode, operands); + + emit_insn (gen_sse2_loadhpd (dst, operands[1], operands[2])); + + /* Fix up the destination if needed. 
*/ + if (dst != operands[0]) + emit_move_insn (operands[0], dst); + + DONE; +}) + +;; Avoid combining registers from different units in a single alternative, +;; see comment above inline_secondary_memory_needed function in i386.c +(define_insn "*avx_loadhpd" + [(set (match_operand:V2DF 0 "nonimmediate_operand" "=x,x,o,o,o") + (vec_concat:V2DF + (vec_select:DF + (match_operand:V2DF 1 "nonimmediate_operand" " x,x,0,0,0") + (parallel [(const_int 0)])) + (match_operand:DF 2 "nonimmediate_operand" " m,x,x,*f,r")))] + "TARGET_AVX && !(MEM_P (operands[1]) && MEM_P (operands[2]))" + "@ + vmovhpd\t{%2, %1, %0|%0, %1, %2} + vunpcklpd\t{%2, %1, %0|%0, %1, %2} + # + # + #" + [(set_attr "type" "ssemov,sselog,ssemov,fmov,imov") + (set_attr "prefix" "vex") + (set_attr "mode" "V1DF,V2DF,DF,DF,DF")]) + +;; Avoid combining registers from different units in a single alternative, +;; see comment above inline_secondary_memory_needed function in i386.c +(define_insn "sse2_loadhpd" + [(set (match_operand:V2DF 0 "nonimmediate_operand" "=x,x,o,o,o") + (vec_concat:V2DF + (vec_select:DF + (match_operand:V2DF 1 "nonimmediate_operand" " 0,0,0,0,0") + (parallel [(const_int 0)])) + (match_operand:DF 2 "nonimmediate_operand" " m,x,x,*f,r")))] + "TARGET_SSE2 && !(MEM_P (operands[1]) && MEM_P (operands[2]))" + "@ + movhpd\t{%2, %0|%0, %2} + unpcklpd\t{%2, %0|%0, %2} + # + # + #" + [(set_attr "type" "ssemov,sselog,ssemov,fmov,imov") + (set_attr "prefix_data16" "1,*,*,*,*") + (set_attr "mode" "V1DF,V2DF,DF,DF,DF")]) + +(define_split + [(set (match_operand:V2DF 0 "memory_operand" "") + (vec_concat:V2DF + (vec_select:DF (match_dup 0) (parallel [(const_int 0)])) + (match_operand:DF 1 "register_operand" "")))] + "TARGET_SSE2 && reload_completed" + [(set (match_dup 0) (match_dup 1))] + "operands[0] = adjust_address (operands[0], DFmode, 8);") + +(define_expand "sse2_loadlpd_exp" + [(set (match_operand:V2DF 0 "nonimmediate_operand" "") + (vec_concat:V2DF + (match_operand:DF 2 "nonimmediate_operand" "") + (vec_select:DF + (match_operand:V2DF 1 "nonimmediate_operand" "") + (parallel [(const_int 1)]))))] + "TARGET_SSE2" +{ + rtx dst = ix86_fixup_binary_operands (UNKNOWN, V2DFmode, operands); + + emit_insn (gen_sse2_loadlpd (dst, operands[1], operands[2])); + + /* Fix up the destination if needed. 
*/ + if (dst != operands[0]) + emit_move_insn (operands[0], dst); + + DONE; +}) + +;; Avoid combining registers from different units in a single alternative, +;; see comment above inline_secondary_memory_needed function in i386.c +(define_insn "*avx_loadlpd" + [(set (match_operand:V2DF 0 "nonimmediate_operand" "=x,x,x,x,m,m,m") + (vec_concat:V2DF + (match_operand:DF 2 "nonimmediate_operand" " m,m,x,x,x,*f,r") + (vec_select:DF + (match_operand:V2DF 1 "vector_move_operand" " C,x,x,o,0,0,0") + (parallel [(const_int 1)]))))] + "TARGET_AVX && !(MEM_P (operands[1]) && MEM_P (operands[2]))" + "@ + vmovsd\t{%2, %0|%0, %2} + vmovlpd\t{%2, %1, %0|%0, %1, %2} + vmovsd\t{%2, %1, %0|%0, %1, %2} + vmovhpd\t{%H1, %2, %0|%0, %2, %H1} + # + # + #" + [(set_attr "type" "ssemov,ssemov,ssemov,ssemov,ssemov,fmov,imov") + (set_attr "prefix" "vex") + (set_attr "mode" "DF,V1DF,V1DF,V1DF,DF,DF,DF")]) + +;; Avoid combining registers from different units in a single alternative, +;; see comment above inline_secondary_memory_needed function in i386.c +(define_insn "sse2_loadlpd" + [(set (match_operand:V2DF 0 "nonimmediate_operand" "=x,x,x,x,x,m,m,m") + (vec_concat:V2DF + (match_operand:DF 2 "nonimmediate_operand" " m,m,x,0,0,x,*f,r") + (vec_select:DF + (match_operand:V2DF 1 "vector_move_operand" " C,0,0,x,o,0,0,0") + (parallel [(const_int 1)]))))] + "TARGET_SSE2 && !(MEM_P (operands[1]) && MEM_P (operands[2]))" + "@ + movsd\t{%2, %0|%0, %2} + movlpd\t{%2, %0|%0, %2} + movsd\t{%2, %0|%0, %2} + shufpd\t{$2, %1, %0|%0, %1, 2} + movhpd\t{%H1, %0|%0, %H1} + # + # + #" + [(set_attr "type" "ssemov,ssemov,ssemov,sselog,ssemov,ssemov,fmov,imov") + (set_attr "prefix_data16" "*,1,*,*,1,*,*,*") + (set_attr "length_immediate" "*,*,*,1,*,*,*,*") + (set_attr "mode" "DF,V1DF,V1DF,V2DF,V1DF,DF,DF,DF")]) + +(define_split + [(set (match_operand:V2DF 0 "memory_operand" "") + (vec_concat:V2DF + (match_operand:DF 1 "register_operand" "") + (vec_select:DF (match_dup 0) (parallel [(const_int 1)]))))] + "TARGET_SSE2 && reload_completed" + [(set (match_dup 0) (match_dup 1))] + "operands[0] = adjust_address (operands[0], DFmode, 0);") + +;; Not sure these two are ever used, but it doesn't hurt to have +;; them. 
-aoliva +(define_insn "*vec_extractv2df_1_sse" + [(set (match_operand:DF 0 "nonimmediate_operand" "=m,x,x") + (vec_select:DF + (match_operand:V2DF 1 "nonimmediate_operand" "x,x,o") + (parallel [(const_int 1)])))] + "!TARGET_SSE2 && TARGET_SSE + && !(MEM_P (operands[0]) && MEM_P (operands[1]))" + "@ + movhps\t{%1, %0|%0, %1} + movhlps\t{%1, %0|%0, %1} + movlps\t{%H1, %0|%0, %H1}" + [(set_attr "type" "ssemov") + (set_attr "mode" "V2SF,V4SF,V2SF")]) + +(define_insn "*vec_extractv2df_0_sse" + [(set (match_operand:DF 0 "nonimmediate_operand" "=m,x,x") + (vec_select:DF + (match_operand:V2DF 1 "nonimmediate_operand" "x,x,m") + (parallel [(const_int 0)])))] + "!TARGET_SSE2 && TARGET_SSE + && !(MEM_P (operands[0]) && MEM_P (operands[1]))" + "@ + movlps\t{%1, %0|%0, %1} + movaps\t{%1, %0|%0, %1} + movlps\t{%1, %0|%0, %1}" + [(set_attr "type" "ssemov") + (set_attr "mode" "V2SF,V4SF,V2SF")]) + +(define_insn "*avx_movsd" + [(set (match_operand:V2DF 0 "nonimmediate_operand" "=x,x,m,x,o") + (vec_merge:V2DF + (match_operand:V2DF 2 "nonimmediate_operand" " x,m,x,x,0") + (match_operand:V2DF 1 "nonimmediate_operand" " x,x,0,o,x") + (const_int 1)))] + "TARGET_AVX" + "@ + vmovsd\t{%2, %1, %0|%0, %1, %2} + vmovlpd\t{%2, %1, %0|%0, %1, %2} + vmovlpd\t{%2, %0|%0, %2} + vmovhps\t{%H1, %2, %0|%0, %2, %H1} + vmovhps\t{%1, %H0|%H0, %1}" + [(set_attr "type" "ssemov,ssemov,ssemov,ssemov,ssemov") + (set_attr "prefix" "vex") + (set_attr "mode" "DF,V1DF,V1DF,V1DF,V1DF")]) + +(define_insn "sse2_movsd" + [(set (match_operand:V2DF 0 "nonimmediate_operand" "=x,x,m,x,x,o") + (vec_merge:V2DF + (match_operand:V2DF 2 "nonimmediate_operand" " x,m,x,0,0,0") + (match_operand:V2DF 1 "nonimmediate_operand" " 0,0,0,x,o,x") + (const_int 1)))] + "TARGET_SSE2" + "@ + movsd\t{%2, %0|%0, %2} + movlpd\t{%2, %0|%0, %2} + movlpd\t{%2, %0|%0, %2} + shufpd\t{$2, %1, %0|%0, %1, 2} + movhps\t{%H1, %0|%0, %H1} + movhps\t{%1, %H0|%H0, %1}" + [(set_attr "type" "ssemov,ssemov,ssemov,sselog,ssemov,ssemov") + (set_attr "prefix_data16" "*,1,1,*,*,*") + (set_attr "length_immediate" "*,*,*,1,*,*") + (set_attr "mode" "DF,V1DF,V1DF,V2DF,V1DF,V1DF")]) + +(define_expand "vec_dupv2df" + [(set (match_operand:V2DF 0 "register_operand" "") + (vec_duplicate:V2DF + (match_operand:DF 1 "nonimmediate_operand" "")))] + "TARGET_SSE2" +{ + if (!TARGET_SSE3) + operands[1] = force_reg (DFmode, operands[1]); +}) + +(define_insn "*vec_dupv2df_sse3" + [(set (match_operand:V2DF 0 "register_operand" "=x") + (vec_duplicate:V2DF + (match_operand:DF 1 "nonimmediate_operand" "xm")))] + "TARGET_SSE3" + "%vmovddup\t{%1, %0|%0, %1}" + [(set_attr "type" "sselog1") + (set_attr "prefix" "maybe_vex") + (set_attr "mode" "DF")]) + +(define_insn "*vec_dupv2df" + [(set (match_operand:V2DF 0 "register_operand" "=x") + (vec_duplicate:V2DF + (match_operand:DF 1 "register_operand" "0")))] + "TARGET_SSE2" + "unpcklpd\t%0, %0" + [(set_attr "type" "sselog1") + (set_attr "mode" "V2DF")]) + +(define_insn "*vec_concatv2df_sse3" + [(set (match_operand:V2DF 0 "register_operand" "=x") + (vec_concat:V2DF + (match_operand:DF 1 "nonimmediate_operand" "xm") + (match_dup 1)))] + "TARGET_SSE3" + "%vmovddup\t{%1, %0|%0, %1}" + [(set_attr "type" "sselog1") + (set_attr "prefix" "maybe_vex") + (set_attr "mode" "DF")]) + +(define_insn "*vec_concatv2df_avx" + [(set (match_operand:V2DF 0 "register_operand" "=x,x,x") + (vec_concat:V2DF + (match_operand:DF 1 "nonimmediate_operand" " x,x,m") + (match_operand:DF 2 "vector_move_operand" " x,m,C")))] + "TARGET_AVX" + "@ + vunpcklpd\t{%2, %1, %0|%0, %1, %2} + vmovhpd\t{%2, 
%1, %0|%0, %1, %2} + vmovsd\t{%1, %0|%0, %1}" + [(set_attr "type" "ssemov") + (set_attr "prefix" "vex") + (set_attr "mode" "DF,V1DF,DF")]) + +(define_insn "*vec_concatv2df" + [(set (match_operand:V2DF 0 "register_operand" "=Y2,Y2,Y2,x,x") + (vec_concat:V2DF + (match_operand:DF 1 "nonimmediate_operand" " 0 ,0 ,m ,0,0") + (match_operand:DF 2 "vector_move_operand" " Y2,m ,C ,x,m")))] + "TARGET_SSE" + "@ + unpcklpd\t{%2, %0|%0, %2} + movhpd\t{%2, %0|%0, %2} + movsd\t{%1, %0|%0, %1} + movlhps\t{%2, %0|%0, %2} + movhps\t{%2, %0|%0, %2}" + [(set_attr "type" "sselog,ssemov,ssemov,ssemov,ssemov") + (set_attr "prefix_data16" "*,1,*,*,*") + (set_attr "mode" "V2DF,V1DF,DF,V4SF,V2SF")]) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; +;; Parallel integral arithmetic +;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(define_expand "neg2" + [(set (match_operand:SSEMODEI 0 "register_operand" "") + (minus:SSEMODEI + (match_dup 2) + (match_operand:SSEMODEI 1 "nonimmediate_operand" "")))] + "TARGET_SSE2" + "operands[2] = force_reg (mode, CONST0_RTX (mode));") + +(define_expand "3" + [(set (match_operand:SSEMODEI 0 "register_operand" "") + (plusminus:SSEMODEI + (match_operand:SSEMODEI 1 "nonimmediate_operand" "") + (match_operand:SSEMODEI 2 "nonimmediate_operand" "")))] + "TARGET_SSE2" + "ix86_fixup_binary_operands_no_copy (, mode, operands);") + +(define_insn "*avx_3" + [(set (match_operand:SSEMODEI 0 "register_operand" "=x") + (plusminus:SSEMODEI + (match_operand:SSEMODEI 1 "nonimmediate_operand" "x") + (match_operand:SSEMODEI 2 "nonimmediate_operand" "xm")))] + "TARGET_AVX && ix86_binary_operator_ok (, mode, operands)" + "vp\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseiadd") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + +(define_insn "*3" + [(set (match_operand:SSEMODEI 0 "register_operand" "=x") + (plusminus:SSEMODEI + (match_operand:SSEMODEI 1 "nonimmediate_operand" "0") + (match_operand:SSEMODEI 2 "nonimmediate_operand" "xm")))] + "TARGET_SSE2 && ix86_binary_operator_ok (, mode, operands)" + "p\t{%2, %0|%0, %2}" + [(set_attr "type" "sseiadd") + (set_attr "prefix_data16" "1") + (set_attr "mode" "TI")]) + +(define_expand "sse2_3" + [(set (match_operand:SSEMODE12 0 "register_operand" "") + (sat_plusminus:SSEMODE12 + (match_operand:SSEMODE12 1 "nonimmediate_operand" "") + (match_operand:SSEMODE12 2 "nonimmediate_operand" "")))] + "TARGET_SSE2" + "ix86_fixup_binary_operands_no_copy (, mode, operands);") + +(define_insn "*avx_3" + [(set (match_operand:SSEMODE12 0 "register_operand" "=x") + (sat_plusminus:SSEMODE12 + (match_operand:SSEMODE12 1 "nonimmediate_operand" "x") + (match_operand:SSEMODE12 2 "nonimmediate_operand" "xm")))] + "TARGET_AVX && ix86_binary_operator_ok (, mode, operands)" + "vp\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseiadd") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + +(define_insn "*sse2_3" + [(set (match_operand:SSEMODE12 0 "register_operand" "=x") + (sat_plusminus:SSEMODE12 + (match_operand:SSEMODE12 1 "nonimmediate_operand" "0") + (match_operand:SSEMODE12 2 "nonimmediate_operand" "xm")))] + "TARGET_SSE2 && ix86_binary_operator_ok (, mode, operands)" + "p\t{%2, %0|%0, %2}" + [(set_attr "type" "sseiadd") + (set_attr "prefix_data16" "1") + (set_attr "mode" "TI")]) + +(define_insn_and_split "mulv16qi3" + [(set (match_operand:V16QI 0 "register_operand" "") + (mult:V16QI (match_operand:V16QI 1 "register_operand" "") + (match_operand:V16QI 2 "register_operand" "")))] + "TARGET_SSE2 + && 
can_create_pseudo_p ()" + "#" + "&& 1" + [(const_int 0)] +{ + rtx t[6]; + int i; + + for (i = 0; i < 6; ++i) + t[i] = gen_reg_rtx (V16QImode); + + /* Unpack data such that we've got a source byte in each low byte of + each word. We don't care what goes into the high byte of each word. + Rather than trying to get zero in there, most convenient is to let + it be a copy of the low byte. */ + emit_insn (gen_vec_interleave_highv16qi (t[0], operands[1], operands[1])); + emit_insn (gen_vec_interleave_highv16qi (t[1], operands[2], operands[2])); + emit_insn (gen_vec_interleave_lowv16qi (t[2], operands[1], operands[1])); + emit_insn (gen_vec_interleave_lowv16qi (t[3], operands[2], operands[2])); + + /* Multiply words. The end-of-line annotations here give a picture of what + the output of that instruction looks like. Dot means don't care; the + letters are the bytes of the result with A being the most significant. */ + emit_insn (gen_mulv8hi3 (gen_lowpart (V8HImode, t[4]), /* .A.B.C.D.E.F.G.H */ + gen_lowpart (V8HImode, t[0]), + gen_lowpart (V8HImode, t[1]))); + emit_insn (gen_mulv8hi3 (gen_lowpart (V8HImode, t[5]), /* .I.J.K.L.M.N.O.P */ + gen_lowpart (V8HImode, t[2]), + gen_lowpart (V8HImode, t[3]))); + + /* Extract the even bytes and merge them back together. */ + ix86_expand_vec_extract_even_odd (operands[0], t[5], t[4], 0); + DONE; +}) + +(define_expand "mulv8hi3" + [(set (match_operand:V8HI 0 "register_operand" "") + (mult:V8HI (match_operand:V8HI 1 "nonimmediate_operand" "") + (match_operand:V8HI 2 "nonimmediate_operand" "")))] + "TARGET_SSE2" + "ix86_fixup_binary_operands_no_copy (MULT, V8HImode, operands);") + +(define_insn "*avx_mulv8hi3" + [(set (match_operand:V8HI 0 "register_operand" "=x") + (mult:V8HI (match_operand:V8HI 1 "nonimmediate_operand" "%x") + (match_operand:V8HI 2 "nonimmediate_operand" "xm")))] + "TARGET_AVX && ix86_binary_operator_ok (MULT, V8HImode, operands)" + "vpmullw\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseimul") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + +(define_insn "*mulv8hi3" + [(set (match_operand:V8HI 0 "register_operand" "=x") + (mult:V8HI (match_operand:V8HI 1 "nonimmediate_operand" "%0") + (match_operand:V8HI 2 "nonimmediate_operand" "xm")))] + "TARGET_SSE2 && ix86_binary_operator_ok (MULT, V8HImode, operands)" + "pmullw\t{%2, %0|%0, %2}" + [(set_attr "type" "sseimul") + (set_attr "prefix_data16" "1") + (set_attr "mode" "TI")]) + +(define_expand "mulv8hi3_highpart" + [(set (match_operand:V8HI 0 "register_operand" "") + (truncate:V8HI + (lshiftrt:V8SI + (mult:V8SI + (any_extend:V8SI + (match_operand:V8HI 1 "nonimmediate_operand" "")) + (any_extend:V8SI + (match_operand:V8HI 2 "nonimmediate_operand" ""))) + (const_int 16))))] + "TARGET_SSE2" + "ix86_fixup_binary_operands_no_copy (MULT, V8HImode, operands);") + +(define_insn "*avx_mulv8hi3_highpart" + [(set (match_operand:V8HI 0 "register_operand" "=x") + (truncate:V8HI + (lshiftrt:V8SI + (mult:V8SI + (any_extend:V8SI + (match_operand:V8HI 1 "nonimmediate_operand" "%x")) + (any_extend:V8SI + (match_operand:V8HI 2 "nonimmediate_operand" "xm"))) + (const_int 16))))] + "TARGET_AVX && ix86_binary_operator_ok (MULT, V8HImode, operands)" + "vpmulhw\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseimul") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + +(define_insn "*mulv8hi3_highpart" + [(set (match_operand:V8HI 0 "register_operand" "=x") + (truncate:V8HI + (lshiftrt:V8SI + (mult:V8SI + (any_extend:V8SI + (match_operand:V8HI 1 "nonimmediate_operand" "%0")) + (any_extend:V8SI + 
(match_operand:V8HI 2 "nonimmediate_operand" "xm"))) + (const_int 16))))] + "TARGET_SSE2 && ix86_binary_operator_ok (MULT, V8HImode, operands)" + "pmulhw\t{%2, %0|%0, %2}" + [(set_attr "type" "sseimul") + (set_attr "prefix_data16" "1") + (set_attr "mode" "TI")]) + +(define_expand "sse2_umulv2siv2di3" + [(set (match_operand:V2DI 0 "register_operand" "") + (mult:V2DI + (zero_extend:V2DI + (vec_select:V2SI + (match_operand:V4SI 1 "nonimmediate_operand" "") + (parallel [(const_int 0) (const_int 2)]))) + (zero_extend:V2DI + (vec_select:V2SI + (match_operand:V4SI 2 "nonimmediate_operand" "") + (parallel [(const_int 0) (const_int 2)])))))] + "TARGET_SSE2" + "ix86_fixup_binary_operands_no_copy (MULT, V4SImode, operands);") + +(define_insn "*avx_umulv2siv2di3" + [(set (match_operand:V2DI 0 "register_operand" "=x") + (mult:V2DI + (zero_extend:V2DI + (vec_select:V2SI + (match_operand:V4SI 1 "nonimmediate_operand" "%x") + (parallel [(const_int 0) (const_int 2)]))) + (zero_extend:V2DI + (vec_select:V2SI + (match_operand:V4SI 2 "nonimmediate_operand" "xm") + (parallel [(const_int 0) (const_int 2)])))))] + "TARGET_AVX && ix86_binary_operator_ok (MULT, V4SImode, operands)" + "vpmuludq\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseimul") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + +(define_insn "*sse2_umulv2siv2di3" + [(set (match_operand:V2DI 0 "register_operand" "=x") + (mult:V2DI + (zero_extend:V2DI + (vec_select:V2SI + (match_operand:V4SI 1 "nonimmediate_operand" "%0") + (parallel [(const_int 0) (const_int 2)]))) + (zero_extend:V2DI + (vec_select:V2SI + (match_operand:V4SI 2 "nonimmediate_operand" "xm") + (parallel [(const_int 0) (const_int 2)])))))] + "TARGET_SSE2 && ix86_binary_operator_ok (MULT, V4SImode, operands)" + "pmuludq\t{%2, %0|%0, %2}" + [(set_attr "type" "sseimul") + (set_attr "prefix_data16" "1") + (set_attr "mode" "TI")]) + +(define_expand "sse4_1_mulv2siv2di3" + [(set (match_operand:V2DI 0 "register_operand" "") + (mult:V2DI + (sign_extend:V2DI + (vec_select:V2SI + (match_operand:V4SI 1 "nonimmediate_operand" "") + (parallel [(const_int 0) (const_int 2)]))) + (sign_extend:V2DI + (vec_select:V2SI + (match_operand:V4SI 2 "nonimmediate_operand" "") + (parallel [(const_int 0) (const_int 2)])))))] + "TARGET_SSE4_1" + "ix86_fixup_binary_operands_no_copy (MULT, V4SImode, operands);") + +(define_insn "*avx_mulv2siv2di3" + [(set (match_operand:V2DI 0 "register_operand" "=x") + (mult:V2DI + (sign_extend:V2DI + (vec_select:V2SI + (match_operand:V4SI 1 "nonimmediate_operand" "%x") + (parallel [(const_int 0) (const_int 2)]))) + (sign_extend:V2DI + (vec_select:V2SI + (match_operand:V4SI 2 "nonimmediate_operand" "xm") + (parallel [(const_int 0) (const_int 2)])))))] + "TARGET_AVX && ix86_binary_operator_ok (MULT, V4SImode, operands)" + "vpmuldq\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseimul") + (set_attr "prefix_extra" "1") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + +(define_insn "*sse4_1_mulv2siv2di3" + [(set (match_operand:V2DI 0 "register_operand" "=x") + (mult:V2DI + (sign_extend:V2DI + (vec_select:V2SI + (match_operand:V4SI 1 "nonimmediate_operand" "%0") + (parallel [(const_int 0) (const_int 2)]))) + (sign_extend:V2DI + (vec_select:V2SI + (match_operand:V4SI 2 "nonimmediate_operand" "xm") + (parallel [(const_int 0) (const_int 2)])))))] + "TARGET_SSE4_1 && ix86_binary_operator_ok (MULT, V4SImode, operands)" + "pmuldq\t{%2, %0|%0, %2}" + [(set_attr "type" "sseimul") + (set_attr "prefix_extra" "1") + (set_attr "mode" "TI")]) + +(define_expand "sse2_pmaddwd" + 
[(set (match_operand:V4SI 0 "register_operand" "") + (plus:V4SI + (mult:V4SI + (sign_extend:V4SI + (vec_select:V4HI + (match_operand:V8HI 1 "nonimmediate_operand" "") + (parallel [(const_int 0) + (const_int 2) + (const_int 4) + (const_int 6)]))) + (sign_extend:V4SI + (vec_select:V4HI + (match_operand:V8HI 2 "nonimmediate_operand" "") + (parallel [(const_int 0) + (const_int 2) + (const_int 4) + (const_int 6)])))) + (mult:V4SI + (sign_extend:V4SI + (vec_select:V4HI (match_dup 1) + (parallel [(const_int 1) + (const_int 3) + (const_int 5) + (const_int 7)]))) + (sign_extend:V4SI + (vec_select:V4HI (match_dup 2) + (parallel [(const_int 1) + (const_int 3) + (const_int 5) + (const_int 7)]))))))] + "TARGET_SSE2" + "ix86_fixup_binary_operands_no_copy (MULT, V8HImode, operands);") + +(define_insn "*avx_pmaddwd" + [(set (match_operand:V4SI 0 "register_operand" "=x") + (plus:V4SI + (mult:V4SI + (sign_extend:V4SI + (vec_select:V4HI + (match_operand:V8HI 1 "nonimmediate_operand" "%x") + (parallel [(const_int 0) + (const_int 2) + (const_int 4) + (const_int 6)]))) + (sign_extend:V4SI + (vec_select:V4HI + (match_operand:V8HI 2 "nonimmediate_operand" "xm") + (parallel [(const_int 0) + (const_int 2) + (const_int 4) + (const_int 6)])))) + (mult:V4SI + (sign_extend:V4SI + (vec_select:V4HI (match_dup 1) + (parallel [(const_int 1) + (const_int 3) + (const_int 5) + (const_int 7)]))) + (sign_extend:V4SI + (vec_select:V4HI (match_dup 2) + (parallel [(const_int 1) + (const_int 3) + (const_int 5) + (const_int 7)]))))))] + "TARGET_AVX && ix86_binary_operator_ok (MULT, V8HImode, operands)" + "vpmaddwd\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseiadd") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + +(define_insn "*sse2_pmaddwd" + [(set (match_operand:V4SI 0 "register_operand" "=x") + (plus:V4SI + (mult:V4SI + (sign_extend:V4SI + (vec_select:V4HI + (match_operand:V8HI 1 "nonimmediate_operand" "%0") + (parallel [(const_int 0) + (const_int 2) + (const_int 4) + (const_int 6)]))) + (sign_extend:V4SI + (vec_select:V4HI + (match_operand:V8HI 2 "nonimmediate_operand" "xm") + (parallel [(const_int 0) + (const_int 2) + (const_int 4) + (const_int 6)])))) + (mult:V4SI + (sign_extend:V4SI + (vec_select:V4HI (match_dup 1) + (parallel [(const_int 1) + (const_int 3) + (const_int 5) + (const_int 7)]))) + (sign_extend:V4SI + (vec_select:V4HI (match_dup 2) + (parallel [(const_int 1) + (const_int 3) + (const_int 5) + (const_int 7)]))))))] + "TARGET_SSE2 && ix86_binary_operator_ok (MULT, V8HImode, operands)" + "pmaddwd\t{%2, %0|%0, %2}" + [(set_attr "type" "sseiadd") + (set_attr "atom_unit" "simul") + (set_attr "prefix_data16" "1") + (set_attr "mode" "TI")]) + +(define_expand "mulv4si3" + [(set (match_operand:V4SI 0 "register_operand" "") + (mult:V4SI (match_operand:V4SI 1 "register_operand" "") + (match_operand:V4SI 2 "register_operand" "")))] + "TARGET_SSE2" +{ + if (TARGET_SSE4_1 || TARGET_AVX) + ix86_fixup_binary_operands_no_copy (MULT, V4SImode, operands); +}) + +(define_insn "*avx_mulv4si3" + [(set (match_operand:V4SI 0 "register_operand" "=x") + (mult:V4SI (match_operand:V4SI 1 "nonimmediate_operand" "%x") + (match_operand:V4SI 2 "nonimmediate_operand" "xm")))] + "TARGET_AVX && ix86_binary_operator_ok (MULT, V4SImode, operands)" + "vpmulld\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseimul") + (set_attr "prefix_extra" "1") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + +(define_insn "*sse4_1_mulv4si3" + [(set (match_operand:V4SI 0 "register_operand" "=x") + (mult:V4SI (match_operand:V4SI 1 
"nonimmediate_operand" "%0") + (match_operand:V4SI 2 "nonimmediate_operand" "xm")))] + "TARGET_SSE4_1 && ix86_binary_operator_ok (MULT, V4SImode, operands)" + "pmulld\t{%2, %0|%0, %2}" + [(set_attr "type" "sseimul") + (set_attr "prefix_extra" "1") + (set_attr "mode" "TI")]) + +(define_insn_and_split "*sse2_mulv4si3" + [(set (match_operand:V4SI 0 "register_operand" "") + (mult:V4SI (match_operand:V4SI 1 "register_operand" "") + (match_operand:V4SI 2 "register_operand" "")))] + "TARGET_SSE2 && !TARGET_SSE4_1 && !TARGET_AVX + && can_create_pseudo_p ()" + "#" + "&& 1" + [(const_int 0)] +{ + rtx t1, t2, t3, t4, t5, t6, thirtytwo; + rtx op0, op1, op2; + + op0 = operands[0]; + op1 = operands[1]; + op2 = operands[2]; + t1 = gen_reg_rtx (V4SImode); + t2 = gen_reg_rtx (V4SImode); + t3 = gen_reg_rtx (V4SImode); + t4 = gen_reg_rtx (V4SImode); + t5 = gen_reg_rtx (V4SImode); + t6 = gen_reg_rtx (V4SImode); + thirtytwo = GEN_INT (32); + + /* Multiply elements 2 and 0. */ + emit_insn (gen_sse2_umulv2siv2di3 (gen_lowpart (V2DImode, t1), + op1, op2)); + + /* Shift both input vectors down one element, so that elements 3 + and 1 are now in the slots for elements 2 and 0. For K8, at + least, this is faster than using a shuffle. */ + emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, t2), + gen_lowpart (V1TImode, op1), + thirtytwo)); + emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, t3), + gen_lowpart (V1TImode, op2), + thirtytwo)); + /* Multiply elements 3 and 1. */ + emit_insn (gen_sse2_umulv2siv2di3 (gen_lowpart (V2DImode, t4), + t2, t3)); + + /* Move the results in element 2 down to element 1; we don't care + what goes in elements 2 and 3. */ + emit_insn (gen_sse2_pshufd_1 (t5, t1, const0_rtx, const2_rtx, + const0_rtx, const0_rtx)); + emit_insn (gen_sse2_pshufd_1 (t6, t4, const0_rtx, const2_rtx, + const0_rtx, const0_rtx)); + + /* Merge the parts back together. */ + emit_insn (gen_vec_interleave_lowv4si (op0, t5, t6)); + DONE; +}) + +(define_insn_and_split "mulv2di3" + [(set (match_operand:V2DI 0 "register_operand" "") + (mult:V2DI (match_operand:V2DI 1 "register_operand" "") + (match_operand:V2DI 2 "register_operand" "")))] + "TARGET_SSE2 + && can_create_pseudo_p ()" + "#" + "&& 1" + [(const_int 0)] +{ + rtx t1, t2, t3, t4, t5, t6, thirtytwo; + rtx op0, op1, op2; + + op0 = operands[0]; + op1 = operands[1]; + op2 = operands[2]; + + if (TARGET_XOP) + { + /* op1: A,B,C,D, op2: E,F,G,H */ + op1 = gen_lowpart (V4SImode, op1); + op2 = gen_lowpart (V4SImode, op2); + + t1 = gen_reg_rtx (V4SImode); + t2 = gen_reg_rtx (V4SImode); + t3 = gen_reg_rtx (V2DImode); + t4 = gen_reg_rtx (V2DImode); + + /* t1: B,A,D,C */ + emit_insn (gen_sse2_pshufd_1 (t1, op1, + GEN_INT (1), + GEN_INT (0), + GEN_INT (3), + GEN_INT (2))); + + /* t2: (B*E),(A*F),(D*G),(C*H) */ + emit_insn (gen_mulv4si3 (t2, t1, op2)); + + /* t4: (B*E)+(A*F), (D*G)+(C*H) */ + emit_insn (gen_xop_phadddq (t3, t2)); + + /* t5: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */ + emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32))); + + /* op0: (((B*E)+(A*F))<<32)+(B*F), (((D*G)+(C*H))<<32)+(D*H) */ + emit_insn (gen_xop_pmacsdql (op0, op1, op2, t4)); + } + else + { + t1 = gen_reg_rtx (V2DImode); + t2 = gen_reg_rtx (V2DImode); + t3 = gen_reg_rtx (V2DImode); + t4 = gen_reg_rtx (V2DImode); + t5 = gen_reg_rtx (V2DImode); + t6 = gen_reg_rtx (V2DImode); + thirtytwo = GEN_INT (32); + + /* Multiply low parts. 
*/ + emit_insn (gen_sse2_umulv2siv2di3 (t1, gen_lowpart (V4SImode, op1), + gen_lowpart (V4SImode, op2))); + + /* Shift input vectors left 32 bits so we can multiply high parts. */ + emit_insn (gen_lshrv2di3 (t2, op1, thirtytwo)); + emit_insn (gen_lshrv2di3 (t3, op2, thirtytwo)); + + /* Multiply high parts by low parts. */ + emit_insn (gen_sse2_umulv2siv2di3 (t4, gen_lowpart (V4SImode, op1), + gen_lowpart (V4SImode, t3))); + emit_insn (gen_sse2_umulv2siv2di3 (t5, gen_lowpart (V4SImode, op2), + gen_lowpart (V4SImode, t2))); + + /* Shift them back. */ + emit_insn (gen_ashlv2di3 (t4, t4, thirtytwo)); + emit_insn (gen_ashlv2di3 (t5, t5, thirtytwo)); + + /* Add the three parts together. */ + emit_insn (gen_addv2di3 (t6, t1, t4)); + emit_insn (gen_addv2di3 (op0, t6, t5)); + } + DONE; +}) + +(define_expand "vec_widen_smult_hi_v8hi" + [(match_operand:V4SI 0 "register_operand" "") + (match_operand:V8HI 1 "register_operand" "") + (match_operand:V8HI 2 "register_operand" "")] + "TARGET_SSE2" +{ + rtx op1, op2, t1, t2, dest; + + op1 = operands[1]; + op2 = operands[2]; + t1 = gen_reg_rtx (V8HImode); + t2 = gen_reg_rtx (V8HImode); + dest = gen_lowpart (V8HImode, operands[0]); + + emit_insn (gen_mulv8hi3 (t1, op1, op2)); + emit_insn (gen_smulv8hi3_highpart (t2, op1, op2)); + emit_insn (gen_vec_interleave_highv8hi (dest, t1, t2)); + DONE; +}) + +(define_expand "vec_widen_smult_lo_v8hi" + [(match_operand:V4SI 0 "register_operand" "") + (match_operand:V8HI 1 "register_operand" "") + (match_operand:V8HI 2 "register_operand" "")] + "TARGET_SSE2" +{ + rtx op1, op2, t1, t2, dest; + + op1 = operands[1]; + op2 = operands[2]; + t1 = gen_reg_rtx (V8HImode); + t2 = gen_reg_rtx (V8HImode); + dest = gen_lowpart (V8HImode, operands[0]); + + emit_insn (gen_mulv8hi3 (t1, op1, op2)); + emit_insn (gen_smulv8hi3_highpart (t2, op1, op2)); + emit_insn (gen_vec_interleave_lowv8hi (dest, t1, t2)); + DONE; +}) + +(define_expand "vec_widen_umult_hi_v8hi" + [(match_operand:V4SI 0 "register_operand" "") + (match_operand:V8HI 1 "register_operand" "") + (match_operand:V8HI 2 "register_operand" "")] + "TARGET_SSE2" +{ + rtx op1, op2, t1, t2, dest; + + op1 = operands[1]; + op2 = operands[2]; + t1 = gen_reg_rtx (V8HImode); + t2 = gen_reg_rtx (V8HImode); + dest = gen_lowpart (V8HImode, operands[0]); + + emit_insn (gen_mulv8hi3 (t1, op1, op2)); + emit_insn (gen_umulv8hi3_highpart (t2, op1, op2)); + emit_insn (gen_vec_interleave_highv8hi (dest, t1, t2)); + DONE; +}) + +(define_expand "vec_widen_umult_lo_v8hi" + [(match_operand:V4SI 0 "register_operand" "") + (match_operand:V8HI 1 "register_operand" "") + (match_operand:V8HI 2 "register_operand" "")] + "TARGET_SSE2" +{ + rtx op1, op2, t1, t2, dest; + + op1 = operands[1]; + op2 = operands[2]; + t1 = gen_reg_rtx (V8HImode); + t2 = gen_reg_rtx (V8HImode); + dest = gen_lowpart (V8HImode, operands[0]); + + emit_insn (gen_mulv8hi3 (t1, op1, op2)); + emit_insn (gen_umulv8hi3_highpart (t2, op1, op2)); + emit_insn (gen_vec_interleave_lowv8hi (dest, t1, t2)); + DONE; +}) + +(define_expand "vec_widen_smult_hi_v4si" + [(match_operand:V2DI 0 "register_operand" "") + (match_operand:V4SI 1 "register_operand" "") + (match_operand:V4SI 2 "register_operand" "")] + "TARGET_XOP" +{ + rtx t1, t2; + + t1 = gen_reg_rtx (V4SImode); + t2 = gen_reg_rtx (V4SImode); + + emit_insn (gen_sse2_pshufd_1 (t1, operands[1], + GEN_INT (0), + GEN_INT (2), + GEN_INT (1), + GEN_INT (3))); + emit_insn (gen_sse2_pshufd_1 (t2, operands[2], + GEN_INT (0), + GEN_INT (2), + GEN_INT (1), + GEN_INT (3))); + emit_insn 
(gen_xop_mulv2div2di3_high (operands[0], t1, t2)); + DONE; +}) + +(define_expand "vec_widen_smult_lo_v4si" + [(match_operand:V2DI 0 "register_operand" "") + (match_operand:V4SI 1 "register_operand" "") + (match_operand:V4SI 2 "register_operand" "")] + "TARGET_XOP" +{ + rtx t1, t2; + + t1 = gen_reg_rtx (V4SImode); + t2 = gen_reg_rtx (V4SImode); + + emit_insn (gen_sse2_pshufd_1 (t1, operands[1], + GEN_INT (0), + GEN_INT (2), + GEN_INT (1), + GEN_INT (3))); + emit_insn (gen_sse2_pshufd_1 (t2, operands[2], + GEN_INT (0), + GEN_INT (2), + GEN_INT (1), + GEN_INT (3))); + emit_insn (gen_xop_mulv2div2di3_low (operands[0], t1, t2)); + DONE; +}) + +(define_expand "vec_widen_umult_hi_v4si" + [(match_operand:V2DI 0 "register_operand" "") + (match_operand:V4SI 1 "register_operand" "") + (match_operand:V4SI 2 "register_operand" "")] + "TARGET_SSE2" +{ + rtx op1, op2, t1, t2; + + op1 = operands[1]; + op2 = operands[2]; + t1 = gen_reg_rtx (V4SImode); + t2 = gen_reg_rtx (V4SImode); + + emit_insn (gen_vec_interleave_highv4si (t1, op1, op1)); + emit_insn (gen_vec_interleave_highv4si (t2, op2, op2)); + emit_insn (gen_sse2_umulv2siv2di3 (operands[0], t1, t2)); + DONE; +}) + +(define_expand "vec_widen_umult_lo_v4si" + [(match_operand:V2DI 0 "register_operand" "") + (match_operand:V4SI 1 "register_operand" "") + (match_operand:V4SI 2 "register_operand" "")] + "TARGET_SSE2" +{ + rtx op1, op2, t1, t2; + + op1 = operands[1]; + op2 = operands[2]; + t1 = gen_reg_rtx (V4SImode); + t2 = gen_reg_rtx (V4SImode); + + emit_insn (gen_vec_interleave_lowv4si (t1, op1, op1)); + emit_insn (gen_vec_interleave_lowv4si (t2, op2, op2)); + emit_insn (gen_sse2_umulv2siv2di3 (operands[0], t1, t2)); + DONE; +}) + +(define_expand "sdot_prodv8hi" + [(match_operand:V4SI 0 "register_operand" "") + (match_operand:V8HI 1 "register_operand" "") + (match_operand:V8HI 2 "register_operand" "") + (match_operand:V4SI 3 "register_operand" "")] + "TARGET_SSE2" +{ + rtx t = gen_reg_rtx (V4SImode); + emit_insn (gen_sse2_pmaddwd (t, operands[1], operands[2])); + emit_insn (gen_addv4si3 (operands[0], operands[3], t)); + DONE; +}) + +(define_expand "udot_prodv4si" + [(match_operand:V2DI 0 "register_operand" "") + (match_operand:V4SI 1 "register_operand" "") + (match_operand:V4SI 2 "register_operand" "") + (match_operand:V2DI 3 "register_operand" "")] + "TARGET_SSE2" +{ + rtx t1, t2, t3, t4; + + t1 = gen_reg_rtx (V2DImode); + emit_insn (gen_sse2_umulv2siv2di3 (t1, operands[1], operands[2])); + emit_insn (gen_addv2di3 (t1, t1, operands[3])); + + t2 = gen_reg_rtx (V4SImode); + t3 = gen_reg_rtx (V4SImode); + emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, t2), + gen_lowpart (V1TImode, operands[1]), + GEN_INT (32))); + emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, t3), + gen_lowpart (V1TImode, operands[2]), + GEN_INT (32))); + + t4 = gen_reg_rtx (V2DImode); + emit_insn (gen_sse2_umulv2siv2di3 (t4, t2, t3)); + + emit_insn (gen_addv2di3 (operands[0], t1, t4)); + DONE; +}) + +(define_insn "*avx_ashr3" + [(set (match_operand:SSEMODE24 0 "register_operand" "=x") + (ashiftrt:SSEMODE24 + (match_operand:SSEMODE24 1 "register_operand" "x") + (match_operand:SI 2 "nonmemory_operand" "xN")))] + "TARGET_AVX" + "vpsra\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseishft") + (set_attr "prefix" "vex") + (set (attr "length_immediate") + (if_then_else (match_operand 2 "const_int_operand" "") + (const_string "1") + (const_string "0"))) + (set_attr "mode" "TI")]) + +(define_insn "ashr3" + [(set (match_operand:SSEMODE24 0 "register_operand" "=x") + 
(ashiftrt:SSEMODE24 + (match_operand:SSEMODE24 1 "register_operand" "0") + (match_operand:SI 2 "nonmemory_operand" "xN")))] + "TARGET_SSE2" + "psra\t{%2, %0|%0, %2}" + [(set_attr "type" "sseishft") + (set_attr "prefix_data16" "1") + (set (attr "length_immediate") + (if_then_else (match_operand 2 "const_int_operand" "") + (const_string "1") + (const_string "0"))) + (set_attr "mode" "TI")]) + +(define_insn "*avx_lshrv1ti3" + [(set (match_operand:V1TI 0 "register_operand" "=x") + (lshiftrt:V1TI + (match_operand:V1TI 1 "register_operand" "x") + (match_operand:SI 2 "const_0_to_255_mul_8_operand" "n")))] + "TARGET_AVX" +{ + operands[2] = GEN_INT (INTVAL (operands[2]) / 8); + return "vpsrldq\t{%2, %1, %0|%0, %1, %2}"; +} + [(set_attr "type" "sseishft") + (set_attr "prefix" "vex") + (set_attr "length_immediate" "1") + (set_attr "mode" "TI")]) + +(define_insn "*avx_lshr3" + [(set (match_operand:SSEMODE248 0 "register_operand" "=x") + (lshiftrt:SSEMODE248 + (match_operand:SSEMODE248 1 "register_operand" "x") + (match_operand:SI 2 "nonmemory_operand" "xN")))] + "TARGET_AVX" + "vpsrl\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseishft") + (set_attr "prefix" "vex") + (set (attr "length_immediate") + (if_then_else (match_operand 2 "const_int_operand" "") + (const_string "1") + (const_string "0"))) + (set_attr "mode" "TI")]) + +(define_insn "sse2_lshrv1ti3" + [(set (match_operand:V1TI 0 "register_operand" "=x") + (lshiftrt:V1TI + (match_operand:V1TI 1 "register_operand" "0") + (match_operand:SI 2 "const_0_to_255_mul_8_operand" "n")))] + "TARGET_SSE2" +{ + operands[2] = GEN_INT (INTVAL (operands[2]) / 8); + return "psrldq\t{%2, %0|%0, %2}"; +} + [(set_attr "type" "sseishft") + (set_attr "prefix_data16" "1") + (set_attr "length_immediate" "1") + (set_attr "atom_unit" "sishuf") + (set_attr "mode" "TI")]) + +(define_insn "lshr3" + [(set (match_operand:SSEMODE248 0 "register_operand" "=x") + (lshiftrt:SSEMODE248 + (match_operand:SSEMODE248 1 "register_operand" "0") + (match_operand:SI 2 "nonmemory_operand" "xN")))] + "TARGET_SSE2" + "psrl\t{%2, %0|%0, %2}" + [(set_attr "type" "sseishft") + (set_attr "prefix_data16" "1") + (set (attr "length_immediate") + (if_then_else (match_operand 2 "const_int_operand" "") + (const_string "1") + (const_string "0"))) + (set_attr "mode" "TI")]) + +(define_insn "*avx_ashlv1ti3" + [(set (match_operand:V1TI 0 "register_operand" "=x") + (ashift:V1TI (match_operand:V1TI 1 "register_operand" "x") + (match_operand:SI 2 "const_0_to_255_mul_8_operand" "n")))] + "TARGET_AVX" +{ + operands[2] = GEN_INT (INTVAL (operands[2]) / 8); + return "vpslldq\t{%2, %1, %0|%0, %1, %2}"; +} + [(set_attr "type" "sseishft") + (set_attr "prefix" "vex") + (set_attr "length_immediate" "1") + (set_attr "mode" "TI")]) + +(define_insn "*avx_ashl3" + [(set (match_operand:SSEMODE248 0 "register_operand" "=x") + (ashift:SSEMODE248 + (match_operand:SSEMODE248 1 "register_operand" "x") + (match_operand:SI 2 "nonmemory_operand" "xN")))] + "TARGET_AVX" + "vpsll\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseishft") + (set_attr "prefix" "vex") + (set (attr "length_immediate") + (if_then_else (match_operand 2 "const_int_operand" "") + (const_string "1") + (const_string "0"))) + (set_attr "mode" "TI")]) + +(define_insn "sse2_ashlv1ti3" + [(set (match_operand:V1TI 0 "register_operand" "=x") + (ashift:V1TI (match_operand:V1TI 1 "register_operand" "0") + (match_operand:SI 2 "const_0_to_255_mul_8_operand" "n")))] + "TARGET_SSE2" +{ + operands[2] = GEN_INT (INTVAL (operands[2]) / 8); + return "pslldq\t{%2, %0|%0, 
%2}"; +} + [(set_attr "type" "sseishft") + (set_attr "prefix_data16" "1") + (set_attr "length_immediate" "1") + (set_attr "mode" "TI")]) + +(define_insn "ashl3" + [(set (match_operand:SSEMODE248 0 "register_operand" "=x") + (ashift:SSEMODE248 + (match_operand:SSEMODE248 1 "register_operand" "0") + (match_operand:SI 2 "nonmemory_operand" "xN")))] + "TARGET_SSE2" + "psll\t{%2, %0|%0, %2}" + [(set_attr "type" "sseishft") + (set_attr "prefix_data16" "1") + (set (attr "length_immediate") + (if_then_else (match_operand 2 "const_int_operand" "") + (const_string "1") + (const_string "0"))) + (set_attr "mode" "TI")]) + +(define_expand "vec_shl_" + [(set (match_operand:SSEMODEI 0 "register_operand" "") + (ashift:V1TI + (match_operand:SSEMODEI 1 "register_operand" "") + (match_operand:SI 2 "const_0_to_255_mul_8_operand" "")))] + "TARGET_SSE2" +{ + operands[0] = gen_lowpart (V1TImode, operands[0]); + operands[1] = gen_lowpart (V1TImode, operands[1]); +}) + +(define_expand "vec_shr_" + [(set (match_operand:SSEMODEI 0 "register_operand" "") + (lshiftrt:V1TI + (match_operand:SSEMODEI 1 "register_operand" "") + (match_operand:SI 2 "const_0_to_255_mul_8_operand" "")))] + "TARGET_SSE2" +{ + operands[0] = gen_lowpart (V1TImode, operands[0]); + operands[1] = gen_lowpart (V1TImode, operands[1]); +}) + +(define_insn "*avx_3" + [(set (match_operand:SSEMODE124 0 "register_operand" "=x") + (umaxmin:SSEMODE124 + (match_operand:SSEMODE124 1 "nonimmediate_operand" "%x") + (match_operand:SSEMODE124 2 "nonimmediate_operand" "xm")))] + "TARGET_AVX && ix86_binary_operator_ok (, mode, operands)" + "vp\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseiadd") + (set (attr "prefix_extra") + (if_then_else (match_operand:V16QI 0 "" "") + (const_string "0") + (const_string "1"))) + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + +(define_expand "v16qi3" + [(set (match_operand:V16QI 0 "register_operand" "") + (umaxmin:V16QI + (match_operand:V16QI 1 "nonimmediate_operand" "") + (match_operand:V16QI 2 "nonimmediate_operand" "")))] + "TARGET_SSE2" + "ix86_fixup_binary_operands_no_copy (, V16QImode, operands);") + +(define_insn "*v16qi3" + [(set (match_operand:V16QI 0 "register_operand" "=x") + (umaxmin:V16QI + (match_operand:V16QI 1 "nonimmediate_operand" "%0") + (match_operand:V16QI 2 "nonimmediate_operand" "xm")))] + "TARGET_SSE2 && ix86_binary_operator_ok (, V16QImode, operands)" + "pb\t{%2, %0|%0, %2}" + [(set_attr "type" "sseiadd") + (set_attr "prefix_data16" "1") + (set_attr "mode" "TI")]) + +(define_insn "*avx_3" + [(set (match_operand:SSEMODE124 0 "register_operand" "=x") + (smaxmin:SSEMODE124 + (match_operand:SSEMODE124 1 "nonimmediate_operand" "%x") + (match_operand:SSEMODE124 2 "nonimmediate_operand" "xm")))] + "TARGET_AVX && ix86_binary_operator_ok (, mode, operands)" + "vp\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseiadd") + (set (attr "prefix_extra") + (if_then_else (match_operand:V8HI 0 "" "") + (const_string "0") + (const_string "1"))) + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + +(define_expand "v8hi3" + [(set (match_operand:V8HI 0 "register_operand" "") + (smaxmin:V8HI + (match_operand:V8HI 1 "nonimmediate_operand" "") + (match_operand:V8HI 2 "nonimmediate_operand" "")))] + "TARGET_SSE2" + "ix86_fixup_binary_operands_no_copy (, V8HImode, operands);") + +(define_insn "*v8hi3" + [(set (match_operand:V8HI 0 "register_operand" "=x") + (smaxmin:V8HI + (match_operand:V8HI 1 "nonimmediate_operand" "%0") + (match_operand:V8HI 2 "nonimmediate_operand" "xm")))] + "TARGET_SSE2 && 
ix86_binary_operator_ok (, V8HImode, operands)" + "pw\t{%2, %0|%0, %2}" + [(set_attr "type" "sseiadd") + (set_attr "prefix_data16" "1") + (set_attr "mode" "TI")]) + +(define_expand "umaxv8hi3" + [(set (match_operand:V8HI 0 "register_operand" "") + (umax:V8HI (match_operand:V8HI 1 "register_operand" "") + (match_operand:V8HI 2 "nonimmediate_operand" "")))] + "TARGET_SSE2" +{ + if (TARGET_SSE4_1) + ix86_fixup_binary_operands_no_copy (UMAX, V8HImode, operands); + else + { + rtx op0 = operands[0], op2 = operands[2], op3 = op0; + if (rtx_equal_p (op3, op2)) + op3 = gen_reg_rtx (V8HImode); + emit_insn (gen_sse2_ussubv8hi3 (op3, operands[1], op2)); + emit_insn (gen_addv8hi3 (op0, op3, op2)); + DONE; + } +}) + +(define_expand "smax3" + [(set (match_operand:SSEMODE14 0 "register_operand" "") + (smax:SSEMODE14 (match_operand:SSEMODE14 1 "register_operand" "") + (match_operand:SSEMODE14 2 "register_operand" "")))] + "TARGET_SSE2" +{ + if (TARGET_SSE4_1) + ix86_fixup_binary_operands_no_copy (SMAX, mode, operands); + else + { + rtx xops[6]; + bool ok; + + xops[0] = operands[0]; + xops[1] = operands[1]; + xops[2] = operands[2]; + xops[3] = gen_rtx_GT (VOIDmode, operands[1], operands[2]); + xops[4] = operands[1]; + xops[5] = operands[2]; + ok = ix86_expand_int_vcond (xops); + gcc_assert (ok); + DONE; + } +}) + +(define_insn "*sse4_1_3" + [(set (match_operand:SSEMODE14 0 "register_operand" "=x") + (smaxmin:SSEMODE14 + (match_operand:SSEMODE14 1 "nonimmediate_operand" "%0") + (match_operand:SSEMODE14 2 "nonimmediate_operand" "xm")))] + "TARGET_SSE4_1 && ix86_binary_operator_ok (, mode, operands)" + "p\t{%2, %0|%0, %2}" + [(set_attr "type" "sseiadd") + (set_attr "prefix_extra" "1") + (set_attr "mode" "TI")]) + +(define_expand "smaxv2di3" + [(set (match_operand:V2DI 0 "register_operand" "") + (smax:V2DI (match_operand:V2DI 1 "register_operand" "") + (match_operand:V2DI 2 "register_operand" "")))] + "TARGET_SSE4_2" +{ + rtx xops[6]; + bool ok; + + xops[0] = operands[0]; + xops[1] = operands[1]; + xops[2] = operands[2]; + xops[3] = gen_rtx_GT (VOIDmode, operands[1], operands[2]); + xops[4] = operands[1]; + xops[5] = operands[2]; + ok = ix86_expand_int_vcond (xops); + gcc_assert (ok); + DONE; +}) + +(define_expand "umaxv4si3" + [(set (match_operand:V4SI 0 "register_operand" "") + (umax:V4SI (match_operand:V4SI 1 "register_operand" "") + (match_operand:V4SI 2 "register_operand" "")))] + "TARGET_SSE2" +{ + if (TARGET_SSE4_1) + ix86_fixup_binary_operands_no_copy (UMAX, V4SImode, operands); + else + { + rtx xops[6]; + bool ok; + + xops[0] = operands[0]; + xops[1] = operands[1]; + xops[2] = operands[2]; + xops[3] = gen_rtx_GTU (VOIDmode, operands[1], operands[2]); + xops[4] = operands[1]; + xops[5] = operands[2]; + ok = ix86_expand_int_vcond (xops); + gcc_assert (ok); + DONE; + } +}) + +(define_insn "*sse4_1_3" + [(set (match_operand:SSEMODE24 0 "register_operand" "=x") + (umaxmin:SSEMODE24 + (match_operand:SSEMODE24 1 "nonimmediate_operand" "%0") + (match_operand:SSEMODE24 2 "nonimmediate_operand" "xm")))] + "TARGET_SSE4_1 && ix86_binary_operator_ok (, mode, operands)" + "p\t{%2, %0|%0, %2}" + [(set_attr "type" "sseiadd") + (set_attr "prefix_extra" "1") + (set_attr "mode" "TI")]) + +(define_expand "umaxv2di3" + [(set (match_operand:V2DI 0 "register_operand" "") + (umax:V2DI (match_operand:V2DI 1 "register_operand" "") + (match_operand:V2DI 2 "register_operand" "")))] + "TARGET_SSE4_2" +{ + rtx xops[6]; + bool ok; + + xops[0] = operands[0]; + xops[1] = operands[1]; + xops[2] = operands[2]; + xops[3] = gen_rtx_GTU 
(VOIDmode, operands[1], operands[2]); + xops[4] = operands[1]; + xops[5] = operands[2]; + ok = ix86_expand_int_vcond (xops); + gcc_assert (ok); + DONE; +}) + +(define_expand "smin3" + [(set (match_operand:SSEMODE14 0 "register_operand" "") + (smin:SSEMODE14 (match_operand:SSEMODE14 1 "register_operand" "") + (match_operand:SSEMODE14 2 "register_operand" "")))] + "TARGET_SSE2" +{ + if (TARGET_SSE4_1) + ix86_fixup_binary_operands_no_copy (SMIN, mode, operands); + else + { + rtx xops[6]; + bool ok; + + xops[0] = operands[0]; + xops[1] = operands[2]; + xops[2] = operands[1]; + xops[3] = gen_rtx_GT (VOIDmode, operands[1], operands[2]); + xops[4] = operands[1]; + xops[5] = operands[2]; + ok = ix86_expand_int_vcond (xops); + gcc_assert (ok); + DONE; + } +}) + +(define_expand "sminv2di3" + [(set (match_operand:V2DI 0 "register_operand" "") + (smin:V2DI (match_operand:V2DI 1 "register_operand" "") + (match_operand:V2DI 2 "register_operand" "")))] + "TARGET_SSE4_2" +{ + rtx xops[6]; + bool ok; + + xops[0] = operands[0]; + xops[1] = operands[2]; + xops[2] = operands[1]; + xops[3] = gen_rtx_GT (VOIDmode, operands[1], operands[2]); + xops[4] = operands[1]; + xops[5] = operands[2]; + ok = ix86_expand_int_vcond (xops); + gcc_assert (ok); + DONE; +}) + +(define_expand "umin3" + [(set (match_operand:SSEMODE24 0 "register_operand" "") + (umin:SSEMODE24 (match_operand:SSEMODE24 1 "register_operand" "") + (match_operand:SSEMODE24 2 "register_operand" "")))] + "TARGET_SSE2" +{ + if (TARGET_SSE4_1) + ix86_fixup_binary_operands_no_copy (UMIN, mode, operands); + else + { + rtx xops[6]; + bool ok; + + xops[0] = operands[0]; + xops[1] = operands[2]; + xops[2] = operands[1]; + xops[3] = gen_rtx_GTU (VOIDmode, operands[1], operands[2]); + xops[4] = operands[1]; + xops[5] = operands[2]; + ok = ix86_expand_int_vcond (xops); + gcc_assert (ok); + DONE; + } +}) + +(define_expand "uminv2di3" + [(set (match_operand:V2DI 0 "register_operand" "") + (umin:V2DI (match_operand:V2DI 1 "register_operand" "") + (match_operand:V2DI 2 "register_operand" "")))] + "TARGET_SSE4_2" +{ + rtx xops[6]; + bool ok; + + xops[0] = operands[0]; + xops[1] = operands[2]; + xops[2] = operands[1]; + xops[3] = gen_rtx_GTU (VOIDmode, operands[1], operands[2]); + xops[4] = operands[1]; + xops[5] = operands[2]; + ok = ix86_expand_int_vcond (xops); + gcc_assert (ok); + DONE; +}) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; +;; Parallel integral comparisons +;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(define_expand "sse2_eq3" + [(set (match_operand:SSEMODE124 0 "register_operand" "") + (eq:SSEMODE124 + (match_operand:SSEMODE124 1 "nonimmediate_operand" "") + (match_operand:SSEMODE124 2 "nonimmediate_operand" "")))] + "TARGET_SSE2 && !TARGET_XOP " + "ix86_fixup_binary_operands_no_copy (EQ, mode, operands);") + +(define_insn "*avx_eq3" + [(set (match_operand:SSEMODE1248 0 "register_operand" "=x") + (eq:SSEMODE1248 + (match_operand:SSEMODE1248 1 "nonimmediate_operand" "%x") + (match_operand:SSEMODE1248 2 "nonimmediate_operand" "xm")))] + "TARGET_AVX && ix86_binary_operator_ok (EQ, mode, operands)" + "vpcmpeq\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "ssecmp") + (set (attr "prefix_extra") + (if_then_else (match_operand:V2DI 0 "" "") + (const_string "1") + (const_string "*"))) + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + +(define_insn "*sse2_eq3" + [(set (match_operand:SSEMODE124 0 "register_operand" "=x") + (eq:SSEMODE124 + (match_operand:SSEMODE124 1 "nonimmediate_operand" 
"%0") + (match_operand:SSEMODE124 2 "nonimmediate_operand" "xm")))] + "TARGET_SSE2 && !TARGET_XOP + && ix86_binary_operator_ok (EQ, mode, operands)" + "pcmpeq\t{%2, %0|%0, %2}" + [(set_attr "type" "ssecmp") + (set_attr "prefix_data16" "1") + (set_attr "mode" "TI")]) + +(define_expand "sse4_1_eqv2di3" + [(set (match_operand:V2DI 0 "register_operand" "") + (eq:V2DI + (match_operand:V2DI 1 "nonimmediate_operand" "") + (match_operand:V2DI 2 "nonimmediate_operand" "")))] + "TARGET_SSE4_1" + "ix86_fixup_binary_operands_no_copy (EQ, V2DImode, operands);") + +(define_insn "*sse4_1_eqv2di3" + [(set (match_operand:V2DI 0 "register_operand" "=x") + (eq:V2DI + (match_operand:V2DI 1 "nonimmediate_operand" "%0") + (match_operand:V2DI 2 "nonimmediate_operand" "xm")))] + "TARGET_SSE4_1 && ix86_binary_operator_ok (EQ, V2DImode, operands)" + "pcmpeqq\t{%2, %0|%0, %2}" + [(set_attr "type" "ssecmp") + (set_attr "prefix_extra" "1") + (set_attr "mode" "TI")]) + +(define_insn "*avx_gt3" + [(set (match_operand:SSEMODE1248 0 "register_operand" "=x") + (gt:SSEMODE1248 + (match_operand:SSEMODE1248 1 "register_operand" "x") + (match_operand:SSEMODE1248 2 "nonimmediate_operand" "xm")))] + "TARGET_AVX" + "vpcmpgt\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "ssecmp") + (set (attr "prefix_extra") + (if_then_else (match_operand:V2DI 0 "" "") + (const_string "1") + (const_string "*"))) + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + +(define_insn "sse2_gt3" + [(set (match_operand:SSEMODE124 0 "register_operand" "=x") + (gt:SSEMODE124 + (match_operand:SSEMODE124 1 "register_operand" "0") + (match_operand:SSEMODE124 2 "nonimmediate_operand" "xm")))] + "TARGET_SSE2 && !TARGET_XOP" + "pcmpgt\t{%2, %0|%0, %2}" + [(set_attr "type" "ssecmp") + (set_attr "prefix_data16" "1") + (set_attr "mode" "TI")]) + +(define_insn "sse4_2_gtv2di3" + [(set (match_operand:V2DI 0 "register_operand" "=x") + (gt:V2DI + (match_operand:V2DI 1 "register_operand" "0") + (match_operand:V2DI 2 "nonimmediate_operand" "xm")))] + "TARGET_SSE4_2" + "pcmpgtq\t{%2, %0|%0, %2}" + [(set_attr "type" "ssecmp") + (set_attr "prefix_extra" "1") + (set_attr "mode" "TI")]) + +(define_expand "vcond" + [(set (match_operand:SSEMODE124C8 0 "register_operand" "") + (if_then_else:SSEMODE124C8 + (match_operator 3 "" + [(match_operand:SSEMODE124C8 4 "nonimmediate_operand" "") + (match_operand:SSEMODE124C8 5 "nonimmediate_operand" "")]) + (match_operand:SSEMODE124C8 1 "general_operand" "") + (match_operand:SSEMODE124C8 2 "general_operand" "")))] + "TARGET_SSE2" +{ + bool ok = ix86_expand_int_vcond (operands); + gcc_assert (ok); + DONE; +}) + +(define_expand "vcondu" + [(set (match_operand:SSEMODE124C8 0 "register_operand" "") + (if_then_else:SSEMODE124C8 + (match_operator 3 "" + [(match_operand:SSEMODE124C8 4 "nonimmediate_operand" "") + (match_operand:SSEMODE124C8 5 "nonimmediate_operand" "")]) + (match_operand:SSEMODE124C8 1 "general_operand" "") + (match_operand:SSEMODE124C8 2 "general_operand" "")))] + "TARGET_SSE2" +{ + bool ok = ix86_expand_int_vcond (operands); + gcc_assert (ok); + DONE; +}) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; +;; Parallel bitwise logical operations +;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(define_expand "one_cmpl2" + [(set (match_operand:SSEMODEI 0 "register_operand" "") + (xor:SSEMODEI (match_operand:SSEMODEI 1 "nonimmediate_operand" "") + (match_dup 2)))] + "TARGET_SSE2" +{ + int i, n = GET_MODE_NUNITS (mode); + rtvec v = rtvec_alloc (n); + + for (i = 0; i < n; 
++i) + RTVEC_ELT (v, i) = constm1_rtx; + + operands[2] = force_reg (mode, gen_rtx_CONST_VECTOR (mode, v)); +}) + +(define_insn "*avx_andnot3" + [(set (match_operand:AVX256MODEI 0 "register_operand" "=x") + (and:AVX256MODEI + (not:AVX256MODEI (match_operand:AVX256MODEI 1 "register_operand" "x")) + (match_operand:AVX256MODEI 2 "nonimmediate_operand" "xm")))] + "TARGET_AVX" + "vandnps\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + +(define_insn "*sse_andnot3" + [(set (match_operand:SSEMODEI 0 "register_operand" "=x") + (and:SSEMODEI + (not:SSEMODEI (match_operand:SSEMODEI 1 "register_operand" "0")) + (match_operand:SSEMODEI 2 "nonimmediate_operand" "xm")))] + "(TARGET_SSE && !TARGET_SSE2)" + "andnps\t{%2, %0|%0, %2}" + [(set_attr "type" "sselog") + (set_attr "mode" "V4SF")]) + +(define_insn "*avx_andnot3" + [(set (match_operand:SSEMODEI 0 "register_operand" "=x") + (and:SSEMODEI + (not:SSEMODEI (match_operand:SSEMODEI 1 "register_operand" "x")) + (match_operand:SSEMODEI 2 "nonimmediate_operand" "xm")))] + "TARGET_AVX" + "vpandn\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + +(define_insn "sse2_andnot3" + [(set (match_operand:SSEMODEI 0 "register_operand" "=x") + (and:SSEMODEI + (not:SSEMODEI (match_operand:SSEMODEI 1 "register_operand" "0")) + (match_operand:SSEMODEI 2 "nonimmediate_operand" "xm")))] + "TARGET_SSE2" + "pandn\t{%2, %0|%0, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix_data16" "1") + (set_attr "mode" "TI")]) + +(define_insn "*andnottf3" + [(set (match_operand:TF 0 "register_operand" "=x") + (and:TF + (not:TF (match_operand:TF 1 "register_operand" "0")) + (match_operand:TF 2 "nonimmediate_operand" "xm")))] + "TARGET_SSE2" + "pandn\t{%2, %0|%0, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix_data16" "1") + (set_attr "mode" "TI")]) + +(define_expand "3" + [(set (match_operand:SSEMODEI 0 "register_operand" "") + (any_logic:SSEMODEI + (match_operand:SSEMODEI 1 "nonimmediate_operand" "") + (match_operand:SSEMODEI 2 "nonimmediate_operand" "")))] + "TARGET_SSE" + "ix86_fixup_binary_operands_no_copy (, mode, operands);") + +(define_insn "*avx_3" + [(set (match_operand:AVX256MODEI 0 "register_operand" "=x") + (any_logic:AVX256MODEI + (match_operand:AVX256MODEI 1 "nonimmediate_operand" "%x") + (match_operand:AVX256MODEI 2 "nonimmediate_operand" "xm")))] + "TARGET_AVX + && ix86_binary_operator_ok (, mode, operands)" + "vps\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + +(define_insn "*sse_3" + [(set (match_operand:SSEMODEI 0 "register_operand" "=x") + (any_logic:SSEMODEI + (match_operand:SSEMODEI 1 "nonimmediate_operand" "%0") + (match_operand:SSEMODEI 2 "nonimmediate_operand" "xm")))] + "(TARGET_SSE && !TARGET_SSE2) + && ix86_binary_operator_ok (, mode, operands)" + "ps\t{%2, %0|%0, %2}" + [(set_attr "type" "sselog") + (set_attr "mode" "V4SF")]) + +(define_insn "*avx_3" + [(set (match_operand:SSEMODEI 0 "register_operand" "=x") + (any_logic:SSEMODEI + (match_operand:SSEMODEI 1 "nonimmediate_operand" "%x") + (match_operand:SSEMODEI 2 "nonimmediate_operand" "xm")))] + "TARGET_AVX + && ix86_binary_operator_ok (, mode, operands)" + "vp\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + +(define_insn "*sse2_3" + [(set (match_operand:SSEMODEI 0 "register_operand" "=x") + (any_logic:SSEMODEI + (match_operand:SSEMODEI 1 
"nonimmediate_operand" "%0") + (match_operand:SSEMODEI 2 "nonimmediate_operand" "xm")))] + "TARGET_SSE2 && ix86_binary_operator_ok (, mode, operands)" + "p\t{%2, %0|%0, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix_data16" "1") + (set_attr "mode" "TI")]) + +(define_expand "tf3" + [(set (match_operand:TF 0 "register_operand" "") + (any_logic:TF + (match_operand:TF 1 "nonimmediate_operand" "") + (match_operand:TF 2 "nonimmediate_operand" "")))] + "TARGET_SSE2" + "ix86_fixup_binary_operands_no_copy (, TFmode, operands);") + +(define_insn "*tf3" + [(set (match_operand:TF 0 "register_operand" "=x") + (any_logic:TF + (match_operand:TF 1 "nonimmediate_operand" "%0") + (match_operand:TF 2 "nonimmediate_operand" "xm")))] + "TARGET_SSE2 && ix86_binary_operator_ok (, TFmode, operands)" + "p\t{%2, %0|%0, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix_data16" "1") + (set_attr "mode" "TI")]) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; +;; Parallel integral element swizzling +;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(define_expand "vec_pack_trunc_v8hi" + [(match_operand:V16QI 0 "register_operand" "") + (match_operand:V8HI 1 "register_operand" "") + (match_operand:V8HI 2 "register_operand" "")] + "TARGET_SSE2" +{ + rtx op1 = gen_lowpart (V16QImode, operands[1]); + rtx op2 = gen_lowpart (V16QImode, operands[2]); + ix86_expand_vec_extract_even_odd (operands[0], op1, op2, 0); + DONE; +}) + +(define_expand "vec_pack_trunc_v4si" + [(match_operand:V8HI 0 "register_operand" "") + (match_operand:V4SI 1 "register_operand" "") + (match_operand:V4SI 2 "register_operand" "")] + "TARGET_SSE2" +{ + rtx op1 = gen_lowpart (V8HImode, operands[1]); + rtx op2 = gen_lowpart (V8HImode, operands[2]); + ix86_expand_vec_extract_even_odd (operands[0], op1, op2, 0); + DONE; +}) + +(define_expand "vec_pack_trunc_v2di" + [(match_operand:V4SI 0 "register_operand" "") + (match_operand:V2DI 1 "register_operand" "") + (match_operand:V2DI 2 "register_operand" "")] + "TARGET_SSE2" +{ + rtx op1 = gen_lowpart (V4SImode, operands[1]); + rtx op2 = gen_lowpart (V4SImode, operands[2]); + ix86_expand_vec_extract_even_odd (operands[0], op1, op2, 0); + DONE; +}) + +(define_insn "*avx_packsswb" + [(set (match_operand:V16QI 0 "register_operand" "=x") + (vec_concat:V16QI + (ss_truncate:V8QI + (match_operand:V8HI 1 "register_operand" "x")) + (ss_truncate:V8QI + (match_operand:V8HI 2 "nonimmediate_operand" "xm"))))] + "TARGET_AVX" + "vpacksswb\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + +(define_insn "sse2_packsswb" + [(set (match_operand:V16QI 0 "register_operand" "=x") + (vec_concat:V16QI + (ss_truncate:V8QI + (match_operand:V8HI 1 "register_operand" "0")) + (ss_truncate:V8QI + (match_operand:V8HI 2 "nonimmediate_operand" "xm"))))] + "TARGET_SSE2" + "packsswb\t{%2, %0|%0, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix_data16" "1") + (set_attr "mode" "TI")]) + +(define_insn "*avx_packssdw" + [(set (match_operand:V8HI 0 "register_operand" "=x") + (vec_concat:V8HI + (ss_truncate:V4HI + (match_operand:V4SI 1 "register_operand" "x")) + (ss_truncate:V4HI + (match_operand:V4SI 2 "nonimmediate_operand" "xm"))))] + "TARGET_AVX" + "vpackssdw\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + +(define_insn "sse2_packssdw" + [(set (match_operand:V8HI 0 "register_operand" "=x") + (vec_concat:V8HI + (ss_truncate:V4HI + (match_operand:V4SI 1 
"register_operand" "0")) + (ss_truncate:V4HI + (match_operand:V4SI 2 "nonimmediate_operand" "xm"))))] + "TARGET_SSE2" + "packssdw\t{%2, %0|%0, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix_data16" "1") + (set_attr "mode" "TI")]) + +(define_insn "*avx_packuswb" + [(set (match_operand:V16QI 0 "register_operand" "=x") + (vec_concat:V16QI + (us_truncate:V8QI + (match_operand:V8HI 1 "register_operand" "x")) + (us_truncate:V8QI + (match_operand:V8HI 2 "nonimmediate_operand" "xm"))))] + "TARGET_AVX" + "vpackuswb\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + +(define_insn "sse2_packuswb" + [(set (match_operand:V16QI 0 "register_operand" "=x") + (vec_concat:V16QI + (us_truncate:V8QI + (match_operand:V8HI 1 "register_operand" "0")) + (us_truncate:V8QI + (match_operand:V8HI 2 "nonimmediate_operand" "xm"))))] + "TARGET_SSE2" + "packuswb\t{%2, %0|%0, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix_data16" "1") + (set_attr "mode" "TI")]) + +(define_insn "*avx_interleave_highv16qi" + [(set (match_operand:V16QI 0 "register_operand" "=x") + (vec_select:V16QI + (vec_concat:V32QI + (match_operand:V16QI 1 "register_operand" "x") + (match_operand:V16QI 2 "nonimmediate_operand" "xm")) + (parallel [(const_int 8) (const_int 24) + (const_int 9) (const_int 25) + (const_int 10) (const_int 26) + (const_int 11) (const_int 27) + (const_int 12) (const_int 28) + (const_int 13) (const_int 29) + (const_int 14) (const_int 30) + (const_int 15) (const_int 31)])))] + "TARGET_AVX" + "vpunpckhbw\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + +(define_insn "vec_interleave_highv16qi" + [(set (match_operand:V16QI 0 "register_operand" "=x") + (vec_select:V16QI + (vec_concat:V32QI + (match_operand:V16QI 1 "register_operand" "0") + (match_operand:V16QI 2 "nonimmediate_operand" "xm")) + (parallel [(const_int 8) (const_int 24) + (const_int 9) (const_int 25) + (const_int 10) (const_int 26) + (const_int 11) (const_int 27) + (const_int 12) (const_int 28) + (const_int 13) (const_int 29) + (const_int 14) (const_int 30) + (const_int 15) (const_int 31)])))] + "TARGET_SSE2" + "punpckhbw\t{%2, %0|%0, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix_data16" "1") + (set_attr "mode" "TI")]) + +(define_insn "*avx_interleave_lowv16qi" + [(set (match_operand:V16QI 0 "register_operand" "=x") + (vec_select:V16QI + (vec_concat:V32QI + (match_operand:V16QI 1 "register_operand" "x") + (match_operand:V16QI 2 "nonimmediate_operand" "xm")) + (parallel [(const_int 0) (const_int 16) + (const_int 1) (const_int 17) + (const_int 2) (const_int 18) + (const_int 3) (const_int 19) + (const_int 4) (const_int 20) + (const_int 5) (const_int 21) + (const_int 6) (const_int 22) + (const_int 7) (const_int 23)])))] + "TARGET_AVX" + "vpunpcklbw\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + +(define_insn "vec_interleave_lowv16qi" + [(set (match_operand:V16QI 0 "register_operand" "=x") + (vec_select:V16QI + (vec_concat:V32QI + (match_operand:V16QI 1 "register_operand" "0") + (match_operand:V16QI 2 "nonimmediate_operand" "xm")) + (parallel [(const_int 0) (const_int 16) + (const_int 1) (const_int 17) + (const_int 2) (const_int 18) + (const_int 3) (const_int 19) + (const_int 4) (const_int 20) + (const_int 5) (const_int 21) + (const_int 6) (const_int 22) + (const_int 7) (const_int 23)])))] + "TARGET_SSE2" + "punpcklbw\t{%2, %0|%0, %2}" + [(set_attr "type" "sselog") 
+ (set_attr "prefix_data16" "1") + (set_attr "mode" "TI")]) + +(define_insn "*avx_interleave_highv8hi" + [(set (match_operand:V8HI 0 "register_operand" "=x") + (vec_select:V8HI + (vec_concat:V16HI + (match_operand:V8HI 1 "register_operand" "x") + (match_operand:V8HI 2 "nonimmediate_operand" "xm")) + (parallel [(const_int 4) (const_int 12) + (const_int 5) (const_int 13) + (const_int 6) (const_int 14) + (const_int 7) (const_int 15)])))] + "TARGET_AVX" + "vpunpckhwd\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + +(define_insn "vec_interleave_highv8hi" + [(set (match_operand:V8HI 0 "register_operand" "=x") + (vec_select:V8HI + (vec_concat:V16HI + (match_operand:V8HI 1 "register_operand" "0") + (match_operand:V8HI 2 "nonimmediate_operand" "xm")) + (parallel [(const_int 4) (const_int 12) + (const_int 5) (const_int 13) + (const_int 6) (const_int 14) + (const_int 7) (const_int 15)])))] + "TARGET_SSE2" + "punpckhwd\t{%2, %0|%0, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix_data16" "1") + (set_attr "mode" "TI")]) + +(define_insn "*avx_interleave_lowv8hi" + [(set (match_operand:V8HI 0 "register_operand" "=x") + (vec_select:V8HI + (vec_concat:V16HI + (match_operand:V8HI 1 "register_operand" "x") + (match_operand:V8HI 2 "nonimmediate_operand" "xm")) + (parallel [(const_int 0) (const_int 8) + (const_int 1) (const_int 9) + (const_int 2) (const_int 10) + (const_int 3) (const_int 11)])))] + "TARGET_AVX" + "vpunpcklwd\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + +(define_insn "vec_interleave_lowv8hi" + [(set (match_operand:V8HI 0 "register_operand" "=x") + (vec_select:V8HI + (vec_concat:V16HI + (match_operand:V8HI 1 "register_operand" "0") + (match_operand:V8HI 2 "nonimmediate_operand" "xm")) + (parallel [(const_int 0) (const_int 8) + (const_int 1) (const_int 9) + (const_int 2) (const_int 10) + (const_int 3) (const_int 11)])))] + "TARGET_SSE2" + "punpcklwd\t{%2, %0|%0, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix_data16" "1") + (set_attr "mode" "TI")]) + +(define_insn "*avx_interleave_highv4si" + [(set (match_operand:V4SI 0 "register_operand" "=x") + (vec_select:V4SI + (vec_concat:V8SI + (match_operand:V4SI 1 "register_operand" "x") + (match_operand:V4SI 2 "nonimmediate_operand" "xm")) + (parallel [(const_int 2) (const_int 6) + (const_int 3) (const_int 7)])))] + "TARGET_AVX" + "vpunpckhdq\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + +(define_insn "vec_interleave_highv4si" + [(set (match_operand:V4SI 0 "register_operand" "=x") + (vec_select:V4SI + (vec_concat:V8SI + (match_operand:V4SI 1 "register_operand" "0") + (match_operand:V4SI 2 "nonimmediate_operand" "xm")) + (parallel [(const_int 2) (const_int 6) + (const_int 3) (const_int 7)])))] + "TARGET_SSE2" + "punpckhdq\t{%2, %0|%0, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix_data16" "1") + (set_attr "mode" "TI")]) + +(define_insn "*avx_interleave_lowv4si" + [(set (match_operand:V4SI 0 "register_operand" "=x") + (vec_select:V4SI + (vec_concat:V8SI + (match_operand:V4SI 1 "register_operand" "x") + (match_operand:V4SI 2 "nonimmediate_operand" "xm")) + (parallel [(const_int 0) (const_int 4) + (const_int 1) (const_int 5)])))] + "TARGET_AVX" + "vpunpckldq\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + +(define_insn "vec_interleave_lowv4si" + [(set (match_operand:V4SI 0 
"register_operand" "=x") + (vec_select:V4SI + (vec_concat:V8SI + (match_operand:V4SI 1 "register_operand" "0") + (match_operand:V4SI 2 "nonimmediate_operand" "xm")) + (parallel [(const_int 0) (const_int 4) + (const_int 1) (const_int 5)])))] + "TARGET_SSE2" + "punpckldq\t{%2, %0|%0, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix_data16" "1") + (set_attr "mode" "TI")]) + +(define_insn "*avx_pinsr" + [(set (match_operand:SSEMODE124 0 "register_operand" "=x") + (vec_merge:SSEMODE124 + (vec_duplicate:SSEMODE124 + (match_operand: 2 "nonimmediate_operand" "rm")) + (match_operand:SSEMODE124 1 "register_operand" "x") + (match_operand:SI 3 "const_pow2_1_to__operand" "n")))] + "TARGET_AVX" +{ + operands[3] = GEN_INT (exact_log2 (INTVAL (operands[3]))); + if (MEM_P (operands[2])) + return "vpinsr\t{%3, %2, %1, %0|%0, %1, %2, %3}"; + else + return "vpinsr\t{%3, %k2, %1, %0|%0, %1, %k2, %3}"; +} + [(set_attr "type" "sselog") + (set (attr "prefix_extra") + (if_then_else (match_operand:V8HI 0 "" "") + (const_string "0") + (const_string "1"))) + (set_attr "length_immediate" "1") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + +(define_insn "*sse4_1_pinsrb" + [(set (match_operand:V16QI 0 "register_operand" "=x") + (vec_merge:V16QI + (vec_duplicate:V16QI + (match_operand:QI 2 "nonimmediate_operand" "rm")) + (match_operand:V16QI 1 "register_operand" "0") + (match_operand:SI 3 "const_pow2_1_to_32768_operand" "n")))] + "TARGET_SSE4_1" +{ + operands[3] = GEN_INT (exact_log2 (INTVAL (operands[3]))); + if (MEM_P (operands[2])) + return "pinsrb\t{%3, %2, %0|%0, %2, %3}"; + else + return "pinsrb\t{%3, %k2, %0|%0, %k2, %3}"; +} + [(set_attr "type" "sselog") + (set_attr "prefix_extra" "1") + (set_attr "length_immediate" "1") + (set_attr "mode" "TI")]) + +(define_insn "*sse2_pinsrw" + [(set (match_operand:V8HI 0 "register_operand" "=x") + (vec_merge:V8HI + (vec_duplicate:V8HI + (match_operand:HI 2 "nonimmediate_operand" "rm")) + (match_operand:V8HI 1 "register_operand" "0") + (match_operand:SI 3 "const_pow2_1_to_128_operand" "n")))] + "TARGET_SSE2" +{ + operands[3] = GEN_INT (exact_log2 (INTVAL (operands[3]))); + if (MEM_P (operands[2])) + return "pinsrw\t{%3, %2, %0|%0, %2, %3}"; + else + return "pinsrw\t{%3, %k2, %0|%0, %k2, %3}"; +} + [(set_attr "type" "sselog") + (set_attr "prefix_data16" "1") + (set_attr "length_immediate" "1") + (set_attr "mode" "TI")]) + +;; It must come before sse2_loadld since it is preferred. 
+(define_insn "*sse4_1_pinsrd" + [(set (match_operand:V4SI 0 "register_operand" "=x") + (vec_merge:V4SI + (vec_duplicate:V4SI + (match_operand:SI 2 "nonimmediate_operand" "rm")) + (match_operand:V4SI 1 "register_operand" "0") + (match_operand:SI 3 "const_pow2_1_to_8_operand" "n")))] + "TARGET_SSE4_1" +{ + operands[3] = GEN_INT (exact_log2 (INTVAL (operands[3]))); + return "pinsrd\t{%3, %2, %0|%0, %2, %3}"; +} + [(set_attr "type" "sselog") + (set_attr "prefix_extra" "1") + (set_attr "length_immediate" "1") + (set_attr "mode" "TI")]) + +(define_insn "*avx_pinsrq" + [(set (match_operand:V2DI 0 "register_operand" "=x") + (vec_merge:V2DI + (vec_duplicate:V2DI + (match_operand:DI 2 "nonimmediate_operand" "rm")) + (match_operand:V2DI 1 "register_operand" "x") + (match_operand:SI 3 "const_pow2_1_to_2_operand" "n")))] + "TARGET_AVX && TARGET_64BIT" +{ + operands[3] = GEN_INT (exact_log2 (INTVAL (operands[3]))); + return "vpinsrq\t{%3, %2, %1, %0|%0, %1, %2, %3}"; +} + [(set_attr "type" "sselog") + (set_attr "prefix_extra" "1") + (set_attr "length_immediate" "1") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + +(define_insn "*sse4_1_pinsrq" + [(set (match_operand:V2DI 0 "register_operand" "=x") + (vec_merge:V2DI + (vec_duplicate:V2DI + (match_operand:DI 2 "nonimmediate_operand" "rm")) + (match_operand:V2DI 1 "register_operand" "0") + (match_operand:SI 3 "const_pow2_1_to_2_operand" "n")))] + "TARGET_SSE4_1 && TARGET_64BIT" +{ + operands[3] = GEN_INT (exact_log2 (INTVAL (operands[3]))); + return "pinsrq\t{%3, %2, %0|%0, %2, %3}"; +} + [(set_attr "type" "sselog") + (set_attr "prefix_rex" "1") + (set_attr "prefix_extra" "1") + (set_attr "length_immediate" "1") + (set_attr "mode" "TI")]) + +(define_insn "*sse4_1_pextrb_" + [(set (match_operand:SWI48 0 "register_operand" "=r") + (zero_extend:SWI48 + (vec_select:QI + (match_operand:V16QI 1 "register_operand" "x") + (parallel [(match_operand:SI 2 "const_0_to_15_operand" "n")]))))] + "TARGET_SSE4_1" + "%vpextrb\t{%2, %1, %k0|%k0, %1, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix_extra" "1") + (set_attr "length_immediate" "1") + (set_attr "prefix" "maybe_vex") + (set_attr "mode" "TI")]) + +(define_insn "*sse4_1_pextrb_memory" + [(set (match_operand:QI 0 "memory_operand" "=m") + (vec_select:QI + (match_operand:V16QI 1 "register_operand" "x") + (parallel [(match_operand:SI 2 "const_0_to_15_operand" "n")])))] + "TARGET_SSE4_1" + "%vpextrb\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix_extra" "1") + (set_attr "length_immediate" "1") + (set_attr "prefix" "maybe_vex") + (set_attr "mode" "TI")]) + +(define_insn "*sse2_pextrw_" + [(set (match_operand:SWI48 0 "register_operand" "=r") + (zero_extend:SWI48 + (vec_select:HI + (match_operand:V8HI 1 "register_operand" "x") + (parallel [(match_operand:SI 2 "const_0_to_7_operand" "n")]))))] + "TARGET_SSE2" + "%vpextrw\t{%2, %1, %k0|%k0, %1, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix_data16" "1") + (set_attr "length_immediate" "1") + (set_attr "prefix" "maybe_vex") + (set_attr "mode" "TI")]) + +(define_insn "*sse4_1_pextrw_memory" + [(set (match_operand:HI 0 "memory_operand" "=m") + (vec_select:HI + (match_operand:V8HI 1 "register_operand" "x") + (parallel [(match_operand:SI 2 "const_0_to_7_operand" "n")])))] + "TARGET_SSE4_1" + "%vpextrw\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix_extra" "1") + (set_attr "length_immediate" "1") + (set_attr "prefix" "maybe_vex") + (set_attr "mode" "TI")]) + +(define_insn "*sse4_1_pextrd" + [(set 
(match_operand:SI 0 "nonimmediate_operand" "=rm") + (vec_select:SI + (match_operand:V4SI 1 "register_operand" "x") + (parallel [(match_operand:SI 2 "const_0_to_3_operand" "n")])))] + "TARGET_SSE4_1" + "%vpextrd\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix_extra" "1") + (set_attr "length_immediate" "1") + (set_attr "prefix" "maybe_vex") + (set_attr "mode" "TI")]) + +(define_insn "*sse4_1_pextrd_zext" + [(set (match_operand:DI 0 "register_operand" "=r") + (zero_extend:DI + (vec_select:SI + (match_operand:V4SI 1 "register_operand" "x") + (parallel [(match_operand:SI 2 "const_0_to_3_operand" "n")]))))] + "TARGET_64BIT && TARGET_SSE4_1" + "%vpextrd\t{%2, %1, %k0|%k0, %1, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix_extra" "1") + (set_attr "length_immediate" "1") + (set_attr "prefix" "maybe_vex") + (set_attr "mode" "TI")]) + +;; It must come before *vec_extractv2di_1_sse since it is preferred. +(define_insn "*sse4_1_pextrq" + [(set (match_operand:DI 0 "nonimmediate_operand" "=rm") + (vec_select:DI + (match_operand:V2DI 1 "register_operand" "x") + (parallel [(match_operand:SI 2 "const_0_to_1_operand" "n")])))] + "TARGET_SSE4_1 && TARGET_64BIT" + "%vpextrq\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix_rex" "1") + (set_attr "prefix_extra" "1") + (set_attr "length_immediate" "1") + (set_attr "prefix" "maybe_vex") + (set_attr "mode" "TI")]) + +(define_expand "sse2_pshufd" + [(match_operand:V4SI 0 "register_operand" "") + (match_operand:V4SI 1 "nonimmediate_operand" "") + (match_operand:SI 2 "const_int_operand" "")] + "TARGET_SSE2" +{ + int mask = INTVAL (operands[2]); + emit_insn (gen_sse2_pshufd_1 (operands[0], operands[1], + GEN_INT ((mask >> 0) & 3), + GEN_INT ((mask >> 2) & 3), + GEN_INT ((mask >> 4) & 3), + GEN_INT ((mask >> 6) & 3))); + DONE; +}) + +(define_insn "sse2_pshufd_1" + [(set (match_operand:V4SI 0 "register_operand" "=x") + (vec_select:V4SI + (match_operand:V4SI 1 "nonimmediate_operand" "xm") + (parallel [(match_operand 2 "const_0_to_3_operand" "") + (match_operand 3 "const_0_to_3_operand" "") + (match_operand 4 "const_0_to_3_operand" "") + (match_operand 5 "const_0_to_3_operand" "")])))] + "TARGET_SSE2" +{ + int mask = 0; + mask |= INTVAL (operands[2]) << 0; + mask |= INTVAL (operands[3]) << 2; + mask |= INTVAL (operands[4]) << 4; + mask |= INTVAL (operands[5]) << 6; + operands[2] = GEN_INT (mask); + + return "%vpshufd\t{%2, %1, %0|%0, %1, %2}"; +} + [(set_attr "type" "sselog1") + (set_attr "prefix_data16" "1") + (set_attr "prefix" "maybe_vex") + (set_attr "length_immediate" "1") + (set_attr "mode" "TI")]) + +(define_expand "sse2_pshuflw" + [(match_operand:V8HI 0 "register_operand" "") + (match_operand:V8HI 1 "nonimmediate_operand" "") + (match_operand:SI 2 "const_int_operand" "")] + "TARGET_SSE2" +{ + int mask = INTVAL (operands[2]); + emit_insn (gen_sse2_pshuflw_1 (operands[0], operands[1], + GEN_INT ((mask >> 0) & 3), + GEN_INT ((mask >> 2) & 3), + GEN_INT ((mask >> 4) & 3), + GEN_INT ((mask >> 6) & 3))); + DONE; +}) + +(define_insn "sse2_pshuflw_1" + [(set (match_operand:V8HI 0 "register_operand" "=x") + (vec_select:V8HI + (match_operand:V8HI 1 "nonimmediate_operand" "xm") + (parallel [(match_operand 2 "const_0_to_3_operand" "") + (match_operand 3 "const_0_to_3_operand" "") + (match_operand 4 "const_0_to_3_operand" "") + (match_operand 5 "const_0_to_3_operand" "") + (const_int 4) + (const_int 5) + (const_int 6) + (const_int 7)])))] + "TARGET_SSE2" +{ + int mask = 0; + mask |= INTVAL (operands[2]) << 0; + mask 
|= INTVAL (operands[3]) << 2; + mask |= INTVAL (operands[4]) << 4; + mask |= INTVAL (operands[5]) << 6; + operands[2] = GEN_INT (mask); + + return "%vpshuflw\t{%2, %1, %0|%0, %1, %2}"; +} + [(set_attr "type" "sselog") + (set_attr "prefix_data16" "0") + (set_attr "prefix_rep" "1") + (set_attr "prefix" "maybe_vex") + (set_attr "length_immediate" "1") + (set_attr "mode" "TI")]) + +(define_expand "sse2_pshufhw" + [(match_operand:V8HI 0 "register_operand" "") + (match_operand:V8HI 1 "nonimmediate_operand" "") + (match_operand:SI 2 "const_int_operand" "")] + "TARGET_SSE2" +{ + int mask = INTVAL (operands[2]); + emit_insn (gen_sse2_pshufhw_1 (operands[0], operands[1], + GEN_INT (((mask >> 0) & 3) + 4), + GEN_INT (((mask >> 2) & 3) + 4), + GEN_INT (((mask >> 4) & 3) + 4), + GEN_INT (((mask >> 6) & 3) + 4))); + DONE; +}) + +(define_insn "sse2_pshufhw_1" + [(set (match_operand:V8HI 0 "register_operand" "=x") + (vec_select:V8HI + (match_operand:V8HI 1 "nonimmediate_operand" "xm") + (parallel [(const_int 0) + (const_int 1) + (const_int 2) + (const_int 3) + (match_operand 2 "const_4_to_7_operand" "") + (match_operand 3 "const_4_to_7_operand" "") + (match_operand 4 "const_4_to_7_operand" "") + (match_operand 5 "const_4_to_7_operand" "")])))] + "TARGET_SSE2" +{ + int mask = 0; + mask |= (INTVAL (operands[2]) - 4) << 0; + mask |= (INTVAL (operands[3]) - 4) << 2; + mask |= (INTVAL (operands[4]) - 4) << 4; + mask |= (INTVAL (operands[5]) - 4) << 6; + operands[2] = GEN_INT (mask); + + return "%vpshufhw\t{%2, %1, %0|%0, %1, %2}"; +} + [(set_attr "type" "sselog") + (set_attr "prefix_rep" "1") + (set_attr "prefix_data16" "0") + (set_attr "prefix" "maybe_vex") + (set_attr "length_immediate" "1") + (set_attr "mode" "TI")]) + +(define_expand "sse2_loadd" + [(set (match_operand:V4SI 0 "register_operand" "") + (vec_merge:V4SI + (vec_duplicate:V4SI + (match_operand:SI 1 "nonimmediate_operand" "")) + (match_dup 2) + (const_int 1)))] + "TARGET_SSE" + "operands[2] = CONST0_RTX (V4SImode);") + +(define_insn "*avx_loadld" + [(set (match_operand:V4SI 0 "register_operand" "=x,Yi,x") + (vec_merge:V4SI + (vec_duplicate:V4SI + (match_operand:SI 2 "nonimmediate_operand" "m ,r ,x")) + (match_operand:V4SI 1 "reg_or_0_operand" "C ,C ,x") + (const_int 1)))] + "TARGET_AVX" + "@ + vmovd\t{%2, %0|%0, %2} + vmovd\t{%2, %0|%0, %2} + vmovss\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "ssemov") + (set_attr "prefix" "vex") + (set_attr "mode" "TI,TI,V4SF")]) + +(define_insn "sse2_loadld" + [(set (match_operand:V4SI 0 "register_operand" "=Y2,Yi,x,x") + (vec_merge:V4SI + (vec_duplicate:V4SI + (match_operand:SI 2 "nonimmediate_operand" "m ,r ,m,x")) + (match_operand:V4SI 1 "reg_or_0_operand" "C ,C ,C,0") + (const_int 1)))] + "TARGET_SSE" + "@ + movd\t{%2, %0|%0, %2} + movd\t{%2, %0|%0, %2} + movss\t{%2, %0|%0, %2} + movss\t{%2, %0|%0, %2}" + [(set_attr "type" "ssemov") + (set_attr "mode" "TI,TI,V4SF,SF")]) + +(define_insn_and_split "sse2_stored" + [(set (match_operand:SI 0 "nonimmediate_operand" "=mx,r") + (vec_select:SI + (match_operand:V4SI 1 "register_operand" "x,Yi") + (parallel [(const_int 0)])))] + "TARGET_SSE" + "#" + "&& reload_completed + && (TARGET_INTER_UNIT_MOVES + || MEM_P (operands [0]) + || !GENERAL_REGNO_P (true_regnum (operands [0])))" + [(set (match_dup 0) (match_dup 1))] + "operands[1] = gen_rtx_REG (SImode, REGNO (operands[1]));") + +(define_insn_and_split "*vec_ext_v4si_mem" + [(set (match_operand:SI 0 "register_operand" "=r") + (vec_select:SI + (match_operand:V4SI 1 "memory_operand" "o") + (parallel [(match_operand 2 
"const_0_to_3_operand" "")])))] + "" + "#" + "reload_completed" + [(const_int 0)] +{ + int i = INTVAL (operands[2]); + + emit_move_insn (operands[0], adjust_address (operands[1], SImode, i*4)); + DONE; +}) + +(define_expand "sse_storeq" + [(set (match_operand:DI 0 "nonimmediate_operand" "") + (vec_select:DI + (match_operand:V2DI 1 "register_operand" "") + (parallel [(const_int 0)])))] + "TARGET_SSE") + +(define_insn "*sse2_storeq_rex64" + [(set (match_operand:DI 0 "nonimmediate_operand" "=mx,*r,r") + (vec_select:DI + (match_operand:V2DI 1 "nonimmediate_operand" "x,Yi,o") + (parallel [(const_int 0)])))] + "TARGET_64BIT && !(MEM_P (operands[0]) && MEM_P (operands[1]))" + "@ + # + # + mov{q}\t{%1, %0|%0, %1}" + [(set_attr "type" "*,*,imov") + (set_attr "mode" "*,*,DI")]) + +(define_insn "*sse2_storeq" + [(set (match_operand:DI 0 "nonimmediate_operand" "=mx") + (vec_select:DI + (match_operand:V2DI 1 "register_operand" "x") + (parallel [(const_int 0)])))] + "TARGET_SSE" + "#") + +(define_split + [(set (match_operand:DI 0 "nonimmediate_operand" "") + (vec_select:DI + (match_operand:V2DI 1 "register_operand" "") + (parallel [(const_int 0)])))] + "TARGET_SSE + && reload_completed + && (TARGET_INTER_UNIT_MOVES + || MEM_P (operands [0]) + || !GENERAL_REGNO_P (true_regnum (operands [0])))" + [(set (match_dup 0) (match_dup 1))] + "operands[1] = gen_rtx_REG (DImode, REGNO (operands[1]));") + +(define_insn "*vec_extractv2di_1_rex64_avx" + [(set (match_operand:DI 0 "nonimmediate_operand" "=m,x,x,r") + (vec_select:DI + (match_operand:V2DI 1 "nonimmediate_operand" "x,x,o,o") + (parallel [(const_int 1)])))] + "TARGET_64BIT + && TARGET_AVX + && !(MEM_P (operands[0]) && MEM_P (operands[1]))" + "@ + vmovhps\t{%1, %0|%0, %1} + vpsrldq\t{$8, %1, %0|%0, %1, 8} + vmovq\t{%H1, %0|%0, %H1} + mov{q}\t{%H1, %0|%0, %H1}" + [(set_attr "type" "ssemov,sseishft1,ssemov,imov") + (set_attr "length_immediate" "*,1,*,*") + (set_attr "memory" "*,none,*,*") + (set_attr "prefix" "vex,vex,vex,orig") + (set_attr "mode" "V2SF,TI,TI,DI")]) + +(define_insn "*vec_extractv2di_1_rex64" + [(set (match_operand:DI 0 "nonimmediate_operand" "=m,x,x,r") + (vec_select:DI + (match_operand:V2DI 1 "nonimmediate_operand" "x,0,o,o") + (parallel [(const_int 1)])))] + "TARGET_64BIT && !(MEM_P (operands[0]) && MEM_P (operands[1]))" + "@ + movhps\t{%1, %0|%0, %1} + psrldq\t{$8, %0|%0, 8} + movq\t{%H1, %0|%0, %H1} + mov{q}\t{%H1, %0|%0, %H1}" + [(set_attr "type" "ssemov,sseishft1,ssemov,imov") + (set_attr "length_immediate" "*,1,*,*") + (set_attr "memory" "*,none,*,*") + (set_attr "mode" "V2SF,TI,TI,DI")]) + +(define_insn "*vec_extractv2di_1_avx" + [(set (match_operand:DI 0 "nonimmediate_operand" "=m,x,x") + (vec_select:DI + (match_operand:V2DI 1 "nonimmediate_operand" "x,x,o") + (parallel [(const_int 1)])))] + "!TARGET_64BIT + && TARGET_AVX + && !(MEM_P (operands[0]) && MEM_P (operands[1]))" + "@ + vmovhps\t{%1, %0|%0, %1} + vpsrldq\t{$8, %1, %0|%0, %1, 8} + vmovq\t{%H1, %0|%0, %H1}" + [(set_attr "type" "ssemov,sseishft1,ssemov") + (set_attr "length_immediate" "*,1,*") + (set_attr "memory" "*,none,*") + (set_attr "prefix" "vex") + (set_attr "mode" "V2SF,TI,TI")]) + +(define_insn "*vec_extractv2di_1_sse2" + [(set (match_operand:DI 0 "nonimmediate_operand" "=m,x,x") + (vec_select:DI + (match_operand:V2DI 1 "nonimmediate_operand" "x,0,o") + (parallel [(const_int 1)])))] + "!TARGET_64BIT + && TARGET_SSE2 && !(MEM_P (operands[0]) && MEM_P (operands[1]))" + "@ + movhps\t{%1, %0|%0, %1} + psrldq\t{$8, %0|%0, 8} + movq\t{%H1, %0|%0, %H1}" + [(set_attr "type" 
"ssemov,sseishft1,ssemov") + (set_attr "length_immediate" "*,1,*") + (set_attr "memory" "*,none,*") + (set_attr "mode" "V2SF,TI,TI")]) + +;; Not sure this is ever used, but it doesn't hurt to have it. -aoliva +(define_insn "*vec_extractv2di_1_sse" + [(set (match_operand:DI 0 "nonimmediate_operand" "=m,x,x") + (vec_select:DI + (match_operand:V2DI 1 "nonimmediate_operand" "x,x,o") + (parallel [(const_int 1)])))] + "!TARGET_SSE2 && TARGET_SSE + && !(MEM_P (operands[0]) && MEM_P (operands[1]))" + "@ + movhps\t{%1, %0|%0, %1} + movhlps\t{%1, %0|%0, %1} + movlps\t{%H1, %0|%0, %H1}" + [(set_attr "type" "ssemov") + (set_attr "mode" "V2SF,V4SF,V2SF")]) + +(define_insn "*vec_dupv4si_avx" + [(set (match_operand:V4SI 0 "register_operand" "=x,x") + (vec_duplicate:V4SI + (match_operand:SI 1 "register_operand" "x,m")))] + "TARGET_AVX" + "@ + vpshufd\t{$0, %1, %0|%0, %1, 0} + vbroadcastss\t{%1, %0|%0, %1}" + [(set_attr "type" "sselog1,ssemov") + (set_attr "length_immediate" "1,0") + (set_attr "prefix_extra" "0,1") + (set_attr "prefix" "vex") + (set_attr "mode" "TI,V4SF")]) + +(define_insn "*vec_dupv4si" + [(set (match_operand:V4SI 0 "register_operand" "=Y2,x") + (vec_duplicate:V4SI + (match_operand:SI 1 "register_operand" " Y2,0")))] + "TARGET_SSE" + "@ + %vpshufd\t{$0, %1, %0|%0, %1, 0} + shufps\t{$0, %0, %0|%0, %0, 0}" + [(set_attr "type" "sselog1") + (set_attr "length_immediate" "1") + (set_attr "mode" "TI,V4SF")]) + +(define_insn "*vec_dupv2di_avx" + [(set (match_operand:V2DI 0 "register_operand" "=x,x") + (vec_duplicate:V2DI + (match_operand:DI 1 "nonimmediate_operand" " x,m")))] + "TARGET_AVX" + "@ + vpunpcklqdq\t{%1, %1, %0|%0, %1, %1} + vmovddup\t{%1, %0|%0, %1}" + [(set_attr "type" "sselog1") + (set_attr "prefix" "vex") + (set_attr "mode" "TI,DF")]) + +(define_insn "*vec_dupv2di_sse3" + [(set (match_operand:V2DI 0 "register_operand" "=x,x") + (vec_duplicate:V2DI + (match_operand:DI 1 "nonimmediate_operand" " 0,m")))] + "TARGET_SSE3" + "@ + punpcklqdq\t%0, %0 + movddup\t{%1, %0|%0, %1}" + [(set_attr "type" "sselog1") + (set_attr "mode" "TI,DF")]) + +(define_insn "*vec_dupv2di" + [(set (match_operand:V2DI 0 "register_operand" "=Y2,x") + (vec_duplicate:V2DI + (match_operand:DI 1 "register_operand" " 0 ,0")))] + "TARGET_SSE" + "@ + punpcklqdq\t%0, %0 + movlhps\t%0, %0" + [(set_attr "type" "sselog1,ssemov") + (set_attr "mode" "TI,V4SF")]) + +(define_insn "*vec_concatv2si_avx" + [(set (match_operand:V2SI 0 "register_operand" "=x,x,x ,*y ,*y") + (vec_concat:V2SI + (match_operand:SI 1 "nonimmediate_operand" "x ,x,rm, 0 ,rm") + (match_operand:SI 2 "vector_move_operand" "rm,x,C ,*ym,C")))] + "TARGET_AVX" + "@ + vpinsrd\t{$0x1, %2, %1, %0|%0, %1, %2, 0x1} + vpunpckldq\t{%2, %1, %0|%0, %1, %2} + vmovd\t{%1, %0|%0, %1} + punpckldq\t{%2, %0|%0, %2} + movd\t{%1, %0|%0, %1}" + [(set_attr "type" "sselog,sselog,ssemov,mmxcvt,mmxmov") + (set_attr "prefix_extra" "1,*,*,*,*") + (set_attr "length_immediate" "1,*,*,*,*") + (set (attr "prefix") + (if_then_else (eq_attr "alternative" "3,4") + (const_string "orig") + (const_string "vex"))) + (set_attr "mode" "TI,TI,TI,DI,DI")]) + +(define_insn "*vec_concatv2si_sse4_1" + [(set (match_operand:V2SI 0 "register_operand" "=x,x,x ,*y ,*y") + (vec_concat:V2SI + (match_operand:SI 1 "nonimmediate_operand" "0 ,0,rm, 0 ,rm") + (match_operand:SI 2 "vector_move_operand" "rm,x,C ,*ym,C")))] + "TARGET_SSE4_1" + "@ + pinsrd\t{$0x1, %2, %0|%0, %2, 0x1} + punpckldq\t{%2, %0|%0, %2} + movd\t{%1, %0|%0, %1} + punpckldq\t{%2, %0|%0, %2} + movd\t{%1, %0|%0, %1}" + [(set_attr "type" 
"sselog,sselog,ssemov,mmxcvt,mmxmov") + (set_attr "prefix_extra" "1,*,*,*,*") + (set_attr "length_immediate" "1,*,*,*,*") + (set_attr "mode" "TI,TI,TI,DI,DI")]) + +;; ??? In theory we can match memory for the MMX alternative, but allowing +;; nonimmediate_operand for operand 2 and *not* allowing memory for the SSE +;; alternatives pretty much forces the MMX alternative to be chosen. +(define_insn "*vec_concatv2si_sse2" + [(set (match_operand:V2SI 0 "register_operand" "=x,x ,*y,*y") + (vec_concat:V2SI + (match_operand:SI 1 "nonimmediate_operand" " 0,rm, 0,rm") + (match_operand:SI 2 "reg_or_0_operand" " x,C ,*y, C")))] + "TARGET_SSE2" + "@ + punpckldq\t{%2, %0|%0, %2} + movd\t{%1, %0|%0, %1} + punpckldq\t{%2, %0|%0, %2} + movd\t{%1, %0|%0, %1}" + [(set_attr "type" "sselog,ssemov,mmxcvt,mmxmov") + (set_attr "mode" "TI,TI,DI,DI")]) + +(define_insn "*vec_concatv2si_sse" + [(set (match_operand:V2SI 0 "register_operand" "=x,x,*y,*y") + (vec_concat:V2SI + (match_operand:SI 1 "nonimmediate_operand" " 0,m, 0,*rm") + (match_operand:SI 2 "reg_or_0_operand" " x,C,*y,C")))] + "TARGET_SSE" + "@ + unpcklps\t{%2, %0|%0, %2} + movss\t{%1, %0|%0, %1} + punpckldq\t{%2, %0|%0, %2} + movd\t{%1, %0|%0, %1}" + [(set_attr "type" "sselog,ssemov,mmxcvt,mmxmov") + (set_attr "mode" "V4SF,V4SF,DI,DI")]) + +(define_insn "*vec_concatv4si_1_avx" + [(set (match_operand:V4SI 0 "register_operand" "=x,x") + (vec_concat:V4SI + (match_operand:V2SI 1 "register_operand" " x,x") + (match_operand:V2SI 2 "nonimmediate_operand" " x,m")))] + "TARGET_AVX" + "@ + vpunpcklqdq\t{%2, %1, %0|%0, %1, %2} + vmovhps\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog,ssemov") + (set_attr "prefix" "vex") + (set_attr "mode" "TI,V2SF")]) + +(define_insn "*vec_concatv4si_1" + [(set (match_operand:V4SI 0 "register_operand" "=Y2,x,x") + (vec_concat:V4SI + (match_operand:V2SI 1 "register_operand" " 0 ,0,0") + (match_operand:V2SI 2 "nonimmediate_operand" " Y2,x,m")))] + "TARGET_SSE" + "@ + punpcklqdq\t{%2, %0|%0, %2} + movlhps\t{%2, %0|%0, %2} + movhps\t{%2, %0|%0, %2}" + [(set_attr "type" "sselog,ssemov,ssemov") + (set_attr "mode" "TI,V4SF,V2SF")]) + +(define_insn "*vec_concatv2di_avx" + [(set (match_operand:V2DI 0 "register_operand" "=x,?x,x,x") + (vec_concat:V2DI + (match_operand:DI 1 "nonimmediate_operand" " m,*y,x,x") + (match_operand:DI 2 "vector_move_operand" " C, C,x,m")))] + "!TARGET_64BIT && TARGET_AVX" + "@ + vmovq\t{%1, %0|%0, %1} + movq2dq\t{%1, %0|%0, %1} + vpunpcklqdq\t{%2, %1, %0|%0, %1, %2} + vmovhps\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "ssemov,ssemov,sselog,ssemov") + (set (attr "prefix") + (if_then_else (eq_attr "alternative" "1") + (const_string "orig") + (const_string "vex"))) + (set_attr "mode" "TI,TI,TI,V2SF")]) + +(define_insn "vec_concatv2di" + [(set (match_operand:V2DI 0 "register_operand" "=Y2 ,?Y2,Y2,x,x") + (vec_concat:V2DI + (match_operand:DI 1 "nonimmediate_operand" " mY2,*y ,0 ,0,0") + (match_operand:DI 2 "vector_move_operand" " C , C,Y2,x,m")))] + "!TARGET_64BIT && TARGET_SSE" + "@ + movq\t{%1, %0|%0, %1} + movq2dq\t{%1, %0|%0, %1} + punpcklqdq\t{%2, %0|%0, %2} + movlhps\t{%2, %0|%0, %2} + movhps\t{%2, %0|%0, %2}" + [(set_attr "type" "ssemov,ssemov,sselog,ssemov,ssemov") + (set_attr "mode" "TI,TI,TI,V4SF,V2SF")]) + +(define_insn "*vec_concatv2di_rex64_avx" + [(set (match_operand:V2DI 0 "register_operand" "=x,x,Yi,!x,x,x") + (vec_concat:V2DI + (match_operand:DI 1 "nonimmediate_operand" " x,m,r ,*y,x,x") + (match_operand:DI 2 "vector_move_operand" "rm,C,C ,C ,x,m")))] + "TARGET_64BIT && TARGET_AVX" + "@ + 
vpinsrq\t{$0x1, %2, %1, %0|%0, %1, %2, 0x1} + vmovq\t{%1, %0|%0, %1} + vmovq\t{%1, %0|%0, %1} + movq2dq\t{%1, %0|%0, %1} + vpunpcklqdq\t{%2, %1, %0|%0, %1, %2} + vmovhps\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog,ssemov,ssemov,ssemov,sselog,ssemov") + (set_attr "prefix_extra" "1,*,*,*,*,*") + (set_attr "length_immediate" "1,*,*,*,*,*") + (set (attr "prefix") + (if_then_else (eq_attr "alternative" "3") + (const_string "orig") + (const_string "vex"))) + (set_attr "mode" "TI,TI,TI,TI,TI,V2SF")]) + +;; movd instead of movq is required to handle broken assemblers. +(define_insn "*vec_concatv2di_rex64_sse4_1" + [(set (match_operand:V2DI 0 "register_operand" "=x ,x ,Yi,!x,x,x,x") + (vec_concat:V2DI + (match_operand:DI 1 "nonimmediate_operand" " 0 ,mx,r ,*y,0,0,0") + (match_operand:DI 2 "vector_move_operand" " rm,C ,C ,C ,x,x,m")))] + "TARGET_64BIT && TARGET_SSE4_1" + "@ + pinsrq\t{$0x1, %2, %0|%0, %2, 0x1} + movq\t{%1, %0|%0, %1} + movd\t{%1, %0|%0, %1} + movq2dq\t{%1, %0|%0, %1} + punpcklqdq\t{%2, %0|%0, %2} + movlhps\t{%2, %0|%0, %2} + movhps\t{%2, %0|%0, %2}" + [(set_attr "type" "sselog,ssemov,ssemov,ssemov,sselog,ssemov,ssemov") + (set_attr "prefix_rex" "1,*,1,*,*,*,*") + (set_attr "prefix_extra" "1,*,*,*,*,*,*") + (set_attr "length_immediate" "1,*,*,*,*,*,*") + (set_attr "mode" "TI,TI,TI,TI,TI,V4SF,V2SF")]) + +;; movd instead of movq is required to handle broken assemblers. +(define_insn "*vec_concatv2di_rex64_sse" + [(set (match_operand:V2DI 0 "register_operand" "=Y2 ,Yi,!Y2,Y2,x,x") + (vec_concat:V2DI + (match_operand:DI 1 "nonimmediate_operand" " mY2,r ,*y ,0 ,0,0") + (match_operand:DI 2 "vector_move_operand" " C ,C ,C ,Y2,x,m")))] + "TARGET_64BIT && TARGET_SSE" + "@ + movq\t{%1, %0|%0, %1} + movd\t{%1, %0|%0, %1} + movq2dq\t{%1, %0|%0, %1} + punpcklqdq\t{%2, %0|%0, %2} + movlhps\t{%2, %0|%0, %2} + movhps\t{%2, %0|%0, %2}" + [(set_attr "type" "ssemov,ssemov,ssemov,sselog,ssemov,ssemov") + (set_attr "prefix_rex" "*,1,*,*,*,*") + (set_attr "mode" "TI,TI,TI,TI,V4SF,V2SF")]) + +(define_expand "vec_unpacku_hi_v16qi" + [(match_operand:V8HI 0 "register_operand" "") + (match_operand:V16QI 1 "register_operand" "")] + "TARGET_SSE2" +{ + if (TARGET_SSE4_1) + ix86_expand_sse4_unpack (operands, true, true); + else + ix86_expand_sse_unpack (operands, true, true); + DONE; +}) + +(define_expand "vec_unpacks_hi_v16qi" + [(match_operand:V8HI 0 "register_operand" "") + (match_operand:V16QI 1 "register_operand" "")] + "TARGET_SSE2" +{ + if (TARGET_SSE4_1) + ix86_expand_sse4_unpack (operands, false, true); + else + ix86_expand_sse_unpack (operands, false, true); + DONE; +}) + +(define_expand "vec_unpacku_lo_v16qi" + [(match_operand:V8HI 0 "register_operand" "") + (match_operand:V16QI 1 "register_operand" "")] + "TARGET_SSE2" +{ + if (TARGET_SSE4_1) + ix86_expand_sse4_unpack (operands, true, false); + else + ix86_expand_sse_unpack (operands, true, false); + DONE; +}) + +(define_expand "vec_unpacks_lo_v16qi" + [(match_operand:V8HI 0 "register_operand" "") + (match_operand:V16QI 1 "register_operand" "")] + "TARGET_SSE2" +{ + if (TARGET_SSE4_1) + ix86_expand_sse4_unpack (operands, false, false); + else + ix86_expand_sse_unpack (operands, false, false); + DONE; +}) + +(define_expand "vec_unpacku_hi_v8hi" + [(match_operand:V4SI 0 "register_operand" "") + (match_operand:V8HI 1 "register_operand" "")] + "TARGET_SSE2" +{ + if (TARGET_SSE4_1) + ix86_expand_sse4_unpack (operands, true, true); + else + ix86_expand_sse_unpack (operands, true, true); + DONE; +}) + +(define_expand "vec_unpacks_hi_v8hi" + 
[(match_operand:V4SI 0 "register_operand" "") + (match_operand:V8HI 1 "register_operand" "")] + "TARGET_SSE2" +{ + if (TARGET_SSE4_1) + ix86_expand_sse4_unpack (operands, false, true); + else + ix86_expand_sse_unpack (operands, false, true); + DONE; +}) + +(define_expand "vec_unpacku_lo_v8hi" + [(match_operand:V4SI 0 "register_operand" "") + (match_operand:V8HI 1 "register_operand" "")] + "TARGET_SSE2" +{ + if (TARGET_SSE4_1) + ix86_expand_sse4_unpack (operands, true, false); + else + ix86_expand_sse_unpack (operands, true, false); + DONE; +}) + +(define_expand "vec_unpacks_lo_v8hi" + [(match_operand:V4SI 0 "register_operand" "") + (match_operand:V8HI 1 "register_operand" "")] + "TARGET_SSE2" +{ + if (TARGET_SSE4_1) + ix86_expand_sse4_unpack (operands, false, false); + else + ix86_expand_sse_unpack (operands, false, false); + DONE; +}) + +(define_expand "vec_unpacku_hi_v4si" + [(match_operand:V2DI 0 "register_operand" "") + (match_operand:V4SI 1 "register_operand" "")] + "TARGET_SSE2" +{ + if (TARGET_SSE4_1) + ix86_expand_sse4_unpack (operands, true, true); + else + ix86_expand_sse_unpack (operands, true, true); + DONE; +}) + +(define_expand "vec_unpacks_hi_v4si" + [(match_operand:V2DI 0 "register_operand" "") + (match_operand:V4SI 1 "register_operand" "")] + "TARGET_SSE2" +{ + if (TARGET_SSE4_1) + ix86_expand_sse4_unpack (operands, false, true); + else + ix86_expand_sse_unpack (operands, false, true); + DONE; +}) + +(define_expand "vec_unpacku_lo_v4si" + [(match_operand:V2DI 0 "register_operand" "") + (match_operand:V4SI 1 "register_operand" "")] + "TARGET_SSE2" +{ + if (TARGET_SSE4_1) + ix86_expand_sse4_unpack (operands, true, false); + else + ix86_expand_sse_unpack (operands, true, false); + DONE; +}) + +(define_expand "vec_unpacks_lo_v4si" + [(match_operand:V2DI 0 "register_operand" "") + (match_operand:V4SI 1 "register_operand" "")] + "TARGET_SSE2" +{ + if (TARGET_SSE4_1) + ix86_expand_sse4_unpack (operands, false, false); + else + ix86_expand_sse_unpack (operands, false, false); + DONE; +}) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; +;; Miscellaneous +;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(define_expand "sse2_uavgv16qi3" + [(set (match_operand:V16QI 0 "register_operand" "") + (truncate:V16QI + (lshiftrt:V16HI + (plus:V16HI + (plus:V16HI + (zero_extend:V16HI + (match_operand:V16QI 1 "nonimmediate_operand" "")) + (zero_extend:V16HI + (match_operand:V16QI 2 "nonimmediate_operand" ""))) + (const_vector:V16QI [(const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1)])) + (const_int 1))))] + "TARGET_SSE2" + "ix86_fixup_binary_operands_no_copy (PLUS, V16QImode, operands);") + +(define_insn "*avx_uavgv16qi3" + [(set (match_operand:V16QI 0 "register_operand" "=x") + (truncate:V16QI + (lshiftrt:V16HI + (plus:V16HI + (plus:V16HI + (zero_extend:V16HI + (match_operand:V16QI 1 "nonimmediate_operand" "%x")) + (zero_extend:V16HI + (match_operand:V16QI 2 "nonimmediate_operand" "xm"))) + (const_vector:V16QI [(const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1)])) + (const_int 1))))] + "TARGET_AVX && ix86_binary_operator_ok (PLUS, V16QImode, operands)" + "vpavgb\t{%2, %1, 
%0|%0, %1, %2}" + [(set_attr "type" "sseiadd") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + +(define_insn "*sse2_uavgv16qi3" + [(set (match_operand:V16QI 0 "register_operand" "=x") + (truncate:V16QI + (lshiftrt:V16HI + (plus:V16HI + (plus:V16HI + (zero_extend:V16HI + (match_operand:V16QI 1 "nonimmediate_operand" "%0")) + (zero_extend:V16HI + (match_operand:V16QI 2 "nonimmediate_operand" "xm"))) + (const_vector:V16QI [(const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1)])) + (const_int 1))))] + "TARGET_SSE2 && ix86_binary_operator_ok (PLUS, V16QImode, operands)" + "pavgb\t{%2, %0|%0, %2}" + [(set_attr "type" "sseiadd") + (set_attr "prefix_data16" "1") + (set_attr "mode" "TI")]) + +(define_expand "sse2_uavgv8hi3" + [(set (match_operand:V8HI 0 "register_operand" "") + (truncate:V8HI + (lshiftrt:V8SI + (plus:V8SI + (plus:V8SI + (zero_extend:V8SI + (match_operand:V8HI 1 "nonimmediate_operand" "")) + (zero_extend:V8SI + (match_operand:V8HI 2 "nonimmediate_operand" ""))) + (const_vector:V8HI [(const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1)])) + (const_int 1))))] + "TARGET_SSE2" + "ix86_fixup_binary_operands_no_copy (PLUS, V8HImode, operands);") + +(define_insn "*avx_uavgv8hi3" + [(set (match_operand:V8HI 0 "register_operand" "=x") + (truncate:V8HI + (lshiftrt:V8SI + (plus:V8SI + (plus:V8SI + (zero_extend:V8SI + (match_operand:V8HI 1 "nonimmediate_operand" "%x")) + (zero_extend:V8SI + (match_operand:V8HI 2 "nonimmediate_operand" "xm"))) + (const_vector:V8HI [(const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1)])) + (const_int 1))))] + "TARGET_AVX && ix86_binary_operator_ok (PLUS, V8HImode, operands)" + "vpavgw\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseiadd") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + +(define_insn "*sse2_uavgv8hi3" + [(set (match_operand:V8HI 0 "register_operand" "=x") + (truncate:V8HI + (lshiftrt:V8SI + (plus:V8SI + (plus:V8SI + (zero_extend:V8SI + (match_operand:V8HI 1 "nonimmediate_operand" "%0")) + (zero_extend:V8SI + (match_operand:V8HI 2 "nonimmediate_operand" "xm"))) + (const_vector:V8HI [(const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1)])) + (const_int 1))))] + "TARGET_SSE2 && ix86_binary_operator_ok (PLUS, V8HImode, operands)" + "pavgw\t{%2, %0|%0, %2}" + [(set_attr "type" "sseiadd") + (set_attr "prefix_data16" "1") + (set_attr "mode" "TI")]) + +;; The correct representation for this is absolutely enormous, and +;; surely not generally useful. 
+(define_insn "*avx_psadbw" + [(set (match_operand:V2DI 0 "register_operand" "=x") + (unspec:V2DI [(match_operand:V16QI 1 "register_operand" "x") + (match_operand:V16QI 2 "nonimmediate_operand" "xm")] + UNSPEC_PSADBW))] + "TARGET_AVX" + "vpsadbw\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseiadd") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + +(define_insn "sse2_psadbw" + [(set (match_operand:V2DI 0 "register_operand" "=x") + (unspec:V2DI [(match_operand:V16QI 1 "register_operand" "0") + (match_operand:V16QI 2 "nonimmediate_operand" "xm")] + UNSPEC_PSADBW))] + "TARGET_SSE2" + "psadbw\t{%2, %0|%0, %2}" + [(set_attr "type" "sseiadd") + (set_attr "atom_unit" "simul") + (set_attr "prefix_data16" "1") + (set_attr "mode" "TI")]) + +(define_insn "avx_movmsk256" + [(set (match_operand:SI 0 "register_operand" "=r") + (unspec:SI + [(match_operand:AVX256MODEF2P 1 "register_operand" "x")] + UNSPEC_MOVMSK))] + "AVX256_VEC_FLOAT_MODE_P (mode)" + "vmovmsk\t{%1, %0|%0, %1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + +(define_insn "_movmsk" + [(set (match_operand:SI 0 "register_operand" "=r") + (unspec:SI + [(match_operand:SSEMODEF2P 1 "register_operand" "x")] + UNSPEC_MOVMSK))] + "SSE_VEC_FLOAT_MODE_P (mode)" + "%vmovmsk\t{%1, %0|%0, %1}" + [(set_attr "type" "ssemov") + (set_attr "prefix" "maybe_vex") + (set_attr "mode" "")]) + +(define_insn "sse2_pmovmskb" + [(set (match_operand:SI 0 "register_operand" "=r") + (unspec:SI [(match_operand:V16QI 1 "register_operand" "x")] + UNSPEC_MOVMSK))] + "TARGET_SSE2" + "%vpmovmskb\t{%1, %0|%0, %1}" + [(set_attr "type" "ssemov") + (set_attr "prefix_data16" "1") + (set_attr "prefix" "maybe_vex") + (set_attr "mode" "SI")]) + +(define_expand "sse2_maskmovdqu" + [(set (match_operand:V16QI 0 "memory_operand" "") + (unspec:V16QI [(match_operand:V16QI 1 "register_operand" "") + (match_operand:V16QI 2 "register_operand" "") + (match_dup 0)] + UNSPEC_MASKMOV))] + "TARGET_SSE2") + +(define_insn "*sse2_maskmovdqu" + [(set (mem:V16QI (match_operand:SI 0 "register_operand" "D")) + (unspec:V16QI [(match_operand:V16QI 1 "register_operand" "x") + (match_operand:V16QI 2 "register_operand" "x") + (mem:V16QI (match_dup 0))] + UNSPEC_MASKMOV))] + "TARGET_SSE2 && !TARGET_64BIT" + ;; @@@ check ordering of operands in intel/nonintel syntax + "%vmaskmovdqu\t{%2, %1|%1, %2}" + [(set_attr "type" "ssemov") + (set_attr "prefix_data16" "1") + ;; The implicit %rdi operand confuses default length_vex computation. + (set_attr "length_vex" "3") + (set_attr "prefix" "maybe_vex") + (set_attr "mode" "TI")]) + +(define_insn "*sse2_maskmovdqu_rex64" + [(set (mem:V16QI (match_operand:DI 0 "register_operand" "D")) + (unspec:V16QI [(match_operand:V16QI 1 "register_operand" "x") + (match_operand:V16QI 2 "register_operand" "x") + (mem:V16QI (match_dup 0))] + UNSPEC_MASKMOV))] + "TARGET_SSE2 && TARGET_64BIT" + ;; @@@ check ordering of operands in intel/nonintel syntax + "%vmaskmovdqu\t{%2, %1|%1, %2}" + [(set_attr "type" "ssemov") + (set_attr "prefix_data16" "1") + ;; The implicit %rdi operand confuses default length_vex computation. + (set (attr "length_vex") + (symbol_ref ("REGNO (operands[2]) >= FIRST_REX_SSE_REG ? 
3 + 1 : 2 + 1"))) + (set_attr "prefix" "maybe_vex") + (set_attr "mode" "TI")]) + +(define_insn "sse_ldmxcsr" + [(unspec_volatile [(match_operand:SI 0 "memory_operand" "m")] + UNSPECV_LDMXCSR)] + "TARGET_SSE" + "%vldmxcsr\t%0" + [(set_attr "type" "sse") + (set_attr "atom_sse_attr" "mxcsr") + (set_attr "prefix" "maybe_vex") + (set_attr "memory" "load")]) + +(define_insn "sse_stmxcsr" + [(set (match_operand:SI 0 "memory_operand" "=m") + (unspec_volatile:SI [(const_int 0)] UNSPECV_STMXCSR))] + "TARGET_SSE" + "%vstmxcsr\t%0" + [(set_attr "type" "sse") + (set_attr "atom_sse_attr" "mxcsr") + (set_attr "prefix" "maybe_vex") + (set_attr "memory" "store")]) + +(define_expand "sse_sfence" + [(set (match_dup 0) + (unspec:BLK [(match_dup 0)] UNSPEC_SFENCE))] + "TARGET_SSE || TARGET_3DNOW_A" +{ + operands[0] = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (Pmode)); + MEM_VOLATILE_P (operands[0]) = 1; +}) + +(define_insn "*sse_sfence" + [(set (match_operand:BLK 0 "" "") + (unspec:BLK [(match_dup 0)] UNSPEC_SFENCE))] + "TARGET_SSE || TARGET_3DNOW_A" + "sfence" + [(set_attr "type" "sse") + (set_attr "length_address" "0") + (set_attr "atom_sse_attr" "fence") + (set_attr "memory" "unknown")]) + +(define_insn "sse2_clflush" + [(unspec_volatile [(match_operand 0 "address_operand" "p")] + UNSPECV_CLFLUSH)] + "TARGET_SSE2" + "clflush\t%a0" + [(set_attr "type" "sse") + (set_attr "atom_sse_attr" "fence") + (set_attr "memory" "unknown")]) + +(define_expand "sse2_mfence" + [(set (match_dup 0) + (unspec:BLK [(match_dup 0)] UNSPEC_MFENCE))] + "TARGET_SSE2" +{ + operands[0] = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (Pmode)); + MEM_VOLATILE_P (operands[0]) = 1; +}) + +(define_insn "*sse2_mfence" + [(set (match_operand:BLK 0 "" "") + (unspec:BLK [(match_dup 0)] UNSPEC_MFENCE))] + "TARGET_64BIT || TARGET_SSE2" + "mfence" + [(set_attr "type" "sse") + (set_attr "length_address" "0") + (set_attr "atom_sse_attr" "fence") + (set_attr "memory" "unknown")]) + +(define_expand "sse2_lfence" + [(set (match_dup 0) + (unspec:BLK [(match_dup 0)] UNSPEC_LFENCE))] + "TARGET_SSE2" +{ + operands[0] = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (Pmode)); + MEM_VOLATILE_P (operands[0]) = 1; +}) + +(define_insn "*sse2_lfence" + [(set (match_operand:BLK 0 "" "") + (unspec:BLK [(match_dup 0)] UNSPEC_LFENCE))] + "TARGET_SSE2" + "lfence" + [(set_attr "type" "sse") + (set_attr "length_address" "0") + (set_attr "atom_sse_attr" "lfence") + (set_attr "memory" "unknown")]) + +(define_insn "sse3_mwait" + [(unspec_volatile [(match_operand:SI 0 "register_operand" "a") + (match_operand:SI 1 "register_operand" "c")] + UNSPECV_MWAIT)] + "TARGET_SSE3" +;; 64bit version is "mwait %rax,%rcx". But only lower 32bits are used. +;; Since 32bit register operands are implicitly zero extended to 64bit, +;; we only need to set up 32bit registers. + "mwait" + [(set_attr "length" "3")]) + +(define_insn "sse3_monitor" + [(unspec_volatile [(match_operand:SI 0 "register_operand" "a") + (match_operand:SI 1 "register_operand" "c") + (match_operand:SI 2 "register_operand" "d")] + UNSPECV_MONITOR)] + "TARGET_SSE3 && !TARGET_64BIT" + "monitor\t%0, %1, %2" + [(set_attr "length" "3")]) + +(define_insn "sse3_monitor64" + [(unspec_volatile [(match_operand:DI 0 "register_operand" "a") + (match_operand:SI 1 "register_operand" "c") + (match_operand:SI 2 "register_operand" "d")] + UNSPECV_MONITOR)] + "TARGET_SSE3 && TARGET_64BIT" +;; 64bit version is "monitor %rax,%rcx,%rdx". But only lower 32bits in +;; RCX and RDX are used. 
Since 32bit register operands are implicitly +;; zero extended to 64bit, we only need to set up 32bit registers. + "monitor" + [(set_attr "length" "3")]) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; +;; SSSE3 instructions +;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(define_insn "*avx_phaddwv8hi3" + [(set (match_operand:V8HI 0 "register_operand" "=x") + (vec_concat:V8HI + (vec_concat:V4HI + (vec_concat:V2HI + (plus:HI + (vec_select:HI + (match_operand:V8HI 1 "register_operand" "x") + (parallel [(const_int 0)])) + (vec_select:HI (match_dup 1) (parallel [(const_int 1)]))) + (plus:HI + (vec_select:HI (match_dup 1) (parallel [(const_int 2)])) + (vec_select:HI (match_dup 1) (parallel [(const_int 3)])))) + (vec_concat:V2HI + (plus:HI + (vec_select:HI (match_dup 1) (parallel [(const_int 4)])) + (vec_select:HI (match_dup 1) (parallel [(const_int 5)]))) + (plus:HI + (vec_select:HI (match_dup 1) (parallel [(const_int 6)])) + (vec_select:HI (match_dup 1) (parallel [(const_int 7)]))))) + (vec_concat:V4HI + (vec_concat:V2HI + (plus:HI + (vec_select:HI + (match_operand:V8HI 2 "nonimmediate_operand" "xm") + (parallel [(const_int 0)])) + (vec_select:HI (match_dup 2) (parallel [(const_int 1)]))) + (plus:HI + (vec_select:HI (match_dup 2) (parallel [(const_int 2)])) + (vec_select:HI (match_dup 2) (parallel [(const_int 3)])))) + (vec_concat:V2HI + (plus:HI + (vec_select:HI (match_dup 2) (parallel [(const_int 4)])) + (vec_select:HI (match_dup 2) (parallel [(const_int 5)]))) + (plus:HI + (vec_select:HI (match_dup 2) (parallel [(const_int 6)])) + (vec_select:HI (match_dup 2) (parallel [(const_int 7)])))))))] + "TARGET_AVX" + "vphaddw\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseiadd") + (set_attr "prefix_extra" "1") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + +(define_insn "ssse3_phaddwv8hi3" + [(set (match_operand:V8HI 0 "register_operand" "=x") + (vec_concat:V8HI + (vec_concat:V4HI + (vec_concat:V2HI + (plus:HI + (vec_select:HI + (match_operand:V8HI 1 "register_operand" "0") + (parallel [(const_int 0)])) + (vec_select:HI (match_dup 1) (parallel [(const_int 1)]))) + (plus:HI + (vec_select:HI (match_dup 1) (parallel [(const_int 2)])) + (vec_select:HI (match_dup 1) (parallel [(const_int 3)])))) + (vec_concat:V2HI + (plus:HI + (vec_select:HI (match_dup 1) (parallel [(const_int 4)])) + (vec_select:HI (match_dup 1) (parallel [(const_int 5)]))) + (plus:HI + (vec_select:HI (match_dup 1) (parallel [(const_int 6)])) + (vec_select:HI (match_dup 1) (parallel [(const_int 7)]))))) + (vec_concat:V4HI + (vec_concat:V2HI + (plus:HI + (vec_select:HI + (match_operand:V8HI 2 "nonimmediate_operand" "xm") + (parallel [(const_int 0)])) + (vec_select:HI (match_dup 2) (parallel [(const_int 1)]))) + (plus:HI + (vec_select:HI (match_dup 2) (parallel [(const_int 2)])) + (vec_select:HI (match_dup 2) (parallel [(const_int 3)])))) + (vec_concat:V2HI + (plus:HI + (vec_select:HI (match_dup 2) (parallel [(const_int 4)])) + (vec_select:HI (match_dup 2) (parallel [(const_int 5)]))) + (plus:HI + (vec_select:HI (match_dup 2) (parallel [(const_int 6)])) + (vec_select:HI (match_dup 2) (parallel [(const_int 7)])))))))] + "TARGET_SSSE3" + "phaddw\t{%2, %0|%0, %2}" + [(set_attr "type" "sseiadd") + (set_attr "atom_unit" "complex") + (set_attr "prefix_data16" "1") + (set_attr "prefix_extra" "1") + (set_attr "mode" "TI")]) + +(define_insn "ssse3_phaddwv4hi3" + [(set (match_operand:V4HI 0 "register_operand" "=y") + (vec_concat:V4HI + (vec_concat:V2HI + (plus:HI + 
(vec_select:HI + (match_operand:V4HI 1 "register_operand" "0") + (parallel [(const_int 0)])) + (vec_select:HI (match_dup 1) (parallel [(const_int 1)]))) + (plus:HI + (vec_select:HI (match_dup 1) (parallel [(const_int 2)])) + (vec_select:HI (match_dup 1) (parallel [(const_int 3)])))) + (vec_concat:V2HI + (plus:HI + (vec_select:HI + (match_operand:V4HI 2 "nonimmediate_operand" "ym") + (parallel [(const_int 0)])) + (vec_select:HI (match_dup 2) (parallel [(const_int 1)]))) + (plus:HI + (vec_select:HI (match_dup 2) (parallel [(const_int 2)])) + (vec_select:HI (match_dup 2) (parallel [(const_int 3)]))))))] + "TARGET_SSSE3" + "phaddw\t{%2, %0|%0, %2}" + [(set_attr "type" "sseiadd") + (set_attr "atom_unit" "complex") + (set_attr "prefix_extra" "1") + (set (attr "prefix_rex") (symbol_ref "x86_extended_reg_mentioned_p (insn)")) + (set_attr "mode" "DI")]) + +(define_insn "*avx_phadddv4si3" + [(set (match_operand:V4SI 0 "register_operand" "=x") + (vec_concat:V4SI + (vec_concat:V2SI + (plus:SI + (vec_select:SI + (match_operand:V4SI 1 "register_operand" "x") + (parallel [(const_int 0)])) + (vec_select:SI (match_dup 1) (parallel [(const_int 1)]))) + (plus:SI + (vec_select:SI (match_dup 1) (parallel [(const_int 2)])) + (vec_select:SI (match_dup 1) (parallel [(const_int 3)])))) + (vec_concat:V2SI + (plus:SI + (vec_select:SI + (match_operand:V4SI 2 "nonimmediate_operand" "xm") + (parallel [(const_int 0)])) + (vec_select:SI (match_dup 2) (parallel [(const_int 1)]))) + (plus:SI + (vec_select:SI (match_dup 2) (parallel [(const_int 2)])) + (vec_select:SI (match_dup 2) (parallel [(const_int 3)]))))))] + "TARGET_AVX" + "vphaddd\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseiadd") + (set_attr "prefix_extra" "1") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + +(define_insn "ssse3_phadddv4si3" + [(set (match_operand:V4SI 0 "register_operand" "=x") + (vec_concat:V4SI + (vec_concat:V2SI + (plus:SI + (vec_select:SI + (match_operand:V4SI 1 "register_operand" "0") + (parallel [(const_int 0)])) + (vec_select:SI (match_dup 1) (parallel [(const_int 1)]))) + (plus:SI + (vec_select:SI (match_dup 1) (parallel [(const_int 2)])) + (vec_select:SI (match_dup 1) (parallel [(const_int 3)])))) + (vec_concat:V2SI + (plus:SI + (vec_select:SI + (match_operand:V4SI 2 "nonimmediate_operand" "xm") + (parallel [(const_int 0)])) + (vec_select:SI (match_dup 2) (parallel [(const_int 1)]))) + (plus:SI + (vec_select:SI (match_dup 2) (parallel [(const_int 2)])) + (vec_select:SI (match_dup 2) (parallel [(const_int 3)]))))))] + "TARGET_SSSE3" + "phaddd\t{%2, %0|%0, %2}" + [(set_attr "type" "sseiadd") + (set_attr "atom_unit" "complex") + (set_attr "prefix_data16" "1") + (set_attr "prefix_extra" "1") + (set_attr "mode" "TI")]) + +(define_insn "ssse3_phadddv2si3" + [(set (match_operand:V2SI 0 "register_operand" "=y") + (vec_concat:V2SI + (plus:SI + (vec_select:SI + (match_operand:V2SI 1 "register_operand" "0") + (parallel [(const_int 0)])) + (vec_select:SI (match_dup 1) (parallel [(const_int 1)]))) + (plus:SI + (vec_select:SI + (match_operand:V2SI 2 "nonimmediate_operand" "ym") + (parallel [(const_int 0)])) + (vec_select:SI (match_dup 2) (parallel [(const_int 1)])))))] + "TARGET_SSSE3" + "phaddd\t{%2, %0|%0, %2}" + [(set_attr "type" "sseiadd") + (set_attr "atom_unit" "complex") + (set_attr "prefix_extra" "1") + (set (attr "prefix_rex") (symbol_ref "x86_extended_reg_mentioned_p (insn)")) + (set_attr "mode" "DI")]) + +(define_insn "*avx_phaddswv8hi3" + [(set (match_operand:V8HI 0 "register_operand" "=x") + (vec_concat:V8HI + 
(vec_concat:V4HI + (vec_concat:V2HI + (ss_plus:HI + (vec_select:HI + (match_operand:V8HI 1 "register_operand" "x") + (parallel [(const_int 0)])) + (vec_select:HI (match_dup 1) (parallel [(const_int 1)]))) + (ss_plus:HI + (vec_select:HI (match_dup 1) (parallel [(const_int 2)])) + (vec_select:HI (match_dup 1) (parallel [(const_int 3)])))) + (vec_concat:V2HI + (ss_plus:HI + (vec_select:HI (match_dup 1) (parallel [(const_int 4)])) + (vec_select:HI (match_dup 1) (parallel [(const_int 5)]))) + (ss_plus:HI + (vec_select:HI (match_dup 1) (parallel [(const_int 6)])) + (vec_select:HI (match_dup 1) (parallel [(const_int 7)]))))) + (vec_concat:V4HI + (vec_concat:V2HI + (ss_plus:HI + (vec_select:HI + (match_operand:V8HI 2 "nonimmediate_operand" "xm") + (parallel [(const_int 0)])) + (vec_select:HI (match_dup 2) (parallel [(const_int 1)]))) + (ss_plus:HI + (vec_select:HI (match_dup 2) (parallel [(const_int 2)])) + (vec_select:HI (match_dup 2) (parallel [(const_int 3)])))) + (vec_concat:V2HI + (ss_plus:HI + (vec_select:HI (match_dup 2) (parallel [(const_int 4)])) + (vec_select:HI (match_dup 2) (parallel [(const_int 5)]))) + (ss_plus:HI + (vec_select:HI (match_dup 2) (parallel [(const_int 6)])) + (vec_select:HI (match_dup 2) (parallel [(const_int 7)])))))))] + "TARGET_AVX" + "vphaddsw\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseiadd") + (set_attr "prefix_extra" "1") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + +(define_insn "ssse3_phaddswv8hi3" + [(set (match_operand:V8HI 0 "register_operand" "=x") + (vec_concat:V8HI + (vec_concat:V4HI + (vec_concat:V2HI + (ss_plus:HI + (vec_select:HI + (match_operand:V8HI 1 "register_operand" "0") + (parallel [(const_int 0)])) + (vec_select:HI (match_dup 1) (parallel [(const_int 1)]))) + (ss_plus:HI + (vec_select:HI (match_dup 1) (parallel [(const_int 2)])) + (vec_select:HI (match_dup 1) (parallel [(const_int 3)])))) + (vec_concat:V2HI + (ss_plus:HI + (vec_select:HI (match_dup 1) (parallel [(const_int 4)])) + (vec_select:HI (match_dup 1) (parallel [(const_int 5)]))) + (ss_plus:HI + (vec_select:HI (match_dup 1) (parallel [(const_int 6)])) + (vec_select:HI (match_dup 1) (parallel [(const_int 7)]))))) + (vec_concat:V4HI + (vec_concat:V2HI + (ss_plus:HI + (vec_select:HI + (match_operand:V8HI 2 "nonimmediate_operand" "xm") + (parallel [(const_int 0)])) + (vec_select:HI (match_dup 2) (parallel [(const_int 1)]))) + (ss_plus:HI + (vec_select:HI (match_dup 2) (parallel [(const_int 2)])) + (vec_select:HI (match_dup 2) (parallel [(const_int 3)])))) + (vec_concat:V2HI + (ss_plus:HI + (vec_select:HI (match_dup 2) (parallel [(const_int 4)])) + (vec_select:HI (match_dup 2) (parallel [(const_int 5)]))) + (ss_plus:HI + (vec_select:HI (match_dup 2) (parallel [(const_int 6)])) + (vec_select:HI (match_dup 2) (parallel [(const_int 7)])))))))] + "TARGET_SSSE3" + "phaddsw\t{%2, %0|%0, %2}" + [(set_attr "type" "sseiadd") + (set_attr "atom_unit" "complex") + (set_attr "prefix_data16" "1") + (set_attr "prefix_extra" "1") + (set_attr "mode" "TI")]) + +(define_insn "ssse3_phaddswv4hi3" + [(set (match_operand:V4HI 0 "register_operand" "=y") + (vec_concat:V4HI + (vec_concat:V2HI + (ss_plus:HI + (vec_select:HI + (match_operand:V4HI 1 "register_operand" "0") + (parallel [(const_int 0)])) + (vec_select:HI (match_dup 1) (parallel [(const_int 1)]))) + (ss_plus:HI + (vec_select:HI (match_dup 1) (parallel [(const_int 2)])) + (vec_select:HI (match_dup 1) (parallel [(const_int 3)])))) + (vec_concat:V2HI + (ss_plus:HI + (vec_select:HI + (match_operand:V4HI 2 "nonimmediate_operand" "ym") + 
(parallel [(const_int 0)])) + (vec_select:HI (match_dup 2) (parallel [(const_int 1)]))) + (ss_plus:HI + (vec_select:HI (match_dup 2) (parallel [(const_int 2)])) + (vec_select:HI (match_dup 2) (parallel [(const_int 3)]))))))] + "TARGET_SSSE3" + "phaddsw\t{%2, %0|%0, %2}" + [(set_attr "type" "sseiadd") + (set_attr "atom_unit" "complex") + (set_attr "prefix_extra" "1") + (set (attr "prefix_rex") (symbol_ref "x86_extended_reg_mentioned_p (insn)")) + (set_attr "mode" "DI")]) + +(define_insn "*avx_phsubwv8hi3" + [(set (match_operand:V8HI 0 "register_operand" "=x") + (vec_concat:V8HI + (vec_concat:V4HI + (vec_concat:V2HI + (minus:HI + (vec_select:HI + (match_operand:V8HI 1 "register_operand" "x") + (parallel [(const_int 0)])) + (vec_select:HI (match_dup 1) (parallel [(const_int 1)]))) + (minus:HI + (vec_select:HI (match_dup 1) (parallel [(const_int 2)])) + (vec_select:HI (match_dup 1) (parallel [(const_int 3)])))) + (vec_concat:V2HI + (minus:HI + (vec_select:HI (match_dup 1) (parallel [(const_int 4)])) + (vec_select:HI (match_dup 1) (parallel [(const_int 5)]))) + (minus:HI + (vec_select:HI (match_dup 1) (parallel [(const_int 6)])) + (vec_select:HI (match_dup 1) (parallel [(const_int 7)]))))) + (vec_concat:V4HI + (vec_concat:V2HI + (minus:HI + (vec_select:HI + (match_operand:V8HI 2 "nonimmediate_operand" "xm") + (parallel [(const_int 0)])) + (vec_select:HI (match_dup 2) (parallel [(const_int 1)]))) + (minus:HI + (vec_select:HI (match_dup 2) (parallel [(const_int 2)])) + (vec_select:HI (match_dup 2) (parallel [(const_int 3)])))) + (vec_concat:V2HI + (minus:HI + (vec_select:HI (match_dup 2) (parallel [(const_int 4)])) + (vec_select:HI (match_dup 2) (parallel [(const_int 5)]))) + (minus:HI + (vec_select:HI (match_dup 2) (parallel [(const_int 6)])) + (vec_select:HI (match_dup 2) (parallel [(const_int 7)])))))))] + "TARGET_AVX" + "vphsubw\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseiadd") + (set_attr "prefix_extra" "1") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + +(define_insn "ssse3_phsubwv8hi3" + [(set (match_operand:V8HI 0 "register_operand" "=x") + (vec_concat:V8HI + (vec_concat:V4HI + (vec_concat:V2HI + (minus:HI + (vec_select:HI + (match_operand:V8HI 1 "register_operand" "0") + (parallel [(const_int 0)])) + (vec_select:HI (match_dup 1) (parallel [(const_int 1)]))) + (minus:HI + (vec_select:HI (match_dup 1) (parallel [(const_int 2)])) + (vec_select:HI (match_dup 1) (parallel [(const_int 3)])))) + (vec_concat:V2HI + (minus:HI + (vec_select:HI (match_dup 1) (parallel [(const_int 4)])) + (vec_select:HI (match_dup 1) (parallel [(const_int 5)]))) + (minus:HI + (vec_select:HI (match_dup 1) (parallel [(const_int 6)])) + (vec_select:HI (match_dup 1) (parallel [(const_int 7)]))))) + (vec_concat:V4HI + (vec_concat:V2HI + (minus:HI + (vec_select:HI + (match_operand:V8HI 2 "nonimmediate_operand" "xm") + (parallel [(const_int 0)])) + (vec_select:HI (match_dup 2) (parallel [(const_int 1)]))) + (minus:HI + (vec_select:HI (match_dup 2) (parallel [(const_int 2)])) + (vec_select:HI (match_dup 2) (parallel [(const_int 3)])))) + (vec_concat:V2HI + (minus:HI + (vec_select:HI (match_dup 2) (parallel [(const_int 4)])) + (vec_select:HI (match_dup 2) (parallel [(const_int 5)]))) + (minus:HI + (vec_select:HI (match_dup 2) (parallel [(const_int 6)])) + (vec_select:HI (match_dup 2) (parallel [(const_int 7)])))))))] + "TARGET_SSSE3" + "phsubw\t{%2, %0|%0, %2}" + [(set_attr "type" "sseiadd") + (set_attr "atom_unit" "complex") + (set_attr "prefix_data16" "1") + (set_attr "prefix_extra" "1") + (set_attr "mode" 
"TI")]) + +(define_insn "ssse3_phsubwv4hi3" + [(set (match_operand:V4HI 0 "register_operand" "=y") + (vec_concat:V4HI + (vec_concat:V2HI + (minus:HI + (vec_select:HI + (match_operand:V4HI 1 "register_operand" "0") + (parallel [(const_int 0)])) + (vec_select:HI (match_dup 1) (parallel [(const_int 1)]))) + (minus:HI + (vec_select:HI (match_dup 1) (parallel [(const_int 2)])) + (vec_select:HI (match_dup 1) (parallel [(const_int 3)])))) + (vec_concat:V2HI + (minus:HI + (vec_select:HI + (match_operand:V4HI 2 "nonimmediate_operand" "ym") + (parallel [(const_int 0)])) + (vec_select:HI (match_dup 2) (parallel [(const_int 1)]))) + (minus:HI + (vec_select:HI (match_dup 2) (parallel [(const_int 2)])) + (vec_select:HI (match_dup 2) (parallel [(const_int 3)]))))))] + "TARGET_SSSE3" + "phsubw\t{%2, %0|%0, %2}" + [(set_attr "type" "sseiadd") + (set_attr "atom_unit" "complex") + (set_attr "prefix_extra" "1") + (set (attr "prefix_rex") (symbol_ref "x86_extended_reg_mentioned_p (insn)")) + (set_attr "mode" "DI")]) + +(define_insn "*avx_phsubdv4si3" + [(set (match_operand:V4SI 0 "register_operand" "=x") + (vec_concat:V4SI + (vec_concat:V2SI + (minus:SI + (vec_select:SI + (match_operand:V4SI 1 "register_operand" "x") + (parallel [(const_int 0)])) + (vec_select:SI (match_dup 1) (parallel [(const_int 1)]))) + (minus:SI + (vec_select:SI (match_dup 1) (parallel [(const_int 2)])) + (vec_select:SI (match_dup 1) (parallel [(const_int 3)])))) + (vec_concat:V2SI + (minus:SI + (vec_select:SI + (match_operand:V4SI 2 "nonimmediate_operand" "xm") + (parallel [(const_int 0)])) + (vec_select:SI (match_dup 2) (parallel [(const_int 1)]))) + (minus:SI + (vec_select:SI (match_dup 2) (parallel [(const_int 2)])) + (vec_select:SI (match_dup 2) (parallel [(const_int 3)]))))))] + "TARGET_AVX" + "vphsubd\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseiadd") + (set_attr "prefix_extra" "1") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + +(define_insn "ssse3_phsubdv4si3" + [(set (match_operand:V4SI 0 "register_operand" "=x") + (vec_concat:V4SI + (vec_concat:V2SI + (minus:SI + (vec_select:SI + (match_operand:V4SI 1 "register_operand" "0") + (parallel [(const_int 0)])) + (vec_select:SI (match_dup 1) (parallel [(const_int 1)]))) + (minus:SI + (vec_select:SI (match_dup 1) (parallel [(const_int 2)])) + (vec_select:SI (match_dup 1) (parallel [(const_int 3)])))) + (vec_concat:V2SI + (minus:SI + (vec_select:SI + (match_operand:V4SI 2 "nonimmediate_operand" "xm") + (parallel [(const_int 0)])) + (vec_select:SI (match_dup 2) (parallel [(const_int 1)]))) + (minus:SI + (vec_select:SI (match_dup 2) (parallel [(const_int 2)])) + (vec_select:SI (match_dup 2) (parallel [(const_int 3)]))))))] + "TARGET_SSSE3" + "phsubd\t{%2, %0|%0, %2}" + [(set_attr "type" "sseiadd") + (set_attr "atom_unit" "complex") + (set_attr "prefix_data16" "1") + (set_attr "prefix_extra" "1") + (set_attr "mode" "TI")]) + +(define_insn "ssse3_phsubdv2si3" + [(set (match_operand:V2SI 0 "register_operand" "=y") + (vec_concat:V2SI + (minus:SI + (vec_select:SI + (match_operand:V2SI 1 "register_operand" "0") + (parallel [(const_int 0)])) + (vec_select:SI (match_dup 1) (parallel [(const_int 1)]))) + (minus:SI + (vec_select:SI + (match_operand:V2SI 2 "nonimmediate_operand" "ym") + (parallel [(const_int 0)])) + (vec_select:SI (match_dup 2) (parallel [(const_int 1)])))))] + "TARGET_SSSE3" + "phsubd\t{%2, %0|%0, %2}" + [(set_attr "type" "sseiadd") + (set_attr "atom_unit" "complex") + (set_attr "prefix_extra" "1") + (set (attr "prefix_rex") (symbol_ref "x86_extended_reg_mentioned_p 
(insn)")) + (set_attr "mode" "DI")]) + +(define_insn "*avx_phsubswv8hi3" + [(set (match_operand:V8HI 0 "register_operand" "=x") + (vec_concat:V8HI + (vec_concat:V4HI + (vec_concat:V2HI + (ss_minus:HI + (vec_select:HI + (match_operand:V8HI 1 "register_operand" "x") + (parallel [(const_int 0)])) + (vec_select:HI (match_dup 1) (parallel [(const_int 1)]))) + (ss_minus:HI + (vec_select:HI (match_dup 1) (parallel [(const_int 2)])) + (vec_select:HI (match_dup 1) (parallel [(const_int 3)])))) + (vec_concat:V2HI + (ss_minus:HI + (vec_select:HI (match_dup 1) (parallel [(const_int 4)])) + (vec_select:HI (match_dup 1) (parallel [(const_int 5)]))) + (ss_minus:HI + (vec_select:HI (match_dup 1) (parallel [(const_int 6)])) + (vec_select:HI (match_dup 1) (parallel [(const_int 7)]))))) + (vec_concat:V4HI + (vec_concat:V2HI + (ss_minus:HI + (vec_select:HI + (match_operand:V8HI 2 "nonimmediate_operand" "xm") + (parallel [(const_int 0)])) + (vec_select:HI (match_dup 2) (parallel [(const_int 1)]))) + (ss_minus:HI + (vec_select:HI (match_dup 2) (parallel [(const_int 2)])) + (vec_select:HI (match_dup 2) (parallel [(const_int 3)])))) + (vec_concat:V2HI + (ss_minus:HI + (vec_select:HI (match_dup 2) (parallel [(const_int 4)])) + (vec_select:HI (match_dup 2) (parallel [(const_int 5)]))) + (ss_minus:HI + (vec_select:HI (match_dup 2) (parallel [(const_int 6)])) + (vec_select:HI (match_dup 2) (parallel [(const_int 7)])))))))] + "TARGET_AVX" + "vphsubsw\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseiadd") + (set_attr "prefix_extra" "1") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + +(define_insn "ssse3_phsubswv8hi3" + [(set (match_operand:V8HI 0 "register_operand" "=x") + (vec_concat:V8HI + (vec_concat:V4HI + (vec_concat:V2HI + (ss_minus:HI + (vec_select:HI + (match_operand:V8HI 1 "register_operand" "0") + (parallel [(const_int 0)])) + (vec_select:HI (match_dup 1) (parallel [(const_int 1)]))) + (ss_minus:HI + (vec_select:HI (match_dup 1) (parallel [(const_int 2)])) + (vec_select:HI (match_dup 1) (parallel [(const_int 3)])))) + (vec_concat:V2HI + (ss_minus:HI + (vec_select:HI (match_dup 1) (parallel [(const_int 4)])) + (vec_select:HI (match_dup 1) (parallel [(const_int 5)]))) + (ss_minus:HI + (vec_select:HI (match_dup 1) (parallel [(const_int 6)])) + (vec_select:HI (match_dup 1) (parallel [(const_int 7)]))))) + (vec_concat:V4HI + (vec_concat:V2HI + (ss_minus:HI + (vec_select:HI + (match_operand:V8HI 2 "nonimmediate_operand" "xm") + (parallel [(const_int 0)])) + (vec_select:HI (match_dup 2) (parallel [(const_int 1)]))) + (ss_minus:HI + (vec_select:HI (match_dup 2) (parallel [(const_int 2)])) + (vec_select:HI (match_dup 2) (parallel [(const_int 3)])))) + (vec_concat:V2HI + (ss_minus:HI + (vec_select:HI (match_dup 2) (parallel [(const_int 4)])) + (vec_select:HI (match_dup 2) (parallel [(const_int 5)]))) + (ss_minus:HI + (vec_select:HI (match_dup 2) (parallel [(const_int 6)])) + (vec_select:HI (match_dup 2) (parallel [(const_int 7)])))))))] + "TARGET_SSSE3" + "phsubsw\t{%2, %0|%0, %2}" + [(set_attr "type" "sseiadd") + (set_attr "atom_unit" "complex") + (set_attr "prefix_data16" "1") + (set_attr "prefix_extra" "1") + (set_attr "mode" "TI")]) + +(define_insn "ssse3_phsubswv4hi3" + [(set (match_operand:V4HI 0 "register_operand" "=y") + (vec_concat:V4HI + (vec_concat:V2HI + (ss_minus:HI + (vec_select:HI + (match_operand:V4HI 1 "register_operand" "0") + (parallel [(const_int 0)])) + (vec_select:HI (match_dup 1) (parallel [(const_int 1)]))) + (ss_minus:HI + (vec_select:HI (match_dup 1) (parallel [(const_int 2)])) + 
(vec_select:HI (match_dup 1) (parallel [(const_int 3)])))) + (vec_concat:V2HI + (ss_minus:HI + (vec_select:HI + (match_operand:V4HI 2 "nonimmediate_operand" "ym") + (parallel [(const_int 0)])) + (vec_select:HI (match_dup 2) (parallel [(const_int 1)]))) + (ss_minus:HI + (vec_select:HI (match_dup 2) (parallel [(const_int 2)])) + (vec_select:HI (match_dup 2) (parallel [(const_int 3)]))))))] + "TARGET_SSSE3" + "phsubsw\t{%2, %0|%0, %2}" + [(set_attr "type" "sseiadd") + (set_attr "atom_unit" "complex") + (set_attr "prefix_extra" "1") + (set (attr "prefix_rex") (symbol_ref "x86_extended_reg_mentioned_p (insn)")) + (set_attr "mode" "DI")]) + +(define_insn "*avx_pmaddubsw128" + [(set (match_operand:V8HI 0 "register_operand" "=x") + (ss_plus:V8HI + (mult:V8HI + (zero_extend:V8HI + (vec_select:V8QI + (match_operand:V16QI 1 "register_operand" "x") + (parallel [(const_int 0) + (const_int 2) + (const_int 4) + (const_int 6) + (const_int 8) + (const_int 10) + (const_int 12) + (const_int 14)]))) + (sign_extend:V8HI + (vec_select:V8QI + (match_operand:V16QI 2 "nonimmediate_operand" "xm") + (parallel [(const_int 0) + (const_int 2) + (const_int 4) + (const_int 6) + (const_int 8) + (const_int 10) + (const_int 12) + (const_int 14)])))) + (mult:V8HI + (zero_extend:V8HI + (vec_select:V8QI (match_dup 1) + (parallel [(const_int 1) + (const_int 3) + (const_int 5) + (const_int 7) + (const_int 9) + (const_int 11) + (const_int 13) + (const_int 15)]))) + (sign_extend:V8HI + (vec_select:V8QI (match_dup 2) + (parallel [(const_int 1) + (const_int 3) + (const_int 5) + (const_int 7) + (const_int 9) + (const_int 11) + (const_int 13) + (const_int 15)]))))))] + "TARGET_AVX" + "vpmaddubsw\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseiadd") + (set_attr "prefix_extra" "1") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + +(define_insn "ssse3_pmaddubsw128" + [(set (match_operand:V8HI 0 "register_operand" "=x") + (ss_plus:V8HI + (mult:V8HI + (zero_extend:V8HI + (vec_select:V8QI + (match_operand:V16QI 1 "register_operand" "0") + (parallel [(const_int 0) + (const_int 2) + (const_int 4) + (const_int 6) + (const_int 8) + (const_int 10) + (const_int 12) + (const_int 14)]))) + (sign_extend:V8HI + (vec_select:V8QI + (match_operand:V16QI 2 "nonimmediate_operand" "xm") + (parallel [(const_int 0) + (const_int 2) + (const_int 4) + (const_int 6) + (const_int 8) + (const_int 10) + (const_int 12) + (const_int 14)])))) + (mult:V8HI + (zero_extend:V8HI + (vec_select:V8QI (match_dup 1) + (parallel [(const_int 1) + (const_int 3) + (const_int 5) + (const_int 7) + (const_int 9) + (const_int 11) + (const_int 13) + (const_int 15)]))) + (sign_extend:V8HI + (vec_select:V8QI (match_dup 2) + (parallel [(const_int 1) + (const_int 3) + (const_int 5) + (const_int 7) + (const_int 9) + (const_int 11) + (const_int 13) + (const_int 15)]))))))] + "TARGET_SSSE3" + "pmaddubsw\t{%2, %0|%0, %2}" + [(set_attr "type" "sseiadd") + (set_attr "atom_unit" "simul") + (set_attr "prefix_data16" "1") + (set_attr "prefix_extra" "1") + (set_attr "mode" "TI")]) + +(define_insn "ssse3_pmaddubsw" + [(set (match_operand:V4HI 0 "register_operand" "=y") + (ss_plus:V4HI + (mult:V4HI + (zero_extend:V4HI + (vec_select:V4QI + (match_operand:V8QI 1 "register_operand" "0") + (parallel [(const_int 0) + (const_int 2) + (const_int 4) + (const_int 6)]))) + (sign_extend:V4HI + (vec_select:V4QI + (match_operand:V8QI 2 "nonimmediate_operand" "ym") + (parallel [(const_int 0) + (const_int 2) + (const_int 4) + (const_int 6)])))) + (mult:V4HI + (zero_extend:V4HI + (vec_select:V4QI 
(match_dup 1) + (parallel [(const_int 1) + (const_int 3) + (const_int 5) + (const_int 7)]))) + (sign_extend:V4HI + (vec_select:V4QI (match_dup 2) + (parallel [(const_int 1) + (const_int 3) + (const_int 5) + (const_int 7)]))))))] + "TARGET_SSSE3" + "pmaddubsw\t{%2, %0|%0, %2}" + [(set_attr "type" "sseiadd") + (set_attr "atom_unit" "simul") + (set_attr "prefix_extra" "1") + (set (attr "prefix_rex") (symbol_ref "x86_extended_reg_mentioned_p (insn)")) + (set_attr "mode" "DI")]) + +(define_expand "ssse3_pmulhrswv8hi3" + [(set (match_operand:V8HI 0 "register_operand" "") + (truncate:V8HI + (lshiftrt:V8SI + (plus:V8SI + (lshiftrt:V8SI + (mult:V8SI + (sign_extend:V8SI + (match_operand:V8HI 1 "nonimmediate_operand" "")) + (sign_extend:V8SI + (match_operand:V8HI 2 "nonimmediate_operand" ""))) + (const_int 14)) + (const_vector:V8HI [(const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1)])) + (const_int 1))))] + "TARGET_SSSE3" + "ix86_fixup_binary_operands_no_copy (MULT, V8HImode, operands);") + +(define_insn "*avx_pmulhrswv8hi3" + [(set (match_operand:V8HI 0 "register_operand" "=x") + (truncate:V8HI + (lshiftrt:V8SI + (plus:V8SI + (lshiftrt:V8SI + (mult:V8SI + (sign_extend:V8SI + (match_operand:V8HI 1 "nonimmediate_operand" "%x")) + (sign_extend:V8SI + (match_operand:V8HI 2 "nonimmediate_operand" "xm"))) + (const_int 14)) + (const_vector:V8HI [(const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1)])) + (const_int 1))))] + "TARGET_AVX && ix86_binary_operator_ok (MULT, V8HImode, operands)" + "vpmulhrsw\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseimul") + (set_attr "prefix_extra" "1") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + +(define_insn "*ssse3_pmulhrswv8hi3" + [(set (match_operand:V8HI 0 "register_operand" "=x") + (truncate:V8HI + (lshiftrt:V8SI + (plus:V8SI + (lshiftrt:V8SI + (mult:V8SI + (sign_extend:V8SI + (match_operand:V8HI 1 "nonimmediate_operand" "%0")) + (sign_extend:V8SI + (match_operand:V8HI 2 "nonimmediate_operand" "xm"))) + (const_int 14)) + (const_vector:V8HI [(const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1)])) + (const_int 1))))] + "TARGET_SSSE3 && ix86_binary_operator_ok (MULT, V8HImode, operands)" + "pmulhrsw\t{%2, %0|%0, %2}" + [(set_attr "type" "sseimul") + (set_attr "prefix_data16" "1") + (set_attr "prefix_extra" "1") + (set_attr "mode" "TI")]) + +(define_expand "ssse3_pmulhrswv4hi3" + [(set (match_operand:V4HI 0 "register_operand" "") + (truncate:V4HI + (lshiftrt:V4SI + (plus:V4SI + (lshiftrt:V4SI + (mult:V4SI + (sign_extend:V4SI + (match_operand:V4HI 1 "nonimmediate_operand" "")) + (sign_extend:V4SI + (match_operand:V4HI 2 "nonimmediate_operand" ""))) + (const_int 14)) + (const_vector:V4HI [(const_int 1) (const_int 1) + (const_int 1) (const_int 1)])) + (const_int 1))))] + "TARGET_SSSE3" + "ix86_fixup_binary_operands_no_copy (MULT, V4HImode, operands);") + +(define_insn "*ssse3_pmulhrswv4hi3" + [(set (match_operand:V4HI 0 "register_operand" "=y") + (truncate:V4HI + (lshiftrt:V4SI + (plus:V4SI + (lshiftrt:V4SI + (mult:V4SI + (sign_extend:V4SI + (match_operand:V4HI 1 "nonimmediate_operand" "%0")) + (sign_extend:V4SI + (match_operand:V4HI 2 "nonimmediate_operand" "ym"))) + (const_int 14)) + (const_vector:V4HI [(const_int 1) (const_int 1) + (const_int 1) (const_int 1)])) + (const_int 1))))] + "TARGET_SSSE3 && ix86_binary_operator_ok (MULT, V4HImode, operands)" + 
"pmulhrsw\t{%2, %0|%0, %2}" + [(set_attr "type" "sseimul") + (set_attr "prefix_extra" "1") + (set (attr "prefix_rex") (symbol_ref "x86_extended_reg_mentioned_p (insn)")) + (set_attr "mode" "DI")]) + +(define_insn "*avx_pshufbv16qi3" + [(set (match_operand:V16QI 0 "register_operand" "=x") + (unspec:V16QI [(match_operand:V16QI 1 "register_operand" "x") + (match_operand:V16QI 2 "nonimmediate_operand" "xm")] + UNSPEC_PSHUFB))] + "TARGET_AVX" + "vpshufb\t{%2, %1, %0|%0, %1, %2}"; + [(set_attr "type" "sselog1") + (set_attr "prefix_extra" "1") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + +(define_insn "ssse3_pshufbv16qi3" + [(set (match_operand:V16QI 0 "register_operand" "=x") + (unspec:V16QI [(match_operand:V16QI 1 "register_operand" "0") + (match_operand:V16QI 2 "nonimmediate_operand" "xm")] + UNSPEC_PSHUFB))] + "TARGET_SSSE3" + "pshufb\t{%2, %0|%0, %2}"; + [(set_attr "type" "sselog1") + (set_attr "prefix_data16" "1") + (set_attr "prefix_extra" "1") + (set_attr "mode" "TI")]) + +(define_insn "ssse3_pshufbv8qi3" + [(set (match_operand:V8QI 0 "register_operand" "=y") + (unspec:V8QI [(match_operand:V8QI 1 "register_operand" "0") + (match_operand:V8QI 2 "nonimmediate_operand" "ym")] + UNSPEC_PSHUFB))] + "TARGET_SSSE3" + "pshufb\t{%2, %0|%0, %2}"; + [(set_attr "type" "sselog1") + (set_attr "prefix_extra" "1") + (set (attr "prefix_rex") (symbol_ref "x86_extended_reg_mentioned_p (insn)")) + (set_attr "mode" "DI")]) + +(define_insn "*avx_psign3" + [(set (match_operand:SSEMODE124 0 "register_operand" "=x") + (unspec:SSEMODE124 + [(match_operand:SSEMODE124 1 "register_operand" "x") + (match_operand:SSEMODE124 2 "nonimmediate_operand" "xm")] + UNSPEC_PSIGN))] + "TARGET_AVX" + "vpsign\t{%2, %1, %0|%0, %1, %2}"; + [(set_attr "type" "sselog1") + (set_attr "prefix_extra" "1") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + +(define_insn "ssse3_psign3" + [(set (match_operand:SSEMODE124 0 "register_operand" "=x") + (unspec:SSEMODE124 + [(match_operand:SSEMODE124 1 "register_operand" "0") + (match_operand:SSEMODE124 2 "nonimmediate_operand" "xm")] + UNSPEC_PSIGN))] + "TARGET_SSSE3" + "psign\t{%2, %0|%0, %2}"; + [(set_attr "type" "sselog1") + (set_attr "prefix_data16" "1") + (set_attr "prefix_extra" "1") + (set_attr "mode" "TI")]) + +(define_insn "ssse3_psign3" + [(set (match_operand:MMXMODEI 0 "register_operand" "=y") + (unspec:MMXMODEI + [(match_operand:MMXMODEI 1 "register_operand" "0") + (match_operand:MMXMODEI 2 "nonimmediate_operand" "ym")] + UNSPEC_PSIGN))] + "TARGET_SSSE3" + "psign\t{%2, %0|%0, %2}"; + [(set_attr "type" "sselog1") + (set_attr "prefix_extra" "1") + (set (attr "prefix_rex") (symbol_ref "x86_extended_reg_mentioned_p (insn)")) + (set_attr "mode" "DI")]) + +(define_insn "*avx_palignrti" + [(set (match_operand:TI 0 "register_operand" "=x") + (unspec:TI [(match_operand:TI 1 "register_operand" "x") + (match_operand:TI 2 "nonimmediate_operand" "xm") + (match_operand:SI 3 "const_0_to_255_mul_8_operand" "n")] + UNSPEC_PALIGNR))] + "TARGET_AVX" +{ + operands[3] = GEN_INT (INTVAL (operands[3]) / 8); + return "vpalignr\t{%3, %2, %1, %0|%0, %1, %2, %3}"; +} + [(set_attr "type" "sseishft") + (set_attr "prefix_extra" "1") + (set_attr "length_immediate" "1") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + +(define_insn "ssse3_palignrti" + [(set (match_operand:TI 0 "register_operand" "=x") + (unspec:TI [(match_operand:TI 1 "register_operand" "0") + (match_operand:TI 2 "nonimmediate_operand" "xm") + (match_operand:SI 3 "const_0_to_255_mul_8_operand" "n")] + UNSPEC_PALIGNR))] + 
"TARGET_SSSE3" +{ + operands[3] = GEN_INT (INTVAL (operands[3]) / 8); + return "palignr\t{%3, %2, %0|%0, %2, %3}"; +} + [(set_attr "type" "sseishft") + (set_attr "atom_unit" "sishuf") + (set_attr "prefix_data16" "1") + (set_attr "prefix_extra" "1") + (set_attr "length_immediate" "1") + (set_attr "mode" "TI")]) + +(define_insn "ssse3_palignrdi" + [(set (match_operand:DI 0 "register_operand" "=y") + (unspec:DI [(match_operand:DI 1 "register_operand" "0") + (match_operand:DI 2 "nonimmediate_operand" "ym") + (match_operand:SI 3 "const_0_to_255_mul_8_operand" "n")] + UNSPEC_PALIGNR))] + "TARGET_SSSE3" +{ + operands[3] = GEN_INT (INTVAL (operands[3]) / 8); + return "palignr\t{%3, %2, %0|%0, %2, %3}"; +} + [(set_attr "type" "sseishft") + (set_attr "atom_unit" "sishuf") + (set_attr "prefix_extra" "1") + (set_attr "length_immediate" "1") + (set (attr "prefix_rex") (symbol_ref "x86_extended_reg_mentioned_p (insn)")) + (set_attr "mode" "DI")]) + +(define_insn "abs2" + [(set (match_operand:SSEMODE124 0 "register_operand" "=x") + (abs:SSEMODE124 (match_operand:SSEMODE124 1 "nonimmediate_operand" "xm")))] + "TARGET_SSSE3" + "%vpabs\t{%1, %0|%0, %1}" + [(set_attr "type" "sselog1") + (set_attr "prefix_data16" "1") + (set_attr "prefix_extra" "1") + (set_attr "prefix" "maybe_vex") + (set_attr "mode" "TI")]) + +(define_insn "abs2" + [(set (match_operand:MMXMODEI 0 "register_operand" "=y") + (abs:MMXMODEI (match_operand:MMXMODEI 1 "nonimmediate_operand" "ym")))] + "TARGET_SSSE3" + "pabs\t{%1, %0|%0, %1}"; + [(set_attr "type" "sselog1") + (set_attr "prefix_rep" "0") + (set_attr "prefix_extra" "1") + (set (attr "prefix_rex") (symbol_ref "x86_extended_reg_mentioned_p (insn)")) + (set_attr "mode" "DI")]) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; +;; AMD SSE4A instructions +;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(define_insn "sse4a_movnt" + [(set (match_operand:MODEF 0 "memory_operand" "=m") + (unspec:MODEF + [(match_operand:MODEF 1 "register_operand" "x")] + UNSPEC_MOVNT))] + "TARGET_SSE4A" + "movnts\t{%1, %0|%0, %1}" + [(set_attr "type" "ssemov") + (set_attr "mode" "")]) + +(define_insn "sse4a_vmmovnt" + [(set (match_operand: 0 "memory_operand" "=m") + (unspec: + [(vec_select: + (match_operand:SSEMODEF2P 1 "register_operand" "x") + (parallel [(const_int 0)]))] + UNSPEC_MOVNT))] + "TARGET_SSE4A" + "movnt\t{%1, %0|%0, %1}" + [(set_attr "type" "ssemov") + (set_attr "mode" "")]) + +(define_insn "sse4a_extrqi" + [(set (match_operand:V2DI 0 "register_operand" "=x") + (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "0") + (match_operand 2 "const_0_to_255_operand" "") + (match_operand 3 "const_0_to_255_operand" "")] + UNSPEC_EXTRQI))] + "TARGET_SSE4A" + "extrq\t{%3, %2, %0|%0, %2, %3}" + [(set_attr "type" "sse") + (set_attr "prefix_data16" "1") + (set_attr "length_immediate" "2") + (set_attr "mode" "TI")]) + +(define_insn "sse4a_extrq" + [(set (match_operand:V2DI 0 "register_operand" "=x") + (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "0") + (match_operand:V16QI 2 "register_operand" "x")] + UNSPEC_EXTRQ))] + "TARGET_SSE4A" + "extrq\t{%2, %0|%0, %2}" + [(set_attr "type" "sse") + (set_attr "prefix_data16" "1") + (set_attr "mode" "TI")]) + +(define_insn "sse4a_insertqi" + [(set (match_operand:V2DI 0 "register_operand" "=x") + (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "0") + (match_operand:V2DI 2 "register_operand" "x") + (match_operand 3 "const_0_to_255_operand" "") + (match_operand 4 "const_0_to_255_operand" "")] + 
UNSPEC_INSERTQI))] + "TARGET_SSE4A" + "insertq\t{%4, %3, %2, %0|%0, %2, %3, %4}" + [(set_attr "type" "sseins") + (set_attr "prefix_data16" "0") + (set_attr "prefix_rep" "1") + (set_attr "length_immediate" "2") + (set_attr "mode" "TI")]) + +(define_insn "sse4a_insertq" + [(set (match_operand:V2DI 0 "register_operand" "=x") + (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "0") + (match_operand:V2DI 2 "register_operand" "x")] + UNSPEC_INSERTQ))] + "TARGET_SSE4A" + "insertq\t{%2, %0|%0, %2}" + [(set_attr "type" "sseins") + (set_attr "prefix_data16" "0") + (set_attr "prefix_rep" "1") + (set_attr "mode" "TI")]) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; +;; Intel SSE4.1 instructions +;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(define_insn "avx_blend" + [(set (match_operand:AVXMODEF2P 0 "register_operand" "=x") + (vec_merge:AVXMODEF2P + (match_operand:AVXMODEF2P 2 "nonimmediate_operand" "xm") + (match_operand:AVXMODEF2P 1 "register_operand" "x") + (match_operand:SI 3 "const_0_to__operand" "n")))] + "TARGET_AVX" + "vblend\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssemov") + (set_attr "prefix_extra" "1") + (set_attr "length_immediate" "1") + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + +(define_insn "avx_blendv" + [(set (match_operand:AVXMODEF2P 0 "register_operand" "=x") + (unspec:AVXMODEF2P + [(match_operand:AVXMODEF2P 1 "register_operand" "x") + (match_operand:AVXMODEF2P 2 "nonimmediate_operand" "xm") + (match_operand:AVXMODEF2P 3 "register_operand" "x")] + UNSPEC_BLENDV))] + "TARGET_AVX" + "vblendv\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssemov") + (set_attr "prefix_extra" "1") + (set_attr "length_immediate" "1") + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + +(define_insn "sse4_1_blend" + [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x") + (vec_merge:SSEMODEF2P + (match_operand:SSEMODEF2P 2 "nonimmediate_operand" "xm") + (match_operand:SSEMODEF2P 1 "register_operand" "0") + (match_operand:SI 3 "const_0_to__operand" "n")))] + "TARGET_SSE4_1" + "blend\t{%3, %2, %0|%0, %2, %3}" + [(set_attr "type" "ssemov") + (set_attr "prefix_data16" "1") + (set_attr "prefix_extra" "1") + (set_attr "length_immediate" "1") + (set_attr "mode" "")]) + +(define_insn "sse4_1_blendv" + [(set (match_operand:SSEMODEF2P 0 "reg_not_xmm0_operand" "=x") + (unspec:SSEMODEF2P + [(match_operand:SSEMODEF2P 1 "reg_not_xmm0_operand" "0") + (match_operand:SSEMODEF2P 2 "nonimm_not_xmm0_operand" "xm") + (match_operand:SSEMODEF2P 3 "register_operand" "Yz")] + UNSPEC_BLENDV))] + "TARGET_SSE4_1" + "blendv\t{%3, %2, %0|%0, %2, %3}" + [(set_attr "type" "ssemov") + (set_attr "prefix_data16" "1") + (set_attr "prefix_extra" "1") + (set_attr "mode" "")]) + +(define_insn "avx_dp" + [(set (match_operand:AVXMODEF2P 0 "register_operand" "=x") + (unspec:AVXMODEF2P + [(match_operand:AVXMODEF2P 1 "nonimmediate_operand" "%x") + (match_operand:AVXMODEF2P 2 "nonimmediate_operand" "xm") + (match_operand:SI 3 "const_0_to_255_operand" "n")] + UNSPEC_DP))] + "TARGET_AVX" + "vdp\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssemul") + (set_attr "prefix" "vex") + (set_attr "prefix_extra" "1") + (set_attr "length_immediate" "1") + (set_attr "mode" "")]) + +(define_insn "sse4_1_dp" + [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x") + (unspec:SSEMODEF2P + [(match_operand:SSEMODEF2P 1 "nonimmediate_operand" "%0") + (match_operand:SSEMODEF2P 2 "nonimmediate_operand" "xm") + (match_operand:SI 3 
"const_0_to_255_operand" "n")] + UNSPEC_DP))] + "TARGET_SSE4_1" + "dp\t{%3, %2, %0|%0, %2, %3}" + [(set_attr "type" "ssemul") + (set_attr "prefix_data16" "1") + (set_attr "prefix_extra" "1") + (set_attr "length_immediate" "1") + (set_attr "mode" "")]) + +(define_insn "sse4_1_movntdqa" + [(set (match_operand:V2DI 0 "register_operand" "=x") + (unspec:V2DI [(match_operand:V2DI 1 "memory_operand" "m")] + UNSPEC_MOVNTDQA))] + "TARGET_SSE4_1" + "%vmovntdqa\t{%1, %0|%0, %1}" + [(set_attr "type" "ssemov") + (set_attr "prefix_extra" "1") + (set_attr "prefix" "maybe_vex") + (set_attr "mode" "TI")]) + +(define_insn "*avx_mpsadbw" + [(set (match_operand:V16QI 0 "register_operand" "=x") + (unspec:V16QI [(match_operand:V16QI 1 "register_operand" "x") + (match_operand:V16QI 2 "nonimmediate_operand" "xm") + (match_operand:SI 3 "const_0_to_255_operand" "n")] + UNSPEC_MPSADBW))] + "TARGET_AVX" + "vmpsadbw\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "sselog1") + (set_attr "prefix" "vex") + (set_attr "prefix_extra" "1") + (set_attr "length_immediate" "1") + (set_attr "mode" "TI")]) + +(define_insn "sse4_1_mpsadbw" + [(set (match_operand:V16QI 0 "register_operand" "=x") + (unspec:V16QI [(match_operand:V16QI 1 "register_operand" "0") + (match_operand:V16QI 2 "nonimmediate_operand" "xm") + (match_operand:SI 3 "const_0_to_255_operand" "n")] + UNSPEC_MPSADBW))] + "TARGET_SSE4_1" + "mpsadbw\t{%3, %2, %0|%0, %2, %3}" + [(set_attr "type" "sselog1") + (set_attr "prefix_extra" "1") + (set_attr "length_immediate" "1") + (set_attr "mode" "TI")]) + +(define_insn "*avx_packusdw" + [(set (match_operand:V8HI 0 "register_operand" "=x") + (vec_concat:V8HI + (us_truncate:V4HI + (match_operand:V4SI 1 "register_operand" "x")) + (us_truncate:V4HI + (match_operand:V4SI 2 "nonimmediate_operand" "xm"))))] + "TARGET_AVX" + "vpackusdw\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix_extra" "1") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + +(define_insn "sse4_1_packusdw" + [(set (match_operand:V8HI 0 "register_operand" "=x") + (vec_concat:V8HI + (us_truncate:V4HI + (match_operand:V4SI 1 "register_operand" "0")) + (us_truncate:V4HI + (match_operand:V4SI 2 "nonimmediate_operand" "xm"))))] + "TARGET_SSE4_1" + "packusdw\t{%2, %0|%0, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix_extra" "1") + (set_attr "mode" "TI")]) + +(define_insn "*avx_pblendvb" + [(set (match_operand:V16QI 0 "register_operand" "=x") + (unspec:V16QI [(match_operand:V16QI 1 "register_operand" "x") + (match_operand:V16QI 2 "nonimmediate_operand" "xm") + (match_operand:V16QI 3 "register_operand" "x")] + UNSPEC_BLENDV))] + "TARGET_AVX" + "vpblendvb\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssemov") + (set_attr "prefix_extra" "1") + (set_attr "length_immediate" "1") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + +(define_insn "sse4_1_pblendvb" + [(set (match_operand:V16QI 0 "reg_not_xmm0_operand" "=x") + (unspec:V16QI [(match_operand:V16QI 1 "reg_not_xmm0_operand" "0") + (match_operand:V16QI 2 "nonimm_not_xmm0_operand" "xm") + (match_operand:V16QI 3 "register_operand" "Yz")] + UNSPEC_BLENDV))] + "TARGET_SSE4_1" + "pblendvb\t{%3, %2, %0|%0, %2, %3}" + [(set_attr "type" "ssemov") + (set_attr "prefix_extra" "1") + (set_attr "mode" "TI")]) + +(define_insn "*avx_pblendw" + [(set (match_operand:V8HI 0 "register_operand" "=x") + (vec_merge:V8HI + (match_operand:V8HI 2 "nonimmediate_operand" "xm") + (match_operand:V8HI 1 "register_operand" "x") + (match_operand:SI 3 "const_0_to_255_operand" "n")))] + 
"TARGET_AVX" + "vpblendw\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssemov") + (set_attr "prefix" "vex") + (set_attr "prefix_extra" "1") + (set_attr "length_immediate" "1") + (set_attr "mode" "TI")]) + +(define_insn "sse4_1_pblendw" + [(set (match_operand:V8HI 0 "register_operand" "=x") + (vec_merge:V8HI + (match_operand:V8HI 2 "nonimmediate_operand" "xm") + (match_operand:V8HI 1 "register_operand" "0") + (match_operand:SI 3 "const_0_to_255_operand" "n")))] + "TARGET_SSE4_1" + "pblendw\t{%3, %2, %0|%0, %2, %3}" + [(set_attr "type" "ssemov") + (set_attr "prefix_extra" "1") + (set_attr "length_immediate" "1") + (set_attr "mode" "TI")]) + +(define_insn "sse4_1_phminposuw" + [(set (match_operand:V8HI 0 "register_operand" "=x") + (unspec:V8HI [(match_operand:V8HI 1 "nonimmediate_operand" "xm")] + UNSPEC_PHMINPOSUW))] + "TARGET_SSE4_1" + "%vphminposuw\t{%1, %0|%0, %1}" + [(set_attr "type" "sselog1") + (set_attr "prefix_extra" "1") + (set_attr "prefix" "maybe_vex") + (set_attr "mode" "TI")]) + +(define_insn "sse4_1_v8qiv8hi2" + [(set (match_operand:V8HI 0 "register_operand" "=x") + (any_extend:V8HI + (vec_select:V8QI + (match_operand:V16QI 1 "nonimmediate_operand" "xm") + (parallel [(const_int 0) + (const_int 1) + (const_int 2) + (const_int 3) + (const_int 4) + (const_int 5) + (const_int 6) + (const_int 7)]))))] + "TARGET_SSE4_1" + "%vpmovbw\t{%1, %0|%0, %1}" + [(set_attr "type" "ssemov") + (set_attr "prefix_extra" "1") + (set_attr "prefix" "maybe_vex") + (set_attr "mode" "TI")]) + +(define_insn "sse4_1_v4qiv4si2" + [(set (match_operand:V4SI 0 "register_operand" "=x") + (any_extend:V4SI + (vec_select:V4QI + (match_operand:V16QI 1 "nonimmediate_operand" "xm") + (parallel [(const_int 0) + (const_int 1) + (const_int 2) + (const_int 3)]))))] + "TARGET_SSE4_1" + "%vpmovbd\t{%1, %0|%0, %1}" + [(set_attr "type" "ssemov") + (set_attr "prefix_extra" "1") + (set_attr "prefix" "maybe_vex") + (set_attr "mode" "TI")]) + +(define_insn "sse4_1_v4hiv4si2" + [(set (match_operand:V4SI 0 "register_operand" "=x") + (any_extend:V4SI + (vec_select:V4HI + (match_operand:V8HI 1 "nonimmediate_operand" "xm") + (parallel [(const_int 0) + (const_int 1) + (const_int 2) + (const_int 3)]))))] + "TARGET_SSE4_1" + "%vpmovwd\t{%1, %0|%0, %1}" + [(set_attr "type" "ssemov") + (set_attr "prefix_extra" "1") + (set_attr "prefix" "maybe_vex") + (set_attr "mode" "TI")]) + +(define_insn "sse4_1_v2qiv2di2" + [(set (match_operand:V2DI 0 "register_operand" "=x") + (any_extend:V2DI + (vec_select:V2QI + (match_operand:V16QI 1 "nonimmediate_operand" "xm") + (parallel [(const_int 0) + (const_int 1)]))))] + "TARGET_SSE4_1" + "%vpmovbq\t{%1, %0|%0, %1}" + [(set_attr "type" "ssemov") + (set_attr "prefix_extra" "1") + (set_attr "prefix" "maybe_vex") + (set_attr "mode" "TI")]) + +(define_insn "sse4_1_v2hiv2di2" + [(set (match_operand:V2DI 0 "register_operand" "=x") + (any_extend:V2DI + (vec_select:V2HI + (match_operand:V8HI 1 "nonimmediate_operand" "xm") + (parallel [(const_int 0) + (const_int 1)]))))] + "TARGET_SSE4_1" + "%vpmovwq\t{%1, %0|%0, %1}" + [(set_attr "type" "ssemov") + (set_attr "prefix_extra" "1") + (set_attr "prefix" "maybe_vex") + (set_attr "mode" "TI")]) + +(define_insn "sse4_1_v2siv2di2" + [(set (match_operand:V2DI 0 "register_operand" "=x") + (any_extend:V2DI + (vec_select:V2SI + (match_operand:V4SI 1 "nonimmediate_operand" "xm") + (parallel [(const_int 0) + (const_int 1)]))))] + "TARGET_SSE4_1" + "%vpmovdq\t{%1, %0|%0, %1}" + [(set_attr "type" "ssemov") + (set_attr "prefix_extra" "1") + (set_attr "prefix" "maybe_vex") 
+ (set_attr "mode" "TI")]) + +;; ptestps/ptestpd are very similar to comiss and ucomiss when +;; setting FLAGS_REG. But it is not a really compare instruction. +(define_insn "avx_vtest" + [(set (reg:CC FLAGS_REG) + (unspec:CC [(match_operand:AVXMODEF2P 0 "register_operand" "x") + (match_operand:AVXMODEF2P 1 "nonimmediate_operand" "xm")] + UNSPEC_VTESTP))] + "TARGET_AVX" + "vtest\t{%1, %0|%0, %1}" + [(set_attr "type" "ssecomi") + (set_attr "prefix_extra" "1") + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + +;; ptest is very similar to comiss and ucomiss when setting FLAGS_REG. +;; But it is not a really compare instruction. +(define_insn "avx_ptest256" + [(set (reg:CC FLAGS_REG) + (unspec:CC [(match_operand:V4DI 0 "register_operand" "x") + (match_operand:V4DI 1 "nonimmediate_operand" "xm")] + UNSPEC_PTEST))] + "TARGET_AVX" + "vptest\t{%1, %0|%0, %1}" + [(set_attr "type" "ssecomi") + (set_attr "prefix_extra" "1") + (set_attr "prefix" "vex") + (set_attr "mode" "OI")]) + +(define_insn "sse4_1_ptest" + [(set (reg:CC FLAGS_REG) + (unspec:CC [(match_operand:V2DI 0 "register_operand" "x") + (match_operand:V2DI 1 "nonimmediate_operand" "xm")] + UNSPEC_PTEST))] + "TARGET_SSE4_1" + "%vptest\t{%1, %0|%0, %1}" + [(set_attr "type" "ssecomi") + (set_attr "prefix_extra" "1") + (set_attr "prefix" "maybe_vex") + (set_attr "mode" "TI")]) + +(define_insn "avx_round256" + [(set (match_operand:AVX256MODEF2P 0 "register_operand" "=x") + (unspec:AVX256MODEF2P + [(match_operand:AVX256MODEF2P 1 "nonimmediate_operand" "xm") + (match_operand:SI 2 "const_0_to_15_operand" "n")] + UNSPEC_ROUND))] + "TARGET_AVX" + "vround\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "ssecvt") + (set_attr "prefix_extra" "1") + (set_attr "length_immediate" "1") + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + +(define_insn "sse4_1_round" + [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x") + (unspec:SSEMODEF2P + [(match_operand:SSEMODEF2P 1 "nonimmediate_operand" "xm") + (match_operand:SI 2 "const_0_to_15_operand" "n")] + UNSPEC_ROUND))] + "TARGET_ROUND" + "%vround\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "ssecvt") + (set_attr "prefix_data16" "1") + (set_attr "prefix_extra" "1") + (set_attr "length_immediate" "1") + (set_attr "prefix" "maybe_vex") + (set_attr "mode" "")]) + +(define_insn "*avx_round" + [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x") + (vec_merge:SSEMODEF2P + (unspec:SSEMODEF2P + [(match_operand:SSEMODEF2P 2 "register_operand" "x") + (match_operand:SI 3 "const_0_to_15_operand" "n")] + UNSPEC_ROUND) + (match_operand:SSEMODEF2P 1 "register_operand" "x") + (const_int 1)))] + "TARGET_AVX" + "vround\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssecvt") + (set_attr "prefix_extra" "1") + (set_attr "length_immediate" "1") + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + +(define_insn "sse4_1_round" + [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x") + (vec_merge:SSEMODEF2P + (unspec:SSEMODEF2P + [(match_operand:SSEMODEF2P 2 "register_operand" "x") + (match_operand:SI 3 "const_0_to_15_operand" "n")] + UNSPEC_ROUND) + (match_operand:SSEMODEF2P 1 "register_operand" "0") + (const_int 1)))] + "TARGET_ROUND" + "round\t{%3, %2, %0|%0, %2, %3}" + [(set_attr "type" "ssecvt") + (set_attr "prefix_data16" "1") + (set_attr "prefix_extra" "1") + (set_attr "length_immediate" "1") + (set_attr "mode" "")]) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; +;; Intel SSE4.2 string/text processing instructions +;; 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(define_insn_and_split "sse4_2_pcmpestr" + [(set (match_operand:SI 0 "register_operand" "=c,c") + (unspec:SI + [(match_operand:V16QI 2 "reg_not_xmm0_operand" "x,x") + (match_operand:SI 3 "register_operand" "a,a") + (match_operand:V16QI 4 "nonimm_not_xmm0_operand" "x,m") + (match_operand:SI 5 "register_operand" "d,d") + (match_operand:SI 6 "const_0_to_255_operand" "n,n")] + UNSPEC_PCMPESTR)) + (set (match_operand:V16QI 1 "register_operand" "=Yz,Yz") + (unspec:V16QI + [(match_dup 2) + (match_dup 3) + (match_dup 4) + (match_dup 5) + (match_dup 6)] + UNSPEC_PCMPESTR)) + (set (reg:CC FLAGS_REG) + (unspec:CC + [(match_dup 2) + (match_dup 3) + (match_dup 4) + (match_dup 5) + (match_dup 6)] + UNSPEC_PCMPESTR))] + "TARGET_SSE4_2 + && can_create_pseudo_p ()" + "#" + "&& 1" + [(const_int 0)] +{ + int ecx = !find_regno_note (curr_insn, REG_UNUSED, REGNO (operands[0])); + int xmm0 = !find_regno_note (curr_insn, REG_UNUSED, REGNO (operands[1])); + int flags = !find_regno_note (curr_insn, REG_UNUSED, FLAGS_REG); + + if (ecx) + emit_insn (gen_sse4_2_pcmpestri (operands[0], operands[2], + operands[3], operands[4], + operands[5], operands[6])); + if (xmm0) + emit_insn (gen_sse4_2_pcmpestrm (operands[1], operands[2], + operands[3], operands[4], + operands[5], operands[6])); + if (flags && !(ecx || xmm0)) + emit_insn (gen_sse4_2_pcmpestr_cconly (NULL, NULL, + operands[2], operands[3], + operands[4], operands[5], + operands[6])); + if (!(flags || ecx || xmm0)) + emit_note (NOTE_INSN_DELETED); + + DONE; +} + [(set_attr "type" "sselog") + (set_attr "prefix_data16" "1") + (set_attr "prefix_extra" "1") + (set_attr "length_immediate" "1") + (set_attr "memory" "none,load") + (set_attr "mode" "TI")]) + +(define_insn "sse4_2_pcmpestri" + [(set (match_operand:SI 0 "register_operand" "=c,c") + (unspec:SI + [(match_operand:V16QI 1 "register_operand" "x,x") + (match_operand:SI 2 "register_operand" "a,a") + (match_operand:V16QI 3 "nonimmediate_operand" "x,m") + (match_operand:SI 4 "register_operand" "d,d") + (match_operand:SI 5 "const_0_to_255_operand" "n,n")] + UNSPEC_PCMPESTR)) + (set (reg:CC FLAGS_REG) + (unspec:CC + [(match_dup 1) + (match_dup 2) + (match_dup 3) + (match_dup 4) + (match_dup 5)] + UNSPEC_PCMPESTR))] + "TARGET_SSE4_2" + "%vpcmpestri\t{%5, %3, %1|%1, %3, %5}" + [(set_attr "type" "sselog") + (set_attr "prefix_data16" "1") + (set_attr "prefix_extra" "1") + (set_attr "prefix" "maybe_vex") + (set_attr "length_immediate" "1") + (set_attr "memory" "none,load") + (set_attr "mode" "TI")]) + +(define_insn "sse4_2_pcmpestrm" + [(set (match_operand:V16QI 0 "register_operand" "=Yz,Yz") + (unspec:V16QI + [(match_operand:V16QI 1 "register_operand" "x,x") + (match_operand:SI 2 "register_operand" "a,a") + (match_operand:V16QI 3 "nonimmediate_operand" "x,m") + (match_operand:SI 4 "register_operand" "d,d") + (match_operand:SI 5 "const_0_to_255_operand" "n,n")] + UNSPEC_PCMPESTR)) + (set (reg:CC FLAGS_REG) + (unspec:CC + [(match_dup 1) + (match_dup 2) + (match_dup 3) + (match_dup 4) + (match_dup 5)] + UNSPEC_PCMPESTR))] + "TARGET_SSE4_2" + "%vpcmpestrm\t{%5, %3, %1|%1, %3, %5}" + [(set_attr "type" "sselog") + (set_attr "prefix_data16" "1") + (set_attr "prefix_extra" "1") + (set_attr "length_immediate" "1") + (set_attr "prefix" "maybe_vex") + (set_attr "memory" "none,load") + (set_attr "mode" "TI")]) + +(define_insn "sse4_2_pcmpestr_cconly" + [(set (reg:CC FLAGS_REG) + (unspec:CC + [(match_operand:V16QI 2 "register_operand" "x,x,x,x") + 
(match_operand:SI 3 "register_operand" "a,a,a,a") + (match_operand:V16QI 4 "nonimmediate_operand" "x,m,x,m") + (match_operand:SI 5 "register_operand" "d,d,d,d") + (match_operand:SI 6 "const_0_to_255_operand" "n,n,n,n")] + UNSPEC_PCMPESTR)) + (clobber (match_scratch:V16QI 0 "=Yz,Yz,X,X")) + (clobber (match_scratch:SI 1 "= X, X,c,c"))] + "TARGET_SSE4_2" + "@ + %vpcmpestrm\t{%6, %4, %2|%2, %4, %6} + %vpcmpestrm\t{%6, %4, %2|%2, %4, %6} + %vpcmpestri\t{%6, %4, %2|%2, %4, %6} + %vpcmpestri\t{%6, %4, %2|%2, %4, %6}" + [(set_attr "type" "sselog") + (set_attr "prefix_data16" "1") + (set_attr "prefix_extra" "1") + (set_attr "length_immediate" "1") + (set_attr "memory" "none,load,none,load") + (set_attr "prefix" "maybe_vex") + (set_attr "mode" "TI")]) + +(define_insn_and_split "sse4_2_pcmpistr" + [(set (match_operand:SI 0 "register_operand" "=c,c") + (unspec:SI + [(match_operand:V16QI 2 "reg_not_xmm0_operand" "x,x") + (match_operand:V16QI 3 "nonimm_not_xmm0_operand" "x,m") + (match_operand:SI 4 "const_0_to_255_operand" "n,n")] + UNSPEC_PCMPISTR)) + (set (match_operand:V16QI 1 "register_operand" "=Yz,Yz") + (unspec:V16QI + [(match_dup 2) + (match_dup 3) + (match_dup 4)] + UNSPEC_PCMPISTR)) + (set (reg:CC FLAGS_REG) + (unspec:CC + [(match_dup 2) + (match_dup 3) + (match_dup 4)] + UNSPEC_PCMPISTR))] + "TARGET_SSE4_2 + && can_create_pseudo_p ()" + "#" + "&& 1" + [(const_int 0)] +{ + int ecx = !find_regno_note (curr_insn, REG_UNUSED, REGNO (operands[0])); + int xmm0 = !find_regno_note (curr_insn, REG_UNUSED, REGNO (operands[1])); + int flags = !find_regno_note (curr_insn, REG_UNUSED, FLAGS_REG); + + if (ecx) + emit_insn (gen_sse4_2_pcmpistri (operands[0], operands[2], + operands[3], operands[4])); + if (xmm0) + emit_insn (gen_sse4_2_pcmpistrm (operands[1], operands[2], + operands[3], operands[4])); + if (flags && !(ecx || xmm0)) + emit_insn (gen_sse4_2_pcmpistr_cconly (NULL, NULL, + operands[2], operands[3], + operands[4])); + if (!(flags || ecx || xmm0)) + emit_note (NOTE_INSN_DELETED); + + DONE; +} + [(set_attr "type" "sselog") + (set_attr "prefix_data16" "1") + (set_attr "prefix_extra" "1") + (set_attr "length_immediate" "1") + (set_attr "memory" "none,load") + (set_attr "mode" "TI")]) + +(define_insn "sse4_2_pcmpistri" + [(set (match_operand:SI 0 "register_operand" "=c,c") + (unspec:SI + [(match_operand:V16QI 1 "register_operand" "x,x") + (match_operand:V16QI 2 "nonimmediate_operand" "x,m") + (match_operand:SI 3 "const_0_to_255_operand" "n,n")] + UNSPEC_PCMPISTR)) + (set (reg:CC FLAGS_REG) + (unspec:CC + [(match_dup 1) + (match_dup 2) + (match_dup 3)] + UNSPEC_PCMPISTR))] + "TARGET_SSE4_2" + "%vpcmpistri\t{%3, %2, %1|%1, %2, %3}" + [(set_attr "type" "sselog") + (set_attr "prefix_data16" "1") + (set_attr "prefix_extra" "1") + (set_attr "length_immediate" "1") + (set_attr "prefix" "maybe_vex") + (set_attr "memory" "none,load") + (set_attr "mode" "TI")]) + +(define_insn "sse4_2_pcmpistrm" + [(set (match_operand:V16QI 0 "register_operand" "=Yz,Yz") + (unspec:V16QI + [(match_operand:V16QI 1 "register_operand" "x,x") + (match_operand:V16QI 2 "nonimmediate_operand" "x,m") + (match_operand:SI 3 "const_0_to_255_operand" "n,n")] + UNSPEC_PCMPISTR)) + (set (reg:CC FLAGS_REG) + (unspec:CC + [(match_dup 1) + (match_dup 2) + (match_dup 3)] + UNSPEC_PCMPISTR))] + "TARGET_SSE4_2" + "%vpcmpistrm\t{%3, %2, %1|%1, %2, %3}" + [(set_attr "type" "sselog") + (set_attr "prefix_data16" "1") + (set_attr "prefix_extra" "1") + (set_attr "length_immediate" "1") + (set_attr "prefix" "maybe_vex") + (set_attr "memory" "none,load") 
+ (set_attr "mode" "TI")]) + +(define_insn "sse4_2_pcmpistr_cconly" + [(set (reg:CC FLAGS_REG) + (unspec:CC + [(match_operand:V16QI 2 "register_operand" "x,x,x,x") + (match_operand:V16QI 3 "nonimmediate_operand" "x,m,x,m") + (match_operand:SI 4 "const_0_to_255_operand" "n,n,n,n")] + UNSPEC_PCMPISTR)) + (clobber (match_scratch:V16QI 0 "=Yz,Yz,X,X")) + (clobber (match_scratch:SI 1 "= X, X,c,c"))] + "TARGET_SSE4_2" + "@ + %vpcmpistrm\t{%4, %3, %2|%2, %3, %4} + %vpcmpistrm\t{%4, %3, %2|%2, %3, %4} + %vpcmpistri\t{%4, %3, %2|%2, %3, %4} + %vpcmpistri\t{%4, %3, %2|%2, %3, %4}" + [(set_attr "type" "sselog") + (set_attr "prefix_data16" "1") + (set_attr "prefix_extra" "1") + (set_attr "length_immediate" "1") + (set_attr "memory" "none,load,none,load") + (set_attr "prefix" "maybe_vex") + (set_attr "mode" "TI")]) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; +;; XOP instructions +;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; XOP parallel integer multiply/add instructions. +;; Note the XOP multiply/add instructions +;; a[i] = b[i] * c[i] + d[i]; +;; do not allow the value being added to be a memory operation. +(define_insn "xop_pmacsww" + [(set (match_operand:V8HI 0 "register_operand" "=x") + (plus:V8HI + (mult:V8HI + (match_operand:V8HI 1 "nonimmediate_operand" "%x") + (match_operand:V8HI 2 "nonimmediate_operand" "xm")) + (match_operand:V8HI 3 "nonimmediate_operand" "x")))] + "TARGET_XOP" + "vpmacsww\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "TI")]) + +(define_insn "xop_pmacssww" + [(set (match_operand:V8HI 0 "register_operand" "=x") + (ss_plus:V8HI + (mult:V8HI (match_operand:V8HI 1 "nonimmediate_operand" "%x") + (match_operand:V8HI 2 "nonimmediate_operand" "xm")) + (match_operand:V8HI 3 "nonimmediate_operand" "x")))] + "TARGET_XOP" + "vpmacssww\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "TI")]) + +(define_insn "xop_pmacsdd" + [(set (match_operand:V4SI 0 "register_operand" "=x") + (plus:V4SI + (mult:V4SI + (match_operand:V4SI 1 "nonimmediate_operand" "%x") + (match_operand:V4SI 2 "nonimmediate_operand" "xm")) + (match_operand:V4SI 3 "nonimmediate_operand" "x")))] + "TARGET_XOP" + "vpmacsdd\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "TI")]) + +(define_insn "xop_pmacssdd" + [(set (match_operand:V4SI 0 "register_operand" "=x") + (ss_plus:V4SI + (mult:V4SI (match_operand:V4SI 1 "nonimmediate_operand" "%x") + (match_operand:V4SI 2 "nonimmediate_operand" "xm")) + (match_operand:V4SI 3 "nonimmediate_operand" "x")))] + "TARGET_XOP" + "vpmacssdd\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "TI")]) + +(define_insn "xop_pmacssdql" + [(set (match_operand:V2DI 0 "register_operand" "=x") + (ss_plus:V2DI + (mult:V2DI + (sign_extend:V2DI + (vec_select:V2SI + (match_operand:V4SI 1 "nonimmediate_operand" "%x") + (parallel [(const_int 1) + (const_int 3)]))) + (vec_select:V2SI + (match_operand:V4SI 2 "nonimmediate_operand" "xm") + (parallel [(const_int 1) + (const_int 3)]))) + (match_operand:V2DI 3 "nonimmediate_operand" "x")))] + "TARGET_XOP" + "vpmacssdql\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "TI")]) + +(define_insn "xop_pmacssdqh" + [(set (match_operand:V2DI 0 "register_operand" "=x") + (ss_plus:V2DI + (mult:V2DI + (sign_extend:V2DI + (vec_select:V2SI + (match_operand:V4SI 1 "nonimmediate_operand" "%x") + (parallel [(const_int 0) + 
(const_int 2)]))) + (sign_extend:V2DI + (vec_select:V2SI + (match_operand:V4SI 2 "nonimmediate_operand" "xm") + (parallel [(const_int 0) + (const_int 2)])))) + (match_operand:V2DI 3 "nonimmediate_operand" "x")))] + "TARGET_XOP" + "vpmacssdqh\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "TI")]) + +(define_insn "xop_pmacsdql" + [(set (match_operand:V2DI 0 "register_operand" "=x") + (plus:V2DI + (mult:V2DI + (sign_extend:V2DI + (vec_select:V2SI + (match_operand:V4SI 1 "nonimmediate_operand" "%x") + (parallel [(const_int 1) + (const_int 3)]))) + (sign_extend:V2DI + (vec_select:V2SI + (match_operand:V4SI 2 "nonimmediate_operand" "xm") + (parallel [(const_int 1) + (const_int 3)])))) + (match_operand:V2DI 3 "nonimmediate_operand" "x")))] + "TARGET_XOP" + "vpmacsdql\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "TI")]) + +;; We don't have a straight 32-bit parallel multiply and extend on XOP, so +;; fake it with a multiply/add. In general, we expect the define_split to +;; occur before register allocation, so we have to handle the corner case where +;; the target is the same as operands 1/2 +(define_insn_and_split "xop_mulv2div2di3_low" + [(set (match_operand:V2DI 0 "register_operand" "=&x") + (mult:V2DI + (sign_extend:V2DI + (vec_select:V2SI + (match_operand:V4SI 1 "register_operand" "%x") + (parallel [(const_int 1) + (const_int 3)]))) + (sign_extend:V2DI + (vec_select:V2SI + (match_operand:V4SI 2 "nonimmediate_operand" "xm") + (parallel [(const_int 1) + (const_int 3)])))))] + "TARGET_XOP" + "#" + "&& reload_completed" + [(set (match_dup 0) + (match_dup 3)) + (set (match_dup 0) + (plus:V2DI + (mult:V2DI + (sign_extend:V2DI + (vec_select:V2SI + (match_dup 1) + (parallel [(const_int 1) + (const_int 3)]))) + (sign_extend:V2DI + (vec_select:V2SI + (match_dup 2) + (parallel [(const_int 1) + (const_int 3)])))) + (match_dup 0)))] +{ + operands[3] = CONST0_RTX (V2DImode); +} + [(set_attr "type" "ssemul") + (set_attr "mode" "TI")]) + +(define_insn "xop_pmacsdqh" + [(set (match_operand:V2DI 0 "register_operand" "=x") + (plus:V2DI + (mult:V2DI + (sign_extend:V2DI + (vec_select:V2SI + (match_operand:V4SI 1 "nonimmediate_operand" "%x") + (parallel [(const_int 0) + (const_int 2)]))) + (sign_extend:V2DI + (vec_select:V2SI + (match_operand:V4SI 2 "nonimmediate_operand" "xm") + (parallel [(const_int 0) + (const_int 2)])))) + (match_operand:V2DI 3 "nonimmediate_operand" "x")))] + "TARGET_XOP" + "vpmacsdqh\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "TI")]) + +;; We don't have a straight 32-bit parallel multiply and extend on XOP, so +;; fake it with a multiply/add. 
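;;
;; (Illustration added in editing, not part of the imported source: the
;; split used by xop_mulv2div2di3_low/_high first loads CONST0_RTX into
;; the earlyclobbered destination and then lets vpmacsdql/vpmacsdqh
;; accumulate the widened product into it.  Per 64-bit result lane that is
;; a sign-extending 32x32->64 multiply added to zero; widening_mul32 below
;; is a hypothetical scalar reference for a single lane.)
;;
;;   #include <stdint.h>
;;   static int64_t
;;   widening_mul32 (int32_t a, int32_t b)
;;   {
;;     int64_t acc = 0;                    /* set (match_dup 0) to zero  */
;;     acc += (int64_t) a * (int64_t) b;   /* vpmacsdql/vpmacsdqh step   */
;;     return acc;
;;   }
;;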
In general, we expect the define_split to +;; occur before register allocation, so we have to handle the corner case where +;; the target is the same as either operands[1] or operands[2] +(define_insn_and_split "xop_mulv2div2di3_high" + [(set (match_operand:V2DI 0 "register_operand" "=&x") + (mult:V2DI + (sign_extend:V2DI + (vec_select:V2SI + (match_operand:V4SI 1 "register_operand" "%x") + (parallel [(const_int 0) + (const_int 2)]))) + (sign_extend:V2DI + (vec_select:V2SI + (match_operand:V4SI 2 "nonimmediate_operand" "xm") + (parallel [(const_int 0) + (const_int 2)])))))] + "TARGET_XOP" + "#" + "&& reload_completed" + [(set (match_dup 0) + (match_dup 3)) + (set (match_dup 0) + (plus:V2DI + (mult:V2DI + (sign_extend:V2DI + (vec_select:V2SI + (match_dup 1) + (parallel [(const_int 0) + (const_int 2)]))) + (sign_extend:V2DI + (vec_select:V2SI + (match_dup 2) + (parallel [(const_int 0) + (const_int 2)])))) + (match_dup 0)))] +{ + operands[3] = CONST0_RTX (V2DImode); +} + [(set_attr "type" "ssemul") + (set_attr "mode" "TI")]) + +;; XOP parallel integer multiply/add instructions for the intrinisics +(define_insn "xop_pmacsswd" + [(set (match_operand:V4SI 0 "register_operand" "=x") + (ss_plus:V4SI + (mult:V4SI + (sign_extend:V4SI + (vec_select:V4HI + (match_operand:V8HI 1 "nonimmediate_operand" "%x") + (parallel [(const_int 1) + (const_int 3) + (const_int 5) + (const_int 7)]))) + (sign_extend:V4SI + (vec_select:V4HI + (match_operand:V8HI 2 "nonimmediate_operand" "xm") + (parallel [(const_int 1) + (const_int 3) + (const_int 5) + (const_int 7)])))) + (match_operand:V4SI 3 "nonimmediate_operand" "x")))] + "TARGET_XOP" + "vpmacsswd\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "TI")]) + +(define_insn "xop_pmacswd" + [(set (match_operand:V4SI 0 "register_operand" "=x") + (plus:V4SI + (mult:V4SI + (sign_extend:V4SI + (vec_select:V4HI + (match_operand:V8HI 1 "nonimmediate_operand" "%x") + (parallel [(const_int 1) + (const_int 3) + (const_int 5) + (const_int 7)]))) + (sign_extend:V4SI + (vec_select:V4HI + (match_operand:V8HI 2 "nonimmediate_operand" "xm") + (parallel [(const_int 1) + (const_int 3) + (const_int 5) + (const_int 7)])))) + (match_operand:V4SI 3 "nonimmediate_operand" "x")))] + "TARGET_XOP" + "vpmacswd\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "TI")]) + +(define_insn "xop_pmadcsswd" + [(set (match_operand:V4SI 0 "register_operand" "=x") + (ss_plus:V4SI + (plus:V4SI + (mult:V4SI + (sign_extend:V4SI + (vec_select:V4HI + (match_operand:V8HI 1 "nonimmediate_operand" "%x") + (parallel [(const_int 0) + (const_int 2) + (const_int 4) + (const_int 6)]))) + (sign_extend:V4SI + (vec_select:V4HI + (match_operand:V8HI 2 "nonimmediate_operand" "xm") + (parallel [(const_int 0) + (const_int 2) + (const_int 4) + (const_int 6)])))) + (mult:V4SI + (sign_extend:V4SI + (vec_select:V4HI + (match_dup 1) + (parallel [(const_int 1) + (const_int 3) + (const_int 5) + (const_int 7)]))) + (sign_extend:V4SI + (vec_select:V4HI + (match_dup 2) + (parallel [(const_int 1) + (const_int 3) + (const_int 5) + (const_int 7)]))))) + (match_operand:V4SI 3 "nonimmediate_operand" "x")))] + "TARGET_XOP" + "vpmadcsswd\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "TI")]) + +(define_insn "xop_pmadcswd" + [(set (match_operand:V4SI 0 "register_operand" "=x") + (plus:V4SI + (plus:V4SI + (mult:V4SI + (sign_extend:V4SI + (vec_select:V4HI + (match_operand:V8HI 1 "nonimmediate_operand" "%x") + (parallel [(const_int 0) + 
(const_int 2) + (const_int 4) + (const_int 6)]))) + (sign_extend:V4SI + (vec_select:V4HI + (match_operand:V8HI 2 "nonimmediate_operand" "xm") + (parallel [(const_int 0) + (const_int 2) + (const_int 4) + (const_int 6)])))) + (mult:V4SI + (sign_extend:V4SI + (vec_select:V4HI + (match_dup 1) + (parallel [(const_int 1) + (const_int 3) + (const_int 5) + (const_int 7)]))) + (sign_extend:V4SI + (vec_select:V4HI + (match_dup 2) + (parallel [(const_int 1) + (const_int 3) + (const_int 5) + (const_int 7)]))))) + (match_operand:V4SI 3 "nonimmediate_operand" "x")))] + "TARGET_XOP" + "vpmadcswd\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "TI")]) + +;; XOP parallel XMM conditional moves +(define_insn "xop_pcmov_" + [(set (match_operand:SSEMODE 0 "register_operand" "=x,x") + (if_then_else:SSEMODE + (match_operand:SSEMODE 3 "nonimmediate_operand" "x,m") + (match_operand:SSEMODE 1 "register_operand" "x,x") + (match_operand:SSEMODE 2 "nonimmediate_operand" "xm,x")))] + "TARGET_XOP" + "vpcmov\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "sse4arg")]) + +(define_insn "xop_pcmov_256" + [(set (match_operand:AVX256MODE 0 "register_operand" "=x,x") + (if_then_else:AVX256MODE + (match_operand:AVX256MODE 3 "nonimmediate_operand" "x,m") + (match_operand:AVX256MODE 1 "register_operand" "x,x") + (match_operand:AVX256MODE 2 "nonimmediate_operand" "xm,x")))] + "TARGET_XOP" + "vpcmov\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "sse4arg")]) + +;; XOP horizontal add/subtract instructions +(define_insn "xop_phaddbw" + [(set (match_operand:V8HI 0 "register_operand" "=x") + (plus:V8HI + (sign_extend:V8HI + (vec_select:V8QI + (match_operand:V16QI 1 "nonimmediate_operand" "xm") + (parallel [(const_int 0) + (const_int 2) + (const_int 4) + (const_int 6) + (const_int 8) + (const_int 10) + (const_int 12) + (const_int 14)]))) + (sign_extend:V8HI + (vec_select:V8QI + (match_dup 1) + (parallel [(const_int 1) + (const_int 3) + (const_int 5) + (const_int 7) + (const_int 9) + (const_int 11) + (const_int 13) + (const_int 15)])))))] + "TARGET_XOP" + "vphaddbw\t{%1, %0|%0, %1}" + [(set_attr "type" "sseiadd1")]) + +(define_insn "xop_phaddbd" + [(set (match_operand:V4SI 0 "register_operand" "=x") + (plus:V4SI + (plus:V4SI + (sign_extend:V4SI + (vec_select:V4QI + (match_operand:V16QI 1 "nonimmediate_operand" "xm") + (parallel [(const_int 0) + (const_int 4) + (const_int 8) + (const_int 12)]))) + (sign_extend:V4SI + (vec_select:V4QI + (match_dup 1) + (parallel [(const_int 1) + (const_int 5) + (const_int 9) + (const_int 13)])))) + (plus:V4SI + (sign_extend:V4SI + (vec_select:V4QI + (match_dup 1) + (parallel [(const_int 2) + (const_int 6) + (const_int 10) + (const_int 14)]))) + (sign_extend:V4SI + (vec_select:V4QI + (match_dup 1) + (parallel [(const_int 3) + (const_int 7) + (const_int 11) + (const_int 15)]))))))] + "TARGET_XOP" + "vphaddbd\t{%1, %0|%0, %1}" + [(set_attr "type" "sseiadd1")]) + +(define_insn "xop_phaddbq" + [(set (match_operand:V2DI 0 "register_operand" "=x") + (plus:V2DI + (plus:V2DI + (plus:V2DI + (sign_extend:V2DI + (vec_select:V2QI + (match_operand:V16QI 1 "nonimmediate_operand" "xm") + (parallel [(const_int 0) + (const_int 4)]))) + (sign_extend:V2DI + (vec_select:V2QI + (match_dup 1) + (parallel [(const_int 1) + (const_int 5)])))) + (plus:V2DI + (sign_extend:V2DI + (vec_select:V2QI + (match_dup 1) + (parallel [(const_int 2) + (const_int 6)]))) + (sign_extend:V2DI + (vec_select:V2QI + (match_dup 1) + (parallel [(const_int 3) + (const_int 7)]))))) + (plus:V2DI + 
(plus:V2DI + (sign_extend:V2DI + (vec_select:V2QI + (match_dup 1) + (parallel [(const_int 8) + (const_int 12)]))) + (sign_extend:V2DI + (vec_select:V2QI + (match_dup 1) + (parallel [(const_int 9) + (const_int 13)])))) + (plus:V2DI + (sign_extend:V2DI + (vec_select:V2QI + (match_dup 1) + (parallel [(const_int 10) + (const_int 14)]))) + (sign_extend:V2DI + (vec_select:V2QI + (match_dup 1) + (parallel [(const_int 11) + (const_int 15)])))))))] + "TARGET_XOP" + "vphaddbq\t{%1, %0|%0, %1}" + [(set_attr "type" "sseiadd1")]) + +(define_insn "xop_phaddwd" + [(set (match_operand:V4SI 0 "register_operand" "=x") + (plus:V4SI + (sign_extend:V4SI + (vec_select:V4HI + (match_operand:V8HI 1 "nonimmediate_operand" "xm") + (parallel [(const_int 0) + (const_int 2) + (const_int 4) + (const_int 6)]))) + (sign_extend:V4SI + (vec_select:V4HI + (match_dup 1) + (parallel [(const_int 1) + (const_int 3) + (const_int 5) + (const_int 7)])))))] + "TARGET_XOP" + "vphaddwd\t{%1, %0|%0, %1}" + [(set_attr "type" "sseiadd1")]) + +(define_insn "xop_phaddwq" + [(set (match_operand:V2DI 0 "register_operand" "=x") + (plus:V2DI + (plus:V2DI + (sign_extend:V2DI + (vec_select:V2HI + (match_operand:V8HI 1 "nonimmediate_operand" "xm") + (parallel [(const_int 0) + (const_int 4)]))) + (sign_extend:V2DI + (vec_select:V2HI + (match_dup 1) + (parallel [(const_int 1) + (const_int 5)])))) + (plus:V2DI + (sign_extend:V2DI + (vec_select:V2HI + (match_dup 1) + (parallel [(const_int 2) + (const_int 6)]))) + (sign_extend:V2DI + (vec_select:V2HI + (match_dup 1) + (parallel [(const_int 3) + (const_int 7)]))))))] + "TARGET_XOP" + "vphaddwq\t{%1, %0|%0, %1}" + [(set_attr "type" "sseiadd1")]) + +(define_insn "xop_phadddq" + [(set (match_operand:V2DI 0 "register_operand" "=x") + (plus:V2DI + (sign_extend:V2DI + (vec_select:V2SI + (match_operand:V4SI 1 "nonimmediate_operand" "xm") + (parallel [(const_int 0) + (const_int 2)]))) + (sign_extend:V2DI + (vec_select:V2SI + (match_dup 1) + (parallel [(const_int 1) + (const_int 3)])))))] + "TARGET_XOP" + "vphadddq\t{%1, %0|%0, %1}" + [(set_attr "type" "sseiadd1")]) + +(define_insn "xop_phaddubw" + [(set (match_operand:V8HI 0 "register_operand" "=x") + (plus:V8HI + (zero_extend:V8HI + (vec_select:V8QI + (match_operand:V16QI 1 "nonimmediate_operand" "xm") + (parallel [(const_int 0) + (const_int 2) + (const_int 4) + (const_int 6) + (const_int 8) + (const_int 10) + (const_int 12) + (const_int 14)]))) + (zero_extend:V8HI + (vec_select:V8QI + (match_dup 1) + (parallel [(const_int 1) + (const_int 3) + (const_int 5) + (const_int 7) + (const_int 9) + (const_int 11) + (const_int 13) + (const_int 15)])))))] + "TARGET_XOP" + "vphaddubw\t{%1, %0|%0, %1}" + [(set_attr "type" "sseiadd1")]) + +(define_insn "xop_phaddubd" + [(set (match_operand:V4SI 0 "register_operand" "=x") + (plus:V4SI + (plus:V4SI + (zero_extend:V4SI + (vec_select:V4QI + (match_operand:V16QI 1 "nonimmediate_operand" "xm") + (parallel [(const_int 0) + (const_int 4) + (const_int 8) + (const_int 12)]))) + (zero_extend:V4SI + (vec_select:V4QI + (match_dup 1) + (parallel [(const_int 1) + (const_int 5) + (const_int 9) + (const_int 13)])))) + (plus:V4SI + (zero_extend:V4SI + (vec_select:V4QI + (match_dup 1) + (parallel [(const_int 2) + (const_int 6) + (const_int 10) + (const_int 14)]))) + (zero_extend:V4SI + (vec_select:V4QI + (match_dup 1) + (parallel [(const_int 3) + (const_int 7) + (const_int 11) + (const_int 15)]))))))] + "TARGET_XOP" + "vphaddubd\t{%1, %0|%0, %1}" + [(set_attr "type" "sseiadd1")]) + +(define_insn "xop_phaddubq" + [(set (match_operand:V2DI 0 
"register_operand" "=x") + (plus:V2DI + (plus:V2DI + (plus:V2DI + (zero_extend:V2DI + (vec_select:V2QI + (match_operand:V16QI 1 "nonimmediate_operand" "xm") + (parallel [(const_int 0) + (const_int 4)]))) + (sign_extend:V2DI + (vec_select:V2QI + (match_dup 1) + (parallel [(const_int 1) + (const_int 5)])))) + (plus:V2DI + (zero_extend:V2DI + (vec_select:V2QI + (match_dup 1) + (parallel [(const_int 2) + (const_int 6)]))) + (zero_extend:V2DI + (vec_select:V2QI + (match_dup 1) + (parallel [(const_int 3) + (const_int 7)]))))) + (plus:V2DI + (plus:V2DI + (zero_extend:V2DI + (vec_select:V2QI + (match_dup 1) + (parallel [(const_int 8) + (const_int 12)]))) + (sign_extend:V2DI + (vec_select:V2QI + (match_dup 1) + (parallel [(const_int 9) + (const_int 13)])))) + (plus:V2DI + (zero_extend:V2DI + (vec_select:V2QI + (match_dup 1) + (parallel [(const_int 10) + (const_int 14)]))) + (zero_extend:V2DI + (vec_select:V2QI + (match_dup 1) + (parallel [(const_int 11) + (const_int 15)])))))))] + "TARGET_XOP" + "vphaddubq\t{%1, %0|%0, %1}" + [(set_attr "type" "sseiadd1")]) + +(define_insn "xop_phadduwd" + [(set (match_operand:V4SI 0 "register_operand" "=x") + (plus:V4SI + (zero_extend:V4SI + (vec_select:V4HI + (match_operand:V8HI 1 "nonimmediate_operand" "xm") + (parallel [(const_int 0) + (const_int 2) + (const_int 4) + (const_int 6)]))) + (zero_extend:V4SI + (vec_select:V4HI + (match_dup 1) + (parallel [(const_int 1) + (const_int 3) + (const_int 5) + (const_int 7)])))))] + "TARGET_XOP" + "vphadduwd\t{%1, %0|%0, %1}" + [(set_attr "type" "sseiadd1")]) + +(define_insn "xop_phadduwq" + [(set (match_operand:V2DI 0 "register_operand" "=x") + (plus:V2DI + (plus:V2DI + (zero_extend:V2DI + (vec_select:V2HI + (match_operand:V8HI 1 "nonimmediate_operand" "xm") + (parallel [(const_int 0) + (const_int 4)]))) + (zero_extend:V2DI + (vec_select:V2HI + (match_dup 1) + (parallel [(const_int 1) + (const_int 5)])))) + (plus:V2DI + (zero_extend:V2DI + (vec_select:V2HI + (match_dup 1) + (parallel [(const_int 2) + (const_int 6)]))) + (zero_extend:V2DI + (vec_select:V2HI + (match_dup 1) + (parallel [(const_int 3) + (const_int 7)]))))))] + "TARGET_XOP" + "vphadduwq\t{%1, %0|%0, %1}" + [(set_attr "type" "sseiadd1")]) + +(define_insn "xop_phaddudq" + [(set (match_operand:V2DI 0 "register_operand" "=x") + (plus:V2DI + (zero_extend:V2DI + (vec_select:V2SI + (match_operand:V4SI 1 "nonimmediate_operand" "xm") + (parallel [(const_int 0) + (const_int 2)]))) + (zero_extend:V2DI + (vec_select:V2SI + (match_dup 1) + (parallel [(const_int 1) + (const_int 3)])))))] + "TARGET_XOP" + "vphaddudq\t{%1, %0|%0, %1}" + [(set_attr "type" "sseiadd1")]) + +(define_insn "xop_phsubbw" + [(set (match_operand:V8HI 0 "register_operand" "=x") + (minus:V8HI + (sign_extend:V8HI + (vec_select:V8QI + (match_operand:V16QI 1 "nonimmediate_operand" "xm") + (parallel [(const_int 0) + (const_int 2) + (const_int 4) + (const_int 6) + (const_int 8) + (const_int 10) + (const_int 12) + (const_int 14)]))) + (sign_extend:V8HI + (vec_select:V8QI + (match_dup 1) + (parallel [(const_int 1) + (const_int 3) + (const_int 5) + (const_int 7) + (const_int 9) + (const_int 11) + (const_int 13) + (const_int 15)])))))] + "TARGET_XOP" + "vphsubbw\t{%1, %0|%0, %1}" + [(set_attr "type" "sseiadd1")]) + +(define_insn "xop_phsubwd" + [(set (match_operand:V4SI 0 "register_operand" "=x") + (minus:V4SI + (sign_extend:V4SI + (vec_select:V4HI + (match_operand:V8HI 1 "nonimmediate_operand" "xm") + (parallel [(const_int 0) + (const_int 2) + (const_int 4) + (const_int 6)]))) + (sign_extend:V4SI + 
(vec_select:V4HI + (match_dup 1) + (parallel [(const_int 1) + (const_int 3) + (const_int 5) + (const_int 7)])))))] + "TARGET_XOP" + "vphsubwd\t{%1, %0|%0, %1}" + [(set_attr "type" "sseiadd1")]) + +(define_insn "xop_phsubdq" + [(set (match_operand:V2DI 0 "register_operand" "=x") + (minus:V2DI + (sign_extend:V2DI + (vec_select:V2SI + (match_operand:V4SI 1 "nonimmediate_operand" "xm") + (parallel [(const_int 0) + (const_int 2)]))) + (sign_extend:V2DI + (vec_select:V2SI + (match_dup 1) + (parallel [(const_int 1) + (const_int 3)])))))] + "TARGET_XOP" + "vphsubdq\t{%1, %0|%0, %1}" + [(set_attr "type" "sseiadd1")]) + +;; XOP permute instructions +(define_insn "xop_pperm" + [(set (match_operand:V16QI 0 "register_operand" "=x,x") + (unspec:V16QI + [(match_operand:V16QI 1 "register_operand" "x,x") + (match_operand:V16QI 2 "nonimmediate_operand" "x,m") + (match_operand:V16QI 3 "nonimmediate_operand" "xm,x")] + UNSPEC_XOP_PERMUTE))] + "TARGET_XOP && !(MEM_P (operands[2]) && MEM_P (operands[3]))" + "vpperm\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "sse4arg") + (set_attr "mode" "TI")]) + +;; XOP pack instructions that combine two vectors into a smaller vector +(define_insn "xop_pperm_pack_v2di_v4si" + [(set (match_operand:V4SI 0 "register_operand" "=x,x") + (vec_concat:V4SI + (truncate:V2SI + (match_operand:V2DI 1 "register_operand" "x,x")) + (truncate:V2SI + (match_operand:V2DI 2 "nonimmediate_operand" "x,m")))) + (use (match_operand:V16QI 3 "nonimmediate_operand" "xm,x"))] + "TARGET_XOP && !(MEM_P (operands[2]) && MEM_P (operands[3]))" + "vpperm\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "sse4arg") + (set_attr "mode" "TI")]) + +(define_insn "xop_pperm_pack_v4si_v8hi" + [(set (match_operand:V8HI 0 "register_operand" "=x,x") + (vec_concat:V8HI + (truncate:V4HI + (match_operand:V4SI 1 "register_operand" "x,x")) + (truncate:V4HI + (match_operand:V4SI 2 "nonimmediate_operand" "x,m")))) + (use (match_operand:V16QI 3 "nonimmediate_operand" "xm,x"))] + "TARGET_XOP && !(MEM_P (operands[2]) && MEM_P (operands[3]))" + "vpperm\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "sse4arg") + (set_attr "mode" "TI")]) + +(define_insn "xop_pperm_pack_v8hi_v16qi" + [(set (match_operand:V16QI 0 "register_operand" "=x,x") + (vec_concat:V16QI + (truncate:V8QI + (match_operand:V8HI 1 "register_operand" "x,x")) + (truncate:V8QI + (match_operand:V8HI 2 "nonimmediate_operand" "x,m")))) + (use (match_operand:V16QI 3 "nonimmediate_operand" "xm,x"))] + "TARGET_XOP && !(MEM_P (operands[2]) && MEM_P (operands[3]))" + "vpperm\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "sse4arg") + (set_attr "mode" "TI")]) + +;; XOP packed rotate instructions +(define_expand "rotl3" + [(set (match_operand:SSEMODE1248 0 "register_operand" "") + (rotate:SSEMODE1248 + (match_operand:SSEMODE1248 1 "nonimmediate_operand" "") + (match_operand:SI 2 "general_operand")))] + "TARGET_XOP" +{ + /* If we were given a scalar, convert it to parallel */ + if (! 
const_0_to__operand (operands[2], SImode)) + { + rtvec vs = rtvec_alloc (); + rtx par = gen_rtx_PARALLEL (mode, vs); + rtx reg = gen_reg_rtx (mode); + rtx op2 = operands[2]; + int i; + + if (GET_MODE (op2) != mode) + { + op2 = gen_reg_rtx (mode); + convert_move (op2, operands[2], false); + } + + for (i = 0; i < ; i++) + RTVEC_ELT (vs, i) = op2; + + emit_insn (gen_vec_init (reg, par)); + emit_insn (gen_xop_vrotl3 (operands[0], operands[1], reg)); + DONE; + } +}) + +(define_expand "rotr3" + [(set (match_operand:SSEMODE1248 0 "register_operand" "") + (rotatert:SSEMODE1248 + (match_operand:SSEMODE1248 1 "nonimmediate_operand" "") + (match_operand:SI 2 "general_operand")))] + "TARGET_XOP" +{ + /* If we were given a scalar, convert it to parallel */ + if (! const_0_to__operand (operands[2], SImode)) + { + rtvec vs = rtvec_alloc (); + rtx par = gen_rtx_PARALLEL (mode, vs); + rtx neg = gen_reg_rtx (mode); + rtx reg = gen_reg_rtx (mode); + rtx op2 = operands[2]; + int i; + + if (GET_MODE (op2) != mode) + { + op2 = gen_reg_rtx (mode); + convert_move (op2, operands[2], false); + } + + for (i = 0; i < ; i++) + RTVEC_ELT (vs, i) = op2; + + emit_insn (gen_vec_init (reg, par)); + emit_insn (gen_neg2 (neg, reg)); + emit_insn (gen_xop_vrotl3 (operands[0], operands[1], neg)); + DONE; + } +}) + +(define_insn "xop_rotl3" + [(set (match_operand:SSEMODE1248 0 "register_operand" "=x") + (rotate:SSEMODE1248 + (match_operand:SSEMODE1248 1 "nonimmediate_operand" "xm") + (match_operand:SI 2 "const_0_to__operand" "n")))] + "TARGET_XOP" + "vprot\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseishft") + (set_attr "length_immediate" "1") + (set_attr "mode" "TI")]) + +(define_insn "xop_rotr3" + [(set (match_operand:SSEMODE1248 0 "register_operand" "=x") + (rotatert:SSEMODE1248 + (match_operand:SSEMODE1248 1 "nonimmediate_operand" "xm") + (match_operand:SI 2 "const_0_to__operand" "n")))] + "TARGET_XOP" +{ + operands[3] = GEN_INT (( * 8) - INTVAL (operands[2])); + return \"vprot\t{%3, %1, %0|%0, %1, %3}\"; +} + [(set_attr "type" "sseishft") + (set_attr "length_immediate" "1") + (set_attr "mode" "TI")]) + +(define_expand "vrotr3" + [(match_operand:SSEMODE1248 0 "register_operand" "") + (match_operand:SSEMODE1248 1 "register_operand" "") + (match_operand:SSEMODE1248 2 "register_operand" "")] + "TARGET_XOP" +{ + rtx reg = gen_reg_rtx (mode); + emit_insn (gen_neg2 (reg, operands[2])); + emit_insn (gen_xop_vrotl3 (operands[0], operands[1], reg)); + DONE; +}) + +(define_expand "vrotl3" + [(match_operand:SSEMODE1248 0 "register_operand" "") + (match_operand:SSEMODE1248 1 "register_operand" "") + (match_operand:SSEMODE1248 2 "register_operand" "")] + "TARGET_XOP" +{ + emit_insn (gen_xop_vrotl3 (operands[0], operands[1], operands[2])); + DONE; +}) + +(define_insn "xop_vrotl3" + [(set (match_operand:SSEMODE1248 0 "register_operand" "=x,x") + (if_then_else:SSEMODE1248 + (ge:SSEMODE1248 + (match_operand:SSEMODE1248 2 "nonimmediate_operand" "x,m") + (const_int 0)) + (rotate:SSEMODE1248 + (match_operand:SSEMODE1248 1 "nonimmediate_operand" "xm,x") + (match_dup 2)) + (rotatert:SSEMODE1248 + (match_dup 1) + (neg:SSEMODE1248 (match_dup 2)))))] + "TARGET_XOP && !(MEM_P (operands[1]) && MEM_P (operands[2]))" + "vprot\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseishft") + (set_attr "prefix_data16" "0") + (set_attr "prefix_extra" "2") + (set_attr "mode" "TI")]) + +;; XOP packed shift instructions. 
+;; FIXME: add V2DI back in +(define_expand "vlshr3" + [(match_operand:SSEMODE124 0 "register_operand" "") + (match_operand:SSEMODE124 1 "register_operand" "") + (match_operand:SSEMODE124 2 "register_operand" "")] + "TARGET_XOP" +{ + rtx neg = gen_reg_rtx (mode); + emit_insn (gen_neg2 (neg, operands[2])); + emit_insn (gen_xop_lshl3 (operands[0], operands[1], neg)); + DONE; +}) + +(define_expand "vashr3" + [(match_operand:SSEMODE124 0 "register_operand" "") + (match_operand:SSEMODE124 1 "register_operand" "") + (match_operand:SSEMODE124 2 "register_operand" "")] + "TARGET_XOP" +{ + rtx neg = gen_reg_rtx (mode); + emit_insn (gen_neg2 (neg, operands[2])); + emit_insn (gen_xop_ashl3 (operands[0], operands[1], neg)); + DONE; +}) + +(define_expand "vashl3" + [(match_operand:SSEMODE124 0 "register_operand" "") + (match_operand:SSEMODE124 1 "register_operand" "") + (match_operand:SSEMODE124 2 "register_operand" "")] + "TARGET_XOP" +{ + emit_insn (gen_xop_ashl3 (operands[0], operands[1], operands[2])); + DONE; +}) + +(define_insn "xop_ashl3" + [(set (match_operand:SSEMODE1248 0 "register_operand" "=x,x") + (if_then_else:SSEMODE1248 + (ge:SSEMODE1248 + (match_operand:SSEMODE1248 2 "nonimmediate_operand" "x,m") + (const_int 0)) + (ashift:SSEMODE1248 + (match_operand:SSEMODE1248 1 "nonimmediate_operand" "xm,x") + (match_dup 2)) + (ashiftrt:SSEMODE1248 + (match_dup 1) + (neg:SSEMODE1248 (match_dup 2)))))] + "TARGET_XOP && !(MEM_P (operands[1]) && MEM_P (operands[2]))" + "vpsha\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseishft") + (set_attr "prefix_data16" "0") + (set_attr "prefix_extra" "2") + (set_attr "mode" "TI")]) + +(define_insn "xop_lshl3" + [(set (match_operand:SSEMODE1248 0 "register_operand" "=x,x") + (if_then_else:SSEMODE1248 + (ge:SSEMODE1248 + (match_operand:SSEMODE1248 2 "nonimmediate_operand" "x,m") + (const_int 0)) + (ashift:SSEMODE1248 + (match_operand:SSEMODE1248 1 "nonimmediate_operand" "xm,x") + (match_dup 2)) + (lshiftrt:SSEMODE1248 + (match_dup 1) + (neg:SSEMODE1248 (match_dup 2)))))] + "TARGET_XOP && !(MEM_P (operands[1]) && MEM_P (operands[2]))" + "vpshl\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseishft") + (set_attr "prefix_data16" "0") + (set_attr "prefix_extra" "2") + (set_attr "mode" "TI")]) + +;; SSE2 doesn't have some shift varients, so define versions for XOP +(define_expand "ashlv16qi3" + [(match_operand:V16QI 0 "register_operand" "") + (match_operand:V16QI 1 "register_operand" "") + (match_operand:SI 2 "nonmemory_operand" "")] + "TARGET_XOP" +{ + rtvec vs = rtvec_alloc (16); + rtx par = gen_rtx_PARALLEL (V16QImode, vs); + rtx reg = gen_reg_rtx (V16QImode); + int i; + for (i = 0; i < 16; i++) + RTVEC_ELT (vs, i) = operands[2]; + + emit_insn (gen_vec_initv16qi (reg, par)); + emit_insn (gen_xop_ashlv16qi3 (operands[0], operands[1], reg)); + DONE; +}) + +(define_expand "lshlv16qi3" + [(match_operand:V16QI 0 "register_operand" "") + (match_operand:V16QI 1 "register_operand" "") + (match_operand:SI 2 "nonmemory_operand" "")] + "TARGET_XOP" +{ + rtvec vs = rtvec_alloc (16); + rtx par = gen_rtx_PARALLEL (V16QImode, vs); + rtx reg = gen_reg_rtx (V16QImode); + int i; + for (i = 0; i < 16; i++) + RTVEC_ELT (vs, i) = operands[2]; + + emit_insn (gen_vec_initv16qi (reg, par)); + emit_insn (gen_xop_lshlv16qi3 (operands[0], operands[1], reg)); + DONE; +}) + +(define_expand "ashrv16qi3" + [(match_operand:V16QI 0 "register_operand" "") + (match_operand:V16QI 1 "register_operand" "") + (match_operand:SI 2 "nonmemory_operand" "")] + "TARGET_XOP" +{ + rtvec vs = rtvec_alloc (16); 
+ rtx par = gen_rtx_PARALLEL (V16QImode, vs); + rtx reg = gen_reg_rtx (V16QImode); + int i; + rtx ele = ((CONST_INT_P (operands[2])) + ? GEN_INT (- INTVAL (operands[2])) + : operands[2]); + + for (i = 0; i < 16; i++) + RTVEC_ELT (vs, i) = ele; + + emit_insn (gen_vec_initv16qi (reg, par)); + + if (!CONST_INT_P (operands[2])) + { + rtx neg = gen_reg_rtx (V16QImode); + emit_insn (gen_negv16qi2 (neg, reg)); + emit_insn (gen_xop_ashlv16qi3 (operands[0], operands[1], neg)); + } + else + emit_insn (gen_xop_ashlv16qi3 (operands[0], operands[1], reg)); + + DONE; +}) + +(define_expand "ashrv2di3" + [(match_operand:V2DI 0 "register_operand" "") + (match_operand:V2DI 1 "register_operand" "") + (match_operand:DI 2 "nonmemory_operand" "")] + "TARGET_XOP" +{ + rtvec vs = rtvec_alloc (2); + rtx par = gen_rtx_PARALLEL (V2DImode, vs); + rtx reg = gen_reg_rtx (V2DImode); + rtx ele; + + if (CONST_INT_P (operands[2])) + ele = GEN_INT (- INTVAL (operands[2])); + else if (GET_MODE (operands[2]) != DImode) + { + rtx move = gen_reg_rtx (DImode); + ele = gen_reg_rtx (DImode); + convert_move (move, operands[2], false); + emit_insn (gen_negdi2 (ele, move)); + } + else + { + ele = gen_reg_rtx (DImode); + emit_insn (gen_negdi2 (ele, operands[2])); + } + + RTVEC_ELT (vs, 0) = ele; + RTVEC_ELT (vs, 1) = ele; + emit_insn (gen_vec_initv2di (reg, par)); + emit_insn (gen_xop_ashlv2di3 (operands[0], operands[1], reg)); + DONE; +}) + +;; XOP FRCZ support +(define_insn "xop_frcz2" + [(set (match_operand:FMAMODE 0 "register_operand" "=x") + (unspec:FMAMODE + [(match_operand:FMAMODE 1 "nonimmediate_operand" "xm")] + UNSPEC_FRCZ))] + "TARGET_XOP" + "vfrcz\t{%1, %0|%0, %1}" + [(set_attr "type" "ssecvt1") + (set_attr "mode" "")]) + +;; scalar insns +(define_expand "xop_vmfrcz2" + [(set (match_operand:SSEMODEF2P 0 "register_operand") + (vec_merge:SSEMODEF2P + (unspec:SSEMODEF2P + [(match_operand:SSEMODEF2P 1 "nonimmediate_operand")] + UNSPEC_FRCZ) + (match_dup 3) + (const_int 1)))] + "TARGET_XOP" +{ + operands[3] = CONST0_RTX (mode); +}) + +(define_insn "*xop_vmfrcz_" + [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x") + (vec_merge:SSEMODEF2P + (unspec:SSEMODEF2P + [(match_operand:SSEMODEF2P 1 "nonimmediate_operand" "xm")] + UNSPEC_FRCZ) + (match_operand:SSEMODEF2P 2 "const0_operand") + (const_int 1)))] + "TARGET_XOP" + "vfrcz\t{%1, %0|%0, %1}" + [(set_attr "type" "ssecvt1") + (set_attr "mode" "")]) + +(define_insn "xop_maskcmp3" + [(set (match_operand:SSEMODE1248 0 "register_operand" "=x") + (match_operator:SSEMODE1248 1 "ix86_comparison_int_operator" + [(match_operand:SSEMODE1248 2 "register_operand" "x") + (match_operand:SSEMODE1248 3 "nonimmediate_operand" "xm")]))] + "TARGET_XOP" + "vpcom%Y1\t{%3, %2, %0|%0, %2, %3}" + [(set_attr "type" "sse4arg") + (set_attr "prefix_data16" "0") + (set_attr "prefix_rep" "0") + (set_attr "prefix_extra" "2") + (set_attr "length_immediate" "1") + (set_attr "mode" "TI")]) + +(define_insn "xop_maskcmp_uns3" + [(set (match_operand:SSEMODE1248 0 "register_operand" "=x") + (match_operator:SSEMODE1248 1 "ix86_comparison_uns_operator" + [(match_operand:SSEMODE1248 2 "register_operand" "x") + (match_operand:SSEMODE1248 3 "nonimmediate_operand" "xm")]))] + "TARGET_XOP" + "vpcom%Y1u\t{%3, %2, %0|%0, %2, %3}" + [(set_attr "type" "ssecmp") + (set_attr "prefix_data16" "0") + (set_attr "prefix_rep" "0") + (set_attr "prefix_extra" "2") + (set_attr "length_immediate" "1") + (set_attr "mode" "TI")]) + +;; Version of pcom*u* that is called from the intrinsics that allows pcomequ* +;; and pcomneu* not to be 
converted to the signed ones in case somebody needs +;; the exact instruction generated for the intrinsic. +(define_insn "xop_maskcmp_uns23" + [(set (match_operand:SSEMODE1248 0 "register_operand" "=x") + (unspec:SSEMODE1248 + [(match_operator:SSEMODE1248 1 "ix86_comparison_uns_operator" + [(match_operand:SSEMODE1248 2 "register_operand" "x") + (match_operand:SSEMODE1248 3 "nonimmediate_operand" "xm")])] + UNSPEC_XOP_UNSIGNED_CMP))] + "TARGET_XOP" + "vpcom%Y1u\t{%3, %2, %0|%0, %2, %3}" + [(set_attr "type" "ssecmp") + (set_attr "prefix_data16" "0") + (set_attr "prefix_extra" "2") + (set_attr "length_immediate" "1") + (set_attr "mode" "TI")]) + +;; Pcomtrue and pcomfalse support. These are useless instructions, but are +;; being added here to be complete. +(define_insn "xop_pcom_tf3" + [(set (match_operand:SSEMODE1248 0 "register_operand" "=x") + (unspec:SSEMODE1248 + [(match_operand:SSEMODE1248 1 "register_operand" "x") + (match_operand:SSEMODE1248 2 "nonimmediate_operand" "xm") + (match_operand:SI 3 "const_int_operand" "n")] + UNSPEC_XOP_TRUEFALSE))] + "TARGET_XOP" +{ + return ((INTVAL (operands[3]) != 0) + ? "vpcomtrue\t{%2, %1, %0|%0, %1, %2}" + : "vpcomfalse\t{%2, %1, %0|%0, %1, %2}"); +} + [(set_attr "type" "ssecmp") + (set_attr "prefix_data16" "0") + (set_attr "prefix_extra" "2") + (set_attr "length_immediate" "1") + (set_attr "mode" "TI")]) + +(define_insn "xop_vpermil23" + [(set (match_operand:AVXMODEF2P 0 "register_operand" "=x") + (unspec:AVXMODEF2P + [(match_operand:AVXMODEF2P 1 "register_operand" "x") + (match_operand:AVXMODEF2P 2 "nonimmediate_operand" "%x") + (match_operand: 3 "nonimmediate_operand" "xm") + (match_operand:SI 4 "const_0_to_3_operand" "n")] + UNSPEC_VPERMIL2))] + "TARGET_XOP" + "vpermil2\t{%4, %3, %2, %1, %0|%0, %1, %2, %3, %4}" + [(set_attr "type" "sse4arg") + (set_attr "length_immediate" "1") + (set_attr "mode" "")]) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +(define_insn "*avx_aesenc" + [(set (match_operand:V2DI 0 "register_operand" "=x") + (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "x") + (match_operand:V2DI 2 "nonimmediate_operand" "xm")] + UNSPEC_AESENC))] + "TARGET_AES && TARGET_AVX" + "vaesenc\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog1") + (set_attr "prefix_extra" "1") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + +(define_insn "aesenc" + [(set (match_operand:V2DI 0 "register_operand" "=x") + (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "0") + (match_operand:V2DI 2 "nonimmediate_operand" "xm")] + UNSPEC_AESENC))] + "TARGET_AES" + "aesenc\t{%2, %0|%0, %2}" + [(set_attr "type" "sselog1") + (set_attr "prefix_extra" "1") + (set_attr "mode" "TI")]) + +(define_insn "*avx_aesenclast" + [(set (match_operand:V2DI 0 "register_operand" "=x") + (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "x") + (match_operand:V2DI 2 "nonimmediate_operand" "xm")] + UNSPEC_AESENCLAST))] + "TARGET_AES && TARGET_AVX" + "vaesenclast\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog1") + (set_attr "prefix_extra" "1") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + +(define_insn "aesenclast" + [(set (match_operand:V2DI 0 "register_operand" "=x") + (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "0") + (match_operand:V2DI 2 "nonimmediate_operand" "xm")] + UNSPEC_AESENCLAST))] + "TARGET_AES" + "aesenclast\t{%2, %0|%0, %2}" + [(set_attr "type" "sselog1") + (set_attr "prefix_extra" "1") + (set_attr "mode" "TI")]) + +(define_insn "*avx_aesdec" + [(set (match_operand:V2DI 0 
"register_operand" "=x") + (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "x") + (match_operand:V2DI 2 "nonimmediate_operand" "xm")] + UNSPEC_AESDEC))] + "TARGET_AES && TARGET_AVX" + "vaesdec\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog1") + (set_attr "prefix_extra" "1") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + +(define_insn "aesdec" + [(set (match_operand:V2DI 0 "register_operand" "=x") + (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "0") + (match_operand:V2DI 2 "nonimmediate_operand" "xm")] + UNSPEC_AESDEC))] + "TARGET_AES" + "aesdec\t{%2, %0|%0, %2}" + [(set_attr "type" "sselog1") + (set_attr "prefix_extra" "1") + (set_attr "mode" "TI")]) + +(define_insn "*avx_aesdeclast" + [(set (match_operand:V2DI 0 "register_operand" "=x") + (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "x") + (match_operand:V2DI 2 "nonimmediate_operand" "xm")] + UNSPEC_AESDECLAST))] + "TARGET_AES && TARGET_AVX" + "vaesdeclast\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog1") + (set_attr "prefix_extra" "1") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + +(define_insn "aesdeclast" + [(set (match_operand:V2DI 0 "register_operand" "=x") + (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "0") + (match_operand:V2DI 2 "nonimmediate_operand" "xm")] + UNSPEC_AESDECLAST))] + "TARGET_AES" + "aesdeclast\t{%2, %0|%0, %2}" + [(set_attr "type" "sselog1") + (set_attr "prefix_extra" "1") + (set_attr "mode" "TI")]) + +(define_insn "aesimc" + [(set (match_operand:V2DI 0 "register_operand" "=x") + (unspec:V2DI [(match_operand:V2DI 1 "nonimmediate_operand" "xm")] + UNSPEC_AESIMC))] + "TARGET_AES" + "%vaesimc\t{%1, %0|%0, %1}" + [(set_attr "type" "sselog1") + (set_attr "prefix_extra" "1") + (set_attr "prefix" "maybe_vex") + (set_attr "mode" "TI")]) + +(define_insn "aeskeygenassist" + [(set (match_operand:V2DI 0 "register_operand" "=x") + (unspec:V2DI [(match_operand:V2DI 1 "nonimmediate_operand" "xm") + (match_operand:SI 2 "const_0_to_255_operand" "n")] + UNSPEC_AESKEYGENASSIST))] + "TARGET_AES" + "%vaeskeygenassist\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog1") + (set_attr "prefix_extra" "1") + (set_attr "length_immediate" "1") + (set_attr "prefix" "maybe_vex") + (set_attr "mode" "TI")]) + +(define_insn "*vpclmulqdq" + [(set (match_operand:V2DI 0 "register_operand" "=x") + (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "x") + (match_operand:V2DI 2 "nonimmediate_operand" "xm") + (match_operand:SI 3 "const_0_to_255_operand" "n")] + UNSPEC_PCLMUL))] + "TARGET_PCLMUL && TARGET_AVX" + "vpclmulqdq\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "sselog1") + (set_attr "prefix_extra" "1") + (set_attr "length_immediate" "1") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + +(define_insn "pclmulqdq" + [(set (match_operand:V2DI 0 "register_operand" "=x") + (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "0") + (match_operand:V2DI 2 "nonimmediate_operand" "xm") + (match_operand:SI 3 "const_0_to_255_operand" "n")] + UNSPEC_PCLMUL))] + "TARGET_PCLMUL" + "pclmulqdq\t{%3, %2, %0|%0, %2, %3}" + [(set_attr "type" "sselog1") + (set_attr "prefix_extra" "1") + (set_attr "length_immediate" "1") + (set_attr "mode" "TI")]) + +(define_expand "avx_vzeroall" + [(match_par_dup 0 [(const_int 0)])] + "TARGET_AVX" +{ + int nregs = TARGET_64BIT ? 
16 : 8; + int regno; + + operands[0] = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (nregs + 1)); + + XVECEXP (operands[0], 0, 0) + = gen_rtx_UNSPEC_VOLATILE (VOIDmode, gen_rtvec (1, const0_rtx), + UNSPECV_VZEROALL); + + for (regno = 0; regno < nregs; regno++) + XVECEXP (operands[0], 0, regno + 1) + = gen_rtx_SET (VOIDmode, + gen_rtx_REG (V8SImode, SSE_REGNO (regno)), + CONST0_RTX (V8SImode)); +}) + +(define_insn "*avx_vzeroall" + [(match_parallel 0 "vzeroall_operation" + [(unspec_volatile [(const_int 0)] UNSPECV_VZEROALL)])] + "TARGET_AVX" + "vzeroall" + [(set_attr "type" "sse") + (set_attr "modrm" "0") + (set_attr "memory" "none") + (set_attr "prefix" "vex") + (set_attr "mode" "OI")]) + +;; Clear the upper 128bits of AVX registers, equivalent to a NOP +;; if the upper 128bits are unused. +(define_insn "avx_vzeroupper" + [(unspec_volatile [(match_operand 0 "const_int_operand" "")] + UNSPECV_VZEROUPPER)] + "TARGET_AVX" + "vzeroupper" + [(set_attr "type" "sse") + (set_attr "modrm" "0") + (set_attr "memory" "none") + (set_attr "prefix" "vex") + (set_attr "mode" "OI")]) + +(define_insn_and_split "vec_dup" + [(set (match_operand:AVX256MODE24P 0 "register_operand" "=x,x") + (vec_duplicate:AVX256MODE24P + (match_operand: 1 "nonimmediate_operand" "m,?x")))] + "TARGET_AVX" + "@ + vbroadcast\t{%1, %0|%0, %1} + #" + "&& reload_completed && REG_P (operands[1])" + [(set (match_dup 2) (vec_duplicate: (match_dup 1))) + (set (match_dup 0) (vec_concat:AVX256MODE24P (match_dup 2) (match_dup 2)))] + "operands[2] = gen_rtx_REG (mode, REGNO (operands[0]));" + [(set_attr "type" "ssemov") + (set_attr "prefix_extra" "1") + (set_attr "prefix" "vex") + (set_attr "mode" "V8SF")]) + +(define_insn "avx_vbroadcastf128_" + [(set (match_operand:AVX256MODE 0 "register_operand" "=x,x,x") + (vec_concat:AVX256MODE + (match_operand: 1 "nonimmediate_operand" "m,0,?x") + (match_dup 1)))] + "TARGET_AVX" + "@ + vbroadcastf128\t{%1, %0|%0, %1} + vinsertf128\t{$1, %1, %0, %0|%0, %0, %1, 1} + vperm2f128\t{$0, %t1, %t1, %0|%0, %t1, %t1, 0}" + [(set_attr "type" "ssemov,sselog1,sselog1") + (set_attr "prefix_extra" "1") + (set_attr "length_immediate" "0,1,1") + (set_attr "prefix" "vex") + (set_attr "mode" "V4SF,V8SF,V8SF")]) + +;; Recognize broadcast as a vec_select as produced by builtin_vec_perm. +;; If it so happens that the input is in memory, use vbroadcast. +;; Otherwise use vpermilp (and in the case of 256-bit modes, vperm2f128). 
+(define_insn "*avx_vperm_broadcast_v4sf" + [(set (match_operand:V4SF 0 "register_operand" "=x,x,x") + (vec_select:V4SF + (match_operand:V4SF 1 "nonimmediate_operand" "m,o,x") + (match_parallel 2 "avx_vbroadcast_operand" + [(match_operand 3 "const_int_operand" "C,n,n")])))] + "TARGET_AVX" +{ + int elt = INTVAL (operands[3]); + switch (which_alternative) + { + case 0: + case 1: + operands[1] = adjust_address_nv (operands[1], SFmode, elt * 4); + return "vbroadcastss\t{%1, %0|%0, %1}"; + case 2: + operands[2] = GEN_INT (elt * 0x55); + return "vpermilps\t{%2, %1, %0|%0, %1, %2}"; + default: + gcc_unreachable (); + } +} + [(set_attr "type" "ssemov,ssemov,sselog1") + (set_attr "prefix_extra" "1") + (set_attr "length_immediate" "0,0,1") + (set_attr "prefix" "vex") + (set_attr "mode" "SF,SF,V4SF")]) + +(define_insn_and_split "*avx_vperm_broadcast_" + [(set (match_operand:AVX256MODEF2P 0 "register_operand" "=x,x,x") + (vec_select:AVX256MODEF2P + (match_operand:AVX256MODEF2P 1 "nonimmediate_operand" "m,o,?x") + (match_parallel 2 "avx_vbroadcast_operand" + [(match_operand 3 "const_int_operand" "C,n,n")])))] + "TARGET_AVX" + "#" + "&& reload_completed" + [(set (match_dup 0) (vec_duplicate:AVX256MODEF2P (match_dup 1)))] +{ + rtx op0 = operands[0], op1 = operands[1]; + int elt = INTVAL (operands[3]); + + if (REG_P (op1)) + { + int mask; + + /* Shuffle element we care about into all elements of the 128-bit lane. + The other lane gets shuffled too, but we don't care. */ + if (mode == V4DFmode) + mask = (elt & 1 ? 15 : 0); + else + mask = (elt & 3) * 0x55; + emit_insn (gen_avx_vpermil (op0, op1, GEN_INT (mask))); + + /* Shuffle the lane we care about into both lanes of the dest. */ + mask = (elt / ( / 2)) * 0x11; + emit_insn (gen_avx_vperm2f1283 (op0, op0, op0, GEN_INT (mask))); + DONE; + } + + operands[1] = adjust_address_nv (op1, mode, + elt * GET_MODE_SIZE (mode)); +}) + +(define_expand "avx_vpermil" + [(set (match_operand:AVXMODEFDP 0 "register_operand" "") + (vec_select:AVXMODEFDP + (match_operand:AVXMODEFDP 1 "nonimmediate_operand" "") + (match_operand:SI 2 "const_0_to_255_operand" "")))] + "TARGET_AVX" +{ + int mask = INTVAL (operands[2]); + rtx perm[]; + + perm[0] = GEN_INT (mask & 1); + perm[1] = GEN_INT ((mask >> 1) & 1); + if (mode == V4DFmode) + { + perm[2] = GEN_INT (((mask >> 2) & 1) + 2); + perm[3] = GEN_INT (((mask >> 3) & 1) + 2); + } + + operands[2] + = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (, perm)); +}) + +(define_expand "avx_vpermil" + [(set (match_operand:AVXMODEFSP 0 "register_operand" "") + (vec_select:AVXMODEFSP + (match_operand:AVXMODEFSP 1 "nonimmediate_operand" "") + (match_operand:SI 2 "const_0_to_255_operand" "")))] + "TARGET_AVX" +{ + int mask = INTVAL (operands[2]); + rtx perm[]; + + perm[0] = GEN_INT (mask & 3); + perm[1] = GEN_INT ((mask >> 2) & 3); + perm[2] = GEN_INT ((mask >> 4) & 3); + perm[3] = GEN_INT ((mask >> 6) & 3); + if (mode == V8SFmode) + { + perm[4] = GEN_INT ((mask & 3) + 4); + perm[5] = GEN_INT (((mask >> 2) & 3) + 4); + perm[6] = GEN_INT (((mask >> 4) & 3) + 4); + perm[7] = GEN_INT (((mask >> 6) & 3) + 4); + } + + operands[2] + = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (, perm)); +}) + +(define_insn "*avx_vpermilp" + [(set (match_operand:AVXMODEF2P 0 "register_operand" "=x") + (vec_select:AVXMODEF2P + (match_operand:AVXMODEF2P 1 "nonimmediate_operand" "xm") + (match_parallel 2 "avx_vpermilp__operand" + [(match_operand 3 "const_int_operand" "")])))] + "TARGET_AVX" +{ + int mask = avx_vpermilp_parallel (operands[2], mode) - 1; + operands[2] = GEN_INT 
(mask); + return "vpermil\t{%2, %1, %0|%0, %1, %2}"; +} + [(set_attr "type" "sselog") + (set_attr "prefix_extra" "1") + (set_attr "length_immediate" "1") + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + +(define_insn "avx_vpermilvar3" + [(set (match_operand:AVXMODEF2P 0 "register_operand" "=x") + (unspec:AVXMODEF2P + [(match_operand:AVXMODEF2P 1 "register_operand" "x") + (match_operand: 2 "nonimmediate_operand" "xm")] + UNSPEC_VPERMIL))] + "TARGET_AVX" + "vpermil\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix_extra" "1") + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + +(define_expand "avx_vperm2f1283" + [(set (match_operand:AVX256MODE2P 0 "register_operand" "") + (unspec:AVX256MODE2P + [(match_operand:AVX256MODE2P 1 "register_operand" "") + (match_operand:AVX256MODE2P 2 "nonimmediate_operand" "") + (match_operand:SI 3 "const_0_to_255_operand" "")] + UNSPEC_VPERMIL2F128))] + "TARGET_AVX" +{ + int mask = INTVAL (operands[3]); + if ((mask & 0x88) == 0) + { + rtx perm[], t1, t2; + int i, base, nelt = , nelt2 = nelt / 2; + + base = (mask & 3) * nelt2; + for (i = 0; i < nelt2; ++i) + perm[i] = GEN_INT (base + i); + + base = ((mask >> 4) & 3) * nelt2; + for (i = 0; i < nelt2; ++i) + perm[i + nelt2] = GEN_INT (base + i); + + t2 = gen_rtx_VEC_CONCAT (mode, + operands[1], operands[2]); + t1 = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (nelt, perm)); + t2 = gen_rtx_VEC_SELECT (mode, t2, t1); + t2 = gen_rtx_SET (VOIDmode, operands[0], t2); + emit_insn (t2); + DONE; + } +}) + +;; Note that bits 7 and 3 of the imm8 allow lanes to be zeroed, which +;; means that in order to represent this properly in rtl we'd have to +;; nest *another* vec_concat with a zero operand and do the select from +;; a 4x wide vector. That doesn't seem very nice. 
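As a side illustration (editorial, not part of the imported sse.md; it assumes only the documented AVX intrinsic _mm256_permute2f128_ps), the note above about imm8 bits 3 and 7 can be seen at the source level: bits 1:0 and 5:4 pick the 128-bit source lane for the low and high half of the result, while bits 3 and 7 zero the corresponding half instead, which is the case the vec_select form cannot express.

  #include <immintrin.h>

  /* imm8 bits 1:0 select the 128-bit lane for the low half
     (0/1 = lanes of A, 2/3 = lanes of B); bits 5:4 do the same for the
     high half; bits 3 and 7 zero that half instead of selecting.  */

  __m256 swap_middle_lanes (__m256 a, __m256 b)
  {
    /* 0x21: low half <- high lane of A, high half <- low lane of B.  */
    return _mm256_permute2f128_ps (a, b, 0x21);
  }

  __m256 low_lane_of_a_only (__m256 a, __m256 b)
  {
    /* 0x80: bit 7 zeroes the high half; low half <- low lane of A.
       Zeroing forms like this are presumably left to the unspec
       pattern rather than the vec_select one.  */
    return _mm256_permute2f128_ps (a, b, 0x80);
  }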
+(define_insn "*avx_vperm2f128_full" + [(set (match_operand:AVX256MODE2P 0 "register_operand" "=x") + (unspec:AVX256MODE2P + [(match_operand:AVX256MODE2P 1 "register_operand" "x") + (match_operand:AVX256MODE2P 2 "nonimmediate_operand" "xm") + (match_operand:SI 3 "const_0_to_255_operand" "n")] + UNSPEC_VPERMIL2F128))] + "TARGET_AVX" + "vperm2f128\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "sselog") + (set_attr "prefix_extra" "1") + (set_attr "length_immediate" "1") + (set_attr "prefix" "vex") + (set_attr "mode" "V8SF")]) + +(define_insn "*avx_vperm2f128_nozero" + [(set (match_operand:AVX256MODE2P 0 "register_operand" "=x") + (vec_select:AVX256MODE2P + (vec_concat: + (match_operand:AVX256MODE2P 1 "register_operand" "x") + (match_operand:AVX256MODE2P 2 "nonimmediate_operand" "xm")) + (match_parallel 3 "avx_vperm2f128__operand" + [(match_operand 4 "const_int_operand" "")])))] + "TARGET_AVX" +{ + int mask = avx_vperm2f128_parallel (operands[3], mode) - 1; + operands[3] = GEN_INT (mask); + return "vperm2f128\t{%3, %2, %1, %0|%0, %1, %2, %3}"; +} + [(set_attr "type" "sselog") + (set_attr "prefix_extra" "1") + (set_attr "length_immediate" "1") + (set_attr "prefix" "vex") + (set_attr "mode" "V8SF")]) + +(define_expand "avx_vinsertf128" + [(match_operand:AVX256MODE 0 "register_operand" "") + (match_operand:AVX256MODE 1 "register_operand" "") + (match_operand: 2 "nonimmediate_operand" "") + (match_operand:SI 3 "const_0_to_1_operand" "")] + "TARGET_AVX" +{ + switch (INTVAL (operands[3])) + { + case 0: + emit_insn (gen_vec_set_lo_ (operands[0], operands[1], + operands[2])); + break; + case 1: + emit_insn (gen_vec_set_hi_ (operands[0], operands[1], + operands[2])); + break; + default: + gcc_unreachable (); + } + DONE; +}) + +(define_insn "vec_set_lo_" + [(set (match_operand:AVX256MODE4P 0 "register_operand" "=x") + (vec_concat:AVX256MODE4P + (match_operand: 2 "nonimmediate_operand" "xm") + (vec_select: + (match_operand:AVX256MODE4P 1 "register_operand" "x") + (parallel [(const_int 2) (const_int 3)]))))] + "TARGET_AVX" + "vinsertf128\t{$0x0, %2, %1, %0|%0, %1, %2, 0x0}" + [(set_attr "type" "sselog") + (set_attr "prefix_extra" "1") + (set_attr "length_immediate" "1") + (set_attr "prefix" "vex") + (set_attr "mode" "V8SF")]) + +(define_insn "vec_set_hi_" + [(set (match_operand:AVX256MODE4P 0 "register_operand" "=x") + (vec_concat:AVX256MODE4P + (vec_select: + (match_operand:AVX256MODE4P 1 "register_operand" "x") + (parallel [(const_int 0) (const_int 1)])) + (match_operand: 2 "nonimmediate_operand" "xm")))] + "TARGET_AVX" + "vinsertf128\t{$0x1, %2, %1, %0|%0, %1, %2, 0x1}" + [(set_attr "type" "sselog") + (set_attr "prefix_extra" "1") + (set_attr "length_immediate" "1") + (set_attr "prefix" "vex") + (set_attr "mode" "V8SF")]) + +(define_insn "vec_set_lo_" + [(set (match_operand:AVX256MODE8P 0 "register_operand" "=x") + (vec_concat:AVX256MODE8P + (match_operand: 2 "nonimmediate_operand" "xm") + (vec_select: + (match_operand:AVX256MODE8P 1 "register_operand" "x") + (parallel [(const_int 4) (const_int 5) + (const_int 6) (const_int 7)]))))] + "TARGET_AVX" + "vinsertf128\t{$0x0, %2, %1, %0|%0, %1, %2, 0x0}" + [(set_attr "type" "sselog") + (set_attr "prefix_extra" "1") + (set_attr "length_immediate" "1") + (set_attr "prefix" "vex") + (set_attr "mode" "V8SF")]) + +(define_insn "vec_set_hi_" + [(set (match_operand:AVX256MODE8P 0 "register_operand" "=x") + (vec_concat:AVX256MODE8P + (vec_select: + (match_operand:AVX256MODE8P 1 "register_operand" "x") + (parallel [(const_int 0) (const_int 1) + (const_int 2) 
(const_int 3)])) + (match_operand: 2 "nonimmediate_operand" "xm")))] + "TARGET_AVX" + "vinsertf128\t{$0x1, %2, %1, %0|%0, %1, %2, 0x1}" + [(set_attr "type" "sselog") + (set_attr "prefix_extra" "1") + (set_attr "length_immediate" "1") + (set_attr "prefix" "vex") + (set_attr "mode" "V8SF")]) + +(define_insn "vec_set_lo_v16hi" + [(set (match_operand:V16HI 0 "register_operand" "=x") + (vec_concat:V16HI + (match_operand:V8HI 2 "nonimmediate_operand" "xm") + (vec_select:V8HI + (match_operand:V16HI 1 "register_operand" "x") + (parallel [(const_int 8) (const_int 9) + (const_int 10) (const_int 11) + (const_int 12) (const_int 13) + (const_int 14) (const_int 15)]))))] + "TARGET_AVX" + "vinsertf128\t{$0x0, %2, %1, %0|%0, %1, %2, 0x0}" + [(set_attr "type" "sselog") + (set_attr "prefix_extra" "1") + (set_attr "length_immediate" "1") + (set_attr "prefix" "vex") + (set_attr "mode" "V8SF")]) + +(define_insn "vec_set_hi_v16hi" + [(set (match_operand:V16HI 0 "register_operand" "=x") + (vec_concat:V16HI + (vec_select:V8HI + (match_operand:V16HI 1 "register_operand" "x") + (parallel [(const_int 0) (const_int 1) + (const_int 2) (const_int 3) + (const_int 4) (const_int 5) + (const_int 6) (const_int 7)])) + (match_operand:V8HI 2 "nonimmediate_operand" "xm")))] + "TARGET_AVX" + "vinsertf128\t{$0x1, %2, %1, %0|%0, %1, %2, 0x1}" + [(set_attr "type" "sselog") + (set_attr "prefix_extra" "1") + (set_attr "length_immediate" "1") + (set_attr "prefix" "vex") + (set_attr "mode" "V8SF")]) + +(define_insn "vec_set_lo_v32qi" + [(set (match_operand:V32QI 0 "register_operand" "=x") + (vec_concat:V32QI + (match_operand:V16QI 2 "nonimmediate_operand" "xm") + (vec_select:V16QI + (match_operand:V32QI 1 "register_operand" "x") + (parallel [(const_int 16) (const_int 17) + (const_int 18) (const_int 19) + (const_int 20) (const_int 21) + (const_int 22) (const_int 23) + (const_int 24) (const_int 25) + (const_int 26) (const_int 27) + (const_int 28) (const_int 29) + (const_int 30) (const_int 31)]))))] + "TARGET_AVX" + "vinsertf128\t{$0x0, %2, %1, %0|%0, %1, %2, 0x0}" + [(set_attr "type" "sselog") + (set_attr "prefix_extra" "1") + (set_attr "length_immediate" "1") + (set_attr "prefix" "vex") + (set_attr "mode" "V8SF")]) + +(define_insn "vec_set_hi_v32qi" + [(set (match_operand:V32QI 0 "register_operand" "=x") + (vec_concat:V32QI + (vec_select:V16QI + (match_operand:V32QI 1 "register_operand" "x") + (parallel [(const_int 0) (const_int 1) + (const_int 2) (const_int 3) + (const_int 4) (const_int 5) + (const_int 6) (const_int 7) + (const_int 8) (const_int 9) + (const_int 10) (const_int 11) + (const_int 12) (const_int 13) + (const_int 14) (const_int 15)])) + (match_operand:V16QI 2 "nonimmediate_operand" "xm")))] + "TARGET_AVX" + "vinsertf128\t{$0x1, %2, %1, %0|%0, %1, %2, 0x1}" + [(set_attr "type" "sselog") + (set_attr "prefix_extra" "1") + (set_attr "length_immediate" "1") + (set_attr "prefix" "vex") + (set_attr "mode" "V8SF")]) + +(define_insn "avx_maskload" + [(set (match_operand:AVXMODEF2P 0 "register_operand" "=x") + (unspec:AVXMODEF2P + [(match_operand:AVXMODEF2P 1 "memory_operand" "m") + (match_operand: 2 "register_operand" "x")] + UNSPEC_MASKLOAD))] + "TARGET_AVX" + "vmaskmov\t{%1, %2, %0|%0, %2, %1}" + [(set_attr "type" "sselog1") + (set_attr "prefix_extra" "1") + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + +(define_insn "avx_maskstore" + [(set (match_operand:AVXMODEF2P 0 "memory_operand" "=m") + (unspec:AVXMODEF2P + [(match_operand: 1 "register_operand" "x") + (match_operand:AVXMODEF2P 2 "register_operand" "x") + (match_dup 
0)] + UNSPEC_MASKSTORE))] + "TARGET_AVX" + "vmaskmov\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog1") + (set_attr "prefix_extra" "1") + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + +(define_insn_and_split "avx__" + [(set (match_operand:AVX256MODE2P 0 "nonimmediate_operand" "=x,m") + (unspec:AVX256MODE2P + [(match_operand: 1 "nonimmediate_operand" "xm,x")] + UNSPEC_CAST))] + "TARGET_AVX" + "#" + "&& reload_completed" + [(const_int 0)] +{ + rtx op0 = operands[0]; + rtx op1 = operands[1]; + if (REG_P (op0)) + op0 = gen_rtx_REG (mode, REGNO (op0)); + else + op1 = gen_rtx_REG (mode, REGNO (op1)); + emit_move_insn (op0, op1); + DONE; +}) + +(define_expand "vec_init" + [(match_operand:AVX256MODE 0 "register_operand" "") + (match_operand 1 "" "")] + "TARGET_AVX" +{ + ix86_expand_vector_init (false, operands[0], operands[1]); + DONE; +}) + +(define_insn "*vec_concat_avx" + [(set (match_operand:AVX256MODE 0 "register_operand" "=x,x") + (vec_concat:AVX256MODE + (match_operand: 1 "register_operand" "x,x") + (match_operand: 2 "vector_move_operand" "xm,C")))] + "TARGET_AVX" +{ + switch (which_alternative) + { + case 0: + return "vinsertf128\t{$0x1, %2, %t1, %0|%0, %t1, %2, 0x1}"; + case 1: + switch (get_attr_mode (insn)) + { + case MODE_V8SF: + return "vmovaps\t{%1, %x0|%x0, %1}"; + case MODE_V4DF: + return "vmovapd\t{%1, %x0|%x0, %1}"; + default: + return "vmovdqa\t{%1, %x0|%x0, %1}"; + } + default: + gcc_unreachable (); + } +} + [(set_attr "type" "sselog,ssemov") + (set_attr "prefix_extra" "1,*") + (set_attr "length_immediate" "1,*") + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + +(define_insn "vcvtph2ps" + [(set (match_operand:V4SF 0 "register_operand" "=x") + (vec_select:V4SF + (unspec:V8SF [(match_operand:V8HI 1 "register_operand" "x")] + UNSPEC_VCVTPH2PS) + (parallel [(const_int 0) (const_int 1) + (const_int 2) (const_int 3)])))] + "TARGET_F16C" + "vcvtph2ps\t{%1, %0|%0, %1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "vex") + (set_attr "mode" "V4SF")]) + +(define_insn "*vcvtph2ps_load" + [(set (match_operand:V4SF 0 "register_operand" "=x") + (unspec:V4SF [(match_operand:V4HI 1 "memory_operand" "m")] + UNSPEC_VCVTPH2PS))] + "TARGET_F16C" + "vcvtph2ps\t{%1, %0|%0, %1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "vex") + (set_attr "mode" "V8SF")]) + +(define_insn "vcvtph2ps256" + [(set (match_operand:V8SF 0 "register_operand" "=x") + (unspec:V8SF [(match_operand:V8HI 1 "nonimmediate_operand" "xm")] + UNSPEC_VCVTPH2PS))] + "TARGET_F16C" + "vcvtph2ps\t{%1, %0|%0, %1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "vex") + (set_attr "mode" "V8SF")]) + +(define_expand "vcvtps2ph" + [(set (match_operand:V8HI 0 "register_operand" "") + (vec_concat:V8HI + (unspec:V4HI [(match_operand:V4SF 1 "register_operand" "") + (match_operand:SI 2 "const_0_to_255_operand" "")] + UNSPEC_VCVTPS2PH) + (match_dup 3)))] + "TARGET_F16C" + "operands[3] = CONST0_RTX (V4HImode);") + +(define_insn "*vcvtps2ph" + [(set (match_operand:V8HI 0 "register_operand" "=x") + (vec_concat:V8HI + (unspec:V4HI [(match_operand:V4SF 1 "register_operand" "x") + (match_operand:SI 2 "const_0_to_255_operand" "N")] + UNSPEC_VCVTPS2PH) + (match_operand:V4HI 3 "const0_operand" "")))] + "TARGET_F16C" + "vcvtps2ph\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "vex") + (set_attr "mode" "V4SF")]) + +(define_insn "*vcvtps2ph_store" + [(set (match_operand:V4HI 0 "memory_operand" "=m") + (unspec:V4HI [(match_operand:V4SF 1 "register_operand" "x") + (match_operand:SI 2 
"const_0_to_255_operand" "N")] + UNSPEC_VCVTPS2PH))] + "TARGET_F16C" + "vcvtps2ph\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "vex") + (set_attr "mode" "V4SF")]) + +(define_insn "vcvtps2ph256" + [(set (match_operand:V8HI 0 "nonimmediate_operand" "=xm") + (unspec:V8HI [(match_operand:V8SF 1 "register_operand" "x") + (match_operand:SI 2 "const_0_to_255_operand" "N")] + UNSPEC_VCVTPS2PH))] + "TARGET_F16C" + "vcvtps2ph\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "vex") + (set_attr "mode" "V8SF")]) diff --git a/gcc/config/i386/ssemath.h b/gcc/config/i386/ssemath.h new file mode 100644 index 000000000..357d6a378 --- /dev/null +++ b/gcc/config/i386/ssemath.h @@ -0,0 +1,25 @@ +/* Copyright (C) 2010 + Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. */ + +#undef TARGET_FPMATH_DEFAULT +#define TARGET_FPMATH_DEFAULT (TARGET_SSE2 ? FPMATH_SSE : FPMATH_387) + +#undef TARGET_SUBTARGET32_ISA_DEFAULT +#define TARGET_SUBTARGET32_ISA_DEFAULT \ + (OPTION_MASK_ISA_MMX | OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_SSE2) diff --git a/gcc/config/i386/sync.md b/gcc/config/i386/sync.md new file mode 100644 index 000000000..3fdfee2e4 --- /dev/null +++ b/gcc/config/i386/sync.md @@ -0,0 +1,242 @@ +;; GCC machine description for i386 synchronization instructions. +;; Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010, 2011 +;; Free Software Foundation, Inc. +;; +;; This file is part of GCC. +;; +;; GCC is free software; you can redistribute it and/or modify +;; it under the terms of the GNU General Public License as published by +;; the Free Software Foundation; either version 3, or (at your option) +;; any later version. +;; +;; GCC is distributed in the hope that it will be useful, +;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;; GNU General Public License for more details. +;; +;; You should have received a copy of the GNU General Public License +;; along with GCC; see the file COPYING3. If not see +;; . 
+ +(define_mode_iterator CASMODE + [QI HI SI (DI "TARGET_64BIT || TARGET_CMPXCHG8B") + (TI "TARGET_64BIT && TARGET_CMPXCHG16B")]) +(define_mode_iterator DCASMODE + [(DI "!TARGET_64BIT && TARGET_CMPXCHG8B && !flag_pic") + (TI "TARGET_64BIT && TARGET_CMPXCHG16B")]) +(define_mode_attr doublemodesuffix [(DI "8") (TI "16")]) +(define_mode_attr DCASHMODE [(DI "SI") (TI "DI")]) + +(define_expand "memory_barrier" + [(set (match_dup 0) + (unspec:BLK [(match_dup 0)] UNSPEC_MFENCE))] + "" +{ + operands[0] = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (Pmode)); + MEM_VOLATILE_P (operands[0]) = 1; + + if (!(TARGET_64BIT || TARGET_SSE2)) + { + emit_insn (gen_memory_barrier_nosse (operands[0])); + DONE; + } +}) + +(define_insn "memory_barrier_nosse" + [(set (match_operand:BLK 0 "" "") + (unspec:BLK [(match_dup 0)] UNSPEC_MFENCE)) + (clobber (reg:CC FLAGS_REG))] + "!(TARGET_64BIT || TARGET_SSE2)" + "lock{%;} or{l}\t{$0, (%%esp)|DWORD PTR [esp], 0}" + [(set_attr "memory" "unknown")]) + +;; ??? It would be possible to use cmpxchg8b on pentium for DImode +;; changes. It's complicated because the insn uses ecx:ebx as the +;; new value; note that the registers are reversed from the order +;; that they'd be in with (reg:DI 2 ecx). Similarly for TImode +;; data in 64-bit mode. + +(define_expand "sync_compare_and_swap" + [(parallel + [(set (match_operand:CASMODE 0 "register_operand" "") + (match_operand:CASMODE 1 "memory_operand" "")) + (set (match_dup 1) + (unspec_volatile:CASMODE + [(match_dup 1) + (match_operand:CASMODE 2 "register_operand" "") + (match_operand:CASMODE 3 "register_operand" "")] + UNSPECV_CMPXCHG)) + (set (reg:CCZ FLAGS_REG) + (compare:CCZ + (unspec_volatile:CASMODE + [(match_dup 1) (match_dup 2) (match_dup 3)] UNSPECV_CMPXCHG) + (match_dup 2)))])] + "TARGET_CMPXCHG" +{ + if ((mode == DImode && !TARGET_64BIT) || mode == TImode) + { + enum machine_mode hmode = mode == DImode ? 
SImode : DImode; + rtx low = simplify_gen_subreg (hmode, operands[3], mode, 0); + rtx high = simplify_gen_subreg (hmode, operands[3], mode, + GET_MODE_SIZE (hmode)); + low = force_reg (hmode, low); + high = force_reg (hmode, high); + if (mode == DImode) + { + if (flag_pic && !cmpxchg8b_pic_memory_operand (operands[1], DImode)) + operands[1] = replace_equiv_address (operands[1], + force_reg (Pmode, + XEXP (operands[1], + 0))); + emit_insn (gen_sync_double_compare_and_swapdi + (operands[0], operands[1], operands[2], low, high)); + } + else if (mode == TImode) + emit_insn (gen_sync_double_compare_and_swapti + (operands[0], operands[1], operands[2], low, high)); + else + gcc_unreachable (); + DONE; + } +}) + +(define_insn "*sync_compare_and_swap" + [(set (match_operand:SWI 0 "register_operand" "=a") + (match_operand:SWI 1 "memory_operand" "+m")) + (set (match_dup 1) + (unspec_volatile:SWI + [(match_dup 1) + (match_operand:SWI 2 "register_operand" "a") + (match_operand:SWI 3 "register_operand" "")] + UNSPECV_CMPXCHG)) + (set (reg:CCZ FLAGS_REG) + (compare:CCZ + (unspec_volatile:SWI + [(match_dup 1) (match_dup 2) (match_dup 3)] UNSPECV_CMPXCHG) + (match_dup 2)))] + "TARGET_CMPXCHG" + "lock{%;} cmpxchg{}\t{%3, %1|%1, %3}") + +(define_insn "sync_double_compare_and_swap" + [(set (match_operand:DCASMODE 0 "register_operand" "=A") + (match_operand:DCASMODE 1 "memory_operand" "+m")) + (set (match_dup 1) + (unspec_volatile:DCASMODE + [(match_dup 1) + (match_operand:DCASMODE 2 "register_operand" "A") + (match_operand: 3 "register_operand" "b") + (match_operand: 4 "register_operand" "c")] + UNSPECV_CMPXCHG)) + (set (reg:CCZ FLAGS_REG) + (compare:CCZ + (unspec_volatile:DCASMODE + [(match_dup 1) (match_dup 2) (match_dup 3) (match_dup 4)] + UNSPECV_CMPXCHG) + (match_dup 2)))] + "" + "lock{%;} cmpxchgb\t%1") + +;; Theoretically we'd like to use constraint "r" (any reg) for operand +;; 3, but that includes ecx. If operand 3 and 4 are the same (like when +;; the input is -1LL) GCC might chose to allocate operand 3 to ecx, like +;; operand 4. This breaks, as the xchg will move the PIC register contents +;; to %ecx then --> boom. Operands 3 and 4 really need to be different +;; registers, which in this case means operand 3 must not be ecx. +;; Instead of playing tricks with fake early clobbers or the like we +;; just enumerate all regs possible here, which (as this is !TARGET_64BIT) +;; are just esi and edi. +(define_insn "*sync_double_compare_and_swapdi_pic" + [(set (match_operand:DI 0 "register_operand" "=A") + (match_operand:DI 1 "cmpxchg8b_pic_memory_operand" "+m")) + (set (match_dup 1) + (unspec_volatile:DI + [(match_dup 1) + (match_operand:DI 2 "register_operand" "A") + (match_operand:SI 3 "register_operand" "SD") + (match_operand:SI 4 "register_operand" "c")] + UNSPECV_CMPXCHG)) + (set (reg:CCZ FLAGS_REG) + (compare:CCZ + (unspec_volatile:DI + [(match_dup 1) (match_dup 2) (match_dup 3) (match_dup 4)] + UNSPECV_CMPXCHG) + (match_dup 2)))] + "!TARGET_64BIT && TARGET_CMPXCHG8B && flag_pic" + "xchg{l}\t%%ebx, %3\;lock{%;} cmpxchg8b\t%1\;xchg{l}\t%%ebx, %3") + +;; For operand 2 nonmemory_operand predicate is used instead of +;; register_operand to allow combiner to better optimize atomic +;; additions of constants. 
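As a side illustration (editorial, not part of the imported sync.md): these sync_* patterns are what GCC's legacy __sync builtins are expected to expand to on x86, and adding a small constant is the situation the combiner note above refers to. A minimal sketch, assuming only the standard builtins:

  #include <stdint.h>

  static int32_t counter;

  int32_t bump (void)
  {
    /* Atomic fetch-and-add of a constant whose old value is used;
       with TARGET_XADD this is expected to go through sync_old_add
       and emit "lock xadd".  */
    return __sync_fetch_and_add (&counter, 1);
  }

  int32_t cas (int32_t expected, int32_t desired)
  {
    /* Compare-and-swap; expected to go through sync_compare_and_swap
       and emit "lock cmpxchg".  */
    return __sync_val_compare_and_swap (&counter, expected, desired);
  }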
+(define_insn "sync_old_add" + [(set (match_operand:SWI 0 "register_operand" "=") + (unspec_volatile:SWI + [(match_operand:SWI 1 "memory_operand" "+m")] UNSPECV_XCHG)) + (set (match_dup 1) + (plus:SWI (match_dup 1) + (match_operand:SWI 2 "nonmemory_operand" "0"))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_XADD" + "lock{%;} xadd{}\t{%0, %1|%1, %0}") + +;; Recall that xchg implicitly sets LOCK#, so adding it again wastes space. +(define_insn "sync_lock_test_and_set" + [(set (match_operand:SWI 0 "register_operand" "=") + (unspec_volatile:SWI + [(match_operand:SWI 1 "memory_operand" "+m")] UNSPECV_XCHG)) + (set (match_dup 1) + (match_operand:SWI 2 "register_operand" "0"))] + "" + "xchg{}\t{%1, %0|%0, %1}") + +(define_insn "sync_add" + [(set (match_operand:SWI 0 "memory_operand" "+m") + (unspec_volatile:SWI + [(plus:SWI (match_dup 0) + (match_operand:SWI 1 "nonmemory_operand" ""))] + UNSPECV_LOCK)) + (clobber (reg:CC FLAGS_REG))] + "" +{ + if (TARGET_USE_INCDEC) + { + if (operands[1] == const1_rtx) + return "lock{%;} inc{}\t%0"; + if (operands[1] == constm1_rtx) + return "lock{%;} dec{}\t%0"; + } + + if (x86_maybe_negate_const_int (&operands[1], mode)) + return "lock{%;} sub{}\t{%1, %0|%0, %1}"; + + return "lock{%;} add{}\t{%1, %0|%0, %1}"; +}) + +(define_insn "sync_sub" + [(set (match_operand:SWI 0 "memory_operand" "+m") + (unspec_volatile:SWI + [(minus:SWI (match_dup 0) + (match_operand:SWI 1 "nonmemory_operand" ""))] + UNSPECV_LOCK)) + (clobber (reg:CC FLAGS_REG))] + "" +{ + if (TARGET_USE_INCDEC) + { + if (operands[1] == const1_rtx) + return "lock{%;} dec{}\t%0"; + if (operands[1] == constm1_rtx) + return "lock{%;} inc{}\t%0"; + } + + return "lock{%;} sub{}\t{%1, %0|%0, %1}"; +}) + +(define_insn "sync_" + [(set (match_operand:SWI 0 "memory_operand" "+m") + (unspec_volatile:SWI + [(any_logic:SWI (match_dup 0) + (match_operand:SWI 1 "nonmemory_operand" ""))] + UNSPECV_LOCK)) + (clobber (reg:CC FLAGS_REG))] + "" + "lock{%;} {}\t{%1, %0|%0, %1}") diff --git a/gcc/config/i386/sysv4.h b/gcc/config/i386/sysv4.h new file mode 100644 index 000000000..64026e72b --- /dev/null +++ b/gcc/config/i386/sysv4.h @@ -0,0 +1,73 @@ +/* Target definitions for GCC for Intel 80386 running System V.4 + Copyright (C) 1991, 2001, 2002, 2007, 2008, 2011 + Free Software Foundation, Inc. + + Written by Ron Guilmette (rfg@netcom.com). + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. */ + +/* Output at beginning of assembler file. */ +/* The .file command should always begin the output. */ + +#define TARGET_ASM_FILE_START_FILE_DIRECTIVE true +#undef X86_FILE_START_VERSION_DIRECTIVE +#define X86_FILE_START_VERSION_DIRECTIVE true + +#undef DBX_REGISTER_NUMBER +#define DBX_REGISTER_NUMBER(n) svr4_dbx_register_map[n] + +/* A C statement (sans semicolon) to output to the stdio stream + FILE the assembler definition of uninitialized global DECL named + NAME whose size is SIZE bytes and alignment is ALIGN bytes. + Try to use asm_output_aligned_bss to implement this macro. 
*/ + +#define ASM_OUTPUT_ALIGNED_BSS(FILE, DECL, NAME, SIZE, ALIGN) \ + asm_output_aligned_bss (FILE, DECL, NAME, SIZE, ALIGN) + +/* Handle special EH pointer encodings. Absolute, pc-relative, and + indirect are handled automatically. */ +#define ASM_MAYBE_OUTPUT_ENCODED_ADDR_RTX(FILE, ENCODING, SIZE, ADDR, DONE) \ + do { \ + if ((SIZE) == 4 && ((ENCODING) & 0x70) == DW_EH_PE_datarel) \ + { \ + fputs (ASM_LONG, (FILE)); \ + assemble_name (FILE, XSTR (ADDR, 0)); \ + fputs (((ENCODING) & DW_EH_PE_indirect ? "@GOT" : "@GOTOFF"), (FILE)); \ + goto DONE; \ + } \ + } while (0) + +/* Used by crtstuff.c to initialize the base of data-relative relocations. + These are GOT relative on x86, so return the pic register. */ +#ifdef __PIC__ +#define CRT_GET_RFIB_DATA(BASE) \ + { \ + register void *ebx_ __asm__("ebx"); \ + BASE = ebx_; \ + } +#else +#define CRT_GET_RFIB_DATA(BASE) \ + __asm__ ("call\t.LPR%=\n" \ + ".LPR%=:\n\t" \ + "pop{l}\t%0\n\t" \ + /* Due to a GAS bug, this cannot use EAX. That encodes \ + smaller than the traditional EBX, which results in the \ + offset being off by one. */ \ + "add{l}\t{$_GLOBAL_OFFSET_TABLE_+[.-.LPR%=],%0" \ + "|%0,_GLOBAL_OFFSET_TABLE_+(.-.LPR%=)}" \ + : "=d"(BASE)) +#endif diff --git a/gcc/config/i386/t-crtfm b/gcc/config/i386/t-crtfm new file mode 100644 index 000000000..4fa27e91c --- /dev/null +++ b/gcc/config/i386/t-crtfm @@ -0,0 +1,8 @@ +EXTRA_PARTS += crtfastmath.o + +$(T)crtfastmath.o: $(srcdir)/config/i386/crtfastmath.c \ + $(srcdir)/config/i386/cpuid.h $(GCC_PASSES) + $(GCC_FOR_TARGET) $(LIBGCC2_CFLAGS) $(MULTILIB_CFLAGS) \ + -msse -minline-all-stringops -c \ + $(srcdir)/config/i386/crtfastmath.c \ + -o $(T)crtfastmath$(objext) diff --git a/gcc/config/i386/t-crtpc b/gcc/config/i386/t-crtpc new file mode 100644 index 000000000..c165772f4 --- /dev/null +++ b/gcc/config/i386/t-crtpc @@ -0,0 +1,34 @@ +# Copyright (C) 2007 Free Software Foundation, Inc. +# +# This file is part of GCC. +# +# GCC is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# GCC is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with GCC; see the file COPYING3. If not see +# . + +EXTRA_PARTS += crtprec32.o crtprec64.o crtprec80.o + +$(T)crtprec32.o: $(srcdir)/config/i386/crtprec.c $(GCC_PASSES) + $(GCC_FOR_TARGET) $(LIBGCC2_CFLAGS) $(MULTILIB_CFLAGS) -D__PREC=32 -c \ + $(srcdir)/config/i386/crtprec.c \ + -o $(T)crtprec32$(objext) + +$(T)crtprec64.o: $(srcdir)/config/i386/crtprec.c $(GCC_PASSES) + $(GCC_FOR_TARGET) $(LIBGCC2_CFLAGS) $(MULTILIB_CFLAGS) -D__PREC=64 -c \ + $(srcdir)/config/i386/crtprec.c \ + -o $(T)crtprec64$(objext) + +$(T)crtprec80.o: $(srcdir)/config/i386/crtprec.c $(GCC_PASSES) + $(GCC_FOR_TARGET) $(LIBGCC2_CFLAGS) $(MULTILIB_CFLAGS) -D__PREC=80 -c \ + $(srcdir)/config/i386/crtprec.c \ + -o $(T)crtprec80$(objext) diff --git a/gcc/config/i386/t-crtpic b/gcc/config/i386/t-crtpic new file mode 100644 index 000000000..ff81a9bef --- /dev/null +++ b/gcc/config/i386/t-crtpic @@ -0,0 +1,10 @@ +# The pushl in CTOR initialization interferes with frame pointer elimination. 
+ +# We need to use -fPIC when we are using gcc to compile the routines in +# crtstuff.c. This is only really needed when we are going to use gcc/g++ +# to produce a shared library, but since we don't know ahead of time when +# we will be doing that, we just always use -fPIC when compiling the +# routines in crtstuff.c. + +CRTSTUFF_T_CFLAGS = -fPIC -fno-omit-frame-pointer +TARGET_LIBGCC2_CFLAGS = -fPIC diff --git a/gcc/config/i386/t-crtstuff b/gcc/config/i386/t-crtstuff new file mode 100644 index 000000000..c14dd9411 --- /dev/null +++ b/gcc/config/i386/t-crtstuff @@ -0,0 +1,7 @@ +# The pushl in CTOR initialization interferes with frame pointer elimination. +# crtend*.o cannot be compiled without -fno-asynchronous-unwind-tables, +# because then __FRAME_END__ might not be the last thing in .eh_frame +# section. -fno-asynchronous-unwind-tables is off by default for i386 +# and is on by default for x86-64. We turn it off for both i386 and +# x86-64. +CRTSTUFF_T_CFLAGS += -fno-omit-frame-pointer -fno-asynchronous-unwind-tables diff --git a/gcc/config/i386/t-cygming b/gcc/config/i386/t-cygming new file mode 100644 index 000000000..ccae237d3 --- /dev/null +++ b/gcc/config/i386/t-cygming @@ -0,0 +1,109 @@ +# Copyright (C) 2003, 2005, 2008, 2009, 2010 Free Software Foundation, Inc. +# +# This file is part of GCC. +# +# GCC is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# GCC is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with GCC; see the file COPYING3. If not see +# . + +LIB1ASMSRC = i386/cygwin.asm +LIB1ASMFUNCS = _chkstk _chkstk_ms + +# cygwin and mingw always have a limits.h, but, depending upon how we are +# doing the build, it may not be installed yet. +LIMITS_H_TEST = true + +# If we are building next to winsup, this will let us find the real +# limits.h when building libgcc2. Otherwise, winsup must be installed +# first. 
+LIBGCC2_INCLUDES = -I$(srcdir)/../winsup/w32api/include + +winnt.o: $(srcdir)/config/i386/winnt.c $(CONFIG_H) $(SYSTEM_H) coretypes.h \ + $(TM_H) $(RTL_H) $(REGS_H) hard-reg-set.h output.h $(TREE_H) flags.h \ + $(TM_P_H) $(HASHTAB_H) $(GGC_H) $(LTO_STREAMER_H) + $(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \ + $(srcdir)/config/i386/winnt.c + +winnt-cxx.o: $(srcdir)/config/i386/winnt-cxx.c $(CONFIG_H) $(SYSTEM_H) coretypes.h \ + $(TM_H) $(TREE_H) flags.h \ + $(TM_P_H) $(HASHTAB_H) $(GGC_H) + $(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \ + $(srcdir)/config/i386/winnt-cxx.c + + +winnt-stubs.o: $(srcdir)/config/i386/winnt-stubs.c $(CONFIG_H) $(SYSTEM_H) coretypes.h \ + $(TM_H) $(RTL_H) $(REGS_H) hard-reg-set.h output.h $(TREE_H) flags.h \ + $(TM_P_H) $(HASHTAB_H) $(GGC_H) + $(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \ + $(srcdir)/config/i386/winnt-stubs.c + +msformat-c.o: $(srcdir)/config/i386/msformat-c.c $(CONFIG_H) $(SYSTEM_H) coretypes.h \ + $(TM_H) $(RTL_H) $(REGS_H) hard-reg-set.h output.h $(TREE_H) flags.h \ + $(TM_P_H) $(HASHTAB_H) $(GGC_H) + $(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \ + $(srcdir)/config/i386/msformat-c.c + +STMP_FIXINC=stmp-fixinc + +# Build a shared libgcc library for PECOFF with a DEF file +# with the GNU linker. +# +# mkmap-flat.awk is used with the pe_dll option to produce a DEF instead +# of an ELF map file. +# +# Warning: If SHLIB_SOVERSION or SHLIB_SONAME are updated, LIBGCC_SONAME +# in mingw32.h and SHLIB_MKMAP_OPTS below must be updated also. + +SHLIB_EXT = .dll +SHLIB_IMPLIB = @shlib_base_name@.a +SHLIB_SOVERSION = 1 +SHLIB_SONAME = @shlib_base_name@_$(EH_MODEL)-$(SHLIB_SOVERSION)$(SHLIB_EXT) +SHLIB_MAP = @shlib_map_file@ +SHLIB_OBJS = @shlib_objs@ +SHLIB_DIR = @multilib_dir@/shlib +SHLIB_SLIBDIR_QUAL = @shlib_slibdir_qual@ +# SHLIB_DLLDIR is defined by including one of either t-dlldir or t-dlldir-x +# (native/cross build respectively) in the tmake_file list in gcc/config.gcc. +ifndef SHLIB_DLLDIR +$(error SHLIB_DLLDIR must be defined) +endif + +SHLIB_LINK = $(LN_S) -f $(SHLIB_MAP) $(SHLIB_MAP).def && \ + if [ ! -d $(SHLIB_DIR) ]; then \ + mkdir $(SHLIB_DIR); \ + else true; fi && \ + $(GCC_FOR_TARGET) $(LIBGCC2_CFLAGS) -shared -nodefaultlibs \ + $(SHLIB_MAP).def \ + -Wl,--out-implib,$(SHLIB_DIR)/$(SHLIB_IMPLIB).tmp \ + -o $(SHLIB_DIR)/$(SHLIB_SONAME).tmp @multilib_flags@ \ + $(SHLIB_OBJS) $(SHLIB_LC) && \ + if [ -f $(SHLIB_DIR)/$(SHLIB_SONAME) ]; then \ + mv -f $(SHLIB_DIR)/$(SHLIB_SONAME) \ + $(SHLIB_DIR)/$(SHLIB_SONAME).backup; \ + else true; fi && \ + mv $(SHLIB_DIR)/$(SHLIB_SONAME).tmp $(SHLIB_DIR)/$(SHLIB_SONAME) && \ + mv $(SHLIB_DIR)/$(SHLIB_IMPLIB).tmp $(SHLIB_DIR)/$(SHLIB_IMPLIB) +# $(slibdir) double quoted to protect it from expansion while building +# libgcc.mk. We want this delayed until actual install time. +SHLIB_INSTALL = \ + $$(mkinstalldirs) $$(DESTDIR)$$(SHLIB_DLLDIR) \ + $$(DESTDIR)$$(slibdir)$(SHLIB_SLIBDIR_QUAL); \ + $(INSTALL) $(SHLIB_DIR)/$(SHLIB_SONAME) \ + $$(DESTDIR)$$(SHLIB_DLLDIR)/$(SHLIB_SONAME); \ + $(INSTALL_DATA) $(SHLIB_DIR)/$(SHLIB_IMPLIB) \ + $$(DESTDIR)$$(slibdir)$(SHLIB_SLIBDIR_QUAL)/$(SHLIB_IMPLIB) +SHLIB_MKMAP = $(srcdir)/mkmap-flat.awk +# We'd like to use SHLIB_SONAME here too, but shlib_base_name +# does not get substituted before mkmap-flat.awk is run. 
+SHLIB_MKMAP_OPTS = -v pe_dll=libgcc_s_$(EH_MODEL)-$(SHLIB_SOVERSION)$(SHLIB_EXT) +SHLIB_MAPFILES = $(srcdir)/libgcc-std.ver diff --git a/gcc/config/i386/t-cygwin b/gcc/config/i386/t-cygwin new file mode 100644 index 000000000..f5eda91c0 --- /dev/null +++ b/gcc/config/i386/t-cygwin @@ -0,0 +1,39 @@ +# Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2008, 2009, 2010 +# Free Software Foundation, Inc. +# +# This file is part of GCC. +# +# GCC is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# GCC is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with GCC; see the file COPYING3. If not see +# . + +# If we are building next to winsup, this will let us find the real +# limits.h when building libgcc2. Otherwise, winsup must be installed +# first. +LIBGCC2_INCLUDES += -I$(srcdir)/../winsup/include \ + -I$(srcdir)/../winsup/cygwin/include + +# Cygwin-specific parts of LIB_SPEC +SHLIB_LC = -lcygwin -ladvapi32 -lshell32 -luser32 -lkernel32 + +# We have already included one of the t-{dw2,sjlj}-eh fragments for EH_MODEL +SHLIB_EH_EXTENSION = $(subst -dw2,,-$(EH_MODEL)) + +# Cygwin uses different conventions than MinGW; override generic SHLIB_ def'ns here. +SHLIB_IMPLIB = @shlib_base_name@$(SHLIB_EXT).a +SHLIB_SONAME = cyggcc_s$(SHLIB_EH_EXTENSION)-$(SHLIB_SOVERSION)$(SHLIB_EXT) +# This must match the definitions of SHLIB_SONAME/SHLIB_SOVERSION and LIBGCC_SONAME. +# We'd like to use SHLIB_SONAME here too, and we can, since +# we don't rely on shlib_base_name substitution for it. +SHLIB_MKMAP_OPTS = -v pe_dll=$(SHLIB_SONAME) + diff --git a/gcc/config/i386/t-darwin b/gcc/config/i386/t-darwin new file mode 100644 index 000000000..22323e4ab --- /dev/null +++ b/gcc/config/i386/t-darwin @@ -0,0 +1,5 @@ +MULTILIB_OPTIONS = m64 +MULTILIB_DIRNAMES = x86_64 +LIB2_SIDITI_CONV_FUNCS=yes +LIB2FUNCS_EXTRA = $(srcdir)/config/darwin-64.c +LIB2FUNCS_EXCLUDE = _fixtfdi _fixunstfdi _floatditf _floatunditf diff --git a/gcc/config/i386/t-darwin64 b/gcc/config/i386/t-darwin64 new file mode 100644 index 000000000..81b4565ac --- /dev/null +++ b/gcc/config/i386/t-darwin64 @@ -0,0 +1,8 @@ +LIB2_SIDITI_CONV_FUNCS=yes +LIB2FUNCS_EXTRA = $(srcdir)/config/darwin-64.c + +MULTILIB_OPTIONS = m32 +MULTILIB_DIRNAMES = i386 + +LIBGCC = stmp-multilib +INSTALL_LIBGCC = install-multilib diff --git a/gcc/config/i386/t-djgpp b/gcc/config/i386/t-djgpp new file mode 100644 index 000000000..7b54b7ba7 --- /dev/null +++ b/gcc/config/i386/t-djgpp @@ -0,0 +1,2 @@ +# Location of DJGPP's header directory. +NATIVE_SYSTEM_HEADER_DIR=$(DJDIR)/include diff --git a/gcc/config/i386/t-dlldir b/gcc/config/i386/t-dlldir new file mode 100644 index 000000000..a3e03317a --- /dev/null +++ b/gcc/config/i386/t-dlldir @@ -0,0 +1,6 @@ + +# In a native build, target DLLs go in bindir, where they can be executed. +# Note double quoting to prevent variables from being evaluated until install +# time; we don't want to expand them during libgcc.mvars generation. 
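+# (A single '$' would already be expanded while libgcc.mvars is generated;
+# the doubled '$$' collapses to a literal '$(bindir)' in the generated file,
+# which make only expands when the install rules actually run.)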
+ +SHLIB_DLLDIR = $$(bindir) diff --git a/gcc/config/i386/t-dlldir-x b/gcc/config/i386/t-dlldir-x new file mode 100644 index 000000000..07dd845f0 --- /dev/null +++ b/gcc/config/i386/t-dlldir-x @@ -0,0 +1,9 @@ + +# In a cross build, bindir contains host not target binaries, so target DLLs +# instead go in toolexeclibdir, alongside other target binaries and static libs. +# Note double quoting to prevent variables from being evaluated until install +# time; we don't want to expand them during libgcc.mvars generation, and in +# any case, $toolexeclibdir is not defined in the gcc/ subdirectory, only in +# target lib directories. + +SHLIB_DLLDIR = $$(toolexeclibdir) diff --git a/gcc/config/i386/t-dw2-eh b/gcc/config/i386/t-dw2-eh new file mode 100644 index 000000000..ffcc39aea --- /dev/null +++ b/gcc/config/i386/t-dw2-eh @@ -0,0 +1,3 @@ + +# We are using Dwarf-2 EH. +EH_MODEL = dw2 diff --git a/gcc/config/i386/t-fprules-softfp b/gcc/config/i386/t-fprules-softfp new file mode 100644 index 000000000..0b0068f90 --- /dev/null +++ b/gcc/config/i386/t-fprules-softfp @@ -0,0 +1,6 @@ +softfp_float_modes := tf +softfp_int_modes := si di ti +softfp_extensions := sftf dftf xftf +softfp_truncations := tfsf tfdf tfxf +softfp_machine_header := i386/sfp-machine.h +softfp_exclude_libgcc2 := n diff --git a/gcc/config/i386/t-gmm_malloc b/gcc/config/i386/t-gmm_malloc new file mode 100644 index 000000000..c37f8a759 --- /dev/null +++ b/gcc/config/i386/t-gmm_malloc @@ -0,0 +1,6 @@ +# Install gmm_malloc.h as mm_malloc.h. + +EXTRA_HEADERS += mm_malloc.h +mm_malloc.h: $(srcdir)/config/i386/gmm_malloc.h + rm -f $@ + cat $^ > $@ diff --git a/gcc/config/i386/t-gnu b/gcc/config/i386/t-gnu new file mode 100644 index 000000000..5f946c716 --- /dev/null +++ b/gcc/config/i386/t-gnu @@ -0,0 +1 @@ +MULTIARCH_DIRNAME = $(call if_multiarch,i386-gnu) diff --git a/gcc/config/i386/t-gthr-win32 b/gcc/config/i386/t-gthr-win32 new file mode 100644 index 000000000..f67fa1e25 --- /dev/null +++ b/gcc/config/i386/t-gthr-win32 @@ -0,0 +1,2 @@ +# We hide calls to w32api needed for w32 thread support here: +LIB2FUNCS_EXTRA = $(srcdir)/config/i386/gthr-win32.c diff --git a/gcc/config/i386/t-i386 b/gcc/config/i386/t-i386 new file mode 100644 index 000000000..1c658a149 --- /dev/null +++ b/gcc/config/i386/t-i386 @@ -0,0 +1,41 @@ +# Copyright (C) 2008, 2009, 2010 Free Software Foundation, Inc. +# +# This file is part of GCC. +# +# GCC is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# GCC is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with GCC; see the file COPYING3. If not see +# . 
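+# Extra dependency lists and generation rules for the i386 back-end objects.
+# i386.o depends on the generated i386-builtin-types.inc, which is produced
+# from i386-builtin-types.def by the awk script below; the s-i386-bt stamp
+# and move-if-change leave the .inc file (and therefore i386.o) untouched
+# when its contents have not changed.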
+ +i386.o: $(CONFIG_H) $(SYSTEM_H) coretypes.h $(TM_H) \ + $(RTL_H) $(TREE_H) $(TM_P_H) $(REGS_H) hard-reg-set.h \ + $(REAL_H) insn-config.h conditions.h output.h insn-codes.h \ + $(INSN_ATTR_H) $(FLAGS_H) $(C_COMMON_H) except.h $(FUNCTION_H) \ + $(RECOG_H) $(EXPR_H) $(OPTABS_H) toplev.h $(BASIC_BLOCK_H) \ + $(GGC_H) $(TARGET_H) $(TARGET_DEF_H) langhooks.h $(CGRAPH_H) \ + $(TREE_GIMPLE_H) $(DWARF2_H) $(DF_H) tm-constrs.h $(PARAMS_H) \ + i386-builtin-types.inc debug.h dwarf2out.h sbitmap.h $(FIBHEAP_H) + +i386-c.o: $(srcdir)/config/i386/i386-c.c \ + $(srcdir)/config/i386/i386-protos.h $(CONFIG_H) $(SYSTEM_H) coretypes.h \ + $(TM_H) $(TREE_H) $(TM_P_H) $(FLAGS_H) $(C_COMMON_H) $(GGC_H) \ + $(TARGET_H) $(TARGET_DEF_H) $(CPPLIB_H) $(C_PRAGMA_H) + $(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \ + $(srcdir)/config/i386/i386-c.c + + +i386-builtin-types.inc: s-i386-bt ; @true +s-i386-bt: $(srcdir)/config/i386/i386-builtin-types.awk \ + $(srcdir)/config/i386/i386-builtin-types.def + $(AWK) -f $^ > tmp-bt.inc + $(SHELL) $(srcdir)/../move-if-change tmp-bt.inc i386-builtin-types.inc + $(STAMP) $@ diff --git a/gcc/config/i386/t-i386elf b/gcc/config/i386/t-i386elf new file mode 100644 index 000000000..9560d9055 --- /dev/null +++ b/gcc/config/i386/t-i386elf @@ -0,0 +1,4 @@ +# For svr4 we build crtbegin.o and crtend.o which serve to add begin and +# end labels to the .ctors and .dtors section when we link using gcc. + +EXTRA_PARTS=crtbegin.o crtend.o diff --git a/gcc/config/i386/t-interix b/gcc/config/i386/t-interix new file mode 100644 index 000000000..e7b016f1e --- /dev/null +++ b/gcc/config/i386/t-interix @@ -0,0 +1,8 @@ +LIB1ASMSRC = i386/cygwin.asm +LIB1ASMFUNCS = _chkstk _chkstk_ms + +winnt.o: $(srcdir)/config/i386/winnt.c $(CONFIG_H) $(SYSTEM_H) coretypes.h \ + $(TM_H) $(RTL_H) $(REGS_H) hard-reg-set.h output.h $(TREE_H) flags.h \ + $(TM_P_H) $(HASHTAB_H) $(GGC_H) + $(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \ + $(srcdir)/config/i386/winnt.c diff --git a/gcc/config/i386/t-kfreebsd b/gcc/config/i386/t-kfreebsd new file mode 100644 index 000000000..b4310df8a --- /dev/null +++ b/gcc/config/i386/t-kfreebsd @@ -0,0 +1,5 @@ +MULTIARCH_DIRNAME = $(call if_multiarch,i386-kfreebsd-gnu) + +# MULTILIB_OSDIRNAMES are set in t-linux64. +KFREEBSD_OS = $(filter kfreebsd%, $(word 3, $(subst -, ,$(target)))) +MULTILIB_OSDIRNAMES := $(subst linux,$(KFREEBSD_OS),$(MULTILIB_OSDIRNAMES)) diff --git a/gcc/config/i386/t-linux b/gcc/config/i386/t-linux new file mode 100644 index 000000000..76e3f64f5 --- /dev/null +++ b/gcc/config/i386/t-linux @@ -0,0 +1,9 @@ +# On 64bit we do not need any exports for glibc for 64-bit libgcc_s. +# Need to support TImode for x86. Override the settings from +# t-slibgcc-elf-ver and t-linux +SHLIB_MAPFILES = $(srcdir)/libgcc-std.ver \ + $(srcdir)/config/i386/libgcc-glibc.ver + +ifneq (,$(findstring -linux,$(target))) +MULTIARCH_DIRNAME = $(call if_multiarch,i386-linux-gnu) +endif diff --git a/gcc/config/i386/t-linux64 b/gcc/config/i386/t-linux64 new file mode 100644 index 000000000..057744b4b --- /dev/null +++ b/gcc/config/i386/t-linux64 @@ -0,0 +1,36 @@ +# Copyright (C) 2002, 2005, 2007, 2008 Free Software Foundation, Inc. +# +# This file is part of GCC. +# +# GCC is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. 
+# +# GCC is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with GCC; see the file COPYING3. If not see +# . + +# On Debian, Ubuntu and other derivative distributions, the 32bit libraries +# are found in /lib32 and /usr/lib32, /lib64 and /usr/lib64 are symlinks to +# /lib and /usr/lib, while other distributions install libraries into /lib64 +# and /usr/lib64. The LSB does not enforce the use of /lib64 and /usr/lib64, +# it doesn't tell anything about the 32bit libraries on those systems. Set +# MULTILIB_OSDIRNAMES according to what is found on the target. + +MULTILIB_OPTIONS = m64/m32 +MULTILIB_DIRNAMES = 64 32 +MULTILIB_OSDIRNAMES = ../lib64$(call if_multiarch,:x86_64-linux-gnu) \ + $(if $(wildcard $(shell echo $(SYSTEM_HEADER_DIR))/../../usr/lib32),../lib32,../lib)$(call if_multiarch,:i386-linux-gnu) + +LIBGCC = stmp-multilib +INSTALL_LIBGCC = install-multilib + +EXTRA_MULTILIB_PARTS=crtbegin.o crtend.o crtbeginS.o crtendS.o \ + crtbeginT.o crtprec32.o crtprec64.o crtprec80.o \ + crtfastmath.o diff --git a/gcc/config/i386/t-mingw-w32 b/gcc/config/i386/t-mingw-w32 new file mode 100644 index 000000000..a14218016 --- /dev/null +++ b/gcc/config/i386/t-mingw-w32 @@ -0,0 +1,12 @@ +# Match SYSTEM_INCLUDE_DIR +NATIVE_SYSTEM_HEADER_DIR = /mingw/include + +MULTILIB_OPTIONS = m64/m32 +MULTILIB_DIRNAMES = 64 32 +MULTILIB_OSDIRNAMES = ../lib64 ../lib + +# MinGW-specific parts of LIB_SPEC +SHLIB_LC = -lmingwthrd -lmingw32 -lmingwex -lmoldname -lmsvcrt -ladvapi32 -lshell32 -luser32 -lkernel32 + +LIBGCC = stmp-multilib +INSTALL_LIBGCC = install-multilib diff --git a/gcc/config/i386/t-mingw-w64 b/gcc/config/i386/t-mingw-w64 new file mode 100644 index 000000000..dbe2d00a2 --- /dev/null +++ b/gcc/config/i386/t-mingw-w64 @@ -0,0 +1,12 @@ +# Match SYSTEM_INCLUDE_DIR +NATIVE_SYSTEM_HEADER_DIR = /mingw/include + +MULTILIB_OPTIONS = m64/m32 +MULTILIB_DIRNAMES = 64 32 +MULTILIB_OSDIRNAMES = ../lib ../lib32 + +# MinGW-specific parts of LIB_SPEC +SHLIB_LC = -lmingwthrd -lmingw32 -lmingwex -lmoldname -lmsvcrt -ladvapi32 -lshell32 -luser32 -lkernel32 + +LIBGCC = stmp-multilib +INSTALL_LIBGCC = install-multilib diff --git a/gcc/config/i386/t-mingw32 b/gcc/config/i386/t-mingw32 new file mode 100644 index 000000000..a8235242a --- /dev/null +++ b/gcc/config/i386/t-mingw32 @@ -0,0 +1,5 @@ +# Match SYSTEM_INCLUDE_DIR +NATIVE_SYSTEM_HEADER_DIR = /mingw/include + +# MinGW-specific parts of LIB_SPEC +SHLIB_LC = -lmingwthrd -lmingw32 -lmingwex -lmoldname -lmsvcrt -ladvapi32 -lshell32 -luser32 -lkernel32 diff --git a/gcc/config/i386/t-netware b/gcc/config/i386/t-netware new file mode 100644 index 000000000..405c98f6a --- /dev/null +++ b/gcc/config/i386/t-netware @@ -0,0 +1,10 @@ +TARGET_LIBGCC2_CFLAGS = -mpreferred-stack-boundary=2 -fomit-frame-pointer + +netware.o: $(srcdir)/config/i386/netware.c $(RTL_H) $(TREE_H) $(CONFIG_H) $(TM_P_H) + $(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \ + $(srcdir)/config/i386/netware.c + +# We don't need some of GCC's own include files. 
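+# Limit USER_H to ginclude's stdarg.h and varargs.h, plus whatever the
+# target and language front ends add via EXTRA_HEADERS and LANG_EXTRA_HEADERS.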
+USER_H = $(srcdir)/ginclude/stdarg.h \ + $(srcdir)/ginclude/varargs.h \ + $(EXTRA_HEADERS) $(LANG_EXTRA_HEADERS) diff --git a/gcc/config/i386/t-nto b/gcc/config/i386/t-nto new file mode 100644 index 000000000..b80ff8029 --- /dev/null +++ b/gcc/config/i386/t-nto @@ -0,0 +1,4 @@ +CRTSTUFF_T_CFLAGS = -fno-omit-frame-pointer -fPIC +TARGET_LIBGCC2_CFLAGS = -fPIC -fexceptions + +EXTRA_PARTS = crtbegin.o diff --git a/gcc/config/i386/t-nwld b/gcc/config/i386/t-nwld new file mode 100644 index 000000000..e77279116 --- /dev/null +++ b/gcc/config/i386/t-nwld @@ -0,0 +1,50 @@ +# Copyright (C) 2004, 2005, 2006, 2007, 2008, 2009 +# Free Software Foundation, Inc. +# +# This file is part of GCC. +# +# GCC is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# GCC is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with GCC; see the file COPYING3. If not see +# . + +CRTSTUFF_T_CFLAGS = -mpreferred-stack-boundary=2 +CRT0STUFF_T_CFLAGS = -mpreferred-stack-boundary=2 $(INCLUDES) +# this is a slight misuse (it's not an assembler file) +CRT0_S = $(srcdir)/config/i386/netware-crt0.c +MCRT0_S = $(srcdir)/config/i386/netware-crt0.c + +$(T)libgcc.def: $(srcdir)/config/i386/t-nwld + echo "module libgcc_s" >$@ + +$(T)libc.def: $(srcdir)/config/i386/t-nwld + echo "module libc" >$@ + +$(T)libcpre.def: $(srcdir)/config/i386/t-nwld + echo "start _LibCPrelude" >$@ + echo "exit _LibCPostlude" >>$@ + echo "check _LibCCheckUnload" >>$@ + +$(T)posixpre.def: $(srcdir)/config/i386/t-nwld + echo "start POSIX_Start" >$@ + echo "exit POSIX_Stop" >>$@ + echo "check POSIX_CheckUnload" >>$@ + +nwld.o: $(srcdir)/config/i386/nwld.c $(RTL_H) $(TREE_H) $(CONFIG_H) $(TM_P_H) + $(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \ + $(srcdir)/config/i386/nwld.c + + +s-crt0: $(srcdir)/unwind-dw2-fde.h + +# To keep DRIVER_DEFINES correct. +SHLIB_LINK = dummy diff --git a/gcc/config/i386/t-openbsd b/gcc/config/i386/t-openbsd new file mode 100644 index 000000000..183046340 --- /dev/null +++ b/gcc/config/i386/t-openbsd @@ -0,0 +1,6 @@ +# gdb gets confused if pic code is linked with non pic +# We cope by building variants of libgcc. +MULTILIB_OPTIONS = fpic +MULTILIB_MATCHES=fpic=fPIC +LIBGCC = stmp-multilib +INSTALL_LIBGCC = install-multilib diff --git a/gcc/config/i386/t-pmm_malloc b/gcc/config/i386/t-pmm_malloc new file mode 100644 index 000000000..109009fbf --- /dev/null +++ b/gcc/config/i386/t-pmm_malloc @@ -0,0 +1,6 @@ +# Install pmm_malloc.h as mm_malloc.h. + +EXTRA_HEADERS += mm_malloc.h +mm_malloc.h: $(srcdir)/config/i386/pmm_malloc.h + rm -f $@ + cat $^ > $@ diff --git a/gcc/config/i386/t-rtems-i386 b/gcc/config/i386/t-rtems-i386 new file mode 100644 index 000000000..47dfc7e11 --- /dev/null +++ b/gcc/config/i386/t-rtems-i386 @@ -0,0 +1,69 @@ +# Copyright (C) 1999, 2001, 2002, 2005, 2007 Free Software Foundation, Inc. +# +# This file is part of GCC. +# +# GCC is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. 
+# +# GCC is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with GCC; see the file COPYING3. If not see +# . +# +# +# This file was based on t-sol2, the x86 Solaris implementation. Actually, +# the source code used to create crti.o and crtn.o is exactly the same +# as the Solaris version. Later, we might want to have an RTEMS-specific +# version of these files. +# + +$(T)crti.o: $(srcdir)/config/i386/sol2-ci.asm $(GCC_PASSES) + sed -e '/^!/d' <$(srcdir)/config/i386/sol2-ci.asm >crti.s + $(GCC_FOR_TARGET) -c -o $(T)crti.o crti.s +$(T)crtn.o: $(srcdir)/config/i386/sol2-cn.asm $(GCC_PASSES) + sed -e '/^!/d' <$(srcdir)/config/i386/sol2-cn.asm >crtn.s + $(GCC_FOR_TARGET) -c -o $(T)crtn.o crtn.s + +# We want fine grained libraries, so use the new code to build the +# floating point emulation libraries. +FPBIT = fp-bit.c +DPBIT = dp-bit.c + +LIB2FUNCS_EXTRA = xp-bit.c + +dp-bit.c: $(srcdir)/config/fp-bit.c + echo '#ifdef __LITTLE_ENDIAN__' > dp-bit.c + echo '#define FLOAT_BIT_ORDER_MISMATCH' >>dp-bit.c + echo '#endif' >> dp-bit.c + cat $(srcdir)/config/fp-bit.c >> dp-bit.c + +fp-bit.c: $(srcdir)/config/fp-bit.c + echo '#define FLOAT' > fp-bit.c + echo '#ifdef __LITTLE_ENDIAN__' >> fp-bit.c + echo '#define FLOAT_BIT_ORDER_MISMATCH' >>fp-bit.c + echo '#endif' >> fp-bit.c + cat $(srcdir)/config/fp-bit.c >> fp-bit.c + +xp-bit.c: $(srcdir)/config/fp-bit.c + echo '#define EXTENDED_FLOAT_STUBS' > xp-bit.c + cat $(srcdir)/config/fp-bit.c >> xp-bit.c + +MULTILIB_OPTIONS = mtune=i486/mtune=pentium/mtune=pentiumpro \ +msoft-float +MULTILIB_DIRNAMES= m486 mpentium mpentiumpro soft-float +MULTILIB_MATCHES = msoft-float=mno-m80387 +MULTILIB_MATCHES += mtune?pentium=mtune?k6 mtune?pentiumpro=mtune?athlon +MULTILIB_EXCEPTIONS = \ +mtune=pentium/*msoft-float* \ +mtune=pentiumpro/*msoft-float* + +EXTRA_MULTILIB_PARTS = crtbegin.o crtend.o + +LIBGCC = stmp-multilib +INSTALL_LIBGCC = install-multilib diff --git a/gcc/config/i386/t-sjlj-eh b/gcc/config/i386/t-sjlj-eh new file mode 100644 index 000000000..c9085f432 --- /dev/null +++ b/gcc/config/i386/t-sjlj-eh @@ -0,0 +1,3 @@ + +# We are using SjLj EH. +EH_MODEL = sjlj diff --git a/gcc/config/i386/t-sol2-10 b/gcc/config/i386/t-sol2-10 new file mode 100644 index 000000000..95eabf63d --- /dev/null +++ b/gcc/config/i386/t-sol2-10 @@ -0,0 +1,29 @@ +# Copyright (C) 2004 Free Software Foundation, Inc. +# +# This file is part of GCC. +# +# GCC is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# GCC is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with GCC; see the file COPYING3. If not see +# . + +MULTILIB_OPTIONS = m32/m64 +MULTILIB_DIRNAMES = 32 amd64 +MULTILIB_OSDIRNAMES = . amd64 + +LIBGCC = stmp-multilib +INSTALL_LIBGCC = install-multilib + +# GCC contains i386 assembler sources for some of the startfiles +# which aren't appropriate for amd64.
Just use the installed +# versions of: crt1.o crti.o crtn.o gcrt1.o +EXTRA_MULTILIB_PARTS=gmon.o crtbegin.o crtend.o diff --git a/gcc/config/i386/t-svr3dbx b/gcc/config/i386/t-svr3dbx new file mode 100644 index 000000000..517113791 --- /dev/null +++ b/gcc/config/i386/t-svr3dbx @@ -0,0 +1,7 @@ +# gas 1.38.1 supporting dbx-in-coff requires a link script. + +svr3.ifile: $(srcdir)/config/i386/svr3.ifile + rm -f svr3.ifile; cp $(srcdir)/config/i386/svr3.ifile . + +svr3z.ifile: $(srcdir)/config/i386/svr3z.ifile + rm -f svr3z.ifile; cp $(srcdir)/config/i386/svr3z.ifile . diff --git a/gcc/config/i386/t-vxworks b/gcc/config/i386/t-vxworks new file mode 100644 index 000000000..c440b1f90 --- /dev/null +++ b/gcc/config/i386/t-vxworks @@ -0,0 +1,8 @@ +# Multilibs for VxWorks. + +# Build multilibs for normal, -mrtp, and -mrtp -fPIC. +MULTILIB_OPTIONS = mrtp fPIC +MULTILIB_DIRNAMES = +MULTILIB_MATCHES = fPIC=fpic +MULTILIB_EXCEPTIONS = fPIC + diff --git a/gcc/config/i386/t-vxworksae b/gcc/config/i386/t-vxworksae new file mode 100644 index 000000000..0cea2bbf3 --- /dev/null +++ b/gcc/config/i386/t-vxworksae @@ -0,0 +1,5 @@ +# Multilibs for VxWorks AE. + +MULTILIB_OPTIONS = mvthreads +MULTILIB_MATCHES = +MULTILIB_EXCEPTIONS = diff --git a/gcc/config/i386/tbmintrin.h b/gcc/config/i386/tbmintrin.h new file mode 100644 index 000000000..8d2431d41 --- /dev/null +++ b/gcc/config/i386/tbmintrin.h @@ -0,0 +1,191 @@ +/* Copyright (C) 2010 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +#ifndef _X86INTRIN_H_INCLUDED +# error "Never use directly; include instead." 
+#endif + +#ifndef __TBM__ +# error "TBM instruction set not enabled" +#endif /* __TBM__ */ + +#ifndef _TBMINTRIN_H_INCLUDED +#define _TBMINTRIN_H_INCLUDED + +#ifdef __OPTIMIZE__ +extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__bextri_u32 (unsigned int __X, const unsigned int __I) +{ + return __builtin_ia32_bextri_u32 (__X, __I); +} +#else +#define __bextri_u32(X, I) \ + ((unsigned int)__builtin_ia32_bextri_u32 ((unsigned int)(X), \ + (unsigned int)(I))) +#endif /*__OPTIMIZE__ */ + +extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__blcfill_u32 (unsigned int __X) +{ + unsigned int tmp = (__X) & ((__X) + 1); + return tmp; +} + +extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__blci_u32 (unsigned int __X) +{ + unsigned int tmp = (__X) | (~((__X) + 1)); + return tmp; +} + +extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__blcic_u32 (unsigned int __X) +{ + unsigned int tmp = (~(__X)) & ((__X) + 1); + return tmp; +} + +extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__blcmsk_u32 (unsigned int __X) +{ + unsigned int tmp = (__X) ^ ((__X) + 1); + return tmp; +} + +extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__blcs_u32 (unsigned int __X) +{ + unsigned int tmp = (__X) | ((__X) + 1); + return tmp; +} + +extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__blsfill_u32 (unsigned int __X) +{ + unsigned int tmp = (__X) | ((__X) - 1); + return tmp; +} + +extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__blsic_u32 (unsigned int __X) +{ + unsigned int tmp = (~(__X)) | ((__X) - 1); + return tmp; +} + +extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__t1mskc_u32 (unsigned int __X) +{ + unsigned int tmp = (~(__X)) | ((__X) + 1); + return tmp; +} + +extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__tzmsk_u32 (unsigned int __X) +{ + unsigned int tmp = (~(__X)) & ((__X) - 1); + return tmp; +} + + + +#ifdef __x86_64__ +#ifdef __OPTIMIZE__ +extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__bextri_u64 (unsigned long long __X, const unsigned int __I) +{ + return __builtin_ia32_bextri_u64 (__X, __I); +} +#else +#define __bextri_u64(X, I) \ + ((unsigned long long)__builtin_ia32_bextri_u64 ((unsigned long long)(X), \ + (unsigned long long)(I))) +#endif /*__OPTIMIZE__ */ + +extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__blcfill_u64 (unsigned long long __X) +{ + unsigned long long tmp = (__X) & ((__X) + 1); + return tmp; +} + +extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__blci_u64 (unsigned long long __X) +{ + unsigned long long tmp = (__X) | (~((__X) + 1)); + return tmp; +} + +extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__blcic_u64 (unsigned long long __X) +{ + unsigned long long tmp = (~(__X)) & ((__X) + 1); + return tmp; +} + +extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__blcmsk_u64 (unsigned long long __X) +{ + unsigned long long tmp = (__X) ^ ((__X) 
+ 1); + return tmp; +} + +extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__blcs_u64 (unsigned long long __X) +{ + unsigned long long tmp = (__X) | ((__X) + 1); + return tmp; +} + +extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__blsfill_u64 (unsigned long long __X) +{ + unsigned long long tmp = (__X) | ((__X) - 1); + return tmp; +} + +extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__blsic_u64 (unsigned long long __X) +{ + unsigned long long tmp = (~(__X)) | ((__X) - 1); + return tmp; +} + +extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__t1mskc_u64 (unsigned long long __X) +{ + unsigned long long tmp = (~(__X)) | ((__X) + 1); + return tmp; +} + +extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__tzmsk_u64 (unsigned long long __X) +{ + unsigned long long tmp = (~(__X)) & ((__X) - 1); + return tmp; +} + + +#endif /* __x86_64__ */ +#endif /* _TBMINTRIN_H_INCLUDED */ + diff --git a/gcc/config/i386/tmmintrin.h b/gcc/config/i386/tmmintrin.h new file mode 100644 index 000000000..9835669ca --- /dev/null +++ b/gcc/config/i386/tmmintrin.h @@ -0,0 +1,244 @@ +/* Copyright (C) 2006, 2007, 2008, 2009 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +/* Implemented from the specification included in the Intel C++ Compiler + User Guide and Reference, version 9.1. 
*/ + +#ifndef _TMMINTRIN_H_INCLUDED +#define _TMMINTRIN_H_INCLUDED + +#ifndef __SSSE3__ +# error "SSSE3 instruction set not enabled" +#else + +/* We need definitions from the SSE3, SSE2 and SSE header files*/ +#include + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_hadd_epi16 (__m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_phaddw128 ((__v8hi)__X, (__v8hi)__Y); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_hadd_epi32 (__m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_phaddd128 ((__v4si)__X, (__v4si)__Y); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_hadds_epi16 (__m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_phaddsw128 ((__v8hi)__X, (__v8hi)__Y); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_hadd_pi16 (__m64 __X, __m64 __Y) +{ + return (__m64) __builtin_ia32_phaddw ((__v4hi)__X, (__v4hi)__Y); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_hadd_pi32 (__m64 __X, __m64 __Y) +{ + return (__m64) __builtin_ia32_phaddd ((__v2si)__X, (__v2si)__Y); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_hadds_pi16 (__m64 __X, __m64 __Y) +{ + return (__m64) __builtin_ia32_phaddsw ((__v4hi)__X, (__v4hi)__Y); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_hsub_epi16 (__m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_phsubw128 ((__v8hi)__X, (__v8hi)__Y); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_hsub_epi32 (__m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_phsubd128 ((__v4si)__X, (__v4si)__Y); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_hsubs_epi16 (__m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_phsubsw128 ((__v8hi)__X, (__v8hi)__Y); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_hsub_pi16 (__m64 __X, __m64 __Y) +{ + return (__m64) __builtin_ia32_phsubw ((__v4hi)__X, (__v4hi)__Y); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_hsub_pi32 (__m64 __X, __m64 __Y) +{ + return (__m64) __builtin_ia32_phsubd ((__v2si)__X, (__v2si)__Y); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_hsubs_pi16 (__m64 __X, __m64 __Y) +{ + return (__m64) __builtin_ia32_phsubsw ((__v4hi)__X, (__v4hi)__Y); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maddubs_epi16 (__m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_pmaddubsw128 ((__v16qi)__X, (__v16qi)__Y); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maddubs_pi16 (__m64 __X, __m64 __Y) +{ + return (__m64) __builtin_ia32_pmaddubsw ((__v8qi)__X, (__v8qi)__Y); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mulhrs_epi16 (__m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_pmulhrsw128 ((__v8hi)__X, (__v8hi)__Y); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mulhrs_pi16 (__m64 __X, __m64 __Y) +{ + return (__m64) 
__builtin_ia32_pmulhrsw ((__v4hi)__X, (__v4hi)__Y); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_shuffle_epi8 (__m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_pshufb128 ((__v16qi)__X, (__v16qi)__Y); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_shuffle_pi8 (__m64 __X, __m64 __Y) +{ + return (__m64) __builtin_ia32_pshufb ((__v8qi)__X, (__v8qi)__Y); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sign_epi8 (__m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_psignb128 ((__v16qi)__X, (__v16qi)__Y); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sign_epi16 (__m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_psignw128 ((__v8hi)__X, (__v8hi)__Y); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sign_epi32 (__m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_psignd128 ((__v4si)__X, (__v4si)__Y); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sign_pi8 (__m64 __X, __m64 __Y) +{ + return (__m64) __builtin_ia32_psignb ((__v8qi)__X, (__v8qi)__Y); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sign_pi16 (__m64 __X, __m64 __Y) +{ + return (__m64) __builtin_ia32_psignw ((__v4hi)__X, (__v4hi)__Y); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sign_pi32 (__m64 __X, __m64 __Y) +{ + return (__m64) __builtin_ia32_psignd ((__v2si)__X, (__v2si)__Y); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) +{ + return (__m128i) __builtin_ia32_palignr128 ((__v2di)__X, + (__v2di)__Y, __N * 8); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_alignr_pi8(__m64 __X, __m64 __Y, const int __N) +{ + return (__m64) __builtin_ia32_palignr ((__v1di)__X, + (__v1di)__Y, __N * 8); +} +#else +#define _mm_alignr_epi8(X, Y, N) \ + ((__m128i) __builtin_ia32_palignr128 ((__v2di)(__m128i)(X), \ + (__v2di)(__m128i)(Y), \ + (int)(N) * 8)) +#define _mm_alignr_pi8(X, Y, N) \ + ((__m64) __builtin_ia32_palignr ((__v1di)(__m64)(X), \ + (__v1di)(__m64)(Y), \ + (int)(N) * 8)) +#endif + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_abs_epi8 (__m128i __X) +{ + return (__m128i) __builtin_ia32_pabsb128 ((__v16qi)__X); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_abs_epi16 (__m128i __X) +{ + return (__m128i) __builtin_ia32_pabsw128 ((__v8hi)__X); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_abs_epi32 (__m128i __X) +{ + return (__m128i) __builtin_ia32_pabsd128 ((__v4si)__X); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_abs_pi8 (__m64 __X) +{ + return (__m64) __builtin_ia32_pabsb ((__v8qi)__X); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_abs_pi16 (__m64 __X) +{ + return (__m64) __builtin_ia32_pabsw ((__v4hi)__X); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_abs_pi32 (__m64 __X) +{ + 
return (__m64) __builtin_ia32_pabsd ((__v2si)__X); +} + +#endif /* __SSSE3__ */ + +#endif /* _TMMINTRIN_H_INCLUDED */ diff --git a/gcc/config/i386/unix.h b/gcc/config/i386/unix.h new file mode 100644 index 000000000..abd665844 --- /dev/null +++ b/gcc/config/i386/unix.h @@ -0,0 +1,81 @@ +/* Definitions for Unix assembler syntax for the Intel 80386. + Copyright (C) 1988, 1994, 1999, 2000, 2001, 2002, 2007, 2009, 2010 + Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +Under Section 7 of GPL version 3, you are granted additional +permissions described in the GCC Runtime Library Exception, version +3.1, as published by the Free Software Foundation. + +You should have received a copy of the GNU General Public License and +a copy of the GCC Runtime Library Exception along with this program; +see the files COPYING3 and COPYING.RUNTIME respectively. If not, see +. */ + +/* This file defines the aspects of assembler syntax + that are the same for all the i386 Unix systems + (though they may differ in non-Unix systems). */ + +/* Define macro used to output shift-double opcodes when the shift + count is in %cl. Some assemblers require %cl as an argument; + some don't. This macro controls what to do: by default, don't + print %cl. */ +#define SHIFT_DOUBLE_OMITS_COUNT 1 + +/* Define the syntax of pseudo-ops, labels and comments. */ + +/* String containing the assembler's comment-starter. + Note the trailing space is necessary in case the character + that immediately follows the comment is '*'. If this happens + and the space is not there the assembler will interpret this + as the start of a C-like slash-star comment and complain when + there is no terminator. */ + +#define ASM_COMMENT_START "/ " + +/* Output to assembler file text saying following lines + may contain character constants, extra white space, comments, etc. */ + +#define ASM_APP_ON "/APP\n" + +/* Output to assembler file text saying following lines + no longer contain unusual constructs. */ + +#define ASM_APP_OFF "/NO_APP\n" + +/* Output before read-only data. */ + +#define TEXT_SECTION_ASM_OP "\t.text" + +/* Output before writable (initialized) data. */ + +#define DATA_SECTION_ASM_OP "\t.data" + +/* Output before writable (uninitialized) data. */ + +#define BSS_SECTION_ASM_OP "\t.bss" + +/* Globalizing directive for a label. */ +#define GLOBAL_ASM_OP "\t.globl\t" + +/* By default, target has a 80387, uses IEEE compatible arithmetic, + and returns float values in the 387. */ +#undef TARGET_SUBTARGET_DEFAULT +#define TARGET_SUBTARGET_DEFAULT \ + (MASK_80387 | MASK_IEEE_FP | MASK_FLOAT_RETURNS) + +/* By default, 64-bit mode uses 128-bit long double. */ +#undef TARGET_SUBTARGET64_DEFAULT +#define TARGET_SUBTARGET64_DEFAULT \ + MASK_128BIT_LONG_DOUBLE diff --git a/gcc/config/i386/vx-common.h b/gcc/config/i386/vx-common.h new file mode 100644 index 000000000..c5ec4ed79 --- /dev/null +++ b/gcc/config/i386/vx-common.h @@ -0,0 +1,33 @@ +/* IA32 VxWorks and VxWorks AE target definitions. + Copyright (C) 2007, 2008, 2010 Free Software Foundation, Inc. + +This file is part of GCC. 
+ +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. */ + +#define ASM_OUTPUT_ALIGNED_BSS(FILE, DECL, NAME, SIZE, ALIGN) \ + asm_output_aligned_bss (FILE, DECL, NAME, SIZE, ALIGN) + +/* VxWorks uses the same ABI as Solaris 2, so use i386/sol2.h version. */ + +#undef TARGET_SUBTARGET_DEFAULT +#define TARGET_SUBTARGET_DEFAULT \ + (MASK_80387 | MASK_IEEE_FP | MASK_FLOAT_RETURNS | MASK_VECT8_RETURNS) + +/* Provide our target specific DBX_REGISTER_NUMBER. VxWorks relies on + the SVR4 numbering. */ + +#undef DBX_REGISTER_NUMBER +#define DBX_REGISTER_NUMBER(n) svr4_dbx_register_map[n] diff --git a/gcc/config/i386/vxworks.h b/gcc/config/i386/vxworks.h new file mode 100644 index 000000000..09861e493 --- /dev/null +++ b/gcc/config/i386/vxworks.h @@ -0,0 +1,76 @@ +/* IA32 VxWorks target definitions for GNU compiler. + Copyright (C) 2003, 2004, 2005, 2007, 2010 Free Software Foundation, Inc. + Updated by CodeSourcery, LLC. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. */ + +#undef TARGET_VERSION +#define TARGET_VERSION fprintf (stderr, " (80586, VxWorks syntax)"); + +#undef ASM_SPEC +#define ASM_SPEC "" + +#define TARGET_OS_CPP_BUILTINS() \ + do \ + { \ + VXWORKS_OS_CPP_BUILTINS (); \ + if (TARGET_386) \ + builtin_define ("CPU=I80386"); \ + else if (TARGET_486) \ + builtin_define ("CPU=I80486"); \ + else if (TARGET_PENTIUM) \ + { \ + builtin_define ("CPU=PENTIUM"); \ + builtin_define ("CPU_VARIANT=PENTIUM"); \ + } \ + else if (TARGET_PENTIUMPRO) \ + { \ + builtin_define ("CPU=PENTIUM2"); \ + builtin_define ("CPU_VARIANT=PENTIUMPRO"); \ + } \ + else if (TARGET_PENTIUM4) \ + { \ + builtin_define ("CPU=PENTIUM4"); \ + builtin_define ("CPU_VARIANT=PENTIUM4"); \ + } \ + } \ + while (0) + +#undef CPP_SPEC +#define CPP_SPEC VXWORKS_ADDITIONAL_CPP_SPEC +#undef LIB_SPEC +#define LIB_SPEC VXWORKS_LIB_SPEC +#undef STARTFILE_SPEC +#define STARTFILE_SPEC VXWORKS_STARTFILE_SPEC +#undef ENDFILE_SPEC +#define ENDFILE_SPEC VXWORKS_ENDFILE_SPEC +#undef LINK_SPEC +#define LINK_SPEC VXWORKS_LINK_SPEC + +#undef SUBTARGET_SWITCHES +#define SUBTARGET_SWITCHES EXTRA_SUBTARGET_SWITCHES + +#undef SUBTARGET_OVERRIDE_OPTIONS +#define SUBTARGET_OVERRIDE_OPTIONS VXWORKS_OVERRIDE_OPTIONS + +/* No _mcount profiling on VxWorks. */ +#undef FUNCTION_PROFILER +#define FUNCTION_PROFILER(FILE,LABELNO) VXWORKS_FUNCTION_PROFILER(FILE,LABELNO) + +/* We cannot use PC-relative accesses for VxWorks PIC because there is no + fixed gap between segments. 
*/ +#undef ASM_PREFERRED_EH_DATA_FORMAT diff --git a/gcc/config/i386/vxworksae.h b/gcc/config/i386/vxworksae.h new file mode 100644 index 000000000..b4c9fe4eb --- /dev/null +++ b/gcc/config/i386/vxworksae.h @@ -0,0 +1,35 @@ +/* IA32 VxWorks AE target definitions for GNU compiler. + Copyright (C) 2005, 2007, 2010 Free Software Foundation, Inc. + Contributed by CodeSourcery, LLC. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. */ + +/* On VxWorks AE, we only want SIMNT. */ +#undef VXWORKS_CPU_DEFINE +#define VXWORKS_CPU_DEFINE() \ + do \ + builtin_define ("CPU=SIMNT"); \ + while (0) + +#undef ASM_SPEC +#define ASM_SPEC "" + +#undef SIZE_TYPE +#define SIZE_TYPE "unsigned int" + +#undef PTRDIFF_TYPE +#define PTRDIFF_TYPE "int" diff --git a/gcc/config/i386/w32-unwind.h b/gcc/config/i386/w32-unwind.h new file mode 100644 index 000000000..449e9a9c5 --- /dev/null +++ b/gcc/config/i386/w32-unwind.h @@ -0,0 +1,204 @@ +/* Definitions for Dwarf2 EH unwind support for Windows32 targets + Copyright (C) 2007, 2009, 2010 + Free Software Foundation, Inc. + Contributed by Pascal Obry + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 3, or (at your option) any later +version. + +GCC is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +Under Section 7 of GPL version 3, you are granted additional +permissions described in the GCC Runtime Library Exception, version +3.1, as published by the Free Software Foundation. + +You should have received a copy of the GNU General Public License and +a copy of the GCC Runtime Library Exception along with this program; +see the files COPYING3 and COPYING.RUNTIME respectively. If not, see +. */ + + +/* This file implements the md_fallback_frame_state_for routine for + Windows, triggered when the GCC table based unwinding process hits a + frame for which no unwind info has been registered. This typically + occurs when raising an exception from a signal handler, because the + handler is actually called from the OS kernel. + + The basic idea is to detect that we are indeed trying to unwind past a + signal handler and to fill out the GCC internal unwinding structures for + the OS kernel frame as if it had been directly called from the + interrupted context. + + This is all assuming that the code to set the handler asked the kernel + to pass a pointer to such context information. + + There is three main parts. + + 1) The first thing to do is to check if we are in a signal context. If + not we can just return as there is nothing to do. We are probably on + some foreign code for which no unwind frame can be found. 
If this is + a call from the Windows signal handler, then: + + 2) We must get the signal context information. + + * With the standard exception filter: + + This is on Windows pointed to by an EXCEPTION_POINTERS. We know that + the signal handle will call an UnhandledExceptionFilter with this + parameter. The spec for this routine is: + + LONG WINAPI UnhandledExceptionFilter(struct _EXCEPTION_POINTERS*); + + So the pointer to struct _EXCEPTION_POINTERS must be somewhere on the + stack. + + This was found experimentally to always be at offset 0 of the context + frame in all cases handled by this implementation. + + * With the SEH exception handler: + + In this case the signal context is directly on the stack as the SEH + exception handler has the following prototype: + + DWORD + SEH_error_handler (PEXCEPTION_RECORD ExceptionRecord, + PVOID EstablisherFrame, + PCONTEXT ContextRecord, + PVOID DispatcherContext) + + This was found experimentally to always be at offset 56 of the + context frame in all cases handled by this implementation. + + 3) When we have the signal context we just have to save some registers + and set the return address based on the program counter (Eip). + + Note that this implementation follows closely the same principles as the + GNU/Linux and OSF ones. */ + +#define WIN32_MEAN_AND_LEAN +#include +/* Patterns found experimentally to be on a Windows signal handler */ + +/* In a standard exception filter */ + +#define SIG_PAT1 \ + (pc_[-2] == 0xff && pc_[-1] == 0xd0 /* call %eax */ \ + && pc_[0] == 0x83 && pc_[1] == 0xf8) /* cmp 0xdepl,%eax */ + +#define SIG_PAT2 \ + (pc_[-5] == 0xe8 && pc_[-4] == 0x68 /* call (depl16) */ \ + && pc_[0] == 0xc3) /* ret */ + +/* In a Win32 SEH handler */ + +#define SIG_SEH1 \ + (pc_[-5] == 0xe8 /* call addr */ \ + && pc_[0] == 0x83 && pc_[1] == 0xc4 /* add 0xval,%esp */ \ + && pc_[3] == 0xb8) /* mov 0xval,%eax */ + +#define SIG_SEH2 \ + (pc_[-5] == 0x8b && pc_[-4] == 0x4d /* mov depl(%ebp),%ecx */ \ + && pc_[0] == 0x64 && pc_[1] == 0x8b) /* mov %fs:(0), */ \ + +/* In the GCC alloca (stack probing) */ + +#define SIG_ALLOCA \ + (pc_[-1] == 0x83 /* orl $0x0,(%ecx) */ \ + && pc_[0] == 0x9 && pc_[1] == 0 \ + && pc_[2] == 0x2d && pc_[3] == 0 /* subl $0x1000,%eax */ \ + && pc_[4] == 0x10 && pc_[5] == 0) + + +#define MD_FALLBACK_FRAME_STATE_FOR i386_w32_fallback_frame_state + +static _Unwind_Reason_Code +i386_w32_fallback_frame_state (struct _Unwind_Context *context, + _Unwind_FrameState *fs) + +{ + void * ctx_ra_ = (void *)(context->ra); /* return address */ + void * ctx_cfa_ = (void *)(context->cfa); /* context frame address */ + unsigned char * pc_ = (unsigned char *) ctx_ra_; + + /* In the test below we look for two specific patterns found + experimentally to be in the Windows signal handler. */ + if (SIG_PAT1 || SIG_PAT2 || SIG_SEH1 || SIG_SEH2) + { + PEXCEPTION_POINTERS weinfo_; + PCONTEXT proc_ctx_; + long new_cfa_; + + if (SIG_SEH1) + proc_ctx_ = (PCONTEXT) (*(int*)(ctx_cfa_ + 56)); + else if (SIG_SEH2) + proc_ctx_ = (PCONTEXT) (*(int*)(ctx_cfa_ + 8)); + else + { + weinfo_ = (PEXCEPTION_POINTERS) (*(int*)ctx_cfa_); + proc_ctx_ = weinfo_->ContextRecord; + } + + /* The new context frame address is the stack pointer. */ + new_cfa_ = proc_ctx_->Esp; + fs->regs.cfa_how = CFA_REG_OFFSET; + fs->regs.cfa_reg = __builtin_dwarf_sp_column(); + fs->regs.cfa_offset = new_cfa_ - (long) ctx_cfa_; + + /* Restore registers. 
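+     The DWARF column numbers used for fs->regs.reg[] below are 0 = eax,
+     1 = ecx, 2 = edx, 3 = ebx, 5 = ebp, 6 = esi, 7 = edi and 8 = eip,
+     which is also the return address column.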
*/ + fs->regs.reg[0].how = REG_SAVED_OFFSET; + fs->regs.reg[0].loc.offset = (long)&proc_ctx_->Eax - new_cfa_; + fs->regs.reg[3].how = REG_SAVED_OFFSET; + fs->regs.reg[3].loc.offset = (long)&proc_ctx_->Ebx - new_cfa_; + fs->regs.reg[1].how = REG_SAVED_OFFSET; + fs->regs.reg[1].loc.offset = (long)&proc_ctx_->Ecx - new_cfa_; + fs->regs.reg[2].how = REG_SAVED_OFFSET; + fs->regs.reg[2].loc.offset = (long)&proc_ctx_->Edx - new_cfa_; + fs->regs.reg[6].how = REG_SAVED_OFFSET; + fs->regs.reg[6].loc.offset = (long)&proc_ctx_->Esi - new_cfa_; + fs->regs.reg[7].how = REG_SAVED_OFFSET; + fs->regs.reg[7].loc.offset = (long)&proc_ctx_->Edi - new_cfa_; + fs->regs.reg[5].how = REG_SAVED_OFFSET; + fs->regs.reg[5].loc.offset = (long)&proc_ctx_->Ebp - new_cfa_; + fs->regs.reg[8].how = REG_SAVED_OFFSET; + fs->regs.reg[8].loc.offset = (long)&proc_ctx_->Eip - new_cfa_; + fs->retaddr_column = 8; + fs->signal_frame = 1; + + return _URC_NO_REASON; + } + + /* Unwinding through _alloca, propagating from a trap triggered by + one of it's probes prior to the real SP adjustment. The only + operations of interest performed is "pushl %ecx", followed by + ecx clobbering. */ + else if (SIG_ALLOCA) + { + /* Only one push between entry in _alloca and the probe trap. */ + long new_cfa_ = (long) ctx_cfa_ + 4; + + fs->regs.cfa_how = CFA_REG_OFFSET; + fs->regs.cfa_reg = __builtin_dwarf_sp_column(); + fs->regs.cfa_offset = new_cfa_ - (long) ctx_cfa_; + + /* The saved value of %ecx is at CFA - 4 */ + fs->regs.reg[1].how = REG_SAVED_OFFSET; + fs->regs.reg[1].loc.offset = -4; + + /* and what is stored at the CFA is the return address. */ + fs->retaddr_column = 8; + fs->regs.reg[8].how = REG_SAVED_OFFSET; + fs->regs.reg[8].loc.offset = 0; + fs->signal_frame = 1; + + return _URC_NO_REASON; + } + else + return _URC_END_OF_STACK; +} diff --git a/gcc/config/i386/winnt-cxx.c b/gcc/config/i386/winnt-cxx.c new file mode 100644 index 000000000..0c47e3a8b --- /dev/null +++ b/gcc/config/i386/winnt-cxx.c @@ -0,0 +1,175 @@ +/* Target support for C++ classes on Windows. + Contributed by Danny Smith (dannysmith@users.sourceforge.net) + Copyright (C) 2005, 2007, 2009, 2010 Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 3, or (at your option) any later +version. + +GCC is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. */ + +#include "config.h" +#include "system.h" +#include "coretypes.h" +#include "tm.h" +#include "tree.h" +#include "cp/cp-tree.h" /* This is why we're a separate module. */ +#include "flags.h" +#include "tm_p.h" +#include "diagnostic-core.h" +#include "hashtab.h" + +bool +i386_pe_type_dllimport_p (tree decl) +{ + gcc_assert (TREE_CODE (decl) == VAR_DECL + || TREE_CODE (decl) == FUNCTION_DECL); + + if (TARGET_NOP_FUN_DLLIMPORT && TREE_CODE (decl) == FUNCTION_DECL) + return false; + + /* We ignore the dllimport attribute for inline member functions. + This differs from MSVC behavior which treats it like GNUC + 'extern inline' extension. Also ignore for template + instantiations with linkonce semantics and artificial methods. 
*/ + if (TREE_CODE (decl) == FUNCTION_DECL + && (DECL_DECLARED_INLINE_P (decl) + || DECL_TEMPLATE_INSTANTIATION (decl) + || DECL_ARTIFICIAL (decl))) + return false; + + /* Overrides of the class dllimport decls by out-of-class definitions are + handled by tree.c:merge_dllimport_decl_attributes. */ + return true; +} + +bool +i386_pe_type_dllexport_p (tree decl) +{ + gcc_assert (TREE_CODE (decl) == VAR_DECL + || TREE_CODE (decl) == FUNCTION_DECL); + + /* Avoid exporting compiler-generated default dtors and copy ctors. + The only artificial methods that need to be exported are virtual + and non-virtual thunks. */ + if (TREE_CODE (TREE_TYPE (decl)) == METHOD_TYPE + && DECL_ARTIFICIAL (decl) && !DECL_THUNK_P (decl)) + return false; + return true; +} + +static inline void maybe_add_dllimport (tree decl) +{ + if (i386_pe_type_dllimport_p (decl)) + DECL_DLLIMPORT_P (decl) = 1; +} + +static inline void maybe_add_dllexport (tree decl) +{ + if (i386_pe_type_dllexport_p (decl)) + { + tree decl_attrs = DECL_ATTRIBUTES (decl); + if (lookup_attribute ("dllexport", decl_attrs) != NULL_TREE) + /* Already done. */ + return; + DECL_ATTRIBUTES (decl) = tree_cons (get_identifier ("dllexport"), + NULL_TREE, decl_attrs); + } +} + +void +i386_pe_adjust_class_at_definition (tree t) +{ + tree member; + + gcc_assert (CLASS_TYPE_P (t)); + + + if (lookup_attribute ("dllexport", TYPE_ATTRIBUTES (t)) != NULL_TREE) + { + tree tmv = TYPE_MAIN_VARIANT (t); + + /* Make sure that we set dllexport attribute to typeinfo's + base declaration, as otherwise it would fail to be exported as + it isn't a class-member. */ + if (tmv != NULL_TREE + && CLASSTYPE_TYPEINFO_VAR (tmv) != NULL_TREE) + { + tree na, ti_decl = CLASSTYPE_TYPEINFO_VAR (tmv); + na = tree_cons (get_identifier ("dllexport"), NULL_TREE, + NULL_TREE); + decl_attributes (&ti_decl, na, 0); + } + + /* Check static VAR_DECL's. */ + for (member = TYPE_FIELDS (t); member; member = DECL_CHAIN (member)) + if (TREE_CODE (member) == VAR_DECL) + maybe_add_dllexport (member); + + /* Check FUNCTION_DECL's. */ + for (member = TYPE_METHODS (t); member; member = DECL_CHAIN (member)) + if (TREE_CODE (member) == FUNCTION_DECL) + { + tree thunk; + maybe_add_dllexport (member); + + /* Also add the attribute to its thunks. */ + for (thunk = DECL_THUNKS (member); thunk; + thunk = TREE_CHAIN (thunk)) + maybe_add_dllexport (thunk); + } + /* Check vtables */ + for (member = CLASSTYPE_VTABLES (t); member; member = DECL_CHAIN (member)) + if (TREE_CODE (member) == VAR_DECL) + maybe_add_dllexport (member); + } + + else if (lookup_attribute ("dllimport", TYPE_ATTRIBUTES (t)) != NULL_TREE) + { + /* We don't actually add the attribute to the decl, just set the flag + that signals that the address of this symbol is not a compile-time + constant. Any subsequent out-of-class declaration of members wil + cause the DECL_DLLIMPORT_P flag to be unset. + (See tree.c: merge_dllimport_decl_attributes). + That is just right since out-of class declarations can only be a + definition. */ + + /* Check static VAR_DECL's. */ + for (member = TYPE_FIELDS (t); member; member = DECL_CHAIN (member)) + if (TREE_CODE (member) == VAR_DECL) + maybe_add_dllimport (member); + + /* Check FUNCTION_DECL's. */ + for (member = TYPE_METHODS (t); member; member = DECL_CHAIN (member)) + if (TREE_CODE (member) == FUNCTION_DECL) + { + tree thunk; + maybe_add_dllimport (member); + + /* Also add the attribute to its thunks. 
*/ + for (thunk = DECL_THUNKS (member); thunk; + thunk = DECL_CHAIN (thunk)) + maybe_add_dllimport (thunk); + } + + /* Check vtables */ + for (member = CLASSTYPE_VTABLES (t); member; member = DECL_CHAIN (member)) + if (TREE_CODE (member) == VAR_DECL) + maybe_add_dllimport (member); + + /* We leave typeinfo tables alone. We can't mark TI objects as + dllimport, since the address of a secondary VTT may be needed + for static initialization of a primary VTT. VTT's of + dllimport'd classes should always be link-once COMDAT. */ + } +} diff --git a/gcc/config/i386/winnt-stubs.c b/gcc/config/i386/winnt-stubs.c new file mode 100644 index 000000000..eb4f124bb --- /dev/null +++ b/gcc/config/i386/winnt-stubs.c @@ -0,0 +1,52 @@ +/* Dummy subroutines for language-specific support on Windows. + Contributed by Danny Smith (dannysmith@users.sourceforge.net) + Copyright (C) 2005, 2007, 2010 + Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 3, or (at your option) any later +version. + +GCC is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. */ + +#include "config.h" +#include "system.h" +#include "coretypes.h" +#include "tm.h" +#include "rtl.h" +#include "regs.h" +#include "hard-reg-set.h" +#include "output.h" +#include "tree.h" +#include "flags.h" +#include "tm_p.h" +#include "diagnostic-core.h" +#include "hashtab.h" + +bool +i386_pe_type_dllimport_p (tree decl ATTRIBUTE_UNUSED) +{ + return false; +} + + +bool +i386_pe_type_dllexport_p (tree decl ATTRIBUTE_UNUSED) +{ + return false; +} + + +void +i386_pe_adjust_class_at_definition (tree t ATTRIBUTE_UNUSED) +{ } diff --git a/gcc/config/i386/winnt.c b/gcc/config/i386/winnt.c new file mode 100644 index 000000000..169832fd3 --- /dev/null +++ b/gcc/config/i386/winnt.c @@ -0,0 +1,1134 @@ +/* Subroutines for insn-output.c for Windows NT. + Contributed by Douglas Rupp (drupp@cs.washington.edu) + Copyright (C) 1995, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, + 2005, 2006, 2007, 2008, 2009, 2010 Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 3, or (at your option) any later +version. + +GCC is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. */ + +#include "config.h" +#include "system.h" +#include "coretypes.h" +#include "tm.h" +#include "rtl.h" +#include "regs.h" +#include "hard-reg-set.h" +#include "output.h" +#include "tree.h" +#include "flags.h" +#include "tm_p.h" +#include "diagnostic-core.h" +#include "hashtab.h" +#include "langhooks.h" +#include "ggc.h" +#include "target.h" +#include "except.h" +#include "lto-streamer.h" + +/* i386/PE specific attribute support. 
+ + i386/PE has two new attributes: + dllexport - for exporting a function/variable that will live in a dll + dllimport - for importing a function/variable from a dll + + Microsoft allows multiple declspecs in one __declspec, separating + them with spaces. We do NOT support this. Instead, use __declspec + multiple times. +*/ + +/* Handle a "shared" attribute; + arguments as in struct attribute_spec.handler. */ +tree +ix86_handle_shared_attribute (tree *node, tree name, + tree args ATTRIBUTE_UNUSED, + int flags ATTRIBUTE_UNUSED, bool *no_add_attrs) +{ + if (TREE_CODE (*node) != VAR_DECL) + { + warning (OPT_Wattributes, "%qE attribute only applies to variables", + name); + *no_add_attrs = true; + } + + return NULL_TREE; +} + +/* Handle a "selectany" attribute; + arguments as in struct attribute_spec.handler. */ +tree +ix86_handle_selectany_attribute (tree *node, tree name, + tree args ATTRIBUTE_UNUSED, + int flags ATTRIBUTE_UNUSED, + bool *no_add_attrs) +{ + /* The attribute applies only to objects that are initialized and have + external linkage. However, we may not know about initialization + until the language frontend has processed the decl. We'll check for + initialization later in encode_section_info. */ + if (TREE_CODE (*node) != VAR_DECL || !TREE_PUBLIC (*node)) + { + error ("%qE attribute applies only to initialized variables" + " with external linkage", name); + *no_add_attrs = true; + } + + return NULL_TREE; +} + + +/* Return the type that we should use to determine if DECL is + imported or exported. */ + +static tree +associated_type (tree decl) +{ + return (DECL_CONTEXT (decl) && TYPE_P (DECL_CONTEXT (decl)) + ? DECL_CONTEXT (decl) : NULL_TREE); +} + +/* Return true if DECL should be a dllexport'd object. */ + +static bool +i386_pe_determine_dllexport_p (tree decl) +{ + if (TREE_CODE (decl) != VAR_DECL && TREE_CODE (decl) != FUNCTION_DECL) + return false; + + /* Don't export local clones of dllexports. */ + if (!TREE_PUBLIC (decl)) + return false; + + if (lookup_attribute ("dllexport", DECL_ATTRIBUTES (decl))) + return true; + + return false; +} + +/* Return true if DECL should be a dllimport'd object. */ + +static bool +i386_pe_determine_dllimport_p (tree decl) +{ + tree assoc; + + if (TREE_CODE (decl) != VAR_DECL && TREE_CODE (decl) != FUNCTION_DECL) + return false; + + if (DECL_DLLIMPORT_P (decl)) + return true; + + /* The DECL_DLLIMPORT_P flag was set for decls in the class definition + by targetm.cxx.adjust_class_at_definition. Check again to emit + error message if the class attribute has been overridden by an + out-of-class definition of static data. */ + assoc = associated_type (decl); + if (assoc && lookup_attribute ("dllimport", TYPE_ATTRIBUTES (assoc)) + && TREE_CODE (decl) == VAR_DECL + && TREE_STATIC (decl) && TREE_PUBLIC (decl) + && !DECL_EXTERNAL (decl) + /* vtable's are linkonce constants, so defining a vtable is not + an error as long as we don't try to import it too. */ + && !DECL_VIRTUAL_P (decl)) + error ("definition of static data member %q+D of " + "dllimport%'d class", decl); + + return false; +} + +/* Handle the -mno-fun-dllimport target switch. */ + +bool +i386_pe_valid_dllimport_attribute_p (const_tree decl) +{ + if (TARGET_NOP_FUN_DLLIMPORT && TREE_CODE (decl) == FUNCTION_DECL) + return false; + return true; +} + +/* Return string which is the function name, identified by ID, modified + with a suffix consisting of an atsign (@) followed by the number of + bytes of arguments. If ID is NULL use the DECL_NAME as base. 
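/* Editor's sketch (not part of the imported source): user-level C code of
   the kind that exercises the attribute handling above.  The MYLIB_API
   macro, the symbol names, and the BUILDING_MYLIB guard are hypothetical;
   the attributes themselves are the ones this file handles.  */

#ifdef BUILDING_MYLIB
# define MYLIB_API __attribute__ ((dllexport))
#else
# define MYLIB_API __attribute__ ((dllimport))
#endif

MYLIB_API int mylib_add (int a, int b);   /* function import/export */
MYLIB_API extern int mylib_version;       /* data needs the attribute too */

/* 'selectany' applies only to initialized variables with external
   linkage; duplicate definitions are then merged by the linker.  */
__attribute__ ((selectany)) int mylib_flag = 1;

/* The documented pattern for a process-shared variable combines a named
   section with the "shared" attribute and an initializer.  */
int mylib_hits __attribute__ ((section ("shared"), shared)) = 0;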
If + FASTCALL is true, also add the FASTCALL_PREFIX. + Return NULL if no change required. */ + +static tree +gen_stdcall_or_fastcall_suffix (tree decl, tree id, bool fastcall) +{ + HOST_WIDE_INT total = 0; + const char *old_str = IDENTIFIER_POINTER (id != NULL_TREE ? id : DECL_NAME (decl)); + char *new_str, *p; + tree type = TREE_TYPE (decl); + tree arg; + function_args_iterator args_iter; + + gcc_assert (TREE_CODE (decl) == FUNCTION_DECL); + + if (prototype_p (type)) + { + /* This attribute is ignored for variadic functions. */ + if (stdarg_p (type)) + return NULL_TREE; + + /* Quit if we hit an incomplete type. Error is reported + by convert_arguments in c-typeck.c or cp/typeck.c. */ + FOREACH_FUNCTION_ARGS(type, arg, args_iter) + { + HOST_WIDE_INT parm_size; + HOST_WIDE_INT parm_boundary_bytes = PARM_BOUNDARY / BITS_PER_UNIT; + + if (! COMPLETE_TYPE_P (arg)) + break; + + parm_size = int_size_in_bytes (arg); + if (parm_size < 0) + break; + + /* Must round up to include padding. This is done the same + way as in store_one_arg. */ + parm_size = ((parm_size + parm_boundary_bytes - 1) + / parm_boundary_bytes * parm_boundary_bytes); + total += parm_size; + } + } + /* Assume max of 8 base 10 digits in the suffix. */ + p = new_str = XALLOCAVEC (char, 1 + strlen (old_str) + 1 + 8 + 1); + if (fastcall) + *p++ = FASTCALL_PREFIX; + sprintf (p, "%s@" HOST_WIDE_INT_PRINT_DEC, old_str, total); + + return get_identifier (new_str); +} + +/* Maybe decorate and get a new identifier for the DECL of a stdcall or + fastcall function. The original identifier is supplied in ID. */ + +static tree +i386_pe_maybe_mangle_decl_assembler_name (tree decl, tree id) +{ + tree new_id = NULL_TREE; + + if (TREE_CODE (decl) == FUNCTION_DECL) + { + tree type_attributes = TYPE_ATTRIBUTES (TREE_TYPE (decl)); + if (lookup_attribute ("stdcall", type_attributes)) + new_id = gen_stdcall_or_fastcall_suffix (decl, id, false); + else if (lookup_attribute ("fastcall", type_attributes)) + new_id = gen_stdcall_or_fastcall_suffix (decl, id, true); + } + + return new_id; +} + +/* Emit an assembler directive to set symbol for DECL visibility to + the visibility type VIS, which must not be VISIBILITY_DEFAULT. + As for PE there is no hidden support in gas, we just warn for + user-specified visibility attributes. */ + +void +i386_pe_assemble_visibility (tree decl, + int vis ATTRIBUTE_UNUSED) +{ + if (!decl + || !lookup_attribute ("visibility", DECL_ATTRIBUTES (decl))) + return; + warning (OPT_Wattributes, "visibility attribute not supported " + "in this configuration; ignored"); +} + +/* This is used as a target hook to modify the DECL_ASSEMBLER_NAME + in the language-independent default hook + langhooks,c:lhd_set_decl_assembler_name () + and in cp/mangle,c:mangle_decl (). */ +tree +i386_pe_mangle_decl_assembler_name (tree decl, tree id) +{ + tree new_id = i386_pe_maybe_mangle_decl_assembler_name (decl, id); + + return (new_id ? new_id : id); +} + +/* This hook behaves the same as varasm.c/assemble_name(), but + generates the name into memory rather than outputting it to + a file stream. */ + +tree +i386_pe_mangle_assembler_name (const char *name ATTRIBUTE_UNUSED) +{ + const char *skipped = name + (*name == '*' ? 
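/* Editor's sketch (not GCC code): the decoration scheme implemented by
   gen_stdcall_or_fastcall_suffix above, modelled as a stand-alone helper.
   TOTAL is the argument size in bytes, already rounded up to the 4-byte
   PARM_BOUNDARY as the real code does; 'decorate' is a hypothetical name.
   The usual user_label_prefix underscore is applied separately, so on
   32-bit PE the assembly-level names become e.g. _f@16 and @g@8.  */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static char *
decorate (const char *name, unsigned total, int fastcall)
{
  char *buf = (char *) malloc (strlen (name) + 16);
  sprintf (buf, "%s%s@%u", fastcall ? "@" : "", name, total);
  return buf;
}

int
main (void)
{
  /* int __attribute__((stdcall))  f (int, int, int, int)  ->  f@16  */
  /* int __attribute__((fastcall)) g (int, int)            ->  @g@8  */
  puts (decorate ("f", 16, 0));
  puts (decorate ("g", 8, 1));
  return 0;
}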
1 : 0); + const char *stripped = targetm.strip_name_encoding (skipped); + if (*name != '*' && *user_label_prefix && *stripped != FASTCALL_PREFIX) + stripped = ACONCAT ((user_label_prefix, stripped, NULL)); + return get_identifier (stripped); +} + +void +i386_pe_encode_section_info (tree decl, rtx rtl, int first) +{ + rtx symbol; + int flags; + + /* Do this last, due to our frobbing of DECL_DLLIMPORT_P above. */ + default_encode_section_info (decl, rtl, first); + + /* Careful not to prod global register variables. */ + if (!MEM_P (rtl)) + return; + + symbol = XEXP (rtl, 0); + gcc_assert (GET_CODE (symbol) == SYMBOL_REF); + + switch (TREE_CODE (decl)) + { + case FUNCTION_DECL: + /* FIXME: Imported stdcall names are not modified by the Ada frontend. + Check and decorate the RTL name now. */ + if (strcmp (lang_hooks.name, "GNU Ada") == 0) + { + tree new_id; + tree old_id = DECL_ASSEMBLER_NAME (decl); + const char* asm_str = IDENTIFIER_POINTER (old_id); + /* Do not change the identifier if a verbatim asmspec + or if stdcall suffix already added. */ + if (!(*asm_str == '*' || strchr (asm_str, '@')) + && (new_id = i386_pe_maybe_mangle_decl_assembler_name (decl, + old_id))) + XSTR (symbol, 0) = IDENTIFIER_POINTER (new_id); + } + break; + + case VAR_DECL: + if (lookup_attribute ("selectany", DECL_ATTRIBUTES (decl))) + { + if (DECL_INITIAL (decl) + /* If an object is initialized with a ctor, the static + initialization and destruction code for it is present in + each unit defining the object. The code that calls the + ctor is protected by a link-once guard variable, so that + the object still has link-once semantics, */ + || TYPE_NEEDS_CONSTRUCTING (TREE_TYPE (decl))) + make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl)); + else + error ("%q+D:'selectany' attribute applies only to " + "initialized objects", decl); + } + break; + + default: + return; + } + + /* Mark the decl so we can tell from the rtl whether the object is + dllexport'd or dllimport'd. tree.c: merge_dllimport_decl_attributes + handles dllexport/dllimport override semantics. */ + flags = (SYMBOL_REF_FLAGS (symbol) & + ~(SYMBOL_FLAG_DLLIMPORT | SYMBOL_FLAG_DLLEXPORT)); + if (i386_pe_determine_dllexport_p (decl)) + flags |= SYMBOL_FLAG_DLLEXPORT; + else if (i386_pe_determine_dllimport_p (decl)) + flags |= SYMBOL_FLAG_DLLIMPORT; + + SYMBOL_REF_FLAGS (symbol) = flags; +} + +bool +i386_pe_binds_local_p (const_tree exp) +{ + /* PE does not do dynamic binding. Indeed, the only kind of + non-local reference comes from a dllimport'd symbol. */ + if ((TREE_CODE (exp) == VAR_DECL || TREE_CODE (exp) == FUNCTION_DECL) + && DECL_DLLIMPORT_P (exp)) + return false; + + /* Or a weak one, now that they are supported. */ + if ((TREE_CODE (exp) == VAR_DECL || TREE_CODE (exp) == FUNCTION_DECL) + && DECL_WEAK (exp)) + return false; + + return true; +} + +/* Also strip the fastcall prefix and stdcall suffix. */ + +const char * +i386_pe_strip_name_encoding_full (const char *str) +{ + const char *p; + const char *name = default_strip_name_encoding (str); + + /* Strip leading '@' on fastcall symbols. */ + if (*name == '@') + name++; + + /* Strip trailing "@n". */ + p = strchr (name, '@'); + if (p) + return ggc_alloc_string (name, p - name); + + return name; +} + +void +i386_pe_unique_section (tree decl, int reloc) +{ + int len; + const char *name, *prefix; + char *string; + + name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)); + name = i386_pe_strip_name_encoding_full (name); + + /* The object is put in, for example, section .text$foo. 
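/* Editor's sketch (not GCC code): what i386_pe_strip_name_encoding_full
   does to a decorated symbol, using plain libc instead of the compiler's
   GC allocator.  'strip_decoration' is a hypothetical name; stripping the
   leading '*' / user_label_prefix is left to default_strip_name_encoding
   in the real code and is not repeated here.  */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static char *
strip_decoration (const char *str)
{
  const char *p;

  if (*str == '@')              /* fastcall prefix */
    str++;
  p = strchr (str, '@');        /* stdcall "@n" suffix */
  if (p)
    {
      size_t n = (size_t) (p - str);
      char *res = (char *) malloc (n + 1);
      memcpy (res, str, n);
      res[n] = '\0';
      return res;
    }
  return strdup (str);
}

int
main (void)
{
  puts (strip_decoration ("@g@8"));    /* prints "g"     */
  puts (strip_decoration ("f@16"));    /* prints "f"     */
  puts (strip_decoration ("plain"));   /* prints "plain" */
  return 0;
}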
+ The linker will then ultimately place them in .text + (everything from the $ on is stripped). Don't put + read-only data in .rdata section to avoid a PE linker + bug when .rdata$* grouped sections are used in code + without a .rdata section. */ + if (TREE_CODE (decl) == FUNCTION_DECL) + prefix = ".text$"; + else if (decl_readonly_section (decl, reloc)) + prefix = ".rdata$"; + else + prefix = ".data$"; + len = strlen (name) + strlen (prefix); + string = XALLOCAVEC (char, len + 1); + sprintf (string, "%s%s", prefix, name); + + DECL_SECTION_NAME (decl) = build_string (len, string); +} + +/* Select a set of attributes for section NAME based on the properties + of DECL and whether or not RELOC indicates that DECL's initializer + might contain runtime relocations. + + We make the section read-only and executable for a function decl, + read-only for a const data decl, and writable for a non-const data decl. + + If the section has already been defined, to not allow it to have + different attributes, as (1) this is ambiguous since we're not seeing + all the declarations up front and (2) some assemblers (e.g. SVR4) + do not recognize section redefinitions. */ +/* ??? This differs from the "standard" PE implementation in that we + handle the SHARED variable attribute. Should this be done for all + PE targets? */ + +#define SECTION_PE_SHARED SECTION_MACH_DEP + +unsigned int +i386_pe_section_type_flags (tree decl, const char *name, int reloc) +{ + static htab_t htab; + unsigned int flags; + unsigned int **slot; + + /* The names we put in the hashtable will always be the unique + versions given to us by the stringtable, so we can just use + their addresses as the keys. */ + if (!htab) + htab = htab_create (31, htab_hash_pointer, htab_eq_pointer, NULL); + + if (decl && TREE_CODE (decl) == FUNCTION_DECL) + flags = SECTION_CODE; + else if (decl && decl_readonly_section (decl, reloc)) + flags = 0; + else + { + flags = SECTION_WRITE; + + if (decl && TREE_CODE (decl) == VAR_DECL + && lookup_attribute ("shared", DECL_ATTRIBUTES (decl))) + flags |= SECTION_PE_SHARED; + } + + if (decl && DECL_ONE_ONLY (decl)) + flags |= SECTION_LINKONCE; + + /* See if we already have an entry for this section. */ + slot = (unsigned int **) htab_find_slot (htab, name, INSERT); + if (!*slot) + { + *slot = (unsigned int *) xmalloc (sizeof (unsigned int)); + **slot = flags; + } + else + { + if (decl && **slot != flags) + error ("%q+D causes a section type conflict", decl); + } + + return flags; +} + +void +i386_pe_asm_named_section (const char *name, unsigned int flags, + tree decl) +{ + char flagchars[8], *f = flagchars; + + if ((flags & (SECTION_CODE | SECTION_WRITE)) == 0) + /* readonly data */ + { + *f++ ='d'; /* This is necessary for older versions of gas. */ + *f++ ='r'; + } + else + { + if (flags & SECTION_CODE) + *f++ = 'x'; + if (flags & SECTION_WRITE) + *f++ = 'w'; + if (flags & SECTION_PE_SHARED) + *f++ = 's'; + } + + /* LTO sections need 1-byte alignment to avoid confusing the + zlib decompression algorithm with trailing zero pad bytes. */ + if (strncmp (name, LTO_SECTION_NAME_PREFIX, + strlen (LTO_SECTION_NAME_PREFIX)) == 0) + *f++ = '0'; + + *f = '\0'; + + fprintf (asm_out_file, "\t.section\t%s,\"%s\"\n", name, flagchars); + + if (flags & SECTION_LINKONCE) + { + /* Functions may have been compiled at various levels of + optimization so we can't use `same_size' here. + Instead, have the linker pick one, without warning. 
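/* Editor's sketch (not GCC code): the flag letters emitted by
   i386_pe_asm_named_section above for a few representative PE sections.
   The SEC_* bits and the section names are simplified stand-ins for the
   real SECTION_* values and unique-section names.  */

#include <stdio.h>

#define SEC_CODE   1
#define SEC_WRITE  2
#define SEC_SHARED 4

static void
emit_section (const char *name, unsigned flags)
{
  char fl[8], *f = fl;

  if ((flags & (SEC_CODE | SEC_WRITE)) == 0)
    {
      *f++ = 'd';               /* read-only data; 'd' kept for old gas */
      *f++ = 'r';
    }
  else
    {
      if (flags & SEC_CODE)   *f++ = 'x';
      if (flags & SEC_WRITE)  *f++ = 'w';
      if (flags & SEC_SHARED) *f++ = 's';
    }
  *f = '\0';
  printf ("\t.section\t%s,\"%s\"\n", name, fl);
}

int
main (void)
{
  emit_section (".text$foo", SEC_CODE);    /* flags "x"  */
  emit_section (".rdata$bar", 0);          /* flags "dr" */
  emit_section (".data$baz", SEC_WRITE);   /* flags "w"  */
  return 0;
}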
+ If 'selectany' attribute has been specified, MS compiler + sets 'discard' characteristic, rather than telling linker + to warn of size or content mismatch, so do the same. */ + bool discard = (flags & SECTION_CODE) + || lookup_attribute ("selectany", + DECL_ATTRIBUTES (decl)); + fprintf (asm_out_file, "\t.linkonce %s\n", + (discard ? "discard" : "same_size")); + } +} + +/* Beware, DECL may be NULL if compile_file() is emitting the LTO marker. */ + +void +i386_pe_asm_output_aligned_decl_common (FILE *stream, tree decl, + const char *name, HOST_WIDE_INT size, + HOST_WIDE_INT align ATTRIBUTE_UNUSED) +{ + HOST_WIDE_INT rounded; + + /* Compute as in assemble_noswitch_variable, since we don't have + support for aligned common on older binutils. We must also + avoid emitting a common symbol of size zero, as this is the + overloaded representation that indicates an undefined external + symbol in the PE object file format. */ + rounded = size ? size : 1; + rounded += (BIGGEST_ALIGNMENT / BITS_PER_UNIT) - 1; + rounded = (rounded / (BIGGEST_ALIGNMENT / BITS_PER_UNIT) + * (BIGGEST_ALIGNMENT / BITS_PER_UNIT)); + + i386_pe_maybe_record_exported_symbol (decl, name, 1); + + fprintf (stream, "\t.comm\t"); + assemble_name (stream, name); + if (use_pe_aligned_common) + fprintf (stream, ", " HOST_WIDE_INT_PRINT_DEC ", %d\n", + size ? size : (HOST_WIDE_INT) 1, + exact_log2 (align) - exact_log2 (CHAR_BIT)); + else + fprintf (stream, ", " HOST_WIDE_INT_PRINT_DEC "\t" ASM_COMMENT_START + " " HOST_WIDE_INT_PRINT_DEC "\n", rounded, size); +} + +/* The Microsoft linker requires that every function be marked as + DT_FCN. When using gas on cygwin, we must emit appropriate .type + directives. */ + +#include "gsyms.h" + +/* Mark a function appropriately. This should only be called for + functions for which we are not emitting COFF debugging information. + FILE is the assembler output file, NAME is the name of the + function, and PUB is nonzero if the function is globally + visible. */ + +void +i386_pe_declare_function_type (FILE *file, const char *name, int pub) +{ + fprintf (file, "\t.def\t"); + assemble_name (file, name); + fprintf (file, ";\t.scl\t%d;\t.type\t%d;\t.endef\n", + pub ? (int) C_EXT : (int) C_STAT, + (int) DT_FCN << N_BTSHFT); +} + +/* Keep a list of external functions. */ + +struct GTY(()) extern_list +{ + struct extern_list *next; + tree decl; + const char *name; +}; + +static GTY(()) struct extern_list *extern_head; + +/* Assemble an external function reference. We need to keep a list of + these, so that we can output the function types at the end of the + assembly. We can't output the types now, because we might see a + definition of the function later on and emit debugging information + for it then. */ + +void +i386_pe_record_external_function (tree decl, const char *name) +{ + struct extern_list *p; + + p = ggc_alloc_extern_list (); + p->next = extern_head; + p->decl = decl; + p->name = name; + extern_head = p; +} + +/* Keep a list of exported symbols. */ + +struct GTY(()) export_list +{ + struct export_list *next; + const char *name; + int is_data; /* used to type tag exported symbols. */ +}; + +static GTY(()) struct export_list *export_head; + +/* Assemble an export symbol entry. We need to keep a list of + these, so that we can output the export list at the end of the + assembly. We used to output these export symbols in each function, + but that causes problems with GNU ld when the sections are + linkonce. Beware, DECL may be NULL if compile_file() is emitting + the LTO marker. 
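/* Editor's sketch (not GCC code): the size rounding performed above for
   .comm when aligned common is not available.  16 stands in for
   BIGGEST_ALIGNMENT / BITS_PER_UNIT; the real value is target-defined.  */

#include <stdio.h>

static long
round_common_size (long size)
{
  const long align = 16;           /* assumed BIGGEST_ALIGNMENT in bytes */
  long rounded = size ? size : 1;  /* never emit a zero-sized common */

  rounded += align - 1;
  return rounded / align * align;
}

int
main (void)
{
  printf ("%ld %ld %ld\n",
          round_common_size (0),    /* 16 */
          round_common_size (1),    /* 16 */
          round_common_size (40));  /* 48 */
  return 0;
}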
*/ + +void +i386_pe_maybe_record_exported_symbol (tree decl, const char *name, int is_data) +{ + rtx symbol; + struct export_list *p; + + if (!decl) + return; + + symbol = XEXP (DECL_RTL (decl), 0); + gcc_assert (GET_CODE (symbol) == SYMBOL_REF); + if (!SYMBOL_REF_DLLEXPORT_P (symbol)) + return; + + gcc_assert (TREE_PUBLIC (decl)); + + p = ggc_alloc_export_list (); + p->next = export_head; + p->name = name; + p->is_data = is_data; + export_head = p; +} + +#ifdef CXX_WRAP_SPEC_LIST + +/* Hash table equality helper function. */ + +static int +wrapper_strcmp (const void *x, const void *y) +{ + return !strcmp ((const char *) x, (const char *) y); +} + +/* Search for a function named TARGET in the list of library wrappers + we are using, returning a pointer to it if found or NULL if not. + This function might be called on quite a few symbols, and we only + have the list of names of wrapped functions available to us as a + spec string, so first time round we lazily initialise a hash table + to make things quicker. */ + +static const char * +i386_find_on_wrapper_list (const char *target) +{ + static char first_time = 1; + static htab_t wrappers; + + if (first_time) + { + /* Beware that this is not a complicated parser, it assumes + that any sequence of non-whitespace beginning with an + underscore is one of the wrapped symbols. For now that's + adequate to distinguish symbols from spec substitutions + and command-line options. */ + static char wrapper_list_buffer[] = CXX_WRAP_SPEC_LIST; + char *bufptr; + /* Breaks up the char array into separated strings + strings and enter them into the hash table. */ + wrappers = htab_create_alloc (8, htab_hash_string, wrapper_strcmp, + 0, xcalloc, free); + for (bufptr = wrapper_list_buffer; *bufptr; ++bufptr) + { + char *found = NULL; + if (ISSPACE (*bufptr)) + continue; + if (*bufptr == '_') + found = bufptr; + while (*bufptr && !ISSPACE (*bufptr)) + ++bufptr; + if (*bufptr) + *bufptr = 0; + if (found) + *htab_find_slot (wrappers, found, INSERT) = found; + } + first_time = 0; + } + + return (const char *) htab_find (wrappers, target); +} + +#endif /* CXX_WRAP_SPEC_LIST */ + +/* This is called at the end of assembly. For each external function + which has not been defined, we output a declaration now. We also + output the .drectve section. */ + +void +i386_pe_file_end (void) +{ + struct extern_list *p; + + for (p = extern_head; p != NULL; p = p->next) + { + tree decl; + + decl = p->decl; + + /* Positively ensure only one declaration for any given symbol. */ + if (! TREE_ASM_WRITTEN (decl) + && TREE_SYMBOL_REFERENCED (DECL_ASSEMBLER_NAME (decl))) + { +#ifdef CXX_WRAP_SPEC_LIST + /* To ensure the DLL that provides the corresponding real + functions is still loaded at runtime, we must reference + the real function so that an (unused) import is created. */ + const char *realsym = i386_find_on_wrapper_list (p->name); + if (realsym) + i386_pe_declare_function_type (asm_out_file, + concat ("__real_", realsym, NULL), TREE_PUBLIC (decl)); +#endif /* CXX_WRAP_SPEC_LIST */ + TREE_ASM_WRITTEN (decl) = 1; + i386_pe_declare_function_type (asm_out_file, p->name, + TREE_PUBLIC (decl)); + } + } + + if (export_head) + { + struct export_list *q; + drectve_section (); + for (q = export_head; q != NULL; q = q->next) + { + fprintf (asm_out_file, "\t.ascii \" -export:\\\"%s\\\"%s\"\n", + default_strip_name_encoding (q->name), + (q->is_data ? ",data" : "")); + } + } +} + + +/* x64 Structured Exception Handling unwind info. 
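/* Editor's sketch (not GCC code): the export directives that
   i386_pe_file_end emits for the recorded symbols, printed from a plain
   array instead of the GC'd export_list.  The symbol names are
   hypothetical, and the ".section .drectve" line is only an approximation
   of what drectve_section () switches to.  */

#include <stdio.h>

struct export_entry { const char *name; int is_data; };

int
main (void)
{
  static const struct export_entry exports[] = {
    { "mylib_add", 0 },       /* hypothetical exported function */
    { "mylib_version", 1 },   /* hypothetical exported variable */
  };
  unsigned i;

  printf ("\t.section .drectve\n");
  for (i = 0; i < sizeof exports / sizeof exports[0]; i++)
    printf ("\t.ascii \" -export:\\\"%s\\\"%s\"\n",
            exports[i].name, exports[i].is_data ? ",data" : "");
  return 0;
}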
*/ + +struct seh_frame_state +{ + /* SEH records saves relative to the "current" stack pointer, whether + or not there's a frame pointer in place. This tracks the current + stack pointer offset from the CFA. */ + HOST_WIDE_INT sp_offset; + + /* The CFA is located at CFA_REG + CFA_OFFSET. */ + HOST_WIDE_INT cfa_offset; + rtx cfa_reg; +}; + +/* Set up data structures beginning output for SEH. */ + +void +i386_pe_seh_init (FILE *f) +{ + struct seh_frame_state *seh; + + if (!TARGET_SEH) + return; + if (cfun->is_thunk) + return; + + /* We cannot support DRAP with SEH. We turned off support for it by + re-defining MAX_STACK_ALIGNMENT when SEH is enabled. */ + gcc_assert (!stack_realign_drap); + + seh = XCNEW (struct seh_frame_state); + cfun->machine->seh = seh; + + seh->sp_offset = INCOMING_FRAME_SP_OFFSET; + seh->cfa_offset = INCOMING_FRAME_SP_OFFSET; + seh->cfa_reg = stack_pointer_rtx; + + fputs ("\t.seh_proc\t", f); + assemble_name (f, IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (cfun->decl))); + fputc ('\n', f); +} + +void +i386_pe_seh_end_prologue (FILE *f) +{ + struct seh_frame_state *seh; + + if (!TARGET_SEH) + return; + if (cfun->is_thunk) + return; + seh = cfun->machine->seh; + + /* Emit an assembler directive to set up the frame pointer. Always do + this last. The documentation talks about doing this "before" any + other code that uses offsets, but (experimentally) that's after we + emit the codes in reverse order (handled by the assembler). */ + if (seh->cfa_reg != stack_pointer_rtx) + { + HOST_WIDE_INT offset = seh->sp_offset - seh->cfa_offset; + + gcc_assert ((offset & 15) == 0); + gcc_assert (IN_RANGE (offset, 0, 240)); + + fputs ("\t.seh_setframe\t", f); + print_reg (seh->cfa_reg, 0, f); + fprintf (f, ", " HOST_WIDE_INT_PRINT_DEC "\n", offset); + } + + XDELETE (seh); + cfun->machine->seh = NULL; + + fputs ("\t.seh_endprologue\n", f); +} + +static void +i386_pe_seh_fini (FILE *f) +{ + if (!TARGET_SEH) + return; + if (cfun->is_thunk) + return; + fputs ("\t.seh_endproc\n", f); +} + +/* Emit an assembler directive to save REG via a PUSH. */ + +static void +seh_emit_push (FILE *f, struct seh_frame_state *seh, rtx reg) +{ + unsigned int regno = REGNO (reg); + + gcc_checking_assert (GENERAL_REGNO_P (regno)); + + seh->sp_offset += UNITS_PER_WORD; + if (seh->cfa_reg == stack_pointer_rtx) + seh->cfa_offset += UNITS_PER_WORD; + + fputs ("\t.seh_pushreg\t", f); + print_reg (reg, 0, f); + fputc ('\n', f); +} + +/* Emit an assembler directive to save REG at CFA - CFA_OFFSET. */ + +static void +seh_emit_save (FILE *f, struct seh_frame_state *seh, + rtx reg, HOST_WIDE_INT cfa_offset) +{ + unsigned int regno = REGNO (reg); + HOST_WIDE_INT offset; + + /* Negative save offsets are of course not supported, since that + would be a store below the stack pointer and thus clobberable. */ + gcc_assert (seh->sp_offset >= cfa_offset); + offset = seh->sp_offset - cfa_offset; + + fputs ((SSE_REGNO_P (regno) ? "\t.seh_savexmm\t" + : GENERAL_REGNO_P (regno) ? "\t.seh_savereg\t" + : (gcc_unreachable (), "")), f); + print_reg (reg, 0, f); + fprintf (f, ", " HOST_WIDE_INT_PRINT_DEC "\n", offset); +} + +/* Emit an assembler directive to adjust RSP by OFFSET. */ + +static void +seh_emit_stackalloc (FILE *f, struct seh_frame_state *seh, + HOST_WIDE_INT offset) +{ + /* We're only concerned with prologue stack allocations, which all + are subtractions from the stack pointer. 
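/* Editor's sketch (not GCC code): the offset bookkeeping performed by
   seh_emit_push and seh_emit_stackalloc above, and the directive stream a
   simple prologue produces.  The function name is hypothetical, and 8
   stands in for UNITS_PER_WORD on x86-64.  */

#include <stdio.h>

struct frame { long sp_offset; long cfa_offset; int cfa_is_sp; };

static void
push_reg (struct frame *f, const char *reg)
{
  f->sp_offset += 8;
  if (f->cfa_is_sp)
    f->cfa_offset += 8;
  printf ("\t.seh_pushreg\t%%%s\n", reg);
}

static void
stackalloc (struct frame *f, long bytes)
{
  if (f->cfa_is_sp)
    f->cfa_offset += bytes;
  f->sp_offset += bytes;
  printf ("\t.seh_stackalloc\t%ld\n", bytes);
}

int
main (void)
{
  /* The return address is already on the stack at entry, so the CFA is
     8 bytes above the incoming stack pointer.  */
  struct frame f = { 8, 8, 1 };

  printf ("\t.seh_proc\tmy_func\n");
  push_reg (&f, "rbp");                 /* push %rbp        */
  stackalloc (&f, 32);                  /* sub  $32, %rsp   */
  printf ("\t.seh_endprologue\n");
  /* ... function body ... */
  printf ("\t.seh_endproc\n");
  return 0;
}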
*/ + gcc_assert (offset < 0); + offset = -offset; + + if (seh->cfa_reg == stack_pointer_rtx) + seh->cfa_offset += offset; + seh->sp_offset += offset; + + fprintf (f, "\t.seh_stackalloc\t" HOST_WIDE_INT_PRINT_DEC "\n", offset); +} + +/* Process REG_CFA_ADJUST_CFA for SEH. */ + +static void +seh_cfa_adjust_cfa (FILE *f, struct seh_frame_state *seh, rtx pat) +{ + rtx dest, src; + HOST_WIDE_INT reg_offset = 0; + unsigned int dest_regno; + + dest = SET_DEST (pat); + src = SET_SRC (pat); + + if (GET_CODE (src) == PLUS) + { + reg_offset = INTVAL (XEXP (src, 1)); + src = XEXP (src, 0); + } + else if (GET_CODE (src) == MINUS) + { + reg_offset = -INTVAL (XEXP (src, 1)); + src = XEXP (src, 0); + } + gcc_assert (src == stack_pointer_rtx); + gcc_assert (seh->cfa_reg == stack_pointer_rtx); + dest_regno = REGNO (dest); + + if (dest_regno == STACK_POINTER_REGNUM) + seh_emit_stackalloc (f, seh, reg_offset); + else if (dest_regno == HARD_FRAME_POINTER_REGNUM) + { + seh->cfa_reg = dest; + seh->cfa_offset -= reg_offset; + } + else + gcc_unreachable (); +} + +/* Process REG_CFA_OFFSET for SEH. */ + +static void +seh_cfa_offset (FILE *f, struct seh_frame_state *seh, rtx pat) +{ + rtx dest, src; + HOST_WIDE_INT reg_offset; + + dest = SET_DEST (pat); + src = SET_SRC (pat); + + gcc_assert (MEM_P (dest)); + dest = XEXP (dest, 0); + if (REG_P (dest)) + reg_offset = 0; + else + { + gcc_assert (GET_CODE (dest) == PLUS); + reg_offset = INTVAL (XEXP (dest, 1)); + dest = XEXP (dest, 0); + } + gcc_assert (dest == seh->cfa_reg); + + seh_emit_save (f, seh, src, seh->cfa_offset - reg_offset); +} + +/* Process a FRAME_RELATED_EXPR for SEH. */ + +static void +seh_frame_related_expr (FILE *f, struct seh_frame_state *seh, rtx pat) +{ + rtx dest, src; + HOST_WIDE_INT addend; + + /* See the full loop in dwarf2out_frame_debug_expr. */ + if (GET_CODE (pat) == PARALLEL || GET_CODE (pat) == SEQUENCE) + { + int i, n = XVECLEN (pat, 0), pass, npass; + + npass = (GET_CODE (pat) == PARALLEL ? 2 : 1); + for (pass = 0; pass < npass; ++pass) + for (i = 0; i < n; ++i) + { + rtx ele = XVECEXP (pat, 0, i); + + if (GET_CODE (ele) != SET) + continue; + dest = SET_DEST (ele); + + /* Process each member of the PARALLEL independently. The first + member is always processed; others only if they are marked. */ + if (i == 0 || RTX_FRAME_RELATED_P (ele)) + { + /* Evaluate all register saves in the first pass and all + register updates in the second pass. */ + if ((MEM_P (dest) ^ pass) || npass == 1) + seh_frame_related_expr (f, seh, ele); + } + } + return; + } + + dest = SET_DEST (pat); + src = SET_SRC (pat); + + switch (GET_CODE (dest)) + { + case REG: + switch (GET_CODE (src)) + { + case REG: + /* REG = REG: This should be establishing a frame pointer. */ + gcc_assert (src == stack_pointer_rtx); + gcc_assert (dest == hard_frame_pointer_rtx); + seh_cfa_adjust_cfa (f, seh, pat); + break; + + case PLUS: + addend = INTVAL (XEXP (src, 1)); + src = XEXP (src, 0); + if (dest == hard_frame_pointer_rtx) + seh_cfa_adjust_cfa (f, seh, pat); + else if (dest == stack_pointer_rtx) + { + gcc_assert (src == stack_pointer_rtx); + seh_emit_stackalloc (f, seh, addend); + } + else + gcc_unreachable (); + break; + + default: + gcc_unreachable (); + } + break; + + case MEM: + /* A save of some kind. 
*/ + dest = XEXP (dest, 0); + if (GET_CODE (dest) == PRE_DEC) + { + gcc_checking_assert (GET_MODE (src) == Pmode); + gcc_checking_assert (REG_P (src)); + seh_emit_push (f, seh, src); + } + else + seh_cfa_offset (f, seh, pat); + break; + + default: + gcc_unreachable (); + } +} + +/* This function looks at a single insn and emits any SEH directives + required for unwind of this insn. */ + +void +i386_pe_seh_unwind_emit (FILE *asm_out_file, rtx insn) +{ + rtx note, pat; + bool handled_one = false; + struct seh_frame_state *seh; + + if (!TARGET_SEH) + return; + + /* We free the SEH data once done with the prologue. Ignore those + RTX_FRAME_RELATED_P insns that are associated with the epilogue. */ + seh = cfun->machine->seh; + if (seh == NULL) + return; + + if (NOTE_P (insn) || !RTX_FRAME_RELATED_P (insn)) + return; + + for (note = REG_NOTES (insn); note ; note = XEXP (note, 1)) + { + pat = XEXP (note, 0); + switch (REG_NOTE_KIND (note)) + { + case REG_FRAME_RELATED_EXPR: + goto found; + + case REG_CFA_DEF_CFA: + case REG_CFA_EXPRESSION: + /* Only emitted with DRAP, which we disable. */ + gcc_unreachable (); + break; + + case REG_CFA_REGISTER: + /* Only emitted in epilogues, which we skip. */ + gcc_unreachable (); + + case REG_CFA_ADJUST_CFA: + if (pat == NULL) + { + pat = PATTERN (insn); + if (GET_CODE (pat) == PARALLEL) + pat = XVECEXP (pat, 0, 0); + } + seh_cfa_adjust_cfa (asm_out_file, seh, pat); + handled_one = true; + break; + + case REG_CFA_OFFSET: + if (pat == NULL) + pat = single_set (insn); + seh_cfa_offset (asm_out_file, seh, pat); + handled_one = true; + break; + + default: + break; + } + } + if (handled_one) + return; + pat = PATTERN (insn); + found: + seh_frame_related_expr (asm_out_file, seh, pat); +} + +void +i386_pe_start_function (FILE *f, const char *name, tree decl) +{ + i386_pe_maybe_record_exported_symbol (decl, name, 0); + if (write_symbols != SDB_DEBUG) + i386_pe_declare_function_type (f, name, TREE_PUBLIC (decl)); + /* In case section was altered by debugging output. */ + if (decl != NULL_TREE) + switch_to_section (function_section (decl)); + ASM_OUTPUT_FUNCTION_LABEL (f, name, decl); +} + +void +i386_pe_end_function (FILE *f, const char *name ATTRIBUTE_UNUSED, + tree decl ATTRIBUTE_UNUSED) +{ + i386_pe_seh_fini (f); +} + + +#include "gt-winnt.h" diff --git a/gcc/config/i386/wmmintrin.h b/gcc/config/i386/wmmintrin.h new file mode 100644 index 000000000..2c4bdc99a --- /dev/null +++ b/gcc/config/i386/wmmintrin.h @@ -0,0 +1,120 @@ +/* Copyright (C) 2008, 2009 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . 
*/ + +/* Implemented from the specification included in the Intel C++ Compiler + User Guide and Reference, version 10.1. */ + +#ifndef _WMMINTRIN_H_INCLUDED +#define _WMMINTRIN_H_INCLUDED + +/* We need definitions from the SSE2 header file. */ +#include + +#if !defined (__AES__) && !defined (__PCLMUL__) +# error "AES/PCLMUL instructions not enabled" +#else + +/* AES */ + +#ifdef __AES__ +/* Performs 1 round of AES decryption of the first m128i using + the second m128i as a round key. */ +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_aesdec_si128 (__m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_aesdec128 ((__v2di)__X, (__v2di)__Y); +} + +/* Performs the last round of AES decryption of the first m128i + using the second m128i as a round key. */ +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_aesdeclast_si128 (__m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_aesdeclast128 ((__v2di)__X, + (__v2di)__Y); +} + +/* Performs 1 round of AES encryption of the first m128i using + the second m128i as a round key. */ +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_aesenc_si128 (__m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_aesenc128 ((__v2di)__X, (__v2di)__Y); +} + +/* Performs the last round of AES encryption of the first m128i + using the second m128i as a round key. */ +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_aesenclast_si128 (__m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_aesenclast128 ((__v2di)__X, (__v2di)__Y); +} + +/* Performs the InverseMixColumn operation on the source m128i + and stores the result into m128i destination. */ +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_aesimc_si128 (__m128i __X) +{ + return (__m128i) __builtin_ia32_aesimc128 ((__v2di)__X); +} + +/* Generates a m128i round key for the input m128i AES cipher key and + byte round constant. The second parameter must be a compile time + constant. */ +#ifdef __OPTIMIZE__ +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_aeskeygenassist_si128 (__m128i __X, const int __C) +{ + return (__m128i) __builtin_ia32_aeskeygenassist128 ((__v2di)__X, __C); +} +#else +#define _mm_aeskeygenassist_si128(X, C) \ + ((__m128i) __builtin_ia32_aeskeygenassist128 ((__v2di)(__m128i)(X), \ + (int)(C))) +#endif +#endif /* __AES__ */ + +/* PCLMUL */ + +#ifdef __PCLMUL__ +/* Performs carry-less integer multiplication of 64-bit halves of + 128-bit input operands. The third parameter inducates which 64-bit + haves of the input parameters v1 and v2 should be used. It must be + a compile time constant. 
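/* Editor's sketch (not part of the header): typical use of the AES and
   PCLMUL intrinsics above.  Compile with -maes -mpclmul (or a -march=
   setting that provides them); the key and data values are placeholders,
   not a complete AES or GHASH implementation.  */

#include <wmmintrin.h>

__m128i
one_aes_round (__m128i state, __m128i round_key)
{
  /* One middle round of AES encryption on a 128-bit block.  */
  return _mm_aesenc_si128 (state, round_key);
}

__m128i
carryless_mul_low (__m128i a, __m128i b)
{
  /* Selector 0x00: multiply the low 64-bit halves of both operands.  */
  return _mm_clmulepi64_si128 (a, b, 0x00);
}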
*/ +#ifdef __OPTIMIZE__ +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_clmulepi64_si128 (__m128i __X, __m128i __Y, const int __I) +{ + return (__m128i) __builtin_ia32_pclmulqdq128 ((__v2di)__X, + (__v2di)__Y, __I); +} +#else +#define _mm_clmulepi64_si128(X, Y, I) \ + ((__m128i) __builtin_ia32_pclmulqdq128 ((__v2di)(__m128i)(X), \ + (__v2di)(__m128i)(Y), (int)(I))) +#endif +#endif /* __PCLMUL__ */ + +#endif /* __AES__/__PCLMUL__ */ + +#endif /* _WMMINTRIN_H_INCLUDED */ diff --git a/gcc/config/i386/x-cygwin b/gcc/config/i386/x-cygwin new file mode 100644 index 000000000..752af76ef --- /dev/null +++ b/gcc/config/i386/x-cygwin @@ -0,0 +1,4 @@ +host-cygwin.o : $(srcdir)/config/i386/host-cygwin.c $(CONFIG_H) $(SYSTEM_H) \ + coretypes.h hosthooks.h $(HOSTHOOKS_DEF_H) toplev.h diagnostic.h + $(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \ + $(srcdir)/config/i386/host-cygwin.c diff --git a/gcc/config/i386/x-darwin b/gcc/config/i386/x-darwin new file mode 100644 index 000000000..f0196bac4 --- /dev/null +++ b/gcc/config/i386/x-darwin @@ -0,0 +1,4 @@ +host-i386-darwin.o : $(srcdir)/config/i386/host-i386-darwin.c \ + $(CONFIG_H) $(SYSTEM_H) coretypes.h hosthooks.h $(HOSTHOOKS_DEF_H) \ + config/host-darwin.h + $(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) $< diff --git a/gcc/config/i386/x-i386 b/gcc/config/i386/x-i386 new file mode 100644 index 000000000..2bf8fed5d --- /dev/null +++ b/gcc/config/i386/x-i386 @@ -0,0 +1,4 @@ +driver-i386.o : $(srcdir)/config/i386/driver-i386.c \ + $(srcdir)/config/i386/cpuid.h \ + $(CONFIG_H) $(SYSTEM_H) $(TM_H) coretypes.h + $(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) $< diff --git a/gcc/config/i386/x-mingw32 b/gcc/config/i386/x-mingw32 new file mode 100644 index 000000000..2a1ca47c7 --- /dev/null +++ b/gcc/config/i386/x-mingw32 @@ -0,0 +1,31 @@ +# Copyright (C) 2003, 2004, 2008, 2009 Free Software Foundation, Inc. +# +# This file is part of GCC. +# +# GCC is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# GCC is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with GCC; see the file COPYING3. If not see +# . +# +# +# Make local_includedir relative to EXEC_PREFIX +# +local_includedir=$(libsubdir)/$(unlibsubdir)/..`echo $(exec_prefix) | sed -e 's|^$(prefix)||' -e 's|/[^/]*|/..|g'`/include + +# On MinGW, we use "%IA64d" to print 64-bit integers, and the format-checking +# code does not handle that, so we have to disable checking here. +WERROR_FLAGS += -Wno-format + +host-mingw32.o : $(srcdir)/config/i386/host-mingw32.c $(CONFIG_H) $(SYSTEM_H) \ + coretypes.h hosthooks.h hosthooks-def.h toplev.h $(DIAGNOSTIC_H) $(HOOKS_H) + $(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \ + $(srcdir)/config/i386/host-mingw32.c diff --git a/gcc/config/i386/x86-64.h b/gcc/config/i386/x86-64.h new file mode 100644 index 000000000..b85dab9cd --- /dev/null +++ b/gcc/config/i386/x86-64.h @@ -0,0 +1,106 @@ +/* OS independent definitions for AMD x86-64. + Copyright (C) 2001, 2005, 2007, 2009, 2010, 2011 + Free Software Foundation, Inc. 
+ Contributed by Bo Thorsen . + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +Under Section 7 of GPL version 3, you are granted additional +permissions described in the GCC Runtime Library Exception, version +3.1, as published by the Free Software Foundation. + +You should have received a copy of the GNU General Public License and +a copy of the GCC Runtime Library Exception along with this program; +see the files COPYING3 and COPYING.RUNTIME respectively. If not, see +. */ + +#undef ASM_COMMENT_START +#define ASM_COMMENT_START "#" + +#undef DBX_REGISTER_NUMBER +#define DBX_REGISTER_NUMBER(n) \ + (TARGET_64BIT ? dbx64_register_map[n] : svr4_dbx_register_map[n]) + +/* Output assembler code to FILE to call the profiler. */ +#define NO_PROFILE_COUNTERS 1 + +#undef MCOUNT_NAME +#define MCOUNT_NAME "mcount" + +#undef SIZE_TYPE +#define SIZE_TYPE (TARGET_64BIT ? "long unsigned int" : "unsigned int") + +#undef PTRDIFF_TYPE +#define PTRDIFF_TYPE (TARGET_64BIT ? "long int" : "int") + +#undef WCHAR_TYPE +#define WCHAR_TYPE "int" + +#undef WCHAR_TYPE_SIZE +#define WCHAR_TYPE_SIZE 32 + +#undef ASM_SPEC +#define ASM_SPEC "%{m32:--32} %{m64:--64}" + +#undef ASM_OUTPUT_ALIGNED_BSS +#define ASM_OUTPUT_ALIGNED_BSS(FILE, DECL, NAME, SIZE, ALIGN) \ + x86_output_aligned_bss (FILE, DECL, NAME, SIZE, ALIGN) + +#undef ASM_OUTPUT_ALIGNED_COMMON +#define ASM_OUTPUT_ALIGNED_COMMON(FILE, NAME, SIZE, ALIGN) \ + x86_elf_aligned_common (FILE, NAME, SIZE, ALIGN); + +/* This is used to align code labels according to Intel recommendations. */ + +#ifdef HAVE_GAS_MAX_SKIP_P2ALIGN +#define ASM_OUTPUT_MAX_SKIP_ALIGN(FILE,LOG,MAX_SKIP) \ + do { \ + if ((LOG) != 0) { \ + if ((MAX_SKIP) == 0) fprintf ((FILE), "\t.p2align %d\n", (LOG)); \ + else { \ + fprintf ((FILE), "\t.p2align %d,,%d\n", (LOG), (MAX_SKIP)); \ + /* Make sure that we have at least 8 byte alignment if > 8 byte \ + alignment is preferred. */ \ + if ((LOG) > 3 \ + && (1 << (LOG)) > ((MAX_SKIP) + 1) \ + && (MAX_SKIP) >= 7) \ + fputs ("\t.p2align 3\n", (FILE)); \ + } \ + } \ + } while (0) +#undef ASM_OUTPUT_MAX_SKIP_PAD +#define ASM_OUTPUT_MAX_SKIP_PAD(FILE, LOG, MAX_SKIP) \ + if ((LOG) != 0) \ + { \ + if ((MAX_SKIP) == 0) \ + fprintf ((FILE), "\t.p2align %d\n", (LOG)); \ + else \ + fprintf ((FILE), "\t.p2align %d,,%d\n", (LOG), (MAX_SKIP)); \ + } +#endif + + +/* i386 System V Release 4 uses DWARF debugging info. + x86-64 ABI specifies DWARF2. */ + +#define DWARF2_DEBUGGING_INFO 1 +#define DWARF2_UNWIND_INFO 1 + +#undef PREFERRED_DEBUGGING_TYPE +#define PREFERRED_DEBUGGING_TYPE DWARF2_DEBUG + +#undef TARGET_ASM_SELECT_SECTION +#define TARGET_ASM_SELECT_SECTION x86_64_elf_select_section + +#undef TARGET_ASM_UNIQUE_SECTION +#define TARGET_ASM_UNIQUE_SECTION x86_64_elf_unique_section diff --git a/gcc/config/i386/x86intrin.h b/gcc/config/i386/x86intrin.h new file mode 100644 index 000000000..36b43df87 --- /dev/null +++ b/gcc/config/i386/x86intrin.h @@ -0,0 +1,96 @@ +/* Copyright (C) 2008, 2009, 2010 Free Software Foundation, Inc. + + This file is part of GCC. 
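/* Editor's sketch (not GCC code): the label-alignment directives produced
   by the ASM_OUTPUT_MAX_SKIP_ALIGN definition above, rewritten as a plain
   function so the fall-back ".p2align 3" condition is easier to see.  */

#include <stdio.h>

static void
output_max_skip_align (FILE *file, int log, int max_skip)
{
  if (log == 0)
    return;
  if (max_skip == 0)
    fprintf (file, "\t.p2align %d\n", log);
  else
    {
      fprintf (file, "\t.p2align %d,,%d\n", log, max_skip);
      /* Keep at least 8-byte alignment when a larger alignment is
         preferred but may be skipped.  */
      if (log > 3 && (1 << log) > max_skip + 1 && max_skip >= 7)
        fputs ("\t.p2align 3\n", file);
    }
}

int
main (void)
{
  output_max_skip_align (stdout, 4, 0);   /* .p2align 4                      */
  output_max_skip_align (stdout, 4, 10);  /* .p2align 4,,10 then .p2align 3  */
  return 0;
}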
+ + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +#ifndef _X86INTRIN_H_INCLUDED +#define _X86INTRIN_H_INCLUDED + +#include + +#ifdef __MMX__ +#include +#endif + +#ifdef __SSE__ +#include +#endif + +#ifdef __SSE2__ +#include +#endif + +#ifdef __SSE3__ +#include +#endif + +#ifdef __SSSE3__ +#include +#endif + +#ifdef __SSE4A__ +#include +#endif + +#if defined (__SSE4_2__) || defined (__SSE4_1__) +#include +#endif + +#if defined (__AES__) || defined (__PCLMUL__) +#include +#endif + +/* For including AVX instructions */ +#include + +#ifdef __3dNOW__ +#include +#endif + +#ifdef __FMA4__ +#include +#endif + +#ifdef __XOP__ +#include +#endif + +#ifdef __LWP__ +#include +#endif + +#ifdef __ABM__ +#include +#endif + +#ifdef __BMI__ +#include +#endif + +#ifdef __TBM__ +#include +#endif + +#ifdef __POPCNT__ +#include +#endif + +#endif /* _X86INTRIN_H_INCLUDED */ diff --git a/gcc/config/i386/xm-cygwin.h b/gcc/config/i386/xm-cygwin.h new file mode 100644 index 000000000..bd2238729 --- /dev/null +++ b/gcc/config/i386/xm-cygwin.h @@ -0,0 +1,22 @@ +/* Configuration for GCC for hosting on Windows NT. + using a unix style C library. + Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2007 + Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. */ + +#define HOST_EXECUTABLE_SUFFIX ".exe" diff --git a/gcc/config/i386/xm-djgpp.h b/gcc/config/i386/xm-djgpp.h new file mode 100644 index 000000000..c3758ea9e --- /dev/null +++ b/gcc/config/i386/xm-djgpp.h @@ -0,0 +1,84 @@ +/* Configuration for GCC for Intel 80386 running DJGPP. + Copyright (C) 1988, 1996, 1998, 1999, 2000, 2001, 2004, 2007 + Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. 
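/* Editor's sketch (not part of the header): <x86intrin.h> above is the
   single umbrella include; which vendor headers it pulls in depends on
   the ISA macros the compiler predefines for the selected -m options.
   The sum4 helper is illustrative only.  */

#include <x86intrin.h>

float
sum4 (const float *p)
{
#ifdef __SSE__
  /* Uses the SSE header pulled in above when -msse (or higher) is on.  */
  __m128 v = _mm_loadu_ps (p);
  float out[4];
  _mm_storeu_ps (out, v);
  return out[0] + out[1] + out[2] + out[3];
#else
  return p[0] + p[1] + p[2] + p[3];
#endif
}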
+ +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. */ + +/* Use semicolons to separate elements of a path. */ +#define PATH_SEPARATOR ';' + +#define HOST_EXECUTABLE_SUFFIX ".exe" + +/* System dependent initialization for collect2 + to tell system() to act like Unix. */ +#define COLLECT2_HOST_INITIALIZATION \ + do { __system_flags |= (__system_allow_multiple_cmds \ + | __system_emulate_chdir); } while (0) + +/* Define a version appropriate for DOS. */ +#undef XREF_FILE_NAME +#define XREF_FILE_NAME(xref_file, file) \ + do { \ + const char xref_ext[] = ".gxref"; \ + strcpy (xref_file, file); \ + s = basename (xref_file); \ + t = strchr (s, '.'); \ + if (t) \ + strcpy (t, xref_ext); \ + else \ + strcat (xref_file, xref_ext); \ + } while (0) + +#undef GCC_DRIVER_HOST_INITIALIZATION +#define GCC_DRIVER_HOST_INITIALIZATION \ + do { \ + /* If the environment variable DJDIR is not defined, then DJGPP is not \ + installed correctly and GCC will quickly become confused with the \ + default prefix settings. Report the problem now so the user doesn't \ + receive deceptive "file not found" error messages later. */ \ + char *djdir = getenv ("DJDIR"); \ + if (djdir == NULL) \ + { \ + /* DJDIR is automatically defined by the DJGPP environment config \ + file pointed to by the environment variable DJGPP. Examine DJGPP \ + to try and figure out what's wrong. */ \ + char *djgpp = getenv ("DJGPP"); \ + if (djgpp == NULL) \ + fatal ("environment variable DJGPP not defined"); \ + else if (access (djgpp, R_OK) == 0) \ + fatal ("environment variable DJGPP points to missing file '%s'", \ + djgpp); \ + else \ + fatal ("environment variable DJGPP points to corrupt file '%s'", \ + djgpp); \ + } \ + } while (0) + +/* Canonicalize paths containing '/dev/env/'; used in prefix.c. + _fixpath is a djgpp-specific function to canonicalize a path. + "/dev/env/DJDIR" evaluates to "c:/djgpp" if DJDIR is "c:/djgpp" for + example. It removes any trailing '/', so add it back. */ +/* We cannot free PATH below as it can point to string constant */ +#define UPDATE_PATH_HOST_CANONICALIZE(PATH) \ + if (memcmp ((PATH), "/dev/env/", sizeof("/dev/env/") - 1) == 0) \ + { \ + static char fixed_path[FILENAME_MAX + 1]; \ + \ + _fixpath ((PATH), fixed_path); \ + strcat (fixed_path, "/"); \ + (PATH) = xstrdup (fixed_path); \ + } diff --git a/gcc/config/i386/xm-mingw32.h b/gcc/config/i386/xm-mingw32.h new file mode 100644 index 000000000..e0dd3f372 --- /dev/null +++ b/gcc/config/i386/xm-mingw32.h @@ -0,0 +1,35 @@ +/* Configuration for GCC for hosting on Windows32. + using GNU tools and the Windows32 API Library. + Copyright (C) 1997, 1998, 1999, 2001, 2002, 2003, 2004, 2007 + Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 3, or (at your option) any later +version. + +GCC is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. */ + +#define HOST_EXECUTABLE_SUFFIX ".exe" + +#undef PATH_SEPARATOR +#define PATH_SEPARATOR ';' + +/* This is the name of the null device on windows. 
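/* Editor's sketch (not GCC code): how host configuration macros like the
   ones above are typically consumed.  The fall-back definitions here are
   only for building this snippet outside the compiler.  */

#include <stdio.h>

#ifndef HOST_EXECUTABLE_SUFFIX
# define HOST_EXECUTABLE_SUFFIX ""      /* Unix hosts add no suffix */
#endif
#ifndef PATH_SEPARATOR
# define PATH_SEPARATOR ':'
#endif

int
main (void)
{
  /* "cc1" becomes "cc1.exe" on DJGPP, MinGW and Cygwin hosts.  */
  printf ("driver runs: %s%s\n", "cc1", HOST_EXECUTABLE_SUFFIX);
  printf ("PATH entries are separated by '%c'\n", PATH_SEPARATOR);
  return 0;
}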
*/ +#define HOST_BIT_BUCKET "nul" + +/* The st_ino field of struct stat is always 0. */ +#define HOST_LACKS_INODE_NUMBERS + +/* MSVCRT does not support the "ll" format specifier for printing + "long long" values. Instead, we use "I64". */ +#define HOST_LONG_LONG_FORMAT "I64" diff --git a/gcc/config/i386/xmmintrin.h b/gcc/config/i386/xmmintrin.h new file mode 100644 index 000000000..5aefa9db0 --- /dev/null +++ b/gcc/config/i386/xmmintrin.h @@ -0,0 +1,1251 @@ +/* Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 + Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +/* Implemented from the specification included in the Intel C++ Compiler + User Guide and Reference, version 9.0. */ + +#ifndef _XMMINTRIN_H_INCLUDED +#define _XMMINTRIN_H_INCLUDED + +#ifndef __SSE__ +# error "SSE instruction set not enabled" +#else + +/* We need type definitions from the MMX header file. */ +#include + +/* Get _mm_malloc () and _mm_free (). */ +#include + +/* The Intel API is flexible enough that we must allow aliasing with other + vector types, and their scalar components. */ +typedef float __m128 __attribute__ ((__vector_size__ (16), __may_alias__)); + +/* Internal data types for implementing the intrinsics. */ +typedef float __v4sf __attribute__ ((__vector_size__ (16))); + +/* Create a selector for use with the SHUFPS instruction. */ +#define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \ + (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0)) + +/* Constants for use with _mm_prefetch. */ +enum _mm_hint +{ + _MM_HINT_T0 = 3, + _MM_HINT_T1 = 2, + _MM_HINT_T2 = 1, + _MM_HINT_NTA = 0 +}; + +/* Bits in the MXCSR. */ +#define _MM_EXCEPT_MASK 0x003f +#define _MM_EXCEPT_INVALID 0x0001 +#define _MM_EXCEPT_DENORM 0x0002 +#define _MM_EXCEPT_DIV_ZERO 0x0004 +#define _MM_EXCEPT_OVERFLOW 0x0008 +#define _MM_EXCEPT_UNDERFLOW 0x0010 +#define _MM_EXCEPT_INEXACT 0x0020 + +#define _MM_MASK_MASK 0x1f80 +#define _MM_MASK_INVALID 0x0080 +#define _MM_MASK_DENORM 0x0100 +#define _MM_MASK_DIV_ZERO 0x0200 +#define _MM_MASK_OVERFLOW 0x0400 +#define _MM_MASK_UNDERFLOW 0x0800 +#define _MM_MASK_INEXACT 0x1000 + +#define _MM_ROUND_MASK 0x6000 +#define _MM_ROUND_NEAREST 0x0000 +#define _MM_ROUND_DOWN 0x2000 +#define _MM_ROUND_UP 0x4000 +#define _MM_ROUND_TOWARD_ZERO 0x6000 + +#define _MM_FLUSH_ZERO_MASK 0x8000 +#define _MM_FLUSH_ZERO_ON 0x8000 +#define _MM_FLUSH_ZERO_OFF 0x0000 + +/* Create a vector of zeros. 
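/* Editor's sketch (not part of the header): _MM_SHUFFLE just packs four
   2-bit lane selectors into an immediate, and the _MM_ROUND_* masks above
   pick the rounding-control bits out of the MXCSR value returned by
   _mm_getcsr (declared further down in this header).  Compile with -msse
   (implied on x86-64).  */

#include <stdio.h>
#include <xmmintrin.h>

int
main (void)
{
  unsigned mode;

  /* Identity shuffle: selects lanes 3,2,1,0 -> immediate 0xE4.  */
  printf ("_MM_SHUFFLE(3,2,1,0) = 0x%X\n", _MM_SHUFFLE (3, 2, 1, 0));

  /* Inspect the current rounding-mode field of the MXCSR.  */
  mode = _mm_getcsr () & _MM_ROUND_MASK;
  printf ("rounding mode bits = 0x%X (0x0000 is round-to-nearest)\n", mode);
  return 0;
}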
*/ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_setzero_ps (void) +{ + return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f }; +} + +/* Perform the respective operation on the lower SPFP (single-precision + floating-point) values of A and B; the upper three SPFP values are + passed through from A. */ + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_add_ss (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_addss ((__v4sf)__A, (__v4sf)__B); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sub_ss (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_subss ((__v4sf)__A, (__v4sf)__B); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mul_ss (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_mulss ((__v4sf)__A, (__v4sf)__B); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_div_ss (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_divss ((__v4sf)__A, (__v4sf)__B); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sqrt_ss (__m128 __A) +{ + return (__m128) __builtin_ia32_sqrtss ((__v4sf)__A); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_rcp_ss (__m128 __A) +{ + return (__m128) __builtin_ia32_rcpss ((__v4sf)__A); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_rsqrt_ss (__m128 __A) +{ + return (__m128) __builtin_ia32_rsqrtss ((__v4sf)__A); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_min_ss (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_minss ((__v4sf)__A, (__v4sf)__B); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_max_ss (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_maxss ((__v4sf)__A, (__v4sf)__B); +} + +/* Perform the respective operation on the four SPFP values in A and B. 
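/* Editor's sketch (not part of the header): the _ss forms above operate on
   element 0 only and copy the other three lanes from the first operand.
   _mm_set_ps and _mm_storeu_ps are declared further down in this header.  */

#include <stdio.h>
#include <xmmintrin.h>

int
main (void)
{
  __m128 a = _mm_set_ps (4.0f, 3.0f, 2.0f, 1.0f);     /* lanes 0..3: 1,2,3,4 */
  __m128 b = _mm_set_ps (40.0f, 30.0f, 20.0f, 10.0f);
  float r[4];

  _mm_storeu_ps (r, _mm_add_ss (a, b));
  /* Prints 11 2 3 4: only lane 0 was added, lanes 1..3 come from A.  */
  printf ("%g %g %g %g\n", r[0], r[1], r[2], r[3]);
  return 0;
}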
*/ + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_add_ps (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_addps ((__v4sf)__A, (__v4sf)__B); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sub_ps (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_subps ((__v4sf)__A, (__v4sf)__B); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mul_ps (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_mulps ((__v4sf)__A, (__v4sf)__B); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_div_ps (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_divps ((__v4sf)__A, (__v4sf)__B); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sqrt_ps (__m128 __A) +{ + return (__m128) __builtin_ia32_sqrtps ((__v4sf)__A); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_rcp_ps (__m128 __A) +{ + return (__m128) __builtin_ia32_rcpps ((__v4sf)__A); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_rsqrt_ps (__m128 __A) +{ + return (__m128) __builtin_ia32_rsqrtps ((__v4sf)__A); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_min_ps (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_minps ((__v4sf)__A, (__v4sf)__B); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_max_ps (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_maxps ((__v4sf)__A, (__v4sf)__B); +} + +/* Perform logical bit-wise operations on 128-bit values. */ + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_and_ps (__m128 __A, __m128 __B) +{ + return __builtin_ia32_andps (__A, __B); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_andnot_ps (__m128 __A, __m128 __B) +{ + return __builtin_ia32_andnps (__A, __B); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_or_ps (__m128 __A, __m128 __B) +{ + return __builtin_ia32_orps (__A, __B); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_xor_ps (__m128 __A, __m128 __B) +{ + return __builtin_ia32_xorps (__A, __B); +} + +/* Perform a comparison on the lower SPFP values of A and B. If the + comparison is true, place a mask of all ones in the result, otherwise a + mask of zeros. The upper three SPFP values are passed through from A. 
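/* Editor's sketch (not part of the header): a classic use of the bit-wise
   operations above -- clearing the sign bits of four floats at once.
   _mm_set_ps, _mm_set1_ps and _mm_storeu_ps are declared further down in
   this header.  */

#include <stdio.h>
#include <xmmintrin.h>

int
main (void)
{
  __m128 x = _mm_set_ps (-4.0f, 3.0f, -2.0f, 1.0f);
  /* -0.0f has only the sign bit set; ANDNOT clears it in every lane.  */
  __m128 ax = _mm_andnot_ps (_mm_set1_ps (-0.0f), x);
  float r[4];

  _mm_storeu_ps (r, ax);
  printf ("%g %g %g %g\n", r[0], r[1], r[2], r[3]);   /* 1 2 3 4 */
  return 0;
}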
*/ + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpeq_ss (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_cmpeqss ((__v4sf)__A, (__v4sf)__B); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmplt_ss (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_cmpltss ((__v4sf)__A, (__v4sf)__B); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmple_ss (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_cmpless ((__v4sf)__A, (__v4sf)__B); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpgt_ss (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_movss ((__v4sf) __A, + (__v4sf) + __builtin_ia32_cmpltss ((__v4sf) __B, + (__v4sf) + __A)); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpge_ss (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_movss ((__v4sf) __A, + (__v4sf) + __builtin_ia32_cmpless ((__v4sf) __B, + (__v4sf) + __A)); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpneq_ss (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_cmpneqss ((__v4sf)__A, (__v4sf)__B); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpnlt_ss (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_cmpnltss ((__v4sf)__A, (__v4sf)__B); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpnle_ss (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_cmpnless ((__v4sf)__A, (__v4sf)__B); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpngt_ss (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_movss ((__v4sf) __A, + (__v4sf) + __builtin_ia32_cmpnltss ((__v4sf) __B, + (__v4sf) + __A)); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpnge_ss (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_movss ((__v4sf) __A, + (__v4sf) + __builtin_ia32_cmpnless ((__v4sf) __B, + (__v4sf) + __A)); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpord_ss (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_cmpordss ((__v4sf)__A, (__v4sf)__B); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpunord_ss (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_cmpunordss ((__v4sf)__A, (__v4sf)__B); +} + +/* Perform a comparison on the four SPFP values of A and B. For each + element, if the comparison is true, place a mask of all ones in the + result, otherwise a mask of zeros. 
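/* Editor's sketch (not part of the header): the packed comparisons
   described just above yield per-lane all-ones/all-zeros masks, which
   combine with the bit-wise operations to give a branch-free per-lane
   minimum.  _mm_set_ps and _mm_storeu_ps are declared further down in
   this header.  */

#include <stdio.h>
#include <xmmintrin.h>

static __m128
select_min (__m128 a, __m128 b)
{
  __m128 mask = _mm_cmplt_ps (a, b);               /* lanes where a < b   */
  return _mm_or_ps (_mm_and_ps (mask, a),          /* take a where a < b  */
                    _mm_andnot_ps (mask, b));      /* take b elsewhere    */
}

int
main (void)
{
  __m128 a = _mm_set_ps (8.0f, 1.0f, 6.0f, 3.0f);
  __m128 b = _mm_set_ps (5.0f, 7.0f, 2.0f, 4.0f);
  float r[4];

  _mm_storeu_ps (r, select_min (a, b));
  printf ("%g %g %g %g\n", r[0], r[1], r[2], r[3]);  /* 3 2 1 5 */
  return 0;
}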
*/ + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpeq_ps (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_cmpeqps ((__v4sf)__A, (__v4sf)__B); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmplt_ps (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_cmpltps ((__v4sf)__A, (__v4sf)__B); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmple_ps (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_cmpleps ((__v4sf)__A, (__v4sf)__B); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpgt_ps (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_cmpgtps ((__v4sf)__A, (__v4sf)__B); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpge_ps (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_cmpgeps ((__v4sf)__A, (__v4sf)__B); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpneq_ps (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_cmpneqps ((__v4sf)__A, (__v4sf)__B); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpnlt_ps (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_cmpnltps ((__v4sf)__A, (__v4sf)__B); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpnle_ps (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_cmpnleps ((__v4sf)__A, (__v4sf)__B); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpngt_ps (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_cmpngtps ((__v4sf)__A, (__v4sf)__B); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpnge_ps (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_cmpngeps ((__v4sf)__A, (__v4sf)__B); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpord_ps (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_cmpordps ((__v4sf)__A, (__v4sf)__B); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpunord_ps (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_cmpunordps ((__v4sf)__A, (__v4sf)__B); +} + +/* Compare the lower SPFP values of A and B and return 1 if true + and 0 if false. 
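+
+   For example (editorial sketch; operands are placeholders):
+
+     int lt = _mm_comilt_ss (a, b);
+
+   sets lt to 1 when the lowest elements satisfy a < b and to 0 otherwise.
+   The _mm_ucomi*_ss variants below perform the same tests with the
+   unordered comparison, which does not raise the invalid exception for
+   quiet NaN operands.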
*/ + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comieq_ss (__m128 __A, __m128 __B) +{ + return __builtin_ia32_comieq ((__v4sf)__A, (__v4sf)__B); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comilt_ss (__m128 __A, __m128 __B) +{ + return __builtin_ia32_comilt ((__v4sf)__A, (__v4sf)__B); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comile_ss (__m128 __A, __m128 __B) +{ + return __builtin_ia32_comile ((__v4sf)__A, (__v4sf)__B); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comigt_ss (__m128 __A, __m128 __B) +{ + return __builtin_ia32_comigt ((__v4sf)__A, (__v4sf)__B); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comige_ss (__m128 __A, __m128 __B) +{ + return __builtin_ia32_comige ((__v4sf)__A, (__v4sf)__B); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comineq_ss (__m128 __A, __m128 __B) +{ + return __builtin_ia32_comineq ((__v4sf)__A, (__v4sf)__B); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_ucomieq_ss (__m128 __A, __m128 __B) +{ + return __builtin_ia32_ucomieq ((__v4sf)__A, (__v4sf)__B); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_ucomilt_ss (__m128 __A, __m128 __B) +{ + return __builtin_ia32_ucomilt ((__v4sf)__A, (__v4sf)__B); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_ucomile_ss (__m128 __A, __m128 __B) +{ + return __builtin_ia32_ucomile ((__v4sf)__A, (__v4sf)__B); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_ucomigt_ss (__m128 __A, __m128 __B) +{ + return __builtin_ia32_ucomigt ((__v4sf)__A, (__v4sf)__B); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_ucomige_ss (__m128 __A, __m128 __B) +{ + return __builtin_ia32_ucomige ((__v4sf)__A, (__v4sf)__B); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_ucomineq_ss (__m128 __A, __m128 __B) +{ + return __builtin_ia32_ucomineq ((__v4sf)__A, (__v4sf)__B); +} + +/* Convert the lower SPFP value to a 32-bit integer according to the current + rounding mode. */ +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtss_si32 (__m128 __A) +{ + return __builtin_ia32_cvtss2si ((__v4sf) __A); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_ss2si (__m128 __A) +{ + return _mm_cvtss_si32 (__A); +} + +#ifdef __x86_64__ +/* Convert the lower SPFP value to a 32-bit integer according to the + current rounding mode. */ + +/* Intel intrinsic. */ +extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtss_si64 (__m128 __A) +{ + return __builtin_ia32_cvtss2si64 ((__v4sf) __A); +} + +/* Microsoft intrinsic. */ +extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtss_si64x (__m128 __A) +{ + return __builtin_ia32_cvtss2si64 ((__v4sf) __A); +} +#endif + +/* Convert the two lower SPFP values to 32-bit integers according to the + current rounding mode. Return the integers in packed form. 
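+
+   Editorial sketch (illustrative values): these conversions honour the
+   current rounding mode.  Under the default round-to-nearest-even mode
+
+     int i = _mm_cvtss_si32 (_mm_set_ss (2.5f));
+     int j = _mm_cvtss_si32 (_mm_set_ss (3.5f));
+
+   give i == 2 and j == 4, whereas the _mm_cvtt* variants below always
+   truncate toward zero.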
*/ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtps_pi32 (__m128 __A) +{ + return (__m64) __builtin_ia32_cvtps2pi ((__v4sf) __A); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_ps2pi (__m128 __A) +{ + return _mm_cvtps_pi32 (__A); +} + +/* Truncate the lower SPFP value to a 32-bit integer. */ +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttss_si32 (__m128 __A) +{ + return __builtin_ia32_cvttss2si ((__v4sf) __A); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtt_ss2si (__m128 __A) +{ + return _mm_cvttss_si32 (__A); +} + +#ifdef __x86_64__ +/* Truncate the lower SPFP value to a 32-bit integer. */ + +/* Intel intrinsic. */ +extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttss_si64 (__m128 __A) +{ + return __builtin_ia32_cvttss2si64 ((__v4sf) __A); +} + +/* Microsoft intrinsic. */ +extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttss_si64x (__m128 __A) +{ + return __builtin_ia32_cvttss2si64 ((__v4sf) __A); +} +#endif + +/* Truncate the two lower SPFP values to 32-bit integers. Return the + integers in packed form. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttps_pi32 (__m128 __A) +{ + return (__m64) __builtin_ia32_cvttps2pi ((__v4sf) __A); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtt_ps2pi (__m128 __A) +{ + return _mm_cvttps_pi32 (__A); +} + +/* Convert B to a SPFP value and insert it as element zero in A. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsi32_ss (__m128 __A, int __B) +{ + return (__m128) __builtin_ia32_cvtsi2ss ((__v4sf) __A, __B); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_si2ss (__m128 __A, int __B) +{ + return _mm_cvtsi32_ss (__A, __B); +} + +#ifdef __x86_64__ +/* Convert B to a SPFP value and insert it as element zero in A. */ + +/* Intel intrinsic. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsi64_ss (__m128 __A, long long __B) +{ + return (__m128) __builtin_ia32_cvtsi642ss ((__v4sf) __A, __B); +} + +/* Microsoft intrinsic. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsi64x_ss (__m128 __A, long long __B) +{ + return (__m128) __builtin_ia32_cvtsi642ss ((__v4sf) __A, __B); +} +#endif + +/* Convert the two 32-bit values in B to SPFP form and insert them + as the two lower elements in A. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtpi32_ps (__m128 __A, __m64 __B) +{ + return (__m128) __builtin_ia32_cvtpi2ps ((__v4sf) __A, (__v2si)__B); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_pi2ps (__m128 __A, __m64 __B) +{ + return _mm_cvtpi32_ps (__A, __B); +} + +/* Convert the four signed 16-bit values in A to SPFP form. 
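+
+   Editorial sketch: because the __m64 argument lives in the MMX register
+   file, _mm_empty () from <mmintrin.h> should be issued before any later
+   x87 floating-point code, e.g.
+
+     __m64  w = _mm_set_pi16 (4, 3, 2, 1);
+     __m128 f = _mm_cvtpi16_ps (w);
+     _mm_empty ();
+
+   where f holds { 1.0, 2.0, 3.0, 4.0 }.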
*/ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtpi16_ps (__m64 __A) +{ + __v4hi __sign; + __v2si __hisi, __losi; + __v4sf __zero, __ra, __rb; + + /* This comparison against zero gives us a mask that can be used to + fill in the missing sign bits in the unpack operations below, so + that we get signed values after unpacking. */ + __sign = __builtin_ia32_pcmpgtw ((__v4hi)0LL, (__v4hi)__A); + + /* Convert the four words to doublewords. */ + __losi = (__v2si) __builtin_ia32_punpcklwd ((__v4hi)__A, __sign); + __hisi = (__v2si) __builtin_ia32_punpckhwd ((__v4hi)__A, __sign); + + /* Convert the doublewords to floating point two at a time. */ + __zero = (__v4sf) _mm_setzero_ps (); + __ra = __builtin_ia32_cvtpi2ps (__zero, __losi); + __rb = __builtin_ia32_cvtpi2ps (__ra, __hisi); + + return (__m128) __builtin_ia32_movlhps (__ra, __rb); +} + +/* Convert the four unsigned 16-bit values in A to SPFP form. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtpu16_ps (__m64 __A) +{ + __v2si __hisi, __losi; + __v4sf __zero, __ra, __rb; + + /* Convert the four words to doublewords. */ + __losi = (__v2si) __builtin_ia32_punpcklwd ((__v4hi)__A, (__v4hi)0LL); + __hisi = (__v2si) __builtin_ia32_punpckhwd ((__v4hi)__A, (__v4hi)0LL); + + /* Convert the doublewords to floating point two at a time. */ + __zero = (__v4sf) _mm_setzero_ps (); + __ra = __builtin_ia32_cvtpi2ps (__zero, __losi); + __rb = __builtin_ia32_cvtpi2ps (__ra, __hisi); + + return (__m128) __builtin_ia32_movlhps (__ra, __rb); +} + +/* Convert the low four signed 8-bit values in A to SPFP form. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtpi8_ps (__m64 __A) +{ + __v8qi __sign; + + /* This comparison against zero gives us a mask that can be used to + fill in the missing sign bits in the unpack operations below, so + that we get signed values after unpacking. */ + __sign = __builtin_ia32_pcmpgtb ((__v8qi)0LL, (__v8qi)__A); + + /* Convert the four low bytes to words. */ + __A = (__m64) __builtin_ia32_punpcklbw ((__v8qi)__A, __sign); + + return _mm_cvtpi16_ps(__A); +} + +/* Convert the low four unsigned 8-bit values in A to SPFP form. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtpu8_ps(__m64 __A) +{ + __A = (__m64) __builtin_ia32_punpcklbw ((__v8qi)__A, (__v8qi)0LL); + return _mm_cvtpu16_ps(__A); +} + +/* Convert the four signed 32-bit values in A and B to SPFP form. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtpi32x2_ps(__m64 __A, __m64 __B) +{ + __v4sf __zero = (__v4sf) _mm_setzero_ps (); + __v4sf __sfa = __builtin_ia32_cvtpi2ps (__zero, (__v2si)__A); + __v4sf __sfb = __builtin_ia32_cvtpi2ps (__sfa, (__v2si)__B); + return (__m128) __builtin_ia32_movlhps (__sfa, __sfb); +} + +/* Convert the four SPFP values in A to four signed 16-bit integers. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtps_pi16(__m128 __A) +{ + __v4sf __hisf = (__v4sf)__A; + __v4sf __losf = __builtin_ia32_movhlps (__hisf, __hisf); + __v2si __hisi = __builtin_ia32_cvtps2pi (__hisf); + __v2si __losi = __builtin_ia32_cvtps2pi (__losf); + return (__m64) __builtin_ia32_packssdw (__hisi, __losi); +} + +/* Convert the four SPFP values in A to four signed 8-bit integers. 
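+
+   Editorial sketch (illustrative values): the packing steps use signed
+   saturation, e.g.
+
+     __m64 p = _mm_cvtps_pi16 (_mm_set_ps (70000.0f, 3.0f, 2.0f, 1.0f));
+
+   leaves p holding { 1, 2, 3, 32767 }, the last element saturated; as with
+   the other __m64 results, follow up with _mm_empty ().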
*/ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtps_pi8(__m128 __A) +{ + __v4hi __tmp = (__v4hi) _mm_cvtps_pi16 (__A); + return (__m64) __builtin_ia32_packsswb (__tmp, (__v4hi)0LL); +} + +/* Selects four specific SPFP values from A and B based on MASK. */ +#ifdef __OPTIMIZE__ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_shuffle_ps (__m128 __A, __m128 __B, int const __mask) +{ + return (__m128) __builtin_ia32_shufps ((__v4sf)__A, (__v4sf)__B, __mask); +} +#else +#define _mm_shuffle_ps(A, B, MASK) \ + ((__m128) __builtin_ia32_shufps ((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), (int)(MASK))) +#endif + +/* Selects and interleaves the upper two SPFP values from A and B. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_unpackhi_ps (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_unpckhps ((__v4sf)__A, (__v4sf)__B); +} + +/* Selects and interleaves the lower two SPFP values from A and B. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_unpacklo_ps (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_unpcklps ((__v4sf)__A, (__v4sf)__B); +} + +/* Sets the upper two SPFP values with 64-bits of data loaded from P; + the lower two values are passed through from A. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_loadh_pi (__m128 __A, __m64 const *__P) +{ + return (__m128) __builtin_ia32_loadhps ((__v4sf)__A, (const __v2sf *)__P); +} + +/* Stores the upper two SPFP values of A into P. */ +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_storeh_pi (__m64 *__P, __m128 __A) +{ + __builtin_ia32_storehps ((__v2sf *)__P, (__v4sf)__A); +} + +/* Moves the upper two values of B into the lower two values of A. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_movehl_ps (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_movhlps ((__v4sf)__A, (__v4sf)__B); +} + +/* Moves the lower two values of B into the upper two values of A. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_movelh_ps (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_movlhps ((__v4sf)__A, (__v4sf)__B); +} + +/* Sets the lower two SPFP values with 64-bits of data loaded from P; + the upper two values are passed through from A. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_loadl_pi (__m128 __A, __m64 const *__P) +{ + return (__m128) __builtin_ia32_loadlps ((__v4sf)__A, (const __v2sf *)__P); +} + +/* Stores the lower two SPFP values of A into P. */ +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_storel_pi (__m64 *__P, __m128 __A) +{ + __builtin_ia32_storelps ((__v2sf *)__P, (__v4sf)__A); +} + +/* Creates a 4-bit mask from the most significant bits of the SPFP values. */ +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_movemask_ps (__m128 __A) +{ + return __builtin_ia32_movmskps ((__v4sf)__A); +} + +/* Return the contents of the control register. */ +extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_getcsr (void) +{ + return __builtin_ia32_stmxcsr (); +} + +/* Read exception bits from the control register. 
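+
+   Editorial sketch (relies on the _MM_EXCEPT_* and _MM_ROUND_* masks
+   defined earlier in this header):
+
+     if (_MM_GET_EXCEPTION_STATE () & _MM_EXCEPT_DIV_ZERO)
+       _MM_SET_EXCEPTION_STATE (0);
+     _MM_SET_ROUNDING_MODE (_MM_ROUND_TOWARD_ZERO);
+
+   clears the sticky exception flags after a division by zero and switches
+   the rounding mode to truncation.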
*/ +extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_MM_GET_EXCEPTION_STATE (void) +{ + return _mm_getcsr() & _MM_EXCEPT_MASK; +} + +extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_MM_GET_EXCEPTION_MASK (void) +{ + return _mm_getcsr() & _MM_MASK_MASK; +} + +extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_MM_GET_ROUNDING_MODE (void) +{ + return _mm_getcsr() & _MM_ROUND_MASK; +} + +extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_MM_GET_FLUSH_ZERO_MODE (void) +{ + return _mm_getcsr() & _MM_FLUSH_ZERO_MASK; +} + +/* Set the control register to I. */ +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_setcsr (unsigned int __I) +{ + __builtin_ia32_ldmxcsr (__I); +} + +/* Set exception bits in the control register. */ +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_MM_SET_EXCEPTION_STATE(unsigned int __mask) +{ + _mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | __mask); +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_MM_SET_EXCEPTION_MASK (unsigned int __mask) +{ + _mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | __mask); +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_MM_SET_ROUNDING_MODE (unsigned int __mode) +{ + _mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | __mode); +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_MM_SET_FLUSH_ZERO_MODE (unsigned int __mode) +{ + _mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | __mode); +} + +/* Create a vector with element 0 as F and the rest zero. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set_ss (float __F) +{ + return __extension__ (__m128)(__v4sf){ __F, 0.0f, 0.0f, 0.0f }; +} + +/* Create a vector with all four elements equal to F. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set1_ps (float __F) +{ + return __extension__ (__m128)(__v4sf){ __F, __F, __F, __F }; +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set_ps1 (float __F) +{ + return _mm_set1_ps (__F); +} + +/* Create a vector with element 0 as *P and the rest zero. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_load_ss (float const *__P) +{ + return _mm_set_ss (*__P); +} + +/* Create a vector with all four elements equal to *P. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_load1_ps (float const *__P) +{ + return _mm_set1_ps (*__P); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_load_ps1 (float const *__P) +{ + return _mm_load1_ps (__P); +} + +/* Load four SPFP values from P. The address must be 16-byte aligned. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_load_ps (float const *__P) +{ + return (__m128) *(__v4sf *)__P; +} + +/* Load four SPFP values from P. The address need not be 16-byte aligned. 
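+
+   Editorial sketch:
+
+     float buf[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
+     __m128 v = _mm_loadu_ps (buf + 1);
+
+   loads { 1.0, 2.0, 3.0, 4.0 } from the unaligned address, whereas
+   _mm_load_ps above would require the pointer to be 16-byte aligned.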
*/ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_loadu_ps (float const *__P) +{ + return (__m128) __builtin_ia32_loadups (__P); +} + +/* Load four SPFP values in reverse order. The address must be aligned. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_loadr_ps (float const *__P) +{ + __v4sf __tmp = *(__v4sf *)__P; + return (__m128) __builtin_ia32_shufps (__tmp, __tmp, _MM_SHUFFLE (0,1,2,3)); +} + +/* Create the vector [Z Y X W]. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set_ps (const float __Z, const float __Y, const float __X, const float __W) +{ + return __extension__ (__m128)(__v4sf){ __W, __X, __Y, __Z }; +} + +/* Create the vector [W X Y Z]. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_setr_ps (float __Z, float __Y, float __X, float __W) +{ + return __extension__ (__m128)(__v4sf){ __Z, __Y, __X, __W }; +} + +/* Stores the lower SPFP value. */ +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_store_ss (float *__P, __m128 __A) +{ + *__P = __builtin_ia32_vec_ext_v4sf ((__v4sf)__A, 0); +} + +extern __inline float __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtss_f32 (__m128 __A) +{ + return __builtin_ia32_vec_ext_v4sf ((__v4sf)__A, 0); +} + +/* Store four SPFP values. The address must be 16-byte aligned. */ +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_store_ps (float *__P, __m128 __A) +{ + *(__v4sf *)__P = (__v4sf)__A; +} + +/* Store four SPFP values. The address need not be 16-byte aligned. */ +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_storeu_ps (float *__P, __m128 __A) +{ + __builtin_ia32_storeups (__P, (__v4sf)__A); +} + +/* Store the lower SPFP value across four words. */ +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_store1_ps (float *__P, __m128 __A) +{ + __v4sf __va = (__v4sf)__A; + __v4sf __tmp = __builtin_ia32_shufps (__va, __va, _MM_SHUFFLE (0,0,0,0)); + _mm_storeu_ps (__P, __tmp); +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_store_ps1 (float *__P, __m128 __A) +{ + _mm_store1_ps (__P, __A); +} + +/* Store four SPFP values in reverse order. The address must be aligned. */ +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_storer_ps (float *__P, __m128 __A) +{ + __v4sf __va = (__v4sf)__A; + __v4sf __tmp = __builtin_ia32_shufps (__va, __va, _MM_SHUFFLE (0,1,2,3)); + _mm_store_ps (__P, __tmp); +} + +/* Sets the low SPFP value of A from the low value of B. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_move_ss (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_movss ((__v4sf)__A, (__v4sf)__B); +} + +/* Extracts one of the four words of A. The selector N must be immediate. 
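+
+   Editorial sketch:
+
+     __m64 v = _mm_set_pi16 (40, 30, 20, 10);
+     int   w = _mm_extract_pi16 (v, 2);
+
+   sets w to 30 (word 2 of v, zero-extended).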
*/ +#ifdef __OPTIMIZE__ +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_extract_pi16 (__m64 const __A, int const __N) +{ + return __builtin_ia32_vec_ext_v4hi ((__v4hi)__A, __N); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pextrw (__m64 const __A, int const __N) +{ + return _mm_extract_pi16 (__A, __N); +} +#else +#define _mm_extract_pi16(A, N) \ + ((int) __builtin_ia32_vec_ext_v4hi ((__v4hi)(__m64)(A), (int)(N))) + +#define _m_pextrw(A, N) _mm_extract_pi16(A, N) +#endif + +/* Inserts word D into one of four words of A. The selector N must be + immediate. */ +#ifdef __OPTIMIZE__ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_insert_pi16 (__m64 const __A, int const __D, int const __N) +{ + return (__m64) __builtin_ia32_vec_set_v4hi ((__v4hi)__A, __D, __N); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pinsrw (__m64 const __A, int const __D, int const __N) +{ + return _mm_insert_pi16 (__A, __D, __N); +} +#else +#define _mm_insert_pi16(A, D, N) \ + ((__m64) __builtin_ia32_vec_set_v4hi ((__v4hi)(__m64)(A), \ + (int)(D), (int)(N))) + +#define _m_pinsrw(A, D, N) _mm_insert_pi16(A, D, N) +#endif + +/* Compute the element-wise maximum of signed 16-bit values. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_max_pi16 (__m64 __A, __m64 __B) +{ + return (__m64) __builtin_ia32_pmaxsw ((__v4hi)__A, (__v4hi)__B); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pmaxsw (__m64 __A, __m64 __B) +{ + return _mm_max_pi16 (__A, __B); +} + +/* Compute the element-wise maximum of unsigned 8-bit values. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_max_pu8 (__m64 __A, __m64 __B) +{ + return (__m64) __builtin_ia32_pmaxub ((__v8qi)__A, (__v8qi)__B); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pmaxub (__m64 __A, __m64 __B) +{ + return _mm_max_pu8 (__A, __B); +} + +/* Compute the element-wise minimum of signed 16-bit values. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_min_pi16 (__m64 __A, __m64 __B) +{ + return (__m64) __builtin_ia32_pminsw ((__v4hi)__A, (__v4hi)__B); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pminsw (__m64 __A, __m64 __B) +{ + return _mm_min_pi16 (__A, __B); +} + +/* Compute the element-wise minimum of unsigned 8-bit values. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_min_pu8 (__m64 __A, __m64 __B) +{ + return (__m64) __builtin_ia32_pminub ((__v8qi)__A, (__v8qi)__B); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pminub (__m64 __A, __m64 __B) +{ + return _mm_min_pu8 (__A, __B); +} + +/* Create an 8-bit mask of the signs of 8-bit values. 
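+
+   Editorial sketch:
+
+     __m64 v = _mm_set_pi8 (-1, 2, 3, 4, 5, 6, 7, -8);
+     int   m = _mm_movemask_pi8 (v);
+
+   sets m to 0x81: only bytes 7 and 0 of v have their sign bit set.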
*/ +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_movemask_pi8 (__m64 __A) +{ + return __builtin_ia32_pmovmskb ((__v8qi)__A); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pmovmskb (__m64 __A) +{ + return _mm_movemask_pi8 (__A); +} + +/* Multiply four unsigned 16-bit values in A by four unsigned 16-bit values + in B and produce the high 16 bits of the 32-bit results. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mulhi_pu16 (__m64 __A, __m64 __B) +{ + return (__m64) __builtin_ia32_pmulhuw ((__v4hi)__A, (__v4hi)__B); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pmulhuw (__m64 __A, __m64 __B) +{ + return _mm_mulhi_pu16 (__A, __B); +} + +/* Return a combination of the four 16-bit values in A. The selector + must be an immediate. */ +#ifdef __OPTIMIZE__ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_shuffle_pi16 (__m64 __A, int const __N) +{ + return (__m64) __builtin_ia32_pshufw ((__v4hi)__A, __N); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pshufw (__m64 __A, int const __N) +{ + return _mm_shuffle_pi16 (__A, __N); +} +#else +#define _mm_shuffle_pi16(A, N) \ + ((__m64) __builtin_ia32_pshufw ((__v4hi)(__m64)(A), (int)(N))) + +#define _m_pshufw(A, N) _mm_shuffle_pi16 (A, N) +#endif + +/* Conditionally store byte elements of A into P. The high bit of each + byte in the selector N determines whether the corresponding byte from + A is stored. */ +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskmove_si64 (__m64 __A, __m64 __N, char *__P) +{ + __builtin_ia32_maskmovq ((__v8qi)__A, (__v8qi)__N, __P); +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_maskmovq (__m64 __A, __m64 __N, char *__P) +{ + _mm_maskmove_si64 (__A, __N, __P); +} + +/* Compute the rounded averages of the unsigned 8-bit values in A and B. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_avg_pu8 (__m64 __A, __m64 __B) +{ + return (__m64) __builtin_ia32_pavgb ((__v8qi)__A, (__v8qi)__B); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pavgb (__m64 __A, __m64 __B) +{ + return _mm_avg_pu8 (__A, __B); +} + +/* Compute the rounded averages of the unsigned 16-bit values in A and B. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_avg_pu16 (__m64 __A, __m64 __B) +{ + return (__m64) __builtin_ia32_pavgw ((__v4hi)__A, (__v4hi)__B); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pavgw (__m64 __A, __m64 __B) +{ + return _mm_avg_pu16 (__A, __B); +} + +/* Compute the sum of the absolute differences of the unsigned 8-bit + values in A and B. Return the value in the lower 16-bit word; the + upper words are cleared. 
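+
+   Editorial sketch:
+
+     __m64 a = _mm_set1_pi8 (10);
+     __m64 b = _mm_set1_pi8 (7);
+     __m64 s = _mm_sad_pu8 (a, b);
+
+   leaves 8 * |10 - 7| == 24 in the low word of s.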
*/ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sad_pu8 (__m64 __A, __m64 __B) +{ + return (__m64) __builtin_ia32_psadbw ((__v8qi)__A, (__v8qi)__B); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_psadbw (__m64 __A, __m64 __B) +{ + return _mm_sad_pu8 (__A, __B); +} + +/* Loads one cache line from address P to a location "closer" to the + processor. The selector I specifies the type of prefetch operation. */ +#ifdef __OPTIMIZE__ +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_prefetch (const void *__P, enum _mm_hint __I) +{ + __builtin_prefetch (__P, 0, __I); +} +#else +#define _mm_prefetch(P, I) \ + __builtin_prefetch ((P), 0, (I)) +#endif + +/* Stores the data in A to the address P without polluting the caches. */ +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_stream_pi (__m64 *__P, __m64 __A) +{ + __builtin_ia32_movntq ((unsigned long long *)__P, (unsigned long long)__A); +} + +/* Likewise. The address must be 16-byte aligned. */ +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_stream_ps (float *__P, __m128 __A) +{ + __builtin_ia32_movntps (__P, (__v4sf)__A); +} + +/* Guarantees that every preceding store is globally visible before + any subsequent store. */ +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sfence (void) +{ + __builtin_ia32_sfence (); +} + +/* The execution of the next instruction is delayed by an implementation + specific amount of time. The instruction does not modify the + architectural state. */ +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_pause (void) +{ + __asm__ __volatile__ ("rep; nop" : : ); +} + +/* Transpose the 4x4 matrix composed of row[0-3]. */ +#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \ +do { \ + __v4sf __r0 = (row0), __r1 = (row1), __r2 = (row2), __r3 = (row3); \ + __v4sf __t0 = __builtin_ia32_unpcklps (__r0, __r1); \ + __v4sf __t1 = __builtin_ia32_unpcklps (__r2, __r3); \ + __v4sf __t2 = __builtin_ia32_unpckhps (__r0, __r1); \ + __v4sf __t3 = __builtin_ia32_unpckhps (__r2, __r3); \ + (row0) = __builtin_ia32_movlhps (__t0, __t1); \ + (row1) = __builtin_ia32_movhlps (__t1, __t0); \ + (row2) = __builtin_ia32_movlhps (__t2, __t3); \ + (row3) = __builtin_ia32_movhlps (__t3, __t2); \ +} while (0) + +/* For backward source compatibility. */ +#ifdef __SSE2__ +# include +#endif + +#endif /* __SSE__ */ +#endif /* _XMMINTRIN_H_INCLUDED */ diff --git a/gcc/config/i386/xopintrin.h b/gcc/config/i386/xopintrin.h new file mode 100644 index 000000000..3ebcb4b9f --- /dev/null +++ b/gcc/config/i386/xopintrin.h @@ -0,0 +1,835 @@ +/* Copyright (C) 2007, 2008, 2009, 2010 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+   <http://www.gnu.org/licenses/>. */
+
+#ifndef _X86INTRIN_H_INCLUDED
+# error "Never use <xopintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef _XOPMMINTRIN_H_INCLUDED
+#define _XOPMMINTRIN_H_INCLUDED
+
+#ifndef __XOP__
+# error "XOP instruction set not enabled"
+#else
+
+#include <fma4intrin.h>
+
+/* Integer multiply/add instructions. */
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maccs_epi16(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i) __builtin_ia32_vpmacssww ((__v8hi)__A,(__v8hi)__B, (__v8hi)__C);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_macc_epi16(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i) __builtin_ia32_vpmacsww ((__v8hi)__A, (__v8hi)__B, (__v8hi)__C);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maccsd_epi16(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i) __builtin_ia32_vpmacsswd ((__v8hi)__A, (__v8hi)__B, (__v4si)__C);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maccd_epi16(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i) __builtin_ia32_vpmacswd ((__v8hi)__A, (__v8hi)__B, (__v4si)__C);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maccs_epi32(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i) __builtin_ia32_vpmacssdd ((__v4si)__A, (__v4si)__B, (__v4si)__C);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_macc_epi32(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i) __builtin_ia32_vpmacsdd ((__v4si)__A, (__v4si)__B, (__v4si)__C);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maccslo_epi32(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i) __builtin_ia32_vpmacssdql ((__v4si)__A, (__v4si)__B, (__v2di)__C);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_macclo_epi32(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i) __builtin_ia32_vpmacsdql ((__v4si)__A, (__v4si)__B, (__v2di)__C);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maccshi_epi32(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i) __builtin_ia32_vpmacssdqh ((__v4si)__A, (__v4si)__B, (__v2di)__C);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_macchi_epi32(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i) __builtin_ia32_vpmacsdqh ((__v4si)__A, (__v4si)__B, (__v2di)__C);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maddsd_epi16(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i) __builtin_ia32_vpmadcsswd ((__v8hi)__A,(__v8hi)__B,(__v4si)__C);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maddd_epi16(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i) __builtin_ia32_vpmadcswd
((__v8hi)__A,(__v8hi)__B,(__v4si)__C); +} + +/* Packed Integer Horizontal Add and Subtract */ +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_haddw_epi8(__m128i __A) +{ + return (__m128i) __builtin_ia32_vphaddbw ((__v16qi)__A); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_haddd_epi8(__m128i __A) +{ + return (__m128i) __builtin_ia32_vphaddbd ((__v16qi)__A); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_haddq_epi8(__m128i __A) +{ + return (__m128i) __builtin_ia32_vphaddbq ((__v16qi)__A); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_haddd_epi16(__m128i __A) +{ + return (__m128i) __builtin_ia32_vphaddwd ((__v8hi)__A); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_haddq_epi16(__m128i __A) +{ + return (__m128i) __builtin_ia32_vphaddwq ((__v8hi)__A); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_haddq_epi32(__m128i __A) +{ + return (__m128i) __builtin_ia32_vphadddq ((__v4si)__A); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_haddw_epu8(__m128i __A) +{ + return (__m128i) __builtin_ia32_vphaddubw ((__v16qi)__A); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_haddd_epu8(__m128i __A) +{ + return (__m128i) __builtin_ia32_vphaddubd ((__v16qi)__A); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_haddq_epu8(__m128i __A) +{ + return (__m128i) __builtin_ia32_vphaddubq ((__v16qi)__A); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_haddd_epu16(__m128i __A) +{ + return (__m128i) __builtin_ia32_vphadduwd ((__v8hi)__A); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_haddq_epu16(__m128i __A) +{ + return (__m128i) __builtin_ia32_vphadduwq ((__v8hi)__A); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_haddq_epu32(__m128i __A) +{ + return (__m128i) __builtin_ia32_vphaddudq ((__v4si)__A); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_hsubw_epi8(__m128i __A) +{ + return (__m128i) __builtin_ia32_vphsubbw ((__v16qi)__A); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_hsubd_epi16(__m128i __A) +{ + return (__m128i) __builtin_ia32_vphsubwd ((__v8hi)__A); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_hsubq_epi32(__m128i __A) +{ + return (__m128i) __builtin_ia32_vphsubdq ((__v4si)__A); +} + +/* Vector conditional move and permute */ + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmov_si128(__m128i __A, __m128i __B, __m128i __C) +{ + return (__m128i) __builtin_ia32_vpcmov (__A, __B, __C); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_perm_epi8(__m128i __A, __m128i __B, __m128i __C) +{ + return (__m128i) __builtin_ia32_vpperm ((__v16qi)__A, (__v16qi)__B, (__v16qi)__C); +} + +/* Packed Integer Rotates and Shifts + Rotates - Non-Immediate form */ + +extern __inline __m128i __attribute__((__gnu_inline__, 
__always_inline__, __artificial__)) +_mm_rot_epi8(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vprotb ((__v16qi)__A, (__v16qi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_rot_epi16(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vprotw ((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_rot_epi32(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vprotd ((__v4si)__A, (__v4si)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_rot_epi64(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vprotq ((__v2di)__A, (__v2di)__B); +} + +/* Rotates - Immediate form */ + +#ifdef __OPTIMIZE__ +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_roti_epi8(__m128i __A, const int __B) +{ + return (__m128i) __builtin_ia32_vprotbi ((__v16qi)__A, __B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_roti_epi16(__m128i __A, const int __B) +{ + return (__m128i) __builtin_ia32_vprotwi ((__v8hi)__A, __B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_roti_epi32(__m128i __A, const int __B) +{ + return (__m128i) __builtin_ia32_vprotdi ((__v4si)__A, __B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_roti_epi64(__m128i __A, const int __B) +{ + return (__m128i) __builtin_ia32_vprotqi ((__v2di)__A, __B); +} +#else +#define _mm_roti_epi8(A, N) \ + ((__m128i) __builtin_ia32_vprotbi ((__v16qi)(__m128i)(A), (int)(N))) +#define _mm_roti_epi16(A, N) \ + ((__m128i) __builtin_ia32_vprotwi ((__v8hi)(__m128i)(A), (int)(N))) +#define _mm_roti_epi32(A, N) \ + ((__m128i) __builtin_ia32_vprotdi ((__v4si)(__m128i)(A), (int)(N))) +#define _mm_roti_epi64(A, N) \ + ((__m128i) __builtin_ia32_vprotqi ((__v2di)(__m128i)(A), (int)(N))) +#endif + +/* Shifts */ + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_shl_epi8(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpshlb ((__v16qi)__A, (__v16qi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_shl_epi16(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpshlw ((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_shl_epi32(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpshld ((__v4si)__A, (__v4si)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_shl_epi64(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpshlq ((__v2di)__A, (__v2di)__B); +} + + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sha_epi8(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpshab ((__v16qi)__A, (__v16qi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sha_epi16(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpshaw ((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sha_epi32(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpshad 
((__v4si)__A, (__v4si)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sha_epi64(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpshaq ((__v2di)__A, (__v2di)__B); +} + +/* Compare and Predicate Generation + pcom (integer, unsinged bytes) */ + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comlt_epu8(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomltub ((__v16qi)__A, (__v16qi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comle_epu8(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomleub ((__v16qi)__A, (__v16qi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comgt_epu8(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomgtub ((__v16qi)__A, (__v16qi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comge_epu8(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomgeub ((__v16qi)__A, (__v16qi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comeq_epu8(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomequb ((__v16qi)__A, (__v16qi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comneq_epu8(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomnequb ((__v16qi)__A, (__v16qi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comfalse_epu8(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomfalseub ((__v16qi)__A, (__v16qi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comtrue_epu8(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomtrueub ((__v16qi)__A, (__v16qi)__B); +} + +/*pcom (integer, unsinged words) */ + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comlt_epu16(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomltuw ((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comle_epu16(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomleuw ((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comgt_epu16(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomgtuw ((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comge_epu16(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomgeuw ((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comeq_epu16(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomequw ((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comneq_epu16(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomnequw ((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comfalse_epu16(__m128i __A, __m128i __B) +{ + return (__m128i) 
__builtin_ia32_vpcomfalseuw ((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comtrue_epu16(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomtrueuw ((__v8hi)__A, (__v8hi)__B); +} + +/*pcom (integer, unsinged double words) */ + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comlt_epu32(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomltud ((__v4si)__A, (__v4si)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comle_epu32(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomleud ((__v4si)__A, (__v4si)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comgt_epu32(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomgtud ((__v4si)__A, (__v4si)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comge_epu32(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomgeud ((__v4si)__A, (__v4si)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comeq_epu32(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomequd ((__v4si)__A, (__v4si)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comneq_epu32(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomnequd ((__v4si)__A, (__v4si)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comfalse_epu32(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomfalseud ((__v4si)__A, (__v4si)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comtrue_epu32(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomtrueud ((__v4si)__A, (__v4si)__B); +} + +/*pcom (integer, unsinged quad words) */ + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comlt_epu64(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomltuq ((__v2di)__A, (__v2di)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comle_epu64(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomleuq ((__v2di)__A, (__v2di)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comgt_epu64(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomgtuq ((__v2di)__A, (__v2di)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comge_epu64(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomgeuq ((__v2di)__A, (__v2di)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comeq_epu64(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomequq ((__v2di)__A, (__v2di)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comneq_epu64(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomnequq ((__v2di)__A, (__v2di)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comfalse_epu64(__m128i __A, __m128i __B) +{ + return (__m128i) 
__builtin_ia32_vpcomfalseuq ((__v2di)__A, (__v2di)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comtrue_epu64(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomtrueuq ((__v2di)__A, (__v2di)__B); +} + +/*pcom (integer, signed bytes) */ + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comlt_epi8(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomltb ((__v16qi)__A, (__v16qi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comle_epi8(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomleb ((__v16qi)__A, (__v16qi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comgt_epi8(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomgtb ((__v16qi)__A, (__v16qi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comge_epi8(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomgeb ((__v16qi)__A, (__v16qi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comeq_epi8(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomeqb ((__v16qi)__A, (__v16qi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comneq_epi8(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomneqb ((__v16qi)__A, (__v16qi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comfalse_epi8(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomfalseb ((__v16qi)__A, (__v16qi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comtrue_epi8(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomtrueb ((__v16qi)__A, (__v16qi)__B); +} + +/*pcom (integer, signed words) */ + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comlt_epi16(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomltw ((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comle_epi16(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomlew ((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comgt_epi16(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomgtw ((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comge_epi16(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomgew ((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comeq_epi16(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomeqw ((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comneq_epi16(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomneqw ((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comfalse_epi16(__m128i __A, __m128i __B) +{ + return (__m128i) 
__builtin_ia32_vpcomfalsew ((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comtrue_epi16(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomtruew ((__v8hi)__A, (__v8hi)__B); +} + +/*pcom (integer, signed double words) */ + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comlt_epi32(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomltd ((__v4si)__A, (__v4si)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comle_epi32(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomled ((__v4si)__A, (__v4si)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comgt_epi32(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomgtd ((__v4si)__A, (__v4si)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comge_epi32(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomged ((__v4si)__A, (__v4si)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comeq_epi32(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomeqd ((__v4si)__A, (__v4si)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comneq_epi32(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomneqd ((__v4si)__A, (__v4si)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comfalse_epi32(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomfalsed ((__v4si)__A, (__v4si)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comtrue_epi32(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomtrued ((__v4si)__A, (__v4si)__B); +} + +/*pcom (integer, signed quad words) */ + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comlt_epi64(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomltq ((__v2di)__A, (__v2di)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comle_epi64(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomleq ((__v2di)__A, (__v2di)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comgt_epi64(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomgtq ((__v2di)__A, (__v2di)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comge_epi64(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomgeq ((__v2di)__A, (__v2di)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comeq_epi64(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomeqq ((__v2di)__A, (__v2di)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comneq_epi64(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomneqq ((__v2di)__A, (__v2di)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comfalse_epi64(__m128i __A, __m128i __B) +{ + return (__m128i) 
__builtin_ia32_vpcomfalseq ((__v2di)__A, (__v2di)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comtrue_epi64(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomtrueq ((__v2di)__A, (__v2di)__B); +} + +/* FRCZ */ + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_frcz_ps (__m128 __A) +{ + return (__m128) __builtin_ia32_vfrczps ((__v4sf)__A); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_frcz_pd (__m128d __A) +{ + return (__m128d) __builtin_ia32_vfrczpd ((__v2df)__A); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_frcz_ss (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_vfrczss ((__v4sf)__A, (__v4sf)__B); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_frcz_sd (__m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_vfrczsd ((__v2df)__A, (__v2df)__B); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_frcz_ps (__m256 __A) +{ + return (__m256) __builtin_ia32_vfrczps256 ((__v8sf)__A); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_frcz_pd (__m256d __A) +{ + return (__m256d) __builtin_ia32_vfrczpd256 ((__v4df)__A); +} + +/* PERMIL2 */ + +#ifdef __OPTIMIZE__ +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_permute2_pd (__m128d __X, __m128d __Y, __m128i __C, const int __I) +{ + return (__m128d) __builtin_ia32_vpermil2pd ((__v2df)__X, + (__v2df)__Y, + (__v2di)__C, + __I); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permute2_pd (__m256d __X, __m256d __Y, __m256i __C, const int __I) +{ + return (__m256d) __builtin_ia32_vpermil2pd256 ((__v4df)__X, + (__v4df)__Y, + (__v4di)__C, + __I); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_permute2_ps (__m128 __X, __m128 __Y, __m128i __C, const int __I) +{ + return (__m128) __builtin_ia32_vpermil2ps ((__v4sf)__X, + (__v4sf)__Y, + (__v4si)__C, + __I); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permute2_ps (__m256 __X, __m256 __Y, __m256i __C, const int __I) +{ + return (__m256) __builtin_ia32_vpermil2ps256 ((__v8sf)__X, + (__v8sf)__Y, + (__v8si)__C, + __I); +} +#else +#define _mm_permute2_pd(X, Y, C, I) \ + ((__m128d) __builtin_ia32_vpermil2pd ((__v2df)(__m128d)(X), \ + (__v2df)(__m128d)(Y), \ + (__v2di)(__m128d)(C), \ + (int)(I))) + +#define _mm256_permute2_pd(X, Y, C, I) \ + ((__m256d) __builtin_ia32_vpermil2pd256 ((__v4df)(__m256d)(X), \ + (__v4df)(__m256d)(Y), \ + (__v4di)(__m256d)(C), \ + (int)(I))) + +#define _mm_permute2_ps(X, Y, C, I) \ + ((__m128) __builtin_ia32_vpermil2ps ((__v4sf)(__m128)(X), \ + (__v4sf)(__m128)(Y), \ + (__v4si)(__m128)(C), \ + (int)(I))) + +#define _mm256_permute2_ps(X, Y, C, I) \ + ((__m256) __builtin_ia32_vpermil2ps256 ((__v8sf)(__m256)(X), \ + (__v8sf)(__m256)(Y), \ + (__v8si)(__m256)(C), \ + (int)(I))) +#endif /* __OPTIMIZE__ */ + +#endif /* __XOP__ */ + +#endif /* _XOPMMINTRIN_H_INCLUDED */ -- cgit v1.2.3
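
For reference, a minimal usage sketch of the XOP intrinsics added above
(editorial, not part of the patch; assumes a CPU with XOP support and
compilation with -mxop so that __XOP__ is defined):

    #include <x86intrin.h>

    __m128i macc_example (__m128i a, __m128i b, __m128i acc)
    {
      /* Per-element signed 16-bit multiply-accumulate with saturation.  */
      return _mm_maccs_epi16 (a, b, acc);
    }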