From 554fd8c5195424bdbcabf5de30fdc183aba391bd Mon Sep 17 00:00:00 2001 From: upstream source tree Date: Sun, 15 Mar 2015 20:14:05 -0400 Subject: obtained gcc-4.6.4.tar.bz2 from upstream website; verified gcc-4.6.4.tar.bz2.sig; imported gcc-4.6.4 source tree from verified upstream tarball. downloading a git-generated archive based on the 'upstream' tag should provide you with a source tree that is binary identical to the one extracted from the above tarball. if you have obtained the source via the command 'git clone', however, do note that line-endings of files in your working directory might differ from line-endings of the respective files in the upstream repository. --- gcc/config/arm/README-interworking | 749 ++ gcc/config/arm/aout.h | 380 + gcc/config/arm/arm-c.c | 45 + gcc/config/arm/arm-cores.def | 136 + gcc/config/arm/arm-generic.md | 153 + gcc/config/arm/arm-ldmstm.ml | 332 + gcc/config/arm/arm-modes.def | 78 + gcc/config/arm/arm-protos.h | 231 + gcc/config/arm/arm-tune.md | 5 + gcc/config/arm/arm.c | 23712 +++++++++++++++++++++++++++++++++++ gcc/config/arm/arm.h | 2464 ++++ gcc/config/arm/arm.md | 10746 ++++++++++++++++ gcc/config/arm/arm.opt | 171 + gcc/config/arm/arm1020e.md | 375 + gcc/config/arm/arm1026ejs.md | 240 + gcc/config/arm/arm1136jfs.md | 376 + gcc/config/arm/arm926ejs.md | 187 + gcc/config/arm/arm_neon.h | 12176 ++++++++++++++++++ gcc/config/arm/bpabi-v6m.S | 318 + gcc/config/arm/bpabi.S | 163 + gcc/config/arm/bpabi.c | 56 + gcc/config/arm/bpabi.h | 125 + gcc/config/arm/cirrus.md | 540 + gcc/config/arm/coff.h | 86 + gcc/config/arm/constraints.md | 335 + gcc/config/arm/cortex-a5.md | 297 + gcc/config/arm/cortex-a8-neon.md | 1312 ++ gcc/config/arm/cortex-a8.md | 275 + gcc/config/arm/cortex-a9-neon.md | 1237 ++ gcc/config/arm/cortex-a9.md | 269 + gcc/config/arm/cortex-m4-fpu.md | 111 + gcc/config/arm/cortex-m4.md | 111 + gcc/config/arm/cortex-r4.md | 292 + gcc/config/arm/cortex-r4f.md | 161 + gcc/config/arm/crti.asm | 86 + gcc/config/arm/crtn.asm | 82 + gcc/config/arm/ecos-elf.h | 27 + gcc/config/arm/elf.h | 166 + gcc/config/arm/fa526.md | 161 + gcc/config/arm/fa606te.md | 171 + gcc/config/arm/fa626te.md | 165 + gcc/config/arm/fa726te.md | 218 + gcc/config/arm/fmp626.md | 182 + gcc/config/arm/fp16.c | 145 + gcc/config/arm/fpa.md | 889 ++ gcc/config/arm/freebsd.h | 67 + gcc/config/arm/gentune.sh | 29 + gcc/config/arm/ieee754-df.S | 1447 +++ gcc/config/arm/ieee754-sf.S | 1060 ++ gcc/config/arm/iterators.md | 405 + gcc/config/arm/iwmmxt.md | 1332 ++ gcc/config/arm/ldmstm.md | 1191 ++ gcc/config/arm/lib1funcs.asm | 1829 +++ gcc/config/arm/libgcc-bpabi.ver | 108 + gcc/config/arm/libunwind.S | 363 + gcc/config/arm/linux-atomic.c | 278 + gcc/config/arm/linux-eabi.h | 103 + gcc/config/arm/linux-elf.h | 120 + gcc/config/arm/linux-gas.h | 56 + gcc/config/arm/mmintrin.h | 1254 ++ gcc/config/arm/neon-docgen.ml | 337 + gcc/config/arm/neon-gen.ml | 416 + gcc/config/arm/neon-schedgen.ml | 543 + gcc/config/arm/neon-testgen.ml | 283 + gcc/config/arm/neon.md | 5476 ++++++++ gcc/config/arm/neon.ml | 1857 +++ gcc/config/arm/netbsd-elf.h | 157 + gcc/config/arm/netbsd.h | 150 + gcc/config/arm/pe.c | 257 + gcc/config/arm/pe.h | 148 + gcc/config/arm/pe.opt | 23 + gcc/config/arm/pr-support.c | 401 + gcc/config/arm/predicates.md | 688 + gcc/config/arm/rtems-eabi.h | 29 + gcc/config/arm/rtems-elf.h | 45 + gcc/config/arm/semi.h | 75 + gcc/config/arm/sfp-machine.h | 105 + gcc/config/arm/symbian.h | 105 + gcc/config/arm/sync.md | 602 + gcc/config/arm/t-arm | 66 + gcc/config/arm/t-arm-elf | 128 + gcc/config/arm/t-arm-softfp | 29 + gcc/config/arm/t-bpabi | 36 + gcc/config/arm/t-linux | 34 + gcc/config/arm/t-linux-androideabi | 10 + gcc/config/arm/t-linux-eabi | 43 + gcc/config/arm/t-netbsd | 47 + gcc/config/arm/t-pe | 52 + gcc/config/arm/t-rtems | 10 + gcc/config/arm/t-rtems-eabi | 8 + gcc/config/arm/t-strongarm-elf | 61 + gcc/config/arm/t-symbian | 53 + gcc/config/arm/t-vxworks | 44 + gcc/config/arm/t-wince-pe | 56 + gcc/config/arm/thumb2.md | 1121 ++ gcc/config/arm/uclinux-eabi.h | 66 + gcc/config/arm/uclinux-elf.h | 88 + gcc/config/arm/unaligned-funcs.c | 57 + gcc/config/arm/unknown-elf.h | 100 + gcc/config/arm/unwind-arm.c | 1263 ++ gcc/config/arm/unwind-arm.h | 281 + gcc/config/arm/vec-common.md | 110 + gcc/config/arm/vfp.md | 1153 ++ gcc/config/arm/vfp11.md | 92 + gcc/config/arm/vxworks.h | 113 + gcc/config/arm/vxworks.opt | 60 + gcc/config/arm/wince-pe.h | 26 + 107 files changed, 87082 insertions(+) create mode 100644 gcc/config/arm/README-interworking create mode 100644 gcc/config/arm/aout.h create mode 100644 gcc/config/arm/arm-c.c create mode 100644 gcc/config/arm/arm-cores.def create mode 100644 gcc/config/arm/arm-generic.md create mode 100644 gcc/config/arm/arm-ldmstm.ml create mode 100644 gcc/config/arm/arm-modes.def create mode 100644 gcc/config/arm/arm-protos.h create mode 100644 gcc/config/arm/arm-tune.md create mode 100644 gcc/config/arm/arm.c create mode 100644 gcc/config/arm/arm.h create mode 100644 gcc/config/arm/arm.md create mode 100644 gcc/config/arm/arm.opt create mode 100644 gcc/config/arm/arm1020e.md create mode 100644 gcc/config/arm/arm1026ejs.md create mode 100644 gcc/config/arm/arm1136jfs.md create mode 100644 gcc/config/arm/arm926ejs.md create mode 100644 gcc/config/arm/arm_neon.h create mode 100644 gcc/config/arm/bpabi-v6m.S create mode 100644 gcc/config/arm/bpabi.S create mode 100644 gcc/config/arm/bpabi.c create mode 100644 gcc/config/arm/bpabi.h create mode 100644 gcc/config/arm/cirrus.md create mode 100644 gcc/config/arm/coff.h create mode 100644 gcc/config/arm/constraints.md create mode 100644 gcc/config/arm/cortex-a5.md create mode 100644 gcc/config/arm/cortex-a8-neon.md create mode 100644 gcc/config/arm/cortex-a8.md create mode 100644 gcc/config/arm/cortex-a9-neon.md create mode 100644 gcc/config/arm/cortex-a9.md create mode 100644 gcc/config/arm/cortex-m4-fpu.md create mode 100644 gcc/config/arm/cortex-m4.md create mode 100644 gcc/config/arm/cortex-r4.md create mode 100644 gcc/config/arm/cortex-r4f.md create mode 100644 gcc/config/arm/crti.asm create mode 100644 gcc/config/arm/crtn.asm create mode 100644 gcc/config/arm/ecos-elf.h create mode 100644 gcc/config/arm/elf.h create mode 100644 gcc/config/arm/fa526.md create mode 100644 gcc/config/arm/fa606te.md create mode 100644 gcc/config/arm/fa626te.md create mode 100644 gcc/config/arm/fa726te.md create mode 100644 gcc/config/arm/fmp626.md create mode 100644 gcc/config/arm/fp16.c create mode 100644 gcc/config/arm/fpa.md create mode 100644 gcc/config/arm/freebsd.h create mode 100755 gcc/config/arm/gentune.sh create mode 100644 gcc/config/arm/ieee754-df.S create mode 100644 gcc/config/arm/ieee754-sf.S create mode 100644 gcc/config/arm/iterators.md create mode 100644 gcc/config/arm/iwmmxt.md create mode 100644 gcc/config/arm/ldmstm.md create mode 100644 gcc/config/arm/lib1funcs.asm create mode 100644 gcc/config/arm/libgcc-bpabi.ver create mode 100644 gcc/config/arm/libunwind.S create mode 100644 gcc/config/arm/linux-atomic.c create mode 100644 gcc/config/arm/linux-eabi.h create mode 100644 gcc/config/arm/linux-elf.h create mode 100644 gcc/config/arm/linux-gas.h create mode 100644 gcc/config/arm/mmintrin.h create mode 100644 gcc/config/arm/neon-docgen.ml create mode 100644 gcc/config/arm/neon-gen.ml create mode 100644 gcc/config/arm/neon-schedgen.ml create mode 100644 gcc/config/arm/neon-testgen.ml create mode 100644 gcc/config/arm/neon.md create mode 100644 gcc/config/arm/neon.ml create mode 100644 gcc/config/arm/netbsd-elf.h create mode 100644 gcc/config/arm/netbsd.h create mode 100644 gcc/config/arm/pe.c create mode 100644 gcc/config/arm/pe.h create mode 100644 gcc/config/arm/pe.opt create mode 100644 gcc/config/arm/pr-support.c create mode 100644 gcc/config/arm/predicates.md create mode 100644 gcc/config/arm/rtems-eabi.h create mode 100644 gcc/config/arm/rtems-elf.h create mode 100644 gcc/config/arm/semi.h create mode 100644 gcc/config/arm/sfp-machine.h create mode 100644 gcc/config/arm/symbian.h create mode 100644 gcc/config/arm/sync.md create mode 100644 gcc/config/arm/t-arm create mode 100644 gcc/config/arm/t-arm-elf create mode 100644 gcc/config/arm/t-arm-softfp create mode 100644 gcc/config/arm/t-bpabi create mode 100644 gcc/config/arm/t-linux create mode 100644 gcc/config/arm/t-linux-androideabi create mode 100644 gcc/config/arm/t-linux-eabi create mode 100644 gcc/config/arm/t-netbsd create mode 100644 gcc/config/arm/t-pe create mode 100644 gcc/config/arm/t-rtems create mode 100644 gcc/config/arm/t-rtems-eabi create mode 100644 gcc/config/arm/t-strongarm-elf create mode 100644 gcc/config/arm/t-symbian create mode 100644 gcc/config/arm/t-vxworks create mode 100644 gcc/config/arm/t-wince-pe create mode 100644 gcc/config/arm/thumb2.md create mode 100644 gcc/config/arm/uclinux-eabi.h create mode 100644 gcc/config/arm/uclinux-elf.h create mode 100644 gcc/config/arm/unaligned-funcs.c create mode 100644 gcc/config/arm/unknown-elf.h create mode 100644 gcc/config/arm/unwind-arm.c create mode 100644 gcc/config/arm/unwind-arm.h create mode 100644 gcc/config/arm/vec-common.md create mode 100644 gcc/config/arm/vfp.md create mode 100644 gcc/config/arm/vfp11.md create mode 100644 gcc/config/arm/vxworks.h create mode 100644 gcc/config/arm/vxworks.opt create mode 100644 gcc/config/arm/wince-pe.h (limited to 'gcc/config/arm') diff --git a/gcc/config/arm/README-interworking b/gcc/config/arm/README-interworking new file mode 100644 index 000000000..7f2eda83b --- /dev/null +++ b/gcc/config/arm/README-interworking @@ -0,0 +1,749 @@ + Arm / Thumb Interworking + ======================== + +The Cygnus GNU Pro Toolkit for the ARM7T processor supports function +calls between code compiled for the ARM instruction set and code +compiled for the Thumb instruction set and vice versa. This document +describes how that interworking support operates and explains the +command line switches that should be used in order to produce working +programs. + +Note: The Cygnus GNU Pro Toolkit does not support switching between +compiling for the ARM instruction set and the Thumb instruction set +on anything other than a per file basis. There are in fact two +completely separate compilers, one that produces ARM assembler +instructions and one that produces Thumb assembler instructions. The +two compilers share the same assembler, linker and so on. + + +1. Explicit interworking support for C and C++ files +==================================================== + +By default if a file is compiled without any special command line +switches then the code produced will not support interworking. +Provided that a program is made up entirely from object files and +libraries produced in this way and which contain either exclusively +ARM instructions or exclusively Thumb instructions then this will not +matter and a working executable will be created. If an attempt is +made to link together mixed ARM and Thumb object files and libraries, +then warning messages will be produced by the linker and a non-working +executable will be created. + +In order to produce code which does support interworking it should be +compiled with the + + -mthumb-interwork + +command line option. Provided that a program is made up entirely from +object files and libraries built with this command line switch a +working executable will be produced, even if both ARM and Thumb +instructions are used by the various components of the program. (No +warning messages will be produced by the linker either). + +Note that specifying -mthumb-interwork does result in slightly larger, +slower code being produced. This is why interworking support must be +specifically enabled by a switch. + + +2. Explicit interworking support for assembler files +==================================================== + +If assembler files are to be included into an interworking program +then the following rules must be obeyed: + + * Any externally visible functions must return by using the BX + instruction. + + * Normal function calls can just use the BL instruction. The + linker will automatically insert code to switch between ARM + and Thumb modes as necessary. + + * Calls via function pointers should use the BX instruction if + the call is made in ARM mode: + + .code 32 + mov lr, pc + bx rX + + This code sequence will not work in Thumb mode however, since + the mov instruction will not set the bottom bit of the lr + register. Instead a branch-and-link to the _call_via_rX + functions should be used instead: + + .code 16 + bl _call_via_rX + + where rX is replaced by the name of the register containing + the function address. + + * All externally visible functions which should be entered in + Thumb mode must have the .thumb_func pseudo op specified just + before their entry point. e.g.: + + .code 16 + .global function + .thumb_func + function: + ...start of function.... + + * All assembler files must be assembled with the switch + -mthumb-interwork specified on the command line. (If the file + is assembled by calling gcc it will automatically pass on the + -mthumb-interwork switch to the assembler, provided that it + was specified on the gcc command line in the first place.) + + +3. Support for old, non-interworking aware code. +================================================ + +If it is necessary to link together code produced by an older, +non-interworking aware compiler, or code produced by the new compiler +but without the -mthumb-interwork command line switch specified, then +there are two command line switches that can be used to support this. + +The switch + + -mcaller-super-interworking + +will allow calls via function pointers in Thumb mode to work, +regardless of whether the function pointer points to old, +non-interworking aware code or not. Specifying this switch does +produce slightly slower code however. + +Note: There is no switch to allow calls via function pointers in ARM +mode to be handled specially. Calls via function pointers from +interworking aware ARM code to non-interworking aware ARM code work +without any special considerations by the compiler. Calls via +function pointers from interworking aware ARM code to non-interworking +aware Thumb code however will not work. (Actually under some +circumstances they may work, but there are no guarantees). This is +because only the new compiler is able to produce Thumb code, and this +compiler already has a command line switch to produce interworking +aware code. + + +The switch + + -mcallee-super-interworking + +will allow non-interworking aware ARM or Thumb code to call Thumb +functions, either directly or via function pointers. Specifying this +switch does produce slightly larger, slower code however. + +Note: There is no switch to allow non-interworking aware ARM or Thumb +code to call ARM functions. There is no need for any special handling +of calls from non-interworking aware ARM code to interworking aware +ARM functions, they just work normally. Calls from non-interworking +aware Thumb functions to ARM code however, will not work. There is no +option to support this, since it is always possible to recompile the +Thumb code to be interworking aware. + +As an alternative to the command line switch +-mcallee-super-interworking, which affects all externally visible +functions in a file, it is possible to specify an attribute or +declspec for individual functions, indicating that that particular +function should support being called by non-interworking aware code. +The function should be defined like this: + + int __attribute__((interfacearm)) function + { + ... body of function ... + } + +or + + int __declspec(interfacearm) function + { + ... body of function ... + } + + + +4. Interworking support in dlltool +================================== + +It is possible to create DLLs containing mixed ARM and Thumb code. It +is also possible to call Thumb code in a DLL from an ARM program and +vice versa. It is even possible to call ARM DLLs that have been compiled +without interworking support (say by an older version of the compiler), +from Thumb programs and still have things work properly. + + A version of the `dlltool' program which supports the `--interwork' +command line switch is needed, as well as the following special +considerations when building programs and DLLs: + +*Use `-mthumb-interwork'* + When compiling files for a DLL or a program the `-mthumb-interwork' + command line switch should be specified if calling between ARM and + Thumb code can happen. If a program is being compiled and the + mode of the DLLs that it uses is not known, then it should be + assumed that interworking might occur and the switch used. + +*Use `-m thumb'* + If the exported functions from a DLL are all Thumb encoded then the + `-m thumb' command line switch should be given to dlltool when + building the stubs. This will make dlltool create Thumb encoded + stubs, rather than its default of ARM encoded stubs. + + If the DLL consists of both exported Thumb functions and exported + ARM functions then the `-m thumb' switch should not be used. + Instead the Thumb functions in the DLL should be compiled with the + `-mcallee-super-interworking' switch, or with the `interfacearm' + attribute specified on their prototypes. In this way they will be + given ARM encoded prologues, which will work with the ARM encoded + stubs produced by dlltool. + +*Use `-mcaller-super-interworking'* + If it is possible for Thumb functions in a DLL to call + non-interworking aware code via a function pointer, then the Thumb + code must be compiled with the `-mcaller-super-interworking' + command line switch. This will force the function pointer calls + to use the _interwork_call_via_rX stub functions which will + correctly restore Thumb mode upon return from the called function. + +*Link with `libgcc.a'* + When the dll is built it may have to be linked with the GCC + library (`libgcc.a') in order to extract the _call_via_rX functions + or the _interwork_call_via_rX functions. This represents a partial + redundancy since the same functions *may* be present in the + application itself, but since they only take up 372 bytes this + should not be too much of a consideration. + +*Use `--support-old-code'* + When linking a program with an old DLL which does not support + interworking, the `--support-old-code' command line switch to the + linker should be used. This causes the linker to generate special + interworking stubs which can cope with old, non-interworking aware + ARM code, at the cost of generating bulkier code. The linker will + still generate a warning message along the lines of: + "Warning: input file XXX does not support interworking, whereas YYY does." + but this can now be ignored because the --support-old-code switch + has been used. + + + +5. How interworking support works +================================= + +Switching between the ARM and Thumb instruction sets is accomplished +via the BX instruction which takes as an argument a register name. +Control is transfered to the address held in this register (with the +bottom bit masked out), and if the bottom bit is set, then Thumb +instruction processing is enabled, otherwise ARM instruction +processing is enabled. + +When the -mthumb-interwork command line switch is specified, gcc +arranges for all functions to return to their caller by using the BX +instruction. Thus provided that the return address has the bottom bit +correctly initialized to indicate the instruction set of the caller, +correct operation will ensue. + +When a function is called explicitly (rather than via a function +pointer), the compiler generates a BL instruction to do this. The +Thumb version of the BL instruction has the special property of +setting the bottom bit of the LR register after it has stored the +return address into it, so that a future BX instruction will correctly +return the instruction after the BL instruction, in Thumb mode. + +The BL instruction does not change modes itself however, so if an ARM +function is calling a Thumb function, or vice versa, it is necessary +to generate some extra instructions to handle this. This is done in +the linker when it is storing the address of the referenced function +into the BL instruction. If the BL instruction is an ARM style BL +instruction, but the referenced function is a Thumb function, then the +linker automatically generates a calling stub that converts from ARM +mode to Thumb mode, puts the address of this stub into the BL +instruction, and puts the address of the referenced function into the +stub. Similarly if the BL instruction is a Thumb BL instruction, and +the referenced function is an ARM function, the linker generates a +stub which converts from Thumb to ARM mode, puts the address of this +stub into the BL instruction, and the address of the referenced +function into the stub. + +This is why it is necessary to mark Thumb functions with the +.thumb_func pseudo op when creating assembler files. This pseudo op +allows the assembler to distinguish between ARM functions and Thumb +functions. (The Thumb version of GCC automatically generates these +pseudo ops for any Thumb functions that it generates). + +Calls via function pointers work differently. Whenever the address of +a function is taken, the linker examines the type of the function +being referenced. If the function is a Thumb function, then it sets +the bottom bit of the address. Technically this makes the address +incorrect, since it is now one byte into the start of the function, +but this is never a problem because: + + a. with interworking enabled all calls via function pointer + are done using the BX instruction and this ignores the + bottom bit when computing where to go to. + + b. the linker will always set the bottom bit when the address + of the function is taken, so it is never possible to take + the address of the function in two different places and + then compare them and find that they are not equal. + +As already mentioned any call via a function pointer will use the BX +instruction (provided that interworking is enabled). The only problem +with this is computing the return address for the return from the +called function. For ARM code this can easily be done by the code +sequence: + + mov lr, pc + bx rX + +(where rX is the name of the register containing the function +pointer). This code does not work for the Thumb instruction set, +since the MOV instruction will not set the bottom bit of the LR +register, so that when the called function returns, it will return in +ARM mode not Thumb mode. Instead the compiler generates this +sequence: + + bl _call_via_rX + +(again where rX is the name if the register containing the function +pointer). The special call_via_rX functions look like this: + + .thumb_func +_call_via_r0: + bx r0 + nop + +The BL instruction ensures that the correct return address is stored +in the LR register and then the BX instruction jumps to the address +stored in the function pointer, switch modes if necessary. + + +6. How caller-super-interworking support works +============================================== + +When the -mcaller-super-interworking command line switch is specified +it changes the code produced by the Thumb compiler so that all calls +via function pointers (including virtual function calls) now go via a +different stub function. The code to call via a function pointer now +looks like this: + + bl _interwork_call_via_r0 + +Note: The compiler does not insist that r0 be used to hold the +function address. Any register will do, and there are a suite of stub +functions, one for each possible register. The stub functions look +like this: + + .code 16 + .thumb_func +_interwork_call_via_r0 + bx pc + nop + + .code 32 + tst r0, #1 + stmeqdb r13!, {lr} + adreq lr, _arm_return + bx r0 + +The stub first switches to ARM mode, since it is a lot easier to +perform the necessary operations using ARM instructions. It then +tests the bottom bit of the register containing the address of the +function to be called. If this bottom bit is set then the function +being called uses Thumb instructions and the BX instruction to come +will switch back into Thumb mode before calling this function. (Note +that it does not matter how this called function chooses to return to +its caller, since the both the caller and callee are Thumb functions, +and mode switching is necessary). If the function being called is an +ARM mode function however, the stub pushes the return address (with +its bottom bit set) onto the stack, replaces the return address with +the address of the a piece of code called '_arm_return' and then +performs a BX instruction to call the function. + +The '_arm_return' code looks like this: + + .code 32 +_arm_return: + ldmia r13!, {r12} + bx r12 + .code 16 + + +It simply retrieves the return address from the stack, and then +performs a BX operation to return to the caller and switch back into +Thumb mode. + + +7. How callee-super-interworking support works +============================================== + +When -mcallee-super-interworking is specified on the command line the +Thumb compiler behaves as if every externally visible function that it +compiles has had the (interfacearm) attribute specified for it. What +this attribute does is to put a special, ARM mode header onto the +function which forces a switch into Thumb mode: + + without __attribute__((interfacearm)): + + .code 16 + .thumb_func + function: + ... start of function ... + + with __attribute__((interfacearm)): + + .code 32 + function: + orr r12, pc, #1 + bx r12 + + .code 16 + .thumb_func + .real_start_of_function: + + ... start of function ... + +Note that since the function now expects to be entered in ARM mode, it +no longer has the .thumb_func pseudo op specified for its name. +Instead the pseudo op is attached to a new label .real_start_of_ +(where is the name of the function) which indicates the start +of the Thumb code. This does have the interesting side effect in that +if this function is now called from a Thumb mode piece of code +outside of the current file, the linker will generate a calling stub +to switch from Thumb mode into ARM mode, and then this is immediately +overridden by the function's header which switches back into Thumb +mode. + +In addition the (interfacearm) attribute also forces the function to +return by using the BX instruction, even if has not been compiled with +the -mthumb-interwork command line flag, so that the correct mode will +be restored upon exit from the function. + + +8. Some examples +================ + + Given these two test files: + + int arm (void) { return 1 + thumb (); } + + int thumb (void) { return 2 + arm (); } + + The following pieces of assembler are produced by the ARM and Thumb +version of GCC depending upon the command line options used: + + `-O2': + .code 32 .code 16 + .global _arm .global _thumb + .thumb_func + _arm: _thumb: + mov ip, sp + stmfd sp!, {fp, ip, lr, pc} push {lr} + sub fp, ip, #4 + bl _thumb bl _arm + add r0, r0, #1 add r0, r0, #2 + ldmea fp, {fp, sp, pc} pop {pc} + + Note how the functions return without using the BX instruction. If +these files were assembled and linked together they would fail to work +because they do not change mode when returning to their caller. + + `-O2 -mthumb-interwork': + + .code 32 .code 16 + .global _arm .global _thumb + .thumb_func + _arm: _thumb: + mov ip, sp + stmfd sp!, {fp, ip, lr, pc} push {lr} + sub fp, ip, #4 + bl _thumb bl _arm + add r0, r0, #1 add r0, r0, #2 + ldmea fp, {fp, sp, lr} pop {r1} + bx lr bx r1 + + Now the functions use BX to return their caller. They have grown by +4 and 2 bytes respectively, but they can now successfully be linked +together and be expect to work. The linker will replace the +destinations of the two BL instructions with the addresses of calling +stubs which convert to the correct mode before jumping to the called +function. + + `-O2 -mcallee-super-interworking': + + .code 32 .code 32 + .global _arm .global _thumb + _arm: _thumb: + orr r12, pc, #1 + bx r12 + mov ip, sp .code 16 + stmfd sp!, {fp, ip, lr, pc} push {lr} + sub fp, ip, #4 + bl _thumb bl _arm + add r0, r0, #1 add r0, r0, #2 + ldmea fp, {fp, sp, lr} pop {r1} + bx lr bx r1 + + The thumb function now has an ARM encoded prologue, and it no longer +has the `.thumb-func' pseudo op attached to it. The linker will not +generate a calling stub for the call from arm() to thumb(), but it will +still have to generate a stub for the call from thumb() to arm(). Also +note how specifying `--mcallee-super-interworking' automatically +implies `-mthumb-interworking'. + + +9. Some Function Pointer Examples +================================= + + Given this test file: + + int func (void) { return 1; } + + int call (int (* ptr)(void)) { return ptr (); } + + The following varying pieces of assembler are produced by the Thumb +version of GCC depending upon the command line options used: + + `-O2': + .code 16 + .globl _func + .thumb_func + _func: + mov r0, #1 + bx lr + + .globl _call + .thumb_func + _call: + push {lr} + bl __call_via_r0 + pop {pc} + + Note how the two functions have different exit sequences. In +particular call() uses pop {pc} to return, which would not work if the +caller was in ARM mode. func() however, uses the BX instruction, even +though `-mthumb-interwork' has not been specified, as this is the most +efficient way to exit a function when the return address is held in the +link register. + + `-O2 -mthumb-interwork': + + .code 16 + .globl _func + .thumb_func + _func: + mov r0, #1 + bx lr + + .globl _call + .thumb_func + _call: + push {lr} + bl __call_via_r0 + pop {r1} + bx r1 + + This time both functions return by using the BX instruction. This +means that call() is now two bytes longer and several cycles slower +than the previous version. + + `-O2 -mcaller-super-interworking': + .code 16 + .globl _func + .thumb_func + _func: + mov r0, #1 + bx lr + + .globl _call + .thumb_func + _call: + push {lr} + bl __interwork_call_via_r0 + pop {pc} + + Very similar to the first (non-interworking) version, except that a +different stub is used to call via the function pointer. This new stub +will work even if the called function is not interworking aware, and +tries to return to call() in ARM mode. Note that the assembly code for +call() is still not interworking aware itself, and so should not be +called from ARM code. + + `-O2 -mcallee-super-interworking': + + .code 32 + .globl _func + _func: + orr r12, pc, #1 + bx r12 + + .code 16 + .globl .real_start_of_func + .thumb_func + .real_start_of_func: + mov r0, #1 + bx lr + + .code 32 + .globl _call + _call: + orr r12, pc, #1 + bx r12 + + .code 16 + .globl .real_start_of_call + .thumb_func + .real_start_of_call: + push {lr} + bl __call_via_r0 + pop {r1} + bx r1 + + Now both functions have an ARM coded prologue, and both functions +return by using the BX instruction. These functions are interworking +aware therefore and can safely be called from ARM code. The code for +the call() function is now 10 bytes longer than the original, non +interworking aware version, an increase of over 200%. + + If a prototype for call() is added to the source code, and this +prototype includes the `interfacearm' attribute: + + int __attribute__((interfacearm)) call (int (* ptr)(void)); + + then this code is produced (with only -O2 specified on the command +line): + + .code 16 + .globl _func + .thumb_func + _func: + mov r0, #1 + bx lr + + .globl _call + .code 32 + _call: + orr r12, pc, #1 + bx r12 + + .code 16 + .globl .real_start_of_call + .thumb_func + .real_start_of_call: + push {lr} + bl __call_via_r0 + pop {r1} + bx r1 + + So now both call() and func() can be safely called via +non-interworking aware ARM code. If, when such a file is assembled, +the assembler detects the fact that call() is being called by another +function in the same file, it will automatically adjust the target of +the BL instruction to point to .real_start_of_call. In this way there +is no need for the linker to generate a Thumb-to-ARM calling stub so +that call can be entered in ARM mode. + + +10. How to use dlltool to build ARM/Thumb DLLs +============================================== + Given a program (`prog.c') like this: + + extern int func_in_dll (void); + + int main (void) { return func_in_dll(); } + + And a DLL source file (`dll.c') like this: + + int func_in_dll (void) { return 1; } + + Here is how to build the DLL and the program for a purely ARM based +environment: + +*Step One + Build a `.def' file describing the DLL: + + ; example.def + ; This file describes the contents of the DLL + LIBRARY example + HEAPSIZE 0x40000, 0x2000 + EXPORTS + func_in_dll 1 + +*Step Two + Compile the DLL source code: + + arm-pe-gcc -O2 -c dll.c + +*Step Three + Use `dlltool' to create an exports file and a library file: + + dlltool --def example.def --output-exp example.o --output-lib example.a + +*Step Four + Link together the complete DLL: + + arm-pe-ld dll.o example.o -o example.dll + +*Step Five + Compile the program's source code: + + arm-pe-gcc -O2 -c prog.c + +*Step Six + Link together the program and the DLL's library file: + + arm-pe-gcc prog.o example.a -o prog + + If instead this was a Thumb DLL being called from an ARM program, the +steps would look like this. (To save space only those steps that are +different from the previous version are shown): + +*Step Two + Compile the DLL source code (using the Thumb compiler): + + thumb-pe-gcc -O2 -c dll.c -mthumb-interwork + +*Step Three + Build the exports and library files (and support interworking): + + dlltool -d example.def -z example.o -l example.a --interwork -m thumb + +*Step Five + Compile the program's source code (and support interworking): + + arm-pe-gcc -O2 -c prog.c -mthumb-interwork + + If instead, the DLL was an old, ARM DLL which does not support +interworking, and which cannot be rebuilt, then these steps would be +used. + +*Step One + Skip. If you do not have access to the sources of a DLL, there is + no point in building a `.def' file for it. + +*Step Two + Skip. With no DLL sources there is nothing to compile. + +*Step Three + Skip. Without a `.def' file you cannot use dlltool to build an + exports file or a library file. + +*Step Four + Skip. Without a set of DLL object files you cannot build the DLL. + Besides it has already been built for you by somebody else. + +*Step Five + Compile the program's source code, this is the same as before: + + arm-pe-gcc -O2 -c prog.c + +*Step Six + Link together the program and the DLL's library file, passing the + `--support-old-code' option to the linker: + + arm-pe-gcc prog.o example.a -Wl,--support-old-code -o prog + + Ignore the warning message about the input file not supporting + interworking as the --support-old-code switch has taken care if this. + + +Copyright (C) 1998, 2002, 2003, 2004 Free Software Foundation, Inc. + +Copying and distribution of this file, with or without modification, +are permitted in any medium without royalty provided the copyright +notice and this notice are preserved. diff --git a/gcc/config/arm/aout.h b/gcc/config/arm/aout.h new file mode 100644 index 000000000..f8e7367fd --- /dev/null +++ b/gcc/config/arm/aout.h @@ -0,0 +1,380 @@ +/* Definitions of target machine for GNU compiler, for ARM with a.out + Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2004, 2007, 2008, 2010 + Free Software Foundation, Inc. + Contributed by Richard Earnshaw (rearnsha@armltd.co.uk). + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3, or (at your + option) any later version. + + GCC is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public + License for more details. + + You should have received a copy of the GNU General Public License + along with GCC; see the file COPYING3. If not see + . */ + +#ifndef ASM_APP_ON +#define ASM_APP_ON "" +#endif +#ifndef ASM_APP_OFF +#define ASM_APP_OFF "" +#endif + +/* Switch to the text or data segment. */ +#define TEXT_SECTION_ASM_OP "\t.text" +#define DATA_SECTION_ASM_OP "\t.data" +#define BSS_SECTION_ASM_OP "\t.bss" + +/* Note: If USER_LABEL_PREFIX or LOCAL_LABEL_PREFIX are changed, + make sure that this change is reflected in the function + coff_arm_is_local_label_name() in bfd/coff-arm.c. */ +#ifndef REGISTER_PREFIX +#define REGISTER_PREFIX "" +#endif + +#ifndef USER_LABEL_PREFIX +#define USER_LABEL_PREFIX "_" +#endif + +#ifndef LOCAL_LABEL_PREFIX +#define LOCAL_LABEL_PREFIX "" +#endif + +/* The assembler's names for the registers. Note that the ?xx registers are + there so that VFPv3/NEON registers D16-D31 have the same spacing as D0-D15 + (each of which is overlaid on two S registers), although there are no + actual single-precision registers which correspond to D16-D31. */ +#ifndef REGISTER_NAMES +#define REGISTER_NAMES \ +{ \ + "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", \ + "r8", "r9", "sl", "fp", "ip", "sp", "lr", "pc", \ + "f0", "f1", "f2", "f3", "f4", "f5", "f6", "f7", \ + "cc", "sfp", "afp", \ + "mv0", "mv1", "mv2", "mv3", \ + "mv4", "mv5", "mv6", "mv7", \ + "mv8", "mv9", "mv10", "mv11", \ + "mv12", "mv13", "mv14", "mv15", \ + "wcgr0", "wcgr1", "wcgr2", "wcgr3", \ + "wr0", "wr1", "wr2", "wr3", \ + "wr4", "wr5", "wr6", "wr7", \ + "wr8", "wr9", "wr10", "wr11", \ + "wr12", "wr13", "wr14", "wr15", \ + "s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7", \ + "s8", "s9", "s10", "s11", "s12", "s13", "s14", "s15", \ + "s16", "s17", "s18", "s19", "s20", "s21", "s22", "s23", \ + "s24", "s25", "s26", "s27", "s28", "s29", "s30", "s31", \ + "d16", "?16", "d17", "?17", "d18", "?18", "d19", "?19", \ + "d20", "?20", "d21", "?21", "d22", "?22", "d23", "?23", \ + "d24", "?24", "d25", "?25", "d26", "?26", "d27", "?27", \ + "d28", "?28", "d29", "?29", "d30", "?30", "d31", "?31", \ + "vfpcc" \ +} +#endif + +#ifndef ADDITIONAL_REGISTER_NAMES +#define ADDITIONAL_REGISTER_NAMES \ +{ \ + {"a1", 0}, \ + {"a2", 1}, \ + {"a3", 2}, \ + {"a4", 3}, \ + {"v1", 4}, \ + {"v2", 5}, \ + {"v3", 6}, \ + {"v4", 7}, \ + {"v5", 8}, \ + {"v6", 9}, \ + {"rfp", 9}, /* Gcc used to call it this */ \ + {"sb", 9}, \ + {"v7", 10}, \ + {"r10", 10}, /* sl */ \ + {"r11", 11}, /* fp */ \ + {"r12", 12}, /* ip */ \ + {"r13", 13}, /* sp */ \ + {"r14", 14}, /* lr */ \ + {"r15", 15}, /* pc */ \ + {"mvf0", 27}, \ + {"mvf1", 28}, \ + {"mvf2", 29}, \ + {"mvf3", 30}, \ + {"mvf4", 31}, \ + {"mvf5", 32}, \ + {"mvf6", 33}, \ + {"mvf7", 34}, \ + {"mvf8", 35}, \ + {"mvf9", 36}, \ + {"mvf10", 37}, \ + {"mvf11", 38}, \ + {"mvf12", 39}, \ + {"mvf13", 40}, \ + {"mvf14", 41}, \ + {"mvf15", 42}, \ + {"mvd0", 27}, \ + {"mvd1", 28}, \ + {"mvd2", 29}, \ + {"mvd3", 30}, \ + {"mvd4", 31}, \ + {"mvd5", 32}, \ + {"mvd6", 33}, \ + {"mvd7", 34}, \ + {"mvd8", 35}, \ + {"mvd9", 36}, \ + {"mvd10", 37}, \ + {"mvd11", 38}, \ + {"mvd12", 39}, \ + {"mvd13", 40}, \ + {"mvd14", 41}, \ + {"mvd15", 42}, \ + {"mvfx0", 27}, \ + {"mvfx1", 28}, \ + {"mvfx2", 29}, \ + {"mvfx3", 30}, \ + {"mvfx4", 31}, \ + {"mvfx5", 32}, \ + {"mvfx6", 33}, \ + {"mvfx7", 34}, \ + {"mvfx8", 35}, \ + {"mvfx9", 36}, \ + {"mvfx10", 37}, \ + {"mvfx11", 38}, \ + {"mvfx12", 39}, \ + {"mvfx13", 40}, \ + {"mvfx14", 41}, \ + {"mvfx15", 42}, \ + {"mvdx0", 27}, \ + {"mvdx1", 28}, \ + {"mvdx2", 29}, \ + {"mvdx3", 30}, \ + {"mvdx4", 31}, \ + {"mvdx5", 32}, \ + {"mvdx6", 33}, \ + {"mvdx7", 34}, \ + {"mvdx8", 35}, \ + {"mvdx9", 36}, \ + {"mvdx10", 37}, \ + {"mvdx11", 38}, \ + {"mvdx12", 39}, \ + {"mvdx13", 40}, \ + {"mvdx14", 41}, \ + {"mvdx15", 42} \ +} +#endif + +#ifndef OVERLAPPING_REGISTER_NAMES +#define OVERLAPPING_REGISTER_NAMES \ +{ \ + {"d0", 63, 2}, \ + {"d1", 65, 2}, \ + {"d2", 67, 2}, \ + {"d3", 69, 2}, \ + {"d4", 71, 2}, \ + {"d5", 73, 2}, \ + {"d6", 75, 2}, \ + {"d7", 77, 2}, \ + {"d8", 79, 2}, \ + {"d9", 81, 2}, \ + {"d10", 83, 2}, \ + {"d11", 85, 2}, \ + {"d12", 87, 2}, \ + {"d13", 89, 2}, \ + {"d14", 91, 2}, \ + {"d15", 93, 2}, \ + {"q0", 63, 4}, \ + {"q1", 67, 4}, \ + {"q2", 71, 4}, \ + {"q3", 75, 4}, \ + {"q4", 79, 4}, \ + {"q5", 83, 4}, \ + {"q6", 87, 4}, \ + {"q7", 91, 4}, \ + {"q8", 95, 4}, \ + {"q9", 99, 4}, \ + {"q10", 103, 4}, \ + {"q11", 107, 4}, \ + {"q12", 111, 4}, \ + {"q13", 115, 4}, \ + {"q14", 119, 4}, \ + {"q15", 123, 4} \ +} +#endif + +#ifndef NO_DOLLAR_IN_LABEL +#define NO_DOLLAR_IN_LABEL 1 +#endif + +/* Generate DBX debugging information. riscix.h will undefine this because + the native assembler does not support stabs. */ +#define DBX_DEBUGGING_INFO 1 + +/* Acorn dbx moans about continuation chars, so don't use any. */ +#ifndef DBX_CONTIN_LENGTH +#define DBX_CONTIN_LENGTH 0 +#endif + +/* Output a function label definition. */ +#ifndef ASM_DECLARE_FUNCTION_NAME +#define ASM_DECLARE_FUNCTION_NAME(STREAM, NAME, DECL) \ + do \ + { \ + ARM_DECLARE_FUNCTION_NAME (STREAM, NAME, DECL); \ + ASM_OUTPUT_LABEL (STREAM, NAME); \ + } \ + while (0) +#endif + +/* Globalizing directive for a label. */ +#define GLOBAL_ASM_OP "\t.global\t" + +/* Make an internal label into a string. */ +#ifndef ASM_GENERATE_INTERNAL_LABEL +#define ASM_GENERATE_INTERNAL_LABEL(STRING, PREFIX, NUM) \ + sprintf (STRING, "*%s%s%u", LOCAL_LABEL_PREFIX, PREFIX, (unsigned int)(NUM)) +#endif + +/* Output an element of a dispatch table. */ +#define ASM_OUTPUT_ADDR_VEC_ELT(STREAM, VALUE) \ + do \ + { \ + gcc_assert (!TARGET_THUMB2); \ + asm_fprintf (STREAM, "\t.word\t%LL%d\n", VALUE); \ + } \ + while (0) + + +/* Thumb-2 always uses addr_diff_elf so that the Table Branch instructions + can be used. For non-pic code where the offsets do not suitable for + TBB/TBH the elements are output as absolute labels. */ +#define ASM_OUTPUT_ADDR_DIFF_ELT(STREAM, BODY, VALUE, REL) \ + do \ + { \ + if (TARGET_ARM) \ + asm_fprintf (STREAM, "\tb\t%LL%d\n", VALUE); \ + else if (TARGET_THUMB1) \ + { \ + if (flag_pic || optimize_size) \ + { \ + switch (GET_MODE(body)) \ + { \ + case QImode: \ + asm_fprintf (STREAM, "\t.byte\t(%LL%d-%LL%d)/2\n", \ + VALUE, REL); \ + break; \ + case HImode: /* TBH */ \ + asm_fprintf (STREAM, "\t.2byte\t(%LL%d-%LL%d)/2\n", \ + VALUE, REL); \ + break; \ + case SImode: \ + asm_fprintf (STREAM, "\t.word\t%LL%d-%LL%d\n", \ + VALUE, REL); \ + break; \ + default: \ + gcc_unreachable(); \ + } \ + } \ + else \ + asm_fprintf (STREAM, "\t.word\t%LL%d+1\n", VALUE); \ + } \ + else /* Thumb-2 */ \ + { \ + switch (GET_MODE(body)) \ + { \ + case QImode: /* TBB */ \ + asm_fprintf (STREAM, "\t.byte\t(%LL%d-%LL%d)/2\n", \ + VALUE, REL); \ + break; \ + case HImode: /* TBH */ \ + asm_fprintf (STREAM, "\t.2byte\t(%LL%d-%LL%d)/2\n", \ + VALUE, REL); \ + break; \ + case SImode: \ + if (flag_pic) \ + asm_fprintf (STREAM, "\t.word\t%LL%d+1-%LL%d\n", VALUE, REL); \ + else \ + asm_fprintf (STREAM, "\t.word\t%LL%d+1\n", VALUE); \ + break; \ + default: \ + gcc_unreachable(); \ + } \ + } \ + } \ + while (0) + + +#undef ASM_OUTPUT_ASCII +#define ASM_OUTPUT_ASCII(STREAM, PTR, LEN) \ + output_ascii_pseudo_op (STREAM, (const unsigned char *) (PTR), LEN) + +/* Output a gap. In fact we fill it with nulls. */ +#undef ASM_OUTPUT_SKIP +#define ASM_OUTPUT_SKIP(STREAM, NBYTES) \ + fprintf (STREAM, "\t.space\t%d\n", (int) (NBYTES)) + +/* Align output to a power of two. Horrible /bin/as. */ +#ifndef ASM_OUTPUT_ALIGN +#define ASM_OUTPUT_ALIGN(STREAM, POWER) \ + do \ + { \ + register int amount = 1 << (POWER); \ + \ + if (amount == 2) \ + fprintf (STREAM, "\t.even\n"); \ + else if (amount != 1) \ + fprintf (STREAM, "\t.align\t%d\n", amount - 4); \ + } \ + while (0) +#endif + +/* Output a common block. */ +#ifndef ASM_OUTPUT_COMMON +#define ASM_OUTPUT_COMMON(STREAM, NAME, SIZE, ROUNDED) \ + do \ + { \ + fprintf (STREAM, "\t.comm\t"); \ + assemble_name (STREAM, NAME); \ + asm_fprintf (STREAM, ", %d\t%@ %d\n", \ + (int)(ROUNDED), (int)(SIZE)); \ + } \ + while (0) +#endif + +/* Output a local common block. /bin/as can't do this, so hack a + `.space' into the bss segment. Note that this is *bad* practice, + which is guaranteed NOT to work since it doesn't define STATIC + COMMON space but merely STATIC BSS space. */ +#ifndef ASM_OUTPUT_ALIGNED_LOCAL +#define ASM_OUTPUT_ALIGNED_LOCAL(STREAM, NAME, SIZE, ALIGN) \ + do \ + { \ + switch_to_section (bss_section); \ + ASM_OUTPUT_ALIGN (STREAM, floor_log2 (ALIGN / BITS_PER_UNIT)); \ + ASM_OUTPUT_LABEL (STREAM, NAME); \ + fprintf (STREAM, "\t.space\t%d\n", (int)(SIZE)); \ + } \ + while (0) +#endif + +/* Output a zero-initialized block. */ +#ifndef ASM_OUTPUT_ALIGNED_BSS +#define ASM_OUTPUT_ALIGNED_BSS(STREAM, DECL, NAME, SIZE, ALIGN) \ + asm_output_aligned_bss (STREAM, DECL, NAME, SIZE, ALIGN) +#endif + +/* Output a #ident directive. */ +#ifndef ASM_OUTPUT_IDENT +#define ASM_OUTPUT_IDENT(STREAM,STRING) \ + asm_fprintf (STREAM, "%@ - - - ident %s\n", STRING) +#endif + +#ifndef ASM_COMMENT_START +#define ASM_COMMENT_START "@" +#endif + +/* This works for GAS and some other assemblers. */ +#define SET_ASM_OP "\t.set\t" diff --git a/gcc/config/arm/arm-c.c b/gcc/config/arm/arm-c.c new file mode 100644 index 000000000..f9ad1c9e1 --- /dev/null +++ b/gcc/config/arm/arm-c.c @@ -0,0 +1,45 @@ +/* Copyright (C) 2007, 2010 Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 3, or (at your option) any later +version. + +GCC is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. */ + +#include "config.h" +#include "system.h" +#include "coretypes.h" +#include "tm.h" +#include "tm_p.h" +#include "tree.h" +#include "output.h" +#include "c-family/c-common.h" + + +/* Output C specific EABI object attributes. These can not be done in + arm.c because they require information from the C frontend. */ + +static void arm_output_c_attributes(void) +{ + /* Tag_ABI_PCS_wchar_t. */ + asm_fprintf (asm_out_file, "\t.eabi_attribute 18, %d\n", + (int)(TYPE_PRECISION (wchar_type_node) / BITS_PER_UNIT)); +} + + +/* Setup so that common code calls arm_output_c_attributes. */ + +void arm_lang_object_attributes_init(void) +{ + arm_lang_output_object_attributes_hook = arm_output_c_attributes; +} diff --git a/gcc/config/arm/arm-cores.def b/gcc/config/arm/arm-cores.def new file mode 100644 index 000000000..0bb9aa3ee --- /dev/null +++ b/gcc/config/arm/arm-cores.def @@ -0,0 +1,136 @@ +/* ARM CPU Cores + Copyright (C) 2003, 2005, 2006, 2007, 2008, 2009, 2010 + Free Software Foundation, Inc. + Written by CodeSourcery, LLC + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with GCC; see the file COPYING3. If not see + . */ + +/* Before using #include to read this file, define a macro: + + ARM_CORE(CORE_NAME, CORE_IDENT, ARCH, FLAGS, COSTS) + + The CORE_NAME is the name of the core, represented as a string constant. + The CORE_IDENT is the name of the core, represented as an identifier. + ARCH is the architecture revision implemented by the chip. + FLAGS are the bitwise-or of the traits that apply to that core. + This need not include flags implied by the architecture. + COSTS is the name of the rtx_costs routine to use. + + If you update this table, you must update the "tune" attribute in + arm.md. + + Some tools assume no whitespace up to the first "," in each entry. */ + +/* V2/V2A Architecture Processors */ +ARM_CORE("arm2", arm2, 2, FL_CO_PROC | FL_MODE26, slowmul) +ARM_CORE("arm250", arm250, 2, FL_CO_PROC | FL_MODE26, slowmul) +ARM_CORE("arm3", arm3, 2, FL_CO_PROC | FL_MODE26, slowmul) + +/* V3 Architecture Processors */ +ARM_CORE("arm6", arm6, 3, FL_CO_PROC | FL_MODE26, slowmul) +ARM_CORE("arm60", arm60, 3, FL_CO_PROC | FL_MODE26, slowmul) +ARM_CORE("arm600", arm600, 3, FL_CO_PROC | FL_MODE26 | FL_WBUF, slowmul) +ARM_CORE("arm610", arm610, 3, FL_MODE26 | FL_WBUF, slowmul) +ARM_CORE("arm620", arm620, 3, FL_CO_PROC | FL_MODE26 | FL_WBUF, slowmul) +ARM_CORE("arm7", arm7, 3, FL_CO_PROC | FL_MODE26, slowmul) +ARM_CORE("arm7d", arm7d, 3, FL_CO_PROC | FL_MODE26, slowmul) +ARM_CORE("arm7di", arm7di, 3, FL_CO_PROC | FL_MODE26, slowmul) +ARM_CORE("arm70", arm70, 3, FL_CO_PROC | FL_MODE26, slowmul) +ARM_CORE("arm700", arm700, 3, FL_CO_PROC | FL_MODE26 | FL_WBUF, slowmul) +ARM_CORE("arm700i", arm700i, 3, FL_CO_PROC | FL_MODE26 | FL_WBUF, slowmul) +ARM_CORE("arm710", arm710, 3, FL_MODE26 | FL_WBUF, slowmul) +ARM_CORE("arm720", arm720, 3, FL_MODE26 | FL_WBUF, slowmul) +ARM_CORE("arm710c", arm710c, 3, FL_MODE26 | FL_WBUF, slowmul) +ARM_CORE("arm7100", arm7100, 3, FL_MODE26 | FL_WBUF, slowmul) +ARM_CORE("arm7500", arm7500, 3, FL_MODE26 | FL_WBUF, slowmul) +/* Doesn't have an external co-proc, but does have embedded fpa. */ +ARM_CORE("arm7500fe", arm7500fe, 3, FL_CO_PROC | FL_MODE26 | FL_WBUF, slowmul) + +/* V3M Architecture Processors */ +/* arm7m doesn't exist on its own, but only with D, ("and", and I), but + those don't alter the code, so arm7m is sometimes used. */ +ARM_CORE("arm7m", arm7m, 3M, FL_CO_PROC | FL_MODE26, fastmul) +ARM_CORE("arm7dm", arm7dm, 3M, FL_CO_PROC | FL_MODE26, fastmul) +ARM_CORE("arm7dmi", arm7dmi, 3M, FL_CO_PROC | FL_MODE26, fastmul) + +/* V4 Architecture Processors */ +ARM_CORE("arm8", arm8, 4, FL_MODE26 | FL_LDSCHED, fastmul) +ARM_CORE("arm810", arm810, 4, FL_MODE26 | FL_LDSCHED, fastmul) +ARM_CORE("strongarm", strongarm, 4, FL_MODE26 | FL_LDSCHED | FL_STRONG, fastmul) +ARM_CORE("strongarm110", strongarm110, 4, FL_MODE26 | FL_LDSCHED | FL_STRONG, fastmul) +ARM_CORE("strongarm1100", strongarm1100, 4, FL_MODE26 | FL_LDSCHED | FL_STRONG, fastmul) +ARM_CORE("strongarm1110", strongarm1110, 4, FL_MODE26 | FL_LDSCHED | FL_STRONG, fastmul) +ARM_CORE("fa526", fa526, 4, FL_LDSCHED, fastmul) +ARM_CORE("fa626", fa626, 4, FL_LDSCHED, fastmul) + +/* V4T Architecture Processors */ +ARM_CORE("arm7tdmi", arm7tdmi, 4T, FL_CO_PROC , fastmul) +ARM_CORE("arm7tdmi-s", arm7tdmis, 4T, FL_CO_PROC , fastmul) +ARM_CORE("arm710t", arm710t, 4T, FL_WBUF, fastmul) +ARM_CORE("arm720t", arm720t, 4T, FL_WBUF, fastmul) +ARM_CORE("arm740t", arm740t, 4T, FL_WBUF, fastmul) +ARM_CORE("arm9", arm9, 4T, FL_LDSCHED, fastmul) +ARM_CORE("arm9tdmi", arm9tdmi, 4T, FL_LDSCHED, fastmul) +ARM_CORE("arm920", arm920, 4T, FL_LDSCHED, fastmul) +ARM_CORE("arm920t", arm920t, 4T, FL_LDSCHED, fastmul) +ARM_CORE("arm922t", arm922t, 4T, FL_LDSCHED, fastmul) +ARM_CORE("arm940t", arm940t, 4T, FL_LDSCHED, fastmul) +ARM_CORE("ep9312", ep9312, 4T, FL_LDSCHED | FL_CIRRUS, fastmul) + +/* V5T Architecture Processors */ +ARM_CORE("arm10tdmi", arm10tdmi, 5T, FL_LDSCHED, fastmul) +ARM_CORE("arm1020t", arm1020t, 5T, FL_LDSCHED, fastmul) + +/* V5TE Architecture Processors */ +ARM_CORE("arm9e", arm9e, 5TE, FL_LDSCHED, 9e) +ARM_CORE("arm946e-s", arm946es, 5TE, FL_LDSCHED, 9e) +ARM_CORE("arm966e-s", arm966es, 5TE, FL_LDSCHED, 9e) +ARM_CORE("arm968e-s", arm968es, 5TE, FL_LDSCHED, 9e) +ARM_CORE("arm10e", arm10e, 5TE, FL_LDSCHED, fastmul) +ARM_CORE("arm1020e", arm1020e, 5TE, FL_LDSCHED, fastmul) +ARM_CORE("arm1022e", arm1022e, 5TE, FL_LDSCHED, fastmul) +ARM_CORE("xscale", xscale, 5TE, FL_LDSCHED | FL_STRONG | FL_XSCALE, xscale) +ARM_CORE("iwmmxt", iwmmxt, 5TE, FL_LDSCHED | FL_STRONG | FL_XSCALE | FL_IWMMXT, xscale) +ARM_CORE("iwmmxt2", iwmmxt2, 5TE, FL_LDSCHED | FL_STRONG | FL_XSCALE | FL_IWMMXT, xscale) +ARM_CORE("fa606te", fa606te, 5TE, FL_LDSCHED, 9e) +ARM_CORE("fa626te", fa626te, 5TE, FL_LDSCHED, 9e) +ARM_CORE("fmp626", fmp626, 5TE, FL_LDSCHED, 9e) +ARM_CORE("fa726te", fa726te, 5TE, FL_LDSCHED, fa726te) + +/* V5TEJ Architecture Processors */ +ARM_CORE("arm926ej-s", arm926ejs, 5TEJ, FL_LDSCHED, 9e) +ARM_CORE("arm1026ej-s", arm1026ejs, 5TEJ, FL_LDSCHED, 9e) + +/* V6 Architecture Processors */ +ARM_CORE("arm1136j-s", arm1136js, 6J, FL_LDSCHED, 9e) +ARM_CORE("arm1136jf-s", arm1136jfs, 6J, FL_LDSCHED | FL_VFPV2, 9e) +ARM_CORE("arm1176jz-s", arm1176jzs, 6ZK, FL_LDSCHED, 9e) +ARM_CORE("arm1176jzf-s", arm1176jzfs, 6ZK, FL_LDSCHED | FL_VFPV2, 9e) +ARM_CORE("mpcorenovfp", mpcorenovfp, 6K, FL_LDSCHED, 9e) +ARM_CORE("mpcore", mpcore, 6K, FL_LDSCHED | FL_VFPV2, 9e) +ARM_CORE("arm1156t2-s", arm1156t2s, 6T2, FL_LDSCHED, 9e) +ARM_CORE("arm1156t2f-s", arm1156t2fs, 6T2, FL_LDSCHED | FL_VFPV2, 9e) +ARM_CORE("cortex-a5", cortexa5, 7A, FL_LDSCHED, 9e) +ARM_CORE("cortex-a8", cortexa8, 7A, FL_LDSCHED, 9e) +ARM_CORE("cortex-a9", cortexa9, 7A, FL_LDSCHED, cortex_a9) +ARM_CORE("cortex-a15", cortexa15, 7A, FL_LDSCHED, 9e) +ARM_CORE("cortex-r4", cortexr4, 7R, FL_LDSCHED, 9e) +ARM_CORE("cortex-r4f", cortexr4f, 7R, FL_LDSCHED, 9e) +ARM_CORE("cortex-m4", cortexm4, 7EM, FL_LDSCHED, 9e) +ARM_CORE("cortex-m3", cortexm3, 7M, FL_LDSCHED, 9e) +ARM_CORE("cortex-m1", cortexm1, 6M, FL_LDSCHED, 9e) +ARM_CORE("cortex-m0", cortexm0, 6M, FL_LDSCHED, 9e) diff --git a/gcc/config/arm/arm-generic.md b/gcc/config/arm/arm-generic.md new file mode 100644 index 000000000..44e758692 --- /dev/null +++ b/gcc/config/arm/arm-generic.md @@ -0,0 +1,153 @@ +;; Generic ARM Pipeline Description +;; Copyright (C) 2003, 2007, 2010 Free Software Foundation, Inc. +;; +;; This file is part of GCC. +;; +;; GCC is free software; you can redistribute it and/or modify it +;; under the terms of the GNU General Public License as published by +;; the Free Software Foundation; either version 3, or (at your option) +;; any later version. +;; +;; GCC is distributed in the hope that it will be useful, but +;; WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;; General Public License for more details. +;; +;; You should have received a copy of the GNU General Public License +;; along with GCC; see the file COPYING3. If not see +;; . */ + +(define_automaton "arm") + +;; Write buffer +; +; Strictly, we should model a 4-deep write buffer for ARM7xx based chips +; +; The write buffer on some of the arm6 processors is hard to model exactly. +; There is room in the buffer for up to two addresses and up to eight words +; of memory, but the two needn't be split evenly. When writing the two +; addresses are fully pipelined. However, a read from memory that is not +; currently in the cache will block until the writes have completed. +; It is normally the case that FCLK and MCLK will be in the ratio 2:1, so +; writes will take 2 FCLK cycles per word, if FCLK and MCLK are asynchronous +; (they aren't allowed to be at present) then there is a startup cost of 1MCLK +; cycle to add as well. +(define_cpu_unit "write_buf" "arm") + +;; Write blockage unit +; +; The write_blockage unit models (partially), the fact that reads will stall +; until the write buffer empties. +; The f_mem_r and r_mem_f could also block, but they are to the stack, +; so we don't model them here +(define_cpu_unit "write_blockage" "arm") + +;; Core +; +(define_cpu_unit "core" "arm") + +(define_insn_reservation "r_mem_f_wbuf" 5 + (and (eq_attr "generic_sched" "yes") + (and (eq_attr "model_wbuf" "yes") + (eq_attr "type" "r_mem_f"))) + "core+write_buf*3") + +(define_insn_reservation "store_wbuf" 5 + (and (eq_attr "generic_sched" "yes") + (and (eq_attr "model_wbuf" "yes") + (eq_attr "type" "store1"))) + "core+write_buf*3+write_blockage*5") + +(define_insn_reservation "store2_wbuf" 7 + (and (eq_attr "generic_sched" "yes") + (and (eq_attr "model_wbuf" "yes") + (eq_attr "type" "store2"))) + "core+write_buf*4+write_blockage*7") + +(define_insn_reservation "store3_wbuf" 9 + (and (eq_attr "generic_sched" "yes") + (and (eq_attr "model_wbuf" "yes") + (eq_attr "type" "store3"))) + "core+write_buf*5+write_blockage*9") + +(define_insn_reservation "store4_wbuf" 11 + (and (eq_attr "generic_sched" "yes") + (and (eq_attr "model_wbuf" "yes") + (eq_attr "type" "store4"))) + "core+write_buf*6+write_blockage*11") + +(define_insn_reservation "store2" 3 + (and (eq_attr "generic_sched" "yes") + (and (eq_attr "model_wbuf" "no") + (eq_attr "type" "store2"))) + "core*3") + +(define_insn_reservation "store3" 4 + (and (eq_attr "generic_sched" "yes") + (and (eq_attr "model_wbuf" "no") + (eq_attr "type" "store3"))) + "core*4") + +(define_insn_reservation "store4" 5 + (and (eq_attr "generic_sched" "yes") + (and (eq_attr "model_wbuf" "no") + (eq_attr "type" "store4"))) + "core*5") + +(define_insn_reservation "store_ldsched" 1 + (and (eq_attr "generic_sched" "yes") + (and (eq_attr "ldsched" "yes") + (eq_attr "type" "store1"))) + "core") + +(define_insn_reservation "load_ldsched_xscale" 3 + (and (eq_attr "generic_sched" "yes") + (and (eq_attr "ldsched" "yes") + (and (eq_attr "type" "load_byte,load1") + (eq_attr "tune" "xscale,iwmmxt,iwmmxt2")))) + "core") + +(define_insn_reservation "load_ldsched" 2 + (and (eq_attr "generic_sched" "yes") + (and (eq_attr "ldsched" "yes") + (and (eq_attr "type" "load_byte,load1") + (eq_attr "tune" "!xscale,iwmmxt,iwmmxt2")))) + "core") + +(define_insn_reservation "load_or_store" 2 + (and (eq_attr "generic_sched" "yes") + (and (eq_attr "ldsched" "!yes") + (eq_attr "type" "load_byte,load1,load2,load3,load4,store1"))) + "core*2") + +(define_insn_reservation "mult" 16 + (and (eq_attr "generic_sched" "yes") + (and (eq_attr "ldsched" "no") (eq_attr "type" "mult"))) + "core*16") + +(define_insn_reservation "mult_ldsched_strongarm" 3 + (and (eq_attr "generic_sched" "yes") + (and (eq_attr "ldsched" "yes") + (and (eq_attr "tune" + "strongarm,strongarm110,strongarm1100,strongarm1110") + (eq_attr "type" "mult")))) + "core*2") + +(define_insn_reservation "mult_ldsched" 4 + (and (eq_attr "generic_sched" "yes") + (and (eq_attr "ldsched" "yes") + (and (eq_attr "tune" + "!strongarm,strongarm110,strongarm1100,strongarm1110") + (eq_attr "type" "mult")))) + "core*4") + +(define_insn_reservation "multi_cycle" 32 + (and (eq_attr "generic_sched" "yes") + (and (eq_attr "core_cycles" "multi") + (eq_attr "type" "!mult,load_byte,load1,load2,load3,load4,store1,store2,store3,store4"))) + "core*32") + +(define_insn_reservation "single_cycle" 1 + (and (eq_attr "generic_sched" "yes") + (eq_attr "core_cycles" "single")) + "core") diff --git a/gcc/config/arm/arm-ldmstm.ml b/gcc/config/arm/arm-ldmstm.ml new file mode 100644 index 000000000..221edd2aa --- /dev/null +++ b/gcc/config/arm/arm-ldmstm.ml @@ -0,0 +1,332 @@ +(* Auto-generate ARM ldm/stm patterns + Copyright (C) 2010 Free Software Foundation, Inc. + Contributed by CodeSourcery. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it under + the terms of the GNU General Public License as published by the Free + Software Foundation; either version 3, or (at your option) any later + version. + + GCC is distributed in the hope that it will be useful, but WITHOUT ANY + WARRANTY; without even the implied warranty of MERCHANTABILITY or + FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + for more details. + + You should have received a copy of the GNU General Public License + along with GCC; see the file COPYING3. If not see + . + + This is an O'Caml program. The O'Caml compiler is available from: + + http://caml.inria.fr/ + + Or from your favourite OS's friendly packaging system. Tested with version + 3.09.2, though other versions will probably work too. + + Run with: + ocaml arm-ldmstm.ml >/path/to/gcc/config/arm/ldmstm.md +*) + +type amode = IA | IB | DA | DB + +type optype = IN | OUT | INOUT + +let rec string_of_addrmode addrmode = + match addrmode with + IA -> "ia" | IB -> "ib" | DA -> "da" | DB -> "db" + +let rec initial_offset addrmode nregs = + match addrmode with + IA -> 0 + | IB -> 4 + | DA -> -4 * nregs + 4 + | DB -> -4 * nregs + +let rec final_offset addrmode nregs = + match addrmode with + IA -> nregs * 4 + | IB -> nregs * 4 + | DA -> -4 * nregs + | DB -> -4 * nregs + +let constr thumb = + if thumb then "l" else "rk" + +let inout_constr op_type = + match op_type with + OUT -> "=&" + | INOUT -> "+&" + | IN -> "" + +let destreg nregs first op_type thumb = + if not first then + Printf.sprintf "(match_dup %d)" (nregs + 1) + else + Printf.sprintf ("(match_operand:SI %d \"s_register_operand\" \"%s%s\")") + (nregs + 1) (inout_constr op_type) (constr thumb) + +let write_ldm_set thumb nregs offset opnr first = + let indent = " " in + Printf.printf "%s" (if first then " [" else indent); + Printf.printf "(set (match_operand:SI %d \"arm_hard_register_operand\" \"\")\n" opnr; + Printf.printf "%s (mem:SI " indent; + begin if offset != 0 then Printf.printf "(plus:SI " end; + Printf.printf "%s" (destreg nregs first IN thumb); + begin if offset != 0 then Printf.printf "\n%s (const_int %d))" indent offset end; + Printf.printf "))" + +let write_stm_set thumb nregs offset opnr first = + let indent = " " in + Printf.printf "%s" (if first then " [" else indent); + Printf.printf "(set (mem:SI "; + begin if offset != 0 then Printf.printf "(plus:SI " end; + Printf.printf "%s" (destreg nregs first IN thumb); + begin if offset != 0 then Printf.printf " (const_int %d))" offset end; + Printf.printf ")\n%s (match_operand:SI %d \"arm_hard_register_operand\" \"\"))" indent opnr + +let write_ldm_peep_set extra_indent nregs opnr first = + let indent = " " ^ extra_indent in + Printf.printf "%s" (if first then extra_indent ^ " [" else indent); + Printf.printf "(set (match_operand:SI %d \"s_register_operand\" \"\")\n" opnr; + Printf.printf "%s (match_operand:SI %d \"memory_operand\" \"\"))" indent (nregs + opnr) + +let write_stm_peep_set extra_indent nregs opnr first = + let indent = " " ^ extra_indent in + Printf.printf "%s" (if first then extra_indent ^ " [" else indent); + Printf.printf "(set (match_operand:SI %d \"memory_operand\" \"\")\n" (nregs + opnr); + Printf.printf "%s (match_operand:SI %d \"s_register_operand\" \"\"))" indent opnr + +let write_any_load optype nregs opnr first = + let indent = " " in + Printf.printf "%s" (if first then " [" else indent); + Printf.printf "(set (match_operand:SI %d \"s_register_operand\" \"\")\n" opnr; + Printf.printf "%s (match_operand:SI %d \"%s\" \"\"))" indent (nregs * 2 + opnr) optype + +let write_const_store nregs opnr first = + let indent = " " in + Printf.printf "%s(set (match_operand:SI %d \"memory_operand\" \"\")\n" indent (nregs + opnr); + Printf.printf "%s (match_dup %d))" indent opnr + +let write_const_stm_peep_set nregs opnr first = + write_any_load "const_int_operand" nregs opnr first; + Printf.printf "\n"; + write_const_store nregs opnr false + + +let rec write_pat_sets func opnr offset first n_left = + func offset opnr first; + begin + if n_left > 1 then begin + Printf.printf "\n"; + write_pat_sets func (opnr + 1) (offset + 4) false (n_left - 1); + end else + Printf.printf "]" + end + +let rec write_peep_sets func opnr first n_left = + func opnr first; + begin + if n_left > 1 then begin + Printf.printf "\n"; + write_peep_sets func (opnr + 1) false (n_left - 1); + end + end + +let can_thumb addrmode update is_store = + match addrmode, update, is_store with + (* Thumb1 mode only supports IA with update. However, for LDMIA, + if the address register also appears in the list of loaded + registers, the loaded value is stored, hence the RTL pattern + to describe such an insn does not have an update. We check + in the match_parallel predicate that the condition described + above is met. *) + IA, _, false -> true + | IA, true, true -> true + | _ -> false + +let target addrmode thumb = + match addrmode, thumb with + IA, true -> "TARGET_THUMB1" + | IA, false -> "TARGET_32BIT" + | DB, false -> "TARGET_32BIT" + | _, false -> "TARGET_ARM" + +let write_pattern_1 name ls addrmode nregs write_set_fn update thumb = + let astr = string_of_addrmode addrmode in + Printf.printf "(define_insn \"*%s%s%d_%s%s\"\n" + (if thumb then "thumb_" else "") name nregs astr + (if update then "_update" else ""); + Printf.printf " [(match_parallel 0 \"%s_multiple_operation\"\n" ls; + begin + if update then begin + Printf.printf " [(set %s\n (plus:SI %s" + (destreg nregs true INOUT thumb) (destreg nregs false IN thumb); + Printf.printf " (const_int %d)))\n" + (final_offset addrmode nregs) + end + end; + write_pat_sets + (write_set_fn thumb nregs) 1 + (initial_offset addrmode nregs) + (not update) nregs; + Printf.printf ")]\n \"%s && XVECLEN (operands[0], 0) == %d\"\n" + (target addrmode thumb) + (if update then nregs + 1 else nregs); + Printf.printf " \"%s%%(%s%%)\\t%%%d%s, {" + name astr (nregs + 1) (if update then "!" else ""); + for n = 1 to nregs; do + Printf.printf "%%%d%s" n (if n < nregs then ", " else "") + done; + Printf.printf "}\"\n"; + Printf.printf " [(set_attr \"type\" \"%s%d\")" ls nregs; + begin if not thumb then + Printf.printf "\n (set_attr \"predicable\" \"yes\")"; + end; + Printf.printf "])\n\n" + +let write_ldm_pattern addrmode nregs update = + write_pattern_1 "ldm" "load" addrmode nregs write_ldm_set update false; + begin if can_thumb addrmode update false then + write_pattern_1 "ldm" "load" addrmode nregs write_ldm_set update true; + end + +let write_stm_pattern addrmode nregs update = + write_pattern_1 "stm" "store" addrmode nregs write_stm_set update false; + begin if can_thumb addrmode update true then + write_pattern_1 "stm" "store" addrmode nregs write_stm_set update true; + end + +let write_ldm_commutative_peephole thumb = + let nregs = 2 in + Printf.printf "(define_peephole2\n"; + write_peep_sets (write_ldm_peep_set "" nregs) 0 true nregs; + let indent = " " in + if thumb then begin + Printf.printf "\n%s(set (match_operand:SI %d \"s_register_operand\" \"\")\n" indent (nregs * 2); + Printf.printf "%s (match_operator:SI %d \"commutative_binary_operator\"\n" indent (nregs * 2 + 1); + Printf.printf "%s [(match_operand:SI %d \"s_register_operand\" \"\")\n" indent (nregs * 2 + 2); + Printf.printf "%s (match_operand:SI %d \"s_register_operand\" \"\")]))]\n" indent (nregs * 2 + 3) + end else begin + Printf.printf "\n%s(parallel\n" indent; + Printf.printf "%s [(set (match_operand:SI %d \"s_register_operand\" \"\")\n" indent (nregs * 2); + Printf.printf "%s (match_operator:SI %d \"commutative_binary_operator\"\n" indent (nregs * 2 + 1); + Printf.printf "%s [(match_operand:SI %d \"s_register_operand\" \"\")\n" indent (nregs * 2 + 2); + Printf.printf "%s (match_operand:SI %d \"s_register_operand\" \"\")]))\n" indent (nregs * 2 + 3); + Printf.printf "%s (clobber (reg:CC CC_REGNUM))])]\n" indent + end; + Printf.printf " \"(((operands[%d] == operands[0] && operands[%d] == operands[1])\n" (nregs * 2 + 2) (nregs * 2 + 3); + Printf.printf " || (operands[%d] == operands[0] && operands[%d] == operands[1]))\n" (nregs * 2 + 3) (nregs * 2 + 2); + Printf.printf " && peep2_reg_dead_p (%d, operands[0]) && peep2_reg_dead_p (%d, operands[1]))\"\n" (nregs + 1) (nregs + 1); + begin + if thumb then + Printf.printf " [(set (match_dup %d) (match_op_dup %d [(match_dup %d) (match_dup %d)]))]\n" + (nregs * 2) (nregs * 2 + 1) (nregs * 2 + 2) (nregs * 2 + 3) + else begin + Printf.printf " [(parallel\n"; + Printf.printf " [(set (match_dup %d) (match_op_dup %d [(match_dup %d) (match_dup %d)]))\n" + (nregs * 2) (nregs * 2 + 1) (nregs * 2 + 2) (nregs * 2 + 3); + Printf.printf " (clobber (reg:CC CC_REGNUM))])]\n" + end + end; + Printf.printf "{\n if (!gen_ldm_seq (operands, %d, true))\n FAIL;\n" nregs; + Printf.printf "})\n\n" + +let write_ldm_peephole nregs = + Printf.printf "(define_peephole2\n"; + write_peep_sets (write_ldm_peep_set "" nregs) 0 true nregs; + Printf.printf "]\n \"\"\n [(const_int 0)]\n{\n"; + Printf.printf " if (gen_ldm_seq (operands, %d, false))\n DONE;\n else\n FAIL;\n})\n\n" nregs + +let write_ldm_peephole_b nregs = + if nregs > 2 then begin + Printf.printf "(define_peephole2\n"; + write_ldm_peep_set "" nregs 0 true; + Printf.printf "\n (parallel\n"; + write_peep_sets (write_ldm_peep_set " " nregs) 1 true (nregs - 1); + Printf.printf "])]\n \"\"\n [(const_int 0)]\n{\n"; + Printf.printf " if (gen_ldm_seq (operands, %d, false))\n DONE;\n else\n FAIL;\n})\n\n" nregs + end + +let write_stm_peephole nregs = + Printf.printf "(define_peephole2\n"; + write_peep_sets (write_stm_peep_set "" nregs) 0 true nregs; + Printf.printf "]\n \"\"\n [(const_int 0)]\n{\n"; + Printf.printf " if (gen_stm_seq (operands, %d))\n DONE;\n else\n FAIL;\n})\n\n" nregs + +let write_stm_peephole_b nregs = + if nregs > 2 then begin + Printf.printf "(define_peephole2\n"; + write_stm_peep_set "" nregs 0 true; + Printf.printf "\n (parallel\n"; + write_peep_sets (write_stm_peep_set "" nregs) 1 true (nregs - 1); + Printf.printf "]\n \"\"\n [(const_int 0)]\n{\n"; + Printf.printf " if (gen_stm_seq (operands, %d))\n DONE;\n else\n FAIL;\n})\n\n" nregs + end + +let write_const_stm_peephole_a nregs = + Printf.printf "(define_peephole2\n"; + write_peep_sets (write_const_stm_peep_set nregs) 0 true nregs; + Printf.printf "]\n \"\"\n [(const_int 0)]\n{\n"; + Printf.printf " if (gen_const_stm_seq (operands, %d))\n DONE;\n else\n FAIL;\n})\n\n" nregs + +let write_const_stm_peephole_b nregs = + Printf.printf "(define_peephole2\n"; + write_peep_sets (write_any_load "const_int_operand" nregs) 0 true nregs; + Printf.printf "\n"; + write_peep_sets (write_const_store nregs) 0 false nregs; + Printf.printf "]\n \"\"\n [(const_int 0)]\n{\n"; + Printf.printf " if (gen_const_stm_seq (operands, %d))\n DONE;\n else\n FAIL;\n})\n\n" nregs + +let patterns () = + let addrmodes = [ IA; IB; DA; DB ] in + let sizes = [ 4; 3; 2] in + List.iter + (fun n -> + List.iter + (fun addrmode -> + write_ldm_pattern addrmode n false; + write_ldm_pattern addrmode n true; + write_stm_pattern addrmode n false; + write_stm_pattern addrmode n true) + addrmodes; + write_ldm_peephole n; + write_ldm_peephole_b n; + write_const_stm_peephole_a n; + write_const_stm_peephole_b n; + write_stm_peephole n;) + sizes; + write_ldm_commutative_peephole false; + write_ldm_commutative_peephole true + +let print_lines = List.iter (fun s -> Format.printf "%s@\n" s) + +(* Do it. *) + +let _ = + print_lines [ +"/* ARM ldm/stm instruction patterns. This file was automatically generated"; +" using arm-ldmstm.ml. Please do not edit manually."; +""; +" Copyright (C) 2010 Free Software Foundation, Inc."; +" Contributed by CodeSourcery."; +""; +" This file is part of GCC."; +""; +" GCC is free software; you can redistribute it and/or modify it"; +" under the terms of the GNU General Public License as published"; +" by the Free Software Foundation; either version 3, or (at your"; +" option) any later version."; +""; +" GCC is distributed in the hope that it will be useful, but WITHOUT"; +" ANY WARRANTY; without even the implied warranty of MERCHANTABILITY"; +" or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public"; +" License for more details."; +""; +" You should have received a copy of the GNU General Public License and"; +" a copy of the GCC Runtime Library Exception along with this program;"; +" see the files COPYING3 and COPYING.RUNTIME respectively. If not, see"; +" . */"; +""]; + patterns (); diff --git a/gcc/config/arm/arm-modes.def b/gcc/config/arm/arm-modes.def new file mode 100644 index 000000000..24e3d90a5 --- /dev/null +++ b/gcc/config/arm/arm-modes.def @@ -0,0 +1,78 @@ +/* Definitions of target machine for GNU compiler, for ARM. + Copyright (C) 2002, 2004, 2007, 2010 Free Software Foundation, Inc. + Contributed by Pieter `Tiggr' Schoenmakers (rcpieter@win.tue.nl) + and Martin Simmons (@harleqn.co.uk). + More major hacks by Richard Earnshaw (rearnsha@arm.com) + Minor hacks by Nick Clifton (nickc@cygnus.com) + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3, or (at your + option) any later version. + + GCC is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public + License for more details. + + You should have received a copy of the GNU General Public License + along with GCC; see the file COPYING3. If not see + . */ + +/* Extended precision floating point. + FIXME What format is this? */ +FLOAT_MODE (XF, 12, 0); + +/* Half-precision floating point */ +FLOAT_MODE (HF, 2, 0); +ADJUST_FLOAT_FORMAT (HF, ((arm_fp16_format == ARM_FP16_FORMAT_ALTERNATIVE) + ? &arm_half_format : &ieee_half_format)); + +/* CCFPEmode should be used with floating inequalities, + CCFPmode should be used with floating equalities. + CC_NOOVmode should be used with SImode integer equalities. + CC_Zmode should be used if only the Z flag is set correctly + CC_Cmode should be used if only the C flag is set correctly, after an + addition. + CC_Nmode should be used if only the N (sign) flag is set correctly + CC_CZmode should be used if only the C and Z flags are correct + (used for DImode unsigned comparisons). + CC_NCVmode should be used if only the N, C, and V flags are correct + (used for DImode signed comparisons). + CCmode should be used otherwise. */ + +CC_MODE (CC_NOOV); +CC_MODE (CC_Z); +CC_MODE (CC_CZ); +CC_MODE (CC_NCV); +CC_MODE (CC_SWP); +CC_MODE (CCFP); +CC_MODE (CCFPE); +CC_MODE (CC_DNE); +CC_MODE (CC_DEQ); +CC_MODE (CC_DLE); +CC_MODE (CC_DLT); +CC_MODE (CC_DGE); +CC_MODE (CC_DGT); +CC_MODE (CC_DLEU); +CC_MODE (CC_DLTU); +CC_MODE (CC_DGEU); +CC_MODE (CC_DGTU); +CC_MODE (CC_C); +CC_MODE (CC_N); + +/* Vector modes. */ +VECTOR_MODES (INT, 4); /* V4QI V2HI */ +VECTOR_MODES (INT, 8); /* V8QI V4HI V2SI */ +VECTOR_MODES (INT, 16); /* V16QI V8HI V4SI V2DI */ +VECTOR_MODES (FLOAT, 8); /* V4HF V2SF */ +VECTOR_MODES (FLOAT, 16); /* V8HF V4SF V2DF */ + +/* Opaque integer modes for 3, 4, 6 or 8 Neon double registers (2 is + TImode). */ +INT_MODE (EI, 24); +INT_MODE (OI, 32); +INT_MODE (CI, 48); +INT_MODE (XI, 64); diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h new file mode 100644 index 000000000..f037a456a --- /dev/null +++ b/gcc/config/arm/arm-protos.h @@ -0,0 +1,231 @@ +/* Prototypes for exported functions defined in arm.c and pe.c + Copyright (C) 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, + 2009, 2010 Free Software Foundation, Inc. + Contributed by Richard Earnshaw (rearnsha@arm.com) + Minor hacks by Nick Clifton (nickc@cygnus.com) + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with GCC; see the file COPYING3. If not see + . */ + +#ifndef GCC_ARM_PROTOS_H +#define GCC_ARM_PROTOS_H + +extern int use_return_insn (int, rtx); +extern enum reg_class arm_regno_class (int); +extern void arm_load_pic_register (unsigned long); +extern int arm_volatile_func (void); +extern const char *arm_output_epilogue (rtx); +extern void arm_expand_prologue (void); +extern const char *arm_strip_name_encoding (const char *); +extern void arm_asm_output_labelref (FILE *, const char *); +extern void thumb2_asm_output_opcode (FILE *); +extern unsigned long arm_current_func_type (void); +extern HOST_WIDE_INT arm_compute_initial_elimination_offset (unsigned int, + unsigned int); +extern HOST_WIDE_INT thumb_compute_initial_elimination_offset (unsigned int, + unsigned int); +extern unsigned int arm_dbx_register_number (unsigned int); +extern void arm_output_fn_unwind (FILE *, bool); + + +#ifdef RTX_CODE +extern bool arm_vector_mode_supported_p (enum machine_mode); +extern bool arm_small_register_classes_for_mode_p (enum machine_mode); +extern int arm_hard_regno_mode_ok (unsigned int, enum machine_mode); +extern int const_ok_for_arm (HOST_WIDE_INT); +extern int arm_split_constant (RTX_CODE, enum machine_mode, rtx, + HOST_WIDE_INT, rtx, rtx, int); +extern RTX_CODE arm_canonicalize_comparison (RTX_CODE, rtx *, rtx *); +extern int legitimate_pic_operand_p (rtx); +extern rtx legitimize_pic_address (rtx, enum machine_mode, rtx); +extern rtx legitimize_tls_address (rtx, rtx); +extern int arm_legitimate_address_outer_p (enum machine_mode, rtx, RTX_CODE, int); +extern int thumb_legitimate_offset_p (enum machine_mode, HOST_WIDE_INT); +extern bool arm_legitimize_reload_address (rtx *, enum machine_mode, int, int, + int); +extern rtx thumb_legitimize_reload_address (rtx *, enum machine_mode, int, int, + int); +extern int arm_const_double_rtx (rtx); +extern int neg_const_double_rtx_ok_for_fpa (rtx); +extern int vfp3_const_double_rtx (rtx); +extern int neon_immediate_valid_for_move (rtx, enum machine_mode, rtx *, int *); +extern int neon_immediate_valid_for_logic (rtx, enum machine_mode, int, rtx *, + int *); +extern char *neon_output_logic_immediate (const char *, rtx *, + enum machine_mode, int, int); +extern void neon_pairwise_reduce (rtx, rtx, enum machine_mode, + rtx (*) (rtx, rtx, rtx)); +extern rtx neon_make_constant (rtx); +extern void neon_expand_vector_init (rtx, rtx); +extern void neon_lane_bounds (rtx, HOST_WIDE_INT, HOST_WIDE_INT); +extern void neon_const_bounds (rtx, HOST_WIDE_INT, HOST_WIDE_INT); +extern HOST_WIDE_INT neon_element_bits (enum machine_mode); +extern void neon_reinterpret (rtx, rtx); +extern void neon_emit_pair_result_insn (enum machine_mode, + rtx (*) (rtx, rtx, rtx, rtx), + rtx, rtx, rtx); +extern void neon_disambiguate_copy (rtx *, rtx *, rtx *, unsigned int); +extern enum reg_class coproc_secondary_reload_class (enum machine_mode, rtx, + bool); +extern bool arm_tls_referenced_p (rtx); +extern bool arm_cannot_force_const_mem (rtx); + +extern int cirrus_memory_offset (rtx); +extern int arm_coproc_mem_operand (rtx, bool); +extern int neon_vector_mem_operand (rtx, int); +extern int neon_struct_mem_operand (rtx); +extern int arm_no_early_store_addr_dep (rtx, rtx); +extern int arm_early_store_addr_dep (rtx, rtx); +extern int arm_early_load_addr_dep (rtx, rtx); +extern int arm_no_early_alu_shift_dep (rtx, rtx); +extern int arm_no_early_alu_shift_value_dep (rtx, rtx); +extern int arm_no_early_mul_dep (rtx, rtx); +extern int arm_mac_accumulator_is_mul_result (rtx, rtx); + +extern int tls_mentioned_p (rtx); +extern int symbol_mentioned_p (rtx); +extern int label_mentioned_p (rtx); +extern RTX_CODE minmax_code (rtx); +extern int adjacent_mem_locations (rtx, rtx); +extern bool gen_ldm_seq (rtx *, int, bool); +extern bool gen_stm_seq (rtx *, int); +extern bool gen_const_stm_seq (rtx *, int); +extern rtx arm_gen_load_multiple (int *, int, rtx, int, rtx, HOST_WIDE_INT *); +extern rtx arm_gen_store_multiple (int *, int, rtx, int, rtx, HOST_WIDE_INT *); +extern int arm_gen_movmemqi (rtx *); +extern enum machine_mode arm_select_cc_mode (RTX_CODE, rtx, rtx); +extern enum machine_mode arm_select_dominance_cc_mode (rtx, rtx, + HOST_WIDE_INT); +extern rtx arm_gen_compare_reg (RTX_CODE, rtx, rtx); +extern rtx arm_gen_return_addr_mask (void); +extern void arm_reload_in_hi (rtx *); +extern void arm_reload_out_hi (rtx *); +extern int arm_const_double_inline_cost (rtx); +extern bool arm_const_double_by_parts (rtx); +extern bool arm_const_double_by_immediates (rtx); +extern const char *fp_immediate_constant (rtx); +extern void arm_emit_call_insn (rtx, rtx); +extern const char *output_call (rtx *); +extern const char *output_call_mem (rtx *); +void arm_emit_movpair (rtx, rtx); +extern const char *output_mov_long_double_fpa_from_arm (rtx *); +extern const char *output_mov_long_double_arm_from_fpa (rtx *); +extern const char *output_mov_long_double_arm_from_arm (rtx *); +extern const char *output_mov_double_fpa_from_arm (rtx *); +extern const char *output_mov_double_arm_from_fpa (rtx *); +extern const char *output_move_double (rtx *); +extern const char *output_move_quad (rtx *); +extern const char *output_move_vfp (rtx *operands); +extern const char *output_move_neon (rtx *operands); +extern int arm_attr_length_move_neon (rtx); +extern int arm_address_offset_is_imm (rtx); +extern const char *output_add_immediate (rtx *); +extern const char *arithmetic_instr (rtx, int); +extern void output_ascii_pseudo_op (FILE *, const unsigned char *, int); +extern const char *output_return_instruction (rtx, int, int); +extern void arm_poke_function_name (FILE *, const char *); +extern void arm_final_prescan_insn (rtx); +extern int arm_debugger_arg_offset (int, rtx); +extern bool arm_is_long_call_p (tree); +extern int arm_emit_vector_const (FILE *, rtx); +extern void arm_emit_fp16_const (rtx c); +extern const char * arm_output_load_gr (rtx *); +extern const char *vfp_output_fstmd (rtx *); +extern void arm_set_return_address (rtx, rtx); +extern int arm_eliminable_register (rtx); +extern const char *arm_output_shift(rtx *, int); +extern void arm_expand_sync (enum machine_mode, struct arm_sync_generator *, + rtx, rtx, rtx, rtx); +extern const char *arm_output_memory_barrier (rtx *); +extern const char *arm_output_sync_insn (rtx, rtx *); +extern unsigned int arm_sync_loop_insns (rtx , rtx *); + +#if defined TREE_CODE +extern void arm_init_cumulative_args (CUMULATIVE_ARGS *, tree, rtx, tree); +extern bool arm_pad_arg_upward (enum machine_mode, const_tree); +extern bool arm_pad_reg_upward (enum machine_mode, tree, int); +#endif +extern int arm_apply_result_size (void); +extern rtx aapcs_libcall_value (enum machine_mode); + +#endif /* RTX_CODE */ + +extern int arm_float_words_big_endian (void); + +/* Thumb functions. */ +extern void arm_init_expanders (void); +extern const char *thumb_unexpanded_epilogue (void); +extern void thumb1_expand_prologue (void); +extern void thumb1_expand_epilogue (void); +#ifdef TREE_CODE +extern int is_called_in_ARM_mode (tree); +#endif +extern int thumb_shiftable_const (unsigned HOST_WIDE_INT); +#ifdef RTX_CODE +extern void thumb1_final_prescan_insn (rtx); +extern void thumb2_final_prescan_insn (rtx); +extern const char *thumb_load_double_from_address (rtx *); +extern const char *thumb_output_move_mem_multiple (int, rtx *); +extern const char *thumb_call_via_reg (rtx); +extern void thumb_expand_movmemqi (rtx *); +extern rtx arm_return_addr (int, rtx); +extern void thumb_reload_out_hi (rtx *); +extern void thumb_reload_in_hi (rtx *); +extern void thumb_set_return_address (rtx, rtx); +extern const char *thumb1_output_casesi (rtx *); +extern const char *thumb2_output_casesi (rtx *); +#endif + +/* Defined in pe.c. */ +extern int arm_dllexport_name_p (const char *); +extern int arm_dllimport_name_p (const char *); + +#ifdef TREE_CODE +extern void arm_pe_unique_section (tree, int); +extern void arm_pe_encode_section_info (tree, rtx, int); +extern int arm_dllexport_p (tree); +extern int arm_dllimport_p (tree); +extern void arm_mark_dllexport (tree); +extern void arm_mark_dllimport (tree); +#endif + +extern void arm_pr_long_calls (struct cpp_reader *); +extern void arm_pr_no_long_calls (struct cpp_reader *); +extern void arm_pr_long_calls_off (struct cpp_reader *); + +extern void arm_lang_object_attributes_init(void); + +extern const char *arm_mangle_type (const_tree); + +extern void arm_order_regs_for_local_alloc (void); + +#ifdef RTX_CODE +/* This needs to be here because we need RTX_CODE and similar. */ + +struct tune_params +{ + bool (*rtx_costs) (rtx, RTX_CODE, RTX_CODE, int *, bool); + bool (*sched_adjust_cost) (rtx, rtx, rtx, int *); + int constant_limit; + int num_prefetch_slots; + int l1_cache_size; + int l1_cache_line_size; +}; + +extern const struct tune_params *current_tune; +#endif /* RTX_CODE */ + +#endif /* ! GCC_ARM_PROTOS_H */ diff --git a/gcc/config/arm/arm-tune.md b/gcc/config/arm/arm-tune.md new file mode 100644 index 000000000..9b664e7e6 --- /dev/null +++ b/gcc/config/arm/arm-tune.md @@ -0,0 +1,5 @@ +;; -*- buffer-read-only: t -*- +;; Generated automatically by gentune.sh from arm-cores.def +(define_attr "tune" + "arm2,arm250,arm3,arm6,arm60,arm600,arm610,arm620,arm7,arm7d,arm7di,arm70,arm700,arm700i,arm710,arm720,arm710c,arm7100,arm7500,arm7500fe,arm7m,arm7dm,arm7dmi,arm8,arm810,strongarm,strongarm110,strongarm1100,strongarm1110,fa526,fa626,arm7tdmi,arm7tdmis,arm710t,arm720t,arm740t,arm9,arm9tdmi,arm920,arm920t,arm922t,arm940t,ep9312,arm10tdmi,arm1020t,arm9e,arm946es,arm966es,arm968es,arm10e,arm1020e,arm1022e,xscale,iwmmxt,iwmmxt2,fa606te,fa626te,fmp626,fa726te,arm926ejs,arm1026ejs,arm1136js,arm1136jfs,arm1176jzs,arm1176jzfs,mpcorenovfp,mpcore,arm1156t2s,arm1156t2fs,cortexa5,cortexa8,cortexa9,cortexa15,cortexr4,cortexr4f,cortexm4,cortexm3,cortexm1,cortexm0" + (const (symbol_ref "((enum attr_tune) arm_tune)"))) diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c new file mode 100644 index 000000000..c3c5aa168 --- /dev/null +++ b/gcc/config/arm/arm.c @@ -0,0 +1,23712 @@ +/* Output routines for GCC for ARM. + Copyright (C) 1991, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, + 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 + Free Software Foundation, Inc. + Contributed by Pieter `Tiggr' Schoenmakers (rcpieter@win.tue.nl) + and Martin Simmons (@harleqn.co.uk). + More major hacks by Richard Earnshaw (rearnsha@arm.com). + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3, or (at your + option) any later version. + + GCC is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public + License for more details. + + You should have received a copy of the GNU General Public License + along with GCC; see the file COPYING3. If not see + . */ + +#include "config.h" +#include "system.h" +#include "coretypes.h" +#include "tm.h" +#include "rtl.h" +#include "tree.h" +#include "obstack.h" +#include "regs.h" +#include "hard-reg-set.h" +#include "insn-config.h" +#include "conditions.h" +#include "output.h" +#include "insn-attr.h" +#include "flags.h" +#include "reload.h" +#include "function.h" +#include "expr.h" +#include "optabs.h" +#include "diagnostic-core.h" +#include "recog.h" +#include "cgraph.h" +#include "ggc.h" +#include "except.h" +#include "c-family/c-pragma.h" /* ??? */ +#include "integrate.h" +#include "tm_p.h" +#include "target.h" +#include "target-def.h" +#include "debug.h" +#include "langhooks.h" +#include "df.h" +#include "intl.h" +#include "libfuncs.h" +#include "params.h" + +/* Forward definitions of types. */ +typedef struct minipool_node Mnode; +typedef struct minipool_fixup Mfix; + +void (*arm_lang_output_object_attributes_hook)(void); + +/* Forward function declarations. */ +static bool arm_needs_doubleword_align (enum machine_mode, const_tree); +static int arm_compute_static_chain_stack_bytes (void); +static arm_stack_offsets *arm_get_frame_offsets (void); +static void arm_add_gc_roots (void); +static int arm_gen_constant (enum rtx_code, enum machine_mode, rtx, + HOST_WIDE_INT, rtx, rtx, int, int); +static unsigned bit_count (unsigned long); +static int arm_address_register_rtx_p (rtx, int); +static int arm_legitimate_index_p (enum machine_mode, rtx, RTX_CODE, int); +static int thumb2_legitimate_index_p (enum machine_mode, rtx, int); +static int thumb1_base_register_rtx_p (rtx, enum machine_mode, int); +static rtx arm_legitimize_address (rtx, rtx, enum machine_mode); +static rtx thumb_legitimize_address (rtx, rtx, enum machine_mode); +inline static int thumb1_index_register_rtx_p (rtx, int); +static bool arm_legitimate_address_p (enum machine_mode, rtx, bool); +static int thumb_far_jump_used_p (void); +static bool thumb_force_lr_save (void); +static int const_ok_for_op (HOST_WIDE_INT, enum rtx_code); +static rtx emit_sfm (int, int); +static unsigned arm_size_return_regs (void); +static bool arm_assemble_integer (rtx, unsigned int, int); +static void arm_print_operand (FILE *, rtx, int); +static void arm_print_operand_address (FILE *, rtx); +static bool arm_print_operand_punct_valid_p (unsigned char code); +static const char *fp_const_from_val (REAL_VALUE_TYPE *); +static arm_cc get_arm_condition_code (rtx); +static HOST_WIDE_INT int_log2 (HOST_WIDE_INT); +static rtx is_jump_table (rtx); +static const char *output_multi_immediate (rtx *, const char *, const char *, + int, HOST_WIDE_INT); +static const char *shift_op (rtx, HOST_WIDE_INT *); +static struct machine_function *arm_init_machine_status (void); +static void thumb_exit (FILE *, int); +static rtx is_jump_table (rtx); +static HOST_WIDE_INT get_jump_table_size (rtx); +static Mnode *move_minipool_fix_forward_ref (Mnode *, Mnode *, HOST_WIDE_INT); +static Mnode *add_minipool_forward_ref (Mfix *); +static Mnode *move_minipool_fix_backward_ref (Mnode *, Mnode *, HOST_WIDE_INT); +static Mnode *add_minipool_backward_ref (Mfix *); +static void assign_minipool_offsets (Mfix *); +static void arm_print_value (FILE *, rtx); +static void dump_minipool (rtx); +static int arm_barrier_cost (rtx); +static Mfix *create_fix_barrier (Mfix *, HOST_WIDE_INT); +static void push_minipool_barrier (rtx, HOST_WIDE_INT); +static void push_minipool_fix (rtx, HOST_WIDE_INT, rtx *, enum machine_mode, + rtx); +static void arm_reorg (void); +static bool note_invalid_constants (rtx, HOST_WIDE_INT, int); +static unsigned long arm_compute_save_reg0_reg12_mask (void); +static unsigned long arm_compute_save_reg_mask (void); +static unsigned long arm_isr_value (tree); +static unsigned long arm_compute_func_type (void); +static tree arm_handle_fndecl_attribute (tree *, tree, tree, int, bool *); +static tree arm_handle_pcs_attribute (tree *, tree, tree, int, bool *); +static tree arm_handle_isr_attribute (tree *, tree, tree, int, bool *); +#if TARGET_DLLIMPORT_DECL_ATTRIBUTES +static tree arm_handle_notshared_attribute (tree *, tree, tree, int, bool *); +#endif +static void arm_output_function_epilogue (FILE *, HOST_WIDE_INT); +static void arm_output_function_prologue (FILE *, HOST_WIDE_INT); +static void thumb1_output_function_prologue (FILE *, HOST_WIDE_INT); +static int arm_comp_type_attributes (const_tree, const_tree); +static void arm_set_default_type_attributes (tree); +static int arm_adjust_cost (rtx, rtx, rtx, int); +static int count_insns_for_constant (HOST_WIDE_INT, int); +static int arm_get_strip_length (int); +static bool arm_function_ok_for_sibcall (tree, tree); +static enum machine_mode arm_promote_function_mode (const_tree, + enum machine_mode, int *, + const_tree, int); +static bool arm_return_in_memory (const_tree, const_tree); +static rtx arm_function_value (const_tree, const_tree, bool); +static rtx arm_libcall_value (enum machine_mode, const_rtx); + +static void arm_internal_label (FILE *, const char *, unsigned long); +static void arm_output_mi_thunk (FILE *, tree, HOST_WIDE_INT, HOST_WIDE_INT, + tree); +static bool arm_have_conditional_execution (void); +static bool arm_rtx_costs_1 (rtx, enum rtx_code, int*, bool); +static bool arm_size_rtx_costs (rtx, enum rtx_code, enum rtx_code, int *); +static bool arm_slowmul_rtx_costs (rtx, enum rtx_code, enum rtx_code, int *, bool); +static bool arm_fastmul_rtx_costs (rtx, enum rtx_code, enum rtx_code, int *, bool); +static bool arm_xscale_rtx_costs (rtx, enum rtx_code, enum rtx_code, int *, bool); +static bool arm_9e_rtx_costs (rtx, enum rtx_code, enum rtx_code, int *, bool); +static bool arm_rtx_costs (rtx, int, int, int *, bool); +static int arm_address_cost (rtx, bool); +static bool arm_memory_load_p (rtx); +static bool arm_cirrus_insn_p (rtx); +static void cirrus_reorg (rtx); +static void arm_init_builtins (void); +static void arm_init_iwmmxt_builtins (void); +static rtx safe_vector_operand (rtx, enum machine_mode); +static rtx arm_expand_binop_builtin (enum insn_code, tree, rtx); +static rtx arm_expand_unop_builtin (enum insn_code, tree, rtx, int); +static rtx arm_expand_builtin (tree, rtx, rtx, enum machine_mode, int); +static void emit_constant_insn (rtx cond, rtx pattern); +static rtx emit_set_insn (rtx, rtx); +static int arm_arg_partial_bytes (CUMULATIVE_ARGS *, enum machine_mode, + tree, bool); +static rtx arm_function_arg (CUMULATIVE_ARGS *, enum machine_mode, + const_tree, bool); +static void arm_function_arg_advance (CUMULATIVE_ARGS *, enum machine_mode, + const_tree, bool); +static unsigned int arm_function_arg_boundary (enum machine_mode, const_tree); +static rtx aapcs_allocate_return_reg (enum machine_mode, const_tree, + const_tree); +static int aapcs_select_return_coproc (const_tree, const_tree); + +#ifdef OBJECT_FORMAT_ELF +static void arm_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED; +static void arm_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED; +#endif +#ifndef ARM_PE +static void arm_encode_section_info (tree, rtx, int); +#endif + +static void arm_file_end (void); +static void arm_file_start (void); + +static void arm_setup_incoming_varargs (CUMULATIVE_ARGS *, enum machine_mode, + tree, int *, int); +static bool arm_pass_by_reference (CUMULATIVE_ARGS *, + enum machine_mode, const_tree, bool); +static bool arm_promote_prototypes (const_tree); +static bool arm_default_short_enums (void); +static bool arm_align_anon_bitfield (void); +static bool arm_return_in_msb (const_tree); +static bool arm_must_pass_in_stack (enum machine_mode, const_tree); +static bool arm_return_in_memory (const_tree, const_tree); +#if ARM_UNWIND_INFO +static void arm_unwind_emit (FILE *, rtx); +static bool arm_output_ttype (rtx); +static void arm_asm_emit_except_personality (rtx); +static void arm_asm_init_sections (void); +#endif +static enum unwind_info_type arm_except_unwind_info (struct gcc_options *); +static void arm_dwarf_handle_frame_unspec (const char *, rtx, int); +static rtx arm_dwarf_register_span (rtx); + +static tree arm_cxx_guard_type (void); +static bool arm_cxx_guard_mask_bit (void); +static tree arm_get_cookie_size (tree); +static bool arm_cookie_has_size (void); +static bool arm_cxx_cdtor_returns_this (void); +static bool arm_cxx_key_method_may_be_inline (void); +static void arm_cxx_determine_class_data_visibility (tree); +static bool arm_cxx_class_data_always_comdat (void); +static bool arm_cxx_use_aeabi_atexit (void); +static void arm_init_libfuncs (void); +static tree arm_build_builtin_va_list (void); +static void arm_expand_builtin_va_start (tree, rtx); +static tree arm_gimplify_va_arg_expr (tree, tree, gimple_seq *, gimple_seq *); +static void arm_option_override (void); +static bool arm_handle_option (size_t, const char *, int); +static void arm_target_help (void); +static unsigned HOST_WIDE_INT arm_shift_truncation_mask (enum machine_mode); +static bool arm_cannot_copy_insn_p (rtx); +static bool arm_tls_symbol_p (rtx x); +static int arm_issue_rate (void); +static void arm_output_dwarf_dtprel (FILE *, int, rtx) ATTRIBUTE_UNUSED; +static bool arm_output_addr_const_extra (FILE *, rtx); +static bool arm_allocate_stack_slots_for_args (void); +static const char *arm_invalid_parameter_type (const_tree t); +static const char *arm_invalid_return_type (const_tree t); +static tree arm_promoted_type (const_tree t); +static tree arm_convert_to_type (tree type, tree expr); +static bool arm_scalar_mode_supported_p (enum machine_mode); +static bool arm_frame_pointer_required (void); +static bool arm_can_eliminate (const int, const int); +static void arm_asm_trampoline_template (FILE *); +static void arm_trampoline_init (rtx, tree, rtx); +static rtx arm_trampoline_adjust_address (rtx); +static rtx arm_pic_static_addr (rtx orig, rtx reg); +static bool cortex_a9_sched_adjust_cost (rtx, rtx, rtx, int *); +static bool xscale_sched_adjust_cost (rtx, rtx, rtx, int *); +static bool fa726te_sched_adjust_cost (rtx, rtx, rtx, int *); +static enum machine_mode arm_preferred_simd_mode (enum machine_mode); +static bool arm_class_likely_spilled_p (reg_class_t); +static HOST_WIDE_INT arm_vector_alignment (const_tree type); +static bool arm_vector_alignment_reachable (const_tree type, bool is_packed); +static bool arm_builtin_support_vector_misalignment (enum machine_mode mode, + const_tree type, + int misalignment, + bool is_packed); +static void arm_conditional_register_usage (void); +static reg_class_t arm_preferred_rename_class (reg_class_t rclass); + + +/* Table of machine attributes. */ +static const struct attribute_spec arm_attribute_table[] = +{ + /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler } */ + /* Function calls made to this symbol must be done indirectly, because + it may lie outside of the 26 bit addressing range of a normal function + call. */ + { "long_call", 0, 0, false, true, true, NULL }, + /* Whereas these functions are always known to reside within the 26 bit + addressing range. */ + { "short_call", 0, 0, false, true, true, NULL }, + /* Specify the procedure call conventions for a function. */ + { "pcs", 1, 1, false, true, true, arm_handle_pcs_attribute }, + /* Interrupt Service Routines have special prologue and epilogue requirements. */ + { "isr", 0, 1, false, false, false, arm_handle_isr_attribute }, + { "interrupt", 0, 1, false, false, false, arm_handle_isr_attribute }, + { "naked", 0, 0, true, false, false, arm_handle_fndecl_attribute }, +#ifdef ARM_PE + /* ARM/PE has three new attributes: + interfacearm - ? + dllexport - for exporting a function/variable that will live in a dll + dllimport - for importing a function/variable from a dll + + Microsoft allows multiple declspecs in one __declspec, separating + them with spaces. We do NOT support this. Instead, use __declspec + multiple times. + */ + { "dllimport", 0, 0, true, false, false, NULL }, + { "dllexport", 0, 0, true, false, false, NULL }, + { "interfacearm", 0, 0, true, false, false, arm_handle_fndecl_attribute }, +#elif TARGET_DLLIMPORT_DECL_ATTRIBUTES + { "dllimport", 0, 0, false, false, false, handle_dll_attribute }, + { "dllexport", 0, 0, false, false, false, handle_dll_attribute }, + { "notshared", 0, 0, false, true, false, arm_handle_notshared_attribute }, +#endif + { NULL, 0, 0, false, false, false, NULL } +}; + +/* Set default optimization options. */ +static const struct default_options arm_option_optimization_table[] = + { + /* Enable section anchors by default at -O1 or higher. */ + { OPT_LEVELS_1_PLUS, OPT_fsection_anchors, NULL, 1 }, + { OPT_LEVELS_1_PLUS, OPT_fomit_frame_pointer, NULL, 1 }, + { OPT_LEVELS_NONE, 0, NULL, 0 } + }; + +/* Initialize the GCC target structure. */ +#if TARGET_DLLIMPORT_DECL_ATTRIBUTES +#undef TARGET_MERGE_DECL_ATTRIBUTES +#define TARGET_MERGE_DECL_ATTRIBUTES merge_dllimport_decl_attributes +#endif + +#undef TARGET_LEGITIMIZE_ADDRESS +#define TARGET_LEGITIMIZE_ADDRESS arm_legitimize_address + +#undef TARGET_ATTRIBUTE_TABLE +#define TARGET_ATTRIBUTE_TABLE arm_attribute_table + +#undef TARGET_ASM_FILE_START +#define TARGET_ASM_FILE_START arm_file_start +#undef TARGET_ASM_FILE_END +#define TARGET_ASM_FILE_END arm_file_end + +#undef TARGET_ASM_ALIGNED_SI_OP +#define TARGET_ASM_ALIGNED_SI_OP NULL +#undef TARGET_ASM_INTEGER +#define TARGET_ASM_INTEGER arm_assemble_integer + +#undef TARGET_PRINT_OPERAND +#define TARGET_PRINT_OPERAND arm_print_operand +#undef TARGET_PRINT_OPERAND_ADDRESS +#define TARGET_PRINT_OPERAND_ADDRESS arm_print_operand_address +#undef TARGET_PRINT_OPERAND_PUNCT_VALID_P +#define TARGET_PRINT_OPERAND_PUNCT_VALID_P arm_print_operand_punct_valid_p + +#undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA +#define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA arm_output_addr_const_extra + +#undef TARGET_ASM_FUNCTION_PROLOGUE +#define TARGET_ASM_FUNCTION_PROLOGUE arm_output_function_prologue + +#undef TARGET_ASM_FUNCTION_EPILOGUE +#define TARGET_ASM_FUNCTION_EPILOGUE arm_output_function_epilogue + +#undef TARGET_DEFAULT_TARGET_FLAGS +#define TARGET_DEFAULT_TARGET_FLAGS (TARGET_DEFAULT | MASK_SCHED_PROLOG) +#undef TARGET_HANDLE_OPTION +#define TARGET_HANDLE_OPTION arm_handle_option +#undef TARGET_HELP +#define TARGET_HELP arm_target_help +#undef TARGET_OPTION_OVERRIDE +#define TARGET_OPTION_OVERRIDE arm_option_override +#undef TARGET_OPTION_OPTIMIZATION_TABLE +#define TARGET_OPTION_OPTIMIZATION_TABLE arm_option_optimization_table + +#undef TARGET_COMP_TYPE_ATTRIBUTES +#define TARGET_COMP_TYPE_ATTRIBUTES arm_comp_type_attributes + +#undef TARGET_SET_DEFAULT_TYPE_ATTRIBUTES +#define TARGET_SET_DEFAULT_TYPE_ATTRIBUTES arm_set_default_type_attributes + +#undef TARGET_SCHED_ADJUST_COST +#define TARGET_SCHED_ADJUST_COST arm_adjust_cost + +#undef TARGET_ENCODE_SECTION_INFO +#ifdef ARM_PE +#define TARGET_ENCODE_SECTION_INFO arm_pe_encode_section_info +#else +#define TARGET_ENCODE_SECTION_INFO arm_encode_section_info +#endif + +#undef TARGET_STRIP_NAME_ENCODING +#define TARGET_STRIP_NAME_ENCODING arm_strip_name_encoding + +#undef TARGET_ASM_INTERNAL_LABEL +#define TARGET_ASM_INTERNAL_LABEL arm_internal_label + +#undef TARGET_FUNCTION_OK_FOR_SIBCALL +#define TARGET_FUNCTION_OK_FOR_SIBCALL arm_function_ok_for_sibcall + +#undef TARGET_FUNCTION_VALUE +#define TARGET_FUNCTION_VALUE arm_function_value + +#undef TARGET_LIBCALL_VALUE +#define TARGET_LIBCALL_VALUE arm_libcall_value + +#undef TARGET_ASM_OUTPUT_MI_THUNK +#define TARGET_ASM_OUTPUT_MI_THUNK arm_output_mi_thunk +#undef TARGET_ASM_CAN_OUTPUT_MI_THUNK +#define TARGET_ASM_CAN_OUTPUT_MI_THUNK default_can_output_mi_thunk_no_vcall + +#undef TARGET_RTX_COSTS +#define TARGET_RTX_COSTS arm_rtx_costs +#undef TARGET_ADDRESS_COST +#define TARGET_ADDRESS_COST arm_address_cost + +#undef TARGET_SHIFT_TRUNCATION_MASK +#define TARGET_SHIFT_TRUNCATION_MASK arm_shift_truncation_mask +#undef TARGET_VECTOR_MODE_SUPPORTED_P +#define TARGET_VECTOR_MODE_SUPPORTED_P arm_vector_mode_supported_p +#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE +#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE arm_preferred_simd_mode + +#undef TARGET_MACHINE_DEPENDENT_REORG +#define TARGET_MACHINE_DEPENDENT_REORG arm_reorg + +#undef TARGET_INIT_BUILTINS +#define TARGET_INIT_BUILTINS arm_init_builtins +#undef TARGET_EXPAND_BUILTIN +#define TARGET_EXPAND_BUILTIN arm_expand_builtin + +#undef TARGET_INIT_LIBFUNCS +#define TARGET_INIT_LIBFUNCS arm_init_libfuncs + +#undef TARGET_PROMOTE_FUNCTION_MODE +#define TARGET_PROMOTE_FUNCTION_MODE arm_promote_function_mode +#undef TARGET_PROMOTE_PROTOTYPES +#define TARGET_PROMOTE_PROTOTYPES arm_promote_prototypes +#undef TARGET_PASS_BY_REFERENCE +#define TARGET_PASS_BY_REFERENCE arm_pass_by_reference +#undef TARGET_ARG_PARTIAL_BYTES +#define TARGET_ARG_PARTIAL_BYTES arm_arg_partial_bytes +#undef TARGET_FUNCTION_ARG +#define TARGET_FUNCTION_ARG arm_function_arg +#undef TARGET_FUNCTION_ARG_ADVANCE +#define TARGET_FUNCTION_ARG_ADVANCE arm_function_arg_advance +#undef TARGET_FUNCTION_ARG_BOUNDARY +#define TARGET_FUNCTION_ARG_BOUNDARY arm_function_arg_boundary + +#undef TARGET_SETUP_INCOMING_VARARGS +#define TARGET_SETUP_INCOMING_VARARGS arm_setup_incoming_varargs + +#undef TARGET_ALLOCATE_STACK_SLOTS_FOR_ARGS +#define TARGET_ALLOCATE_STACK_SLOTS_FOR_ARGS arm_allocate_stack_slots_for_args + +#undef TARGET_ASM_TRAMPOLINE_TEMPLATE +#define TARGET_ASM_TRAMPOLINE_TEMPLATE arm_asm_trampoline_template +#undef TARGET_TRAMPOLINE_INIT +#define TARGET_TRAMPOLINE_INIT arm_trampoline_init +#undef TARGET_TRAMPOLINE_ADJUST_ADDRESS +#define TARGET_TRAMPOLINE_ADJUST_ADDRESS arm_trampoline_adjust_address + +#undef TARGET_DEFAULT_SHORT_ENUMS +#define TARGET_DEFAULT_SHORT_ENUMS arm_default_short_enums + +#undef TARGET_ALIGN_ANON_BITFIELD +#define TARGET_ALIGN_ANON_BITFIELD arm_align_anon_bitfield + +#undef TARGET_NARROW_VOLATILE_BITFIELD +#define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false + +#undef TARGET_CXX_GUARD_TYPE +#define TARGET_CXX_GUARD_TYPE arm_cxx_guard_type + +#undef TARGET_CXX_GUARD_MASK_BIT +#define TARGET_CXX_GUARD_MASK_BIT arm_cxx_guard_mask_bit + +#undef TARGET_CXX_GET_COOKIE_SIZE +#define TARGET_CXX_GET_COOKIE_SIZE arm_get_cookie_size + +#undef TARGET_CXX_COOKIE_HAS_SIZE +#define TARGET_CXX_COOKIE_HAS_SIZE arm_cookie_has_size + +#undef TARGET_CXX_CDTOR_RETURNS_THIS +#define TARGET_CXX_CDTOR_RETURNS_THIS arm_cxx_cdtor_returns_this + +#undef TARGET_CXX_KEY_METHOD_MAY_BE_INLINE +#define TARGET_CXX_KEY_METHOD_MAY_BE_INLINE arm_cxx_key_method_may_be_inline + +#undef TARGET_CXX_USE_AEABI_ATEXIT +#define TARGET_CXX_USE_AEABI_ATEXIT arm_cxx_use_aeabi_atexit + +#undef TARGET_CXX_DETERMINE_CLASS_DATA_VISIBILITY +#define TARGET_CXX_DETERMINE_CLASS_DATA_VISIBILITY \ + arm_cxx_determine_class_data_visibility + +#undef TARGET_CXX_CLASS_DATA_ALWAYS_COMDAT +#define TARGET_CXX_CLASS_DATA_ALWAYS_COMDAT arm_cxx_class_data_always_comdat + +#undef TARGET_RETURN_IN_MSB +#define TARGET_RETURN_IN_MSB arm_return_in_msb + +#undef TARGET_RETURN_IN_MEMORY +#define TARGET_RETURN_IN_MEMORY arm_return_in_memory + +#undef TARGET_MUST_PASS_IN_STACK +#define TARGET_MUST_PASS_IN_STACK arm_must_pass_in_stack + +#if ARM_UNWIND_INFO +#undef TARGET_ASM_UNWIND_EMIT +#define TARGET_ASM_UNWIND_EMIT arm_unwind_emit + +/* EABI unwinding tables use a different format for the typeinfo tables. */ +#undef TARGET_ASM_TTYPE +#define TARGET_ASM_TTYPE arm_output_ttype + +#undef TARGET_ARM_EABI_UNWINDER +#define TARGET_ARM_EABI_UNWINDER true + +#undef TARGET_ASM_EMIT_EXCEPT_PERSONALITY +#define TARGET_ASM_EMIT_EXCEPT_PERSONALITY arm_asm_emit_except_personality + +#undef TARGET_ASM_INIT_SECTIONS +#define TARGET_ASM_INIT_SECTIONS arm_asm_init_sections +#endif /* ARM_UNWIND_INFO */ + +#undef TARGET_EXCEPT_UNWIND_INFO +#define TARGET_EXCEPT_UNWIND_INFO arm_except_unwind_info + +#undef TARGET_DWARF_HANDLE_FRAME_UNSPEC +#define TARGET_DWARF_HANDLE_FRAME_UNSPEC arm_dwarf_handle_frame_unspec + +#undef TARGET_DWARF_REGISTER_SPAN +#define TARGET_DWARF_REGISTER_SPAN arm_dwarf_register_span + +#undef TARGET_CANNOT_COPY_INSN_P +#define TARGET_CANNOT_COPY_INSN_P arm_cannot_copy_insn_p + +#ifdef HAVE_AS_TLS +#undef TARGET_HAVE_TLS +#define TARGET_HAVE_TLS true +#endif + +#undef TARGET_HAVE_CONDITIONAL_EXECUTION +#define TARGET_HAVE_CONDITIONAL_EXECUTION arm_have_conditional_execution + +#undef TARGET_CANNOT_FORCE_CONST_MEM +#define TARGET_CANNOT_FORCE_CONST_MEM arm_cannot_force_const_mem + +#undef TARGET_MAX_ANCHOR_OFFSET +#define TARGET_MAX_ANCHOR_OFFSET 4095 + +/* The minimum is set such that the total size of the block + for a particular anchor is -4088 + 1 + 4095 bytes, which is + divisible by eight, ensuring natural spacing of anchors. */ +#undef TARGET_MIN_ANCHOR_OFFSET +#define TARGET_MIN_ANCHOR_OFFSET -4088 + +#undef TARGET_SCHED_ISSUE_RATE +#define TARGET_SCHED_ISSUE_RATE arm_issue_rate + +#undef TARGET_MANGLE_TYPE +#define TARGET_MANGLE_TYPE arm_mangle_type + +#undef TARGET_BUILD_BUILTIN_VA_LIST +#define TARGET_BUILD_BUILTIN_VA_LIST arm_build_builtin_va_list +#undef TARGET_EXPAND_BUILTIN_VA_START +#define TARGET_EXPAND_BUILTIN_VA_START arm_expand_builtin_va_start +#undef TARGET_GIMPLIFY_VA_ARG_EXPR +#define TARGET_GIMPLIFY_VA_ARG_EXPR arm_gimplify_va_arg_expr + +#ifdef HAVE_AS_TLS +#undef TARGET_ASM_OUTPUT_DWARF_DTPREL +#define TARGET_ASM_OUTPUT_DWARF_DTPREL arm_output_dwarf_dtprel +#endif + +#undef TARGET_LEGITIMATE_ADDRESS_P +#define TARGET_LEGITIMATE_ADDRESS_P arm_legitimate_address_p + +#undef TARGET_INVALID_PARAMETER_TYPE +#define TARGET_INVALID_PARAMETER_TYPE arm_invalid_parameter_type + +#undef TARGET_INVALID_RETURN_TYPE +#define TARGET_INVALID_RETURN_TYPE arm_invalid_return_type + +#undef TARGET_PROMOTED_TYPE +#define TARGET_PROMOTED_TYPE arm_promoted_type + +#undef TARGET_CONVERT_TO_TYPE +#define TARGET_CONVERT_TO_TYPE arm_convert_to_type + +#undef TARGET_SCALAR_MODE_SUPPORTED_P +#define TARGET_SCALAR_MODE_SUPPORTED_P arm_scalar_mode_supported_p + +#undef TARGET_FRAME_POINTER_REQUIRED +#define TARGET_FRAME_POINTER_REQUIRED arm_frame_pointer_required + +#undef TARGET_CAN_ELIMINATE +#define TARGET_CAN_ELIMINATE arm_can_eliminate + +#undef TARGET_CONDITIONAL_REGISTER_USAGE +#define TARGET_CONDITIONAL_REGISTER_USAGE arm_conditional_register_usage + +#undef TARGET_CLASS_LIKELY_SPILLED_P +#define TARGET_CLASS_LIKELY_SPILLED_P arm_class_likely_spilled_p + +#undef TARGET_VECTOR_ALIGNMENT +#define TARGET_VECTOR_ALIGNMENT arm_vector_alignment + +#undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE +#define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \ + arm_vector_alignment_reachable + +#undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT +#define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \ + arm_builtin_support_vector_misalignment + +#undef TARGET_PREFERRED_RENAME_CLASS +#define TARGET_PREFERRED_RENAME_CLASS \ + arm_preferred_rename_class + +struct gcc_target targetm = TARGET_INITIALIZER; + +/* Obstack for minipool constant handling. */ +static struct obstack minipool_obstack; +static char * minipool_startobj; + +/* The maximum number of insns skipped which + will be conditionalised if possible. */ +static int max_insns_skipped = 5; + +extern FILE * asm_out_file; + +/* True if we are currently building a constant table. */ +int making_const_table; + +/* The processor for which instructions should be scheduled. */ +enum processor_type arm_tune = arm_none; + +/* The current tuning set. */ +const struct tune_params *current_tune; + +/* Which floating point hardware to schedule for. */ +int arm_fpu_attr; + +/* Which floating popint hardware to use. */ +const struct arm_fpu_desc *arm_fpu_desc; + +/* Whether to use floating point hardware. */ +enum float_abi_type arm_float_abi; + +/* Which __fp16 format to use. */ +enum arm_fp16_format_type arm_fp16_format; + +/* Which ABI to use. */ +enum arm_abi_type arm_abi; + +/* Which thread pointer model to use. */ +enum arm_tp_type target_thread_pointer = TP_AUTO; + +/* Used to parse -mstructure_size_boundary command line option. */ +int arm_structure_size_boundary = DEFAULT_STRUCTURE_SIZE_BOUNDARY; + +/* Used for Thumb call_via trampolines. */ +rtx thumb_call_via_label[14]; +static int thumb_call_reg_needed; + +/* Bit values used to identify processor capabilities. */ +#define FL_CO_PROC (1 << 0) /* Has external co-processor bus */ +#define FL_ARCH3M (1 << 1) /* Extended multiply */ +#define FL_MODE26 (1 << 2) /* 26-bit mode support */ +#define FL_MODE32 (1 << 3) /* 32-bit mode support */ +#define FL_ARCH4 (1 << 4) /* Architecture rel 4 */ +#define FL_ARCH5 (1 << 5) /* Architecture rel 5 */ +#define FL_THUMB (1 << 6) /* Thumb aware */ +#define FL_LDSCHED (1 << 7) /* Load scheduling necessary */ +#define FL_STRONG (1 << 8) /* StrongARM */ +#define FL_ARCH5E (1 << 9) /* DSP extensions to v5 */ +#define FL_XSCALE (1 << 10) /* XScale */ +#define FL_CIRRUS (1 << 11) /* Cirrus/DSP. */ +#define FL_ARCH6 (1 << 12) /* Architecture rel 6. Adds + media instructions. */ +#define FL_VFPV2 (1 << 13) /* Vector Floating Point V2. */ +#define FL_WBUF (1 << 14) /* Schedule for write buffer ops. + Note: ARM6 & 7 derivatives only. */ +#define FL_ARCH6K (1 << 15) /* Architecture rel 6 K extensions. */ +#define FL_THUMB2 (1 << 16) /* Thumb-2. */ +#define FL_NOTM (1 << 17) /* Instructions not present in the 'M' + profile. */ +#define FL_DIV (1 << 18) /* Hardware divide. */ +#define FL_VFPV3 (1 << 19) /* Vector Floating Point V3. */ +#define FL_NEON (1 << 20) /* Neon instructions. */ +#define FL_ARCH7EM (1 << 21) /* Instructions present in the ARMv7E-M + architecture. */ +#define FL_ARCH7 (1 << 22) /* Architecture 7. */ + +#define FL_IWMMXT (1 << 29) /* XScale v2 or "Intel Wireless MMX technology". */ + +/* Flags that only effect tuning, not available instructions. */ +#define FL_TUNE (FL_WBUF | FL_VFPV2 | FL_STRONG | FL_LDSCHED \ + | FL_CO_PROC) + +#define FL_FOR_ARCH2 FL_NOTM +#define FL_FOR_ARCH3 (FL_FOR_ARCH2 | FL_MODE32) +#define FL_FOR_ARCH3M (FL_FOR_ARCH3 | FL_ARCH3M) +#define FL_FOR_ARCH4 (FL_FOR_ARCH3M | FL_ARCH4) +#define FL_FOR_ARCH4T (FL_FOR_ARCH4 | FL_THUMB) +#define FL_FOR_ARCH5 (FL_FOR_ARCH4 | FL_ARCH5) +#define FL_FOR_ARCH5T (FL_FOR_ARCH5 | FL_THUMB) +#define FL_FOR_ARCH5E (FL_FOR_ARCH5 | FL_ARCH5E) +#define FL_FOR_ARCH5TE (FL_FOR_ARCH5E | FL_THUMB) +#define FL_FOR_ARCH5TEJ FL_FOR_ARCH5TE +#define FL_FOR_ARCH6 (FL_FOR_ARCH5TE | FL_ARCH6) +#define FL_FOR_ARCH6J FL_FOR_ARCH6 +#define FL_FOR_ARCH6K (FL_FOR_ARCH6 | FL_ARCH6K) +#define FL_FOR_ARCH6Z FL_FOR_ARCH6 +#define FL_FOR_ARCH6ZK FL_FOR_ARCH6K +#define FL_FOR_ARCH6T2 (FL_FOR_ARCH6 | FL_THUMB2) +#define FL_FOR_ARCH6M (FL_FOR_ARCH6 & ~FL_NOTM) +#define FL_FOR_ARCH7 ((FL_FOR_ARCH6T2 & ~FL_NOTM) | FL_ARCH7) +#define FL_FOR_ARCH7A (FL_FOR_ARCH7 | FL_NOTM | FL_ARCH6K) +#define FL_FOR_ARCH7R (FL_FOR_ARCH7A | FL_DIV) +#define FL_FOR_ARCH7M (FL_FOR_ARCH7 | FL_DIV) +#define FL_FOR_ARCH7EM (FL_FOR_ARCH7M | FL_ARCH7EM) + +/* The bits in this mask specify which + instructions we are allowed to generate. */ +static unsigned long insn_flags = 0; + +/* The bits in this mask specify which instruction scheduling options should + be used. */ +static unsigned long tune_flags = 0; + +/* The following are used in the arm.md file as equivalents to bits + in the above two flag variables. */ + +/* Nonzero if this chip supports the ARM Architecture 3M extensions. */ +int arm_arch3m = 0; + +/* Nonzero if this chip supports the ARM Architecture 4 extensions. */ +int arm_arch4 = 0; + +/* Nonzero if this chip supports the ARM Architecture 4t extensions. */ +int arm_arch4t = 0; + +/* Nonzero if this chip supports the ARM Architecture 5 extensions. */ +int arm_arch5 = 0; + +/* Nonzero if this chip supports the ARM Architecture 5E extensions. */ +int arm_arch5e = 0; + +/* Nonzero if this chip supports the ARM Architecture 6 extensions. */ +int arm_arch6 = 0; + +/* Nonzero if this chip supports the ARM 6K extensions. */ +int arm_arch6k = 0; + +/* Nonzero if this chip supports the ARM 7 extensions. */ +int arm_arch7 = 0; + +/* Nonzero if instructions not present in the 'M' profile can be used. */ +int arm_arch_notm = 0; + +/* Nonzero if instructions present in ARMv7E-M can be used. */ +int arm_arch7em = 0; + +/* Nonzero if this chip can benefit from load scheduling. */ +int arm_ld_sched = 0; + +/* Nonzero if this chip is a StrongARM. */ +int arm_tune_strongarm = 0; + +/* Nonzero if this chip is a Cirrus variant. */ +int arm_arch_cirrus = 0; + +/* Nonzero if this chip supports Intel Wireless MMX technology. */ +int arm_arch_iwmmxt = 0; + +/* Nonzero if this chip is an XScale. */ +int arm_arch_xscale = 0; + +/* Nonzero if tuning for XScale */ +int arm_tune_xscale = 0; + +/* Nonzero if we want to tune for stores that access the write-buffer. + This typically means an ARM6 or ARM7 with MMU or MPU. */ +int arm_tune_wbuf = 0; + +/* Nonzero if tuning for Cortex-A9. */ +int arm_tune_cortex_a9 = 0; + +/* Nonzero if generating Thumb instructions. */ +int thumb_code = 0; + +/* Nonzero if generating Thumb-1 instructions. */ +int thumb1_code = 0; + +/* Nonzero if we should define __THUMB_INTERWORK__ in the + preprocessor. + XXX This is a bit of a hack, it's intended to help work around + problems in GLD which doesn't understand that armv5t code is + interworking clean. */ +int arm_cpp_interwork = 0; + +/* Nonzero if chip supports Thumb 2. */ +int arm_arch_thumb2; + +/* Nonzero if chip supports integer division instruction. */ +int arm_arch_hwdiv; + +/* In case of a PRE_INC, POST_INC, PRE_DEC, POST_DEC memory reference, + we must report the mode of the memory reference from + TARGET_PRINT_OPERAND to TARGET_PRINT_OPERAND_ADDRESS. */ +enum machine_mode output_memory_reference_mode; + +/* The register number to be used for the PIC offset register. */ +unsigned arm_pic_register = INVALID_REGNUM; + +/* Set to 1 after arm_reorg has started. Reset to start at the start of + the next function. */ +static int after_arm_reorg = 0; + +enum arm_pcs arm_pcs_default; + +/* For an explanation of these variables, see final_prescan_insn below. */ +int arm_ccfsm_state; +/* arm_current_cc is also used for Thumb-2 cond_exec blocks. */ +enum arm_cond_code arm_current_cc; + +rtx arm_target_insn; +int arm_target_label; +/* The number of conditionally executed insns, including the current insn. */ +int arm_condexec_count = 0; +/* A bitmask specifying the patterns for the IT block. + Zero means do not output an IT block before this insn. */ +int arm_condexec_mask = 0; +/* The number of bits used in arm_condexec_mask. */ +int arm_condexec_masklen = 0; + +/* The condition codes of the ARM, and the inverse function. */ +static const char * const arm_condition_codes[] = +{ + "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc", + "hi", "ls", "ge", "lt", "gt", "le", "al", "nv" +}; + +/* The register numbers in sequence, for passing to arm_gen_load_multiple. */ +int arm_regs_in_sequence[] = +{ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +}; + +#define ARM_LSL_NAME (TARGET_UNIFIED_ASM ? "lsl" : "asl") +#define streq(string1, string2) (strcmp (string1, string2) == 0) + +#define THUMB2_WORK_REGS (0xff & ~( (1 << THUMB_HARD_FRAME_POINTER_REGNUM) \ + | (1 << SP_REGNUM) | (1 << PC_REGNUM) \ + | (1 << PIC_OFFSET_TABLE_REGNUM))) + +/* Initialization code. */ + +struct processors +{ + const char *const name; + enum processor_type core; + const char *arch; + const unsigned long flags; + const struct tune_params *const tune; +}; + + +#define ARM_PREFETCH_NOT_BENEFICIAL 0, -1, -1 +#define ARM_PREFETCH_BENEFICIAL(prefetch_slots,l1_size,l1_line_size) \ + prefetch_slots, \ + l1_size, \ + l1_line_size + +const struct tune_params arm_slowmul_tune = +{ + arm_slowmul_rtx_costs, + NULL, + 3, + ARM_PREFETCH_NOT_BENEFICIAL +}; + +const struct tune_params arm_fastmul_tune = +{ + arm_fastmul_rtx_costs, + NULL, + 1, + ARM_PREFETCH_NOT_BENEFICIAL +}; + +const struct tune_params arm_xscale_tune = +{ + arm_xscale_rtx_costs, + xscale_sched_adjust_cost, + 2, + ARM_PREFETCH_NOT_BENEFICIAL +}; + +const struct tune_params arm_9e_tune = +{ + arm_9e_rtx_costs, + NULL, + 1, + ARM_PREFETCH_NOT_BENEFICIAL +}; + +const struct tune_params arm_cortex_a9_tune = +{ + arm_9e_rtx_costs, + cortex_a9_sched_adjust_cost, + 1, + ARM_PREFETCH_BENEFICIAL(4,32,32) +}; + +const struct tune_params arm_fa726te_tune = +{ + arm_9e_rtx_costs, + fa726te_sched_adjust_cost, + 1, + ARM_PREFETCH_NOT_BENEFICIAL +}; + + +/* Not all of these give usefully different compilation alternatives, + but there is no simple way of generalizing them. */ +static const struct processors all_cores[] = +{ + /* ARM Cores */ +#define ARM_CORE(NAME, IDENT, ARCH, FLAGS, COSTS) \ + {NAME, IDENT, #ARCH, FLAGS | FL_FOR_ARCH##ARCH, &arm_##COSTS##_tune}, +#include "arm-cores.def" +#undef ARM_CORE + {NULL, arm_none, NULL, 0, NULL} +}; + +static const struct processors all_architectures[] = +{ + /* ARM Architectures */ + /* We don't specify tuning costs here as it will be figured out + from the core. */ + + {"armv2", arm2, "2", FL_CO_PROC | FL_MODE26 | FL_FOR_ARCH2, NULL}, + {"armv2a", arm2, "2", FL_CO_PROC | FL_MODE26 | FL_FOR_ARCH2, NULL}, + {"armv3", arm6, "3", FL_CO_PROC | FL_MODE26 | FL_FOR_ARCH3, NULL}, + {"armv3m", arm7m, "3M", FL_CO_PROC | FL_MODE26 | FL_FOR_ARCH3M, NULL}, + {"armv4", arm7tdmi, "4", FL_CO_PROC | FL_MODE26 | FL_FOR_ARCH4, NULL}, + /* Strictly, FL_MODE26 is a permitted option for v4t, but there are no + implementations that support it, so we will leave it out for now. */ + {"armv4t", arm7tdmi, "4T", FL_CO_PROC | FL_FOR_ARCH4T, NULL}, + {"armv5", arm10tdmi, "5", FL_CO_PROC | FL_FOR_ARCH5, NULL}, + {"armv5t", arm10tdmi, "5T", FL_CO_PROC | FL_FOR_ARCH5T, NULL}, + {"armv5e", arm1026ejs, "5E", FL_CO_PROC | FL_FOR_ARCH5E, NULL}, + {"armv5te", arm1026ejs, "5TE", FL_CO_PROC | FL_FOR_ARCH5TE, NULL}, + {"armv6", arm1136js, "6", FL_CO_PROC | FL_FOR_ARCH6, NULL}, + {"armv6j", arm1136js, "6J", FL_CO_PROC | FL_FOR_ARCH6J, NULL}, + {"armv6k", mpcore, "6K", FL_CO_PROC | FL_FOR_ARCH6K, NULL}, + {"armv6z", arm1176jzs, "6Z", FL_CO_PROC | FL_FOR_ARCH6Z, NULL}, + {"armv6zk", arm1176jzs, "6ZK", FL_CO_PROC | FL_FOR_ARCH6ZK, NULL}, + {"armv6t2", arm1156t2s, "6T2", FL_CO_PROC | FL_FOR_ARCH6T2, NULL}, + {"armv6-m", cortexm1, "6M", FL_FOR_ARCH6M, NULL}, + {"armv7", cortexa8, "7", FL_CO_PROC | FL_FOR_ARCH7, NULL}, + {"armv7-a", cortexa8, "7A", FL_CO_PROC | FL_FOR_ARCH7A, NULL}, + {"armv7-r", cortexr4, "7R", FL_CO_PROC | FL_FOR_ARCH7R, NULL}, + {"armv7-m", cortexm3, "7M", FL_CO_PROC | FL_FOR_ARCH7M, NULL}, + {"armv7e-m", cortexm4, "7EM", FL_CO_PROC | FL_FOR_ARCH7EM, NULL}, + {"ep9312", ep9312, "4T", FL_LDSCHED | FL_CIRRUS | FL_FOR_ARCH4, NULL}, + {"iwmmxt", iwmmxt, "5TE", FL_LDSCHED | FL_STRONG | FL_FOR_ARCH5TE | FL_XSCALE | FL_IWMMXT , NULL}, + {"iwmmxt2", iwmmxt2, "5TE", FL_LDSCHED | FL_STRONG | FL_FOR_ARCH5TE | FL_XSCALE | FL_IWMMXT , NULL}, + {NULL, arm_none, NULL, 0 , NULL} +}; + + +/* These are populated as commandline arguments are processed, or NULL + if not specified. */ +static const struct processors *arm_selected_arch; +static const struct processors *arm_selected_cpu; +static const struct processors *arm_selected_tune; + +/* The name of the preprocessor macro to define for this architecture. */ + +char arm_arch_name[] = "__ARM_ARCH_0UNK__"; + +/* Available values for -mfpu=. */ + +static const struct arm_fpu_desc all_fpus[] = +{ + {"fpa", ARM_FP_MODEL_FPA, 0, VFP_NONE, false, false}, + {"fpe2", ARM_FP_MODEL_FPA, 2, VFP_NONE, false, false}, + {"fpe3", ARM_FP_MODEL_FPA, 3, VFP_NONE, false, false}, + {"maverick", ARM_FP_MODEL_MAVERICK, 0, VFP_NONE, false, false}, + {"vfp", ARM_FP_MODEL_VFP, 2, VFP_REG_D16, false, false}, + {"vfpv3", ARM_FP_MODEL_VFP, 3, VFP_REG_D32, false, false}, + {"vfpv3-fp16", ARM_FP_MODEL_VFP, 3, VFP_REG_D32, false, true}, + {"vfpv3-d16", ARM_FP_MODEL_VFP, 3, VFP_REG_D16, false, false}, + {"vfpv3-d16-fp16", ARM_FP_MODEL_VFP, 3, VFP_REG_D16, false, true}, + {"vfpv3xd", ARM_FP_MODEL_VFP, 3, VFP_REG_SINGLE, false, false}, + {"vfpv3xd-fp16", ARM_FP_MODEL_VFP, 3, VFP_REG_SINGLE, false, true}, + {"neon", ARM_FP_MODEL_VFP, 3, VFP_REG_D32, true , false}, + {"neon-fp16", ARM_FP_MODEL_VFP, 3, VFP_REG_D32, true , true }, + {"vfpv4", ARM_FP_MODEL_VFP, 4, VFP_REG_D32, false, true}, + {"vfpv4-d16", ARM_FP_MODEL_VFP, 4, VFP_REG_D16, false, true}, + {"fpv4-sp-d16", ARM_FP_MODEL_VFP, 4, VFP_REG_SINGLE, false, true}, + {"neon-vfpv4", ARM_FP_MODEL_VFP, 4, VFP_REG_D32, true, true}, + /* Compatibility aliases. */ + {"vfp3", ARM_FP_MODEL_VFP, 3, VFP_REG_D32, false, false}, +}; + + +struct float_abi +{ + const char * name; + enum float_abi_type abi_type; +}; + + +/* Available values for -mfloat-abi=. */ + +static const struct float_abi all_float_abis[] = +{ + {"soft", ARM_FLOAT_ABI_SOFT}, + {"softfp", ARM_FLOAT_ABI_SOFTFP}, + {"hard", ARM_FLOAT_ABI_HARD} +}; + + +struct fp16_format +{ + const char *name; + enum arm_fp16_format_type fp16_format_type; +}; + + +/* Available values for -mfp16-format=. */ + +static const struct fp16_format all_fp16_formats[] = +{ + {"none", ARM_FP16_FORMAT_NONE}, + {"ieee", ARM_FP16_FORMAT_IEEE}, + {"alternative", ARM_FP16_FORMAT_ALTERNATIVE} +}; + + +struct abi_name +{ + const char *name; + enum arm_abi_type abi_type; +}; + + +/* Available values for -mabi=. */ + +static const struct abi_name arm_all_abis[] = +{ + {"apcs-gnu", ARM_ABI_APCS}, + {"atpcs", ARM_ABI_ATPCS}, + {"aapcs", ARM_ABI_AAPCS}, + {"iwmmxt", ARM_ABI_IWMMXT}, + {"aapcs-linux", ARM_ABI_AAPCS_LINUX} +}; + +/* Supported TLS relocations. */ + +enum tls_reloc { + TLS_GD32, + TLS_LDM32, + TLS_LDO32, + TLS_IE32, + TLS_LE32 +}; + +/* The maximum number of insns to be used when loading a constant. */ +inline static int +arm_constant_limit (bool size_p) +{ + return size_p ? 1 : current_tune->constant_limit; +} + +/* Emit an insn that's a simple single-set. Both the operands must be known + to be valid. */ +inline static rtx +emit_set_insn (rtx x, rtx y) +{ + return emit_insn (gen_rtx_SET (VOIDmode, x, y)); +} + +/* Return the number of bits set in VALUE. */ +static unsigned +bit_count (unsigned long value) +{ + unsigned long count = 0; + + while (value) + { + count++; + value &= value - 1; /* Clear the least-significant set bit. */ + } + + return count; +} + +/* Set up library functions unique to ARM. */ + +static void +arm_init_libfuncs (void) +{ + /* There are no special library functions unless we are using the + ARM BPABI. */ + if (!TARGET_BPABI) + return; + + /* The functions below are described in Section 4 of the "Run-Time + ABI for the ARM architecture", Version 1.0. */ + + /* Double-precision floating-point arithmetic. Table 2. */ + set_optab_libfunc (add_optab, DFmode, "__aeabi_dadd"); + set_optab_libfunc (sdiv_optab, DFmode, "__aeabi_ddiv"); + set_optab_libfunc (smul_optab, DFmode, "__aeabi_dmul"); + set_optab_libfunc (neg_optab, DFmode, "__aeabi_dneg"); + set_optab_libfunc (sub_optab, DFmode, "__aeabi_dsub"); + + /* Double-precision comparisons. Table 3. */ + set_optab_libfunc (eq_optab, DFmode, "__aeabi_dcmpeq"); + set_optab_libfunc (ne_optab, DFmode, NULL); + set_optab_libfunc (lt_optab, DFmode, "__aeabi_dcmplt"); + set_optab_libfunc (le_optab, DFmode, "__aeabi_dcmple"); + set_optab_libfunc (ge_optab, DFmode, "__aeabi_dcmpge"); + set_optab_libfunc (gt_optab, DFmode, "__aeabi_dcmpgt"); + set_optab_libfunc (unord_optab, DFmode, "__aeabi_dcmpun"); + + /* Single-precision floating-point arithmetic. Table 4. */ + set_optab_libfunc (add_optab, SFmode, "__aeabi_fadd"); + set_optab_libfunc (sdiv_optab, SFmode, "__aeabi_fdiv"); + set_optab_libfunc (smul_optab, SFmode, "__aeabi_fmul"); + set_optab_libfunc (neg_optab, SFmode, "__aeabi_fneg"); + set_optab_libfunc (sub_optab, SFmode, "__aeabi_fsub"); + + /* Single-precision comparisons. Table 5. */ + set_optab_libfunc (eq_optab, SFmode, "__aeabi_fcmpeq"); + set_optab_libfunc (ne_optab, SFmode, NULL); + set_optab_libfunc (lt_optab, SFmode, "__aeabi_fcmplt"); + set_optab_libfunc (le_optab, SFmode, "__aeabi_fcmple"); + set_optab_libfunc (ge_optab, SFmode, "__aeabi_fcmpge"); + set_optab_libfunc (gt_optab, SFmode, "__aeabi_fcmpgt"); + set_optab_libfunc (unord_optab, SFmode, "__aeabi_fcmpun"); + + /* Floating-point to integer conversions. Table 6. */ + set_conv_libfunc (sfix_optab, SImode, DFmode, "__aeabi_d2iz"); + set_conv_libfunc (ufix_optab, SImode, DFmode, "__aeabi_d2uiz"); + set_conv_libfunc (sfix_optab, DImode, DFmode, "__aeabi_d2lz"); + set_conv_libfunc (ufix_optab, DImode, DFmode, "__aeabi_d2ulz"); + set_conv_libfunc (sfix_optab, SImode, SFmode, "__aeabi_f2iz"); + set_conv_libfunc (ufix_optab, SImode, SFmode, "__aeabi_f2uiz"); + set_conv_libfunc (sfix_optab, DImode, SFmode, "__aeabi_f2lz"); + set_conv_libfunc (ufix_optab, DImode, SFmode, "__aeabi_f2ulz"); + + /* Conversions between floating types. Table 7. */ + set_conv_libfunc (trunc_optab, SFmode, DFmode, "__aeabi_d2f"); + set_conv_libfunc (sext_optab, DFmode, SFmode, "__aeabi_f2d"); + + /* Integer to floating-point conversions. Table 8. */ + set_conv_libfunc (sfloat_optab, DFmode, SImode, "__aeabi_i2d"); + set_conv_libfunc (ufloat_optab, DFmode, SImode, "__aeabi_ui2d"); + set_conv_libfunc (sfloat_optab, DFmode, DImode, "__aeabi_l2d"); + set_conv_libfunc (ufloat_optab, DFmode, DImode, "__aeabi_ul2d"); + set_conv_libfunc (sfloat_optab, SFmode, SImode, "__aeabi_i2f"); + set_conv_libfunc (ufloat_optab, SFmode, SImode, "__aeabi_ui2f"); + set_conv_libfunc (sfloat_optab, SFmode, DImode, "__aeabi_l2f"); + set_conv_libfunc (ufloat_optab, SFmode, DImode, "__aeabi_ul2f"); + + /* Long long. Table 9. */ + set_optab_libfunc (smul_optab, DImode, "__aeabi_lmul"); + set_optab_libfunc (sdivmod_optab, DImode, "__aeabi_ldivmod"); + set_optab_libfunc (udivmod_optab, DImode, "__aeabi_uldivmod"); + set_optab_libfunc (ashl_optab, DImode, "__aeabi_llsl"); + set_optab_libfunc (lshr_optab, DImode, "__aeabi_llsr"); + set_optab_libfunc (ashr_optab, DImode, "__aeabi_lasr"); + set_optab_libfunc (cmp_optab, DImode, "__aeabi_lcmp"); + set_optab_libfunc (ucmp_optab, DImode, "__aeabi_ulcmp"); + + /* Integer (32/32->32) division. \S 4.3.1. */ + set_optab_libfunc (sdivmod_optab, SImode, "__aeabi_idivmod"); + set_optab_libfunc (udivmod_optab, SImode, "__aeabi_uidivmod"); + + /* The divmod functions are designed so that they can be used for + plain division, even though they return both the quotient and the + remainder. The quotient is returned in the usual location (i.e., + r0 for SImode, {r0, r1} for DImode), just as would be expected + for an ordinary division routine. Because the AAPCS calling + conventions specify that all of { r0, r1, r2, r3 } are + callee-saved registers, there is no need to tell the compiler + explicitly that those registers are clobbered by these + routines. */ + set_optab_libfunc (sdiv_optab, DImode, "__aeabi_ldivmod"); + set_optab_libfunc (udiv_optab, DImode, "__aeabi_uldivmod"); + + /* For SImode division the ABI provides div-without-mod routines, + which are faster. */ + set_optab_libfunc (sdiv_optab, SImode, "__aeabi_idiv"); + set_optab_libfunc (udiv_optab, SImode, "__aeabi_uidiv"); + + /* We don't have mod libcalls. Fortunately gcc knows how to use the + divmod libcalls instead. */ + set_optab_libfunc (smod_optab, DImode, NULL); + set_optab_libfunc (umod_optab, DImode, NULL); + set_optab_libfunc (smod_optab, SImode, NULL); + set_optab_libfunc (umod_optab, SImode, NULL); + + /* Half-precision float operations. The compiler handles all operations + with NULL libfuncs by converting the SFmode. */ + switch (arm_fp16_format) + { + case ARM_FP16_FORMAT_IEEE: + case ARM_FP16_FORMAT_ALTERNATIVE: + + /* Conversions. */ + set_conv_libfunc (trunc_optab, HFmode, SFmode, + (arm_fp16_format == ARM_FP16_FORMAT_IEEE + ? "__gnu_f2h_ieee" + : "__gnu_f2h_alternative")); + set_conv_libfunc (sext_optab, SFmode, HFmode, + (arm_fp16_format == ARM_FP16_FORMAT_IEEE + ? "__gnu_h2f_ieee" + : "__gnu_h2f_alternative")); + + /* Arithmetic. */ + set_optab_libfunc (add_optab, HFmode, NULL); + set_optab_libfunc (sdiv_optab, HFmode, NULL); + set_optab_libfunc (smul_optab, HFmode, NULL); + set_optab_libfunc (neg_optab, HFmode, NULL); + set_optab_libfunc (sub_optab, HFmode, NULL); + + /* Comparisons. */ + set_optab_libfunc (eq_optab, HFmode, NULL); + set_optab_libfunc (ne_optab, HFmode, NULL); + set_optab_libfunc (lt_optab, HFmode, NULL); + set_optab_libfunc (le_optab, HFmode, NULL); + set_optab_libfunc (ge_optab, HFmode, NULL); + set_optab_libfunc (gt_optab, HFmode, NULL); + set_optab_libfunc (unord_optab, HFmode, NULL); + break; + + default: + break; + } + + if (TARGET_AAPCS_BASED) + synchronize_libfunc = init_one_libfunc ("__sync_synchronize"); +} + +/* On AAPCS systems, this is the "struct __va_list". */ +static GTY(()) tree va_list_type; + +/* Return the type to use as __builtin_va_list. */ +static tree +arm_build_builtin_va_list (void) +{ + tree va_list_name; + tree ap_field; + + if (!TARGET_AAPCS_BASED) + return std_build_builtin_va_list (); + + /* AAPCS \S 7.1.4 requires that va_list be a typedef for a type + defined as: + + struct __va_list + { + void *__ap; + }; + + The C Library ABI further reinforces this definition in \S + 4.1. + + We must follow this definition exactly. The structure tag + name is visible in C++ mangled names, and thus forms a part + of the ABI. The field name may be used by people who + #include . */ + /* Create the type. */ + va_list_type = lang_hooks.types.make_type (RECORD_TYPE); + /* Give it the required name. */ + va_list_name = build_decl (BUILTINS_LOCATION, + TYPE_DECL, + get_identifier ("__va_list"), + va_list_type); + DECL_ARTIFICIAL (va_list_name) = 1; + TYPE_NAME (va_list_type) = va_list_name; + TYPE_STUB_DECL (va_list_type) = va_list_name; + /* Create the __ap field. */ + ap_field = build_decl (BUILTINS_LOCATION, + FIELD_DECL, + get_identifier ("__ap"), + ptr_type_node); + DECL_ARTIFICIAL (ap_field) = 1; + DECL_FIELD_CONTEXT (ap_field) = va_list_type; + TYPE_FIELDS (va_list_type) = ap_field; + /* Compute its layout. */ + layout_type (va_list_type); + + return va_list_type; +} + +/* Return an expression of type "void *" pointing to the next + available argument in a variable-argument list. VALIST is the + user-level va_list object, of type __builtin_va_list. */ +static tree +arm_extract_valist_ptr (tree valist) +{ + if (TREE_TYPE (valist) == error_mark_node) + return error_mark_node; + + /* On an AAPCS target, the pointer is stored within "struct + va_list". */ + if (TARGET_AAPCS_BASED) + { + tree ap_field = TYPE_FIELDS (TREE_TYPE (valist)); + valist = build3 (COMPONENT_REF, TREE_TYPE (ap_field), + valist, ap_field, NULL_TREE); + } + + return valist; +} + +/* Implement TARGET_EXPAND_BUILTIN_VA_START. */ +static void +arm_expand_builtin_va_start (tree valist, rtx nextarg) +{ + valist = arm_extract_valist_ptr (valist); + std_expand_builtin_va_start (valist, nextarg); +} + +/* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */ +static tree +arm_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p, + gimple_seq *post_p) +{ + valist = arm_extract_valist_ptr (valist); + return std_gimplify_va_arg_expr (valist, type, pre_p, post_p); +} + +/* Lookup NAME in SEL. */ + +static const struct processors * +arm_find_cpu (const char *name, const struct processors *sel, const char *desc) +{ + if (!(name && *name)) + return NULL; + + for (; sel->name != NULL; sel++) + { + if (streq (name, sel->name)) + return sel; + } + + error ("bad value (%s) for %s switch", name, desc); + return NULL; +} + +/* Implement TARGET_HANDLE_OPTION. */ + +static bool +arm_handle_option (size_t code, const char *arg, int value ATTRIBUTE_UNUSED) +{ + switch (code) + { + case OPT_march_: + arm_selected_arch = arm_find_cpu(arg, all_architectures, "-march"); + return true; + + case OPT_mcpu_: + arm_selected_cpu = arm_find_cpu(arg, all_cores, "-mcpu"); + return true; + + case OPT_mhard_float: + target_float_abi_name = "hard"; + return true; + + case OPT_msoft_float: + target_float_abi_name = "soft"; + return true; + + case OPT_mtune_: + arm_selected_tune = arm_find_cpu(arg, all_cores, "-mtune"); + return true; + + default: + return true; + } +} + +static void +arm_target_help (void) +{ + int i; + static int columns = 0; + int remaining; + + /* If we have not done so already, obtain the desired maximum width of + the output. Note - this is a duplication of the code at the start of + gcc/opts.c:print_specific_help() - the two copies should probably be + replaced by a single function. */ + if (columns == 0) + { + const char *p; + + p = getenv ("COLUMNS"); + if (p != NULL) + { + int value = atoi (p); + + if (value > 0) + columns = value; + } + + if (columns == 0) + /* Use a reasonable default. */ + columns = 80; + } + + printf (" Known ARM CPUs (for use with the -mcpu= and -mtune= options):\n"); + + /* The - 2 is because we know that the last entry in the array is NULL. */ + i = ARRAY_SIZE (all_cores) - 2; + gcc_assert (i > 0); + printf (" %s", all_cores[i].name); + remaining = columns - (strlen (all_cores[i].name) + 4); + gcc_assert (remaining >= 0); + + while (i--) + { + int len = strlen (all_cores[i].name); + + if (remaining > len + 2) + { + printf (", %s", all_cores[i].name); + remaining -= len + 2; + } + else + { + if (remaining > 0) + printf (","); + printf ("\n %s", all_cores[i].name); + remaining = columns - (len + 4); + } + } + + printf ("\n\n Known ARM architectures (for use with the -march= option):\n"); + + i = ARRAY_SIZE (all_architectures) - 2; + gcc_assert (i > 0); + + printf (" %s", all_architectures[i].name); + remaining = columns - (strlen (all_architectures[i].name) + 4); + gcc_assert (remaining >= 0); + + while (i--) + { + int len = strlen (all_architectures[i].name); + + if (remaining > len + 2) + { + printf (", %s", all_architectures[i].name); + remaining -= len + 2; + } + else + { + if (remaining > 0) + printf (","); + printf ("\n %s", all_architectures[i].name); + remaining = columns - (len + 4); + } + } + printf ("\n"); + +} + +/* Fix up any incompatible options that the user has specified. */ +static void +arm_option_override (void) +{ + unsigned i; + +#ifdef SUBTARGET_OVERRIDE_OPTIONS + SUBTARGET_OVERRIDE_OPTIONS; +#endif + + if (arm_selected_arch) + { + if (arm_selected_cpu) + { + /* Check for conflict between mcpu and march. */ + if ((arm_selected_cpu->flags ^ arm_selected_arch->flags) & ~FL_TUNE) + { + warning (0, "switch -mcpu=%s conflicts with -march=%s switch", + arm_selected_cpu->name, arm_selected_arch->name); + /* -march wins for code generation. + -mcpu wins for default tuning. */ + if (!arm_selected_tune) + arm_selected_tune = arm_selected_cpu; + + arm_selected_cpu = arm_selected_arch; + } + else + /* -mcpu wins. */ + arm_selected_arch = NULL; + } + else + /* Pick a CPU based on the architecture. */ + arm_selected_cpu = arm_selected_arch; + } + + /* If the user did not specify a processor, choose one for them. */ + if (!arm_selected_cpu) + { + const struct processors * sel; + unsigned int sought; + + arm_selected_cpu = &all_cores[TARGET_CPU_DEFAULT]; + if (!arm_selected_cpu->name) + { +#ifdef SUBTARGET_CPU_DEFAULT + /* Use the subtarget default CPU if none was specified by + configure. */ + arm_selected_cpu = &all_cores[SUBTARGET_CPU_DEFAULT]; +#endif + /* Default to ARM6. */ + if (!arm_selected_cpu->name) + arm_selected_cpu = &all_cores[arm6]; + } + + sel = arm_selected_cpu; + insn_flags = sel->flags; + + /* Now check to see if the user has specified some command line + switch that require certain abilities from the cpu. */ + sought = 0; + + if (TARGET_INTERWORK || TARGET_THUMB) + { + sought |= (FL_THUMB | FL_MODE32); + + /* There are no ARM processors that support both APCS-26 and + interworking. Therefore we force FL_MODE26 to be removed + from insn_flags here (if it was set), so that the search + below will always be able to find a compatible processor. */ + insn_flags &= ~FL_MODE26; + } + + if (sought != 0 && ((sought & insn_flags) != sought)) + { + /* Try to locate a CPU type that supports all of the abilities + of the default CPU, plus the extra abilities requested by + the user. */ + for (sel = all_cores; sel->name != NULL; sel++) + if ((sel->flags & sought) == (sought | insn_flags)) + break; + + if (sel->name == NULL) + { + unsigned current_bit_count = 0; + const struct processors * best_fit = NULL; + + /* Ideally we would like to issue an error message here + saying that it was not possible to find a CPU compatible + with the default CPU, but which also supports the command + line options specified by the programmer, and so they + ought to use the -mcpu= command line option to + override the default CPU type. + + If we cannot find a cpu that has both the + characteristics of the default cpu and the given + command line options we scan the array again looking + for a best match. */ + for (sel = all_cores; sel->name != NULL; sel++) + if ((sel->flags & sought) == sought) + { + unsigned count; + + count = bit_count (sel->flags & insn_flags); + + if (count >= current_bit_count) + { + best_fit = sel; + current_bit_count = count; + } + } + + gcc_assert (best_fit); + sel = best_fit; + } + + arm_selected_cpu = sel; + } + } + + gcc_assert (arm_selected_cpu); + /* The selected cpu may be an architecture, so lookup tuning by core ID. */ + if (!arm_selected_tune) + arm_selected_tune = &all_cores[arm_selected_cpu->core]; + + sprintf (arm_arch_name, "__ARM_ARCH_%s__", arm_selected_cpu->arch); + insn_flags = arm_selected_cpu->flags; + + arm_tune = arm_selected_tune->core; + tune_flags = arm_selected_tune->flags; + current_tune = arm_selected_tune->tune; + + if (target_fp16_format_name) + { + for (i = 0; i < ARRAY_SIZE (all_fp16_formats); i++) + { + if (streq (all_fp16_formats[i].name, target_fp16_format_name)) + { + arm_fp16_format = all_fp16_formats[i].fp16_format_type; + break; + } + } + if (i == ARRAY_SIZE (all_fp16_formats)) + error ("invalid __fp16 format option: -mfp16-format=%s", + target_fp16_format_name); + } + else + arm_fp16_format = ARM_FP16_FORMAT_NONE; + + if (target_abi_name) + { + for (i = 0; i < ARRAY_SIZE (arm_all_abis); i++) + { + if (streq (arm_all_abis[i].name, target_abi_name)) + { + arm_abi = arm_all_abis[i].abi_type; + break; + } + } + if (i == ARRAY_SIZE (arm_all_abis)) + error ("invalid ABI option: -mabi=%s", target_abi_name); + } + else + arm_abi = ARM_DEFAULT_ABI; + + /* Make sure that the processor choice does not conflict with any of the + other command line choices. */ + if (TARGET_ARM && !(insn_flags & FL_NOTM)) + error ("target CPU does not support ARM mode"); + + /* BPABI targets use linker tricks to allow interworking on cores + without thumb support. */ + if (TARGET_INTERWORK && !((insn_flags & FL_THUMB) || TARGET_BPABI)) + { + warning (0, "target CPU does not support interworking" ); + target_flags &= ~MASK_INTERWORK; + } + + if (TARGET_THUMB && !(insn_flags & FL_THUMB)) + { + warning (0, "target CPU does not support THUMB instructions"); + target_flags &= ~MASK_THUMB; + } + + if (TARGET_APCS_FRAME && TARGET_THUMB) + { + /* warning (0, "ignoring -mapcs-frame because -mthumb was used"); */ + target_flags &= ~MASK_APCS_FRAME; + } + + /* Callee super interworking implies thumb interworking. Adding + this to the flags here simplifies the logic elsewhere. */ + if (TARGET_THUMB && TARGET_CALLEE_INTERWORKING) + target_flags |= MASK_INTERWORK; + + /* TARGET_BACKTRACE calls leaf_function_p, which causes a crash if done + from here where no function is being compiled currently. */ + if ((TARGET_TPCS_FRAME || TARGET_TPCS_LEAF_FRAME) && TARGET_ARM) + warning (0, "enabling backtrace support is only meaningful when compiling for the Thumb"); + + if (TARGET_ARM && TARGET_CALLEE_INTERWORKING) + warning (0, "enabling callee interworking support is only meaningful when compiling for the Thumb"); + + if (TARGET_APCS_STACK && !TARGET_APCS_FRAME) + { + warning (0, "-mapcs-stack-check incompatible with -mno-apcs-frame"); + target_flags |= MASK_APCS_FRAME; + } + + if (TARGET_POKE_FUNCTION_NAME) + target_flags |= MASK_APCS_FRAME; + + if (TARGET_APCS_REENT && flag_pic) + error ("-fpic and -mapcs-reent are incompatible"); + + if (TARGET_APCS_REENT) + warning (0, "APCS reentrant code not supported. Ignored"); + + /* If this target is normally configured to use APCS frames, warn if they + are turned off and debugging is turned on. */ + if (TARGET_ARM + && write_symbols != NO_DEBUG + && !TARGET_APCS_FRAME + && (TARGET_DEFAULT & MASK_APCS_FRAME)) + warning (0, "-g with -mno-apcs-frame may not give sensible debugging"); + + if (TARGET_APCS_FLOAT) + warning (0, "passing floating point arguments in fp regs not yet supported"); + + /* Initialize boolean versions of the flags, for use in the arm.md file. */ + arm_arch3m = (insn_flags & FL_ARCH3M) != 0; + arm_arch4 = (insn_flags & FL_ARCH4) != 0; + arm_arch4t = arm_arch4 & ((insn_flags & FL_THUMB) != 0); + arm_arch5 = (insn_flags & FL_ARCH5) != 0; + arm_arch5e = (insn_flags & FL_ARCH5E) != 0; + arm_arch6 = (insn_flags & FL_ARCH6) != 0; + arm_arch6k = (insn_flags & FL_ARCH6K) != 0; + arm_arch_notm = (insn_flags & FL_NOTM) != 0; + arm_arch7 = (insn_flags & FL_ARCH7) != 0; + arm_arch7em = (insn_flags & FL_ARCH7EM) != 0; + arm_arch_thumb2 = (insn_flags & FL_THUMB2) != 0; + arm_arch_xscale = (insn_flags & FL_XSCALE) != 0; + arm_arch_cirrus = (insn_flags & FL_CIRRUS) != 0; + + arm_ld_sched = (tune_flags & FL_LDSCHED) != 0; + arm_tune_strongarm = (tune_flags & FL_STRONG) != 0; + thumb_code = TARGET_ARM == 0; + thumb1_code = TARGET_THUMB1 != 0; + arm_tune_wbuf = (tune_flags & FL_WBUF) != 0; + arm_tune_xscale = (tune_flags & FL_XSCALE) != 0; + arm_arch_iwmmxt = (insn_flags & FL_IWMMXT) != 0; + arm_arch_hwdiv = (insn_flags & FL_DIV) != 0; + arm_tune_cortex_a9 = (arm_tune == cortexa9) != 0; + + /* If we are not using the default (ARM mode) section anchor offset + ranges, then set the correct ranges now. */ + if (TARGET_THUMB1) + { + /* Thumb-1 LDR instructions cannot have negative offsets. + Permissible positive offset ranges are 5-bit (for byte loads), + 6-bit (for halfword loads), or 7-bit (for word loads). + Empirical results suggest a 7-bit anchor range gives the best + overall code size. */ + targetm.min_anchor_offset = 0; + targetm.max_anchor_offset = 127; + } + else if (TARGET_THUMB2) + { + /* The minimum is set such that the total size of the block + for a particular anchor is 248 + 1 + 4095 bytes, which is + divisible by eight, ensuring natural spacing of anchors. */ + targetm.min_anchor_offset = -248; + targetm.max_anchor_offset = 4095; + } + + /* V5 code we generate is completely interworking capable, so we turn off + TARGET_INTERWORK here to avoid many tests later on. */ + + /* XXX However, we must pass the right pre-processor defines to CPP + or GLD can get confused. This is a hack. */ + if (TARGET_INTERWORK) + arm_cpp_interwork = 1; + + if (arm_arch5) + target_flags &= ~MASK_INTERWORK; + + if (TARGET_IWMMXT && !ARM_DOUBLEWORD_ALIGN) + error ("iwmmxt requires an AAPCS compatible ABI for proper operation"); + + if (TARGET_IWMMXT_ABI && !TARGET_IWMMXT) + error ("iwmmxt abi requires an iwmmxt capable cpu"); + + if (target_fpu_name == NULL && target_fpe_name != NULL) + { + if (streq (target_fpe_name, "2")) + target_fpu_name = "fpe2"; + else if (streq (target_fpe_name, "3")) + target_fpu_name = "fpe3"; + else + error ("invalid floating point emulation option: -mfpe=%s", + target_fpe_name); + } + + if (target_fpu_name == NULL) + { +#ifdef FPUTYPE_DEFAULT + target_fpu_name = FPUTYPE_DEFAULT; +#else + if (arm_arch_cirrus) + target_fpu_name = "maverick"; + else + target_fpu_name = "fpe2"; +#endif + } + + arm_fpu_desc = NULL; + for (i = 0; i < ARRAY_SIZE (all_fpus); i++) + { + if (streq (all_fpus[i].name, target_fpu_name)) + { + arm_fpu_desc = &all_fpus[i]; + break; + } + } + + if (!arm_fpu_desc) + { + error ("invalid floating point option: -mfpu=%s", target_fpu_name); + return; + } + + switch (arm_fpu_desc->model) + { + case ARM_FP_MODEL_FPA: + if (arm_fpu_desc->rev == 2) + arm_fpu_attr = FPU_FPE2; + else if (arm_fpu_desc->rev == 3) + arm_fpu_attr = FPU_FPE3; + else + arm_fpu_attr = FPU_FPA; + break; + + case ARM_FP_MODEL_MAVERICK: + arm_fpu_attr = FPU_MAVERICK; + break; + + case ARM_FP_MODEL_VFP: + arm_fpu_attr = FPU_VFP; + break; + + default: + gcc_unreachable(); + } + + if (target_float_abi_name != NULL) + { + /* The user specified a FP ABI. */ + for (i = 0; i < ARRAY_SIZE (all_float_abis); i++) + { + if (streq (all_float_abis[i].name, target_float_abi_name)) + { + arm_float_abi = all_float_abis[i].abi_type; + break; + } + } + if (i == ARRAY_SIZE (all_float_abis)) + error ("invalid floating point abi: -mfloat-abi=%s", + target_float_abi_name); + } + else + arm_float_abi = TARGET_DEFAULT_FLOAT_ABI; + + if (TARGET_AAPCS_BASED + && (arm_fpu_desc->model == ARM_FP_MODEL_FPA)) + error ("FPA is unsupported in the AAPCS"); + + if (TARGET_AAPCS_BASED) + { + if (TARGET_CALLER_INTERWORKING) + error ("AAPCS does not support -mcaller-super-interworking"); + else + if (TARGET_CALLEE_INTERWORKING) + error ("AAPCS does not support -mcallee-super-interworking"); + } + + /* FPA and iWMMXt are incompatible because the insn encodings overlap. + VFP and iWMMXt can theoretically coexist, but it's unlikely such silicon + will ever exist. GCC makes no attempt to support this combination. */ + if (TARGET_IWMMXT && !TARGET_SOFT_FLOAT) + sorry ("iWMMXt and hardware floating point"); + + /* ??? iWMMXt insn patterns need auditing for Thumb-2. */ + if (TARGET_THUMB2 && TARGET_IWMMXT) + sorry ("Thumb-2 iWMMXt"); + + /* __fp16 support currently assumes the core has ldrh. */ + if (!arm_arch4 && arm_fp16_format != ARM_FP16_FORMAT_NONE) + sorry ("__fp16 and no ldrh"); + + /* If soft-float is specified then don't use FPU. */ + if (TARGET_SOFT_FLOAT) + arm_fpu_attr = FPU_NONE; + + if (TARGET_AAPCS_BASED) + { + if (arm_abi == ARM_ABI_IWMMXT) + arm_pcs_default = ARM_PCS_AAPCS_IWMMXT; + else if (arm_float_abi == ARM_FLOAT_ABI_HARD + && TARGET_HARD_FLOAT + && TARGET_VFP) + arm_pcs_default = ARM_PCS_AAPCS_VFP; + else + arm_pcs_default = ARM_PCS_AAPCS; + } + else + { + if (arm_float_abi == ARM_FLOAT_ABI_HARD && TARGET_VFP) + sorry ("-mfloat-abi=hard and VFP"); + + if (arm_abi == ARM_ABI_APCS) + arm_pcs_default = ARM_PCS_APCS; + else + arm_pcs_default = ARM_PCS_ATPCS; + } + + /* For arm2/3 there is no need to do any scheduling if there is only + a floating point emulator, or we are doing software floating-point. */ + if ((TARGET_SOFT_FLOAT + || (TARGET_FPA && arm_fpu_desc->rev)) + && (tune_flags & FL_MODE32) == 0) + flag_schedule_insns = flag_schedule_insns_after_reload = 0; + + if (target_thread_switch) + { + if (strcmp (target_thread_switch, "soft") == 0) + target_thread_pointer = TP_SOFT; + else if (strcmp (target_thread_switch, "auto") == 0) + target_thread_pointer = TP_AUTO; + else if (strcmp (target_thread_switch, "cp15") == 0) + target_thread_pointer = TP_CP15; + else + error ("invalid thread pointer option: -mtp=%s", target_thread_switch); + } + + /* Use the cp15 method if it is available. */ + if (target_thread_pointer == TP_AUTO) + { + if (arm_arch6k && !TARGET_THUMB1) + target_thread_pointer = TP_CP15; + else + target_thread_pointer = TP_SOFT; + } + + if (TARGET_HARD_TP && TARGET_THUMB1) + error ("can not use -mtp=cp15 with 16-bit Thumb"); + + /* Override the default structure alignment for AAPCS ABI. */ + if (TARGET_AAPCS_BASED) + arm_structure_size_boundary = 8; + + if (structure_size_string != NULL) + { + int size = strtol (structure_size_string, NULL, 0); + + if (size == 8 || size == 32 + || (ARM_DOUBLEWORD_ALIGN && size == 64)) + arm_structure_size_boundary = size; + else + warning (0, "structure size boundary can only be set to %s", + ARM_DOUBLEWORD_ALIGN ? "8, 32 or 64": "8 or 32"); + } + + if (!TARGET_ARM && TARGET_VXWORKS_RTP && flag_pic) + { + error ("RTP PIC is incompatible with Thumb"); + flag_pic = 0; + } + + /* If stack checking is disabled, we can use r10 as the PIC register, + which keeps r9 available. The EABI specifies r9 as the PIC register. */ + if (flag_pic && TARGET_SINGLE_PIC_BASE) + { + if (TARGET_VXWORKS_RTP) + warning (0, "RTP PIC is incompatible with -msingle-pic-base"); + arm_pic_register = (TARGET_APCS_STACK || TARGET_AAPCS_BASED) ? 9 : 10; + } + + if (flag_pic && TARGET_VXWORKS_RTP) + arm_pic_register = 9; + + if (arm_pic_register_string != NULL) + { + int pic_register = decode_reg_name (arm_pic_register_string); + + if (!flag_pic) + warning (0, "-mpic-register= is useless without -fpic"); + + /* Prevent the user from choosing an obviously stupid PIC register. */ + else if (pic_register < 0 || call_used_regs[pic_register] + || pic_register == HARD_FRAME_POINTER_REGNUM + || pic_register == STACK_POINTER_REGNUM + || pic_register >= PC_REGNUM + || (TARGET_VXWORKS_RTP + && (unsigned int) pic_register != arm_pic_register)) + error ("unable to use '%s' for PIC register", arm_pic_register_string); + else + arm_pic_register = pic_register; + } + + /* Enable -mfix-cortex-m3-ldrd by default for Cortex-M3 cores. */ + if (fix_cm3_ldrd == 2) + { + if (arm_selected_cpu->core == cortexm3) + fix_cm3_ldrd = 1; + else + fix_cm3_ldrd = 0; + } + + if (TARGET_THUMB1 && flag_schedule_insns) + { + /* Don't warn since it's on by default in -O2. */ + flag_schedule_insns = 0; + } + + if (optimize_size) + { + /* If optimizing for size, bump the number of instructions that we + are prepared to conditionally execute (even on a StrongARM). */ + max_insns_skipped = 6; + } + else + { + /* StrongARM has early execution of branches, so a sequence + that is worth skipping is shorter. */ + if (arm_tune_strongarm) + max_insns_skipped = 3; + } + + /* Hot/Cold partitioning is not currently supported, since we can't + handle literal pool placement in that case. */ + if (flag_reorder_blocks_and_partition) + { + inform (input_location, + "-freorder-blocks-and-partition not supported on this architecture"); + flag_reorder_blocks_and_partition = 0; + flag_reorder_blocks = 1; + } + + if (flag_pic) + /* Hoisting PIC address calculations more aggressively provides a small, + but measurable, size reduction for PIC code. Therefore, we decrease + the bar for unrestricted expression hoisting to the cost of PIC address + calculation, which is 2 instructions. */ + maybe_set_param_value (PARAM_GCSE_UNRESTRICTED_COST, 2, + global_options.x_param_values, + global_options_set.x_param_values); + + /* ARM EABI defaults to strict volatile bitfields. */ + if (TARGET_AAPCS_BASED && flag_strict_volatile_bitfields < 0 + && abi_version_at_least(2)) + flag_strict_volatile_bitfields = 1; + + /* Enable sw prefetching at -O3 for CPUS that have prefetch, and we have deemed + it beneficial (signified by setting num_prefetch_slots to 1 or more.) */ + if (flag_prefetch_loop_arrays < 0 + && HAVE_prefetch + && optimize >= 3 + && current_tune->num_prefetch_slots > 0) + flag_prefetch_loop_arrays = 1; + + /* Set up parameters to be used in prefetching algorithm. Do not override the + defaults unless we are tuning for a core we have researched values for. */ + if (current_tune->num_prefetch_slots > 0) + maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES, + current_tune->num_prefetch_slots, + global_options.x_param_values, + global_options_set.x_param_values); + if (current_tune->l1_cache_line_size >= 0) + maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE, + current_tune->l1_cache_line_size, + global_options.x_param_values, + global_options_set.x_param_values); + if (current_tune->l1_cache_size >= 0) + maybe_set_param_value (PARAM_L1_CACHE_SIZE, + current_tune->l1_cache_size, + global_options.x_param_values, + global_options_set.x_param_values); + + /* Register global variables with the garbage collector. */ + arm_add_gc_roots (); +} + +static void +arm_add_gc_roots (void) +{ + gcc_obstack_init(&minipool_obstack); + minipool_startobj = (char *) obstack_alloc (&minipool_obstack, 0); +} + +/* A table of known ARM exception types. + For use with the interrupt function attribute. */ + +typedef struct +{ + const char *const arg; + const unsigned long return_value; +} +isr_attribute_arg; + +static const isr_attribute_arg isr_attribute_args [] = +{ + { "IRQ", ARM_FT_ISR }, + { "irq", ARM_FT_ISR }, + { "FIQ", ARM_FT_FIQ }, + { "fiq", ARM_FT_FIQ }, + { "ABORT", ARM_FT_ISR }, + { "abort", ARM_FT_ISR }, + { "ABORT", ARM_FT_ISR }, + { "abort", ARM_FT_ISR }, + { "UNDEF", ARM_FT_EXCEPTION }, + { "undef", ARM_FT_EXCEPTION }, + { "SWI", ARM_FT_EXCEPTION }, + { "swi", ARM_FT_EXCEPTION }, + { NULL, ARM_FT_NORMAL } +}; + +/* Returns the (interrupt) function type of the current + function, or ARM_FT_UNKNOWN if the type cannot be determined. */ + +static unsigned long +arm_isr_value (tree argument) +{ + const isr_attribute_arg * ptr; + const char * arg; + + if (!arm_arch_notm) + return ARM_FT_NORMAL | ARM_FT_STACKALIGN; + + /* No argument - default to IRQ. */ + if (argument == NULL_TREE) + return ARM_FT_ISR; + + /* Get the value of the argument. */ + if (TREE_VALUE (argument) == NULL_TREE + || TREE_CODE (TREE_VALUE (argument)) != STRING_CST) + return ARM_FT_UNKNOWN; + + arg = TREE_STRING_POINTER (TREE_VALUE (argument)); + + /* Check it against the list of known arguments. */ + for (ptr = isr_attribute_args; ptr->arg != NULL; ptr++) + if (streq (arg, ptr->arg)) + return ptr->return_value; + + /* An unrecognized interrupt type. */ + return ARM_FT_UNKNOWN; +} + +/* Computes the type of the current function. */ + +static unsigned long +arm_compute_func_type (void) +{ + unsigned long type = ARM_FT_UNKNOWN; + tree a; + tree attr; + + gcc_assert (TREE_CODE (current_function_decl) == FUNCTION_DECL); + + /* Decide if the current function is volatile. Such functions + never return, and many memory cycles can be saved by not storing + register values that will never be needed again. This optimization + was added to speed up context switching in a kernel application. */ + if (optimize > 0 + && (TREE_NOTHROW (current_function_decl) + || !(flag_unwind_tables + || (flag_exceptions + && arm_except_unwind_info (&global_options) != UI_SJLJ))) + && TREE_THIS_VOLATILE (current_function_decl)) + type |= ARM_FT_VOLATILE; + + if (cfun->static_chain_decl != NULL) + type |= ARM_FT_NESTED; + + attr = DECL_ATTRIBUTES (current_function_decl); + + a = lookup_attribute ("naked", attr); + if (a != NULL_TREE) + type |= ARM_FT_NAKED; + + a = lookup_attribute ("isr", attr); + if (a == NULL_TREE) + a = lookup_attribute ("interrupt", attr); + + if (a == NULL_TREE) + type |= TARGET_INTERWORK ? ARM_FT_INTERWORKED : ARM_FT_NORMAL; + else + type |= arm_isr_value (TREE_VALUE (a)); + + return type; +} + +/* Returns the type of the current function. */ + +unsigned long +arm_current_func_type (void) +{ + if (ARM_FUNC_TYPE (cfun->machine->func_type) == ARM_FT_UNKNOWN) + cfun->machine->func_type = arm_compute_func_type (); + + return cfun->machine->func_type; +} + +bool +arm_allocate_stack_slots_for_args (void) +{ + /* Naked functions should not allocate stack slots for arguments. */ + return !IS_NAKED (arm_current_func_type ()); +} + + +/* Output assembler code for a block containing the constant parts + of a trampoline, leaving space for the variable parts. + + On the ARM, (if r8 is the static chain regnum, and remembering that + referencing pc adds an offset of 8) the trampoline looks like: + ldr r8, [pc, #0] + ldr pc, [pc] + .word static chain value + .word function's address + XXX FIXME: When the trampoline returns, r8 will be clobbered. */ + +static void +arm_asm_trampoline_template (FILE *f) +{ + if (TARGET_ARM) + { + asm_fprintf (f, "\tldr\t%r, [%r, #0]\n", STATIC_CHAIN_REGNUM, PC_REGNUM); + asm_fprintf (f, "\tldr\t%r, [%r, #0]\n", PC_REGNUM, PC_REGNUM); + } + else if (TARGET_THUMB2) + { + /* The Thumb-2 trampoline is similar to the arm implementation. + Unlike 16-bit Thumb, we enter the stub in thumb mode. */ + asm_fprintf (f, "\tldr.w\t%r, [%r, #4]\n", + STATIC_CHAIN_REGNUM, PC_REGNUM); + asm_fprintf (f, "\tldr.w\t%r, [%r, #4]\n", PC_REGNUM, PC_REGNUM); + } + else + { + ASM_OUTPUT_ALIGN (f, 2); + fprintf (f, "\t.code\t16\n"); + fprintf (f, ".Ltrampoline_start:\n"); + asm_fprintf (f, "\tpush\t{r0, r1}\n"); + asm_fprintf (f, "\tldr\tr0, [%r, #8]\n", PC_REGNUM); + asm_fprintf (f, "\tmov\t%r, r0\n", STATIC_CHAIN_REGNUM); + asm_fprintf (f, "\tldr\tr0, [%r, #8]\n", PC_REGNUM); + asm_fprintf (f, "\tstr\tr0, [%r, #4]\n", SP_REGNUM); + asm_fprintf (f, "\tpop\t{r0, %r}\n", PC_REGNUM); + } + assemble_aligned_integer (UNITS_PER_WORD, const0_rtx); + assemble_aligned_integer (UNITS_PER_WORD, const0_rtx); +} + +/* Emit RTL insns to initialize the variable parts of a trampoline. */ + +static void +arm_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value) +{ + rtx fnaddr, mem, a_tramp; + + emit_block_move (m_tramp, assemble_trampoline_template (), + GEN_INT (TRAMPOLINE_SIZE), BLOCK_OP_NORMAL); + + mem = adjust_address (m_tramp, SImode, TARGET_32BIT ? 8 : 12); + emit_move_insn (mem, chain_value); + + mem = adjust_address (m_tramp, SImode, TARGET_32BIT ? 12 : 16); + fnaddr = XEXP (DECL_RTL (fndecl), 0); + emit_move_insn (mem, fnaddr); + + a_tramp = XEXP (m_tramp, 0); + emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"), + LCT_NORMAL, VOIDmode, 2, a_tramp, Pmode, + plus_constant (a_tramp, TRAMPOLINE_SIZE), Pmode); +} + +/* Thumb trampolines should be entered in thumb mode, so set + the bottom bit of the address. */ + +static rtx +arm_trampoline_adjust_address (rtx addr) +{ + if (TARGET_THUMB) + addr = expand_simple_binop (Pmode, IOR, addr, const1_rtx, + NULL, 0, OPTAB_LIB_WIDEN); + return addr; +} + +/* Return 1 if it is possible to return using a single instruction. + If SIBLING is non-null, this is a test for a return before a sibling + call. SIBLING is the call insn, so we can examine its register usage. */ + +int +use_return_insn (int iscond, rtx sibling) +{ + int regno; + unsigned int func_type; + unsigned long saved_int_regs; + unsigned HOST_WIDE_INT stack_adjust; + arm_stack_offsets *offsets; + + /* Never use a return instruction before reload has run. */ + if (!reload_completed) + return 0; + + func_type = arm_current_func_type (); + + /* Naked, volatile and stack alignment functions need special + consideration. */ + if (func_type & (ARM_FT_VOLATILE | ARM_FT_NAKED | ARM_FT_STACKALIGN)) + return 0; + + /* So do interrupt functions that use the frame pointer and Thumb + interrupt functions. */ + if (IS_INTERRUPT (func_type) && (frame_pointer_needed || TARGET_THUMB)) + return 0; + + offsets = arm_get_frame_offsets (); + stack_adjust = offsets->outgoing_args - offsets->saved_regs; + + /* As do variadic functions. */ + if (crtl->args.pretend_args_size + || cfun->machine->uses_anonymous_args + /* Or if the function calls __builtin_eh_return () */ + || crtl->calls_eh_return + /* Or if the function calls alloca */ + || cfun->calls_alloca + /* Or if there is a stack adjustment. However, if the stack pointer + is saved on the stack, we can use a pre-incrementing stack load. */ + || !(stack_adjust == 0 || (TARGET_APCS_FRAME && frame_pointer_needed + && stack_adjust == 4))) + return 0; + + saved_int_regs = offsets->saved_regs_mask; + + /* Unfortunately, the insn + + ldmib sp, {..., sp, ...} + + triggers a bug on most SA-110 based devices, such that the stack + pointer won't be correctly restored if the instruction takes a + page fault. We work around this problem by popping r3 along with + the other registers, since that is never slower than executing + another instruction. + + We test for !arm_arch5 here, because code for any architecture + less than this could potentially be run on one of the buggy + chips. */ + if (stack_adjust == 4 && !arm_arch5 && TARGET_ARM) + { + /* Validate that r3 is a call-clobbered register (always true in + the default abi) ... */ + if (!call_used_regs[3]) + return 0; + + /* ... that it isn't being used for a return value ... */ + if (arm_size_return_regs () >= (4 * UNITS_PER_WORD)) + return 0; + + /* ... or for a tail-call argument ... */ + if (sibling) + { + gcc_assert (GET_CODE (sibling) == CALL_INSN); + + if (find_regno_fusage (sibling, USE, 3)) + return 0; + } + + /* ... and that there are no call-saved registers in r0-r2 + (always true in the default ABI). */ + if (saved_int_regs & 0x7) + return 0; + } + + /* Can't be done if interworking with Thumb, and any registers have been + stacked. */ + if (TARGET_INTERWORK && saved_int_regs != 0 && !IS_INTERRUPT(func_type)) + return 0; + + /* On StrongARM, conditional returns are expensive if they aren't + taken and multiple registers have been stacked. */ + if (iscond && arm_tune_strongarm) + { + /* Conditional return when just the LR is stored is a simple + conditional-load instruction, that's not expensive. */ + if (saved_int_regs != 0 && saved_int_regs != (1 << LR_REGNUM)) + return 0; + + if (flag_pic + && arm_pic_register != INVALID_REGNUM + && df_regs_ever_live_p (PIC_OFFSET_TABLE_REGNUM)) + return 0; + } + + /* If there are saved registers but the LR isn't saved, then we need + two instructions for the return. */ + if (saved_int_regs && !(saved_int_regs & (1 << LR_REGNUM))) + return 0; + + /* Can't be done if any of the FPA regs are pushed, + since this also requires an insn. */ + if (TARGET_HARD_FLOAT && TARGET_FPA) + for (regno = FIRST_FPA_REGNUM; regno <= LAST_FPA_REGNUM; regno++) + if (df_regs_ever_live_p (regno) && !call_used_regs[regno]) + return 0; + + /* Likewise VFP regs. */ + if (TARGET_HARD_FLOAT && TARGET_VFP) + for (regno = FIRST_VFP_REGNUM; regno <= LAST_VFP_REGNUM; regno++) + if (df_regs_ever_live_p (regno) && !call_used_regs[regno]) + return 0; + + if (TARGET_REALLY_IWMMXT) + for (regno = FIRST_IWMMXT_REGNUM; regno <= LAST_IWMMXT_REGNUM; regno++) + if (df_regs_ever_live_p (regno) && ! call_used_regs[regno]) + return 0; + + return 1; +} + +/* Return TRUE if int I is a valid immediate ARM constant. */ + +int +const_ok_for_arm (HOST_WIDE_INT i) +{ + int lowbit; + + /* For machines with >32 bit HOST_WIDE_INT, the bits above bit 31 must + be all zero, or all one. */ + if ((i & ~(unsigned HOST_WIDE_INT) 0xffffffff) != 0 + && ((i & ~(unsigned HOST_WIDE_INT) 0xffffffff) + != ((~(unsigned HOST_WIDE_INT) 0) + & ~(unsigned HOST_WIDE_INT) 0xffffffff))) + return FALSE; + + i &= (unsigned HOST_WIDE_INT) 0xffffffff; + + /* Fast return for 0 and small values. We must do this for zero, since + the code below can't handle that one case. */ + if ((i & ~(unsigned HOST_WIDE_INT) 0xff) == 0) + return TRUE; + + /* Get the number of trailing zeros. */ + lowbit = ffs((int) i) - 1; + + /* Only even shifts are allowed in ARM mode so round down to the + nearest even number. */ + if (TARGET_ARM) + lowbit &= ~1; + + if ((i & ~(((unsigned HOST_WIDE_INT) 0xff) << lowbit)) == 0) + return TRUE; + + if (TARGET_ARM) + { + /* Allow rotated constants in ARM mode. */ + if (lowbit <= 4 + && ((i & ~0xc000003f) == 0 + || (i & ~0xf000000f) == 0 + || (i & ~0xfc000003) == 0)) + return TRUE; + } + else + { + HOST_WIDE_INT v; + + /* Allow repeated patterns 0x00XY00XY or 0xXYXYXYXY. */ + v = i & 0xff; + v |= v << 16; + if (i == v || i == (v | (v << 8))) + return TRUE; + + /* Allow repeated pattern 0xXY00XY00. */ + v = i & 0xff00; + v |= v << 16; + if (i == v) + return TRUE; + } + + return FALSE; +} + +/* Return true if I is a valid constant for the operation CODE. */ +static int +const_ok_for_op (HOST_WIDE_INT i, enum rtx_code code) +{ + if (const_ok_for_arm (i)) + return 1; + + switch (code) + { + case PLUS: + case COMPARE: + case EQ: + case NE: + case GT: + case LE: + case LT: + case GE: + case GEU: + case LTU: + case GTU: + case LEU: + case UNORDERED: + case ORDERED: + case UNEQ: + case UNGE: + case UNLT: + case UNGT: + case UNLE: + return const_ok_for_arm (ARM_SIGN_EXTEND (-i)); + + case MINUS: /* Should only occur with (MINUS I reg) => rsb */ + case XOR: + return 0; + + case IOR: + if (TARGET_THUMB2) + return const_ok_for_arm (ARM_SIGN_EXTEND (~i)); + return 0; + + case AND: + return const_ok_for_arm (ARM_SIGN_EXTEND (~i)); + + default: + gcc_unreachable (); + } +} + +/* Emit a sequence of insns to handle a large constant. + CODE is the code of the operation required, it can be any of SET, PLUS, + IOR, AND, XOR, MINUS; + MODE is the mode in which the operation is being performed; + VAL is the integer to operate on; + SOURCE is the other operand (a register, or a null-pointer for SET); + SUBTARGETS means it is safe to create scratch registers if that will + either produce a simpler sequence, or we will want to cse the values. + Return value is the number of insns emitted. */ + +/* ??? Tweak this for thumb2. */ +int +arm_split_constant (enum rtx_code code, enum machine_mode mode, rtx insn, + HOST_WIDE_INT val, rtx target, rtx source, int subtargets) +{ + rtx cond; + + if (insn && GET_CODE (PATTERN (insn)) == COND_EXEC) + cond = COND_EXEC_TEST (PATTERN (insn)); + else + cond = NULL_RTX; + + if (subtargets || code == SET + || (GET_CODE (target) == REG && GET_CODE (source) == REG + && REGNO (target) != REGNO (source))) + { + /* After arm_reorg has been called, we can't fix up expensive + constants by pushing them into memory so we must synthesize + them in-line, regardless of the cost. This is only likely to + be more costly on chips that have load delay slots and we are + compiling without running the scheduler (so no splitting + occurred before the final instruction emission). + + Ref: gcc -O1 -mcpu=strongarm gcc.c-torture/compile/980506-2.c + */ + if (!after_arm_reorg + && !cond + && (arm_gen_constant (code, mode, NULL_RTX, val, target, source, + 1, 0) + > (arm_constant_limit (optimize_function_for_size_p (cfun)) + + (code != SET)))) + { + if (code == SET) + { + /* Currently SET is the only monadic value for CODE, all + the rest are diadic. */ + if (TARGET_USE_MOVT) + arm_emit_movpair (target, GEN_INT (val)); + else + emit_set_insn (target, GEN_INT (val)); + + return 1; + } + else + { + rtx temp = subtargets ? gen_reg_rtx (mode) : target; + + if (TARGET_USE_MOVT) + arm_emit_movpair (temp, GEN_INT (val)); + else + emit_set_insn (temp, GEN_INT (val)); + + /* For MINUS, the value is subtracted from, since we never + have subtraction of a constant. */ + if (code == MINUS) + emit_set_insn (target, gen_rtx_MINUS (mode, temp, source)); + else + emit_set_insn (target, + gen_rtx_fmt_ee (code, mode, source, temp)); + return 2; + } + } + } + + return arm_gen_constant (code, mode, cond, val, target, source, subtargets, + 1); +} + +/* Return the number of instructions required to synthesize the given + constant, if we start emitting them from bit-position I. */ +static int +count_insns_for_constant (HOST_WIDE_INT remainder, int i) +{ + HOST_WIDE_INT temp1; + int step_size = TARGET_ARM ? 2 : 1; + int num_insns = 0; + + gcc_assert (TARGET_ARM || i == 0); + + do + { + int end; + + if (i <= 0) + i += 32; + if (remainder & (((1 << step_size) - 1) << (i - step_size))) + { + end = i - 8; + if (end < 0) + end += 32; + temp1 = remainder & ((0x0ff << end) + | ((i < end) ? (0xff >> (32 - end)) : 0)); + remainder &= ~temp1; + num_insns++; + i -= 8 - step_size; + } + i -= step_size; + } while (remainder); + return num_insns; +} + +static int +find_best_start (unsigned HOST_WIDE_INT remainder) +{ + int best_consecutive_zeros = 0; + int i; + int best_start = 0; + + /* If we aren't targetting ARM, the best place to start is always at + the bottom. */ + if (! TARGET_ARM) + return 0; + + for (i = 0; i < 32; i += 2) + { + int consecutive_zeros = 0; + + if (!(remainder & (3 << i))) + { + while ((i < 32) && !(remainder & (3 << i))) + { + consecutive_zeros += 2; + i += 2; + } + if (consecutive_zeros > best_consecutive_zeros) + { + best_consecutive_zeros = consecutive_zeros; + best_start = i - consecutive_zeros; + } + i -= 2; + } + } + + /* So long as it won't require any more insns to do so, it's + desirable to emit a small constant (in bits 0...9) in the last + insn. This way there is more chance that it can be combined with + a later addressing insn to form a pre-indexed load or store + operation. Consider: + + *((volatile int *)0xe0000100) = 1; + *((volatile int *)0xe0000110) = 2; + + We want this to wind up as: + + mov rA, #0xe0000000 + mov rB, #1 + str rB, [rA, #0x100] + mov rB, #2 + str rB, [rA, #0x110] + + rather than having to synthesize both large constants from scratch. + + Therefore, we calculate how many insns would be required to emit + the constant starting from `best_start', and also starting from + zero (i.e. with bit 31 first to be output). If `best_start' doesn't + yield a shorter sequence, we may as well use zero. */ + if (best_start != 0 + && ((((unsigned HOST_WIDE_INT) 1) << best_start) < remainder) + && (count_insns_for_constant (remainder, 0) <= + count_insns_for_constant (remainder, best_start))) + best_start = 0; + + return best_start; +} + +/* Emit an instruction with the indicated PATTERN. If COND is + non-NULL, conditionalize the execution of the instruction on COND + being true. */ + +static void +emit_constant_insn (rtx cond, rtx pattern) +{ + if (cond) + pattern = gen_rtx_COND_EXEC (VOIDmode, copy_rtx (cond), pattern); + emit_insn (pattern); +} + +/* As above, but extra parameter GENERATE which, if clear, suppresses + RTL generation. */ +/* ??? This needs more work for thumb2. */ + +static int +arm_gen_constant (enum rtx_code code, enum machine_mode mode, rtx cond, + HOST_WIDE_INT val, rtx target, rtx source, int subtargets, + int generate) +{ + int can_invert = 0; + int can_negate = 0; + int final_invert = 0; + int can_negate_initial = 0; + int i; + int num_bits_set = 0; + int set_sign_bit_copies = 0; + int clear_sign_bit_copies = 0; + int clear_zero_bit_copies = 0; + int set_zero_bit_copies = 0; + int insns = 0; + unsigned HOST_WIDE_INT temp1, temp2; + unsigned HOST_WIDE_INT remainder = val & 0xffffffff; + int step_size = TARGET_ARM ? 2 : 1; + + /* Find out which operations are safe for a given CODE. Also do a quick + check for degenerate cases; these can occur when DImode operations + are split. */ + switch (code) + { + case SET: + can_invert = 1; + can_negate = 1; + break; + + case PLUS: + can_negate = 1; + can_negate_initial = 1; + break; + + case IOR: + if (remainder == 0xffffffff) + { + if (generate) + emit_constant_insn (cond, + gen_rtx_SET (VOIDmode, target, + GEN_INT (ARM_SIGN_EXTEND (val)))); + return 1; + } + + if (remainder == 0) + { + if (reload_completed && rtx_equal_p (target, source)) + return 0; + + if (generate) + emit_constant_insn (cond, + gen_rtx_SET (VOIDmode, target, source)); + return 1; + } + + if (TARGET_THUMB2) + can_invert = 1; + break; + + case AND: + if (remainder == 0) + { + if (generate) + emit_constant_insn (cond, + gen_rtx_SET (VOIDmode, target, const0_rtx)); + return 1; + } + if (remainder == 0xffffffff) + { + if (reload_completed && rtx_equal_p (target, source)) + return 0; + if (generate) + emit_constant_insn (cond, + gen_rtx_SET (VOIDmode, target, source)); + return 1; + } + can_invert = 1; + break; + + case XOR: + if (remainder == 0) + { + if (reload_completed && rtx_equal_p (target, source)) + return 0; + if (generate) + emit_constant_insn (cond, + gen_rtx_SET (VOIDmode, target, source)); + return 1; + } + + if (remainder == 0xffffffff) + { + if (generate) + emit_constant_insn (cond, + gen_rtx_SET (VOIDmode, target, + gen_rtx_NOT (mode, source))); + return 1; + } + break; + + case MINUS: + /* We treat MINUS as (val - source), since (source - val) is always + passed as (source + (-val)). */ + if (remainder == 0) + { + if (generate) + emit_constant_insn (cond, + gen_rtx_SET (VOIDmode, target, + gen_rtx_NEG (mode, source))); + return 1; + } + if (const_ok_for_arm (val)) + { + if (generate) + emit_constant_insn (cond, + gen_rtx_SET (VOIDmode, target, + gen_rtx_MINUS (mode, GEN_INT (val), + source))); + return 1; + } + can_negate = 1; + + break; + + default: + gcc_unreachable (); + } + + /* If we can do it in one insn get out quickly. */ + if (const_ok_for_arm (val) + || (can_negate_initial && const_ok_for_arm (-val)) + || (can_invert && const_ok_for_arm (~val))) + { + if (generate) + emit_constant_insn (cond, + gen_rtx_SET (VOIDmode, target, + (source + ? gen_rtx_fmt_ee (code, mode, source, + GEN_INT (val)) + : GEN_INT (val)))); + return 1; + } + + /* Calculate a few attributes that may be useful for specific + optimizations. */ + /* Count number of leading zeros. */ + for (i = 31; i >= 0; i--) + { + if ((remainder & (1 << i)) == 0) + clear_sign_bit_copies++; + else + break; + } + + /* Count number of leading 1's. */ + for (i = 31; i >= 0; i--) + { + if ((remainder & (1 << i)) != 0) + set_sign_bit_copies++; + else + break; + } + + /* Count number of trailing zero's. */ + for (i = 0; i <= 31; i++) + { + if ((remainder & (1 << i)) == 0) + clear_zero_bit_copies++; + else + break; + } + + /* Count number of trailing 1's. */ + for (i = 0; i <= 31; i++) + { + if ((remainder & (1 << i)) != 0) + set_zero_bit_copies++; + else + break; + } + + switch (code) + { + case SET: + /* See if we can use movw. */ + if (arm_arch_thumb2 && (remainder & 0xffff0000) == 0) + { + if (generate) + emit_constant_insn (cond, gen_rtx_SET (VOIDmode, target, + GEN_INT (val))); + return 1; + } + + /* See if we can do this by sign_extending a constant that is known + to be negative. This is a good, way of doing it, since the shift + may well merge into a subsequent insn. */ + if (set_sign_bit_copies > 1) + { + if (const_ok_for_arm + (temp1 = ARM_SIGN_EXTEND (remainder + << (set_sign_bit_copies - 1)))) + { + if (generate) + { + rtx new_src = subtargets ? gen_reg_rtx (mode) : target; + emit_constant_insn (cond, + gen_rtx_SET (VOIDmode, new_src, + GEN_INT (temp1))); + emit_constant_insn (cond, + gen_ashrsi3 (target, new_src, + GEN_INT (set_sign_bit_copies - 1))); + } + return 2; + } + /* For an inverted constant, we will need to set the low bits, + these will be shifted out of harm's way. */ + temp1 |= (1 << (set_sign_bit_copies - 1)) - 1; + if (const_ok_for_arm (~temp1)) + { + if (generate) + { + rtx new_src = subtargets ? gen_reg_rtx (mode) : target; + emit_constant_insn (cond, + gen_rtx_SET (VOIDmode, new_src, + GEN_INT (temp1))); + emit_constant_insn (cond, + gen_ashrsi3 (target, new_src, + GEN_INT (set_sign_bit_copies - 1))); + } + return 2; + } + } + + /* See if we can calculate the value as the difference between two + valid immediates. */ + if (clear_sign_bit_copies + clear_zero_bit_copies <= 16) + { + int topshift = clear_sign_bit_copies & ~1; + + temp1 = ARM_SIGN_EXTEND ((remainder + (0x00800000 >> topshift)) + & (0xff000000 >> topshift)); + + /* If temp1 is zero, then that means the 9 most significant + bits of remainder were 1 and we've caused it to overflow. + When topshift is 0 we don't need to do anything since we + can borrow from 'bit 32'. */ + if (temp1 == 0 && topshift != 0) + temp1 = 0x80000000 >> (topshift - 1); + + temp2 = ARM_SIGN_EXTEND (temp1 - remainder); + + if (const_ok_for_arm (temp2)) + { + if (generate) + { + rtx new_src = subtargets ? gen_reg_rtx (mode) : target; + emit_constant_insn (cond, + gen_rtx_SET (VOIDmode, new_src, + GEN_INT (temp1))); + emit_constant_insn (cond, + gen_addsi3 (target, new_src, + GEN_INT (-temp2))); + } + + return 2; + } + } + + /* See if we can generate this by setting the bottom (or the top) + 16 bits, and then shifting these into the other half of the + word. We only look for the simplest cases, to do more would cost + too much. Be careful, however, not to generate this when the + alternative would take fewer insns. */ + if (val & 0xffff0000) + { + temp1 = remainder & 0xffff0000; + temp2 = remainder & 0x0000ffff; + + /* Overlaps outside this range are best done using other methods. */ + for (i = 9; i < 24; i++) + { + if ((((temp2 | (temp2 << i)) & 0xffffffff) == remainder) + && !const_ok_for_arm (temp2)) + { + rtx new_src = (subtargets + ? (generate ? gen_reg_rtx (mode) : NULL_RTX) + : target); + insns = arm_gen_constant (code, mode, cond, temp2, new_src, + source, subtargets, generate); + source = new_src; + if (generate) + emit_constant_insn + (cond, + gen_rtx_SET + (VOIDmode, target, + gen_rtx_IOR (mode, + gen_rtx_ASHIFT (mode, source, + GEN_INT (i)), + source))); + return insns + 1; + } + } + + /* Don't duplicate cases already considered. */ + for (i = 17; i < 24; i++) + { + if (((temp1 | (temp1 >> i)) == remainder) + && !const_ok_for_arm (temp1)) + { + rtx new_src = (subtargets + ? (generate ? gen_reg_rtx (mode) : NULL_RTX) + : target); + insns = arm_gen_constant (code, mode, cond, temp1, new_src, + source, subtargets, generate); + source = new_src; + if (generate) + emit_constant_insn + (cond, + gen_rtx_SET (VOIDmode, target, + gen_rtx_IOR + (mode, + gen_rtx_LSHIFTRT (mode, source, + GEN_INT (i)), + source))); + return insns + 1; + } + } + } + break; + + case IOR: + case XOR: + /* If we have IOR or XOR, and the constant can be loaded in a + single instruction, and we can find a temporary to put it in, + then this can be done in two instructions instead of 3-4. */ + if (subtargets + /* TARGET can't be NULL if SUBTARGETS is 0 */ + || (reload_completed && !reg_mentioned_p (target, source))) + { + if (const_ok_for_arm (ARM_SIGN_EXTEND (~val))) + { + if (generate) + { + rtx sub = subtargets ? gen_reg_rtx (mode) : target; + + emit_constant_insn (cond, + gen_rtx_SET (VOIDmode, sub, + GEN_INT (val))); + emit_constant_insn (cond, + gen_rtx_SET (VOIDmode, target, + gen_rtx_fmt_ee (code, mode, + source, sub))); + } + return 2; + } + } + + if (code == XOR) + break; + + /* Convert. + x = y | constant ( which is composed of set_sign_bit_copies of leading 1s + and the remainder 0s for e.g. 0xfff00000) + x = ~(~(y ashift set_sign_bit_copies) lshiftrt set_sign_bit_copies) + + This can be done in 2 instructions by using shifts with mov or mvn. + e.g. for + x = x | 0xfff00000; + we generate. + mvn r0, r0, asl #12 + mvn r0, r0, lsr #12 */ + if (set_sign_bit_copies > 8 + && (val & (-1 << (32 - set_sign_bit_copies))) == val) + { + if (generate) + { + rtx sub = subtargets ? gen_reg_rtx (mode) : target; + rtx shift = GEN_INT (set_sign_bit_copies); + + emit_constant_insn + (cond, + gen_rtx_SET (VOIDmode, sub, + gen_rtx_NOT (mode, + gen_rtx_ASHIFT (mode, + source, + shift)))); + emit_constant_insn + (cond, + gen_rtx_SET (VOIDmode, target, + gen_rtx_NOT (mode, + gen_rtx_LSHIFTRT (mode, sub, + shift)))); + } + return 2; + } + + /* Convert + x = y | constant (which has set_zero_bit_copies number of trailing ones). + to + x = ~((~y lshiftrt set_zero_bit_copies) ashift set_zero_bit_copies). + + For eg. r0 = r0 | 0xfff + mvn r0, r0, lsr #12 + mvn r0, r0, asl #12 + + */ + if (set_zero_bit_copies > 8 + && (remainder & ((1 << set_zero_bit_copies) - 1)) == remainder) + { + if (generate) + { + rtx sub = subtargets ? gen_reg_rtx (mode) : target; + rtx shift = GEN_INT (set_zero_bit_copies); + + emit_constant_insn + (cond, + gen_rtx_SET (VOIDmode, sub, + gen_rtx_NOT (mode, + gen_rtx_LSHIFTRT (mode, + source, + shift)))); + emit_constant_insn + (cond, + gen_rtx_SET (VOIDmode, target, + gen_rtx_NOT (mode, + gen_rtx_ASHIFT (mode, sub, + shift)))); + } + return 2; + } + + /* This will never be reached for Thumb2 because orn is a valid + instruction. This is for Thumb1 and the ARM 32 bit cases. + + x = y | constant (such that ~constant is a valid constant) + Transform this to + x = ~(~y & ~constant). + */ + if (const_ok_for_arm (temp1 = ARM_SIGN_EXTEND (~val))) + { + if (generate) + { + rtx sub = subtargets ? gen_reg_rtx (mode) : target; + emit_constant_insn (cond, + gen_rtx_SET (VOIDmode, sub, + gen_rtx_NOT (mode, source))); + source = sub; + if (subtargets) + sub = gen_reg_rtx (mode); + emit_constant_insn (cond, + gen_rtx_SET (VOIDmode, sub, + gen_rtx_AND (mode, source, + GEN_INT (temp1)))); + emit_constant_insn (cond, + gen_rtx_SET (VOIDmode, target, + gen_rtx_NOT (mode, sub))); + } + return 3; + } + break; + + case AND: + /* See if two shifts will do 2 or more insn's worth of work. */ + if (clear_sign_bit_copies >= 16 && clear_sign_bit_copies < 24) + { + HOST_WIDE_INT shift_mask = ((0xffffffff + << (32 - clear_sign_bit_copies)) + & 0xffffffff); + + if ((remainder | shift_mask) != 0xffffffff) + { + if (generate) + { + rtx new_src = subtargets ? gen_reg_rtx (mode) : target; + insns = arm_gen_constant (AND, mode, cond, + remainder | shift_mask, + new_src, source, subtargets, 1); + source = new_src; + } + else + { + rtx targ = subtargets ? NULL_RTX : target; + insns = arm_gen_constant (AND, mode, cond, + remainder | shift_mask, + targ, source, subtargets, 0); + } + } + + if (generate) + { + rtx new_src = subtargets ? gen_reg_rtx (mode) : target; + rtx shift = GEN_INT (clear_sign_bit_copies); + + emit_insn (gen_ashlsi3 (new_src, source, shift)); + emit_insn (gen_lshrsi3 (target, new_src, shift)); + } + + return insns + 2; + } + + if (clear_zero_bit_copies >= 16 && clear_zero_bit_copies < 24) + { + HOST_WIDE_INT shift_mask = (1 << clear_zero_bit_copies) - 1; + + if ((remainder | shift_mask) != 0xffffffff) + { + if (generate) + { + rtx new_src = subtargets ? gen_reg_rtx (mode) : target; + + insns = arm_gen_constant (AND, mode, cond, + remainder | shift_mask, + new_src, source, subtargets, 1); + source = new_src; + } + else + { + rtx targ = subtargets ? NULL_RTX : target; + + insns = arm_gen_constant (AND, mode, cond, + remainder | shift_mask, + targ, source, subtargets, 0); + } + } + + if (generate) + { + rtx new_src = subtargets ? gen_reg_rtx (mode) : target; + rtx shift = GEN_INT (clear_zero_bit_copies); + + emit_insn (gen_lshrsi3 (new_src, source, shift)); + emit_insn (gen_ashlsi3 (target, new_src, shift)); + } + + return insns + 2; + } + + break; + + default: + break; + } + + for (i = 0; i < 32; i++) + if (remainder & (1 << i)) + num_bits_set++; + + if ((code == AND) + || (code != IOR && can_invert && num_bits_set > 16)) + remainder ^= 0xffffffff; + else if (code == PLUS && num_bits_set > 16) + remainder = (-remainder) & 0xffffffff; + + /* For XOR, if more than half the bits are set and there's a sequence + of more than 8 consecutive ones in the pattern then we can XOR by the + inverted constant and then invert the final result; this may save an + instruction and might also lead to the final mvn being merged with + some other operation. */ + else if (code == XOR && num_bits_set > 16 + && (count_insns_for_constant (remainder ^ 0xffffffff, + find_best_start + (remainder ^ 0xffffffff)) + < count_insns_for_constant (remainder, + find_best_start (remainder)))) + { + remainder ^= 0xffffffff; + final_invert = 1; + } + else + { + can_invert = 0; + can_negate = 0; + } + + /* Now try and find a way of doing the job in either two or three + instructions. + We start by looking for the largest block of zeros that are aligned on + a 2-bit boundary, we then fill up the temps, wrapping around to the + top of the word when we drop off the bottom. + In the worst case this code should produce no more than four insns. + Thumb-2 constants are shifted, not rotated, so the MSB is always the + best place to start. */ + + /* ??? Use thumb2 replicated constants when the high and low halfwords are + the same. */ + { + /* Now start emitting the insns. */ + i = find_best_start (remainder); + do + { + int end; + + if (i <= 0) + i += 32; + if (remainder & (3 << (i - 2))) + { + end = i - 8; + if (end < 0) + end += 32; + temp1 = remainder & ((0x0ff << end) + | ((i < end) ? (0xff >> (32 - end)) : 0)); + remainder &= ~temp1; + + if (generate) + { + rtx new_src, temp1_rtx; + + if (code == SET || code == MINUS) + { + new_src = (subtargets ? gen_reg_rtx (mode) : target); + if (can_invert && code != MINUS) + temp1 = ~temp1; + } + else + { + if ((final_invert || remainder) && subtargets) + new_src = gen_reg_rtx (mode); + else + new_src = target; + if (can_invert) + temp1 = ~temp1; + else if (can_negate) + temp1 = -temp1; + } + + temp1 = trunc_int_for_mode (temp1, mode); + temp1_rtx = GEN_INT (temp1); + + if (code == SET) + ; + else if (code == MINUS) + temp1_rtx = gen_rtx_MINUS (mode, temp1_rtx, source); + else + temp1_rtx = gen_rtx_fmt_ee (code, mode, source, temp1_rtx); + + emit_constant_insn (cond, + gen_rtx_SET (VOIDmode, new_src, + temp1_rtx)); + source = new_src; + } + + if (code == SET) + { + can_invert = 0; + code = PLUS; + } + else if (code == MINUS) + code = PLUS; + + insns++; + i -= 8 - step_size; + } + /* Arm allows rotates by a multiple of two. Thumb-2 allows arbitrary + shifts. */ + i -= step_size; + } + while (remainder); + } + + if (final_invert) + { + if (generate) + emit_constant_insn (cond, gen_rtx_SET (VOIDmode, target, + gen_rtx_NOT (mode, source))); + insns++; + } + + return insns; +} + +/* Canonicalize a comparison so that we are more likely to recognize it. + This can be done for a few constant compares, where we can make the + immediate value easier to load. */ + +enum rtx_code +arm_canonicalize_comparison (enum rtx_code code, rtx *op0, rtx *op1) +{ + enum machine_mode mode; + unsigned HOST_WIDE_INT i, maxval; + + mode = GET_MODE (*op0); + if (mode == VOIDmode) + mode = GET_MODE (*op1); + + maxval = (((unsigned HOST_WIDE_INT) 1) << (GET_MODE_BITSIZE(mode) - 1)) - 1; + + /* For DImode, we have GE/LT/GEU/LTU comparisons. In ARM mode + we can also use cmp/cmpeq for GTU/LEU. GT/LE must be either + reversed or (for constant OP1) adjusted to GE/LT. Similarly + for GTU/LEU in Thumb mode. */ + if (mode == DImode) + { + rtx tem; + + /* To keep things simple, always use the Cirrus cfcmp64 if it is + available. */ + if (TARGET_ARM && TARGET_HARD_FLOAT && TARGET_MAVERICK) + return code; + + if (code == GT || code == LE + || (!TARGET_ARM && (code == GTU || code == LEU))) + { + /* Missing comparison. First try to use an available + comparison. */ + if (GET_CODE (*op1) == CONST_INT) + { + i = INTVAL (*op1); + switch (code) + { + case GT: + case LE: + if (i != maxval + && arm_const_double_by_immediates (GEN_INT (i + 1))) + { + *op1 = GEN_INT (i + 1); + return code == GT ? GE : LT; + } + break; + case GTU: + case LEU: + if (i != ~((unsigned HOST_WIDE_INT) 0) + && arm_const_double_by_immediates (GEN_INT (i + 1))) + { + *op1 = GEN_INT (i + 1); + return code == GTU ? GEU : LTU; + } + break; + default: + gcc_unreachable (); + } + } + + /* If that did not work, reverse the condition. */ + tem = *op0; + *op0 = *op1; + *op1 = tem; + return swap_condition (code); + } + + return code; + } + + /* Comparisons smaller than DImode. Only adjust comparisons against + an out-of-range constant. */ + if (GET_CODE (*op1) != CONST_INT + || const_ok_for_arm (INTVAL (*op1)) + || const_ok_for_arm (- INTVAL (*op1))) + return code; + + i = INTVAL (*op1); + + switch (code) + { + case EQ: + case NE: + return code; + + case GT: + case LE: + if (i != maxval + && (const_ok_for_arm (i + 1) || const_ok_for_arm (-(i + 1)))) + { + *op1 = GEN_INT (i + 1); + return code == GT ? GE : LT; + } + break; + + case GE: + case LT: + if (i != ~maxval + && (const_ok_for_arm (i - 1) || const_ok_for_arm (-(i - 1)))) + { + *op1 = GEN_INT (i - 1); + return code == GE ? GT : LE; + } + break; + + case GTU: + case LEU: + if (i != ~((unsigned HOST_WIDE_INT) 0) + && (const_ok_for_arm (i + 1) || const_ok_for_arm (-(i + 1)))) + { + *op1 = GEN_INT (i + 1); + return code == GTU ? GEU : LTU; + } + break; + + case GEU: + case LTU: + if (i != 0 + && (const_ok_for_arm (i - 1) || const_ok_for_arm (-(i - 1)))) + { + *op1 = GEN_INT (i - 1); + return code == GEU ? GTU : LEU; + } + break; + + default: + gcc_unreachable (); + } + + return code; +} + + +/* Define how to find the value returned by a function. */ + +static rtx +arm_function_value(const_tree type, const_tree func, + bool outgoing ATTRIBUTE_UNUSED) +{ + enum machine_mode mode; + int unsignedp ATTRIBUTE_UNUSED; + rtx r ATTRIBUTE_UNUSED; + + mode = TYPE_MODE (type); + + if (TARGET_AAPCS_BASED) + return aapcs_allocate_return_reg (mode, type, func); + + /* Promote integer types. */ + if (INTEGRAL_TYPE_P (type)) + mode = arm_promote_function_mode (type, mode, &unsignedp, func, 1); + + /* Promotes small structs returned in a register to full-word size + for big-endian AAPCS. */ + if (arm_return_in_msb (type)) + { + HOST_WIDE_INT size = int_size_in_bytes (type); + if (size % UNITS_PER_WORD != 0) + { + size += UNITS_PER_WORD - size % UNITS_PER_WORD; + mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 0); + } + } + + return LIBCALL_VALUE (mode); +} + +static int +libcall_eq (const void *p1, const void *p2) +{ + return rtx_equal_p ((const_rtx) p1, (const_rtx) p2); +} + +static hashval_t +libcall_hash (const void *p1) +{ + return hash_rtx ((const_rtx) p1, VOIDmode, NULL, NULL, FALSE); +} + +static void +add_libcall (htab_t htab, rtx libcall) +{ + *htab_find_slot (htab, libcall, INSERT) = libcall; +} + +static bool +arm_libcall_uses_aapcs_base (const_rtx libcall) +{ + static bool init_done = false; + static htab_t libcall_htab; + + if (!init_done) + { + init_done = true; + + libcall_htab = htab_create (31, libcall_hash, libcall_eq, + NULL); + add_libcall (libcall_htab, + convert_optab_libfunc (sfloat_optab, SFmode, SImode)); + add_libcall (libcall_htab, + convert_optab_libfunc (sfloat_optab, DFmode, SImode)); + add_libcall (libcall_htab, + convert_optab_libfunc (sfloat_optab, SFmode, DImode)); + add_libcall (libcall_htab, + convert_optab_libfunc (sfloat_optab, DFmode, DImode)); + + add_libcall (libcall_htab, + convert_optab_libfunc (ufloat_optab, SFmode, SImode)); + add_libcall (libcall_htab, + convert_optab_libfunc (ufloat_optab, DFmode, SImode)); + add_libcall (libcall_htab, + convert_optab_libfunc (ufloat_optab, SFmode, DImode)); + add_libcall (libcall_htab, + convert_optab_libfunc (ufloat_optab, DFmode, DImode)); + + add_libcall (libcall_htab, + convert_optab_libfunc (sext_optab, SFmode, HFmode)); + add_libcall (libcall_htab, + convert_optab_libfunc (trunc_optab, HFmode, SFmode)); + add_libcall (libcall_htab, + convert_optab_libfunc (sfix_optab, SImode, DFmode)); + add_libcall (libcall_htab, + convert_optab_libfunc (ufix_optab, SImode, DFmode)); + add_libcall (libcall_htab, + convert_optab_libfunc (sfix_optab, DImode, DFmode)); + add_libcall (libcall_htab, + convert_optab_libfunc (ufix_optab, DImode, DFmode)); + add_libcall (libcall_htab, + convert_optab_libfunc (sfix_optab, DImode, SFmode)); + add_libcall (libcall_htab, + convert_optab_libfunc (ufix_optab, DImode, SFmode)); + + /* Values from double-precision helper functions are returned in core + registers if the selected core only supports single-precision + arithmetic, even if we are using the hard-float ABI. The same is + true for single-precision helpers, but we will never be using the + hard-float ABI on a CPU which doesn't support single-precision + operations in hardware. */ + add_libcall (libcall_htab, optab_libfunc (add_optab, DFmode)); + add_libcall (libcall_htab, optab_libfunc (sdiv_optab, DFmode)); + add_libcall (libcall_htab, optab_libfunc (smul_optab, DFmode)); + add_libcall (libcall_htab, optab_libfunc (neg_optab, DFmode)); + add_libcall (libcall_htab, optab_libfunc (sub_optab, DFmode)); + add_libcall (libcall_htab, optab_libfunc (eq_optab, DFmode)); + add_libcall (libcall_htab, optab_libfunc (lt_optab, DFmode)); + add_libcall (libcall_htab, optab_libfunc (le_optab, DFmode)); + add_libcall (libcall_htab, optab_libfunc (ge_optab, DFmode)); + add_libcall (libcall_htab, optab_libfunc (gt_optab, DFmode)); + add_libcall (libcall_htab, optab_libfunc (unord_optab, DFmode)); + add_libcall (libcall_htab, convert_optab_libfunc (sext_optab, DFmode, + SFmode)); + add_libcall (libcall_htab, convert_optab_libfunc (trunc_optab, SFmode, + DFmode)); + } + + return libcall && htab_find (libcall_htab, libcall) != NULL; +} + +rtx +arm_libcall_value (enum machine_mode mode, const_rtx libcall) +{ + if (TARGET_AAPCS_BASED && arm_pcs_default != ARM_PCS_AAPCS + && GET_MODE_CLASS (mode) == MODE_FLOAT) + { + /* The following libcalls return their result in integer registers, + even though they return a floating point value. */ + if (arm_libcall_uses_aapcs_base (libcall)) + return gen_rtx_REG (mode, ARG_REGISTER(1)); + + } + + return LIBCALL_VALUE (mode); +} + +/* Determine the amount of memory needed to store the possible return + registers of an untyped call. */ +int +arm_apply_result_size (void) +{ + int size = 16; + + if (TARGET_32BIT) + { + if (TARGET_HARD_FLOAT_ABI) + { + if (TARGET_VFP) + size += 32; + if (TARGET_FPA) + size += 12; + if (TARGET_MAVERICK) + size += 8; + } + if (TARGET_IWMMXT_ABI) + size += 8; + } + + return size; +} + +/* Decide whether TYPE should be returned in memory (true) + or in a register (false). FNTYPE is the type of the function making + the call. */ +static bool +arm_return_in_memory (const_tree type, const_tree fntype) +{ + HOST_WIDE_INT size; + + size = int_size_in_bytes (type); /* Negative if not fixed size. */ + + if (TARGET_AAPCS_BASED) + { + /* Simple, non-aggregate types (ie not including vectors and + complex) are always returned in a register (or registers). + We don't care about which register here, so we can short-cut + some of the detail. */ + if (!AGGREGATE_TYPE_P (type) + && TREE_CODE (type) != VECTOR_TYPE + && TREE_CODE (type) != COMPLEX_TYPE) + return false; + + /* Any return value that is no larger than one word can be + returned in r0. */ + if (((unsigned HOST_WIDE_INT) size) <= UNITS_PER_WORD) + return false; + + /* Check any available co-processors to see if they accept the + type as a register candidate (VFP, for example, can return + some aggregates in consecutive registers). These aren't + available if the call is variadic. */ + if (aapcs_select_return_coproc (type, fntype) >= 0) + return false; + + /* Vector values should be returned using ARM registers, not + memory (unless they're over 16 bytes, which will break since + we only have four call-clobbered registers to play with). */ + if (TREE_CODE (type) == VECTOR_TYPE) + return (size < 0 || size > (4 * UNITS_PER_WORD)); + + /* The rest go in memory. */ + return true; + } + + if (TREE_CODE (type) == VECTOR_TYPE) + return (size < 0 || size > (4 * UNITS_PER_WORD)); + + if (!AGGREGATE_TYPE_P (type) && + (TREE_CODE (type) != VECTOR_TYPE)) + /* All simple types are returned in registers. */ + return false; + + if (arm_abi != ARM_ABI_APCS) + { + /* ATPCS and later return aggregate types in memory only if they are + larger than a word (or are variable size). */ + return (size < 0 || size > UNITS_PER_WORD); + } + + /* For the arm-wince targets we choose to be compatible with Microsoft's + ARM and Thumb compilers, which always return aggregates in memory. */ +#ifndef ARM_WINCE + /* All structures/unions bigger than one word are returned in memory. + Also catch the case where int_size_in_bytes returns -1. In this case + the aggregate is either huge or of variable size, and in either case + we will want to return it via memory and not in a register. */ + if (size < 0 || size > UNITS_PER_WORD) + return true; + + if (TREE_CODE (type) == RECORD_TYPE) + { + tree field; + + /* For a struct the APCS says that we only return in a register + if the type is 'integer like' and every addressable element + has an offset of zero. For practical purposes this means + that the structure can have at most one non bit-field element + and that this element must be the first one in the structure. */ + + /* Find the first field, ignoring non FIELD_DECL things which will + have been created by C++. */ + for (field = TYPE_FIELDS (type); + field && TREE_CODE (field) != FIELD_DECL; + field = DECL_CHAIN (field)) + continue; + + if (field == NULL) + return false; /* An empty structure. Allowed by an extension to ANSI C. */ + + /* Check that the first field is valid for returning in a register. */ + + /* ... Floats are not allowed */ + if (FLOAT_TYPE_P (TREE_TYPE (field))) + return true; + + /* ... Aggregates that are not themselves valid for returning in + a register are not allowed. */ + if (arm_return_in_memory (TREE_TYPE (field), NULL_TREE)) + return true; + + /* Now check the remaining fields, if any. Only bitfields are allowed, + since they are not addressable. */ + for (field = DECL_CHAIN (field); + field; + field = DECL_CHAIN (field)) + { + if (TREE_CODE (field) != FIELD_DECL) + continue; + + if (!DECL_BIT_FIELD_TYPE (field)) + return true; + } + + return false; + } + + if (TREE_CODE (type) == UNION_TYPE) + { + tree field; + + /* Unions can be returned in registers if every element is + integral, or can be returned in an integer register. */ + for (field = TYPE_FIELDS (type); + field; + field = DECL_CHAIN (field)) + { + if (TREE_CODE (field) != FIELD_DECL) + continue; + + if (FLOAT_TYPE_P (TREE_TYPE (field))) + return true; + + if (arm_return_in_memory (TREE_TYPE (field), NULL_TREE)) + return true; + } + + return false; + } +#endif /* not ARM_WINCE */ + + /* Return all other types in memory. */ + return true; +} + +/* Indicate whether or not words of a double are in big-endian order. */ + +int +arm_float_words_big_endian (void) +{ + if (TARGET_MAVERICK) + return 0; + + /* For FPA, float words are always big-endian. For VFP, floats words + follow the memory system mode. */ + + if (TARGET_FPA) + { + return 1; + } + + if (TARGET_VFP) + return (TARGET_BIG_END ? 1 : 0); + + return 1; +} + +const struct pcs_attribute_arg +{ + const char *arg; + enum arm_pcs value; +} pcs_attribute_args[] = + { + {"aapcs", ARM_PCS_AAPCS}, + {"aapcs-vfp", ARM_PCS_AAPCS_VFP}, +#if 0 + /* We could recognize these, but changes would be needed elsewhere + * to implement them. */ + {"aapcs-iwmmxt", ARM_PCS_AAPCS_IWMMXT}, + {"atpcs", ARM_PCS_ATPCS}, + {"apcs", ARM_PCS_APCS}, +#endif + {NULL, ARM_PCS_UNKNOWN} + }; + +static enum arm_pcs +arm_pcs_from_attribute (tree attr) +{ + const struct pcs_attribute_arg *ptr; + const char *arg; + + /* Get the value of the argument. */ + if (TREE_VALUE (attr) == NULL_TREE + || TREE_CODE (TREE_VALUE (attr)) != STRING_CST) + return ARM_PCS_UNKNOWN; + + arg = TREE_STRING_POINTER (TREE_VALUE (attr)); + + /* Check it against the list of known arguments. */ + for (ptr = pcs_attribute_args; ptr->arg != NULL; ptr++) + if (streq (arg, ptr->arg)) + return ptr->value; + + /* An unrecognized interrupt type. */ + return ARM_PCS_UNKNOWN; +} + +/* Get the PCS variant to use for this call. TYPE is the function's type + specification, DECL is the specific declartion. DECL may be null if + the call could be indirect or if this is a library call. */ +static enum arm_pcs +arm_get_pcs_model (const_tree type, const_tree decl) +{ + bool user_convention = false; + enum arm_pcs user_pcs = arm_pcs_default; + tree attr; + + gcc_assert (type); + + attr = lookup_attribute ("pcs", TYPE_ATTRIBUTES (type)); + if (attr) + { + user_pcs = arm_pcs_from_attribute (TREE_VALUE (attr)); + user_convention = true; + } + + if (TARGET_AAPCS_BASED) + { + /* Detect varargs functions. These always use the base rules + (no argument is ever a candidate for a co-processor + register). */ + bool base_rules = stdarg_p (type); + + if (user_convention) + { + if (user_pcs > ARM_PCS_AAPCS_LOCAL) + sorry ("non-AAPCS derived PCS variant"); + else if (base_rules && user_pcs != ARM_PCS_AAPCS) + error ("variadic functions must use the base AAPCS variant"); + } + + if (base_rules) + return ARM_PCS_AAPCS; + else if (user_convention) + return user_pcs; + else if (decl && flag_unit_at_a_time) + { + /* Local functions never leak outside this compilation unit, + so we are free to use whatever conventions are + appropriate. */ + /* FIXME: remove CONST_CAST_TREE when cgraph is constified. */ + struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE(decl)); + if (i && i->local) + return ARM_PCS_AAPCS_LOCAL; + } + } + else if (user_convention && user_pcs != arm_pcs_default) + sorry ("PCS variant"); + + /* For everything else we use the target's default. */ + return arm_pcs_default; +} + + +static void +aapcs_vfp_cum_init (CUMULATIVE_ARGS *pcum ATTRIBUTE_UNUSED, + const_tree fntype ATTRIBUTE_UNUSED, + rtx libcall ATTRIBUTE_UNUSED, + const_tree fndecl ATTRIBUTE_UNUSED) +{ + /* Record the unallocated VFP registers. */ + pcum->aapcs_vfp_regs_free = (1 << NUM_VFP_ARG_REGS) - 1; + pcum->aapcs_vfp_reg_alloc = 0; +} + +/* Walk down the type tree of TYPE counting consecutive base elements. + If *MODEP is VOIDmode, then set it to the first valid floating point + type. If a non-floating point type is found, or if a floating point + type that doesn't match a non-VOIDmode *MODEP is found, then return -1, + otherwise return the count in the sub-tree. */ +static int +aapcs_vfp_sub_candidate (const_tree type, enum machine_mode *modep) +{ + enum machine_mode mode; + HOST_WIDE_INT size; + + switch (TREE_CODE (type)) + { + case REAL_TYPE: + mode = TYPE_MODE (type); + if (mode != DFmode && mode != SFmode) + return -1; + + if (*modep == VOIDmode) + *modep = mode; + + if (*modep == mode) + return 1; + + break; + + case COMPLEX_TYPE: + mode = TYPE_MODE (TREE_TYPE (type)); + if (mode != DFmode && mode != SFmode) + return -1; + + if (*modep == VOIDmode) + *modep = mode; + + if (*modep == mode) + return 2; + + break; + + case VECTOR_TYPE: + /* Use V2SImode and V4SImode as representatives of all 64-bit + and 128-bit vector types, whether or not those modes are + supported with the present options. */ + size = int_size_in_bytes (type); + switch (size) + { + case 8: + mode = V2SImode; + break; + case 16: + mode = V4SImode; + break; + default: + return -1; + } + + if (*modep == VOIDmode) + *modep = mode; + + /* Vector modes are considered to be opaque: two vectors are + equivalent for the purposes of being homogeneous aggregates + if they are the same size. */ + if (*modep == mode) + return 1; + + break; + + case ARRAY_TYPE: + { + int count; + tree index = TYPE_DOMAIN (type); + + /* Can't handle incomplete types. */ + if (!COMPLETE_TYPE_P(type)) + return -1; + + count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep); + if (count == -1 + || !index + || !TYPE_MAX_VALUE (index) + || !host_integerp (TYPE_MAX_VALUE (index), 1) + || !TYPE_MIN_VALUE (index) + || !host_integerp (TYPE_MIN_VALUE (index), 1) + || count < 0) + return -1; + + count *= (1 + tree_low_cst (TYPE_MAX_VALUE (index), 1) + - tree_low_cst (TYPE_MIN_VALUE (index), 1)); + + /* There must be no padding. */ + if (!host_integerp (TYPE_SIZE (type), 1) + || (tree_low_cst (TYPE_SIZE (type), 1) + != count * GET_MODE_BITSIZE (*modep))) + return -1; + + return count; + } + + case RECORD_TYPE: + { + int count = 0; + int sub_count; + tree field; + + /* Can't handle incomplete types. */ + if (!COMPLETE_TYPE_P(type)) + return -1; + + for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field)) + { + if (TREE_CODE (field) != FIELD_DECL) + continue; + + sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep); + if (sub_count < 0) + return -1; + count += sub_count; + } + + /* There must be no padding. */ + if (!host_integerp (TYPE_SIZE (type), 1) + || (tree_low_cst (TYPE_SIZE (type), 1) + != count * GET_MODE_BITSIZE (*modep))) + return -1; + + return count; + } + + case UNION_TYPE: + case QUAL_UNION_TYPE: + { + /* These aren't very interesting except in a degenerate case. */ + int count = 0; + int sub_count; + tree field; + + /* Can't handle incomplete types. */ + if (!COMPLETE_TYPE_P(type)) + return -1; + + for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field)) + { + if (TREE_CODE (field) != FIELD_DECL) + continue; + + sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep); + if (sub_count < 0) + return -1; + count = count > sub_count ? count : sub_count; + } + + /* There must be no padding. */ + if (!host_integerp (TYPE_SIZE (type), 1) + || (tree_low_cst (TYPE_SIZE (type), 1) + != count * GET_MODE_BITSIZE (*modep))) + return -1; + + return count; + } + + default: + break; + } + + return -1; +} + +/* Return true if PCS_VARIANT should use VFP registers. */ +static bool +use_vfp_abi (enum arm_pcs pcs_variant, bool is_double) +{ + if (pcs_variant == ARM_PCS_AAPCS_VFP) + { + static bool seen_thumb1_vfp = false; + + if (TARGET_THUMB1 && !seen_thumb1_vfp) + { + sorry ("Thumb-1 hard-float VFP ABI"); + /* sorry() is not immediately fatal, so only display this once. */ + seen_thumb1_vfp = true; + } + + return true; + } + + if (pcs_variant != ARM_PCS_AAPCS_LOCAL) + return false; + + return (TARGET_32BIT && TARGET_VFP && TARGET_HARD_FLOAT && + (TARGET_VFP_DOUBLE || !is_double)); +} + +static bool +aapcs_vfp_is_call_or_return_candidate (enum arm_pcs pcs_variant, + enum machine_mode mode, const_tree type, + enum machine_mode *base_mode, int *count) +{ + enum machine_mode new_mode = VOIDmode; + + if (GET_MODE_CLASS (mode) == MODE_FLOAT + || GET_MODE_CLASS (mode) == MODE_VECTOR_INT + || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT) + { + *count = 1; + new_mode = mode; + } + else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT) + { + *count = 2; + new_mode = (mode == DCmode ? DFmode : SFmode); + } + else if (type && (mode == BLKmode || TREE_CODE (type) == VECTOR_TYPE)) + { + int ag_count = aapcs_vfp_sub_candidate (type, &new_mode); + + if (ag_count > 0 && ag_count <= 4) + *count = ag_count; + else + return false; + } + else + return false; + + + if (!use_vfp_abi (pcs_variant, ARM_NUM_REGS (new_mode) > 1)) + return false; + + *base_mode = new_mode; + return true; +} + +static bool +aapcs_vfp_is_return_candidate (enum arm_pcs pcs_variant, + enum machine_mode mode, const_tree type) +{ + int count ATTRIBUTE_UNUSED; + enum machine_mode ag_mode ATTRIBUTE_UNUSED; + + if (!use_vfp_abi (pcs_variant, false)) + return false; + return aapcs_vfp_is_call_or_return_candidate (pcs_variant, mode, type, + &ag_mode, &count); +} + +static bool +aapcs_vfp_is_call_candidate (CUMULATIVE_ARGS *pcum, enum machine_mode mode, + const_tree type) +{ + if (!use_vfp_abi (pcum->pcs_variant, false)) + return false; + + return aapcs_vfp_is_call_or_return_candidate (pcum->pcs_variant, mode, type, + &pcum->aapcs_vfp_rmode, + &pcum->aapcs_vfp_rcount); +} + +static bool +aapcs_vfp_allocate (CUMULATIVE_ARGS *pcum, enum machine_mode mode, + const_tree type ATTRIBUTE_UNUSED) +{ + int shift = GET_MODE_SIZE (pcum->aapcs_vfp_rmode) / GET_MODE_SIZE (SFmode); + unsigned mask = (1 << (shift * pcum->aapcs_vfp_rcount)) - 1; + int regno; + + for (regno = 0; regno < NUM_VFP_ARG_REGS; regno += shift) + if (((pcum->aapcs_vfp_regs_free >> regno) & mask) == mask) + { + pcum->aapcs_vfp_reg_alloc = mask << regno; + if (mode == BLKmode || (mode == TImode && !TARGET_NEON)) + { + int i; + int rcount = pcum->aapcs_vfp_rcount; + int rshift = shift; + enum machine_mode rmode = pcum->aapcs_vfp_rmode; + rtx par; + if (!TARGET_NEON) + { + /* Avoid using unsupported vector modes. */ + if (rmode == V2SImode) + rmode = DImode; + else if (rmode == V4SImode) + { + rmode = DImode; + rcount *= 2; + rshift /= 2; + } + } + par = gen_rtx_PARALLEL (mode, rtvec_alloc (rcount)); + for (i = 0; i < rcount; i++) + { + rtx tmp = gen_rtx_REG (rmode, + FIRST_VFP_REGNUM + regno + i * rshift); + tmp = gen_rtx_EXPR_LIST + (VOIDmode, tmp, + GEN_INT (i * GET_MODE_SIZE (rmode))); + XVECEXP (par, 0, i) = tmp; + } + + pcum->aapcs_reg = par; + } + else + pcum->aapcs_reg = gen_rtx_REG (mode, FIRST_VFP_REGNUM + regno); + return true; + } + return false; +} + +static rtx +aapcs_vfp_allocate_return_reg (enum arm_pcs pcs_variant ATTRIBUTE_UNUSED, + enum machine_mode mode, + const_tree type ATTRIBUTE_UNUSED) +{ + if (!use_vfp_abi (pcs_variant, false)) + return false; + + if (mode == BLKmode || (mode == TImode && !TARGET_NEON)) + { + int count; + enum machine_mode ag_mode; + int i; + rtx par; + int shift; + + aapcs_vfp_is_call_or_return_candidate (pcs_variant, mode, type, + &ag_mode, &count); + + if (!TARGET_NEON) + { + if (ag_mode == V2SImode) + ag_mode = DImode; + else if (ag_mode == V4SImode) + { + ag_mode = DImode; + count *= 2; + } + } + shift = GET_MODE_SIZE(ag_mode) / GET_MODE_SIZE(SFmode); + par = gen_rtx_PARALLEL (mode, rtvec_alloc (count)); + for (i = 0; i < count; i++) + { + rtx tmp = gen_rtx_REG (ag_mode, FIRST_VFP_REGNUM + i * shift); + tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, + GEN_INT (i * GET_MODE_SIZE (ag_mode))); + XVECEXP (par, 0, i) = tmp; + } + + return par; + } + + return gen_rtx_REG (mode, FIRST_VFP_REGNUM); +} + +static void +aapcs_vfp_advance (CUMULATIVE_ARGS *pcum ATTRIBUTE_UNUSED, + enum machine_mode mode ATTRIBUTE_UNUSED, + const_tree type ATTRIBUTE_UNUSED) +{ + pcum->aapcs_vfp_regs_free &= ~pcum->aapcs_vfp_reg_alloc; + pcum->aapcs_vfp_reg_alloc = 0; + return; +} + +#define AAPCS_CP(X) \ + { \ + aapcs_ ## X ## _cum_init, \ + aapcs_ ## X ## _is_call_candidate, \ + aapcs_ ## X ## _allocate, \ + aapcs_ ## X ## _is_return_candidate, \ + aapcs_ ## X ## _allocate_return_reg, \ + aapcs_ ## X ## _advance \ + } + +/* Table of co-processors that can be used to pass arguments in + registers. Idealy no arugment should be a candidate for more than + one co-processor table entry, but the table is processed in order + and stops after the first match. If that entry then fails to put + the argument into a co-processor register, the argument will go on + the stack. */ +static struct +{ + /* Initialize co-processor related state in CUMULATIVE_ARGS structure. */ + void (*cum_init) (CUMULATIVE_ARGS *, const_tree, rtx, const_tree); + + /* Return true if an argument of mode MODE (or type TYPE if MODE is + BLKmode) is a candidate for this co-processor's registers; this + function should ignore any position-dependent state in + CUMULATIVE_ARGS and only use call-type dependent information. */ + bool (*is_call_candidate) (CUMULATIVE_ARGS *, enum machine_mode, const_tree); + + /* Return true if the argument does get a co-processor register; it + should set aapcs_reg to an RTX of the register allocated as is + required for a return from FUNCTION_ARG. */ + bool (*allocate) (CUMULATIVE_ARGS *, enum machine_mode, const_tree); + + /* Return true if a result of mode MODE (or type TYPE if MODE is + BLKmode) is can be returned in this co-processor's registers. */ + bool (*is_return_candidate) (enum arm_pcs, enum machine_mode, const_tree); + + /* Allocate and return an RTX element to hold the return type of a + call, this routine must not fail and will only be called if + is_return_candidate returned true with the same parameters. */ + rtx (*allocate_return_reg) (enum arm_pcs, enum machine_mode, const_tree); + + /* Finish processing this argument and prepare to start processing + the next one. */ + void (*advance) (CUMULATIVE_ARGS *, enum machine_mode, const_tree); +} aapcs_cp_arg_layout[ARM_NUM_COPROC_SLOTS] = + { + AAPCS_CP(vfp) + }; + +#undef AAPCS_CP + +static int +aapcs_select_call_coproc (CUMULATIVE_ARGS *pcum, enum machine_mode mode, + const_tree type) +{ + int i; + + for (i = 0; i < ARM_NUM_COPROC_SLOTS; i++) + if (aapcs_cp_arg_layout[i].is_call_candidate (pcum, mode, type)) + return i; + + return -1; +} + +static int +aapcs_select_return_coproc (const_tree type, const_tree fntype) +{ + /* We aren't passed a decl, so we can't check that a call is local. + However, it isn't clear that that would be a win anyway, since it + might limit some tail-calling opportunities. */ + enum arm_pcs pcs_variant; + + if (fntype) + { + const_tree fndecl = NULL_TREE; + + if (TREE_CODE (fntype) == FUNCTION_DECL) + { + fndecl = fntype; + fntype = TREE_TYPE (fntype); + } + + pcs_variant = arm_get_pcs_model (fntype, fndecl); + } + else + pcs_variant = arm_pcs_default; + + if (pcs_variant != ARM_PCS_AAPCS) + { + int i; + + for (i = 0; i < ARM_NUM_COPROC_SLOTS; i++) + if (aapcs_cp_arg_layout[i].is_return_candidate (pcs_variant, + TYPE_MODE (type), + type)) + return i; + } + return -1; +} + +static rtx +aapcs_allocate_return_reg (enum machine_mode mode, const_tree type, + const_tree fntype) +{ + /* We aren't passed a decl, so we can't check that a call is local. + However, it isn't clear that that would be a win anyway, since it + might limit some tail-calling opportunities. */ + enum arm_pcs pcs_variant; + int unsignedp ATTRIBUTE_UNUSED; + + if (fntype) + { + const_tree fndecl = NULL_TREE; + + if (TREE_CODE (fntype) == FUNCTION_DECL) + { + fndecl = fntype; + fntype = TREE_TYPE (fntype); + } + + pcs_variant = arm_get_pcs_model (fntype, fndecl); + } + else + pcs_variant = arm_pcs_default; + + /* Promote integer types. */ + if (type && INTEGRAL_TYPE_P (type)) + mode = arm_promote_function_mode (type, mode, &unsignedp, fntype, 1); + + if (pcs_variant != ARM_PCS_AAPCS) + { + int i; + + for (i = 0; i < ARM_NUM_COPROC_SLOTS; i++) + if (aapcs_cp_arg_layout[i].is_return_candidate (pcs_variant, mode, + type)) + return aapcs_cp_arg_layout[i].allocate_return_reg (pcs_variant, + mode, type); + } + + /* Promotes small structs returned in a register to full-word size + for big-endian AAPCS. */ + if (type && arm_return_in_msb (type)) + { + HOST_WIDE_INT size = int_size_in_bytes (type); + if (size % UNITS_PER_WORD != 0) + { + size += UNITS_PER_WORD - size % UNITS_PER_WORD; + mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 0); + } + } + + return gen_rtx_REG (mode, R0_REGNUM); +} + +rtx +aapcs_libcall_value (enum machine_mode mode) +{ + return aapcs_allocate_return_reg (mode, NULL_TREE, NULL_TREE); +} + +/* Lay out a function argument using the AAPCS rules. The rule + numbers referred to here are those in the AAPCS. */ +static void +aapcs_layout_arg (CUMULATIVE_ARGS *pcum, enum machine_mode mode, + const_tree type, bool named) +{ + int nregs, nregs2; + int ncrn; + + /* We only need to do this once per argument. */ + if (pcum->aapcs_arg_processed) + return; + + pcum->aapcs_arg_processed = true; + + /* Special case: if named is false then we are handling an incoming + anonymous argument which is on the stack. */ + if (!named) + return; + + /* Is this a potential co-processor register candidate? */ + if (pcum->pcs_variant != ARM_PCS_AAPCS) + { + int slot = aapcs_select_call_coproc (pcum, mode, type); + pcum->aapcs_cprc_slot = slot; + + /* We don't have to apply any of the rules from part B of the + preparation phase, these are handled elsewhere in the + compiler. */ + + if (slot >= 0) + { + /* A Co-processor register candidate goes either in its own + class of registers or on the stack. */ + if (!pcum->aapcs_cprc_failed[slot]) + { + /* C1.cp - Try to allocate the argument to co-processor + registers. */ + if (aapcs_cp_arg_layout[slot].allocate (pcum, mode, type)) + return; + + /* C2.cp - Put the argument on the stack and note that we + can't assign any more candidates in this slot. We also + need to note that we have allocated stack space, so that + we won't later try to split a non-cprc candidate between + core registers and the stack. */ + pcum->aapcs_cprc_failed[slot] = true; + pcum->can_split = false; + } + + /* We didn't get a register, so this argument goes on the + stack. */ + gcc_assert (pcum->can_split == false); + return; + } + } + + /* C3 - For double-word aligned arguments, round the NCRN up to the + next even number. */ + ncrn = pcum->aapcs_ncrn; + if ((ncrn & 1) && arm_needs_doubleword_align (mode, type)) + ncrn++; + + nregs = ARM_NUM_REGS2(mode, type); + + /* Sigh, this test should really assert that nregs > 0, but a GCC + extension allows empty structs and then gives them empty size; it + then allows such a structure to be passed by value. For some of + the code below we have to pretend that such an argument has + non-zero size so that we 'locate' it correctly either in + registers or on the stack. */ + gcc_assert (nregs >= 0); + + nregs2 = nregs ? nregs : 1; + + /* C4 - Argument fits entirely in core registers. */ + if (ncrn + nregs2 <= NUM_ARG_REGS) + { + pcum->aapcs_reg = gen_rtx_REG (mode, ncrn); + pcum->aapcs_next_ncrn = ncrn + nregs; + return; + } + + /* C5 - Some core registers left and there are no arguments already + on the stack: split this argument between the remaining core + registers and the stack. */ + if (ncrn < NUM_ARG_REGS && pcum->can_split) + { + pcum->aapcs_reg = gen_rtx_REG (mode, ncrn); + pcum->aapcs_next_ncrn = NUM_ARG_REGS; + pcum->aapcs_partial = (NUM_ARG_REGS - ncrn) * UNITS_PER_WORD; + return; + } + + /* C6 - NCRN is set to 4. */ + pcum->aapcs_next_ncrn = NUM_ARG_REGS; + + /* C7,C8 - arugment goes on the stack. We have nothing to do here. */ + return; +} + +/* Initialize a variable CUM of type CUMULATIVE_ARGS + for a call to a function whose data type is FNTYPE. + For a library call, FNTYPE is NULL. */ +void +arm_init_cumulative_args (CUMULATIVE_ARGS *pcum, tree fntype, + rtx libname, + tree fndecl ATTRIBUTE_UNUSED) +{ + /* Long call handling. */ + if (fntype) + pcum->pcs_variant = arm_get_pcs_model (fntype, fndecl); + else + pcum->pcs_variant = arm_pcs_default; + + if (pcum->pcs_variant <= ARM_PCS_AAPCS_LOCAL) + { + if (arm_libcall_uses_aapcs_base (libname)) + pcum->pcs_variant = ARM_PCS_AAPCS; + + pcum->aapcs_ncrn = pcum->aapcs_next_ncrn = 0; + pcum->aapcs_reg = NULL_RTX; + pcum->aapcs_partial = 0; + pcum->aapcs_arg_processed = false; + pcum->aapcs_cprc_slot = -1; + pcum->can_split = true; + + if (pcum->pcs_variant != ARM_PCS_AAPCS) + { + int i; + + for (i = 0; i < ARM_NUM_COPROC_SLOTS; i++) + { + pcum->aapcs_cprc_failed[i] = false; + aapcs_cp_arg_layout[i].cum_init (pcum, fntype, libname, fndecl); + } + } + return; + } + + /* Legacy ABIs */ + + /* On the ARM, the offset starts at 0. */ + pcum->nregs = 0; + pcum->iwmmxt_nregs = 0; + pcum->can_split = true; + + /* Varargs vectors are treated the same as long long. + named_count avoids having to change the way arm handles 'named' */ + pcum->named_count = 0; + pcum->nargs = 0; + + if (TARGET_REALLY_IWMMXT && fntype) + { + tree fn_arg; + + for (fn_arg = TYPE_ARG_TYPES (fntype); + fn_arg; + fn_arg = TREE_CHAIN (fn_arg)) + pcum->named_count += 1; + + if (! pcum->named_count) + pcum->named_count = INT_MAX; + } +} + + +/* Return true if mode/type need doubleword alignment. */ +static bool +arm_needs_doubleword_align (enum machine_mode mode, const_tree type) +{ + return (GET_MODE_ALIGNMENT (mode) > PARM_BOUNDARY + || (type && TYPE_ALIGN (type) > PARM_BOUNDARY)); +} + + +/* Determine where to put an argument to a function. + Value is zero to push the argument on the stack, + or a hard register in which to store the argument. + + MODE is the argument's machine mode. + TYPE is the data type of the argument (as a tree). + This is null for libcalls where that information may + not be available. + CUM is a variable of type CUMULATIVE_ARGS which gives info about + the preceding args and about the function being called. + NAMED is nonzero if this argument is a named parameter + (otherwise it is an extra parameter matching an ellipsis). + + On the ARM, normally the first 16 bytes are passed in registers r0-r3; all + other arguments are passed on the stack. If (NAMED == 0) (which happens + only in assign_parms, since TARGET_SETUP_INCOMING_VARARGS is + defined), say it is passed in the stack (function_prologue will + indeed make it pass in the stack if necessary). */ + +static rtx +arm_function_arg (CUMULATIVE_ARGS *pcum, enum machine_mode mode, + const_tree type, bool named) +{ + int nregs; + + /* Handle the special case quickly. Pick an arbitrary value for op2 of + a call insn (op3 of a call_value insn). */ + if (mode == VOIDmode) + return const0_rtx; + + if (pcum->pcs_variant <= ARM_PCS_AAPCS_LOCAL) + { + aapcs_layout_arg (pcum, mode, type, named); + return pcum->aapcs_reg; + } + + /* Varargs vectors are treated the same as long long. + named_count avoids having to change the way arm handles 'named' */ + if (TARGET_IWMMXT_ABI + && arm_vector_mode_supported_p (mode) + && pcum->named_count > pcum->nargs + 1) + { + if (pcum->iwmmxt_nregs <= 9) + return gen_rtx_REG (mode, pcum->iwmmxt_nregs + FIRST_IWMMXT_REGNUM); + else + { + pcum->can_split = false; + return NULL_RTX; + } + } + + /* Put doubleword aligned quantities in even register pairs. */ + if (pcum->nregs & 1 + && ARM_DOUBLEWORD_ALIGN + && arm_needs_doubleword_align (mode, type)) + pcum->nregs++; + + /* Only allow splitting an arg between regs and memory if all preceding + args were allocated to regs. For args passed by reference we only count + the reference pointer. */ + if (pcum->can_split) + nregs = 1; + else + nregs = ARM_NUM_REGS2 (mode, type); + + if (!named || pcum->nregs + nregs > NUM_ARG_REGS) + return NULL_RTX; + + return gen_rtx_REG (mode, pcum->nregs); +} + +/* The AAPCS sets the maximum alignment of a vector to 64 bits. */ +static HOST_WIDE_INT +arm_vector_alignment (const_tree type) +{ + HOST_WIDE_INT align = tree_low_cst (TYPE_SIZE (type), 0); + + if (TARGET_AAPCS_BASED) + align = MIN (align, 64); + + return align; +} + +static unsigned int +arm_function_arg_boundary (enum machine_mode mode, const_tree type) +{ + return (ARM_DOUBLEWORD_ALIGN && arm_needs_doubleword_align (mode, type) + ? DOUBLEWORD_ALIGNMENT + : PARM_BOUNDARY); +} + +static int +arm_arg_partial_bytes (CUMULATIVE_ARGS *pcum, enum machine_mode mode, + tree type, bool named) +{ + int nregs = pcum->nregs; + + if (pcum->pcs_variant <= ARM_PCS_AAPCS_LOCAL) + { + aapcs_layout_arg (pcum, mode, type, named); + return pcum->aapcs_partial; + } + + if (TARGET_IWMMXT_ABI && arm_vector_mode_supported_p (mode)) + return 0; + + if (NUM_ARG_REGS > nregs + && (NUM_ARG_REGS < nregs + ARM_NUM_REGS2 (mode, type)) + && pcum->can_split) + return (NUM_ARG_REGS - nregs) * UNITS_PER_WORD; + + return 0; +} + +/* Update the data in PCUM to advance over an argument + of mode MODE and data type TYPE. + (TYPE is null for libcalls where that information may not be available.) */ + +static void +arm_function_arg_advance (CUMULATIVE_ARGS *pcum, enum machine_mode mode, + const_tree type, bool named) +{ + if (pcum->pcs_variant <= ARM_PCS_AAPCS_LOCAL) + { + aapcs_layout_arg (pcum, mode, type, named); + + if (pcum->aapcs_cprc_slot >= 0) + { + aapcs_cp_arg_layout[pcum->aapcs_cprc_slot].advance (pcum, mode, + type); + pcum->aapcs_cprc_slot = -1; + } + + /* Generic stuff. */ + pcum->aapcs_arg_processed = false; + pcum->aapcs_ncrn = pcum->aapcs_next_ncrn; + pcum->aapcs_reg = NULL_RTX; + pcum->aapcs_partial = 0; + } + else + { + pcum->nargs += 1; + if (arm_vector_mode_supported_p (mode) + && pcum->named_count > pcum->nargs + && TARGET_IWMMXT_ABI) + pcum->iwmmxt_nregs += 1; + else + pcum->nregs += ARM_NUM_REGS2 (mode, type); + } +} + +/* Variable sized types are passed by reference. This is a GCC + extension to the ARM ABI. */ + +static bool +arm_pass_by_reference (CUMULATIVE_ARGS *cum ATTRIBUTE_UNUSED, + enum machine_mode mode ATTRIBUTE_UNUSED, + const_tree type, bool named ATTRIBUTE_UNUSED) +{ + return type && TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST; +} + +/* Encode the current state of the #pragma [no_]long_calls. */ +typedef enum +{ + OFF, /* No #pragma [no_]long_calls is in effect. */ + LONG, /* #pragma long_calls is in effect. */ + SHORT /* #pragma no_long_calls is in effect. */ +} arm_pragma_enum; + +static arm_pragma_enum arm_pragma_long_calls = OFF; + +void +arm_pr_long_calls (struct cpp_reader * pfile ATTRIBUTE_UNUSED) +{ + arm_pragma_long_calls = LONG; +} + +void +arm_pr_no_long_calls (struct cpp_reader * pfile ATTRIBUTE_UNUSED) +{ + arm_pragma_long_calls = SHORT; +} + +void +arm_pr_long_calls_off (struct cpp_reader * pfile ATTRIBUTE_UNUSED) +{ + arm_pragma_long_calls = OFF; +} + +/* Handle an attribute requiring a FUNCTION_DECL; + arguments as in struct attribute_spec.handler. */ +static tree +arm_handle_fndecl_attribute (tree *node, tree name, tree args ATTRIBUTE_UNUSED, + int flags ATTRIBUTE_UNUSED, bool *no_add_attrs) +{ + if (TREE_CODE (*node) != FUNCTION_DECL) + { + warning (OPT_Wattributes, "%qE attribute only applies to functions", + name); + *no_add_attrs = true; + } + + return NULL_TREE; +} + +/* Handle an "interrupt" or "isr" attribute; + arguments as in struct attribute_spec.handler. */ +static tree +arm_handle_isr_attribute (tree *node, tree name, tree args, int flags, + bool *no_add_attrs) +{ + if (DECL_P (*node)) + { + if (TREE_CODE (*node) != FUNCTION_DECL) + { + warning (OPT_Wattributes, "%qE attribute only applies to functions", + name); + *no_add_attrs = true; + } + /* FIXME: the argument if any is checked for type attributes; + should it be checked for decl ones? */ + } + else + { + if (TREE_CODE (*node) == FUNCTION_TYPE + || TREE_CODE (*node) == METHOD_TYPE) + { + if (arm_isr_value (args) == ARM_FT_UNKNOWN) + { + warning (OPT_Wattributes, "%qE attribute ignored", + name); + *no_add_attrs = true; + } + } + else if (TREE_CODE (*node) == POINTER_TYPE + && (TREE_CODE (TREE_TYPE (*node)) == FUNCTION_TYPE + || TREE_CODE (TREE_TYPE (*node)) == METHOD_TYPE) + && arm_isr_value (args) != ARM_FT_UNKNOWN) + { + *node = build_variant_type_copy (*node); + TREE_TYPE (*node) = build_type_attribute_variant + (TREE_TYPE (*node), + tree_cons (name, args, TYPE_ATTRIBUTES (TREE_TYPE (*node)))); + *no_add_attrs = true; + } + else + { + /* Possibly pass this attribute on from the type to a decl. */ + if (flags & ((int) ATTR_FLAG_DECL_NEXT + | (int) ATTR_FLAG_FUNCTION_NEXT + | (int) ATTR_FLAG_ARRAY_NEXT)) + { + *no_add_attrs = true; + return tree_cons (name, args, NULL_TREE); + } + else + { + warning (OPT_Wattributes, "%qE attribute ignored", + name); + } + } + } + + return NULL_TREE; +} + +/* Handle a "pcs" attribute; arguments as in struct + attribute_spec.handler. */ +static tree +arm_handle_pcs_attribute (tree *node ATTRIBUTE_UNUSED, tree name, tree args, + int flags ATTRIBUTE_UNUSED, bool *no_add_attrs) +{ + if (arm_pcs_from_attribute (args) == ARM_PCS_UNKNOWN) + { + warning (OPT_Wattributes, "%qE attribute ignored", name); + *no_add_attrs = true; + } + return NULL_TREE; +} + +#if TARGET_DLLIMPORT_DECL_ATTRIBUTES +/* Handle the "notshared" attribute. This attribute is another way of + requesting hidden visibility. ARM's compiler supports + "__declspec(notshared)"; we support the same thing via an + attribute. */ + +static tree +arm_handle_notshared_attribute (tree *node, + tree name ATTRIBUTE_UNUSED, + tree args ATTRIBUTE_UNUSED, + int flags ATTRIBUTE_UNUSED, + bool *no_add_attrs) +{ + tree decl = TYPE_NAME (*node); + + if (decl) + { + DECL_VISIBILITY (decl) = VISIBILITY_HIDDEN; + DECL_VISIBILITY_SPECIFIED (decl) = 1; + *no_add_attrs = false; + } + return NULL_TREE; +} +#endif + +/* Return 0 if the attributes for two types are incompatible, 1 if they + are compatible, and 2 if they are nearly compatible (which causes a + warning to be generated). */ +static int +arm_comp_type_attributes (const_tree type1, const_tree type2) +{ + int l1, l2, s1, s2; + + /* Check for mismatch of non-default calling convention. */ + if (TREE_CODE (type1) != FUNCTION_TYPE) + return 1; + + /* Check for mismatched call attributes. */ + l1 = lookup_attribute ("long_call", TYPE_ATTRIBUTES (type1)) != NULL; + l2 = lookup_attribute ("long_call", TYPE_ATTRIBUTES (type2)) != NULL; + s1 = lookup_attribute ("short_call", TYPE_ATTRIBUTES (type1)) != NULL; + s2 = lookup_attribute ("short_call", TYPE_ATTRIBUTES (type2)) != NULL; + + /* Only bother to check if an attribute is defined. */ + if (l1 | l2 | s1 | s2) + { + /* If one type has an attribute, the other must have the same attribute. */ + if ((l1 != l2) || (s1 != s2)) + return 0; + + /* Disallow mixed attributes. */ + if ((l1 & s2) || (l2 & s1)) + return 0; + } + + /* Check for mismatched ISR attribute. */ + l1 = lookup_attribute ("isr", TYPE_ATTRIBUTES (type1)) != NULL; + if (! l1) + l1 = lookup_attribute ("interrupt", TYPE_ATTRIBUTES (type1)) != NULL; + l2 = lookup_attribute ("isr", TYPE_ATTRIBUTES (type2)) != NULL; + if (! l2) + l1 = lookup_attribute ("interrupt", TYPE_ATTRIBUTES (type2)) != NULL; + if (l1 != l2) + return 0; + + return 1; +} + +/* Assigns default attributes to newly defined type. This is used to + set short_call/long_call attributes for function types of + functions defined inside corresponding #pragma scopes. */ +static void +arm_set_default_type_attributes (tree type) +{ + /* Add __attribute__ ((long_call)) to all functions, when + inside #pragma long_calls or __attribute__ ((short_call)), + when inside #pragma no_long_calls. */ + if (TREE_CODE (type) == FUNCTION_TYPE || TREE_CODE (type) == METHOD_TYPE) + { + tree type_attr_list, attr_name; + type_attr_list = TYPE_ATTRIBUTES (type); + + if (arm_pragma_long_calls == LONG) + attr_name = get_identifier ("long_call"); + else if (arm_pragma_long_calls == SHORT) + attr_name = get_identifier ("short_call"); + else + return; + + type_attr_list = tree_cons (attr_name, NULL_TREE, type_attr_list); + TYPE_ATTRIBUTES (type) = type_attr_list; + } +} + +/* Return true if DECL is known to be linked into section SECTION. */ + +static bool +arm_function_in_section_p (tree decl, section *section) +{ + /* We can only be certain about functions defined in the same + compilation unit. */ + if (!TREE_STATIC (decl)) + return false; + + /* Make sure that SYMBOL always binds to the definition in this + compilation unit. */ + if (!targetm.binds_local_p (decl)) + return false; + + /* If DECL_SECTION_NAME is set, assume it is trustworthy. */ + if (!DECL_SECTION_NAME (decl)) + { + /* Make sure that we will not create a unique section for DECL. */ + if (flag_function_sections || DECL_ONE_ONLY (decl)) + return false; + } + + return function_section (decl) == section; +} + +/* Return nonzero if a 32-bit "long_call" should be generated for + a call from the current function to DECL. We generate a long_call + if the function: + + a. has an __attribute__((long call)) + or b. is within the scope of a #pragma long_calls + or c. the -mlong-calls command line switch has been specified + + However we do not generate a long call if the function: + + d. has an __attribute__ ((short_call)) + or e. is inside the scope of a #pragma no_long_calls + or f. is defined in the same section as the current function. */ + +bool +arm_is_long_call_p (tree decl) +{ + tree attrs; + + if (!decl) + return TARGET_LONG_CALLS; + + attrs = TYPE_ATTRIBUTES (TREE_TYPE (decl)); + if (lookup_attribute ("short_call", attrs)) + return false; + + /* For "f", be conservative, and only cater for cases in which the + whole of the current function is placed in the same section. */ + if (!flag_reorder_blocks_and_partition + && TREE_CODE (decl) == FUNCTION_DECL + && arm_function_in_section_p (decl, current_function_section ())) + return false; + + if (lookup_attribute ("long_call", attrs)) + return true; + + return TARGET_LONG_CALLS; +} + +/* Return nonzero if it is ok to make a tail-call to DECL. */ +static bool +arm_function_ok_for_sibcall (tree decl, tree exp) +{ + unsigned long func_type; + + if (cfun->machine->sibcall_blocked) + return false; + + /* Never tailcall something for which we have no decl, or if we + are generating code for Thumb-1. */ + if (decl == NULL || TARGET_THUMB1) + return false; + + /* The PIC register is live on entry to VxWorks PLT entries, so we + must make the call before restoring the PIC register. */ + if (TARGET_VXWORKS_RTP && flag_pic && !targetm.binds_local_p (decl)) + return false; + + /* Cannot tail-call to long calls, since these are out of range of + a branch instruction. */ + if (arm_is_long_call_p (decl)) + return false; + + /* If we are interworking and the function is not declared static + then we can't tail-call it unless we know that it exists in this + compilation unit (since it might be a Thumb routine). */ + if (TARGET_INTERWORK && TREE_PUBLIC (decl) && !TREE_ASM_WRITTEN (decl)) + return false; + + func_type = arm_current_func_type (); + /* Never tailcall from an ISR routine - it needs a special exit sequence. */ + if (IS_INTERRUPT (func_type)) + return false; + + if (!VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl)))) + { + /* Check that the return value locations are the same. For + example that we aren't returning a value from the sibling in + a VFP register but then need to transfer it to a core + register. */ + rtx a, b; + + a = arm_function_value (TREE_TYPE (exp), decl, false); + b = arm_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)), + cfun->decl, false); + if (!rtx_equal_p (a, b)) + return false; + } + + /* Never tailcall if function may be called with a misaligned SP. */ + if (IS_STACKALIGN (func_type)) + return false; + + /* The AAPCS says that, on bare-metal, calls to unresolved weak + references should become a NOP. Don't convert such calls into + sibling calls. */ + if (TARGET_AAPCS_BASED + && arm_abi == ARM_ABI_AAPCS + && DECL_WEAK (decl)) + return false; + + /* Everything else is ok. */ + return true; +} + + +/* Addressing mode support functions. */ + +/* Return nonzero if X is a legitimate immediate operand when compiling + for PIC. We know that X satisfies CONSTANT_P and flag_pic is true. */ +int +legitimate_pic_operand_p (rtx x) +{ + if (GET_CODE (x) == SYMBOL_REF + || (GET_CODE (x) == CONST + && GET_CODE (XEXP (x, 0)) == PLUS + && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)) + return 0; + + return 1; +} + +/* Record that the current function needs a PIC register. Initialize + cfun->machine->pic_reg if we have not already done so. */ + +static void +require_pic_register (void) +{ + /* A lot of the logic here is made obscure by the fact that this + routine gets called as part of the rtx cost estimation process. + We don't want those calls to affect any assumptions about the real + function; and further, we can't call entry_of_function() until we + start the real expansion process. */ + if (!crtl->uses_pic_offset_table) + { + gcc_assert (can_create_pseudo_p ()); + if (arm_pic_register != INVALID_REGNUM) + { + if (!cfun->machine->pic_reg) + cfun->machine->pic_reg = gen_rtx_REG (Pmode, arm_pic_register); + + /* Play games to avoid marking the function as needing pic + if we are being called as part of the cost-estimation + process. */ + if (current_ir_type () != IR_GIMPLE || currently_expanding_to_rtl) + crtl->uses_pic_offset_table = 1; + } + else + { + rtx seq, insn; + + if (!cfun->machine->pic_reg) + cfun->machine->pic_reg = gen_reg_rtx (Pmode); + + /* Play games to avoid marking the function as needing pic + if we are being called as part of the cost-estimation + process. */ + if (current_ir_type () != IR_GIMPLE || currently_expanding_to_rtl) + { + crtl->uses_pic_offset_table = 1; + start_sequence (); + + arm_load_pic_register (0UL); + + seq = get_insns (); + end_sequence (); + + for (insn = seq; insn; insn = NEXT_INSN (insn)) + if (INSN_P (insn)) + INSN_LOCATOR (insn) = prologue_locator; + + /* We can be called during expansion of PHI nodes, where + we can't yet emit instructions directly in the final + insn stream. Queue the insns on the entry edge, they will + be committed after everything else is expanded. */ + insert_insn_on_edge (seq, single_succ_edge (ENTRY_BLOCK_PTR)); + } + } + } +} + +rtx +legitimize_pic_address (rtx orig, enum machine_mode mode, rtx reg) +{ + if (GET_CODE (orig) == SYMBOL_REF + || GET_CODE (orig) == LABEL_REF) + { + rtx insn; + + if (reg == 0) + { + gcc_assert (can_create_pseudo_p ()); + reg = gen_reg_rtx (Pmode); + } + + /* VxWorks does not impose a fixed gap between segments; the run-time + gap can be different from the object-file gap. We therefore can't + use GOTOFF unless we are absolutely sure that the symbol is in the + same segment as the GOT. Unfortunately, the flexibility of linker + scripts means that we can't be sure of that in general, so assume + that GOTOFF is never valid on VxWorks. */ + if ((GET_CODE (orig) == LABEL_REF + || (GET_CODE (orig) == SYMBOL_REF && + SYMBOL_REF_LOCAL_P (orig))) + && NEED_GOT_RELOC + && !TARGET_VXWORKS_RTP) + insn = arm_pic_static_addr (orig, reg); + else + { + rtx pat; + rtx mem; + + /* If this function doesn't have a pic register, create one now. */ + require_pic_register (); + + pat = gen_calculate_pic_address (reg, cfun->machine->pic_reg, orig); + + /* Make the MEM as close to a constant as possible. */ + mem = SET_SRC (pat); + gcc_assert (MEM_P (mem) && !MEM_VOLATILE_P (mem)); + MEM_READONLY_P (mem) = 1; + MEM_NOTRAP_P (mem) = 1; + + insn = emit_insn (pat); + } + + /* Put a REG_EQUAL note on this insn, so that it can be optimized + by loop. */ + set_unique_reg_note (insn, REG_EQUAL, orig); + + return reg; + } + else if (GET_CODE (orig) == CONST) + { + rtx base, offset; + + if (GET_CODE (XEXP (orig, 0)) == PLUS + && XEXP (XEXP (orig, 0), 0) == cfun->machine->pic_reg) + return orig; + + /* Handle the case where we have: const (UNSPEC_TLS). */ + if (GET_CODE (XEXP (orig, 0)) == UNSPEC + && XINT (XEXP (orig, 0), 1) == UNSPEC_TLS) + return orig; + + /* Handle the case where we have: + const (plus (UNSPEC_TLS) (ADDEND)). The ADDEND must be a + CONST_INT. */ + if (GET_CODE (XEXP (orig, 0)) == PLUS + && GET_CODE (XEXP (XEXP (orig, 0), 0)) == UNSPEC + && XINT (XEXP (XEXP (orig, 0), 0), 1) == UNSPEC_TLS) + { + gcc_assert (GET_CODE (XEXP (XEXP (orig, 0), 1)) == CONST_INT); + return orig; + } + + if (reg == 0) + { + gcc_assert (can_create_pseudo_p ()); + reg = gen_reg_rtx (Pmode); + } + + gcc_assert (GET_CODE (XEXP (orig, 0)) == PLUS); + + base = legitimize_pic_address (XEXP (XEXP (orig, 0), 0), Pmode, reg); + offset = legitimize_pic_address (XEXP (XEXP (orig, 0), 1), Pmode, + base == reg ? 0 : reg); + + if (GET_CODE (offset) == CONST_INT) + { + /* The base register doesn't really matter, we only want to + test the index for the appropriate mode. */ + if (!arm_legitimate_index_p (mode, offset, SET, 0)) + { + gcc_assert (can_create_pseudo_p ()); + offset = force_reg (Pmode, offset); + } + + if (GET_CODE (offset) == CONST_INT) + return plus_constant (base, INTVAL (offset)); + } + + if (GET_MODE_SIZE (mode) > 4 + && (GET_MODE_CLASS (mode) == MODE_INT + || TARGET_SOFT_FLOAT)) + { + emit_insn (gen_addsi3 (reg, base, offset)); + return reg; + } + + return gen_rtx_PLUS (Pmode, base, offset); + } + + return orig; +} + + +/* Find a spare register to use during the prolog of a function. */ + +static int +thumb_find_work_register (unsigned long pushed_regs_mask) +{ + int reg; + + /* Check the argument registers first as these are call-used. The + register allocation order means that sometimes r3 might be used + but earlier argument registers might not, so check them all. */ + for (reg = LAST_ARG_REGNUM; reg >= 0; reg --) + if (!df_regs_ever_live_p (reg)) + return reg; + + /* Before going on to check the call-saved registers we can try a couple + more ways of deducing that r3 is available. The first is when we are + pushing anonymous arguments onto the stack and we have less than 4 + registers worth of fixed arguments(*). In this case r3 will be part of + the variable argument list and so we can be sure that it will be + pushed right at the start of the function. Hence it will be available + for the rest of the prologue. + (*): ie crtl->args.pretend_args_size is greater than 0. */ + if (cfun->machine->uses_anonymous_args + && crtl->args.pretend_args_size > 0) + return LAST_ARG_REGNUM; + + /* The other case is when we have fixed arguments but less than 4 registers + worth. In this case r3 might be used in the body of the function, but + it is not being used to convey an argument into the function. In theory + we could just check crtl->args.size to see how many bytes are + being passed in argument registers, but it seems that it is unreliable. + Sometimes it will have the value 0 when in fact arguments are being + passed. (See testcase execute/20021111-1.c for an example). So we also + check the args_info.nregs field as well. The problem with this field is + that it makes no allowances for arguments that are passed to the + function but which are not used. Hence we could miss an opportunity + when a function has an unused argument in r3. But it is better to be + safe than to be sorry. */ + if (! cfun->machine->uses_anonymous_args + && crtl->args.size >= 0 + && crtl->args.size <= (LAST_ARG_REGNUM * UNITS_PER_WORD) + && crtl->args.info.nregs < 4) + return LAST_ARG_REGNUM; + + /* Otherwise look for a call-saved register that is going to be pushed. */ + for (reg = LAST_LO_REGNUM; reg > LAST_ARG_REGNUM; reg --) + if (pushed_regs_mask & (1 << reg)) + return reg; + + if (TARGET_THUMB2) + { + /* Thumb-2 can use high regs. */ + for (reg = FIRST_HI_REGNUM; reg < 15; reg ++) + if (pushed_regs_mask & (1 << reg)) + return reg; + } + /* Something went wrong - thumb_compute_save_reg_mask() + should have arranged for a suitable register to be pushed. */ + gcc_unreachable (); +} + +static GTY(()) int pic_labelno; + +/* Generate code to load the PIC register. In thumb mode SCRATCH is a + low register. */ + +void +arm_load_pic_register (unsigned long saved_regs ATTRIBUTE_UNUSED) +{ + rtx l1, labelno, pic_tmp, pic_rtx, pic_reg; + + if (crtl->uses_pic_offset_table == 0 || TARGET_SINGLE_PIC_BASE) + return; + + gcc_assert (flag_pic); + + pic_reg = cfun->machine->pic_reg; + if (TARGET_VXWORKS_RTP) + { + pic_rtx = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE); + pic_rtx = gen_rtx_CONST (Pmode, pic_rtx); + emit_insn (gen_pic_load_addr_32bit (pic_reg, pic_rtx)); + + emit_insn (gen_rtx_SET (Pmode, pic_reg, gen_rtx_MEM (Pmode, pic_reg))); + + pic_tmp = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_INDEX); + emit_insn (gen_pic_offset_arm (pic_reg, pic_reg, pic_tmp)); + } + else + { + /* We use an UNSPEC rather than a LABEL_REF because this label + never appears in the code stream. */ + + labelno = GEN_INT (pic_labelno++); + l1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, labelno), UNSPEC_PIC_LABEL); + l1 = gen_rtx_CONST (VOIDmode, l1); + + /* On the ARM the PC register contains 'dot + 8' at the time of the + addition, on the Thumb it is 'dot + 4'. */ + pic_rtx = plus_constant (l1, TARGET_ARM ? 8 : 4); + pic_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, pic_rtx), + UNSPEC_GOTSYM_OFF); + pic_rtx = gen_rtx_CONST (Pmode, pic_rtx); + + if (TARGET_32BIT) + { + emit_insn (gen_pic_load_addr_unified (pic_reg, pic_rtx, labelno)); + } + else /* TARGET_THUMB1 */ + { + if (arm_pic_register != INVALID_REGNUM + && REGNO (pic_reg) > LAST_LO_REGNUM) + { + /* We will have pushed the pic register, so we should always be + able to find a work register. */ + pic_tmp = gen_rtx_REG (SImode, + thumb_find_work_register (saved_regs)); + emit_insn (gen_pic_load_addr_thumb1 (pic_tmp, pic_rtx)); + emit_insn (gen_movsi (pic_offset_table_rtx, pic_tmp)); + emit_insn (gen_pic_add_dot_plus_four (pic_reg, pic_reg, labelno)); + } + else + emit_insn (gen_pic_load_addr_unified (pic_reg, pic_rtx, labelno)); + } + } + + /* Need to emit this whether or not we obey regdecls, + since setjmp/longjmp can cause life info to screw up. */ + emit_use (pic_reg); +} + +/* Generate code to load the address of a static var when flag_pic is set. */ +static rtx +arm_pic_static_addr (rtx orig, rtx reg) +{ + rtx l1, labelno, offset_rtx, insn; + + gcc_assert (flag_pic); + + /* We use an UNSPEC rather than a LABEL_REF because this label + never appears in the code stream. */ + labelno = GEN_INT (pic_labelno++); + l1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, labelno), UNSPEC_PIC_LABEL); + l1 = gen_rtx_CONST (VOIDmode, l1); + + /* On the ARM the PC register contains 'dot + 8' at the time of the + addition, on the Thumb it is 'dot + 4'. */ + offset_rtx = plus_constant (l1, TARGET_ARM ? 8 : 4); + offset_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (2, orig, offset_rtx), + UNSPEC_SYMBOL_OFFSET); + offset_rtx = gen_rtx_CONST (Pmode, offset_rtx); + + insn = emit_insn (gen_pic_load_addr_unified (reg, offset_rtx, labelno)); + return insn; +} + +/* Return nonzero if X is valid as an ARM state addressing register. */ +static int +arm_address_register_rtx_p (rtx x, int strict_p) +{ + int regno; + + if (GET_CODE (x) != REG) + return 0; + + regno = REGNO (x); + + if (strict_p) + return ARM_REGNO_OK_FOR_BASE_P (regno); + + return (regno <= LAST_ARM_REGNUM + || regno >= FIRST_PSEUDO_REGISTER + || regno == FRAME_POINTER_REGNUM + || regno == ARG_POINTER_REGNUM); +} + +/* Return TRUE if this rtx is the difference of a symbol and a label, + and will reduce to a PC-relative relocation in the object file. + Expressions like this can be left alone when generating PIC, rather + than forced through the GOT. */ +static int +pcrel_constant_p (rtx x) +{ + if (GET_CODE (x) == MINUS) + return symbol_mentioned_p (XEXP (x, 0)) && label_mentioned_p (XEXP (x, 1)); + + return FALSE; +} + +/* Return true if X will surely end up in an index register after next + splitting pass. */ +static bool +will_be_in_index_register (const_rtx x) +{ + /* arm.md: calculate_pic_address will split this into a register. */ + return GET_CODE (x) == UNSPEC && (XINT (x, 1) == UNSPEC_PIC_SYM); +} + +/* Return nonzero if X is a valid ARM state address operand. */ +int +arm_legitimate_address_outer_p (enum machine_mode mode, rtx x, RTX_CODE outer, + int strict_p) +{ + bool use_ldrd; + enum rtx_code code = GET_CODE (x); + + if (arm_address_register_rtx_p (x, strict_p)) + return 1; + + use_ldrd = (TARGET_LDRD + && (mode == DImode + || (mode == DFmode && (TARGET_SOFT_FLOAT || TARGET_VFP)))); + + if (code == POST_INC || code == PRE_DEC + || ((code == PRE_INC || code == POST_DEC) + && (use_ldrd || GET_MODE_SIZE (mode) <= 4))) + return arm_address_register_rtx_p (XEXP (x, 0), strict_p); + + else if ((code == POST_MODIFY || code == PRE_MODIFY) + && arm_address_register_rtx_p (XEXP (x, 0), strict_p) + && GET_CODE (XEXP (x, 1)) == PLUS + && rtx_equal_p (XEXP (XEXP (x, 1), 0), XEXP (x, 0))) + { + rtx addend = XEXP (XEXP (x, 1), 1); + + /* Don't allow ldrd post increment by register because it's hard + to fixup invalid register choices. */ + if (use_ldrd + && GET_CODE (x) == POST_MODIFY + && GET_CODE (addend) == REG) + return 0; + + return ((use_ldrd || GET_MODE_SIZE (mode) <= 4) + && arm_legitimate_index_p (mode, addend, outer, strict_p)); + } + + /* After reload constants split into minipools will have addresses + from a LABEL_REF. */ + else if (reload_completed + && (code == LABEL_REF + || (code == CONST + && GET_CODE (XEXP (x, 0)) == PLUS + && GET_CODE (XEXP (XEXP (x, 0), 0)) == LABEL_REF + && GET_CODE (XEXP (XEXP (x, 0), 1)) == CONST_INT))) + return 1; + + else if (mode == TImode || (TARGET_NEON && VALID_NEON_STRUCT_MODE (mode))) + return 0; + + else if (code == PLUS) + { + rtx xop0 = XEXP (x, 0); + rtx xop1 = XEXP (x, 1); + + return ((arm_address_register_rtx_p (xop0, strict_p) + && ((GET_CODE(xop1) == CONST_INT + && arm_legitimate_index_p (mode, xop1, outer, strict_p)) + || (!strict_p && will_be_in_index_register (xop1)))) + || (arm_address_register_rtx_p (xop1, strict_p) + && arm_legitimate_index_p (mode, xop0, outer, strict_p))); + } + +#if 0 + /* Reload currently can't handle MINUS, so disable this for now */ + else if (GET_CODE (x) == MINUS) + { + rtx xop0 = XEXP (x, 0); + rtx xop1 = XEXP (x, 1); + + return (arm_address_register_rtx_p (xop0, strict_p) + && arm_legitimate_index_p (mode, xop1, outer, strict_p)); + } +#endif + + else if (GET_MODE_CLASS (mode) != MODE_FLOAT + && code == SYMBOL_REF + && CONSTANT_POOL_ADDRESS_P (x) + && ! (flag_pic + && symbol_mentioned_p (get_pool_constant (x)) + && ! pcrel_constant_p (get_pool_constant (x)))) + return 1; + + return 0; +} + +/* Return nonzero if X is a valid Thumb-2 address operand. */ +static int +thumb2_legitimate_address_p (enum machine_mode mode, rtx x, int strict_p) +{ + bool use_ldrd; + enum rtx_code code = GET_CODE (x); + + if (arm_address_register_rtx_p (x, strict_p)) + return 1; + + use_ldrd = (TARGET_LDRD + && (mode == DImode + || (mode == DFmode && (TARGET_SOFT_FLOAT || TARGET_VFP)))); + + if (code == POST_INC || code == PRE_DEC + || ((code == PRE_INC || code == POST_DEC) + && (use_ldrd || GET_MODE_SIZE (mode) <= 4))) + return arm_address_register_rtx_p (XEXP (x, 0), strict_p); + + else if ((code == POST_MODIFY || code == PRE_MODIFY) + && arm_address_register_rtx_p (XEXP (x, 0), strict_p) + && GET_CODE (XEXP (x, 1)) == PLUS + && rtx_equal_p (XEXP (XEXP (x, 1), 0), XEXP (x, 0))) + { + /* Thumb-2 only has autoincrement by constant. */ + rtx addend = XEXP (XEXP (x, 1), 1); + HOST_WIDE_INT offset; + + if (GET_CODE (addend) != CONST_INT) + return 0; + + offset = INTVAL(addend); + if (GET_MODE_SIZE (mode) <= 4) + return (offset > -256 && offset < 256); + + return (use_ldrd && offset > -1024 && offset < 1024 + && (offset & 3) == 0); + } + + /* After reload constants split into minipools will have addresses + from a LABEL_REF. */ + else if (reload_completed + && (code == LABEL_REF + || (code == CONST + && GET_CODE (XEXP (x, 0)) == PLUS + && GET_CODE (XEXP (XEXP (x, 0), 0)) == LABEL_REF + && GET_CODE (XEXP (XEXP (x, 0), 1)) == CONST_INT))) + return 1; + + else if (mode == TImode || (TARGET_NEON && VALID_NEON_STRUCT_MODE (mode))) + return 0; + + else if (code == PLUS) + { + rtx xop0 = XEXP (x, 0); + rtx xop1 = XEXP (x, 1); + + return ((arm_address_register_rtx_p (xop0, strict_p) + && (thumb2_legitimate_index_p (mode, xop1, strict_p) + || (!strict_p && will_be_in_index_register (xop1)))) + || (arm_address_register_rtx_p (xop1, strict_p) + && thumb2_legitimate_index_p (mode, xop0, strict_p))); + } + + else if (GET_MODE_CLASS (mode) != MODE_FLOAT + && code == SYMBOL_REF + && CONSTANT_POOL_ADDRESS_P (x) + && ! (flag_pic + && symbol_mentioned_p (get_pool_constant (x)) + && ! pcrel_constant_p (get_pool_constant (x)))) + return 1; + + return 0; +} + +/* Return nonzero if INDEX is valid for an address index operand in + ARM state. */ +static int +arm_legitimate_index_p (enum machine_mode mode, rtx index, RTX_CODE outer, + int strict_p) +{ + HOST_WIDE_INT range; + enum rtx_code code = GET_CODE (index); + + /* Standard coprocessor addressing modes. */ + if (TARGET_HARD_FLOAT + && (TARGET_VFP || TARGET_FPA || TARGET_MAVERICK) + && (mode == SFmode || mode == DFmode + || (TARGET_MAVERICK && mode == DImode))) + return (code == CONST_INT && INTVAL (index) < 1024 + && INTVAL (index) > -1024 + && (INTVAL (index) & 3) == 0); + + /* For quad modes, we restrict the constant offset to be slightly less + than what the instruction format permits. We do this because for + quad mode moves, we will actually decompose them into two separate + double-mode reads or writes. INDEX must therefore be a valid + (double-mode) offset and so should INDEX+8. */ + if (TARGET_NEON && VALID_NEON_QREG_MODE (mode)) + return (code == CONST_INT + && INTVAL (index) < 1016 + && INTVAL (index) > -1024 + && (INTVAL (index) & 3) == 0); + + /* We have no such constraint on double mode offsets, so we permit the + full range of the instruction format. */ + if (TARGET_NEON && VALID_NEON_DREG_MODE (mode)) + return (code == CONST_INT + && INTVAL (index) < 1024 + && INTVAL (index) > -1024 + && (INTVAL (index) & 3) == 0); + + if (TARGET_REALLY_IWMMXT && VALID_IWMMXT_REG_MODE (mode)) + return (code == CONST_INT + && INTVAL (index) < 1024 + && INTVAL (index) > -1024 + && (INTVAL (index) & 3) == 0); + + if (arm_address_register_rtx_p (index, strict_p) + && (GET_MODE_SIZE (mode) <= 4)) + return 1; + + if (mode == DImode || mode == DFmode) + { + if (code == CONST_INT) + { + HOST_WIDE_INT val = INTVAL (index); + + if (TARGET_LDRD) + return val > -256 && val < 256; + else + return val > -4096 && val < 4092; + } + + return TARGET_LDRD && arm_address_register_rtx_p (index, strict_p); + } + + if (GET_MODE_SIZE (mode) <= 4 + && ! (arm_arch4 + && (mode == HImode + || mode == HFmode + || (mode == QImode && outer == SIGN_EXTEND)))) + { + if (code == MULT) + { + rtx xiop0 = XEXP (index, 0); + rtx xiop1 = XEXP (index, 1); + + return ((arm_address_register_rtx_p (xiop0, strict_p) + && power_of_two_operand (xiop1, SImode)) + || (arm_address_register_rtx_p (xiop1, strict_p) + && power_of_two_operand (xiop0, SImode))); + } + else if (code == LSHIFTRT || code == ASHIFTRT + || code == ASHIFT || code == ROTATERT) + { + rtx op = XEXP (index, 1); + + return (arm_address_register_rtx_p (XEXP (index, 0), strict_p) + && GET_CODE (op) == CONST_INT + && INTVAL (op) > 0 + && INTVAL (op) <= 31); + } + } + + /* For ARM v4 we may be doing a sign-extend operation during the + load. */ + if (arm_arch4) + { + if (mode == HImode + || mode == HFmode + || (outer == SIGN_EXTEND && mode == QImode)) + range = 256; + else + range = 4096; + } + else + range = (mode == HImode || mode == HFmode) ? 4095 : 4096; + + return (code == CONST_INT + && INTVAL (index) < range + && INTVAL (index) > -range); +} + +/* Return true if OP is a valid index scaling factor for Thumb-2 address + index operand. i.e. 1, 2, 4 or 8. */ +static bool +thumb2_index_mul_operand (rtx op) +{ + HOST_WIDE_INT val; + + if (GET_CODE(op) != CONST_INT) + return false; + + val = INTVAL(op); + return (val == 1 || val == 2 || val == 4 || val == 8); +} + +/* Return nonzero if INDEX is a valid Thumb-2 address index operand. */ +static int +thumb2_legitimate_index_p (enum machine_mode mode, rtx index, int strict_p) +{ + enum rtx_code code = GET_CODE (index); + + /* ??? Combine arm and thumb2 coprocessor addressing modes. */ + /* Standard coprocessor addressing modes. */ + if (TARGET_HARD_FLOAT + && (TARGET_VFP || TARGET_FPA || TARGET_MAVERICK) + && (mode == SFmode || mode == DFmode + || (TARGET_MAVERICK && mode == DImode))) + return (code == CONST_INT && INTVAL (index) < 1024 + /* Thumb-2 allows only > -256 index range for it's core register + load/stores. Since we allow SF/DF in core registers, we have + to use the intersection between -256~4096 (core) and -1024~1024 + (coprocessor). */ + && INTVAL (index) > -256 + && (INTVAL (index) & 3) == 0); + + if (TARGET_REALLY_IWMMXT && VALID_IWMMXT_REG_MODE (mode)) + { + /* For DImode assume values will usually live in core regs + and only allow LDRD addressing modes. */ + if (!TARGET_LDRD || mode != DImode) + return (code == CONST_INT + && INTVAL (index) < 1024 + && INTVAL (index) > -1024 + && (INTVAL (index) & 3) == 0); + } + + /* For quad modes, we restrict the constant offset to be slightly less + than what the instruction format permits. We do this because for + quad mode moves, we will actually decompose them into two separate + double-mode reads or writes. INDEX must therefore be a valid + (double-mode) offset and so should INDEX+8. */ + if (TARGET_NEON && VALID_NEON_QREG_MODE (mode)) + return (code == CONST_INT + && INTVAL (index) < 1016 + && INTVAL (index) > -1024 + && (INTVAL (index) & 3) == 0); + + /* We have no such constraint on double mode offsets, so we permit the + full range of the instruction format. */ + if (TARGET_NEON && VALID_NEON_DREG_MODE (mode)) + return (code == CONST_INT + && INTVAL (index) < 1024 + && INTVAL (index) > -1024 + && (INTVAL (index) & 3) == 0); + + if (arm_address_register_rtx_p (index, strict_p) + && (GET_MODE_SIZE (mode) <= 4)) + return 1; + + if (mode == DImode || mode == DFmode) + { + if (code == CONST_INT) + { + HOST_WIDE_INT val = INTVAL (index); + /* ??? Can we assume ldrd for thumb2? */ + /* Thumb-2 ldrd only has reg+const addressing modes. */ + /* ldrd supports offsets of +-1020. + However the ldr fallback does not. */ + return val > -256 && val < 256 && (val & 3) == 0; + } + else + return 0; + } + + if (code == MULT) + { + rtx xiop0 = XEXP (index, 0); + rtx xiop1 = XEXP (index, 1); + + return ((arm_address_register_rtx_p (xiop0, strict_p) + && thumb2_index_mul_operand (xiop1)) + || (arm_address_register_rtx_p (xiop1, strict_p) + && thumb2_index_mul_operand (xiop0))); + } + else if (code == ASHIFT) + { + rtx op = XEXP (index, 1); + + return (arm_address_register_rtx_p (XEXP (index, 0), strict_p) + && GET_CODE (op) == CONST_INT + && INTVAL (op) > 0 + && INTVAL (op) <= 3); + } + + return (code == CONST_INT + && INTVAL (index) < 4096 + && INTVAL (index) > -256); +} + +/* Return nonzero if X is valid as a 16-bit Thumb state base register. */ +static int +thumb1_base_register_rtx_p (rtx x, enum machine_mode mode, int strict_p) +{ + int regno; + + if (GET_CODE (x) != REG) + return 0; + + regno = REGNO (x); + + if (strict_p) + return THUMB1_REGNO_MODE_OK_FOR_BASE_P (regno, mode); + + return (regno <= LAST_LO_REGNUM + || regno > LAST_VIRTUAL_REGISTER + || regno == FRAME_POINTER_REGNUM + || (GET_MODE_SIZE (mode) >= 4 + && (regno == STACK_POINTER_REGNUM + || regno >= FIRST_PSEUDO_REGISTER + || x == hard_frame_pointer_rtx + || x == arg_pointer_rtx))); +} + +/* Return nonzero if x is a legitimate index register. This is the case + for any base register that can access a QImode object. */ +inline static int +thumb1_index_register_rtx_p (rtx x, int strict_p) +{ + return thumb1_base_register_rtx_p (x, QImode, strict_p); +} + +/* Return nonzero if x is a legitimate 16-bit Thumb-state address. + + The AP may be eliminated to either the SP or the FP, so we use the + least common denominator, e.g. SImode, and offsets from 0 to 64. + + ??? Verify whether the above is the right approach. + + ??? Also, the FP may be eliminated to the SP, so perhaps that + needs special handling also. + + ??? Look at how the mips16 port solves this problem. It probably uses + better ways to solve some of these problems. + + Although it is not incorrect, we don't accept QImode and HImode + addresses based on the frame pointer or arg pointer until the + reload pass starts. This is so that eliminating such addresses + into stack based ones won't produce impossible code. */ +static int +thumb1_legitimate_address_p (enum machine_mode mode, rtx x, int strict_p) +{ + /* ??? Not clear if this is right. Experiment. */ + if (GET_MODE_SIZE (mode) < 4 + && !(reload_in_progress || reload_completed) + && (reg_mentioned_p (frame_pointer_rtx, x) + || reg_mentioned_p (arg_pointer_rtx, x) + || reg_mentioned_p (virtual_incoming_args_rtx, x) + || reg_mentioned_p (virtual_outgoing_args_rtx, x) + || reg_mentioned_p (virtual_stack_dynamic_rtx, x) + || reg_mentioned_p (virtual_stack_vars_rtx, x))) + return 0; + + /* Accept any base register. SP only in SImode or larger. */ + else if (thumb1_base_register_rtx_p (x, mode, strict_p)) + return 1; + + /* This is PC relative data before arm_reorg runs. */ + else if (GET_MODE_SIZE (mode) >= 4 && CONSTANT_P (x) + && GET_CODE (x) == SYMBOL_REF + && CONSTANT_POOL_ADDRESS_P (x) && !flag_pic) + return 1; + + /* This is PC relative data after arm_reorg runs. */ + else if ((GET_MODE_SIZE (mode) >= 4 || mode == HFmode) + && reload_completed + && (GET_CODE (x) == LABEL_REF + || (GET_CODE (x) == CONST + && GET_CODE (XEXP (x, 0)) == PLUS + && GET_CODE (XEXP (XEXP (x, 0), 0)) == LABEL_REF + && GET_CODE (XEXP (XEXP (x, 0), 1)) == CONST_INT))) + return 1; + + /* Post-inc indexing only supported for SImode and larger. */ + else if (GET_CODE (x) == POST_INC && GET_MODE_SIZE (mode) >= 4 + && thumb1_index_register_rtx_p (XEXP (x, 0), strict_p)) + return 1; + + else if (GET_CODE (x) == PLUS) + { + /* REG+REG address can be any two index registers. */ + /* We disallow FRAME+REG addressing since we know that FRAME + will be replaced with STACK, and SP relative addressing only + permits SP+OFFSET. */ + if (GET_MODE_SIZE (mode) <= 4 + && XEXP (x, 0) != frame_pointer_rtx + && XEXP (x, 1) != frame_pointer_rtx + && thumb1_index_register_rtx_p (XEXP (x, 0), strict_p) + && (thumb1_index_register_rtx_p (XEXP (x, 1), strict_p) + || (!strict_p && will_be_in_index_register (XEXP (x, 1))))) + return 1; + + /* REG+const has 5-7 bit offset for non-SP registers. */ + else if ((thumb1_index_register_rtx_p (XEXP (x, 0), strict_p) + || XEXP (x, 0) == arg_pointer_rtx) + && GET_CODE (XEXP (x, 1)) == CONST_INT + && thumb_legitimate_offset_p (mode, INTVAL (XEXP (x, 1)))) + return 1; + + /* REG+const has 10-bit offset for SP, but only SImode and + larger is supported. */ + /* ??? Should probably check for DI/DFmode overflow here + just like GO_IF_LEGITIMATE_OFFSET does. */ + else if (GET_CODE (XEXP (x, 0)) == REG + && REGNO (XEXP (x, 0)) == STACK_POINTER_REGNUM + && GET_MODE_SIZE (mode) >= 4 + && GET_CODE (XEXP (x, 1)) == CONST_INT + && INTVAL (XEXP (x, 1)) >= 0 + && INTVAL (XEXP (x, 1)) + GET_MODE_SIZE (mode) <= 1024 + && (INTVAL (XEXP (x, 1)) & 3) == 0) + return 1; + + else if (GET_CODE (XEXP (x, 0)) == REG + && (REGNO (XEXP (x, 0)) == FRAME_POINTER_REGNUM + || REGNO (XEXP (x, 0)) == ARG_POINTER_REGNUM + || (REGNO (XEXP (x, 0)) >= FIRST_VIRTUAL_REGISTER + && REGNO (XEXP (x, 0)) + <= LAST_VIRTUAL_POINTER_REGISTER)) + && GET_MODE_SIZE (mode) >= 4 + && GET_CODE (XEXP (x, 1)) == CONST_INT + && (INTVAL (XEXP (x, 1)) & 3) == 0) + return 1; + } + + else if (GET_MODE_CLASS (mode) != MODE_FLOAT + && GET_MODE_SIZE (mode) == 4 + && GET_CODE (x) == SYMBOL_REF + && CONSTANT_POOL_ADDRESS_P (x) + && ! (flag_pic + && symbol_mentioned_p (get_pool_constant (x)) + && ! pcrel_constant_p (get_pool_constant (x)))) + return 1; + + return 0; +} + +/* Return nonzero if VAL can be used as an offset in a Thumb-state address + instruction of mode MODE. */ +int +thumb_legitimate_offset_p (enum machine_mode mode, HOST_WIDE_INT val) +{ + switch (GET_MODE_SIZE (mode)) + { + case 1: + return val >= 0 && val < 32; + + case 2: + return val >= 0 && val < 64 && (val & 1) == 0; + + default: + return (val >= 0 + && (val + GET_MODE_SIZE (mode)) <= 128 + && (val & 3) == 0); + } +} + +bool +arm_legitimate_address_p (enum machine_mode mode, rtx x, bool strict_p) +{ + if (TARGET_ARM) + return arm_legitimate_address_outer_p (mode, x, SET, strict_p); + else if (TARGET_THUMB2) + return thumb2_legitimate_address_p (mode, x, strict_p); + else /* if (TARGET_THUMB1) */ + return thumb1_legitimate_address_p (mode, x, strict_p); +} + +/* Build the SYMBOL_REF for __tls_get_addr. */ + +static GTY(()) rtx tls_get_addr_libfunc; + +static rtx +get_tls_get_addr (void) +{ + if (!tls_get_addr_libfunc) + tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr"); + return tls_get_addr_libfunc; +} + +static rtx +arm_load_tp (rtx target) +{ + if (!target) + target = gen_reg_rtx (SImode); + + if (TARGET_HARD_TP) + { + /* Can return in any reg. */ + emit_insn (gen_load_tp_hard (target)); + } + else + { + /* Always returned in r0. Immediately copy the result into a pseudo, + otherwise other uses of r0 (e.g. setting up function arguments) may + clobber the value. */ + + rtx tmp; + + emit_insn (gen_load_tp_soft ()); + + tmp = gen_rtx_REG (SImode, 0); + emit_move_insn (target, tmp); + } + return target; +} + +static rtx +load_tls_operand (rtx x, rtx reg) +{ + rtx tmp; + + if (reg == NULL_RTX) + reg = gen_reg_rtx (SImode); + + tmp = gen_rtx_CONST (SImode, x); + + emit_move_insn (reg, tmp); + + return reg; +} + +static rtx +arm_call_tls_get_addr (rtx x, rtx reg, rtx *valuep, int reloc) +{ + rtx insns, label, labelno, sum; + + start_sequence (); + + labelno = GEN_INT (pic_labelno++); + label = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, labelno), UNSPEC_PIC_LABEL); + label = gen_rtx_CONST (VOIDmode, label); + + sum = gen_rtx_UNSPEC (Pmode, + gen_rtvec (4, x, GEN_INT (reloc), label, + GEN_INT (TARGET_ARM ? 8 : 4)), + UNSPEC_TLS); + reg = load_tls_operand (sum, reg); + + if (TARGET_ARM) + emit_insn (gen_pic_add_dot_plus_eight (reg, reg, labelno)); + else if (TARGET_THUMB2) + emit_insn (gen_pic_add_dot_plus_four (reg, reg, labelno)); + else /* TARGET_THUMB1 */ + emit_insn (gen_pic_add_dot_plus_four (reg, reg, labelno)); + + *valuep = emit_library_call_value (get_tls_get_addr (), NULL_RTX, LCT_PURE, /* LCT_CONST? */ + Pmode, 1, reg, Pmode); + + insns = get_insns (); + end_sequence (); + + return insns; +} + +rtx +legitimize_tls_address (rtx x, rtx reg) +{ + rtx dest, tp, label, labelno, sum, insns, ret, eqv, addend; + unsigned int model = SYMBOL_REF_TLS_MODEL (x); + + switch (model) + { + case TLS_MODEL_GLOBAL_DYNAMIC: + insns = arm_call_tls_get_addr (x, reg, &ret, TLS_GD32); + dest = gen_reg_rtx (Pmode); + emit_libcall_block (insns, dest, ret, x); + return dest; + + case TLS_MODEL_LOCAL_DYNAMIC: + insns = arm_call_tls_get_addr (x, reg, &ret, TLS_LDM32); + + /* Attach a unique REG_EQUIV, to allow the RTL optimizers to + share the LDM result with other LD model accesses. */ + eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const1_rtx), + UNSPEC_TLS); + dest = gen_reg_rtx (Pmode); + emit_libcall_block (insns, dest, ret, eqv); + + /* Load the addend. */ + addend = gen_rtx_UNSPEC (Pmode, gen_rtvec (2, x, GEN_INT (TLS_LDO32)), + UNSPEC_TLS); + addend = force_reg (SImode, gen_rtx_CONST (SImode, addend)); + return gen_rtx_PLUS (Pmode, dest, addend); + + case TLS_MODEL_INITIAL_EXEC: + labelno = GEN_INT (pic_labelno++); + label = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, labelno), UNSPEC_PIC_LABEL); + label = gen_rtx_CONST (VOIDmode, label); + sum = gen_rtx_UNSPEC (Pmode, + gen_rtvec (4, x, GEN_INT (TLS_IE32), label, + GEN_INT (TARGET_ARM ? 8 : 4)), + UNSPEC_TLS); + reg = load_tls_operand (sum, reg); + + if (TARGET_ARM) + emit_insn (gen_tls_load_dot_plus_eight (reg, reg, labelno)); + else if (TARGET_THUMB2) + emit_insn (gen_tls_load_dot_plus_four (reg, NULL, reg, labelno)); + else + { + emit_insn (gen_pic_add_dot_plus_four (reg, reg, labelno)); + emit_move_insn (reg, gen_const_mem (SImode, reg)); + } + + tp = arm_load_tp (NULL_RTX); + + return gen_rtx_PLUS (Pmode, tp, reg); + + case TLS_MODEL_LOCAL_EXEC: + tp = arm_load_tp (NULL_RTX); + + reg = gen_rtx_UNSPEC (Pmode, + gen_rtvec (2, x, GEN_INT (TLS_LE32)), + UNSPEC_TLS); + reg = force_reg (SImode, gen_rtx_CONST (SImode, reg)); + + return gen_rtx_PLUS (Pmode, tp, reg); + + default: + abort (); + } +} + +/* Try machine-dependent ways of modifying an illegitimate address + to be legitimate. If we find one, return the new, valid address. */ +rtx +arm_legitimize_address (rtx x, rtx orig_x, enum machine_mode mode) +{ + if (!TARGET_ARM) + { + /* TODO: legitimize_address for Thumb2. */ + if (TARGET_THUMB2) + return x; + return thumb_legitimize_address (x, orig_x, mode); + } + + if (arm_tls_symbol_p (x)) + return legitimize_tls_address (x, NULL_RTX); + + if (GET_CODE (x) == PLUS) + { + rtx xop0 = XEXP (x, 0); + rtx xop1 = XEXP (x, 1); + + if (CONSTANT_P (xop0) && !symbol_mentioned_p (xop0)) + xop0 = force_reg (SImode, xop0); + + if (CONSTANT_P (xop1) && !symbol_mentioned_p (xop1)) + xop1 = force_reg (SImode, xop1); + + if (ARM_BASE_REGISTER_RTX_P (xop0) + && GET_CODE (xop1) == CONST_INT) + { + HOST_WIDE_INT n, low_n; + rtx base_reg, val; + n = INTVAL (xop1); + + /* VFP addressing modes actually allow greater offsets, but for + now we just stick with the lowest common denominator. */ + if (mode == DImode + || ((TARGET_SOFT_FLOAT || TARGET_VFP) && mode == DFmode)) + { + low_n = n & 0x0f; + n &= ~0x0f; + if (low_n > 4) + { + n += 16; + low_n -= 16; + } + } + else + { + low_n = ((mode) == TImode ? 0 + : n >= 0 ? (n & 0xfff) : -((-n) & 0xfff)); + n -= low_n; + } + + base_reg = gen_reg_rtx (SImode); + val = force_operand (plus_constant (xop0, n), NULL_RTX); + emit_move_insn (base_reg, val); + x = plus_constant (base_reg, low_n); + } + else if (xop0 != XEXP (x, 0) || xop1 != XEXP (x, 1)) + x = gen_rtx_PLUS (SImode, xop0, xop1); + } + + /* XXX We don't allow MINUS any more -- see comment in + arm_legitimate_address_outer_p (). */ + else if (GET_CODE (x) == MINUS) + { + rtx xop0 = XEXP (x, 0); + rtx xop1 = XEXP (x, 1); + + if (CONSTANT_P (xop0)) + xop0 = force_reg (SImode, xop0); + + if (CONSTANT_P (xop1) && ! symbol_mentioned_p (xop1)) + xop1 = force_reg (SImode, xop1); + + if (xop0 != XEXP (x, 0) || xop1 != XEXP (x, 1)) + x = gen_rtx_MINUS (SImode, xop0, xop1); + } + + /* Make sure to take full advantage of the pre-indexed addressing mode + with absolute addresses which often allows for the base register to + be factorized for multiple adjacent memory references, and it might + even allows for the mini pool to be avoided entirely. */ + else if (GET_CODE (x) == CONST_INT && optimize > 0) + { + unsigned int bits; + HOST_WIDE_INT mask, base, index; + rtx base_reg; + + /* ldr and ldrb can use a 12-bit index, ldrsb and the rest can only + use a 8-bit index. So let's use a 12-bit index for SImode only and + hope that arm_gen_constant will enable ldrb to use more bits. */ + bits = (mode == SImode) ? 12 : 8; + mask = (1 << bits) - 1; + base = INTVAL (x) & ~mask; + index = INTVAL (x) & mask; + if (bit_count (base & 0xffffffff) > (32 - bits)/2) + { + /* It'll most probably be more efficient to generate the base + with more bits set and use a negative index instead. */ + base |= mask; + index -= mask; + } + base_reg = force_reg (SImode, GEN_INT (base)); + x = plus_constant (base_reg, index); + } + + if (flag_pic) + { + /* We need to find and carefully transform any SYMBOL and LABEL + references; so go back to the original address expression. */ + rtx new_x = legitimize_pic_address (orig_x, mode, NULL_RTX); + + if (new_x != orig_x) + x = new_x; + } + + return x; +} + + +/* Try machine-dependent ways of modifying an illegitimate Thumb address + to be legitimate. If we find one, return the new, valid address. */ +rtx +thumb_legitimize_address (rtx x, rtx orig_x, enum machine_mode mode) +{ + if (arm_tls_symbol_p (x)) + return legitimize_tls_address (x, NULL_RTX); + + if (GET_CODE (x) == PLUS + && GET_CODE (XEXP (x, 1)) == CONST_INT + && (INTVAL (XEXP (x, 1)) >= 32 * GET_MODE_SIZE (mode) + || INTVAL (XEXP (x, 1)) < 0)) + { + rtx xop0 = XEXP (x, 0); + rtx xop1 = XEXP (x, 1); + HOST_WIDE_INT offset = INTVAL (xop1); + + /* Try and fold the offset into a biasing of the base register and + then offsetting that. Don't do this when optimizing for space + since it can cause too many CSEs. */ + if (optimize_size && offset >= 0 + && offset < 256 + 31 * GET_MODE_SIZE (mode)) + { + HOST_WIDE_INT delta; + + if (offset >= 256) + delta = offset - (256 - GET_MODE_SIZE (mode)); + else if (offset < 32 * GET_MODE_SIZE (mode) + 8) + delta = 31 * GET_MODE_SIZE (mode); + else + delta = offset & (~31 * GET_MODE_SIZE (mode)); + + xop0 = force_operand (plus_constant (xop0, offset - delta), + NULL_RTX); + x = plus_constant (xop0, delta); + } + else if (offset < 0 && offset > -256) + /* Small negative offsets are best done with a subtract before the + dereference, forcing these into a register normally takes two + instructions. */ + x = force_operand (x, NULL_RTX); + else + { + /* For the remaining cases, force the constant into a register. */ + xop1 = force_reg (SImode, xop1); + x = gen_rtx_PLUS (SImode, xop0, xop1); + } + } + else if (GET_CODE (x) == PLUS + && s_register_operand (XEXP (x, 1), SImode) + && !s_register_operand (XEXP (x, 0), SImode)) + { + rtx xop0 = force_operand (XEXP (x, 0), NULL_RTX); + + x = gen_rtx_PLUS (SImode, xop0, XEXP (x, 1)); + } + + if (flag_pic) + { + /* We need to find and carefully transform any SYMBOL and LABEL + references; so go back to the original address expression. */ + rtx new_x = legitimize_pic_address (orig_x, mode, NULL_RTX); + + if (new_x != orig_x) + x = new_x; + } + + return x; +} + +bool +arm_legitimize_reload_address (rtx *p, + enum machine_mode mode, + int opnum, int type, + int ind_levels ATTRIBUTE_UNUSED) +{ + if (GET_CODE (*p) == PLUS + && GET_CODE (XEXP (*p, 0)) == REG + && ARM_REGNO_OK_FOR_BASE_P (REGNO (XEXP (*p, 0))) + && GET_CODE (XEXP (*p, 1)) == CONST_INT) + { + HOST_WIDE_INT val = INTVAL (XEXP (*p, 1)); + HOST_WIDE_INT low, high; + + if (mode == DImode || (mode == DFmode && TARGET_SOFT_FLOAT)) + low = ((val & 0xf) ^ 0x8) - 0x8; + else if (TARGET_MAVERICK && TARGET_HARD_FLOAT) + /* Need to be careful, -256 is not a valid offset. */ + low = val >= 0 ? (val & 0xff) : -((-val) & 0xff); + else if (mode == SImode + || (mode == SFmode && TARGET_SOFT_FLOAT) + || ((mode == HImode || mode == QImode) && ! arm_arch4)) + /* Need to be careful, -4096 is not a valid offset. */ + low = val >= 0 ? (val & 0xfff) : -((-val) & 0xfff); + else if ((mode == HImode || mode == QImode) && arm_arch4) + /* Need to be careful, -256 is not a valid offset. */ + low = val >= 0 ? (val & 0xff) : -((-val) & 0xff); + else if (GET_MODE_CLASS (mode) == MODE_FLOAT + && TARGET_HARD_FLOAT && TARGET_FPA) + /* Need to be careful, -1024 is not a valid offset. */ + low = val >= 0 ? (val & 0x3ff) : -((-val) & 0x3ff); + else + return false; + + high = ((((val - low) & (unsigned HOST_WIDE_INT) 0xffffffff) + ^ (unsigned HOST_WIDE_INT) 0x80000000) + - (unsigned HOST_WIDE_INT) 0x80000000); + /* Check for overflow or zero */ + if (low == 0 || high == 0 || (high + low != val)) + return false; + + /* Reload the high part into a base reg; leave the low part + in the mem. */ + *p = gen_rtx_PLUS (GET_MODE (*p), + gen_rtx_PLUS (GET_MODE (*p), XEXP (*p, 0), + GEN_INT (high)), + GEN_INT (low)); + push_reload (XEXP (*p, 0), NULL_RTX, &XEXP (*p, 0), NULL, + MODE_BASE_REG_CLASS (mode), GET_MODE (*p), + VOIDmode, 0, 0, opnum, (enum reload_type) type); + return true; + } + + return false; +} + +rtx +thumb_legitimize_reload_address (rtx *x_p, + enum machine_mode mode, + int opnum, int type, + int ind_levels ATTRIBUTE_UNUSED) +{ + rtx x = *x_p; + + if (GET_CODE (x) == PLUS + && GET_MODE_SIZE (mode) < 4 + && REG_P (XEXP (x, 0)) + && XEXP (x, 0) == stack_pointer_rtx + && GET_CODE (XEXP (x, 1)) == CONST_INT + && !thumb_legitimate_offset_p (mode, INTVAL (XEXP (x, 1)))) + { + rtx orig_x = x; + + x = copy_rtx (x); + push_reload (orig_x, NULL_RTX, x_p, NULL, MODE_BASE_REG_CLASS (mode), + Pmode, VOIDmode, 0, 0, opnum, (enum reload_type) type); + return x; + } + + /* If both registers are hi-regs, then it's better to reload the + entire expression rather than each register individually. That + only requires one reload register rather than two. */ + if (GET_CODE (x) == PLUS + && REG_P (XEXP (x, 0)) + && REG_P (XEXP (x, 1)) + && !REG_MODE_OK_FOR_REG_BASE_P (XEXP (x, 0), mode) + && !REG_MODE_OK_FOR_REG_BASE_P (XEXP (x, 1), mode)) + { + rtx orig_x = x; + + x = copy_rtx (x); + push_reload (orig_x, NULL_RTX, x_p, NULL, MODE_BASE_REG_CLASS (mode), + Pmode, VOIDmode, 0, 0, opnum, (enum reload_type) type); + return x; + } + + return NULL; +} + +/* Test for various thread-local symbols. */ + +/* Return TRUE if X is a thread-local symbol. */ + +static bool +arm_tls_symbol_p (rtx x) +{ + if (! TARGET_HAVE_TLS) + return false; + + if (GET_CODE (x) != SYMBOL_REF) + return false; + + return SYMBOL_REF_TLS_MODEL (x) != 0; +} + +/* Helper for arm_tls_referenced_p. */ + +static int +arm_tls_operand_p_1 (rtx *x, void *data ATTRIBUTE_UNUSED) +{ + if (GET_CODE (*x) == SYMBOL_REF) + return SYMBOL_REF_TLS_MODEL (*x) != 0; + + /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are + TLS offsets, not real symbol references. */ + if (GET_CODE (*x) == UNSPEC + && XINT (*x, 1) == UNSPEC_TLS) + return -1; + + return 0; +} + +/* Return TRUE if X contains any TLS symbol references. */ + +bool +arm_tls_referenced_p (rtx x) +{ + if (! TARGET_HAVE_TLS) + return false; + + return for_each_rtx (&x, arm_tls_operand_p_1, NULL); +} + +/* Implement TARGET_CANNOT_FORCE_CONST_MEM. */ + +bool +arm_cannot_force_const_mem (rtx x) +{ + rtx base, offset; + + if (ARM_OFFSETS_MUST_BE_WITHIN_SECTIONS_P) + { + split_const (x, &base, &offset); + if (GET_CODE (base) == SYMBOL_REF + && !offset_within_block_p (base, INTVAL (offset))) + return true; + } + return arm_tls_referenced_p (x); +} + +#define REG_OR_SUBREG_REG(X) \ + (GET_CODE (X) == REG \ + || (GET_CODE (X) == SUBREG && GET_CODE (SUBREG_REG (X)) == REG)) + +#define REG_OR_SUBREG_RTX(X) \ + (GET_CODE (X) == REG ? (X) : SUBREG_REG (X)) + +static inline int +thumb1_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer) +{ + enum machine_mode mode = GET_MODE (x); + int total; + + switch (code) + { + case ASHIFT: + case ASHIFTRT: + case LSHIFTRT: + case ROTATERT: + case PLUS: + case MINUS: + case COMPARE: + case NEG: + case NOT: + return COSTS_N_INSNS (1); + + case MULT: + if (GET_CODE (XEXP (x, 1)) == CONST_INT) + { + int cycles = 0; + unsigned HOST_WIDE_INT i = INTVAL (XEXP (x, 1)); + + while (i) + { + i >>= 2; + cycles++; + } + return COSTS_N_INSNS (2) + cycles; + } + return COSTS_N_INSNS (1) + 16; + + case SET: + return (COSTS_N_INSNS (1) + + 4 * ((GET_CODE (SET_SRC (x)) == MEM) + + GET_CODE (SET_DEST (x)) == MEM)); + + case CONST_INT: + if (outer == SET) + { + if ((unsigned HOST_WIDE_INT) INTVAL (x) < 256) + return 0; + if (thumb_shiftable_const (INTVAL (x))) + return COSTS_N_INSNS (2); + return COSTS_N_INSNS (3); + } + else if ((outer == PLUS || outer == COMPARE) + && INTVAL (x) < 256 && INTVAL (x) > -256) + return 0; + else if ((outer == IOR || outer == XOR || outer == AND) + && INTVAL (x) < 256 && INTVAL (x) >= -256) + return COSTS_N_INSNS (1); + else if (outer == AND) + { + int i; + /* This duplicates the tests in the andsi3 expander. */ + for (i = 9; i <= 31; i++) + if ((((HOST_WIDE_INT) 1) << i) - 1 == INTVAL (x) + || (((HOST_WIDE_INT) 1) << i) - 1 == ~INTVAL (x)) + return COSTS_N_INSNS (2); + } + else if (outer == ASHIFT || outer == ASHIFTRT + || outer == LSHIFTRT) + return 0; + return COSTS_N_INSNS (2); + + case CONST: + case CONST_DOUBLE: + case LABEL_REF: + case SYMBOL_REF: + return COSTS_N_INSNS (3); + + case UDIV: + case UMOD: + case DIV: + case MOD: + return 100; + + case TRUNCATE: + return 99; + + case AND: + case XOR: + case IOR: + /* XXX guess. */ + return 8; + + case MEM: + /* XXX another guess. */ + /* Memory costs quite a lot for the first word, but subsequent words + load at the equivalent of a single insn each. */ + return (10 + 4 * ((GET_MODE_SIZE (mode) - 1) / UNITS_PER_WORD) + + ((GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)) + ? 4 : 0)); + + case IF_THEN_ELSE: + /* XXX a guess. */ + if (GET_CODE (XEXP (x, 1)) == PC || GET_CODE (XEXP (x, 2)) == PC) + return 14; + return 2; + + case SIGN_EXTEND: + case ZERO_EXTEND: + total = mode == DImode ? COSTS_N_INSNS (1) : 0; + total += thumb1_rtx_costs (XEXP (x, 0), GET_CODE (XEXP (x, 0)), code); + + if (mode == SImode) + return total; + + if (arm_arch6) + return total + COSTS_N_INSNS (1); + + /* Assume a two-shift sequence. Increase the cost slightly so + we prefer actual shifts over an extend operation. */ + return total + 1 + COSTS_N_INSNS (2); + + default: + return 99; + } +} + +static inline bool +arm_rtx_costs_1 (rtx x, enum rtx_code outer, int* total, bool speed) +{ + enum machine_mode mode = GET_MODE (x); + enum rtx_code subcode; + rtx operand; + enum rtx_code code = GET_CODE (x); + *total = 0; + + switch (code) + { + case MEM: + /* Memory costs quite a lot for the first word, but subsequent words + load at the equivalent of a single insn each. */ + *total = COSTS_N_INSNS (2 + ARM_NUM_REGS (mode)); + return true; + + case DIV: + case MOD: + case UDIV: + case UMOD: + if (TARGET_HARD_FLOAT && mode == SFmode) + *total = COSTS_N_INSNS (2); + else if (TARGET_HARD_FLOAT && mode == DFmode && !TARGET_VFP_SINGLE) + *total = COSTS_N_INSNS (4); + else + *total = COSTS_N_INSNS (20); + return false; + + case ROTATE: + if (GET_CODE (XEXP (x, 1)) == REG) + *total = COSTS_N_INSNS (1); /* Need to subtract from 32 */ + else if (GET_CODE (XEXP (x, 1)) != CONST_INT) + *total = rtx_cost (XEXP (x, 1), code, speed); + + /* Fall through */ + case ROTATERT: + if (mode != SImode) + { + *total += COSTS_N_INSNS (4); + return true; + } + + /* Fall through */ + case ASHIFT: case LSHIFTRT: case ASHIFTRT: + *total += rtx_cost (XEXP (x, 0), code, speed); + if (mode == DImode) + { + *total += COSTS_N_INSNS (3); + return true; + } + + *total += COSTS_N_INSNS (1); + /* Increase the cost of complex shifts because they aren't any faster, + and reduce dual issue opportunities. */ + if (arm_tune_cortex_a9 + && outer != SET && GET_CODE (XEXP (x, 1)) != CONST_INT) + ++*total; + + return true; + + case MINUS: + if (mode == DImode) + { + *total = COSTS_N_INSNS (ARM_NUM_REGS (mode)); + if (GET_CODE (XEXP (x, 0)) == CONST_INT + && const_ok_for_arm (INTVAL (XEXP (x, 0)))) + { + *total += rtx_cost (XEXP (x, 1), code, speed); + return true; + } + + if (GET_CODE (XEXP (x, 1)) == CONST_INT + && const_ok_for_arm (INTVAL (XEXP (x, 1)))) + { + *total += rtx_cost (XEXP (x, 0), code, speed); + return true; + } + + return false; + } + + if (GET_MODE_CLASS (mode) == MODE_FLOAT) + { + if (TARGET_HARD_FLOAT + && (mode == SFmode + || (mode == DFmode && !TARGET_VFP_SINGLE))) + { + *total = COSTS_N_INSNS (1); + if (GET_CODE (XEXP (x, 0)) == CONST_DOUBLE + && arm_const_double_rtx (XEXP (x, 0))) + { + *total += rtx_cost (XEXP (x, 1), code, speed); + return true; + } + + if (GET_CODE (XEXP (x, 1)) == CONST_DOUBLE + && arm_const_double_rtx (XEXP (x, 1))) + { + *total += rtx_cost (XEXP (x, 0), code, speed); + return true; + } + + return false; + } + *total = COSTS_N_INSNS (20); + return false; + } + + *total = COSTS_N_INSNS (1); + if (GET_CODE (XEXP (x, 0)) == CONST_INT + && const_ok_for_arm (INTVAL (XEXP (x, 0)))) + { + *total += rtx_cost (XEXP (x, 1), code, speed); + return true; + } + + subcode = GET_CODE (XEXP (x, 1)); + if (subcode == ASHIFT || subcode == ASHIFTRT + || subcode == LSHIFTRT + || subcode == ROTATE || subcode == ROTATERT) + { + *total += rtx_cost (XEXP (x, 0), code, speed); + *total += rtx_cost (XEXP (XEXP (x, 1), 0), subcode, speed); + return true; + } + + /* A shift as a part of RSB costs no more than RSB itself. */ + if (GET_CODE (XEXP (x, 0)) == MULT + && power_of_two_operand (XEXP (XEXP (x, 0), 1), SImode)) + { + *total += rtx_cost (XEXP (XEXP (x, 0), 0), code, speed); + *total += rtx_cost (XEXP (x, 1), code, speed); + return true; + } + + if (subcode == MULT + && power_of_two_operand (XEXP (XEXP (x, 1), 1), SImode)) + { + *total += rtx_cost (XEXP (x, 0), code, speed); + *total += rtx_cost (XEXP (XEXP (x, 1), 0), subcode, speed); + return true; + } + + if (GET_RTX_CLASS (GET_CODE (XEXP (x, 1))) == RTX_COMPARE + || GET_RTX_CLASS (GET_CODE (XEXP (x, 1))) == RTX_COMM_COMPARE) + { + *total = COSTS_N_INSNS (1) + rtx_cost (XEXP (x, 0), code, speed); + if (GET_CODE (XEXP (XEXP (x, 1), 0)) == REG + && REGNO (XEXP (XEXP (x, 1), 0)) != CC_REGNUM) + *total += COSTS_N_INSNS (1); + + return true; + } + + /* Fall through */ + + case PLUS: + if (code == PLUS && arm_arch6 && mode == SImode + && (GET_CODE (XEXP (x, 0)) == ZERO_EXTEND + || GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)) + { + *total = COSTS_N_INSNS (1); + *total += rtx_cost (XEXP (XEXP (x, 0), 0), GET_CODE (XEXP (x, 0)), + speed); + *total += rtx_cost (XEXP (x, 1), code, speed); + return true; + } + + /* MLA: All arguments must be registers. We filter out + multiplication by a power of two, so that we fall down into + the code below. */ + if (GET_CODE (XEXP (x, 0)) == MULT + && !power_of_two_operand (XEXP (XEXP (x, 0), 1), SImode)) + { + /* The cost comes from the cost of the multiply. */ + return false; + } + + if (GET_MODE_CLASS (mode) == MODE_FLOAT) + { + if (TARGET_HARD_FLOAT + && (mode == SFmode + || (mode == DFmode && !TARGET_VFP_SINGLE))) + { + *total = COSTS_N_INSNS (1); + if (GET_CODE (XEXP (x, 1)) == CONST_DOUBLE + && arm_const_double_rtx (XEXP (x, 1))) + { + *total += rtx_cost (XEXP (x, 0), code, speed); + return true; + } + + return false; + } + + *total = COSTS_N_INSNS (20); + return false; + } + + if (GET_RTX_CLASS (GET_CODE (XEXP (x, 0))) == RTX_COMPARE + || GET_RTX_CLASS (GET_CODE (XEXP (x, 0))) == RTX_COMM_COMPARE) + { + *total = COSTS_N_INSNS (1) + rtx_cost (XEXP (x, 1), code, speed); + if (GET_CODE (XEXP (XEXP (x, 0), 0)) == REG + && REGNO (XEXP (XEXP (x, 0), 0)) != CC_REGNUM) + *total += COSTS_N_INSNS (1); + return true; + } + + /* Fall through */ + + case AND: case XOR: case IOR: + + /* Normally the frame registers will be spilt into reg+const during + reload, so it is a bad idea to combine them with other instructions, + since then they might not be moved outside of loops. As a compromise + we allow integration with ops that have a constant as their second + operand. */ + if (REG_OR_SUBREG_REG (XEXP (x, 0)) + && ARM_FRAME_RTX (REG_OR_SUBREG_RTX (XEXP (x, 0))) + && GET_CODE (XEXP (x, 1)) != CONST_INT) + *total = COSTS_N_INSNS (1); + + if (mode == DImode) + { + *total += COSTS_N_INSNS (2); + if (GET_CODE (XEXP (x, 1)) == CONST_INT + && const_ok_for_op (INTVAL (XEXP (x, 1)), code)) + { + *total += rtx_cost (XEXP (x, 0), code, speed); + return true; + } + + return false; + } + + *total += COSTS_N_INSNS (1); + if (GET_CODE (XEXP (x, 1)) == CONST_INT + && const_ok_for_op (INTVAL (XEXP (x, 1)), code)) + { + *total += rtx_cost (XEXP (x, 0), code, speed); + return true; + } + subcode = GET_CODE (XEXP (x, 0)); + if (subcode == ASHIFT || subcode == ASHIFTRT + || subcode == LSHIFTRT + || subcode == ROTATE || subcode == ROTATERT) + { + *total += rtx_cost (XEXP (x, 1), code, speed); + *total += rtx_cost (XEXP (XEXP (x, 0), 0), subcode, speed); + return true; + } + + if (subcode == MULT + && power_of_two_operand (XEXP (XEXP (x, 0), 1), SImode)) + { + *total += rtx_cost (XEXP (x, 1), code, speed); + *total += rtx_cost (XEXP (XEXP (x, 0), 0), subcode, speed); + return true; + } + + if (subcode == UMIN || subcode == UMAX + || subcode == SMIN || subcode == SMAX) + { + *total = COSTS_N_INSNS (3); + return true; + } + + return false; + + case MULT: + /* This should have been handled by the CPU specific routines. */ + gcc_unreachable (); + + case TRUNCATE: + if (arm_arch3m && mode == SImode + && GET_CODE (XEXP (x, 0)) == LSHIFTRT + && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT + && (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) + == GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1))) + && (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND + || GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND)) + { + *total = rtx_cost (XEXP (XEXP (x, 0), 0), LSHIFTRT, speed); + return true; + } + *total = COSTS_N_INSNS (2); /* Plus the cost of the MULT */ + return false; + + case NEG: + if (GET_MODE_CLASS (mode) == MODE_FLOAT) + { + if (TARGET_HARD_FLOAT + && (mode == SFmode + || (mode == DFmode && !TARGET_VFP_SINGLE))) + { + *total = COSTS_N_INSNS (1); + return false; + } + *total = COSTS_N_INSNS (2); + return false; + } + + /* Fall through */ + case NOT: + *total = COSTS_N_INSNS (ARM_NUM_REGS(mode)); + if (mode == SImode && code == NOT) + { + subcode = GET_CODE (XEXP (x, 0)); + if (subcode == ASHIFT || subcode == ASHIFTRT + || subcode == LSHIFTRT + || subcode == ROTATE || subcode == ROTATERT + || (subcode == MULT + && power_of_two_operand (XEXP (XEXP (x, 0), 1), SImode))) + { + *total += rtx_cost (XEXP (XEXP (x, 0), 0), subcode, speed); + /* Register shifts cost an extra cycle. */ + if (GET_CODE (XEXP (XEXP (x, 0), 1)) != CONST_INT) + *total += COSTS_N_INSNS (1) + rtx_cost (XEXP (XEXP (x, 0), 1), + subcode, speed); + return true; + } + } + + return false; + + case IF_THEN_ELSE: + if (GET_CODE (XEXP (x, 1)) == PC || GET_CODE (XEXP (x, 2)) == PC) + { + *total = COSTS_N_INSNS (4); + return true; + } + + operand = XEXP (x, 0); + + if (!((GET_RTX_CLASS (GET_CODE (operand)) == RTX_COMPARE + || GET_RTX_CLASS (GET_CODE (operand)) == RTX_COMM_COMPARE) + && GET_CODE (XEXP (operand, 0)) == REG + && REGNO (XEXP (operand, 0)) == CC_REGNUM)) + *total += COSTS_N_INSNS (1); + *total += (rtx_cost (XEXP (x, 1), code, speed) + + rtx_cost (XEXP (x, 2), code, speed)); + return true; + + case NE: + if (mode == SImode && XEXP (x, 1) == const0_rtx) + { + *total = COSTS_N_INSNS (2) + rtx_cost (XEXP (x, 0), code, speed); + return true; + } + goto scc_insn; + + case GE: + if ((GET_CODE (XEXP (x, 0)) != REG || REGNO (XEXP (x, 0)) != CC_REGNUM) + && mode == SImode && XEXP (x, 1) == const0_rtx) + { + *total = COSTS_N_INSNS (2) + rtx_cost (XEXP (x, 0), code, speed); + return true; + } + goto scc_insn; + + case LT: + if ((GET_CODE (XEXP (x, 0)) != REG || REGNO (XEXP (x, 0)) != CC_REGNUM) + && mode == SImode && XEXP (x, 1) == const0_rtx) + { + *total = COSTS_N_INSNS (1) + rtx_cost (XEXP (x, 0), code, speed); + return true; + } + goto scc_insn; + + case EQ: + case GT: + case LE: + case GEU: + case LTU: + case GTU: + case LEU: + case UNORDERED: + case ORDERED: + case UNEQ: + case UNGE: + case UNLT: + case UNGT: + case UNLE: + scc_insn: + /* SCC insns. In the case where the comparison has already been + performed, then they cost 2 instructions. Otherwise they need + an additional comparison before them. */ + *total = COSTS_N_INSNS (2); + if (GET_CODE (XEXP (x, 0)) == REG && REGNO (XEXP (x, 0)) == CC_REGNUM) + { + return true; + } + + /* Fall through */ + case COMPARE: + if (GET_CODE (XEXP (x, 0)) == REG && REGNO (XEXP (x, 0)) == CC_REGNUM) + { + *total = 0; + return true; + } + + *total += COSTS_N_INSNS (1); + if (GET_CODE (XEXP (x, 1)) == CONST_INT + && const_ok_for_op (INTVAL (XEXP (x, 1)), code)) + { + *total += rtx_cost (XEXP (x, 0), code, speed); + return true; + } + + subcode = GET_CODE (XEXP (x, 0)); + if (subcode == ASHIFT || subcode == ASHIFTRT + || subcode == LSHIFTRT + || subcode == ROTATE || subcode == ROTATERT) + { + *total += rtx_cost (XEXP (x, 1), code, speed); + *total += rtx_cost (XEXP (XEXP (x, 0), 0), subcode, speed); + return true; + } + + if (subcode == MULT + && power_of_two_operand (XEXP (XEXP (x, 0), 1), SImode)) + { + *total += rtx_cost (XEXP (x, 1), code, speed); + *total += rtx_cost (XEXP (XEXP (x, 0), 0), subcode, speed); + return true; + } + + return false; + + case UMIN: + case UMAX: + case SMIN: + case SMAX: + *total = COSTS_N_INSNS (2) + rtx_cost (XEXP (x, 0), code, speed); + if (GET_CODE (XEXP (x, 1)) != CONST_INT + || !const_ok_for_arm (INTVAL (XEXP (x, 1)))) + *total += rtx_cost (XEXP (x, 1), code, speed); + return true; + + case ABS: + if (GET_MODE_CLASS (mode) == MODE_FLOAT) + { + if (TARGET_HARD_FLOAT + && (mode == SFmode + || (mode == DFmode && !TARGET_VFP_SINGLE))) + { + *total = COSTS_N_INSNS (1); + return false; + } + *total = COSTS_N_INSNS (20); + return false; + } + *total = COSTS_N_INSNS (1); + if (mode == DImode) + *total += COSTS_N_INSNS (3); + return false; + + case SIGN_EXTEND: + case ZERO_EXTEND: + *total = 0; + if (GET_MODE_CLASS (mode) == MODE_INT) + { + rtx op = XEXP (x, 0); + enum machine_mode opmode = GET_MODE (op); + + if (mode == DImode) + *total += COSTS_N_INSNS (1); + + if (opmode != SImode) + { + if (MEM_P (op)) + { + /* If !arm_arch4, we use one of the extendhisi2_mem + or movhi_bytes patterns for HImode. For a QImode + sign extension, we first zero-extend from memory + and then perform a shift sequence. */ + if (!arm_arch4 && (opmode != QImode || code == SIGN_EXTEND)) + *total += COSTS_N_INSNS (2); + } + else if (arm_arch6) + *total += COSTS_N_INSNS (1); + + /* We don't have the necessary insn, so we need to perform some + other operation. */ + else if (TARGET_ARM && code == ZERO_EXTEND && mode == QImode) + /* An and with constant 255. */ + *total += COSTS_N_INSNS (1); + else + /* A shift sequence. Increase costs slightly to avoid + combining two shifts into an extend operation. */ + *total += COSTS_N_INSNS (2) + 1; + } + + return false; + } + + switch (GET_MODE (XEXP (x, 0))) + { + case V8QImode: + case V4HImode: + case V2SImode: + case V4QImode: + case V2HImode: + *total = COSTS_N_INSNS (1); + return false; + + default: + gcc_unreachable (); + } + gcc_unreachable (); + + case ZERO_EXTRACT: + case SIGN_EXTRACT: + *total = COSTS_N_INSNS (1) + rtx_cost (XEXP (x, 0), code, speed); + return true; + + case CONST_INT: + if (const_ok_for_arm (INTVAL (x)) + || const_ok_for_arm (~INTVAL (x))) + *total = COSTS_N_INSNS (1); + else + *total = COSTS_N_INSNS (arm_gen_constant (SET, mode, NULL_RTX, + INTVAL (x), NULL_RTX, + NULL_RTX, 0, 0)); + return true; + + case CONST: + case LABEL_REF: + case SYMBOL_REF: + *total = COSTS_N_INSNS (3); + return true; + + case HIGH: + *total = COSTS_N_INSNS (1); + return true; + + case LO_SUM: + *total = COSTS_N_INSNS (1); + *total += rtx_cost (XEXP (x, 0), code, speed); + return true; + + case CONST_DOUBLE: + if (TARGET_HARD_FLOAT && vfp3_const_double_rtx (x) + && (mode == SFmode || !TARGET_VFP_SINGLE)) + *total = COSTS_N_INSNS (1); + else + *total = COSTS_N_INSNS (4); + return true; + + case UNSPEC: + /* We cost this as high as our memory costs to allow this to + be hoisted from loops. */ + if (XINT (x, 1) == UNSPEC_PIC_UNIFIED) + { + *total = COSTS_N_INSNS (2 + ARM_NUM_REGS (mode)); + } + return true; + + default: + *total = COSTS_N_INSNS (4); + return false; + } +} + +/* Estimates the size cost of thumb1 instructions. + For now most of the code is copied from thumb1_rtx_costs. We need more + fine grain tuning when we have more related test cases. */ +static inline int +thumb1_size_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer) +{ + enum machine_mode mode = GET_MODE (x); + + switch (code) + { + case ASHIFT: + case ASHIFTRT: + case LSHIFTRT: + case ROTATERT: + case PLUS: + case MINUS: + case COMPARE: + case NEG: + case NOT: + return COSTS_N_INSNS (1); + + case MULT: + if (GET_CODE (XEXP (x, 1)) == CONST_INT) + { + /* Thumb1 mul instruction can't operate on const. We must Load it + into a register first. */ + int const_size = thumb1_size_rtx_costs (XEXP (x, 1), CONST_INT, SET); + return COSTS_N_INSNS (1) + const_size; + } + return COSTS_N_INSNS (1); + + case SET: + return (COSTS_N_INSNS (1) + + 4 * ((GET_CODE (SET_SRC (x)) == MEM) + + GET_CODE (SET_DEST (x)) == MEM)); + + case CONST_INT: + if (outer == SET) + { + if ((unsigned HOST_WIDE_INT) INTVAL (x) < 256) + return COSTS_N_INSNS (1); + /* See split "TARGET_THUMB1 && satisfies_constraint_J". */ + if (INTVAL (x) >= -255 && INTVAL (x) <= -1) + return COSTS_N_INSNS (2); + /* See split "TARGET_THUMB1 && satisfies_constraint_K". */ + if (thumb_shiftable_const (INTVAL (x))) + return COSTS_N_INSNS (2); + return COSTS_N_INSNS (3); + } + else if ((outer == PLUS || outer == COMPARE) + && INTVAL (x) < 256 && INTVAL (x) > -256) + return 0; + else if ((outer == IOR || outer == XOR || outer == AND) + && INTVAL (x) < 256 && INTVAL (x) >= -256) + return COSTS_N_INSNS (1); + else if (outer == AND) + { + int i; + /* This duplicates the tests in the andsi3 expander. */ + for (i = 9; i <= 31; i++) + if ((((HOST_WIDE_INT) 1) << i) - 1 == INTVAL (x) + || (((HOST_WIDE_INT) 1) << i) - 1 == ~INTVAL (x)) + return COSTS_N_INSNS (2); + } + else if (outer == ASHIFT || outer == ASHIFTRT + || outer == LSHIFTRT) + return 0; + return COSTS_N_INSNS (2); + + case CONST: + case CONST_DOUBLE: + case LABEL_REF: + case SYMBOL_REF: + return COSTS_N_INSNS (3); + + case UDIV: + case UMOD: + case DIV: + case MOD: + return 100; + + case TRUNCATE: + return 99; + + case AND: + case XOR: + case IOR: + /* XXX guess. */ + return 8; + + case MEM: + /* XXX another guess. */ + /* Memory costs quite a lot for the first word, but subsequent words + load at the equivalent of a single insn each. */ + return (10 + 4 * ((GET_MODE_SIZE (mode) - 1) / UNITS_PER_WORD) + + ((GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)) + ? 4 : 0)); + + case IF_THEN_ELSE: + /* XXX a guess. */ + if (GET_CODE (XEXP (x, 1)) == PC || GET_CODE (XEXP (x, 2)) == PC) + return 14; + return 2; + + case ZERO_EXTEND: + /* XXX still guessing. */ + switch (GET_MODE (XEXP (x, 0))) + { + case QImode: + return (1 + (mode == DImode ? 4 : 0) + + (GET_CODE (XEXP (x, 0)) == MEM ? 10 : 0)); + + case HImode: + return (4 + (mode == DImode ? 4 : 0) + + (GET_CODE (XEXP (x, 0)) == MEM ? 10 : 0)); + + case SImode: + return (1 + (GET_CODE (XEXP (x, 0)) == MEM ? 10 : 0)); + + default: + return 99; + } + + default: + return 99; + } +} + +/* RTX costs when optimizing for size. */ +static bool +arm_size_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer_code, + int *total) +{ + enum machine_mode mode = GET_MODE (x); + if (TARGET_THUMB1) + { + *total = thumb1_size_rtx_costs (x, code, outer_code); + return true; + } + + /* FIXME: This makes no attempt to prefer narrow Thumb-2 instructions. */ + switch (code) + { + case MEM: + /* A memory access costs 1 insn if the mode is small, or the address is + a single register, otherwise it costs one insn per word. */ + if (REG_P (XEXP (x, 0))) + *total = COSTS_N_INSNS (1); + else if (flag_pic + && GET_CODE (XEXP (x, 0)) == PLUS + && will_be_in_index_register (XEXP (XEXP (x, 0), 1))) + /* This will be split into two instructions. + See arm.md:calculate_pic_address. */ + *total = COSTS_N_INSNS (2); + else + *total = COSTS_N_INSNS (ARM_NUM_REGS (mode)); + return true; + + case DIV: + case MOD: + case UDIV: + case UMOD: + /* Needs a libcall, so it costs about this. */ + *total = COSTS_N_INSNS (2); + return false; + + case ROTATE: + if (mode == SImode && GET_CODE (XEXP (x, 1)) == REG) + { + *total = COSTS_N_INSNS (2) + rtx_cost (XEXP (x, 0), code, false); + return true; + } + /* Fall through */ + case ROTATERT: + case ASHIFT: + case LSHIFTRT: + case ASHIFTRT: + if (mode == DImode && GET_CODE (XEXP (x, 1)) == CONST_INT) + { + *total = COSTS_N_INSNS (3) + rtx_cost (XEXP (x, 0), code, false); + return true; + } + else if (mode == SImode) + { + *total = COSTS_N_INSNS (1) + rtx_cost (XEXP (x, 0), code, false); + /* Slightly disparage register shifts, but not by much. */ + if (GET_CODE (XEXP (x, 1)) != CONST_INT) + *total += 1 + rtx_cost (XEXP (x, 1), code, false); + return true; + } + + /* Needs a libcall. */ + *total = COSTS_N_INSNS (2); + return false; + + case MINUS: + if (TARGET_HARD_FLOAT && GET_MODE_CLASS (mode) == MODE_FLOAT + && (mode == SFmode || !TARGET_VFP_SINGLE)) + { + *total = COSTS_N_INSNS (1); + return false; + } + + if (mode == SImode) + { + enum rtx_code subcode0 = GET_CODE (XEXP (x, 0)); + enum rtx_code subcode1 = GET_CODE (XEXP (x, 1)); + + if (subcode0 == ROTATE || subcode0 == ROTATERT || subcode0 == ASHIFT + || subcode0 == LSHIFTRT || subcode0 == ASHIFTRT + || subcode1 == ROTATE || subcode1 == ROTATERT + || subcode1 == ASHIFT || subcode1 == LSHIFTRT + || subcode1 == ASHIFTRT) + { + /* It's just the cost of the two operands. */ + *total = 0; + return false; + } + + *total = COSTS_N_INSNS (1); + return false; + } + + *total = COSTS_N_INSNS (ARM_NUM_REGS (mode)); + return false; + + case PLUS: + if (TARGET_HARD_FLOAT && GET_MODE_CLASS (mode) == MODE_FLOAT + && (mode == SFmode || !TARGET_VFP_SINGLE)) + { + *total = COSTS_N_INSNS (1); + return false; + } + + /* A shift as a part of ADD costs nothing. */ + if (GET_CODE (XEXP (x, 0)) == MULT + && power_of_two_operand (XEXP (XEXP (x, 0), 1), SImode)) + { + *total = COSTS_N_INSNS (TARGET_THUMB2 ? 2 : 1); + *total += rtx_cost (XEXP (XEXP (x, 0), 0), code, false); + *total += rtx_cost (XEXP (x, 1), code, false); + return true; + } + + /* Fall through */ + case AND: case XOR: case IOR: + if (mode == SImode) + { + enum rtx_code subcode = GET_CODE (XEXP (x, 0)); + + if (subcode == ROTATE || subcode == ROTATERT || subcode == ASHIFT + || subcode == LSHIFTRT || subcode == ASHIFTRT + || (code == AND && subcode == NOT)) + { + /* It's just the cost of the two operands. */ + *total = 0; + return false; + } + } + + *total = COSTS_N_INSNS (ARM_NUM_REGS (mode)); + return false; + + case MULT: + *total = COSTS_N_INSNS (ARM_NUM_REGS (mode)); + return false; + + case NEG: + if (TARGET_HARD_FLOAT && GET_MODE_CLASS (mode) == MODE_FLOAT + && (mode == SFmode || !TARGET_VFP_SINGLE)) + { + *total = COSTS_N_INSNS (1); + return false; + } + + /* Fall through */ + case NOT: + *total = COSTS_N_INSNS (ARM_NUM_REGS (mode)); + + return false; + + case IF_THEN_ELSE: + *total = 0; + return false; + + case COMPARE: + if (cc_register (XEXP (x, 0), VOIDmode)) + * total = 0; + else + *total = COSTS_N_INSNS (1); + return false; + + case ABS: + if (TARGET_HARD_FLOAT && GET_MODE_CLASS (mode) == MODE_FLOAT + && (mode == SFmode || !TARGET_VFP_SINGLE)) + *total = COSTS_N_INSNS (1); + else + *total = COSTS_N_INSNS (1 + ARM_NUM_REGS (mode)); + return false; + + case SIGN_EXTEND: + case ZERO_EXTEND: + return arm_rtx_costs_1 (x, outer_code, total, 0); + + case CONST_INT: + if (const_ok_for_arm (INTVAL (x))) + /* A multiplication by a constant requires another instruction + to load the constant to a register. */ + *total = COSTS_N_INSNS ((outer_code == SET || outer_code == MULT) + ? 1 : 0); + else if (const_ok_for_arm (~INTVAL (x))) + *total = COSTS_N_INSNS (outer_code == AND ? 0 : 1); + else if (const_ok_for_arm (-INTVAL (x))) + { + if (outer_code == COMPARE || outer_code == PLUS + || outer_code == MINUS) + *total = 0; + else + *total = COSTS_N_INSNS (1); + } + else + *total = COSTS_N_INSNS (2); + return true; + + case CONST: + case LABEL_REF: + case SYMBOL_REF: + *total = COSTS_N_INSNS (2); + return true; + + case CONST_DOUBLE: + *total = COSTS_N_INSNS (4); + return true; + + case HIGH: + case LO_SUM: + /* We prefer constant pool entries to MOVW/MOVT pairs, so bump the + cost of these slightly. */ + *total = COSTS_N_INSNS (1) + 1; + return true; + + default: + if (mode != VOIDmode) + *total = COSTS_N_INSNS (ARM_NUM_REGS (mode)); + else + *total = COSTS_N_INSNS (4); /* How knows? */ + return false; + } +} + +/* RTX costs when optimizing for size. */ +static bool +arm_rtx_costs (rtx x, int code, int outer_code, int *total, + bool speed) +{ + if (!speed) + return arm_size_rtx_costs (x, (enum rtx_code) code, + (enum rtx_code) outer_code, total); + else + return current_tune->rtx_costs (x, (enum rtx_code) code, + (enum rtx_code) outer_code, + total, speed); +} + +/* RTX costs for cores with a slow MUL implementation. Thumb-2 is not + supported on any "slowmul" cores, so it can be ignored. */ + +static bool +arm_slowmul_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer_code, + int *total, bool speed) +{ + enum machine_mode mode = GET_MODE (x); + + if (TARGET_THUMB) + { + *total = thumb1_rtx_costs (x, code, outer_code); + return true; + } + + switch (code) + { + case MULT: + if (GET_MODE_CLASS (mode) == MODE_FLOAT + || mode == DImode) + { + *total = COSTS_N_INSNS (20); + return false; + } + + if (GET_CODE (XEXP (x, 1)) == CONST_INT) + { + unsigned HOST_WIDE_INT i = (INTVAL (XEXP (x, 1)) + & (unsigned HOST_WIDE_INT) 0xffffffff); + int cost, const_ok = const_ok_for_arm (i); + int j, booth_unit_size; + + /* Tune as appropriate. */ + cost = const_ok ? 4 : 8; + booth_unit_size = 2; + for (j = 0; i && j < 32; j += booth_unit_size) + { + i >>= booth_unit_size; + cost++; + } + + *total = COSTS_N_INSNS (cost); + *total += rtx_cost (XEXP (x, 0), code, speed); + return true; + } + + *total = COSTS_N_INSNS (20); + return false; + + default: + return arm_rtx_costs_1 (x, outer_code, total, speed);; + } +} + + +/* RTX cost for cores with a fast multiply unit (M variants). */ + +static bool +arm_fastmul_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer_code, + int *total, bool speed) +{ + enum machine_mode mode = GET_MODE (x); + + if (TARGET_THUMB1) + { + *total = thumb1_rtx_costs (x, code, outer_code); + return true; + } + + /* ??? should thumb2 use different costs? */ + switch (code) + { + case MULT: + /* There is no point basing this on the tuning, since it is always the + fast variant if it exists at all. */ + if (mode == DImode + && (GET_CODE (XEXP (x, 0)) == GET_CODE (XEXP (x, 1))) + && (GET_CODE (XEXP (x, 0)) == ZERO_EXTEND + || GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)) + { + *total = COSTS_N_INSNS(2); + return false; + } + + + if (mode == DImode) + { + *total = COSTS_N_INSNS (5); + return false; + } + + if (GET_CODE (XEXP (x, 1)) == CONST_INT) + { + unsigned HOST_WIDE_INT i = (INTVAL (XEXP (x, 1)) + & (unsigned HOST_WIDE_INT) 0xffffffff); + int cost, const_ok = const_ok_for_arm (i); + int j, booth_unit_size; + + /* Tune as appropriate. */ + cost = const_ok ? 4 : 8; + booth_unit_size = 8; + for (j = 0; i && j < 32; j += booth_unit_size) + { + i >>= booth_unit_size; + cost++; + } + + *total = COSTS_N_INSNS(cost); + return false; + } + + if (mode == SImode) + { + *total = COSTS_N_INSNS (4); + return false; + } + + if (GET_MODE_CLASS (mode) == MODE_FLOAT) + { + if (TARGET_HARD_FLOAT + && (mode == SFmode + || (mode == DFmode && !TARGET_VFP_SINGLE))) + { + *total = COSTS_N_INSNS (1); + return false; + } + } + + /* Requires a lib call */ + *total = COSTS_N_INSNS (20); + return false; + + default: + return arm_rtx_costs_1 (x, outer_code, total, speed); + } +} + + +/* RTX cost for XScale CPUs. Thumb-2 is not supported on any xscale cores, + so it can be ignored. */ + +static bool +arm_xscale_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer_code, + int *total, bool speed) +{ + enum machine_mode mode = GET_MODE (x); + + if (TARGET_THUMB) + { + *total = thumb1_rtx_costs (x, code, outer_code); + return true; + } + + switch (code) + { + case COMPARE: + if (GET_CODE (XEXP (x, 0)) != MULT) + return arm_rtx_costs_1 (x, outer_code, total, speed); + + /* A COMPARE of a MULT is slow on XScale; the muls instruction + will stall until the multiplication is complete. */ + *total = COSTS_N_INSNS (3); + return false; + + case MULT: + /* There is no point basing this on the tuning, since it is always the + fast variant if it exists at all. */ + if (mode == DImode + && (GET_CODE (XEXP (x, 0)) == GET_CODE (XEXP (x, 1))) + && (GET_CODE (XEXP (x, 0)) == ZERO_EXTEND + || GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)) + { + *total = COSTS_N_INSNS (2); + return false; + } + + + if (mode == DImode) + { + *total = COSTS_N_INSNS (5); + return false; + } + + if (GET_CODE (XEXP (x, 1)) == CONST_INT) + { + /* If operand 1 is a constant we can more accurately + calculate the cost of the multiply. The multiplier can + retire 15 bits on the first cycle and a further 12 on the + second. We do, of course, have to load the constant into + a register first. */ + unsigned HOST_WIDE_INT i = INTVAL (XEXP (x, 1)); + /* There's a general overhead of one cycle. */ + int cost = 1; + unsigned HOST_WIDE_INT masked_const; + + if (i & 0x80000000) + i = ~i; + + i &= (unsigned HOST_WIDE_INT) 0xffffffff; + + masked_const = i & 0xffff8000; + if (masked_const != 0) + { + cost++; + masked_const = i & 0xf8000000; + if (masked_const != 0) + cost++; + } + *total = COSTS_N_INSNS (cost); + return false; + } + + if (mode == SImode) + { + *total = COSTS_N_INSNS (3); + return false; + } + + /* Requires a lib call */ + *total = COSTS_N_INSNS (20); + return false; + + default: + return arm_rtx_costs_1 (x, outer_code, total, speed); + } +} + + +/* RTX costs for 9e (and later) cores. */ + +static bool +arm_9e_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer_code, + int *total, bool speed) +{ + enum machine_mode mode = GET_MODE (x); + + if (TARGET_THUMB1) + { + switch (code) + { + case MULT: + *total = COSTS_N_INSNS (3); + return true; + + default: + *total = thumb1_rtx_costs (x, code, outer_code); + return true; + } + } + + switch (code) + { + case MULT: + /* There is no point basing this on the tuning, since it is always the + fast variant if it exists at all. */ + if (mode == DImode + && (GET_CODE (XEXP (x, 0)) == GET_CODE (XEXP (x, 1))) + && (GET_CODE (XEXP (x, 0)) == ZERO_EXTEND + || GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)) + { + *total = COSTS_N_INSNS (2); + return false; + } + + + if (mode == DImode) + { + *total = COSTS_N_INSNS (5); + return false; + } + + if (mode == SImode) + { + *total = COSTS_N_INSNS (2); + return false; + } + + if (GET_MODE_CLASS (mode) == MODE_FLOAT) + { + if (TARGET_HARD_FLOAT + && (mode == SFmode + || (mode == DFmode && !TARGET_VFP_SINGLE))) + { + *total = COSTS_N_INSNS (1); + return false; + } + } + + *total = COSTS_N_INSNS (20); + return false; + + default: + return arm_rtx_costs_1 (x, outer_code, total, speed); + } +} +/* All address computations that can be done are free, but rtx cost returns + the same for practically all of them. So we weight the different types + of address here in the order (most pref first): + PRE/POST_INC/DEC, SHIFT or NON-INT sum, INT sum, REG, MEM or LABEL. */ +static inline int +arm_arm_address_cost (rtx x) +{ + enum rtx_code c = GET_CODE (x); + + if (c == PRE_INC || c == PRE_DEC || c == POST_INC || c == POST_DEC) + return 0; + if (c == MEM || c == LABEL_REF || c == SYMBOL_REF) + return 10; + + if (c == PLUS) + { + if (GET_CODE (XEXP (x, 1)) == CONST_INT) + return 2; + + if (ARITHMETIC_P (XEXP (x, 0)) || ARITHMETIC_P (XEXP (x, 1))) + return 3; + + return 4; + } + + return 6; +} + +static inline int +arm_thumb_address_cost (rtx x) +{ + enum rtx_code c = GET_CODE (x); + + if (c == REG) + return 1; + if (c == PLUS + && GET_CODE (XEXP (x, 0)) == REG + && GET_CODE (XEXP (x, 1)) == CONST_INT) + return 1; + + return 2; +} + +static int +arm_address_cost (rtx x, bool speed ATTRIBUTE_UNUSED) +{ + return TARGET_32BIT ? arm_arm_address_cost (x) : arm_thumb_address_cost (x); +} + +/* Adjust cost hook for XScale. */ +static bool +xscale_sched_adjust_cost (rtx insn, rtx link, rtx dep, int * cost) +{ + /* Some true dependencies can have a higher cost depending + on precisely how certain input operands are used. */ + if (REG_NOTE_KIND(link) == 0 + && recog_memoized (insn) >= 0 + && recog_memoized (dep) >= 0) + { + int shift_opnum = get_attr_shift (insn); + enum attr_type attr_type = get_attr_type (dep); + + /* If nonzero, SHIFT_OPNUM contains the operand number of a shifted + operand for INSN. If we have a shifted input operand and the + instruction we depend on is another ALU instruction, then we may + have to account for an additional stall. */ + if (shift_opnum != 0 + && (attr_type == TYPE_ALU_SHIFT || attr_type == TYPE_ALU_SHIFT_REG)) + { + rtx shifted_operand; + int opno; + + /* Get the shifted operand. */ + extract_insn (insn); + shifted_operand = recog_data.operand[shift_opnum]; + + /* Iterate over all the operands in DEP. If we write an operand + that overlaps with SHIFTED_OPERAND, then we have increase the + cost of this dependency. */ + extract_insn (dep); + preprocess_constraints (); + for (opno = 0; opno < recog_data.n_operands; opno++) + { + /* We can ignore strict inputs. */ + if (recog_data.operand_type[opno] == OP_IN) + continue; + + if (reg_overlap_mentioned_p (recog_data.operand[opno], + shifted_operand)) + { + *cost = 2; + return false; + } + } + } + } + return true; +} + +/* Adjust cost hook for Cortex A9. */ +static bool +cortex_a9_sched_adjust_cost (rtx insn, rtx link, rtx dep, int * cost) +{ + switch (REG_NOTE_KIND (link)) + { + case REG_DEP_ANTI: + *cost = 0; + return false; + + case REG_DEP_TRUE: + case REG_DEP_OUTPUT: + if (recog_memoized (insn) >= 0 + && recog_memoized (dep) >= 0) + { + if (GET_CODE (PATTERN (insn)) == SET) + { + if (GET_MODE_CLASS + (GET_MODE (SET_DEST (PATTERN (insn)))) == MODE_FLOAT + || GET_MODE_CLASS + (GET_MODE (SET_SRC (PATTERN (insn)))) == MODE_FLOAT) + { + enum attr_type attr_type_insn = get_attr_type (insn); + enum attr_type attr_type_dep = get_attr_type (dep); + + /* By default all dependencies of the form + s0 = s0 s1 + s0 = s0 s2 + have an extra latency of 1 cycle because + of the input and output dependency in this + case. However this gets modeled as an true + dependency and hence all these checks. */ + if (REG_P (SET_DEST (PATTERN (insn))) + && REG_P (SET_DEST (PATTERN (dep))) + && reg_overlap_mentioned_p (SET_DEST (PATTERN (insn)), + SET_DEST (PATTERN (dep)))) + { + /* FMACS is a special case where the dependant + instruction can be issued 3 cycles before + the normal latency in case of an output + dependency. */ + if ((attr_type_insn == TYPE_FMACS + || attr_type_insn == TYPE_FMACD) + && (attr_type_dep == TYPE_FMACS + || attr_type_dep == TYPE_FMACD)) + { + if (REG_NOTE_KIND (link) == REG_DEP_OUTPUT) + *cost = insn_default_latency (dep) - 3; + else + *cost = insn_default_latency (dep); + return false; + } + else + { + if (REG_NOTE_KIND (link) == REG_DEP_OUTPUT) + *cost = insn_default_latency (dep) + 1; + else + *cost = insn_default_latency (dep); + } + return false; + } + } + } + } + break; + + default: + gcc_unreachable (); + } + + return true; +} + +/* Adjust cost hook for FA726TE. */ +static bool +fa726te_sched_adjust_cost (rtx insn, rtx link, rtx dep, int * cost) +{ + /* For FA726TE, true dependency on CPSR (i.e. set cond followed by predicated) + have penalty of 3. */ + if (REG_NOTE_KIND (link) == REG_DEP_TRUE + && recog_memoized (insn) >= 0 + && recog_memoized (dep) >= 0 + && get_attr_conds (dep) == CONDS_SET) + { + /* Use of carry (e.g. 64-bit arithmetic) in ALU: 3-cycle latency. */ + if (get_attr_conds (insn) == CONDS_USE + && get_attr_type (insn) != TYPE_BRANCH) + { + *cost = 3; + return false; + } + + if (GET_CODE (PATTERN (insn)) == COND_EXEC + || get_attr_conds (insn) == CONDS_USE) + { + *cost = 0; + return false; + } + } + + return true; +} + +/* This function implements the target macro TARGET_SCHED_ADJUST_COST. + It corrects the value of COST based on the relationship between + INSN and DEP through the dependence LINK. It returns the new + value. There is a per-core adjust_cost hook to adjust scheduler costs + and the per-core hook can choose to completely override the generic + adjust_cost function. Only put bits of code into arm_adjust_cost that + are common across all cores. */ +static int +arm_adjust_cost (rtx insn, rtx link, rtx dep, int cost) +{ + rtx i_pat, d_pat; + + /* When generating Thumb-1 code, we want to place flag-setting operations + close to a conditional branch which depends on them, so that we can + omit the comparison. */ + if (TARGET_THUMB1 + && REG_NOTE_KIND (link) == 0 + && recog_memoized (insn) == CODE_FOR_cbranchsi4_insn + && recog_memoized (dep) >= 0 + && get_attr_conds (dep) == CONDS_SET) + return 0; + + if (current_tune->sched_adjust_cost != NULL) + { + if (!current_tune->sched_adjust_cost (insn, link, dep, &cost)) + return cost; + } + + /* XXX This is not strictly true for the FPA. */ + if (REG_NOTE_KIND (link) == REG_DEP_ANTI + || REG_NOTE_KIND (link) == REG_DEP_OUTPUT) + return 0; + + /* Call insns don't incur a stall, even if they follow a load. */ + if (REG_NOTE_KIND (link) == 0 + && GET_CODE (insn) == CALL_INSN) + return 1; + + if ((i_pat = single_set (insn)) != NULL + && GET_CODE (SET_SRC (i_pat)) == MEM + && (d_pat = single_set (dep)) != NULL + && GET_CODE (SET_DEST (d_pat)) == MEM) + { + rtx src_mem = XEXP (SET_SRC (i_pat), 0); + /* This is a load after a store, there is no conflict if the load reads + from a cached area. Assume that loads from the stack, and from the + constant pool are cached, and that others will miss. This is a + hack. */ + + if ((GET_CODE (src_mem) == SYMBOL_REF + && CONSTANT_POOL_ADDRESS_P (src_mem)) + || reg_mentioned_p (stack_pointer_rtx, src_mem) + || reg_mentioned_p (frame_pointer_rtx, src_mem) + || reg_mentioned_p (hard_frame_pointer_rtx, src_mem)) + return 1; + } + + return cost; +} + +static int fp_consts_inited = 0; + +/* Only zero is valid for VFP. Other values are also valid for FPA. */ +static const char * const strings_fp[8] = +{ + "0", "1", "2", "3", + "4", "5", "0.5", "10" +}; + +static REAL_VALUE_TYPE values_fp[8]; + +static void +init_fp_table (void) +{ + int i; + REAL_VALUE_TYPE r; + + if (TARGET_VFP) + fp_consts_inited = 1; + else + fp_consts_inited = 8; + + for (i = 0; i < fp_consts_inited; i++) + { + r = REAL_VALUE_ATOF (strings_fp[i], DFmode); + values_fp[i] = r; + } +} + +/* Return TRUE if rtx X is a valid immediate FP constant. */ +int +arm_const_double_rtx (rtx x) +{ + REAL_VALUE_TYPE r; + int i; + + if (!fp_consts_inited) + init_fp_table (); + + REAL_VALUE_FROM_CONST_DOUBLE (r, x); + if (REAL_VALUE_MINUS_ZERO (r)) + return 0; + + for (i = 0; i < fp_consts_inited; i++) + if (REAL_VALUES_EQUAL (r, values_fp[i])) + return 1; + + return 0; +} + +/* Return TRUE if rtx X is a valid immediate FPA constant. */ +int +neg_const_double_rtx_ok_for_fpa (rtx x) +{ + REAL_VALUE_TYPE r; + int i; + + if (!fp_consts_inited) + init_fp_table (); + + REAL_VALUE_FROM_CONST_DOUBLE (r, x); + r = real_value_negate (&r); + if (REAL_VALUE_MINUS_ZERO (r)) + return 0; + + for (i = 0; i < 8; i++) + if (REAL_VALUES_EQUAL (r, values_fp[i])) + return 1; + + return 0; +} + + +/* VFPv3 has a fairly wide range of representable immediates, formed from + "quarter-precision" floating-point values. These can be evaluated using this + formula (with ^ for exponentiation): + + -1^s * n * 2^-r + + Where 's' is a sign bit (0/1), 'n' and 'r' are integers such that + 16 <= n <= 31 and 0 <= r <= 7. + + These values are mapped onto an 8-bit integer ABCDEFGH s.t. + + - A (most-significant) is the sign bit. + - BCD are the exponent (encoded as r XOR 3). + - EFGH are the mantissa (encoded as n - 16). +*/ + +/* Return an integer index for a VFPv3 immediate operand X suitable for the + fconst[sd] instruction, or -1 if X isn't suitable. */ +static int +vfp3_const_double_index (rtx x) +{ + REAL_VALUE_TYPE r, m; + int sign, exponent; + unsigned HOST_WIDE_INT mantissa, mant_hi; + unsigned HOST_WIDE_INT mask; + HOST_WIDE_INT m1, m2; + int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1; + + if (!TARGET_VFP3 || GET_CODE (x) != CONST_DOUBLE) + return -1; + + REAL_VALUE_FROM_CONST_DOUBLE (r, x); + + /* We can't represent these things, so detect them first. */ + if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r) || REAL_VALUE_MINUS_ZERO (r)) + return -1; + + /* Extract sign, exponent and mantissa. */ + sign = REAL_VALUE_NEGATIVE (r) ? 1 : 0; + r = real_value_abs (&r); + exponent = REAL_EXP (&r); + /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the + highest (sign) bit, with a fixed binary point at bit point_pos. + WARNING: If there's ever a VFP version which uses more than 2 * H_W_I - 1 + bits for the mantissa, this may fail (low bits would be lost). */ + real_ldexp (&m, &r, point_pos - exponent); + REAL_VALUE_TO_INT (&m1, &m2, m); + mantissa = m1; + mant_hi = m2; + + /* If there are bits set in the low part of the mantissa, we can't + represent this value. */ + if (mantissa != 0) + return -1; + + /* Now make it so that mantissa contains the most-significant bits, and move + the point_pos to indicate that the least-significant bits have been + discarded. */ + point_pos -= HOST_BITS_PER_WIDE_INT; + mantissa = mant_hi; + + /* We can permit four significant bits of mantissa only, plus a high bit + which is always 1. */ + mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1; + if ((mantissa & mask) != 0) + return -1; + + /* Now we know the mantissa is in range, chop off the unneeded bits. */ + mantissa >>= point_pos - 5; + + /* The mantissa may be zero. Disallow that case. (It's possible to load the + floating-point immediate zero with Neon using an integer-zero load, but + that case is handled elsewhere.) */ + if (mantissa == 0) + return -1; + + gcc_assert (mantissa >= 16 && mantissa <= 31); + + /* The value of 5 here would be 4 if GCC used IEEE754-like encoding (where + normalized significands are in the range [1, 2). (Our mantissa is shifted + left 4 places at this point relative to normalized IEEE754 values). GCC + internally uses [0.5, 1) (see real.c), so the exponent returned from + REAL_EXP must be altered. */ + exponent = 5 - exponent; + + if (exponent < 0 || exponent > 7) + return -1; + + /* Sign, mantissa and exponent are now in the correct form to plug into the + formula described in the comment above. */ + return (sign << 7) | ((exponent ^ 3) << 4) | (mantissa - 16); +} + +/* Return TRUE if rtx X is a valid immediate VFPv3 constant. */ +int +vfp3_const_double_rtx (rtx x) +{ + if (!TARGET_VFP3) + return 0; + + return vfp3_const_double_index (x) != -1; +} + +/* Recognize immediates which can be used in various Neon instructions. Legal + immediates are described by the following table (for VMVN variants, the + bitwise inverse of the constant shown is recognized. In either case, VMOV + is output and the correct instruction to use for a given constant is chosen + by the assembler). The constant shown is replicated across all elements of + the destination vector. + + insn elems variant constant (binary) + ---- ----- ------- ----------------- + vmov i32 0 00000000 00000000 00000000 abcdefgh + vmov i32 1 00000000 00000000 abcdefgh 00000000 + vmov i32 2 00000000 abcdefgh 00000000 00000000 + vmov i32 3 abcdefgh 00000000 00000000 00000000 + vmov i16 4 00000000 abcdefgh + vmov i16 5 abcdefgh 00000000 + vmvn i32 6 00000000 00000000 00000000 abcdefgh + vmvn i32 7 00000000 00000000 abcdefgh 00000000 + vmvn i32 8 00000000 abcdefgh 00000000 00000000 + vmvn i32 9 abcdefgh 00000000 00000000 00000000 + vmvn i16 10 00000000 abcdefgh + vmvn i16 11 abcdefgh 00000000 + vmov i32 12 00000000 00000000 abcdefgh 11111111 + vmvn i32 13 00000000 00000000 abcdefgh 11111111 + vmov i32 14 00000000 abcdefgh 11111111 11111111 + vmvn i32 15 00000000 abcdefgh 11111111 11111111 + vmov i8 16 abcdefgh + vmov i64 17 aaaaaaaa bbbbbbbb cccccccc dddddddd + eeeeeeee ffffffff gggggggg hhhhhhhh + vmov f32 18 aBbbbbbc defgh000 00000000 00000000 + + For case 18, B = !b. Representable values are exactly those accepted by + vfp3_const_double_index, but are output as floating-point numbers rather + than indices. + + Variants 0-5 (inclusive) may also be used as immediates for the second + operand of VORR/VBIC instructions. + + The INVERSE argument causes the bitwise inverse of the given operand to be + recognized instead (used for recognizing legal immediates for the VAND/VORN + pseudo-instructions). If INVERSE is true, the value placed in *MODCONST is + *not* inverted (i.e. the pseudo-instruction forms vand/vorn should still be + output, rather than the real insns vbic/vorr). + + INVERSE makes no difference to the recognition of float vectors. + + The return value is the variant of immediate as shown in the above table, or + -1 if the given value doesn't match any of the listed patterns. +*/ +static int +neon_valid_immediate (rtx op, enum machine_mode mode, int inverse, + rtx *modconst, int *elementwidth) +{ +#define CHECK(STRIDE, ELSIZE, CLASS, TEST) \ + matches = 1; \ + for (i = 0; i < idx; i += (STRIDE)) \ + if (!(TEST)) \ + matches = 0; \ + if (matches) \ + { \ + immtype = (CLASS); \ + elsize = (ELSIZE); \ + break; \ + } + + unsigned int i, elsize = 0, idx = 0, n_elts = CONST_VECTOR_NUNITS (op); + unsigned int innersize = GET_MODE_SIZE (GET_MODE_INNER (mode)); + unsigned char bytes[16]; + int immtype = -1, matches; + unsigned int invmask = inverse ? 0xff : 0; + + /* Vectors of float constants. */ + if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT) + { + rtx el0 = CONST_VECTOR_ELT (op, 0); + REAL_VALUE_TYPE r0; + + if (!vfp3_const_double_rtx (el0)) + return -1; + + REAL_VALUE_FROM_CONST_DOUBLE (r0, el0); + + for (i = 1; i < n_elts; i++) + { + rtx elt = CONST_VECTOR_ELT (op, i); + REAL_VALUE_TYPE re; + + REAL_VALUE_FROM_CONST_DOUBLE (re, elt); + + if (!REAL_VALUES_EQUAL (r0, re)) + return -1; + } + + if (modconst) + *modconst = CONST_VECTOR_ELT (op, 0); + + if (elementwidth) + *elementwidth = 0; + + return 18; + } + + /* Splat vector constant out into a byte vector. */ + for (i = 0; i < n_elts; i++) + { + rtx el = CONST_VECTOR_ELT (op, i); + unsigned HOST_WIDE_INT elpart; + unsigned int part, parts; + + if (GET_CODE (el) == CONST_INT) + { + elpart = INTVAL (el); + parts = 1; + } + else if (GET_CODE (el) == CONST_DOUBLE) + { + elpart = CONST_DOUBLE_LOW (el); + parts = 2; + } + else + gcc_unreachable (); + + for (part = 0; part < parts; part++) + { + unsigned int byte; + for (byte = 0; byte < innersize; byte++) + { + bytes[idx++] = (elpart & 0xff) ^ invmask; + elpart >>= BITS_PER_UNIT; + } + if (GET_CODE (el) == CONST_DOUBLE) + elpart = CONST_DOUBLE_HIGH (el); + } + } + + /* Sanity check. */ + gcc_assert (idx == GET_MODE_SIZE (mode)); + + do + { + CHECK (4, 32, 0, bytes[i] == bytes[0] && bytes[i + 1] == 0 + && bytes[i + 2] == 0 && bytes[i + 3] == 0); + + CHECK (4, 32, 1, bytes[i] == 0 && bytes[i + 1] == bytes[1] + && bytes[i + 2] == 0 && bytes[i + 3] == 0); + + CHECK (4, 32, 2, bytes[i] == 0 && bytes[i + 1] == 0 + && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0); + + CHECK (4, 32, 3, bytes[i] == 0 && bytes[i + 1] == 0 + && bytes[i + 2] == 0 && bytes[i + 3] == bytes[3]); + + CHECK (2, 16, 4, bytes[i] == bytes[0] && bytes[i + 1] == 0); + + CHECK (2, 16, 5, bytes[i] == 0 && bytes[i + 1] == bytes[1]); + + CHECK (4, 32, 6, bytes[i] == bytes[0] && bytes[i + 1] == 0xff + && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff); + + CHECK (4, 32, 7, bytes[i] == 0xff && bytes[i + 1] == bytes[1] + && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff); + + CHECK (4, 32, 8, bytes[i] == 0xff && bytes[i + 1] == 0xff + && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff); + + CHECK (4, 32, 9, bytes[i] == 0xff && bytes[i + 1] == 0xff + && bytes[i + 2] == 0xff && bytes[i + 3] == bytes[3]); + + CHECK (2, 16, 10, bytes[i] == bytes[0] && bytes[i + 1] == 0xff); + + CHECK (2, 16, 11, bytes[i] == 0xff && bytes[i + 1] == bytes[1]); + + CHECK (4, 32, 12, bytes[i] == 0xff && bytes[i + 1] == bytes[1] + && bytes[i + 2] == 0 && bytes[i + 3] == 0); + + CHECK (4, 32, 13, bytes[i] == 0 && bytes[i + 1] == bytes[1] + && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff); + + CHECK (4, 32, 14, bytes[i] == 0xff && bytes[i + 1] == 0xff + && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0); + + CHECK (4, 32, 15, bytes[i] == 0 && bytes[i + 1] == 0 + && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff); + + CHECK (1, 8, 16, bytes[i] == bytes[0]); + + CHECK (1, 64, 17, (bytes[i] == 0 || bytes[i] == 0xff) + && bytes[i] == bytes[(i + 8) % idx]); + } + while (0); + + if (immtype == -1) + return -1; + + if (elementwidth) + *elementwidth = elsize; + + if (modconst) + { + unsigned HOST_WIDE_INT imm = 0; + + /* Un-invert bytes of recognized vector, if necessary. */ + if (invmask != 0) + for (i = 0; i < idx; i++) + bytes[i] ^= invmask; + + if (immtype == 17) + { + /* FIXME: Broken on 32-bit H_W_I hosts. */ + gcc_assert (sizeof (HOST_WIDE_INT) == 8); + + for (i = 0; i < 8; i++) + imm |= (unsigned HOST_WIDE_INT) (bytes[i] ? 0xff : 0) + << (i * BITS_PER_UNIT); + + *modconst = GEN_INT (imm); + } + else + { + unsigned HOST_WIDE_INT imm = 0; + + for (i = 0; i < elsize / BITS_PER_UNIT; i++) + imm |= (unsigned HOST_WIDE_INT) bytes[i] << (i * BITS_PER_UNIT); + + *modconst = GEN_INT (imm); + } + } + + return immtype; +#undef CHECK +} + +/* Return TRUE if rtx X is legal for use as either a Neon VMOV (or, implicitly, + VMVN) immediate. Write back width per element to *ELEMENTWIDTH (or zero for + float elements), and a modified constant (whatever should be output for a + VMOV) in *MODCONST. */ + +int +neon_immediate_valid_for_move (rtx op, enum machine_mode mode, + rtx *modconst, int *elementwidth) +{ + rtx tmpconst; + int tmpwidth; + int retval = neon_valid_immediate (op, mode, 0, &tmpconst, &tmpwidth); + + if (retval == -1) + return 0; + + if (modconst) + *modconst = tmpconst; + + if (elementwidth) + *elementwidth = tmpwidth; + + return 1; +} + +/* Return TRUE if rtx X is legal for use in a VORR or VBIC instruction. If + the immediate is valid, write a constant suitable for using as an operand + to VORR/VBIC/VAND/VORN to *MODCONST and the corresponding element width to + *ELEMENTWIDTH. See neon_valid_immediate for description of INVERSE. */ + +int +neon_immediate_valid_for_logic (rtx op, enum machine_mode mode, int inverse, + rtx *modconst, int *elementwidth) +{ + rtx tmpconst; + int tmpwidth; + int retval = neon_valid_immediate (op, mode, inverse, &tmpconst, &tmpwidth); + + if (retval < 0 || retval > 5) + return 0; + + if (modconst) + *modconst = tmpconst; + + if (elementwidth) + *elementwidth = tmpwidth; + + return 1; +} + +/* Return a string suitable for output of Neon immediate logic operation + MNEM. */ + +char * +neon_output_logic_immediate (const char *mnem, rtx *op2, enum machine_mode mode, + int inverse, int quad) +{ + int width, is_valid; + static char templ[40]; + + is_valid = neon_immediate_valid_for_logic (*op2, mode, inverse, op2, &width); + + gcc_assert (is_valid != 0); + + if (quad) + sprintf (templ, "%s.i%d\t%%q0, %%2", mnem, width); + else + sprintf (templ, "%s.i%d\t%%P0, %%2", mnem, width); + + return templ; +} + +/* Output a sequence of pairwise operations to implement a reduction. + NOTE: We do "too much work" here, because pairwise operations work on two + registers-worth of operands in one go. Unfortunately we can't exploit those + extra calculations to do the full operation in fewer steps, I don't think. + Although all vector elements of the result but the first are ignored, we + actually calculate the same result in each of the elements. An alternative + such as initially loading a vector with zero to use as each of the second + operands would use up an additional register and take an extra instruction, + for no particular gain. */ + +void +neon_pairwise_reduce (rtx op0, rtx op1, enum machine_mode mode, + rtx (*reduc) (rtx, rtx, rtx)) +{ + enum machine_mode inner = GET_MODE_INNER (mode); + unsigned int i, parts = GET_MODE_SIZE (mode) / GET_MODE_SIZE (inner); + rtx tmpsum = op1; + + for (i = parts / 2; i >= 1; i /= 2) + { + rtx dest = (i == 1) ? op0 : gen_reg_rtx (mode); + emit_insn (reduc (dest, tmpsum, tmpsum)); + tmpsum = dest; + } +} + +/* If VALS is a vector constant that can be loaded into a register + using VDUP, generate instructions to do so and return an RTX to + assign to the register. Otherwise return NULL_RTX. */ + +static rtx +neon_vdup_constant (rtx vals) +{ + enum machine_mode mode = GET_MODE (vals); + enum machine_mode inner_mode = GET_MODE_INNER (mode); + int n_elts = GET_MODE_NUNITS (mode); + bool all_same = true; + rtx x; + int i; + + if (GET_CODE (vals) != CONST_VECTOR || GET_MODE_SIZE (inner_mode) > 4) + return NULL_RTX; + + for (i = 0; i < n_elts; ++i) + { + x = XVECEXP (vals, 0, i); + if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0))) + all_same = false; + } + + if (!all_same) + /* The elements are not all the same. We could handle repeating + patterns of a mode larger than INNER_MODE here (e.g. int8x8_t + {0, C, 0, C, 0, C, 0, C} which can be loaded using + vdup.i16). */ + return NULL_RTX; + + /* We can load this constant by using VDUP and a constant in a + single ARM register. This will be cheaper than a vector + load. */ + + x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, 0)); + return gen_rtx_VEC_DUPLICATE (mode, x); +} + +/* Generate code to load VALS, which is a PARALLEL containing only + constants (for vec_init) or CONST_VECTOR, efficiently into a + register. Returns an RTX to copy into the register, or NULL_RTX + for a PARALLEL that can not be converted into a CONST_VECTOR. */ + +rtx +neon_make_constant (rtx vals) +{ + enum machine_mode mode = GET_MODE (vals); + rtx target; + rtx const_vec = NULL_RTX; + int n_elts = GET_MODE_NUNITS (mode); + int n_const = 0; + int i; + + if (GET_CODE (vals) == CONST_VECTOR) + const_vec = vals; + else if (GET_CODE (vals) == PARALLEL) + { + /* A CONST_VECTOR must contain only CONST_INTs and + CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF). + Only store valid constants in a CONST_VECTOR. */ + for (i = 0; i < n_elts; ++i) + { + rtx x = XVECEXP (vals, 0, i); + if (GET_CODE (x) == CONST_INT || GET_CODE (x) == CONST_DOUBLE) + n_const++; + } + if (n_const == n_elts) + const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)); + } + else + gcc_unreachable (); + + if (const_vec != NULL + && neon_immediate_valid_for_move (const_vec, mode, NULL, NULL)) + /* Load using VMOV. On Cortex-A8 this takes one cycle. */ + return const_vec; + else if ((target = neon_vdup_constant (vals)) != NULL_RTX) + /* Loaded using VDUP. On Cortex-A8 the VDUP takes one NEON + pipeline cycle; creating the constant takes one or two ARM + pipeline cycles. */ + return target; + else if (const_vec != NULL_RTX) + /* Load from constant pool. On Cortex-A8 this takes two cycles + (for either double or quad vectors). We can not take advantage + of single-cycle VLD1 because we need a PC-relative addressing + mode. */ + return const_vec; + else + /* A PARALLEL containing something not valid inside CONST_VECTOR. + We can not construct an initializer. */ + return NULL_RTX; +} + +/* Initialize vector TARGET to VALS. */ + +void +neon_expand_vector_init (rtx target, rtx vals) +{ + enum machine_mode mode = GET_MODE (target); + enum machine_mode inner_mode = GET_MODE_INNER (mode); + int n_elts = GET_MODE_NUNITS (mode); + int n_var = 0, one_var = -1; + bool all_same = true; + rtx x, mem; + int i; + + for (i = 0; i < n_elts; ++i) + { + x = XVECEXP (vals, 0, i); + if (!CONSTANT_P (x)) + ++n_var, one_var = i; + + if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0))) + all_same = false; + } + + if (n_var == 0) + { + rtx constant = neon_make_constant (vals); + if (constant != NULL_RTX) + { + emit_move_insn (target, constant); + return; + } + } + + /* Splat a single non-constant element if we can. */ + if (all_same && GET_MODE_SIZE (inner_mode) <= 4) + { + x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, 0)); + emit_insn (gen_rtx_SET (VOIDmode, target, + gen_rtx_VEC_DUPLICATE (mode, x))); + return; + } + + /* One field is non-constant. Load constant then overwrite varying + field. This is more efficient than using the stack. */ + if (n_var == 1) + { + rtx copy = copy_rtx (vals); + rtx index = GEN_INT (one_var); + + /* Load constant part of vector, substitute neighboring value for + varying element. */ + XVECEXP (copy, 0, one_var) = XVECEXP (vals, 0, (one_var + 1) % n_elts); + neon_expand_vector_init (target, copy); + + /* Insert variable. */ + x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, one_var)); + switch (mode) + { + case V8QImode: + emit_insn (gen_neon_vset_lanev8qi (target, x, target, index)); + break; + case V16QImode: + emit_insn (gen_neon_vset_lanev16qi (target, x, target, index)); + break; + case V4HImode: + emit_insn (gen_neon_vset_lanev4hi (target, x, target, index)); + break; + case V8HImode: + emit_insn (gen_neon_vset_lanev8hi (target, x, target, index)); + break; + case V2SImode: + emit_insn (gen_neon_vset_lanev2si (target, x, target, index)); + break; + case V4SImode: + emit_insn (gen_neon_vset_lanev4si (target, x, target, index)); + break; + case V2SFmode: + emit_insn (gen_neon_vset_lanev2sf (target, x, target, index)); + break; + case V4SFmode: + emit_insn (gen_neon_vset_lanev4sf (target, x, target, index)); + break; + case V2DImode: + emit_insn (gen_neon_vset_lanev2di (target, x, target, index)); + break; + default: + gcc_unreachable (); + } + return; + } + + /* Construct the vector in memory one field at a time + and load the whole vector. */ + mem = assign_stack_temp (mode, GET_MODE_SIZE (mode), 0); + for (i = 0; i < n_elts; i++) + emit_move_insn (adjust_address_nv (mem, inner_mode, + i * GET_MODE_SIZE (inner_mode)), + XVECEXP (vals, 0, i)); + emit_move_insn (target, mem); +} + +/* Ensure OPERAND lies between LOW (inclusive) and HIGH (exclusive). Raise + ERR if it doesn't. FIXME: NEON bounds checks occur late in compilation, so + reported source locations are bogus. */ + +static void +bounds_check (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high, + const char *err) +{ + HOST_WIDE_INT lane; + + gcc_assert (GET_CODE (operand) == CONST_INT); + + lane = INTVAL (operand); + + if (lane < low || lane >= high) + error (err); +} + +/* Bounds-check lanes. */ + +void +neon_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high) +{ + bounds_check (operand, low, high, "lane out of range"); +} + +/* Bounds-check constants. */ + +void +neon_const_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high) +{ + bounds_check (operand, low, high, "constant out of range"); +} + +HOST_WIDE_INT +neon_element_bits (enum machine_mode mode) +{ + if (mode == DImode) + return GET_MODE_BITSIZE (mode); + else + return GET_MODE_BITSIZE (GET_MODE_INNER (mode)); +} + + +/* Predicates for `match_operand' and `match_operator'. */ + +/* Return nonzero if OP is a valid Cirrus memory address pattern. */ +int +cirrus_memory_offset (rtx op) +{ + /* Reject eliminable registers. */ + if (! (reload_in_progress || reload_completed) + && ( reg_mentioned_p (frame_pointer_rtx, op) + || reg_mentioned_p (arg_pointer_rtx, op) + || reg_mentioned_p (virtual_incoming_args_rtx, op) + || reg_mentioned_p (virtual_outgoing_args_rtx, op) + || reg_mentioned_p (virtual_stack_dynamic_rtx, op) + || reg_mentioned_p (virtual_stack_vars_rtx, op))) + return 0; + + if (GET_CODE (op) == MEM) + { + rtx ind; + + ind = XEXP (op, 0); + + /* Match: (mem (reg)). */ + if (GET_CODE (ind) == REG) + return 1; + + /* Match: + (mem (plus (reg) + (const))). */ + if (GET_CODE (ind) == PLUS + && GET_CODE (XEXP (ind, 0)) == REG + && REG_MODE_OK_FOR_BASE_P (XEXP (ind, 0), VOIDmode) + && GET_CODE (XEXP (ind, 1)) == CONST_INT) + return 1; + } + + return 0; +} + +/* Return TRUE if OP is a valid coprocessor memory address pattern. + WB is true if full writeback address modes are allowed and is false + if limited writeback address modes (POST_INC and PRE_DEC) are + allowed. */ + +int +arm_coproc_mem_operand (rtx op, bool wb) +{ + rtx ind; + + /* Reject eliminable registers. */ + if (! (reload_in_progress || reload_completed) + && ( reg_mentioned_p (frame_pointer_rtx, op) + || reg_mentioned_p (arg_pointer_rtx, op) + || reg_mentioned_p (virtual_incoming_args_rtx, op) + || reg_mentioned_p (virtual_outgoing_args_rtx, op) + || reg_mentioned_p (virtual_stack_dynamic_rtx, op) + || reg_mentioned_p (virtual_stack_vars_rtx, op))) + return FALSE; + + /* Constants are converted into offsets from labels. */ + if (GET_CODE (op) != MEM) + return FALSE; + + ind = XEXP (op, 0); + + if (reload_completed + && (GET_CODE (ind) == LABEL_REF + || (GET_CODE (ind) == CONST + && GET_CODE (XEXP (ind, 0)) == PLUS + && GET_CODE (XEXP (XEXP (ind, 0), 0)) == LABEL_REF + && GET_CODE (XEXP (XEXP (ind, 0), 1)) == CONST_INT))) + return TRUE; + + /* Match: (mem (reg)). */ + if (GET_CODE (ind) == REG) + return arm_address_register_rtx_p (ind, 0); + + /* Autoincremment addressing modes. POST_INC and PRE_DEC are + acceptable in any case (subject to verification by + arm_address_register_rtx_p). We need WB to be true to accept + PRE_INC and POST_DEC. */ + if (GET_CODE (ind) == POST_INC + || GET_CODE (ind) == PRE_DEC + || (wb + && (GET_CODE (ind) == PRE_INC + || GET_CODE (ind) == POST_DEC))) + return arm_address_register_rtx_p (XEXP (ind, 0), 0); + + if (wb + && (GET_CODE (ind) == POST_MODIFY || GET_CODE (ind) == PRE_MODIFY) + && arm_address_register_rtx_p (XEXP (ind, 0), 0) + && GET_CODE (XEXP (ind, 1)) == PLUS + && rtx_equal_p (XEXP (XEXP (ind, 1), 0), XEXP (ind, 0))) + ind = XEXP (ind, 1); + + /* Match: + (plus (reg) + (const)). */ + if (GET_CODE (ind) == PLUS + && GET_CODE (XEXP (ind, 0)) == REG + && REG_MODE_OK_FOR_BASE_P (XEXP (ind, 0), VOIDmode) + && GET_CODE (XEXP (ind, 1)) == CONST_INT + && INTVAL (XEXP (ind, 1)) > -1024 + && INTVAL (XEXP (ind, 1)) < 1024 + && (INTVAL (XEXP (ind, 1)) & 3) == 0) + return TRUE; + + return FALSE; +} + +/* Return TRUE if OP is a memory operand which we can load or store a vector + to/from. TYPE is one of the following values: + 0 - Vector load/stor (vldr) + 1 - Core registers (ldm) + 2 - Element/structure loads (vld1) + */ +int +neon_vector_mem_operand (rtx op, int type) +{ + rtx ind; + + /* Reject eliminable registers. */ + if (! (reload_in_progress || reload_completed) + && ( reg_mentioned_p (frame_pointer_rtx, op) + || reg_mentioned_p (arg_pointer_rtx, op) + || reg_mentioned_p (virtual_incoming_args_rtx, op) + || reg_mentioned_p (virtual_outgoing_args_rtx, op) + || reg_mentioned_p (virtual_stack_dynamic_rtx, op) + || reg_mentioned_p (virtual_stack_vars_rtx, op))) + return FALSE; + + /* Constants are converted into offsets from labels. */ + if (GET_CODE (op) != MEM) + return FALSE; + + ind = XEXP (op, 0); + + if (reload_completed + && (GET_CODE (ind) == LABEL_REF + || (GET_CODE (ind) == CONST + && GET_CODE (XEXP (ind, 0)) == PLUS + && GET_CODE (XEXP (XEXP (ind, 0), 0)) == LABEL_REF + && GET_CODE (XEXP (XEXP (ind, 0), 1)) == CONST_INT))) + return TRUE; + + /* Match: (mem (reg)). */ + if (GET_CODE (ind) == REG) + return arm_address_register_rtx_p (ind, 0); + + /* Allow post-increment with Neon registers. */ + if ((type != 1 && GET_CODE (ind) == POST_INC) + || (type == 0 && GET_CODE (ind) == PRE_DEC)) + return arm_address_register_rtx_p (XEXP (ind, 0), 0); + + /* FIXME: vld1 allows register post-modify. */ + + /* Match: + (plus (reg) + (const)). */ + if (type == 0 + && GET_CODE (ind) == PLUS + && GET_CODE (XEXP (ind, 0)) == REG + && REG_MODE_OK_FOR_BASE_P (XEXP (ind, 0), VOIDmode) + && GET_CODE (XEXP (ind, 1)) == CONST_INT + && INTVAL (XEXP (ind, 1)) > -1024 + && INTVAL (XEXP (ind, 1)) < 1016 + && (INTVAL (XEXP (ind, 1)) & 3) == 0) + return TRUE; + + return FALSE; +} + +/* Return TRUE if OP is a mem suitable for loading/storing a Neon struct + type. */ +int +neon_struct_mem_operand (rtx op) +{ + rtx ind; + + /* Reject eliminable registers. */ + if (! (reload_in_progress || reload_completed) + && ( reg_mentioned_p (frame_pointer_rtx, op) + || reg_mentioned_p (arg_pointer_rtx, op) + || reg_mentioned_p (virtual_incoming_args_rtx, op) + || reg_mentioned_p (virtual_outgoing_args_rtx, op) + || reg_mentioned_p (virtual_stack_dynamic_rtx, op) + || reg_mentioned_p (virtual_stack_vars_rtx, op))) + return FALSE; + + /* Constants are converted into offsets from labels. */ + if (GET_CODE (op) != MEM) + return FALSE; + + ind = XEXP (op, 0); + + if (reload_completed + && (GET_CODE (ind) == LABEL_REF + || (GET_CODE (ind) == CONST + && GET_CODE (XEXP (ind, 0)) == PLUS + && GET_CODE (XEXP (XEXP (ind, 0), 0)) == LABEL_REF + && GET_CODE (XEXP (XEXP (ind, 0), 1)) == CONST_INT))) + return TRUE; + + /* Match: (mem (reg)). */ + if (GET_CODE (ind) == REG) + return arm_address_register_rtx_p (ind, 0); + + return FALSE; +} + +/* Return true if X is a register that will be eliminated later on. */ +int +arm_eliminable_register (rtx x) +{ + return REG_P (x) && (REGNO (x) == FRAME_POINTER_REGNUM + || REGNO (x) == ARG_POINTER_REGNUM + || (REGNO (x) >= FIRST_VIRTUAL_REGISTER + && REGNO (x) <= LAST_VIRTUAL_REGISTER)); +} + +/* Return GENERAL_REGS if a scratch register required to reload x to/from + coprocessor registers. Otherwise return NO_REGS. */ + +enum reg_class +coproc_secondary_reload_class (enum machine_mode mode, rtx x, bool wb) +{ + if (mode == HFmode) + { + if (!TARGET_NEON_FP16) + return GENERAL_REGS; + if (s_register_operand (x, mode) || neon_vector_mem_operand (x, 2)) + return NO_REGS; + return GENERAL_REGS; + } + + if (TARGET_NEON + && (GET_MODE_CLASS (mode) == MODE_VECTOR_INT + || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT) + && neon_vector_mem_operand (x, 0)) + return NO_REGS; + + if (arm_coproc_mem_operand (x, wb) || s_register_operand (x, mode)) + return NO_REGS; + + return GENERAL_REGS; +} + +/* Values which must be returned in the most-significant end of the return + register. */ + +static bool +arm_return_in_msb (const_tree valtype) +{ + return (TARGET_AAPCS_BASED + && BYTES_BIG_ENDIAN + && (AGGREGATE_TYPE_P (valtype) + || TREE_CODE (valtype) == COMPLEX_TYPE)); +} + +/* Returns TRUE if INSN is an "LDR REG, ADDR" instruction. + Use by the Cirrus Maverick code which has to workaround + a hardware bug triggered by such instructions. */ +static bool +arm_memory_load_p (rtx insn) +{ + rtx body, lhs, rhs;; + + if (insn == NULL_RTX || GET_CODE (insn) != INSN) + return false; + + body = PATTERN (insn); + + if (GET_CODE (body) != SET) + return false; + + lhs = XEXP (body, 0); + rhs = XEXP (body, 1); + + lhs = REG_OR_SUBREG_RTX (lhs); + + /* If the destination is not a general purpose + register we do not have to worry. */ + if (GET_CODE (lhs) != REG + || REGNO_REG_CLASS (REGNO (lhs)) != GENERAL_REGS) + return false; + + /* As well as loads from memory we also have to react + to loads of invalid constants which will be turned + into loads from the minipool. */ + return (GET_CODE (rhs) == MEM + || GET_CODE (rhs) == SYMBOL_REF + || note_invalid_constants (insn, -1, false)); +} + +/* Return TRUE if INSN is a Cirrus instruction. */ +static bool +arm_cirrus_insn_p (rtx insn) +{ + enum attr_cirrus attr; + + /* get_attr cannot accept USE or CLOBBER. */ + if (!insn + || GET_CODE (insn) != INSN + || GET_CODE (PATTERN (insn)) == USE + || GET_CODE (PATTERN (insn)) == CLOBBER) + return 0; + + attr = get_attr_cirrus (insn); + + return attr != CIRRUS_NOT; +} + +/* Cirrus reorg for invalid instruction combinations. */ +static void +cirrus_reorg (rtx first) +{ + enum attr_cirrus attr; + rtx body = PATTERN (first); + rtx t; + int nops; + + /* Any branch must be followed by 2 non Cirrus instructions. */ + if (GET_CODE (first) == JUMP_INSN && GET_CODE (body) != RETURN) + { + nops = 0; + t = next_nonnote_insn (first); + + if (arm_cirrus_insn_p (t)) + ++ nops; + + if (arm_cirrus_insn_p (next_nonnote_insn (t))) + ++ nops; + + while (nops --) + emit_insn_after (gen_nop (), first); + + return; + } + + /* (float (blah)) is in parallel with a clobber. */ + if (GET_CODE (body) == PARALLEL && XVECLEN (body, 0) > 0) + body = XVECEXP (body, 0, 0); + + if (GET_CODE (body) == SET) + { + rtx lhs = XEXP (body, 0), rhs = XEXP (body, 1); + + /* cfldrd, cfldr64, cfstrd, cfstr64 must + be followed by a non Cirrus insn. */ + if (get_attr_cirrus (first) == CIRRUS_DOUBLE) + { + if (arm_cirrus_insn_p (next_nonnote_insn (first))) + emit_insn_after (gen_nop (), first); + + return; + } + else if (arm_memory_load_p (first)) + { + unsigned int arm_regno; + + /* Any ldr/cfmvdlr, ldr/cfmvdhr, ldr/cfmvsr, ldr/cfmv64lr, + ldr/cfmv64hr combination where the Rd field is the same + in both instructions must be split with a non Cirrus + insn. Example: + + ldr r0, blah + nop + cfmvsr mvf0, r0. */ + + /* Get Arm register number for ldr insn. */ + if (GET_CODE (lhs) == REG) + arm_regno = REGNO (lhs); + else + { + gcc_assert (GET_CODE (rhs) == REG); + arm_regno = REGNO (rhs); + } + + /* Next insn. */ + first = next_nonnote_insn (first); + + if (! arm_cirrus_insn_p (first)) + return; + + body = PATTERN (first); + + /* (float (blah)) is in parallel with a clobber. */ + if (GET_CODE (body) == PARALLEL && XVECLEN (body, 0)) + body = XVECEXP (body, 0, 0); + + if (GET_CODE (body) == FLOAT) + body = XEXP (body, 0); + + if (get_attr_cirrus (first) == CIRRUS_MOVE + && GET_CODE (XEXP (body, 1)) == REG + && arm_regno == REGNO (XEXP (body, 1))) + emit_insn_after (gen_nop (), first); + + return; + } + } + + /* get_attr cannot accept USE or CLOBBER. */ + if (!first + || GET_CODE (first) != INSN + || GET_CODE (PATTERN (first)) == USE + || GET_CODE (PATTERN (first)) == CLOBBER) + return; + + attr = get_attr_cirrus (first); + + /* Any coprocessor compare instruction (cfcmps, cfcmpd, ...) + must be followed by a non-coprocessor instruction. */ + if (attr == CIRRUS_COMPARE) + { + nops = 0; + + t = next_nonnote_insn (first); + + if (arm_cirrus_insn_p (t)) + ++ nops; + + if (arm_cirrus_insn_p (next_nonnote_insn (t))) + ++ nops; + + while (nops --) + emit_insn_after (gen_nop (), first); + + return; + } +} + +/* Return TRUE if X references a SYMBOL_REF. */ +int +symbol_mentioned_p (rtx x) +{ + const char * fmt; + int i; + + if (GET_CODE (x) == SYMBOL_REF) + return 1; + + /* UNSPEC_TLS entries for a symbol include the SYMBOL_REF, but they + are constant offsets, not symbols. */ + if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS) + return 0; + + fmt = GET_RTX_FORMAT (GET_CODE (x)); + + for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--) + { + if (fmt[i] == 'E') + { + int j; + + for (j = XVECLEN (x, i) - 1; j >= 0; j--) + if (symbol_mentioned_p (XVECEXP (x, i, j))) + return 1; + } + else if (fmt[i] == 'e' && symbol_mentioned_p (XEXP (x, i))) + return 1; + } + + return 0; +} + +/* Return TRUE if X references a LABEL_REF. */ +int +label_mentioned_p (rtx x) +{ + const char * fmt; + int i; + + if (GET_CODE (x) == LABEL_REF) + return 1; + + /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the referencing + instruction, but they are constant offsets, not symbols. */ + if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS) + return 0; + + fmt = GET_RTX_FORMAT (GET_CODE (x)); + for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--) + { + if (fmt[i] == 'E') + { + int j; + + for (j = XVECLEN (x, i) - 1; j >= 0; j--) + if (label_mentioned_p (XVECEXP (x, i, j))) + return 1; + } + else if (fmt[i] == 'e' && label_mentioned_p (XEXP (x, i))) + return 1; + } + + return 0; +} + +int +tls_mentioned_p (rtx x) +{ + switch (GET_CODE (x)) + { + case CONST: + return tls_mentioned_p (XEXP (x, 0)); + + case UNSPEC: + if (XINT (x, 1) == UNSPEC_TLS) + return 1; + + default: + return 0; + } +} + +/* Must not copy any rtx that uses a pc-relative address. */ + +static int +arm_note_pic_base (rtx *x, void *date ATTRIBUTE_UNUSED) +{ + if (GET_CODE (*x) == UNSPEC + && (XINT (*x, 1) == UNSPEC_PIC_BASE + || XINT (*x, 1) == UNSPEC_PIC_UNIFIED)) + return 1; + return 0; +} + +static bool +arm_cannot_copy_insn_p (rtx insn) +{ + return for_each_rtx (&PATTERN (insn), arm_note_pic_base, NULL); +} + +enum rtx_code +minmax_code (rtx x) +{ + enum rtx_code code = GET_CODE (x); + + switch (code) + { + case SMAX: + return GE; + case SMIN: + return LE; + case UMIN: + return LEU; + case UMAX: + return GEU; + default: + gcc_unreachable (); + } +} + +/* Return 1 if memory locations are adjacent. */ +int +adjacent_mem_locations (rtx a, rtx b) +{ + /* We don't guarantee to preserve the order of these memory refs. */ + if (volatile_refs_p (a) || volatile_refs_p (b)) + return 0; + + if ((GET_CODE (XEXP (a, 0)) == REG + || (GET_CODE (XEXP (a, 0)) == PLUS + && GET_CODE (XEXP (XEXP (a, 0), 1)) == CONST_INT)) + && (GET_CODE (XEXP (b, 0)) == REG + || (GET_CODE (XEXP (b, 0)) == PLUS + && GET_CODE (XEXP (XEXP (b, 0), 1)) == CONST_INT))) + { + HOST_WIDE_INT val0 = 0, val1 = 0; + rtx reg0, reg1; + int val_diff; + + if (GET_CODE (XEXP (a, 0)) == PLUS) + { + reg0 = XEXP (XEXP (a, 0), 0); + val0 = INTVAL (XEXP (XEXP (a, 0), 1)); + } + else + reg0 = XEXP (a, 0); + + if (GET_CODE (XEXP (b, 0)) == PLUS) + { + reg1 = XEXP (XEXP (b, 0), 0); + val1 = INTVAL (XEXP (XEXP (b, 0), 1)); + } + else + reg1 = XEXP (b, 0); + + /* Don't accept any offset that will require multiple + instructions to handle, since this would cause the + arith_adjacentmem pattern to output an overlong sequence. */ + if (!const_ok_for_op (val0, PLUS) || !const_ok_for_op (val1, PLUS)) + return 0; + + /* Don't allow an eliminable register: register elimination can make + the offset too large. */ + if (arm_eliminable_register (reg0)) + return 0; + + val_diff = val1 - val0; + + if (arm_ld_sched) + { + /* If the target has load delay slots, then there's no benefit + to using an ldm instruction unless the offset is zero and + we are optimizing for size. */ + return (optimize_size && (REGNO (reg0) == REGNO (reg1)) + && (val0 == 0 || val1 == 0 || val0 == 4 || val1 == 4) + && (val_diff == 4 || val_diff == -4)); + } + + return ((REGNO (reg0) == REGNO (reg1)) + && (val_diff == 4 || val_diff == -4)); + } + + return 0; +} + +/* Return true iff it would be profitable to turn a sequence of NOPS loads + or stores (depending on IS_STORE) into a load-multiple or store-multiple + instruction. ADD_OFFSET is nonzero if the base address register needs + to be modified with an add instruction before we can use it. */ + +static bool +multiple_operation_profitable_p (bool is_store ATTRIBUTE_UNUSED, + int nops, HOST_WIDE_INT add_offset) + { + /* For ARM8,9 & StrongARM, 2 ldr instructions are faster than an ldm + if the offset isn't small enough. The reason 2 ldrs are faster + is because these ARMs are able to do more than one cache access + in a single cycle. The ARM9 and StrongARM have Harvard caches, + whilst the ARM8 has a double bandwidth cache. This means that + these cores can do both an instruction fetch and a data fetch in + a single cycle, so the trick of calculating the address into a + scratch register (one of the result regs) and then doing a load + multiple actually becomes slower (and no smaller in code size). + That is the transformation + + ldr rd1, [rbase + offset] + ldr rd2, [rbase + offset + 4] + + to + + add rd1, rbase, offset + ldmia rd1, {rd1, rd2} + + produces worse code -- '3 cycles + any stalls on rd2' instead of + '2 cycles + any stalls on rd2'. On ARMs with only one cache + access per cycle, the first sequence could never complete in less + than 6 cycles, whereas the ldm sequence would only take 5 and + would make better use of sequential accesses if not hitting the + cache. + + We cheat here and test 'arm_ld_sched' which we currently know to + only be true for the ARM8, ARM9 and StrongARM. If this ever + changes, then the test below needs to be reworked. */ + if (nops == 2 && arm_ld_sched && add_offset != 0) + return false; + + /* XScale has load-store double instructions, but they have stricter + alignment requirements than load-store multiple, so we cannot + use them. + + For XScale ldm requires 2 + NREGS cycles to complete and blocks + the pipeline until completion. + + NREGS CYCLES + 1 3 + 2 4 + 3 5 + 4 6 + + An ldr instruction takes 1-3 cycles, but does not block the + pipeline. + + NREGS CYCLES + 1 1-3 + 2 2-6 + 3 3-9 + 4 4-12 + + Best case ldr will always win. However, the more ldr instructions + we issue, the less likely we are to be able to schedule them well. + Using ldr instructions also increases code size. + + As a compromise, we use ldr for counts of 1 or 2 regs, and ldm + for counts of 3 or 4 regs. */ + if (nops <= 2 && arm_tune_xscale && !optimize_size) + return false; + return true; +} + +/* Subroutine of load_multiple_sequence and store_multiple_sequence. + Given an array of UNSORTED_OFFSETS, of which there are NOPS, compute + an array ORDER which describes the sequence to use when accessing the + offsets that produces an ascending order. In this sequence, each + offset must be larger by exactly 4 than the previous one. ORDER[0] + must have been filled in with the lowest offset by the caller. + If UNSORTED_REGS is nonnull, it is an array of register numbers that + we use to verify that ORDER produces an ascending order of registers. + Return true if it was possible to construct such an order, false if + not. */ + +static bool +compute_offset_order (int nops, HOST_WIDE_INT *unsorted_offsets, int *order, + int *unsorted_regs) +{ + int i; + for (i = 1; i < nops; i++) + { + int j; + + order[i] = order[i - 1]; + for (j = 0; j < nops; j++) + if (unsorted_offsets[j] == unsorted_offsets[order[i - 1]] + 4) + { + /* We must find exactly one offset that is higher than the + previous one by 4. */ + if (order[i] != order[i - 1]) + return false; + order[i] = j; + } + if (order[i] == order[i - 1]) + return false; + /* The register numbers must be ascending. */ + if (unsorted_regs != NULL + && unsorted_regs[order[i]] <= unsorted_regs[order[i - 1]]) + return false; + } + return true; +} + +/* Used to determine in a peephole whether a sequence of load + instructions can be changed into a load-multiple instruction. + NOPS is the number of separate load instructions we are examining. The + first NOPS entries in OPERANDS are the destination registers, the + next NOPS entries are memory operands. If this function is + successful, *BASE is set to the common base register of the memory + accesses; *LOAD_OFFSET is set to the first memory location's offset + from that base register. + REGS is an array filled in with the destination register numbers. + SAVED_ORDER (if nonnull), is an array filled in with an order that maps + insn numbers to to an ascending order of stores. If CHECK_REGS is true, + the sequence of registers in REGS matches the loads from ascending memory + locations, and the function verifies that the register numbers are + themselves ascending. If CHECK_REGS is false, the register numbers + are stored in the order they are found in the operands. */ +static int +load_multiple_sequence (rtx *operands, int nops, int *regs, int *saved_order, + int *base, HOST_WIDE_INT *load_offset, bool check_regs) +{ + int unsorted_regs[MAX_LDM_STM_OPS]; + HOST_WIDE_INT unsorted_offsets[MAX_LDM_STM_OPS]; + int order[MAX_LDM_STM_OPS]; + rtx base_reg_rtx = NULL; + int base_reg = -1; + int i, ldm_case; + + /* Can only handle up to MAX_LDM_STM_OPS insns at present, though could be + easily extended if required. */ + gcc_assert (nops >= 2 && nops <= MAX_LDM_STM_OPS); + + memset (order, 0, MAX_LDM_STM_OPS * sizeof (int)); + + /* Loop over the operands and check that the memory references are + suitable (i.e. immediate offsets from the same base register). At + the same time, extract the target register, and the memory + offsets. */ + for (i = 0; i < nops; i++) + { + rtx reg; + rtx offset; + + /* Convert a subreg of a mem into the mem itself. */ + if (GET_CODE (operands[nops + i]) == SUBREG) + operands[nops + i] = alter_subreg (operands + (nops + i)); + + gcc_assert (GET_CODE (operands[nops + i]) == MEM); + + /* Don't reorder volatile memory references; it doesn't seem worth + looking for the case where the order is ok anyway. */ + if (MEM_VOLATILE_P (operands[nops + i])) + return 0; + + offset = const0_rtx; + + if ((GET_CODE (reg = XEXP (operands[nops + i], 0)) == REG + || (GET_CODE (reg) == SUBREG + && GET_CODE (reg = SUBREG_REG (reg)) == REG)) + || (GET_CODE (XEXP (operands[nops + i], 0)) == PLUS + && ((GET_CODE (reg = XEXP (XEXP (operands[nops + i], 0), 0)) + == REG) + || (GET_CODE (reg) == SUBREG + && GET_CODE (reg = SUBREG_REG (reg)) == REG)) + && (GET_CODE (offset = XEXP (XEXP (operands[nops + i], 0), 1)) + == CONST_INT))) + { + if (i == 0) + { + base_reg = REGNO (reg); + base_reg_rtx = reg; + if (TARGET_THUMB1 && base_reg > LAST_LO_REGNUM) + return 0; + } + else if (base_reg != (int) REGNO (reg)) + /* Not addressed from the same base register. */ + return 0; + + unsorted_regs[i] = (GET_CODE (operands[i]) == REG + ? REGNO (operands[i]) + : REGNO (SUBREG_REG (operands[i]))); + + /* If it isn't an integer register, or if it overwrites the + base register but isn't the last insn in the list, then + we can't do this. */ + if (unsorted_regs[i] < 0 + || (TARGET_THUMB1 && unsorted_regs[i] > LAST_LO_REGNUM) + || unsorted_regs[i] > 14 + || (i != nops - 1 && unsorted_regs[i] == base_reg)) + return 0; + + unsorted_offsets[i] = INTVAL (offset); + if (i == 0 || unsorted_offsets[i] < unsorted_offsets[order[0]]) + order[0] = i; + } + else + /* Not a suitable memory address. */ + return 0; + } + + /* All the useful information has now been extracted from the + operands into unsorted_regs and unsorted_offsets; additionally, + order[0] has been set to the lowest offset in the list. Sort + the offsets into order, verifying that they are adjacent, and + check that the register numbers are ascending. */ + if (!compute_offset_order (nops, unsorted_offsets, order, + check_regs ? unsorted_regs : NULL)) + return 0; + + if (saved_order) + memcpy (saved_order, order, sizeof order); + + if (base) + { + *base = base_reg; + + for (i = 0; i < nops; i++) + regs[i] = unsorted_regs[check_regs ? order[i] : i]; + + *load_offset = unsorted_offsets[order[0]]; + } + + if (TARGET_THUMB1 + && !peep2_reg_dead_p (nops, base_reg_rtx)) + return 0; + + if (unsorted_offsets[order[0]] == 0) + ldm_case = 1; /* ldmia */ + else if (TARGET_ARM && unsorted_offsets[order[0]] == 4) + ldm_case = 2; /* ldmib */ + else if (TARGET_ARM && unsorted_offsets[order[nops - 1]] == 0) + ldm_case = 3; /* ldmda */ + else if (TARGET_32BIT && unsorted_offsets[order[nops - 1]] == -4) + ldm_case = 4; /* ldmdb */ + else if (const_ok_for_arm (unsorted_offsets[order[0]]) + || const_ok_for_arm (-unsorted_offsets[order[0]])) + ldm_case = 5; + else + return 0; + + if (!multiple_operation_profitable_p (false, nops, + ldm_case == 5 + ? unsorted_offsets[order[0]] : 0)) + return 0; + + return ldm_case; +} + +/* Used to determine in a peephole whether a sequence of store instructions can + be changed into a store-multiple instruction. + NOPS is the number of separate store instructions we are examining. + NOPS_TOTAL is the total number of instructions recognized by the peephole + pattern. + The first NOPS entries in OPERANDS are the source registers, the next + NOPS entries are memory operands. If this function is successful, *BASE is + set to the common base register of the memory accesses; *LOAD_OFFSET is set + to the first memory location's offset from that base register. REGS is an + array filled in with the source register numbers, REG_RTXS (if nonnull) is + likewise filled with the corresponding rtx's. + SAVED_ORDER (if nonnull), is an array filled in with an order that maps insn + numbers to to an ascending order of stores. + If CHECK_REGS is true, the sequence of registers in *REGS matches the stores + from ascending memory locations, and the function verifies that the register + numbers are themselves ascending. If CHECK_REGS is false, the register + numbers are stored in the order they are found in the operands. */ +static int +store_multiple_sequence (rtx *operands, int nops, int nops_total, + int *regs, rtx *reg_rtxs, int *saved_order, int *base, + HOST_WIDE_INT *load_offset, bool check_regs) +{ + int unsorted_regs[MAX_LDM_STM_OPS]; + rtx unsorted_reg_rtxs[MAX_LDM_STM_OPS]; + HOST_WIDE_INT unsorted_offsets[MAX_LDM_STM_OPS]; + int order[MAX_LDM_STM_OPS]; + int base_reg = -1; + rtx base_reg_rtx = NULL; + int i, stm_case; + + /* Write back of base register is currently only supported for Thumb 1. */ + int base_writeback = TARGET_THUMB1; + + /* Can only handle up to MAX_LDM_STM_OPS insns at present, though could be + easily extended if required. */ + gcc_assert (nops >= 2 && nops <= MAX_LDM_STM_OPS); + + memset (order, 0, MAX_LDM_STM_OPS * sizeof (int)); + + /* Loop over the operands and check that the memory references are + suitable (i.e. immediate offsets from the same base register). At + the same time, extract the target register, and the memory + offsets. */ + for (i = 0; i < nops; i++) + { + rtx reg; + rtx offset; + + /* Convert a subreg of a mem into the mem itself. */ + if (GET_CODE (operands[nops + i]) == SUBREG) + operands[nops + i] = alter_subreg (operands + (nops + i)); + + gcc_assert (GET_CODE (operands[nops + i]) == MEM); + + /* Don't reorder volatile memory references; it doesn't seem worth + looking for the case where the order is ok anyway. */ + if (MEM_VOLATILE_P (operands[nops + i])) + return 0; + + offset = const0_rtx; + + if ((GET_CODE (reg = XEXP (operands[nops + i], 0)) == REG + || (GET_CODE (reg) == SUBREG + && GET_CODE (reg = SUBREG_REG (reg)) == REG)) + || (GET_CODE (XEXP (operands[nops + i], 0)) == PLUS + && ((GET_CODE (reg = XEXP (XEXP (operands[nops + i], 0), 0)) + == REG) + || (GET_CODE (reg) == SUBREG + && GET_CODE (reg = SUBREG_REG (reg)) == REG)) + && (GET_CODE (offset = XEXP (XEXP (operands[nops + i], 0), 1)) + == CONST_INT))) + { + unsorted_reg_rtxs[i] = (GET_CODE (operands[i]) == REG + ? operands[i] : SUBREG_REG (operands[i])); + unsorted_regs[i] = REGNO (unsorted_reg_rtxs[i]); + + if (i == 0) + { + base_reg = REGNO (reg); + base_reg_rtx = reg; + if (TARGET_THUMB1 && base_reg > LAST_LO_REGNUM) + return 0; + } + else if (base_reg != (int) REGNO (reg)) + /* Not addressed from the same base register. */ + return 0; + + /* If it isn't an integer register, then we can't do this. */ + if (unsorted_regs[i] < 0 + || (TARGET_THUMB1 && unsorted_regs[i] > LAST_LO_REGNUM) + /* The effects are unpredictable if the base register is + both updated and stored. */ + || (base_writeback && unsorted_regs[i] == base_reg) + || (TARGET_THUMB2 && unsorted_regs[i] == SP_REGNUM) + || unsorted_regs[i] > 14) + return 0; + + unsorted_offsets[i] = INTVAL (offset); + if (i == 0 || unsorted_offsets[i] < unsorted_offsets[order[0]]) + order[0] = i; + } + else + /* Not a suitable memory address. */ + return 0; + } + + /* All the useful information has now been extracted from the + operands into unsorted_regs and unsorted_offsets; additionally, + order[0] has been set to the lowest offset in the list. Sort + the offsets into order, verifying that they are adjacent, and + check that the register numbers are ascending. */ + if (!compute_offset_order (nops, unsorted_offsets, order, + check_regs ? unsorted_regs : NULL)) + return 0; + + if (saved_order) + memcpy (saved_order, order, sizeof order); + + if (base) + { + *base = base_reg; + + for (i = 0; i < nops; i++) + { + regs[i] = unsorted_regs[check_regs ? order[i] : i]; + if (reg_rtxs) + reg_rtxs[i] = unsorted_reg_rtxs[check_regs ? order[i] : i]; + } + + *load_offset = unsorted_offsets[order[0]]; + } + + if (TARGET_THUMB1 + && !peep2_reg_dead_p (nops_total, base_reg_rtx)) + return 0; + + if (unsorted_offsets[order[0]] == 0) + stm_case = 1; /* stmia */ + else if (TARGET_ARM && unsorted_offsets[order[0]] == 4) + stm_case = 2; /* stmib */ + else if (TARGET_ARM && unsorted_offsets[order[nops - 1]] == 0) + stm_case = 3; /* stmda */ + else if (TARGET_32BIT && unsorted_offsets[order[nops - 1]] == -4) + stm_case = 4; /* stmdb */ + else + return 0; + + if (!multiple_operation_profitable_p (false, nops, 0)) + return 0; + + return stm_case; +} + +/* Routines for use in generating RTL. */ + +/* Generate a load-multiple instruction. COUNT is the number of loads in + the instruction; REGS and MEMS are arrays containing the operands. + BASEREG is the base register to be used in addressing the memory operands. + WBACK_OFFSET is nonzero if the instruction should update the base + register. */ + +static rtx +arm_gen_load_multiple_1 (int count, int *regs, rtx *mems, rtx basereg, + HOST_WIDE_INT wback_offset) +{ + int i = 0, j; + rtx result; + + if (!multiple_operation_profitable_p (false, count, 0)) + { + rtx seq; + + start_sequence (); + + for (i = 0; i < count; i++) + emit_move_insn (gen_rtx_REG (SImode, regs[i]), mems[i]); + + if (wback_offset != 0) + emit_move_insn (basereg, plus_constant (basereg, wback_offset)); + + seq = get_insns (); + end_sequence (); + + return seq; + } + + result = gen_rtx_PARALLEL (VOIDmode, + rtvec_alloc (count + (wback_offset != 0 ? 1 : 0))); + if (wback_offset != 0) + { + XVECEXP (result, 0, 0) + = gen_rtx_SET (VOIDmode, basereg, + plus_constant (basereg, wback_offset)); + i = 1; + count++; + } + + for (j = 0; i < count; i++, j++) + XVECEXP (result, 0, i) + = gen_rtx_SET (VOIDmode, gen_rtx_REG (SImode, regs[j]), mems[j]); + + return result; +} + +/* Generate a store-multiple instruction. COUNT is the number of stores in + the instruction; REGS and MEMS are arrays containing the operands. + BASEREG is the base register to be used in addressing the memory operands. + WBACK_OFFSET is nonzero if the instruction should update the base + register. */ + +static rtx +arm_gen_store_multiple_1 (int count, int *regs, rtx *mems, rtx basereg, + HOST_WIDE_INT wback_offset) +{ + int i = 0, j; + rtx result; + + if (GET_CODE (basereg) == PLUS) + basereg = XEXP (basereg, 0); + + if (!multiple_operation_profitable_p (false, count, 0)) + { + rtx seq; + + start_sequence (); + + for (i = 0; i < count; i++) + emit_move_insn (mems[i], gen_rtx_REG (SImode, regs[i])); + + if (wback_offset != 0) + emit_move_insn (basereg, plus_constant (basereg, wback_offset)); + + seq = get_insns (); + end_sequence (); + + return seq; + } + + result = gen_rtx_PARALLEL (VOIDmode, + rtvec_alloc (count + (wback_offset != 0 ? 1 : 0))); + if (wback_offset != 0) + { + XVECEXP (result, 0, 0) + = gen_rtx_SET (VOIDmode, basereg, + plus_constant (basereg, wback_offset)); + i = 1; + count++; + } + + for (j = 0; i < count; i++, j++) + XVECEXP (result, 0, i) + = gen_rtx_SET (VOIDmode, mems[j], gen_rtx_REG (SImode, regs[j])); + + return result; +} + +/* Generate either a load-multiple or a store-multiple instruction. This + function can be used in situations where we can start with a single MEM + rtx and adjust its address upwards. + COUNT is the number of operations in the instruction, not counting a + possible update of the base register. REGS is an array containing the + register operands. + BASEREG is the base register to be used in addressing the memory operands, + which are constructed from BASEMEM. + WRITE_BACK specifies whether the generated instruction should include an + update of the base register. + OFFSETP is used to pass an offset to and from this function; this offset + is not used when constructing the address (instead BASEMEM should have an + appropriate offset in its address), it is used only for setting + MEM_OFFSET. It is updated only if WRITE_BACK is true.*/ + +static rtx +arm_gen_multiple_op (bool is_load, int *regs, int count, rtx basereg, + bool write_back, rtx basemem, HOST_WIDE_INT *offsetp) +{ + rtx mems[MAX_LDM_STM_OPS]; + HOST_WIDE_INT offset = *offsetp; + int i; + + gcc_assert (count <= MAX_LDM_STM_OPS); + + if (GET_CODE (basereg) == PLUS) + basereg = XEXP (basereg, 0); + + for (i = 0; i < count; i++) + { + rtx addr = plus_constant (basereg, i * 4); + mems[i] = adjust_automodify_address_nv (basemem, SImode, addr, offset); + offset += 4; + } + + if (write_back) + *offsetp = offset; + + if (is_load) + return arm_gen_load_multiple_1 (count, regs, mems, basereg, + write_back ? 4 * count : 0); + else + return arm_gen_store_multiple_1 (count, regs, mems, basereg, + write_back ? 4 * count : 0); +} + +rtx +arm_gen_load_multiple (int *regs, int count, rtx basereg, int write_back, + rtx basemem, HOST_WIDE_INT *offsetp) +{ + return arm_gen_multiple_op (TRUE, regs, count, basereg, write_back, basemem, + offsetp); +} + +rtx +arm_gen_store_multiple (int *regs, int count, rtx basereg, int write_back, + rtx basemem, HOST_WIDE_INT *offsetp) +{ + return arm_gen_multiple_op (FALSE, regs, count, basereg, write_back, basemem, + offsetp); +} + +/* Called from a peephole2 expander to turn a sequence of loads into an + LDM instruction. OPERANDS are the operands found by the peephole matcher; + NOPS indicates how many separate loads we are trying to combine. SORT_REGS + is true if we can reorder the registers because they are used commutatively + subsequently. + Returns true iff we could generate a new instruction. */ + +bool +gen_ldm_seq (rtx *operands, int nops, bool sort_regs) +{ + int regs[MAX_LDM_STM_OPS], mem_order[MAX_LDM_STM_OPS]; + rtx mems[MAX_LDM_STM_OPS]; + int i, j, base_reg; + rtx base_reg_rtx; + HOST_WIDE_INT offset; + int write_back = FALSE; + int ldm_case; + rtx addr; + + ldm_case = load_multiple_sequence (operands, nops, regs, mem_order, + &base_reg, &offset, !sort_regs); + + if (ldm_case == 0) + return false; + + if (sort_regs) + for (i = 0; i < nops - 1; i++) + for (j = i + 1; j < nops; j++) + if (regs[i] > regs[j]) + { + int t = regs[i]; + regs[i] = regs[j]; + regs[j] = t; + } + base_reg_rtx = gen_rtx_REG (Pmode, base_reg); + + if (TARGET_THUMB1) + { + gcc_assert (peep2_reg_dead_p (nops, base_reg_rtx)); + gcc_assert (ldm_case == 1 || ldm_case == 5); + write_back = TRUE; + } + + if (ldm_case == 5) + { + rtx newbase = TARGET_THUMB1 ? base_reg_rtx : gen_rtx_REG (SImode, regs[0]); + emit_insn (gen_addsi3 (newbase, base_reg_rtx, GEN_INT (offset))); + offset = 0; + if (!TARGET_THUMB1) + { + base_reg = regs[0]; + base_reg_rtx = newbase; + } + } + + for (i = 0; i < nops; i++) + { + addr = plus_constant (base_reg_rtx, offset + i * 4); + mems[i] = adjust_automodify_address_nv (operands[nops + mem_order[i]], + SImode, addr, 0); + } + emit_insn (arm_gen_load_multiple_1 (nops, regs, mems, base_reg_rtx, + write_back ? offset + i * 4 : 0)); + return true; +} + +/* Called from a peephole2 expander to turn a sequence of stores into an + STM instruction. OPERANDS are the operands found by the peephole matcher; + NOPS indicates how many separate stores we are trying to combine. + Returns true iff we could generate a new instruction. */ + +bool +gen_stm_seq (rtx *operands, int nops) +{ + int i; + int regs[MAX_LDM_STM_OPS], mem_order[MAX_LDM_STM_OPS]; + rtx mems[MAX_LDM_STM_OPS]; + int base_reg; + rtx base_reg_rtx; + HOST_WIDE_INT offset; + int write_back = FALSE; + int stm_case; + rtx addr; + bool base_reg_dies; + + stm_case = store_multiple_sequence (operands, nops, nops, regs, NULL, + mem_order, &base_reg, &offset, true); + + if (stm_case == 0) + return false; + + base_reg_rtx = gen_rtx_REG (Pmode, base_reg); + + base_reg_dies = peep2_reg_dead_p (nops, base_reg_rtx); + if (TARGET_THUMB1) + { + gcc_assert (base_reg_dies); + write_back = TRUE; + } + + if (stm_case == 5) + { + gcc_assert (base_reg_dies); + emit_insn (gen_addsi3 (base_reg_rtx, base_reg_rtx, GEN_INT (offset))); + offset = 0; + } + + addr = plus_constant (base_reg_rtx, offset); + + for (i = 0; i < nops; i++) + { + addr = plus_constant (base_reg_rtx, offset + i * 4); + mems[i] = adjust_automodify_address_nv (operands[nops + mem_order[i]], + SImode, addr, 0); + } + emit_insn (arm_gen_store_multiple_1 (nops, regs, mems, base_reg_rtx, + write_back ? offset + i * 4 : 0)); + return true; +} + +/* Called from a peephole2 expander to turn a sequence of stores that are + preceded by constant loads into an STM instruction. OPERANDS are the + operands found by the peephole matcher; NOPS indicates how many + separate stores we are trying to combine; there are 2 * NOPS + instructions in the peephole. + Returns true iff we could generate a new instruction. */ + +bool +gen_const_stm_seq (rtx *operands, int nops) +{ + int regs[MAX_LDM_STM_OPS], sorted_regs[MAX_LDM_STM_OPS]; + int reg_order[MAX_LDM_STM_OPS], mem_order[MAX_LDM_STM_OPS]; + rtx reg_rtxs[MAX_LDM_STM_OPS], orig_reg_rtxs[MAX_LDM_STM_OPS]; + rtx mems[MAX_LDM_STM_OPS]; + int base_reg; + rtx base_reg_rtx; + HOST_WIDE_INT offset; + int write_back = FALSE; + int stm_case; + rtx addr; + bool base_reg_dies; + int i, j; + HARD_REG_SET allocated; + + stm_case = store_multiple_sequence (operands, nops, 2 * nops, regs, reg_rtxs, + mem_order, &base_reg, &offset, false); + + if (stm_case == 0) + return false; + + memcpy (orig_reg_rtxs, reg_rtxs, sizeof orig_reg_rtxs); + + /* If the same register is used more than once, try to find a free + register. */ + CLEAR_HARD_REG_SET (allocated); + for (i = 0; i < nops; i++) + { + for (j = i + 1; j < nops; j++) + if (regs[i] == regs[j]) + { + rtx t = peep2_find_free_register (0, nops * 2, + TARGET_THUMB1 ? "l" : "r", + SImode, &allocated); + if (t == NULL_RTX) + return false; + reg_rtxs[i] = t; + regs[i] = REGNO (t); + } + } + + /* Compute an ordering that maps the register numbers to an ascending + sequence. */ + reg_order[0] = 0; + for (i = 0; i < nops; i++) + if (regs[i] < regs[reg_order[0]]) + reg_order[0] = i; + + for (i = 1; i < nops; i++) + { + int this_order = reg_order[i - 1]; + for (j = 0; j < nops; j++) + if (regs[j] > regs[reg_order[i - 1]] + && (this_order == reg_order[i - 1] + || regs[j] < regs[this_order])) + this_order = j; + reg_order[i] = this_order; + } + + /* Ensure that registers that must be live after the instruction end + up with the correct value. */ + for (i = 0; i < nops; i++) + { + int this_order = reg_order[i]; + if ((this_order != mem_order[i] + || orig_reg_rtxs[this_order] != reg_rtxs[this_order]) + && !peep2_reg_dead_p (nops * 2, orig_reg_rtxs[this_order])) + return false; + } + + /* Load the constants. */ + for (i = 0; i < nops; i++) + { + rtx op = operands[2 * nops + mem_order[i]]; + sorted_regs[i] = regs[reg_order[i]]; + emit_move_insn (reg_rtxs[reg_order[i]], op); + } + + base_reg_rtx = gen_rtx_REG (Pmode, base_reg); + + base_reg_dies = peep2_reg_dead_p (nops * 2, base_reg_rtx); + if (TARGET_THUMB1) + { + gcc_assert (base_reg_dies); + write_back = TRUE; + } + + if (stm_case == 5) + { + gcc_assert (base_reg_dies); + emit_insn (gen_addsi3 (base_reg_rtx, base_reg_rtx, GEN_INT (offset))); + offset = 0; + } + + addr = plus_constant (base_reg_rtx, offset); + + for (i = 0; i < nops; i++) + { + addr = plus_constant (base_reg_rtx, offset + i * 4); + mems[i] = adjust_automodify_address_nv (operands[nops + mem_order[i]], + SImode, addr, 0); + } + emit_insn (arm_gen_store_multiple_1 (nops, sorted_regs, mems, base_reg_rtx, + write_back ? offset + i * 4 : 0)); + return true; +} + +int +arm_gen_movmemqi (rtx *operands) +{ + HOST_WIDE_INT in_words_to_go, out_words_to_go, last_bytes; + HOST_WIDE_INT srcoffset, dstoffset; + int i; + rtx src, dst, srcbase, dstbase; + rtx part_bytes_reg = NULL; + rtx mem; + + if (GET_CODE (operands[2]) != CONST_INT + || GET_CODE (operands[3]) != CONST_INT + || INTVAL (operands[2]) > 64 + || INTVAL (operands[3]) & 3) + return 0; + + dstbase = operands[0]; + srcbase = operands[1]; + + dst = copy_to_mode_reg (SImode, XEXP (dstbase, 0)); + src = copy_to_mode_reg (SImode, XEXP (srcbase, 0)); + + in_words_to_go = ARM_NUM_INTS (INTVAL (operands[2])); + out_words_to_go = INTVAL (operands[2]) / 4; + last_bytes = INTVAL (operands[2]) & 3; + dstoffset = srcoffset = 0; + + if (out_words_to_go != in_words_to_go && ((in_words_to_go - 1) & 3) != 0) + part_bytes_reg = gen_rtx_REG (SImode, (in_words_to_go - 1) & 3); + + for (i = 0; in_words_to_go >= 2; i+=4) + { + if (in_words_to_go > 4) + emit_insn (arm_gen_load_multiple (arm_regs_in_sequence, 4, src, + TRUE, srcbase, &srcoffset)); + else + emit_insn (arm_gen_load_multiple (arm_regs_in_sequence, in_words_to_go, + src, FALSE, srcbase, + &srcoffset)); + + if (out_words_to_go) + { + if (out_words_to_go > 4) + emit_insn (arm_gen_store_multiple (arm_regs_in_sequence, 4, dst, + TRUE, dstbase, &dstoffset)); + else if (out_words_to_go != 1) + emit_insn (arm_gen_store_multiple (arm_regs_in_sequence, + out_words_to_go, dst, + (last_bytes == 0 + ? FALSE : TRUE), + dstbase, &dstoffset)); + else + { + mem = adjust_automodify_address (dstbase, SImode, dst, dstoffset); + emit_move_insn (mem, gen_rtx_REG (SImode, 0)); + if (last_bytes != 0) + { + emit_insn (gen_addsi3 (dst, dst, GEN_INT (4))); + dstoffset += 4; + } + } + } + + in_words_to_go -= in_words_to_go < 4 ? in_words_to_go : 4; + out_words_to_go -= out_words_to_go < 4 ? out_words_to_go : 4; + } + + /* OUT_WORDS_TO_GO will be zero here if there are byte stores to do. */ + if (out_words_to_go) + { + rtx sreg; + + mem = adjust_automodify_address (srcbase, SImode, src, srcoffset); + sreg = copy_to_reg (mem); + + mem = adjust_automodify_address (dstbase, SImode, dst, dstoffset); + emit_move_insn (mem, sreg); + in_words_to_go--; + + gcc_assert (!in_words_to_go); /* Sanity check */ + } + + if (in_words_to_go) + { + gcc_assert (in_words_to_go > 0); + + mem = adjust_automodify_address (srcbase, SImode, src, srcoffset); + part_bytes_reg = copy_to_mode_reg (SImode, mem); + } + + gcc_assert (!last_bytes || part_bytes_reg); + + if (BYTES_BIG_ENDIAN && last_bytes) + { + rtx tmp = gen_reg_rtx (SImode); + + /* The bytes we want are in the top end of the word. */ + emit_insn (gen_lshrsi3 (tmp, part_bytes_reg, + GEN_INT (8 * (4 - last_bytes)))); + part_bytes_reg = tmp; + + while (last_bytes) + { + mem = adjust_automodify_address (dstbase, QImode, + plus_constant (dst, last_bytes - 1), + dstoffset + last_bytes - 1); + emit_move_insn (mem, gen_lowpart (QImode, part_bytes_reg)); + + if (--last_bytes) + { + tmp = gen_reg_rtx (SImode); + emit_insn (gen_lshrsi3 (tmp, part_bytes_reg, GEN_INT (8))); + part_bytes_reg = tmp; + } + } + + } + else + { + if (last_bytes > 1) + { + mem = adjust_automodify_address (dstbase, HImode, dst, dstoffset); + emit_move_insn (mem, gen_lowpart (HImode, part_bytes_reg)); + last_bytes -= 2; + if (last_bytes) + { + rtx tmp = gen_reg_rtx (SImode); + emit_insn (gen_addsi3 (dst, dst, const2_rtx)); + emit_insn (gen_lshrsi3 (tmp, part_bytes_reg, GEN_INT (16))); + part_bytes_reg = tmp; + dstoffset += 2; + } + } + + if (last_bytes) + { + mem = adjust_automodify_address (dstbase, QImode, dst, dstoffset); + emit_move_insn (mem, gen_lowpart (QImode, part_bytes_reg)); + } + } + + return 1; +} + +/* Select a dominance comparison mode if possible for a test of the general + form (OP (COND_OR (X) (Y)) (const_int 0)). We support three forms. + COND_OR == DOM_CC_X_AND_Y => (X && Y) + COND_OR == DOM_CC_NX_OR_Y => ((! X) || Y) + COND_OR == DOM_CC_X_OR_Y => (X || Y) + In all cases OP will be either EQ or NE, but we don't need to know which + here. If we are unable to support a dominance comparison we return + CC mode. This will then fail to match for the RTL expressions that + generate this call. */ +enum machine_mode +arm_select_dominance_cc_mode (rtx x, rtx y, HOST_WIDE_INT cond_or) +{ + enum rtx_code cond1, cond2; + int swapped = 0; + + /* Currently we will probably get the wrong result if the individual + comparisons are not simple. This also ensures that it is safe to + reverse a comparison if necessary. */ + if ((arm_select_cc_mode (cond1 = GET_CODE (x), XEXP (x, 0), XEXP (x, 1)) + != CCmode) + || (arm_select_cc_mode (cond2 = GET_CODE (y), XEXP (y, 0), XEXP (y, 1)) + != CCmode)) + return CCmode; + + /* The if_then_else variant of this tests the second condition if the + first passes, but is true if the first fails. Reverse the first + condition to get a true "inclusive-or" expression. */ + if (cond_or == DOM_CC_NX_OR_Y) + cond1 = reverse_condition (cond1); + + /* If the comparisons are not equal, and one doesn't dominate the other, + then we can't do this. */ + if (cond1 != cond2 + && !comparison_dominates_p (cond1, cond2) + && (swapped = 1, !comparison_dominates_p (cond2, cond1))) + return CCmode; + + if (swapped) + { + enum rtx_code temp = cond1; + cond1 = cond2; + cond2 = temp; + } + + switch (cond1) + { + case EQ: + if (cond_or == DOM_CC_X_AND_Y) + return CC_DEQmode; + + switch (cond2) + { + case EQ: return CC_DEQmode; + case LE: return CC_DLEmode; + case LEU: return CC_DLEUmode; + case GE: return CC_DGEmode; + case GEU: return CC_DGEUmode; + default: gcc_unreachable (); + } + + case LT: + if (cond_or == DOM_CC_X_AND_Y) + return CC_DLTmode; + + switch (cond2) + { + case LT: + return CC_DLTmode; + case LE: + return CC_DLEmode; + case NE: + return CC_DNEmode; + default: + gcc_unreachable (); + } + + case GT: + if (cond_or == DOM_CC_X_AND_Y) + return CC_DGTmode; + + switch (cond2) + { + case GT: + return CC_DGTmode; + case GE: + return CC_DGEmode; + case NE: + return CC_DNEmode; + default: + gcc_unreachable (); + } + + case LTU: + if (cond_or == DOM_CC_X_AND_Y) + return CC_DLTUmode; + + switch (cond2) + { + case LTU: + return CC_DLTUmode; + case LEU: + return CC_DLEUmode; + case NE: + return CC_DNEmode; + default: + gcc_unreachable (); + } + + case GTU: + if (cond_or == DOM_CC_X_AND_Y) + return CC_DGTUmode; + + switch (cond2) + { + case GTU: + return CC_DGTUmode; + case GEU: + return CC_DGEUmode; + case NE: + return CC_DNEmode; + default: + gcc_unreachable (); + } + + /* The remaining cases only occur when both comparisons are the + same. */ + case NE: + gcc_assert (cond1 == cond2); + return CC_DNEmode; + + case LE: + gcc_assert (cond1 == cond2); + return CC_DLEmode; + + case GE: + gcc_assert (cond1 == cond2); + return CC_DGEmode; + + case LEU: + gcc_assert (cond1 == cond2); + return CC_DLEUmode; + + case GEU: + gcc_assert (cond1 == cond2); + return CC_DGEUmode; + + default: + gcc_unreachable (); + } +} + +enum machine_mode +arm_select_cc_mode (enum rtx_code op, rtx x, rtx y) +{ + /* All floating point compares return CCFP if it is an equality + comparison, and CCFPE otherwise. */ + if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT) + { + switch (op) + { + case EQ: + case NE: + case UNORDERED: + case ORDERED: + case UNLT: + case UNLE: + case UNGT: + case UNGE: + case UNEQ: + case LTGT: + return CCFPmode; + + case LT: + case LE: + case GT: + case GE: + if (TARGET_HARD_FLOAT && TARGET_MAVERICK) + return CCFPmode; + return CCFPEmode; + + default: + gcc_unreachable (); + } + } + + /* A compare with a shifted operand. Because of canonicalization, the + comparison will have to be swapped when we emit the assembler. */ + if (GET_MODE (y) == SImode + && (REG_P (y) || (GET_CODE (y) == SUBREG)) + && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT + || GET_CODE (x) == LSHIFTRT || GET_CODE (x) == ROTATE + || GET_CODE (x) == ROTATERT)) + return CC_SWPmode; + + /* This operation is performed swapped, but since we only rely on the Z + flag we don't need an additional mode. */ + if (GET_MODE (y) == SImode + && (REG_P (y) || (GET_CODE (y) == SUBREG)) + && GET_CODE (x) == NEG + && (op == EQ || op == NE)) + return CC_Zmode; + + /* This is a special case that is used by combine to allow a + comparison of a shifted byte load to be split into a zero-extend + followed by a comparison of the shifted integer (only valid for + equalities and unsigned inequalities). */ + if (GET_MODE (x) == SImode + && GET_CODE (x) == ASHIFT + && GET_CODE (XEXP (x, 1)) == CONST_INT && INTVAL (XEXP (x, 1)) == 24 + && GET_CODE (XEXP (x, 0)) == SUBREG + && GET_CODE (SUBREG_REG (XEXP (x, 0))) == MEM + && GET_MODE (SUBREG_REG (XEXP (x, 0))) == QImode + && (op == EQ || op == NE + || op == GEU || op == GTU || op == LTU || op == LEU) + && GET_CODE (y) == CONST_INT) + return CC_Zmode; + + /* A construct for a conditional compare, if the false arm contains + 0, then both conditions must be true, otherwise either condition + must be true. Not all conditions are possible, so CCmode is + returned if it can't be done. */ + if (GET_CODE (x) == IF_THEN_ELSE + && (XEXP (x, 2) == const0_rtx + || XEXP (x, 2) == const1_rtx) + && COMPARISON_P (XEXP (x, 0)) + && COMPARISON_P (XEXP (x, 1))) + return arm_select_dominance_cc_mode (XEXP (x, 0), XEXP (x, 1), + INTVAL (XEXP (x, 2))); + + /* Alternate canonicalizations of the above. These are somewhat cleaner. */ + if (GET_CODE (x) == AND + && (op == EQ || op == NE) + && COMPARISON_P (XEXP (x, 0)) + && COMPARISON_P (XEXP (x, 1))) + return arm_select_dominance_cc_mode (XEXP (x, 0), XEXP (x, 1), + DOM_CC_X_AND_Y); + + if (GET_CODE (x) == IOR + && (op == EQ || op == NE) + && COMPARISON_P (XEXP (x, 0)) + && COMPARISON_P (XEXP (x, 1))) + return arm_select_dominance_cc_mode (XEXP (x, 0), XEXP (x, 1), + DOM_CC_X_OR_Y); + + /* An operation (on Thumb) where we want to test for a single bit. + This is done by shifting that bit up into the top bit of a + scratch register; we can then branch on the sign bit. */ + if (TARGET_THUMB1 + && GET_MODE (x) == SImode + && (op == EQ || op == NE) + && GET_CODE (x) == ZERO_EXTRACT + && XEXP (x, 1) == const1_rtx) + return CC_Nmode; + + /* An operation that sets the condition codes as a side-effect, the + V flag is not set correctly, so we can only use comparisons where + this doesn't matter. (For LT and GE we can use "mi" and "pl" + instead.) */ + /* ??? Does the ZERO_EXTRACT case really apply to thumb2? */ + if (GET_MODE (x) == SImode + && y == const0_rtx + && (op == EQ || op == NE || op == LT || op == GE) + && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS + || GET_CODE (x) == AND || GET_CODE (x) == IOR + || GET_CODE (x) == XOR || GET_CODE (x) == MULT + || GET_CODE (x) == NOT || GET_CODE (x) == NEG + || GET_CODE (x) == LSHIFTRT + || GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT + || GET_CODE (x) == ROTATERT + || (TARGET_32BIT && GET_CODE (x) == ZERO_EXTRACT))) + return CC_NOOVmode; + + if (GET_MODE (x) == QImode && (op == EQ || op == NE)) + return CC_Zmode; + + if (GET_MODE (x) == SImode && (op == LTU || op == GEU) + && GET_CODE (x) == PLUS + && (rtx_equal_p (XEXP (x, 0), y) || rtx_equal_p (XEXP (x, 1), y))) + return CC_Cmode; + + if (GET_MODE (x) == DImode || GET_MODE (y) == DImode) + { + /* To keep things simple, always use the Cirrus cfcmp64 if it is + available. */ + if (TARGET_ARM && TARGET_HARD_FLOAT && TARGET_MAVERICK) + return CCmode; + + switch (op) + { + case EQ: + case NE: + /* A DImode comparison against zero can be implemented by + or'ing the two halves together. */ + if (y == const0_rtx) + return CC_Zmode; + + /* We can do an equality test in three Thumb instructions. */ + if (!TARGET_ARM) + return CC_Zmode; + + /* FALLTHROUGH */ + + case LTU: + case LEU: + case GTU: + case GEU: + /* DImode unsigned comparisons can be implemented by cmp + + cmpeq without a scratch register. Not worth doing in + Thumb-2. */ + if (TARGET_ARM) + return CC_CZmode; + + /* FALLTHROUGH */ + + case LT: + case LE: + case GT: + case GE: + /* DImode signed and unsigned comparisons can be implemented + by cmp + sbcs with a scratch register, but that does not + set the Z flag - we must reverse GT/LE/GTU/LEU. */ + gcc_assert (op != EQ && op != NE); + return CC_NCVmode; + + default: + gcc_unreachable (); + } + } + + return CCmode; +} + +/* X and Y are two things to compare using CODE. Emit the compare insn and + return the rtx for register 0 in the proper mode. FP means this is a + floating point compare: I don't think that it is needed on the arm. */ +rtx +arm_gen_compare_reg (enum rtx_code code, rtx x, rtx y) +{ + enum machine_mode mode; + rtx cc_reg; + int dimode_comparison = GET_MODE (x) == DImode || GET_MODE (y) == DImode; + + /* We might have X as a constant, Y as a register because of the predicates + used for cmpdi. If so, force X to a register here. */ + if (dimode_comparison && !REG_P (x)) + x = force_reg (DImode, x); + + mode = SELECT_CC_MODE (code, x, y); + cc_reg = gen_rtx_REG (mode, CC_REGNUM); + + if (dimode_comparison + && !(TARGET_HARD_FLOAT && TARGET_MAVERICK) + && mode != CC_CZmode) + { + rtx clobber, set; + + /* To compare two non-zero values for equality, XOR them and + then compare against zero. Not used for ARM mode; there + CC_CZmode is cheaper. */ + if (mode == CC_Zmode && y != const0_rtx) + { + x = expand_binop (DImode, xor_optab, x, y, NULL_RTX, 0, OPTAB_WIDEN); + y = const0_rtx; + } + /* A scratch register is required. */ + clobber = gen_rtx_CLOBBER (VOIDmode, gen_rtx_SCRATCH (SImode)); + set = gen_rtx_SET (VOIDmode, cc_reg, gen_rtx_COMPARE (mode, x, y)); + emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, set, clobber))); + } + else + emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y)); + + return cc_reg; +} + +/* Generate a sequence of insns that will generate the correct return + address mask depending on the physical architecture that the program + is running on. */ +rtx +arm_gen_return_addr_mask (void) +{ + rtx reg = gen_reg_rtx (Pmode); + + emit_insn (gen_return_addr_mask (reg)); + return reg; +} + +void +arm_reload_in_hi (rtx *operands) +{ + rtx ref = operands[1]; + rtx base, scratch; + HOST_WIDE_INT offset = 0; + + if (GET_CODE (ref) == SUBREG) + { + offset = SUBREG_BYTE (ref); + ref = SUBREG_REG (ref); + } + + if (GET_CODE (ref) == REG) + { + /* We have a pseudo which has been spilt onto the stack; there + are two cases here: the first where there is a simple + stack-slot replacement and a second where the stack-slot is + out of range, or is used as a subreg. */ + if (reg_equiv_mem[REGNO (ref)]) + { + ref = reg_equiv_mem[REGNO (ref)]; + base = find_replacement (&XEXP (ref, 0)); + } + else + /* The slot is out of range, or was dressed up in a SUBREG. */ + base = reg_equiv_address[REGNO (ref)]; + } + else + base = find_replacement (&XEXP (ref, 0)); + + /* Handle the case where the address is too complex to be offset by 1. */ + if (GET_CODE (base) == MINUS + || (GET_CODE (base) == PLUS && GET_CODE (XEXP (base, 1)) != CONST_INT)) + { + rtx base_plus = gen_rtx_REG (SImode, REGNO (operands[2]) + 1); + + emit_set_insn (base_plus, base); + base = base_plus; + } + else if (GET_CODE (base) == PLUS) + { + /* The addend must be CONST_INT, or we would have dealt with it above. */ + HOST_WIDE_INT hi, lo; + + offset += INTVAL (XEXP (base, 1)); + base = XEXP (base, 0); + + /* Rework the address into a legal sequence of insns. */ + /* Valid range for lo is -4095 -> 4095 */ + lo = (offset >= 0 + ? (offset & 0xfff) + : -((-offset) & 0xfff)); + + /* Corner case, if lo is the max offset then we would be out of range + once we have added the additional 1 below, so bump the msb into the + pre-loading insn(s). */ + if (lo == 4095) + lo &= 0x7ff; + + hi = ((((offset - lo) & (HOST_WIDE_INT) 0xffffffff) + ^ (HOST_WIDE_INT) 0x80000000) + - (HOST_WIDE_INT) 0x80000000); + + gcc_assert (hi + lo == offset); + + if (hi != 0) + { + rtx base_plus = gen_rtx_REG (SImode, REGNO (operands[2]) + 1); + + /* Get the base address; addsi3 knows how to handle constants + that require more than one insn. */ + emit_insn (gen_addsi3 (base_plus, base, GEN_INT (hi))); + base = base_plus; + offset = lo; + } + } + + /* Operands[2] may overlap operands[0] (though it won't overlap + operands[1]), that's why we asked for a DImode reg -- so we can + use the bit that does not overlap. */ + if (REGNO (operands[2]) == REGNO (operands[0])) + scratch = gen_rtx_REG (SImode, REGNO (operands[2]) + 1); + else + scratch = gen_rtx_REG (SImode, REGNO (operands[2])); + + emit_insn (gen_zero_extendqisi2 (scratch, + gen_rtx_MEM (QImode, + plus_constant (base, + offset)))); + emit_insn (gen_zero_extendqisi2 (gen_rtx_SUBREG (SImode, operands[0], 0), + gen_rtx_MEM (QImode, + plus_constant (base, + offset + 1)))); + if (!BYTES_BIG_ENDIAN) + emit_set_insn (gen_rtx_SUBREG (SImode, operands[0], 0), + gen_rtx_IOR (SImode, + gen_rtx_ASHIFT + (SImode, + gen_rtx_SUBREG (SImode, operands[0], 0), + GEN_INT (8)), + scratch)); + else + emit_set_insn (gen_rtx_SUBREG (SImode, operands[0], 0), + gen_rtx_IOR (SImode, + gen_rtx_ASHIFT (SImode, scratch, + GEN_INT (8)), + gen_rtx_SUBREG (SImode, operands[0], 0))); +} + +/* Handle storing a half-word to memory during reload by synthesizing as two + byte stores. Take care not to clobber the input values until after we + have moved them somewhere safe. This code assumes that if the DImode + scratch in operands[2] overlaps either the input value or output address + in some way, then that value must die in this insn (we absolutely need + two scratch registers for some corner cases). */ +void +arm_reload_out_hi (rtx *operands) +{ + rtx ref = operands[0]; + rtx outval = operands[1]; + rtx base, scratch; + HOST_WIDE_INT offset = 0; + + if (GET_CODE (ref) == SUBREG) + { + offset = SUBREG_BYTE (ref); + ref = SUBREG_REG (ref); + } + + if (GET_CODE (ref) == REG) + { + /* We have a pseudo which has been spilt onto the stack; there + are two cases here: the first where there is a simple + stack-slot replacement and a second where the stack-slot is + out of range, or is used as a subreg. */ + if (reg_equiv_mem[REGNO (ref)]) + { + ref = reg_equiv_mem[REGNO (ref)]; + base = find_replacement (&XEXP (ref, 0)); + } + else + /* The slot is out of range, or was dressed up in a SUBREG. */ + base = reg_equiv_address[REGNO (ref)]; + } + else + base = find_replacement (&XEXP (ref, 0)); + + scratch = gen_rtx_REG (SImode, REGNO (operands[2])); + + /* Handle the case where the address is too complex to be offset by 1. */ + if (GET_CODE (base) == MINUS + || (GET_CODE (base) == PLUS && GET_CODE (XEXP (base, 1)) != CONST_INT)) + { + rtx base_plus = gen_rtx_REG (SImode, REGNO (operands[2]) + 1); + + /* Be careful not to destroy OUTVAL. */ + if (reg_overlap_mentioned_p (base_plus, outval)) + { + /* Updating base_plus might destroy outval, see if we can + swap the scratch and base_plus. */ + if (!reg_overlap_mentioned_p (scratch, outval)) + { + rtx tmp = scratch; + scratch = base_plus; + base_plus = tmp; + } + else + { + rtx scratch_hi = gen_rtx_REG (HImode, REGNO (operands[2])); + + /* Be conservative and copy OUTVAL into the scratch now, + this should only be necessary if outval is a subreg + of something larger than a word. */ + /* XXX Might this clobber base? I can't see how it can, + since scratch is known to overlap with OUTVAL, and + must be wider than a word. */ + emit_insn (gen_movhi (scratch_hi, outval)); + outval = scratch_hi; + } + } + + emit_set_insn (base_plus, base); + base = base_plus; + } + else if (GET_CODE (base) == PLUS) + { + /* The addend must be CONST_INT, or we would have dealt with it above. */ + HOST_WIDE_INT hi, lo; + + offset += INTVAL (XEXP (base, 1)); + base = XEXP (base, 0); + + /* Rework the address into a legal sequence of insns. */ + /* Valid range for lo is -4095 -> 4095 */ + lo = (offset >= 0 + ? (offset & 0xfff) + : -((-offset) & 0xfff)); + + /* Corner case, if lo is the max offset then we would be out of range + once we have added the additional 1 below, so bump the msb into the + pre-loading insn(s). */ + if (lo == 4095) + lo &= 0x7ff; + + hi = ((((offset - lo) & (HOST_WIDE_INT) 0xffffffff) + ^ (HOST_WIDE_INT) 0x80000000) + - (HOST_WIDE_INT) 0x80000000); + + gcc_assert (hi + lo == offset); + + if (hi != 0) + { + rtx base_plus = gen_rtx_REG (SImode, REGNO (operands[2]) + 1); + + /* Be careful not to destroy OUTVAL. */ + if (reg_overlap_mentioned_p (base_plus, outval)) + { + /* Updating base_plus might destroy outval, see if we + can swap the scratch and base_plus. */ + if (!reg_overlap_mentioned_p (scratch, outval)) + { + rtx tmp = scratch; + scratch = base_plus; + base_plus = tmp; + } + else + { + rtx scratch_hi = gen_rtx_REG (HImode, REGNO (operands[2])); + + /* Be conservative and copy outval into scratch now, + this should only be necessary if outval is a + subreg of something larger than a word. */ + /* XXX Might this clobber base? I can't see how it + can, since scratch is known to overlap with + outval. */ + emit_insn (gen_movhi (scratch_hi, outval)); + outval = scratch_hi; + } + } + + /* Get the base address; addsi3 knows how to handle constants + that require more than one insn. */ + emit_insn (gen_addsi3 (base_plus, base, GEN_INT (hi))); + base = base_plus; + offset = lo; + } + } + + if (BYTES_BIG_ENDIAN) + { + emit_insn (gen_movqi (gen_rtx_MEM (QImode, + plus_constant (base, offset + 1)), + gen_lowpart (QImode, outval))); + emit_insn (gen_lshrsi3 (scratch, + gen_rtx_SUBREG (SImode, outval, 0), + GEN_INT (8))); + emit_insn (gen_movqi (gen_rtx_MEM (QImode, plus_constant (base, offset)), + gen_lowpart (QImode, scratch))); + } + else + { + emit_insn (gen_movqi (gen_rtx_MEM (QImode, plus_constant (base, offset)), + gen_lowpart (QImode, outval))); + emit_insn (gen_lshrsi3 (scratch, + gen_rtx_SUBREG (SImode, outval, 0), + GEN_INT (8))); + emit_insn (gen_movqi (gen_rtx_MEM (QImode, + plus_constant (base, offset + 1)), + gen_lowpart (QImode, scratch))); + } +} + +/* Return true if a type must be passed in memory. For AAPCS, small aggregates + (padded to the size of a word) should be passed in a register. */ + +static bool +arm_must_pass_in_stack (enum machine_mode mode, const_tree type) +{ + if (TARGET_AAPCS_BASED) + return must_pass_in_stack_var_size (mode, type); + else + return must_pass_in_stack_var_size_or_pad (mode, type); +} + + +/* For use by FUNCTION_ARG_PADDING (MODE, TYPE). + Return true if an argument passed on the stack should be padded upwards, + i.e. if the least-significant byte has useful data. + For legacy APCS ABIs we use the default. For AAPCS based ABIs small + aggregate types are placed in the lowest memory address. */ + +bool +arm_pad_arg_upward (enum machine_mode mode, const_tree type) +{ + if (!TARGET_AAPCS_BASED) + return DEFAULT_FUNCTION_ARG_PADDING(mode, type) == upward; + + if (type && BYTES_BIG_ENDIAN && INTEGRAL_TYPE_P (type)) + return false; + + return true; +} + + +/* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST). + For non-AAPCS, return !BYTES_BIG_ENDIAN if the least significant + byte of the register has useful data, and return the opposite if the + most significant byte does. + For AAPCS, small aggregates and small complex types are always padded + upwards. */ + +bool +arm_pad_reg_upward (enum machine_mode mode ATTRIBUTE_UNUSED, + tree type, int first ATTRIBUTE_UNUSED) +{ + if (TARGET_AAPCS_BASED + && BYTES_BIG_ENDIAN + && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE) + && int_size_in_bytes (type) <= 4) + return true; + + /* Otherwise, use default padding. */ + return !BYTES_BIG_ENDIAN; +} + + +/* Print a symbolic form of X to the debug file, F. */ +static void +arm_print_value (FILE *f, rtx x) +{ + switch (GET_CODE (x)) + { + case CONST_INT: + fprintf (f, HOST_WIDE_INT_PRINT_HEX, INTVAL (x)); + return; + + case CONST_DOUBLE: + fprintf (f, "<0x%lx,0x%lx>", (long)XWINT (x, 2), (long)XWINT (x, 3)); + return; + + case CONST_VECTOR: + { + int i; + + fprintf (f, "<"); + for (i = 0; i < CONST_VECTOR_NUNITS (x); i++) + { + fprintf (f, HOST_WIDE_INT_PRINT_HEX, INTVAL (CONST_VECTOR_ELT (x, i))); + if (i < (CONST_VECTOR_NUNITS (x) - 1)) + fputc (',', f); + } + fprintf (f, ">"); + } + return; + + case CONST_STRING: + fprintf (f, "\"%s\"", XSTR (x, 0)); + return; + + case SYMBOL_REF: + fprintf (f, "`%s'", XSTR (x, 0)); + return; + + case LABEL_REF: + fprintf (f, "L%d", INSN_UID (XEXP (x, 0))); + return; + + case CONST: + arm_print_value (f, XEXP (x, 0)); + return; + + case PLUS: + arm_print_value (f, XEXP (x, 0)); + fprintf (f, "+"); + arm_print_value (f, XEXP (x, 1)); + return; + + case PC: + fprintf (f, "pc"); + return; + + default: + fprintf (f, "????"); + return; + } +} + +/* Routines for manipulation of the constant pool. */ + +/* Arm instructions cannot load a large constant directly into a + register; they have to come from a pc relative load. The constant + must therefore be placed in the addressable range of the pc + relative load. Depending on the precise pc relative load + instruction the range is somewhere between 256 bytes and 4k. This + means that we often have to dump a constant inside a function, and + generate code to branch around it. + + It is important to minimize this, since the branches will slow + things down and make the code larger. + + Normally we can hide the table after an existing unconditional + branch so that there is no interruption of the flow, but in the + worst case the code looks like this: + + ldr rn, L1 + ... + b L2 + align + L1: .long value + L2: + ... + + ldr rn, L3 + ... + b L4 + align + L3: .long value + L4: + ... + + We fix this by performing a scan after scheduling, which notices + which instructions need to have their operands fetched from the + constant table and builds the table. + + The algorithm starts by building a table of all the constants that + need fixing up and all the natural barriers in the function (places + where a constant table can be dropped without breaking the flow). + For each fixup we note how far the pc-relative replacement will be + able to reach and the offset of the instruction into the function. + + Having built the table we then group the fixes together to form + tables that are as large as possible (subject to addressing + constraints) and emit each table of constants after the last + barrier that is within range of all the instructions in the group. + If a group does not contain a barrier, then we forcibly create one + by inserting a jump instruction into the flow. Once the table has + been inserted, the insns are then modified to reference the + relevant entry in the pool. + + Possible enhancements to the algorithm (not implemented) are: + + 1) For some processors and object formats, there may be benefit in + aligning the pools to the start of cache lines; this alignment + would need to be taken into account when calculating addressability + of a pool. */ + +/* These typedefs are located at the start of this file, so that + they can be used in the prototypes there. This comment is to + remind readers of that fact so that the following structures + can be understood more easily. + + typedef struct minipool_node Mnode; + typedef struct minipool_fixup Mfix; */ + +struct minipool_node +{ + /* Doubly linked chain of entries. */ + Mnode * next; + Mnode * prev; + /* The maximum offset into the code that this entry can be placed. While + pushing fixes for forward references, all entries are sorted in order + of increasing max_address. */ + HOST_WIDE_INT max_address; + /* Similarly for an entry inserted for a backwards ref. */ + HOST_WIDE_INT min_address; + /* The number of fixes referencing this entry. This can become zero + if we "unpush" an entry. In this case we ignore the entry when we + come to emit the code. */ + int refcount; + /* The offset from the start of the minipool. */ + HOST_WIDE_INT offset; + /* The value in table. */ + rtx value; + /* The mode of value. */ + enum machine_mode mode; + /* The size of the value. With iWMMXt enabled + sizes > 4 also imply an alignment of 8-bytes. */ + int fix_size; +}; + +struct minipool_fixup +{ + Mfix * next; + rtx insn; + HOST_WIDE_INT address; + rtx * loc; + enum machine_mode mode; + int fix_size; + rtx value; + Mnode * minipool; + HOST_WIDE_INT forwards; + HOST_WIDE_INT backwards; +}; + +/* Fixes less than a word need padding out to a word boundary. */ +#define MINIPOOL_FIX_SIZE(mode) \ + (GET_MODE_SIZE ((mode)) >= 4 ? GET_MODE_SIZE ((mode)) : 4) + +static Mnode * minipool_vector_head; +static Mnode * minipool_vector_tail; +static rtx minipool_vector_label; +static int minipool_pad; + +/* The linked list of all minipool fixes required for this function. */ +Mfix * minipool_fix_head; +Mfix * minipool_fix_tail; +/* The fix entry for the current minipool, once it has been placed. */ +Mfix * minipool_barrier; + +/* Determines if INSN is the start of a jump table. Returns the end + of the TABLE or NULL_RTX. */ +static rtx +is_jump_table (rtx insn) +{ + rtx table; + + if (GET_CODE (insn) == JUMP_INSN + && JUMP_LABEL (insn) != NULL + && ((table = next_real_insn (JUMP_LABEL (insn))) + == next_real_insn (insn)) + && table != NULL + && GET_CODE (table) == JUMP_INSN + && (GET_CODE (PATTERN (table)) == ADDR_VEC + || GET_CODE (PATTERN (table)) == ADDR_DIFF_VEC)) + return table; + + return NULL_RTX; +} + +#ifndef JUMP_TABLES_IN_TEXT_SECTION +#define JUMP_TABLES_IN_TEXT_SECTION 0 +#endif + +static HOST_WIDE_INT +get_jump_table_size (rtx insn) +{ + /* ADDR_VECs only take room if read-only data does into the text + section. */ + if (JUMP_TABLES_IN_TEXT_SECTION || readonly_data_section == text_section) + { + rtx body = PATTERN (insn); + int elt = GET_CODE (body) == ADDR_DIFF_VEC ? 1 : 0; + HOST_WIDE_INT size; + HOST_WIDE_INT modesize; + + modesize = GET_MODE_SIZE (GET_MODE (body)); + size = modesize * XVECLEN (body, elt); + switch (modesize) + { + case 1: + /* Round up size of TBB table to a halfword boundary. */ + size = (size + 1) & ~(HOST_WIDE_INT)1; + break; + case 2: + /* No padding necessary for TBH. */ + break; + case 4: + /* Add two bytes for alignment on Thumb. */ + if (TARGET_THUMB) + size += 2; + break; + default: + gcc_unreachable (); + } + return size; + } + + return 0; +} + +/* Move a minipool fix MP from its current location to before MAX_MP. + If MAX_MP is NULL, then MP doesn't need moving, but the addressing + constraints may need updating. */ +static Mnode * +move_minipool_fix_forward_ref (Mnode *mp, Mnode *max_mp, + HOST_WIDE_INT max_address) +{ + /* The code below assumes these are different. */ + gcc_assert (mp != max_mp); + + if (max_mp == NULL) + { + if (max_address < mp->max_address) + mp->max_address = max_address; + } + else + { + if (max_address > max_mp->max_address - mp->fix_size) + mp->max_address = max_mp->max_address - mp->fix_size; + else + mp->max_address = max_address; + + /* Unlink MP from its current position. Since max_mp is non-null, + mp->prev must be non-null. */ + mp->prev->next = mp->next; + if (mp->next != NULL) + mp->next->prev = mp->prev; + else + minipool_vector_tail = mp->prev; + + /* Re-insert it before MAX_MP. */ + mp->next = max_mp; + mp->prev = max_mp->prev; + max_mp->prev = mp; + + if (mp->prev != NULL) + mp->prev->next = mp; + else + minipool_vector_head = mp; + } + + /* Save the new entry. */ + max_mp = mp; + + /* Scan over the preceding entries and adjust their addresses as + required. */ + while (mp->prev != NULL + && mp->prev->max_address > mp->max_address - mp->prev->fix_size) + { + mp->prev->max_address = mp->max_address - mp->prev->fix_size; + mp = mp->prev; + } + + return max_mp; +} + +/* Add a constant to the minipool for a forward reference. Returns the + node added or NULL if the constant will not fit in this pool. */ +static Mnode * +add_minipool_forward_ref (Mfix *fix) +{ + /* If set, max_mp is the first pool_entry that has a lower + constraint than the one we are trying to add. */ + Mnode * max_mp = NULL; + HOST_WIDE_INT max_address = fix->address + fix->forwards - minipool_pad; + Mnode * mp; + + /* If the minipool starts before the end of FIX->INSN then this FIX + can not be placed into the current pool. Furthermore, adding the + new constant pool entry may cause the pool to start FIX_SIZE bytes + earlier. */ + if (minipool_vector_head && + (fix->address + get_attr_length (fix->insn) + >= minipool_vector_head->max_address - fix->fix_size)) + return NULL; + + /* Scan the pool to see if a constant with the same value has + already been added. While we are doing this, also note the + location where we must insert the constant if it doesn't already + exist. */ + for (mp = minipool_vector_head; mp != NULL; mp = mp->next) + { + if (GET_CODE (fix->value) == GET_CODE (mp->value) + && fix->mode == mp->mode + && (GET_CODE (fix->value) != CODE_LABEL + || (CODE_LABEL_NUMBER (fix->value) + == CODE_LABEL_NUMBER (mp->value))) + && rtx_equal_p (fix->value, mp->value)) + { + /* More than one fix references this entry. */ + mp->refcount++; + return move_minipool_fix_forward_ref (mp, max_mp, max_address); + } + + /* Note the insertion point if necessary. */ + if (max_mp == NULL + && mp->max_address > max_address) + max_mp = mp; + + /* If we are inserting an 8-bytes aligned quantity and + we have not already found an insertion point, then + make sure that all such 8-byte aligned quantities are + placed at the start of the pool. */ + if (ARM_DOUBLEWORD_ALIGN + && max_mp == NULL + && fix->fix_size >= 8 + && mp->fix_size < 8) + { + max_mp = mp; + max_address = mp->max_address; + } + } + + /* The value is not currently in the minipool, so we need to create + a new entry for it. If MAX_MP is NULL, the entry will be put on + the end of the list since the placement is less constrained than + any existing entry. Otherwise, we insert the new fix before + MAX_MP and, if necessary, adjust the constraints on the other + entries. */ + mp = XNEW (Mnode); + mp->fix_size = fix->fix_size; + mp->mode = fix->mode; + mp->value = fix->value; + mp->refcount = 1; + /* Not yet required for a backwards ref. */ + mp->min_address = -65536; + + if (max_mp == NULL) + { + mp->max_address = max_address; + mp->next = NULL; + mp->prev = minipool_vector_tail; + + if (mp->prev == NULL) + { + minipool_vector_head = mp; + minipool_vector_label = gen_label_rtx (); + } + else + mp->prev->next = mp; + + minipool_vector_tail = mp; + } + else + { + if (max_address > max_mp->max_address - mp->fix_size) + mp->max_address = max_mp->max_address - mp->fix_size; + else + mp->max_address = max_address; + + mp->next = max_mp; + mp->prev = max_mp->prev; + max_mp->prev = mp; + if (mp->prev != NULL) + mp->prev->next = mp; + else + minipool_vector_head = mp; + } + + /* Save the new entry. */ + max_mp = mp; + + /* Scan over the preceding entries and adjust their addresses as + required. */ + while (mp->prev != NULL + && mp->prev->max_address > mp->max_address - mp->prev->fix_size) + { + mp->prev->max_address = mp->max_address - mp->prev->fix_size; + mp = mp->prev; + } + + return max_mp; +} + +static Mnode * +move_minipool_fix_backward_ref (Mnode *mp, Mnode *min_mp, + HOST_WIDE_INT min_address) +{ + HOST_WIDE_INT offset; + + /* The code below assumes these are different. */ + gcc_assert (mp != min_mp); + + if (min_mp == NULL) + { + if (min_address > mp->min_address) + mp->min_address = min_address; + } + else + { + /* We will adjust this below if it is too loose. */ + mp->min_address = min_address; + + /* Unlink MP from its current position. Since min_mp is non-null, + mp->next must be non-null. */ + mp->next->prev = mp->prev; + if (mp->prev != NULL) + mp->prev->next = mp->next; + else + minipool_vector_head = mp->next; + + /* Reinsert it after MIN_MP. */ + mp->prev = min_mp; + mp->next = min_mp->next; + min_mp->next = mp; + if (mp->next != NULL) + mp->next->prev = mp; + else + minipool_vector_tail = mp; + } + + min_mp = mp; + + offset = 0; + for (mp = minipool_vector_head; mp != NULL; mp = mp->next) + { + mp->offset = offset; + if (mp->refcount > 0) + offset += mp->fix_size; + + if (mp->next && mp->next->min_address < mp->min_address + mp->fix_size) + mp->next->min_address = mp->min_address + mp->fix_size; + } + + return min_mp; +} + +/* Add a constant to the minipool for a backward reference. Returns the + node added or NULL if the constant will not fit in this pool. + + Note that the code for insertion for a backwards reference can be + somewhat confusing because the calculated offsets for each fix do + not take into account the size of the pool (which is still under + construction. */ +static Mnode * +add_minipool_backward_ref (Mfix *fix) +{ + /* If set, min_mp is the last pool_entry that has a lower constraint + than the one we are trying to add. */ + Mnode *min_mp = NULL; + /* This can be negative, since it is only a constraint. */ + HOST_WIDE_INT min_address = fix->address - fix->backwards; + Mnode *mp; + + /* If we can't reach the current pool from this insn, or if we can't + insert this entry at the end of the pool without pushing other + fixes out of range, then we don't try. This ensures that we + can't fail later on. */ + if (min_address >= minipool_barrier->address + || (minipool_vector_tail->min_address + fix->fix_size + >= minipool_barrier->address)) + return NULL; + + /* Scan the pool to see if a constant with the same value has + already been added. While we are doing this, also note the + location where we must insert the constant if it doesn't already + exist. */ + for (mp = minipool_vector_tail; mp != NULL; mp = mp->prev) + { + if (GET_CODE (fix->value) == GET_CODE (mp->value) + && fix->mode == mp->mode + && (GET_CODE (fix->value) != CODE_LABEL + || (CODE_LABEL_NUMBER (fix->value) + == CODE_LABEL_NUMBER (mp->value))) + && rtx_equal_p (fix->value, mp->value) + /* Check that there is enough slack to move this entry to the + end of the table (this is conservative). */ + && (mp->max_address + > (minipool_barrier->address + + minipool_vector_tail->offset + + minipool_vector_tail->fix_size))) + { + mp->refcount++; + return move_minipool_fix_backward_ref (mp, min_mp, min_address); + } + + if (min_mp != NULL) + mp->min_address += fix->fix_size; + else + { + /* Note the insertion point if necessary. */ + if (mp->min_address < min_address) + { + /* For now, we do not allow the insertion of 8-byte alignment + requiring nodes anywhere but at the start of the pool. */ + if (ARM_DOUBLEWORD_ALIGN + && fix->fix_size >= 8 && mp->fix_size < 8) + return NULL; + else + min_mp = mp; + } + else if (mp->max_address + < minipool_barrier->address + mp->offset + fix->fix_size) + { + /* Inserting before this entry would push the fix beyond + its maximum address (which can happen if we have + re-located a forwards fix); force the new fix to come + after it. */ + if (ARM_DOUBLEWORD_ALIGN + && fix->fix_size >= 8 && mp->fix_size < 8) + return NULL; + else + { + min_mp = mp; + min_address = mp->min_address + fix->fix_size; + } + } + /* Do not insert a non-8-byte aligned quantity before 8-byte + aligned quantities. */ + else if (ARM_DOUBLEWORD_ALIGN + && fix->fix_size < 8 + && mp->fix_size >= 8) + { + min_mp = mp; + min_address = mp->min_address + fix->fix_size; + } + } + } + + /* We need to create a new entry. */ + mp = XNEW (Mnode); + mp->fix_size = fix->fix_size; + mp->mode = fix->mode; + mp->value = fix->value; + mp->refcount = 1; + mp->max_address = minipool_barrier->address + 65536; + + mp->min_address = min_address; + + if (min_mp == NULL) + { + mp->prev = NULL; + mp->next = minipool_vector_head; + + if (mp->next == NULL) + { + minipool_vector_tail = mp; + minipool_vector_label = gen_label_rtx (); + } + else + mp->next->prev = mp; + + minipool_vector_head = mp; + } + else + { + mp->next = min_mp->next; + mp->prev = min_mp; + min_mp->next = mp; + + if (mp->next != NULL) + mp->next->prev = mp; + else + minipool_vector_tail = mp; + } + + /* Save the new entry. */ + min_mp = mp; + + if (mp->prev) + mp = mp->prev; + else + mp->offset = 0; + + /* Scan over the following entries and adjust their offsets. */ + while (mp->next != NULL) + { + if (mp->next->min_address < mp->min_address + mp->fix_size) + mp->next->min_address = mp->min_address + mp->fix_size; + + if (mp->refcount) + mp->next->offset = mp->offset + mp->fix_size; + else + mp->next->offset = mp->offset; + + mp = mp->next; + } + + return min_mp; +} + +static void +assign_minipool_offsets (Mfix *barrier) +{ + HOST_WIDE_INT offset = 0; + Mnode *mp; + + minipool_barrier = barrier; + + for (mp = minipool_vector_head; mp != NULL; mp = mp->next) + { + mp->offset = offset; + + if (mp->refcount > 0) + offset += mp->fix_size; + } +} + +/* Output the literal table */ +static void +dump_minipool (rtx scan) +{ + Mnode * mp; + Mnode * nmp; + int align64 = 0; + + if (ARM_DOUBLEWORD_ALIGN) + for (mp = minipool_vector_head; mp != NULL; mp = mp->next) + if (mp->refcount > 0 && mp->fix_size >= 8) + { + align64 = 1; + break; + } + + if (dump_file) + fprintf (dump_file, + ";; Emitting minipool after insn %u; address %ld; align %d (bytes)\n", + INSN_UID (scan), (unsigned long) minipool_barrier->address, align64 ? 8 : 4); + + scan = emit_label_after (gen_label_rtx (), scan); + scan = emit_insn_after (align64 ? gen_align_8 () : gen_align_4 (), scan); + scan = emit_label_after (minipool_vector_label, scan); + + for (mp = minipool_vector_head; mp != NULL; mp = nmp) + { + if (mp->refcount > 0) + { + if (dump_file) + { + fprintf (dump_file, + ";; Offset %u, min %ld, max %ld ", + (unsigned) mp->offset, (unsigned long) mp->min_address, + (unsigned long) mp->max_address); + arm_print_value (dump_file, mp->value); + fputc ('\n', dump_file); + } + + switch (mp->fix_size) + { +#ifdef HAVE_consttable_1 + case 1: + scan = emit_insn_after (gen_consttable_1 (mp->value), scan); + break; + +#endif +#ifdef HAVE_consttable_2 + case 2: + scan = emit_insn_after (gen_consttable_2 (mp->value), scan); + break; + +#endif +#ifdef HAVE_consttable_4 + case 4: + scan = emit_insn_after (gen_consttable_4 (mp->value), scan); + break; + +#endif +#ifdef HAVE_consttable_8 + case 8: + scan = emit_insn_after (gen_consttable_8 (mp->value), scan); + break; + +#endif +#ifdef HAVE_consttable_16 + case 16: + scan = emit_insn_after (gen_consttable_16 (mp->value), scan); + break; + +#endif + default: + gcc_unreachable (); + } + } + + nmp = mp->next; + free (mp); + } + + minipool_vector_head = minipool_vector_tail = NULL; + scan = emit_insn_after (gen_consttable_end (), scan); + scan = emit_barrier_after (scan); +} + +/* Return the cost of forcibly inserting a barrier after INSN. */ +static int +arm_barrier_cost (rtx insn) +{ + /* Basing the location of the pool on the loop depth is preferable, + but at the moment, the basic block information seems to be + corrupt by this stage of the compilation. */ + int base_cost = 50; + rtx next = next_nonnote_insn (insn); + + if (next != NULL && GET_CODE (next) == CODE_LABEL) + base_cost -= 20; + + switch (GET_CODE (insn)) + { + case CODE_LABEL: + /* It will always be better to place the table before the label, rather + than after it. */ + return 50; + + case INSN: + case CALL_INSN: + return base_cost; + + case JUMP_INSN: + return base_cost - 10; + + default: + return base_cost + 10; + } +} + +/* Find the best place in the insn stream in the range + (FIX->address,MAX_ADDRESS) to forcibly insert a minipool barrier. + Create the barrier by inserting a jump and add a new fix entry for + it. */ +static Mfix * +create_fix_barrier (Mfix *fix, HOST_WIDE_INT max_address) +{ + HOST_WIDE_INT count = 0; + rtx barrier; + rtx from = fix->insn; + /* The instruction after which we will insert the jump. */ + rtx selected = NULL; + int selected_cost; + /* The address at which the jump instruction will be placed. */ + HOST_WIDE_INT selected_address; + Mfix * new_fix; + HOST_WIDE_INT max_count = max_address - fix->address; + rtx label = gen_label_rtx (); + + selected_cost = arm_barrier_cost (from); + selected_address = fix->address; + + while (from && count < max_count) + { + rtx tmp; + int new_cost; + + /* This code shouldn't have been called if there was a natural barrier + within range. */ + gcc_assert (GET_CODE (from) != BARRIER); + + /* Count the length of this insn. */ + count += get_attr_length (from); + + /* If there is a jump table, add its length. */ + tmp = is_jump_table (from); + if (tmp != NULL) + { + count += get_jump_table_size (tmp); + + /* Jump tables aren't in a basic block, so base the cost on + the dispatch insn. If we select this location, we will + still put the pool after the table. */ + new_cost = arm_barrier_cost (from); + + if (count < max_count + && (!selected || new_cost <= selected_cost)) + { + selected = tmp; + selected_cost = new_cost; + selected_address = fix->address + count; + } + + /* Continue after the dispatch table. */ + from = NEXT_INSN (tmp); + continue; + } + + new_cost = arm_barrier_cost (from); + + if (count < max_count + && (!selected || new_cost <= selected_cost)) + { + selected = from; + selected_cost = new_cost; + selected_address = fix->address + count; + } + + from = NEXT_INSN (from); + } + + /* Make sure that we found a place to insert the jump. */ + gcc_assert (selected); + + /* Create a new JUMP_INSN that branches around a barrier. */ + from = emit_jump_insn_after (gen_jump (label), selected); + JUMP_LABEL (from) = label; + barrier = emit_barrier_after (from); + emit_label_after (label, barrier); + + /* Create a minipool barrier entry for the new barrier. */ + new_fix = (Mfix *) obstack_alloc (&minipool_obstack, sizeof (* new_fix)); + new_fix->insn = barrier; + new_fix->address = selected_address; + new_fix->next = fix->next; + fix->next = new_fix; + + return new_fix; +} + +/* Record that there is a natural barrier in the insn stream at + ADDRESS. */ +static void +push_minipool_barrier (rtx insn, HOST_WIDE_INT address) +{ + Mfix * fix = (Mfix *) obstack_alloc (&minipool_obstack, sizeof (* fix)); + + fix->insn = insn; + fix->address = address; + + fix->next = NULL; + if (minipool_fix_head != NULL) + minipool_fix_tail->next = fix; + else + minipool_fix_head = fix; + + minipool_fix_tail = fix; +} + +/* Record INSN, which will need fixing up to load a value from the + minipool. ADDRESS is the offset of the insn since the start of the + function; LOC is a pointer to the part of the insn which requires + fixing; VALUE is the constant that must be loaded, which is of type + MODE. */ +static void +push_minipool_fix (rtx insn, HOST_WIDE_INT address, rtx *loc, + enum machine_mode mode, rtx value) +{ + Mfix * fix = (Mfix *) obstack_alloc (&minipool_obstack, sizeof (* fix)); + + fix->insn = insn; + fix->address = address; + fix->loc = loc; + fix->mode = mode; + fix->fix_size = MINIPOOL_FIX_SIZE (mode); + fix->value = value; + fix->forwards = get_attr_pool_range (insn); + fix->backwards = get_attr_neg_pool_range (insn); + fix->minipool = NULL; + + /* If an insn doesn't have a range defined for it, then it isn't + expecting to be reworked by this code. Better to stop now than + to generate duff assembly code. */ + gcc_assert (fix->forwards || fix->backwards); + + /* If an entry requires 8-byte alignment then assume all constant pools + require 4 bytes of padding. Trying to do this later on a per-pool + basis is awkward because existing pool entries have to be modified. */ + if (ARM_DOUBLEWORD_ALIGN && fix->fix_size >= 8) + minipool_pad = 4; + + if (dump_file) + { + fprintf (dump_file, + ";; %smode fixup for i%d; addr %lu, range (%ld,%ld): ", + GET_MODE_NAME (mode), + INSN_UID (insn), (unsigned long) address, + -1 * (long)fix->backwards, (long)fix->forwards); + arm_print_value (dump_file, fix->value); + fprintf (dump_file, "\n"); + } + + /* Add it to the chain of fixes. */ + fix->next = NULL; + + if (minipool_fix_head != NULL) + minipool_fix_tail->next = fix; + else + minipool_fix_head = fix; + + minipool_fix_tail = fix; +} + +/* Return the cost of synthesizing a 64-bit constant VAL inline. + Returns the number of insns needed, or 99 if we don't know how to + do it. */ +int +arm_const_double_inline_cost (rtx val) +{ + rtx lowpart, highpart; + enum machine_mode mode; + + mode = GET_MODE (val); + + if (mode == VOIDmode) + mode = DImode; + + gcc_assert (GET_MODE_SIZE (mode) == 8); + + lowpart = gen_lowpart (SImode, val); + highpart = gen_highpart_mode (SImode, mode, val); + + gcc_assert (GET_CODE (lowpart) == CONST_INT); + gcc_assert (GET_CODE (highpart) == CONST_INT); + + return (arm_gen_constant (SET, SImode, NULL_RTX, INTVAL (lowpart), + NULL_RTX, NULL_RTX, 0, 0) + + arm_gen_constant (SET, SImode, NULL_RTX, INTVAL (highpart), + NULL_RTX, NULL_RTX, 0, 0)); +} + +/* Return true if it is worthwhile to split a 64-bit constant into two + 32-bit operations. This is the case if optimizing for size, or + if we have load delay slots, or if one 32-bit part can be done with + a single data operation. */ +bool +arm_const_double_by_parts (rtx val) +{ + enum machine_mode mode = GET_MODE (val); + rtx part; + + if (optimize_size || arm_ld_sched) + return true; + + if (mode == VOIDmode) + mode = DImode; + + part = gen_highpart_mode (SImode, mode, val); + + gcc_assert (GET_CODE (part) == CONST_INT); + + if (const_ok_for_arm (INTVAL (part)) + || const_ok_for_arm (~INTVAL (part))) + return true; + + part = gen_lowpart (SImode, val); + + gcc_assert (GET_CODE (part) == CONST_INT); + + if (const_ok_for_arm (INTVAL (part)) + || const_ok_for_arm (~INTVAL (part))) + return true; + + return false; +} + +/* Return true if it is possible to inline both the high and low parts + of a 64-bit constant into 32-bit data processing instructions. */ +bool +arm_const_double_by_immediates (rtx val) +{ + enum machine_mode mode = GET_MODE (val); + rtx part; + + if (mode == VOIDmode) + mode = DImode; + + part = gen_highpart_mode (SImode, mode, val); + + gcc_assert (GET_CODE (part) == CONST_INT); + + if (!const_ok_for_arm (INTVAL (part))) + return false; + + part = gen_lowpart (SImode, val); + + gcc_assert (GET_CODE (part) == CONST_INT); + + if (!const_ok_for_arm (INTVAL (part))) + return false; + + return true; +} + +/* Scan INSN and note any of its operands that need fixing. + If DO_PUSHES is false we do not actually push any of the fixups + needed. The function returns TRUE if any fixups were needed/pushed. + This is used by arm_memory_load_p() which needs to know about loads + of constants that will be converted into minipool loads. */ +static bool +note_invalid_constants (rtx insn, HOST_WIDE_INT address, int do_pushes) +{ + bool result = false; + int opno; + + extract_insn (insn); + + if (!constrain_operands (1)) + fatal_insn_not_found (insn); + + if (recog_data.n_alternatives == 0) + return false; + + /* Fill in recog_op_alt with information about the constraints of + this insn. */ + preprocess_constraints (); + + for (opno = 0; opno < recog_data.n_operands; opno++) + { + /* Things we need to fix can only occur in inputs. */ + if (recog_data.operand_type[opno] != OP_IN) + continue; + + /* If this alternative is a memory reference, then any mention + of constants in this alternative is really to fool reload + into allowing us to accept one there. We need to fix them up + now so that we output the right code. */ + if (recog_op_alt[opno][which_alternative].memory_ok) + { + rtx op = recog_data.operand[opno]; + + if (CONSTANT_P (op)) + { + if (do_pushes) + push_minipool_fix (insn, address, recog_data.operand_loc[opno], + recog_data.operand_mode[opno], op); + result = true; + } + else if (GET_CODE (op) == MEM + && GET_CODE (XEXP (op, 0)) == SYMBOL_REF + && CONSTANT_POOL_ADDRESS_P (XEXP (op, 0))) + { + if (do_pushes) + { + rtx cop = avoid_constant_pool_reference (op); + + /* Casting the address of something to a mode narrower + than a word can cause avoid_constant_pool_reference() + to return the pool reference itself. That's no good to + us here. Lets just hope that we can use the + constant pool value directly. */ + if (op == cop) + cop = get_pool_constant (XEXP (op, 0)); + + push_minipool_fix (insn, address, + recog_data.operand_loc[opno], + recog_data.operand_mode[opno], cop); + } + + result = true; + } + } + } + + return result; +} + +/* Convert instructions to their cc-clobbering variant if possible, since + that allows us to use smaller encodings. */ + +static void +thumb2_reorg (void) +{ + basic_block bb; + regset_head live; + + INIT_REG_SET (&live); + + /* We are freeing block_for_insn in the toplev to keep compatibility + with old MDEP_REORGS that are not CFG based. Recompute it now. */ + compute_bb_for_insn (); + df_analyze (); + + FOR_EACH_BB (bb) + { + rtx insn; + + COPY_REG_SET (&live, DF_LR_OUT (bb)); + df_simulate_initialize_backwards (bb, &live); + FOR_BB_INSNS_REVERSE (bb, insn) + { + if (NONJUMP_INSN_P (insn) + && !REGNO_REG_SET_P (&live, CC_REGNUM)) + { + rtx pat = PATTERN (insn); + if (GET_CODE (pat) == SET + && low_register_operand (XEXP (pat, 0), SImode) + && thumb_16bit_operator (XEXP (pat, 1), SImode) + && low_register_operand (XEXP (XEXP (pat, 1), 0), SImode) + && low_register_operand (XEXP (XEXP (pat, 1), 1), SImode)) + { + rtx dst = XEXP (pat, 0); + rtx src = XEXP (pat, 1); + rtx op0 = XEXP (src, 0); + rtx op1 = (GET_RTX_CLASS (GET_CODE (src)) == RTX_COMM_ARITH + ? XEXP (src, 1) : NULL); + + if (rtx_equal_p (dst, op0) + || GET_CODE (src) == PLUS || GET_CODE (src) == MINUS) + { + rtx ccreg = gen_rtx_REG (CCmode, CC_REGNUM); + rtx clobber = gen_rtx_CLOBBER (VOIDmode, ccreg); + rtvec vec = gen_rtvec (2, pat, clobber); + + PATTERN (insn) = gen_rtx_PARALLEL (VOIDmode, vec); + INSN_CODE (insn) = -1; + } + /* We can also handle a commutative operation where the + second operand matches the destination. */ + else if (op1 && rtx_equal_p (dst, op1)) + { + rtx ccreg = gen_rtx_REG (CCmode, CC_REGNUM); + rtx clobber = gen_rtx_CLOBBER (VOIDmode, ccreg); + rtvec vec; + + src = copy_rtx (src); + XEXP (src, 0) = op1; + XEXP (src, 1) = op0; + pat = gen_rtx_SET (VOIDmode, dst, src); + vec = gen_rtvec (2, pat, clobber); + PATTERN (insn) = gen_rtx_PARALLEL (VOIDmode, vec); + INSN_CODE (insn) = -1; + } + } + } + + if (NONDEBUG_INSN_P (insn)) + df_simulate_one_insn_backwards (bb, insn, &live); + } + } + + CLEAR_REG_SET (&live); +} + +/* Gcc puts the pool in the wrong place for ARM, since we can only + load addresses a limited distance around the pc. We do some + special munging to move the constant pool values to the correct + point in the code. */ +static void +arm_reorg (void) +{ + rtx insn; + HOST_WIDE_INT address = 0; + Mfix * fix; + + if (TARGET_THUMB2) + thumb2_reorg (); + + minipool_fix_head = minipool_fix_tail = NULL; + + /* The first insn must always be a note, or the code below won't + scan it properly. */ + insn = get_insns (); + gcc_assert (GET_CODE (insn) == NOTE); + minipool_pad = 0; + + /* Scan all the insns and record the operands that will need fixing. */ + for (insn = next_nonnote_insn (insn); insn; insn = next_nonnote_insn (insn)) + { + if (TARGET_CIRRUS_FIX_INVALID_INSNS + && (arm_cirrus_insn_p (insn) + || GET_CODE (insn) == JUMP_INSN + || arm_memory_load_p (insn))) + cirrus_reorg (insn); + + if (GET_CODE (insn) == BARRIER) + push_minipool_barrier (insn, address); + else if (INSN_P (insn)) + { + rtx table; + + note_invalid_constants (insn, address, true); + address += get_attr_length (insn); + + /* If the insn is a vector jump, add the size of the table + and skip the table. */ + if ((table = is_jump_table (insn)) != NULL) + { + address += get_jump_table_size (table); + insn = table; + } + } + } + + fix = minipool_fix_head; + + /* Now scan the fixups and perform the required changes. */ + while (fix) + { + Mfix * ftmp; + Mfix * fdel; + Mfix * last_added_fix; + Mfix * last_barrier = NULL; + Mfix * this_fix; + + /* Skip any further barriers before the next fix. */ + while (fix && GET_CODE (fix->insn) == BARRIER) + fix = fix->next; + + /* No more fixes. */ + if (fix == NULL) + break; + + last_added_fix = NULL; + + for (ftmp = fix; ftmp; ftmp = ftmp->next) + { + if (GET_CODE (ftmp->insn) == BARRIER) + { + if (ftmp->address >= minipool_vector_head->max_address) + break; + + last_barrier = ftmp; + } + else if ((ftmp->minipool = add_minipool_forward_ref (ftmp)) == NULL) + break; + + last_added_fix = ftmp; /* Keep track of the last fix added. */ + } + + /* If we found a barrier, drop back to that; any fixes that we + could have reached but come after the barrier will now go in + the next mini-pool. */ + if (last_barrier != NULL) + { + /* Reduce the refcount for those fixes that won't go into this + pool after all. */ + for (fdel = last_barrier->next; + fdel && fdel != ftmp; + fdel = fdel->next) + { + fdel->minipool->refcount--; + fdel->minipool = NULL; + } + + ftmp = last_barrier; + } + else + { + /* ftmp is first fix that we can't fit into this pool and + there no natural barriers that we could use. Insert a + new barrier in the code somewhere between the previous + fix and this one, and arrange to jump around it. */ + HOST_WIDE_INT max_address; + + /* The last item on the list of fixes must be a barrier, so + we can never run off the end of the list of fixes without + last_barrier being set. */ + gcc_assert (ftmp); + + max_address = minipool_vector_head->max_address; + /* Check that there isn't another fix that is in range that + we couldn't fit into this pool because the pool was + already too large: we need to put the pool before such an + instruction. The pool itself may come just after the + fix because create_fix_barrier also allows space for a + jump instruction. */ + if (ftmp->address < max_address) + max_address = ftmp->address + 1; + + last_barrier = create_fix_barrier (last_added_fix, max_address); + } + + assign_minipool_offsets (last_barrier); + + while (ftmp) + { + if (GET_CODE (ftmp->insn) != BARRIER + && ((ftmp->minipool = add_minipool_backward_ref (ftmp)) + == NULL)) + break; + + ftmp = ftmp->next; + } + + /* Scan over the fixes we have identified for this pool, fixing them + up and adding the constants to the pool itself. */ + for (this_fix = fix; this_fix && ftmp != this_fix; + this_fix = this_fix->next) + if (GET_CODE (this_fix->insn) != BARRIER) + { + rtx addr + = plus_constant (gen_rtx_LABEL_REF (VOIDmode, + minipool_vector_label), + this_fix->minipool->offset); + *this_fix->loc = gen_rtx_MEM (this_fix->mode, addr); + } + + dump_minipool (last_barrier->insn); + fix = ftmp; + } + + /* From now on we must synthesize any constants that we can't handle + directly. This can happen if the RTL gets split during final + instruction generation. */ + after_arm_reorg = 1; + + /* Free the minipool memory. */ + obstack_free (&minipool_obstack, minipool_startobj); +} + +/* Routines to output assembly language. */ + +/* If the rtx is the correct value then return the string of the number. + In this way we can ensure that valid double constants are generated even + when cross compiling. */ +const char * +fp_immediate_constant (rtx x) +{ + REAL_VALUE_TYPE r; + int i; + + if (!fp_consts_inited) + init_fp_table (); + + REAL_VALUE_FROM_CONST_DOUBLE (r, x); + for (i = 0; i < 8; i++) + if (REAL_VALUES_EQUAL (r, values_fp[i])) + return strings_fp[i]; + + gcc_unreachable (); +} + +/* As for fp_immediate_constant, but value is passed directly, not in rtx. */ +static const char * +fp_const_from_val (REAL_VALUE_TYPE *r) +{ + int i; + + if (!fp_consts_inited) + init_fp_table (); + + for (i = 0; i < 8; i++) + if (REAL_VALUES_EQUAL (*r, values_fp[i])) + return strings_fp[i]; + + gcc_unreachable (); +} + +/* Output the operands of a LDM/STM instruction to STREAM. + MASK is the ARM register set mask of which only bits 0-15 are important. + REG is the base register, either the frame pointer or the stack pointer, + INSTR is the possibly suffixed load or store instruction. + RFE is nonzero if the instruction should also copy spsr to cpsr. */ + +static void +print_multi_reg (FILE *stream, const char *instr, unsigned reg, + unsigned long mask, int rfe) +{ + unsigned i; + bool not_first = FALSE; + + gcc_assert (!rfe || (mask & (1 << PC_REGNUM))); + fputc ('\t', stream); + asm_fprintf (stream, instr, reg); + fputc ('{', stream); + + for (i = 0; i <= LAST_ARM_REGNUM; i++) + if (mask & (1 << i)) + { + if (not_first) + fprintf (stream, ", "); + + asm_fprintf (stream, "%r", i); + not_first = TRUE; + } + + if (rfe) + fprintf (stream, "}^\n"); + else + fprintf (stream, "}\n"); +} + + +/* Output a FLDMD instruction to STREAM. + BASE if the register containing the address. + REG and COUNT specify the register range. + Extra registers may be added to avoid hardware bugs. + + We output FLDMD even for ARMv5 VFP implementations. Although + FLDMD is technically not supported until ARMv6, it is believed + that all VFP implementations support its use in this context. */ + +static void +vfp_output_fldmd (FILE * stream, unsigned int base, int reg, int count) +{ + int i; + + /* Workaround ARM10 VFPr1 bug. */ + if (count == 2 && !arm_arch6) + { + if (reg == 15) + reg--; + count++; + } + + /* FLDMD may not load more than 16 doubleword registers at a time. Split the + load into multiple parts if we have to handle more than 16 registers. */ + if (count > 16) + { + vfp_output_fldmd (stream, base, reg, 16); + vfp_output_fldmd (stream, base, reg + 16, count - 16); + return; + } + + fputc ('\t', stream); + asm_fprintf (stream, "fldmfdd\t%r!, {", base); + + for (i = reg; i < reg + count; i++) + { + if (i > reg) + fputs (", ", stream); + asm_fprintf (stream, "d%d", i); + } + fputs ("}\n", stream); + +} + + +/* Output the assembly for a store multiple. */ + +const char * +vfp_output_fstmd (rtx * operands) +{ + char pattern[100]; + int p; + int base; + int i; + + strcpy (pattern, "fstmfdd\t%m0!, {%P1"); + p = strlen (pattern); + + gcc_assert (GET_CODE (operands[1]) == REG); + + base = (REGNO (operands[1]) - FIRST_VFP_REGNUM) / 2; + for (i = 1; i < XVECLEN (operands[2], 0); i++) + { + p += sprintf (&pattern[p], ", d%d", base + i); + } + strcpy (&pattern[p], "}"); + + output_asm_insn (pattern, operands); + return ""; +} + + +/* Emit RTL to save block of VFP register pairs to the stack. Returns the + number of bytes pushed. */ + +static int +vfp_emit_fstmd (int base_reg, int count) +{ + rtx par; + rtx dwarf; + rtx tmp, reg; + int i; + + /* Workaround ARM10 VFPr1 bug. Data corruption can occur when exactly two + register pairs are stored by a store multiple insn. We avoid this + by pushing an extra pair. */ + if (count == 2 && !arm_arch6) + { + if (base_reg == LAST_VFP_REGNUM - 3) + base_reg -= 2; + count++; + } + + /* FSTMD may not store more than 16 doubleword registers at once. Split + larger stores into multiple parts (up to a maximum of two, in + practice). */ + if (count > 16) + { + int saved; + /* NOTE: base_reg is an internal register number, so each D register + counts as 2. */ + saved = vfp_emit_fstmd (base_reg + 32, count - 16); + saved += vfp_emit_fstmd (base_reg, 16); + return saved; + } + + par = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (count)); + dwarf = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (count + 1)); + + reg = gen_rtx_REG (DFmode, base_reg); + base_reg += 2; + + XVECEXP (par, 0, 0) + = gen_rtx_SET (VOIDmode, + gen_frame_mem + (BLKmode, + gen_rtx_PRE_MODIFY (Pmode, + stack_pointer_rtx, + plus_constant + (stack_pointer_rtx, + - (count * 8))) + ), + gen_rtx_UNSPEC (BLKmode, + gen_rtvec (1, reg), + UNSPEC_PUSH_MULT)); + + tmp = gen_rtx_SET (VOIDmode, stack_pointer_rtx, + plus_constant (stack_pointer_rtx, -(count * 8))); + RTX_FRAME_RELATED_P (tmp) = 1; + XVECEXP (dwarf, 0, 0) = tmp; + + tmp = gen_rtx_SET (VOIDmode, + gen_frame_mem (DFmode, stack_pointer_rtx), + reg); + RTX_FRAME_RELATED_P (tmp) = 1; + XVECEXP (dwarf, 0, 1) = tmp; + + for (i = 1; i < count; i++) + { + reg = gen_rtx_REG (DFmode, base_reg); + base_reg += 2; + XVECEXP (par, 0, i) = gen_rtx_USE (VOIDmode, reg); + + tmp = gen_rtx_SET (VOIDmode, + gen_frame_mem (DFmode, + plus_constant (stack_pointer_rtx, + i * 8)), + reg); + RTX_FRAME_RELATED_P (tmp) = 1; + XVECEXP (dwarf, 0, i + 1) = tmp; + } + + par = emit_insn (par); + add_reg_note (par, REG_FRAME_RELATED_EXPR, dwarf); + RTX_FRAME_RELATED_P (par) = 1; + + return count * 8; +} + +/* Emit a call instruction with pattern PAT. ADDR is the address of + the call target. */ + +void +arm_emit_call_insn (rtx pat, rtx addr) +{ + rtx insn; + + insn = emit_call_insn (pat); + + /* The PIC register is live on entry to VxWorks PIC PLT entries. + If the call might use such an entry, add a use of the PIC register + to the instruction's CALL_INSN_FUNCTION_USAGE. */ + if (TARGET_VXWORKS_RTP + && flag_pic + && GET_CODE (addr) == SYMBOL_REF + && (SYMBOL_REF_DECL (addr) + ? !targetm.binds_local_p (SYMBOL_REF_DECL (addr)) + : !SYMBOL_REF_LOCAL_P (addr))) + { + require_pic_register (); + use_reg (&CALL_INSN_FUNCTION_USAGE (insn), cfun->machine->pic_reg); + } +} + +/* Output a 'call' insn. */ +const char * +output_call (rtx *operands) +{ + gcc_assert (!arm_arch5); /* Patterns should call blx directly. */ + + /* Handle calls to lr using ip (which may be clobbered in subr anyway). */ + if (REGNO (operands[0]) == LR_REGNUM) + { + operands[0] = gen_rtx_REG (SImode, IP_REGNUM); + output_asm_insn ("mov%?\t%0, %|lr", operands); + } + + output_asm_insn ("mov%?\t%|lr, %|pc", operands); + + if (TARGET_INTERWORK || arm_arch4t) + output_asm_insn ("bx%?\t%0", operands); + else + output_asm_insn ("mov%?\t%|pc, %0", operands); + + return ""; +} + +/* Output a 'call' insn that is a reference in memory. This is + disabled for ARMv5 and we prefer a blx instead because otherwise + there's a significant performance overhead. */ +const char * +output_call_mem (rtx *operands) +{ + gcc_assert (!arm_arch5); + if (TARGET_INTERWORK) + { + output_asm_insn ("ldr%?\t%|ip, %0", operands); + output_asm_insn ("mov%?\t%|lr, %|pc", operands); + output_asm_insn ("bx%?\t%|ip", operands); + } + else if (regno_use_in (LR_REGNUM, operands[0])) + { + /* LR is used in the memory address. We load the address in the + first instruction. It's safe to use IP as the target of the + load since the call will kill it anyway. */ + output_asm_insn ("ldr%?\t%|ip, %0", operands); + output_asm_insn ("mov%?\t%|lr, %|pc", operands); + if (arm_arch4t) + output_asm_insn ("bx%?\t%|ip", operands); + else + output_asm_insn ("mov%?\t%|pc, %|ip", operands); + } + else + { + output_asm_insn ("mov%?\t%|lr, %|pc", operands); + output_asm_insn ("ldr%?\t%|pc, %0", operands); + } + + return ""; +} + + +/* Output a move from arm registers to an fpa registers. + OPERANDS[0] is an fpa register. + OPERANDS[1] is the first registers of an arm register pair. */ +const char * +output_mov_long_double_fpa_from_arm (rtx *operands) +{ + int arm_reg0 = REGNO (operands[1]); + rtx ops[3]; + + gcc_assert (arm_reg0 != IP_REGNUM); + + ops[0] = gen_rtx_REG (SImode, arm_reg0); + ops[1] = gen_rtx_REG (SImode, 1 + arm_reg0); + ops[2] = gen_rtx_REG (SImode, 2 + arm_reg0); + + output_asm_insn ("stm%(fd%)\t%|sp!, {%0, %1, %2}", ops); + output_asm_insn ("ldf%?e\t%0, [%|sp], #12", operands); + + return ""; +} + +/* Output a move from an fpa register to arm registers. + OPERANDS[0] is the first registers of an arm register pair. + OPERANDS[1] is an fpa register. */ +const char * +output_mov_long_double_arm_from_fpa (rtx *operands) +{ + int arm_reg0 = REGNO (operands[0]); + rtx ops[3]; + + gcc_assert (arm_reg0 != IP_REGNUM); + + ops[0] = gen_rtx_REG (SImode, arm_reg0); + ops[1] = gen_rtx_REG (SImode, 1 + arm_reg0); + ops[2] = gen_rtx_REG (SImode, 2 + arm_reg0); + + output_asm_insn ("stf%?e\t%1, [%|sp, #-12]!", operands); + output_asm_insn ("ldm%(fd%)\t%|sp!, {%0, %1, %2}", ops); + return ""; +} + +/* Output a move from arm registers to arm registers of a long double + OPERANDS[0] is the destination. + OPERANDS[1] is the source. */ +const char * +output_mov_long_double_arm_from_arm (rtx *operands) +{ + /* We have to be careful here because the two might overlap. */ + int dest_start = REGNO (operands[0]); + int src_start = REGNO (operands[1]); + rtx ops[2]; + int i; + + if (dest_start < src_start) + { + for (i = 0; i < 3; i++) + { + ops[0] = gen_rtx_REG (SImode, dest_start + i); + ops[1] = gen_rtx_REG (SImode, src_start + i); + output_asm_insn ("mov%?\t%0, %1", ops); + } + } + else + { + for (i = 2; i >= 0; i--) + { + ops[0] = gen_rtx_REG (SImode, dest_start + i); + ops[1] = gen_rtx_REG (SImode, src_start + i); + output_asm_insn ("mov%?\t%0, %1", ops); + } + } + + return ""; +} + +void +arm_emit_movpair (rtx dest, rtx src) + { + /* If the src is an immediate, simplify it. */ + if (CONST_INT_P (src)) + { + HOST_WIDE_INT val = INTVAL (src); + emit_set_insn (dest, GEN_INT (val & 0x0000ffff)); + if ((val >> 16) & 0x0000ffff) + emit_set_insn (gen_rtx_ZERO_EXTRACT (SImode, dest, GEN_INT (16), + GEN_INT (16)), + GEN_INT ((val >> 16) & 0x0000ffff)); + return; + } + emit_set_insn (dest, gen_rtx_HIGH (SImode, src)); + emit_set_insn (dest, gen_rtx_LO_SUM (SImode, dest, src)); + } + +/* Output a move from arm registers to an fpa registers. + OPERANDS[0] is an fpa register. + OPERANDS[1] is the first registers of an arm register pair. */ +const char * +output_mov_double_fpa_from_arm (rtx *operands) +{ + int arm_reg0 = REGNO (operands[1]); + rtx ops[2]; + + gcc_assert (arm_reg0 != IP_REGNUM); + + ops[0] = gen_rtx_REG (SImode, arm_reg0); + ops[1] = gen_rtx_REG (SImode, 1 + arm_reg0); + output_asm_insn ("stm%(fd%)\t%|sp!, {%0, %1}", ops); + output_asm_insn ("ldf%?d\t%0, [%|sp], #8", operands); + return ""; +} + +/* Output a move from an fpa register to arm registers. + OPERANDS[0] is the first registers of an arm register pair. + OPERANDS[1] is an fpa register. */ +const char * +output_mov_double_arm_from_fpa (rtx *operands) +{ + int arm_reg0 = REGNO (operands[0]); + rtx ops[2]; + + gcc_assert (arm_reg0 != IP_REGNUM); + + ops[0] = gen_rtx_REG (SImode, arm_reg0); + ops[1] = gen_rtx_REG (SImode, 1 + arm_reg0); + output_asm_insn ("stf%?d\t%1, [%|sp, #-8]!", operands); + output_asm_insn ("ldm%(fd%)\t%|sp!, {%0, %1}", ops); + return ""; +} + +/* Output a move between double words. It must be REG<-MEM + or MEM<-REG. */ +const char * +output_move_double (rtx *operands) +{ + enum rtx_code code0 = GET_CODE (operands[0]); + enum rtx_code code1 = GET_CODE (operands[1]); + rtx otherops[3]; + + if (code0 == REG) + { + unsigned int reg0 = REGNO (operands[0]); + + otherops[0] = gen_rtx_REG (SImode, 1 + reg0); + + gcc_assert (code1 == MEM); /* Constraints should ensure this. */ + + switch (GET_CODE (XEXP (operands[1], 0))) + { + case REG: + if (TARGET_LDRD + && !(fix_cm3_ldrd && reg0 == REGNO(XEXP (operands[1], 0)))) + output_asm_insn ("ldr%(d%)\t%0, [%m1]", operands); + else + output_asm_insn ("ldm%(ia%)\t%m1, %M0", operands); + break; + + case PRE_INC: + gcc_assert (TARGET_LDRD); + output_asm_insn ("ldr%(d%)\t%0, [%m1, #8]!", operands); + break; + + case PRE_DEC: + if (TARGET_LDRD) + output_asm_insn ("ldr%(d%)\t%0, [%m1, #-8]!", operands); + else + output_asm_insn ("ldm%(db%)\t%m1!, %M0", operands); + break; + + case POST_INC: + if (TARGET_LDRD) + output_asm_insn ("ldr%(d%)\t%0, [%m1], #8", operands); + else + output_asm_insn ("ldm%(ia%)\t%m1!, %M0", operands); + break; + + case POST_DEC: + gcc_assert (TARGET_LDRD); + output_asm_insn ("ldr%(d%)\t%0, [%m1], #-8", operands); + break; + + case PRE_MODIFY: + case POST_MODIFY: + /* Autoicrement addressing modes should never have overlapping + base and destination registers, and overlapping index registers + are already prohibited, so this doesn't need to worry about + fix_cm3_ldrd. */ + otherops[0] = operands[0]; + otherops[1] = XEXP (XEXP (XEXP (operands[1], 0), 1), 0); + otherops[2] = XEXP (XEXP (XEXP (operands[1], 0), 1), 1); + + if (GET_CODE (XEXP (operands[1], 0)) == PRE_MODIFY) + { + if (reg_overlap_mentioned_p (otherops[0], otherops[2])) + { + /* Registers overlap so split out the increment. */ + output_asm_insn ("add%?\t%1, %1, %2", otherops); + output_asm_insn ("ldr%(d%)\t%0, [%1] @split", otherops); + } + else + { + /* Use a single insn if we can. + FIXME: IWMMXT allows offsets larger than ldrd can + handle, fix these up with a pair of ldr. */ + if (TARGET_THUMB2 + || GET_CODE (otherops[2]) != CONST_INT + || (INTVAL (otherops[2]) > -256 + && INTVAL (otherops[2]) < 256)) + output_asm_insn ("ldr%(d%)\t%0, [%1, %2]!", otherops); + else + { + output_asm_insn ("ldr%?\t%0, [%1, %2]!", otherops); + output_asm_insn ("ldr%?\t%H0, [%1, #4]", otherops); + } + } + } + else + { + /* Use a single insn if we can. + FIXME: IWMMXT allows offsets larger than ldrd can handle, + fix these up with a pair of ldr. */ + if (TARGET_THUMB2 + || GET_CODE (otherops[2]) != CONST_INT + || (INTVAL (otherops[2]) > -256 + && INTVAL (otherops[2]) < 256)) + output_asm_insn ("ldr%(d%)\t%0, [%1], %2", otherops); + else + { + output_asm_insn ("ldr%?\t%H0, [%1, #4]", otherops); + output_asm_insn ("ldr%?\t%0, [%1], %2", otherops); + } + } + break; + + case LABEL_REF: + case CONST: + /* We might be able to use ldrd %0, %1 here. However the range is + different to ldr/adr, and it is broken on some ARMv7-M + implementations. */ + /* Use the second register of the pair to avoid problematic + overlap. */ + otherops[1] = operands[1]; + output_asm_insn ("adr%?\t%0, %1", otherops); + operands[1] = otherops[0]; + if (TARGET_LDRD) + output_asm_insn ("ldr%(d%)\t%0, [%1]", operands); + else + output_asm_insn ("ldm%(ia%)\t%1, %M0", operands); + break; + + /* ??? This needs checking for thumb2. */ + default: + if (arm_add_operand (XEXP (XEXP (operands[1], 0), 1), + GET_MODE (XEXP (XEXP (operands[1], 0), 1)))) + { + otherops[0] = operands[0]; + otherops[1] = XEXP (XEXP (operands[1], 0), 0); + otherops[2] = XEXP (XEXP (operands[1], 0), 1); + + if (GET_CODE (XEXP (operands[1], 0)) == PLUS) + { + if (GET_CODE (otherops[2]) == CONST_INT && !TARGET_LDRD) + { + switch ((int) INTVAL (otherops[2])) + { + case -8: + output_asm_insn ("ldm%(db%)\t%1, %M0", otherops); + return ""; + case -4: + if (TARGET_THUMB2) + break; + output_asm_insn ("ldm%(da%)\t%1, %M0", otherops); + return ""; + case 4: + if (TARGET_THUMB2) + break; + output_asm_insn ("ldm%(ib%)\t%1, %M0", otherops); + return ""; + } + } + otherops[0] = gen_rtx_REG(SImode, REGNO(operands[0]) + 1); + operands[1] = otherops[0]; + if (TARGET_LDRD + && (GET_CODE (otherops[2]) == REG + || TARGET_THUMB2 + || (GET_CODE (otherops[2]) == CONST_INT + && INTVAL (otherops[2]) > -256 + && INTVAL (otherops[2]) < 256))) + { + if (reg_overlap_mentioned_p (operands[0], + otherops[2])) + { + rtx tmp; + /* Swap base and index registers over to + avoid a conflict. */ + tmp = otherops[1]; + otherops[1] = otherops[2]; + otherops[2] = tmp; + } + /* If both registers conflict, it will usually + have been fixed by a splitter. */ + if (reg_overlap_mentioned_p (operands[0], otherops[2]) + || (fix_cm3_ldrd && reg0 == REGNO (otherops[1]))) + { + output_asm_insn ("add%?\t%0, %1, %2", otherops); + output_asm_insn ("ldr%(d%)\t%0, [%1]", operands); + } + else + { + otherops[0] = operands[0]; + output_asm_insn ("ldr%(d%)\t%0, [%1, %2]", otherops); + } + return ""; + } + + if (GET_CODE (otherops[2]) == CONST_INT) + { + if (!(const_ok_for_arm (INTVAL (otherops[2])))) + output_asm_insn ("sub%?\t%0, %1, #%n2", otherops); + else + output_asm_insn ("add%?\t%0, %1, %2", otherops); + } + else + output_asm_insn ("add%?\t%0, %1, %2", otherops); + } + else + output_asm_insn ("sub%?\t%0, %1, %2", otherops); + + if (TARGET_LDRD) + return "ldr%(d%)\t%0, [%1]"; + + return "ldm%(ia%)\t%1, %M0"; + } + else + { + otherops[1] = adjust_address (operands[1], SImode, 4); + /* Take care of overlapping base/data reg. */ + if (reg_mentioned_p (operands[0], operands[1])) + { + output_asm_insn ("ldr%?\t%0, %1", otherops); + output_asm_insn ("ldr%?\t%0, %1", operands); + } + else + { + output_asm_insn ("ldr%?\t%0, %1", operands); + output_asm_insn ("ldr%?\t%0, %1", otherops); + } + } + } + } + else + { + /* Constraints should ensure this. */ + gcc_assert (code0 == MEM && code1 == REG); + gcc_assert (REGNO (operands[1]) != IP_REGNUM); + + switch (GET_CODE (XEXP (operands[0], 0))) + { + case REG: + if (TARGET_LDRD) + output_asm_insn ("str%(d%)\t%1, [%m0]", operands); + else + output_asm_insn ("stm%(ia%)\t%m0, %M1", operands); + break; + + case PRE_INC: + gcc_assert (TARGET_LDRD); + output_asm_insn ("str%(d%)\t%1, [%m0, #8]!", operands); + break; + + case PRE_DEC: + if (TARGET_LDRD) + output_asm_insn ("str%(d%)\t%1, [%m0, #-8]!", operands); + else + output_asm_insn ("stm%(db%)\t%m0!, %M1", operands); + break; + + case POST_INC: + if (TARGET_LDRD) + output_asm_insn ("str%(d%)\t%1, [%m0], #8", operands); + else + output_asm_insn ("stm%(ia%)\t%m0!, %M1", operands); + break; + + case POST_DEC: + gcc_assert (TARGET_LDRD); + output_asm_insn ("str%(d%)\t%1, [%m0], #-8", operands); + break; + + case PRE_MODIFY: + case POST_MODIFY: + otherops[0] = operands[1]; + otherops[1] = XEXP (XEXP (XEXP (operands[0], 0), 1), 0); + otherops[2] = XEXP (XEXP (XEXP (operands[0], 0), 1), 1); + + /* IWMMXT allows offsets larger than ldrd can handle, + fix these up with a pair of ldr. */ + if (!TARGET_THUMB2 + && GET_CODE (otherops[2]) == CONST_INT + && (INTVAL(otherops[2]) <= -256 + || INTVAL(otherops[2]) >= 256)) + { + if (GET_CODE (XEXP (operands[0], 0)) == PRE_MODIFY) + { + output_asm_insn ("str%?\t%0, [%1, %2]!", otherops); + output_asm_insn ("str%?\t%H0, [%1, #4]", otherops); + } + else + { + output_asm_insn ("str%?\t%H0, [%1, #4]", otherops); + output_asm_insn ("str%?\t%0, [%1], %2", otherops); + } + } + else if (GET_CODE (XEXP (operands[0], 0)) == PRE_MODIFY) + output_asm_insn ("str%(d%)\t%0, [%1, %2]!", otherops); + else + output_asm_insn ("str%(d%)\t%0, [%1], %2", otherops); + break; + + case PLUS: + otherops[2] = XEXP (XEXP (operands[0], 0), 1); + if (GET_CODE (otherops[2]) == CONST_INT && !TARGET_LDRD) + { + switch ((int) INTVAL (XEXP (XEXP (operands[0], 0), 1))) + { + case -8: + output_asm_insn ("stm%(db%)\t%m0, %M1", operands); + return ""; + + case -4: + if (TARGET_THUMB2) + break; + output_asm_insn ("stm%(da%)\t%m0, %M1", operands); + return ""; + + case 4: + if (TARGET_THUMB2) + break; + output_asm_insn ("stm%(ib%)\t%m0, %M1", operands); + return ""; + } + } + if (TARGET_LDRD + && (GET_CODE (otherops[2]) == REG + || TARGET_THUMB2 + || (GET_CODE (otherops[2]) == CONST_INT + && INTVAL (otherops[2]) > -256 + && INTVAL (otherops[2]) < 256))) + { + otherops[0] = operands[1]; + otherops[1] = XEXP (XEXP (operands[0], 0), 0); + output_asm_insn ("str%(d%)\t%0, [%1, %2]", otherops); + return ""; + } + /* Fall through */ + + default: + otherops[0] = adjust_address (operands[0], SImode, 4); + otherops[1] = operands[1]; + output_asm_insn ("str%?\t%1, %0", operands); + output_asm_insn ("str%?\t%H1, %0", otherops); + } + } + + return ""; +} + +/* Output a move, load or store for quad-word vectors in ARM registers. Only + handles MEMs accepted by neon_vector_mem_operand with TYPE=1. */ + +const char * +output_move_quad (rtx *operands) +{ + if (REG_P (operands[0])) + { + /* Load, or reg->reg move. */ + + if (MEM_P (operands[1])) + { + switch (GET_CODE (XEXP (operands[1], 0))) + { + case REG: + output_asm_insn ("ldm%(ia%)\t%m1, %M0", operands); + break; + + case LABEL_REF: + case CONST: + output_asm_insn ("adr%?\t%0, %1", operands); + output_asm_insn ("ldm%(ia%)\t%0, %M0", operands); + break; + + default: + gcc_unreachable (); + } + } + else + { + rtx ops[2]; + int dest, src, i; + + gcc_assert (REG_P (operands[1])); + + dest = REGNO (operands[0]); + src = REGNO (operands[1]); + + /* This seems pretty dumb, but hopefully GCC won't try to do it + very often. */ + if (dest < src) + for (i = 0; i < 4; i++) + { + ops[0] = gen_rtx_REG (SImode, dest + i); + ops[1] = gen_rtx_REG (SImode, src + i); + output_asm_insn ("mov%?\t%0, %1", ops); + } + else + for (i = 3; i >= 0; i--) + { + ops[0] = gen_rtx_REG (SImode, dest + i); + ops[1] = gen_rtx_REG (SImode, src + i); + output_asm_insn ("mov%?\t%0, %1", ops); + } + } + } + else + { + gcc_assert (MEM_P (operands[0])); + gcc_assert (REG_P (operands[1])); + gcc_assert (!reg_overlap_mentioned_p (operands[1], operands[0])); + + switch (GET_CODE (XEXP (operands[0], 0))) + { + case REG: + output_asm_insn ("stm%(ia%)\t%m0, %M1", operands); + break; + + default: + gcc_unreachable (); + } + } + + return ""; +} + +/* Output a VFP load or store instruction. */ + +const char * +output_move_vfp (rtx *operands) +{ + rtx reg, mem, addr, ops[2]; + int load = REG_P (operands[0]); + int dp = GET_MODE_SIZE (GET_MODE (operands[0])) == 8; + int integer_p = GET_MODE_CLASS (GET_MODE (operands[0])) == MODE_INT; + const char *templ; + char buff[50]; + enum machine_mode mode; + + reg = operands[!load]; + mem = operands[load]; + + mode = GET_MODE (reg); + + gcc_assert (REG_P (reg)); + gcc_assert (IS_VFP_REGNUM (REGNO (reg))); + gcc_assert (mode == SFmode + || mode == DFmode + || mode == SImode + || mode == DImode + || (TARGET_NEON && VALID_NEON_DREG_MODE (mode))); + gcc_assert (MEM_P (mem)); + + addr = XEXP (mem, 0); + + switch (GET_CODE (addr)) + { + case PRE_DEC: + templ = "f%smdb%c%%?\t%%0!, {%%%s1}%s"; + ops[0] = XEXP (addr, 0); + ops[1] = reg; + break; + + case POST_INC: + templ = "f%smia%c%%?\t%%0!, {%%%s1}%s"; + ops[0] = XEXP (addr, 0); + ops[1] = reg; + break; + + default: + templ = "f%s%c%%?\t%%%s0, %%1%s"; + ops[0] = reg; + ops[1] = mem; + break; + } + + sprintf (buff, templ, + load ? "ld" : "st", + dp ? 'd' : 's', + dp ? "P" : "", + integer_p ? "\t%@ int" : ""); + output_asm_insn (buff, ops); + + return ""; +} + +/* Output a Neon quad-word load or store, or a load or store for + larger structure modes. + + WARNING: The ordering of elements is weird in big-endian mode, + because we use VSTM, as required by the EABI. GCC RTL defines + element ordering based on in-memory order. This can be differ + from the architectural ordering of elements within a NEON register. + The intrinsics defined in arm_neon.h use the NEON register element + ordering, not the GCC RTL element ordering. + + For example, the in-memory ordering of a big-endian a quadword + vector with 16-bit elements when stored from register pair {d0,d1} + will be (lowest address first, d0[N] is NEON register element N): + + [d0[3], d0[2], d0[1], d0[0], d1[7], d1[6], d1[5], d1[4]] + + When necessary, quadword registers (dN, dN+1) are moved to ARM + registers from rN in the order: + + dN -> (rN+1, rN), dN+1 -> (rN+3, rN+2) + + So that STM/LDM can be used on vectors in ARM registers, and the + same memory layout will result as if VSTM/VLDM were used. */ + +const char * +output_move_neon (rtx *operands) +{ + rtx reg, mem, addr, ops[2]; + int regno, load = REG_P (operands[0]); + const char *templ; + char buff[50]; + enum machine_mode mode; + + reg = operands[!load]; + mem = operands[load]; + + mode = GET_MODE (reg); + + gcc_assert (REG_P (reg)); + regno = REGNO (reg); + gcc_assert (VFP_REGNO_OK_FOR_DOUBLE (regno) + || NEON_REGNO_OK_FOR_QUAD (regno)); + gcc_assert (VALID_NEON_DREG_MODE (mode) + || VALID_NEON_QREG_MODE (mode) + || VALID_NEON_STRUCT_MODE (mode)); + gcc_assert (MEM_P (mem)); + + addr = XEXP (mem, 0); + + /* Strip off const from addresses like (const (plus (...))). */ + if (GET_CODE (addr) == CONST && GET_CODE (XEXP (addr, 0)) == PLUS) + addr = XEXP (addr, 0); + + switch (GET_CODE (addr)) + { + case POST_INC: + templ = "v%smia%%?\t%%0!, %%h1"; + ops[0] = XEXP (addr, 0); + ops[1] = reg; + break; + + case PRE_DEC: + /* FIXME: We should be using vld1/vst1 here in BE mode? */ + templ = "v%smdb%%?\t%%0!, %%h1"; + ops[0] = XEXP (addr, 0); + ops[1] = reg; + break; + + case POST_MODIFY: + /* FIXME: Not currently enabled in neon_vector_mem_operand. */ + gcc_unreachable (); + + case LABEL_REF: + case PLUS: + { + int nregs = HARD_REGNO_NREGS (REGNO (reg), mode) / 2; + int i; + int overlap = -1; + for (i = 0; i < nregs; i++) + { + /* We're only using DImode here because it's a convenient size. */ + ops[0] = gen_rtx_REG (DImode, REGNO (reg) + 2 * i); + ops[1] = adjust_address (mem, DImode, 8 * i); + if (reg_overlap_mentioned_p (ops[0], mem)) + { + gcc_assert (overlap == -1); + overlap = i; + } + else + { + sprintf (buff, "v%sr%%?\t%%P0, %%1", load ? "ld" : "st"); + output_asm_insn (buff, ops); + } + } + if (overlap != -1) + { + ops[0] = gen_rtx_REG (DImode, REGNO (reg) + 2 * overlap); + ops[1] = adjust_address (mem, SImode, 8 * overlap); + sprintf (buff, "v%sr%%?\t%%P0, %%1", load ? "ld" : "st"); + output_asm_insn (buff, ops); + } + + return ""; + } + + default: + templ = "v%smia%%?\t%%m0, %%h1"; + ops[0] = mem; + ops[1] = reg; + } + + sprintf (buff, templ, load ? "ld" : "st"); + output_asm_insn (buff, ops); + + return ""; +} + +/* Compute and return the length of neon_mov, where is + one of VSTRUCT modes: EI, OI, CI or XI. */ +int +arm_attr_length_move_neon (rtx insn) +{ + rtx reg, mem, addr; + int load; + enum machine_mode mode; + + extract_insn_cached (insn); + + if (REG_P (recog_data.operand[0]) && REG_P (recog_data.operand[1])) + { + mode = GET_MODE (recog_data.operand[0]); + switch (mode) + { + case EImode: + case OImode: + return 8; + case CImode: + return 12; + case XImode: + return 16; + default: + gcc_unreachable (); + } + } + + load = REG_P (recog_data.operand[0]); + reg = recog_data.operand[!load]; + mem = recog_data.operand[load]; + + gcc_assert (MEM_P (mem)); + + mode = GET_MODE (reg); + addr = XEXP (mem, 0); + + /* Strip off const from addresses like (const (plus (...))). */ + if (GET_CODE (addr) == CONST && GET_CODE (XEXP (addr, 0)) == PLUS) + addr = XEXP (addr, 0); + + if (GET_CODE (addr) == LABEL_REF || GET_CODE (addr) == PLUS) + { + int insns = HARD_REGNO_NREGS (REGNO (reg), mode) / 2; + return insns * 4; + } + else + return 4; +} + +/* Return nonzero if the offset in the address is an immediate. Otherwise, + return zero. */ + +int +arm_address_offset_is_imm (rtx insn) +{ + rtx mem, addr; + + extract_insn_cached (insn); + + if (REG_P (recog_data.operand[0])) + return 0; + + mem = recog_data.operand[0]; + + gcc_assert (MEM_P (mem)); + + addr = XEXP (mem, 0); + + if (GET_CODE (addr) == REG + || (GET_CODE (addr) == PLUS + && GET_CODE (XEXP (addr, 0)) == REG + && GET_CODE (XEXP (addr, 1)) == CONST_INT)) + return 1; + else + return 0; +} + +/* Output an ADD r, s, #n where n may be too big for one instruction. + If adding zero to one register, output nothing. */ +const char * +output_add_immediate (rtx *operands) +{ + HOST_WIDE_INT n = INTVAL (operands[2]); + + if (n != 0 || REGNO (operands[0]) != REGNO (operands[1])) + { + if (n < 0) + output_multi_immediate (operands, + "sub%?\t%0, %1, %2", "sub%?\t%0, %0, %2", 2, + -n); + else + output_multi_immediate (operands, + "add%?\t%0, %1, %2", "add%?\t%0, %0, %2", 2, + n); + } + + return ""; +} + +/* Output a multiple immediate operation. + OPERANDS is the vector of operands referred to in the output patterns. + INSTR1 is the output pattern to use for the first constant. + INSTR2 is the output pattern to use for subsequent constants. + IMMED_OP is the index of the constant slot in OPERANDS. + N is the constant value. */ +static const char * +output_multi_immediate (rtx *operands, const char *instr1, const char *instr2, + int immed_op, HOST_WIDE_INT n) +{ +#if HOST_BITS_PER_WIDE_INT > 32 + n &= 0xffffffff; +#endif + + if (n == 0) + { + /* Quick and easy output. */ + operands[immed_op] = const0_rtx; + output_asm_insn (instr1, operands); + } + else + { + int i; + const char * instr = instr1; + + /* Note that n is never zero here (which would give no output). */ + for (i = 0; i < 32; i += 2) + { + if (n & (3 << i)) + { + operands[immed_op] = GEN_INT (n & (255 << i)); + output_asm_insn (instr, operands); + instr = instr2; + i += 6; + } + } + } + + return ""; +} + +/* Return the name of a shifter operation. */ +static const char * +arm_shift_nmem(enum rtx_code code) +{ + switch (code) + { + case ASHIFT: + return ARM_LSL_NAME; + + case ASHIFTRT: + return "asr"; + + case LSHIFTRT: + return "lsr"; + + case ROTATERT: + return "ror"; + + default: + abort(); + } +} + +/* Return the appropriate ARM instruction for the operation code. + The returned result should not be overwritten. OP is the rtx of the + operation. SHIFT_FIRST_ARG is TRUE if the first argument of the operator + was shifted. */ +const char * +arithmetic_instr (rtx op, int shift_first_arg) +{ + switch (GET_CODE (op)) + { + case PLUS: + return "add"; + + case MINUS: + return shift_first_arg ? "rsb" : "sub"; + + case IOR: + return "orr"; + + case XOR: + return "eor"; + + case AND: + return "and"; + + case ASHIFT: + case ASHIFTRT: + case LSHIFTRT: + case ROTATERT: + return arm_shift_nmem(GET_CODE(op)); + + default: + gcc_unreachable (); + } +} + +/* Ensure valid constant shifts and return the appropriate shift mnemonic + for the operation code. The returned result should not be overwritten. + OP is the rtx code of the shift. + On exit, *AMOUNTP will be -1 if the shift is by a register, or a constant + shift. */ +static const char * +shift_op (rtx op, HOST_WIDE_INT *amountp) +{ + const char * mnem; + enum rtx_code code = GET_CODE (op); + + switch (GET_CODE (XEXP (op, 1))) + { + case REG: + case SUBREG: + *amountp = -1; + break; + + case CONST_INT: + *amountp = INTVAL (XEXP (op, 1)); + break; + + default: + gcc_unreachable (); + } + + switch (code) + { + case ROTATE: + gcc_assert (*amountp != -1); + *amountp = 32 - *amountp; + code = ROTATERT; + + /* Fall through. */ + + case ASHIFT: + case ASHIFTRT: + case LSHIFTRT: + case ROTATERT: + mnem = arm_shift_nmem(code); + break; + + case MULT: + /* We never have to worry about the amount being other than a + power of 2, since this case can never be reloaded from a reg. */ + gcc_assert (*amountp != -1); + *amountp = int_log2 (*amountp); + return ARM_LSL_NAME; + + default: + gcc_unreachable (); + } + + if (*amountp != -1) + { + /* This is not 100% correct, but follows from the desire to merge + multiplication by a power of 2 with the recognizer for a + shift. >=32 is not a valid shift for "lsl", so we must try and + output a shift that produces the correct arithmetical result. + Using lsr #32 is identical except for the fact that the carry bit + is not set correctly if we set the flags; but we never use the + carry bit from such an operation, so we can ignore that. */ + if (code == ROTATERT) + /* Rotate is just modulo 32. */ + *amountp &= 31; + else if (*amountp != (*amountp & 31)) + { + if (code == ASHIFT) + mnem = "lsr"; + *amountp = 32; + } + + /* Shifts of 0 are no-ops. */ + if (*amountp == 0) + return NULL; + } + + return mnem; +} + +/* Obtain the shift from the POWER of two. */ + +static HOST_WIDE_INT +int_log2 (HOST_WIDE_INT power) +{ + HOST_WIDE_INT shift = 0; + + while ((((HOST_WIDE_INT) 1 << shift) & power) == 0) + { + gcc_assert (shift <= 31); + shift++; + } + + return shift; +} + +/* Output a .ascii pseudo-op, keeping track of lengths. This is + because /bin/as is horribly restrictive. The judgement about + whether or not each character is 'printable' (and can be output as + is) or not (and must be printed with an octal escape) must be made + with reference to the *host* character set -- the situation is + similar to that discussed in the comments above pp_c_char in + c-pretty-print.c. */ + +#define MAX_ASCII_LEN 51 + +void +output_ascii_pseudo_op (FILE *stream, const unsigned char *p, int len) +{ + int i; + int len_so_far = 0; + + fputs ("\t.ascii\t\"", stream); + + for (i = 0; i < len; i++) + { + int c = p[i]; + + if (len_so_far >= MAX_ASCII_LEN) + { + fputs ("\"\n\t.ascii\t\"", stream); + len_so_far = 0; + } + + if (ISPRINT (c)) + { + if (c == '\\' || c == '\"') + { + putc ('\\', stream); + len_so_far++; + } + putc (c, stream); + len_so_far++; + } + else + { + fprintf (stream, "\\%03o", c); + len_so_far += 4; + } + } + + fputs ("\"\n", stream); +} + +/* Compute the register save mask for registers 0 through 12 + inclusive. This code is used by arm_compute_save_reg_mask. */ + +static unsigned long +arm_compute_save_reg0_reg12_mask (void) +{ + unsigned long func_type = arm_current_func_type (); + unsigned long save_reg_mask = 0; + unsigned int reg; + + if (IS_INTERRUPT (func_type)) + { + unsigned int max_reg; + /* Interrupt functions must not corrupt any registers, + even call clobbered ones. If this is a leaf function + we can just examine the registers used by the RTL, but + otherwise we have to assume that whatever function is + called might clobber anything, and so we have to save + all the call-clobbered registers as well. */ + if (ARM_FUNC_TYPE (func_type) == ARM_FT_FIQ) + /* FIQ handlers have registers r8 - r12 banked, so + we only need to check r0 - r7, Normal ISRs only + bank r14 and r15, so we must check up to r12. + r13 is the stack pointer which is always preserved, + so we do not need to consider it here. */ + max_reg = 7; + else + max_reg = 12; + + for (reg = 0; reg <= max_reg; reg++) + if (df_regs_ever_live_p (reg) + || (! current_function_is_leaf && call_used_regs[reg])) + save_reg_mask |= (1 << reg); + + /* Also save the pic base register if necessary. */ + if (flag_pic + && !TARGET_SINGLE_PIC_BASE + && arm_pic_register != INVALID_REGNUM + && crtl->uses_pic_offset_table) + save_reg_mask |= 1 << PIC_OFFSET_TABLE_REGNUM; + } + else if (IS_VOLATILE(func_type)) + { + /* For noreturn functions we historically omitted register saves + altogether. However this really messes up debugging. As a + compromise save just the frame pointers. Combined with the link + register saved elsewhere this should be sufficient to get + a backtrace. */ + if (frame_pointer_needed) + save_reg_mask |= 1 << HARD_FRAME_POINTER_REGNUM; + if (df_regs_ever_live_p (ARM_HARD_FRAME_POINTER_REGNUM)) + save_reg_mask |= 1 << ARM_HARD_FRAME_POINTER_REGNUM; + if (df_regs_ever_live_p (THUMB_HARD_FRAME_POINTER_REGNUM)) + save_reg_mask |= 1 << THUMB_HARD_FRAME_POINTER_REGNUM; + } + else + { + /* In the normal case we only need to save those registers + which are call saved and which are used by this function. */ + for (reg = 0; reg <= 11; reg++) + if (df_regs_ever_live_p (reg) && ! call_used_regs[reg]) + save_reg_mask |= (1 << reg); + + /* Handle the frame pointer as a special case. */ + if (frame_pointer_needed) + save_reg_mask |= 1 << HARD_FRAME_POINTER_REGNUM; + + /* If we aren't loading the PIC register, + don't stack it even though it may be live. */ + if (flag_pic + && !TARGET_SINGLE_PIC_BASE + && arm_pic_register != INVALID_REGNUM + && (df_regs_ever_live_p (PIC_OFFSET_TABLE_REGNUM) + || crtl->uses_pic_offset_table)) + save_reg_mask |= 1 << PIC_OFFSET_TABLE_REGNUM; + + /* The prologue will copy SP into R0, so save it. */ + if (IS_STACKALIGN (func_type)) + save_reg_mask |= 1; + } + + /* Save registers so the exception handler can modify them. */ + if (crtl->calls_eh_return) + { + unsigned int i; + + for (i = 0; ; i++) + { + reg = EH_RETURN_DATA_REGNO (i); + if (reg == INVALID_REGNUM) + break; + save_reg_mask |= 1 << reg; + } + } + + return save_reg_mask; +} + + +/* Compute the number of bytes used to store the static chain register on the + stack, above the stack frame. We need to know this accurately to get the + alignment of the rest of the stack frame correct. */ + +static int arm_compute_static_chain_stack_bytes (void) +{ + unsigned long func_type = arm_current_func_type (); + int static_chain_stack_bytes = 0; + + if (TARGET_APCS_FRAME && frame_pointer_needed && TARGET_ARM && + IS_NESTED (func_type) && + df_regs_ever_live_p (3) && crtl->args.pretend_args_size == 0) + static_chain_stack_bytes = 4; + + return static_chain_stack_bytes; +} + + +/* Compute a bit mask of which registers need to be + saved on the stack for the current function. + This is used by arm_get_frame_offsets, which may add extra registers. */ + +static unsigned long +arm_compute_save_reg_mask (void) +{ + unsigned int save_reg_mask = 0; + unsigned long func_type = arm_current_func_type (); + unsigned int reg; + + if (IS_NAKED (func_type)) + /* This should never really happen. */ + return 0; + + /* If we are creating a stack frame, then we must save the frame pointer, + IP (which will hold the old stack pointer), LR and the PC. */ + if (TARGET_APCS_FRAME && frame_pointer_needed && TARGET_ARM) + save_reg_mask |= + (1 << ARM_HARD_FRAME_POINTER_REGNUM) + | (1 << IP_REGNUM) + | (1 << LR_REGNUM) + | (1 << PC_REGNUM); + + save_reg_mask |= arm_compute_save_reg0_reg12_mask (); + + /* Decide if we need to save the link register. + Interrupt routines have their own banked link register, + so they never need to save it. + Otherwise if we do not use the link register we do not need to save + it. If we are pushing other registers onto the stack however, we + can save an instruction in the epilogue by pushing the link register + now and then popping it back into the PC. This incurs extra memory + accesses though, so we only do it when optimizing for size, and only + if we know that we will not need a fancy return sequence. */ + if (df_regs_ever_live_p (LR_REGNUM) + || (save_reg_mask + && optimize_size + && ARM_FUNC_TYPE (func_type) == ARM_FT_NORMAL + && !crtl->calls_eh_return)) + save_reg_mask |= 1 << LR_REGNUM; + + if (cfun->machine->lr_save_eliminated) + save_reg_mask &= ~ (1 << LR_REGNUM); + + if (TARGET_REALLY_IWMMXT + && ((bit_count (save_reg_mask) + + ARM_NUM_INTS (crtl->args.pretend_args_size + + arm_compute_static_chain_stack_bytes()) + ) % 2) != 0) + { + /* The total number of registers that are going to be pushed + onto the stack is odd. We need to ensure that the stack + is 64-bit aligned before we start to save iWMMXt registers, + and also before we start to create locals. (A local variable + might be a double or long long which we will load/store using + an iWMMXt instruction). Therefore we need to push another + ARM register, so that the stack will be 64-bit aligned. We + try to avoid using the arg registers (r0 -r3) as they might be + used to pass values in a tail call. */ + for (reg = 4; reg <= 12; reg++) + if ((save_reg_mask & (1 << reg)) == 0) + break; + + if (reg <= 12) + save_reg_mask |= (1 << reg); + else + { + cfun->machine->sibcall_blocked = 1; + save_reg_mask |= (1 << 3); + } + } + + /* We may need to push an additional register for use initializing the + PIC base register. */ + if (TARGET_THUMB2 && IS_NESTED (func_type) && flag_pic + && (save_reg_mask & THUMB2_WORK_REGS) == 0) + { + reg = thumb_find_work_register (1 << 4); + if (!call_used_regs[reg]) + save_reg_mask |= (1 << reg); + } + + return save_reg_mask; +} + + +/* Compute a bit mask of which registers need to be + saved on the stack for the current function. */ +static unsigned long +thumb1_compute_save_reg_mask (void) +{ + unsigned long mask; + unsigned reg; + + mask = 0; + for (reg = 0; reg < 12; reg ++) + if (df_regs_ever_live_p (reg) && !call_used_regs[reg]) + mask |= 1 << reg; + + if (flag_pic + && !TARGET_SINGLE_PIC_BASE + && arm_pic_register != INVALID_REGNUM + && crtl->uses_pic_offset_table) + mask |= 1 << PIC_OFFSET_TABLE_REGNUM; + + /* See if we might need r11 for calls to _interwork_r11_call_via_rN(). */ + if (!frame_pointer_needed && CALLER_INTERWORKING_SLOT_SIZE > 0) + mask |= 1 << ARM_HARD_FRAME_POINTER_REGNUM; + + /* LR will also be pushed if any lo regs are pushed. */ + if (mask & 0xff || thumb_force_lr_save ()) + mask |= (1 << LR_REGNUM); + + /* Make sure we have a low work register if we need one. + We will need one if we are going to push a high register, + but we are not currently intending to push a low register. */ + if ((mask & 0xff) == 0 + && ((mask & 0x0f00) || TARGET_BACKTRACE)) + { + /* Use thumb_find_work_register to choose which register + we will use. If the register is live then we will + have to push it. Use LAST_LO_REGNUM as our fallback + choice for the register to select. */ + reg = thumb_find_work_register (1 << LAST_LO_REGNUM); + /* Make sure the register returned by thumb_find_work_register is + not part of the return value. */ + if (reg * UNITS_PER_WORD <= (unsigned) arm_size_return_regs ()) + reg = LAST_LO_REGNUM; + + if (! call_used_regs[reg]) + mask |= 1 << reg; + } + + /* The 504 below is 8 bytes less than 512 because there are two possible + alignment words. We can't tell here if they will be present or not so we + have to play it safe and assume that they are. */ + if ((CALLER_INTERWORKING_SLOT_SIZE + + ROUND_UP_WORD (get_frame_size ()) + + crtl->outgoing_args_size) >= 504) + { + /* This is the same as the code in thumb1_expand_prologue() which + determines which register to use for stack decrement. */ + for (reg = LAST_ARG_REGNUM + 1; reg <= LAST_LO_REGNUM; reg++) + if (mask & (1 << reg)) + break; + + if (reg > LAST_LO_REGNUM) + { + /* Make sure we have a register available for stack decrement. */ + mask |= 1 << LAST_LO_REGNUM; + } + } + + return mask; +} + + +/* Return the number of bytes required to save VFP registers. */ +static int +arm_get_vfp_saved_size (void) +{ + unsigned int regno; + int count; + int saved; + + saved = 0; + /* Space for saved VFP registers. */ + if (TARGET_HARD_FLOAT && TARGET_VFP) + { + count = 0; + for (regno = FIRST_VFP_REGNUM; + regno < LAST_VFP_REGNUM; + regno += 2) + { + if ((!df_regs_ever_live_p (regno) || call_used_regs[regno]) + && (!df_regs_ever_live_p (regno + 1) || call_used_regs[regno + 1])) + { + if (count > 0) + { + /* Workaround ARM10 VFPr1 bug. */ + if (count == 2 && !arm_arch6) + count++; + saved += count * 8; + } + count = 0; + } + else + count++; + } + if (count > 0) + { + if (count == 2 && !arm_arch6) + count++; + saved += count * 8; + } + } + return saved; +} + + +/* Generate a function exit sequence. If REALLY_RETURN is false, then do + everything bar the final return instruction. */ +const char * +output_return_instruction (rtx operand, int really_return, int reverse) +{ + char conditional[10]; + char instr[100]; + unsigned reg; + unsigned long live_regs_mask; + unsigned long func_type; + arm_stack_offsets *offsets; + + func_type = arm_current_func_type (); + + if (IS_NAKED (func_type)) + return ""; + + if (IS_VOLATILE (func_type) && TARGET_ABORT_NORETURN) + { + /* If this function was declared non-returning, and we have + found a tail call, then we have to trust that the called + function won't return. */ + if (really_return) + { + rtx ops[2]; + + /* Otherwise, trap an attempted return by aborting. */ + ops[0] = operand; + ops[1] = gen_rtx_SYMBOL_REF (Pmode, NEED_PLT_RELOC ? "abort(PLT)" + : "abort"); + assemble_external_libcall (ops[1]); + output_asm_insn (reverse ? "bl%D0\t%a1" : "bl%d0\t%a1", ops); + } + + return ""; + } + + gcc_assert (!cfun->calls_alloca || really_return); + + sprintf (conditional, "%%?%%%c0", reverse ? 'D' : 'd'); + + cfun->machine->return_used_this_function = 1; + + offsets = arm_get_frame_offsets (); + live_regs_mask = offsets->saved_regs_mask; + + if (live_regs_mask) + { + const char * return_reg; + + /* If we do not have any special requirements for function exit + (e.g. interworking) then we can load the return address + directly into the PC. Otherwise we must load it into LR. */ + if (really_return + && (IS_INTERRUPT (func_type) || !TARGET_INTERWORK)) + return_reg = reg_names[PC_REGNUM]; + else + return_reg = reg_names[LR_REGNUM]; + + if ((live_regs_mask & (1 << IP_REGNUM)) == (1 << IP_REGNUM)) + { + /* There are three possible reasons for the IP register + being saved. 1) a stack frame was created, in which case + IP contains the old stack pointer, or 2) an ISR routine + corrupted it, or 3) it was saved to align the stack on + iWMMXt. In case 1, restore IP into SP, otherwise just + restore IP. */ + if (frame_pointer_needed) + { + live_regs_mask &= ~ (1 << IP_REGNUM); + live_regs_mask |= (1 << SP_REGNUM); + } + else + gcc_assert (IS_INTERRUPT (func_type) || TARGET_REALLY_IWMMXT); + } + + /* On some ARM architectures it is faster to use LDR rather than + LDM to load a single register. On other architectures, the + cost is the same. In 26 bit mode, or for exception handlers, + we have to use LDM to load the PC so that the CPSR is also + restored. */ + for (reg = 0; reg <= LAST_ARM_REGNUM; reg++) + if (live_regs_mask == (1U << reg)) + break; + + if (reg <= LAST_ARM_REGNUM + && (reg != LR_REGNUM + || ! really_return + || ! IS_INTERRUPT (func_type))) + { + sprintf (instr, "ldr%s\t%%|%s, [%%|sp], #4", conditional, + (reg == LR_REGNUM) ? return_reg : reg_names[reg]); + } + else + { + char *p; + int first = 1; + + /* Generate the load multiple instruction to restore the + registers. Note we can get here, even if + frame_pointer_needed is true, but only if sp already + points to the base of the saved core registers. */ + if (live_regs_mask & (1 << SP_REGNUM)) + { + unsigned HOST_WIDE_INT stack_adjust; + + stack_adjust = offsets->outgoing_args - offsets->saved_regs; + gcc_assert (stack_adjust == 0 || stack_adjust == 4); + + if (stack_adjust && arm_arch5 && TARGET_ARM) + if (TARGET_UNIFIED_ASM) + sprintf (instr, "ldmib%s\t%%|sp, {", conditional); + else + sprintf (instr, "ldm%sib\t%%|sp, {", conditional); + else + { + /* If we can't use ldmib (SA110 bug), + then try to pop r3 instead. */ + if (stack_adjust) + live_regs_mask |= 1 << 3; + + if (TARGET_UNIFIED_ASM) + sprintf (instr, "ldmfd%s\t%%|sp, {", conditional); + else + sprintf (instr, "ldm%sfd\t%%|sp, {", conditional); + } + } + else + if (TARGET_UNIFIED_ASM) + sprintf (instr, "pop%s\t{", conditional); + else + sprintf (instr, "ldm%sfd\t%%|sp!, {", conditional); + + p = instr + strlen (instr); + + for (reg = 0; reg <= SP_REGNUM; reg++) + if (live_regs_mask & (1 << reg)) + { + int l = strlen (reg_names[reg]); + + if (first) + first = 0; + else + { + memcpy (p, ", ", 2); + p += 2; + } + + memcpy (p, "%|", 2); + memcpy (p + 2, reg_names[reg], l); + p += l + 2; + } + + if (live_regs_mask & (1 << LR_REGNUM)) + { + sprintf (p, "%s%%|%s}", first ? "" : ", ", return_reg); + /* If returning from an interrupt, restore the CPSR. */ + if (IS_INTERRUPT (func_type)) + strcat (p, "^"); + } + else + strcpy (p, "}"); + } + + output_asm_insn (instr, & operand); + + /* See if we need to generate an extra instruction to + perform the actual function return. */ + if (really_return + && func_type != ARM_FT_INTERWORKED + && (live_regs_mask & (1 << LR_REGNUM)) != 0) + { + /* The return has already been handled + by loading the LR into the PC. */ + really_return = 0; + } + } + + if (really_return) + { + switch ((int) ARM_FUNC_TYPE (func_type)) + { + case ARM_FT_ISR: + case ARM_FT_FIQ: + /* ??? This is wrong for unified assembly syntax. */ + sprintf (instr, "sub%ss\t%%|pc, %%|lr, #4", conditional); + break; + + case ARM_FT_INTERWORKED: + sprintf (instr, "bx%s\t%%|lr", conditional); + break; + + case ARM_FT_EXCEPTION: + /* ??? This is wrong for unified assembly syntax. */ + sprintf (instr, "mov%ss\t%%|pc, %%|lr", conditional); + break; + + default: + /* Use bx if it's available. */ + if (arm_arch5 || arm_arch4t) + sprintf (instr, "bx%s\t%%|lr", conditional); + else + sprintf (instr, "mov%s\t%%|pc, %%|lr", conditional); + break; + } + + output_asm_insn (instr, & operand); + } + + return ""; +} + +/* Write the function name into the code section, directly preceding + the function prologue. + + Code will be output similar to this: + t0 + .ascii "arm_poke_function_name", 0 + .align + t1 + .word 0xff000000 + (t1 - t0) + arm_poke_function_name + mov ip, sp + stmfd sp!, {fp, ip, lr, pc} + sub fp, ip, #4 + + When performing a stack backtrace, code can inspect the value + of 'pc' stored at 'fp' + 0. If the trace function then looks + at location pc - 12 and the top 8 bits are set, then we know + that there is a function name embedded immediately preceding this + location and has length ((pc[-3]) & 0xff000000). + + We assume that pc is declared as a pointer to an unsigned long. + + It is of no benefit to output the function name if we are assembling + a leaf function. These function types will not contain a stack + backtrace structure, therefore it is not possible to determine the + function name. */ +void +arm_poke_function_name (FILE *stream, const char *name) +{ + unsigned long alignlength; + unsigned long length; + rtx x; + + length = strlen (name) + 1; + alignlength = ROUND_UP_WORD (length); + + ASM_OUTPUT_ASCII (stream, name, length); + ASM_OUTPUT_ALIGN (stream, 2); + x = GEN_INT ((unsigned HOST_WIDE_INT) 0xff000000 + alignlength); + assemble_aligned_integer (UNITS_PER_WORD, x); +} + +/* Place some comments into the assembler stream + describing the current function. */ +static void +arm_output_function_prologue (FILE *f, HOST_WIDE_INT frame_size) +{ + unsigned long func_type; + + if (TARGET_THUMB1) + { + thumb1_output_function_prologue (f, frame_size); + return; + } + + /* Sanity check. */ + gcc_assert (!arm_ccfsm_state && !arm_target_insn); + + func_type = arm_current_func_type (); + + switch ((int) ARM_FUNC_TYPE (func_type)) + { + default: + case ARM_FT_NORMAL: + break; + case ARM_FT_INTERWORKED: + asm_fprintf (f, "\t%@ Function supports interworking.\n"); + break; + case ARM_FT_ISR: + asm_fprintf (f, "\t%@ Interrupt Service Routine.\n"); + break; + case ARM_FT_FIQ: + asm_fprintf (f, "\t%@ Fast Interrupt Service Routine.\n"); + break; + case ARM_FT_EXCEPTION: + asm_fprintf (f, "\t%@ ARM Exception Handler.\n"); + break; + } + + if (IS_NAKED (func_type)) + asm_fprintf (f, "\t%@ Naked Function: prologue and epilogue provided by programmer.\n"); + + if (IS_VOLATILE (func_type)) + asm_fprintf (f, "\t%@ Volatile: function does not return.\n"); + + if (IS_NESTED (func_type)) + asm_fprintf (f, "\t%@ Nested: function declared inside another function.\n"); + if (IS_STACKALIGN (func_type)) + asm_fprintf (f, "\t%@ Stack Align: May be called with mis-aligned SP.\n"); + + asm_fprintf (f, "\t%@ args = %d, pretend = %d, frame = %wd\n", + crtl->args.size, + crtl->args.pretend_args_size, frame_size); + + asm_fprintf (f, "\t%@ frame_needed = %d, uses_anonymous_args = %d\n", + frame_pointer_needed, + cfun->machine->uses_anonymous_args); + + if (cfun->machine->lr_save_eliminated) + asm_fprintf (f, "\t%@ link register save eliminated.\n"); + + if (crtl->calls_eh_return) + asm_fprintf (f, "\t@ Calls __builtin_eh_return.\n"); + +} + +const char * +arm_output_epilogue (rtx sibling) +{ + int reg; + unsigned long saved_regs_mask; + unsigned long func_type; + /* Floats_offset is the offset from the "virtual" frame. In an APCS + frame that is $fp + 4 for a non-variadic function. */ + int floats_offset = 0; + rtx operands[3]; + FILE * f = asm_out_file; + unsigned int lrm_count = 0; + int really_return = (sibling == NULL); + int start_reg; + arm_stack_offsets *offsets; + + /* If we have already generated the return instruction + then it is futile to generate anything else. */ + if (use_return_insn (FALSE, sibling) && + (cfun->machine->return_used_this_function != 0)) + return ""; + + func_type = arm_current_func_type (); + + if (IS_NAKED (func_type)) + /* Naked functions don't have epilogues. */ + return ""; + + if (IS_VOLATILE (func_type) && TARGET_ABORT_NORETURN) + { + rtx op; + + /* A volatile function should never return. Call abort. */ + op = gen_rtx_SYMBOL_REF (Pmode, NEED_PLT_RELOC ? "abort(PLT)" : "abort"); + assemble_external_libcall (op); + output_asm_insn ("bl\t%a0", &op); + + return ""; + } + + /* If we are throwing an exception, then we really must be doing a + return, so we can't tail-call. */ + gcc_assert (!crtl->calls_eh_return || really_return); + + offsets = arm_get_frame_offsets (); + saved_regs_mask = offsets->saved_regs_mask; + + if (TARGET_IWMMXT) + lrm_count = bit_count (saved_regs_mask); + + floats_offset = offsets->saved_args; + /* Compute how far away the floats will be. */ + for (reg = 0; reg <= LAST_ARM_REGNUM; reg++) + if (saved_regs_mask & (1 << reg)) + floats_offset += 4; + + if (TARGET_APCS_FRAME && frame_pointer_needed && TARGET_ARM) + { + /* This variable is for the Virtual Frame Pointer, not VFP regs. */ + int vfp_offset = offsets->frame; + + if (TARGET_FPA_EMU2) + { + for (reg = LAST_FPA_REGNUM; reg >= FIRST_FPA_REGNUM; reg--) + if (df_regs_ever_live_p (reg) && !call_used_regs[reg]) + { + floats_offset += 12; + asm_fprintf (f, "\tldfe\t%r, [%r, #-%d]\n", + reg, FP_REGNUM, floats_offset - vfp_offset); + } + } + else + { + start_reg = LAST_FPA_REGNUM; + + for (reg = LAST_FPA_REGNUM; reg >= FIRST_FPA_REGNUM; reg--) + { + if (df_regs_ever_live_p (reg) && !call_used_regs[reg]) + { + floats_offset += 12; + + /* We can't unstack more than four registers at once. */ + if (start_reg - reg == 3) + { + asm_fprintf (f, "\tlfm\t%r, 4, [%r, #-%d]\n", + reg, FP_REGNUM, floats_offset - vfp_offset); + start_reg = reg - 1; + } + } + else + { + if (reg != start_reg) + asm_fprintf (f, "\tlfm\t%r, %d, [%r, #-%d]\n", + reg + 1, start_reg - reg, + FP_REGNUM, floats_offset - vfp_offset); + start_reg = reg - 1; + } + } + + /* Just in case the last register checked also needs unstacking. */ + if (reg != start_reg) + asm_fprintf (f, "\tlfm\t%r, %d, [%r, #-%d]\n", + reg + 1, start_reg - reg, + FP_REGNUM, floats_offset - vfp_offset); + } + + if (TARGET_HARD_FLOAT && TARGET_VFP) + { + int saved_size; + + /* The fldmd insns do not have base+offset addressing + modes, so we use IP to hold the address. */ + saved_size = arm_get_vfp_saved_size (); + + if (saved_size > 0) + { + floats_offset += saved_size; + asm_fprintf (f, "\tsub\t%r, %r, #%d\n", IP_REGNUM, + FP_REGNUM, floats_offset - vfp_offset); + } + start_reg = FIRST_VFP_REGNUM; + for (reg = FIRST_VFP_REGNUM; reg < LAST_VFP_REGNUM; reg += 2) + { + if ((!df_regs_ever_live_p (reg) || call_used_regs[reg]) + && (!df_regs_ever_live_p (reg + 1) || call_used_regs[reg + 1])) + { + if (start_reg != reg) + vfp_output_fldmd (f, IP_REGNUM, + (start_reg - FIRST_VFP_REGNUM) / 2, + (reg - start_reg) / 2); + start_reg = reg + 2; + } + } + if (start_reg != reg) + vfp_output_fldmd (f, IP_REGNUM, + (start_reg - FIRST_VFP_REGNUM) / 2, + (reg - start_reg) / 2); + } + + if (TARGET_IWMMXT) + { + /* The frame pointer is guaranteed to be non-double-word aligned. + This is because it is set to (old_stack_pointer - 4) and the + old_stack_pointer was double word aligned. Thus the offset to + the iWMMXt registers to be loaded must also be non-double-word + sized, so that the resultant address *is* double-word aligned. + We can ignore floats_offset since that was already included in + the live_regs_mask. */ + lrm_count += (lrm_count % 2 ? 2 : 1); + + for (reg = LAST_IWMMXT_REGNUM; reg >= FIRST_IWMMXT_REGNUM; reg--) + if (df_regs_ever_live_p (reg) && !call_used_regs[reg]) + { + asm_fprintf (f, "\twldrd\t%r, [%r, #-%d]\n", + reg, FP_REGNUM, lrm_count * 4); + lrm_count += 2; + } + } + + /* saved_regs_mask should contain the IP, which at the time of stack + frame generation actually contains the old stack pointer. So a + quick way to unwind the stack is just pop the IP register directly + into the stack pointer. */ + gcc_assert (saved_regs_mask & (1 << IP_REGNUM)); + saved_regs_mask &= ~ (1 << IP_REGNUM); + saved_regs_mask |= (1 << SP_REGNUM); + + /* There are two registers left in saved_regs_mask - LR and PC. We + only need to restore the LR register (the return address), but to + save time we can load it directly into the PC, unless we need a + special function exit sequence, or we are not really returning. */ + if (really_return + && ARM_FUNC_TYPE (func_type) == ARM_FT_NORMAL + && !crtl->calls_eh_return) + /* Delete the LR from the register mask, so that the LR on + the stack is loaded into the PC in the register mask. */ + saved_regs_mask &= ~ (1 << LR_REGNUM); + else + saved_regs_mask &= ~ (1 << PC_REGNUM); + + /* We must use SP as the base register, because SP is one of the + registers being restored. If an interrupt or page fault + happens in the ldm instruction, the SP might or might not + have been restored. That would be bad, as then SP will no + longer indicate the safe area of stack, and we can get stack + corruption. Using SP as the base register means that it will + be reset correctly to the original value, should an interrupt + occur. If the stack pointer already points at the right + place, then omit the subtraction. */ + if (offsets->outgoing_args != (1 + (int) bit_count (saved_regs_mask)) + || cfun->calls_alloca) + asm_fprintf (f, "\tsub\t%r, %r, #%d\n", SP_REGNUM, FP_REGNUM, + 4 * bit_count (saved_regs_mask)); + print_multi_reg (f, "ldmfd\t%r, ", SP_REGNUM, saved_regs_mask, 0); + + if (IS_INTERRUPT (func_type)) + /* Interrupt handlers will have pushed the + IP onto the stack, so restore it now. */ + print_multi_reg (f, "ldmfd\t%r!, ", SP_REGNUM, 1 << IP_REGNUM, 0); + } + else + { + /* This branch is executed for ARM mode (non-apcs frames) and + Thumb-2 mode. Frame layout is essentially the same for those + cases, except that in ARM mode frame pointer points to the + first saved register, while in Thumb-2 mode the frame pointer points + to the last saved register. + + It is possible to make frame pointer point to last saved + register in both cases, and remove some conditionals below. + That means that fp setup in prologue would be just "mov fp, sp" + and sp restore in epilogue would be just "mov sp, fp", whereas + now we have to use add/sub in those cases. However, the value + of that would be marginal, as both mov and add/sub are 32-bit + in ARM mode, and it would require extra conditionals + in arm_expand_prologue to distingish ARM-apcs-frame case + (where frame pointer is required to point at first register) + and ARM-non-apcs-frame. Therefore, such change is postponed + until real need arise. */ + unsigned HOST_WIDE_INT amount; + int rfe; + /* Restore stack pointer if necessary. */ + if (TARGET_ARM && frame_pointer_needed) + { + operands[0] = stack_pointer_rtx; + operands[1] = hard_frame_pointer_rtx; + + operands[2] = GEN_INT (offsets->frame - offsets->saved_regs); + output_add_immediate (operands); + } + else + { + if (frame_pointer_needed) + { + /* For Thumb-2 restore sp from the frame pointer. + Operand restrictions mean we have to incrememnt FP, then copy + to SP. */ + amount = offsets->locals_base - offsets->saved_regs; + operands[0] = hard_frame_pointer_rtx; + } + else + { + unsigned long count; + operands[0] = stack_pointer_rtx; + amount = offsets->outgoing_args - offsets->saved_regs; + /* pop call clobbered registers if it avoids a + separate stack adjustment. */ + count = offsets->saved_regs - offsets->saved_args; + if (optimize_size + && count != 0 + && !crtl->calls_eh_return + && bit_count(saved_regs_mask) * 4 == count + && !IS_INTERRUPT (func_type) + && !crtl->tail_call_emit) + { + unsigned long mask; + /* Preserve return values, of any size. */ + mask = (1 << ((arm_size_return_regs() + 3) / 4)) - 1; + mask ^= 0xf; + mask &= ~saved_regs_mask; + reg = 0; + while (bit_count (mask) * 4 > amount) + { + while ((mask & (1 << reg)) == 0) + reg++; + mask &= ~(1 << reg); + } + if (bit_count (mask) * 4 == amount) { + amount = 0; + saved_regs_mask |= mask; + } + } + } + + if (amount) + { + operands[1] = operands[0]; + operands[2] = GEN_INT (amount); + output_add_immediate (operands); + } + if (frame_pointer_needed) + asm_fprintf (f, "\tmov\t%r, %r\n", + SP_REGNUM, HARD_FRAME_POINTER_REGNUM); + } + + if (TARGET_FPA_EMU2) + { + for (reg = FIRST_FPA_REGNUM; reg <= LAST_FPA_REGNUM; reg++) + if (df_regs_ever_live_p (reg) && !call_used_regs[reg]) + asm_fprintf (f, "\tldfe\t%r, [%r], #12\n", + reg, SP_REGNUM); + } + else + { + start_reg = FIRST_FPA_REGNUM; + + for (reg = FIRST_FPA_REGNUM; reg <= LAST_FPA_REGNUM; reg++) + { + if (df_regs_ever_live_p (reg) && !call_used_regs[reg]) + { + if (reg - start_reg == 3) + { + asm_fprintf (f, "\tlfmfd\t%r, 4, [%r]!\n", + start_reg, SP_REGNUM); + start_reg = reg + 1; + } + } + else + { + if (reg != start_reg) + asm_fprintf (f, "\tlfmfd\t%r, %d, [%r]!\n", + start_reg, reg - start_reg, + SP_REGNUM); + + start_reg = reg + 1; + } + } + + /* Just in case the last register checked also needs unstacking. */ + if (reg != start_reg) + asm_fprintf (f, "\tlfmfd\t%r, %d, [%r]!\n", + start_reg, reg - start_reg, SP_REGNUM); + } + + if (TARGET_HARD_FLOAT && TARGET_VFP) + { + int end_reg = LAST_VFP_REGNUM + 1; + + /* Scan the registers in reverse order. We need to match + any groupings made in the prologue and generate matching + pop operations. */ + for (reg = LAST_VFP_REGNUM - 1; reg >= FIRST_VFP_REGNUM; reg -= 2) + { + if ((!df_regs_ever_live_p (reg) || call_used_regs[reg]) + && (!df_regs_ever_live_p (reg + 1) + || call_used_regs[reg + 1])) + { + if (end_reg > reg + 2) + vfp_output_fldmd (f, SP_REGNUM, + (reg + 2 - FIRST_VFP_REGNUM) / 2, + (end_reg - (reg + 2)) / 2); + end_reg = reg; + } + } + if (end_reg > reg + 2) + vfp_output_fldmd (f, SP_REGNUM, 0, + (end_reg - (reg + 2)) / 2); + } + + if (TARGET_IWMMXT) + for (reg = FIRST_IWMMXT_REGNUM; reg <= LAST_IWMMXT_REGNUM; reg++) + if (df_regs_ever_live_p (reg) && !call_used_regs[reg]) + asm_fprintf (f, "\twldrd\t%r, [%r], #8\n", reg, SP_REGNUM); + + /* If we can, restore the LR into the PC. */ + if (ARM_FUNC_TYPE (func_type) != ARM_FT_INTERWORKED + && (TARGET_ARM || ARM_FUNC_TYPE (func_type) == ARM_FT_NORMAL) + && !IS_STACKALIGN (func_type) + && really_return + && crtl->args.pretend_args_size == 0 + && saved_regs_mask & (1 << LR_REGNUM) + && !crtl->calls_eh_return) + { + saved_regs_mask &= ~ (1 << LR_REGNUM); + saved_regs_mask |= (1 << PC_REGNUM); + rfe = IS_INTERRUPT (func_type); + } + else + rfe = 0; + + /* Load the registers off the stack. If we only have one register + to load use the LDR instruction - it is faster. For Thumb-2 + always use pop and the assembler will pick the best instruction.*/ + if (TARGET_ARM && saved_regs_mask == (1 << LR_REGNUM) + && !IS_INTERRUPT(func_type)) + { + asm_fprintf (f, "\tldr\t%r, [%r], #4\n", LR_REGNUM, SP_REGNUM); + } + else if (saved_regs_mask) + { + if (saved_regs_mask & (1 << SP_REGNUM)) + /* Note - write back to the stack register is not enabled + (i.e. "ldmfd sp!..."). We know that the stack pointer is + in the list of registers and if we add writeback the + instruction becomes UNPREDICTABLE. */ + print_multi_reg (f, "ldmfd\t%r, ", SP_REGNUM, saved_regs_mask, + rfe); + else if (TARGET_ARM) + print_multi_reg (f, "ldmfd\t%r!, ", SP_REGNUM, saved_regs_mask, + rfe); + else + print_multi_reg (f, "pop\t", SP_REGNUM, saved_regs_mask, 0); + } + + if (crtl->args.pretend_args_size) + { + /* Unwind the pre-pushed regs. */ + operands[0] = operands[1] = stack_pointer_rtx; + operands[2] = GEN_INT (crtl->args.pretend_args_size); + output_add_immediate (operands); + } + } + + /* We may have already restored PC directly from the stack. */ + if (!really_return || saved_regs_mask & (1 << PC_REGNUM)) + return ""; + + /* Stack adjustment for exception handler. */ + if (crtl->calls_eh_return) + asm_fprintf (f, "\tadd\t%r, %r, %r\n", SP_REGNUM, SP_REGNUM, + ARM_EH_STACKADJ_REGNUM); + + /* Generate the return instruction. */ + switch ((int) ARM_FUNC_TYPE (func_type)) + { + case ARM_FT_ISR: + case ARM_FT_FIQ: + asm_fprintf (f, "\tsubs\t%r, %r, #4\n", PC_REGNUM, LR_REGNUM); + break; + + case ARM_FT_EXCEPTION: + asm_fprintf (f, "\tmovs\t%r, %r\n", PC_REGNUM, LR_REGNUM); + break; + + case ARM_FT_INTERWORKED: + asm_fprintf (f, "\tbx\t%r\n", LR_REGNUM); + break; + + default: + if (IS_STACKALIGN (func_type)) + { + /* See comment in arm_expand_prologue. */ + asm_fprintf (f, "\tmov\t%r, %r\n", SP_REGNUM, 0); + } + if (arm_arch5 || arm_arch4t) + asm_fprintf (f, "\tbx\t%r\n", LR_REGNUM); + else + asm_fprintf (f, "\tmov\t%r, %r\n", PC_REGNUM, LR_REGNUM); + break; + } + + return ""; +} + +static void +arm_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED, + HOST_WIDE_INT frame_size ATTRIBUTE_UNUSED) +{ + arm_stack_offsets *offsets; + + if (TARGET_THUMB1) + { + int regno; + + /* Emit any call-via-reg trampolines that are needed for v4t support + of call_reg and call_value_reg type insns. */ + for (regno = 0; regno < LR_REGNUM; regno++) + { + rtx label = cfun->machine->call_via[regno]; + + if (label != NULL) + { + switch_to_section (function_section (current_function_decl)); + targetm.asm_out.internal_label (asm_out_file, "L", + CODE_LABEL_NUMBER (label)); + asm_fprintf (asm_out_file, "\tbx\t%r\n", regno); + } + } + + /* ??? Probably not safe to set this here, since it assumes that a + function will be emitted as assembly immediately after we generate + RTL for it. This does not happen for inline functions. */ + cfun->machine->return_used_this_function = 0; + } + else /* TARGET_32BIT */ + { + /* We need to take into account any stack-frame rounding. */ + offsets = arm_get_frame_offsets (); + + gcc_assert (!use_return_insn (FALSE, NULL) + || (cfun->machine->return_used_this_function != 0) + || offsets->saved_regs == offsets->outgoing_args + || frame_pointer_needed); + + /* Reset the ARM-specific per-function variables. */ + after_arm_reorg = 0; + } +} + +/* Generate and emit an insn that we will recognize as a push_multi. + Unfortunately, since this insn does not reflect very well the actual + semantics of the operation, we need to annotate the insn for the benefit + of DWARF2 frame unwind information. */ +static rtx +emit_multi_reg_push (unsigned long mask) +{ + int num_regs = 0; + int num_dwarf_regs; + int i, j; + rtx par; + rtx dwarf; + int dwarf_par_index; + rtx tmp, reg; + + for (i = 0; i <= LAST_ARM_REGNUM; i++) + if (mask & (1 << i)) + num_regs++; + + gcc_assert (num_regs && num_regs <= 16); + + /* We don't record the PC in the dwarf frame information. */ + num_dwarf_regs = num_regs; + if (mask & (1 << PC_REGNUM)) + num_dwarf_regs--; + + /* For the body of the insn we are going to generate an UNSPEC in + parallel with several USEs. This allows the insn to be recognized + by the push_multi pattern in the arm.md file. + + The body of the insn looks something like this: + + (parallel [ + (set (mem:BLK (pre_modify:SI (reg:SI sp) + (const_int:SI ))) + (unspec:BLK [(reg:SI r4)] UNSPEC_PUSH_MULT)) + (use (reg:SI XX)) + (use (reg:SI YY)) + ... + ]) + + For the frame note however, we try to be more explicit and actually + show each register being stored into the stack frame, plus a (single) + decrement of the stack pointer. We do it this way in order to be + friendly to the stack unwinding code, which only wants to see a single + stack decrement per instruction. The RTL we generate for the note looks + something like this: + + (sequence [ + (set (reg:SI sp) (plus:SI (reg:SI sp) (const_int -20))) + (set (mem:SI (reg:SI sp)) (reg:SI r4)) + (set (mem:SI (plus:SI (reg:SI sp) (const_int 4))) (reg:SI XX)) + (set (mem:SI (plus:SI (reg:SI sp) (const_int 8))) (reg:SI YY)) + ... + ]) + + FIXME:: In an ideal world the PRE_MODIFY would not exist and + instead we'd have a parallel expression detailing all + the stores to the various memory addresses so that debug + information is more up-to-date. Remember however while writing + this to take care of the constraints with the push instruction. + + Note also that this has to be taken care of for the VFP registers. + + For more see PR43399. */ + + par = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (num_regs)); + dwarf = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (num_dwarf_regs + 1)); + dwarf_par_index = 1; + + for (i = 0; i <= LAST_ARM_REGNUM; i++) + { + if (mask & (1 << i)) + { + reg = gen_rtx_REG (SImode, i); + + XVECEXP (par, 0, 0) + = gen_rtx_SET (VOIDmode, + gen_frame_mem + (BLKmode, + gen_rtx_PRE_MODIFY (Pmode, + stack_pointer_rtx, + plus_constant + (stack_pointer_rtx, + -4 * num_regs)) + ), + gen_rtx_UNSPEC (BLKmode, + gen_rtvec (1, reg), + UNSPEC_PUSH_MULT)); + + if (i != PC_REGNUM) + { + tmp = gen_rtx_SET (VOIDmode, + gen_frame_mem (SImode, stack_pointer_rtx), + reg); + RTX_FRAME_RELATED_P (tmp) = 1; + XVECEXP (dwarf, 0, dwarf_par_index) = tmp; + dwarf_par_index++; + } + + break; + } + } + + for (j = 1, i++; j < num_regs; i++) + { + if (mask & (1 << i)) + { + reg = gen_rtx_REG (SImode, i); + + XVECEXP (par, 0, j) = gen_rtx_USE (VOIDmode, reg); + + if (i != PC_REGNUM) + { + tmp + = gen_rtx_SET (VOIDmode, + gen_frame_mem + (SImode, + plus_constant (stack_pointer_rtx, + 4 * j)), + reg); + RTX_FRAME_RELATED_P (tmp) = 1; + XVECEXP (dwarf, 0, dwarf_par_index++) = tmp; + } + + j++; + } + } + + par = emit_insn (par); + + tmp = gen_rtx_SET (VOIDmode, + stack_pointer_rtx, + plus_constant (stack_pointer_rtx, -4 * num_regs)); + RTX_FRAME_RELATED_P (tmp) = 1; + XVECEXP (dwarf, 0, 0) = tmp; + + add_reg_note (par, REG_FRAME_RELATED_EXPR, dwarf); + + return par; +} + +/* Calculate the size of the return value that is passed in registers. */ +static unsigned +arm_size_return_regs (void) +{ + enum machine_mode mode; + + if (crtl->return_rtx != 0) + mode = GET_MODE (crtl->return_rtx); + else + mode = DECL_MODE (DECL_RESULT (current_function_decl)); + + return GET_MODE_SIZE (mode); +} + +static rtx +emit_sfm (int base_reg, int count) +{ + rtx par; + rtx dwarf; + rtx tmp, reg; + int i; + + par = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (count)); + dwarf = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (count + 1)); + + reg = gen_rtx_REG (XFmode, base_reg++); + + XVECEXP (par, 0, 0) + = gen_rtx_SET (VOIDmode, + gen_frame_mem + (BLKmode, + gen_rtx_PRE_MODIFY (Pmode, + stack_pointer_rtx, + plus_constant + (stack_pointer_rtx, + -12 * count)) + ), + gen_rtx_UNSPEC (BLKmode, + gen_rtvec (1, reg), + UNSPEC_PUSH_MULT)); + tmp = gen_rtx_SET (VOIDmode, + gen_frame_mem (XFmode, stack_pointer_rtx), reg); + RTX_FRAME_RELATED_P (tmp) = 1; + XVECEXP (dwarf, 0, 1) = tmp; + + for (i = 1; i < count; i++) + { + reg = gen_rtx_REG (XFmode, base_reg++); + XVECEXP (par, 0, i) = gen_rtx_USE (VOIDmode, reg); + + tmp = gen_rtx_SET (VOIDmode, + gen_frame_mem (XFmode, + plus_constant (stack_pointer_rtx, + i * 12)), + reg); + RTX_FRAME_RELATED_P (tmp) = 1; + XVECEXP (dwarf, 0, i + 1) = tmp; + } + + tmp = gen_rtx_SET (VOIDmode, + stack_pointer_rtx, + plus_constant (stack_pointer_rtx, -12 * count)); + + RTX_FRAME_RELATED_P (tmp) = 1; + XVECEXP (dwarf, 0, 0) = tmp; + + par = emit_insn (par); + add_reg_note (par, REG_FRAME_RELATED_EXPR, dwarf); + + return par; +} + + +/* Return true if the current function needs to save/restore LR. */ + +static bool +thumb_force_lr_save (void) +{ + return !cfun->machine->lr_save_eliminated + && (!leaf_function_p () + || thumb_far_jump_used_p () + || df_regs_ever_live_p (LR_REGNUM)); +} + + +/* Return true if r3 is used by any of the tail call insns in the + current function. */ + +static bool +any_sibcall_uses_r3 (void) +{ + edge_iterator ei; + edge e; + + if (!crtl->tail_call_emit) + return false; + FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds) + if (e->flags & EDGE_SIBCALL) + { + rtx call = BB_END (e->src); + if (!CALL_P (call)) + call = prev_nonnote_nondebug_insn (call); + gcc_assert (CALL_P (call) && SIBLING_CALL_P (call)); + if (find_regno_fusage (call, USE, 3)) + return true; + } + return false; +} + + +/* Compute the distance from register FROM to register TO. + These can be the arg pointer (26), the soft frame pointer (25), + the stack pointer (13) or the hard frame pointer (11). + In thumb mode r7 is used as the soft frame pointer, if needed. + Typical stack layout looks like this: + + old stack pointer -> | | + ---- + | | \ + | | saved arguments for + | | vararg functions + | | / + -- + hard FP & arg pointer -> | | \ + | | stack + | | frame + | | / + -- + | | \ + | | call saved + | | registers + soft frame pointer -> | | / + -- + | | \ + | | local + | | variables + locals base pointer -> | | / + -- + | | \ + | | outgoing + | | arguments + current stack pointer -> | | / + -- + + For a given function some or all of these stack components + may not be needed, giving rise to the possibility of + eliminating some of the registers. + + The values returned by this function must reflect the behavior + of arm_expand_prologue() and arm_compute_save_reg_mask(). + + The sign of the number returned reflects the direction of stack + growth, so the values are positive for all eliminations except + from the soft frame pointer to the hard frame pointer. + + SFP may point just inside the local variables block to ensure correct + alignment. */ + + +/* Calculate stack offsets. These are used to calculate register elimination + offsets and in prologue/epilogue code. Also calculates which registers + should be saved. */ + +static arm_stack_offsets * +arm_get_frame_offsets (void) +{ + struct arm_stack_offsets *offsets; + unsigned long func_type; + int leaf; + int saved; + int core_saved; + HOST_WIDE_INT frame_size; + int i; + + offsets = &cfun->machine->stack_offsets; + + /* We need to know if we are a leaf function. Unfortunately, it + is possible to be called after start_sequence has been called, + which causes get_insns to return the insns for the sequence, + not the function, which will cause leaf_function_p to return + the incorrect result. + + to know about leaf functions once reload has completed, and the + frame size cannot be changed after that time, so we can safely + use the cached value. */ + + if (reload_completed) + return offsets; + + /* Initially this is the size of the local variables. It will translated + into an offset once we have determined the size of preceding data. */ + frame_size = ROUND_UP_WORD (get_frame_size ()); + + leaf = leaf_function_p (); + + /* Space for variadic functions. */ + offsets->saved_args = crtl->args.pretend_args_size; + + /* In Thumb mode this is incorrect, but never used. */ + offsets->frame = offsets->saved_args + (frame_pointer_needed ? 4 : 0) + + arm_compute_static_chain_stack_bytes(); + + if (TARGET_32BIT) + { + unsigned int regno; + + offsets->saved_regs_mask = arm_compute_save_reg_mask (); + core_saved = bit_count (offsets->saved_regs_mask) * 4; + saved = core_saved; + + /* We know that SP will be doubleword aligned on entry, and we must + preserve that condition at any subroutine call. We also require the + soft frame pointer to be doubleword aligned. */ + + if (TARGET_REALLY_IWMMXT) + { + /* Check for the call-saved iWMMXt registers. */ + for (regno = FIRST_IWMMXT_REGNUM; + regno <= LAST_IWMMXT_REGNUM; + regno++) + if (df_regs_ever_live_p (regno) && ! call_used_regs[regno]) + saved += 8; + } + + func_type = arm_current_func_type (); + if (! IS_VOLATILE (func_type)) + { + /* Space for saved FPA registers. */ + for (regno = FIRST_FPA_REGNUM; regno <= LAST_FPA_REGNUM; regno++) + if (df_regs_ever_live_p (regno) && ! call_used_regs[regno]) + saved += 12; + + /* Space for saved VFP registers. */ + if (TARGET_HARD_FLOAT && TARGET_VFP) + saved += arm_get_vfp_saved_size (); + } + } + else /* TARGET_THUMB1 */ + { + offsets->saved_regs_mask = thumb1_compute_save_reg_mask (); + core_saved = bit_count (offsets->saved_regs_mask) * 4; + saved = core_saved; + if (TARGET_BACKTRACE) + saved += 16; + } + + /* Saved registers include the stack frame. */ + offsets->saved_regs = offsets->saved_args + saved + + arm_compute_static_chain_stack_bytes(); + offsets->soft_frame = offsets->saved_regs + CALLER_INTERWORKING_SLOT_SIZE; + /* A leaf function does not need any stack alignment if it has nothing + on the stack. */ + if (leaf && frame_size == 0) + { + offsets->outgoing_args = offsets->soft_frame; + offsets->locals_base = offsets->soft_frame; + return offsets; + } + + /* Ensure SFP has the correct alignment. */ + if (ARM_DOUBLEWORD_ALIGN + && (offsets->soft_frame & 7)) + { + offsets->soft_frame += 4; + /* Try to align stack by pushing an extra reg. Don't bother doing this + when there is a stack frame as the alignment will be rolled into + the normal stack adjustment. */ + if (frame_size + crtl->outgoing_args_size == 0) + { + int reg = -1; + + /* If it is safe to use r3, then do so. This sometimes + generates better code on Thumb-2 by avoiding the need to + use 32-bit push/pop instructions. */ + if (! any_sibcall_uses_r3 () + && arm_size_return_regs () <= 12 + && (offsets->saved_regs_mask & (1 << 3)) == 0) + { + reg = 3; + } + else + for (i = 4; i <= (TARGET_THUMB1 ? LAST_LO_REGNUM : 11); i++) + { + if ((offsets->saved_regs_mask & (1 << i)) == 0) + { + reg = i; + break; + } + } + + if (reg != -1) + { + offsets->saved_regs += 4; + offsets->saved_regs_mask |= (1 << reg); + } + } + } + + offsets->locals_base = offsets->soft_frame + frame_size; + offsets->outgoing_args = (offsets->locals_base + + crtl->outgoing_args_size); + + if (ARM_DOUBLEWORD_ALIGN) + { + /* Ensure SP remains doubleword aligned. */ + if (offsets->outgoing_args & 7) + offsets->outgoing_args += 4; + gcc_assert (!(offsets->outgoing_args & 7)); + } + + return offsets; +} + + +/* Calculate the relative offsets for the different stack pointers. Positive + offsets are in the direction of stack growth. */ + +HOST_WIDE_INT +arm_compute_initial_elimination_offset (unsigned int from, unsigned int to) +{ + arm_stack_offsets *offsets; + + offsets = arm_get_frame_offsets (); + + /* OK, now we have enough information to compute the distances. + There must be an entry in these switch tables for each pair + of registers in ELIMINABLE_REGS, even if some of the entries + seem to be redundant or useless. */ + switch (from) + { + case ARG_POINTER_REGNUM: + switch (to) + { + case THUMB_HARD_FRAME_POINTER_REGNUM: + return 0; + + case FRAME_POINTER_REGNUM: + /* This is the reverse of the soft frame pointer + to hard frame pointer elimination below. */ + return offsets->soft_frame - offsets->saved_args; + + case ARM_HARD_FRAME_POINTER_REGNUM: + /* This is only non-zero in the case where the static chain register + is stored above the frame. */ + return offsets->frame - offsets->saved_args - 4; + + case STACK_POINTER_REGNUM: + /* If nothing has been pushed on the stack at all + then this will return -4. This *is* correct! */ + return offsets->outgoing_args - (offsets->saved_args + 4); + + default: + gcc_unreachable (); + } + gcc_unreachable (); + + case FRAME_POINTER_REGNUM: + switch (to) + { + case THUMB_HARD_FRAME_POINTER_REGNUM: + return 0; + + case ARM_HARD_FRAME_POINTER_REGNUM: + /* The hard frame pointer points to the top entry in the + stack frame. The soft frame pointer to the bottom entry + in the stack frame. If there is no stack frame at all, + then they are identical. */ + + return offsets->frame - offsets->soft_frame; + + case STACK_POINTER_REGNUM: + return offsets->outgoing_args - offsets->soft_frame; + + default: + gcc_unreachable (); + } + gcc_unreachable (); + + default: + /* You cannot eliminate from the stack pointer. + In theory you could eliminate from the hard frame + pointer to the stack pointer, but this will never + happen, since if a stack frame is not needed the + hard frame pointer will never be used. */ + gcc_unreachable (); + } +} + +/* Given FROM and TO register numbers, say whether this elimination is + allowed. Frame pointer elimination is automatically handled. + + All eliminations are permissible. Note that ARG_POINTER_REGNUM and + HARD_FRAME_POINTER_REGNUM are in fact the same thing. If we need a frame + pointer, we must eliminate FRAME_POINTER_REGNUM into + HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM or + ARG_POINTER_REGNUM. */ + +bool +arm_can_eliminate (const int from, const int to) +{ + return ((to == FRAME_POINTER_REGNUM && from == ARG_POINTER_REGNUM) ? false : + (to == STACK_POINTER_REGNUM && frame_pointer_needed) ? false : + (to == ARM_HARD_FRAME_POINTER_REGNUM && TARGET_THUMB) ? false : + (to == THUMB_HARD_FRAME_POINTER_REGNUM && TARGET_ARM) ? false : + true); +} + +/* Emit RTL to save coprocessor registers on function entry. Returns the + number of bytes pushed. */ + +static int +arm_save_coproc_regs(void) +{ + int saved_size = 0; + unsigned reg; + unsigned start_reg; + rtx insn; + + for (reg = LAST_IWMMXT_REGNUM; reg >= FIRST_IWMMXT_REGNUM; reg--) + if (df_regs_ever_live_p (reg) && ! call_used_regs[reg]) + { + insn = gen_rtx_PRE_DEC (Pmode, stack_pointer_rtx); + insn = gen_rtx_MEM (V2SImode, insn); + insn = emit_set_insn (insn, gen_rtx_REG (V2SImode, reg)); + RTX_FRAME_RELATED_P (insn) = 1; + saved_size += 8; + } + + /* Save any floating point call-saved registers used by this + function. */ + if (TARGET_FPA_EMU2) + { + for (reg = LAST_FPA_REGNUM; reg >= FIRST_FPA_REGNUM; reg--) + if (df_regs_ever_live_p (reg) && !call_used_regs[reg]) + { + insn = gen_rtx_PRE_DEC (Pmode, stack_pointer_rtx); + insn = gen_rtx_MEM (XFmode, insn); + insn = emit_set_insn (insn, gen_rtx_REG (XFmode, reg)); + RTX_FRAME_RELATED_P (insn) = 1; + saved_size += 12; + } + } + else + { + start_reg = LAST_FPA_REGNUM; + + for (reg = LAST_FPA_REGNUM; reg >= FIRST_FPA_REGNUM; reg--) + { + if (df_regs_ever_live_p (reg) && !call_used_regs[reg]) + { + if (start_reg - reg == 3) + { + insn = emit_sfm (reg, 4); + RTX_FRAME_RELATED_P (insn) = 1; + saved_size += 48; + start_reg = reg - 1; + } + } + else + { + if (start_reg != reg) + { + insn = emit_sfm (reg + 1, start_reg - reg); + RTX_FRAME_RELATED_P (insn) = 1; + saved_size += (start_reg - reg) * 12; + } + start_reg = reg - 1; + } + } + + if (start_reg != reg) + { + insn = emit_sfm (reg + 1, start_reg - reg); + saved_size += (start_reg - reg) * 12; + RTX_FRAME_RELATED_P (insn) = 1; + } + } + if (TARGET_HARD_FLOAT && TARGET_VFP) + { + start_reg = FIRST_VFP_REGNUM; + + for (reg = FIRST_VFP_REGNUM; reg < LAST_VFP_REGNUM; reg += 2) + { + if ((!df_regs_ever_live_p (reg) || call_used_regs[reg]) + && (!df_regs_ever_live_p (reg + 1) || call_used_regs[reg + 1])) + { + if (start_reg != reg) + saved_size += vfp_emit_fstmd (start_reg, + (reg - start_reg) / 2); + start_reg = reg + 2; + } + } + if (start_reg != reg) + saved_size += vfp_emit_fstmd (start_reg, + (reg - start_reg) / 2); + } + return saved_size; +} + + +/* Set the Thumb frame pointer from the stack pointer. */ + +static void +thumb_set_frame_pointer (arm_stack_offsets *offsets) +{ + HOST_WIDE_INT amount; + rtx insn, dwarf; + + amount = offsets->outgoing_args - offsets->locals_base; + if (amount < 1024) + insn = emit_insn (gen_addsi3 (hard_frame_pointer_rtx, + stack_pointer_rtx, GEN_INT (amount))); + else + { + emit_insn (gen_movsi (hard_frame_pointer_rtx, GEN_INT (amount))); + /* Thumb-2 RTL patterns expect sp as the first input. Thumb-1 + expects the first two operands to be the same. */ + if (TARGET_THUMB2) + { + insn = emit_insn (gen_addsi3 (hard_frame_pointer_rtx, + stack_pointer_rtx, + hard_frame_pointer_rtx)); + } + else + { + insn = emit_insn (gen_addsi3 (hard_frame_pointer_rtx, + hard_frame_pointer_rtx, + stack_pointer_rtx)); + } + dwarf = gen_rtx_SET (VOIDmode, hard_frame_pointer_rtx, + plus_constant (stack_pointer_rtx, amount)); + RTX_FRAME_RELATED_P (dwarf) = 1; + add_reg_note (insn, REG_FRAME_RELATED_EXPR, dwarf); + } + + RTX_FRAME_RELATED_P (insn) = 1; +} + +/* Generate the prologue instructions for entry into an ARM or Thumb-2 + function. */ +void +arm_expand_prologue (void) +{ + rtx amount; + rtx insn; + rtx ip_rtx; + unsigned long live_regs_mask; + unsigned long func_type; + int fp_offset = 0; + int saved_pretend_args = 0; + int saved_regs = 0; + unsigned HOST_WIDE_INT args_to_push; + arm_stack_offsets *offsets; + + func_type = arm_current_func_type (); + + /* Naked functions don't have prologues. */ + if (IS_NAKED (func_type)) + return; + + /* Make a copy of c_f_p_a_s as we may need to modify it locally. */ + args_to_push = crtl->args.pretend_args_size; + + /* Compute which register we will have to save onto the stack. */ + offsets = arm_get_frame_offsets (); + live_regs_mask = offsets->saved_regs_mask; + + ip_rtx = gen_rtx_REG (SImode, IP_REGNUM); + + if (IS_STACKALIGN (func_type)) + { + rtx dwarf; + rtx r0; + rtx r1; + /* Handle a word-aligned stack pointer. We generate the following: + + mov r0, sp + bic r1, r0, #7 + mov sp, r1 + + mov sp, r0 + bx lr + + The unwinder doesn't need to know about the stack realignment. + Just tell it we saved SP in r0. */ + gcc_assert (TARGET_THUMB2 && !arm_arch_notm && args_to_push == 0); + + r0 = gen_rtx_REG (SImode, 0); + r1 = gen_rtx_REG (SImode, 1); + /* Use a real rtvec rather than NULL_RTVEC so the rest of the + compiler won't choke. */ + dwarf = gen_rtx_UNSPEC (SImode, rtvec_alloc (0), UNSPEC_STACK_ALIGN); + dwarf = gen_rtx_SET (VOIDmode, r0, dwarf); + insn = gen_movsi (r0, stack_pointer_rtx); + RTX_FRAME_RELATED_P (insn) = 1; + add_reg_note (insn, REG_FRAME_RELATED_EXPR, dwarf); + emit_insn (insn); + emit_insn (gen_andsi3 (r1, r0, GEN_INT (~(HOST_WIDE_INT)7))); + emit_insn (gen_movsi (stack_pointer_rtx, r1)); + } + + /* For APCS frames, if IP register is clobbered + when creating frame, save that register in a special + way. */ + if (TARGET_APCS_FRAME && frame_pointer_needed && TARGET_ARM) + { + if (IS_INTERRUPT (func_type)) + { + /* Interrupt functions must not corrupt any registers. + Creating a frame pointer however, corrupts the IP + register, so we must push it first. */ + insn = emit_multi_reg_push (1 << IP_REGNUM); + + /* Do not set RTX_FRAME_RELATED_P on this insn. + The dwarf stack unwinding code only wants to see one + stack decrement per function, and this is not it. If + this instruction is labeled as being part of the frame + creation sequence then dwarf2out_frame_debug_expr will + die when it encounters the assignment of IP to FP + later on, since the use of SP here establishes SP as + the CFA register and not IP. + + Anyway this instruction is not really part of the stack + frame creation although it is part of the prologue. */ + } + else if (IS_NESTED (func_type)) + { + /* The Static chain register is the same as the IP register + used as a scratch register during stack frame creation. + To get around this need to find somewhere to store IP + whilst the frame is being created. We try the following + places in order: + + 1. The last argument register. + 2. A slot on the stack above the frame. (This only + works if the function is not a varargs function). + 3. Register r3, after pushing the argument registers + onto the stack. + + Note - we only need to tell the dwarf2 backend about the SP + adjustment in the second variant; the static chain register + doesn't need to be unwound, as it doesn't contain a value + inherited from the caller. */ + + if (df_regs_ever_live_p (3) == false) + insn = emit_set_insn (gen_rtx_REG (SImode, 3), ip_rtx); + else if (args_to_push == 0) + { + rtx dwarf; + + gcc_assert(arm_compute_static_chain_stack_bytes() == 4); + saved_regs += 4; + + insn = gen_rtx_PRE_DEC (SImode, stack_pointer_rtx); + insn = emit_set_insn (gen_frame_mem (SImode, insn), ip_rtx); + fp_offset = 4; + + /* Just tell the dwarf backend that we adjusted SP. */ + dwarf = gen_rtx_SET (VOIDmode, stack_pointer_rtx, + plus_constant (stack_pointer_rtx, + -fp_offset)); + RTX_FRAME_RELATED_P (insn) = 1; + add_reg_note (insn, REG_FRAME_RELATED_EXPR, dwarf); + } + else + { + /* Store the args on the stack. */ + if (cfun->machine->uses_anonymous_args) + insn = emit_multi_reg_push + ((0xf0 >> (args_to_push / 4)) & 0xf); + else + insn = emit_insn + (gen_addsi3 (stack_pointer_rtx, stack_pointer_rtx, + GEN_INT (- args_to_push))); + + RTX_FRAME_RELATED_P (insn) = 1; + + saved_pretend_args = 1; + fp_offset = args_to_push; + args_to_push = 0; + + /* Now reuse r3 to preserve IP. */ + emit_set_insn (gen_rtx_REG (SImode, 3), ip_rtx); + } + } + + insn = emit_set_insn (ip_rtx, + plus_constant (stack_pointer_rtx, fp_offset)); + RTX_FRAME_RELATED_P (insn) = 1; + } + + if (args_to_push) + { + /* Push the argument registers, or reserve space for them. */ + if (cfun->machine->uses_anonymous_args) + insn = emit_multi_reg_push + ((0xf0 >> (args_to_push / 4)) & 0xf); + else + insn = emit_insn + (gen_addsi3 (stack_pointer_rtx, stack_pointer_rtx, + GEN_INT (- args_to_push))); + RTX_FRAME_RELATED_P (insn) = 1; + } + + /* If this is an interrupt service routine, and the link register + is going to be pushed, and we're not generating extra + push of IP (needed when frame is needed and frame layout if apcs), + subtracting four from LR now will mean that the function return + can be done with a single instruction. */ + if ((func_type == ARM_FT_ISR || func_type == ARM_FT_FIQ) + && (live_regs_mask & (1 << LR_REGNUM)) != 0 + && !(frame_pointer_needed && TARGET_APCS_FRAME) + && TARGET_ARM) + { + rtx lr = gen_rtx_REG (SImode, LR_REGNUM); + + emit_set_insn (lr, plus_constant (lr, -4)); + } + + if (live_regs_mask) + { + saved_regs += bit_count (live_regs_mask) * 4; + if (optimize_size && !frame_pointer_needed + && saved_regs == offsets->saved_regs - offsets->saved_args) + { + /* If no coprocessor registers are being pushed and we don't have + to worry about a frame pointer then push extra registers to + create the stack frame. This is done is a way that does not + alter the frame layout, so is independent of the epilogue. */ + int n; + int frame; + n = 0; + while (n < 8 && (live_regs_mask & (1 << n)) == 0) + n++; + frame = offsets->outgoing_args - (offsets->saved_args + saved_regs); + if (frame && n * 4 >= frame) + { + n = frame / 4; + live_regs_mask |= (1 << n) - 1; + saved_regs += frame; + } + } + insn = emit_multi_reg_push (live_regs_mask); + RTX_FRAME_RELATED_P (insn) = 1; + } + + if (! IS_VOLATILE (func_type)) + saved_regs += arm_save_coproc_regs (); + + if (frame_pointer_needed && TARGET_ARM) + { + /* Create the new frame pointer. */ + if (TARGET_APCS_FRAME) + { + insn = GEN_INT (-(4 + args_to_push + fp_offset)); + insn = emit_insn (gen_addsi3 (hard_frame_pointer_rtx, ip_rtx, insn)); + RTX_FRAME_RELATED_P (insn) = 1; + + if (IS_NESTED (func_type)) + { + /* Recover the static chain register. */ + if (!df_regs_ever_live_p (3) + || saved_pretend_args) + insn = gen_rtx_REG (SImode, 3); + else /* if (crtl->args.pretend_args_size == 0) */ + { + insn = plus_constant (hard_frame_pointer_rtx, 4); + insn = gen_frame_mem (SImode, insn); + } + emit_set_insn (ip_rtx, insn); + /* Add a USE to stop propagate_one_insn() from barfing. */ + emit_insn (gen_prologue_use (ip_rtx)); + } + } + else + { + insn = GEN_INT (saved_regs - 4); + insn = emit_insn (gen_addsi3 (hard_frame_pointer_rtx, + stack_pointer_rtx, insn)); + RTX_FRAME_RELATED_P (insn) = 1; + } + } + + if (flag_stack_usage) + current_function_static_stack_size + = offsets->outgoing_args - offsets->saved_args; + + if (offsets->outgoing_args != offsets->saved_args + saved_regs) + { + /* This add can produce multiple insns for a large constant, so we + need to get tricky. */ + rtx last = get_last_insn (); + + amount = GEN_INT (offsets->saved_args + saved_regs + - offsets->outgoing_args); + + insn = emit_insn (gen_addsi3 (stack_pointer_rtx, stack_pointer_rtx, + amount)); + do + { + last = last ? NEXT_INSN (last) : get_insns (); + RTX_FRAME_RELATED_P (last) = 1; + } + while (last != insn); + + /* If the frame pointer is needed, emit a special barrier that + will prevent the scheduler from moving stores to the frame + before the stack adjustment. */ + if (frame_pointer_needed) + insn = emit_insn (gen_stack_tie (stack_pointer_rtx, + hard_frame_pointer_rtx)); + } + + + if (frame_pointer_needed && TARGET_THUMB2) + thumb_set_frame_pointer (offsets); + + if (flag_pic && arm_pic_register != INVALID_REGNUM) + { + unsigned long mask; + + mask = live_regs_mask; + mask &= THUMB2_WORK_REGS; + if (!IS_NESTED (func_type)) + mask |= (1 << IP_REGNUM); + arm_load_pic_register (mask); + } + + /* If we are profiling, make sure no instructions are scheduled before + the call to mcount. Similarly if the user has requested no + scheduling in the prolog. Similarly if we want non-call exceptions + using the EABI unwinder, to prevent faulting instructions from being + swapped with a stack adjustment. */ + if (crtl->profile || !TARGET_SCHED_PROLOG + || (arm_except_unwind_info (&global_options) == UI_TARGET + && cfun->can_throw_non_call_exceptions)) + emit_insn (gen_blockage ()); + + /* If the link register is being kept alive, with the return address in it, + then make sure that it does not get reused by the ce2 pass. */ + if ((live_regs_mask & (1 << LR_REGNUM)) == 0) + cfun->machine->lr_save_eliminated = 1; +} + +/* Print condition code to STREAM. Helper function for arm_print_operand. */ +static void +arm_print_condition (FILE *stream) +{ + if (arm_ccfsm_state == 3 || arm_ccfsm_state == 4) + { + /* Branch conversion is not implemented for Thumb-2. */ + if (TARGET_THUMB) + { + output_operand_lossage ("predicated Thumb instruction"); + return; + } + if (current_insn_predicate != NULL) + { + output_operand_lossage + ("predicated instruction in conditional sequence"); + return; + } + + fputs (arm_condition_codes[arm_current_cc], stream); + } + else if (current_insn_predicate) + { + enum arm_cond_code code; + + if (TARGET_THUMB1) + { + output_operand_lossage ("predicated Thumb instruction"); + return; + } + + code = get_arm_condition_code (current_insn_predicate); + fputs (arm_condition_codes[code], stream); + } +} + + +/* If CODE is 'd', then the X is a condition operand and the instruction + should only be executed if the condition is true. + if CODE is 'D', then the X is a condition operand and the instruction + should only be executed if the condition is false: however, if the mode + of the comparison is CCFPEmode, then always execute the instruction -- we + do this because in these circumstances !GE does not necessarily imply LT; + in these cases the instruction pattern will take care to make sure that + an instruction containing %d will follow, thereby undoing the effects of + doing this instruction unconditionally. + If CODE is 'N' then X is a floating point operand that must be negated + before output. + If CODE is 'B' then output a bitwise inverted value of X (a const int). + If X is a REG and CODE is `M', output a ldm/stm style multi-reg. */ +static void +arm_print_operand (FILE *stream, rtx x, int code) +{ + switch (code) + { + case '@': + fputs (ASM_COMMENT_START, stream); + return; + + case '_': + fputs (user_label_prefix, stream); + return; + + case '|': + fputs (REGISTER_PREFIX, stream); + return; + + case '?': + arm_print_condition (stream); + return; + + case '(': + /* Nothing in unified syntax, otherwise the current condition code. */ + if (!TARGET_UNIFIED_ASM) + arm_print_condition (stream); + break; + + case ')': + /* The current condition code in unified syntax, otherwise nothing. */ + if (TARGET_UNIFIED_ASM) + arm_print_condition (stream); + break; + + case '.': + /* The current condition code for a condition code setting instruction. + Preceded by 's' in unified syntax, otherwise followed by 's'. */ + if (TARGET_UNIFIED_ASM) + { + fputc('s', stream); + arm_print_condition (stream); + } + else + { + arm_print_condition (stream); + fputc('s', stream); + } + return; + + case '!': + /* If the instruction is conditionally executed then print + the current condition code, otherwise print 's'. */ + gcc_assert (TARGET_THUMB2 && TARGET_UNIFIED_ASM); + if (current_insn_predicate) + arm_print_condition (stream); + else + fputc('s', stream); + break; + + /* %# is a "break" sequence. It doesn't output anything, but is used to + separate e.g. operand numbers from following text, if that text consists + of further digits which we don't want to be part of the operand + number. */ + case '#': + return; + + case 'N': + { + REAL_VALUE_TYPE r; + REAL_VALUE_FROM_CONST_DOUBLE (r, x); + r = real_value_negate (&r); + fprintf (stream, "%s", fp_const_from_val (&r)); + } + return; + + /* An integer or symbol address without a preceding # sign. */ + case 'c': + switch (GET_CODE (x)) + { + case CONST_INT: + fprintf (stream, HOST_WIDE_INT_PRINT_DEC, INTVAL (x)); + break; + + case SYMBOL_REF: + output_addr_const (stream, x); + break; + + default: + gcc_unreachable (); + } + return; + + case 'B': + if (GET_CODE (x) == CONST_INT) + { + HOST_WIDE_INT val; + val = ARM_SIGN_EXTEND (~INTVAL (x)); + fprintf (stream, HOST_WIDE_INT_PRINT_DEC, val); + } + else + { + putc ('~', stream); + output_addr_const (stream, x); + } + return; + + case 'L': + /* The low 16 bits of an immediate constant. */ + fprintf (stream, HOST_WIDE_INT_PRINT_DEC, INTVAL(x) & 0xffff); + return; + + case 'i': + fprintf (stream, "%s", arithmetic_instr (x, 1)); + return; + + /* Truncate Cirrus shift counts. */ + case 's': + if (GET_CODE (x) == CONST_INT) + { + fprintf (stream, HOST_WIDE_INT_PRINT_DEC, INTVAL (x) & 0x3f); + return; + } + arm_print_operand (stream, x, 0); + return; + + case 'I': + fprintf (stream, "%s", arithmetic_instr (x, 0)); + return; + + case 'S': + { + HOST_WIDE_INT val; + const char *shift; + + if (!shift_operator (x, SImode)) + { + output_operand_lossage ("invalid shift operand"); + break; + } + + shift = shift_op (x, &val); + + if (shift) + { + fprintf (stream, ", %s ", shift); + if (val == -1) + arm_print_operand (stream, XEXP (x, 1), 0); + else + fprintf (stream, "#" HOST_WIDE_INT_PRINT_DEC, val); + } + } + return; + + /* An explanation of the 'Q', 'R' and 'H' register operands: + + In a pair of registers containing a DI or DF value the 'Q' + operand returns the register number of the register containing + the least significant part of the value. The 'R' operand returns + the register number of the register containing the most + significant part of the value. + + The 'H' operand returns the higher of the two register numbers. + On a run where WORDS_BIG_ENDIAN is true the 'H' operand is the + same as the 'Q' operand, since the most significant part of the + value is held in the lower number register. The reverse is true + on systems where WORDS_BIG_ENDIAN is false. + + The purpose of these operands is to distinguish between cases + where the endian-ness of the values is important (for example + when they are added together), and cases where the endian-ness + is irrelevant, but the order of register operations is important. + For example when loading a value from memory into a register + pair, the endian-ness does not matter. Provided that the value + from the lower memory address is put into the lower numbered + register, and the value from the higher address is put into the + higher numbered register, the load will work regardless of whether + the value being loaded is big-wordian or little-wordian. The + order of the two register loads can matter however, if the address + of the memory location is actually held in one of the registers + being overwritten by the load. + + The 'Q' and 'R' constraints are also available for 64-bit + constants. */ + case 'Q': + if (GET_CODE (x) == CONST_INT || GET_CODE (x) == CONST_DOUBLE) + { + rtx part = gen_lowpart (SImode, x); + fprintf (stream, "#" HOST_WIDE_INT_PRINT_DEC, INTVAL (part)); + return; + } + + if (GET_CODE (x) != REG || REGNO (x) > LAST_ARM_REGNUM) + { + output_operand_lossage ("invalid operand for code '%c'", code); + return; + } + + asm_fprintf (stream, "%r", REGNO (x) + (WORDS_BIG_ENDIAN ? 1 : 0)); + return; + + case 'R': + if (GET_CODE (x) == CONST_INT || GET_CODE (x) == CONST_DOUBLE) + { + enum machine_mode mode = GET_MODE (x); + rtx part; + + if (mode == VOIDmode) + mode = DImode; + part = gen_highpart_mode (SImode, mode, x); + fprintf (stream, "#" HOST_WIDE_INT_PRINT_DEC, INTVAL (part)); + return; + } + + if (GET_CODE (x) != REG || REGNO (x) > LAST_ARM_REGNUM) + { + output_operand_lossage ("invalid operand for code '%c'", code); + return; + } + + asm_fprintf (stream, "%r", REGNO (x) + (WORDS_BIG_ENDIAN ? 0 : 1)); + return; + + case 'H': + if (GET_CODE (x) != REG || REGNO (x) > LAST_ARM_REGNUM) + { + output_operand_lossage ("invalid operand for code '%c'", code); + return; + } + + asm_fprintf (stream, "%r", REGNO (x) + 1); + return; + + case 'J': + if (GET_CODE (x) != REG || REGNO (x) > LAST_ARM_REGNUM) + { + output_operand_lossage ("invalid operand for code '%c'", code); + return; + } + + asm_fprintf (stream, "%r", REGNO (x) + (WORDS_BIG_ENDIAN ? 3 : 2)); + return; + + case 'K': + if (GET_CODE (x) != REG || REGNO (x) > LAST_ARM_REGNUM) + { + output_operand_lossage ("invalid operand for code '%c'", code); + return; + } + + asm_fprintf (stream, "%r", REGNO (x) + (WORDS_BIG_ENDIAN ? 2 : 3)); + return; + + case 'm': + asm_fprintf (stream, "%r", + GET_CODE (XEXP (x, 0)) == REG + ? REGNO (XEXP (x, 0)) : REGNO (XEXP (XEXP (x, 0), 0))); + return; + + case 'M': + asm_fprintf (stream, "{%r-%r}", + REGNO (x), + REGNO (x) + ARM_NUM_REGS (GET_MODE (x)) - 1); + return; + + /* Like 'M', but writing doubleword vector registers, for use by Neon + insns. */ + case 'h': + { + int regno = (REGNO (x) - FIRST_VFP_REGNUM) / 2; + int numregs = ARM_NUM_REGS (GET_MODE (x)) / 2; + if (numregs == 1) + asm_fprintf (stream, "{d%d}", regno); + else + asm_fprintf (stream, "{d%d-d%d}", regno, regno + numregs - 1); + } + return; + + case 'd': + /* CONST_TRUE_RTX means always -- that's the default. */ + if (x == const_true_rtx) + return; + + if (!COMPARISON_P (x)) + { + output_operand_lossage ("invalid operand for code '%c'", code); + return; + } + + fputs (arm_condition_codes[get_arm_condition_code (x)], + stream); + return; + + case 'D': + /* CONST_TRUE_RTX means not always -- i.e. never. We shouldn't ever + want to do that. */ + if (x == const_true_rtx) + { + output_operand_lossage ("instruction never executed"); + return; + } + if (!COMPARISON_P (x)) + { + output_operand_lossage ("invalid operand for code '%c'", code); + return; + } + + fputs (arm_condition_codes[ARM_INVERSE_CONDITION_CODE + (get_arm_condition_code (x))], + stream); + return; + + /* Cirrus registers can be accessed in a variety of ways: + single floating point (f) + double floating point (d) + 32bit integer (fx) + 64bit integer (dx). */ + case 'W': /* Cirrus register in F mode. */ + case 'X': /* Cirrus register in D mode. */ + case 'Y': /* Cirrus register in FX mode. */ + case 'Z': /* Cirrus register in DX mode. */ + gcc_assert (GET_CODE (x) == REG + && REGNO_REG_CLASS (REGNO (x)) == CIRRUS_REGS); + + fprintf (stream, "mv%s%s", + code == 'W' ? "f" + : code == 'X' ? "d" + : code == 'Y' ? "fx" : "dx", reg_names[REGNO (x)] + 2); + + return; + + /* Print cirrus register in the mode specified by the register's mode. */ + case 'V': + { + int mode = GET_MODE (x); + + if (GET_CODE (x) != REG || REGNO_REG_CLASS (REGNO (x)) != CIRRUS_REGS) + { + output_operand_lossage ("invalid operand for code '%c'", code); + return; + } + + fprintf (stream, "mv%s%s", + mode == DFmode ? "d" + : mode == SImode ? "fx" + : mode == DImode ? "dx" + : "f", reg_names[REGNO (x)] + 2); + + return; + } + + case 'U': + if (GET_CODE (x) != REG + || REGNO (x) < FIRST_IWMMXT_GR_REGNUM + || REGNO (x) > LAST_IWMMXT_GR_REGNUM) + /* Bad value for wCG register number. */ + { + output_operand_lossage ("invalid operand for code '%c'", code); + return; + } + + else + fprintf (stream, "%d", REGNO (x) - FIRST_IWMMXT_GR_REGNUM); + return; + + /* Print an iWMMXt control register name. */ + case 'w': + if (GET_CODE (x) != CONST_INT + || INTVAL (x) < 0 + || INTVAL (x) >= 16) + /* Bad value for wC register number. */ + { + output_operand_lossage ("invalid operand for code '%c'", code); + return; + } + + else + { + static const char * wc_reg_names [16] = + { + "wCID", "wCon", "wCSSF", "wCASF", + "wC4", "wC5", "wC6", "wC7", + "wCGR0", "wCGR1", "wCGR2", "wCGR3", + "wC12", "wC13", "wC14", "wC15" + }; + + fprintf (stream, wc_reg_names [INTVAL (x)]); + } + return; + + /* Print the high single-precision register of a VFP double-precision + register. */ + case 'p': + { + int mode = GET_MODE (x); + int regno; + + if (GET_MODE_SIZE (mode) != 8 || GET_CODE (x) != REG) + { + output_operand_lossage ("invalid operand for code '%c'", code); + return; + } + + regno = REGNO (x); + if (!VFP_REGNO_OK_FOR_DOUBLE (regno)) + { + output_operand_lossage ("invalid operand for code '%c'", code); + return; + } + + fprintf (stream, "s%d", regno - FIRST_VFP_REGNUM + 1); + } + return; + + /* Print a VFP/Neon double precision or quad precision register name. */ + case 'P': + case 'q': + { + int mode = GET_MODE (x); + int is_quad = (code == 'q'); + int regno; + + if (GET_MODE_SIZE (mode) != (is_quad ? 16 : 8)) + { + output_operand_lossage ("invalid operand for code '%c'", code); + return; + } + + if (GET_CODE (x) != REG + || !IS_VFP_REGNUM (REGNO (x))) + { + output_operand_lossage ("invalid operand for code '%c'", code); + return; + } + + regno = REGNO (x); + if ((is_quad && !NEON_REGNO_OK_FOR_QUAD (regno)) + || (!is_quad && !VFP_REGNO_OK_FOR_DOUBLE (regno))) + { + output_operand_lossage ("invalid operand for code '%c'", code); + return; + } + + fprintf (stream, "%c%d", is_quad ? 'q' : 'd', + (regno - FIRST_VFP_REGNUM) >> (is_quad ? 2 : 1)); + } + return; + + /* These two codes print the low/high doubleword register of a Neon quad + register, respectively. For pair-structure types, can also print + low/high quadword registers. */ + case 'e': + case 'f': + { + int mode = GET_MODE (x); + int regno; + + if ((GET_MODE_SIZE (mode) != 16 + && GET_MODE_SIZE (mode) != 32) || GET_CODE (x) != REG) + { + output_operand_lossage ("invalid operand for code '%c'", code); + return; + } + + regno = REGNO (x); + if (!NEON_REGNO_OK_FOR_QUAD (regno)) + { + output_operand_lossage ("invalid operand for code '%c'", code); + return; + } + + if (GET_MODE_SIZE (mode) == 16) + fprintf (stream, "d%d", ((regno - FIRST_VFP_REGNUM) >> 1) + + (code == 'f' ? 1 : 0)); + else + fprintf (stream, "q%d", ((regno - FIRST_VFP_REGNUM) >> 2) + + (code == 'f' ? 1 : 0)); + } + return; + + /* Print a VFPv3 floating-point constant, represented as an integer + index. */ + case 'G': + { + int index = vfp3_const_double_index (x); + gcc_assert (index != -1); + fprintf (stream, "%d", index); + } + return; + + /* Print bits representing opcode features for Neon. + + Bit 0 is 1 for signed, 0 for unsigned. Floats count as signed + and polynomials as unsigned. + + Bit 1 is 1 for floats and polynomials, 0 for ordinary integers. + + Bit 2 is 1 for rounding functions, 0 otherwise. */ + + /* Identify the type as 's', 'u', 'p' or 'f'. */ + case 'T': + { + HOST_WIDE_INT bits = INTVAL (x); + fputc ("uspf"[bits & 3], stream); + } + return; + + /* Likewise, but signed and unsigned integers are both 'i'. */ + case 'F': + { + HOST_WIDE_INT bits = INTVAL (x); + fputc ("iipf"[bits & 3], stream); + } + return; + + /* As for 'T', but emit 'u' instead of 'p'. */ + case 't': + { + HOST_WIDE_INT bits = INTVAL (x); + fputc ("usuf"[bits & 3], stream); + } + return; + + /* Bit 2: rounding (vs none). */ + case 'O': + { + HOST_WIDE_INT bits = INTVAL (x); + fputs ((bits & 4) != 0 ? "r" : "", stream); + } + return; + + /* Memory operand for vld1/vst1 instruction. */ + case 'A': + { + rtx addr; + bool postinc = FALSE; + unsigned align, modesize, align_bits; + + gcc_assert (GET_CODE (x) == MEM); + addr = XEXP (x, 0); + if (GET_CODE (addr) == POST_INC) + { + postinc = 1; + addr = XEXP (addr, 0); + } + asm_fprintf (stream, "[%r", REGNO (addr)); + + /* We know the alignment of this access, so we can emit a hint in the + instruction (for some alignments) as an aid to the memory subsystem + of the target. */ + align = MEM_ALIGN (x) >> 3; + modesize = GET_MODE_SIZE (GET_MODE (x)); + + /* Only certain alignment specifiers are supported by the hardware. */ + if (modesize == 16 && (align % 32) == 0) + align_bits = 256; + else if ((modesize == 8 || modesize == 16) && (align % 16) == 0) + align_bits = 128; + else if ((align % 8) == 0) + align_bits = 64; + else + align_bits = 0; + + if (align_bits != 0) + asm_fprintf (stream, ":%d", align_bits); + + asm_fprintf (stream, "]"); + + if (postinc) + fputs("!", stream); + } + return; + + case 'C': + { + rtx addr; + + gcc_assert (GET_CODE (x) == MEM); + addr = XEXP (x, 0); + gcc_assert (GET_CODE (addr) == REG); + asm_fprintf (stream, "[%r]", REGNO (addr)); + } + return; + + /* Translate an S register number into a D register number and element index. */ + case 'y': + { + int mode = GET_MODE (x); + int regno; + + if (GET_MODE_SIZE (mode) != 4 || GET_CODE (x) != REG) + { + output_operand_lossage ("invalid operand for code '%c'", code); + return; + } + + regno = REGNO (x); + if (!VFP_REGNO_OK_FOR_SINGLE (regno)) + { + output_operand_lossage ("invalid operand for code '%c'", code); + return; + } + + regno = regno - FIRST_VFP_REGNUM; + fprintf (stream, "d%d[%d]", regno / 2, regno % 2); + } + return; + + /* Register specifier for vld1.16/vst1.16. Translate the S register + number into a D register number and element index. */ + case 'z': + { + int mode = GET_MODE (x); + int regno; + + if (GET_MODE_SIZE (mode) != 2 || GET_CODE (x) != REG) + { + output_operand_lossage ("invalid operand for code '%c'", code); + return; + } + + regno = REGNO (x); + if (!VFP_REGNO_OK_FOR_SINGLE (regno)) + { + output_operand_lossage ("invalid operand for code '%c'", code); + return; + } + + regno = regno - FIRST_VFP_REGNUM; + fprintf (stream, "d%d[%d]", regno/2, ((regno % 2) ? 2 : 0)); + } + return; + + default: + if (x == 0) + { + output_operand_lossage ("missing operand"); + return; + } + + switch (GET_CODE (x)) + { + case REG: + asm_fprintf (stream, "%r", REGNO (x)); + break; + + case MEM: + output_memory_reference_mode = GET_MODE (x); + output_address (XEXP (x, 0)); + break; + + case CONST_DOUBLE: + if (TARGET_NEON) + { + char fpstr[20]; + real_to_decimal (fpstr, CONST_DOUBLE_REAL_VALUE (x), + sizeof (fpstr), 0, 1); + fprintf (stream, "#%s", fpstr); + } + else + fprintf (stream, "#%s", fp_immediate_constant (x)); + break; + + default: + gcc_assert (GET_CODE (x) != NEG); + fputc ('#', stream); + if (GET_CODE (x) == HIGH) + { + fputs (":lower16:", stream); + x = XEXP (x, 0); + } + + output_addr_const (stream, x); + break; + } + } +} + +/* Target hook for printing a memory address. */ +static void +arm_print_operand_address (FILE *stream, rtx x) +{ + if (TARGET_32BIT) + { + int is_minus = GET_CODE (x) == MINUS; + + if (GET_CODE (x) == REG) + asm_fprintf (stream, "[%r, #0]", REGNO (x)); + else if (GET_CODE (x) == PLUS || is_minus) + { + rtx base = XEXP (x, 0); + rtx index = XEXP (x, 1); + HOST_WIDE_INT offset = 0; + if (GET_CODE (base) != REG + || (GET_CODE (index) == REG && REGNO (index) == SP_REGNUM)) + { + /* Ensure that BASE is a register. */ + /* (one of them must be). */ + /* Also ensure the SP is not used as in index register. */ + rtx temp = base; + base = index; + index = temp; + } + switch (GET_CODE (index)) + { + case CONST_INT: + offset = INTVAL (index); + if (is_minus) + offset = -offset; + asm_fprintf (stream, "[%r, #%wd]", + REGNO (base), offset); + break; + + case REG: + asm_fprintf (stream, "[%r, %s%r]", + REGNO (base), is_minus ? "-" : "", + REGNO (index)); + break; + + case MULT: + case ASHIFTRT: + case LSHIFTRT: + case ASHIFT: + case ROTATERT: + { + asm_fprintf (stream, "[%r, %s%r", + REGNO (base), is_minus ? "-" : "", + REGNO (XEXP (index, 0))); + arm_print_operand (stream, index, 'S'); + fputs ("]", stream); + break; + } + + default: + gcc_unreachable (); + } + } + else if (GET_CODE (x) == PRE_INC || GET_CODE (x) == POST_INC + || GET_CODE (x) == PRE_DEC || GET_CODE (x) == POST_DEC) + { + extern enum machine_mode output_memory_reference_mode; + + gcc_assert (GET_CODE (XEXP (x, 0)) == REG); + + if (GET_CODE (x) == PRE_DEC || GET_CODE (x) == PRE_INC) + asm_fprintf (stream, "[%r, #%s%d]!", + REGNO (XEXP (x, 0)), + GET_CODE (x) == PRE_DEC ? "-" : "", + GET_MODE_SIZE (output_memory_reference_mode)); + else + asm_fprintf (stream, "[%r], #%s%d", + REGNO (XEXP (x, 0)), + GET_CODE (x) == POST_DEC ? "-" : "", + GET_MODE_SIZE (output_memory_reference_mode)); + } + else if (GET_CODE (x) == PRE_MODIFY) + { + asm_fprintf (stream, "[%r, ", REGNO (XEXP (x, 0))); + if (GET_CODE (XEXP (XEXP (x, 1), 1)) == CONST_INT) + asm_fprintf (stream, "#%wd]!", + INTVAL (XEXP (XEXP (x, 1), 1))); + else + asm_fprintf (stream, "%r]!", + REGNO (XEXP (XEXP (x, 1), 1))); + } + else if (GET_CODE (x) == POST_MODIFY) + { + asm_fprintf (stream, "[%r], ", REGNO (XEXP (x, 0))); + if (GET_CODE (XEXP (XEXP (x, 1), 1)) == CONST_INT) + asm_fprintf (stream, "#%wd", + INTVAL (XEXP (XEXP (x, 1), 1))); + else + asm_fprintf (stream, "%r", + REGNO (XEXP (XEXP (x, 1), 1))); + } + else output_addr_const (stream, x); + } + else + { + if (GET_CODE (x) == REG) + asm_fprintf (stream, "[%r]", REGNO (x)); + else if (GET_CODE (x) == POST_INC) + asm_fprintf (stream, "%r!", REGNO (XEXP (x, 0))); + else if (GET_CODE (x) == PLUS) + { + gcc_assert (GET_CODE (XEXP (x, 0)) == REG); + if (GET_CODE (XEXP (x, 1)) == CONST_INT) + asm_fprintf (stream, "[%r, #%wd]", + REGNO (XEXP (x, 0)), + INTVAL (XEXP (x, 1))); + else + asm_fprintf (stream, "[%r, %r]", + REGNO (XEXP (x, 0)), + REGNO (XEXP (x, 1))); + } + else + output_addr_const (stream, x); + } +} + +/* Target hook for indicating whether a punctuation character for + TARGET_PRINT_OPERAND is valid. */ +static bool +arm_print_operand_punct_valid_p (unsigned char code) +{ + return (code == '@' || code == '|' || code == '.' + || code == '(' || code == ')' || code == '#' + || (TARGET_32BIT && (code == '?')) + || (TARGET_THUMB2 && (code == '!')) + || (TARGET_THUMB && (code == '_'))); +} + +/* Target hook for assembling integer objects. The ARM version needs to + handle word-sized values specially. */ +static bool +arm_assemble_integer (rtx x, unsigned int size, int aligned_p) +{ + enum machine_mode mode; + + if (size == UNITS_PER_WORD && aligned_p) + { + fputs ("\t.word\t", asm_out_file); + output_addr_const (asm_out_file, x); + + /* Mark symbols as position independent. We only do this in the + .text segment, not in the .data segment. */ + if (NEED_GOT_RELOC && flag_pic && making_const_table && + (GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF)) + { + /* See legitimize_pic_address for an explanation of the + TARGET_VXWORKS_RTP check. */ + if (TARGET_VXWORKS_RTP + || (GET_CODE (x) == SYMBOL_REF && !SYMBOL_REF_LOCAL_P (x))) + fputs ("(GOT)", asm_out_file); + else + fputs ("(GOTOFF)", asm_out_file); + } + fputc ('\n', asm_out_file); + return true; + } + + mode = GET_MODE (x); + + if (arm_vector_mode_supported_p (mode)) + { + int i, units; + + gcc_assert (GET_CODE (x) == CONST_VECTOR); + + units = CONST_VECTOR_NUNITS (x); + size = GET_MODE_SIZE (GET_MODE_INNER (mode)); + + if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT) + for (i = 0; i < units; i++) + { + rtx elt = CONST_VECTOR_ELT (x, i); + assemble_integer + (elt, size, i == 0 ? BIGGEST_ALIGNMENT : size * BITS_PER_UNIT, 1); + } + else + for (i = 0; i < units; i++) + { + rtx elt = CONST_VECTOR_ELT (x, i); + REAL_VALUE_TYPE rval; + + REAL_VALUE_FROM_CONST_DOUBLE (rval, elt); + + assemble_real + (rval, GET_MODE_INNER (mode), + i == 0 ? BIGGEST_ALIGNMENT : size * BITS_PER_UNIT); + } + + return true; + } + + return default_assemble_integer (x, size, aligned_p); +} + +static void +arm_elf_asm_cdtor (rtx symbol, int priority, bool is_ctor) +{ + section *s; + + if (!TARGET_AAPCS_BASED) + { + (is_ctor ? + default_named_section_asm_out_constructor + : default_named_section_asm_out_destructor) (symbol, priority); + return; + } + + /* Put these in the .init_array section, using a special relocation. */ + if (priority != DEFAULT_INIT_PRIORITY) + { + char buf[18]; + sprintf (buf, "%s.%.5u", + is_ctor ? ".init_array" : ".fini_array", + priority); + s = get_section (buf, SECTION_WRITE, NULL_TREE); + } + else if (is_ctor) + s = ctors_section; + else + s = dtors_section; + + switch_to_section (s); + assemble_align (POINTER_SIZE); + fputs ("\t.word\t", asm_out_file); + output_addr_const (asm_out_file, symbol); + fputs ("(target1)\n", asm_out_file); +} + +/* Add a function to the list of static constructors. */ + +static void +arm_elf_asm_constructor (rtx symbol, int priority) +{ + arm_elf_asm_cdtor (symbol, priority, /*is_ctor=*/true); +} + +/* Add a function to the list of static destructors. */ + +static void +arm_elf_asm_destructor (rtx symbol, int priority) +{ + arm_elf_asm_cdtor (symbol, priority, /*is_ctor=*/false); +} + +/* A finite state machine takes care of noticing whether or not instructions + can be conditionally executed, and thus decrease execution time and code + size by deleting branch instructions. The fsm is controlled by + final_prescan_insn, and controls the actions of ASM_OUTPUT_OPCODE. */ + +/* The state of the fsm controlling condition codes are: + 0: normal, do nothing special + 1: make ASM_OUTPUT_OPCODE not output this instruction + 2: make ASM_OUTPUT_OPCODE not output this instruction + 3: make instructions conditional + 4: make instructions conditional + + State transitions (state->state by whom under condition): + 0 -> 1 final_prescan_insn if the `target' is a label + 0 -> 2 final_prescan_insn if the `target' is an unconditional branch + 1 -> 3 ASM_OUTPUT_OPCODE after not having output the conditional branch + 2 -> 4 ASM_OUTPUT_OPCODE after not having output the conditional branch + 3 -> 0 (*targetm.asm_out.internal_label) if the `target' label is reached + (the target label has CODE_LABEL_NUMBER equal to arm_target_label). + 4 -> 0 final_prescan_insn if the `target' unconditional branch is reached + (the target insn is arm_target_insn). + + If the jump clobbers the conditions then we use states 2 and 4. + + A similar thing can be done with conditional return insns. + + XXX In case the `target' is an unconditional branch, this conditionalising + of the instructions always reduces code size, but not always execution + time. But then, I want to reduce the code size to somewhere near what + /bin/cc produces. */ + +/* In addition to this, state is maintained for Thumb-2 COND_EXEC + instructions. When a COND_EXEC instruction is seen the subsequent + instructions are scanned so that multiple conditional instructions can be + combined into a single IT block. arm_condexec_count and arm_condexec_mask + specify the length and true/false mask for the IT block. These will be + decremented/zeroed by arm_asm_output_opcode as the insns are output. */ + +/* Returns the index of the ARM condition code string in + `arm_condition_codes'. COMPARISON should be an rtx like + `(eq (...) (...))'. */ +static enum arm_cond_code +get_arm_condition_code (rtx comparison) +{ + enum machine_mode mode = GET_MODE (XEXP (comparison, 0)); + enum arm_cond_code code; + enum rtx_code comp_code = GET_CODE (comparison); + + if (GET_MODE_CLASS (mode) != MODE_CC) + mode = SELECT_CC_MODE (comp_code, XEXP (comparison, 0), + XEXP (comparison, 1)); + + switch (mode) + { + case CC_DNEmode: code = ARM_NE; goto dominance; + case CC_DEQmode: code = ARM_EQ; goto dominance; + case CC_DGEmode: code = ARM_GE; goto dominance; + case CC_DGTmode: code = ARM_GT; goto dominance; + case CC_DLEmode: code = ARM_LE; goto dominance; + case CC_DLTmode: code = ARM_LT; goto dominance; + case CC_DGEUmode: code = ARM_CS; goto dominance; + case CC_DGTUmode: code = ARM_HI; goto dominance; + case CC_DLEUmode: code = ARM_LS; goto dominance; + case CC_DLTUmode: code = ARM_CC; + + dominance: + gcc_assert (comp_code == EQ || comp_code == NE); + + if (comp_code == EQ) + return ARM_INVERSE_CONDITION_CODE (code); + return code; + + case CC_NOOVmode: + switch (comp_code) + { + case NE: return ARM_NE; + case EQ: return ARM_EQ; + case GE: return ARM_PL; + case LT: return ARM_MI; + default: gcc_unreachable (); + } + + case CC_Zmode: + switch (comp_code) + { + case NE: return ARM_NE; + case EQ: return ARM_EQ; + default: gcc_unreachable (); + } + + case CC_Nmode: + switch (comp_code) + { + case NE: return ARM_MI; + case EQ: return ARM_PL; + default: gcc_unreachable (); + } + + case CCFPEmode: + case CCFPmode: + /* These encodings assume that AC=1 in the FPA system control + byte. This allows us to handle all cases except UNEQ and + LTGT. */ + switch (comp_code) + { + case GE: return ARM_GE; + case GT: return ARM_GT; + case LE: return ARM_LS; + case LT: return ARM_MI; + case NE: return ARM_NE; + case EQ: return ARM_EQ; + case ORDERED: return ARM_VC; + case UNORDERED: return ARM_VS; + case UNLT: return ARM_LT; + case UNLE: return ARM_LE; + case UNGT: return ARM_HI; + case UNGE: return ARM_PL; + /* UNEQ and LTGT do not have a representation. */ + case UNEQ: /* Fall through. */ + case LTGT: /* Fall through. */ + default: gcc_unreachable (); + } + + case CC_SWPmode: + switch (comp_code) + { + case NE: return ARM_NE; + case EQ: return ARM_EQ; + case GE: return ARM_LE; + case GT: return ARM_LT; + case LE: return ARM_GE; + case LT: return ARM_GT; + case GEU: return ARM_LS; + case GTU: return ARM_CC; + case LEU: return ARM_CS; + case LTU: return ARM_HI; + default: gcc_unreachable (); + } + + case CC_Cmode: + switch (comp_code) + { + case LTU: return ARM_CS; + case GEU: return ARM_CC; + default: gcc_unreachable (); + } + + case CC_CZmode: + switch (comp_code) + { + case NE: return ARM_NE; + case EQ: return ARM_EQ; + case GEU: return ARM_CS; + case GTU: return ARM_HI; + case LEU: return ARM_LS; + case LTU: return ARM_CC; + default: gcc_unreachable (); + } + + case CC_NCVmode: + switch (comp_code) + { + case GE: return ARM_GE; + case LT: return ARM_LT; + case GEU: return ARM_CS; + case LTU: return ARM_CC; + default: gcc_unreachable (); + } + + case CCmode: + switch (comp_code) + { + case NE: return ARM_NE; + case EQ: return ARM_EQ; + case GE: return ARM_GE; + case GT: return ARM_GT; + case LE: return ARM_LE; + case LT: return ARM_LT; + case GEU: return ARM_CS; + case GTU: return ARM_HI; + case LEU: return ARM_LS; + case LTU: return ARM_CC; + default: gcc_unreachable (); + } + + default: gcc_unreachable (); + } +} + +/* Tell arm_asm_output_opcode to output IT blocks for conditionally executed + instructions. */ +void +thumb2_final_prescan_insn (rtx insn) +{ + rtx first_insn = insn; + rtx body = PATTERN (insn); + rtx predicate; + enum arm_cond_code code; + int n; + int mask; + + /* Remove the previous insn from the count of insns to be output. */ + if (arm_condexec_count) + arm_condexec_count--; + + /* Nothing to do if we are already inside a conditional block. */ + if (arm_condexec_count) + return; + + if (GET_CODE (body) != COND_EXEC) + return; + + /* Conditional jumps are implemented directly. */ + if (GET_CODE (insn) == JUMP_INSN) + return; + + predicate = COND_EXEC_TEST (body); + arm_current_cc = get_arm_condition_code (predicate); + + n = get_attr_ce_count (insn); + arm_condexec_count = 1; + arm_condexec_mask = (1 << n) - 1; + arm_condexec_masklen = n; + /* See if subsequent instructions can be combined into the same block. */ + for (;;) + { + insn = next_nonnote_insn (insn); + + /* Jumping into the middle of an IT block is illegal, so a label or + barrier terminates the block. */ + if (GET_CODE (insn) != INSN && GET_CODE(insn) != JUMP_INSN) + break; + + body = PATTERN (insn); + /* USE and CLOBBER aren't really insns, so just skip them. */ + if (GET_CODE (body) == USE + || GET_CODE (body) == CLOBBER) + continue; + + /* ??? Recognize conditional jumps, and combine them with IT blocks. */ + if (GET_CODE (body) != COND_EXEC) + break; + /* Allow up to 4 conditionally executed instructions in a block. */ + n = get_attr_ce_count (insn); + if (arm_condexec_masklen + n > 4) + break; + + predicate = COND_EXEC_TEST (body); + code = get_arm_condition_code (predicate); + mask = (1 << n) - 1; + if (arm_current_cc == code) + arm_condexec_mask |= (mask << arm_condexec_masklen); + else if (arm_current_cc != ARM_INVERSE_CONDITION_CODE(code)) + break; + + arm_condexec_count++; + arm_condexec_masklen += n; + + /* A jump must be the last instruction in a conditional block. */ + if (GET_CODE(insn) == JUMP_INSN) + break; + } + /* Restore recog_data (getting the attributes of other insns can + destroy this array, but final.c assumes that it remains intact + across this call). */ + extract_constrain_insn_cached (first_insn); +} + +void +arm_final_prescan_insn (rtx insn) +{ + /* BODY will hold the body of INSN. */ + rtx body = PATTERN (insn); + + /* This will be 1 if trying to repeat the trick, and things need to be + reversed if it appears to fail. */ + int reverse = 0; + + /* If we start with a return insn, we only succeed if we find another one. */ + int seeking_return = 0; + + /* START_INSN will hold the insn from where we start looking. This is the + first insn after the following code_label if REVERSE is true. */ + rtx start_insn = insn; + + /* If in state 4, check if the target branch is reached, in order to + change back to state 0. */ + if (arm_ccfsm_state == 4) + { + if (insn == arm_target_insn) + { + arm_target_insn = NULL; + arm_ccfsm_state = 0; + } + return; + } + + /* If in state 3, it is possible to repeat the trick, if this insn is an + unconditional branch to a label, and immediately following this branch + is the previous target label which is only used once, and the label this + branch jumps to is not too far off. */ + if (arm_ccfsm_state == 3) + { + if (simplejump_p (insn)) + { + start_insn = next_nonnote_insn (start_insn); + if (GET_CODE (start_insn) == BARRIER) + { + /* XXX Isn't this always a barrier? */ + start_insn = next_nonnote_insn (start_insn); + } + if (GET_CODE (start_insn) == CODE_LABEL + && CODE_LABEL_NUMBER (start_insn) == arm_target_label + && LABEL_NUSES (start_insn) == 1) + reverse = TRUE; + else + return; + } + else if (GET_CODE (body) == RETURN) + { + start_insn = next_nonnote_insn (start_insn); + if (GET_CODE (start_insn) == BARRIER) + start_insn = next_nonnote_insn (start_insn); + if (GET_CODE (start_insn) == CODE_LABEL + && CODE_LABEL_NUMBER (start_insn) == arm_target_label + && LABEL_NUSES (start_insn) == 1) + { + reverse = TRUE; + seeking_return = 1; + } + else + return; + } + else + return; + } + + gcc_assert (!arm_ccfsm_state || reverse); + if (GET_CODE (insn) != JUMP_INSN) + return; + + /* This jump might be paralleled with a clobber of the condition codes + the jump should always come first */ + if (GET_CODE (body) == PARALLEL && XVECLEN (body, 0) > 0) + body = XVECEXP (body, 0, 0); + + if (reverse + || (GET_CODE (body) == SET && GET_CODE (SET_DEST (body)) == PC + && GET_CODE (SET_SRC (body)) == IF_THEN_ELSE)) + { + int insns_skipped; + int fail = FALSE, succeed = FALSE; + /* Flag which part of the IF_THEN_ELSE is the LABEL_REF. */ + int then_not_else = TRUE; + rtx this_insn = start_insn, label = 0; + + /* Register the insn jumped to. */ + if (reverse) + { + if (!seeking_return) + label = XEXP (SET_SRC (body), 0); + } + else if (GET_CODE (XEXP (SET_SRC (body), 1)) == LABEL_REF) + label = XEXP (XEXP (SET_SRC (body), 1), 0); + else if (GET_CODE (XEXP (SET_SRC (body), 2)) == LABEL_REF) + { + label = XEXP (XEXP (SET_SRC (body), 2), 0); + then_not_else = FALSE; + } + else if (GET_CODE (XEXP (SET_SRC (body), 1)) == RETURN) + seeking_return = 1; + else if (GET_CODE (XEXP (SET_SRC (body), 2)) == RETURN) + { + seeking_return = 1; + then_not_else = FALSE; + } + else + gcc_unreachable (); + + /* See how many insns this branch skips, and what kind of insns. If all + insns are okay, and the label or unconditional branch to the same + label is not too far away, succeed. */ + for (insns_skipped = 0; + !fail && !succeed && insns_skipped++ < max_insns_skipped;) + { + rtx scanbody; + + this_insn = next_nonnote_insn (this_insn); + if (!this_insn) + break; + + switch (GET_CODE (this_insn)) + { + case CODE_LABEL: + /* Succeed if it is the target label, otherwise fail since + control falls in from somewhere else. */ + if (this_insn == label) + { + arm_ccfsm_state = 1; + succeed = TRUE; + } + else + fail = TRUE; + break; + + case BARRIER: + /* Succeed if the following insn is the target label. + Otherwise fail. + If return insns are used then the last insn in a function + will be a barrier. */ + this_insn = next_nonnote_insn (this_insn); + if (this_insn && this_insn == label) + { + arm_ccfsm_state = 1; + succeed = TRUE; + } + else + fail = TRUE; + break; + + case CALL_INSN: + /* The AAPCS says that conditional calls should not be + used since they make interworking inefficient (the + linker can't transform BL into BLX). That's + only a problem if the machine has BLX. */ + if (arm_arch5) + { + fail = TRUE; + break; + } + + /* Succeed if the following insn is the target label, or + if the following two insns are a barrier and the + target label. */ + this_insn = next_nonnote_insn (this_insn); + if (this_insn && GET_CODE (this_insn) == BARRIER) + this_insn = next_nonnote_insn (this_insn); + + if (this_insn && this_insn == label + && insns_skipped < max_insns_skipped) + { + arm_ccfsm_state = 1; + succeed = TRUE; + } + else + fail = TRUE; + break; + + case JUMP_INSN: + /* If this is an unconditional branch to the same label, succeed. + If it is to another label, do nothing. If it is conditional, + fail. */ + /* XXX Probably, the tests for SET and the PC are + unnecessary. */ + + scanbody = PATTERN (this_insn); + if (GET_CODE (scanbody) == SET + && GET_CODE (SET_DEST (scanbody)) == PC) + { + if (GET_CODE (SET_SRC (scanbody)) == LABEL_REF + && XEXP (SET_SRC (scanbody), 0) == label && !reverse) + { + arm_ccfsm_state = 2; + succeed = TRUE; + } + else if (GET_CODE (SET_SRC (scanbody)) == IF_THEN_ELSE) + fail = TRUE; + } + /* Fail if a conditional return is undesirable (e.g. on a + StrongARM), but still allow this if optimizing for size. */ + else if (GET_CODE (scanbody) == RETURN + && !use_return_insn (TRUE, NULL) + && !optimize_size) + fail = TRUE; + else if (GET_CODE (scanbody) == RETURN + && seeking_return) + { + arm_ccfsm_state = 2; + succeed = TRUE; + } + else if (GET_CODE (scanbody) == PARALLEL) + { + switch (get_attr_conds (this_insn)) + { + case CONDS_NOCOND: + break; + default: + fail = TRUE; + break; + } + } + else + fail = TRUE; /* Unrecognized jump (e.g. epilogue). */ + + break; + + case INSN: + /* Instructions using or affecting the condition codes make it + fail. */ + scanbody = PATTERN (this_insn); + if (!(GET_CODE (scanbody) == SET + || GET_CODE (scanbody) == PARALLEL) + || get_attr_conds (this_insn) != CONDS_NOCOND) + fail = TRUE; + + /* A conditional cirrus instruction must be followed by + a non Cirrus instruction. However, since we + conditionalize instructions in this function and by + the time we get here we can't add instructions + (nops), because shorten_branches() has already been + called, we will disable conditionalizing Cirrus + instructions to be safe. */ + if (GET_CODE (scanbody) != USE + && GET_CODE (scanbody) != CLOBBER + && get_attr_cirrus (this_insn) != CIRRUS_NOT) + fail = TRUE; + break; + + default: + break; + } + } + if (succeed) + { + if ((!seeking_return) && (arm_ccfsm_state == 1 || reverse)) + arm_target_label = CODE_LABEL_NUMBER (label); + else + { + gcc_assert (seeking_return || arm_ccfsm_state == 2); + + while (this_insn && GET_CODE (PATTERN (this_insn)) == USE) + { + this_insn = next_nonnote_insn (this_insn); + gcc_assert (!this_insn + || (GET_CODE (this_insn) != BARRIER + && GET_CODE (this_insn) != CODE_LABEL)); + } + if (!this_insn) + { + /* Oh, dear! we ran off the end.. give up. */ + extract_constrain_insn_cached (insn); + arm_ccfsm_state = 0; + arm_target_insn = NULL; + return; + } + arm_target_insn = this_insn; + } + + /* If REVERSE is true, ARM_CURRENT_CC needs to be inverted from + what it was. */ + if (!reverse) + arm_current_cc = get_arm_condition_code (XEXP (SET_SRC (body), 0)); + + if (reverse || then_not_else) + arm_current_cc = ARM_INVERSE_CONDITION_CODE (arm_current_cc); + } + + /* Restore recog_data (getting the attributes of other insns can + destroy this array, but final.c assumes that it remains intact + across this call. */ + extract_constrain_insn_cached (insn); + } +} + +/* Output IT instructions. */ +void +thumb2_asm_output_opcode (FILE * stream) +{ + char buff[5]; + int n; + + if (arm_condexec_mask) + { + for (n = 0; n < arm_condexec_masklen; n++) + buff[n] = (arm_condexec_mask & (1 << n)) ? 't' : 'e'; + buff[n] = 0; + asm_fprintf(stream, "i%s\t%s\n\t", buff, + arm_condition_codes[arm_current_cc]); + arm_condexec_mask = 0; + } +} + +/* Returns true if REGNO is a valid register + for holding a quantity of type MODE. */ +int +arm_hard_regno_mode_ok (unsigned int regno, enum machine_mode mode) +{ + if (GET_MODE_CLASS (mode) == MODE_CC) + return (regno == CC_REGNUM + || (TARGET_HARD_FLOAT && TARGET_VFP + && regno == VFPCC_REGNUM)); + + if (TARGET_THUMB1) + /* For the Thumb we only allow values bigger than SImode in + registers 0 - 6, so that there is always a second low + register available to hold the upper part of the value. + We probably we ought to ensure that the register is the + start of an even numbered register pair. */ + return (ARM_NUM_REGS (mode) < 2) || (regno < LAST_LO_REGNUM); + + if (TARGET_HARD_FLOAT && TARGET_MAVERICK + && IS_CIRRUS_REGNUM (regno)) + /* We have outlawed SI values in Cirrus registers because they + reside in the lower 32 bits, but SF values reside in the + upper 32 bits. This causes gcc all sorts of grief. We can't + even split the registers into pairs because Cirrus SI values + get sign extended to 64bits-- aldyh. */ + return (GET_MODE_CLASS (mode) == MODE_FLOAT) || (mode == DImode); + + if (TARGET_HARD_FLOAT && TARGET_VFP + && IS_VFP_REGNUM (regno)) + { + if (mode == SFmode || mode == SImode) + return VFP_REGNO_OK_FOR_SINGLE (regno); + + if (mode == DFmode) + return VFP_REGNO_OK_FOR_DOUBLE (regno); + + /* VFP registers can hold HFmode values, but there is no point in + putting them there unless we have hardware conversion insns. */ + if (mode == HFmode) + return TARGET_FP16 && VFP_REGNO_OK_FOR_SINGLE (regno); + + if (TARGET_NEON) + return (VALID_NEON_DREG_MODE (mode) && VFP_REGNO_OK_FOR_DOUBLE (regno)) + || (VALID_NEON_QREG_MODE (mode) + && NEON_REGNO_OK_FOR_QUAD (regno)) + || (mode == TImode && NEON_REGNO_OK_FOR_NREGS (regno, 2)) + || (mode == EImode && NEON_REGNO_OK_FOR_NREGS (regno, 3)) + || (mode == OImode && NEON_REGNO_OK_FOR_NREGS (regno, 4)) + || (mode == CImode && NEON_REGNO_OK_FOR_NREGS (regno, 6)) + || (mode == XImode && NEON_REGNO_OK_FOR_NREGS (regno, 8)); + + return FALSE; + } + + if (TARGET_REALLY_IWMMXT) + { + if (IS_IWMMXT_GR_REGNUM (regno)) + return mode == SImode; + + if (IS_IWMMXT_REGNUM (regno)) + return VALID_IWMMXT_REG_MODE (mode); + } + + /* We allow almost any value to be stored in the general registers. + Restrict doubleword quantities to even register pairs so that we can + use ldrd. Do not allow very large Neon structure opaque modes in + general registers; they would use too many. */ + if (regno <= LAST_ARM_REGNUM) + return !(TARGET_LDRD && GET_MODE_SIZE (mode) > 4 && (regno & 1) != 0) + && ARM_NUM_REGS (mode) <= 4; + + if (regno == FRAME_POINTER_REGNUM + || regno == ARG_POINTER_REGNUM) + /* We only allow integers in the fake hard registers. */ + return GET_MODE_CLASS (mode) == MODE_INT; + + /* The only registers left are the FPA registers + which we only allow to hold FP values. */ + return (TARGET_HARD_FLOAT && TARGET_FPA + && GET_MODE_CLASS (mode) == MODE_FLOAT + && regno >= FIRST_FPA_REGNUM + && regno <= LAST_FPA_REGNUM); +} + +/* For efficiency and historical reasons LO_REGS, HI_REGS and CC_REGS are + not used in arm mode. */ + +enum reg_class +arm_regno_class (int regno) +{ + if (TARGET_THUMB1) + { + if (regno == STACK_POINTER_REGNUM) + return STACK_REG; + if (regno == CC_REGNUM) + return CC_REG; + if (regno < 8) + return LO_REGS; + return HI_REGS; + } + + if (TARGET_THUMB2 && regno < 8) + return LO_REGS; + + if ( regno <= LAST_ARM_REGNUM + || regno == FRAME_POINTER_REGNUM + || regno == ARG_POINTER_REGNUM) + return TARGET_THUMB2 ? HI_REGS : GENERAL_REGS; + + if (regno == CC_REGNUM || regno == VFPCC_REGNUM) + return TARGET_THUMB2 ? CC_REG : NO_REGS; + + if (IS_CIRRUS_REGNUM (regno)) + return CIRRUS_REGS; + + if (IS_VFP_REGNUM (regno)) + { + if (regno <= D7_VFP_REGNUM) + return VFP_D0_D7_REGS; + else if (regno <= LAST_LO_VFP_REGNUM) + return VFP_LO_REGS; + else + return VFP_HI_REGS; + } + + if (IS_IWMMXT_REGNUM (regno)) + return IWMMXT_REGS; + + if (IS_IWMMXT_GR_REGNUM (regno)) + return IWMMXT_GR_REGS; + + return FPA_REGS; +} + +/* Handle a special case when computing the offset + of an argument from the frame pointer. */ +int +arm_debugger_arg_offset (int value, rtx addr) +{ + rtx insn; + + /* We are only interested if dbxout_parms() failed to compute the offset. */ + if (value != 0) + return 0; + + /* We can only cope with the case where the address is held in a register. */ + if (GET_CODE (addr) != REG) + return 0; + + /* If we are using the frame pointer to point at the argument, then + an offset of 0 is correct. */ + if (REGNO (addr) == (unsigned) HARD_FRAME_POINTER_REGNUM) + return 0; + + /* If we are using the stack pointer to point at the + argument, then an offset of 0 is correct. */ + /* ??? Check this is consistent with thumb2 frame layout. */ + if ((TARGET_THUMB || !frame_pointer_needed) + && REGNO (addr) == SP_REGNUM) + return 0; + + /* Oh dear. The argument is pointed to by a register rather + than being held in a register, or being stored at a known + offset from the frame pointer. Since GDB only understands + those two kinds of argument we must translate the address + held in the register into an offset from the frame pointer. + We do this by searching through the insns for the function + looking to see where this register gets its value. If the + register is initialized from the frame pointer plus an offset + then we are in luck and we can continue, otherwise we give up. + + This code is exercised by producing debugging information + for a function with arguments like this: + + double func (double a, double b, int c, double d) {return d;} + + Without this code the stab for parameter 'd' will be set to + an offset of 0 from the frame pointer, rather than 8. */ + + /* The if() statement says: + + If the insn is a normal instruction + and if the insn is setting the value in a register + and if the register being set is the register holding the address of the argument + and if the address is computing by an addition + that involves adding to a register + which is the frame pointer + a constant integer + + then... */ + + for (insn = get_insns (); insn; insn = NEXT_INSN (insn)) + { + if ( GET_CODE (insn) == INSN + && GET_CODE (PATTERN (insn)) == SET + && REGNO (XEXP (PATTERN (insn), 0)) == REGNO (addr) + && GET_CODE (XEXP (PATTERN (insn), 1)) == PLUS + && GET_CODE (XEXP (XEXP (PATTERN (insn), 1), 0)) == REG + && REGNO (XEXP (XEXP (PATTERN (insn), 1), 0)) == (unsigned) HARD_FRAME_POINTER_REGNUM + && GET_CODE (XEXP (XEXP (PATTERN (insn), 1), 1)) == CONST_INT + ) + { + value = INTVAL (XEXP (XEXP (PATTERN (insn), 1), 1)); + + break; + } + } + + if (value == 0) + { + debug_rtx (addr); + warning (0, "unable to compute real location of stacked parameter"); + value = 8; /* XXX magic hack */ + } + + return value; +} + +#define def_mbuiltin(MASK, NAME, TYPE, CODE) \ + do \ + { \ + if ((MASK) & insn_flags) \ + add_builtin_function ((NAME), (TYPE), (CODE), \ + BUILT_IN_MD, NULL, NULL_TREE); \ + } \ + while (0) + +struct builtin_description +{ + const unsigned int mask; + const enum insn_code icode; + const char * const name; + const enum arm_builtins code; + const enum rtx_code comparison; + const unsigned int flag; +}; + +static const struct builtin_description bdesc_2arg[] = +{ +#define IWMMXT_BUILTIN(code, string, builtin) \ + { FL_IWMMXT, CODE_FOR_##code, "__builtin_arm_" string, \ + ARM_BUILTIN_##builtin, UNKNOWN, 0 }, + + IWMMXT_BUILTIN (addv8qi3, "waddb", WADDB) + IWMMXT_BUILTIN (addv4hi3, "waddh", WADDH) + IWMMXT_BUILTIN (addv2si3, "waddw", WADDW) + IWMMXT_BUILTIN (subv8qi3, "wsubb", WSUBB) + IWMMXT_BUILTIN (subv4hi3, "wsubh", WSUBH) + IWMMXT_BUILTIN (subv2si3, "wsubw", WSUBW) + IWMMXT_BUILTIN (ssaddv8qi3, "waddbss", WADDSSB) + IWMMXT_BUILTIN (ssaddv4hi3, "waddhss", WADDSSH) + IWMMXT_BUILTIN (ssaddv2si3, "waddwss", WADDSSW) + IWMMXT_BUILTIN (sssubv8qi3, "wsubbss", WSUBSSB) + IWMMXT_BUILTIN (sssubv4hi3, "wsubhss", WSUBSSH) + IWMMXT_BUILTIN (sssubv2si3, "wsubwss", WSUBSSW) + IWMMXT_BUILTIN (usaddv8qi3, "waddbus", WADDUSB) + IWMMXT_BUILTIN (usaddv4hi3, "waddhus", WADDUSH) + IWMMXT_BUILTIN (usaddv2si3, "waddwus", WADDUSW) + IWMMXT_BUILTIN (ussubv8qi3, "wsubbus", WSUBUSB) + IWMMXT_BUILTIN (ussubv4hi3, "wsubhus", WSUBUSH) + IWMMXT_BUILTIN (ussubv2si3, "wsubwus", WSUBUSW) + IWMMXT_BUILTIN (mulv4hi3, "wmulul", WMULUL) + IWMMXT_BUILTIN (smulv4hi3_highpart, "wmulsm", WMULSM) + IWMMXT_BUILTIN (umulv4hi3_highpart, "wmulum", WMULUM) + IWMMXT_BUILTIN (eqv8qi3, "wcmpeqb", WCMPEQB) + IWMMXT_BUILTIN (eqv4hi3, "wcmpeqh", WCMPEQH) + IWMMXT_BUILTIN (eqv2si3, "wcmpeqw", WCMPEQW) + IWMMXT_BUILTIN (gtuv8qi3, "wcmpgtub", WCMPGTUB) + IWMMXT_BUILTIN (gtuv4hi3, "wcmpgtuh", WCMPGTUH) + IWMMXT_BUILTIN (gtuv2si3, "wcmpgtuw", WCMPGTUW) + IWMMXT_BUILTIN (gtv8qi3, "wcmpgtsb", WCMPGTSB) + IWMMXT_BUILTIN (gtv4hi3, "wcmpgtsh", WCMPGTSH) + IWMMXT_BUILTIN (gtv2si3, "wcmpgtsw", WCMPGTSW) + IWMMXT_BUILTIN (umaxv8qi3, "wmaxub", WMAXUB) + IWMMXT_BUILTIN (smaxv8qi3, "wmaxsb", WMAXSB) + IWMMXT_BUILTIN (umaxv4hi3, "wmaxuh", WMAXUH) + IWMMXT_BUILTIN (smaxv4hi3, "wmaxsh", WMAXSH) + IWMMXT_BUILTIN (umaxv2si3, "wmaxuw", WMAXUW) + IWMMXT_BUILTIN (smaxv2si3, "wmaxsw", WMAXSW) + IWMMXT_BUILTIN (uminv8qi3, "wminub", WMINUB) + IWMMXT_BUILTIN (sminv8qi3, "wminsb", WMINSB) + IWMMXT_BUILTIN (uminv4hi3, "wminuh", WMINUH) + IWMMXT_BUILTIN (sminv4hi3, "wminsh", WMINSH) + IWMMXT_BUILTIN (uminv2si3, "wminuw", WMINUW) + IWMMXT_BUILTIN (sminv2si3, "wminsw", WMINSW) + IWMMXT_BUILTIN (iwmmxt_anddi3, "wand", WAND) + IWMMXT_BUILTIN (iwmmxt_nanddi3, "wandn", WANDN) + IWMMXT_BUILTIN (iwmmxt_iordi3, "wor", WOR) + IWMMXT_BUILTIN (iwmmxt_xordi3, "wxor", WXOR) + IWMMXT_BUILTIN (iwmmxt_uavgv8qi3, "wavg2b", WAVG2B) + IWMMXT_BUILTIN (iwmmxt_uavgv4hi3, "wavg2h", WAVG2H) + IWMMXT_BUILTIN (iwmmxt_uavgrndv8qi3, "wavg2br", WAVG2BR) + IWMMXT_BUILTIN (iwmmxt_uavgrndv4hi3, "wavg2hr", WAVG2HR) + IWMMXT_BUILTIN (iwmmxt_wunpckilb, "wunpckilb", WUNPCKILB) + IWMMXT_BUILTIN (iwmmxt_wunpckilh, "wunpckilh", WUNPCKILH) + IWMMXT_BUILTIN (iwmmxt_wunpckilw, "wunpckilw", WUNPCKILW) + IWMMXT_BUILTIN (iwmmxt_wunpckihb, "wunpckihb", WUNPCKIHB) + IWMMXT_BUILTIN (iwmmxt_wunpckihh, "wunpckihh", WUNPCKIHH) + IWMMXT_BUILTIN (iwmmxt_wunpckihw, "wunpckihw", WUNPCKIHW) + IWMMXT_BUILTIN (iwmmxt_wmadds, "wmadds", WMADDS) + IWMMXT_BUILTIN (iwmmxt_wmaddu, "wmaddu", WMADDU) + +#define IWMMXT_BUILTIN2(code, builtin) \ + { FL_IWMMXT, CODE_FOR_##code, NULL, ARM_BUILTIN_##builtin, UNKNOWN, 0 }, + + IWMMXT_BUILTIN2 (iwmmxt_wpackhss, WPACKHSS) + IWMMXT_BUILTIN2 (iwmmxt_wpackwss, WPACKWSS) + IWMMXT_BUILTIN2 (iwmmxt_wpackdss, WPACKDSS) + IWMMXT_BUILTIN2 (iwmmxt_wpackhus, WPACKHUS) + IWMMXT_BUILTIN2 (iwmmxt_wpackwus, WPACKWUS) + IWMMXT_BUILTIN2 (iwmmxt_wpackdus, WPACKDUS) + IWMMXT_BUILTIN2 (ashlv4hi3_di, WSLLH) + IWMMXT_BUILTIN2 (ashlv4hi3_iwmmxt, WSLLHI) + IWMMXT_BUILTIN2 (ashlv2si3_di, WSLLW) + IWMMXT_BUILTIN2 (ashlv2si3_iwmmxt, WSLLWI) + IWMMXT_BUILTIN2 (ashldi3_di, WSLLD) + IWMMXT_BUILTIN2 (ashldi3_iwmmxt, WSLLDI) + IWMMXT_BUILTIN2 (lshrv4hi3_di, WSRLH) + IWMMXT_BUILTIN2 (lshrv4hi3_iwmmxt, WSRLHI) + IWMMXT_BUILTIN2 (lshrv2si3_di, WSRLW) + IWMMXT_BUILTIN2 (lshrv2si3_iwmmxt, WSRLWI) + IWMMXT_BUILTIN2 (lshrdi3_di, WSRLD) + IWMMXT_BUILTIN2 (lshrdi3_iwmmxt, WSRLDI) + IWMMXT_BUILTIN2 (ashrv4hi3_di, WSRAH) + IWMMXT_BUILTIN2 (ashrv4hi3_iwmmxt, WSRAHI) + IWMMXT_BUILTIN2 (ashrv2si3_di, WSRAW) + IWMMXT_BUILTIN2 (ashrv2si3_iwmmxt, WSRAWI) + IWMMXT_BUILTIN2 (ashrdi3_di, WSRAD) + IWMMXT_BUILTIN2 (ashrdi3_iwmmxt, WSRADI) + IWMMXT_BUILTIN2 (rorv4hi3_di, WRORH) + IWMMXT_BUILTIN2 (rorv4hi3, WRORHI) + IWMMXT_BUILTIN2 (rorv2si3_di, WRORW) + IWMMXT_BUILTIN2 (rorv2si3, WRORWI) + IWMMXT_BUILTIN2 (rordi3_di, WRORD) + IWMMXT_BUILTIN2 (rordi3, WRORDI) + IWMMXT_BUILTIN2 (iwmmxt_wmacuz, WMACUZ) + IWMMXT_BUILTIN2 (iwmmxt_wmacsz, WMACSZ) +}; + +static const struct builtin_description bdesc_1arg[] = +{ + IWMMXT_BUILTIN (iwmmxt_tmovmskb, "tmovmskb", TMOVMSKB) + IWMMXT_BUILTIN (iwmmxt_tmovmskh, "tmovmskh", TMOVMSKH) + IWMMXT_BUILTIN (iwmmxt_tmovmskw, "tmovmskw", TMOVMSKW) + IWMMXT_BUILTIN (iwmmxt_waccb, "waccb", WACCB) + IWMMXT_BUILTIN (iwmmxt_wacch, "wacch", WACCH) + IWMMXT_BUILTIN (iwmmxt_waccw, "waccw", WACCW) + IWMMXT_BUILTIN (iwmmxt_wunpckehub, "wunpckehub", WUNPCKEHUB) + IWMMXT_BUILTIN (iwmmxt_wunpckehuh, "wunpckehuh", WUNPCKEHUH) + IWMMXT_BUILTIN (iwmmxt_wunpckehuw, "wunpckehuw", WUNPCKEHUW) + IWMMXT_BUILTIN (iwmmxt_wunpckehsb, "wunpckehsb", WUNPCKEHSB) + IWMMXT_BUILTIN (iwmmxt_wunpckehsh, "wunpckehsh", WUNPCKEHSH) + IWMMXT_BUILTIN (iwmmxt_wunpckehsw, "wunpckehsw", WUNPCKEHSW) + IWMMXT_BUILTIN (iwmmxt_wunpckelub, "wunpckelub", WUNPCKELUB) + IWMMXT_BUILTIN (iwmmxt_wunpckeluh, "wunpckeluh", WUNPCKELUH) + IWMMXT_BUILTIN (iwmmxt_wunpckeluw, "wunpckeluw", WUNPCKELUW) + IWMMXT_BUILTIN (iwmmxt_wunpckelsb, "wunpckelsb", WUNPCKELSB) + IWMMXT_BUILTIN (iwmmxt_wunpckelsh, "wunpckelsh", WUNPCKELSH) + IWMMXT_BUILTIN (iwmmxt_wunpckelsw, "wunpckelsw", WUNPCKELSW) +}; + +/* Set up all the iWMMXt builtins. This is + not called if TARGET_IWMMXT is zero. */ + +static void +arm_init_iwmmxt_builtins (void) +{ + const struct builtin_description * d; + size_t i; + tree endlink = void_list_node; + + tree V2SI_type_node = build_vector_type_for_mode (intSI_type_node, V2SImode); + tree V4HI_type_node = build_vector_type_for_mode (intHI_type_node, V4HImode); + tree V8QI_type_node = build_vector_type_for_mode (intQI_type_node, V8QImode); + + tree int_ftype_int + = build_function_type (integer_type_node, + tree_cons (NULL_TREE, integer_type_node, endlink)); + tree v8qi_ftype_v8qi_v8qi_int + = build_function_type (V8QI_type_node, + tree_cons (NULL_TREE, V8QI_type_node, + tree_cons (NULL_TREE, V8QI_type_node, + tree_cons (NULL_TREE, + integer_type_node, + endlink)))); + tree v4hi_ftype_v4hi_int + = build_function_type (V4HI_type_node, + tree_cons (NULL_TREE, V4HI_type_node, + tree_cons (NULL_TREE, integer_type_node, + endlink))); + tree v2si_ftype_v2si_int + = build_function_type (V2SI_type_node, + tree_cons (NULL_TREE, V2SI_type_node, + tree_cons (NULL_TREE, integer_type_node, + endlink))); + tree v2si_ftype_di_di + = build_function_type (V2SI_type_node, + tree_cons (NULL_TREE, long_long_integer_type_node, + tree_cons (NULL_TREE, long_long_integer_type_node, + endlink))); + tree di_ftype_di_int + = build_function_type (long_long_integer_type_node, + tree_cons (NULL_TREE, long_long_integer_type_node, + tree_cons (NULL_TREE, integer_type_node, + endlink))); + tree di_ftype_di_int_int + = build_function_type (long_long_integer_type_node, + tree_cons (NULL_TREE, long_long_integer_type_node, + tree_cons (NULL_TREE, integer_type_node, + tree_cons (NULL_TREE, + integer_type_node, + endlink)))); + tree int_ftype_v8qi + = build_function_type (integer_type_node, + tree_cons (NULL_TREE, V8QI_type_node, + endlink)); + tree int_ftype_v4hi + = build_function_type (integer_type_node, + tree_cons (NULL_TREE, V4HI_type_node, + endlink)); + tree int_ftype_v2si + = build_function_type (integer_type_node, + tree_cons (NULL_TREE, V2SI_type_node, + endlink)); + tree int_ftype_v8qi_int + = build_function_type (integer_type_node, + tree_cons (NULL_TREE, V8QI_type_node, + tree_cons (NULL_TREE, integer_type_node, + endlink))); + tree int_ftype_v4hi_int + = build_function_type (integer_type_node, + tree_cons (NULL_TREE, V4HI_type_node, + tree_cons (NULL_TREE, integer_type_node, + endlink))); + tree int_ftype_v2si_int + = build_function_type (integer_type_node, + tree_cons (NULL_TREE, V2SI_type_node, + tree_cons (NULL_TREE, integer_type_node, + endlink))); + tree v8qi_ftype_v8qi_int_int + = build_function_type (V8QI_type_node, + tree_cons (NULL_TREE, V8QI_type_node, + tree_cons (NULL_TREE, integer_type_node, + tree_cons (NULL_TREE, + integer_type_node, + endlink)))); + tree v4hi_ftype_v4hi_int_int + = build_function_type (V4HI_type_node, + tree_cons (NULL_TREE, V4HI_type_node, + tree_cons (NULL_TREE, integer_type_node, + tree_cons (NULL_TREE, + integer_type_node, + endlink)))); + tree v2si_ftype_v2si_int_int + = build_function_type (V2SI_type_node, + tree_cons (NULL_TREE, V2SI_type_node, + tree_cons (NULL_TREE, integer_type_node, + tree_cons (NULL_TREE, + integer_type_node, + endlink)))); + /* Miscellaneous. */ + tree v8qi_ftype_v4hi_v4hi + = build_function_type (V8QI_type_node, + tree_cons (NULL_TREE, V4HI_type_node, + tree_cons (NULL_TREE, V4HI_type_node, + endlink))); + tree v4hi_ftype_v2si_v2si + = build_function_type (V4HI_type_node, + tree_cons (NULL_TREE, V2SI_type_node, + tree_cons (NULL_TREE, V2SI_type_node, + endlink))); + tree v2si_ftype_v4hi_v4hi + = build_function_type (V2SI_type_node, + tree_cons (NULL_TREE, V4HI_type_node, + tree_cons (NULL_TREE, V4HI_type_node, + endlink))); + tree v2si_ftype_v8qi_v8qi + = build_function_type (V2SI_type_node, + tree_cons (NULL_TREE, V8QI_type_node, + tree_cons (NULL_TREE, V8QI_type_node, + endlink))); + tree v4hi_ftype_v4hi_di + = build_function_type (V4HI_type_node, + tree_cons (NULL_TREE, V4HI_type_node, + tree_cons (NULL_TREE, + long_long_integer_type_node, + endlink))); + tree v2si_ftype_v2si_di + = build_function_type (V2SI_type_node, + tree_cons (NULL_TREE, V2SI_type_node, + tree_cons (NULL_TREE, + long_long_integer_type_node, + endlink))); + tree void_ftype_int_int + = build_function_type (void_type_node, + tree_cons (NULL_TREE, integer_type_node, + tree_cons (NULL_TREE, integer_type_node, + endlink))); + tree di_ftype_void + = build_function_type (long_long_unsigned_type_node, endlink); + tree di_ftype_v8qi + = build_function_type (long_long_integer_type_node, + tree_cons (NULL_TREE, V8QI_type_node, + endlink)); + tree di_ftype_v4hi + = build_function_type (long_long_integer_type_node, + tree_cons (NULL_TREE, V4HI_type_node, + endlink)); + tree di_ftype_v2si + = build_function_type (long_long_integer_type_node, + tree_cons (NULL_TREE, V2SI_type_node, + endlink)); + tree v2si_ftype_v4hi + = build_function_type (V2SI_type_node, + tree_cons (NULL_TREE, V4HI_type_node, + endlink)); + tree v4hi_ftype_v8qi + = build_function_type (V4HI_type_node, + tree_cons (NULL_TREE, V8QI_type_node, + endlink)); + + tree di_ftype_di_v4hi_v4hi + = build_function_type (long_long_unsigned_type_node, + tree_cons (NULL_TREE, + long_long_unsigned_type_node, + tree_cons (NULL_TREE, V4HI_type_node, + tree_cons (NULL_TREE, + V4HI_type_node, + endlink)))); + + tree di_ftype_v4hi_v4hi + = build_function_type (long_long_unsigned_type_node, + tree_cons (NULL_TREE, V4HI_type_node, + tree_cons (NULL_TREE, V4HI_type_node, + endlink))); + + /* Normal vector binops. */ + tree v8qi_ftype_v8qi_v8qi + = build_function_type (V8QI_type_node, + tree_cons (NULL_TREE, V8QI_type_node, + tree_cons (NULL_TREE, V8QI_type_node, + endlink))); + tree v4hi_ftype_v4hi_v4hi + = build_function_type (V4HI_type_node, + tree_cons (NULL_TREE, V4HI_type_node, + tree_cons (NULL_TREE, V4HI_type_node, + endlink))); + tree v2si_ftype_v2si_v2si + = build_function_type (V2SI_type_node, + tree_cons (NULL_TREE, V2SI_type_node, + tree_cons (NULL_TREE, V2SI_type_node, + endlink))); + tree di_ftype_di_di + = build_function_type (long_long_unsigned_type_node, + tree_cons (NULL_TREE, long_long_unsigned_type_node, + tree_cons (NULL_TREE, + long_long_unsigned_type_node, + endlink))); + + /* Add all builtins that are more or less simple operations on two + operands. */ + for (i = 0, d = bdesc_2arg; i < ARRAY_SIZE (bdesc_2arg); i++, d++) + { + /* Use one of the operands; the target can have a different mode for + mask-generating compares. */ + enum machine_mode mode; + tree type; + + if (d->name == 0) + continue; + + mode = insn_data[d->icode].operand[1].mode; + + switch (mode) + { + case V8QImode: + type = v8qi_ftype_v8qi_v8qi; + break; + case V4HImode: + type = v4hi_ftype_v4hi_v4hi; + break; + case V2SImode: + type = v2si_ftype_v2si_v2si; + break; + case DImode: + type = di_ftype_di_di; + break; + + default: + gcc_unreachable (); + } + + def_mbuiltin (d->mask, d->name, type, d->code); + } + + /* Add the remaining MMX insns with somewhat more complicated types. */ + def_mbuiltin (FL_IWMMXT, "__builtin_arm_wzero", di_ftype_void, ARM_BUILTIN_WZERO); + def_mbuiltin (FL_IWMMXT, "__builtin_arm_setwcx", void_ftype_int_int, ARM_BUILTIN_SETWCX); + def_mbuiltin (FL_IWMMXT, "__builtin_arm_getwcx", int_ftype_int, ARM_BUILTIN_GETWCX); + + def_mbuiltin (FL_IWMMXT, "__builtin_arm_wsllh", v4hi_ftype_v4hi_di, ARM_BUILTIN_WSLLH); + def_mbuiltin (FL_IWMMXT, "__builtin_arm_wsllw", v2si_ftype_v2si_di, ARM_BUILTIN_WSLLW); + def_mbuiltin (FL_IWMMXT, "__builtin_arm_wslld", di_ftype_di_di, ARM_BUILTIN_WSLLD); + def_mbuiltin (FL_IWMMXT, "__builtin_arm_wsllhi", v4hi_ftype_v4hi_int, ARM_BUILTIN_WSLLHI); + def_mbuiltin (FL_IWMMXT, "__builtin_arm_wsllwi", v2si_ftype_v2si_int, ARM_BUILTIN_WSLLWI); + def_mbuiltin (FL_IWMMXT, "__builtin_arm_wslldi", di_ftype_di_int, ARM_BUILTIN_WSLLDI); + + def_mbuiltin (FL_IWMMXT, "__builtin_arm_wsrlh", v4hi_ftype_v4hi_di, ARM_BUILTIN_WSRLH); + def_mbuiltin (FL_IWMMXT, "__builtin_arm_wsrlw", v2si_ftype_v2si_di, ARM_BUILTIN_WSRLW); + def_mbuiltin (FL_IWMMXT, "__builtin_arm_wsrld", di_ftype_di_di, ARM_BUILTIN_WSRLD); + def_mbuiltin (FL_IWMMXT, "__builtin_arm_wsrlhi", v4hi_ftype_v4hi_int, ARM_BUILTIN_WSRLHI); + def_mbuiltin (FL_IWMMXT, "__builtin_arm_wsrlwi", v2si_ftype_v2si_int, ARM_BUILTIN_WSRLWI); + def_mbuiltin (FL_IWMMXT, "__builtin_arm_wsrldi", di_ftype_di_int, ARM_BUILTIN_WSRLDI); + + def_mbuiltin (FL_IWMMXT, "__builtin_arm_wsrah", v4hi_ftype_v4hi_di, ARM_BUILTIN_WSRAH); + def_mbuiltin (FL_IWMMXT, "__builtin_arm_wsraw", v2si_ftype_v2si_di, ARM_BUILTIN_WSRAW); + def_mbuiltin (FL_IWMMXT, "__builtin_arm_wsrad", di_ftype_di_di, ARM_BUILTIN_WSRAD); + def_mbuiltin (FL_IWMMXT, "__builtin_arm_wsrahi", v4hi_ftype_v4hi_int, ARM_BUILTIN_WSRAHI); + def_mbuiltin (FL_IWMMXT, "__builtin_arm_wsrawi", v2si_ftype_v2si_int, ARM_BUILTIN_WSRAWI); + def_mbuiltin (FL_IWMMXT, "__builtin_arm_wsradi", di_ftype_di_int, ARM_BUILTIN_WSRADI); + + def_mbuiltin (FL_IWMMXT, "__builtin_arm_wrorh", v4hi_ftype_v4hi_di, ARM_BUILTIN_WRORH); + def_mbuiltin (FL_IWMMXT, "__builtin_arm_wrorw", v2si_ftype_v2si_di, ARM_BUILTIN_WRORW); + def_mbuiltin (FL_IWMMXT, "__builtin_arm_wrord", di_ftype_di_di, ARM_BUILTIN_WRORD); + def_mbuiltin (FL_IWMMXT, "__builtin_arm_wrorhi", v4hi_ftype_v4hi_int, ARM_BUILTIN_WRORHI); + def_mbuiltin (FL_IWMMXT, "__builtin_arm_wrorwi", v2si_ftype_v2si_int, ARM_BUILTIN_WRORWI); + def_mbuiltin (FL_IWMMXT, "__builtin_arm_wrordi", di_ftype_di_int, ARM_BUILTIN_WRORDI); + + def_mbuiltin (FL_IWMMXT, "__builtin_arm_wshufh", v4hi_ftype_v4hi_int, ARM_BUILTIN_WSHUFH); + + def_mbuiltin (FL_IWMMXT, "__builtin_arm_wsadb", v2si_ftype_v8qi_v8qi, ARM_BUILTIN_WSADB); + def_mbuiltin (FL_IWMMXT, "__builtin_arm_wsadh", v2si_ftype_v4hi_v4hi, ARM_BUILTIN_WSADH); + def_mbuiltin (FL_IWMMXT, "__builtin_arm_wsadbz", v2si_ftype_v8qi_v8qi, ARM_BUILTIN_WSADBZ); + def_mbuiltin (FL_IWMMXT, "__builtin_arm_wsadhz", v2si_ftype_v4hi_v4hi, ARM_BUILTIN_WSADHZ); + + def_mbuiltin (FL_IWMMXT, "__builtin_arm_textrmsb", int_ftype_v8qi_int, ARM_BUILTIN_TEXTRMSB); + def_mbuiltin (FL_IWMMXT, "__builtin_arm_textrmsh", int_ftype_v4hi_int, ARM_BUILTIN_TEXTRMSH); + def_mbuiltin (FL_IWMMXT, "__builtin_arm_textrmsw", int_ftype_v2si_int, ARM_BUILTIN_TEXTRMSW); + def_mbuiltin (FL_IWMMXT, "__builtin_arm_textrmub", int_ftype_v8qi_int, ARM_BUILTIN_TEXTRMUB); + def_mbuiltin (FL_IWMMXT, "__builtin_arm_textrmuh", int_ftype_v4hi_int, ARM_BUILTIN_TEXTRMUH); + def_mbuiltin (FL_IWMMXT, "__builtin_arm_textrmuw", int_ftype_v2si_int, ARM_BUILTIN_TEXTRMUW); + def_mbuiltin (FL_IWMMXT, "__builtin_arm_tinsrb", v8qi_ftype_v8qi_int_int, ARM_BUILTIN_TINSRB); + def_mbuiltin (FL_IWMMXT, "__builtin_arm_tinsrh", v4hi_ftype_v4hi_int_int, ARM_BUILTIN_TINSRH); + def_mbuiltin (FL_IWMMXT, "__builtin_arm_tinsrw", v2si_ftype_v2si_int_int, ARM_BUILTIN_TINSRW); + + def_mbuiltin (FL_IWMMXT, "__builtin_arm_waccb", di_ftype_v8qi, ARM_BUILTIN_WACCB); + def_mbuiltin (FL_IWMMXT, "__builtin_arm_wacch", di_ftype_v4hi, ARM_BUILTIN_WACCH); + def_mbuiltin (FL_IWMMXT, "__builtin_arm_waccw", di_ftype_v2si, ARM_BUILTIN_WACCW); + + def_mbuiltin (FL_IWMMXT, "__builtin_arm_tmovmskb", int_ftype_v8qi, ARM_BUILTIN_TMOVMSKB); + def_mbuiltin (FL_IWMMXT, "__builtin_arm_tmovmskh", int_ftype_v4hi, ARM_BUILTIN_TMOVMSKH); + def_mbuiltin (FL_IWMMXT, "__builtin_arm_tmovmskw", int_ftype_v2si, ARM_BUILTIN_TMOVMSKW); + + def_mbuiltin (FL_IWMMXT, "__builtin_arm_wpackhss", v8qi_ftype_v4hi_v4hi, ARM_BUILTIN_WPACKHSS); + def_mbuiltin (FL_IWMMXT, "__builtin_arm_wpackhus", v8qi_ftype_v4hi_v4hi, ARM_BUILTIN_WPACKHUS); + def_mbuiltin (FL_IWMMXT, "__builtin_arm_wpackwus", v4hi_ftype_v2si_v2si, ARM_BUILTIN_WPACKWUS); + def_mbuiltin (FL_IWMMXT, "__builtin_arm_wpackwss", v4hi_ftype_v2si_v2si, ARM_BUILTIN_WPACKWSS); + def_mbuiltin (FL_IWMMXT, "__builtin_arm_wpackdus", v2si_ftype_di_di, ARM_BUILTIN_WPACKDUS); + def_mbuiltin (FL_IWMMXT, "__builtin_arm_wpackdss", v2si_ftype_di_di, ARM_BUILTIN_WPACKDSS); + + def_mbuiltin (FL_IWMMXT, "__builtin_arm_wunpckehub", v4hi_ftype_v8qi, ARM_BUILTIN_WUNPCKEHUB); + def_mbuiltin (FL_IWMMXT, "__builtin_arm_wunpckehuh", v2si_ftype_v4hi, ARM_BUILTIN_WUNPCKEHUH); + def_mbuiltin (FL_IWMMXT, "__builtin_arm_wunpckehuw", di_ftype_v2si, ARM_BUILTIN_WUNPCKEHUW); + def_mbuiltin (FL_IWMMXT, "__builtin_arm_wunpckehsb", v4hi_ftype_v8qi, ARM_BUILTIN_WUNPCKEHSB); + def_mbuiltin (FL_IWMMXT, "__builtin_arm_wunpckehsh", v2si_ftype_v4hi, ARM_BUILTIN_WUNPCKEHSH); + def_mbuiltin (FL_IWMMXT, "__builtin_arm_wunpckehsw", di_ftype_v2si, ARM_BUILTIN_WUNPCKEHSW); + def_mbuiltin (FL_IWMMXT, "__builtin_arm_wunpckelub", v4hi_ftype_v8qi, ARM_BUILTIN_WUNPCKELUB); + def_mbuiltin (FL_IWMMXT, "__builtin_arm_wunpckeluh", v2si_ftype_v4hi, ARM_BUILTIN_WUNPCKELUH); + def_mbuiltin (FL_IWMMXT, "__builtin_arm_wunpckeluw", di_ftype_v2si, ARM_BUILTIN_WUNPCKELUW); + def_mbuiltin (FL_IWMMXT, "__builtin_arm_wunpckelsb", v4hi_ftype_v8qi, ARM_BUILTIN_WUNPCKELSB); + def_mbuiltin (FL_IWMMXT, "__builtin_arm_wunpckelsh", v2si_ftype_v4hi, ARM_BUILTIN_WUNPCKELSH); + def_mbuiltin (FL_IWMMXT, "__builtin_arm_wunpckelsw", di_ftype_v2si, ARM_BUILTIN_WUNPCKELSW); + + def_mbuiltin (FL_IWMMXT, "__builtin_arm_wmacs", di_ftype_di_v4hi_v4hi, ARM_BUILTIN_WMACS); + def_mbuiltin (FL_IWMMXT, "__builtin_arm_wmacsz", di_ftype_v4hi_v4hi, ARM_BUILTIN_WMACSZ); + def_mbuiltin (FL_IWMMXT, "__builtin_arm_wmacu", di_ftype_di_v4hi_v4hi, ARM_BUILTIN_WMACU); + def_mbuiltin (FL_IWMMXT, "__builtin_arm_wmacuz", di_ftype_v4hi_v4hi, ARM_BUILTIN_WMACUZ); + + def_mbuiltin (FL_IWMMXT, "__builtin_arm_walign", v8qi_ftype_v8qi_v8qi_int, ARM_BUILTIN_WALIGN); + def_mbuiltin (FL_IWMMXT, "__builtin_arm_tmia", di_ftype_di_int_int, ARM_BUILTIN_TMIA); + def_mbuiltin (FL_IWMMXT, "__builtin_arm_tmiaph", di_ftype_di_int_int, ARM_BUILTIN_TMIAPH); + def_mbuiltin (FL_IWMMXT, "__builtin_arm_tmiabb", di_ftype_di_int_int, ARM_BUILTIN_TMIABB); + def_mbuiltin (FL_IWMMXT, "__builtin_arm_tmiabt", di_ftype_di_int_int, ARM_BUILTIN_TMIABT); + def_mbuiltin (FL_IWMMXT, "__builtin_arm_tmiatb", di_ftype_di_int_int, ARM_BUILTIN_TMIATB); + def_mbuiltin (FL_IWMMXT, "__builtin_arm_tmiatt", di_ftype_di_int_int, ARM_BUILTIN_TMIATT); +} + +static void +arm_init_tls_builtins (void) +{ + tree ftype, decl; + + ftype = build_function_type (ptr_type_node, void_list_node); + decl = add_builtin_function ("__builtin_thread_pointer", ftype, + ARM_BUILTIN_THREAD_POINTER, BUILT_IN_MD, + NULL, NULL_TREE); + TREE_NOTHROW (decl) = 1; + TREE_READONLY (decl) = 1; +} + +enum neon_builtin_type_bits { + T_V8QI = 0x0001, + T_V4HI = 0x0002, + T_V2SI = 0x0004, + T_V2SF = 0x0008, + T_DI = 0x0010, + T_V16QI = 0x0020, + T_V8HI = 0x0040, + T_V4SI = 0x0080, + T_V4SF = 0x0100, + T_V2DI = 0x0200, + T_TI = 0x0400, + T_EI = 0x0800, + T_OI = 0x1000 +}; + +#define v8qi_UP T_V8QI +#define v4hi_UP T_V4HI +#define v2si_UP T_V2SI +#define v2sf_UP T_V2SF +#define di_UP T_DI +#define v16qi_UP T_V16QI +#define v8hi_UP T_V8HI +#define v4si_UP T_V4SI +#define v4sf_UP T_V4SF +#define v2di_UP T_V2DI +#define ti_UP T_TI +#define ei_UP T_EI +#define oi_UP T_OI + +#define UP(X) X##_UP + +#define T_MAX 13 + +typedef enum { + NEON_BINOP, + NEON_TERNOP, + NEON_UNOP, + NEON_GETLANE, + NEON_SETLANE, + NEON_CREATE, + NEON_DUP, + NEON_DUPLANE, + NEON_COMBINE, + NEON_SPLIT, + NEON_LANEMUL, + NEON_LANEMULL, + NEON_LANEMULH, + NEON_LANEMAC, + NEON_SCALARMUL, + NEON_SCALARMULL, + NEON_SCALARMULH, + NEON_SCALARMAC, + NEON_CONVERT, + NEON_FIXCONV, + NEON_SELECT, + NEON_RESULTPAIR, + NEON_REINTERP, + NEON_VTBL, + NEON_VTBX, + NEON_LOAD1, + NEON_LOAD1LANE, + NEON_STORE1, + NEON_STORE1LANE, + NEON_LOADSTRUCT, + NEON_LOADSTRUCTLANE, + NEON_STORESTRUCT, + NEON_STORESTRUCTLANE, + NEON_LOGICBINOP, + NEON_SHIFTINSERT, + NEON_SHIFTIMM, + NEON_SHIFTACC +} neon_itype; + +typedef struct { + const char *name; + const neon_itype itype; + const int bits; + const enum insn_code codes[T_MAX]; + const unsigned int num_vars; + unsigned int base_fcode; +} neon_builtin_datum; + +#define CF(N,X) CODE_FOR_neon_##N##X + +#define VAR1(T, N, A) \ + #N, NEON_##T, UP (A), { CF (N, A) }, 1, 0 +#define VAR2(T, N, A, B) \ + #N, NEON_##T, UP (A) | UP (B), { CF (N, A), CF (N, B) }, 2, 0 +#define VAR3(T, N, A, B, C) \ + #N, NEON_##T, UP (A) | UP (B) | UP (C), \ + { CF (N, A), CF (N, B), CF (N, C) }, 3, 0 +#define VAR4(T, N, A, B, C, D) \ + #N, NEON_##T, UP (A) | UP (B) | UP (C) | UP (D), \ + { CF (N, A), CF (N, B), CF (N, C), CF (N, D) }, 4, 0 +#define VAR5(T, N, A, B, C, D, E) \ + #N, NEON_##T, UP (A) | UP (B) | UP (C) | UP (D) | UP (E), \ + { CF (N, A), CF (N, B), CF (N, C), CF (N, D), CF (N, E) }, 5, 0 +#define VAR6(T, N, A, B, C, D, E, F) \ + #N, NEON_##T, UP (A) | UP (B) | UP (C) | UP (D) | UP (E) | UP (F), \ + { CF (N, A), CF (N, B), CF (N, C), CF (N, D), CF (N, E), CF (N, F) }, 6, 0 +#define VAR7(T, N, A, B, C, D, E, F, G) \ + #N, NEON_##T, UP (A) | UP (B) | UP (C) | UP (D) | UP (E) | UP (F) | UP (G), \ + { CF (N, A), CF (N, B), CF (N, C), CF (N, D), CF (N, E), CF (N, F), \ + CF (N, G) }, 7, 0 +#define VAR8(T, N, A, B, C, D, E, F, G, H) \ + #N, NEON_##T, UP (A) | UP (B) | UP (C) | UP (D) | UP (E) | UP (F) | UP (G) \ + | UP (H), \ + { CF (N, A), CF (N, B), CF (N, C), CF (N, D), CF (N, E), CF (N, F), \ + CF (N, G), CF (N, H) }, 8, 0 +#define VAR9(T, N, A, B, C, D, E, F, G, H, I) \ + #N, NEON_##T, UP (A) | UP (B) | UP (C) | UP (D) | UP (E) | UP (F) | UP (G) \ + | UP (H) | UP (I), \ + { CF (N, A), CF (N, B), CF (N, C), CF (N, D), CF (N, E), CF (N, F), \ + CF (N, G), CF (N, H), CF (N, I) }, 9, 0 +#define VAR10(T, N, A, B, C, D, E, F, G, H, I, J) \ + #N, NEON_##T, UP (A) | UP (B) | UP (C) | UP (D) | UP (E) | UP (F) | UP (G) \ + | UP (H) | UP (I) | UP (J), \ + { CF (N, A), CF (N, B), CF (N, C), CF (N, D), CF (N, E), CF (N, F), \ + CF (N, G), CF (N, H), CF (N, I), CF (N, J) }, 10, 0 + +/* The mode entries in the following table correspond to the "key" type of the + instruction variant, i.e. equivalent to that which would be specified after + the assembler mnemonic, which usually refers to the last vector operand. + (Signed/unsigned/polynomial types are not differentiated between though, and + are all mapped onto the same mode for a given element size.) The modes + listed per instruction should be the same as those defined for that + instruction's pattern in neon.md. + WARNING: Variants should be listed in the same increasing order as + neon_builtin_type_bits. */ + +static neon_builtin_datum neon_builtin_data[] = +{ + { VAR10 (BINOP, vadd, + v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf, v2di) }, + { VAR3 (BINOP, vaddl, v8qi, v4hi, v2si) }, + { VAR3 (BINOP, vaddw, v8qi, v4hi, v2si) }, + { VAR6 (BINOP, vhadd, v8qi, v4hi, v2si, v16qi, v8hi, v4si) }, + { VAR8 (BINOP, vqadd, v8qi, v4hi, v2si, di, v16qi, v8hi, v4si, v2di) }, + { VAR3 (BINOP, vaddhn, v8hi, v4si, v2di) }, + { VAR8 (BINOP, vmul, v8qi, v4hi, v2si, v2sf, v16qi, v8hi, v4si, v4sf) }, + { VAR8 (TERNOP, vmla, v8qi, v4hi, v2si, v2sf, v16qi, v8hi, v4si, v4sf) }, + { VAR3 (TERNOP, vmlal, v8qi, v4hi, v2si) }, + { VAR8 (TERNOP, vmls, v8qi, v4hi, v2si, v2sf, v16qi, v8hi, v4si, v4sf) }, + { VAR3 (TERNOP, vmlsl, v8qi, v4hi, v2si) }, + { VAR4 (BINOP, vqdmulh, v4hi, v2si, v8hi, v4si) }, + { VAR2 (TERNOP, vqdmlal, v4hi, v2si) }, + { VAR2 (TERNOP, vqdmlsl, v4hi, v2si) }, + { VAR3 (BINOP, vmull, v8qi, v4hi, v2si) }, + { VAR2 (SCALARMULL, vmull_n, v4hi, v2si) }, + { VAR2 (LANEMULL, vmull_lane, v4hi, v2si) }, + { VAR2 (SCALARMULL, vqdmull_n, v4hi, v2si) }, + { VAR2 (LANEMULL, vqdmull_lane, v4hi, v2si) }, + { VAR4 (SCALARMULH, vqdmulh_n, v4hi, v2si, v8hi, v4si) }, + { VAR4 (LANEMULH, vqdmulh_lane, v4hi, v2si, v8hi, v4si) }, + { VAR2 (BINOP, vqdmull, v4hi, v2si) }, + { VAR8 (BINOP, vshl, v8qi, v4hi, v2si, di, v16qi, v8hi, v4si, v2di) }, + { VAR8 (BINOP, vqshl, v8qi, v4hi, v2si, di, v16qi, v8hi, v4si, v2di) }, + { VAR8 (SHIFTIMM, vshr_n, v8qi, v4hi, v2si, di, v16qi, v8hi, v4si, v2di) }, + { VAR3 (SHIFTIMM, vshrn_n, v8hi, v4si, v2di) }, + { VAR3 (SHIFTIMM, vqshrn_n, v8hi, v4si, v2di) }, + { VAR3 (SHIFTIMM, vqshrun_n, v8hi, v4si, v2di) }, + { VAR8 (SHIFTIMM, vshl_n, v8qi, v4hi, v2si, di, v16qi, v8hi, v4si, v2di) }, + { VAR8 (SHIFTIMM, vqshl_n, v8qi, v4hi, v2si, di, v16qi, v8hi, v4si, v2di) }, + { VAR8 (SHIFTIMM, vqshlu_n, v8qi, v4hi, v2si, di, v16qi, v8hi, v4si, v2di) }, + { VAR3 (SHIFTIMM, vshll_n, v8qi, v4hi, v2si) }, + { VAR8 (SHIFTACC, vsra_n, v8qi, v4hi, v2si, di, v16qi, v8hi, v4si, v2di) }, + { VAR10 (BINOP, vsub, + v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf, v2di) }, + { VAR3 (BINOP, vsubl, v8qi, v4hi, v2si) }, + { VAR3 (BINOP, vsubw, v8qi, v4hi, v2si) }, + { VAR8 (BINOP, vqsub, v8qi, v4hi, v2si, di, v16qi, v8hi, v4si, v2di) }, + { VAR6 (BINOP, vhsub, v8qi, v4hi, v2si, v16qi, v8hi, v4si) }, + { VAR3 (BINOP, vsubhn, v8hi, v4si, v2di) }, + { VAR8 (BINOP, vceq, v8qi, v4hi, v2si, v2sf, v16qi, v8hi, v4si, v4sf) }, + { VAR8 (BINOP, vcge, v8qi, v4hi, v2si, v2sf, v16qi, v8hi, v4si, v4sf) }, + { VAR8 (BINOP, vcgt, v8qi, v4hi, v2si, v2sf, v16qi, v8hi, v4si, v4sf) }, + { VAR2 (BINOP, vcage, v2sf, v4sf) }, + { VAR2 (BINOP, vcagt, v2sf, v4sf) }, + { VAR6 (BINOP, vtst, v8qi, v4hi, v2si, v16qi, v8hi, v4si) }, + { VAR8 (BINOP, vabd, v8qi, v4hi, v2si, v2sf, v16qi, v8hi, v4si, v4sf) }, + { VAR3 (BINOP, vabdl, v8qi, v4hi, v2si) }, + { VAR6 (TERNOP, vaba, v8qi, v4hi, v2si, v16qi, v8hi, v4si) }, + { VAR3 (TERNOP, vabal, v8qi, v4hi, v2si) }, + { VAR8 (BINOP, vmax, v8qi, v4hi, v2si, v2sf, v16qi, v8hi, v4si, v4sf) }, + { VAR8 (BINOP, vmin, v8qi, v4hi, v2si, v2sf, v16qi, v8hi, v4si, v4sf) }, + { VAR4 (BINOP, vpadd, v8qi, v4hi, v2si, v2sf) }, + { VAR6 (UNOP, vpaddl, v8qi, v4hi, v2si, v16qi, v8hi, v4si) }, + { VAR6 (BINOP, vpadal, v8qi, v4hi, v2si, v16qi, v8hi, v4si) }, + { VAR4 (BINOP, vpmax, v8qi, v4hi, v2si, v2sf) }, + { VAR4 (BINOP, vpmin, v8qi, v4hi, v2si, v2sf) }, + { VAR2 (BINOP, vrecps, v2sf, v4sf) }, + { VAR2 (BINOP, vrsqrts, v2sf, v4sf) }, + { VAR8 (SHIFTINSERT, vsri_n, v8qi, v4hi, v2si, di, v16qi, v8hi, v4si, v2di) }, + { VAR8 (SHIFTINSERT, vsli_n, v8qi, v4hi, v2si, di, v16qi, v8hi, v4si, v2di) }, + { VAR8 (UNOP, vabs, v8qi, v4hi, v2si, v2sf, v16qi, v8hi, v4si, v4sf) }, + { VAR6 (UNOP, vqabs, v8qi, v4hi, v2si, v16qi, v8hi, v4si) }, + { VAR8 (UNOP, vneg, v8qi, v4hi, v2si, v2sf, v16qi, v8hi, v4si, v4sf) }, + { VAR6 (UNOP, vqneg, v8qi, v4hi, v2si, v16qi, v8hi, v4si) }, + { VAR6 (UNOP, vcls, v8qi, v4hi, v2si, v16qi, v8hi, v4si) }, + { VAR6 (UNOP, vclz, v8qi, v4hi, v2si, v16qi, v8hi, v4si) }, + { VAR2 (UNOP, vcnt, v8qi, v16qi) }, + { VAR4 (UNOP, vrecpe, v2si, v2sf, v4si, v4sf) }, + { VAR4 (UNOP, vrsqrte, v2si, v2sf, v4si, v4sf) }, + { VAR6 (UNOP, vmvn, v8qi, v4hi, v2si, v16qi, v8hi, v4si) }, + /* FIXME: vget_lane supports more variants than this! */ + { VAR10 (GETLANE, vget_lane, + v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf, v2di) }, + { VAR10 (SETLANE, vset_lane, + v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf, v2di) }, + { VAR5 (CREATE, vcreate, v8qi, v4hi, v2si, v2sf, di) }, + { VAR10 (DUP, vdup_n, + v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf, v2di) }, + { VAR10 (DUPLANE, vdup_lane, + v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf, v2di) }, + { VAR5 (COMBINE, vcombine, v8qi, v4hi, v2si, v2sf, di) }, + { VAR5 (SPLIT, vget_high, v16qi, v8hi, v4si, v4sf, v2di) }, + { VAR5 (SPLIT, vget_low, v16qi, v8hi, v4si, v4sf, v2di) }, + { VAR3 (UNOP, vmovn, v8hi, v4si, v2di) }, + { VAR3 (UNOP, vqmovn, v8hi, v4si, v2di) }, + { VAR3 (UNOP, vqmovun, v8hi, v4si, v2di) }, + { VAR3 (UNOP, vmovl, v8qi, v4hi, v2si) }, + { VAR6 (LANEMUL, vmul_lane, v4hi, v2si, v2sf, v8hi, v4si, v4sf) }, + { VAR6 (LANEMAC, vmla_lane, v4hi, v2si, v2sf, v8hi, v4si, v4sf) }, + { VAR2 (LANEMAC, vmlal_lane, v4hi, v2si) }, + { VAR2 (LANEMAC, vqdmlal_lane, v4hi, v2si) }, + { VAR6 (LANEMAC, vmls_lane, v4hi, v2si, v2sf, v8hi, v4si, v4sf) }, + { VAR2 (LANEMAC, vmlsl_lane, v4hi, v2si) }, + { VAR2 (LANEMAC, vqdmlsl_lane, v4hi, v2si) }, + { VAR6 (SCALARMUL, vmul_n, v4hi, v2si, v2sf, v8hi, v4si, v4sf) }, + { VAR6 (SCALARMAC, vmla_n, v4hi, v2si, v2sf, v8hi, v4si, v4sf) }, + { VAR2 (SCALARMAC, vmlal_n, v4hi, v2si) }, + { VAR2 (SCALARMAC, vqdmlal_n, v4hi, v2si) }, + { VAR6 (SCALARMAC, vmls_n, v4hi, v2si, v2sf, v8hi, v4si, v4sf) }, + { VAR2 (SCALARMAC, vmlsl_n, v4hi, v2si) }, + { VAR2 (SCALARMAC, vqdmlsl_n, v4hi, v2si) }, + { VAR10 (BINOP, vext, + v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf, v2di) }, + { VAR8 (UNOP, vrev64, v8qi, v4hi, v2si, v2sf, v16qi, v8hi, v4si, v4sf) }, + { VAR4 (UNOP, vrev32, v8qi, v4hi, v16qi, v8hi) }, + { VAR2 (UNOP, vrev16, v8qi, v16qi) }, + { VAR4 (CONVERT, vcvt, v2si, v2sf, v4si, v4sf) }, + { VAR4 (FIXCONV, vcvt_n, v2si, v2sf, v4si, v4sf) }, + { VAR10 (SELECT, vbsl, + v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf, v2di) }, + { VAR1 (VTBL, vtbl1, v8qi) }, + { VAR1 (VTBL, vtbl2, v8qi) }, + { VAR1 (VTBL, vtbl3, v8qi) }, + { VAR1 (VTBL, vtbl4, v8qi) }, + { VAR1 (VTBX, vtbx1, v8qi) }, + { VAR1 (VTBX, vtbx2, v8qi) }, + { VAR1 (VTBX, vtbx3, v8qi) }, + { VAR1 (VTBX, vtbx4, v8qi) }, + { VAR8 (RESULTPAIR, vtrn, v8qi, v4hi, v2si, v2sf, v16qi, v8hi, v4si, v4sf) }, + { VAR8 (RESULTPAIR, vzip, v8qi, v4hi, v2si, v2sf, v16qi, v8hi, v4si, v4sf) }, + { VAR8 (RESULTPAIR, vuzp, v8qi, v4hi, v2si, v2sf, v16qi, v8hi, v4si, v4sf) }, + { VAR5 (REINTERP, vreinterpretv8qi, v8qi, v4hi, v2si, v2sf, di) }, + { VAR5 (REINTERP, vreinterpretv4hi, v8qi, v4hi, v2si, v2sf, di) }, + { VAR5 (REINTERP, vreinterpretv2si, v8qi, v4hi, v2si, v2sf, di) }, + { VAR5 (REINTERP, vreinterpretv2sf, v8qi, v4hi, v2si, v2sf, di) }, + { VAR5 (REINTERP, vreinterpretdi, v8qi, v4hi, v2si, v2sf, di) }, + { VAR5 (REINTERP, vreinterpretv16qi, v16qi, v8hi, v4si, v4sf, v2di) }, + { VAR5 (REINTERP, vreinterpretv8hi, v16qi, v8hi, v4si, v4sf, v2di) }, + { VAR5 (REINTERP, vreinterpretv4si, v16qi, v8hi, v4si, v4sf, v2di) }, + { VAR5 (REINTERP, vreinterpretv4sf, v16qi, v8hi, v4si, v4sf, v2di) }, + { VAR5 (REINTERP, vreinterpretv2di, v16qi, v8hi, v4si, v4sf, v2di) }, + { VAR10 (LOAD1, vld1, + v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf, v2di) }, + { VAR10 (LOAD1LANE, vld1_lane, + v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf, v2di) }, + { VAR10 (LOAD1, vld1_dup, + v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf, v2di) }, + { VAR10 (STORE1, vst1, + v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf, v2di) }, + { VAR10 (STORE1LANE, vst1_lane, + v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf, v2di) }, + { VAR9 (LOADSTRUCT, + vld2, v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf) }, + { VAR7 (LOADSTRUCTLANE, vld2_lane, + v8qi, v4hi, v2si, v2sf, v8hi, v4si, v4sf) }, + { VAR5 (LOADSTRUCT, vld2_dup, v8qi, v4hi, v2si, v2sf, di) }, + { VAR9 (STORESTRUCT, vst2, + v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf) }, + { VAR7 (STORESTRUCTLANE, vst2_lane, + v8qi, v4hi, v2si, v2sf, v8hi, v4si, v4sf) }, + { VAR9 (LOADSTRUCT, + vld3, v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf) }, + { VAR7 (LOADSTRUCTLANE, vld3_lane, + v8qi, v4hi, v2si, v2sf, v8hi, v4si, v4sf) }, + { VAR5 (LOADSTRUCT, vld3_dup, v8qi, v4hi, v2si, v2sf, di) }, + { VAR9 (STORESTRUCT, vst3, + v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf) }, + { VAR7 (STORESTRUCTLANE, vst3_lane, + v8qi, v4hi, v2si, v2sf, v8hi, v4si, v4sf) }, + { VAR9 (LOADSTRUCT, vld4, + v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf) }, + { VAR7 (LOADSTRUCTLANE, vld4_lane, + v8qi, v4hi, v2si, v2sf, v8hi, v4si, v4sf) }, + { VAR5 (LOADSTRUCT, vld4_dup, v8qi, v4hi, v2si, v2sf, di) }, + { VAR9 (STORESTRUCT, vst4, + v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf) }, + { VAR7 (STORESTRUCTLANE, vst4_lane, + v8qi, v4hi, v2si, v2sf, v8hi, v4si, v4sf) }, + { VAR10 (LOGICBINOP, vand, + v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf, v2di) }, + { VAR10 (LOGICBINOP, vorr, + v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf, v2di) }, + { VAR10 (BINOP, veor, + v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf, v2di) }, + { VAR10 (LOGICBINOP, vbic, + v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf, v2di) }, + { VAR10 (LOGICBINOP, vorn, + v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf, v2di) } +}; + +#undef CF +#undef VAR1 +#undef VAR2 +#undef VAR3 +#undef VAR4 +#undef VAR5 +#undef VAR6 +#undef VAR7 +#undef VAR8 +#undef VAR9 +#undef VAR10 + +static void +arm_init_neon_builtins (void) +{ + unsigned int i, fcode = ARM_BUILTIN_NEON_BASE; + + tree neon_intQI_type_node; + tree neon_intHI_type_node; + tree neon_polyQI_type_node; + tree neon_polyHI_type_node; + tree neon_intSI_type_node; + tree neon_intDI_type_node; + tree neon_float_type_node; + + tree intQI_pointer_node; + tree intHI_pointer_node; + tree intSI_pointer_node; + tree intDI_pointer_node; + tree float_pointer_node; + + tree const_intQI_node; + tree const_intHI_node; + tree const_intSI_node; + tree const_intDI_node; + tree const_float_node; + + tree const_intQI_pointer_node; + tree const_intHI_pointer_node; + tree const_intSI_pointer_node; + tree const_intDI_pointer_node; + tree const_float_pointer_node; + + tree V8QI_type_node; + tree V4HI_type_node; + tree V2SI_type_node; + tree V2SF_type_node; + tree V16QI_type_node; + tree V8HI_type_node; + tree V4SI_type_node; + tree V4SF_type_node; + tree V2DI_type_node; + + tree intUQI_type_node; + tree intUHI_type_node; + tree intUSI_type_node; + tree intUDI_type_node; + + tree intEI_type_node; + tree intOI_type_node; + tree intCI_type_node; + tree intXI_type_node; + + tree V8QI_pointer_node; + tree V4HI_pointer_node; + tree V2SI_pointer_node; + tree V2SF_pointer_node; + tree V16QI_pointer_node; + tree V8HI_pointer_node; + tree V4SI_pointer_node; + tree V4SF_pointer_node; + tree V2DI_pointer_node; + + tree void_ftype_pv8qi_v8qi_v8qi; + tree void_ftype_pv4hi_v4hi_v4hi; + tree void_ftype_pv2si_v2si_v2si; + tree void_ftype_pv2sf_v2sf_v2sf; + tree void_ftype_pdi_di_di; + tree void_ftype_pv16qi_v16qi_v16qi; + tree void_ftype_pv8hi_v8hi_v8hi; + tree void_ftype_pv4si_v4si_v4si; + tree void_ftype_pv4sf_v4sf_v4sf; + tree void_ftype_pv2di_v2di_v2di; + + tree reinterp_ftype_dreg[5][5]; + tree reinterp_ftype_qreg[5][5]; + tree dreg_types[5], qreg_types[5]; + + /* Create distinguished type nodes for NEON vector element types, + and pointers to values of such types, so we can detect them later. */ + neon_intQI_type_node = make_signed_type (GET_MODE_PRECISION (QImode)); + neon_intHI_type_node = make_signed_type (GET_MODE_PRECISION (HImode)); + neon_polyQI_type_node = make_signed_type (GET_MODE_PRECISION (QImode)); + neon_polyHI_type_node = make_signed_type (GET_MODE_PRECISION (HImode)); + neon_intSI_type_node = make_signed_type (GET_MODE_PRECISION (SImode)); + neon_intDI_type_node = make_signed_type (GET_MODE_PRECISION (DImode)); + neon_float_type_node = make_node (REAL_TYPE); + TYPE_PRECISION (neon_float_type_node) = FLOAT_TYPE_SIZE; + layout_type (neon_float_type_node); + + /* Define typedefs which exactly correspond to the modes we are basing vector + types on. If you change these names you'll need to change + the table used by arm_mangle_type too. */ + (*lang_hooks.types.register_builtin_type) (neon_intQI_type_node, + "__builtin_neon_qi"); + (*lang_hooks.types.register_builtin_type) (neon_intHI_type_node, + "__builtin_neon_hi"); + (*lang_hooks.types.register_builtin_type) (neon_intSI_type_node, + "__builtin_neon_si"); + (*lang_hooks.types.register_builtin_type) (neon_float_type_node, + "__builtin_neon_sf"); + (*lang_hooks.types.register_builtin_type) (neon_intDI_type_node, + "__builtin_neon_di"); + (*lang_hooks.types.register_builtin_type) (neon_polyQI_type_node, + "__builtin_neon_poly8"); + (*lang_hooks.types.register_builtin_type) (neon_polyHI_type_node, + "__builtin_neon_poly16"); + + intQI_pointer_node = build_pointer_type (neon_intQI_type_node); + intHI_pointer_node = build_pointer_type (neon_intHI_type_node); + intSI_pointer_node = build_pointer_type (neon_intSI_type_node); + intDI_pointer_node = build_pointer_type (neon_intDI_type_node); + float_pointer_node = build_pointer_type (neon_float_type_node); + + /* Next create constant-qualified versions of the above types. */ + const_intQI_node = build_qualified_type (neon_intQI_type_node, + TYPE_QUAL_CONST); + const_intHI_node = build_qualified_type (neon_intHI_type_node, + TYPE_QUAL_CONST); + const_intSI_node = build_qualified_type (neon_intSI_type_node, + TYPE_QUAL_CONST); + const_intDI_node = build_qualified_type (neon_intDI_type_node, + TYPE_QUAL_CONST); + const_float_node = build_qualified_type (neon_float_type_node, + TYPE_QUAL_CONST); + + const_intQI_pointer_node = build_pointer_type (const_intQI_node); + const_intHI_pointer_node = build_pointer_type (const_intHI_node); + const_intSI_pointer_node = build_pointer_type (const_intSI_node); + const_intDI_pointer_node = build_pointer_type (const_intDI_node); + const_float_pointer_node = build_pointer_type (const_float_node); + + /* Now create vector types based on our NEON element types. */ + /* 64-bit vectors. */ + V8QI_type_node = + build_vector_type_for_mode (neon_intQI_type_node, V8QImode); + V4HI_type_node = + build_vector_type_for_mode (neon_intHI_type_node, V4HImode); + V2SI_type_node = + build_vector_type_for_mode (neon_intSI_type_node, V2SImode); + V2SF_type_node = + build_vector_type_for_mode (neon_float_type_node, V2SFmode); + /* 128-bit vectors. */ + V16QI_type_node = + build_vector_type_for_mode (neon_intQI_type_node, V16QImode); + V8HI_type_node = + build_vector_type_for_mode (neon_intHI_type_node, V8HImode); + V4SI_type_node = + build_vector_type_for_mode (neon_intSI_type_node, V4SImode); + V4SF_type_node = + build_vector_type_for_mode (neon_float_type_node, V4SFmode); + V2DI_type_node = + build_vector_type_for_mode (neon_intDI_type_node, V2DImode); + + /* Unsigned integer types for various mode sizes. */ + intUQI_type_node = make_unsigned_type (GET_MODE_PRECISION (QImode)); + intUHI_type_node = make_unsigned_type (GET_MODE_PRECISION (HImode)); + intUSI_type_node = make_unsigned_type (GET_MODE_PRECISION (SImode)); + intUDI_type_node = make_unsigned_type (GET_MODE_PRECISION (DImode)); + + (*lang_hooks.types.register_builtin_type) (intUQI_type_node, + "__builtin_neon_uqi"); + (*lang_hooks.types.register_builtin_type) (intUHI_type_node, + "__builtin_neon_uhi"); + (*lang_hooks.types.register_builtin_type) (intUSI_type_node, + "__builtin_neon_usi"); + (*lang_hooks.types.register_builtin_type) (intUDI_type_node, + "__builtin_neon_udi"); + + /* Opaque integer types for structures of vectors. */ + intEI_type_node = make_signed_type (GET_MODE_PRECISION (EImode)); + intOI_type_node = make_signed_type (GET_MODE_PRECISION (OImode)); + intCI_type_node = make_signed_type (GET_MODE_PRECISION (CImode)); + intXI_type_node = make_signed_type (GET_MODE_PRECISION (XImode)); + + (*lang_hooks.types.register_builtin_type) (intTI_type_node, + "__builtin_neon_ti"); + (*lang_hooks.types.register_builtin_type) (intEI_type_node, + "__builtin_neon_ei"); + (*lang_hooks.types.register_builtin_type) (intOI_type_node, + "__builtin_neon_oi"); + (*lang_hooks.types.register_builtin_type) (intCI_type_node, + "__builtin_neon_ci"); + (*lang_hooks.types.register_builtin_type) (intXI_type_node, + "__builtin_neon_xi"); + + /* Pointers to vector types. */ + V8QI_pointer_node = build_pointer_type (V8QI_type_node); + V4HI_pointer_node = build_pointer_type (V4HI_type_node); + V2SI_pointer_node = build_pointer_type (V2SI_type_node); + V2SF_pointer_node = build_pointer_type (V2SF_type_node); + V16QI_pointer_node = build_pointer_type (V16QI_type_node); + V8HI_pointer_node = build_pointer_type (V8HI_type_node); + V4SI_pointer_node = build_pointer_type (V4SI_type_node); + V4SF_pointer_node = build_pointer_type (V4SF_type_node); + V2DI_pointer_node = build_pointer_type (V2DI_type_node); + + /* Operations which return results as pairs. */ + void_ftype_pv8qi_v8qi_v8qi = + build_function_type_list (void_type_node, V8QI_pointer_node, V8QI_type_node, + V8QI_type_node, NULL); + void_ftype_pv4hi_v4hi_v4hi = + build_function_type_list (void_type_node, V4HI_pointer_node, V4HI_type_node, + V4HI_type_node, NULL); + void_ftype_pv2si_v2si_v2si = + build_function_type_list (void_type_node, V2SI_pointer_node, V2SI_type_node, + V2SI_type_node, NULL); + void_ftype_pv2sf_v2sf_v2sf = + build_function_type_list (void_type_node, V2SF_pointer_node, V2SF_type_node, + V2SF_type_node, NULL); + void_ftype_pdi_di_di = + build_function_type_list (void_type_node, intDI_pointer_node, + neon_intDI_type_node, neon_intDI_type_node, NULL); + void_ftype_pv16qi_v16qi_v16qi = + build_function_type_list (void_type_node, V16QI_pointer_node, + V16QI_type_node, V16QI_type_node, NULL); + void_ftype_pv8hi_v8hi_v8hi = + build_function_type_list (void_type_node, V8HI_pointer_node, V8HI_type_node, + V8HI_type_node, NULL); + void_ftype_pv4si_v4si_v4si = + build_function_type_list (void_type_node, V4SI_pointer_node, V4SI_type_node, + V4SI_type_node, NULL); + void_ftype_pv4sf_v4sf_v4sf = + build_function_type_list (void_type_node, V4SF_pointer_node, V4SF_type_node, + V4SF_type_node, NULL); + void_ftype_pv2di_v2di_v2di = + build_function_type_list (void_type_node, V2DI_pointer_node, V2DI_type_node, + V2DI_type_node, NULL); + + dreg_types[0] = V8QI_type_node; + dreg_types[1] = V4HI_type_node; + dreg_types[2] = V2SI_type_node; + dreg_types[3] = V2SF_type_node; + dreg_types[4] = neon_intDI_type_node; + + qreg_types[0] = V16QI_type_node; + qreg_types[1] = V8HI_type_node; + qreg_types[2] = V4SI_type_node; + qreg_types[3] = V4SF_type_node; + qreg_types[4] = V2DI_type_node; + + for (i = 0; i < 5; i++) + { + int j; + for (j = 0; j < 5; j++) + { + reinterp_ftype_dreg[i][j] + = build_function_type_list (dreg_types[i], dreg_types[j], NULL); + reinterp_ftype_qreg[i][j] + = build_function_type_list (qreg_types[i], qreg_types[j], NULL); + } + } + + for (i = 0; i < ARRAY_SIZE (neon_builtin_data); i++) + { + neon_builtin_datum *d = &neon_builtin_data[i]; + unsigned int j, codeidx = 0; + + d->base_fcode = fcode; + + for (j = 0; j < T_MAX; j++) + { + const char* const modenames[] = { + "v8qi", "v4hi", "v2si", "v2sf", "di", + "v16qi", "v8hi", "v4si", "v4sf", "v2di" + }; + char namebuf[60]; + tree ftype = NULL; + enum insn_code icode; + int is_load = 0, is_store = 0; + + if ((d->bits & (1 << j)) == 0) + continue; + + icode = d->codes[codeidx++]; + + switch (d->itype) + { + case NEON_LOAD1: + case NEON_LOAD1LANE: + case NEON_LOADSTRUCT: + case NEON_LOADSTRUCTLANE: + is_load = 1; + /* Fall through. */ + case NEON_STORE1: + case NEON_STORE1LANE: + case NEON_STORESTRUCT: + case NEON_STORESTRUCTLANE: + if (!is_load) + is_store = 1; + /* Fall through. */ + case NEON_UNOP: + case NEON_BINOP: + case NEON_LOGICBINOP: + case NEON_SHIFTINSERT: + case NEON_TERNOP: + case NEON_GETLANE: + case NEON_SETLANE: + case NEON_CREATE: + case NEON_DUP: + case NEON_DUPLANE: + case NEON_SHIFTIMM: + case NEON_SHIFTACC: + case NEON_COMBINE: + case NEON_SPLIT: + case NEON_CONVERT: + case NEON_FIXCONV: + case NEON_LANEMUL: + case NEON_LANEMULL: + case NEON_LANEMULH: + case NEON_LANEMAC: + case NEON_SCALARMUL: + case NEON_SCALARMULL: + case NEON_SCALARMULH: + case NEON_SCALARMAC: + case NEON_SELECT: + case NEON_VTBL: + case NEON_VTBX: + { + int k; + tree return_type = void_type_node, args = void_list_node; + + /* Build a function type directly from the insn_data for this + builtin. The build_function_type() function takes care of + removing duplicates for us. */ + for (k = insn_data[icode].n_operands - 1; k >= 0; k--) + { + tree eltype; + + if (is_load && k == 1) + { + /* Neon load patterns always have the memory operand + (a SImode pointer) in the operand 1 position. We + want a const pointer to the element type in that + position. */ + gcc_assert (insn_data[icode].operand[k].mode == SImode); + + switch (1 << j) + { + case T_V8QI: + case T_V16QI: + eltype = const_intQI_pointer_node; + break; + + case T_V4HI: + case T_V8HI: + eltype = const_intHI_pointer_node; + break; + + case T_V2SI: + case T_V4SI: + eltype = const_intSI_pointer_node; + break; + + case T_V2SF: + case T_V4SF: + eltype = const_float_pointer_node; + break; + + case T_DI: + case T_V2DI: + eltype = const_intDI_pointer_node; + break; + + default: gcc_unreachable (); + } + } + else if (is_store && k == 0) + { + /* Similarly, Neon store patterns use operand 0 as + the memory location to store to (a SImode pointer). + Use a pointer to the element type of the store in + that position. */ + gcc_assert (insn_data[icode].operand[k].mode == SImode); + + switch (1 << j) + { + case T_V8QI: + case T_V16QI: + eltype = intQI_pointer_node; + break; + + case T_V4HI: + case T_V8HI: + eltype = intHI_pointer_node; + break; + + case T_V2SI: + case T_V4SI: + eltype = intSI_pointer_node; + break; + + case T_V2SF: + case T_V4SF: + eltype = float_pointer_node; + break; + + case T_DI: + case T_V2DI: + eltype = intDI_pointer_node; + break; + + default: gcc_unreachable (); + } + } + else + { + switch (insn_data[icode].operand[k].mode) + { + case VOIDmode: eltype = void_type_node; break; + /* Scalars. */ + case QImode: eltype = neon_intQI_type_node; break; + case HImode: eltype = neon_intHI_type_node; break; + case SImode: eltype = neon_intSI_type_node; break; + case SFmode: eltype = neon_float_type_node; break; + case DImode: eltype = neon_intDI_type_node; break; + case TImode: eltype = intTI_type_node; break; + case EImode: eltype = intEI_type_node; break; + case OImode: eltype = intOI_type_node; break; + case CImode: eltype = intCI_type_node; break; + case XImode: eltype = intXI_type_node; break; + /* 64-bit vectors. */ + case V8QImode: eltype = V8QI_type_node; break; + case V4HImode: eltype = V4HI_type_node; break; + case V2SImode: eltype = V2SI_type_node; break; + case V2SFmode: eltype = V2SF_type_node; break; + /* 128-bit vectors. */ + case V16QImode: eltype = V16QI_type_node; break; + case V8HImode: eltype = V8HI_type_node; break; + case V4SImode: eltype = V4SI_type_node; break; + case V4SFmode: eltype = V4SF_type_node; break; + case V2DImode: eltype = V2DI_type_node; break; + default: gcc_unreachable (); + } + } + + if (k == 0 && !is_store) + return_type = eltype; + else + args = tree_cons (NULL_TREE, eltype, args); + } + + ftype = build_function_type (return_type, args); + } + break; + + case NEON_RESULTPAIR: + { + switch (insn_data[icode].operand[1].mode) + { + case V8QImode: ftype = void_ftype_pv8qi_v8qi_v8qi; break; + case V4HImode: ftype = void_ftype_pv4hi_v4hi_v4hi; break; + case V2SImode: ftype = void_ftype_pv2si_v2si_v2si; break; + case V2SFmode: ftype = void_ftype_pv2sf_v2sf_v2sf; break; + case DImode: ftype = void_ftype_pdi_di_di; break; + case V16QImode: ftype = void_ftype_pv16qi_v16qi_v16qi; break; + case V8HImode: ftype = void_ftype_pv8hi_v8hi_v8hi; break; + case V4SImode: ftype = void_ftype_pv4si_v4si_v4si; break; + case V4SFmode: ftype = void_ftype_pv4sf_v4sf_v4sf; break; + case V2DImode: ftype = void_ftype_pv2di_v2di_v2di; break; + default: gcc_unreachable (); + } + } + break; + + case NEON_REINTERP: + { + /* We iterate over 5 doubleword types, then 5 quadword + types. */ + int rhs = j % 5; + switch (insn_data[icode].operand[0].mode) + { + case V8QImode: ftype = reinterp_ftype_dreg[0][rhs]; break; + case V4HImode: ftype = reinterp_ftype_dreg[1][rhs]; break; + case V2SImode: ftype = reinterp_ftype_dreg[2][rhs]; break; + case V2SFmode: ftype = reinterp_ftype_dreg[3][rhs]; break; + case DImode: ftype = reinterp_ftype_dreg[4][rhs]; break; + case V16QImode: ftype = reinterp_ftype_qreg[0][rhs]; break; + case V8HImode: ftype = reinterp_ftype_qreg[1][rhs]; break; + case V4SImode: ftype = reinterp_ftype_qreg[2][rhs]; break; + case V4SFmode: ftype = reinterp_ftype_qreg[3][rhs]; break; + case V2DImode: ftype = reinterp_ftype_qreg[4][rhs]; break; + default: gcc_unreachable (); + } + } + break; + + default: + gcc_unreachable (); + } + + gcc_assert (ftype != NULL); + + sprintf (namebuf, "__builtin_neon_%s%s", d->name, modenames[j]); + + add_builtin_function (namebuf, ftype, fcode++, BUILT_IN_MD, NULL, + NULL_TREE); + } + } +} + +static void +arm_init_fp16_builtins (void) +{ + tree fp16_type = make_node (REAL_TYPE); + TYPE_PRECISION (fp16_type) = 16; + layout_type (fp16_type); + (*lang_hooks.types.register_builtin_type) (fp16_type, "__fp16"); +} + +static void +arm_init_builtins (void) +{ + arm_init_tls_builtins (); + + if (TARGET_REALLY_IWMMXT) + arm_init_iwmmxt_builtins (); + + if (TARGET_NEON) + arm_init_neon_builtins (); + + if (arm_fp16_format) + arm_init_fp16_builtins (); +} + +/* Implement TARGET_INVALID_PARAMETER_TYPE. */ + +static const char * +arm_invalid_parameter_type (const_tree t) +{ + if (SCALAR_FLOAT_TYPE_P (t) && TYPE_PRECISION (t) == 16) + return N_("function parameters cannot have __fp16 type"); + return NULL; +} + +/* Implement TARGET_INVALID_PARAMETER_TYPE. */ + +static const char * +arm_invalid_return_type (const_tree t) +{ + if (SCALAR_FLOAT_TYPE_P (t) && TYPE_PRECISION (t) == 16) + return N_("functions cannot return __fp16 type"); + return NULL; +} + +/* Implement TARGET_PROMOTED_TYPE. */ + +static tree +arm_promoted_type (const_tree t) +{ + if (SCALAR_FLOAT_TYPE_P (t) && TYPE_PRECISION (t) == 16) + return float_type_node; + return NULL_TREE; +} + +/* Implement TARGET_CONVERT_TO_TYPE. + Specifically, this hook implements the peculiarity of the ARM + half-precision floating-point C semantics that requires conversions between + __fp16 to or from double to do an intermediate conversion to float. */ + +static tree +arm_convert_to_type (tree type, tree expr) +{ + tree fromtype = TREE_TYPE (expr); + if (!SCALAR_FLOAT_TYPE_P (fromtype) || !SCALAR_FLOAT_TYPE_P (type)) + return NULL_TREE; + if ((TYPE_PRECISION (fromtype) == 16 && TYPE_PRECISION (type) > 32) + || (TYPE_PRECISION (type) == 16 && TYPE_PRECISION (fromtype) > 32)) + return convert (type, convert (float_type_node, expr)); + return NULL_TREE; +} + +/* Implement TARGET_SCALAR_MODE_SUPPORTED_P. + This simply adds HFmode as a supported mode; even though we don't + implement arithmetic on this type directly, it's supported by + optabs conversions, much the way the double-word arithmetic is + special-cased in the default hook. */ + +static bool +arm_scalar_mode_supported_p (enum machine_mode mode) +{ + if (mode == HFmode) + return (arm_fp16_format != ARM_FP16_FORMAT_NONE); + else + return default_scalar_mode_supported_p (mode); +} + +/* Errors in the source file can cause expand_expr to return const0_rtx + where we expect a vector. To avoid crashing, use one of the vector + clear instructions. */ + +static rtx +safe_vector_operand (rtx x, enum machine_mode mode) +{ + if (x != const0_rtx) + return x; + x = gen_reg_rtx (mode); + + emit_insn (gen_iwmmxt_clrdi (mode == DImode ? x + : gen_rtx_SUBREG (DImode, x, 0))); + return x; +} + +/* Subroutine of arm_expand_builtin to take care of binop insns. */ + +static rtx +arm_expand_binop_builtin (enum insn_code icode, + tree exp, rtx target) +{ + rtx pat; + tree arg0 = CALL_EXPR_ARG (exp, 0); + tree arg1 = CALL_EXPR_ARG (exp, 1); + rtx op0 = expand_normal (arg0); + rtx op1 = expand_normal (arg1); + enum machine_mode tmode = insn_data[icode].operand[0].mode; + enum machine_mode mode0 = insn_data[icode].operand[1].mode; + enum machine_mode mode1 = insn_data[icode].operand[2].mode; + + if (VECTOR_MODE_P (mode0)) + op0 = safe_vector_operand (op0, mode0); + if (VECTOR_MODE_P (mode1)) + op1 = safe_vector_operand (op1, mode1); + + if (! target + || GET_MODE (target) != tmode + || ! (*insn_data[icode].operand[0].predicate) (target, tmode)) + target = gen_reg_rtx (tmode); + + gcc_assert (GET_MODE (op0) == mode0 && GET_MODE (op1) == mode1); + + if (! (*insn_data[icode].operand[1].predicate) (op0, mode0)) + op0 = copy_to_mode_reg (mode0, op0); + if (! (*insn_data[icode].operand[2].predicate) (op1, mode1)) + op1 = copy_to_mode_reg (mode1, op1); + + pat = GEN_FCN (icode) (target, op0, op1); + if (! pat) + return 0; + emit_insn (pat); + return target; +} + +/* Subroutine of arm_expand_builtin to take care of unop insns. */ + +static rtx +arm_expand_unop_builtin (enum insn_code icode, + tree exp, rtx target, int do_load) +{ + rtx pat; + tree arg0 = CALL_EXPR_ARG (exp, 0); + rtx op0 = expand_normal (arg0); + enum machine_mode tmode = insn_data[icode].operand[0].mode; + enum machine_mode mode0 = insn_data[icode].operand[1].mode; + + if (! target + || GET_MODE (target) != tmode + || ! (*insn_data[icode].operand[0].predicate) (target, tmode)) + target = gen_reg_rtx (tmode); + if (do_load) + op0 = gen_rtx_MEM (mode0, copy_to_mode_reg (Pmode, op0)); + else + { + if (VECTOR_MODE_P (mode0)) + op0 = safe_vector_operand (op0, mode0); + + if (! (*insn_data[icode].operand[1].predicate) (op0, mode0)) + op0 = copy_to_mode_reg (mode0, op0); + } + + pat = GEN_FCN (icode) (target, op0); + if (! pat) + return 0; + emit_insn (pat); + return target; +} + +static int +neon_builtin_compare (const void *a, const void *b) +{ + const neon_builtin_datum *const key = (const neon_builtin_datum *) a; + const neon_builtin_datum *const memb = (const neon_builtin_datum *) b; + unsigned int soughtcode = key->base_fcode; + + if (soughtcode >= memb->base_fcode + && soughtcode < memb->base_fcode + memb->num_vars) + return 0; + else if (soughtcode < memb->base_fcode) + return -1; + else + return 1; +} + +static enum insn_code +locate_neon_builtin_icode (int fcode, neon_itype *itype) +{ + neon_builtin_datum key + = { NULL, (neon_itype) 0, 0, { CODE_FOR_nothing }, 0, 0 }; + neon_builtin_datum *found; + int idx; + + key.base_fcode = fcode; + found = (neon_builtin_datum *) + bsearch (&key, &neon_builtin_data[0], ARRAY_SIZE (neon_builtin_data), + sizeof (neon_builtin_data[0]), neon_builtin_compare); + gcc_assert (found); + idx = fcode - (int) found->base_fcode; + gcc_assert (idx >= 0 && idx < T_MAX && idx < (int)found->num_vars); + + if (itype) + *itype = found->itype; + + return found->codes[idx]; +} + +typedef enum { + NEON_ARG_COPY_TO_REG, + NEON_ARG_CONSTANT, + NEON_ARG_STOP +} builtin_arg; + +#define NEON_MAX_BUILTIN_ARGS 5 + +/* Expand a Neon builtin. */ +static rtx +arm_expand_neon_args (rtx target, int icode, int have_retval, + tree exp, ...) +{ + va_list ap; + rtx pat; + tree arg[NEON_MAX_BUILTIN_ARGS]; + rtx op[NEON_MAX_BUILTIN_ARGS]; + enum machine_mode tmode = insn_data[icode].operand[0].mode; + enum machine_mode mode[NEON_MAX_BUILTIN_ARGS]; + int argc = 0; + + if (have_retval + && (!target + || GET_MODE (target) != tmode + || !(*insn_data[icode].operand[0].predicate) (target, tmode))) + target = gen_reg_rtx (tmode); + + va_start (ap, exp); + + for (;;) + { + builtin_arg thisarg = (builtin_arg) va_arg (ap, int); + + if (thisarg == NEON_ARG_STOP) + break; + else + { + arg[argc] = CALL_EXPR_ARG (exp, argc); + op[argc] = expand_normal (arg[argc]); + mode[argc] = insn_data[icode].operand[argc + have_retval].mode; + + switch (thisarg) + { + case NEON_ARG_COPY_TO_REG: + /*gcc_assert (GET_MODE (op[argc]) == mode[argc]);*/ + if (!(*insn_data[icode].operand[argc + have_retval].predicate) + (op[argc], mode[argc])) + op[argc] = copy_to_mode_reg (mode[argc], op[argc]); + break; + + case NEON_ARG_CONSTANT: + /* FIXME: This error message is somewhat unhelpful. */ + if (!(*insn_data[icode].operand[argc + have_retval].predicate) + (op[argc], mode[argc])) + error ("argument must be a constant"); + break; + + case NEON_ARG_STOP: + gcc_unreachable (); + } + + argc++; + } + } + + va_end (ap); + + if (have_retval) + switch (argc) + { + case 1: + pat = GEN_FCN (icode) (target, op[0]); + break; + + case 2: + pat = GEN_FCN (icode) (target, op[0], op[1]); + break; + + case 3: + pat = GEN_FCN (icode) (target, op[0], op[1], op[2]); + break; + + case 4: + pat = GEN_FCN (icode) (target, op[0], op[1], op[2], op[3]); + break; + + case 5: + pat = GEN_FCN (icode) (target, op[0], op[1], op[2], op[3], op[4]); + break; + + default: + gcc_unreachable (); + } + else + switch (argc) + { + case 1: + pat = GEN_FCN (icode) (op[0]); + break; + + case 2: + pat = GEN_FCN (icode) (op[0], op[1]); + break; + + case 3: + pat = GEN_FCN (icode) (op[0], op[1], op[2]); + break; + + case 4: + pat = GEN_FCN (icode) (op[0], op[1], op[2], op[3]); + break; + + case 5: + pat = GEN_FCN (icode) (op[0], op[1], op[2], op[3], op[4]); + break; + + default: + gcc_unreachable (); + } + + if (!pat) + return 0; + + emit_insn (pat); + + return target; +} + +/* Expand a Neon builtin. These are "special" because they don't have symbolic + constants defined per-instruction or per instruction-variant. Instead, the + required info is looked up in the table neon_builtin_data. */ +static rtx +arm_expand_neon_builtin (int fcode, tree exp, rtx target) +{ + neon_itype itype; + enum insn_code icode = locate_neon_builtin_icode (fcode, &itype); + + switch (itype) + { + case NEON_UNOP: + case NEON_CONVERT: + case NEON_DUPLANE: + return arm_expand_neon_args (target, icode, 1, exp, + NEON_ARG_COPY_TO_REG, NEON_ARG_CONSTANT, NEON_ARG_STOP); + + case NEON_BINOP: + case NEON_SETLANE: + case NEON_SCALARMUL: + case NEON_SCALARMULL: + case NEON_SCALARMULH: + case NEON_SHIFTINSERT: + case NEON_LOGICBINOP: + return arm_expand_neon_args (target, icode, 1, exp, + NEON_ARG_COPY_TO_REG, NEON_ARG_COPY_TO_REG, NEON_ARG_CONSTANT, + NEON_ARG_STOP); + + case NEON_TERNOP: + return arm_expand_neon_args (target, icode, 1, exp, + NEON_ARG_COPY_TO_REG, NEON_ARG_COPY_TO_REG, NEON_ARG_COPY_TO_REG, + NEON_ARG_CONSTANT, NEON_ARG_STOP); + + case NEON_GETLANE: + case NEON_FIXCONV: + case NEON_SHIFTIMM: + return arm_expand_neon_args (target, icode, 1, exp, + NEON_ARG_COPY_TO_REG, NEON_ARG_CONSTANT, NEON_ARG_CONSTANT, + NEON_ARG_STOP); + + case NEON_CREATE: + return arm_expand_neon_args (target, icode, 1, exp, + NEON_ARG_COPY_TO_REG, NEON_ARG_STOP); + + case NEON_DUP: + case NEON_SPLIT: + case NEON_REINTERP: + return arm_expand_neon_args (target, icode, 1, exp, + NEON_ARG_COPY_TO_REG, NEON_ARG_STOP); + + case NEON_COMBINE: + case NEON_VTBL: + return arm_expand_neon_args (target, icode, 1, exp, + NEON_ARG_COPY_TO_REG, NEON_ARG_COPY_TO_REG, NEON_ARG_STOP); + + case NEON_RESULTPAIR: + return arm_expand_neon_args (target, icode, 0, exp, + NEON_ARG_COPY_TO_REG, NEON_ARG_COPY_TO_REG, NEON_ARG_COPY_TO_REG, + NEON_ARG_STOP); + + case NEON_LANEMUL: + case NEON_LANEMULL: + case NEON_LANEMULH: + return arm_expand_neon_args (target, icode, 1, exp, + NEON_ARG_COPY_TO_REG, NEON_ARG_COPY_TO_REG, NEON_ARG_CONSTANT, + NEON_ARG_CONSTANT, NEON_ARG_STOP); + + case NEON_LANEMAC: + return arm_expand_neon_args (target, icode, 1, exp, + NEON_ARG_COPY_TO_REG, NEON_ARG_COPY_TO_REG, NEON_ARG_COPY_TO_REG, + NEON_ARG_CONSTANT, NEON_ARG_CONSTANT, NEON_ARG_STOP); + + case NEON_SHIFTACC: + return arm_expand_neon_args (target, icode, 1, exp, + NEON_ARG_COPY_TO_REG, NEON_ARG_COPY_TO_REG, NEON_ARG_CONSTANT, + NEON_ARG_CONSTANT, NEON_ARG_STOP); + + case NEON_SCALARMAC: + return arm_expand_neon_args (target, icode, 1, exp, + NEON_ARG_COPY_TO_REG, NEON_ARG_COPY_TO_REG, NEON_ARG_COPY_TO_REG, + NEON_ARG_CONSTANT, NEON_ARG_STOP); + + case NEON_SELECT: + case NEON_VTBX: + return arm_expand_neon_args (target, icode, 1, exp, + NEON_ARG_COPY_TO_REG, NEON_ARG_COPY_TO_REG, NEON_ARG_COPY_TO_REG, + NEON_ARG_STOP); + + case NEON_LOAD1: + case NEON_LOADSTRUCT: + return arm_expand_neon_args (target, icode, 1, exp, + NEON_ARG_COPY_TO_REG, NEON_ARG_STOP); + + case NEON_LOAD1LANE: + case NEON_LOADSTRUCTLANE: + return arm_expand_neon_args (target, icode, 1, exp, + NEON_ARG_COPY_TO_REG, NEON_ARG_COPY_TO_REG, NEON_ARG_CONSTANT, + NEON_ARG_STOP); + + case NEON_STORE1: + case NEON_STORESTRUCT: + return arm_expand_neon_args (target, icode, 0, exp, + NEON_ARG_COPY_TO_REG, NEON_ARG_COPY_TO_REG, NEON_ARG_STOP); + + case NEON_STORE1LANE: + case NEON_STORESTRUCTLANE: + return arm_expand_neon_args (target, icode, 0, exp, + NEON_ARG_COPY_TO_REG, NEON_ARG_COPY_TO_REG, NEON_ARG_CONSTANT, + NEON_ARG_STOP); + } + + gcc_unreachable (); +} + +/* Emit code to reinterpret one Neon type as another, without altering bits. */ +void +neon_reinterpret (rtx dest, rtx src) +{ + emit_move_insn (dest, gen_lowpart (GET_MODE (dest), src)); +} + +/* Emit code to place a Neon pair result in memory locations (with equal + registers). */ +void +neon_emit_pair_result_insn (enum machine_mode mode, + rtx (*intfn) (rtx, rtx, rtx, rtx), rtx destaddr, + rtx op1, rtx op2) +{ + rtx mem = gen_rtx_MEM (mode, destaddr); + rtx tmp1 = gen_reg_rtx (mode); + rtx tmp2 = gen_reg_rtx (mode); + + emit_insn (intfn (tmp1, op1, op2, tmp2)); + + emit_move_insn (mem, tmp1); + mem = adjust_address (mem, mode, GET_MODE_SIZE (mode)); + emit_move_insn (mem, tmp2); +} + +/* Set up OPERANDS for a register copy from SRC to DEST, taking care + not to early-clobber SRC registers in the process. + + We assume that the operands described by SRC and DEST represent a + decomposed copy of OPERANDS[1] into OPERANDS[0]. COUNT is the + number of components into which the copy has been decomposed. */ +void +neon_disambiguate_copy (rtx *operands, rtx *dest, rtx *src, unsigned int count) +{ + unsigned int i; + + if (!reg_overlap_mentioned_p (operands[0], operands[1]) + || REGNO (operands[0]) < REGNO (operands[1])) + { + for (i = 0; i < count; i++) + { + operands[2 * i] = dest[i]; + operands[2 * i + 1] = src[i]; + } + } + else + { + for (i = 0; i < count; i++) + { + operands[2 * i] = dest[count - i - 1]; + operands[2 * i + 1] = src[count - i - 1]; + } + } +} + +/* Expand an expression EXP that calls a built-in function, + with result going to TARGET if that's convenient + (and in mode MODE if that's convenient). + SUBTARGET may be used as the target for computing one of EXP's operands. + IGNORE is nonzero if the value is to be ignored. */ + +static rtx +arm_expand_builtin (tree exp, + rtx target, + rtx subtarget ATTRIBUTE_UNUSED, + enum machine_mode mode ATTRIBUTE_UNUSED, + int ignore ATTRIBUTE_UNUSED) +{ + const struct builtin_description * d; + enum insn_code icode; + tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0); + tree arg0; + tree arg1; + tree arg2; + rtx op0; + rtx op1; + rtx op2; + rtx pat; + int fcode = DECL_FUNCTION_CODE (fndecl); + size_t i; + enum machine_mode tmode; + enum machine_mode mode0; + enum machine_mode mode1; + enum machine_mode mode2; + + if (fcode >= ARM_BUILTIN_NEON_BASE) + return arm_expand_neon_builtin (fcode, exp, target); + + switch (fcode) + { + case ARM_BUILTIN_TEXTRMSB: + case ARM_BUILTIN_TEXTRMUB: + case ARM_BUILTIN_TEXTRMSH: + case ARM_BUILTIN_TEXTRMUH: + case ARM_BUILTIN_TEXTRMSW: + case ARM_BUILTIN_TEXTRMUW: + icode = (fcode == ARM_BUILTIN_TEXTRMSB ? CODE_FOR_iwmmxt_textrmsb + : fcode == ARM_BUILTIN_TEXTRMUB ? CODE_FOR_iwmmxt_textrmub + : fcode == ARM_BUILTIN_TEXTRMSH ? CODE_FOR_iwmmxt_textrmsh + : fcode == ARM_BUILTIN_TEXTRMUH ? CODE_FOR_iwmmxt_textrmuh + : CODE_FOR_iwmmxt_textrmw); + + arg0 = CALL_EXPR_ARG (exp, 0); + arg1 = CALL_EXPR_ARG (exp, 1); + op0 = expand_normal (arg0); + op1 = expand_normal (arg1); + tmode = insn_data[icode].operand[0].mode; + mode0 = insn_data[icode].operand[1].mode; + mode1 = insn_data[icode].operand[2].mode; + + if (! (*insn_data[icode].operand[1].predicate) (op0, mode0)) + op0 = copy_to_mode_reg (mode0, op0); + if (! (*insn_data[icode].operand[2].predicate) (op1, mode1)) + { + /* @@@ better error message */ + error ("selector must be an immediate"); + return gen_reg_rtx (tmode); + } + if (target == 0 + || GET_MODE (target) != tmode + || ! (*insn_data[icode].operand[0].predicate) (target, tmode)) + target = gen_reg_rtx (tmode); + pat = GEN_FCN (icode) (target, op0, op1); + if (! pat) + return 0; + emit_insn (pat); + return target; + + case ARM_BUILTIN_TINSRB: + case ARM_BUILTIN_TINSRH: + case ARM_BUILTIN_TINSRW: + icode = (fcode == ARM_BUILTIN_TINSRB ? CODE_FOR_iwmmxt_tinsrb + : fcode == ARM_BUILTIN_TINSRH ? CODE_FOR_iwmmxt_tinsrh + : CODE_FOR_iwmmxt_tinsrw); + arg0 = CALL_EXPR_ARG (exp, 0); + arg1 = CALL_EXPR_ARG (exp, 1); + arg2 = CALL_EXPR_ARG (exp, 2); + op0 = expand_normal (arg0); + op1 = expand_normal (arg1); + op2 = expand_normal (arg2); + tmode = insn_data[icode].operand[0].mode; + mode0 = insn_data[icode].operand[1].mode; + mode1 = insn_data[icode].operand[2].mode; + mode2 = insn_data[icode].operand[3].mode; + + if (! (*insn_data[icode].operand[1].predicate) (op0, mode0)) + op0 = copy_to_mode_reg (mode0, op0); + if (! (*insn_data[icode].operand[2].predicate) (op1, mode1)) + op1 = copy_to_mode_reg (mode1, op1); + if (! (*insn_data[icode].operand[3].predicate) (op2, mode2)) + { + /* @@@ better error message */ + error ("selector must be an immediate"); + return const0_rtx; + } + if (target == 0 + || GET_MODE (target) != tmode + || ! (*insn_data[icode].operand[0].predicate) (target, tmode)) + target = gen_reg_rtx (tmode); + pat = GEN_FCN (icode) (target, op0, op1, op2); + if (! pat) + return 0; + emit_insn (pat); + return target; + + case ARM_BUILTIN_SETWCX: + arg0 = CALL_EXPR_ARG (exp, 0); + arg1 = CALL_EXPR_ARG (exp, 1); + op0 = force_reg (SImode, expand_normal (arg0)); + op1 = expand_normal (arg1); + emit_insn (gen_iwmmxt_tmcr (op1, op0)); + return 0; + + case ARM_BUILTIN_GETWCX: + arg0 = CALL_EXPR_ARG (exp, 0); + op0 = expand_normal (arg0); + target = gen_reg_rtx (SImode); + emit_insn (gen_iwmmxt_tmrc (target, op0)); + return target; + + case ARM_BUILTIN_WSHUFH: + icode = CODE_FOR_iwmmxt_wshufh; + arg0 = CALL_EXPR_ARG (exp, 0); + arg1 = CALL_EXPR_ARG (exp, 1); + op0 = expand_normal (arg0); + op1 = expand_normal (arg1); + tmode = insn_data[icode].operand[0].mode; + mode1 = insn_data[icode].operand[1].mode; + mode2 = insn_data[icode].operand[2].mode; + + if (! (*insn_data[icode].operand[1].predicate) (op0, mode1)) + op0 = copy_to_mode_reg (mode1, op0); + if (! (*insn_data[icode].operand[2].predicate) (op1, mode2)) + { + /* @@@ better error message */ + error ("mask must be an immediate"); + return const0_rtx; + } + if (target == 0 + || GET_MODE (target) != tmode + || ! (*insn_data[icode].operand[0].predicate) (target, tmode)) + target = gen_reg_rtx (tmode); + pat = GEN_FCN (icode) (target, op0, op1); + if (! pat) + return 0; + emit_insn (pat); + return target; + + case ARM_BUILTIN_WSADB: + return arm_expand_binop_builtin (CODE_FOR_iwmmxt_wsadb, exp, target); + case ARM_BUILTIN_WSADH: + return arm_expand_binop_builtin (CODE_FOR_iwmmxt_wsadh, exp, target); + case ARM_BUILTIN_WSADBZ: + return arm_expand_binop_builtin (CODE_FOR_iwmmxt_wsadbz, exp, target); + case ARM_BUILTIN_WSADHZ: + return arm_expand_binop_builtin (CODE_FOR_iwmmxt_wsadhz, exp, target); + + /* Several three-argument builtins. */ + case ARM_BUILTIN_WMACS: + case ARM_BUILTIN_WMACU: + case ARM_BUILTIN_WALIGN: + case ARM_BUILTIN_TMIA: + case ARM_BUILTIN_TMIAPH: + case ARM_BUILTIN_TMIATT: + case ARM_BUILTIN_TMIATB: + case ARM_BUILTIN_TMIABT: + case ARM_BUILTIN_TMIABB: + icode = (fcode == ARM_BUILTIN_WMACS ? CODE_FOR_iwmmxt_wmacs + : fcode == ARM_BUILTIN_WMACU ? CODE_FOR_iwmmxt_wmacu + : fcode == ARM_BUILTIN_TMIA ? CODE_FOR_iwmmxt_tmia + : fcode == ARM_BUILTIN_TMIAPH ? CODE_FOR_iwmmxt_tmiaph + : fcode == ARM_BUILTIN_TMIABB ? CODE_FOR_iwmmxt_tmiabb + : fcode == ARM_BUILTIN_TMIABT ? CODE_FOR_iwmmxt_tmiabt + : fcode == ARM_BUILTIN_TMIATB ? CODE_FOR_iwmmxt_tmiatb + : fcode == ARM_BUILTIN_TMIATT ? CODE_FOR_iwmmxt_tmiatt + : CODE_FOR_iwmmxt_walign); + arg0 = CALL_EXPR_ARG (exp, 0); + arg1 = CALL_EXPR_ARG (exp, 1); + arg2 = CALL_EXPR_ARG (exp, 2); + op0 = expand_normal (arg0); + op1 = expand_normal (arg1); + op2 = expand_normal (arg2); + tmode = insn_data[icode].operand[0].mode; + mode0 = insn_data[icode].operand[1].mode; + mode1 = insn_data[icode].operand[2].mode; + mode2 = insn_data[icode].operand[3].mode; + + if (! (*insn_data[icode].operand[1].predicate) (op0, mode0)) + op0 = copy_to_mode_reg (mode0, op0); + if (! (*insn_data[icode].operand[2].predicate) (op1, mode1)) + op1 = copy_to_mode_reg (mode1, op1); + if (! (*insn_data[icode].operand[3].predicate) (op2, mode2)) + op2 = copy_to_mode_reg (mode2, op2); + if (target == 0 + || GET_MODE (target) != tmode + || ! (*insn_data[icode].operand[0].predicate) (target, tmode)) + target = gen_reg_rtx (tmode); + pat = GEN_FCN (icode) (target, op0, op1, op2); + if (! pat) + return 0; + emit_insn (pat); + return target; + + case ARM_BUILTIN_WZERO: + target = gen_reg_rtx (DImode); + emit_insn (gen_iwmmxt_clrdi (target)); + return target; + + case ARM_BUILTIN_THREAD_POINTER: + return arm_load_tp (target); + + default: + break; + } + + for (i = 0, d = bdesc_2arg; i < ARRAY_SIZE (bdesc_2arg); i++, d++) + if (d->code == (const enum arm_builtins) fcode) + return arm_expand_binop_builtin (d->icode, exp, target); + + for (i = 0, d = bdesc_1arg; i < ARRAY_SIZE (bdesc_1arg); i++, d++) + if (d->code == (const enum arm_builtins) fcode) + return arm_expand_unop_builtin (d->icode, exp, target, 0); + + /* @@@ Should really do something sensible here. */ + return NULL_RTX; +} + +/* Return the number (counting from 0) of + the least significant set bit in MASK. */ + +inline static int +number_of_first_bit_set (unsigned mask) +{ + int bit; + + for (bit = 0; + (mask & (1 << bit)) == 0; + ++bit) + continue; + + return bit; +} + +/* Emit code to push or pop registers to or from the stack. F is the + assembly file. MASK is the registers to push or pop. PUSH is + nonzero if we should push, and zero if we should pop. For debugging + output, if pushing, adjust CFA_OFFSET by the amount of space added + to the stack. REAL_REGS should have the same number of bits set as + MASK, and will be used instead (in the same order) to describe which + registers were saved - this is used to mark the save slots when we + push high registers after moving them to low registers. */ +static void +thumb_pushpop (FILE *f, unsigned long mask, int push, int *cfa_offset, + unsigned long real_regs) +{ + int regno; + int lo_mask = mask & 0xFF; + int pushed_words = 0; + + gcc_assert (mask); + + if (lo_mask == 0 && !push && (mask & (1 << PC_REGNUM))) + { + /* Special case. Do not generate a POP PC statement here, do it in + thumb_exit() */ + thumb_exit (f, -1); + return; + } + + if (push && arm_except_unwind_info (&global_options) == UI_TARGET) + { + fprintf (f, "\t.save\t{"); + for (regno = 0; regno < 15; regno++) + { + if (real_regs & (1 << regno)) + { + if (real_regs & ((1 << regno) -1)) + fprintf (f, ", "); + asm_fprintf (f, "%r", regno); + } + } + fprintf (f, "}\n"); + } + + fprintf (f, "\t%s\t{", push ? "push" : "pop"); + + /* Look at the low registers first. */ + for (regno = 0; regno <= LAST_LO_REGNUM; regno++, lo_mask >>= 1) + { + if (lo_mask & 1) + { + asm_fprintf (f, "%r", regno); + + if ((lo_mask & ~1) != 0) + fprintf (f, ", "); + + pushed_words++; + } + } + + if (push && (mask & (1 << LR_REGNUM))) + { + /* Catch pushing the LR. */ + if (mask & 0xFF) + fprintf (f, ", "); + + asm_fprintf (f, "%r", LR_REGNUM); + + pushed_words++; + } + else if (!push && (mask & (1 << PC_REGNUM))) + { + /* Catch popping the PC. */ + if (TARGET_INTERWORK || TARGET_BACKTRACE + || crtl->calls_eh_return) + { + /* The PC is never poped directly, instead + it is popped into r3 and then BX is used. */ + fprintf (f, "}\n"); + + thumb_exit (f, -1); + + return; + } + else + { + if (mask & 0xFF) + fprintf (f, ", "); + + asm_fprintf (f, "%r", PC_REGNUM); + } + } + + fprintf (f, "}\n"); + + if (push && pushed_words && dwarf2out_do_frame ()) + { + char *l = dwarf2out_cfi_label (false); + int pushed_mask = real_regs; + + *cfa_offset += pushed_words * 4; + dwarf2out_def_cfa (l, SP_REGNUM, *cfa_offset); + + pushed_words = 0; + pushed_mask = real_regs; + for (regno = 0; regno <= 14; regno++, pushed_mask >>= 1) + { + if (pushed_mask & 1) + dwarf2out_reg_save (l, regno, 4 * pushed_words++ - *cfa_offset); + } + } +} + +/* Generate code to return from a thumb function. + If 'reg_containing_return_addr' is -1, then the return address is + actually on the stack, at the stack pointer. */ +static void +thumb_exit (FILE *f, int reg_containing_return_addr) +{ + unsigned regs_available_for_popping; + unsigned regs_to_pop; + int pops_needed; + unsigned available; + unsigned required; + int mode; + int size; + int restore_a4 = FALSE; + + /* Compute the registers we need to pop. */ + regs_to_pop = 0; + pops_needed = 0; + + if (reg_containing_return_addr == -1) + { + regs_to_pop |= 1 << LR_REGNUM; + ++pops_needed; + } + + if (TARGET_BACKTRACE) + { + /* Restore the (ARM) frame pointer and stack pointer. */ + regs_to_pop |= (1 << ARM_HARD_FRAME_POINTER_REGNUM) | (1 << SP_REGNUM); + pops_needed += 2; + } + + /* If there is nothing to pop then just emit the BX instruction and + return. */ + if (pops_needed == 0) + { + if (crtl->calls_eh_return) + asm_fprintf (f, "\tadd\t%r, %r\n", SP_REGNUM, ARM_EH_STACKADJ_REGNUM); + + asm_fprintf (f, "\tbx\t%r\n", reg_containing_return_addr); + return; + } + /* Otherwise if we are not supporting interworking and we have not created + a backtrace structure and the function was not entered in ARM mode then + just pop the return address straight into the PC. */ + else if (!TARGET_INTERWORK + && !TARGET_BACKTRACE + && !is_called_in_ARM_mode (current_function_decl) + && !crtl->calls_eh_return) + { + asm_fprintf (f, "\tpop\t{%r}\n", PC_REGNUM); + return; + } + + /* Find out how many of the (return) argument registers we can corrupt. */ + regs_available_for_popping = 0; + + /* If returning via __builtin_eh_return, the bottom three registers + all contain information needed for the return. */ + if (crtl->calls_eh_return) + size = 12; + else + { + /* If we can deduce the registers used from the function's + return value. This is more reliable that examining + df_regs_ever_live_p () because that will be set if the register is + ever used in the function, not just if the register is used + to hold a return value. */ + + if (crtl->return_rtx != 0) + mode = GET_MODE (crtl->return_rtx); + else + mode = DECL_MODE (DECL_RESULT (current_function_decl)); + + size = GET_MODE_SIZE (mode); + + if (size == 0) + { + /* In a void function we can use any argument register. + In a function that returns a structure on the stack + we can use the second and third argument registers. */ + if (mode == VOIDmode) + regs_available_for_popping = + (1 << ARG_REGISTER (1)) + | (1 << ARG_REGISTER (2)) + | (1 << ARG_REGISTER (3)); + else + regs_available_for_popping = + (1 << ARG_REGISTER (2)) + | (1 << ARG_REGISTER (3)); + } + else if (size <= 4) + regs_available_for_popping = + (1 << ARG_REGISTER (2)) + | (1 << ARG_REGISTER (3)); + else if (size <= 8) + regs_available_for_popping = + (1 << ARG_REGISTER (3)); + } + + /* Match registers to be popped with registers into which we pop them. */ + for (available = regs_available_for_popping, + required = regs_to_pop; + required != 0 && available != 0; + available &= ~(available & - available), + required &= ~(required & - required)) + -- pops_needed; + + /* If we have any popping registers left over, remove them. */ + if (available > 0) + regs_available_for_popping &= ~available; + + /* Otherwise if we need another popping register we can use + the fourth argument register. */ + else if (pops_needed) + { + /* If we have not found any free argument registers and + reg a4 contains the return address, we must move it. */ + if (regs_available_for_popping == 0 + && reg_containing_return_addr == LAST_ARG_REGNUM) + { + asm_fprintf (f, "\tmov\t%r, %r\n", LR_REGNUM, LAST_ARG_REGNUM); + reg_containing_return_addr = LR_REGNUM; + } + else if (size > 12) + { + /* Register a4 is being used to hold part of the return value, + but we have dire need of a free, low register. */ + restore_a4 = TRUE; + + asm_fprintf (f, "\tmov\t%r, %r\n",IP_REGNUM, LAST_ARG_REGNUM); + } + + if (reg_containing_return_addr != LAST_ARG_REGNUM) + { + /* The fourth argument register is available. */ + regs_available_for_popping |= 1 << LAST_ARG_REGNUM; + + --pops_needed; + } + } + + /* Pop as many registers as we can. */ + thumb_pushpop (f, regs_available_for_popping, FALSE, NULL, + regs_available_for_popping); + + /* Process the registers we popped. */ + if (reg_containing_return_addr == -1) + { + /* The return address was popped into the lowest numbered register. */ + regs_to_pop &= ~(1 << LR_REGNUM); + + reg_containing_return_addr = + number_of_first_bit_set (regs_available_for_popping); + + /* Remove this register for the mask of available registers, so that + the return address will not be corrupted by further pops. */ + regs_available_for_popping &= ~(1 << reg_containing_return_addr); + } + + /* If we popped other registers then handle them here. */ + if (regs_available_for_popping) + { + int frame_pointer; + + /* Work out which register currently contains the frame pointer. */ + frame_pointer = number_of_first_bit_set (regs_available_for_popping); + + /* Move it into the correct place. */ + asm_fprintf (f, "\tmov\t%r, %r\n", + ARM_HARD_FRAME_POINTER_REGNUM, frame_pointer); + + /* (Temporarily) remove it from the mask of popped registers. */ + regs_available_for_popping &= ~(1 << frame_pointer); + regs_to_pop &= ~(1 << ARM_HARD_FRAME_POINTER_REGNUM); + + if (regs_available_for_popping) + { + int stack_pointer; + + /* We popped the stack pointer as well, + find the register that contains it. */ + stack_pointer = number_of_first_bit_set (regs_available_for_popping); + + /* Move it into the stack register. */ + asm_fprintf (f, "\tmov\t%r, %r\n", SP_REGNUM, stack_pointer); + + /* At this point we have popped all necessary registers, so + do not worry about restoring regs_available_for_popping + to its correct value: + + assert (pops_needed == 0) + assert (regs_available_for_popping == (1 << frame_pointer)) + assert (regs_to_pop == (1 << STACK_POINTER)) */ + } + else + { + /* Since we have just move the popped value into the frame + pointer, the popping register is available for reuse, and + we know that we still have the stack pointer left to pop. */ + regs_available_for_popping |= (1 << frame_pointer); + } + } + + /* If we still have registers left on the stack, but we no longer have + any registers into which we can pop them, then we must move the return + address into the link register and make available the register that + contained it. */ + if (regs_available_for_popping == 0 && pops_needed > 0) + { + regs_available_for_popping |= 1 << reg_containing_return_addr; + + asm_fprintf (f, "\tmov\t%r, %r\n", LR_REGNUM, + reg_containing_return_addr); + + reg_containing_return_addr = LR_REGNUM; + } + + /* If we have registers left on the stack then pop some more. + We know that at most we will want to pop FP and SP. */ + if (pops_needed > 0) + { + int popped_into; + int move_to; + + thumb_pushpop (f, regs_available_for_popping, FALSE, NULL, + regs_available_for_popping); + + /* We have popped either FP or SP. + Move whichever one it is into the correct register. */ + popped_into = number_of_first_bit_set (regs_available_for_popping); + move_to = number_of_first_bit_set (regs_to_pop); + + asm_fprintf (f, "\tmov\t%r, %r\n", move_to, popped_into); + + regs_to_pop &= ~(1 << move_to); + + --pops_needed; + } + + /* If we still have not popped everything then we must have only + had one register available to us and we are now popping the SP. */ + if (pops_needed > 0) + { + int popped_into; + + thumb_pushpop (f, regs_available_for_popping, FALSE, NULL, + regs_available_for_popping); + + popped_into = number_of_first_bit_set (regs_available_for_popping); + + asm_fprintf (f, "\tmov\t%r, %r\n", SP_REGNUM, popped_into); + /* + assert (regs_to_pop == (1 << STACK_POINTER)) + assert (pops_needed == 1) + */ + } + + /* If necessary restore the a4 register. */ + if (restore_a4) + { + if (reg_containing_return_addr != LR_REGNUM) + { + asm_fprintf (f, "\tmov\t%r, %r\n", LR_REGNUM, LAST_ARG_REGNUM); + reg_containing_return_addr = LR_REGNUM; + } + + asm_fprintf (f, "\tmov\t%r, %r\n", LAST_ARG_REGNUM, IP_REGNUM); + } + + if (crtl->calls_eh_return) + asm_fprintf (f, "\tadd\t%r, %r\n", SP_REGNUM, ARM_EH_STACKADJ_REGNUM); + + /* Return to caller. */ + asm_fprintf (f, "\tbx\t%r\n", reg_containing_return_addr); +} + +/* Scan INSN just before assembler is output for it. + For Thumb-1, we track the status of the condition codes; this + information is used in the cbranchsi4_insn pattern. */ +void +thumb1_final_prescan_insn (rtx insn) +{ + if (flag_print_asm_name) + asm_fprintf (asm_out_file, "%@ 0x%04x\n", + INSN_ADDRESSES (INSN_UID (insn))); + /* Don't overwrite the previous setter when we get to a cbranch. */ + if (INSN_CODE (insn) != CODE_FOR_cbranchsi4_insn) + { + enum attr_conds conds; + + if (cfun->machine->thumb1_cc_insn) + { + if (modified_in_p (cfun->machine->thumb1_cc_op0, insn) + || modified_in_p (cfun->machine->thumb1_cc_op1, insn)) + CC_STATUS_INIT; + } + conds = get_attr_conds (insn); + if (conds == CONDS_SET) + { + rtx set = single_set (insn); + cfun->machine->thumb1_cc_insn = insn; + cfun->machine->thumb1_cc_op0 = SET_DEST (set); + cfun->machine->thumb1_cc_op1 = const0_rtx; + cfun->machine->thumb1_cc_mode = CC_NOOVmode; + if (INSN_CODE (insn) == CODE_FOR_thumb1_subsi3_insn) + { + rtx src1 = XEXP (SET_SRC (set), 1); + if (src1 == const0_rtx) + cfun->machine->thumb1_cc_mode = CCmode; + } + } + else if (conds != CONDS_NOCOND) + cfun->machine->thumb1_cc_insn = NULL_RTX; + } +} + +int +thumb_shiftable_const (unsigned HOST_WIDE_INT val) +{ + unsigned HOST_WIDE_INT mask = 0xff; + int i; + + val = val & (unsigned HOST_WIDE_INT)0xffffffffu; + if (val == 0) /* XXX */ + return 0; + + for (i = 0; i < 25; i++) + if ((val & (mask << i)) == val) + return 1; + + return 0; +} + +/* Returns nonzero if the current function contains, + or might contain a far jump. */ +static int +thumb_far_jump_used_p (void) +{ + rtx insn; + + /* This test is only important for leaf functions. */ + /* assert (!leaf_function_p ()); */ + + /* If we have already decided that far jumps may be used, + do not bother checking again, and always return true even if + it turns out that they are not being used. Once we have made + the decision that far jumps are present (and that hence the link + register will be pushed onto the stack) we cannot go back on it. */ + if (cfun->machine->far_jump_used) + return 1; + + /* If this function is not being called from the prologue/epilogue + generation code then it must be being called from the + INITIAL_ELIMINATION_OFFSET macro. */ + if (!(ARM_DOUBLEWORD_ALIGN || reload_completed)) + { + /* In this case we know that we are being asked about the elimination + of the arg pointer register. If that register is not being used, + then there are no arguments on the stack, and we do not have to + worry that a far jump might force the prologue to push the link + register, changing the stack offsets. In this case we can just + return false, since the presence of far jumps in the function will + not affect stack offsets. + + If the arg pointer is live (or if it was live, but has now been + eliminated and so set to dead) then we do have to test to see if + the function might contain a far jump. This test can lead to some + false negatives, since before reload is completed, then length of + branch instructions is not known, so gcc defaults to returning their + longest length, which in turn sets the far jump attribute to true. + + A false negative will not result in bad code being generated, but it + will result in a needless push and pop of the link register. We + hope that this does not occur too often. + + If we need doubleword stack alignment this could affect the other + elimination offsets so we can't risk getting it wrong. */ + if (df_regs_ever_live_p (ARG_POINTER_REGNUM)) + cfun->machine->arg_pointer_live = 1; + else if (!cfun->machine->arg_pointer_live) + return 0; + } + + /* Check to see if the function contains a branch + insn with the far jump attribute set. */ + for (insn = get_insns (); insn; insn = NEXT_INSN (insn)) + { + if (GET_CODE (insn) == JUMP_INSN + /* Ignore tablejump patterns. */ + && GET_CODE (PATTERN (insn)) != ADDR_VEC + && GET_CODE (PATTERN (insn)) != ADDR_DIFF_VEC + && get_attr_far_jump (insn) == FAR_JUMP_YES + ) + { + /* Record the fact that we have decided that + the function does use far jumps. */ + cfun->machine->far_jump_used = 1; + return 1; + } + } + + return 0; +} + +/* Return nonzero if FUNC must be entered in ARM mode. */ +int +is_called_in_ARM_mode (tree func) +{ + gcc_assert (TREE_CODE (func) == FUNCTION_DECL); + + /* Ignore the problem about functions whose address is taken. */ + if (TARGET_CALLEE_INTERWORKING && TREE_PUBLIC (func)) + return TRUE; + +#ifdef ARM_PE + return lookup_attribute ("interfacearm", DECL_ATTRIBUTES (func)) != NULL_TREE; +#else + return FALSE; +#endif +} + +/* Given the stack offsets and register mask in OFFSETS, decide how + many additional registers to push instead of subtracting a constant + from SP. For epilogues the principle is the same except we use pop. + FOR_PROLOGUE indicates which we're generating. */ +static int +thumb1_extra_regs_pushed (arm_stack_offsets *offsets, bool for_prologue) +{ + HOST_WIDE_INT amount; + unsigned long live_regs_mask = offsets->saved_regs_mask; + /* Extract a mask of the ones we can give to the Thumb's push/pop + instruction. */ + unsigned long l_mask = live_regs_mask & (for_prologue ? 0x40ff : 0xff); + /* Then count how many other high registers will need to be pushed. */ + unsigned long high_regs_pushed = bit_count (live_regs_mask & 0x0f00); + int n_free, reg_base, size; + + if (!for_prologue && frame_pointer_needed) + amount = offsets->locals_base - offsets->saved_regs; + else + amount = offsets->outgoing_args - offsets->saved_regs; + + /* If the stack frame size is 512 exactly, we can save one load + instruction, which should make this a win even when optimizing + for speed. */ + if (!optimize_size && amount != 512) + return 0; + + /* Can't do this if there are high registers to push. */ + if (high_regs_pushed != 0) + return 0; + + /* Shouldn't do it in the prologue if no registers would normally + be pushed at all. In the epilogue, also allow it if we'll have + a pop insn for the PC. */ + if (l_mask == 0 + && (for_prologue + || TARGET_BACKTRACE + || (live_regs_mask & 1 << LR_REGNUM) == 0 + || TARGET_INTERWORK + || crtl->args.pretend_args_size != 0)) + return 0; + + /* Don't do this if thumb_expand_prologue wants to emit instructions + between the push and the stack frame allocation. */ + if (for_prologue + && ((flag_pic && arm_pic_register != INVALID_REGNUM) + || (!frame_pointer_needed && CALLER_INTERWORKING_SLOT_SIZE > 0))) + return 0; + + reg_base = 0; + n_free = 0; + if (!for_prologue) + { + size = arm_size_return_regs (); + reg_base = ARM_NUM_INTS (size); + live_regs_mask >>= reg_base; + } + + while (reg_base + n_free < 8 && !(live_regs_mask & 1) + && (for_prologue || call_used_regs[reg_base + n_free])) + { + live_regs_mask >>= 1; + n_free++; + } + + if (n_free == 0) + return 0; + gcc_assert (amount / 4 * 4 == amount); + + if (amount >= 512 && (amount - n_free * 4) < 512) + return (amount - 508) / 4; + if (amount <= n_free * 4) + return amount / 4; + return 0; +} + +/* The bits which aren't usefully expanded as rtl. */ +const char * +thumb_unexpanded_epilogue (void) +{ + arm_stack_offsets *offsets; + int regno; + unsigned long live_regs_mask = 0; + int high_regs_pushed = 0; + int extra_pop; + int had_to_push_lr; + int size; + + if (cfun->machine->return_used_this_function != 0) + return ""; + + if (IS_NAKED (arm_current_func_type ())) + return ""; + + offsets = arm_get_frame_offsets (); + live_regs_mask = offsets->saved_regs_mask; + high_regs_pushed = bit_count (live_regs_mask & 0x0f00); + + /* If we can deduce the registers used from the function's return value. + This is more reliable that examining df_regs_ever_live_p () because that + will be set if the register is ever used in the function, not just if + the register is used to hold a return value. */ + size = arm_size_return_regs (); + + extra_pop = thumb1_extra_regs_pushed (offsets, false); + if (extra_pop > 0) + { + unsigned long extra_mask = (1 << extra_pop) - 1; + live_regs_mask |= extra_mask << ARM_NUM_INTS (size); + } + + /* The prolog may have pushed some high registers to use as + work registers. e.g. the testsuite file: + gcc/testsuite/gcc/gcc.c-torture/execute/complex-2.c + compiles to produce: + push {r4, r5, r6, r7, lr} + mov r7, r9 + mov r6, r8 + push {r6, r7} + as part of the prolog. We have to undo that pushing here. */ + + if (high_regs_pushed) + { + unsigned long mask = live_regs_mask & 0xff; + int next_hi_reg; + + /* The available low registers depend on the size of the value we are + returning. */ + if (size <= 12) + mask |= 1 << 3; + if (size <= 8) + mask |= 1 << 2; + + if (mask == 0) + /* Oh dear! We have no low registers into which we can pop + high registers! */ + internal_error + ("no low registers available for popping high registers"); + + for (next_hi_reg = 8; next_hi_reg < 13; next_hi_reg++) + if (live_regs_mask & (1 << next_hi_reg)) + break; + + while (high_regs_pushed) + { + /* Find lo register(s) into which the high register(s) can + be popped. */ + for (regno = 0; regno <= LAST_LO_REGNUM; regno++) + { + if (mask & (1 << regno)) + high_regs_pushed--; + if (high_regs_pushed == 0) + break; + } + + mask &= (2 << regno) - 1; /* A noop if regno == 8 */ + + /* Pop the values into the low register(s). */ + thumb_pushpop (asm_out_file, mask, 0, NULL, mask); + + /* Move the value(s) into the high registers. */ + for (regno = 0; regno <= LAST_LO_REGNUM; regno++) + { + if (mask & (1 << regno)) + { + asm_fprintf (asm_out_file, "\tmov\t%r, %r\n", next_hi_reg, + regno); + + for (next_hi_reg++; next_hi_reg < 13; next_hi_reg++) + if (live_regs_mask & (1 << next_hi_reg)) + break; + } + } + } + live_regs_mask &= ~0x0f00; + } + + had_to_push_lr = (live_regs_mask & (1 << LR_REGNUM)) != 0; + live_regs_mask &= 0xff; + + if (crtl->args.pretend_args_size == 0 || TARGET_BACKTRACE) + { + /* Pop the return address into the PC. */ + if (had_to_push_lr) + live_regs_mask |= 1 << PC_REGNUM; + + /* Either no argument registers were pushed or a backtrace + structure was created which includes an adjusted stack + pointer, so just pop everything. */ + if (live_regs_mask) + thumb_pushpop (asm_out_file, live_regs_mask, FALSE, NULL, + live_regs_mask); + + /* We have either just popped the return address into the + PC or it is was kept in LR for the entire function. + Note that thumb_pushpop has already called thumb_exit if the + PC was in the list. */ + if (!had_to_push_lr) + thumb_exit (asm_out_file, LR_REGNUM); + } + else + { + /* Pop everything but the return address. */ + if (live_regs_mask) + thumb_pushpop (asm_out_file, live_regs_mask, FALSE, NULL, + live_regs_mask); + + if (had_to_push_lr) + { + if (size > 12) + { + /* We have no free low regs, so save one. */ + asm_fprintf (asm_out_file, "\tmov\t%r, %r\n", IP_REGNUM, + LAST_ARG_REGNUM); + } + + /* Get the return address into a temporary register. */ + thumb_pushpop (asm_out_file, 1 << LAST_ARG_REGNUM, 0, NULL, + 1 << LAST_ARG_REGNUM); + + if (size > 12) + { + /* Move the return address to lr. */ + asm_fprintf (asm_out_file, "\tmov\t%r, %r\n", LR_REGNUM, + LAST_ARG_REGNUM); + /* Restore the low register. */ + asm_fprintf (asm_out_file, "\tmov\t%r, %r\n", LAST_ARG_REGNUM, + IP_REGNUM); + regno = LR_REGNUM; + } + else + regno = LAST_ARG_REGNUM; + } + else + regno = LR_REGNUM; + + /* Remove the argument registers that were pushed onto the stack. */ + asm_fprintf (asm_out_file, "\tadd\t%r, %r, #%d\n", + SP_REGNUM, SP_REGNUM, + crtl->args.pretend_args_size); + + thumb_exit (asm_out_file, regno); + } + + return ""; +} + +/* Functions to save and restore machine-specific function data. */ +static struct machine_function * +arm_init_machine_status (void) +{ + struct machine_function *machine; + machine = ggc_alloc_cleared_machine_function (); + +#if ARM_FT_UNKNOWN != 0 + machine->func_type = ARM_FT_UNKNOWN; +#endif + return machine; +} + +/* Return an RTX indicating where the return address to the + calling function can be found. */ +rtx +arm_return_addr (int count, rtx frame ATTRIBUTE_UNUSED) +{ + if (count != 0) + return NULL_RTX; + + return get_hard_reg_initial_val (Pmode, LR_REGNUM); +} + +/* Do anything needed before RTL is emitted for each function. */ +void +arm_init_expanders (void) +{ + /* Arrange to initialize and mark the machine per-function status. */ + init_machine_status = arm_init_machine_status; + + /* This is to stop the combine pass optimizing away the alignment + adjustment of va_arg. */ + /* ??? It is claimed that this should not be necessary. */ + if (cfun) + mark_reg_pointer (arg_pointer_rtx, PARM_BOUNDARY); +} + + +/* Like arm_compute_initial_elimination offset. Simpler because there + isn't an ABI specified frame pointer for Thumb. Instead, we set it + to point at the base of the local variables after static stack + space for a function has been allocated. */ + +HOST_WIDE_INT +thumb_compute_initial_elimination_offset (unsigned int from, unsigned int to) +{ + arm_stack_offsets *offsets; + + offsets = arm_get_frame_offsets (); + + switch (from) + { + case ARG_POINTER_REGNUM: + switch (to) + { + case STACK_POINTER_REGNUM: + return offsets->outgoing_args - offsets->saved_args; + + case FRAME_POINTER_REGNUM: + return offsets->soft_frame - offsets->saved_args; + + case ARM_HARD_FRAME_POINTER_REGNUM: + return offsets->saved_regs - offsets->saved_args; + + case THUMB_HARD_FRAME_POINTER_REGNUM: + return offsets->locals_base - offsets->saved_args; + + default: + gcc_unreachable (); + } + break; + + case FRAME_POINTER_REGNUM: + switch (to) + { + case STACK_POINTER_REGNUM: + return offsets->outgoing_args - offsets->soft_frame; + + case ARM_HARD_FRAME_POINTER_REGNUM: + return offsets->saved_regs - offsets->soft_frame; + + case THUMB_HARD_FRAME_POINTER_REGNUM: + return offsets->locals_base - offsets->soft_frame; + + default: + gcc_unreachable (); + } + break; + + default: + gcc_unreachable (); + } +} + +/* Generate the rest of a function's prologue. */ +void +thumb1_expand_prologue (void) +{ + rtx insn, dwarf; + + HOST_WIDE_INT amount; + arm_stack_offsets *offsets; + unsigned long func_type; + int regno; + unsigned long live_regs_mask; + + func_type = arm_current_func_type (); + + /* Naked functions don't have prologues. */ + if (IS_NAKED (func_type)) + return; + + if (IS_INTERRUPT (func_type)) + { + error ("interrupt Service Routines cannot be coded in Thumb mode"); + return; + } + + offsets = arm_get_frame_offsets (); + live_regs_mask = offsets->saved_regs_mask; + /* Load the pic register before setting the frame pointer, + so we can use r7 as a temporary work register. */ + if (flag_pic && arm_pic_register != INVALID_REGNUM) + arm_load_pic_register (live_regs_mask); + + if (!frame_pointer_needed && CALLER_INTERWORKING_SLOT_SIZE > 0) + emit_move_insn (gen_rtx_REG (Pmode, ARM_HARD_FRAME_POINTER_REGNUM), + stack_pointer_rtx); + + if (flag_stack_usage) + current_function_static_stack_size + = offsets->outgoing_args - offsets->saved_args; + + amount = offsets->outgoing_args - offsets->saved_regs; + amount -= 4 * thumb1_extra_regs_pushed (offsets, true); + if (amount) + { + if (amount < 512) + { + insn = emit_insn (gen_addsi3 (stack_pointer_rtx, stack_pointer_rtx, + GEN_INT (- amount))); + RTX_FRAME_RELATED_P (insn) = 1; + } + else + { + rtx reg; + + /* The stack decrement is too big for an immediate value in a single + insn. In theory we could issue multiple subtracts, but after + three of them it becomes more space efficient to place the full + value in the constant pool and load into a register. (Also the + ARM debugger really likes to see only one stack decrement per + function). So instead we look for a scratch register into which + we can load the decrement, and then we subtract this from the + stack pointer. Unfortunately on the thumb the only available + scratch registers are the argument registers, and we cannot use + these as they may hold arguments to the function. Instead we + attempt to locate a call preserved register which is used by this + function. If we can find one, then we know that it will have + been pushed at the start of the prologue and so we can corrupt + it now. */ + for (regno = LAST_ARG_REGNUM + 1; regno <= LAST_LO_REGNUM; regno++) + if (live_regs_mask & (1 << regno)) + break; + + gcc_assert(regno <= LAST_LO_REGNUM); + + reg = gen_rtx_REG (SImode, regno); + + emit_insn (gen_movsi (reg, GEN_INT (- amount))); + + insn = emit_insn (gen_addsi3 (stack_pointer_rtx, + stack_pointer_rtx, reg)); + RTX_FRAME_RELATED_P (insn) = 1; + dwarf = gen_rtx_SET (VOIDmode, stack_pointer_rtx, + plus_constant (stack_pointer_rtx, + -amount)); + RTX_FRAME_RELATED_P (dwarf) = 1; + add_reg_note (insn, REG_FRAME_RELATED_EXPR, dwarf); + } + } + + if (frame_pointer_needed) + thumb_set_frame_pointer (offsets); + + /* If we are profiling, make sure no instructions are scheduled before + the call to mcount. Similarly if the user has requested no + scheduling in the prolog. Similarly if we want non-call exceptions + using the EABI unwinder, to prevent faulting instructions from being + swapped with a stack adjustment. */ + if (crtl->profile || !TARGET_SCHED_PROLOG + || (arm_except_unwind_info (&global_options) == UI_TARGET + && cfun->can_throw_non_call_exceptions)) + emit_insn (gen_blockage ()); + + cfun->machine->lr_save_eliminated = !thumb_force_lr_save (); + if (live_regs_mask & 0xff) + cfun->machine->lr_save_eliminated = 0; +} + + +void +thumb1_expand_epilogue (void) +{ + HOST_WIDE_INT amount; + arm_stack_offsets *offsets; + int regno; + + /* Naked functions don't have prologues. */ + if (IS_NAKED (arm_current_func_type ())) + return; + + offsets = arm_get_frame_offsets (); + amount = offsets->outgoing_args - offsets->saved_regs; + + if (frame_pointer_needed) + { + emit_insn (gen_movsi (stack_pointer_rtx, hard_frame_pointer_rtx)); + amount = offsets->locals_base - offsets->saved_regs; + } + amount -= 4 * thumb1_extra_regs_pushed (offsets, false); + + gcc_assert (amount >= 0); + if (amount) + { + emit_insn (gen_blockage ()); + + if (amount < 512) + emit_insn (gen_addsi3 (stack_pointer_rtx, stack_pointer_rtx, + GEN_INT (amount))); + else + { + /* r3 is always free in the epilogue. */ + rtx reg = gen_rtx_REG (SImode, LAST_ARG_REGNUM); + + emit_insn (gen_movsi (reg, GEN_INT (amount))); + emit_insn (gen_addsi3 (stack_pointer_rtx, stack_pointer_rtx, reg)); + } + } + + /* Emit a USE (stack_pointer_rtx), so that + the stack adjustment will not be deleted. */ + emit_insn (gen_prologue_use (stack_pointer_rtx)); + + if (crtl->profile || !TARGET_SCHED_PROLOG) + emit_insn (gen_blockage ()); + + /* Emit a clobber for each insn that will be restored in the epilogue, + so that flow2 will get register lifetimes correct. */ + for (regno = 0; regno < 13; regno++) + if (df_regs_ever_live_p (regno) && !call_used_regs[regno]) + emit_clobber (gen_rtx_REG (SImode, regno)); + + if (! df_regs_ever_live_p (LR_REGNUM)) + emit_use (gen_rtx_REG (SImode, LR_REGNUM)); +} + +static void +thumb1_output_function_prologue (FILE *f, HOST_WIDE_INT size ATTRIBUTE_UNUSED) +{ + arm_stack_offsets *offsets; + unsigned long live_regs_mask = 0; + unsigned long l_mask; + unsigned high_regs_pushed = 0; + int cfa_offset = 0; + int regno; + + if (IS_NAKED (arm_current_func_type ())) + return; + + if (is_called_in_ARM_mode (current_function_decl)) + { + const char * name; + + gcc_assert (GET_CODE (DECL_RTL (current_function_decl)) == MEM); + gcc_assert (GET_CODE (XEXP (DECL_RTL (current_function_decl), 0)) + == SYMBOL_REF); + name = XSTR (XEXP (DECL_RTL (current_function_decl), 0), 0); + + /* Generate code sequence to switch us into Thumb mode. */ + /* The .code 32 directive has already been emitted by + ASM_DECLARE_FUNCTION_NAME. */ + asm_fprintf (f, "\torr\t%r, %r, #1\n", IP_REGNUM, PC_REGNUM); + asm_fprintf (f, "\tbx\t%r\n", IP_REGNUM); + + /* Generate a label, so that the debugger will notice the + change in instruction sets. This label is also used by + the assembler to bypass the ARM code when this function + is called from a Thumb encoded function elsewhere in the + same file. Hence the definition of STUB_NAME here must + agree with the definition in gas/config/tc-arm.c. */ + +#define STUB_NAME ".real_start_of" + + fprintf (f, "\t.code\t16\n"); +#ifdef ARM_PE + if (arm_dllexport_name_p (name)) + name = arm_strip_name_encoding (name); +#endif + asm_fprintf (f, "\t.globl %s%U%s\n", STUB_NAME, name); + fprintf (f, "\t.thumb_func\n"); + asm_fprintf (f, "%s%U%s:\n", STUB_NAME, name); + } + + if (crtl->args.pretend_args_size) + { + /* Output unwind directive for the stack adjustment. */ + if (arm_except_unwind_info (&global_options) == UI_TARGET) + fprintf (f, "\t.pad #%d\n", + crtl->args.pretend_args_size); + + if (cfun->machine->uses_anonymous_args) + { + int num_pushes; + + fprintf (f, "\tpush\t{"); + + num_pushes = ARM_NUM_INTS (crtl->args.pretend_args_size); + + for (regno = LAST_ARG_REGNUM + 1 - num_pushes; + regno <= LAST_ARG_REGNUM; + regno++) + asm_fprintf (f, "%r%s", regno, + regno == LAST_ARG_REGNUM ? "" : ", "); + + fprintf (f, "}\n"); + } + else + asm_fprintf (f, "\tsub\t%r, %r, #%d\n", + SP_REGNUM, SP_REGNUM, + crtl->args.pretend_args_size); + + /* We don't need to record the stores for unwinding (would it + help the debugger any if we did?), but record the change in + the stack pointer. */ + if (dwarf2out_do_frame ()) + { + char *l = dwarf2out_cfi_label (false); + + cfa_offset = cfa_offset + crtl->args.pretend_args_size; + dwarf2out_def_cfa (l, SP_REGNUM, cfa_offset); + } + } + + /* Get the registers we are going to push. */ + offsets = arm_get_frame_offsets (); + live_regs_mask = offsets->saved_regs_mask; + /* Extract a mask of the ones we can give to the Thumb's push instruction. */ + l_mask = live_regs_mask & 0x40ff; + /* Then count how many other high registers will need to be pushed. */ + high_regs_pushed = bit_count (live_regs_mask & 0x0f00); + + if (TARGET_BACKTRACE) + { + unsigned offset; + unsigned work_register; + + /* We have been asked to create a stack backtrace structure. + The code looks like this: + + 0 .align 2 + 0 func: + 0 sub SP, #16 Reserve space for 4 registers. + 2 push {R7} Push low registers. + 4 add R7, SP, #20 Get the stack pointer before the push. + 6 str R7, [SP, #8] Store the stack pointer (before reserving the space). + 8 mov R7, PC Get hold of the start of this code plus 12. + 10 str R7, [SP, #16] Store it. + 12 mov R7, FP Get hold of the current frame pointer. + 14 str R7, [SP, #4] Store it. + 16 mov R7, LR Get hold of the current return address. + 18 str R7, [SP, #12] Store it. + 20 add R7, SP, #16 Point at the start of the backtrace structure. + 22 mov FP, R7 Put this value into the frame pointer. */ + + work_register = thumb_find_work_register (live_regs_mask); + + if (arm_except_unwind_info (&global_options) == UI_TARGET) + asm_fprintf (f, "\t.pad #16\n"); + + asm_fprintf + (f, "\tsub\t%r, %r, #16\t%@ Create stack backtrace structure\n", + SP_REGNUM, SP_REGNUM); + + if (dwarf2out_do_frame ()) + { + char *l = dwarf2out_cfi_label (false); + + cfa_offset = cfa_offset + 16; + dwarf2out_def_cfa (l, SP_REGNUM, cfa_offset); + } + + if (l_mask) + { + thumb_pushpop (f, l_mask, 1, &cfa_offset, l_mask); + offset = bit_count (l_mask) * UNITS_PER_WORD; + } + else + offset = 0; + + asm_fprintf (f, "\tadd\t%r, %r, #%d\n", work_register, SP_REGNUM, + offset + 16 + crtl->args.pretend_args_size); + + asm_fprintf (f, "\tstr\t%r, [%r, #%d]\n", work_register, SP_REGNUM, + offset + 4); + + /* Make sure that the instruction fetching the PC is in the right place + to calculate "start of backtrace creation code + 12". */ + if (l_mask) + { + asm_fprintf (f, "\tmov\t%r, %r\n", work_register, PC_REGNUM); + asm_fprintf (f, "\tstr\t%r, [%r, #%d]\n", work_register, SP_REGNUM, + offset + 12); + asm_fprintf (f, "\tmov\t%r, %r\n", work_register, + ARM_HARD_FRAME_POINTER_REGNUM); + asm_fprintf (f, "\tstr\t%r, [%r, #%d]\n", work_register, SP_REGNUM, + offset); + } + else + { + asm_fprintf (f, "\tmov\t%r, %r\n", work_register, + ARM_HARD_FRAME_POINTER_REGNUM); + asm_fprintf (f, "\tstr\t%r, [%r, #%d]\n", work_register, SP_REGNUM, + offset); + asm_fprintf (f, "\tmov\t%r, %r\n", work_register, PC_REGNUM); + asm_fprintf (f, "\tstr\t%r, [%r, #%d]\n", work_register, SP_REGNUM, + offset + 12); + } + + asm_fprintf (f, "\tmov\t%r, %r\n", work_register, LR_REGNUM); + asm_fprintf (f, "\tstr\t%r, [%r, #%d]\n", work_register, SP_REGNUM, + offset + 8); + asm_fprintf (f, "\tadd\t%r, %r, #%d\n", work_register, SP_REGNUM, + offset + 12); + asm_fprintf (f, "\tmov\t%r, %r\t\t%@ Backtrace structure created\n", + ARM_HARD_FRAME_POINTER_REGNUM, work_register); + } + /* Optimization: If we are not pushing any low registers but we are going + to push some high registers then delay our first push. This will just + be a push of LR and we can combine it with the push of the first high + register. */ + else if ((l_mask & 0xff) != 0 + || (high_regs_pushed == 0 && l_mask)) + { + unsigned long mask = l_mask; + mask |= (1 << thumb1_extra_regs_pushed (offsets, true)) - 1; + thumb_pushpop (f, mask, 1, &cfa_offset, mask); + } + + if (high_regs_pushed) + { + unsigned pushable_regs; + unsigned next_hi_reg; + + for (next_hi_reg = 12; next_hi_reg > LAST_LO_REGNUM; next_hi_reg--) + if (live_regs_mask & (1 << next_hi_reg)) + break; + + pushable_regs = l_mask & 0xff; + + if (pushable_regs == 0) + pushable_regs = 1 << thumb_find_work_register (live_regs_mask); + + while (high_regs_pushed > 0) + { + unsigned long real_regs_mask = 0; + + for (regno = LAST_LO_REGNUM; regno >= 0; regno --) + { + if (pushable_regs & (1 << regno)) + { + asm_fprintf (f, "\tmov\t%r, %r\n", regno, next_hi_reg); + + high_regs_pushed --; + real_regs_mask |= (1 << next_hi_reg); + + if (high_regs_pushed) + { + for (next_hi_reg --; next_hi_reg > LAST_LO_REGNUM; + next_hi_reg --) + if (live_regs_mask & (1 << next_hi_reg)) + break; + } + else + { + pushable_regs &= ~((1 << regno) - 1); + break; + } + } + } + + /* If we had to find a work register and we have not yet + saved the LR then add it to the list of regs to push. */ + if (l_mask == (1 << LR_REGNUM)) + { + thumb_pushpop (f, pushable_regs | (1 << LR_REGNUM), + 1, &cfa_offset, + real_regs_mask | (1 << LR_REGNUM)); + l_mask = 0; + } + else + thumb_pushpop (f, pushable_regs, 1, &cfa_offset, real_regs_mask); + } + } +} + +/* Handle the case of a double word load into a low register from + a computed memory address. The computed address may involve a + register which is overwritten by the load. */ +const char * +thumb_load_double_from_address (rtx *operands) +{ + rtx addr; + rtx base; + rtx offset; + rtx arg1; + rtx arg2; + + gcc_assert (GET_CODE (operands[0]) == REG); + gcc_assert (GET_CODE (operands[1]) == MEM); + + /* Get the memory address. */ + addr = XEXP (operands[1], 0); + + /* Work out how the memory address is computed. */ + switch (GET_CODE (addr)) + { + case REG: + operands[2] = adjust_address (operands[1], SImode, 4); + + if (REGNO (operands[0]) == REGNO (addr)) + { + output_asm_insn ("ldr\t%H0, %2", operands); + output_asm_insn ("ldr\t%0, %1", operands); + } + else + { + output_asm_insn ("ldr\t%0, %1", operands); + output_asm_insn ("ldr\t%H0, %2", operands); + } + break; + + case CONST: + /* Compute
+ 4 for the high order load. */ + operands[2] = adjust_address (operands[1], SImode, 4); + + output_asm_insn ("ldr\t%0, %1", operands); + output_asm_insn ("ldr\t%H0, %2", operands); + break; + + case PLUS: + arg1 = XEXP (addr, 0); + arg2 = XEXP (addr, 1); + + if (CONSTANT_P (arg1)) + base = arg2, offset = arg1; + else + base = arg1, offset = arg2; + + gcc_assert (GET_CODE (base) == REG); + + /* Catch the case of
= + */ + if (GET_CODE (offset) == REG) + { + int reg_offset = REGNO (offset); + int reg_base = REGNO (base); + int reg_dest = REGNO (operands[0]); + + /* Add the base and offset registers together into the + higher destination register. */ + asm_fprintf (asm_out_file, "\tadd\t%r, %r, %r", + reg_dest + 1, reg_base, reg_offset); + + /* Load the lower destination register from the address in + the higher destination register. */ + asm_fprintf (asm_out_file, "\tldr\t%r, [%r, #0]", + reg_dest, reg_dest + 1); + + /* Load the higher destination register from its own address + plus 4. */ + asm_fprintf (asm_out_file, "\tldr\t%r, [%r, #4]", + reg_dest + 1, reg_dest + 1); + } + else + { + /* Compute
+ 4 for the high order load. */ + operands[2] = adjust_address (operands[1], SImode, 4); + + /* If the computed address is held in the low order register + then load the high order register first, otherwise always + load the low order register first. */ + if (REGNO (operands[0]) == REGNO (base)) + { + output_asm_insn ("ldr\t%H0, %2", operands); + output_asm_insn ("ldr\t%0, %1", operands); + } + else + { + output_asm_insn ("ldr\t%0, %1", operands); + output_asm_insn ("ldr\t%H0, %2", operands); + } + } + break; + + case LABEL_REF: + /* With no registers to worry about we can just load the value + directly. */ + operands[2] = adjust_address (operands[1], SImode, 4); + + output_asm_insn ("ldr\t%H0, %2", operands); + output_asm_insn ("ldr\t%0, %1", operands); + break; + + default: + gcc_unreachable (); + } + + return ""; +} + +const char * +thumb_output_move_mem_multiple (int n, rtx *operands) +{ + rtx tmp; + + switch (n) + { + case 2: + if (REGNO (operands[4]) > REGNO (operands[5])) + { + tmp = operands[4]; + operands[4] = operands[5]; + operands[5] = tmp; + } + output_asm_insn ("ldmia\t%1!, {%4, %5}", operands); + output_asm_insn ("stmia\t%0!, {%4, %5}", operands); + break; + + case 3: + if (REGNO (operands[4]) > REGNO (operands[5])) + { + tmp = operands[4]; + operands[4] = operands[5]; + operands[5] = tmp; + } + if (REGNO (operands[5]) > REGNO (operands[6])) + { + tmp = operands[5]; + operands[5] = operands[6]; + operands[6] = tmp; + } + if (REGNO (operands[4]) > REGNO (operands[5])) + { + tmp = operands[4]; + operands[4] = operands[5]; + operands[5] = tmp; + } + + output_asm_insn ("ldmia\t%1!, {%4, %5, %6}", operands); + output_asm_insn ("stmia\t%0!, {%4, %5, %6}", operands); + break; + + default: + gcc_unreachable (); + } + + return ""; +} + +/* Output a call-via instruction for thumb state. */ +const char * +thumb_call_via_reg (rtx reg) +{ + int regno = REGNO (reg); + rtx *labelp; + + gcc_assert (regno < LR_REGNUM); + + /* If we are in the normal text section we can use a single instance + per compilation unit. If we are doing function sections, then we need + an entry per section, since we can't rely on reachability. */ + if (in_section == text_section) + { + thumb_call_reg_needed = 1; + + if (thumb_call_via_label[regno] == NULL) + thumb_call_via_label[regno] = gen_label_rtx (); + labelp = thumb_call_via_label + regno; + } + else + { + if (cfun->machine->call_via[regno] == NULL) + cfun->machine->call_via[regno] = gen_label_rtx (); + labelp = cfun->machine->call_via + regno; + } + + output_asm_insn ("bl\t%a0", labelp); + return ""; +} + +/* Routines for generating rtl. */ +void +thumb_expand_movmemqi (rtx *operands) +{ + rtx out = copy_to_mode_reg (SImode, XEXP (operands[0], 0)); + rtx in = copy_to_mode_reg (SImode, XEXP (operands[1], 0)); + HOST_WIDE_INT len = INTVAL (operands[2]); + HOST_WIDE_INT offset = 0; + + while (len >= 12) + { + emit_insn (gen_movmem12b (out, in, out, in)); + len -= 12; + } + + if (len >= 8) + { + emit_insn (gen_movmem8b (out, in, out, in)); + len -= 8; + } + + if (len >= 4) + { + rtx reg = gen_reg_rtx (SImode); + emit_insn (gen_movsi (reg, gen_rtx_MEM (SImode, in))); + emit_insn (gen_movsi (gen_rtx_MEM (SImode, out), reg)); + len -= 4; + offset += 4; + } + + if (len >= 2) + { + rtx reg = gen_reg_rtx (HImode); + emit_insn (gen_movhi (reg, gen_rtx_MEM (HImode, + plus_constant (in, offset)))); + emit_insn (gen_movhi (gen_rtx_MEM (HImode, plus_constant (out, offset)), + reg)); + len -= 2; + offset += 2; + } + + if (len) + { + rtx reg = gen_reg_rtx (QImode); + emit_insn (gen_movqi (reg, gen_rtx_MEM (QImode, + plus_constant (in, offset)))); + emit_insn (gen_movqi (gen_rtx_MEM (QImode, plus_constant (out, offset)), + reg)); + } +} + +void +thumb_reload_out_hi (rtx *operands) +{ + emit_insn (gen_thumb_movhi_clobber (operands[0], operands[1], operands[2])); +} + +/* Handle reading a half-word from memory during reload. */ +void +thumb_reload_in_hi (rtx *operands ATTRIBUTE_UNUSED) +{ + gcc_unreachable (); +} + +/* Return the length of a function name prefix + that starts with the character 'c'. */ +static int +arm_get_strip_length (int c) +{ + switch (c) + { + ARM_NAME_ENCODING_LENGTHS + default: return 0; + } +} + +/* Return a pointer to a function's name with any + and all prefix encodings stripped from it. */ +const char * +arm_strip_name_encoding (const char *name) +{ + int skip; + + while ((skip = arm_get_strip_length (* name))) + name += skip; + + return name; +} + +/* If there is a '*' anywhere in the name's prefix, then + emit the stripped name verbatim, otherwise prepend an + underscore if leading underscores are being used. */ +void +arm_asm_output_labelref (FILE *stream, const char *name) +{ + int skip; + int verbatim = 0; + + while ((skip = arm_get_strip_length (* name))) + { + verbatim |= (*name == '*'); + name += skip; + } + + if (verbatim) + fputs (name, stream); + else + asm_fprintf (stream, "%U%s", name); +} + +static void +arm_file_start (void) +{ + int val; + + if (TARGET_UNIFIED_ASM) + asm_fprintf (asm_out_file, "\t.syntax unified\n"); + + if (TARGET_BPABI) + { + const char *fpu_name; + if (arm_selected_arch) + asm_fprintf (asm_out_file, "\t.arch %s\n", arm_selected_arch->name); + else + asm_fprintf (asm_out_file, "\t.cpu %s\n", arm_selected_cpu->name); + + if (TARGET_SOFT_FLOAT) + { + if (TARGET_VFP) + fpu_name = "softvfp"; + else + fpu_name = "softfpa"; + } + else + { + fpu_name = arm_fpu_desc->name; + if (arm_fpu_desc->model == ARM_FP_MODEL_VFP) + { + if (TARGET_HARD_FLOAT) + asm_fprintf (asm_out_file, "\t.eabi_attribute 27, 3\n"); + if (TARGET_HARD_FLOAT_ABI) + asm_fprintf (asm_out_file, "\t.eabi_attribute 28, 1\n"); + } + } + asm_fprintf (asm_out_file, "\t.fpu %s\n", fpu_name); + + /* Some of these attributes only apply when the corresponding features + are used. However we don't have any easy way of figuring this out. + Conservatively record the setting that would have been used. */ + + /* Tag_ABI_FP_rounding. */ + if (flag_rounding_math) + asm_fprintf (asm_out_file, "\t.eabi_attribute 19, 1\n"); + if (!flag_unsafe_math_optimizations) + { + /* Tag_ABI_FP_denomal. */ + asm_fprintf (asm_out_file, "\t.eabi_attribute 20, 1\n"); + /* Tag_ABI_FP_exceptions. */ + asm_fprintf (asm_out_file, "\t.eabi_attribute 21, 1\n"); + } + /* Tag_ABI_FP_user_exceptions. */ + if (flag_signaling_nans) + asm_fprintf (asm_out_file, "\t.eabi_attribute 22, 1\n"); + /* Tag_ABI_FP_number_model. */ + asm_fprintf (asm_out_file, "\t.eabi_attribute 23, %d\n", + flag_finite_math_only ? 1 : 3); + + /* Tag_ABI_align8_needed. */ + asm_fprintf (asm_out_file, "\t.eabi_attribute 24, 1\n"); + /* Tag_ABI_align8_preserved. */ + asm_fprintf (asm_out_file, "\t.eabi_attribute 25, 1\n"); + /* Tag_ABI_enum_size. */ + asm_fprintf (asm_out_file, "\t.eabi_attribute 26, %d\n", + flag_short_enums ? 1 : 2); + + /* Tag_ABI_optimization_goals. */ + if (optimize_size) + val = 4; + else if (optimize >= 2) + val = 2; + else if (optimize) + val = 1; + else + val = 6; + asm_fprintf (asm_out_file, "\t.eabi_attribute 30, %d\n", val); + + /* Tag_ABI_FP_16bit_format. */ + if (arm_fp16_format) + asm_fprintf (asm_out_file, "\t.eabi_attribute 38, %d\n", + (int)arm_fp16_format); + + if (arm_lang_output_object_attributes_hook) + arm_lang_output_object_attributes_hook(); + } + default_file_start(); +} + +static void +arm_file_end (void) +{ + int regno; + + if (NEED_INDICATE_EXEC_STACK) + /* Add .note.GNU-stack. */ + file_end_indicate_exec_stack (); + + if (! thumb_call_reg_needed) + return; + + switch_to_section (text_section); + asm_fprintf (asm_out_file, "\t.code 16\n"); + ASM_OUTPUT_ALIGN (asm_out_file, 1); + + for (regno = 0; regno < LR_REGNUM; regno++) + { + rtx label = thumb_call_via_label[regno]; + + if (label != 0) + { + targetm.asm_out.internal_label (asm_out_file, "L", + CODE_LABEL_NUMBER (label)); + asm_fprintf (asm_out_file, "\tbx\t%r\n", regno); + } + } +} + +#ifndef ARM_PE +/* Symbols in the text segment can be accessed without indirecting via the + constant pool; it may take an extra binary operation, but this is still + faster than indirecting via memory. Don't do this when not optimizing, + since we won't be calculating al of the offsets necessary to do this + simplification. */ + +static void +arm_encode_section_info (tree decl, rtx rtl, int first) +{ + if (optimize > 0 && TREE_CONSTANT (decl)) + SYMBOL_REF_FLAG (XEXP (rtl, 0)) = 1; + + default_encode_section_info (decl, rtl, first); +} +#endif /* !ARM_PE */ + +static void +arm_internal_label (FILE *stream, const char *prefix, unsigned long labelno) +{ + if (arm_ccfsm_state == 3 && (unsigned) arm_target_label == labelno + && !strcmp (prefix, "L")) + { + arm_ccfsm_state = 0; + arm_target_insn = NULL; + } + default_internal_label (stream, prefix, labelno); +} + +/* Output code to add DELTA to the first argument, and then jump + to FUNCTION. Used for C++ multiple inheritance. */ +static void +arm_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED, + HOST_WIDE_INT delta, + HOST_WIDE_INT vcall_offset ATTRIBUTE_UNUSED, + tree function) +{ + static int thunk_label = 0; + char label[256]; + char labelpc[256]; + int mi_delta = delta; + const char *const mi_op = mi_delta < 0 ? "sub" : "add"; + int shift = 0; + int this_regno = (aggregate_value_p (TREE_TYPE (TREE_TYPE (function)), function) + ? 1 : 0); + if (mi_delta < 0) + mi_delta = - mi_delta; + + if (TARGET_THUMB1) + { + int labelno = thunk_label++; + ASM_GENERATE_INTERNAL_LABEL (label, "LTHUMBFUNC", labelno); + /* Thunks are entered in arm mode when avaiable. */ + if (TARGET_THUMB1_ONLY) + { + /* push r3 so we can use it as a temporary. */ + /* TODO: Omit this save if r3 is not used. */ + fputs ("\tpush {r3}\n", file); + fputs ("\tldr\tr3, ", file); + } + else + { + fputs ("\tldr\tr12, ", file); + } + assemble_name (file, label); + fputc ('\n', file); + if (flag_pic) + { + /* If we are generating PIC, the ldr instruction below loads + "(target - 7) - .LTHUNKPCn" into r12. The pc reads as + the address of the add + 8, so we have: + + r12 = (target - 7) - .LTHUNKPCn + (.LTHUNKPCn + 8) + = target + 1. + + Note that we have "+ 1" because some versions of GNU ld + don't set the low bit of the result for R_ARM_REL32 + relocations against thumb function symbols. + On ARMv6M this is +4, not +8. */ + ASM_GENERATE_INTERNAL_LABEL (labelpc, "LTHUNKPC", labelno); + assemble_name (file, labelpc); + fputs (":\n", file); + if (TARGET_THUMB1_ONLY) + { + /* This is 2 insns after the start of the thunk, so we know it + is 4-byte aligned. */ + fputs ("\tadd\tr3, pc, r3\n", file); + fputs ("\tmov r12, r3\n", file); + } + else + fputs ("\tadd\tr12, pc, r12\n", file); + } + else if (TARGET_THUMB1_ONLY) + fputs ("\tmov r12, r3\n", file); + } + if (TARGET_THUMB1_ONLY) + { + if (mi_delta > 255) + { + fputs ("\tldr\tr3, ", file); + assemble_name (file, label); + fputs ("+4\n", file); + asm_fprintf (file, "\t%s\t%r, %r, r3\n", + mi_op, this_regno, this_regno); + } + else if (mi_delta != 0) + { + asm_fprintf (file, "\t%s\t%r, %r, #%d\n", + mi_op, this_regno, this_regno, + mi_delta); + } + } + else + { + /* TODO: Use movw/movt for large constants when available. */ + while (mi_delta != 0) + { + if ((mi_delta & (3 << shift)) == 0) + shift += 2; + else + { + asm_fprintf (file, "\t%s\t%r, %r, #%d\n", + mi_op, this_regno, this_regno, + mi_delta & (0xff << shift)); + mi_delta &= ~(0xff << shift); + shift += 8; + } + } + } + if (TARGET_THUMB1) + { + if (TARGET_THUMB1_ONLY) + fputs ("\tpop\t{r3}\n", file); + + fprintf (file, "\tbx\tr12\n"); + ASM_OUTPUT_ALIGN (file, 2); + assemble_name (file, label); + fputs (":\n", file); + if (flag_pic) + { + /* Output ".word .LTHUNKn-7-.LTHUNKPCn". */ + rtx tem = XEXP (DECL_RTL (function), 0); + tem = gen_rtx_PLUS (GET_MODE (tem), tem, GEN_INT (-7)); + tem = gen_rtx_MINUS (GET_MODE (tem), + tem, + gen_rtx_SYMBOL_REF (Pmode, + ggc_strdup (labelpc))); + assemble_integer (tem, 4, BITS_PER_WORD, 1); + } + else + /* Output ".word .LTHUNKn". */ + assemble_integer (XEXP (DECL_RTL (function), 0), 4, BITS_PER_WORD, 1); + + if (TARGET_THUMB1_ONLY && mi_delta > 255) + assemble_integer (GEN_INT(mi_delta), 4, BITS_PER_WORD, 1); + } + else + { + fputs ("\tb\t", file); + assemble_name (file, XSTR (XEXP (DECL_RTL (function), 0), 0)); + if (NEED_PLT_RELOC) + fputs ("(PLT)", file); + fputc ('\n', file); + } +} + +int +arm_emit_vector_const (FILE *file, rtx x) +{ + int i; + const char * pattern; + + gcc_assert (GET_CODE (x) == CONST_VECTOR); + + switch (GET_MODE (x)) + { + case V2SImode: pattern = "%08x"; break; + case V4HImode: pattern = "%04x"; break; + case V8QImode: pattern = "%02x"; break; + default: gcc_unreachable (); + } + + fprintf (file, "0x"); + for (i = CONST_VECTOR_NUNITS (x); i--;) + { + rtx element; + + element = CONST_VECTOR_ELT (x, i); + fprintf (file, pattern, INTVAL (element)); + } + + return 1; +} + +/* Emit a fp16 constant appropriately padded to occupy a 4-byte word. + HFmode constant pool entries are actually loaded with ldr. */ +void +arm_emit_fp16_const (rtx c) +{ + REAL_VALUE_TYPE r; + long bits; + + REAL_VALUE_FROM_CONST_DOUBLE (r, c); + bits = real_to_target (NULL, &r, HFmode); + if (WORDS_BIG_ENDIAN) + assemble_zeros (2); + assemble_integer (GEN_INT (bits), 2, BITS_PER_WORD, 1); + if (!WORDS_BIG_ENDIAN) + assemble_zeros (2); +} + +const char * +arm_output_load_gr (rtx *operands) +{ + rtx reg; + rtx offset; + rtx wcgr; + rtx sum; + + if (GET_CODE (operands [1]) != MEM + || GET_CODE (sum = XEXP (operands [1], 0)) != PLUS + || GET_CODE (reg = XEXP (sum, 0)) != REG + || GET_CODE (offset = XEXP (sum, 1)) != CONST_INT + || ((INTVAL (offset) < 1024) && (INTVAL (offset) > -1024))) + return "wldrw%?\t%0, %1"; + + /* Fix up an out-of-range load of a GR register. */ + output_asm_insn ("str%?\t%0, [sp, #-4]!\t@ Start of GR load expansion", & reg); + wcgr = operands[0]; + operands[0] = reg; + output_asm_insn ("ldr%?\t%0, %1", operands); + + operands[0] = wcgr; + operands[1] = reg; + output_asm_insn ("tmcr%?\t%0, %1", operands); + output_asm_insn ("ldr%?\t%0, [sp], #4\t@ End of GR load expansion", & reg); + + return ""; +} + +/* Worker function for TARGET_SETUP_INCOMING_VARARGS. + + On the ARM, PRETEND_SIZE is set in order to have the prologue push the last + named arg and all anonymous args onto the stack. + XXX I know the prologue shouldn't be pushing registers, but it is faster + that way. */ + +static void +arm_setup_incoming_varargs (CUMULATIVE_ARGS *pcum, + enum machine_mode mode, + tree type, + int *pretend_size, + int second_time ATTRIBUTE_UNUSED) +{ + int nregs; + + cfun->machine->uses_anonymous_args = 1; + if (pcum->pcs_variant <= ARM_PCS_AAPCS_LOCAL) + { + nregs = pcum->aapcs_ncrn; + if ((nregs & 1) && arm_needs_doubleword_align (mode, type)) + nregs++; + } + else + nregs = pcum->nregs; + + if (nregs < NUM_ARG_REGS) + *pretend_size = (NUM_ARG_REGS - nregs) * UNITS_PER_WORD; +} + +/* Return nonzero if the CONSUMER instruction (a store) does not need + PRODUCER's value to calculate the address. */ + +int +arm_no_early_store_addr_dep (rtx producer, rtx consumer) +{ + rtx value = PATTERN (producer); + rtx addr = PATTERN (consumer); + + if (GET_CODE (value) == COND_EXEC) + value = COND_EXEC_CODE (value); + if (GET_CODE (value) == PARALLEL) + value = XVECEXP (value, 0, 0); + value = XEXP (value, 0); + if (GET_CODE (addr) == COND_EXEC) + addr = COND_EXEC_CODE (addr); + if (GET_CODE (addr) == PARALLEL) + addr = XVECEXP (addr, 0, 0); + addr = XEXP (addr, 0); + + return !reg_overlap_mentioned_p (value, addr); +} + +/* Return nonzero if the CONSUMER instruction (a store) does need + PRODUCER's value to calculate the address. */ + +int +arm_early_store_addr_dep (rtx producer, rtx consumer) +{ + return !arm_no_early_store_addr_dep (producer, consumer); +} + +/* Return nonzero if the CONSUMER instruction (a load) does need + PRODUCER's value to calculate the address. */ + +int +arm_early_load_addr_dep (rtx producer, rtx consumer) +{ + rtx value = PATTERN (producer); + rtx addr = PATTERN (consumer); + + if (GET_CODE (value) == COND_EXEC) + value = COND_EXEC_CODE (value); + if (GET_CODE (value) == PARALLEL) + value = XVECEXP (value, 0, 0); + value = XEXP (value, 0); + if (GET_CODE (addr) == COND_EXEC) + addr = COND_EXEC_CODE (addr); + if (GET_CODE (addr) == PARALLEL) + addr = XVECEXP (addr, 0, 0); + addr = XEXP (addr, 1); + + return reg_overlap_mentioned_p (value, addr); +} + +/* Return nonzero if the CONSUMER instruction (an ALU op) does not + have an early register shift value or amount dependency on the + result of PRODUCER. */ + +int +arm_no_early_alu_shift_dep (rtx producer, rtx consumer) +{ + rtx value = PATTERN (producer); + rtx op = PATTERN (consumer); + rtx early_op; + + if (GET_CODE (value) == COND_EXEC) + value = COND_EXEC_CODE (value); + if (GET_CODE (value) == PARALLEL) + value = XVECEXP (value, 0, 0); + value = XEXP (value, 0); + if (GET_CODE (op) == COND_EXEC) + op = COND_EXEC_CODE (op); + if (GET_CODE (op) == PARALLEL) + op = XVECEXP (op, 0, 0); + op = XEXP (op, 1); + + early_op = XEXP (op, 0); + /* This is either an actual independent shift, or a shift applied to + the first operand of another operation. We want the whole shift + operation. */ + if (GET_CODE (early_op) == REG) + early_op = op; + + return !reg_overlap_mentioned_p (value, early_op); +} + +/* Return nonzero if the CONSUMER instruction (an ALU op) does not + have an early register shift value dependency on the result of + PRODUCER. */ + +int +arm_no_early_alu_shift_value_dep (rtx producer, rtx consumer) +{ + rtx value = PATTERN (producer); + rtx op = PATTERN (consumer); + rtx early_op; + + if (GET_CODE (value) == COND_EXEC) + value = COND_EXEC_CODE (value); + if (GET_CODE (value) == PARALLEL) + value = XVECEXP (value, 0, 0); + value = XEXP (value, 0); + if (GET_CODE (op) == COND_EXEC) + op = COND_EXEC_CODE (op); + if (GET_CODE (op) == PARALLEL) + op = XVECEXP (op, 0, 0); + op = XEXP (op, 1); + + early_op = XEXP (op, 0); + + /* This is either an actual independent shift, or a shift applied to + the first operand of another operation. We want the value being + shifted, in either case. */ + if (GET_CODE (early_op) != REG) + early_op = XEXP (early_op, 0); + + return !reg_overlap_mentioned_p (value, early_op); +} + +/* Return nonzero if the CONSUMER (a mul or mac op) does not + have an early register mult dependency on the result of + PRODUCER. */ + +int +arm_no_early_mul_dep (rtx producer, rtx consumer) +{ + rtx value = PATTERN (producer); + rtx op = PATTERN (consumer); + + if (GET_CODE (value) == COND_EXEC) + value = COND_EXEC_CODE (value); + if (GET_CODE (value) == PARALLEL) + value = XVECEXP (value, 0, 0); + value = XEXP (value, 0); + if (GET_CODE (op) == COND_EXEC) + op = COND_EXEC_CODE (op); + if (GET_CODE (op) == PARALLEL) + op = XVECEXP (op, 0, 0); + op = XEXP (op, 1); + + if (GET_CODE (op) == PLUS || GET_CODE (op) == MINUS) + { + if (GET_CODE (XEXP (op, 0)) == MULT) + return !reg_overlap_mentioned_p (value, XEXP (op, 0)); + else + return !reg_overlap_mentioned_p (value, XEXP (op, 1)); + } + + return 0; +} + +/* We can't rely on the caller doing the proper promotion when + using APCS or ATPCS. */ + +static bool +arm_promote_prototypes (const_tree t ATTRIBUTE_UNUSED) +{ + return !TARGET_AAPCS_BASED; +} + +static enum machine_mode +arm_promote_function_mode (const_tree type ATTRIBUTE_UNUSED, + enum machine_mode mode, + int *punsignedp ATTRIBUTE_UNUSED, + const_tree fntype ATTRIBUTE_UNUSED, + int for_return ATTRIBUTE_UNUSED) +{ + if (GET_MODE_CLASS (mode) == MODE_INT + && GET_MODE_SIZE (mode) < 4) + return SImode; + + return mode; +} + +/* AAPCS based ABIs use short enums by default. */ + +static bool +arm_default_short_enums (void) +{ + return TARGET_AAPCS_BASED && arm_abi != ARM_ABI_AAPCS_LINUX; +} + + +/* AAPCS requires that anonymous bitfields affect structure alignment. */ + +static bool +arm_align_anon_bitfield (void) +{ + return TARGET_AAPCS_BASED; +} + + +/* The generic C++ ABI says 64-bit (long long). The EABI says 32-bit. */ + +static tree +arm_cxx_guard_type (void) +{ + return TARGET_AAPCS_BASED ? integer_type_node : long_long_integer_type_node; +} + +/* Return non-zero if the consumer (a multiply-accumulate instruction) + has an accumulator dependency on the result of the producer (a + multiplication instruction) and no other dependency on that result. */ +int +arm_mac_accumulator_is_mul_result (rtx producer, rtx consumer) +{ + rtx mul = PATTERN (producer); + rtx mac = PATTERN (consumer); + rtx mul_result; + rtx mac_op0, mac_op1, mac_acc; + + if (GET_CODE (mul) == COND_EXEC) + mul = COND_EXEC_CODE (mul); + if (GET_CODE (mac) == COND_EXEC) + mac = COND_EXEC_CODE (mac); + + /* Check that mul is of the form (set (...) (mult ...)) + and mla is of the form (set (...) (plus (mult ...) (...))). */ + if ((GET_CODE (mul) != SET || GET_CODE (XEXP (mul, 1)) != MULT) + || (GET_CODE (mac) != SET || GET_CODE (XEXP (mac, 1)) != PLUS + || GET_CODE (XEXP (XEXP (mac, 1), 0)) != MULT)) + return 0; + + mul_result = XEXP (mul, 0); + mac_op0 = XEXP (XEXP (XEXP (mac, 1), 0), 0); + mac_op1 = XEXP (XEXP (XEXP (mac, 1), 0), 1); + mac_acc = XEXP (XEXP (mac, 1), 1); + + return (reg_overlap_mentioned_p (mul_result, mac_acc) + && !reg_overlap_mentioned_p (mul_result, mac_op0) + && !reg_overlap_mentioned_p (mul_result, mac_op1)); +} + + +/* The EABI says test the least significant bit of a guard variable. */ + +static bool +arm_cxx_guard_mask_bit (void) +{ + return TARGET_AAPCS_BASED; +} + + +/* The EABI specifies that all array cookies are 8 bytes long. */ + +static tree +arm_get_cookie_size (tree type) +{ + tree size; + + if (!TARGET_AAPCS_BASED) + return default_cxx_get_cookie_size (type); + + size = build_int_cst (sizetype, 8); + return size; +} + + +/* The EABI says that array cookies should also contain the element size. */ + +static bool +arm_cookie_has_size (void) +{ + return TARGET_AAPCS_BASED; +} + + +/* The EABI says constructors and destructors should return a pointer to + the object constructed/destroyed. */ + +static bool +arm_cxx_cdtor_returns_this (void) +{ + return TARGET_AAPCS_BASED; +} + +/* The EABI says that an inline function may never be the key + method. */ + +static bool +arm_cxx_key_method_may_be_inline (void) +{ + return !TARGET_AAPCS_BASED; +} + +static void +arm_cxx_determine_class_data_visibility (tree decl) +{ + if (!TARGET_AAPCS_BASED + || !TARGET_DLLIMPORT_DECL_ATTRIBUTES) + return; + + /* In general, \S 3.2.5.5 of the ARM EABI requires that class data + is exported. However, on systems without dynamic vague linkage, + \S 3.2.5.6 says that COMDAT class data has hidden linkage. */ + if (!TARGET_ARM_DYNAMIC_VAGUE_LINKAGE_P && DECL_COMDAT (decl)) + DECL_VISIBILITY (decl) = VISIBILITY_HIDDEN; + else + DECL_VISIBILITY (decl) = VISIBILITY_DEFAULT; + DECL_VISIBILITY_SPECIFIED (decl) = 1; +} + +static bool +arm_cxx_class_data_always_comdat (void) +{ + /* \S 3.2.5.4 of the ARM C++ ABI says that class data only have + vague linkage if the class has no key function. */ + return !TARGET_AAPCS_BASED; +} + + +/* The EABI says __aeabi_atexit should be used to register static + destructors. */ + +static bool +arm_cxx_use_aeabi_atexit (void) +{ + return TARGET_AAPCS_BASED; +} + + +void +arm_set_return_address (rtx source, rtx scratch) +{ + arm_stack_offsets *offsets; + HOST_WIDE_INT delta; + rtx addr; + unsigned long saved_regs; + + offsets = arm_get_frame_offsets (); + saved_regs = offsets->saved_regs_mask; + + if ((saved_regs & (1 << LR_REGNUM)) == 0) + emit_move_insn (gen_rtx_REG (Pmode, LR_REGNUM), source); + else + { + if (frame_pointer_needed) + addr = plus_constant(hard_frame_pointer_rtx, -4); + else + { + /* LR will be the first saved register. */ + delta = offsets->outgoing_args - (offsets->frame + 4); + + + if (delta >= 4096) + { + emit_insn (gen_addsi3 (scratch, stack_pointer_rtx, + GEN_INT (delta & ~4095))); + addr = scratch; + delta &= 4095; + } + else + addr = stack_pointer_rtx; + + addr = plus_constant (addr, delta); + } + emit_move_insn (gen_frame_mem (Pmode, addr), source); + } +} + + +void +thumb_set_return_address (rtx source, rtx scratch) +{ + arm_stack_offsets *offsets; + HOST_WIDE_INT delta; + HOST_WIDE_INT limit; + int reg; + rtx addr; + unsigned long mask; + + emit_use (source); + + offsets = arm_get_frame_offsets (); + mask = offsets->saved_regs_mask; + if (mask & (1 << LR_REGNUM)) + { + limit = 1024; + /* Find the saved regs. */ + if (frame_pointer_needed) + { + delta = offsets->soft_frame - offsets->saved_args; + reg = THUMB_HARD_FRAME_POINTER_REGNUM; + if (TARGET_THUMB1) + limit = 128; + } + else + { + delta = offsets->outgoing_args - offsets->saved_args; + reg = SP_REGNUM; + } + /* Allow for the stack frame. */ + if (TARGET_THUMB1 && TARGET_BACKTRACE) + delta -= 16; + /* The link register is always the first saved register. */ + delta -= 4; + + /* Construct the address. */ + addr = gen_rtx_REG (SImode, reg); + if (delta > limit) + { + emit_insn (gen_movsi (scratch, GEN_INT (delta))); + emit_insn (gen_addsi3 (scratch, scratch, stack_pointer_rtx)); + addr = scratch; + } + else + addr = plus_constant (addr, delta); + + emit_move_insn (gen_frame_mem (Pmode, addr), source); + } + else + emit_move_insn (gen_rtx_REG (Pmode, LR_REGNUM), source); +} + +/* Implements target hook vector_mode_supported_p. */ +bool +arm_vector_mode_supported_p (enum machine_mode mode) +{ + /* Neon also supports V2SImode, etc. listed in the clause below. */ + if (TARGET_NEON && (mode == V2SFmode || mode == V4SImode || mode == V8HImode + || mode == V16QImode || mode == V4SFmode || mode == V2DImode)) + return true; + + if ((TARGET_NEON || TARGET_IWMMXT) + && ((mode == V2SImode) + || (mode == V4HImode) + || (mode == V8QImode))) + return true; + + return false; +} + +/* Use the option -mvectorize-with-neon-quad to override the use of doubleword + registers when autovectorizing for Neon, at least until multiple vector + widths are supported properly by the middle-end. */ + +static enum machine_mode +arm_preferred_simd_mode (enum machine_mode mode) +{ + if (TARGET_NEON) + switch (mode) + { + case SFmode: + return TARGET_NEON_VECTORIZE_QUAD ? V4SFmode : V2SFmode; + case SImode: + return TARGET_NEON_VECTORIZE_QUAD ? V4SImode : V2SImode; + case HImode: + return TARGET_NEON_VECTORIZE_QUAD ? V8HImode : V4HImode; + case QImode: + return TARGET_NEON_VECTORIZE_QUAD ? V16QImode : V8QImode; + case DImode: + if (TARGET_NEON_VECTORIZE_QUAD) + return V2DImode; + break; + + default:; + } + + if (TARGET_REALLY_IWMMXT) + switch (mode) + { + case SImode: + return V2SImode; + case HImode: + return V4HImode; + case QImode: + return V8QImode; + + default:; + } + + return word_mode; +} + +/* Implement TARGET_CLASS_LIKELY_SPILLED_P. + + We need to define this for LO_REGS on thumb. Otherwise we can end up + using r0-r4 for function arguments, r7 for the stack frame and don't + have enough left over to do doubleword arithmetic. */ + +static bool +arm_class_likely_spilled_p (reg_class_t rclass) +{ + if ((TARGET_THUMB && rclass == LO_REGS) + || rclass == CC_REG) + return true; + + return false; +} + +/* Implements target hook small_register_classes_for_mode_p. */ +bool +arm_small_register_classes_for_mode_p (enum machine_mode mode ATTRIBUTE_UNUSED) +{ + return TARGET_THUMB1; +} + +/* Implement TARGET_SHIFT_TRUNCATION_MASK. SImode shifts use normal + ARM insns and therefore guarantee that the shift count is modulo 256. + DImode shifts (those implemented by lib1funcs.asm or by optabs.c) + guarantee no particular behavior for out-of-range counts. */ + +static unsigned HOST_WIDE_INT +arm_shift_truncation_mask (enum machine_mode mode) +{ + return mode == SImode ? 255 : 0; +} + + +/* Map internal gcc register numbers to DWARF2 register numbers. */ + +unsigned int +arm_dbx_register_number (unsigned int regno) +{ + if (regno < 16) + return regno; + + /* TODO: Legacy targets output FPA regs as registers 16-23 for backwards + compatibility. The EABI defines them as registers 96-103. */ + if (IS_FPA_REGNUM (regno)) + return (TARGET_AAPCS_BASED ? 96 : 16) + regno - FIRST_FPA_REGNUM; + + if (IS_VFP_REGNUM (regno)) + { + /* See comment in arm_dwarf_register_span. */ + if (VFP_REGNO_OK_FOR_SINGLE (regno)) + return 64 + regno - FIRST_VFP_REGNUM; + else + return 256 + (regno - FIRST_VFP_REGNUM) / 2; + } + + if (IS_IWMMXT_GR_REGNUM (regno)) + return 104 + regno - FIRST_IWMMXT_GR_REGNUM; + + if (IS_IWMMXT_REGNUM (regno)) + return 112 + regno - FIRST_IWMMXT_REGNUM; + + gcc_unreachable (); +} + +/* Dwarf models VFPv3 registers as 32 64-bit registers. + GCC models tham as 64 32-bit registers, so we need to describe this to + the DWARF generation code. Other registers can use the default. */ +static rtx +arm_dwarf_register_span (rtx rtl) +{ + unsigned regno; + int nregs; + int i; + rtx p; + + regno = REGNO (rtl); + if (!IS_VFP_REGNUM (regno)) + return NULL_RTX; + + /* XXX FIXME: The EABI defines two VFP register ranges: + 64-95: Legacy VFPv2 numbering for S0-S31 (obsolescent) + 256-287: D0-D31 + The recommended encoding for S0-S31 is a DW_OP_bit_piece of the + corresponding D register. Until GDB supports this, we shall use the + legacy encodings. We also use these encodings for D0-D15 for + compatibility with older debuggers. */ + if (VFP_REGNO_OK_FOR_SINGLE (regno)) + return NULL_RTX; + + nregs = GET_MODE_SIZE (GET_MODE (rtl)) / 8; + p = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (nregs)); + regno = (regno - FIRST_VFP_REGNUM) / 2; + for (i = 0; i < nregs; i++) + XVECEXP (p, 0, i) = gen_rtx_REG (DImode, 256 + regno + i); + + return p; +} + +#if ARM_UNWIND_INFO +/* Emit unwind directives for a store-multiple instruction or stack pointer + push during alignment. + These should only ever be generated by the function prologue code, so + expect them to have a particular form. */ + +static void +arm_unwind_emit_sequence (FILE * asm_out_file, rtx p) +{ + int i; + HOST_WIDE_INT offset; + HOST_WIDE_INT nregs; + int reg_size; + unsigned reg; + unsigned lastreg; + rtx e; + + e = XVECEXP (p, 0, 0); + if (GET_CODE (e) != SET) + abort (); + + /* First insn will adjust the stack pointer. */ + if (GET_CODE (e) != SET + || GET_CODE (XEXP (e, 0)) != REG + || REGNO (XEXP (e, 0)) != SP_REGNUM + || GET_CODE (XEXP (e, 1)) != PLUS) + abort (); + + offset = -INTVAL (XEXP (XEXP (e, 1), 1)); + nregs = XVECLEN (p, 0) - 1; + + reg = REGNO (XEXP (XVECEXP (p, 0, 1), 1)); + if (reg < 16) + { + /* The function prologue may also push pc, but not annotate it as it is + never restored. We turn this into a stack pointer adjustment. */ + if (nregs * 4 == offset - 4) + { + fprintf (asm_out_file, "\t.pad #4\n"); + offset -= 4; + } + reg_size = 4; + fprintf (asm_out_file, "\t.save {"); + } + else if (IS_VFP_REGNUM (reg)) + { + reg_size = 8; + fprintf (asm_out_file, "\t.vsave {"); + } + else if (reg >= FIRST_FPA_REGNUM && reg <= LAST_FPA_REGNUM) + { + /* FPA registers are done differently. */ + asm_fprintf (asm_out_file, "\t.save %r, %wd\n", reg, nregs); + return; + } + else + /* Unknown register type. */ + abort (); + + /* If the stack increment doesn't match the size of the saved registers, + something has gone horribly wrong. */ + if (offset != nregs * reg_size) + abort (); + + offset = 0; + lastreg = 0; + /* The remaining insns will describe the stores. */ + for (i = 1; i <= nregs; i++) + { + /* Expect (set (mem ) (reg)). + Where is (reg:SP) or (plus (reg:SP) (const_int)). */ + e = XVECEXP (p, 0, i); + if (GET_CODE (e) != SET + || GET_CODE (XEXP (e, 0)) != MEM + || GET_CODE (XEXP (e, 1)) != REG) + abort (); + + reg = REGNO (XEXP (e, 1)); + if (reg < lastreg) + abort (); + + if (i != 1) + fprintf (asm_out_file, ", "); + /* We can't use %r for vfp because we need to use the + double precision register names. */ + if (IS_VFP_REGNUM (reg)) + asm_fprintf (asm_out_file, "d%d", (reg - FIRST_VFP_REGNUM) / 2); + else + asm_fprintf (asm_out_file, "%r", reg); + +#ifdef ENABLE_CHECKING + /* Check that the addresses are consecutive. */ + e = XEXP (XEXP (e, 0), 0); + if (GET_CODE (e) == PLUS) + { + offset += reg_size; + if (GET_CODE (XEXP (e, 0)) != REG + || REGNO (XEXP (e, 0)) != SP_REGNUM + || GET_CODE (XEXP (e, 1)) != CONST_INT + || offset != INTVAL (XEXP (e, 1))) + abort (); + } + else if (i != 1 + || GET_CODE (e) != REG + || REGNO (e) != SP_REGNUM) + abort (); +#endif + } + fprintf (asm_out_file, "}\n"); +} + +/* Emit unwind directives for a SET. */ + +static void +arm_unwind_emit_set (FILE * asm_out_file, rtx p) +{ + rtx e0; + rtx e1; + unsigned reg; + + e0 = XEXP (p, 0); + e1 = XEXP (p, 1); + switch (GET_CODE (e0)) + { + case MEM: + /* Pushing a single register. */ + if (GET_CODE (XEXP (e0, 0)) != PRE_DEC + || GET_CODE (XEXP (XEXP (e0, 0), 0)) != REG + || REGNO (XEXP (XEXP (e0, 0), 0)) != SP_REGNUM) + abort (); + + asm_fprintf (asm_out_file, "\t.save "); + if (IS_VFP_REGNUM (REGNO (e1))) + asm_fprintf(asm_out_file, "{d%d}\n", + (REGNO (e1) - FIRST_VFP_REGNUM) / 2); + else + asm_fprintf(asm_out_file, "{%r}\n", REGNO (e1)); + break; + + case REG: + if (REGNO (e0) == SP_REGNUM) + { + /* A stack increment. */ + if (GET_CODE (e1) != PLUS + || GET_CODE (XEXP (e1, 0)) != REG + || REGNO (XEXP (e1, 0)) != SP_REGNUM + || GET_CODE (XEXP (e1, 1)) != CONST_INT) + abort (); + + asm_fprintf (asm_out_file, "\t.pad #%wd\n", + -INTVAL (XEXP (e1, 1))); + } + else if (REGNO (e0) == HARD_FRAME_POINTER_REGNUM) + { + HOST_WIDE_INT offset; + + if (GET_CODE (e1) == PLUS) + { + if (GET_CODE (XEXP (e1, 0)) != REG + || GET_CODE (XEXP (e1, 1)) != CONST_INT) + abort (); + reg = REGNO (XEXP (e1, 0)); + offset = INTVAL (XEXP (e1, 1)); + asm_fprintf (asm_out_file, "\t.setfp %r, %r, #%wd\n", + HARD_FRAME_POINTER_REGNUM, reg, + offset); + } + else if (GET_CODE (e1) == REG) + { + reg = REGNO (e1); + asm_fprintf (asm_out_file, "\t.setfp %r, %r\n", + HARD_FRAME_POINTER_REGNUM, reg); + } + else + abort (); + } + else if (GET_CODE (e1) == REG && REGNO (e1) == SP_REGNUM) + { + /* Move from sp to reg. */ + asm_fprintf (asm_out_file, "\t.movsp %r\n", REGNO (e0)); + } + else if (GET_CODE (e1) == PLUS + && GET_CODE (XEXP (e1, 0)) == REG + && REGNO (XEXP (e1, 0)) == SP_REGNUM + && GET_CODE (XEXP (e1, 1)) == CONST_INT) + { + /* Set reg to offset from sp. */ + asm_fprintf (asm_out_file, "\t.movsp %r, #%d\n", + REGNO (e0), (int)INTVAL(XEXP (e1, 1))); + } + else if (GET_CODE (e1) == UNSPEC && XINT (e1, 1) == UNSPEC_STACK_ALIGN) + { + /* Stack pointer save before alignment. */ + reg = REGNO (e0); + asm_fprintf (asm_out_file, "\t.unwind_raw 0, 0x%x @ vsp = r%d\n", + reg + 0x90, reg); + } + else + abort (); + break; + + default: + abort (); + } +} + + +/* Emit unwind directives for the given insn. */ + +static void +arm_unwind_emit (FILE * asm_out_file, rtx insn) +{ + rtx pat; + + if (arm_except_unwind_info (&global_options) != UI_TARGET) + return; + + if (!(flag_unwind_tables || crtl->uses_eh_lsda) + && (TREE_NOTHROW (current_function_decl) + || crtl->all_throwers_are_sibcalls)) + return; + + if (GET_CODE (insn) == NOTE || !RTX_FRAME_RELATED_P (insn)) + return; + + pat = find_reg_note (insn, REG_FRAME_RELATED_EXPR, NULL_RTX); + if (pat) + pat = XEXP (pat, 0); + else + pat = PATTERN (insn); + + switch (GET_CODE (pat)) + { + case SET: + arm_unwind_emit_set (asm_out_file, pat); + break; + + case SEQUENCE: + /* Store multiple. */ + arm_unwind_emit_sequence (asm_out_file, pat); + break; + + default: + abort(); + } +} + + +/* Output a reference from a function exception table to the type_info + object X. The EABI specifies that the symbol should be relocated by + an R_ARM_TARGET2 relocation. */ + +static bool +arm_output_ttype (rtx x) +{ + fputs ("\t.word\t", asm_out_file); + output_addr_const (asm_out_file, x); + /* Use special relocations for symbol references. */ + if (GET_CODE (x) != CONST_INT) + fputs ("(TARGET2)", asm_out_file); + fputc ('\n', asm_out_file); + + return TRUE; +} + +/* Implement TARGET_ASM_EMIT_EXCEPT_PERSONALITY. */ + +static void +arm_asm_emit_except_personality (rtx personality) +{ + fputs ("\t.personality\t", asm_out_file); + output_addr_const (asm_out_file, personality); + fputc ('\n', asm_out_file); +} + +/* Implement TARGET_ASM_INITIALIZE_SECTIONS. */ + +static void +arm_asm_init_sections (void) +{ + exception_section = get_unnamed_section (0, output_section_asm_op, + "\t.handlerdata"); +} +#endif /* ARM_UNWIND_INFO */ + +/* Implement TARGET_EXCEPT_UNWIND_INFO. */ + +static enum unwind_info_type +arm_except_unwind_info (struct gcc_options *opts) +{ + /* Honor the --enable-sjlj-exceptions configure switch. */ +#ifdef CONFIG_SJLJ_EXCEPTIONS + if (CONFIG_SJLJ_EXCEPTIONS) + return UI_SJLJ; +#endif + + /* If not using ARM EABI unwind tables... */ + if (ARM_UNWIND_INFO) + { + /* For simplicity elsewhere in this file, indicate that all unwind + info is disabled if we're not emitting unwind tables. */ + if (!opts->x_flag_exceptions && !opts->x_flag_unwind_tables) + return UI_NONE; + else + return UI_TARGET; + } + + /* ... we use sjlj exceptions for backwards compatibility. */ + return UI_SJLJ; +} + + +/* Handle UNSPEC DWARF call frame instructions. These are needed for dynamic + stack alignment. */ + +static void +arm_dwarf_handle_frame_unspec (const char *label, rtx pattern, int index) +{ + rtx unspec = SET_SRC (pattern); + gcc_assert (GET_CODE (unspec) == UNSPEC); + + switch (index) + { + case UNSPEC_STACK_ALIGN: + /* ??? We should set the CFA = (SP & ~7). At this point we haven't + put anything on the stack, so hopefully it won't matter. + CFA = SP will be correct after alignment. */ + dwarf2out_reg_save_reg (label, stack_pointer_rtx, + SET_DEST (pattern)); + break; + default: + gcc_unreachable (); + } +} + + +/* Output unwind directives for the start/end of a function. */ + +void +arm_output_fn_unwind (FILE * f, bool prologue) +{ + if (arm_except_unwind_info (&global_options) != UI_TARGET) + return; + + if (prologue) + fputs ("\t.fnstart\n", f); + else + { + /* If this function will never be unwound, then mark it as such. + The came condition is used in arm_unwind_emit to suppress + the frame annotations. */ + if (!(flag_unwind_tables || crtl->uses_eh_lsda) + && (TREE_NOTHROW (current_function_decl) + || crtl->all_throwers_are_sibcalls)) + fputs("\t.cantunwind\n", f); + + fputs ("\t.fnend\n", f); + } +} + +static bool +arm_emit_tls_decoration (FILE *fp, rtx x) +{ + enum tls_reloc reloc; + rtx val; + + val = XVECEXP (x, 0, 0); + reloc = (enum tls_reloc) INTVAL (XVECEXP (x, 0, 1)); + + output_addr_const (fp, val); + + switch (reloc) + { + case TLS_GD32: + fputs ("(tlsgd)", fp); + break; + case TLS_LDM32: + fputs ("(tlsldm)", fp); + break; + case TLS_LDO32: + fputs ("(tlsldo)", fp); + break; + case TLS_IE32: + fputs ("(gottpoff)", fp); + break; + case TLS_LE32: + fputs ("(tpoff)", fp); + break; + default: + gcc_unreachable (); + } + + switch (reloc) + { + case TLS_GD32: + case TLS_LDM32: + case TLS_IE32: + fputs (" + (. - ", fp); + output_addr_const (fp, XVECEXP (x, 0, 2)); + fputs (" - ", fp); + output_addr_const (fp, XVECEXP (x, 0, 3)); + fputc (')', fp); + break; + default: + break; + } + + return TRUE; +} + +/* ARM implementation of TARGET_ASM_OUTPUT_DWARF_DTPREL. */ + +static void +arm_output_dwarf_dtprel (FILE *file, int size, rtx x) +{ + gcc_assert (size == 4); + fputs ("\t.word\t", file); + output_addr_const (file, x); + fputs ("(tlsldo)", file); +} + +/* Implement TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */ + +static bool +arm_output_addr_const_extra (FILE *fp, rtx x) +{ + if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS) + return arm_emit_tls_decoration (fp, x); + else if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_PIC_LABEL) + { + char label[256]; + int labelno = INTVAL (XVECEXP (x, 0, 0)); + + ASM_GENERATE_INTERNAL_LABEL (label, "LPIC", labelno); + assemble_name_raw (fp, label); + + return TRUE; + } + else if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_GOTSYM_OFF) + { + assemble_name (fp, "_GLOBAL_OFFSET_TABLE_"); + if (GOT_PCREL) + fputs ("+.", fp); + fputs ("-(", fp); + output_addr_const (fp, XVECEXP (x, 0, 0)); + fputc (')', fp); + return TRUE; + } + else if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SYMBOL_OFFSET) + { + output_addr_const (fp, XVECEXP (x, 0, 0)); + if (GOT_PCREL) + fputs ("+.", fp); + fputs ("-(", fp); + output_addr_const (fp, XVECEXP (x, 0, 1)); + fputc (')', fp); + return TRUE; + } + else if (GET_CODE (x) == CONST_VECTOR) + return arm_emit_vector_const (fp, x); + + return FALSE; +} + +/* Output assembly for a shift instruction. + SET_FLAGS determines how the instruction modifies the condition codes. + 0 - Do not set condition codes. + 1 - Set condition codes. + 2 - Use smallest instruction. */ +const char * +arm_output_shift(rtx * operands, int set_flags) +{ + char pattern[100]; + static const char flag_chars[3] = {'?', '.', '!'}; + const char *shift; + HOST_WIDE_INT val; + char c; + + c = flag_chars[set_flags]; + if (TARGET_UNIFIED_ASM) + { + shift = shift_op(operands[3], &val); + if (shift) + { + if (val != -1) + operands[2] = GEN_INT(val); + sprintf (pattern, "%s%%%c\t%%0, %%1, %%2", shift, c); + } + else + sprintf (pattern, "mov%%%c\t%%0, %%1", c); + } + else + sprintf (pattern, "mov%%%c\t%%0, %%1%%S3", c); + output_asm_insn (pattern, operands); + return ""; +} + +/* Output a Thumb-1 casesi dispatch sequence. */ +const char * +thumb1_output_casesi (rtx *operands) +{ + rtx diff_vec = PATTERN (next_real_insn (operands[0])); + + gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC); + + switch (GET_MODE(diff_vec)) + { + case QImode: + return (ADDR_DIFF_VEC_FLAGS (diff_vec).offset_unsigned ? + "bl\t%___gnu_thumb1_case_uqi" : "bl\t%___gnu_thumb1_case_sqi"); + case HImode: + return (ADDR_DIFF_VEC_FLAGS (diff_vec).offset_unsigned ? + "bl\t%___gnu_thumb1_case_uhi" : "bl\t%___gnu_thumb1_case_shi"); + case SImode: + return "bl\t%___gnu_thumb1_case_si"; + default: + gcc_unreachable (); + } +} + +/* Output a Thumb-2 casesi instruction. */ +const char * +thumb2_output_casesi (rtx *operands) +{ + rtx diff_vec = PATTERN (next_real_insn (operands[2])); + + gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC); + + output_asm_insn ("cmp\t%0, %1", operands); + output_asm_insn ("bhi\t%l3", operands); + switch (GET_MODE(diff_vec)) + { + case QImode: + return "tbb\t[%|pc, %0]"; + case HImode: + return "tbh\t[%|pc, %0, lsl #1]"; + case SImode: + if (flag_pic) + { + output_asm_insn ("adr\t%4, %l2", operands); + output_asm_insn ("ldr\t%5, [%4, %0, lsl #2]", operands); + output_asm_insn ("add\t%4, %4, %5", operands); + return "bx\t%4"; + } + else + { + output_asm_insn ("adr\t%4, %l2", operands); + return "ldr\t%|pc, [%4, %0, lsl #2]"; + } + default: + gcc_unreachable (); + } +} + +/* Most ARM cores are single issue, but some newer ones can dual issue. + The scheduler descriptions rely on this being correct. */ +static int +arm_issue_rate (void) +{ + switch (arm_tune) + { + case cortexr4: + case cortexr4f: + case cortexa5: + case cortexa8: + case cortexa9: + case fa726te: + return 2; + + default: + return 1; + } +} + +/* A table and a function to perform ARM-specific name mangling for + NEON vector types in order to conform to the AAPCS (see "Procedure + Call Standard for the ARM Architecture", Appendix A). To qualify + for emission with the mangled names defined in that document, a + vector type must not only be of the correct mode but also be + composed of NEON vector element types (e.g. __builtin_neon_qi). */ +typedef struct +{ + enum machine_mode mode; + const char *element_type_name; + const char *aapcs_name; +} arm_mangle_map_entry; + +static arm_mangle_map_entry arm_mangle_map[] = { + /* 64-bit containerized types. */ + { V8QImode, "__builtin_neon_qi", "15__simd64_int8_t" }, + { V8QImode, "__builtin_neon_uqi", "16__simd64_uint8_t" }, + { V4HImode, "__builtin_neon_hi", "16__simd64_int16_t" }, + { V4HImode, "__builtin_neon_uhi", "17__simd64_uint16_t" }, + { V2SImode, "__builtin_neon_si", "16__simd64_int32_t" }, + { V2SImode, "__builtin_neon_usi", "17__simd64_uint32_t" }, + { V2SFmode, "__builtin_neon_sf", "18__simd64_float32_t" }, + { V8QImode, "__builtin_neon_poly8", "16__simd64_poly8_t" }, + { V4HImode, "__builtin_neon_poly16", "17__simd64_poly16_t" }, + /* 128-bit containerized types. */ + { V16QImode, "__builtin_neon_qi", "16__simd128_int8_t" }, + { V16QImode, "__builtin_neon_uqi", "17__simd128_uint8_t" }, + { V8HImode, "__builtin_neon_hi", "17__simd128_int16_t" }, + { V8HImode, "__builtin_neon_uhi", "18__simd128_uint16_t" }, + { V4SImode, "__builtin_neon_si", "17__simd128_int32_t" }, + { V4SImode, "__builtin_neon_usi", "18__simd128_uint32_t" }, + { V4SFmode, "__builtin_neon_sf", "19__simd128_float32_t" }, + { V16QImode, "__builtin_neon_poly8", "17__simd128_poly8_t" }, + { V8HImode, "__builtin_neon_poly16", "18__simd128_poly16_t" }, + { VOIDmode, NULL, NULL } +}; + +const char * +arm_mangle_type (const_tree type) +{ + arm_mangle_map_entry *pos = arm_mangle_map; + + /* The ARM ABI documents (10th October 2008) say that "__va_list" + has to be managled as if it is in the "std" namespace. */ + if (TARGET_AAPCS_BASED + && lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type)) + { + static bool warned; + if (!warned && warn_psabi && !in_system_header) + { + warned = true; + inform (input_location, + "the mangling of % has changed in GCC 4.4"); + } + return "St9__va_list"; + } + + /* Half-precision float. */ + if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16) + return "Dh"; + + if (TREE_CODE (type) != VECTOR_TYPE) + return NULL; + + /* Check the mode of the vector type, and the name of the vector + element type, against the table. */ + while (pos->mode != VOIDmode) + { + tree elt_type = TREE_TYPE (type); + + if (pos->mode == TYPE_MODE (type) + && TREE_CODE (TYPE_NAME (elt_type)) == TYPE_DECL + && !strcmp (IDENTIFIER_POINTER (DECL_NAME (TYPE_NAME (elt_type))), + pos->element_type_name)) + return pos->aapcs_name; + + pos++; + } + + /* Use the default mangling for unrecognized (possibly user-defined) + vector types. */ + return NULL; +} + +/* Order of allocation of core registers for Thumb: this allocation is + written over the corresponding initial entries of the array + initialized with REG_ALLOC_ORDER. We allocate all low registers + first. Saving and restoring a low register is usually cheaper than + using a call-clobbered high register. */ + +static const int thumb_core_reg_alloc_order[] = +{ + 3, 2, 1, 0, 4, 5, 6, 7, + 14, 12, 8, 9, 10, 11, 13, 15 +}; + +/* Adjust register allocation order when compiling for Thumb. */ + +void +arm_order_regs_for_local_alloc (void) +{ + const int arm_reg_alloc_order[] = REG_ALLOC_ORDER; + memcpy(reg_alloc_order, arm_reg_alloc_order, sizeof (reg_alloc_order)); + if (TARGET_THUMB) + memcpy (reg_alloc_order, thumb_core_reg_alloc_order, + sizeof (thumb_core_reg_alloc_order)); +} + +/* Implement TARGET_FRAME_POINTER_REQUIRED. */ + +bool +arm_frame_pointer_required (void) +{ + return (cfun->has_nonlocal_label + || SUBTARGET_FRAME_POINTER_REQUIRED + || (TARGET_ARM && TARGET_APCS_FRAME && ! leaf_function_p ())); +} + +/* Only thumb1 can't support conditional execution, so return true if + the target is not thumb1. */ +static bool +arm_have_conditional_execution (void) +{ + return !TARGET_THUMB1; +} + +/* Legitimize a memory reference for sync primitive implemented using + ldrex / strex. We currently force the form of the reference to be + indirect without offset. We do not yet support the indirect offset + addressing supported by some ARM targets for these + instructions. */ +static rtx +arm_legitimize_sync_memory (rtx memory) +{ + rtx addr = force_reg (Pmode, XEXP (memory, 0)); + rtx legitimate_memory = gen_rtx_MEM (GET_MODE (memory), addr); + + set_mem_alias_set (legitimate_memory, ALIAS_SET_MEMORY_BARRIER); + MEM_VOLATILE_P (legitimate_memory) = MEM_VOLATILE_P (memory); + return legitimate_memory; +} + +/* An instruction emitter. */ +typedef void (* emit_f) (int label, const char *, rtx *); + +/* An instruction emitter that emits via the conventional + output_asm_insn. */ +static void +arm_emit (int label ATTRIBUTE_UNUSED, const char *pattern, rtx *operands) +{ + output_asm_insn (pattern, operands); +} + +/* Count the number of emitted synchronization instructions. */ +static unsigned arm_insn_count; + +/* An emitter that counts emitted instructions but does not actually + emit instruction into the the instruction stream. */ +static void +arm_count (int label, + const char *pattern ATTRIBUTE_UNUSED, + rtx *operands ATTRIBUTE_UNUSED) +{ + if (! label) + ++ arm_insn_count; +} + +/* Construct a pattern using conventional output formatting and feed + it to output_asm_insn. Provides a mechanism to construct the + output pattern on the fly. Note the hard limit on the pattern + buffer size. */ +static void ATTRIBUTE_PRINTF_4 +arm_output_asm_insn (emit_f emit, int label, rtx *operands, + const char *pattern, ...) +{ + va_list ap; + char buffer[256]; + + va_start (ap, pattern); + vsprintf (buffer, pattern, ap); + va_end (ap); + emit (label, buffer, operands); +} + +/* Emit the memory barrier instruction, if any, provided by this + target to a specified emitter. */ +static void +arm_process_output_memory_barrier (emit_f emit, rtx *operands) +{ + if (TARGET_HAVE_DMB) + { + /* Note we issue a system level barrier. We should consider + issuing a inner shareabilty zone barrier here instead, ie. + "DMB ISH". */ + emit (0, "dmb\tsy", operands); + return; + } + + if (TARGET_HAVE_DMB_MCR) + { + emit (0, "mcr\tp15, 0, r0, c7, c10, 5", operands); + return; + } + + gcc_unreachable (); +} + +/* Emit the memory barrier instruction, if any, provided by this + target. */ +const char * +arm_output_memory_barrier (rtx *operands) +{ + arm_process_output_memory_barrier (arm_emit, operands); + return ""; +} + +/* Helper to figure out the instruction suffix required on ldrex/strex + for operations on an object of the specified mode. */ +static const char * +arm_ldrex_suffix (enum machine_mode mode) +{ + switch (mode) + { + case QImode: return "b"; + case HImode: return "h"; + case SImode: return ""; + case DImode: return "d"; + default: + gcc_unreachable (); + } + return ""; +} + +/* Emit an ldrex{b,h,d, } instruction appropriate for the specified + mode. */ +static void +arm_output_ldrex (emit_f emit, + enum machine_mode mode, + rtx target, + rtx memory) +{ + const char *suffix = arm_ldrex_suffix (mode); + rtx operands[2]; + + operands[0] = target; + operands[1] = memory; + arm_output_asm_insn (emit, 0, operands, "ldrex%s\t%%0, %%C1", suffix); +} + +/* Emit a strex{b,h,d, } instruction appropriate for the specified + mode. */ +static void +arm_output_strex (emit_f emit, + enum machine_mode mode, + const char *cc, + rtx result, + rtx value, + rtx memory) +{ + const char *suffix = arm_ldrex_suffix (mode); + rtx operands[3]; + + operands[0] = result; + operands[1] = value; + operands[2] = memory; + arm_output_asm_insn (emit, 0, operands, "strex%s%s\t%%0, %%1, %%C2", suffix, + cc); +} + +/* Helper to emit a two operand instruction. */ +static void +arm_output_op2 (emit_f emit, const char *mnemonic, rtx d, rtx s) +{ + rtx operands[2]; + + operands[0] = d; + operands[1] = s; + arm_output_asm_insn (emit, 0, operands, "%s\t%%0, %%1", mnemonic); +} + +/* Helper to emit a three operand instruction. */ +static void +arm_output_op3 (emit_f emit, const char *mnemonic, rtx d, rtx a, rtx b) +{ + rtx operands[3]; + + operands[0] = d; + operands[1] = a; + operands[2] = b; + arm_output_asm_insn (emit, 0, operands, "%s\t%%0, %%1, %%2", mnemonic); +} + +/* Emit a load store exclusive synchronization loop. + + do + old_value = [mem] + if old_value != required_value + break; + t1 = sync_op (old_value, new_value) + [mem] = t1, t2 = [0|1] + while ! t2 + + Note: + t1 == t2 is not permitted + t1 == old_value is permitted + + required_value: + + RTX register or const_int representing the required old_value for + the modify to continue, if NULL no comparsion is performed. */ +static void +arm_output_sync_loop (emit_f emit, + enum machine_mode mode, + rtx old_value, + rtx memory, + rtx required_value, + rtx new_value, + rtx t1, + rtx t2, + enum attr_sync_op sync_op, + int early_barrier_required) +{ + rtx operands[1]; + + gcc_assert (t1 != t2); + + if (early_barrier_required) + arm_process_output_memory_barrier (emit, NULL); + + arm_output_asm_insn (emit, 1, operands, "%sLSYT%%=:", LOCAL_LABEL_PREFIX); + + arm_output_ldrex (emit, mode, old_value, memory); + + if (required_value) + { + rtx operands[2]; + + operands[0] = old_value; + operands[1] = required_value; + arm_output_asm_insn (emit, 0, operands, "cmp\t%%0, %%1"); + arm_output_asm_insn (emit, 0, operands, "bne\t%sLSYB%%=", LOCAL_LABEL_PREFIX); + } + + switch (sync_op) + { + case SYNC_OP_ADD: + arm_output_op3 (emit, "add", t1, old_value, new_value); + break; + + case SYNC_OP_SUB: + arm_output_op3 (emit, "sub", t1, old_value, new_value); + break; + + case SYNC_OP_IOR: + arm_output_op3 (emit, "orr", t1, old_value, new_value); + break; + + case SYNC_OP_XOR: + arm_output_op3 (emit, "eor", t1, old_value, new_value); + break; + + case SYNC_OP_AND: + arm_output_op3 (emit,"and", t1, old_value, new_value); + break; + + case SYNC_OP_NAND: + arm_output_op3 (emit, "and", t1, old_value, new_value); + arm_output_op2 (emit, "mvn", t1, t1); + break; + + case SYNC_OP_NONE: + t1 = new_value; + break; + } + + if (t2) + { + arm_output_strex (emit, mode, "", t2, t1, memory); + operands[0] = t2; + arm_output_asm_insn (emit, 0, operands, "teq\t%%0, #0"); + arm_output_asm_insn (emit, 0, operands, "bne\t%sLSYT%%=", + LOCAL_LABEL_PREFIX); + } + else + { + /* Use old_value for the return value because for some operations + the old_value can easily be restored. This saves one register. */ + arm_output_strex (emit, mode, "", old_value, t1, memory); + operands[0] = old_value; + arm_output_asm_insn (emit, 0, operands, "teq\t%%0, #0"); + arm_output_asm_insn (emit, 0, operands, "bne\t%sLSYT%%=", + LOCAL_LABEL_PREFIX); + + switch (sync_op) + { + case SYNC_OP_ADD: + arm_output_op3 (emit, "sub", old_value, t1, new_value); + break; + + case SYNC_OP_SUB: + arm_output_op3 (emit, "add", old_value, t1, new_value); + break; + + case SYNC_OP_XOR: + arm_output_op3 (emit, "eor", old_value, t1, new_value); + break; + + case SYNC_OP_NONE: + arm_output_op2 (emit, "mov", old_value, required_value); + break; + + default: + gcc_unreachable (); + } + } + + /* Note: label is before barrier so that in cmp failure case we still get + a barrier to stop subsequent loads floating upwards past the ldrex + PR target/48126. */ + arm_output_asm_insn (emit, 1, operands, "%sLSYB%%=:", LOCAL_LABEL_PREFIX); + arm_process_output_memory_barrier (emit, NULL); +} + +static rtx +arm_get_sync_operand (rtx *operands, int index, rtx default_value) +{ + if (index > 0) + default_value = operands[index - 1]; + + return default_value; +} + +#define FETCH_SYNC_OPERAND(NAME, DEFAULT) \ + arm_get_sync_operand (operands, (int) get_attr_sync_##NAME (insn), DEFAULT); + +/* Extract the operands for a synchroniztion instruction from the + instructions attributes and emit the instruction. */ +static void +arm_process_output_sync_insn (emit_f emit, rtx insn, rtx *operands) +{ + rtx result, memory, required_value, new_value, t1, t2; + int early_barrier; + enum machine_mode mode; + enum attr_sync_op sync_op; + + result = FETCH_SYNC_OPERAND(result, 0); + memory = FETCH_SYNC_OPERAND(memory, 0); + required_value = FETCH_SYNC_OPERAND(required_value, 0); + new_value = FETCH_SYNC_OPERAND(new_value, 0); + t1 = FETCH_SYNC_OPERAND(t1, 0); + t2 = FETCH_SYNC_OPERAND(t2, 0); + early_barrier = + get_attr_sync_release_barrier (insn) == SYNC_RELEASE_BARRIER_YES; + sync_op = get_attr_sync_op (insn); + mode = GET_MODE (memory); + + arm_output_sync_loop (emit, mode, result, memory, required_value, + new_value, t1, t2, sync_op, early_barrier); +} + +/* Emit a synchronization instruction loop. */ +const char * +arm_output_sync_insn (rtx insn, rtx *operands) +{ + arm_process_output_sync_insn (arm_emit, insn, operands); + return ""; +} + +/* Count the number of machine instruction that will be emitted for a + synchronization instruction. Note that the emitter used does not + emit instructions, it just counts instructions being carefull not + to count labels. */ +unsigned int +arm_sync_loop_insns (rtx insn, rtx *operands) +{ + arm_insn_count = 0; + arm_process_output_sync_insn (arm_count, insn, operands); + return arm_insn_count; +} + +/* Helper to call a target sync instruction generator, dealing with + the variation in operands required by the different generators. */ +static rtx +arm_call_generator (struct arm_sync_generator *generator, rtx old_value, + rtx memory, rtx required_value, rtx new_value) +{ + switch (generator->op) + { + case arm_sync_generator_omn: + gcc_assert (! required_value); + return generator->u.omn (old_value, memory, new_value); + + case arm_sync_generator_omrn: + gcc_assert (required_value); + return generator->u.omrn (old_value, memory, required_value, new_value); + } + + return NULL; +} + +/* Expand a synchronization loop. The synchronization loop is expanded + as an opaque block of instructions in order to ensure that we do + not subsequently get extraneous memory accesses inserted within the + critical region. The exclusive access property of ldrex/strex is + only guaranteed in there are no intervening memory accesses. */ +void +arm_expand_sync (enum machine_mode mode, + struct arm_sync_generator *generator, + rtx target, rtx memory, rtx required_value, rtx new_value) +{ + if (target == NULL) + target = gen_reg_rtx (mode); + + memory = arm_legitimize_sync_memory (memory); + if (mode != SImode) + { + rtx load_temp = gen_reg_rtx (SImode); + + if (required_value) + required_value = convert_modes (SImode, mode, required_value, true); + + new_value = convert_modes (SImode, mode, new_value, true); + emit_insn (arm_call_generator (generator, load_temp, memory, + required_value, new_value)); + emit_move_insn (target, gen_lowpart (mode, load_temp)); + } + else + { + emit_insn (arm_call_generator (generator, target, memory, required_value, + new_value)); + } +} + +static bool +arm_vector_alignment_reachable (const_tree type, bool is_packed) +{ + /* Vectors which aren't in packed structures will not be less aligned than + the natural alignment of their element type, so this is safe. */ + if (TARGET_NEON && !BYTES_BIG_ENDIAN) + return !is_packed; + + return default_builtin_vector_alignment_reachable (type, is_packed); +} + +static bool +arm_builtin_support_vector_misalignment (enum machine_mode mode, + const_tree type, int misalignment, + bool is_packed) +{ + if (TARGET_NEON && !BYTES_BIG_ENDIAN) + { + HOST_WIDE_INT align = TYPE_ALIGN_UNIT (type); + + if (is_packed) + return align == 1; + + /* If the misalignment is unknown, we should be able to handle the access + so long as it is not to a member of a packed data structure. */ + if (misalignment == -1) + return true; + + /* Return true if the misalignment is a multiple of the natural alignment + of the vector's element type. This is probably always going to be + true in practice, since we've already established that this isn't a + packed access. */ + return ((misalignment % align) == 0); + } + + return default_builtin_support_vector_misalignment (mode, type, misalignment, + is_packed); +} + +static void +arm_conditional_register_usage (void) +{ + int regno; + + if (TARGET_SOFT_FLOAT || TARGET_THUMB1 || !TARGET_FPA) + { + for (regno = FIRST_FPA_REGNUM; + regno <= LAST_FPA_REGNUM; ++regno) + fixed_regs[regno] = call_used_regs[regno] = 1; + } + + if (TARGET_THUMB1 && optimize_size) + { + /* When optimizing for size on Thumb-1, it's better not + to use the HI regs, because of the overhead of + stacking them. */ + for (regno = FIRST_HI_REGNUM; + regno <= LAST_HI_REGNUM; ++regno) + fixed_regs[regno] = call_used_regs[regno] = 1; + } + + /* The link register can be clobbered by any branch insn, + but we have no way to track that at present, so mark + it as unavailable. */ + if (TARGET_THUMB1) + fixed_regs[LR_REGNUM] = call_used_regs[LR_REGNUM] = 1; + + if (TARGET_32BIT && TARGET_HARD_FLOAT) + { + if (TARGET_MAVERICK) + { + for (regno = FIRST_FPA_REGNUM; + regno <= LAST_FPA_REGNUM; ++ regno) + fixed_regs[regno] = call_used_regs[regno] = 1; + for (regno = FIRST_CIRRUS_FP_REGNUM; + regno <= LAST_CIRRUS_FP_REGNUM; ++ regno) + { + fixed_regs[regno] = 0; + call_used_regs[regno] = regno < FIRST_CIRRUS_FP_REGNUM + 4; + } + } + if (TARGET_VFP) + { + /* VFPv3 registers are disabled when earlier VFP + versions are selected due to the definition of + LAST_VFP_REGNUM. */ + for (regno = FIRST_VFP_REGNUM; + regno <= LAST_VFP_REGNUM; ++ regno) + { + fixed_regs[regno] = 0; + call_used_regs[regno] = regno < FIRST_VFP_REGNUM + 16 + || regno >= FIRST_VFP_REGNUM + 32; + } + } + } + + if (TARGET_REALLY_IWMMXT) + { + regno = FIRST_IWMMXT_GR_REGNUM; + /* The 2002/10/09 revision of the XScale ABI has wCG0 + and wCG1 as call-preserved registers. The 2002/11/21 + revision changed this so that all wCG registers are + scratch registers. */ + for (regno = FIRST_IWMMXT_GR_REGNUM; + regno <= LAST_IWMMXT_GR_REGNUM; ++ regno) + fixed_regs[regno] = 0; + /* The XScale ABI has wR0 - wR9 as scratch registers, + the rest as call-preserved registers. */ + for (regno = FIRST_IWMMXT_REGNUM; + regno <= LAST_IWMMXT_REGNUM; ++ regno) + { + fixed_regs[regno] = 0; + call_used_regs[regno] = regno < FIRST_IWMMXT_REGNUM + 10; + } + } + + if ((unsigned) PIC_OFFSET_TABLE_REGNUM != INVALID_REGNUM) + { + fixed_regs[PIC_OFFSET_TABLE_REGNUM] = 1; + call_used_regs[PIC_OFFSET_TABLE_REGNUM] = 1; + } + else if (TARGET_APCS_STACK) + { + fixed_regs[10] = 1; + call_used_regs[10] = 1; + } + /* -mcaller-super-interworking reserves r11 for calls to + _interwork_r11_call_via_rN(). Making the register global + is an easy way of ensuring that it remains valid for all + calls. */ + if (TARGET_APCS_FRAME || TARGET_CALLER_INTERWORKING + || TARGET_TPCS_FRAME || TARGET_TPCS_LEAF_FRAME) + { + fixed_regs[ARM_HARD_FRAME_POINTER_REGNUM] = 1; + call_used_regs[ARM_HARD_FRAME_POINTER_REGNUM] = 1; + if (TARGET_CALLER_INTERWORKING) + global_regs[ARM_HARD_FRAME_POINTER_REGNUM] = 1; + } + SUBTARGET_CONDITIONAL_REGISTER_USAGE +} + +static reg_class_t +arm_preferred_rename_class (reg_class_t rclass) +{ + /* Thumb-2 instructions using LO_REGS may be smaller than instructions + using GENERIC_REGS. During register rename pass, we prefer LO_REGS, + and code size can be reduced. */ + if (TARGET_THUMB2 && rclass == GENERAL_REGS) + return LO_REGS; + else + return NO_REGS; +} + +#include "gt-arm.h" diff --git a/gcc/config/arm/arm.h b/gcc/config/arm/arm.h new file mode 100644 index 000000000..292b48f96 --- /dev/null +++ b/gcc/config/arm/arm.h @@ -0,0 +1,2464 @@ +/* Definitions of target machine for GNU compiler, for ARM. + Copyright (C) 1991, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, + 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 + Free Software Foundation, Inc. + Contributed by Pieter `Tiggr' Schoenmakers (rcpieter@win.tue.nl) + and Martin Simmons (@harleqn.co.uk). + More major hacks by Richard Earnshaw (rearnsha@arm.com) + Minor hacks by Nick Clifton (nickc@cygnus.com) + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3, or (at your + option) any later version. + + GCC is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public + License for more details. + + You should have received a copy of the GNU General Public License + along with GCC; see the file COPYING3. If not see + . */ + +#ifndef GCC_ARM_H +#define GCC_ARM_H + +/* We can't use enum machine_mode inside a generator file because it + hasn't been created yet; we shouldn't be using any code that + needs the real definition though, so this ought to be safe. */ +#ifdef GENERATOR_FILE +#define MACHMODE int +#else +#include "insn-modes.h" +#define MACHMODE enum machine_mode +#endif + +#include "config/vxworks-dummy.h" + +/* The architecture define. */ +extern char arm_arch_name[]; + +/* Target CPU builtins. */ +#define TARGET_CPU_CPP_BUILTINS() \ + do \ + { \ + if (TARGET_DSP_MULTIPLY) \ + builtin_define ("__ARM_FEATURE_DSP"); \ + /* Define __arm__ even when in thumb mode, for \ + consistency with armcc. */ \ + builtin_define ("__arm__"); \ + builtin_define ("__APCS_32__"); \ + if (TARGET_THUMB) \ + builtin_define ("__thumb__"); \ + if (TARGET_THUMB2) \ + builtin_define ("__thumb2__"); \ + \ + if (TARGET_BIG_END) \ + { \ + builtin_define ("__ARMEB__"); \ + if (TARGET_THUMB) \ + builtin_define ("__THUMBEB__"); \ + if (TARGET_LITTLE_WORDS) \ + builtin_define ("__ARMWEL__"); \ + } \ + else \ + { \ + builtin_define ("__ARMEL__"); \ + if (TARGET_THUMB) \ + builtin_define ("__THUMBEL__"); \ + } \ + \ + if (TARGET_SOFT_FLOAT) \ + builtin_define ("__SOFTFP__"); \ + \ + if (TARGET_VFP) \ + builtin_define ("__VFP_FP__"); \ + \ + if (TARGET_NEON) \ + builtin_define ("__ARM_NEON__"); \ + \ + /* Add a define for interworking. \ + Needed when building libgcc.a. */ \ + if (arm_cpp_interwork) \ + builtin_define ("__THUMB_INTERWORK__"); \ + \ + builtin_assert ("cpu=arm"); \ + builtin_assert ("machine=arm"); \ + \ + builtin_define (arm_arch_name); \ + if (arm_arch_cirrus) \ + builtin_define ("__MAVERICK__"); \ + if (arm_arch_xscale) \ + builtin_define ("__XSCALE__"); \ + if (arm_arch_iwmmxt) \ + builtin_define ("__IWMMXT__"); \ + if (TARGET_AAPCS_BASED) \ + { \ + if (arm_pcs_default == ARM_PCS_AAPCS_VFP) \ + builtin_define ("__ARM_PCS_VFP"); \ + else if (arm_pcs_default == ARM_PCS_AAPCS) \ + builtin_define ("__ARM_PCS"); \ + builtin_define ("__ARM_EABI__"); \ + } \ + } while (0) + +/* The various ARM cores. */ +enum processor_type +{ +#define ARM_CORE(NAME, IDENT, ARCH, FLAGS, COSTS) \ + IDENT, +#include "arm-cores.def" +#undef ARM_CORE + /* Used to indicate that no processor has been specified. */ + arm_none +}; + +enum target_cpus +{ +#define ARM_CORE(NAME, IDENT, ARCH, FLAGS, COSTS) \ + TARGET_CPU_##IDENT, +#include "arm-cores.def" +#undef ARM_CORE + TARGET_CPU_generic +}; + +/* The processor for which instructions should be scheduled. */ +extern enum processor_type arm_tune; + +enum arm_sync_generator_tag + { + arm_sync_generator_omn, + arm_sync_generator_omrn + }; + +/* Wrapper to pass around a polymorphic pointer to a sync instruction + generator and. */ +struct arm_sync_generator +{ + enum arm_sync_generator_tag op; + union + { + rtx (* omn) (rtx, rtx, rtx); + rtx (* omrn) (rtx, rtx, rtx, rtx); + } u; +}; + +typedef enum arm_cond_code +{ + ARM_EQ = 0, ARM_NE, ARM_CS, ARM_CC, ARM_MI, ARM_PL, ARM_VS, ARM_VC, + ARM_HI, ARM_LS, ARM_GE, ARM_LT, ARM_GT, ARM_LE, ARM_AL, ARM_NV +} +arm_cc; + +extern arm_cc arm_current_cc; + +#define ARM_INVERSE_CONDITION_CODE(X) ((arm_cc) (((int)X) ^ 1)) + +extern int arm_target_label; +extern int arm_ccfsm_state; +extern GTY(()) rtx arm_target_insn; +/* The label of the current constant pool. */ +extern rtx pool_vector_label; +/* Set to 1 when a return insn is output, this means that the epilogue + is not needed. */ +extern int return_used_this_function; +/* Callback to output language specific object attributes. */ +extern void (*arm_lang_output_object_attributes_hook)(void); + +/* Just in case configure has failed to define anything. */ +#ifndef TARGET_CPU_DEFAULT +#define TARGET_CPU_DEFAULT TARGET_CPU_generic +#endif + + +#undef CPP_SPEC +#define CPP_SPEC "%(subtarget_cpp_spec) \ +%{msoft-float:%{mhard-float: \ + %e-msoft-float and -mhard_float may not be used together}} \ +%{mbig-endian:%{mlittle-endian: \ + %e-mbig-endian and -mlittle-endian may not be used together}}" + +#ifndef CC1_SPEC +#define CC1_SPEC "" +#endif + +/* This macro defines names of additional specifications to put in the specs + that can be used in various specifications like CC1_SPEC. Its definition + is an initializer with a subgrouping for each command option. + + Each subgrouping contains a string constant, that defines the + specification name, and a string constant that used by the GCC driver + program. + + Do not define this macro if it does not need to do anything. */ +#define EXTRA_SPECS \ + { "subtarget_cpp_spec", SUBTARGET_CPP_SPEC }, \ + SUBTARGET_EXTRA_SPECS + +#ifndef SUBTARGET_EXTRA_SPECS +#define SUBTARGET_EXTRA_SPECS +#endif + +#ifndef SUBTARGET_CPP_SPEC +#define SUBTARGET_CPP_SPEC "" +#endif + +/* Run-time Target Specification. */ +#ifndef TARGET_VERSION +#define TARGET_VERSION fputs (" (ARM/generic)", stderr); +#endif + +#define TARGET_SOFT_FLOAT (arm_float_abi == ARM_FLOAT_ABI_SOFT) +/* Use hardware floating point instructions. */ +#define TARGET_HARD_FLOAT (arm_float_abi != ARM_FLOAT_ABI_SOFT) +/* Use hardware floating point calling convention. */ +#define TARGET_HARD_FLOAT_ABI (arm_float_abi == ARM_FLOAT_ABI_HARD) +#define TARGET_FPA (arm_fpu_desc->model == ARM_FP_MODEL_FPA) +#define TARGET_MAVERICK (arm_fpu_desc->model == ARM_FP_MODEL_MAVERICK) +#define TARGET_VFP (arm_fpu_desc->model == ARM_FP_MODEL_VFP) +#define TARGET_IWMMXT (arm_arch_iwmmxt) +#define TARGET_REALLY_IWMMXT (TARGET_IWMMXT && TARGET_32BIT) +#define TARGET_IWMMXT_ABI (TARGET_32BIT && arm_abi == ARM_ABI_IWMMXT) +#define TARGET_ARM (! TARGET_THUMB) +#define TARGET_EITHER 1 /* (TARGET_ARM | TARGET_THUMB) */ +#define TARGET_BACKTRACE (leaf_function_p () \ + ? TARGET_TPCS_LEAF_FRAME \ + : TARGET_TPCS_FRAME) +#define TARGET_LDRD (arm_arch5e && ARM_DOUBLEWORD_ALIGN) +#define TARGET_AAPCS_BASED \ + (arm_abi != ARM_ABI_APCS && arm_abi != ARM_ABI_ATPCS) + +#define TARGET_HARD_TP (target_thread_pointer == TP_CP15) +#define TARGET_SOFT_TP (target_thread_pointer == TP_SOFT) + +/* Only 16-bit thumb code. */ +#define TARGET_THUMB1 (TARGET_THUMB && !arm_arch_thumb2) +/* Arm or Thumb-2 32-bit code. */ +#define TARGET_32BIT (TARGET_ARM || arm_arch_thumb2) +/* 32-bit Thumb-2 code. */ +#define TARGET_THUMB2 (TARGET_THUMB && arm_arch_thumb2) +/* Thumb-1 only. */ +#define TARGET_THUMB1_ONLY (TARGET_THUMB1 && !arm_arch_notm) +/* FPA emulator without LFM. */ +#define TARGET_FPA_EMU2 (TARGET_FPA && arm_fpu_desc->rev == 2) + +/* The following two macros concern the ability to execute coprocessor + instructions for VFPv3 or NEON. TARGET_VFP3/TARGET_VFPD32 are currently + only ever tested when we know we are generating for VFP hardware; we need + to be more careful with TARGET_NEON as noted below. */ + +/* FPU is has the full VFPv3/NEON register file of 32 D registers. */ +#define TARGET_VFPD32 (TARGET_VFP && arm_fpu_desc->regs == VFP_REG_D32) + +/* FPU supports VFPv3 instructions. */ +#define TARGET_VFP3 (TARGET_VFP && arm_fpu_desc->rev >= 3) + +/* FPU only supports VFP single-precision instructions. */ +#define TARGET_VFP_SINGLE (TARGET_VFP && arm_fpu_desc->regs == VFP_REG_SINGLE) + +/* FPU supports VFP double-precision instructions. */ +#define TARGET_VFP_DOUBLE (TARGET_VFP && arm_fpu_desc->regs != VFP_REG_SINGLE) + +/* FPU supports half-precision floating-point with NEON element load/store. */ +#define TARGET_NEON_FP16 \ + (TARGET_VFP && arm_fpu_desc->neon && arm_fpu_desc->fp16) + +/* FPU supports VFP half-precision floating-point. */ +#define TARGET_FP16 (TARGET_VFP && arm_fpu_desc->fp16) + +/* FPU supports Neon instructions. The setting of this macro gets + revealed via __ARM_NEON__ so we add extra guards upon TARGET_32BIT + and TARGET_HARD_FLOAT to ensure that NEON instructions are + available. */ +#define TARGET_NEON (TARGET_32BIT && TARGET_HARD_FLOAT \ + && TARGET_VFP && arm_fpu_desc->neon) + +/* "DSP" multiply instructions, eg. SMULxy. */ +#define TARGET_DSP_MULTIPLY \ + (TARGET_32BIT && arm_arch5e && (arm_arch_notm || arm_arch7em)) +/* Integer SIMD instructions, and extend-accumulate instructions. */ +#define TARGET_INT_SIMD \ + (TARGET_32BIT && arm_arch6 && (arm_arch_notm || arm_arch7em)) + +/* Should MOVW/MOVT be used in preference to a constant pool. */ +#define TARGET_USE_MOVT (arm_arch_thumb2 && !optimize_size) + +/* We could use unified syntax for arm mode, but for now we just use it + for Thumb-2. */ +#define TARGET_UNIFIED_ASM TARGET_THUMB2 + +/* Nonzero if this chip provides the DMB instruction. */ +#define TARGET_HAVE_DMB (arm_arch7) + +/* Nonzero if this chip implements a memory barrier via CP15. */ +#define TARGET_HAVE_DMB_MCR (arm_arch6 && ! TARGET_HAVE_DMB \ + && ! TARGET_THUMB1) + +/* Nonzero if this chip implements a memory barrier instruction. */ +#define TARGET_HAVE_MEMORY_BARRIER (TARGET_HAVE_DMB || TARGET_HAVE_DMB_MCR) + +/* Nonzero if this chip supports ldrex and strex */ +#define TARGET_HAVE_LDREX ((arm_arch6 && TARGET_ARM) || arm_arch7) + +/* Nonzero if this chip supports ldrex{bhd} and strex{bhd}. */ +#define TARGET_HAVE_LDREXBHD ((arm_arch6k && TARGET_ARM) || arm_arch7) + +/* True iff the full BPABI is being used. If TARGET_BPABI is true, + then TARGET_AAPCS_BASED must be true -- but the converse does not + hold. TARGET_BPABI implies the use of the BPABI runtime library, + etc., in addition to just the AAPCS calling conventions. */ +#ifndef TARGET_BPABI +#define TARGET_BPABI false +#endif + +/* Support for a compile-time default CPU, et cetera. The rules are: + --with-arch is ignored if -march or -mcpu are specified. + --with-cpu is ignored if -march or -mcpu are specified, and is overridden + by --with-arch. + --with-tune is ignored if -mtune or -mcpu are specified (but not affected + by -march). + --with-float is ignored if -mhard-float, -msoft-float or -mfloat-abi are + specified. + --with-fpu is ignored if -mfpu is specified. + --with-abi is ignored is -mabi is specified. */ +#define OPTION_DEFAULT_SPECS \ + {"arch", "%{!march=*:%{!mcpu=*:-march=%(VALUE)}}" }, \ + {"cpu", "%{!march=*:%{!mcpu=*:-mcpu=%(VALUE)}}" }, \ + {"tune", "%{!mcpu=*:%{!mtune=*:-mtune=%(VALUE)}}" }, \ + {"float", \ + "%{!msoft-float:%{!mhard-float:%{!mfloat-abi=*:-mfloat-abi=%(VALUE)}}}" }, \ + {"fpu", "%{!mfpu=*:-mfpu=%(VALUE)}"}, \ + {"abi", "%{!mabi=*:-mabi=%(VALUE)}"}, \ + {"mode", "%{!marm:%{!mthumb:-m%(VALUE)}}"}, + +/* Which floating point model to use. */ +enum arm_fp_model +{ + ARM_FP_MODEL_UNKNOWN, + /* FPA model (Hardware or software). */ + ARM_FP_MODEL_FPA, + /* Cirrus Maverick floating point model. */ + ARM_FP_MODEL_MAVERICK, + /* VFP floating point model. */ + ARM_FP_MODEL_VFP +}; + +enum vfp_reg_type +{ + VFP_NONE = 0, + VFP_REG_D16, + VFP_REG_D32, + VFP_REG_SINGLE +}; + +extern const struct arm_fpu_desc +{ + const char *name; + enum arm_fp_model model; + int rev; + enum vfp_reg_type regs; + int neon; + int fp16; +} *arm_fpu_desc; + +/* Which floating point hardware to schedule for. */ +extern int arm_fpu_attr; + +enum float_abi_type +{ + ARM_FLOAT_ABI_SOFT, + ARM_FLOAT_ABI_SOFTFP, + ARM_FLOAT_ABI_HARD +}; + +extern enum float_abi_type arm_float_abi; + +#ifndef TARGET_DEFAULT_FLOAT_ABI +#define TARGET_DEFAULT_FLOAT_ABI ARM_FLOAT_ABI_SOFT +#endif + +/* Which __fp16 format to use. + The enumeration values correspond to the numbering for the + Tag_ABI_FP_16bit_format attribute. + */ +enum arm_fp16_format_type +{ + ARM_FP16_FORMAT_NONE = 0, + ARM_FP16_FORMAT_IEEE = 1, + ARM_FP16_FORMAT_ALTERNATIVE = 2 +}; + +extern enum arm_fp16_format_type arm_fp16_format; +#define LARGEST_EXPONENT_IS_NORMAL(bits) \ + ((bits) == 16 && arm_fp16_format == ARM_FP16_FORMAT_ALTERNATIVE) + +/* Which ABI to use. */ +enum arm_abi_type +{ + ARM_ABI_APCS, + ARM_ABI_ATPCS, + ARM_ABI_AAPCS, + ARM_ABI_IWMMXT, + ARM_ABI_AAPCS_LINUX +}; + +extern enum arm_abi_type arm_abi; + +#ifndef ARM_DEFAULT_ABI +#define ARM_DEFAULT_ABI ARM_ABI_APCS +#endif + +/* Which thread pointer access sequence to use. */ +enum arm_tp_type { + TP_AUTO, + TP_SOFT, + TP_CP15 +}; + +extern enum arm_tp_type target_thread_pointer; + +/* Nonzero if this chip supports the ARM Architecture 3M extensions. */ +extern int arm_arch3m; + +/* Nonzero if this chip supports the ARM Architecture 4 extensions. */ +extern int arm_arch4; + +/* Nonzero if this chip supports the ARM Architecture 4T extensions. */ +extern int arm_arch4t; + +/* Nonzero if this chip supports the ARM Architecture 5 extensions. */ +extern int arm_arch5; + +/* Nonzero if this chip supports the ARM Architecture 5E extensions. */ +extern int arm_arch5e; + +/* Nonzero if this chip supports the ARM Architecture 6 extensions. */ +extern int arm_arch6; + +/* Nonzero if this chip supports the ARM Architecture 6k extensions. */ +extern int arm_arch6k; + +/* Nonzero if this chip supports the ARM Architecture 7 extensions. */ +extern int arm_arch7; + +/* Nonzero if instructions not present in the 'M' profile can be used. */ +extern int arm_arch_notm; + +/* Nonzero if instructions present in ARMv7E-M can be used. */ +extern int arm_arch7em; + +/* Nonzero if this chip can benefit from load scheduling. */ +extern int arm_ld_sched; + +/* Nonzero if generating Thumb code, either Thumb-1 or Thumb-2. */ +extern int thumb_code; + +/* Nonzero if generating Thumb-1 code. */ +extern int thumb1_code; + +/* Nonzero if this chip is a StrongARM. */ +extern int arm_tune_strongarm; + +/* Nonzero if this chip is a Cirrus variant. */ +extern int arm_arch_cirrus; + +/* Nonzero if this chip supports Intel XScale with Wireless MMX technology. */ +extern int arm_arch_iwmmxt; + +/* Nonzero if this chip is an XScale. */ +extern int arm_arch_xscale; + +/* Nonzero if tuning for XScale. */ +extern int arm_tune_xscale; + +/* Nonzero if tuning for stores via the write buffer. */ +extern int arm_tune_wbuf; + +/* Nonzero if tuning for Cortex-A9. */ +extern int arm_tune_cortex_a9; + +/* Nonzero if we should define __THUMB_INTERWORK__ in the + preprocessor. + XXX This is a bit of a hack, it's intended to help work around + problems in GLD which doesn't understand that armv5t code is + interworking clean. */ +extern int arm_cpp_interwork; + +/* Nonzero if chip supports Thumb 2. */ +extern int arm_arch_thumb2; + +/* Nonzero if chip supports integer division instruction. */ +extern int arm_arch_hwdiv; + +#ifndef TARGET_DEFAULT +#define TARGET_DEFAULT (MASK_APCS_FRAME) +#endif + +/* Nonzero if PIC code requires explicit qualifiers to generate + PLT and GOT relocs rather than the assembler doing so implicitly. + Subtargets can override these if required. */ +#ifndef NEED_GOT_RELOC +#define NEED_GOT_RELOC 0 +#endif +#ifndef NEED_PLT_RELOC +#define NEED_PLT_RELOC 0 +#endif + +/* Nonzero if we need to refer to the GOT with a PC-relative + offset. In other words, generate + + .word _GLOBAL_OFFSET_TABLE_ - [. - (.Lxx + 8)] + + rather than + + .word _GLOBAL_OFFSET_TABLE_ - (.Lxx + 8) + + The default is true, which matches NetBSD. Subtargets can + override this if required. */ +#ifndef GOT_PCREL +#define GOT_PCREL 1 +#endif + +/* Target machine storage Layout. */ + + +/* Define this macro if it is advisable to hold scalars in registers + in a wider mode than that declared by the program. In such cases, + the value is constrained to be within the bounds of the declared + type, but kept valid in the wider mode. The signedness of the + extension may differ from that of the type. */ + +/* It is far faster to zero extend chars than to sign extend them */ + +#define PROMOTE_MODE(MODE, UNSIGNEDP, TYPE) \ + if (GET_MODE_CLASS (MODE) == MODE_INT \ + && GET_MODE_SIZE (MODE) < 4) \ + { \ + if (MODE == QImode) \ + UNSIGNEDP = 1; \ + else if (MODE == HImode) \ + UNSIGNEDP = 1; \ + (MODE) = SImode; \ + } + +/* Define this if most significant bit is lowest numbered + in instructions that operate on numbered bit-fields. */ +#define BITS_BIG_ENDIAN 0 + +/* Define this if most significant byte of a word is the lowest numbered. + Most ARM processors are run in little endian mode, so that is the default. + If you want to have it run-time selectable, change the definition in a + cover file to be TARGET_BIG_ENDIAN. */ +#define BYTES_BIG_ENDIAN (TARGET_BIG_END != 0) + +/* Define this if most significant word of a multiword number is the lowest + numbered. + This is always false, even when in big-endian mode. */ +#define WORDS_BIG_ENDIAN (BYTES_BIG_ENDIAN && ! TARGET_LITTLE_WORDS) + +/* Define this if most significant word of doubles is the lowest numbered. + The rules are different based on whether or not we use FPA-format, + VFP-format or some other floating point co-processor's format doubles. */ +#define FLOAT_WORDS_BIG_ENDIAN (arm_float_words_big_endian ()) + +#define UNITS_PER_WORD 4 + +/* True if natural alignment is used for doubleword types. */ +#define ARM_DOUBLEWORD_ALIGN TARGET_AAPCS_BASED + +#define DOUBLEWORD_ALIGNMENT 64 + +#define PARM_BOUNDARY 32 + +#define STACK_BOUNDARY (ARM_DOUBLEWORD_ALIGN ? DOUBLEWORD_ALIGNMENT : 32) + +#define PREFERRED_STACK_BOUNDARY \ + (arm_abi == ARM_ABI_ATPCS ? 64 : STACK_BOUNDARY) + +#define FUNCTION_BOUNDARY ((TARGET_THUMB && optimize_size) ? 16 : 32) + +/* The lowest bit is used to indicate Thumb-mode functions, so the + vbit must go into the delta field of pointers to member + functions. */ +#define TARGET_PTRMEMFUNC_VBIT_LOCATION ptrmemfunc_vbit_in_delta + +#define EMPTY_FIELD_BOUNDARY 32 + +#define BIGGEST_ALIGNMENT (ARM_DOUBLEWORD_ALIGN ? DOUBLEWORD_ALIGNMENT : 32) + +/* XXX Blah -- this macro is used directly by libobjc. Since it + supports no vector modes, cut out the complexity and fall back + on BIGGEST_FIELD_ALIGNMENT. */ +#ifdef IN_TARGET_LIBS +#define BIGGEST_FIELD_ALIGNMENT 64 +#endif + +/* Make strings word-aligned so strcpy from constants will be faster. */ +#define CONSTANT_ALIGNMENT_FACTOR (TARGET_THUMB || ! arm_tune_xscale ? 1 : 2) + +#define CONSTANT_ALIGNMENT(EXP, ALIGN) \ + ((TREE_CODE (EXP) == STRING_CST \ + && !optimize_size \ + && (ALIGN) < BITS_PER_WORD * CONSTANT_ALIGNMENT_FACTOR) \ + ? BITS_PER_WORD * CONSTANT_ALIGNMENT_FACTOR : (ALIGN)) + +/* Align definitions of arrays, unions and structures so that + initializations and copies can be made more efficient. This is not + ABI-changing, so it only affects places where we can see the + definition. Increasing the alignment tends to introduce padding, + so don't do this when optimizing for size/conserving stack space. */ +#define ARM_EXPAND_ALIGNMENT(COND, EXP, ALIGN) \ + (((COND) && ((ALIGN) < BITS_PER_WORD) \ + && (TREE_CODE (EXP) == ARRAY_TYPE \ + || TREE_CODE (EXP) == UNION_TYPE \ + || TREE_CODE (EXP) == RECORD_TYPE)) ? BITS_PER_WORD : (ALIGN)) + +/* Align global data. */ +#define DATA_ALIGNMENT(EXP, ALIGN) \ + ARM_EXPAND_ALIGNMENT(!optimize_size, EXP, ALIGN) + +/* Similarly, make sure that objects on the stack are sensibly aligned. */ +#define LOCAL_ALIGNMENT(EXP, ALIGN) \ + ARM_EXPAND_ALIGNMENT(!flag_conserve_stack, EXP, ALIGN) + +/* Setting STRUCTURE_SIZE_BOUNDARY to 32 produces more efficient code, but the + value set in previous versions of this toolchain was 8, which produces more + compact structures. The command line option -mstructure_size_boundary= + can be used to change this value. For compatibility with the ARM SDK + however the value should be left at 32. ARM SDT Reference Manual (ARM DUI + 0020D) page 2-20 says "Structures are aligned on word boundaries". + The AAPCS specifies a value of 8. */ +#define STRUCTURE_SIZE_BOUNDARY arm_structure_size_boundary +extern int arm_structure_size_boundary; + +/* This is the value used to initialize arm_structure_size_boundary. If a + particular arm target wants to change the default value it should change + the definition of this macro, not STRUCTURE_SIZE_BOUNDARY. See netbsd.h + for an example of this. */ +#ifndef DEFAULT_STRUCTURE_SIZE_BOUNDARY +#define DEFAULT_STRUCTURE_SIZE_BOUNDARY 32 +#endif + +/* Nonzero if move instructions will actually fail to work + when given unaligned data. */ +#define STRICT_ALIGNMENT 1 + +/* wchar_t is unsigned under the AAPCS. */ +#ifndef WCHAR_TYPE +#define WCHAR_TYPE (TARGET_AAPCS_BASED ? "unsigned int" : "int") + +#define WCHAR_TYPE_SIZE BITS_PER_WORD +#endif + +#ifndef SIZE_TYPE +#define SIZE_TYPE (TARGET_AAPCS_BASED ? "unsigned int" : "long unsigned int") +#endif + +#ifndef PTRDIFF_TYPE +#define PTRDIFF_TYPE (TARGET_AAPCS_BASED ? "int" : "long int") +#endif + +/* AAPCS requires that structure alignment is affected by bitfields. */ +#ifndef PCC_BITFIELD_TYPE_MATTERS +#define PCC_BITFIELD_TYPE_MATTERS TARGET_AAPCS_BASED +#endif + + +/* Standard register usage. */ + +/* Register allocation in ARM Procedure Call Standard (as used on RISCiX): + (S - saved over call). + + r0 * argument word/integer result + r1-r3 argument word + + r4-r8 S register variable + r9 S (rfp) register variable (real frame pointer) + + r10 F S (sl) stack limit (used by -mapcs-stack-check) + r11 F S (fp) argument pointer + r12 (ip) temp workspace + r13 F S (sp) lower end of current stack frame + r14 (lr) link address/workspace + r15 F (pc) program counter + + f0 floating point result + f1-f3 floating point scratch + + f4-f7 S floating point variable + + cc This is NOT a real register, but is used internally + to represent things that use or set the condition + codes. + sfp This isn't either. It is used during rtl generation + since the offset between the frame pointer and the + auto's isn't known until after register allocation. + afp Nor this, we only need this because of non-local + goto. Without it fp appears to be used and the + elimination code won't get rid of sfp. It tracks + fp exactly at all times. + + *: See TARGET_CONDITIONAL_REGISTER_USAGE */ + +/* + mvf0 Cirrus floating point result + mvf1-mvf3 Cirrus floating point scratch + mvf4-mvf15 S Cirrus floating point variable. */ + +/* s0-s15 VFP scratch (aka d0-d7). + s16-s31 S VFP variable (aka d8-d15). + vfpcc Not a real register. Represents the VFP condition + code flags. */ + +/* The stack backtrace structure is as follows: + fp points to here: | save code pointer | [fp] + | return link value | [fp, #-4] + | return sp value | [fp, #-8] + | return fp value | [fp, #-12] + [| saved r10 value |] + [| saved r9 value |] + [| saved r8 value |] + [| saved r7 value |] + [| saved r6 value |] + [| saved r5 value |] + [| saved r4 value |] + [| saved r3 value |] + [| saved r2 value |] + [| saved r1 value |] + [| saved r0 value |] + [| saved f7 value |] three words + [| saved f6 value |] three words + [| saved f5 value |] three words + [| saved f4 value |] three words + r0-r3 are not normally saved in a C function. */ + +/* 1 for registers that have pervasive standard uses + and are not available for the register allocator. */ +#define FIXED_REGISTERS \ +{ \ + 0,0,0,0,0,0,0,0, \ + 0,0,0,0,0,1,0,1, \ + 0,0,0,0,0,0,0,0, \ + 1,1,1, \ + 1,1,1,1,1,1,1,1, \ + 1,1,1,1,1,1,1,1, \ + 1,1,1,1,1,1,1,1, \ + 1,1,1,1,1,1,1,1, \ + 1,1,1,1, \ + 1,1,1,1,1,1,1,1, \ + 1,1,1,1,1,1,1,1, \ + 1,1,1,1,1,1,1,1, \ + 1,1,1,1,1,1,1,1, \ + 1,1,1,1,1,1,1,1, \ + 1,1,1,1,1,1,1,1, \ + 1,1,1,1,1,1,1,1, \ + 1,1,1,1,1,1,1,1, \ + 1 \ +} + +/* 1 for registers not available across function calls. + These must include the FIXED_REGISTERS and also any + registers that can be used without being saved. + The latter must include the registers where values are returned + and the register where structure-value addresses are passed. + Aside from that, you can include as many other registers as you like. + The CC is not preserved over function calls on the ARM 6, so it is + easier to assume this for all. SFP is preserved, since FP is. */ +#define CALL_USED_REGISTERS \ +{ \ + 1,1,1,1,0,0,0,0, \ + 0,0,0,0,1,1,1,1, \ + 1,1,1,1,0,0,0,0, \ + 1,1,1, \ + 1,1,1,1,1,1,1,1, \ + 1,1,1,1,1,1,1,1, \ + 1,1,1,1,1,1,1,1, \ + 1,1,1,1,1,1,1,1, \ + 1,1,1,1, \ + 1,1,1,1,1,1,1,1, \ + 1,1,1,1,1,1,1,1, \ + 1,1,1,1,1,1,1,1, \ + 1,1,1,1,1,1,1,1, \ + 1,1,1,1,1,1,1,1, \ + 1,1,1,1,1,1,1,1, \ + 1,1,1,1,1,1,1,1, \ + 1,1,1,1,1,1,1,1, \ + 1 \ +} + +#ifndef SUBTARGET_CONDITIONAL_REGISTER_USAGE +#define SUBTARGET_CONDITIONAL_REGISTER_USAGE +#endif + +/* These are a couple of extensions to the formats accepted + by asm_fprintf: + %@ prints out ASM_COMMENT_START + %r prints out REGISTER_PREFIX reg_names[arg] */ +#define ASM_FPRINTF_EXTENSIONS(FILE, ARGS, P) \ + case '@': \ + fputs (ASM_COMMENT_START, FILE); \ + break; \ + \ + case 'r': \ + fputs (REGISTER_PREFIX, FILE); \ + fputs (reg_names [va_arg (ARGS, int)], FILE); \ + break; + +/* Round X up to the nearest word. */ +#define ROUND_UP_WORD(X) (((X) + 3) & ~3) + +/* Convert fron bytes to ints. */ +#define ARM_NUM_INTS(X) (((X) + UNITS_PER_WORD - 1) / UNITS_PER_WORD) + +/* The number of (integer) registers required to hold a quantity of type MODE. + Also used for VFP registers. */ +#define ARM_NUM_REGS(MODE) \ + ARM_NUM_INTS (GET_MODE_SIZE (MODE)) + +/* The number of (integer) registers required to hold a quantity of TYPE MODE. */ +#define ARM_NUM_REGS2(MODE, TYPE) \ + ARM_NUM_INTS ((MODE) == BLKmode ? \ + int_size_in_bytes (TYPE) : GET_MODE_SIZE (MODE)) + +/* The number of (integer) argument register available. */ +#define NUM_ARG_REGS 4 + +/* And similarly for the VFP. */ +#define NUM_VFP_ARG_REGS 16 + +/* Return the register number of the N'th (integer) argument. */ +#define ARG_REGISTER(N) (N - 1) + +/* Specify the registers used for certain standard purposes. + The values of these macros are register numbers. */ + +/* The number of the last argument register. */ +#define LAST_ARG_REGNUM ARG_REGISTER (NUM_ARG_REGS) + +/* The numbers of the Thumb register ranges. */ +#define FIRST_LO_REGNUM 0 +#define LAST_LO_REGNUM 7 +#define FIRST_HI_REGNUM 8 +#define LAST_HI_REGNUM 11 + +/* Overridden by config/arm/bpabi.h. */ +#ifndef ARM_UNWIND_INFO +#define ARM_UNWIND_INFO 0 +#endif + +/* Use r0 and r1 to pass exception handling information. */ +#define EH_RETURN_DATA_REGNO(N) (((N) < 2) ? N : INVALID_REGNUM) + +/* The register that holds the return address in exception handlers. */ +#define ARM_EH_STACKADJ_REGNUM 2 +#define EH_RETURN_STACKADJ_RTX gen_rtx_REG (SImode, ARM_EH_STACKADJ_REGNUM) + +/* The native (Norcroft) Pascal compiler for the ARM passes the static chain + as an invisible last argument (possible since varargs don't exist in + Pascal), so the following is not true. */ +#define STATIC_CHAIN_REGNUM 12 + +/* Define this to be where the real frame pointer is if it is not possible to + work out the offset between the frame pointer and the automatic variables + until after register allocation has taken place. FRAME_POINTER_REGNUM + should point to a special register that we will make sure is eliminated. + + For the Thumb we have another problem. The TPCS defines the frame pointer + as r11, and GCC believes that it is always possible to use the frame pointer + as base register for addressing purposes. (See comments in + find_reloads_address()). But - the Thumb does not allow high registers, + including r11, to be used as base address registers. Hence our problem. + + The solution used here, and in the old thumb port is to use r7 instead of + r11 as the hard frame pointer and to have special code to generate + backtrace structures on the stack (if required to do so via a command line + option) using r11. This is the only 'user visible' use of r11 as a frame + pointer. */ +#define ARM_HARD_FRAME_POINTER_REGNUM 11 +#define THUMB_HARD_FRAME_POINTER_REGNUM 7 + +#define HARD_FRAME_POINTER_REGNUM \ + (TARGET_ARM \ + ? ARM_HARD_FRAME_POINTER_REGNUM \ + : THUMB_HARD_FRAME_POINTER_REGNUM) + +#define HARD_FRAME_POINTER_IS_FRAME_POINTER 0 +#define HARD_FRAME_POINTER_IS_ARG_POINTER 0 + +#define FP_REGNUM HARD_FRAME_POINTER_REGNUM + +/* Register to use for pushing function arguments. */ +#define STACK_POINTER_REGNUM SP_REGNUM + +/* ARM floating pointer registers. */ +#define FIRST_FPA_REGNUM 16 +#define LAST_FPA_REGNUM 23 +#define IS_FPA_REGNUM(REGNUM) \ + (((REGNUM) >= FIRST_FPA_REGNUM) && ((REGNUM) <= LAST_FPA_REGNUM)) + +#define FIRST_IWMMXT_GR_REGNUM 43 +#define LAST_IWMMXT_GR_REGNUM 46 +#define FIRST_IWMMXT_REGNUM 47 +#define LAST_IWMMXT_REGNUM 62 +#define IS_IWMMXT_REGNUM(REGNUM) \ + (((REGNUM) >= FIRST_IWMMXT_REGNUM) && ((REGNUM) <= LAST_IWMMXT_REGNUM)) +#define IS_IWMMXT_GR_REGNUM(REGNUM) \ + (((REGNUM) >= FIRST_IWMMXT_GR_REGNUM) && ((REGNUM) <= LAST_IWMMXT_GR_REGNUM)) + +/* Base register for access to local variables of the function. */ +#define FRAME_POINTER_REGNUM 25 + +/* Base register for access to arguments of the function. */ +#define ARG_POINTER_REGNUM 26 + +#define FIRST_CIRRUS_FP_REGNUM 27 +#define LAST_CIRRUS_FP_REGNUM 42 +#define IS_CIRRUS_REGNUM(REGNUM) \ + (((REGNUM) >= FIRST_CIRRUS_FP_REGNUM) && ((REGNUM) <= LAST_CIRRUS_FP_REGNUM)) + +#define FIRST_VFP_REGNUM 63 +#define D7_VFP_REGNUM 78 /* Registers 77 and 78 == VFP reg D7. */ +#define LAST_VFP_REGNUM \ + (TARGET_VFPD32 ? LAST_HI_VFP_REGNUM : LAST_LO_VFP_REGNUM) + +#define IS_VFP_REGNUM(REGNUM) \ + (((REGNUM) >= FIRST_VFP_REGNUM) && ((REGNUM) <= LAST_VFP_REGNUM)) + +/* VFP registers are split into two types: those defined by VFP versions < 3 + have D registers overlaid on consecutive pairs of S registers. VFP version 3 + defines 16 new D registers (d16-d31) which, for simplicity and correctness + in various parts of the backend, we implement as "fake" single-precision + registers (which would be S32-S63, but cannot be used in that way). The + following macros define these ranges of registers. */ +#define LAST_LO_VFP_REGNUM 94 +#define FIRST_HI_VFP_REGNUM 95 +#define LAST_HI_VFP_REGNUM 126 + +#define VFP_REGNO_OK_FOR_SINGLE(REGNUM) \ + ((REGNUM) <= LAST_LO_VFP_REGNUM) + +/* DFmode values are only valid in even register pairs. */ +#define VFP_REGNO_OK_FOR_DOUBLE(REGNUM) \ + ((((REGNUM) - FIRST_VFP_REGNUM) & 1) == 0) + +/* Neon Quad values must start at a multiple of four registers. */ +#define NEON_REGNO_OK_FOR_QUAD(REGNUM) \ + ((((REGNUM) - FIRST_VFP_REGNUM) & 3) == 0) + +/* Neon structures of vectors must be in even register pairs and there + must be enough registers available. Because of various patterns + requiring quad registers, we require them to start at a multiple of + four. */ +#define NEON_REGNO_OK_FOR_NREGS(REGNUM, N) \ + ((((REGNUM) - FIRST_VFP_REGNUM) & 3) == 0 \ + && (LAST_VFP_REGNUM - (REGNUM) >= 2 * (N) - 1)) + +/* The number of hard registers is 16 ARM + 8 FPA + 1 CC + 1 SFP + 1 AFP. */ +/* + 16 Cirrus registers take us up to 43. */ +/* Intel Wireless MMX Technology registers add 16 + 4 more. */ +/* VFP (VFP3) adds 32 (64) + 1 more. */ +#define FIRST_PSEUDO_REGISTER 128 + +#define DBX_REGISTER_NUMBER(REGNO) arm_dbx_register_number (REGNO) + +/* Value should be nonzero if functions must have frame pointers. + Zero means the frame pointer need not be set up (and parms may be accessed + via the stack pointer) in functions that seem suitable. + If we have to have a frame pointer we might as well make use of it. + APCS says that the frame pointer does not need to be pushed in leaf + functions, or simple tail call functions. */ + +#ifndef SUBTARGET_FRAME_POINTER_REQUIRED +#define SUBTARGET_FRAME_POINTER_REQUIRED 0 +#endif + +/* Return number of consecutive hard regs needed starting at reg REGNO + to hold something of mode MODE. + This is ordinarily the length in words of a value of mode MODE + but can be less for certain modes in special long registers. + + On the ARM regs are UNITS_PER_WORD bits wide; FPA regs can hold any FP + mode. */ +#define HARD_REGNO_NREGS(REGNO, MODE) \ + ((TARGET_32BIT \ + && REGNO >= FIRST_FPA_REGNUM \ + && REGNO != FRAME_POINTER_REGNUM \ + && REGNO != ARG_POINTER_REGNUM) \ + && !IS_VFP_REGNUM (REGNO) \ + ? 1 : ARM_NUM_REGS (MODE)) + +/* Return true if REGNO is suitable for holding a quantity of type MODE. */ +#define HARD_REGNO_MODE_OK(REGNO, MODE) \ + arm_hard_regno_mode_ok ((REGNO), (MODE)) + +/* Value is 1 if it is a good idea to tie two pseudo registers + when one has mode MODE1 and one has mode MODE2. + If HARD_REGNO_MODE_OK could produce different values for MODE1 and MODE2, + for any hard reg, then this must be 0 for correct output. */ +#define MODES_TIEABLE_P(MODE1, MODE2) \ + (GET_MODE_CLASS (MODE1) == GET_MODE_CLASS (MODE2)) + +#define VALID_IWMMXT_REG_MODE(MODE) \ + (arm_vector_mode_supported_p (MODE) || (MODE) == DImode) + +/* Modes valid for Neon D registers. */ +#define VALID_NEON_DREG_MODE(MODE) \ + ((MODE) == V2SImode || (MODE) == V4HImode || (MODE) == V8QImode \ + || (MODE) == V2SFmode || (MODE) == DImode) + +/* Modes valid for Neon Q registers. */ +#define VALID_NEON_QREG_MODE(MODE) \ + ((MODE) == V4SImode || (MODE) == V8HImode || (MODE) == V16QImode \ + || (MODE) == V4SFmode || (MODE) == V2DImode) + +/* Structure modes valid for Neon registers. */ +#define VALID_NEON_STRUCT_MODE(MODE) \ + ((MODE) == TImode || (MODE) == EImode || (MODE) == OImode \ + || (MODE) == CImode || (MODE) == XImode) + +/* The register numbers in sequence, for passing to arm_gen_load_multiple. */ +extern int arm_regs_in_sequence[]; + +/* The order in which register should be allocated. It is good to use ip + since no saving is required (though calls clobber it) and it never contains + function parameters. It is quite good to use lr since other calls may + clobber it anyway. Allocate r0 through r3 in reverse order since r3 is + least likely to contain a function parameter; in addition results are + returned in r0. + For VFP/VFPv3, allocate D16-D31 first, then caller-saved registers (D0-D7), + then D8-D15. The reason for doing this is to attempt to reduce register + pressure when both single- and double-precision registers are used in a + function. */ + +#define REG_ALLOC_ORDER \ +{ \ + 3, 2, 1, 0, 12, 14, 4, 5, \ + 6, 7, 8, 10, 9, 11, 13, 15, \ + 16, 17, 18, 19, 20, 21, 22, 23, \ + 27, 28, 29, 30, 31, 32, 33, 34, \ + 35, 36, 37, 38, 39, 40, 41, 42, \ + 43, 44, 45, 46, 47, 48, 49, 50, \ + 51, 52, 53, 54, 55, 56, 57, 58, \ + 59, 60, 61, 62, \ + 24, 25, 26, \ + 95, 96, 97, 98, 99, 100, 101, 102, \ + 103, 104, 105, 106, 107, 108, 109, 110, \ + 111, 112, 113, 114, 115, 116, 117, 118, \ + 119, 120, 121, 122, 123, 124, 125, 126, \ + 78, 77, 76, 75, 74, 73, 72, 71, \ + 70, 69, 68, 67, 66, 65, 64, 63, \ + 79, 80, 81, 82, 83, 84, 85, 86, \ + 87, 88, 89, 90, 91, 92, 93, 94, \ + 127 \ +} + +/* Use different register alloc ordering for Thumb. */ +#define ADJUST_REG_ALLOC_ORDER arm_order_regs_for_local_alloc () + +/* Tell IRA to use the order we define rather than messing it up with its + own cost calculations. */ +#define HONOR_REG_ALLOC_ORDER + +/* Interrupt functions can only use registers that have already been + saved by the prologue, even if they would normally be + call-clobbered. */ +#define HARD_REGNO_RENAME_OK(SRC, DST) \ + (! IS_INTERRUPT (cfun->machine->func_type) || \ + df_regs_ever_live_p (DST)) + +/* Register and constant classes. */ + +/* Register classes: used to be simple, just all ARM regs or all FPA regs + Now that the Thumb is involved it has become more complicated. */ +enum reg_class +{ + NO_REGS, + FPA_REGS, + CIRRUS_REGS, + VFP_D0_D7_REGS, + VFP_LO_REGS, + VFP_HI_REGS, + VFP_REGS, + IWMMXT_GR_REGS, + IWMMXT_REGS, + LO_REGS, + STACK_REG, + BASE_REGS, + HI_REGS, + CC_REG, + VFPCC_REG, + GENERAL_REGS, + CORE_REGS, + ALL_REGS, + LIM_REG_CLASSES +}; + +#define N_REG_CLASSES (int) LIM_REG_CLASSES + +/* Give names of register classes as strings for dump file. */ +#define REG_CLASS_NAMES \ +{ \ + "NO_REGS", \ + "FPA_REGS", \ + "CIRRUS_REGS", \ + "VFP_D0_D7_REGS", \ + "VFP_LO_REGS", \ + "VFP_HI_REGS", \ + "VFP_REGS", \ + "IWMMXT_GR_REGS", \ + "IWMMXT_REGS", \ + "LO_REGS", \ + "STACK_REG", \ + "BASE_REGS", \ + "HI_REGS", \ + "CC_REG", \ + "VFPCC_REG", \ + "GENERAL_REGS", \ + "CORE_REGS", \ + "ALL_REGS", \ +} + +/* Define which registers fit in which classes. + This is an initializer for a vector of HARD_REG_SET + of length N_REG_CLASSES. */ +#define REG_CLASS_CONTENTS \ +{ \ + { 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, /* NO_REGS */ \ + { 0x00FF0000, 0x00000000, 0x00000000, 0x00000000 }, /* FPA_REGS */ \ + { 0xF8000000, 0x000007FF, 0x00000000, 0x00000000 }, /* CIRRUS_REGS */ \ + { 0x00000000, 0x80000000, 0x00007FFF, 0x00000000 }, /* VFP_D0_D7_REGS */ \ + { 0x00000000, 0x80000000, 0x7FFFFFFF, 0x00000000 }, /* VFP_LO_REGS */ \ + { 0x00000000, 0x00000000, 0x80000000, 0x7FFFFFFF }, /* VFP_HI_REGS */ \ + { 0x00000000, 0x80000000, 0xFFFFFFFF, 0x7FFFFFFF }, /* VFP_REGS */ \ + { 0x00000000, 0x00007800, 0x00000000, 0x00000000 }, /* IWMMXT_GR_REGS */ \ + { 0x00000000, 0x7FFF8000, 0x00000000, 0x00000000 }, /* IWMMXT_REGS */ \ + { 0x000000FF, 0x00000000, 0x00000000, 0x00000000 }, /* LO_REGS */ \ + { 0x00002000, 0x00000000, 0x00000000, 0x00000000 }, /* STACK_REG */ \ + { 0x000020FF, 0x00000000, 0x00000000, 0x00000000 }, /* BASE_REGS */ \ + { 0x0000DF00, 0x00000000, 0x00000000, 0x00000000 }, /* HI_REGS */ \ + { 0x01000000, 0x00000000, 0x00000000, 0x00000000 }, /* CC_REG */ \ + { 0x00000000, 0x00000000, 0x00000000, 0x80000000 }, /* VFPCC_REG */ \ + { 0x0000DFFF, 0x00000000, 0x00000000, 0x00000000 }, /* GENERAL_REGS */ \ + { 0x0000FFFF, 0x00000000, 0x00000000, 0x00000000 }, /* CORE_REGS */ \ + { 0xFAFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF } /* ALL_REGS */ \ +} + +/* Any of the VFP register classes. */ +#define IS_VFP_CLASS(X) \ + ((X) == VFP_D0_D7_REGS || (X) == VFP_LO_REGS \ + || (X) == VFP_HI_REGS || (X) == VFP_REGS) + +/* The same information, inverted: + Return the class number of the smallest class containing + reg number REGNO. This could be a conditional expression + or could index an array. */ +#define REGNO_REG_CLASS(REGNO) arm_regno_class (REGNO) + +/* The following macro defines cover classes for Integrated Register + Allocator. Cover classes is a set of non-intersected register + classes covering all hard registers used for register allocation + purpose. Any move between two registers of a cover class should be + cheaper than load or store of the registers. The macro value is + array of register classes with LIM_REG_CLASSES used as the end + marker. */ + +#define IRA_COVER_CLASSES \ +{ \ + GENERAL_REGS, FPA_REGS, CIRRUS_REGS, VFP_REGS, IWMMXT_GR_REGS, IWMMXT_REGS,\ + LIM_REG_CLASSES \ +} + +/* FPA registers can't do subreg as all values are reformatted to internal + precision. VFP registers may only be accessed in the mode they + were set. */ +#define CANNOT_CHANGE_MODE_CLASS(FROM, TO, CLASS) \ + (GET_MODE_SIZE (FROM) != GET_MODE_SIZE (TO) \ + ? reg_classes_intersect_p (FPA_REGS, (CLASS)) \ + || reg_classes_intersect_p (VFP_REGS, (CLASS)) \ + : 0) + +/* The class value for index registers, and the one for base regs. */ +#define INDEX_REG_CLASS (TARGET_THUMB1 ? LO_REGS : GENERAL_REGS) +#define BASE_REG_CLASS (TARGET_THUMB1 ? LO_REGS : CORE_REGS) + +/* For the Thumb the high registers cannot be used as base registers + when addressing quantities in QI or HI mode; if we don't know the + mode, then we must be conservative. */ +#define MODE_BASE_REG_CLASS(MODE) \ + (TARGET_32BIT ? CORE_REGS : \ + (((MODE) == SImode) ? BASE_REGS : LO_REGS)) + +/* For Thumb we can not support SP+reg addressing, so we return LO_REGS + instead of BASE_REGS. */ +#define MODE_BASE_REG_REG_CLASS(MODE) BASE_REG_CLASS + +/* When this hook returns true for MODE, the compiler allows + registers explicitly used in the rtl to be used as spill registers + but prevents the compiler from extending the lifetime of these + registers. */ +#define TARGET_SMALL_REGISTER_CLASSES_FOR_MODE_P \ + arm_small_register_classes_for_mode_p + +/* Given an rtx X being reloaded into a reg required to be + in class CLASS, return the class of reg to actually use. + In general this is just CLASS, but for the Thumb core registers and + immediate constants we prefer a LO_REGS class or a subset. */ +#define PREFERRED_RELOAD_CLASS(X, CLASS) \ + (TARGET_32BIT ? (CLASS) : \ + ((CLASS) == GENERAL_REGS || (CLASS) == HI_REGS \ + || (CLASS) == NO_REGS || (CLASS) == STACK_REG \ + ? LO_REGS : (CLASS))) + +/* Must leave BASE_REGS reloads alone */ +#define THUMB_SECONDARY_INPUT_RELOAD_CLASS(CLASS, MODE, X) \ + ((CLASS) != LO_REGS && (CLASS) != BASE_REGS \ + ? ((true_regnum (X) == -1 ? LO_REGS \ + : (true_regnum (X) + HARD_REGNO_NREGS (0, MODE) > 8) ? LO_REGS \ + : NO_REGS)) \ + : NO_REGS) + +#define THUMB_SECONDARY_OUTPUT_RELOAD_CLASS(CLASS, MODE, X) \ + ((CLASS) != LO_REGS && (CLASS) != BASE_REGS \ + ? ((true_regnum (X) == -1 ? LO_REGS \ + : (true_regnum (X) + HARD_REGNO_NREGS (0, MODE) > 8) ? LO_REGS \ + : NO_REGS)) \ + : NO_REGS) + +/* Return the register class of a scratch register needed to copy IN into + or out of a register in CLASS in MODE. If it can be done directly, + NO_REGS is returned. */ +#define SECONDARY_OUTPUT_RELOAD_CLASS(CLASS, MODE, X) \ + /* Restrict which direct reloads are allowed for VFP/iWMMXt regs. */ \ + ((TARGET_VFP && TARGET_HARD_FLOAT \ + && IS_VFP_CLASS (CLASS)) \ + ? coproc_secondary_reload_class (MODE, X, FALSE) \ + : (TARGET_IWMMXT && (CLASS) == IWMMXT_REGS) \ + ? coproc_secondary_reload_class (MODE, X, TRUE) \ + : TARGET_32BIT \ + ? (((MODE) == HImode && ! arm_arch4 && true_regnum (X) == -1) \ + ? GENERAL_REGS : NO_REGS) \ + : THUMB_SECONDARY_OUTPUT_RELOAD_CLASS (CLASS, MODE, X)) + +/* If we need to load shorts byte-at-a-time, then we need a scratch. */ +#define SECONDARY_INPUT_RELOAD_CLASS(CLASS, MODE, X) \ + /* Restrict which direct reloads are allowed for VFP/iWMMXt regs. */ \ + ((TARGET_VFP && TARGET_HARD_FLOAT \ + && IS_VFP_CLASS (CLASS)) \ + ? coproc_secondary_reload_class (MODE, X, FALSE) : \ + (TARGET_IWMMXT && (CLASS) == IWMMXT_REGS) ? \ + coproc_secondary_reload_class (MODE, X, TRUE) : \ + /* Cannot load constants into Cirrus registers. */ \ + (TARGET_MAVERICK && TARGET_HARD_FLOAT \ + && (CLASS) == CIRRUS_REGS \ + && (CONSTANT_P (X) || GET_CODE (X) == SYMBOL_REF)) \ + ? GENERAL_REGS : \ + (TARGET_32BIT ? \ + (((CLASS) == IWMMXT_REGS || (CLASS) == IWMMXT_GR_REGS) \ + && CONSTANT_P (X)) \ + ? GENERAL_REGS : \ + (((MODE) == HImode && ! arm_arch4 \ + && (GET_CODE (X) == MEM \ + || ((GET_CODE (X) == REG || GET_CODE (X) == SUBREG) \ + && true_regnum (X) == -1))) \ + ? GENERAL_REGS : NO_REGS) \ + : THUMB_SECONDARY_INPUT_RELOAD_CLASS (CLASS, MODE, X))) + +/* Try a machine-dependent way of reloading an illegitimate address + operand. If we find one, push the reload and jump to WIN. This + macro is used in only one place: `find_reloads_address' in reload.c. + + For the ARM, we wish to handle large displacements off a base + register by splitting the addend across a MOV and the mem insn. + This can cut the number of reloads needed. */ +#define ARM_LEGITIMIZE_RELOAD_ADDRESS(X, MODE, OPNUM, TYPE, IND, WIN) \ + do \ + { \ + if (arm_legitimize_reload_address (&X, MODE, OPNUM, TYPE, IND)) \ + goto WIN; \ + } \ + while (0) + +/* XXX If an HImode FP+large_offset address is converted to an HImode + SP+large_offset address, then reload won't know how to fix it. It sees + only that SP isn't valid for HImode, and so reloads the SP into an index + register, but the resulting address is still invalid because the offset + is too big. We fix it here instead by reloading the entire address. */ +/* We could probably achieve better results by defining PROMOTE_MODE to help + cope with the variances between the Thumb's signed and unsigned byte and + halfword load instructions. */ +/* ??? This should be safe for thumb2, but we may be able to do better. */ +#define THUMB_LEGITIMIZE_RELOAD_ADDRESS(X, MODE, OPNUM, TYPE, IND_L, WIN) \ +do { \ + rtx new_x = thumb_legitimize_reload_address (&X, MODE, OPNUM, TYPE, IND_L); \ + if (new_x) \ + { \ + X = new_x; \ + goto WIN; \ + } \ +} while (0) + +#define LEGITIMIZE_RELOAD_ADDRESS(X, MODE, OPNUM, TYPE, IND_LEVELS, WIN) \ + if (TARGET_ARM) \ + ARM_LEGITIMIZE_RELOAD_ADDRESS (X, MODE, OPNUM, TYPE, IND_LEVELS, WIN); \ + else \ + THUMB_LEGITIMIZE_RELOAD_ADDRESS (X, MODE, OPNUM, TYPE, IND_LEVELS, WIN) + +/* Return the maximum number of consecutive registers + needed to represent mode MODE in a register of class CLASS. + ARM regs are UNITS_PER_WORD bits while FPA regs can hold any FP mode */ +#define CLASS_MAX_NREGS(CLASS, MODE) \ + (((CLASS) == FPA_REGS || (CLASS) == CIRRUS_REGS) ? 1 : ARM_NUM_REGS (MODE)) + +/* If defined, gives a class of registers that cannot be used as the + operand of a SUBREG that changes the mode of the object illegally. */ + +/* Moves between FPA_REGS and GENERAL_REGS are two memory insns. + Moves between VFP_REGS and GENERAL_REGS are a single insn, but + it is typically more expensive than a single memory access. We set + the cost to less than two memory accesses so that floating + point to integer conversion does not go through memory. */ +#define REGISTER_MOVE_COST(MODE, FROM, TO) \ + (TARGET_32BIT ? \ + ((FROM) == FPA_REGS && (TO) != FPA_REGS ? 20 : \ + (FROM) != FPA_REGS && (TO) == FPA_REGS ? 20 : \ + IS_VFP_CLASS (FROM) && !IS_VFP_CLASS (TO) ? 15 : \ + !IS_VFP_CLASS (FROM) && IS_VFP_CLASS (TO) ? 15 : \ + (FROM) == IWMMXT_REGS && (TO) != IWMMXT_REGS ? 4 : \ + (FROM) != IWMMXT_REGS && (TO) == IWMMXT_REGS ? 4 : \ + (FROM) == IWMMXT_GR_REGS || (TO) == IWMMXT_GR_REGS ? 20 : \ + (FROM) == CIRRUS_REGS && (TO) != CIRRUS_REGS ? 20 : \ + (FROM) != CIRRUS_REGS && (TO) == CIRRUS_REGS ? 20 : \ + 2) \ + : \ + ((FROM) == HI_REGS || (TO) == HI_REGS) ? 4 : 2) + +/* Stack layout; function entry, exit and calling. */ + +/* Define this if pushing a word on the stack + makes the stack pointer a smaller address. */ +#define STACK_GROWS_DOWNWARD 1 + +/* Define this to nonzero if the nominal address of the stack frame + is at the high-address end of the local variables; + that is, each additional local variable allocated + goes at a more negative offset in the frame. */ +#define FRAME_GROWS_DOWNWARD 1 + +/* The amount of scratch space needed by _interwork_{r7,r11}_call_via_rN(). + When present, it is one word in size, and sits at the top of the frame, + between the soft frame pointer and either r7 or r11. + + We only need _interwork_rM_call_via_rN() for -mcaller-super-interworking, + and only then if some outgoing arguments are passed on the stack. It would + be tempting to also check whether the stack arguments are passed by indirect + calls, but there seems to be no reason in principle why a post-reload pass + couldn't convert a direct call into an indirect one. */ +#define CALLER_INTERWORKING_SLOT_SIZE \ + (TARGET_CALLER_INTERWORKING \ + && crtl->outgoing_args_size != 0 \ + ? UNITS_PER_WORD : 0) + +/* Offset within stack frame to start allocating local variables at. + If FRAME_GROWS_DOWNWARD, this is the offset to the END of the + first local allocated. Otherwise, it is the offset to the BEGINNING + of the first local allocated. */ +#define STARTING_FRAME_OFFSET 0 + +/* If we generate an insn to push BYTES bytes, + this says how many the stack pointer really advances by. */ +/* The push insns do not do this rounding implicitly. + So don't define this. */ +/* #define PUSH_ROUNDING(NPUSHED) ROUND_UP_WORD (NPUSHED) */ + +/* Define this if the maximum size of all the outgoing args is to be + accumulated and pushed during the prologue. The amount can be + found in the variable crtl->outgoing_args_size. */ +#define ACCUMULATE_OUTGOING_ARGS 1 + +/* Offset of first parameter from the argument pointer register value. */ +#define FIRST_PARM_OFFSET(FNDECL) (TARGET_ARM ? 4 : 0) + +/* Define how to find the value returned by a library function + assuming the value has mode MODE. */ +#define LIBCALL_VALUE(MODE) \ + (TARGET_AAPCS_BASED ? aapcs_libcall_value (MODE) \ + : (TARGET_32BIT && TARGET_HARD_FLOAT_ABI && TARGET_FPA \ + && GET_MODE_CLASS (MODE) == MODE_FLOAT) \ + ? gen_rtx_REG (MODE, FIRST_FPA_REGNUM) \ + : TARGET_32BIT && TARGET_HARD_FLOAT_ABI && TARGET_MAVERICK \ + && GET_MODE_CLASS (MODE) == MODE_FLOAT \ + ? gen_rtx_REG (MODE, FIRST_CIRRUS_FP_REGNUM) \ + : TARGET_IWMMXT_ABI && arm_vector_mode_supported_p (MODE) \ + ? gen_rtx_REG (MODE, FIRST_IWMMXT_REGNUM) \ + : gen_rtx_REG (MODE, ARG_REGISTER (1))) + +/* 1 if REGNO is a possible register number for a function value. */ +#define FUNCTION_VALUE_REGNO_P(REGNO) \ + ((REGNO) == ARG_REGISTER (1) \ + || (TARGET_AAPCS_BASED && TARGET_32BIT \ + && TARGET_VFP && TARGET_HARD_FLOAT \ + && (REGNO) == FIRST_VFP_REGNUM) \ + || (TARGET_32BIT && ((REGNO) == FIRST_CIRRUS_FP_REGNUM) \ + && TARGET_HARD_FLOAT_ABI && TARGET_MAVERICK) \ + || ((REGNO) == FIRST_IWMMXT_REGNUM && TARGET_IWMMXT_ABI) \ + || (TARGET_32BIT && ((REGNO) == FIRST_FPA_REGNUM) \ + && TARGET_HARD_FLOAT_ABI && TARGET_FPA)) + +/* Amount of memory needed for an untyped call to save all possible return + registers. */ +#define APPLY_RESULT_SIZE arm_apply_result_size() + +/* Define DEFAULT_PCC_STRUCT_RETURN to 1 if all structure and union return + values must be in memory. On the ARM, they need only do so if larger + than a word, or if they contain elements offset from zero in the struct. */ +#define DEFAULT_PCC_STRUCT_RETURN 0 + +/* These bits describe the different types of function supported + by the ARM backend. They are exclusive. i.e. a function cannot be both a + normal function and an interworked function, for example. Knowing the + type of a function is important for determining its prologue and + epilogue sequences. + Note value 7 is currently unassigned. Also note that the interrupt + function types all have bit 2 set, so that they can be tested for easily. + Note that 0 is deliberately chosen for ARM_FT_UNKNOWN so that when the + machine_function structure is initialized (to zero) func_type will + default to unknown. This will force the first use of arm_current_func_type + to call arm_compute_func_type. */ +#define ARM_FT_UNKNOWN 0 /* Type has not yet been determined. */ +#define ARM_FT_NORMAL 1 /* Your normal, straightforward function. */ +#define ARM_FT_INTERWORKED 2 /* A function that supports interworking. */ +#define ARM_FT_ISR 4 /* An interrupt service routine. */ +#define ARM_FT_FIQ 5 /* A fast interrupt service routine. */ +#define ARM_FT_EXCEPTION 6 /* An ARM exception handler (subcase of ISR). */ + +#define ARM_FT_TYPE_MASK ((1 << 3) - 1) + +/* In addition functions can have several type modifiers, + outlined by these bit masks: */ +#define ARM_FT_INTERRUPT (1 << 2) /* Note overlap with FT_ISR and above. */ +#define ARM_FT_NAKED (1 << 3) /* No prologue or epilogue. */ +#define ARM_FT_VOLATILE (1 << 4) /* Does not return. */ +#define ARM_FT_NESTED (1 << 5) /* Embedded inside another func. */ +#define ARM_FT_STACKALIGN (1 << 6) /* Called with misaligned stack. */ + +/* Some macros to test these flags. */ +#define ARM_FUNC_TYPE(t) (t & ARM_FT_TYPE_MASK) +#define IS_INTERRUPT(t) (t & ARM_FT_INTERRUPT) +#define IS_VOLATILE(t) (t & ARM_FT_VOLATILE) +#define IS_NAKED(t) (t & ARM_FT_NAKED) +#define IS_NESTED(t) (t & ARM_FT_NESTED) +#define IS_STACKALIGN(t) (t & ARM_FT_STACKALIGN) + + +/* Structure used to hold the function stack frame layout. Offsets are + relative to the stack pointer on function entry. Positive offsets are + in the direction of stack growth. + Only soft_frame is used in thumb mode. */ + +typedef struct GTY(()) arm_stack_offsets +{ + int saved_args; /* ARG_POINTER_REGNUM. */ + int frame; /* ARM_HARD_FRAME_POINTER_REGNUM. */ + int saved_regs; + int soft_frame; /* FRAME_POINTER_REGNUM. */ + int locals_base; /* THUMB_HARD_FRAME_POINTER_REGNUM. */ + int outgoing_args; /* STACK_POINTER_REGNUM. */ + unsigned int saved_regs_mask; +} +arm_stack_offsets; + +#ifndef GENERATOR_FILE +/* A C structure for machine-specific, per-function data. + This is added to the cfun structure. */ +typedef struct GTY(()) machine_function +{ + /* Additional stack adjustment in __builtin_eh_throw. */ + rtx eh_epilogue_sp_ofs; + /* Records if LR has to be saved for far jumps. */ + int far_jump_used; + /* Records if ARG_POINTER was ever live. */ + int arg_pointer_live; + /* Records if the save of LR has been eliminated. */ + int lr_save_eliminated; + /* The size of the stack frame. Only valid after reload. */ + arm_stack_offsets stack_offsets; + /* Records the type of the current function. */ + unsigned long func_type; + /* Record if the function has a variable argument list. */ + int uses_anonymous_args; + /* Records if sibcalls are blocked because an argument + register is needed to preserve stack alignment. */ + int sibcall_blocked; + /* The PIC register for this function. This might be a pseudo. */ + rtx pic_reg; + /* Labels for per-function Thumb call-via stubs. One per potential calling + register. We can never call via LR or PC. We can call via SP if a + trampoline happens to be on the top of the stack. */ + rtx call_via[14]; + /* Set to 1 when a return insn is output, this means that the epilogue + is not needed. */ + int return_used_this_function; + /* When outputting Thumb-1 code, record the last insn that provides + information about condition codes, and the comparison operands. */ + rtx thumb1_cc_insn; + rtx thumb1_cc_op0; + rtx thumb1_cc_op1; + /* Also record the CC mode that is supported. */ + enum machine_mode thumb1_cc_mode; +} +machine_function; +#endif + +/* As in the machine_function, a global set of call-via labels, for code + that is in text_section. */ +extern GTY(()) rtx thumb_call_via_label[14]; + +/* The number of potential ways of assigning to a co-processor. */ +#define ARM_NUM_COPROC_SLOTS 1 + +/* Enumeration of procedure calling standard variants. We don't really + support all of these yet. */ +enum arm_pcs +{ + ARM_PCS_AAPCS, /* Base standard AAPCS. */ + ARM_PCS_AAPCS_VFP, /* Use VFP registers for floating point values. */ + ARM_PCS_AAPCS_IWMMXT, /* Use iWMMXT registers for vectors. */ + /* This must be the last AAPCS variant. */ + ARM_PCS_AAPCS_LOCAL, /* Private call within this compilation unit. */ + ARM_PCS_ATPCS, /* ATPCS. */ + ARM_PCS_APCS, /* APCS (legacy Linux etc). */ + ARM_PCS_UNKNOWN +}; + +/* Default procedure calling standard of current compilation unit. */ +extern enum arm_pcs arm_pcs_default; + +/* A C type for declaring a variable that is used as the first argument of + `FUNCTION_ARG' and other related values. */ +typedef struct +{ + /* This is the number of registers of arguments scanned so far. */ + int nregs; + /* This is the number of iWMMXt register arguments scanned so far. */ + int iwmmxt_nregs; + int named_count; + int nargs; + /* Which procedure call variant to use for this call. */ + enum arm_pcs pcs_variant; + + /* AAPCS related state tracking. */ + int aapcs_arg_processed; /* No need to lay out this argument again. */ + int aapcs_cprc_slot; /* Index of co-processor rules to handle + this argument, or -1 if using core + registers. */ + int aapcs_ncrn; + int aapcs_next_ncrn; + rtx aapcs_reg; /* Register assigned to this argument. */ + int aapcs_partial; /* How many bytes are passed in regs (if + split between core regs and stack. + Zero otherwise. */ + int aapcs_cprc_failed[ARM_NUM_COPROC_SLOTS]; + int can_split; /* Argument can be split between core regs + and the stack. */ + /* Private data for tracking VFP register allocation */ + unsigned aapcs_vfp_regs_free; + unsigned aapcs_vfp_reg_alloc; + int aapcs_vfp_rcount; + MACHMODE aapcs_vfp_rmode; +} CUMULATIVE_ARGS; + +#define FUNCTION_ARG_PADDING(MODE, TYPE) \ + (arm_pad_arg_upward (MODE, TYPE) ? upward : downward) + +#define BLOCK_REG_PADDING(MODE, TYPE, FIRST) \ + (arm_pad_reg_upward (MODE, TYPE, FIRST) ? upward : downward) + +/* For AAPCS, padding should never be below the argument. For other ABIs, + * mimic the default. */ +#define PAD_VARARGS_DOWN \ + ((TARGET_AAPCS_BASED) ? 0 : BYTES_BIG_ENDIAN) + +/* Initialize a variable CUM of type CUMULATIVE_ARGS + for a call to a function whose data type is FNTYPE. + For a library call, FNTYPE is 0. + On the ARM, the offset starts at 0. */ +#define INIT_CUMULATIVE_ARGS(CUM, FNTYPE, LIBNAME, FNDECL, N_NAMED_ARGS) \ + arm_init_cumulative_args (&(CUM), (FNTYPE), (LIBNAME), (FNDECL)) + +/* 1 if N is a possible register number for function argument passing. + On the ARM, r0-r3 are used to pass args. */ +#define FUNCTION_ARG_REGNO_P(REGNO) \ + (IN_RANGE ((REGNO), 0, 3) \ + || (TARGET_AAPCS_BASED && TARGET_VFP && TARGET_HARD_FLOAT \ + && IN_RANGE ((REGNO), FIRST_VFP_REGNUM, FIRST_VFP_REGNUM + 15)) \ + || (TARGET_IWMMXT_ABI \ + && IN_RANGE ((REGNO), FIRST_IWMMXT_REGNUM, FIRST_IWMMXT_REGNUM + 9))) + + +/* If your target environment doesn't prefix user functions with an + underscore, you may wish to re-define this to prevent any conflicts. */ +#ifndef ARM_MCOUNT_NAME +#define ARM_MCOUNT_NAME "*mcount" +#endif + +/* Call the function profiler with a given profile label. The Acorn + compiler puts this BEFORE the prolog but gcc puts it afterwards. + On the ARM the full profile code will look like: + .data + LP1 + .word 0 + .text + mov ip, lr + bl mcount + .word LP1 + + profile_function() in final.c outputs the .data section, FUNCTION_PROFILER + will output the .text section. + + The ``mov ip,lr'' seems like a good idea to stick with cc convention. + ``prof'' doesn't seem to mind about this! + + Note - this version of the code is designed to work in both ARM and + Thumb modes. */ +#ifndef ARM_FUNCTION_PROFILER +#define ARM_FUNCTION_PROFILER(STREAM, LABELNO) \ +{ \ + char temp[20]; \ + rtx sym; \ + \ + asm_fprintf (STREAM, "\tmov\t%r, %r\n\tbl\t", \ + IP_REGNUM, LR_REGNUM); \ + assemble_name (STREAM, ARM_MCOUNT_NAME); \ + fputc ('\n', STREAM); \ + ASM_GENERATE_INTERNAL_LABEL (temp, "LP", LABELNO); \ + sym = gen_rtx_SYMBOL_REF (Pmode, temp); \ + assemble_aligned_integer (UNITS_PER_WORD, sym); \ +} +#endif + +#ifdef THUMB_FUNCTION_PROFILER +#define FUNCTION_PROFILER(STREAM, LABELNO) \ + if (TARGET_ARM) \ + ARM_FUNCTION_PROFILER (STREAM, LABELNO) \ + else \ + THUMB_FUNCTION_PROFILER (STREAM, LABELNO) +#else +#define FUNCTION_PROFILER(STREAM, LABELNO) \ + ARM_FUNCTION_PROFILER (STREAM, LABELNO) +#endif + +/* EXIT_IGNORE_STACK should be nonzero if, when returning from a function, + the stack pointer does not matter. The value is tested only in + functions that have frame pointers. + No definition is equivalent to always zero. + + On the ARM, the function epilogue recovers the stack pointer from the + frame. */ +#define EXIT_IGNORE_STACK 1 + +#define EPILOGUE_USES(REGNO) ((REGNO) == LR_REGNUM) + +/* Determine if the epilogue should be output as RTL. + You should override this if you define FUNCTION_EXTRA_EPILOGUE. */ +#define USE_RETURN_INSN(ISCOND) \ + (TARGET_32BIT ? use_return_insn (ISCOND, NULL) : 0) + +/* Definitions for register eliminations. + + This is an array of structures. Each structure initializes one pair + of eliminable registers. The "from" register number is given first, + followed by "to". Eliminations of the same "from" register are listed + in order of preference. + + We have two registers that can be eliminated on the ARM. First, the + arg pointer register can often be eliminated in favor of the stack + pointer register. Secondly, the pseudo frame pointer register can always + be eliminated; it is replaced with either the stack or the real frame + pointer. Note we have to use {ARM|THUMB}_HARD_FRAME_POINTER_REGNUM + because the definition of HARD_FRAME_POINTER_REGNUM is not a constant. */ + +#define ELIMINABLE_REGS \ +{{ ARG_POINTER_REGNUM, STACK_POINTER_REGNUM },\ + { ARG_POINTER_REGNUM, FRAME_POINTER_REGNUM },\ + { ARG_POINTER_REGNUM, ARM_HARD_FRAME_POINTER_REGNUM },\ + { ARG_POINTER_REGNUM, THUMB_HARD_FRAME_POINTER_REGNUM },\ + { FRAME_POINTER_REGNUM, STACK_POINTER_REGNUM },\ + { FRAME_POINTER_REGNUM, ARM_HARD_FRAME_POINTER_REGNUM },\ + { FRAME_POINTER_REGNUM, THUMB_HARD_FRAME_POINTER_REGNUM }} + +/* Define the offset between two registers, one to be eliminated, and the + other its replacement, at the start of a routine. */ +#define INITIAL_ELIMINATION_OFFSET(FROM, TO, OFFSET) \ + if (TARGET_ARM) \ + (OFFSET) = arm_compute_initial_elimination_offset (FROM, TO); \ + else \ + (OFFSET) = thumb_compute_initial_elimination_offset (FROM, TO) + +/* Special case handling of the location of arguments passed on the stack. */ +#define DEBUGGER_ARG_OFFSET(value, addr) value ? value : arm_debugger_arg_offset (value, addr) + +/* Initialize data used by insn expanders. This is called from insn_emit, + once for every function before code is generated. */ +#define INIT_EXPANDERS arm_init_expanders () + +/* Length in units of the trampoline for entering a nested function. */ +#define TRAMPOLINE_SIZE (TARGET_32BIT ? 16 : 20) + +/* Alignment required for a trampoline in bits. */ +#define TRAMPOLINE_ALIGNMENT 32 + +/* Addressing modes, and classification of registers for them. */ +#define HAVE_POST_INCREMENT 1 +#define HAVE_PRE_INCREMENT TARGET_32BIT +#define HAVE_POST_DECREMENT TARGET_32BIT +#define HAVE_PRE_DECREMENT TARGET_32BIT +#define HAVE_PRE_MODIFY_DISP TARGET_32BIT +#define HAVE_POST_MODIFY_DISP TARGET_32BIT +#define HAVE_PRE_MODIFY_REG TARGET_32BIT +#define HAVE_POST_MODIFY_REG TARGET_32BIT + +/* Macros to check register numbers against specific register classes. */ + +/* These assume that REGNO is a hard or pseudo reg number. + They give nonzero only if REGNO is a hard reg of the suitable class + or a pseudo reg currently allocated to a suitable hard reg. + Since they use reg_renumber, they are safe only once reg_renumber + has been allocated, which happens in local-alloc.c. */ +#define TEST_REGNO(R, TEST, VALUE) \ + ((R TEST VALUE) || ((unsigned) reg_renumber[R] TEST VALUE)) + +/* Don't allow the pc to be used. */ +#define ARM_REGNO_OK_FOR_BASE_P(REGNO) \ + (TEST_REGNO (REGNO, <, PC_REGNUM) \ + || TEST_REGNO (REGNO, ==, FRAME_POINTER_REGNUM) \ + || TEST_REGNO (REGNO, ==, ARG_POINTER_REGNUM)) + +#define THUMB1_REGNO_MODE_OK_FOR_BASE_P(REGNO, MODE) \ + (TEST_REGNO (REGNO, <=, LAST_LO_REGNUM) \ + || (GET_MODE_SIZE (MODE) >= 4 \ + && TEST_REGNO (REGNO, ==, STACK_POINTER_REGNUM))) + +#define REGNO_MODE_OK_FOR_BASE_P(REGNO, MODE) \ + (TARGET_THUMB1 \ + ? THUMB1_REGNO_MODE_OK_FOR_BASE_P (REGNO, MODE) \ + : ARM_REGNO_OK_FOR_BASE_P (REGNO)) + +/* Nonzero if X can be the base register in a reg+reg addressing mode. + For Thumb, we can not use SP + reg, so reject SP. */ +#define REGNO_MODE_OK_FOR_REG_BASE_P(X, MODE) \ + REGNO_MODE_OK_FOR_BASE_P (X, QImode) + +/* For ARM code, we don't care about the mode, but for Thumb, the index + must be suitable for use in a QImode load. */ +#define REGNO_OK_FOR_INDEX_P(REGNO) \ + (REGNO_MODE_OK_FOR_BASE_P (REGNO, QImode) \ + && !TEST_REGNO (REGNO, ==, STACK_POINTER_REGNUM)) + +/* Maximum number of registers that can appear in a valid memory address. + Shifts in addresses can't be by a register. */ +#define MAX_REGS_PER_ADDRESS 2 + +/* Recognize any constant value that is a valid address. */ +/* XXX We can address any constant, eventually... */ +/* ??? Should the TARGET_ARM here also apply to thumb2? */ +#define CONSTANT_ADDRESS_P(X) \ + (GET_CODE (X) == SYMBOL_REF \ + && (CONSTANT_POOL_ADDRESS_P (X) \ + || (TARGET_ARM && optimize > 0 && SYMBOL_REF_FLAG (X)))) + +/* True if SYMBOL + OFFSET constants must refer to something within + SYMBOL's section. */ +#define ARM_OFFSETS_MUST_BE_WITHIN_SECTIONS_P 0 + +/* Nonzero if all target requires all absolute relocations be R_ARM_ABS32. */ +#ifndef TARGET_DEFAULT_WORD_RELOCATIONS +#define TARGET_DEFAULT_WORD_RELOCATIONS 0 +#endif + +/* Nonzero if the constant value X is a legitimate general operand. + It is given that X satisfies CONSTANT_P or is a CONST_DOUBLE. + + On the ARM, allow any integer (invalid ones are removed later by insn + patterns), nice doubles and symbol_refs which refer to the function's + constant pool XXX. + + When generating pic allow anything. */ +#define ARM_LEGITIMATE_CONSTANT_P(X) (flag_pic || ! label_mentioned_p (X)) + +#define THUMB_LEGITIMATE_CONSTANT_P(X) \ + ( GET_CODE (X) == CONST_INT \ + || GET_CODE (X) == CONST_DOUBLE \ + || CONSTANT_ADDRESS_P (X) \ + || flag_pic) + +#define LEGITIMATE_CONSTANT_P(X) \ + (!arm_cannot_force_const_mem (X) \ + && (TARGET_32BIT ? ARM_LEGITIMATE_CONSTANT_P (X) \ + : THUMB_LEGITIMATE_CONSTANT_P (X))) + +#ifndef SUBTARGET_NAME_ENCODING_LENGTHS +#define SUBTARGET_NAME_ENCODING_LENGTHS +#endif + +/* This is a C fragment for the inside of a switch statement. + Each case label should return the number of characters to + be stripped from the start of a function's name, if that + name starts with the indicated character. */ +#define ARM_NAME_ENCODING_LENGTHS \ + case '*': return 1; \ + SUBTARGET_NAME_ENCODING_LENGTHS + +/* This is how to output a reference to a user-level label named NAME. + `assemble_name' uses this. */ +#undef ASM_OUTPUT_LABELREF +#define ASM_OUTPUT_LABELREF(FILE, NAME) \ + arm_asm_output_labelref (FILE, NAME) + +/* Output IT instructions for conditionally executed Thumb-2 instructions. */ +#define ASM_OUTPUT_OPCODE(STREAM, PTR) \ + if (TARGET_THUMB2) \ + thumb2_asm_output_opcode (STREAM); + +/* The EABI specifies that constructors should go in .init_array. + Other targets use .ctors for compatibility. */ +#ifndef ARM_EABI_CTORS_SECTION_OP +#define ARM_EABI_CTORS_SECTION_OP \ + "\t.section\t.init_array,\"aw\",%init_array" +#endif +#ifndef ARM_EABI_DTORS_SECTION_OP +#define ARM_EABI_DTORS_SECTION_OP \ + "\t.section\t.fini_array,\"aw\",%fini_array" +#endif +#define ARM_CTORS_SECTION_OP \ + "\t.section\t.ctors,\"aw\",%progbits" +#define ARM_DTORS_SECTION_OP \ + "\t.section\t.dtors,\"aw\",%progbits" + +/* Define CTORS_SECTION_ASM_OP. */ +#undef CTORS_SECTION_ASM_OP +#undef DTORS_SECTION_ASM_OP +#ifndef IN_LIBGCC2 +# define CTORS_SECTION_ASM_OP \ + (TARGET_AAPCS_BASED ? ARM_EABI_CTORS_SECTION_OP : ARM_CTORS_SECTION_OP) +# define DTORS_SECTION_ASM_OP \ + (TARGET_AAPCS_BASED ? ARM_EABI_DTORS_SECTION_OP : ARM_DTORS_SECTION_OP) +#else /* !defined (IN_LIBGCC2) */ +/* In libgcc, CTORS_SECTION_ASM_OP must be a compile-time constant, + so we cannot use the definition above. */ +# ifdef __ARM_EABI__ +/* The .ctors section is not part of the EABI, so we do not define + CTORS_SECTION_ASM_OP when in libgcc; that prevents crtstuff + from trying to use it. We do define it when doing normal + compilation, as .init_array can be used instead of .ctors. */ +/* There is no need to emit begin or end markers when using + init_array; the dynamic linker will compute the size of the + array itself based on special symbols created by the static + linker. However, we do need to arrange to set up + exception-handling here. */ +# define CTOR_LIST_BEGIN asm (ARM_EABI_CTORS_SECTION_OP) +# define CTOR_LIST_END /* empty */ +# define DTOR_LIST_BEGIN asm (ARM_EABI_DTORS_SECTION_OP) +# define DTOR_LIST_END /* empty */ +# else /* !defined (__ARM_EABI__) */ +# define CTORS_SECTION_ASM_OP ARM_CTORS_SECTION_OP +# define DTORS_SECTION_ASM_OP ARM_DTORS_SECTION_OP +# endif /* !defined (__ARM_EABI__) */ +#endif /* !defined (IN_LIBCC2) */ + +/* True if the operating system can merge entities with vague linkage + (e.g., symbols in COMDAT group) during dynamic linking. */ +#ifndef TARGET_ARM_DYNAMIC_VAGUE_LINKAGE_P +#define TARGET_ARM_DYNAMIC_VAGUE_LINKAGE_P true +#endif + +#define ARM_OUTPUT_FN_UNWIND(F, PROLOGUE) arm_output_fn_unwind (F, PROLOGUE) + +/* The macros REG_OK_FOR..._P assume that the arg is a REG rtx + and check its validity for a certain class. + We have two alternate definitions for each of them. + The usual definition accepts all pseudo regs; the other rejects + them unless they have been allocated suitable hard regs. + The symbol REG_OK_STRICT causes the latter definition to be used. + Thumb-2 has the same restrictions as arm. */ +#ifndef REG_OK_STRICT + +#define ARM_REG_OK_FOR_BASE_P(X) \ + (REGNO (X) <= LAST_ARM_REGNUM \ + || REGNO (X) >= FIRST_PSEUDO_REGISTER \ + || REGNO (X) == FRAME_POINTER_REGNUM \ + || REGNO (X) == ARG_POINTER_REGNUM) + +#define ARM_REG_OK_FOR_INDEX_P(X) \ + ((REGNO (X) <= LAST_ARM_REGNUM \ + && REGNO (X) != STACK_POINTER_REGNUM) \ + || REGNO (X) >= FIRST_PSEUDO_REGISTER \ + || REGNO (X) == FRAME_POINTER_REGNUM \ + || REGNO (X) == ARG_POINTER_REGNUM) + +#define THUMB1_REG_MODE_OK_FOR_BASE_P(X, MODE) \ + (REGNO (X) <= LAST_LO_REGNUM \ + || REGNO (X) >= FIRST_PSEUDO_REGISTER \ + || (GET_MODE_SIZE (MODE) >= 4 \ + && (REGNO (X) == STACK_POINTER_REGNUM \ + || (X) == hard_frame_pointer_rtx \ + || (X) == arg_pointer_rtx))) + +#define REG_STRICT_P 0 + +#else /* REG_OK_STRICT */ + +#define ARM_REG_OK_FOR_BASE_P(X) \ + ARM_REGNO_OK_FOR_BASE_P (REGNO (X)) + +#define ARM_REG_OK_FOR_INDEX_P(X) \ + ARM_REGNO_OK_FOR_INDEX_P (REGNO (X)) + +#define THUMB1_REG_MODE_OK_FOR_BASE_P(X, MODE) \ + THUMB1_REGNO_MODE_OK_FOR_BASE_P (REGNO (X), MODE) + +#define REG_STRICT_P 1 + +#endif /* REG_OK_STRICT */ + +/* Now define some helpers in terms of the above. */ + +#define REG_MODE_OK_FOR_BASE_P(X, MODE) \ + (TARGET_THUMB1 \ + ? THUMB1_REG_MODE_OK_FOR_BASE_P (X, MODE) \ + : ARM_REG_OK_FOR_BASE_P (X)) + +/* For 16-bit Thumb, a valid index register is anything that can be used in + a byte load instruction. */ +#define THUMB1_REG_OK_FOR_INDEX_P(X) \ + THUMB1_REG_MODE_OK_FOR_BASE_P (X, QImode) + +/* Nonzero if X is a hard reg that can be used as an index + or if it is a pseudo reg. On the Thumb, the stack pointer + is not suitable. */ +#define REG_OK_FOR_INDEX_P(X) \ + (TARGET_THUMB1 \ + ? THUMB1_REG_OK_FOR_INDEX_P (X) \ + : ARM_REG_OK_FOR_INDEX_P (X)) + +/* Nonzero if X can be the base register in a reg+reg addressing mode. + For Thumb, we can not use SP + reg, so reject SP. */ +#define REG_MODE_OK_FOR_REG_BASE_P(X, MODE) \ + REG_OK_FOR_INDEX_P (X) + +#define ARM_BASE_REGISTER_RTX_P(X) \ + (GET_CODE (X) == REG && ARM_REG_OK_FOR_BASE_P (X)) + +#define ARM_INDEX_REGISTER_RTX_P(X) \ + (GET_CODE (X) == REG && ARM_REG_OK_FOR_INDEX_P (X)) + +/* Specify the machine mode that this machine uses + for the index in the tablejump instruction. */ +#define CASE_VECTOR_MODE Pmode + +#define CASE_VECTOR_PC_RELATIVE (TARGET_THUMB2 \ + || (TARGET_THUMB1 \ + && (optimize_size || flag_pic))) + +#define CASE_VECTOR_SHORTEN_MODE(min, max, body) \ + (TARGET_THUMB1 \ + ? (min >= 0 && max < 512 \ + ? (ADDR_DIFF_VEC_FLAGS (body).offset_unsigned = 1, QImode) \ + : min >= -256 && max < 256 \ + ? (ADDR_DIFF_VEC_FLAGS (body).offset_unsigned = 0, QImode) \ + : min >= 0 && max < 8192 \ + ? (ADDR_DIFF_VEC_FLAGS (body).offset_unsigned = 1, HImode) \ + : min >= -4096 && max < 4096 \ + ? (ADDR_DIFF_VEC_FLAGS (body).offset_unsigned = 0, HImode) \ + : SImode) \ + : ((min < 0 || max >= 0x2000 || !TARGET_THUMB2) ? SImode \ + : (max >= 0x200) ? HImode \ + : QImode)) + +/* signed 'char' is most compatible, but RISC OS wants it unsigned. + unsigned is probably best, but may break some code. */ +#ifndef DEFAULT_SIGNED_CHAR +#define DEFAULT_SIGNED_CHAR 0 +#endif + +/* Max number of bytes we can move from memory to memory + in one reasonably fast instruction. */ +#define MOVE_MAX 4 + +#undef MOVE_RATIO +#define MOVE_RATIO(speed) (arm_tune_xscale ? 4 : 2) + +/* Define if operations between registers always perform the operation + on the full register even if a narrower mode is specified. */ +#define WORD_REGISTER_OPERATIONS + +/* Define if loading in MODE, an integral mode narrower than BITS_PER_WORD + will either zero-extend or sign-extend. The value of this macro should + be the code that says which one of the two operations is implicitly + done, UNKNOWN if none. */ +#define LOAD_EXTEND_OP(MODE) \ + (TARGET_THUMB ? ZERO_EXTEND : \ + ((arm_arch4 || (MODE) == QImode) ? ZERO_EXTEND \ + : ((BYTES_BIG_ENDIAN && (MODE) == HImode) ? SIGN_EXTEND : UNKNOWN))) + +/* Nonzero if access to memory by bytes is slow and undesirable. */ +#define SLOW_BYTE_ACCESS 0 + +#define SLOW_UNALIGNED_ACCESS(MODE, ALIGN) 1 + +/* Immediate shift counts are truncated by the output routines (or was it + the assembler?). Shift counts in a register are truncated by ARM. Note + that the native compiler puts too large (> 32) immediate shift counts + into a register and shifts by the register, letting the ARM decide what + to do instead of doing that itself. */ +/* This is all wrong. Defining SHIFT_COUNT_TRUNCATED tells combine that + code like (X << (Y % 32)) for register X, Y is equivalent to (X << Y). + On the arm, Y in a register is used modulo 256 for the shift. Only for + rotates is modulo 32 used. */ +/* #define SHIFT_COUNT_TRUNCATED 1 */ + +/* All integers have the same format so truncation is easy. */ +#define TRULY_NOOP_TRUNCATION(OUTPREC, INPREC) 1 + +/* Calling from registers is a massive pain. */ +#define NO_FUNCTION_CSE 1 + +/* The machine modes of pointers and functions */ +#define Pmode SImode +#define FUNCTION_MODE Pmode + +#define ARM_FRAME_RTX(X) \ + ( (X) == frame_pointer_rtx || (X) == stack_pointer_rtx \ + || (X) == arg_pointer_rtx) + +/* Moves to and from memory are quite expensive */ +#define MEMORY_MOVE_COST(M, CLASS, IN) \ + (TARGET_32BIT ? 10 : \ + ((GET_MODE_SIZE (M) < 4 ? 8 : 2 * GET_MODE_SIZE (M)) \ + * (CLASS == LO_REGS ? 1 : 2))) + +/* Try to generate sequences that don't involve branches, we can then use + conditional instructions */ +#define BRANCH_COST(speed_p, predictable_p) \ + (TARGET_32BIT ? 4 : (optimize > 0 ? 2 : 0)) + +/* Position Independent Code. */ +/* We decide which register to use based on the compilation options and + the assembler in use; this is more general than the APCS restriction of + using sb (r9) all the time. */ +extern unsigned arm_pic_register; + +/* The register number of the register used to address a table of static + data addresses in memory. */ +#define PIC_OFFSET_TABLE_REGNUM arm_pic_register + +/* We can't directly access anything that contains a symbol, + nor can we indirect via the constant pool. One exception is + UNSPEC_TLS, which is always PIC. */ +#define LEGITIMATE_PIC_OPERAND_P(X) \ + (!(symbol_mentioned_p (X) \ + || label_mentioned_p (X) \ + || (GET_CODE (X) == SYMBOL_REF \ + && CONSTANT_POOL_ADDRESS_P (X) \ + && (symbol_mentioned_p (get_pool_constant (X)) \ + || label_mentioned_p (get_pool_constant (X))))) \ + || tls_mentioned_p (X)) + +/* We need to know when we are making a constant pool; this determines + whether data needs to be in the GOT or can be referenced via a GOT + offset. */ +extern int making_const_table; + +/* Handle pragmas for compatibility with Intel's compilers. */ +/* Also abuse this to register additional C specific EABI attributes. */ +#define REGISTER_TARGET_PRAGMAS() do { \ + c_register_pragma (0, "long_calls", arm_pr_long_calls); \ + c_register_pragma (0, "no_long_calls", arm_pr_no_long_calls); \ + c_register_pragma (0, "long_calls_off", arm_pr_long_calls_off); \ + arm_lang_object_attributes_init(); \ +} while (0) + +/* Condition code information. */ +/* Given a comparison code (EQ, NE, etc.) and the first operand of a COMPARE, + return the mode to be used for the comparison. */ + +#define SELECT_CC_MODE(OP, X, Y) arm_select_cc_mode (OP, X, Y) + +#define REVERSIBLE_CC_MODE(MODE) 1 + +#define REVERSE_CONDITION(CODE,MODE) \ + (((MODE) == CCFPmode || (MODE) == CCFPEmode) \ + ? reverse_condition_maybe_unordered (code) \ + : reverse_condition (code)) + +#define CANONICALIZE_COMPARISON(CODE, OP0, OP1) \ + (CODE) = arm_canonicalize_comparison (CODE, &(OP0), &(OP1)) + +/* The arm5 clz instruction returns 32. */ +#define CLZ_DEFINED_VALUE_AT_ZERO(MODE, VALUE) ((VALUE) = 32, 1) +#define CTZ_DEFINED_VALUE_AT_ZERO(MODE, VALUE) ((VALUE) = 32, 1) + +#define CC_STATUS_INIT \ + do { cfun->machine->thumb1_cc_insn = NULL_RTX; } while (0) + +#undef ASM_APP_OFF +#define ASM_APP_OFF (TARGET_THUMB1 ? "\t.code\t16\n" : \ + TARGET_THUMB2 ? "\t.thumb\n" : "") + +/* Output a push or a pop instruction (only used when profiling). + We can't push STATIC_CHAIN_REGNUM (r12) directly with Thumb-1. We know + that ASM_OUTPUT_REG_PUSH will be matched with ASM_OUTPUT_REG_POP, and + that r7 isn't used by the function profiler, so we can use it as a + scratch reg. WARNING: This isn't safe in the general case! It may be + sensitive to future changes in final.c:profile_function. */ +#define ASM_OUTPUT_REG_PUSH(STREAM, REGNO) \ + do \ + { \ + if (TARGET_ARM) \ + asm_fprintf (STREAM,"\tstmfd\t%r!,{%r}\n", \ + STACK_POINTER_REGNUM, REGNO); \ + else if (TARGET_THUMB1 \ + && (REGNO) == STATIC_CHAIN_REGNUM) \ + { \ + asm_fprintf (STREAM, "\tpush\t{r7}\n"); \ + asm_fprintf (STREAM, "\tmov\tr7, %r\n", REGNO);\ + asm_fprintf (STREAM, "\tpush\t{r7}\n"); \ + } \ + else \ + asm_fprintf (STREAM, "\tpush {%r}\n", REGNO); \ + } while (0) + + +/* See comment for ASM_OUTPUT_REG_PUSH concerning Thumb-1 issue. */ +#define ASM_OUTPUT_REG_POP(STREAM, REGNO) \ + do \ + { \ + if (TARGET_ARM) \ + asm_fprintf (STREAM, "\tldmfd\t%r!,{%r}\n", \ + STACK_POINTER_REGNUM, REGNO); \ + else if (TARGET_THUMB1 \ + && (REGNO) == STATIC_CHAIN_REGNUM) \ + { \ + asm_fprintf (STREAM, "\tpop\t{r7}\n"); \ + asm_fprintf (STREAM, "\tmov\t%r, r7\n", REGNO);\ + asm_fprintf (STREAM, "\tpop\t{r7}\n"); \ + } \ + else \ + asm_fprintf (STREAM, "\tpop {%r}\n", REGNO); \ + } while (0) + +/* Jump table alignment is explicit in ASM_OUTPUT_CASE_LABEL. */ +#define ADDR_VEC_ALIGN(JUMPTABLE) 0 + +/* This is how to output a label which precedes a jumptable. Since + Thumb instructions are 2 bytes, we may need explicit alignment here. */ +#undef ASM_OUTPUT_CASE_LABEL +#define ASM_OUTPUT_CASE_LABEL(FILE, PREFIX, NUM, JUMPTABLE) \ + do \ + { \ + if (TARGET_THUMB && GET_MODE (PATTERN (JUMPTABLE)) == SImode) \ + ASM_OUTPUT_ALIGN (FILE, 2); \ + (*targetm.asm_out.internal_label) (FILE, PREFIX, NUM); \ + } \ + while (0) + +/* Make sure subsequent insns are aligned after a TBB. */ +#define ASM_OUTPUT_CASE_END(FILE, NUM, JUMPTABLE) \ + do \ + { \ + if (GET_MODE (PATTERN (JUMPTABLE)) == QImode) \ + ASM_OUTPUT_ALIGN (FILE, 1); \ + } \ + while (0) + +#define ARM_DECLARE_FUNCTION_NAME(STREAM, NAME, DECL) \ + do \ + { \ + if (TARGET_THUMB) \ + { \ + if (is_called_in_ARM_mode (DECL) \ + || (TARGET_THUMB1 && !TARGET_THUMB1_ONLY \ + && cfun->is_thunk)) \ + fprintf (STREAM, "\t.code 32\n") ; \ + else if (TARGET_THUMB1) \ + fprintf (STREAM, "\t.code\t16\n\t.thumb_func\n") ; \ + else \ + fprintf (STREAM, "\t.thumb\n\t.thumb_func\n") ; \ + } \ + if (TARGET_POKE_FUNCTION_NAME) \ + arm_poke_function_name (STREAM, (const char *) NAME); \ + } \ + while (0) + +/* For aliases of functions we use .thumb_set instead. */ +#define ASM_OUTPUT_DEF_FROM_DECLS(FILE, DECL1, DECL2) \ + do \ + { \ + const char *const LABEL1 = XSTR (XEXP (DECL_RTL (decl), 0), 0); \ + const char *const LABEL2 = IDENTIFIER_POINTER (DECL2); \ + \ + if (TARGET_THUMB && TREE_CODE (DECL1) == FUNCTION_DECL) \ + { \ + fprintf (FILE, "\t.thumb_set "); \ + assemble_name (FILE, LABEL1); \ + fprintf (FILE, ","); \ + assemble_name (FILE, LABEL2); \ + fprintf (FILE, "\n"); \ + } \ + else \ + ASM_OUTPUT_DEF (FILE, LABEL1, LABEL2); \ + } \ + while (0) + +#ifdef HAVE_GAS_MAX_SKIP_P2ALIGN +/* To support -falign-* switches we need to use .p2align so + that alignment directives in code sections will be padded + with no-op instructions, rather than zeroes. */ +#define ASM_OUTPUT_MAX_SKIP_ALIGN(FILE, LOG, MAX_SKIP) \ + if ((LOG) != 0) \ + { \ + if ((MAX_SKIP) == 0) \ + fprintf ((FILE), "\t.p2align %d\n", (int) (LOG)); \ + else \ + fprintf ((FILE), "\t.p2align %d,,%d\n", \ + (int) (LOG), (int) (MAX_SKIP)); \ + } +#endif + +/* Add two bytes to the length of conditionally executed Thumb-2 + instructions for the IT instruction. */ +#define ADJUST_INSN_LENGTH(insn, length) \ + if (TARGET_THUMB2 && GET_CODE (PATTERN (insn)) == COND_EXEC) \ + length += 2; + +/* Only perform branch elimination (by making instructions conditional) if + we're optimizing. For Thumb-2 check if any IT instructions need + outputting. */ +#define FINAL_PRESCAN_INSN(INSN, OPVEC, NOPERANDS) \ + if (TARGET_ARM && optimize) \ + arm_final_prescan_insn (INSN); \ + else if (TARGET_THUMB2) \ + thumb2_final_prescan_insn (INSN); \ + else if (TARGET_THUMB1) \ + thumb1_final_prescan_insn (INSN) + +#define ARM_SIGN_EXTEND(x) ((HOST_WIDE_INT) \ + (HOST_BITS_PER_WIDE_INT <= 32 ? (unsigned HOST_WIDE_INT) (x) \ + : ((((unsigned HOST_WIDE_INT)(x)) & (unsigned HOST_WIDE_INT) 0xffffffff) |\ + ((((unsigned HOST_WIDE_INT)(x)) & (unsigned HOST_WIDE_INT) 0x80000000) \ + ? ((~ (unsigned HOST_WIDE_INT) 0) \ + & ~ (unsigned HOST_WIDE_INT) 0xffffffff) \ + : 0)))) + +/* A C expression whose value is RTL representing the value of the return + address for the frame COUNT steps up from the current frame. */ + +#define RETURN_ADDR_RTX(COUNT, FRAME) \ + arm_return_addr (COUNT, FRAME) + +/* Mask of the bits in the PC that contain the real return address + when running in 26-bit mode. */ +#define RETURN_ADDR_MASK26 (0x03fffffc) + +/* Pick up the return address upon entry to a procedure. Used for + dwarf2 unwind information. This also enables the table driven + mechanism. */ +#define INCOMING_RETURN_ADDR_RTX gen_rtx_REG (Pmode, LR_REGNUM) +#define DWARF_FRAME_RETURN_COLUMN DWARF_FRAME_REGNUM (LR_REGNUM) + +/* Used to mask out junk bits from the return address, such as + processor state, interrupt status, condition codes and the like. */ +#define MASK_RETURN_ADDR \ + /* If we are generating code for an ARM2/ARM3 machine or for an ARM6 \ + in 26 bit mode, the condition codes must be masked out of the \ + return address. This does not apply to ARM6 and later processors \ + when running in 32 bit mode. */ \ + ((arm_arch4 || TARGET_THUMB) \ + ? (gen_int_mode ((unsigned long)0xffffffff, Pmode)) \ + : arm_gen_return_addr_mask ()) + + +/* Neon defines builtins from ARM_BUILTIN_MAX upwards, though they don't have + symbolic names defined here (which would require too much duplication). + FIXME? */ +enum arm_builtins +{ + ARM_BUILTIN_GETWCX, + ARM_BUILTIN_SETWCX, + + ARM_BUILTIN_WZERO, + + ARM_BUILTIN_WAVG2BR, + ARM_BUILTIN_WAVG2HR, + ARM_BUILTIN_WAVG2B, + ARM_BUILTIN_WAVG2H, + + ARM_BUILTIN_WACCB, + ARM_BUILTIN_WACCH, + ARM_BUILTIN_WACCW, + + ARM_BUILTIN_WMACS, + ARM_BUILTIN_WMACSZ, + ARM_BUILTIN_WMACU, + ARM_BUILTIN_WMACUZ, + + ARM_BUILTIN_WSADB, + ARM_BUILTIN_WSADBZ, + ARM_BUILTIN_WSADH, + ARM_BUILTIN_WSADHZ, + + ARM_BUILTIN_WALIGN, + + ARM_BUILTIN_TMIA, + ARM_BUILTIN_TMIAPH, + ARM_BUILTIN_TMIABB, + ARM_BUILTIN_TMIABT, + ARM_BUILTIN_TMIATB, + ARM_BUILTIN_TMIATT, + + ARM_BUILTIN_TMOVMSKB, + ARM_BUILTIN_TMOVMSKH, + ARM_BUILTIN_TMOVMSKW, + + ARM_BUILTIN_TBCSTB, + ARM_BUILTIN_TBCSTH, + ARM_BUILTIN_TBCSTW, + + ARM_BUILTIN_WMADDS, + ARM_BUILTIN_WMADDU, + + ARM_BUILTIN_WPACKHSS, + ARM_BUILTIN_WPACKWSS, + ARM_BUILTIN_WPACKDSS, + ARM_BUILTIN_WPACKHUS, + ARM_BUILTIN_WPACKWUS, + ARM_BUILTIN_WPACKDUS, + + ARM_BUILTIN_WADDB, + ARM_BUILTIN_WADDH, + ARM_BUILTIN_WADDW, + ARM_BUILTIN_WADDSSB, + ARM_BUILTIN_WADDSSH, + ARM_BUILTIN_WADDSSW, + ARM_BUILTIN_WADDUSB, + ARM_BUILTIN_WADDUSH, + ARM_BUILTIN_WADDUSW, + ARM_BUILTIN_WSUBB, + ARM_BUILTIN_WSUBH, + ARM_BUILTIN_WSUBW, + ARM_BUILTIN_WSUBSSB, + ARM_BUILTIN_WSUBSSH, + ARM_BUILTIN_WSUBSSW, + ARM_BUILTIN_WSUBUSB, + ARM_BUILTIN_WSUBUSH, + ARM_BUILTIN_WSUBUSW, + + ARM_BUILTIN_WAND, + ARM_BUILTIN_WANDN, + ARM_BUILTIN_WOR, + ARM_BUILTIN_WXOR, + + ARM_BUILTIN_WCMPEQB, + ARM_BUILTIN_WCMPEQH, + ARM_BUILTIN_WCMPEQW, + ARM_BUILTIN_WCMPGTUB, + ARM_BUILTIN_WCMPGTUH, + ARM_BUILTIN_WCMPGTUW, + ARM_BUILTIN_WCMPGTSB, + ARM_BUILTIN_WCMPGTSH, + ARM_BUILTIN_WCMPGTSW, + + ARM_BUILTIN_TEXTRMSB, + ARM_BUILTIN_TEXTRMSH, + ARM_BUILTIN_TEXTRMSW, + ARM_BUILTIN_TEXTRMUB, + ARM_BUILTIN_TEXTRMUH, + ARM_BUILTIN_TEXTRMUW, + ARM_BUILTIN_TINSRB, + ARM_BUILTIN_TINSRH, + ARM_BUILTIN_TINSRW, + + ARM_BUILTIN_WMAXSW, + ARM_BUILTIN_WMAXSH, + ARM_BUILTIN_WMAXSB, + ARM_BUILTIN_WMAXUW, + ARM_BUILTIN_WMAXUH, + ARM_BUILTIN_WMAXUB, + ARM_BUILTIN_WMINSW, + ARM_BUILTIN_WMINSH, + ARM_BUILTIN_WMINSB, + ARM_BUILTIN_WMINUW, + ARM_BUILTIN_WMINUH, + ARM_BUILTIN_WMINUB, + + ARM_BUILTIN_WMULUM, + ARM_BUILTIN_WMULSM, + ARM_BUILTIN_WMULUL, + + ARM_BUILTIN_PSADBH, + ARM_BUILTIN_WSHUFH, + + ARM_BUILTIN_WSLLH, + ARM_BUILTIN_WSLLW, + ARM_BUILTIN_WSLLD, + ARM_BUILTIN_WSRAH, + ARM_BUILTIN_WSRAW, + ARM_BUILTIN_WSRAD, + ARM_BUILTIN_WSRLH, + ARM_BUILTIN_WSRLW, + ARM_BUILTIN_WSRLD, + ARM_BUILTIN_WRORH, + ARM_BUILTIN_WRORW, + ARM_BUILTIN_WRORD, + ARM_BUILTIN_WSLLHI, + ARM_BUILTIN_WSLLWI, + ARM_BUILTIN_WSLLDI, + ARM_BUILTIN_WSRAHI, + ARM_BUILTIN_WSRAWI, + ARM_BUILTIN_WSRADI, + ARM_BUILTIN_WSRLHI, + ARM_BUILTIN_WSRLWI, + ARM_BUILTIN_WSRLDI, + ARM_BUILTIN_WRORHI, + ARM_BUILTIN_WRORWI, + ARM_BUILTIN_WRORDI, + + ARM_BUILTIN_WUNPCKIHB, + ARM_BUILTIN_WUNPCKIHH, + ARM_BUILTIN_WUNPCKIHW, + ARM_BUILTIN_WUNPCKILB, + ARM_BUILTIN_WUNPCKILH, + ARM_BUILTIN_WUNPCKILW, + + ARM_BUILTIN_WUNPCKEHSB, + ARM_BUILTIN_WUNPCKEHSH, + ARM_BUILTIN_WUNPCKEHSW, + ARM_BUILTIN_WUNPCKEHUB, + ARM_BUILTIN_WUNPCKEHUH, + ARM_BUILTIN_WUNPCKEHUW, + ARM_BUILTIN_WUNPCKELSB, + ARM_BUILTIN_WUNPCKELSH, + ARM_BUILTIN_WUNPCKELSW, + ARM_BUILTIN_WUNPCKELUB, + ARM_BUILTIN_WUNPCKELUH, + ARM_BUILTIN_WUNPCKELUW, + + ARM_BUILTIN_THREAD_POINTER, + + ARM_BUILTIN_NEON_BASE, + + ARM_BUILTIN_MAX = ARM_BUILTIN_NEON_BASE /* FIXME: Wrong! */ +}; + +/* Do not emit .note.GNU-stack by default. */ +#ifndef NEED_INDICATE_EXEC_STACK +#define NEED_INDICATE_EXEC_STACK 0 +#endif + +/* The maximum number of parallel loads or stores we support in an ldm/stm + instruction. */ +#define MAX_LDM_STM_OPS 4 + +#endif /* ! GCC_ARM_H */ diff --git a/gcc/config/arm/arm.md b/gcc/config/arm/arm.md new file mode 100644 index 000000000..130053b0b --- /dev/null +++ b/gcc/config/arm/arm.md @@ -0,0 +1,10746 @@ +;;- Machine description for ARM for GNU compiler +;; Copyright 1991, 1993, 1994, 1995, 1996, 1996, 1997, 1998, 1999, 2000, +;; 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 +;; Free Software Foundation, Inc. +;; Contributed by Pieter `Tiggr' Schoenmakers (rcpieter@win.tue.nl) +;; and Martin Simmons (@harleqn.co.uk). +;; More major hacks by Richard Earnshaw (rearnsha@arm.com). + +;; This file is part of GCC. + +;; GCC is free software; you can redistribute it and/or modify it +;; under the terms of the GNU General Public License as published +;; by the Free Software Foundation; either version 3, or (at your +;; option) any later version. + +;; GCC is distributed in the hope that it will be useful, but WITHOUT +;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +;; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public +;; License for more details. + +;; You should have received a copy of the GNU General Public License +;; along with GCC; see the file COPYING3. If not see +;; . + +;;- See file "rtl.def" for documentation on define_insn, match_*, et. al. + + +;;--------------------------------------------------------------------------- +;; Constants + +;; Register numbers +(define_constants + [(R0_REGNUM 0) ; First CORE register + (IP_REGNUM 12) ; Scratch register + (SP_REGNUM 13) ; Stack pointer + (LR_REGNUM 14) ; Return address register + (PC_REGNUM 15) ; Program counter + (CC_REGNUM 24) ; Condition code pseudo register + (LAST_ARM_REGNUM 15) ; + (FPA_F0_REGNUM 16) ; FIRST_FPA_REGNUM + (FPA_F7_REGNUM 23) ; LAST_FPA_REGNUM + ] +) +;; 3rd operand to select_dominance_cc_mode +(define_constants + [(DOM_CC_X_AND_Y 0) + (DOM_CC_NX_OR_Y 1) + (DOM_CC_X_OR_Y 2) + ] +) + +;; UNSPEC Usage: +;; Note: sin and cos are no-longer used. +;; Unspec constants for Neon are defined in neon.md. + +(define_constants + [(UNSPEC_SIN 0) ; `sin' operation (MODE_FLOAT): + ; operand 0 is the result, + ; operand 1 the parameter. + (UNPSEC_COS 1) ; `cos' operation (MODE_FLOAT): + ; operand 0 is the result, + ; operand 1 the parameter. + (UNSPEC_PUSH_MULT 2) ; `push multiple' operation: + ; operand 0 is the first register, + ; subsequent registers are in parallel (use ...) + ; expressions. + (UNSPEC_PIC_SYM 3) ; A symbol that has been treated properly for pic + ; usage, that is, we will add the pic_register + ; value to it before trying to dereference it. + (UNSPEC_PIC_BASE 4) ; Add PC and all but the last operand together, + ; The last operand is the number of a PIC_LABEL + ; that points at the containing instruction. + (UNSPEC_PRLG_STK 5) ; A special barrier that prevents frame accesses + ; being scheduled before the stack adjustment insn. + (UNSPEC_PROLOGUE_USE 6) ; As USE insns are not meaningful after reload, + ; this unspec is used to prevent the deletion of + ; instructions setting registers for EH handling + ; and stack frame generation. Operand 0 is the + ; register to "use". + (UNSPEC_CHECK_ARCH 7); Set CCs to indicate 26-bit or 32-bit mode. + (UNSPEC_WSHUFH 8) ; Used by the intrinsic form of the iWMMXt WSHUFH instruction. + (UNSPEC_WACC 9) ; Used by the intrinsic form of the iWMMXt WACC instruction. + (UNSPEC_TMOVMSK 10) ; Used by the intrinsic form of the iWMMXt TMOVMSK instruction. + (UNSPEC_WSAD 11) ; Used by the intrinsic form of the iWMMXt WSAD instruction. + (UNSPEC_WSADZ 12) ; Used by the intrinsic form of the iWMMXt WSADZ instruction. + (UNSPEC_WMACS 13) ; Used by the intrinsic form of the iWMMXt WMACS instruction. + (UNSPEC_WMACU 14) ; Used by the intrinsic form of the iWMMXt WMACU instruction. + (UNSPEC_WMACSZ 15) ; Used by the intrinsic form of the iWMMXt WMACSZ instruction. + (UNSPEC_WMACUZ 16) ; Used by the intrinsic form of the iWMMXt WMACUZ instruction. + (UNSPEC_CLRDI 17) ; Used by the intrinsic form of the iWMMXt CLRDI instruction. + (UNSPEC_WMADDS 18) ; Used by the intrinsic form of the iWMMXt WMADDS instruction. + (UNSPEC_WMADDU 19) ; Used by the intrinsic form of the iWMMXt WMADDU instruction. + (UNSPEC_TLS 20) ; A symbol that has been treated properly for TLS usage. + (UNSPEC_PIC_LABEL 21) ; A label used for PIC access that does not appear in the + ; instruction stream. + (UNSPEC_STACK_ALIGN 22) ; Doubleword aligned stack pointer. Used to + ; generate correct unwind information. + (UNSPEC_PIC_OFFSET 23) ; A symbolic 12-bit OFFSET that has been treated + ; correctly for PIC usage. + (UNSPEC_GOTSYM_OFF 24) ; The offset of the start of the the GOT from a + ; a given symbolic address. + (UNSPEC_THUMB1_CASESI 25) ; A Thumb1 compressed dispatch-table call. + (UNSPEC_RBIT 26) ; rbit operation. + (UNSPEC_SYMBOL_OFFSET 27) ; The offset of the start of the symbol from + ; another symbolic address. + (UNSPEC_MEMORY_BARRIER 28) ; Represent a memory barrier. + (UNSPEC_PIC_UNIFIED 29) ; Create a common pic addressing form. + ] +) + +;; UNSPEC_VOLATILE Usage: + +(define_constants + [(VUNSPEC_BLOCKAGE 0) ; `blockage' insn to prevent scheduling across an + ; insn in the code. + (VUNSPEC_EPILOGUE 1) ; `epilogue' insn, used to represent any part of the + ; instruction epilogue sequence that isn't expanded + ; into normal RTL. Used for both normal and sibcall + ; epilogues. + (VUNSPEC_ALIGN 2) ; `align' insn. Used at the head of a minipool table + ; for inlined constants. + (VUNSPEC_POOL_END 3) ; `end-of-table'. Used to mark the end of a minipool + ; table. + (VUNSPEC_POOL_1 4) ; `pool-entry(1)'. An entry in the constant pool for + ; an 8-bit object. + (VUNSPEC_POOL_2 5) ; `pool-entry(2)'. An entry in the constant pool for + ; a 16-bit object. + (VUNSPEC_POOL_4 6) ; `pool-entry(4)'. An entry in the constant pool for + ; a 32-bit object. + (VUNSPEC_POOL_8 7) ; `pool-entry(8)'. An entry in the constant pool for + ; a 64-bit object. + (VUNSPEC_POOL_16 8) ; `pool-entry(16)'. An entry in the constant pool for + ; a 128-bit object. + (VUNSPEC_TMRC 9) ; Used by the iWMMXt TMRC instruction. + (VUNSPEC_TMCR 10) ; Used by the iWMMXt TMCR instruction. + (VUNSPEC_ALIGN8 11) ; 8-byte alignment version of VUNSPEC_ALIGN + (VUNSPEC_WCMP_EQ 12) ; Used by the iWMMXt WCMPEQ instructions + (VUNSPEC_WCMP_GTU 13) ; Used by the iWMMXt WCMPGTU instructions + (VUNSPEC_WCMP_GT 14) ; Used by the iwMMXT WCMPGT instructions + (VUNSPEC_EH_RETURN 20); Use to override the return address for exception + ; handling. + (VUNSPEC_SYNC_COMPARE_AND_SWAP 21) ; Represent an atomic compare swap. + (VUNSPEC_SYNC_LOCK 22) ; Represent a sync_lock_test_and_set. + (VUNSPEC_SYNC_OP 23) ; Represent a sync_ + (VUNSPEC_SYNC_NEW_OP 24) ; Represent a sync_new_ + (VUNSPEC_SYNC_OLD_OP 25) ; Represent a sync_old_ + ] +) + +;;--------------------------------------------------------------------------- +;; Attributes + +; IS_THUMB is set to 'yes' when we are generating Thumb code, and 'no' when +; generating ARM code. This is used to control the length of some insn +; patterns that share the same RTL in both ARM and Thumb code. +(define_attr "is_thumb" "no,yes" (const (symbol_ref "thumb_code"))) + +; IS_ARCH6 is set to 'yes' when we are generating code form ARMv6. +(define_attr "is_arch6" "no,yes" (const (symbol_ref "arm_arch6"))) + +; IS_THUMB1 is set to 'yes' iff we are generating Thumb-1 code. +(define_attr "is_thumb1" "no,yes" (const (symbol_ref "thumb1_code"))) + +;; Operand number of an input operand that is shifted. Zero if the +;; given instruction does not shift one of its input operands. +(define_attr "shift" "" (const_int 0)) + +; Floating Point Unit. If we only have floating point emulation, then there +; is no point in scheduling the floating point insns. (Well, for best +; performance we should try and group them together). +(define_attr "fpu" "none,fpa,fpe2,fpe3,maverick,vfp" + (const (symbol_ref "arm_fpu_attr"))) + +(define_attr "sync_result" "none,0,1,2,3,4,5" (const_string "none")) +(define_attr "sync_memory" "none,0,1,2,3,4,5" (const_string "none")) +(define_attr "sync_required_value" "none,0,1,2,3,4,5" (const_string "none")) +(define_attr "sync_new_value" "none,0,1,2,3,4,5" (const_string "none")) +(define_attr "sync_t1" "none,0,1,2,3,4,5" (const_string "none")) +(define_attr "sync_t2" "none,0,1,2,3,4,5" (const_string "none")) +(define_attr "sync_release_barrier" "yes,no" (const_string "yes")) +(define_attr "sync_op" "none,add,sub,ior,xor,and,nand" + (const_string "none")) + +; LENGTH of an instruction (in bytes) +(define_attr "length" "" + (cond [(not (eq_attr "sync_memory" "none")) + (symbol_ref "arm_sync_loop_insns (insn, operands) * 4") + ] (const_int 4))) + +; The architecture which supports the instruction (or alternative). +; This can be "a" for ARM, "t" for either of the Thumbs, "32" for +; TARGET_32BIT, "t1" or "t2" to specify a specific Thumb mode. "v6" +; for ARM or Thumb-2 with arm_arch6, and nov6 for ARM without +; arm_arch6. This attribute is used to compute attribute "enabled", +; use type "any" to enable an alternative in all cases. +(define_attr "arch" "any,a,t,32,t1,t2,v6,nov6" + (const_string "any")) + +(define_attr "arch_enabled" "no,yes" + (cond [(eq_attr "arch" "any") + (const_string "yes") + + (and (eq_attr "arch" "a") + (ne (symbol_ref "TARGET_ARM") (const_int 0))) + (const_string "yes") + + (and (eq_attr "arch" "t") + (ne (symbol_ref "TARGET_THUMB") (const_int 0))) + (const_string "yes") + + (and (eq_attr "arch" "t1") + (ne (symbol_ref "TARGET_THUMB1") (const_int 0))) + (const_string "yes") + + (and (eq_attr "arch" "t2") + (ne (symbol_ref "TARGET_THUMB2") (const_int 0))) + (const_string "yes") + + (and (eq_attr "arch" "32") + (ne (symbol_ref "TARGET_32BIT") (const_int 0))) + (const_string "yes") + + (and (eq_attr "arch" "v6") + (ne (symbol_ref "(TARGET_32BIT && arm_arch6)") (const_int 0))) + (const_string "yes") + + (and (eq_attr "arch" "nov6") + (ne (symbol_ref "(TARGET_32BIT && !arm_arch6)") (const_int 0))) + (const_string "yes")] + (const_string "no"))) + +; Allows an insn to disable certain alternatives for reasons other than +; arch support. +(define_attr "insn_enabled" "no,yes" + (const_string "yes")) + +; Enable all alternatives that are both arch_enabled and insn_enabled. + (define_attr "enabled" "no,yes" + (if_then_else (eq_attr "insn_enabled" "yes") + (if_then_else (eq_attr "arch_enabled" "yes") + (const_string "yes") + (const_string "no")) + (const_string "no"))) + +; POOL_RANGE is how far away from a constant pool entry that this insn +; can be placed. If the distance is zero, then this insn will never +; reference the pool. +; NEG_POOL_RANGE is nonzero for insns that can reference a constant pool entry +; before its address. +(define_attr "arm_pool_range" "" (const_int 0)) +(define_attr "thumb2_pool_range" "" (const_int 0)) +(define_attr "arm_neg_pool_range" "" (const_int 0)) +(define_attr "thumb2_neg_pool_range" "" (const_int 0)) + +(define_attr "pool_range" "" + (cond [(eq_attr "is_thumb" "yes") (attr "thumb2_pool_range")] + (attr "arm_pool_range"))) +(define_attr "neg_pool_range" "" + (cond [(eq_attr "is_thumb" "yes") (attr "thumb2_neg_pool_range")] + (attr "arm_neg_pool_range"))) + +; An assembler sequence may clobber the condition codes without us knowing. +; If such an insn references the pool, then we have no way of knowing how, +; so use the most conservative value for pool_range. +(define_asm_attributes + [(set_attr "conds" "clob") + (set_attr "length" "4") + (set_attr "pool_range" "250")]) + +;; The instruction used to implement a particular pattern. This +;; information is used by pipeline descriptions to provide accurate +;; scheduling information. + +(define_attr "insn" + "mov,mvn,smulxy,smlaxy,smlalxy,smulwy,smlawx,mul,muls,mla,mlas,umull,umulls,umlal,umlals,smull,smulls,smlal,smlals,smlawy,smuad,smuadx,smlad,smladx,smusd,smusdx,smlsd,smlsdx,smmul,smmulr,smmla,umaal,smlald,smlsld,clz,mrs,msr,xtab,sdiv,udiv,other" + (const_string "other")) + +; TYPE attribute is used to detect floating point instructions which, if +; running on a co-processor can run in parallel with other, basic instructions +; If write-buffer scheduling is enabled then it can also be used in the +; scheduling of writes. + +; Classification of each insn +; Note: vfp.md has different meanings for some of these, and some further +; types as well. See that file for details. +; alu any alu instruction that doesn't hit memory or fp +; regs or have a shifted source operand +; alu_shift any data instruction that doesn't hit memory or fp +; regs, but has a source operand shifted by a constant +; alu_shift_reg any data instruction that doesn't hit memory or fp +; regs, but has a source operand shifted by a register value +; mult a multiply instruction +; block blockage insn, this blocks all functional units +; float a floating point arithmetic operation (subject to expansion) +; fdivd DFmode floating point division +; fdivs SFmode floating point division +; fmul Floating point multiply +; ffmul Fast floating point multiply +; farith Floating point arithmetic (4 cycle) +; ffarith Fast floating point arithmetic (2 cycle) +; float_em a floating point arithmetic operation that is normally emulated +; even on a machine with an fpa. +; f_fpa_load a floating point load from memory. Only for the FPA. +; f_fpa_store a floating point store to memory. Only for the FPA. +; f_load[sd] A single/double load from memory. Used for VFP unit. +; f_store[sd] A single/double store to memory. Used for VFP unit. +; f_flag a transfer of co-processor flags to the CPSR +; f_mem_r a transfer of a floating point register to a real reg via mem +; r_mem_f the reverse of f_mem_r +; f_2_r fast transfer float to arm (no memory needed) +; r_2_f fast transfer arm to float +; f_cvt convert floating<->integral +; branch a branch +; call a subroutine call +; load_byte load byte(s) from memory to arm registers +; load1 load 1 word from memory to arm registers +; load2 load 2 words from memory to arm registers +; load3 load 3 words from memory to arm registers +; load4 load 4 words from memory to arm registers +; store store 1 word to memory from arm registers +; store2 store 2 words +; store3 store 3 words +; store4 store 4 (or more) words +; Additions for Cirrus Maverick co-processor: +; mav_farith Floating point arithmetic (4 cycle) +; mav_dmult Double multiplies (7 cycle) +; + +(define_attr "type" + "alu,alu_shift,alu_shift_reg,mult,block,float,fdivx,fdivd,fdivs,fmul,fmuls,fmuld,fmacs,fmacd,ffmul,farith,ffarith,f_flag,float_em,f_fpa_load,f_fpa_store,f_loads,f_loadd,f_stores,f_stored,f_mem_r,r_mem_f,f_2_r,r_2_f,f_cvt,branch,call,load_byte,load1,load2,load3,load4,store1,store2,store3,store4,mav_farith,mav_dmult,fconsts,fconstd,fadds,faddd,ffariths,ffarithd,fcmps,fcmpd,fcpys" + (if_then_else + (eq_attr "insn" "smulxy,smlaxy,smlalxy,smulwy,smlawx,mul,muls,mla,mlas,umull,umulls,umlal,umlals,smull,smulls,smlal,smlals") + (const_string "mult") + (const_string "alu"))) + +; Load scheduling, set from the arm_ld_sched variable +; initialized by arm_option_override() +(define_attr "ldsched" "no,yes" (const (symbol_ref "arm_ld_sched"))) + +;; Classification of NEON instructions for scheduling purposes. +;; Do not set this attribute and the "type" attribute together in +;; any one instruction pattern. +(define_attr "neon_type" + "neon_int_1,\ + neon_int_2,\ + neon_int_3,\ + neon_int_4,\ + neon_int_5,\ + neon_vqneg_vqabs,\ + neon_vmov,\ + neon_vaba,\ + neon_vsma,\ + neon_vaba_qqq,\ + neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + neon_mul_qqq_8_16_32_ddd_32,\ + neon_mul_qdd_64_32_long_qqd_16_ddd_32_scalar_64_32_long_scalar,\ + neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + neon_mla_qqq_8_16,\ + neon_mla_ddd_32_qqd_16_ddd_32_scalar_qdd_64_32_long_scalar_qdd_64_32_long,\ + neon_mla_qqq_32_qqd_32_scalar,\ + neon_mul_ddd_16_scalar_32_16_long_scalar,\ + neon_mul_qqd_32_scalar,\ + neon_mla_ddd_16_scalar_qdd_32_16_long_scalar,\ + neon_shift_1,\ + neon_shift_2,\ + neon_shift_3,\ + neon_vshl_ddd,\ + neon_vqshl_vrshl_vqrshl_qqq,\ + neon_vsra_vrsra,\ + neon_fp_vadd_ddd_vabs_dd,\ + neon_fp_vadd_qqq_vabs_qq,\ + neon_fp_vsum,\ + neon_fp_vmul_ddd,\ + neon_fp_vmul_qqd,\ + neon_fp_vmla_ddd,\ + neon_fp_vmla_qqq,\ + neon_fp_vmla_ddd_scalar,\ + neon_fp_vmla_qqq_scalar,\ + neon_fp_vrecps_vrsqrts_ddd,\ + neon_fp_vrecps_vrsqrts_qqq,\ + neon_bp_simple,\ + neon_bp_2cycle,\ + neon_bp_3cycle,\ + neon_ldr,\ + neon_str,\ + neon_vld1_1_2_regs,\ + neon_vld1_3_4_regs,\ + neon_vld2_2_regs_vld1_vld2_all_lanes,\ + neon_vld2_4_regs,\ + neon_vld3_vld4,\ + neon_vst1_1_2_regs_vst2_2_regs,\ + neon_vst1_3_4_regs,\ + neon_vst2_4_regs_vst3_vst4,\ + neon_vst3_vst4,\ + neon_vld1_vld2_lane,\ + neon_vld3_vld4_lane,\ + neon_vst1_vst2_lane,\ + neon_vst3_vst4_lane,\ + neon_vld3_vld4_all_lanes,\ + neon_mcr,\ + neon_mcr_2_mcrr,\ + neon_mrc,\ + neon_mrrc,\ + neon_ldm_2,\ + neon_stm_2,\ + none" + (const_string "none")) + +; condition codes: this one is used by final_prescan_insn to speed up +; conditionalizing instructions. It saves having to scan the rtl to see if +; it uses or alters the condition codes. +; +; USE means that the condition codes are used by the insn in the process of +; outputting code, this means (at present) that we can't use the insn in +; inlined branches +; +; SET means that the purpose of the insn is to set the condition codes in a +; well defined manner. +; +; CLOB means that the condition codes are altered in an undefined manner, if +; they are altered at all +; +; UNCONDITIONAL means the instruction can not be conditionally executed and +; that the instruction does not use or alter the condition codes. +; +; NOCOND means that the instruction does not use or alter the condition +; codes but can be converted into a conditionally exectuted instruction. + +(define_attr "conds" "use,set,clob,unconditional,nocond" + (if_then_else + (ior (eq_attr "is_thumb1" "yes") + (eq_attr "type" "call")) + (const_string "clob") + (if_then_else (eq_attr "neon_type" "none") + (const_string "nocond") + (const_string "unconditional")))) + +; Predicable means that the insn can be conditionally executed based on +; an automatically added predicate (additional patterns are generated by +; gen...). We default to 'no' because no Thumb patterns match this rule +; and not all ARM patterns do. +(define_attr "predicable" "no,yes" (const_string "no")) + +; Only model the write buffer for ARM6 and ARM7. Earlier processors don't +; have one. Later ones, such as StrongARM, have write-back caches, so don't +; suffer blockages enough to warrant modelling this (and it can adversely +; affect the schedule). +(define_attr "model_wbuf" "no,yes" (const (symbol_ref "arm_tune_wbuf"))) + +; WRITE_CONFLICT implies that a read following an unrelated write is likely +; to stall the processor. Used with model_wbuf above. +(define_attr "write_conflict" "no,yes" + (if_then_else (eq_attr "type" + "block,float_em,f_fpa_load,f_fpa_store,f_mem_r,r_mem_f,call,load1") + (const_string "yes") + (const_string "no"))) + +; Classify the insns into those that take one cycle and those that take more +; than one on the main cpu execution unit. +(define_attr "core_cycles" "single,multi" + (if_then_else (eq_attr "type" + "alu,alu_shift,float,fdivx,fdivd,fdivs,fmul,ffmul,farith,ffarith") + (const_string "single") + (const_string "multi"))) + +;; FAR_JUMP is "yes" if a BL instruction is used to generate a branch to a +;; distant label. Only applicable to Thumb code. +(define_attr "far_jump" "yes,no" (const_string "no")) + + +;; The number of machine instructions this pattern expands to. +;; Used for Thumb-2 conditional execution. +(define_attr "ce_count" "" (const_int 1)) + +;;--------------------------------------------------------------------------- +;; Mode iterators + +(include "iterators.md") + +;;--------------------------------------------------------------------------- +;; Predicates + +(include "predicates.md") +(include "constraints.md") + +;;--------------------------------------------------------------------------- +;; Pipeline descriptions + +;; Processor type. This is created automatically from arm-cores.def. +(include "arm-tune.md") + +(define_attr "tune_cortexr4" "yes,no" + (const (if_then_else + (eq_attr "tune" "cortexr4,cortexr4f") + (const_string "yes") + (const_string "no")))) + +;; True if the generic scheduling description should be used. + +(define_attr "generic_sched" "yes,no" + (const (if_then_else + (ior (eq_attr "tune" "fa526,fa626,fa606te,fa626te,fmp626,fa726te,arm926ejs,arm1020e,arm1026ejs,arm1136js,arm1136jfs,cortexa5,cortexa8,cortexa9,cortexm4") + (eq_attr "tune_cortexr4" "yes")) + (const_string "no") + (const_string "yes")))) + +(define_attr "generic_vfp" "yes,no" + (const (if_then_else + (and (eq_attr "fpu" "vfp") + (eq_attr "tune" "!arm1020e,arm1022e,cortexa5,cortexa8,cortexa9,cortexm4") + (eq_attr "tune_cortexr4" "no")) + (const_string "yes") + (const_string "no")))) + +(include "arm-generic.md") +(include "arm926ejs.md") +(include "arm1020e.md") +(include "arm1026ejs.md") +(include "arm1136jfs.md") +(include "fa526.md") +(include "fa606te.md") +(include "fa626te.md") +(include "fmp626.md") +(include "fa726te.md") +(include "cortex-a5.md") +(include "cortex-a8.md") +(include "cortex-a9.md") +(include "cortex-r4.md") +(include "cortex-r4f.md") +(include "cortex-m4.md") +(include "cortex-m4-fpu.md") +(include "vfp11.md") + + +;;--------------------------------------------------------------------------- +;; Insn patterns +;; +;; Addition insns. + +;; Note: For DImode insns, there is normally no reason why operands should +;; not be in the same register, what we don't want is for something being +;; written to partially overlap something that is an input. +;; Cirrus 64bit additions should not be split because we have a native +;; 64bit addition instructions. + +(define_expand "adddi3" + [(parallel + [(set (match_operand:DI 0 "s_register_operand" "") + (plus:DI (match_operand:DI 1 "s_register_operand" "") + (match_operand:DI 2 "s_register_operand" ""))) + (clobber (reg:CC CC_REGNUM))])] + "TARGET_EITHER" + " + if (TARGET_HARD_FLOAT && TARGET_MAVERICK) + { + if (!cirrus_fp_register (operands[0], DImode)) + operands[0] = force_reg (DImode, operands[0]); + if (!cirrus_fp_register (operands[1], DImode)) + operands[1] = force_reg (DImode, operands[1]); + emit_insn (gen_cirrus_adddi3 (operands[0], operands[1], operands[2])); + DONE; + } + + if (TARGET_THUMB1) + { + if (GET_CODE (operands[1]) != REG) + operands[1] = force_reg (DImode, operands[1]); + if (GET_CODE (operands[2]) != REG) + operands[2] = force_reg (DImode, operands[2]); + } + " +) + +(define_insn "*thumb1_adddi3" + [(set (match_operand:DI 0 "register_operand" "=l") + (plus:DI (match_operand:DI 1 "register_operand" "%0") + (match_operand:DI 2 "register_operand" "l"))) + (clobber (reg:CC CC_REGNUM)) + ] + "TARGET_THUMB1" + "add\\t%Q0, %Q0, %Q2\;adc\\t%R0, %R0, %R2" + [(set_attr "length" "4")] +) + +(define_insn_and_split "*arm_adddi3" + [(set (match_operand:DI 0 "s_register_operand" "=&r,&r") + (plus:DI (match_operand:DI 1 "s_register_operand" "%0, 0") + (match_operand:DI 2 "s_register_operand" "r, 0"))) + (clobber (reg:CC CC_REGNUM))] + "TARGET_32BIT && !(TARGET_HARD_FLOAT && TARGET_MAVERICK) && !TARGET_NEON" + "#" + "TARGET_32BIT && reload_completed + && ! (TARGET_NEON && IS_VFP_REGNUM (REGNO (operands[0])))" + [(parallel [(set (reg:CC_C CC_REGNUM) + (compare:CC_C (plus:SI (match_dup 1) (match_dup 2)) + (match_dup 1))) + (set (match_dup 0) (plus:SI (match_dup 1) (match_dup 2)))]) + (set (match_dup 3) (plus:SI (plus:SI (match_dup 4) (match_dup 5)) + (ltu:SI (reg:CC_C CC_REGNUM) (const_int 0))))] + " + { + operands[3] = gen_highpart (SImode, operands[0]); + operands[0] = gen_lowpart (SImode, operands[0]); + operands[4] = gen_highpart (SImode, operands[1]); + operands[1] = gen_lowpart (SImode, operands[1]); + operands[5] = gen_highpart (SImode, operands[2]); + operands[2] = gen_lowpart (SImode, operands[2]); + }" + [(set_attr "conds" "clob") + (set_attr "length" "8")] +) + +(define_insn_and_split "*adddi_sesidi_di" + [(set (match_operand:DI 0 "s_register_operand" "=&r,&r") + (plus:DI (sign_extend:DI + (match_operand:SI 2 "s_register_operand" "r,r")) + (match_operand:DI 1 "s_register_operand" "0,r"))) + (clobber (reg:CC CC_REGNUM))] + "TARGET_32BIT && !(TARGET_HARD_FLOAT && TARGET_MAVERICK)" + "#" + "TARGET_32BIT && reload_completed" + [(parallel [(set (reg:CC_C CC_REGNUM) + (compare:CC_C (plus:SI (match_dup 1) (match_dup 2)) + (match_dup 1))) + (set (match_dup 0) (plus:SI (match_dup 1) (match_dup 2)))]) + (set (match_dup 3) (plus:SI (plus:SI (ashiftrt:SI (match_dup 2) + (const_int 31)) + (match_dup 4)) + (ltu:SI (reg:CC_C CC_REGNUM) (const_int 0))))] + " + { + operands[3] = gen_highpart (SImode, operands[0]); + operands[0] = gen_lowpart (SImode, operands[0]); + operands[4] = gen_highpart (SImode, operands[1]); + operands[1] = gen_lowpart (SImode, operands[1]); + operands[2] = gen_lowpart (SImode, operands[2]); + }" + [(set_attr "conds" "clob") + (set_attr "length" "8")] +) + +(define_insn_and_split "*adddi_zesidi_di" + [(set (match_operand:DI 0 "s_register_operand" "=&r,&r") + (plus:DI (zero_extend:DI + (match_operand:SI 2 "s_register_operand" "r,r")) + (match_operand:DI 1 "s_register_operand" "0,r"))) + (clobber (reg:CC CC_REGNUM))] + "TARGET_32BIT && !(TARGET_HARD_FLOAT && TARGET_MAVERICK)" + "#" + "TARGET_32BIT && reload_completed" + [(parallel [(set (reg:CC_C CC_REGNUM) + (compare:CC_C (plus:SI (match_dup 1) (match_dup 2)) + (match_dup 1))) + (set (match_dup 0) (plus:SI (match_dup 1) (match_dup 2)))]) + (set (match_dup 3) (plus:SI (plus:SI (match_dup 4) (const_int 0)) + (ltu:SI (reg:CC_C CC_REGNUM) (const_int 0))))] + " + { + operands[3] = gen_highpart (SImode, operands[0]); + operands[0] = gen_lowpart (SImode, operands[0]); + operands[4] = gen_highpart (SImode, operands[1]); + operands[1] = gen_lowpart (SImode, operands[1]); + operands[2] = gen_lowpart (SImode, operands[2]); + }" + [(set_attr "conds" "clob") + (set_attr "length" "8")] +) + +(define_expand "addsi3" + [(set (match_operand:SI 0 "s_register_operand" "") + (plus:SI (match_operand:SI 1 "s_register_operand" "") + (match_operand:SI 2 "reg_or_int_operand" "")))] + "TARGET_EITHER" + " + if (TARGET_32BIT && GET_CODE (operands[2]) == CONST_INT) + { + arm_split_constant (PLUS, SImode, NULL_RTX, + INTVAL (operands[2]), operands[0], operands[1], + optimize && can_create_pseudo_p ()); + DONE; + } + " +) + +; If there is a scratch available, this will be faster than synthesizing the +; addition. +(define_peephole2 + [(match_scratch:SI 3 "r") + (set (match_operand:SI 0 "arm_general_register_operand" "") + (plus:SI (match_operand:SI 1 "arm_general_register_operand" "") + (match_operand:SI 2 "const_int_operand" "")))] + "TARGET_32BIT && + !(const_ok_for_arm (INTVAL (operands[2])) + || const_ok_for_arm (-INTVAL (operands[2]))) + && const_ok_for_arm (~INTVAL (operands[2]))" + [(set (match_dup 3) (match_dup 2)) + (set (match_dup 0) (plus:SI (match_dup 1) (match_dup 3)))] + "" +) + +;; The r/r/k alternative is required when reloading the address +;; (plus (reg rN) (reg sp)) into (reg rN). In this case reload will +;; put the duplicated register first, and not try the commutative version. +(define_insn_and_split "*arm_addsi3" + [(set (match_operand:SI 0 "s_register_operand" "=r, k,r,r, k,r") + (plus:SI (match_operand:SI 1 "s_register_operand" "%rk,k,r,rk,k,rk") + (match_operand:SI 2 "reg_or_int_operand" "rI,rI,k,L, L,?n")))] + "TARGET_32BIT" + "@ + add%?\\t%0, %1, %2 + add%?\\t%0, %1, %2 + add%?\\t%0, %2, %1 + sub%?\\t%0, %1, #%n2 + sub%?\\t%0, %1, #%n2 + #" + "TARGET_32BIT + && GET_CODE (operands[2]) == CONST_INT + && !(const_ok_for_arm (INTVAL (operands[2])) + || const_ok_for_arm (-INTVAL (operands[2]))) + && (reload_completed || !arm_eliminable_register (operands[1]))" + [(clobber (const_int 0))] + " + arm_split_constant (PLUS, SImode, curr_insn, + INTVAL (operands[2]), operands[0], + operands[1], 0); + DONE; + " + [(set_attr "length" "4,4,4,4,4,16") + (set_attr "predicable" "yes")] +) + +(define_insn_and_split "*thumb1_addsi3" + [(set (match_operand:SI 0 "register_operand" "=l,l,l,*rk,*hk,l,k,l,l,l") + (plus:SI (match_operand:SI 1 "register_operand" "%0,0,l,*0,*0,k,k,0,l,k") + (match_operand:SI 2 "nonmemory_operand" "I,J,lL,*hk,*rk,M,O,Pa,Pb,Pc")))] + "TARGET_THUMB1" + "* + static const char * const asms[] = + { + \"add\\t%0, %0, %2\", + \"sub\\t%0, %0, #%n2\", + \"add\\t%0, %1, %2\", + \"add\\t%0, %0, %2\", + \"add\\t%0, %0, %2\", + \"add\\t%0, %1, %2\", + \"add\\t%0, %1, %2\", + \"#\", + \"#\", + \"#\" + }; + if ((which_alternative == 2 || which_alternative == 6) + && GET_CODE (operands[2]) == CONST_INT + && INTVAL (operands[2]) < 0) + return \"sub\\t%0, %1, #%n2\"; + return asms[which_alternative]; + " + "&& reload_completed && CONST_INT_P (operands[2]) + && ((operands[1] != stack_pointer_rtx + && (INTVAL (operands[2]) > 255 || INTVAL (operands[2]) < -255)) + || (operands[1] == stack_pointer_rtx + && INTVAL (operands[2]) > 1020))" + [(set (match_dup 0) (plus:SI (match_dup 1) (match_dup 2))) + (set (match_dup 0) (plus:SI (match_dup 0) (match_dup 3)))] + { + HOST_WIDE_INT offset = INTVAL (operands[2]); + if (operands[1] == stack_pointer_rtx) + offset -= 1020; + else + { + if (offset > 255) + offset = 255; + else if (offset < -255) + offset = -255; + } + operands[3] = GEN_INT (offset); + operands[2] = GEN_INT (INTVAL (operands[2]) - offset); + } + [(set_attr "length" "2,2,2,2,2,2,2,4,4,4")] +) + +;; Reloading and elimination of the frame pointer can +;; sometimes cause this optimization to be missed. +(define_peephole2 + [(set (match_operand:SI 0 "arm_general_register_operand" "") + (match_operand:SI 1 "const_int_operand" "")) + (set (match_dup 0) + (plus:SI (match_dup 0) (reg:SI SP_REGNUM)))] + "TARGET_THUMB1 + && (unsigned HOST_WIDE_INT) (INTVAL (operands[1])) < 1024 + && (INTVAL (operands[1]) & 3) == 0" + [(set (match_dup 0) (plus:SI (reg:SI SP_REGNUM) (match_dup 1)))] + "" +) + +(define_insn "*addsi3_compare0" + [(set (reg:CC_NOOV CC_REGNUM) + (compare:CC_NOOV + (plus:SI (match_operand:SI 1 "s_register_operand" "r, r") + (match_operand:SI 2 "arm_add_operand" "rI,L")) + (const_int 0))) + (set (match_operand:SI 0 "s_register_operand" "=r,r") + (plus:SI (match_dup 1) (match_dup 2)))] + "TARGET_ARM" + "@ + add%.\\t%0, %1, %2 + sub%.\\t%0, %1, #%n2" + [(set_attr "conds" "set")] +) + +(define_insn "*addsi3_compare0_scratch" + [(set (reg:CC_NOOV CC_REGNUM) + (compare:CC_NOOV + (plus:SI (match_operand:SI 0 "s_register_operand" "r, r") + (match_operand:SI 1 "arm_add_operand" "rI,L")) + (const_int 0)))] + "TARGET_ARM" + "@ + cmn%?\\t%0, %1 + cmp%?\\t%0, #%n1" + [(set_attr "conds" "set")] +) + +(define_insn "*compare_negsi_si" + [(set (reg:CC_Z CC_REGNUM) + (compare:CC_Z + (neg:SI (match_operand:SI 0 "s_register_operand" "r")) + (match_operand:SI 1 "s_register_operand" "r")))] + "TARGET_32BIT" + "cmn%?\\t%1, %0" + [(set_attr "conds" "set")] +) + +;; This is the canonicalization of addsi3_compare0_for_combiner when the +;; addend is a constant. +(define_insn "*cmpsi2_addneg" + [(set (reg:CC CC_REGNUM) + (compare:CC + (match_operand:SI 1 "s_register_operand" "r,r") + (match_operand:SI 2 "arm_addimm_operand" "L,I"))) + (set (match_operand:SI 0 "s_register_operand" "=r,r") + (plus:SI (match_dup 1) + (match_operand:SI 3 "arm_addimm_operand" "I,L")))] + "TARGET_32BIT && INTVAL (operands[2]) == -INTVAL (operands[3])" + "@ + add%.\\t%0, %1, %3 + sub%.\\t%0, %1, #%n3" + [(set_attr "conds" "set")] +) + +;; Convert the sequence +;; sub rd, rn, #1 +;; cmn rd, #1 (equivalent to cmp rd, #-1) +;; bne dest +;; into +;; subs rd, rn, #1 +;; bcs dest ((unsigned)rn >= 1) +;; similarly for the beq variant using bcc. +;; This is a common looping idiom (while (n--)) +(define_peephole2 + [(set (match_operand:SI 0 "arm_general_register_operand" "") + (plus:SI (match_operand:SI 1 "arm_general_register_operand" "") + (const_int -1))) + (set (match_operand 2 "cc_register" "") + (compare (match_dup 0) (const_int -1))) + (set (pc) + (if_then_else (match_operator 3 "equality_operator" + [(match_dup 2) (const_int 0)]) + (match_operand 4 "" "") + (match_operand 5 "" "")))] + "TARGET_32BIT && peep2_reg_dead_p (3, operands[2])" + [(parallel[ + (set (match_dup 2) + (compare:CC + (match_dup 1) (const_int 1))) + (set (match_dup 0) (plus:SI (match_dup 1) (const_int -1)))]) + (set (pc) + (if_then_else (match_op_dup 3 [(match_dup 2) (const_int 0)]) + (match_dup 4) + (match_dup 5)))] + "operands[2] = gen_rtx_REG (CCmode, CC_REGNUM); + operands[3] = gen_rtx_fmt_ee ((GET_CODE (operands[3]) == NE + ? GEU : LTU), + VOIDmode, + operands[2], const0_rtx);" +) + +;; The next four insns work because they compare the result with one of +;; the operands, and we know that the use of the condition code is +;; either GEU or LTU, so we can use the carry flag from the addition +;; instead of doing the compare a second time. +(define_insn "*addsi3_compare_op1" + [(set (reg:CC_C CC_REGNUM) + (compare:CC_C + (plus:SI (match_operand:SI 1 "s_register_operand" "r,r") + (match_operand:SI 2 "arm_add_operand" "rI,L")) + (match_dup 1))) + (set (match_operand:SI 0 "s_register_operand" "=r,r") + (plus:SI (match_dup 1) (match_dup 2)))] + "TARGET_32BIT" + "@ + add%.\\t%0, %1, %2 + sub%.\\t%0, %1, #%n2" + [(set_attr "conds" "set")] +) + +(define_insn "*addsi3_compare_op2" + [(set (reg:CC_C CC_REGNUM) + (compare:CC_C + (plus:SI (match_operand:SI 1 "s_register_operand" "r,r") + (match_operand:SI 2 "arm_add_operand" "rI,L")) + (match_dup 2))) + (set (match_operand:SI 0 "s_register_operand" "=r,r") + (plus:SI (match_dup 1) (match_dup 2)))] + "TARGET_32BIT" + "@ + add%.\\t%0, %1, %2 + sub%.\\t%0, %1, #%n2" + [(set_attr "conds" "set")] +) + +(define_insn "*compare_addsi2_op0" + [(set (reg:CC_C CC_REGNUM) + (compare:CC_C + (plus:SI (match_operand:SI 0 "s_register_operand" "r,r") + (match_operand:SI 1 "arm_add_operand" "rI,L")) + (match_dup 0)))] + "TARGET_32BIT" + "@ + cmn%?\\t%0, %1 + cmp%?\\t%0, #%n1" + [(set_attr "conds" "set")] +) + +(define_insn "*compare_addsi2_op1" + [(set (reg:CC_C CC_REGNUM) + (compare:CC_C + (plus:SI (match_operand:SI 0 "s_register_operand" "r,r") + (match_operand:SI 1 "arm_add_operand" "rI,L")) + (match_dup 1)))] + "TARGET_32BIT" + "@ + cmn%?\\t%0, %1 + cmp%?\\t%0, #%n1" + [(set_attr "conds" "set")] +) + +(define_insn "*addsi3_carryin_" + [(set (match_operand:SI 0 "s_register_operand" "=r") + (plus:SI (plus:SI (match_operand:SI 1 "s_register_operand" "%r") + (match_operand:SI 2 "arm_rhs_operand" "rI")) + (LTUGEU:SI (reg: CC_REGNUM) (const_int 0))))] + "TARGET_32BIT" + "adc%?\\t%0, %1, %2" + [(set_attr "conds" "use")] +) + +(define_insn "*addsi3_carryin_alt2_" + [(set (match_operand:SI 0 "s_register_operand" "=r") + (plus:SI (plus:SI (LTUGEU:SI (reg: CC_REGNUM) (const_int 0)) + (match_operand:SI 1 "s_register_operand" "%r")) + (match_operand:SI 2 "arm_rhs_operand" "rI")))] + "TARGET_32BIT" + "adc%?\\t%0, %1, %2" + [(set_attr "conds" "use")] +) + +(define_insn "*addsi3_carryin_shift_" + [(set (match_operand:SI 0 "s_register_operand" "=r") + (plus:SI (plus:SI + (match_operator:SI 2 "shift_operator" + [(match_operand:SI 3 "s_register_operand" "r") + (match_operand:SI 4 "reg_or_int_operand" "rM")]) + (match_operand:SI 1 "s_register_operand" "r")) + (LTUGEU:SI (reg: CC_REGNUM) (const_int 0))))] + "TARGET_32BIT" + "adc%?\\t%0, %1, %3%S2" + [(set_attr "conds" "use") + (set (attr "type") (if_then_else (match_operand 4 "const_int_operand" "") + (const_string "alu_shift") + (const_string "alu_shift_reg")))] +) + +(define_expand "incscc" + [(set (match_operand:SI 0 "s_register_operand" "=r,r") + (plus:SI (match_operator:SI 2 "arm_comparison_operator" + [(match_operand:CC 3 "cc_register" "") (const_int 0)]) + (match_operand:SI 1 "s_register_operand" "0,?r")))] + "TARGET_32BIT" + "" +) + +(define_insn "*arm_incscc" + [(set (match_operand:SI 0 "s_register_operand" "=r,r") + (plus:SI (match_operator:SI 2 "arm_comparison_operator" + [(match_operand:CC 3 "cc_register" "") (const_int 0)]) + (match_operand:SI 1 "s_register_operand" "0,?r")))] + "TARGET_ARM" + "@ + add%d2\\t%0, %1, #1 + mov%D2\\t%0, %1\;add%d2\\t%0, %1, #1" + [(set_attr "conds" "use") + (set_attr "length" "4,8")] +) + +; transform ((x << y) - 1) to ~(~(x-1) << y) Where X is a constant. +(define_split + [(set (match_operand:SI 0 "s_register_operand" "") + (plus:SI (ashift:SI (match_operand:SI 1 "const_int_operand" "") + (match_operand:SI 2 "s_register_operand" "")) + (const_int -1))) + (clobber (match_operand:SI 3 "s_register_operand" ""))] + "TARGET_32BIT" + [(set (match_dup 3) (match_dup 1)) + (set (match_dup 0) (not:SI (ashift:SI (match_dup 3) (match_dup 2))))] + " + operands[1] = GEN_INT (~(INTVAL (operands[1]) - 1)); +") + +(define_expand "addsf3" + [(set (match_operand:SF 0 "s_register_operand" "") + (plus:SF (match_operand:SF 1 "s_register_operand" "") + (match_operand:SF 2 "arm_float_add_operand" "")))] + "TARGET_32BIT && TARGET_HARD_FLOAT" + " + if (TARGET_MAVERICK + && !cirrus_fp_register (operands[2], SFmode)) + operands[2] = force_reg (SFmode, operands[2]); +") + +(define_expand "adddf3" + [(set (match_operand:DF 0 "s_register_operand" "") + (plus:DF (match_operand:DF 1 "s_register_operand" "") + (match_operand:DF 2 "arm_float_add_operand" "")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && !TARGET_VFP_SINGLE" + " + if (TARGET_MAVERICK + && !cirrus_fp_register (operands[2], DFmode)) + operands[2] = force_reg (DFmode, operands[2]); +") + +(define_expand "subdi3" + [(parallel + [(set (match_operand:DI 0 "s_register_operand" "") + (minus:DI (match_operand:DI 1 "s_register_operand" "") + (match_operand:DI 2 "s_register_operand" ""))) + (clobber (reg:CC CC_REGNUM))])] + "TARGET_EITHER" + " + if (TARGET_HARD_FLOAT && TARGET_MAVERICK + && TARGET_32BIT + && cirrus_fp_register (operands[0], DImode) + && cirrus_fp_register (operands[1], DImode)) + { + emit_insn (gen_cirrus_subdi3 (operands[0], operands[1], operands[2])); + DONE; + } + + if (TARGET_THUMB1) + { + if (GET_CODE (operands[1]) != REG) + operands[1] = force_reg (DImode, operands[1]); + if (GET_CODE (operands[2]) != REG) + operands[2] = force_reg (DImode, operands[2]); + } + " +) + +(define_insn "*arm_subdi3" + [(set (match_operand:DI 0 "s_register_operand" "=&r,&r,&r") + (minus:DI (match_operand:DI 1 "s_register_operand" "0,r,0") + (match_operand:DI 2 "s_register_operand" "r,0,0"))) + (clobber (reg:CC CC_REGNUM))] + "TARGET_32BIT && !TARGET_NEON" + "subs\\t%Q0, %Q1, %Q2\;sbc\\t%R0, %R1, %R2" + [(set_attr "conds" "clob") + (set_attr "length" "8")] +) + +(define_insn "*thumb_subdi3" + [(set (match_operand:DI 0 "register_operand" "=l") + (minus:DI (match_operand:DI 1 "register_operand" "0") + (match_operand:DI 2 "register_operand" "l"))) + (clobber (reg:CC CC_REGNUM))] + "TARGET_THUMB1" + "sub\\t%Q0, %Q0, %Q2\;sbc\\t%R0, %R0, %R2" + [(set_attr "length" "4")] +) + +(define_insn "*subdi_di_zesidi" + [(set (match_operand:DI 0 "s_register_operand" "=&r,&r") + (minus:DI (match_operand:DI 1 "s_register_operand" "0,r") + (zero_extend:DI + (match_operand:SI 2 "s_register_operand" "r,r")))) + (clobber (reg:CC CC_REGNUM))] + "TARGET_32BIT" + "subs\\t%Q0, %Q1, %2\;sbc\\t%R0, %R1, #0" + [(set_attr "conds" "clob") + (set_attr "length" "8")] +) + +(define_insn "*subdi_di_sesidi" + [(set (match_operand:DI 0 "s_register_operand" "=&r,&r") + (minus:DI (match_operand:DI 1 "s_register_operand" "0,r") + (sign_extend:DI + (match_operand:SI 2 "s_register_operand" "r,r")))) + (clobber (reg:CC CC_REGNUM))] + "TARGET_32BIT" + "subs\\t%Q0, %Q1, %2\;sbc\\t%R0, %R1, %2, asr #31" + [(set_attr "conds" "clob") + (set_attr "length" "8")] +) + +(define_insn "*subdi_zesidi_di" + [(set (match_operand:DI 0 "s_register_operand" "=&r,&r") + (minus:DI (zero_extend:DI + (match_operand:SI 2 "s_register_operand" "r,r")) + (match_operand:DI 1 "s_register_operand" "0,r"))) + (clobber (reg:CC CC_REGNUM))] + "TARGET_ARM" + "rsbs\\t%Q0, %Q1, %2\;rsc\\t%R0, %R1, #0" + [(set_attr "conds" "clob") + (set_attr "length" "8")] +) + +(define_insn "*subdi_sesidi_di" + [(set (match_operand:DI 0 "s_register_operand" "=&r,&r") + (minus:DI (sign_extend:DI + (match_operand:SI 2 "s_register_operand" "r,r")) + (match_operand:DI 1 "s_register_operand" "0,r"))) + (clobber (reg:CC CC_REGNUM))] + "TARGET_ARM" + "rsbs\\t%Q0, %Q1, %2\;rsc\\t%R0, %R1, %2, asr #31" + [(set_attr "conds" "clob") + (set_attr "length" "8")] +) + +(define_insn "*subdi_zesidi_zesidi" + [(set (match_operand:DI 0 "s_register_operand" "=r") + (minus:DI (zero_extend:DI + (match_operand:SI 1 "s_register_operand" "r")) + (zero_extend:DI + (match_operand:SI 2 "s_register_operand" "r")))) + (clobber (reg:CC CC_REGNUM))] + "TARGET_32BIT" + "subs\\t%Q0, %1, %2\;sbc\\t%R0, %1, %1" + [(set_attr "conds" "clob") + (set_attr "length" "8")] +) + +(define_expand "subsi3" + [(set (match_operand:SI 0 "s_register_operand" "") + (minus:SI (match_operand:SI 1 "reg_or_int_operand" "") + (match_operand:SI 2 "s_register_operand" "")))] + "TARGET_EITHER" + " + if (GET_CODE (operands[1]) == CONST_INT) + { + if (TARGET_32BIT) + { + arm_split_constant (MINUS, SImode, NULL_RTX, + INTVAL (operands[1]), operands[0], + operands[2], optimize && can_create_pseudo_p ()); + DONE; + } + else /* TARGET_THUMB1 */ + operands[1] = force_reg (SImode, operands[1]); + } + " +) + +(define_insn "thumb1_subsi3_insn" + [(set (match_operand:SI 0 "register_operand" "=l") + (minus:SI (match_operand:SI 1 "register_operand" "l") + (match_operand:SI 2 "reg_or_int_operand" "lPd")))] + "TARGET_THUMB1" + "sub\\t%0, %1, %2" + [(set_attr "length" "2") + (set_attr "conds" "set")]) + +; ??? Check Thumb-2 split length +(define_insn_and_split "*arm_subsi3_insn" + [(set (match_operand:SI 0 "s_register_operand" "=r,r,rk,r") + (minus:SI (match_operand:SI 1 "reg_or_int_operand" "rI,r,k,?n") + (match_operand:SI 2 "reg_or_int_operand" "r,rI,r, r")))] + "TARGET_32BIT" + "@ + rsb%?\\t%0, %2, %1 + sub%?\\t%0, %1, %2 + sub%?\\t%0, %1, %2 + #" + "&& (GET_CODE (operands[1]) == CONST_INT + && !const_ok_for_arm (INTVAL (operands[1])))" + [(clobber (const_int 0))] + " + arm_split_constant (MINUS, SImode, curr_insn, + INTVAL (operands[1]), operands[0], operands[2], 0); + DONE; + " + [(set_attr "length" "4,4,4,16") + (set_attr "predicable" "yes")] +) + +(define_peephole2 + [(match_scratch:SI 3 "r") + (set (match_operand:SI 0 "arm_general_register_operand" "") + (minus:SI (match_operand:SI 1 "const_int_operand" "") + (match_operand:SI 2 "arm_general_register_operand" "")))] + "TARGET_32BIT + && !const_ok_for_arm (INTVAL (operands[1])) + && const_ok_for_arm (~INTVAL (operands[1]))" + [(set (match_dup 3) (match_dup 1)) + (set (match_dup 0) (minus:SI (match_dup 3) (match_dup 2)))] + "" +) + +(define_insn "*subsi3_compare0" + [(set (reg:CC_NOOV CC_REGNUM) + (compare:CC_NOOV + (minus:SI (match_operand:SI 1 "arm_rhs_operand" "r,I") + (match_operand:SI 2 "arm_rhs_operand" "rI,r")) + (const_int 0))) + (set (match_operand:SI 0 "s_register_operand" "=r,r") + (minus:SI (match_dup 1) (match_dup 2)))] + "TARGET_32BIT" + "@ + sub%.\\t%0, %1, %2 + rsb%.\\t%0, %2, %1" + [(set_attr "conds" "set")] +) + +(define_insn "*subsi3_compare" + [(set (reg:CC CC_REGNUM) + (compare:CC (match_operand:SI 1 "arm_rhs_operand" "r,I") + (match_operand:SI 2 "arm_rhs_operand" "rI,r"))) + (set (match_operand:SI 0 "s_register_operand" "=r,r") + (minus:SI (match_dup 1) (match_dup 2)))] + "TARGET_32BIT" + "@ + sub%.\\t%0, %1, %2 + rsb%.\\t%0, %2, %1" + [(set_attr "conds" "set")] +) + +(define_expand "decscc" + [(set (match_operand:SI 0 "s_register_operand" "=r,r") + (minus:SI (match_operand:SI 1 "s_register_operand" "0,?r") + (match_operator:SI 2 "arm_comparison_operator" + [(match_operand 3 "cc_register" "") (const_int 0)])))] + "TARGET_32BIT" + "" +) + +(define_insn "*arm_decscc" + [(set (match_operand:SI 0 "s_register_operand" "=r,r") + (minus:SI (match_operand:SI 1 "s_register_operand" "0,?r") + (match_operator:SI 2 "arm_comparison_operator" + [(match_operand 3 "cc_register" "") (const_int 0)])))] + "TARGET_ARM" + "@ + sub%d2\\t%0, %1, #1 + mov%D2\\t%0, %1\;sub%d2\\t%0, %1, #1" + [(set_attr "conds" "use") + (set_attr "length" "*,8")] +) + +(define_expand "subsf3" + [(set (match_operand:SF 0 "s_register_operand" "") + (minus:SF (match_operand:SF 1 "arm_float_rhs_operand" "") + (match_operand:SF 2 "arm_float_rhs_operand" "")))] + "TARGET_32BIT && TARGET_HARD_FLOAT" + " + if (TARGET_MAVERICK) + { + if (!cirrus_fp_register (operands[1], SFmode)) + operands[1] = force_reg (SFmode, operands[1]); + if (!cirrus_fp_register (operands[2], SFmode)) + operands[2] = force_reg (SFmode, operands[2]); + } +") + +(define_expand "subdf3" + [(set (match_operand:DF 0 "s_register_operand" "") + (minus:DF (match_operand:DF 1 "arm_float_rhs_operand" "") + (match_operand:DF 2 "arm_float_rhs_operand" "")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && !TARGET_VFP_SINGLE" + " + if (TARGET_MAVERICK) + { + if (!cirrus_fp_register (operands[1], DFmode)) + operands[1] = force_reg (DFmode, operands[1]); + if (!cirrus_fp_register (operands[2], DFmode)) + operands[2] = force_reg (DFmode, operands[2]); + } +") + + +;; Multiplication insns + +(define_expand "mulsi3" + [(set (match_operand:SI 0 "s_register_operand" "") + (mult:SI (match_operand:SI 2 "s_register_operand" "") + (match_operand:SI 1 "s_register_operand" "")))] + "TARGET_EITHER" + "" +) + +;; Use `&' and then `0' to prevent the operands 0 and 1 being the same +(define_insn "*arm_mulsi3" + [(set (match_operand:SI 0 "s_register_operand" "=&r,&r") + (mult:SI (match_operand:SI 2 "s_register_operand" "r,r") + (match_operand:SI 1 "s_register_operand" "%0,r")))] + "TARGET_32BIT && !arm_arch6" + "mul%?\\t%0, %2, %1" + [(set_attr "insn" "mul") + (set_attr "predicable" "yes")] +) + +(define_insn "*arm_mulsi3_v6" + [(set (match_operand:SI 0 "s_register_operand" "=r") + (mult:SI (match_operand:SI 1 "s_register_operand" "r") + (match_operand:SI 2 "s_register_operand" "r")))] + "TARGET_32BIT && arm_arch6" + "mul%?\\t%0, %1, %2" + [(set_attr "insn" "mul") + (set_attr "predicable" "yes")] +) + +; Unfortunately with the Thumb the '&'/'0' trick can fails when operands +; 1 and 2; are the same, because reload will make operand 0 match +; operand 1 without realizing that this conflicts with operand 2. We fix +; this by adding another alternative to match this case, and then `reload' +; it ourselves. This alternative must come first. +(define_insn "*thumb_mulsi3" + [(set (match_operand:SI 0 "register_operand" "=&l,&l,&l") + (mult:SI (match_operand:SI 1 "register_operand" "%l,*h,0") + (match_operand:SI 2 "register_operand" "l,l,l")))] + "TARGET_THUMB1 && !arm_arch6" + "* + if (which_alternative < 2) + return \"mov\\t%0, %1\;mul\\t%0, %2\"; + else + return \"mul\\t%0, %2\"; + " + [(set_attr "length" "4,4,2") + (set_attr "insn" "mul")] +) + +(define_insn "*thumb_mulsi3_v6" + [(set (match_operand:SI 0 "register_operand" "=l,l,l") + (mult:SI (match_operand:SI 1 "register_operand" "0,l,0") + (match_operand:SI 2 "register_operand" "l,0,0")))] + "TARGET_THUMB1 && arm_arch6" + "@ + mul\\t%0, %2 + mul\\t%0, %1 + mul\\t%0, %1" + [(set_attr "length" "2") + (set_attr "insn" "mul")] +) + +(define_insn "*mulsi3_compare0" + [(set (reg:CC_NOOV CC_REGNUM) + (compare:CC_NOOV (mult:SI + (match_operand:SI 2 "s_register_operand" "r,r") + (match_operand:SI 1 "s_register_operand" "%0,r")) + (const_int 0))) + (set (match_operand:SI 0 "s_register_operand" "=&r,&r") + (mult:SI (match_dup 2) (match_dup 1)))] + "TARGET_ARM && !arm_arch6" + "mul%.\\t%0, %2, %1" + [(set_attr "conds" "set") + (set_attr "insn" "muls")] +) + +(define_insn "*mulsi3_compare0_v6" + [(set (reg:CC_NOOV CC_REGNUM) + (compare:CC_NOOV (mult:SI + (match_operand:SI 2 "s_register_operand" "r") + (match_operand:SI 1 "s_register_operand" "r")) + (const_int 0))) + (set (match_operand:SI 0 "s_register_operand" "=r") + (mult:SI (match_dup 2) (match_dup 1)))] + "TARGET_ARM && arm_arch6 && optimize_size" + "mul%.\\t%0, %2, %1" + [(set_attr "conds" "set") + (set_attr "insn" "muls")] +) + +(define_insn "*mulsi_compare0_scratch" + [(set (reg:CC_NOOV CC_REGNUM) + (compare:CC_NOOV (mult:SI + (match_operand:SI 2 "s_register_operand" "r,r") + (match_operand:SI 1 "s_register_operand" "%0,r")) + (const_int 0))) + (clobber (match_scratch:SI 0 "=&r,&r"))] + "TARGET_ARM && !arm_arch6" + "mul%.\\t%0, %2, %1" + [(set_attr "conds" "set") + (set_attr "insn" "muls")] +) + +(define_insn "*mulsi_compare0_scratch_v6" + [(set (reg:CC_NOOV CC_REGNUM) + (compare:CC_NOOV (mult:SI + (match_operand:SI 2 "s_register_operand" "r") + (match_operand:SI 1 "s_register_operand" "r")) + (const_int 0))) + (clobber (match_scratch:SI 0 "=r"))] + "TARGET_ARM && arm_arch6 && optimize_size" + "mul%.\\t%0, %2, %1" + [(set_attr "conds" "set") + (set_attr "insn" "muls")] +) + +;; Unnamed templates to match MLA instruction. + +(define_insn "*mulsi3addsi" + [(set (match_operand:SI 0 "s_register_operand" "=&r,&r,&r,&r") + (plus:SI + (mult:SI (match_operand:SI 2 "s_register_operand" "r,r,r,r") + (match_operand:SI 1 "s_register_operand" "%0,r,0,r")) + (match_operand:SI 3 "s_register_operand" "r,r,0,0")))] + "TARGET_32BIT && !arm_arch6" + "mla%?\\t%0, %2, %1, %3" + [(set_attr "insn" "mla") + (set_attr "predicable" "yes")] +) + +(define_insn "*mulsi3addsi_v6" + [(set (match_operand:SI 0 "s_register_operand" "=r") + (plus:SI + (mult:SI (match_operand:SI 2 "s_register_operand" "r") + (match_operand:SI 1 "s_register_operand" "r")) + (match_operand:SI 3 "s_register_operand" "r")))] + "TARGET_32BIT && arm_arch6" + "mla%?\\t%0, %2, %1, %3" + [(set_attr "insn" "mla") + (set_attr "predicable" "yes")] +) + +(define_insn "*mulsi3addsi_compare0" + [(set (reg:CC_NOOV CC_REGNUM) + (compare:CC_NOOV + (plus:SI (mult:SI + (match_operand:SI 2 "s_register_operand" "r,r,r,r") + (match_operand:SI 1 "s_register_operand" "%0,r,0,r")) + (match_operand:SI 3 "s_register_operand" "r,r,0,0")) + (const_int 0))) + (set (match_operand:SI 0 "s_register_operand" "=&r,&r,&r,&r") + (plus:SI (mult:SI (match_dup 2) (match_dup 1)) + (match_dup 3)))] + "TARGET_ARM && arm_arch6" + "mla%.\\t%0, %2, %1, %3" + [(set_attr "conds" "set") + (set_attr "insn" "mlas")] +) + +(define_insn "*mulsi3addsi_compare0_v6" + [(set (reg:CC_NOOV CC_REGNUM) + (compare:CC_NOOV + (plus:SI (mult:SI + (match_operand:SI 2 "s_register_operand" "r") + (match_operand:SI 1 "s_register_operand" "r")) + (match_operand:SI 3 "s_register_operand" "r")) + (const_int 0))) + (set (match_operand:SI 0 "s_register_operand" "=r") + (plus:SI (mult:SI (match_dup 2) (match_dup 1)) + (match_dup 3)))] + "TARGET_ARM && arm_arch6 && optimize_size" + "mla%.\\t%0, %2, %1, %3" + [(set_attr "conds" "set") + (set_attr "insn" "mlas")] +) + +(define_insn "*mulsi3addsi_compare0_scratch" + [(set (reg:CC_NOOV CC_REGNUM) + (compare:CC_NOOV + (plus:SI (mult:SI + (match_operand:SI 2 "s_register_operand" "r,r,r,r") + (match_operand:SI 1 "s_register_operand" "%0,r,0,r")) + (match_operand:SI 3 "s_register_operand" "?r,r,0,0")) + (const_int 0))) + (clobber (match_scratch:SI 0 "=&r,&r,&r,&r"))] + "TARGET_ARM && !arm_arch6" + "mla%.\\t%0, %2, %1, %3" + [(set_attr "conds" "set") + (set_attr "insn" "mlas")] +) + +(define_insn "*mulsi3addsi_compare0_scratch_v6" + [(set (reg:CC_NOOV CC_REGNUM) + (compare:CC_NOOV + (plus:SI (mult:SI + (match_operand:SI 2 "s_register_operand" "r") + (match_operand:SI 1 "s_register_operand" "r")) + (match_operand:SI 3 "s_register_operand" "r")) + (const_int 0))) + (clobber (match_scratch:SI 0 "=r"))] + "TARGET_ARM && arm_arch6 && optimize_size" + "mla%.\\t%0, %2, %1, %3" + [(set_attr "conds" "set") + (set_attr "insn" "mlas")] +) + +(define_insn "*mulsi3subsi" + [(set (match_operand:SI 0 "s_register_operand" "=r") + (minus:SI + (match_operand:SI 3 "s_register_operand" "r") + (mult:SI (match_operand:SI 2 "s_register_operand" "r") + (match_operand:SI 1 "s_register_operand" "r"))))] + "TARGET_32BIT && arm_arch_thumb2" + "mls%?\\t%0, %2, %1, %3" + [(set_attr "insn" "mla") + (set_attr "predicable" "yes")] +) + +(define_expand "maddsidi4" + [(set (match_operand:DI 0 "s_register_operand" "") + (plus:DI + (mult:DI + (sign_extend:DI (match_operand:SI 1 "s_register_operand" "")) + (sign_extend:DI (match_operand:SI 2 "s_register_operand" ""))) + (match_operand:DI 3 "s_register_operand" "")))] + "TARGET_32BIT && arm_arch3m" + "") + +(define_insn "*mulsidi3adddi" + [(set (match_operand:DI 0 "s_register_operand" "=&r") + (plus:DI + (mult:DI + (sign_extend:DI (match_operand:SI 2 "s_register_operand" "%r")) + (sign_extend:DI (match_operand:SI 3 "s_register_operand" "r"))) + (match_operand:DI 1 "s_register_operand" "0")))] + "TARGET_32BIT && arm_arch3m && !arm_arch6" + "smlal%?\\t%Q0, %R0, %3, %2" + [(set_attr "insn" "smlal") + (set_attr "predicable" "yes")] +) + +(define_insn "*mulsidi3adddi_v6" + [(set (match_operand:DI 0 "s_register_operand" "=r") + (plus:DI + (mult:DI + (sign_extend:DI (match_operand:SI 2 "s_register_operand" "r")) + (sign_extend:DI (match_operand:SI 3 "s_register_operand" "r"))) + (match_operand:DI 1 "s_register_operand" "0")))] + "TARGET_32BIT && arm_arch6" + "smlal%?\\t%Q0, %R0, %3, %2" + [(set_attr "insn" "smlal") + (set_attr "predicable" "yes")] +) + +;; 32x32->64 widening multiply. +;; As with mulsi3, the only difference between the v3-5 and v6+ +;; versions of these patterns is the requirement that the output not +;; overlap the inputs, but that still means we have to have a named +;; expander and two different starred insns. + +(define_expand "mulsidi3" + [(set (match_operand:DI 0 "s_register_operand" "") + (mult:DI + (sign_extend:DI (match_operand:SI 1 "s_register_operand" "")) + (sign_extend:DI (match_operand:SI 2 "s_register_operand" ""))))] + "TARGET_32BIT && arm_arch3m" + "" +) + +(define_insn "*mulsidi3_nov6" + [(set (match_operand:DI 0 "s_register_operand" "=&r") + (mult:DI + (sign_extend:DI (match_operand:SI 1 "s_register_operand" "%r")) + (sign_extend:DI (match_operand:SI 2 "s_register_operand" "r"))))] + "TARGET_32BIT && arm_arch3m && !arm_arch6" + "smull%?\\t%Q0, %R0, %1, %2" + [(set_attr "insn" "smull") + (set_attr "predicable" "yes")] +) + +(define_insn "*mulsidi3_v6" + [(set (match_operand:DI 0 "s_register_operand" "=r") + (mult:DI + (sign_extend:DI (match_operand:SI 1 "s_register_operand" "r")) + (sign_extend:DI (match_operand:SI 2 "s_register_operand" "r"))))] + "TARGET_32BIT && arm_arch6" + "smull%?\\t%Q0, %R0, %1, %2" + [(set_attr "insn" "smull") + (set_attr "predicable" "yes")] +) + +(define_expand "umulsidi3" + [(set (match_operand:DI 0 "s_register_operand" "") + (mult:DI + (zero_extend:DI (match_operand:SI 1 "s_register_operand" "")) + (zero_extend:DI (match_operand:SI 2 "s_register_operand" ""))))] + "TARGET_32BIT && arm_arch3m" + "" +) + +(define_insn "*umulsidi3_nov6" + [(set (match_operand:DI 0 "s_register_operand" "=&r") + (mult:DI + (zero_extend:DI (match_operand:SI 1 "s_register_operand" "%r")) + (zero_extend:DI (match_operand:SI 2 "s_register_operand" "r"))))] + "TARGET_32BIT && arm_arch3m && !arm_arch6" + "umull%?\\t%Q0, %R0, %1, %2" + [(set_attr "insn" "umull") + (set_attr "predicable" "yes")] +) + +(define_insn "*umulsidi3_v6" + [(set (match_operand:DI 0 "s_register_operand" "=r") + (mult:DI + (zero_extend:DI (match_operand:SI 1 "s_register_operand" "r")) + (zero_extend:DI (match_operand:SI 2 "s_register_operand" "r"))))] + "TARGET_32BIT && arm_arch6" + "umull%?\\t%Q0, %R0, %1, %2" + [(set_attr "insn" "umull") + (set_attr "predicable" "yes")] +) + +(define_expand "umaddsidi4" + [(set (match_operand:DI 0 "s_register_operand" "") + (plus:DI + (mult:DI + (zero_extend:DI (match_operand:SI 1 "s_register_operand" "")) + (zero_extend:DI (match_operand:SI 2 "s_register_operand" ""))) + (match_operand:DI 3 "s_register_operand" "")))] + "TARGET_32BIT && arm_arch3m" + "") + +(define_insn "*umulsidi3adddi" + [(set (match_operand:DI 0 "s_register_operand" "=&r") + (plus:DI + (mult:DI + (zero_extend:DI (match_operand:SI 2 "s_register_operand" "%r")) + (zero_extend:DI (match_operand:SI 3 "s_register_operand" "r"))) + (match_operand:DI 1 "s_register_operand" "0")))] + "TARGET_32BIT && arm_arch3m && !arm_arch6" + "umlal%?\\t%Q0, %R0, %3, %2" + [(set_attr "insn" "umlal") + (set_attr "predicable" "yes")] +) + +(define_insn "*umulsidi3adddi_v6" + [(set (match_operand:DI 0 "s_register_operand" "=r") + (plus:DI + (mult:DI + (zero_extend:DI (match_operand:SI 2 "s_register_operand" "r")) + (zero_extend:DI (match_operand:SI 3 "s_register_operand" "r"))) + (match_operand:DI 1 "s_register_operand" "0")))] + "TARGET_32BIT && arm_arch6" + "umlal%?\\t%Q0, %R0, %3, %2" + [(set_attr "insn" "umlal") + (set_attr "predicable" "yes")] +) + +(define_expand "smulsi3_highpart" + [(parallel + [(set (match_operand:SI 0 "s_register_operand" "") + (truncate:SI + (lshiftrt:DI + (mult:DI + (sign_extend:DI (match_operand:SI 1 "s_register_operand" "")) + (sign_extend:DI (match_operand:SI 2 "s_register_operand" ""))) + (const_int 32)))) + (clobber (match_scratch:SI 3 ""))])] + "TARGET_32BIT && arm_arch3m" + "" +) + +(define_insn "*smulsi3_highpart_nov6" + [(set (match_operand:SI 0 "s_register_operand" "=&r,&r") + (truncate:SI + (lshiftrt:DI + (mult:DI + (sign_extend:DI (match_operand:SI 1 "s_register_operand" "%0,r")) + (sign_extend:DI (match_operand:SI 2 "s_register_operand" "r,r"))) + (const_int 32)))) + (clobber (match_scratch:SI 3 "=&r,&r"))] + "TARGET_32BIT && arm_arch3m && !arm_arch6" + "smull%?\\t%3, %0, %2, %1" + [(set_attr "insn" "smull") + (set_attr "predicable" "yes")] +) + +(define_insn "*smulsi3_highpart_v6" + [(set (match_operand:SI 0 "s_register_operand" "=r") + (truncate:SI + (lshiftrt:DI + (mult:DI + (sign_extend:DI (match_operand:SI 1 "s_register_operand" "r")) + (sign_extend:DI (match_operand:SI 2 "s_register_operand" "r"))) + (const_int 32)))) + (clobber (match_scratch:SI 3 "=r"))] + "TARGET_32BIT && arm_arch6" + "smull%?\\t%3, %0, %2, %1" + [(set_attr "insn" "smull") + (set_attr "predicable" "yes")] +) + +(define_expand "umulsi3_highpart" + [(parallel + [(set (match_operand:SI 0 "s_register_operand" "") + (truncate:SI + (lshiftrt:DI + (mult:DI + (zero_extend:DI (match_operand:SI 1 "s_register_operand" "")) + (zero_extend:DI (match_operand:SI 2 "s_register_operand" ""))) + (const_int 32)))) + (clobber (match_scratch:SI 3 ""))])] + "TARGET_32BIT && arm_arch3m" + "" +) + +(define_insn "*umulsi3_highpart_nov6" + [(set (match_operand:SI 0 "s_register_operand" "=&r,&r") + (truncate:SI + (lshiftrt:DI + (mult:DI + (zero_extend:DI (match_operand:SI 1 "s_register_operand" "%0,r")) + (zero_extend:DI (match_operand:SI 2 "s_register_operand" "r,r"))) + (const_int 32)))) + (clobber (match_scratch:SI 3 "=&r,&r"))] + "TARGET_32BIT && arm_arch3m && !arm_arch6" + "umull%?\\t%3, %0, %2, %1" + [(set_attr "insn" "umull") + (set_attr "predicable" "yes")] +) + +(define_insn "*umulsi3_highpart_v6" + [(set (match_operand:SI 0 "s_register_operand" "=r") + (truncate:SI + (lshiftrt:DI + (mult:DI + (zero_extend:DI (match_operand:SI 1 "s_register_operand" "r")) + (zero_extend:DI (match_operand:SI 2 "s_register_operand" "r"))) + (const_int 32)))) + (clobber (match_scratch:SI 3 "=r"))] + "TARGET_32BIT && arm_arch6" + "umull%?\\t%3, %0, %2, %1" + [(set_attr "insn" "umull") + (set_attr "predicable" "yes")] +) + +(define_insn "mulhisi3" + [(set (match_operand:SI 0 "s_register_operand" "=r") + (mult:SI (sign_extend:SI + (match_operand:HI 1 "s_register_operand" "%r")) + (sign_extend:SI + (match_operand:HI 2 "s_register_operand" "r"))))] + "TARGET_DSP_MULTIPLY" + "smulbb%?\\t%0, %1, %2" + [(set_attr "insn" "smulxy") + (set_attr "predicable" "yes")] +) + +(define_insn "*mulhisi3tb" + [(set (match_operand:SI 0 "s_register_operand" "=r") + (mult:SI (ashiftrt:SI + (match_operand:SI 1 "s_register_operand" "r") + (const_int 16)) + (sign_extend:SI + (match_operand:HI 2 "s_register_operand" "r"))))] + "TARGET_DSP_MULTIPLY" + "smultb%?\\t%0, %1, %2" + [(set_attr "insn" "smulxy") + (set_attr "predicable" "yes")] +) + +(define_insn "*mulhisi3bt" + [(set (match_operand:SI 0 "s_register_operand" "=r") + (mult:SI (sign_extend:SI + (match_operand:HI 1 "s_register_operand" "r")) + (ashiftrt:SI + (match_operand:SI 2 "s_register_operand" "r") + (const_int 16))))] + "TARGET_DSP_MULTIPLY" + "smulbt%?\\t%0, %1, %2" + [(set_attr "insn" "smulxy") + (set_attr "predicable" "yes")] +) + +(define_insn "*mulhisi3tt" + [(set (match_operand:SI 0 "s_register_operand" "=r") + (mult:SI (ashiftrt:SI + (match_operand:SI 1 "s_register_operand" "r") + (const_int 16)) + (ashiftrt:SI + (match_operand:SI 2 "s_register_operand" "r") + (const_int 16))))] + "TARGET_DSP_MULTIPLY" + "smultt%?\\t%0, %1, %2" + [(set_attr "insn" "smulxy") + (set_attr "predicable" "yes")] +) + +(define_insn "maddhisi4" + [(set (match_operand:SI 0 "s_register_operand" "=r") + (plus:SI (mult:SI (sign_extend:SI + (match_operand:HI 1 "s_register_operand" "r")) + (sign_extend:SI + (match_operand:HI 2 "s_register_operand" "r"))) + (match_operand:SI 3 "s_register_operand" "r")))] + "TARGET_DSP_MULTIPLY" + "smlabb%?\\t%0, %1, %2, %3" + [(set_attr "insn" "smlaxy") + (set_attr "predicable" "yes")] +) + +(define_insn "*maddhidi4" + [(set (match_operand:DI 0 "s_register_operand" "=r") + (plus:DI + (mult:DI (sign_extend:DI + (match_operand:HI 1 "s_register_operand" "r")) + (sign_extend:DI + (match_operand:HI 2 "s_register_operand" "r"))) + (match_operand:DI 3 "s_register_operand" "0")))] + "TARGET_DSP_MULTIPLY" + "smlalbb%?\\t%Q0, %R0, %1, %2" + [(set_attr "insn" "smlalxy") + (set_attr "predicable" "yes")]) + +(define_expand "mulsf3" + [(set (match_operand:SF 0 "s_register_operand" "") + (mult:SF (match_operand:SF 1 "s_register_operand" "") + (match_operand:SF 2 "arm_float_rhs_operand" "")))] + "TARGET_32BIT && TARGET_HARD_FLOAT" + " + if (TARGET_MAVERICK + && !cirrus_fp_register (operands[2], SFmode)) + operands[2] = force_reg (SFmode, operands[2]); +") + +(define_expand "muldf3" + [(set (match_operand:DF 0 "s_register_operand" "") + (mult:DF (match_operand:DF 1 "s_register_operand" "") + (match_operand:DF 2 "arm_float_rhs_operand" "")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && !TARGET_VFP_SINGLE" + " + if (TARGET_MAVERICK + && !cirrus_fp_register (operands[2], DFmode)) + operands[2] = force_reg (DFmode, operands[2]); +") + +;; Division insns + +(define_expand "divsf3" + [(set (match_operand:SF 0 "s_register_operand" "") + (div:SF (match_operand:SF 1 "arm_float_rhs_operand" "") + (match_operand:SF 2 "arm_float_rhs_operand" "")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && (TARGET_FPA || TARGET_VFP)" + "") + +(define_expand "divdf3" + [(set (match_operand:DF 0 "s_register_operand" "") + (div:DF (match_operand:DF 1 "arm_float_rhs_operand" "") + (match_operand:DF 2 "arm_float_rhs_operand" "")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && (TARGET_FPA || TARGET_VFP_DOUBLE)" + "") + +;; Modulo insns + +(define_expand "modsf3" + [(set (match_operand:SF 0 "s_register_operand" "") + (mod:SF (match_operand:SF 1 "s_register_operand" "") + (match_operand:SF 2 "arm_float_rhs_operand" "")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA" + "") + +(define_expand "moddf3" + [(set (match_operand:DF 0 "s_register_operand" "") + (mod:DF (match_operand:DF 1 "s_register_operand" "") + (match_operand:DF 2 "arm_float_rhs_operand" "")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA" + "") + +;; Boolean and,ior,xor insns + +;; Split up double word logical operations + +;; Split up simple DImode logical operations. Simply perform the logical +;; operation on the upper and lower halves of the registers. +(define_split + [(set (match_operand:DI 0 "s_register_operand" "") + (match_operator:DI 6 "logical_binary_operator" + [(match_operand:DI 1 "s_register_operand" "") + (match_operand:DI 2 "s_register_operand" "")]))] + "TARGET_32BIT && reload_completed + && ! (TARGET_NEON && IS_VFP_REGNUM (REGNO (operands[0]))) + && ! IS_IWMMXT_REGNUM (REGNO (operands[0]))" + [(set (match_dup 0) (match_op_dup:SI 6 [(match_dup 1) (match_dup 2)])) + (set (match_dup 3) (match_op_dup:SI 6 [(match_dup 4) (match_dup 5)]))] + " + { + operands[3] = gen_highpart (SImode, operands[0]); + operands[0] = gen_lowpart (SImode, operands[0]); + operands[4] = gen_highpart (SImode, operands[1]); + operands[1] = gen_lowpart (SImode, operands[1]); + operands[5] = gen_highpart (SImode, operands[2]); + operands[2] = gen_lowpart (SImode, operands[2]); + }" +) + +(define_split + [(set (match_operand:DI 0 "s_register_operand" "") + (match_operator:DI 6 "logical_binary_operator" + [(sign_extend:DI (match_operand:SI 2 "s_register_operand" "")) + (match_operand:DI 1 "s_register_operand" "")]))] + "TARGET_32BIT && reload_completed" + [(set (match_dup 0) (match_op_dup:SI 6 [(match_dup 1) (match_dup 2)])) + (set (match_dup 3) (match_op_dup:SI 6 + [(ashiftrt:SI (match_dup 2) (const_int 31)) + (match_dup 4)]))] + " + { + operands[3] = gen_highpart (SImode, operands[0]); + operands[0] = gen_lowpart (SImode, operands[0]); + operands[4] = gen_highpart (SImode, operands[1]); + operands[1] = gen_lowpart (SImode, operands[1]); + operands[5] = gen_highpart (SImode, operands[2]); + operands[2] = gen_lowpart (SImode, operands[2]); + }" +) + +;; The zero extend of operand 2 means we can just copy the high part of +;; operand1 into operand0. +(define_split + [(set (match_operand:DI 0 "s_register_operand" "") + (ior:DI + (zero_extend:DI (match_operand:SI 2 "s_register_operand" "")) + (match_operand:DI 1 "s_register_operand" "")))] + "TARGET_32BIT && operands[0] != operands[1] && reload_completed" + [(set (match_dup 0) (ior:SI (match_dup 1) (match_dup 2))) + (set (match_dup 3) (match_dup 4))] + " + { + operands[4] = gen_highpart (SImode, operands[1]); + operands[3] = gen_highpart (SImode, operands[0]); + operands[0] = gen_lowpart (SImode, operands[0]); + operands[1] = gen_lowpart (SImode, operands[1]); + }" +) + +;; The zero extend of operand 2 means we can just copy the high part of +;; operand1 into operand0. +(define_split + [(set (match_operand:DI 0 "s_register_operand" "") + (xor:DI + (zero_extend:DI (match_operand:SI 2 "s_register_operand" "")) + (match_operand:DI 1 "s_register_operand" "")))] + "TARGET_32BIT && operands[0] != operands[1] && reload_completed" + [(set (match_dup 0) (xor:SI (match_dup 1) (match_dup 2))) + (set (match_dup 3) (match_dup 4))] + " + { + operands[4] = gen_highpart (SImode, operands[1]); + operands[3] = gen_highpart (SImode, operands[0]); + operands[0] = gen_lowpart (SImode, operands[0]); + operands[1] = gen_lowpart (SImode, operands[1]); + }" +) + +(define_expand "anddi3" + [(set (match_operand:DI 0 "s_register_operand" "") + (and:DI (match_operand:DI 1 "s_register_operand" "") + (match_operand:DI 2 "neon_inv_logic_op2" "")))] + "TARGET_32BIT" + "" +) + +(define_insn "*anddi3_insn" + [(set (match_operand:DI 0 "s_register_operand" "=&r,&r") + (and:DI (match_operand:DI 1 "s_register_operand" "%0,r") + (match_operand:DI 2 "s_register_operand" "r,r")))] + "TARGET_32BIT && !TARGET_IWMMXT && !TARGET_NEON" + "#" + [(set_attr "length" "8")] +) + +(define_insn_and_split "*anddi_zesidi_di" + [(set (match_operand:DI 0 "s_register_operand" "=&r,&r") + (and:DI (zero_extend:DI + (match_operand:SI 2 "s_register_operand" "r,r")) + (match_operand:DI 1 "s_register_operand" "0,r")))] + "TARGET_32BIT" + "#" + "TARGET_32BIT && reload_completed" + ; The zero extend of operand 2 clears the high word of the output + ; operand. + [(set (match_dup 0) (and:SI (match_dup 1) (match_dup 2))) + (set (match_dup 3) (const_int 0))] + " + { + operands[3] = gen_highpart (SImode, operands[0]); + operands[0] = gen_lowpart (SImode, operands[0]); + operands[1] = gen_lowpart (SImode, operands[1]); + }" + [(set_attr "length" "8")] +) + +(define_insn "*anddi_sesdi_di" + [(set (match_operand:DI 0 "s_register_operand" "=&r,&r") + (and:DI (sign_extend:DI + (match_operand:SI 2 "s_register_operand" "r,r")) + (match_operand:DI 1 "s_register_operand" "0,r")))] + "TARGET_32BIT" + "#" + [(set_attr "length" "8")] +) + +(define_expand "andsi3" + [(set (match_operand:SI 0 "s_register_operand" "") + (and:SI (match_operand:SI 1 "s_register_operand" "") + (match_operand:SI 2 "reg_or_int_operand" "")))] + "TARGET_EITHER" + " + if (TARGET_32BIT) + { + if (GET_CODE (operands[2]) == CONST_INT) + { + if (INTVAL (operands[2]) == 255 && arm_arch6) + { + operands[1] = convert_to_mode (QImode, operands[1], 1); + emit_insn (gen_thumb2_zero_extendqisi2_v6 (operands[0], + operands[1])); + } + else + arm_split_constant (AND, SImode, NULL_RTX, + INTVAL (operands[2]), operands[0], + operands[1], + optimize && can_create_pseudo_p ()); + + DONE; + } + } + else /* TARGET_THUMB1 */ + { + if (GET_CODE (operands[2]) != CONST_INT) + { + rtx tmp = force_reg (SImode, operands[2]); + if (rtx_equal_p (operands[0], operands[1])) + operands[2] = tmp; + else + { + operands[2] = operands[1]; + operands[1] = tmp; + } + } + else + { + int i; + + if (((unsigned HOST_WIDE_INT) ~INTVAL (operands[2])) < 256) + { + operands[2] = force_reg (SImode, + GEN_INT (~INTVAL (operands[2]))); + + emit_insn (gen_thumb1_bicsi3 (operands[0], operands[2], operands[1])); + + DONE; + } + + for (i = 9; i <= 31; i++) + { + if ((((HOST_WIDE_INT) 1) << i) - 1 == INTVAL (operands[2])) + { + emit_insn (gen_extzv (operands[0], operands[1], GEN_INT (i), + const0_rtx)); + DONE; + } + else if ((((HOST_WIDE_INT) 1) << i) - 1 + == ~INTVAL (operands[2])) + { + rtx shift = GEN_INT (i); + rtx reg = gen_reg_rtx (SImode); + + emit_insn (gen_lshrsi3 (reg, operands[1], shift)); + emit_insn (gen_ashlsi3 (operands[0], reg, shift)); + + DONE; + } + } + + operands[2] = force_reg (SImode, operands[2]); + } + } + " +) + +; ??? Check split length for Thumb-2 +(define_insn_and_split "*arm_andsi3_insn" + [(set (match_operand:SI 0 "s_register_operand" "=r,r,r") + (and:SI (match_operand:SI 1 "s_register_operand" "r,r,r") + (match_operand:SI 2 "reg_or_int_operand" "rI,K,?n")))] + "TARGET_32BIT" + "@ + and%?\\t%0, %1, %2 + bic%?\\t%0, %1, #%B2 + #" + "TARGET_32BIT + && GET_CODE (operands[2]) == CONST_INT + && !(const_ok_for_arm (INTVAL (operands[2])) + || const_ok_for_arm (~INTVAL (operands[2])))" + [(clobber (const_int 0))] + " + arm_split_constant (AND, SImode, curr_insn, + INTVAL (operands[2]), operands[0], operands[1], 0); + DONE; + " + [(set_attr "length" "4,4,16") + (set_attr "predicable" "yes")] +) + +(define_insn "*thumb1_andsi3_insn" + [(set (match_operand:SI 0 "register_operand" "=l") + (and:SI (match_operand:SI 1 "register_operand" "%0") + (match_operand:SI 2 "register_operand" "l")))] + "TARGET_THUMB1" + "and\\t%0, %2" + [(set_attr "length" "2") + (set_attr "conds" "set")]) + +(define_insn "*andsi3_compare0" + [(set (reg:CC_NOOV CC_REGNUM) + (compare:CC_NOOV + (and:SI (match_operand:SI 1 "s_register_operand" "r,r") + (match_operand:SI 2 "arm_not_operand" "rI,K")) + (const_int 0))) + (set (match_operand:SI 0 "s_register_operand" "=r,r") + (and:SI (match_dup 1) (match_dup 2)))] + "TARGET_32BIT" + "@ + and%.\\t%0, %1, %2 + bic%.\\t%0, %1, #%B2" + [(set_attr "conds" "set")] +) + +(define_insn "*andsi3_compare0_scratch" + [(set (reg:CC_NOOV CC_REGNUM) + (compare:CC_NOOV + (and:SI (match_operand:SI 0 "s_register_operand" "r,r") + (match_operand:SI 1 "arm_not_operand" "rI,K")) + (const_int 0))) + (clobber (match_scratch:SI 2 "=X,r"))] + "TARGET_32BIT" + "@ + tst%?\\t%0, %1 + bic%.\\t%2, %0, #%B1" + [(set_attr "conds" "set")] +) + +(define_insn "*zeroextractsi_compare0_scratch" + [(set (reg:CC_NOOV CC_REGNUM) + (compare:CC_NOOV (zero_extract:SI + (match_operand:SI 0 "s_register_operand" "r") + (match_operand 1 "const_int_operand" "n") + (match_operand 2 "const_int_operand" "n")) + (const_int 0)))] + "TARGET_32BIT + && (INTVAL (operands[2]) >= 0 && INTVAL (operands[2]) < 32 + && INTVAL (operands[1]) > 0 + && INTVAL (operands[1]) + (INTVAL (operands[2]) & 1) <= 8 + && INTVAL (operands[1]) + INTVAL (operands[2]) <= 32)" + "* + operands[1] = GEN_INT (((1 << INTVAL (operands[1])) - 1) + << INTVAL (operands[2])); + output_asm_insn (\"tst%?\\t%0, %1\", operands); + return \"\"; + " + [(set_attr "conds" "set")] +) + +(define_insn_and_split "*ne_zeroextractsi" + [(set (match_operand:SI 0 "s_register_operand" "=r") + (ne:SI (zero_extract:SI + (match_operand:SI 1 "s_register_operand" "r") + (match_operand:SI 2 "const_int_operand" "n") + (match_operand:SI 3 "const_int_operand" "n")) + (const_int 0))) + (clobber (reg:CC CC_REGNUM))] + "TARGET_32BIT + && (INTVAL (operands[3]) >= 0 && INTVAL (operands[3]) < 32 + && INTVAL (operands[2]) > 0 + && INTVAL (operands[2]) + (INTVAL (operands[3]) & 1) <= 8 + && INTVAL (operands[2]) + INTVAL (operands[3]) <= 32)" + "#" + "TARGET_32BIT + && (INTVAL (operands[3]) >= 0 && INTVAL (operands[3]) < 32 + && INTVAL (operands[2]) > 0 + && INTVAL (operands[2]) + (INTVAL (operands[3]) & 1) <= 8 + && INTVAL (operands[2]) + INTVAL (operands[3]) <= 32)" + [(parallel [(set (reg:CC_NOOV CC_REGNUM) + (compare:CC_NOOV (and:SI (match_dup 1) (match_dup 2)) + (const_int 0))) + (set (match_dup 0) (and:SI (match_dup 1) (match_dup 2)))]) + (set (match_dup 0) + (if_then_else:SI (eq (reg:CC_NOOV CC_REGNUM) (const_int 0)) + (match_dup 0) (const_int 1)))] + " + operands[2] = GEN_INT (((1 << INTVAL (operands[2])) - 1) + << INTVAL (operands[3])); + " + [(set_attr "conds" "clob") + (set (attr "length") + (if_then_else (eq_attr "is_thumb" "yes") + (const_int 12) + (const_int 8)))] +) + +(define_insn_and_split "*ne_zeroextractsi_shifted" + [(set (match_operand:SI 0 "s_register_operand" "=r") + (ne:SI (zero_extract:SI + (match_operand:SI 1 "s_register_operand" "r") + (match_operand:SI 2 "const_int_operand" "n") + (const_int 0)) + (const_int 0))) + (clobber (reg:CC CC_REGNUM))] + "TARGET_ARM" + "#" + "TARGET_ARM" + [(parallel [(set (reg:CC_NOOV CC_REGNUM) + (compare:CC_NOOV (ashift:SI (match_dup 1) (match_dup 2)) + (const_int 0))) + (set (match_dup 0) (ashift:SI (match_dup 1) (match_dup 2)))]) + (set (match_dup 0) + (if_then_else:SI (eq (reg:CC_NOOV CC_REGNUM) (const_int 0)) + (match_dup 0) (const_int 1)))] + " + operands[2] = GEN_INT (32 - INTVAL (operands[2])); + " + [(set_attr "conds" "clob") + (set_attr "length" "8")] +) + +(define_insn_and_split "*ite_ne_zeroextractsi" + [(set (match_operand:SI 0 "s_register_operand" "=r") + (if_then_else:SI (ne (zero_extract:SI + (match_operand:SI 1 "s_register_operand" "r") + (match_operand:SI 2 "const_int_operand" "n") + (match_operand:SI 3 "const_int_operand" "n")) + (const_int 0)) + (match_operand:SI 4 "arm_not_operand" "rIK") + (const_int 0))) + (clobber (reg:CC CC_REGNUM))] + "TARGET_ARM + && (INTVAL (operands[3]) >= 0 && INTVAL (operands[3]) < 32 + && INTVAL (operands[2]) > 0 + && INTVAL (operands[2]) + (INTVAL (operands[3]) & 1) <= 8 + && INTVAL (operands[2]) + INTVAL (operands[3]) <= 32) + && !reg_overlap_mentioned_p (operands[0], operands[4])" + "#" + "TARGET_ARM + && (INTVAL (operands[3]) >= 0 && INTVAL (operands[3]) < 32 + && INTVAL (operands[2]) > 0 + && INTVAL (operands[2]) + (INTVAL (operands[3]) & 1) <= 8 + && INTVAL (operands[2]) + INTVAL (operands[3]) <= 32) + && !reg_overlap_mentioned_p (operands[0], operands[4])" + [(parallel [(set (reg:CC_NOOV CC_REGNUM) + (compare:CC_NOOV (and:SI (match_dup 1) (match_dup 2)) + (const_int 0))) + (set (match_dup 0) (and:SI (match_dup 1) (match_dup 2)))]) + (set (match_dup 0) + (if_then_else:SI (eq (reg:CC_NOOV CC_REGNUM) (const_int 0)) + (match_dup 0) (match_dup 4)))] + " + operands[2] = GEN_INT (((1 << INTVAL (operands[2])) - 1) + << INTVAL (operands[3])); + " + [(set_attr "conds" "clob") + (set_attr "length" "8")] +) + +(define_insn_and_split "*ite_ne_zeroextractsi_shifted" + [(set (match_operand:SI 0 "s_register_operand" "=r") + (if_then_else:SI (ne (zero_extract:SI + (match_operand:SI 1 "s_register_operand" "r") + (match_operand:SI 2 "const_int_operand" "n") + (const_int 0)) + (const_int 0)) + (match_operand:SI 3 "arm_not_operand" "rIK") + (const_int 0))) + (clobber (reg:CC CC_REGNUM))] + "TARGET_ARM && !reg_overlap_mentioned_p (operands[0], operands[3])" + "#" + "TARGET_ARM && !reg_overlap_mentioned_p (operands[0], operands[3])" + [(parallel [(set (reg:CC_NOOV CC_REGNUM) + (compare:CC_NOOV (ashift:SI (match_dup 1) (match_dup 2)) + (const_int 0))) + (set (match_dup 0) (ashift:SI (match_dup 1) (match_dup 2)))]) + (set (match_dup 0) + (if_then_else:SI (eq (reg:CC_NOOV CC_REGNUM) (const_int 0)) + (match_dup 0) (match_dup 3)))] + " + operands[2] = GEN_INT (32 - INTVAL (operands[2])); + " + [(set_attr "conds" "clob") + (set_attr "length" "8")] +) + +(define_split + [(set (match_operand:SI 0 "s_register_operand" "") + (zero_extract:SI (match_operand:SI 1 "s_register_operand" "") + (match_operand:SI 2 "const_int_operand" "") + (match_operand:SI 3 "const_int_operand" ""))) + (clobber (match_operand:SI 4 "s_register_operand" ""))] + "TARGET_THUMB1" + [(set (match_dup 4) (ashift:SI (match_dup 1) (match_dup 2))) + (set (match_dup 0) (lshiftrt:SI (match_dup 4) (match_dup 3)))] + "{ + HOST_WIDE_INT temp = INTVAL (operands[2]); + + operands[2] = GEN_INT (32 - temp - INTVAL (operands[3])); + operands[3] = GEN_INT (32 - temp); + }" +) + +;; ??? Use Thumb-2 has bitfield insert/extract instructions. +(define_split + [(set (match_operand:SI 0 "s_register_operand" "") + (match_operator:SI 1 "shiftable_operator" + [(zero_extract:SI (match_operand:SI 2 "s_register_operand" "") + (match_operand:SI 3 "const_int_operand" "") + (match_operand:SI 4 "const_int_operand" "")) + (match_operand:SI 5 "s_register_operand" "")])) + (clobber (match_operand:SI 6 "s_register_operand" ""))] + "TARGET_ARM" + [(set (match_dup 6) (ashift:SI (match_dup 2) (match_dup 3))) + (set (match_dup 0) + (match_op_dup 1 + [(lshiftrt:SI (match_dup 6) (match_dup 4)) + (match_dup 5)]))] + "{ + HOST_WIDE_INT temp = INTVAL (operands[3]); + + operands[3] = GEN_INT (32 - temp - INTVAL (operands[4])); + operands[4] = GEN_INT (32 - temp); + }" +) + +(define_split + [(set (match_operand:SI 0 "s_register_operand" "") + (sign_extract:SI (match_operand:SI 1 "s_register_operand" "") + (match_operand:SI 2 "const_int_operand" "") + (match_operand:SI 3 "const_int_operand" "")))] + "TARGET_THUMB1" + [(set (match_dup 0) (ashift:SI (match_dup 1) (match_dup 2))) + (set (match_dup 0) (ashiftrt:SI (match_dup 0) (match_dup 3)))] + "{ + HOST_WIDE_INT temp = INTVAL (operands[2]); + + operands[2] = GEN_INT (32 - temp - INTVAL (operands[3])); + operands[3] = GEN_INT (32 - temp); + }" +) + +(define_split + [(set (match_operand:SI 0 "s_register_operand" "") + (match_operator:SI 1 "shiftable_operator" + [(sign_extract:SI (match_operand:SI 2 "s_register_operand" "") + (match_operand:SI 3 "const_int_operand" "") + (match_operand:SI 4 "const_int_operand" "")) + (match_operand:SI 5 "s_register_operand" "")])) + (clobber (match_operand:SI 6 "s_register_operand" ""))] + "TARGET_ARM" + [(set (match_dup 6) (ashift:SI (match_dup 2) (match_dup 3))) + (set (match_dup 0) + (match_op_dup 1 + [(ashiftrt:SI (match_dup 6) (match_dup 4)) + (match_dup 5)]))] + "{ + HOST_WIDE_INT temp = INTVAL (operands[3]); + + operands[3] = GEN_INT (32 - temp - INTVAL (operands[4])); + operands[4] = GEN_INT (32 - temp); + }" +) + +;;; ??? This pattern is bogus. If operand3 has bits outside the range +;;; represented by the bitfield, then this will produce incorrect results. +;;; Somewhere, the value needs to be truncated. On targets like the m68k, +;;; which have a real bit-field insert instruction, the truncation happens +;;; in the bit-field insert instruction itself. Since arm does not have a +;;; bit-field insert instruction, we would have to emit code here to truncate +;;; the value before we insert. This loses some of the advantage of having +;;; this insv pattern, so this pattern needs to be reevalutated. + +(define_expand "insv" + [(set (zero_extract:SI (match_operand:SI 0 "s_register_operand" "") + (match_operand:SI 1 "general_operand" "") + (match_operand:SI 2 "general_operand" "")) + (match_operand:SI 3 "reg_or_int_operand" ""))] + "TARGET_ARM || arm_arch_thumb2" + " + { + int start_bit = INTVAL (operands[2]); + int width = INTVAL (operands[1]); + HOST_WIDE_INT mask = (((HOST_WIDE_INT)1) << width) - 1; + rtx target, subtarget; + + if (arm_arch_thumb2) + { + bool use_bfi = TRUE; + + if (GET_CODE (operands[3]) == CONST_INT) + { + HOST_WIDE_INT val = INTVAL (operands[3]) & mask; + + if (val == 0) + { + emit_insn (gen_insv_zero (operands[0], operands[1], + operands[2])); + DONE; + } + + /* See if the set can be done with a single orr instruction. */ + if (val == mask && const_ok_for_arm (val << start_bit)) + use_bfi = FALSE; + } + + if (use_bfi) + { + if (GET_CODE (operands[3]) != REG) + operands[3] = force_reg (SImode, operands[3]); + + emit_insn (gen_insv_t2 (operands[0], operands[1], operands[2], + operands[3])); + DONE; + } + } + + target = copy_rtx (operands[0]); + /* Avoid using a subreg as a subtarget, and avoid writing a paradoxical + subreg as the final target. */ + if (GET_CODE (target) == SUBREG) + { + subtarget = gen_reg_rtx (SImode); + if (GET_MODE_SIZE (GET_MODE (SUBREG_REG (target))) + < GET_MODE_SIZE (SImode)) + target = SUBREG_REG (target); + } + else + subtarget = target; + + if (GET_CODE (operands[3]) == CONST_INT) + { + /* Since we are inserting a known constant, we may be able to + reduce the number of bits that we have to clear so that + the mask becomes simple. */ + /* ??? This code does not check to see if the new mask is actually + simpler. It may not be. */ + rtx op1 = gen_reg_rtx (SImode); + /* ??? Truncate operand3 to fit in the bitfield. See comment before + start of this pattern. */ + HOST_WIDE_INT op3_value = mask & INTVAL (operands[3]); + HOST_WIDE_INT mask2 = ((mask & ~op3_value) << start_bit); + + emit_insn (gen_andsi3 (op1, operands[0], + gen_int_mode (~mask2, SImode))); + emit_insn (gen_iorsi3 (subtarget, op1, + gen_int_mode (op3_value << start_bit, SImode))); + } + else if (start_bit == 0 + && !(const_ok_for_arm (mask) + || const_ok_for_arm (~mask))) + { + /* A Trick, since we are setting the bottom bits in the word, + we can shift operand[3] up, operand[0] down, OR them together + and rotate the result back again. This takes 3 insns, and + the third might be mergeable into another op. */ + /* The shift up copes with the possibility that operand[3] is + wider than the bitfield. */ + rtx op0 = gen_reg_rtx (SImode); + rtx op1 = gen_reg_rtx (SImode); + + emit_insn (gen_ashlsi3 (op0, operands[3], GEN_INT (32 - width))); + emit_insn (gen_lshrsi3 (op1, operands[0], operands[1])); + emit_insn (gen_iorsi3 (op1, op1, op0)); + emit_insn (gen_rotlsi3 (subtarget, op1, operands[1])); + } + else if ((width + start_bit == 32) + && !(const_ok_for_arm (mask) + || const_ok_for_arm (~mask))) + { + /* Similar trick, but slightly less efficient. */ + + rtx op0 = gen_reg_rtx (SImode); + rtx op1 = gen_reg_rtx (SImode); + + emit_insn (gen_ashlsi3 (op0, operands[3], GEN_INT (32 - width))); + emit_insn (gen_ashlsi3 (op1, operands[0], operands[1])); + emit_insn (gen_lshrsi3 (op1, op1, operands[1])); + emit_insn (gen_iorsi3 (subtarget, op1, op0)); + } + else + { + rtx op0 = gen_int_mode (mask, SImode); + rtx op1 = gen_reg_rtx (SImode); + rtx op2 = gen_reg_rtx (SImode); + + if (!(const_ok_for_arm (mask) || const_ok_for_arm (~mask))) + { + rtx tmp = gen_reg_rtx (SImode); + + emit_insn (gen_movsi (tmp, op0)); + op0 = tmp; + } + + /* Mask out any bits in operand[3] that are not needed. */ + emit_insn (gen_andsi3 (op1, operands[3], op0)); + + if (GET_CODE (op0) == CONST_INT + && (const_ok_for_arm (mask << start_bit) + || const_ok_for_arm (~(mask << start_bit)))) + { + op0 = gen_int_mode (~(mask << start_bit), SImode); + emit_insn (gen_andsi3 (op2, operands[0], op0)); + } + else + { + if (GET_CODE (op0) == CONST_INT) + { + rtx tmp = gen_reg_rtx (SImode); + + emit_insn (gen_movsi (tmp, op0)); + op0 = tmp; + } + + if (start_bit != 0) + emit_insn (gen_ashlsi3 (op0, op0, operands[2])); + + emit_insn (gen_andsi_notsi_si (op2, operands[0], op0)); + } + + if (start_bit != 0) + emit_insn (gen_ashlsi3 (op1, op1, operands[2])); + + emit_insn (gen_iorsi3 (subtarget, op1, op2)); + } + + if (subtarget != target) + { + /* If TARGET is still a SUBREG, then it must be wider than a word, + so we must be careful only to set the subword we were asked to. */ + if (GET_CODE (target) == SUBREG) + emit_move_insn (target, subtarget); + else + emit_move_insn (target, gen_lowpart (GET_MODE (target), subtarget)); + } + + DONE; + }" +) + +(define_insn "insv_zero" + [(set (zero_extract:SI (match_operand:SI 0 "s_register_operand" "+r") + (match_operand:SI 1 "const_int_operand" "M") + (match_operand:SI 2 "const_int_operand" "M")) + (const_int 0))] + "arm_arch_thumb2" + "bfc%?\t%0, %2, %1" + [(set_attr "length" "4") + (set_attr "predicable" "yes")] +) + +(define_insn "insv_t2" + [(set (zero_extract:SI (match_operand:SI 0 "s_register_operand" "+r") + (match_operand:SI 1 "const_int_operand" "M") + (match_operand:SI 2 "const_int_operand" "M")) + (match_operand:SI 3 "s_register_operand" "r"))] + "arm_arch_thumb2" + "bfi%?\t%0, %3, %2, %1" + [(set_attr "length" "4") + (set_attr "predicable" "yes")] +) + +; constants for op 2 will never be given to these patterns. +(define_insn_and_split "*anddi_notdi_di" + [(set (match_operand:DI 0 "s_register_operand" "=&r,&r") + (and:DI (not:DI (match_operand:DI 1 "s_register_operand" "0,r")) + (match_operand:DI 2 "s_register_operand" "r,0")))] + "TARGET_32BIT" + "#" + "TARGET_32BIT && reload_completed + && ! (TARGET_NEON && IS_VFP_REGNUM (REGNO (operands[0]))) + && ! IS_IWMMXT_REGNUM (REGNO (operands[0]))" + [(set (match_dup 0) (and:SI (not:SI (match_dup 1)) (match_dup 2))) + (set (match_dup 3) (and:SI (not:SI (match_dup 4)) (match_dup 5)))] + " + { + operands[3] = gen_highpart (SImode, operands[0]); + operands[0] = gen_lowpart (SImode, operands[0]); + operands[4] = gen_highpart (SImode, operands[1]); + operands[1] = gen_lowpart (SImode, operands[1]); + operands[5] = gen_highpart (SImode, operands[2]); + operands[2] = gen_lowpart (SImode, operands[2]); + }" + [(set_attr "length" "8") + (set_attr "predicable" "yes")] +) + +(define_insn_and_split "*anddi_notzesidi_di" + [(set (match_operand:DI 0 "s_register_operand" "=&r,&r") + (and:DI (not:DI (zero_extend:DI + (match_operand:SI 2 "s_register_operand" "r,r"))) + (match_operand:DI 1 "s_register_operand" "0,?r")))] + "TARGET_32BIT" + "@ + bic%?\\t%Q0, %Q1, %2 + #" + ; (not (zero_extend ...)) allows us to just copy the high word from + ; operand1 to operand0. + "TARGET_32BIT + && reload_completed + && operands[0] != operands[1]" + [(set (match_dup 0) (and:SI (not:SI (match_dup 2)) (match_dup 1))) + (set (match_dup 3) (match_dup 4))] + " + { + operands[3] = gen_highpart (SImode, operands[0]); + operands[0] = gen_lowpart (SImode, operands[0]); + operands[4] = gen_highpart (SImode, operands[1]); + operands[1] = gen_lowpart (SImode, operands[1]); + }" + [(set_attr "length" "4,8") + (set_attr "predicable" "yes")] +) + +(define_insn_and_split "*anddi_notsesidi_di" + [(set (match_operand:DI 0 "s_register_operand" "=&r,&r") + (and:DI (not:DI (sign_extend:DI + (match_operand:SI 2 "s_register_operand" "r,r"))) + (match_operand:DI 1 "s_register_operand" "0,r")))] + "TARGET_32BIT" + "#" + "TARGET_32BIT && reload_completed" + [(set (match_dup 0) (and:SI (not:SI (match_dup 2)) (match_dup 1))) + (set (match_dup 3) (and:SI (not:SI + (ashiftrt:SI (match_dup 2) (const_int 31))) + (match_dup 4)))] + " + { + operands[3] = gen_highpart (SImode, operands[0]); + operands[0] = gen_lowpart (SImode, operands[0]); + operands[4] = gen_highpart (SImode, operands[1]); + operands[1] = gen_lowpart (SImode, operands[1]); + }" + [(set_attr "length" "8") + (set_attr "predicable" "yes")] +) + +(define_insn "andsi_notsi_si" + [(set (match_operand:SI 0 "s_register_operand" "=r") + (and:SI (not:SI (match_operand:SI 2 "s_register_operand" "r")) + (match_operand:SI 1 "s_register_operand" "r")))] + "TARGET_32BIT" + "bic%?\\t%0, %1, %2" + [(set_attr "predicable" "yes")] +) + +(define_insn "thumb1_bicsi3" + [(set (match_operand:SI 0 "register_operand" "=l") + (and:SI (not:SI (match_operand:SI 1 "register_operand" "l")) + (match_operand:SI 2 "register_operand" "0")))] + "TARGET_THUMB1" + "bic\\t%0, %1" + [(set_attr "length" "2") + (set_attr "conds" "set")]) + +(define_insn "andsi_not_shiftsi_si" + [(set (match_operand:SI 0 "s_register_operand" "=r") + (and:SI (not:SI (match_operator:SI 4 "shift_operator" + [(match_operand:SI 2 "s_register_operand" "r") + (match_operand:SI 3 "arm_rhs_operand" "rM")])) + (match_operand:SI 1 "s_register_operand" "r")))] + "TARGET_ARM" + "bic%?\\t%0, %1, %2%S4" + [(set_attr "predicable" "yes") + (set_attr "shift" "2") + (set (attr "type") (if_then_else (match_operand 3 "const_int_operand" "") + (const_string "alu_shift") + (const_string "alu_shift_reg")))] +) + +(define_insn "*andsi_notsi_si_compare0" + [(set (reg:CC_NOOV CC_REGNUM) + (compare:CC_NOOV + (and:SI (not:SI (match_operand:SI 2 "s_register_operand" "r")) + (match_operand:SI 1 "s_register_operand" "r")) + (const_int 0))) + (set (match_operand:SI 0 "s_register_operand" "=r") + (and:SI (not:SI (match_dup 2)) (match_dup 1)))] + "TARGET_32BIT" + "bic%.\\t%0, %1, %2" + [(set_attr "conds" "set")] +) + +(define_insn "*andsi_notsi_si_compare0_scratch" + [(set (reg:CC_NOOV CC_REGNUM) + (compare:CC_NOOV + (and:SI (not:SI (match_operand:SI 2 "s_register_operand" "r")) + (match_operand:SI 1 "s_register_operand" "r")) + (const_int 0))) + (clobber (match_scratch:SI 0 "=r"))] + "TARGET_32BIT" + "bic%.\\t%0, %1, %2" + [(set_attr "conds" "set")] +) + +(define_expand "iordi3" + [(set (match_operand:DI 0 "s_register_operand" "") + (ior:DI (match_operand:DI 1 "s_register_operand" "") + (match_operand:DI 2 "neon_logic_op2" "")))] + "TARGET_32BIT" + "" +) + +(define_insn "*iordi3_insn" + [(set (match_operand:DI 0 "s_register_operand" "=&r,&r") + (ior:DI (match_operand:DI 1 "s_register_operand" "%0,r") + (match_operand:DI 2 "s_register_operand" "r,r")))] + "TARGET_32BIT && !TARGET_IWMMXT && !TARGET_NEON" + "#" + [(set_attr "length" "8") + (set_attr "predicable" "yes")] +) + +(define_insn "*iordi_zesidi_di" + [(set (match_operand:DI 0 "s_register_operand" "=&r,&r") + (ior:DI (zero_extend:DI + (match_operand:SI 2 "s_register_operand" "r,r")) + (match_operand:DI 1 "s_register_operand" "0,?r")))] + "TARGET_32BIT" + "@ + orr%?\\t%Q0, %Q1, %2 + #" + [(set_attr "length" "4,8") + (set_attr "predicable" "yes")] +) + +(define_insn "*iordi_sesidi_di" + [(set (match_operand:DI 0 "s_register_operand" "=&r,&r") + (ior:DI (sign_extend:DI + (match_operand:SI 2 "s_register_operand" "r,r")) + (match_operand:DI 1 "s_register_operand" "0,r")))] + "TARGET_32BIT" + "#" + [(set_attr "length" "8") + (set_attr "predicable" "yes")] +) + +(define_expand "iorsi3" + [(set (match_operand:SI 0 "s_register_operand" "") + (ior:SI (match_operand:SI 1 "s_register_operand" "") + (match_operand:SI 2 "reg_or_int_operand" "")))] + "TARGET_EITHER" + " + if (GET_CODE (operands[2]) == CONST_INT) + { + if (TARGET_32BIT) + { + arm_split_constant (IOR, SImode, NULL_RTX, + INTVAL (operands[2]), operands[0], operands[1], + optimize && can_create_pseudo_p ()); + DONE; + } + else /* TARGET_THUMB1 */ + { + rtx tmp = force_reg (SImode, operands[2]); + if (rtx_equal_p (operands[0], operands[1])) + operands[2] = tmp; + else + { + operands[2] = operands[1]; + operands[1] = tmp; + } + } + } + " +) + +(define_insn_and_split "*iorsi3_insn" + [(set (match_operand:SI 0 "s_register_operand" "=r,r,r") + (ior:SI (match_operand:SI 1 "s_register_operand" "%r,r,r") + (match_operand:SI 2 "reg_or_int_operand" "rI,K,?n")))] + "TARGET_32BIT" + "@ + orr%?\\t%0, %1, %2 + orn%?\\t%0, %1, #%B2 + #" + "TARGET_32BIT + && GET_CODE (operands[2]) == CONST_INT + && !(const_ok_for_arm (INTVAL (operands[2])) + || (TARGET_THUMB2 && const_ok_for_arm (~INTVAL (operands[2]))))" + [(clobber (const_int 0))] +{ + arm_split_constant (IOR, SImode, curr_insn, + INTVAL (operands[2]), operands[0], operands[1], 0); + DONE; +} + [(set_attr "length" "4,4,16") + (set_attr "arch" "32,t2,32") + (set_attr "predicable" "yes")]) + +(define_insn "*thumb1_iorsi3_insn" + [(set (match_operand:SI 0 "register_operand" "=l") + (ior:SI (match_operand:SI 1 "register_operand" "%0") + (match_operand:SI 2 "register_operand" "l")))] + "TARGET_THUMB1" + "orr\\t%0, %2" + [(set_attr "length" "2") + (set_attr "conds" "set")]) + +(define_peephole2 + [(match_scratch:SI 3 "r") + (set (match_operand:SI 0 "arm_general_register_operand" "") + (ior:SI (match_operand:SI 1 "arm_general_register_operand" "") + (match_operand:SI 2 "const_int_operand" "")))] + "TARGET_ARM + && !const_ok_for_arm (INTVAL (operands[2])) + && const_ok_for_arm (~INTVAL (operands[2]))" + [(set (match_dup 3) (match_dup 2)) + (set (match_dup 0) (ior:SI (match_dup 1) (match_dup 3)))] + "" +) + +(define_insn "*iorsi3_compare0" + [(set (reg:CC_NOOV CC_REGNUM) + (compare:CC_NOOV (ior:SI (match_operand:SI 1 "s_register_operand" "%r") + (match_operand:SI 2 "arm_rhs_operand" "rI")) + (const_int 0))) + (set (match_operand:SI 0 "s_register_operand" "=r") + (ior:SI (match_dup 1) (match_dup 2)))] + "TARGET_32BIT" + "orr%.\\t%0, %1, %2" + [(set_attr "conds" "set")] +) + +(define_insn "*iorsi3_compare0_scratch" + [(set (reg:CC_NOOV CC_REGNUM) + (compare:CC_NOOV (ior:SI (match_operand:SI 1 "s_register_operand" "%r") + (match_operand:SI 2 "arm_rhs_operand" "rI")) + (const_int 0))) + (clobber (match_scratch:SI 0 "=r"))] + "TARGET_32BIT" + "orr%.\\t%0, %1, %2" + [(set_attr "conds" "set")] +) + +(define_expand "xordi3" + [(set (match_operand:DI 0 "s_register_operand" "") + (xor:DI (match_operand:DI 1 "s_register_operand" "") + (match_operand:DI 2 "s_register_operand" "")))] + "TARGET_32BIT" + "" +) + +(define_insn "*xordi3_insn" + [(set (match_operand:DI 0 "s_register_operand" "=&r,&r") + (xor:DI (match_operand:DI 1 "s_register_operand" "%0,r") + (match_operand:DI 2 "s_register_operand" "r,r")))] + "TARGET_32BIT && !TARGET_IWMMXT && !TARGET_NEON" + "#" + [(set_attr "length" "8") + (set_attr "predicable" "yes")] +) + +(define_insn "*xordi_zesidi_di" + [(set (match_operand:DI 0 "s_register_operand" "=&r,&r") + (xor:DI (zero_extend:DI + (match_operand:SI 2 "s_register_operand" "r,r")) + (match_operand:DI 1 "s_register_operand" "0,?r")))] + "TARGET_32BIT" + "@ + eor%?\\t%Q0, %Q1, %2 + #" + [(set_attr "length" "4,8") + (set_attr "predicable" "yes")] +) + +(define_insn "*xordi_sesidi_di" + [(set (match_operand:DI 0 "s_register_operand" "=&r,&r") + (xor:DI (sign_extend:DI + (match_operand:SI 2 "s_register_operand" "r,r")) + (match_operand:DI 1 "s_register_operand" "0,r")))] + "TARGET_32BIT" + "#" + [(set_attr "length" "8") + (set_attr "predicable" "yes")] +) + +(define_expand "xorsi3" + [(set (match_operand:SI 0 "s_register_operand" "") + (xor:SI (match_operand:SI 1 "s_register_operand" "") + (match_operand:SI 2 "reg_or_int_operand" "")))] + "TARGET_EITHER" + "if (GET_CODE (operands[2]) == CONST_INT) + { + if (TARGET_32BIT) + { + arm_split_constant (XOR, SImode, NULL_RTX, + INTVAL (operands[2]), operands[0], operands[1], + optimize && can_create_pseudo_p ()); + DONE; + } + else /* TARGET_THUMB1 */ + { + rtx tmp = force_reg (SImode, operands[2]); + if (rtx_equal_p (operands[0], operands[1])) + operands[2] = tmp; + else + { + operands[2] = operands[1]; + operands[1] = tmp; + } + } + }" +) + +(define_insn "*arm_xorsi3" + [(set (match_operand:SI 0 "s_register_operand" "=r") + (xor:SI (match_operand:SI 1 "s_register_operand" "r") + (match_operand:SI 2 "arm_rhs_operand" "rI")))] + "TARGET_32BIT" + "eor%?\\t%0, %1, %2" + [(set_attr "predicable" "yes")] +) + +(define_insn "*thumb1_xorsi3_insn" + [(set (match_operand:SI 0 "register_operand" "=l") + (xor:SI (match_operand:SI 1 "register_operand" "%0") + (match_operand:SI 2 "register_operand" "l")))] + "TARGET_THUMB1" + "eor\\t%0, %2" + [(set_attr "length" "2") + (set_attr "conds" "set")]) + +(define_insn "*xorsi3_compare0" + [(set (reg:CC_NOOV CC_REGNUM) + (compare:CC_NOOV (xor:SI (match_operand:SI 1 "s_register_operand" "r") + (match_operand:SI 2 "arm_rhs_operand" "rI")) + (const_int 0))) + (set (match_operand:SI 0 "s_register_operand" "=r") + (xor:SI (match_dup 1) (match_dup 2)))] + "TARGET_32BIT" + "eor%.\\t%0, %1, %2" + [(set_attr "conds" "set")] +) + +(define_insn "*xorsi3_compare0_scratch" + [(set (reg:CC_NOOV CC_REGNUM) + (compare:CC_NOOV (xor:SI (match_operand:SI 0 "s_register_operand" "r") + (match_operand:SI 1 "arm_rhs_operand" "rI")) + (const_int 0)))] + "TARGET_32BIT" + "teq%?\\t%0, %1" + [(set_attr "conds" "set")] +) + +; By splitting (IOR (AND (NOT A) (NOT B)) C) as D = AND (IOR A B) (NOT C), +; (NOT D) we can sometimes merge the final NOT into one of the following +; insns. + +(define_split + [(set (match_operand:SI 0 "s_register_operand" "") + (ior:SI (and:SI (not:SI (match_operand:SI 1 "s_register_operand" "")) + (not:SI (match_operand:SI 2 "arm_rhs_operand" ""))) + (match_operand:SI 3 "arm_rhs_operand" ""))) + (clobber (match_operand:SI 4 "s_register_operand" ""))] + "TARGET_32BIT" + [(set (match_dup 4) (and:SI (ior:SI (match_dup 1) (match_dup 2)) + (not:SI (match_dup 3)))) + (set (match_dup 0) (not:SI (match_dup 4)))] + "" +) + +(define_insn "*andsi_iorsi3_notsi" + [(set (match_operand:SI 0 "s_register_operand" "=&r,&r,&r") + (and:SI (ior:SI (match_operand:SI 1 "s_register_operand" "%0,r,r") + (match_operand:SI 2 "arm_rhs_operand" "rI,0,rI")) + (not:SI (match_operand:SI 3 "arm_rhs_operand" "rI,rI,rI"))))] + "TARGET_32BIT" + "orr%?\\t%0, %1, %2\;bic%?\\t%0, %0, %3" + [(set_attr "length" "8") + (set_attr "ce_count" "2") + (set_attr "predicable" "yes")] +) + +; ??? Are these four splitters still beneficial when the Thumb-2 bitfield +; insns are available? +(define_split + [(set (match_operand:SI 0 "s_register_operand" "") + (match_operator:SI 1 "logical_binary_operator" + [(zero_extract:SI (match_operand:SI 2 "s_register_operand" "") + (match_operand:SI 3 "const_int_operand" "") + (match_operand:SI 4 "const_int_operand" "")) + (match_operator:SI 9 "logical_binary_operator" + [(lshiftrt:SI (match_operand:SI 5 "s_register_operand" "") + (match_operand:SI 6 "const_int_operand" "")) + (match_operand:SI 7 "s_register_operand" "")])])) + (clobber (match_operand:SI 8 "s_register_operand" ""))] + "TARGET_32BIT + && GET_CODE (operands[1]) == GET_CODE (operands[9]) + && INTVAL (operands[3]) == 32 - INTVAL (operands[6])" + [(set (match_dup 8) + (match_op_dup 1 + [(ashift:SI (match_dup 2) (match_dup 4)) + (match_dup 5)])) + (set (match_dup 0) + (match_op_dup 1 + [(lshiftrt:SI (match_dup 8) (match_dup 6)) + (match_dup 7)]))] + " + operands[4] = GEN_INT (32 - (INTVAL (operands[3]) + INTVAL (operands[4]))); +") + +(define_split + [(set (match_operand:SI 0 "s_register_operand" "") + (match_operator:SI 1 "logical_binary_operator" + [(match_operator:SI 9 "logical_binary_operator" + [(lshiftrt:SI (match_operand:SI 5 "s_register_operand" "") + (match_operand:SI 6 "const_int_operand" "")) + (match_operand:SI 7 "s_register_operand" "")]) + (zero_extract:SI (match_operand:SI 2 "s_register_operand" "") + (match_operand:SI 3 "const_int_operand" "") + (match_operand:SI 4 "const_int_operand" ""))])) + (clobber (match_operand:SI 8 "s_register_operand" ""))] + "TARGET_32BIT + && GET_CODE (operands[1]) == GET_CODE (operands[9]) + && INTVAL (operands[3]) == 32 - INTVAL (operands[6])" + [(set (match_dup 8) + (match_op_dup 1 + [(ashift:SI (match_dup 2) (match_dup 4)) + (match_dup 5)])) + (set (match_dup 0) + (match_op_dup 1 + [(lshiftrt:SI (match_dup 8) (match_dup 6)) + (match_dup 7)]))] + " + operands[4] = GEN_INT (32 - (INTVAL (operands[3]) + INTVAL (operands[4]))); +") + +(define_split + [(set (match_operand:SI 0 "s_register_operand" "") + (match_operator:SI 1 "logical_binary_operator" + [(sign_extract:SI (match_operand:SI 2 "s_register_operand" "") + (match_operand:SI 3 "const_int_operand" "") + (match_operand:SI 4 "const_int_operand" "")) + (match_operator:SI 9 "logical_binary_operator" + [(ashiftrt:SI (match_operand:SI 5 "s_register_operand" "") + (match_operand:SI 6 "const_int_operand" "")) + (match_operand:SI 7 "s_register_operand" "")])])) + (clobber (match_operand:SI 8 "s_register_operand" ""))] + "TARGET_32BIT + && GET_CODE (operands[1]) == GET_CODE (operands[9]) + && INTVAL (operands[3]) == 32 - INTVAL (operands[6])" + [(set (match_dup 8) + (match_op_dup 1 + [(ashift:SI (match_dup 2) (match_dup 4)) + (match_dup 5)])) + (set (match_dup 0) + (match_op_dup 1 + [(ashiftrt:SI (match_dup 8) (match_dup 6)) + (match_dup 7)]))] + " + operands[4] = GEN_INT (32 - (INTVAL (operands[3]) + INTVAL (operands[4]))); +") + +(define_split + [(set (match_operand:SI 0 "s_register_operand" "") + (match_operator:SI 1 "logical_binary_operator" + [(match_operator:SI 9 "logical_binary_operator" + [(ashiftrt:SI (match_operand:SI 5 "s_register_operand" "") + (match_operand:SI 6 "const_int_operand" "")) + (match_operand:SI 7 "s_register_operand" "")]) + (sign_extract:SI (match_operand:SI 2 "s_register_operand" "") + (match_operand:SI 3 "const_int_operand" "") + (match_operand:SI 4 "const_int_operand" ""))])) + (clobber (match_operand:SI 8 "s_register_operand" ""))] + "TARGET_32BIT + && GET_CODE (operands[1]) == GET_CODE (operands[9]) + && INTVAL (operands[3]) == 32 - INTVAL (operands[6])" + [(set (match_dup 8) + (match_op_dup 1 + [(ashift:SI (match_dup 2) (match_dup 4)) + (match_dup 5)])) + (set (match_dup 0) + (match_op_dup 1 + [(ashiftrt:SI (match_dup 8) (match_dup 6)) + (match_dup 7)]))] + " + operands[4] = GEN_INT (32 - (INTVAL (operands[3]) + INTVAL (operands[4]))); +") + + +;; Minimum and maximum insns + +(define_expand "smaxsi3" + [(parallel [ + (set (match_operand:SI 0 "s_register_operand" "") + (smax:SI (match_operand:SI 1 "s_register_operand" "") + (match_operand:SI 2 "arm_rhs_operand" ""))) + (clobber (reg:CC CC_REGNUM))])] + "TARGET_32BIT" + " + if (operands[2] == const0_rtx || operands[2] == constm1_rtx) + { + /* No need for a clobber of the condition code register here. */ + emit_insn (gen_rtx_SET (VOIDmode, operands[0], + gen_rtx_SMAX (SImode, operands[1], + operands[2]))); + DONE; + } +") + +(define_insn "*smax_0" + [(set (match_operand:SI 0 "s_register_operand" "=r") + (smax:SI (match_operand:SI 1 "s_register_operand" "r") + (const_int 0)))] + "TARGET_32BIT" + "bic%?\\t%0, %1, %1, asr #31" + [(set_attr "predicable" "yes")] +) + +(define_insn "*smax_m1" + [(set (match_operand:SI 0 "s_register_operand" "=r") + (smax:SI (match_operand:SI 1 "s_register_operand" "r") + (const_int -1)))] + "TARGET_32BIT" + "orr%?\\t%0, %1, %1, asr #31" + [(set_attr "predicable" "yes")] +) + +(define_insn "*arm_smax_insn" + [(set (match_operand:SI 0 "s_register_operand" "=r,r") + (smax:SI (match_operand:SI 1 "s_register_operand" "%0,?r") + (match_operand:SI 2 "arm_rhs_operand" "rI,rI"))) + (clobber (reg:CC CC_REGNUM))] + "TARGET_ARM" + "@ + cmp\\t%1, %2\;movlt\\t%0, %2 + cmp\\t%1, %2\;movge\\t%0, %1\;movlt\\t%0, %2" + [(set_attr "conds" "clob") + (set_attr "length" "8,12")] +) + +(define_expand "sminsi3" + [(parallel [ + (set (match_operand:SI 0 "s_register_operand" "") + (smin:SI (match_operand:SI 1 "s_register_operand" "") + (match_operand:SI 2 "arm_rhs_operand" ""))) + (clobber (reg:CC CC_REGNUM))])] + "TARGET_32BIT" + " + if (operands[2] == const0_rtx) + { + /* No need for a clobber of the condition code register here. */ + emit_insn (gen_rtx_SET (VOIDmode, operands[0], + gen_rtx_SMIN (SImode, operands[1], + operands[2]))); + DONE; + } +") + +(define_insn "*smin_0" + [(set (match_operand:SI 0 "s_register_operand" "=r") + (smin:SI (match_operand:SI 1 "s_register_operand" "r") + (const_int 0)))] + "TARGET_32BIT" + "and%?\\t%0, %1, %1, asr #31" + [(set_attr "predicable" "yes")] +) + +(define_insn "*arm_smin_insn" + [(set (match_operand:SI 0 "s_register_operand" "=r,r") + (smin:SI (match_operand:SI 1 "s_register_operand" "%0,?r") + (match_operand:SI 2 "arm_rhs_operand" "rI,rI"))) + (clobber (reg:CC CC_REGNUM))] + "TARGET_ARM" + "@ + cmp\\t%1, %2\;movge\\t%0, %2 + cmp\\t%1, %2\;movlt\\t%0, %1\;movge\\t%0, %2" + [(set_attr "conds" "clob") + (set_attr "length" "8,12")] +) + +(define_expand "umaxsi3" + [(parallel [ + (set (match_operand:SI 0 "s_register_operand" "") + (umax:SI (match_operand:SI 1 "s_register_operand" "") + (match_operand:SI 2 "arm_rhs_operand" ""))) + (clobber (reg:CC CC_REGNUM))])] + "TARGET_32BIT" + "" +) + +(define_insn "*arm_umaxsi3" + [(set (match_operand:SI 0 "s_register_operand" "=r,r,r") + (umax:SI (match_operand:SI 1 "s_register_operand" "0,r,?r") + (match_operand:SI 2 "arm_rhs_operand" "rI,0,rI"))) + (clobber (reg:CC CC_REGNUM))] + "TARGET_ARM" + "@ + cmp\\t%1, %2\;movcc\\t%0, %2 + cmp\\t%1, %2\;movcs\\t%0, %1 + cmp\\t%1, %2\;movcs\\t%0, %1\;movcc\\t%0, %2" + [(set_attr "conds" "clob") + (set_attr "length" "8,8,12")] +) + +(define_expand "uminsi3" + [(parallel [ + (set (match_operand:SI 0 "s_register_operand" "") + (umin:SI (match_operand:SI 1 "s_register_operand" "") + (match_operand:SI 2 "arm_rhs_operand" ""))) + (clobber (reg:CC CC_REGNUM))])] + "TARGET_32BIT" + "" +) + +(define_insn "*arm_uminsi3" + [(set (match_operand:SI 0 "s_register_operand" "=r,r,r") + (umin:SI (match_operand:SI 1 "s_register_operand" "0,r,?r") + (match_operand:SI 2 "arm_rhs_operand" "rI,0,rI"))) + (clobber (reg:CC CC_REGNUM))] + "TARGET_ARM" + "@ + cmp\\t%1, %2\;movcs\\t%0, %2 + cmp\\t%1, %2\;movcc\\t%0, %1 + cmp\\t%1, %2\;movcc\\t%0, %1\;movcs\\t%0, %2" + [(set_attr "conds" "clob") + (set_attr "length" "8,8,12")] +) + +(define_insn "*store_minmaxsi" + [(set (match_operand:SI 0 "memory_operand" "=m") + (match_operator:SI 3 "minmax_operator" + [(match_operand:SI 1 "s_register_operand" "r") + (match_operand:SI 2 "s_register_operand" "r")])) + (clobber (reg:CC CC_REGNUM))] + "TARGET_32BIT" + "* + operands[3] = gen_rtx_fmt_ee (minmax_code (operands[3]), SImode, + operands[1], operands[2]); + output_asm_insn (\"cmp\\t%1, %2\", operands); + if (TARGET_THUMB2) + output_asm_insn (\"ite\t%d3\", operands); + output_asm_insn (\"str%d3\\t%1, %0\", operands); + output_asm_insn (\"str%D3\\t%2, %0\", operands); + return \"\"; + " + [(set_attr "conds" "clob") + (set (attr "length") + (if_then_else (eq_attr "is_thumb" "yes") + (const_int 14) + (const_int 12))) + (set_attr "type" "store1")] +) + +; Reject the frame pointer in operand[1], since reloading this after +; it has been eliminated can cause carnage. +(define_insn "*minmax_arithsi" + [(set (match_operand:SI 0 "s_register_operand" "=r,r") + (match_operator:SI 4 "shiftable_operator" + [(match_operator:SI 5 "minmax_operator" + [(match_operand:SI 2 "s_register_operand" "r,r") + (match_operand:SI 3 "arm_rhs_operand" "rI,rI")]) + (match_operand:SI 1 "s_register_operand" "0,?r")])) + (clobber (reg:CC CC_REGNUM))] + "TARGET_32BIT && !arm_eliminable_register (operands[1])" + "* + { + enum rtx_code code = GET_CODE (operands[4]); + bool need_else; + + if (which_alternative != 0 || operands[3] != const0_rtx + || (code != PLUS && code != IOR && code != XOR)) + need_else = true; + else + need_else = false; + + operands[5] = gen_rtx_fmt_ee (minmax_code (operands[5]), SImode, + operands[2], operands[3]); + output_asm_insn (\"cmp\\t%2, %3\", operands); + if (TARGET_THUMB2) + { + if (need_else) + output_asm_insn (\"ite\\t%d5\", operands); + else + output_asm_insn (\"it\\t%d5\", operands); + } + output_asm_insn (\"%i4%d5\\t%0, %1, %2\", operands); + if (need_else) + output_asm_insn (\"%i4%D5\\t%0, %1, %3\", operands); + return \"\"; + }" + [(set_attr "conds" "clob") + (set (attr "length") + (if_then_else (eq_attr "is_thumb" "yes") + (const_int 14) + (const_int 12)))] +) + + +;; Shift and rotation insns + +(define_expand "ashldi3" + [(set (match_operand:DI 0 "s_register_operand" "") + (ashift:DI (match_operand:DI 1 "s_register_operand" "") + (match_operand:SI 2 "reg_or_int_operand" "")))] + "TARGET_32BIT" + " + if (GET_CODE (operands[2]) == CONST_INT) + { + if ((HOST_WIDE_INT) INTVAL (operands[2]) == 1) + { + emit_insn (gen_arm_ashldi3_1bit (operands[0], operands[1])); + DONE; + } + /* Ideally we shouldn't fail here if we could know that operands[1] + ends up already living in an iwmmxt register. Otherwise it's + cheaper to have the alternate code being generated than moving + values to iwmmxt regs and back. */ + FAIL; + } + else if (!TARGET_REALLY_IWMMXT && !(TARGET_HARD_FLOAT && TARGET_MAVERICK)) + FAIL; + " +) + +(define_insn "arm_ashldi3_1bit" + [(set (match_operand:DI 0 "s_register_operand" "=r,&r") + (ashift:DI (match_operand:DI 1 "s_register_operand" "0,r") + (const_int 1))) + (clobber (reg:CC CC_REGNUM))] + "TARGET_32BIT" + "movs\\t%Q0, %Q1, asl #1\;adc\\t%R0, %R1, %R1" + [(set_attr "conds" "clob") + (set_attr "length" "8")] +) + +(define_expand "ashlsi3" + [(set (match_operand:SI 0 "s_register_operand" "") + (ashift:SI (match_operand:SI 1 "s_register_operand" "") + (match_operand:SI 2 "arm_rhs_operand" "")))] + "TARGET_EITHER" + " + if (GET_CODE (operands[2]) == CONST_INT + && ((unsigned HOST_WIDE_INT) INTVAL (operands[2])) > 31) + { + emit_insn (gen_movsi (operands[0], const0_rtx)); + DONE; + } + " +) + +(define_insn "*thumb1_ashlsi3" + [(set (match_operand:SI 0 "register_operand" "=l,l") + (ashift:SI (match_operand:SI 1 "register_operand" "l,0") + (match_operand:SI 2 "nonmemory_operand" "N,l")))] + "TARGET_THUMB1" + "lsl\\t%0, %1, %2" + [(set_attr "length" "2") + (set_attr "conds" "set")]) + +(define_expand "ashrdi3" + [(set (match_operand:DI 0 "s_register_operand" "") + (ashiftrt:DI (match_operand:DI 1 "s_register_operand" "") + (match_operand:SI 2 "reg_or_int_operand" "")))] + "TARGET_32BIT" + " + if (GET_CODE (operands[2]) == CONST_INT) + { + if ((HOST_WIDE_INT) INTVAL (operands[2]) == 1) + { + emit_insn (gen_arm_ashrdi3_1bit (operands[0], operands[1])); + DONE; + } + /* Ideally we shouldn't fail here if we could know that operands[1] + ends up already living in an iwmmxt register. Otherwise it's + cheaper to have the alternate code being generated than moving + values to iwmmxt regs and back. */ + FAIL; + } + else if (!TARGET_REALLY_IWMMXT) + FAIL; + " +) + +(define_insn "arm_ashrdi3_1bit" + [(set (match_operand:DI 0 "s_register_operand" "=r,&r") + (ashiftrt:DI (match_operand:DI 1 "s_register_operand" "0,r") + (const_int 1))) + (clobber (reg:CC CC_REGNUM))] + "TARGET_32BIT" + "movs\\t%R0, %R1, asr #1\;mov\\t%Q0, %Q1, rrx" + [(set_attr "conds" "clob") + (set_attr "insn" "mov") + (set_attr "length" "8")] +) + +(define_expand "ashrsi3" + [(set (match_operand:SI 0 "s_register_operand" "") + (ashiftrt:SI (match_operand:SI 1 "s_register_operand" "") + (match_operand:SI 2 "arm_rhs_operand" "")))] + "TARGET_EITHER" + " + if (GET_CODE (operands[2]) == CONST_INT + && ((unsigned HOST_WIDE_INT) INTVAL (operands[2])) > 31) + operands[2] = GEN_INT (31); + " +) + +(define_insn "*thumb1_ashrsi3" + [(set (match_operand:SI 0 "register_operand" "=l,l") + (ashiftrt:SI (match_operand:SI 1 "register_operand" "l,0") + (match_operand:SI 2 "nonmemory_operand" "N,l")))] + "TARGET_THUMB1" + "asr\\t%0, %1, %2" + [(set_attr "length" "2") + (set_attr "conds" "set")]) + +(define_expand "lshrdi3" + [(set (match_operand:DI 0 "s_register_operand" "") + (lshiftrt:DI (match_operand:DI 1 "s_register_operand" "") + (match_operand:SI 2 "reg_or_int_operand" "")))] + "TARGET_32BIT" + " + if (GET_CODE (operands[2]) == CONST_INT) + { + if ((HOST_WIDE_INT) INTVAL (operands[2]) == 1) + { + emit_insn (gen_arm_lshrdi3_1bit (operands[0], operands[1])); + DONE; + } + /* Ideally we shouldn't fail here if we could know that operands[1] + ends up already living in an iwmmxt register. Otherwise it's + cheaper to have the alternate code being generated than moving + values to iwmmxt regs and back. */ + FAIL; + } + else if (!TARGET_REALLY_IWMMXT) + FAIL; + " +) + +(define_insn "arm_lshrdi3_1bit" + [(set (match_operand:DI 0 "s_register_operand" "=r,&r") + (lshiftrt:DI (match_operand:DI 1 "s_register_operand" "0,r") + (const_int 1))) + (clobber (reg:CC CC_REGNUM))] + "TARGET_32BIT" + "movs\\t%R0, %R1, lsr #1\;mov\\t%Q0, %Q1, rrx" + [(set_attr "conds" "clob") + (set_attr "insn" "mov") + (set_attr "length" "8")] +) + +(define_expand "lshrsi3" + [(set (match_operand:SI 0 "s_register_operand" "") + (lshiftrt:SI (match_operand:SI 1 "s_register_operand" "") + (match_operand:SI 2 "arm_rhs_operand" "")))] + "TARGET_EITHER" + " + if (GET_CODE (operands[2]) == CONST_INT + && ((unsigned HOST_WIDE_INT) INTVAL (operands[2])) > 31) + { + emit_insn (gen_movsi (operands[0], const0_rtx)); + DONE; + } + " +) + +(define_insn "*thumb1_lshrsi3" + [(set (match_operand:SI 0 "register_operand" "=l,l") + (lshiftrt:SI (match_operand:SI 1 "register_operand" "l,0") + (match_operand:SI 2 "nonmemory_operand" "N,l")))] + "TARGET_THUMB1" + "lsr\\t%0, %1, %2" + [(set_attr "length" "2") + (set_attr "conds" "set")]) + +(define_expand "rotlsi3" + [(set (match_operand:SI 0 "s_register_operand" "") + (rotatert:SI (match_operand:SI 1 "s_register_operand" "") + (match_operand:SI 2 "reg_or_int_operand" "")))] + "TARGET_32BIT" + " + if (GET_CODE (operands[2]) == CONST_INT) + operands[2] = GEN_INT ((32 - INTVAL (operands[2])) % 32); + else + { + rtx reg = gen_reg_rtx (SImode); + emit_insn (gen_subsi3 (reg, GEN_INT (32), operands[2])); + operands[2] = reg; + } + " +) + +(define_expand "rotrsi3" + [(set (match_operand:SI 0 "s_register_operand" "") + (rotatert:SI (match_operand:SI 1 "s_register_operand" "") + (match_operand:SI 2 "arm_rhs_operand" "")))] + "TARGET_EITHER" + " + if (TARGET_32BIT) + { + if (GET_CODE (operands[2]) == CONST_INT + && ((unsigned HOST_WIDE_INT) INTVAL (operands[2])) > 31) + operands[2] = GEN_INT (INTVAL (operands[2]) % 32); + } + else /* TARGET_THUMB1 */ + { + if (GET_CODE (operands [2]) == CONST_INT) + operands [2] = force_reg (SImode, operands[2]); + } + " +) + +(define_insn "*thumb1_rotrsi3" + [(set (match_operand:SI 0 "register_operand" "=l") + (rotatert:SI (match_operand:SI 1 "register_operand" "0") + (match_operand:SI 2 "register_operand" "l")))] + "TARGET_THUMB1" + "ror\\t%0, %0, %2" + [(set_attr "length" "2")] +) + +(define_insn "*arm_shiftsi3" + [(set (match_operand:SI 0 "s_register_operand" "=r") + (match_operator:SI 3 "shift_operator" + [(match_operand:SI 1 "s_register_operand" "r") + (match_operand:SI 2 "reg_or_int_operand" "rM")]))] + "TARGET_32BIT" + "* return arm_output_shift(operands, 0);" + [(set_attr "predicable" "yes") + (set_attr "shift" "1") + (set (attr "type") (if_then_else (match_operand 2 "const_int_operand" "") + (const_string "alu_shift") + (const_string "alu_shift_reg")))] +) + +(define_insn "*shiftsi3_compare0" + [(set (reg:CC_NOOV CC_REGNUM) + (compare:CC_NOOV (match_operator:SI 3 "shift_operator" + [(match_operand:SI 1 "s_register_operand" "r") + (match_operand:SI 2 "arm_rhs_operand" "rM")]) + (const_int 0))) + (set (match_operand:SI 0 "s_register_operand" "=r") + (match_op_dup 3 [(match_dup 1) (match_dup 2)]))] + "TARGET_32BIT" + "* return arm_output_shift(operands, 1);" + [(set_attr "conds" "set") + (set_attr "shift" "1") + (set (attr "type") (if_then_else (match_operand 2 "const_int_operand" "") + (const_string "alu_shift") + (const_string "alu_shift_reg")))] +) + +(define_insn "*shiftsi3_compare0_scratch" + [(set (reg:CC_NOOV CC_REGNUM) + (compare:CC_NOOV (match_operator:SI 3 "shift_operator" + [(match_operand:SI 1 "s_register_operand" "r") + (match_operand:SI 2 "arm_rhs_operand" "rM")]) + (const_int 0))) + (clobber (match_scratch:SI 0 "=r"))] + "TARGET_32BIT" + "* return arm_output_shift(operands, 1);" + [(set_attr "conds" "set") + (set_attr "shift" "1")] +) + +(define_insn "*not_shiftsi" + [(set (match_operand:SI 0 "s_register_operand" "=r,r") + (not:SI (match_operator:SI 3 "shift_operator" + [(match_operand:SI 1 "s_register_operand" "r,r") + (match_operand:SI 2 "shift_amount_operand" "M,rM")])))] + "TARGET_32BIT" + "mvn%?\\t%0, %1%S3" + [(set_attr "predicable" "yes") + (set_attr "shift" "1") + (set_attr "insn" "mvn") + (set_attr "arch" "32,a") + (set_attr "type" "alu_shift,alu_shift_reg")]) + +(define_insn "*not_shiftsi_compare0" + [(set (reg:CC_NOOV CC_REGNUM) + (compare:CC_NOOV + (not:SI (match_operator:SI 3 "shift_operator" + [(match_operand:SI 1 "s_register_operand" "r,r") + (match_operand:SI 2 "shift_amount_operand" "M,rM")])) + (const_int 0))) + (set (match_operand:SI 0 "s_register_operand" "=r,r") + (not:SI (match_op_dup 3 [(match_dup 1) (match_dup 2)])))] + "TARGET_32BIT" + "mvn%.\\t%0, %1%S3" + [(set_attr "conds" "set") + (set_attr "shift" "1") + (set_attr "insn" "mvn") + (set_attr "arch" "32,a") + (set_attr "type" "alu_shift,alu_shift_reg")]) + +(define_insn "*not_shiftsi_compare0_scratch" + [(set (reg:CC_NOOV CC_REGNUM) + (compare:CC_NOOV + (not:SI (match_operator:SI 3 "shift_operator" + [(match_operand:SI 1 "s_register_operand" "r,r") + (match_operand:SI 2 "shift_amount_operand" "M,rM")])) + (const_int 0))) + (clobber (match_scratch:SI 0 "=r,r"))] + "TARGET_32BIT" + "mvn%.\\t%0, %1%S3" + [(set_attr "conds" "set") + (set_attr "shift" "1") + (set_attr "insn" "mvn") + (set_attr "arch" "32,a") + (set_attr "type" "alu_shift,alu_shift_reg")]) + +;; We don't really have extzv, but defining this using shifts helps +;; to reduce register pressure later on. + +(define_expand "extzv" + [(set (match_dup 4) + (ashift:SI (match_operand:SI 1 "register_operand" "") + (match_operand:SI 2 "const_int_operand" ""))) + (set (match_operand:SI 0 "register_operand" "") + (lshiftrt:SI (match_dup 4) + (match_operand:SI 3 "const_int_operand" "")))] + "TARGET_THUMB1 || arm_arch_thumb2" + " + { + HOST_WIDE_INT lshift = 32 - INTVAL (operands[2]) - INTVAL (operands[3]); + HOST_WIDE_INT rshift = 32 - INTVAL (operands[2]); + + if (arm_arch_thumb2) + { + emit_insn (gen_extzv_t2 (operands[0], operands[1], operands[2], + operands[3])); + DONE; + } + + operands[3] = GEN_INT (rshift); + + if (lshift == 0) + { + emit_insn (gen_lshrsi3 (operands[0], operands[1], operands[3])); + DONE; + } + + operands[2] = GEN_INT (lshift); + operands[4] = gen_reg_rtx (SImode); + }" +) + +(define_insn "extv" + [(set (match_operand:SI 0 "s_register_operand" "=r") + (sign_extract:SI (match_operand:SI 1 "s_register_operand" "r") + (match_operand:SI 2 "const_int_operand" "M") + (match_operand:SI 3 "const_int_operand" "M")))] + "arm_arch_thumb2" + "sbfx%?\t%0, %1, %3, %2" + [(set_attr "length" "4") + (set_attr "predicable" "yes")] +) + +(define_insn "extzv_t2" + [(set (match_operand:SI 0 "s_register_operand" "=r") + (zero_extract:SI (match_operand:SI 1 "s_register_operand" "r") + (match_operand:SI 2 "const_int_operand" "M") + (match_operand:SI 3 "const_int_operand" "M")))] + "arm_arch_thumb2" + "ubfx%?\t%0, %1, %3, %2" + [(set_attr "length" "4") + (set_attr "predicable" "yes")] +) + + +;; Unary arithmetic insns + +(define_expand "negdi2" + [(parallel + [(set (match_operand:DI 0 "s_register_operand" "") + (neg:DI (match_operand:DI 1 "s_register_operand" ""))) + (clobber (reg:CC CC_REGNUM))])] + "TARGET_EITHER" + "" +) + +;; The constraints here are to prevent a *partial* overlap (where %Q0 == %R1). +;; The first alternative allows the common case of a *full* overlap. +(define_insn "*arm_negdi2" + [(set (match_operand:DI 0 "s_register_operand" "=r,&r") + (neg:DI (match_operand:DI 1 "s_register_operand" "0,r"))) + (clobber (reg:CC CC_REGNUM))] + "TARGET_ARM" + "rsbs\\t%Q0, %Q1, #0\;rsc\\t%R0, %R1, #0" + [(set_attr "conds" "clob") + (set_attr "length" "8")] +) + +(define_insn "*thumb1_negdi2" + [(set (match_operand:DI 0 "register_operand" "=&l") + (neg:DI (match_operand:DI 1 "register_operand" "l"))) + (clobber (reg:CC CC_REGNUM))] + "TARGET_THUMB1" + "mov\\t%R0, #0\;neg\\t%Q0, %Q1\;sbc\\t%R0, %R1" + [(set_attr "length" "6")] +) + +(define_expand "negsi2" + [(set (match_operand:SI 0 "s_register_operand" "") + (neg:SI (match_operand:SI 1 "s_register_operand" "")))] + "TARGET_EITHER" + "" +) + +(define_insn "*arm_negsi2" + [(set (match_operand:SI 0 "s_register_operand" "=r") + (neg:SI (match_operand:SI 1 "s_register_operand" "r")))] + "TARGET_32BIT" + "rsb%?\\t%0, %1, #0" + [(set_attr "predicable" "yes")] +) + +(define_insn "*thumb1_negsi2" + [(set (match_operand:SI 0 "register_operand" "=l") + (neg:SI (match_operand:SI 1 "register_operand" "l")))] + "TARGET_THUMB1" + "neg\\t%0, %1" + [(set_attr "length" "2")] +) + +(define_expand "negsf2" + [(set (match_operand:SF 0 "s_register_operand" "") + (neg:SF (match_operand:SF 1 "s_register_operand" "")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && (TARGET_FPA || TARGET_VFP)" + "" +) + +(define_expand "negdf2" + [(set (match_operand:DF 0 "s_register_operand" "") + (neg:DF (match_operand:DF 1 "s_register_operand" "")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && (TARGET_FPA || TARGET_VFP_DOUBLE)" + "") + +;; abssi2 doesn't really clobber the condition codes if a different register +;; is being set. To keep things simple, assume during rtl manipulations that +;; it does, but tell the final scan operator the truth. Similarly for +;; (neg (abs...)) + +(define_expand "abssi2" + [(parallel + [(set (match_operand:SI 0 "s_register_operand" "") + (abs:SI (match_operand:SI 1 "s_register_operand" ""))) + (clobber (match_dup 2))])] + "TARGET_EITHER" + " + if (TARGET_THUMB1) + operands[2] = gen_rtx_SCRATCH (SImode); + else + operands[2] = gen_rtx_REG (CCmode, CC_REGNUM); +") + +(define_insn "*arm_abssi2" + [(set (match_operand:SI 0 "s_register_operand" "=r,&r") + (abs:SI (match_operand:SI 1 "s_register_operand" "0,r"))) + (clobber (reg:CC CC_REGNUM))] + "TARGET_ARM" + "@ + cmp\\t%0, #0\;rsblt\\t%0, %0, #0 + eor%?\\t%0, %1, %1, asr #31\;sub%?\\t%0, %0, %1, asr #31" + [(set_attr "conds" "clob,*") + (set_attr "shift" "1") + ;; predicable can't be set based on the variant, so left as no + (set_attr "length" "8")] +) + +(define_insn_and_split "*thumb1_abssi2" + [(set (match_operand:SI 0 "s_register_operand" "=l") + (abs:SI (match_operand:SI 1 "s_register_operand" "l"))) + (clobber (match_scratch:SI 2 "=&l"))] + "TARGET_THUMB1" + "#" + "TARGET_THUMB1 && reload_completed" + [(set (match_dup 2) (ashiftrt:SI (match_dup 1) (const_int 31))) + (set (match_dup 0) (plus:SI (match_dup 1) (match_dup 2))) + (set (match_dup 0) (xor:SI (match_dup 0) (match_dup 2)))] + "" + [(set_attr "length" "6")] +) + +(define_insn "*arm_neg_abssi2" + [(set (match_operand:SI 0 "s_register_operand" "=r,&r") + (neg:SI (abs:SI (match_operand:SI 1 "s_register_operand" "0,r")))) + (clobber (reg:CC CC_REGNUM))] + "TARGET_ARM" + "@ + cmp\\t%0, #0\;rsbgt\\t%0, %0, #0 + eor%?\\t%0, %1, %1, asr #31\;rsb%?\\t%0, %0, %1, asr #31" + [(set_attr "conds" "clob,*") + (set_attr "shift" "1") + ;; predicable can't be set based on the variant, so left as no + (set_attr "length" "8")] +) + +(define_insn_and_split "*thumb1_neg_abssi2" + [(set (match_operand:SI 0 "s_register_operand" "=l") + (neg:SI (abs:SI (match_operand:SI 1 "s_register_operand" "l")))) + (clobber (match_scratch:SI 2 "=&l"))] + "TARGET_THUMB1" + "#" + "TARGET_THUMB1 && reload_completed" + [(set (match_dup 2) (ashiftrt:SI (match_dup 1) (const_int 31))) + (set (match_dup 0) (minus:SI (match_dup 2) (match_dup 1))) + (set (match_dup 0) (xor:SI (match_dup 0) (match_dup 2)))] + "" + [(set_attr "length" "6")] +) + +(define_expand "abssf2" + [(set (match_operand:SF 0 "s_register_operand" "") + (abs:SF (match_operand:SF 1 "s_register_operand" "")))] + "TARGET_32BIT && TARGET_HARD_FLOAT" + "") + +(define_expand "absdf2" + [(set (match_operand:DF 0 "s_register_operand" "") + (abs:DF (match_operand:DF 1 "s_register_operand" "")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && !TARGET_VFP_SINGLE" + "") + +(define_expand "sqrtsf2" + [(set (match_operand:SF 0 "s_register_operand" "") + (sqrt:SF (match_operand:SF 1 "s_register_operand" "")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && (TARGET_FPA || TARGET_VFP)" + "") + +(define_expand "sqrtdf2" + [(set (match_operand:DF 0 "s_register_operand" "") + (sqrt:DF (match_operand:DF 1 "s_register_operand" "")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && (TARGET_FPA || TARGET_VFP_DOUBLE)" + "") + +(define_insn_and_split "one_cmpldi2" + [(set (match_operand:DI 0 "s_register_operand" "=&r,&r") + (not:DI (match_operand:DI 1 "s_register_operand" "0,r")))] + "TARGET_32BIT" + "#" + "TARGET_32BIT && reload_completed" + [(set (match_dup 0) (not:SI (match_dup 1))) + (set (match_dup 2) (not:SI (match_dup 3)))] + " + { + operands[2] = gen_highpart (SImode, operands[0]); + operands[0] = gen_lowpart (SImode, operands[0]); + operands[3] = gen_highpart (SImode, operands[1]); + operands[1] = gen_lowpart (SImode, operands[1]); + }" + [(set_attr "length" "8") + (set_attr "predicable" "yes")] +) + +(define_expand "one_cmplsi2" + [(set (match_operand:SI 0 "s_register_operand" "") + (not:SI (match_operand:SI 1 "s_register_operand" "")))] + "TARGET_EITHER" + "" +) + +(define_insn "*arm_one_cmplsi2" + [(set (match_operand:SI 0 "s_register_operand" "=r") + (not:SI (match_operand:SI 1 "s_register_operand" "r")))] + "TARGET_32BIT" + "mvn%?\\t%0, %1" + [(set_attr "predicable" "yes") + (set_attr "insn" "mvn")] +) + +(define_insn "*thumb1_one_cmplsi2" + [(set (match_operand:SI 0 "register_operand" "=l") + (not:SI (match_operand:SI 1 "register_operand" "l")))] + "TARGET_THUMB1" + "mvn\\t%0, %1" + [(set_attr "length" "2") + (set_attr "insn" "mvn")] +) + +(define_insn "*notsi_compare0" + [(set (reg:CC_NOOV CC_REGNUM) + (compare:CC_NOOV (not:SI (match_operand:SI 1 "s_register_operand" "r")) + (const_int 0))) + (set (match_operand:SI 0 "s_register_operand" "=r") + (not:SI (match_dup 1)))] + "TARGET_32BIT" + "mvn%.\\t%0, %1" + [(set_attr "conds" "set") + (set_attr "insn" "mvn")] +) + +(define_insn "*notsi_compare0_scratch" + [(set (reg:CC_NOOV CC_REGNUM) + (compare:CC_NOOV (not:SI (match_operand:SI 1 "s_register_operand" "r")) + (const_int 0))) + (clobber (match_scratch:SI 0 "=r"))] + "TARGET_32BIT" + "mvn%.\\t%0, %1" + [(set_attr "conds" "set") + (set_attr "insn" "mvn")] +) + +;; Fixed <--> Floating conversion insns + +(define_expand "floatsihf2" + [(set (match_operand:HF 0 "general_operand" "") + (float:HF (match_operand:SI 1 "general_operand" "")))] + "TARGET_EITHER" + " + { + rtx op1 = gen_reg_rtx (SFmode); + expand_float (op1, operands[1], 0); + op1 = convert_to_mode (HFmode, op1, 0); + emit_move_insn (operands[0], op1); + DONE; + }" +) + +(define_expand "floatdihf2" + [(set (match_operand:HF 0 "general_operand" "") + (float:HF (match_operand:DI 1 "general_operand" "")))] + "TARGET_EITHER" + " + { + rtx op1 = gen_reg_rtx (SFmode); + expand_float (op1, operands[1], 0); + op1 = convert_to_mode (HFmode, op1, 0); + emit_move_insn (operands[0], op1); + DONE; + }" +) + +(define_expand "floatsisf2" + [(set (match_operand:SF 0 "s_register_operand" "") + (float:SF (match_operand:SI 1 "s_register_operand" "")))] + "TARGET_32BIT && TARGET_HARD_FLOAT" + " + if (TARGET_MAVERICK) + { + emit_insn (gen_cirrus_floatsisf2 (operands[0], operands[1])); + DONE; + } +") + +(define_expand "floatsidf2" + [(set (match_operand:DF 0 "s_register_operand" "") + (float:DF (match_operand:SI 1 "s_register_operand" "")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && !TARGET_VFP_SINGLE" + " + if (TARGET_MAVERICK) + { + emit_insn (gen_cirrus_floatsidf2 (operands[0], operands[1])); + DONE; + } +") + +(define_expand "fix_trunchfsi2" + [(set (match_operand:SI 0 "general_operand" "") + (fix:SI (fix:HF (match_operand:HF 1 "general_operand" ""))))] + "TARGET_EITHER" + " + { + rtx op1 = convert_to_mode (SFmode, operands[1], 0); + expand_fix (operands[0], op1, 0); + DONE; + }" +) + +(define_expand "fix_trunchfdi2" + [(set (match_operand:DI 0 "general_operand" "") + (fix:DI (fix:HF (match_operand:HF 1 "general_operand" ""))))] + "TARGET_EITHER" + " + { + rtx op1 = convert_to_mode (SFmode, operands[1], 0); + expand_fix (operands[0], op1, 0); + DONE; + }" +) + +(define_expand "fix_truncsfsi2" + [(set (match_operand:SI 0 "s_register_operand" "") + (fix:SI (fix:SF (match_operand:SF 1 "s_register_operand" ""))))] + "TARGET_32BIT && TARGET_HARD_FLOAT" + " + if (TARGET_MAVERICK) + { + if (!cirrus_fp_register (operands[0], SImode)) + operands[0] = force_reg (SImode, operands[0]); + if (!cirrus_fp_register (operands[1], SFmode)) + operands[1] = force_reg (SFmode, operands[0]); + emit_insn (gen_cirrus_truncsfsi2 (operands[0], operands[1])); + DONE; + } +") + +(define_expand "fix_truncdfsi2" + [(set (match_operand:SI 0 "s_register_operand" "") + (fix:SI (fix:DF (match_operand:DF 1 "s_register_operand" ""))))] + "TARGET_32BIT && TARGET_HARD_FLOAT && !TARGET_VFP_SINGLE" + " + if (TARGET_MAVERICK) + { + if (!cirrus_fp_register (operands[1], DFmode)) + operands[1] = force_reg (DFmode, operands[0]); + emit_insn (gen_cirrus_truncdfsi2 (operands[0], operands[1])); + DONE; + } +") + +;; Truncation insns + +(define_expand "truncdfsf2" + [(set (match_operand:SF 0 "s_register_operand" "") + (float_truncate:SF + (match_operand:DF 1 "s_register_operand" "")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && !TARGET_VFP_SINGLE" + "" +) + +/* DFmode -> HFmode conversions have to go through SFmode. */ +(define_expand "truncdfhf2" + [(set (match_operand:HF 0 "general_operand" "") + (float_truncate:HF + (match_operand:DF 1 "general_operand" "")))] + "TARGET_EITHER" + " + { + rtx op1; + op1 = convert_to_mode (SFmode, operands[1], 0); + op1 = convert_to_mode (HFmode, op1, 0); + emit_move_insn (operands[0], op1); + DONE; + }" +) + +;; Zero and sign extension instructions. + +(define_insn "zero_extenddi2" + [(set (match_operand:DI 0 "s_register_operand" "=r") + (zero_extend:DI (match_operand:QHSI 1 "" + "")))] + "TARGET_32BIT " + "#" + [(set_attr "length" "8") + (set_attr "ce_count" "2") + (set_attr "predicable" "yes")] +) + +(define_insn "extenddi2" + [(set (match_operand:DI 0 "s_register_operand" "=r") + (sign_extend:DI (match_operand:QHSI 1 "" + "")))] + "TARGET_32BIT " + "#" + [(set_attr "length" "8") + (set_attr "ce_count" "2") + (set_attr "shift" "1") + (set_attr "predicable" "yes")] +) + +;; Splits for all extensions to DImode +(define_split + [(set (match_operand:DI 0 "s_register_operand" "") + (zero_extend:DI (match_operand 1 "nonimmediate_operand" "")))] + "TARGET_32BIT" + [(set (match_dup 0) (match_dup 1))] +{ + rtx lo_part = gen_lowpart (SImode, operands[0]); + enum machine_mode src_mode = GET_MODE (operands[1]); + + if (REG_P (operands[0]) + && !reg_overlap_mentioned_p (operands[0], operands[1])) + emit_clobber (operands[0]); + if (!REG_P (lo_part) || src_mode != SImode + || !rtx_equal_p (lo_part, operands[1])) + { + if (src_mode == SImode) + emit_move_insn (lo_part, operands[1]); + else + emit_insn (gen_rtx_SET (VOIDmode, lo_part, + gen_rtx_ZERO_EXTEND (SImode, operands[1]))); + operands[1] = lo_part; + } + operands[0] = gen_highpart (SImode, operands[0]); + operands[1] = const0_rtx; +}) + +(define_split + [(set (match_operand:DI 0 "s_register_operand" "") + (sign_extend:DI (match_operand 1 "nonimmediate_operand" "")))] + "TARGET_32BIT" + [(set (match_dup 0) (ashiftrt:SI (match_dup 1) (const_int 31)))] +{ + rtx lo_part = gen_lowpart (SImode, operands[0]); + enum machine_mode src_mode = GET_MODE (operands[1]); + + if (REG_P (operands[0]) + && !reg_overlap_mentioned_p (operands[0], operands[1])) + emit_clobber (operands[0]); + + if (!REG_P (lo_part) || src_mode != SImode + || !rtx_equal_p (lo_part, operands[1])) + { + if (src_mode == SImode) + emit_move_insn (lo_part, operands[1]); + else + emit_insn (gen_rtx_SET (VOIDmode, lo_part, + gen_rtx_SIGN_EXTEND (SImode, operands[1]))); + operands[1] = lo_part; + } + operands[0] = gen_highpart (SImode, operands[0]); +}) + +(define_expand "zero_extendhisi2" + [(set (match_operand:SI 0 "s_register_operand" "") + (zero_extend:SI (match_operand:HI 1 "nonimmediate_operand" "")))] + "TARGET_EITHER" +{ + if (TARGET_ARM && !arm_arch4 && MEM_P (operands[1])) + { + emit_insn (gen_movhi_bytes (operands[0], operands[1])); + DONE; + } + if (!arm_arch6 && !MEM_P (operands[1])) + { + rtx t = gen_lowpart (SImode, operands[1]); + rtx tmp = gen_reg_rtx (SImode); + emit_insn (gen_ashlsi3 (tmp, t, GEN_INT (16))); + emit_insn (gen_lshrsi3 (operands[0], tmp, GEN_INT (16))); + DONE; + } +}) + +(define_split + [(set (match_operand:SI 0 "s_register_operand" "") + (zero_extend:SI (match_operand:HI 1 "s_register_operand" "")))] + "!TARGET_THUMB2 && !arm_arch6" + [(set (match_dup 0) (ashift:SI (match_dup 2) (const_int 16))) + (set (match_dup 0) (lshiftrt:SI (match_dup 0) (const_int 16)))] +{ + operands[2] = gen_lowpart (SImode, operands[1]); +}) + +(define_insn "*thumb1_zero_extendhisi2" + [(set (match_operand:SI 0 "register_operand" "=l,l") + (zero_extend:SI (match_operand:HI 1 "nonimmediate_operand" "l,m")))] + "TARGET_THUMB1" +{ + rtx mem; + + if (which_alternative == 0 && arm_arch6) + return "uxth\t%0, %1"; + if (which_alternative == 0) + return "#"; + + mem = XEXP (operands[1], 0); + + if (GET_CODE (mem) == CONST) + mem = XEXP (mem, 0); + + if (GET_CODE (mem) == PLUS) + { + rtx a = XEXP (mem, 0); + + /* This can happen due to bugs in reload. */ + if (GET_CODE (a) == REG && REGNO (a) == SP_REGNUM) + { + rtx ops[2]; + ops[0] = operands[0]; + ops[1] = a; + + output_asm_insn ("mov\t%0, %1", ops); + + XEXP (mem, 0) = operands[0]; + } + } + + return "ldrh\t%0, %1"; +} + [(set_attr_alternative "length" + [(if_then_else (eq_attr "is_arch6" "yes") + (const_int 2) (const_int 4)) + (const_int 4)]) + (set_attr "type" "alu_shift,load_byte")] +) + +(define_insn "*arm_zero_extendhisi2" + [(set (match_operand:SI 0 "s_register_operand" "=r,r") + (zero_extend:SI (match_operand:HI 1 "nonimmediate_operand" "r,m")))] + "TARGET_ARM && arm_arch4 && !arm_arch6" + "@ + # + ldr%(h%)\\t%0, %1" + [(set_attr "type" "alu_shift,load_byte") + (set_attr "predicable" "yes")] +) + +(define_insn "*arm_zero_extendhisi2_v6" + [(set (match_operand:SI 0 "s_register_operand" "=r,r") + (zero_extend:SI (match_operand:HI 1 "nonimmediate_operand" "r,m")))] + "TARGET_ARM && arm_arch6" + "@ + uxth%?\\t%0, %1 + ldr%(h%)\\t%0, %1" + [(set_attr "type" "alu_shift,load_byte") + (set_attr "predicable" "yes")] +) + +(define_insn "*arm_zero_extendhisi2addsi" + [(set (match_operand:SI 0 "s_register_operand" "=r") + (plus:SI (zero_extend:SI (match_operand:HI 1 "s_register_operand" "r")) + (match_operand:SI 2 "s_register_operand" "r")))] + "TARGET_INT_SIMD" + "uxtah%?\\t%0, %2, %1" + [(set_attr "type" "alu_shift") + (set_attr "predicable" "yes")] +) + +(define_expand "zero_extendqisi2" + [(set (match_operand:SI 0 "s_register_operand" "") + (zero_extend:SI (match_operand:QI 1 "nonimmediate_operand" "")))] + "TARGET_EITHER" +{ + if (TARGET_ARM && !arm_arch6 && GET_CODE (operands[1]) != MEM) + { + emit_insn (gen_andsi3 (operands[0], + gen_lowpart (SImode, operands[1]), + GEN_INT (255))); + DONE; + } + if (!arm_arch6 && !MEM_P (operands[1])) + { + rtx t = gen_lowpart (SImode, operands[1]); + rtx tmp = gen_reg_rtx (SImode); + emit_insn (gen_ashlsi3 (tmp, t, GEN_INT (24))); + emit_insn (gen_lshrsi3 (operands[0], tmp, GEN_INT (24))); + DONE; + } +}) + +(define_split + [(set (match_operand:SI 0 "s_register_operand" "") + (zero_extend:SI (match_operand:QI 1 "s_register_operand" "")))] + "!arm_arch6" + [(set (match_dup 0) (ashift:SI (match_dup 2) (const_int 24))) + (set (match_dup 0) (lshiftrt:SI (match_dup 0) (const_int 24)))] +{ + operands[2] = simplify_gen_subreg (SImode, operands[1], QImode, 0); + if (TARGET_ARM) + { + emit_insn (gen_andsi3 (operands[0], operands[2], GEN_INT (255))); + DONE; + } +}) + +(define_insn "*thumb1_zero_extendqisi2" + [(set (match_operand:SI 0 "register_operand" "=l,l") + (zero_extend:SI (match_operand:QI 1 "nonimmediate_operand" "l,m")))] + "TARGET_THUMB1 && !arm_arch6" + "@ + # + ldrb\\t%0, %1" + [(set_attr "length" "4,2") + (set_attr "type" "alu_shift,load_byte") + (set_attr "pool_range" "*,32")] +) + +(define_insn "*thumb1_zero_extendqisi2_v6" + [(set (match_operand:SI 0 "register_operand" "=l,l") + (zero_extend:SI (match_operand:QI 1 "nonimmediate_operand" "l,m")))] + "TARGET_THUMB1 && arm_arch6" + "@ + uxtb\\t%0, %1 + ldrb\\t%0, %1" + [(set_attr "length" "2") + (set_attr "type" "alu_shift,load_byte")] +) + +(define_insn "*arm_zero_extendqisi2" + [(set (match_operand:SI 0 "s_register_operand" "=r,r") + (zero_extend:SI (match_operand:QI 1 "nonimmediate_operand" "r,m")))] + "TARGET_ARM && !arm_arch6" + "@ + # + ldr%(b%)\\t%0, %1\\t%@ zero_extendqisi2" + [(set_attr "length" "8,4") + (set_attr "type" "alu_shift,load_byte") + (set_attr "predicable" "yes")] +) + +(define_insn "*arm_zero_extendqisi2_v6" + [(set (match_operand:SI 0 "s_register_operand" "=r,r") + (zero_extend:SI (match_operand:QI 1 "nonimmediate_operand" "r,m")))] + "TARGET_ARM && arm_arch6" + "@ + uxtb%(%)\\t%0, %1 + ldr%(b%)\\t%0, %1\\t%@ zero_extendqisi2" + [(set_attr "type" "alu_shift,load_byte") + (set_attr "predicable" "yes")] +) + +(define_insn "*arm_zero_extendqisi2addsi" + [(set (match_operand:SI 0 "s_register_operand" "=r") + (plus:SI (zero_extend:SI (match_operand:QI 1 "s_register_operand" "r")) + (match_operand:SI 2 "s_register_operand" "r")))] + "TARGET_INT_SIMD" + "uxtab%?\\t%0, %2, %1" + [(set_attr "predicable" "yes") + (set_attr "insn" "xtab") + (set_attr "type" "alu_shift")] +) + +(define_split + [(set (match_operand:SI 0 "s_register_operand" "") + (zero_extend:SI (subreg:QI (match_operand:SI 1 "" "") 0))) + (clobber (match_operand:SI 2 "s_register_operand" ""))] + "TARGET_32BIT && (GET_CODE (operands[1]) != MEM) && ! BYTES_BIG_ENDIAN" + [(set (match_dup 2) (match_dup 1)) + (set (match_dup 0) (and:SI (match_dup 2) (const_int 255)))] + "" +) + +(define_split + [(set (match_operand:SI 0 "s_register_operand" "") + (zero_extend:SI (subreg:QI (match_operand:SI 1 "" "") 3))) + (clobber (match_operand:SI 2 "s_register_operand" ""))] + "TARGET_32BIT && (GET_CODE (operands[1]) != MEM) && BYTES_BIG_ENDIAN" + [(set (match_dup 2) (match_dup 1)) + (set (match_dup 0) (and:SI (match_dup 2) (const_int 255)))] + "" +) + + +(define_split + [(set (match_operand:SI 0 "s_register_operand" "") + (ior_xor:SI (and:SI (ashift:SI + (match_operand:SI 1 "s_register_operand" "") + (match_operand:SI 2 "const_int_operand" "")) + (match_operand:SI 3 "const_int_operand" "")) + (zero_extend:SI + (match_operator 5 "subreg_lowpart_operator" + [(match_operand:SI 4 "s_register_operand" "")]))))] + "TARGET_32BIT + && ((unsigned HOST_WIDE_INT) INTVAL (operands[3]) + == (GET_MODE_MASK (GET_MODE (operands[5])) + & (GET_MODE_MASK (GET_MODE (operands[5])) + << (INTVAL (operands[2])))))" + [(set (match_dup 0) (ior_xor:SI (ashift:SI (match_dup 1) (match_dup 2)) + (match_dup 4))) + (set (match_dup 0) (zero_extend:SI (match_dup 5)))] + "operands[5] = gen_lowpart (GET_MODE (operands[5]), operands[0]);" +) + +(define_insn "*compareqi_eq0" + [(set (reg:CC_Z CC_REGNUM) + (compare:CC_Z (match_operand:QI 0 "s_register_operand" "r") + (const_int 0)))] + "TARGET_32BIT" + "tst\\t%0, #255" + [(set_attr "conds" "set")] +) + +(define_expand "extendhisi2" + [(set (match_operand:SI 0 "s_register_operand" "") + (sign_extend:SI (match_operand:HI 1 "nonimmediate_operand" "")))] + "TARGET_EITHER" +{ + if (TARGET_THUMB1) + { + emit_insn (gen_thumb1_extendhisi2 (operands[0], operands[1])); + DONE; + } + if (MEM_P (operands[1]) && TARGET_ARM && !arm_arch4) + { + emit_insn (gen_extendhisi2_mem (operands[0], operands[1])); + DONE; + } + + if (!arm_arch6 && !MEM_P (operands[1])) + { + rtx t = gen_lowpart (SImode, operands[1]); + rtx tmp = gen_reg_rtx (SImode); + emit_insn (gen_ashlsi3 (tmp, t, GEN_INT (16))); + emit_insn (gen_ashrsi3 (operands[0], tmp, GEN_INT (16))); + DONE; + } +}) + +(define_split + [(parallel + [(set (match_operand:SI 0 "register_operand" "") + (sign_extend:SI (match_operand:HI 1 "register_operand" ""))) + (clobber (match_scratch:SI 2 ""))])] + "!arm_arch6" + [(set (match_dup 0) (ashift:SI (match_dup 2) (const_int 16))) + (set (match_dup 0) (ashiftrt:SI (match_dup 0) (const_int 16)))] +{ + operands[2] = simplify_gen_subreg (SImode, operands[1], HImode, 0); +}) + +;; We used to have an early-clobber on the scratch register here. +;; However, there's a bug somewhere in reload which means that this +;; can be partially ignored during spill allocation if the memory +;; address also needs reloading; this causes us to die later on when +;; we try to verify the operands. Fortunately, we don't really need +;; the early-clobber: we can always use operand 0 if operand 2 +;; overlaps the address. +(define_insn "thumb1_extendhisi2" + [(set (match_operand:SI 0 "register_operand" "=l,l") + (sign_extend:SI (match_operand:HI 1 "nonimmediate_operand" "l,m"))) + (clobber (match_scratch:SI 2 "=X,l"))] + "TARGET_THUMB1" + "* + { + rtx ops[4]; + rtx mem; + + if (which_alternative == 0 && !arm_arch6) + return \"#\"; + if (which_alternative == 0) + return \"sxth\\t%0, %1\"; + + mem = XEXP (operands[1], 0); + + /* This code used to try to use 'V', and fix the address only if it was + offsettable, but this fails for e.g. REG+48 because 48 is outside the + range of QImode offsets, and offsettable_address_p does a QImode + address check. */ + + if (GET_CODE (mem) == CONST) + mem = XEXP (mem, 0); + + if (GET_CODE (mem) == LABEL_REF) + return \"ldr\\t%0, %1\"; + + if (GET_CODE (mem) == PLUS) + { + rtx a = XEXP (mem, 0); + rtx b = XEXP (mem, 1); + + if (GET_CODE (a) == LABEL_REF + && GET_CODE (b) == CONST_INT) + return \"ldr\\t%0, %1\"; + + if (GET_CODE (b) == REG) + return \"ldrsh\\t%0, %1\"; + + ops[1] = a; + ops[2] = b; + } + else + { + ops[1] = mem; + ops[2] = const0_rtx; + } + + gcc_assert (GET_CODE (ops[1]) == REG); + + ops[0] = operands[0]; + if (reg_mentioned_p (operands[2], ops[1])) + ops[3] = ops[0]; + else + ops[3] = operands[2]; + output_asm_insn (\"mov\\t%3, %2\;ldrsh\\t%0, [%1, %3]\", ops); + return \"\"; + }" + [(set_attr_alternative "length" + [(if_then_else (eq_attr "is_arch6" "yes") + (const_int 2) (const_int 4)) + (const_int 4)]) + (set_attr "type" "alu_shift,load_byte") + (set_attr "pool_range" "*,1020")] +) + +;; This pattern will only be used when ldsh is not available +(define_expand "extendhisi2_mem" + [(set (match_dup 2) (zero_extend:SI (match_operand:HI 1 "" ""))) + (set (match_dup 3) + (zero_extend:SI (match_dup 7))) + (set (match_dup 6) (ashift:SI (match_dup 4) (const_int 24))) + (set (match_operand:SI 0 "" "") + (ior:SI (ashiftrt:SI (match_dup 6) (const_int 16)) (match_dup 5)))] + "TARGET_ARM" + " + { + rtx mem1, mem2; + rtx addr = copy_to_mode_reg (SImode, XEXP (operands[1], 0)); + + mem1 = change_address (operands[1], QImode, addr); + mem2 = change_address (operands[1], QImode, plus_constant (addr, 1)); + operands[0] = gen_lowpart (SImode, operands[0]); + operands[1] = mem1; + operands[2] = gen_reg_rtx (SImode); + operands[3] = gen_reg_rtx (SImode); + operands[6] = gen_reg_rtx (SImode); + operands[7] = mem2; + + if (BYTES_BIG_ENDIAN) + { + operands[4] = operands[2]; + operands[5] = operands[3]; + } + else + { + operands[4] = operands[3]; + operands[5] = operands[2]; + } + }" +) + +(define_split + [(set (match_operand:SI 0 "register_operand" "") + (sign_extend:SI (match_operand:HI 1 "register_operand" "")))] + "!arm_arch6" + [(set (match_dup 0) (ashift:SI (match_dup 2) (const_int 16))) + (set (match_dup 0) (ashiftrt:SI (match_dup 0) (const_int 16)))] +{ + operands[2] = simplify_gen_subreg (SImode, operands[1], HImode, 0); +}) + +(define_insn "*arm_extendhisi2" + [(set (match_operand:SI 0 "s_register_operand" "=r,r") + (sign_extend:SI (match_operand:HI 1 "nonimmediate_operand" "r,m")))] + "TARGET_ARM && arm_arch4 && !arm_arch6" + "@ + # + ldr%(sh%)\\t%0, %1" + [(set_attr "length" "8,4") + (set_attr "type" "alu_shift,load_byte") + (set_attr "predicable" "yes") + (set_attr "pool_range" "*,256") + (set_attr "neg_pool_range" "*,244")] +) + +;; ??? Check Thumb-2 pool range +(define_insn "*arm_extendhisi2_v6" + [(set (match_operand:SI 0 "s_register_operand" "=r,r") + (sign_extend:SI (match_operand:HI 1 "nonimmediate_operand" "r,m")))] + "TARGET_32BIT && arm_arch6" + "@ + sxth%?\\t%0, %1 + ldr%(sh%)\\t%0, %1" + [(set_attr "type" "alu_shift,load_byte") + (set_attr "predicable" "yes") + (set_attr "pool_range" "*,256") + (set_attr "neg_pool_range" "*,244")] +) + +(define_insn "*arm_extendhisi2addsi" + [(set (match_operand:SI 0 "s_register_operand" "=r") + (plus:SI (sign_extend:SI (match_operand:HI 1 "s_register_operand" "r")) + (match_operand:SI 2 "s_register_operand" "r")))] + "TARGET_INT_SIMD" + "sxtah%?\\t%0, %2, %1" +) + +(define_expand "extendqihi2" + [(set (match_dup 2) + (ashift:SI (match_operand:QI 1 "arm_reg_or_extendqisi_mem_op" "") + (const_int 24))) + (set (match_operand:HI 0 "s_register_operand" "") + (ashiftrt:SI (match_dup 2) + (const_int 24)))] + "TARGET_ARM" + " + { + if (arm_arch4 && GET_CODE (operands[1]) == MEM) + { + emit_insn (gen_rtx_SET (VOIDmode, + operands[0], + gen_rtx_SIGN_EXTEND (HImode, operands[1]))); + DONE; + } + if (!s_register_operand (operands[1], QImode)) + operands[1] = copy_to_mode_reg (QImode, operands[1]); + operands[0] = gen_lowpart (SImode, operands[0]); + operands[1] = gen_lowpart (SImode, operands[1]); + operands[2] = gen_reg_rtx (SImode); + }" +) + +(define_insn "*arm_extendqihi_insn" + [(set (match_operand:HI 0 "s_register_operand" "=r") + (sign_extend:HI (match_operand:QI 1 "arm_extendqisi_mem_op" "Uq")))] + "TARGET_ARM && arm_arch4" + "ldr%(sb%)\\t%0, %1" + [(set_attr "type" "load_byte") + (set_attr "predicable" "yes") + (set_attr "pool_range" "256") + (set_attr "neg_pool_range" "244")] +) + +(define_expand "extendqisi2" + [(set (match_operand:SI 0 "s_register_operand" "") + (sign_extend:SI (match_operand:QI 1 "arm_reg_or_extendqisi_mem_op" "")))] + "TARGET_EITHER" +{ + if (!arm_arch4 && MEM_P (operands[1])) + operands[1] = copy_to_mode_reg (QImode, operands[1]); + + if (!arm_arch6 && !MEM_P (operands[1])) + { + rtx t = gen_lowpart (SImode, operands[1]); + rtx tmp = gen_reg_rtx (SImode); + emit_insn (gen_ashlsi3 (tmp, t, GEN_INT (24))); + emit_insn (gen_ashrsi3 (operands[0], tmp, GEN_INT (24))); + DONE; + } +}) + +(define_split + [(set (match_operand:SI 0 "register_operand" "") + (sign_extend:SI (match_operand:QI 1 "register_operand" "")))] + "!arm_arch6" + [(set (match_dup 0) (ashift:SI (match_dup 2) (const_int 24))) + (set (match_dup 0) (ashiftrt:SI (match_dup 0) (const_int 24)))] +{ + operands[2] = simplify_gen_subreg (SImode, operands[1], QImode, 0); +}) + +(define_insn "*arm_extendqisi" + [(set (match_operand:SI 0 "s_register_operand" "=r,r") + (sign_extend:SI (match_operand:QI 1 "arm_reg_or_extendqisi_mem_op" "r,Uq")))] + "TARGET_ARM && arm_arch4 && !arm_arch6" + "@ + # + ldr%(sb%)\\t%0, %1" + [(set_attr "length" "8,4") + (set_attr "type" "alu_shift,load_byte") + (set_attr "predicable" "yes") + (set_attr "pool_range" "*,256") + (set_attr "neg_pool_range" "*,244")] +) + +(define_insn "*arm_extendqisi_v6" + [(set (match_operand:SI 0 "s_register_operand" "=r,r") + (sign_extend:SI + (match_operand:QI 1 "arm_reg_or_extendqisi_mem_op" "r,Uq")))] + "TARGET_ARM && arm_arch6" + "@ + sxtb%?\\t%0, %1 + ldr%(sb%)\\t%0, %1" + [(set_attr "type" "alu_shift,load_byte") + (set_attr "predicable" "yes") + (set_attr "pool_range" "*,256") + (set_attr "neg_pool_range" "*,244")] +) + +(define_insn "*arm_extendqisi2addsi" + [(set (match_operand:SI 0 "s_register_operand" "=r") + (plus:SI (sign_extend:SI (match_operand:QI 1 "s_register_operand" "r")) + (match_operand:SI 2 "s_register_operand" "r")))] + "TARGET_INT_SIMD" + "sxtab%?\\t%0, %2, %1" + [(set_attr "type" "alu_shift") + (set_attr "insn" "xtab") + (set_attr "predicable" "yes")] +) + +(define_split + [(set (match_operand:SI 0 "register_operand" "") + (sign_extend:SI (match_operand:QI 1 "memory_operand" "")))] + "TARGET_THUMB1 && reload_completed" + [(set (match_dup 0) (match_dup 2)) + (set (match_dup 0) (sign_extend:SI (match_dup 3)))] +{ + rtx addr = XEXP (operands[1], 0); + + if (GET_CODE (addr) == CONST) + addr = XEXP (addr, 0); + + if (GET_CODE (addr) == PLUS + && REG_P (XEXP (addr, 0)) && REG_P (XEXP (addr, 1))) + /* No split necessary. */ + FAIL; + + if (GET_CODE (addr) == PLUS + && !REG_P (XEXP (addr, 0)) && !REG_P (XEXP (addr, 1))) + FAIL; + + if (reg_overlap_mentioned_p (operands[0], addr)) + { + rtx t = gen_lowpart (QImode, operands[0]); + emit_move_insn (t, operands[1]); + emit_insn (gen_thumb1_extendqisi2 (operands[0], t)); + DONE; + } + + if (REG_P (addr)) + { + addr = gen_rtx_PLUS (Pmode, addr, operands[0]); + operands[2] = const0_rtx; + } + else if (GET_CODE (addr) != PLUS) + FAIL; + else if (REG_P (XEXP (addr, 0))) + { + operands[2] = XEXP (addr, 1); + addr = gen_rtx_PLUS (Pmode, XEXP (addr, 0), operands[0]); + } + else + { + operands[2] = XEXP (addr, 0); + addr = gen_rtx_PLUS (Pmode, XEXP (addr, 1), operands[0]); + } + + operands[3] = change_address (operands[1], QImode, addr); +}) + +(define_peephole2 + [(set (match_operand:SI 0 "register_operand" "") + (plus:SI (match_dup 0) (match_operand 1 "const_int_operand"))) + (set (match_operand:SI 2 "register_operand" "") (const_int 0)) + (set (match_operand:SI 3 "register_operand" "") + (sign_extend:SI (match_operand:QI 4 "memory_operand" "")))] + "TARGET_THUMB1 + && GET_CODE (XEXP (operands[4], 0)) == PLUS + && rtx_equal_p (operands[0], XEXP (XEXP (operands[4], 0), 0)) + && rtx_equal_p (operands[2], XEXP (XEXP (operands[4], 0), 1)) + && (peep2_reg_dead_p (3, operands[0]) + || rtx_equal_p (operands[0], operands[3])) + && (peep2_reg_dead_p (3, operands[2]) + || rtx_equal_p (operands[2], operands[3]))" + [(set (match_dup 2) (match_dup 1)) + (set (match_dup 3) (sign_extend:SI (match_dup 4)))] +{ + rtx addr = gen_rtx_PLUS (Pmode, operands[0], operands[2]); + operands[4] = change_address (operands[4], QImode, addr); +}) + +(define_insn "thumb1_extendqisi2" + [(set (match_operand:SI 0 "register_operand" "=l,l,l") + (sign_extend:SI (match_operand:QI 1 "nonimmediate_operand" "l,V,m")))] + "TARGET_THUMB1" +{ + rtx addr; + + if (which_alternative == 0 && arm_arch6) + return "sxtb\\t%0, %1"; + if (which_alternative == 0) + return "#"; + + addr = XEXP (operands[1], 0); + if (GET_CODE (addr) == PLUS + && REG_P (XEXP (addr, 0)) && REG_P (XEXP (addr, 1))) + return "ldrsb\\t%0, %1"; + + return "#"; +} + [(set_attr_alternative "length" + [(if_then_else (eq_attr "is_arch6" "yes") + (const_int 2) (const_int 4)) + (const_int 2) + (if_then_else (eq_attr "is_arch6" "yes") + (const_int 4) (const_int 6))]) + (set_attr "type" "alu_shift,load_byte,load_byte")] +) + +(define_expand "extendsfdf2" + [(set (match_operand:DF 0 "s_register_operand" "") + (float_extend:DF (match_operand:SF 1 "s_register_operand" "")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && !TARGET_VFP_SINGLE" + "" +) + +/* HFmode -> DFmode conversions have to go through SFmode. */ +(define_expand "extendhfdf2" + [(set (match_operand:DF 0 "general_operand" "") + (float_extend:DF (match_operand:HF 1 "general_operand" "")))] + "TARGET_EITHER" + " + { + rtx op1; + op1 = convert_to_mode (SFmode, operands[1], 0); + op1 = convert_to_mode (DFmode, op1, 0); + emit_insn (gen_movdf (operands[0], op1)); + DONE; + }" +) + +;; Move insns (including loads and stores) + +;; XXX Just some ideas about movti. +;; I don't think these are a good idea on the arm, there just aren't enough +;; registers +;;(define_expand "loadti" +;; [(set (match_operand:TI 0 "s_register_operand" "") +;; (mem:TI (match_operand:SI 1 "address_operand" "")))] +;; "" "") + +;;(define_expand "storeti" +;; [(set (mem:TI (match_operand:TI 0 "address_operand" "")) +;; (match_operand:TI 1 "s_register_operand" ""))] +;; "" "") + +;;(define_expand "movti" +;; [(set (match_operand:TI 0 "general_operand" "") +;; (match_operand:TI 1 "general_operand" ""))] +;; "" +;; " +;;{ +;; rtx insn; +;; +;; if (GET_CODE (operands[0]) == MEM && GET_CODE (operands[1]) == MEM) +;; operands[1] = copy_to_reg (operands[1]); +;; if (GET_CODE (operands[0]) == MEM) +;; insn = gen_storeti (XEXP (operands[0], 0), operands[1]); +;; else if (GET_CODE (operands[1]) == MEM) +;; insn = gen_loadti (operands[0], XEXP (operands[1], 0)); +;; else +;; FAIL; +;; +;; emit_insn (insn); +;; DONE; +;;}") + +;; Recognize garbage generated above. + +;;(define_insn "" +;; [(set (match_operand:TI 0 "general_operand" "=r,r,r,<,>,m") +;; (match_operand:TI 1 "general_operand" "<,>,m,r,r,r"))] +;; "" +;; "* +;; { +;; register mem = (which_alternative < 3); +;; register const char *template; +;; +;; operands[mem] = XEXP (operands[mem], 0); +;; switch (which_alternative) +;; { +;; case 0: template = \"ldmdb\\t%1!, %M0\"; break; +;; case 1: template = \"ldmia\\t%1!, %M0\"; break; +;; case 2: template = \"ldmia\\t%1, %M0\"; break; +;; case 3: template = \"stmdb\\t%0!, %M1\"; break; +;; case 4: template = \"stmia\\t%0!, %M1\"; break; +;; case 5: template = \"stmia\\t%0, %M1\"; break; +;; } +;; output_asm_insn (template, operands); +;; return \"\"; +;; }") + +(define_expand "movdi" + [(set (match_operand:DI 0 "general_operand" "") + (match_operand:DI 1 "general_operand" ""))] + "TARGET_EITHER" + " + if (can_create_pseudo_p ()) + { + if (GET_CODE (operands[0]) != REG) + operands[1] = force_reg (DImode, operands[1]); + } + " +) + +(define_insn "*arm_movdi" + [(set (match_operand:DI 0 "nonimmediate_di_operand" "=r, r, r, r, m") + (match_operand:DI 1 "di_operand" "rDa,Db,Dc,mi,r"))] + "TARGET_32BIT + && !(TARGET_HARD_FLOAT && (TARGET_MAVERICK || TARGET_VFP)) + && !TARGET_IWMMXT + && ( register_operand (operands[0], DImode) + || register_operand (operands[1], DImode))" + "* + switch (which_alternative) + { + case 0: + case 1: + case 2: + return \"#\"; + default: + return output_move_double (operands); + } + " + [(set_attr "length" "8,12,16,8,8") + (set_attr "type" "*,*,*,load2,store2") + (set_attr "arm_pool_range" "*,*,*,1020,*") + (set_attr "arm_neg_pool_range" "*,*,*,1008,*") + (set_attr "thumb2_pool_range" "*,*,*,4096,*") + (set_attr "thumb2_neg_pool_range" "*,*,*,0,*")] +) + +(define_split + [(set (match_operand:ANY64 0 "arm_general_register_operand" "") + (match_operand:ANY64 1 "const_double_operand" ""))] + "TARGET_32BIT + && reload_completed + && (arm_const_double_inline_cost (operands[1]) + <= ((optimize_size || arm_ld_sched) ? 3 : 4))" + [(const_int 0)] + " + arm_split_constant (SET, SImode, curr_insn, + INTVAL (gen_lowpart (SImode, operands[1])), + gen_lowpart (SImode, operands[0]), NULL_RTX, 0); + arm_split_constant (SET, SImode, curr_insn, + INTVAL (gen_highpart_mode (SImode, + GET_MODE (operands[0]), + operands[1])), + gen_highpart (SImode, operands[0]), NULL_RTX, 0); + DONE; + " +) + +; If optimizing for size, or if we have load delay slots, then +; we want to split the constant into two separate operations. +; In both cases this may split a trivial part into a single data op +; leaving a single complex constant to load. We can also get longer +; offsets in a LDR which means we get better chances of sharing the pool +; entries. Finally, we can normally do a better job of scheduling +; LDR instructions than we can with LDM. +; This pattern will only match if the one above did not. +(define_split + [(set (match_operand:ANY64 0 "arm_general_register_operand" "") + (match_operand:ANY64 1 "const_double_operand" ""))] + "TARGET_ARM && reload_completed + && arm_const_double_by_parts (operands[1])" + [(set (match_dup 0) (match_dup 1)) + (set (match_dup 2) (match_dup 3))] + " + operands[2] = gen_highpart (SImode, operands[0]); + operands[3] = gen_highpart_mode (SImode, GET_MODE (operands[0]), + operands[1]); + operands[0] = gen_lowpart (SImode, operands[0]); + operands[1] = gen_lowpart (SImode, operands[1]); + " +) + +(define_split + [(set (match_operand:ANY64 0 "arm_general_register_operand" "") + (match_operand:ANY64 1 "arm_general_register_operand" ""))] + "TARGET_EITHER && reload_completed" + [(set (match_dup 0) (match_dup 1)) + (set (match_dup 2) (match_dup 3))] + " + operands[2] = gen_highpart (SImode, operands[0]); + operands[3] = gen_highpart (SImode, operands[1]); + operands[0] = gen_lowpart (SImode, operands[0]); + operands[1] = gen_lowpart (SImode, operands[1]); + + /* Handle a partial overlap. */ + if (rtx_equal_p (operands[0], operands[3])) + { + rtx tmp0 = operands[0]; + rtx tmp1 = operands[1]; + + operands[0] = operands[2]; + operands[1] = operands[3]; + operands[2] = tmp0; + operands[3] = tmp1; + } + " +) + +;; We can't actually do base+index doubleword loads if the index and +;; destination overlap. Split here so that we at least have chance to +;; schedule. +(define_split + [(set (match_operand:DI 0 "s_register_operand" "") + (mem:DI (plus:SI (match_operand:SI 1 "s_register_operand" "") + (match_operand:SI 2 "s_register_operand" ""))))] + "TARGET_LDRD + && reg_overlap_mentioned_p (operands[0], operands[1]) + && reg_overlap_mentioned_p (operands[0], operands[2])" + [(set (match_dup 4) + (plus:SI (match_dup 1) + (match_dup 2))) + (set (match_dup 0) + (mem:DI (match_dup 4)))] + " + operands[4] = gen_rtx_REG (SImode, REGNO(operands[0])); + " +) + +;;; ??? This should have alternatives for constants. +;;; ??? This was originally identical to the movdf_insn pattern. +;;; ??? The 'i' constraint looks funny, but it should always be replaced by +;;; thumb_reorg with a memory reference. +(define_insn "*thumb1_movdi_insn" + [(set (match_operand:DI 0 "nonimmediate_operand" "=l,l,l,l,>,l, m,*r") + (match_operand:DI 1 "general_operand" "l, I,J,>,l,mi,l,*r"))] + "TARGET_THUMB1 + && !(TARGET_HARD_FLOAT && TARGET_MAVERICK) + && ( register_operand (operands[0], DImode) + || register_operand (operands[1], DImode))" + "* + { + switch (which_alternative) + { + default: + case 0: + if (REGNO (operands[1]) == REGNO (operands[0]) + 1) + return \"add\\t%0, %1, #0\;add\\t%H0, %H1, #0\"; + return \"add\\t%H0, %H1, #0\;add\\t%0, %1, #0\"; + case 1: + return \"mov\\t%Q0, %1\;mov\\t%R0, #0\"; + case 2: + operands[1] = GEN_INT (- INTVAL (operands[1])); + return \"mov\\t%Q0, %1\;neg\\t%Q0, %Q0\;asr\\t%R0, %Q0, #31\"; + case 3: + return \"ldmia\\t%1, {%0, %H0}\"; + case 4: + return \"stmia\\t%0, {%1, %H1}\"; + case 5: + return thumb_load_double_from_address (operands); + case 6: + operands[2] = gen_rtx_MEM (SImode, + plus_constant (XEXP (operands[0], 0), 4)); + output_asm_insn (\"str\\t%1, %0\;str\\t%H1, %2\", operands); + return \"\"; + case 7: + if (REGNO (operands[1]) == REGNO (operands[0]) + 1) + return \"mov\\t%0, %1\;mov\\t%H0, %H1\"; + return \"mov\\t%H0, %H1\;mov\\t%0, %1\"; + } + }" + [(set_attr "length" "4,4,6,2,2,6,4,4") + (set_attr "type" "*,*,*,load2,store2,load2,store2,*") + (set_attr "insn" "*,mov,*,*,*,*,*,mov") + (set_attr "pool_range" "*,*,*,*,*,1020,*,*")] +) + +(define_expand "movsi" + [(set (match_operand:SI 0 "general_operand" "") + (match_operand:SI 1 "general_operand" ""))] + "TARGET_EITHER" + " + { + rtx base, offset, tmp; + + if (TARGET_32BIT) + { + /* Everything except mem = const or mem = mem can be done easily. */ + if (GET_CODE (operands[0]) == MEM) + operands[1] = force_reg (SImode, operands[1]); + if (arm_general_register_operand (operands[0], SImode) + && GET_CODE (operands[1]) == CONST_INT + && !(const_ok_for_arm (INTVAL (operands[1])) + || const_ok_for_arm (~INTVAL (operands[1])))) + { + arm_split_constant (SET, SImode, NULL_RTX, + INTVAL (operands[1]), operands[0], NULL_RTX, + optimize && can_create_pseudo_p ()); + DONE; + } + + if (TARGET_USE_MOVT && !target_word_relocations + && GET_CODE (operands[1]) == SYMBOL_REF + && !flag_pic && !arm_tls_referenced_p (operands[1])) + { + arm_emit_movpair (operands[0], operands[1]); + DONE; + } + } + else /* TARGET_THUMB1... */ + { + if (can_create_pseudo_p ()) + { + if (GET_CODE (operands[0]) != REG) + operands[1] = force_reg (SImode, operands[1]); + } + } + + if (ARM_OFFSETS_MUST_BE_WITHIN_SECTIONS_P) + { + split_const (operands[1], &base, &offset); + if (GET_CODE (base) == SYMBOL_REF + && !offset_within_block_p (base, INTVAL (offset))) + { + tmp = can_create_pseudo_p () ? gen_reg_rtx (SImode) : operands[0]; + emit_move_insn (tmp, base); + emit_insn (gen_addsi3 (operands[0], tmp, offset)); + DONE; + } + } + + /* Recognize the case where operand[1] is a reference to thread-local + data and load its address to a register. */ + if (arm_tls_referenced_p (operands[1])) + { + rtx tmp = operands[1]; + rtx addend = NULL; + + if (GET_CODE (tmp) == CONST && GET_CODE (XEXP (tmp, 0)) == PLUS) + { + addend = XEXP (XEXP (tmp, 0), 1); + tmp = XEXP (XEXP (tmp, 0), 0); + } + + gcc_assert (GET_CODE (tmp) == SYMBOL_REF); + gcc_assert (SYMBOL_REF_TLS_MODEL (tmp) != 0); + + tmp = legitimize_tls_address (tmp, + !can_create_pseudo_p () ? operands[0] : 0); + if (addend) + { + tmp = gen_rtx_PLUS (SImode, tmp, addend); + tmp = force_operand (tmp, operands[0]); + } + operands[1] = tmp; + } + else if (flag_pic + && (CONSTANT_P (operands[1]) + || symbol_mentioned_p (operands[1]) + || label_mentioned_p (operands[1]))) + operands[1] = legitimize_pic_address (operands[1], SImode, + (!can_create_pseudo_p () + ? operands[0] + : 0)); + } + " +) + +;; The ARM LO_SUM and HIGH are backwards - HIGH sets the low bits, and +;; LO_SUM adds in the high bits. Fortunately these are opaque operations +;; so this does not matter. +(define_insn "*arm_movt" + [(set (match_operand:SI 0 "nonimmediate_operand" "=r") + (lo_sum:SI (match_operand:SI 1 "nonimmediate_operand" "0") + (match_operand:SI 2 "general_operand" "i")))] + "arm_arch_thumb2" + "movt%?\t%0, #:upper16:%c2" + [(set_attr "predicable" "yes") + (set_attr "length" "4")] +) + +(define_insn "*arm_movsi_insn" + [(set (match_operand:SI 0 "nonimmediate_operand" "=rk,r,r,r,rk,m") + (match_operand:SI 1 "general_operand" "rk, I,K,j,mi,rk"))] + "TARGET_ARM && ! TARGET_IWMMXT + && !(TARGET_HARD_FLOAT && TARGET_VFP) + && ( register_operand (operands[0], SImode) + || register_operand (operands[1], SImode))" + "@ + mov%?\\t%0, %1 + mov%?\\t%0, %1 + mvn%?\\t%0, #%B1 + movw%?\\t%0, %1 + ldr%?\\t%0, %1 + str%?\\t%1, %0" + [(set_attr "type" "*,*,*,*,load1,store1") + (set_attr "insn" "mov,mov,mvn,mov,*,*") + (set_attr "predicable" "yes") + (set_attr "pool_range" "*,*,*,*,4096,*") + (set_attr "neg_pool_range" "*,*,*,*,4084,*")] +) + +(define_split + [(set (match_operand:SI 0 "arm_general_register_operand" "") + (match_operand:SI 1 "const_int_operand" ""))] + "TARGET_32BIT + && (!(const_ok_for_arm (INTVAL (operands[1])) + || const_ok_for_arm (~INTVAL (operands[1]))))" + [(clobber (const_int 0))] + " + arm_split_constant (SET, SImode, NULL_RTX, + INTVAL (operands[1]), operands[0], NULL_RTX, 0); + DONE; + " +) + +(define_insn "*thumb1_movsi_insn" + [(set (match_operand:SI 0 "nonimmediate_operand" "=l,l,l,l,l,>,l, m,*l*h*k") + (match_operand:SI 1 "general_operand" "l, I,J,K,>,l,mi,l,*l*h*k"))] + "TARGET_THUMB1 + && ( register_operand (operands[0], SImode) + || register_operand (operands[1], SImode))" + "@ + mov %0, %1 + mov %0, %1 + # + # + ldmia\\t%1, {%0} + stmia\\t%0, {%1} + ldr\\t%0, %1 + str\\t%1, %0 + mov\\t%0, %1" + [(set_attr "length" "2,2,4,4,2,2,2,2,2") + (set_attr "type" "*,*,*,*,load1,store1,load1,store1,*") + (set_attr "pool_range" "*,*,*,*,*,*,1020,*,*") + (set_attr "conds" "set,clob,*,*,nocond,nocond,nocond,nocond,nocond")]) + +(define_split + [(set (match_operand:SI 0 "register_operand" "") + (match_operand:SI 1 "const_int_operand" ""))] + "TARGET_THUMB1 && satisfies_constraint_J (operands[1])" + [(set (match_dup 2) (match_dup 1)) + (set (match_dup 0) (neg:SI (match_dup 2)))] + " + { + operands[1] = GEN_INT (- INTVAL (operands[1])); + operands[2] = can_create_pseudo_p () ? gen_reg_rtx (SImode) : operands[0]; + }" +) + +(define_split + [(set (match_operand:SI 0 "register_operand" "") + (match_operand:SI 1 "const_int_operand" ""))] + "TARGET_THUMB1 && satisfies_constraint_K (operands[1])" + [(set (match_dup 2) (match_dup 1)) + (set (match_dup 0) (ashift:SI (match_dup 2) (match_dup 3)))] + " + { + unsigned HOST_WIDE_INT val = INTVAL (operands[1]) & 0xffffffffu; + unsigned HOST_WIDE_INT mask = 0xff; + int i; + + for (i = 0; i < 25; i++) + if ((val & (mask << i)) == val) + break; + + /* Don't split if the shift is zero. */ + if (i == 0) + FAIL; + + operands[1] = GEN_INT (val >> i); + operands[2] = can_create_pseudo_p () ? gen_reg_rtx (SImode) : operands[0]; + operands[3] = GEN_INT (i); + }" +) + +;; When generating pic, we need to load the symbol offset into a register. +;; So that the optimizer does not confuse this with a normal symbol load +;; we use an unspec. The offset will be loaded from a constant pool entry, +;; since that is the only type of relocation we can use. + +;; Wrap calculation of the whole PIC address in a single pattern for the +;; benefit of optimizers, particularly, PRE and HOIST. Calculation of +;; a PIC address involves two loads from memory, so we want to CSE it +;; as often as possible. +;; This pattern will be split into one of the pic_load_addr_* patterns +;; and a move after GCSE optimizations. +;; +;; Note: Update arm.c: legitimize_pic_address() when changing this pattern. +(define_expand "calculate_pic_address" + [(set (match_operand:SI 0 "register_operand" "") + (mem:SI (plus:SI (match_operand:SI 1 "register_operand" "") + (unspec:SI [(match_operand:SI 2 "" "")] + UNSPEC_PIC_SYM))))] + "flag_pic" +) + +;; Split calculate_pic_address into pic_load_addr_* and a move. +(define_split + [(set (match_operand:SI 0 "register_operand" "") + (mem:SI (plus:SI (match_operand:SI 1 "register_operand" "") + (unspec:SI [(match_operand:SI 2 "" "")] + UNSPEC_PIC_SYM))))] + "flag_pic" + [(set (match_dup 3) (unspec:SI [(match_dup 2)] UNSPEC_PIC_SYM)) + (set (match_dup 0) (mem:SI (plus:SI (match_dup 1) (match_dup 3))))] + "operands[3] = can_create_pseudo_p () ? gen_reg_rtx (SImode) : operands[0];" +) + +;; operand1 is the memory address to go into +;; pic_load_addr_32bit. +;; operand2 is the PIC label to be emitted +;; from pic_add_dot_plus_eight. +;; We do this to allow hoisting of the entire insn. +(define_insn_and_split "pic_load_addr_unified" + [(set (match_operand:SI 0 "s_register_operand" "=r,r,l") + (unspec:SI [(match_operand:SI 1 "" "mX,mX,mX") + (match_operand:SI 2 "" "")] + UNSPEC_PIC_UNIFIED))] + "flag_pic" + "#" + "&& reload_completed" + [(set (match_dup 0) (unspec:SI [(match_dup 1)] UNSPEC_PIC_SYM)) + (set (match_dup 0) (unspec:SI [(match_dup 0) (match_dup 3) + (match_dup 2)] UNSPEC_PIC_BASE))] + "operands[3] = TARGET_THUMB ? GEN_INT (4) : GEN_INT (8);" + [(set_attr "type" "load1,load1,load1") + (set_attr "pool_range" "4096,4096,1024") + (set_attr "neg_pool_range" "4084,0,0") + (set_attr "arch" "a,t2,t1") + (set_attr "length" "8,6,4")] +) + +;; The rather odd constraints on the following are to force reload to leave +;; the insn alone, and to force the minipool generation pass to then move +;; the GOT symbol to memory. + +(define_insn "pic_load_addr_32bit" + [(set (match_operand:SI 0 "s_register_operand" "=r") + (unspec:SI [(match_operand:SI 1 "" "mX")] UNSPEC_PIC_SYM))] + "TARGET_32BIT && flag_pic" + "ldr%?\\t%0, %1" + [(set_attr "type" "load1") + (set_attr "pool_range" "4096") + (set (attr "neg_pool_range") + (if_then_else (eq_attr "is_thumb" "no") + (const_int 4084) + (const_int 0)))] +) + +(define_insn "pic_load_addr_thumb1" + [(set (match_operand:SI 0 "s_register_operand" "=l") + (unspec:SI [(match_operand:SI 1 "" "mX")] UNSPEC_PIC_SYM))] + "TARGET_THUMB1 && flag_pic" + "ldr\\t%0, %1" + [(set_attr "type" "load1") + (set (attr "pool_range") (const_int 1024))] +) + +(define_insn "pic_add_dot_plus_four" + [(set (match_operand:SI 0 "register_operand" "=r") + (unspec:SI [(match_operand:SI 1 "register_operand" "0") + (const_int 4) + (match_operand 2 "" "")] + UNSPEC_PIC_BASE))] + "TARGET_THUMB" + "* + (*targetm.asm_out.internal_label) (asm_out_file, \"LPIC\", + INTVAL (operands[2])); + return \"add\\t%0, %|pc\"; + " + [(set_attr "length" "2")] +) + +(define_insn "pic_add_dot_plus_eight" + [(set (match_operand:SI 0 "register_operand" "=r") + (unspec:SI [(match_operand:SI 1 "register_operand" "r") + (const_int 8) + (match_operand 2 "" "")] + UNSPEC_PIC_BASE))] + "TARGET_ARM" + "* + (*targetm.asm_out.internal_label) (asm_out_file, \"LPIC\", + INTVAL (operands[2])); + return \"add%?\\t%0, %|pc, %1\"; + " + [(set_attr "predicable" "yes")] +) + +(define_insn "tls_load_dot_plus_eight" + [(set (match_operand:SI 0 "register_operand" "=r") + (mem:SI (unspec:SI [(match_operand:SI 1 "register_operand" "r") + (const_int 8) + (match_operand 2 "" "")] + UNSPEC_PIC_BASE)))] + "TARGET_ARM" + "* + (*targetm.asm_out.internal_label) (asm_out_file, \"LPIC\", + INTVAL (operands[2])); + return \"ldr%?\\t%0, [%|pc, %1]\t\t@ tls_load_dot_plus_eight\"; + " + [(set_attr "predicable" "yes")] +) + +;; PIC references to local variables can generate pic_add_dot_plus_eight +;; followed by a load. These sequences can be crunched down to +;; tls_load_dot_plus_eight by a peephole. + +(define_peephole2 + [(set (match_operand:SI 0 "register_operand" "") + (unspec:SI [(match_operand:SI 3 "register_operand" "") + (const_int 8) + (match_operand 1 "" "")] + UNSPEC_PIC_BASE)) + (set (match_operand:SI 2 "arm_general_register_operand" "") + (mem:SI (match_dup 0)))] + "TARGET_ARM && peep2_reg_dead_p (2, operands[0])" + [(set (match_dup 2) + (mem:SI (unspec:SI [(match_dup 3) + (const_int 8) + (match_dup 1)] + UNSPEC_PIC_BASE)))] + "" +) + +(define_insn "pic_offset_arm" + [(set (match_operand:SI 0 "register_operand" "=r") + (mem:SI (plus:SI (match_operand:SI 1 "register_operand" "r") + (unspec:SI [(match_operand:SI 2 "" "X")] + UNSPEC_PIC_OFFSET))))] + "TARGET_VXWORKS_RTP && TARGET_ARM && flag_pic" + "ldr%?\\t%0, [%1,%2]" + [(set_attr "type" "load1")] +) + +(define_expand "builtin_setjmp_receiver" + [(label_ref (match_operand 0 "" ""))] + "flag_pic" + " +{ + /* r3 is clobbered by set/longjmp, so we can use it as a scratch + register. */ + if (arm_pic_register != INVALID_REGNUM) + arm_load_pic_register (1UL << 3); + DONE; +}") + +;; If copying one reg to another we can set the condition codes according to +;; its value. Such a move is common after a return from subroutine and the +;; result is being tested against zero. + +(define_insn "*movsi_compare0" + [(set (reg:CC CC_REGNUM) + (compare:CC (match_operand:SI 1 "s_register_operand" "0,r") + (const_int 0))) + (set (match_operand:SI 0 "s_register_operand" "=r,r") + (match_dup 1))] + "TARGET_32BIT" + "@ + cmp%?\\t%0, #0 + sub%.\\t%0, %1, #0" + [(set_attr "conds" "set")] +) + +;; Subroutine to store a half word from a register into memory. +;; Operand 0 is the source register (HImode) +;; Operand 1 is the destination address in a register (SImode) + +;; In both this routine and the next, we must be careful not to spill +;; a memory address of reg+large_const into a separate PLUS insn, since this +;; can generate unrecognizable rtl. + +(define_expand "storehi" + [;; store the low byte + (set (match_operand 1 "" "") (match_dup 3)) + ;; extract the high byte + (set (match_dup 2) + (ashiftrt:SI (match_operand 0 "" "") (const_int 8))) + ;; store the high byte + (set (match_dup 4) (match_dup 5))] + "TARGET_ARM" + " + { + rtx op1 = operands[1]; + rtx addr = XEXP (op1, 0); + enum rtx_code code = GET_CODE (addr); + + if ((code == PLUS && GET_CODE (XEXP (addr, 1)) != CONST_INT) + || code == MINUS) + op1 = replace_equiv_address (operands[1], force_reg (SImode, addr)); + + operands[4] = adjust_address (op1, QImode, 1); + operands[1] = adjust_address (operands[1], QImode, 0); + operands[3] = gen_lowpart (QImode, operands[0]); + operands[0] = gen_lowpart (SImode, operands[0]); + operands[2] = gen_reg_rtx (SImode); + operands[5] = gen_lowpart (QImode, operands[2]); + }" +) + +(define_expand "storehi_bigend" + [(set (match_dup 4) (match_dup 3)) + (set (match_dup 2) + (ashiftrt:SI (match_operand 0 "" "") (const_int 8))) + (set (match_operand 1 "" "") (match_dup 5))] + "TARGET_ARM" + " + { + rtx op1 = operands[1]; + rtx addr = XEXP (op1, 0); + enum rtx_code code = GET_CODE (addr); + + if ((code == PLUS && GET_CODE (XEXP (addr, 1)) != CONST_INT) + || code == MINUS) + op1 = replace_equiv_address (op1, force_reg (SImode, addr)); + + operands[4] = adjust_address (op1, QImode, 1); + operands[1] = adjust_address (operands[1], QImode, 0); + operands[3] = gen_lowpart (QImode, operands[0]); + operands[0] = gen_lowpart (SImode, operands[0]); + operands[2] = gen_reg_rtx (SImode); + operands[5] = gen_lowpart (QImode, operands[2]); + }" +) + +;; Subroutine to store a half word integer constant into memory. +(define_expand "storeinthi" + [(set (match_operand 0 "" "") + (match_operand 1 "" "")) + (set (match_dup 3) (match_dup 2))] + "TARGET_ARM" + " + { + HOST_WIDE_INT value = INTVAL (operands[1]); + rtx addr = XEXP (operands[0], 0); + rtx op0 = operands[0]; + enum rtx_code code = GET_CODE (addr); + + if ((code == PLUS && GET_CODE (XEXP (addr, 1)) != CONST_INT) + || code == MINUS) + op0 = replace_equiv_address (op0, force_reg (SImode, addr)); + + operands[1] = gen_reg_rtx (SImode); + if (BYTES_BIG_ENDIAN) + { + emit_insn (gen_movsi (operands[1], GEN_INT ((value >> 8) & 255))); + if ((value & 255) == ((value >> 8) & 255)) + operands[2] = operands[1]; + else + { + operands[2] = gen_reg_rtx (SImode); + emit_insn (gen_movsi (operands[2], GEN_INT (value & 255))); + } + } + else + { + emit_insn (gen_movsi (operands[1], GEN_INT (value & 255))); + if ((value & 255) == ((value >> 8) & 255)) + operands[2] = operands[1]; + else + { + operands[2] = gen_reg_rtx (SImode); + emit_insn (gen_movsi (operands[2], GEN_INT ((value >> 8) & 255))); + } + } + + operands[3] = adjust_address (op0, QImode, 1); + operands[0] = adjust_address (operands[0], QImode, 0); + operands[2] = gen_lowpart (QImode, operands[2]); + operands[1] = gen_lowpart (QImode, operands[1]); + }" +) + +(define_expand "storehi_single_op" + [(set (match_operand:HI 0 "memory_operand" "") + (match_operand:HI 1 "general_operand" ""))] + "TARGET_32BIT && arm_arch4" + " + if (!s_register_operand (operands[1], HImode)) + operands[1] = copy_to_mode_reg (HImode, operands[1]); + " +) + +(define_expand "movhi" + [(set (match_operand:HI 0 "general_operand" "") + (match_operand:HI 1 "general_operand" ""))] + "TARGET_EITHER" + " + if (TARGET_ARM) + { + if (can_create_pseudo_p ()) + { + if (GET_CODE (operands[0]) == MEM) + { + if (arm_arch4) + { + emit_insn (gen_storehi_single_op (operands[0], operands[1])); + DONE; + } + if (GET_CODE (operands[1]) == CONST_INT) + emit_insn (gen_storeinthi (operands[0], operands[1])); + else + { + if (GET_CODE (operands[1]) == MEM) + operands[1] = force_reg (HImode, operands[1]); + if (BYTES_BIG_ENDIAN) + emit_insn (gen_storehi_bigend (operands[1], operands[0])); + else + emit_insn (gen_storehi (operands[1], operands[0])); + } + DONE; + } + /* Sign extend a constant, and keep it in an SImode reg. */ + else if (GET_CODE (operands[1]) == CONST_INT) + { + rtx reg = gen_reg_rtx (SImode); + HOST_WIDE_INT val = INTVAL (operands[1]) & 0xffff; + + /* If the constant is already valid, leave it alone. */ + if (!const_ok_for_arm (val)) + { + /* If setting all the top bits will make the constant + loadable in a single instruction, then set them. + Otherwise, sign extend the number. */ + + if (const_ok_for_arm (~(val | ~0xffff))) + val |= ~0xffff; + else if (val & 0x8000) + val |= ~0xffff; + } + + emit_insn (gen_movsi (reg, GEN_INT (val))); + operands[1] = gen_lowpart (HImode, reg); + } + else if (arm_arch4 && optimize && can_create_pseudo_p () + && GET_CODE (operands[1]) == MEM) + { + rtx reg = gen_reg_rtx (SImode); + + emit_insn (gen_zero_extendhisi2 (reg, operands[1])); + operands[1] = gen_lowpart (HImode, reg); + } + else if (!arm_arch4) + { + if (GET_CODE (operands[1]) == MEM) + { + rtx base; + rtx offset = const0_rtx; + rtx reg = gen_reg_rtx (SImode); + + if ((GET_CODE (base = XEXP (operands[1], 0)) == REG + || (GET_CODE (base) == PLUS + && (GET_CODE (offset = XEXP (base, 1)) + == CONST_INT) + && ((INTVAL(offset) & 1) != 1) + && GET_CODE (base = XEXP (base, 0)) == REG)) + && REGNO_POINTER_ALIGN (REGNO (base)) >= 32) + { + rtx new_rtx; + + new_rtx = widen_memory_access (operands[1], SImode, + ((INTVAL (offset) & ~3) + - INTVAL (offset))); + emit_insn (gen_movsi (reg, new_rtx)); + if (((INTVAL (offset) & 2) != 0) + ^ (BYTES_BIG_ENDIAN ? 1 : 0)) + { + rtx reg2 = gen_reg_rtx (SImode); + + emit_insn (gen_lshrsi3 (reg2, reg, GEN_INT (16))); + reg = reg2; + } + } + else + emit_insn (gen_movhi_bytes (reg, operands[1])); + + operands[1] = gen_lowpart (HImode, reg); + } + } + } + /* Handle loading a large integer during reload. */ + else if (GET_CODE (operands[1]) == CONST_INT + && !const_ok_for_arm (INTVAL (operands[1])) + && !const_ok_for_arm (~INTVAL (operands[1]))) + { + /* Writing a constant to memory needs a scratch, which should + be handled with SECONDARY_RELOADs. */ + gcc_assert (GET_CODE (operands[0]) == REG); + + operands[0] = gen_rtx_SUBREG (SImode, operands[0], 0); + emit_insn (gen_movsi (operands[0], operands[1])); + DONE; + } + } + else if (TARGET_THUMB2) + { + /* Thumb-2 can do everything except mem=mem and mem=const easily. */ + if (can_create_pseudo_p ()) + { + if (GET_CODE (operands[0]) != REG) + operands[1] = force_reg (HImode, operands[1]); + /* Zero extend a constant, and keep it in an SImode reg. */ + else if (GET_CODE (operands[1]) == CONST_INT) + { + rtx reg = gen_reg_rtx (SImode); + HOST_WIDE_INT val = INTVAL (operands[1]) & 0xffff; + + emit_insn (gen_movsi (reg, GEN_INT (val))); + operands[1] = gen_lowpart (HImode, reg); + } + } + } + else /* TARGET_THUMB1 */ + { + if (can_create_pseudo_p ()) + { + if (GET_CODE (operands[1]) == CONST_INT) + { + rtx reg = gen_reg_rtx (SImode); + + emit_insn (gen_movsi (reg, operands[1])); + operands[1] = gen_lowpart (HImode, reg); + } + + /* ??? We shouldn't really get invalid addresses here, but this can + happen if we are passed a SP (never OK for HImode/QImode) or + virtual register (also rejected as illegitimate for HImode/QImode) + relative address. */ + /* ??? This should perhaps be fixed elsewhere, for instance, in + fixup_stack_1, by checking for other kinds of invalid addresses, + e.g. a bare reference to a virtual register. This may confuse the + alpha though, which must handle this case differently. */ + if (GET_CODE (operands[0]) == MEM + && !memory_address_p (GET_MODE (operands[0]), + XEXP (operands[0], 0))) + operands[0] + = replace_equiv_address (operands[0], + copy_to_reg (XEXP (operands[0], 0))); + + if (GET_CODE (operands[1]) == MEM + && !memory_address_p (GET_MODE (operands[1]), + XEXP (operands[1], 0))) + operands[1] + = replace_equiv_address (operands[1], + copy_to_reg (XEXP (operands[1], 0))); + + if (GET_CODE (operands[1]) == MEM && optimize > 0) + { + rtx reg = gen_reg_rtx (SImode); + + emit_insn (gen_zero_extendhisi2 (reg, operands[1])); + operands[1] = gen_lowpart (HImode, reg); + } + + if (GET_CODE (operands[0]) == MEM) + operands[1] = force_reg (HImode, operands[1]); + } + else if (GET_CODE (operands[1]) == CONST_INT + && !satisfies_constraint_I (operands[1])) + { + /* Handle loading a large integer during reload. */ + + /* Writing a constant to memory needs a scratch, which should + be handled with SECONDARY_RELOADs. */ + gcc_assert (GET_CODE (operands[0]) == REG); + + operands[0] = gen_rtx_SUBREG (SImode, operands[0], 0); + emit_insn (gen_movsi (operands[0], operands[1])); + DONE; + } + } + " +) + +(define_insn "*thumb1_movhi_insn" + [(set (match_operand:HI 0 "nonimmediate_operand" "=l,l,m,*r,*h,l") + (match_operand:HI 1 "general_operand" "l,m,l,*h,*r,I"))] + "TARGET_THUMB1 + && ( register_operand (operands[0], HImode) + || register_operand (operands[1], HImode))" + "* + switch (which_alternative) + { + case 0: return \"add %0, %1, #0\"; + case 2: return \"strh %1, %0\"; + case 3: return \"mov %0, %1\"; + case 4: return \"mov %0, %1\"; + case 5: return \"mov %0, %1\"; + default: gcc_unreachable (); + case 1: + /* The stack pointer can end up being taken as an index register. + Catch this case here and deal with it. */ + if (GET_CODE (XEXP (operands[1], 0)) == PLUS + && GET_CODE (XEXP (XEXP (operands[1], 0), 0)) == REG + && REGNO (XEXP (XEXP (operands[1], 0), 0)) == SP_REGNUM) + { + rtx ops[2]; + ops[0] = operands[0]; + ops[1] = XEXP (XEXP (operands[1], 0), 0); + + output_asm_insn (\"mov %0, %1\", ops); + + XEXP (XEXP (operands[1], 0), 0) = operands[0]; + + } + return \"ldrh %0, %1\"; + }" + [(set_attr "length" "2,4,2,2,2,2") + (set_attr "type" "*,load1,store1,*,*,*") + (set_attr "conds" "clob,nocond,nocond,nocond,nocond,clob")]) + + +(define_expand "movhi_bytes" + [(set (match_dup 2) (zero_extend:SI (match_operand:HI 1 "" ""))) + (set (match_dup 3) + (zero_extend:SI (match_dup 6))) + (set (match_operand:SI 0 "" "") + (ior:SI (ashift:SI (match_dup 4) (const_int 8)) (match_dup 5)))] + "TARGET_ARM" + " + { + rtx mem1, mem2; + rtx addr = copy_to_mode_reg (SImode, XEXP (operands[1], 0)); + + mem1 = change_address (operands[1], QImode, addr); + mem2 = change_address (operands[1], QImode, plus_constant (addr, 1)); + operands[0] = gen_lowpart (SImode, operands[0]); + operands[1] = mem1; + operands[2] = gen_reg_rtx (SImode); + operands[3] = gen_reg_rtx (SImode); + operands[6] = mem2; + + if (BYTES_BIG_ENDIAN) + { + operands[4] = operands[2]; + operands[5] = operands[3]; + } + else + { + operands[4] = operands[3]; + operands[5] = operands[2]; + } + }" +) + +(define_expand "movhi_bigend" + [(set (match_dup 2) + (rotate:SI (subreg:SI (match_operand:HI 1 "memory_operand" "") 0) + (const_int 16))) + (set (match_dup 3) + (ashiftrt:SI (match_dup 2) (const_int 16))) + (set (match_operand:HI 0 "s_register_operand" "") + (match_dup 4))] + "TARGET_ARM" + " + operands[2] = gen_reg_rtx (SImode); + operands[3] = gen_reg_rtx (SImode); + operands[4] = gen_lowpart (HImode, operands[3]); + " +) + +;; Pattern to recognize insn generated default case above +(define_insn "*movhi_insn_arch4" + [(set (match_operand:HI 0 "nonimmediate_operand" "=r,r,m,r") + (match_operand:HI 1 "general_operand" "rI,K,r,mi"))] + "TARGET_ARM + && arm_arch4 + && (register_operand (operands[0], HImode) + || register_operand (operands[1], HImode))" + "@ + mov%?\\t%0, %1\\t%@ movhi + mvn%?\\t%0, #%B1\\t%@ movhi + str%(h%)\\t%1, %0\\t%@ movhi + ldr%(h%)\\t%0, %1\\t%@ movhi" + [(set_attr "type" "*,*,store1,load1") + (set_attr "predicable" "yes") + (set_attr "insn" "mov,mvn,*,*") + (set_attr "pool_range" "*,*,*,256") + (set_attr "neg_pool_range" "*,*,*,244")] +) + +(define_insn "*movhi_bytes" + [(set (match_operand:HI 0 "s_register_operand" "=r,r") + (match_operand:HI 1 "arm_rhs_operand" "rI,K"))] + "TARGET_ARM" + "@ + mov%?\\t%0, %1\\t%@ movhi + mvn%?\\t%0, #%B1\\t%@ movhi" + [(set_attr "predicable" "yes") + (set_attr "insn" "mov,mvn")] +) + +(define_expand "thumb_movhi_clobber" + [(set (match_operand:HI 0 "memory_operand" "") + (match_operand:HI 1 "register_operand" "")) + (clobber (match_operand:DI 2 "register_operand" ""))] + "TARGET_THUMB1" + " + if (strict_memory_address_p (HImode, XEXP (operands[0], 0)) + && REGNO (operands[1]) <= LAST_LO_REGNUM) + { + emit_insn (gen_movhi (operands[0], operands[1])); + DONE; + } + /* XXX Fixme, need to handle other cases here as well. */ + gcc_unreachable (); + " +) + +;; We use a DImode scratch because we may occasionally need an additional +;; temporary if the address isn't offsettable -- push_reload doesn't seem +;; to take any notice of the "o" constraints on reload_memory_operand operand. +(define_expand "reload_outhi" + [(parallel [(match_operand:HI 0 "arm_reload_memory_operand" "=o") + (match_operand:HI 1 "s_register_operand" "r") + (match_operand:DI 2 "s_register_operand" "=&l")])] + "TARGET_EITHER" + "if (TARGET_ARM) + arm_reload_out_hi (operands); + else + thumb_reload_out_hi (operands); + DONE; + " +) + +(define_expand "reload_inhi" + [(parallel [(match_operand:HI 0 "s_register_operand" "=r") + (match_operand:HI 1 "arm_reload_memory_operand" "o") + (match_operand:DI 2 "s_register_operand" "=&r")])] + "TARGET_EITHER" + " + if (TARGET_ARM) + arm_reload_in_hi (operands); + else + thumb_reload_out_hi (operands); + DONE; +") + +(define_expand "movqi" + [(set (match_operand:QI 0 "general_operand" "") + (match_operand:QI 1 "general_operand" ""))] + "TARGET_EITHER" + " + /* Everything except mem = const or mem = mem can be done easily */ + + if (can_create_pseudo_p ()) + { + if (GET_CODE (operands[1]) == CONST_INT) + { + rtx reg = gen_reg_rtx (SImode); + + /* For thumb we want an unsigned immediate, then we are more likely + to be able to use a movs insn. */ + if (TARGET_THUMB) + operands[1] = GEN_INT (INTVAL (operands[1]) & 255); + + emit_insn (gen_movsi (reg, operands[1])); + operands[1] = gen_lowpart (QImode, reg); + } + + if (TARGET_THUMB) + { + /* ??? We shouldn't really get invalid addresses here, but this can + happen if we are passed a SP (never OK for HImode/QImode) or + virtual register (also rejected as illegitimate for HImode/QImode) + relative address. */ + /* ??? This should perhaps be fixed elsewhere, for instance, in + fixup_stack_1, by checking for other kinds of invalid addresses, + e.g. a bare reference to a virtual register. This may confuse the + alpha though, which must handle this case differently. */ + if (GET_CODE (operands[0]) == MEM + && !memory_address_p (GET_MODE (operands[0]), + XEXP (operands[0], 0))) + operands[0] + = replace_equiv_address (operands[0], + copy_to_reg (XEXP (operands[0], 0))); + if (GET_CODE (operands[1]) == MEM + && !memory_address_p (GET_MODE (operands[1]), + XEXP (operands[1], 0))) + operands[1] + = replace_equiv_address (operands[1], + copy_to_reg (XEXP (operands[1], 0))); + } + + if (GET_CODE (operands[1]) == MEM && optimize > 0) + { + rtx reg = gen_reg_rtx (SImode); + + emit_insn (gen_zero_extendqisi2 (reg, operands[1])); + operands[1] = gen_lowpart (QImode, reg); + } + + if (GET_CODE (operands[0]) == MEM) + operands[1] = force_reg (QImode, operands[1]); + } + else if (TARGET_THUMB + && GET_CODE (operands[1]) == CONST_INT + && !satisfies_constraint_I (operands[1])) + { + /* Handle loading a large integer during reload. */ + + /* Writing a constant to memory needs a scratch, which should + be handled with SECONDARY_RELOADs. */ + gcc_assert (GET_CODE (operands[0]) == REG); + + operands[0] = gen_rtx_SUBREG (SImode, operands[0], 0); + emit_insn (gen_movsi (operands[0], operands[1])); + DONE; + } + " +) + + +(define_insn "*arm_movqi_insn" + [(set (match_operand:QI 0 "nonimmediate_operand" "=r,r,r,m") + (match_operand:QI 1 "general_operand" "rI,K,m,r"))] + "TARGET_32BIT + && ( register_operand (operands[0], QImode) + || register_operand (operands[1], QImode))" + "@ + mov%?\\t%0, %1 + mvn%?\\t%0, #%B1 + ldr%(b%)\\t%0, %1 + str%(b%)\\t%1, %0" + [(set_attr "type" "*,*,load1,store1") + (set_attr "insn" "mov,mvn,*,*") + (set_attr "predicable" "yes")] +) + +(define_insn "*thumb1_movqi_insn" + [(set (match_operand:QI 0 "nonimmediate_operand" "=l,l,m,*r,*h,l") + (match_operand:QI 1 "general_operand" "l, m,l,*h,*r,I"))] + "TARGET_THUMB1 + && ( register_operand (operands[0], QImode) + || register_operand (operands[1], QImode))" + "@ + add\\t%0, %1, #0 + ldrb\\t%0, %1 + strb\\t%1, %0 + mov\\t%0, %1 + mov\\t%0, %1 + mov\\t%0, %1" + [(set_attr "length" "2") + (set_attr "type" "*,load1,store1,*,*,*") + (set_attr "insn" "*,*,*,mov,mov,mov") + (set_attr "pool_range" "*,32,*,*,*,*") + (set_attr "conds" "clob,nocond,nocond,nocond,nocond,clob")]) + +;; HFmode moves +(define_expand "movhf" + [(set (match_operand:HF 0 "general_operand" "") + (match_operand:HF 1 "general_operand" ""))] + "TARGET_EITHER" + " + if (TARGET_32BIT) + { + if (GET_CODE (operands[0]) == MEM) + operands[1] = force_reg (HFmode, operands[1]); + } + else /* TARGET_THUMB1 */ + { + if (can_create_pseudo_p ()) + { + if (GET_CODE (operands[0]) != REG) + operands[1] = force_reg (HFmode, operands[1]); + } + } + " +) + +(define_insn "*arm32_movhf" + [(set (match_operand:HF 0 "nonimmediate_operand" "=r,m,r,r") + (match_operand:HF 1 "general_operand" " m,r,r,F"))] + "TARGET_32BIT && !(TARGET_HARD_FLOAT && TARGET_FP16) + && ( s_register_operand (operands[0], HFmode) + || s_register_operand (operands[1], HFmode))" + "* + switch (which_alternative) + { + case 0: /* ARM register from memory */ + return \"ldr%(h%)\\t%0, %1\\t%@ __fp16\"; + case 1: /* memory from ARM register */ + return \"str%(h%)\\t%1, %0\\t%@ __fp16\"; + case 2: /* ARM register from ARM register */ + return \"mov%?\\t%0, %1\\t%@ __fp16\"; + case 3: /* ARM register from constant */ + { + REAL_VALUE_TYPE r; + long bits; + rtx ops[4]; + + REAL_VALUE_FROM_CONST_DOUBLE (r, operands[1]); + bits = real_to_target (NULL, &r, HFmode); + ops[0] = operands[0]; + ops[1] = GEN_INT (bits); + ops[2] = GEN_INT (bits & 0xff00); + ops[3] = GEN_INT (bits & 0x00ff); + + if (arm_arch_thumb2) + output_asm_insn (\"movw%?\\t%0, %1\", ops); + else + output_asm_insn (\"mov%?\\t%0, %2\;orr%?\\t%0, %0, %3\", ops); + return \"\"; + } + default: + gcc_unreachable (); + } + " + [(set_attr "conds" "unconditional") + (set_attr "type" "load1,store1,*,*") + (set_attr "insn" "*,*,mov,mov") + (set_attr "length" "4,4,4,8") + (set_attr "predicable" "yes")] +) + +(define_insn "*thumb1_movhf" + [(set (match_operand:HF 0 "nonimmediate_operand" "=l,l,m,*r,*h") + (match_operand:HF 1 "general_operand" "l,mF,l,*h,*r"))] + "TARGET_THUMB1 + && ( s_register_operand (operands[0], HFmode) + || s_register_operand (operands[1], HFmode))" + "* + switch (which_alternative) + { + case 1: + { + rtx addr; + gcc_assert (GET_CODE(operands[1]) == MEM); + addr = XEXP (operands[1], 0); + if (GET_CODE (addr) == LABEL_REF + || (GET_CODE (addr) == CONST + && GET_CODE (XEXP (addr, 0)) == PLUS + && GET_CODE (XEXP (XEXP (addr, 0), 0)) == LABEL_REF + && GET_CODE (XEXP (XEXP (addr, 0), 1)) == CONST_INT)) + { + /* Constant pool entry. */ + return \"ldr\\t%0, %1\"; + } + return \"ldrh\\t%0, %1\"; + } + case 2: return \"strh\\t%1, %0\"; + default: return \"mov\\t%0, %1\"; + } + " + [(set_attr "length" "2") + (set_attr "type" "*,load1,store1,*,*") + (set_attr "insn" "mov,*,*,mov,mov") + (set_attr "pool_range" "*,1020,*,*,*") + (set_attr "conds" "clob,nocond,nocond,nocond,nocond")]) + +(define_expand "movsf" + [(set (match_operand:SF 0 "general_operand" "") + (match_operand:SF 1 "general_operand" ""))] + "TARGET_EITHER" + " + if (TARGET_32BIT) + { + if (GET_CODE (operands[0]) == MEM) + operands[1] = force_reg (SFmode, operands[1]); + } + else /* TARGET_THUMB1 */ + { + if (can_create_pseudo_p ()) + { + if (GET_CODE (operands[0]) != REG) + operands[1] = force_reg (SFmode, operands[1]); + } + } + " +) + +;; Transform a floating-point move of a constant into a core register into +;; an SImode operation. +(define_split + [(set (match_operand:SF 0 "arm_general_register_operand" "") + (match_operand:SF 1 "immediate_operand" ""))] + "TARGET_EITHER + && reload_completed + && GET_CODE (operands[1]) == CONST_DOUBLE" + [(set (match_dup 2) (match_dup 3))] + " + operands[2] = gen_lowpart (SImode, operands[0]); + operands[3] = gen_lowpart (SImode, operands[1]); + if (operands[2] == 0 || operands[3] == 0) + FAIL; + " +) + +(define_insn "*arm_movsf_soft_insn" + [(set (match_operand:SF 0 "nonimmediate_operand" "=r,r,m") + (match_operand:SF 1 "general_operand" "r,mE,r"))] + "TARGET_32BIT + && TARGET_SOFT_FLOAT + && (GET_CODE (operands[0]) != MEM + || register_operand (operands[1], SFmode))" + "@ + mov%?\\t%0, %1 + ldr%?\\t%0, %1\\t%@ float + str%?\\t%1, %0\\t%@ float" + [(set_attr "predicable" "yes") + (set_attr "type" "*,load1,store1") + (set_attr "insn" "mov,*,*") + (set_attr "pool_range" "*,4096,*") + (set_attr "arm_neg_pool_range" "*,4084,*") + (set_attr "thumb2_neg_pool_range" "*,0,*")] +) + +;;; ??? This should have alternatives for constants. +(define_insn "*thumb1_movsf_insn" + [(set (match_operand:SF 0 "nonimmediate_operand" "=l,l,>,l, m,*r,*h") + (match_operand:SF 1 "general_operand" "l, >,l,mF,l,*h,*r"))] + "TARGET_THUMB1 + && ( register_operand (operands[0], SFmode) + || register_operand (operands[1], SFmode))" + "@ + add\\t%0, %1, #0 + ldmia\\t%1, {%0} + stmia\\t%0, {%1} + ldr\\t%0, %1 + str\\t%1, %0 + mov\\t%0, %1 + mov\\t%0, %1" + [(set_attr "length" "2") + (set_attr "type" "*,load1,store1,load1,store1,*,*") + (set_attr "pool_range" "*,*,*,1020,*,*,*") + (set_attr "insn" "*,*,*,*,*,mov,mov") + (set_attr "conds" "clob,nocond,nocond,nocond,nocond,nocond,nocond")] +) + +(define_expand "movdf" + [(set (match_operand:DF 0 "general_operand" "") + (match_operand:DF 1 "general_operand" ""))] + "TARGET_EITHER" + " + if (TARGET_32BIT) + { + if (GET_CODE (operands[0]) == MEM) + operands[1] = force_reg (DFmode, operands[1]); + } + else /* TARGET_THUMB */ + { + if (can_create_pseudo_p ()) + { + if (GET_CODE (operands[0]) != REG) + operands[1] = force_reg (DFmode, operands[1]); + } + } + " +) + +;; Reloading a df mode value stored in integer regs to memory can require a +;; scratch reg. +(define_expand "reload_outdf" + [(match_operand:DF 0 "arm_reload_memory_operand" "=o") + (match_operand:DF 1 "s_register_operand" "r") + (match_operand:SI 2 "s_register_operand" "=&r")] + "TARGET_32BIT" + " + { + enum rtx_code code = GET_CODE (XEXP (operands[0], 0)); + + if (code == REG) + operands[2] = XEXP (operands[0], 0); + else if (code == POST_INC || code == PRE_DEC) + { + operands[0] = gen_rtx_SUBREG (DImode, operands[0], 0); + operands[1] = gen_rtx_SUBREG (DImode, operands[1], 0); + emit_insn (gen_movdi (operands[0], operands[1])); + DONE; + } + else if (code == PRE_INC) + { + rtx reg = XEXP (XEXP (operands[0], 0), 0); + + emit_insn (gen_addsi3 (reg, reg, GEN_INT (8))); + operands[2] = reg; + } + else if (code == POST_DEC) + operands[2] = XEXP (XEXP (operands[0], 0), 0); + else + emit_insn (gen_addsi3 (operands[2], XEXP (XEXP (operands[0], 0), 0), + XEXP (XEXP (operands[0], 0), 1))); + + emit_insn (gen_rtx_SET (VOIDmode, + replace_equiv_address (operands[0], operands[2]), + operands[1])); + + if (code == POST_DEC) + emit_insn (gen_addsi3 (operands[2], operands[2], GEN_INT (-8))); + + DONE; + }" +) + +(define_insn "*movdf_soft_insn" + [(set (match_operand:DF 0 "nonimmediate_soft_df_operand" "=r,r,r,r,m") + (match_operand:DF 1 "soft_df_operand" "rDa,Db,Dc,mF,r"))] + "TARGET_32BIT && TARGET_SOFT_FLOAT + && ( register_operand (operands[0], DFmode) + || register_operand (operands[1], DFmode))" + "* + switch (which_alternative) + { + case 0: + case 1: + case 2: + return \"#\"; + default: + return output_move_double (operands); + } + " + [(set_attr "length" "8,12,16,8,8") + (set_attr "type" "*,*,*,load2,store2") + (set_attr "pool_range" "*,*,*,1020,*") + (set_attr "arm_neg_pool_range" "*,*,*,1008,*") + (set_attr "thumb2_neg_pool_range" "*,*,*,0,*")] +) + +;;; ??? This should have alternatives for constants. +;;; ??? This was originally identical to the movdi_insn pattern. +;;; ??? The 'F' constraint looks funny, but it should always be replaced by +;;; thumb_reorg with a memory reference. +(define_insn "*thumb_movdf_insn" + [(set (match_operand:DF 0 "nonimmediate_operand" "=l,l,>,l, m,*r") + (match_operand:DF 1 "general_operand" "l, >,l,mF,l,*r"))] + "TARGET_THUMB1 + && ( register_operand (operands[0], DFmode) + || register_operand (operands[1], DFmode))" + "* + switch (which_alternative) + { + default: + case 0: + if (REGNO (operands[1]) == REGNO (operands[0]) + 1) + return \"add\\t%0, %1, #0\;add\\t%H0, %H1, #0\"; + return \"add\\t%H0, %H1, #0\;add\\t%0, %1, #0\"; + case 1: + return \"ldmia\\t%1, {%0, %H0}\"; + case 2: + return \"stmia\\t%0, {%1, %H1}\"; + case 3: + return thumb_load_double_from_address (operands); + case 4: + operands[2] = gen_rtx_MEM (SImode, + plus_constant (XEXP (operands[0], 0), 4)); + output_asm_insn (\"str\\t%1, %0\;str\\t%H1, %2\", operands); + return \"\"; + case 5: + if (REGNO (operands[1]) == REGNO (operands[0]) + 1) + return \"mov\\t%0, %1\;mov\\t%H0, %H1\"; + return \"mov\\t%H0, %H1\;mov\\t%0, %1\"; + } + " + [(set_attr "length" "4,2,2,6,4,4") + (set_attr "type" "*,load2,store2,load2,store2,*") + (set_attr "insn" "*,*,*,*,*,mov") + (set_attr "pool_range" "*,*,*,1020,*,*")] +) + +(define_expand "movxf" + [(set (match_operand:XF 0 "general_operand" "") + (match_operand:XF 1 "general_operand" ""))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA" + " + if (GET_CODE (operands[0]) == MEM) + operands[1] = force_reg (XFmode, operands[1]); + " +) + + + +;; load- and store-multiple insns +;; The arm can load/store any set of registers, provided that they are in +;; ascending order, but these expanders assume a contiguous set. + +(define_expand "load_multiple" + [(match_par_dup 3 [(set (match_operand:SI 0 "" "") + (match_operand:SI 1 "" "")) + (use (match_operand:SI 2 "" ""))])] + "TARGET_32BIT" +{ + HOST_WIDE_INT offset = 0; + + /* Support only fixed point registers. */ + if (GET_CODE (operands[2]) != CONST_INT + || INTVAL (operands[2]) > 14 + || INTVAL (operands[2]) < 2 + || GET_CODE (operands[1]) != MEM + || GET_CODE (operands[0]) != REG + || REGNO (operands[0]) > (LAST_ARM_REGNUM - 1) + || REGNO (operands[0]) + INTVAL (operands[2]) > LAST_ARM_REGNUM) + FAIL; + + operands[3] + = arm_gen_load_multiple (arm_regs_in_sequence + REGNO (operands[0]), + INTVAL (operands[2]), + force_reg (SImode, XEXP (operands[1], 0)), + FALSE, operands[1], &offset); +}) + +(define_expand "store_multiple" + [(match_par_dup 3 [(set (match_operand:SI 0 "" "") + (match_operand:SI 1 "" "")) + (use (match_operand:SI 2 "" ""))])] + "TARGET_32BIT" +{ + HOST_WIDE_INT offset = 0; + + /* Support only fixed point registers. */ + if (GET_CODE (operands[2]) != CONST_INT + || INTVAL (operands[2]) > 14 + || INTVAL (operands[2]) < 2 + || GET_CODE (operands[1]) != REG + || GET_CODE (operands[0]) != MEM + || REGNO (operands[1]) > (LAST_ARM_REGNUM - 1) + || REGNO (operands[1]) + INTVAL (operands[2]) > LAST_ARM_REGNUM) + FAIL; + + operands[3] + = arm_gen_store_multiple (arm_regs_in_sequence + REGNO (operands[1]), + INTVAL (operands[2]), + force_reg (SImode, XEXP (operands[0], 0)), + FALSE, operands[0], &offset); +}) + + +;; Move a block of memory if it is word aligned and MORE than 2 words long. +;; We could let this apply for blocks of less than this, but it clobbers so +;; many registers that there is then probably a better way. + +(define_expand "movmemqi" + [(match_operand:BLK 0 "general_operand" "") + (match_operand:BLK 1 "general_operand" "") + (match_operand:SI 2 "const_int_operand" "") + (match_operand:SI 3 "const_int_operand" "")] + "TARGET_EITHER" + " + if (TARGET_32BIT) + { + if (arm_gen_movmemqi (operands)) + DONE; + FAIL; + } + else /* TARGET_THUMB1 */ + { + if ( INTVAL (operands[3]) != 4 + || INTVAL (operands[2]) > 48) + FAIL; + + thumb_expand_movmemqi (operands); + DONE; + } + " +) + +;; Thumb block-move insns + +(define_insn "movmem12b" + [(set (mem:SI (match_operand:SI 2 "register_operand" "0")) + (mem:SI (match_operand:SI 3 "register_operand" "1"))) + (set (mem:SI (plus:SI (match_dup 2) (const_int 4))) + (mem:SI (plus:SI (match_dup 3) (const_int 4)))) + (set (mem:SI (plus:SI (match_dup 2) (const_int 8))) + (mem:SI (plus:SI (match_dup 3) (const_int 8)))) + (set (match_operand:SI 0 "register_operand" "=l") + (plus:SI (match_dup 2) (const_int 12))) + (set (match_operand:SI 1 "register_operand" "=l") + (plus:SI (match_dup 3) (const_int 12))) + (clobber (match_scratch:SI 4 "=&l")) + (clobber (match_scratch:SI 5 "=&l")) + (clobber (match_scratch:SI 6 "=&l"))] + "TARGET_THUMB1" + "* return thumb_output_move_mem_multiple (3, operands);" + [(set_attr "length" "4") + ; This isn't entirely accurate... It loads as well, but in terms of + ; scheduling the following insn it is better to consider it as a store + (set_attr "type" "store3")] +) + +(define_insn "movmem8b" + [(set (mem:SI (match_operand:SI 2 "register_operand" "0")) + (mem:SI (match_operand:SI 3 "register_operand" "1"))) + (set (mem:SI (plus:SI (match_dup 2) (const_int 4))) + (mem:SI (plus:SI (match_dup 3) (const_int 4)))) + (set (match_operand:SI 0 "register_operand" "=l") + (plus:SI (match_dup 2) (const_int 8))) + (set (match_operand:SI 1 "register_operand" "=l") + (plus:SI (match_dup 3) (const_int 8))) + (clobber (match_scratch:SI 4 "=&l")) + (clobber (match_scratch:SI 5 "=&l"))] + "TARGET_THUMB1" + "* return thumb_output_move_mem_multiple (2, operands);" + [(set_attr "length" "4") + ; This isn't entirely accurate... It loads as well, but in terms of + ; scheduling the following insn it is better to consider it as a store + (set_attr "type" "store2")] +) + + + +;; Compare & branch insns +;; The range calculations are based as follows: +;; For forward branches, the address calculation returns the address of +;; the next instruction. This is 2 beyond the branch instruction. +;; For backward branches, the address calculation returns the address of +;; the first instruction in this pattern (cmp). This is 2 before the branch +;; instruction for the shortest sequence, and 4 before the branch instruction +;; if we have to jump around an unconditional branch. +;; To the basic branch range the PC offset must be added (this is +4). +;; So for forward branches we have +;; (pos_range - pos_base_offs + pc_offs) = (pos_range - 2 + 4). +;; And for backward branches we have +;; (neg_range - neg_base_offs + pc_offs) = (neg_range - (-2 or -4) + 4). +;; +;; For a 'b' pos_range = 2046, neg_range = -2048 giving (-2040->2048). +;; For a 'b' pos_range = 254, neg_range = -256 giving (-250 ->256). + +(define_expand "cbranchsi4" + [(set (pc) (if_then_else + (match_operator 0 "arm_comparison_operator" + [(match_operand:SI 1 "s_register_operand" "") + (match_operand:SI 2 "nonmemory_operand" "")]) + (label_ref (match_operand 3 "" "")) + (pc)))] + "TARGET_THUMB1 || TARGET_32BIT" + " + if (!TARGET_THUMB1) + { + if (!arm_add_operand (operands[2], SImode)) + operands[2] = force_reg (SImode, operands[2]); + emit_jump_insn (gen_cbranch_cc (operands[0], operands[1], operands[2], + operands[3])); + DONE; + } + if (thumb1_cmpneg_operand (operands[2], SImode)) + { + emit_jump_insn (gen_cbranchsi4_scratch (NULL, operands[1], operands[2], + operands[3], operands[0])); + DONE; + } + if (!thumb1_cmp_operand (operands[2], SImode)) + operands[2] = force_reg (SImode, operands[2]); + ") + +;; A pattern to recognize a special situation and optimize for it. +;; On the thumb, zero-extension from memory is preferrable to sign-extension +;; due to the available addressing modes. Hence, convert a signed comparison +;; with zero into an unsigned comparison with 127 if possible. +(define_expand "cbranchqi4" + [(set (pc) (if_then_else + (match_operator 0 "lt_ge_comparison_operator" + [(match_operand:QI 1 "memory_operand" "") + (match_operand:QI 2 "const0_operand" "")]) + (label_ref (match_operand 3 "" "")) + (pc)))] + "TARGET_THUMB1" +{ + rtx xops[4]; + xops[1] = gen_reg_rtx (SImode); + emit_insn (gen_zero_extendqisi2 (xops[1], operands[1])); + xops[2] = GEN_INT (127); + xops[0] = gen_rtx_fmt_ee (GET_CODE (operands[0]) == GE ? LEU : GTU, + VOIDmode, xops[1], xops[2]); + xops[3] = operands[3]; + emit_insn (gen_cbranchsi4 (xops[0], xops[1], xops[2], xops[3])); + DONE; +}) + +(define_expand "cbranchsf4" + [(set (pc) (if_then_else + (match_operator 0 "arm_comparison_operator" + [(match_operand:SF 1 "s_register_operand" "") + (match_operand:SF 2 "arm_float_compare_operand" "")]) + (label_ref (match_operand 3 "" "")) + (pc)))] + "TARGET_32BIT && TARGET_HARD_FLOAT" + "emit_jump_insn (gen_cbranch_cc (operands[0], operands[1], operands[2], + operands[3])); DONE;" +) + +(define_expand "cbranchdf4" + [(set (pc) (if_then_else + (match_operator 0 "arm_comparison_operator" + [(match_operand:DF 1 "s_register_operand" "") + (match_operand:DF 2 "arm_float_compare_operand" "")]) + (label_ref (match_operand 3 "" "")) + (pc)))] + "TARGET_32BIT && TARGET_HARD_FLOAT && !TARGET_VFP_SINGLE" + "emit_jump_insn (gen_cbranch_cc (operands[0], operands[1], operands[2], + operands[3])); DONE;" +) + +(define_expand "cbranchdi4" + [(set (pc) (if_then_else + (match_operator 0 "arm_comparison_operator" + [(match_operand:DI 1 "cmpdi_operand" "") + (match_operand:DI 2 "cmpdi_operand" "")]) + (label_ref (match_operand 3 "" "")) + (pc)))] + "TARGET_32BIT" + "{ + rtx swap = NULL_RTX; + enum rtx_code code = GET_CODE (operands[0]); + + /* We should not have two constants. */ + gcc_assert (GET_MODE (operands[1]) == DImode + || GET_MODE (operands[2]) == DImode); + + /* Flip unimplemented DImode comparisons to a form that + arm_gen_compare_reg can handle. */ + switch (code) + { + case GT: + swap = gen_rtx_LT (VOIDmode, operands[2], operands[1]); break; + case LE: + swap = gen_rtx_GE (VOIDmode, operands[2], operands[1]); break; + case GTU: + swap = gen_rtx_LTU (VOIDmode, operands[2], operands[1]); break; + case LEU: + swap = gen_rtx_GEU (VOIDmode, operands[2], operands[1]); break; + default: + break; + } + if (swap) + emit_jump_insn (gen_cbranch_cc (swap, operands[2], operands[1], + operands[3])); + else + emit_jump_insn (gen_cbranch_cc (operands[0], operands[1], operands[2], + operands[3])); + DONE; + }" +) + +(define_insn "cbranchsi4_insn" + [(set (pc) (if_then_else + (match_operator 0 "arm_comparison_operator" + [(match_operand:SI 1 "s_register_operand" "l,l*h") + (match_operand:SI 2 "thumb1_cmp_operand" "lI*h,*r")]) + (label_ref (match_operand 3 "" "")) + (pc)))] + "TARGET_THUMB1" +{ + rtx t = cfun->machine->thumb1_cc_insn; + if (t != NULL_RTX) + { + if (!rtx_equal_p (cfun->machine->thumb1_cc_op0, operands[1]) + || !rtx_equal_p (cfun->machine->thumb1_cc_op1, operands[2])) + t = NULL_RTX; + if (cfun->machine->thumb1_cc_mode == CC_NOOVmode) + { + if (!noov_comparison_operator (operands[0], VOIDmode)) + t = NULL_RTX; + } + else if (cfun->machine->thumb1_cc_mode != CCmode) + t = NULL_RTX; + } + if (t == NULL_RTX) + { + output_asm_insn ("cmp\t%1, %2", operands); + cfun->machine->thumb1_cc_insn = insn; + cfun->machine->thumb1_cc_op0 = operands[1]; + cfun->machine->thumb1_cc_op1 = operands[2]; + cfun->machine->thumb1_cc_mode = CCmode; + } + else + /* Ensure we emit the right type of condition code on the jump. */ + XEXP (operands[0], 0) = gen_rtx_REG (cfun->machine->thumb1_cc_mode, + CC_REGNUM); + + switch (get_attr_length (insn)) + { + case 4: return \"b%d0\\t%l3\"; + case 6: return \"b%D0\\t.LCB%=\;b\\t%l3\\t%@long jump\\n.LCB%=:\"; + default: return \"b%D0\\t.LCB%=\;bl\\t%l3\\t%@far jump\\n.LCB%=:\"; + } +} + [(set (attr "far_jump") + (if_then_else + (eq_attr "length" "8") + (const_string "yes") + (const_string "no"))) + (set (attr "length") + (if_then_else + (and (ge (minus (match_dup 3) (pc)) (const_int -250)) + (le (minus (match_dup 3) (pc)) (const_int 256))) + (const_int 4) + (if_then_else + (and (ge (minus (match_dup 3) (pc)) (const_int -2040)) + (le (minus (match_dup 3) (pc)) (const_int 2048))) + (const_int 6) + (const_int 8))))] +) + +(define_insn "cbranchsi4_scratch" + [(set (pc) (if_then_else + (match_operator 4 "arm_comparison_operator" + [(match_operand:SI 1 "s_register_operand" "l,0") + (match_operand:SI 2 "thumb1_cmpneg_operand" "L,J")]) + (label_ref (match_operand 3 "" "")) + (pc))) + (clobber (match_scratch:SI 0 "=l,l"))] + "TARGET_THUMB1" + "* + output_asm_insn (\"add\\t%0, %1, #%n2\", operands); + + switch (get_attr_length (insn)) + { + case 4: return \"b%d4\\t%l3\"; + case 6: return \"b%D4\\t.LCB%=\;b\\t%l3\\t%@long jump\\n.LCB%=:\"; + default: return \"b%D4\\t.LCB%=\;bl\\t%l3\\t%@far jump\\n.LCB%=:\"; + } + " + [(set (attr "far_jump") + (if_then_else + (eq_attr "length" "8") + (const_string "yes") + (const_string "no"))) + (set (attr "length") + (if_then_else + (and (ge (minus (match_dup 3) (pc)) (const_int -250)) + (le (minus (match_dup 3) (pc)) (const_int 256))) + (const_int 4) + (if_then_else + (and (ge (minus (match_dup 3) (pc)) (const_int -2040)) + (le (minus (match_dup 3) (pc)) (const_int 2048))) + (const_int 6) + (const_int 8))))] +) + +;; Two peepholes to generate subtract of 0 instead of a move if the +;; condition codes will be useful. +(define_peephole2 + [(set (match_operand:SI 0 "low_register_operand" "") + (match_operand:SI 1 "low_register_operand" "")) + (set (pc) + (if_then_else (match_operator 2 "arm_comparison_operator" + [(match_dup 1) (const_int 0)]) + (label_ref (match_operand 3 "" "")) + (pc)))] + "TARGET_THUMB1" + [(set (match_dup 0) (minus:SI (match_dup 1) (const_int 0))) + (set (pc) + (if_then_else (match_op_dup 2 [(match_dup 0) (const_int 0)]) + (label_ref (match_dup 3)) + (pc)))] + "") + +;; Sigh! This variant shouldn't be needed, but combine often fails to +;; merge cases like this because the op1 is a hard register in +;; arm_class_likely_spilled_p. +(define_peephole2 + [(set (match_operand:SI 0 "low_register_operand" "") + (match_operand:SI 1 "low_register_operand" "")) + (set (pc) + (if_then_else (match_operator 2 "arm_comparison_operator" + [(match_dup 0) (const_int 0)]) + (label_ref (match_operand 3 "" "")) + (pc)))] + "TARGET_THUMB1" + [(set (match_dup 0) (minus:SI (match_dup 1) (const_int 0))) + (set (pc) + (if_then_else (match_op_dup 2 [(match_dup 0) (const_int 0)]) + (label_ref (match_dup 3)) + (pc)))] + "") + +(define_insn "*negated_cbranchsi4" + [(set (pc) + (if_then_else + (match_operator 0 "equality_operator" + [(match_operand:SI 1 "s_register_operand" "l") + (neg:SI (match_operand:SI 2 "s_register_operand" "l"))]) + (label_ref (match_operand 3 "" "")) + (pc)))] + "TARGET_THUMB1" + "* + output_asm_insn (\"cmn\\t%1, %2\", operands); + switch (get_attr_length (insn)) + { + case 4: return \"b%d0\\t%l3\"; + case 6: return \"b%D0\\t.LCB%=\;b\\t%l3\\t%@long jump\\n.LCB%=:\"; + default: return \"b%D0\\t.LCB%=\;bl\\t%l3\\t%@far jump\\n.LCB%=:\"; + } + " + [(set (attr "far_jump") + (if_then_else + (eq_attr "length" "8") + (const_string "yes") + (const_string "no"))) + (set (attr "length") + (if_then_else + (and (ge (minus (match_dup 3) (pc)) (const_int -250)) + (le (minus (match_dup 3) (pc)) (const_int 256))) + (const_int 4) + (if_then_else + (and (ge (minus (match_dup 3) (pc)) (const_int -2040)) + (le (minus (match_dup 3) (pc)) (const_int 2048))) + (const_int 6) + (const_int 8))))] +) + +(define_insn "*tbit_cbranch" + [(set (pc) + (if_then_else + (match_operator 0 "equality_operator" + [(zero_extract:SI (match_operand:SI 1 "s_register_operand" "l") + (const_int 1) + (match_operand:SI 2 "const_int_operand" "i")) + (const_int 0)]) + (label_ref (match_operand 3 "" "")) + (pc))) + (clobber (match_scratch:SI 4 "=l"))] + "TARGET_THUMB1" + "* + { + rtx op[3]; + op[0] = operands[4]; + op[1] = operands[1]; + op[2] = GEN_INT (32 - 1 - INTVAL (operands[2])); + + output_asm_insn (\"lsl\\t%0, %1, %2\", op); + switch (get_attr_length (insn)) + { + case 4: return \"b%d0\\t%l3\"; + case 6: return \"b%D0\\t.LCB%=\;b\\t%l3\\t%@long jump\\n.LCB%=:\"; + default: return \"b%D0\\t.LCB%=\;bl\\t%l3\\t%@far jump\\n.LCB%=:\"; + } + }" + [(set (attr "far_jump") + (if_then_else + (eq_attr "length" "8") + (const_string "yes") + (const_string "no"))) + (set (attr "length") + (if_then_else + (and (ge (minus (match_dup 3) (pc)) (const_int -250)) + (le (minus (match_dup 3) (pc)) (const_int 256))) + (const_int 4) + (if_then_else + (and (ge (minus (match_dup 3) (pc)) (const_int -2040)) + (le (minus (match_dup 3) (pc)) (const_int 2048))) + (const_int 6) + (const_int 8))))] +) + +(define_insn "*tlobits_cbranch" + [(set (pc) + (if_then_else + (match_operator 0 "equality_operator" + [(zero_extract:SI (match_operand:SI 1 "s_register_operand" "l") + (match_operand:SI 2 "const_int_operand" "i") + (const_int 0)) + (const_int 0)]) + (label_ref (match_operand 3 "" "")) + (pc))) + (clobber (match_scratch:SI 4 "=l"))] + "TARGET_THUMB1" + "* + { + rtx op[3]; + op[0] = operands[4]; + op[1] = operands[1]; + op[2] = GEN_INT (32 - INTVAL (operands[2])); + + output_asm_insn (\"lsl\\t%0, %1, %2\", op); + switch (get_attr_length (insn)) + { + case 4: return \"b%d0\\t%l3\"; + case 6: return \"b%D0\\t.LCB%=\;b\\t%l3\\t%@long jump\\n.LCB%=:\"; + default: return \"b%D0\\t.LCB%=\;bl\\t%l3\\t%@far jump\\n.LCB%=:\"; + } + }" + [(set (attr "far_jump") + (if_then_else + (eq_attr "length" "8") + (const_string "yes") + (const_string "no"))) + (set (attr "length") + (if_then_else + (and (ge (minus (match_dup 3) (pc)) (const_int -250)) + (le (minus (match_dup 3) (pc)) (const_int 256))) + (const_int 4) + (if_then_else + (and (ge (minus (match_dup 3) (pc)) (const_int -2040)) + (le (minus (match_dup 3) (pc)) (const_int 2048))) + (const_int 6) + (const_int 8))))] +) + +(define_insn "*tstsi3_cbranch" + [(set (pc) + (if_then_else + (match_operator 3 "equality_operator" + [(and:SI (match_operand:SI 0 "s_register_operand" "%l") + (match_operand:SI 1 "s_register_operand" "l")) + (const_int 0)]) + (label_ref (match_operand 2 "" "")) + (pc)))] + "TARGET_THUMB1" + "* + { + output_asm_insn (\"tst\\t%0, %1\", operands); + switch (get_attr_length (insn)) + { + case 4: return \"b%d3\\t%l2\"; + case 6: return \"b%D3\\t.LCB%=\;b\\t%l2\\t%@long jump\\n.LCB%=:\"; + default: return \"b%D3\\t.LCB%=\;bl\\t%l2\\t%@far jump\\n.LCB%=:\"; + } + }" + [(set (attr "far_jump") + (if_then_else + (eq_attr "length" "8") + (const_string "yes") + (const_string "no"))) + (set (attr "length") + (if_then_else + (and (ge (minus (match_dup 2) (pc)) (const_int -250)) + (le (minus (match_dup 2) (pc)) (const_int 256))) + (const_int 4) + (if_then_else + (and (ge (minus (match_dup 2) (pc)) (const_int -2040)) + (le (minus (match_dup 2) (pc)) (const_int 2048))) + (const_int 6) + (const_int 8))))] +) + +(define_insn "*cbranchne_decr1" + [(set (pc) + (if_then_else (match_operator 3 "equality_operator" + [(match_operand:SI 2 "s_register_operand" "l,l,1,l") + (const_int 0)]) + (label_ref (match_operand 4 "" "")) + (pc))) + (set (match_operand:SI 0 "thumb_cbrch_target_operand" "=l,*?h,*?m,*?m") + (plus:SI (match_dup 2) (const_int -1))) + (clobber (match_scratch:SI 1 "=X,l,&l,&l"))] + "TARGET_THUMB1" + "* + { + rtx cond[2]; + cond[0] = gen_rtx_fmt_ee ((GET_CODE (operands[3]) == NE + ? GEU : LTU), + VOIDmode, operands[2], const1_rtx); + cond[1] = operands[4]; + + if (which_alternative == 0) + output_asm_insn (\"sub\\t%0, %2, #1\", operands); + else if (which_alternative == 1) + { + /* We must provide an alternative for a hi reg because reload + cannot handle output reloads on a jump instruction, but we + can't subtract into that. Fortunately a mov from lo to hi + does not clobber the condition codes. */ + output_asm_insn (\"sub\\t%1, %2, #1\", operands); + output_asm_insn (\"mov\\t%0, %1\", operands); + } + else + { + /* Similarly, but the target is memory. */ + output_asm_insn (\"sub\\t%1, %2, #1\", operands); + output_asm_insn (\"str\\t%1, %0\", operands); + } + + switch (get_attr_length (insn) - (which_alternative ? 2 : 0)) + { + case 4: + output_asm_insn (\"b%d0\\t%l1\", cond); + return \"\"; + case 6: + output_asm_insn (\"b%D0\\t.LCB%=\", cond); + return \"b\\t%l4\\t%@long jump\\n.LCB%=:\"; + default: + output_asm_insn (\"b%D0\\t.LCB%=\", cond); + return \"bl\\t%l4\\t%@far jump\\n.LCB%=:\"; + } + } + " + [(set (attr "far_jump") + (if_then_else + (ior (and (eq (symbol_ref ("which_alternative")) + (const_int 0)) + (eq_attr "length" "8")) + (eq_attr "length" "10")) + (const_string "yes") + (const_string "no"))) + (set_attr_alternative "length" + [ + ;; Alternative 0 + (if_then_else + (and (ge (minus (match_dup 4) (pc)) (const_int -250)) + (le (minus (match_dup 4) (pc)) (const_int 256))) + (const_int 4) + (if_then_else + (and (ge (minus (match_dup 4) (pc)) (const_int -2040)) + (le (minus (match_dup 4) (pc)) (const_int 2048))) + (const_int 6) + (const_int 8))) + ;; Alternative 1 + (if_then_else + (and (ge (minus (match_dup 4) (pc)) (const_int -248)) + (le (minus (match_dup 4) (pc)) (const_int 256))) + (const_int 6) + (if_then_else + (and (ge (minus (match_dup 4) (pc)) (const_int -2038)) + (le (minus (match_dup 4) (pc)) (const_int 2048))) + (const_int 8) + (const_int 10))) + ;; Alternative 2 + (if_then_else + (and (ge (minus (match_dup 4) (pc)) (const_int -248)) + (le (minus (match_dup 4) (pc)) (const_int 256))) + (const_int 6) + (if_then_else + (and (ge (minus (match_dup 4) (pc)) (const_int -2038)) + (le (minus (match_dup 4) (pc)) (const_int 2048))) + (const_int 8) + (const_int 10))) + ;; Alternative 3 + (if_then_else + (and (ge (minus (match_dup 4) (pc)) (const_int -248)) + (le (minus (match_dup 4) (pc)) (const_int 256))) + (const_int 6) + (if_then_else + (and (ge (minus (match_dup 4) (pc)) (const_int -2038)) + (le (minus (match_dup 4) (pc)) (const_int 2048))) + (const_int 8) + (const_int 10)))])] +) + +(define_insn "*addsi3_cbranch" + [(set (pc) + (if_then_else + (match_operator 4 "arm_comparison_operator" + [(plus:SI + (match_operand:SI 2 "s_register_operand" "%0,l,*l,1,1,1") + (match_operand:SI 3 "reg_or_int_operand" "IJ,lL,*l,lIJ,lIJ,lIJ")) + (const_int 0)]) + (label_ref (match_operand 5 "" "")) + (pc))) + (set + (match_operand:SI 0 "thumb_cbrch_target_operand" "=l,l,*!h,*?h,*?m,*?m") + (plus:SI (match_dup 2) (match_dup 3))) + (clobber (match_scratch:SI 1 "=X,X,l,l,&l,&l"))] + "TARGET_THUMB1 + && (GET_CODE (operands[4]) == EQ + || GET_CODE (operands[4]) == NE + || GET_CODE (operands[4]) == GE + || GET_CODE (operands[4]) == LT)" + "* + { + rtx cond[3]; + + cond[0] = (which_alternative < 2) ? operands[0] : operands[1]; + cond[1] = operands[2]; + cond[2] = operands[3]; + + if (GET_CODE (cond[2]) == CONST_INT && INTVAL (cond[2]) < 0) + output_asm_insn (\"sub\\t%0, %1, #%n2\", cond); + else + output_asm_insn (\"add\\t%0, %1, %2\", cond); + + if (which_alternative >= 2 + && which_alternative < 4) + output_asm_insn (\"mov\\t%0, %1\", operands); + else if (which_alternative >= 4) + output_asm_insn (\"str\\t%1, %0\", operands); + + switch (get_attr_length (insn) - ((which_alternative >= 2) ? 2 : 0)) + { + case 4: + return \"b%d4\\t%l5\"; + case 6: + return \"b%D4\\t.LCB%=\;b\\t%l5\\t%@long jump\\n.LCB%=:\"; + default: + return \"b%D4\\t.LCB%=\;bl\\t%l5\\t%@far jump\\n.LCB%=:\"; + } + } + " + [(set (attr "far_jump") + (if_then_else + (ior (and (lt (symbol_ref ("which_alternative")) + (const_int 2)) + (eq_attr "length" "8")) + (eq_attr "length" "10")) + (const_string "yes") + (const_string "no"))) + (set (attr "length") + (if_then_else + (lt (symbol_ref ("which_alternative")) + (const_int 2)) + (if_then_else + (and (ge (minus (match_dup 5) (pc)) (const_int -250)) + (le (minus (match_dup 5) (pc)) (const_int 256))) + (const_int 4) + (if_then_else + (and (ge (minus (match_dup 5) (pc)) (const_int -2040)) + (le (minus (match_dup 5) (pc)) (const_int 2048))) + (const_int 6) + (const_int 8))) + (if_then_else + (and (ge (minus (match_dup 5) (pc)) (const_int -248)) + (le (minus (match_dup 5) (pc)) (const_int 256))) + (const_int 6) + (if_then_else + (and (ge (minus (match_dup 5) (pc)) (const_int -2038)) + (le (minus (match_dup 5) (pc)) (const_int 2048))) + (const_int 8) + (const_int 10)))))] +) + +(define_insn "*addsi3_cbranch_scratch" + [(set (pc) + (if_then_else + (match_operator 3 "arm_comparison_operator" + [(plus:SI + (match_operand:SI 1 "s_register_operand" "%l,l,l,0") + (match_operand:SI 2 "reg_or_int_operand" "J,l,L,IJ")) + (const_int 0)]) + (label_ref (match_operand 4 "" "")) + (pc))) + (clobber (match_scratch:SI 0 "=X,X,l,l"))] + "TARGET_THUMB1 + && (GET_CODE (operands[3]) == EQ + || GET_CODE (operands[3]) == NE + || GET_CODE (operands[3]) == GE + || GET_CODE (operands[3]) == LT)" + "* + { + switch (which_alternative) + { + case 0: + output_asm_insn (\"cmp\t%1, #%n2\", operands); + break; + case 1: + output_asm_insn (\"cmn\t%1, %2\", operands); + break; + case 2: + if (INTVAL (operands[2]) < 0) + output_asm_insn (\"sub\t%0, %1, %2\", operands); + else + output_asm_insn (\"add\t%0, %1, %2\", operands); + break; + case 3: + if (INTVAL (operands[2]) < 0) + output_asm_insn (\"sub\t%0, %0, %2\", operands); + else + output_asm_insn (\"add\t%0, %0, %2\", operands); + break; + } + + switch (get_attr_length (insn)) + { + case 4: + return \"b%d3\\t%l4\"; + case 6: + return \"b%D3\\t.LCB%=\;b\\t%l4\\t%@long jump\\n.LCB%=:\"; + default: + return \"b%D3\\t.LCB%=\;bl\\t%l4\\t%@far jump\\n.LCB%=:\"; + } + } + " + [(set (attr "far_jump") + (if_then_else + (eq_attr "length" "8") + (const_string "yes") + (const_string "no"))) + (set (attr "length") + (if_then_else + (and (ge (minus (match_dup 4) (pc)) (const_int -250)) + (le (minus (match_dup 4) (pc)) (const_int 256))) + (const_int 4) + (if_then_else + (and (ge (minus (match_dup 4) (pc)) (const_int -2040)) + (le (minus (match_dup 4) (pc)) (const_int 2048))) + (const_int 6) + (const_int 8))))] +) + + +;; Comparison and test insns + +(define_insn "*arm_cmpsi_insn" + [(set (reg:CC CC_REGNUM) + (compare:CC (match_operand:SI 0 "s_register_operand" "r,r") + (match_operand:SI 1 "arm_add_operand" "rI,L")))] + "TARGET_32BIT" + "@ + cmp%?\\t%0, %1 + cmn%?\\t%0, #%n1" + [(set_attr "conds" "set")] +) + +(define_insn "*cmpsi_shiftsi" + [(set (reg:CC CC_REGNUM) + (compare:CC (match_operand:SI 0 "s_register_operand" "r,r") + (match_operator:SI 3 "shift_operator" + [(match_operand:SI 1 "s_register_operand" "r,r") + (match_operand:SI 2 "shift_amount_operand" "M,rM")])))] + "TARGET_32BIT" + "cmp%?\\t%0, %1%S3" + [(set_attr "conds" "set") + (set_attr "shift" "1") + (set_attr "arch" "32,a") + (set_attr "type" "alu_shift,alu_shift_reg")]) + +(define_insn "*cmpsi_shiftsi_swp" + [(set (reg:CC_SWP CC_REGNUM) + (compare:CC_SWP (match_operator:SI 3 "shift_operator" + [(match_operand:SI 1 "s_register_operand" "r,r") + (match_operand:SI 2 "shift_amount_operand" "M,rM")]) + (match_operand:SI 0 "s_register_operand" "r,r")))] + "TARGET_32BIT" + "cmp%?\\t%0, %1%S3" + [(set_attr "conds" "set") + (set_attr "shift" "1") + (set_attr "arch" "32,a") + (set_attr "type" "alu_shift,alu_shift_reg")]) + +(define_insn "*arm_cmpsi_negshiftsi_si" + [(set (reg:CC_Z CC_REGNUM) + (compare:CC_Z + (neg:SI (match_operator:SI 1 "shift_operator" + [(match_operand:SI 2 "s_register_operand" "r") + (match_operand:SI 3 "reg_or_int_operand" "rM")])) + (match_operand:SI 0 "s_register_operand" "r")))] + "TARGET_ARM" + "cmn%?\\t%0, %2%S1" + [(set_attr "conds" "set") + (set (attr "type") (if_then_else (match_operand 3 "const_int_operand" "") + (const_string "alu_shift") + (const_string "alu_shift_reg")))] +) + +;; DImode comparisons. The generic code generates branches that +;; if-conversion can not reduce to a conditional compare, so we do +;; that directly. + +(define_insn "*arm_cmpdi_insn" + [(set (reg:CC_NCV CC_REGNUM) + (compare:CC_NCV (match_operand:DI 0 "s_register_operand" "r") + (match_operand:DI 1 "arm_di_operand" "rDi"))) + (clobber (match_scratch:SI 2 "=r"))] + "TARGET_32BIT && !(TARGET_HARD_FLOAT && TARGET_MAVERICK)" + "cmp\\t%Q0, %Q1\;sbcs\\t%2, %R0, %R1" + [(set_attr "conds" "set") + (set_attr "length" "8")] +) + +(define_insn "*arm_cmpdi_unsigned" + [(set (reg:CC_CZ CC_REGNUM) + (compare:CC_CZ (match_operand:DI 0 "s_register_operand" "r") + (match_operand:DI 1 "arm_di_operand" "rDi")))] + "TARGET_ARM" + "cmp%?\\t%R0, %R1\;cmpeq\\t%Q0, %Q1" + [(set_attr "conds" "set") + (set_attr "length" "8")] +) + +(define_insn "*arm_cmpdi_zero" + [(set (reg:CC_Z CC_REGNUM) + (compare:CC_Z (match_operand:DI 0 "s_register_operand" "r") + (const_int 0))) + (clobber (match_scratch:SI 1 "=r"))] + "TARGET_32BIT" + "orr%.\\t%1, %Q0, %R0" + [(set_attr "conds" "set")] +) + +(define_insn "*thumb_cmpdi_zero" + [(set (reg:CC_Z CC_REGNUM) + (compare:CC_Z (match_operand:DI 0 "s_register_operand" "l") + (const_int 0))) + (clobber (match_scratch:SI 1 "=l"))] + "TARGET_THUMB1" + "orr\\t%1, %Q0, %R0" + [(set_attr "conds" "set") + (set_attr "length" "2")] +) + +;; Cirrus SF compare instruction +(define_insn "*cirrus_cmpsf" + [(set (reg:CCFP CC_REGNUM) + (compare:CCFP (match_operand:SF 0 "cirrus_fp_register" "v") + (match_operand:SF 1 "cirrus_fp_register" "v")))] + "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_MAVERICK" + "cfcmps%?\\tr15, %V0, %V1" + [(set_attr "type" "mav_farith") + (set_attr "cirrus" "compare")] +) + +;; Cirrus DF compare instruction +(define_insn "*cirrus_cmpdf" + [(set (reg:CCFP CC_REGNUM) + (compare:CCFP (match_operand:DF 0 "cirrus_fp_register" "v") + (match_operand:DF 1 "cirrus_fp_register" "v")))] + "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_MAVERICK" + "cfcmpd%?\\tr15, %V0, %V1" + [(set_attr "type" "mav_farith") + (set_attr "cirrus" "compare")] +) + +(define_insn "*cirrus_cmpdi" + [(set (reg:CC CC_REGNUM) + (compare:CC (match_operand:DI 0 "cirrus_fp_register" "v") + (match_operand:DI 1 "cirrus_fp_register" "v")))] + "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_MAVERICK" + "cfcmp64%?\\tr15, %V0, %V1" + [(set_attr "type" "mav_farith") + (set_attr "cirrus" "compare")] +) + +; This insn allows redundant compares to be removed by cse, nothing should +; ever appear in the output file since (set (reg x) (reg x)) is a no-op that +; is deleted later on. The match_dup will match the mode here, so that +; mode changes of the condition codes aren't lost by this even though we don't +; specify what they are. + +(define_insn "*deleted_compare" + [(set (match_operand 0 "cc_register" "") (match_dup 0))] + "TARGET_32BIT" + "\\t%@ deleted compare" + [(set_attr "conds" "set") + (set_attr "length" "0")] +) + + +;; Conditional branch insns + +(define_expand "cbranch_cc" + [(set (pc) + (if_then_else (match_operator 0 "" [(match_operand 1 "" "") + (match_operand 2 "" "")]) + (label_ref (match_operand 3 "" "")) + (pc)))] + "TARGET_32BIT" + "operands[1] = arm_gen_compare_reg (GET_CODE (operands[0]), + operands[1], operands[2]); + operands[2] = const0_rtx;" +) + +;; +;; Patterns to match conditional branch insns. +;; + +(define_insn "*arm_cond_branch" + [(set (pc) + (if_then_else (match_operator 1 "arm_comparison_operator" + [(match_operand 2 "cc_register" "") (const_int 0)]) + (label_ref (match_operand 0 "" "")) + (pc)))] + "TARGET_32BIT" + "* + if (arm_ccfsm_state == 1 || arm_ccfsm_state == 2) + { + arm_ccfsm_state += 2; + return \"\"; + } + return \"b%d1\\t%l0\"; + " + [(set_attr "conds" "use") + (set_attr "type" "branch")] +) + +(define_insn "*arm_cond_branch_reversed" + [(set (pc) + (if_then_else (match_operator 1 "arm_comparison_operator" + [(match_operand 2 "cc_register" "") (const_int 0)]) + (pc) + (label_ref (match_operand 0 "" ""))))] + "TARGET_32BIT" + "* + if (arm_ccfsm_state == 1 || arm_ccfsm_state == 2) + { + arm_ccfsm_state += 2; + return \"\"; + } + return \"b%D1\\t%l0\"; + " + [(set_attr "conds" "use") + (set_attr "type" "branch")] +) + + + +; scc insns + +(define_expand "cstore_cc" + [(set (match_operand:SI 0 "s_register_operand" "") + (match_operator:SI 1 "" [(match_operand 2 "" "") + (match_operand 3 "" "")]))] + "TARGET_32BIT" + "operands[2] = arm_gen_compare_reg (GET_CODE (operands[1]), + operands[2], operands[3]); + operands[3] = const0_rtx;" +) + +(define_insn "*mov_scc" + [(set (match_operand:SI 0 "s_register_operand" "=r") + (match_operator:SI 1 "arm_comparison_operator" + [(match_operand 2 "cc_register" "") (const_int 0)]))] + "TARGET_ARM" + "mov%D1\\t%0, #0\;mov%d1\\t%0, #1" + [(set_attr "conds" "use") + (set_attr "insn" "mov") + (set_attr "length" "8")] +) + +(define_insn "*mov_negscc" + [(set (match_operand:SI 0 "s_register_operand" "=r") + (neg:SI (match_operator:SI 1 "arm_comparison_operator" + [(match_operand 2 "cc_register" "") (const_int 0)])))] + "TARGET_ARM" + "mov%D1\\t%0, #0\;mvn%d1\\t%0, #0" + [(set_attr "conds" "use") + (set_attr "insn" "mov") + (set_attr "length" "8")] +) + +(define_insn "*mov_notscc" + [(set (match_operand:SI 0 "s_register_operand" "=r") + (not:SI (match_operator:SI 1 "arm_comparison_operator" + [(match_operand 2 "cc_register" "") (const_int 0)])))] + "TARGET_ARM" + "mvn%D1\\t%0, #0\;mvn%d1\\t%0, #1" + [(set_attr "conds" "use") + (set_attr "insn" "mov") + (set_attr "length" "8")] +) + +(define_expand "cstoresi4" + [(set (match_operand:SI 0 "s_register_operand" "") + (match_operator:SI 1 "arm_comparison_operator" + [(match_operand:SI 2 "s_register_operand" "") + (match_operand:SI 3 "reg_or_int_operand" "")]))] + "TARGET_32BIT || TARGET_THUMB1" + "{ + rtx op3, scratch, scratch2; + + if (!TARGET_THUMB1) + { + if (!arm_add_operand (operands[3], SImode)) + operands[3] = force_reg (SImode, operands[3]); + emit_insn (gen_cstore_cc (operands[0], operands[1], + operands[2], operands[3])); + DONE; + } + + if (operands[3] == const0_rtx) + { + switch (GET_CODE (operands[1])) + { + case EQ: + emit_insn (gen_cstoresi_eq0_thumb1 (operands[0], operands[2])); + break; + + case NE: + emit_insn (gen_cstoresi_ne0_thumb1 (operands[0], operands[2])); + break; + + case LE: + scratch = expand_binop (SImode, add_optab, operands[2], constm1_rtx, + NULL_RTX, 0, OPTAB_WIDEN); + scratch = expand_binop (SImode, ior_optab, operands[2], scratch, + NULL_RTX, 0, OPTAB_WIDEN); + expand_binop (SImode, lshr_optab, scratch, GEN_INT (31), + operands[0], 1, OPTAB_WIDEN); + break; + + case GE: + scratch = expand_unop (SImode, one_cmpl_optab, operands[2], + NULL_RTX, 1); + expand_binop (SImode, lshr_optab, scratch, GEN_INT (31), + NULL_RTX, 1, OPTAB_WIDEN); + break; + + case GT: + scratch = expand_binop (SImode, ashr_optab, operands[2], + GEN_INT (31), NULL_RTX, 0, OPTAB_WIDEN); + scratch = expand_binop (SImode, sub_optab, scratch, operands[2], + NULL_RTX, 0, OPTAB_WIDEN); + expand_binop (SImode, lshr_optab, scratch, GEN_INT (31), operands[0], + 0, OPTAB_WIDEN); + break; + + /* LT is handled by generic code. No need for unsigned with 0. */ + default: + FAIL; + } + DONE; + } + + switch (GET_CODE (operands[1])) + { + case EQ: + scratch = expand_binop (SImode, sub_optab, operands[2], operands[3], + NULL_RTX, 0, OPTAB_WIDEN); + emit_insn (gen_cstoresi_eq0_thumb1 (operands[0], scratch)); + break; + + case NE: + scratch = expand_binop (SImode, sub_optab, operands[2], operands[3], + NULL_RTX, 0, OPTAB_WIDEN); + emit_insn (gen_cstoresi_ne0_thumb1 (operands[0], scratch)); + break; + + case LE: + op3 = force_reg (SImode, operands[3]); + + scratch = expand_binop (SImode, lshr_optab, operands[2], GEN_INT (31), + NULL_RTX, 1, OPTAB_WIDEN); + scratch2 = expand_binop (SImode, ashr_optab, op3, GEN_INT (31), + NULL_RTX, 0, OPTAB_WIDEN); + emit_insn (gen_thumb1_addsi3_addgeu (operands[0], scratch, scratch2, + op3, operands[2])); + break; + + case GE: + op3 = operands[3]; + if (!thumb1_cmp_operand (op3, SImode)) + op3 = force_reg (SImode, op3); + scratch = expand_binop (SImode, ashr_optab, operands[2], GEN_INT (31), + NULL_RTX, 0, OPTAB_WIDEN); + scratch2 = expand_binop (SImode, lshr_optab, op3, GEN_INT (31), + NULL_RTX, 1, OPTAB_WIDEN); + emit_insn (gen_thumb1_addsi3_addgeu (operands[0], scratch, scratch2, + operands[2], op3)); + break; + + case LEU: + op3 = force_reg (SImode, operands[3]); + scratch = force_reg (SImode, const0_rtx); + emit_insn (gen_thumb1_addsi3_addgeu (operands[0], scratch, scratch, + op3, operands[2])); + break; + + case GEU: + op3 = operands[3]; + if (!thumb1_cmp_operand (op3, SImode)) + op3 = force_reg (SImode, op3); + scratch = force_reg (SImode, const0_rtx); + emit_insn (gen_thumb1_addsi3_addgeu (operands[0], scratch, scratch, + operands[2], op3)); + break; + + case LTU: + op3 = operands[3]; + if (!thumb1_cmp_operand (op3, SImode)) + op3 = force_reg (SImode, op3); + scratch = gen_reg_rtx (SImode); + emit_insn (gen_cstoresi_ltu_thumb1 (operands[0], operands[2], op3)); + break; + + case GTU: + op3 = force_reg (SImode, operands[3]); + scratch = gen_reg_rtx (SImode); + emit_insn (gen_cstoresi_ltu_thumb1 (operands[0], op3, operands[2])); + break; + + /* No good sequences for GT, LT. */ + default: + FAIL; + } + DONE; +}") + +(define_expand "cstoresf4" + [(set (match_operand:SI 0 "s_register_operand" "") + (match_operator:SI 1 "arm_comparison_operator" + [(match_operand:SF 2 "s_register_operand" "") + (match_operand:SF 3 "arm_float_compare_operand" "")]))] + "TARGET_32BIT && TARGET_HARD_FLOAT" + "emit_insn (gen_cstore_cc (operands[0], operands[1], + operands[2], operands[3])); DONE;" +) + +(define_expand "cstoredf4" + [(set (match_operand:SI 0 "s_register_operand" "") + (match_operator:SI 1 "arm_comparison_operator" + [(match_operand:DF 2 "s_register_operand" "") + (match_operand:DF 3 "arm_float_compare_operand" "")]))] + "TARGET_32BIT && TARGET_HARD_FLOAT && !TARGET_VFP_SINGLE" + "emit_insn (gen_cstore_cc (operands[0], operands[1], + operands[2], operands[3])); DONE;" +) + +(define_expand "cstoredi4" + [(set (match_operand:SI 0 "s_register_operand" "") + (match_operator:SI 1 "arm_comparison_operator" + [(match_operand:DI 2 "cmpdi_operand" "") + (match_operand:DI 3 "cmpdi_operand" "")]))] + "TARGET_32BIT" + "{ + rtx swap = NULL_RTX; + enum rtx_code code = GET_CODE (operands[1]); + + /* We should not have two constants. */ + gcc_assert (GET_MODE (operands[2]) == DImode + || GET_MODE (operands[3]) == DImode); + + /* Flip unimplemented DImode comparisons to a form that + arm_gen_compare_reg can handle. */ + switch (code) + { + case GT: + swap = gen_rtx_LT (VOIDmode, operands[3], operands[2]); break; + case LE: + swap = gen_rtx_GE (VOIDmode, operands[3], operands[2]); break; + case GTU: + swap = gen_rtx_LTU (VOIDmode, operands[3], operands[2]); break; + case LEU: + swap = gen_rtx_GEU (VOIDmode, operands[3], operands[2]); break; + default: + break; + } + if (swap) + emit_insn (gen_cstore_cc (operands[0], swap, operands[3], + operands[2])); + else + emit_insn (gen_cstore_cc (operands[0], operands[1], operands[2], + operands[3])); + DONE; + }" +) + +(define_expand "cstoresi_eq0_thumb1" + [(parallel + [(set (match_operand:SI 0 "s_register_operand" "") + (eq:SI (match_operand:SI 1 "s_register_operand" "") + (const_int 0))) + (clobber (match_dup:SI 2))])] + "TARGET_THUMB1" + "operands[2] = gen_reg_rtx (SImode);" +) + +(define_expand "cstoresi_ne0_thumb1" + [(parallel + [(set (match_operand:SI 0 "s_register_operand" "") + (ne:SI (match_operand:SI 1 "s_register_operand" "") + (const_int 0))) + (clobber (match_dup:SI 2))])] + "TARGET_THUMB1" + "operands[2] = gen_reg_rtx (SImode);" +) + +(define_insn "*cstoresi_eq0_thumb1_insn" + [(set (match_operand:SI 0 "s_register_operand" "=&l,l") + (eq:SI (match_operand:SI 1 "s_register_operand" "l,0") + (const_int 0))) + (clobber (match_operand:SI 2 "s_register_operand" "=X,l"))] + "TARGET_THUMB1" + "@ + neg\\t%0, %1\;adc\\t%0, %0, %1 + neg\\t%2, %1\;adc\\t%0, %1, %2" + [(set_attr "length" "4")] +) + +(define_insn "*cstoresi_ne0_thumb1_insn" + [(set (match_operand:SI 0 "s_register_operand" "=l") + (ne:SI (match_operand:SI 1 "s_register_operand" "0") + (const_int 0))) + (clobber (match_operand:SI 2 "s_register_operand" "=l"))] + "TARGET_THUMB1" + "sub\\t%2, %1, #1\;sbc\\t%0, %1, %2" + [(set_attr "length" "4")] +) + +;; Used as part of the expansion of thumb ltu and gtu sequences +(define_insn "cstoresi_nltu_thumb1" + [(set (match_operand:SI 0 "s_register_operand" "=l,l") + (neg:SI (ltu:SI (match_operand:SI 1 "s_register_operand" "l,*h") + (match_operand:SI 2 "thumb1_cmp_operand" "lI*h,*r"))))] + "TARGET_THUMB1" + "cmp\\t%1, %2\;sbc\\t%0, %0, %0" + [(set_attr "length" "4")] +) + +(define_insn_and_split "cstoresi_ltu_thumb1" + [(set (match_operand:SI 0 "s_register_operand" "=l,l") + (ltu:SI (match_operand:SI 1 "s_register_operand" "l,*h") + (match_operand:SI 2 "thumb1_cmp_operand" "lI*h,*r")))] + "TARGET_THUMB1" + "#" + "TARGET_THUMB1" + [(set (match_dup 3) + (neg:SI (ltu:SI (match_dup 1) (match_dup 2)))) + (set (match_dup 0) (neg:SI (match_dup 3)))] + "operands[3] = gen_reg_rtx (SImode);" + [(set_attr "length" "4")] +) + +;; Used as part of the expansion of thumb les sequence. +(define_insn "thumb1_addsi3_addgeu" + [(set (match_operand:SI 0 "s_register_operand" "=l") + (plus:SI (plus:SI (match_operand:SI 1 "s_register_operand" "%0") + (match_operand:SI 2 "s_register_operand" "l")) + (geu:SI (match_operand:SI 3 "s_register_operand" "l") + (match_operand:SI 4 "thumb1_cmp_operand" "lI"))))] + "TARGET_THUMB1" + "cmp\\t%3, %4\;adc\\t%0, %1, %2" + [(set_attr "length" "4")] +) + + +;; Conditional move insns + +(define_expand "movsicc" + [(set (match_operand:SI 0 "s_register_operand" "") + (if_then_else:SI (match_operand 1 "arm_comparison_operator" "") + (match_operand:SI 2 "arm_not_operand" "") + (match_operand:SI 3 "arm_not_operand" "")))] + "TARGET_32BIT" + " + { + enum rtx_code code = GET_CODE (operands[1]); + rtx ccreg; + + if (code == UNEQ || code == LTGT) + FAIL; + + ccreg = arm_gen_compare_reg (code, XEXP (operands[1], 0), + XEXP (operands[1], 1)); + operands[1] = gen_rtx_fmt_ee (code, VOIDmode, ccreg, const0_rtx); + }" +) + +(define_expand "movsfcc" + [(set (match_operand:SF 0 "s_register_operand" "") + (if_then_else:SF (match_operand 1 "arm_comparison_operator" "") + (match_operand:SF 2 "s_register_operand" "") + (match_operand:SF 3 "nonmemory_operand" "")))] + "TARGET_32BIT && TARGET_HARD_FLOAT" + " + { + enum rtx_code code = GET_CODE (operands[1]); + rtx ccreg; + + if (code == UNEQ || code == LTGT) + FAIL; + + /* When compiling for SOFT_FLOAT, ensure both arms are in registers. + Otherwise, ensure it is a valid FP add operand */ + if ((!(TARGET_HARD_FLOAT && TARGET_FPA)) + || (!arm_float_add_operand (operands[3], SFmode))) + operands[3] = force_reg (SFmode, operands[3]); + + ccreg = arm_gen_compare_reg (code, XEXP (operands[1], 0), + XEXP (operands[1], 1)); + operands[1] = gen_rtx_fmt_ee (code, VOIDmode, ccreg, const0_rtx); + }" +) + +(define_expand "movdfcc" + [(set (match_operand:DF 0 "s_register_operand" "") + (if_then_else:DF (match_operand 1 "arm_comparison_operator" "") + (match_operand:DF 2 "s_register_operand" "") + (match_operand:DF 3 "arm_float_add_operand" "")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && (TARGET_FPA || TARGET_VFP_DOUBLE)" + " + { + enum rtx_code code = GET_CODE (operands[1]); + rtx ccreg; + + if (code == UNEQ || code == LTGT) + FAIL; + + ccreg = arm_gen_compare_reg (code, XEXP (operands[1], 0), + XEXP (operands[1], 1)); + operands[1] = gen_rtx_fmt_ee (code, VOIDmode, ccreg, const0_rtx); + }" +) + +(define_insn "*movsicc_insn" + [(set (match_operand:SI 0 "s_register_operand" "=r,r,r,r,r,r,r,r") + (if_then_else:SI + (match_operator 3 "arm_comparison_operator" + [(match_operand 4 "cc_register" "") (const_int 0)]) + (match_operand:SI 1 "arm_not_operand" "0,0,rI,K,rI,rI,K,K") + (match_operand:SI 2 "arm_not_operand" "rI,K,0,0,rI,K,rI,K")))] + "TARGET_ARM" + "@ + mov%D3\\t%0, %2 + mvn%D3\\t%0, #%B2 + mov%d3\\t%0, %1 + mvn%d3\\t%0, #%B1 + mov%d3\\t%0, %1\;mov%D3\\t%0, %2 + mov%d3\\t%0, %1\;mvn%D3\\t%0, #%B2 + mvn%d3\\t%0, #%B1\;mov%D3\\t%0, %2 + mvn%d3\\t%0, #%B1\;mvn%D3\\t%0, #%B2" + [(set_attr "length" "4,4,4,4,8,8,8,8") + (set_attr "conds" "use") + (set_attr "insn" "mov,mvn,mov,mvn,mov,mov,mvn,mvn")] +) + +(define_insn "*movsfcc_soft_insn" + [(set (match_operand:SF 0 "s_register_operand" "=r,r") + (if_then_else:SF (match_operator 3 "arm_comparison_operator" + [(match_operand 4 "cc_register" "") (const_int 0)]) + (match_operand:SF 1 "s_register_operand" "0,r") + (match_operand:SF 2 "s_register_operand" "r,0")))] + "TARGET_ARM && TARGET_SOFT_FLOAT" + "@ + mov%D3\\t%0, %2 + mov%d3\\t%0, %1" + [(set_attr "conds" "use") + (set_attr "insn" "mov")] +) + + +;; Jump and linkage insns + +(define_expand "jump" + [(set (pc) + (label_ref (match_operand 0 "" "")))] + "TARGET_EITHER" + "" +) + +(define_insn "*arm_jump" + [(set (pc) + (label_ref (match_operand 0 "" "")))] + "TARGET_32BIT" + "* + { + if (arm_ccfsm_state == 1 || arm_ccfsm_state == 2) + { + arm_ccfsm_state += 2; + return \"\"; + } + return \"b%?\\t%l0\"; + } + " + [(set_attr "predicable" "yes")] +) + +(define_insn "*thumb_jump" + [(set (pc) + (label_ref (match_operand 0 "" "")))] + "TARGET_THUMB1" + "* + if (get_attr_length (insn) == 2) + return \"b\\t%l0\"; + return \"bl\\t%l0\\t%@ far jump\"; + " + [(set (attr "far_jump") + (if_then_else + (eq_attr "length" "4") + (const_string "yes") + (const_string "no"))) + (set (attr "length") + (if_then_else + (and (ge (minus (match_dup 0) (pc)) (const_int -2044)) + (le (minus (match_dup 0) (pc)) (const_int 2048))) + (const_int 2) + (const_int 4)))] +) + +(define_expand "call" + [(parallel [(call (match_operand 0 "memory_operand" "") + (match_operand 1 "general_operand" "")) + (use (match_operand 2 "" "")) + (clobber (reg:SI LR_REGNUM))])] + "TARGET_EITHER" + " + { + rtx callee, pat; + + /* In an untyped call, we can get NULL for operand 2. */ + if (operands[2] == NULL_RTX) + operands[2] = const0_rtx; + + /* Decide if we should generate indirect calls by loading the + 32-bit address of the callee into a register before performing the + branch and link. */ + callee = XEXP (operands[0], 0); + if (GET_CODE (callee) == SYMBOL_REF + ? arm_is_long_call_p (SYMBOL_REF_DECL (callee)) + : !REG_P (callee)) + XEXP (operands[0], 0) = force_reg (Pmode, callee); + + pat = gen_call_internal (operands[0], operands[1], operands[2]); + arm_emit_call_insn (pat, XEXP (operands[0], 0)); + DONE; + }" +) + +(define_expand "call_internal" + [(parallel [(call (match_operand 0 "memory_operand" "") + (match_operand 1 "general_operand" "")) + (use (match_operand 2 "" "")) + (clobber (reg:SI LR_REGNUM))])]) + +(define_insn "*call_reg_armv5" + [(call (mem:SI (match_operand:SI 0 "s_register_operand" "r")) + (match_operand 1 "" "")) + (use (match_operand 2 "" "")) + (clobber (reg:SI LR_REGNUM))] + "TARGET_ARM && arm_arch5" + "blx%?\\t%0" + [(set_attr "type" "call")] +) + +(define_insn "*call_reg_arm" + [(call (mem:SI (match_operand:SI 0 "s_register_operand" "r")) + (match_operand 1 "" "")) + (use (match_operand 2 "" "")) + (clobber (reg:SI LR_REGNUM))] + "TARGET_ARM && !arm_arch5" + "* + return output_call (operands); + " + ;; length is worst case, normally it is only two + [(set_attr "length" "12") + (set_attr "type" "call")] +) + + +;; Note: not used for armv5+ because the sequence used (ldr pc, ...) is not +;; considered a function call by the branch predictor of some cores (PR40887). +;; Falls back to blx rN (*call_reg_armv5). + +(define_insn "*call_mem" + [(call (mem:SI (match_operand:SI 0 "call_memory_operand" "m")) + (match_operand 1 "" "")) + (use (match_operand 2 "" "")) + (clobber (reg:SI LR_REGNUM))] + "TARGET_ARM && !arm_arch5" + "* + return output_call_mem (operands); + " + [(set_attr "length" "12") + (set_attr "type" "call")] +) + +(define_insn "*call_reg_thumb1_v5" + [(call (mem:SI (match_operand:SI 0 "register_operand" "l*r")) + (match_operand 1 "" "")) + (use (match_operand 2 "" "")) + (clobber (reg:SI LR_REGNUM))] + "TARGET_THUMB1 && arm_arch5" + "blx\\t%0" + [(set_attr "length" "2") + (set_attr "type" "call")] +) + +(define_insn "*call_reg_thumb1" + [(call (mem:SI (match_operand:SI 0 "register_operand" "l*r")) + (match_operand 1 "" "")) + (use (match_operand 2 "" "")) + (clobber (reg:SI LR_REGNUM))] + "TARGET_THUMB1 && !arm_arch5" + "* + { + if (!TARGET_CALLER_INTERWORKING) + return thumb_call_via_reg (operands[0]); + else if (operands[1] == const0_rtx) + return \"bl\\t%__interwork_call_via_%0\"; + else if (frame_pointer_needed) + return \"bl\\t%__interwork_r7_call_via_%0\"; + else + return \"bl\\t%__interwork_r11_call_via_%0\"; + }" + [(set_attr "type" "call")] +) + +(define_expand "call_value" + [(parallel [(set (match_operand 0 "" "") + (call (match_operand 1 "memory_operand" "") + (match_operand 2 "general_operand" ""))) + (use (match_operand 3 "" "")) + (clobber (reg:SI LR_REGNUM))])] + "TARGET_EITHER" + " + { + rtx pat, callee; + + /* In an untyped call, we can get NULL for operand 2. */ + if (operands[3] == 0) + operands[3] = const0_rtx; + + /* Decide if we should generate indirect calls by loading the + 32-bit address of the callee into a register before performing the + branch and link. */ + callee = XEXP (operands[1], 0); + if (GET_CODE (callee) == SYMBOL_REF + ? arm_is_long_call_p (SYMBOL_REF_DECL (callee)) + : !REG_P (callee)) + XEXP (operands[1], 0) = force_reg (Pmode, callee); + + pat = gen_call_value_internal (operands[0], operands[1], + operands[2], operands[3]); + arm_emit_call_insn (pat, XEXP (operands[1], 0)); + DONE; + }" +) + +(define_expand "call_value_internal" + [(parallel [(set (match_operand 0 "" "") + (call (match_operand 1 "memory_operand" "") + (match_operand 2 "general_operand" ""))) + (use (match_operand 3 "" "")) + (clobber (reg:SI LR_REGNUM))])]) + +(define_insn "*call_value_reg_armv5" + [(set (match_operand 0 "" "") + (call (mem:SI (match_operand:SI 1 "s_register_operand" "r")) + (match_operand 2 "" ""))) + (use (match_operand 3 "" "")) + (clobber (reg:SI LR_REGNUM))] + "TARGET_ARM && arm_arch5" + "blx%?\\t%1" + [(set_attr "type" "call")] +) + +(define_insn "*call_value_reg_arm" + [(set (match_operand 0 "" "") + (call (mem:SI (match_operand:SI 1 "s_register_operand" "r")) + (match_operand 2 "" ""))) + (use (match_operand 3 "" "")) + (clobber (reg:SI LR_REGNUM))] + "TARGET_ARM && !arm_arch5" + "* + return output_call (&operands[1]); + " + [(set_attr "length" "12") + (set_attr "type" "call")] +) + +;; Note: see *call_mem + +(define_insn "*call_value_mem" + [(set (match_operand 0 "" "") + (call (mem:SI (match_operand:SI 1 "call_memory_operand" "m")) + (match_operand 2 "" ""))) + (use (match_operand 3 "" "")) + (clobber (reg:SI LR_REGNUM))] + "TARGET_ARM && !arm_arch5 && (!CONSTANT_ADDRESS_P (XEXP (operands[1], 0)))" + "* + return output_call_mem (&operands[1]); + " + [(set_attr "length" "12") + (set_attr "type" "call")] +) + +(define_insn "*call_value_reg_thumb1_v5" + [(set (match_operand 0 "" "") + (call (mem:SI (match_operand:SI 1 "register_operand" "l*r")) + (match_operand 2 "" ""))) + (use (match_operand 3 "" "")) + (clobber (reg:SI LR_REGNUM))] + "TARGET_THUMB1 && arm_arch5" + "blx\\t%1" + [(set_attr "length" "2") + (set_attr "type" "call")] +) + +(define_insn "*call_value_reg_thumb1" + [(set (match_operand 0 "" "") + (call (mem:SI (match_operand:SI 1 "register_operand" "l*r")) + (match_operand 2 "" ""))) + (use (match_operand 3 "" "")) + (clobber (reg:SI LR_REGNUM))] + "TARGET_THUMB1 && !arm_arch5" + "* + { + if (!TARGET_CALLER_INTERWORKING) + return thumb_call_via_reg (operands[1]); + else if (operands[2] == const0_rtx) + return \"bl\\t%__interwork_call_via_%1\"; + else if (frame_pointer_needed) + return \"bl\\t%__interwork_r7_call_via_%1\"; + else + return \"bl\\t%__interwork_r11_call_via_%1\"; + }" + [(set_attr "type" "call")] +) + +;; Allow calls to SYMBOL_REFs specially as they are not valid general addresses +;; The 'a' causes the operand to be treated as an address, i.e. no '#' output. + +(define_insn "*call_symbol" + [(call (mem:SI (match_operand:SI 0 "" "")) + (match_operand 1 "" "")) + (use (match_operand 2 "" "")) + (clobber (reg:SI LR_REGNUM))] + "TARGET_32BIT + && (GET_CODE (operands[0]) == SYMBOL_REF) + && !arm_is_long_call_p (SYMBOL_REF_DECL (operands[0]))" + "* + { + return NEED_PLT_RELOC ? \"bl%?\\t%a0(PLT)\" : \"bl%?\\t%a0\"; + }" + [(set_attr "type" "call")] +) + +(define_insn "*call_value_symbol" + [(set (match_operand 0 "" "") + (call (mem:SI (match_operand:SI 1 "" "")) + (match_operand:SI 2 "" ""))) + (use (match_operand 3 "" "")) + (clobber (reg:SI LR_REGNUM))] + "TARGET_32BIT + && (GET_CODE (operands[1]) == SYMBOL_REF) + && !arm_is_long_call_p (SYMBOL_REF_DECL (operands[1]))" + "* + { + return NEED_PLT_RELOC ? \"bl%?\\t%a1(PLT)\" : \"bl%?\\t%a1\"; + }" + [(set_attr "type" "call")] +) + +(define_insn "*call_insn" + [(call (mem:SI (match_operand:SI 0 "" "")) + (match_operand:SI 1 "" "")) + (use (match_operand 2 "" "")) + (clobber (reg:SI LR_REGNUM))] + "TARGET_THUMB1 + && GET_CODE (operands[0]) == SYMBOL_REF + && !arm_is_long_call_p (SYMBOL_REF_DECL (operands[0]))" + "bl\\t%a0" + [(set_attr "length" "4") + (set_attr "type" "call")] +) + +(define_insn "*call_value_insn" + [(set (match_operand 0 "" "") + (call (mem:SI (match_operand 1 "" "")) + (match_operand 2 "" ""))) + (use (match_operand 3 "" "")) + (clobber (reg:SI LR_REGNUM))] + "TARGET_THUMB1 + && GET_CODE (operands[1]) == SYMBOL_REF + && !arm_is_long_call_p (SYMBOL_REF_DECL (operands[1]))" + "bl\\t%a1" + [(set_attr "length" "4") + (set_attr "type" "call")] +) + +;; We may also be able to do sibcalls for Thumb, but it's much harder... +(define_expand "sibcall" + [(parallel [(call (match_operand 0 "memory_operand" "") + (match_operand 1 "general_operand" "")) + (return) + (use (match_operand 2 "" ""))])] + "TARGET_32BIT" + " + { + if (operands[2] == NULL_RTX) + operands[2] = const0_rtx; + }" +) + +(define_expand "sibcall_value" + [(parallel [(set (match_operand 0 "" "") + (call (match_operand 1 "memory_operand" "") + (match_operand 2 "general_operand" ""))) + (return) + (use (match_operand 3 "" ""))])] + "TARGET_32BIT" + " + { + if (operands[3] == NULL_RTX) + operands[3] = const0_rtx; + }" +) + +(define_insn "*sibcall_insn" + [(call (mem:SI (match_operand:SI 0 "" "X")) + (match_operand 1 "" "")) + (return) + (use (match_operand 2 "" ""))] + "TARGET_32BIT && GET_CODE (operands[0]) == SYMBOL_REF" + "* + return NEED_PLT_RELOC ? \"b%?\\t%a0(PLT)\" : \"b%?\\t%a0\"; + " + [(set_attr "type" "call")] +) + +(define_insn "*sibcall_value_insn" + [(set (match_operand 0 "" "") + (call (mem:SI (match_operand:SI 1 "" "X")) + (match_operand 2 "" ""))) + (return) + (use (match_operand 3 "" ""))] + "TARGET_32BIT && GET_CODE (operands[1]) == SYMBOL_REF" + "* + return NEED_PLT_RELOC ? \"b%?\\t%a1(PLT)\" : \"b%?\\t%a1\"; + " + [(set_attr "type" "call")] +) + +(define_expand "return" + [(return)] + "TARGET_32BIT && USE_RETURN_INSN (FALSE)" + "") + +;; Often the return insn will be the same as loading from memory, so set attr +(define_insn "*arm_return" + [(return)] + "TARGET_ARM && USE_RETURN_INSN (FALSE)" + "* + { + if (arm_ccfsm_state == 2) + { + arm_ccfsm_state += 2; + return \"\"; + } + return output_return_instruction (const_true_rtx, TRUE, FALSE); + }" + [(set_attr "type" "load1") + (set_attr "length" "12") + (set_attr "predicable" "yes")] +) + +(define_insn "*cond_return" + [(set (pc) + (if_then_else (match_operator 0 "arm_comparison_operator" + [(match_operand 1 "cc_register" "") (const_int 0)]) + (return) + (pc)))] + "TARGET_ARM && USE_RETURN_INSN (TRUE)" + "* + { + if (arm_ccfsm_state == 2) + { + arm_ccfsm_state += 2; + return \"\"; + } + return output_return_instruction (operands[0], TRUE, FALSE); + }" + [(set_attr "conds" "use") + (set_attr "length" "12") + (set_attr "type" "load1")] +) + +(define_insn "*cond_return_inverted" + [(set (pc) + (if_then_else (match_operator 0 "arm_comparison_operator" + [(match_operand 1 "cc_register" "") (const_int 0)]) + (pc) + (return)))] + "TARGET_ARM && USE_RETURN_INSN (TRUE)" + "* + { + if (arm_ccfsm_state == 2) + { + arm_ccfsm_state += 2; + return \"\"; + } + return output_return_instruction (operands[0], TRUE, TRUE); + }" + [(set_attr "conds" "use") + (set_attr "length" "12") + (set_attr "type" "load1")] +) + +;; Generate a sequence of instructions to determine if the processor is +;; in 26-bit or 32-bit mode, and return the appropriate return address +;; mask. + +(define_expand "return_addr_mask" + [(set (match_dup 1) + (compare:CC_NOOV (unspec [(const_int 0)] UNSPEC_CHECK_ARCH) + (const_int 0))) + (set (match_operand:SI 0 "s_register_operand" "") + (if_then_else:SI (eq (match_dup 1) (const_int 0)) + (const_int -1) + (const_int 67108860)))] ; 0x03fffffc + "TARGET_ARM" + " + operands[1] = gen_rtx_REG (CC_NOOVmode, CC_REGNUM); + ") + +(define_insn "*check_arch2" + [(set (match_operand:CC_NOOV 0 "cc_register" "") + (compare:CC_NOOV (unspec [(const_int 0)] UNSPEC_CHECK_ARCH) + (const_int 0)))] + "TARGET_ARM" + "teq\\t%|r0, %|r0\;teq\\t%|pc, %|pc" + [(set_attr "length" "8") + (set_attr "conds" "set")] +) + +;; Call subroutine returning any type. + +(define_expand "untyped_call" + [(parallel [(call (match_operand 0 "" "") + (const_int 0)) + (match_operand 1 "" "") + (match_operand 2 "" "")])] + "TARGET_EITHER" + " + { + int i; + rtx par = gen_rtx_PARALLEL (VOIDmode, + rtvec_alloc (XVECLEN (operands[2], 0))); + rtx addr = gen_reg_rtx (Pmode); + rtx mem; + int size = 0; + + emit_move_insn (addr, XEXP (operands[1], 0)); + mem = change_address (operands[1], BLKmode, addr); + + for (i = 0; i < XVECLEN (operands[2], 0); i++) + { + rtx src = SET_SRC (XVECEXP (operands[2], 0, i)); + + /* Default code only uses r0 as a return value, but we could + be using anything up to 4 registers. */ + if (REGNO (src) == R0_REGNUM) + src = gen_rtx_REG (TImode, R0_REGNUM); + + XVECEXP (par, 0, i) = gen_rtx_EXPR_LIST (VOIDmode, src, + GEN_INT (size)); + size += GET_MODE_SIZE (GET_MODE (src)); + } + + emit_call_insn (GEN_CALL_VALUE (par, operands[0], const0_rtx, NULL, + const0_rtx)); + + size = 0; + + for (i = 0; i < XVECLEN (par, 0); i++) + { + HOST_WIDE_INT offset = 0; + rtx reg = XEXP (XVECEXP (par, 0, i), 0); + + if (size != 0) + emit_move_insn (addr, plus_constant (addr, size)); + + mem = change_address (mem, GET_MODE (reg), NULL); + if (REGNO (reg) == R0_REGNUM) + { + /* On thumb we have to use a write-back instruction. */ + emit_insn (arm_gen_store_multiple (arm_regs_in_sequence, 4, addr, + TARGET_THUMB ? TRUE : FALSE, mem, &offset)); + size = TARGET_ARM ? 16 : 0; + } + else + { + emit_move_insn (mem, reg); + size = GET_MODE_SIZE (GET_MODE (reg)); + } + } + + /* The optimizer does not know that the call sets the function value + registers we stored in the result block. We avoid problems by + claiming that all hard registers are used and clobbered at this + point. */ + emit_insn (gen_blockage ()); + + DONE; + }" +) + +(define_expand "untyped_return" + [(match_operand:BLK 0 "memory_operand" "") + (match_operand 1 "" "")] + "TARGET_EITHER" + " + { + int i; + rtx addr = gen_reg_rtx (Pmode); + rtx mem; + int size = 0; + + emit_move_insn (addr, XEXP (operands[0], 0)); + mem = change_address (operands[0], BLKmode, addr); + + for (i = 0; i < XVECLEN (operands[1], 0); i++) + { + HOST_WIDE_INT offset = 0; + rtx reg = SET_DEST (XVECEXP (operands[1], 0, i)); + + if (size != 0) + emit_move_insn (addr, plus_constant (addr, size)); + + mem = change_address (mem, GET_MODE (reg), NULL); + if (REGNO (reg) == R0_REGNUM) + { + /* On thumb we have to use a write-back instruction. */ + emit_insn (arm_gen_load_multiple (arm_regs_in_sequence, 4, addr, + TARGET_THUMB ? TRUE : FALSE, mem, &offset)); + size = TARGET_ARM ? 16 : 0; + } + else + { + emit_move_insn (reg, mem); + size = GET_MODE_SIZE (GET_MODE (reg)); + } + } + + /* Emit USE insns before the return. */ + for (i = 0; i < XVECLEN (operands[1], 0); i++) + emit_use (SET_DEST (XVECEXP (operands[1], 0, i))); + + /* Construct the return. */ + expand_naked_return (); + + DONE; + }" +) + +;; UNSPEC_VOLATILE is considered to use and clobber all hard registers and +;; all of memory. This blocks insns from being moved across this point. + +(define_insn "blockage" + [(unspec_volatile [(const_int 0)] VUNSPEC_BLOCKAGE)] + "TARGET_EITHER" + "" + [(set_attr "length" "0") + (set_attr "type" "block")] +) + +(define_expand "casesi" + [(match_operand:SI 0 "s_register_operand" "") ; index to jump on + (match_operand:SI 1 "const_int_operand" "") ; lower bound + (match_operand:SI 2 "const_int_operand" "") ; total range + (match_operand:SI 3 "" "") ; table label + (match_operand:SI 4 "" "")] ; Out of range label + "TARGET_32BIT || optimize_size || flag_pic" + " + { + enum insn_code code; + if (operands[1] != const0_rtx) + { + rtx reg = gen_reg_rtx (SImode); + + emit_insn (gen_addsi3 (reg, operands[0], + GEN_INT (-INTVAL (operands[1])))); + operands[0] = reg; + } + + if (TARGET_ARM) + code = CODE_FOR_arm_casesi_internal; + else if (TARGET_THUMB1) + code = CODE_FOR_thumb1_casesi_internal_pic; + else if (flag_pic) + code = CODE_FOR_thumb2_casesi_internal_pic; + else + code = CODE_FOR_thumb2_casesi_internal; + + if (!insn_data[(int) code].operand[1].predicate(operands[2], SImode)) + operands[2] = force_reg (SImode, operands[2]); + + emit_jump_insn (GEN_FCN ((int) code) (operands[0], operands[2], + operands[3], operands[4])); + DONE; + }" +) + +;; The USE in this pattern is needed to tell flow analysis that this is +;; a CASESI insn. It has no other purpose. +(define_insn "arm_casesi_internal" + [(parallel [(set (pc) + (if_then_else + (leu (match_operand:SI 0 "s_register_operand" "r") + (match_operand:SI 1 "arm_rhs_operand" "rI")) + (mem:SI (plus:SI (mult:SI (match_dup 0) (const_int 4)) + (label_ref (match_operand 2 "" "")))) + (label_ref (match_operand 3 "" "")))) + (clobber (reg:CC CC_REGNUM)) + (use (label_ref (match_dup 2)))])] + "TARGET_ARM" + "* + if (flag_pic) + return \"cmp\\t%0, %1\;addls\\t%|pc, %|pc, %0, asl #2\;b\\t%l3\"; + return \"cmp\\t%0, %1\;ldrls\\t%|pc, [%|pc, %0, asl #2]\;b\\t%l3\"; + " + [(set_attr "conds" "clob") + (set_attr "length" "12")] +) + +(define_expand "thumb1_casesi_internal_pic" + [(match_operand:SI 0 "s_register_operand" "") + (match_operand:SI 1 "thumb1_cmp_operand" "") + (match_operand 2 "" "") + (match_operand 3 "" "")] + "TARGET_THUMB1" + { + rtx reg0; + rtx test = gen_rtx_GTU (VOIDmode, operands[0], operands[1]); + emit_jump_insn (gen_cbranchsi4 (test, operands[0], operands[1], + operands[3])); + reg0 = gen_rtx_REG (SImode, 0); + emit_move_insn (reg0, operands[0]); + emit_jump_insn (gen_thumb1_casesi_dispatch (operands[2]/*, operands[3]*/)); + DONE; + } +) + +(define_insn "thumb1_casesi_dispatch" + [(parallel [(set (pc) (unspec [(reg:SI 0) + (label_ref (match_operand 0 "" "")) +;; (label_ref (match_operand 1 "" "")) +] + UNSPEC_THUMB1_CASESI)) + (clobber (reg:SI IP_REGNUM)) + (clobber (reg:SI LR_REGNUM))])] + "TARGET_THUMB1" + "* return thumb1_output_casesi(operands);" + [(set_attr "length" "4")] +) + +(define_expand "indirect_jump" + [(set (pc) + (match_operand:SI 0 "s_register_operand" ""))] + "TARGET_EITHER" + " + /* Thumb-2 doesn't have mov pc, reg. Explicitly set the low bit of the + address and use bx. */ + if (TARGET_THUMB2) + { + rtx tmp; + tmp = gen_reg_rtx (SImode); + emit_insn (gen_iorsi3 (tmp, operands[0], GEN_INT(1))); + operands[0] = tmp; + } + " +) + +;; NB Never uses BX. +(define_insn "*arm_indirect_jump" + [(set (pc) + (match_operand:SI 0 "s_register_operand" "r"))] + "TARGET_ARM" + "mov%?\\t%|pc, %0\\t%@ indirect register jump" + [(set_attr "predicable" "yes")] +) + +(define_insn "*load_indirect_jump" + [(set (pc) + (match_operand:SI 0 "memory_operand" "m"))] + "TARGET_ARM" + "ldr%?\\t%|pc, %0\\t%@ indirect memory jump" + [(set_attr "type" "load1") + (set_attr "pool_range" "4096") + (set_attr "neg_pool_range" "4084") + (set_attr "predicable" "yes")] +) + +;; NB Never uses BX. +(define_insn "*thumb1_indirect_jump" + [(set (pc) + (match_operand:SI 0 "register_operand" "l*r"))] + "TARGET_THUMB1" + "mov\\tpc, %0" + [(set_attr "conds" "clob") + (set_attr "length" "2")] +) + + +;; Misc insns + +(define_insn "nop" + [(const_int 0)] + "TARGET_EITHER" + "* + if (TARGET_UNIFIED_ASM) + return \"nop\"; + if (TARGET_ARM) + return \"mov%?\\t%|r0, %|r0\\t%@ nop\"; + return \"mov\\tr8, r8\"; + " + [(set (attr "length") + (if_then_else (eq_attr "is_thumb" "yes") + (const_int 2) + (const_int 4)))] +) + + +;; Patterns to allow combination of arithmetic, cond code and shifts + +(define_insn "*arith_shiftsi" + [(set (match_operand:SI 0 "s_register_operand" "=r,r,r,r") + (match_operator:SI 1 "shiftable_operator" + [(match_operator:SI 3 "shift_operator" + [(match_operand:SI 4 "s_register_operand" "r,r,r,r") + (match_operand:SI 5 "shift_amount_operand" "M,M,M,r")]) + (match_operand:SI 2 "s_register_operand" "rk,rk,r,rk")]))] + "TARGET_32BIT" + "%i1%?\\t%0, %2, %4%S3" + [(set_attr "predicable" "yes") + (set_attr "shift" "4") + (set_attr "arch" "a,t2,t2,a") + ;; Thumb2 doesn't allow the stack pointer to be used for + ;; operand1 for all operations other than add and sub. In this case + ;; the minus operation is a candidate for an rsub and hence needs + ;; to be disabled. + ;; We have to make sure to disable the fourth alternative if + ;; the shift_operator is MULT, since otherwise the insn will + ;; also match a multiply_accumulate pattern and validate_change + ;; will allow a replacement of the constant with a register + ;; despite the checks done in shift_operator. + (set_attr_alternative "insn_enabled" + [(const_string "yes") + (if_then_else + (match_operand:SI 1 "add_operator" "") + (const_string "yes") (const_string "no")) + (const_string "yes") + (if_then_else + (match_operand:SI 3 "mult_operator" "") + (const_string "no") (const_string "yes"))]) + (set_attr "type" "alu_shift,alu_shift,alu_shift,alu_shift_reg")]) + +(define_split + [(set (match_operand:SI 0 "s_register_operand" "") + (match_operator:SI 1 "shiftable_operator" + [(match_operator:SI 2 "shiftable_operator" + [(match_operator:SI 3 "shift_operator" + [(match_operand:SI 4 "s_register_operand" "") + (match_operand:SI 5 "reg_or_int_operand" "")]) + (match_operand:SI 6 "s_register_operand" "")]) + (match_operand:SI 7 "arm_rhs_operand" "")])) + (clobber (match_operand:SI 8 "s_register_operand" ""))] + "TARGET_32BIT" + [(set (match_dup 8) + (match_op_dup 2 [(match_op_dup 3 [(match_dup 4) (match_dup 5)]) + (match_dup 6)])) + (set (match_dup 0) + (match_op_dup 1 [(match_dup 8) (match_dup 7)]))] + "") + +(define_insn "*arith_shiftsi_compare0" + [(set (reg:CC_NOOV CC_REGNUM) + (compare:CC_NOOV + (match_operator:SI 1 "shiftable_operator" + [(match_operator:SI 3 "shift_operator" + [(match_operand:SI 4 "s_register_operand" "r,r") + (match_operand:SI 5 "shift_amount_operand" "M,r")]) + (match_operand:SI 2 "s_register_operand" "r,r")]) + (const_int 0))) + (set (match_operand:SI 0 "s_register_operand" "=r,r") + (match_op_dup 1 [(match_op_dup 3 [(match_dup 4) (match_dup 5)]) + (match_dup 2)]))] + "TARGET_32BIT" + "%i1%.\\t%0, %2, %4%S3" + [(set_attr "conds" "set") + (set_attr "shift" "4") + (set_attr "arch" "32,a") + (set_attr "type" "alu_shift,alu_shift_reg")]) + +(define_insn "*arith_shiftsi_compare0_scratch" + [(set (reg:CC_NOOV CC_REGNUM) + (compare:CC_NOOV + (match_operator:SI 1 "shiftable_operator" + [(match_operator:SI 3 "shift_operator" + [(match_operand:SI 4 "s_register_operand" "r,r") + (match_operand:SI 5 "shift_amount_operand" "M,r")]) + (match_operand:SI 2 "s_register_operand" "r,r")]) + (const_int 0))) + (clobber (match_scratch:SI 0 "=r,r"))] + "TARGET_32BIT" + "%i1%.\\t%0, %2, %4%S3" + [(set_attr "conds" "set") + (set_attr "shift" "4") + (set_attr "arch" "32,a") + (set_attr "type" "alu_shift,alu_shift_reg")]) + +(define_insn "*sub_shiftsi" + [(set (match_operand:SI 0 "s_register_operand" "=r,r") + (minus:SI (match_operand:SI 1 "s_register_operand" "r,r") + (match_operator:SI 2 "shift_operator" + [(match_operand:SI 3 "s_register_operand" "r,r") + (match_operand:SI 4 "shift_amount_operand" "M,r")])))] + "TARGET_32BIT" + "sub%?\\t%0, %1, %3%S2" + [(set_attr "predicable" "yes") + (set_attr "shift" "3") + (set_attr "arch" "32,a") + (set_attr "type" "alu_shift,alu_shift_reg")]) + +(define_insn "*sub_shiftsi_compare0" + [(set (reg:CC_NOOV CC_REGNUM) + (compare:CC_NOOV + (minus:SI (match_operand:SI 1 "s_register_operand" "r,r") + (match_operator:SI 2 "shift_operator" + [(match_operand:SI 3 "s_register_operand" "r,r") + (match_operand:SI 4 "shift_amount_operand" "M,rM")])) + (const_int 0))) + (set (match_operand:SI 0 "s_register_operand" "=r,r") + (minus:SI (match_dup 1) + (match_op_dup 2 [(match_dup 3) (match_dup 4)])))] + "TARGET_32BIT" + "sub%.\\t%0, %1, %3%S2" + [(set_attr "conds" "set") + (set_attr "shift" "3") + (set_attr "arch" "32,a") + (set_attr "type" "alu_shift,alu_shift_reg")]) + +(define_insn "*sub_shiftsi_compare0_scratch" + [(set (reg:CC_NOOV CC_REGNUM) + (compare:CC_NOOV + (minus:SI (match_operand:SI 1 "s_register_operand" "r,r") + (match_operator:SI 2 "shift_operator" + [(match_operand:SI 3 "s_register_operand" "r,r") + (match_operand:SI 4 "shift_amount_operand" "M,rM")])) + (const_int 0))) + (clobber (match_scratch:SI 0 "=r,r"))] + "TARGET_32BIT" + "sub%.\\t%0, %1, %3%S2" + [(set_attr "conds" "set") + (set_attr "shift" "3") + (set_attr "arch" "32,a") + (set_attr "type" "alu_shift,alu_shift_reg")]) + + +(define_insn "*and_scc" + [(set (match_operand:SI 0 "s_register_operand" "=r") + (and:SI (match_operator:SI 1 "arm_comparison_operator" + [(match_operand 3 "cc_register" "") (const_int 0)]) + (match_operand:SI 2 "s_register_operand" "r")))] + "TARGET_ARM" + "mov%D1\\t%0, #0\;and%d1\\t%0, %2, #1" + [(set_attr "conds" "use") + (set_attr "insn" "mov") + (set_attr "length" "8")] +) + +(define_insn "*ior_scc" + [(set (match_operand:SI 0 "s_register_operand" "=r,r") + (ior:SI (match_operator:SI 2 "arm_comparison_operator" + [(match_operand 3 "cc_register" "") (const_int 0)]) + (match_operand:SI 1 "s_register_operand" "0,?r")))] + "TARGET_ARM" + "@ + orr%d2\\t%0, %1, #1 + mov%D2\\t%0, %1\;orr%d2\\t%0, %1, #1" + [(set_attr "conds" "use") + (set_attr "length" "4,8")] +) + +; A series of splitters for the compare_scc pattern below. Note that +; order is important. +(define_split + [(set (match_operand:SI 0 "s_register_operand" "") + (lt:SI (match_operand:SI 1 "s_register_operand" "") + (const_int 0))) + (clobber (reg:CC CC_REGNUM))] + "TARGET_32BIT && reload_completed" + [(set (match_dup 0) (lshiftrt:SI (match_dup 1) (const_int 31)))]) + +(define_split + [(set (match_operand:SI 0 "s_register_operand" "") + (ge:SI (match_operand:SI 1 "s_register_operand" "") + (const_int 0))) + (clobber (reg:CC CC_REGNUM))] + "TARGET_32BIT && reload_completed" + [(set (match_dup 0) (not:SI (match_dup 1))) + (set (match_dup 0) (lshiftrt:SI (match_dup 0) (const_int 31)))]) + +(define_split + [(set (match_operand:SI 0 "s_register_operand" "") + (eq:SI (match_operand:SI 1 "s_register_operand" "") + (const_int 0))) + (clobber (reg:CC CC_REGNUM))] + "TARGET_32BIT && reload_completed" + [(parallel + [(set (reg:CC CC_REGNUM) + (compare:CC (const_int 1) (match_dup 1))) + (set (match_dup 0) + (minus:SI (const_int 1) (match_dup 1)))]) + (cond_exec (ltu:CC (reg:CC CC_REGNUM) (const_int 0)) + (set (match_dup 0) (const_int 0)))]) + +(define_split + [(set (match_operand:SI 0 "s_register_operand" "") + (ne:SI (match_operand:SI 1 "s_register_operand" "") + (match_operand:SI 2 "const_int_operand" ""))) + (clobber (reg:CC CC_REGNUM))] + "TARGET_32BIT && reload_completed" + [(parallel + [(set (reg:CC CC_REGNUM) + (compare:CC (match_dup 1) (match_dup 2))) + (set (match_dup 0) (plus:SI (match_dup 1) (match_dup 3)))]) + (cond_exec (ne:CC (reg:CC CC_REGNUM) (const_int 0)) + (set (match_dup 0) (const_int 1)))] +{ + operands[3] = GEN_INT (-INTVAL (operands[2])); +}) + +(define_split + [(set (match_operand:SI 0 "s_register_operand" "") + (ne:SI (match_operand:SI 1 "s_register_operand" "") + (match_operand:SI 2 "arm_add_operand" ""))) + (clobber (reg:CC CC_REGNUM))] + "TARGET_32BIT && reload_completed" + [(parallel + [(set (reg:CC_NOOV CC_REGNUM) + (compare:CC_NOOV (minus:SI (match_dup 1) (match_dup 2)) + (const_int 0))) + (set (match_dup 0) (minus:SI (match_dup 1) (match_dup 2)))]) + (cond_exec (ne:CC_NOOV (reg:CC_NOOV CC_REGNUM) (const_int 0)) + (set (match_dup 0) (const_int 1)))]) + +(define_insn_and_split "*compare_scc" + [(set (match_operand:SI 0 "s_register_operand" "=r,r") + (match_operator:SI 1 "arm_comparison_operator" + [(match_operand:SI 2 "s_register_operand" "r,r") + (match_operand:SI 3 "arm_add_operand" "rI,L")])) + (clobber (reg:CC CC_REGNUM))] + "TARGET_32BIT" + "#" + "&& reload_completed" + [(set (reg:CC CC_REGNUM) (compare:CC (match_dup 2) (match_dup 3))) + (cond_exec (match_dup 4) (set (match_dup 0) (const_int 0))) + (cond_exec (match_dup 5) (set (match_dup 0) (const_int 1)))] +{ + rtx tmp1; + enum machine_mode mode = SELECT_CC_MODE (GET_CODE (operands[1]), + operands[2], operands[3]); + enum rtx_code rc = GET_CODE (operands[1]); + + tmp1 = gen_rtx_REG (mode, CC_REGNUM); + + operands[5] = gen_rtx_fmt_ee (rc, VOIDmode, tmp1, const0_rtx); + if (mode == CCFPmode || mode == CCFPEmode) + rc = reverse_condition_maybe_unordered (rc); + else + rc = reverse_condition (rc); + operands[4] = gen_rtx_fmt_ee (rc, VOIDmode, tmp1, const0_rtx); +}) + +;; Attempt to improve the sequence generated by the compare_scc splitters +;; not to use conditional execution. +(define_peephole2 + [(set (reg:CC CC_REGNUM) + (compare:CC (match_operand:SI 1 "register_operand" "") + (match_operand:SI 2 "arm_rhs_operand" ""))) + (cond_exec (ne (reg:CC CC_REGNUM) (const_int 0)) + (set (match_operand:SI 0 "register_operand" "") (const_int 0))) + (cond_exec (eq (reg:CC CC_REGNUM) (const_int 0)) + (set (match_dup 0) (const_int 1))) + (match_scratch:SI 3 "r")] + "TARGET_32BIT" + [(set (match_dup 3) (minus:SI (match_dup 1) (match_dup 2))) + (parallel + [(set (reg:CC CC_REGNUM) + (compare:CC (const_int 0) (match_dup 3))) + (set (match_dup 0) (minus:SI (const_int 0) (match_dup 3)))]) + (set (match_dup 0) + (plus:SI (plus:SI (match_dup 0) (match_dup 3)) + (geu:SI (reg:CC CC_REGNUM) (const_int 0))))]) + +(define_insn "*cond_move" + [(set (match_operand:SI 0 "s_register_operand" "=r,r,r") + (if_then_else:SI (match_operator 3 "equality_operator" + [(match_operator 4 "arm_comparison_operator" + [(match_operand 5 "cc_register" "") (const_int 0)]) + (const_int 0)]) + (match_operand:SI 1 "arm_rhs_operand" "0,rI,?rI") + (match_operand:SI 2 "arm_rhs_operand" "rI,0,rI")))] + "TARGET_ARM" + "* + if (GET_CODE (operands[3]) == NE) + { + if (which_alternative != 1) + output_asm_insn (\"mov%D4\\t%0, %2\", operands); + if (which_alternative != 0) + output_asm_insn (\"mov%d4\\t%0, %1\", operands); + return \"\"; + } + if (which_alternative != 0) + output_asm_insn (\"mov%D4\\t%0, %1\", operands); + if (which_alternative != 1) + output_asm_insn (\"mov%d4\\t%0, %2\", operands); + return \"\"; + " + [(set_attr "conds" "use") + (set_attr "insn" "mov") + (set_attr "length" "4,4,8")] +) + +(define_insn "*cond_arith" + [(set (match_operand:SI 0 "s_register_operand" "=r,r") + (match_operator:SI 5 "shiftable_operator" + [(match_operator:SI 4 "arm_comparison_operator" + [(match_operand:SI 2 "s_register_operand" "r,r") + (match_operand:SI 3 "arm_rhs_operand" "rI,rI")]) + (match_operand:SI 1 "s_register_operand" "0,?r")])) + (clobber (reg:CC CC_REGNUM))] + "TARGET_ARM" + "* + if (GET_CODE (operands[4]) == LT && operands[3] == const0_rtx) + return \"%i5\\t%0, %1, %2, lsr #31\"; + + output_asm_insn (\"cmp\\t%2, %3\", operands); + if (GET_CODE (operands[5]) == AND) + output_asm_insn (\"mov%D4\\t%0, #0\", operands); + else if (GET_CODE (operands[5]) == MINUS) + output_asm_insn (\"rsb%D4\\t%0, %1, #0\", operands); + else if (which_alternative != 0) + output_asm_insn (\"mov%D4\\t%0, %1\", operands); + return \"%i5%d4\\t%0, %1, #1\"; + " + [(set_attr "conds" "clob") + (set_attr "length" "12")] +) + +(define_insn "*cond_sub" + [(set (match_operand:SI 0 "s_register_operand" "=r,r") + (minus:SI (match_operand:SI 1 "s_register_operand" "0,?r") + (match_operator:SI 4 "arm_comparison_operator" + [(match_operand:SI 2 "s_register_operand" "r,r") + (match_operand:SI 3 "arm_rhs_operand" "rI,rI")]))) + (clobber (reg:CC CC_REGNUM))] + "TARGET_ARM" + "* + output_asm_insn (\"cmp\\t%2, %3\", operands); + if (which_alternative != 0) + output_asm_insn (\"mov%D4\\t%0, %1\", operands); + return \"sub%d4\\t%0, %1, #1\"; + " + [(set_attr "conds" "clob") + (set_attr "length" "8,12")] +) + +;; ??? Is it worth using these conditional patterns in Thumb-2 mode? +(define_insn "*cmp_ite0" + [(set (match_operand 6 "dominant_cc_register" "") + (compare + (if_then_else:SI + (match_operator 4 "arm_comparison_operator" + [(match_operand:SI 0 "s_register_operand" "r,r,r,r") + (match_operand:SI 1 "arm_add_operand" "rI,L,rI,L")]) + (match_operator:SI 5 "arm_comparison_operator" + [(match_operand:SI 2 "s_register_operand" "r,r,r,r") + (match_operand:SI 3 "arm_add_operand" "rI,rI,L,L")]) + (const_int 0)) + (const_int 0)))] + "TARGET_ARM" + "* + { + static const char * const opcodes[4][2] = + { + {\"cmp\\t%2, %3\;cmp%d5\\t%0, %1\", + \"cmp\\t%0, %1\;cmp%d4\\t%2, %3\"}, + {\"cmp\\t%2, %3\;cmn%d5\\t%0, #%n1\", + \"cmn\\t%0, #%n1\;cmp%d4\\t%2, %3\"}, + {\"cmn\\t%2, #%n3\;cmp%d5\\t%0, %1\", + \"cmp\\t%0, %1\;cmn%d4\\t%2, #%n3\"}, + {\"cmn\\t%2, #%n3\;cmn%d5\\t%0, #%n1\", + \"cmn\\t%0, #%n1\;cmn%d4\\t%2, #%n3\"} + }; + int swap = + comparison_dominates_p (GET_CODE (operands[5]), GET_CODE (operands[4])); + + return opcodes[which_alternative][swap]; + }" + [(set_attr "conds" "set") + (set_attr "length" "8")] +) + +(define_insn "*cmp_ite1" + [(set (match_operand 6 "dominant_cc_register" "") + (compare + (if_then_else:SI + (match_operator 4 "arm_comparison_operator" + [(match_operand:SI 0 "s_register_operand" "r,r,r,r") + (match_operand:SI 1 "arm_add_operand" "rI,L,rI,L")]) + (match_operator:SI 5 "arm_comparison_operator" + [(match_operand:SI 2 "s_register_operand" "r,r,r,r") + (match_operand:SI 3 "arm_add_operand" "rI,rI,L,L")]) + (const_int 1)) + (const_int 0)))] + "TARGET_ARM" + "* + { + static const char * const opcodes[4][2] = + { + {\"cmp\\t%0, %1\;cmp%d4\\t%2, %3\", + \"cmp\\t%2, %3\;cmp%D5\\t%0, %1\"}, + {\"cmn\\t%0, #%n1\;cmp%d4\\t%2, %3\", + \"cmp\\t%2, %3\;cmn%D5\\t%0, #%n1\"}, + {\"cmp\\t%0, %1\;cmn%d4\\t%2, #%n3\", + \"cmn\\t%2, #%n3\;cmp%D5\\t%0, %1\"}, + {\"cmn\\t%0, #%n1\;cmn%d4\\t%2, #%n3\", + \"cmn\\t%2, #%n3\;cmn%D5\\t%0, #%n1\"} + }; + int swap = + comparison_dominates_p (GET_CODE (operands[5]), + reverse_condition (GET_CODE (operands[4]))); + + return opcodes[which_alternative][swap]; + }" + [(set_attr "conds" "set") + (set_attr "length" "8")] +) + +(define_insn "*cmp_and" + [(set (match_operand 6 "dominant_cc_register" "") + (compare + (and:SI + (match_operator 4 "arm_comparison_operator" + [(match_operand:SI 0 "s_register_operand" "r,r,r,r") + (match_operand:SI 1 "arm_add_operand" "rI,L,rI,L")]) + (match_operator:SI 5 "arm_comparison_operator" + [(match_operand:SI 2 "s_register_operand" "r,r,r,r") + (match_operand:SI 3 "arm_add_operand" "rI,rI,L,L")])) + (const_int 0)))] + "TARGET_ARM" + "* + { + static const char *const opcodes[4][2] = + { + {\"cmp\\t%2, %3\;cmp%d5\\t%0, %1\", + \"cmp\\t%0, %1\;cmp%d4\\t%2, %3\"}, + {\"cmp\\t%2, %3\;cmn%d5\\t%0, #%n1\", + \"cmn\\t%0, #%n1\;cmp%d4\\t%2, %3\"}, + {\"cmn\\t%2, #%n3\;cmp%d5\\t%0, %1\", + \"cmp\\t%0, %1\;cmn%d4\\t%2, #%n3\"}, + {\"cmn\\t%2, #%n3\;cmn%d5\\t%0, #%n1\", + \"cmn\\t%0, #%n1\;cmn%d4\\t%2, #%n3\"} + }; + int swap = + comparison_dominates_p (GET_CODE (operands[5]), GET_CODE (operands[4])); + + return opcodes[which_alternative][swap]; + }" + [(set_attr "conds" "set") + (set_attr "predicable" "no") + (set_attr "length" "8")] +) + +(define_insn "*cmp_ior" + [(set (match_operand 6 "dominant_cc_register" "") + (compare + (ior:SI + (match_operator 4 "arm_comparison_operator" + [(match_operand:SI 0 "s_register_operand" "r,r,r,r") + (match_operand:SI 1 "arm_add_operand" "rI,L,rI,L")]) + (match_operator:SI 5 "arm_comparison_operator" + [(match_operand:SI 2 "s_register_operand" "r,r,r,r") + (match_operand:SI 3 "arm_add_operand" "rI,rI,L,L")])) + (const_int 0)))] + "TARGET_ARM" + "* +{ + static const char *const opcodes[4][2] = + { + {\"cmp\\t%0, %1\;cmp%D4\\t%2, %3\", + \"cmp\\t%2, %3\;cmp%D5\\t%0, %1\"}, + {\"cmn\\t%0, #%n1\;cmp%D4\\t%2, %3\", + \"cmp\\t%2, %3\;cmn%D5\\t%0, #%n1\"}, + {\"cmp\\t%0, %1\;cmn%D4\\t%2, #%n3\", + \"cmn\\t%2, #%n3\;cmp%D5\\t%0, %1\"}, + {\"cmn\\t%0, #%n1\;cmn%D4\\t%2, #%n3\", + \"cmn\\t%2, #%n3\;cmn%D5\\t%0, #%n1\"} + }; + int swap = + comparison_dominates_p (GET_CODE (operands[5]), GET_CODE (operands[4])); + + return opcodes[which_alternative][swap]; +} +" + [(set_attr "conds" "set") + (set_attr "length" "8")] +) + +(define_insn_and_split "*ior_scc_scc" + [(set (match_operand:SI 0 "s_register_operand" "=r") + (ior:SI (match_operator:SI 3 "arm_comparison_operator" + [(match_operand:SI 1 "s_register_operand" "r") + (match_operand:SI 2 "arm_add_operand" "rIL")]) + (match_operator:SI 6 "arm_comparison_operator" + [(match_operand:SI 4 "s_register_operand" "r") + (match_operand:SI 5 "arm_add_operand" "rIL")]))) + (clobber (reg:CC CC_REGNUM))] + "TARGET_ARM + && (arm_select_dominance_cc_mode (operands[3], operands[6], DOM_CC_X_OR_Y) + != CCmode)" + "#" + "TARGET_ARM && reload_completed" + [(set (match_dup 7) + (compare + (ior:SI + (match_op_dup 3 [(match_dup 1) (match_dup 2)]) + (match_op_dup 6 [(match_dup 4) (match_dup 5)])) + (const_int 0))) + (set (match_dup 0) (ne:SI (match_dup 7) (const_int 0)))] + "operands[7] + = gen_rtx_REG (arm_select_dominance_cc_mode (operands[3], operands[6], + DOM_CC_X_OR_Y), + CC_REGNUM);" + [(set_attr "conds" "clob") + (set_attr "length" "16")]) + +; If the above pattern is followed by a CMP insn, then the compare is +; redundant, since we can rework the conditional instruction that follows. +(define_insn_and_split "*ior_scc_scc_cmp" + [(set (match_operand 0 "dominant_cc_register" "") + (compare (ior:SI (match_operator:SI 3 "arm_comparison_operator" + [(match_operand:SI 1 "s_register_operand" "r") + (match_operand:SI 2 "arm_add_operand" "rIL")]) + (match_operator:SI 6 "arm_comparison_operator" + [(match_operand:SI 4 "s_register_operand" "r") + (match_operand:SI 5 "arm_add_operand" "rIL")])) + (const_int 0))) + (set (match_operand:SI 7 "s_register_operand" "=r") + (ior:SI (match_op_dup 3 [(match_dup 1) (match_dup 2)]) + (match_op_dup 6 [(match_dup 4) (match_dup 5)])))] + "TARGET_ARM" + "#" + "TARGET_ARM && reload_completed" + [(set (match_dup 0) + (compare + (ior:SI + (match_op_dup 3 [(match_dup 1) (match_dup 2)]) + (match_op_dup 6 [(match_dup 4) (match_dup 5)])) + (const_int 0))) + (set (match_dup 7) (ne:SI (match_dup 0) (const_int 0)))] + "" + [(set_attr "conds" "set") + (set_attr "length" "16")]) + +(define_insn_and_split "*and_scc_scc" + [(set (match_operand:SI 0 "s_register_operand" "=r") + (and:SI (match_operator:SI 3 "arm_comparison_operator" + [(match_operand:SI 1 "s_register_operand" "r") + (match_operand:SI 2 "arm_add_operand" "rIL")]) + (match_operator:SI 6 "arm_comparison_operator" + [(match_operand:SI 4 "s_register_operand" "r") + (match_operand:SI 5 "arm_add_operand" "rIL")]))) + (clobber (reg:CC CC_REGNUM))] + "TARGET_ARM + && (arm_select_dominance_cc_mode (operands[3], operands[6], DOM_CC_X_AND_Y) + != CCmode)" + "#" + "TARGET_ARM && reload_completed + && (arm_select_dominance_cc_mode (operands[3], operands[6], DOM_CC_X_AND_Y) + != CCmode)" + [(set (match_dup 7) + (compare + (and:SI + (match_op_dup 3 [(match_dup 1) (match_dup 2)]) + (match_op_dup 6 [(match_dup 4) (match_dup 5)])) + (const_int 0))) + (set (match_dup 0) (ne:SI (match_dup 7) (const_int 0)))] + "operands[7] + = gen_rtx_REG (arm_select_dominance_cc_mode (operands[3], operands[6], + DOM_CC_X_AND_Y), + CC_REGNUM);" + [(set_attr "conds" "clob") + (set_attr "length" "16")]) + +; If the above pattern is followed by a CMP insn, then the compare is +; redundant, since we can rework the conditional instruction that follows. +(define_insn_and_split "*and_scc_scc_cmp" + [(set (match_operand 0 "dominant_cc_register" "") + (compare (and:SI (match_operator:SI 3 "arm_comparison_operator" + [(match_operand:SI 1 "s_register_operand" "r") + (match_operand:SI 2 "arm_add_operand" "rIL")]) + (match_operator:SI 6 "arm_comparison_operator" + [(match_operand:SI 4 "s_register_operand" "r") + (match_operand:SI 5 "arm_add_operand" "rIL")])) + (const_int 0))) + (set (match_operand:SI 7 "s_register_operand" "=r") + (and:SI (match_op_dup 3 [(match_dup 1) (match_dup 2)]) + (match_op_dup 6 [(match_dup 4) (match_dup 5)])))] + "TARGET_ARM" + "#" + "TARGET_ARM && reload_completed" + [(set (match_dup 0) + (compare + (and:SI + (match_op_dup 3 [(match_dup 1) (match_dup 2)]) + (match_op_dup 6 [(match_dup 4) (match_dup 5)])) + (const_int 0))) + (set (match_dup 7) (ne:SI (match_dup 0) (const_int 0)))] + "" + [(set_attr "conds" "set") + (set_attr "length" "16")]) + +;; If there is no dominance in the comparison, then we can still save an +;; instruction in the AND case, since we can know that the second compare +;; need only zero the value if false (if true, then the value is already +;; correct). +(define_insn_and_split "*and_scc_scc_nodom" + [(set (match_operand:SI 0 "s_register_operand" "=&r,&r,&r") + (and:SI (match_operator:SI 3 "arm_comparison_operator" + [(match_operand:SI 1 "s_register_operand" "r,r,0") + (match_operand:SI 2 "arm_add_operand" "rIL,0,rIL")]) + (match_operator:SI 6 "arm_comparison_operator" + [(match_operand:SI 4 "s_register_operand" "r,r,r") + (match_operand:SI 5 "arm_add_operand" "rIL,rIL,rIL")]))) + (clobber (reg:CC CC_REGNUM))] + "TARGET_ARM + && (arm_select_dominance_cc_mode (operands[3], operands[6], DOM_CC_X_AND_Y) + == CCmode)" + "#" + "TARGET_ARM && reload_completed" + [(parallel [(set (match_dup 0) + (match_op_dup 3 [(match_dup 1) (match_dup 2)])) + (clobber (reg:CC CC_REGNUM))]) + (set (match_dup 7) (match_op_dup 8 [(match_dup 4) (match_dup 5)])) + (set (match_dup 0) + (if_then_else:SI (match_op_dup 6 [(match_dup 7) (const_int 0)]) + (match_dup 0) + (const_int 0)))] + "operands[7] = gen_rtx_REG (SELECT_CC_MODE (GET_CODE (operands[6]), + operands[4], operands[5]), + CC_REGNUM); + operands[8] = gen_rtx_COMPARE (GET_MODE (operands[7]), operands[4], + operands[5]);" + [(set_attr "conds" "clob") + (set_attr "length" "20")]) + +(define_split + [(set (reg:CC_NOOV CC_REGNUM) + (compare:CC_NOOV (ior:SI + (and:SI (match_operand:SI 0 "s_register_operand" "") + (const_int 1)) + (match_operator:SI 1 "arm_comparison_operator" + [(match_operand:SI 2 "s_register_operand" "") + (match_operand:SI 3 "arm_add_operand" "")])) + (const_int 0))) + (clobber (match_operand:SI 4 "s_register_operand" ""))] + "TARGET_ARM" + [(set (match_dup 4) + (ior:SI (match_op_dup 1 [(match_dup 2) (match_dup 3)]) + (match_dup 0))) + (set (reg:CC_NOOV CC_REGNUM) + (compare:CC_NOOV (and:SI (match_dup 4) (const_int 1)) + (const_int 0)))] + "") + +(define_split + [(set (reg:CC_NOOV CC_REGNUM) + (compare:CC_NOOV (ior:SI + (match_operator:SI 1 "arm_comparison_operator" + [(match_operand:SI 2 "s_register_operand" "") + (match_operand:SI 3 "arm_add_operand" "")]) + (and:SI (match_operand:SI 0 "s_register_operand" "") + (const_int 1))) + (const_int 0))) + (clobber (match_operand:SI 4 "s_register_operand" ""))] + "TARGET_ARM" + [(set (match_dup 4) + (ior:SI (match_op_dup 1 [(match_dup 2) (match_dup 3)]) + (match_dup 0))) + (set (reg:CC_NOOV CC_REGNUM) + (compare:CC_NOOV (and:SI (match_dup 4) (const_int 1)) + (const_int 0)))] + "") +;; ??? The conditional patterns above need checking for Thumb-2 usefulness + +(define_insn "*negscc" + [(set (match_operand:SI 0 "s_register_operand" "=r") + (neg:SI (match_operator 3 "arm_comparison_operator" + [(match_operand:SI 1 "s_register_operand" "r") + (match_operand:SI 2 "arm_rhs_operand" "rI")]))) + (clobber (reg:CC CC_REGNUM))] + "TARGET_ARM" + "* + if (GET_CODE (operands[3]) == LT && operands[2] == const0_rtx) + return \"mov\\t%0, %1, asr #31\"; + + if (GET_CODE (operands[3]) == NE) + return \"subs\\t%0, %1, %2\;mvnne\\t%0, #0\"; + + output_asm_insn (\"cmp\\t%1, %2\", operands); + output_asm_insn (\"mov%D3\\t%0, #0\", operands); + return \"mvn%d3\\t%0, #0\"; + " + [(set_attr "conds" "clob") + (set_attr "length" "12")] +) + +(define_insn "movcond" + [(set (match_operand:SI 0 "s_register_operand" "=r,r,r") + (if_then_else:SI + (match_operator 5 "arm_comparison_operator" + [(match_operand:SI 3 "s_register_operand" "r,r,r") + (match_operand:SI 4 "arm_add_operand" "rIL,rIL,rIL")]) + (match_operand:SI 1 "arm_rhs_operand" "0,rI,?rI") + (match_operand:SI 2 "arm_rhs_operand" "rI,0,rI"))) + (clobber (reg:CC CC_REGNUM))] + "TARGET_ARM" + "* + if (GET_CODE (operands[5]) == LT + && (operands[4] == const0_rtx)) + { + if (which_alternative != 1 && GET_CODE (operands[1]) == REG) + { + if (operands[2] == const0_rtx) + return \"and\\t%0, %1, %3, asr #31\"; + return \"ands\\t%0, %1, %3, asr #32\;movcc\\t%0, %2\"; + } + else if (which_alternative != 0 && GET_CODE (operands[2]) == REG) + { + if (operands[1] == const0_rtx) + return \"bic\\t%0, %2, %3, asr #31\"; + return \"bics\\t%0, %2, %3, asr #32\;movcs\\t%0, %1\"; + } + /* The only case that falls through to here is when both ops 1 & 2 + are constants. */ + } + + if (GET_CODE (operands[5]) == GE + && (operands[4] == const0_rtx)) + { + if (which_alternative != 1 && GET_CODE (operands[1]) == REG) + { + if (operands[2] == const0_rtx) + return \"bic\\t%0, %1, %3, asr #31\"; + return \"bics\\t%0, %1, %3, asr #32\;movcs\\t%0, %2\"; + } + else if (which_alternative != 0 && GET_CODE (operands[2]) == REG) + { + if (operands[1] == const0_rtx) + return \"and\\t%0, %2, %3, asr #31\"; + return \"ands\\t%0, %2, %3, asr #32\;movcc\\t%0, %1\"; + } + /* The only case that falls through to here is when both ops 1 & 2 + are constants. */ + } + if (GET_CODE (operands[4]) == CONST_INT + && !const_ok_for_arm (INTVAL (operands[4]))) + output_asm_insn (\"cmn\\t%3, #%n4\", operands); + else + output_asm_insn (\"cmp\\t%3, %4\", operands); + if (which_alternative != 0) + output_asm_insn (\"mov%d5\\t%0, %1\", operands); + if (which_alternative != 1) + output_asm_insn (\"mov%D5\\t%0, %2\", operands); + return \"\"; + " + [(set_attr "conds" "clob") + (set_attr "length" "8,8,12")] +) + +;; ??? The patterns below need checking for Thumb-2 usefulness. + +(define_insn "*ifcompare_plus_move" + [(set (match_operand:SI 0 "s_register_operand" "=r,r") + (if_then_else:SI (match_operator 6 "arm_comparison_operator" + [(match_operand:SI 4 "s_register_operand" "r,r") + (match_operand:SI 5 "arm_add_operand" "rIL,rIL")]) + (plus:SI + (match_operand:SI 2 "s_register_operand" "r,r") + (match_operand:SI 3 "arm_add_operand" "rIL,rIL")) + (match_operand:SI 1 "arm_rhs_operand" "0,?rI"))) + (clobber (reg:CC CC_REGNUM))] + "TARGET_ARM" + "#" + [(set_attr "conds" "clob") + (set_attr "length" "8,12")] +) + +(define_insn "*if_plus_move" + [(set (match_operand:SI 0 "s_register_operand" "=r,r,r,r") + (if_then_else:SI + (match_operator 4 "arm_comparison_operator" + [(match_operand 5 "cc_register" "") (const_int 0)]) + (plus:SI + (match_operand:SI 2 "s_register_operand" "r,r,r,r") + (match_operand:SI 3 "arm_add_operand" "rI,L,rI,L")) + (match_operand:SI 1 "arm_rhs_operand" "0,0,?rI,?rI")))] + "TARGET_ARM" + "@ + add%d4\\t%0, %2, %3 + sub%d4\\t%0, %2, #%n3 + add%d4\\t%0, %2, %3\;mov%D4\\t%0, %1 + sub%d4\\t%0, %2, #%n3\;mov%D4\\t%0, %1" + [(set_attr "conds" "use") + (set_attr "length" "4,4,8,8") + (set_attr "type" "*,*,*,*")] +) + +(define_insn "*ifcompare_move_plus" + [(set (match_operand:SI 0 "s_register_operand" "=r,r") + (if_then_else:SI (match_operator 6 "arm_comparison_operator" + [(match_operand:SI 4 "s_register_operand" "r,r") + (match_operand:SI 5 "arm_add_operand" "rIL,rIL")]) + (match_operand:SI 1 "arm_rhs_operand" "0,?rI") + (plus:SI + (match_operand:SI 2 "s_register_operand" "r,r") + (match_operand:SI 3 "arm_add_operand" "rIL,rIL")))) + (clobber (reg:CC CC_REGNUM))] + "TARGET_ARM" + "#" + [(set_attr "conds" "clob") + (set_attr "length" "8,12")] +) + +(define_insn "*if_move_plus" + [(set (match_operand:SI 0 "s_register_operand" "=r,r,r,r") + (if_then_else:SI + (match_operator 4 "arm_comparison_operator" + [(match_operand 5 "cc_register" "") (const_int 0)]) + (match_operand:SI 1 "arm_rhs_operand" "0,0,?rI,?rI") + (plus:SI + (match_operand:SI 2 "s_register_operand" "r,r,r,r") + (match_operand:SI 3 "arm_add_operand" "rI,L,rI,L"))))] + "TARGET_ARM" + "@ + add%D4\\t%0, %2, %3 + sub%D4\\t%0, %2, #%n3 + add%D4\\t%0, %2, %3\;mov%d4\\t%0, %1 + sub%D4\\t%0, %2, #%n3\;mov%d4\\t%0, %1" + [(set_attr "conds" "use") + (set_attr "length" "4,4,8,8") + (set_attr "type" "*,*,*,*")] +) + +(define_insn "*ifcompare_arith_arith" + [(set (match_operand:SI 0 "s_register_operand" "=r") + (if_then_else:SI (match_operator 9 "arm_comparison_operator" + [(match_operand:SI 5 "s_register_operand" "r") + (match_operand:SI 6 "arm_add_operand" "rIL")]) + (match_operator:SI 8 "shiftable_operator" + [(match_operand:SI 1 "s_register_operand" "r") + (match_operand:SI 2 "arm_rhs_operand" "rI")]) + (match_operator:SI 7 "shiftable_operator" + [(match_operand:SI 3 "s_register_operand" "r") + (match_operand:SI 4 "arm_rhs_operand" "rI")]))) + (clobber (reg:CC CC_REGNUM))] + "TARGET_ARM" + "#" + [(set_attr "conds" "clob") + (set_attr "length" "12")] +) + +(define_insn "*if_arith_arith" + [(set (match_operand:SI 0 "s_register_operand" "=r") + (if_then_else:SI (match_operator 5 "arm_comparison_operator" + [(match_operand 8 "cc_register" "") (const_int 0)]) + (match_operator:SI 6 "shiftable_operator" + [(match_operand:SI 1 "s_register_operand" "r") + (match_operand:SI 2 "arm_rhs_operand" "rI")]) + (match_operator:SI 7 "shiftable_operator" + [(match_operand:SI 3 "s_register_operand" "r") + (match_operand:SI 4 "arm_rhs_operand" "rI")])))] + "TARGET_ARM" + "%I6%d5\\t%0, %1, %2\;%I7%D5\\t%0, %3, %4" + [(set_attr "conds" "use") + (set_attr "length" "8")] +) + +(define_insn "*ifcompare_arith_move" + [(set (match_operand:SI 0 "s_register_operand" "=r,r") + (if_then_else:SI (match_operator 6 "arm_comparison_operator" + [(match_operand:SI 2 "s_register_operand" "r,r") + (match_operand:SI 3 "arm_add_operand" "rIL,rIL")]) + (match_operator:SI 7 "shiftable_operator" + [(match_operand:SI 4 "s_register_operand" "r,r") + (match_operand:SI 5 "arm_rhs_operand" "rI,rI")]) + (match_operand:SI 1 "arm_rhs_operand" "0,?rI"))) + (clobber (reg:CC CC_REGNUM))] + "TARGET_ARM" + "* + /* If we have an operation where (op x 0) is the identity operation and + the conditional operator is LT or GE and we are comparing against zero and + everything is in registers then we can do this in two instructions. */ + if (operands[3] == const0_rtx + && GET_CODE (operands[7]) != AND + && GET_CODE (operands[5]) == REG + && GET_CODE (operands[1]) == REG + && REGNO (operands[1]) == REGNO (operands[4]) + && REGNO (operands[4]) != REGNO (operands[0])) + { + if (GET_CODE (operands[6]) == LT) + return \"and\\t%0, %5, %2, asr #31\;%I7\\t%0, %4, %0\"; + else if (GET_CODE (operands[6]) == GE) + return \"bic\\t%0, %5, %2, asr #31\;%I7\\t%0, %4, %0\"; + } + if (GET_CODE (operands[3]) == CONST_INT + && !const_ok_for_arm (INTVAL (operands[3]))) + output_asm_insn (\"cmn\\t%2, #%n3\", operands); + else + output_asm_insn (\"cmp\\t%2, %3\", operands); + output_asm_insn (\"%I7%d6\\t%0, %4, %5\", operands); + if (which_alternative != 0) + return \"mov%D6\\t%0, %1\"; + return \"\"; + " + [(set_attr "conds" "clob") + (set_attr "length" "8,12")] +) + +(define_insn "*if_arith_move" + [(set (match_operand:SI 0 "s_register_operand" "=r,r") + (if_then_else:SI (match_operator 4 "arm_comparison_operator" + [(match_operand 6 "cc_register" "") (const_int 0)]) + (match_operator:SI 5 "shiftable_operator" + [(match_operand:SI 2 "s_register_operand" "r,r") + (match_operand:SI 3 "arm_rhs_operand" "rI,rI")]) + (match_operand:SI 1 "arm_rhs_operand" "0,?rI")))] + "TARGET_ARM" + "@ + %I5%d4\\t%0, %2, %3 + %I5%d4\\t%0, %2, %3\;mov%D4\\t%0, %1" + [(set_attr "conds" "use") + (set_attr "length" "4,8") + (set_attr "type" "*,*")] +) + +(define_insn "*ifcompare_move_arith" + [(set (match_operand:SI 0 "s_register_operand" "=r,r") + (if_then_else:SI (match_operator 6 "arm_comparison_operator" + [(match_operand:SI 4 "s_register_operand" "r,r") + (match_operand:SI 5 "arm_add_operand" "rIL,rIL")]) + (match_operand:SI 1 "arm_rhs_operand" "0,?rI") + (match_operator:SI 7 "shiftable_operator" + [(match_operand:SI 2 "s_register_operand" "r,r") + (match_operand:SI 3 "arm_rhs_operand" "rI,rI")]))) + (clobber (reg:CC CC_REGNUM))] + "TARGET_ARM" + "* + /* If we have an operation where (op x 0) is the identity operation and + the conditional operator is LT or GE and we are comparing against zero and + everything is in registers then we can do this in two instructions */ + if (operands[5] == const0_rtx + && GET_CODE (operands[7]) != AND + && GET_CODE (operands[3]) == REG + && GET_CODE (operands[1]) == REG + && REGNO (operands[1]) == REGNO (operands[2]) + && REGNO (operands[2]) != REGNO (operands[0])) + { + if (GET_CODE (operands[6]) == GE) + return \"and\\t%0, %3, %4, asr #31\;%I7\\t%0, %2, %0\"; + else if (GET_CODE (operands[6]) == LT) + return \"bic\\t%0, %3, %4, asr #31\;%I7\\t%0, %2, %0\"; + } + + if (GET_CODE (operands[5]) == CONST_INT + && !const_ok_for_arm (INTVAL (operands[5]))) + output_asm_insn (\"cmn\\t%4, #%n5\", operands); + else + output_asm_insn (\"cmp\\t%4, %5\", operands); + + if (which_alternative != 0) + output_asm_insn (\"mov%d6\\t%0, %1\", operands); + return \"%I7%D6\\t%0, %2, %3\"; + " + [(set_attr "conds" "clob") + (set_attr "length" "8,12")] +) + +(define_insn "*if_move_arith" + [(set (match_operand:SI 0 "s_register_operand" "=r,r") + (if_then_else:SI + (match_operator 4 "arm_comparison_operator" + [(match_operand 6 "cc_register" "") (const_int 0)]) + (match_operand:SI 1 "arm_rhs_operand" "0,?rI") + (match_operator:SI 5 "shiftable_operator" + [(match_operand:SI 2 "s_register_operand" "r,r") + (match_operand:SI 3 "arm_rhs_operand" "rI,rI")])))] + "TARGET_ARM" + "@ + %I5%D4\\t%0, %2, %3 + %I5%D4\\t%0, %2, %3\;mov%d4\\t%0, %1" + [(set_attr "conds" "use") + (set_attr "length" "4,8") + (set_attr "type" "*,*")] +) + +(define_insn "*ifcompare_move_not" + [(set (match_operand:SI 0 "s_register_operand" "=r,r") + (if_then_else:SI + (match_operator 5 "arm_comparison_operator" + [(match_operand:SI 3 "s_register_operand" "r,r") + (match_operand:SI 4 "arm_add_operand" "rIL,rIL")]) + (match_operand:SI 1 "arm_not_operand" "0,?rIK") + (not:SI + (match_operand:SI 2 "s_register_operand" "r,r")))) + (clobber (reg:CC CC_REGNUM))] + "TARGET_ARM" + "#" + [(set_attr "conds" "clob") + (set_attr "length" "8,12")] +) + +(define_insn "*if_move_not" + [(set (match_operand:SI 0 "s_register_operand" "=r,r,r") + (if_then_else:SI + (match_operator 4 "arm_comparison_operator" + [(match_operand 3 "cc_register" "") (const_int 0)]) + (match_operand:SI 1 "arm_not_operand" "0,?rI,K") + (not:SI (match_operand:SI 2 "s_register_operand" "r,r,r"))))] + "TARGET_ARM" + "@ + mvn%D4\\t%0, %2 + mov%d4\\t%0, %1\;mvn%D4\\t%0, %2 + mvn%d4\\t%0, #%B1\;mvn%D4\\t%0, %2" + [(set_attr "conds" "use") + (set_attr "insn" "mvn") + (set_attr "length" "4,8,8")] +) + +(define_insn "*ifcompare_not_move" + [(set (match_operand:SI 0 "s_register_operand" "=r,r") + (if_then_else:SI + (match_operator 5 "arm_comparison_operator" + [(match_operand:SI 3 "s_register_operand" "r,r") + (match_operand:SI 4 "arm_add_operand" "rIL,rIL")]) + (not:SI + (match_operand:SI 2 "s_register_operand" "r,r")) + (match_operand:SI 1 "arm_not_operand" "0,?rIK"))) + (clobber (reg:CC CC_REGNUM))] + "TARGET_ARM" + "#" + [(set_attr "conds" "clob") + (set_attr "length" "8,12")] +) + +(define_insn "*if_not_move" + [(set (match_operand:SI 0 "s_register_operand" "=r,r,r") + (if_then_else:SI + (match_operator 4 "arm_comparison_operator" + [(match_operand 3 "cc_register" "") (const_int 0)]) + (not:SI (match_operand:SI 2 "s_register_operand" "r,r,r")) + (match_operand:SI 1 "arm_not_operand" "0,?rI,K")))] + "TARGET_ARM" + "@ + mvn%d4\\t%0, %2 + mov%D4\\t%0, %1\;mvn%d4\\t%0, %2 + mvn%D4\\t%0, #%B1\;mvn%d4\\t%0, %2" + [(set_attr "conds" "use") + (set_attr "insn" "mvn") + (set_attr "length" "4,8,8")] +) + +(define_insn "*ifcompare_shift_move" + [(set (match_operand:SI 0 "s_register_operand" "=r,r") + (if_then_else:SI + (match_operator 6 "arm_comparison_operator" + [(match_operand:SI 4 "s_register_operand" "r,r") + (match_operand:SI 5 "arm_add_operand" "rIL,rIL")]) + (match_operator:SI 7 "shift_operator" + [(match_operand:SI 2 "s_register_operand" "r,r") + (match_operand:SI 3 "arm_rhs_operand" "rM,rM")]) + (match_operand:SI 1 "arm_not_operand" "0,?rIK"))) + (clobber (reg:CC CC_REGNUM))] + "TARGET_ARM" + "#" + [(set_attr "conds" "clob") + (set_attr "length" "8,12")] +) + +(define_insn "*if_shift_move" + [(set (match_operand:SI 0 "s_register_operand" "=r,r,r") + (if_then_else:SI + (match_operator 5 "arm_comparison_operator" + [(match_operand 6 "cc_register" "") (const_int 0)]) + (match_operator:SI 4 "shift_operator" + [(match_operand:SI 2 "s_register_operand" "r,r,r") + (match_operand:SI 3 "arm_rhs_operand" "rM,rM,rM")]) + (match_operand:SI 1 "arm_not_operand" "0,?rI,K")))] + "TARGET_ARM" + "@ + mov%d5\\t%0, %2%S4 + mov%D5\\t%0, %1\;mov%d5\\t%0, %2%S4 + mvn%D5\\t%0, #%B1\;mov%d5\\t%0, %2%S4" + [(set_attr "conds" "use") + (set_attr "shift" "2") + (set_attr "length" "4,8,8") + (set_attr "insn" "mov") + (set (attr "type") (if_then_else (match_operand 3 "const_int_operand" "") + (const_string "alu_shift") + (const_string "alu_shift_reg")))] +) + +(define_insn "*ifcompare_move_shift" + [(set (match_operand:SI 0 "s_register_operand" "=r,r") + (if_then_else:SI + (match_operator 6 "arm_comparison_operator" + [(match_operand:SI 4 "s_register_operand" "r,r") + (match_operand:SI 5 "arm_add_operand" "rIL,rIL")]) + (match_operand:SI 1 "arm_not_operand" "0,?rIK") + (match_operator:SI 7 "shift_operator" + [(match_operand:SI 2 "s_register_operand" "r,r") + (match_operand:SI 3 "arm_rhs_operand" "rM,rM")]))) + (clobber (reg:CC CC_REGNUM))] + "TARGET_ARM" + "#" + [(set_attr "conds" "clob") + (set_attr "length" "8,12")] +) + +(define_insn "*if_move_shift" + [(set (match_operand:SI 0 "s_register_operand" "=r,r,r") + (if_then_else:SI + (match_operator 5 "arm_comparison_operator" + [(match_operand 6 "cc_register" "") (const_int 0)]) + (match_operand:SI 1 "arm_not_operand" "0,?rI,K") + (match_operator:SI 4 "shift_operator" + [(match_operand:SI 2 "s_register_operand" "r,r,r") + (match_operand:SI 3 "arm_rhs_operand" "rM,rM,rM")])))] + "TARGET_ARM" + "@ + mov%D5\\t%0, %2%S4 + mov%d5\\t%0, %1\;mov%D5\\t%0, %2%S4 + mvn%d5\\t%0, #%B1\;mov%D5\\t%0, %2%S4" + [(set_attr "conds" "use") + (set_attr "shift" "2") + (set_attr "length" "4,8,8") + (set_attr "insn" "mov") + (set (attr "type") (if_then_else (match_operand 3 "const_int_operand" "") + (const_string "alu_shift") + (const_string "alu_shift_reg")))] +) + +(define_insn "*ifcompare_shift_shift" + [(set (match_operand:SI 0 "s_register_operand" "=r") + (if_then_else:SI + (match_operator 7 "arm_comparison_operator" + [(match_operand:SI 5 "s_register_operand" "r") + (match_operand:SI 6 "arm_add_operand" "rIL")]) + (match_operator:SI 8 "shift_operator" + [(match_operand:SI 1 "s_register_operand" "r") + (match_operand:SI 2 "arm_rhs_operand" "rM")]) + (match_operator:SI 9 "shift_operator" + [(match_operand:SI 3 "s_register_operand" "r") + (match_operand:SI 4 "arm_rhs_operand" "rM")]))) + (clobber (reg:CC CC_REGNUM))] + "TARGET_ARM" + "#" + [(set_attr "conds" "clob") + (set_attr "length" "12")] +) + +(define_insn "*if_shift_shift" + [(set (match_operand:SI 0 "s_register_operand" "=r") + (if_then_else:SI + (match_operator 5 "arm_comparison_operator" + [(match_operand 8 "cc_register" "") (const_int 0)]) + (match_operator:SI 6 "shift_operator" + [(match_operand:SI 1 "s_register_operand" "r") + (match_operand:SI 2 "arm_rhs_operand" "rM")]) + (match_operator:SI 7 "shift_operator" + [(match_operand:SI 3 "s_register_operand" "r") + (match_operand:SI 4 "arm_rhs_operand" "rM")])))] + "TARGET_ARM" + "mov%d5\\t%0, %1%S6\;mov%D5\\t%0, %3%S7" + [(set_attr "conds" "use") + (set_attr "shift" "1") + (set_attr "length" "8") + (set_attr "insn" "mov") + (set (attr "type") (if_then_else + (and (match_operand 2 "const_int_operand" "") + (match_operand 4 "const_int_operand" "")) + (const_string "alu_shift") + (const_string "alu_shift_reg")))] +) + +(define_insn "*ifcompare_not_arith" + [(set (match_operand:SI 0 "s_register_operand" "=r") + (if_then_else:SI + (match_operator 6 "arm_comparison_operator" + [(match_operand:SI 4 "s_register_operand" "r") + (match_operand:SI 5 "arm_add_operand" "rIL")]) + (not:SI (match_operand:SI 1 "s_register_operand" "r")) + (match_operator:SI 7 "shiftable_operator" + [(match_operand:SI 2 "s_register_operand" "r") + (match_operand:SI 3 "arm_rhs_operand" "rI")]))) + (clobber (reg:CC CC_REGNUM))] + "TARGET_ARM" + "#" + [(set_attr "conds" "clob") + (set_attr "length" "12")] +) + +(define_insn "*if_not_arith" + [(set (match_operand:SI 0 "s_register_operand" "=r") + (if_then_else:SI + (match_operator 5 "arm_comparison_operator" + [(match_operand 4 "cc_register" "") (const_int 0)]) + (not:SI (match_operand:SI 1 "s_register_operand" "r")) + (match_operator:SI 6 "shiftable_operator" + [(match_operand:SI 2 "s_register_operand" "r") + (match_operand:SI 3 "arm_rhs_operand" "rI")])))] + "TARGET_ARM" + "mvn%d5\\t%0, %1\;%I6%D5\\t%0, %2, %3" + [(set_attr "conds" "use") + (set_attr "insn" "mvn") + (set_attr "length" "8")] +) + +(define_insn "*ifcompare_arith_not" + [(set (match_operand:SI 0 "s_register_operand" "=r") + (if_then_else:SI + (match_operator 6 "arm_comparison_operator" + [(match_operand:SI 4 "s_register_operand" "r") + (match_operand:SI 5 "arm_add_operand" "rIL")]) + (match_operator:SI 7 "shiftable_operator" + [(match_operand:SI 2 "s_register_operand" "r") + (match_operand:SI 3 "arm_rhs_operand" "rI")]) + (not:SI (match_operand:SI 1 "s_register_operand" "r")))) + (clobber (reg:CC CC_REGNUM))] + "TARGET_ARM" + "#" + [(set_attr "conds" "clob") + (set_attr "length" "12")] +) + +(define_insn "*if_arith_not" + [(set (match_operand:SI 0 "s_register_operand" "=r") + (if_then_else:SI + (match_operator 5 "arm_comparison_operator" + [(match_operand 4 "cc_register" "") (const_int 0)]) + (match_operator:SI 6 "shiftable_operator" + [(match_operand:SI 2 "s_register_operand" "r") + (match_operand:SI 3 "arm_rhs_operand" "rI")]) + (not:SI (match_operand:SI 1 "s_register_operand" "r"))))] + "TARGET_ARM" + "mvn%D5\\t%0, %1\;%I6%d5\\t%0, %2, %3" + [(set_attr "conds" "use") + (set_attr "insn" "mvn") + (set_attr "length" "8")] +) + +(define_insn "*ifcompare_neg_move" + [(set (match_operand:SI 0 "s_register_operand" "=r,r") + (if_then_else:SI + (match_operator 5 "arm_comparison_operator" + [(match_operand:SI 3 "s_register_operand" "r,r") + (match_operand:SI 4 "arm_add_operand" "rIL,rIL")]) + (neg:SI (match_operand:SI 2 "s_register_operand" "r,r")) + (match_operand:SI 1 "arm_not_operand" "0,?rIK"))) + (clobber (reg:CC CC_REGNUM))] + "TARGET_ARM" + "#" + [(set_attr "conds" "clob") + (set_attr "length" "8,12")] +) + +(define_insn "*if_neg_move" + [(set (match_operand:SI 0 "s_register_operand" "=r,r,r") + (if_then_else:SI + (match_operator 4 "arm_comparison_operator" + [(match_operand 3 "cc_register" "") (const_int 0)]) + (neg:SI (match_operand:SI 2 "s_register_operand" "r,r,r")) + (match_operand:SI 1 "arm_not_operand" "0,?rI,K")))] + "TARGET_ARM" + "@ + rsb%d4\\t%0, %2, #0 + mov%D4\\t%0, %1\;rsb%d4\\t%0, %2, #0 + mvn%D4\\t%0, #%B1\;rsb%d4\\t%0, %2, #0" + [(set_attr "conds" "use") + (set_attr "length" "4,8,8")] +) + +(define_insn "*ifcompare_move_neg" + [(set (match_operand:SI 0 "s_register_operand" "=r,r") + (if_then_else:SI + (match_operator 5 "arm_comparison_operator" + [(match_operand:SI 3 "s_register_operand" "r,r") + (match_operand:SI 4 "arm_add_operand" "rIL,rIL")]) + (match_operand:SI 1 "arm_not_operand" "0,?rIK") + (neg:SI (match_operand:SI 2 "s_register_operand" "r,r")))) + (clobber (reg:CC CC_REGNUM))] + "TARGET_ARM" + "#" + [(set_attr "conds" "clob") + (set_attr "length" "8,12")] +) + +(define_insn "*if_move_neg" + [(set (match_operand:SI 0 "s_register_operand" "=r,r,r") + (if_then_else:SI + (match_operator 4 "arm_comparison_operator" + [(match_operand 3 "cc_register" "") (const_int 0)]) + (match_operand:SI 1 "arm_not_operand" "0,?rI,K") + (neg:SI (match_operand:SI 2 "s_register_operand" "r,r,r"))))] + "TARGET_ARM" + "@ + rsb%D4\\t%0, %2, #0 + mov%d4\\t%0, %1\;rsb%D4\\t%0, %2, #0 + mvn%d4\\t%0, #%B1\;rsb%D4\\t%0, %2, #0" + [(set_attr "conds" "use") + (set_attr "length" "4,8,8")] +) + +(define_insn "*arith_adjacentmem" + [(set (match_operand:SI 0 "s_register_operand" "=r") + (match_operator:SI 1 "shiftable_operator" + [(match_operand:SI 2 "memory_operand" "m") + (match_operand:SI 3 "memory_operand" "m")])) + (clobber (match_scratch:SI 4 "=r"))] + "TARGET_ARM && adjacent_mem_locations (operands[2], operands[3])" + "* + { + rtx ldm[3]; + rtx arith[4]; + rtx base_reg; + HOST_WIDE_INT val1 = 0, val2 = 0; + + if (REGNO (operands[0]) > REGNO (operands[4])) + { + ldm[1] = operands[4]; + ldm[2] = operands[0]; + } + else + { + ldm[1] = operands[0]; + ldm[2] = operands[4]; + } + + base_reg = XEXP (operands[2], 0); + + if (!REG_P (base_reg)) + { + val1 = INTVAL (XEXP (base_reg, 1)); + base_reg = XEXP (base_reg, 0); + } + + if (!REG_P (XEXP (operands[3], 0))) + val2 = INTVAL (XEXP (XEXP (operands[3], 0), 1)); + + arith[0] = operands[0]; + arith[3] = operands[1]; + + if (val1 < val2) + { + arith[1] = ldm[1]; + arith[2] = ldm[2]; + } + else + { + arith[1] = ldm[2]; + arith[2] = ldm[1]; + } + + ldm[0] = base_reg; + if (val1 !=0 && val2 != 0) + { + rtx ops[3]; + + if (val1 == 4 || val2 == 4) + /* Other val must be 8, since we know they are adjacent and neither + is zero. */ + output_asm_insn (\"ldm%(ib%)\\t%0, {%1, %2}\", ldm); + else if (const_ok_for_arm (val1) || const_ok_for_arm (-val1)) + { + ldm[0] = ops[0] = operands[4]; + ops[1] = base_reg; + ops[2] = GEN_INT (val1); + output_add_immediate (ops); + if (val1 < val2) + output_asm_insn (\"ldm%(ia%)\\t%0, {%1, %2}\", ldm); + else + output_asm_insn (\"ldm%(da%)\\t%0, {%1, %2}\", ldm); + } + else + { + /* Offset is out of range for a single add, so use two ldr. */ + ops[0] = ldm[1]; + ops[1] = base_reg; + ops[2] = GEN_INT (val1); + output_asm_insn (\"ldr%?\\t%0, [%1, %2]\", ops); + ops[0] = ldm[2]; + ops[2] = GEN_INT (val2); + output_asm_insn (\"ldr%?\\t%0, [%1, %2]\", ops); + } + } + else if (val1 != 0) + { + if (val1 < val2) + output_asm_insn (\"ldm%(da%)\\t%0, {%1, %2}\", ldm); + else + output_asm_insn (\"ldm%(ia%)\\t%0, {%1, %2}\", ldm); + } + else + { + if (val1 < val2) + output_asm_insn (\"ldm%(ia%)\\t%0, {%1, %2}\", ldm); + else + output_asm_insn (\"ldm%(da%)\\t%0, {%1, %2}\", ldm); + } + output_asm_insn (\"%I3%?\\t%0, %1, %2\", arith); + return \"\"; + }" + [(set_attr "length" "12") + (set_attr "predicable" "yes") + (set_attr "type" "load1")] +) + +; This pattern is never tried by combine, so do it as a peephole + +(define_peephole2 + [(set (match_operand:SI 0 "arm_general_register_operand" "") + (match_operand:SI 1 "arm_general_register_operand" "")) + (set (reg:CC CC_REGNUM) + (compare:CC (match_dup 1) (const_int 0)))] + "TARGET_ARM" + [(parallel [(set (reg:CC CC_REGNUM) (compare:CC (match_dup 1) (const_int 0))) + (set (match_dup 0) (match_dup 1))])] + "" +) + +(define_split + [(set (match_operand:SI 0 "s_register_operand" "") + (and:SI (ge:SI (match_operand:SI 1 "s_register_operand" "") + (const_int 0)) + (neg:SI (match_operator:SI 2 "arm_comparison_operator" + [(match_operand:SI 3 "s_register_operand" "") + (match_operand:SI 4 "arm_rhs_operand" "")])))) + (clobber (match_operand:SI 5 "s_register_operand" ""))] + "TARGET_ARM" + [(set (match_dup 5) (not:SI (ashiftrt:SI (match_dup 1) (const_int 31)))) + (set (match_dup 0) (and:SI (match_op_dup 2 [(match_dup 3) (match_dup 4)]) + (match_dup 5)))] + "" +) + +;; This split can be used because CC_Z mode implies that the following +;; branch will be an equality, or an unsigned inequality, so the sign +;; extension is not needed. + +(define_split + [(set (reg:CC_Z CC_REGNUM) + (compare:CC_Z + (ashift:SI (subreg:SI (match_operand:QI 0 "memory_operand" "") 0) + (const_int 24)) + (match_operand 1 "const_int_operand" ""))) + (clobber (match_scratch:SI 2 ""))] + "TARGET_ARM + && (((unsigned HOST_WIDE_INT) INTVAL (operands[1])) + == (((unsigned HOST_WIDE_INT) INTVAL (operands[1])) >> 24) << 24)" + [(set (match_dup 2) (zero_extend:SI (match_dup 0))) + (set (reg:CC CC_REGNUM) (compare:CC (match_dup 2) (match_dup 1)))] + " + operands[1] = GEN_INT (((unsigned long) INTVAL (operands[1])) >> 24); + " +) +;; ??? Check the patterns above for Thumb-2 usefulness + +(define_expand "prologue" + [(clobber (const_int 0))] + "TARGET_EITHER" + "if (TARGET_32BIT) + arm_expand_prologue (); + else + thumb1_expand_prologue (); + DONE; + " +) + +(define_expand "epilogue" + [(clobber (const_int 0))] + "TARGET_EITHER" + " + if (crtl->calls_eh_return) + emit_insn (gen_prologue_use (gen_rtx_REG (Pmode, 2))); + if (TARGET_THUMB1) + thumb1_expand_epilogue (); + else if (USE_RETURN_INSN (FALSE)) + { + emit_jump_insn (gen_return ()); + DONE; + } + emit_jump_insn (gen_rtx_UNSPEC_VOLATILE (VOIDmode, + gen_rtvec (1, + gen_rtx_RETURN (VOIDmode)), + VUNSPEC_EPILOGUE)); + DONE; + " +) + +;; Note - although unspec_volatile's USE all hard registers, +;; USEs are ignored after relaod has completed. Thus we need +;; to add an unspec of the link register to ensure that flow +;; does not think that it is unused by the sibcall branch that +;; will replace the standard function epilogue. +(define_insn "sibcall_epilogue" + [(parallel [(unspec:SI [(reg:SI LR_REGNUM)] UNSPEC_PROLOGUE_USE) + (unspec_volatile [(return)] VUNSPEC_EPILOGUE)])] + "TARGET_32BIT" + "* + if (use_return_insn (FALSE, next_nonnote_insn (insn))) + return output_return_instruction (const_true_rtx, FALSE, FALSE); + return arm_output_epilogue (next_nonnote_insn (insn)); + " +;; Length is absolute worst case + [(set_attr "length" "44") + (set_attr "type" "block") + ;; We don't clobber the conditions, but the potential length of this + ;; operation is sufficient to make conditionalizing the sequence + ;; unlikely to be profitable. + (set_attr "conds" "clob")] +) + +(define_insn "*epilogue_insns" + [(unspec_volatile [(return)] VUNSPEC_EPILOGUE)] + "TARGET_EITHER" + "* + if (TARGET_32BIT) + return arm_output_epilogue (NULL); + else /* TARGET_THUMB1 */ + return thumb_unexpanded_epilogue (); + " + ; Length is absolute worst case + [(set_attr "length" "44") + (set_attr "type" "block") + ;; We don't clobber the conditions, but the potential length of this + ;; operation is sufficient to make conditionalizing the sequence + ;; unlikely to be profitable. + (set_attr "conds" "clob")] +) + +(define_expand "eh_epilogue" + [(use (match_operand:SI 0 "register_operand" "")) + (use (match_operand:SI 1 "register_operand" "")) + (use (match_operand:SI 2 "register_operand" ""))] + "TARGET_EITHER" + " + { + cfun->machine->eh_epilogue_sp_ofs = operands[1]; + if (GET_CODE (operands[2]) != REG || REGNO (operands[2]) != 2) + { + rtx ra = gen_rtx_REG (Pmode, 2); + + emit_move_insn (ra, operands[2]); + operands[2] = ra; + } + /* This is a hack -- we may have crystalized the function type too + early. */ + cfun->machine->func_type = 0; + }" +) + +;; This split is only used during output to reduce the number of patterns +;; that need assembler instructions adding to them. We allowed the setting +;; of the conditions to be implicit during rtl generation so that +;; the conditional compare patterns would work. However this conflicts to +;; some extent with the conditional data operations, so we have to split them +;; up again here. + +;; ??? Need to audit these splitters for Thumb-2. Why isn't normal +;; conditional execution sufficient? + +(define_split + [(set (match_operand:SI 0 "s_register_operand" "") + (if_then_else:SI (match_operator 1 "arm_comparison_operator" + [(match_operand 2 "" "") (match_operand 3 "" "")]) + (match_dup 0) + (match_operand 4 "" ""))) + (clobber (reg:CC CC_REGNUM))] + "TARGET_ARM && reload_completed" + [(set (match_dup 5) (match_dup 6)) + (cond_exec (match_dup 7) + (set (match_dup 0) (match_dup 4)))] + " + { + enum machine_mode mode = SELECT_CC_MODE (GET_CODE (operands[1]), + operands[2], operands[3]); + enum rtx_code rc = GET_CODE (operands[1]); + + operands[5] = gen_rtx_REG (mode, CC_REGNUM); + operands[6] = gen_rtx_COMPARE (mode, operands[2], operands[3]); + if (mode == CCFPmode || mode == CCFPEmode) + rc = reverse_condition_maybe_unordered (rc); + else + rc = reverse_condition (rc); + + operands[7] = gen_rtx_fmt_ee (rc, VOIDmode, operands[5], const0_rtx); + }" +) + +(define_split + [(set (match_operand:SI 0 "s_register_operand" "") + (if_then_else:SI (match_operator 1 "arm_comparison_operator" + [(match_operand 2 "" "") (match_operand 3 "" "")]) + (match_operand 4 "" "") + (match_dup 0))) + (clobber (reg:CC CC_REGNUM))] + "TARGET_ARM && reload_completed" + [(set (match_dup 5) (match_dup 6)) + (cond_exec (match_op_dup 1 [(match_dup 5) (const_int 0)]) + (set (match_dup 0) (match_dup 4)))] + " + { + enum machine_mode mode = SELECT_CC_MODE (GET_CODE (operands[1]), + operands[2], operands[3]); + + operands[5] = gen_rtx_REG (mode, CC_REGNUM); + operands[6] = gen_rtx_COMPARE (mode, operands[2], operands[3]); + }" +) + +(define_split + [(set (match_operand:SI 0 "s_register_operand" "") + (if_then_else:SI (match_operator 1 "arm_comparison_operator" + [(match_operand 2 "" "") (match_operand 3 "" "")]) + (match_operand 4 "" "") + (match_operand 5 "" ""))) + (clobber (reg:CC CC_REGNUM))] + "TARGET_ARM && reload_completed" + [(set (match_dup 6) (match_dup 7)) + (cond_exec (match_op_dup 1 [(match_dup 6) (const_int 0)]) + (set (match_dup 0) (match_dup 4))) + (cond_exec (match_dup 8) + (set (match_dup 0) (match_dup 5)))] + " + { + enum machine_mode mode = SELECT_CC_MODE (GET_CODE (operands[1]), + operands[2], operands[3]); + enum rtx_code rc = GET_CODE (operands[1]); + + operands[6] = gen_rtx_REG (mode, CC_REGNUM); + operands[7] = gen_rtx_COMPARE (mode, operands[2], operands[3]); + if (mode == CCFPmode || mode == CCFPEmode) + rc = reverse_condition_maybe_unordered (rc); + else + rc = reverse_condition (rc); + + operands[8] = gen_rtx_fmt_ee (rc, VOIDmode, operands[6], const0_rtx); + }" +) + +(define_split + [(set (match_operand:SI 0 "s_register_operand" "") + (if_then_else:SI (match_operator 1 "arm_comparison_operator" + [(match_operand:SI 2 "s_register_operand" "") + (match_operand:SI 3 "arm_add_operand" "")]) + (match_operand:SI 4 "arm_rhs_operand" "") + (not:SI + (match_operand:SI 5 "s_register_operand" "")))) + (clobber (reg:CC CC_REGNUM))] + "TARGET_ARM && reload_completed" + [(set (match_dup 6) (match_dup 7)) + (cond_exec (match_op_dup 1 [(match_dup 6) (const_int 0)]) + (set (match_dup 0) (match_dup 4))) + (cond_exec (match_dup 8) + (set (match_dup 0) (not:SI (match_dup 5))))] + " + { + enum machine_mode mode = SELECT_CC_MODE (GET_CODE (operands[1]), + operands[2], operands[3]); + enum rtx_code rc = GET_CODE (operands[1]); + + operands[6] = gen_rtx_REG (mode, CC_REGNUM); + operands[7] = gen_rtx_COMPARE (mode, operands[2], operands[3]); + if (mode == CCFPmode || mode == CCFPEmode) + rc = reverse_condition_maybe_unordered (rc); + else + rc = reverse_condition (rc); + + operands[8] = gen_rtx_fmt_ee (rc, VOIDmode, operands[6], const0_rtx); + }" +) + +(define_insn "*cond_move_not" + [(set (match_operand:SI 0 "s_register_operand" "=r,r") + (if_then_else:SI (match_operator 4 "arm_comparison_operator" + [(match_operand 3 "cc_register" "") (const_int 0)]) + (match_operand:SI 1 "arm_rhs_operand" "0,?rI") + (not:SI + (match_operand:SI 2 "s_register_operand" "r,r"))))] + "TARGET_ARM" + "@ + mvn%D4\\t%0, %2 + mov%d4\\t%0, %1\;mvn%D4\\t%0, %2" + [(set_attr "conds" "use") + (set_attr "insn" "mvn") + (set_attr "length" "4,8")] +) + +;; The next two patterns occur when an AND operation is followed by a +;; scc insn sequence + +(define_insn "*sign_extract_onebit" + [(set (match_operand:SI 0 "s_register_operand" "=r") + (sign_extract:SI (match_operand:SI 1 "s_register_operand" "r") + (const_int 1) + (match_operand:SI 2 "const_int_operand" "n"))) + (clobber (reg:CC CC_REGNUM))] + "TARGET_ARM" + "* + operands[2] = GEN_INT (1 << INTVAL (operands[2])); + output_asm_insn (\"ands\\t%0, %1, %2\", operands); + return \"mvnne\\t%0, #0\"; + " + [(set_attr "conds" "clob") + (set_attr "length" "8")] +) + +(define_insn "*not_signextract_onebit" + [(set (match_operand:SI 0 "s_register_operand" "=r") + (not:SI + (sign_extract:SI (match_operand:SI 1 "s_register_operand" "r") + (const_int 1) + (match_operand:SI 2 "const_int_operand" "n")))) + (clobber (reg:CC CC_REGNUM))] + "TARGET_ARM" + "* + operands[2] = GEN_INT (1 << INTVAL (operands[2])); + output_asm_insn (\"tst\\t%1, %2\", operands); + output_asm_insn (\"mvneq\\t%0, #0\", operands); + return \"movne\\t%0, #0\"; + " + [(set_attr "conds" "clob") + (set_attr "length" "12")] +) +;; ??? The above patterns need auditing for Thumb-2 + +;; Push multiple registers to the stack. Registers are in parallel (use ...) +;; expressions. For simplicity, the first register is also in the unspec +;; part. +(define_insn "*push_multi" + [(match_parallel 2 "multi_register_push" + [(set (match_operand:BLK 0 "memory_operand" "=m") + (unspec:BLK [(match_operand:SI 1 "s_register_operand" "")] + UNSPEC_PUSH_MULT))])] + "TARGET_32BIT" + "* + { + int num_saves = XVECLEN (operands[2], 0); + + /* For the StrongARM at least it is faster to + use STR to store only a single register. + In Thumb mode always use push, and the assembler will pick + something appropriate. */ + if (num_saves == 1 && TARGET_ARM) + output_asm_insn (\"str\\t%1, [%m0, #-4]!\", operands); + else + { + int i; + char pattern[100]; + + if (TARGET_ARM) + strcpy (pattern, \"stmfd\\t%m0!, {%1\"); + else + strcpy (pattern, \"push\\t{%1\"); + + for (i = 1; i < num_saves; i++) + { + strcat (pattern, \", %|\"); + strcat (pattern, + reg_names[REGNO (XEXP (XVECEXP (operands[2], 0, i), 0))]); + } + + strcat (pattern, \"}\"); + output_asm_insn (pattern, operands); + } + + return \"\"; + }" + [(set_attr "type" "store4")] +) + +(define_insn "stack_tie" + [(set (mem:BLK (scratch)) + (unspec:BLK [(match_operand:SI 0 "s_register_operand" "rk") + (match_operand:SI 1 "s_register_operand" "rk")] + UNSPEC_PRLG_STK))] + "" + "" + [(set_attr "length" "0")] +) + +;; Similarly for the floating point registers +(define_insn "*push_fp_multi" + [(match_parallel 2 "multi_register_push" + [(set (match_operand:BLK 0 "memory_operand" "=m") + (unspec:BLK [(match_operand:XF 1 "f_register_operand" "")] + UNSPEC_PUSH_MULT))])] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA" + "* + { + char pattern[100]; + + sprintf (pattern, \"sfmfd\\t%%1, %d, [%%m0]!\", XVECLEN (operands[2], 0)); + output_asm_insn (pattern, operands); + return \"\"; + }" + [(set_attr "type" "f_fpa_store")] +) + +;; Special patterns for dealing with the constant pool + +(define_insn "align_4" + [(unspec_volatile [(const_int 0)] VUNSPEC_ALIGN)] + "TARGET_EITHER" + "* + assemble_align (32); + return \"\"; + " +) + +(define_insn "align_8" + [(unspec_volatile [(const_int 0)] VUNSPEC_ALIGN8)] + "TARGET_EITHER" + "* + assemble_align (64); + return \"\"; + " +) + +(define_insn "consttable_end" + [(unspec_volatile [(const_int 0)] VUNSPEC_POOL_END)] + "TARGET_EITHER" + "* + making_const_table = FALSE; + return \"\"; + " +) + +(define_insn "consttable_1" + [(unspec_volatile [(match_operand 0 "" "")] VUNSPEC_POOL_1)] + "TARGET_THUMB1" + "* + making_const_table = TRUE; + assemble_integer (operands[0], 1, BITS_PER_WORD, 1); + assemble_zeros (3); + return \"\"; + " + [(set_attr "length" "4")] +) + +(define_insn "consttable_2" + [(unspec_volatile [(match_operand 0 "" "")] VUNSPEC_POOL_2)] + "TARGET_THUMB1" + "* + making_const_table = TRUE; + gcc_assert (GET_MODE_CLASS (GET_MODE (operands[0])) != MODE_FLOAT); + assemble_integer (operands[0], 2, BITS_PER_WORD, 1); + assemble_zeros (2); + return \"\"; + " + [(set_attr "length" "4")] +) + +(define_insn "consttable_4" + [(unspec_volatile [(match_operand 0 "" "")] VUNSPEC_POOL_4)] + "TARGET_EITHER" + "* + { + rtx x = operands[0]; + making_const_table = TRUE; + switch (GET_MODE_CLASS (GET_MODE (x))) + { + case MODE_FLOAT: + if (GET_MODE (x) == HFmode) + arm_emit_fp16_const (x); + else + { + REAL_VALUE_TYPE r; + REAL_VALUE_FROM_CONST_DOUBLE (r, x); + assemble_real (r, GET_MODE (x), BITS_PER_WORD); + } + break; + default: + /* XXX: Sometimes gcc does something really dumb and ends up with + a HIGH in a constant pool entry, usually because it's trying to + load into a VFP register. We know this will always be used in + combination with a LO_SUM which ignores the high bits, so just + strip off the HIGH. */ + if (GET_CODE (x) == HIGH) + x = XEXP (x, 0); + assemble_integer (x, 4, BITS_PER_WORD, 1); + mark_symbol_refs_as_used (x); + break; + } + return \"\"; + }" + [(set_attr "length" "4")] +) + +(define_insn "consttable_8" + [(unspec_volatile [(match_operand 0 "" "")] VUNSPEC_POOL_8)] + "TARGET_EITHER" + "* + { + making_const_table = TRUE; + switch (GET_MODE_CLASS (GET_MODE (operands[0]))) + { + case MODE_FLOAT: + { + REAL_VALUE_TYPE r; + REAL_VALUE_FROM_CONST_DOUBLE (r, operands[0]); + assemble_real (r, GET_MODE (operands[0]), BITS_PER_WORD); + break; + } + default: + assemble_integer (operands[0], 8, BITS_PER_WORD, 1); + break; + } + return \"\"; + }" + [(set_attr "length" "8")] +) + +(define_insn "consttable_16" + [(unspec_volatile [(match_operand 0 "" "")] VUNSPEC_POOL_16)] + "TARGET_EITHER" + "* + { + making_const_table = TRUE; + switch (GET_MODE_CLASS (GET_MODE (operands[0]))) + { + case MODE_FLOAT: + { + REAL_VALUE_TYPE r; + REAL_VALUE_FROM_CONST_DOUBLE (r, operands[0]); + assemble_real (r, GET_MODE (operands[0]), BITS_PER_WORD); + break; + } + default: + assemble_integer (operands[0], 16, BITS_PER_WORD, 1); + break; + } + return \"\"; + }" + [(set_attr "length" "16")] +) + +;; Miscellaneous Thumb patterns + +(define_expand "tablejump" + [(parallel [(set (pc) (match_operand:SI 0 "register_operand" "")) + (use (label_ref (match_operand 1 "" "")))])] + "TARGET_THUMB1" + " + if (flag_pic) + { + /* Hopefully, CSE will eliminate this copy. */ + rtx reg1 = copy_addr_to_reg (gen_rtx_LABEL_REF (Pmode, operands[1])); + rtx reg2 = gen_reg_rtx (SImode); + + emit_insn (gen_addsi3 (reg2, operands[0], reg1)); + operands[0] = reg2; + } + " +) + +;; NB never uses BX. +(define_insn "*thumb1_tablejump" + [(set (pc) (match_operand:SI 0 "register_operand" "l*r")) + (use (label_ref (match_operand 1 "" "")))] + "TARGET_THUMB1" + "mov\\t%|pc, %0" + [(set_attr "length" "2")] +) + +;; V5 Instructions, + +(define_insn "clzsi2" + [(set (match_operand:SI 0 "s_register_operand" "=r") + (clz:SI (match_operand:SI 1 "s_register_operand" "r")))] + "TARGET_32BIT && arm_arch5" + "clz%?\\t%0, %1" + [(set_attr "predicable" "yes") + (set_attr "insn" "clz")]) + +(define_insn "rbitsi2" + [(set (match_operand:SI 0 "s_register_operand" "=r") + (unspec:SI [(match_operand:SI 1 "s_register_operand" "r")] UNSPEC_RBIT))] + "TARGET_32BIT && arm_arch_thumb2" + "rbit%?\\t%0, %1" + [(set_attr "predicable" "yes") + (set_attr "insn" "clz")]) + +(define_expand "ctzsi2" + [(set (match_operand:SI 0 "s_register_operand" "") + (ctz:SI (match_operand:SI 1 "s_register_operand" "")))] + "TARGET_32BIT && arm_arch_thumb2" + " + { + rtx tmp = gen_reg_rtx (SImode); + emit_insn (gen_rbitsi2 (tmp, operands[1])); + emit_insn (gen_clzsi2 (operands[0], tmp)); + } + DONE; + " +) + +;; V5E instructions. + +(define_insn "prefetch" + [(prefetch (match_operand:SI 0 "address_operand" "p") + (match_operand:SI 1 "" "") + (match_operand:SI 2 "" ""))] + "TARGET_32BIT && arm_arch5e" + "pld\\t%a0") + +;; General predication pattern + +(define_cond_exec + [(match_operator 0 "arm_comparison_operator" + [(match_operand 1 "cc_register" "") + (const_int 0)])] + "TARGET_32BIT" + "" +) + +(define_insn "prologue_use" + [(unspec:SI [(match_operand:SI 0 "register_operand" "")] UNSPEC_PROLOGUE_USE)] + "" + "%@ %0 needed for prologue" + [(set_attr "length" "0")] +) + + +;; Patterns for exception handling + +(define_expand "eh_return" + [(use (match_operand 0 "general_operand" ""))] + "TARGET_EITHER" + " + { + if (TARGET_32BIT) + emit_insn (gen_arm_eh_return (operands[0])); + else + emit_insn (gen_thumb_eh_return (operands[0])); + DONE; + }" +) + +;; We can't expand this before we know where the link register is stored. +(define_insn_and_split "arm_eh_return" + [(unspec_volatile [(match_operand:SI 0 "s_register_operand" "r")] + VUNSPEC_EH_RETURN) + (clobber (match_scratch:SI 1 "=&r"))] + "TARGET_ARM" + "#" + "&& reload_completed" + [(const_int 0)] + " + { + arm_set_return_address (operands[0], operands[1]); + DONE; + }" +) + +(define_insn_and_split "thumb_eh_return" + [(unspec_volatile [(match_operand:SI 0 "s_register_operand" "l")] + VUNSPEC_EH_RETURN) + (clobber (match_scratch:SI 1 "=&l"))] + "TARGET_THUMB1" + "#" + "&& reload_completed" + [(const_int 0)] + " + { + thumb_set_return_address (operands[0], operands[1]); + DONE; + }" +) + + +;; TLS support + +(define_insn "load_tp_hard" + [(set (match_operand:SI 0 "register_operand" "=r") + (unspec:SI [(const_int 0)] UNSPEC_TLS))] + "TARGET_HARD_TP" + "mrc%?\\tp15, 0, %0, c13, c0, 3\\t@ load_tp_hard" + [(set_attr "predicable" "yes")] +) + +;; Doesn't clobber R1-R3. Must use r0 for the first operand. +(define_insn "load_tp_soft" + [(set (reg:SI 0) (unspec:SI [(const_int 0)] UNSPEC_TLS)) + (clobber (reg:SI LR_REGNUM)) + (clobber (reg:SI IP_REGNUM)) + (clobber (reg:CC CC_REGNUM))] + "TARGET_SOFT_TP" + "bl\\t__aeabi_read_tp\\t@ load_tp_soft" + [(set_attr "conds" "clob")] +) + +;; We only care about the lower 16 bits of the constant +;; being inserted into the upper 16 bits of the register. +(define_insn "*arm_movtas_ze" + [(set (zero_extract:SI (match_operand:SI 0 "s_register_operand" "+r") + (const_int 16) + (const_int 16)) + (match_operand:SI 1 "const_int_operand" ""))] + "arm_arch_thumb2" + "movt%?\t%0, %L1" + [(set_attr "predicable" "yes") + (set_attr "length" "4")] +) + +(define_insn "*arm_rev" + [(set (match_operand:SI 0 "s_register_operand" "=r") + (bswap:SI (match_operand:SI 1 "s_register_operand" "r")))] + "TARGET_32BIT && arm_arch6" + "rev%?\t%0, %1" + [(set_attr "predicable" "yes") + (set_attr "length" "4")] +) + +(define_insn "*thumb1_rev" + [(set (match_operand:SI 0 "s_register_operand" "=l") + (bswap:SI (match_operand:SI 1 "s_register_operand" "l")))] + "TARGET_THUMB1 && arm_arch6" + "rev\t%0, %1" + [(set_attr "length" "2")] +) + +(define_expand "arm_legacy_rev" + [(set (match_operand:SI 2 "s_register_operand" "") + (xor:SI (rotatert:SI (match_operand:SI 1 "s_register_operand" "") + (const_int 16)) + (match_dup 1))) + (set (match_dup 2) + (lshiftrt:SI (match_dup 2) + (const_int 8))) + (set (match_operand:SI 3 "s_register_operand" "") + (rotatert:SI (match_dup 1) + (const_int 8))) + (set (match_dup 2) + (and:SI (match_dup 2) + (const_int -65281))) + (set (match_operand:SI 0 "s_register_operand" "") + (xor:SI (match_dup 3) + (match_dup 2)))] + "TARGET_32BIT" + "" +) + +;; Reuse temporaries to keep register pressure down. +(define_expand "thumb_legacy_rev" + [(set (match_operand:SI 2 "s_register_operand" "") + (ashift:SI (match_operand:SI 1 "s_register_operand" "") + (const_int 24))) + (set (match_operand:SI 3 "s_register_operand" "") + (lshiftrt:SI (match_dup 1) + (const_int 24))) + (set (match_dup 3) + (ior:SI (match_dup 3) + (match_dup 2))) + (set (match_operand:SI 4 "s_register_operand" "") + (const_int 16)) + (set (match_operand:SI 5 "s_register_operand" "") + (rotatert:SI (match_dup 1) + (match_dup 4))) + (set (match_dup 2) + (ashift:SI (match_dup 5) + (const_int 24))) + (set (match_dup 5) + (lshiftrt:SI (match_dup 5) + (const_int 24))) + (set (match_dup 5) + (ior:SI (match_dup 5) + (match_dup 2))) + (set (match_dup 5) + (rotatert:SI (match_dup 5) + (match_dup 4))) + (set (match_operand:SI 0 "s_register_operand" "") + (ior:SI (match_dup 5) + (match_dup 3)))] + "TARGET_THUMB" + "" +) + +(define_expand "bswapsi2" + [(set (match_operand:SI 0 "s_register_operand" "=r") + (bswap:SI (match_operand:SI 1 "s_register_operand" "r")))] +"TARGET_EITHER && (arm_arch6 || !optimize_size)" +" + if (!arm_arch6) + { + rtx op2 = gen_reg_rtx (SImode); + rtx op3 = gen_reg_rtx (SImode); + + if (TARGET_THUMB) + { + rtx op4 = gen_reg_rtx (SImode); + rtx op5 = gen_reg_rtx (SImode); + + emit_insn (gen_thumb_legacy_rev (operands[0], operands[1], + op2, op3, op4, op5)); + } + else + { + emit_insn (gen_arm_legacy_rev (operands[0], operands[1], + op2, op3)); + } + + DONE; + } + " +) + +;; Load the load/store multiple patterns +(include "ldmstm.md") +;; Load the FPA co-processor patterns +(include "fpa.md") +;; Load the Maverick co-processor patterns +(include "cirrus.md") +;; Vector bits common to IWMMXT and Neon +(include "vec-common.md") +;; Load the Intel Wireless Multimedia Extension patterns +(include "iwmmxt.md") +;; Load the VFP co-processor patterns +(include "vfp.md") +;; Thumb-2 patterns +(include "thumb2.md") +;; Neon patterns +(include "neon.md") +;; Synchronization Primitives +(include "sync.md") diff --git a/gcc/config/arm/arm.opt b/gcc/config/arm/arm.opt new file mode 100644 index 000000000..a39bb3a8d --- /dev/null +++ b/gcc/config/arm/arm.opt @@ -0,0 +1,171 @@ +; Options for the ARM port of the compiler. + +; Copyright (C) 2005, 2007, 2008 Free Software Foundation, Inc. +; +; This file is part of GCC. +; +; GCC is free software; you can redistribute it and/or modify it under +; the terms of the GNU General Public License as published by the Free +; Software Foundation; either version 3, or (at your option) any later +; version. +; +; GCC is distributed in the hope that it will be useful, but WITHOUT ANY +; WARRANTY; without even the implied warranty of MERCHANTABILITY or +; FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +; for more details. +; +; You should have received a copy of the GNU General Public License +; along with GCC; see the file COPYING3. If not see +; . + +mabi= +Target RejectNegative Joined Var(target_abi_name) +Specify an ABI + +mabort-on-noreturn +Target Report Mask(ABORT_NORETURN) +Generate a call to abort if a noreturn function returns + +mapcs +Target RejectNegative Mask(APCS_FRAME) MaskExists Undocumented + +mapcs-float +Target Report Mask(APCS_FLOAT) +Pass FP arguments in FP registers + +mapcs-frame +Target Report Mask(APCS_FRAME) +Generate APCS conformant stack frames + +mapcs-reentrant +Target Report Mask(APCS_REENT) +Generate re-entrant, PIC code + +mapcs-stack-check +Target Report Mask(APCS_STACK) Undocumented + +march= +Target RejectNegative Joined +Specify the name of the target architecture + +marm +Target RejectNegative InverseMask(THUMB) Undocumented + +mbig-endian +Target Report RejectNegative Mask(BIG_END) +Assume target CPU is configured as big endian + +mcallee-super-interworking +Target Report Mask(CALLEE_INTERWORKING) +Thumb: Assume non-static functions may be called from ARM code + +mcaller-super-interworking +Target Report Mask(CALLER_INTERWORKING) +Thumb: Assume function pointers may go to non-Thumb aware code + +mcirrus-fix-invalid-insns +Target Report Mask(CIRRUS_FIX_INVALID_INSNS) +Cirrus: Place NOPs to avoid invalid instruction combinations + +mcpu= +Target RejectNegative Joined +Specify the name of the target CPU + +mfloat-abi= +Target RejectNegative Joined Var(target_float_abi_name) +Specify if floating point hardware should be used + +mfp= +Target RejectNegative Joined Undocumented Var(target_fpe_name) + +mfp16-format= +Target RejectNegative Joined Var(target_fp16_format_name) +Specify the __fp16 floating-point format + +;; Now ignored. +mfpe +Target RejectNegative Mask(FPE) Undocumented + +mfpe= +Target RejectNegative Joined Undocumented Var(target_fpe_name) + +mfpu= +Target RejectNegative Joined Var(target_fpu_name) +Specify the name of the target floating point hardware/format + +mhard-float +Target RejectNegative +Alias for -mfloat-abi=hard + +mlittle-endian +Target Report RejectNegative InverseMask(BIG_END) +Assume target CPU is configured as little endian + +mlong-calls +Target Report Mask(LONG_CALLS) +Generate call insns as indirect calls, if necessary + +mpic-register= +Target RejectNegative Joined Var(arm_pic_register_string) +Specify the register to be used for PIC addressing + +mpoke-function-name +Target Report Mask(POKE_FUNCTION_NAME) +Store function names in object code + +msched-prolog +Target Report Mask(SCHED_PROLOG) +Permit scheduling of a function's prologue sequence + +msingle-pic-base +Target Report Mask(SINGLE_PIC_BASE) +Do not load the PIC register in function prologues + +msoft-float +Target RejectNegative +Alias for -mfloat-abi=soft + +mstructure-size-boundary= +Target RejectNegative Joined Var(structure_size_string) +Specify the minimum bit alignment of structures + +mthumb +Target Report Mask(THUMB) +Compile for the Thumb not the ARM + +mthumb-interwork +Target Report Mask(INTERWORK) +Support calls between Thumb and ARM instruction sets + +mtp= +Target RejectNegative Joined Var(target_thread_switch) +Specify how to access the thread pointer + +mtpcs-frame +Target Report Mask(TPCS_FRAME) +Thumb: Generate (non-leaf) stack frames even if not needed + +mtpcs-leaf-frame +Target Report Mask(TPCS_LEAF_FRAME) +Thumb: Generate (leaf) stack frames even if not needed + +mtune= +Target RejectNegative Joined +Tune code for the given processor + +mwords-little-endian +Target Report RejectNegative Mask(LITTLE_WORDS) +Assume big endian bytes, little endian words + +mvectorize-with-neon-quad +Target Report Mask(NEON_VECTORIZE_QUAD) +Use Neon quad-word (rather than double-word) registers for vectorization + +mword-relocations +Target Report Var(target_word_relocations) Init(TARGET_DEFAULT_WORD_RELOCATIONS) +Only generate absolute relocations on word sized values. + +mfix-cortex-m3-ldrd +Target Report Var(fix_cm3_ldrd) Init(2) +Avoid overlapping destination and address registers on LDRD instructions +that may trigger Cortex-M3 errata. diff --git a/gcc/config/arm/arm1020e.md b/gcc/config/arm/arm1020e.md new file mode 100644 index 000000000..280af12f9 --- /dev/null +++ b/gcc/config/arm/arm1020e.md @@ -0,0 +1,375 @@ +;; ARM 1020E & ARM 1022E Pipeline Description +;; Copyright (C) 2005, 2007, 2008 Free Software Foundation, Inc. +;; Contributed by Richard Earnshaw (richard.earnshaw@arm.com) +;; +;; This file is part of GCC. +;; +;; GCC is free software; you can redistribute it and/or modify it +;; under the terms of the GNU General Public License as published by +;; the Free Software Foundation; either version 3, or (at your option) +;; any later version. +;; +;; GCC is distributed in the hope that it will be useful, but +;; WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;; General Public License for more details. +;; +;; You should have received a copy of the GNU General Public License +;; along with GCC; see the file COPYING3. If not see +;; . */ + +;; These descriptions are based on the information contained in the +;; ARM1020E Technical Reference Manual, Copyright (c) 2003 ARM +;; Limited. +;; + +;; This automaton provides a pipeline description for the ARM +;; 1020E core. +;; +;; The model given here assumes that the condition for all conditional +;; instructions is "true", i.e., that all of the instructions are +;; actually executed. + +(define_automaton "arm1020e") + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Pipelines +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; There are two pipelines: +;; +;; - An Arithmetic Logic Unit (ALU) pipeline. +;; +;; The ALU pipeline has fetch, issue, decode, execute, memory, and +;; write stages. We only need to model the execute, memory and write +;; stages. +;; +;; - A Load-Store Unit (LSU) pipeline. +;; +;; The LSU pipeline has decode, execute, memory, and write stages. +;; We only model the execute, memory and write stages. + +(define_cpu_unit "1020a_e,1020a_m,1020a_w" "arm1020e") +(define_cpu_unit "1020l_e,1020l_m,1020l_w" "arm1020e") + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; ALU Instructions +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; ALU instructions require three cycles to execute, and use the ALU +;; pipeline in each of the three stages. The results are available +;; after the execute stage stage has finished. +;; +;; If the destination register is the PC, the pipelines are stalled +;; for several cycles. That case is not modeled here. + +;; ALU operations with no shifted operand +(define_insn_reservation "1020alu_op" 1 + (and (eq_attr "tune" "arm1020e,arm1022e") + (eq_attr "type" "alu")) + "1020a_e,1020a_m,1020a_w") + +;; ALU operations with a shift-by-constant operand +(define_insn_reservation "1020alu_shift_op" 1 + (and (eq_attr "tune" "arm1020e,arm1022e") + (eq_attr "type" "alu_shift")) + "1020a_e,1020a_m,1020a_w") + +;; ALU operations with a shift-by-register operand +;; These really stall in the decoder, in order to read +;; the shift value in a second cycle. Pretend we take two cycles in +;; the execute stage. +(define_insn_reservation "1020alu_shift_reg_op" 2 + (and (eq_attr "tune" "arm1020e,arm1022e") + (eq_attr "type" "alu_shift_reg")) + "1020a_e*2,1020a_m,1020a_w") + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Multiplication Instructions +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Multiplication instructions loop in the execute stage until the +;; instruction has been passed through the multiplier array enough +;; times. + +;; The result of the "smul" and "smulw" instructions is not available +;; until after the memory stage. +(define_insn_reservation "1020mult1" 2 + (and (eq_attr "tune" "arm1020e,arm1022e") + (eq_attr "insn" "smulxy,smulwy")) + "1020a_e,1020a_m,1020a_w") + +;; The "smlaxy" and "smlawx" instructions require two iterations through +;; the execute stage; the result is available immediately following +;; the execute stage. +(define_insn_reservation "1020mult2" 2 + (and (eq_attr "tune" "arm1020e,arm1022e") + (eq_attr "insn" "smlaxy,smlalxy,smlawx")) + "1020a_e*2,1020a_m,1020a_w") + +;; The "smlalxy", "mul", and "mla" instructions require two iterations +;; through the execute stage; the result is not available until after +;; the memory stage. +(define_insn_reservation "1020mult3" 3 + (and (eq_attr "tune" "arm1020e,arm1022e") + (eq_attr "insn" "smlalxy,mul,mla")) + "1020a_e*2,1020a_m,1020a_w") + +;; The "muls" and "mlas" instructions loop in the execute stage for +;; four iterations in order to set the flags. The value result is +;; available after three iterations. +(define_insn_reservation "1020mult4" 3 + (and (eq_attr "tune" "arm1020e,arm1022e") + (eq_attr "insn" "muls,mlas")) + "1020a_e*4,1020a_m,1020a_w") + +;; Long multiply instructions that produce two registers of +;; output (such as umull) make their results available in two cycles; +;; the least significant word is available before the most significant +;; word. That fact is not modeled; instead, the instructions are +;; described.as if the entire result was available at the end of the +;; cycle in which both words are available. + +;; The "umull", "umlal", "smull", and "smlal" instructions all take +;; three iterations through the execute cycle, and make their results +;; available after the memory cycle. +(define_insn_reservation "1020mult5" 4 + (and (eq_attr "tune" "arm1020e,arm1022e") + (eq_attr "insn" "umull,umlal,smull,smlal")) + "1020a_e*3,1020a_m,1020a_w") + +;; The "umulls", "umlals", "smulls", and "smlals" instructions loop in +;; the execute stage for five iterations in order to set the flags. +;; The value result is available after four iterations. +(define_insn_reservation "1020mult6" 4 + (and (eq_attr "tune" "arm1020e,arm1022e") + (eq_attr "insn" "umulls,umlals,smulls,smlals")) + "1020a_e*5,1020a_m,1020a_w") + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Load/Store Instructions +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; The models for load/store instructions do not accurately describe +;; the difference between operations with a base register writeback +;; (such as "ldm!"). These models assume that all memory references +;; hit in dcache. + +;; LSU instructions require six cycles to execute. They use the ALU +;; pipeline in all but the 5th cycle, and the LSU pipeline in cycles +;; three through six. +;; Loads and stores which use a scaled register offset or scaled +;; register pre-indexed addressing mode take three cycles EXCEPT for +;; those that are base + offset with LSL of 0 or 2, or base - offset +;; with LSL of zero. The remainder take 1 cycle to execute. +;; For 4byte loads there is a bypass from the load stage + +(define_insn_reservation "1020load1_op" 2 + (and (eq_attr "tune" "arm1020e,arm1022e") + (eq_attr "type" "load_byte,load1")) + "1020a_e+1020l_e,1020l_m,1020l_w") + +(define_insn_reservation "1020store1_op" 0 + (and (eq_attr "tune" "arm1020e,arm1022e") + (eq_attr "type" "store1")) + "1020a_e+1020l_e,1020l_m,1020l_w") + +;; A load's result can be stored by an immediately following store +(define_bypass 1 "1020load1_op" "1020store1_op" "arm_no_early_store_addr_dep") + +;; On a LDM/STM operation, the LSU pipeline iterates until all of the +;; registers have been processed. +;; +;; The time it takes to load the data depends on whether or not the +;; base address is 64-bit aligned; if it is not, an additional cycle +;; is required. This model assumes that the address is always 64-bit +;; aligned. Because the processor can load two registers per cycle, +;; that assumption means that we use the same instruction reservations +;; for loading 2k and 2k - 1 registers. +;; +;; The ALU pipeline is decoupled after the first cycle unless there is +;; a register dependency; the dependency is cleared as soon as the LDM/STM +;; has dealt with the corresponding register. So for example, +;; stmia sp, {r0-r3} +;; add r0, r0, #4 +;; will have one fewer stalls than +;; stmia sp, {r0-r3} +;; add r3, r3, #4 +;; +;; As with ALU operations, if one of the destination registers is the +;; PC, there are additional stalls; that is not modeled. + +(define_insn_reservation "1020load2_op" 2 + (and (eq_attr "tune" "arm1020e,arm1022e") + (eq_attr "type" "load2")) + "1020a_e+1020l_e,1020l_m,1020l_w") + +(define_insn_reservation "1020store2_op" 0 + (and (eq_attr "tune" "arm1020e,arm1022e") + (eq_attr "type" "store2")) + "1020a_e+1020l_e,1020l_m,1020l_w") + +(define_insn_reservation "1020load34_op" 3 + (and (eq_attr "tune" "arm1020e,arm1022e") + (eq_attr "type" "load3,load4")) + "1020a_e+1020l_e,1020l_e+1020l_m,1020l_m,1020l_w") + +(define_insn_reservation "1020store34_op" 0 + (and (eq_attr "tune" "arm1020e,arm1022e") + (eq_attr "type" "store3,store4")) + "1020a_e+1020l_e,1020l_e+1020l_m,1020l_m,1020l_w") + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Branch and Call Instructions +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Branch instructions are difficult to model accurately. The ARM +;; core can predict most branches. If the branch is predicted +;; correctly, and predicted early enough, the branch can be completely +;; eliminated from the instruction stream. Some branches can +;; therefore appear to require zero cycles to execute. We assume that +;; all branches are predicted correctly, and that the latency is +;; therefore the minimum value. + +(define_insn_reservation "1020branch_op" 0 + (and (eq_attr "tune" "arm1020e,arm1022e") + (eq_attr "type" "branch")) + "1020a_e") + +;; The latency for a call is not predictable. Therefore, we use 32 as +;; roughly equivalent to positive infinity. + +(define_insn_reservation "1020call_op" 32 + (and (eq_attr "tune" "arm1020e,arm1022e") + (eq_attr "type" "call")) + "1020a_e*32") + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; VFP +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(define_cpu_unit "v10_fmac" "arm1020e") + +(define_cpu_unit "v10_ds" "arm1020e") + +(define_cpu_unit "v10_fmstat" "arm1020e") + +(define_cpu_unit "v10_ls1,v10_ls2,v10_ls3" "arm1020e") + +;; fmstat is a serializing instruction. It will stall the core until +;; the mac and ds units have completed. +(exclusion_set "v10_fmac,v10_ds" "v10_fmstat") + +(define_attr "vfp10" "yes,no" + (const (if_then_else (and (eq_attr "tune" "arm1020e,arm1022e") + (eq_attr "fpu" "vfp")) + (const_string "yes") (const_string "no")))) + +;; Note, no instruction can issue to the VFP if the core is stalled in the +;; first execute state. We model this by using 1020a_e in the first cycle. +(define_insn_reservation "v10_ffarith" 5 + (and (eq_attr "vfp10" "yes") + (eq_attr "type" "fcpys,ffariths,ffarithd,fcmps,fcmpd")) + "1020a_e+v10_fmac") + +(define_insn_reservation "v10_farith" 5 + (and (eq_attr "vfp10" "yes") + (eq_attr "type" "faddd,fadds")) + "1020a_e+v10_fmac") + +(define_insn_reservation "v10_cvt" 5 + (and (eq_attr "vfp10" "yes") + (eq_attr "type" "f_cvt")) + "1020a_e+v10_fmac") + +(define_insn_reservation "v10_fmul" 6 + (and (eq_attr "vfp10" "yes") + (eq_attr "type" "fmuls,fmacs,fmuld,fmacd")) + "1020a_e+v10_fmac*2") + +(define_insn_reservation "v10_fdivs" 18 + (and (eq_attr "vfp10" "yes") + (eq_attr "type" "fdivs")) + "1020a_e+v10_ds*14") + +(define_insn_reservation "v10_fdivd" 32 + (and (eq_attr "vfp10" "yes") + (eq_attr "type" "fdivd")) + "1020a_e+v10_fmac+v10_ds*28") + +(define_insn_reservation "v10_floads" 4 + (and (eq_attr "vfp10" "yes") + (eq_attr "type" "f_loads")) + "1020a_e+1020l_e+v10_ls1,v10_ls2") + +;; We model a load of a double as needing all the vfp ls* stage in cycle 1. +;; This gives the correct mix between single-and double loads where a flds +;; followed by and fldd will stall for one cycle, but two back-to-back fldd +;; insns stall for two cycles. +(define_insn_reservation "v10_floadd" 5 + (and (eq_attr "vfp10" "yes") + (eq_attr "type" "f_loadd")) + "1020a_e+1020l_e+v10_ls1+v10_ls2+v10_ls3,v10_ls2+v10_ls3,v10_ls3") + +;; Moves to/from arm regs also use the load/store pipeline. + +(define_insn_reservation "v10_c2v" 4 + (and (eq_attr "vfp10" "yes") + (eq_attr "type" "r_2_f")) + "1020a_e+1020l_e+v10_ls1,v10_ls2") + +(define_insn_reservation "v10_fstores" 1 + (and (eq_attr "vfp10" "yes") + (eq_attr "type" "f_stores")) + "1020a_e+1020l_e+v10_ls1,v10_ls2") + +(define_insn_reservation "v10_fstored" 1 + (and (eq_attr "vfp10" "yes") + (eq_attr "type" "f_stored")) + "1020a_e+1020l_e+v10_ls1+v10_ls2+v10_ls3,v10_ls2+v10_ls3,v10_ls3") + +(define_insn_reservation "v10_v2c" 1 + (and (eq_attr "vfp10" "yes") + (eq_attr "type" "f_2_r")) + "1020a_e+1020l_e,1020l_m,1020l_w") + +(define_insn_reservation "v10_to_cpsr" 2 + (and (eq_attr "vfp10" "yes") + (eq_attr "type" "f_flag")) + "1020a_e+v10_fmstat,1020a_e+1020l_e,1020l_m,1020l_w") + +;; VFP bypasses + +;; There are bypasses for most operations other than store + +(define_bypass 3 + "v10_c2v,v10_floads" + "v10_ffarith,v10_farith,v10_fmul,v10_fdivs,v10_fdivd,v10_cvt") + +(define_bypass 4 + "v10_floadd" + "v10_ffarith,v10_farith,v10_fmul,v10_fdivs,v10_fdivd") + +;; Arithmetic to other arithmetic saves a cycle due to forwarding +(define_bypass 4 + "v10_ffarith,v10_farith" + "v10_ffarith,v10_farith,v10_fmul,v10_fdivs,v10_fdivd") + +(define_bypass 5 + "v10_fmul" + "v10_ffarith,v10_farith,v10_fmul,v10_fdivs,v10_fdivd") + +(define_bypass 17 + "v10_fdivs" + "v10_ffarith,v10_farith,v10_fmul,v10_fdivs,v10_fdivd") + +(define_bypass 31 + "v10_fdivd" + "v10_ffarith,v10_farith,v10_fmul,v10_fdivs,v10_fdivd") + +;; VFP anti-dependencies. + +;; There is one anti-dependence in the following case (not yet modelled): +;; - After a store: one extra cycle for both fsts and fstd +;; Note, back-to-back fstd instructions will overload the load/store datapath +;; causing a two-cycle stall. diff --git a/gcc/config/arm/arm1026ejs.md b/gcc/config/arm/arm1026ejs.md new file mode 100644 index 000000000..e62213638 --- /dev/null +++ b/gcc/config/arm/arm1026ejs.md @@ -0,0 +1,240 @@ +;; ARM 1026EJ-S Pipeline Description +;; Copyright (C) 2003, 2007 Free Software Foundation, Inc. +;; Written by CodeSourcery, LLC. +;; +;; This file is part of GCC. +;; +;; GCC is free software; you can redistribute it and/or modify it +;; under the terms of the GNU General Public License as published by +;; the Free Software Foundation; either version 3, or (at your option) +;; any later version. +;; +;; GCC is distributed in the hope that it will be useful, but +;; WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;; General Public License for more details. +;; +;; You should have received a copy of the GNU General Public License +;; along with GCC; see the file COPYING3. If not see +;; . */ + +;; These descriptions are based on the information contained in the +;; ARM1026EJ-S Technical Reference Manual, Copyright (c) 2003 ARM +;; Limited. +;; + +;; This automaton provides a pipeline description for the ARM +;; 1026EJ-S core. +;; +;; The model given here assumes that the condition for all conditional +;; instructions is "true", i.e., that all of the instructions are +;; actually executed. + +(define_automaton "arm1026ejs") + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Pipelines +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; There are two pipelines: +;; +;; - An Arithmetic Logic Unit (ALU) pipeline. +;; +;; The ALU pipeline has fetch, issue, decode, execute, memory, and +;; write stages. We only need to model the execute, memory and write +;; stages. +;; +;; - A Load-Store Unit (LSU) pipeline. +;; +;; The LSU pipeline has decode, execute, memory, and write stages. +;; We only model the execute, memory and write stages. + +(define_cpu_unit "a_e,a_m,a_w" "arm1026ejs") +(define_cpu_unit "l_e,l_m,l_w" "arm1026ejs") + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; ALU Instructions +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; ALU instructions require three cycles to execute, and use the ALU +;; pipeline in each of the three stages. The results are available +;; after the execute stage stage has finished. +;; +;; If the destination register is the PC, the pipelines are stalled +;; for several cycles. That case is not modeled here. + +;; ALU operations with no shifted operand +(define_insn_reservation "alu_op" 1 + (and (eq_attr "tune" "arm1026ejs") + (eq_attr "type" "alu")) + "a_e,a_m,a_w") + +;; ALU operations with a shift-by-constant operand +(define_insn_reservation "alu_shift_op" 1 + (and (eq_attr "tune" "arm1026ejs") + (eq_attr "type" "alu_shift")) + "a_e,a_m,a_w") + +;; ALU operations with a shift-by-register operand +;; These really stall in the decoder, in order to read +;; the shift value in a second cycle. Pretend we take two cycles in +;; the execute stage. +(define_insn_reservation "alu_shift_reg_op" 2 + (and (eq_attr "tune" "arm1026ejs") + (eq_attr "type" "alu_shift_reg")) + "a_e*2,a_m,a_w") + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Multiplication Instructions +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Multiplication instructions loop in the execute stage until the +;; instruction has been passed through the multiplier array enough +;; times. + +;; The result of the "smul" and "smulw" instructions is not available +;; until after the memory stage. +(define_insn_reservation "mult1" 2 + (and (eq_attr "tune" "arm1026ejs") + (eq_attr "insn" "smulxy,smulwy")) + "a_e,a_m,a_w") + +;; The "smlaxy" and "smlawx" instructions require two iterations through +;; the execute stage; the result is available immediately following +;; the execute stage. +(define_insn_reservation "mult2" 2 + (and (eq_attr "tune" "arm1026ejs") + (eq_attr "insn" "smlaxy,smlalxy,smlawx")) + "a_e*2,a_m,a_w") + +;; The "smlalxy", "mul", and "mla" instructions require two iterations +;; through the execute stage; the result is not available until after +;; the memory stage. +(define_insn_reservation "mult3" 3 + (and (eq_attr "tune" "arm1026ejs") + (eq_attr "insn" "smlalxy,mul,mla")) + "a_e*2,a_m,a_w") + +;; The "muls" and "mlas" instructions loop in the execute stage for +;; four iterations in order to set the flags. The value result is +;; available after three iterations. +(define_insn_reservation "mult4" 3 + (and (eq_attr "tune" "arm1026ejs") + (eq_attr "insn" "muls,mlas")) + "a_e*4,a_m,a_w") + +;; Long multiply instructions that produce two registers of +;; output (such as umull) make their results available in two cycles; +;; the least significant word is available before the most significant +;; word. That fact is not modeled; instead, the instructions are +;; described as if the entire result was available at the end of the +;; cycle in which both words are available. + +;; The "umull", "umlal", "smull", and "smlal" instructions all take +;; three iterations through the execute cycle, and make their results +;; available after the memory cycle. +(define_insn_reservation "mult5" 4 + (and (eq_attr "tune" "arm1026ejs") + (eq_attr "insn" "umull,umlal,smull,smlal")) + "a_e*3,a_m,a_w") + +;; The "umulls", "umlals", "smulls", and "smlals" instructions loop in +;; the execute stage for five iterations in order to set the flags. +;; The value result is available after four iterations. +(define_insn_reservation "mult6" 4 + (and (eq_attr "tune" "arm1026ejs") + (eq_attr "insn" "umulls,umlals,smulls,smlals")) + "a_e*5,a_m,a_w") + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Load/Store Instructions +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; The models for load/store instructions do not accurately describe +;; the difference between operations with a base register writeback +;; (such as "ldm!"). These models assume that all memory references +;; hit in dcache. + +;; LSU instructions require six cycles to execute. They use the ALU +;; pipeline in all but the 5th cycle, and the LSU pipeline in cycles +;; three through six. +;; Loads and stores which use a scaled register offset or scaled +;; register pre-indexed addressing mode take three cycles EXCEPT for +;; those that are base + offset with LSL of 0 or 2, or base - offset +;; with LSL of zero. The remainder take 1 cycle to execute. +;; For 4byte loads there is a bypass from the load stage + +(define_insn_reservation "load1_op" 2 + (and (eq_attr "tune" "arm1026ejs") + (eq_attr "type" "load_byte,load1")) + "a_e+l_e,l_m,a_w+l_w") + +(define_insn_reservation "store1_op" 0 + (and (eq_attr "tune" "arm1026ejs") + (eq_attr "type" "store1")) + "a_e+l_e,l_m,a_w+l_w") + +;; A load's result can be stored by an immediately following store +(define_bypass 1 "load1_op" "store1_op" "arm_no_early_store_addr_dep") + +;; On a LDM/STM operation, the LSU pipeline iterates until all of the +;; registers have been processed. +;; +;; The time it takes to load the data depends on whether or not the +;; base address is 64-bit aligned; if it is not, an additional cycle +;; is required. This model assumes that the address is always 64-bit +;; aligned. Because the processor can load two registers per cycle, +;; that assumption means that we use the same instruction reservations +;; for loading 2k and 2k - 1 registers. +;; +;; The ALU pipeline is stalled until the completion of the last memory +;; stage in the LSU pipeline. That is modeled by keeping the ALU +;; execute stage busy until that point. +;; +;; As with ALU operations, if one of the destination registers is the +;; PC, there are additional stalls; that is not modeled. + +(define_insn_reservation "load2_op" 2 + (and (eq_attr "tune" "arm1026ejs") + (eq_attr "type" "load2")) + "a_e+l_e,l_m,a_w+l_w") + +(define_insn_reservation "store2_op" 0 + (and (eq_attr "tune" "arm1026ejs") + (eq_attr "type" "store2")) + "a_e+l_e,l_m,a_w+l_w") + +(define_insn_reservation "load34_op" 3 + (and (eq_attr "tune" "arm1026ejs") + (eq_attr "type" "load3,load4")) + "a_e+l_e,a_e+l_e+l_m,a_e+l_m,a_w+l_w") + +(define_insn_reservation "store34_op" 0 + (and (eq_attr "tune" "arm1026ejs") + (eq_attr "type" "store3,store4")) + "a_e+l_e,a_e+l_e+l_m,a_e+l_m,a_w+l_w") + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Branch and Call Instructions +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Branch instructions are difficult to model accurately. The ARM +;; core can predict most branches. If the branch is predicted +;; correctly, and predicted early enough, the branch can be completely +;; eliminated from the instruction stream. Some branches can +;; therefore appear to require zero cycles to execute. We assume that +;; all branches are predicted correctly, and that the latency is +;; therefore the minimum value. + +(define_insn_reservation "branch_op" 0 + (and (eq_attr "tune" "arm1026ejs") + (eq_attr "type" "branch")) + "nothing") + +;; The latency for a call is not predictable. Therefore, we use 32 as +;; roughly equivalent to positive infinity. + +(define_insn_reservation "call_op" 32 + (and (eq_attr "tune" "arm1026ejs") + (eq_attr "type" "call")) + "nothing") diff --git a/gcc/config/arm/arm1136jfs.md b/gcc/config/arm/arm1136jfs.md new file mode 100644 index 000000000..8fc30e976 --- /dev/null +++ b/gcc/config/arm/arm1136jfs.md @@ -0,0 +1,376 @@ +;; ARM 1136J[F]-S Pipeline Description +;; Copyright (C) 2003, 2007 Free Software Foundation, Inc. +;; Written by CodeSourcery, LLC. +;; +;; This file is part of GCC. +;; +;; GCC is free software; you can redistribute it and/or modify it +;; under the terms of the GNU General Public License as published by +;; the Free Software Foundation; either version 3, or (at your option) +;; any later version. +;; +;; GCC is distributed in the hope that it will be useful, but +;; WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;; General Public License for more details. +;; +;; You should have received a copy of the GNU General Public License +;; along with GCC; see the file COPYING3. If not see +;; . */ + +;; These descriptions are based on the information contained in the +;; ARM1136JF-S Technical Reference Manual, Copyright (c) 2003 ARM +;; Limited. +;; + +;; This automaton provides a pipeline description for the ARM +;; 1136J-S and 1136JF-S cores. +;; +;; The model given here assumes that the condition for all conditional +;; instructions is "true", i.e., that all of the instructions are +;; actually executed. + +(define_automaton "arm1136jfs") + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Pipelines +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; There are three distinct pipelines (page 1-26 and following): +;; +;; - A 4-stage decode pipeline, shared by all three. It has fetch (1), +;; fetch (2), decode, and issue stages. Since this is always involved, +;; we do not model it in the scheduler. +;; +;; - A 4-stage ALU pipeline. It has shifter, ALU (main integer operations), +;; and saturation stages. The fourth stage is writeback; see below. +;; +;; - A 4-stage multiply-accumulate pipeline. It has three stages, called +;; MAC1 through MAC3, and a fourth writeback stage. +;; +;; The 4th-stage writeback is shared between the ALU and MAC pipelines, +;; which operate in lockstep. Results from either pipeline will be +;; moved into the writeback stage. Because the two pipelines operate +;; in lockstep, we schedule them as a single "execute" pipeline. +;; +;; - A 4-stage LSU pipeline. It has address generation, data cache (1), +;; data cache (2), and writeback stages. (Note that this pipeline, +;; including the writeback stage, is independent from the ALU & LSU pipes.) + +(define_cpu_unit "e_1,e_2,e_3,e_wb" "arm1136jfs") ; ALU and MAC +; e_1 = Sh/Mac1, e_2 = ALU/Mac2, e_3 = SAT/Mac3 +(define_cpu_unit "l_a,l_dc1,l_dc2,l_wb" "arm1136jfs") ; Load/Store + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; ALU Instructions +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; ALU instructions require eight cycles to execute, and use the ALU +;; pipeline in each of the eight stages. The results are available +;; after the alu stage has finished. +;; +;; If the destination register is the PC, the pipelines are stalled +;; for several cycles. That case is not modelled here. + +;; ALU operations with no shifted operand +(define_insn_reservation "11_alu_op" 2 + (and (eq_attr "tune" "arm1136js,arm1136jfs") + (eq_attr "type" "alu")) + "e_1,e_2,e_3,e_wb") + +;; ALU operations with a shift-by-constant operand +(define_insn_reservation "11_alu_shift_op" 2 + (and (eq_attr "tune" "arm1136js,arm1136jfs") + (eq_attr "type" "alu_shift")) + "e_1,e_2,e_3,e_wb") + +;; ALU operations with a shift-by-register operand +;; These really stall in the decoder, in order to read +;; the shift value in a second cycle. Pretend we take two cycles in +;; the shift stage. +(define_insn_reservation "11_alu_shift_reg_op" 3 + (and (eq_attr "tune" "arm1136js,arm1136jfs") + (eq_attr "type" "alu_shift_reg")) + "e_1*2,e_2,e_3,e_wb") + +;; alu_ops can start sooner, if there is no shifter dependency +(define_bypass 1 "11_alu_op,11_alu_shift_op" + "11_alu_op") +(define_bypass 1 "11_alu_op,11_alu_shift_op" + "11_alu_shift_op" + "arm_no_early_alu_shift_value_dep") +(define_bypass 1 "11_alu_op,11_alu_shift_op" + "11_alu_shift_reg_op" + "arm_no_early_alu_shift_dep") +(define_bypass 2 "11_alu_shift_reg_op" + "11_alu_op") +(define_bypass 2 "11_alu_shift_reg_op" + "11_alu_shift_op" + "arm_no_early_alu_shift_value_dep") +(define_bypass 2 "11_alu_shift_reg_op" + "11_alu_shift_reg_op" + "arm_no_early_alu_shift_dep") + +(define_bypass 1 "11_alu_op,11_alu_shift_op" + "11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7" + "arm_no_early_mul_dep") +(define_bypass 2 "11_alu_shift_reg_op" + "11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7" + "arm_no_early_mul_dep") + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Multiplication Instructions +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Multiplication instructions loop in the first two execute stages until +;; the instruction has been passed through the multiplier array enough +;; times. + +;; Multiply and multiply-accumulate results are available after four stages. +(define_insn_reservation "11_mult1" 4 + (and (eq_attr "tune" "arm1136js,arm1136jfs") + (eq_attr "insn" "mul,mla")) + "e_1*2,e_2,e_3,e_wb") + +;; The *S variants set the condition flags, which requires three more cycles. +(define_insn_reservation "11_mult2" 4 + (and (eq_attr "tune" "arm1136js,arm1136jfs") + (eq_attr "insn" "muls,mlas")) + "e_1*2,e_2,e_3,e_wb") + +(define_bypass 3 "11_mult1,11_mult2" + "11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7" + "arm_no_early_mul_dep") +(define_bypass 3 "11_mult1,11_mult2" + "11_alu_op") +(define_bypass 3 "11_mult1,11_mult2" + "11_alu_shift_op" + "arm_no_early_alu_shift_value_dep") +(define_bypass 3 "11_mult1,11_mult2" + "11_alu_shift_reg_op" + "arm_no_early_alu_shift_dep") +(define_bypass 3 "11_mult1,11_mult2" + "11_store1" + "arm_no_early_store_addr_dep") + +;; Signed and unsigned multiply long results are available across two cycles; +;; the less significant word is available one cycle before the more significant +;; word. Here we conservatively wait until both are available, which is +;; after three iterations and the memory cycle. The same is also true of +;; the two multiply-accumulate instructions. +(define_insn_reservation "11_mult3" 5 + (and (eq_attr "tune" "arm1136js,arm1136jfs") + (eq_attr "insn" "smull,umull,smlal,umlal")) + "e_1*3,e_2,e_3,e_wb*2") + +;; The *S variants set the condition flags, which requires three more cycles. +(define_insn_reservation "11_mult4" 5 + (and (eq_attr "tune" "arm1136js,arm1136jfs") + (eq_attr "insn" "smulls,umulls,smlals,umlals")) + "e_1*3,e_2,e_3,e_wb*2") + +(define_bypass 4 "11_mult3,11_mult4" + "11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7" + "arm_no_early_mul_dep") +(define_bypass 4 "11_mult3,11_mult4" + "11_alu_op") +(define_bypass 4 "11_mult3,11_mult4" + "11_alu_shift_op" + "arm_no_early_alu_shift_value_dep") +(define_bypass 4 "11_mult3,11_mult4" + "11_alu_shift_reg_op" + "arm_no_early_alu_shift_dep") +(define_bypass 4 "11_mult3,11_mult4" + "11_store1" + "arm_no_early_store_addr_dep") + +;; Various 16x16->32 multiplies and multiply-accumulates, using combinations +;; of high and low halves of the argument registers. They take a single +;; pass through the pipeline and make the result available after three +;; cycles. +(define_insn_reservation "11_mult5" 3 + (and (eq_attr "tune" "arm1136js,arm1136jfs") + (eq_attr "insn" "smulxy,smlaxy,smulwy,smlawy,smuad,smuadx,smlad,smladx,smusd,smusdx,smlsd,smlsdx")) + "e_1,e_2,e_3,e_wb") + +(define_bypass 2 "11_mult5" + "11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7" + "arm_no_early_mul_dep") +(define_bypass 2 "11_mult5" + "11_alu_op") +(define_bypass 2 "11_mult5" + "11_alu_shift_op" + "arm_no_early_alu_shift_value_dep") +(define_bypass 2 "11_mult5" + "11_alu_shift_reg_op" + "arm_no_early_alu_shift_dep") +(define_bypass 2 "11_mult5" + "11_store1" + "arm_no_early_store_addr_dep") + +;; The same idea, then the 32-bit result is added to a 64-bit quantity. +(define_insn_reservation "11_mult6" 4 + (and (eq_attr "tune" "arm1136js,arm1136jfs") + (eq_attr "insn" "smlalxy")) + "e_1*2,e_2,e_3,e_wb*2") + +;; Signed 32x32 multiply, then the most significant 32 bits are extracted +;; and are available after the memory stage. +(define_insn_reservation "11_mult7" 4 + (and (eq_attr "tune" "arm1136js,arm1136jfs") + (eq_attr "insn" "smmul,smmulr")) + "e_1*2,e_2,e_3,e_wb") + +(define_bypass 3 "11_mult6,11_mult7" + "11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7" + "arm_no_early_mul_dep") +(define_bypass 3 "11_mult6,11_mult7" + "11_alu_op") +(define_bypass 3 "11_mult6,11_mult7" + "11_alu_shift_op" + "arm_no_early_alu_shift_value_dep") +(define_bypass 3 "11_mult6,11_mult7" + "11_alu_shift_reg_op" + "arm_no_early_alu_shift_dep") +(define_bypass 3 "11_mult6,11_mult7" + "11_store1" + "arm_no_early_store_addr_dep") + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Branch Instructions +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; These vary greatly depending on their arguments and the results of +;; stat prediction. Cycle count ranges from zero (unconditional branch, +;; folded dynamic prediction) to seven (incorrect predictions, etc). We +;; assume an optimal case for now, because the cost of a cache miss +;; overwhelms the cost of everything else anyhow. + +(define_insn_reservation "11_branches" 0 + (and (eq_attr "tune" "arm1136js,arm1136jfs") + (eq_attr "type" "branch")) + "nothing") + +;; Call latencies are not predictable. A semi-arbitrary very large +;; number is used as "positive infinity" so that everything should be +;; finished by the time of return. +(define_insn_reservation "11_call" 32 + (and (eq_attr "tune" "arm1136js,arm1136jfs") + (eq_attr "type" "call")) + "nothing") + +;; Branches are predicted. A correctly predicted branch will be no +;; cost, but we're conservative here, and use the timings a +;; late-register would give us. +(define_bypass 1 "11_alu_op,11_alu_shift_op" + "11_branches") +(define_bypass 2 "11_alu_shift_reg_op" + "11_branches") +(define_bypass 2 "11_load1,11_load2" + "11_branches") +(define_bypass 3 "11_load34" + "11_branches") + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Load/Store Instructions +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; The models for load/store instructions do not accurately describe +;; the difference between operations with a base register writeback. +;; These models assume that all memory references hit in dcache. Also, +;; if the PC is one of the registers involved, there are additional stalls +;; not modelled here. Addressing modes are also not modelled. + +(define_insn_reservation "11_load1" 3 + (and (eq_attr "tune" "arm1136js,arm1136jfs") + (eq_attr "type" "load1")) + "l_a+e_1,l_dc1,l_dc2,l_wb") + +;; Load byte results are not available until the writeback stage, where +;; the correct byte is extracted. + +(define_insn_reservation "11_loadb" 4 + (and (eq_attr "tune" "arm1136js,arm1136jfs") + (eq_attr "type" "load_byte")) + "l_a+e_1,l_dc1,l_dc2,l_wb") + +(define_insn_reservation "11_store1" 0 + (and (eq_attr "tune" "arm1136js,arm1136jfs") + (eq_attr "type" "store1")) + "l_a+e_1,l_dc1,l_dc2,l_wb") + +;; Load/store double words into adjacent registers. The timing and +;; latencies are different depending on whether the address is 64-bit +;; aligned. This model assumes that it is. +(define_insn_reservation "11_load2" 3 + (and (eq_attr "tune" "arm1136js,arm1136jfs") + (eq_attr "type" "load2")) + "l_a+e_1,l_dc1,l_dc2,l_wb") + +(define_insn_reservation "11_store2" 0 + (and (eq_attr "tune" "arm1136js,arm1136jfs") + (eq_attr "type" "store2")) + "l_a+e_1,l_dc1,l_dc2,l_wb") + +;; Load/store multiple registers. Two registers are stored per cycle. +;; Actual timing depends on how many registers are affected, so we +;; optimistically schedule a low latency. +(define_insn_reservation "11_load34" 4 + (and (eq_attr "tune" "arm1136js,arm1136jfs") + (eq_attr "type" "load3,load4")) + "l_a+e_1,l_dc1*2,l_dc2,l_wb") + +(define_insn_reservation "11_store34" 0 + (and (eq_attr "tune" "arm1136js,arm1136jfs") + (eq_attr "type" "store3,store4")) + "l_a+e_1,l_dc1*2,l_dc2,l_wb") + +;; A store can start immediately after an alu op, if that alu op does +;; not provide part of the address to access. +(define_bypass 1 "11_alu_op,11_alu_shift_op" + "11_store1" + "arm_no_early_store_addr_dep") +(define_bypass 2 "11_alu_shift_reg_op" + "11_store1" + "arm_no_early_store_addr_dep") + +;; An alu op can start sooner after a load, if that alu op does not +;; have an early register dependency on the load +(define_bypass 2 "11_load1" + "11_alu_op") +(define_bypass 2 "11_load1" + "11_alu_shift_op" + "arm_no_early_alu_shift_value_dep") +(define_bypass 2 "11_load1" + "11_alu_shift_reg_op" + "arm_no_early_alu_shift_dep") + +(define_bypass 3 "11_loadb" + "11_alu_op") +(define_bypass 3 "11_loadb" + "11_alu_shift_op" + "arm_no_early_alu_shift_value_dep") +(define_bypass 3 "11_loadb" + "11_alu_shift_reg_op" + "arm_no_early_alu_shift_dep") + +;; A mul op can start sooner after a load, if that mul op does not +;; have an early multiply dependency +(define_bypass 2 "11_load1" + "11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7" + "arm_no_early_mul_dep") +(define_bypass 3 "11_load34" + "11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7" + "arm_no_early_mul_dep") +(define_bypass 3 "11_loadb" + "11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7" + "arm_no_early_mul_dep") + +;; A store can start sooner after a load, if that load does not +;; produce part of the address to access +(define_bypass 2 "11_load1" + "11_store1" + "arm_no_early_store_addr_dep") +(define_bypass 3 "11_loadb" + "11_store1" + "arm_no_early_store_addr_dep") diff --git a/gcc/config/arm/arm926ejs.md b/gcc/config/arm/arm926ejs.md new file mode 100644 index 000000000..d3908f9e3 --- /dev/null +++ b/gcc/config/arm/arm926ejs.md @@ -0,0 +1,187 @@ +;; ARM 926EJ-S Pipeline Description +;; Copyright (C) 2003, 2007 Free Software Foundation, Inc. +;; Written by CodeSourcery, LLC. +;; +;; This file is part of GCC. +;; +;; GCC is free software; you can redistribute it and/or modify it +;; under the terms of the GNU General Public License as published by +;; the Free Software Foundation; either version 3, or (at your option) +;; any later version. +;; +;; GCC is distributed in the hope that it will be useful, but +;; WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;; General Public License for more details. +;; +;; You should have received a copy of the GNU General Public License +;; along with GCC; see the file COPYING3. If not see +;; . */ + +;; These descriptions are based on the information contained in the +;; ARM926EJ-S Technical Reference Manual, Copyright (c) 2002 ARM +;; Limited. +;; + +;; This automaton provides a pipeline description for the ARM +;; 926EJ-S core. +;; +;; The model given here assumes that the condition for all conditional +;; instructions is "true", i.e., that all of the instructions are +;; actually executed. + +(define_automaton "arm926ejs") + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Pipelines +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; There is a single pipeline +;; +;; The ALU pipeline has fetch, decode, execute, memory, and +;; write stages. We only need to model the execute, memory and write +;; stages. + +(define_cpu_unit "e,m,w" "arm926ejs") + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; ALU Instructions +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; ALU instructions require three cycles to execute, and use the ALU +;; pipeline in each of the three stages. The results are available +;; after the execute stage stage has finished. +;; +;; If the destination register is the PC, the pipelines are stalled +;; for several cycles. That case is not modeled here. + +;; ALU operations with no shifted operand +(define_insn_reservation "9_alu_op" 1 + (and (eq_attr "tune" "arm926ejs") + (eq_attr "type" "alu,alu_shift")) + "e,m,w") + +;; ALU operations with a shift-by-register operand +;; These really stall in the decoder, in order to read +;; the shift value in a second cycle. Pretend we take two cycles in +;; the execute stage. +(define_insn_reservation "9_alu_shift_reg_op" 2 + (and (eq_attr "tune" "arm926ejs") + (eq_attr "type" "alu_shift_reg")) + "e*2,m,w") + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Multiplication Instructions +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Multiplication instructions loop in the execute stage until the +;; instruction has been passed through the multiplier array enough +;; times. Multiply operations occur in both the execute and memory +;; stages of the pipeline + +(define_insn_reservation "9_mult1" 3 + (and (eq_attr "tune" "arm926ejs") + (eq_attr "insn" "smlalxy,mul,mla")) + "e*2,m,w") + +(define_insn_reservation "9_mult2" 4 + (and (eq_attr "tune" "arm926ejs") + (eq_attr "insn" "muls,mlas")) + "e*3,m,w") + +(define_insn_reservation "9_mult3" 4 + (and (eq_attr "tune" "arm926ejs") + (eq_attr "insn" "umull,umlal,smull,smlal")) + "e*3,m,w") + +(define_insn_reservation "9_mult4" 5 + (and (eq_attr "tune" "arm926ejs") + (eq_attr "insn" "umulls,umlals,smulls,smlals")) + "e*4,m,w") + +(define_insn_reservation "9_mult5" 2 + (and (eq_attr "tune" "arm926ejs") + (eq_attr "insn" "smulxy,smlaxy,smlawx")) + "e,m,w") + +(define_insn_reservation "9_mult6" 3 + (and (eq_attr "tune" "arm926ejs") + (eq_attr "insn" "smlalxy")) + "e*2,m,w") + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Load/Store Instructions +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; The models for load/store instructions do not accurately describe +;; the difference between operations with a base register writeback +;; (such as "ldm!"). These models assume that all memory references +;; hit in dcache. + +;; Loads with a shifted offset take 3 cycles, and are (a) probably the +;; most common and (b) the pessimistic assumption will lead to fewer stalls. +(define_insn_reservation "9_load1_op" 3 + (and (eq_attr "tune" "arm926ejs") + (eq_attr "type" "load1,load_byte")) + "e*2,m,w") + +(define_insn_reservation "9_store1_op" 0 + (and (eq_attr "tune" "arm926ejs") + (eq_attr "type" "store1")) + "e,m,w") + +;; multiple word loads and stores +(define_insn_reservation "9_load2_op" 3 + (and (eq_attr "tune" "arm926ejs") + (eq_attr "type" "load2")) + "e,m*2,w") + +(define_insn_reservation "9_load3_op" 4 + (and (eq_attr "tune" "arm926ejs") + (eq_attr "type" "load3")) + "e,m*3,w") + +(define_insn_reservation "9_load4_op" 5 + (and (eq_attr "tune" "arm926ejs") + (eq_attr "type" "load4")) + "e,m*4,w") + +(define_insn_reservation "9_store2_op" 0 + (and (eq_attr "tune" "arm926ejs") + (eq_attr "type" "store2")) + "e,m*2,w") + +(define_insn_reservation "9_store3_op" 0 + (and (eq_attr "tune" "arm926ejs") + (eq_attr "type" "store3")) + "e,m*3,w") + +(define_insn_reservation "9_store4_op" 0 + (and (eq_attr "tune" "arm926ejs") + (eq_attr "type" "store4")) + "e,m*4,w") + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Branch and Call Instructions +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Branch instructions are difficult to model accurately. The ARM +;; core can predict most branches. If the branch is predicted +;; correctly, and predicted early enough, the branch can be completely +;; eliminated from the instruction stream. Some branches can +;; therefore appear to require zero cycles to execute. We assume that +;; all branches are predicted correctly, and that the latency is +;; therefore the minimum value. + +(define_insn_reservation "9_branch_op" 0 + (and (eq_attr "tune" "arm926ejs") + (eq_attr "type" "branch")) + "nothing") + +;; The latency for a call is not predictable. Therefore, we use 32 as +;; roughly equivalent to positive infinity. + +(define_insn_reservation "9_call_op" 32 + (and (eq_attr "tune" "arm926ejs") + (eq_attr "type" "call")) + "nothing") diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h new file mode 100644 index 000000000..9cba0a90a --- /dev/null +++ b/gcc/config/arm/arm_neon.h @@ -0,0 +1,12176 @@ +/* ARM NEON intrinsics include file. This file is generated automatically + using neon-gen.ml. Please do not edit manually. + + Copyright (C) 2006, 2007, 2009 Free Software Foundation, Inc. + Contributed by CodeSourcery. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3, or (at your + option) any later version. + + GCC is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public + License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +#ifndef _GCC_ARM_NEON_H +#define _GCC_ARM_NEON_H 1 + +#ifndef __ARM_NEON__ +#error You must enable NEON instructions (e.g. -mfloat-abi=softfp -mfpu=neon) to use arm_neon.h +#else + +#ifdef __cplusplus +extern "C" { +#endif + +#include + +typedef __builtin_neon_qi int8x8_t __attribute__ ((__vector_size__ (8))); +typedef __builtin_neon_hi int16x4_t __attribute__ ((__vector_size__ (8))); +typedef __builtin_neon_si int32x2_t __attribute__ ((__vector_size__ (8))); +typedef __builtin_neon_di int64x1_t; +typedef __builtin_neon_sf float32x2_t __attribute__ ((__vector_size__ (8))); +typedef __builtin_neon_poly8 poly8x8_t __attribute__ ((__vector_size__ (8))); +typedef __builtin_neon_poly16 poly16x4_t __attribute__ ((__vector_size__ (8))); +typedef __builtin_neon_uqi uint8x8_t __attribute__ ((__vector_size__ (8))); +typedef __builtin_neon_uhi uint16x4_t __attribute__ ((__vector_size__ (8))); +typedef __builtin_neon_usi uint32x2_t __attribute__ ((__vector_size__ (8))); +typedef __builtin_neon_udi uint64x1_t; +typedef __builtin_neon_qi int8x16_t __attribute__ ((__vector_size__ (16))); +typedef __builtin_neon_hi int16x8_t __attribute__ ((__vector_size__ (16))); +typedef __builtin_neon_si int32x4_t __attribute__ ((__vector_size__ (16))); +typedef __builtin_neon_di int64x2_t __attribute__ ((__vector_size__ (16))); +typedef __builtin_neon_sf float32x4_t __attribute__ ((__vector_size__ (16))); +typedef __builtin_neon_poly8 poly8x16_t __attribute__ ((__vector_size__ (16))); +typedef __builtin_neon_poly16 poly16x8_t __attribute__ ((__vector_size__ (16))); +typedef __builtin_neon_uqi uint8x16_t __attribute__ ((__vector_size__ (16))); +typedef __builtin_neon_uhi uint16x8_t __attribute__ ((__vector_size__ (16))); +typedef __builtin_neon_usi uint32x4_t __attribute__ ((__vector_size__ (16))); +typedef __builtin_neon_udi uint64x2_t __attribute__ ((__vector_size__ (16))); + +typedef float float32_t; +typedef __builtin_neon_poly8 poly8_t; +typedef __builtin_neon_poly16 poly16_t; + +typedef struct int8x8x2_t +{ + int8x8_t val[2]; +} int8x8x2_t; + +typedef struct int8x16x2_t +{ + int8x16_t val[2]; +} int8x16x2_t; + +typedef struct int16x4x2_t +{ + int16x4_t val[2]; +} int16x4x2_t; + +typedef struct int16x8x2_t +{ + int16x8_t val[2]; +} int16x8x2_t; + +typedef struct int32x2x2_t +{ + int32x2_t val[2]; +} int32x2x2_t; + +typedef struct int32x4x2_t +{ + int32x4_t val[2]; +} int32x4x2_t; + +typedef struct int64x1x2_t +{ + int64x1_t val[2]; +} int64x1x2_t; + +typedef struct int64x2x2_t +{ + int64x2_t val[2]; +} int64x2x2_t; + +typedef struct uint8x8x2_t +{ + uint8x8_t val[2]; +} uint8x8x2_t; + +typedef struct uint8x16x2_t +{ + uint8x16_t val[2]; +} uint8x16x2_t; + +typedef struct uint16x4x2_t +{ + uint16x4_t val[2]; +} uint16x4x2_t; + +typedef struct uint16x8x2_t +{ + uint16x8_t val[2]; +} uint16x8x2_t; + +typedef struct uint32x2x2_t +{ + uint32x2_t val[2]; +} uint32x2x2_t; + +typedef struct uint32x4x2_t +{ + uint32x4_t val[2]; +} uint32x4x2_t; + +typedef struct uint64x1x2_t +{ + uint64x1_t val[2]; +} uint64x1x2_t; + +typedef struct uint64x2x2_t +{ + uint64x2_t val[2]; +} uint64x2x2_t; + +typedef struct float32x2x2_t +{ + float32x2_t val[2]; +} float32x2x2_t; + +typedef struct float32x4x2_t +{ + float32x4_t val[2]; +} float32x4x2_t; + +typedef struct poly8x8x2_t +{ + poly8x8_t val[2]; +} poly8x8x2_t; + +typedef struct poly8x16x2_t +{ + poly8x16_t val[2]; +} poly8x16x2_t; + +typedef struct poly16x4x2_t +{ + poly16x4_t val[2]; +} poly16x4x2_t; + +typedef struct poly16x8x2_t +{ + poly16x8_t val[2]; +} poly16x8x2_t; + +typedef struct int8x8x3_t +{ + int8x8_t val[3]; +} int8x8x3_t; + +typedef struct int8x16x3_t +{ + int8x16_t val[3]; +} int8x16x3_t; + +typedef struct int16x4x3_t +{ + int16x4_t val[3]; +} int16x4x3_t; + +typedef struct int16x8x3_t +{ + int16x8_t val[3]; +} int16x8x3_t; + +typedef struct int32x2x3_t +{ + int32x2_t val[3]; +} int32x2x3_t; + +typedef struct int32x4x3_t +{ + int32x4_t val[3]; +} int32x4x3_t; + +typedef struct int64x1x3_t +{ + int64x1_t val[3]; +} int64x1x3_t; + +typedef struct int64x2x3_t +{ + int64x2_t val[3]; +} int64x2x3_t; + +typedef struct uint8x8x3_t +{ + uint8x8_t val[3]; +} uint8x8x3_t; + +typedef struct uint8x16x3_t +{ + uint8x16_t val[3]; +} uint8x16x3_t; + +typedef struct uint16x4x3_t +{ + uint16x4_t val[3]; +} uint16x4x3_t; + +typedef struct uint16x8x3_t +{ + uint16x8_t val[3]; +} uint16x8x3_t; + +typedef struct uint32x2x3_t +{ + uint32x2_t val[3]; +} uint32x2x3_t; + +typedef struct uint32x4x3_t +{ + uint32x4_t val[3]; +} uint32x4x3_t; + +typedef struct uint64x1x3_t +{ + uint64x1_t val[3]; +} uint64x1x3_t; + +typedef struct uint64x2x3_t +{ + uint64x2_t val[3]; +} uint64x2x3_t; + +typedef struct float32x2x3_t +{ + float32x2_t val[3]; +} float32x2x3_t; + +typedef struct float32x4x3_t +{ + float32x4_t val[3]; +} float32x4x3_t; + +typedef struct poly8x8x3_t +{ + poly8x8_t val[3]; +} poly8x8x3_t; + +typedef struct poly8x16x3_t +{ + poly8x16_t val[3]; +} poly8x16x3_t; + +typedef struct poly16x4x3_t +{ + poly16x4_t val[3]; +} poly16x4x3_t; + +typedef struct poly16x8x3_t +{ + poly16x8_t val[3]; +} poly16x8x3_t; + +typedef struct int8x8x4_t +{ + int8x8_t val[4]; +} int8x8x4_t; + +typedef struct int8x16x4_t +{ + int8x16_t val[4]; +} int8x16x4_t; + +typedef struct int16x4x4_t +{ + int16x4_t val[4]; +} int16x4x4_t; + +typedef struct int16x8x4_t +{ + int16x8_t val[4]; +} int16x8x4_t; + +typedef struct int32x2x4_t +{ + int32x2_t val[4]; +} int32x2x4_t; + +typedef struct int32x4x4_t +{ + int32x4_t val[4]; +} int32x4x4_t; + +typedef struct int64x1x4_t +{ + int64x1_t val[4]; +} int64x1x4_t; + +typedef struct int64x2x4_t +{ + int64x2_t val[4]; +} int64x2x4_t; + +typedef struct uint8x8x4_t +{ + uint8x8_t val[4]; +} uint8x8x4_t; + +typedef struct uint8x16x4_t +{ + uint8x16_t val[4]; +} uint8x16x4_t; + +typedef struct uint16x4x4_t +{ + uint16x4_t val[4]; +} uint16x4x4_t; + +typedef struct uint16x8x4_t +{ + uint16x8_t val[4]; +} uint16x8x4_t; + +typedef struct uint32x2x4_t +{ + uint32x2_t val[4]; +} uint32x2x4_t; + +typedef struct uint32x4x4_t +{ + uint32x4_t val[4]; +} uint32x4x4_t; + +typedef struct uint64x1x4_t +{ + uint64x1_t val[4]; +} uint64x1x4_t; + +typedef struct uint64x2x4_t +{ + uint64x2_t val[4]; +} uint64x2x4_t; + +typedef struct float32x2x4_t +{ + float32x2_t val[4]; +} float32x2x4_t; + +typedef struct float32x4x4_t +{ + float32x4_t val[4]; +} float32x4x4_t; + +typedef struct poly8x8x4_t +{ + poly8x8_t val[4]; +} poly8x8x4_t; + +typedef struct poly8x16x4_t +{ + poly8x16_t val[4]; +} poly8x16x4_t; + +typedef struct poly16x4x4_t +{ + poly16x4_t val[4]; +} poly16x4x4_t; + +typedef struct poly16x8x4_t +{ + poly16x8_t val[4]; +} poly16x8x4_t; + + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vadd_s8 (int8x8_t __a, int8x8_t __b) +{ + return (int8x8_t)__builtin_neon_vaddv8qi (__a, __b, 1); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vadd_s16 (int16x4_t __a, int16x4_t __b) +{ + return (int16x4_t)__builtin_neon_vaddv4hi (__a, __b, 1); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vadd_s32 (int32x2_t __a, int32x2_t __b) +{ + return (int32x2_t)__builtin_neon_vaddv2si (__a, __b, 1); +} + +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vadd_f32 (float32x2_t __a, float32x2_t __b) +{ + return (float32x2_t)__builtin_neon_vaddv2sf (__a, __b, 3); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vadd_u8 (uint8x8_t __a, uint8x8_t __b) +{ + return (uint8x8_t)__builtin_neon_vaddv8qi ((int8x8_t) __a, (int8x8_t) __b, 0); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vadd_u16 (uint16x4_t __a, uint16x4_t __b) +{ + return (uint16x4_t)__builtin_neon_vaddv4hi ((int16x4_t) __a, (int16x4_t) __b, 0); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vadd_u32 (uint32x2_t __a, uint32x2_t __b) +{ + return (uint32x2_t)__builtin_neon_vaddv2si ((int32x2_t) __a, (int32x2_t) __b, 0); +} + +__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +vadd_s64 (int64x1_t __a, int64x1_t __b) +{ + return (int64x1_t)__builtin_neon_vadddi (__a, __b, 1); +} + +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vadd_u64 (uint64x1_t __a, uint64x1_t __b) +{ + return (uint64x1_t)__builtin_neon_vadddi ((int64x1_t) __a, (int64x1_t) __b, 0); +} + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vaddq_s8 (int8x16_t __a, int8x16_t __b) +{ + return (int8x16_t)__builtin_neon_vaddv16qi (__a, __b, 1); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vaddq_s16 (int16x8_t __a, int16x8_t __b) +{ + return (int16x8_t)__builtin_neon_vaddv8hi (__a, __b, 1); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vaddq_s32 (int32x4_t __a, int32x4_t __b) +{ + return (int32x4_t)__builtin_neon_vaddv4si (__a, __b, 1); +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vaddq_s64 (int64x2_t __a, int64x2_t __b) +{ + return (int64x2_t)__builtin_neon_vaddv2di (__a, __b, 1); +} + +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vaddq_f32 (float32x4_t __a, float32x4_t __b) +{ + return (float32x4_t)__builtin_neon_vaddv4sf (__a, __b, 3); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vaddq_u8 (uint8x16_t __a, uint8x16_t __b) +{ + return (uint8x16_t)__builtin_neon_vaddv16qi ((int8x16_t) __a, (int8x16_t) __b, 0); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vaddq_u16 (uint16x8_t __a, uint16x8_t __b) +{ + return (uint16x8_t)__builtin_neon_vaddv8hi ((int16x8_t) __a, (int16x8_t) __b, 0); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vaddq_u32 (uint32x4_t __a, uint32x4_t __b) +{ + return (uint32x4_t)__builtin_neon_vaddv4si ((int32x4_t) __a, (int32x4_t) __b, 0); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vaddq_u64 (uint64x2_t __a, uint64x2_t __b) +{ + return (uint64x2_t)__builtin_neon_vaddv2di ((int64x2_t) __a, (int64x2_t) __b, 0); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vaddl_s8 (int8x8_t __a, int8x8_t __b) +{ + return (int16x8_t)__builtin_neon_vaddlv8qi (__a, __b, 1); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vaddl_s16 (int16x4_t __a, int16x4_t __b) +{ + return (int32x4_t)__builtin_neon_vaddlv4hi (__a, __b, 1); +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vaddl_s32 (int32x2_t __a, int32x2_t __b) +{ + return (int64x2_t)__builtin_neon_vaddlv2si (__a, __b, 1); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vaddl_u8 (uint8x8_t __a, uint8x8_t __b) +{ + return (uint16x8_t)__builtin_neon_vaddlv8qi ((int8x8_t) __a, (int8x8_t) __b, 0); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vaddl_u16 (uint16x4_t __a, uint16x4_t __b) +{ + return (uint32x4_t)__builtin_neon_vaddlv4hi ((int16x4_t) __a, (int16x4_t) __b, 0); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vaddl_u32 (uint32x2_t __a, uint32x2_t __b) +{ + return (uint64x2_t)__builtin_neon_vaddlv2si ((int32x2_t) __a, (int32x2_t) __b, 0); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vaddw_s8 (int16x8_t __a, int8x8_t __b) +{ + return (int16x8_t)__builtin_neon_vaddwv8qi (__a, __b, 1); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vaddw_s16 (int32x4_t __a, int16x4_t __b) +{ + return (int32x4_t)__builtin_neon_vaddwv4hi (__a, __b, 1); +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vaddw_s32 (int64x2_t __a, int32x2_t __b) +{ + return (int64x2_t)__builtin_neon_vaddwv2si (__a, __b, 1); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vaddw_u8 (uint16x8_t __a, uint8x8_t __b) +{ + return (uint16x8_t)__builtin_neon_vaddwv8qi ((int16x8_t) __a, (int8x8_t) __b, 0); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vaddw_u16 (uint32x4_t __a, uint16x4_t __b) +{ + return (uint32x4_t)__builtin_neon_vaddwv4hi ((int32x4_t) __a, (int16x4_t) __b, 0); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vaddw_u32 (uint64x2_t __a, uint32x2_t __b) +{ + return (uint64x2_t)__builtin_neon_vaddwv2si ((int64x2_t) __a, (int32x2_t) __b, 0); +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vhadd_s8 (int8x8_t __a, int8x8_t __b) +{ + return (int8x8_t)__builtin_neon_vhaddv8qi (__a, __b, 1); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vhadd_s16 (int16x4_t __a, int16x4_t __b) +{ + return (int16x4_t)__builtin_neon_vhaddv4hi (__a, __b, 1); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vhadd_s32 (int32x2_t __a, int32x2_t __b) +{ + return (int32x2_t)__builtin_neon_vhaddv2si (__a, __b, 1); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vhadd_u8 (uint8x8_t __a, uint8x8_t __b) +{ + return (uint8x8_t)__builtin_neon_vhaddv8qi ((int8x8_t) __a, (int8x8_t) __b, 0); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vhadd_u16 (uint16x4_t __a, uint16x4_t __b) +{ + return (uint16x4_t)__builtin_neon_vhaddv4hi ((int16x4_t) __a, (int16x4_t) __b, 0); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vhadd_u32 (uint32x2_t __a, uint32x2_t __b) +{ + return (uint32x2_t)__builtin_neon_vhaddv2si ((int32x2_t) __a, (int32x2_t) __b, 0); +} + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vhaddq_s8 (int8x16_t __a, int8x16_t __b) +{ + return (int8x16_t)__builtin_neon_vhaddv16qi (__a, __b, 1); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vhaddq_s16 (int16x8_t __a, int16x8_t __b) +{ + return (int16x8_t)__builtin_neon_vhaddv8hi (__a, __b, 1); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vhaddq_s32 (int32x4_t __a, int32x4_t __b) +{ + return (int32x4_t)__builtin_neon_vhaddv4si (__a, __b, 1); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vhaddq_u8 (uint8x16_t __a, uint8x16_t __b) +{ + return (uint8x16_t)__builtin_neon_vhaddv16qi ((int8x16_t) __a, (int8x16_t) __b, 0); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vhaddq_u16 (uint16x8_t __a, uint16x8_t __b) +{ + return (uint16x8_t)__builtin_neon_vhaddv8hi ((int16x8_t) __a, (int16x8_t) __b, 0); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vhaddq_u32 (uint32x4_t __a, uint32x4_t __b) +{ + return (uint32x4_t)__builtin_neon_vhaddv4si ((int32x4_t) __a, (int32x4_t) __b, 0); +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vrhadd_s8 (int8x8_t __a, int8x8_t __b) +{ + return (int8x8_t)__builtin_neon_vhaddv8qi (__a, __b, 5); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vrhadd_s16 (int16x4_t __a, int16x4_t __b) +{ + return (int16x4_t)__builtin_neon_vhaddv4hi (__a, __b, 5); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vrhadd_s32 (int32x2_t __a, int32x2_t __b) +{ + return (int32x2_t)__builtin_neon_vhaddv2si (__a, __b, 5); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vrhadd_u8 (uint8x8_t __a, uint8x8_t __b) +{ + return (uint8x8_t)__builtin_neon_vhaddv8qi ((int8x8_t) __a, (int8x8_t) __b, 4); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vrhadd_u16 (uint16x4_t __a, uint16x4_t __b) +{ + return (uint16x4_t)__builtin_neon_vhaddv4hi ((int16x4_t) __a, (int16x4_t) __b, 4); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vrhadd_u32 (uint32x2_t __a, uint32x2_t __b) +{ + return (uint32x2_t)__builtin_neon_vhaddv2si ((int32x2_t) __a, (int32x2_t) __b, 4); +} + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vrhaddq_s8 (int8x16_t __a, int8x16_t __b) +{ + return (int8x16_t)__builtin_neon_vhaddv16qi (__a, __b, 5); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vrhaddq_s16 (int16x8_t __a, int16x8_t __b) +{ + return (int16x8_t)__builtin_neon_vhaddv8hi (__a, __b, 5); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vrhaddq_s32 (int32x4_t __a, int32x4_t __b) +{ + return (int32x4_t)__builtin_neon_vhaddv4si (__a, __b, 5); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vrhaddq_u8 (uint8x16_t __a, uint8x16_t __b) +{ + return (uint8x16_t)__builtin_neon_vhaddv16qi ((int8x16_t) __a, (int8x16_t) __b, 4); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vrhaddq_u16 (uint16x8_t __a, uint16x8_t __b) +{ + return (uint16x8_t)__builtin_neon_vhaddv8hi ((int16x8_t) __a, (int16x8_t) __b, 4); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vrhaddq_u32 (uint32x4_t __a, uint32x4_t __b) +{ + return (uint32x4_t)__builtin_neon_vhaddv4si ((int32x4_t) __a, (int32x4_t) __b, 4); +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vqadd_s8 (int8x8_t __a, int8x8_t __b) +{ + return (int8x8_t)__builtin_neon_vqaddv8qi (__a, __b, 1); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vqadd_s16 (int16x4_t __a, int16x4_t __b) +{ + return (int16x4_t)__builtin_neon_vqaddv4hi (__a, __b, 1); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vqadd_s32 (int32x2_t __a, int32x2_t __b) +{ + return (int32x2_t)__builtin_neon_vqaddv2si (__a, __b, 1); +} + +__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +vqadd_s64 (int64x1_t __a, int64x1_t __b) +{ + return (int64x1_t)__builtin_neon_vqadddi (__a, __b, 1); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vqadd_u8 (uint8x8_t __a, uint8x8_t __b) +{ + return (uint8x8_t)__builtin_neon_vqaddv8qi ((int8x8_t) __a, (int8x8_t) __b, 0); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vqadd_u16 (uint16x4_t __a, uint16x4_t __b) +{ + return (uint16x4_t)__builtin_neon_vqaddv4hi ((int16x4_t) __a, (int16x4_t) __b, 0); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vqadd_u32 (uint32x2_t __a, uint32x2_t __b) +{ + return (uint32x2_t)__builtin_neon_vqaddv2si ((int32x2_t) __a, (int32x2_t) __b, 0); +} + +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vqadd_u64 (uint64x1_t __a, uint64x1_t __b) +{ + return (uint64x1_t)__builtin_neon_vqadddi ((int64x1_t) __a, (int64x1_t) __b, 0); +} + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vqaddq_s8 (int8x16_t __a, int8x16_t __b) +{ + return (int8x16_t)__builtin_neon_vqaddv16qi (__a, __b, 1); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vqaddq_s16 (int16x8_t __a, int16x8_t __b) +{ + return (int16x8_t)__builtin_neon_vqaddv8hi (__a, __b, 1); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vqaddq_s32 (int32x4_t __a, int32x4_t __b) +{ + return (int32x4_t)__builtin_neon_vqaddv4si (__a, __b, 1); +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vqaddq_s64 (int64x2_t __a, int64x2_t __b) +{ + return (int64x2_t)__builtin_neon_vqaddv2di (__a, __b, 1); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vqaddq_u8 (uint8x16_t __a, uint8x16_t __b) +{ + return (uint8x16_t)__builtin_neon_vqaddv16qi ((int8x16_t) __a, (int8x16_t) __b, 0); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vqaddq_u16 (uint16x8_t __a, uint16x8_t __b) +{ + return (uint16x8_t)__builtin_neon_vqaddv8hi ((int16x8_t) __a, (int16x8_t) __b, 0); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vqaddq_u32 (uint32x4_t __a, uint32x4_t __b) +{ + return (uint32x4_t)__builtin_neon_vqaddv4si ((int32x4_t) __a, (int32x4_t) __b, 0); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vqaddq_u64 (uint64x2_t __a, uint64x2_t __b) +{ + return (uint64x2_t)__builtin_neon_vqaddv2di ((int64x2_t) __a, (int64x2_t) __b, 0); +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vaddhn_s16 (int16x8_t __a, int16x8_t __b) +{ + return (int8x8_t)__builtin_neon_vaddhnv8hi (__a, __b, 1); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vaddhn_s32 (int32x4_t __a, int32x4_t __b) +{ + return (int16x4_t)__builtin_neon_vaddhnv4si (__a, __b, 1); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vaddhn_s64 (int64x2_t __a, int64x2_t __b) +{ + return (int32x2_t)__builtin_neon_vaddhnv2di (__a, __b, 1); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vaddhn_u16 (uint16x8_t __a, uint16x8_t __b) +{ + return (uint8x8_t)__builtin_neon_vaddhnv8hi ((int16x8_t) __a, (int16x8_t) __b, 0); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vaddhn_u32 (uint32x4_t __a, uint32x4_t __b) +{ + return (uint16x4_t)__builtin_neon_vaddhnv4si ((int32x4_t) __a, (int32x4_t) __b, 0); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vaddhn_u64 (uint64x2_t __a, uint64x2_t __b) +{ + return (uint32x2_t)__builtin_neon_vaddhnv2di ((int64x2_t) __a, (int64x2_t) __b, 0); +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vraddhn_s16 (int16x8_t __a, int16x8_t __b) +{ + return (int8x8_t)__builtin_neon_vaddhnv8hi (__a, __b, 5); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vraddhn_s32 (int32x4_t __a, int32x4_t __b) +{ + return (int16x4_t)__builtin_neon_vaddhnv4si (__a, __b, 5); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vraddhn_s64 (int64x2_t __a, int64x2_t __b) +{ + return (int32x2_t)__builtin_neon_vaddhnv2di (__a, __b, 5); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vraddhn_u16 (uint16x8_t __a, uint16x8_t __b) +{ + return (uint8x8_t)__builtin_neon_vaddhnv8hi ((int16x8_t) __a, (int16x8_t) __b, 4); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vraddhn_u32 (uint32x4_t __a, uint32x4_t __b) +{ + return (uint16x4_t)__builtin_neon_vaddhnv4si ((int32x4_t) __a, (int32x4_t) __b, 4); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vraddhn_u64 (uint64x2_t __a, uint64x2_t __b) +{ + return (uint32x2_t)__builtin_neon_vaddhnv2di ((int64x2_t) __a, (int64x2_t) __b, 4); +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vmul_s8 (int8x8_t __a, int8x8_t __b) +{ + return (int8x8_t)__builtin_neon_vmulv8qi (__a, __b, 1); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vmul_s16 (int16x4_t __a, int16x4_t __b) +{ + return (int16x4_t)__builtin_neon_vmulv4hi (__a, __b, 1); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vmul_s32 (int32x2_t __a, int32x2_t __b) +{ + return (int32x2_t)__builtin_neon_vmulv2si (__a, __b, 1); +} + +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vmul_f32 (float32x2_t __a, float32x2_t __b) +{ + return (float32x2_t)__builtin_neon_vmulv2sf (__a, __b, 3); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vmul_u8 (uint8x8_t __a, uint8x8_t __b) +{ + return (uint8x8_t)__builtin_neon_vmulv8qi ((int8x8_t) __a, (int8x8_t) __b, 0); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vmul_u16 (uint16x4_t __a, uint16x4_t __b) +{ + return (uint16x4_t)__builtin_neon_vmulv4hi ((int16x4_t) __a, (int16x4_t) __b, 0); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vmul_u32 (uint32x2_t __a, uint32x2_t __b) +{ + return (uint32x2_t)__builtin_neon_vmulv2si ((int32x2_t) __a, (int32x2_t) __b, 0); +} + +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +vmul_p8 (poly8x8_t __a, poly8x8_t __b) +{ + return (poly8x8_t)__builtin_neon_vmulv8qi ((int8x8_t) __a, (int8x8_t) __b, 2); +} + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vmulq_s8 (int8x16_t __a, int8x16_t __b) +{ + return (int8x16_t)__builtin_neon_vmulv16qi (__a, __b, 1); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vmulq_s16 (int16x8_t __a, int16x8_t __b) +{ + return (int16x8_t)__builtin_neon_vmulv8hi (__a, __b, 1); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vmulq_s32 (int32x4_t __a, int32x4_t __b) +{ + return (int32x4_t)__builtin_neon_vmulv4si (__a, __b, 1); +} + +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vmulq_f32 (float32x4_t __a, float32x4_t __b) +{ + return (float32x4_t)__builtin_neon_vmulv4sf (__a, __b, 3); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vmulq_u8 (uint8x16_t __a, uint8x16_t __b) +{ + return (uint8x16_t)__builtin_neon_vmulv16qi ((int8x16_t) __a, (int8x16_t) __b, 0); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vmulq_u16 (uint16x8_t __a, uint16x8_t __b) +{ + return (uint16x8_t)__builtin_neon_vmulv8hi ((int16x8_t) __a, (int16x8_t) __b, 0); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vmulq_u32 (uint32x4_t __a, uint32x4_t __b) +{ + return (uint32x4_t)__builtin_neon_vmulv4si ((int32x4_t) __a, (int32x4_t) __b, 0); +} + +__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +vmulq_p8 (poly8x16_t __a, poly8x16_t __b) +{ + return (poly8x16_t)__builtin_neon_vmulv16qi ((int8x16_t) __a, (int8x16_t) __b, 2); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vqdmulh_s16 (int16x4_t __a, int16x4_t __b) +{ + return (int16x4_t)__builtin_neon_vqdmulhv4hi (__a, __b, 1); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vqdmulh_s32 (int32x2_t __a, int32x2_t __b) +{ + return (int32x2_t)__builtin_neon_vqdmulhv2si (__a, __b, 1); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vqdmulhq_s16 (int16x8_t __a, int16x8_t __b) +{ + return (int16x8_t)__builtin_neon_vqdmulhv8hi (__a, __b, 1); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vqdmulhq_s32 (int32x4_t __a, int32x4_t __b) +{ + return (int32x4_t)__builtin_neon_vqdmulhv4si (__a, __b, 1); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vqrdmulh_s16 (int16x4_t __a, int16x4_t __b) +{ + return (int16x4_t)__builtin_neon_vqdmulhv4hi (__a, __b, 5); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vqrdmulh_s32 (int32x2_t __a, int32x2_t __b) +{ + return (int32x2_t)__builtin_neon_vqdmulhv2si (__a, __b, 5); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vqrdmulhq_s16 (int16x8_t __a, int16x8_t __b) +{ + return (int16x8_t)__builtin_neon_vqdmulhv8hi (__a, __b, 5); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vqrdmulhq_s32 (int32x4_t __a, int32x4_t __b) +{ + return (int32x4_t)__builtin_neon_vqdmulhv4si (__a, __b, 5); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vmull_s8 (int8x8_t __a, int8x8_t __b) +{ + return (int16x8_t)__builtin_neon_vmullv8qi (__a, __b, 1); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vmull_s16 (int16x4_t __a, int16x4_t __b) +{ + return (int32x4_t)__builtin_neon_vmullv4hi (__a, __b, 1); +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vmull_s32 (int32x2_t __a, int32x2_t __b) +{ + return (int64x2_t)__builtin_neon_vmullv2si (__a, __b, 1); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vmull_u8 (uint8x8_t __a, uint8x8_t __b) +{ + return (uint16x8_t)__builtin_neon_vmullv8qi ((int8x8_t) __a, (int8x8_t) __b, 0); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vmull_u16 (uint16x4_t __a, uint16x4_t __b) +{ + return (uint32x4_t)__builtin_neon_vmullv4hi ((int16x4_t) __a, (int16x4_t) __b, 0); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vmull_u32 (uint32x2_t __a, uint32x2_t __b) +{ + return (uint64x2_t)__builtin_neon_vmullv2si ((int32x2_t) __a, (int32x2_t) __b, 0); +} + +__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) +vmull_p8 (poly8x8_t __a, poly8x8_t __b) +{ + return (poly16x8_t)__builtin_neon_vmullv8qi ((int8x8_t) __a, (int8x8_t) __b, 2); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vqdmull_s16 (int16x4_t __a, int16x4_t __b) +{ + return (int32x4_t)__builtin_neon_vqdmullv4hi (__a, __b, 1); +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vqdmull_s32 (int32x2_t __a, int32x2_t __b) +{ + return (int64x2_t)__builtin_neon_vqdmullv2si (__a, __b, 1); +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vmla_s8 (int8x8_t __a, int8x8_t __b, int8x8_t __c) +{ + return (int8x8_t)__builtin_neon_vmlav8qi (__a, __b, __c, 1); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vmla_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c) +{ + return (int16x4_t)__builtin_neon_vmlav4hi (__a, __b, __c, 1); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vmla_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c) +{ + return (int32x2_t)__builtin_neon_vmlav2si (__a, __b, __c, 1); +} + +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vmla_f32 (float32x2_t __a, float32x2_t __b, float32x2_t __c) +{ + return (float32x2_t)__builtin_neon_vmlav2sf (__a, __b, __c, 3); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vmla_u8 (uint8x8_t __a, uint8x8_t __b, uint8x8_t __c) +{ + return (uint8x8_t)__builtin_neon_vmlav8qi ((int8x8_t) __a, (int8x8_t) __b, (int8x8_t) __c, 0); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vmla_u16 (uint16x4_t __a, uint16x4_t __b, uint16x4_t __c) +{ + return (uint16x4_t)__builtin_neon_vmlav4hi ((int16x4_t) __a, (int16x4_t) __b, (int16x4_t) __c, 0); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vmla_u32 (uint32x2_t __a, uint32x2_t __b, uint32x2_t __c) +{ + return (uint32x2_t)__builtin_neon_vmlav2si ((int32x2_t) __a, (int32x2_t) __b, (int32x2_t) __c, 0); +} + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vmlaq_s8 (int8x16_t __a, int8x16_t __b, int8x16_t __c) +{ + return (int8x16_t)__builtin_neon_vmlav16qi (__a, __b, __c, 1); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vmlaq_s16 (int16x8_t __a, int16x8_t __b, int16x8_t __c) +{ + return (int16x8_t)__builtin_neon_vmlav8hi (__a, __b, __c, 1); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vmlaq_s32 (int32x4_t __a, int32x4_t __b, int32x4_t __c) +{ + return (int32x4_t)__builtin_neon_vmlav4si (__a, __b, __c, 1); +} + +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vmlaq_f32 (float32x4_t __a, float32x4_t __b, float32x4_t __c) +{ + return (float32x4_t)__builtin_neon_vmlav4sf (__a, __b, __c, 3); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vmlaq_u8 (uint8x16_t __a, uint8x16_t __b, uint8x16_t __c) +{ + return (uint8x16_t)__builtin_neon_vmlav16qi ((int8x16_t) __a, (int8x16_t) __b, (int8x16_t) __c, 0); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vmlaq_u16 (uint16x8_t __a, uint16x8_t __b, uint16x8_t __c) +{ + return (uint16x8_t)__builtin_neon_vmlav8hi ((int16x8_t) __a, (int16x8_t) __b, (int16x8_t) __c, 0); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vmlaq_u32 (uint32x4_t __a, uint32x4_t __b, uint32x4_t __c) +{ + return (uint32x4_t)__builtin_neon_vmlav4si ((int32x4_t) __a, (int32x4_t) __b, (int32x4_t) __c, 0); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vmlal_s8 (int16x8_t __a, int8x8_t __b, int8x8_t __c) +{ + return (int16x8_t)__builtin_neon_vmlalv8qi (__a, __b, __c, 1); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vmlal_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c) +{ + return (int32x4_t)__builtin_neon_vmlalv4hi (__a, __b, __c, 1); +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vmlal_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c) +{ + return (int64x2_t)__builtin_neon_vmlalv2si (__a, __b, __c, 1); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vmlal_u8 (uint16x8_t __a, uint8x8_t __b, uint8x8_t __c) +{ + return (uint16x8_t)__builtin_neon_vmlalv8qi ((int16x8_t) __a, (int8x8_t) __b, (int8x8_t) __c, 0); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vmlal_u16 (uint32x4_t __a, uint16x4_t __b, uint16x4_t __c) +{ + return (uint32x4_t)__builtin_neon_vmlalv4hi ((int32x4_t) __a, (int16x4_t) __b, (int16x4_t) __c, 0); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vmlal_u32 (uint64x2_t __a, uint32x2_t __b, uint32x2_t __c) +{ + return (uint64x2_t)__builtin_neon_vmlalv2si ((int64x2_t) __a, (int32x2_t) __b, (int32x2_t) __c, 0); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vqdmlal_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c) +{ + return (int32x4_t)__builtin_neon_vqdmlalv4hi (__a, __b, __c, 1); +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vqdmlal_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c) +{ + return (int64x2_t)__builtin_neon_vqdmlalv2si (__a, __b, __c, 1); +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vmls_s8 (int8x8_t __a, int8x8_t __b, int8x8_t __c) +{ + return (int8x8_t)__builtin_neon_vmlsv8qi (__a, __b, __c, 1); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vmls_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c) +{ + return (int16x4_t)__builtin_neon_vmlsv4hi (__a, __b, __c, 1); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vmls_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c) +{ + return (int32x2_t)__builtin_neon_vmlsv2si (__a, __b, __c, 1); +} + +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vmls_f32 (float32x2_t __a, float32x2_t __b, float32x2_t __c) +{ + return (float32x2_t)__builtin_neon_vmlsv2sf (__a, __b, __c, 3); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vmls_u8 (uint8x8_t __a, uint8x8_t __b, uint8x8_t __c) +{ + return (uint8x8_t)__builtin_neon_vmlsv8qi ((int8x8_t) __a, (int8x8_t) __b, (int8x8_t) __c, 0); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vmls_u16 (uint16x4_t __a, uint16x4_t __b, uint16x4_t __c) +{ + return (uint16x4_t)__builtin_neon_vmlsv4hi ((int16x4_t) __a, (int16x4_t) __b, (int16x4_t) __c, 0); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vmls_u32 (uint32x2_t __a, uint32x2_t __b, uint32x2_t __c) +{ + return (uint32x2_t)__builtin_neon_vmlsv2si ((int32x2_t) __a, (int32x2_t) __b, (int32x2_t) __c, 0); +} + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vmlsq_s8 (int8x16_t __a, int8x16_t __b, int8x16_t __c) +{ + return (int8x16_t)__builtin_neon_vmlsv16qi (__a, __b, __c, 1); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vmlsq_s16 (int16x8_t __a, int16x8_t __b, int16x8_t __c) +{ + return (int16x8_t)__builtin_neon_vmlsv8hi (__a, __b, __c, 1); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vmlsq_s32 (int32x4_t __a, int32x4_t __b, int32x4_t __c) +{ + return (int32x4_t)__builtin_neon_vmlsv4si (__a, __b, __c, 1); +} + +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vmlsq_f32 (float32x4_t __a, float32x4_t __b, float32x4_t __c) +{ + return (float32x4_t)__builtin_neon_vmlsv4sf (__a, __b, __c, 3); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vmlsq_u8 (uint8x16_t __a, uint8x16_t __b, uint8x16_t __c) +{ + return (uint8x16_t)__builtin_neon_vmlsv16qi ((int8x16_t) __a, (int8x16_t) __b, (int8x16_t) __c, 0); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vmlsq_u16 (uint16x8_t __a, uint16x8_t __b, uint16x8_t __c) +{ + return (uint16x8_t)__builtin_neon_vmlsv8hi ((int16x8_t) __a, (int16x8_t) __b, (int16x8_t) __c, 0); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vmlsq_u32 (uint32x4_t __a, uint32x4_t __b, uint32x4_t __c) +{ + return (uint32x4_t)__builtin_neon_vmlsv4si ((int32x4_t) __a, (int32x4_t) __b, (int32x4_t) __c, 0); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vmlsl_s8 (int16x8_t __a, int8x8_t __b, int8x8_t __c) +{ + return (int16x8_t)__builtin_neon_vmlslv8qi (__a, __b, __c, 1); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vmlsl_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c) +{ + return (int32x4_t)__builtin_neon_vmlslv4hi (__a, __b, __c, 1); +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vmlsl_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c) +{ + return (int64x2_t)__builtin_neon_vmlslv2si (__a, __b, __c, 1); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vmlsl_u8 (uint16x8_t __a, uint8x8_t __b, uint8x8_t __c) +{ + return (uint16x8_t)__builtin_neon_vmlslv8qi ((int16x8_t) __a, (int8x8_t) __b, (int8x8_t) __c, 0); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vmlsl_u16 (uint32x4_t __a, uint16x4_t __b, uint16x4_t __c) +{ + return (uint32x4_t)__builtin_neon_vmlslv4hi ((int32x4_t) __a, (int16x4_t) __b, (int16x4_t) __c, 0); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vmlsl_u32 (uint64x2_t __a, uint32x2_t __b, uint32x2_t __c) +{ + return (uint64x2_t)__builtin_neon_vmlslv2si ((int64x2_t) __a, (int32x2_t) __b, (int32x2_t) __c, 0); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vqdmlsl_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c) +{ + return (int32x4_t)__builtin_neon_vqdmlslv4hi (__a, __b, __c, 1); +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vqdmlsl_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c) +{ + return (int64x2_t)__builtin_neon_vqdmlslv2si (__a, __b, __c, 1); +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vsub_s8 (int8x8_t __a, int8x8_t __b) +{ + return (int8x8_t)__builtin_neon_vsubv8qi (__a, __b, 1); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vsub_s16 (int16x4_t __a, int16x4_t __b) +{ + return (int16x4_t)__builtin_neon_vsubv4hi (__a, __b, 1); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vsub_s32 (int32x2_t __a, int32x2_t __b) +{ + return (int32x2_t)__builtin_neon_vsubv2si (__a, __b, 1); +} + +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vsub_f32 (float32x2_t __a, float32x2_t __b) +{ + return (float32x2_t)__builtin_neon_vsubv2sf (__a, __b, 3); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vsub_u8 (uint8x8_t __a, uint8x8_t __b) +{ + return (uint8x8_t)__builtin_neon_vsubv8qi ((int8x8_t) __a, (int8x8_t) __b, 0); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vsub_u16 (uint16x4_t __a, uint16x4_t __b) +{ + return (uint16x4_t)__builtin_neon_vsubv4hi ((int16x4_t) __a, (int16x4_t) __b, 0); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vsub_u32 (uint32x2_t __a, uint32x2_t __b) +{ + return (uint32x2_t)__builtin_neon_vsubv2si ((int32x2_t) __a, (int32x2_t) __b, 0); +} + +__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +vsub_s64 (int64x1_t __a, int64x1_t __b) +{ + return (int64x1_t)__builtin_neon_vsubdi (__a, __b, 1); +} + +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vsub_u64 (uint64x1_t __a, uint64x1_t __b) +{ + return (uint64x1_t)__builtin_neon_vsubdi ((int64x1_t) __a, (int64x1_t) __b, 0); +} + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vsubq_s8 (int8x16_t __a, int8x16_t __b) +{ + return (int8x16_t)__builtin_neon_vsubv16qi (__a, __b, 1); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vsubq_s16 (int16x8_t __a, int16x8_t __b) +{ + return (int16x8_t)__builtin_neon_vsubv8hi (__a, __b, 1); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vsubq_s32 (int32x4_t __a, int32x4_t __b) +{ + return (int32x4_t)__builtin_neon_vsubv4si (__a, __b, 1); +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vsubq_s64 (int64x2_t __a, int64x2_t __b) +{ + return (int64x2_t)__builtin_neon_vsubv2di (__a, __b, 1); +} + +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vsubq_f32 (float32x4_t __a, float32x4_t __b) +{ + return (float32x4_t)__builtin_neon_vsubv4sf (__a, __b, 3); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vsubq_u8 (uint8x16_t __a, uint8x16_t __b) +{ + return (uint8x16_t)__builtin_neon_vsubv16qi ((int8x16_t) __a, (int8x16_t) __b, 0); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vsubq_u16 (uint16x8_t __a, uint16x8_t __b) +{ + return (uint16x8_t)__builtin_neon_vsubv8hi ((int16x8_t) __a, (int16x8_t) __b, 0); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vsubq_u32 (uint32x4_t __a, uint32x4_t __b) +{ + return (uint32x4_t)__builtin_neon_vsubv4si ((int32x4_t) __a, (int32x4_t) __b, 0); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vsubq_u64 (uint64x2_t __a, uint64x2_t __b) +{ + return (uint64x2_t)__builtin_neon_vsubv2di ((int64x2_t) __a, (int64x2_t) __b, 0); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vsubl_s8 (int8x8_t __a, int8x8_t __b) +{ + return (int16x8_t)__builtin_neon_vsublv8qi (__a, __b, 1); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vsubl_s16 (int16x4_t __a, int16x4_t __b) +{ + return (int32x4_t)__builtin_neon_vsublv4hi (__a, __b, 1); +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vsubl_s32 (int32x2_t __a, int32x2_t __b) +{ + return (int64x2_t)__builtin_neon_vsublv2si (__a, __b, 1); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vsubl_u8 (uint8x8_t __a, uint8x8_t __b) +{ + return (uint16x8_t)__builtin_neon_vsublv8qi ((int8x8_t) __a, (int8x8_t) __b, 0); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vsubl_u16 (uint16x4_t __a, uint16x4_t __b) +{ + return (uint32x4_t)__builtin_neon_vsublv4hi ((int16x4_t) __a, (int16x4_t) __b, 0); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vsubl_u32 (uint32x2_t __a, uint32x2_t __b) +{ + return (uint64x2_t)__builtin_neon_vsublv2si ((int32x2_t) __a, (int32x2_t) __b, 0); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vsubw_s8 (int16x8_t __a, int8x8_t __b) +{ + return (int16x8_t)__builtin_neon_vsubwv8qi (__a, __b, 1); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vsubw_s16 (int32x4_t __a, int16x4_t __b) +{ + return (int32x4_t)__builtin_neon_vsubwv4hi (__a, __b, 1); +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vsubw_s32 (int64x2_t __a, int32x2_t __b) +{ + return (int64x2_t)__builtin_neon_vsubwv2si (__a, __b, 1); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vsubw_u8 (uint16x8_t __a, uint8x8_t __b) +{ + return (uint16x8_t)__builtin_neon_vsubwv8qi ((int16x8_t) __a, (int8x8_t) __b, 0); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vsubw_u16 (uint32x4_t __a, uint16x4_t __b) +{ + return (uint32x4_t)__builtin_neon_vsubwv4hi ((int32x4_t) __a, (int16x4_t) __b, 0); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vsubw_u32 (uint64x2_t __a, uint32x2_t __b) +{ + return (uint64x2_t)__builtin_neon_vsubwv2si ((int64x2_t) __a, (int32x2_t) __b, 0); +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vhsub_s8 (int8x8_t __a, int8x8_t __b) +{ + return (int8x8_t)__builtin_neon_vhsubv8qi (__a, __b, 1); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vhsub_s16 (int16x4_t __a, int16x4_t __b) +{ + return (int16x4_t)__builtin_neon_vhsubv4hi (__a, __b, 1); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vhsub_s32 (int32x2_t __a, int32x2_t __b) +{ + return (int32x2_t)__builtin_neon_vhsubv2si (__a, __b, 1); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vhsub_u8 (uint8x8_t __a, uint8x8_t __b) +{ + return (uint8x8_t)__builtin_neon_vhsubv8qi ((int8x8_t) __a, (int8x8_t) __b, 0); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vhsub_u16 (uint16x4_t __a, uint16x4_t __b) +{ + return (uint16x4_t)__builtin_neon_vhsubv4hi ((int16x4_t) __a, (int16x4_t) __b, 0); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vhsub_u32 (uint32x2_t __a, uint32x2_t __b) +{ + return (uint32x2_t)__builtin_neon_vhsubv2si ((int32x2_t) __a, (int32x2_t) __b, 0); +} + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vhsubq_s8 (int8x16_t __a, int8x16_t __b) +{ + return (int8x16_t)__builtin_neon_vhsubv16qi (__a, __b, 1); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vhsubq_s16 (int16x8_t __a, int16x8_t __b) +{ + return (int16x8_t)__builtin_neon_vhsubv8hi (__a, __b, 1); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vhsubq_s32 (int32x4_t __a, int32x4_t __b) +{ + return (int32x4_t)__builtin_neon_vhsubv4si (__a, __b, 1); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vhsubq_u8 (uint8x16_t __a, uint8x16_t __b) +{ + return (uint8x16_t)__builtin_neon_vhsubv16qi ((int8x16_t) __a, (int8x16_t) __b, 0); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vhsubq_u16 (uint16x8_t __a, uint16x8_t __b) +{ + return (uint16x8_t)__builtin_neon_vhsubv8hi ((int16x8_t) __a, (int16x8_t) __b, 0); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vhsubq_u32 (uint32x4_t __a, uint32x4_t __b) +{ + return (uint32x4_t)__builtin_neon_vhsubv4si ((int32x4_t) __a, (int32x4_t) __b, 0); +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vqsub_s8 (int8x8_t __a, int8x8_t __b) +{ + return (int8x8_t)__builtin_neon_vqsubv8qi (__a, __b, 1); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vqsub_s16 (int16x4_t __a, int16x4_t __b) +{ + return (int16x4_t)__builtin_neon_vqsubv4hi (__a, __b, 1); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vqsub_s32 (int32x2_t __a, int32x2_t __b) +{ + return (int32x2_t)__builtin_neon_vqsubv2si (__a, __b, 1); +} + +__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +vqsub_s64 (int64x1_t __a, int64x1_t __b) +{ + return (int64x1_t)__builtin_neon_vqsubdi (__a, __b, 1); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vqsub_u8 (uint8x8_t __a, uint8x8_t __b) +{ + return (uint8x8_t)__builtin_neon_vqsubv8qi ((int8x8_t) __a, (int8x8_t) __b, 0); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vqsub_u16 (uint16x4_t __a, uint16x4_t __b) +{ + return (uint16x4_t)__builtin_neon_vqsubv4hi ((int16x4_t) __a, (int16x4_t) __b, 0); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vqsub_u32 (uint32x2_t __a, uint32x2_t __b) +{ + return (uint32x2_t)__builtin_neon_vqsubv2si ((int32x2_t) __a, (int32x2_t) __b, 0); +} + +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vqsub_u64 (uint64x1_t __a, uint64x1_t __b) +{ + return (uint64x1_t)__builtin_neon_vqsubdi ((int64x1_t) __a, (int64x1_t) __b, 0); +} + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vqsubq_s8 (int8x16_t __a, int8x16_t __b) +{ + return (int8x16_t)__builtin_neon_vqsubv16qi (__a, __b, 1); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vqsubq_s16 (int16x8_t __a, int16x8_t __b) +{ + return (int16x8_t)__builtin_neon_vqsubv8hi (__a, __b, 1); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vqsubq_s32 (int32x4_t __a, int32x4_t __b) +{ + return (int32x4_t)__builtin_neon_vqsubv4si (__a, __b, 1); +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vqsubq_s64 (int64x2_t __a, int64x2_t __b) +{ + return (int64x2_t)__builtin_neon_vqsubv2di (__a, __b, 1); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vqsubq_u8 (uint8x16_t __a, uint8x16_t __b) +{ + return (uint8x16_t)__builtin_neon_vqsubv16qi ((int8x16_t) __a, (int8x16_t) __b, 0); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vqsubq_u16 (uint16x8_t __a, uint16x8_t __b) +{ + return (uint16x8_t)__builtin_neon_vqsubv8hi ((int16x8_t) __a, (int16x8_t) __b, 0); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vqsubq_u32 (uint32x4_t __a, uint32x4_t __b) +{ + return (uint32x4_t)__builtin_neon_vqsubv4si ((int32x4_t) __a, (int32x4_t) __b, 0); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vqsubq_u64 (uint64x2_t __a, uint64x2_t __b) +{ + return (uint64x2_t)__builtin_neon_vqsubv2di ((int64x2_t) __a, (int64x2_t) __b, 0); +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vsubhn_s16 (int16x8_t __a, int16x8_t __b) +{ + return (int8x8_t)__builtin_neon_vsubhnv8hi (__a, __b, 1); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vsubhn_s32 (int32x4_t __a, int32x4_t __b) +{ + return (int16x4_t)__builtin_neon_vsubhnv4si (__a, __b, 1); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vsubhn_s64 (int64x2_t __a, int64x2_t __b) +{ + return (int32x2_t)__builtin_neon_vsubhnv2di (__a, __b, 1); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vsubhn_u16 (uint16x8_t __a, uint16x8_t __b) +{ + return (uint8x8_t)__builtin_neon_vsubhnv8hi ((int16x8_t) __a, (int16x8_t) __b, 0); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vsubhn_u32 (uint32x4_t __a, uint32x4_t __b) +{ + return (uint16x4_t)__builtin_neon_vsubhnv4si ((int32x4_t) __a, (int32x4_t) __b, 0); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vsubhn_u64 (uint64x2_t __a, uint64x2_t __b) +{ + return (uint32x2_t)__builtin_neon_vsubhnv2di ((int64x2_t) __a, (int64x2_t) __b, 0); +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vrsubhn_s16 (int16x8_t __a, int16x8_t __b) +{ + return (int8x8_t)__builtin_neon_vsubhnv8hi (__a, __b, 5); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vrsubhn_s32 (int32x4_t __a, int32x4_t __b) +{ + return (int16x4_t)__builtin_neon_vsubhnv4si (__a, __b, 5); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vrsubhn_s64 (int64x2_t __a, int64x2_t __b) +{ + return (int32x2_t)__builtin_neon_vsubhnv2di (__a, __b, 5); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vrsubhn_u16 (uint16x8_t __a, uint16x8_t __b) +{ + return (uint8x8_t)__builtin_neon_vsubhnv8hi ((int16x8_t) __a, (int16x8_t) __b, 4); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vrsubhn_u32 (uint32x4_t __a, uint32x4_t __b) +{ + return (uint16x4_t)__builtin_neon_vsubhnv4si ((int32x4_t) __a, (int32x4_t) __b, 4); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vrsubhn_u64 (uint64x2_t __a, uint64x2_t __b) +{ + return (uint32x2_t)__builtin_neon_vsubhnv2di ((int64x2_t) __a, (int64x2_t) __b, 4); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vceq_s8 (int8x8_t __a, int8x8_t __b) +{ + return (uint8x8_t)__builtin_neon_vceqv8qi (__a, __b, 1); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vceq_s16 (int16x4_t __a, int16x4_t __b) +{ + return (uint16x4_t)__builtin_neon_vceqv4hi (__a, __b, 1); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vceq_s32 (int32x2_t __a, int32x2_t __b) +{ + return (uint32x2_t)__builtin_neon_vceqv2si (__a, __b, 1); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vceq_f32 (float32x2_t __a, float32x2_t __b) +{ + return (uint32x2_t)__builtin_neon_vceqv2sf (__a, __b, 3); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vceq_u8 (uint8x8_t __a, uint8x8_t __b) +{ + return (uint8x8_t)__builtin_neon_vceqv8qi ((int8x8_t) __a, (int8x8_t) __b, 0); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vceq_u16 (uint16x4_t __a, uint16x4_t __b) +{ + return (uint16x4_t)__builtin_neon_vceqv4hi ((int16x4_t) __a, (int16x4_t) __b, 0); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vceq_u32 (uint32x2_t __a, uint32x2_t __b) +{ + return (uint32x2_t)__builtin_neon_vceqv2si ((int32x2_t) __a, (int32x2_t) __b, 0); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vceq_p8 (poly8x8_t __a, poly8x8_t __b) +{ + return (uint8x8_t)__builtin_neon_vceqv8qi ((int8x8_t) __a, (int8x8_t) __b, 2); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vceqq_s8 (int8x16_t __a, int8x16_t __b) +{ + return (uint8x16_t)__builtin_neon_vceqv16qi (__a, __b, 1); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vceqq_s16 (int16x8_t __a, int16x8_t __b) +{ + return (uint16x8_t)__builtin_neon_vceqv8hi (__a, __b, 1); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vceqq_s32 (int32x4_t __a, int32x4_t __b) +{ + return (uint32x4_t)__builtin_neon_vceqv4si (__a, __b, 1); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vceqq_f32 (float32x4_t __a, float32x4_t __b) +{ + return (uint32x4_t)__builtin_neon_vceqv4sf (__a, __b, 3); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vceqq_u8 (uint8x16_t __a, uint8x16_t __b) +{ + return (uint8x16_t)__builtin_neon_vceqv16qi ((int8x16_t) __a, (int8x16_t) __b, 0); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vceqq_u16 (uint16x8_t __a, uint16x8_t __b) +{ + return (uint16x8_t)__builtin_neon_vceqv8hi ((int16x8_t) __a, (int16x8_t) __b, 0); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vceqq_u32 (uint32x4_t __a, uint32x4_t __b) +{ + return (uint32x4_t)__builtin_neon_vceqv4si ((int32x4_t) __a, (int32x4_t) __b, 0); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vceqq_p8 (poly8x16_t __a, poly8x16_t __b) +{ + return (uint8x16_t)__builtin_neon_vceqv16qi ((int8x16_t) __a, (int8x16_t) __b, 2); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vcge_s8 (int8x8_t __a, int8x8_t __b) +{ + return (uint8x8_t)__builtin_neon_vcgev8qi (__a, __b, 1); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vcge_s16 (int16x4_t __a, int16x4_t __b) +{ + return (uint16x4_t)__builtin_neon_vcgev4hi (__a, __b, 1); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vcge_s32 (int32x2_t __a, int32x2_t __b) +{ + return (uint32x2_t)__builtin_neon_vcgev2si (__a, __b, 1); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vcge_f32 (float32x2_t __a, float32x2_t __b) +{ + return (uint32x2_t)__builtin_neon_vcgev2sf (__a, __b, 3); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vcge_u8 (uint8x8_t __a, uint8x8_t __b) +{ + return (uint8x8_t)__builtin_neon_vcgev8qi ((int8x8_t) __a, (int8x8_t) __b, 0); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vcge_u16 (uint16x4_t __a, uint16x4_t __b) +{ + return (uint16x4_t)__builtin_neon_vcgev4hi ((int16x4_t) __a, (int16x4_t) __b, 0); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vcge_u32 (uint32x2_t __a, uint32x2_t __b) +{ + return (uint32x2_t)__builtin_neon_vcgev2si ((int32x2_t) __a, (int32x2_t) __b, 0); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vcgeq_s8 (int8x16_t __a, int8x16_t __b) +{ + return (uint8x16_t)__builtin_neon_vcgev16qi (__a, __b, 1); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vcgeq_s16 (int16x8_t __a, int16x8_t __b) +{ + return (uint16x8_t)__builtin_neon_vcgev8hi (__a, __b, 1); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vcgeq_s32 (int32x4_t __a, int32x4_t __b) +{ + return (uint32x4_t)__builtin_neon_vcgev4si (__a, __b, 1); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vcgeq_f32 (float32x4_t __a, float32x4_t __b) +{ + return (uint32x4_t)__builtin_neon_vcgev4sf (__a, __b, 3); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vcgeq_u8 (uint8x16_t __a, uint8x16_t __b) +{ + return (uint8x16_t)__builtin_neon_vcgev16qi ((int8x16_t) __a, (int8x16_t) __b, 0); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vcgeq_u16 (uint16x8_t __a, uint16x8_t __b) +{ + return (uint16x8_t)__builtin_neon_vcgev8hi ((int16x8_t) __a, (int16x8_t) __b, 0); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vcgeq_u32 (uint32x4_t __a, uint32x4_t __b) +{ + return (uint32x4_t)__builtin_neon_vcgev4si ((int32x4_t) __a, (int32x4_t) __b, 0); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vcle_s8 (int8x8_t __a, int8x8_t __b) +{ + return (uint8x8_t)__builtin_neon_vcgev8qi (__b, __a, 1); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vcle_s16 (int16x4_t __a, int16x4_t __b) +{ + return (uint16x4_t)__builtin_neon_vcgev4hi (__b, __a, 1); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vcle_s32 (int32x2_t __a, int32x2_t __b) +{ + return (uint32x2_t)__builtin_neon_vcgev2si (__b, __a, 1); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vcle_f32 (float32x2_t __a, float32x2_t __b) +{ + return (uint32x2_t)__builtin_neon_vcgev2sf (__b, __a, 3); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vcle_u8 (uint8x8_t __a, uint8x8_t __b) +{ + return (uint8x8_t)__builtin_neon_vcgev8qi ((int8x8_t) __b, (int8x8_t) __a, 0); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vcle_u16 (uint16x4_t __a, uint16x4_t __b) +{ + return (uint16x4_t)__builtin_neon_vcgev4hi ((int16x4_t) __b, (int16x4_t) __a, 0); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vcle_u32 (uint32x2_t __a, uint32x2_t __b) +{ + return (uint32x2_t)__builtin_neon_vcgev2si ((int32x2_t) __b, (int32x2_t) __a, 0); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vcleq_s8 (int8x16_t __a, int8x16_t __b) +{ + return (uint8x16_t)__builtin_neon_vcgev16qi (__b, __a, 1); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vcleq_s16 (int16x8_t __a, int16x8_t __b) +{ + return (uint16x8_t)__builtin_neon_vcgev8hi (__b, __a, 1); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vcleq_s32 (int32x4_t __a, int32x4_t __b) +{ + return (uint32x4_t)__builtin_neon_vcgev4si (__b, __a, 1); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vcleq_f32 (float32x4_t __a, float32x4_t __b) +{ + return (uint32x4_t)__builtin_neon_vcgev4sf (__b, __a, 3); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vcleq_u8 (uint8x16_t __a, uint8x16_t __b) +{ + return (uint8x16_t)__builtin_neon_vcgev16qi ((int8x16_t) __b, (int8x16_t) __a, 0); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vcleq_u16 (uint16x8_t __a, uint16x8_t __b) +{ + return (uint16x8_t)__builtin_neon_vcgev8hi ((int16x8_t) __b, (int16x8_t) __a, 0); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vcleq_u32 (uint32x4_t __a, uint32x4_t __b) +{ + return (uint32x4_t)__builtin_neon_vcgev4si ((int32x4_t) __b, (int32x4_t) __a, 0); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vcgt_s8 (int8x8_t __a, int8x8_t __b) +{ + return (uint8x8_t)__builtin_neon_vcgtv8qi (__a, __b, 1); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vcgt_s16 (int16x4_t __a, int16x4_t __b) +{ + return (uint16x4_t)__builtin_neon_vcgtv4hi (__a, __b, 1); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vcgt_s32 (int32x2_t __a, int32x2_t __b) +{ + return (uint32x2_t)__builtin_neon_vcgtv2si (__a, __b, 1); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vcgt_f32 (float32x2_t __a, float32x2_t __b) +{ + return (uint32x2_t)__builtin_neon_vcgtv2sf (__a, __b, 3); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vcgt_u8 (uint8x8_t __a, uint8x8_t __b) +{ + return (uint8x8_t)__builtin_neon_vcgtv8qi ((int8x8_t) __a, (int8x8_t) __b, 0); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vcgt_u16 (uint16x4_t __a, uint16x4_t __b) +{ + return (uint16x4_t)__builtin_neon_vcgtv4hi ((int16x4_t) __a, (int16x4_t) __b, 0); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vcgt_u32 (uint32x2_t __a, uint32x2_t __b) +{ + return (uint32x2_t)__builtin_neon_vcgtv2si ((int32x2_t) __a, (int32x2_t) __b, 0); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vcgtq_s8 (int8x16_t __a, int8x16_t __b) +{ + return (uint8x16_t)__builtin_neon_vcgtv16qi (__a, __b, 1); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vcgtq_s16 (int16x8_t __a, int16x8_t __b) +{ + return (uint16x8_t)__builtin_neon_vcgtv8hi (__a, __b, 1); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vcgtq_s32 (int32x4_t __a, int32x4_t __b) +{ + return (uint32x4_t)__builtin_neon_vcgtv4si (__a, __b, 1); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vcgtq_f32 (float32x4_t __a, float32x4_t __b) +{ + return (uint32x4_t)__builtin_neon_vcgtv4sf (__a, __b, 3); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vcgtq_u8 (uint8x16_t __a, uint8x16_t __b) +{ + return (uint8x16_t)__builtin_neon_vcgtv16qi ((int8x16_t) __a, (int8x16_t) __b, 0); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vcgtq_u16 (uint16x8_t __a, uint16x8_t __b) +{ + return (uint16x8_t)__builtin_neon_vcgtv8hi ((int16x8_t) __a, (int16x8_t) __b, 0); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vcgtq_u32 (uint32x4_t __a, uint32x4_t __b) +{ + return (uint32x4_t)__builtin_neon_vcgtv4si ((int32x4_t) __a, (int32x4_t) __b, 0); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vclt_s8 (int8x8_t __a, int8x8_t __b) +{ + return (uint8x8_t)__builtin_neon_vcgtv8qi (__b, __a, 1); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vclt_s16 (int16x4_t __a, int16x4_t __b) +{ + return (uint16x4_t)__builtin_neon_vcgtv4hi (__b, __a, 1); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vclt_s32 (int32x2_t __a, int32x2_t __b) +{ + return (uint32x2_t)__builtin_neon_vcgtv2si (__b, __a, 1); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vclt_f32 (float32x2_t __a, float32x2_t __b) +{ + return (uint32x2_t)__builtin_neon_vcgtv2sf (__b, __a, 3); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vclt_u8 (uint8x8_t __a, uint8x8_t __b) +{ + return (uint8x8_t)__builtin_neon_vcgtv8qi ((int8x8_t) __b, (int8x8_t) __a, 0); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vclt_u16 (uint16x4_t __a, uint16x4_t __b) +{ + return (uint16x4_t)__builtin_neon_vcgtv4hi ((int16x4_t) __b, (int16x4_t) __a, 0); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vclt_u32 (uint32x2_t __a, uint32x2_t __b) +{ + return (uint32x2_t)__builtin_neon_vcgtv2si ((int32x2_t) __b, (int32x2_t) __a, 0); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vcltq_s8 (int8x16_t __a, int8x16_t __b) +{ + return (uint8x16_t)__builtin_neon_vcgtv16qi (__b, __a, 1); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vcltq_s16 (int16x8_t __a, int16x8_t __b) +{ + return (uint16x8_t)__builtin_neon_vcgtv8hi (__b, __a, 1); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vcltq_s32 (int32x4_t __a, int32x4_t __b) +{ + return (uint32x4_t)__builtin_neon_vcgtv4si (__b, __a, 1); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vcltq_f32 (float32x4_t __a, float32x4_t __b) +{ + return (uint32x4_t)__builtin_neon_vcgtv4sf (__b, __a, 3); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vcltq_u8 (uint8x16_t __a, uint8x16_t __b) +{ + return (uint8x16_t)__builtin_neon_vcgtv16qi ((int8x16_t) __b, (int8x16_t) __a, 0); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vcltq_u16 (uint16x8_t __a, uint16x8_t __b) +{ + return (uint16x8_t)__builtin_neon_vcgtv8hi ((int16x8_t) __b, (int16x8_t) __a, 0); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vcltq_u32 (uint32x4_t __a, uint32x4_t __b) +{ + return (uint32x4_t)__builtin_neon_vcgtv4si ((int32x4_t) __b, (int32x4_t) __a, 0); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vcage_f32 (float32x2_t __a, float32x2_t __b) +{ + return (uint32x2_t)__builtin_neon_vcagev2sf (__a, __b, 3); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vcageq_f32 (float32x4_t __a, float32x4_t __b) +{ + return (uint32x4_t)__builtin_neon_vcagev4sf (__a, __b, 3); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vcale_f32 (float32x2_t __a, float32x2_t __b) +{ + return (uint32x2_t)__builtin_neon_vcagev2sf (__b, __a, 3); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vcaleq_f32 (float32x4_t __a, float32x4_t __b) +{ + return (uint32x4_t)__builtin_neon_vcagev4sf (__b, __a, 3); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vcagt_f32 (float32x2_t __a, float32x2_t __b) +{ + return (uint32x2_t)__builtin_neon_vcagtv2sf (__a, __b, 3); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vcagtq_f32 (float32x4_t __a, float32x4_t __b) +{ + return (uint32x4_t)__builtin_neon_vcagtv4sf (__a, __b, 3); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vcalt_f32 (float32x2_t __a, float32x2_t __b) +{ + return (uint32x2_t)__builtin_neon_vcagtv2sf (__b, __a, 3); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vcaltq_f32 (float32x4_t __a, float32x4_t __b) +{ + return (uint32x4_t)__builtin_neon_vcagtv4sf (__b, __a, 3); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vtst_s8 (int8x8_t __a, int8x8_t __b) +{ + return (uint8x8_t)__builtin_neon_vtstv8qi (__a, __b, 1); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vtst_s16 (int16x4_t __a, int16x4_t __b) +{ + return (uint16x4_t)__builtin_neon_vtstv4hi (__a, __b, 1); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vtst_s32 (int32x2_t __a, int32x2_t __b) +{ + return (uint32x2_t)__builtin_neon_vtstv2si (__a, __b, 1); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vtst_u8 (uint8x8_t __a, uint8x8_t __b) +{ + return (uint8x8_t)__builtin_neon_vtstv8qi ((int8x8_t) __a, (int8x8_t) __b, 0); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vtst_u16 (uint16x4_t __a, uint16x4_t __b) +{ + return (uint16x4_t)__builtin_neon_vtstv4hi ((int16x4_t) __a, (int16x4_t) __b, 0); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vtst_u32 (uint32x2_t __a, uint32x2_t __b) +{ + return (uint32x2_t)__builtin_neon_vtstv2si ((int32x2_t) __a, (int32x2_t) __b, 0); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vtst_p8 (poly8x8_t __a, poly8x8_t __b) +{ + return (uint8x8_t)__builtin_neon_vtstv8qi ((int8x8_t) __a, (int8x8_t) __b, 2); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vtstq_s8 (int8x16_t __a, int8x16_t __b) +{ + return (uint8x16_t)__builtin_neon_vtstv16qi (__a, __b, 1); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vtstq_s16 (int16x8_t __a, int16x8_t __b) +{ + return (uint16x8_t)__builtin_neon_vtstv8hi (__a, __b, 1); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vtstq_s32 (int32x4_t __a, int32x4_t __b) +{ + return (uint32x4_t)__builtin_neon_vtstv4si (__a, __b, 1); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vtstq_u8 (uint8x16_t __a, uint8x16_t __b) +{ + return (uint8x16_t)__builtin_neon_vtstv16qi ((int8x16_t) __a, (int8x16_t) __b, 0); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vtstq_u16 (uint16x8_t __a, uint16x8_t __b) +{ + return (uint16x8_t)__builtin_neon_vtstv8hi ((int16x8_t) __a, (int16x8_t) __b, 0); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vtstq_u32 (uint32x4_t __a, uint32x4_t __b) +{ + return (uint32x4_t)__builtin_neon_vtstv4si ((int32x4_t) __a, (int32x4_t) __b, 0); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vtstq_p8 (poly8x16_t __a, poly8x16_t __b) +{ + return (uint8x16_t)__builtin_neon_vtstv16qi ((int8x16_t) __a, (int8x16_t) __b, 2); +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vabd_s8 (int8x8_t __a, int8x8_t __b) +{ + return (int8x8_t)__builtin_neon_vabdv8qi (__a, __b, 1); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vabd_s16 (int16x4_t __a, int16x4_t __b) +{ + return (int16x4_t)__builtin_neon_vabdv4hi (__a, __b, 1); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vabd_s32 (int32x2_t __a, int32x2_t __b) +{ + return (int32x2_t)__builtin_neon_vabdv2si (__a, __b, 1); +} + +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vabd_f32 (float32x2_t __a, float32x2_t __b) +{ + return (float32x2_t)__builtin_neon_vabdv2sf (__a, __b, 3); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vabd_u8 (uint8x8_t __a, uint8x8_t __b) +{ + return (uint8x8_t)__builtin_neon_vabdv8qi ((int8x8_t) __a, (int8x8_t) __b, 0); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vabd_u16 (uint16x4_t __a, uint16x4_t __b) +{ + return (uint16x4_t)__builtin_neon_vabdv4hi ((int16x4_t) __a, (int16x4_t) __b, 0); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vabd_u32 (uint32x2_t __a, uint32x2_t __b) +{ + return (uint32x2_t)__builtin_neon_vabdv2si ((int32x2_t) __a, (int32x2_t) __b, 0); +} + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vabdq_s8 (int8x16_t __a, int8x16_t __b) +{ + return (int8x16_t)__builtin_neon_vabdv16qi (__a, __b, 1); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vabdq_s16 (int16x8_t __a, int16x8_t __b) +{ + return (int16x8_t)__builtin_neon_vabdv8hi (__a, __b, 1); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vabdq_s32 (int32x4_t __a, int32x4_t __b) +{ + return (int32x4_t)__builtin_neon_vabdv4si (__a, __b, 1); +} + +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vabdq_f32 (float32x4_t __a, float32x4_t __b) +{ + return (float32x4_t)__builtin_neon_vabdv4sf (__a, __b, 3); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vabdq_u8 (uint8x16_t __a, uint8x16_t __b) +{ + return (uint8x16_t)__builtin_neon_vabdv16qi ((int8x16_t) __a, (int8x16_t) __b, 0); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vabdq_u16 (uint16x8_t __a, uint16x8_t __b) +{ + return (uint16x8_t)__builtin_neon_vabdv8hi ((int16x8_t) __a, (int16x8_t) __b, 0); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vabdq_u32 (uint32x4_t __a, uint32x4_t __b) +{ + return (uint32x4_t)__builtin_neon_vabdv4si ((int32x4_t) __a, (int32x4_t) __b, 0); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vabdl_s8 (int8x8_t __a, int8x8_t __b) +{ + return (int16x8_t)__builtin_neon_vabdlv8qi (__a, __b, 1); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vabdl_s16 (int16x4_t __a, int16x4_t __b) +{ + return (int32x4_t)__builtin_neon_vabdlv4hi (__a, __b, 1); +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vabdl_s32 (int32x2_t __a, int32x2_t __b) +{ + return (int64x2_t)__builtin_neon_vabdlv2si (__a, __b, 1); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vabdl_u8 (uint8x8_t __a, uint8x8_t __b) +{ + return (uint16x8_t)__builtin_neon_vabdlv8qi ((int8x8_t) __a, (int8x8_t) __b, 0); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vabdl_u16 (uint16x4_t __a, uint16x4_t __b) +{ + return (uint32x4_t)__builtin_neon_vabdlv4hi ((int16x4_t) __a, (int16x4_t) __b, 0); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vabdl_u32 (uint32x2_t __a, uint32x2_t __b) +{ + return (uint64x2_t)__builtin_neon_vabdlv2si ((int32x2_t) __a, (int32x2_t) __b, 0); +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vaba_s8 (int8x8_t __a, int8x8_t __b, int8x8_t __c) +{ + return (int8x8_t)__builtin_neon_vabav8qi (__a, __b, __c, 1); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vaba_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c) +{ + return (int16x4_t)__builtin_neon_vabav4hi (__a, __b, __c, 1); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vaba_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c) +{ + return (int32x2_t)__builtin_neon_vabav2si (__a, __b, __c, 1); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vaba_u8 (uint8x8_t __a, uint8x8_t __b, uint8x8_t __c) +{ + return (uint8x8_t)__builtin_neon_vabav8qi ((int8x8_t) __a, (int8x8_t) __b, (int8x8_t) __c, 0); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vaba_u16 (uint16x4_t __a, uint16x4_t __b, uint16x4_t __c) +{ + return (uint16x4_t)__builtin_neon_vabav4hi ((int16x4_t) __a, (int16x4_t) __b, (int16x4_t) __c, 0); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vaba_u32 (uint32x2_t __a, uint32x2_t __b, uint32x2_t __c) +{ + return (uint32x2_t)__builtin_neon_vabav2si ((int32x2_t) __a, (int32x2_t) __b, (int32x2_t) __c, 0); +} + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vabaq_s8 (int8x16_t __a, int8x16_t __b, int8x16_t __c) +{ + return (int8x16_t)__builtin_neon_vabav16qi (__a, __b, __c, 1); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vabaq_s16 (int16x8_t __a, int16x8_t __b, int16x8_t __c) +{ + return (int16x8_t)__builtin_neon_vabav8hi (__a, __b, __c, 1); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vabaq_s32 (int32x4_t __a, int32x4_t __b, int32x4_t __c) +{ + return (int32x4_t)__builtin_neon_vabav4si (__a, __b, __c, 1); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vabaq_u8 (uint8x16_t __a, uint8x16_t __b, uint8x16_t __c) +{ + return (uint8x16_t)__builtin_neon_vabav16qi ((int8x16_t) __a, (int8x16_t) __b, (int8x16_t) __c, 0); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vabaq_u16 (uint16x8_t __a, uint16x8_t __b, uint16x8_t __c) +{ + return (uint16x8_t)__builtin_neon_vabav8hi ((int16x8_t) __a, (int16x8_t) __b, (int16x8_t) __c, 0); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vabaq_u32 (uint32x4_t __a, uint32x4_t __b, uint32x4_t __c) +{ + return (uint32x4_t)__builtin_neon_vabav4si ((int32x4_t) __a, (int32x4_t) __b, (int32x4_t) __c, 0); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vabal_s8 (int16x8_t __a, int8x8_t __b, int8x8_t __c) +{ + return (int16x8_t)__builtin_neon_vabalv8qi (__a, __b, __c, 1); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vabal_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c) +{ + return (int32x4_t)__builtin_neon_vabalv4hi (__a, __b, __c, 1); +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vabal_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c) +{ + return (int64x2_t)__builtin_neon_vabalv2si (__a, __b, __c, 1); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vabal_u8 (uint16x8_t __a, uint8x8_t __b, uint8x8_t __c) +{ + return (uint16x8_t)__builtin_neon_vabalv8qi ((int16x8_t) __a, (int8x8_t) __b, (int8x8_t) __c, 0); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vabal_u16 (uint32x4_t __a, uint16x4_t __b, uint16x4_t __c) +{ + return (uint32x4_t)__builtin_neon_vabalv4hi ((int32x4_t) __a, (int16x4_t) __b, (int16x4_t) __c, 0); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vabal_u32 (uint64x2_t __a, uint32x2_t __b, uint32x2_t __c) +{ + return (uint64x2_t)__builtin_neon_vabalv2si ((int64x2_t) __a, (int32x2_t) __b, (int32x2_t) __c, 0); +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vmax_s8 (int8x8_t __a, int8x8_t __b) +{ + return (int8x8_t)__builtin_neon_vmaxv8qi (__a, __b, 1); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vmax_s16 (int16x4_t __a, int16x4_t __b) +{ + return (int16x4_t)__builtin_neon_vmaxv4hi (__a, __b, 1); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vmax_s32 (int32x2_t __a, int32x2_t __b) +{ + return (int32x2_t)__builtin_neon_vmaxv2si (__a, __b, 1); +} + +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vmax_f32 (float32x2_t __a, float32x2_t __b) +{ + return (float32x2_t)__builtin_neon_vmaxv2sf (__a, __b, 3); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vmax_u8 (uint8x8_t __a, uint8x8_t __b) +{ + return (uint8x8_t)__builtin_neon_vmaxv8qi ((int8x8_t) __a, (int8x8_t) __b, 0); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vmax_u16 (uint16x4_t __a, uint16x4_t __b) +{ + return (uint16x4_t)__builtin_neon_vmaxv4hi ((int16x4_t) __a, (int16x4_t) __b, 0); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vmax_u32 (uint32x2_t __a, uint32x2_t __b) +{ + return (uint32x2_t)__builtin_neon_vmaxv2si ((int32x2_t) __a, (int32x2_t) __b, 0); +} + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vmaxq_s8 (int8x16_t __a, int8x16_t __b) +{ + return (int8x16_t)__builtin_neon_vmaxv16qi (__a, __b, 1); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vmaxq_s16 (int16x8_t __a, int16x8_t __b) +{ + return (int16x8_t)__builtin_neon_vmaxv8hi (__a, __b, 1); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vmaxq_s32 (int32x4_t __a, int32x4_t __b) +{ + return (int32x4_t)__builtin_neon_vmaxv4si (__a, __b, 1); +} + +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vmaxq_f32 (float32x4_t __a, float32x4_t __b) +{ + return (float32x4_t)__builtin_neon_vmaxv4sf (__a, __b, 3); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vmaxq_u8 (uint8x16_t __a, uint8x16_t __b) +{ + return (uint8x16_t)__builtin_neon_vmaxv16qi ((int8x16_t) __a, (int8x16_t) __b, 0); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vmaxq_u16 (uint16x8_t __a, uint16x8_t __b) +{ + return (uint16x8_t)__builtin_neon_vmaxv8hi ((int16x8_t) __a, (int16x8_t) __b, 0); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vmaxq_u32 (uint32x4_t __a, uint32x4_t __b) +{ + return (uint32x4_t)__builtin_neon_vmaxv4si ((int32x4_t) __a, (int32x4_t) __b, 0); +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vmin_s8 (int8x8_t __a, int8x8_t __b) +{ + return (int8x8_t)__builtin_neon_vminv8qi (__a, __b, 1); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vmin_s16 (int16x4_t __a, int16x4_t __b) +{ + return (int16x4_t)__builtin_neon_vminv4hi (__a, __b, 1); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vmin_s32 (int32x2_t __a, int32x2_t __b) +{ + return (int32x2_t)__builtin_neon_vminv2si (__a, __b, 1); +} + +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vmin_f32 (float32x2_t __a, float32x2_t __b) +{ + return (float32x2_t)__builtin_neon_vminv2sf (__a, __b, 3); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vmin_u8 (uint8x8_t __a, uint8x8_t __b) +{ + return (uint8x8_t)__builtin_neon_vminv8qi ((int8x8_t) __a, (int8x8_t) __b, 0); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vmin_u16 (uint16x4_t __a, uint16x4_t __b) +{ + return (uint16x4_t)__builtin_neon_vminv4hi ((int16x4_t) __a, (int16x4_t) __b, 0); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vmin_u32 (uint32x2_t __a, uint32x2_t __b) +{ + return (uint32x2_t)__builtin_neon_vminv2si ((int32x2_t) __a, (int32x2_t) __b, 0); +} + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vminq_s8 (int8x16_t __a, int8x16_t __b) +{ + return (int8x16_t)__builtin_neon_vminv16qi (__a, __b, 1); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vminq_s16 (int16x8_t __a, int16x8_t __b) +{ + return (int16x8_t)__builtin_neon_vminv8hi (__a, __b, 1); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vminq_s32 (int32x4_t __a, int32x4_t __b) +{ + return (int32x4_t)__builtin_neon_vminv4si (__a, __b, 1); +} + +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vminq_f32 (float32x4_t __a, float32x4_t __b) +{ + return (float32x4_t)__builtin_neon_vminv4sf (__a, __b, 3); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vminq_u8 (uint8x16_t __a, uint8x16_t __b) +{ + return (uint8x16_t)__builtin_neon_vminv16qi ((int8x16_t) __a, (int8x16_t) __b, 0); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vminq_u16 (uint16x8_t __a, uint16x8_t __b) +{ + return (uint16x8_t)__builtin_neon_vminv8hi ((int16x8_t) __a, (int16x8_t) __b, 0); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vminq_u32 (uint32x4_t __a, uint32x4_t __b) +{ + return (uint32x4_t)__builtin_neon_vminv4si ((int32x4_t) __a, (int32x4_t) __b, 0); +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vpadd_s8 (int8x8_t __a, int8x8_t __b) +{ + return (int8x8_t)__builtin_neon_vpaddv8qi (__a, __b, 1); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vpadd_s16 (int16x4_t __a, int16x4_t __b) +{ + return (int16x4_t)__builtin_neon_vpaddv4hi (__a, __b, 1); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vpadd_s32 (int32x2_t __a, int32x2_t __b) +{ + return (int32x2_t)__builtin_neon_vpaddv2si (__a, __b, 1); +} + +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vpadd_f32 (float32x2_t __a, float32x2_t __b) +{ + return (float32x2_t)__builtin_neon_vpaddv2sf (__a, __b, 3); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vpadd_u8 (uint8x8_t __a, uint8x8_t __b) +{ + return (uint8x8_t)__builtin_neon_vpaddv8qi ((int8x8_t) __a, (int8x8_t) __b, 0); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vpadd_u16 (uint16x4_t __a, uint16x4_t __b) +{ + return (uint16x4_t)__builtin_neon_vpaddv4hi ((int16x4_t) __a, (int16x4_t) __b, 0); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vpadd_u32 (uint32x2_t __a, uint32x2_t __b) +{ + return (uint32x2_t)__builtin_neon_vpaddv2si ((int32x2_t) __a, (int32x2_t) __b, 0); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vpaddl_s8 (int8x8_t __a) +{ + return (int16x4_t)__builtin_neon_vpaddlv8qi (__a, 1); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vpaddl_s16 (int16x4_t __a) +{ + return (int32x2_t)__builtin_neon_vpaddlv4hi (__a, 1); +} + +__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +vpaddl_s32 (int32x2_t __a) +{ + return (int64x1_t)__builtin_neon_vpaddlv2si (__a, 1); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vpaddl_u8 (uint8x8_t __a) +{ + return (uint16x4_t)__builtin_neon_vpaddlv8qi ((int8x8_t) __a, 0); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vpaddl_u16 (uint16x4_t __a) +{ + return (uint32x2_t)__builtin_neon_vpaddlv4hi ((int16x4_t) __a, 0); +} + +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vpaddl_u32 (uint32x2_t __a) +{ + return (uint64x1_t)__builtin_neon_vpaddlv2si ((int32x2_t) __a, 0); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vpaddlq_s8 (int8x16_t __a) +{ + return (int16x8_t)__builtin_neon_vpaddlv16qi (__a, 1); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vpaddlq_s16 (int16x8_t __a) +{ + return (int32x4_t)__builtin_neon_vpaddlv8hi (__a, 1); +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vpaddlq_s32 (int32x4_t __a) +{ + return (int64x2_t)__builtin_neon_vpaddlv4si (__a, 1); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vpaddlq_u8 (uint8x16_t __a) +{ + return (uint16x8_t)__builtin_neon_vpaddlv16qi ((int8x16_t) __a, 0); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vpaddlq_u16 (uint16x8_t __a) +{ + return (uint32x4_t)__builtin_neon_vpaddlv8hi ((int16x8_t) __a, 0); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vpaddlq_u32 (uint32x4_t __a) +{ + return (uint64x2_t)__builtin_neon_vpaddlv4si ((int32x4_t) __a, 0); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vpadal_s8 (int16x4_t __a, int8x8_t __b) +{ + return (int16x4_t)__builtin_neon_vpadalv8qi (__a, __b, 1); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vpadal_s16 (int32x2_t __a, int16x4_t __b) +{ + return (int32x2_t)__builtin_neon_vpadalv4hi (__a, __b, 1); +} + +__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +vpadal_s32 (int64x1_t __a, int32x2_t __b) +{ + return (int64x1_t)__builtin_neon_vpadalv2si (__a, __b, 1); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vpadal_u8 (uint16x4_t __a, uint8x8_t __b) +{ + return (uint16x4_t)__builtin_neon_vpadalv8qi ((int16x4_t) __a, (int8x8_t) __b, 0); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vpadal_u16 (uint32x2_t __a, uint16x4_t __b) +{ + return (uint32x2_t)__builtin_neon_vpadalv4hi ((int32x2_t) __a, (int16x4_t) __b, 0); +} + +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vpadal_u32 (uint64x1_t __a, uint32x2_t __b) +{ + return (uint64x1_t)__builtin_neon_vpadalv2si ((int64x1_t) __a, (int32x2_t) __b, 0); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vpadalq_s8 (int16x8_t __a, int8x16_t __b) +{ + return (int16x8_t)__builtin_neon_vpadalv16qi (__a, __b, 1); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vpadalq_s16 (int32x4_t __a, int16x8_t __b) +{ + return (int32x4_t)__builtin_neon_vpadalv8hi (__a, __b, 1); +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vpadalq_s32 (int64x2_t __a, int32x4_t __b) +{ + return (int64x2_t)__builtin_neon_vpadalv4si (__a, __b, 1); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vpadalq_u8 (uint16x8_t __a, uint8x16_t __b) +{ + return (uint16x8_t)__builtin_neon_vpadalv16qi ((int16x8_t) __a, (int8x16_t) __b, 0); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vpadalq_u16 (uint32x4_t __a, uint16x8_t __b) +{ + return (uint32x4_t)__builtin_neon_vpadalv8hi ((int32x4_t) __a, (int16x8_t) __b, 0); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vpadalq_u32 (uint64x2_t __a, uint32x4_t __b) +{ + return (uint64x2_t)__builtin_neon_vpadalv4si ((int64x2_t) __a, (int32x4_t) __b, 0); +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vpmax_s8 (int8x8_t __a, int8x8_t __b) +{ + return (int8x8_t)__builtin_neon_vpmaxv8qi (__a, __b, 1); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vpmax_s16 (int16x4_t __a, int16x4_t __b) +{ + return (int16x4_t)__builtin_neon_vpmaxv4hi (__a, __b, 1); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vpmax_s32 (int32x2_t __a, int32x2_t __b) +{ + return (int32x2_t)__builtin_neon_vpmaxv2si (__a, __b, 1); +} + +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vpmax_f32 (float32x2_t __a, float32x2_t __b) +{ + return (float32x2_t)__builtin_neon_vpmaxv2sf (__a, __b, 3); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vpmax_u8 (uint8x8_t __a, uint8x8_t __b) +{ + return (uint8x8_t)__builtin_neon_vpmaxv8qi ((int8x8_t) __a, (int8x8_t) __b, 0); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vpmax_u16 (uint16x4_t __a, uint16x4_t __b) +{ + return (uint16x4_t)__builtin_neon_vpmaxv4hi ((int16x4_t) __a, (int16x4_t) __b, 0); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vpmax_u32 (uint32x2_t __a, uint32x2_t __b) +{ + return (uint32x2_t)__builtin_neon_vpmaxv2si ((int32x2_t) __a, (int32x2_t) __b, 0); +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vpmin_s8 (int8x8_t __a, int8x8_t __b) +{ + return (int8x8_t)__builtin_neon_vpminv8qi (__a, __b, 1); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vpmin_s16 (int16x4_t __a, int16x4_t __b) +{ + return (int16x4_t)__builtin_neon_vpminv4hi (__a, __b, 1); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vpmin_s32 (int32x2_t __a, int32x2_t __b) +{ + return (int32x2_t)__builtin_neon_vpminv2si (__a, __b, 1); +} + +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vpmin_f32 (float32x2_t __a, float32x2_t __b) +{ + return (float32x2_t)__builtin_neon_vpminv2sf (__a, __b, 3); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vpmin_u8 (uint8x8_t __a, uint8x8_t __b) +{ + return (uint8x8_t)__builtin_neon_vpminv8qi ((int8x8_t) __a, (int8x8_t) __b, 0); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vpmin_u16 (uint16x4_t __a, uint16x4_t __b) +{ + return (uint16x4_t)__builtin_neon_vpminv4hi ((int16x4_t) __a, (int16x4_t) __b, 0); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vpmin_u32 (uint32x2_t __a, uint32x2_t __b) +{ + return (uint32x2_t)__builtin_neon_vpminv2si ((int32x2_t) __a, (int32x2_t) __b, 0); +} + +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vrecps_f32 (float32x2_t __a, float32x2_t __b) +{ + return (float32x2_t)__builtin_neon_vrecpsv2sf (__a, __b, 3); +} + +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vrecpsq_f32 (float32x4_t __a, float32x4_t __b) +{ + return (float32x4_t)__builtin_neon_vrecpsv4sf (__a, __b, 3); +} + +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vrsqrts_f32 (float32x2_t __a, float32x2_t __b) +{ + return (float32x2_t)__builtin_neon_vrsqrtsv2sf (__a, __b, 3); +} + +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vrsqrtsq_f32 (float32x4_t __a, float32x4_t __b) +{ + return (float32x4_t)__builtin_neon_vrsqrtsv4sf (__a, __b, 3); +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vshl_s8 (int8x8_t __a, int8x8_t __b) +{ + return (int8x8_t)__builtin_neon_vshlv8qi (__a, __b, 1); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vshl_s16 (int16x4_t __a, int16x4_t __b) +{ + return (int16x4_t)__builtin_neon_vshlv4hi (__a, __b, 1); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vshl_s32 (int32x2_t __a, int32x2_t __b) +{ + return (int32x2_t)__builtin_neon_vshlv2si (__a, __b, 1); +} + +__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +vshl_s64 (int64x1_t __a, int64x1_t __b) +{ + return (int64x1_t)__builtin_neon_vshldi (__a, __b, 1); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vshl_u8 (uint8x8_t __a, int8x8_t __b) +{ + return (uint8x8_t)__builtin_neon_vshlv8qi ((int8x8_t) __a, __b, 0); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vshl_u16 (uint16x4_t __a, int16x4_t __b) +{ + return (uint16x4_t)__builtin_neon_vshlv4hi ((int16x4_t) __a, __b, 0); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vshl_u32 (uint32x2_t __a, int32x2_t __b) +{ + return (uint32x2_t)__builtin_neon_vshlv2si ((int32x2_t) __a, __b, 0); +} + +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vshl_u64 (uint64x1_t __a, int64x1_t __b) +{ + return (uint64x1_t)__builtin_neon_vshldi ((int64x1_t) __a, __b, 0); +} + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vshlq_s8 (int8x16_t __a, int8x16_t __b) +{ + return (int8x16_t)__builtin_neon_vshlv16qi (__a, __b, 1); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vshlq_s16 (int16x8_t __a, int16x8_t __b) +{ + return (int16x8_t)__builtin_neon_vshlv8hi (__a, __b, 1); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vshlq_s32 (int32x4_t __a, int32x4_t __b) +{ + return (int32x4_t)__builtin_neon_vshlv4si (__a, __b, 1); +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vshlq_s64 (int64x2_t __a, int64x2_t __b) +{ + return (int64x2_t)__builtin_neon_vshlv2di (__a, __b, 1); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vshlq_u8 (uint8x16_t __a, int8x16_t __b) +{ + return (uint8x16_t)__builtin_neon_vshlv16qi ((int8x16_t) __a, __b, 0); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vshlq_u16 (uint16x8_t __a, int16x8_t __b) +{ + return (uint16x8_t)__builtin_neon_vshlv8hi ((int16x8_t) __a, __b, 0); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vshlq_u32 (uint32x4_t __a, int32x4_t __b) +{ + return (uint32x4_t)__builtin_neon_vshlv4si ((int32x4_t) __a, __b, 0); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vshlq_u64 (uint64x2_t __a, int64x2_t __b) +{ + return (uint64x2_t)__builtin_neon_vshlv2di ((int64x2_t) __a, __b, 0); +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vrshl_s8 (int8x8_t __a, int8x8_t __b) +{ + return (int8x8_t)__builtin_neon_vshlv8qi (__a, __b, 5); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vrshl_s16 (int16x4_t __a, int16x4_t __b) +{ + return (int16x4_t)__builtin_neon_vshlv4hi (__a, __b, 5); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vrshl_s32 (int32x2_t __a, int32x2_t __b) +{ + return (int32x2_t)__builtin_neon_vshlv2si (__a, __b, 5); +} + +__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +vrshl_s64 (int64x1_t __a, int64x1_t __b) +{ + return (int64x1_t)__builtin_neon_vshldi (__a, __b, 5); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vrshl_u8 (uint8x8_t __a, int8x8_t __b) +{ + return (uint8x8_t)__builtin_neon_vshlv8qi ((int8x8_t) __a, __b, 4); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vrshl_u16 (uint16x4_t __a, int16x4_t __b) +{ + return (uint16x4_t)__builtin_neon_vshlv4hi ((int16x4_t) __a, __b, 4); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vrshl_u32 (uint32x2_t __a, int32x2_t __b) +{ + return (uint32x2_t)__builtin_neon_vshlv2si ((int32x2_t) __a, __b, 4); +} + +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vrshl_u64 (uint64x1_t __a, int64x1_t __b) +{ + return (uint64x1_t)__builtin_neon_vshldi ((int64x1_t) __a, __b, 4); +} + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vrshlq_s8 (int8x16_t __a, int8x16_t __b) +{ + return (int8x16_t)__builtin_neon_vshlv16qi (__a, __b, 5); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vrshlq_s16 (int16x8_t __a, int16x8_t __b) +{ + return (int16x8_t)__builtin_neon_vshlv8hi (__a, __b, 5); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vrshlq_s32 (int32x4_t __a, int32x4_t __b) +{ + return (int32x4_t)__builtin_neon_vshlv4si (__a, __b, 5); +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vrshlq_s64 (int64x2_t __a, int64x2_t __b) +{ + return (int64x2_t)__builtin_neon_vshlv2di (__a, __b, 5); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vrshlq_u8 (uint8x16_t __a, int8x16_t __b) +{ + return (uint8x16_t)__builtin_neon_vshlv16qi ((int8x16_t) __a, __b, 4); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vrshlq_u16 (uint16x8_t __a, int16x8_t __b) +{ + return (uint16x8_t)__builtin_neon_vshlv8hi ((int16x8_t) __a, __b, 4); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vrshlq_u32 (uint32x4_t __a, int32x4_t __b) +{ + return (uint32x4_t)__builtin_neon_vshlv4si ((int32x4_t) __a, __b, 4); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vrshlq_u64 (uint64x2_t __a, int64x2_t __b) +{ + return (uint64x2_t)__builtin_neon_vshlv2di ((int64x2_t) __a, __b, 4); +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vqshl_s8 (int8x8_t __a, int8x8_t __b) +{ + return (int8x8_t)__builtin_neon_vqshlv8qi (__a, __b, 1); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vqshl_s16 (int16x4_t __a, int16x4_t __b) +{ + return (int16x4_t)__builtin_neon_vqshlv4hi (__a, __b, 1); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vqshl_s32 (int32x2_t __a, int32x2_t __b) +{ + return (int32x2_t)__builtin_neon_vqshlv2si (__a, __b, 1); +} + +__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +vqshl_s64 (int64x1_t __a, int64x1_t __b) +{ + return (int64x1_t)__builtin_neon_vqshldi (__a, __b, 1); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vqshl_u8 (uint8x8_t __a, int8x8_t __b) +{ + return (uint8x8_t)__builtin_neon_vqshlv8qi ((int8x8_t) __a, __b, 0); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vqshl_u16 (uint16x4_t __a, int16x4_t __b) +{ + return (uint16x4_t)__builtin_neon_vqshlv4hi ((int16x4_t) __a, __b, 0); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vqshl_u32 (uint32x2_t __a, int32x2_t __b) +{ + return (uint32x2_t)__builtin_neon_vqshlv2si ((int32x2_t) __a, __b, 0); +} + +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vqshl_u64 (uint64x1_t __a, int64x1_t __b) +{ + return (uint64x1_t)__builtin_neon_vqshldi ((int64x1_t) __a, __b, 0); +} + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vqshlq_s8 (int8x16_t __a, int8x16_t __b) +{ + return (int8x16_t)__builtin_neon_vqshlv16qi (__a, __b, 1); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vqshlq_s16 (int16x8_t __a, int16x8_t __b) +{ + return (int16x8_t)__builtin_neon_vqshlv8hi (__a, __b, 1); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vqshlq_s32 (int32x4_t __a, int32x4_t __b) +{ + return (int32x4_t)__builtin_neon_vqshlv4si (__a, __b, 1); +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vqshlq_s64 (int64x2_t __a, int64x2_t __b) +{ + return (int64x2_t)__builtin_neon_vqshlv2di (__a, __b, 1); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vqshlq_u8 (uint8x16_t __a, int8x16_t __b) +{ + return (uint8x16_t)__builtin_neon_vqshlv16qi ((int8x16_t) __a, __b, 0); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vqshlq_u16 (uint16x8_t __a, int16x8_t __b) +{ + return (uint16x8_t)__builtin_neon_vqshlv8hi ((int16x8_t) __a, __b, 0); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vqshlq_u32 (uint32x4_t __a, int32x4_t __b) +{ + return (uint32x4_t)__builtin_neon_vqshlv4si ((int32x4_t) __a, __b, 0); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vqshlq_u64 (uint64x2_t __a, int64x2_t __b) +{ + return (uint64x2_t)__builtin_neon_vqshlv2di ((int64x2_t) __a, __b, 0); +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vqrshl_s8 (int8x8_t __a, int8x8_t __b) +{ + return (int8x8_t)__builtin_neon_vqshlv8qi (__a, __b, 5); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vqrshl_s16 (int16x4_t __a, int16x4_t __b) +{ + return (int16x4_t)__builtin_neon_vqshlv4hi (__a, __b, 5); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vqrshl_s32 (int32x2_t __a, int32x2_t __b) +{ + return (int32x2_t)__builtin_neon_vqshlv2si (__a, __b, 5); +} + +__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +vqrshl_s64 (int64x1_t __a, int64x1_t __b) +{ + return (int64x1_t)__builtin_neon_vqshldi (__a, __b, 5); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vqrshl_u8 (uint8x8_t __a, int8x8_t __b) +{ + return (uint8x8_t)__builtin_neon_vqshlv8qi ((int8x8_t) __a, __b, 4); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vqrshl_u16 (uint16x4_t __a, int16x4_t __b) +{ + return (uint16x4_t)__builtin_neon_vqshlv4hi ((int16x4_t) __a, __b, 4); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vqrshl_u32 (uint32x2_t __a, int32x2_t __b) +{ + return (uint32x2_t)__builtin_neon_vqshlv2si ((int32x2_t) __a, __b, 4); +} + +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vqrshl_u64 (uint64x1_t __a, int64x1_t __b) +{ + return (uint64x1_t)__builtin_neon_vqshldi ((int64x1_t) __a, __b, 4); +} + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vqrshlq_s8 (int8x16_t __a, int8x16_t __b) +{ + return (int8x16_t)__builtin_neon_vqshlv16qi (__a, __b, 5); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vqrshlq_s16 (int16x8_t __a, int16x8_t __b) +{ + return (int16x8_t)__builtin_neon_vqshlv8hi (__a, __b, 5); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vqrshlq_s32 (int32x4_t __a, int32x4_t __b) +{ + return (int32x4_t)__builtin_neon_vqshlv4si (__a, __b, 5); +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vqrshlq_s64 (int64x2_t __a, int64x2_t __b) +{ + return (int64x2_t)__builtin_neon_vqshlv2di (__a, __b, 5); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vqrshlq_u8 (uint8x16_t __a, int8x16_t __b) +{ + return (uint8x16_t)__builtin_neon_vqshlv16qi ((int8x16_t) __a, __b, 4); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vqrshlq_u16 (uint16x8_t __a, int16x8_t __b) +{ + return (uint16x8_t)__builtin_neon_vqshlv8hi ((int16x8_t) __a, __b, 4); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vqrshlq_u32 (uint32x4_t __a, int32x4_t __b) +{ + return (uint32x4_t)__builtin_neon_vqshlv4si ((int32x4_t) __a, __b, 4); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vqrshlq_u64 (uint64x2_t __a, int64x2_t __b) +{ + return (uint64x2_t)__builtin_neon_vqshlv2di ((int64x2_t) __a, __b, 4); +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vshr_n_s8 (int8x8_t __a, const int __b) +{ + return (int8x8_t)__builtin_neon_vshr_nv8qi (__a, __b, 1); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vshr_n_s16 (int16x4_t __a, const int __b) +{ + return (int16x4_t)__builtin_neon_vshr_nv4hi (__a, __b, 1); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vshr_n_s32 (int32x2_t __a, const int __b) +{ + return (int32x2_t)__builtin_neon_vshr_nv2si (__a, __b, 1); +} + +__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +vshr_n_s64 (int64x1_t __a, const int __b) +{ + return (int64x1_t)__builtin_neon_vshr_ndi (__a, __b, 1); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vshr_n_u8 (uint8x8_t __a, const int __b) +{ + return (uint8x8_t)__builtin_neon_vshr_nv8qi ((int8x8_t) __a, __b, 0); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vshr_n_u16 (uint16x4_t __a, const int __b) +{ + return (uint16x4_t)__builtin_neon_vshr_nv4hi ((int16x4_t) __a, __b, 0); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vshr_n_u32 (uint32x2_t __a, const int __b) +{ + return (uint32x2_t)__builtin_neon_vshr_nv2si ((int32x2_t) __a, __b, 0); +} + +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vshr_n_u64 (uint64x1_t __a, const int __b) +{ + return (uint64x1_t)__builtin_neon_vshr_ndi ((int64x1_t) __a, __b, 0); +} + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vshrq_n_s8 (int8x16_t __a, const int __b) +{ + return (int8x16_t)__builtin_neon_vshr_nv16qi (__a, __b, 1); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vshrq_n_s16 (int16x8_t __a, const int __b) +{ + return (int16x8_t)__builtin_neon_vshr_nv8hi (__a, __b, 1); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vshrq_n_s32 (int32x4_t __a, const int __b) +{ + return (int32x4_t)__builtin_neon_vshr_nv4si (__a, __b, 1); +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vshrq_n_s64 (int64x2_t __a, const int __b) +{ + return (int64x2_t)__builtin_neon_vshr_nv2di (__a, __b, 1); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vshrq_n_u8 (uint8x16_t __a, const int __b) +{ + return (uint8x16_t)__builtin_neon_vshr_nv16qi ((int8x16_t) __a, __b, 0); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vshrq_n_u16 (uint16x8_t __a, const int __b) +{ + return (uint16x8_t)__builtin_neon_vshr_nv8hi ((int16x8_t) __a, __b, 0); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vshrq_n_u32 (uint32x4_t __a, const int __b) +{ + return (uint32x4_t)__builtin_neon_vshr_nv4si ((int32x4_t) __a, __b, 0); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vshrq_n_u64 (uint64x2_t __a, const int __b) +{ + return (uint64x2_t)__builtin_neon_vshr_nv2di ((int64x2_t) __a, __b, 0); +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vrshr_n_s8 (int8x8_t __a, const int __b) +{ + return (int8x8_t)__builtin_neon_vshr_nv8qi (__a, __b, 5); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vrshr_n_s16 (int16x4_t __a, const int __b) +{ + return (int16x4_t)__builtin_neon_vshr_nv4hi (__a, __b, 5); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vrshr_n_s32 (int32x2_t __a, const int __b) +{ + return (int32x2_t)__builtin_neon_vshr_nv2si (__a, __b, 5); +} + +__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +vrshr_n_s64 (int64x1_t __a, const int __b) +{ + return (int64x1_t)__builtin_neon_vshr_ndi (__a, __b, 5); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vrshr_n_u8 (uint8x8_t __a, const int __b) +{ + return (uint8x8_t)__builtin_neon_vshr_nv8qi ((int8x8_t) __a, __b, 4); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vrshr_n_u16 (uint16x4_t __a, const int __b) +{ + return (uint16x4_t)__builtin_neon_vshr_nv4hi ((int16x4_t) __a, __b, 4); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vrshr_n_u32 (uint32x2_t __a, const int __b) +{ + return (uint32x2_t)__builtin_neon_vshr_nv2si ((int32x2_t) __a, __b, 4); +} + +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vrshr_n_u64 (uint64x1_t __a, const int __b) +{ + return (uint64x1_t)__builtin_neon_vshr_ndi ((int64x1_t) __a, __b, 4); +} + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vrshrq_n_s8 (int8x16_t __a, const int __b) +{ + return (int8x16_t)__builtin_neon_vshr_nv16qi (__a, __b, 5); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vrshrq_n_s16 (int16x8_t __a, const int __b) +{ + return (int16x8_t)__builtin_neon_vshr_nv8hi (__a, __b, 5); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vrshrq_n_s32 (int32x4_t __a, const int __b) +{ + return (int32x4_t)__builtin_neon_vshr_nv4si (__a, __b, 5); +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vrshrq_n_s64 (int64x2_t __a, const int __b) +{ + return (int64x2_t)__builtin_neon_vshr_nv2di (__a, __b, 5); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vrshrq_n_u8 (uint8x16_t __a, const int __b) +{ + return (uint8x16_t)__builtin_neon_vshr_nv16qi ((int8x16_t) __a, __b, 4); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vrshrq_n_u16 (uint16x8_t __a, const int __b) +{ + return (uint16x8_t)__builtin_neon_vshr_nv8hi ((int16x8_t) __a, __b, 4); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vrshrq_n_u32 (uint32x4_t __a, const int __b) +{ + return (uint32x4_t)__builtin_neon_vshr_nv4si ((int32x4_t) __a, __b, 4); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vrshrq_n_u64 (uint64x2_t __a, const int __b) +{ + return (uint64x2_t)__builtin_neon_vshr_nv2di ((int64x2_t) __a, __b, 4); +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vshrn_n_s16 (int16x8_t __a, const int __b) +{ + return (int8x8_t)__builtin_neon_vshrn_nv8hi (__a, __b, 1); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vshrn_n_s32 (int32x4_t __a, const int __b) +{ + return (int16x4_t)__builtin_neon_vshrn_nv4si (__a, __b, 1); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vshrn_n_s64 (int64x2_t __a, const int __b) +{ + return (int32x2_t)__builtin_neon_vshrn_nv2di (__a, __b, 1); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vshrn_n_u16 (uint16x8_t __a, const int __b) +{ + return (uint8x8_t)__builtin_neon_vshrn_nv8hi ((int16x8_t) __a, __b, 0); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vshrn_n_u32 (uint32x4_t __a, const int __b) +{ + return (uint16x4_t)__builtin_neon_vshrn_nv4si ((int32x4_t) __a, __b, 0); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vshrn_n_u64 (uint64x2_t __a, const int __b) +{ + return (uint32x2_t)__builtin_neon_vshrn_nv2di ((int64x2_t) __a, __b, 0); +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vrshrn_n_s16 (int16x8_t __a, const int __b) +{ + return (int8x8_t)__builtin_neon_vshrn_nv8hi (__a, __b, 5); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vrshrn_n_s32 (int32x4_t __a, const int __b) +{ + return (int16x4_t)__builtin_neon_vshrn_nv4si (__a, __b, 5); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vrshrn_n_s64 (int64x2_t __a, const int __b) +{ + return (int32x2_t)__builtin_neon_vshrn_nv2di (__a, __b, 5); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vrshrn_n_u16 (uint16x8_t __a, const int __b) +{ + return (uint8x8_t)__builtin_neon_vshrn_nv8hi ((int16x8_t) __a, __b, 4); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vrshrn_n_u32 (uint32x4_t __a, const int __b) +{ + return (uint16x4_t)__builtin_neon_vshrn_nv4si ((int32x4_t) __a, __b, 4); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vrshrn_n_u64 (uint64x2_t __a, const int __b) +{ + return (uint32x2_t)__builtin_neon_vshrn_nv2di ((int64x2_t) __a, __b, 4); +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vqshrn_n_s16 (int16x8_t __a, const int __b) +{ + return (int8x8_t)__builtin_neon_vqshrn_nv8hi (__a, __b, 1); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vqshrn_n_s32 (int32x4_t __a, const int __b) +{ + return (int16x4_t)__builtin_neon_vqshrn_nv4si (__a, __b, 1); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vqshrn_n_s64 (int64x2_t __a, const int __b) +{ + return (int32x2_t)__builtin_neon_vqshrn_nv2di (__a, __b, 1); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vqshrn_n_u16 (uint16x8_t __a, const int __b) +{ + return (uint8x8_t)__builtin_neon_vqshrn_nv8hi ((int16x8_t) __a, __b, 0); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vqshrn_n_u32 (uint32x4_t __a, const int __b) +{ + return (uint16x4_t)__builtin_neon_vqshrn_nv4si ((int32x4_t) __a, __b, 0); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vqshrn_n_u64 (uint64x2_t __a, const int __b) +{ + return (uint32x2_t)__builtin_neon_vqshrn_nv2di ((int64x2_t) __a, __b, 0); +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vqrshrn_n_s16 (int16x8_t __a, const int __b) +{ + return (int8x8_t)__builtin_neon_vqshrn_nv8hi (__a, __b, 5); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vqrshrn_n_s32 (int32x4_t __a, const int __b) +{ + return (int16x4_t)__builtin_neon_vqshrn_nv4si (__a, __b, 5); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vqrshrn_n_s64 (int64x2_t __a, const int __b) +{ + return (int32x2_t)__builtin_neon_vqshrn_nv2di (__a, __b, 5); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vqrshrn_n_u16 (uint16x8_t __a, const int __b) +{ + return (uint8x8_t)__builtin_neon_vqshrn_nv8hi ((int16x8_t) __a, __b, 4); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vqrshrn_n_u32 (uint32x4_t __a, const int __b) +{ + return (uint16x4_t)__builtin_neon_vqshrn_nv4si ((int32x4_t) __a, __b, 4); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vqrshrn_n_u64 (uint64x2_t __a, const int __b) +{ + return (uint32x2_t)__builtin_neon_vqshrn_nv2di ((int64x2_t) __a, __b, 4); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vqshrun_n_s16 (int16x8_t __a, const int __b) +{ + return (uint8x8_t)__builtin_neon_vqshrun_nv8hi (__a, __b, 1); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vqshrun_n_s32 (int32x4_t __a, const int __b) +{ + return (uint16x4_t)__builtin_neon_vqshrun_nv4si (__a, __b, 1); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vqshrun_n_s64 (int64x2_t __a, const int __b) +{ + return (uint32x2_t)__builtin_neon_vqshrun_nv2di (__a, __b, 1); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vqrshrun_n_s16 (int16x8_t __a, const int __b) +{ + return (uint8x8_t)__builtin_neon_vqshrun_nv8hi (__a, __b, 5); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vqrshrun_n_s32 (int32x4_t __a, const int __b) +{ + return (uint16x4_t)__builtin_neon_vqshrun_nv4si (__a, __b, 5); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vqrshrun_n_s64 (int64x2_t __a, const int __b) +{ + return (uint32x2_t)__builtin_neon_vqshrun_nv2di (__a, __b, 5); +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vshl_n_s8 (int8x8_t __a, const int __b) +{ + return (int8x8_t)__builtin_neon_vshl_nv8qi (__a, __b, 1); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vshl_n_s16 (int16x4_t __a, const int __b) +{ + return (int16x4_t)__builtin_neon_vshl_nv4hi (__a, __b, 1); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vshl_n_s32 (int32x2_t __a, const int __b) +{ + return (int32x2_t)__builtin_neon_vshl_nv2si (__a, __b, 1); +} + +__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +vshl_n_s64 (int64x1_t __a, const int __b) +{ + return (int64x1_t)__builtin_neon_vshl_ndi (__a, __b, 1); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vshl_n_u8 (uint8x8_t __a, const int __b) +{ + return (uint8x8_t)__builtin_neon_vshl_nv8qi ((int8x8_t) __a, __b, 0); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vshl_n_u16 (uint16x4_t __a, const int __b) +{ + return (uint16x4_t)__builtin_neon_vshl_nv4hi ((int16x4_t) __a, __b, 0); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vshl_n_u32 (uint32x2_t __a, const int __b) +{ + return (uint32x2_t)__builtin_neon_vshl_nv2si ((int32x2_t) __a, __b, 0); +} + +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vshl_n_u64 (uint64x1_t __a, const int __b) +{ + return (uint64x1_t)__builtin_neon_vshl_ndi ((int64x1_t) __a, __b, 0); +} + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vshlq_n_s8 (int8x16_t __a, const int __b) +{ + return (int8x16_t)__builtin_neon_vshl_nv16qi (__a, __b, 1); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vshlq_n_s16 (int16x8_t __a, const int __b) +{ + return (int16x8_t)__builtin_neon_vshl_nv8hi (__a, __b, 1); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vshlq_n_s32 (int32x4_t __a, const int __b) +{ + return (int32x4_t)__builtin_neon_vshl_nv4si (__a, __b, 1); +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vshlq_n_s64 (int64x2_t __a, const int __b) +{ + return (int64x2_t)__builtin_neon_vshl_nv2di (__a, __b, 1); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vshlq_n_u8 (uint8x16_t __a, const int __b) +{ + return (uint8x16_t)__builtin_neon_vshl_nv16qi ((int8x16_t) __a, __b, 0); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vshlq_n_u16 (uint16x8_t __a, const int __b) +{ + return (uint16x8_t)__builtin_neon_vshl_nv8hi ((int16x8_t) __a, __b, 0); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vshlq_n_u32 (uint32x4_t __a, const int __b) +{ + return (uint32x4_t)__builtin_neon_vshl_nv4si ((int32x4_t) __a, __b, 0); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vshlq_n_u64 (uint64x2_t __a, const int __b) +{ + return (uint64x2_t)__builtin_neon_vshl_nv2di ((int64x2_t) __a, __b, 0); +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vqshl_n_s8 (int8x8_t __a, const int __b) +{ + return (int8x8_t)__builtin_neon_vqshl_nv8qi (__a, __b, 1); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vqshl_n_s16 (int16x4_t __a, const int __b) +{ + return (int16x4_t)__builtin_neon_vqshl_nv4hi (__a, __b, 1); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vqshl_n_s32 (int32x2_t __a, const int __b) +{ + return (int32x2_t)__builtin_neon_vqshl_nv2si (__a, __b, 1); +} + +__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +vqshl_n_s64 (int64x1_t __a, const int __b) +{ + return (int64x1_t)__builtin_neon_vqshl_ndi (__a, __b, 1); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vqshl_n_u8 (uint8x8_t __a, const int __b) +{ + return (uint8x8_t)__builtin_neon_vqshl_nv8qi ((int8x8_t) __a, __b, 0); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vqshl_n_u16 (uint16x4_t __a, const int __b) +{ + return (uint16x4_t)__builtin_neon_vqshl_nv4hi ((int16x4_t) __a, __b, 0); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vqshl_n_u32 (uint32x2_t __a, const int __b) +{ + return (uint32x2_t)__builtin_neon_vqshl_nv2si ((int32x2_t) __a, __b, 0); +} + +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vqshl_n_u64 (uint64x1_t __a, const int __b) +{ + return (uint64x1_t)__builtin_neon_vqshl_ndi ((int64x1_t) __a, __b, 0); +} + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vqshlq_n_s8 (int8x16_t __a, const int __b) +{ + return (int8x16_t)__builtin_neon_vqshl_nv16qi (__a, __b, 1); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vqshlq_n_s16 (int16x8_t __a, const int __b) +{ + return (int16x8_t)__builtin_neon_vqshl_nv8hi (__a, __b, 1); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vqshlq_n_s32 (int32x4_t __a, const int __b) +{ + return (int32x4_t)__builtin_neon_vqshl_nv4si (__a, __b, 1); +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vqshlq_n_s64 (int64x2_t __a, const int __b) +{ + return (int64x2_t)__builtin_neon_vqshl_nv2di (__a, __b, 1); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vqshlq_n_u8 (uint8x16_t __a, const int __b) +{ + return (uint8x16_t)__builtin_neon_vqshl_nv16qi ((int8x16_t) __a, __b, 0); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vqshlq_n_u16 (uint16x8_t __a, const int __b) +{ + return (uint16x8_t)__builtin_neon_vqshl_nv8hi ((int16x8_t) __a, __b, 0); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vqshlq_n_u32 (uint32x4_t __a, const int __b) +{ + return (uint32x4_t)__builtin_neon_vqshl_nv4si ((int32x4_t) __a, __b, 0); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vqshlq_n_u64 (uint64x2_t __a, const int __b) +{ + return (uint64x2_t)__builtin_neon_vqshl_nv2di ((int64x2_t) __a, __b, 0); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vqshlu_n_s8 (int8x8_t __a, const int __b) +{ + return (uint8x8_t)__builtin_neon_vqshlu_nv8qi (__a, __b, 1); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vqshlu_n_s16 (int16x4_t __a, const int __b) +{ + return (uint16x4_t)__builtin_neon_vqshlu_nv4hi (__a, __b, 1); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vqshlu_n_s32 (int32x2_t __a, const int __b) +{ + return (uint32x2_t)__builtin_neon_vqshlu_nv2si (__a, __b, 1); +} + +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vqshlu_n_s64 (int64x1_t __a, const int __b) +{ + return (uint64x1_t)__builtin_neon_vqshlu_ndi (__a, __b, 1); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vqshluq_n_s8 (int8x16_t __a, const int __b) +{ + return (uint8x16_t)__builtin_neon_vqshlu_nv16qi (__a, __b, 1); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vqshluq_n_s16 (int16x8_t __a, const int __b) +{ + return (uint16x8_t)__builtin_neon_vqshlu_nv8hi (__a, __b, 1); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vqshluq_n_s32 (int32x4_t __a, const int __b) +{ + return (uint32x4_t)__builtin_neon_vqshlu_nv4si (__a, __b, 1); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vqshluq_n_s64 (int64x2_t __a, const int __b) +{ + return (uint64x2_t)__builtin_neon_vqshlu_nv2di (__a, __b, 1); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vshll_n_s8 (int8x8_t __a, const int __b) +{ + return (int16x8_t)__builtin_neon_vshll_nv8qi (__a, __b, 1); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vshll_n_s16 (int16x4_t __a, const int __b) +{ + return (int32x4_t)__builtin_neon_vshll_nv4hi (__a, __b, 1); +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vshll_n_s32 (int32x2_t __a, const int __b) +{ + return (int64x2_t)__builtin_neon_vshll_nv2si (__a, __b, 1); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vshll_n_u8 (uint8x8_t __a, const int __b) +{ + return (uint16x8_t)__builtin_neon_vshll_nv8qi ((int8x8_t) __a, __b, 0); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vshll_n_u16 (uint16x4_t __a, const int __b) +{ + return (uint32x4_t)__builtin_neon_vshll_nv4hi ((int16x4_t) __a, __b, 0); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vshll_n_u32 (uint32x2_t __a, const int __b) +{ + return (uint64x2_t)__builtin_neon_vshll_nv2si ((int32x2_t) __a, __b, 0); +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vsra_n_s8 (int8x8_t __a, int8x8_t __b, const int __c) +{ + return (int8x8_t)__builtin_neon_vsra_nv8qi (__a, __b, __c, 1); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vsra_n_s16 (int16x4_t __a, int16x4_t __b, const int __c) +{ + return (int16x4_t)__builtin_neon_vsra_nv4hi (__a, __b, __c, 1); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vsra_n_s32 (int32x2_t __a, int32x2_t __b, const int __c) +{ + return (int32x2_t)__builtin_neon_vsra_nv2si (__a, __b, __c, 1); +} + +__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +vsra_n_s64 (int64x1_t __a, int64x1_t __b, const int __c) +{ + return (int64x1_t)__builtin_neon_vsra_ndi (__a, __b, __c, 1); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vsra_n_u8 (uint8x8_t __a, uint8x8_t __b, const int __c) +{ + return (uint8x8_t)__builtin_neon_vsra_nv8qi ((int8x8_t) __a, (int8x8_t) __b, __c, 0); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vsra_n_u16 (uint16x4_t __a, uint16x4_t __b, const int __c) +{ + return (uint16x4_t)__builtin_neon_vsra_nv4hi ((int16x4_t) __a, (int16x4_t) __b, __c, 0); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vsra_n_u32 (uint32x2_t __a, uint32x2_t __b, const int __c) +{ + return (uint32x2_t)__builtin_neon_vsra_nv2si ((int32x2_t) __a, (int32x2_t) __b, __c, 0); +} + +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vsra_n_u64 (uint64x1_t __a, uint64x1_t __b, const int __c) +{ + return (uint64x1_t)__builtin_neon_vsra_ndi ((int64x1_t) __a, (int64x1_t) __b, __c, 0); +} + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vsraq_n_s8 (int8x16_t __a, int8x16_t __b, const int __c) +{ + return (int8x16_t)__builtin_neon_vsra_nv16qi (__a, __b, __c, 1); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vsraq_n_s16 (int16x8_t __a, int16x8_t __b, const int __c) +{ + return (int16x8_t)__builtin_neon_vsra_nv8hi (__a, __b, __c, 1); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vsraq_n_s32 (int32x4_t __a, int32x4_t __b, const int __c) +{ + return (int32x4_t)__builtin_neon_vsra_nv4si (__a, __b, __c, 1); +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vsraq_n_s64 (int64x2_t __a, int64x2_t __b, const int __c) +{ + return (int64x2_t)__builtin_neon_vsra_nv2di (__a, __b, __c, 1); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vsraq_n_u8 (uint8x16_t __a, uint8x16_t __b, const int __c) +{ + return (uint8x16_t)__builtin_neon_vsra_nv16qi ((int8x16_t) __a, (int8x16_t) __b, __c, 0); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vsraq_n_u16 (uint16x8_t __a, uint16x8_t __b, const int __c) +{ + return (uint16x8_t)__builtin_neon_vsra_nv8hi ((int16x8_t) __a, (int16x8_t) __b, __c, 0); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vsraq_n_u32 (uint32x4_t __a, uint32x4_t __b, const int __c) +{ + return (uint32x4_t)__builtin_neon_vsra_nv4si ((int32x4_t) __a, (int32x4_t) __b, __c, 0); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vsraq_n_u64 (uint64x2_t __a, uint64x2_t __b, const int __c) +{ + return (uint64x2_t)__builtin_neon_vsra_nv2di ((int64x2_t) __a, (int64x2_t) __b, __c, 0); +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vrsra_n_s8 (int8x8_t __a, int8x8_t __b, const int __c) +{ + return (int8x8_t)__builtin_neon_vsra_nv8qi (__a, __b, __c, 5); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vrsra_n_s16 (int16x4_t __a, int16x4_t __b, const int __c) +{ + return (int16x4_t)__builtin_neon_vsra_nv4hi (__a, __b, __c, 5); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vrsra_n_s32 (int32x2_t __a, int32x2_t __b, const int __c) +{ + return (int32x2_t)__builtin_neon_vsra_nv2si (__a, __b, __c, 5); +} + +__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +vrsra_n_s64 (int64x1_t __a, int64x1_t __b, const int __c) +{ + return (int64x1_t)__builtin_neon_vsra_ndi (__a, __b, __c, 5); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vrsra_n_u8 (uint8x8_t __a, uint8x8_t __b, const int __c) +{ + return (uint8x8_t)__builtin_neon_vsra_nv8qi ((int8x8_t) __a, (int8x8_t) __b, __c, 4); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vrsra_n_u16 (uint16x4_t __a, uint16x4_t __b, const int __c) +{ + return (uint16x4_t)__builtin_neon_vsra_nv4hi ((int16x4_t) __a, (int16x4_t) __b, __c, 4); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vrsra_n_u32 (uint32x2_t __a, uint32x2_t __b, const int __c) +{ + return (uint32x2_t)__builtin_neon_vsra_nv2si ((int32x2_t) __a, (int32x2_t) __b, __c, 4); +} + +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vrsra_n_u64 (uint64x1_t __a, uint64x1_t __b, const int __c) +{ + return (uint64x1_t)__builtin_neon_vsra_ndi ((int64x1_t) __a, (int64x1_t) __b, __c, 4); +} + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vrsraq_n_s8 (int8x16_t __a, int8x16_t __b, const int __c) +{ + return (int8x16_t)__builtin_neon_vsra_nv16qi (__a, __b, __c, 5); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vrsraq_n_s16 (int16x8_t __a, int16x8_t __b, const int __c) +{ + return (int16x8_t)__builtin_neon_vsra_nv8hi (__a, __b, __c, 5); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vrsraq_n_s32 (int32x4_t __a, int32x4_t __b, const int __c) +{ + return (int32x4_t)__builtin_neon_vsra_nv4si (__a, __b, __c, 5); +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vrsraq_n_s64 (int64x2_t __a, int64x2_t __b, const int __c) +{ + return (int64x2_t)__builtin_neon_vsra_nv2di (__a, __b, __c, 5); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vrsraq_n_u8 (uint8x16_t __a, uint8x16_t __b, const int __c) +{ + return (uint8x16_t)__builtin_neon_vsra_nv16qi ((int8x16_t) __a, (int8x16_t) __b, __c, 4); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vrsraq_n_u16 (uint16x8_t __a, uint16x8_t __b, const int __c) +{ + return (uint16x8_t)__builtin_neon_vsra_nv8hi ((int16x8_t) __a, (int16x8_t) __b, __c, 4); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vrsraq_n_u32 (uint32x4_t __a, uint32x4_t __b, const int __c) +{ + return (uint32x4_t)__builtin_neon_vsra_nv4si ((int32x4_t) __a, (int32x4_t) __b, __c, 4); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vrsraq_n_u64 (uint64x2_t __a, uint64x2_t __b, const int __c) +{ + return (uint64x2_t)__builtin_neon_vsra_nv2di ((int64x2_t) __a, (int64x2_t) __b, __c, 4); +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vsri_n_s8 (int8x8_t __a, int8x8_t __b, const int __c) +{ + return (int8x8_t)__builtin_neon_vsri_nv8qi (__a, __b, __c); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vsri_n_s16 (int16x4_t __a, int16x4_t __b, const int __c) +{ + return (int16x4_t)__builtin_neon_vsri_nv4hi (__a, __b, __c); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vsri_n_s32 (int32x2_t __a, int32x2_t __b, const int __c) +{ + return (int32x2_t)__builtin_neon_vsri_nv2si (__a, __b, __c); +} + +__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +vsri_n_s64 (int64x1_t __a, int64x1_t __b, const int __c) +{ + return (int64x1_t)__builtin_neon_vsri_ndi (__a, __b, __c); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vsri_n_u8 (uint8x8_t __a, uint8x8_t __b, const int __c) +{ + return (uint8x8_t)__builtin_neon_vsri_nv8qi ((int8x8_t) __a, (int8x8_t) __b, __c); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vsri_n_u16 (uint16x4_t __a, uint16x4_t __b, const int __c) +{ + return (uint16x4_t)__builtin_neon_vsri_nv4hi ((int16x4_t) __a, (int16x4_t) __b, __c); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vsri_n_u32 (uint32x2_t __a, uint32x2_t __b, const int __c) +{ + return (uint32x2_t)__builtin_neon_vsri_nv2si ((int32x2_t) __a, (int32x2_t) __b, __c); +} + +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vsri_n_u64 (uint64x1_t __a, uint64x1_t __b, const int __c) +{ + return (uint64x1_t)__builtin_neon_vsri_ndi ((int64x1_t) __a, (int64x1_t) __b, __c); +} + +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +vsri_n_p8 (poly8x8_t __a, poly8x8_t __b, const int __c) +{ + return (poly8x8_t)__builtin_neon_vsri_nv8qi ((int8x8_t) __a, (int8x8_t) __b, __c); +} + +__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) +vsri_n_p16 (poly16x4_t __a, poly16x4_t __b, const int __c) +{ + return (poly16x4_t)__builtin_neon_vsri_nv4hi ((int16x4_t) __a, (int16x4_t) __b, __c); +} + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vsriq_n_s8 (int8x16_t __a, int8x16_t __b, const int __c) +{ + return (int8x16_t)__builtin_neon_vsri_nv16qi (__a, __b, __c); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vsriq_n_s16 (int16x8_t __a, int16x8_t __b, const int __c) +{ + return (int16x8_t)__builtin_neon_vsri_nv8hi (__a, __b, __c); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vsriq_n_s32 (int32x4_t __a, int32x4_t __b, const int __c) +{ + return (int32x4_t)__builtin_neon_vsri_nv4si (__a, __b, __c); +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vsriq_n_s64 (int64x2_t __a, int64x2_t __b, const int __c) +{ + return (int64x2_t)__builtin_neon_vsri_nv2di (__a, __b, __c); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vsriq_n_u8 (uint8x16_t __a, uint8x16_t __b, const int __c) +{ + return (uint8x16_t)__builtin_neon_vsri_nv16qi ((int8x16_t) __a, (int8x16_t) __b, __c); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vsriq_n_u16 (uint16x8_t __a, uint16x8_t __b, const int __c) +{ + return (uint16x8_t)__builtin_neon_vsri_nv8hi ((int16x8_t) __a, (int16x8_t) __b, __c); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vsriq_n_u32 (uint32x4_t __a, uint32x4_t __b, const int __c) +{ + return (uint32x4_t)__builtin_neon_vsri_nv4si ((int32x4_t) __a, (int32x4_t) __b, __c); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vsriq_n_u64 (uint64x2_t __a, uint64x2_t __b, const int __c) +{ + return (uint64x2_t)__builtin_neon_vsri_nv2di ((int64x2_t) __a, (int64x2_t) __b, __c); +} + +__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +vsriq_n_p8 (poly8x16_t __a, poly8x16_t __b, const int __c) +{ + return (poly8x16_t)__builtin_neon_vsri_nv16qi ((int8x16_t) __a, (int8x16_t) __b, __c); +} + +__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) +vsriq_n_p16 (poly16x8_t __a, poly16x8_t __b, const int __c) +{ + return (poly16x8_t)__builtin_neon_vsri_nv8hi ((int16x8_t) __a, (int16x8_t) __b, __c); +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vsli_n_s8 (int8x8_t __a, int8x8_t __b, const int __c) +{ + return (int8x8_t)__builtin_neon_vsli_nv8qi (__a, __b, __c); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vsli_n_s16 (int16x4_t __a, int16x4_t __b, const int __c) +{ + return (int16x4_t)__builtin_neon_vsli_nv4hi (__a, __b, __c); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vsli_n_s32 (int32x2_t __a, int32x2_t __b, const int __c) +{ + return (int32x2_t)__builtin_neon_vsli_nv2si (__a, __b, __c); +} + +__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +vsli_n_s64 (int64x1_t __a, int64x1_t __b, const int __c) +{ + return (int64x1_t)__builtin_neon_vsli_ndi (__a, __b, __c); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vsli_n_u8 (uint8x8_t __a, uint8x8_t __b, const int __c) +{ + return (uint8x8_t)__builtin_neon_vsli_nv8qi ((int8x8_t) __a, (int8x8_t) __b, __c); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vsli_n_u16 (uint16x4_t __a, uint16x4_t __b, const int __c) +{ + return (uint16x4_t)__builtin_neon_vsli_nv4hi ((int16x4_t) __a, (int16x4_t) __b, __c); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vsli_n_u32 (uint32x2_t __a, uint32x2_t __b, const int __c) +{ + return (uint32x2_t)__builtin_neon_vsli_nv2si ((int32x2_t) __a, (int32x2_t) __b, __c); +} + +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vsli_n_u64 (uint64x1_t __a, uint64x1_t __b, const int __c) +{ + return (uint64x1_t)__builtin_neon_vsli_ndi ((int64x1_t) __a, (int64x1_t) __b, __c); +} + +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +vsli_n_p8 (poly8x8_t __a, poly8x8_t __b, const int __c) +{ + return (poly8x8_t)__builtin_neon_vsli_nv8qi ((int8x8_t) __a, (int8x8_t) __b, __c); +} + +__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) +vsli_n_p16 (poly16x4_t __a, poly16x4_t __b, const int __c) +{ + return (poly16x4_t)__builtin_neon_vsli_nv4hi ((int16x4_t) __a, (int16x4_t) __b, __c); +} + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vsliq_n_s8 (int8x16_t __a, int8x16_t __b, const int __c) +{ + return (int8x16_t)__builtin_neon_vsli_nv16qi (__a, __b, __c); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vsliq_n_s16 (int16x8_t __a, int16x8_t __b, const int __c) +{ + return (int16x8_t)__builtin_neon_vsli_nv8hi (__a, __b, __c); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vsliq_n_s32 (int32x4_t __a, int32x4_t __b, const int __c) +{ + return (int32x4_t)__builtin_neon_vsli_nv4si (__a, __b, __c); +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vsliq_n_s64 (int64x2_t __a, int64x2_t __b, const int __c) +{ + return (int64x2_t)__builtin_neon_vsli_nv2di (__a, __b, __c); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vsliq_n_u8 (uint8x16_t __a, uint8x16_t __b, const int __c) +{ + return (uint8x16_t)__builtin_neon_vsli_nv16qi ((int8x16_t) __a, (int8x16_t) __b, __c); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vsliq_n_u16 (uint16x8_t __a, uint16x8_t __b, const int __c) +{ + return (uint16x8_t)__builtin_neon_vsli_nv8hi ((int16x8_t) __a, (int16x8_t) __b, __c); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vsliq_n_u32 (uint32x4_t __a, uint32x4_t __b, const int __c) +{ + return (uint32x4_t)__builtin_neon_vsli_nv4si ((int32x4_t) __a, (int32x4_t) __b, __c); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vsliq_n_u64 (uint64x2_t __a, uint64x2_t __b, const int __c) +{ + return (uint64x2_t)__builtin_neon_vsli_nv2di ((int64x2_t) __a, (int64x2_t) __b, __c); +} + +__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +vsliq_n_p8 (poly8x16_t __a, poly8x16_t __b, const int __c) +{ + return (poly8x16_t)__builtin_neon_vsli_nv16qi ((int8x16_t) __a, (int8x16_t) __b, __c); +} + +__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) +vsliq_n_p16 (poly16x8_t __a, poly16x8_t __b, const int __c) +{ + return (poly16x8_t)__builtin_neon_vsli_nv8hi ((int16x8_t) __a, (int16x8_t) __b, __c); +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vabs_s8 (int8x8_t __a) +{ + return (int8x8_t)__builtin_neon_vabsv8qi (__a, 1); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vabs_s16 (int16x4_t __a) +{ + return (int16x4_t)__builtin_neon_vabsv4hi (__a, 1); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vabs_s32 (int32x2_t __a) +{ + return (int32x2_t)__builtin_neon_vabsv2si (__a, 1); +} + +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vabs_f32 (float32x2_t __a) +{ + return (float32x2_t)__builtin_neon_vabsv2sf (__a, 3); +} + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vabsq_s8 (int8x16_t __a) +{ + return (int8x16_t)__builtin_neon_vabsv16qi (__a, 1); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vabsq_s16 (int16x8_t __a) +{ + return (int16x8_t)__builtin_neon_vabsv8hi (__a, 1); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vabsq_s32 (int32x4_t __a) +{ + return (int32x4_t)__builtin_neon_vabsv4si (__a, 1); +} + +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vabsq_f32 (float32x4_t __a) +{ + return (float32x4_t)__builtin_neon_vabsv4sf (__a, 3); +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vqabs_s8 (int8x8_t __a) +{ + return (int8x8_t)__builtin_neon_vqabsv8qi (__a, 1); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vqabs_s16 (int16x4_t __a) +{ + return (int16x4_t)__builtin_neon_vqabsv4hi (__a, 1); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vqabs_s32 (int32x2_t __a) +{ + return (int32x2_t)__builtin_neon_vqabsv2si (__a, 1); +} + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vqabsq_s8 (int8x16_t __a) +{ + return (int8x16_t)__builtin_neon_vqabsv16qi (__a, 1); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vqabsq_s16 (int16x8_t __a) +{ + return (int16x8_t)__builtin_neon_vqabsv8hi (__a, 1); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vqabsq_s32 (int32x4_t __a) +{ + return (int32x4_t)__builtin_neon_vqabsv4si (__a, 1); +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vneg_s8 (int8x8_t __a) +{ + return (int8x8_t)__builtin_neon_vnegv8qi (__a, 1); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vneg_s16 (int16x4_t __a) +{ + return (int16x4_t)__builtin_neon_vnegv4hi (__a, 1); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vneg_s32 (int32x2_t __a) +{ + return (int32x2_t)__builtin_neon_vnegv2si (__a, 1); +} + +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vneg_f32 (float32x2_t __a) +{ + return (float32x2_t)__builtin_neon_vnegv2sf (__a, 3); +} + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vnegq_s8 (int8x16_t __a) +{ + return (int8x16_t)__builtin_neon_vnegv16qi (__a, 1); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vnegq_s16 (int16x8_t __a) +{ + return (int16x8_t)__builtin_neon_vnegv8hi (__a, 1); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vnegq_s32 (int32x4_t __a) +{ + return (int32x4_t)__builtin_neon_vnegv4si (__a, 1); +} + +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vnegq_f32 (float32x4_t __a) +{ + return (float32x4_t)__builtin_neon_vnegv4sf (__a, 3); +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vqneg_s8 (int8x8_t __a) +{ + return (int8x8_t)__builtin_neon_vqnegv8qi (__a, 1); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vqneg_s16 (int16x4_t __a) +{ + return (int16x4_t)__builtin_neon_vqnegv4hi (__a, 1); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vqneg_s32 (int32x2_t __a) +{ + return (int32x2_t)__builtin_neon_vqnegv2si (__a, 1); +} + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vqnegq_s8 (int8x16_t __a) +{ + return (int8x16_t)__builtin_neon_vqnegv16qi (__a, 1); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vqnegq_s16 (int16x8_t __a) +{ + return (int16x8_t)__builtin_neon_vqnegv8hi (__a, 1); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vqnegq_s32 (int32x4_t __a) +{ + return (int32x4_t)__builtin_neon_vqnegv4si (__a, 1); +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vmvn_s8 (int8x8_t __a) +{ + return (int8x8_t)__builtin_neon_vmvnv8qi (__a, 1); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vmvn_s16 (int16x4_t __a) +{ + return (int16x4_t)__builtin_neon_vmvnv4hi (__a, 1); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vmvn_s32 (int32x2_t __a) +{ + return (int32x2_t)__builtin_neon_vmvnv2si (__a, 1); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vmvn_u8 (uint8x8_t __a) +{ + return (uint8x8_t)__builtin_neon_vmvnv8qi ((int8x8_t) __a, 0); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vmvn_u16 (uint16x4_t __a) +{ + return (uint16x4_t)__builtin_neon_vmvnv4hi ((int16x4_t) __a, 0); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vmvn_u32 (uint32x2_t __a) +{ + return (uint32x2_t)__builtin_neon_vmvnv2si ((int32x2_t) __a, 0); +} + +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +vmvn_p8 (poly8x8_t __a) +{ + return (poly8x8_t)__builtin_neon_vmvnv8qi ((int8x8_t) __a, 2); +} + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vmvnq_s8 (int8x16_t __a) +{ + return (int8x16_t)__builtin_neon_vmvnv16qi (__a, 1); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vmvnq_s16 (int16x8_t __a) +{ + return (int16x8_t)__builtin_neon_vmvnv8hi (__a, 1); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vmvnq_s32 (int32x4_t __a) +{ + return (int32x4_t)__builtin_neon_vmvnv4si (__a, 1); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vmvnq_u8 (uint8x16_t __a) +{ + return (uint8x16_t)__builtin_neon_vmvnv16qi ((int8x16_t) __a, 0); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vmvnq_u16 (uint16x8_t __a) +{ + return (uint16x8_t)__builtin_neon_vmvnv8hi ((int16x8_t) __a, 0); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vmvnq_u32 (uint32x4_t __a) +{ + return (uint32x4_t)__builtin_neon_vmvnv4si ((int32x4_t) __a, 0); +} + +__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +vmvnq_p8 (poly8x16_t __a) +{ + return (poly8x16_t)__builtin_neon_vmvnv16qi ((int8x16_t) __a, 2); +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vcls_s8 (int8x8_t __a) +{ + return (int8x8_t)__builtin_neon_vclsv8qi (__a, 1); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vcls_s16 (int16x4_t __a) +{ + return (int16x4_t)__builtin_neon_vclsv4hi (__a, 1); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vcls_s32 (int32x2_t __a) +{ + return (int32x2_t)__builtin_neon_vclsv2si (__a, 1); +} + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vclsq_s8 (int8x16_t __a) +{ + return (int8x16_t)__builtin_neon_vclsv16qi (__a, 1); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vclsq_s16 (int16x8_t __a) +{ + return (int16x8_t)__builtin_neon_vclsv8hi (__a, 1); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vclsq_s32 (int32x4_t __a) +{ + return (int32x4_t)__builtin_neon_vclsv4si (__a, 1); +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vclz_s8 (int8x8_t __a) +{ + return (int8x8_t)__builtin_neon_vclzv8qi (__a, 1); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vclz_s16 (int16x4_t __a) +{ + return (int16x4_t)__builtin_neon_vclzv4hi (__a, 1); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vclz_s32 (int32x2_t __a) +{ + return (int32x2_t)__builtin_neon_vclzv2si (__a, 1); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vclz_u8 (uint8x8_t __a) +{ + return (uint8x8_t)__builtin_neon_vclzv8qi ((int8x8_t) __a, 0); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vclz_u16 (uint16x4_t __a) +{ + return (uint16x4_t)__builtin_neon_vclzv4hi ((int16x4_t) __a, 0); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vclz_u32 (uint32x2_t __a) +{ + return (uint32x2_t)__builtin_neon_vclzv2si ((int32x2_t) __a, 0); +} + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vclzq_s8 (int8x16_t __a) +{ + return (int8x16_t)__builtin_neon_vclzv16qi (__a, 1); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vclzq_s16 (int16x8_t __a) +{ + return (int16x8_t)__builtin_neon_vclzv8hi (__a, 1); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vclzq_s32 (int32x4_t __a) +{ + return (int32x4_t)__builtin_neon_vclzv4si (__a, 1); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vclzq_u8 (uint8x16_t __a) +{ + return (uint8x16_t)__builtin_neon_vclzv16qi ((int8x16_t) __a, 0); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vclzq_u16 (uint16x8_t __a) +{ + return (uint16x8_t)__builtin_neon_vclzv8hi ((int16x8_t) __a, 0); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vclzq_u32 (uint32x4_t __a) +{ + return (uint32x4_t)__builtin_neon_vclzv4si ((int32x4_t) __a, 0); +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vcnt_s8 (int8x8_t __a) +{ + return (int8x8_t)__builtin_neon_vcntv8qi (__a, 1); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vcnt_u8 (uint8x8_t __a) +{ + return (uint8x8_t)__builtin_neon_vcntv8qi ((int8x8_t) __a, 0); +} + +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +vcnt_p8 (poly8x8_t __a) +{ + return (poly8x8_t)__builtin_neon_vcntv8qi ((int8x8_t) __a, 2); +} + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vcntq_s8 (int8x16_t __a) +{ + return (int8x16_t)__builtin_neon_vcntv16qi (__a, 1); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vcntq_u8 (uint8x16_t __a) +{ + return (uint8x16_t)__builtin_neon_vcntv16qi ((int8x16_t) __a, 0); +} + +__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +vcntq_p8 (poly8x16_t __a) +{ + return (poly8x16_t)__builtin_neon_vcntv16qi ((int8x16_t) __a, 2); +} + +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vrecpe_f32 (float32x2_t __a) +{ + return (float32x2_t)__builtin_neon_vrecpev2sf (__a, 3); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vrecpe_u32 (uint32x2_t __a) +{ + return (uint32x2_t)__builtin_neon_vrecpev2si ((int32x2_t) __a, 0); +} + +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vrecpeq_f32 (float32x4_t __a) +{ + return (float32x4_t)__builtin_neon_vrecpev4sf (__a, 3); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vrecpeq_u32 (uint32x4_t __a) +{ + return (uint32x4_t)__builtin_neon_vrecpev4si ((int32x4_t) __a, 0); +} + +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vrsqrte_f32 (float32x2_t __a) +{ + return (float32x2_t)__builtin_neon_vrsqrtev2sf (__a, 3); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vrsqrte_u32 (uint32x2_t __a) +{ + return (uint32x2_t)__builtin_neon_vrsqrtev2si ((int32x2_t) __a, 0); +} + +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vrsqrteq_f32 (float32x4_t __a) +{ + return (float32x4_t)__builtin_neon_vrsqrtev4sf (__a, 3); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vrsqrteq_u32 (uint32x4_t __a) +{ + return (uint32x4_t)__builtin_neon_vrsqrtev4si ((int32x4_t) __a, 0); +} + +__extension__ static __inline int8_t __attribute__ ((__always_inline__)) +vget_lane_s8 (int8x8_t __a, const int __b) +{ + return (int8_t)__builtin_neon_vget_lanev8qi (__a, __b, 1); +} + +__extension__ static __inline int16_t __attribute__ ((__always_inline__)) +vget_lane_s16 (int16x4_t __a, const int __b) +{ + return (int16_t)__builtin_neon_vget_lanev4hi (__a, __b, 1); +} + +__extension__ static __inline int32_t __attribute__ ((__always_inline__)) +vget_lane_s32 (int32x2_t __a, const int __b) +{ + return (int32_t)__builtin_neon_vget_lanev2si (__a, __b, 1); +} + +__extension__ static __inline float32_t __attribute__ ((__always_inline__)) +vget_lane_f32 (float32x2_t __a, const int __b) +{ + return (float32_t)__builtin_neon_vget_lanev2sf (__a, __b, 3); +} + +__extension__ static __inline uint8_t __attribute__ ((__always_inline__)) +vget_lane_u8 (uint8x8_t __a, const int __b) +{ + return (uint8_t)__builtin_neon_vget_lanev8qi ((int8x8_t) __a, __b, 0); +} + +__extension__ static __inline uint16_t __attribute__ ((__always_inline__)) +vget_lane_u16 (uint16x4_t __a, const int __b) +{ + return (uint16_t)__builtin_neon_vget_lanev4hi ((int16x4_t) __a, __b, 0); +} + +__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) +vget_lane_u32 (uint32x2_t __a, const int __b) +{ + return (uint32_t)__builtin_neon_vget_lanev2si ((int32x2_t) __a, __b, 0); +} + +__extension__ static __inline poly8_t __attribute__ ((__always_inline__)) +vget_lane_p8 (poly8x8_t __a, const int __b) +{ + return (poly8_t)__builtin_neon_vget_lanev8qi ((int8x8_t) __a, __b, 2); +} + +__extension__ static __inline poly16_t __attribute__ ((__always_inline__)) +vget_lane_p16 (poly16x4_t __a, const int __b) +{ + return (poly16_t)__builtin_neon_vget_lanev4hi ((int16x4_t) __a, __b, 2); +} + +__extension__ static __inline int64_t __attribute__ ((__always_inline__)) +vget_lane_s64 (int64x1_t __a, const int __b) +{ + return (int64_t)__builtin_neon_vget_lanedi (__a, __b, 1); +} + +__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) +vget_lane_u64 (uint64x1_t __a, const int __b) +{ + return (uint64_t)__builtin_neon_vget_lanedi ((int64x1_t) __a, __b, 0); +} + +__extension__ static __inline int8_t __attribute__ ((__always_inline__)) +vgetq_lane_s8 (int8x16_t __a, const int __b) +{ + return (int8_t)__builtin_neon_vget_lanev16qi (__a, __b, 1); +} + +__extension__ static __inline int16_t __attribute__ ((__always_inline__)) +vgetq_lane_s16 (int16x8_t __a, const int __b) +{ + return (int16_t)__builtin_neon_vget_lanev8hi (__a, __b, 1); +} + +__extension__ static __inline int32_t __attribute__ ((__always_inline__)) +vgetq_lane_s32 (int32x4_t __a, const int __b) +{ + return (int32_t)__builtin_neon_vget_lanev4si (__a, __b, 1); +} + +__extension__ static __inline float32_t __attribute__ ((__always_inline__)) +vgetq_lane_f32 (float32x4_t __a, const int __b) +{ + return (float32_t)__builtin_neon_vget_lanev4sf (__a, __b, 3); +} + +__extension__ static __inline uint8_t __attribute__ ((__always_inline__)) +vgetq_lane_u8 (uint8x16_t __a, const int __b) +{ + return (uint8_t)__builtin_neon_vget_lanev16qi ((int8x16_t) __a, __b, 0); +} + +__extension__ static __inline uint16_t __attribute__ ((__always_inline__)) +vgetq_lane_u16 (uint16x8_t __a, const int __b) +{ + return (uint16_t)__builtin_neon_vget_lanev8hi ((int16x8_t) __a, __b, 0); +} + +__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) +vgetq_lane_u32 (uint32x4_t __a, const int __b) +{ + return (uint32_t)__builtin_neon_vget_lanev4si ((int32x4_t) __a, __b, 0); +} + +__extension__ static __inline poly8_t __attribute__ ((__always_inline__)) +vgetq_lane_p8 (poly8x16_t __a, const int __b) +{ + return (poly8_t)__builtin_neon_vget_lanev16qi ((int8x16_t) __a, __b, 2); +} + +__extension__ static __inline poly16_t __attribute__ ((__always_inline__)) +vgetq_lane_p16 (poly16x8_t __a, const int __b) +{ + return (poly16_t)__builtin_neon_vget_lanev8hi ((int16x8_t) __a, __b, 2); +} + +__extension__ static __inline int64_t __attribute__ ((__always_inline__)) +vgetq_lane_s64 (int64x2_t __a, const int __b) +{ + return (int64_t)__builtin_neon_vget_lanev2di (__a, __b, 1); +} + +__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) +vgetq_lane_u64 (uint64x2_t __a, const int __b) +{ + return (uint64_t)__builtin_neon_vget_lanev2di ((int64x2_t) __a, __b, 0); +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vset_lane_s8 (int8_t __a, int8x8_t __b, const int __c) +{ + return (int8x8_t)__builtin_neon_vset_lanev8qi ((__builtin_neon_qi) __a, __b, __c); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vset_lane_s16 (int16_t __a, int16x4_t __b, const int __c) +{ + return (int16x4_t)__builtin_neon_vset_lanev4hi ((__builtin_neon_hi) __a, __b, __c); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vset_lane_s32 (int32_t __a, int32x2_t __b, const int __c) +{ + return (int32x2_t)__builtin_neon_vset_lanev2si ((__builtin_neon_si) __a, __b, __c); +} + +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vset_lane_f32 (float32_t __a, float32x2_t __b, const int __c) +{ + return (float32x2_t)__builtin_neon_vset_lanev2sf ((__builtin_neon_sf) __a, __b, __c); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vset_lane_u8 (uint8_t __a, uint8x8_t __b, const int __c) +{ + return (uint8x8_t)__builtin_neon_vset_lanev8qi ((__builtin_neon_qi) __a, (int8x8_t) __b, __c); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vset_lane_u16 (uint16_t __a, uint16x4_t __b, const int __c) +{ + return (uint16x4_t)__builtin_neon_vset_lanev4hi ((__builtin_neon_hi) __a, (int16x4_t) __b, __c); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vset_lane_u32 (uint32_t __a, uint32x2_t __b, const int __c) +{ + return (uint32x2_t)__builtin_neon_vset_lanev2si ((__builtin_neon_si) __a, (int32x2_t) __b, __c); +} + +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +vset_lane_p8 (poly8_t __a, poly8x8_t __b, const int __c) +{ + return (poly8x8_t)__builtin_neon_vset_lanev8qi ((__builtin_neon_qi) __a, (int8x8_t) __b, __c); +} + +__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) +vset_lane_p16 (poly16_t __a, poly16x4_t __b, const int __c) +{ + return (poly16x4_t)__builtin_neon_vset_lanev4hi ((__builtin_neon_hi) __a, (int16x4_t) __b, __c); +} + +__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +vset_lane_s64 (int64_t __a, int64x1_t __b, const int __c) +{ + return (int64x1_t)__builtin_neon_vset_lanedi ((__builtin_neon_di) __a, __b, __c); +} + +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vset_lane_u64 (uint64_t __a, uint64x1_t __b, const int __c) +{ + return (uint64x1_t)__builtin_neon_vset_lanedi ((__builtin_neon_di) __a, (int64x1_t) __b, __c); +} + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vsetq_lane_s8 (int8_t __a, int8x16_t __b, const int __c) +{ + return (int8x16_t)__builtin_neon_vset_lanev16qi ((__builtin_neon_qi) __a, __b, __c); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vsetq_lane_s16 (int16_t __a, int16x8_t __b, const int __c) +{ + return (int16x8_t)__builtin_neon_vset_lanev8hi ((__builtin_neon_hi) __a, __b, __c); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vsetq_lane_s32 (int32_t __a, int32x4_t __b, const int __c) +{ + return (int32x4_t)__builtin_neon_vset_lanev4si ((__builtin_neon_si) __a, __b, __c); +} + +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vsetq_lane_f32 (float32_t __a, float32x4_t __b, const int __c) +{ + return (float32x4_t)__builtin_neon_vset_lanev4sf ((__builtin_neon_sf) __a, __b, __c); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vsetq_lane_u8 (uint8_t __a, uint8x16_t __b, const int __c) +{ + return (uint8x16_t)__builtin_neon_vset_lanev16qi ((__builtin_neon_qi) __a, (int8x16_t) __b, __c); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vsetq_lane_u16 (uint16_t __a, uint16x8_t __b, const int __c) +{ + return (uint16x8_t)__builtin_neon_vset_lanev8hi ((__builtin_neon_hi) __a, (int16x8_t) __b, __c); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vsetq_lane_u32 (uint32_t __a, uint32x4_t __b, const int __c) +{ + return (uint32x4_t)__builtin_neon_vset_lanev4si ((__builtin_neon_si) __a, (int32x4_t) __b, __c); +} + +__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +vsetq_lane_p8 (poly8_t __a, poly8x16_t __b, const int __c) +{ + return (poly8x16_t)__builtin_neon_vset_lanev16qi ((__builtin_neon_qi) __a, (int8x16_t) __b, __c); +} + +__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) +vsetq_lane_p16 (poly16_t __a, poly16x8_t __b, const int __c) +{ + return (poly16x8_t)__builtin_neon_vset_lanev8hi ((__builtin_neon_hi) __a, (int16x8_t) __b, __c); +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vsetq_lane_s64 (int64_t __a, int64x2_t __b, const int __c) +{ + return (int64x2_t)__builtin_neon_vset_lanev2di ((__builtin_neon_di) __a, __b, __c); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vsetq_lane_u64 (uint64_t __a, uint64x2_t __b, const int __c) +{ + return (uint64x2_t)__builtin_neon_vset_lanev2di ((__builtin_neon_di) __a, (int64x2_t) __b, __c); +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vcreate_s8 (uint64_t __a) +{ + return (int8x8_t)__builtin_neon_vcreatev8qi ((__builtin_neon_di) __a); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vcreate_s16 (uint64_t __a) +{ + return (int16x4_t)__builtin_neon_vcreatev4hi ((__builtin_neon_di) __a); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vcreate_s32 (uint64_t __a) +{ + return (int32x2_t)__builtin_neon_vcreatev2si ((__builtin_neon_di) __a); +} + +__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +vcreate_s64 (uint64_t __a) +{ + return (int64x1_t)__builtin_neon_vcreatedi ((__builtin_neon_di) __a); +} + +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vcreate_f32 (uint64_t __a) +{ + return (float32x2_t)__builtin_neon_vcreatev2sf ((__builtin_neon_di) __a); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vcreate_u8 (uint64_t __a) +{ + return (uint8x8_t)__builtin_neon_vcreatev8qi ((__builtin_neon_di) __a); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vcreate_u16 (uint64_t __a) +{ + return (uint16x4_t)__builtin_neon_vcreatev4hi ((__builtin_neon_di) __a); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vcreate_u32 (uint64_t __a) +{ + return (uint32x2_t)__builtin_neon_vcreatev2si ((__builtin_neon_di) __a); +} + +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vcreate_u64 (uint64_t __a) +{ + return (uint64x1_t)__builtin_neon_vcreatedi ((__builtin_neon_di) __a); +} + +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +vcreate_p8 (uint64_t __a) +{ + return (poly8x8_t)__builtin_neon_vcreatev8qi ((__builtin_neon_di) __a); +} + +__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) +vcreate_p16 (uint64_t __a) +{ + return (poly16x4_t)__builtin_neon_vcreatev4hi ((__builtin_neon_di) __a); +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vdup_n_s8 (int8_t __a) +{ + return (int8x8_t)__builtin_neon_vdup_nv8qi ((__builtin_neon_qi) __a); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vdup_n_s16 (int16_t __a) +{ + return (int16x4_t)__builtin_neon_vdup_nv4hi ((__builtin_neon_hi) __a); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vdup_n_s32 (int32_t __a) +{ + return (int32x2_t)__builtin_neon_vdup_nv2si ((__builtin_neon_si) __a); +} + +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vdup_n_f32 (float32_t __a) +{ + return (float32x2_t)__builtin_neon_vdup_nv2sf ((__builtin_neon_sf) __a); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vdup_n_u8 (uint8_t __a) +{ + return (uint8x8_t)__builtin_neon_vdup_nv8qi ((__builtin_neon_qi) __a); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vdup_n_u16 (uint16_t __a) +{ + return (uint16x4_t)__builtin_neon_vdup_nv4hi ((__builtin_neon_hi) __a); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vdup_n_u32 (uint32_t __a) +{ + return (uint32x2_t)__builtin_neon_vdup_nv2si ((__builtin_neon_si) __a); +} + +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +vdup_n_p8 (poly8_t __a) +{ + return (poly8x8_t)__builtin_neon_vdup_nv8qi ((__builtin_neon_qi) __a); +} + +__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) +vdup_n_p16 (poly16_t __a) +{ + return (poly16x4_t)__builtin_neon_vdup_nv4hi ((__builtin_neon_hi) __a); +} + +__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +vdup_n_s64 (int64_t __a) +{ + return (int64x1_t)__builtin_neon_vdup_ndi ((__builtin_neon_di) __a); +} + +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vdup_n_u64 (uint64_t __a) +{ + return (uint64x1_t)__builtin_neon_vdup_ndi ((__builtin_neon_di) __a); +} + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vdupq_n_s8 (int8_t __a) +{ + return (int8x16_t)__builtin_neon_vdup_nv16qi ((__builtin_neon_qi) __a); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vdupq_n_s16 (int16_t __a) +{ + return (int16x8_t)__builtin_neon_vdup_nv8hi ((__builtin_neon_hi) __a); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vdupq_n_s32 (int32_t __a) +{ + return (int32x4_t)__builtin_neon_vdup_nv4si ((__builtin_neon_si) __a); +} + +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vdupq_n_f32 (float32_t __a) +{ + return (float32x4_t)__builtin_neon_vdup_nv4sf ((__builtin_neon_sf) __a); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vdupq_n_u8 (uint8_t __a) +{ + return (uint8x16_t)__builtin_neon_vdup_nv16qi ((__builtin_neon_qi) __a); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vdupq_n_u16 (uint16_t __a) +{ + return (uint16x8_t)__builtin_neon_vdup_nv8hi ((__builtin_neon_hi) __a); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vdupq_n_u32 (uint32_t __a) +{ + return (uint32x4_t)__builtin_neon_vdup_nv4si ((__builtin_neon_si) __a); +} + +__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +vdupq_n_p8 (poly8_t __a) +{ + return (poly8x16_t)__builtin_neon_vdup_nv16qi ((__builtin_neon_qi) __a); +} + +__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) +vdupq_n_p16 (poly16_t __a) +{ + return (poly16x8_t)__builtin_neon_vdup_nv8hi ((__builtin_neon_hi) __a); +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vdupq_n_s64 (int64_t __a) +{ + return (int64x2_t)__builtin_neon_vdup_nv2di ((__builtin_neon_di) __a); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vdupq_n_u64 (uint64_t __a) +{ + return (uint64x2_t)__builtin_neon_vdup_nv2di ((__builtin_neon_di) __a); +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vmov_n_s8 (int8_t __a) +{ + return (int8x8_t)__builtin_neon_vdup_nv8qi ((__builtin_neon_qi) __a); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vmov_n_s16 (int16_t __a) +{ + return (int16x4_t)__builtin_neon_vdup_nv4hi ((__builtin_neon_hi) __a); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vmov_n_s32 (int32_t __a) +{ + return (int32x2_t)__builtin_neon_vdup_nv2si ((__builtin_neon_si) __a); +} + +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vmov_n_f32 (float32_t __a) +{ + return (float32x2_t)__builtin_neon_vdup_nv2sf ((__builtin_neon_sf) __a); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vmov_n_u8 (uint8_t __a) +{ + return (uint8x8_t)__builtin_neon_vdup_nv8qi ((__builtin_neon_qi) __a); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vmov_n_u16 (uint16_t __a) +{ + return (uint16x4_t)__builtin_neon_vdup_nv4hi ((__builtin_neon_hi) __a); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vmov_n_u32 (uint32_t __a) +{ + return (uint32x2_t)__builtin_neon_vdup_nv2si ((__builtin_neon_si) __a); +} + +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +vmov_n_p8 (poly8_t __a) +{ + return (poly8x8_t)__builtin_neon_vdup_nv8qi ((__builtin_neon_qi) __a); +} + +__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) +vmov_n_p16 (poly16_t __a) +{ + return (poly16x4_t)__builtin_neon_vdup_nv4hi ((__builtin_neon_hi) __a); +} + +__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +vmov_n_s64 (int64_t __a) +{ + return (int64x1_t)__builtin_neon_vdup_ndi ((__builtin_neon_di) __a); +} + +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vmov_n_u64 (uint64_t __a) +{ + return (uint64x1_t)__builtin_neon_vdup_ndi ((__builtin_neon_di) __a); +} + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vmovq_n_s8 (int8_t __a) +{ + return (int8x16_t)__builtin_neon_vdup_nv16qi ((__builtin_neon_qi) __a); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vmovq_n_s16 (int16_t __a) +{ + return (int16x8_t)__builtin_neon_vdup_nv8hi ((__builtin_neon_hi) __a); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vmovq_n_s32 (int32_t __a) +{ + return (int32x4_t)__builtin_neon_vdup_nv4si ((__builtin_neon_si) __a); +} + +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vmovq_n_f32 (float32_t __a) +{ + return (float32x4_t)__builtin_neon_vdup_nv4sf ((__builtin_neon_sf) __a); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vmovq_n_u8 (uint8_t __a) +{ + return (uint8x16_t)__builtin_neon_vdup_nv16qi ((__builtin_neon_qi) __a); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vmovq_n_u16 (uint16_t __a) +{ + return (uint16x8_t)__builtin_neon_vdup_nv8hi ((__builtin_neon_hi) __a); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vmovq_n_u32 (uint32_t __a) +{ + return (uint32x4_t)__builtin_neon_vdup_nv4si ((__builtin_neon_si) __a); +} + +__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +vmovq_n_p8 (poly8_t __a) +{ + return (poly8x16_t)__builtin_neon_vdup_nv16qi ((__builtin_neon_qi) __a); +} + +__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) +vmovq_n_p16 (poly16_t __a) +{ + return (poly16x8_t)__builtin_neon_vdup_nv8hi ((__builtin_neon_hi) __a); +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vmovq_n_s64 (int64_t __a) +{ + return (int64x2_t)__builtin_neon_vdup_nv2di ((__builtin_neon_di) __a); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vmovq_n_u64 (uint64_t __a) +{ + return (uint64x2_t)__builtin_neon_vdup_nv2di ((__builtin_neon_di) __a); +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vdup_lane_s8 (int8x8_t __a, const int __b) +{ + return (int8x8_t)__builtin_neon_vdup_lanev8qi (__a, __b); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vdup_lane_s16 (int16x4_t __a, const int __b) +{ + return (int16x4_t)__builtin_neon_vdup_lanev4hi (__a, __b); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vdup_lane_s32 (int32x2_t __a, const int __b) +{ + return (int32x2_t)__builtin_neon_vdup_lanev2si (__a, __b); +} + +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vdup_lane_f32 (float32x2_t __a, const int __b) +{ + return (float32x2_t)__builtin_neon_vdup_lanev2sf (__a, __b); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vdup_lane_u8 (uint8x8_t __a, const int __b) +{ + return (uint8x8_t)__builtin_neon_vdup_lanev8qi ((int8x8_t) __a, __b); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vdup_lane_u16 (uint16x4_t __a, const int __b) +{ + return (uint16x4_t)__builtin_neon_vdup_lanev4hi ((int16x4_t) __a, __b); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vdup_lane_u32 (uint32x2_t __a, const int __b) +{ + return (uint32x2_t)__builtin_neon_vdup_lanev2si ((int32x2_t) __a, __b); +} + +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +vdup_lane_p8 (poly8x8_t __a, const int __b) +{ + return (poly8x8_t)__builtin_neon_vdup_lanev8qi ((int8x8_t) __a, __b); +} + +__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) +vdup_lane_p16 (poly16x4_t __a, const int __b) +{ + return (poly16x4_t)__builtin_neon_vdup_lanev4hi ((int16x4_t) __a, __b); +} + +__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +vdup_lane_s64 (int64x1_t __a, const int __b) +{ + return (int64x1_t)__builtin_neon_vdup_lanedi (__a, __b); +} + +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vdup_lane_u64 (uint64x1_t __a, const int __b) +{ + return (uint64x1_t)__builtin_neon_vdup_lanedi ((int64x1_t) __a, __b); +} + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vdupq_lane_s8 (int8x8_t __a, const int __b) +{ + return (int8x16_t)__builtin_neon_vdup_lanev16qi (__a, __b); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vdupq_lane_s16 (int16x4_t __a, const int __b) +{ + return (int16x8_t)__builtin_neon_vdup_lanev8hi (__a, __b); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vdupq_lane_s32 (int32x2_t __a, const int __b) +{ + return (int32x4_t)__builtin_neon_vdup_lanev4si (__a, __b); +} + +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vdupq_lane_f32 (float32x2_t __a, const int __b) +{ + return (float32x4_t)__builtin_neon_vdup_lanev4sf (__a, __b); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vdupq_lane_u8 (uint8x8_t __a, const int __b) +{ + return (uint8x16_t)__builtin_neon_vdup_lanev16qi ((int8x8_t) __a, __b); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vdupq_lane_u16 (uint16x4_t __a, const int __b) +{ + return (uint16x8_t)__builtin_neon_vdup_lanev8hi ((int16x4_t) __a, __b); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vdupq_lane_u32 (uint32x2_t __a, const int __b) +{ + return (uint32x4_t)__builtin_neon_vdup_lanev4si ((int32x2_t) __a, __b); +} + +__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +vdupq_lane_p8 (poly8x8_t __a, const int __b) +{ + return (poly8x16_t)__builtin_neon_vdup_lanev16qi ((int8x8_t) __a, __b); +} + +__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) +vdupq_lane_p16 (poly16x4_t __a, const int __b) +{ + return (poly16x8_t)__builtin_neon_vdup_lanev8hi ((int16x4_t) __a, __b); +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vdupq_lane_s64 (int64x1_t __a, const int __b) +{ + return (int64x2_t)__builtin_neon_vdup_lanev2di (__a, __b); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vdupq_lane_u64 (uint64x1_t __a, const int __b) +{ + return (uint64x2_t)__builtin_neon_vdup_lanev2di ((int64x1_t) __a, __b); +} + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vcombine_s8 (int8x8_t __a, int8x8_t __b) +{ + return (int8x16_t)__builtin_neon_vcombinev8qi (__a, __b); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vcombine_s16 (int16x4_t __a, int16x4_t __b) +{ + return (int16x8_t)__builtin_neon_vcombinev4hi (__a, __b); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vcombine_s32 (int32x2_t __a, int32x2_t __b) +{ + return (int32x4_t)__builtin_neon_vcombinev2si (__a, __b); +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vcombine_s64 (int64x1_t __a, int64x1_t __b) +{ + return (int64x2_t)__builtin_neon_vcombinedi (__a, __b); +} + +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vcombine_f32 (float32x2_t __a, float32x2_t __b) +{ + return (float32x4_t)__builtin_neon_vcombinev2sf (__a, __b); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vcombine_u8 (uint8x8_t __a, uint8x8_t __b) +{ + return (uint8x16_t)__builtin_neon_vcombinev8qi ((int8x8_t) __a, (int8x8_t) __b); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vcombine_u16 (uint16x4_t __a, uint16x4_t __b) +{ + return (uint16x8_t)__builtin_neon_vcombinev4hi ((int16x4_t) __a, (int16x4_t) __b); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vcombine_u32 (uint32x2_t __a, uint32x2_t __b) +{ + return (uint32x4_t)__builtin_neon_vcombinev2si ((int32x2_t) __a, (int32x2_t) __b); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vcombine_u64 (uint64x1_t __a, uint64x1_t __b) +{ + return (uint64x2_t)__builtin_neon_vcombinedi ((int64x1_t) __a, (int64x1_t) __b); +} + +__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +vcombine_p8 (poly8x8_t __a, poly8x8_t __b) +{ + return (poly8x16_t)__builtin_neon_vcombinev8qi ((int8x8_t) __a, (int8x8_t) __b); +} + +__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) +vcombine_p16 (poly16x4_t __a, poly16x4_t __b) +{ + return (poly16x8_t)__builtin_neon_vcombinev4hi ((int16x4_t) __a, (int16x4_t) __b); +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vget_high_s8 (int8x16_t __a) +{ + return (int8x8_t)__builtin_neon_vget_highv16qi (__a); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vget_high_s16 (int16x8_t __a) +{ + return (int16x4_t)__builtin_neon_vget_highv8hi (__a); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vget_high_s32 (int32x4_t __a) +{ + return (int32x2_t)__builtin_neon_vget_highv4si (__a); +} + +__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +vget_high_s64 (int64x2_t __a) +{ + return (int64x1_t)__builtin_neon_vget_highv2di (__a); +} + +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vget_high_f32 (float32x4_t __a) +{ + return (float32x2_t)__builtin_neon_vget_highv4sf (__a); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vget_high_u8 (uint8x16_t __a) +{ + return (uint8x8_t)__builtin_neon_vget_highv16qi ((int8x16_t) __a); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vget_high_u16 (uint16x8_t __a) +{ + return (uint16x4_t)__builtin_neon_vget_highv8hi ((int16x8_t) __a); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vget_high_u32 (uint32x4_t __a) +{ + return (uint32x2_t)__builtin_neon_vget_highv4si ((int32x4_t) __a); +} + +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vget_high_u64 (uint64x2_t __a) +{ + return (uint64x1_t)__builtin_neon_vget_highv2di ((int64x2_t) __a); +} + +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +vget_high_p8 (poly8x16_t __a) +{ + return (poly8x8_t)__builtin_neon_vget_highv16qi ((int8x16_t) __a); +} + +__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) +vget_high_p16 (poly16x8_t __a) +{ + return (poly16x4_t)__builtin_neon_vget_highv8hi ((int16x8_t) __a); +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vget_low_s8 (int8x16_t __a) +{ + return (int8x8_t)__builtin_neon_vget_lowv16qi (__a); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vget_low_s16 (int16x8_t __a) +{ + return (int16x4_t)__builtin_neon_vget_lowv8hi (__a); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vget_low_s32 (int32x4_t __a) +{ + return (int32x2_t)__builtin_neon_vget_lowv4si (__a); +} + +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vget_low_f32 (float32x4_t __a) +{ + return (float32x2_t)__builtin_neon_vget_lowv4sf (__a); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vget_low_u8 (uint8x16_t __a) +{ + return (uint8x8_t)__builtin_neon_vget_lowv16qi ((int8x16_t) __a); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vget_low_u16 (uint16x8_t __a) +{ + return (uint16x4_t)__builtin_neon_vget_lowv8hi ((int16x8_t) __a); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vget_low_u32 (uint32x4_t __a) +{ + return (uint32x2_t)__builtin_neon_vget_lowv4si ((int32x4_t) __a); +} + +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +vget_low_p8 (poly8x16_t __a) +{ + return (poly8x8_t)__builtin_neon_vget_lowv16qi ((int8x16_t) __a); +} + +__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) +vget_low_p16 (poly16x8_t __a) +{ + return (poly16x4_t)__builtin_neon_vget_lowv8hi ((int16x8_t) __a); +} + +__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +vget_low_s64 (int64x2_t __a) +{ + return (int64x1_t)__builtin_neon_vget_lowv2di (__a); +} + +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vget_low_u64 (uint64x2_t __a) +{ + return (uint64x1_t)__builtin_neon_vget_lowv2di ((int64x2_t) __a); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vcvt_s32_f32 (float32x2_t __a) +{ + return (int32x2_t)__builtin_neon_vcvtv2sf (__a, 1); +} + +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vcvt_f32_s32 (int32x2_t __a) +{ + return (float32x2_t)__builtin_neon_vcvtv2si (__a, 1); +} + +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vcvt_f32_u32 (uint32x2_t __a) +{ + return (float32x2_t)__builtin_neon_vcvtv2si ((int32x2_t) __a, 0); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vcvt_u32_f32 (float32x2_t __a) +{ + return (uint32x2_t)__builtin_neon_vcvtv2sf (__a, 0); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vcvtq_s32_f32 (float32x4_t __a) +{ + return (int32x4_t)__builtin_neon_vcvtv4sf (__a, 1); +} + +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vcvtq_f32_s32 (int32x4_t __a) +{ + return (float32x4_t)__builtin_neon_vcvtv4si (__a, 1); +} + +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vcvtq_f32_u32 (uint32x4_t __a) +{ + return (float32x4_t)__builtin_neon_vcvtv4si ((int32x4_t) __a, 0); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vcvtq_u32_f32 (float32x4_t __a) +{ + return (uint32x4_t)__builtin_neon_vcvtv4sf (__a, 0); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vcvt_n_s32_f32 (float32x2_t __a, const int __b) +{ + return (int32x2_t)__builtin_neon_vcvt_nv2sf (__a, __b, 1); +} + +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vcvt_n_f32_s32 (int32x2_t __a, const int __b) +{ + return (float32x2_t)__builtin_neon_vcvt_nv2si (__a, __b, 1); +} + +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vcvt_n_f32_u32 (uint32x2_t __a, const int __b) +{ + return (float32x2_t)__builtin_neon_vcvt_nv2si ((int32x2_t) __a, __b, 0); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vcvt_n_u32_f32 (float32x2_t __a, const int __b) +{ + return (uint32x2_t)__builtin_neon_vcvt_nv2sf (__a, __b, 0); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vcvtq_n_s32_f32 (float32x4_t __a, const int __b) +{ + return (int32x4_t)__builtin_neon_vcvt_nv4sf (__a, __b, 1); +} + +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vcvtq_n_f32_s32 (int32x4_t __a, const int __b) +{ + return (float32x4_t)__builtin_neon_vcvt_nv4si (__a, __b, 1); +} + +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vcvtq_n_f32_u32 (uint32x4_t __a, const int __b) +{ + return (float32x4_t)__builtin_neon_vcvt_nv4si ((int32x4_t) __a, __b, 0); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vcvtq_n_u32_f32 (float32x4_t __a, const int __b) +{ + return (uint32x4_t)__builtin_neon_vcvt_nv4sf (__a, __b, 0); +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vmovn_s16 (int16x8_t __a) +{ + return (int8x8_t)__builtin_neon_vmovnv8hi (__a, 1); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vmovn_s32 (int32x4_t __a) +{ + return (int16x4_t)__builtin_neon_vmovnv4si (__a, 1); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vmovn_s64 (int64x2_t __a) +{ + return (int32x2_t)__builtin_neon_vmovnv2di (__a, 1); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vmovn_u16 (uint16x8_t __a) +{ + return (uint8x8_t)__builtin_neon_vmovnv8hi ((int16x8_t) __a, 0); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vmovn_u32 (uint32x4_t __a) +{ + return (uint16x4_t)__builtin_neon_vmovnv4si ((int32x4_t) __a, 0); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vmovn_u64 (uint64x2_t __a) +{ + return (uint32x2_t)__builtin_neon_vmovnv2di ((int64x2_t) __a, 0); +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vqmovn_s16 (int16x8_t __a) +{ + return (int8x8_t)__builtin_neon_vqmovnv8hi (__a, 1); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vqmovn_s32 (int32x4_t __a) +{ + return (int16x4_t)__builtin_neon_vqmovnv4si (__a, 1); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vqmovn_s64 (int64x2_t __a) +{ + return (int32x2_t)__builtin_neon_vqmovnv2di (__a, 1); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vqmovn_u16 (uint16x8_t __a) +{ + return (uint8x8_t)__builtin_neon_vqmovnv8hi ((int16x8_t) __a, 0); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vqmovn_u32 (uint32x4_t __a) +{ + return (uint16x4_t)__builtin_neon_vqmovnv4si ((int32x4_t) __a, 0); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vqmovn_u64 (uint64x2_t __a) +{ + return (uint32x2_t)__builtin_neon_vqmovnv2di ((int64x2_t) __a, 0); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vqmovun_s16 (int16x8_t __a) +{ + return (uint8x8_t)__builtin_neon_vqmovunv8hi (__a, 1); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vqmovun_s32 (int32x4_t __a) +{ + return (uint16x4_t)__builtin_neon_vqmovunv4si (__a, 1); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vqmovun_s64 (int64x2_t __a) +{ + return (uint32x2_t)__builtin_neon_vqmovunv2di (__a, 1); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vmovl_s8 (int8x8_t __a) +{ + return (int16x8_t)__builtin_neon_vmovlv8qi (__a, 1); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vmovl_s16 (int16x4_t __a) +{ + return (int32x4_t)__builtin_neon_vmovlv4hi (__a, 1); +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vmovl_s32 (int32x2_t __a) +{ + return (int64x2_t)__builtin_neon_vmovlv2si (__a, 1); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vmovl_u8 (uint8x8_t __a) +{ + return (uint16x8_t)__builtin_neon_vmovlv8qi ((int8x8_t) __a, 0); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vmovl_u16 (uint16x4_t __a) +{ + return (uint32x4_t)__builtin_neon_vmovlv4hi ((int16x4_t) __a, 0); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vmovl_u32 (uint32x2_t __a) +{ + return (uint64x2_t)__builtin_neon_vmovlv2si ((int32x2_t) __a, 0); +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vtbl1_s8 (int8x8_t __a, int8x8_t __b) +{ + return (int8x8_t)__builtin_neon_vtbl1v8qi (__a, __b); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vtbl1_u8 (uint8x8_t __a, uint8x8_t __b) +{ + return (uint8x8_t)__builtin_neon_vtbl1v8qi ((int8x8_t) __a, (int8x8_t) __b); +} + +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +vtbl1_p8 (poly8x8_t __a, uint8x8_t __b) +{ + return (poly8x8_t)__builtin_neon_vtbl1v8qi ((int8x8_t) __a, (int8x8_t) __b); +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vtbl2_s8 (int8x8x2_t __a, int8x8_t __b) +{ + union { int8x8x2_t __i; __builtin_neon_ti __o; } __au = { __a }; + return (int8x8_t)__builtin_neon_vtbl2v8qi (__au.__o, __b); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vtbl2_u8 (uint8x8x2_t __a, uint8x8_t __b) +{ + union { uint8x8x2_t __i; __builtin_neon_ti __o; } __au = { __a }; + return (uint8x8_t)__builtin_neon_vtbl2v8qi (__au.__o, (int8x8_t) __b); +} + +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +vtbl2_p8 (poly8x8x2_t __a, uint8x8_t __b) +{ + union { poly8x8x2_t __i; __builtin_neon_ti __o; } __au = { __a }; + return (poly8x8_t)__builtin_neon_vtbl2v8qi (__au.__o, (int8x8_t) __b); +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vtbl3_s8 (int8x8x3_t __a, int8x8_t __b) +{ + union { int8x8x3_t __i; __builtin_neon_ei __o; } __au = { __a }; + return (int8x8_t)__builtin_neon_vtbl3v8qi (__au.__o, __b); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vtbl3_u8 (uint8x8x3_t __a, uint8x8_t __b) +{ + union { uint8x8x3_t __i; __builtin_neon_ei __o; } __au = { __a }; + return (uint8x8_t)__builtin_neon_vtbl3v8qi (__au.__o, (int8x8_t) __b); +} + +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +vtbl3_p8 (poly8x8x3_t __a, uint8x8_t __b) +{ + union { poly8x8x3_t __i; __builtin_neon_ei __o; } __au = { __a }; + return (poly8x8_t)__builtin_neon_vtbl3v8qi (__au.__o, (int8x8_t) __b); +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vtbl4_s8 (int8x8x4_t __a, int8x8_t __b) +{ + union { int8x8x4_t __i; __builtin_neon_oi __o; } __au = { __a }; + return (int8x8_t)__builtin_neon_vtbl4v8qi (__au.__o, __b); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vtbl4_u8 (uint8x8x4_t __a, uint8x8_t __b) +{ + union { uint8x8x4_t __i; __builtin_neon_oi __o; } __au = { __a }; + return (uint8x8_t)__builtin_neon_vtbl4v8qi (__au.__o, (int8x8_t) __b); +} + +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +vtbl4_p8 (poly8x8x4_t __a, uint8x8_t __b) +{ + union { poly8x8x4_t __i; __builtin_neon_oi __o; } __au = { __a }; + return (poly8x8_t)__builtin_neon_vtbl4v8qi (__au.__o, (int8x8_t) __b); +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vtbx1_s8 (int8x8_t __a, int8x8_t __b, int8x8_t __c) +{ + return (int8x8_t)__builtin_neon_vtbx1v8qi (__a, __b, __c); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vtbx1_u8 (uint8x8_t __a, uint8x8_t __b, uint8x8_t __c) +{ + return (uint8x8_t)__builtin_neon_vtbx1v8qi ((int8x8_t) __a, (int8x8_t) __b, (int8x8_t) __c); +} + +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +vtbx1_p8 (poly8x8_t __a, poly8x8_t __b, uint8x8_t __c) +{ + return (poly8x8_t)__builtin_neon_vtbx1v8qi ((int8x8_t) __a, (int8x8_t) __b, (int8x8_t) __c); +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vtbx2_s8 (int8x8_t __a, int8x8x2_t __b, int8x8_t __c) +{ + union { int8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b }; + return (int8x8_t)__builtin_neon_vtbx2v8qi (__a, __bu.__o, __c); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vtbx2_u8 (uint8x8_t __a, uint8x8x2_t __b, uint8x8_t __c) +{ + union { uint8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b }; + return (uint8x8_t)__builtin_neon_vtbx2v8qi ((int8x8_t) __a, __bu.__o, (int8x8_t) __c); +} + +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +vtbx2_p8 (poly8x8_t __a, poly8x8x2_t __b, uint8x8_t __c) +{ + union { poly8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b }; + return (poly8x8_t)__builtin_neon_vtbx2v8qi ((int8x8_t) __a, __bu.__o, (int8x8_t) __c); +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vtbx3_s8 (int8x8_t __a, int8x8x3_t __b, int8x8_t __c) +{ + union { int8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b }; + return (int8x8_t)__builtin_neon_vtbx3v8qi (__a, __bu.__o, __c); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vtbx3_u8 (uint8x8_t __a, uint8x8x3_t __b, uint8x8_t __c) +{ + union { uint8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b }; + return (uint8x8_t)__builtin_neon_vtbx3v8qi ((int8x8_t) __a, __bu.__o, (int8x8_t) __c); +} + +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +vtbx3_p8 (poly8x8_t __a, poly8x8x3_t __b, uint8x8_t __c) +{ + union { poly8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b }; + return (poly8x8_t)__builtin_neon_vtbx3v8qi ((int8x8_t) __a, __bu.__o, (int8x8_t) __c); +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vtbx4_s8 (int8x8_t __a, int8x8x4_t __b, int8x8_t __c) +{ + union { int8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; + return (int8x8_t)__builtin_neon_vtbx4v8qi (__a, __bu.__o, __c); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vtbx4_u8 (uint8x8_t __a, uint8x8x4_t __b, uint8x8_t __c) +{ + union { uint8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; + return (uint8x8_t)__builtin_neon_vtbx4v8qi ((int8x8_t) __a, __bu.__o, (int8x8_t) __c); +} + +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +vtbx4_p8 (poly8x8_t __a, poly8x8x4_t __b, uint8x8_t __c) +{ + union { poly8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; + return (poly8x8_t)__builtin_neon_vtbx4v8qi ((int8x8_t) __a, __bu.__o, (int8x8_t) __c); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vmul_lane_s16 (int16x4_t __a, int16x4_t __b, const int __c) +{ + return (int16x4_t)__builtin_neon_vmul_lanev4hi (__a, __b, __c, 1); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vmul_lane_s32 (int32x2_t __a, int32x2_t __b, const int __c) +{ + return (int32x2_t)__builtin_neon_vmul_lanev2si (__a, __b, __c, 1); +} + +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vmul_lane_f32 (float32x2_t __a, float32x2_t __b, const int __c) +{ + return (float32x2_t)__builtin_neon_vmul_lanev2sf (__a, __b, __c, 3); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vmul_lane_u16 (uint16x4_t __a, uint16x4_t __b, const int __c) +{ + return (uint16x4_t)__builtin_neon_vmul_lanev4hi ((int16x4_t) __a, (int16x4_t) __b, __c, 0); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vmul_lane_u32 (uint32x2_t __a, uint32x2_t __b, const int __c) +{ + return (uint32x2_t)__builtin_neon_vmul_lanev2si ((int32x2_t) __a, (int32x2_t) __b, __c, 0); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vmulq_lane_s16 (int16x8_t __a, int16x4_t __b, const int __c) +{ + return (int16x8_t)__builtin_neon_vmul_lanev8hi (__a, __b, __c, 1); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vmulq_lane_s32 (int32x4_t __a, int32x2_t __b, const int __c) +{ + return (int32x4_t)__builtin_neon_vmul_lanev4si (__a, __b, __c, 1); +} + +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vmulq_lane_f32 (float32x4_t __a, float32x2_t __b, const int __c) +{ + return (float32x4_t)__builtin_neon_vmul_lanev4sf (__a, __b, __c, 3); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vmulq_lane_u16 (uint16x8_t __a, uint16x4_t __b, const int __c) +{ + return (uint16x8_t)__builtin_neon_vmul_lanev8hi ((int16x8_t) __a, (int16x4_t) __b, __c, 0); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vmulq_lane_u32 (uint32x4_t __a, uint32x2_t __b, const int __c) +{ + return (uint32x4_t)__builtin_neon_vmul_lanev4si ((int32x4_t) __a, (int32x2_t) __b, __c, 0); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vmla_lane_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c, const int __d) +{ + return (int16x4_t)__builtin_neon_vmla_lanev4hi (__a, __b, __c, __d, 1); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vmla_lane_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c, const int __d) +{ + return (int32x2_t)__builtin_neon_vmla_lanev2si (__a, __b, __c, __d, 1); +} + +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vmla_lane_f32 (float32x2_t __a, float32x2_t __b, float32x2_t __c, const int __d) +{ + return (float32x2_t)__builtin_neon_vmla_lanev2sf (__a, __b, __c, __d, 3); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vmla_lane_u16 (uint16x4_t __a, uint16x4_t __b, uint16x4_t __c, const int __d) +{ + return (uint16x4_t)__builtin_neon_vmla_lanev4hi ((int16x4_t) __a, (int16x4_t) __b, (int16x4_t) __c, __d, 0); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vmla_lane_u32 (uint32x2_t __a, uint32x2_t __b, uint32x2_t __c, const int __d) +{ + return (uint32x2_t)__builtin_neon_vmla_lanev2si ((int32x2_t) __a, (int32x2_t) __b, (int32x2_t) __c, __d, 0); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vmlaq_lane_s16 (int16x8_t __a, int16x8_t __b, int16x4_t __c, const int __d) +{ + return (int16x8_t)__builtin_neon_vmla_lanev8hi (__a, __b, __c, __d, 1); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vmlaq_lane_s32 (int32x4_t __a, int32x4_t __b, int32x2_t __c, const int __d) +{ + return (int32x4_t)__builtin_neon_vmla_lanev4si (__a, __b, __c, __d, 1); +} + +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vmlaq_lane_f32 (float32x4_t __a, float32x4_t __b, float32x2_t __c, const int __d) +{ + return (float32x4_t)__builtin_neon_vmla_lanev4sf (__a, __b, __c, __d, 3); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vmlaq_lane_u16 (uint16x8_t __a, uint16x8_t __b, uint16x4_t __c, const int __d) +{ + return (uint16x8_t)__builtin_neon_vmla_lanev8hi ((int16x8_t) __a, (int16x8_t) __b, (int16x4_t) __c, __d, 0); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vmlaq_lane_u32 (uint32x4_t __a, uint32x4_t __b, uint32x2_t __c, const int __d) +{ + return (uint32x4_t)__builtin_neon_vmla_lanev4si ((int32x4_t) __a, (int32x4_t) __b, (int32x2_t) __c, __d, 0); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vmlal_lane_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c, const int __d) +{ + return (int32x4_t)__builtin_neon_vmlal_lanev4hi (__a, __b, __c, __d, 1); +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vmlal_lane_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c, const int __d) +{ + return (int64x2_t)__builtin_neon_vmlal_lanev2si (__a, __b, __c, __d, 1); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vmlal_lane_u16 (uint32x4_t __a, uint16x4_t __b, uint16x4_t __c, const int __d) +{ + return (uint32x4_t)__builtin_neon_vmlal_lanev4hi ((int32x4_t) __a, (int16x4_t) __b, (int16x4_t) __c, __d, 0); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vmlal_lane_u32 (uint64x2_t __a, uint32x2_t __b, uint32x2_t __c, const int __d) +{ + return (uint64x2_t)__builtin_neon_vmlal_lanev2si ((int64x2_t) __a, (int32x2_t) __b, (int32x2_t) __c, __d, 0); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vqdmlal_lane_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c, const int __d) +{ + return (int32x4_t)__builtin_neon_vqdmlal_lanev4hi (__a, __b, __c, __d, 1); +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vqdmlal_lane_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c, const int __d) +{ + return (int64x2_t)__builtin_neon_vqdmlal_lanev2si (__a, __b, __c, __d, 1); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vmls_lane_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c, const int __d) +{ + return (int16x4_t)__builtin_neon_vmls_lanev4hi (__a, __b, __c, __d, 1); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vmls_lane_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c, const int __d) +{ + return (int32x2_t)__builtin_neon_vmls_lanev2si (__a, __b, __c, __d, 1); +} + +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vmls_lane_f32 (float32x2_t __a, float32x2_t __b, float32x2_t __c, const int __d) +{ + return (float32x2_t)__builtin_neon_vmls_lanev2sf (__a, __b, __c, __d, 3); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vmls_lane_u16 (uint16x4_t __a, uint16x4_t __b, uint16x4_t __c, const int __d) +{ + return (uint16x4_t)__builtin_neon_vmls_lanev4hi ((int16x4_t) __a, (int16x4_t) __b, (int16x4_t) __c, __d, 0); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vmls_lane_u32 (uint32x2_t __a, uint32x2_t __b, uint32x2_t __c, const int __d) +{ + return (uint32x2_t)__builtin_neon_vmls_lanev2si ((int32x2_t) __a, (int32x2_t) __b, (int32x2_t) __c, __d, 0); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vmlsq_lane_s16 (int16x8_t __a, int16x8_t __b, int16x4_t __c, const int __d) +{ + return (int16x8_t)__builtin_neon_vmls_lanev8hi (__a, __b, __c, __d, 1); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vmlsq_lane_s32 (int32x4_t __a, int32x4_t __b, int32x2_t __c, const int __d) +{ + return (int32x4_t)__builtin_neon_vmls_lanev4si (__a, __b, __c, __d, 1); +} + +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vmlsq_lane_f32 (float32x4_t __a, float32x4_t __b, float32x2_t __c, const int __d) +{ + return (float32x4_t)__builtin_neon_vmls_lanev4sf (__a, __b, __c, __d, 3); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vmlsq_lane_u16 (uint16x8_t __a, uint16x8_t __b, uint16x4_t __c, const int __d) +{ + return (uint16x8_t)__builtin_neon_vmls_lanev8hi ((int16x8_t) __a, (int16x8_t) __b, (int16x4_t) __c, __d, 0); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vmlsq_lane_u32 (uint32x4_t __a, uint32x4_t __b, uint32x2_t __c, const int __d) +{ + return (uint32x4_t)__builtin_neon_vmls_lanev4si ((int32x4_t) __a, (int32x4_t) __b, (int32x2_t) __c, __d, 0); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vmlsl_lane_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c, const int __d) +{ + return (int32x4_t)__builtin_neon_vmlsl_lanev4hi (__a, __b, __c, __d, 1); +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vmlsl_lane_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c, const int __d) +{ + return (int64x2_t)__builtin_neon_vmlsl_lanev2si (__a, __b, __c, __d, 1); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vmlsl_lane_u16 (uint32x4_t __a, uint16x4_t __b, uint16x4_t __c, const int __d) +{ + return (uint32x4_t)__builtin_neon_vmlsl_lanev4hi ((int32x4_t) __a, (int16x4_t) __b, (int16x4_t) __c, __d, 0); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vmlsl_lane_u32 (uint64x2_t __a, uint32x2_t __b, uint32x2_t __c, const int __d) +{ + return (uint64x2_t)__builtin_neon_vmlsl_lanev2si ((int64x2_t) __a, (int32x2_t) __b, (int32x2_t) __c, __d, 0); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vqdmlsl_lane_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c, const int __d) +{ + return (int32x4_t)__builtin_neon_vqdmlsl_lanev4hi (__a, __b, __c, __d, 1); +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vqdmlsl_lane_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c, const int __d) +{ + return (int64x2_t)__builtin_neon_vqdmlsl_lanev2si (__a, __b, __c, __d, 1); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vmull_lane_s16 (int16x4_t __a, int16x4_t __b, const int __c) +{ + return (int32x4_t)__builtin_neon_vmull_lanev4hi (__a, __b, __c, 1); +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vmull_lane_s32 (int32x2_t __a, int32x2_t __b, const int __c) +{ + return (int64x2_t)__builtin_neon_vmull_lanev2si (__a, __b, __c, 1); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vmull_lane_u16 (uint16x4_t __a, uint16x4_t __b, const int __c) +{ + return (uint32x4_t)__builtin_neon_vmull_lanev4hi ((int16x4_t) __a, (int16x4_t) __b, __c, 0); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vmull_lane_u32 (uint32x2_t __a, uint32x2_t __b, const int __c) +{ + return (uint64x2_t)__builtin_neon_vmull_lanev2si ((int32x2_t) __a, (int32x2_t) __b, __c, 0); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vqdmull_lane_s16 (int16x4_t __a, int16x4_t __b, const int __c) +{ + return (int32x4_t)__builtin_neon_vqdmull_lanev4hi (__a, __b, __c, 1); +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vqdmull_lane_s32 (int32x2_t __a, int32x2_t __b, const int __c) +{ + return (int64x2_t)__builtin_neon_vqdmull_lanev2si (__a, __b, __c, 1); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vqdmulhq_lane_s16 (int16x8_t __a, int16x4_t __b, const int __c) +{ + return (int16x8_t)__builtin_neon_vqdmulh_lanev8hi (__a, __b, __c, 1); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vqdmulhq_lane_s32 (int32x4_t __a, int32x2_t __b, const int __c) +{ + return (int32x4_t)__builtin_neon_vqdmulh_lanev4si (__a, __b, __c, 1); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vqdmulh_lane_s16 (int16x4_t __a, int16x4_t __b, const int __c) +{ + return (int16x4_t)__builtin_neon_vqdmulh_lanev4hi (__a, __b, __c, 1); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vqdmulh_lane_s32 (int32x2_t __a, int32x2_t __b, const int __c) +{ + return (int32x2_t)__builtin_neon_vqdmulh_lanev2si (__a, __b, __c, 1); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vqrdmulhq_lane_s16 (int16x8_t __a, int16x4_t __b, const int __c) +{ + return (int16x8_t)__builtin_neon_vqdmulh_lanev8hi (__a, __b, __c, 5); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vqrdmulhq_lane_s32 (int32x4_t __a, int32x2_t __b, const int __c) +{ + return (int32x4_t)__builtin_neon_vqdmulh_lanev4si (__a, __b, __c, 5); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vqrdmulh_lane_s16 (int16x4_t __a, int16x4_t __b, const int __c) +{ + return (int16x4_t)__builtin_neon_vqdmulh_lanev4hi (__a, __b, __c, 5); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vqrdmulh_lane_s32 (int32x2_t __a, int32x2_t __b, const int __c) +{ + return (int32x2_t)__builtin_neon_vqdmulh_lanev2si (__a, __b, __c, 5); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vmul_n_s16 (int16x4_t __a, int16_t __b) +{ + return (int16x4_t)__builtin_neon_vmul_nv4hi (__a, (__builtin_neon_hi) __b, 1); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vmul_n_s32 (int32x2_t __a, int32_t __b) +{ + return (int32x2_t)__builtin_neon_vmul_nv2si (__a, (__builtin_neon_si) __b, 1); +} + +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vmul_n_f32 (float32x2_t __a, float32_t __b) +{ + return (float32x2_t)__builtin_neon_vmul_nv2sf (__a, (__builtin_neon_sf) __b, 3); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vmul_n_u16 (uint16x4_t __a, uint16_t __b) +{ + return (uint16x4_t)__builtin_neon_vmul_nv4hi ((int16x4_t) __a, (__builtin_neon_hi) __b, 0); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vmul_n_u32 (uint32x2_t __a, uint32_t __b) +{ + return (uint32x2_t)__builtin_neon_vmul_nv2si ((int32x2_t) __a, (__builtin_neon_si) __b, 0); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vmulq_n_s16 (int16x8_t __a, int16_t __b) +{ + return (int16x8_t)__builtin_neon_vmul_nv8hi (__a, (__builtin_neon_hi) __b, 1); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vmulq_n_s32 (int32x4_t __a, int32_t __b) +{ + return (int32x4_t)__builtin_neon_vmul_nv4si (__a, (__builtin_neon_si) __b, 1); +} + +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vmulq_n_f32 (float32x4_t __a, float32_t __b) +{ + return (float32x4_t)__builtin_neon_vmul_nv4sf (__a, (__builtin_neon_sf) __b, 3); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vmulq_n_u16 (uint16x8_t __a, uint16_t __b) +{ + return (uint16x8_t)__builtin_neon_vmul_nv8hi ((int16x8_t) __a, (__builtin_neon_hi) __b, 0); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vmulq_n_u32 (uint32x4_t __a, uint32_t __b) +{ + return (uint32x4_t)__builtin_neon_vmul_nv4si ((int32x4_t) __a, (__builtin_neon_si) __b, 0); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vmull_n_s16 (int16x4_t __a, int16_t __b) +{ + return (int32x4_t)__builtin_neon_vmull_nv4hi (__a, (__builtin_neon_hi) __b, 1); +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vmull_n_s32 (int32x2_t __a, int32_t __b) +{ + return (int64x2_t)__builtin_neon_vmull_nv2si (__a, (__builtin_neon_si) __b, 1); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vmull_n_u16 (uint16x4_t __a, uint16_t __b) +{ + return (uint32x4_t)__builtin_neon_vmull_nv4hi ((int16x4_t) __a, (__builtin_neon_hi) __b, 0); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vmull_n_u32 (uint32x2_t __a, uint32_t __b) +{ + return (uint64x2_t)__builtin_neon_vmull_nv2si ((int32x2_t) __a, (__builtin_neon_si) __b, 0); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vqdmull_n_s16 (int16x4_t __a, int16_t __b) +{ + return (int32x4_t)__builtin_neon_vqdmull_nv4hi (__a, (__builtin_neon_hi) __b, 1); +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vqdmull_n_s32 (int32x2_t __a, int32_t __b) +{ + return (int64x2_t)__builtin_neon_vqdmull_nv2si (__a, (__builtin_neon_si) __b, 1); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vqdmulhq_n_s16 (int16x8_t __a, int16_t __b) +{ + return (int16x8_t)__builtin_neon_vqdmulh_nv8hi (__a, (__builtin_neon_hi) __b, 1); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vqdmulhq_n_s32 (int32x4_t __a, int32_t __b) +{ + return (int32x4_t)__builtin_neon_vqdmulh_nv4si (__a, (__builtin_neon_si) __b, 1); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vqdmulh_n_s16 (int16x4_t __a, int16_t __b) +{ + return (int16x4_t)__builtin_neon_vqdmulh_nv4hi (__a, (__builtin_neon_hi) __b, 1); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vqdmulh_n_s32 (int32x2_t __a, int32_t __b) +{ + return (int32x2_t)__builtin_neon_vqdmulh_nv2si (__a, (__builtin_neon_si) __b, 1); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vqrdmulhq_n_s16 (int16x8_t __a, int16_t __b) +{ + return (int16x8_t)__builtin_neon_vqdmulh_nv8hi (__a, (__builtin_neon_hi) __b, 5); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vqrdmulhq_n_s32 (int32x4_t __a, int32_t __b) +{ + return (int32x4_t)__builtin_neon_vqdmulh_nv4si (__a, (__builtin_neon_si) __b, 5); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vqrdmulh_n_s16 (int16x4_t __a, int16_t __b) +{ + return (int16x4_t)__builtin_neon_vqdmulh_nv4hi (__a, (__builtin_neon_hi) __b, 5); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vqrdmulh_n_s32 (int32x2_t __a, int32_t __b) +{ + return (int32x2_t)__builtin_neon_vqdmulh_nv2si (__a, (__builtin_neon_si) __b, 5); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vmla_n_s16 (int16x4_t __a, int16x4_t __b, int16_t __c) +{ + return (int16x4_t)__builtin_neon_vmla_nv4hi (__a, __b, (__builtin_neon_hi) __c, 1); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vmla_n_s32 (int32x2_t __a, int32x2_t __b, int32_t __c) +{ + return (int32x2_t)__builtin_neon_vmla_nv2si (__a, __b, (__builtin_neon_si) __c, 1); +} + +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vmla_n_f32 (float32x2_t __a, float32x2_t __b, float32_t __c) +{ + return (float32x2_t)__builtin_neon_vmla_nv2sf (__a, __b, (__builtin_neon_sf) __c, 3); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vmla_n_u16 (uint16x4_t __a, uint16x4_t __b, uint16_t __c) +{ + return (uint16x4_t)__builtin_neon_vmla_nv4hi ((int16x4_t) __a, (int16x4_t) __b, (__builtin_neon_hi) __c, 0); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vmla_n_u32 (uint32x2_t __a, uint32x2_t __b, uint32_t __c) +{ + return (uint32x2_t)__builtin_neon_vmla_nv2si ((int32x2_t) __a, (int32x2_t) __b, (__builtin_neon_si) __c, 0); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vmlaq_n_s16 (int16x8_t __a, int16x8_t __b, int16_t __c) +{ + return (int16x8_t)__builtin_neon_vmla_nv8hi (__a, __b, (__builtin_neon_hi) __c, 1); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vmlaq_n_s32 (int32x4_t __a, int32x4_t __b, int32_t __c) +{ + return (int32x4_t)__builtin_neon_vmla_nv4si (__a, __b, (__builtin_neon_si) __c, 1); +} + +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vmlaq_n_f32 (float32x4_t __a, float32x4_t __b, float32_t __c) +{ + return (float32x4_t)__builtin_neon_vmla_nv4sf (__a, __b, (__builtin_neon_sf) __c, 3); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vmlaq_n_u16 (uint16x8_t __a, uint16x8_t __b, uint16_t __c) +{ + return (uint16x8_t)__builtin_neon_vmla_nv8hi ((int16x8_t) __a, (int16x8_t) __b, (__builtin_neon_hi) __c, 0); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vmlaq_n_u32 (uint32x4_t __a, uint32x4_t __b, uint32_t __c) +{ + return (uint32x4_t)__builtin_neon_vmla_nv4si ((int32x4_t) __a, (int32x4_t) __b, (__builtin_neon_si) __c, 0); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vmlal_n_s16 (int32x4_t __a, int16x4_t __b, int16_t __c) +{ + return (int32x4_t)__builtin_neon_vmlal_nv4hi (__a, __b, (__builtin_neon_hi) __c, 1); +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vmlal_n_s32 (int64x2_t __a, int32x2_t __b, int32_t __c) +{ + return (int64x2_t)__builtin_neon_vmlal_nv2si (__a, __b, (__builtin_neon_si) __c, 1); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vmlal_n_u16 (uint32x4_t __a, uint16x4_t __b, uint16_t __c) +{ + return (uint32x4_t)__builtin_neon_vmlal_nv4hi ((int32x4_t) __a, (int16x4_t) __b, (__builtin_neon_hi) __c, 0); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vmlal_n_u32 (uint64x2_t __a, uint32x2_t __b, uint32_t __c) +{ + return (uint64x2_t)__builtin_neon_vmlal_nv2si ((int64x2_t) __a, (int32x2_t) __b, (__builtin_neon_si) __c, 0); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vqdmlal_n_s16 (int32x4_t __a, int16x4_t __b, int16_t __c) +{ + return (int32x4_t)__builtin_neon_vqdmlal_nv4hi (__a, __b, (__builtin_neon_hi) __c, 1); +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vqdmlal_n_s32 (int64x2_t __a, int32x2_t __b, int32_t __c) +{ + return (int64x2_t)__builtin_neon_vqdmlal_nv2si (__a, __b, (__builtin_neon_si) __c, 1); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vmls_n_s16 (int16x4_t __a, int16x4_t __b, int16_t __c) +{ + return (int16x4_t)__builtin_neon_vmls_nv4hi (__a, __b, (__builtin_neon_hi) __c, 1); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vmls_n_s32 (int32x2_t __a, int32x2_t __b, int32_t __c) +{ + return (int32x2_t)__builtin_neon_vmls_nv2si (__a, __b, (__builtin_neon_si) __c, 1); +} + +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vmls_n_f32 (float32x2_t __a, float32x2_t __b, float32_t __c) +{ + return (float32x2_t)__builtin_neon_vmls_nv2sf (__a, __b, (__builtin_neon_sf) __c, 3); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vmls_n_u16 (uint16x4_t __a, uint16x4_t __b, uint16_t __c) +{ + return (uint16x4_t)__builtin_neon_vmls_nv4hi ((int16x4_t) __a, (int16x4_t) __b, (__builtin_neon_hi) __c, 0); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vmls_n_u32 (uint32x2_t __a, uint32x2_t __b, uint32_t __c) +{ + return (uint32x2_t)__builtin_neon_vmls_nv2si ((int32x2_t) __a, (int32x2_t) __b, (__builtin_neon_si) __c, 0); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vmlsq_n_s16 (int16x8_t __a, int16x8_t __b, int16_t __c) +{ + return (int16x8_t)__builtin_neon_vmls_nv8hi (__a, __b, (__builtin_neon_hi) __c, 1); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vmlsq_n_s32 (int32x4_t __a, int32x4_t __b, int32_t __c) +{ + return (int32x4_t)__builtin_neon_vmls_nv4si (__a, __b, (__builtin_neon_si) __c, 1); +} + +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vmlsq_n_f32 (float32x4_t __a, float32x4_t __b, float32_t __c) +{ + return (float32x4_t)__builtin_neon_vmls_nv4sf (__a, __b, (__builtin_neon_sf) __c, 3); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vmlsq_n_u16 (uint16x8_t __a, uint16x8_t __b, uint16_t __c) +{ + return (uint16x8_t)__builtin_neon_vmls_nv8hi ((int16x8_t) __a, (int16x8_t) __b, (__builtin_neon_hi) __c, 0); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vmlsq_n_u32 (uint32x4_t __a, uint32x4_t __b, uint32_t __c) +{ + return (uint32x4_t)__builtin_neon_vmls_nv4si ((int32x4_t) __a, (int32x4_t) __b, (__builtin_neon_si) __c, 0); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vmlsl_n_s16 (int32x4_t __a, int16x4_t __b, int16_t __c) +{ + return (int32x4_t)__builtin_neon_vmlsl_nv4hi (__a, __b, (__builtin_neon_hi) __c, 1); +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vmlsl_n_s32 (int64x2_t __a, int32x2_t __b, int32_t __c) +{ + return (int64x2_t)__builtin_neon_vmlsl_nv2si (__a, __b, (__builtin_neon_si) __c, 1); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vmlsl_n_u16 (uint32x4_t __a, uint16x4_t __b, uint16_t __c) +{ + return (uint32x4_t)__builtin_neon_vmlsl_nv4hi ((int32x4_t) __a, (int16x4_t) __b, (__builtin_neon_hi) __c, 0); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vmlsl_n_u32 (uint64x2_t __a, uint32x2_t __b, uint32_t __c) +{ + return (uint64x2_t)__builtin_neon_vmlsl_nv2si ((int64x2_t) __a, (int32x2_t) __b, (__builtin_neon_si) __c, 0); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vqdmlsl_n_s16 (int32x4_t __a, int16x4_t __b, int16_t __c) +{ + return (int32x4_t)__builtin_neon_vqdmlsl_nv4hi (__a, __b, (__builtin_neon_hi) __c, 1); +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vqdmlsl_n_s32 (int64x2_t __a, int32x2_t __b, int32_t __c) +{ + return (int64x2_t)__builtin_neon_vqdmlsl_nv2si (__a, __b, (__builtin_neon_si) __c, 1); +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vext_s8 (int8x8_t __a, int8x8_t __b, const int __c) +{ + return (int8x8_t)__builtin_neon_vextv8qi (__a, __b, __c); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vext_s16 (int16x4_t __a, int16x4_t __b, const int __c) +{ + return (int16x4_t)__builtin_neon_vextv4hi (__a, __b, __c); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vext_s32 (int32x2_t __a, int32x2_t __b, const int __c) +{ + return (int32x2_t)__builtin_neon_vextv2si (__a, __b, __c); +} + +__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +vext_s64 (int64x1_t __a, int64x1_t __b, const int __c) +{ + return (int64x1_t)__builtin_neon_vextdi (__a, __b, __c); +} + +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vext_f32 (float32x2_t __a, float32x2_t __b, const int __c) +{ + return (float32x2_t)__builtin_neon_vextv2sf (__a, __b, __c); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vext_u8 (uint8x8_t __a, uint8x8_t __b, const int __c) +{ + return (uint8x8_t)__builtin_neon_vextv8qi ((int8x8_t) __a, (int8x8_t) __b, __c); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vext_u16 (uint16x4_t __a, uint16x4_t __b, const int __c) +{ + return (uint16x4_t)__builtin_neon_vextv4hi ((int16x4_t) __a, (int16x4_t) __b, __c); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vext_u32 (uint32x2_t __a, uint32x2_t __b, const int __c) +{ + return (uint32x2_t)__builtin_neon_vextv2si ((int32x2_t) __a, (int32x2_t) __b, __c); +} + +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vext_u64 (uint64x1_t __a, uint64x1_t __b, const int __c) +{ + return (uint64x1_t)__builtin_neon_vextdi ((int64x1_t) __a, (int64x1_t) __b, __c); +} + +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +vext_p8 (poly8x8_t __a, poly8x8_t __b, const int __c) +{ + return (poly8x8_t)__builtin_neon_vextv8qi ((int8x8_t) __a, (int8x8_t) __b, __c); +} + +__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) +vext_p16 (poly16x4_t __a, poly16x4_t __b, const int __c) +{ + return (poly16x4_t)__builtin_neon_vextv4hi ((int16x4_t) __a, (int16x4_t) __b, __c); +} + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vextq_s8 (int8x16_t __a, int8x16_t __b, const int __c) +{ + return (int8x16_t)__builtin_neon_vextv16qi (__a, __b, __c); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vextq_s16 (int16x8_t __a, int16x8_t __b, const int __c) +{ + return (int16x8_t)__builtin_neon_vextv8hi (__a, __b, __c); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vextq_s32 (int32x4_t __a, int32x4_t __b, const int __c) +{ + return (int32x4_t)__builtin_neon_vextv4si (__a, __b, __c); +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vextq_s64 (int64x2_t __a, int64x2_t __b, const int __c) +{ + return (int64x2_t)__builtin_neon_vextv2di (__a, __b, __c); +} + +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vextq_f32 (float32x4_t __a, float32x4_t __b, const int __c) +{ + return (float32x4_t)__builtin_neon_vextv4sf (__a, __b, __c); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vextq_u8 (uint8x16_t __a, uint8x16_t __b, const int __c) +{ + return (uint8x16_t)__builtin_neon_vextv16qi ((int8x16_t) __a, (int8x16_t) __b, __c); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vextq_u16 (uint16x8_t __a, uint16x8_t __b, const int __c) +{ + return (uint16x8_t)__builtin_neon_vextv8hi ((int16x8_t) __a, (int16x8_t) __b, __c); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vextq_u32 (uint32x4_t __a, uint32x4_t __b, const int __c) +{ + return (uint32x4_t)__builtin_neon_vextv4si ((int32x4_t) __a, (int32x4_t) __b, __c); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vextq_u64 (uint64x2_t __a, uint64x2_t __b, const int __c) +{ + return (uint64x2_t)__builtin_neon_vextv2di ((int64x2_t) __a, (int64x2_t) __b, __c); +} + +__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +vextq_p8 (poly8x16_t __a, poly8x16_t __b, const int __c) +{ + return (poly8x16_t)__builtin_neon_vextv16qi ((int8x16_t) __a, (int8x16_t) __b, __c); +} + +__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) +vextq_p16 (poly16x8_t __a, poly16x8_t __b, const int __c) +{ + return (poly16x8_t)__builtin_neon_vextv8hi ((int16x8_t) __a, (int16x8_t) __b, __c); +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vrev64_s8 (int8x8_t __a) +{ + return (int8x8_t)__builtin_neon_vrev64v8qi (__a, 1); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vrev64_s16 (int16x4_t __a) +{ + return (int16x4_t)__builtin_neon_vrev64v4hi (__a, 1); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vrev64_s32 (int32x2_t __a) +{ + return (int32x2_t)__builtin_neon_vrev64v2si (__a, 1); +} + +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vrev64_f32 (float32x2_t __a) +{ + return (float32x2_t)__builtin_neon_vrev64v2sf (__a, 3); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vrev64_u8 (uint8x8_t __a) +{ + return (uint8x8_t)__builtin_neon_vrev64v8qi ((int8x8_t) __a, 0); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vrev64_u16 (uint16x4_t __a) +{ + return (uint16x4_t)__builtin_neon_vrev64v4hi ((int16x4_t) __a, 0); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vrev64_u32 (uint32x2_t __a) +{ + return (uint32x2_t)__builtin_neon_vrev64v2si ((int32x2_t) __a, 0); +} + +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +vrev64_p8 (poly8x8_t __a) +{ + return (poly8x8_t)__builtin_neon_vrev64v8qi ((int8x8_t) __a, 2); +} + +__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) +vrev64_p16 (poly16x4_t __a) +{ + return (poly16x4_t)__builtin_neon_vrev64v4hi ((int16x4_t) __a, 2); +} + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vrev64q_s8 (int8x16_t __a) +{ + return (int8x16_t)__builtin_neon_vrev64v16qi (__a, 1); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vrev64q_s16 (int16x8_t __a) +{ + return (int16x8_t)__builtin_neon_vrev64v8hi (__a, 1); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vrev64q_s32 (int32x4_t __a) +{ + return (int32x4_t)__builtin_neon_vrev64v4si (__a, 1); +} + +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vrev64q_f32 (float32x4_t __a) +{ + return (float32x4_t)__builtin_neon_vrev64v4sf (__a, 3); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vrev64q_u8 (uint8x16_t __a) +{ + return (uint8x16_t)__builtin_neon_vrev64v16qi ((int8x16_t) __a, 0); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vrev64q_u16 (uint16x8_t __a) +{ + return (uint16x8_t)__builtin_neon_vrev64v8hi ((int16x8_t) __a, 0); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vrev64q_u32 (uint32x4_t __a) +{ + return (uint32x4_t)__builtin_neon_vrev64v4si ((int32x4_t) __a, 0); +} + +__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +vrev64q_p8 (poly8x16_t __a) +{ + return (poly8x16_t)__builtin_neon_vrev64v16qi ((int8x16_t) __a, 2); +} + +__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) +vrev64q_p16 (poly16x8_t __a) +{ + return (poly16x8_t)__builtin_neon_vrev64v8hi ((int16x8_t) __a, 2); +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vrev32_s8 (int8x8_t __a) +{ + return (int8x8_t)__builtin_neon_vrev32v8qi (__a, 1); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vrev32_s16 (int16x4_t __a) +{ + return (int16x4_t)__builtin_neon_vrev32v4hi (__a, 1); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vrev32_u8 (uint8x8_t __a) +{ + return (uint8x8_t)__builtin_neon_vrev32v8qi ((int8x8_t) __a, 0); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vrev32_u16 (uint16x4_t __a) +{ + return (uint16x4_t)__builtin_neon_vrev32v4hi ((int16x4_t) __a, 0); +} + +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +vrev32_p8 (poly8x8_t __a) +{ + return (poly8x8_t)__builtin_neon_vrev32v8qi ((int8x8_t) __a, 2); +} + +__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) +vrev32_p16 (poly16x4_t __a) +{ + return (poly16x4_t)__builtin_neon_vrev32v4hi ((int16x4_t) __a, 2); +} + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vrev32q_s8 (int8x16_t __a) +{ + return (int8x16_t)__builtin_neon_vrev32v16qi (__a, 1); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vrev32q_s16 (int16x8_t __a) +{ + return (int16x8_t)__builtin_neon_vrev32v8hi (__a, 1); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vrev32q_u8 (uint8x16_t __a) +{ + return (uint8x16_t)__builtin_neon_vrev32v16qi ((int8x16_t) __a, 0); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vrev32q_u16 (uint16x8_t __a) +{ + return (uint16x8_t)__builtin_neon_vrev32v8hi ((int16x8_t) __a, 0); +} + +__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +vrev32q_p8 (poly8x16_t __a) +{ + return (poly8x16_t)__builtin_neon_vrev32v16qi ((int8x16_t) __a, 2); +} + +__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) +vrev32q_p16 (poly16x8_t __a) +{ + return (poly16x8_t)__builtin_neon_vrev32v8hi ((int16x8_t) __a, 2); +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vrev16_s8 (int8x8_t __a) +{ + return (int8x8_t)__builtin_neon_vrev16v8qi (__a, 1); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vrev16_u8 (uint8x8_t __a) +{ + return (uint8x8_t)__builtin_neon_vrev16v8qi ((int8x8_t) __a, 0); +} + +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +vrev16_p8 (poly8x8_t __a) +{ + return (poly8x8_t)__builtin_neon_vrev16v8qi ((int8x8_t) __a, 2); +} + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vrev16q_s8 (int8x16_t __a) +{ + return (int8x16_t)__builtin_neon_vrev16v16qi (__a, 1); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vrev16q_u8 (uint8x16_t __a) +{ + return (uint8x16_t)__builtin_neon_vrev16v16qi ((int8x16_t) __a, 0); +} + +__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +vrev16q_p8 (poly8x16_t __a) +{ + return (poly8x16_t)__builtin_neon_vrev16v16qi ((int8x16_t) __a, 2); +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vbsl_s8 (uint8x8_t __a, int8x8_t __b, int8x8_t __c) +{ + return (int8x8_t)__builtin_neon_vbslv8qi ((int8x8_t) __a, __b, __c); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vbsl_s16 (uint16x4_t __a, int16x4_t __b, int16x4_t __c) +{ + return (int16x4_t)__builtin_neon_vbslv4hi ((int16x4_t) __a, __b, __c); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vbsl_s32 (uint32x2_t __a, int32x2_t __b, int32x2_t __c) +{ + return (int32x2_t)__builtin_neon_vbslv2si ((int32x2_t) __a, __b, __c); +} + +__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +vbsl_s64 (uint64x1_t __a, int64x1_t __b, int64x1_t __c) +{ + return (int64x1_t)__builtin_neon_vbsldi ((int64x1_t) __a, __b, __c); +} + +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vbsl_f32 (uint32x2_t __a, float32x2_t __b, float32x2_t __c) +{ + return (float32x2_t)__builtin_neon_vbslv2sf ((int32x2_t) __a, __b, __c); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vbsl_u8 (uint8x8_t __a, uint8x8_t __b, uint8x8_t __c) +{ + return (uint8x8_t)__builtin_neon_vbslv8qi ((int8x8_t) __a, (int8x8_t) __b, (int8x8_t) __c); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vbsl_u16 (uint16x4_t __a, uint16x4_t __b, uint16x4_t __c) +{ + return (uint16x4_t)__builtin_neon_vbslv4hi ((int16x4_t) __a, (int16x4_t) __b, (int16x4_t) __c); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vbsl_u32 (uint32x2_t __a, uint32x2_t __b, uint32x2_t __c) +{ + return (uint32x2_t)__builtin_neon_vbslv2si ((int32x2_t) __a, (int32x2_t) __b, (int32x2_t) __c); +} + +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vbsl_u64 (uint64x1_t __a, uint64x1_t __b, uint64x1_t __c) +{ + return (uint64x1_t)__builtin_neon_vbsldi ((int64x1_t) __a, (int64x1_t) __b, (int64x1_t) __c); +} + +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +vbsl_p8 (uint8x8_t __a, poly8x8_t __b, poly8x8_t __c) +{ + return (poly8x8_t)__builtin_neon_vbslv8qi ((int8x8_t) __a, (int8x8_t) __b, (int8x8_t) __c); +} + +__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) +vbsl_p16 (uint16x4_t __a, poly16x4_t __b, poly16x4_t __c) +{ + return (poly16x4_t)__builtin_neon_vbslv4hi ((int16x4_t) __a, (int16x4_t) __b, (int16x4_t) __c); +} + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vbslq_s8 (uint8x16_t __a, int8x16_t __b, int8x16_t __c) +{ + return (int8x16_t)__builtin_neon_vbslv16qi ((int8x16_t) __a, __b, __c); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vbslq_s16 (uint16x8_t __a, int16x8_t __b, int16x8_t __c) +{ + return (int16x8_t)__builtin_neon_vbslv8hi ((int16x8_t) __a, __b, __c); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vbslq_s32 (uint32x4_t __a, int32x4_t __b, int32x4_t __c) +{ + return (int32x4_t)__builtin_neon_vbslv4si ((int32x4_t) __a, __b, __c); +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vbslq_s64 (uint64x2_t __a, int64x2_t __b, int64x2_t __c) +{ + return (int64x2_t)__builtin_neon_vbslv2di ((int64x2_t) __a, __b, __c); +} + +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vbslq_f32 (uint32x4_t __a, float32x4_t __b, float32x4_t __c) +{ + return (float32x4_t)__builtin_neon_vbslv4sf ((int32x4_t) __a, __b, __c); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vbslq_u8 (uint8x16_t __a, uint8x16_t __b, uint8x16_t __c) +{ + return (uint8x16_t)__builtin_neon_vbslv16qi ((int8x16_t) __a, (int8x16_t) __b, (int8x16_t) __c); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vbslq_u16 (uint16x8_t __a, uint16x8_t __b, uint16x8_t __c) +{ + return (uint16x8_t)__builtin_neon_vbslv8hi ((int16x8_t) __a, (int16x8_t) __b, (int16x8_t) __c); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vbslq_u32 (uint32x4_t __a, uint32x4_t __b, uint32x4_t __c) +{ + return (uint32x4_t)__builtin_neon_vbslv4si ((int32x4_t) __a, (int32x4_t) __b, (int32x4_t) __c); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vbslq_u64 (uint64x2_t __a, uint64x2_t __b, uint64x2_t __c) +{ + return (uint64x2_t)__builtin_neon_vbslv2di ((int64x2_t) __a, (int64x2_t) __b, (int64x2_t) __c); +} + +__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +vbslq_p8 (uint8x16_t __a, poly8x16_t __b, poly8x16_t __c) +{ + return (poly8x16_t)__builtin_neon_vbslv16qi ((int8x16_t) __a, (int8x16_t) __b, (int8x16_t) __c); +} + +__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) +vbslq_p16 (uint16x8_t __a, poly16x8_t __b, poly16x8_t __c) +{ + return (poly16x8_t)__builtin_neon_vbslv8hi ((int16x8_t) __a, (int16x8_t) __b, (int16x8_t) __c); +} + +__extension__ static __inline int8x8x2_t __attribute__ ((__always_inline__)) +vtrn_s8 (int8x8_t __a, int8x8_t __b) +{ + int8x8x2_t __rv; + __builtin_neon_vtrnv8qi (&__rv.val[0], __a, __b); + return __rv; +} + +__extension__ static __inline int16x4x2_t __attribute__ ((__always_inline__)) +vtrn_s16 (int16x4_t __a, int16x4_t __b) +{ + int16x4x2_t __rv; + __builtin_neon_vtrnv4hi (&__rv.val[0], __a, __b); + return __rv; +} + +__extension__ static __inline int32x2x2_t __attribute__ ((__always_inline__)) +vtrn_s32 (int32x2_t __a, int32x2_t __b) +{ + int32x2x2_t __rv; + __builtin_neon_vtrnv2si (&__rv.val[0], __a, __b); + return __rv; +} + +__extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__)) +vtrn_f32 (float32x2_t __a, float32x2_t __b) +{ + float32x2x2_t __rv; + __builtin_neon_vtrnv2sf (&__rv.val[0], __a, __b); + return __rv; +} + +__extension__ static __inline uint8x8x2_t __attribute__ ((__always_inline__)) +vtrn_u8 (uint8x8_t __a, uint8x8_t __b) +{ + uint8x8x2_t __rv; + __builtin_neon_vtrnv8qi ((int8x8_t *) &__rv.val[0], (int8x8_t) __a, (int8x8_t) __b); + return __rv; +} + +__extension__ static __inline uint16x4x2_t __attribute__ ((__always_inline__)) +vtrn_u16 (uint16x4_t __a, uint16x4_t __b) +{ + uint16x4x2_t __rv; + __builtin_neon_vtrnv4hi ((int16x4_t *) &__rv.val[0], (int16x4_t) __a, (int16x4_t) __b); + return __rv; +} + +__extension__ static __inline uint32x2x2_t __attribute__ ((__always_inline__)) +vtrn_u32 (uint32x2_t __a, uint32x2_t __b) +{ + uint32x2x2_t __rv; + __builtin_neon_vtrnv2si ((int32x2_t *) &__rv.val[0], (int32x2_t) __a, (int32x2_t) __b); + return __rv; +} + +__extension__ static __inline poly8x8x2_t __attribute__ ((__always_inline__)) +vtrn_p8 (poly8x8_t __a, poly8x8_t __b) +{ + poly8x8x2_t __rv; + __builtin_neon_vtrnv8qi ((int8x8_t *) &__rv.val[0], (int8x8_t) __a, (int8x8_t) __b); + return __rv; +} + +__extension__ static __inline poly16x4x2_t __attribute__ ((__always_inline__)) +vtrn_p16 (poly16x4_t __a, poly16x4_t __b) +{ + poly16x4x2_t __rv; + __builtin_neon_vtrnv4hi ((int16x4_t *) &__rv.val[0], (int16x4_t) __a, (int16x4_t) __b); + return __rv; +} + +__extension__ static __inline int8x16x2_t __attribute__ ((__always_inline__)) +vtrnq_s8 (int8x16_t __a, int8x16_t __b) +{ + int8x16x2_t __rv; + __builtin_neon_vtrnv16qi (&__rv.val[0], __a, __b); + return __rv; +} + +__extension__ static __inline int16x8x2_t __attribute__ ((__always_inline__)) +vtrnq_s16 (int16x8_t __a, int16x8_t __b) +{ + int16x8x2_t __rv; + __builtin_neon_vtrnv8hi (&__rv.val[0], __a, __b); + return __rv; +} + +__extension__ static __inline int32x4x2_t __attribute__ ((__always_inline__)) +vtrnq_s32 (int32x4_t __a, int32x4_t __b) +{ + int32x4x2_t __rv; + __builtin_neon_vtrnv4si (&__rv.val[0], __a, __b); + return __rv; +} + +__extension__ static __inline float32x4x2_t __attribute__ ((__always_inline__)) +vtrnq_f32 (float32x4_t __a, float32x4_t __b) +{ + float32x4x2_t __rv; + __builtin_neon_vtrnv4sf (&__rv.val[0], __a, __b); + return __rv; +} + +__extension__ static __inline uint8x16x2_t __attribute__ ((__always_inline__)) +vtrnq_u8 (uint8x16_t __a, uint8x16_t __b) +{ + uint8x16x2_t __rv; + __builtin_neon_vtrnv16qi ((int8x16_t *) &__rv.val[0], (int8x16_t) __a, (int8x16_t) __b); + return __rv; +} + +__extension__ static __inline uint16x8x2_t __attribute__ ((__always_inline__)) +vtrnq_u16 (uint16x8_t __a, uint16x8_t __b) +{ + uint16x8x2_t __rv; + __builtin_neon_vtrnv8hi ((int16x8_t *) &__rv.val[0], (int16x8_t) __a, (int16x8_t) __b); + return __rv; +} + +__extension__ static __inline uint32x4x2_t __attribute__ ((__always_inline__)) +vtrnq_u32 (uint32x4_t __a, uint32x4_t __b) +{ + uint32x4x2_t __rv; + __builtin_neon_vtrnv4si ((int32x4_t *) &__rv.val[0], (int32x4_t) __a, (int32x4_t) __b); + return __rv; +} + +__extension__ static __inline poly8x16x2_t __attribute__ ((__always_inline__)) +vtrnq_p8 (poly8x16_t __a, poly8x16_t __b) +{ + poly8x16x2_t __rv; + __builtin_neon_vtrnv16qi ((int8x16_t *) &__rv.val[0], (int8x16_t) __a, (int8x16_t) __b); + return __rv; +} + +__extension__ static __inline poly16x8x2_t __attribute__ ((__always_inline__)) +vtrnq_p16 (poly16x8_t __a, poly16x8_t __b) +{ + poly16x8x2_t __rv; + __builtin_neon_vtrnv8hi ((int16x8_t *) &__rv.val[0], (int16x8_t) __a, (int16x8_t) __b); + return __rv; +} + +__extension__ static __inline int8x8x2_t __attribute__ ((__always_inline__)) +vzip_s8 (int8x8_t __a, int8x8_t __b) +{ + int8x8x2_t __rv; + __builtin_neon_vzipv8qi (&__rv.val[0], __a, __b); + return __rv; +} + +__extension__ static __inline int16x4x2_t __attribute__ ((__always_inline__)) +vzip_s16 (int16x4_t __a, int16x4_t __b) +{ + int16x4x2_t __rv; + __builtin_neon_vzipv4hi (&__rv.val[0], __a, __b); + return __rv; +} + +__extension__ static __inline int32x2x2_t __attribute__ ((__always_inline__)) +vzip_s32 (int32x2_t __a, int32x2_t __b) +{ + int32x2x2_t __rv; + __builtin_neon_vzipv2si (&__rv.val[0], __a, __b); + return __rv; +} + +__extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__)) +vzip_f32 (float32x2_t __a, float32x2_t __b) +{ + float32x2x2_t __rv; + __builtin_neon_vzipv2sf (&__rv.val[0], __a, __b); + return __rv; +} + +__extension__ static __inline uint8x8x2_t __attribute__ ((__always_inline__)) +vzip_u8 (uint8x8_t __a, uint8x8_t __b) +{ + uint8x8x2_t __rv; + __builtin_neon_vzipv8qi ((int8x8_t *) &__rv.val[0], (int8x8_t) __a, (int8x8_t) __b); + return __rv; +} + +__extension__ static __inline uint16x4x2_t __attribute__ ((__always_inline__)) +vzip_u16 (uint16x4_t __a, uint16x4_t __b) +{ + uint16x4x2_t __rv; + __builtin_neon_vzipv4hi ((int16x4_t *) &__rv.val[0], (int16x4_t) __a, (int16x4_t) __b); + return __rv; +} + +__extension__ static __inline uint32x2x2_t __attribute__ ((__always_inline__)) +vzip_u32 (uint32x2_t __a, uint32x2_t __b) +{ + uint32x2x2_t __rv; + __builtin_neon_vzipv2si ((int32x2_t *) &__rv.val[0], (int32x2_t) __a, (int32x2_t) __b); + return __rv; +} + +__extension__ static __inline poly8x8x2_t __attribute__ ((__always_inline__)) +vzip_p8 (poly8x8_t __a, poly8x8_t __b) +{ + poly8x8x2_t __rv; + __builtin_neon_vzipv8qi ((int8x8_t *) &__rv.val[0], (int8x8_t) __a, (int8x8_t) __b); + return __rv; +} + +__extension__ static __inline poly16x4x2_t __attribute__ ((__always_inline__)) +vzip_p16 (poly16x4_t __a, poly16x4_t __b) +{ + poly16x4x2_t __rv; + __builtin_neon_vzipv4hi ((int16x4_t *) &__rv.val[0], (int16x4_t) __a, (int16x4_t) __b); + return __rv; +} + +__extension__ static __inline int8x16x2_t __attribute__ ((__always_inline__)) +vzipq_s8 (int8x16_t __a, int8x16_t __b) +{ + int8x16x2_t __rv; + __builtin_neon_vzipv16qi (&__rv.val[0], __a, __b); + return __rv; +} + +__extension__ static __inline int16x8x2_t __attribute__ ((__always_inline__)) +vzipq_s16 (int16x8_t __a, int16x8_t __b) +{ + int16x8x2_t __rv; + __builtin_neon_vzipv8hi (&__rv.val[0], __a, __b); + return __rv; +} + +__extension__ static __inline int32x4x2_t __attribute__ ((__always_inline__)) +vzipq_s32 (int32x4_t __a, int32x4_t __b) +{ + int32x4x2_t __rv; + __builtin_neon_vzipv4si (&__rv.val[0], __a, __b); + return __rv; +} + +__extension__ static __inline float32x4x2_t __attribute__ ((__always_inline__)) +vzipq_f32 (float32x4_t __a, float32x4_t __b) +{ + float32x4x2_t __rv; + __builtin_neon_vzipv4sf (&__rv.val[0], __a, __b); + return __rv; +} + +__extension__ static __inline uint8x16x2_t __attribute__ ((__always_inline__)) +vzipq_u8 (uint8x16_t __a, uint8x16_t __b) +{ + uint8x16x2_t __rv; + __builtin_neon_vzipv16qi ((int8x16_t *) &__rv.val[0], (int8x16_t) __a, (int8x16_t) __b); + return __rv; +} + +__extension__ static __inline uint16x8x2_t __attribute__ ((__always_inline__)) +vzipq_u16 (uint16x8_t __a, uint16x8_t __b) +{ + uint16x8x2_t __rv; + __builtin_neon_vzipv8hi ((int16x8_t *) &__rv.val[0], (int16x8_t) __a, (int16x8_t) __b); + return __rv; +} + +__extension__ static __inline uint32x4x2_t __attribute__ ((__always_inline__)) +vzipq_u32 (uint32x4_t __a, uint32x4_t __b) +{ + uint32x4x2_t __rv; + __builtin_neon_vzipv4si ((int32x4_t *) &__rv.val[0], (int32x4_t) __a, (int32x4_t) __b); + return __rv; +} + +__extension__ static __inline poly8x16x2_t __attribute__ ((__always_inline__)) +vzipq_p8 (poly8x16_t __a, poly8x16_t __b) +{ + poly8x16x2_t __rv; + __builtin_neon_vzipv16qi ((int8x16_t *) &__rv.val[0], (int8x16_t) __a, (int8x16_t) __b); + return __rv; +} + +__extension__ static __inline poly16x8x2_t __attribute__ ((__always_inline__)) +vzipq_p16 (poly16x8_t __a, poly16x8_t __b) +{ + poly16x8x2_t __rv; + __builtin_neon_vzipv8hi ((int16x8_t *) &__rv.val[0], (int16x8_t) __a, (int16x8_t) __b); + return __rv; +} + +__extension__ static __inline int8x8x2_t __attribute__ ((__always_inline__)) +vuzp_s8 (int8x8_t __a, int8x8_t __b) +{ + int8x8x2_t __rv; + __builtin_neon_vuzpv8qi (&__rv.val[0], __a, __b); + return __rv; +} + +__extension__ static __inline int16x4x2_t __attribute__ ((__always_inline__)) +vuzp_s16 (int16x4_t __a, int16x4_t __b) +{ + int16x4x2_t __rv; + __builtin_neon_vuzpv4hi (&__rv.val[0], __a, __b); + return __rv; +} + +__extension__ static __inline int32x2x2_t __attribute__ ((__always_inline__)) +vuzp_s32 (int32x2_t __a, int32x2_t __b) +{ + int32x2x2_t __rv; + __builtin_neon_vuzpv2si (&__rv.val[0], __a, __b); + return __rv; +} + +__extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__)) +vuzp_f32 (float32x2_t __a, float32x2_t __b) +{ + float32x2x2_t __rv; + __builtin_neon_vuzpv2sf (&__rv.val[0], __a, __b); + return __rv; +} + +__extension__ static __inline uint8x8x2_t __attribute__ ((__always_inline__)) +vuzp_u8 (uint8x8_t __a, uint8x8_t __b) +{ + uint8x8x2_t __rv; + __builtin_neon_vuzpv8qi ((int8x8_t *) &__rv.val[0], (int8x8_t) __a, (int8x8_t) __b); + return __rv; +} + +__extension__ static __inline uint16x4x2_t __attribute__ ((__always_inline__)) +vuzp_u16 (uint16x4_t __a, uint16x4_t __b) +{ + uint16x4x2_t __rv; + __builtin_neon_vuzpv4hi ((int16x4_t *) &__rv.val[0], (int16x4_t) __a, (int16x4_t) __b); + return __rv; +} + +__extension__ static __inline uint32x2x2_t __attribute__ ((__always_inline__)) +vuzp_u32 (uint32x2_t __a, uint32x2_t __b) +{ + uint32x2x2_t __rv; + __builtin_neon_vuzpv2si ((int32x2_t *) &__rv.val[0], (int32x2_t) __a, (int32x2_t) __b); + return __rv; +} + +__extension__ static __inline poly8x8x2_t __attribute__ ((__always_inline__)) +vuzp_p8 (poly8x8_t __a, poly8x8_t __b) +{ + poly8x8x2_t __rv; + __builtin_neon_vuzpv8qi ((int8x8_t *) &__rv.val[0], (int8x8_t) __a, (int8x8_t) __b); + return __rv; +} + +__extension__ static __inline poly16x4x2_t __attribute__ ((__always_inline__)) +vuzp_p16 (poly16x4_t __a, poly16x4_t __b) +{ + poly16x4x2_t __rv; + __builtin_neon_vuzpv4hi ((int16x4_t *) &__rv.val[0], (int16x4_t) __a, (int16x4_t) __b); + return __rv; +} + +__extension__ static __inline int8x16x2_t __attribute__ ((__always_inline__)) +vuzpq_s8 (int8x16_t __a, int8x16_t __b) +{ + int8x16x2_t __rv; + __builtin_neon_vuzpv16qi (&__rv.val[0], __a, __b); + return __rv; +} + +__extension__ static __inline int16x8x2_t __attribute__ ((__always_inline__)) +vuzpq_s16 (int16x8_t __a, int16x8_t __b) +{ + int16x8x2_t __rv; + __builtin_neon_vuzpv8hi (&__rv.val[0], __a, __b); + return __rv; +} + +__extension__ static __inline int32x4x2_t __attribute__ ((__always_inline__)) +vuzpq_s32 (int32x4_t __a, int32x4_t __b) +{ + int32x4x2_t __rv; + __builtin_neon_vuzpv4si (&__rv.val[0], __a, __b); + return __rv; +} + +__extension__ static __inline float32x4x2_t __attribute__ ((__always_inline__)) +vuzpq_f32 (float32x4_t __a, float32x4_t __b) +{ + float32x4x2_t __rv; + __builtin_neon_vuzpv4sf (&__rv.val[0], __a, __b); + return __rv; +} + +__extension__ static __inline uint8x16x2_t __attribute__ ((__always_inline__)) +vuzpq_u8 (uint8x16_t __a, uint8x16_t __b) +{ + uint8x16x2_t __rv; + __builtin_neon_vuzpv16qi ((int8x16_t *) &__rv.val[0], (int8x16_t) __a, (int8x16_t) __b); + return __rv; +} + +__extension__ static __inline uint16x8x2_t __attribute__ ((__always_inline__)) +vuzpq_u16 (uint16x8_t __a, uint16x8_t __b) +{ + uint16x8x2_t __rv; + __builtin_neon_vuzpv8hi ((int16x8_t *) &__rv.val[0], (int16x8_t) __a, (int16x8_t) __b); + return __rv; +} + +__extension__ static __inline uint32x4x2_t __attribute__ ((__always_inline__)) +vuzpq_u32 (uint32x4_t __a, uint32x4_t __b) +{ + uint32x4x2_t __rv; + __builtin_neon_vuzpv4si ((int32x4_t *) &__rv.val[0], (int32x4_t) __a, (int32x4_t) __b); + return __rv; +} + +__extension__ static __inline poly8x16x2_t __attribute__ ((__always_inline__)) +vuzpq_p8 (poly8x16_t __a, poly8x16_t __b) +{ + poly8x16x2_t __rv; + __builtin_neon_vuzpv16qi ((int8x16_t *) &__rv.val[0], (int8x16_t) __a, (int8x16_t) __b); + return __rv; +} + +__extension__ static __inline poly16x8x2_t __attribute__ ((__always_inline__)) +vuzpq_p16 (poly16x8_t __a, poly16x8_t __b) +{ + poly16x8x2_t __rv; + __builtin_neon_vuzpv8hi ((int16x8_t *) &__rv.val[0], (int16x8_t) __a, (int16x8_t) __b); + return __rv; +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vld1_s8 (const int8_t * __a) +{ + return (int8x8_t)__builtin_neon_vld1v8qi ((const __builtin_neon_qi *) __a); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vld1_s16 (const int16_t * __a) +{ + return (int16x4_t)__builtin_neon_vld1v4hi ((const __builtin_neon_hi *) __a); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vld1_s32 (const int32_t * __a) +{ + return (int32x2_t)__builtin_neon_vld1v2si ((const __builtin_neon_si *) __a); +} + +__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +vld1_s64 (const int64_t * __a) +{ + return (int64x1_t)__builtin_neon_vld1di ((const __builtin_neon_di *) __a); +} + +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vld1_f32 (const float32_t * __a) +{ + return (float32x2_t)__builtin_neon_vld1v2sf ((const __builtin_neon_sf *) __a); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vld1_u8 (const uint8_t * __a) +{ + return (uint8x8_t)__builtin_neon_vld1v8qi ((const __builtin_neon_qi *) __a); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vld1_u16 (const uint16_t * __a) +{ + return (uint16x4_t)__builtin_neon_vld1v4hi ((const __builtin_neon_hi *) __a); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vld1_u32 (const uint32_t * __a) +{ + return (uint32x2_t)__builtin_neon_vld1v2si ((const __builtin_neon_si *) __a); +} + +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vld1_u64 (const uint64_t * __a) +{ + return (uint64x1_t)__builtin_neon_vld1di ((const __builtin_neon_di *) __a); +} + +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +vld1_p8 (const poly8_t * __a) +{ + return (poly8x8_t)__builtin_neon_vld1v8qi ((const __builtin_neon_qi *) __a); +} + +__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) +vld1_p16 (const poly16_t * __a) +{ + return (poly16x4_t)__builtin_neon_vld1v4hi ((const __builtin_neon_hi *) __a); +} + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vld1q_s8 (const int8_t * __a) +{ + return (int8x16_t)__builtin_neon_vld1v16qi ((const __builtin_neon_qi *) __a); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vld1q_s16 (const int16_t * __a) +{ + return (int16x8_t)__builtin_neon_vld1v8hi ((const __builtin_neon_hi *) __a); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vld1q_s32 (const int32_t * __a) +{ + return (int32x4_t)__builtin_neon_vld1v4si ((const __builtin_neon_si *) __a); +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vld1q_s64 (const int64_t * __a) +{ + return (int64x2_t)__builtin_neon_vld1v2di ((const __builtin_neon_di *) __a); +} + +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vld1q_f32 (const float32_t * __a) +{ + return (float32x4_t)__builtin_neon_vld1v4sf ((const __builtin_neon_sf *) __a); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vld1q_u8 (const uint8_t * __a) +{ + return (uint8x16_t)__builtin_neon_vld1v16qi ((const __builtin_neon_qi *) __a); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vld1q_u16 (const uint16_t * __a) +{ + return (uint16x8_t)__builtin_neon_vld1v8hi ((const __builtin_neon_hi *) __a); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vld1q_u32 (const uint32_t * __a) +{ + return (uint32x4_t)__builtin_neon_vld1v4si ((const __builtin_neon_si *) __a); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vld1q_u64 (const uint64_t * __a) +{ + return (uint64x2_t)__builtin_neon_vld1v2di ((const __builtin_neon_di *) __a); +} + +__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +vld1q_p8 (const poly8_t * __a) +{ + return (poly8x16_t)__builtin_neon_vld1v16qi ((const __builtin_neon_qi *) __a); +} + +__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) +vld1q_p16 (const poly16_t * __a) +{ + return (poly16x8_t)__builtin_neon_vld1v8hi ((const __builtin_neon_hi *) __a); +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vld1_lane_s8 (const int8_t * __a, int8x8_t __b, const int __c) +{ + return (int8x8_t)__builtin_neon_vld1_lanev8qi ((const __builtin_neon_qi *) __a, __b, __c); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vld1_lane_s16 (const int16_t * __a, int16x4_t __b, const int __c) +{ + return (int16x4_t)__builtin_neon_vld1_lanev4hi ((const __builtin_neon_hi *) __a, __b, __c); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vld1_lane_s32 (const int32_t * __a, int32x2_t __b, const int __c) +{ + return (int32x2_t)__builtin_neon_vld1_lanev2si ((const __builtin_neon_si *) __a, __b, __c); +} + +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vld1_lane_f32 (const float32_t * __a, float32x2_t __b, const int __c) +{ + return (float32x2_t)__builtin_neon_vld1_lanev2sf ((const __builtin_neon_sf *) __a, __b, __c); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vld1_lane_u8 (const uint8_t * __a, uint8x8_t __b, const int __c) +{ + return (uint8x8_t)__builtin_neon_vld1_lanev8qi ((const __builtin_neon_qi *) __a, (int8x8_t) __b, __c); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vld1_lane_u16 (const uint16_t * __a, uint16x4_t __b, const int __c) +{ + return (uint16x4_t)__builtin_neon_vld1_lanev4hi ((const __builtin_neon_hi *) __a, (int16x4_t) __b, __c); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vld1_lane_u32 (const uint32_t * __a, uint32x2_t __b, const int __c) +{ + return (uint32x2_t)__builtin_neon_vld1_lanev2si ((const __builtin_neon_si *) __a, (int32x2_t) __b, __c); +} + +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +vld1_lane_p8 (const poly8_t * __a, poly8x8_t __b, const int __c) +{ + return (poly8x8_t)__builtin_neon_vld1_lanev8qi ((const __builtin_neon_qi *) __a, (int8x8_t) __b, __c); +} + +__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) +vld1_lane_p16 (const poly16_t * __a, poly16x4_t __b, const int __c) +{ + return (poly16x4_t)__builtin_neon_vld1_lanev4hi ((const __builtin_neon_hi *) __a, (int16x4_t) __b, __c); +} + +__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +vld1_lane_s64 (const int64_t * __a, int64x1_t __b, const int __c) +{ + return (int64x1_t)__builtin_neon_vld1_lanedi ((const __builtin_neon_di *) __a, __b, __c); +} + +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vld1_lane_u64 (const uint64_t * __a, uint64x1_t __b, const int __c) +{ + return (uint64x1_t)__builtin_neon_vld1_lanedi ((const __builtin_neon_di *) __a, (int64x1_t) __b, __c); +} + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vld1q_lane_s8 (const int8_t * __a, int8x16_t __b, const int __c) +{ + return (int8x16_t)__builtin_neon_vld1_lanev16qi ((const __builtin_neon_qi *) __a, __b, __c); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vld1q_lane_s16 (const int16_t * __a, int16x8_t __b, const int __c) +{ + return (int16x8_t)__builtin_neon_vld1_lanev8hi ((const __builtin_neon_hi *) __a, __b, __c); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vld1q_lane_s32 (const int32_t * __a, int32x4_t __b, const int __c) +{ + return (int32x4_t)__builtin_neon_vld1_lanev4si ((const __builtin_neon_si *) __a, __b, __c); +} + +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vld1q_lane_f32 (const float32_t * __a, float32x4_t __b, const int __c) +{ + return (float32x4_t)__builtin_neon_vld1_lanev4sf ((const __builtin_neon_sf *) __a, __b, __c); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vld1q_lane_u8 (const uint8_t * __a, uint8x16_t __b, const int __c) +{ + return (uint8x16_t)__builtin_neon_vld1_lanev16qi ((const __builtin_neon_qi *) __a, (int8x16_t) __b, __c); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vld1q_lane_u16 (const uint16_t * __a, uint16x8_t __b, const int __c) +{ + return (uint16x8_t)__builtin_neon_vld1_lanev8hi ((const __builtin_neon_hi *) __a, (int16x8_t) __b, __c); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vld1q_lane_u32 (const uint32_t * __a, uint32x4_t __b, const int __c) +{ + return (uint32x4_t)__builtin_neon_vld1_lanev4si ((const __builtin_neon_si *) __a, (int32x4_t) __b, __c); +} + +__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +vld1q_lane_p8 (const poly8_t * __a, poly8x16_t __b, const int __c) +{ + return (poly8x16_t)__builtin_neon_vld1_lanev16qi ((const __builtin_neon_qi *) __a, (int8x16_t) __b, __c); +} + +__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) +vld1q_lane_p16 (const poly16_t * __a, poly16x8_t __b, const int __c) +{ + return (poly16x8_t)__builtin_neon_vld1_lanev8hi ((const __builtin_neon_hi *) __a, (int16x8_t) __b, __c); +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vld1q_lane_s64 (const int64_t * __a, int64x2_t __b, const int __c) +{ + return (int64x2_t)__builtin_neon_vld1_lanev2di ((const __builtin_neon_di *) __a, __b, __c); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vld1q_lane_u64 (const uint64_t * __a, uint64x2_t __b, const int __c) +{ + return (uint64x2_t)__builtin_neon_vld1_lanev2di ((const __builtin_neon_di *) __a, (int64x2_t) __b, __c); +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vld1_dup_s8 (const int8_t * __a) +{ + return (int8x8_t)__builtin_neon_vld1_dupv8qi ((const __builtin_neon_qi *) __a); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vld1_dup_s16 (const int16_t * __a) +{ + return (int16x4_t)__builtin_neon_vld1_dupv4hi ((const __builtin_neon_hi *) __a); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vld1_dup_s32 (const int32_t * __a) +{ + return (int32x2_t)__builtin_neon_vld1_dupv2si ((const __builtin_neon_si *) __a); +} + +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vld1_dup_f32 (const float32_t * __a) +{ + return (float32x2_t)__builtin_neon_vld1_dupv2sf ((const __builtin_neon_sf *) __a); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vld1_dup_u8 (const uint8_t * __a) +{ + return (uint8x8_t)__builtin_neon_vld1_dupv8qi ((const __builtin_neon_qi *) __a); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vld1_dup_u16 (const uint16_t * __a) +{ + return (uint16x4_t)__builtin_neon_vld1_dupv4hi ((const __builtin_neon_hi *) __a); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vld1_dup_u32 (const uint32_t * __a) +{ + return (uint32x2_t)__builtin_neon_vld1_dupv2si ((const __builtin_neon_si *) __a); +} + +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +vld1_dup_p8 (const poly8_t * __a) +{ + return (poly8x8_t)__builtin_neon_vld1_dupv8qi ((const __builtin_neon_qi *) __a); +} + +__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) +vld1_dup_p16 (const poly16_t * __a) +{ + return (poly16x4_t)__builtin_neon_vld1_dupv4hi ((const __builtin_neon_hi *) __a); +} + +__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +vld1_dup_s64 (const int64_t * __a) +{ + return (int64x1_t)__builtin_neon_vld1_dupdi ((const __builtin_neon_di *) __a); +} + +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vld1_dup_u64 (const uint64_t * __a) +{ + return (uint64x1_t)__builtin_neon_vld1_dupdi ((const __builtin_neon_di *) __a); +} + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vld1q_dup_s8 (const int8_t * __a) +{ + return (int8x16_t)__builtin_neon_vld1_dupv16qi ((const __builtin_neon_qi *) __a); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vld1q_dup_s16 (const int16_t * __a) +{ + return (int16x8_t)__builtin_neon_vld1_dupv8hi ((const __builtin_neon_hi *) __a); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vld1q_dup_s32 (const int32_t * __a) +{ + return (int32x4_t)__builtin_neon_vld1_dupv4si ((const __builtin_neon_si *) __a); +} + +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vld1q_dup_f32 (const float32_t * __a) +{ + return (float32x4_t)__builtin_neon_vld1_dupv4sf ((const __builtin_neon_sf *) __a); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vld1q_dup_u8 (const uint8_t * __a) +{ + return (uint8x16_t)__builtin_neon_vld1_dupv16qi ((const __builtin_neon_qi *) __a); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vld1q_dup_u16 (const uint16_t * __a) +{ + return (uint16x8_t)__builtin_neon_vld1_dupv8hi ((const __builtin_neon_hi *) __a); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vld1q_dup_u32 (const uint32_t * __a) +{ + return (uint32x4_t)__builtin_neon_vld1_dupv4si ((const __builtin_neon_si *) __a); +} + +__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +vld1q_dup_p8 (const poly8_t * __a) +{ + return (poly8x16_t)__builtin_neon_vld1_dupv16qi ((const __builtin_neon_qi *) __a); +} + +__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) +vld1q_dup_p16 (const poly16_t * __a) +{ + return (poly16x8_t)__builtin_neon_vld1_dupv8hi ((const __builtin_neon_hi *) __a); +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vld1q_dup_s64 (const int64_t * __a) +{ + return (int64x2_t)__builtin_neon_vld1_dupv2di ((const __builtin_neon_di *) __a); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vld1q_dup_u64 (const uint64_t * __a) +{ + return (uint64x2_t)__builtin_neon_vld1_dupv2di ((const __builtin_neon_di *) __a); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1_s8 (int8_t * __a, int8x8_t __b) +{ + __builtin_neon_vst1v8qi ((__builtin_neon_qi *) __a, __b); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1_s16 (int16_t * __a, int16x4_t __b) +{ + __builtin_neon_vst1v4hi ((__builtin_neon_hi *) __a, __b); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1_s32 (int32_t * __a, int32x2_t __b) +{ + __builtin_neon_vst1v2si ((__builtin_neon_si *) __a, __b); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1_s64 (int64_t * __a, int64x1_t __b) +{ + __builtin_neon_vst1di ((__builtin_neon_di *) __a, __b); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1_f32 (float32_t * __a, float32x2_t __b) +{ + __builtin_neon_vst1v2sf ((__builtin_neon_sf *) __a, __b); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1_u8 (uint8_t * __a, uint8x8_t __b) +{ + __builtin_neon_vst1v8qi ((__builtin_neon_qi *) __a, (int8x8_t) __b); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1_u16 (uint16_t * __a, uint16x4_t __b) +{ + __builtin_neon_vst1v4hi ((__builtin_neon_hi *) __a, (int16x4_t) __b); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1_u32 (uint32_t * __a, uint32x2_t __b) +{ + __builtin_neon_vst1v2si ((__builtin_neon_si *) __a, (int32x2_t) __b); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1_u64 (uint64_t * __a, uint64x1_t __b) +{ + __builtin_neon_vst1di ((__builtin_neon_di *) __a, (int64x1_t) __b); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1_p8 (poly8_t * __a, poly8x8_t __b) +{ + __builtin_neon_vst1v8qi ((__builtin_neon_qi *) __a, (int8x8_t) __b); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1_p16 (poly16_t * __a, poly16x4_t __b) +{ + __builtin_neon_vst1v4hi ((__builtin_neon_hi *) __a, (int16x4_t) __b); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1q_s8 (int8_t * __a, int8x16_t __b) +{ + __builtin_neon_vst1v16qi ((__builtin_neon_qi *) __a, __b); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1q_s16 (int16_t * __a, int16x8_t __b) +{ + __builtin_neon_vst1v8hi ((__builtin_neon_hi *) __a, __b); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1q_s32 (int32_t * __a, int32x4_t __b) +{ + __builtin_neon_vst1v4si ((__builtin_neon_si *) __a, __b); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1q_s64 (int64_t * __a, int64x2_t __b) +{ + __builtin_neon_vst1v2di ((__builtin_neon_di *) __a, __b); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1q_f32 (float32_t * __a, float32x4_t __b) +{ + __builtin_neon_vst1v4sf ((__builtin_neon_sf *) __a, __b); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1q_u8 (uint8_t * __a, uint8x16_t __b) +{ + __builtin_neon_vst1v16qi ((__builtin_neon_qi *) __a, (int8x16_t) __b); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1q_u16 (uint16_t * __a, uint16x8_t __b) +{ + __builtin_neon_vst1v8hi ((__builtin_neon_hi *) __a, (int16x8_t) __b); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1q_u32 (uint32_t * __a, uint32x4_t __b) +{ + __builtin_neon_vst1v4si ((__builtin_neon_si *) __a, (int32x4_t) __b); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1q_u64 (uint64_t * __a, uint64x2_t __b) +{ + __builtin_neon_vst1v2di ((__builtin_neon_di *) __a, (int64x2_t) __b); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1q_p8 (poly8_t * __a, poly8x16_t __b) +{ + __builtin_neon_vst1v16qi ((__builtin_neon_qi *) __a, (int8x16_t) __b); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1q_p16 (poly16_t * __a, poly16x8_t __b) +{ + __builtin_neon_vst1v8hi ((__builtin_neon_hi *) __a, (int16x8_t) __b); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1_lane_s8 (int8_t * __a, int8x8_t __b, const int __c) +{ + __builtin_neon_vst1_lanev8qi ((__builtin_neon_qi *) __a, __b, __c); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1_lane_s16 (int16_t * __a, int16x4_t __b, const int __c) +{ + __builtin_neon_vst1_lanev4hi ((__builtin_neon_hi *) __a, __b, __c); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1_lane_s32 (int32_t * __a, int32x2_t __b, const int __c) +{ + __builtin_neon_vst1_lanev2si ((__builtin_neon_si *) __a, __b, __c); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1_lane_f32 (float32_t * __a, float32x2_t __b, const int __c) +{ + __builtin_neon_vst1_lanev2sf ((__builtin_neon_sf *) __a, __b, __c); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1_lane_u8 (uint8_t * __a, uint8x8_t __b, const int __c) +{ + __builtin_neon_vst1_lanev8qi ((__builtin_neon_qi *) __a, (int8x8_t) __b, __c); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1_lane_u16 (uint16_t * __a, uint16x4_t __b, const int __c) +{ + __builtin_neon_vst1_lanev4hi ((__builtin_neon_hi *) __a, (int16x4_t) __b, __c); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1_lane_u32 (uint32_t * __a, uint32x2_t __b, const int __c) +{ + __builtin_neon_vst1_lanev2si ((__builtin_neon_si *) __a, (int32x2_t) __b, __c); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1_lane_p8 (poly8_t * __a, poly8x8_t __b, const int __c) +{ + __builtin_neon_vst1_lanev8qi ((__builtin_neon_qi *) __a, (int8x8_t) __b, __c); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1_lane_p16 (poly16_t * __a, poly16x4_t __b, const int __c) +{ + __builtin_neon_vst1_lanev4hi ((__builtin_neon_hi *) __a, (int16x4_t) __b, __c); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1_lane_s64 (int64_t * __a, int64x1_t __b, const int __c) +{ + __builtin_neon_vst1_lanedi ((__builtin_neon_di *) __a, __b, __c); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1_lane_u64 (uint64_t * __a, uint64x1_t __b, const int __c) +{ + __builtin_neon_vst1_lanedi ((__builtin_neon_di *) __a, (int64x1_t) __b, __c); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1q_lane_s8 (int8_t * __a, int8x16_t __b, const int __c) +{ + __builtin_neon_vst1_lanev16qi ((__builtin_neon_qi *) __a, __b, __c); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1q_lane_s16 (int16_t * __a, int16x8_t __b, const int __c) +{ + __builtin_neon_vst1_lanev8hi ((__builtin_neon_hi *) __a, __b, __c); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1q_lane_s32 (int32_t * __a, int32x4_t __b, const int __c) +{ + __builtin_neon_vst1_lanev4si ((__builtin_neon_si *) __a, __b, __c); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1q_lane_f32 (float32_t * __a, float32x4_t __b, const int __c) +{ + __builtin_neon_vst1_lanev4sf ((__builtin_neon_sf *) __a, __b, __c); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1q_lane_u8 (uint8_t * __a, uint8x16_t __b, const int __c) +{ + __builtin_neon_vst1_lanev16qi ((__builtin_neon_qi *) __a, (int8x16_t) __b, __c); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1q_lane_u16 (uint16_t * __a, uint16x8_t __b, const int __c) +{ + __builtin_neon_vst1_lanev8hi ((__builtin_neon_hi *) __a, (int16x8_t) __b, __c); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1q_lane_u32 (uint32_t * __a, uint32x4_t __b, const int __c) +{ + __builtin_neon_vst1_lanev4si ((__builtin_neon_si *) __a, (int32x4_t) __b, __c); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1q_lane_p8 (poly8_t * __a, poly8x16_t __b, const int __c) +{ + __builtin_neon_vst1_lanev16qi ((__builtin_neon_qi *) __a, (int8x16_t) __b, __c); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1q_lane_p16 (poly16_t * __a, poly16x8_t __b, const int __c) +{ + __builtin_neon_vst1_lanev8hi ((__builtin_neon_hi *) __a, (int16x8_t) __b, __c); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1q_lane_s64 (int64_t * __a, int64x2_t __b, const int __c) +{ + __builtin_neon_vst1_lanev2di ((__builtin_neon_di *) __a, __b, __c); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1q_lane_u64 (uint64_t * __a, uint64x2_t __b, const int __c) +{ + __builtin_neon_vst1_lanev2di ((__builtin_neon_di *) __a, (int64x2_t) __b, __c); +} + +__extension__ static __inline int8x8x2_t __attribute__ ((__always_inline__)) +vld2_s8 (const int8_t * __a) +{ + union { int8x8x2_t __i; __builtin_neon_ti __o; } __rv; + __rv.__o = __builtin_neon_vld2v8qi ((const __builtin_neon_qi *) __a); + return __rv.__i; +} + +__extension__ static __inline int16x4x2_t __attribute__ ((__always_inline__)) +vld2_s16 (const int16_t * __a) +{ + union { int16x4x2_t __i; __builtin_neon_ti __o; } __rv; + __rv.__o = __builtin_neon_vld2v4hi ((const __builtin_neon_hi *) __a); + return __rv.__i; +} + +__extension__ static __inline int32x2x2_t __attribute__ ((__always_inline__)) +vld2_s32 (const int32_t * __a) +{ + union { int32x2x2_t __i; __builtin_neon_ti __o; } __rv; + __rv.__o = __builtin_neon_vld2v2si ((const __builtin_neon_si *) __a); + return __rv.__i; +} + +__extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__)) +vld2_f32 (const float32_t * __a) +{ + union { float32x2x2_t __i; __builtin_neon_ti __o; } __rv; + __rv.__o = __builtin_neon_vld2v2sf ((const __builtin_neon_sf *) __a); + return __rv.__i; +} + +__extension__ static __inline uint8x8x2_t __attribute__ ((__always_inline__)) +vld2_u8 (const uint8_t * __a) +{ + union { uint8x8x2_t __i; __builtin_neon_ti __o; } __rv; + __rv.__o = __builtin_neon_vld2v8qi ((const __builtin_neon_qi *) __a); + return __rv.__i; +} + +__extension__ static __inline uint16x4x2_t __attribute__ ((__always_inline__)) +vld2_u16 (const uint16_t * __a) +{ + union { uint16x4x2_t __i; __builtin_neon_ti __o; } __rv; + __rv.__o = __builtin_neon_vld2v4hi ((const __builtin_neon_hi *) __a); + return __rv.__i; +} + +__extension__ static __inline uint32x2x2_t __attribute__ ((__always_inline__)) +vld2_u32 (const uint32_t * __a) +{ + union { uint32x2x2_t __i; __builtin_neon_ti __o; } __rv; + __rv.__o = __builtin_neon_vld2v2si ((const __builtin_neon_si *) __a); + return __rv.__i; +} + +__extension__ static __inline poly8x8x2_t __attribute__ ((__always_inline__)) +vld2_p8 (const poly8_t * __a) +{ + union { poly8x8x2_t __i; __builtin_neon_ti __o; } __rv; + __rv.__o = __builtin_neon_vld2v8qi ((const __builtin_neon_qi *) __a); + return __rv.__i; +} + +__extension__ static __inline poly16x4x2_t __attribute__ ((__always_inline__)) +vld2_p16 (const poly16_t * __a) +{ + union { poly16x4x2_t __i; __builtin_neon_ti __o; } __rv; + __rv.__o = __builtin_neon_vld2v4hi ((const __builtin_neon_hi *) __a); + return __rv.__i; +} + +__extension__ static __inline int64x1x2_t __attribute__ ((__always_inline__)) +vld2_s64 (const int64_t * __a) +{ + union { int64x1x2_t __i; __builtin_neon_ti __o; } __rv; + __rv.__o = __builtin_neon_vld2di ((const __builtin_neon_di *) __a); + return __rv.__i; +} + +__extension__ static __inline uint64x1x2_t __attribute__ ((__always_inline__)) +vld2_u64 (const uint64_t * __a) +{ + union { uint64x1x2_t __i; __builtin_neon_ti __o; } __rv; + __rv.__o = __builtin_neon_vld2di ((const __builtin_neon_di *) __a); + return __rv.__i; +} + +__extension__ static __inline int8x16x2_t __attribute__ ((__always_inline__)) +vld2q_s8 (const int8_t * __a) +{ + union { int8x16x2_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vld2v16qi ((const __builtin_neon_qi *) __a); + return __rv.__i; +} + +__extension__ static __inline int16x8x2_t __attribute__ ((__always_inline__)) +vld2q_s16 (const int16_t * __a) +{ + union { int16x8x2_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vld2v8hi ((const __builtin_neon_hi *) __a); + return __rv.__i; +} + +__extension__ static __inline int32x4x2_t __attribute__ ((__always_inline__)) +vld2q_s32 (const int32_t * __a) +{ + union { int32x4x2_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vld2v4si ((const __builtin_neon_si *) __a); + return __rv.__i; +} + +__extension__ static __inline float32x4x2_t __attribute__ ((__always_inline__)) +vld2q_f32 (const float32_t * __a) +{ + union { float32x4x2_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vld2v4sf ((const __builtin_neon_sf *) __a); + return __rv.__i; +} + +__extension__ static __inline uint8x16x2_t __attribute__ ((__always_inline__)) +vld2q_u8 (const uint8_t * __a) +{ + union { uint8x16x2_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vld2v16qi ((const __builtin_neon_qi *) __a); + return __rv.__i; +} + +__extension__ static __inline uint16x8x2_t __attribute__ ((__always_inline__)) +vld2q_u16 (const uint16_t * __a) +{ + union { uint16x8x2_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vld2v8hi ((const __builtin_neon_hi *) __a); + return __rv.__i; +} + +__extension__ static __inline uint32x4x2_t __attribute__ ((__always_inline__)) +vld2q_u32 (const uint32_t * __a) +{ + union { uint32x4x2_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vld2v4si ((const __builtin_neon_si *) __a); + return __rv.__i; +} + +__extension__ static __inline poly8x16x2_t __attribute__ ((__always_inline__)) +vld2q_p8 (const poly8_t * __a) +{ + union { poly8x16x2_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vld2v16qi ((const __builtin_neon_qi *) __a); + return __rv.__i; +} + +__extension__ static __inline poly16x8x2_t __attribute__ ((__always_inline__)) +vld2q_p16 (const poly16_t * __a) +{ + union { poly16x8x2_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vld2v8hi ((const __builtin_neon_hi *) __a); + return __rv.__i; +} + +__extension__ static __inline int8x8x2_t __attribute__ ((__always_inline__)) +vld2_lane_s8 (const int8_t * __a, int8x8x2_t __b, const int __c) +{ + union { int8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b }; + union { int8x8x2_t __i; __builtin_neon_ti __o; } __rv; + __rv.__o = __builtin_neon_vld2_lanev8qi ((const __builtin_neon_qi *) __a, __bu.__o, __c); + return __rv.__i; +} + +__extension__ static __inline int16x4x2_t __attribute__ ((__always_inline__)) +vld2_lane_s16 (const int16_t * __a, int16x4x2_t __b, const int __c) +{ + union { int16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __b }; + union { int16x4x2_t __i; __builtin_neon_ti __o; } __rv; + __rv.__o = __builtin_neon_vld2_lanev4hi ((const __builtin_neon_hi *) __a, __bu.__o, __c); + return __rv.__i; +} + +__extension__ static __inline int32x2x2_t __attribute__ ((__always_inline__)) +vld2_lane_s32 (const int32_t * __a, int32x2x2_t __b, const int __c) +{ + union { int32x2x2_t __i; __builtin_neon_ti __o; } __bu = { __b }; + union { int32x2x2_t __i; __builtin_neon_ti __o; } __rv; + __rv.__o = __builtin_neon_vld2_lanev2si ((const __builtin_neon_si *) __a, __bu.__o, __c); + return __rv.__i; +} + +__extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__)) +vld2_lane_f32 (const float32_t * __a, float32x2x2_t __b, const int __c) +{ + union { float32x2x2_t __i; __builtin_neon_ti __o; } __bu = { __b }; + union { float32x2x2_t __i; __builtin_neon_ti __o; } __rv; + __rv.__o = __builtin_neon_vld2_lanev2sf ((const __builtin_neon_sf *) __a, __bu.__o, __c); + return __rv.__i; +} + +__extension__ static __inline uint8x8x2_t __attribute__ ((__always_inline__)) +vld2_lane_u8 (const uint8_t * __a, uint8x8x2_t __b, const int __c) +{ + union { uint8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b }; + union { uint8x8x2_t __i; __builtin_neon_ti __o; } __rv; + __rv.__o = __builtin_neon_vld2_lanev8qi ((const __builtin_neon_qi *) __a, __bu.__o, __c); + return __rv.__i; +} + +__extension__ static __inline uint16x4x2_t __attribute__ ((__always_inline__)) +vld2_lane_u16 (const uint16_t * __a, uint16x4x2_t __b, const int __c) +{ + union { uint16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __b }; + union { uint16x4x2_t __i; __builtin_neon_ti __o; } __rv; + __rv.__o = __builtin_neon_vld2_lanev4hi ((const __builtin_neon_hi *) __a, __bu.__o, __c); + return __rv.__i; +} + +__extension__ static __inline uint32x2x2_t __attribute__ ((__always_inline__)) +vld2_lane_u32 (const uint32_t * __a, uint32x2x2_t __b, const int __c) +{ + union { uint32x2x2_t __i; __builtin_neon_ti __o; } __bu = { __b }; + union { uint32x2x2_t __i; __builtin_neon_ti __o; } __rv; + __rv.__o = __builtin_neon_vld2_lanev2si ((const __builtin_neon_si *) __a, __bu.__o, __c); + return __rv.__i; +} + +__extension__ static __inline poly8x8x2_t __attribute__ ((__always_inline__)) +vld2_lane_p8 (const poly8_t * __a, poly8x8x2_t __b, const int __c) +{ + union { poly8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b }; + union { poly8x8x2_t __i; __builtin_neon_ti __o; } __rv; + __rv.__o = __builtin_neon_vld2_lanev8qi ((const __builtin_neon_qi *) __a, __bu.__o, __c); + return __rv.__i; +} + +__extension__ static __inline poly16x4x2_t __attribute__ ((__always_inline__)) +vld2_lane_p16 (const poly16_t * __a, poly16x4x2_t __b, const int __c) +{ + union { poly16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __b }; + union { poly16x4x2_t __i; __builtin_neon_ti __o; } __rv; + __rv.__o = __builtin_neon_vld2_lanev4hi ((const __builtin_neon_hi *) __a, __bu.__o, __c); + return __rv.__i; +} + +__extension__ static __inline int16x8x2_t __attribute__ ((__always_inline__)) +vld2q_lane_s16 (const int16_t * __a, int16x8x2_t __b, const int __c) +{ + union { int16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b }; + union { int16x8x2_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vld2_lanev8hi ((const __builtin_neon_hi *) __a, __bu.__o, __c); + return __rv.__i; +} + +__extension__ static __inline int32x4x2_t __attribute__ ((__always_inline__)) +vld2q_lane_s32 (const int32_t * __a, int32x4x2_t __b, const int __c) +{ + union { int32x4x2_t __i; __builtin_neon_oi __o; } __bu = { __b }; + union { int32x4x2_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vld2_lanev4si ((const __builtin_neon_si *) __a, __bu.__o, __c); + return __rv.__i; +} + +__extension__ static __inline float32x4x2_t __attribute__ ((__always_inline__)) +vld2q_lane_f32 (const float32_t * __a, float32x4x2_t __b, const int __c) +{ + union { float32x4x2_t __i; __builtin_neon_oi __o; } __bu = { __b }; + union { float32x4x2_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vld2_lanev4sf ((const __builtin_neon_sf *) __a, __bu.__o, __c); + return __rv.__i; +} + +__extension__ static __inline uint16x8x2_t __attribute__ ((__always_inline__)) +vld2q_lane_u16 (const uint16_t * __a, uint16x8x2_t __b, const int __c) +{ + union { uint16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b }; + union { uint16x8x2_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vld2_lanev8hi ((const __builtin_neon_hi *) __a, __bu.__o, __c); + return __rv.__i; +} + +__extension__ static __inline uint32x4x2_t __attribute__ ((__always_inline__)) +vld2q_lane_u32 (const uint32_t * __a, uint32x4x2_t __b, const int __c) +{ + union { uint32x4x2_t __i; __builtin_neon_oi __o; } __bu = { __b }; + union { uint32x4x2_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vld2_lanev4si ((const __builtin_neon_si *) __a, __bu.__o, __c); + return __rv.__i; +} + +__extension__ static __inline poly16x8x2_t __attribute__ ((__always_inline__)) +vld2q_lane_p16 (const poly16_t * __a, poly16x8x2_t __b, const int __c) +{ + union { poly16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b }; + union { poly16x8x2_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vld2_lanev8hi ((const __builtin_neon_hi *) __a, __bu.__o, __c); + return __rv.__i; +} + +__extension__ static __inline int8x8x2_t __attribute__ ((__always_inline__)) +vld2_dup_s8 (const int8_t * __a) +{ + union { int8x8x2_t __i; __builtin_neon_ti __o; } __rv; + __rv.__o = __builtin_neon_vld2_dupv8qi ((const __builtin_neon_qi *) __a); + return __rv.__i; +} + +__extension__ static __inline int16x4x2_t __attribute__ ((__always_inline__)) +vld2_dup_s16 (const int16_t * __a) +{ + union { int16x4x2_t __i; __builtin_neon_ti __o; } __rv; + __rv.__o = __builtin_neon_vld2_dupv4hi ((const __builtin_neon_hi *) __a); + return __rv.__i; +} + +__extension__ static __inline int32x2x2_t __attribute__ ((__always_inline__)) +vld2_dup_s32 (const int32_t * __a) +{ + union { int32x2x2_t __i; __builtin_neon_ti __o; } __rv; + __rv.__o = __builtin_neon_vld2_dupv2si ((const __builtin_neon_si *) __a); + return __rv.__i; +} + +__extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__)) +vld2_dup_f32 (const float32_t * __a) +{ + union { float32x2x2_t __i; __builtin_neon_ti __o; } __rv; + __rv.__o = __builtin_neon_vld2_dupv2sf ((const __builtin_neon_sf *) __a); + return __rv.__i; +} + +__extension__ static __inline uint8x8x2_t __attribute__ ((__always_inline__)) +vld2_dup_u8 (const uint8_t * __a) +{ + union { uint8x8x2_t __i; __builtin_neon_ti __o; } __rv; + __rv.__o = __builtin_neon_vld2_dupv8qi ((const __builtin_neon_qi *) __a); + return __rv.__i; +} + +__extension__ static __inline uint16x4x2_t __attribute__ ((__always_inline__)) +vld2_dup_u16 (const uint16_t * __a) +{ + union { uint16x4x2_t __i; __builtin_neon_ti __o; } __rv; + __rv.__o = __builtin_neon_vld2_dupv4hi ((const __builtin_neon_hi *) __a); + return __rv.__i; +} + +__extension__ static __inline uint32x2x2_t __attribute__ ((__always_inline__)) +vld2_dup_u32 (const uint32_t * __a) +{ + union { uint32x2x2_t __i; __builtin_neon_ti __o; } __rv; + __rv.__o = __builtin_neon_vld2_dupv2si ((const __builtin_neon_si *) __a); + return __rv.__i; +} + +__extension__ static __inline poly8x8x2_t __attribute__ ((__always_inline__)) +vld2_dup_p8 (const poly8_t * __a) +{ + union { poly8x8x2_t __i; __builtin_neon_ti __o; } __rv; + __rv.__o = __builtin_neon_vld2_dupv8qi ((const __builtin_neon_qi *) __a); + return __rv.__i; +} + +__extension__ static __inline poly16x4x2_t __attribute__ ((__always_inline__)) +vld2_dup_p16 (const poly16_t * __a) +{ + union { poly16x4x2_t __i; __builtin_neon_ti __o; } __rv; + __rv.__o = __builtin_neon_vld2_dupv4hi ((const __builtin_neon_hi *) __a); + return __rv.__i; +} + +__extension__ static __inline int64x1x2_t __attribute__ ((__always_inline__)) +vld2_dup_s64 (const int64_t * __a) +{ + union { int64x1x2_t __i; __builtin_neon_ti __o; } __rv; + __rv.__o = __builtin_neon_vld2_dupdi ((const __builtin_neon_di *) __a); + return __rv.__i; +} + +__extension__ static __inline uint64x1x2_t __attribute__ ((__always_inline__)) +vld2_dup_u64 (const uint64_t * __a) +{ + union { uint64x1x2_t __i; __builtin_neon_ti __o; } __rv; + __rv.__o = __builtin_neon_vld2_dupdi ((const __builtin_neon_di *) __a); + return __rv.__i; +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst2_s8 (int8_t * __a, int8x8x2_t __b) +{ + union { int8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b }; + __builtin_neon_vst2v8qi ((__builtin_neon_qi *) __a, __bu.__o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst2_s16 (int16_t * __a, int16x4x2_t __b) +{ + union { int16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __b }; + __builtin_neon_vst2v4hi ((__builtin_neon_hi *) __a, __bu.__o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst2_s32 (int32_t * __a, int32x2x2_t __b) +{ + union { int32x2x2_t __i; __builtin_neon_ti __o; } __bu = { __b }; + __builtin_neon_vst2v2si ((__builtin_neon_si *) __a, __bu.__o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst2_f32 (float32_t * __a, float32x2x2_t __b) +{ + union { float32x2x2_t __i; __builtin_neon_ti __o; } __bu = { __b }; + __builtin_neon_vst2v2sf ((__builtin_neon_sf *) __a, __bu.__o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst2_u8 (uint8_t * __a, uint8x8x2_t __b) +{ + union { uint8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b }; + __builtin_neon_vst2v8qi ((__builtin_neon_qi *) __a, __bu.__o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst2_u16 (uint16_t * __a, uint16x4x2_t __b) +{ + union { uint16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __b }; + __builtin_neon_vst2v4hi ((__builtin_neon_hi *) __a, __bu.__o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst2_u32 (uint32_t * __a, uint32x2x2_t __b) +{ + union { uint32x2x2_t __i; __builtin_neon_ti __o; } __bu = { __b }; + __builtin_neon_vst2v2si ((__builtin_neon_si *) __a, __bu.__o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst2_p8 (poly8_t * __a, poly8x8x2_t __b) +{ + union { poly8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b }; + __builtin_neon_vst2v8qi ((__builtin_neon_qi *) __a, __bu.__o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst2_p16 (poly16_t * __a, poly16x4x2_t __b) +{ + union { poly16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __b }; + __builtin_neon_vst2v4hi ((__builtin_neon_hi *) __a, __bu.__o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst2_s64 (int64_t * __a, int64x1x2_t __b) +{ + union { int64x1x2_t __i; __builtin_neon_ti __o; } __bu = { __b }; + __builtin_neon_vst2di ((__builtin_neon_di *) __a, __bu.__o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst2_u64 (uint64_t * __a, uint64x1x2_t __b) +{ + union { uint64x1x2_t __i; __builtin_neon_ti __o; } __bu = { __b }; + __builtin_neon_vst2di ((__builtin_neon_di *) __a, __bu.__o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst2q_s8 (int8_t * __a, int8x16x2_t __b) +{ + union { int8x16x2_t __i; __builtin_neon_oi __o; } __bu = { __b }; + __builtin_neon_vst2v16qi ((__builtin_neon_qi *) __a, __bu.__o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst2q_s16 (int16_t * __a, int16x8x2_t __b) +{ + union { int16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b }; + __builtin_neon_vst2v8hi ((__builtin_neon_hi *) __a, __bu.__o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst2q_s32 (int32_t * __a, int32x4x2_t __b) +{ + union { int32x4x2_t __i; __builtin_neon_oi __o; } __bu = { __b }; + __builtin_neon_vst2v4si ((__builtin_neon_si *) __a, __bu.__o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst2q_f32 (float32_t * __a, float32x4x2_t __b) +{ + union { float32x4x2_t __i; __builtin_neon_oi __o; } __bu = { __b }; + __builtin_neon_vst2v4sf ((__builtin_neon_sf *) __a, __bu.__o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst2q_u8 (uint8_t * __a, uint8x16x2_t __b) +{ + union { uint8x16x2_t __i; __builtin_neon_oi __o; } __bu = { __b }; + __builtin_neon_vst2v16qi ((__builtin_neon_qi *) __a, __bu.__o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst2q_u16 (uint16_t * __a, uint16x8x2_t __b) +{ + union { uint16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b }; + __builtin_neon_vst2v8hi ((__builtin_neon_hi *) __a, __bu.__o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst2q_u32 (uint32_t * __a, uint32x4x2_t __b) +{ + union { uint32x4x2_t __i; __builtin_neon_oi __o; } __bu = { __b }; + __builtin_neon_vst2v4si ((__builtin_neon_si *) __a, __bu.__o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst2q_p8 (poly8_t * __a, poly8x16x2_t __b) +{ + union { poly8x16x2_t __i; __builtin_neon_oi __o; } __bu = { __b }; + __builtin_neon_vst2v16qi ((__builtin_neon_qi *) __a, __bu.__o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst2q_p16 (poly16_t * __a, poly16x8x2_t __b) +{ + union { poly16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b }; + __builtin_neon_vst2v8hi ((__builtin_neon_hi *) __a, __bu.__o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst2_lane_s8 (int8_t * __a, int8x8x2_t __b, const int __c) +{ + union { int8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b }; + __builtin_neon_vst2_lanev8qi ((__builtin_neon_qi *) __a, __bu.__o, __c); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst2_lane_s16 (int16_t * __a, int16x4x2_t __b, const int __c) +{ + union { int16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __b }; + __builtin_neon_vst2_lanev4hi ((__builtin_neon_hi *) __a, __bu.__o, __c); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst2_lane_s32 (int32_t * __a, int32x2x2_t __b, const int __c) +{ + union { int32x2x2_t __i; __builtin_neon_ti __o; } __bu = { __b }; + __builtin_neon_vst2_lanev2si ((__builtin_neon_si *) __a, __bu.__o, __c); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst2_lane_f32 (float32_t * __a, float32x2x2_t __b, const int __c) +{ + union { float32x2x2_t __i; __builtin_neon_ti __o; } __bu = { __b }; + __builtin_neon_vst2_lanev2sf ((__builtin_neon_sf *) __a, __bu.__o, __c); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst2_lane_u8 (uint8_t * __a, uint8x8x2_t __b, const int __c) +{ + union { uint8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b }; + __builtin_neon_vst2_lanev8qi ((__builtin_neon_qi *) __a, __bu.__o, __c); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst2_lane_u16 (uint16_t * __a, uint16x4x2_t __b, const int __c) +{ + union { uint16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __b }; + __builtin_neon_vst2_lanev4hi ((__builtin_neon_hi *) __a, __bu.__o, __c); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst2_lane_u32 (uint32_t * __a, uint32x2x2_t __b, const int __c) +{ + union { uint32x2x2_t __i; __builtin_neon_ti __o; } __bu = { __b }; + __builtin_neon_vst2_lanev2si ((__builtin_neon_si *) __a, __bu.__o, __c); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst2_lane_p8 (poly8_t * __a, poly8x8x2_t __b, const int __c) +{ + union { poly8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b }; + __builtin_neon_vst2_lanev8qi ((__builtin_neon_qi *) __a, __bu.__o, __c); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst2_lane_p16 (poly16_t * __a, poly16x4x2_t __b, const int __c) +{ + union { poly16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __b }; + __builtin_neon_vst2_lanev4hi ((__builtin_neon_hi *) __a, __bu.__o, __c); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst2q_lane_s16 (int16_t * __a, int16x8x2_t __b, const int __c) +{ + union { int16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b }; + __builtin_neon_vst2_lanev8hi ((__builtin_neon_hi *) __a, __bu.__o, __c); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst2q_lane_s32 (int32_t * __a, int32x4x2_t __b, const int __c) +{ + union { int32x4x2_t __i; __builtin_neon_oi __o; } __bu = { __b }; + __builtin_neon_vst2_lanev4si ((__builtin_neon_si *) __a, __bu.__o, __c); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst2q_lane_f32 (float32_t * __a, float32x4x2_t __b, const int __c) +{ + union { float32x4x2_t __i; __builtin_neon_oi __o; } __bu = { __b }; + __builtin_neon_vst2_lanev4sf ((__builtin_neon_sf *) __a, __bu.__o, __c); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst2q_lane_u16 (uint16_t * __a, uint16x8x2_t __b, const int __c) +{ + union { uint16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b }; + __builtin_neon_vst2_lanev8hi ((__builtin_neon_hi *) __a, __bu.__o, __c); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst2q_lane_u32 (uint32_t * __a, uint32x4x2_t __b, const int __c) +{ + union { uint32x4x2_t __i; __builtin_neon_oi __o; } __bu = { __b }; + __builtin_neon_vst2_lanev4si ((__builtin_neon_si *) __a, __bu.__o, __c); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst2q_lane_p16 (poly16_t * __a, poly16x8x2_t __b, const int __c) +{ + union { poly16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b }; + __builtin_neon_vst2_lanev8hi ((__builtin_neon_hi *) __a, __bu.__o, __c); +} + +__extension__ static __inline int8x8x3_t __attribute__ ((__always_inline__)) +vld3_s8 (const int8_t * __a) +{ + union { int8x8x3_t __i; __builtin_neon_ei __o; } __rv; + __rv.__o = __builtin_neon_vld3v8qi ((const __builtin_neon_qi *) __a); + return __rv.__i; +} + +__extension__ static __inline int16x4x3_t __attribute__ ((__always_inline__)) +vld3_s16 (const int16_t * __a) +{ + union { int16x4x3_t __i; __builtin_neon_ei __o; } __rv; + __rv.__o = __builtin_neon_vld3v4hi ((const __builtin_neon_hi *) __a); + return __rv.__i; +} + +__extension__ static __inline int32x2x3_t __attribute__ ((__always_inline__)) +vld3_s32 (const int32_t * __a) +{ + union { int32x2x3_t __i; __builtin_neon_ei __o; } __rv; + __rv.__o = __builtin_neon_vld3v2si ((const __builtin_neon_si *) __a); + return __rv.__i; +} + +__extension__ static __inline float32x2x3_t __attribute__ ((__always_inline__)) +vld3_f32 (const float32_t * __a) +{ + union { float32x2x3_t __i; __builtin_neon_ei __o; } __rv; + __rv.__o = __builtin_neon_vld3v2sf ((const __builtin_neon_sf *) __a); + return __rv.__i; +} + +__extension__ static __inline uint8x8x3_t __attribute__ ((__always_inline__)) +vld3_u8 (const uint8_t * __a) +{ + union { uint8x8x3_t __i; __builtin_neon_ei __o; } __rv; + __rv.__o = __builtin_neon_vld3v8qi ((const __builtin_neon_qi *) __a); + return __rv.__i; +} + +__extension__ static __inline uint16x4x3_t __attribute__ ((__always_inline__)) +vld3_u16 (const uint16_t * __a) +{ + union { uint16x4x3_t __i; __builtin_neon_ei __o; } __rv; + __rv.__o = __builtin_neon_vld3v4hi ((const __builtin_neon_hi *) __a); + return __rv.__i; +} + +__extension__ static __inline uint32x2x3_t __attribute__ ((__always_inline__)) +vld3_u32 (const uint32_t * __a) +{ + union { uint32x2x3_t __i; __builtin_neon_ei __o; } __rv; + __rv.__o = __builtin_neon_vld3v2si ((const __builtin_neon_si *) __a); + return __rv.__i; +} + +__extension__ static __inline poly8x8x3_t __attribute__ ((__always_inline__)) +vld3_p8 (const poly8_t * __a) +{ + union { poly8x8x3_t __i; __builtin_neon_ei __o; } __rv; + __rv.__o = __builtin_neon_vld3v8qi ((const __builtin_neon_qi *) __a); + return __rv.__i; +} + +__extension__ static __inline poly16x4x3_t __attribute__ ((__always_inline__)) +vld3_p16 (const poly16_t * __a) +{ + union { poly16x4x3_t __i; __builtin_neon_ei __o; } __rv; + __rv.__o = __builtin_neon_vld3v4hi ((const __builtin_neon_hi *) __a); + return __rv.__i; +} + +__extension__ static __inline int64x1x3_t __attribute__ ((__always_inline__)) +vld3_s64 (const int64_t * __a) +{ + union { int64x1x3_t __i; __builtin_neon_ei __o; } __rv; + __rv.__o = __builtin_neon_vld3di ((const __builtin_neon_di *) __a); + return __rv.__i; +} + +__extension__ static __inline uint64x1x3_t __attribute__ ((__always_inline__)) +vld3_u64 (const uint64_t * __a) +{ + union { uint64x1x3_t __i; __builtin_neon_ei __o; } __rv; + __rv.__o = __builtin_neon_vld3di ((const __builtin_neon_di *) __a); + return __rv.__i; +} + +__extension__ static __inline int8x16x3_t __attribute__ ((__always_inline__)) +vld3q_s8 (const int8_t * __a) +{ + union { int8x16x3_t __i; __builtin_neon_ci __o; } __rv; + __rv.__o = __builtin_neon_vld3v16qi ((const __builtin_neon_qi *) __a); + return __rv.__i; +} + +__extension__ static __inline int16x8x3_t __attribute__ ((__always_inline__)) +vld3q_s16 (const int16_t * __a) +{ + union { int16x8x3_t __i; __builtin_neon_ci __o; } __rv; + __rv.__o = __builtin_neon_vld3v8hi ((const __builtin_neon_hi *) __a); + return __rv.__i; +} + +__extension__ static __inline int32x4x3_t __attribute__ ((__always_inline__)) +vld3q_s32 (const int32_t * __a) +{ + union { int32x4x3_t __i; __builtin_neon_ci __o; } __rv; + __rv.__o = __builtin_neon_vld3v4si ((const __builtin_neon_si *) __a); + return __rv.__i; +} + +__extension__ static __inline float32x4x3_t __attribute__ ((__always_inline__)) +vld3q_f32 (const float32_t * __a) +{ + union { float32x4x3_t __i; __builtin_neon_ci __o; } __rv; + __rv.__o = __builtin_neon_vld3v4sf ((const __builtin_neon_sf *) __a); + return __rv.__i; +} + +__extension__ static __inline uint8x16x3_t __attribute__ ((__always_inline__)) +vld3q_u8 (const uint8_t * __a) +{ + union { uint8x16x3_t __i; __builtin_neon_ci __o; } __rv; + __rv.__o = __builtin_neon_vld3v16qi ((const __builtin_neon_qi *) __a); + return __rv.__i; +} + +__extension__ static __inline uint16x8x3_t __attribute__ ((__always_inline__)) +vld3q_u16 (const uint16_t * __a) +{ + union { uint16x8x3_t __i; __builtin_neon_ci __o; } __rv; + __rv.__o = __builtin_neon_vld3v8hi ((const __builtin_neon_hi *) __a); + return __rv.__i; +} + +__extension__ static __inline uint32x4x3_t __attribute__ ((__always_inline__)) +vld3q_u32 (const uint32_t * __a) +{ + union { uint32x4x3_t __i; __builtin_neon_ci __o; } __rv; + __rv.__o = __builtin_neon_vld3v4si ((const __builtin_neon_si *) __a); + return __rv.__i; +} + +__extension__ static __inline poly8x16x3_t __attribute__ ((__always_inline__)) +vld3q_p8 (const poly8_t * __a) +{ + union { poly8x16x3_t __i; __builtin_neon_ci __o; } __rv; + __rv.__o = __builtin_neon_vld3v16qi ((const __builtin_neon_qi *) __a); + return __rv.__i; +} + +__extension__ static __inline poly16x8x3_t __attribute__ ((__always_inline__)) +vld3q_p16 (const poly16_t * __a) +{ + union { poly16x8x3_t __i; __builtin_neon_ci __o; } __rv; + __rv.__o = __builtin_neon_vld3v8hi ((const __builtin_neon_hi *) __a); + return __rv.__i; +} + +__extension__ static __inline int8x8x3_t __attribute__ ((__always_inline__)) +vld3_lane_s8 (const int8_t * __a, int8x8x3_t __b, const int __c) +{ + union { int8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b }; + union { int8x8x3_t __i; __builtin_neon_ei __o; } __rv; + __rv.__o = __builtin_neon_vld3_lanev8qi ((const __builtin_neon_qi *) __a, __bu.__o, __c); + return __rv.__i; +} + +__extension__ static __inline int16x4x3_t __attribute__ ((__always_inline__)) +vld3_lane_s16 (const int16_t * __a, int16x4x3_t __b, const int __c) +{ + union { int16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __b }; + union { int16x4x3_t __i; __builtin_neon_ei __o; } __rv; + __rv.__o = __builtin_neon_vld3_lanev4hi ((const __builtin_neon_hi *) __a, __bu.__o, __c); + return __rv.__i; +} + +__extension__ static __inline int32x2x3_t __attribute__ ((__always_inline__)) +vld3_lane_s32 (const int32_t * __a, int32x2x3_t __b, const int __c) +{ + union { int32x2x3_t __i; __builtin_neon_ei __o; } __bu = { __b }; + union { int32x2x3_t __i; __builtin_neon_ei __o; } __rv; + __rv.__o = __builtin_neon_vld3_lanev2si ((const __builtin_neon_si *) __a, __bu.__o, __c); + return __rv.__i; +} + +__extension__ static __inline float32x2x3_t __attribute__ ((__always_inline__)) +vld3_lane_f32 (const float32_t * __a, float32x2x3_t __b, const int __c) +{ + union { float32x2x3_t __i; __builtin_neon_ei __o; } __bu = { __b }; + union { float32x2x3_t __i; __builtin_neon_ei __o; } __rv; + __rv.__o = __builtin_neon_vld3_lanev2sf ((const __builtin_neon_sf *) __a, __bu.__o, __c); + return __rv.__i; +} + +__extension__ static __inline uint8x8x3_t __attribute__ ((__always_inline__)) +vld3_lane_u8 (const uint8_t * __a, uint8x8x3_t __b, const int __c) +{ + union { uint8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b }; + union { uint8x8x3_t __i; __builtin_neon_ei __o; } __rv; + __rv.__o = __builtin_neon_vld3_lanev8qi ((const __builtin_neon_qi *) __a, __bu.__o, __c); + return __rv.__i; +} + +__extension__ static __inline uint16x4x3_t __attribute__ ((__always_inline__)) +vld3_lane_u16 (const uint16_t * __a, uint16x4x3_t __b, const int __c) +{ + union { uint16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __b }; + union { uint16x4x3_t __i; __builtin_neon_ei __o; } __rv; + __rv.__o = __builtin_neon_vld3_lanev4hi ((const __builtin_neon_hi *) __a, __bu.__o, __c); + return __rv.__i; +} + +__extension__ static __inline uint32x2x3_t __attribute__ ((__always_inline__)) +vld3_lane_u32 (const uint32_t * __a, uint32x2x3_t __b, const int __c) +{ + union { uint32x2x3_t __i; __builtin_neon_ei __o; } __bu = { __b }; + union { uint32x2x3_t __i; __builtin_neon_ei __o; } __rv; + __rv.__o = __builtin_neon_vld3_lanev2si ((const __builtin_neon_si *) __a, __bu.__o, __c); + return __rv.__i; +} + +__extension__ static __inline poly8x8x3_t __attribute__ ((__always_inline__)) +vld3_lane_p8 (const poly8_t * __a, poly8x8x3_t __b, const int __c) +{ + union { poly8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b }; + union { poly8x8x3_t __i; __builtin_neon_ei __o; } __rv; + __rv.__o = __builtin_neon_vld3_lanev8qi ((const __builtin_neon_qi *) __a, __bu.__o, __c); + return __rv.__i; +} + +__extension__ static __inline poly16x4x3_t __attribute__ ((__always_inline__)) +vld3_lane_p16 (const poly16_t * __a, poly16x4x3_t __b, const int __c) +{ + union { poly16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __b }; + union { poly16x4x3_t __i; __builtin_neon_ei __o; } __rv; + __rv.__o = __builtin_neon_vld3_lanev4hi ((const __builtin_neon_hi *) __a, __bu.__o, __c); + return __rv.__i; +} + +__extension__ static __inline int16x8x3_t __attribute__ ((__always_inline__)) +vld3q_lane_s16 (const int16_t * __a, int16x8x3_t __b, const int __c) +{ + union { int16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b }; + union { int16x8x3_t __i; __builtin_neon_ci __o; } __rv; + __rv.__o = __builtin_neon_vld3_lanev8hi ((const __builtin_neon_hi *) __a, __bu.__o, __c); + return __rv.__i; +} + +__extension__ static __inline int32x4x3_t __attribute__ ((__always_inline__)) +vld3q_lane_s32 (const int32_t * __a, int32x4x3_t __b, const int __c) +{ + union { int32x4x3_t __i; __builtin_neon_ci __o; } __bu = { __b }; + union { int32x4x3_t __i; __builtin_neon_ci __o; } __rv; + __rv.__o = __builtin_neon_vld3_lanev4si ((const __builtin_neon_si *) __a, __bu.__o, __c); + return __rv.__i; +} + +__extension__ static __inline float32x4x3_t __attribute__ ((__always_inline__)) +vld3q_lane_f32 (const float32_t * __a, float32x4x3_t __b, const int __c) +{ + union { float32x4x3_t __i; __builtin_neon_ci __o; } __bu = { __b }; + union { float32x4x3_t __i; __builtin_neon_ci __o; } __rv; + __rv.__o = __builtin_neon_vld3_lanev4sf ((const __builtin_neon_sf *) __a, __bu.__o, __c); + return __rv.__i; +} + +__extension__ static __inline uint16x8x3_t __attribute__ ((__always_inline__)) +vld3q_lane_u16 (const uint16_t * __a, uint16x8x3_t __b, const int __c) +{ + union { uint16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b }; + union { uint16x8x3_t __i; __builtin_neon_ci __o; } __rv; + __rv.__o = __builtin_neon_vld3_lanev8hi ((const __builtin_neon_hi *) __a, __bu.__o, __c); + return __rv.__i; +} + +__extension__ static __inline uint32x4x3_t __attribute__ ((__always_inline__)) +vld3q_lane_u32 (const uint32_t * __a, uint32x4x3_t __b, const int __c) +{ + union { uint32x4x3_t __i; __builtin_neon_ci __o; } __bu = { __b }; + union { uint32x4x3_t __i; __builtin_neon_ci __o; } __rv; + __rv.__o = __builtin_neon_vld3_lanev4si ((const __builtin_neon_si *) __a, __bu.__o, __c); + return __rv.__i; +} + +__extension__ static __inline poly16x8x3_t __attribute__ ((__always_inline__)) +vld3q_lane_p16 (const poly16_t * __a, poly16x8x3_t __b, const int __c) +{ + union { poly16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b }; + union { poly16x8x3_t __i; __builtin_neon_ci __o; } __rv; + __rv.__o = __builtin_neon_vld3_lanev8hi ((const __builtin_neon_hi *) __a, __bu.__o, __c); + return __rv.__i; +} + +__extension__ static __inline int8x8x3_t __attribute__ ((__always_inline__)) +vld3_dup_s8 (const int8_t * __a) +{ + union { int8x8x3_t __i; __builtin_neon_ei __o; } __rv; + __rv.__o = __builtin_neon_vld3_dupv8qi ((const __builtin_neon_qi *) __a); + return __rv.__i; +} + +__extension__ static __inline int16x4x3_t __attribute__ ((__always_inline__)) +vld3_dup_s16 (const int16_t * __a) +{ + union { int16x4x3_t __i; __builtin_neon_ei __o; } __rv; + __rv.__o = __builtin_neon_vld3_dupv4hi ((const __builtin_neon_hi *) __a); + return __rv.__i; +} + +__extension__ static __inline int32x2x3_t __attribute__ ((__always_inline__)) +vld3_dup_s32 (const int32_t * __a) +{ + union { int32x2x3_t __i; __builtin_neon_ei __o; } __rv; + __rv.__o = __builtin_neon_vld3_dupv2si ((const __builtin_neon_si *) __a); + return __rv.__i; +} + +__extension__ static __inline float32x2x3_t __attribute__ ((__always_inline__)) +vld3_dup_f32 (const float32_t * __a) +{ + union { float32x2x3_t __i; __builtin_neon_ei __o; } __rv; + __rv.__o = __builtin_neon_vld3_dupv2sf ((const __builtin_neon_sf *) __a); + return __rv.__i; +} + +__extension__ static __inline uint8x8x3_t __attribute__ ((__always_inline__)) +vld3_dup_u8 (const uint8_t * __a) +{ + union { uint8x8x3_t __i; __builtin_neon_ei __o; } __rv; + __rv.__o = __builtin_neon_vld3_dupv8qi ((const __builtin_neon_qi *) __a); + return __rv.__i; +} + +__extension__ static __inline uint16x4x3_t __attribute__ ((__always_inline__)) +vld3_dup_u16 (const uint16_t * __a) +{ + union { uint16x4x3_t __i; __builtin_neon_ei __o; } __rv; + __rv.__o = __builtin_neon_vld3_dupv4hi ((const __builtin_neon_hi *) __a); + return __rv.__i; +} + +__extension__ static __inline uint32x2x3_t __attribute__ ((__always_inline__)) +vld3_dup_u32 (const uint32_t * __a) +{ + union { uint32x2x3_t __i; __builtin_neon_ei __o; } __rv; + __rv.__o = __builtin_neon_vld3_dupv2si ((const __builtin_neon_si *) __a); + return __rv.__i; +} + +__extension__ static __inline poly8x8x3_t __attribute__ ((__always_inline__)) +vld3_dup_p8 (const poly8_t * __a) +{ + union { poly8x8x3_t __i; __builtin_neon_ei __o; } __rv; + __rv.__o = __builtin_neon_vld3_dupv8qi ((const __builtin_neon_qi *) __a); + return __rv.__i; +} + +__extension__ static __inline poly16x4x3_t __attribute__ ((__always_inline__)) +vld3_dup_p16 (const poly16_t * __a) +{ + union { poly16x4x3_t __i; __builtin_neon_ei __o; } __rv; + __rv.__o = __builtin_neon_vld3_dupv4hi ((const __builtin_neon_hi *) __a); + return __rv.__i; +} + +__extension__ static __inline int64x1x3_t __attribute__ ((__always_inline__)) +vld3_dup_s64 (const int64_t * __a) +{ + union { int64x1x3_t __i; __builtin_neon_ei __o; } __rv; + __rv.__o = __builtin_neon_vld3_dupdi ((const __builtin_neon_di *) __a); + return __rv.__i; +} + +__extension__ static __inline uint64x1x3_t __attribute__ ((__always_inline__)) +vld3_dup_u64 (const uint64_t * __a) +{ + union { uint64x1x3_t __i; __builtin_neon_ei __o; } __rv; + __rv.__o = __builtin_neon_vld3_dupdi ((const __builtin_neon_di *) __a); + return __rv.__i; +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst3_s8 (int8_t * __a, int8x8x3_t __b) +{ + union { int8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b }; + __builtin_neon_vst3v8qi ((__builtin_neon_qi *) __a, __bu.__o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst3_s16 (int16_t * __a, int16x4x3_t __b) +{ + union { int16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __b }; + __builtin_neon_vst3v4hi ((__builtin_neon_hi *) __a, __bu.__o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst3_s32 (int32_t * __a, int32x2x3_t __b) +{ + union { int32x2x3_t __i; __builtin_neon_ei __o; } __bu = { __b }; + __builtin_neon_vst3v2si ((__builtin_neon_si *) __a, __bu.__o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst3_f32 (float32_t * __a, float32x2x3_t __b) +{ + union { float32x2x3_t __i; __builtin_neon_ei __o; } __bu = { __b }; + __builtin_neon_vst3v2sf ((__builtin_neon_sf *) __a, __bu.__o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst3_u8 (uint8_t * __a, uint8x8x3_t __b) +{ + union { uint8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b }; + __builtin_neon_vst3v8qi ((__builtin_neon_qi *) __a, __bu.__o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst3_u16 (uint16_t * __a, uint16x4x3_t __b) +{ + union { uint16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __b }; + __builtin_neon_vst3v4hi ((__builtin_neon_hi *) __a, __bu.__o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst3_u32 (uint32_t * __a, uint32x2x3_t __b) +{ + union { uint32x2x3_t __i; __builtin_neon_ei __o; } __bu = { __b }; + __builtin_neon_vst3v2si ((__builtin_neon_si *) __a, __bu.__o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst3_p8 (poly8_t * __a, poly8x8x3_t __b) +{ + union { poly8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b }; + __builtin_neon_vst3v8qi ((__builtin_neon_qi *) __a, __bu.__o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst3_p16 (poly16_t * __a, poly16x4x3_t __b) +{ + union { poly16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __b }; + __builtin_neon_vst3v4hi ((__builtin_neon_hi *) __a, __bu.__o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst3_s64 (int64_t * __a, int64x1x3_t __b) +{ + union { int64x1x3_t __i; __builtin_neon_ei __o; } __bu = { __b }; + __builtin_neon_vst3di ((__builtin_neon_di *) __a, __bu.__o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst3_u64 (uint64_t * __a, uint64x1x3_t __b) +{ + union { uint64x1x3_t __i; __builtin_neon_ei __o; } __bu = { __b }; + __builtin_neon_vst3di ((__builtin_neon_di *) __a, __bu.__o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst3q_s8 (int8_t * __a, int8x16x3_t __b) +{ + union { int8x16x3_t __i; __builtin_neon_ci __o; } __bu = { __b }; + __builtin_neon_vst3v16qi ((__builtin_neon_qi *) __a, __bu.__o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst3q_s16 (int16_t * __a, int16x8x3_t __b) +{ + union { int16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b }; + __builtin_neon_vst3v8hi ((__builtin_neon_hi *) __a, __bu.__o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst3q_s32 (int32_t * __a, int32x4x3_t __b) +{ + union { int32x4x3_t __i; __builtin_neon_ci __o; } __bu = { __b }; + __builtin_neon_vst3v4si ((__builtin_neon_si *) __a, __bu.__o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst3q_f32 (float32_t * __a, float32x4x3_t __b) +{ + union { float32x4x3_t __i; __builtin_neon_ci __o; } __bu = { __b }; + __builtin_neon_vst3v4sf ((__builtin_neon_sf *) __a, __bu.__o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst3q_u8 (uint8_t * __a, uint8x16x3_t __b) +{ + union { uint8x16x3_t __i; __builtin_neon_ci __o; } __bu = { __b }; + __builtin_neon_vst3v16qi ((__builtin_neon_qi *) __a, __bu.__o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst3q_u16 (uint16_t * __a, uint16x8x3_t __b) +{ + union { uint16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b }; + __builtin_neon_vst3v8hi ((__builtin_neon_hi *) __a, __bu.__o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst3q_u32 (uint32_t * __a, uint32x4x3_t __b) +{ + union { uint32x4x3_t __i; __builtin_neon_ci __o; } __bu = { __b }; + __builtin_neon_vst3v4si ((__builtin_neon_si *) __a, __bu.__o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst3q_p8 (poly8_t * __a, poly8x16x3_t __b) +{ + union { poly8x16x3_t __i; __builtin_neon_ci __o; } __bu = { __b }; + __builtin_neon_vst3v16qi ((__builtin_neon_qi *) __a, __bu.__o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst3q_p16 (poly16_t * __a, poly16x8x3_t __b) +{ + union { poly16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b }; + __builtin_neon_vst3v8hi ((__builtin_neon_hi *) __a, __bu.__o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst3_lane_s8 (int8_t * __a, int8x8x3_t __b, const int __c) +{ + union { int8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b }; + __builtin_neon_vst3_lanev8qi ((__builtin_neon_qi *) __a, __bu.__o, __c); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst3_lane_s16 (int16_t * __a, int16x4x3_t __b, const int __c) +{ + union { int16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __b }; + __builtin_neon_vst3_lanev4hi ((__builtin_neon_hi *) __a, __bu.__o, __c); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst3_lane_s32 (int32_t * __a, int32x2x3_t __b, const int __c) +{ + union { int32x2x3_t __i; __builtin_neon_ei __o; } __bu = { __b }; + __builtin_neon_vst3_lanev2si ((__builtin_neon_si *) __a, __bu.__o, __c); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst3_lane_f32 (float32_t * __a, float32x2x3_t __b, const int __c) +{ + union { float32x2x3_t __i; __builtin_neon_ei __o; } __bu = { __b }; + __builtin_neon_vst3_lanev2sf ((__builtin_neon_sf *) __a, __bu.__o, __c); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst3_lane_u8 (uint8_t * __a, uint8x8x3_t __b, const int __c) +{ + union { uint8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b }; + __builtin_neon_vst3_lanev8qi ((__builtin_neon_qi *) __a, __bu.__o, __c); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst3_lane_u16 (uint16_t * __a, uint16x4x3_t __b, const int __c) +{ + union { uint16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __b }; + __builtin_neon_vst3_lanev4hi ((__builtin_neon_hi *) __a, __bu.__o, __c); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst3_lane_u32 (uint32_t * __a, uint32x2x3_t __b, const int __c) +{ + union { uint32x2x3_t __i; __builtin_neon_ei __o; } __bu = { __b }; + __builtin_neon_vst3_lanev2si ((__builtin_neon_si *) __a, __bu.__o, __c); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst3_lane_p8 (poly8_t * __a, poly8x8x3_t __b, const int __c) +{ + union { poly8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b }; + __builtin_neon_vst3_lanev8qi ((__builtin_neon_qi *) __a, __bu.__o, __c); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst3_lane_p16 (poly16_t * __a, poly16x4x3_t __b, const int __c) +{ + union { poly16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __b }; + __builtin_neon_vst3_lanev4hi ((__builtin_neon_hi *) __a, __bu.__o, __c); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst3q_lane_s16 (int16_t * __a, int16x8x3_t __b, const int __c) +{ + union { int16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b }; + __builtin_neon_vst3_lanev8hi ((__builtin_neon_hi *) __a, __bu.__o, __c); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst3q_lane_s32 (int32_t * __a, int32x4x3_t __b, const int __c) +{ + union { int32x4x3_t __i; __builtin_neon_ci __o; } __bu = { __b }; + __builtin_neon_vst3_lanev4si ((__builtin_neon_si *) __a, __bu.__o, __c); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst3q_lane_f32 (float32_t * __a, float32x4x3_t __b, const int __c) +{ + union { float32x4x3_t __i; __builtin_neon_ci __o; } __bu = { __b }; + __builtin_neon_vst3_lanev4sf ((__builtin_neon_sf *) __a, __bu.__o, __c); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst3q_lane_u16 (uint16_t * __a, uint16x8x3_t __b, const int __c) +{ + union { uint16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b }; + __builtin_neon_vst3_lanev8hi ((__builtin_neon_hi *) __a, __bu.__o, __c); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst3q_lane_u32 (uint32_t * __a, uint32x4x3_t __b, const int __c) +{ + union { uint32x4x3_t __i; __builtin_neon_ci __o; } __bu = { __b }; + __builtin_neon_vst3_lanev4si ((__builtin_neon_si *) __a, __bu.__o, __c); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst3q_lane_p16 (poly16_t * __a, poly16x8x3_t __b, const int __c) +{ + union { poly16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b }; + __builtin_neon_vst3_lanev8hi ((__builtin_neon_hi *) __a, __bu.__o, __c); +} + +__extension__ static __inline int8x8x4_t __attribute__ ((__always_inline__)) +vld4_s8 (const int8_t * __a) +{ + union { int8x8x4_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vld4v8qi ((const __builtin_neon_qi *) __a); + return __rv.__i; +} + +__extension__ static __inline int16x4x4_t __attribute__ ((__always_inline__)) +vld4_s16 (const int16_t * __a) +{ + union { int16x4x4_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vld4v4hi ((const __builtin_neon_hi *) __a); + return __rv.__i; +} + +__extension__ static __inline int32x2x4_t __attribute__ ((__always_inline__)) +vld4_s32 (const int32_t * __a) +{ + union { int32x2x4_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vld4v2si ((const __builtin_neon_si *) __a); + return __rv.__i; +} + +__extension__ static __inline float32x2x4_t __attribute__ ((__always_inline__)) +vld4_f32 (const float32_t * __a) +{ + union { float32x2x4_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vld4v2sf ((const __builtin_neon_sf *) __a); + return __rv.__i; +} + +__extension__ static __inline uint8x8x4_t __attribute__ ((__always_inline__)) +vld4_u8 (const uint8_t * __a) +{ + union { uint8x8x4_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vld4v8qi ((const __builtin_neon_qi *) __a); + return __rv.__i; +} + +__extension__ static __inline uint16x4x4_t __attribute__ ((__always_inline__)) +vld4_u16 (const uint16_t * __a) +{ + union { uint16x4x4_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vld4v4hi ((const __builtin_neon_hi *) __a); + return __rv.__i; +} + +__extension__ static __inline uint32x2x4_t __attribute__ ((__always_inline__)) +vld4_u32 (const uint32_t * __a) +{ + union { uint32x2x4_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vld4v2si ((const __builtin_neon_si *) __a); + return __rv.__i; +} + +__extension__ static __inline poly8x8x4_t __attribute__ ((__always_inline__)) +vld4_p8 (const poly8_t * __a) +{ + union { poly8x8x4_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vld4v8qi ((const __builtin_neon_qi *) __a); + return __rv.__i; +} + +__extension__ static __inline poly16x4x4_t __attribute__ ((__always_inline__)) +vld4_p16 (const poly16_t * __a) +{ + union { poly16x4x4_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vld4v4hi ((const __builtin_neon_hi *) __a); + return __rv.__i; +} + +__extension__ static __inline int64x1x4_t __attribute__ ((__always_inline__)) +vld4_s64 (const int64_t * __a) +{ + union { int64x1x4_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vld4di ((const __builtin_neon_di *) __a); + return __rv.__i; +} + +__extension__ static __inline uint64x1x4_t __attribute__ ((__always_inline__)) +vld4_u64 (const uint64_t * __a) +{ + union { uint64x1x4_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vld4di ((const __builtin_neon_di *) __a); + return __rv.__i; +} + +__extension__ static __inline int8x16x4_t __attribute__ ((__always_inline__)) +vld4q_s8 (const int8_t * __a) +{ + union { int8x16x4_t __i; __builtin_neon_xi __o; } __rv; + __rv.__o = __builtin_neon_vld4v16qi ((const __builtin_neon_qi *) __a); + return __rv.__i; +} + +__extension__ static __inline int16x8x4_t __attribute__ ((__always_inline__)) +vld4q_s16 (const int16_t * __a) +{ + union { int16x8x4_t __i; __builtin_neon_xi __o; } __rv; + __rv.__o = __builtin_neon_vld4v8hi ((const __builtin_neon_hi *) __a); + return __rv.__i; +} + +__extension__ static __inline int32x4x4_t __attribute__ ((__always_inline__)) +vld4q_s32 (const int32_t * __a) +{ + union { int32x4x4_t __i; __builtin_neon_xi __o; } __rv; + __rv.__o = __builtin_neon_vld4v4si ((const __builtin_neon_si *) __a); + return __rv.__i; +} + +__extension__ static __inline float32x4x4_t __attribute__ ((__always_inline__)) +vld4q_f32 (const float32_t * __a) +{ + union { float32x4x4_t __i; __builtin_neon_xi __o; } __rv; + __rv.__o = __builtin_neon_vld4v4sf ((const __builtin_neon_sf *) __a); + return __rv.__i; +} + +__extension__ static __inline uint8x16x4_t __attribute__ ((__always_inline__)) +vld4q_u8 (const uint8_t * __a) +{ + union { uint8x16x4_t __i; __builtin_neon_xi __o; } __rv; + __rv.__o = __builtin_neon_vld4v16qi ((const __builtin_neon_qi *) __a); + return __rv.__i; +} + +__extension__ static __inline uint16x8x4_t __attribute__ ((__always_inline__)) +vld4q_u16 (const uint16_t * __a) +{ + union { uint16x8x4_t __i; __builtin_neon_xi __o; } __rv; + __rv.__o = __builtin_neon_vld4v8hi ((const __builtin_neon_hi *) __a); + return __rv.__i; +} + +__extension__ static __inline uint32x4x4_t __attribute__ ((__always_inline__)) +vld4q_u32 (const uint32_t * __a) +{ + union { uint32x4x4_t __i; __builtin_neon_xi __o; } __rv; + __rv.__o = __builtin_neon_vld4v4si ((const __builtin_neon_si *) __a); + return __rv.__i; +} + +__extension__ static __inline poly8x16x4_t __attribute__ ((__always_inline__)) +vld4q_p8 (const poly8_t * __a) +{ + union { poly8x16x4_t __i; __builtin_neon_xi __o; } __rv; + __rv.__o = __builtin_neon_vld4v16qi ((const __builtin_neon_qi *) __a); + return __rv.__i; +} + +__extension__ static __inline poly16x8x4_t __attribute__ ((__always_inline__)) +vld4q_p16 (const poly16_t * __a) +{ + union { poly16x8x4_t __i; __builtin_neon_xi __o; } __rv; + __rv.__o = __builtin_neon_vld4v8hi ((const __builtin_neon_hi *) __a); + return __rv.__i; +} + +__extension__ static __inline int8x8x4_t __attribute__ ((__always_inline__)) +vld4_lane_s8 (const int8_t * __a, int8x8x4_t __b, const int __c) +{ + union { int8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; + union { int8x8x4_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vld4_lanev8qi ((const __builtin_neon_qi *) __a, __bu.__o, __c); + return __rv.__i; +} + +__extension__ static __inline int16x4x4_t __attribute__ ((__always_inline__)) +vld4_lane_s16 (const int16_t * __a, int16x4x4_t __b, const int __c) +{ + union { int16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; + union { int16x4x4_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vld4_lanev4hi ((const __builtin_neon_hi *) __a, __bu.__o, __c); + return __rv.__i; +} + +__extension__ static __inline int32x2x4_t __attribute__ ((__always_inline__)) +vld4_lane_s32 (const int32_t * __a, int32x2x4_t __b, const int __c) +{ + union { int32x2x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; + union { int32x2x4_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vld4_lanev2si ((const __builtin_neon_si *) __a, __bu.__o, __c); + return __rv.__i; +} + +__extension__ static __inline float32x2x4_t __attribute__ ((__always_inline__)) +vld4_lane_f32 (const float32_t * __a, float32x2x4_t __b, const int __c) +{ + union { float32x2x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; + union { float32x2x4_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vld4_lanev2sf ((const __builtin_neon_sf *) __a, __bu.__o, __c); + return __rv.__i; +} + +__extension__ static __inline uint8x8x4_t __attribute__ ((__always_inline__)) +vld4_lane_u8 (const uint8_t * __a, uint8x8x4_t __b, const int __c) +{ + union { uint8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; + union { uint8x8x4_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vld4_lanev8qi ((const __builtin_neon_qi *) __a, __bu.__o, __c); + return __rv.__i; +} + +__extension__ static __inline uint16x4x4_t __attribute__ ((__always_inline__)) +vld4_lane_u16 (const uint16_t * __a, uint16x4x4_t __b, const int __c) +{ + union { uint16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; + union { uint16x4x4_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vld4_lanev4hi ((const __builtin_neon_hi *) __a, __bu.__o, __c); + return __rv.__i; +} + +__extension__ static __inline uint32x2x4_t __attribute__ ((__always_inline__)) +vld4_lane_u32 (const uint32_t * __a, uint32x2x4_t __b, const int __c) +{ + union { uint32x2x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; + union { uint32x2x4_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vld4_lanev2si ((const __builtin_neon_si *) __a, __bu.__o, __c); + return __rv.__i; +} + +__extension__ static __inline poly8x8x4_t __attribute__ ((__always_inline__)) +vld4_lane_p8 (const poly8_t * __a, poly8x8x4_t __b, const int __c) +{ + union { poly8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; + union { poly8x8x4_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vld4_lanev8qi ((const __builtin_neon_qi *) __a, __bu.__o, __c); + return __rv.__i; +} + +__extension__ static __inline poly16x4x4_t __attribute__ ((__always_inline__)) +vld4_lane_p16 (const poly16_t * __a, poly16x4x4_t __b, const int __c) +{ + union { poly16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; + union { poly16x4x4_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vld4_lanev4hi ((const __builtin_neon_hi *) __a, __bu.__o, __c); + return __rv.__i; +} + +__extension__ static __inline int16x8x4_t __attribute__ ((__always_inline__)) +vld4q_lane_s16 (const int16_t * __a, int16x8x4_t __b, const int __c) +{ + union { int16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b }; + union { int16x8x4_t __i; __builtin_neon_xi __o; } __rv; + __rv.__o = __builtin_neon_vld4_lanev8hi ((const __builtin_neon_hi *) __a, __bu.__o, __c); + return __rv.__i; +} + +__extension__ static __inline int32x4x4_t __attribute__ ((__always_inline__)) +vld4q_lane_s32 (const int32_t * __a, int32x4x4_t __b, const int __c) +{ + union { int32x4x4_t __i; __builtin_neon_xi __o; } __bu = { __b }; + union { int32x4x4_t __i; __builtin_neon_xi __o; } __rv; + __rv.__o = __builtin_neon_vld4_lanev4si ((const __builtin_neon_si *) __a, __bu.__o, __c); + return __rv.__i; +} + +__extension__ static __inline float32x4x4_t __attribute__ ((__always_inline__)) +vld4q_lane_f32 (const float32_t * __a, float32x4x4_t __b, const int __c) +{ + union { float32x4x4_t __i; __builtin_neon_xi __o; } __bu = { __b }; + union { float32x4x4_t __i; __builtin_neon_xi __o; } __rv; + __rv.__o = __builtin_neon_vld4_lanev4sf ((const __builtin_neon_sf *) __a, __bu.__o, __c); + return __rv.__i; +} + +__extension__ static __inline uint16x8x4_t __attribute__ ((__always_inline__)) +vld4q_lane_u16 (const uint16_t * __a, uint16x8x4_t __b, const int __c) +{ + union { uint16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b }; + union { uint16x8x4_t __i; __builtin_neon_xi __o; } __rv; + __rv.__o = __builtin_neon_vld4_lanev8hi ((const __builtin_neon_hi *) __a, __bu.__o, __c); + return __rv.__i; +} + +__extension__ static __inline uint32x4x4_t __attribute__ ((__always_inline__)) +vld4q_lane_u32 (const uint32_t * __a, uint32x4x4_t __b, const int __c) +{ + union { uint32x4x4_t __i; __builtin_neon_xi __o; } __bu = { __b }; + union { uint32x4x4_t __i; __builtin_neon_xi __o; } __rv; + __rv.__o = __builtin_neon_vld4_lanev4si ((const __builtin_neon_si *) __a, __bu.__o, __c); + return __rv.__i; +} + +__extension__ static __inline poly16x8x4_t __attribute__ ((__always_inline__)) +vld4q_lane_p16 (const poly16_t * __a, poly16x8x4_t __b, const int __c) +{ + union { poly16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b }; + union { poly16x8x4_t __i; __builtin_neon_xi __o; } __rv; + __rv.__o = __builtin_neon_vld4_lanev8hi ((const __builtin_neon_hi *) __a, __bu.__o, __c); + return __rv.__i; +} + +__extension__ static __inline int8x8x4_t __attribute__ ((__always_inline__)) +vld4_dup_s8 (const int8_t * __a) +{ + union { int8x8x4_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vld4_dupv8qi ((const __builtin_neon_qi *) __a); + return __rv.__i; +} + +__extension__ static __inline int16x4x4_t __attribute__ ((__always_inline__)) +vld4_dup_s16 (const int16_t * __a) +{ + union { int16x4x4_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vld4_dupv4hi ((const __builtin_neon_hi *) __a); + return __rv.__i; +} + +__extension__ static __inline int32x2x4_t __attribute__ ((__always_inline__)) +vld4_dup_s32 (const int32_t * __a) +{ + union { int32x2x4_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vld4_dupv2si ((const __builtin_neon_si *) __a); + return __rv.__i; +} + +__extension__ static __inline float32x2x4_t __attribute__ ((__always_inline__)) +vld4_dup_f32 (const float32_t * __a) +{ + union { float32x2x4_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vld4_dupv2sf ((const __builtin_neon_sf *) __a); + return __rv.__i; +} + +__extension__ static __inline uint8x8x4_t __attribute__ ((__always_inline__)) +vld4_dup_u8 (const uint8_t * __a) +{ + union { uint8x8x4_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vld4_dupv8qi ((const __builtin_neon_qi *) __a); + return __rv.__i; +} + +__extension__ static __inline uint16x4x4_t __attribute__ ((__always_inline__)) +vld4_dup_u16 (const uint16_t * __a) +{ + union { uint16x4x4_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vld4_dupv4hi ((const __builtin_neon_hi *) __a); + return __rv.__i; +} + +__extension__ static __inline uint32x2x4_t __attribute__ ((__always_inline__)) +vld4_dup_u32 (const uint32_t * __a) +{ + union { uint32x2x4_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vld4_dupv2si ((const __builtin_neon_si *) __a); + return __rv.__i; +} + +__extension__ static __inline poly8x8x4_t __attribute__ ((__always_inline__)) +vld4_dup_p8 (const poly8_t * __a) +{ + union { poly8x8x4_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vld4_dupv8qi ((const __builtin_neon_qi *) __a); + return __rv.__i; +} + +__extension__ static __inline poly16x4x4_t __attribute__ ((__always_inline__)) +vld4_dup_p16 (const poly16_t * __a) +{ + union { poly16x4x4_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vld4_dupv4hi ((const __builtin_neon_hi *) __a); + return __rv.__i; +} + +__extension__ static __inline int64x1x4_t __attribute__ ((__always_inline__)) +vld4_dup_s64 (const int64_t * __a) +{ + union { int64x1x4_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vld4_dupdi ((const __builtin_neon_di *) __a); + return __rv.__i; +} + +__extension__ static __inline uint64x1x4_t __attribute__ ((__always_inline__)) +vld4_dup_u64 (const uint64_t * __a) +{ + union { uint64x1x4_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vld4_dupdi ((const __builtin_neon_di *) __a); + return __rv.__i; +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst4_s8 (int8_t * __a, int8x8x4_t __b) +{ + union { int8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; + __builtin_neon_vst4v8qi ((__builtin_neon_qi *) __a, __bu.__o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst4_s16 (int16_t * __a, int16x4x4_t __b) +{ + union { int16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; + __builtin_neon_vst4v4hi ((__builtin_neon_hi *) __a, __bu.__o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst4_s32 (int32_t * __a, int32x2x4_t __b) +{ + union { int32x2x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; + __builtin_neon_vst4v2si ((__builtin_neon_si *) __a, __bu.__o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst4_f32 (float32_t * __a, float32x2x4_t __b) +{ + union { float32x2x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; + __builtin_neon_vst4v2sf ((__builtin_neon_sf *) __a, __bu.__o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst4_u8 (uint8_t * __a, uint8x8x4_t __b) +{ + union { uint8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; + __builtin_neon_vst4v8qi ((__builtin_neon_qi *) __a, __bu.__o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst4_u16 (uint16_t * __a, uint16x4x4_t __b) +{ + union { uint16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; + __builtin_neon_vst4v4hi ((__builtin_neon_hi *) __a, __bu.__o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst4_u32 (uint32_t * __a, uint32x2x4_t __b) +{ + union { uint32x2x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; + __builtin_neon_vst4v2si ((__builtin_neon_si *) __a, __bu.__o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst4_p8 (poly8_t * __a, poly8x8x4_t __b) +{ + union { poly8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; + __builtin_neon_vst4v8qi ((__builtin_neon_qi *) __a, __bu.__o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst4_p16 (poly16_t * __a, poly16x4x4_t __b) +{ + union { poly16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; + __builtin_neon_vst4v4hi ((__builtin_neon_hi *) __a, __bu.__o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst4_s64 (int64_t * __a, int64x1x4_t __b) +{ + union { int64x1x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; + __builtin_neon_vst4di ((__builtin_neon_di *) __a, __bu.__o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst4_u64 (uint64_t * __a, uint64x1x4_t __b) +{ + union { uint64x1x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; + __builtin_neon_vst4di ((__builtin_neon_di *) __a, __bu.__o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst4q_s8 (int8_t * __a, int8x16x4_t __b) +{ + union { int8x16x4_t __i; __builtin_neon_xi __o; } __bu = { __b }; + __builtin_neon_vst4v16qi ((__builtin_neon_qi *) __a, __bu.__o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst4q_s16 (int16_t * __a, int16x8x4_t __b) +{ + union { int16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b }; + __builtin_neon_vst4v8hi ((__builtin_neon_hi *) __a, __bu.__o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst4q_s32 (int32_t * __a, int32x4x4_t __b) +{ + union { int32x4x4_t __i; __builtin_neon_xi __o; } __bu = { __b }; + __builtin_neon_vst4v4si ((__builtin_neon_si *) __a, __bu.__o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst4q_f32 (float32_t * __a, float32x4x4_t __b) +{ + union { float32x4x4_t __i; __builtin_neon_xi __o; } __bu = { __b }; + __builtin_neon_vst4v4sf ((__builtin_neon_sf *) __a, __bu.__o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst4q_u8 (uint8_t * __a, uint8x16x4_t __b) +{ + union { uint8x16x4_t __i; __builtin_neon_xi __o; } __bu = { __b }; + __builtin_neon_vst4v16qi ((__builtin_neon_qi *) __a, __bu.__o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst4q_u16 (uint16_t * __a, uint16x8x4_t __b) +{ + union { uint16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b }; + __builtin_neon_vst4v8hi ((__builtin_neon_hi *) __a, __bu.__o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst4q_u32 (uint32_t * __a, uint32x4x4_t __b) +{ + union { uint32x4x4_t __i; __builtin_neon_xi __o; } __bu = { __b }; + __builtin_neon_vst4v4si ((__builtin_neon_si *) __a, __bu.__o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst4q_p8 (poly8_t * __a, poly8x16x4_t __b) +{ + union { poly8x16x4_t __i; __builtin_neon_xi __o; } __bu = { __b }; + __builtin_neon_vst4v16qi ((__builtin_neon_qi *) __a, __bu.__o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst4q_p16 (poly16_t * __a, poly16x8x4_t __b) +{ + union { poly16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b }; + __builtin_neon_vst4v8hi ((__builtin_neon_hi *) __a, __bu.__o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst4_lane_s8 (int8_t * __a, int8x8x4_t __b, const int __c) +{ + union { int8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; + __builtin_neon_vst4_lanev8qi ((__builtin_neon_qi *) __a, __bu.__o, __c); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst4_lane_s16 (int16_t * __a, int16x4x4_t __b, const int __c) +{ + union { int16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; + __builtin_neon_vst4_lanev4hi ((__builtin_neon_hi *) __a, __bu.__o, __c); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst4_lane_s32 (int32_t * __a, int32x2x4_t __b, const int __c) +{ + union { int32x2x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; + __builtin_neon_vst4_lanev2si ((__builtin_neon_si *) __a, __bu.__o, __c); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst4_lane_f32 (float32_t * __a, float32x2x4_t __b, const int __c) +{ + union { float32x2x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; + __builtin_neon_vst4_lanev2sf ((__builtin_neon_sf *) __a, __bu.__o, __c); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst4_lane_u8 (uint8_t * __a, uint8x8x4_t __b, const int __c) +{ + union { uint8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; + __builtin_neon_vst4_lanev8qi ((__builtin_neon_qi *) __a, __bu.__o, __c); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst4_lane_u16 (uint16_t * __a, uint16x4x4_t __b, const int __c) +{ + union { uint16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; + __builtin_neon_vst4_lanev4hi ((__builtin_neon_hi *) __a, __bu.__o, __c); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst4_lane_u32 (uint32_t * __a, uint32x2x4_t __b, const int __c) +{ + union { uint32x2x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; + __builtin_neon_vst4_lanev2si ((__builtin_neon_si *) __a, __bu.__o, __c); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst4_lane_p8 (poly8_t * __a, poly8x8x4_t __b, const int __c) +{ + union { poly8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; + __builtin_neon_vst4_lanev8qi ((__builtin_neon_qi *) __a, __bu.__o, __c); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst4_lane_p16 (poly16_t * __a, poly16x4x4_t __b, const int __c) +{ + union { poly16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; + __builtin_neon_vst4_lanev4hi ((__builtin_neon_hi *) __a, __bu.__o, __c); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst4q_lane_s16 (int16_t * __a, int16x8x4_t __b, const int __c) +{ + union { int16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b }; + __builtin_neon_vst4_lanev8hi ((__builtin_neon_hi *) __a, __bu.__o, __c); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst4q_lane_s32 (int32_t * __a, int32x4x4_t __b, const int __c) +{ + union { int32x4x4_t __i; __builtin_neon_xi __o; } __bu = { __b }; + __builtin_neon_vst4_lanev4si ((__builtin_neon_si *) __a, __bu.__o, __c); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst4q_lane_f32 (float32_t * __a, float32x4x4_t __b, const int __c) +{ + union { float32x4x4_t __i; __builtin_neon_xi __o; } __bu = { __b }; + __builtin_neon_vst4_lanev4sf ((__builtin_neon_sf *) __a, __bu.__o, __c); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst4q_lane_u16 (uint16_t * __a, uint16x8x4_t __b, const int __c) +{ + union { uint16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b }; + __builtin_neon_vst4_lanev8hi ((__builtin_neon_hi *) __a, __bu.__o, __c); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst4q_lane_u32 (uint32_t * __a, uint32x4x4_t __b, const int __c) +{ + union { uint32x4x4_t __i; __builtin_neon_xi __o; } __bu = { __b }; + __builtin_neon_vst4_lanev4si ((__builtin_neon_si *) __a, __bu.__o, __c); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst4q_lane_p16 (poly16_t * __a, poly16x8x4_t __b, const int __c) +{ + union { poly16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b }; + __builtin_neon_vst4_lanev8hi ((__builtin_neon_hi *) __a, __bu.__o, __c); +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vand_s8 (int8x8_t __a, int8x8_t __b) +{ + return (int8x8_t)__builtin_neon_vandv8qi (__a, __b, 1); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vand_s16 (int16x4_t __a, int16x4_t __b) +{ + return (int16x4_t)__builtin_neon_vandv4hi (__a, __b, 1); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vand_s32 (int32x2_t __a, int32x2_t __b) +{ + return (int32x2_t)__builtin_neon_vandv2si (__a, __b, 1); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vand_u8 (uint8x8_t __a, uint8x8_t __b) +{ + return (uint8x8_t)__builtin_neon_vandv8qi ((int8x8_t) __a, (int8x8_t) __b, 0); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vand_u16 (uint16x4_t __a, uint16x4_t __b) +{ + return (uint16x4_t)__builtin_neon_vandv4hi ((int16x4_t) __a, (int16x4_t) __b, 0); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vand_u32 (uint32x2_t __a, uint32x2_t __b) +{ + return (uint32x2_t)__builtin_neon_vandv2si ((int32x2_t) __a, (int32x2_t) __b, 0); +} + +__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +vand_s64 (int64x1_t __a, int64x1_t __b) +{ + return (int64x1_t)__builtin_neon_vanddi (__a, __b, 1); +} + +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vand_u64 (uint64x1_t __a, uint64x1_t __b) +{ + return (uint64x1_t)__builtin_neon_vanddi ((int64x1_t) __a, (int64x1_t) __b, 0); +} + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vandq_s8 (int8x16_t __a, int8x16_t __b) +{ + return (int8x16_t)__builtin_neon_vandv16qi (__a, __b, 1); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vandq_s16 (int16x8_t __a, int16x8_t __b) +{ + return (int16x8_t)__builtin_neon_vandv8hi (__a, __b, 1); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vandq_s32 (int32x4_t __a, int32x4_t __b) +{ + return (int32x4_t)__builtin_neon_vandv4si (__a, __b, 1); +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vandq_s64 (int64x2_t __a, int64x2_t __b) +{ + return (int64x2_t)__builtin_neon_vandv2di (__a, __b, 1); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vandq_u8 (uint8x16_t __a, uint8x16_t __b) +{ + return (uint8x16_t)__builtin_neon_vandv16qi ((int8x16_t) __a, (int8x16_t) __b, 0); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vandq_u16 (uint16x8_t __a, uint16x8_t __b) +{ + return (uint16x8_t)__builtin_neon_vandv8hi ((int16x8_t) __a, (int16x8_t) __b, 0); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vandq_u32 (uint32x4_t __a, uint32x4_t __b) +{ + return (uint32x4_t)__builtin_neon_vandv4si ((int32x4_t) __a, (int32x4_t) __b, 0); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vandq_u64 (uint64x2_t __a, uint64x2_t __b) +{ + return (uint64x2_t)__builtin_neon_vandv2di ((int64x2_t) __a, (int64x2_t) __b, 0); +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vorr_s8 (int8x8_t __a, int8x8_t __b) +{ + return (int8x8_t)__builtin_neon_vorrv8qi (__a, __b, 1); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vorr_s16 (int16x4_t __a, int16x4_t __b) +{ + return (int16x4_t)__builtin_neon_vorrv4hi (__a, __b, 1); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vorr_s32 (int32x2_t __a, int32x2_t __b) +{ + return (int32x2_t)__builtin_neon_vorrv2si (__a, __b, 1); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vorr_u8 (uint8x8_t __a, uint8x8_t __b) +{ + return (uint8x8_t)__builtin_neon_vorrv8qi ((int8x8_t) __a, (int8x8_t) __b, 0); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vorr_u16 (uint16x4_t __a, uint16x4_t __b) +{ + return (uint16x4_t)__builtin_neon_vorrv4hi ((int16x4_t) __a, (int16x4_t) __b, 0); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vorr_u32 (uint32x2_t __a, uint32x2_t __b) +{ + return (uint32x2_t)__builtin_neon_vorrv2si ((int32x2_t) __a, (int32x2_t) __b, 0); +} + +__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +vorr_s64 (int64x1_t __a, int64x1_t __b) +{ + return (int64x1_t)__builtin_neon_vorrdi (__a, __b, 1); +} + +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vorr_u64 (uint64x1_t __a, uint64x1_t __b) +{ + return (uint64x1_t)__builtin_neon_vorrdi ((int64x1_t) __a, (int64x1_t) __b, 0); +} + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vorrq_s8 (int8x16_t __a, int8x16_t __b) +{ + return (int8x16_t)__builtin_neon_vorrv16qi (__a, __b, 1); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vorrq_s16 (int16x8_t __a, int16x8_t __b) +{ + return (int16x8_t)__builtin_neon_vorrv8hi (__a, __b, 1); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vorrq_s32 (int32x4_t __a, int32x4_t __b) +{ + return (int32x4_t)__builtin_neon_vorrv4si (__a, __b, 1); +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vorrq_s64 (int64x2_t __a, int64x2_t __b) +{ + return (int64x2_t)__builtin_neon_vorrv2di (__a, __b, 1); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vorrq_u8 (uint8x16_t __a, uint8x16_t __b) +{ + return (uint8x16_t)__builtin_neon_vorrv16qi ((int8x16_t) __a, (int8x16_t) __b, 0); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vorrq_u16 (uint16x8_t __a, uint16x8_t __b) +{ + return (uint16x8_t)__builtin_neon_vorrv8hi ((int16x8_t) __a, (int16x8_t) __b, 0); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vorrq_u32 (uint32x4_t __a, uint32x4_t __b) +{ + return (uint32x4_t)__builtin_neon_vorrv4si ((int32x4_t) __a, (int32x4_t) __b, 0); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vorrq_u64 (uint64x2_t __a, uint64x2_t __b) +{ + return (uint64x2_t)__builtin_neon_vorrv2di ((int64x2_t) __a, (int64x2_t) __b, 0); +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +veor_s8 (int8x8_t __a, int8x8_t __b) +{ + return (int8x8_t)__builtin_neon_veorv8qi (__a, __b, 1); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +veor_s16 (int16x4_t __a, int16x4_t __b) +{ + return (int16x4_t)__builtin_neon_veorv4hi (__a, __b, 1); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +veor_s32 (int32x2_t __a, int32x2_t __b) +{ + return (int32x2_t)__builtin_neon_veorv2si (__a, __b, 1); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +veor_u8 (uint8x8_t __a, uint8x8_t __b) +{ + return (uint8x8_t)__builtin_neon_veorv8qi ((int8x8_t) __a, (int8x8_t) __b, 0); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +veor_u16 (uint16x4_t __a, uint16x4_t __b) +{ + return (uint16x4_t)__builtin_neon_veorv4hi ((int16x4_t) __a, (int16x4_t) __b, 0); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +veor_u32 (uint32x2_t __a, uint32x2_t __b) +{ + return (uint32x2_t)__builtin_neon_veorv2si ((int32x2_t) __a, (int32x2_t) __b, 0); +} + +__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +veor_s64 (int64x1_t __a, int64x1_t __b) +{ + return (int64x1_t)__builtin_neon_veordi (__a, __b, 1); +} + +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +veor_u64 (uint64x1_t __a, uint64x1_t __b) +{ + return (uint64x1_t)__builtin_neon_veordi ((int64x1_t) __a, (int64x1_t) __b, 0); +} + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +veorq_s8 (int8x16_t __a, int8x16_t __b) +{ + return (int8x16_t)__builtin_neon_veorv16qi (__a, __b, 1); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +veorq_s16 (int16x8_t __a, int16x8_t __b) +{ + return (int16x8_t)__builtin_neon_veorv8hi (__a, __b, 1); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +veorq_s32 (int32x4_t __a, int32x4_t __b) +{ + return (int32x4_t)__builtin_neon_veorv4si (__a, __b, 1); +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +veorq_s64 (int64x2_t __a, int64x2_t __b) +{ + return (int64x2_t)__builtin_neon_veorv2di (__a, __b, 1); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +veorq_u8 (uint8x16_t __a, uint8x16_t __b) +{ + return (uint8x16_t)__builtin_neon_veorv16qi ((int8x16_t) __a, (int8x16_t) __b, 0); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +veorq_u16 (uint16x8_t __a, uint16x8_t __b) +{ + return (uint16x8_t)__builtin_neon_veorv8hi ((int16x8_t) __a, (int16x8_t) __b, 0); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +veorq_u32 (uint32x4_t __a, uint32x4_t __b) +{ + return (uint32x4_t)__builtin_neon_veorv4si ((int32x4_t) __a, (int32x4_t) __b, 0); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +veorq_u64 (uint64x2_t __a, uint64x2_t __b) +{ + return (uint64x2_t)__builtin_neon_veorv2di ((int64x2_t) __a, (int64x2_t) __b, 0); +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vbic_s8 (int8x8_t __a, int8x8_t __b) +{ + return (int8x8_t)__builtin_neon_vbicv8qi (__a, __b, 1); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vbic_s16 (int16x4_t __a, int16x4_t __b) +{ + return (int16x4_t)__builtin_neon_vbicv4hi (__a, __b, 1); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vbic_s32 (int32x2_t __a, int32x2_t __b) +{ + return (int32x2_t)__builtin_neon_vbicv2si (__a, __b, 1); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vbic_u8 (uint8x8_t __a, uint8x8_t __b) +{ + return (uint8x8_t)__builtin_neon_vbicv8qi ((int8x8_t) __a, (int8x8_t) __b, 0); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vbic_u16 (uint16x4_t __a, uint16x4_t __b) +{ + return (uint16x4_t)__builtin_neon_vbicv4hi ((int16x4_t) __a, (int16x4_t) __b, 0); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vbic_u32 (uint32x2_t __a, uint32x2_t __b) +{ + return (uint32x2_t)__builtin_neon_vbicv2si ((int32x2_t) __a, (int32x2_t) __b, 0); +} + +__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +vbic_s64 (int64x1_t __a, int64x1_t __b) +{ + return (int64x1_t)__builtin_neon_vbicdi (__a, __b, 1); +} + +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vbic_u64 (uint64x1_t __a, uint64x1_t __b) +{ + return (uint64x1_t)__builtin_neon_vbicdi ((int64x1_t) __a, (int64x1_t) __b, 0); +} + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vbicq_s8 (int8x16_t __a, int8x16_t __b) +{ + return (int8x16_t)__builtin_neon_vbicv16qi (__a, __b, 1); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vbicq_s16 (int16x8_t __a, int16x8_t __b) +{ + return (int16x8_t)__builtin_neon_vbicv8hi (__a, __b, 1); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vbicq_s32 (int32x4_t __a, int32x4_t __b) +{ + return (int32x4_t)__builtin_neon_vbicv4si (__a, __b, 1); +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vbicq_s64 (int64x2_t __a, int64x2_t __b) +{ + return (int64x2_t)__builtin_neon_vbicv2di (__a, __b, 1); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vbicq_u8 (uint8x16_t __a, uint8x16_t __b) +{ + return (uint8x16_t)__builtin_neon_vbicv16qi ((int8x16_t) __a, (int8x16_t) __b, 0); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vbicq_u16 (uint16x8_t __a, uint16x8_t __b) +{ + return (uint16x8_t)__builtin_neon_vbicv8hi ((int16x8_t) __a, (int16x8_t) __b, 0); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vbicq_u32 (uint32x4_t __a, uint32x4_t __b) +{ + return (uint32x4_t)__builtin_neon_vbicv4si ((int32x4_t) __a, (int32x4_t) __b, 0); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vbicq_u64 (uint64x2_t __a, uint64x2_t __b) +{ + return (uint64x2_t)__builtin_neon_vbicv2di ((int64x2_t) __a, (int64x2_t) __b, 0); +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vorn_s8 (int8x8_t __a, int8x8_t __b) +{ + return (int8x8_t)__builtin_neon_vornv8qi (__a, __b, 1); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vorn_s16 (int16x4_t __a, int16x4_t __b) +{ + return (int16x4_t)__builtin_neon_vornv4hi (__a, __b, 1); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vorn_s32 (int32x2_t __a, int32x2_t __b) +{ + return (int32x2_t)__builtin_neon_vornv2si (__a, __b, 1); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vorn_u8 (uint8x8_t __a, uint8x8_t __b) +{ + return (uint8x8_t)__builtin_neon_vornv8qi ((int8x8_t) __a, (int8x8_t) __b, 0); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vorn_u16 (uint16x4_t __a, uint16x4_t __b) +{ + return (uint16x4_t)__builtin_neon_vornv4hi ((int16x4_t) __a, (int16x4_t) __b, 0); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vorn_u32 (uint32x2_t __a, uint32x2_t __b) +{ + return (uint32x2_t)__builtin_neon_vornv2si ((int32x2_t) __a, (int32x2_t) __b, 0); +} + +__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +vorn_s64 (int64x1_t __a, int64x1_t __b) +{ + return (int64x1_t)__builtin_neon_vorndi (__a, __b, 1); +} + +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vorn_u64 (uint64x1_t __a, uint64x1_t __b) +{ + return (uint64x1_t)__builtin_neon_vorndi ((int64x1_t) __a, (int64x1_t) __b, 0); +} + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vornq_s8 (int8x16_t __a, int8x16_t __b) +{ + return (int8x16_t)__builtin_neon_vornv16qi (__a, __b, 1); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vornq_s16 (int16x8_t __a, int16x8_t __b) +{ + return (int16x8_t)__builtin_neon_vornv8hi (__a, __b, 1); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vornq_s32 (int32x4_t __a, int32x4_t __b) +{ + return (int32x4_t)__builtin_neon_vornv4si (__a, __b, 1); +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vornq_s64 (int64x2_t __a, int64x2_t __b) +{ + return (int64x2_t)__builtin_neon_vornv2di (__a, __b, 1); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vornq_u8 (uint8x16_t __a, uint8x16_t __b) +{ + return (uint8x16_t)__builtin_neon_vornv16qi ((int8x16_t) __a, (int8x16_t) __b, 0); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vornq_u16 (uint16x8_t __a, uint16x8_t __b) +{ + return (uint16x8_t)__builtin_neon_vornv8hi ((int16x8_t) __a, (int16x8_t) __b, 0); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vornq_u32 (uint32x4_t __a, uint32x4_t __b) +{ + return (uint32x4_t)__builtin_neon_vornv4si ((int32x4_t) __a, (int32x4_t) __b, 0); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vornq_u64 (uint64x2_t __a, uint64x2_t __b) +{ + return (uint64x2_t)__builtin_neon_vornv2di ((int64x2_t) __a, (int64x2_t) __b, 0); +} + + +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +vreinterpret_p8_s8 (int8x8_t __a) +{ + return (poly8x8_t)__builtin_neon_vreinterpretv8qiv8qi (__a); +} + +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +vreinterpret_p8_s16 (int16x4_t __a) +{ + return (poly8x8_t)__builtin_neon_vreinterpretv8qiv4hi (__a); +} + +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +vreinterpret_p8_s32 (int32x2_t __a) +{ + return (poly8x8_t)__builtin_neon_vreinterpretv8qiv2si (__a); +} + +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +vreinterpret_p8_s64 (int64x1_t __a) +{ + return (poly8x8_t)__builtin_neon_vreinterpretv8qidi (__a); +} + +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +vreinterpret_p8_f32 (float32x2_t __a) +{ + return (poly8x8_t)__builtin_neon_vreinterpretv8qiv2sf (__a); +} + +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +vreinterpret_p8_u8 (uint8x8_t __a) +{ + return (poly8x8_t)__builtin_neon_vreinterpretv8qiv8qi ((int8x8_t) __a); +} + +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +vreinterpret_p8_u16 (uint16x4_t __a) +{ + return (poly8x8_t)__builtin_neon_vreinterpretv8qiv4hi ((int16x4_t) __a); +} + +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +vreinterpret_p8_u32 (uint32x2_t __a) +{ + return (poly8x8_t)__builtin_neon_vreinterpretv8qiv2si ((int32x2_t) __a); +} + +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +vreinterpret_p8_u64 (uint64x1_t __a) +{ + return (poly8x8_t)__builtin_neon_vreinterpretv8qidi ((int64x1_t) __a); +} + +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +vreinterpret_p8_p16 (poly16x4_t __a) +{ + return (poly8x8_t)__builtin_neon_vreinterpretv8qiv4hi ((int16x4_t) __a); +} + +__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +vreinterpretq_p8_s8 (int8x16_t __a) +{ + return (poly8x16_t)__builtin_neon_vreinterpretv16qiv16qi (__a); +} + +__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +vreinterpretq_p8_s16 (int16x8_t __a) +{ + return (poly8x16_t)__builtin_neon_vreinterpretv16qiv8hi (__a); +} + +__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +vreinterpretq_p8_s32 (int32x4_t __a) +{ + return (poly8x16_t)__builtin_neon_vreinterpretv16qiv4si (__a); +} + +__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +vreinterpretq_p8_s64 (int64x2_t __a) +{ + return (poly8x16_t)__builtin_neon_vreinterpretv16qiv2di (__a); +} + +__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +vreinterpretq_p8_f32 (float32x4_t __a) +{ + return (poly8x16_t)__builtin_neon_vreinterpretv16qiv4sf (__a); +} + +__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +vreinterpretq_p8_u8 (uint8x16_t __a) +{ + return (poly8x16_t)__builtin_neon_vreinterpretv16qiv16qi ((int8x16_t) __a); +} + +__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +vreinterpretq_p8_u16 (uint16x8_t __a) +{ + return (poly8x16_t)__builtin_neon_vreinterpretv16qiv8hi ((int16x8_t) __a); +} + +__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +vreinterpretq_p8_u32 (uint32x4_t __a) +{ + return (poly8x16_t)__builtin_neon_vreinterpretv16qiv4si ((int32x4_t) __a); +} + +__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +vreinterpretq_p8_u64 (uint64x2_t __a) +{ + return (poly8x16_t)__builtin_neon_vreinterpretv16qiv2di ((int64x2_t) __a); +} + +__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +vreinterpretq_p8_p16 (poly16x8_t __a) +{ + return (poly8x16_t)__builtin_neon_vreinterpretv16qiv8hi ((int16x8_t) __a); +} + +__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) +vreinterpret_p16_s8 (int8x8_t __a) +{ + return (poly16x4_t)__builtin_neon_vreinterpretv4hiv8qi (__a); +} + +__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) +vreinterpret_p16_s16 (int16x4_t __a) +{ + return (poly16x4_t)__builtin_neon_vreinterpretv4hiv4hi (__a); +} + +__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) +vreinterpret_p16_s32 (int32x2_t __a) +{ + return (poly16x4_t)__builtin_neon_vreinterpretv4hiv2si (__a); +} + +__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) +vreinterpret_p16_s64 (int64x1_t __a) +{ + return (poly16x4_t)__builtin_neon_vreinterpretv4hidi (__a); +} + +__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) +vreinterpret_p16_f32 (float32x2_t __a) +{ + return (poly16x4_t)__builtin_neon_vreinterpretv4hiv2sf (__a); +} + +__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) +vreinterpret_p16_u8 (uint8x8_t __a) +{ + return (poly16x4_t)__builtin_neon_vreinterpretv4hiv8qi ((int8x8_t) __a); +} + +__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) +vreinterpret_p16_u16 (uint16x4_t __a) +{ + return (poly16x4_t)__builtin_neon_vreinterpretv4hiv4hi ((int16x4_t) __a); +} + +__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) +vreinterpret_p16_u32 (uint32x2_t __a) +{ + return (poly16x4_t)__builtin_neon_vreinterpretv4hiv2si ((int32x2_t) __a); +} + +__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) +vreinterpret_p16_u64 (uint64x1_t __a) +{ + return (poly16x4_t)__builtin_neon_vreinterpretv4hidi ((int64x1_t) __a); +} + +__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) +vreinterpret_p16_p8 (poly8x8_t __a) +{ + return (poly16x4_t)__builtin_neon_vreinterpretv4hiv8qi ((int8x8_t) __a); +} + +__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) +vreinterpretq_p16_s8 (int8x16_t __a) +{ + return (poly16x8_t)__builtin_neon_vreinterpretv8hiv16qi (__a); +} + +__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) +vreinterpretq_p16_s16 (int16x8_t __a) +{ + return (poly16x8_t)__builtin_neon_vreinterpretv8hiv8hi (__a); +} + +__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) +vreinterpretq_p16_s32 (int32x4_t __a) +{ + return (poly16x8_t)__builtin_neon_vreinterpretv8hiv4si (__a); +} + +__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) +vreinterpretq_p16_s64 (int64x2_t __a) +{ + return (poly16x8_t)__builtin_neon_vreinterpretv8hiv2di (__a); +} + +__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) +vreinterpretq_p16_f32 (float32x4_t __a) +{ + return (poly16x8_t)__builtin_neon_vreinterpretv8hiv4sf (__a); +} + +__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) +vreinterpretq_p16_u8 (uint8x16_t __a) +{ + return (poly16x8_t)__builtin_neon_vreinterpretv8hiv16qi ((int8x16_t) __a); +} + +__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) +vreinterpretq_p16_u16 (uint16x8_t __a) +{ + return (poly16x8_t)__builtin_neon_vreinterpretv8hiv8hi ((int16x8_t) __a); +} + +__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) +vreinterpretq_p16_u32 (uint32x4_t __a) +{ + return (poly16x8_t)__builtin_neon_vreinterpretv8hiv4si ((int32x4_t) __a); +} + +__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) +vreinterpretq_p16_u64 (uint64x2_t __a) +{ + return (poly16x8_t)__builtin_neon_vreinterpretv8hiv2di ((int64x2_t) __a); +} + +__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) +vreinterpretq_p16_p8 (poly8x16_t __a) +{ + return (poly16x8_t)__builtin_neon_vreinterpretv8hiv16qi ((int8x16_t) __a); +} + +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vreinterpret_f32_s8 (int8x8_t __a) +{ + return (float32x2_t)__builtin_neon_vreinterpretv2sfv8qi (__a); +} + +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vreinterpret_f32_s16 (int16x4_t __a) +{ + return (float32x2_t)__builtin_neon_vreinterpretv2sfv4hi (__a); +} + +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vreinterpret_f32_s32 (int32x2_t __a) +{ + return (float32x2_t)__builtin_neon_vreinterpretv2sfv2si (__a); +} + +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vreinterpret_f32_s64 (int64x1_t __a) +{ + return (float32x2_t)__builtin_neon_vreinterpretv2sfdi (__a); +} + +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vreinterpret_f32_u8 (uint8x8_t __a) +{ + return (float32x2_t)__builtin_neon_vreinterpretv2sfv8qi ((int8x8_t) __a); +} + +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vreinterpret_f32_u16 (uint16x4_t __a) +{ + return (float32x2_t)__builtin_neon_vreinterpretv2sfv4hi ((int16x4_t) __a); +} + +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vreinterpret_f32_u32 (uint32x2_t __a) +{ + return (float32x2_t)__builtin_neon_vreinterpretv2sfv2si ((int32x2_t) __a); +} + +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vreinterpret_f32_u64 (uint64x1_t __a) +{ + return (float32x2_t)__builtin_neon_vreinterpretv2sfdi ((int64x1_t) __a); +} + +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vreinterpret_f32_p8 (poly8x8_t __a) +{ + return (float32x2_t)__builtin_neon_vreinterpretv2sfv8qi ((int8x8_t) __a); +} + +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vreinterpret_f32_p16 (poly16x4_t __a) +{ + return (float32x2_t)__builtin_neon_vreinterpretv2sfv4hi ((int16x4_t) __a); +} + +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vreinterpretq_f32_s8 (int8x16_t __a) +{ + return (float32x4_t)__builtin_neon_vreinterpretv4sfv16qi (__a); +} + +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vreinterpretq_f32_s16 (int16x8_t __a) +{ + return (float32x4_t)__builtin_neon_vreinterpretv4sfv8hi (__a); +} + +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vreinterpretq_f32_s32 (int32x4_t __a) +{ + return (float32x4_t)__builtin_neon_vreinterpretv4sfv4si (__a); +} + +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vreinterpretq_f32_s64 (int64x2_t __a) +{ + return (float32x4_t)__builtin_neon_vreinterpretv4sfv2di (__a); +} + +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vreinterpretq_f32_u8 (uint8x16_t __a) +{ + return (float32x4_t)__builtin_neon_vreinterpretv4sfv16qi ((int8x16_t) __a); +} + +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vreinterpretq_f32_u16 (uint16x8_t __a) +{ + return (float32x4_t)__builtin_neon_vreinterpretv4sfv8hi ((int16x8_t) __a); +} + +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vreinterpretq_f32_u32 (uint32x4_t __a) +{ + return (float32x4_t)__builtin_neon_vreinterpretv4sfv4si ((int32x4_t) __a); +} + +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vreinterpretq_f32_u64 (uint64x2_t __a) +{ + return (float32x4_t)__builtin_neon_vreinterpretv4sfv2di ((int64x2_t) __a); +} + +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vreinterpretq_f32_p8 (poly8x16_t __a) +{ + return (float32x4_t)__builtin_neon_vreinterpretv4sfv16qi ((int8x16_t) __a); +} + +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vreinterpretq_f32_p16 (poly16x8_t __a) +{ + return (float32x4_t)__builtin_neon_vreinterpretv4sfv8hi ((int16x8_t) __a); +} + +__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +vreinterpret_s64_s8 (int8x8_t __a) +{ + return (int64x1_t)__builtin_neon_vreinterpretdiv8qi (__a); +} + +__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +vreinterpret_s64_s16 (int16x4_t __a) +{ + return (int64x1_t)__builtin_neon_vreinterpretdiv4hi (__a); +} + +__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +vreinterpret_s64_s32 (int32x2_t __a) +{ + return (int64x1_t)__builtin_neon_vreinterpretdiv2si (__a); +} + +__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +vreinterpret_s64_f32 (float32x2_t __a) +{ + return (int64x1_t)__builtin_neon_vreinterpretdiv2sf (__a); +} + +__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +vreinterpret_s64_u8 (uint8x8_t __a) +{ + return (int64x1_t)__builtin_neon_vreinterpretdiv8qi ((int8x8_t) __a); +} + +__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +vreinterpret_s64_u16 (uint16x4_t __a) +{ + return (int64x1_t)__builtin_neon_vreinterpretdiv4hi ((int16x4_t) __a); +} + +__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +vreinterpret_s64_u32 (uint32x2_t __a) +{ + return (int64x1_t)__builtin_neon_vreinterpretdiv2si ((int32x2_t) __a); +} + +__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +vreinterpret_s64_u64 (uint64x1_t __a) +{ + return (int64x1_t)__builtin_neon_vreinterpretdidi ((int64x1_t) __a); +} + +__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +vreinterpret_s64_p8 (poly8x8_t __a) +{ + return (int64x1_t)__builtin_neon_vreinterpretdiv8qi ((int8x8_t) __a); +} + +__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +vreinterpret_s64_p16 (poly16x4_t __a) +{ + return (int64x1_t)__builtin_neon_vreinterpretdiv4hi ((int16x4_t) __a); +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vreinterpretq_s64_s8 (int8x16_t __a) +{ + return (int64x2_t)__builtin_neon_vreinterpretv2div16qi (__a); +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vreinterpretq_s64_s16 (int16x8_t __a) +{ + return (int64x2_t)__builtin_neon_vreinterpretv2div8hi (__a); +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vreinterpretq_s64_s32 (int32x4_t __a) +{ + return (int64x2_t)__builtin_neon_vreinterpretv2div4si (__a); +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vreinterpretq_s64_f32 (float32x4_t __a) +{ + return (int64x2_t)__builtin_neon_vreinterpretv2div4sf (__a); +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vreinterpretq_s64_u8 (uint8x16_t __a) +{ + return (int64x2_t)__builtin_neon_vreinterpretv2div16qi ((int8x16_t) __a); +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vreinterpretq_s64_u16 (uint16x8_t __a) +{ + return (int64x2_t)__builtin_neon_vreinterpretv2div8hi ((int16x8_t) __a); +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vreinterpretq_s64_u32 (uint32x4_t __a) +{ + return (int64x2_t)__builtin_neon_vreinterpretv2div4si ((int32x4_t) __a); +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vreinterpretq_s64_u64 (uint64x2_t __a) +{ + return (int64x2_t)__builtin_neon_vreinterpretv2div2di ((int64x2_t) __a); +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vreinterpretq_s64_p8 (poly8x16_t __a) +{ + return (int64x2_t)__builtin_neon_vreinterpretv2div16qi ((int8x16_t) __a); +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vreinterpretq_s64_p16 (poly16x8_t __a) +{ + return (int64x2_t)__builtin_neon_vreinterpretv2div8hi ((int16x8_t) __a); +} + +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vreinterpret_u64_s8 (int8x8_t __a) +{ + return (uint64x1_t)__builtin_neon_vreinterpretdiv8qi (__a); +} + +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vreinterpret_u64_s16 (int16x4_t __a) +{ + return (uint64x1_t)__builtin_neon_vreinterpretdiv4hi (__a); +} + +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vreinterpret_u64_s32 (int32x2_t __a) +{ + return (uint64x1_t)__builtin_neon_vreinterpretdiv2si (__a); +} + +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vreinterpret_u64_s64 (int64x1_t __a) +{ + return (uint64x1_t)__builtin_neon_vreinterpretdidi (__a); +} + +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vreinterpret_u64_f32 (float32x2_t __a) +{ + return (uint64x1_t)__builtin_neon_vreinterpretdiv2sf (__a); +} + +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vreinterpret_u64_u8 (uint8x8_t __a) +{ + return (uint64x1_t)__builtin_neon_vreinterpretdiv8qi ((int8x8_t) __a); +} + +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vreinterpret_u64_u16 (uint16x4_t __a) +{ + return (uint64x1_t)__builtin_neon_vreinterpretdiv4hi ((int16x4_t) __a); +} + +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vreinterpret_u64_u32 (uint32x2_t __a) +{ + return (uint64x1_t)__builtin_neon_vreinterpretdiv2si ((int32x2_t) __a); +} + +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vreinterpret_u64_p8 (poly8x8_t __a) +{ + return (uint64x1_t)__builtin_neon_vreinterpretdiv8qi ((int8x8_t) __a); +} + +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vreinterpret_u64_p16 (poly16x4_t __a) +{ + return (uint64x1_t)__builtin_neon_vreinterpretdiv4hi ((int16x4_t) __a); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vreinterpretq_u64_s8 (int8x16_t __a) +{ + return (uint64x2_t)__builtin_neon_vreinterpretv2div16qi (__a); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vreinterpretq_u64_s16 (int16x8_t __a) +{ + return (uint64x2_t)__builtin_neon_vreinterpretv2div8hi (__a); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vreinterpretq_u64_s32 (int32x4_t __a) +{ + return (uint64x2_t)__builtin_neon_vreinterpretv2div4si (__a); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vreinterpretq_u64_s64 (int64x2_t __a) +{ + return (uint64x2_t)__builtin_neon_vreinterpretv2div2di (__a); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vreinterpretq_u64_f32 (float32x4_t __a) +{ + return (uint64x2_t)__builtin_neon_vreinterpretv2div4sf (__a); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vreinterpretq_u64_u8 (uint8x16_t __a) +{ + return (uint64x2_t)__builtin_neon_vreinterpretv2div16qi ((int8x16_t) __a); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vreinterpretq_u64_u16 (uint16x8_t __a) +{ + return (uint64x2_t)__builtin_neon_vreinterpretv2div8hi ((int16x8_t) __a); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vreinterpretq_u64_u32 (uint32x4_t __a) +{ + return (uint64x2_t)__builtin_neon_vreinterpretv2div4si ((int32x4_t) __a); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vreinterpretq_u64_p8 (poly8x16_t __a) +{ + return (uint64x2_t)__builtin_neon_vreinterpretv2div16qi ((int8x16_t) __a); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vreinterpretq_u64_p16 (poly16x8_t __a) +{ + return (uint64x2_t)__builtin_neon_vreinterpretv2div8hi ((int16x8_t) __a); +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vreinterpret_s8_s16 (int16x4_t __a) +{ + return (int8x8_t)__builtin_neon_vreinterpretv8qiv4hi (__a); +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vreinterpret_s8_s32 (int32x2_t __a) +{ + return (int8x8_t)__builtin_neon_vreinterpretv8qiv2si (__a); +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vreinterpret_s8_s64 (int64x1_t __a) +{ + return (int8x8_t)__builtin_neon_vreinterpretv8qidi (__a); +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vreinterpret_s8_f32 (float32x2_t __a) +{ + return (int8x8_t)__builtin_neon_vreinterpretv8qiv2sf (__a); +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vreinterpret_s8_u8 (uint8x8_t __a) +{ + return (int8x8_t)__builtin_neon_vreinterpretv8qiv8qi ((int8x8_t) __a); +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vreinterpret_s8_u16 (uint16x4_t __a) +{ + return (int8x8_t)__builtin_neon_vreinterpretv8qiv4hi ((int16x4_t) __a); +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vreinterpret_s8_u32 (uint32x2_t __a) +{ + return (int8x8_t)__builtin_neon_vreinterpretv8qiv2si ((int32x2_t) __a); +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vreinterpret_s8_u64 (uint64x1_t __a) +{ + return (int8x8_t)__builtin_neon_vreinterpretv8qidi ((int64x1_t) __a); +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vreinterpret_s8_p8 (poly8x8_t __a) +{ + return (int8x8_t)__builtin_neon_vreinterpretv8qiv8qi ((int8x8_t) __a); +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vreinterpret_s8_p16 (poly16x4_t __a) +{ + return (int8x8_t)__builtin_neon_vreinterpretv8qiv4hi ((int16x4_t) __a); +} + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vreinterpretq_s8_s16 (int16x8_t __a) +{ + return (int8x16_t)__builtin_neon_vreinterpretv16qiv8hi (__a); +} + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vreinterpretq_s8_s32 (int32x4_t __a) +{ + return (int8x16_t)__builtin_neon_vreinterpretv16qiv4si (__a); +} + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vreinterpretq_s8_s64 (int64x2_t __a) +{ + return (int8x16_t)__builtin_neon_vreinterpretv16qiv2di (__a); +} + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vreinterpretq_s8_f32 (float32x4_t __a) +{ + return (int8x16_t)__builtin_neon_vreinterpretv16qiv4sf (__a); +} + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vreinterpretq_s8_u8 (uint8x16_t __a) +{ + return (int8x16_t)__builtin_neon_vreinterpretv16qiv16qi ((int8x16_t) __a); +} + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vreinterpretq_s8_u16 (uint16x8_t __a) +{ + return (int8x16_t)__builtin_neon_vreinterpretv16qiv8hi ((int16x8_t) __a); +} + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vreinterpretq_s8_u32 (uint32x4_t __a) +{ + return (int8x16_t)__builtin_neon_vreinterpretv16qiv4si ((int32x4_t) __a); +} + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vreinterpretq_s8_u64 (uint64x2_t __a) +{ + return (int8x16_t)__builtin_neon_vreinterpretv16qiv2di ((int64x2_t) __a); +} + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vreinterpretq_s8_p8 (poly8x16_t __a) +{ + return (int8x16_t)__builtin_neon_vreinterpretv16qiv16qi ((int8x16_t) __a); +} + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vreinterpretq_s8_p16 (poly16x8_t __a) +{ + return (int8x16_t)__builtin_neon_vreinterpretv16qiv8hi ((int16x8_t) __a); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vreinterpret_s16_s8 (int8x8_t __a) +{ + return (int16x4_t)__builtin_neon_vreinterpretv4hiv8qi (__a); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vreinterpret_s16_s32 (int32x2_t __a) +{ + return (int16x4_t)__builtin_neon_vreinterpretv4hiv2si (__a); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vreinterpret_s16_s64 (int64x1_t __a) +{ + return (int16x4_t)__builtin_neon_vreinterpretv4hidi (__a); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vreinterpret_s16_f32 (float32x2_t __a) +{ + return (int16x4_t)__builtin_neon_vreinterpretv4hiv2sf (__a); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vreinterpret_s16_u8 (uint8x8_t __a) +{ + return (int16x4_t)__builtin_neon_vreinterpretv4hiv8qi ((int8x8_t) __a); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vreinterpret_s16_u16 (uint16x4_t __a) +{ + return (int16x4_t)__builtin_neon_vreinterpretv4hiv4hi ((int16x4_t) __a); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vreinterpret_s16_u32 (uint32x2_t __a) +{ + return (int16x4_t)__builtin_neon_vreinterpretv4hiv2si ((int32x2_t) __a); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vreinterpret_s16_u64 (uint64x1_t __a) +{ + return (int16x4_t)__builtin_neon_vreinterpretv4hidi ((int64x1_t) __a); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vreinterpret_s16_p8 (poly8x8_t __a) +{ + return (int16x4_t)__builtin_neon_vreinterpretv4hiv8qi ((int8x8_t) __a); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vreinterpret_s16_p16 (poly16x4_t __a) +{ + return (int16x4_t)__builtin_neon_vreinterpretv4hiv4hi ((int16x4_t) __a); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vreinterpretq_s16_s8 (int8x16_t __a) +{ + return (int16x8_t)__builtin_neon_vreinterpretv8hiv16qi (__a); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vreinterpretq_s16_s32 (int32x4_t __a) +{ + return (int16x8_t)__builtin_neon_vreinterpretv8hiv4si (__a); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vreinterpretq_s16_s64 (int64x2_t __a) +{ + return (int16x8_t)__builtin_neon_vreinterpretv8hiv2di (__a); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vreinterpretq_s16_f32 (float32x4_t __a) +{ + return (int16x8_t)__builtin_neon_vreinterpretv8hiv4sf (__a); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vreinterpretq_s16_u8 (uint8x16_t __a) +{ + return (int16x8_t)__builtin_neon_vreinterpretv8hiv16qi ((int8x16_t) __a); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vreinterpretq_s16_u16 (uint16x8_t __a) +{ + return (int16x8_t)__builtin_neon_vreinterpretv8hiv8hi ((int16x8_t) __a); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vreinterpretq_s16_u32 (uint32x4_t __a) +{ + return (int16x8_t)__builtin_neon_vreinterpretv8hiv4si ((int32x4_t) __a); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vreinterpretq_s16_u64 (uint64x2_t __a) +{ + return (int16x8_t)__builtin_neon_vreinterpretv8hiv2di ((int64x2_t) __a); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vreinterpretq_s16_p8 (poly8x16_t __a) +{ + return (int16x8_t)__builtin_neon_vreinterpretv8hiv16qi ((int8x16_t) __a); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vreinterpretq_s16_p16 (poly16x8_t __a) +{ + return (int16x8_t)__builtin_neon_vreinterpretv8hiv8hi ((int16x8_t) __a); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vreinterpret_s32_s8 (int8x8_t __a) +{ + return (int32x2_t)__builtin_neon_vreinterpretv2siv8qi (__a); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vreinterpret_s32_s16 (int16x4_t __a) +{ + return (int32x2_t)__builtin_neon_vreinterpretv2siv4hi (__a); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vreinterpret_s32_s64 (int64x1_t __a) +{ + return (int32x2_t)__builtin_neon_vreinterpretv2sidi (__a); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vreinterpret_s32_f32 (float32x2_t __a) +{ + return (int32x2_t)__builtin_neon_vreinterpretv2siv2sf (__a); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vreinterpret_s32_u8 (uint8x8_t __a) +{ + return (int32x2_t)__builtin_neon_vreinterpretv2siv8qi ((int8x8_t) __a); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vreinterpret_s32_u16 (uint16x4_t __a) +{ + return (int32x2_t)__builtin_neon_vreinterpretv2siv4hi ((int16x4_t) __a); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vreinterpret_s32_u32 (uint32x2_t __a) +{ + return (int32x2_t)__builtin_neon_vreinterpretv2siv2si ((int32x2_t) __a); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vreinterpret_s32_u64 (uint64x1_t __a) +{ + return (int32x2_t)__builtin_neon_vreinterpretv2sidi ((int64x1_t) __a); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vreinterpret_s32_p8 (poly8x8_t __a) +{ + return (int32x2_t)__builtin_neon_vreinterpretv2siv8qi ((int8x8_t) __a); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vreinterpret_s32_p16 (poly16x4_t __a) +{ + return (int32x2_t)__builtin_neon_vreinterpretv2siv4hi ((int16x4_t) __a); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vreinterpretq_s32_s8 (int8x16_t __a) +{ + return (int32x4_t)__builtin_neon_vreinterpretv4siv16qi (__a); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vreinterpretq_s32_s16 (int16x8_t __a) +{ + return (int32x4_t)__builtin_neon_vreinterpretv4siv8hi (__a); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vreinterpretq_s32_s64 (int64x2_t __a) +{ + return (int32x4_t)__builtin_neon_vreinterpretv4siv2di (__a); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vreinterpretq_s32_f32 (float32x4_t __a) +{ + return (int32x4_t)__builtin_neon_vreinterpretv4siv4sf (__a); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vreinterpretq_s32_u8 (uint8x16_t __a) +{ + return (int32x4_t)__builtin_neon_vreinterpretv4siv16qi ((int8x16_t) __a); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vreinterpretq_s32_u16 (uint16x8_t __a) +{ + return (int32x4_t)__builtin_neon_vreinterpretv4siv8hi ((int16x8_t) __a); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vreinterpretq_s32_u32 (uint32x4_t __a) +{ + return (int32x4_t)__builtin_neon_vreinterpretv4siv4si ((int32x4_t) __a); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vreinterpretq_s32_u64 (uint64x2_t __a) +{ + return (int32x4_t)__builtin_neon_vreinterpretv4siv2di ((int64x2_t) __a); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vreinterpretq_s32_p8 (poly8x16_t __a) +{ + return (int32x4_t)__builtin_neon_vreinterpretv4siv16qi ((int8x16_t) __a); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vreinterpretq_s32_p16 (poly16x8_t __a) +{ + return (int32x4_t)__builtin_neon_vreinterpretv4siv8hi ((int16x8_t) __a); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vreinterpret_u8_s8 (int8x8_t __a) +{ + return (uint8x8_t)__builtin_neon_vreinterpretv8qiv8qi (__a); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vreinterpret_u8_s16 (int16x4_t __a) +{ + return (uint8x8_t)__builtin_neon_vreinterpretv8qiv4hi (__a); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vreinterpret_u8_s32 (int32x2_t __a) +{ + return (uint8x8_t)__builtin_neon_vreinterpretv8qiv2si (__a); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vreinterpret_u8_s64 (int64x1_t __a) +{ + return (uint8x8_t)__builtin_neon_vreinterpretv8qidi (__a); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vreinterpret_u8_f32 (float32x2_t __a) +{ + return (uint8x8_t)__builtin_neon_vreinterpretv8qiv2sf (__a); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vreinterpret_u8_u16 (uint16x4_t __a) +{ + return (uint8x8_t)__builtin_neon_vreinterpretv8qiv4hi ((int16x4_t) __a); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vreinterpret_u8_u32 (uint32x2_t __a) +{ + return (uint8x8_t)__builtin_neon_vreinterpretv8qiv2si ((int32x2_t) __a); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vreinterpret_u8_u64 (uint64x1_t __a) +{ + return (uint8x8_t)__builtin_neon_vreinterpretv8qidi ((int64x1_t) __a); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vreinterpret_u8_p8 (poly8x8_t __a) +{ + return (uint8x8_t)__builtin_neon_vreinterpretv8qiv8qi ((int8x8_t) __a); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vreinterpret_u8_p16 (poly16x4_t __a) +{ + return (uint8x8_t)__builtin_neon_vreinterpretv8qiv4hi ((int16x4_t) __a); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vreinterpretq_u8_s8 (int8x16_t __a) +{ + return (uint8x16_t)__builtin_neon_vreinterpretv16qiv16qi (__a); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vreinterpretq_u8_s16 (int16x8_t __a) +{ + return (uint8x16_t)__builtin_neon_vreinterpretv16qiv8hi (__a); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vreinterpretq_u8_s32 (int32x4_t __a) +{ + return (uint8x16_t)__builtin_neon_vreinterpretv16qiv4si (__a); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vreinterpretq_u8_s64 (int64x2_t __a) +{ + return (uint8x16_t)__builtin_neon_vreinterpretv16qiv2di (__a); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vreinterpretq_u8_f32 (float32x4_t __a) +{ + return (uint8x16_t)__builtin_neon_vreinterpretv16qiv4sf (__a); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vreinterpretq_u8_u16 (uint16x8_t __a) +{ + return (uint8x16_t)__builtin_neon_vreinterpretv16qiv8hi ((int16x8_t) __a); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vreinterpretq_u8_u32 (uint32x4_t __a) +{ + return (uint8x16_t)__builtin_neon_vreinterpretv16qiv4si ((int32x4_t) __a); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vreinterpretq_u8_u64 (uint64x2_t __a) +{ + return (uint8x16_t)__builtin_neon_vreinterpretv16qiv2di ((int64x2_t) __a); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vreinterpretq_u8_p8 (poly8x16_t __a) +{ + return (uint8x16_t)__builtin_neon_vreinterpretv16qiv16qi ((int8x16_t) __a); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vreinterpretq_u8_p16 (poly16x8_t __a) +{ + return (uint8x16_t)__builtin_neon_vreinterpretv16qiv8hi ((int16x8_t) __a); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vreinterpret_u16_s8 (int8x8_t __a) +{ + return (uint16x4_t)__builtin_neon_vreinterpretv4hiv8qi (__a); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vreinterpret_u16_s16 (int16x4_t __a) +{ + return (uint16x4_t)__builtin_neon_vreinterpretv4hiv4hi (__a); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vreinterpret_u16_s32 (int32x2_t __a) +{ + return (uint16x4_t)__builtin_neon_vreinterpretv4hiv2si (__a); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vreinterpret_u16_s64 (int64x1_t __a) +{ + return (uint16x4_t)__builtin_neon_vreinterpretv4hidi (__a); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vreinterpret_u16_f32 (float32x2_t __a) +{ + return (uint16x4_t)__builtin_neon_vreinterpretv4hiv2sf (__a); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vreinterpret_u16_u8 (uint8x8_t __a) +{ + return (uint16x4_t)__builtin_neon_vreinterpretv4hiv8qi ((int8x8_t) __a); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vreinterpret_u16_u32 (uint32x2_t __a) +{ + return (uint16x4_t)__builtin_neon_vreinterpretv4hiv2si ((int32x2_t) __a); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vreinterpret_u16_u64 (uint64x1_t __a) +{ + return (uint16x4_t)__builtin_neon_vreinterpretv4hidi ((int64x1_t) __a); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vreinterpret_u16_p8 (poly8x8_t __a) +{ + return (uint16x4_t)__builtin_neon_vreinterpretv4hiv8qi ((int8x8_t) __a); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vreinterpret_u16_p16 (poly16x4_t __a) +{ + return (uint16x4_t)__builtin_neon_vreinterpretv4hiv4hi ((int16x4_t) __a); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vreinterpretq_u16_s8 (int8x16_t __a) +{ + return (uint16x8_t)__builtin_neon_vreinterpretv8hiv16qi (__a); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vreinterpretq_u16_s16 (int16x8_t __a) +{ + return (uint16x8_t)__builtin_neon_vreinterpretv8hiv8hi (__a); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vreinterpretq_u16_s32 (int32x4_t __a) +{ + return (uint16x8_t)__builtin_neon_vreinterpretv8hiv4si (__a); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vreinterpretq_u16_s64 (int64x2_t __a) +{ + return (uint16x8_t)__builtin_neon_vreinterpretv8hiv2di (__a); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vreinterpretq_u16_f32 (float32x4_t __a) +{ + return (uint16x8_t)__builtin_neon_vreinterpretv8hiv4sf (__a); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vreinterpretq_u16_u8 (uint8x16_t __a) +{ + return (uint16x8_t)__builtin_neon_vreinterpretv8hiv16qi ((int8x16_t) __a); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vreinterpretq_u16_u32 (uint32x4_t __a) +{ + return (uint16x8_t)__builtin_neon_vreinterpretv8hiv4si ((int32x4_t) __a); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vreinterpretq_u16_u64 (uint64x2_t __a) +{ + return (uint16x8_t)__builtin_neon_vreinterpretv8hiv2di ((int64x2_t) __a); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vreinterpretq_u16_p8 (poly8x16_t __a) +{ + return (uint16x8_t)__builtin_neon_vreinterpretv8hiv16qi ((int8x16_t) __a); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vreinterpretq_u16_p16 (poly16x8_t __a) +{ + return (uint16x8_t)__builtin_neon_vreinterpretv8hiv8hi ((int16x8_t) __a); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vreinterpret_u32_s8 (int8x8_t __a) +{ + return (uint32x2_t)__builtin_neon_vreinterpretv2siv8qi (__a); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vreinterpret_u32_s16 (int16x4_t __a) +{ + return (uint32x2_t)__builtin_neon_vreinterpretv2siv4hi (__a); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vreinterpret_u32_s32 (int32x2_t __a) +{ + return (uint32x2_t)__builtin_neon_vreinterpretv2siv2si (__a); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vreinterpret_u32_s64 (int64x1_t __a) +{ + return (uint32x2_t)__builtin_neon_vreinterpretv2sidi (__a); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vreinterpret_u32_f32 (float32x2_t __a) +{ + return (uint32x2_t)__builtin_neon_vreinterpretv2siv2sf (__a); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vreinterpret_u32_u8 (uint8x8_t __a) +{ + return (uint32x2_t)__builtin_neon_vreinterpretv2siv8qi ((int8x8_t) __a); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vreinterpret_u32_u16 (uint16x4_t __a) +{ + return (uint32x2_t)__builtin_neon_vreinterpretv2siv4hi ((int16x4_t) __a); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vreinterpret_u32_u64 (uint64x1_t __a) +{ + return (uint32x2_t)__builtin_neon_vreinterpretv2sidi ((int64x1_t) __a); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vreinterpret_u32_p8 (poly8x8_t __a) +{ + return (uint32x2_t)__builtin_neon_vreinterpretv2siv8qi ((int8x8_t) __a); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vreinterpret_u32_p16 (poly16x4_t __a) +{ + return (uint32x2_t)__builtin_neon_vreinterpretv2siv4hi ((int16x4_t) __a); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vreinterpretq_u32_s8 (int8x16_t __a) +{ + return (uint32x4_t)__builtin_neon_vreinterpretv4siv16qi (__a); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vreinterpretq_u32_s16 (int16x8_t __a) +{ + return (uint32x4_t)__builtin_neon_vreinterpretv4siv8hi (__a); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vreinterpretq_u32_s32 (int32x4_t __a) +{ + return (uint32x4_t)__builtin_neon_vreinterpretv4siv4si (__a); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vreinterpretq_u32_s64 (int64x2_t __a) +{ + return (uint32x4_t)__builtin_neon_vreinterpretv4siv2di (__a); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vreinterpretq_u32_f32 (float32x4_t __a) +{ + return (uint32x4_t)__builtin_neon_vreinterpretv4siv4sf (__a); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vreinterpretq_u32_u8 (uint8x16_t __a) +{ + return (uint32x4_t)__builtin_neon_vreinterpretv4siv16qi ((int8x16_t) __a); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vreinterpretq_u32_u16 (uint16x8_t __a) +{ + return (uint32x4_t)__builtin_neon_vreinterpretv4siv8hi ((int16x8_t) __a); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vreinterpretq_u32_u64 (uint64x2_t __a) +{ + return (uint32x4_t)__builtin_neon_vreinterpretv4siv2di ((int64x2_t) __a); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vreinterpretq_u32_p8 (poly8x16_t __a) +{ + return (uint32x4_t)__builtin_neon_vreinterpretv4siv16qi ((int8x16_t) __a); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vreinterpretq_u32_p16 (poly16x8_t __a) +{ + return (uint32x4_t)__builtin_neon_vreinterpretv4siv8hi ((int16x8_t) __a); +} + +#ifdef __cplusplus +} +#endif +#endif +#endif diff --git a/gcc/config/arm/bpabi-v6m.S b/gcc/config/arm/bpabi-v6m.S new file mode 100644 index 000000000..4ecea6da5 --- /dev/null +++ b/gcc/config/arm/bpabi-v6m.S @@ -0,0 +1,318 @@ +/* Miscellaneous BPABI functions. ARMv6M implementation + + Copyright (C) 2006, 2008, 2009, 2010 Free Software Foundation, Inc. + Contributed by CodeSourcery. + + This file is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by the + Free Software Foundation; either version 3, or (at your option) any + later version. + + This file is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +#ifdef __ARM_EABI__ +/* Some attributes that are common to all routines in this file. */ + /* Tag_ABI_align_needed: This code does not require 8-byte + alignment from the caller. */ + /* .eabi_attribute 24, 0 -- default setting. */ + /* Tag_ABI_align_preserved: This code preserves 8-byte + alignment in any callee. */ + .eabi_attribute 25, 1 +#endif /* __ARM_EABI__ */ + +#ifdef L_aeabi_lcmp + +FUNC_START aeabi_lcmp + cmp xxh, yyh + beq 1f + bgt 2f + mov r0, #1 + neg r0, r0 + RET +2: + mov r0, #1 + RET +1: + sub r0, xxl, yyl + beq 1f + bhi 2f + mov r0, #1 + neg r0, r0 + RET +2: + mov r0, #1 +1: + RET + FUNC_END aeabi_lcmp + +#endif /* L_aeabi_lcmp */ + +#ifdef L_aeabi_ulcmp + +FUNC_START aeabi_ulcmp + cmp xxh, yyh + bne 1f + sub r0, xxl, yyl + beq 2f +1: + bcs 1f + mov r0, #1 + neg r0, r0 + RET +1: + mov r0, #1 +2: + RET + FUNC_END aeabi_ulcmp + +#endif /* L_aeabi_ulcmp */ + +.macro test_div_by_zero signed + cmp yyh, #0 + bne 7f + cmp yyl, #0 + bne 7f + cmp xxh, #0 + bne 2f + cmp xxl, #0 +2: + .ifc \signed, unsigned + beq 3f + mov xxh, #0 + mvn xxh, xxh @ 0xffffffff + mov xxl, xxh +3: + .else + beq 5f + blt 6f + mov xxl, #0 + mvn xxl, xxl @ 0xffffffff + lsr xxh, xxl, #1 @ 0x7fffffff + b 5f +6: mov xxh, #0x80 + lsl xxh, xxh, #24 @ 0x80000000 + mov xxl, #0 +5: + .endif + @ tailcalls are tricky on v6-m. + push {r0, r1, r2} + ldr r0, 1f + adr r1, 1f + add r0, r1 + str r0, [sp, #8] + @ We know we are not on armv4t, so pop pc is safe. + pop {r0, r1, pc} + .align 2 +1: + .word __aeabi_ldiv0 - 1b +7: +.endm + +#ifdef L_aeabi_ldivmod + +FUNC_START aeabi_ldivmod + test_div_by_zero signed + + push {r0, r1} + mov r0, sp + push {r0, lr} + ldr r0, [sp, #8] + bl SYM(__gnu_ldivmod_helper) + ldr r3, [sp, #4] + mov lr, r3 + add sp, sp, #8 + pop {r2, r3} + RET + FUNC_END aeabi_ldivmod + +#endif /* L_aeabi_ldivmod */ + +#ifdef L_aeabi_uldivmod + +FUNC_START aeabi_uldivmod + test_div_by_zero unsigned + + push {r0, r1} + mov r0, sp + push {r0, lr} + ldr r0, [sp, #8] + bl SYM(__gnu_uldivmod_helper) + ldr r3, [sp, #4] + mov lr, r3 + add sp, sp, #8 + pop {r2, r3} + RET + FUNC_END aeabi_uldivmod + +#endif /* L_aeabi_uldivmod */ + +#ifdef L_arm_addsubsf3 + +FUNC_START aeabi_frsub + + push {r4, lr} + mov r4, #1 + lsl r4, #31 + eor r0, r0, r4 + bl __aeabi_fadd + pop {r4, pc} + + FUNC_END aeabi_frsub + +#endif /* L_arm_addsubsf3 */ + +#ifdef L_arm_cmpsf2 + +FUNC_START aeabi_cfrcmple + + mov ip, r0 + mov r0, r1 + mov r1, ip + b 6f + +FUNC_START aeabi_cfcmpeq +FUNC_ALIAS aeabi_cfcmple aeabi_cfcmpeq + + @ The status-returning routines are required to preserve all + @ registers except ip, lr, and cpsr. +6: push {r0, r1, r2, r3, r4, lr} + bl __lesf2 + @ Set the Z flag correctly, and the C flag unconditionally. + cmp r0, #0 + @ Clear the C flag if the return value was -1, indicating + @ that the first operand was smaller than the second. + bmi 1f + mov r1, #0 + cmn r0, r1 +1: + pop {r0, r1, r2, r3, r4, pc} + + FUNC_END aeabi_cfcmple + FUNC_END aeabi_cfcmpeq + FUNC_END aeabi_cfrcmple + +FUNC_START aeabi_fcmpeq + + push {r4, lr} + bl __eqsf2 + neg r0, r0 + add r0, r0, #1 + pop {r4, pc} + + FUNC_END aeabi_fcmpeq + +.macro COMPARISON cond, helper, mode=sf2 +FUNC_START aeabi_fcmp\cond + + push {r4, lr} + bl __\helper\mode + cmp r0, #0 + b\cond 1f + mov r0, #0 + pop {r4, pc} +1: + mov r0, #1 + pop {r4, pc} + + FUNC_END aeabi_fcmp\cond +.endm + +COMPARISON lt, le +COMPARISON le, le +COMPARISON gt, ge +COMPARISON ge, ge + +#endif /* L_arm_cmpsf2 */ + +#ifdef L_arm_addsubdf3 + +FUNC_START aeabi_drsub + + push {r4, lr} + mov r4, #1 + lsl r4, #31 + eor xxh, xxh, r4 + bl __aeabi_dadd + pop {r4, pc} + + FUNC_END aeabi_drsub + +#endif /* L_arm_addsubdf3 */ + +#ifdef L_arm_cmpdf2 + +FUNC_START aeabi_cdrcmple + + mov ip, r0 + mov r0, r2 + mov r2, ip + mov ip, r1 + mov r1, r3 + mov r3, ip + b 6f + +FUNC_START aeabi_cdcmpeq +FUNC_ALIAS aeabi_cdcmple aeabi_cdcmpeq + + @ The status-returning routines are required to preserve all + @ registers except ip, lr, and cpsr. +6: push {r0, r1, r2, r3, r4, lr} + bl __ledf2 + @ Set the Z flag correctly, and the C flag unconditionally. + cmp r0, #0 + @ Clear the C flag if the return value was -1, indicating + @ that the first operand was smaller than the second. + bmi 1f + mov r1, #0 + cmn r0, r1 +1: + pop {r0, r1, r2, r3, r4, pc} + + FUNC_END aeabi_cdcmple + FUNC_END aeabi_cdcmpeq + FUNC_END aeabi_cdrcmple + +FUNC_START aeabi_dcmpeq + + push {r4, lr} + bl __eqdf2 + neg r0, r0 + add r0, r0, #1 + pop {r4, pc} + + FUNC_END aeabi_dcmpeq + +.macro COMPARISON cond, helper, mode=df2 +FUNC_START aeabi_dcmp\cond + + push {r4, lr} + bl __\helper\mode + cmp r0, #0 + b\cond 1f + mov r0, #0 + pop {r4, pc} +1: + mov r0, #1 + pop {r4, pc} + + FUNC_END aeabi_dcmp\cond +.endm + +COMPARISON lt, le +COMPARISON le, le +COMPARISON gt, ge +COMPARISON ge, ge + +#endif /* L_arm_cmpdf2 */ diff --git a/gcc/config/arm/bpabi.S b/gcc/config/arm/bpabi.S new file mode 100644 index 000000000..2ff338927 --- /dev/null +++ b/gcc/config/arm/bpabi.S @@ -0,0 +1,163 @@ +/* Miscellaneous BPABI functions. + + Copyright (C) 2003, 2004, 2007, 2008, 2009, 2010 + Free Software Foundation, Inc. + Contributed by CodeSourcery, LLC. + + This file is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by the + Free Software Foundation; either version 3, or (at your option) any + later version. + + This file is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +#ifdef __ARM_EABI__ +/* Some attributes that are common to all routines in this file. */ + /* Tag_ABI_align_needed: This code does not require 8-byte + alignment from the caller. */ + /* .eabi_attribute 24, 0 -- default setting. */ + /* Tag_ABI_align_preserved: This code preserves 8-byte + alignment in any callee. */ + .eabi_attribute 25, 1 +#endif /* __ARM_EABI__ */ + +#ifdef L_aeabi_lcmp + +ARM_FUNC_START aeabi_lcmp + cmp xxh, yyh + do_it lt + movlt r0, #-1 + do_it gt + movgt r0, #1 + do_it ne + RETc(ne) + subs r0, xxl, yyl + do_it lo + movlo r0, #-1 + do_it hi + movhi r0, #1 + RET + FUNC_END aeabi_lcmp + +#endif /* L_aeabi_lcmp */ + +#ifdef L_aeabi_ulcmp + +ARM_FUNC_START aeabi_ulcmp + cmp xxh, yyh + do_it lo + movlo r0, #-1 + do_it hi + movhi r0, #1 + do_it ne + RETc(ne) + cmp xxl, yyl + do_it lo + movlo r0, #-1 + do_it hi + movhi r0, #1 + do_it eq + moveq r0, #0 + RET + FUNC_END aeabi_ulcmp + +#endif /* L_aeabi_ulcmp */ + +.macro test_div_by_zero signed +/* Tail-call to divide-by-zero handlers which may be overridden by the user, + so unwinding works properly. */ +#if defined(__thumb2__) + cbnz yyh, 1f + cbnz yyl, 1f + cmp xxh, #0 + do_it eq + cmpeq xxl, #0 + .ifc \signed, unsigned + beq 2f + mov xxh, #0xffffffff + mov xxl, xxh +2: + .else + do_it lt, t + movlt xxl, #0 + movlt xxh, #0x80000000 + do_it gt, t + movgt xxh, #0x7fffffff + movgt xxl, #0xffffffff + .endif + b SYM (__aeabi_ldiv0) __PLT__ +1: +#else + /* Note: Thumb-1 code calls via an ARM shim on processors which + support ARM mode. */ + cmp yyh, #0 + cmpeq yyl, #0 + bne 2f + cmp xxh, #0 + cmpeq xxl, #0 + .ifc \signed, unsigned + movne xxh, #0xffffffff + movne xxl, #0xffffffff + .else + movlt xxh, #0x80000000 + movlt xxl, #0 + movgt xxh, #0x7fffffff + movgt xxl, #0xffffffff + .endif + b SYM (__aeabi_ldiv0) __PLT__ +2: +#endif +.endm + +#ifdef L_aeabi_ldivmod + +ARM_FUNC_START aeabi_ldivmod + test_div_by_zero signed + + sub sp, sp, #8 +#if defined(__thumb2__) + mov ip, sp + push {ip, lr} +#else + do_push {sp, lr} +#endif + bl SYM(__gnu_ldivmod_helper) __PLT__ + ldr lr, [sp, #4] + add sp, sp, #8 + do_pop {r2, r3} + RET + +#endif /* L_aeabi_ldivmod */ + +#ifdef L_aeabi_uldivmod + +ARM_FUNC_START aeabi_uldivmod + test_div_by_zero unsigned + + sub sp, sp, #8 +#if defined(__thumb2__) + mov ip, sp + push {ip, lr} +#else + do_push {sp, lr} +#endif + bl SYM(__gnu_uldivmod_helper) __PLT__ + ldr lr, [sp, #4] + add sp, sp, #8 + do_pop {r2, r3} + RET + +#endif /* L_aeabi_divmod */ + diff --git a/gcc/config/arm/bpabi.c b/gcc/config/arm/bpabi.c new file mode 100644 index 000000000..283bdc0ac --- /dev/null +++ b/gcc/config/arm/bpabi.c @@ -0,0 +1,56 @@ +/* Miscellaneous BPABI functions. + + Copyright (C) 2003, 2004, 2009 Free Software Foundation, Inc. + Contributed by CodeSourcery, LLC. + + This file is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by the + Free Software Foundation; either version 3, or (at your option) any + later version. + + This file is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +extern long long __divdi3 (long long, long long); +extern unsigned long long __udivdi3 (unsigned long long, + unsigned long long); +extern long long __gnu_ldivmod_helper (long long, long long, long long *); +extern unsigned long long __gnu_uldivmod_helper (unsigned long long, + unsigned long long, + unsigned long long *); + + +long long +__gnu_ldivmod_helper (long long a, + long long b, + long long *remainder) +{ + long long quotient; + + quotient = __divdi3 (a, b); + *remainder = a - b * quotient; + return quotient; +} + +unsigned long long +__gnu_uldivmod_helper (unsigned long long a, + unsigned long long b, + unsigned long long *remainder) +{ + unsigned long long quotient; + + quotient = __udivdi3 (a, b); + *remainder = a - b * quotient; + return quotient; +} diff --git a/gcc/config/arm/bpabi.h b/gcc/config/arm/bpabi.h new file mode 100644 index 000000000..7b5ee6231 --- /dev/null +++ b/gcc/config/arm/bpabi.h @@ -0,0 +1,125 @@ +/* Configuration file for ARM BPABI targets. + Copyright (C) 2004, 2005, 2007, 2008, 2009, 2010 + Free Software Foundation, Inc. + Contributed by CodeSourcery, LLC + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3, or (at your + option) any later version. + + GCC is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public + License for more details. + + You should have received a copy of the GNU General Public License + along with GCC; see the file COPYING3. If not see + . */ + +/* Use the AAPCS ABI by default. */ +#define ARM_DEFAULT_ABI ARM_ABI_AAPCS + +/* Assume that AAPCS ABIs should adhere to the full BPABI. */ +#define TARGET_BPABI (TARGET_AAPCS_BASED) + +/* BPABI targets use EABI frame unwinding tables. */ +#undef ARM_UNWIND_INFO +#define ARM_UNWIND_INFO 1 + +/* Section 4.1 of the AAPCS requires the use of VFP format. */ +#undef FPUTYPE_DEFAULT +#define FPUTYPE_DEFAULT "vfp" + +/* TARGET_BIG_ENDIAN_DEFAULT is set in + config.gcc for big endian configurations. */ +#if TARGET_BIG_ENDIAN_DEFAULT +#define TARGET_ENDIAN_DEFAULT MASK_BIG_END +#else +#define TARGET_ENDIAN_DEFAULT 0 +#endif + +/* EABI targets should enable interworking by default. */ +#undef TARGET_DEFAULT +#define TARGET_DEFAULT (MASK_INTERWORK | TARGET_ENDIAN_DEFAULT) + +/* The ARM BPABI functions return a boolean; they use no special + calling convention. */ +#define FLOAT_LIB_COMPARE_RETURNS_BOOL(MODE, COMPARISON) TARGET_BPABI + +/* The BPABI integer comparison routines return { -1, 0, 1 }. */ +#define TARGET_LIB_INT_CMP_BIASED !TARGET_BPABI + +#define TARGET_FIX_V4BX_SPEC " %{mcpu=arm8|mcpu=arm810|mcpu=strongarm*"\ + "|march=armv4|mcpu=fa526|mcpu=fa626:--fix-v4bx}" + +#define BE8_LINK_SPEC " %{mbig-endian:%{march=armv7-a|mcpu=cortex-a5"\ + "|mcpu=cortex-a8|mcpu=cortex-a9|mcpu=cortex-a15:%{!r:--be8}}}" + +/* Tell the assembler to build BPABI binaries. */ +#undef SUBTARGET_EXTRA_ASM_SPEC +#define SUBTARGET_EXTRA_ASM_SPEC \ + "%{mabi=apcs-gnu|mabi=atpcs:-meabi=gnu;:-meabi=5}" TARGET_FIX_V4BX_SPEC + +#ifndef SUBTARGET_EXTRA_LINK_SPEC +#define SUBTARGET_EXTRA_LINK_SPEC "" +#endif + +/* The generic link spec in elf.h does not support shared libraries. */ +#define BPABI_LINK_SPEC \ + "%{mbig-endian:-EB} %{mlittle-endian:-EL} " \ + "%{static:-Bstatic} %{shared:-shared} %{symbolic:-Bsymbolic} " \ + "-X" SUBTARGET_EXTRA_LINK_SPEC TARGET_FIX_V4BX_SPEC BE8_LINK_SPEC + +#undef LINK_SPEC +#define LINK_SPEC BPABI_LINK_SPEC + +/* The BPABI requires that we always use an out-of-line implementation + of RTTI comparison, even if the target supports weak symbols, + because the same object file might be used on a target that does + not support merging symbols across DLL boundaries. This macro is + broken out separately so that it can be used within + TARGET_OS_CPP_BUILTINS in configuration files for systems based on + the BPABI. */ +#define TARGET_BPABI_CPP_BUILTINS() \ + do \ + { \ + builtin_define ("__GXX_TYPEINFO_EQUALITY_INLINE=0"); \ + } \ + while (false) + +#undef TARGET_OS_CPP_BUILTINS +#define TARGET_OS_CPP_BUILTINS() \ + TARGET_BPABI_CPP_BUILTINS() + +/* The BPABI specifies the use of .{init,fini}_array. Therefore, we + do not want GCC to put anything into the .{init,fini} sections. */ +#undef INIT_SECTION_ASM_OP +#undef FINI_SECTION_ASM_OP +#define INIT_ARRAY_SECTION_ASM_OP ARM_EABI_CTORS_SECTION_OP +#define FINI_ARRAY_SECTION_ASM_OP ARM_EABI_DTORS_SECTION_OP + +/* The legacy _mcount implementation assumes r11 points to a + 4-word APCS frame. This is generally not true for EABI targets, + particularly not in Thumb mode. We assume the mcount + implementation does not require a counter variable (No Counter). + Note that __gnu_mcount_nc will be entered with a misaligned stack. + This is OK because it uses a special calling convention anyway. */ + +#undef NO_PROFILE_COUNTERS +#define NO_PROFILE_COUNTERS 1 +#undef ARM_FUNCTION_PROFILER +#define ARM_FUNCTION_PROFILER(STREAM, LABELNO) \ +{ \ + fprintf (STREAM, "\tpush\t{lr}\n"); \ + fprintf (STREAM, "\tbl\t__gnu_mcount_nc\n"); \ +} + +#undef SUBTARGET_FRAME_POINTER_REQUIRED +#define SUBTARGET_FRAME_POINTER_REQUIRED 0 + +/* __gnu_mcount_nc restores the original LR value before returning. Ensure + that there is no unnecessary hook set up. */ +#undef PROFILE_HOOK diff --git a/gcc/config/arm/cirrus.md b/gcc/config/arm/cirrus.md new file mode 100644 index 000000000..f08da0bdc --- /dev/null +++ b/gcc/config/arm/cirrus.md @@ -0,0 +1,540 @@ +;; Cirrus EP9312 "Maverick" ARM floating point co-processor description. +;; Copyright (C) 2003, 2004, 2005, 2007 Free Software Foundation, Inc. +;; Contributed by Red Hat. +;; Written by Aldy Hernandez (aldyh@redhat.com) + +;; This file is part of GCC. + +;; GCC is free software; you can redistribute it and/or modify +;; it under the terms of the GNU General Public License as published by +;; the Free Software Foundation; either version 3, or (at your option) +;; any later version. + +;; GCC is distributed in the hope that it will be useful, +;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;; GNU General Public License for more details. + +;; You should have received a copy of the GNU General Public License +;; along with GCC; see the file COPYING3. If not see +;; . + + +; Cirrus types for invalid insn combinations +; not Not a cirrus insn +; normal Any Cirrus insn not covered by the special cases below +; double cfldrd, cfldr64, cfstrd, cfstr64 +; compare cfcmps, cfcmpd, cfcmp32, cfcmp64 +; move cfmvdlr, cfmvdhr, cfmvsr, cfmv64lr, cfmv64hr +(define_attr "cirrus" "not,normal,double,compare,move" (const_string "not")) + + +(define_insn "cirrus_adddi3" + [(set (match_operand:DI 0 "cirrus_fp_register" "=v") + (plus:DI (match_operand:DI 1 "cirrus_fp_register" "v") + (match_operand:DI 2 "cirrus_fp_register" "v")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK" + "cfadd64%?\\t%V0, %V1, %V2" + [(set_attr "type" "mav_farith") + (set_attr "cirrus" "normal")] +) + +(define_insn "*cirrus_addsi3" + [(set (match_operand:SI 0 "cirrus_fp_register" "=v") + (plus:SI (match_operand:SI 1 "cirrus_fp_register" "v") + (match_operand:SI 2 "cirrus_fp_register" "v")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK && 0" + "cfadd32%?\\t%V0, %V1, %V2" + [(set_attr "type" "mav_farith") + (set_attr "cirrus" "normal")] +) + +(define_insn "*cirrus_addsf3" + [(set (match_operand:SF 0 "cirrus_fp_register" "=v") + (plus:SF (match_operand:SF 1 "cirrus_fp_register" "v") + (match_operand:SF 2 "cirrus_fp_register" "v")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK" + "cfadds%?\\t%V0, %V1, %V2" + [(set_attr "type" "mav_farith") + (set_attr "cirrus" "normal")] +) + +(define_insn "*cirrus_adddf3" + [(set (match_operand:DF 0 "cirrus_fp_register" "=v") + (plus:DF (match_operand:DF 1 "cirrus_fp_register" "v") + (match_operand:DF 2 "cirrus_fp_register" "v")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK" + "cfaddd%?\\t%V0, %V1, %V2" + [(set_attr "type" "mav_farith") + (set_attr "cirrus" "normal")] +) + +(define_insn "cirrus_subdi3" + [(set (match_operand:DI 0 "cirrus_fp_register" "=v") + (minus:DI (match_operand:DI 1 "cirrus_fp_register" "v") + (match_operand:DI 2 "cirrus_fp_register" "v")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK" + "cfsub64%?\\t%V0, %V1, %V2" + [(set_attr "type" "mav_farith") + (set_attr "cirrus" "normal")] +) + +(define_insn "*cirrus_subsi3_insn" + [(set (match_operand:SI 0 "cirrus_fp_register" "=v") + (minus:SI (match_operand:SI 1 "cirrus_fp_register" "v") + (match_operand:SI 2 "cirrus_fp_register" "v")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK && 0" + "cfsub32%?\\t%V0, %V1, %V2" + [(set_attr "type" "mav_farith") + (set_attr "cirrus" "normal")] +) + +(define_insn "*cirrus_subsf3" + [(set (match_operand:SF 0 "cirrus_fp_register" "=v") + (minus:SF (match_operand:SF 1 "cirrus_fp_register" "v") + (match_operand:SF 2 "cirrus_fp_register" "v")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK" + "cfsubs%?\\t%V0, %V1, %V2" + [(set_attr "type" "mav_farith") + (set_attr "cirrus" "normal")] +) + +(define_insn "*cirrus_subdf3" + [(set (match_operand:DF 0 "cirrus_fp_register" "=v") + (minus:DF (match_operand:DF 1 "cirrus_fp_register" "v") + (match_operand:DF 2 "cirrus_fp_register" "v")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK" + "cfsubd%?\\t%V0, %V1, %V2" + [(set_attr "type" "mav_farith") + (set_attr "cirrus" "normal")] +) + +(define_insn "*cirrus_mulsi3" + [(set (match_operand:SI 0 "cirrus_fp_register" "=v") + (mult:SI (match_operand:SI 2 "cirrus_fp_register" "v") + (match_operand:SI 1 "cirrus_fp_register" "v")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK && 0" + "cfmul32%?\\t%V0, %V1, %V2" + [(set_attr "type" "mav_farith") + (set_attr "cirrus" "normal")] +) + +(define_insn "muldi3" + [(set (match_operand:DI 0 "cirrus_fp_register" "=v") + (mult:DI (match_operand:DI 2 "cirrus_fp_register" "v") + (match_operand:DI 1 "cirrus_fp_register" "v")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK" + "cfmul64%?\\t%V0, %V1, %V2" + [(set_attr "type" "mav_dmult") + (set_attr "cirrus" "normal")] +) + +(define_insn "*cirrus_mulsi3addsi" + [(set (match_operand:SI 0 "cirrus_fp_register" "=v") + (plus:SI + (mult:SI (match_operand:SI 1 "cirrus_fp_register" "v") + (match_operand:SI 2 "cirrus_fp_register" "v")) + (match_operand:SI 3 "cirrus_fp_register" "0")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK && 0" + "cfmac32%?\\t%V0, %V1, %V2" + [(set_attr "type" "mav_farith") + (set_attr "cirrus" "normal")] +) + +;; Cirrus SI multiply-subtract +(define_insn "*cirrus_mulsi3subsi" + [(set (match_operand:SI 0 "cirrus_fp_register" "=v") + (minus:SI + (match_operand:SI 1 "cirrus_fp_register" "0") + (mult:SI (match_operand:SI 2 "cirrus_fp_register" "v") + (match_operand:SI 3 "cirrus_fp_register" "v"))))] + "0 && TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK" + "cfmsc32%?\\t%V0, %V2, %V3" + [(set_attr "type" "mav_farith") + (set_attr "cirrus" "normal")] +) + +(define_insn "*cirrus_mulsf3" + [(set (match_operand:SF 0 "cirrus_fp_register" "=v") + (mult:SF (match_operand:SF 1 "cirrus_fp_register" "v") + (match_operand:SF 2 "cirrus_fp_register" "v")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK" + "cfmuls%?\\t%V0, %V1, %V2" + [(set_attr "type" "mav_farith") + (set_attr "cirrus" "normal")] +) + +(define_insn "*cirrus_muldf3" + [(set (match_operand:DF 0 "cirrus_fp_register" "=v") + (mult:DF (match_operand:DF 1 "cirrus_fp_register" "v") + (match_operand:DF 2 "cirrus_fp_register" "v")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK" + "cfmuld%?\\t%V0, %V1, %V2" + [(set_attr "type" "mav_dmult") + (set_attr "cirrus" "normal")] +) + +(define_insn "cirrus_ashl_const" + [(set (match_operand:SI 0 "cirrus_fp_register" "=v") + (ashift:SI (match_operand:SI 1 "cirrus_fp_register" "v") + (match_operand:SI 2 "cirrus_shift_const" "")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK && 0" + "cfsh32%?\\t%V0, %V1, #%s2" + [(set_attr "cirrus" "normal")] +) + +(define_insn "cirrus_ashiftrt_const" + [(set (match_operand:SI 0 "cirrus_fp_register" "=v") + (ashiftrt:SI (match_operand:SI 1 "cirrus_fp_register" "v") + (match_operand:SI 2 "cirrus_shift_const" "")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK && 0" + "cfsh32%?\\t%V0, %V1, #-%s2" + [(set_attr "cirrus" "normal")] +) + +(define_insn "cirrus_ashlsi3" + [(set (match_operand:SI 0 "cirrus_fp_register" "=v") + (ashift:SI (match_operand:SI 1 "cirrus_fp_register" "v") + (match_operand:SI 2 "register_operand" "r")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK && 0" + "cfrshl32%?\\t%V1, %V0, %s2" + [(set_attr "cirrus" "normal")] +) + +(define_insn "ashldi3_cirrus" + [(set (match_operand:DI 0 "cirrus_fp_register" "=v") + (ashift:DI (match_operand:DI 1 "cirrus_fp_register" "v") + (match_operand:SI 2 "register_operand" "r")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK" + "cfrshl64%?\\t%V1, %V0, %s2" + [(set_attr "cirrus" "normal")] +) + +(define_insn "cirrus_ashldi_const" + [(set (match_operand:DI 0 "cirrus_fp_register" "=v") + (ashift:DI (match_operand:DI 1 "cirrus_fp_register" "v") + (match_operand:SI 2 "cirrus_shift_const" "")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK" + "cfsh64%?\\t%V0, %V1, #%s2" + [(set_attr "cirrus" "normal")] +) + +(define_insn "cirrus_ashiftrtdi_const" + [(set (match_operand:DI 0 "cirrus_fp_register" "=v") + (ashiftrt:DI (match_operand:DI 1 "cirrus_fp_register" "v") + (match_operand:SI 2 "cirrus_shift_const" "")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK" + "cfsh64%?\\t%V0, %V1, #-%s2" + [(set_attr "cirrus" "normal")] +) + +(define_insn "*cirrus_absdi2" + [(set (match_operand:DI 0 "cirrus_fp_register" "=v") + (abs:DI (match_operand:DI 1 "cirrus_fp_register" "v")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK" + "cfabs64%?\\t%V0, %V1" + [(set_attr "cirrus" "normal")] +) + +;; This doesn't really clobber ``cc''. Fixme: aldyh. +(define_insn "*cirrus_negdi2" + [(set (match_operand:DI 0 "cirrus_fp_register" "=v") + (neg:DI (match_operand:DI 1 "cirrus_fp_register" "v"))) + (clobber (reg:CC CC_REGNUM))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK" + "cfneg64%?\\t%V0, %V1" + [(set_attr "cirrus" "normal")] +) + +(define_insn "*cirrus_negsi2" + [(set (match_operand:SI 0 "cirrus_fp_register" "=v") + (neg:SI (match_operand:SI 1 "cirrus_fp_register" "v")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK && 0" + "cfneg32%?\\t%V0, %V1" + [(set_attr "cirrus" "normal")] +) + +(define_insn "*cirrus_negsf2" + [(set (match_operand:SF 0 "cirrus_fp_register" "=v") + (neg:SF (match_operand:SF 1 "cirrus_fp_register" "v")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK" + "cfnegs%?\\t%V0, %V1" + [(set_attr "cirrus" "normal")] +) + +(define_insn "*cirrus_negdf2" + [(set (match_operand:DF 0 "cirrus_fp_register" "=v") + (neg:DF (match_operand:DF 1 "cirrus_fp_register" "v")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK" + "cfnegd%?\\t%V0, %V1" + [(set_attr "cirrus" "normal")] +) + +;; This doesn't really clobber the condition codes either. +(define_insn "*cirrus_abssi2" + [(set (match_operand:SI 0 "cirrus_fp_register" "=v") + (abs:SI (match_operand:SI 1 "cirrus_fp_register" "v"))) + (clobber (reg:CC CC_REGNUM))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK && 0" + "cfabs32%?\\t%V0, %V1" + [(set_attr "cirrus" "normal")] +) + +(define_insn "*cirrus_abssf2" + [(set (match_operand:SF 0 "cirrus_fp_register" "=v") + (abs:SF (match_operand:SF 1 "cirrus_fp_register" "v")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK" + "cfabss%?\\t%V0, %V1" + [(set_attr "cirrus" "normal")] +) + +(define_insn "*cirrus_absdf2" + [(set (match_operand:DF 0 "cirrus_fp_register" "=v") + (abs:DF (match_operand:DF 1 "cirrus_fp_register" "v")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK" + "cfabsd%?\\t%V0, %V1" + [(set_attr "cirrus" "normal")] +) + +;; Convert Cirrus-SI to Cirrus-SF +(define_insn "cirrus_floatsisf2" + [(set (match_operand:SF 0 "cirrus_fp_register" "=v") + (float:SF (match_operand:SI 1 "s_register_operand" "r"))) + (clobber (match_scratch:DF 2 "=v"))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK" + "cfmv64lr%?\\t%Z2, %1\;cfcvt32s%?\\t%V0, %Y2" + [(set_attr "length" "8") + (set_attr "cirrus" "move")] +) + +(define_insn "cirrus_floatsidf2" + [(set (match_operand:DF 0 "cirrus_fp_register" "=v") + (float:DF (match_operand:SI 1 "s_register_operand" "r"))) + (clobber (match_scratch:DF 2 "=v"))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK" + "cfmv64lr%?\\t%Z2, %1\;cfcvt32d%?\\t%V0, %Y2" + [(set_attr "length" "8") + (set_attr "cirrus" "move")] +) + +(define_insn "floatdisf2" + [(set (match_operand:SF 0 "cirrus_fp_register" "=v") + (float:SF (match_operand:DI 1 "cirrus_fp_register" "v")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK" + "cfcvt64s%?\\t%V0, %V1" + [(set_attr "cirrus" "normal")]) + +(define_insn "floatdidf2" + [(set (match_operand:DF 0 "cirrus_fp_register" "=v") + (float:DF (match_operand:DI 1 "cirrus_fp_register" "v")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK" + "cfcvt64d%?\\t%V0, %V1" + [(set_attr "cirrus" "normal")]) + +(define_insn "cirrus_truncsfsi2" + [(set (match_operand:SI 0 "s_register_operand" "=r") + (fix:SI (fix:SF (match_operand:SF 1 "cirrus_fp_register" "v")))) + (clobber (match_scratch:DF 2 "=v"))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK" + "cftruncs32%?\\t%Y2, %V1\;cfmvr64l%?\\t%0, %Z2" + [(set_attr "length" "8") + (set_attr "cirrus" "normal")] +) + +(define_insn "cirrus_truncdfsi2" + [(set (match_operand:SI 0 "s_register_operand" "=r") + (fix:SI (fix:DF (match_operand:DF 1 "cirrus_fp_register" "v")))) + (clobber (match_scratch:DF 2 "=v"))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK" + "cftruncd32%?\\t%Y2, %V1\;cfmvr64l%?\\t%0, %Z2" + [(set_attr "length" "8")] +) + +(define_insn "*cirrus_truncdfsf2" + [(set (match_operand:SF 0 "cirrus_fp_register" "=v") + (float_truncate:SF + (match_operand:DF 1 "cirrus_fp_register" "v")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK" + "cfcvtds%?\\t%V0, %V1" + [(set_attr "cirrus" "normal")] +) + +(define_insn "*cirrus_extendsfdf2" + [(set (match_operand:DF 0 "cirrus_fp_register" "=v") + (float_extend:DF (match_operand:SF 1 "cirrus_fp_register" "v")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK" + "cfcvtsd%?\\t%V0, %V1" + [(set_attr "cirrus" "normal")] +) + +(define_insn "*cirrus_arm_movdi" + [(set (match_operand:DI 0 "nonimmediate_di_operand" "=r,r,o<>,v,r,v,m,v") + (match_operand:DI 1 "di_operand" "rIK,mi,r,r,v,mi,v,v"))] + "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_MAVERICK" + "* + { + switch (which_alternative) + { + case 0: + return \"#\"; + case 1: + case 2: + return output_move_double (operands); + + case 3: return \"cfmv64lr%?\\t%V0, %Q1\;cfmv64hr%?\\t%V0, %R1\"; + case 4: return \"cfmvr64l%?\\t%Q0, %V1\;cfmvr64h%?\\t%R0, %V1\"; + + case 5: return \"cfldr64%?\\t%V0, %1\"; + case 6: return \"cfstr64%?\\t%V1, %0\"; + + /* Shifting by 0 will just copy %1 into %0. */ + case 7: return \"cfsh64%?\\t%V0, %V1, #0\"; + + default: gcc_unreachable (); + } + }" + [(set_attr "length" " 8, 8, 8, 8, 8, 4, 4, 4") + (set_attr "type" " *,load2,store2, *, *, load2,store2, *") + (set_attr "pool_range" " *,1020, *, *, *, 1020, *, *") + (set_attr "neg_pool_range" " *,1012, *, *, *, 1008, *, *") + (set_attr "cirrus" "not, not, not,move,normal,double,double,normal")] +) + +;; Cirrus SI values have been outlawed. Look in arm.h for the comment +;; on HARD_REGNO_MODE_OK. + +(define_insn "*cirrus_movsf_hard_insn" + [(set (match_operand:SF 0 "nonimmediate_operand" "=v,v,v,r,m,r,r,m") + (match_operand:SF 1 "general_operand" "v,mE,r,v,v,r,mE,r"))] + "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_MAVERICK + && (GET_CODE (operands[0]) != MEM + || register_operand (operands[1], SFmode))" + "@ + cfcpys%?\\t%V0, %V1 + cfldrs%?\\t%V0, %1 + cfmvsr%?\\t%V0, %1 + cfmvrs%?\\t%0, %V1 + cfstrs%?\\t%V1, %0 + mov%?\\t%0, %1 + ldr%?\\t%0, %1\\t%@ float + str%?\\t%1, %0\\t%@ float" + [(set_attr "length" " *, *, *, *, *, 4, 4, 4") + (set_attr "type" " *, load1, *, *,store1, *,load1,store1") + (set_attr "pool_range" " *, 1020, *, *, *, *,4096, *") + (set_attr "neg_pool_range" " *, 1008, *, *, *, *,4084, *") + (set_attr "cirrus" "normal,normal,move,normal,normal,not, not, not")] +) + +(define_insn "*cirrus_movdf_hard_insn" + [(set (match_operand:DF 0 "nonimmediate_operand" "=r,Q,r,m,r,v,v,v,r,m") + (match_operand:DF 1 "general_operand" "Q,r,r,r,mF,v,mF,r,v,v"))] + "TARGET_ARM + && TARGET_HARD_FLOAT && TARGET_MAVERICK + && (GET_CODE (operands[0]) != MEM + || register_operand (operands[1], DFmode))" + "* + { + switch (which_alternative) + { + case 0: return \"ldm%?ia\\t%m1, %M0\\t%@ double\"; + case 1: return \"stm%?ia\\t%m0, %M1\\t%@ double\"; + case 2: return \"#\"; + case 3: case 4: return output_move_double (operands); + case 5: return \"cfcpyd%?\\t%V0, %V1\"; + case 6: return \"cfldrd%?\\t%V0, %1\"; + case 7: return \"cfmvdlr\\t%V0, %Q1\;cfmvdhr%?\\t%V0, %R1\"; + case 8: return \"cfmvrdl%?\\t%Q0, %V1\;cfmvrdh%?\\t%R0, %V1\"; + case 9: return \"cfstrd%?\\t%V1, %0\"; + default: gcc_unreachable (); + } + }" + [(set_attr "type" "load1,store2, *,store2,load1, *, load1, *, *,store2") + (set_attr "length" " 4, 4, 8, 8, 8, 4, 4, 8, 8, 4") + (set_attr "pool_range" " *, *, *, *, 252, *, 1020, *, *, *") + (set_attr "neg_pool_range" " *, *, *, *, 244, *, 1008, *, *, *") + (set_attr "cirrus" " not, not,not, not, not,normal,double,move,normal,double")] +) + +(define_insn "*cirrus_thumb2_movdi" + [(set (match_operand:DI 0 "nonimmediate_di_operand" "=r,r,o<>,v,r,v,m,v") + (match_operand:DI 1 "di_operand" "rIK,mi,r,r,v,mi,v,v"))] + "TARGET_THUMB2 && TARGET_HARD_FLOAT && TARGET_MAVERICK" + "* + { + switch (which_alternative) + { + case 0: + case 1: + case 2: + return (output_move_double (operands)); + + case 3: return \"cfmv64lr%?\\t%V0, %Q1\;cfmv64hr%?\\t%V0, %R1\"; + case 4: return \"cfmvr64l%?\\t%Q0, %V1\;cfmvr64h%?\\t%R0, %V1\"; + + case 5: return \"cfldr64%?\\t%V0, %1\"; + case 6: return \"cfstr64%?\\t%V1, %0\"; + + /* Shifting by 0 will just copy %1 into %0. */ + case 7: return \"cfsh64%?\\t%V0, %V1, #0\"; + + default: abort (); + } + }" + [(set_attr "length" " 8, 8, 8, 8, 8, 4, 4, 4") + (set_attr "type" " *,load2,store2, *, *, load2,store2, *") + (set_attr "pool_range" " *,4096, *, *, *, 1020, *, *") + (set_attr "neg_pool_range" " *, 0, *, *, *, 1008, *, *") + (set_attr "cirrus" "not, not, not,move,normal,double,double,normal")] +) + +(define_insn "*thumb2_cirrus_movsf_hard_insn" + [(set (match_operand:SF 0 "nonimmediate_operand" "=v,v,v,r,m,r,r,m") + (match_operand:SF 1 "general_operand" "v,mE,r,v,v,r,mE,r"))] + "TARGET_THUMB2 && TARGET_HARD_FLOAT && TARGET_MAVERICK + && (GET_CODE (operands[0]) != MEM + || register_operand (operands[1], SFmode))" + "@ + cfcpys%?\\t%V0, %V1 + cfldrs%?\\t%V0, %1 + cfmvsr%?\\t%V0, %1 + cfmvrs%?\\t%0, %V1 + cfstrs%?\\t%V1, %0 + mov%?\\t%0, %1 + ldr%?\\t%0, %1\\t%@ float + str%?\\t%1, %0\\t%@ float" + [(set_attr "length" " *, *, *, *, *, 4, 4, 4") + (set_attr "type" " *, load1, *, *,store1, *,load1,store1") + (set_attr "pool_range" " *, 1020, *, *, *, *,4096, *") + (set_attr "neg_pool_range" " *, 1008, *, *, *, *, 0, *") + (set_attr "cirrus" "normal,normal,move,normal,normal,not, not, not")] +) + +(define_insn "*thumb2_cirrus_movdf_hard_insn" + [(set (match_operand:DF 0 "nonimmediate_operand" "=r,Q,r,m,r,v,v,v,r,m") + (match_operand:DF 1 "general_operand" "Q,r,r,r,mF,v,mF,r,v,v"))] + "TARGET_THUMB2 + && TARGET_HARD_FLOAT && TARGET_MAVERICK + && (GET_CODE (operands[0]) != MEM + || register_operand (operands[1], DFmode))" + "* + { + switch (which_alternative) + { + case 0: return \"ldm%?ia\\t%m1, %M0\\t%@ double\"; + case 1: return \"stm%?ia\\t%m0, %M1\\t%@ double\"; + case 2: case 3: case 4: return output_move_double (operands); + case 5: return \"cfcpyd%?\\t%V0, %V1\"; + case 6: return \"cfldrd%?\\t%V0, %1\"; + case 7: return \"cfmvdlr\\t%V0, %Q1\;cfmvdhr%?\\t%V0, %R1\"; + case 8: return \"cfmvrdl%?\\t%Q0, %V1\;cfmvrdh%?\\t%R0, %V1\"; + case 9: return \"cfstrd%?\\t%V1, %0\"; + default: abort (); + } + }" + [(set_attr "type" "load1,store2, *,store2,load1, *, load1, *, *,store2") + (set_attr "length" " 4, 4, 8, 8, 8, 4, 4, 8, 8, 4") + (set_attr "pool_range" " *, *, *, *,4092, *, 1020, *, *, *") + (set_attr "neg_pool_range" " *, *, *, *, 0, *, 1008, *, *, *") + (set_attr "cirrus" " not, not,not, not, not,normal,double,move,normal,double")] +) + diff --git a/gcc/config/arm/coff.h b/gcc/config/arm/coff.h new file mode 100644 index 000000000..bd3e6f85d --- /dev/null +++ b/gcc/config/arm/coff.h @@ -0,0 +1,86 @@ +/* Definitions of target machine for GNU compiler. + For ARM with COFF object format. + Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2002, 2003, 2004, 2005, + 2007 Free Software Foundation, Inc. + Contributed by Doug Evans (devans@cygnus.com). + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3, or (at your + option) any later version. + + GCC is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public + License for more details. + + You should have received a copy of the GNU General Public License + along with GCC; see the file COPYING3. If not see + . */ + +/* Note - it is important that this definition matches the one in tcoff.h. */ +#undef USER_LABEL_PREFIX +#define USER_LABEL_PREFIX "_" + + +/* Run-time Target Specification. */ +#undef TARGET_VERSION +#define TARGET_VERSION fputs (" (ARM/coff)", stderr) + +#undef TARGET_DEFAULT_FLOAT_ABI +#define TARGET_DEFAULT_FLOAT_ABI ARM_FLOAT_ABI_SOFT + +#undef TARGET_DEFAULT +#define TARGET_DEFAULT (MASK_APCS_FRAME) + +#ifndef MULTILIB_DEFAULTS +#define MULTILIB_DEFAULTS \ + { "marm", "mlittle-endian", "msoft-float", "mno-thumb-interwork" } +#endif + +/* This is COFF, but prefer stabs. */ +#define SDB_DEBUGGING_INFO 1 + +#define PREFERRED_DEBUGGING_TYPE DBX_DEBUG + + +#define TARGET_ASM_FILE_START_APP_OFF true + +/* Switch into a generic section. */ +#define TARGET_ASM_NAMED_SECTION default_coff_asm_named_section + +/* Support the ctors/dtors and other sections. */ + +#undef INIT_SECTION_ASM_OP + +/* Define this macro if jump tables (for `tablejump' insns) should be + output in the text section, along with the assembler instructions. + Otherwise, the readonly data section is used. */ +/* We put ARM and Thumb-2 jump tables in the text section, because it makes + the code more efficient, but for Thumb-1 it's better to put them out of + band unless we are generating compressed tables. */ +#define JUMP_TABLES_IN_TEXT_SECTION \ + (TARGET_32BIT || (TARGET_THUMB && (optimize_size || flag_pic))) + +#undef READONLY_DATA_SECTION_ASM_OP +#define READONLY_DATA_SECTION_ASM_OP "\t.section .rdata" +#undef CTORS_SECTION_ASM_OP +#define CTORS_SECTION_ASM_OP "\t.section .ctors,\"x\"" +#undef DTORS_SECTION_ASM_OP +#define DTORS_SECTION_ASM_OP "\t.section .dtors,\"x\"" + +/* Support the ctors/dtors sections for g++. */ + +/* __CTOR_LIST__ and __DTOR_LIST__ must be defined by the linker script. */ +#define CTOR_LISTS_DEFINED_EXTERNALLY + +#undef DO_GLOBAL_CTORS_BODY +#undef DO_GLOBAL_DTORS_BODY + +/* The ARM development system defines __main. */ +#define NAME__MAIN "__gccmain" +#define SYMBOL__MAIN __gccmain + +#define SUPPORTS_INIT_PRIORITY 0 diff --git a/gcc/config/arm/constraints.md b/gcc/config/arm/constraints.md new file mode 100644 index 000000000..4e220e530 --- /dev/null +++ b/gcc/config/arm/constraints.md @@ -0,0 +1,335 @@ +;; Constraint definitions for ARM and Thumb +;; Copyright (C) 2006, 2007, 2008, 2010 Free Software Foundation, Inc. +;; Contributed by ARM Ltd. + +;; This file is part of GCC. + +;; GCC is free software; you can redistribute it and/or modify it +;; under the terms of the GNU General Public License as published +;; by the Free Software Foundation; either version 3, or (at your +;; option) any later version. + +;; GCC is distributed in the hope that it will be useful, but WITHOUT +;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +;; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public +;; License for more details. + +;; You should have received a copy of the GNU General Public License +;; along with GCC; see the file COPYING3. If not see +;; . + +;; The following register constraints have been used: +;; - in ARM/Thumb-2 state: f, t, v, w, x, y, z +;; - in Thumb state: h, b +;; - in both states: l, c, k +;; In ARM state, 'l' is an alias for 'r' + +;; The following normal constraints have been used: +;; in ARM/Thumb-2 state: G, H, I, j, J, K, L, M +;; in Thumb-1 state: I, J, K, L, M, N, O + +;; The following multi-letter normal constraints have been used: +;; in ARM/Thumb-2 state: Da, Db, Dc, Dn, Dl, DL, Dv, Dy, Di, Dz +;; in Thumb-1 state: Pa, Pb, Pc, Pd +;; in Thumb-2 state: Ps, Pt, Pu, Pv, Pw, Px + +;; The following memory constraints have been used: +;; in ARM/Thumb-2 state: Q, Ut, Uv, Uy, Un, Um, Us +;; in ARM state: Uq + + +(define_register_constraint "f" "TARGET_ARM ? FPA_REGS : NO_REGS" + "Legacy FPA registers @code{f0}-@code{f7}.") + +(define_register_constraint "t" "TARGET_32BIT ? VFP_LO_REGS : NO_REGS" + "The VFP registers @code{s0}-@code{s31}.") + +(define_register_constraint "v" "TARGET_ARM ? CIRRUS_REGS : NO_REGS" + "The Cirrus Maverick co-processor registers.") + +(define_register_constraint "w" + "TARGET_32BIT ? (TARGET_VFPD32 ? VFP_REGS : VFP_LO_REGS) : NO_REGS" + "The VFP registers @code{d0}-@code{d15}, or @code{d0}-@code{d31} for VFPv3.") + +(define_register_constraint "x" "TARGET_32BIT ? VFP_D0_D7_REGS : NO_REGS" + "The VFP registers @code{d0}-@code{d7}.") + +(define_register_constraint "y" "TARGET_REALLY_IWMMXT ? IWMMXT_REGS : NO_REGS" + "The Intel iWMMX co-processor registers.") + +(define_register_constraint "z" + "TARGET_REALLY_IWMMXT ? IWMMXT_GR_REGS : NO_REGS" + "The Intel iWMMX GR registers.") + +(define_register_constraint "l" "TARGET_THUMB ? LO_REGS : GENERAL_REGS" + "In Thumb state the core registers @code{r0}-@code{r7}.") + +(define_register_constraint "h" "TARGET_THUMB ? HI_REGS : NO_REGS" + "In Thumb state the core registers @code{r8}-@code{r15}.") + +(define_constraint "j" + "A constant suitable for a MOVW instruction. (ARM/Thumb-2)" + (and (match_test "TARGET_32BIT && arm_arch_thumb2") + (ior (match_code "high") + (and (match_code "const_int") + (match_test "(ival & 0xffff0000) == 0"))))) + +(define_register_constraint "k" "STACK_REG" + "@internal The stack register.") + +(define_register_constraint "b" "TARGET_THUMB ? BASE_REGS : NO_REGS" + "@internal + Thumb only. The union of the low registers and the stack register.") + +(define_register_constraint "c" "CC_REG" + "@internal The condition code register.") + +(define_constraint "I" + "In ARM/Thumb-2 state a constant that can be used as an immediate value in a + Data Processing instruction. In Thumb-1 state a constant in the range + 0-255." + (and (match_code "const_int") + (match_test "TARGET_32BIT ? const_ok_for_arm (ival) + : ival >= 0 && ival <= 255"))) + +(define_constraint "J" + "In ARM/Thumb-2 state a constant in the range @minus{}4095-4095. In Thumb-1 + state a constant in the range @minus{}255-@minus{}1." + (and (match_code "const_int") + (match_test "TARGET_32BIT ? (ival >= -4095 && ival <= 4095) + : (ival >= -255 && ival <= -1)"))) + +(define_constraint "K" + "In ARM/Thumb-2 state a constant that satisfies the @code{I} constraint if + inverted. In Thumb-1 state a constant that satisfies the @code{I} + constraint multiplied by any power of 2." + (and (match_code "const_int") + (match_test "TARGET_32BIT ? const_ok_for_arm (~ival) + : thumb_shiftable_const (ival)"))) + +(define_constraint "L" + "In ARM/Thumb-2 state a constant that satisfies the @code{I} constraint if + negated. In Thumb-1 state a constant in the range @minus{}7-7." + (and (match_code "const_int") + (match_test "TARGET_32BIT ? const_ok_for_arm (-ival) + : (ival >= -7 && ival <= 7)"))) + +;; The ARM state version is internal... +;; @internal In ARM/Thumb-2 state a constant in the range 0-32 or any +;; power of 2. +(define_constraint "M" + "In Thumb-1 state a constant that is a multiple of 4 in the range 0-1020." + (and (match_code "const_int") + (match_test "TARGET_32BIT ? ((ival >= 0 && ival <= 32) + || (((ival & (ival - 1)) & 0xFFFFFFFF) == 0)) + : ival >= 0 && ival <= 1020 && (ival & 3) == 0"))) + +(define_constraint "N" + "Thumb-1 state a constant in the range 0-31." + (and (match_code "const_int") + (match_test "!TARGET_32BIT && (ival >= 0 && ival <= 31)"))) + +(define_constraint "O" + "In Thumb-1 state a constant that is a multiple of 4 in the range + @minus{}508-508." + (and (match_code "const_int") + (match_test "TARGET_THUMB1 && ival >= -508 && ival <= 508 + && ((ival & 3) == 0)"))) + +(define_constraint "Pa" + "@internal In Thumb-1 state a constant in the range -510 to +510" + (and (match_code "const_int") + (match_test "TARGET_THUMB1 && ival >= -510 && ival <= 510 + && (ival > 255 || ival < -255)"))) + +(define_constraint "Pb" + "@internal In Thumb-1 state a constant in the range -262 to +262" + (and (match_code "const_int") + (match_test "TARGET_THUMB1 && ival >= -262 && ival <= 262 + && (ival > 255 || ival < -255)"))) + +(define_constraint "Pc" + "@internal In Thumb-1 state a constant that is in the range 1021 to 1275" + (and (match_code "const_int") + (match_test "TARGET_THUMB1 + && ival > 1020 && ival <= 1275"))) + +(define_constraint "Pd" + "@internal In Thumb-1 state a constant in the range 0 to 7" + (and (match_code "const_int") + (match_test "TARGET_THUMB1 && ival >= 0 && ival <= 7"))) + +(define_constraint "Ps" + "@internal In Thumb-2 state a constant in the range -255 to +255" + (and (match_code "const_int") + (match_test "TARGET_THUMB2 && ival >= -255 && ival <= 255"))) + +(define_constraint "Pt" + "@internal In Thumb-2 state a constant in the range -7 to +7" + (and (match_code "const_int") + (match_test "TARGET_THUMB2 && ival >= -7 && ival <= 7"))) + +(define_constraint "Pu" + "@internal In Thumb-2 state a constant in the range +1 to +8" + (and (match_code "const_int") + (match_test "TARGET_THUMB2 && ival >= 1 && ival <= 8"))) + +(define_constraint "Pv" + "@internal In Thumb-2 state a constant in the range -255 to 0" + (and (match_code "const_int") + (match_test "TARGET_THUMB2 && ival >= -255 && ival <= 0"))) + +(define_constraint "Pw" + "@internal In Thumb-2 state a constant in the range -255 to -1" + (and (match_code "const_int") + (match_test "TARGET_THUMB2 && ival >= -255 && ival <= -1"))) + +(define_constraint "Px" + "@internal In Thumb-2 state a constant in the range -7 to -1" + (and (match_code "const_int") + (match_test "TARGET_THUMB2 && ival >= -7 && ival <= -1"))) + +(define_constraint "G" + "In ARM/Thumb-2 state a valid FPA immediate constant." + (and (match_code "const_double") + (match_test "TARGET_32BIT && arm_const_double_rtx (op)"))) + +(define_constraint "H" + "In ARM/Thumb-2 state a valid FPA immediate constant when negated." + (and (match_code "const_double") + (match_test "TARGET_32BIT && neg_const_double_rtx_ok_for_fpa (op)"))) + +(define_constraint "Dz" + "@internal + In ARM/Thumb-2 state a vector of constant zeros." + (and (match_code "const_vector") + (match_test "TARGET_NEON && op == CONST0_RTX (mode)"))) + +(define_constraint "Da" + "@internal + In ARM/Thumb-2 state a const_int, const_double or const_vector that can + be generated with two Data Processing insns." + (and (match_code "const_double,const_int,const_vector") + (match_test "TARGET_32BIT && arm_const_double_inline_cost (op) == 2"))) + +(define_constraint "Db" + "@internal + In ARM/Thumb-2 state a const_int, const_double or const_vector that can + be generated with three Data Processing insns." + (and (match_code "const_double,const_int,const_vector") + (match_test "TARGET_32BIT && arm_const_double_inline_cost (op) == 3"))) + +(define_constraint "Dc" + "@internal + In ARM/Thumb-2 state a const_int, const_double or const_vector that can + be generated with four Data Processing insns. This pattern is disabled + if optimizing for space or when we have load-delay slots to fill." + (and (match_code "const_double,const_int,const_vector") + (match_test "TARGET_32BIT && arm_const_double_inline_cost (op) == 4 + && !(optimize_size || arm_ld_sched)"))) + +(define_constraint "Di" + "@internal + In ARM/Thumb-2 state a const_int or const_double where both the high + and low SImode words can be generated as immediates in 32-bit instructions." + (and (match_code "const_double,const_int") + (match_test "TARGET_32BIT && arm_const_double_by_immediates (op)"))) + +(define_constraint "Dn" + "@internal + In ARM/Thumb-2 state a const_vector which can be loaded with a Neon vmov + immediate instruction." + (and (match_code "const_vector") + (match_test "TARGET_32BIT + && imm_for_neon_mov_operand (op, GET_MODE (op))"))) + +(define_constraint "Dl" + "@internal + In ARM/Thumb-2 state a const_vector which can be used with a Neon vorr or + vbic instruction." + (and (match_code "const_vector") + (match_test "TARGET_32BIT + && imm_for_neon_logic_operand (op, GET_MODE (op))"))) + +(define_constraint "DL" + "@internal + In ARM/Thumb-2 state a const_vector which can be used with a Neon vorn or + vand instruction." + (and (match_code "const_vector") + (match_test "TARGET_32BIT + && imm_for_neon_inv_logic_operand (op, GET_MODE (op))"))) + +(define_constraint "Dv" + "@internal + In ARM/Thumb-2 state a const_double which can be used with a VFP fconsts + instruction." + (and (match_code "const_double") + (match_test "TARGET_32BIT && vfp3_const_double_rtx (op)"))) + +(define_constraint "Dy" + "@internal + In ARM/Thumb-2 state a const_double which can be used with a VFP fconstd + instruction." + (and (match_code "const_double") + (match_test "TARGET_32BIT && TARGET_VFP_DOUBLE && vfp3_const_double_rtx (op)"))) + +(define_memory_constraint "Ut" + "@internal + In ARM/Thumb-2 state an address valid for loading/storing opaque structure + types wider than TImode." + (and (match_code "mem") + (match_test "TARGET_32BIT && neon_struct_mem_operand (op)"))) + +(define_memory_constraint "Uv" + "@internal + In ARM/Thumb-2 state a valid VFP load/store address." + (and (match_code "mem") + (match_test "TARGET_32BIT && arm_coproc_mem_operand (op, FALSE)"))) + +(define_memory_constraint "Uy" + "@internal + In ARM/Thumb-2 state a valid iWMMX load/store address." + (and (match_code "mem") + (match_test "TARGET_32BIT && arm_coproc_mem_operand (op, TRUE)"))) + +(define_memory_constraint "Un" + "@internal + In ARM/Thumb-2 state a valid address for Neon doubleword vector + load/store instructions." + (and (match_code "mem") + (match_test "TARGET_32BIT && neon_vector_mem_operand (op, 0)"))) + +(define_memory_constraint "Um" + "@internal + In ARM/Thumb-2 state a valid address for Neon element and structure + load/store instructions." + (and (match_code "mem") + (match_test "TARGET_32BIT && neon_vector_mem_operand (op, 2)"))) + +(define_memory_constraint "Us" + "@internal + In ARM/Thumb-2 state a valid address for non-offset loads/stores of + quad-word values in four ARM registers." + (and (match_code "mem") + (match_test "TARGET_32BIT && neon_vector_mem_operand (op, 1)"))) + +(define_memory_constraint "Uq" + "@internal + In ARM state an address valid in ldrsb instructions." + (and (match_code "mem") + (match_test "TARGET_ARM + && arm_legitimate_address_outer_p (GET_MODE (op), XEXP (op, 0), + SIGN_EXTEND, 0)"))) + +(define_memory_constraint "Q" + "@internal + In ARM/Thumb-2 state an address that is a single base register." + (and (match_code "mem") + (match_test "REG_P (XEXP (op, 0))"))) + +;; We used to have constraint letters for S and R in ARM state, but +;; all uses of these now appear to have been removed. + +;; Additionally, we used to have a Q constraint in Thumb state, but +;; this wasn't really a valid memory constraint. Again, all uses of +;; this now seem to have been removed. diff --git a/gcc/config/arm/cortex-a5.md b/gcc/config/arm/cortex-a5.md new file mode 100644 index 000000000..eb154e298 --- /dev/null +++ b/gcc/config/arm/cortex-a5.md @@ -0,0 +1,297 @@ +;; ARM Cortex-A5 pipeline description +;; Copyright (C) 2010 Free Software Foundation, Inc. +;; Contributed by CodeSourcery. +;; +;; This file is part of GCC. +;; +;; GCC is free software; you can redistribute it and/or modify it +;; under the terms of the GNU General Public License as published by +;; the Free Software Foundation; either version 3, or (at your option) +;; any later version. +;; +;; GCC is distributed in the hope that it will be useful, but +;; WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;; General Public License for more details. +;; +;; You should have received a copy of the GNU General Public License +;; along with GCC; see the file COPYING3. If not see +;; . + +(define_automaton "cortex_a5") + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Functional units. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; The integer (ALU) pipeline. There are five DPU pipeline +;; stages. However the decode/issue stages operate the same for all +;; instructions, so do not model them. We only need to model the +;; first execute stage because instructions always advance one stage +;; per cycle in order. Only branch instructions may dual-issue, so a +;; single unit covers all of the LS, ALU, MAC and FPU pipelines. + +(define_cpu_unit "cortex_a5_ex1" "cortex_a5") + +;; The branch pipeline. Branches can dual-issue with other instructions +;; (except when those instructions take multiple cycles to issue). + +(define_cpu_unit "cortex_a5_branch" "cortex_a5") + +;; Pseudo-unit for blocking the multiply pipeline when a double-precision +;; multiply is in progress. + +(define_cpu_unit "cortex_a5_fpmul_pipe" "cortex_a5") + +;; The floating-point add pipeline (ex1/f1 stage), used to model the usage +;; of the add pipeline by fmac instructions, etc. + +(define_cpu_unit "cortex_a5_fpadd_pipe" "cortex_a5") + +;; Floating-point div/sqrt (long latency, out-of-order completion). + +(define_cpu_unit "cortex_a5_fp_div_sqrt" "cortex_a5") + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; ALU instructions. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(define_insn_reservation "cortex_a5_alu" 2 + (and (eq_attr "tune" "cortexa5") + (eq_attr "type" "alu")) + "cortex_a5_ex1") + +(define_insn_reservation "cortex_a5_alu_shift" 2 + (and (eq_attr "tune" "cortexa5") + (eq_attr "type" "alu_shift,alu_shift_reg")) + "cortex_a5_ex1") + +;; Forwarding path for unshifted operands. + +(define_bypass 1 "cortex_a5_alu,cortex_a5_alu_shift" + "cortex_a5_alu") + +(define_bypass 1 "cortex_a5_alu,cortex_a5_alu_shift" + "cortex_a5_alu_shift" + "arm_no_early_alu_shift_dep") + +;; The multiplier pipeline can forward results from wr stage only so +;; there's no need to specify bypasses). + +(define_insn_reservation "cortex_a5_mul" 2 + (and (eq_attr "tune" "cortexa5") + (eq_attr "type" "mult")) + "cortex_a5_ex1") + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Load/store instructions. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Address-generation happens in the issue stage, which is one stage behind +;; the ex1 stage (the first stage we care about for scheduling purposes). The +;; dc1 stage is parallel with ex1, dc2 with ex2 and rot with wr. + +(define_insn_reservation "cortex_a5_load1" 2 + (and (eq_attr "tune" "cortexa5") + (eq_attr "type" "load_byte,load1")) + "cortex_a5_ex1") + +(define_insn_reservation "cortex_a5_store1" 0 + (and (eq_attr "tune" "cortexa5") + (eq_attr "type" "store1")) + "cortex_a5_ex1") + +(define_insn_reservation "cortex_a5_load2" 3 + (and (eq_attr "tune" "cortexa5") + (eq_attr "type" "load2")) + "cortex_a5_ex1+cortex_a5_branch, cortex_a5_ex1") + +(define_insn_reservation "cortex_a5_store2" 0 + (and (eq_attr "tune" "cortexa5") + (eq_attr "type" "store2")) + "cortex_a5_ex1+cortex_a5_branch, cortex_a5_ex1") + +(define_insn_reservation "cortex_a5_load3" 4 + (and (eq_attr "tune" "cortexa5") + (eq_attr "type" "load3")) + "cortex_a5_ex1+cortex_a5_branch, cortex_a5_ex1+cortex_a5_branch,\ + cortex_a5_ex1") + +(define_insn_reservation "cortex_a5_store3" 0 + (and (eq_attr "tune" "cortexa5") + (eq_attr "type" "store3")) + "cortex_a5_ex1+cortex_a5_branch, cortex_a5_ex1+cortex_a5_branch,\ + cortex_a5_ex1") + +(define_insn_reservation "cortex_a5_load4" 5 + (and (eq_attr "tune" "cortexa5") + (eq_attr "type" "load3")) + "cortex_a5_ex1+cortex_a5_branch, cortex_a5_ex1+cortex_a5_branch,\ + cortex_a5_ex1+cortex_a5_branch, cortex_a5_ex1") + +(define_insn_reservation "cortex_a5_store4" 0 + (and (eq_attr "tune" "cortexa5") + (eq_attr "type" "store3")) + "cortex_a5_ex1+cortex_a5_branch, cortex_a5_ex1+cortex_a5_branch,\ + cortex_a5_ex1+cortex_a5_branch, cortex_a5_ex1") + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Branches. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Direct branches are the only instructions we can dual-issue (also IT and +;; nop, but those aren't very interesting for scheduling). (The latency here +;; is meant to represent when the branch actually takes place, but may not be +;; entirely correct.) + +(define_insn_reservation "cortex_a5_branch" 3 + (and (eq_attr "tune" "cortexa5") + (eq_attr "type" "branch,call")) + "cortex_a5_branch") + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Floating-point arithmetic. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(define_insn_reservation "cortex_a5_fpalu" 4 + (and (eq_attr "tune" "cortexa5") + (eq_attr "type" "ffariths, fadds, ffarithd, faddd, fcpys, fmuls, f_cvt,\ + fcmps, fcmpd")) + "cortex_a5_ex1+cortex_a5_fpadd_pipe") + +;; For fconsts and fconstd, 8-bit immediate data is passed directly from +;; f1 to f3 (which I think reduces the latency by one cycle). + +(define_insn_reservation "cortex_a5_fconst" 3 + (and (eq_attr "tune" "cortexa5") + (eq_attr "type" "fconsts,fconstd")) + "cortex_a5_ex1+cortex_a5_fpadd_pipe") + +;; We should try not to attempt to issue a single-precision multiplication in +;; the middle of a double-precision multiplication operation (the usage of +;; cortex_a5_fpmul_pipe). + +(define_insn_reservation "cortex_a5_fpmuls" 4 + (and (eq_attr "tune" "cortexa5") + (eq_attr "type" "fmuls")) + "cortex_a5_ex1+cortex_a5_fpmul_pipe") + +;; For single-precision multiply-accumulate, the add (accumulate) is issued +;; whilst the multiply is in F4. The multiply result can then be forwarded +;; from F5 to F1. The issue unit is only used once (when we first start +;; processing the instruction), but the usage of the FP add pipeline could +;; block other instructions attempting to use it simultaneously. We try to +;; avoid that using cortex_a5_fpadd_pipe. + +(define_insn_reservation "cortex_a5_fpmacs" 8 + (and (eq_attr "tune" "cortexa5") + (eq_attr "type" "fmacs")) + "cortex_a5_ex1+cortex_a5_fpmul_pipe, nothing*3, cortex_a5_fpadd_pipe") + +;; Non-multiply instructions can issue in the middle two instructions of a +;; double-precision multiply. Note that it isn't entirely clear when a branch +;; can dual-issue when a multi-cycle multiplication is in progress; we ignore +;; that for now though. + +(define_insn_reservation "cortex_a5_fpmuld" 7 + (and (eq_attr "tune" "cortexa5") + (eq_attr "type" "fmuld")) + "cortex_a5_ex1+cortex_a5_fpmul_pipe, cortex_a5_fpmul_pipe*2,\ + cortex_a5_ex1+cortex_a5_fpmul_pipe") + +(define_insn_reservation "cortex_a5_fpmacd" 11 + (and (eq_attr "tune" "cortexa5") + (eq_attr "type" "fmacd")) + "cortex_a5_ex1+cortex_a5_fpmul_pipe, cortex_a5_fpmul_pipe*2,\ + cortex_a5_ex1+cortex_a5_fpmul_pipe, nothing*3, cortex_a5_fpadd_pipe") + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Floating-point divide/square root instructions. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; ??? Not sure if the 14 cycles taken for single-precision divide to complete +;; includes the time taken for the special instruction used to collect the +;; result to travel down the multiply pipeline, or not. Assuming so. (If +;; that's wrong, the latency should be increased by a few cycles.) + +;; fsqrt takes one cycle less, but that is not modelled, nor is the use of the +;; multiply pipeline to collect the divide/square-root result. + +(define_insn_reservation "cortex_a5_fdivs" 14 + (and (eq_attr "tune" "cortexa5") + (eq_attr "type" "fdivs")) + "cortex_a5_ex1, cortex_a5_fp_div_sqrt * 13") + +;; ??? Similarly for fdivd. + +(define_insn_reservation "cortex_a5_fdivd" 29 + (and (eq_attr "tune" "cortexa5") + (eq_attr "type" "fdivd")) + "cortex_a5_ex1, cortex_a5_fp_div_sqrt * 28") + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; VFP to/from core transfers. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; FP loads take data from wr/rot/f3. + +;; Core-to-VFP transfers use the multiply pipeline. + +(define_insn_reservation "cortex_a5_r2f" 4 + (and (eq_attr "tune" "cortexa5") + (eq_attr "type" "r_2_f")) + "cortex_a5_ex1") + +(define_insn_reservation "cortex_a5_f2r" 2 + (and (eq_attr "tune" "cortexa5") + (eq_attr "type" "f_2_r")) + "cortex_a5_ex1") + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; VFP flag transfer. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; ??? The flag forwarding from fmstat to the ex2 stage of the second +;; instruction is not modeled at present. + +(define_insn_reservation "cortex_a5_f_flags" 4 + (and (eq_attr "tune" "cortexa5") + (eq_attr "type" "f_flag")) + "cortex_a5_ex1") + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; VFP load/store. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(define_insn_reservation "cortex_a5_f_loads" 4 + (and (eq_attr "tune" "cortexa5") + (eq_attr "type" "f_loads")) + "cortex_a5_ex1") + +(define_insn_reservation "cortex_a5_f_loadd" 5 + (and (eq_attr "tune" "cortexa5") + (eq_attr "type" "f_loadd")) + "cortex_a5_ex1+cortex_a5_branch, cortex_a5_ex1") + +(define_insn_reservation "cortex_a5_f_stores" 0 + (and (eq_attr "tune" "cortexa5") + (eq_attr "type" "f_stores")) + "cortex_a5_ex1") + +(define_insn_reservation "cortex_a5_f_stored" 0 + (and (eq_attr "tune" "cortexa5") + (eq_attr "type" "f_stored")) + "cortex_a5_ex1+cortex_a5_branch, cortex_a5_ex1") + +;; Load-to-use for floating-point values has a penalty of one cycle, +;; i.e. a latency of two. + +(define_bypass 2 "cortex_a5_f_loads" + "cortex_a5_fpalu, cortex_a5_fpmacs, cortex_a5_fpmuld,\ + cortex_a5_fpmacd, cortex_a5_fdivs, cortex_a5_fdivd,\ + cortex_a5_f2r") + +(define_bypass 3 "cortex_a5_f_loadd" + "cortex_a5_fpalu, cortex_a5_fpmacs, cortex_a5_fpmuld,\ + cortex_a5_fpmacd, cortex_a5_fdivs, cortex_a5_fdivd,\ + cortex_a5_f2r") diff --git a/gcc/config/arm/cortex-a8-neon.md b/gcc/config/arm/cortex-a8-neon.md new file mode 100644 index 000000000..03f52b2df --- /dev/null +++ b/gcc/config/arm/cortex-a8-neon.md @@ -0,0 +1,1312 @@ +;; ARM Cortex-A8 NEON scheduling description. +;; Copyright (C) 2007, 2008, 2010 Free Software Foundation, Inc. +;; Contributed by CodeSourcery. + +;; This file is part of GCC. + +;; GCC is free software; you can redistribute it and/or modify it +;; under the terms of the GNU General Public License as published +;; by the Free Software Foundation; either version 3, or (at your +;; option) any later version. + +;; GCC is distributed in the hope that it will be useful, but WITHOUT +;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +;; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public +;; License for more details. + +;; You should have received a copy of the GNU General Public License +;; along with GCC; see the file COPYING3. If not see +;; . + + +(define_automaton "cortex_a8_neon") + +;; Only one load, store, permute, MCR or MRC instruction can be issued +;; per cycle. +(define_cpu_unit "cortex_a8_neon_issue_perm" "cortex_a8_neon") + +;; Only one data-processing instruction can be issued per cycle. +(define_cpu_unit "cortex_a8_neon_issue_dp" "cortex_a8_neon") + +;; The VFPLite unit (non-pipelined). +(define_cpu_unit "cortex_a8_vfplite" "cortex_a8_neon") + +;; We need a special mutual exclusion (to be used in addition to +;; cortex_a8_neon_issue_dp) for the case when an instruction such as +;; vmla.f is forwarded from E5 of the floating-point multiply pipeline to +;; E2 of the floating-point add pipeline. On the cycle previous to that +;; forward we must prevent issue of any instruction to the floating-point +;; add pipeline, but still allow issue of a data-processing instruction +;; to any of the other pipelines. +(define_cpu_unit "cortex_a8_neon_issue_fadd" "cortex_a8_neon") + +;; Patterns of reservation. +;; We model the NEON issue units as running in parallel with the core ones. +;; We assume that multi-cycle NEON instructions get decomposed into +;; micro-ops as they are issued into the NEON pipeline, and not as they +;; are issued into the ARM pipeline. Dual issue may not occur except +;; upon the first and last cycles of a multi-cycle instruction, but it +;; is unclear whether two multi-cycle instructions can issue together (in +;; this model they cannot). It is also unclear whether a pair of +;; a multi-cycle and single-cycle instructions, that could potentially +;; issue together, only do so if (say) the single-cycle one precedes +;; the other. + +(define_reservation "cortex_a8_neon_dp" + "(cortex_a8_alu0|cortex_a8_alu1)+cortex_a8_neon_issue_dp") +(define_reservation "cortex_a8_neon_dp_2" + "(cortex_a8_alu0|cortex_a8_alu1)+cortex_a8_neon_issue_dp,\ + cortex_a8_neon_issue_dp") +(define_reservation "cortex_a8_neon_dp_4" + "(cortex_a8_alu0|cortex_a8_alu1)+cortex_a8_neon_issue_dp,\ + cortex_a8_neon_issue_dp+cortex_a8_neon_issue_perm,\ + cortex_a8_neon_issue_dp+cortex_a8_neon_issue_perm,\ + cortex_a8_neon_issue_dp") + +(define_reservation "cortex_a8_neon_fadd" + "(cortex_a8_alu0|cortex_a8_alu1)+cortex_a8_neon_issue_dp+\ + cortex_a8_neon_issue_fadd") +(define_reservation "cortex_a8_neon_fadd_2" + "(cortex_a8_alu0|cortex_a8_alu1)+cortex_a8_neon_issue_dp+\ + cortex_a8_neon_issue_fadd,\ + cortex_a8_neon_issue_dp+cortex_a8_neon_issue_fadd") + +(define_reservation "cortex_a8_neon_perm" + "(cortex_a8_alu0|cortex_a8_alu1)+\ + cortex_a8_neon_issue_perm") +(define_reservation "cortex_a8_neon_perm_2" + "(cortex_a8_alu0|cortex_a8_alu1)+\ + cortex_a8_neon_issue_perm,\ + cortex_a8_neon_issue_perm") +(define_reservation "cortex_a8_neon_perm_3" + "(cortex_a8_alu0|cortex_a8_alu1)+\ + cortex_a8_neon_issue_perm,\ + cortex_a8_neon_issue_dp+cortex_a8_neon_issue_perm,\ + cortex_a8_neon_issue_perm") + +(define_reservation "cortex_a8_neon_ls" + "cortex_a8_issue_ls+cortex_a8_neon_issue_perm") +(define_reservation "cortex_a8_neon_ls_2" + "cortex_a8_issue_ls+cortex_a8_neon_issue_perm,\ + cortex_a8_neon_issue_perm") +(define_reservation "cortex_a8_neon_ls_3" + "cortex_a8_issue_ls+cortex_a8_neon_issue_perm,\ + cortex_a8_neon_issue_dp+cortex_a8_neon_issue_perm,\ + cortex_a8_neon_issue_perm") +(define_reservation "cortex_a8_neon_ls_4" + "cortex_a8_issue_ls+cortex_a8_neon_issue_perm,\ + cortex_a8_neon_issue_dp+cortex_a8_neon_issue_perm,\ + cortex_a8_neon_issue_dp+cortex_a8_neon_issue_perm,\ + cortex_a8_neon_issue_perm") +(define_reservation "cortex_a8_neon_ls_5" + "cortex_a8_issue_ls+cortex_a8_neon_issue_perm,\ + cortex_a8_neon_issue_dp+cortex_a8_neon_issue_perm,\ + cortex_a8_neon_issue_dp+cortex_a8_neon_issue_perm,\ + cortex_a8_neon_issue_dp+cortex_a8_neon_issue_perm,\ + cortex_a8_neon_issue_perm") + +(define_reservation "cortex_a8_neon_fmul_then_fadd" + "(cortex_a8_alu0|cortex_a8_alu1)+cortex_a8_neon_issue_dp,\ + nothing*3,\ + cortex_a8_neon_issue_fadd") +(define_reservation "cortex_a8_neon_fmul_then_fadd_2" + "(cortex_a8_alu0|cortex_a8_alu1)+cortex_a8_neon_issue_dp,\ + cortex_a8_neon_issue_dp,\ + nothing*2,\ + cortex_a8_neon_issue_fadd,\ + cortex_a8_neon_issue_fadd") + +;; VFP instructions can only be single-issued into the NEON pipeline. +(define_reservation "cortex_a8_vfp" + "(cortex_a8_alu0|cortex_a8_alu1)+cortex_a8_neon_issue_dp+\ + cortex_a8_neon_issue_perm+cortex_a8_vfplite") + +;; VFP instructions. +;; The VFPLite unit that executes these isn't pipelined; we give the +;; worst-case latencies (and choose the double-precision ones where we +;; do not distinguish on precision). We assume RunFast mode is not +;; enabled and therefore do not model the possible VFP instruction +;; execution in the NEON floating point pipelines, nor additional +;; latencies for the processing of subnormals. +;; +;; TODO: RunFast mode could potentially be enabled when -ffast-math +;; is specified. + +(define_insn_reservation "cortex_a8_vfp_add_sub" 10 + (and (eq_attr "tune" "cortexa8") + (eq_attr "type" "fconsts,fconstd,fadds,faddd")) + "cortex_a8_vfp,cortex_a8_vfplite*9") + +(define_insn_reservation "cortex_a8_vfp_muls" 12 + (and (eq_attr "tune" "cortexa8") + (eq_attr "type" "fmuls")) + "cortex_a8_vfp,cortex_a8_vfplite*11") + +(define_insn_reservation "cortex_a8_vfp_muld" 17 + (and (eq_attr "tune" "cortexa8") + (eq_attr "type" "fmuld")) + "cortex_a8_vfp,cortex_a8_vfplite*16") + +(define_insn_reservation "cortex_a8_vfp_macs" 21 + (and (eq_attr "tune" "cortexa8") + (eq_attr "type" "fmacs")) + "cortex_a8_vfp,cortex_a8_vfplite*20") + +(define_insn_reservation "cortex_a8_vfp_macd" 26 + (and (eq_attr "tune" "cortexa8") + (eq_attr "type" "fmacd")) + "cortex_a8_vfp,cortex_a8_vfplite*25") + +(define_insn_reservation "cortex_a8_vfp_divs" 37 + (and (eq_attr "tune" "cortexa8") + (eq_attr "type" "fdivs")) + "cortex_a8_vfp,cortex_a8_vfplite*36") + +(define_insn_reservation "cortex_a8_vfp_divd" 65 + (and (eq_attr "tune" "cortexa8") + (eq_attr "type" "fdivd")) + "cortex_a8_vfp,cortex_a8_vfplite*64") + +;; Comparisons can actually take 7 cycles sometimes instead of four, +;; but given all the other instructions lumped into type=ffarith that +;; take four cycles, we pick that latency. +(define_insn_reservation "cortex_a8_vfp_farith" 4 + (and (eq_attr "tune" "cortexa8") + (eq_attr "type" "fcpys,ffariths,ffarithd,fconsts,fconstd,fcmps,fcmpd")) + "cortex_a8_vfp,cortex_a8_vfplite*3") + +(define_insn_reservation "cortex_a8_vfp_cvt" 7 + (and (eq_attr "tune" "cortexa8") + (eq_attr "type" "f_cvt")) + "cortex_a8_vfp,cortex_a8_vfplite*6") + +;; NEON -> core transfers. + +(define_insn_reservation "cortex_a8_neon_mrc" 20 + (and (eq_attr "tune" "cortexa8") + (eq_attr "neon_type" "neon_mrc")) + "cortex_a8_neon_ls") + +(define_insn_reservation "cortex_a8_neon_mrrc" 21 + (and (eq_attr "tune" "cortexa8") + (eq_attr "neon_type" "neon_mrrc")) + "cortex_a8_neon_ls_2") + +;; The remainder of this file is auto-generated by neon-schedgen. + +;; Instructions using this reservation read their source operands at N2, and +;; produce a result at N3. +(define_insn_reservation "cortex_a8_neon_int_1" 3 + (and (eq_attr "tune" "cortexa8") + (eq_attr "neon_type" "neon_int_1")) + "cortex_a8_neon_dp") + +;; Instructions using this reservation read their (D|Q)m operands at N1, +;; their (D|Q)n operands at N2, and produce a result at N3. +(define_insn_reservation "cortex_a8_neon_int_2" 3 + (and (eq_attr "tune" "cortexa8") + (eq_attr "neon_type" "neon_int_2")) + "cortex_a8_neon_dp") + +;; Instructions using this reservation read their source operands at N1, and +;; produce a result at N3. +(define_insn_reservation "cortex_a8_neon_int_3" 3 + (and (eq_attr "tune" "cortexa8") + (eq_attr "neon_type" "neon_int_3")) + "cortex_a8_neon_dp") + +;; Instructions using this reservation read their source operands at N2, and +;; produce a result at N4. +(define_insn_reservation "cortex_a8_neon_int_4" 4 + (and (eq_attr "tune" "cortexa8") + (eq_attr "neon_type" "neon_int_4")) + "cortex_a8_neon_dp") + +;; Instructions using this reservation read their (D|Q)m operands at N1, +;; their (D|Q)n operands at N2, and produce a result at N4. +(define_insn_reservation "cortex_a8_neon_int_5" 4 + (and (eq_attr "tune" "cortexa8") + (eq_attr "neon_type" "neon_int_5")) + "cortex_a8_neon_dp") + +;; Instructions using this reservation read their source operands at N1, and +;; produce a result at N4. +(define_insn_reservation "cortex_a8_neon_vqneg_vqabs" 4 + (and (eq_attr "tune" "cortexa8") + (eq_attr "neon_type" "neon_vqneg_vqabs")) + "cortex_a8_neon_dp") + +;; Instructions using this reservation produce a result at N3. +(define_insn_reservation "cortex_a8_neon_vmov" 3 + (and (eq_attr "tune" "cortexa8") + (eq_attr "neon_type" "neon_vmov")) + "cortex_a8_neon_dp") + +;; Instructions using this reservation read their (D|Q)n operands at N2, +;; their (D|Q)m operands at N1, their (D|Q)d operands at N3, and +;; produce a result at N6. +(define_insn_reservation "cortex_a8_neon_vaba" 6 + (and (eq_attr "tune" "cortexa8") + (eq_attr "neon_type" "neon_vaba")) + "cortex_a8_neon_dp") + +;; Instructions using this reservation read their (D|Q)n operands at N2, +;; their (D|Q)m operands at N1, their (D|Q)d operands at N3, and +;; produce a result at N6 on cycle 2. +(define_insn_reservation "cortex_a8_neon_vaba_qqq" 7 + (and (eq_attr "tune" "cortexa8") + (eq_attr "neon_type" "neon_vaba_qqq")) + "cortex_a8_neon_dp_2") + +;; Instructions using this reservation read their (D|Q)m operands at N1, +;; their (D|Q)d operands at N3, and produce a result at N6. +(define_insn_reservation "cortex_a8_neon_vsma" 6 + (and (eq_attr "tune" "cortexa8") + (eq_attr "neon_type" "neon_vsma")) + "cortex_a8_neon_dp") + +;; Instructions using this reservation read their source operands at N2, and +;; produce a result at N6. +(define_insn_reservation "cortex_a8_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long" 6 + (and (eq_attr "tune" "cortexa8") + (eq_attr "neon_type" "neon_mul_ddd_8_16_qdd_16_8_long_32_16_long")) + "cortex_a8_neon_dp") + +;; Instructions using this reservation read their source operands at N2, and +;; produce a result at N6 on cycle 2. +(define_insn_reservation "cortex_a8_neon_mul_qqq_8_16_32_ddd_32" 7 + (and (eq_attr "tune" "cortexa8") + (eq_attr "neon_type" "neon_mul_qqq_8_16_32_ddd_32")) + "cortex_a8_neon_dp_2") + +;; Instructions using this reservation read their (D|Q)n operands at N2, +;; their (D|Q)m operands at N1, and produce a result at N6 on cycle 2. +(define_insn_reservation "cortex_a8_neon_mul_qdd_64_32_long_qqd_16_ddd_32_scalar_64_32_long_scalar" 7 + (and (eq_attr "tune" "cortexa8") + (eq_attr "neon_type" "neon_mul_qdd_64_32_long_qqd_16_ddd_32_scalar_64_32_long_scalar")) + "cortex_a8_neon_dp_2") + +;; Instructions using this reservation read their (D|Q)n operands at N2, +;; their (D|Q)m operands at N2, their (D|Q)d operands at N3, and +;; produce a result at N6. +(define_insn_reservation "cortex_a8_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long" 6 + (and (eq_attr "tune" "cortexa8") + (eq_attr "neon_type" "neon_mla_ddd_8_16_qdd_16_8_long_32_16_long")) + "cortex_a8_neon_dp") + +;; Instructions using this reservation read their (D|Q)n operands at N2, +;; their (D|Q)m operands at N2, their (D|Q)d operands at N3, and +;; produce a result at N6 on cycle 2. +(define_insn_reservation "cortex_a8_neon_mla_qqq_8_16" 7 + (and (eq_attr "tune" "cortexa8") + (eq_attr "neon_type" "neon_mla_qqq_8_16")) + "cortex_a8_neon_dp_2") + +;; Instructions using this reservation read their (D|Q)n operands at N2, +;; their (D|Q)m operands at N1, their (D|Q)d operands at N3, and +;; produce a result at N6 on cycle 2. +(define_insn_reservation "cortex_a8_neon_mla_ddd_32_qqd_16_ddd_32_scalar_qdd_64_32_long_scalar_qdd_64_32_long" 7 + (and (eq_attr "tune" "cortexa8") + (eq_attr "neon_type" "neon_mla_ddd_32_qqd_16_ddd_32_scalar_qdd_64_32_long_scalar_qdd_64_32_long")) + "cortex_a8_neon_dp_2") + +;; Instructions using this reservation read their (D|Q)n operands at N2, +;; their (D|Q)m operands at N1, their (D|Q)d operands at N3, and +;; produce a result at N6 on cycle 4. +(define_insn_reservation "cortex_a8_neon_mla_qqq_32_qqd_32_scalar" 9 + (and (eq_attr "tune" "cortexa8") + (eq_attr "neon_type" "neon_mla_qqq_32_qqd_32_scalar")) + "cortex_a8_neon_dp_4") + +;; Instructions using this reservation read their (D|Q)n operands at N2, +;; their (D|Q)m operands at N1, and produce a result at N6. +(define_insn_reservation "cortex_a8_neon_mul_ddd_16_scalar_32_16_long_scalar" 6 + (and (eq_attr "tune" "cortexa8") + (eq_attr "neon_type" "neon_mul_ddd_16_scalar_32_16_long_scalar")) + "cortex_a8_neon_dp") + +;; Instructions using this reservation read their (D|Q)n operands at N2, +;; their (D|Q)m operands at N1, and produce a result at N6 on cycle 4. +(define_insn_reservation "cortex_a8_neon_mul_qqd_32_scalar" 9 + (and (eq_attr "tune" "cortexa8") + (eq_attr "neon_type" "neon_mul_qqd_32_scalar")) + "cortex_a8_neon_dp_4") + +;; Instructions using this reservation read their (D|Q)n operands at N2, +;; their (D|Q)m operands at N1, their (D|Q)d operands at N3, and +;; produce a result at N6. +(define_insn_reservation "cortex_a8_neon_mla_ddd_16_scalar_qdd_32_16_long_scalar" 6 + (and (eq_attr "tune" "cortexa8") + (eq_attr "neon_type" "neon_mla_ddd_16_scalar_qdd_32_16_long_scalar")) + "cortex_a8_neon_dp") + +;; Instructions using this reservation read their source operands at N1, and +;; produce a result at N3. +(define_insn_reservation "cortex_a8_neon_shift_1" 3 + (and (eq_attr "tune" "cortexa8") + (eq_attr "neon_type" "neon_shift_1")) + "cortex_a8_neon_dp") + +;; Instructions using this reservation read their source operands at N1, and +;; produce a result at N4. +(define_insn_reservation "cortex_a8_neon_shift_2" 4 + (and (eq_attr "tune" "cortexa8") + (eq_attr "neon_type" "neon_shift_2")) + "cortex_a8_neon_dp") + +;; Instructions using this reservation read their source operands at N1, and +;; produce a result at N3 on cycle 2. +(define_insn_reservation "cortex_a8_neon_shift_3" 4 + (and (eq_attr "tune" "cortexa8") + (eq_attr "neon_type" "neon_shift_3")) + "cortex_a8_neon_dp_2") + +;; Instructions using this reservation read their source operands at N1, and +;; produce a result at N1. +(define_insn_reservation "cortex_a8_neon_vshl_ddd" 1 + (and (eq_attr "tune" "cortexa8") + (eq_attr "neon_type" "neon_vshl_ddd")) + "cortex_a8_neon_dp") + +;; Instructions using this reservation read their source operands at N1, and +;; produce a result at N4 on cycle 2. +(define_insn_reservation "cortex_a8_neon_vqshl_vrshl_vqrshl_qqq" 5 + (and (eq_attr "tune" "cortexa8") + (eq_attr "neon_type" "neon_vqshl_vrshl_vqrshl_qqq")) + "cortex_a8_neon_dp_2") + +;; Instructions using this reservation read their (D|Q)m operands at N1, +;; their (D|Q)d operands at N3, and produce a result at N6. +(define_insn_reservation "cortex_a8_neon_vsra_vrsra" 6 + (and (eq_attr "tune" "cortexa8") + (eq_attr "neon_type" "neon_vsra_vrsra")) + "cortex_a8_neon_dp") + +;; Instructions using this reservation read their source operands at N2, and +;; produce a result at N5. +(define_insn_reservation "cortex_a8_neon_fp_vadd_ddd_vabs_dd" 5 + (and (eq_attr "tune" "cortexa8") + (eq_attr "neon_type" "neon_fp_vadd_ddd_vabs_dd")) + "cortex_a8_neon_fadd") + +;; Instructions using this reservation read their source operands at N2, and +;; produce a result at N5 on cycle 2. +(define_insn_reservation "cortex_a8_neon_fp_vadd_qqq_vabs_qq" 6 + (and (eq_attr "tune" "cortexa8") + (eq_attr "neon_type" "neon_fp_vadd_qqq_vabs_qq")) + "cortex_a8_neon_fadd_2") + +;; Instructions using this reservation read their source operands at N1, and +;; produce a result at N5. +(define_insn_reservation "cortex_a8_neon_fp_vsum" 5 + (and (eq_attr "tune" "cortexa8") + (eq_attr "neon_type" "neon_fp_vsum")) + "cortex_a8_neon_fadd") + +;; Instructions using this reservation read their (D|Q)n operands at N2, +;; their (D|Q)m operands at N1, and produce a result at N5. +(define_insn_reservation "cortex_a8_neon_fp_vmul_ddd" 5 + (and (eq_attr "tune" "cortexa8") + (eq_attr "neon_type" "neon_fp_vmul_ddd")) + "cortex_a8_neon_dp") + +;; Instructions using this reservation read their (D|Q)n operands at N2, +;; their (D|Q)m operands at N1, and produce a result at N5 on cycle 2. +(define_insn_reservation "cortex_a8_neon_fp_vmul_qqd" 6 + (and (eq_attr "tune" "cortexa8") + (eq_attr "neon_type" "neon_fp_vmul_qqd")) + "cortex_a8_neon_dp_2") + +;; Instructions using this reservation read their (D|Q)n operands at N2, +;; their (D|Q)m operands at N2, their (D|Q)d operands at N3, and +;; produce a result at N9. +(define_insn_reservation "cortex_a8_neon_fp_vmla_ddd" 9 + (and (eq_attr "tune" "cortexa8") + (eq_attr "neon_type" "neon_fp_vmla_ddd")) + "cortex_a8_neon_fmul_then_fadd") + +;; Instructions using this reservation read their (D|Q)n operands at N2, +;; their (D|Q)m operands at N2, their (D|Q)d operands at N3, and +;; produce a result at N9 on cycle 2. +(define_insn_reservation "cortex_a8_neon_fp_vmla_qqq" 10 + (and (eq_attr "tune" "cortexa8") + (eq_attr "neon_type" "neon_fp_vmla_qqq")) + "cortex_a8_neon_fmul_then_fadd_2") + +;; Instructions using this reservation read their (D|Q)n operands at N2, +;; their (D|Q)m operands at N1, their (D|Q)d operands at N3, and +;; produce a result at N9. +(define_insn_reservation "cortex_a8_neon_fp_vmla_ddd_scalar" 9 + (and (eq_attr "tune" "cortexa8") + (eq_attr "neon_type" "neon_fp_vmla_ddd_scalar")) + "cortex_a8_neon_fmul_then_fadd") + +;; Instructions using this reservation read their (D|Q)n operands at N2, +;; their (D|Q)m operands at N1, their (D|Q)d operands at N3, and +;; produce a result at N9 on cycle 2. +(define_insn_reservation "cortex_a8_neon_fp_vmla_qqq_scalar" 10 + (and (eq_attr "tune" "cortexa8") + (eq_attr "neon_type" "neon_fp_vmla_qqq_scalar")) + "cortex_a8_neon_fmul_then_fadd_2") + +;; Instructions using this reservation read their source operands at N2, and +;; produce a result at N9. +(define_insn_reservation "cortex_a8_neon_fp_vrecps_vrsqrts_ddd" 9 + (and (eq_attr "tune" "cortexa8") + (eq_attr "neon_type" "neon_fp_vrecps_vrsqrts_ddd")) + "cortex_a8_neon_fmul_then_fadd") + +;; Instructions using this reservation read their source operands at N2, and +;; produce a result at N9 on cycle 2. +(define_insn_reservation "cortex_a8_neon_fp_vrecps_vrsqrts_qqq" 10 + (and (eq_attr "tune" "cortexa8") + (eq_attr "neon_type" "neon_fp_vrecps_vrsqrts_qqq")) + "cortex_a8_neon_fmul_then_fadd_2") + +;; Instructions using this reservation read their source operands at N1, and +;; produce a result at N2. +(define_insn_reservation "cortex_a8_neon_bp_simple" 2 + (and (eq_attr "tune" "cortexa8") + (eq_attr "neon_type" "neon_bp_simple")) + "cortex_a8_neon_perm") + +;; Instructions using this reservation read their source operands at N1, and +;; produce a result at N2 on cycle 2. +(define_insn_reservation "cortex_a8_neon_bp_2cycle" 3 + (and (eq_attr "tune" "cortexa8") + (eq_attr "neon_type" "neon_bp_2cycle")) + "cortex_a8_neon_perm_2") + +;; Instructions using this reservation read their source operands at N1, and +;; produce a result at N2 on cycle 3. +(define_insn_reservation "cortex_a8_neon_bp_3cycle" 4 + (and (eq_attr "tune" "cortexa8") + (eq_attr "neon_type" "neon_bp_3cycle")) + "cortex_a8_neon_perm_3") + +;; Instructions using this reservation produce a result at N1. +(define_insn_reservation "cortex_a8_neon_ldr" 1 + (and (eq_attr "tune" "cortexa8") + (eq_attr "neon_type" "neon_ldr")) + "cortex_a8_neon_ls") + +;; Instructions using this reservation read their source operands at N1. +(define_insn_reservation "cortex_a8_neon_str" 0 + (and (eq_attr "tune" "cortexa8") + (eq_attr "neon_type" "neon_str")) + "cortex_a8_neon_ls") + +;; Instructions using this reservation produce a result at N1 on cycle 2. +(define_insn_reservation "cortex_a8_neon_vld1_1_2_regs" 2 + (and (eq_attr "tune" "cortexa8") + (eq_attr "neon_type" "neon_vld1_1_2_regs")) + "cortex_a8_neon_ls_2") + +;; Instructions using this reservation produce a result at N1 on cycle 3. +(define_insn_reservation "cortex_a8_neon_vld1_3_4_regs" 3 + (and (eq_attr "tune" "cortexa8") + (eq_attr "neon_type" "neon_vld1_3_4_regs")) + "cortex_a8_neon_ls_3") + +;; Instructions using this reservation produce a result at N2 on cycle 2. +(define_insn_reservation "cortex_a8_neon_vld2_2_regs_vld1_vld2_all_lanes" 3 + (and (eq_attr "tune" "cortexa8") + (eq_attr "neon_type" "neon_vld2_2_regs_vld1_vld2_all_lanes")) + "cortex_a8_neon_ls_2") + +;; Instructions using this reservation produce a result at N2 on cycle 3. +(define_insn_reservation "cortex_a8_neon_vld2_4_regs" 4 + (and (eq_attr "tune" "cortexa8") + (eq_attr "neon_type" "neon_vld2_4_regs")) + "cortex_a8_neon_ls_3") + +;; Instructions using this reservation produce a result at N2 on cycle 4. +(define_insn_reservation "cortex_a8_neon_vld3_vld4" 5 + (and (eq_attr "tune" "cortexa8") + (eq_attr "neon_type" "neon_vld3_vld4")) + "cortex_a8_neon_ls_4") + +;; Instructions using this reservation read their source operands at N1. +(define_insn_reservation "cortex_a8_neon_vst1_1_2_regs_vst2_2_regs" 0 + (and (eq_attr "tune" "cortexa8") + (eq_attr "neon_type" "neon_vst1_1_2_regs_vst2_2_regs")) + "cortex_a8_neon_ls_2") + +;; Instructions using this reservation read their source operands at N1. +(define_insn_reservation "cortex_a8_neon_vst1_3_4_regs" 0 + (and (eq_attr "tune" "cortexa8") + (eq_attr "neon_type" "neon_vst1_3_4_regs")) + "cortex_a8_neon_ls_3") + +;; Instructions using this reservation read their source operands at N1. +(define_insn_reservation "cortex_a8_neon_vst2_4_regs_vst3_vst4" 0 + (and (eq_attr "tune" "cortexa8") + (eq_attr "neon_type" "neon_vst2_4_regs_vst3_vst4")) + "cortex_a8_neon_ls_4") + +;; Instructions using this reservation read their source operands at N1. +(define_insn_reservation "cortex_a8_neon_vst3_vst4" 0 + (and (eq_attr "tune" "cortexa8") + (eq_attr "neon_type" "neon_vst3_vst4")) + "cortex_a8_neon_ls_4") + +;; Instructions using this reservation read their source operands at N1, and +;; produce a result at N2 on cycle 3. +(define_insn_reservation "cortex_a8_neon_vld1_vld2_lane" 4 + (and (eq_attr "tune" "cortexa8") + (eq_attr "neon_type" "neon_vld1_vld2_lane")) + "cortex_a8_neon_ls_3") + +;; Instructions using this reservation read their source operands at N1, and +;; produce a result at N2 on cycle 5. +(define_insn_reservation "cortex_a8_neon_vld3_vld4_lane" 6 + (and (eq_attr "tune" "cortexa8") + (eq_attr "neon_type" "neon_vld3_vld4_lane")) + "cortex_a8_neon_ls_5") + +;; Instructions using this reservation read their source operands at N1. +(define_insn_reservation "cortex_a8_neon_vst1_vst2_lane" 0 + (and (eq_attr "tune" "cortexa8") + (eq_attr "neon_type" "neon_vst1_vst2_lane")) + "cortex_a8_neon_ls_2") + +;; Instructions using this reservation read their source operands at N1. +(define_insn_reservation "cortex_a8_neon_vst3_vst4_lane" 0 + (and (eq_attr "tune" "cortexa8") + (eq_attr "neon_type" "neon_vst3_vst4_lane")) + "cortex_a8_neon_ls_3") + +;; Instructions using this reservation produce a result at N2 on cycle 2. +(define_insn_reservation "cortex_a8_neon_vld3_vld4_all_lanes" 3 + (and (eq_attr "tune" "cortexa8") + (eq_attr "neon_type" "neon_vld3_vld4_all_lanes")) + "cortex_a8_neon_ls_3") + +;; Instructions using this reservation produce a result at N2. +(define_insn_reservation "cortex_a8_neon_mcr" 2 + (and (eq_attr "tune" "cortexa8") + (eq_attr "neon_type" "neon_mcr")) + "cortex_a8_neon_perm") + +;; Instructions using this reservation produce a result at N2. +(define_insn_reservation "cortex_a8_neon_mcr_2_mcrr" 2 + (and (eq_attr "tune" "cortexa8") + (eq_attr "neon_type" "neon_mcr_2_mcrr")) + "cortex_a8_neon_perm_2") + +;; Exceptions to the default latencies. + +(define_bypass 1 "cortex_a8_neon_mcr_2_mcrr" + "cortex_a8_neon_int_1,\ + cortex_a8_neon_int_4,\ + cortex_a8_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a8_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mla_qqq_8_16,\ + cortex_a8_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a8_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a8_neon_fp_vmla_ddd,\ + cortex_a8_neon_fp_vmla_qqq,\ + cortex_a8_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a8_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 1 "cortex_a8_neon_mcr" + "cortex_a8_neon_int_1,\ + cortex_a8_neon_int_4,\ + cortex_a8_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a8_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mla_qqq_8_16,\ + cortex_a8_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a8_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a8_neon_fp_vmla_ddd,\ + cortex_a8_neon_fp_vmla_qqq,\ + cortex_a8_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a8_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 2 "cortex_a8_neon_vld3_vld4_all_lanes" + "cortex_a8_neon_int_1,\ + cortex_a8_neon_int_4,\ + cortex_a8_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a8_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mla_qqq_8_16,\ + cortex_a8_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a8_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a8_neon_fp_vmla_ddd,\ + cortex_a8_neon_fp_vmla_qqq,\ + cortex_a8_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a8_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 5 "cortex_a8_neon_vld3_vld4_lane" + "cortex_a8_neon_int_1,\ + cortex_a8_neon_int_4,\ + cortex_a8_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a8_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mla_qqq_8_16,\ + cortex_a8_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a8_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a8_neon_fp_vmla_ddd,\ + cortex_a8_neon_fp_vmla_qqq,\ + cortex_a8_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a8_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 3 "cortex_a8_neon_vld1_vld2_lane" + "cortex_a8_neon_int_1,\ + cortex_a8_neon_int_4,\ + cortex_a8_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a8_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mla_qqq_8_16,\ + cortex_a8_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a8_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a8_neon_fp_vmla_ddd,\ + cortex_a8_neon_fp_vmla_qqq,\ + cortex_a8_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a8_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 4 "cortex_a8_neon_vld3_vld4" + "cortex_a8_neon_int_1,\ + cortex_a8_neon_int_4,\ + cortex_a8_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a8_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mla_qqq_8_16,\ + cortex_a8_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a8_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a8_neon_fp_vmla_ddd,\ + cortex_a8_neon_fp_vmla_qqq,\ + cortex_a8_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a8_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 3 "cortex_a8_neon_vld2_4_regs" + "cortex_a8_neon_int_1,\ + cortex_a8_neon_int_4,\ + cortex_a8_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a8_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mla_qqq_8_16,\ + cortex_a8_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a8_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a8_neon_fp_vmla_ddd,\ + cortex_a8_neon_fp_vmla_qqq,\ + cortex_a8_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a8_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 2 "cortex_a8_neon_vld2_2_regs_vld1_vld2_all_lanes" + "cortex_a8_neon_int_1,\ + cortex_a8_neon_int_4,\ + cortex_a8_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a8_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mla_qqq_8_16,\ + cortex_a8_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a8_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a8_neon_fp_vmla_ddd,\ + cortex_a8_neon_fp_vmla_qqq,\ + cortex_a8_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a8_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 2 "cortex_a8_neon_vld1_3_4_regs" + "cortex_a8_neon_int_1,\ + cortex_a8_neon_int_4,\ + cortex_a8_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a8_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mla_qqq_8_16,\ + cortex_a8_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a8_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a8_neon_fp_vmla_ddd,\ + cortex_a8_neon_fp_vmla_qqq,\ + cortex_a8_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a8_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 1 "cortex_a8_neon_vld1_1_2_regs" + "cortex_a8_neon_int_1,\ + cortex_a8_neon_int_4,\ + cortex_a8_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a8_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mla_qqq_8_16,\ + cortex_a8_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a8_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a8_neon_fp_vmla_ddd,\ + cortex_a8_neon_fp_vmla_qqq,\ + cortex_a8_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a8_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 0 "cortex_a8_neon_ldr" + "cortex_a8_neon_int_1,\ + cortex_a8_neon_int_4,\ + cortex_a8_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a8_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mla_qqq_8_16,\ + cortex_a8_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a8_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a8_neon_fp_vmla_ddd,\ + cortex_a8_neon_fp_vmla_qqq,\ + cortex_a8_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a8_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 3 "cortex_a8_neon_bp_3cycle" + "cortex_a8_neon_int_1,\ + cortex_a8_neon_int_4,\ + cortex_a8_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a8_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mla_qqq_8_16,\ + cortex_a8_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a8_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a8_neon_fp_vmla_ddd,\ + cortex_a8_neon_fp_vmla_qqq,\ + cortex_a8_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a8_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 2 "cortex_a8_neon_bp_2cycle" + "cortex_a8_neon_int_1,\ + cortex_a8_neon_int_4,\ + cortex_a8_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a8_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mla_qqq_8_16,\ + cortex_a8_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a8_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a8_neon_fp_vmla_ddd,\ + cortex_a8_neon_fp_vmla_qqq,\ + cortex_a8_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a8_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 1 "cortex_a8_neon_bp_simple" + "cortex_a8_neon_int_1,\ + cortex_a8_neon_int_4,\ + cortex_a8_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a8_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mla_qqq_8_16,\ + cortex_a8_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a8_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a8_neon_fp_vmla_ddd,\ + cortex_a8_neon_fp_vmla_qqq,\ + cortex_a8_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a8_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 9 "cortex_a8_neon_fp_vrecps_vrsqrts_qqq" + "cortex_a8_neon_int_1,\ + cortex_a8_neon_int_4,\ + cortex_a8_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a8_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mla_qqq_8_16,\ + cortex_a8_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a8_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a8_neon_fp_vmla_ddd,\ + cortex_a8_neon_fp_vmla_qqq,\ + cortex_a8_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a8_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 8 "cortex_a8_neon_fp_vrecps_vrsqrts_ddd" + "cortex_a8_neon_int_1,\ + cortex_a8_neon_int_4,\ + cortex_a8_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a8_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mla_qqq_8_16,\ + cortex_a8_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a8_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a8_neon_fp_vmla_ddd,\ + cortex_a8_neon_fp_vmla_qqq,\ + cortex_a8_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a8_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 9 "cortex_a8_neon_fp_vmla_qqq_scalar" + "cortex_a8_neon_int_1,\ + cortex_a8_neon_int_4,\ + cortex_a8_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a8_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mla_qqq_8_16,\ + cortex_a8_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a8_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a8_neon_fp_vmla_ddd,\ + cortex_a8_neon_fp_vmla_qqq,\ + cortex_a8_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a8_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 8 "cortex_a8_neon_fp_vmla_ddd_scalar" + "cortex_a8_neon_int_1,\ + cortex_a8_neon_int_4,\ + cortex_a8_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a8_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mla_qqq_8_16,\ + cortex_a8_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a8_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a8_neon_fp_vmla_ddd,\ + cortex_a8_neon_fp_vmla_qqq,\ + cortex_a8_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a8_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 9 "cortex_a8_neon_fp_vmla_qqq" + "cortex_a8_neon_int_1,\ + cortex_a8_neon_int_4,\ + cortex_a8_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a8_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mla_qqq_8_16,\ + cortex_a8_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a8_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a8_neon_fp_vmla_ddd,\ + cortex_a8_neon_fp_vmla_qqq,\ + cortex_a8_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a8_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 8 "cortex_a8_neon_fp_vmla_ddd" + "cortex_a8_neon_int_1,\ + cortex_a8_neon_int_4,\ + cortex_a8_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a8_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mla_qqq_8_16,\ + cortex_a8_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a8_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a8_neon_fp_vmla_ddd,\ + cortex_a8_neon_fp_vmla_qqq,\ + cortex_a8_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a8_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 5 "cortex_a8_neon_fp_vmul_qqd" + "cortex_a8_neon_int_1,\ + cortex_a8_neon_int_4,\ + cortex_a8_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a8_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mla_qqq_8_16,\ + cortex_a8_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a8_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a8_neon_fp_vmla_ddd,\ + cortex_a8_neon_fp_vmla_qqq,\ + cortex_a8_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a8_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 4 "cortex_a8_neon_fp_vmul_ddd" + "cortex_a8_neon_int_1,\ + cortex_a8_neon_int_4,\ + cortex_a8_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a8_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mla_qqq_8_16,\ + cortex_a8_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a8_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a8_neon_fp_vmla_ddd,\ + cortex_a8_neon_fp_vmla_qqq,\ + cortex_a8_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a8_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 4 "cortex_a8_neon_fp_vsum" + "cortex_a8_neon_int_1,\ + cortex_a8_neon_int_4,\ + cortex_a8_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a8_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mla_qqq_8_16,\ + cortex_a8_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a8_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a8_neon_fp_vmla_ddd,\ + cortex_a8_neon_fp_vmla_qqq,\ + cortex_a8_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a8_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 5 "cortex_a8_neon_fp_vadd_qqq_vabs_qq" + "cortex_a8_neon_int_1,\ + cortex_a8_neon_int_4,\ + cortex_a8_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a8_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mla_qqq_8_16,\ + cortex_a8_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a8_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a8_neon_fp_vmla_ddd,\ + cortex_a8_neon_fp_vmla_qqq,\ + cortex_a8_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a8_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 4 "cortex_a8_neon_fp_vadd_ddd_vabs_dd" + "cortex_a8_neon_int_1,\ + cortex_a8_neon_int_4,\ + cortex_a8_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a8_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mla_qqq_8_16,\ + cortex_a8_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a8_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a8_neon_fp_vmla_ddd,\ + cortex_a8_neon_fp_vmla_qqq,\ + cortex_a8_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a8_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 5 "cortex_a8_neon_vsra_vrsra" + "cortex_a8_neon_int_1,\ + cortex_a8_neon_int_4,\ + cortex_a8_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a8_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mla_qqq_8_16,\ + cortex_a8_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a8_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a8_neon_fp_vmla_ddd,\ + cortex_a8_neon_fp_vmla_qqq,\ + cortex_a8_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a8_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 4 "cortex_a8_neon_vqshl_vrshl_vqrshl_qqq" + "cortex_a8_neon_int_1,\ + cortex_a8_neon_int_4,\ + cortex_a8_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a8_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mla_qqq_8_16,\ + cortex_a8_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a8_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a8_neon_fp_vmla_ddd,\ + cortex_a8_neon_fp_vmla_qqq,\ + cortex_a8_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a8_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 0 "cortex_a8_neon_vshl_ddd" + "cortex_a8_neon_int_1,\ + cortex_a8_neon_int_4,\ + cortex_a8_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a8_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mla_qqq_8_16,\ + cortex_a8_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a8_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a8_neon_fp_vmla_ddd,\ + cortex_a8_neon_fp_vmla_qqq,\ + cortex_a8_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a8_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 3 "cortex_a8_neon_shift_3" + "cortex_a8_neon_int_1,\ + cortex_a8_neon_int_4,\ + cortex_a8_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a8_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mla_qqq_8_16,\ + cortex_a8_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a8_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a8_neon_fp_vmla_ddd,\ + cortex_a8_neon_fp_vmla_qqq,\ + cortex_a8_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a8_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 3 "cortex_a8_neon_shift_2" + "cortex_a8_neon_int_1,\ + cortex_a8_neon_int_4,\ + cortex_a8_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a8_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mla_qqq_8_16,\ + cortex_a8_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a8_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a8_neon_fp_vmla_ddd,\ + cortex_a8_neon_fp_vmla_qqq,\ + cortex_a8_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a8_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 2 "cortex_a8_neon_shift_1" + "cortex_a8_neon_int_1,\ + cortex_a8_neon_int_4,\ + cortex_a8_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a8_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mla_qqq_8_16,\ + cortex_a8_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a8_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a8_neon_fp_vmla_ddd,\ + cortex_a8_neon_fp_vmla_qqq,\ + cortex_a8_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a8_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 5 "cortex_a8_neon_mla_ddd_16_scalar_qdd_32_16_long_scalar" + "cortex_a8_neon_int_1,\ + cortex_a8_neon_int_4,\ + cortex_a8_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a8_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mla_qqq_8_16,\ + cortex_a8_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a8_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a8_neon_fp_vmla_ddd,\ + cortex_a8_neon_fp_vmla_qqq,\ + cortex_a8_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a8_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 8 "cortex_a8_neon_mul_qqd_32_scalar" + "cortex_a8_neon_int_1,\ + cortex_a8_neon_int_4,\ + cortex_a8_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a8_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mla_qqq_8_16,\ + cortex_a8_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a8_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a8_neon_fp_vmla_ddd,\ + cortex_a8_neon_fp_vmla_qqq,\ + cortex_a8_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a8_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 5 "cortex_a8_neon_mul_ddd_16_scalar_32_16_long_scalar" + "cortex_a8_neon_int_1,\ + cortex_a8_neon_int_4,\ + cortex_a8_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a8_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mla_qqq_8_16,\ + cortex_a8_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a8_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a8_neon_fp_vmla_ddd,\ + cortex_a8_neon_fp_vmla_qqq,\ + cortex_a8_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a8_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 8 "cortex_a8_neon_mla_qqq_32_qqd_32_scalar" + "cortex_a8_neon_int_1,\ + cortex_a8_neon_int_4,\ + cortex_a8_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a8_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mla_qqq_8_16,\ + cortex_a8_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a8_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a8_neon_fp_vmla_ddd,\ + cortex_a8_neon_fp_vmla_qqq,\ + cortex_a8_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a8_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 6 "cortex_a8_neon_mla_ddd_32_qqd_16_ddd_32_scalar_qdd_64_32_long_scalar_qdd_64_32_long" + "cortex_a8_neon_int_1,\ + cortex_a8_neon_int_4,\ + cortex_a8_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a8_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mla_qqq_8_16,\ + cortex_a8_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a8_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a8_neon_fp_vmla_ddd,\ + cortex_a8_neon_fp_vmla_qqq,\ + cortex_a8_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a8_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 6 "cortex_a8_neon_mla_qqq_8_16" + "cortex_a8_neon_int_1,\ + cortex_a8_neon_int_4,\ + cortex_a8_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a8_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mla_qqq_8_16,\ + cortex_a8_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a8_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a8_neon_fp_vmla_ddd,\ + cortex_a8_neon_fp_vmla_qqq,\ + cortex_a8_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a8_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 5 "cortex_a8_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long" + "cortex_a8_neon_int_1,\ + cortex_a8_neon_int_4,\ + cortex_a8_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a8_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mla_qqq_8_16,\ + cortex_a8_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a8_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a8_neon_fp_vmla_ddd,\ + cortex_a8_neon_fp_vmla_qqq,\ + cortex_a8_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a8_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 6 "cortex_a8_neon_mul_qdd_64_32_long_qqd_16_ddd_32_scalar_64_32_long_scalar" + "cortex_a8_neon_int_1,\ + cortex_a8_neon_int_4,\ + cortex_a8_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a8_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mla_qqq_8_16,\ + cortex_a8_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a8_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a8_neon_fp_vmla_ddd,\ + cortex_a8_neon_fp_vmla_qqq,\ + cortex_a8_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a8_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 6 "cortex_a8_neon_mul_qqq_8_16_32_ddd_32" + "cortex_a8_neon_int_1,\ + cortex_a8_neon_int_4,\ + cortex_a8_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a8_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mla_qqq_8_16,\ + cortex_a8_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a8_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a8_neon_fp_vmla_ddd,\ + cortex_a8_neon_fp_vmla_qqq,\ + cortex_a8_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a8_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 5 "cortex_a8_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long" + "cortex_a8_neon_int_1,\ + cortex_a8_neon_int_4,\ + cortex_a8_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a8_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mla_qqq_8_16,\ + cortex_a8_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a8_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a8_neon_fp_vmla_ddd,\ + cortex_a8_neon_fp_vmla_qqq,\ + cortex_a8_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a8_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 5 "cortex_a8_neon_vsma" + "cortex_a8_neon_int_1,\ + cortex_a8_neon_int_4,\ + cortex_a8_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a8_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mla_qqq_8_16,\ + cortex_a8_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a8_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a8_neon_fp_vmla_ddd,\ + cortex_a8_neon_fp_vmla_qqq,\ + cortex_a8_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a8_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 6 "cortex_a8_neon_vaba_qqq" + "cortex_a8_neon_int_1,\ + cortex_a8_neon_int_4,\ + cortex_a8_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a8_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mla_qqq_8_16,\ + cortex_a8_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a8_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a8_neon_fp_vmla_ddd,\ + cortex_a8_neon_fp_vmla_qqq,\ + cortex_a8_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a8_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 5 "cortex_a8_neon_vaba" + "cortex_a8_neon_int_1,\ + cortex_a8_neon_int_4,\ + cortex_a8_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a8_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mla_qqq_8_16,\ + cortex_a8_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a8_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a8_neon_fp_vmla_ddd,\ + cortex_a8_neon_fp_vmla_qqq,\ + cortex_a8_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a8_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 2 "cortex_a8_neon_vmov" + "cortex_a8_neon_int_1,\ + cortex_a8_neon_int_4,\ + cortex_a8_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a8_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mla_qqq_8_16,\ + cortex_a8_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a8_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a8_neon_fp_vmla_ddd,\ + cortex_a8_neon_fp_vmla_qqq,\ + cortex_a8_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a8_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 3 "cortex_a8_neon_vqneg_vqabs" + "cortex_a8_neon_int_1,\ + cortex_a8_neon_int_4,\ + cortex_a8_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a8_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mla_qqq_8_16,\ + cortex_a8_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a8_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a8_neon_fp_vmla_ddd,\ + cortex_a8_neon_fp_vmla_qqq,\ + cortex_a8_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a8_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 3 "cortex_a8_neon_int_5" + "cortex_a8_neon_int_1,\ + cortex_a8_neon_int_4,\ + cortex_a8_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a8_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mla_qqq_8_16,\ + cortex_a8_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a8_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a8_neon_fp_vmla_ddd,\ + cortex_a8_neon_fp_vmla_qqq,\ + cortex_a8_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a8_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 3 "cortex_a8_neon_int_4" + "cortex_a8_neon_int_1,\ + cortex_a8_neon_int_4,\ + cortex_a8_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a8_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mla_qqq_8_16,\ + cortex_a8_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a8_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a8_neon_fp_vmla_ddd,\ + cortex_a8_neon_fp_vmla_qqq,\ + cortex_a8_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a8_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 2 "cortex_a8_neon_int_3" + "cortex_a8_neon_int_1,\ + cortex_a8_neon_int_4,\ + cortex_a8_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a8_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mla_qqq_8_16,\ + cortex_a8_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a8_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a8_neon_fp_vmla_ddd,\ + cortex_a8_neon_fp_vmla_qqq,\ + cortex_a8_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a8_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 2 "cortex_a8_neon_int_2" + "cortex_a8_neon_int_1,\ + cortex_a8_neon_int_4,\ + cortex_a8_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a8_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mla_qqq_8_16,\ + cortex_a8_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a8_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a8_neon_fp_vmla_ddd,\ + cortex_a8_neon_fp_vmla_qqq,\ + cortex_a8_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a8_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 2 "cortex_a8_neon_int_1" + "cortex_a8_neon_int_1,\ + cortex_a8_neon_int_4,\ + cortex_a8_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a8_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a8_neon_mla_qqq_8_16,\ + cortex_a8_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a8_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a8_neon_fp_vmla_ddd,\ + cortex_a8_neon_fp_vmla_qqq,\ + cortex_a8_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a8_neon_fp_vrecps_vrsqrts_qqq") + diff --git a/gcc/config/arm/cortex-a8.md b/gcc/config/arm/cortex-a8.md new file mode 100644 index 000000000..1922e5cf4 --- /dev/null +++ b/gcc/config/arm/cortex-a8.md @@ -0,0 +1,275 @@ +;; ARM Cortex-A8 scheduling description. +;; Copyright (C) 2007, 2010 Free Software Foundation, Inc. +;; Contributed by CodeSourcery. + +;; This file is part of GCC. + +;; GCC is free software; you can redistribute it and/or modify it +;; under the terms of the GNU General Public License as published +;; by the Free Software Foundation; either version 3, or (at your +;; option) any later version. + +;; GCC is distributed in the hope that it will be useful, but WITHOUT +;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +;; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public +;; License for more details. + +;; You should have received a copy of the GNU General Public License +;; along with GCC; see the file COPYING3. If not see +;; . + +(define_automaton "cortex_a8") + +;; Only one load/store instruction can be issued per cycle +;; (although reservation of this unit is only required for single +;; loads and stores -- see below). +(define_cpu_unit "cortex_a8_issue_ls" "cortex_a8") + +;; Only one branch instruction can be issued per cycle. +(define_cpu_unit "cortex_a8_issue_branch" "cortex_a8") + +;; The two ALU pipelines. +(define_cpu_unit "cortex_a8_alu0" "cortex_a8") +(define_cpu_unit "cortex_a8_alu1" "cortex_a8") + +;; The usual flow of an instruction through the pipelines. +(define_reservation "cortex_a8_default" + "cortex_a8_alu0|cortex_a8_alu1") + +;; The flow of a branch instruction through the pipelines. +(define_reservation "cortex_a8_branch" + "(cortex_a8_alu0+cortex_a8_issue_branch)|\ + (cortex_a8_alu1+cortex_a8_issue_branch)") + +;; The flow of a load or store instruction through the pipeline in +;; the case where that instruction consists of only one micro-op... +(define_reservation "cortex_a8_load_store_1" + "(cortex_a8_alu0+cortex_a8_issue_ls)|\ + (cortex_a8_alu1+cortex_a8_issue_ls)") + +;; ...and in the case of two micro-ops. Dual issue is altogether forbidden +;; during the issue cycle of the first micro-op. (Instead of modelling +;; a separate issue unit, we instead reserve alu0 and alu1 to +;; prevent any other instructions from being issued upon that first cycle.) +;; Even though the load/store pipeline is usually available in either +;; ALU pipe, multi-cycle instructions always issue in pipeline 0. +(define_reservation "cortex_a8_load_store_2" + "cortex_a8_alu0+cortex_a8_alu1+cortex_a8_issue_ls,\ + cortex_a8_alu0+cortex_a8_issue_ls") + +;; The flow of a single-cycle multiplication. +(define_reservation "cortex_a8_multiply" + "cortex_a8_alu0") + +;; The flow of a multiplication instruction that gets decomposed into +;; two micro-ops. The two micro-ops will be issued to pipeline 0 on +;; successive cycles. Dual issue cannot happen at the same time as the +;; first of the micro-ops. +(define_reservation "cortex_a8_multiply_2" + "cortex_a8_alu0+cortex_a8_alu1,\ + cortex_a8_alu0") + +;; Similarly, the flow of a multiplication instruction that gets +;; decomposed into three micro-ops. Dual issue cannot occur except on +;; the cycle upon which the third micro-op is issued. +(define_reservation "cortex_a8_multiply_3" + "cortex_a8_alu0+cortex_a8_alu1,\ + cortex_a8_alu0+cortex_a8_alu1,\ + cortex_a8_alu0") + +;; The model given here assumes that all instructions are unconditional. + +;; Data processing instructions, but not move instructions. + +;; We include CLZ with these since it has the same execution pattern +;; (source read in E2 and destination available at the end of that cycle). +(define_insn_reservation "cortex_a8_alu" 2 + (and (eq_attr "tune" "cortexa8") + (ior (and (and (eq_attr "type" "alu") + (eq_attr "neon_type" "none")) + (not (eq_attr "insn" "mov,mvn"))) + (eq_attr "insn" "clz"))) + "cortex_a8_default") + +(define_insn_reservation "cortex_a8_alu_shift" 2 + (and (eq_attr "tune" "cortexa8") + (and (eq_attr "type" "alu_shift") + (not (eq_attr "insn" "mov,mvn")))) + "cortex_a8_default") + +(define_insn_reservation "cortex_a8_alu_shift_reg" 2 + (and (eq_attr "tune" "cortexa8") + (and (eq_attr "type" "alu_shift_reg") + (not (eq_attr "insn" "mov,mvn")))) + "cortex_a8_default") + +;; Move instructions. + +(define_insn_reservation "cortex_a8_mov" 1 + (and (eq_attr "tune" "cortexa8") + (and (eq_attr "type" "alu,alu_shift,alu_shift_reg") + (eq_attr "insn" "mov,mvn"))) + "cortex_a8_default") + +;; Exceptions to the default latencies for data processing instructions. + +;; A move followed by an ALU instruction with no early dep. +;; (Such a pair can be issued in parallel, hence latency zero.) +(define_bypass 0 "cortex_a8_mov" "cortex_a8_alu") +(define_bypass 0 "cortex_a8_mov" "cortex_a8_alu_shift" + "arm_no_early_alu_shift_dep") +(define_bypass 0 "cortex_a8_mov" "cortex_a8_alu_shift_reg" + "arm_no_early_alu_shift_value_dep") + +;; An ALU instruction followed by an ALU instruction with no early dep. +(define_bypass 1 "cortex_a8_alu,cortex_a8_alu_shift,cortex_a8_alu_shift_reg" + "cortex_a8_alu") +(define_bypass 1 "cortex_a8_alu,cortex_a8_alu_shift,cortex_a8_alu_shift_reg" + "cortex_a8_alu_shift" + "arm_no_early_alu_shift_dep") +(define_bypass 1 "cortex_a8_alu,cortex_a8_alu_shift,cortex_a8_alu_shift_reg" + "cortex_a8_alu_shift_reg" + "arm_no_early_alu_shift_value_dep") + +;; Multiplication instructions. These are categorized according to their +;; reservation behavior and the need below to distinguish certain +;; varieties for bypasses. Results are available at the E5 stage +;; (but some of these are multi-cycle instructions which explains the +;; latencies below). + +(define_insn_reservation "cortex_a8_mul" 6 + (and (eq_attr "tune" "cortexa8") + (eq_attr "insn" "mul,smulxy,smmul")) + "cortex_a8_multiply_2") + +(define_insn_reservation "cortex_a8_mla" 6 + (and (eq_attr "tune" "cortexa8") + (eq_attr "insn" "mla,smlaxy,smlawy,smmla,smlad,smlsd")) + "cortex_a8_multiply_2") + +(define_insn_reservation "cortex_a8_mull" 7 + (and (eq_attr "tune" "cortexa8") + (eq_attr "insn" "smull,umull,smlal,umlal,umaal,smlalxy")) + "cortex_a8_multiply_3") + +(define_insn_reservation "cortex_a8_smulwy" 5 + (and (eq_attr "tune" "cortexa8") + (eq_attr "insn" "smulwy,smuad,smusd")) + "cortex_a8_multiply") + +;; smlald and smlsld are multiply-accumulate instructions but do not +;; received bypassed data from other multiplication results; thus, they +;; cannot go in cortex_a8_mla above. (See below for bypass details.) +(define_insn_reservation "cortex_a8_smlald" 6 + (and (eq_attr "tune" "cortexa8") + (eq_attr "insn" "smlald,smlsld")) + "cortex_a8_multiply_2") + +;; A multiply with a single-register result or an MLA, followed by an +;; MLA with an accumulator dependency, has its result forwarded so two +;; such instructions can issue back-to-back. +(define_bypass 1 "cortex_a8_mul,cortex_a8_mla,cortex_a8_smulwy" + "cortex_a8_mla" + "arm_mac_accumulator_is_mul_result") + +;; A multiply followed by an ALU instruction needing the multiply +;; result only at E2 has lower latency than one needing it at E1. +(define_bypass 4 "cortex_a8_mul,cortex_a8_mla,cortex_a8_mull,\ + cortex_a8_smulwy,cortex_a8_smlald" + "cortex_a8_alu") +(define_bypass 4 "cortex_a8_mul,cortex_a8_mla,cortex_a8_mull,\ + cortex_a8_smulwy,cortex_a8_smlald" + "cortex_a8_alu_shift" + "arm_no_early_alu_shift_dep") +(define_bypass 4 "cortex_a8_mul,cortex_a8_mla,cortex_a8_mull,\ + cortex_a8_smulwy,cortex_a8_smlald" + "cortex_a8_alu_shift_reg" + "arm_no_early_alu_shift_value_dep") + +;; Load instructions. +;; The presence of any register writeback is ignored here. + +;; A load result has latency 3 unless the dependent instruction has +;; no early dep, in which case it is only latency two. +;; We assume 64-bit alignment for doubleword loads. +(define_insn_reservation "cortex_a8_load1_2" 3 + (and (eq_attr "tune" "cortexa8") + (eq_attr "type" "load1,load2,load_byte")) + "cortex_a8_load_store_1") + +(define_bypass 2 "cortex_a8_load1_2" + "cortex_a8_alu") +(define_bypass 2 "cortex_a8_load1_2" + "cortex_a8_alu_shift" + "arm_no_early_alu_shift_dep") +(define_bypass 2 "cortex_a8_load1_2" + "cortex_a8_alu_shift_reg" + "arm_no_early_alu_shift_value_dep") + +;; We do not currently model the fact that loads with scaled register +;; offsets that are not LSL #2 have an extra cycle latency (they issue +;; as two micro-ops). + +;; A load multiple of three registers is usually issued as two micro-ops. +;; The first register will be available at E3 of the first iteration, +;; the second at E3 of the second iteration, and the third at E4 of +;; the second iteration. A load multiple of four registers is usually +;; issued as two micro-ops. +(define_insn_reservation "cortex_a8_load3_4" 5 + (and (eq_attr "tune" "cortexa8") + (eq_attr "type" "load3,load4")) + "cortex_a8_load_store_2") + +(define_bypass 4 "cortex_a8_load3_4" + "cortex_a8_alu") +(define_bypass 4 "cortex_a8_load3_4" + "cortex_a8_alu_shift" + "arm_no_early_alu_shift_dep") +(define_bypass 4 "cortex_a8_load3_4" + "cortex_a8_alu_shift_reg" + "arm_no_early_alu_shift_value_dep") + +;; Store instructions. +;; Writeback is again ignored. + +(define_insn_reservation "cortex_a8_store1_2" 0 + (and (eq_attr "tune" "cortexa8") + (eq_attr "type" "store1,store2")) + "cortex_a8_load_store_1") + +(define_insn_reservation "cortex_a8_store3_4" 0 + (and (eq_attr "tune" "cortexa8") + (eq_attr "type" "store3,store4")) + "cortex_a8_load_store_2") + +;; An ALU instruction acting as a producer for a store instruction +;; that only uses the result as the value to be stored (as opposed to +;; using it to calculate the address) has latency zero; the store +;; reads the value to be stored at the start of E3 and the ALU insn +;; writes it at the end of E2. Move instructions actually produce the +;; result at the end of E1, but since we don't have delay slots, the +;; scheduling behavior will be the same. +(define_bypass 0 "cortex_a8_alu,cortex_a8_alu_shift,\ + cortex_a8_alu_shift_reg,cortex_a8_mov" + "cortex_a8_store1_2,cortex_a8_store3_4" + "arm_no_early_store_addr_dep") + +;; Branch instructions + +(define_insn_reservation "cortex_a8_branch" 0 + (and (eq_attr "tune" "cortexa8") + (eq_attr "type" "branch")) + "cortex_a8_branch") + +;; Call latencies are not predictable. A semi-arbitrary very large +;; number is used as "positive infinity" so that everything should be +;; finished by the time of return. +(define_insn_reservation "cortex_a8_call" 32 + (and (eq_attr "tune" "cortexa8") + (eq_attr "type" "call")) + "cortex_a8_issue_branch") + +;; NEON (including VFP) instructions. + +(include "cortex-a8-neon.md") + diff --git a/gcc/config/arm/cortex-a9-neon.md b/gcc/config/arm/cortex-a9-neon.md new file mode 100644 index 000000000..2e8ec9b14 --- /dev/null +++ b/gcc/config/arm/cortex-a9-neon.md @@ -0,0 +1,1237 @@ +;; ARM Cortex-A9 pipeline description +;; Copyright (C) 2010 Free Software Foundation, Inc. +;; +;; Neon pipeline description contributed by ARM Ltd. +;; +;; This file is part of GCC. +;; +;; GCC is free software; you can redistribute it and/or modify it +;; under the terms of the GNU General Public License as published by +;; the Free Software Foundation; either version 3, or (at your option) +;; any later version. +;; +;; GCC is distributed in the hope that it will be useful, but +;; WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;; General Public License for more details. +;; +;; You should have received a copy of the GNU General Public License +;; along with GCC; see the file COPYING3. If not see +;; . + + +(define_automaton "cortex_a9_neon") + +;; Only one instruction can be issued per cycle. +(define_cpu_unit "cortex_a9_neon_issue_perm" "cortex_a9_neon") + +;; Only one data-processing instruction can be issued per cycle. +(define_cpu_unit "cortex_a9_neon_issue_dp" "cortex_a9_neon") + +;; We need a special mutual exclusion (to be used in addition to +;; cortex_a9_neon_issue_dp) for the case when an instruction such as +;; vmla.f is forwarded from E5 of the floating-point multiply pipeline to +;; E2 of the floating-point add pipeline. On the cycle previous to that +;; forward we must prevent issue of any instruction to the floating-point +;; add pipeline, but still allow issue of a data-processing instruction +;; to any of the other pipelines. +(define_cpu_unit "cortex_a9_neon_issue_fadd" "cortex_a9_neon") +(define_cpu_unit "cortex_a9_neon_mcr" "cortex_a9_neon") + + +;; Patterns of reservation. +;; We model the NEON issue units as running in parallel with the core ones. +;; We assume that multi-cycle NEON instructions get decomposed into +;; micro-ops as they are issued into the NEON pipeline. + +(define_reservation "cortex_a9_neon_dp" + "ca9_issue_vfp_neon + cortex_a9_neon_issue_dp") +(define_reservation "cortex_a9_neon_dp_2" + "ca9_issue_vfp_neon + cortex_a9_neon_issue_dp,\ + cortex_a9_neon_issue_dp") +(define_reservation "cortex_a9_neon_dp_4" + "ca9_issue_vfp_neon + cortex_a9_neon_issue_dp,\ + cortex_a9_neon_issue_dp + cortex_a9_neon_issue_perm,\ + cortex_a9_neon_issue_dp + cortex_a9_neon_issue_perm,\ + cortex_a9_neon_issue_dp") + +(define_reservation "cortex_a9_neon_fadd" + "ca9_issue_vfp_neon + cortex_a9_neon_issue_dp + \ + cortex_a9_neon_issue_fadd") +(define_reservation "cortex_a9_neon_fadd_2" + "ca9_issue_vfp_neon + cortex_a9_neon_issue_dp,\ + cortex_a9_neon_issue_fadd,\ + cortex_a9_neon_issue_dp") + +(define_reservation "cortex_a9_neon_perm" + "ca9_issue_vfp_neon+cortex_a9_neon_issue_perm") +(define_reservation "cortex_a9_neon_perm_2" + "ca9_issue_vfp_neon+cortex_a9_neon_issue_perm, \ + cortex_a9_neon_issue_perm") +(define_reservation "cortex_a9_neon_perm_3" + "ca9_issue_vfp_neon+cortex_a9_neon_issue_perm,\ + cortex_a9_neon_issue_dp+cortex_a9_neon_issue_perm,\ + cortex_a9_neon_issue_perm") + +(define_reservation "cortex_a9_neon_ls" + "ca9_issue_vfp_neon+cortex_a9_neon_issue_perm+cortex_a9_ls") +(define_reservation "cortex_a9_neon_ls_2" + "ca9_issue_vfp_neon+cortex_a9_neon_issue_perm,\ + cortex_a9_neon_issue_perm") +(define_reservation "cortex_a9_neon_ls_3" + "ca9_issue_vfp_neon+cortex_a9_neon_issue_perm,\ + cortex_a9_neon_issue_dp+cortex_a9_neon_issue_perm,\ + cortex_a9_neon_issue_perm") +(define_reservation "cortex_a9_neon_ls_4" + "ca9_issue_vfp_neon+cortex_a9_neon_issue_perm,\ + cortex_a9_neon_issue_dp+cortex_a9_neon_issue_perm,\ + cortex_a9_neon_issue_dp+cortex_a9_neon_issue_perm,\ + cortex_a9_neon_issue_perm") +(define_reservation "cortex_a9_neon_ls_5" + "ca9_issue_vfp_neon + cortex_a9_neon_issue_perm,\ + cortex_a9_neon_issue_dp+cortex_a9_neon_issue_perm,\ + cortex_a9_neon_issue_dp+cortex_a9_neon_issue_perm,\ + cortex_a9_neon_issue_dp+cortex_a9_neon_issue_perm,\ + cortex_a9_neon_issue_perm") + +(define_reservation "cortex_a9_neon_fmul_then_fadd" + "ca9_issue_vfp_neon + cortex_a9_neon_issue_dp,\ + nothing*3,\ + cortex_a9_neon_issue_fadd") +(define_reservation "cortex_a9_neon_fmul_then_fadd_2" + "ca9_issue_vfp_neon + cortex_a9_neon_issue_dp,\ + cortex_a9_neon_issue_dp,\ + nothing*2,\ + cortex_a9_neon_issue_fadd,\ + cortex_a9_neon_issue_fadd") + + +;; NEON -> core transfers. +(define_insn_reservation "ca9_neon_mrc" 1 + (and (eq_attr "tune" "cortexa9") + (eq_attr "neon_type" "neon_mrc")) + "ca9_issue_vfp_neon + cortex_a9_neon_mcr") + +(define_insn_reservation "ca9_neon_mrrc" 1 + (and (eq_attr "tune" "cortexa9") + (eq_attr "neon_type" "neon_mrrc")) + "ca9_issue_vfp_neon + cortex_a9_neon_mcr") + +;; The remainder of this file is auto-generated by neon-schedgen. + +;; Instructions using this reservation read their source operands at N2, and +;; produce a result at N3. +(define_insn_reservation "cortex_a9_neon_int_1" 3 + (and (eq_attr "tune" "cortexa9") + (eq_attr "neon_type" "neon_int_1")) + "cortex_a9_neon_dp") + +;; Instructions using this reservation read their (D|Q)m operands at N1, +;; their (D|Q)n operands at N2, and produce a result at N3. +(define_insn_reservation "cortex_a9_neon_int_2" 3 + (and (eq_attr "tune" "cortexa9") + (eq_attr "neon_type" "neon_int_2")) + "cortex_a9_neon_dp") + +;; Instructions using this reservation read their source operands at N1, and +;; produce a result at N3. +(define_insn_reservation "cortex_a9_neon_int_3" 3 + (and (eq_attr "tune" "cortexa9") + (eq_attr "neon_type" "neon_int_3")) + "cortex_a9_neon_dp") + +;; Instructions using this reservation read their source operands at N2, and +;; produce a result at N4. +(define_insn_reservation "cortex_a9_neon_int_4" 4 + (and (eq_attr "tune" "cortexa9") + (eq_attr "neon_type" "neon_int_4")) + "cortex_a9_neon_dp") + +;; Instructions using this reservation read their (D|Q)m operands at N1, +;; their (D|Q)n operands at N2, and produce a result at N4. +(define_insn_reservation "cortex_a9_neon_int_5" 4 + (and (eq_attr "tune" "cortexa9") + (eq_attr "neon_type" "neon_int_5")) + "cortex_a9_neon_dp") + +;; Instructions using this reservation read their source operands at N1, and +;; produce a result at N4. +(define_insn_reservation "cortex_a9_neon_vqneg_vqabs" 4 + (and (eq_attr "tune" "cortexa9") + (eq_attr "neon_type" "neon_vqneg_vqabs")) + "cortex_a9_neon_dp") + +;; Instructions using this reservation produce a result at N3. +(define_insn_reservation "cortex_a9_neon_vmov" 3 + (and (eq_attr "tune" "cortexa9") + (eq_attr "neon_type" "neon_vmov")) + "cortex_a9_neon_dp") + +;; Instructions using this reservation read their (D|Q)n operands at N2, +;; their (D|Q)m operands at N1, their (D|Q)d operands at N3, and +;; produce a result at N6. +(define_insn_reservation "cortex_a9_neon_vaba" 6 + (and (eq_attr "tune" "cortexa9") + (eq_attr "neon_type" "neon_vaba")) + "cortex_a9_neon_dp") + +;; Instructions using this reservation read their (D|Q)n operands at N2, +;; their (D|Q)m operands at N1, their (D|Q)d operands at N3, and +;; produce a result at N6 on cycle 2. +(define_insn_reservation "cortex_a9_neon_vaba_qqq" 7 + (and (eq_attr "tune" "cortexa9") + (eq_attr "neon_type" "neon_vaba_qqq")) + "cortex_a9_neon_dp_2") + +;; Instructions using this reservation read their (D|Q)m operands at N1, +;; their (D|Q)d operands at N3, and produce a result at N6. +(define_insn_reservation "cortex_a9_neon_vsma" 6 + (and (eq_attr "tune" "cortexa9") + (eq_attr "neon_type" "neon_vsma")) + "cortex_a9_neon_dp") + +;; Instructions using this reservation read their source operands at N2, and +;; produce a result at N6. +(define_insn_reservation "cortex_a9_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long" 6 + (and (eq_attr "tune" "cortexa9") + (eq_attr "neon_type" "neon_mul_ddd_8_16_qdd_16_8_long_32_16_long")) + "cortex_a9_neon_dp") + +;; Instructions using this reservation read their source operands at N2, and +;; produce a result at N6 on cycle 2. +(define_insn_reservation "cortex_a9_neon_mul_qqq_8_16_32_ddd_32" 7 + (and (eq_attr "tune" "cortexa9") + (eq_attr "neon_type" "neon_mul_qqq_8_16_32_ddd_32")) + "cortex_a9_neon_dp_2") + +;; Instructions using this reservation read their (D|Q)n operands at N2, +;; their (D|Q)m operands at N1, and produce a result at N6 on cycle 2. +(define_insn_reservation "cortex_a9_neon_mul_qdd_64_32_long_qqd_16_ddd_32_scalar_64_32_long_scalar" 7 + (and (eq_attr "tune" "cortexa9") + (eq_attr "neon_type" "neon_mul_qdd_64_32_long_qqd_16_ddd_32_scalar_64_32_long_scalar")) + "cortex_a9_neon_dp_2") + +;; Instructions using this reservation read their (D|Q)n operands at N2, +;; their (D|Q)m operands at N2, their (D|Q)d operands at N3, and +;; produce a result at N6. +(define_insn_reservation "cortex_a9_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long" 6 + (and (eq_attr "tune" "cortexa9") + (eq_attr "neon_type" "neon_mla_ddd_8_16_qdd_16_8_long_32_16_long")) + "cortex_a9_neon_dp") + +;; Instructions using this reservation read their (D|Q)n operands at N2, +;; their (D|Q)m operands at N2, their (D|Q)d operands at N3, and +;; produce a result at N6 on cycle 2. +(define_insn_reservation "cortex_a9_neon_mla_qqq_8_16" 7 + (and (eq_attr "tune" "cortexa9") + (eq_attr "neon_type" "neon_mla_qqq_8_16")) + "cortex_a9_neon_dp_2") + +;; Instructions using this reservation read their (D|Q)n operands at N2, +;; their (D|Q)m operands at N1, their (D|Q)d operands at N3, and +;; produce a result at N6 on cycle 2. +(define_insn_reservation "cortex_a9_neon_mla_ddd_32_qqd_16_ddd_32_scalar_qdd_64_32_long_scalar_qdd_64_32_long" 7 + (and (eq_attr "tune" "cortexa9") + (eq_attr "neon_type" "neon_mla_ddd_32_qqd_16_ddd_32_scalar_qdd_64_32_long_scalar_qdd_64_32_long")) + "cortex_a9_neon_dp_2") + +;; Instructions using this reservation read their (D|Q)n operands at N2, +;; their (D|Q)m operands at N1, their (D|Q)d operands at N3, and +;; produce a result at N6 on cycle 4. +(define_insn_reservation "cortex_a9_neon_mla_qqq_32_qqd_32_scalar" 9 + (and (eq_attr "tune" "cortexa9") + (eq_attr "neon_type" "neon_mla_qqq_32_qqd_32_scalar")) + "cortex_a9_neon_dp_4") + +;; Instructions using this reservation read their (D|Q)n operands at N2, +;; their (D|Q)m operands at N1, and produce a result at N6. +(define_insn_reservation "cortex_a9_neon_mul_ddd_16_scalar_32_16_long_scalar" 6 + (and (eq_attr "tune" "cortexa9") + (eq_attr "neon_type" "neon_mul_ddd_16_scalar_32_16_long_scalar")) + "cortex_a9_neon_dp") + +;; Instructions using this reservation read their (D|Q)n operands at N2, +;; their (D|Q)m operands at N1, and produce a result at N6 on cycle 4. +(define_insn_reservation "cortex_a9_neon_mul_qqd_32_scalar" 9 + (and (eq_attr "tune" "cortexa9") + (eq_attr "neon_type" "neon_mul_qqd_32_scalar")) + "cortex_a9_neon_dp_4") + +;; Instructions using this reservation read their (D|Q)n operands at N2, +;; their (D|Q)m operands at N1, their (D|Q)d operands at N3, and +;; produce a result at N6. +(define_insn_reservation "cortex_a9_neon_mla_ddd_16_scalar_qdd_32_16_long_scalar" 6 + (and (eq_attr "tune" "cortexa9") + (eq_attr "neon_type" "neon_mla_ddd_16_scalar_qdd_32_16_long_scalar")) + "cortex_a9_neon_dp") + +;; Instructions using this reservation read their source operands at N1, and +;; produce a result at N3. +(define_insn_reservation "cortex_a9_neon_shift_1" 3 + (and (eq_attr "tune" "cortexa9") + (eq_attr "neon_type" "neon_shift_1")) + "cortex_a9_neon_dp") + +;; Instructions using this reservation read their source operands at N1, and +;; produce a result at N4. +(define_insn_reservation "cortex_a9_neon_shift_2" 4 + (and (eq_attr "tune" "cortexa9") + (eq_attr "neon_type" "neon_shift_2")) + "cortex_a9_neon_dp") + +;; Instructions using this reservation read their source operands at N1, and +;; produce a result at N3 on cycle 2. +(define_insn_reservation "cortex_a9_neon_shift_3" 4 + (and (eq_attr "tune" "cortexa9") + (eq_attr "neon_type" "neon_shift_3")) + "cortex_a9_neon_dp_2") + +;; Instructions using this reservation read their source operands at N1, and +;; produce a result at N1. +(define_insn_reservation "cortex_a9_neon_vshl_ddd" 1 + (and (eq_attr "tune" "cortexa9") + (eq_attr "neon_type" "neon_vshl_ddd")) + "cortex_a9_neon_dp") + +;; Instructions using this reservation read their source operands at N1, and +;; produce a result at N4 on cycle 2. +(define_insn_reservation "cortex_a9_neon_vqshl_vrshl_vqrshl_qqq" 5 + (and (eq_attr "tune" "cortexa9") + (eq_attr "neon_type" "neon_vqshl_vrshl_vqrshl_qqq")) + "cortex_a9_neon_dp_2") + +;; Instructions using this reservation read their (D|Q)m operands at N1, +;; their (D|Q)d operands at N3, and produce a result at N6. +(define_insn_reservation "cortex_a9_neon_vsra_vrsra" 6 + (and (eq_attr "tune" "cortexa9") + (eq_attr "neon_type" "neon_vsra_vrsra")) + "cortex_a9_neon_dp") + +;; Instructions using this reservation read their source operands at N2, and +;; produce a result at N5. +(define_insn_reservation "cortex_a9_neon_fp_vadd_ddd_vabs_dd" 5 + (and (eq_attr "tune" "cortexa9") + (eq_attr "neon_type" "neon_fp_vadd_ddd_vabs_dd")) + "cortex_a9_neon_fadd") + +;; Instructions using this reservation read their source operands at N2, and +;; produce a result at N5 on cycle 2. +(define_insn_reservation "cortex_a9_neon_fp_vadd_qqq_vabs_qq" 6 + (and (eq_attr "tune" "cortexa9") + (eq_attr "neon_type" "neon_fp_vadd_qqq_vabs_qq")) + "cortex_a9_neon_fadd_2") + +;; Instructions using this reservation read their source operands at N1, and +;; produce a result at N5. +(define_insn_reservation "cortex_a9_neon_fp_vsum" 5 + (and (eq_attr "tune" "cortexa9") + (eq_attr "neon_type" "neon_fp_vsum")) + "cortex_a9_neon_fadd") + +;; Instructions using this reservation read their (D|Q)n operands at N2, +;; their (D|Q)m operands at N1, and produce a result at N5. +(define_insn_reservation "cortex_a9_neon_fp_vmul_ddd" 5 + (and (eq_attr "tune" "cortexa9") + (eq_attr "neon_type" "neon_fp_vmul_ddd")) + "cortex_a9_neon_dp") + +;; Instructions using this reservation read their (D|Q)n operands at N2, +;; their (D|Q)m operands at N1, and produce a result at N5 on cycle 2. +(define_insn_reservation "cortex_a9_neon_fp_vmul_qqd" 6 + (and (eq_attr "tune" "cortexa9") + (eq_attr "neon_type" "neon_fp_vmul_qqd")) + "cortex_a9_neon_dp_2") + +;; Instructions using this reservation read their (D|Q)n operands at N2, +;; their (D|Q)m operands at N2, their (D|Q)d operands at N3, and +;; produce a result at N9. +(define_insn_reservation "cortex_a9_neon_fp_vmla_ddd" 9 + (and (eq_attr "tune" "cortexa9") + (eq_attr "neon_type" "neon_fp_vmla_ddd")) + "cortex_a9_neon_fmul_then_fadd") + +;; Instructions using this reservation read their (D|Q)n operands at N2, +;; their (D|Q)m operands at N2, their (D|Q)d operands at N3, and +;; produce a result at N9 on cycle 2. +(define_insn_reservation "cortex_a9_neon_fp_vmla_qqq" 10 + (and (eq_attr "tune" "cortexa9") + (eq_attr "neon_type" "neon_fp_vmla_qqq")) + "cortex_a9_neon_fmul_then_fadd_2") + +;; Instructions using this reservation read their (D|Q)n operands at N2, +;; their (D|Q)m operands at N1, their (D|Q)d operands at N3, and +;; produce a result at N9. +(define_insn_reservation "cortex_a9_neon_fp_vmla_ddd_scalar" 9 + (and (eq_attr "tune" "cortexa9") + (eq_attr "neon_type" "neon_fp_vmla_ddd_scalar")) + "cortex_a9_neon_fmul_then_fadd") + +;; Instructions using this reservation read their (D|Q)n operands at N2, +;; their (D|Q)m operands at N1, their (D|Q)d operands at N3, and +;; produce a result at N9 on cycle 2. +(define_insn_reservation "cortex_a9_neon_fp_vmla_qqq_scalar" 10 + (and (eq_attr "tune" "cortexa9") + (eq_attr "neon_type" "neon_fp_vmla_qqq_scalar")) + "cortex_a9_neon_fmul_then_fadd_2") + +;; Instructions using this reservation read their source operands at N2, and +;; produce a result at N9. +(define_insn_reservation "cortex_a9_neon_fp_vrecps_vrsqrts_ddd" 9 + (and (eq_attr "tune" "cortexa9") + (eq_attr "neon_type" "neon_fp_vrecps_vrsqrts_ddd")) + "cortex_a9_neon_fmul_then_fadd") + +;; Instructions using this reservation read their source operands at N2, and +;; produce a result at N9 on cycle 2. +(define_insn_reservation "cortex_a9_neon_fp_vrecps_vrsqrts_qqq" 10 + (and (eq_attr "tune" "cortexa9") + (eq_attr "neon_type" "neon_fp_vrecps_vrsqrts_qqq")) + "cortex_a9_neon_fmul_then_fadd_2") + +;; Instructions using this reservation read their source operands at N1, and +;; produce a result at N2. +(define_insn_reservation "cortex_a9_neon_bp_simple" 2 + (and (eq_attr "tune" "cortexa9") + (eq_attr "neon_type" "neon_bp_simple")) + "cortex_a9_neon_perm") + +;; Instructions using this reservation read their source operands at N1, and +;; produce a result at N2 on cycle 2. +(define_insn_reservation "cortex_a9_neon_bp_2cycle" 3 + (and (eq_attr "tune" "cortexa9") + (eq_attr "neon_type" "neon_bp_2cycle")) + "cortex_a9_neon_perm_2") + +;; Instructions using this reservation read their source operands at N1, and +;; produce a result at N2 on cycle 3. +(define_insn_reservation "cortex_a9_neon_bp_3cycle" 4 + (and (eq_attr "tune" "cortexa9") + (eq_attr "neon_type" "neon_bp_3cycle")) + "cortex_a9_neon_perm_3") + +;; Instructions using this reservation produce a result at N1. +(define_insn_reservation "cortex_a9_neon_ldr" 1 + (and (eq_attr "tune" "cortexa9") + (eq_attr "neon_type" "neon_ldr")) + "cortex_a9_neon_ls") + +;; Instructions using this reservation read their source operands at N1. +(define_insn_reservation "cortex_a9_neon_str" 0 + (and (eq_attr "tune" "cortexa9") + (eq_attr "neon_type" "neon_str")) + "cortex_a9_neon_ls") + +;; Instructions using this reservation produce a result at N1 on cycle 2. +(define_insn_reservation "cortex_a9_neon_vld1_1_2_regs" 2 + (and (eq_attr "tune" "cortexa9") + (eq_attr "neon_type" "neon_vld1_1_2_regs")) + "cortex_a9_neon_ls_2") + +;; Instructions using this reservation produce a result at N1 on cycle 3. +(define_insn_reservation "cortex_a9_neon_vld1_3_4_regs" 3 + (and (eq_attr "tune" "cortexa9") + (eq_attr "neon_type" "neon_vld1_3_4_regs")) + "cortex_a9_neon_ls_3") + +;; Instructions using this reservation produce a result at N2 on cycle 2. +(define_insn_reservation "cortex_a9_neon_vld2_2_regs_vld1_vld2_all_lanes" 3 + (and (eq_attr "tune" "cortexa9") + (eq_attr "neon_type" "neon_vld2_2_regs_vld1_vld2_all_lanes")) + "cortex_a9_neon_ls_2") + +;; Instructions using this reservation produce a result at N2 on cycle 3. +(define_insn_reservation "cortex_a9_neon_vld2_4_regs" 4 + (and (eq_attr "tune" "cortexa9") + (eq_attr "neon_type" "neon_vld2_4_regs")) + "cortex_a9_neon_ls_3") + +;; Instructions using this reservation produce a result at N2 on cycle 4. +(define_insn_reservation "cortex_a9_neon_vld3_vld4" 5 + (and (eq_attr "tune" "cortexa9") + (eq_attr "neon_type" "neon_vld3_vld4")) + "cortex_a9_neon_ls_4") + +;; Instructions using this reservation read their source operands at N1. +(define_insn_reservation "cortex_a9_neon_vst1_1_2_regs_vst2_2_regs" 0 + (and (eq_attr "tune" "cortexa9") + (eq_attr "neon_type" "neon_vst1_1_2_regs_vst2_2_regs")) + "cortex_a9_neon_ls_2") + +;; Instructions using this reservation read their source operands at N1. +(define_insn_reservation "cortex_a9_neon_vst1_3_4_regs" 0 + (and (eq_attr "tune" "cortexa9") + (eq_attr "neon_type" "neon_vst1_3_4_regs")) + "cortex_a9_neon_ls_3") + +;; Instructions using this reservation read their source operands at N1. +(define_insn_reservation "cortex_a9_neon_vst2_4_regs_vst3_vst4" 0 + (and (eq_attr "tune" "cortexa9") + (eq_attr "neon_type" "neon_vst2_4_regs_vst3_vst4")) + "cortex_a9_neon_ls_4") + +;; Instructions using this reservation read their source operands at N1. +(define_insn_reservation "cortex_a9_neon_vst3_vst4" 0 + (and (eq_attr "tune" "cortexa9") + (eq_attr "neon_type" "neon_vst3_vst4")) + "cortex_a9_neon_ls_4") + +;; Instructions using this reservation read their source operands at N1, and +;; produce a result at N2 on cycle 3. +(define_insn_reservation "cortex_a9_neon_vld1_vld2_lane" 4 + (and (eq_attr "tune" "cortexa9") + (eq_attr "neon_type" "neon_vld1_vld2_lane")) + "cortex_a9_neon_ls_3") + +;; Instructions using this reservation read their source operands at N1, and +;; produce a result at N2 on cycle 5. +(define_insn_reservation "cortex_a9_neon_vld3_vld4_lane" 6 + (and (eq_attr "tune" "cortexa9") + (eq_attr "neon_type" "neon_vld3_vld4_lane")) + "cortex_a9_neon_ls_5") + +;; Instructions using this reservation read their source operands at N1. +(define_insn_reservation "cortex_a9_neon_vst1_vst2_lane" 0 + (and (eq_attr "tune" "cortexa9") + (eq_attr "neon_type" "neon_vst1_vst2_lane")) + "cortex_a9_neon_ls_2") + +;; Instructions using this reservation read their source operands at N1. +(define_insn_reservation "cortex_a9_neon_vst3_vst4_lane" 0 + (and (eq_attr "tune" "cortexa9") + (eq_attr "neon_type" "neon_vst3_vst4_lane")) + "cortex_a9_neon_ls_3") + +;; Instructions using this reservation produce a result at N2 on cycle 2. +(define_insn_reservation "cortex_a9_neon_vld3_vld4_all_lanes" 3 + (and (eq_attr "tune" "cortexa9") + (eq_attr "neon_type" "neon_vld3_vld4_all_lanes")) + "cortex_a9_neon_ls_3") + +;; Instructions using this reservation produce a result at N2. +(define_insn_reservation "cortex_a9_neon_mcr" 2 + (and (eq_attr "tune" "cortexa9") + (eq_attr "neon_type" "neon_mcr")) + "cortex_a9_neon_perm") + +;; Instructions using this reservation produce a result at N2. +(define_insn_reservation "cortex_a9_neon_mcr_2_mcrr" 2 + (and (eq_attr "tune" "cortexa9") + (eq_attr "neon_type" "neon_mcr_2_mcrr")) + "cortex_a9_neon_perm_2") + +;; Exceptions to the default latencies. + +(define_bypass 1 "cortex_a9_neon_mcr_2_mcrr" + "cortex_a9_neon_int_1,\ + cortex_a9_neon_int_4,\ + cortex_a9_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a9_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mla_qqq_8_16,\ + cortex_a9_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a9_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a9_neon_fp_vmla_ddd,\ + cortex_a9_neon_fp_vmla_qqq,\ + cortex_a9_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a9_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 1 "cortex_a9_neon_mcr" + "cortex_a9_neon_int_1,\ + cortex_a9_neon_int_4,\ + cortex_a9_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a9_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mla_qqq_8_16,\ + cortex_a9_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a9_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a9_neon_fp_vmla_ddd,\ + cortex_a9_neon_fp_vmla_qqq,\ + cortex_a9_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a9_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 2 "cortex_a9_neon_vld3_vld4_all_lanes" + "cortex_a9_neon_int_1,\ + cortex_a9_neon_int_4,\ + cortex_a9_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a9_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mla_qqq_8_16,\ + cortex_a9_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a9_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a9_neon_fp_vmla_ddd,\ + cortex_a9_neon_fp_vmla_qqq,\ + cortex_a9_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a9_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 5 "cortex_a9_neon_vld3_vld4_lane" + "cortex_a9_neon_int_1,\ + cortex_a9_neon_int_4,\ + cortex_a9_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a9_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mla_qqq_8_16,\ + cortex_a9_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a9_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a9_neon_fp_vmla_ddd,\ + cortex_a9_neon_fp_vmla_qqq,\ + cortex_a9_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a9_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 3 "cortex_a9_neon_vld1_vld2_lane" + "cortex_a9_neon_int_1,\ + cortex_a9_neon_int_4,\ + cortex_a9_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a9_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mla_qqq_8_16,\ + cortex_a9_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a9_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a9_neon_fp_vmla_ddd,\ + cortex_a9_neon_fp_vmla_qqq,\ + cortex_a9_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a9_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 4 "cortex_a9_neon_vld3_vld4" + "cortex_a9_neon_int_1,\ + cortex_a9_neon_int_4,\ + cortex_a9_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a9_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mla_qqq_8_16,\ + cortex_a9_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a9_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a9_neon_fp_vmla_ddd,\ + cortex_a9_neon_fp_vmla_qqq,\ + cortex_a9_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a9_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 3 "cortex_a9_neon_vld2_4_regs" + "cortex_a9_neon_int_1,\ + cortex_a9_neon_int_4,\ + cortex_a9_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a9_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mla_qqq_8_16,\ + cortex_a9_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a9_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a9_neon_fp_vmla_ddd,\ + cortex_a9_neon_fp_vmla_qqq,\ + cortex_a9_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a9_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 2 "cortex_a9_neon_vld2_2_regs_vld1_vld2_all_lanes" + "cortex_a9_neon_int_1,\ + cortex_a9_neon_int_4,\ + cortex_a9_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a9_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mla_qqq_8_16,\ + cortex_a9_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a9_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a9_neon_fp_vmla_ddd,\ + cortex_a9_neon_fp_vmla_qqq,\ + cortex_a9_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a9_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 2 "cortex_a9_neon_vld1_3_4_regs" + "cortex_a9_neon_int_1,\ + cortex_a9_neon_int_4,\ + cortex_a9_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a9_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mla_qqq_8_16,\ + cortex_a9_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a9_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a9_neon_fp_vmla_ddd,\ + cortex_a9_neon_fp_vmla_qqq,\ + cortex_a9_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a9_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 1 "cortex_a9_neon_vld1_1_2_regs" + "cortex_a9_neon_int_1,\ + cortex_a9_neon_int_4,\ + cortex_a9_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a9_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mla_qqq_8_16,\ + cortex_a9_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a9_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a9_neon_fp_vmla_ddd,\ + cortex_a9_neon_fp_vmla_qqq,\ + cortex_a9_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a9_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 0 "cortex_a9_neon_ldr" + "cortex_a9_neon_int_1,\ + cortex_a9_neon_int_4,\ + cortex_a9_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a9_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mla_qqq_8_16,\ + cortex_a9_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a9_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a9_neon_fp_vmla_ddd,\ + cortex_a9_neon_fp_vmla_qqq,\ + cortex_a9_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a9_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 3 "cortex_a9_neon_bp_3cycle" + "cortex_a9_neon_int_1,\ + cortex_a9_neon_int_4,\ + cortex_a9_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a9_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mla_qqq_8_16,\ + cortex_a9_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a9_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a9_neon_fp_vmla_ddd,\ + cortex_a9_neon_fp_vmla_qqq,\ + cortex_a9_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a9_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 2 "cortex_a9_neon_bp_2cycle" + "cortex_a9_neon_int_1,\ + cortex_a9_neon_int_4,\ + cortex_a9_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a9_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mla_qqq_8_16,\ + cortex_a9_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a9_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a9_neon_fp_vmla_ddd,\ + cortex_a9_neon_fp_vmla_qqq,\ + cortex_a9_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a9_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 1 "cortex_a9_neon_bp_simple" + "cortex_a9_neon_int_1,\ + cortex_a9_neon_int_4,\ + cortex_a9_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a9_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mla_qqq_8_16,\ + cortex_a9_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a9_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a9_neon_fp_vmla_ddd,\ + cortex_a9_neon_fp_vmla_qqq,\ + cortex_a9_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a9_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 9 "cortex_a9_neon_fp_vrecps_vrsqrts_qqq" + "cortex_a9_neon_int_1,\ + cortex_a9_neon_int_4,\ + cortex_a9_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a9_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mla_qqq_8_16,\ + cortex_a9_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a9_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a9_neon_fp_vmla_ddd,\ + cortex_a9_neon_fp_vmla_qqq,\ + cortex_a9_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a9_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 8 "cortex_a9_neon_fp_vrecps_vrsqrts_ddd" + "cortex_a9_neon_int_1,\ + cortex_a9_neon_int_4,\ + cortex_a9_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a9_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mla_qqq_8_16,\ + cortex_a9_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a9_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a9_neon_fp_vmla_ddd,\ + cortex_a9_neon_fp_vmla_qqq,\ + cortex_a9_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a9_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 9 "cortex_a9_neon_fp_vmla_qqq_scalar" + "cortex_a9_neon_int_1,\ + cortex_a9_neon_int_4,\ + cortex_a9_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a9_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mla_qqq_8_16,\ + cortex_a9_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a9_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a9_neon_fp_vmla_ddd,\ + cortex_a9_neon_fp_vmla_qqq,\ + cortex_a9_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a9_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 8 "cortex_a9_neon_fp_vmla_ddd_scalar" + "cortex_a9_neon_int_1,\ + cortex_a9_neon_int_4,\ + cortex_a9_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a9_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mla_qqq_8_16,\ + cortex_a9_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a9_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a9_neon_fp_vmla_ddd,\ + cortex_a9_neon_fp_vmla_qqq,\ + cortex_a9_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a9_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 9 "cortex_a9_neon_fp_vmla_qqq" + "cortex_a9_neon_int_1,\ + cortex_a9_neon_int_4,\ + cortex_a9_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a9_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mla_qqq_8_16,\ + cortex_a9_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a9_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a9_neon_fp_vmla_ddd,\ + cortex_a9_neon_fp_vmla_qqq,\ + cortex_a9_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a9_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 8 "cortex_a9_neon_fp_vmla_ddd" + "cortex_a9_neon_int_1,\ + cortex_a9_neon_int_4,\ + cortex_a9_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a9_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mla_qqq_8_16,\ + cortex_a9_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a9_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a9_neon_fp_vmla_ddd,\ + cortex_a9_neon_fp_vmla_qqq,\ + cortex_a9_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a9_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 5 "cortex_a9_neon_fp_vmul_qqd" + "cortex_a9_neon_int_1,\ + cortex_a9_neon_int_4,\ + cortex_a9_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a9_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mla_qqq_8_16,\ + cortex_a9_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a9_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a9_neon_fp_vmla_ddd,\ + cortex_a9_neon_fp_vmla_qqq,\ + cortex_a9_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a9_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 4 "cortex_a9_neon_fp_vmul_ddd" + "cortex_a9_neon_int_1,\ + cortex_a9_neon_int_4,\ + cortex_a9_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a9_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mla_qqq_8_16,\ + cortex_a9_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a9_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a9_neon_fp_vmla_ddd,\ + cortex_a9_neon_fp_vmla_qqq,\ + cortex_a9_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a9_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 4 "cortex_a9_neon_fp_vsum" + "cortex_a9_neon_int_1,\ + cortex_a9_neon_int_4,\ + cortex_a9_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a9_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mla_qqq_8_16,\ + cortex_a9_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a9_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a9_neon_fp_vmla_ddd,\ + cortex_a9_neon_fp_vmla_qqq,\ + cortex_a9_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a9_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 5 "cortex_a9_neon_fp_vadd_qqq_vabs_qq" + "cortex_a9_neon_int_1,\ + cortex_a9_neon_int_4,\ + cortex_a9_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a9_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mla_qqq_8_16,\ + cortex_a9_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a9_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a9_neon_fp_vmla_ddd,\ + cortex_a9_neon_fp_vmla_qqq,\ + cortex_a9_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a9_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 4 "cortex_a9_neon_fp_vadd_ddd_vabs_dd" + "cortex_a9_neon_int_1,\ + cortex_a9_neon_int_4,\ + cortex_a9_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a9_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mla_qqq_8_16,\ + cortex_a9_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a9_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a9_neon_fp_vmla_ddd,\ + cortex_a9_neon_fp_vmla_qqq,\ + cortex_a9_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a9_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 5 "cortex_a9_neon_vsra_vrsra" + "cortex_a9_neon_int_1,\ + cortex_a9_neon_int_4,\ + cortex_a9_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a9_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mla_qqq_8_16,\ + cortex_a9_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a9_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a9_neon_fp_vmla_ddd,\ + cortex_a9_neon_fp_vmla_qqq,\ + cortex_a9_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a9_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 4 "cortex_a9_neon_vqshl_vrshl_vqrshl_qqq" + "cortex_a9_neon_int_1,\ + cortex_a9_neon_int_4,\ + cortex_a9_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a9_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mla_qqq_8_16,\ + cortex_a9_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a9_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a9_neon_fp_vmla_ddd,\ + cortex_a9_neon_fp_vmla_qqq,\ + cortex_a9_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a9_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 0 "cortex_a9_neon_vshl_ddd" + "cortex_a9_neon_int_1,\ + cortex_a9_neon_int_4,\ + cortex_a9_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a9_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mla_qqq_8_16,\ + cortex_a9_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a9_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a9_neon_fp_vmla_ddd,\ + cortex_a9_neon_fp_vmla_qqq,\ + cortex_a9_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a9_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 3 "cortex_a9_neon_shift_3" + "cortex_a9_neon_int_1,\ + cortex_a9_neon_int_4,\ + cortex_a9_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a9_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mla_qqq_8_16,\ + cortex_a9_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a9_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a9_neon_fp_vmla_ddd,\ + cortex_a9_neon_fp_vmla_qqq,\ + cortex_a9_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a9_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 3 "cortex_a9_neon_shift_2" + "cortex_a9_neon_int_1,\ + cortex_a9_neon_int_4,\ + cortex_a9_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a9_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mla_qqq_8_16,\ + cortex_a9_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a9_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a9_neon_fp_vmla_ddd,\ + cortex_a9_neon_fp_vmla_qqq,\ + cortex_a9_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a9_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 2 "cortex_a9_neon_shift_1" + "cortex_a9_neon_int_1,\ + cortex_a9_neon_int_4,\ + cortex_a9_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a9_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mla_qqq_8_16,\ + cortex_a9_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a9_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a9_neon_fp_vmla_ddd,\ + cortex_a9_neon_fp_vmla_qqq,\ + cortex_a9_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a9_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 5 "cortex_a9_neon_mla_ddd_16_scalar_qdd_32_16_long_scalar" + "cortex_a9_neon_int_1,\ + cortex_a9_neon_int_4,\ + cortex_a9_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a9_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mla_qqq_8_16,\ + cortex_a9_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a9_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a9_neon_fp_vmla_ddd,\ + cortex_a9_neon_fp_vmla_qqq,\ + cortex_a9_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a9_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 8 "cortex_a9_neon_mul_qqd_32_scalar" + "cortex_a9_neon_int_1,\ + cortex_a9_neon_int_4,\ + cortex_a9_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a9_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mla_qqq_8_16,\ + cortex_a9_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a9_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a9_neon_fp_vmla_ddd,\ + cortex_a9_neon_fp_vmla_qqq,\ + cortex_a9_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a9_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 5 "cortex_a9_neon_mul_ddd_16_scalar_32_16_long_scalar" + "cortex_a9_neon_int_1,\ + cortex_a9_neon_int_4,\ + cortex_a9_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a9_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mla_qqq_8_16,\ + cortex_a9_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a9_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a9_neon_fp_vmla_ddd,\ + cortex_a9_neon_fp_vmla_qqq,\ + cortex_a9_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a9_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 8 "cortex_a9_neon_mla_qqq_32_qqd_32_scalar" + "cortex_a9_neon_int_1,\ + cortex_a9_neon_int_4,\ + cortex_a9_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a9_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mla_qqq_8_16,\ + cortex_a9_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a9_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a9_neon_fp_vmla_ddd,\ + cortex_a9_neon_fp_vmla_qqq,\ + cortex_a9_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a9_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 6 "cortex_a9_neon_mla_ddd_32_qqd_16_ddd_32_scalar_qdd_64_32_long_scalar_qdd_64_32_long" + "cortex_a9_neon_int_1,\ + cortex_a9_neon_int_4,\ + cortex_a9_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a9_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mla_qqq_8_16,\ + cortex_a9_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a9_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a9_neon_fp_vmla_ddd,\ + cortex_a9_neon_fp_vmla_qqq,\ + cortex_a9_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a9_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 6 "cortex_a9_neon_mla_qqq_8_16" + "cortex_a9_neon_int_1,\ + cortex_a9_neon_int_4,\ + cortex_a9_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a9_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mla_qqq_8_16,\ + cortex_a9_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a9_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a9_neon_fp_vmla_ddd,\ + cortex_a9_neon_fp_vmla_qqq,\ + cortex_a9_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a9_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 5 "cortex_a9_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long" + "cortex_a9_neon_int_1,\ + cortex_a9_neon_int_4,\ + cortex_a9_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a9_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mla_qqq_8_16,\ + cortex_a9_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a9_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a9_neon_fp_vmla_ddd,\ + cortex_a9_neon_fp_vmla_qqq,\ + cortex_a9_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a9_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 6 "cortex_a9_neon_mul_qdd_64_32_long_qqd_16_ddd_32_scalar_64_32_long_scalar" + "cortex_a9_neon_int_1,\ + cortex_a9_neon_int_4,\ + cortex_a9_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a9_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mla_qqq_8_16,\ + cortex_a9_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a9_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a9_neon_fp_vmla_ddd,\ + cortex_a9_neon_fp_vmla_qqq,\ + cortex_a9_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a9_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 6 "cortex_a9_neon_mul_qqq_8_16_32_ddd_32" + "cortex_a9_neon_int_1,\ + cortex_a9_neon_int_4,\ + cortex_a9_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a9_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mla_qqq_8_16,\ + cortex_a9_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a9_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a9_neon_fp_vmla_ddd,\ + cortex_a9_neon_fp_vmla_qqq,\ + cortex_a9_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a9_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 5 "cortex_a9_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long" + "cortex_a9_neon_int_1,\ + cortex_a9_neon_int_4,\ + cortex_a9_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a9_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mla_qqq_8_16,\ + cortex_a9_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a9_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a9_neon_fp_vmla_ddd,\ + cortex_a9_neon_fp_vmla_qqq,\ + cortex_a9_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a9_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 5 "cortex_a9_neon_vsma" + "cortex_a9_neon_int_1,\ + cortex_a9_neon_int_4,\ + cortex_a9_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a9_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mla_qqq_8_16,\ + cortex_a9_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a9_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a9_neon_fp_vmla_ddd,\ + cortex_a9_neon_fp_vmla_qqq,\ + cortex_a9_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a9_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 6 "cortex_a9_neon_vaba_qqq" + "cortex_a9_neon_int_1,\ + cortex_a9_neon_int_4,\ + cortex_a9_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a9_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mla_qqq_8_16,\ + cortex_a9_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a9_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a9_neon_fp_vmla_ddd,\ + cortex_a9_neon_fp_vmla_qqq,\ + cortex_a9_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a9_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 5 "cortex_a9_neon_vaba" + "cortex_a9_neon_int_1,\ + cortex_a9_neon_int_4,\ + cortex_a9_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a9_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mla_qqq_8_16,\ + cortex_a9_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a9_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a9_neon_fp_vmla_ddd,\ + cortex_a9_neon_fp_vmla_qqq,\ + cortex_a9_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a9_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 2 "cortex_a9_neon_vmov" + "cortex_a9_neon_int_1,\ + cortex_a9_neon_int_4,\ + cortex_a9_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a9_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mla_qqq_8_16,\ + cortex_a9_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a9_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a9_neon_fp_vmla_ddd,\ + cortex_a9_neon_fp_vmla_qqq,\ + cortex_a9_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a9_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 3 "cortex_a9_neon_vqneg_vqabs" + "cortex_a9_neon_int_1,\ + cortex_a9_neon_int_4,\ + cortex_a9_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a9_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mla_qqq_8_16,\ + cortex_a9_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a9_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a9_neon_fp_vmla_ddd,\ + cortex_a9_neon_fp_vmla_qqq,\ + cortex_a9_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a9_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 3 "cortex_a9_neon_int_5" + "cortex_a9_neon_int_1,\ + cortex_a9_neon_int_4,\ + cortex_a9_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a9_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mla_qqq_8_16,\ + cortex_a9_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a9_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a9_neon_fp_vmla_ddd,\ + cortex_a9_neon_fp_vmla_qqq,\ + cortex_a9_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a9_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 3 "cortex_a9_neon_int_4" + "cortex_a9_neon_int_1,\ + cortex_a9_neon_int_4,\ + cortex_a9_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a9_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mla_qqq_8_16,\ + cortex_a9_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a9_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a9_neon_fp_vmla_ddd,\ + cortex_a9_neon_fp_vmla_qqq,\ + cortex_a9_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a9_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 2 "cortex_a9_neon_int_3" + "cortex_a9_neon_int_1,\ + cortex_a9_neon_int_4,\ + cortex_a9_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a9_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mla_qqq_8_16,\ + cortex_a9_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a9_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a9_neon_fp_vmla_ddd,\ + cortex_a9_neon_fp_vmla_qqq,\ + cortex_a9_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a9_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 2 "cortex_a9_neon_int_2" + "cortex_a9_neon_int_1,\ + cortex_a9_neon_int_4,\ + cortex_a9_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a9_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mla_qqq_8_16,\ + cortex_a9_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a9_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a9_neon_fp_vmla_ddd,\ + cortex_a9_neon_fp_vmla_qqq,\ + cortex_a9_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a9_neon_fp_vrecps_vrsqrts_qqq") + +(define_bypass 2 "cortex_a9_neon_int_1" + "cortex_a9_neon_int_1,\ + cortex_a9_neon_int_4,\ + cortex_a9_neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mul_qqq_8_16_32_ddd_32,\ + cortex_a9_neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\ + cortex_a9_neon_mla_qqq_8_16,\ + cortex_a9_neon_fp_vadd_ddd_vabs_dd,\ + cortex_a9_neon_fp_vadd_qqq_vabs_qq,\ + cortex_a9_neon_fp_vmla_ddd,\ + cortex_a9_neon_fp_vmla_qqq,\ + cortex_a9_neon_fp_vrecps_vrsqrts_ddd,\ + cortex_a9_neon_fp_vrecps_vrsqrts_qqq") + diff --git a/gcc/config/arm/cortex-a9.md b/gcc/config/arm/cortex-a9.md new file mode 100644 index 000000000..b74ace833 --- /dev/null +++ b/gcc/config/arm/cortex-a9.md @@ -0,0 +1,269 @@ +;; ARM Cortex-A9 pipeline description +;; Copyright (C) 2008, 2009, 2010 Free Software Foundation, Inc. +;; Originally written by CodeSourcery for VFP. +;; +;; Rewritten by Ramana Radhakrishnan +;; Integer Pipeline description contributed by ARM Ltd. +;; VFP Pipeline description rewritten and contributed by ARM Ltd. + +;; This file is part of GCC. +;; +;; GCC is free software; you can redistribute it and/or modify it +;; under the terms of the GNU General Public License as published by +;; the Free Software Foundation; either version 3, or (at your option) +;; any later version. +;; +;; GCC is distributed in the hope that it will be useful, but +;; WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;; General Public License for more details. +;; +;; You should have received a copy of the GNU General Public License +;; along with GCC; see the file COPYING3. If not see +;; . + +(define_automaton "cortex_a9") + +;; The Cortex-A9 core is modelled as a dual issue pipeline that has +;; the following components. +;; 1. 1 Load Store Pipeline. +;; 2. P0 / main pipeline for data processing instructions. +;; 3. P1 / Dual pipeline for Data processing instructions. +;; 4. MAC pipeline for multiply as well as multiply +;; and accumulate instructions. +;; 5. 1 VFP and an optional Neon unit. +;; The Load/Store, VFP and Neon issue pipeline are multiplexed. +;; The P0 / main pipeline and M1 stage of the MAC pipeline are +;; multiplexed. +;; The P1 / dual pipeline and M2 stage of the MAC pipeline are +;; multiplexed. +;; There are only 4 integer register read ports and hence at any point of +;; time we can't have issue down the E1 and the E2 ports unless +;; of course there are bypass paths that get exercised. +;; Both P0 and P1 have 2 stages E1 and E2. +;; Data processing instructions issue to E1 or E2 depending on +;; whether they have an early shift or not. + +(define_cpu_unit "ca9_issue_vfp_neon, cortex_a9_ls" "cortex_a9") +(define_cpu_unit "cortex_a9_p0_e1, cortex_a9_p0_e2" "cortex_a9") +(define_cpu_unit "cortex_a9_p1_e1, cortex_a9_p1_e2" "cortex_a9") +(define_cpu_unit "cortex_a9_p0_wb, cortex_a9_p1_wb" "cortex_a9") +(define_cpu_unit "cortex_a9_mac_m1, cortex_a9_mac_m2" "cortex_a9") +(define_cpu_unit "cortex_a9_branch, cortex_a9_issue_branch" "cortex_a9") + +(define_reservation "cortex_a9_p0_default" "cortex_a9_p0_e2, cortex_a9_p0_wb") +(define_reservation "cortex_a9_p1_default" "cortex_a9_p1_e2, cortex_a9_p1_wb") +(define_reservation "cortex_a9_p0_shift" "cortex_a9_p0_e1, cortex_a9_p0_default") +(define_reservation "cortex_a9_p1_shift" "cortex_a9_p1_e1, cortex_a9_p1_default") + +(define_reservation "cortex_a9_multcycle1" + "cortex_a9_p0_e2 + cortex_a9_mac_m1 + cortex_a9_mac_m2 + \ +cortex_a9_p1_e2 + cortex_a9_p0_e1 + cortex_a9_p1_e1") + +(define_reservation "cortex_a9_mult16" + "cortex_a9_mac_m1, cortex_a9_mac_m2, cortex_a9_p0_wb") +(define_reservation "cortex_a9_mac16" + "cortex_a9_multcycle1, cortex_a9_mac_m2, cortex_a9_p0_wb") +(define_reservation "cortex_a9_mult" + "cortex_a9_mac_m1*2, cortex_a9_mac_m2, cortex_a9_p0_wb") +(define_reservation "cortex_a9_mac" + "cortex_a9_multcycle1*2 ,cortex_a9_mac_m2, cortex_a9_p0_wb") + + +;; Issue at the same time along the load store pipeline and +;; the VFP / Neon pipeline is not possible. +(exclusion_set "cortex_a9_ls" "ca9_issue_vfp_neon") + +;; Default data processing instruction without any shift +;; The only exception to this is the mov instruction +;; which can go down E2 without any problem. +(define_insn_reservation "cortex_a9_dp" 2 + (and (eq_attr "tune" "cortexa9") + (ior (and (eq_attr "type" "alu") + (eq_attr "neon_type" "none")) + (and (and (eq_attr "type" "alu_shift_reg, alu_shift") + (eq_attr "insn" "mov")) + (eq_attr "neon_type" "none")))) + "cortex_a9_p0_default|cortex_a9_p1_default") + +;; An instruction using the shifter will go down E1. +(define_insn_reservation "cortex_a9_dp_shift" 3 + (and (eq_attr "tune" "cortexa9") + (and (eq_attr "type" "alu_shift_reg, alu_shift") + (not (eq_attr "insn" "mov")))) + "cortex_a9_p0_shift | cortex_a9_p1_shift") + +;; Loads have a latency of 4 cycles. +;; We don't model autoincrement instructions. These +;; instructions use the load store pipeline and 1 of +;; the E2 units to write back the result of the increment. + +(define_insn_reservation "cortex_a9_load1_2" 4 + (and (eq_attr "tune" "cortexa9") + (eq_attr "type" "load1, load2, load_byte, f_loads, f_loadd")) + "cortex_a9_ls") + +;; Loads multiples and store multiples can't be issued for 2 cycles in a +;; row. The description below assumes that addresses are 64 bit aligned. +;; If not, there is an extra cycle latency which is not modelled. + +(define_insn_reservation "cortex_a9_load3_4" 5 + (and (eq_attr "tune" "cortexa9") + (eq_attr "type" "load3, load4")) + "cortex_a9_ls, cortex_a9_ls") + +(define_insn_reservation "cortex_a9_store1_2" 0 + (and (eq_attr "tune" "cortexa9") + (eq_attr "type" "store1, store2, f_stores, f_stored")) + "cortex_a9_ls") + +;; Almost all our store multiples use an auto-increment +;; form. Don't issue back to back load and store multiples +;; because the load store unit will stall. + +(define_insn_reservation "cortex_a9_store3_4" 0 + (and (eq_attr "tune" "cortexa9") + (eq_attr "type" "store3, store4")) + "cortex_a9_ls+(cortex_a9_p0_default | cortex_a9_p1_default), cortex_a9_ls") + +;; We get 16*16 multiply / mac results in 3 cycles. +(define_insn_reservation "cortex_a9_mult16" 3 + (and (eq_attr "tune" "cortexa9") + (eq_attr "insn" "smulxy")) + "cortex_a9_mult16") + +;; The 16*16 mac is slightly different that it +;; reserves M1 and M2 in the same cycle. +(define_insn_reservation "cortex_a9_mac16" 3 + (and (eq_attr "tune" "cortexa9") + (eq_attr "insn" "smlaxy")) + "cortex_a9_mac16") + + +(define_insn_reservation "cortex_a9_multiply" 4 + (and (eq_attr "tune" "cortexa9") + (eq_attr "insn" "mul")) + "cortex_a9_mult") + +(define_insn_reservation "cortex_a9_mac" 4 + (and (eq_attr "tune" "cortexa9") + (eq_attr "insn" "mla")) + "cortex_a9_mac") + +;; An instruction with a result in E2 can be forwarded +;; to E2 or E1 or M1 or the load store unit in the next cycle. + +(define_bypass 1 "cortex_a9_dp" + "cortex_a9_dp_shift, cortex_a9_multiply, + cortex_a9_load1_2, cortex_a9_dp, cortex_a9_store1_2, + cortex_a9_mult16, cortex_a9_mac16, cortex_a9_mac, cortex_a9_store3_4, cortex_a9_load3_4") + +(define_bypass 2 "cortex_a9_dp_shift" + "cortex_a9_dp_shift, cortex_a9_multiply, + cortex_a9_load1_2, cortex_a9_dp, cortex_a9_store1_2, + cortex_a9_mult16, cortex_a9_mac16, cortex_a9_mac, cortex_a9_store3_4, cortex_a9_load3_4") + +;; An instruction in the load store pipeline can provide +;; read access to a DP instruction in the P0 default pipeline +;; before the writeback stage. + +(define_bypass 3 "cortex_a9_load1_2" "cortex_a9_dp, cortex_a9_load1_2, +cortex_a9_store3_4, cortex_a9_store1_2") + +(define_bypass 4 "cortex_a9_load3_4" "cortex_a9_dp, cortex_a9_load1_2, +cortex_a9_store3_4, cortex_a9_store1_2, cortex_a9_load3_4") + +;; Calls and branches. + +;; Branch instructions + +(define_insn_reservation "cortex_a9_branch" 0 + (and (eq_attr "tune" "cortexa9") + (eq_attr "type" "branch")) + "cortex_a9_branch") + +;; Call latencies are essentially 0 but make sure +;; dual issue doesn't happen i.e the next instruction +;; starts at the next cycle. +(define_insn_reservation "cortex_a9_call" 0 + (and (eq_attr "tune" "cortexa9") + (eq_attr "type" "call")) + "cortex_a9_issue_branch + cortex_a9_multcycle1 + cortex_a9_ls + ca9_issue_vfp_neon") + + +;; Pipelining for VFP instructions. +;; Issue happens either along load store unit or the VFP / Neon unit. +;; Pipeline Instruction Classification. +;; FPS - fcpys, ffariths, ffarithd,r_2_f,f_2_r +;; FP_ADD - fadds, faddd, fcmps (1) +;; FPMUL - fmul{s,d}, fmac{s,d} +;; FPDIV - fdiv{s,d} +(define_cpu_unit "ca9fps" "cortex_a9") +(define_cpu_unit "ca9fp_add1, ca9fp_add2, ca9fp_add3, ca9fp_add4" "cortex_a9") +(define_cpu_unit "ca9fp_mul1, ca9fp_mul2 , ca9fp_mul3, ca9fp_mul4" "cortex_a9") +(define_cpu_unit "ca9fp_ds1" "cortex_a9") + + +;; fmrs, fmrrd, fmstat and fmrx - The data is available after 1 cycle. +(define_insn_reservation "cortex_a9_fps" 2 + (and (eq_attr "tune" "cortexa9") + (eq_attr "type" "fcpys, fconsts, fconstd, ffariths, ffarithd, r_2_f, f_2_r, f_flag")) + "ca9_issue_vfp_neon + ca9fps") + +(define_bypass 1 + "cortex_a9_fps" + "cortex_a9_fadd, cortex_a9_fps, cortex_a9_fcmp, cortex_a9_dp, cortex_a9_dp_shift, cortex_a9_multiply") + +;; Scheduling on the FP_ADD pipeline. +(define_reservation "ca9fp_add" "ca9_issue_vfp_neon + ca9fp_add1, ca9fp_add2, ca9fp_add3, ca9fp_add4") + +(define_insn_reservation "cortex_a9_fadd" 4 + (and (eq_attr "tune" "cortexa9") + (eq_attr "type" "fadds, faddd, f_cvt")) + "ca9fp_add") + +(define_insn_reservation "cortex_a9_fcmp" 1 + (and (eq_attr "tune" "cortexa9") + (eq_attr "type" "fcmps, fcmpd")) + "ca9_issue_vfp_neon + ca9fp_add1") + +;; Scheduling for the Multiply and MAC instructions. +(define_reservation "ca9fmuls" + "ca9fp_mul1 + ca9_issue_vfp_neon, ca9fp_mul2, ca9fp_mul3, ca9fp_mul4") + +(define_reservation "ca9fmuld" + "ca9fp_mul1 + ca9_issue_vfp_neon, (ca9fp_mul1 + ca9fp_mul2), ca9fp_mul2, ca9fp_mul3, ca9fp_mul4") + +(define_insn_reservation "cortex_a9_fmuls" 4 + (and (eq_attr "tune" "cortexa9") + (eq_attr "type" "fmuls")) + "ca9fmuls") + +(define_insn_reservation "cortex_a9_fmuld" 5 + (and (eq_attr "tune" "cortexa9") + (eq_attr "type" "fmuld")) + "ca9fmuld") + +(define_insn_reservation "cortex_a9_fmacs" 8 + (and (eq_attr "tune" "cortexa9") + (eq_attr "type" "fmacs")) + "ca9fmuls, ca9fp_add") + +(define_insn_reservation "cortex_a9_fmacd" 9 + (and (eq_attr "tune" "cortexa9") + (eq_attr "type" "fmacd")) + "ca9fmuld, ca9fp_add") + +;; Division pipeline description. +(define_insn_reservation "cortex_a9_fdivs" 15 + (and (eq_attr "tune" "cortexa9") + (eq_attr "type" "fdivs")) + "ca9fp_ds1 + ca9_issue_vfp_neon, nothing*14") + +(define_insn_reservation "cortex_a9_fdivd" 25 + (and (eq_attr "tune" "cortexa9") + (eq_attr "type" "fdivd")) + "ca9fp_ds1 + ca9_issue_vfp_neon, nothing*24") + +;; Include Neon pipeline description +(include "cortex-a9-neon.md") diff --git a/gcc/config/arm/cortex-m4-fpu.md b/gcc/config/arm/cortex-m4-fpu.md new file mode 100644 index 000000000..6fd5faf74 --- /dev/null +++ b/gcc/config/arm/cortex-m4-fpu.md @@ -0,0 +1,111 @@ +;; ARM Cortex-M4 FPU pipeline description +;; Copyright (C) 2010 Free Software Foundation, Inc. +;; Contributed by CodeSourcery. +;; +;; This file is part of GCC. +;; +;; GCC is free software; you can redistribute it and/or modify it +;; under the terms of the GNU General Public License as published by +;; the Free Software Foundation; either version 3, or (at your option) +;; any later version. +;; +;; GCC is distributed in the hope that it will be useful, but +;; WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;; General Public License for more details. +;; +;; You should have received a copy of the GNU General Public License +;; along with GCC; see the file COPYING3. If not see +;; . + +;; Use an artifial unit to model FPU. +(define_cpu_unit "cortex_m4_v" "cortex_m4") + +(define_reservation "cortex_m4_ex_v" "cortex_m4_ex+cortex_m4_v") + +;; Integer instructions following VDIV or VSQRT complete out-of-order. +(define_insn_reservation "cortex_m4_fdivs" 15 + (and (eq_attr "tune" "cortexm4") + (eq_attr "type" "fdivs")) + "cortex_m4_ex_v,cortex_m4_v*13") + +(define_insn_reservation "cortex_m4_vmov_1" 1 + (and (eq_attr "tune" "cortexm4") + (eq_attr "type" "fcpys,fconsts")) + "cortex_m4_ex_v") + +(define_insn_reservation "cortex_m4_vmov_2" 2 + (and (eq_attr "tune" "cortexm4") + (eq_attr "type" "f_2_r,r_2_f")) + "cortex_m4_ex_v*2") + +(define_insn_reservation "cortex_m4_fmuls" 2 + (and (eq_attr "tune" "cortexm4") + (eq_attr "type" "fmuls")) + "cortex_m4_ex_v") + +(define_insn_reservation "cortex_m4_fmacs" 4 + (and (eq_attr "tune" "cortexm4") + (eq_attr "type" "fmacs")) + "cortex_m4_ex_v*3") + +(define_insn_reservation "cortex_m4_ffariths" 1 + (and (eq_attr "tune" "cortexm4") + (eq_attr "type" "ffariths")) + "cortex_m4_ex_v") + +(define_insn_reservation "cortex_m4_fadds" 2 + (and (eq_attr "tune" "cortexm4") + (eq_attr "type" "fadds")) + "cortex_m4_ex_v") + +(define_insn_reservation "cortex_m4_fcmps" 1 + (and (eq_attr "tune" "cortexm4") + (eq_attr "type" "fcmps")) + "cortex_m4_ex_v") + +(define_insn_reservation "cortex_m4_f_flag" 1 + (and (eq_attr "tune" "cortexm4") + (eq_attr "type" "f_flag")) + "cortex_m4_ex_v") + +(define_insn_reservation "cortex_m4_f_cvt" 2 + (and (eq_attr "tune" "cortexm4") + (eq_attr "type" "f_cvt")) + "cortex_m4_ex_v") + +(define_insn_reservation "cortex_m4_f_load" 2 + (and (eq_attr "tune" "cortexm4") + (eq_attr "type" "f_loads")) + "cortex_m4_ex_v*2") + +(define_insn_reservation "cortex_m4_f_store" 2 + (and (eq_attr "tune" "cortexm4") + (eq_attr "type" "f_stores")) + "cortex_m4_ex_v*2") + +(define_insn_reservation "cortex_m4_f_loadd" 3 + (and (eq_attr "tune" "cortexm4") + (eq_attr "type" "f_loadd")) + "cortex_m4_ex_v*3") + +(define_insn_reservation "cortex_m4_f_stored" 3 + (and (eq_attr "tune" "cortexm4") + (eq_attr "type" "f_stored")) + "cortex_m4_ex_v*3") + +;; MAC instructions consume their addend one cycle later. If the result +;; of an arithmetic instruction is consumed as the addend of the following +;; MAC instruction, the latency can be decreased by one. + +(define_bypass 1 "cortex_m4_fadds,cortex_m4_fmuls,cortex_m4_f_cvt" + "cortex_m4_fmacs" + "arm_no_early_mul_dep") + +(define_bypass 3 "cortex_m4_fmacs" + "cortex_m4_fmacs" + "arm_no_early_mul_dep") + +(define_bypass 14 "cortex_m4_fdivs" + "cortex_m4_fmacs" + "arm_no_early_mul_dep") diff --git a/gcc/config/arm/cortex-m4.md b/gcc/config/arm/cortex-m4.md new file mode 100644 index 000000000..b71037585 --- /dev/null +++ b/gcc/config/arm/cortex-m4.md @@ -0,0 +1,111 @@ +;; ARM Cortex-M4 pipeline description +;; Copyright (C) 2010 Free Software Foundation, Inc. +;; Contributed by CodeSourcery. +;; +;; This file is part of GCC. +;; +;; GCC is free software; you can redistribute it and/or modify it +;; under the terms of the GNU General Public License as published by +;; the Free Software Foundation; either version 3, or (at your option) +;; any later version. +;; +;; GCC is distributed in the hope that it will be useful, but +;; WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;; General Public License for more details. +;; +;; You should have received a copy of the GNU General Public License +;; along with GCC; see the file COPYING3. If not see +;; . + +(define_automaton "cortex_m4") + +;; We model the pipelining of LDR instructions by using two artificial units. + +(define_cpu_unit "cortex_m4_a" "cortex_m4") + +(define_cpu_unit "cortex_m4_b" "cortex_m4") + +(define_reservation "cortex_m4_ex" "cortex_m4_a+cortex_m4_b") + +;; ALU and multiply is one cycle. +(define_insn_reservation "cortex_m4_alu" 1 + (and (eq_attr "tune" "cortexm4") + (eq_attr "type" "alu,alu_shift,alu_shift_reg,mult")) + "cortex_m4_ex") + +;; Byte, half-word and word load is two cycles. +(define_insn_reservation "cortex_m4_load1" 2 + (and (eq_attr "tune" "cortexm4") + (eq_attr "type" "load_byte,load1")) + "cortex_m4_a, cortex_m4_b") + +;; str rx, [ry, #imm] is always one cycle. +(define_insn_reservation "cortex_m4_store1_1" 1 + (and (and (eq_attr "tune" "cortexm4") + (eq_attr "type" "store1")) + (ne (symbol_ref ("arm_address_offset_is_imm (insn)")) (const_int 0))) + "cortex_m4_a") + +;; Other byte, half-word and word load is two cycles. +(define_insn_reservation "cortex_m4_store1_2" 2 + (and (and (eq_attr "tune" "cortexm4") + (eq_attr "type" "store1")) + (eq (symbol_ref ("arm_address_offset_is_imm (insn)")) (const_int 0))) + "cortex_m4_a*2") + +(define_insn_reservation "cortex_m4_load2" 3 + (and (eq_attr "tune" "cortexm4") + (eq_attr "type" "load2")) + "cortex_m4_ex*3") + +(define_insn_reservation "cortex_m4_store2" 3 + (and (eq_attr "tune" "cortexm4") + (eq_attr "type" "store2")) + "cortex_m4_ex*3") + +(define_insn_reservation "cortex_m4_load3" 4 + (and (eq_attr "tune" "cortexm4") + (eq_attr "type" "load3")) + "cortex_m4_ex*4") + +(define_insn_reservation "cortex_m4_store3" 4 + (and (eq_attr "tune" "cortexm4") + (eq_attr "type" "store3")) + "cortex_m4_ex*4") + +(define_insn_reservation "cortex_m4_load4" 5 + (and (eq_attr "tune" "cortexm4") + (eq_attr "type" "load4")) + "cortex_m4_ex*5") + +(define_insn_reservation "cortex_m4_store4" 5 + (and (eq_attr "tune" "cortexm4") + (eq_attr "type" "store4")) + "cortex_m4_ex*5") + +;; If the address of load or store depends on the result of the preceding +;; instruction, the latency is increased by one. + +(define_bypass 2 "cortex_m4_alu" + "cortex_m4_load1" + "arm_early_load_addr_dep") + +(define_bypass 2 "cortex_m4_alu" + "cortex_m4_store1_1,cortex_m4_store1_2" + "arm_early_store_addr_dep") + +(define_insn_reservation "cortex_m4_branch" 3 + (and (eq_attr "tune" "cortexm4") + (eq_attr "type" "branch")) + "cortex_m4_ex*3") + +(define_insn_reservation "cortex_m4_call" 3 + (and (eq_attr "tune" "cortexm4") + (eq_attr "type" "call")) + "cortex_m4_ex*3") + +(define_insn_reservation "cortex_m4_block" 1 + (and (eq_attr "tune" "cortexm4") + (eq_attr "type" "block")) + "cortex_m4_ex") diff --git a/gcc/config/arm/cortex-r4.md b/gcc/config/arm/cortex-r4.md new file mode 100644 index 000000000..e26c3d45d --- /dev/null +++ b/gcc/config/arm/cortex-r4.md @@ -0,0 +1,292 @@ +;; ARM Cortex-R4 scheduling description. +;; Copyright (C) 2007, 2008 Free Software Foundation, Inc. +;; Contributed by CodeSourcery. + +;; This file is part of GCC. + +;; GCC is free software; you can redistribute it and/or modify it +;; under the terms of the GNU General Public License as published +;; by the Free Software Foundation; either version 3, or (at your +;; option) any later version. + +;; GCC is distributed in the hope that it will be useful, but WITHOUT +;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +;; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public +;; License for more details. + +;; You should have received a copy of the GNU General Public License +;; along with GCC; see the file COPYING3. If not see +;; . + +(define_automaton "cortex_r4") + +;; We approximate the dual-issue constraints of this core using four +;; "issue units" and a reservation matrix as follows. The numbers indicate +;; the instruction groups' preferences in order. Multiple entries for +;; the same numbered preference indicate units that must be reserved +;; together. +;; +;; Issue unit: A B C ALU +;; +;; ALU w/o reg shift 1st 2nd 1st and 2nd +;; ALU w/ reg shift 1st 2nd 2nd 1st and 2nd +;; Moves 1st 2nd 2nd +;; Multiplication 1st 1st +;; Division 1st 1st +;; Load/store single 1st 1st +;; Other load/store 1st 1st +;; Branches 1st + +(define_cpu_unit "cortex_r4_issue_a" "cortex_r4") +(define_cpu_unit "cortex_r4_issue_b" "cortex_r4") +(define_cpu_unit "cortex_r4_issue_c" "cortex_r4") +(define_cpu_unit "cortex_r4_issue_alu" "cortex_r4") + +(define_reservation "cortex_r4_alu" + "(cortex_r4_issue_a+cortex_r4_issue_alu)|\ + (cortex_r4_issue_b+cortex_r4_issue_alu)") +(define_reservation "cortex_r4_alu_shift_reg" + "(cortex_r4_issue_a+cortex_r4_issue_alu)|\ + (cortex_r4_issue_b+cortex_r4_issue_c+\ + cortex_r4_issue_alu)") +(define_reservation "cortex_r4_mov" + "cortex_r4_issue_a|(cortex_r4_issue_b+\ + cortex_r4_issue_alu)") +(define_reservation "cortex_r4_mul" "cortex_r4_issue_a+cortex_r4_issue_alu") +(define_reservation "cortex_r4_mul_2" + "(cortex_r4_issue_a+cortex_r4_issue_alu)*2") +;; Division instructions execute out-of-order with respect to the +;; rest of the pipeline and only require reservations on their first and +;; final cycles. +(define_reservation "cortex_r4_div_9" + "cortex_r4_issue_a+cortex_r4_issue_alu,\ + nothing*7,\ + cortex_r4_issue_a+cortex_r4_issue_alu") +(define_reservation "cortex_r4_div_10" + "cortex_r4_issue_a+cortex_r4_issue_alu,\ + nothing*8,\ + cortex_r4_issue_a+cortex_r4_issue_alu") +(define_reservation "cortex_r4_load_store" + "cortex_r4_issue_a+cortex_r4_issue_c") +(define_reservation "cortex_r4_load_store_2" + "(cortex_r4_issue_a+cortex_r4_issue_b)*2") +(define_reservation "cortex_r4_branch" "cortex_r4_issue_b") + +;; We assume that all instructions are unconditional. + +;; Data processing instructions. Moves without shifts are kept separate +;; for the purposes of the dual-issue constraints above. +(define_insn_reservation "cortex_r4_alu" 2 + (and (eq_attr "tune_cortexr4" "yes") + (and (eq_attr "type" "alu") + (not (eq_attr "insn" "mov")))) + "cortex_r4_alu") + +(define_insn_reservation "cortex_r4_mov" 2 + (and (eq_attr "tune_cortexr4" "yes") + (and (eq_attr "type" "alu") + (eq_attr "insn" "mov"))) + "cortex_r4_mov") + +(define_insn_reservation "cortex_r4_alu_shift" 2 + (and (eq_attr "tune_cortexr4" "yes") + (eq_attr "type" "alu_shift")) + "cortex_r4_alu") + +(define_insn_reservation "cortex_r4_alu_shift_reg" 2 + (and (eq_attr "tune_cortexr4" "yes") + (eq_attr "type" "alu_shift_reg")) + "cortex_r4_alu_shift_reg") + +;; An ALU instruction followed by an ALU instruction with no early dep. +(define_bypass 1 "cortex_r4_alu,cortex_r4_alu_shift,cortex_r4_alu_shift_reg,\ + cortex_r4_mov" + "cortex_r4_alu") +(define_bypass 1 "cortex_r4_alu,cortex_r4_alu_shift,cortex_r4_alu_shift_reg,\ + cortex_r4_mov" + "cortex_r4_alu_shift" + "arm_no_early_alu_shift_dep") +(define_bypass 1 "cortex_r4_alu,cortex_r4_alu_shift,cortex_r4_alu_shift_reg,\ + cortex_r4_mov" + "cortex_r4_alu_shift_reg" + "arm_no_early_alu_shift_value_dep") + +;; In terms of availabilities, a consumer mov could theoretically be +;; issued together with a producer ALU instruction, without stalls. +;; In practice this cannot happen because mov;add (in that order) is not +;; eligible for dual issue and furthermore dual issue is not permitted +;; when a dependency is involved. We therefore note it as latency one. +;; A mov followed by another of the same is also latency one. +(define_bypass 1 "cortex_r4_alu,cortex_r4_alu_shift,cortex_r4_alu_shift_reg,\ + cortex_r4_mov" + "cortex_r4_mov") + +;; qadd, qdadd, qsub and qdsub are not currently emitted, and neither are +;; media data processing instructions nor sad instructions. + +;; Multiplication instructions. + +(define_insn_reservation "cortex_r4_mul_4" 4 + (and (eq_attr "tune_cortexr4" "yes") + (eq_attr "insn" "mul,smmul")) + "cortex_r4_mul_2") + +(define_insn_reservation "cortex_r4_mul_3" 3 + (and (eq_attr "tune_cortexr4" "yes") + (eq_attr "insn" "smulxy,smulwy,smuad,smusd")) + "cortex_r4_mul") + +(define_insn_reservation "cortex_r4_mla_4" 4 + (and (eq_attr "tune_cortexr4" "yes") + (eq_attr "insn" "mla,smmla")) + "cortex_r4_mul_2") + +(define_insn_reservation "cortex_r4_mla_3" 3 + (and (eq_attr "tune_cortexr4" "yes") + (eq_attr "insn" "smlaxy,smlawy,smlad,smlsd")) + "cortex_r4_mul") + +(define_insn_reservation "cortex_r4_smlald" 3 + (and (eq_attr "tune_cortexr4" "yes") + (eq_attr "insn" "smlald,smlsld")) + "cortex_r4_mul") + +(define_insn_reservation "cortex_r4_mull" 4 + (and (eq_attr "tune_cortexr4" "yes") + (eq_attr "insn" "smull,umull,umlal,umaal")) + "cortex_r4_mul_2") + +;; A multiply or an MLA with a single-register result, followed by an +;; MLA with an accumulator dependency, has its result forwarded. +(define_bypass 2 "cortex_r4_mul_3,cortex_r4_mla_3" + "cortex_r4_mla_3,cortex_r4_mla_4" + "arm_mac_accumulator_is_mul_result") + +(define_bypass 3 "cortex_r4_mul_4,cortex_r4_mla_4" + "cortex_r4_mla_3,cortex_r4_mla_4" + "arm_mac_accumulator_is_mul_result") + +;; A multiply followed by an ALU instruction needing the multiply +;; result only at ALU has lower latency than one needing it at Shift. +(define_bypass 2 "cortex_r4_mul_3,cortex_r4_mla_3,cortex_r4_smlald" + "cortex_r4_alu") +(define_bypass 2 "cortex_r4_mul_3,cortex_r4_mla_3,cortex_r4_smlald" + "cortex_r4_alu_shift" + "arm_no_early_alu_shift_dep") +(define_bypass 2 "cortex_r4_mul_3,cortex_r4_mla_3,cortex_r4_smlald" + "cortex_r4_alu_shift_reg" + "arm_no_early_alu_shift_value_dep") +(define_bypass 3 "cortex_r4_mul_4,cortex_r4_mla_4,cortex_r4_mull" + "cortex_r4_alu") +(define_bypass 3 "cortex_r4_mul_4,cortex_r4_mla_4,cortex_r4_mull" + "cortex_r4_alu_shift" + "arm_no_early_alu_shift_dep") +(define_bypass 3 "cortex_r4_mul_4,cortex_r4_mla_4,cortex_r4_mull" + "cortex_r4_alu_shift_reg" + "arm_no_early_alu_shift_value_dep") + +;; A multiply followed by a mov has one cycle lower latency again. +(define_bypass 1 "cortex_r4_mul_3,cortex_r4_mla_3,cortex_r4_smlald" + "cortex_r4_mov") +(define_bypass 2 "cortex_r4_mul_4,cortex_r4_mla_4,cortex_r4_mull" + "cortex_r4_mov") + +;; We guess that division of A/B using sdiv or udiv, on average, +;; is performed with B having ten more leading zeros than A. +;; This gives a latency of nine for udiv and ten for sdiv. +(define_insn_reservation "cortex_r4_udiv" 9 + (and (eq_attr "tune_cortexr4" "yes") + (eq_attr "insn" "udiv")) + "cortex_r4_div_9") + +(define_insn_reservation "cortex_r4_sdiv" 10 + (and (eq_attr "tune_cortexr4" "yes") + (eq_attr "insn" "sdiv")) + "cortex_r4_div_10") + +;; Branches. We assume correct prediction. + +(define_insn_reservation "cortex_r4_branch" 0 + (and (eq_attr "tune_cortexr4" "yes") + (eq_attr "type" "branch")) + "cortex_r4_branch") + +;; Call latencies are not predictable. A semi-arbitrary very large +;; number is used as "positive infinity" so that everything should be +;; finished by the time of return. +(define_insn_reservation "cortex_r4_call" 32 + (and (eq_attr "tune_cortexr4" "yes") + (eq_attr "type" "call")) + "nothing") + +;; Status register access instructions are not currently emitted. + +;; Load instructions. +;; We do not model the "addr_md_3cycle" cases and assume that +;; accesses following are correctly aligned. + +(define_insn_reservation "cortex_r4_load_1_2" 3 + (and (eq_attr "tune_cortexr4" "yes") + (eq_attr "type" "load1,load2")) + "cortex_r4_load_store") + +(define_insn_reservation "cortex_r4_load_3_4" 4 + (and (eq_attr "tune_cortexr4" "yes") + (eq_attr "type" "load3,load4")) + "cortex_r4_load_store_2") + +;; If a producing load is followed by an instruction consuming only +;; as a Normal Reg, there is one fewer cycle of latency. + +(define_bypass 2 "cortex_r4_load_1_2" + "cortex_r4_alu") +(define_bypass 2 "cortex_r4_load_1_2" + "cortex_r4_alu_shift" + "arm_no_early_alu_shift_dep") +(define_bypass 2 "cortex_r4_load_1_2" + "cortex_r4_alu_shift_reg" + "arm_no_early_alu_shift_value_dep") + +(define_bypass 3 "cortex_r4_load_3_4" + "cortex_r4_alu") +(define_bypass 3 "cortex_r4_load_3_4" + "cortex_r4_alu_shift" + "arm_no_early_alu_shift_dep") +(define_bypass 3 "cortex_r4_load_3_4" + "cortex_r4_alu_shift_reg" + "arm_no_early_alu_shift_value_dep") + +;; If a producing load is followed by an instruction consuming only +;; as a Late Reg, there are two fewer cycles of latency. Such consumer +;; instructions are moves and stores. + +(define_bypass 1 "cortex_r4_load_1_2" + "cortex_r4_mov,cortex_r4_store_1_2,cortex_r4_store_3_4") +(define_bypass 2 "cortex_r4_load_3_4" + "cortex_r4_mov,cortex_r4_store_1_2,cortex_r4_store_3_4") + +;; If a producer's result is required as the base or offset of a load, +;; there is an extra cycle latency. + +(define_bypass 3 "cortex_r4_alu,cortex_r4_mov,cortex_r4_alu_shift,\ + cortex_r4_alu_shift_reg" + "cortex_r4_load_1_2,cortex_r4_load_3_4") + +(define_bypass 4 "cortex_r4_mul_3,cortex_r4_mla_3,cortex_r4_smlald" + "cortex_r4_load_1_2,cortex_r4_load_3_4") + +(define_bypass 5 "cortex_r4_mul_4,cortex_r4_mla_4,cortex_r4_mull" + "cortex_r4_load_1_2,cortex_r4_load_3_4") + +;; Store instructions. + +(define_insn_reservation "cortex_r4_store_1_2" 0 + (and (eq_attr "tune_cortexr4" "yes") + (eq_attr "type" "store1,store2")) + "cortex_r4_load_store") + +(define_insn_reservation "cortex_r4_store_3_4" 0 + (and (eq_attr "tune_cortexr4" "yes") + (eq_attr "type" "store3,store4")) + "cortex_r4_load_store_2") + diff --git a/gcc/config/arm/cortex-r4f.md b/gcc/config/arm/cortex-r4f.md new file mode 100644 index 000000000..8982bc068 --- /dev/null +++ b/gcc/config/arm/cortex-r4f.md @@ -0,0 +1,161 @@ +;; ARM Cortex-R4F VFP pipeline description +;; Copyright (C) 2007, 2008 Free Software Foundation, Inc. +;; Written by CodeSourcery. +;; +;; This file is part of GCC. +;; +;; GCC is free software; you can redistribute it and/or modify it +;; under the terms of the GNU General Public License as published by +;; the Free Software Foundation; either version 3, or (at your option) +;; any later version. +;; +;; GCC is distributed in the hope that it will be useful, but +;; WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;; General Public License for more details. +;; +;; You should have received a copy of the GNU General Public License +;; along with GCC; see the file COPYING3. If not see +;; . + +;; With the exception of simple VMOV , instructions and +;; the accululate operand of a multiply-accumulate instruction, all +;; registers are early registers. Thus base latencies are 1 more than +;; those listed in the TRM. + +;; We use the A, B abd C units from the integer core, plus two additional +;; units to enforce VFP dual issue constraints. + +;; A B C V1 VMLA +;; fcpy 1 2 +;; farith 1 2 1 +;; fmrc 1 2 +;; fconst 1 2 * * +;; ffarith 1 2 * * +;; fmac 1 2 1 2 +;; fdiv 1 2 * +;; f_loads * * * +;; f_stores * * * + +(define_cpu_unit "cortex_r4_v1" "cortex_r4") + +(define_cpu_unit "cortex_r4_vmla" "cortex_r4") + +(define_reservation "cortex_r4_issue_ab" + "(cortex_r4_issue_a|cortex_r4_issue_b)") +(define_reservation "cortex_r4_single_issue" + "cortex_r4_issue_a+cortex_r4_issue_b") + +(define_insn_reservation "cortex_r4_fcpys" 2 + (and (eq_attr "tune_cortexr4" "yes") + (eq_attr "type" "fcpys")) + "cortex_r4_issue_ab") + +(define_insn_reservation "cortex_r4_ffariths" 2 + (and (eq_attr "tune_cortexr4" "yes") + (eq_attr "type" "ffariths,fconsts,fcmps")) + "cortex_r4_issue_ab+cortex_r4_issue_c+cortex_r4_v1") + +(define_insn_reservation "cortex_r4_fariths" 3 + (and (eq_attr "tune_cortexr4" "yes") + (eq_attr "type" "fadds,fmuls")) + "(cortex_r4_issue_a+cortex_r4_v1)|cortex_r4_issue_b") + +(define_insn_reservation "cortex_r4_fmacs" 6 + (and (eq_attr "tune_cortexr4" "yes") + (eq_attr "type" "fmacs")) + "(cortex_r4_issue_a+cortex_r4_v1)|(cortex_r4_issue_b+cortex_r4_vmla)") + +(define_insn_reservation "cortex_r4_fdivs" 17 + (and (eq_attr "tune_cortexr4" "yes") + (eq_attr "type" "fdivs")) + "cortex_r4_issue_ab+cortex_r4_v1,cortex_r4_issue_a+cortex_r4_v1") + +(define_insn_reservation "cortex_r4_floads" 2 + (and (eq_attr "tune_cortexr4" "yes") + (eq_attr "type" "f_loads")) + "cortex_r4_issue_a+cortex_r4_issue_c+cortex_r4_v1") + +(define_insn_reservation "cortex_r4_fstores" 1 + (and (eq_attr "tune_cortexr4" "yes") + (eq_attr "type" "f_stores")) + "cortex_r4_issue_a+cortex_r4_issue_c+cortex_r4_vmla") + +(define_insn_reservation "cortex_r4_mcr" 2 + (and (eq_attr "tune_cortexr4" "yes") + (eq_attr "type" "r_2_f")) + "cortex_r4_issue_ab") + +(define_insn_reservation "cortex_r4_mrc" 3 + (and (eq_attr "tune_cortexr4" "yes") + (eq_attr "type" "f_2_r")) + "cortex_r4_issue_ab") + +;; Bypasses for normal (not early) regs. +(define_bypass 1 "cortex_r4_ffariths,cortex_r4_fcpys,cortex_r4_mcr" + "cortex_r4_fcpys") +(define_bypass 2 "cortex_r4_fariths" + "cortex_r4_fcpys") +(define_bypass 5 "cortex_r4_fmacs" + "cortex_r4_fcpys") +(define_bypass 16 "cortex_r4_fdivs" + "cortex_r4_fcpys") + +(define_bypass 1 "cortex_r4_ffariths,cortex_r4_fcpys,cortex_r4_mcr" + "cortex_r4_fmacs" + "arm_no_early_mul_dep") +(define_bypass 2 "cortex_r4_fariths" + "cortex_r4_fmacs" + "arm_no_early_mul_dep") +;; mac->mac has an extra forwarding path. +(define_bypass 3 "cortex_r4_fmacs" + "cortex_r4_fmacs" + "arm_no_early_mul_dep") +(define_bypass 16 "cortex_r4_fdivs" + "cortex_r4_fmacs" + "arm_no_early_mul_dep") + +;; Double precision operations. These can not dual issue. + +(define_insn_reservation "cortex_r4_fmacd" 20 + (and (eq_attr "tune_cortexr4" "yes") + (eq_attr "type" "fmacd")) + "cortex_r4_single_issue*13") + +(define_insn_reservation "cortex_r4_farith" 10 + (and (eq_attr "tune_cortexr4" "yes") + (eq_attr "type" "faddd,fmuld")) + "cortex_r4_single_issue*3") + +;; FIXME: The short cycle count suggests these instructions complete +;; out of order. Chances are this is not a pipelined operation. +(define_insn_reservation "cortex_r4_fdivd" 97 + (and (eq_attr "tune_cortexr4" "yes") + (eq_attr "type" "fdivd")) + "cortex_r4_single_issue*3") + +(define_insn_reservation "cortex_r4_ffarithd" 2 + (and (eq_attr "tune_cortexr4" "yes") + (eq_attr "type" "ffarithd,fconstd")) + "cortex_r4_single_issue") + +(define_insn_reservation "cortex_r4_fcmpd" 2 + (and (eq_attr "tune_cortexr4" "yes") + (eq_attr "type" "fcmpd")) + "cortex_r4_single_issue*2") + +(define_insn_reservation "cortex_r4_f_cvt" 8 + (and (eq_attr "tune_cortexr4" "yes") + (eq_attr "type" "f_cvt")) + "cortex_r4_single_issue*3") + +(define_insn_reservation "cortex_r4_f_memd" 8 + (and (eq_attr "tune_cortexr4" "yes") + (eq_attr "type" "f_loadd,f_stored")) + "cortex_r4_single_issue") + +(define_insn_reservation "cortex_r4_f_flag" 1 + (and (eq_attr "tune_cortexr4" "yes") + (eq_attr "type" "f_stores")) + "cortex_r4_single_issue") + diff --git a/gcc/config/arm/crti.asm b/gcc/config/arm/crti.asm new file mode 100644 index 000000000..9454273dd --- /dev/null +++ b/gcc/config/arm/crti.asm @@ -0,0 +1,86 @@ +# Copyright (C) 2001, 2008, 2009, 2010 Free Software Foundation, Inc. +# Written By Nick Clifton +# +# This file is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation; either version 3, or (at your option) any +# later version. +# +# This file is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# Under Section 7 of GPL version 3, you are granted additional +# permissions described in the GCC Runtime Library Exception, version +# 3.1, as published by the Free Software Foundation. +# +# You should have received a copy of the GNU General Public License and +# a copy of the GCC Runtime Library Exception along with this program; +# see the files COPYING3 and COPYING.RUNTIME respectively. If not, see +# . + +/* An executable stack is *not* required for these functions. */ +#if defined(__ELF__) && defined(__linux__) +.section .note.GNU-stack,"",%progbits +.previous +#endif + +# This file just make a stack frame for the contents of the .fini and +# .init sections. Users may put any desired instructions in those +# sections. + +#ifdef __ELF__ +#define TYPE(x) .type x,function +#else +#define TYPE(x) +#endif +#ifdef __ARM_EABI__ +/* Some attributes that are common to all routines in this file. */ + /* Tag_ABI_align_needed: This code does not require 8-byte + alignment from the caller. */ + /* .eabi_attribute 24, 0 -- default setting. */ + /* Tag_ABI_align_preserved: This code preserves 8-byte + alignment in any callee. */ + .eabi_attribute 25, 1 +#endif /* __ARM_EABI__ */ + + # Note - this macro is complemented by the FUNC_END macro + # in crtn.asm. If you change this macro you must also change + # that macro match. +.macro FUNC_START +#ifdef __thumb__ + .thumb + + push {r3, r4, r5, r6, r7, lr} +#else + .arm + # Create a stack frame and save any call-preserved registers + mov ip, sp + stmdb sp!, {r3, r4, r5, r6, r7, r8, r9, sl, fp, ip, lr, pc} + sub fp, ip, #4 +#endif +.endm + + .section ".init" + .align 2 + .global _init +#ifdef __thumb__ + .thumb_func +#endif + TYPE(_init) +_init: + FUNC_START + + + .section ".fini" + .align 2 + .global _fini +#ifdef __thumb__ + .thumb_func +#endif + TYPE(_fini) +_fini: + FUNC_START + +# end of crti.asm diff --git a/gcc/config/arm/crtn.asm b/gcc/config/arm/crtn.asm new file mode 100644 index 000000000..c7f90814d --- /dev/null +++ b/gcc/config/arm/crtn.asm @@ -0,0 +1,82 @@ +# Copyright (C) 2001, 2004, 2008, 2009, 2010 Free Software Foundation, Inc. +# Written By Nick Clifton +# +# This file is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation; either version 3, or (at your option) any +# later version. +# +# This file is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# Under Section 7 of GPL version 3, you are granted additional +# permissions described in the GCC Runtime Library Exception, version +# 3.1, as published by the Free Software Foundation. +# +# You should have received a copy of the GNU General Public License and +# a copy of the GCC Runtime Library Exception along with this program; +# see the files COPYING3 and COPYING.RUNTIME respectively. If not, see +# . + +/* An executable stack is *not* required for these functions. */ +#if defined(__ELF__) && defined(__linux__) +.section .note.GNU-stack,"",%progbits +.previous +#endif + +#ifdef __ARM_EABI__ +/* Some attributes that are common to all routines in this file. */ + /* Tag_ABI_align_needed: This code does not require 8-byte + alignment from the caller. */ + /* .eabi_attribute 24, 0 -- default setting. */ + /* Tag_ABI_align_preserved: This code preserves 8-byte + alignment in any callee. */ + .eabi_attribute 25, 1 +#endif /* __ARM_EABI__ */ + +# This file just makes sure that the .fini and .init sections do in +# fact return. Users may put any desired instructions in those sections. +# This file is the last thing linked into any executable. + + # Note - this macro is complemented by the FUNC_START macro + # in crti.asm. If you change this macro you must also change + # that macro match. + # + # Note - we do not try any fancy optimizations of the return + # sequences here, it is just not worth it. Instead keep things + # simple. Restore all the save resgisters, including the link + # register and then perform the correct function return instruction. + # We also save/restore r3 to ensure stack alignment. +.macro FUNC_END +#ifdef __thumb__ + .thumb + + pop {r3, r4, r5, r6, r7} + pop {r3} + mov lr, r3 +#else + .arm + + sub sp, fp, #40 + ldmfd sp, {r4, r5, r6, r7, r8, r9, sl, fp, sp, lr} +#endif + +#if defined __THUMB_INTERWORK__ || defined __thumb__ + bx lr +#else + mov pc, lr +#endif +.endm + + + .section ".init" + ;; + FUNC_END + + .section ".fini" + ;; + FUNC_END + +# end of crtn.asm diff --git a/gcc/config/arm/ecos-elf.h b/gcc/config/arm/ecos-elf.h new file mode 100644 index 000000000..9e9fa7046 --- /dev/null +++ b/gcc/config/arm/ecos-elf.h @@ -0,0 +1,27 @@ +/* Definitions for ecos based ARM systems using ELF + Copyright (C) 1998, 2001, 2007 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3, or (at your + option) any later version. + + GCC is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public + License for more details. + + You should have received a copy of the GNU General Public License + along with GCC; see the file COPYING3. If not see + . */ + +/* Run-time Target Specification. */ +#undef TARGET_VERSION +#define TARGET_VERSION fputs (" (ARM/ELF Ecos)", stderr); + +#define HAS_INIT_SECTION + +#undef INVOKE_main + diff --git a/gcc/config/arm/elf.h b/gcc/config/arm/elf.h new file mode 100644 index 000000000..88400884e --- /dev/null +++ b/gcc/config/arm/elf.h @@ -0,0 +1,166 @@ +/* Definitions of target machine for GNU compiler. + For ARM with ELF obj format. + Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2004, 2005, 2007, + 2008 Free Software Foundation, Inc. + Contributed by Philip Blundell and + Catherine Moore + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3, or (at your + option) any later version. + + GCC is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public + License for more details. + + You should have received a copy of the GNU General Public License + along with GCC; see the file COPYING3. If not see + . */ + +#ifndef OBJECT_FORMAT_ELF + #error elf.h included before elfos.h +#endif + +#ifndef LOCAL_LABEL_PREFIX +#define LOCAL_LABEL_PREFIX "." +#endif + +#ifndef SUBTARGET_CPP_SPEC +#define SUBTARGET_CPP_SPEC "-D__ELF__" +#endif + +#ifndef SUBTARGET_EXTRA_SPECS +#define SUBTARGET_EXTRA_SPECS \ + { "subtarget_extra_asm_spec", SUBTARGET_EXTRA_ASM_SPEC }, \ + { "subtarget_asm_float_spec", SUBTARGET_ASM_FLOAT_SPEC }, \ + SUBSUBTARGET_EXTRA_SPECS +#endif + +#ifndef SUBTARGET_EXTRA_ASM_SPEC +#define SUBTARGET_EXTRA_ASM_SPEC "" +#endif + +#ifndef SUBTARGET_ASM_FLOAT_SPEC +#define SUBTARGET_ASM_FLOAT_SPEC "\ +%{mapcs-float:-mfloat}" +#endif + +#undef SUBSUBTARGET_EXTRA_SPECS +#define SUBSUBTARGET_EXTRA_SPECS + +#ifndef ASM_SPEC +#define ASM_SPEC "\ +%{mbig-endian:-EB} \ +%{mlittle-endian:-EL} \ +%{mcpu=*:-mcpu=%*} \ +%{march=*:-march=%*} \ +%{mapcs-*:-mapcs-%*} \ +%(subtarget_asm_float_spec) \ +%{mthumb-interwork:-mthumb-interwork} \ +%{msoft-float:-mfloat-abi=soft} %{mhard-float:-mfloat-abi=hard} \ +%{mfloat-abi=*} %{mfpu=*} \ +%(subtarget_extra_asm_spec)" +#endif + +/* The ARM uses @ are a comment character so we need to redefine + TYPE_OPERAND_FMT. */ +#undef TYPE_OPERAND_FMT +#define TYPE_OPERAND_FMT "%%%s" + +/* We might need a ARM specific header to function declarations. */ +#undef ASM_DECLARE_FUNCTION_NAME +#define ASM_DECLARE_FUNCTION_NAME(FILE, NAME, DECL) \ + do \ + { \ + ARM_DECLARE_FUNCTION_NAME (FILE, NAME, DECL); \ + ASM_OUTPUT_TYPE_DIRECTIVE (FILE, NAME, "function"); \ + ASM_DECLARE_RESULT (FILE, DECL_RESULT (DECL)); \ + ASM_OUTPUT_LABEL(FILE, NAME); \ + ARM_OUTPUT_FN_UNWIND (FILE, TRUE); \ + } \ + while (0) + +/* We might need an ARM specific trailer for function declarations. */ +#undef ASM_DECLARE_FUNCTION_SIZE +#define ASM_DECLARE_FUNCTION_SIZE(FILE, FNAME, DECL) \ + do \ + { \ + ARM_OUTPUT_FN_UNWIND (FILE, FALSE); \ + if (!flag_inhibit_size_directive) \ + ASM_OUTPUT_MEASURED_SIZE (FILE, FNAME); \ + } \ + while (0) + +/* Define this macro if jump tables (for `tablejump' insns) should be + output in the text section, along with the assembler instructions. + Otherwise, the readonly data section is used. */ +/* We put ARM and Thumb-2 jump tables in the text section, because it makes + the code more efficient, but for Thumb-1 it's better to put them out of + band unless we are generating compressed tables. */ +#define JUMP_TABLES_IN_TEXT_SECTION \ + (TARGET_32BIT || (TARGET_THUMB && (optimize_size || flag_pic))) + +#ifndef LINK_SPEC +#define LINK_SPEC "%{mbig-endian:-EB} %{mlittle-endian:-EL} -X" +#endif + +/* Run-time Target Specification. */ +#ifndef TARGET_VERSION +#define TARGET_VERSION fputs (" (ARM/elf)", stderr) +#endif + +#ifndef TARGET_DEFAULT +#define TARGET_DEFAULT (MASK_APCS_FRAME) +#endif + +#ifndef MULTILIB_DEFAULTS +#define MULTILIB_DEFAULTS \ + { "marm", "mlittle-endian", "msoft-float", "mno-thumb-interwork", "fno-leading-underscore" } +#endif + +#define TARGET_ASM_FILE_START_APP_OFF true +#define TARGET_ASM_FILE_START_FILE_DIRECTIVE true + + +/* Output an element in the static constructor array. */ +#undef TARGET_ASM_CONSTRUCTOR +#define TARGET_ASM_CONSTRUCTOR arm_elf_asm_constructor + +#undef TARGET_ASM_DESTRUCTOR +#define TARGET_ASM_DESTRUCTOR arm_elf_asm_destructor + +/* For PIC code we need to explicitly specify (PLT) and (GOT) relocs. */ +#define NEED_PLT_RELOC flag_pic +#define NEED_GOT_RELOC flag_pic + +/* The ELF assembler handles GOT addressing differently to NetBSD. */ +#define GOT_PCREL 0 + +/* Align output to a power of two. Note ".align 0" is redundant, + and also GAS will treat it as ".align 2" which we do not want. */ +#define ASM_OUTPUT_ALIGN(STREAM, POWER) \ + do \ + { \ + if ((POWER) > 0) \ + fprintf (STREAM, "\t.align\t%d\n", POWER); \ + } \ + while (0) + +/* Horrible hack: We want to prevent some libgcc routines being included + for some multilibs. */ +#ifndef __ARM_ARCH_6M__ +#undef L_fixdfsi +#undef L_fixunsdfsi +#undef L_truncdfsf2 +#undef L_fixsfsi +#undef L_fixunssfsi +#undef L_floatdidf +#undef L_floatdisf +#undef L_floatundidf +#undef L_floatundisf +#endif + diff --git a/gcc/config/arm/fa526.md b/gcc/config/arm/fa526.md new file mode 100644 index 000000000..42eb9b272 --- /dev/null +++ b/gcc/config/arm/fa526.md @@ -0,0 +1,161 @@ +;; Faraday FA526 Pipeline Description +;; Copyright (C) 2010 Free Software Foundation, Inc. +;; Written by I-Jui Sung, based on ARM926EJ-S Pipeline Description. + +;; This file is part of GCC. +;; +;; GCC is free software; you can redistribute it and/or modify it under +;; the terms of the GNU General Public License as published by the Free +;; Software Foundation; either version 3, or (at your option) any later +;; version. +;; +;; GCC is distributed in the hope that it will be useful, but WITHOUT ANY +;; WARRANTY; without even the implied warranty of MERCHANTABILITY or +;; FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +;; for more details. +;; +;; You should have received a copy of the GNU General Public License +;; along with GCC; see the file COPYING3. If not see +;; . */ + +;; These descriptions are based on the information contained in the +;; FA526 Core Design Note, Copyright (c) 2010 Faraday Technology Corp. +;; +;; Modeled pipeline characteristics: +;; LD -> any use: latency = 3 (2 cycle penalty). +;; ALU -> any use: latency = 2 (1 cycle penalty). + +;; This automaton provides a pipeline description for the Faraday +;; FA526 core. +;; +;; The model given here assumes that the condition for all conditional +;; instructions is "true", i.e., that all of the instructions are +;; actually executed. + +(define_automaton "fa526") + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Pipelines +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; There is a single pipeline +;; +;; The ALU pipeline has fetch, decode, execute, memory, and +;; write stages. We only need to model the execute, memory and write +;; stages. + +;; S E M W + +(define_cpu_unit "fa526_core" "fa526") + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; ALU Instructions +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; ALU instructions require two cycles to execute, and use the ALU +;; pipeline in each of the three stages. The results are available +;; after the execute stage stage has finished. +;; +;; If the destination register is the PC, the pipelines are stalled +;; for several cycles. That case is not modeled here. + +;; ALU operations +(define_insn_reservation "526_alu_op" 1 + (and (eq_attr "tune" "fa526") + (eq_attr "type" "alu")) + "fa526_core") + +(define_insn_reservation "526_alu_shift_op" 2 + (and (eq_attr "tune" "fa526") + (eq_attr "type" "alu_shift,alu_shift_reg")) + "fa526_core") + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Multiplication Instructions +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(define_insn_reservation "526_mult1" 2 + (and (eq_attr "tune" "fa526") + (eq_attr "insn" "smlalxy,smulxy,smlaxy,smlalxy")) + "fa526_core") + +(define_insn_reservation "526_mult2" 5 + (and (eq_attr "tune" "fa526") + (eq_attr "insn" "mul,mla,muls,mlas,umull,umlal,smull,smlal,umulls,\ + umlals,smulls,smlals,smlawx")) + "fa526_core*4") + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Load/Store Instructions +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; The models for load/store instructions do not accurately describe +;; the difference between operations with a base register writeback +;; (such as "ldm!"). These models assume that all memory references +;; hit in dcache. + +(define_insn_reservation "526_load1_op" 3 + (and (eq_attr "tune" "fa526") + (eq_attr "type" "load1,load_byte")) + "fa526_core") + +(define_insn_reservation "526_load2_op" 4 + (and (eq_attr "tune" "fa526") + (eq_attr "type" "load2")) + "fa526_core*2") + +(define_insn_reservation "526_load3_op" 5 + (and (eq_attr "tune" "fa526") + (eq_attr "type" "load3")) + "fa526_core*3") + +(define_insn_reservation "526_load4_op" 6 + (and (eq_attr "tune" "fa526") + (eq_attr "type" "load4")) + "fa526_core*4") + +(define_insn_reservation "526_store1_op" 0 + (and (eq_attr "tune" "fa526") + (eq_attr "type" "store1")) + "fa526_core") + +(define_insn_reservation "526_store2_op" 1 + (and (eq_attr "tune" "fa526") + (eq_attr "type" "store2")) + "fa526_core*2") + +(define_insn_reservation "526_store3_op" 2 + (and (eq_attr "tune" "fa526") + (eq_attr "type" "store3")) + "fa526_core*3") + +(define_insn_reservation "526_store4_op" 3 + (and (eq_attr "tune" "fa526") + (eq_attr "type" "store4")) + "fa526_core*4") + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Branch and Call Instructions +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Branch instructions are difficult to model accurately. The FA526 +;; core can predict most branches. If the branch is predicted +;; correctly, and predicted early enough, the branch can be completely +;; eliminated from the instruction stream. Some branches can +;; therefore appear to require zero cycle to execute. We assume that +;; all branches are predicted correctly, and that the latency is +;; therefore the minimum value. + +(define_insn_reservation "526_branch_op" 0 + (and (eq_attr "tune" "fa526") + (eq_attr "type" "branch")) + "fa526_core") + +;; The latency for a call is actually the latency when the result is available. +;; i.e. R0 ready for int return value. For most cases, the return value is set +;; by a mov instruction, which has 1 cycle latency. +(define_insn_reservation "526_call_op" 1 + (and (eq_attr "tune" "fa526") + (eq_attr "type" "call")) + "fa526_core") + diff --git a/gcc/config/arm/fa606te.md b/gcc/config/arm/fa606te.md new file mode 100644 index 000000000..06e63d696 --- /dev/null +++ b/gcc/config/arm/fa606te.md @@ -0,0 +1,171 @@ +;; Faraday FA606TE Pipeline Description +;; Copyright (C) 2010 Free Software Foundation, Inc. +;; Written by Mingfeng Wu, based on ARM926EJ-S Pipeline Description. +;; +;; This file is part of GCC. +;; +;; GCC is free software; you can redistribute it and/or modify it under +;; the terms of the GNU General Public License as published by the Free +;; Software Foundation; either version 3, or (at your option) any later +;; version. +;; +;; GCC is distributed in the hope that it will be useful, but WITHOUT ANY +;; WARRANTY; without even the implied warranty of MERCHANTABILITY or +;; FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +;; for more details. +;; +;; You should have received a copy of the GNU General Public License +;; along with GCC; see the file COPYING3. If not see +;; . */ + +;; These descriptions are based on the information contained in the +;; FA606TE Core Design Note, Copyright (c) 2010 Faraday Technology Corp. + +;; Modeled pipeline characteristics: +;; LD -> any use: latency = 2 (1 cycle penalty). +;; ALU -> any use: latency = 1 (0 cycle penalty). + +;; This automaton provides a pipeline description for the Faraday +;; FA606TE core. +;; +;; The model given here assumes that the condition for all conditional +;; instructions is "true", i.e., that all of the instructions are +;; actually executed. + +(define_automaton "fa606te") + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Pipelines +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; There is a single pipeline +;; +;; The ALU pipeline has fetch, decode, execute, memory, and +;; write stages. We only need to model the execute, memory and write +;; stages. + +;; E M W + +(define_cpu_unit "fa606te_core" "fa606te") + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; ALU Instructions +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; ALU instructions require two cycles to execute, and use the ALU +;; pipeline in each of the three stages. The results are available +;; after the execute stage stage has finished. +;; +;; If the destination register is the PC, the pipelines are stalled +;; for several cycles. That case is not modeled here. + +;; ALU operations +(define_insn_reservation "606te_alu_op" 1 + (and (eq_attr "tune" "fa606te") + (eq_attr "type" "alu,alu_shift,alu_shift_reg")) + "fa606te_core") + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Multiplication Instructions +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(define_insn_reservation "606te_mult1" 2 + (and (eq_attr "tune" "fa606te") + (eq_attr "insn" "smlalxy")) + "fa606te_core") + +(define_insn_reservation "606te_mult2" 3 + (and (eq_attr "tune" "fa606te") + (eq_attr "insn" "smlaxy,smulxy,smulwy,smlawy")) + "fa606te_core*2") + +(define_insn_reservation "606te_mult3" 4 + (and (eq_attr "tune" "fa606te") + (eq_attr "insn" "mul,mla,muls,mlas")) + "fa606te_core*3") + +(define_insn_reservation "606te_mult4" 5 + (and (eq_attr "tune" "fa606te") + (eq_attr "insn" "umull,umlal,smull,smlal,umulls,umlals,smulls,smlals")) + "fa606te_core*4") + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Load/Store Instructions +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; The models for load/store instructions do not accurately describe +;; the difference between operations with a base register writeback +;; (such as "ldm!"). These models assume that all memory references +;; hit in dcache. + +(define_insn_reservation "606te_load1_op" 2 + (and (eq_attr "tune" "fa606te") + (eq_attr "type" "load1,load_byte")) + "fa606te_core") + +(define_insn_reservation "606te_load2_op" 3 + (and (eq_attr "tune" "fa606te") + (eq_attr "type" "load2")) + "fa606te_core*2") + +(define_insn_reservation "606te_load3_op" 4 + (and (eq_attr "tune" "fa606te") + (eq_attr "type" "load3")) + "fa606te_core*3") + +(define_insn_reservation "606te_load4_op" 5 + (and (eq_attr "tune" "fa606te") + (eq_attr "type" "load4")) + "fa606te_core*4") + +(define_insn_reservation "606te_store1_op" 0 + (and (eq_attr "tune" "fa606te") + (eq_attr "type" "store1")) + "fa606te_core") + +(define_insn_reservation "606te_store2_op" 1 + (and (eq_attr "tune" "fa606te") + (eq_attr "type" "store2")) + "fa606te_core*2") + +(define_insn_reservation "606te_store3_op" 2 + (and (eq_attr "tune" "fa606te") + (eq_attr "type" "store3")) + "fa606te_core*3") + +(define_insn_reservation "606te_store4_op" 3 + (and (eq_attr "tune" "fa606te") + (eq_attr "type" "store4")) + "fa606te_core*4") + + +;;(define_insn_reservation "606te_ldm_op" 9 +;; (and (eq_attr "tune" "fa606te") +;; (eq_attr "type" "load2,load3,load4,store2,store3,store4")) +;; "fa606te_core*7") + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Branch and Call Instructions +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Branch instructions are difficult to model accurately. The FA606TE +;; core can predict most branches. If the branch is predicted +;; correctly, and predicted early enough, the branch can be completely +;; eliminated from the instruction stream. Some branches can +;; therefore appear to require zero cycles to execute. We assume that +;; all branches are predicted correctly, and that the latency is +;; therefore the minimum value. + +(define_insn_reservation "606te_branch_op" 0 + (and (eq_attr "tune" "fa606te") + (eq_attr "type" "branch")) + "fa606te_core") + +;; The latency for a call is actually the latency when the result is available. +;; i.e. R0 ready for int return value. For most cases, the return value is set +;; by a mov instruction, which has 1 cycle latency. +(define_insn_reservation "606te_call_op" 1 + (and (eq_attr "tune" "fa606te") + (eq_attr "type" "call")) + "fa606te_core") + diff --git a/gcc/config/arm/fa626te.md b/gcc/config/arm/fa626te.md new file mode 100644 index 000000000..7fe1c8724 --- /dev/null +++ b/gcc/config/arm/fa626te.md @@ -0,0 +1,165 @@ +;; Faraday FA626TE Pipeline Description +;; Copyright (C) 2010 Free Software Foundation, Inc. +;; Written by I-Jui Sung, based on ARM926EJ-S Pipeline Description. +;; +;; This file is part of GCC. +;; +;; GCC is free software; you can redistribute it and/or modify it under +;; the terms of the GNU General Public License as published by the Free +;; Software Foundation; either version 3, or (at your option) any later +;; version. +;; +;; GCC is distributed in the hope that it will be useful, but WITHOUT ANY +;; WARRANTY; without even the implied warranty of MERCHANTABILITY or +;; FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +;; for more details. +;; +;; You should have received a copy of the GNU General Public License +;; along with GCC; see the file COPYING3. If not see +;; . */ + +;; These descriptions are based on the information contained in the +;; FA626TE Core Design Note, Copyright (c) 2010 Faraday Technology Corp. + +;; Modeled pipeline characteristics: +;; ALU -> simple address LDR/STR: latency = 2 (available after 2 cycles). +;; ALU -> shifted address LDR/STR: latency = 3. +;; ( extra 1 cycle unavoidable stall). +;; ALU -> other use: latency = 2 (available after 2 cycles). +;; LD -> simple address LDR/STR: latency = 3 (available after 3 cycles). +;; LD -> shifted address LDR/STR: latency = 4 +;; ( extra 1 cycle unavoidable stall). +;; LD -> any other use: latency = 3 (available after 3 cycles). + +;; This automaton provides a pipeline description for the Faraday +;; FA626TE core. +;; +;; The model given here assumes that the condition for all conditional +;; instructions is "true", i.e., that all of the instructions are +;; actually executed. + +(define_automaton "fa626te") + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Pipelines +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; There is a single pipeline +;; +;; The ALU pipeline has fetch, decode, execute, memory, and +;; write stages. We only need to model the execute, memory and write +;; stages. + +;; S E M W + +(define_cpu_unit "fa626te_core" "fa626te") + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; ALU Instructions +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; ALU instructions require two cycles to execute, and use the ALU +;; pipeline in each of the three stages. The results are available +;; after the execute stage stage has finished. +;; +;; If the destination register is the PC, the pipelines are stalled +;; for several cycles. That case is not modeled here. + +;; ALU operations +(define_insn_reservation "626te_alu_op" 1 + (and (eq_attr "tune" "fa626,fa626te") + (eq_attr "type" "alu")) + "fa626te_core") + +(define_insn_reservation "626te_alu_shift_op" 2 + (and (eq_attr "tune" "fa626,fa626te") + (eq_attr "type" "alu_shift,alu_shift_reg")) + "fa626te_core") + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Multiplication Instructions +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(define_insn_reservation "626te_mult1" 2 + (and (eq_attr "tune" "fa626,fa626te") + (eq_attr "insn" "smulwy,smlawy,smulxy,smlaxy")) + "fa626te_core") + +(define_insn_reservation "626te_mult2" 2 + (and (eq_attr "tune" "fa626,fa626te") + (eq_attr "insn" "mul,mla")) + "fa626te_core") + +(define_insn_reservation "626te_mult3" 3 + (and (eq_attr "tune" "fa626,fa626te") + (eq_attr "insn" "muls,mlas,smull,smlal,umull,umlal,smlalxy,smlawx")) + "fa626te_core*2") + +(define_insn_reservation "626te_mult4" 4 + (and (eq_attr "tune" "fa626,fa626te") + (eq_attr "insn" "smulls,smlals,umulls,umlals")) + "fa626te_core*3") + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Load/Store Instructions +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; The models for load/store instructions do not accurately describe +;; the difference between operations with a base register writeback +;; (such as "ldm!"). These models assume that all memory references +;; hit in dcache. + +(define_insn_reservation "626te_load1_op" 3 + (and (eq_attr "tune" "fa626,fa626te") + (eq_attr "type" "load1,load_byte")) + "fa626te_core") + +(define_insn_reservation "626te_load2_op" 4 + (and (eq_attr "tune" "fa626,fa626te") + (eq_attr "type" "load2,load3")) + "fa626te_core*2") + +(define_insn_reservation "626te_load3_op" 5 + (and (eq_attr "tune" "fa626,fa626te") + (eq_attr "type" "load4")) + "fa626te_core*3") + +(define_insn_reservation "626te_store1_op" 0 + (and (eq_attr "tune" "fa626,fa626te") + (eq_attr "type" "store1")) + "fa626te_core") + +(define_insn_reservation "626te_store2_op" 1 + (and (eq_attr "tune" "fa626,fa626te") + (eq_attr "type" "store2,store3")) + "fa626te_core*2") + +(define_insn_reservation "626te_store3_op" 2 + (and (eq_attr "tune" "fa626,fa626te") + (eq_attr "type" "store4")) + "fa626te_core*3") + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Branch and Call Instructions +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Branch instructions are difficult to model accurately. The FA626TE +;; core can predict most branches. If the branch is predicted +;; correctly, and predicted early enough, the branch can be completely +;; eliminated from the instruction stream. Some branches can +;; therefore appear to require zero cycle to execute. We assume that +;; all branches are predicted correctly, and that the latency is +;; therefore the minimum value. + +(define_insn_reservation "626te_branch_op" 0 + (and (eq_attr "tune" "fa626,fa626te") + (eq_attr "type" "branch")) + "fa626te_core") + +;; The latency for a call is actually the latency when the result is available. +;; i.e. R0 ready for int return value. +(define_insn_reservation "626te_call_op" 1 + (and (eq_attr "tune" "fa626,fa626te") + (eq_attr "type" "call")) + "fa626te_core") + diff --git a/gcc/config/arm/fa726te.md b/gcc/config/arm/fa726te.md new file mode 100644 index 000000000..3c33d5971 --- /dev/null +++ b/gcc/config/arm/fa726te.md @@ -0,0 +1,218 @@ +;; Faraday FA726TE Pipeline Description +;; Copyright (C) 2010 Free Software Foundation, Inc. +;; Written by I-Jui Sung, based on ARM926EJ-S Pipeline Description. +;; +;; This file is part of GCC. +;; +;; GCC is free software; you can redistribute it and/or modify it under +;; the terms of the GNU General Public License as published by the Free +;; Software Foundation; either version 3, or (at your option) any later +;; version. +;; +;; GCC is distributed in the hope that it will be useful, but WITHOUT ANY +;; WARRANTY; without even the implied warranty of MERCHANTABILITY or +;; FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +;; for more details. +;; +;; You should have received a copy of the GNU General Public License +;; along with GCC; see the file COPYING3. If not see +;; . */ + +;; These descriptions are based on the information contained in the +;; FA726TE Core Design Note, Copyright (c) 2010 Faraday Technology Corp. + +;; This automaton provides a pipeline description for the Faraday +;; FA726TE core. +;; +;; The model given here assumes that the condition for all conditional +;; instructions is "true", i.e., that all of the instructions are +;; actually executed. + +(define_automaton "fa726te") + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Pipelines +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; The ALU pipeline has fetch, decode, execute, memory, and +;; write stages. We only need to model the execute, memory and write +;; stages. + +;; E1 E2 E3 E4 E5 WB +;;______________________________________________________ +;; +;; <-------------- LD/ST -----------> +;; shifter + LU <-- AU --> +;; <-- AU --> shifter + LU CPSR (Pipe 0) +;;______________________________________________________ +;; +;; <---------- MUL ---------> +;; shifter + LU <-- AU --> +;; <-- AU --> shifter + LU CPSR (Pipe 1) + + +(define_cpu_unit "fa726te_alu0_pipe,fa726te_alu1_pipe" "fa726te") +(define_cpu_unit "fa726te_mac_pipe" "fa726te") +(define_cpu_unit "fa726te_lsu_pipe_e,fa726te_lsu_pipe_w" "fa726te") + +;; Pretend we have 2 LSUs (the second is ONLY for LDR), which can possibly +;; improve code quality. +(define_query_cpu_unit "fa726te_lsu1_pipe_e,fa726te_lsu1_pipe_w" "fa726te") +(define_cpu_unit "fa726te_is0,fa726te_is1" "fa726te") + +(define_reservation "fa726te_issue" "(fa726te_is0|fa726te_is1)") +;; Reservation to restrict issue to 1. +(define_reservation "fa726te_blockage" "(fa726te_is0+fa726te_is1)") + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; ALU Instructions +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; ALU instructions require three cycles to execute, and use the ALU +;; pipeline in each of the three stages. The results are available +;; after the execute stage stage has finished. +;; +;; If the destination register is the PC, the pipelines are stalled +;; for several cycles. That case is not modeled here. + +;; Move instructions. +(define_insn_reservation "726te_shift_op" 1 + (and (eq_attr "tune" "fa726te") + (eq_attr "insn" "mov,mvn")) + "fa726te_issue+(fa726te_alu0_pipe|fa726te_alu1_pipe)") + +;; ALU operations with no shifted operand will finished in 1 cycle +;; Other ALU instructions 2 cycles. +(define_insn_reservation "726te_alu_op" 1 + (and (eq_attr "tune" "fa726te") + (and (eq_attr "type" "alu") + (not (eq_attr "insn" "mov,mvn")))) + "fa726te_issue+(fa726te_alu0_pipe|fa726te_alu1_pipe)") + +;; ALU operations with a shift-by-register operand. +;; These really stall in the decoder, in order to read the shift value +;; in the first cycle. If the instruction uses both shifter and AU, +;; it takes 3 cycles. +(define_insn_reservation "726te_alu_shift_op" 3 + (and (eq_attr "tune" "fa726te") + (and (eq_attr "type" "alu_shift") + (not (eq_attr "insn" "mov,mvn")))) + "fa726te_issue+(fa726te_alu0_pipe|fa726te_alu1_pipe)") + +(define_insn_reservation "726te_alu_shift_reg_op" 3 + (and (eq_attr "tune" "fa726te") + (and (eq_attr "type" "alu_shift_reg") + (not (eq_attr "insn" "mov,mvn")))) + "fa726te_issue+(fa726te_alu0_pipe|fa726te_alu1_pipe)") +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Multiplication Instructions +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Multiplication instructions loop in the execute stage until the +;; instruction has been passed through the multiplier array enough +;; times. Multiply operations occur in both the execute and memory +;; stages of the pipeline + +(define_insn_reservation "726te_mult_op" 3 + (and (eq_attr "tune" "fa726te") + (eq_attr "insn" "smlalxy,mul,mla,muls,mlas,umull,umlal,smull,smlal,\ + umulls,umlals,smulls,smlals,smlawx,smulxy,smlaxy")) + "fa726te_issue+fa726te_mac_pipe") + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Load/Store Instructions +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; The models for load/store instructions do not accurately describe +;; the difference between operations with a base register writeback +;; (such as "ldm!"). These models assume that all memory references +;; hit in dcache. + +;; Loads with a shifted offset take 3 cycles, and are (a) probably the +;; most common and (b) the pessimistic assumption will lead to fewer stalls. + +;; Scalar loads are pipelined in FA726TE LSU pipe. +;; Here we model the resource conflict between Load@E3-stage & Store@W-stage. +;; The 2nd LSU (lsu1) is to model the fact that if 2 loads are scheduled in the +;; same "bundle", and the 2nd load will introudce another ISSUE stall but is +;; still ok to execute (and may be benefical sometimes). + +(define_insn_reservation "726te_load1_op" 3 + (and (eq_attr "tune" "fa726te") + (eq_attr "type" "load1,load_byte")) + "(fa726te_issue+fa726te_lsu_pipe_e+fa726te_lsu_pipe_w)\ + | (fa726te_issue+fa726te_lsu1_pipe_e+fa726te_lsu1_pipe_w,fa726te_blockage)") + +(define_insn_reservation "726te_store1_op" 1 + (and (eq_attr "tune" "fa726te") + (eq_attr "type" "store1")) + "fa726te_blockage*2") + +;; Load/Store Multiple blocks all pipelines in EX stages until WB. +;; No other instructions can be issued together. Since they essentially +;; prevent all scheduling opportunities, we model them together here. + +;; The LDM is breaking into multiple load instructions, later instruction in +;; the pipe 1 is stalled. +(define_insn_reservation "726te_ldm2_op" 4 + (and (eq_attr "tune" "fa726te") + (eq_attr "type" "load2,load3")) + "fa726te_blockage*4") + +(define_insn_reservation "726te_ldm3_op" 5 + (and (eq_attr "tune" "fa726te") + (eq_attr "type" "load4")) + "fa726te_blockage*5") + +(define_insn_reservation "726te_stm2_op" 2 + (and (eq_attr "tune" "fa726te") + (eq_attr "type" "store2,store3")) + "fa726te_blockage*3") + +(define_insn_reservation "726te_stm3_op" 3 + (and (eq_attr "tune" "fa726te") + (eq_attr "type" "store4")) + "fa726te_blockage*4") + +(define_bypass 1 "726te_load1_op,726te_ldm2_op,726te_ldm3_op" "726te_store1_op,\ + 726te_stm2_op,726te_stm3_op" "arm_no_early_store_addr_dep") +(define_bypass 0 "726te_shift_op,726te_alu_op,726te_alu_shift_op,\ + 726te_alu_shift_reg_op,726te_mult_op" "726te_store1_op" + "arm_no_early_store_addr_dep") +(define_bypass 0 "726te_shift_op,726te_alu_op" "726te_shift_op,726te_alu_op") +(define_bypass 1 "726te_alu_shift_op,726te_alu_shift_reg_op" + "726te_shift_op,726te_alu_op") +(define_bypass 1 "726te_alu_shift_op,726te_alu_shift_reg_op,726te_mult_op" + "726te_alu_shift_op" "arm_no_early_alu_shift_dep") +(define_bypass 1 "726te_alu_shift_op,726te_alu_shift_reg_op,726te_mult_op" + "726te_alu_shift_reg_op" "arm_no_early_alu_shift_value_dep") +(define_bypass 1 "726te_mult_op" "726te_shift_op,726te_alu_op") + +(define_bypass 4 "726te_load1_op" "726te_mult_op") +(define_bypass 5 "726te_ldm2_op" "726te_mult_op") +(define_bypass 6 "726te_ldm3_op" "726te_mult_op") + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Branch and Call Instructions +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Branch instructions are difficult to model accurately. The FA726TE +;; core can predict most branches. If the branch is predicted +;; correctly, and predicted early enough, the branch can be completely +;; eliminated from the instruction stream. Some branches can +;; therefore appear to require zero cycle to execute. We assume that +;; all branches are predicted correctly, and that the latency is +;; therefore the minimum value. + +(define_insn_reservation "726te_branch_op" 0 + (and (eq_attr "tune" "fa726te") + (eq_attr "type" "branch")) + "fa726te_blockage") + +;; The latency for a call is actually the latency when the result is available. +;; i.e. R0 is ready for int return value. +(define_insn_reservation "726te_call_op" 1 + (and (eq_attr "tune" "fa726te") + (eq_attr "type" "call")) + "fa726te_blockage") + diff --git a/gcc/config/arm/fmp626.md b/gcc/config/arm/fmp626.md new file mode 100644 index 000000000..9ba33ddec --- /dev/null +++ b/gcc/config/arm/fmp626.md @@ -0,0 +1,182 @@ +;; Faraday FA626TE Pipeline Description +;; Copyright (C) 2010 Free Software Foundation, Inc. +;; Written by Mingfeng Wu, based on ARM926EJ-S Pipeline Description. +;; +;; This file is part of GCC. +;; +;; GCC is free software; you can redistribute it and/or modify it under +;; the terms of the GNU General Public License as published by the Free +;; Software Foundation; either version 3, or (at your option) any later +;; version. +;; +;; GCC is distributed in the hope that it will be useful, but WITHOUT ANY +;; WARRANTY; without even the implied warranty of MERCHANTABILITY or +;; FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +;; for more details. +;; +;; You should have received a copy of the GNU General Public License +;; along with GCC; see the file COPYING3. If not see +;; . */ + +;; These descriptions are based on the information contained in the +;; FMP626 Core Design Note, Copyright (c) 2010 Faraday Technology Corp. + +;; Pipeline architecture +;; S E M W(Q1) Q2 +;; ___________________________________________ +;; shifter alu +;; mul1 mul2 mul3 +;; ld/st1 ld/st2 ld/st3 ld/st4 ld/st5 + +;; This automaton provides a pipeline description for the Faraday +;; FMP626 core. +;; +;; The model given here assumes that the condition for all conditional +;; instructions is "true", i.e., that all of the instructions are +;; actually executed. + +(define_automaton "fmp626") + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Pipelines +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; There is a single pipeline +;; +;; The ALU pipeline has fetch, decode, execute, memory, and +;; write stages. We only need to model the execute, memory and write +;; stages. + +(define_cpu_unit "fmp626_core" "fmp626") + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; ALU Instructions +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; ALU instructions require two cycles to execute, and use the ALU +;; pipeline in each of the three stages. The results are available +;; after the execute stage stage has finished. +;; +;; If the destination register is the PC, the pipelines are stalled +;; for several cycles. That case is not modeled here. + +;; ALU operations +(define_insn_reservation "mp626_alu_op" 1 + (and (eq_attr "tune" "fmp626") + (eq_attr "type" "alu")) + "fmp626_core") + +(define_insn_reservation "mp626_alu_shift_op" 2 + (and (eq_attr "tune" "fmp626") + (eq_attr "type" "alu_shift,alu_shift_reg")) + "fmp626_core") + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Multiplication Instructions +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(define_insn_reservation "mp626_mult1" 2 + (and (eq_attr "tune" "fmp626") + (eq_attr "insn" "smulwy,smlawy,smulxy,smlaxy")) + "fmp626_core") + +(define_insn_reservation "mp626_mult2" 2 + (and (eq_attr "tune" "fmp626") + (eq_attr "insn" "mul,mla")) + "fmp626_core") + +(define_insn_reservation "mp626_mult3" 3 + (and (eq_attr "tune" "fmp626") + (eq_attr "insn" "muls,mlas,smull,smlal,umull,umlal,smlalxy,smlawx")) + "fmp626_core*2") + +(define_insn_reservation "mp626_mult4" 4 + (and (eq_attr "tune" "fmp626") + (eq_attr "insn" "smulls,smlals,umulls,umlals")) + "fmp626_core*3") + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Load/Store Instructions +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; The models for load/store instructions do not accurately describe +;; the difference between operations with a base register writeback +;; (such as "ldm!"). These models assume that all memory references +;; hit in dcache. + +(define_insn_reservation "mp626_load1_op" 5 + (and (eq_attr "tune" "fmp626") + (eq_attr "type" "load1,load_byte")) + "fmp626_core") + +(define_insn_reservation "mp626_load2_op" 6 + (and (eq_attr "tune" "fmp626") + (eq_attr "type" "load2,load3")) + "fmp626_core*2") + +(define_insn_reservation "mp626_load3_op" 7 + (and (eq_attr "tune" "fmp626") + (eq_attr "type" "load4")) + "fmp626_core*3") + +(define_insn_reservation "mp626_store1_op" 0 + (and (eq_attr "tune" "fmp626") + (eq_attr "type" "store1")) + "fmp626_core") + +(define_insn_reservation "mp626_store2_op" 1 + (and (eq_attr "tune" "fmp626") + (eq_attr "type" "store2,store3")) + "fmp626_core*2") + +(define_insn_reservation "mp626_store3_op" 2 + (and (eq_attr "tune" "fmp626") + (eq_attr "type" "store4")) + "fmp626_core*3") + +(define_bypass 1 "mp626_load1_op,mp626_load2_op,mp626_load3_op" + "mp626_store1_op,mp626_store2_op,mp626_store3_op" + "arm_no_early_store_addr_dep") +(define_bypass 1 "mp626_alu_op,mp626_alu_shift_op,mp626_mult1,mp626_mult2,\ + mp626_mult3,mp626_mult4" "mp626_store1_op" + "arm_no_early_store_addr_dep") +(define_bypass 1 "mp626_alu_shift_op" "mp626_alu_op") +(define_bypass 1 "mp626_alu_shift_op" "mp626_alu_shift_op" + "arm_no_early_alu_shift_dep") +(define_bypass 1 "mp626_mult1,mp626_mult2" "mp626_alu_shift_op" + "arm_no_early_alu_shift_dep") +(define_bypass 2 "mp626_mult3" "mp626_alu_shift_op" + "arm_no_early_alu_shift_dep") +(define_bypass 3 "mp626_mult4" "mp626_alu_shift_op" + "arm_no_early_alu_shift_dep") +(define_bypass 1 "mp626_mult1,mp626_mult2" "mp626_alu_op") +(define_bypass 2 "mp626_mult3" "mp626_alu_op") +(define_bypass 3 "mp626_mult4" "mp626_alu_op") +(define_bypass 4 "mp626_load1_op" "mp626_alu_op") +(define_bypass 5 "mp626_load2_op" "mp626_alu_op") +(define_bypass 6 "mp626_load3_op" "mp626_alu_op") + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Branch and Call Instructions +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Branch instructions are difficult to model accurately. The FMP626 +;; core can predict most branches. If the branch is predicted +;; correctly, and predicted early enough, the branch can be completely +;; eliminated from the instruction stream. Some branches can +;; therefore appear to require zero cycle to execute. We assume that +;; all branches are predicted correctly, and that the latency is +;; therefore the minimum value. + +(define_insn_reservation "mp626_branch_op" 0 + (and (eq_attr "tune" "fmp626") + (eq_attr "type" "branch")) + "fmp626_core") + +;; The latency for a call is actually the latency when the result is available. +;; i.e. R0 ready for int return value. +(define_insn_reservation "mp626_call_op" 1 + (and (eq_attr "tune" "fmp626") + (eq_attr "type" "call")) + "fmp626_core") + diff --git a/gcc/config/arm/fp16.c b/gcc/config/arm/fp16.c new file mode 100644 index 000000000..936caeb78 --- /dev/null +++ b/gcc/config/arm/fp16.c @@ -0,0 +1,145 @@ +/* Half-float conversion routines. + + Copyright (C) 2008, 2009 Free Software Foundation, Inc. + Contributed by CodeSourcery. + + This file is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by the + Free Software Foundation; either version 3, or (at your option) any + later version. + + This file is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +static inline unsigned short +__gnu_f2h_internal(unsigned int a, int ieee) +{ + unsigned short sign = (a >> 16) & 0x8000; + int aexp = (a >> 23) & 0xff; + unsigned int mantissa = a & 0x007fffff; + unsigned int mask; + unsigned int increment; + + if (aexp == 0xff) + { + if (!ieee) + return sign; + return sign | 0x7e00 | (mantissa >> 13); + } + + if (aexp == 0 && mantissa == 0) + return sign; + + aexp -= 127; + + /* Decimal point between bits 22 and 23. */ + mantissa |= 0x00800000; + if (aexp < -14) + { + mask = 0x007fffff; + if (aexp < -25) + aexp = -26; + else if (aexp != -25) + mask >>= 24 + aexp; + } + else + mask = 0x00001fff; + + /* Round. */ + if (mantissa & mask) + { + increment = (mask + 1) >> 1; + if ((mantissa & mask) == increment) + increment = mantissa & (increment << 1); + mantissa += increment; + if (mantissa >= 0x01000000) + { + mantissa >>= 1; + aexp++; + } + } + + if (ieee) + { + if (aexp > 15) + return sign | 0x7c00; + } + else + { + if (aexp > 16) + return sign | 0x7fff; + } + + if (aexp < -24) + return sign; + + if (aexp < -14) + { + mantissa >>= -14 - aexp; + aexp = -14; + } + + /* We leave the leading 1 in the mantissa, and subtract one + from the exponent bias to compensate. */ + return sign | (((aexp + 14) << 10) + (mantissa >> 13)); +} + +unsigned int +__gnu_h2f_internal(unsigned short a, int ieee) +{ + unsigned int sign = (unsigned int)(a & 0x8000) << 16; + int aexp = (a >> 10) & 0x1f; + unsigned int mantissa = a & 0x3ff; + + if (aexp == 0x1f && ieee) + return sign | 0x7f800000 | (mantissa << 13); + + if (aexp == 0) + { + int shift; + + if (mantissa == 0) + return sign; + + shift = __builtin_clz(mantissa) - 21; + mantissa <<= shift; + aexp = -shift; + } + + return sign | (((aexp + 0x70) << 23) + (mantissa << 13)); +} + +unsigned short +__gnu_f2h_ieee(unsigned int a) +{ + return __gnu_f2h_internal(a, 1); +} + +unsigned int +__gnu_h2f_ieee(unsigned short a) +{ + return __gnu_h2f_internal(a, 1); +} + +unsigned short +__gnu_f2h_alternative(unsigned int x) +{ + return __gnu_f2h_internal(x, 0); +} + +unsigned int +__gnu_h2f_alternative(unsigned short a) +{ + return __gnu_h2f_internal(a, 0); +} diff --git a/gcc/config/arm/fpa.md b/gcc/config/arm/fpa.md new file mode 100644 index 000000000..6e6dd8d43 --- /dev/null +++ b/gcc/config/arm/fpa.md @@ -0,0 +1,889 @@ +;;- Machine description for FPA co-processor for ARM cpus. +;; Copyright 1991, 1993, 1994, 1995, 1996, 1996, 1997, 1998, 1999, 2000, +;; 2001, 2002, 2003, 2004, 2005, 2007 Free Software Foundation, Inc. +;; Contributed by Pieter `Tiggr' Schoenmakers (rcpieter@win.tue.nl) +;; and Martin Simmons (@harleqn.co.uk). +;; More major hacks by Richard Earnshaw (rearnsha@arm.com). + +;; This file is part of GCC. + +;; GCC is free software; you can redistribute it and/or modify it +;; under the terms of the GNU General Public License as published +;; by the Free Software Foundation; either version 3, or (at your +;; option) any later version. + +;; GCC is distributed in the hope that it will be useful, but WITHOUT +;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +;; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public +;; License for more details. + +;; You should have received a copy of the GNU General Public License +;; along with GCC; see the file COPYING3. If not see +;; . + +;; Some FPA mnemonics are ambiguous between conditional infixes and +;; conditional suffixes. All instructions use a conditional infix, +;; even in unified assembly mode. + +;; FPA automaton. +(define_automaton "armfp") + +;; Floating point unit (FPA) +(define_cpu_unit "fpa" "armfp") + +; The fpa10 doesn't really have a memory read unit, but it can start +; to speculatively execute the instruction in the pipeline, provided +; the data is already loaded, so pretend reads have a delay of 2 (and +; that the pipeline is infinite). +(define_cpu_unit "fpa_mem" "arm") + +(define_insn_reservation "fdivx" 71 + (and (eq_attr "fpu" "fpa") + (eq_attr "type" "fdivx")) + "core+fpa*69") + +(define_insn_reservation "fdivd" 59 + (and (eq_attr "fpu" "fpa") + (eq_attr "type" "fdivd")) + "core+fpa*57") + +(define_insn_reservation "fdivs" 31 + (and (eq_attr "fpu" "fpa") + (eq_attr "type" "fdivs")) + "core+fpa*29") + +(define_insn_reservation "fmul" 9 + (and (eq_attr "fpu" "fpa") + (eq_attr "type" "fmul")) + "core+fpa*7") + +(define_insn_reservation "ffmul" 6 + (and (eq_attr "fpu" "fpa") + (eq_attr "type" "ffmul")) + "core+fpa*4") + +(define_insn_reservation "farith" 4 + (and (eq_attr "fpu" "fpa") + (eq_attr "type" "farith")) + "core+fpa*2") + +(define_insn_reservation "ffarith" 2 + (and (eq_attr "fpu" "fpa") + (eq_attr "type" "ffarith")) + "core+fpa*2") + +(define_insn_reservation "r_2_f" 5 + (and (eq_attr "fpu" "fpa") + (eq_attr "type" "r_2_f")) + "core+fpa*3") + +(define_insn_reservation "f_2_r" 1 + (and (eq_attr "fpu" "fpa") + (eq_attr "type" "f_2_r")) + "core+fpa*2") + +(define_insn_reservation "f_load" 3 + (and (eq_attr "fpu" "fpa") (eq_attr "type" "f_fpa_load")) + "fpa_mem+core*3") + +(define_insn_reservation "f_store" 4 + (and (eq_attr "fpu" "fpa") (eq_attr "type" "f_fpa_store")) + "core*4") + +(define_insn_reservation "r_mem_f" 6 + (and (eq_attr "model_wbuf" "no") + (and (eq_attr "fpu" "fpa") (eq_attr "type" "r_mem_f"))) + "core*6") + +(define_insn_reservation "f_mem_r" 7 + (and (eq_attr "fpu" "fpa") (eq_attr "type" "f_mem_r")) + "core*7") + + +(define_insn "*addsf3_fpa" + [(set (match_operand:SF 0 "s_register_operand" "=f,f") + (plus:SF (match_operand:SF 1 "s_register_operand" "%f,f") + (match_operand:SF 2 "arm_float_add_operand" "fG,H")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA" + "@ + adf%?s\\t%0, %1, %2 + suf%?s\\t%0, %1, #%N2" + [(set_attr "type" "farith") + (set_attr "predicable" "yes")] +) + +(define_insn "*adddf3_fpa" + [(set (match_operand:DF 0 "s_register_operand" "=f,f") + (plus:DF (match_operand:DF 1 "s_register_operand" "%f,f") + (match_operand:DF 2 "arm_float_add_operand" "fG,H")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA" + "@ + adf%?d\\t%0, %1, %2 + suf%?d\\t%0, %1, #%N2" + [(set_attr "type" "farith") + (set_attr "predicable" "yes")] +) + +(define_insn "*adddf_esfdf_df_fpa" + [(set (match_operand:DF 0 "s_register_operand" "=f,f") + (plus:DF (float_extend:DF + (match_operand:SF 1 "s_register_operand" "f,f")) + (match_operand:DF 2 "arm_float_add_operand" "fG,H")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA" + "@ + adf%?d\\t%0, %1, %2 + suf%?d\\t%0, %1, #%N2" + [(set_attr "type" "farith") + (set_attr "predicable" "yes")] +) + +(define_insn "*adddf_df_esfdf_fpa" + [(set (match_operand:DF 0 "s_register_operand" "=f") + (plus:DF (match_operand:DF 1 "s_register_operand" "f") + (float_extend:DF + (match_operand:SF 2 "s_register_operand" "f"))))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA" + "adf%?d\\t%0, %1, %2" + [(set_attr "type" "farith") + (set_attr "predicable" "yes")] +) + +(define_insn "*adddf_esfdf_esfdf_fpa" + [(set (match_operand:DF 0 "s_register_operand" "=f") + (plus:DF (float_extend:DF + (match_operand:SF 1 "s_register_operand" "f")) + (float_extend:DF + (match_operand:SF 2 "s_register_operand" "f"))))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA" + "adf%?d\\t%0, %1, %2" + [(set_attr "type" "farith") + (set_attr "predicable" "yes")] +) + +(define_insn "*subsf3_fpa" + [(set (match_operand:SF 0 "s_register_operand" "=f,f") + (minus:SF (match_operand:SF 1 "arm_float_rhs_operand" "f,G") + (match_operand:SF 2 "arm_float_rhs_operand" "fG,f")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA" + "@ + suf%?s\\t%0, %1, %2 + rsf%?s\\t%0, %2, %1" + [(set_attr "type" "farith")] +) + +(define_insn "*subdf3_fpa" + [(set (match_operand:DF 0 "s_register_operand" "=f,f") + (minus:DF (match_operand:DF 1 "arm_float_rhs_operand" "f,G") + (match_operand:DF 2 "arm_float_rhs_operand" "fG,f")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA" + "@ + suf%?d\\t%0, %1, %2 + rsf%?d\\t%0, %2, %1" + [(set_attr "type" "farith") + (set_attr "predicable" "yes")] +) + +(define_insn "*subdf_esfdf_df_fpa" + [(set (match_operand:DF 0 "s_register_operand" "=f") + (minus:DF (float_extend:DF + (match_operand:SF 1 "s_register_operand" "f")) + (match_operand:DF 2 "arm_float_rhs_operand" "fG")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA" + "suf%?d\\t%0, %1, %2" + [(set_attr "type" "farith") + (set_attr "predicable" "yes")] +) + +(define_insn "*subdf_df_esfdf_fpa" + [(set (match_operand:DF 0 "s_register_operand" "=f,f") + (minus:DF (match_operand:DF 1 "arm_float_rhs_operand" "f,G") + (float_extend:DF + (match_operand:SF 2 "s_register_operand" "f,f"))))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA" + "@ + suf%?d\\t%0, %1, %2 + rsf%?d\\t%0, %2, %1" + [(set_attr "type" "farith") + (set_attr "predicable" "yes")] +) + +(define_insn "*subdf_esfdf_esfdf_fpa" + [(set (match_operand:DF 0 "s_register_operand" "=f") + (minus:DF (float_extend:DF + (match_operand:SF 1 "s_register_operand" "f")) + (float_extend:DF + (match_operand:SF 2 "s_register_operand" "f"))))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA" + "suf%?d\\t%0, %1, %2" + [(set_attr "type" "farith") + (set_attr "predicable" "yes")] +) + +(define_insn "*mulsf3_fpa" + [(set (match_operand:SF 0 "s_register_operand" "=f") + (mult:SF (match_operand:SF 1 "s_register_operand" "f") + (match_operand:SF 2 "arm_float_rhs_operand" "fG")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA" + "fml%?s\\t%0, %1, %2" + [(set_attr "type" "ffmul") + (set_attr "predicable" "yes")] +) + +(define_insn "*muldf3_fpa" + [(set (match_operand:DF 0 "s_register_operand" "=f") + (mult:DF (match_operand:DF 1 "s_register_operand" "f") + (match_operand:DF 2 "arm_float_rhs_operand" "fG")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA" + "muf%?d\\t%0, %1, %2" + [(set_attr "type" "fmul") + (set_attr "predicable" "yes")] +) + +(define_insn "*muldf_esfdf_df_fpa" + [(set (match_operand:DF 0 "s_register_operand" "=f") + (mult:DF (float_extend:DF + (match_operand:SF 1 "s_register_operand" "f")) + (match_operand:DF 2 "arm_float_rhs_operand" "fG")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA" + "muf%?d\\t%0, %1, %2" + [(set_attr "type" "fmul") + (set_attr "predicable" "yes")] +) + +(define_insn "*muldf_df_esfdf_fpa" + [(set (match_operand:DF 0 "s_register_operand" "=f") + (mult:DF (match_operand:DF 1 "s_register_operand" "f") + (float_extend:DF + (match_operand:SF 2 "s_register_operand" "f"))))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA" + "muf%?d\\t%0, %1, %2" + [(set_attr "type" "fmul") + (set_attr "predicable" "yes")] +) + +(define_insn "*muldf_esfdf_esfdf_fpa" + [(set (match_operand:DF 0 "s_register_operand" "=f") + (mult:DF + (float_extend:DF (match_operand:SF 1 "s_register_operand" "f")) + (float_extend:DF (match_operand:SF 2 "s_register_operand" "f"))))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA" + "muf%?d\\t%0, %1, %2" + [(set_attr "type" "fmul") + (set_attr "predicable" "yes")] +) + +;; Division insns + +(define_insn "*divsf3_fpa" + [(set (match_operand:SF 0 "s_register_operand" "=f,f") + (div:SF (match_operand:SF 1 "arm_float_rhs_operand" "f,G") + (match_operand:SF 2 "arm_float_rhs_operand" "fG,f")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA" + "@ + fdv%?s\\t%0, %1, %2 + frd%?s\\t%0, %2, %1" + [(set_attr "type" "fdivs") + (set_attr "predicable" "yes")] +) + +(define_insn "*divdf3_fpa" + [(set (match_operand:DF 0 "s_register_operand" "=f,f") + (div:DF (match_operand:DF 1 "arm_float_rhs_operand" "f,G") + (match_operand:DF 2 "arm_float_rhs_operand" "fG,f")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA" + "@ + dvf%?d\\t%0, %1, %2 + rdf%?d\\t%0, %2, %1" + [(set_attr "type" "fdivd") + (set_attr "predicable" "yes")] +) + +(define_insn "*divdf_esfdf_df_fpa" + [(set (match_operand:DF 0 "s_register_operand" "=f") + (div:DF (float_extend:DF + (match_operand:SF 1 "s_register_operand" "f")) + (match_operand:DF 2 "arm_float_rhs_operand" "fG")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA" + "dvf%?d\\t%0, %1, %2" + [(set_attr "type" "fdivd") + (set_attr "predicable" "yes")] +) + +(define_insn "*divdf_df_esfdf_fpa" + [(set (match_operand:DF 0 "s_register_operand" "=f") + (div:DF (match_operand:DF 1 "arm_float_rhs_operand" "fG") + (float_extend:DF + (match_operand:SF 2 "s_register_operand" "f"))))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA" + "rdf%?d\\t%0, %2, %1" + [(set_attr "type" "fdivd") + (set_attr "predicable" "yes")] +) + +(define_insn "*divdf_esfdf_esfdf_fpa" + [(set (match_operand:DF 0 "s_register_operand" "=f") + (div:DF (float_extend:DF + (match_operand:SF 1 "s_register_operand" "f")) + (float_extend:DF + (match_operand:SF 2 "s_register_operand" "f"))))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA" + "dvf%?d\\t%0, %1, %2" + [(set_attr "type" "fdivd") + (set_attr "predicable" "yes")] +) + +(define_insn "*modsf3_fpa" + [(set (match_operand:SF 0 "s_register_operand" "=f") + (mod:SF (match_operand:SF 1 "s_register_operand" "f") + (match_operand:SF 2 "arm_float_rhs_operand" "fG")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA" + "rmf%?s\\t%0, %1, %2" + [(set_attr "type" "fdivs") + (set_attr "predicable" "yes")] +) + +(define_insn "*moddf3_fpa" + [(set (match_operand:DF 0 "s_register_operand" "=f") + (mod:DF (match_operand:DF 1 "s_register_operand" "f") + (match_operand:DF 2 "arm_float_rhs_operand" "fG")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA" + "rmf%?d\\t%0, %1, %2" + [(set_attr "type" "fdivd") + (set_attr "predicable" "yes")] +) + +(define_insn "*moddf_esfdf_df_fpa" + [(set (match_operand:DF 0 "s_register_operand" "=f") + (mod:DF (float_extend:DF + (match_operand:SF 1 "s_register_operand" "f")) + (match_operand:DF 2 "arm_float_rhs_operand" "fG")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA" + "rmf%?d\\t%0, %1, %2" + [(set_attr "type" "fdivd") + (set_attr "predicable" "yes")] +) + +(define_insn "*moddf_df_esfdf_fpa" + [(set (match_operand:DF 0 "s_register_operand" "=f") + (mod:DF (match_operand:DF 1 "s_register_operand" "f") + (float_extend:DF + (match_operand:SF 2 "s_register_operand" "f"))))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA" + "rmf%?d\\t%0, %1, %2" + [(set_attr "type" "fdivd") + (set_attr "predicable" "yes")] +) + +(define_insn "*moddf_esfdf_esfdf_fpa" + [(set (match_operand:DF 0 "s_register_operand" "=f") + (mod:DF (float_extend:DF + (match_operand:SF 1 "s_register_operand" "f")) + (float_extend:DF + (match_operand:SF 2 "s_register_operand" "f"))))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA" + "rmf%?d\\t%0, %1, %2" + [(set_attr "type" "fdivd") + (set_attr "predicable" "yes")] +) + +(define_insn "*negsf2_fpa" + [(set (match_operand:SF 0 "s_register_operand" "=f") + (neg:SF (match_operand:SF 1 "s_register_operand" "f")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA" + "mnf%?s\\t%0, %1" + [(set_attr "type" "ffarith") + (set_attr "predicable" "yes")] +) + +(define_insn "*negdf2_fpa" + [(set (match_operand:DF 0 "s_register_operand" "=f") + (neg:DF (match_operand:DF 1 "s_register_operand" "f")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA" + "mnf%?d\\t%0, %1" + [(set_attr "type" "ffarith") + (set_attr "predicable" "yes")] +) + +(define_insn "*negdf_esfdf_fpa" + [(set (match_operand:DF 0 "s_register_operand" "=f") + (neg:DF (float_extend:DF + (match_operand:SF 1 "s_register_operand" "f"))))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA" + "mnf%?d\\t%0, %1" + [(set_attr "type" "ffarith") + (set_attr "predicable" "yes")] +) + +(define_insn "*abssf2_fpa" + [(set (match_operand:SF 0 "s_register_operand" "=f") + (abs:SF (match_operand:SF 1 "s_register_operand" "f")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA" + "abs%?s\\t%0, %1" + [(set_attr "type" "ffarith") + (set_attr "predicable" "yes")] +) + +(define_insn "*absdf2_fpa" + [(set (match_operand:DF 0 "s_register_operand" "=f") + (abs:DF (match_operand:DF 1 "s_register_operand" "f")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA" + "abs%?d\\t%0, %1" + [(set_attr "type" "ffarith") + (set_attr "predicable" "yes")] +) + +(define_insn "*absdf_esfdf_fpa" + [(set (match_operand:DF 0 "s_register_operand" "=f") + (abs:DF (float_extend:DF + (match_operand:SF 1 "s_register_operand" "f"))))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA" + "abs%?d\\t%0, %1" + [(set_attr "type" "ffarith") + (set_attr "predicable" "yes")] +) + +(define_insn "*sqrtsf2_fpa" + [(set (match_operand:SF 0 "s_register_operand" "=f") + (sqrt:SF (match_operand:SF 1 "s_register_operand" "f")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA" + "sqt%?s\\t%0, %1" + [(set_attr "type" "float_em") + (set_attr "predicable" "yes")] +) + +(define_insn "*sqrtdf2_fpa" + [(set (match_operand:DF 0 "s_register_operand" "=f") + (sqrt:DF (match_operand:DF 1 "s_register_operand" "f")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA" + "sqt%?d\\t%0, %1" + [(set_attr "type" "float_em") + (set_attr "predicable" "yes")] +) + +(define_insn "*sqrtdf_esfdf_fpa" + [(set (match_operand:DF 0 "s_register_operand" "=f") + (sqrt:DF (float_extend:DF + (match_operand:SF 1 "s_register_operand" "f"))))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA" + "sqt%?d\\t%0, %1" + [(set_attr "type" "float_em") + (set_attr "predicable" "yes")] +) + +(define_insn "*floatsisf2_fpa" + [(set (match_operand:SF 0 "s_register_operand" "=f") + (float:SF (match_operand:SI 1 "s_register_operand" "r")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA" + "flt%?s\\t%0, %1" + [(set_attr "type" "r_2_f") + (set_attr "predicable" "yes")] +) + +(define_insn "*floatsidf2_fpa" + [(set (match_operand:DF 0 "s_register_operand" "=f") + (float:DF (match_operand:SI 1 "s_register_operand" "r")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA" + "flt%?d\\t%0, %1" + [(set_attr "type" "r_2_f") + (set_attr "predicable" "yes")] +) + +(define_insn "*fix_truncsfsi2_fpa" + [(set (match_operand:SI 0 "s_register_operand" "=r") + (fix:SI (fix:SF (match_operand:SF 1 "s_register_operand" "f"))))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA" + "fix%?z\\t%0, %1" + [(set_attr "type" "f_2_r") + (set_attr "predicable" "yes")] +) + +(define_insn "*fix_truncdfsi2_fpa" + [(set (match_operand:SI 0 "s_register_operand" "=r") + (fix:SI (fix:DF (match_operand:DF 1 "s_register_operand" "f"))))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA" + "fix%?z\\t%0, %1" + [(set_attr "type" "f_2_r") + (set_attr "predicable" "yes")] +) + +(define_insn "*truncdfsf2_fpa" + [(set (match_operand:SF 0 "s_register_operand" "=f") + (float_truncate:SF + (match_operand:DF 1 "s_register_operand" "f")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA" + "mvf%?s\\t%0, %1" + [(set_attr "type" "ffarith") + (set_attr "predicable" "yes")] +) + +(define_insn "*extendsfdf2_fpa" + [(set (match_operand:DF 0 "s_register_operand" "=f") + (float_extend:DF (match_operand:SF 1 "s_register_operand" "f")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA" + "mvf%?d\\t%0, %1" + [(set_attr "type" "ffarith") + (set_attr "predicable" "yes")] +) + +(define_insn "*movsf_fpa" + [(set (match_operand:SF 0 "nonimmediate_operand" "=f,f,f, m,f,r,r,r, m") + (match_operand:SF 1 "general_operand" "fG,H,mE,f,r,f,r,mE,r"))] + "TARGET_ARM + && TARGET_HARD_FLOAT && TARGET_FPA + && (GET_CODE (operands[0]) != MEM + || register_operand (operands[1], SFmode))" + "@ + mvf%?s\\t%0, %1 + mnf%?s\\t%0, #%N1 + ldf%?s\\t%0, %1 + stf%?s\\t%1, %0 + str%?\\t%1, [%|sp, #-4]!\;ldf%?s\\t%0, [%|sp], #4 + stf%?s\\t%1, [%|sp, #-4]!\;ldr%?\\t%0, [%|sp], #4 + mov%?\\t%0, %1 + ldr%?\\t%0, %1\\t%@ float + str%?\\t%1, %0\\t%@ float" + [(set_attr "length" "4,4,4,4,8,8,4,4,4") + (set_attr "predicable" "yes") + (set_attr "type" + "ffarith,ffarith,f_fpa_load,f_fpa_store,r_mem_f,f_mem_r,*,load1,store1") + (set_attr "pool_range" "*,*,1024,*,*,*,*,4096,*") + (set_attr "neg_pool_range" "*,*,1012,*,*,*,*,4084,*")] +) + +(define_insn "*movdf_fpa" + [(set (match_operand:DF 0 "nonimmediate_operand" + "=r,Q,r,m,r, f, f,f, m,!f,!r") + (match_operand:DF 1 "general_operand" + "Q, r,r,r,mF,fG,H,mF,f,r, f"))] + "TARGET_ARM + && TARGET_HARD_FLOAT && TARGET_FPA + && (GET_CODE (operands[0]) != MEM + || register_operand (operands[1], DFmode))" + "* + { + switch (which_alternative) + { + default: + case 0: return \"ldm%(ia%)\\t%m1, %M0\\t%@ double\"; + case 1: return \"stm%(ia%)\\t%m0, %M1\\t%@ double\"; + case 2: return \"#\"; + case 3: case 4: return output_move_double (operands); + case 5: return \"mvf%?d\\t%0, %1\"; + case 6: return \"mnf%?d\\t%0, #%N1\"; + case 7: return \"ldf%?d\\t%0, %1\"; + case 8: return \"stf%?d\\t%1, %0\"; + case 9: return output_mov_double_fpa_from_arm (operands); + case 10: return output_mov_double_arm_from_fpa (operands); + } + } + " + [(set_attr "length" "4,4,8,8,8,4,4,4,4,8,8") + (set_attr "predicable" "yes") + (set_attr "type" + "load1,store2,*,store2,load1,ffarith,ffarith,f_fpa_load,f_fpa_store,r_mem_f,f_mem_r") + (set_attr "pool_range" "*,*,*,*,1020,*,*,1024,*,*,*") + (set_attr "neg_pool_range" "*,*,*,*,1008,*,*,1008,*,*,*")] +) + +;; We treat XFmode as meaning 'internal format'. It's the right size and we +;; don't use it for anything else. We only support moving between FPA +;; registers and moving an FPA register to/from memory. +(define_insn "*movxf_fpa" + [(set (match_operand:XF 0 "nonimmediate_operand" "=f,f,m") + (match_operand:XF 1 "general_operand" "f,m,f"))] + "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA + && (register_operand (operands[0], XFmode) + || register_operand (operands[1], XFmode))" + "* + switch (which_alternative) + { + default: + case 0: return \"mvf%?e\\t%0, %1\"; + case 1: if (TARGET_FPA_EMU2) + return \"ldf%?e\\t%0, %1\"; + return \"lfm%?\\t%0, 1, %1\"; + case 2: if (TARGET_FPA_EMU2) + return \"stf%?e\\t%1, %0\"; + return \"sfm%?\\t%1, 1, %0\"; + } + " + [(set_attr "length" "4,4,4") + (set_attr "predicable" "yes") + (set_attr "type" "ffarith,f_fpa_load,f_fpa_store")] +) + +;; stfs/ldfs always use a conditional infix. This works around the +;; ambiguity between "stf pl s" and "sftp ls". +(define_insn "*thumb2_movsf_fpa" + [(set (match_operand:SF 0 "nonimmediate_operand" "=f,f,f, m,f,r,r,r, m") + (match_operand:SF 1 "general_operand" "fG,H,mE,f,r,f,r,mE,r"))] + "TARGET_THUMB2 + && TARGET_HARD_FLOAT && TARGET_FPA + && (GET_CODE (operands[0]) != MEM + || register_operand (operands[1], SFmode))" + "@ + mvf%?s\\t%0, %1 + mnf%?s\\t%0, #%N1 + ldf%?s\\t%0, %1 + stf%?s\\t%1, %0 + str%?\\t%1, [%|sp, #-4]!\;ldf%?s\\t%0, [%|sp], #4 + stf%?s\\t%1, [%|sp, #-4]!\;ldr%?\\t%0, [%|sp], #4 + mov%?\\t%0, %1 @bar + ldr%?\\t%0, %1\\t%@ float + str%?\\t%1, %0\\t%@ float" + [(set_attr "length" "4,4,4,4,8,8,4,4,4") + (set_attr "ce_count" "1,1,1,1,2,2,1,1,1") + (set_attr "predicable" "yes") + (set_attr "type" + "ffarith,ffarith,f_fpa_load,f_fpa_store,r_mem_f,f_mem_r,*,load1,store1") + (set_attr "pool_range" "*,*,1024,*,*,*,*,4096,*") + (set_attr "neg_pool_range" "*,*,1012,*,*,*,*,0,*")] +) + +;; Not predicable because we don't know the number of instructions. +(define_insn "*thumb2_movdf_fpa" + [(set (match_operand:DF 0 "nonimmediate_operand" + "=r,Q,r,m,r, f, f,f, m,!f,!r") + (match_operand:DF 1 "general_operand" + "Q, r,r,r,mF,fG,H,mF,f,r, f"))] + "TARGET_THUMB2 + && TARGET_HARD_FLOAT && TARGET_FPA + && (GET_CODE (operands[0]) != MEM + || register_operand (operands[1], DFmode))" + "* + { + switch (which_alternative) + { + default: + case 0: return \"ldm%(ia%)\\t%m1, %M0\\t%@ double\"; + case 1: return \"stm%(ia%)\\t%m0, %M1\\t%@ double\"; + case 2: case 3: case 4: return output_move_double (operands); + case 5: return \"mvf%?d\\t%0, %1\"; + case 6: return \"mnf%?d\\t%0, #%N1\"; + case 7: return \"ldf%?d\\t%0, %1\"; + case 8: return \"stf%?d\\t%1, %0\"; + case 9: return output_mov_double_fpa_from_arm (operands); + case 10: return output_mov_double_arm_from_fpa (operands); + } + } + " + [(set_attr "length" "4,4,8,8,8,4,4,4,4,8,8") + (set_attr "type" + "load1,store2,*,store2,load1,ffarith,ffarith,f_fpa_load,f_fpa_store,r_mem_f,f_mem_r") + (set_attr "pool_range" "*,*,*,*,4092,*,*,1024,*,*,*") + (set_attr "neg_pool_range" "*,*,*,*,0,*,*,1020,*,*,*")] +) + +;; Saving and restoring the floating point registers in the prologue should +;; be done in XFmode, even though we don't support that for anything else +;; (Well, strictly it's 'internal representation', but that's effectively +;; XFmode). +;; Not predicable because we don't know the number of instructions. + +(define_insn "*thumb2_movxf_fpa" + [(set (match_operand:XF 0 "nonimmediate_operand" "=f,f,f,m,f,r,r") + (match_operand:XF 1 "general_operand" "fG,H,m,f,r,f,r"))] + "TARGET_THUMB2 && TARGET_HARD_FLOAT && TARGET_FPA && reload_completed" + "* + switch (which_alternative) + { + default: + case 0: return \"mvf%?e\\t%0, %1\"; + case 1: return \"mnf%?e\\t%0, #%N1\"; + case 2: return \"ldf%?e\\t%0, %1\"; + case 3: return \"stf%?e\\t%1, %0\"; + case 4: return output_mov_long_double_fpa_from_arm (operands); + case 5: return output_mov_long_double_arm_from_fpa (operands); + case 6: return output_mov_long_double_arm_from_arm (operands); + } + " + [(set_attr "length" "4,4,4,4,8,8,12") + (set_attr "type" "ffarith,ffarith,f_fpa_load,f_fpa_store,r_mem_f,f_mem_r,*") + (set_attr "pool_range" "*,*,1024,*,*,*,*") + (set_attr "neg_pool_range" "*,*,1004,*,*,*,*")] +) + +(define_insn "*cmpsf_fpa" + [(set (reg:CCFP CC_REGNUM) + (compare:CCFP (match_operand:SF 0 "s_register_operand" "f,f") + (match_operand:SF 1 "arm_float_add_operand" "fG,H")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA" + "@ + cmf%?\\t%0, %1 + cnf%?\\t%0, #%N1" + [(set_attr "conds" "set") + (set_attr "type" "f_2_r")] +) + +(define_insn "*cmpdf_fpa" + [(set (reg:CCFP CC_REGNUM) + (compare:CCFP (match_operand:DF 0 "s_register_operand" "f,f") + (match_operand:DF 1 "arm_float_add_operand" "fG,H")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA" + "@ + cmf%?\\t%0, %1 + cnf%?\\t%0, #%N1" + [(set_attr "conds" "set") + (set_attr "type" "f_2_r")] +) + +(define_insn "*cmpesfdf_df_fpa" + [(set (reg:CCFP CC_REGNUM) + (compare:CCFP (float_extend:DF + (match_operand:SF 0 "s_register_operand" "f,f")) + (match_operand:DF 1 "arm_float_add_operand" "fG,H")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA" + "@ + cmf%?\\t%0, %1 + cnf%?\\t%0, #%N1" + [(set_attr "conds" "set") + (set_attr "type" "f_2_r")] +) + +(define_insn "*cmpdf_esfdf_fpa" + [(set (reg:CCFP CC_REGNUM) + (compare:CCFP (match_operand:DF 0 "s_register_operand" "f") + (float_extend:DF + (match_operand:SF 1 "s_register_operand" "f"))))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA" + "cmf%?\\t%0, %1" + [(set_attr "conds" "set") + (set_attr "type" "f_2_r")] +) + +(define_insn "*cmpsf_trap_fpa" + [(set (reg:CCFPE CC_REGNUM) + (compare:CCFPE (match_operand:SF 0 "s_register_operand" "f,f") + (match_operand:SF 1 "arm_float_add_operand" "fG,H")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA" + "@ + cmf%?e\\t%0, %1 + cnf%?e\\t%0, #%N1" + [(set_attr "conds" "set") + (set_attr "type" "f_2_r")] +) + +(define_insn "*cmpdf_trap_fpa" + [(set (reg:CCFPE CC_REGNUM) + (compare:CCFPE (match_operand:DF 0 "s_register_operand" "f,f") + (match_operand:DF 1 "arm_float_add_operand" "fG,H")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA" + "@ + cmf%?e\\t%0, %1 + cnf%?e\\t%0, #%N1" + [(set_attr "conds" "set") + (set_attr "type" "f_2_r")] +) + +(define_insn "*cmp_esfdf_df_trap_fpa" + [(set (reg:CCFPE CC_REGNUM) + (compare:CCFPE (float_extend:DF + (match_operand:SF 0 "s_register_operand" "f,f")) + (match_operand:DF 1 "arm_float_add_operand" "fG,H")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA" + "@ + cmf%?e\\t%0, %1 + cnf%?e\\t%0, #%N1" + [(set_attr "conds" "set") + (set_attr "type" "f_2_r")] +) + +(define_insn "*cmp_df_esfdf_trap_fpa" + [(set (reg:CCFPE CC_REGNUM) + (compare:CCFPE (match_operand:DF 0 "s_register_operand" "f") + (float_extend:DF + (match_operand:SF 1 "s_register_operand" "f"))))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA" + "cmf%?e\\t%0, %1" + [(set_attr "conds" "set") + (set_attr "type" "f_2_r")] +) + +(define_insn "*movsfcc_fpa" + [(set (match_operand:SF 0 "s_register_operand" "=f,f,f,f,f,f,f,f") + (if_then_else:SF + (match_operator 3 "arm_comparison_operator" + [(match_operand 4 "cc_register" "") (const_int 0)]) + (match_operand:SF 1 "arm_float_add_operand" "0,0,fG,H,fG,fG,H,H") + (match_operand:SF 2 "arm_float_add_operand" "fG,H,0,0,fG,H,fG,H")))] + "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA" + "@ + mvf%D3s\\t%0, %2 + mnf%D3s\\t%0, #%N2 + mvf%d3s\\t%0, %1 + mnf%d3s\\t%0, #%N1 + mvf%d3s\\t%0, %1\;mvf%D3s\\t%0, %2 + mvf%d3s\\t%0, %1\;mnf%D3s\\t%0, #%N2 + mnf%d3s\\t%0, #%N1\;mvf%D3s\\t%0, %2 + mnf%d3s\\t%0, #%N1\;mnf%D3s\\t%0, #%N2" + [(set_attr "length" "4,4,4,4,8,8,8,8") + (set_attr "type" "ffarith") + (set_attr "conds" "use")] +) + +(define_insn "*movdfcc_fpa" + [(set (match_operand:DF 0 "s_register_operand" "=f,f,f,f,f,f,f,f") + (if_then_else:DF + (match_operator 3 "arm_comparison_operator" + [(match_operand 4 "cc_register" "") (const_int 0)]) + (match_operand:DF 1 "arm_float_add_operand" "0,0,fG,H,fG,fG,H,H") + (match_operand:DF 2 "arm_float_add_operand" "fG,H,0,0,fG,H,fG,H")))] + "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA" + "@ + mvf%D3d\\t%0, %2 + mnf%D3d\\t%0, #%N2 + mvf%d3d\\t%0, %1 + mnf%d3d\\t%0, #%N1 + mvf%d3d\\t%0, %1\;mvf%D3d\\t%0, %2 + mvf%d3d\\t%0, %1\;mnf%D3d\\t%0, #%N2 + mnf%d3d\\t%0, #%N1\;mvf%D3d\\t%0, %2 + mnf%d3d\\t%0, #%N1\;mnf%D3d\\t%0, #%N2" + [(set_attr "length" "4,4,4,4,8,8,8,8") + (set_attr "type" "ffarith") + (set_attr "conds" "use")] +) + +(define_insn "*thumb2_movsfcc_fpa" + [(set (match_operand:SF 0 "s_register_operand" "=f,f,f,f,f,f,f,f") + (if_then_else:SF + (match_operator 3 "arm_comparison_operator" + [(match_operand 4 "cc_register" "") (const_int 0)]) + (match_operand:SF 1 "arm_float_add_operand" "0,0,fG,H,fG,fG,H,H") + (match_operand:SF 2 "arm_float_add_operand" "fG,H,0,0,fG,H,fG,H")))] + "TARGET_THUMB2 && TARGET_HARD_FLOAT && TARGET_FPA" + "@ + it\\t%D3\;mvf%D3s\\t%0, %2 + it\\t%D3\;mnf%D3s\\t%0, #%N2 + it\\t%d3\;mvf%d3s\\t%0, %1 + it\\t%d3\;mnf%d3s\\t%0, #%N1 + ite\\t%d3\;mvf%d3s\\t%0, %1\;mvf%D3s\\t%0, %2 + ite\\t%d3\;mvf%d3s\\t%0, %1\;mnf%D3s\\t%0, #%N2 + ite\\t%d3\;mnf%d3s\\t%0, #%N1\;mvf%D3s\\t%0, %2 + ite\\t%d3\;mnf%d3s\\t%0, #%N1\;mnf%D3s\\t%0, #%N2" + [(set_attr "length" "6,6,6,6,10,10,10,10") + (set_attr "type" "ffarith") + (set_attr "conds" "use")] +) + +(define_insn "*thumb2_movdfcc_fpa" + [(set (match_operand:DF 0 "s_register_operand" "=f,f,f,f,f,f,f,f") + (if_then_else:DF + (match_operator 3 "arm_comparison_operator" + [(match_operand 4 "cc_register" "") (const_int 0)]) + (match_operand:DF 1 "arm_float_add_operand" "0,0,fG,H,fG,fG,H,H") + (match_operand:DF 2 "arm_float_add_operand" "fG,H,0,0,fG,H,fG,H")))] + "TARGET_THUMB2 && TARGET_HARD_FLOAT && TARGET_FPA" + "@ + it\\t%D3\;mvf%D3d\\t%0, %2 + it\\t%D3\;mnf%D3d\\t%0, #%N2 + it\\t%d3\;mvf%d3d\\t%0, %1 + it\\t%d3\;mnf%d3d\\t%0, #%N1 + ite\\t%d3\;mvf%d3d\\t%0, %1\;mvf%D3d\\t%0, %2 + ite\\t%d3\;mvf%d3d\\t%0, %1\;mnf%D3d\\t%0, #%N2 + ite\\t%d3\;mnf%d3d\\t%0, #%N1\;mvf%D3d\\t%0, %2 + ite\\t%d3\;mnf%d3d\\t%0, #%N1\;mnf%D3d\\t%0, #%N2" + [(set_attr "length" "6,6,6,6,10,10,10,10") + (set_attr "type" "ffarith") + (set_attr "conds" "use")] +) + diff --git a/gcc/config/arm/freebsd.h b/gcc/config/arm/freebsd.h new file mode 100644 index 000000000..701bb1499 --- /dev/null +++ b/gcc/config/arm/freebsd.h @@ -0,0 +1,67 @@ +/* Definitions for StrongARM running FreeBSD using the ELF format + Copyright (C) 2001, 2004, 2007, 2010 Free Software Foundation, Inc. + Contributed by David E. O'Brien and BSDi. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3, or (at your + option) any later version. + + GCC is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public + License for more details. + + You should have received a copy of the GNU General Public License + along with GCC; see the file COPYING3. If not see + . */ + + +#undef SUBTARGET_EXTRA_SPECS +#define SUBTARGET_EXTRA_SPECS \ + { "fbsd_dynamic_linker", FBSD_DYNAMIC_LINKER } + +#undef SUBTARGET_CPP_SPEC +#define SUBTARGET_CPP_SPEC FBSD_CPP_SPEC + +#undef LINK_SPEC +#define LINK_SPEC " \ + %{p:%nconsider using '-pg' instead of '-p' with gprof(1)} \ + %{v:-V} \ + %{assert*} %{R*} %{rpath*} %{defsym*} \ + %{shared:-Bshareable %{h*} %{soname*}} \ + %{!shared: \ + %{!static: \ + %{rdynamic:-export-dynamic} \ + -dynamic-linker %(fbsd_dynamic_linker) } \ + %{static:-Bstatic}} \ + %{symbolic:-Bsymbolic}" + + +/************************[ Target stuff ]***********************************/ + +/* Define the actual types of some ANSI-mandated types. + Needs to agree with . GCC defaults come from c-decl.c, + c-common.c, and config//.h. */ + +/* arm.h gets this wrong for FreeBSD. We use the GCC defaults instead. */ + +#undef SIZE_TYPE +#define SIZE_TYPE "unsigned int" + +#undef PTRDIFF_TYPE +#define PTRDIFF_TYPE "int" + +/* We use the GCC defaults here. */ +#undef WCHAR_TYPE + +#undef WCHAR_TYPE_SIZE +#define WCHAR_TYPE_SIZE 32 + +#undef SUBTARGET_CPU_DEFAULT +#define SUBTARGET_CPU_DEFAULT TARGET_CPU_strongarm + +#undef TARGET_VERSION +#define TARGET_VERSION fprintf (stderr, " (FreeBSD/StrongARM ELF)"); diff --git a/gcc/config/arm/gentune.sh b/gcc/config/arm/gentune.sh new file mode 100755 index 000000000..a873973e3 --- /dev/null +++ b/gcc/config/arm/gentune.sh @@ -0,0 +1,29 @@ +#!/bin/sh +# Generate arm-tune.md, a file containing the tune attribute from the list of +# CPUs in arm-cores.def +# Copyright (C) 2004, 2009 Free Software Foundation, Inc. +# +# This file is part of GCC. +# +# GCC is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# GCC is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with GCC; see the file COPYING3. If not see +# . + +echo ";; -*- buffer-read-only: t -*-" +echo ";; Generated automatically by gentune.sh from arm-cores.def" + +allcores=`awk -F'[(, ]+' '/^ARM_CORE/ { cores = cores$3"," } END { print cores } ' $1` + +echo "(define_attr \"tune\"" +echo " \"$allcores\"" | sed -e 's/,"$/"/' +echo " (const (symbol_ref \"((enum attr_tune) arm_tune)\")))" diff --git a/gcc/config/arm/ieee754-df.S b/gcc/config/arm/ieee754-df.S new file mode 100644 index 000000000..eb0c38632 --- /dev/null +++ b/gcc/config/arm/ieee754-df.S @@ -0,0 +1,1447 @@ +/* ieee754-df.S double-precision floating point support for ARM + + Copyright (C) 2003, 2004, 2005, 2007, 2008, 2009 Free Software Foundation, Inc. + Contributed by Nicolas Pitre (nico@cam.org) + + This file is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by the + Free Software Foundation; either version 3, or (at your option) any + later version. + + This file is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +/* + * Notes: + * + * The goal of this code is to be as fast as possible. This is + * not meant to be easy to understand for the casual reader. + * For slightly simpler code please see the single precision version + * of this file. + * + * Only the default rounding mode is intended for best performances. + * Exceptions aren't supported yet, but that can be added quite easily + * if necessary without impacting performances. + */ + + +@ For FPA, float words are always big-endian. +@ For VFP, floats words follow the memory system mode. +#if defined(__VFP_FP__) && !defined(__ARMEB__) +#define xl r0 +#define xh r1 +#define yl r2 +#define yh r3 +#else +#define xh r0 +#define xl r1 +#define yh r2 +#define yl r3 +#endif + + +#ifdef L_arm_negdf2 + +ARM_FUNC_START negdf2 +ARM_FUNC_ALIAS aeabi_dneg negdf2 + + @ flip sign bit + eor xh, xh, #0x80000000 + RET + + FUNC_END aeabi_dneg + FUNC_END negdf2 + +#endif + +#ifdef L_arm_addsubdf3 + +ARM_FUNC_START aeabi_drsub + + eor xh, xh, #0x80000000 @ flip sign bit of first arg + b 1f + +ARM_FUNC_START subdf3 +ARM_FUNC_ALIAS aeabi_dsub subdf3 + + eor yh, yh, #0x80000000 @ flip sign bit of second arg +#if defined(__INTERWORKING_STUBS__) + b 1f @ Skip Thumb-code prologue +#endif + +ARM_FUNC_START adddf3 +ARM_FUNC_ALIAS aeabi_dadd adddf3 + +1: do_push {r4, r5, lr} + + @ Look for zeroes, equal values, INF, or NAN. + shift1 lsl, r4, xh, #1 + shift1 lsl, r5, yh, #1 + teq r4, r5 + do_it eq + teqeq xl, yl + do_it ne, ttt + COND(orr,s,ne) ip, r4, xl + COND(orr,s,ne) ip, r5, yl + COND(mvn,s,ne) ip, r4, asr #21 + COND(mvn,s,ne) ip, r5, asr #21 + beq LSYM(Lad_s) + + @ Compute exponent difference. Make largest exponent in r4, + @ corresponding arg in xh-xl, and positive exponent difference in r5. + shift1 lsr, r4, r4, #21 + rsbs r5, r4, r5, lsr #21 + do_it lt + rsblt r5, r5, #0 + ble 1f + add r4, r4, r5 + eor yl, xl, yl + eor yh, xh, yh + eor xl, yl, xl + eor xh, yh, xh + eor yl, xl, yl + eor yh, xh, yh +1: + @ If exponent difference is too large, return largest argument + @ already in xh-xl. We need up to 54 bit to handle proper rounding + @ of 0x1p54 - 1.1. + cmp r5, #54 + do_it hi + RETLDM "r4, r5" hi + + @ Convert mantissa to signed integer. + tst xh, #0x80000000 + mov xh, xh, lsl #12 + mov ip, #0x00100000 + orr xh, ip, xh, lsr #12 + beq 1f +#if defined(__thumb2__) + negs xl, xl + sbc xh, xh, xh, lsl #1 +#else + rsbs xl, xl, #0 + rsc xh, xh, #0 +#endif +1: + tst yh, #0x80000000 + mov yh, yh, lsl #12 + orr yh, ip, yh, lsr #12 + beq 1f +#if defined(__thumb2__) + negs yl, yl + sbc yh, yh, yh, lsl #1 +#else + rsbs yl, yl, #0 + rsc yh, yh, #0 +#endif +1: + @ If exponent == difference, one or both args were denormalized. + @ Since this is not common case, rescale them off line. + teq r4, r5 + beq LSYM(Lad_d) +LSYM(Lad_x): + + @ Compensate for the exponent overlapping the mantissa MSB added later + sub r4, r4, #1 + + @ Shift yh-yl right per r5, add to xh-xl, keep leftover bits into ip. + rsbs lr, r5, #32 + blt 1f + shift1 lsl, ip, yl, lr + shiftop adds xl xl yl lsr r5 yl + adc xh, xh, #0 + shiftop adds xl xl yh lsl lr yl + shiftop adcs xh xh yh asr r5 yh + b 2f +1: sub r5, r5, #32 + add lr, lr, #32 + cmp yl, #1 + shift1 lsl,ip, yh, lr + do_it cs + orrcs ip, ip, #2 @ 2 not 1, to allow lsr #1 later + shiftop adds xl xl yh asr r5 yh + adcs xh, xh, yh, asr #31 +2: + @ We now have a result in xh-xl-ip. + @ Keep absolute value in xh-xl-ip, sign in r5 (the n bit was set above) + and r5, xh, #0x80000000 + bpl LSYM(Lad_p) +#if defined(__thumb2__) + mov lr, #0 + negs ip, ip + sbcs xl, lr, xl + sbc xh, lr, xh +#else + rsbs ip, ip, #0 + rscs xl, xl, #0 + rsc xh, xh, #0 +#endif + + @ Determine how to normalize the result. +LSYM(Lad_p): + cmp xh, #0x00100000 + bcc LSYM(Lad_a) + cmp xh, #0x00200000 + bcc LSYM(Lad_e) + + @ Result needs to be shifted right. + movs xh, xh, lsr #1 + movs xl, xl, rrx + mov ip, ip, rrx + add r4, r4, #1 + + @ Make sure we did not bust our exponent. + mov r2, r4, lsl #21 + cmn r2, #(2 << 21) + bcs LSYM(Lad_o) + + @ Our result is now properly aligned into xh-xl, remaining bits in ip. + @ Round with MSB of ip. If halfway between two numbers, round towards + @ LSB of xl = 0. + @ Pack final result together. +LSYM(Lad_e): + cmp ip, #0x80000000 + do_it eq + COND(mov,s,eq) ip, xl, lsr #1 + adcs xl, xl, #0 + adc xh, xh, r4, lsl #20 + orr xh, xh, r5 + RETLDM "r4, r5" + + @ Result must be shifted left and exponent adjusted. +LSYM(Lad_a): + movs ip, ip, lsl #1 + adcs xl, xl, xl + adc xh, xh, xh + tst xh, #0x00100000 + sub r4, r4, #1 + bne LSYM(Lad_e) + + @ No rounding necessary since ip will always be 0 at this point. +LSYM(Lad_l): + +#if __ARM_ARCH__ < 5 + + teq xh, #0 + movne r3, #20 + moveq r3, #52 + moveq xh, xl + moveq xl, #0 + mov r2, xh + cmp r2, #(1 << 16) + movhs r2, r2, lsr #16 + subhs r3, r3, #16 + cmp r2, #(1 << 8) + movhs r2, r2, lsr #8 + subhs r3, r3, #8 + cmp r2, #(1 << 4) + movhs r2, r2, lsr #4 + subhs r3, r3, #4 + cmp r2, #(1 << 2) + subhs r3, r3, #2 + sublo r3, r3, r2, lsr #1 + sub r3, r3, r2, lsr #3 + +#else + + teq xh, #0 + do_it eq, t + moveq xh, xl + moveq xl, #0 + clz r3, xh + do_it eq + addeq r3, r3, #32 + sub r3, r3, #11 + +#endif + + @ determine how to shift the value. + subs r2, r3, #32 + bge 2f + adds r2, r2, #12 + ble 1f + + @ shift value left 21 to 31 bits, or actually right 11 to 1 bits + @ since a register switch happened above. + add ip, r2, #20 + rsb r2, r2, #12 + shift1 lsl, xl, xh, ip + shift1 lsr, xh, xh, r2 + b 3f + + @ actually shift value left 1 to 20 bits, which might also represent + @ 32 to 52 bits if counting the register switch that happened earlier. +1: add r2, r2, #20 +2: do_it le + rsble ip, r2, #32 + shift1 lsl, xh, xh, r2 +#if defined(__thumb2__) + lsr ip, xl, ip + itt le + orrle xh, xh, ip + lslle xl, xl, r2 +#else + orrle xh, xh, xl, lsr ip + movle xl, xl, lsl r2 +#endif + + @ adjust exponent accordingly. +3: subs r4, r4, r3 + do_it ge, tt + addge xh, xh, r4, lsl #20 + orrge xh, xh, r5 + RETLDM "r4, r5" ge + + @ Exponent too small, denormalize result. + @ Find out proper shift value. + mvn r4, r4 + subs r4, r4, #31 + bge 2f + adds r4, r4, #12 + bgt 1f + + @ shift result right of 1 to 20 bits, sign is in r5. + add r4, r4, #20 + rsb r2, r4, #32 + shift1 lsr, xl, xl, r4 + shiftop orr xl xl xh lsl r2 yh + shiftop orr xh r5 xh lsr r4 yh + RETLDM "r4, r5" + + @ shift result right of 21 to 31 bits, or left 11 to 1 bits after + @ a register switch from xh to xl. +1: rsb r4, r4, #12 + rsb r2, r4, #32 + shift1 lsr, xl, xl, r2 + shiftop orr xl xl xh lsl r4 yh + mov xh, r5 + RETLDM "r4, r5" + + @ Shift value right of 32 to 64 bits, or 0 to 32 bits after a switch + @ from xh to xl. +2: shift1 lsr, xl, xh, r4 + mov xh, r5 + RETLDM "r4, r5" + + @ Adjust exponents for denormalized arguments. + @ Note that r4 must not remain equal to 0. +LSYM(Lad_d): + teq r4, #0 + eor yh, yh, #0x00100000 + do_it eq, te + eoreq xh, xh, #0x00100000 + addeq r4, r4, #1 + subne r5, r5, #1 + b LSYM(Lad_x) + + +LSYM(Lad_s): + mvns ip, r4, asr #21 + do_it ne + COND(mvn,s,ne) ip, r5, asr #21 + beq LSYM(Lad_i) + + teq r4, r5 + do_it eq + teqeq xl, yl + beq 1f + + @ Result is x + 0.0 = x or 0.0 + y = y. + orrs ip, r4, xl + do_it eq, t + moveq xh, yh + moveq xl, yl + RETLDM "r4, r5" + +1: teq xh, yh + + @ Result is x - x = 0. + do_it ne, tt + movne xh, #0 + movne xl, #0 + RETLDM "r4, r5" ne + + @ Result is x + x = 2x. + movs ip, r4, lsr #21 + bne 2f + movs xl, xl, lsl #1 + adcs xh, xh, xh + do_it cs + orrcs xh, xh, #0x80000000 + RETLDM "r4, r5" +2: adds r4, r4, #(2 << 21) + do_it cc, t + addcc xh, xh, #(1 << 20) + RETLDM "r4, r5" cc + and r5, xh, #0x80000000 + + @ Overflow: return INF. +LSYM(Lad_o): + orr xh, r5, #0x7f000000 + orr xh, xh, #0x00f00000 + mov xl, #0 + RETLDM "r4, r5" + + @ At least one of x or y is INF/NAN. + @ if xh-xl != INF/NAN: return yh-yl (which is INF/NAN) + @ if yh-yl != INF/NAN: return xh-xl (which is INF/NAN) + @ if either is NAN: return NAN + @ if opposite sign: return NAN + @ otherwise return xh-xl (which is INF or -INF) +LSYM(Lad_i): + mvns ip, r4, asr #21 + do_it ne, te + movne xh, yh + movne xl, yl + COND(mvn,s,eq) ip, r5, asr #21 + do_it ne, t + movne yh, xh + movne yl, xl + orrs r4, xl, xh, lsl #12 + do_it eq, te + COND(orr,s,eq) r5, yl, yh, lsl #12 + teqeq xh, yh + orrne xh, xh, #0x00080000 @ quiet NAN + RETLDM "r4, r5" + + FUNC_END aeabi_dsub + FUNC_END subdf3 + FUNC_END aeabi_dadd + FUNC_END adddf3 + +ARM_FUNC_START floatunsidf +ARM_FUNC_ALIAS aeabi_ui2d floatunsidf + + teq r0, #0 + do_it eq, t + moveq r1, #0 + RETc(eq) + do_push {r4, r5, lr} + mov r4, #0x400 @ initial exponent + add r4, r4, #(52-1 - 1) + mov r5, #0 @ sign bit is 0 + .ifnc xl, r0 + mov xl, r0 + .endif + mov xh, #0 + b LSYM(Lad_l) + + FUNC_END aeabi_ui2d + FUNC_END floatunsidf + +ARM_FUNC_START floatsidf +ARM_FUNC_ALIAS aeabi_i2d floatsidf + + teq r0, #0 + do_it eq, t + moveq r1, #0 + RETc(eq) + do_push {r4, r5, lr} + mov r4, #0x400 @ initial exponent + add r4, r4, #(52-1 - 1) + ands r5, r0, #0x80000000 @ sign bit in r5 + do_it mi + rsbmi r0, r0, #0 @ absolute value + .ifnc xl, r0 + mov xl, r0 + .endif + mov xh, #0 + b LSYM(Lad_l) + + FUNC_END aeabi_i2d + FUNC_END floatsidf + +ARM_FUNC_START extendsfdf2 +ARM_FUNC_ALIAS aeabi_f2d extendsfdf2 + + movs r2, r0, lsl #1 @ toss sign bit + mov xh, r2, asr #3 @ stretch exponent + mov xh, xh, rrx @ retrieve sign bit + mov xl, r2, lsl #28 @ retrieve remaining bits + do_it ne, ttt + COND(and,s,ne) r3, r2, #0xff000000 @ isolate exponent + teqne r3, #0xff000000 @ if not 0, check if INF or NAN + eorne xh, xh, #0x38000000 @ fixup exponent otherwise. + RETc(ne) @ and return it. + + teq r2, #0 @ if actually 0 + do_it ne, e + teqne r3, #0xff000000 @ or INF or NAN + RETc(eq) @ we are done already. + + @ value was denormalized. We can normalize it now. + do_push {r4, r5, lr} + mov r4, #0x380 @ setup corresponding exponent + and r5, xh, #0x80000000 @ move sign bit in r5 + bic xh, xh, #0x80000000 + b LSYM(Lad_l) + + FUNC_END aeabi_f2d + FUNC_END extendsfdf2 + +ARM_FUNC_START floatundidf +ARM_FUNC_ALIAS aeabi_ul2d floatundidf + + orrs r2, r0, r1 +#if !defined (__VFP_FP__) && !defined(__SOFTFP__) + do_it eq, t + mvfeqd f0, #0.0 +#else + do_it eq +#endif + RETc(eq) + +#if !defined (__VFP_FP__) && !defined(__SOFTFP__) + @ For hard FPA code we want to return via the tail below so that + @ we can return the result in f0 as well as in r0/r1 for backwards + @ compatibility. + adr ip, LSYM(f0_ret) + @ Push pc as well so that RETLDM works correctly. + do_push {r4, r5, ip, lr, pc} +#else + do_push {r4, r5, lr} +#endif + + mov r5, #0 + b 2f + +ARM_FUNC_START floatdidf +ARM_FUNC_ALIAS aeabi_l2d floatdidf + + orrs r2, r0, r1 +#if !defined (__VFP_FP__) && !defined(__SOFTFP__) + do_it eq, t + mvfeqd f0, #0.0 +#else + do_it eq +#endif + RETc(eq) + +#if !defined (__VFP_FP__) && !defined(__SOFTFP__) + @ For hard FPA code we want to return via the tail below so that + @ we can return the result in f0 as well as in r0/r1 for backwards + @ compatibility. + adr ip, LSYM(f0_ret) + @ Push pc as well so that RETLDM works correctly. + do_push {r4, r5, ip, lr, pc} +#else + do_push {r4, r5, lr} +#endif + + ands r5, ah, #0x80000000 @ sign bit in r5 + bpl 2f +#if defined(__thumb2__) + negs al, al + sbc ah, ah, ah, lsl #1 +#else + rsbs al, al, #0 + rsc ah, ah, #0 +#endif +2: + mov r4, #0x400 @ initial exponent + add r4, r4, #(52-1 - 1) + + @ FPA little-endian: must swap the word order. + .ifnc xh, ah + mov ip, al + mov xh, ah + mov xl, ip + .endif + + movs ip, xh, lsr #22 + beq LSYM(Lad_p) + + @ The value is too big. Scale it down a bit... + mov r2, #3 + movs ip, ip, lsr #3 + do_it ne + addne r2, r2, #3 + movs ip, ip, lsr #3 + do_it ne + addne r2, r2, #3 + add r2, r2, ip, lsr #3 + + rsb r3, r2, #32 + shift1 lsl, ip, xl, r3 + shift1 lsr, xl, xl, r2 + shiftop orr xl xl xh lsl r3 lr + shift1 lsr, xh, xh, r2 + add r4, r4, r2 + b LSYM(Lad_p) + +#if !defined (__VFP_FP__) && !defined(__SOFTFP__) + + @ Legacy code expects the result to be returned in f0. Copy it + @ there as well. +LSYM(f0_ret): + do_push {r0, r1} + ldfd f0, [sp], #8 + RETLDM + +#endif + + FUNC_END floatdidf + FUNC_END aeabi_l2d + FUNC_END floatundidf + FUNC_END aeabi_ul2d + +#endif /* L_addsubdf3 */ + +#ifdef L_arm_muldivdf3 + +ARM_FUNC_START muldf3 +ARM_FUNC_ALIAS aeabi_dmul muldf3 + do_push {r4, r5, r6, lr} + + @ Mask out exponents, trap any zero/denormal/INF/NAN. + mov ip, #0xff + orr ip, ip, #0x700 + ands r4, ip, xh, lsr #20 + do_it ne, tte + COND(and,s,ne) r5, ip, yh, lsr #20 + teqne r4, ip + teqne r5, ip + bleq LSYM(Lml_s) + + @ Add exponents together + add r4, r4, r5 + + @ Determine final sign. + eor r6, xh, yh + + @ Convert mantissa to unsigned integer. + @ If power of two, branch to a separate path. + bic xh, xh, ip, lsl #21 + bic yh, yh, ip, lsl #21 + orrs r5, xl, xh, lsl #12 + do_it ne + COND(orr,s,ne) r5, yl, yh, lsl #12 + orr xh, xh, #0x00100000 + orr yh, yh, #0x00100000 + beq LSYM(Lml_1) + +#if __ARM_ARCH__ < 4 + + @ Put sign bit in r6, which will be restored in yl later. + and r6, r6, #0x80000000 + + @ Well, no way to make it shorter without the umull instruction. + stmfd sp!, {r6, r7, r8, r9, sl, fp} + mov r7, xl, lsr #16 + mov r8, yl, lsr #16 + mov r9, xh, lsr #16 + mov sl, yh, lsr #16 + bic xl, xl, r7, lsl #16 + bic yl, yl, r8, lsl #16 + bic xh, xh, r9, lsl #16 + bic yh, yh, sl, lsl #16 + mul ip, xl, yl + mul fp, xl, r8 + mov lr, #0 + adds ip, ip, fp, lsl #16 + adc lr, lr, fp, lsr #16 + mul fp, r7, yl + adds ip, ip, fp, lsl #16 + adc lr, lr, fp, lsr #16 + mul fp, xl, sl + mov r5, #0 + adds lr, lr, fp, lsl #16 + adc r5, r5, fp, lsr #16 + mul fp, r7, yh + adds lr, lr, fp, lsl #16 + adc r5, r5, fp, lsr #16 + mul fp, xh, r8 + adds lr, lr, fp, lsl #16 + adc r5, r5, fp, lsr #16 + mul fp, r9, yl + adds lr, lr, fp, lsl #16 + adc r5, r5, fp, lsr #16 + mul fp, xh, sl + mul r6, r9, sl + adds r5, r5, fp, lsl #16 + adc r6, r6, fp, lsr #16 + mul fp, r9, yh + adds r5, r5, fp, lsl #16 + adc r6, r6, fp, lsr #16 + mul fp, xl, yh + adds lr, lr, fp + mul fp, r7, sl + adcs r5, r5, fp + mul fp, xh, yl + adc r6, r6, #0 + adds lr, lr, fp + mul fp, r9, r8 + adcs r5, r5, fp + mul fp, r7, r8 + adc r6, r6, #0 + adds lr, lr, fp + mul fp, xh, yh + adcs r5, r5, fp + adc r6, r6, #0 + ldmfd sp!, {yl, r7, r8, r9, sl, fp} + +#else + + @ Here is the actual multiplication. + umull ip, lr, xl, yl + mov r5, #0 + umlal lr, r5, xh, yl + and yl, r6, #0x80000000 + umlal lr, r5, xl, yh + mov r6, #0 + umlal r5, r6, xh, yh + +#endif + + @ The LSBs in ip are only significant for the final rounding. + @ Fold them into lr. + teq ip, #0 + do_it ne + orrne lr, lr, #1 + + @ Adjust result upon the MSB position. + sub r4, r4, #0xff + cmp r6, #(1 << (20-11)) + sbc r4, r4, #0x300 + bcs 1f + movs lr, lr, lsl #1 + adcs r5, r5, r5 + adc r6, r6, r6 +1: + @ Shift to final position, add sign to result. + orr xh, yl, r6, lsl #11 + orr xh, xh, r5, lsr #21 + mov xl, r5, lsl #11 + orr xl, xl, lr, lsr #21 + mov lr, lr, lsl #11 + + @ Check exponent range for under/overflow. + subs ip, r4, #(254 - 1) + do_it hi + cmphi ip, #0x700 + bhi LSYM(Lml_u) + + @ Round the result, merge final exponent. + cmp lr, #0x80000000 + do_it eq + COND(mov,s,eq) lr, xl, lsr #1 + adcs xl, xl, #0 + adc xh, xh, r4, lsl #20 + RETLDM "r4, r5, r6" + + @ Multiplication by 0x1p*: let''s shortcut a lot of code. +LSYM(Lml_1): + and r6, r6, #0x80000000 + orr xh, r6, xh + orr xl, xl, yl + eor xh, xh, yh + subs r4, r4, ip, lsr #1 + do_it gt, tt + COND(rsb,s,gt) r5, r4, ip + orrgt xh, xh, r4, lsl #20 + RETLDM "r4, r5, r6" gt + + @ Under/overflow: fix things up for the code below. + orr xh, xh, #0x00100000 + mov lr, #0 + subs r4, r4, #1 + +LSYM(Lml_u): + @ Overflow? + bgt LSYM(Lml_o) + + @ Check if denormalized result is possible, otherwise return signed 0. + cmn r4, #(53 + 1) + do_it le, tt + movle xl, #0 + bicle xh, xh, #0x7fffffff + RETLDM "r4, r5, r6" le + + @ Find out proper shift value. + rsb r4, r4, #0 + subs r4, r4, #32 + bge 2f + adds r4, r4, #12 + bgt 1f + + @ shift result right of 1 to 20 bits, preserve sign bit, round, etc. + add r4, r4, #20 + rsb r5, r4, #32 + shift1 lsl, r3, xl, r5 + shift1 lsr, xl, xl, r4 + shiftop orr xl xl xh lsl r5 r2 + and r2, xh, #0x80000000 + bic xh, xh, #0x80000000 + adds xl, xl, r3, lsr #31 + shiftop adc xh r2 xh lsr r4 r6 + orrs lr, lr, r3, lsl #1 + do_it eq + biceq xl, xl, r3, lsr #31 + RETLDM "r4, r5, r6" + + @ shift result right of 21 to 31 bits, or left 11 to 1 bits after + @ a register switch from xh to xl. Then round. +1: rsb r4, r4, #12 + rsb r5, r4, #32 + shift1 lsl, r3, xl, r4 + shift1 lsr, xl, xl, r5 + shiftop orr xl xl xh lsl r4 r2 + bic xh, xh, #0x7fffffff + adds xl, xl, r3, lsr #31 + adc xh, xh, #0 + orrs lr, lr, r3, lsl #1 + do_it eq + biceq xl, xl, r3, lsr #31 + RETLDM "r4, r5, r6" + + @ Shift value right of 32 to 64 bits, or 0 to 32 bits after a switch + @ from xh to xl. Leftover bits are in r3-r6-lr for rounding. +2: rsb r5, r4, #32 + shiftop orr lr lr xl lsl r5 r2 + shift1 lsr, r3, xl, r4 + shiftop orr r3 r3 xh lsl r5 r2 + shift1 lsr, xl, xh, r4 + bic xh, xh, #0x7fffffff + shiftop bic xl xl xh lsr r4 r2 + add xl, xl, r3, lsr #31 + orrs lr, lr, r3, lsl #1 + do_it eq + biceq xl, xl, r3, lsr #31 + RETLDM "r4, r5, r6" + + @ One or both arguments are denormalized. + @ Scale them leftwards and preserve sign bit. +LSYM(Lml_d): + teq r4, #0 + bne 2f + and r6, xh, #0x80000000 +1: movs xl, xl, lsl #1 + adc xh, xh, xh + tst xh, #0x00100000 + do_it eq + subeq r4, r4, #1 + beq 1b + orr xh, xh, r6 + teq r5, #0 + do_it ne + RETc(ne) +2: and r6, yh, #0x80000000 +3: movs yl, yl, lsl #1 + adc yh, yh, yh + tst yh, #0x00100000 + do_it eq + subeq r5, r5, #1 + beq 3b + orr yh, yh, r6 + RET + +LSYM(Lml_s): + @ Isolate the INF and NAN cases away + teq r4, ip + and r5, ip, yh, lsr #20 + do_it ne + teqne r5, ip + beq 1f + + @ Here, one or more arguments are either denormalized or zero. + orrs r6, xl, xh, lsl #1 + do_it ne + COND(orr,s,ne) r6, yl, yh, lsl #1 + bne LSYM(Lml_d) + + @ Result is 0, but determine sign anyway. +LSYM(Lml_z): + eor xh, xh, yh + and xh, xh, #0x80000000 + mov xl, #0 + RETLDM "r4, r5, r6" + +1: @ One or both args are INF or NAN. + orrs r6, xl, xh, lsl #1 + do_it eq, te + moveq xl, yl + moveq xh, yh + COND(orr,s,ne) r6, yl, yh, lsl #1 + beq LSYM(Lml_n) @ 0 * INF or INF * 0 -> NAN + teq r4, ip + bne 1f + orrs r6, xl, xh, lsl #12 + bne LSYM(Lml_n) @ NAN * -> NAN +1: teq r5, ip + bne LSYM(Lml_i) + orrs r6, yl, yh, lsl #12 + do_it ne, t + movne xl, yl + movne xh, yh + bne LSYM(Lml_n) @ * NAN -> NAN + + @ Result is INF, but we need to determine its sign. +LSYM(Lml_i): + eor xh, xh, yh + + @ Overflow: return INF (sign already in xh). +LSYM(Lml_o): + and xh, xh, #0x80000000 + orr xh, xh, #0x7f000000 + orr xh, xh, #0x00f00000 + mov xl, #0 + RETLDM "r4, r5, r6" + + @ Return a quiet NAN. +LSYM(Lml_n): + orr xh, xh, #0x7f000000 + orr xh, xh, #0x00f80000 + RETLDM "r4, r5, r6" + + FUNC_END aeabi_dmul + FUNC_END muldf3 + +ARM_FUNC_START divdf3 +ARM_FUNC_ALIAS aeabi_ddiv divdf3 + + do_push {r4, r5, r6, lr} + + @ Mask out exponents, trap any zero/denormal/INF/NAN. + mov ip, #0xff + orr ip, ip, #0x700 + ands r4, ip, xh, lsr #20 + do_it ne, tte + COND(and,s,ne) r5, ip, yh, lsr #20 + teqne r4, ip + teqne r5, ip + bleq LSYM(Ldv_s) + + @ Substract divisor exponent from dividend''s. + sub r4, r4, r5 + + @ Preserve final sign into lr. + eor lr, xh, yh + + @ Convert mantissa to unsigned integer. + @ Dividend -> r5-r6, divisor -> yh-yl. + orrs r5, yl, yh, lsl #12 + mov xh, xh, lsl #12 + beq LSYM(Ldv_1) + mov yh, yh, lsl #12 + mov r5, #0x10000000 + orr yh, r5, yh, lsr #4 + orr yh, yh, yl, lsr #24 + mov yl, yl, lsl #8 + orr r5, r5, xh, lsr #4 + orr r5, r5, xl, lsr #24 + mov r6, xl, lsl #8 + + @ Initialize xh with final sign bit. + and xh, lr, #0x80000000 + + @ Ensure result will land to known bit position. + @ Apply exponent bias accordingly. + cmp r5, yh + do_it eq + cmpeq r6, yl + adc r4, r4, #(255 - 2) + add r4, r4, #0x300 + bcs 1f + movs yh, yh, lsr #1 + mov yl, yl, rrx +1: + @ Perform first substraction to align result to a nibble. + subs r6, r6, yl + sbc r5, r5, yh + movs yh, yh, lsr #1 + mov yl, yl, rrx + mov xl, #0x00100000 + mov ip, #0x00080000 + + @ The actual division loop. +1: subs lr, r6, yl + sbcs lr, r5, yh + do_it cs, tt + subcs r6, r6, yl + movcs r5, lr + orrcs xl, xl, ip + movs yh, yh, lsr #1 + mov yl, yl, rrx + subs lr, r6, yl + sbcs lr, r5, yh + do_it cs, tt + subcs r6, r6, yl + movcs r5, lr + orrcs xl, xl, ip, lsr #1 + movs yh, yh, lsr #1 + mov yl, yl, rrx + subs lr, r6, yl + sbcs lr, r5, yh + do_it cs, tt + subcs r6, r6, yl + movcs r5, lr + orrcs xl, xl, ip, lsr #2 + movs yh, yh, lsr #1 + mov yl, yl, rrx + subs lr, r6, yl + sbcs lr, r5, yh + do_it cs, tt + subcs r6, r6, yl + movcs r5, lr + orrcs xl, xl, ip, lsr #3 + + orrs lr, r5, r6 + beq 2f + mov r5, r5, lsl #4 + orr r5, r5, r6, lsr #28 + mov r6, r6, lsl #4 + mov yh, yh, lsl #3 + orr yh, yh, yl, lsr #29 + mov yl, yl, lsl #3 + movs ip, ip, lsr #4 + bne 1b + + @ We are done with a word of the result. + @ Loop again for the low word if this pass was for the high word. + tst xh, #0x00100000 + bne 3f + orr xh, xh, xl + mov xl, #0 + mov ip, #0x80000000 + b 1b +2: + @ Be sure result starts in the high word. + tst xh, #0x00100000 + do_it eq, t + orreq xh, xh, xl + moveq xl, #0 +3: + @ Check exponent range for under/overflow. + subs ip, r4, #(254 - 1) + do_it hi + cmphi ip, #0x700 + bhi LSYM(Lml_u) + + @ Round the result, merge final exponent. + subs ip, r5, yh + do_it eq, t + COND(sub,s,eq) ip, r6, yl + COND(mov,s,eq) ip, xl, lsr #1 + adcs xl, xl, #0 + adc xh, xh, r4, lsl #20 + RETLDM "r4, r5, r6" + + @ Division by 0x1p*: shortcut a lot of code. +LSYM(Ldv_1): + and lr, lr, #0x80000000 + orr xh, lr, xh, lsr #12 + adds r4, r4, ip, lsr #1 + do_it gt, tt + COND(rsb,s,gt) r5, r4, ip + orrgt xh, xh, r4, lsl #20 + RETLDM "r4, r5, r6" gt + + orr xh, xh, #0x00100000 + mov lr, #0 + subs r4, r4, #1 + b LSYM(Lml_u) + + @ Result mightt need to be denormalized: put remainder bits + @ in lr for rounding considerations. +LSYM(Ldv_u): + orr lr, r5, r6 + b LSYM(Lml_u) + + @ One or both arguments is either INF, NAN or zero. +LSYM(Ldv_s): + and r5, ip, yh, lsr #20 + teq r4, ip + do_it eq + teqeq r5, ip + beq LSYM(Lml_n) @ INF/NAN / INF/NAN -> NAN + teq r4, ip + bne 1f + orrs r4, xl, xh, lsl #12 + bne LSYM(Lml_n) @ NAN / -> NAN + teq r5, ip + bne LSYM(Lml_i) @ INF / -> INF + mov xl, yl + mov xh, yh + b LSYM(Lml_n) @ INF / (INF or NAN) -> NAN +1: teq r5, ip + bne 2f + orrs r5, yl, yh, lsl #12 + beq LSYM(Lml_z) @ / INF -> 0 + mov xl, yl + mov xh, yh + b LSYM(Lml_n) @ / NAN -> NAN +2: @ If both are nonzero, we need to normalize and resume above. + orrs r6, xl, xh, lsl #1 + do_it ne + COND(orr,s,ne) r6, yl, yh, lsl #1 + bne LSYM(Lml_d) + @ One or both arguments are 0. + orrs r4, xl, xh, lsl #1 + bne LSYM(Lml_i) @ / 0 -> INF + orrs r5, yl, yh, lsl #1 + bne LSYM(Lml_z) @ 0 / -> 0 + b LSYM(Lml_n) @ 0 / 0 -> NAN + + FUNC_END aeabi_ddiv + FUNC_END divdf3 + +#endif /* L_muldivdf3 */ + +#ifdef L_arm_cmpdf2 + +@ Note: only r0 (return value) and ip are clobbered here. + +ARM_FUNC_START gtdf2 +ARM_FUNC_ALIAS gedf2 gtdf2 + mov ip, #-1 + b 1f + +ARM_FUNC_START ltdf2 +ARM_FUNC_ALIAS ledf2 ltdf2 + mov ip, #1 + b 1f + +ARM_FUNC_START cmpdf2 +ARM_FUNC_ALIAS nedf2 cmpdf2 +ARM_FUNC_ALIAS eqdf2 cmpdf2 + mov ip, #1 @ how should we specify unordered here? + +1: str ip, [sp, #-4]! + + @ Trap any INF/NAN first. + mov ip, xh, lsl #1 + mvns ip, ip, asr #21 + mov ip, yh, lsl #1 + do_it ne + COND(mvn,s,ne) ip, ip, asr #21 + beq 3f + + @ Test for equality. + @ Note that 0.0 is equal to -0.0. +2: add sp, sp, #4 + orrs ip, xl, xh, lsl #1 @ if x == 0.0 or -0.0 + do_it eq, e + COND(orr,s,eq) ip, yl, yh, lsl #1 @ and y == 0.0 or -0.0 + teqne xh, yh @ or xh == yh + do_it eq, tt + teqeq xl, yl @ and xl == yl + moveq r0, #0 @ then equal. + RETc(eq) + + @ Clear C flag + cmn r0, #0 + + @ Compare sign, + teq xh, yh + + @ Compare values if same sign + do_it pl + cmppl xh, yh + do_it eq + cmpeq xl, yl + + @ Result: + do_it cs, e + movcs r0, yh, asr #31 + mvncc r0, yh, asr #31 + orr r0, r0, #1 + RET + + @ Look for a NAN. +3: mov ip, xh, lsl #1 + mvns ip, ip, asr #21 + bne 4f + orrs ip, xl, xh, lsl #12 + bne 5f @ x is NAN +4: mov ip, yh, lsl #1 + mvns ip, ip, asr #21 + bne 2b + orrs ip, yl, yh, lsl #12 + beq 2b @ y is not NAN +5: ldr r0, [sp], #4 @ unordered return code + RET + + FUNC_END gedf2 + FUNC_END gtdf2 + FUNC_END ledf2 + FUNC_END ltdf2 + FUNC_END nedf2 + FUNC_END eqdf2 + FUNC_END cmpdf2 + +ARM_FUNC_START aeabi_cdrcmple + + mov ip, r0 + mov r0, r2 + mov r2, ip + mov ip, r1 + mov r1, r3 + mov r3, ip + b 6f + +ARM_FUNC_START aeabi_cdcmpeq +ARM_FUNC_ALIAS aeabi_cdcmple aeabi_cdcmpeq + + @ The status-returning routines are required to preserve all + @ registers except ip, lr, and cpsr. +6: do_push {r0, lr} + ARM_CALL cmpdf2 + @ Set the Z flag correctly, and the C flag unconditionally. + cmp r0, #0 + @ Clear the C flag if the return value was -1, indicating + @ that the first operand was smaller than the second. + do_it mi + cmnmi r0, #0 + RETLDM "r0" + + FUNC_END aeabi_cdcmple + FUNC_END aeabi_cdcmpeq + FUNC_END aeabi_cdrcmple + +ARM_FUNC_START aeabi_dcmpeq + + str lr, [sp, #-8]! + ARM_CALL aeabi_cdcmple + do_it eq, e + moveq r0, #1 @ Equal to. + movne r0, #0 @ Less than, greater than, or unordered. + RETLDM + + FUNC_END aeabi_dcmpeq + +ARM_FUNC_START aeabi_dcmplt + + str lr, [sp, #-8]! + ARM_CALL aeabi_cdcmple + do_it cc, e + movcc r0, #1 @ Less than. + movcs r0, #0 @ Equal to, greater than, or unordered. + RETLDM + + FUNC_END aeabi_dcmplt + +ARM_FUNC_START aeabi_dcmple + + str lr, [sp, #-8]! + ARM_CALL aeabi_cdcmple + do_it ls, e + movls r0, #1 @ Less than or equal to. + movhi r0, #0 @ Greater than or unordered. + RETLDM + + FUNC_END aeabi_dcmple + +ARM_FUNC_START aeabi_dcmpge + + str lr, [sp, #-8]! + ARM_CALL aeabi_cdrcmple + do_it ls, e + movls r0, #1 @ Operand 2 is less than or equal to operand 1. + movhi r0, #0 @ Operand 2 greater than operand 1, or unordered. + RETLDM + + FUNC_END aeabi_dcmpge + +ARM_FUNC_START aeabi_dcmpgt + + str lr, [sp, #-8]! + ARM_CALL aeabi_cdrcmple + do_it cc, e + movcc r0, #1 @ Operand 2 is less than operand 1. + movcs r0, #0 @ Operand 2 is greater than or equal to operand 1, + @ or they are unordered. + RETLDM + + FUNC_END aeabi_dcmpgt + +#endif /* L_cmpdf2 */ + +#ifdef L_arm_unorddf2 + +ARM_FUNC_START unorddf2 +ARM_FUNC_ALIAS aeabi_dcmpun unorddf2 + + mov ip, xh, lsl #1 + mvns ip, ip, asr #21 + bne 1f + orrs ip, xl, xh, lsl #12 + bne 3f @ x is NAN +1: mov ip, yh, lsl #1 + mvns ip, ip, asr #21 + bne 2f + orrs ip, yl, yh, lsl #12 + bne 3f @ y is NAN +2: mov r0, #0 @ arguments are ordered. + RET + +3: mov r0, #1 @ arguments are unordered. + RET + + FUNC_END aeabi_dcmpun + FUNC_END unorddf2 + +#endif /* L_unorddf2 */ + +#ifdef L_arm_fixdfsi + +ARM_FUNC_START fixdfsi +ARM_FUNC_ALIAS aeabi_d2iz fixdfsi + + @ check exponent range. + mov r2, xh, lsl #1 + adds r2, r2, #(1 << 21) + bcs 2f @ value is INF or NAN + bpl 1f @ value is too small + mov r3, #(0xfffffc00 + 31) + subs r2, r3, r2, asr #21 + bls 3f @ value is too large + + @ scale value + mov r3, xh, lsl #11 + orr r3, r3, #0x80000000 + orr r3, r3, xl, lsr #21 + tst xh, #0x80000000 @ the sign bit + shift1 lsr, r0, r3, r2 + do_it ne + rsbne r0, r0, #0 + RET + +1: mov r0, #0 + RET + +2: orrs xl, xl, xh, lsl #12 + bne 4f @ x is NAN. +3: ands r0, xh, #0x80000000 @ the sign bit + do_it eq + moveq r0, #0x7fffffff @ maximum signed positive si + RET + +4: mov r0, #0 @ How should we convert NAN? + RET + + FUNC_END aeabi_d2iz + FUNC_END fixdfsi + +#endif /* L_fixdfsi */ + +#ifdef L_arm_fixunsdfsi + +ARM_FUNC_START fixunsdfsi +ARM_FUNC_ALIAS aeabi_d2uiz fixunsdfsi + + @ check exponent range. + movs r2, xh, lsl #1 + bcs 1f @ value is negative + adds r2, r2, #(1 << 21) + bcs 2f @ value is INF or NAN + bpl 1f @ value is too small + mov r3, #(0xfffffc00 + 31) + subs r2, r3, r2, asr #21 + bmi 3f @ value is too large + + @ scale value + mov r3, xh, lsl #11 + orr r3, r3, #0x80000000 + orr r3, r3, xl, lsr #21 + shift1 lsr, r0, r3, r2 + RET + +1: mov r0, #0 + RET + +2: orrs xl, xl, xh, lsl #12 + bne 4f @ value is NAN. +3: mov r0, #0xffffffff @ maximum unsigned si + RET + +4: mov r0, #0 @ How should we convert NAN? + RET + + FUNC_END aeabi_d2uiz + FUNC_END fixunsdfsi + +#endif /* L_fixunsdfsi */ + +#ifdef L_arm_truncdfsf2 + +ARM_FUNC_START truncdfsf2 +ARM_FUNC_ALIAS aeabi_d2f truncdfsf2 + + @ check exponent range. + mov r2, xh, lsl #1 + subs r3, r2, #((1023 - 127) << 21) + do_it cs, t + COND(sub,s,cs) ip, r3, #(1 << 21) + COND(rsb,s,cs) ip, ip, #(254 << 21) + bls 2f @ value is out of range + +1: @ shift and round mantissa + and ip, xh, #0x80000000 + mov r2, xl, lsl #3 + orr xl, ip, xl, lsr #29 + cmp r2, #0x80000000 + adc r0, xl, r3, lsl #2 + do_it eq + biceq r0, r0, #1 + RET + +2: @ either overflow or underflow + tst xh, #0x40000000 + bne 3f @ overflow + + @ check if denormalized value is possible + adds r2, r3, #(23 << 21) + do_it lt, t + andlt r0, xh, #0x80000000 @ too small, return signed 0. + RETc(lt) + + @ denormalize value so we can resume with the code above afterwards. + orr xh, xh, #0x00100000 + mov r2, r2, lsr #21 + rsb r2, r2, #24 + rsb ip, r2, #32 +#if defined(__thumb2__) + lsls r3, xl, ip +#else + movs r3, xl, lsl ip +#endif + shift1 lsr, xl, xl, r2 + do_it ne + orrne xl, xl, #1 @ fold r3 for rounding considerations. + mov r3, xh, lsl #11 + mov r3, r3, lsr #11 + shiftop orr xl xl r3 lsl ip ip + shift1 lsr, r3, r3, r2 + mov r3, r3, lsl #1 + b 1b + +3: @ chech for NAN + mvns r3, r2, asr #21 + bne 5f @ simple overflow + orrs r3, xl, xh, lsl #12 + do_it ne, tt + movne r0, #0x7f000000 + orrne r0, r0, #0x00c00000 + RETc(ne) @ return NAN + +5: @ return INF with sign + and r0, xh, #0x80000000 + orr r0, r0, #0x7f000000 + orr r0, r0, #0x00800000 + RET + + FUNC_END aeabi_d2f + FUNC_END truncdfsf2 + +#endif /* L_truncdfsf2 */ diff --git a/gcc/config/arm/ieee754-sf.S b/gcc/config/arm/ieee754-sf.S new file mode 100644 index 000000000..c93f66d8f --- /dev/null +++ b/gcc/config/arm/ieee754-sf.S @@ -0,0 +1,1060 @@ +/* ieee754-sf.S single-precision floating point support for ARM + + Copyright (C) 2003, 2004, 2005, 2007, 2008, 2009 Free Software Foundation, Inc. + Contributed by Nicolas Pitre (nico@cam.org) + + This file is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by the + Free Software Foundation; either version 3, or (at your option) any + later version. + + This file is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +/* + * Notes: + * + * The goal of this code is to be as fast as possible. This is + * not meant to be easy to understand for the casual reader. + * + * Only the default rounding mode is intended for best performances. + * Exceptions aren't supported yet, but that can be added quite easily + * if necessary without impacting performances. + */ + +#ifdef L_arm_negsf2 + +ARM_FUNC_START negsf2 +ARM_FUNC_ALIAS aeabi_fneg negsf2 + + eor r0, r0, #0x80000000 @ flip sign bit + RET + + FUNC_END aeabi_fneg + FUNC_END negsf2 + +#endif + +#ifdef L_arm_addsubsf3 + +ARM_FUNC_START aeabi_frsub + + eor r0, r0, #0x80000000 @ flip sign bit of first arg + b 1f + +ARM_FUNC_START subsf3 +ARM_FUNC_ALIAS aeabi_fsub subsf3 + + eor r1, r1, #0x80000000 @ flip sign bit of second arg +#if defined(__INTERWORKING_STUBS__) + b 1f @ Skip Thumb-code prologue +#endif + +ARM_FUNC_START addsf3 +ARM_FUNC_ALIAS aeabi_fadd addsf3 + +1: @ Look for zeroes, equal values, INF, or NAN. + movs r2, r0, lsl #1 + do_it ne, ttt + COND(mov,s,ne) r3, r1, lsl #1 + teqne r2, r3 + COND(mvn,s,ne) ip, r2, asr #24 + COND(mvn,s,ne) ip, r3, asr #24 + beq LSYM(Lad_s) + + @ Compute exponent difference. Make largest exponent in r2, + @ corresponding arg in r0, and positive exponent difference in r3. + mov r2, r2, lsr #24 + rsbs r3, r2, r3, lsr #24 + do_it gt, ttt + addgt r2, r2, r3 + eorgt r1, r0, r1 + eorgt r0, r1, r0 + eorgt r1, r0, r1 + do_it lt + rsblt r3, r3, #0 + + @ If exponent difference is too large, return largest argument + @ already in r0. We need up to 25 bit to handle proper rounding + @ of 0x1p25 - 1.1. + cmp r3, #25 + do_it hi + RETc(hi) + + @ Convert mantissa to signed integer. + tst r0, #0x80000000 + orr r0, r0, #0x00800000 + bic r0, r0, #0xff000000 + do_it ne + rsbne r0, r0, #0 + tst r1, #0x80000000 + orr r1, r1, #0x00800000 + bic r1, r1, #0xff000000 + do_it ne + rsbne r1, r1, #0 + + @ If exponent == difference, one or both args were denormalized. + @ Since this is not common case, rescale them off line. + teq r2, r3 + beq LSYM(Lad_d) +LSYM(Lad_x): + + @ Compensate for the exponent overlapping the mantissa MSB added later + sub r2, r2, #1 + + @ Shift and add second arg to first arg in r0. + @ Keep leftover bits into r1. + shiftop adds r0 r0 r1 asr r3 ip + rsb r3, r3, #32 + shift1 lsl, r1, r1, r3 + + @ Keep absolute value in r0-r1, sign in r3 (the n bit was set above) + and r3, r0, #0x80000000 + bpl LSYM(Lad_p) +#if defined(__thumb2__) + negs r1, r1 + sbc r0, r0, r0, lsl #1 +#else + rsbs r1, r1, #0 + rsc r0, r0, #0 +#endif + + @ Determine how to normalize the result. +LSYM(Lad_p): + cmp r0, #0x00800000 + bcc LSYM(Lad_a) + cmp r0, #0x01000000 + bcc LSYM(Lad_e) + + @ Result needs to be shifted right. + movs r0, r0, lsr #1 + mov r1, r1, rrx + add r2, r2, #1 + + @ Make sure we did not bust our exponent. + cmp r2, #254 + bhs LSYM(Lad_o) + + @ Our result is now properly aligned into r0, remaining bits in r1. + @ Pack final result together. + @ Round with MSB of r1. If halfway between two numbers, round towards + @ LSB of r0 = 0. +LSYM(Lad_e): + cmp r1, #0x80000000 + adc r0, r0, r2, lsl #23 + do_it eq + biceq r0, r0, #1 + orr r0, r0, r3 + RET + + @ Result must be shifted left and exponent adjusted. +LSYM(Lad_a): + movs r1, r1, lsl #1 + adc r0, r0, r0 + tst r0, #0x00800000 + sub r2, r2, #1 + bne LSYM(Lad_e) + + @ No rounding necessary since r1 will always be 0 at this point. +LSYM(Lad_l): + +#if __ARM_ARCH__ < 5 + + movs ip, r0, lsr #12 + moveq r0, r0, lsl #12 + subeq r2, r2, #12 + tst r0, #0x00ff0000 + moveq r0, r0, lsl #8 + subeq r2, r2, #8 + tst r0, #0x00f00000 + moveq r0, r0, lsl #4 + subeq r2, r2, #4 + tst r0, #0x00c00000 + moveq r0, r0, lsl #2 + subeq r2, r2, #2 + cmp r0, #0x00800000 + movcc r0, r0, lsl #1 + sbcs r2, r2, #0 + +#else + + clz ip, r0 + sub ip, ip, #8 + subs r2, r2, ip + shift1 lsl, r0, r0, ip + +#endif + + @ Final result with sign + @ If exponent negative, denormalize result. + do_it ge, et + addge r0, r0, r2, lsl #23 + rsblt r2, r2, #0 + orrge r0, r0, r3 +#if defined(__thumb2__) + do_it lt, t + lsrlt r0, r0, r2 + orrlt r0, r3, r0 +#else + orrlt r0, r3, r0, lsr r2 +#endif + RET + + @ Fixup and adjust bit position for denormalized arguments. + @ Note that r2 must not remain equal to 0. +LSYM(Lad_d): + teq r2, #0 + eor r1, r1, #0x00800000 + do_it eq, te + eoreq r0, r0, #0x00800000 + addeq r2, r2, #1 + subne r3, r3, #1 + b LSYM(Lad_x) + +LSYM(Lad_s): + mov r3, r1, lsl #1 + + mvns ip, r2, asr #24 + do_it ne + COND(mvn,s,ne) ip, r3, asr #24 + beq LSYM(Lad_i) + + teq r2, r3 + beq 1f + + @ Result is x + 0.0 = x or 0.0 + y = y. + teq r2, #0 + do_it eq + moveq r0, r1 + RET + +1: teq r0, r1 + + @ Result is x - x = 0. + do_it ne, t + movne r0, #0 + RETc(ne) + + @ Result is x + x = 2x. + tst r2, #0xff000000 + bne 2f + movs r0, r0, lsl #1 + do_it cs + orrcs r0, r0, #0x80000000 + RET +2: adds r2, r2, #(2 << 24) + do_it cc, t + addcc r0, r0, #(1 << 23) + RETc(cc) + and r3, r0, #0x80000000 + + @ Overflow: return INF. +LSYM(Lad_o): + orr r0, r3, #0x7f000000 + orr r0, r0, #0x00800000 + RET + + @ At least one of r0/r1 is INF/NAN. + @ if r0 != INF/NAN: return r1 (which is INF/NAN) + @ if r1 != INF/NAN: return r0 (which is INF/NAN) + @ if r0 or r1 is NAN: return NAN + @ if opposite sign: return NAN + @ otherwise return r0 (which is INF or -INF) +LSYM(Lad_i): + mvns r2, r2, asr #24 + do_it ne, et + movne r0, r1 + COND(mvn,s,eq) r3, r3, asr #24 + movne r1, r0 + movs r2, r0, lsl #9 + do_it eq, te + COND(mov,s,eq) r3, r1, lsl #9 + teqeq r0, r1 + orrne r0, r0, #0x00400000 @ quiet NAN + RET + + FUNC_END aeabi_frsub + FUNC_END aeabi_fadd + FUNC_END addsf3 + FUNC_END aeabi_fsub + FUNC_END subsf3 + +ARM_FUNC_START floatunsisf +ARM_FUNC_ALIAS aeabi_ui2f floatunsisf + + mov r3, #0 + b 1f + +ARM_FUNC_START floatsisf +ARM_FUNC_ALIAS aeabi_i2f floatsisf + + ands r3, r0, #0x80000000 + do_it mi + rsbmi r0, r0, #0 + +1: movs ip, r0 + do_it eq + RETc(eq) + + @ Add initial exponent to sign + orr r3, r3, #((127 + 23) << 23) + + .ifnc ah, r0 + mov ah, r0 + .endif + mov al, #0 + b 2f + + FUNC_END aeabi_i2f + FUNC_END floatsisf + FUNC_END aeabi_ui2f + FUNC_END floatunsisf + +ARM_FUNC_START floatundisf +ARM_FUNC_ALIAS aeabi_ul2f floatundisf + + orrs r2, r0, r1 +#if !defined (__VFP_FP__) && !defined(__SOFTFP__) + do_it eq, t + mvfeqs f0, #0.0 +#else + do_it eq +#endif + RETc(eq) + + mov r3, #0 + b 1f + +ARM_FUNC_START floatdisf +ARM_FUNC_ALIAS aeabi_l2f floatdisf + + orrs r2, r0, r1 +#if !defined (__VFP_FP__) && !defined(__SOFTFP__) + do_it eq, t + mvfeqs f0, #0.0 +#else + do_it eq +#endif + RETc(eq) + + ands r3, ah, #0x80000000 @ sign bit in r3 + bpl 1f +#if defined(__thumb2__) + negs al, al + sbc ah, ah, ah, lsl #1 +#else + rsbs al, al, #0 + rsc ah, ah, #0 +#endif +1: +#if !defined (__VFP_FP__) && !defined(__SOFTFP__) + @ For hard FPA code we want to return via the tail below so that + @ we can return the result in f0 as well as in r0 for backwards + @ compatibility. + str lr, [sp, #-8]! + adr lr, LSYM(f0_ret) +#endif + + movs ip, ah + do_it eq, tt + moveq ip, al + moveq ah, al + moveq al, #0 + + @ Add initial exponent to sign + orr r3, r3, #((127 + 23 + 32) << 23) + do_it eq + subeq r3, r3, #(32 << 23) +2: sub r3, r3, #(1 << 23) + +#if __ARM_ARCH__ < 5 + + mov r2, #23 + cmp ip, #(1 << 16) + do_it hs, t + movhs ip, ip, lsr #16 + subhs r2, r2, #16 + cmp ip, #(1 << 8) + do_it hs, t + movhs ip, ip, lsr #8 + subhs r2, r2, #8 + cmp ip, #(1 << 4) + do_it hs, t + movhs ip, ip, lsr #4 + subhs r2, r2, #4 + cmp ip, #(1 << 2) + do_it hs, e + subhs r2, r2, #2 + sublo r2, r2, ip, lsr #1 + subs r2, r2, ip, lsr #3 + +#else + + clz r2, ip + subs r2, r2, #8 + +#endif + + sub r3, r3, r2, lsl #23 + blt 3f + + shiftop add r3 r3 ah lsl r2 ip + shift1 lsl, ip, al, r2 + rsb r2, r2, #32 + cmp ip, #0x80000000 + shiftop adc r0 r3 al lsr r2 r2 + do_it eq + biceq r0, r0, #1 + RET + +3: add r2, r2, #32 + shift1 lsl, ip, ah, r2 + rsb r2, r2, #32 + orrs al, al, ip, lsl #1 + shiftop adc r0 r3 ah lsr r2 r2 + do_it eq + biceq r0, r0, ip, lsr #31 + RET + +#if !defined (__VFP_FP__) && !defined(__SOFTFP__) + +LSYM(f0_ret): + str r0, [sp, #-4]! + ldfs f0, [sp], #4 + RETLDM + +#endif + + FUNC_END floatdisf + FUNC_END aeabi_l2f + FUNC_END floatundisf + FUNC_END aeabi_ul2f + +#endif /* L_addsubsf3 */ + +#ifdef L_arm_muldivsf3 + +ARM_FUNC_START mulsf3 +ARM_FUNC_ALIAS aeabi_fmul mulsf3 + + @ Mask out exponents, trap any zero/denormal/INF/NAN. + mov ip, #0xff + ands r2, ip, r0, lsr #23 + do_it ne, tt + COND(and,s,ne) r3, ip, r1, lsr #23 + teqne r2, ip + teqne r3, ip + beq LSYM(Lml_s) +LSYM(Lml_x): + + @ Add exponents together + add r2, r2, r3 + + @ Determine final sign. + eor ip, r0, r1 + + @ Convert mantissa to unsigned integer. + @ If power of two, branch to a separate path. + @ Make up for final alignment. + movs r0, r0, lsl #9 + do_it ne + COND(mov,s,ne) r1, r1, lsl #9 + beq LSYM(Lml_1) + mov r3, #0x08000000 + orr r0, r3, r0, lsr #5 + orr r1, r3, r1, lsr #5 + +#if __ARM_ARCH__ < 4 + + @ Put sign bit in r3, which will be restored into r0 later. + and r3, ip, #0x80000000 + + @ Well, no way to make it shorter without the umull instruction. + do_push {r3, r4, r5} + mov r4, r0, lsr #16 + mov r5, r1, lsr #16 + bic r0, r0, r4, lsl #16 + bic r1, r1, r5, lsl #16 + mul ip, r4, r5 + mul r3, r0, r1 + mul r0, r5, r0 + mla r0, r4, r1, r0 + adds r3, r3, r0, lsl #16 + adc r1, ip, r0, lsr #16 + do_pop {r0, r4, r5} + +#else + + @ The actual multiplication. + umull r3, r1, r0, r1 + + @ Put final sign in r0. + and r0, ip, #0x80000000 + +#endif + + @ Adjust result upon the MSB position. + cmp r1, #(1 << 23) + do_it cc, tt + movcc r1, r1, lsl #1 + orrcc r1, r1, r3, lsr #31 + movcc r3, r3, lsl #1 + + @ Add sign to result. + orr r0, r0, r1 + + @ Apply exponent bias, check for under/overflow. + sbc r2, r2, #127 + cmp r2, #(254 - 1) + bhi LSYM(Lml_u) + + @ Round the result, merge final exponent. + cmp r3, #0x80000000 + adc r0, r0, r2, lsl #23 + do_it eq + biceq r0, r0, #1 + RET + + @ Multiplication by 0x1p*: let''s shortcut a lot of code. +LSYM(Lml_1): + teq r0, #0 + and ip, ip, #0x80000000 + do_it eq + moveq r1, r1, lsl #9 + orr r0, ip, r0, lsr #9 + orr r0, r0, r1, lsr #9 + subs r2, r2, #127 + do_it gt, tt + COND(rsb,s,gt) r3, r2, #255 + orrgt r0, r0, r2, lsl #23 + RETc(gt) + + @ Under/overflow: fix things up for the code below. + orr r0, r0, #0x00800000 + mov r3, #0 + subs r2, r2, #1 + +LSYM(Lml_u): + @ Overflow? + bgt LSYM(Lml_o) + + @ Check if denormalized result is possible, otherwise return signed 0. + cmn r2, #(24 + 1) + do_it le, t + bicle r0, r0, #0x7fffffff + RETc(le) + + @ Shift value right, round, etc. + rsb r2, r2, #0 + movs r1, r0, lsl #1 + shift1 lsr, r1, r1, r2 + rsb r2, r2, #32 + shift1 lsl, ip, r0, r2 + movs r0, r1, rrx + adc r0, r0, #0 + orrs r3, r3, ip, lsl #1 + do_it eq + biceq r0, r0, ip, lsr #31 + RET + + @ One or both arguments are denormalized. + @ Scale them leftwards and preserve sign bit. +LSYM(Lml_d): + teq r2, #0 + and ip, r0, #0x80000000 +1: do_it eq, tt + moveq r0, r0, lsl #1 + tsteq r0, #0x00800000 + subeq r2, r2, #1 + beq 1b + orr r0, r0, ip + teq r3, #0 + and ip, r1, #0x80000000 +2: do_it eq, tt + moveq r1, r1, lsl #1 + tsteq r1, #0x00800000 + subeq r3, r3, #1 + beq 2b + orr r1, r1, ip + b LSYM(Lml_x) + +LSYM(Lml_s): + @ Isolate the INF and NAN cases away + and r3, ip, r1, lsr #23 + teq r2, ip + do_it ne + teqne r3, ip + beq 1f + + @ Here, one or more arguments are either denormalized or zero. + bics ip, r0, #0x80000000 + do_it ne + COND(bic,s,ne) ip, r1, #0x80000000 + bne LSYM(Lml_d) + + @ Result is 0, but determine sign anyway. +LSYM(Lml_z): + eor r0, r0, r1 + bic r0, r0, #0x7fffffff + RET + +1: @ One or both args are INF or NAN. + teq r0, #0x0 + do_it ne, ett + teqne r0, #0x80000000 + moveq r0, r1 + teqne r1, #0x0 + teqne r1, #0x80000000 + beq LSYM(Lml_n) @ 0 * INF or INF * 0 -> NAN + teq r2, ip + bne 1f + movs r2, r0, lsl #9 + bne LSYM(Lml_n) @ NAN * -> NAN +1: teq r3, ip + bne LSYM(Lml_i) + movs r3, r1, lsl #9 + do_it ne + movne r0, r1 + bne LSYM(Lml_n) @ * NAN -> NAN + + @ Result is INF, but we need to determine its sign. +LSYM(Lml_i): + eor r0, r0, r1 + + @ Overflow: return INF (sign already in r0). +LSYM(Lml_o): + and r0, r0, #0x80000000 + orr r0, r0, #0x7f000000 + orr r0, r0, #0x00800000 + RET + + @ Return a quiet NAN. +LSYM(Lml_n): + orr r0, r0, #0x7f000000 + orr r0, r0, #0x00c00000 + RET + + FUNC_END aeabi_fmul + FUNC_END mulsf3 + +ARM_FUNC_START divsf3 +ARM_FUNC_ALIAS aeabi_fdiv divsf3 + + @ Mask out exponents, trap any zero/denormal/INF/NAN. + mov ip, #0xff + ands r2, ip, r0, lsr #23 + do_it ne, tt + COND(and,s,ne) r3, ip, r1, lsr #23 + teqne r2, ip + teqne r3, ip + beq LSYM(Ldv_s) +LSYM(Ldv_x): + + @ Substract divisor exponent from dividend''s + sub r2, r2, r3 + + @ Preserve final sign into ip. + eor ip, r0, r1 + + @ Convert mantissa to unsigned integer. + @ Dividend -> r3, divisor -> r1. + movs r1, r1, lsl #9 + mov r0, r0, lsl #9 + beq LSYM(Ldv_1) + mov r3, #0x10000000 + orr r1, r3, r1, lsr #4 + orr r3, r3, r0, lsr #4 + + @ Initialize r0 (result) with final sign bit. + and r0, ip, #0x80000000 + + @ Ensure result will land to known bit position. + @ Apply exponent bias accordingly. + cmp r3, r1 + do_it cc + movcc r3, r3, lsl #1 + adc r2, r2, #(127 - 2) + + @ The actual division loop. + mov ip, #0x00800000 +1: cmp r3, r1 + do_it cs, t + subcs r3, r3, r1 + orrcs r0, r0, ip + cmp r3, r1, lsr #1 + do_it cs, t + subcs r3, r3, r1, lsr #1 + orrcs r0, r0, ip, lsr #1 + cmp r3, r1, lsr #2 + do_it cs, t + subcs r3, r3, r1, lsr #2 + orrcs r0, r0, ip, lsr #2 + cmp r3, r1, lsr #3 + do_it cs, t + subcs r3, r3, r1, lsr #3 + orrcs r0, r0, ip, lsr #3 + movs r3, r3, lsl #4 + do_it ne + COND(mov,s,ne) ip, ip, lsr #4 + bne 1b + + @ Check exponent for under/overflow. + cmp r2, #(254 - 1) + bhi LSYM(Lml_u) + + @ Round the result, merge final exponent. + cmp r3, r1 + adc r0, r0, r2, lsl #23 + do_it eq + biceq r0, r0, #1 + RET + + @ Division by 0x1p*: let''s shortcut a lot of code. +LSYM(Ldv_1): + and ip, ip, #0x80000000 + orr r0, ip, r0, lsr #9 + adds r2, r2, #127 + do_it gt, tt + COND(rsb,s,gt) r3, r2, #255 + orrgt r0, r0, r2, lsl #23 + RETc(gt) + + orr r0, r0, #0x00800000 + mov r3, #0 + subs r2, r2, #1 + b LSYM(Lml_u) + + @ One or both arguments are denormalized. + @ Scale them leftwards and preserve sign bit. +LSYM(Ldv_d): + teq r2, #0 + and ip, r0, #0x80000000 +1: do_it eq, tt + moveq r0, r0, lsl #1 + tsteq r0, #0x00800000 + subeq r2, r2, #1 + beq 1b + orr r0, r0, ip + teq r3, #0 + and ip, r1, #0x80000000 +2: do_it eq, tt + moveq r1, r1, lsl #1 + tsteq r1, #0x00800000 + subeq r3, r3, #1 + beq 2b + orr r1, r1, ip + b LSYM(Ldv_x) + + @ One or both arguments are either INF, NAN, zero or denormalized. +LSYM(Ldv_s): + and r3, ip, r1, lsr #23 + teq r2, ip + bne 1f + movs r2, r0, lsl #9 + bne LSYM(Lml_n) @ NAN / -> NAN + teq r3, ip + bne LSYM(Lml_i) @ INF / -> INF + mov r0, r1 + b LSYM(Lml_n) @ INF / (INF or NAN) -> NAN +1: teq r3, ip + bne 2f + movs r3, r1, lsl #9 + beq LSYM(Lml_z) @ / INF -> 0 + mov r0, r1 + b LSYM(Lml_n) @ / NAN -> NAN +2: @ If both are nonzero, we need to normalize and resume above. + bics ip, r0, #0x80000000 + do_it ne + COND(bic,s,ne) ip, r1, #0x80000000 + bne LSYM(Ldv_d) + @ One or both arguments are zero. + bics r2, r0, #0x80000000 + bne LSYM(Lml_i) @ / 0 -> INF + bics r3, r1, #0x80000000 + bne LSYM(Lml_z) @ 0 / -> 0 + b LSYM(Lml_n) @ 0 / 0 -> NAN + + FUNC_END aeabi_fdiv + FUNC_END divsf3 + +#endif /* L_muldivsf3 */ + +#ifdef L_arm_cmpsf2 + + @ The return value in r0 is + @ + @ 0 if the operands are equal + @ 1 if the first operand is greater than the second, or + @ the operands are unordered and the operation is + @ CMP, LT, LE, NE, or EQ. + @ -1 if the first operand is less than the second, or + @ the operands are unordered and the operation is GT + @ or GE. + @ + @ The Z flag will be set iff the operands are equal. + @ + @ The following registers are clobbered by this function: + @ ip, r0, r1, r2, r3 + +ARM_FUNC_START gtsf2 +ARM_FUNC_ALIAS gesf2 gtsf2 + mov ip, #-1 + b 1f + +ARM_FUNC_START ltsf2 +ARM_FUNC_ALIAS lesf2 ltsf2 + mov ip, #1 + b 1f + +ARM_FUNC_START cmpsf2 +ARM_FUNC_ALIAS nesf2 cmpsf2 +ARM_FUNC_ALIAS eqsf2 cmpsf2 + mov ip, #1 @ how should we specify unordered here? + +1: str ip, [sp, #-4]! + + @ Trap any INF/NAN first. + mov r2, r0, lsl #1 + mov r3, r1, lsl #1 + mvns ip, r2, asr #24 + do_it ne + COND(mvn,s,ne) ip, r3, asr #24 + beq 3f + + @ Compare values. + @ Note that 0.0 is equal to -0.0. +2: add sp, sp, #4 + orrs ip, r2, r3, lsr #1 @ test if both are 0, clear C flag + do_it ne + teqne r0, r1 @ if not 0 compare sign + do_it pl + COND(sub,s,pl) r0, r2, r3 @ if same sign compare values, set r0 + + @ Result: + do_it hi + movhi r0, r1, asr #31 + do_it lo + mvnlo r0, r1, asr #31 + do_it ne + orrne r0, r0, #1 + RET + + @ Look for a NAN. +3: mvns ip, r2, asr #24 + bne 4f + movs ip, r0, lsl #9 + bne 5f @ r0 is NAN +4: mvns ip, r3, asr #24 + bne 2b + movs ip, r1, lsl #9 + beq 2b @ r1 is not NAN +5: ldr r0, [sp], #4 @ return unordered code. + RET + + FUNC_END gesf2 + FUNC_END gtsf2 + FUNC_END lesf2 + FUNC_END ltsf2 + FUNC_END nesf2 + FUNC_END eqsf2 + FUNC_END cmpsf2 + +ARM_FUNC_START aeabi_cfrcmple + + mov ip, r0 + mov r0, r1 + mov r1, ip + b 6f + +ARM_FUNC_START aeabi_cfcmpeq +ARM_FUNC_ALIAS aeabi_cfcmple aeabi_cfcmpeq + + @ The status-returning routines are required to preserve all + @ registers except ip, lr, and cpsr. +6: do_push {r0, r1, r2, r3, lr} + ARM_CALL cmpsf2 + @ Set the Z flag correctly, and the C flag unconditionally. + cmp r0, #0 + @ Clear the C flag if the return value was -1, indicating + @ that the first operand was smaller than the second. + do_it mi + cmnmi r0, #0 + RETLDM "r0, r1, r2, r3" + + FUNC_END aeabi_cfcmple + FUNC_END aeabi_cfcmpeq + FUNC_END aeabi_cfrcmple + +ARM_FUNC_START aeabi_fcmpeq + + str lr, [sp, #-8]! + ARM_CALL aeabi_cfcmple + do_it eq, e + moveq r0, #1 @ Equal to. + movne r0, #0 @ Less than, greater than, or unordered. + RETLDM + + FUNC_END aeabi_fcmpeq + +ARM_FUNC_START aeabi_fcmplt + + str lr, [sp, #-8]! + ARM_CALL aeabi_cfcmple + do_it cc, e + movcc r0, #1 @ Less than. + movcs r0, #0 @ Equal to, greater than, or unordered. + RETLDM + + FUNC_END aeabi_fcmplt + +ARM_FUNC_START aeabi_fcmple + + str lr, [sp, #-8]! + ARM_CALL aeabi_cfcmple + do_it ls, e + movls r0, #1 @ Less than or equal to. + movhi r0, #0 @ Greater than or unordered. + RETLDM + + FUNC_END aeabi_fcmple + +ARM_FUNC_START aeabi_fcmpge + + str lr, [sp, #-8]! + ARM_CALL aeabi_cfrcmple + do_it ls, e + movls r0, #1 @ Operand 2 is less than or equal to operand 1. + movhi r0, #0 @ Operand 2 greater than operand 1, or unordered. + RETLDM + + FUNC_END aeabi_fcmpge + +ARM_FUNC_START aeabi_fcmpgt + + str lr, [sp, #-8]! + ARM_CALL aeabi_cfrcmple + do_it cc, e + movcc r0, #1 @ Operand 2 is less than operand 1. + movcs r0, #0 @ Operand 2 is greater than or equal to operand 1, + @ or they are unordered. + RETLDM + + FUNC_END aeabi_fcmpgt + +#endif /* L_cmpsf2 */ + +#ifdef L_arm_unordsf2 + +ARM_FUNC_START unordsf2 +ARM_FUNC_ALIAS aeabi_fcmpun unordsf2 + + mov r2, r0, lsl #1 + mov r3, r1, lsl #1 + mvns ip, r2, asr #24 + bne 1f + movs ip, r0, lsl #9 + bne 3f @ r0 is NAN +1: mvns ip, r3, asr #24 + bne 2f + movs ip, r1, lsl #9 + bne 3f @ r1 is NAN +2: mov r0, #0 @ arguments are ordered. + RET +3: mov r0, #1 @ arguments are unordered. + RET + + FUNC_END aeabi_fcmpun + FUNC_END unordsf2 + +#endif /* L_unordsf2 */ + +#ifdef L_arm_fixsfsi + +ARM_FUNC_START fixsfsi +ARM_FUNC_ALIAS aeabi_f2iz fixsfsi + + @ check exponent range. + mov r2, r0, lsl #1 + cmp r2, #(127 << 24) + bcc 1f @ value is too small + mov r3, #(127 + 31) + subs r2, r3, r2, lsr #24 + bls 2f @ value is too large + + @ scale value + mov r3, r0, lsl #8 + orr r3, r3, #0x80000000 + tst r0, #0x80000000 @ the sign bit + shift1 lsr, r0, r3, r2 + do_it ne + rsbne r0, r0, #0 + RET + +1: mov r0, #0 + RET + +2: cmp r2, #(127 + 31 - 0xff) + bne 3f + movs r2, r0, lsl #9 + bne 4f @ r0 is NAN. +3: ands r0, r0, #0x80000000 @ the sign bit + do_it eq + moveq r0, #0x7fffffff @ the maximum signed positive si + RET + +4: mov r0, #0 @ What should we convert NAN to? + RET + + FUNC_END aeabi_f2iz + FUNC_END fixsfsi + +#endif /* L_fixsfsi */ + +#ifdef L_arm_fixunssfsi + +ARM_FUNC_START fixunssfsi +ARM_FUNC_ALIAS aeabi_f2uiz fixunssfsi + + @ check exponent range. + movs r2, r0, lsl #1 + bcs 1f @ value is negative + cmp r2, #(127 << 24) + bcc 1f @ value is too small + mov r3, #(127 + 31) + subs r2, r3, r2, lsr #24 + bmi 2f @ value is too large + + @ scale the value + mov r3, r0, lsl #8 + orr r3, r3, #0x80000000 + shift1 lsr, r0, r3, r2 + RET + +1: mov r0, #0 + RET + +2: cmp r2, #(127 + 31 - 0xff) + bne 3f + movs r2, r0, lsl #9 + bne 4f @ r0 is NAN. +3: mov r0, #0xffffffff @ maximum unsigned si + RET + +4: mov r0, #0 @ What should we convert NAN to? + RET + + FUNC_END aeabi_f2uiz + FUNC_END fixunssfsi + +#endif /* L_fixunssfsi */ diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md new file mode 100644 index 000000000..887c962ba --- /dev/null +++ b/gcc/config/arm/iterators.md @@ -0,0 +1,405 @@ +;; Code and mode itertator and attribute definitions for the ARM backend +;; Copyright (C) 2010 Free Software Foundation, Inc. +;; Contributed by ARM Ltd. +;; +;; This file is part of GCC. +;; +;; GCC is free software; you can redistribute it and/or modify it +;; under the terms of the GNU General Public License as published +;; by the Free Software Foundation; either version 3, or (at your +;; option) any later version. + +;; GCC is distributed in the hope that it will be useful, but WITHOUT +;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +;; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public +;; License for more details. + +;; You should have received a copy of the GNU General Public License +;; along with GCC; see the file COPYING3. If not see +;; . + + +;;---------------------------------------------------------------------------- +;; Mode iterators +;;---------------------------------------------------------------------------- + +;; A list of modes that are exactly 64 bits in size. This is used to expand +;; some splits that are the same for all modes when operating on ARM +;; registers. +(define_mode_iterator ANY64 [DI DF V8QI V4HI V2SI V2SF]) + +(define_mode_iterator ANY128 [V2DI V2DF V16QI V8HI V4SI V4SF]) + +;; A list of integer modes that are up to one word long +(define_mode_iterator QHSI [QI HI SI]) + +;; Integer element sizes implemented by IWMMXT. +(define_mode_iterator VMMX [V2SI V4HI V8QI]) + +;; Integer element sizes for shifts. +(define_mode_iterator VSHFT [V4HI V2SI DI]) + +;; Integer and float modes supported by Neon and IWMMXT. +(define_mode_iterator VALL [V2DI V2SI V4HI V8QI V2SF V4SI V8HI V16QI V4SF]) + +;; Integer and float modes supported by Neon and IWMMXT, except V2DI. +(define_mode_iterator VALLW [V2SI V4HI V8QI V2SF V4SI V8HI V16QI V4SF]) + +;; Integer modes supported by Neon and IWMMXT +(define_mode_iterator VINT [V2DI V2SI V4HI V8QI V4SI V8HI V16QI]) + +;; Integer modes supported by Neon and IWMMXT, except V2DI +(define_mode_iterator VINTW [V2SI V4HI V8QI V4SI V8HI V16QI]) + +;; Double-width vector modes. +(define_mode_iterator VD [V8QI V4HI V2SI V2SF]) + +;; Double-width vector modes plus 64-bit elements. +(define_mode_iterator VDX [V8QI V4HI V2SI V2SF DI]) + +;; Double-width vector modes without floating-point elements. +(define_mode_iterator VDI [V8QI V4HI V2SI]) + +;; Quad-width vector modes. +(define_mode_iterator VQ [V16QI V8HI V4SI V4SF]) + +;; Quad-width vector modes plus 64-bit elements. +(define_mode_iterator VQX [V16QI V8HI V4SI V4SF V2DI]) + +;; Quad-width vector modes without floating-point elements. +(define_mode_iterator VQI [V16QI V8HI V4SI]) + +;; Quad-width vector modes, with TImode added, for moves. +(define_mode_iterator VQXMOV [V16QI V8HI V4SI V4SF V2DI TI]) + +;; Opaque structure types wider than TImode. +(define_mode_iterator VSTRUCT [EI OI CI XI]) + +;; Opaque structure types used in table lookups (except vtbl1/vtbx1). +(define_mode_iterator VTAB [TI EI OI]) + +;; Widenable modes. +(define_mode_iterator VW [V8QI V4HI V2SI]) + +;; Narrowable modes. +(define_mode_iterator VN [V8HI V4SI V2DI]) + +;; All supported vector modes (except singleton DImode). +(define_mode_iterator VDQ [V8QI V16QI V4HI V8HI V2SI V4SI V2SF V4SF V2DI]) + +;; All supported vector modes (except those with 64-bit integer elements). +(define_mode_iterator VDQW [V8QI V16QI V4HI V8HI V2SI V4SI V2SF V4SF]) + +;; Supported integer vector modes (not 64 bit elements). +(define_mode_iterator VDQIW [V8QI V16QI V4HI V8HI V2SI V4SI]) + +;; Supported integer vector modes (not singleton DI) +(define_mode_iterator VDQI [V8QI V16QI V4HI V8HI V2SI V4SI V2DI]) + +;; Vector modes, including 64-bit integer elements. +(define_mode_iterator VDQX [V8QI V16QI V4HI V8HI V2SI V4SI V2SF V4SF DI V2DI]) + +;; Vector modes including 64-bit integer elements, but no floats. +(define_mode_iterator VDQIX [V8QI V16QI V4HI V8HI V2SI V4SI DI V2DI]) + +;; Vector modes for float->int conversions. +(define_mode_iterator VCVTF [V2SF V4SF]) + +;; Vector modes form int->float conversions. +(define_mode_iterator VCVTI [V2SI V4SI]) + +;; Vector modes for doubleword multiply-accumulate, etc. insns. +(define_mode_iterator VMD [V4HI V2SI V2SF]) + +;; Vector modes for quadword multiply-accumulate, etc. insns. +(define_mode_iterator VMQ [V8HI V4SI V4SF]) + +;; Above modes combined. +(define_mode_iterator VMDQ [V4HI V2SI V2SF V8HI V4SI V4SF]) + +;; As VMD, but integer modes only. +(define_mode_iterator VMDI [V4HI V2SI]) + +;; As VMQ, but integer modes only. +(define_mode_iterator VMQI [V8HI V4SI]) + +;; Above modes combined. +(define_mode_iterator VMDQI [V4HI V2SI V8HI V4SI]) + +;; Modes with 8-bit and 16-bit elements. +(define_mode_iterator VX [V8QI V4HI V16QI V8HI]) + +;; Modes with 8-bit elements. +(define_mode_iterator VE [V8QI V16QI]) + +;; Modes with 64-bit elements only. +(define_mode_iterator V64 [DI V2DI]) + +;; Modes with 32-bit elements only. +(define_mode_iterator V32 [V2SI V2SF V4SI V4SF]) + +;; Modes with 8-bit, 16-bit and 32-bit elements. +(define_mode_iterator VU [V16QI V8HI V4SI]) + +;;---------------------------------------------------------------------------- +;; Code iterators +;;---------------------------------------------------------------------------- + +;; A list of condition codes used in compare instructions where +;; the carry flag from the addition is used instead of doing the +;; compare a second time. +(define_code_iterator LTUGEU [ltu geu]) + +;; A list of ... +(define_code_iterator ior_xor [ior xor]) + +;; Operations on two halves of a quadword vector. +(define_code_iterator vqh_ops [plus smin smax umin umax]) + +;; Operations on two halves of a quadword vector, +;; without unsigned variants (for use with *SFmode pattern). +(define_code_iterator vqhs_ops [plus smin smax]) + +;; A list of widening operators +(define_code_iterator SE [sign_extend zero_extend]) + +;;---------------------------------------------------------------------------- +;; Mode attributes +;;---------------------------------------------------------------------------- + +;; Determine element size suffix from vector mode. +(define_mode_attr MMX_char [(V8QI "b") (V4HI "h") (V2SI "w") (DI "d")]) + +;; vtbl suffix for NEON vector modes. +(define_mode_attr VTAB_n [(TI "2") (EI "3") (OI "4")]) + +;; (Opposite) mode to convert to/from for NEON mode conversions. +(define_mode_attr V_CVTTO [(V2SI "V2SF") (V2SF "V2SI") + (V4SI "V4SF") (V4SF "V4SI")]) + +;; Define element mode for each vector mode. +(define_mode_attr V_elem [(V8QI "QI") (V16QI "QI") + (V4HI "HI") (V8HI "HI") + (V2SI "SI") (V4SI "SI") + (V2SF "SF") (V4SF "SF") + (DI "DI") (V2DI "DI")]) + +;; Element modes for vector extraction, padded up to register size. + +(define_mode_attr V_ext [(V8QI "SI") (V16QI "SI") + (V4HI "SI") (V8HI "SI") + (V2SI "SI") (V4SI "SI") + (V2SF "SF") (V4SF "SF") + (DI "DI") (V2DI "DI")]) + +;; Mode of pair of elements for each vector mode, to define transfer +;; size for structure lane/dup loads and stores. +(define_mode_attr V_two_elem [(V8QI "HI") (V16QI "HI") + (V4HI "SI") (V8HI "SI") + (V2SI "V2SI") (V4SI "V2SI") + (V2SF "V2SF") (V4SF "V2SF") + (DI "V2DI") (V2DI "V2DI")]) + +;; Similar, for three elements. +;; ??? Should we define extra modes so that sizes of all three-element +;; accesses can be accurately represented? +(define_mode_attr V_three_elem [(V8QI "SI") (V16QI "SI") + (V4HI "V4HI") (V8HI "V4HI") + (V2SI "V4SI") (V4SI "V4SI") + (V2SF "V4SF") (V4SF "V4SF") + (DI "EI") (V2DI "EI")]) + +;; Similar, for four elements. +(define_mode_attr V_four_elem [(V8QI "SI") (V16QI "SI") + (V4HI "V4HI") (V8HI "V4HI") + (V2SI "V4SI") (V4SI "V4SI") + (V2SF "V4SF") (V4SF "V4SF") + (DI "OI") (V2DI "OI")]) + +;; Register width from element mode +(define_mode_attr V_reg [(V8QI "P") (V16QI "q") + (V4HI "P") (V8HI "q") + (V2SI "P") (V4SI "q") + (V2SF "P") (V4SF "q") + (DI "P") (V2DI "q")]) + +;; Wider modes with the same number of elements. +(define_mode_attr V_widen [(V8QI "V8HI") (V4HI "V4SI") (V2SI "V2DI")]) + +;; Narrower modes with the same number of elements. +(define_mode_attr V_narrow [(V8HI "V8QI") (V4SI "V4HI") (V2DI "V2SI")]) + +;; Narrower modes with double the number of elements. +(define_mode_attr V_narrow_pack [(V4SI "V8HI") (V8HI "V16QI") (V2DI "V4SI") + (V4HI "V8QI") (V2SI "V4HI") (DI "V2SI")]) + +;; Modes with half the number of equal-sized elements. +(define_mode_attr V_HALF [(V16QI "V8QI") (V8HI "V4HI") + (V4SI "V2SI") (V4SF "V2SF") (V2DF "DF") + (V2DI "DI")]) + +;; Same, but lower-case. +(define_mode_attr V_half [(V16QI "v8qi") (V8HI "v4hi") + (V4SI "v2si") (V4SF "v2sf") + (V2DI "di")]) + +;; Modes with twice the number of equal-sized elements. +(define_mode_attr V_DOUBLE [(V8QI "V16QI") (V4HI "V8HI") + (V2SI "V4SI") (V2SF "V4SF") (DF "V2DF") + (DI "V2DI")]) + +;; Same, but lower-case. +(define_mode_attr V_double [(V8QI "v16qi") (V4HI "v8hi") + (V2SI "v4si") (V2SF "v4sf") + (DI "v2di")]) + +;; Modes with double-width elements. +(define_mode_attr V_double_width [(V8QI "V4HI") (V16QI "V8HI") + (V4HI "V2SI") (V8HI "V4SI") + (V2SI "DI") (V4SI "V2DI")]) + +;; Double-sized modes with the same element size. +;; Used for neon_vdup_lane, where the second operand is double-sized +;; even when the first one is quad. +(define_mode_attr V_double_vector_mode [(V16QI "V8QI") (V8HI "V4HI") + (V4SI "V2SI") (V4SF "V2SF") + (V8QI "V8QI") (V4HI "V4HI") + (V2SI "V2SI") (V2SF "V2SF")]) + +;; Mode of result of comparison operations (and bit-select operand 1). +(define_mode_attr V_cmp_result [(V8QI "V8QI") (V16QI "V16QI") + (V4HI "V4HI") (V8HI "V8HI") + (V2SI "V2SI") (V4SI "V4SI") + (V2SF "V2SI") (V4SF "V4SI") + (DI "DI") (V2DI "V2DI")]) + +;; Get element type from double-width mode, for operations where we +;; don't care about signedness. +(define_mode_attr V_if_elem [(V8QI "i8") (V16QI "i8") + (V4HI "i16") (V8HI "i16") + (V2SI "i32") (V4SI "i32") + (DI "i64") (V2DI "i64") + (V2SF "f32") (V4SF "f32")]) + +;; Same, but for operations which work on signed values. +(define_mode_attr V_s_elem [(V8QI "s8") (V16QI "s8") + (V4HI "s16") (V8HI "s16") + (V2SI "s32") (V4SI "s32") + (DI "s64") (V2DI "s64") + (V2SF "f32") (V4SF "f32")]) + +;; Same, but for operations which work on unsigned values. +(define_mode_attr V_u_elem [(V8QI "u8") (V16QI "u8") + (V4HI "u16") (V8HI "u16") + (V2SI "u32") (V4SI "u32") + (DI "u64") (V2DI "u64") + (V2SF "f32") (V4SF "f32")]) + +;; Element types for extraction of unsigned scalars. +(define_mode_attr V_uf_sclr [(V8QI "u8") (V16QI "u8") + (V4HI "u16") (V8HI "u16") + (V2SI "32") (V4SI "32") + (V2SF "32") (V4SF "32")]) + +(define_mode_attr V_sz_elem [(V8QI "8") (V16QI "8") + (V4HI "16") (V8HI "16") + (V2SI "32") (V4SI "32") + (DI "64") (V2DI "64") + (V2SF "32") (V4SF "32")]) + +;; Element sizes for duplicating ARM registers to all elements of a vector. +(define_mode_attr VD_dup [(V8QI "8") (V4HI "16") (V2SI "32") (V2SF "32")]) + +;; Opaque integer types for results of pair-forming intrinsics (vtrn, etc.) +(define_mode_attr V_PAIR [(V8QI "TI") (V16QI "OI") + (V4HI "TI") (V8HI "OI") + (V2SI "TI") (V4SI "OI") + (V2SF "TI") (V4SF "OI") + (DI "TI") (V2DI "OI")]) + +;; Same, but lower-case. +(define_mode_attr V_pair [(V8QI "ti") (V16QI "oi") + (V4HI "ti") (V8HI "oi") + (V2SI "ti") (V4SI "oi") + (V2SF "ti") (V4SF "oi") + (DI "ti") (V2DI "oi")]) + +;; Extra suffix on some 64-bit insn names (to avoid collision with standard +;; names which we don't want to define). +(define_mode_attr V_suf64 [(V8QI "") (V16QI "") + (V4HI "") (V8HI "") + (V2SI "") (V4SI "") + (V2SF "") (V4SF "") + (DI "_neon") (V2DI "")]) + + +;; Scalars to be presented to scalar multiplication instructions +;; must satisfy the following constraints. +;; 1. If the mode specifies 16-bit elements, the scalar must be in D0-D7. +;; 2. If the mode specifies 32-bit elements, the scalar must be in D0-D15. + +;; This mode attribute is used to obtain the correct register constraints. + +(define_mode_attr scalar_mul_constraint [(V4HI "x") (V2SI "t") (V2SF "t") + (V8HI "x") (V4SI "t") (V4SF "t")]) + +;; Predicates used for setting neon_type + +(define_mode_attr Is_float_mode [(V8QI "false") (V16QI "false") + (V4HI "false") (V8HI "false") + (V2SI "false") (V4SI "false") + (V2SF "true") (V4SF "true") + (DI "false") (V2DI "false")]) + +(define_mode_attr Scalar_mul_8_16 [(V8QI "true") (V16QI "true") + (V4HI "true") (V8HI "true") + (V2SI "false") (V4SI "false") + (V2SF "false") (V4SF "false") + (DI "false") (V2DI "false")]) + + +(define_mode_attr Is_d_reg [(V8QI "true") (V16QI "false") + (V4HI "true") (V8HI "false") + (V2SI "true") (V4SI "false") + (V2SF "true") (V4SF "false") + (DI "true") (V2DI "false")]) + +(define_mode_attr V_mode_nunits [(V8QI "8") (V16QI "16") + (V4HI "4") (V8HI "8") + (V2SI "2") (V4SI "4") + (V2SF "2") (V4SF "4") + (DI "1") (V2DI "2") + (DF "1") (V2DF "2")]) + +;; Same as V_widen, but lower-case. +(define_mode_attr V_widen_l [(V8QI "v8hi") (V4HI "v4si") ( V2SI "v2di")]) + +;; Widen. Result is half the number of elements, but widened to double-width. +(define_mode_attr V_unpack [(V16QI "V8HI") (V8HI "V4SI") (V4SI "V2DI")]) + +;; Conditions to be used in extenddi patterns. +(define_mode_attr qhs_zextenddi_cond [(SI "") (HI "&& arm_arch6") (QI "")]) +(define_mode_attr qhs_sextenddi_cond [(SI "") (HI "&& arm_arch6") + (QI "&& arm_arch6")]) +(define_mode_attr qhs_extenddi_op [(SI "s_register_operand") + (HI "nonimmediate_operand") + (QI "nonimmediate_operand")]) +(define_mode_attr qhs_extenddi_cstr [(SI "r") (HI "rm") (QI "rm")]) + +;;---------------------------------------------------------------------------- +;; Code attributes +;;---------------------------------------------------------------------------- + +;; Assembler mnemonics for vqh_ops and vqhs_ops iterators. +(define_code_attr VQH_mnem [(plus "vadd") (smin "vmin") (smax "vmax") + (umin "vmin") (umax "vmax")]) + +;; Signs of above, where relevant. +(define_code_attr VQH_sign [(plus "i") (smin "s") (smax "s") (umin "u") + (umax "u")]) + +(define_code_attr cnb [(ltu "CC_C") (geu "CC")]) +(define_code_attr optab [(ltu "ltu") (geu "geu")]) + +;; Assembler mnemonics for signedness of widening operations. +(define_code_attr US [(sign_extend "s") (zero_extend "u")]) diff --git a/gcc/config/arm/iwmmxt.md b/gcc/config/arm/iwmmxt.md new file mode 100644 index 000000000..7f13ae49b --- /dev/null +++ b/gcc/config/arm/iwmmxt.md @@ -0,0 +1,1332 @@ +;; ??? This file needs auditing for thumb2 +;; Patterns for the Intel Wireless MMX technology architecture. +;; Copyright (C) 2003, 2004, 2005, 2007, 2008, 2010 +;; Free Software Foundation, Inc. +;; Contributed by Red Hat. + +;; This file is part of GCC. + +;; GCC is free software; you can redistribute it and/or modify it under +;; the terms of the GNU General Public License as published by the Free +;; Software Foundation; either version 3, or (at your option) any later +;; version. + +;; GCC is distributed in the hope that it will be useful, but WITHOUT +;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +;; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public +;; License for more details. + +;; You should have received a copy of the GNU General Public License +;; along with GCC; see the file COPYING3. If not see +;; . + + +(define_insn "iwmmxt_iordi3" + [(set (match_operand:DI 0 "register_operand" "=y,?&r,?&r") + (ior:DI (match_operand:DI 1 "register_operand" "%y,0,r") + (match_operand:DI 2 "register_operand" "y,r,r")))] + "TARGET_REALLY_IWMMXT" + "@ + wor%?\\t%0, %1, %2 + # + #" + [(set_attr "predicable" "yes") + (set_attr "length" "4,8,8")]) + +(define_insn "iwmmxt_xordi3" + [(set (match_operand:DI 0 "register_operand" "=y,?&r,?&r") + (xor:DI (match_operand:DI 1 "register_operand" "%y,0,r") + (match_operand:DI 2 "register_operand" "y,r,r")))] + "TARGET_REALLY_IWMMXT" + "@ + wxor%?\\t%0, %1, %2 + # + #" + [(set_attr "predicable" "yes") + (set_attr "length" "4,8,8")]) + +(define_insn "iwmmxt_anddi3" + [(set (match_operand:DI 0 "register_operand" "=y,?&r,?&r") + (and:DI (match_operand:DI 1 "register_operand" "%y,0,r") + (match_operand:DI 2 "register_operand" "y,r,r")))] + "TARGET_REALLY_IWMMXT" + "@ + wand%?\\t%0, %1, %2 + # + #" + [(set_attr "predicable" "yes") + (set_attr "length" "4,8,8")]) + +(define_insn "iwmmxt_nanddi3" + [(set (match_operand:DI 0 "register_operand" "=y") + (and:DI (match_operand:DI 1 "register_operand" "y") + (not:DI (match_operand:DI 2 "register_operand" "y"))))] + "TARGET_REALLY_IWMMXT" + "wandn%?\\t%0, %1, %2" + [(set_attr "predicable" "yes")]) + +(define_insn "*iwmmxt_arm_movdi" + [(set (match_operand:DI 0 "nonimmediate_di_operand" "=r, r, m,y,y,yr,y,yrUy") + (match_operand:DI 1 "di_operand" "rIK,mi,r,y,yr,y,yrUy,y"))] + "TARGET_REALLY_IWMMXT + && ( register_operand (operands[0], DImode) + || register_operand (operands[1], DImode))" + "* +{ + switch (which_alternative) + { + default: + return output_move_double (operands); + case 0: + return \"#\"; + case 3: + return \"wmov%?\\t%0,%1\"; + case 4: + return \"tmcrr%?\\t%0,%Q1,%R1\"; + case 5: + return \"tmrrc%?\\t%Q0,%R0,%1\"; + case 6: + return \"wldrd%?\\t%0,%1\"; + case 7: + return \"wstrd%?\\t%1,%0\"; + } +}" + [(set_attr "length" "8,8,8,4,4,4,4,4") + (set_attr "type" "*,load1,store2,*,*,*,*,*") + (set_attr "pool_range" "*,1020,*,*,*,*,*,*") + (set_attr "neg_pool_range" "*,1012,*,*,*,*,*,*")] +) + +(define_insn "*iwmmxt_movsi_insn" + [(set (match_operand:SI 0 "nonimmediate_operand" "=rk,r,r,rk, m,z,r,?z,Uy,z") + (match_operand:SI 1 "general_operand" "rk, I,K,mi,rk,r,z,Uy,z, z"))] + "TARGET_REALLY_IWMMXT + && ( register_operand (operands[0], SImode) + || register_operand (operands[1], SImode))" + "* + switch (which_alternative) + { + case 0: return \"mov\\t%0, %1\"; + case 1: return \"mov\\t%0, %1\"; + case 2: return \"mvn\\t%0, #%B1\"; + case 3: return \"ldr\\t%0, %1\"; + case 4: return \"str\\t%1, %0\"; + case 5: return \"tmcr\\t%0, %1\"; + case 6: return \"tmrc\\t%0, %1\"; + case 7: return arm_output_load_gr (operands); + case 8: return \"wstrw\\t%1, %0\"; + default:return \"wstrw\\t%1, [sp, #-4]!\;wldrw\\t%0, [sp], #4\\t@move CG reg\"; + }" + [(set_attr "type" "*,*,*,load1,store1,*,*,load1,store1,*") + (set_attr "length" "*,*,*,*, *,*,*, 16, *,8") + (set_attr "pool_range" "*,*,*,4096, *,*,*,1024, *,*") + (set_attr "neg_pool_range" "*,*,*,4084, *,*,*, *, 1012,*") + ;; Note - the "predicable" attribute is not allowed to have alternatives. + ;; Since the wSTRw wCx instruction is not predicable, we cannot support + ;; predicating any of the alternatives in this template. Instead, + ;; we do the predication ourselves, in cond_iwmmxt_movsi_insn. + (set_attr "predicable" "no") + ;; Also - we have to pretend that these insns clobber the condition code + ;; bits as otherwise arm_final_prescan_insn() will try to conditionalize + ;; them. + (set_attr "conds" "clob")] +) + +;; Because iwmmxt_movsi_insn is not predicable, we provide the +;; cond_exec version explicitly, with appropriate constraints. + +(define_insn "*cond_iwmmxt_movsi_insn" + [(cond_exec + (match_operator 2 "arm_comparison_operator" + [(match_operand 3 "cc_register" "") + (const_int 0)]) + (set (match_operand:SI 0 "nonimmediate_operand" "=r,r,r, m,z,r") + (match_operand:SI 1 "general_operand" "rI,K,mi,r,r,z")))] + "TARGET_REALLY_IWMMXT + && ( register_operand (operands[0], SImode) + || register_operand (operands[1], SImode))" + "* + switch (which_alternative) + { + case 0: return \"mov%?\\t%0, %1\"; + case 1: return \"mvn%?\\t%0, #%B1\"; + case 2: return \"ldr%?\\t%0, %1\"; + case 3: return \"str%?\\t%1, %0\"; + case 4: return \"tmcr%?\\t%0, %1\"; + default: return \"tmrc%?\\t%0, %1\"; + }" + [(set_attr "type" "*,*,load1,store1,*,*") + (set_attr "pool_range" "*,*,4096, *,*,*") + (set_attr "neg_pool_range" "*,*,4084, *,*,*")] +) + +(define_insn "mov_internal" + [(set (match_operand:VMMX 0 "nonimmediate_operand" "=y,m,y,?r,?y,?r,?r,?m") + (match_operand:VMMX 1 "general_operand" "y,y,mi,y,r,r,mi,r"))] + "TARGET_REALLY_IWMMXT" + "* + switch (which_alternative) + { + case 0: return \"wmov%?\\t%0, %1\"; + case 1: return \"wstrd%?\\t%1, %0\"; + case 2: return \"wldrd%?\\t%0, %1\"; + case 3: return \"tmrrc%?\\t%Q0, %R0, %1\"; + case 4: return \"tmcrr%?\\t%0, %Q1, %R1\"; + case 5: return \"#\"; + default: return output_move_double (operands); + }" + [(set_attr "predicable" "yes") + (set_attr "length" "4, 4, 4,4,4,8, 8,8") + (set_attr "type" "*,store1,load1,*,*,*,load1,store1") + (set_attr "pool_range" "*, *, 256,*,*,*, 256,*") + (set_attr "neg_pool_range" "*, *, 244,*,*,*, 244,*")]) + +;; Vector add/subtract + +(define_insn "*add3_iwmmxt" + [(set (match_operand:VMMX 0 "register_operand" "=y") + (plus:VMMX (match_operand:VMMX 1 "register_operand" "y") + (match_operand:VMMX 2 "register_operand" "y")))] + "TARGET_REALLY_IWMMXT" + "wadd%?\\t%0, %1, %2" + [(set_attr "predicable" "yes")]) + +(define_insn "ssaddv8qi3" + [(set (match_operand:V8QI 0 "register_operand" "=y") + (ss_plus:V8QI (match_operand:V8QI 1 "register_operand" "y") + (match_operand:V8QI 2 "register_operand" "y")))] + "TARGET_REALLY_IWMMXT" + "waddbss%?\\t%0, %1, %2" + [(set_attr "predicable" "yes")]) + +(define_insn "ssaddv4hi3" + [(set (match_operand:V4HI 0 "register_operand" "=y") + (ss_plus:V4HI (match_operand:V4HI 1 "register_operand" "y") + (match_operand:V4HI 2 "register_operand" "y")))] + "TARGET_REALLY_IWMMXT" + "waddhss%?\\t%0, %1, %2" + [(set_attr "predicable" "yes")]) + +(define_insn "ssaddv2si3" + [(set (match_operand:V2SI 0 "register_operand" "=y") + (ss_plus:V2SI (match_operand:V2SI 1 "register_operand" "y") + (match_operand:V2SI 2 "register_operand" "y")))] + "TARGET_REALLY_IWMMXT" + "waddwss%?\\t%0, %1, %2" + [(set_attr "predicable" "yes")]) + +(define_insn "usaddv8qi3" + [(set (match_operand:V8QI 0 "register_operand" "=y") + (us_plus:V8QI (match_operand:V8QI 1 "register_operand" "y") + (match_operand:V8QI 2 "register_operand" "y")))] + "TARGET_REALLY_IWMMXT" + "waddbus%?\\t%0, %1, %2" + [(set_attr "predicable" "yes")]) + +(define_insn "usaddv4hi3" + [(set (match_operand:V4HI 0 "register_operand" "=y") + (us_plus:V4HI (match_operand:V4HI 1 "register_operand" "y") + (match_operand:V4HI 2 "register_operand" "y")))] + "TARGET_REALLY_IWMMXT" + "waddhus%?\\t%0, %1, %2" + [(set_attr "predicable" "yes")]) + +(define_insn "usaddv2si3" + [(set (match_operand:V2SI 0 "register_operand" "=y") + (us_plus:V2SI (match_operand:V2SI 1 "register_operand" "y") + (match_operand:V2SI 2 "register_operand" "y")))] + "TARGET_REALLY_IWMMXT" + "waddwus%?\\t%0, %1, %2" + [(set_attr "predicable" "yes")]) + +(define_insn "*sub3_iwmmxt" + [(set (match_operand:VMMX 0 "register_operand" "=y") + (minus:VMMX (match_operand:VMMX 1 "register_operand" "y") + (match_operand:VMMX 2 "register_operand" "y")))] + "TARGET_REALLY_IWMMXT" + "wsub%?\\t%0, %1, %2" + [(set_attr "predicable" "yes")]) + +(define_insn "sssubv8qi3" + [(set (match_operand:V8QI 0 "register_operand" "=y") + (ss_minus:V8QI (match_operand:V8QI 1 "register_operand" "y") + (match_operand:V8QI 2 "register_operand" "y")))] + "TARGET_REALLY_IWMMXT" + "wsubbss%?\\t%0, %1, %2" + [(set_attr "predicable" "yes")]) + +(define_insn "sssubv4hi3" + [(set (match_operand:V4HI 0 "register_operand" "=y") + (ss_minus:V4HI (match_operand:V4HI 1 "register_operand" "y") + (match_operand:V4HI 2 "register_operand" "y")))] + "TARGET_REALLY_IWMMXT" + "wsubhss%?\\t%0, %1, %2" + [(set_attr "predicable" "yes")]) + +(define_insn "sssubv2si3" + [(set (match_operand:V2SI 0 "register_operand" "=y") + (ss_minus:V2SI (match_operand:V2SI 1 "register_operand" "y") + (match_operand:V2SI 2 "register_operand" "y")))] + "TARGET_REALLY_IWMMXT" + "wsubwss%?\\t%0, %1, %2" + [(set_attr "predicable" "yes")]) + +(define_insn "ussubv8qi3" + [(set (match_operand:V8QI 0 "register_operand" "=y") + (us_minus:V8QI (match_operand:V8QI 1 "register_operand" "y") + (match_operand:V8QI 2 "register_operand" "y")))] + "TARGET_REALLY_IWMMXT" + "wsubbus%?\\t%0, %1, %2" + [(set_attr "predicable" "yes")]) + +(define_insn "ussubv4hi3" + [(set (match_operand:V4HI 0 "register_operand" "=y") + (us_minus:V4HI (match_operand:V4HI 1 "register_operand" "y") + (match_operand:V4HI 2 "register_operand" "y")))] + "TARGET_REALLY_IWMMXT" + "wsubhus%?\\t%0, %1, %2" + [(set_attr "predicable" "yes")]) + +(define_insn "ussubv2si3" + [(set (match_operand:V2SI 0 "register_operand" "=y") + (us_minus:V2SI (match_operand:V2SI 1 "register_operand" "y") + (match_operand:V2SI 2 "register_operand" "y")))] + "TARGET_REALLY_IWMMXT" + "wsubwus%?\\t%0, %1, %2" + [(set_attr "predicable" "yes")]) + +(define_insn "*mulv4hi3_iwmmxt" + [(set (match_operand:V4HI 0 "register_operand" "=y") + (mult:V4HI (match_operand:V4HI 1 "register_operand" "y") + (match_operand:V4HI 2 "register_operand" "y")))] + "TARGET_REALLY_IWMMXT" + "wmulul%?\\t%0, %1, %2" + [(set_attr "predicable" "yes")]) + +(define_insn "smulv4hi3_highpart" + [(set (match_operand:V4HI 0 "register_operand" "=y") + (truncate:V4HI + (lshiftrt:V4SI + (mult:V4SI (sign_extend:V4SI (match_operand:V4HI 1 "register_operand" "y")) + (sign_extend:V4SI (match_operand:V4HI 2 "register_operand" "y"))) + (const_int 16))))] + "TARGET_REALLY_IWMMXT" + "wmulsm%?\\t%0, %1, %2" + [(set_attr "predicable" "yes")]) + +(define_insn "umulv4hi3_highpart" + [(set (match_operand:V4HI 0 "register_operand" "=y") + (truncate:V4HI + (lshiftrt:V4SI + (mult:V4SI (zero_extend:V4SI (match_operand:V4HI 1 "register_operand" "y")) + (zero_extend:V4SI (match_operand:V4HI 2 "register_operand" "y"))) + (const_int 16))))] + "TARGET_REALLY_IWMMXT" + "wmulum%?\\t%0, %1, %2" + [(set_attr "predicable" "yes")]) + +(define_insn "iwmmxt_wmacs" + [(set (match_operand:DI 0 "register_operand" "=y") + (unspec:DI [(match_operand:DI 1 "register_operand" "0") + (match_operand:V4HI 2 "register_operand" "y") + (match_operand:V4HI 3 "register_operand" "y")] UNSPEC_WMACS))] + "TARGET_REALLY_IWMMXT" + "wmacs%?\\t%0, %2, %3" + [(set_attr "predicable" "yes")]) + +(define_insn "iwmmxt_wmacsz" + [(set (match_operand:DI 0 "register_operand" "=y") + (unspec:DI [(match_operand:V4HI 1 "register_operand" "y") + (match_operand:V4HI 2 "register_operand" "y")] UNSPEC_WMACSZ))] + "TARGET_REALLY_IWMMXT" + "wmacsz%?\\t%0, %1, %2" + [(set_attr "predicable" "yes")]) + +(define_insn "iwmmxt_wmacu" + [(set (match_operand:DI 0 "register_operand" "=y") + (unspec:DI [(match_operand:DI 1 "register_operand" "0") + (match_operand:V4HI 2 "register_operand" "y") + (match_operand:V4HI 3 "register_operand" "y")] UNSPEC_WMACU))] + "TARGET_REALLY_IWMMXT" + "wmacu%?\\t%0, %2, %3" + [(set_attr "predicable" "yes")]) + +(define_insn "iwmmxt_wmacuz" + [(set (match_operand:DI 0 "register_operand" "=y") + (unspec:DI [(match_operand:V4HI 1 "register_operand" "y") + (match_operand:V4HI 2 "register_operand" "y")] UNSPEC_WMACUZ))] + "TARGET_REALLY_IWMMXT" + "wmacuz%?\\t%0, %1, %2" + [(set_attr "predicable" "yes")]) + +;; Same as xordi3, but don't show input operands so that we don't think +;; they are live. +(define_insn "iwmmxt_clrdi" + [(set (match_operand:DI 0 "register_operand" "=y") + (unspec:DI [(const_int 0)] UNSPEC_CLRDI))] + "TARGET_REALLY_IWMMXT" + "wxor%?\\t%0, %0, %0" + [(set_attr "predicable" "yes")]) + +;; Seems like cse likes to generate these, so we have to support them. + +(define_insn "*iwmmxt_clrv8qi" + [(set (match_operand:V8QI 0 "register_operand" "=y") + (const_vector:V8QI [(const_int 0) (const_int 0) + (const_int 0) (const_int 0) + (const_int 0) (const_int 0) + (const_int 0) (const_int 0)]))] + "TARGET_REALLY_IWMMXT" + "wxor%?\\t%0, %0, %0" + [(set_attr "predicable" "yes")]) + +(define_insn "*iwmmxt_clrv4hi" + [(set (match_operand:V4HI 0 "register_operand" "=y") + (const_vector:V4HI [(const_int 0) (const_int 0) + (const_int 0) (const_int 0)]))] + "TARGET_REALLY_IWMMXT" + "wxor%?\\t%0, %0, %0" + [(set_attr "predicable" "yes")]) + +(define_insn "*iwmmxt_clrv2si" + [(set (match_operand:V2SI 0 "register_operand" "=y") + (const_vector:V2SI [(const_int 0) (const_int 0)]))] + "TARGET_REALLY_IWMMXT" + "wxor%?\\t%0, %0, %0" + [(set_attr "predicable" "yes")]) + +;; Unsigned averages/sum of absolute differences + +(define_insn "iwmmxt_uavgrndv8qi3" + [(set (match_operand:V8QI 0 "register_operand" "=y") + (ashiftrt:V8QI + (plus:V8QI (plus:V8QI + (match_operand:V8QI 1 "register_operand" "y") + (match_operand:V8QI 2 "register_operand" "y")) + (const_vector:V8QI [(const_int 1) + (const_int 1) + (const_int 1) + (const_int 1) + (const_int 1) + (const_int 1) + (const_int 1) + (const_int 1)])) + (const_int 1)))] + "TARGET_REALLY_IWMMXT" + "wavg2br%?\\t%0, %1, %2" + [(set_attr "predicable" "yes")]) + +(define_insn "iwmmxt_uavgrndv4hi3" + [(set (match_operand:V4HI 0 "register_operand" "=y") + (ashiftrt:V4HI + (plus:V4HI (plus:V4HI + (match_operand:V4HI 1 "register_operand" "y") + (match_operand:V4HI 2 "register_operand" "y")) + (const_vector:V4HI [(const_int 1) + (const_int 1) + (const_int 1) + (const_int 1)])) + (const_int 1)))] + "TARGET_REALLY_IWMMXT" + "wavg2hr%?\\t%0, %1, %2" + [(set_attr "predicable" "yes")]) + + +(define_insn "iwmmxt_uavgv8qi3" + [(set (match_operand:V8QI 0 "register_operand" "=y") + (ashiftrt:V8QI (plus:V8QI + (match_operand:V8QI 1 "register_operand" "y") + (match_operand:V8QI 2 "register_operand" "y")) + (const_int 1)))] + "TARGET_REALLY_IWMMXT" + "wavg2b%?\\t%0, %1, %2" + [(set_attr "predicable" "yes")]) + +(define_insn "iwmmxt_uavgv4hi3" + [(set (match_operand:V4HI 0 "register_operand" "=y") + (ashiftrt:V4HI (plus:V4HI + (match_operand:V4HI 1 "register_operand" "y") + (match_operand:V4HI 2 "register_operand" "y")) + (const_int 1)))] + "TARGET_REALLY_IWMMXT" + "wavg2h%?\\t%0, %1, %2" + [(set_attr "predicable" "yes")]) + +(define_insn "iwmmxt_psadbw" + [(set (match_operand:V8QI 0 "register_operand" "=y") + (abs:V8QI (minus:V8QI (match_operand:V8QI 1 "register_operand" "y") + (match_operand:V8QI 2 "register_operand" "y"))))] + "TARGET_REALLY_IWMMXT" + "psadbw%?\\t%0, %1, %2" + [(set_attr "predicable" "yes")]) + + +;; Insert/extract/shuffle + +(define_insn "iwmmxt_tinsrb" + [(set (match_operand:V8QI 0 "register_operand" "=y") + (vec_merge:V8QI (match_operand:V8QI 1 "register_operand" "0") + (vec_duplicate:V8QI + (truncate:QI (match_operand:SI 2 "nonimmediate_operand" "r"))) + (match_operand:SI 3 "immediate_operand" "i")))] + "TARGET_REALLY_IWMMXT" + "tinsrb%?\\t%0, %2, %3" + [(set_attr "predicable" "yes")]) + +(define_insn "iwmmxt_tinsrh" + [(set (match_operand:V4HI 0 "register_operand" "=y") + (vec_merge:V4HI (match_operand:V4HI 1 "register_operand" "0") + (vec_duplicate:V4HI + (truncate:HI (match_operand:SI 2 "nonimmediate_operand" "r"))) + (match_operand:SI 3 "immediate_operand" "i")))] + "TARGET_REALLY_IWMMXT" + "tinsrh%?\\t%0, %2, %3" + [(set_attr "predicable" "yes")]) + +(define_insn "iwmmxt_tinsrw" + [(set (match_operand:V2SI 0 "register_operand" "=y") + (vec_merge:V2SI (match_operand:V2SI 1 "register_operand" "0") + (vec_duplicate:V2SI + (match_operand:SI 2 "nonimmediate_operand" "r")) + (match_operand:SI 3 "immediate_operand" "i")))] + "TARGET_REALLY_IWMMXT" + "tinsrw%?\\t%0, %2, %3" + [(set_attr "predicable" "yes")]) + +(define_insn "iwmmxt_textrmub" + [(set (match_operand:SI 0 "register_operand" "=r") + (zero_extend:SI (vec_select:QI (match_operand:V8QI 1 "register_operand" "y") + (parallel + [(match_operand:SI 2 "immediate_operand" "i")]))))] + "TARGET_REALLY_IWMMXT" + "textrmub%?\\t%0, %1, %2" + [(set_attr "predicable" "yes")]) + +(define_insn "iwmmxt_textrmsb" + [(set (match_operand:SI 0 "register_operand" "=r") + (sign_extend:SI (vec_select:QI (match_operand:V8QI 1 "register_operand" "y") + (parallel + [(match_operand:SI 2 "immediate_operand" "i")]))))] + "TARGET_REALLY_IWMMXT" + "textrmsb%?\\t%0, %1, %2" + [(set_attr "predicable" "yes")]) + +(define_insn "iwmmxt_textrmuh" + [(set (match_operand:SI 0 "register_operand" "=r") + (zero_extend:SI (vec_select:HI (match_operand:V4HI 1 "register_operand" "y") + (parallel + [(match_operand:SI 2 "immediate_operand" "i")]))))] + "TARGET_REALLY_IWMMXT" + "textrmuh%?\\t%0, %1, %2" + [(set_attr "predicable" "yes")]) + +(define_insn "iwmmxt_textrmsh" + [(set (match_operand:SI 0 "register_operand" "=r") + (sign_extend:SI (vec_select:HI (match_operand:V4HI 1 "register_operand" "y") + (parallel + [(match_operand:SI 2 "immediate_operand" "i")]))))] + "TARGET_REALLY_IWMMXT" + "textrmsh%?\\t%0, %1, %2" + [(set_attr "predicable" "yes")]) + +;; There are signed/unsigned variants of this instruction, but they are +;; pointless. +(define_insn "iwmmxt_textrmw" + [(set (match_operand:SI 0 "register_operand" "=r") + (vec_select:SI (match_operand:V2SI 1 "register_operand" "y") + (parallel [(match_operand:SI 2 "immediate_operand" "i")])))] + "TARGET_REALLY_IWMMXT" + "textrmsw%?\\t%0, %1, %2" + [(set_attr "predicable" "yes")]) + +(define_insn "iwmmxt_wshufh" + [(set (match_operand:V4HI 0 "register_operand" "=y") + (unspec:V4HI [(match_operand:V4HI 1 "register_operand" "y") + (match_operand:SI 2 "immediate_operand" "i")] UNSPEC_WSHUFH))] + "TARGET_REALLY_IWMMXT" + "wshufh%?\\t%0, %1, %2" + [(set_attr "predicable" "yes")]) + +;; Mask-generating comparisons +;; +;; Note - you cannot use patterns like these here: +;; +;; (set (match:) (: (match:) (match:))) +;; +;; Because GCC will assume that the truth value (1 or 0) is installed +;; into the entire destination vector, (with the '1' going into the least +;; significant element of the vector). This is not how these instructions +;; behave. +;; +;; Unfortunately the current patterns are illegal. They are SET insns +;; without a SET in them. They work in most cases for ordinary code +;; generation, but there are circumstances where they can cause gcc to fail. +;; XXX - FIXME. + +(define_insn "eqv8qi3" + [(unspec_volatile [(match_operand:V8QI 0 "register_operand" "=y") + (match_operand:V8QI 1 "register_operand" "y") + (match_operand:V8QI 2 "register_operand" "y")] + VUNSPEC_WCMP_EQ)] + "TARGET_REALLY_IWMMXT" + "wcmpeqb%?\\t%0, %1, %2" + [(set_attr "predicable" "yes")]) + +(define_insn "eqv4hi3" + [(unspec_volatile [(match_operand:V4HI 0 "register_operand" "=y") + (match_operand:V4HI 1 "register_operand" "y") + (match_operand:V4HI 2 "register_operand" "y")] + VUNSPEC_WCMP_EQ)] + "TARGET_REALLY_IWMMXT" + "wcmpeqh%?\\t%0, %1, %2" + [(set_attr "predicable" "yes")]) + +(define_insn "eqv2si3" + [(unspec_volatile:V2SI [(match_operand:V2SI 0 "register_operand" "=y") + (match_operand:V2SI 1 "register_operand" "y") + (match_operand:V2SI 2 "register_operand" "y")] + VUNSPEC_WCMP_EQ)] + "TARGET_REALLY_IWMMXT" + "wcmpeqw%?\\t%0, %1, %2" + [(set_attr "predicable" "yes")]) + +(define_insn "gtuv8qi3" + [(unspec_volatile [(match_operand:V8QI 0 "register_operand" "=y") + (match_operand:V8QI 1 "register_operand" "y") + (match_operand:V8QI 2 "register_operand" "y")] + VUNSPEC_WCMP_GTU)] + "TARGET_REALLY_IWMMXT" + "wcmpgtub%?\\t%0, %1, %2" + [(set_attr "predicable" "yes")]) + +(define_insn "gtuv4hi3" + [(unspec_volatile [(match_operand:V4HI 0 "register_operand" "=y") + (match_operand:V4HI 1 "register_operand" "y") + (match_operand:V4HI 2 "register_operand" "y")] + VUNSPEC_WCMP_GTU)] + "TARGET_REALLY_IWMMXT" + "wcmpgtuh%?\\t%0, %1, %2" + [(set_attr "predicable" "yes")]) + +(define_insn "gtuv2si3" + [(unspec_volatile [(match_operand:V2SI 0 "register_operand" "=y") + (match_operand:V2SI 1 "register_operand" "y") + (match_operand:V2SI 2 "register_operand" "y")] + VUNSPEC_WCMP_GTU)] + "TARGET_REALLY_IWMMXT" + "wcmpgtuw%?\\t%0, %1, %2" + [(set_attr "predicable" "yes")]) + +(define_insn "gtv8qi3" + [(unspec_volatile [(match_operand:V8QI 0 "register_operand" "=y") + (match_operand:V8QI 1 "register_operand" "y") + (match_operand:V8QI 2 "register_operand" "y")] + VUNSPEC_WCMP_GT)] + "TARGET_REALLY_IWMMXT" + "wcmpgtsb%?\\t%0, %1, %2" + [(set_attr "predicable" "yes")]) + +(define_insn "gtv4hi3" + [(unspec_volatile [(match_operand:V4HI 0 "register_operand" "=y") + (match_operand:V4HI 1 "register_operand" "y") + (match_operand:V4HI 2 "register_operand" "y")] + VUNSPEC_WCMP_GT)] + "TARGET_REALLY_IWMMXT" + "wcmpgtsh%?\\t%0, %1, %2" + [(set_attr "predicable" "yes")]) + +(define_insn "gtv2si3" + [(unspec_volatile [(match_operand:V2SI 0 "register_operand" "=y") + (match_operand:V2SI 1 "register_operand" "y") + (match_operand:V2SI 2 "register_operand" "y")] + VUNSPEC_WCMP_GT)] + "TARGET_REALLY_IWMMXT" + "wcmpgtsw%?\\t%0, %1, %2" + [(set_attr "predicable" "yes")]) + +;; Max/min insns + +(define_insn "*smax3_iwmmxt" + [(set (match_operand:VMMX 0 "register_operand" "=y") + (smax:VMMX (match_operand:VMMX 1 "register_operand" "y") + (match_operand:VMMX 2 "register_operand" "y")))] + "TARGET_REALLY_IWMMXT" + "wmaxs%?\\t%0, %1, %2" + [(set_attr "predicable" "yes")]) + +(define_insn "*umax3_iwmmxt" + [(set (match_operand:VMMX 0 "register_operand" "=y") + (umax:VMMX (match_operand:VMMX 1 "register_operand" "y") + (match_operand:VMMX 2 "register_operand" "y")))] + "TARGET_REALLY_IWMMXT" + "wmaxu%?\\t%0, %1, %2" + [(set_attr "predicable" "yes")]) + +(define_insn "*smin3_iwmmxt" + [(set (match_operand:VMMX 0 "register_operand" "=y") + (smin:VMMX (match_operand:VMMX 1 "register_operand" "y") + (match_operand:VMMX 2 "register_operand" "y")))] + "TARGET_REALLY_IWMMXT" + "wmins%?\\t%0, %1, %2" + [(set_attr "predicable" "yes")]) + +(define_insn "*umin3_iwmmxt" + [(set (match_operand:VMMX 0 "register_operand" "=y") + (umin:VMMX (match_operand:VMMX 1 "register_operand" "y") + (match_operand:VMMX 2 "register_operand" "y")))] + "TARGET_REALLY_IWMMXT" + "wminu%?\\t%0, %1, %2" + [(set_attr "predicable" "yes")]) + +;; Pack/unpack insns. + +(define_insn "iwmmxt_wpackhss" + [(set (match_operand:V8QI 0 "register_operand" "=y") + (vec_concat:V8QI + (ss_truncate:V4QI (match_operand:V4HI 1 "register_operand" "y")) + (ss_truncate:V4QI (match_operand:V4HI 2 "register_operand" "y"))))] + "TARGET_REALLY_IWMMXT" + "wpackhss%?\\t%0, %1, %2" + [(set_attr "predicable" "yes")]) + +(define_insn "iwmmxt_wpackwss" + [(set (match_operand:V4HI 0 "register_operand" "=y") + (vec_concat:V4HI + (ss_truncate:V2HI (match_operand:V2SI 1 "register_operand" "y")) + (ss_truncate:V2HI (match_operand:V2SI 2 "register_operand" "y"))))] + "TARGET_REALLY_IWMMXT" + "wpackwss%?\\t%0, %1, %2" + [(set_attr "predicable" "yes")]) + +(define_insn "iwmmxt_wpackdss" + [(set (match_operand:V2SI 0 "register_operand" "=y") + (vec_concat:V2SI + (ss_truncate:SI (match_operand:DI 1 "register_operand" "y")) + (ss_truncate:SI (match_operand:DI 2 "register_operand" "y"))))] + "TARGET_REALLY_IWMMXT" + "wpackdss%?\\t%0, %1, %2" + [(set_attr "predicable" "yes")]) + +(define_insn "iwmmxt_wpackhus" + [(set (match_operand:V8QI 0 "register_operand" "=y") + (vec_concat:V8QI + (us_truncate:V4QI (match_operand:V4HI 1 "register_operand" "y")) + (us_truncate:V4QI (match_operand:V4HI 2 "register_operand" "y"))))] + "TARGET_REALLY_IWMMXT" + "wpackhus%?\\t%0, %1, %2" + [(set_attr "predicable" "yes")]) + +(define_insn "iwmmxt_wpackwus" + [(set (match_operand:V4HI 0 "register_operand" "=y") + (vec_concat:V4HI + (us_truncate:V2HI (match_operand:V2SI 1 "register_operand" "y")) + (us_truncate:V2HI (match_operand:V2SI 2 "register_operand" "y"))))] + "TARGET_REALLY_IWMMXT" + "wpackwus%?\\t%0, %1, %2" + [(set_attr "predicable" "yes")]) + +(define_insn "iwmmxt_wpackdus" + [(set (match_operand:V2SI 0 "register_operand" "=y") + (vec_concat:V2SI + (us_truncate:SI (match_operand:DI 1 "register_operand" "y")) + (us_truncate:SI (match_operand:DI 2 "register_operand" "y"))))] + "TARGET_REALLY_IWMMXT" + "wpackdus%?\\t%0, %1, %2" + [(set_attr "predicable" "yes")]) + + +(define_insn "iwmmxt_wunpckihb" + [(set (match_operand:V8QI 0 "register_operand" "=y") + (vec_merge:V8QI + (vec_select:V8QI (match_operand:V8QI 1 "register_operand" "y") + (parallel [(const_int 4) + (const_int 0) + (const_int 5) + (const_int 1) + (const_int 6) + (const_int 2) + (const_int 7) + (const_int 3)])) + (vec_select:V8QI (match_operand:V8QI 2 "register_operand" "y") + (parallel [(const_int 0) + (const_int 4) + (const_int 1) + (const_int 5) + (const_int 2) + (const_int 6) + (const_int 3) + (const_int 7)])) + (const_int 85)))] + "TARGET_REALLY_IWMMXT" + "wunpckihb%?\\t%0, %1, %2" + [(set_attr "predicable" "yes")]) + +(define_insn "iwmmxt_wunpckihh" + [(set (match_operand:V4HI 0 "register_operand" "=y") + (vec_merge:V4HI + (vec_select:V4HI (match_operand:V4HI 1 "register_operand" "y") + (parallel [(const_int 0) + (const_int 2) + (const_int 1) + (const_int 3)])) + (vec_select:V4HI (match_operand:V4HI 2 "register_operand" "y") + (parallel [(const_int 2) + (const_int 0) + (const_int 3) + (const_int 1)])) + (const_int 5)))] + "TARGET_REALLY_IWMMXT" + "wunpckihh%?\\t%0, %1, %2" + [(set_attr "predicable" "yes")]) + +(define_insn "iwmmxt_wunpckihw" + [(set (match_operand:V2SI 0 "register_operand" "=y") + (vec_merge:V2SI + (vec_select:V2SI (match_operand:V2SI 1 "register_operand" "y") + (parallel [(const_int 0) + (const_int 1)])) + (vec_select:V2SI (match_operand:V2SI 2 "register_operand" "y") + (parallel [(const_int 1) + (const_int 0)])) + (const_int 1)))] + "TARGET_REALLY_IWMMXT" + "wunpckihw%?\\t%0, %1, %2" + [(set_attr "predicable" "yes")]) + +(define_insn "iwmmxt_wunpckilb" + [(set (match_operand:V8QI 0 "register_operand" "=y") + (vec_merge:V8QI + (vec_select:V8QI (match_operand:V8QI 1 "register_operand" "y") + (parallel [(const_int 0) + (const_int 4) + (const_int 1) + (const_int 5) + (const_int 2) + (const_int 6) + (const_int 3) + (const_int 7)])) + (vec_select:V8QI (match_operand:V8QI 2 "register_operand" "y") + (parallel [(const_int 4) + (const_int 0) + (const_int 5) + (const_int 1) + (const_int 6) + (const_int 2) + (const_int 7) + (const_int 3)])) + (const_int 85)))] + "TARGET_REALLY_IWMMXT" + "wunpckilb%?\\t%0, %1, %2" + [(set_attr "predicable" "yes")]) + +(define_insn "iwmmxt_wunpckilh" + [(set (match_operand:V4HI 0 "register_operand" "=y") + (vec_merge:V4HI + (vec_select:V4HI (match_operand:V4HI 1 "register_operand" "y") + (parallel [(const_int 2) + (const_int 0) + (const_int 3) + (const_int 1)])) + (vec_select:V4HI (match_operand:V4HI 2 "register_operand" "y") + (parallel [(const_int 0) + (const_int 2) + (const_int 1) + (const_int 3)])) + (const_int 5)))] + "TARGET_REALLY_IWMMXT" + "wunpckilh%?\\t%0, %1, %2" + [(set_attr "predicable" "yes")]) + +(define_insn "iwmmxt_wunpckilw" + [(set (match_operand:V2SI 0 "register_operand" "=y") + (vec_merge:V2SI + (vec_select:V2SI (match_operand:V2SI 1 "register_operand" "y") + (parallel [(const_int 1) + (const_int 0)])) + (vec_select:V2SI (match_operand:V2SI 2 "register_operand" "y") + (parallel [(const_int 0) + (const_int 1)])) + (const_int 1)))] + "TARGET_REALLY_IWMMXT" + "wunpckilw%?\\t%0, %1, %2" + [(set_attr "predicable" "yes")]) + +(define_insn "iwmmxt_wunpckehub" + [(set (match_operand:V4HI 0 "register_operand" "=y") + (zero_extend:V4HI + (vec_select:V4QI (match_operand:V8QI 1 "register_operand" "y") + (parallel [(const_int 4) (const_int 5) + (const_int 6) (const_int 7)]))))] + "TARGET_REALLY_IWMMXT" + "wunpckehub%?\\t%0, %1" + [(set_attr "predicable" "yes")]) + +(define_insn "iwmmxt_wunpckehuh" + [(set (match_operand:V2SI 0 "register_operand" "=y") + (zero_extend:V2SI + (vec_select:V2HI (match_operand:V4HI 1 "register_operand" "y") + (parallel [(const_int 2) (const_int 3)]))))] + "TARGET_REALLY_IWMMXT" + "wunpckehuh%?\\t%0, %1" + [(set_attr "predicable" "yes")]) + +(define_insn "iwmmxt_wunpckehuw" + [(set (match_operand:DI 0 "register_operand" "=y") + (zero_extend:DI + (vec_select:SI (match_operand:V2SI 1 "register_operand" "y") + (parallel [(const_int 1)]))))] + "TARGET_REALLY_IWMMXT" + "wunpckehuw%?\\t%0, %1" + [(set_attr "predicable" "yes")]) + +(define_insn "iwmmxt_wunpckehsb" + [(set (match_operand:V4HI 0 "register_operand" "=y") + (sign_extend:V4HI + (vec_select:V4QI (match_operand:V8QI 1 "register_operand" "y") + (parallel [(const_int 4) (const_int 5) + (const_int 6) (const_int 7)]))))] + "TARGET_REALLY_IWMMXT" + "wunpckehsb%?\\t%0, %1" + [(set_attr "predicable" "yes")]) + +(define_insn "iwmmxt_wunpckehsh" + [(set (match_operand:V2SI 0 "register_operand" "=y") + (sign_extend:V2SI + (vec_select:V2HI (match_operand:V4HI 1 "register_operand" "y") + (parallel [(const_int 2) (const_int 3)]))))] + "TARGET_REALLY_IWMMXT" + "wunpckehsh%?\\t%0, %1" + [(set_attr "predicable" "yes")]) + +(define_insn "iwmmxt_wunpckehsw" + [(set (match_operand:DI 0 "register_operand" "=y") + (sign_extend:DI + (vec_select:SI (match_operand:V2SI 1 "register_operand" "y") + (parallel [(const_int 1)]))))] + "TARGET_REALLY_IWMMXT" + "wunpckehsw%?\\t%0, %1" + [(set_attr "predicable" "yes")]) + +(define_insn "iwmmxt_wunpckelub" + [(set (match_operand:V4HI 0 "register_operand" "=y") + (zero_extend:V4HI + (vec_select:V4QI (match_operand:V8QI 1 "register_operand" "y") + (parallel [(const_int 0) (const_int 1) + (const_int 2) (const_int 3)]))))] + "TARGET_REALLY_IWMMXT" + "wunpckelub%?\\t%0, %1" + [(set_attr "predicable" "yes")]) + +(define_insn "iwmmxt_wunpckeluh" + [(set (match_operand:V2SI 0 "register_operand" "=y") + (zero_extend:V2SI + (vec_select:V2HI (match_operand:V4HI 1 "register_operand" "y") + (parallel [(const_int 0) (const_int 1)]))))] + "TARGET_REALLY_IWMMXT" + "wunpckeluh%?\\t%0, %1" + [(set_attr "predicable" "yes")]) + +(define_insn "iwmmxt_wunpckeluw" + [(set (match_operand:DI 0 "register_operand" "=y") + (zero_extend:DI + (vec_select:SI (match_operand:V2SI 1 "register_operand" "y") + (parallel [(const_int 0)]))))] + "TARGET_REALLY_IWMMXT" + "wunpckeluw%?\\t%0, %1" + [(set_attr "predicable" "yes")]) + +(define_insn "iwmmxt_wunpckelsb" + [(set (match_operand:V4HI 0 "register_operand" "=y") + (sign_extend:V4HI + (vec_select:V4QI (match_operand:V8QI 1 "register_operand" "y") + (parallel [(const_int 0) (const_int 1) + (const_int 2) (const_int 3)]))))] + "TARGET_REALLY_IWMMXT" + "wunpckelsb%?\\t%0, %1" + [(set_attr "predicable" "yes")]) + +(define_insn "iwmmxt_wunpckelsh" + [(set (match_operand:V2SI 0 "register_operand" "=y") + (sign_extend:V2SI + (vec_select:V2HI (match_operand:V4HI 1 "register_operand" "y") + (parallel [(const_int 0) (const_int 1)]))))] + "TARGET_REALLY_IWMMXT" + "wunpckelsh%?\\t%0, %1" + [(set_attr "predicable" "yes")]) + +(define_insn "iwmmxt_wunpckelsw" + [(set (match_operand:DI 0 "register_operand" "=y") + (sign_extend:DI + (vec_select:SI (match_operand:V2SI 1 "register_operand" "y") + (parallel [(const_int 0)]))))] + "TARGET_REALLY_IWMMXT" + "wunpckelsw%?\\t%0, %1" + [(set_attr "predicable" "yes")]) + +;; Shifts + +(define_insn "rorv4hi3" + [(set (match_operand:V4HI 0 "register_operand" "=y") + (rotatert:V4HI (match_operand:V4HI 1 "register_operand" "y") + (match_operand:SI 2 "register_operand" "z")))] + "TARGET_REALLY_IWMMXT" + "wrorhg%?\\t%0, %1, %2" + [(set_attr "predicable" "yes")]) + +(define_insn "rorv2si3" + [(set (match_operand:V2SI 0 "register_operand" "=y") + (rotatert:V2SI (match_operand:V2SI 1 "register_operand" "y") + (match_operand:SI 2 "register_operand" "z")))] + "TARGET_REALLY_IWMMXT" + "wrorwg%?\\t%0, %1, %2" + [(set_attr "predicable" "yes")]) + +(define_insn "rordi3" + [(set (match_operand:DI 0 "register_operand" "=y") + (rotatert:DI (match_operand:DI 1 "register_operand" "y") + (match_operand:SI 2 "register_operand" "z")))] + "TARGET_REALLY_IWMMXT" + "wrordg%?\\t%0, %1, %2" + [(set_attr "predicable" "yes")]) + +(define_insn "ashr3_iwmmxt" + [(set (match_operand:VSHFT 0 "register_operand" "=y") + (ashiftrt:VSHFT (match_operand:VSHFT 1 "register_operand" "y") + (match_operand:SI 2 "register_operand" "z")))] + "TARGET_REALLY_IWMMXT" + "wsrag%?\\t%0, %1, %2" + [(set_attr "predicable" "yes")]) + +(define_insn "lshr3_iwmmxt" + [(set (match_operand:VSHFT 0 "register_operand" "=y") + (lshiftrt:VSHFT (match_operand:VSHFT 1 "register_operand" "y") + (match_operand:SI 2 "register_operand" "z")))] + "TARGET_REALLY_IWMMXT" + "wsrlg%?\\t%0, %1, %2" + [(set_attr "predicable" "yes")]) + +(define_insn "ashl3_iwmmxt" + [(set (match_operand:VSHFT 0 "register_operand" "=y") + (ashift:VSHFT (match_operand:VSHFT 1 "register_operand" "y") + (match_operand:SI 2 "register_operand" "z")))] + "TARGET_REALLY_IWMMXT" + "wsllg%?\\t%0, %1, %2" + [(set_attr "predicable" "yes")]) + +(define_insn "rorv4hi3_di" + [(set (match_operand:V4HI 0 "register_operand" "=y") + (rotatert:V4HI (match_operand:V4HI 1 "register_operand" "y") + (match_operand:DI 2 "register_operand" "y")))] + "TARGET_REALLY_IWMMXT" + "wrorh%?\\t%0, %1, %2" + [(set_attr "predicable" "yes")]) + +(define_insn "rorv2si3_di" + [(set (match_operand:V2SI 0 "register_operand" "=y") + (rotatert:V2SI (match_operand:V2SI 1 "register_operand" "y") + (match_operand:DI 2 "register_operand" "y")))] + "TARGET_REALLY_IWMMXT" + "wrorw%?\\t%0, %1, %2" + [(set_attr "predicable" "yes")]) + +(define_insn "rordi3_di" + [(set (match_operand:DI 0 "register_operand" "=y") + (rotatert:DI (match_operand:DI 1 "register_operand" "y") + (match_operand:DI 2 "register_operand" "y")))] + "TARGET_REALLY_IWMMXT" + "wrord%?\\t%0, %1, %2" + [(set_attr "predicable" "yes")]) + +(define_insn "ashrv4hi3_di" + [(set (match_operand:V4HI 0 "register_operand" "=y") + (ashiftrt:V4HI (match_operand:V4HI 1 "register_operand" "y") + (match_operand:DI 2 "register_operand" "y")))] + "TARGET_REALLY_IWMMXT" + "wsrah%?\\t%0, %1, %2" + [(set_attr "predicable" "yes")]) + +(define_insn "ashrv2si3_di" + [(set (match_operand:V2SI 0 "register_operand" "=y") + (ashiftrt:V2SI (match_operand:V2SI 1 "register_operand" "y") + (match_operand:DI 2 "register_operand" "y")))] + "TARGET_REALLY_IWMMXT" + "wsraw%?\\t%0, %1, %2" + [(set_attr "predicable" "yes")]) + +(define_insn "ashrdi3_di" + [(set (match_operand:DI 0 "register_operand" "=y") + (ashiftrt:DI (match_operand:DI 1 "register_operand" "y") + (match_operand:DI 2 "register_operand" "y")))] + "TARGET_REALLY_IWMMXT" + "wsrad%?\\t%0, %1, %2" + [(set_attr "predicable" "yes")]) + +(define_insn "lshrv4hi3_di" + [(set (match_operand:V4HI 0 "register_operand" "=y") + (lshiftrt:V4HI (match_operand:V4HI 1 "register_operand" "y") + (match_operand:DI 2 "register_operand" "y")))] + "TARGET_REALLY_IWMMXT" + "wsrlh%?\\t%0, %1, %2" + [(set_attr "predicable" "yes")]) + +(define_insn "lshrv2si3_di" + [(set (match_operand:V2SI 0 "register_operand" "=y") + (lshiftrt:V2SI (match_operand:V2SI 1 "register_operand" "y") + (match_operand:DI 2 "register_operand" "y")))] + "TARGET_REALLY_IWMMXT" + "wsrlw%?\\t%0, %1, %2" + [(set_attr "predicable" "yes")]) + +(define_insn "lshrdi3_di" + [(set (match_operand:DI 0 "register_operand" "=y") + (lshiftrt:DI (match_operand:DI 1 "register_operand" "y") + (match_operand:DI 2 "register_operand" "y")))] + "TARGET_REALLY_IWMMXT" + "wsrld%?\\t%0, %1, %2" + [(set_attr "predicable" "yes")]) + +(define_insn "ashlv4hi3_di" + [(set (match_operand:V4HI 0 "register_operand" "=y") + (ashift:V4HI (match_operand:V4HI 1 "register_operand" "y") + (match_operand:DI 2 "register_operand" "y")))] + "TARGET_REALLY_IWMMXT" + "wsllh%?\\t%0, %1, %2" + [(set_attr "predicable" "yes")]) + +(define_insn "ashlv2si3_di" + [(set (match_operand:V2SI 0 "register_operand" "=y") + (ashift:V2SI (match_operand:V2SI 1 "register_operand" "y") + (match_operand:DI 2 "register_operand" "y")))] + "TARGET_REALLY_IWMMXT" + "wsllw%?\\t%0, %1, %2" + [(set_attr "predicable" "yes")]) + +(define_insn "ashldi3_di" + [(set (match_operand:DI 0 "register_operand" "=y") + (ashift:DI (match_operand:DI 1 "register_operand" "y") + (match_operand:DI 2 "register_operand" "y")))] + "TARGET_REALLY_IWMMXT" + "wslld%?\\t%0, %1, %2" + [(set_attr "predicable" "yes")]) + +(define_insn "iwmmxt_wmadds" + [(set (match_operand:V4HI 0 "register_operand" "=y") + (unspec:V4HI [(match_operand:V4HI 1 "register_operand" "y") + (match_operand:V4HI 2 "register_operand" "y")] UNSPEC_WMADDS))] + "TARGET_REALLY_IWMMXT" + "wmadds%?\\t%0, %1, %2" + [(set_attr "predicable" "yes")]) + +(define_insn "iwmmxt_wmaddu" + [(set (match_operand:V4HI 0 "register_operand" "=y") + (unspec:V4HI [(match_operand:V4HI 1 "register_operand" "y") + (match_operand:V4HI 2 "register_operand" "y")] UNSPEC_WMADDU))] + "TARGET_REALLY_IWMMXT" + "wmaddu%?\\t%0, %1, %2" + [(set_attr "predicable" "yes")]) + +(define_insn "iwmmxt_tmia" + [(set (match_operand:DI 0 "register_operand" "=y") + (plus:DI (match_operand:DI 1 "register_operand" "0") + (mult:DI (sign_extend:DI + (match_operand:SI 2 "register_operand" "r")) + (sign_extend:DI + (match_operand:SI 3 "register_operand" "r")))))] + "TARGET_REALLY_IWMMXT" + "tmia%?\\t%0, %2, %3" + [(set_attr "predicable" "yes")]) + +(define_insn "iwmmxt_tmiaph" + [(set (match_operand:DI 0 "register_operand" "=y") + (plus:DI (match_operand:DI 1 "register_operand" "0") + (plus:DI + (mult:DI (sign_extend:DI + (truncate:HI (match_operand:SI 2 "register_operand" "r"))) + (sign_extend:DI + (truncate:HI (match_operand:SI 3 "register_operand" "r")))) + (mult:DI (sign_extend:DI + (truncate:HI (ashiftrt:SI (match_dup 2) (const_int 16)))) + (sign_extend:DI + (truncate:HI (ashiftrt:SI (match_dup 3) (const_int 16))))))))] + "TARGET_REALLY_IWMMXT" + "tmiaph%?\\t%0, %2, %3" + [(set_attr "predicable" "yes")]) + +(define_insn "iwmmxt_tmiabb" + [(set (match_operand:DI 0 "register_operand" "=y") + (plus:DI (match_operand:DI 1 "register_operand" "0") + (mult:DI (sign_extend:DI + (truncate:HI (match_operand:SI 2 "register_operand" "r"))) + (sign_extend:DI + (truncate:HI (match_operand:SI 3 "register_operand" "r"))))))] + "TARGET_REALLY_IWMMXT" + "tmiabb%?\\t%0, %2, %3" + [(set_attr "predicable" "yes")]) + +(define_insn "iwmmxt_tmiatb" + [(set (match_operand:DI 0 "register_operand" "=y") + (plus:DI (match_operand:DI 1 "register_operand" "0") + (mult:DI (sign_extend:DI + (truncate:HI (ashiftrt:SI + (match_operand:SI 2 "register_operand" "r") + (const_int 16)))) + (sign_extend:DI + (truncate:HI (match_operand:SI 3 "register_operand" "r"))))))] + "TARGET_REALLY_IWMMXT" + "tmiatb%?\\t%0, %2, %3" + [(set_attr "predicable" "yes")]) + +(define_insn "iwmmxt_tmiabt" + [(set (match_operand:DI 0 "register_operand" "=y") + (plus:DI (match_operand:DI 1 "register_operand" "0") + (mult:DI (sign_extend:DI + (truncate:HI (match_operand:SI 2 "register_operand" "r"))) + (sign_extend:DI + (truncate:HI (ashiftrt:SI + (match_operand:SI 3 "register_operand" "r") + (const_int 16)))))))] + "TARGET_REALLY_IWMMXT" + "tmiabt%?\\t%0, %2, %3" + [(set_attr "predicable" "yes")]) + +(define_insn "iwmmxt_tmiatt" + [(set (match_operand:DI 0 "register_operand" "=y") + (plus:DI (match_operand:DI 1 "register_operand" "0") + (mult:DI (sign_extend:DI + (truncate:HI (ashiftrt:SI + (match_operand:SI 2 "register_operand" "r") + (const_int 16)))) + (sign_extend:DI + (truncate:HI (ashiftrt:SI + (match_operand:SI 3 "register_operand" "r") + (const_int 16)))))))] + "TARGET_REALLY_IWMMXT" + "tmiatt%?\\t%0, %2, %3" + [(set_attr "predicable" "yes")]) + +(define_insn "iwmmxt_tbcstqi" + [(set (match_operand:V8QI 0 "register_operand" "=y") + (vec_duplicate:V8QI (match_operand:QI 1 "register_operand" "r")))] + "TARGET_REALLY_IWMMXT" + "tbcstb%?\\t%0, %1" + [(set_attr "predicable" "yes")]) + +(define_insn "iwmmxt_tbcsthi" + [(set (match_operand:V4HI 0 "register_operand" "=y") + (vec_duplicate:V4HI (match_operand:HI 1 "register_operand" "r")))] + "TARGET_REALLY_IWMMXT" + "tbcsth%?\\t%0, %1" + [(set_attr "predicable" "yes")]) + +(define_insn "iwmmxt_tbcstsi" + [(set (match_operand:V2SI 0 "register_operand" "=y") + (vec_duplicate:V2SI (match_operand:SI 1 "register_operand" "r")))] + "TARGET_REALLY_IWMMXT" + "tbcstw%?\\t%0, %1" + [(set_attr "predicable" "yes")]) + +(define_insn "iwmmxt_tmovmskb" + [(set (match_operand:SI 0 "register_operand" "=r") + (unspec:SI [(match_operand:V8QI 1 "register_operand" "y")] UNSPEC_TMOVMSK))] + "TARGET_REALLY_IWMMXT" + "tmovmskb%?\\t%0, %1" + [(set_attr "predicable" "yes")]) + +(define_insn "iwmmxt_tmovmskh" + [(set (match_operand:SI 0 "register_operand" "=r") + (unspec:SI [(match_operand:V4HI 1 "register_operand" "y")] UNSPEC_TMOVMSK))] + "TARGET_REALLY_IWMMXT" + "tmovmskh%?\\t%0, %1" + [(set_attr "predicable" "yes")]) + +(define_insn "iwmmxt_tmovmskw" + [(set (match_operand:SI 0 "register_operand" "=r") + (unspec:SI [(match_operand:V2SI 1 "register_operand" "y")] UNSPEC_TMOVMSK))] + "TARGET_REALLY_IWMMXT" + "tmovmskw%?\\t%0, %1" + [(set_attr "predicable" "yes")]) + +(define_insn "iwmmxt_waccb" + [(set (match_operand:DI 0 "register_operand" "=y") + (unspec:DI [(match_operand:V8QI 1 "register_operand" "y")] UNSPEC_WACC))] + "TARGET_REALLY_IWMMXT" + "waccb%?\\t%0, %1" + [(set_attr "predicable" "yes")]) + +(define_insn "iwmmxt_wacch" + [(set (match_operand:DI 0 "register_operand" "=y") + (unspec:DI [(match_operand:V4HI 1 "register_operand" "y")] UNSPEC_WACC))] + "TARGET_REALLY_IWMMXT" + "wacch%?\\t%0, %1" + [(set_attr "predicable" "yes")]) + +(define_insn "iwmmxt_waccw" + [(set (match_operand:DI 0 "register_operand" "=y") + (unspec:DI [(match_operand:V2SI 1 "register_operand" "y")] UNSPEC_WACC))] + "TARGET_REALLY_IWMMXT" + "waccw%?\\t%0, %1" + [(set_attr "predicable" "yes")]) + +(define_insn "iwmmxt_walign" + [(set (match_operand:V8QI 0 "register_operand" "=y,y") + (subreg:V8QI (ashiftrt:TI + (subreg:TI (vec_concat:V16QI + (match_operand:V8QI 1 "register_operand" "y,y") + (match_operand:V8QI 2 "register_operand" "y,y")) 0) + (mult:SI + (match_operand:SI 3 "nonmemory_operand" "i,z") + (const_int 8))) 0))] + "TARGET_REALLY_IWMMXT" + "@ + waligni%?\\t%0, %1, %2, %3 + walignr%U3%?\\t%0, %1, %2" + [(set_attr "predicable" "yes")]) + +(define_insn "iwmmxt_tmrc" + [(set (match_operand:SI 0 "register_operand" "=r") + (unspec_volatile:SI [(match_operand:SI 1 "immediate_operand" "i")] + VUNSPEC_TMRC))] + "TARGET_REALLY_IWMMXT" + "tmrc%?\\t%0, %w1" + [(set_attr "predicable" "yes")]) + +(define_insn "iwmmxt_tmcr" + [(unspec_volatile:SI [(match_operand:SI 0 "immediate_operand" "i") + (match_operand:SI 1 "register_operand" "r")] + VUNSPEC_TMCR)] + "TARGET_REALLY_IWMMXT" + "tmcr%?\\t%w0, %1" + [(set_attr "predicable" "yes")]) + +(define_insn "iwmmxt_wsadb" + [(set (match_operand:V8QI 0 "register_operand" "=y") + (unspec:V8QI [(match_operand:V8QI 1 "register_operand" "y") + (match_operand:V8QI 2 "register_operand" "y")] UNSPEC_WSAD))] + "TARGET_REALLY_IWMMXT" + "wsadb%?\\t%0, %1, %2" + [(set_attr "predicable" "yes")]) + +(define_insn "iwmmxt_wsadh" + [(set (match_operand:V4HI 0 "register_operand" "=y") + (unspec:V4HI [(match_operand:V4HI 1 "register_operand" "y") + (match_operand:V4HI 2 "register_operand" "y")] UNSPEC_WSAD))] + "TARGET_REALLY_IWMMXT" + "wsadh%?\\t%0, %1, %2" + [(set_attr "predicable" "yes")]) + +(define_insn "iwmmxt_wsadbz" + [(set (match_operand:V8QI 0 "register_operand" "=y") + (unspec:V8QI [(match_operand:V8QI 1 "register_operand" "y") + (match_operand:V8QI 2 "register_operand" "y")] UNSPEC_WSADZ))] + "TARGET_REALLY_IWMMXT" + "wsadbz%?\\t%0, %1, %2" + [(set_attr "predicable" "yes")]) + +(define_insn "iwmmxt_wsadhz" + [(set (match_operand:V4HI 0 "register_operand" "=y") + (unspec:V4HI [(match_operand:V4HI 1 "register_operand" "y") + (match_operand:V4HI 2 "register_operand" "y")] UNSPEC_WSADZ))] + "TARGET_REALLY_IWMMXT" + "wsadhz%?\\t%0, %1, %2" + [(set_attr "predicable" "yes")]) + diff --git a/gcc/config/arm/ldmstm.md b/gcc/config/arm/ldmstm.md new file mode 100644 index 000000000..5db4a3269 --- /dev/null +++ b/gcc/config/arm/ldmstm.md @@ -0,0 +1,1191 @@ +/* ARM ldm/stm instruction patterns. This file was automatically generated + using arm-ldmstm.ml. Please do not edit manually. + + Copyright (C) 2010 Free Software Foundation, Inc. + Contributed by CodeSourcery. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3, or (at your + option) any later version. + + GCC is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public + License for more details. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +(define_insn "*ldm4_ia" + [(match_parallel 0 "load_multiple_operation" + [(set (match_operand:SI 1 "arm_hard_register_operand" "") + (mem:SI (match_operand:SI 5 "s_register_operand" "rk"))) + (set (match_operand:SI 2 "arm_hard_register_operand" "") + (mem:SI (plus:SI (match_dup 5) + (const_int 4)))) + (set (match_operand:SI 3 "arm_hard_register_operand" "") + (mem:SI (plus:SI (match_dup 5) + (const_int 8)))) + (set (match_operand:SI 4 "arm_hard_register_operand" "") + (mem:SI (plus:SI (match_dup 5) + (const_int 12))))])] + "TARGET_32BIT && XVECLEN (operands[0], 0) == 4" + "ldm%(ia%)\t%5, {%1, %2, %3, %4}" + [(set_attr "type" "load4") + (set_attr "predicable" "yes")]) + +(define_insn "*thumb_ldm4_ia" + [(match_parallel 0 "load_multiple_operation" + [(set (match_operand:SI 1 "arm_hard_register_operand" "") + (mem:SI (match_operand:SI 5 "s_register_operand" "l"))) + (set (match_operand:SI 2 "arm_hard_register_operand" "") + (mem:SI (plus:SI (match_dup 5) + (const_int 4)))) + (set (match_operand:SI 3 "arm_hard_register_operand" "") + (mem:SI (plus:SI (match_dup 5) + (const_int 8)))) + (set (match_operand:SI 4 "arm_hard_register_operand" "") + (mem:SI (plus:SI (match_dup 5) + (const_int 12))))])] + "TARGET_THUMB1 && XVECLEN (operands[0], 0) == 4" + "ldm%(ia%)\t%5, {%1, %2, %3, %4}" + [(set_attr "type" "load4")]) + +(define_insn "*ldm4_ia_update" + [(match_parallel 0 "load_multiple_operation" + [(set (match_operand:SI 5 "s_register_operand" "+&rk") + (plus:SI (match_dup 5) (const_int 16))) + (set (match_operand:SI 1 "arm_hard_register_operand" "") + (mem:SI (match_dup 5))) + (set (match_operand:SI 2 "arm_hard_register_operand" "") + (mem:SI (plus:SI (match_dup 5) + (const_int 4)))) + (set (match_operand:SI 3 "arm_hard_register_operand" "") + (mem:SI (plus:SI (match_dup 5) + (const_int 8)))) + (set (match_operand:SI 4 "arm_hard_register_operand" "") + (mem:SI (plus:SI (match_dup 5) + (const_int 12))))])] + "TARGET_32BIT && XVECLEN (operands[0], 0) == 5" + "ldm%(ia%)\t%5!, {%1, %2, %3, %4}" + [(set_attr "type" "load4") + (set_attr "predicable" "yes")]) + +(define_insn "*thumb_ldm4_ia_update" + [(match_parallel 0 "load_multiple_operation" + [(set (match_operand:SI 5 "s_register_operand" "+&l") + (plus:SI (match_dup 5) (const_int 16))) + (set (match_operand:SI 1 "arm_hard_register_operand" "") + (mem:SI (match_dup 5))) + (set (match_operand:SI 2 "arm_hard_register_operand" "") + (mem:SI (plus:SI (match_dup 5) + (const_int 4)))) + (set (match_operand:SI 3 "arm_hard_register_operand" "") + (mem:SI (plus:SI (match_dup 5) + (const_int 8)))) + (set (match_operand:SI 4 "arm_hard_register_operand" "") + (mem:SI (plus:SI (match_dup 5) + (const_int 12))))])] + "TARGET_THUMB1 && XVECLEN (operands[0], 0) == 5" + "ldm%(ia%)\t%5!, {%1, %2, %3, %4}" + [(set_attr "type" "load4")]) + +(define_insn "*stm4_ia" + [(match_parallel 0 "store_multiple_operation" + [(set (mem:SI (match_operand:SI 5 "s_register_operand" "rk")) + (match_operand:SI 1 "arm_hard_register_operand" "")) + (set (mem:SI (plus:SI (match_dup 5) (const_int 4))) + (match_operand:SI 2 "arm_hard_register_operand" "")) + (set (mem:SI (plus:SI (match_dup 5) (const_int 8))) + (match_operand:SI 3 "arm_hard_register_operand" "")) + (set (mem:SI (plus:SI (match_dup 5) (const_int 12))) + (match_operand:SI 4 "arm_hard_register_operand" ""))])] + "TARGET_32BIT && XVECLEN (operands[0], 0) == 4" + "stm%(ia%)\t%5, {%1, %2, %3, %4}" + [(set_attr "type" "store4") + (set_attr "predicable" "yes")]) + +(define_insn "*stm4_ia_update" + [(match_parallel 0 "store_multiple_operation" + [(set (match_operand:SI 5 "s_register_operand" "+&rk") + (plus:SI (match_dup 5) (const_int 16))) + (set (mem:SI (match_dup 5)) + (match_operand:SI 1 "arm_hard_register_operand" "")) + (set (mem:SI (plus:SI (match_dup 5) (const_int 4))) + (match_operand:SI 2 "arm_hard_register_operand" "")) + (set (mem:SI (plus:SI (match_dup 5) (const_int 8))) + (match_operand:SI 3 "arm_hard_register_operand" "")) + (set (mem:SI (plus:SI (match_dup 5) (const_int 12))) + (match_operand:SI 4 "arm_hard_register_operand" ""))])] + "TARGET_32BIT && XVECLEN (operands[0], 0) == 5" + "stm%(ia%)\t%5!, {%1, %2, %3, %4}" + [(set_attr "type" "store4") + (set_attr "predicable" "yes")]) + +(define_insn "*thumb_stm4_ia_update" + [(match_parallel 0 "store_multiple_operation" + [(set (match_operand:SI 5 "s_register_operand" "+&l") + (plus:SI (match_dup 5) (const_int 16))) + (set (mem:SI (match_dup 5)) + (match_operand:SI 1 "arm_hard_register_operand" "")) + (set (mem:SI (plus:SI (match_dup 5) (const_int 4))) + (match_operand:SI 2 "arm_hard_register_operand" "")) + (set (mem:SI (plus:SI (match_dup 5) (const_int 8))) + (match_operand:SI 3 "arm_hard_register_operand" "")) + (set (mem:SI (plus:SI (match_dup 5) (const_int 12))) + (match_operand:SI 4 "arm_hard_register_operand" ""))])] + "TARGET_THUMB1 && XVECLEN (operands[0], 0) == 5" + "stm%(ia%)\t%5!, {%1, %2, %3, %4}" + [(set_attr "type" "store4")]) + +(define_insn "*ldm4_ib" + [(match_parallel 0 "load_multiple_operation" + [(set (match_operand:SI 1 "arm_hard_register_operand" "") + (mem:SI (plus:SI (match_operand:SI 5 "s_register_operand" "rk") + (const_int 4)))) + (set (match_operand:SI 2 "arm_hard_register_operand" "") + (mem:SI (plus:SI (match_dup 5) + (const_int 8)))) + (set (match_operand:SI 3 "arm_hard_register_operand" "") + (mem:SI (plus:SI (match_dup 5) + (const_int 12)))) + (set (match_operand:SI 4 "arm_hard_register_operand" "") + (mem:SI (plus:SI (match_dup 5) + (const_int 16))))])] + "TARGET_ARM && XVECLEN (operands[0], 0) == 4" + "ldm%(ib%)\t%5, {%1, %2, %3, %4}" + [(set_attr "type" "load4") + (set_attr "predicable" "yes")]) + +(define_insn "*ldm4_ib_update" + [(match_parallel 0 "load_multiple_operation" + [(set (match_operand:SI 5 "s_register_operand" "+&rk") + (plus:SI (match_dup 5) (const_int 16))) + (set (match_operand:SI 1 "arm_hard_register_operand" "") + (mem:SI (plus:SI (match_dup 5) + (const_int 4)))) + (set (match_operand:SI 2 "arm_hard_register_operand" "") + (mem:SI (plus:SI (match_dup 5) + (const_int 8)))) + (set (match_operand:SI 3 "arm_hard_register_operand" "") + (mem:SI (plus:SI (match_dup 5) + (const_int 12)))) + (set (match_operand:SI 4 "arm_hard_register_operand" "") + (mem:SI (plus:SI (match_dup 5) + (const_int 16))))])] + "TARGET_ARM && XVECLEN (operands[0], 0) == 5" + "ldm%(ib%)\t%5!, {%1, %2, %3, %4}" + [(set_attr "type" "load4") + (set_attr "predicable" "yes")]) + +(define_insn "*stm4_ib" + [(match_parallel 0 "store_multiple_operation" + [(set (mem:SI (plus:SI (match_operand:SI 5 "s_register_operand" "rk") (const_int 4))) + (match_operand:SI 1 "arm_hard_register_operand" "")) + (set (mem:SI (plus:SI (match_dup 5) (const_int 8))) + (match_operand:SI 2 "arm_hard_register_operand" "")) + (set (mem:SI (plus:SI (match_dup 5) (const_int 12))) + (match_operand:SI 3 "arm_hard_register_operand" "")) + (set (mem:SI (plus:SI (match_dup 5) (const_int 16))) + (match_operand:SI 4 "arm_hard_register_operand" ""))])] + "TARGET_ARM && XVECLEN (operands[0], 0) == 4" + "stm%(ib%)\t%5, {%1, %2, %3, %4}" + [(set_attr "type" "store4") + (set_attr "predicable" "yes")]) + +(define_insn "*stm4_ib_update" + [(match_parallel 0 "store_multiple_operation" + [(set (match_operand:SI 5 "s_register_operand" "+&rk") + (plus:SI (match_dup 5) (const_int 16))) + (set (mem:SI (plus:SI (match_dup 5) (const_int 4))) + (match_operand:SI 1 "arm_hard_register_operand" "")) + (set (mem:SI (plus:SI (match_dup 5) (const_int 8))) + (match_operand:SI 2 "arm_hard_register_operand" "")) + (set (mem:SI (plus:SI (match_dup 5) (const_int 12))) + (match_operand:SI 3 "arm_hard_register_operand" "")) + (set (mem:SI (plus:SI (match_dup 5) (const_int 16))) + (match_operand:SI 4 "arm_hard_register_operand" ""))])] + "TARGET_ARM && XVECLEN (operands[0], 0) == 5" + "stm%(ib%)\t%5!, {%1, %2, %3, %4}" + [(set_attr "type" "store4") + (set_attr "predicable" "yes")]) + +(define_insn "*ldm4_da" + [(match_parallel 0 "load_multiple_operation" + [(set (match_operand:SI 1 "arm_hard_register_operand" "") + (mem:SI (plus:SI (match_operand:SI 5 "s_register_operand" "rk") + (const_int -12)))) + (set (match_operand:SI 2 "arm_hard_register_operand" "") + (mem:SI (plus:SI (match_dup 5) + (const_int -8)))) + (set (match_operand:SI 3 "arm_hard_register_operand" "") + (mem:SI (plus:SI (match_dup 5) + (const_int -4)))) + (set (match_operand:SI 4 "arm_hard_register_operand" "") + (mem:SI (match_dup 5)))])] + "TARGET_ARM && XVECLEN (operands[0], 0) == 4" + "ldm%(da%)\t%5, {%1, %2, %3, %4}" + [(set_attr "type" "load4") + (set_attr "predicable" "yes")]) + +(define_insn "*ldm4_da_update" + [(match_parallel 0 "load_multiple_operation" + [(set (match_operand:SI 5 "s_register_operand" "+&rk") + (plus:SI (match_dup 5) (const_int -16))) + (set (match_operand:SI 1 "arm_hard_register_operand" "") + (mem:SI (plus:SI (match_dup 5) + (const_int -12)))) + (set (match_operand:SI 2 "arm_hard_register_operand" "") + (mem:SI (plus:SI (match_dup 5) + (const_int -8)))) + (set (match_operand:SI 3 "arm_hard_register_operand" "") + (mem:SI (plus:SI (match_dup 5) + (const_int -4)))) + (set (match_operand:SI 4 "arm_hard_register_operand" "") + (mem:SI (match_dup 5)))])] + "TARGET_ARM && XVECLEN (operands[0], 0) == 5" + "ldm%(da%)\t%5!, {%1, %2, %3, %4}" + [(set_attr "type" "load4") + (set_attr "predicable" "yes")]) + +(define_insn "*stm4_da" + [(match_parallel 0 "store_multiple_operation" + [(set (mem:SI (plus:SI (match_operand:SI 5 "s_register_operand" "rk") (const_int -12))) + (match_operand:SI 1 "arm_hard_register_operand" "")) + (set (mem:SI (plus:SI (match_dup 5) (const_int -8))) + (match_operand:SI 2 "arm_hard_register_operand" "")) + (set (mem:SI (plus:SI (match_dup 5) (const_int -4))) + (match_operand:SI 3 "arm_hard_register_operand" "")) + (set (mem:SI (match_dup 5)) + (match_operand:SI 4 "arm_hard_register_operand" ""))])] + "TARGET_ARM && XVECLEN (operands[0], 0) == 4" + "stm%(da%)\t%5, {%1, %2, %3, %4}" + [(set_attr "type" "store4") + (set_attr "predicable" "yes")]) + +(define_insn "*stm4_da_update" + [(match_parallel 0 "store_multiple_operation" + [(set (match_operand:SI 5 "s_register_operand" "+&rk") + (plus:SI (match_dup 5) (const_int -16))) + (set (mem:SI (plus:SI (match_dup 5) (const_int -12))) + (match_operand:SI 1 "arm_hard_register_operand" "")) + (set (mem:SI (plus:SI (match_dup 5) (const_int -8))) + (match_operand:SI 2 "arm_hard_register_operand" "")) + (set (mem:SI (plus:SI (match_dup 5) (const_int -4))) + (match_operand:SI 3 "arm_hard_register_operand" "")) + (set (mem:SI (match_dup 5)) + (match_operand:SI 4 "arm_hard_register_operand" ""))])] + "TARGET_ARM && XVECLEN (operands[0], 0) == 5" + "stm%(da%)\t%5!, {%1, %2, %3, %4}" + [(set_attr "type" "store4") + (set_attr "predicable" "yes")]) + +(define_insn "*ldm4_db" + [(match_parallel 0 "load_multiple_operation" + [(set (match_operand:SI 1 "arm_hard_register_operand" "") + (mem:SI (plus:SI (match_operand:SI 5 "s_register_operand" "rk") + (const_int -16)))) + (set (match_operand:SI 2 "arm_hard_register_operand" "") + (mem:SI (plus:SI (match_dup 5) + (const_int -12)))) + (set (match_operand:SI 3 "arm_hard_register_operand" "") + (mem:SI (plus:SI (match_dup 5) + (const_int -8)))) + (set (match_operand:SI 4 "arm_hard_register_operand" "") + (mem:SI (plus:SI (match_dup 5) + (const_int -4))))])] + "TARGET_32BIT && XVECLEN (operands[0], 0) == 4" + "ldm%(db%)\t%5, {%1, %2, %3, %4}" + [(set_attr "type" "load4") + (set_attr "predicable" "yes")]) + +(define_insn "*ldm4_db_update" + [(match_parallel 0 "load_multiple_operation" + [(set (match_operand:SI 5 "s_register_operand" "+&rk") + (plus:SI (match_dup 5) (const_int -16))) + (set (match_operand:SI 1 "arm_hard_register_operand" "") + (mem:SI (plus:SI (match_dup 5) + (const_int -16)))) + (set (match_operand:SI 2 "arm_hard_register_operand" "") + (mem:SI (plus:SI (match_dup 5) + (const_int -12)))) + (set (match_operand:SI 3 "arm_hard_register_operand" "") + (mem:SI (plus:SI (match_dup 5) + (const_int -8)))) + (set (match_operand:SI 4 "arm_hard_register_operand" "") + (mem:SI (plus:SI (match_dup 5) + (const_int -4))))])] + "TARGET_32BIT && XVECLEN (operands[0], 0) == 5" + "ldm%(db%)\t%5!, {%1, %2, %3, %4}" + [(set_attr "type" "load4") + (set_attr "predicable" "yes")]) + +(define_insn "*stm4_db" + [(match_parallel 0 "store_multiple_operation" + [(set (mem:SI (plus:SI (match_operand:SI 5 "s_register_operand" "rk") (const_int -16))) + (match_operand:SI 1 "arm_hard_register_operand" "")) + (set (mem:SI (plus:SI (match_dup 5) (const_int -12))) + (match_operand:SI 2 "arm_hard_register_operand" "")) + (set (mem:SI (plus:SI (match_dup 5) (const_int -8))) + (match_operand:SI 3 "arm_hard_register_operand" "")) + (set (mem:SI (plus:SI (match_dup 5) (const_int -4))) + (match_operand:SI 4 "arm_hard_register_operand" ""))])] + "TARGET_32BIT && XVECLEN (operands[0], 0) == 4" + "stm%(db%)\t%5, {%1, %2, %3, %4}" + [(set_attr "type" "store4") + (set_attr "predicable" "yes")]) + +(define_insn "*stm4_db_update" + [(match_parallel 0 "store_multiple_operation" + [(set (match_operand:SI 5 "s_register_operand" "+&rk") + (plus:SI (match_dup 5) (const_int -16))) + (set (mem:SI (plus:SI (match_dup 5) (const_int -16))) + (match_operand:SI 1 "arm_hard_register_operand" "")) + (set (mem:SI (plus:SI (match_dup 5) (const_int -12))) + (match_operand:SI 2 "arm_hard_register_operand" "")) + (set (mem:SI (plus:SI (match_dup 5) (const_int -8))) + (match_operand:SI 3 "arm_hard_register_operand" "")) + (set (mem:SI (plus:SI (match_dup 5) (const_int -4))) + (match_operand:SI 4 "arm_hard_register_operand" ""))])] + "TARGET_32BIT && XVECLEN (operands[0], 0) == 5" + "stm%(db%)\t%5!, {%1, %2, %3, %4}" + [(set_attr "type" "store4") + (set_attr "predicable" "yes")]) + +(define_peephole2 + [(set (match_operand:SI 0 "s_register_operand" "") + (match_operand:SI 4 "memory_operand" "")) + (set (match_operand:SI 1 "s_register_operand" "") + (match_operand:SI 5 "memory_operand" "")) + (set (match_operand:SI 2 "s_register_operand" "") + (match_operand:SI 6 "memory_operand" "")) + (set (match_operand:SI 3 "s_register_operand" "") + (match_operand:SI 7 "memory_operand" ""))] + "" + [(const_int 0)] +{ + if (gen_ldm_seq (operands, 4, false)) + DONE; + else + FAIL; +}) + +(define_peephole2 + [(set (match_operand:SI 0 "s_register_operand" "") + (match_operand:SI 4 "memory_operand" "")) + (parallel + [(set (match_operand:SI 1 "s_register_operand" "") + (match_operand:SI 5 "memory_operand" "")) + (set (match_operand:SI 2 "s_register_operand" "") + (match_operand:SI 6 "memory_operand" "")) + (set (match_operand:SI 3 "s_register_operand" "") + (match_operand:SI 7 "memory_operand" ""))])] + "" + [(const_int 0)] +{ + if (gen_ldm_seq (operands, 4, false)) + DONE; + else + FAIL; +}) + +(define_peephole2 + [(set (match_operand:SI 0 "s_register_operand" "") + (match_operand:SI 8 "const_int_operand" "")) + (set (match_operand:SI 4 "memory_operand" "") + (match_dup 0)) + (set (match_operand:SI 1 "s_register_operand" "") + (match_operand:SI 9 "const_int_operand" "")) + (set (match_operand:SI 5 "memory_operand" "") + (match_dup 1)) + (set (match_operand:SI 2 "s_register_operand" "") + (match_operand:SI 10 "const_int_operand" "")) + (set (match_operand:SI 6 "memory_operand" "") + (match_dup 2)) + (set (match_operand:SI 3 "s_register_operand" "") + (match_operand:SI 11 "const_int_operand" "")) + (set (match_operand:SI 7 "memory_operand" "") + (match_dup 3))] + "" + [(const_int 0)] +{ + if (gen_const_stm_seq (operands, 4)) + DONE; + else + FAIL; +}) + +(define_peephole2 + [(set (match_operand:SI 0 "s_register_operand" "") + (match_operand:SI 8 "const_int_operand" "")) + (set (match_operand:SI 1 "s_register_operand" "") + (match_operand:SI 9 "const_int_operand" "")) + (set (match_operand:SI 2 "s_register_operand" "") + (match_operand:SI 10 "const_int_operand" "")) + (set (match_operand:SI 3 "s_register_operand" "") + (match_operand:SI 11 "const_int_operand" "")) + (set (match_operand:SI 4 "memory_operand" "") + (match_dup 0)) + (set (match_operand:SI 5 "memory_operand" "") + (match_dup 1)) + (set (match_operand:SI 6 "memory_operand" "") + (match_dup 2)) + (set (match_operand:SI 7 "memory_operand" "") + (match_dup 3))] + "" + [(const_int 0)] +{ + if (gen_const_stm_seq (operands, 4)) + DONE; + else + FAIL; +}) + +(define_peephole2 + [(set (match_operand:SI 4 "memory_operand" "") + (match_operand:SI 0 "s_register_operand" "")) + (set (match_operand:SI 5 "memory_operand" "") + (match_operand:SI 1 "s_register_operand" "")) + (set (match_operand:SI 6 "memory_operand" "") + (match_operand:SI 2 "s_register_operand" "")) + (set (match_operand:SI 7 "memory_operand" "") + (match_operand:SI 3 "s_register_operand" ""))] + "" + [(const_int 0)] +{ + if (gen_stm_seq (operands, 4)) + DONE; + else + FAIL; +}) + +(define_insn "*ldm3_ia" + [(match_parallel 0 "load_multiple_operation" + [(set (match_operand:SI 1 "arm_hard_register_operand" "") + (mem:SI (match_operand:SI 4 "s_register_operand" "rk"))) + (set (match_operand:SI 2 "arm_hard_register_operand" "") + (mem:SI (plus:SI (match_dup 4) + (const_int 4)))) + (set (match_operand:SI 3 "arm_hard_register_operand" "") + (mem:SI (plus:SI (match_dup 4) + (const_int 8))))])] + "TARGET_32BIT && XVECLEN (operands[0], 0) == 3" + "ldm%(ia%)\t%4, {%1, %2, %3}" + [(set_attr "type" "load3") + (set_attr "predicable" "yes")]) + +(define_insn "*thumb_ldm3_ia" + [(match_parallel 0 "load_multiple_operation" + [(set (match_operand:SI 1 "arm_hard_register_operand" "") + (mem:SI (match_operand:SI 4 "s_register_operand" "l"))) + (set (match_operand:SI 2 "arm_hard_register_operand" "") + (mem:SI (plus:SI (match_dup 4) + (const_int 4)))) + (set (match_operand:SI 3 "arm_hard_register_operand" "") + (mem:SI (plus:SI (match_dup 4) + (const_int 8))))])] + "TARGET_THUMB1 && XVECLEN (operands[0], 0) == 3" + "ldm%(ia%)\t%4, {%1, %2, %3}" + [(set_attr "type" "load3")]) + +(define_insn "*ldm3_ia_update" + [(match_parallel 0 "load_multiple_operation" + [(set (match_operand:SI 4 "s_register_operand" "+&rk") + (plus:SI (match_dup 4) (const_int 12))) + (set (match_operand:SI 1 "arm_hard_register_operand" "") + (mem:SI (match_dup 4))) + (set (match_operand:SI 2 "arm_hard_register_operand" "") + (mem:SI (plus:SI (match_dup 4) + (const_int 4)))) + (set (match_operand:SI 3 "arm_hard_register_operand" "") + (mem:SI (plus:SI (match_dup 4) + (const_int 8))))])] + "TARGET_32BIT && XVECLEN (operands[0], 0) == 4" + "ldm%(ia%)\t%4!, {%1, %2, %3}" + [(set_attr "type" "load3") + (set_attr "predicable" "yes")]) + +(define_insn "*thumb_ldm3_ia_update" + [(match_parallel 0 "load_multiple_operation" + [(set (match_operand:SI 4 "s_register_operand" "+&l") + (plus:SI (match_dup 4) (const_int 12))) + (set (match_operand:SI 1 "arm_hard_register_operand" "") + (mem:SI (match_dup 4))) + (set (match_operand:SI 2 "arm_hard_register_operand" "") + (mem:SI (plus:SI (match_dup 4) + (const_int 4)))) + (set (match_operand:SI 3 "arm_hard_register_operand" "") + (mem:SI (plus:SI (match_dup 4) + (const_int 8))))])] + "TARGET_THUMB1 && XVECLEN (operands[0], 0) == 4" + "ldm%(ia%)\t%4!, {%1, %2, %3}" + [(set_attr "type" "load3")]) + +(define_insn "*stm3_ia" + [(match_parallel 0 "store_multiple_operation" + [(set (mem:SI (match_operand:SI 4 "s_register_operand" "rk")) + (match_operand:SI 1 "arm_hard_register_operand" "")) + (set (mem:SI (plus:SI (match_dup 4) (const_int 4))) + (match_operand:SI 2 "arm_hard_register_operand" "")) + (set (mem:SI (plus:SI (match_dup 4) (const_int 8))) + (match_operand:SI 3 "arm_hard_register_operand" ""))])] + "TARGET_32BIT && XVECLEN (operands[0], 0) == 3" + "stm%(ia%)\t%4, {%1, %2, %3}" + [(set_attr "type" "store3") + (set_attr "predicable" "yes")]) + +(define_insn "*stm3_ia_update" + [(match_parallel 0 "store_multiple_operation" + [(set (match_operand:SI 4 "s_register_operand" "+&rk") + (plus:SI (match_dup 4) (const_int 12))) + (set (mem:SI (match_dup 4)) + (match_operand:SI 1 "arm_hard_register_operand" "")) + (set (mem:SI (plus:SI (match_dup 4) (const_int 4))) + (match_operand:SI 2 "arm_hard_register_operand" "")) + (set (mem:SI (plus:SI (match_dup 4) (const_int 8))) + (match_operand:SI 3 "arm_hard_register_operand" ""))])] + "TARGET_32BIT && XVECLEN (operands[0], 0) == 4" + "stm%(ia%)\t%4!, {%1, %2, %3}" + [(set_attr "type" "store3") + (set_attr "predicable" "yes")]) + +(define_insn "*thumb_stm3_ia_update" + [(match_parallel 0 "store_multiple_operation" + [(set (match_operand:SI 4 "s_register_operand" "+&l") + (plus:SI (match_dup 4) (const_int 12))) + (set (mem:SI (match_dup 4)) + (match_operand:SI 1 "arm_hard_register_operand" "")) + (set (mem:SI (plus:SI (match_dup 4) (const_int 4))) + (match_operand:SI 2 "arm_hard_register_operand" "")) + (set (mem:SI (plus:SI (match_dup 4) (const_int 8))) + (match_operand:SI 3 "arm_hard_register_operand" ""))])] + "TARGET_THUMB1 && XVECLEN (operands[0], 0) == 4" + "stm%(ia%)\t%4!, {%1, %2, %3}" + [(set_attr "type" "store3")]) + +(define_insn "*ldm3_ib" + [(match_parallel 0 "load_multiple_operation" + [(set (match_operand:SI 1 "arm_hard_register_operand" "") + (mem:SI (plus:SI (match_operand:SI 4 "s_register_operand" "rk") + (const_int 4)))) + (set (match_operand:SI 2 "arm_hard_register_operand" "") + (mem:SI (plus:SI (match_dup 4) + (const_int 8)))) + (set (match_operand:SI 3 "arm_hard_register_operand" "") + (mem:SI (plus:SI (match_dup 4) + (const_int 12))))])] + "TARGET_ARM && XVECLEN (operands[0], 0) == 3" + "ldm%(ib%)\t%4, {%1, %2, %3}" + [(set_attr "type" "load3") + (set_attr "predicable" "yes")]) + +(define_insn "*ldm3_ib_update" + [(match_parallel 0 "load_multiple_operation" + [(set (match_operand:SI 4 "s_register_operand" "+&rk") + (plus:SI (match_dup 4) (const_int 12))) + (set (match_operand:SI 1 "arm_hard_register_operand" "") + (mem:SI (plus:SI (match_dup 4) + (const_int 4)))) + (set (match_operand:SI 2 "arm_hard_register_operand" "") + (mem:SI (plus:SI (match_dup 4) + (const_int 8)))) + (set (match_operand:SI 3 "arm_hard_register_operand" "") + (mem:SI (plus:SI (match_dup 4) + (const_int 12))))])] + "TARGET_ARM && XVECLEN (operands[0], 0) == 4" + "ldm%(ib%)\t%4!, {%1, %2, %3}" + [(set_attr "type" "load3") + (set_attr "predicable" "yes")]) + +(define_insn "*stm3_ib" + [(match_parallel 0 "store_multiple_operation" + [(set (mem:SI (plus:SI (match_operand:SI 4 "s_register_operand" "rk") (const_int 4))) + (match_operand:SI 1 "arm_hard_register_operand" "")) + (set (mem:SI (plus:SI (match_dup 4) (const_int 8))) + (match_operand:SI 2 "arm_hard_register_operand" "")) + (set (mem:SI (plus:SI (match_dup 4) (const_int 12))) + (match_operand:SI 3 "arm_hard_register_operand" ""))])] + "TARGET_ARM && XVECLEN (operands[0], 0) == 3" + "stm%(ib%)\t%4, {%1, %2, %3}" + [(set_attr "type" "store3") + (set_attr "predicable" "yes")]) + +(define_insn "*stm3_ib_update" + [(match_parallel 0 "store_multiple_operation" + [(set (match_operand:SI 4 "s_register_operand" "+&rk") + (plus:SI (match_dup 4) (const_int 12))) + (set (mem:SI (plus:SI (match_dup 4) (const_int 4))) + (match_operand:SI 1 "arm_hard_register_operand" "")) + (set (mem:SI (plus:SI (match_dup 4) (const_int 8))) + (match_operand:SI 2 "arm_hard_register_operand" "")) + (set (mem:SI (plus:SI (match_dup 4) (const_int 12))) + (match_operand:SI 3 "arm_hard_register_operand" ""))])] + "TARGET_ARM && XVECLEN (operands[0], 0) == 4" + "stm%(ib%)\t%4!, {%1, %2, %3}" + [(set_attr "type" "store3") + (set_attr "predicable" "yes")]) + +(define_insn "*ldm3_da" + [(match_parallel 0 "load_multiple_operation" + [(set (match_operand:SI 1 "arm_hard_register_operand" "") + (mem:SI (plus:SI (match_operand:SI 4 "s_register_operand" "rk") + (const_int -8)))) + (set (match_operand:SI 2 "arm_hard_register_operand" "") + (mem:SI (plus:SI (match_dup 4) + (const_int -4)))) + (set (match_operand:SI 3 "arm_hard_register_operand" "") + (mem:SI (match_dup 4)))])] + "TARGET_ARM && XVECLEN (operands[0], 0) == 3" + "ldm%(da%)\t%4, {%1, %2, %3}" + [(set_attr "type" "load3") + (set_attr "predicable" "yes")]) + +(define_insn "*ldm3_da_update" + [(match_parallel 0 "load_multiple_operation" + [(set (match_operand:SI 4 "s_register_operand" "+&rk") + (plus:SI (match_dup 4) (const_int -12))) + (set (match_operand:SI 1 "arm_hard_register_operand" "") + (mem:SI (plus:SI (match_dup 4) + (const_int -8)))) + (set (match_operand:SI 2 "arm_hard_register_operand" "") + (mem:SI (plus:SI (match_dup 4) + (const_int -4)))) + (set (match_operand:SI 3 "arm_hard_register_operand" "") + (mem:SI (match_dup 4)))])] + "TARGET_ARM && XVECLEN (operands[0], 0) == 4" + "ldm%(da%)\t%4!, {%1, %2, %3}" + [(set_attr "type" "load3") + (set_attr "predicable" "yes")]) + +(define_insn "*stm3_da" + [(match_parallel 0 "store_multiple_operation" + [(set (mem:SI (plus:SI (match_operand:SI 4 "s_register_operand" "rk") (const_int -8))) + (match_operand:SI 1 "arm_hard_register_operand" "")) + (set (mem:SI (plus:SI (match_dup 4) (const_int -4))) + (match_operand:SI 2 "arm_hard_register_operand" "")) + (set (mem:SI (match_dup 4)) + (match_operand:SI 3 "arm_hard_register_operand" ""))])] + "TARGET_ARM && XVECLEN (operands[0], 0) == 3" + "stm%(da%)\t%4, {%1, %2, %3}" + [(set_attr "type" "store3") + (set_attr "predicable" "yes")]) + +(define_insn "*stm3_da_update" + [(match_parallel 0 "store_multiple_operation" + [(set (match_operand:SI 4 "s_register_operand" "+&rk") + (plus:SI (match_dup 4) (const_int -12))) + (set (mem:SI (plus:SI (match_dup 4) (const_int -8))) + (match_operand:SI 1 "arm_hard_register_operand" "")) + (set (mem:SI (plus:SI (match_dup 4) (const_int -4))) + (match_operand:SI 2 "arm_hard_register_operand" "")) + (set (mem:SI (match_dup 4)) + (match_operand:SI 3 "arm_hard_register_operand" ""))])] + "TARGET_ARM && XVECLEN (operands[0], 0) == 4" + "stm%(da%)\t%4!, {%1, %2, %3}" + [(set_attr "type" "store3") + (set_attr "predicable" "yes")]) + +(define_insn "*ldm3_db" + [(match_parallel 0 "load_multiple_operation" + [(set (match_operand:SI 1 "arm_hard_register_operand" "") + (mem:SI (plus:SI (match_operand:SI 4 "s_register_operand" "rk") + (const_int -12)))) + (set (match_operand:SI 2 "arm_hard_register_operand" "") + (mem:SI (plus:SI (match_dup 4) + (const_int -8)))) + (set (match_operand:SI 3 "arm_hard_register_operand" "") + (mem:SI (plus:SI (match_dup 4) + (const_int -4))))])] + "TARGET_32BIT && XVECLEN (operands[0], 0) == 3" + "ldm%(db%)\t%4, {%1, %2, %3}" + [(set_attr "type" "load3") + (set_attr "predicable" "yes")]) + +(define_insn "*ldm3_db_update" + [(match_parallel 0 "load_multiple_operation" + [(set (match_operand:SI 4 "s_register_operand" "+&rk") + (plus:SI (match_dup 4) (const_int -12))) + (set (match_operand:SI 1 "arm_hard_register_operand" "") + (mem:SI (plus:SI (match_dup 4) + (const_int -12)))) + (set (match_operand:SI 2 "arm_hard_register_operand" "") + (mem:SI (plus:SI (match_dup 4) + (const_int -8)))) + (set (match_operand:SI 3 "arm_hard_register_operand" "") + (mem:SI (plus:SI (match_dup 4) + (const_int -4))))])] + "TARGET_32BIT && XVECLEN (operands[0], 0) == 4" + "ldm%(db%)\t%4!, {%1, %2, %3}" + [(set_attr "type" "load3") + (set_attr "predicable" "yes")]) + +(define_insn "*stm3_db" + [(match_parallel 0 "store_multiple_operation" + [(set (mem:SI (plus:SI (match_operand:SI 4 "s_register_operand" "rk") (const_int -12))) + (match_operand:SI 1 "arm_hard_register_operand" "")) + (set (mem:SI (plus:SI (match_dup 4) (const_int -8))) + (match_operand:SI 2 "arm_hard_register_operand" "")) + (set (mem:SI (plus:SI (match_dup 4) (const_int -4))) + (match_operand:SI 3 "arm_hard_register_operand" ""))])] + "TARGET_32BIT && XVECLEN (operands[0], 0) == 3" + "stm%(db%)\t%4, {%1, %2, %3}" + [(set_attr "type" "store3") + (set_attr "predicable" "yes")]) + +(define_insn "*stm3_db_update" + [(match_parallel 0 "store_multiple_operation" + [(set (match_operand:SI 4 "s_register_operand" "+&rk") + (plus:SI (match_dup 4) (const_int -12))) + (set (mem:SI (plus:SI (match_dup 4) (const_int -12))) + (match_operand:SI 1 "arm_hard_register_operand" "")) + (set (mem:SI (plus:SI (match_dup 4) (const_int -8))) + (match_operand:SI 2 "arm_hard_register_operand" "")) + (set (mem:SI (plus:SI (match_dup 4) (const_int -4))) + (match_operand:SI 3 "arm_hard_register_operand" ""))])] + "TARGET_32BIT && XVECLEN (operands[0], 0) == 4" + "stm%(db%)\t%4!, {%1, %2, %3}" + [(set_attr "type" "store3") + (set_attr "predicable" "yes")]) + +(define_peephole2 + [(set (match_operand:SI 0 "s_register_operand" "") + (match_operand:SI 3 "memory_operand" "")) + (set (match_operand:SI 1 "s_register_operand" "") + (match_operand:SI 4 "memory_operand" "")) + (set (match_operand:SI 2 "s_register_operand" "") + (match_operand:SI 5 "memory_operand" ""))] + "" + [(const_int 0)] +{ + if (gen_ldm_seq (operands, 3, false)) + DONE; + else + FAIL; +}) + +(define_peephole2 + [(set (match_operand:SI 0 "s_register_operand" "") + (match_operand:SI 3 "memory_operand" "")) + (parallel + [(set (match_operand:SI 1 "s_register_operand" "") + (match_operand:SI 4 "memory_operand" "")) + (set (match_operand:SI 2 "s_register_operand" "") + (match_operand:SI 5 "memory_operand" ""))])] + "" + [(const_int 0)] +{ + if (gen_ldm_seq (operands, 3, false)) + DONE; + else + FAIL; +}) + +(define_peephole2 + [(set (match_operand:SI 0 "s_register_operand" "") + (match_operand:SI 6 "const_int_operand" "")) + (set (match_operand:SI 3 "memory_operand" "") + (match_dup 0)) + (set (match_operand:SI 1 "s_register_operand" "") + (match_operand:SI 7 "const_int_operand" "")) + (set (match_operand:SI 4 "memory_operand" "") + (match_dup 1)) + (set (match_operand:SI 2 "s_register_operand" "") + (match_operand:SI 8 "const_int_operand" "")) + (set (match_operand:SI 5 "memory_operand" "") + (match_dup 2))] + "" + [(const_int 0)] +{ + if (gen_const_stm_seq (operands, 3)) + DONE; + else + FAIL; +}) + +(define_peephole2 + [(set (match_operand:SI 0 "s_register_operand" "") + (match_operand:SI 6 "const_int_operand" "")) + (set (match_operand:SI 1 "s_register_operand" "") + (match_operand:SI 7 "const_int_operand" "")) + (set (match_operand:SI 2 "s_register_operand" "") + (match_operand:SI 8 "const_int_operand" "")) + (set (match_operand:SI 3 "memory_operand" "") + (match_dup 0)) + (set (match_operand:SI 4 "memory_operand" "") + (match_dup 1)) + (set (match_operand:SI 5 "memory_operand" "") + (match_dup 2))] + "" + [(const_int 0)] +{ + if (gen_const_stm_seq (operands, 3)) + DONE; + else + FAIL; +}) + +(define_peephole2 + [(set (match_operand:SI 3 "memory_operand" "") + (match_operand:SI 0 "s_register_operand" "")) + (set (match_operand:SI 4 "memory_operand" "") + (match_operand:SI 1 "s_register_operand" "")) + (set (match_operand:SI 5 "memory_operand" "") + (match_operand:SI 2 "s_register_operand" ""))] + "" + [(const_int 0)] +{ + if (gen_stm_seq (operands, 3)) + DONE; + else + FAIL; +}) + +(define_insn "*ldm2_ia" + [(match_parallel 0 "load_multiple_operation" + [(set (match_operand:SI 1 "arm_hard_register_operand" "") + (mem:SI (match_operand:SI 3 "s_register_operand" "rk"))) + (set (match_operand:SI 2 "arm_hard_register_operand" "") + (mem:SI (plus:SI (match_dup 3) + (const_int 4))))])] + "TARGET_32BIT && XVECLEN (operands[0], 0) == 2" + "ldm%(ia%)\t%3, {%1, %2}" + [(set_attr "type" "load2") + (set_attr "predicable" "yes")]) + +(define_insn "*thumb_ldm2_ia" + [(match_parallel 0 "load_multiple_operation" + [(set (match_operand:SI 1 "arm_hard_register_operand" "") + (mem:SI (match_operand:SI 3 "s_register_operand" "l"))) + (set (match_operand:SI 2 "arm_hard_register_operand" "") + (mem:SI (plus:SI (match_dup 3) + (const_int 4))))])] + "TARGET_THUMB1 && XVECLEN (operands[0], 0) == 2" + "ldm%(ia%)\t%3, {%1, %2}" + [(set_attr "type" "load2")]) + +(define_insn "*ldm2_ia_update" + [(match_parallel 0 "load_multiple_operation" + [(set (match_operand:SI 3 "s_register_operand" "+&rk") + (plus:SI (match_dup 3) (const_int 8))) + (set (match_operand:SI 1 "arm_hard_register_operand" "") + (mem:SI (match_dup 3))) + (set (match_operand:SI 2 "arm_hard_register_operand" "") + (mem:SI (plus:SI (match_dup 3) + (const_int 4))))])] + "TARGET_32BIT && XVECLEN (operands[0], 0) == 3" + "ldm%(ia%)\t%3!, {%1, %2}" + [(set_attr "type" "load2") + (set_attr "predicable" "yes")]) + +(define_insn "*thumb_ldm2_ia_update" + [(match_parallel 0 "load_multiple_operation" + [(set (match_operand:SI 3 "s_register_operand" "+&l") + (plus:SI (match_dup 3) (const_int 8))) + (set (match_operand:SI 1 "arm_hard_register_operand" "") + (mem:SI (match_dup 3))) + (set (match_operand:SI 2 "arm_hard_register_operand" "") + (mem:SI (plus:SI (match_dup 3) + (const_int 4))))])] + "TARGET_THUMB1 && XVECLEN (operands[0], 0) == 3" + "ldm%(ia%)\t%3!, {%1, %2}" + [(set_attr "type" "load2")]) + +(define_insn "*stm2_ia" + [(match_parallel 0 "store_multiple_operation" + [(set (mem:SI (match_operand:SI 3 "s_register_operand" "rk")) + (match_operand:SI 1 "arm_hard_register_operand" "")) + (set (mem:SI (plus:SI (match_dup 3) (const_int 4))) + (match_operand:SI 2 "arm_hard_register_operand" ""))])] + "TARGET_32BIT && XVECLEN (operands[0], 0) == 2" + "stm%(ia%)\t%3, {%1, %2}" + [(set_attr "type" "store2") + (set_attr "predicable" "yes")]) + +(define_insn "*stm2_ia_update" + [(match_parallel 0 "store_multiple_operation" + [(set (match_operand:SI 3 "s_register_operand" "+&rk") + (plus:SI (match_dup 3) (const_int 8))) + (set (mem:SI (match_dup 3)) + (match_operand:SI 1 "arm_hard_register_operand" "")) + (set (mem:SI (plus:SI (match_dup 3) (const_int 4))) + (match_operand:SI 2 "arm_hard_register_operand" ""))])] + "TARGET_32BIT && XVECLEN (operands[0], 0) == 3" + "stm%(ia%)\t%3!, {%1, %2}" + [(set_attr "type" "store2") + (set_attr "predicable" "yes")]) + +(define_insn "*thumb_stm2_ia_update" + [(match_parallel 0 "store_multiple_operation" + [(set (match_operand:SI 3 "s_register_operand" "+&l") + (plus:SI (match_dup 3) (const_int 8))) + (set (mem:SI (match_dup 3)) + (match_operand:SI 1 "arm_hard_register_operand" "")) + (set (mem:SI (plus:SI (match_dup 3) (const_int 4))) + (match_operand:SI 2 "arm_hard_register_operand" ""))])] + "TARGET_THUMB1 && XVECLEN (operands[0], 0) == 3" + "stm%(ia%)\t%3!, {%1, %2}" + [(set_attr "type" "store2")]) + +(define_insn "*ldm2_ib" + [(match_parallel 0 "load_multiple_operation" + [(set (match_operand:SI 1 "arm_hard_register_operand" "") + (mem:SI (plus:SI (match_operand:SI 3 "s_register_operand" "rk") + (const_int 4)))) + (set (match_operand:SI 2 "arm_hard_register_operand" "") + (mem:SI (plus:SI (match_dup 3) + (const_int 8))))])] + "TARGET_ARM && XVECLEN (operands[0], 0) == 2" + "ldm%(ib%)\t%3, {%1, %2}" + [(set_attr "type" "load2") + (set_attr "predicable" "yes")]) + +(define_insn "*ldm2_ib_update" + [(match_parallel 0 "load_multiple_operation" + [(set (match_operand:SI 3 "s_register_operand" "+&rk") + (plus:SI (match_dup 3) (const_int 8))) + (set (match_operand:SI 1 "arm_hard_register_operand" "") + (mem:SI (plus:SI (match_dup 3) + (const_int 4)))) + (set (match_operand:SI 2 "arm_hard_register_operand" "") + (mem:SI (plus:SI (match_dup 3) + (const_int 8))))])] + "TARGET_ARM && XVECLEN (operands[0], 0) == 3" + "ldm%(ib%)\t%3!, {%1, %2}" + [(set_attr "type" "load2") + (set_attr "predicable" "yes")]) + +(define_insn "*stm2_ib" + [(match_parallel 0 "store_multiple_operation" + [(set (mem:SI (plus:SI (match_operand:SI 3 "s_register_operand" "rk") (const_int 4))) + (match_operand:SI 1 "arm_hard_register_operand" "")) + (set (mem:SI (plus:SI (match_dup 3) (const_int 8))) + (match_operand:SI 2 "arm_hard_register_operand" ""))])] + "TARGET_ARM && XVECLEN (operands[0], 0) == 2" + "stm%(ib%)\t%3, {%1, %2}" + [(set_attr "type" "store2") + (set_attr "predicable" "yes")]) + +(define_insn "*stm2_ib_update" + [(match_parallel 0 "store_multiple_operation" + [(set (match_operand:SI 3 "s_register_operand" "+&rk") + (plus:SI (match_dup 3) (const_int 8))) + (set (mem:SI (plus:SI (match_dup 3) (const_int 4))) + (match_operand:SI 1 "arm_hard_register_operand" "")) + (set (mem:SI (plus:SI (match_dup 3) (const_int 8))) + (match_operand:SI 2 "arm_hard_register_operand" ""))])] + "TARGET_ARM && XVECLEN (operands[0], 0) == 3" + "stm%(ib%)\t%3!, {%1, %2}" + [(set_attr "type" "store2") + (set_attr "predicable" "yes")]) + +(define_insn "*ldm2_da" + [(match_parallel 0 "load_multiple_operation" + [(set (match_operand:SI 1 "arm_hard_register_operand" "") + (mem:SI (plus:SI (match_operand:SI 3 "s_register_operand" "rk") + (const_int -4)))) + (set (match_operand:SI 2 "arm_hard_register_operand" "") + (mem:SI (match_dup 3)))])] + "TARGET_ARM && XVECLEN (operands[0], 0) == 2" + "ldm%(da%)\t%3, {%1, %2}" + [(set_attr "type" "load2") + (set_attr "predicable" "yes")]) + +(define_insn "*ldm2_da_update" + [(match_parallel 0 "load_multiple_operation" + [(set (match_operand:SI 3 "s_register_operand" "+&rk") + (plus:SI (match_dup 3) (const_int -8))) + (set (match_operand:SI 1 "arm_hard_register_operand" "") + (mem:SI (plus:SI (match_dup 3) + (const_int -4)))) + (set (match_operand:SI 2 "arm_hard_register_operand" "") + (mem:SI (match_dup 3)))])] + "TARGET_ARM && XVECLEN (operands[0], 0) == 3" + "ldm%(da%)\t%3!, {%1, %2}" + [(set_attr "type" "load2") + (set_attr "predicable" "yes")]) + +(define_insn "*stm2_da" + [(match_parallel 0 "store_multiple_operation" + [(set (mem:SI (plus:SI (match_operand:SI 3 "s_register_operand" "rk") (const_int -4))) + (match_operand:SI 1 "arm_hard_register_operand" "")) + (set (mem:SI (match_dup 3)) + (match_operand:SI 2 "arm_hard_register_operand" ""))])] + "TARGET_ARM && XVECLEN (operands[0], 0) == 2" + "stm%(da%)\t%3, {%1, %2}" + [(set_attr "type" "store2") + (set_attr "predicable" "yes")]) + +(define_insn "*stm2_da_update" + [(match_parallel 0 "store_multiple_operation" + [(set (match_operand:SI 3 "s_register_operand" "+&rk") + (plus:SI (match_dup 3) (const_int -8))) + (set (mem:SI (plus:SI (match_dup 3) (const_int -4))) + (match_operand:SI 1 "arm_hard_register_operand" "")) + (set (mem:SI (match_dup 3)) + (match_operand:SI 2 "arm_hard_register_operand" ""))])] + "TARGET_ARM && XVECLEN (operands[0], 0) == 3" + "stm%(da%)\t%3!, {%1, %2}" + [(set_attr "type" "store2") + (set_attr "predicable" "yes")]) + +(define_insn "*ldm2_db" + [(match_parallel 0 "load_multiple_operation" + [(set (match_operand:SI 1 "arm_hard_register_operand" "") + (mem:SI (plus:SI (match_operand:SI 3 "s_register_operand" "rk") + (const_int -8)))) + (set (match_operand:SI 2 "arm_hard_register_operand" "") + (mem:SI (plus:SI (match_dup 3) + (const_int -4))))])] + "TARGET_32BIT && XVECLEN (operands[0], 0) == 2" + "ldm%(db%)\t%3, {%1, %2}" + [(set_attr "type" "load2") + (set_attr "predicable" "yes")]) + +(define_insn "*ldm2_db_update" + [(match_parallel 0 "load_multiple_operation" + [(set (match_operand:SI 3 "s_register_operand" "+&rk") + (plus:SI (match_dup 3) (const_int -8))) + (set (match_operand:SI 1 "arm_hard_register_operand" "") + (mem:SI (plus:SI (match_dup 3) + (const_int -8)))) + (set (match_operand:SI 2 "arm_hard_register_operand" "") + (mem:SI (plus:SI (match_dup 3) + (const_int -4))))])] + "TARGET_32BIT && XVECLEN (operands[0], 0) == 3" + "ldm%(db%)\t%3!, {%1, %2}" + [(set_attr "type" "load2") + (set_attr "predicable" "yes")]) + +(define_insn "*stm2_db" + [(match_parallel 0 "store_multiple_operation" + [(set (mem:SI (plus:SI (match_operand:SI 3 "s_register_operand" "rk") (const_int -8))) + (match_operand:SI 1 "arm_hard_register_operand" "")) + (set (mem:SI (plus:SI (match_dup 3) (const_int -4))) + (match_operand:SI 2 "arm_hard_register_operand" ""))])] + "TARGET_32BIT && XVECLEN (operands[0], 0) == 2" + "stm%(db%)\t%3, {%1, %2}" + [(set_attr "type" "store2") + (set_attr "predicable" "yes")]) + +(define_insn "*stm2_db_update" + [(match_parallel 0 "store_multiple_operation" + [(set (match_operand:SI 3 "s_register_operand" "+&rk") + (plus:SI (match_dup 3) (const_int -8))) + (set (mem:SI (plus:SI (match_dup 3) (const_int -8))) + (match_operand:SI 1 "arm_hard_register_operand" "")) + (set (mem:SI (plus:SI (match_dup 3) (const_int -4))) + (match_operand:SI 2 "arm_hard_register_operand" ""))])] + "TARGET_32BIT && XVECLEN (operands[0], 0) == 3" + "stm%(db%)\t%3!, {%1, %2}" + [(set_attr "type" "store2") + (set_attr "predicable" "yes")]) + +(define_peephole2 + [(set (match_operand:SI 0 "s_register_operand" "") + (match_operand:SI 2 "memory_operand" "")) + (set (match_operand:SI 1 "s_register_operand" "") + (match_operand:SI 3 "memory_operand" ""))] + "" + [(const_int 0)] +{ + if (gen_ldm_seq (operands, 2, false)) + DONE; + else + FAIL; +}) + +(define_peephole2 + [(set (match_operand:SI 0 "s_register_operand" "") + (match_operand:SI 4 "const_int_operand" "")) + (set (match_operand:SI 2 "memory_operand" "") + (match_dup 0)) + (set (match_operand:SI 1 "s_register_operand" "") + (match_operand:SI 5 "const_int_operand" "")) + (set (match_operand:SI 3 "memory_operand" "") + (match_dup 1))] + "" + [(const_int 0)] +{ + if (gen_const_stm_seq (operands, 2)) + DONE; + else + FAIL; +}) + +(define_peephole2 + [(set (match_operand:SI 0 "s_register_operand" "") + (match_operand:SI 4 "const_int_operand" "")) + (set (match_operand:SI 1 "s_register_operand" "") + (match_operand:SI 5 "const_int_operand" "")) + (set (match_operand:SI 2 "memory_operand" "") + (match_dup 0)) + (set (match_operand:SI 3 "memory_operand" "") + (match_dup 1))] + "" + [(const_int 0)] +{ + if (gen_const_stm_seq (operands, 2)) + DONE; + else + FAIL; +}) + +(define_peephole2 + [(set (match_operand:SI 2 "memory_operand" "") + (match_operand:SI 0 "s_register_operand" "")) + (set (match_operand:SI 3 "memory_operand" "") + (match_operand:SI 1 "s_register_operand" ""))] + "" + [(const_int 0)] +{ + if (gen_stm_seq (operands, 2)) + DONE; + else + FAIL; +}) + +(define_peephole2 + [(set (match_operand:SI 0 "s_register_operand" "") + (match_operand:SI 2 "memory_operand" "")) + (set (match_operand:SI 1 "s_register_operand" "") + (match_operand:SI 3 "memory_operand" "")) + (parallel + [(set (match_operand:SI 4 "s_register_operand" "") + (match_operator:SI 5 "commutative_binary_operator" + [(match_operand:SI 6 "s_register_operand" "") + (match_operand:SI 7 "s_register_operand" "")])) + (clobber (reg:CC CC_REGNUM))])] + "(((operands[6] == operands[0] && operands[7] == operands[1]) + || (operands[7] == operands[0] && operands[6] == operands[1])) + && peep2_reg_dead_p (3, operands[0]) && peep2_reg_dead_p (3, operands[1]))" + [(parallel + [(set (match_dup 4) (match_op_dup 5 [(match_dup 6) (match_dup 7)])) + (clobber (reg:CC CC_REGNUM))])] +{ + if (!gen_ldm_seq (operands, 2, true)) + FAIL; +}) + +(define_peephole2 + [(set (match_operand:SI 0 "s_register_operand" "") + (match_operand:SI 2 "memory_operand" "")) + (set (match_operand:SI 1 "s_register_operand" "") + (match_operand:SI 3 "memory_operand" "")) + (set (match_operand:SI 4 "s_register_operand" "") + (match_operator:SI 5 "commutative_binary_operator" + [(match_operand:SI 6 "s_register_operand" "") + (match_operand:SI 7 "s_register_operand" "")]))] + "(((operands[6] == operands[0] && operands[7] == operands[1]) + || (operands[7] == operands[0] && operands[6] == operands[1])) + && peep2_reg_dead_p (3, operands[0]) && peep2_reg_dead_p (3, operands[1]))" + [(set (match_dup 4) (match_op_dup 5 [(match_dup 6) (match_dup 7)]))] +{ + if (!gen_ldm_seq (operands, 2, true)) + FAIL; +}) + diff --git a/gcc/config/arm/lib1funcs.asm b/gcc/config/arm/lib1funcs.asm new file mode 100644 index 000000000..2e76c01df --- /dev/null +++ b/gcc/config/arm/lib1funcs.asm @@ -0,0 +1,1829 @@ +@ libgcc routines for ARM cpu. +@ Division routines, written by Richard Earnshaw, (rearnsha@armltd.co.uk) + +/* Copyright 1995, 1996, 1998, 1999, 2000, 2003, 2004, 2005, 2007, 2008, + 2009, 2010 Free Software Foundation, Inc. + +This file is free software; you can redistribute it and/or modify it +under the terms of the GNU General Public License as published by the +Free Software Foundation; either version 3, or (at your option) any +later version. + +This file is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +General Public License for more details. + +Under Section 7 of GPL version 3, you are granted additional +permissions described in the GCC Runtime Library Exception, version +3.1, as published by the Free Software Foundation. + +You should have received a copy of the GNU General Public License and +a copy of the GCC Runtime Library Exception along with this program; +see the files COPYING3 and COPYING.RUNTIME respectively. If not, see +. */ + +/* An executable stack is *not* required for these functions. */ +#if defined(__ELF__) && defined(__linux__) +.section .note.GNU-stack,"",%progbits +.previous +#endif /* __ELF__ and __linux__ */ + +#ifdef __ARM_EABI__ +/* Some attributes that are common to all routines in this file. */ + /* Tag_ABI_align_needed: This code does not require 8-byte + alignment from the caller. */ + /* .eabi_attribute 24, 0 -- default setting. */ + /* Tag_ABI_align_preserved: This code preserves 8-byte + alignment in any callee. */ + .eabi_attribute 25, 1 +#endif /* __ARM_EABI__ */ +/* ------------------------------------------------------------------------ */ + +/* We need to know what prefix to add to function names. */ + +#ifndef __USER_LABEL_PREFIX__ +#error __USER_LABEL_PREFIX__ not defined +#endif + +/* ANSI concatenation macros. */ + +#define CONCAT1(a, b) CONCAT2(a, b) +#define CONCAT2(a, b) a ## b + +/* Use the right prefix for global labels. */ + +#define SYM(x) CONCAT1 (__USER_LABEL_PREFIX__, x) + +#ifdef __ELF__ +#ifdef __thumb__ +#define __PLT__ /* Not supported in Thumb assembler (for now). */ +#elif defined __vxworks && !defined __PIC__ +#define __PLT__ /* Not supported by the kernel loader. */ +#else +#define __PLT__ (PLT) +#endif +#define TYPE(x) .type SYM(x),function +#define SIZE(x) .size SYM(x), . - SYM(x) +#define LSYM(x) .x +#else +#define __PLT__ +#define TYPE(x) +#define SIZE(x) +#define LSYM(x) x +#endif + +/* Function end macros. Variants for interworking. */ + +#if defined(__ARM_ARCH_2__) +# define __ARM_ARCH__ 2 +#endif + +#if defined(__ARM_ARCH_3__) +# define __ARM_ARCH__ 3 +#endif + +#if defined(__ARM_ARCH_3M__) || defined(__ARM_ARCH_4__) \ + || defined(__ARM_ARCH_4T__) +/* We use __ARM_ARCH__ set to 4 here, but in reality it's any processor with + long multiply instructions. That includes v3M. */ +# define __ARM_ARCH__ 4 +#endif + +#if defined(__ARM_ARCH_5__) || defined(__ARM_ARCH_5T__) \ + || defined(__ARM_ARCH_5E__) || defined(__ARM_ARCH_5TE__) \ + || defined(__ARM_ARCH_5TEJ__) +# define __ARM_ARCH__ 5 +#endif + +#if defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) \ + || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) \ + || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) \ + || defined(__ARM_ARCH_6M__) +# define __ARM_ARCH__ 6 +#endif + +#if defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) \ + || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) \ + || defined(__ARM_ARCH_7EM__) +# define __ARM_ARCH__ 7 +#endif + +#ifndef __ARM_ARCH__ +#error Unable to determine architecture. +#endif + +/* There are times when we might prefer Thumb1 code even if ARM code is + permitted, for example, the code might be smaller, or there might be + interworking problems with switching to ARM state if interworking is + disabled. */ +#if (defined(__thumb__) \ + && !defined(__thumb2__) \ + && (!defined(__THUMB_INTERWORK__) \ + || defined (__OPTIMIZE_SIZE__) \ + || defined(__ARM_ARCH_6M__))) +# define __prefer_thumb__ +#endif + +/* How to return from a function call depends on the architecture variant. */ + +#if (__ARM_ARCH__ > 4) || defined(__ARM_ARCH_4T__) + +# define RET bx lr +# define RETc(x) bx##x lr + +/* Special precautions for interworking on armv4t. */ +# if (__ARM_ARCH__ == 4) + +/* Always use bx, not ldr pc. */ +# if (defined(__thumb__) || defined(__THUMB_INTERWORK__)) +# define __INTERWORKING__ +# endif /* __THUMB__ || __THUMB_INTERWORK__ */ + +/* Include thumb stub before arm mode code. */ +# if defined(__thumb__) && !defined(__THUMB_INTERWORK__) +# define __INTERWORKING_STUBS__ +# endif /* __thumb__ && !__THUMB_INTERWORK__ */ + +#endif /* __ARM_ARCH == 4 */ + +#else + +# define RET mov pc, lr +# define RETc(x) mov##x pc, lr + +#endif + +.macro cfi_pop advance, reg, cfa_offset +#ifdef __ELF__ + .pushsection .debug_frame + .byte 0x4 /* DW_CFA_advance_loc4 */ + .4byte \advance + .byte (0xc0 | \reg) /* DW_CFA_restore */ + .byte 0xe /* DW_CFA_def_cfa_offset */ + .uleb128 \cfa_offset + .popsection +#endif +.endm +.macro cfi_push advance, reg, offset, cfa_offset +#ifdef __ELF__ + .pushsection .debug_frame + .byte 0x4 /* DW_CFA_advance_loc4 */ + .4byte \advance + .byte (0x80 | \reg) /* DW_CFA_offset */ + .uleb128 (\offset / -4) + .byte 0xe /* DW_CFA_def_cfa_offset */ + .uleb128 \cfa_offset + .popsection +#endif +.endm +.macro cfi_start start_label, end_label +#ifdef __ELF__ + .pushsection .debug_frame +LSYM(Lstart_frame): + .4byte LSYM(Lend_cie) - LSYM(Lstart_cie) @ Length of CIE +LSYM(Lstart_cie): + .4byte 0xffffffff @ CIE Identifier Tag + .byte 0x1 @ CIE Version + .ascii "\0" @ CIE Augmentation + .uleb128 0x1 @ CIE Code Alignment Factor + .sleb128 -4 @ CIE Data Alignment Factor + .byte 0xe @ CIE RA Column + .byte 0xc @ DW_CFA_def_cfa + .uleb128 0xd + .uleb128 0x0 + + .align 2 +LSYM(Lend_cie): + .4byte LSYM(Lend_fde)-LSYM(Lstart_fde) @ FDE Length +LSYM(Lstart_fde): + .4byte LSYM(Lstart_frame) @ FDE CIE offset + .4byte \start_label @ FDE initial location + .4byte \end_label-\start_label @ FDE address range + .popsection +#endif +.endm +.macro cfi_end end_label +#ifdef __ELF__ + .pushsection .debug_frame + .align 2 +LSYM(Lend_fde): + .popsection +\end_label: +#endif +.endm + +/* Don't pass dirn, it's there just to get token pasting right. */ + +.macro RETLDM regs=, cond=, unwind=, dirn=ia +#if defined (__INTERWORKING__) + .ifc "\regs","" + ldr\cond lr, [sp], #8 + .else +# if defined(__thumb2__) + pop\cond {\regs, lr} +# else + ldm\cond\dirn sp!, {\regs, lr} +# endif + .endif + .ifnc "\unwind", "" + /* Mark LR as restored. */ +97: cfi_pop 97b - \unwind, 0xe, 0x0 + .endif + bx\cond lr +#else + /* Caller is responsible for providing IT instruction. */ + .ifc "\regs","" + ldr\cond pc, [sp], #8 + .else +# if defined(__thumb2__) + pop\cond {\regs, pc} +# else + ldm\cond\dirn sp!, {\regs, pc} +# endif + .endif +#endif +.endm + +/* The Unified assembly syntax allows the same code to be assembled for both + ARM and Thumb-2. However this is only supported by recent gas, so define + a set of macros to allow ARM code on older assemblers. */ +#if defined(__thumb2__) +.macro do_it cond, suffix="" + it\suffix \cond +.endm +.macro shift1 op, arg0, arg1, arg2 + \op \arg0, \arg1, \arg2 +.endm +#define do_push push +#define do_pop pop +#define COND(op1, op2, cond) op1 ## op2 ## cond +/* Perform an arithmetic operation with a variable shift operand. This + requires two instructions and a scratch register on Thumb-2. */ +.macro shiftop name, dest, src1, src2, shiftop, shiftreg, tmp + \shiftop \tmp, \src2, \shiftreg + \name \dest, \src1, \tmp +.endm +#else +.macro do_it cond, suffix="" +.endm +.macro shift1 op, arg0, arg1, arg2 + mov \arg0, \arg1, \op \arg2 +.endm +#define do_push stmfd sp!, +#define do_pop ldmfd sp!, +#define COND(op1, op2, cond) op1 ## cond ## op2 +.macro shiftop name, dest, src1, src2, shiftop, shiftreg, tmp + \name \dest, \src1, \src2, \shiftop \shiftreg +.endm +#endif + +#ifdef __ARM_EABI__ +.macro ARM_LDIV0 name signed + cmp r0, #0 + .ifc \signed, unsigned + movne r0, #0xffffffff + .else + movgt r0, #0x7fffffff + movlt r0, #0x80000000 + .endif + b SYM (__aeabi_idiv0) __PLT__ +.endm +#else +.macro ARM_LDIV0 name signed + str lr, [sp, #-8]! +98: cfi_push 98b - __\name, 0xe, -0x8, 0x8 + bl SYM (__div0) __PLT__ + mov r0, #0 @ About as wrong as it could be. + RETLDM unwind=98b +.endm +#endif + + +#ifdef __ARM_EABI__ +.macro THUMB_LDIV0 name signed +#if defined(__ARM_ARCH_6M__) + .ifc \signed, unsigned + cmp r0, #0 + beq 1f + mov r0, #0 + mvn r0, r0 @ 0xffffffff +1: + .else + cmp r0, #0 + beq 2f + blt 3f + mov r0, #0 + mvn r0, r0 + lsr r0, r0, #1 @ 0x7fffffff + b 2f +3: mov r0, #0x80 + lsl r0, r0, #24 @ 0x80000000 +2: + .endif + push {r0, r1, r2} + ldr r0, 4f + adr r1, 4f + add r0, r1 + str r0, [sp, #8] + @ We know we are not on armv4t, so pop pc is safe. + pop {r0, r1, pc} + .align 2 +4: + .word __aeabi_idiv0 - 4b +#elif defined(__thumb2__) + .syntax unified + .ifc \signed, unsigned + cbz r0, 1f + mov r0, #0xffffffff +1: + .else + cmp r0, #0 + do_it gt + movgt r0, #0x7fffffff + do_it lt + movlt r0, #0x80000000 + .endif + b.w SYM(__aeabi_idiv0) __PLT__ +#else + .align 2 + bx pc + nop + .arm + cmp r0, #0 + .ifc \signed, unsigned + movne r0, #0xffffffff + .else + movgt r0, #0x7fffffff + movlt r0, #0x80000000 + .endif + b SYM(__aeabi_idiv0) __PLT__ + .thumb +#endif +.endm +#else +.macro THUMB_LDIV0 name signed + push { r1, lr } +98: cfi_push 98b - __\name, 0xe, -0x4, 0x8 + bl SYM (__div0) + mov r0, #0 @ About as wrong as it could be. +#if defined (__INTERWORKING__) + pop { r1, r2 } + bx r2 +#else + pop { r1, pc } +#endif +.endm +#endif + +.macro FUNC_END name + SIZE (__\name) +.endm + +.macro DIV_FUNC_END name signed + cfi_start __\name, LSYM(Lend_div0) +LSYM(Ldiv0): +#ifdef __thumb__ + THUMB_LDIV0 \name \signed +#else + ARM_LDIV0 \name \signed +#endif + cfi_end LSYM(Lend_div0) + FUNC_END \name +.endm + +.macro THUMB_FUNC_START name + .globl SYM (\name) + TYPE (\name) + .thumb_func +SYM (\name): +.endm + +/* Function start macros. Variants for ARM and Thumb. */ + +#ifdef __thumb__ +#define THUMB_FUNC .thumb_func +#define THUMB_CODE .force_thumb +# if defined(__thumb2__) +#define THUMB_SYNTAX .syntax divided +# else +#define THUMB_SYNTAX +# endif +#else +#define THUMB_FUNC +#define THUMB_CODE +#define THUMB_SYNTAX +#endif + +.macro FUNC_START name + .text + .globl SYM (__\name) + TYPE (__\name) + .align 0 + THUMB_CODE + THUMB_FUNC + THUMB_SYNTAX +SYM (__\name): +.endm + +/* Special function that will always be coded in ARM assembly, even if + in Thumb-only compilation. */ + +#if defined(__thumb2__) + +/* For Thumb-2 we build everything in thumb mode. */ +.macro ARM_FUNC_START name + FUNC_START \name + .syntax unified +.endm +#define EQUIV .thumb_set +.macro ARM_CALL name + bl __\name +.endm + +#elif defined(__INTERWORKING_STUBS__) + +.macro ARM_FUNC_START name + FUNC_START \name + bx pc + nop + .arm +/* A hook to tell gdb that we've switched to ARM mode. Also used to call + directly from other local arm routines. */ +_L__\name: +.endm +#define EQUIV .thumb_set +/* Branch directly to a function declared with ARM_FUNC_START. + Must be called in arm mode. */ +.macro ARM_CALL name + bl _L__\name +.endm + +#else /* !(__INTERWORKING_STUBS__ || __thumb2__) */ + +#ifdef __ARM_ARCH_6M__ +#define EQUIV .thumb_set +#else +.macro ARM_FUNC_START name + .text + .globl SYM (__\name) + TYPE (__\name) + .align 0 + .arm +SYM (__\name): +.endm +#define EQUIV .set +.macro ARM_CALL name + bl __\name +.endm +#endif + +#endif + +.macro FUNC_ALIAS new old + .globl SYM (__\new) +#if defined (__thumb__) + .thumb_set SYM (__\new), SYM (__\old) +#else + .set SYM (__\new), SYM (__\old) +#endif +.endm + +#ifndef __ARM_ARCH_6M__ +.macro ARM_FUNC_ALIAS new old + .globl SYM (__\new) + EQUIV SYM (__\new), SYM (__\old) +#if defined(__INTERWORKING_STUBS__) + .set SYM (_L__\new), SYM (_L__\old) +#endif +.endm +#endif + +#ifdef __ARMEB__ +#define xxh r0 +#define xxl r1 +#define yyh r2 +#define yyl r3 +#else +#define xxh r1 +#define xxl r0 +#define yyh r3 +#define yyl r2 +#endif + +#ifdef __ARM_EABI__ +.macro WEAK name + .weak SYM (__\name) +.endm +#endif + +#ifdef __thumb__ +/* Register aliases. */ + +work .req r4 @ XXXX is this safe ? +dividend .req r0 +divisor .req r1 +overdone .req r2 +result .req r2 +curbit .req r3 +#endif +#if 0 +ip .req r12 +sp .req r13 +lr .req r14 +pc .req r15 +#endif + +/* ------------------------------------------------------------------------ */ +/* Bodies of the division and modulo routines. */ +/* ------------------------------------------------------------------------ */ +.macro ARM_DIV_BODY dividend, divisor, result, curbit + +#if __ARM_ARCH__ >= 5 && ! defined (__OPTIMIZE_SIZE__) + +#if defined (__thumb2__) + clz \curbit, \dividend + clz \result, \divisor + sub \curbit, \result, \curbit + rsb \curbit, \curbit, #31 + adr \result, 1f + add \curbit, \result, \curbit, lsl #4 + mov \result, #0 + mov pc, \curbit +.p2align 3 +1: + .set shift, 32 + .rept 32 + .set shift, shift - 1 + cmp.w \dividend, \divisor, lsl #shift + nop.n + adc.w \result, \result, \result + it cs + subcs.w \dividend, \dividend, \divisor, lsl #shift + .endr +#else + clz \curbit, \dividend + clz \result, \divisor + sub \curbit, \result, \curbit + rsbs \curbit, \curbit, #31 + addne \curbit, \curbit, \curbit, lsl #1 + mov \result, #0 + addne pc, pc, \curbit, lsl #2 + nop + .set shift, 32 + .rept 32 + .set shift, shift - 1 + cmp \dividend, \divisor, lsl #shift + adc \result, \result, \result + subcs \dividend, \dividend, \divisor, lsl #shift + .endr +#endif + +#else /* __ARM_ARCH__ < 5 || defined (__OPTIMIZE_SIZE__) */ +#if __ARM_ARCH__ >= 5 + + clz \curbit, \divisor + clz \result, \dividend + sub \result, \curbit, \result + mov \curbit, #1 + mov \divisor, \divisor, lsl \result + mov \curbit, \curbit, lsl \result + mov \result, #0 + +#else /* __ARM_ARCH__ < 5 */ + + @ Initially shift the divisor left 3 bits if possible, + @ set curbit accordingly. This allows for curbit to be located + @ at the left end of each 4-bit nibbles in the division loop + @ to save one loop in most cases. + tst \divisor, #0xe0000000 + moveq \divisor, \divisor, lsl #3 + moveq \curbit, #8 + movne \curbit, #1 + + @ Unless the divisor is very big, shift it up in multiples of + @ four bits, since this is the amount of unwinding in the main + @ division loop. Continue shifting until the divisor is + @ larger than the dividend. +1: cmp \divisor, #0x10000000 + cmplo \divisor, \dividend + movlo \divisor, \divisor, lsl #4 + movlo \curbit, \curbit, lsl #4 + blo 1b + + @ For very big divisors, we must shift it a bit at a time, or + @ we will be in danger of overflowing. +1: cmp \divisor, #0x80000000 + cmplo \divisor, \dividend + movlo \divisor, \divisor, lsl #1 + movlo \curbit, \curbit, lsl #1 + blo 1b + + mov \result, #0 + +#endif /* __ARM_ARCH__ < 5 */ + + @ Division loop +1: cmp \dividend, \divisor + do_it hs, t + subhs \dividend, \dividend, \divisor + orrhs \result, \result, \curbit + cmp \dividend, \divisor, lsr #1 + do_it hs, t + subhs \dividend, \dividend, \divisor, lsr #1 + orrhs \result, \result, \curbit, lsr #1 + cmp \dividend, \divisor, lsr #2 + do_it hs, t + subhs \dividend, \dividend, \divisor, lsr #2 + orrhs \result, \result, \curbit, lsr #2 + cmp \dividend, \divisor, lsr #3 + do_it hs, t + subhs \dividend, \dividend, \divisor, lsr #3 + orrhs \result, \result, \curbit, lsr #3 + cmp \dividend, #0 @ Early termination? + do_it ne, t + movnes \curbit, \curbit, lsr #4 @ No, any more bits to do? + movne \divisor, \divisor, lsr #4 + bne 1b + +#endif /* __ARM_ARCH__ < 5 || defined (__OPTIMIZE_SIZE__) */ + +.endm +/* ------------------------------------------------------------------------ */ +.macro ARM_DIV2_ORDER divisor, order + +#if __ARM_ARCH__ >= 5 + + clz \order, \divisor + rsb \order, \order, #31 + +#else + + cmp \divisor, #(1 << 16) + movhs \divisor, \divisor, lsr #16 + movhs \order, #16 + movlo \order, #0 + + cmp \divisor, #(1 << 8) + movhs \divisor, \divisor, lsr #8 + addhs \order, \order, #8 + + cmp \divisor, #(1 << 4) + movhs \divisor, \divisor, lsr #4 + addhs \order, \order, #4 + + cmp \divisor, #(1 << 2) + addhi \order, \order, #3 + addls \order, \order, \divisor, lsr #1 + +#endif + +.endm +/* ------------------------------------------------------------------------ */ +.macro ARM_MOD_BODY dividend, divisor, order, spare + +#if __ARM_ARCH__ >= 5 && ! defined (__OPTIMIZE_SIZE__) + + clz \order, \divisor + clz \spare, \dividend + sub \order, \order, \spare + rsbs \order, \order, #31 + addne pc, pc, \order, lsl #3 + nop + .set shift, 32 + .rept 32 + .set shift, shift - 1 + cmp \dividend, \divisor, lsl #shift + subcs \dividend, \dividend, \divisor, lsl #shift + .endr + +#else /* __ARM_ARCH__ < 5 || defined (__OPTIMIZE_SIZE__) */ +#if __ARM_ARCH__ >= 5 + + clz \order, \divisor + clz \spare, \dividend + sub \order, \order, \spare + mov \divisor, \divisor, lsl \order + +#else /* __ARM_ARCH__ < 5 */ + + mov \order, #0 + + @ Unless the divisor is very big, shift it up in multiples of + @ four bits, since this is the amount of unwinding in the main + @ division loop. Continue shifting until the divisor is + @ larger than the dividend. +1: cmp \divisor, #0x10000000 + cmplo \divisor, \dividend + movlo \divisor, \divisor, lsl #4 + addlo \order, \order, #4 + blo 1b + + @ For very big divisors, we must shift it a bit at a time, or + @ we will be in danger of overflowing. +1: cmp \divisor, #0x80000000 + cmplo \divisor, \dividend + movlo \divisor, \divisor, lsl #1 + addlo \order, \order, #1 + blo 1b + +#endif /* __ARM_ARCH__ < 5 */ + + @ Perform all needed substractions to keep only the reminder. + @ Do comparisons in batch of 4 first. + subs \order, \order, #3 @ yes, 3 is intended here + blt 2f + +1: cmp \dividend, \divisor + subhs \dividend, \dividend, \divisor + cmp \dividend, \divisor, lsr #1 + subhs \dividend, \dividend, \divisor, lsr #1 + cmp \dividend, \divisor, lsr #2 + subhs \dividend, \dividend, \divisor, lsr #2 + cmp \dividend, \divisor, lsr #3 + subhs \dividend, \dividend, \divisor, lsr #3 + cmp \dividend, #1 + mov \divisor, \divisor, lsr #4 + subges \order, \order, #4 + bge 1b + + tst \order, #3 + teqne \dividend, #0 + beq 5f + + @ Either 1, 2 or 3 comparison/substractions are left. +2: cmn \order, #2 + blt 4f + beq 3f + cmp \dividend, \divisor + subhs \dividend, \dividend, \divisor + mov \divisor, \divisor, lsr #1 +3: cmp \dividend, \divisor + subhs \dividend, \dividend, \divisor + mov \divisor, \divisor, lsr #1 +4: cmp \dividend, \divisor + subhs \dividend, \dividend, \divisor +5: + +#endif /* __ARM_ARCH__ < 5 || defined (__OPTIMIZE_SIZE__) */ + +.endm +/* ------------------------------------------------------------------------ */ +.macro THUMB_DIV_MOD_BODY modulo + @ Load the constant 0x10000000 into our work register. + mov work, #1 + lsl work, #28 +LSYM(Loop1): + @ Unless the divisor is very big, shift it up in multiples of + @ four bits, since this is the amount of unwinding in the main + @ division loop. Continue shifting until the divisor is + @ larger than the dividend. + cmp divisor, work + bhs LSYM(Lbignum) + cmp divisor, dividend + bhs LSYM(Lbignum) + lsl divisor, #4 + lsl curbit, #4 + b LSYM(Loop1) +LSYM(Lbignum): + @ Set work to 0x80000000 + lsl work, #3 +LSYM(Loop2): + @ For very big divisors, we must shift it a bit at a time, or + @ we will be in danger of overflowing. + cmp divisor, work + bhs LSYM(Loop3) + cmp divisor, dividend + bhs LSYM(Loop3) + lsl divisor, #1 + lsl curbit, #1 + b LSYM(Loop2) +LSYM(Loop3): + @ Test for possible subtractions ... + .if \modulo + @ ... On the final pass, this may subtract too much from the dividend, + @ so keep track of which subtractions are done, we can fix them up + @ afterwards. + mov overdone, #0 + cmp dividend, divisor + blo LSYM(Lover1) + sub dividend, dividend, divisor +LSYM(Lover1): + lsr work, divisor, #1 + cmp dividend, work + blo LSYM(Lover2) + sub dividend, dividend, work + mov ip, curbit + mov work, #1 + ror curbit, work + orr overdone, curbit + mov curbit, ip +LSYM(Lover2): + lsr work, divisor, #2 + cmp dividend, work + blo LSYM(Lover3) + sub dividend, dividend, work + mov ip, curbit + mov work, #2 + ror curbit, work + orr overdone, curbit + mov curbit, ip +LSYM(Lover3): + lsr work, divisor, #3 + cmp dividend, work + blo LSYM(Lover4) + sub dividend, dividend, work + mov ip, curbit + mov work, #3 + ror curbit, work + orr overdone, curbit + mov curbit, ip +LSYM(Lover4): + mov ip, curbit + .else + @ ... and note which bits are done in the result. On the final pass, + @ this may subtract too much from the dividend, but the result will be ok, + @ since the "bit" will have been shifted out at the bottom. + cmp dividend, divisor + blo LSYM(Lover1) + sub dividend, dividend, divisor + orr result, result, curbit +LSYM(Lover1): + lsr work, divisor, #1 + cmp dividend, work + blo LSYM(Lover2) + sub dividend, dividend, work + lsr work, curbit, #1 + orr result, work +LSYM(Lover2): + lsr work, divisor, #2 + cmp dividend, work + blo LSYM(Lover3) + sub dividend, dividend, work + lsr work, curbit, #2 + orr result, work +LSYM(Lover3): + lsr work, divisor, #3 + cmp dividend, work + blo LSYM(Lover4) + sub dividend, dividend, work + lsr work, curbit, #3 + orr result, work +LSYM(Lover4): + .endif + + cmp dividend, #0 @ Early termination? + beq LSYM(Lover5) + lsr curbit, #4 @ No, any more bits to do? + beq LSYM(Lover5) + lsr divisor, #4 + b LSYM(Loop3) +LSYM(Lover5): + .if \modulo + @ Any subtractions that we should not have done will be recorded in + @ the top three bits of "overdone". Exactly which were not needed + @ are governed by the position of the bit, stored in ip. + mov work, #0xe + lsl work, #28 + and overdone, work + beq LSYM(Lgot_result) + + @ If we terminated early, because dividend became zero, then the + @ bit in ip will not be in the bottom nibble, and we should not + @ perform the additions below. We must test for this though + @ (rather relying upon the TSTs to prevent the additions) since + @ the bit in ip could be in the top two bits which might then match + @ with one of the smaller RORs. + mov curbit, ip + mov work, #0x7 + tst curbit, work + beq LSYM(Lgot_result) + + mov curbit, ip + mov work, #3 + ror curbit, work + tst overdone, curbit + beq LSYM(Lover6) + lsr work, divisor, #3 + add dividend, work +LSYM(Lover6): + mov curbit, ip + mov work, #2 + ror curbit, work + tst overdone, curbit + beq LSYM(Lover7) + lsr work, divisor, #2 + add dividend, work +LSYM(Lover7): + mov curbit, ip + mov work, #1 + ror curbit, work + tst overdone, curbit + beq LSYM(Lgot_result) + lsr work, divisor, #1 + add dividend, work + .endif +LSYM(Lgot_result): +.endm +/* ------------------------------------------------------------------------ */ +/* Start of the Real Functions */ +/* ------------------------------------------------------------------------ */ +#ifdef L_udivsi3 + +#if defined(__prefer_thumb__) + + FUNC_START udivsi3 + FUNC_ALIAS aeabi_uidiv udivsi3 + + cmp divisor, #0 + beq LSYM(Ldiv0) +LSYM(udivsi3_skip_div0_test): + mov curbit, #1 + mov result, #0 + + push { work } + cmp dividend, divisor + blo LSYM(Lgot_result) + + THUMB_DIV_MOD_BODY 0 + + mov r0, result + pop { work } + RET + +#else /* ARM version/Thumb-2. */ + + ARM_FUNC_START udivsi3 + ARM_FUNC_ALIAS aeabi_uidiv udivsi3 + + /* Note: if called via udivsi3_skip_div0_test, this will unnecessarily + check for division-by-zero a second time. */ +LSYM(udivsi3_skip_div0_test): + subs r2, r1, #1 + do_it eq + RETc(eq) + bcc LSYM(Ldiv0) + cmp r0, r1 + bls 11f + tst r1, r2 + beq 12f + + ARM_DIV_BODY r0, r1, r2, r3 + + mov r0, r2 + RET + +11: do_it eq, e + moveq r0, #1 + movne r0, #0 + RET + +12: ARM_DIV2_ORDER r1, r2 + + mov r0, r0, lsr r2 + RET + +#endif /* ARM version */ + + DIV_FUNC_END udivsi3 unsigned + +#if defined(__prefer_thumb__) +FUNC_START aeabi_uidivmod + cmp r1, #0 + beq LSYM(Ldiv0) + push {r0, r1, lr} + bl LSYM(udivsi3_skip_div0_test) + POP {r1, r2, r3} + mul r2, r0 + sub r1, r1, r2 + bx r3 +#else +ARM_FUNC_START aeabi_uidivmod + cmp r1, #0 + beq LSYM(Ldiv0) + stmfd sp!, { r0, r1, lr } + bl LSYM(udivsi3_skip_div0_test) + ldmfd sp!, { r1, r2, lr } + mul r3, r2, r0 + sub r1, r1, r3 + RET +#endif + FUNC_END aeabi_uidivmod + +#endif /* L_udivsi3 */ +/* ------------------------------------------------------------------------ */ +#ifdef L_umodsi3 + + FUNC_START umodsi3 + +#ifdef __thumb__ + + cmp divisor, #0 + beq LSYM(Ldiv0) + mov curbit, #1 + cmp dividend, divisor + bhs LSYM(Lover10) + RET + +LSYM(Lover10): + push { work } + + THUMB_DIV_MOD_BODY 1 + + pop { work } + RET + +#else /* ARM version. */ + + subs r2, r1, #1 @ compare divisor with 1 + bcc LSYM(Ldiv0) + cmpne r0, r1 @ compare dividend with divisor + moveq r0, #0 + tsthi r1, r2 @ see if divisor is power of 2 + andeq r0, r0, r2 + RETc(ls) + + ARM_MOD_BODY r0, r1, r2, r3 + + RET + +#endif /* ARM version. */ + + DIV_FUNC_END umodsi3 unsigned + +#endif /* L_umodsi3 */ +/* ------------------------------------------------------------------------ */ +#ifdef L_divsi3 + +#if defined(__prefer_thumb__) + + FUNC_START divsi3 + FUNC_ALIAS aeabi_idiv divsi3 + + cmp divisor, #0 + beq LSYM(Ldiv0) +LSYM(divsi3_skip_div0_test): + push { work } + mov work, dividend + eor work, divisor @ Save the sign of the result. + mov ip, work + mov curbit, #1 + mov result, #0 + cmp divisor, #0 + bpl LSYM(Lover10) + neg divisor, divisor @ Loops below use unsigned. +LSYM(Lover10): + cmp dividend, #0 + bpl LSYM(Lover11) + neg dividend, dividend +LSYM(Lover11): + cmp dividend, divisor + blo LSYM(Lgot_result) + + THUMB_DIV_MOD_BODY 0 + + mov r0, result + mov work, ip + cmp work, #0 + bpl LSYM(Lover12) + neg r0, r0 +LSYM(Lover12): + pop { work } + RET + +#else /* ARM/Thumb-2 version. */ + + ARM_FUNC_START divsi3 + ARM_FUNC_ALIAS aeabi_idiv divsi3 + + cmp r1, #0 + beq LSYM(Ldiv0) +LSYM(divsi3_skip_div0_test): + eor ip, r0, r1 @ save the sign of the result. + do_it mi + rsbmi r1, r1, #0 @ loops below use unsigned. + subs r2, r1, #1 @ division by 1 or -1 ? + beq 10f + movs r3, r0 + do_it mi + rsbmi r3, r0, #0 @ positive dividend value + cmp r3, r1 + bls 11f + tst r1, r2 @ divisor is power of 2 ? + beq 12f + + ARM_DIV_BODY r3, r1, r0, r2 + + cmp ip, #0 + do_it mi + rsbmi r0, r0, #0 + RET + +10: teq ip, r0 @ same sign ? + do_it mi + rsbmi r0, r0, #0 + RET + +11: do_it lo + movlo r0, #0 + do_it eq,t + moveq r0, ip, asr #31 + orreq r0, r0, #1 + RET + +12: ARM_DIV2_ORDER r1, r2 + + cmp ip, #0 + mov r0, r3, lsr r2 + do_it mi + rsbmi r0, r0, #0 + RET + +#endif /* ARM version */ + + DIV_FUNC_END divsi3 signed + +#if defined(__prefer_thumb__) +FUNC_START aeabi_idivmod + cmp r1, #0 + beq LSYM(Ldiv0) + push {r0, r1, lr} + bl LSYM(divsi3_skip_div0_test) + POP {r1, r2, r3} + mul r2, r0 + sub r1, r1, r2 + bx r3 +#else +ARM_FUNC_START aeabi_idivmod + cmp r1, #0 + beq LSYM(Ldiv0) + stmfd sp!, { r0, r1, lr } + bl LSYM(divsi3_skip_div0_test) + ldmfd sp!, { r1, r2, lr } + mul r3, r2, r0 + sub r1, r1, r3 + RET +#endif + FUNC_END aeabi_idivmod + +#endif /* L_divsi3 */ +/* ------------------------------------------------------------------------ */ +#ifdef L_modsi3 + + FUNC_START modsi3 + +#ifdef __thumb__ + + mov curbit, #1 + cmp divisor, #0 + beq LSYM(Ldiv0) + bpl LSYM(Lover10) + neg divisor, divisor @ Loops below use unsigned. +LSYM(Lover10): + push { work } + @ Need to save the sign of the dividend, unfortunately, we need + @ work later on. Must do this after saving the original value of + @ the work register, because we will pop this value off first. + push { dividend } + cmp dividend, #0 + bpl LSYM(Lover11) + neg dividend, dividend +LSYM(Lover11): + cmp dividend, divisor + blo LSYM(Lgot_result) + + THUMB_DIV_MOD_BODY 1 + + pop { work } + cmp work, #0 + bpl LSYM(Lover12) + neg dividend, dividend +LSYM(Lover12): + pop { work } + RET + +#else /* ARM version. */ + + cmp r1, #0 + beq LSYM(Ldiv0) + rsbmi r1, r1, #0 @ loops below use unsigned. + movs ip, r0 @ preserve sign of dividend + rsbmi r0, r0, #0 @ if negative make positive + subs r2, r1, #1 @ compare divisor with 1 + cmpne r0, r1 @ compare dividend with divisor + moveq r0, #0 + tsthi r1, r2 @ see if divisor is power of 2 + andeq r0, r0, r2 + bls 10f + + ARM_MOD_BODY r0, r1, r2, r3 + +10: cmp ip, #0 + rsbmi r0, r0, #0 + RET + +#endif /* ARM version */ + + DIV_FUNC_END modsi3 signed + +#endif /* L_modsi3 */ +/* ------------------------------------------------------------------------ */ +#ifdef L_dvmd_tls + +#ifdef __ARM_EABI__ + WEAK aeabi_idiv0 + WEAK aeabi_ldiv0 + FUNC_START aeabi_idiv0 + FUNC_START aeabi_ldiv0 + RET + FUNC_END aeabi_ldiv0 + FUNC_END aeabi_idiv0 +#else + FUNC_START div0 + RET + FUNC_END div0 +#endif + +#endif /* L_divmodsi_tools */ +/* ------------------------------------------------------------------------ */ +#ifdef L_dvmd_lnx +@ GNU/Linux division-by zero handler. Used in place of L_dvmd_tls + +/* Constant taken from . */ +#define SIGFPE 8 + +#ifdef __ARM_EABI__ + WEAK aeabi_idiv0 + WEAK aeabi_ldiv0 + ARM_FUNC_START aeabi_idiv0 + ARM_FUNC_START aeabi_ldiv0 +#else + ARM_FUNC_START div0 +#endif + + do_push {r1, lr} + mov r0, #SIGFPE + bl SYM(raise) __PLT__ + RETLDM r1 + +#ifdef __ARM_EABI__ + FUNC_END aeabi_ldiv0 + FUNC_END aeabi_idiv0 +#else + FUNC_END div0 +#endif + +#endif /* L_dvmd_lnx */ +#ifdef L_clear_cache +#if defined __ARM_EABI__ && defined __linux__ +@ EABI GNU/Linux call to cacheflush syscall. + ARM_FUNC_START clear_cache + do_push {r7} +#if __ARM_ARCH__ >= 7 || defined(__ARM_ARCH_6T2__) + movw r7, #2 + movt r7, #0xf +#else + mov r7, #0xf0000 + add r7, r7, #2 +#endif + mov r2, #0 + swi 0 + do_pop {r7} + RET + FUNC_END clear_cache +#else +#error "This is only for ARM EABI GNU/Linux" +#endif +#endif /* L_clear_cache */ +/* ------------------------------------------------------------------------ */ +/* Dword shift operations. */ +/* All the following Dword shift variants rely on the fact that + shft xxx, Reg + is in fact done as + shft xxx, (Reg & 255) + so for Reg value in (32...63) and (-1...-31) we will get zero (in the + case of logical shifts) or the sign (for asr). */ + +#ifdef __ARMEB__ +#define al r1 +#define ah r0 +#else +#define al r0 +#define ah r1 +#endif + +/* Prevent __aeabi double-word shifts from being produced on SymbianOS. */ +#ifndef __symbian__ + +#ifdef L_lshrdi3 + + FUNC_START lshrdi3 + FUNC_ALIAS aeabi_llsr lshrdi3 + +#ifdef __thumb__ + lsr al, r2 + mov r3, ah + lsr ah, r2 + mov ip, r3 + sub r2, #32 + lsr r3, r2 + orr al, r3 + neg r2, r2 + mov r3, ip + lsl r3, r2 + orr al, r3 + RET +#else + subs r3, r2, #32 + rsb ip, r2, #32 + movmi al, al, lsr r2 + movpl al, ah, lsr r3 + orrmi al, al, ah, lsl ip + mov ah, ah, lsr r2 + RET +#endif + FUNC_END aeabi_llsr + FUNC_END lshrdi3 + +#endif + +#ifdef L_ashrdi3 + + FUNC_START ashrdi3 + FUNC_ALIAS aeabi_lasr ashrdi3 + +#ifdef __thumb__ + lsr al, r2 + mov r3, ah + asr ah, r2 + sub r2, #32 + @ If r2 is negative at this point the following step would OR + @ the sign bit into all of AL. That's not what we want... + bmi 1f + mov ip, r3 + asr r3, r2 + orr al, r3 + mov r3, ip +1: + neg r2, r2 + lsl r3, r2 + orr al, r3 + RET +#else + subs r3, r2, #32 + rsb ip, r2, #32 + movmi al, al, lsr r2 + movpl al, ah, asr r3 + orrmi al, al, ah, lsl ip + mov ah, ah, asr r2 + RET +#endif + + FUNC_END aeabi_lasr + FUNC_END ashrdi3 + +#endif + +#ifdef L_ashldi3 + + FUNC_START ashldi3 + FUNC_ALIAS aeabi_llsl ashldi3 + +#ifdef __thumb__ + lsl ah, r2 + mov r3, al + lsl al, r2 + mov ip, r3 + sub r2, #32 + lsl r3, r2 + orr ah, r3 + neg r2, r2 + mov r3, ip + lsr r3, r2 + orr ah, r3 + RET +#else + subs r3, r2, #32 + rsb ip, r2, #32 + movmi ah, ah, lsl r2 + movpl ah, al, lsl r3 + orrmi ah, ah, al, lsr ip + mov al, al, lsl r2 + RET +#endif + FUNC_END aeabi_llsl + FUNC_END ashldi3 + +#endif + +#endif /* __symbian__ */ + +#if ((__ARM_ARCH__ > 5) && !defined(__ARM_ARCH_6M__)) \ + || defined(__ARM_ARCH_5E__) || defined(__ARM_ARCH_5TE__) \ + || defined(__ARM_ARCH_5TEJ__) +#define HAVE_ARM_CLZ 1 +#endif + +#ifdef L_clzsi2 +#if defined(__ARM_ARCH_6M__) +FUNC_START clzsi2 + mov r1, #28 + mov r3, #1 + lsl r3, r3, #16 + cmp r0, r3 /* 0x10000 */ + bcc 2f + lsr r0, r0, #16 + sub r1, r1, #16 +2: lsr r3, r3, #8 + cmp r0, r3 /* #0x100 */ + bcc 2f + lsr r0, r0, #8 + sub r1, r1, #8 +2: lsr r3, r3, #4 + cmp r0, r3 /* #0x10 */ + bcc 2f + lsr r0, r0, #4 + sub r1, r1, #4 +2: adr r2, 1f + ldrb r0, [r2, r0] + add r0, r0, r1 + bx lr +.align 2 +1: +.byte 4, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0 + FUNC_END clzsi2 +#else +ARM_FUNC_START clzsi2 +# if defined(HAVE_ARM_CLZ) + clz r0, r0 + RET +# else + mov r1, #28 + cmp r0, #0x10000 + do_it cs, t + movcs r0, r0, lsr #16 + subcs r1, r1, #16 + cmp r0, #0x100 + do_it cs, t + movcs r0, r0, lsr #8 + subcs r1, r1, #8 + cmp r0, #0x10 + do_it cs, t + movcs r0, r0, lsr #4 + subcs r1, r1, #4 + adr r2, 1f + ldrb r0, [r2, r0] + add r0, r0, r1 + RET +.align 2 +1: +.byte 4, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0 +# endif /* !HAVE_ARM_CLZ */ + FUNC_END clzsi2 +#endif +#endif /* L_clzsi2 */ + +#ifdef L_clzdi2 +#if !defined(HAVE_ARM_CLZ) + +# if defined(__ARM_ARCH_6M__) +FUNC_START clzdi2 + push {r4, lr} +# else +ARM_FUNC_START clzdi2 + do_push {r4, lr} +# endif + cmp xxh, #0 + bne 1f +# ifdef __ARMEB__ + mov r0, xxl + bl __clzsi2 + add r0, r0, #32 + b 2f +1: + bl __clzsi2 +# else + bl __clzsi2 + add r0, r0, #32 + b 2f +1: + mov r0, xxh + bl __clzsi2 +# endif +2: +# if defined(__ARM_ARCH_6M__) + pop {r4, pc} +# else + RETLDM r4 +# endif + FUNC_END clzdi2 + +#else /* HAVE_ARM_CLZ */ + +ARM_FUNC_START clzdi2 + cmp xxh, #0 + do_it eq, et + clzeq r0, xxl + clzne r0, xxh + addeq r0, r0, #32 + RET + FUNC_END clzdi2 + +#endif +#endif /* L_clzdi2 */ + +/* ------------------------------------------------------------------------ */ +/* These next two sections are here despite the fact that they contain Thumb + assembler because their presence allows interworked code to be linked even + when the GCC library is this one. */ + +/* Do not build the interworking functions when the target architecture does + not support Thumb instructions. (This can be a multilib option). */ +#if defined __ARM_ARCH_4T__ || defined __ARM_ARCH_5T__\ + || defined __ARM_ARCH_5TE__ || defined __ARM_ARCH_5TEJ__ \ + || __ARM_ARCH__ >= 6 + +#if defined L_call_via_rX + +/* These labels & instructions are used by the Arm/Thumb interworking code. + The address of function to be called is loaded into a register and then + one of these labels is called via a BL instruction. This puts the + return address into the link register with the bottom bit set, and the + code here switches to the correct mode before executing the function. */ + + .text + .align 0 + .force_thumb + +.macro call_via register + THUMB_FUNC_START _call_via_\register + + bx \register + nop + + SIZE (_call_via_\register) +.endm + + call_via r0 + call_via r1 + call_via r2 + call_via r3 + call_via r4 + call_via r5 + call_via r6 + call_via r7 + call_via r8 + call_via r9 + call_via sl + call_via fp + call_via ip + call_via sp + call_via lr + +#endif /* L_call_via_rX */ + +/* Don't bother with the old interworking routines for Thumb-2. */ +/* ??? Maybe only omit these on "m" variants. */ +#if !defined(__thumb2__) && !defined(__ARM_ARCH_6M__) + +#if defined L_interwork_call_via_rX + +/* These labels & instructions are used by the Arm/Thumb interworking code, + when the target address is in an unknown instruction set. The address + of function to be called is loaded into a register and then one of these + labels is called via a BL instruction. This puts the return address + into the link register with the bottom bit set, and the code here + switches to the correct mode before executing the function. Unfortunately + the target code cannot be relied upon to return via a BX instruction, so + instead we have to store the resturn address on the stack and allow the + called function to return here instead. Upon return we recover the real + return address and use a BX to get back to Thumb mode. + + There are three variations of this code. The first, + _interwork_call_via_rN(), will push the return address onto the + stack and pop it in _arm_return(). It should only be used if all + arguments are passed in registers. + + The second, _interwork_r7_call_via_rN(), instead stores the return + address at [r7, #-4]. It is the caller's responsibility to ensure + that this address is valid and contains no useful data. + + The third, _interwork_r11_call_via_rN(), works in the same way but + uses r11 instead of r7. It is useful if the caller does not really + need a frame pointer. */ + + .text + .align 0 + + .code 32 + .globl _arm_return +LSYM(Lstart_arm_return): + cfi_start LSYM(Lstart_arm_return) LSYM(Lend_arm_return) + cfi_push 0, 0xe, -0x8, 0x8 + nop @ This nop is for the benefit of debuggers, so that + @ backtraces will use the correct unwind information. +_arm_return: + RETLDM unwind=LSYM(Lstart_arm_return) + cfi_end LSYM(Lend_arm_return) + + .globl _arm_return_r7 +_arm_return_r7: + ldr lr, [r7, #-4] + bx lr + + .globl _arm_return_r11 +_arm_return_r11: + ldr lr, [r11, #-4] + bx lr + +.macro interwork_with_frame frame, register, name, return + .code 16 + + THUMB_FUNC_START \name + + bx pc + nop + + .code 32 + tst \register, #1 + streq lr, [\frame, #-4] + adreq lr, _arm_return_\frame + bx \register + + SIZE (\name) +.endm + +.macro interwork register + .code 16 + + THUMB_FUNC_START _interwork_call_via_\register + + bx pc + nop + + .code 32 + .globl LSYM(Lchange_\register) +LSYM(Lchange_\register): + tst \register, #1 + streq lr, [sp, #-8]! + adreq lr, _arm_return + bx \register + + SIZE (_interwork_call_via_\register) + + interwork_with_frame r7,\register,_interwork_r7_call_via_\register + interwork_with_frame r11,\register,_interwork_r11_call_via_\register +.endm + + interwork r0 + interwork r1 + interwork r2 + interwork r3 + interwork r4 + interwork r5 + interwork r6 + interwork r7 + interwork r8 + interwork r9 + interwork sl + interwork fp + interwork ip + interwork sp + + /* The LR case has to be handled a little differently... */ + .code 16 + + THUMB_FUNC_START _interwork_call_via_lr + + bx pc + nop + + .code 32 + .globl .Lchange_lr +.Lchange_lr: + tst lr, #1 + stmeqdb r13!, {lr, pc} + mov ip, lr + adreq lr, _arm_return + bx ip + + SIZE (_interwork_call_via_lr) + +#endif /* L_interwork_call_via_rX */ +#endif /* !__thumb2__ */ + +/* Functions to support compact pic switch tables in thumb1 state. + All these routines take an index into the table in r0. The + table is at LR & ~1 (but this must be rounded up in the case + of 32-bit entires). They are only permitted to clobber r12 + and r14 and r0 must be preserved on exit. */ +#ifdef L_thumb1_case_sqi + + .text + .align 0 + .force_thumb + .syntax unified + THUMB_FUNC_START __gnu_thumb1_case_sqi + push {r1} + mov r1, lr + lsrs r1, r1, #1 + lsls r1, r1, #1 + ldrsb r1, [r1, r0] + lsls r1, r1, #1 + add lr, lr, r1 + pop {r1} + bx lr + SIZE (__gnu_thumb1_case_sqi) +#endif + +#ifdef L_thumb1_case_uqi + + .text + .align 0 + .force_thumb + .syntax unified + THUMB_FUNC_START __gnu_thumb1_case_uqi + push {r1} + mov r1, lr + lsrs r1, r1, #1 + lsls r1, r1, #1 + ldrb r1, [r1, r0] + lsls r1, r1, #1 + add lr, lr, r1 + pop {r1} + bx lr + SIZE (__gnu_thumb1_case_uqi) +#endif + +#ifdef L_thumb1_case_shi + + .text + .align 0 + .force_thumb + .syntax unified + THUMB_FUNC_START __gnu_thumb1_case_shi + push {r0, r1} + mov r1, lr + lsrs r1, r1, #1 + lsls r0, r0, #1 + lsls r1, r1, #1 + ldrsh r1, [r1, r0] + lsls r1, r1, #1 + add lr, lr, r1 + pop {r0, r1} + bx lr + SIZE (__gnu_thumb1_case_shi) +#endif + +#ifdef L_thumb1_case_uhi + + .text + .align 0 + .force_thumb + .syntax unified + THUMB_FUNC_START __gnu_thumb1_case_uhi + push {r0, r1} + mov r1, lr + lsrs r1, r1, #1 + lsls r0, r0, #1 + lsls r1, r1, #1 + ldrh r1, [r1, r0] + lsls r1, r1, #1 + add lr, lr, r1 + pop {r0, r1} + bx lr + SIZE (__gnu_thumb1_case_uhi) +#endif + +#ifdef L_thumb1_case_si + + .text + .align 0 + .force_thumb + .syntax unified + THUMB_FUNC_START __gnu_thumb1_case_si + push {r0, r1} + mov r1, lr + adds.n r1, r1, #2 /* Align to word. */ + lsrs r1, r1, #2 + lsls r0, r0, #2 + lsls r1, r1, #2 + ldr r0, [r1, r0] + adds r0, r0, r1 + mov lr, r0 + pop {r0, r1} + mov pc, lr /* We know we were called from thumb code. */ + SIZE (__gnu_thumb1_case_si) +#endif + +#endif /* Arch supports thumb. */ + +#ifndef __symbian__ +#ifndef __ARM_ARCH_6M__ +#include "ieee754-df.S" +#include "ieee754-sf.S" +#include "bpabi.S" +#else /* __ARM_ARCH_6M__ */ +#include "bpabi-v6m.S" +#endif /* __ARM_ARCH_6M__ */ +#endif /* !__symbian__ */ diff --git a/gcc/config/arm/libgcc-bpabi.ver b/gcc/config/arm/libgcc-bpabi.ver new file mode 100644 index 000000000..3ba8364dc --- /dev/null +++ b/gcc/config/arm/libgcc-bpabi.ver @@ -0,0 +1,108 @@ +# Copyright (C) 2004, 2005, 2007 Free Software Foundation, Inc. +# +# This file is part of GCC. +# +# GCC is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# GCC is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with GCC; see the file COPYING3. If not see +# . + +GCC_3.5 { + # BPABI symbols + __aeabi_cdcmpeq + __aeabi_cdcmple + __aeabi_cdrcmple + __aeabi_cfcmpeq + __aeabi_cfcmple + __aeabi_cfrcmple + __aeabi_d2f + __aeabi_d2iz + __aeabi_d2lz + __aeabi_d2uiz + __aeabi_d2ulz + __aeabi_dadd + __aeabi_dcmpeq + __aeabi_dcmpge + __aeabi_dcmpgt + __aeabi_dcmple + __aeabi_dcmplt + __aeabi_dcmpun + __aeabi_ddiv + __aeabi_dmul + __aeabi_dneg + __aeabi_drsub + __aeabi_dsub + __aeabi_f2d + __aeabi_f2iz + __aeabi_f2lz + __aeabi_f2uiz + __aeabi_f2ulz + __aeabi_fadd + __aeabi_fcmpeq + __aeabi_fcmpge + __aeabi_fcmpgt + __aeabi_fcmple + __aeabi_fcmplt + __aeabi_fcmpun + __aeabi_fdiv + __aeabi_fmul + __aeabi_fneg + __aeabi_frsub + __aeabi_fsub + __aeabi_i2d + __aeabi_i2f + __aeabi_idiv + __aeabi_idiv0 + __aeabi_idivmod + __aeabi_l2d + __aeabi_l2f + __aeabi_lasr + __aeabi_lcmp + __aeabi_ldiv0 + __aeabi_ldivmod + __aeabi_llsl + __aeabi_llsr + __aeabi_lmul + __aeabi_ui2d + __aeabi_ui2f + __aeabi_uidiv + __aeabi_uidivmod + __aeabi_uldivmod + __aeabi_ulcmp + __aeabi_ul2d + __aeabi_ul2f + __aeabi_uread4 + __aeabi_uread8 + __aeabi_uwrite4 + __aeabi_uwrite8 + + # Exception-Handling + # \S 7.5 + _Unwind_Complete + _Unwind_VRS_Get + _Unwind_VRS_Set + _Unwind_VRS_Pop + # \S 9.2 + __aeabi_unwind_cpp_pr0 + __aeabi_unwind_cpp_pr1 + __aeabi_unwind_cpp_pr2 + # The libstdc++ exception-handling personality routine uses this + # GNU-specific entry point. + __gnu_unwind_frame +} + +%exclude { + _Unwind_Backtrace +} +GCC_4.3.0 { + _Unwind_Backtrace +} diff --git a/gcc/config/arm/libunwind.S b/gcc/config/arm/libunwind.S new file mode 100644 index 000000000..48eb592fd --- /dev/null +++ b/gcc/config/arm/libunwind.S @@ -0,0 +1,363 @@ +/* Support functions for the unwinder. + Copyright (C) 2003, 2004, 2005, 2007, 2008, 2009, 2010 + Free Software Foundation, Inc. + Contributed by Paul Brook + + This file is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by the + Free Software Foundation; either version 3, or (at your option) any + later version. + + This file is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +/* An executable stack is *not* required for these functions. */ +#if defined(__ELF__) && defined(__linux__) +.section .note.GNU-stack,"",%progbits +.previous +#endif + +#ifdef __ARM_EABI__ +/* Some attributes that are common to all routines in this file. */ + /* Tag_ABI_align_needed: This code does not require 8-byte + alignment from the caller. */ + /* .eabi_attribute 24, 0 -- default setting. */ + /* Tag_ABI_align_preserved: This code preserves 8-byte + alignment in any callee. */ + .eabi_attribute 25, 1 +#endif /* __ARM_EABI__ */ + +#ifndef __symbian__ + +#include "lib1funcs.asm" + +.macro UNPREFIX name + .global SYM (\name) + EQUIV SYM (\name), SYM (__\name) +.endm + +#if (__ARM_ARCH__ == 4) +/* Some coprocessors require armv5. We know this code will never be run on + other cpus. Tell gas to allow armv5, but only mark the objects as armv4. + */ +.arch armv5t +#ifdef __ARM_ARCH_4T__ +.object_arch armv4t +#else +.object_arch armv4 +#endif +#endif + +#ifdef __ARM_ARCH_6M__ + +/* r0 points to a 16-word block. Upload these values to the actual core + state. */ +FUNC_START restore_core_regs + mov r1, r0 + add r1, r1, #52 + ldmia r1!, {r3, r4, r5} + sub r3, r3, #4 + mov ip, r3 + str r5, [r3] + mov lr, r4 + /* Restore r8-r11. */ + mov r1, r0 + add r1, r1, #32 + ldmia r1!, {r2, r3, r4, r5} + mov r8, r2 + mov r9, r3 + mov sl, r4 + mov fp, r5 + mov r1, r0 + add r1, r1, #8 + ldmia r1!, {r2, r3, r4, r5, r6, r7} + ldr r1, [r0, #4] + ldr r0, [r0] + mov sp, ip + pop {pc} + FUNC_END restore_core_regs + UNPREFIX restore_core_regs + +/* ARMV6M does not have coprocessors, so these should never be used. */ +FUNC_START gnu_Unwind_Restore_VFP + RET + +/* Store VFR regsters d0-d15 to the address in r0. */ +FUNC_START gnu_Unwind_Save_VFP + RET + +/* Load VFP registers d0-d15 from the address in r0. + Use this to load from FSTMD format. */ +FUNC_START gnu_Unwind_Restore_VFP_D + RET + +/* Store VFP registers d0-d15 to the address in r0. + Use this to store in FLDMD format. */ +FUNC_START gnu_Unwind_Save_VFP_D + RET + +/* Load VFP registers d16-d31 from the address in r0. + Use this to load from FSTMD (=VSTM) format. Needs VFPv3. */ +FUNC_START gnu_Unwind_Restore_VFP_D_16_to_31 + RET + +/* Store VFP registers d16-d31 to the address in r0. + Use this to store in FLDMD (=VLDM) format. Needs VFPv3. */ +FUNC_START gnu_Unwind_Save_VFP_D_16_to_31 + RET + +FUNC_START gnu_Unwind_Restore_WMMXD + RET + +FUNC_START gnu_Unwind_Save_WMMXD + RET + +FUNC_START gnu_Unwind_Restore_WMMXC + RET + +FUNC_START gnu_Unwind_Save_WMMXC + RET + +.macro UNWIND_WRAPPER name nargs + FUNC_START \name + /* Create a phase2_vrs structure. */ + /* Save r0 in the PC slot so we can use it as a scratch register. */ + push {r0} + add r0, sp, #4 + push {r0, lr} /* Push original SP and LR. */ + /* Make space for r8-r12. */ + sub sp, sp, #20 + /* Save low registers. */ + push {r0, r1, r2, r3, r4, r5, r6, r7} + /* Save high registers. */ + add r0, sp, #32 + mov r1, r8 + mov r2, r9 + mov r3, sl + mov r4, fp + mov r5, ip + stmia r0!, {r1, r2, r3, r4, r5} + /* Restore original low register values. */ + add r0, sp, #4 + ldmia r0!, {r1, r2, r3, r4, r5} + /* Restore orginial r0. */ + ldr r0, [sp, #60] + str r0, [sp] + /* Demand-save flags, plus an extra word for alignment. */ + mov r3, #0 + push {r2, r3} + /* Point r1 at the block. Pass r[0..nargs) unchanged. */ + add r\nargs, sp, #4 + + bl SYM (__gnu\name) + + ldr r3, [sp, #64] + add sp, sp, #72 + bx r3 + + FUNC_END \name + UNPREFIX \name +.endm + +#else /* !__ARM_ARCH_6M__ */ + +/* r0 points to a 16-word block. Upload these values to the actual core + state. */ +ARM_FUNC_START restore_core_regs + /* We must use sp as the base register when restoring sp. Push the + last 3 registers onto the top of the current stack to achieve + this. */ + add r1, r0, #52 + ldmia r1, {r3, r4, r5} /* {sp, lr, pc}. */ +#if defined(__thumb2__) + /* Thumb-2 doesn't allow sp in a load-multiple instruction, so push + the target address onto the target stack. This is safe as + we're always returning to somewhere further up the call stack. */ + mov ip, r3 + mov lr, r4 + str r5, [ip, #-4]! +#elif defined(__INTERWORKING__) + /* Restore pc into ip. */ + mov r2, r5 + stmfd sp!, {r2, r3, r4} +#else + stmfd sp!, {r3, r4, r5} +#endif + /* Don't bother restoring ip. */ + ldmia r0, {r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, sl, fp} +#if defined(__thumb2__) + /* Pop the return address off the target stack. */ + mov sp, ip + pop {pc} +#elif defined(__INTERWORKING__) + /* Pop the three registers we pushed earlier. */ + ldmfd sp, {ip, sp, lr} + bx ip +#else + ldmfd sp, {sp, lr, pc} +#endif + FUNC_END restore_core_regs + UNPREFIX restore_core_regs + +/* Load VFP registers d0-d15 from the address in r0. + Use this to load from FSTMX format. */ +ARM_FUNC_START gnu_Unwind_Restore_VFP + /* Use the generic coprocessor form so that gas doesn't complain + on soft-float targets. */ + ldc p11,cr0,[r0],{0x21} /* fldmiax r0, {d0-d15} */ + RET + +/* Store VFP registers d0-d15 to the address in r0. + Use this to store in FSTMX format. */ +ARM_FUNC_START gnu_Unwind_Save_VFP + /* Use the generic coprocessor form so that gas doesn't complain + on soft-float targets. */ + stc p11,cr0,[r0],{0x21} /* fstmiax r0, {d0-d15} */ + RET + +/* Load VFP registers d0-d15 from the address in r0. + Use this to load from FSTMD format. */ +ARM_FUNC_START gnu_Unwind_Restore_VFP_D + ldc p11,cr0,[r0],{0x20} /* fldmiad r0, {d0-d15} */ + RET + +/* Store VFP registers d0-d15 to the address in r0. + Use this to store in FLDMD format. */ +ARM_FUNC_START gnu_Unwind_Save_VFP_D + stc p11,cr0,[r0],{0x20} /* fstmiad r0, {d0-d15} */ + RET + +/* Load VFP registers d16-d31 from the address in r0. + Use this to load from FSTMD (=VSTM) format. Needs VFPv3. */ +ARM_FUNC_START gnu_Unwind_Restore_VFP_D_16_to_31 + ldcl p11,cr0,[r0],{0x20} /* vldm r0, {d16-d31} */ + RET + +/* Store VFP registers d16-d31 to the address in r0. + Use this to store in FLDMD (=VLDM) format. Needs VFPv3. */ +ARM_FUNC_START gnu_Unwind_Save_VFP_D_16_to_31 + stcl p11,cr0,[r0],{0x20} /* vstm r0, {d16-d31} */ + RET + +ARM_FUNC_START gnu_Unwind_Restore_WMMXD + /* Use the generic coprocessor form so that gas doesn't complain + on non-iWMMXt targets. */ + ldcl p1, cr0, [r0], #8 /* wldrd wr0, [r0], #8 */ + ldcl p1, cr1, [r0], #8 /* wldrd wr1, [r0], #8 */ + ldcl p1, cr2, [r0], #8 /* wldrd wr2, [r0], #8 */ + ldcl p1, cr3, [r0], #8 /* wldrd wr3, [r0], #8 */ + ldcl p1, cr4, [r0], #8 /* wldrd wr4, [r0], #8 */ + ldcl p1, cr5, [r0], #8 /* wldrd wr5, [r0], #8 */ + ldcl p1, cr6, [r0], #8 /* wldrd wr6, [r0], #8 */ + ldcl p1, cr7, [r0], #8 /* wldrd wr7, [r0], #8 */ + ldcl p1, cr8, [r0], #8 /* wldrd wr8, [r0], #8 */ + ldcl p1, cr9, [r0], #8 /* wldrd wr9, [r0], #8 */ + ldcl p1, cr10, [r0], #8 /* wldrd wr10, [r0], #8 */ + ldcl p1, cr11, [r0], #8 /* wldrd wr11, [r0], #8 */ + ldcl p1, cr12, [r0], #8 /* wldrd wr12, [r0], #8 */ + ldcl p1, cr13, [r0], #8 /* wldrd wr13, [r0], #8 */ + ldcl p1, cr14, [r0], #8 /* wldrd wr14, [r0], #8 */ + ldcl p1, cr15, [r0], #8 /* wldrd wr15, [r0], #8 */ + RET + +ARM_FUNC_START gnu_Unwind_Save_WMMXD + /* Use the generic coprocessor form so that gas doesn't complain + on non-iWMMXt targets. */ + stcl p1, cr0, [r0], #8 /* wstrd wr0, [r0], #8 */ + stcl p1, cr1, [r0], #8 /* wstrd wr1, [r0], #8 */ + stcl p1, cr2, [r0], #8 /* wstrd wr2, [r0], #8 */ + stcl p1, cr3, [r0], #8 /* wstrd wr3, [r0], #8 */ + stcl p1, cr4, [r0], #8 /* wstrd wr4, [r0], #8 */ + stcl p1, cr5, [r0], #8 /* wstrd wr5, [r0], #8 */ + stcl p1, cr6, [r0], #8 /* wstrd wr6, [r0], #8 */ + stcl p1, cr7, [r0], #8 /* wstrd wr7, [r0], #8 */ + stcl p1, cr8, [r0], #8 /* wstrd wr8, [r0], #8 */ + stcl p1, cr9, [r0], #8 /* wstrd wr9, [r0], #8 */ + stcl p1, cr10, [r0], #8 /* wstrd wr10, [r0], #8 */ + stcl p1, cr11, [r0], #8 /* wstrd wr11, [r0], #8 */ + stcl p1, cr12, [r0], #8 /* wstrd wr12, [r0], #8 */ + stcl p1, cr13, [r0], #8 /* wstrd wr13, [r0], #8 */ + stcl p1, cr14, [r0], #8 /* wstrd wr14, [r0], #8 */ + stcl p1, cr15, [r0], #8 /* wstrd wr15, [r0], #8 */ + RET + +ARM_FUNC_START gnu_Unwind_Restore_WMMXC + /* Use the generic coprocessor form so that gas doesn't complain + on non-iWMMXt targets. */ + ldc2 p1, cr8, [r0], #4 /* wldrw wcgr0, [r0], #4 */ + ldc2 p1, cr9, [r0], #4 /* wldrw wcgr1, [r0], #4 */ + ldc2 p1, cr10, [r0], #4 /* wldrw wcgr2, [r0], #4 */ + ldc2 p1, cr11, [r0], #4 /* wldrw wcgr3, [r0], #4 */ + RET + +ARM_FUNC_START gnu_Unwind_Save_WMMXC + /* Use the generic coprocessor form so that gas doesn't complain + on non-iWMMXt targets. */ + stc2 p1, cr8, [r0], #4 /* wstrw wcgr0, [r0], #4 */ + stc2 p1, cr9, [r0], #4 /* wstrw wcgr1, [r0], #4 */ + stc2 p1, cr10, [r0], #4 /* wstrw wcgr2, [r0], #4 */ + stc2 p1, cr11, [r0], #4 /* wstrw wcgr3, [r0], #4 */ + RET + +/* Wrappers to save core registers, then call the real routine. */ + +.macro UNWIND_WRAPPER name nargs + ARM_FUNC_START \name + /* Create a phase2_vrs structure. */ + /* Split reg push in two to ensure the correct value for sp. */ +#if defined(__thumb2__) + mov ip, sp + push {lr} /* PC is ignored. */ + push {ip, lr} /* Push original SP and LR. */ +#else + stmfd sp!, {sp, lr, pc} +#endif + stmfd sp!, {r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, sl, fp, ip} + + /* Demand-save flags, plus an extra word for alignment. */ + mov r3, #0 + stmfd sp!, {r2, r3} + + /* Point r1 at the block. Pass r[0..nargs) unchanged. */ + add r\nargs, sp, #4 +#if defined(__thumb__) && !defined(__thumb2__) + /* Switch back to thumb mode to avoid interworking hassle. */ + adr ip, .L1_\name + orr ip, ip, #1 + bx ip + .thumb +.L1_\name: + bl SYM (__gnu\name) __PLT__ + ldr r3, [sp, #64] + add sp, #72 + bx r3 +#else + bl SYM (__gnu\name) __PLT__ + ldr lr, [sp, #64] + add sp, sp, #72 + RET +#endif + FUNC_END \name + UNPREFIX \name +.endm + +#endif /* !__ARM_ARCH_6M__ */ + +UNWIND_WRAPPER _Unwind_RaiseException 1 +UNWIND_WRAPPER _Unwind_Resume 1 +UNWIND_WRAPPER _Unwind_Resume_or_Rethrow 1 +UNWIND_WRAPPER _Unwind_ForcedUnwind 3 +UNWIND_WRAPPER _Unwind_Backtrace 2 + +#endif /* ndef __symbian__ */ diff --git a/gcc/config/arm/linux-atomic.c b/gcc/config/arm/linux-atomic.c new file mode 100644 index 000000000..57065a6e8 --- /dev/null +++ b/gcc/config/arm/linux-atomic.c @@ -0,0 +1,278 @@ +/* Linux-specific atomic operations for ARM EABI. + Copyright (C) 2008, 2009, 2010 Free Software Foundation, Inc. + Contributed by CodeSourcery. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 3, or (at your option) any later +version. + +GCC is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +Under Section 7 of GPL version 3, you are granted additional +permissions described in the GCC Runtime Library Exception, version +3.1, as published by the Free Software Foundation. + +You should have received a copy of the GNU General Public License and +a copy of the GCC Runtime Library Exception along with this program; +see the files COPYING3 and COPYING.RUNTIME respectively. If not, see +. */ + +/* Kernel helper for compare-and-exchange. */ +typedef int (__kernel_cmpxchg_t) (int oldval, int newval, int *ptr); +#define __kernel_cmpxchg (*(__kernel_cmpxchg_t *) 0xffff0fc0) + +/* Kernel helper for memory barrier. */ +typedef void (__kernel_dmb_t) (void); +#define __kernel_dmb (*(__kernel_dmb_t *) 0xffff0fa0) + +/* Note: we implement byte, short and int versions of atomic operations using + the above kernel helpers, but there is no support for "long long" (64-bit) + operations as yet. */ + +#define HIDDEN __attribute__ ((visibility ("hidden"))) + +#ifdef __ARMEL__ +#define INVERT_MASK_1 0 +#define INVERT_MASK_2 0 +#else +#define INVERT_MASK_1 24 +#define INVERT_MASK_2 16 +#endif + +#define MASK_1 0xffu +#define MASK_2 0xffffu + +#define FETCH_AND_OP_WORD(OP, PFX_OP, INF_OP) \ + int HIDDEN \ + __sync_fetch_and_##OP##_4 (int *ptr, int val) \ + { \ + int failure, tmp; \ + \ + do { \ + tmp = *ptr; \ + failure = __kernel_cmpxchg (tmp, PFX_OP (tmp INF_OP val), ptr); \ + } while (failure != 0); \ + \ + return tmp; \ + } + +FETCH_AND_OP_WORD (add, , +) +FETCH_AND_OP_WORD (sub, , -) +FETCH_AND_OP_WORD (or, , |) +FETCH_AND_OP_WORD (and, , &) +FETCH_AND_OP_WORD (xor, , ^) +FETCH_AND_OP_WORD (nand, ~, &) + +#define NAME_oldval(OP, WIDTH) __sync_fetch_and_##OP##_##WIDTH +#define NAME_newval(OP, WIDTH) __sync_##OP##_and_fetch_##WIDTH + +/* Implement both __sync__and_fetch and __sync_fetch_and_ for + subword-sized quantities. */ + +#define SUBWORD_SYNC_OP(OP, PFX_OP, INF_OP, TYPE, WIDTH, RETURN) \ + TYPE HIDDEN \ + NAME##_##RETURN (OP, WIDTH) (TYPE *ptr, TYPE val) \ + { \ + int *wordptr = (int *) ((unsigned int) ptr & ~3); \ + unsigned int mask, shift, oldval, newval; \ + int failure; \ + \ + shift = (((unsigned int) ptr & 3) << 3) ^ INVERT_MASK_##WIDTH; \ + mask = MASK_##WIDTH << shift; \ + \ + do { \ + oldval = *wordptr; \ + newval = ((PFX_OP (((oldval & mask) >> shift) \ + INF_OP (unsigned int) val)) << shift) & mask; \ + newval |= oldval & ~mask; \ + failure = __kernel_cmpxchg (oldval, newval, wordptr); \ + } while (failure != 0); \ + \ + return (RETURN & mask) >> shift; \ + } + +SUBWORD_SYNC_OP (add, , +, unsigned short, 2, oldval) +SUBWORD_SYNC_OP (sub, , -, unsigned short, 2, oldval) +SUBWORD_SYNC_OP (or, , |, unsigned short, 2, oldval) +SUBWORD_SYNC_OP (and, , &, unsigned short, 2, oldval) +SUBWORD_SYNC_OP (xor, , ^, unsigned short, 2, oldval) +SUBWORD_SYNC_OP (nand, ~, &, unsigned short, 2, oldval) + +SUBWORD_SYNC_OP (add, , +, unsigned char, 1, oldval) +SUBWORD_SYNC_OP (sub, , -, unsigned char, 1, oldval) +SUBWORD_SYNC_OP (or, , |, unsigned char, 1, oldval) +SUBWORD_SYNC_OP (and, , &, unsigned char, 1, oldval) +SUBWORD_SYNC_OP (xor, , ^, unsigned char, 1, oldval) +SUBWORD_SYNC_OP (nand, ~, &, unsigned char, 1, oldval) + +#define OP_AND_FETCH_WORD(OP, PFX_OP, INF_OP) \ + int HIDDEN \ + __sync_##OP##_and_fetch_4 (int *ptr, int val) \ + { \ + int tmp, failure; \ + \ + do { \ + tmp = *ptr; \ + failure = __kernel_cmpxchg (tmp, PFX_OP (tmp INF_OP val), ptr); \ + } while (failure != 0); \ + \ + return PFX_OP (tmp INF_OP val); \ + } + +OP_AND_FETCH_WORD (add, , +) +OP_AND_FETCH_WORD (sub, , -) +OP_AND_FETCH_WORD (or, , |) +OP_AND_FETCH_WORD (and, , &) +OP_AND_FETCH_WORD (xor, , ^) +OP_AND_FETCH_WORD (nand, ~, &) + +SUBWORD_SYNC_OP (add, , +, unsigned short, 2, newval) +SUBWORD_SYNC_OP (sub, , -, unsigned short, 2, newval) +SUBWORD_SYNC_OP (or, , |, unsigned short, 2, newval) +SUBWORD_SYNC_OP (and, , &, unsigned short, 2, newval) +SUBWORD_SYNC_OP (xor, , ^, unsigned short, 2, newval) +SUBWORD_SYNC_OP (nand, ~, &, unsigned short, 2, newval) + +SUBWORD_SYNC_OP (add, , +, unsigned char, 1, newval) +SUBWORD_SYNC_OP (sub, , -, unsigned char, 1, newval) +SUBWORD_SYNC_OP (or, , |, unsigned char, 1, newval) +SUBWORD_SYNC_OP (and, , &, unsigned char, 1, newval) +SUBWORD_SYNC_OP (xor, , ^, unsigned char, 1, newval) +SUBWORD_SYNC_OP (nand, ~, &, unsigned char, 1, newval) + +int HIDDEN +__sync_val_compare_and_swap_4 (int *ptr, int oldval, int newval) +{ + int actual_oldval, fail; + + while (1) + { + actual_oldval = *ptr; + + if (__builtin_expect (oldval != actual_oldval, 0)) + return actual_oldval; + + fail = __kernel_cmpxchg (actual_oldval, newval, ptr); + + if (__builtin_expect (!fail, 1)) + return oldval; + } +} + +#define SUBWORD_VAL_CAS(TYPE, WIDTH) \ + TYPE HIDDEN \ + __sync_val_compare_and_swap_##WIDTH (TYPE *ptr, TYPE oldval, \ + TYPE newval) \ + { \ + int *wordptr = (int *)((unsigned int) ptr & ~3), fail; \ + unsigned int mask, shift, actual_oldval, actual_newval; \ + \ + shift = (((unsigned int) ptr & 3) << 3) ^ INVERT_MASK_##WIDTH; \ + mask = MASK_##WIDTH << shift; \ + \ + while (1) \ + { \ + actual_oldval = *wordptr; \ + \ + if (__builtin_expect (((actual_oldval & mask) >> shift) != \ + (unsigned int) oldval, 0)) \ + return (actual_oldval & mask) >> shift; \ + \ + actual_newval = (actual_oldval & ~mask) \ + | (((unsigned int) newval << shift) & mask); \ + \ + fail = __kernel_cmpxchg (actual_oldval, actual_newval, \ + wordptr); \ + \ + if (__builtin_expect (!fail, 1)) \ + return oldval; \ + } \ + } + +SUBWORD_VAL_CAS (unsigned short, 2) +SUBWORD_VAL_CAS (unsigned char, 1) + +typedef unsigned char bool; + +bool HIDDEN +__sync_bool_compare_and_swap_4 (int *ptr, int oldval, int newval) +{ + int failure = __kernel_cmpxchg (oldval, newval, ptr); + return (failure == 0); +} + +#define SUBWORD_BOOL_CAS(TYPE, WIDTH) \ + bool HIDDEN \ + __sync_bool_compare_and_swap_##WIDTH (TYPE *ptr, TYPE oldval, \ + TYPE newval) \ + { \ + TYPE actual_oldval \ + = __sync_val_compare_and_swap_##WIDTH (ptr, oldval, newval); \ + return (oldval == actual_oldval); \ + } + +SUBWORD_BOOL_CAS (unsigned short, 2) +SUBWORD_BOOL_CAS (unsigned char, 1) + +void HIDDEN +__sync_synchronize (void) +{ + __kernel_dmb (); +} + +int HIDDEN +__sync_lock_test_and_set_4 (int *ptr, int val) +{ + int failure, oldval; + + do { + oldval = *ptr; + failure = __kernel_cmpxchg (oldval, val, ptr); + } while (failure != 0); + + return oldval; +} + +#define SUBWORD_TEST_AND_SET(TYPE, WIDTH) \ + TYPE HIDDEN \ + __sync_lock_test_and_set_##WIDTH (TYPE *ptr, TYPE val) \ + { \ + int failure; \ + unsigned int oldval, newval, shift, mask; \ + int *wordptr = (int *) ((unsigned int) ptr & ~3); \ + \ + shift = (((unsigned int) ptr & 3) << 3) ^ INVERT_MASK_##WIDTH; \ + mask = MASK_##WIDTH << shift; \ + \ + do { \ + oldval = *wordptr; \ + newval = (oldval & ~mask) \ + | (((unsigned int) val << shift) & mask); \ + failure = __kernel_cmpxchg (oldval, newval, wordptr); \ + } while (failure != 0); \ + \ + return (oldval & mask) >> shift; \ + } + +SUBWORD_TEST_AND_SET (unsigned short, 2) +SUBWORD_TEST_AND_SET (unsigned char, 1) + +#define SYNC_LOCK_RELEASE(TYPE, WIDTH) \ + void HIDDEN \ + __sync_lock_release_##WIDTH (TYPE *ptr) \ + { \ + /* All writes before this point must be seen before we release \ + the lock itself. */ \ + __kernel_dmb (); \ + *ptr = 0; \ + } + +SYNC_LOCK_RELEASE (int, 4) +SYNC_LOCK_RELEASE (short, 2) +SYNC_LOCK_RELEASE (char, 1) diff --git a/gcc/config/arm/linux-eabi.h b/gcc/config/arm/linux-eabi.h new file mode 100644 index 000000000..833005284 --- /dev/null +++ b/gcc/config/arm/linux-eabi.h @@ -0,0 +1,103 @@ +/* Configuration file for ARM GNU/Linux EABI targets. + Copyright (C) 2004, 2005, 2006, 2007, 2009, 2010, 2011 + Free Software Foundation, Inc. + Contributed by CodeSourcery, LLC + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3, or (at your + option) any later version. + + GCC is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public + License for more details. + + You should have received a copy of the GNU General Public License + along with GCC; see the file COPYING3. If not see + . */ + +/* On EABI GNU/Linux, we want both the BPABI builtins and the + GNU/Linux builtins. */ +#undef TARGET_OS_CPP_BUILTINS +#define TARGET_OS_CPP_BUILTINS() \ + do \ + { \ + TARGET_BPABI_CPP_BUILTINS(); \ + LINUX_TARGET_OS_CPP_BUILTINS(); \ + ANDROID_TARGET_OS_CPP_BUILTINS(); \ + } \ + while (false) + +/* We default to a soft-float ABI so that binaries can run on all + target hardware. */ +#undef TARGET_DEFAULT_FLOAT_ABI +#define TARGET_DEFAULT_FLOAT_ABI ARM_FLOAT_ABI_SOFT + +/* We default to the "aapcs-linux" ABI so that enums are int-sized by + default. */ +#undef ARM_DEFAULT_ABI +#define ARM_DEFAULT_ABI ARM_ABI_AAPCS_LINUX + +/* Default to armv5t so that thumb shared libraries work. + The ARM10TDMI core is the default for armv5t, so set + SUBTARGET_CPU_DEFAULT to achieve this. */ +#undef SUBTARGET_CPU_DEFAULT +#define SUBTARGET_CPU_DEFAULT TARGET_CPU_arm10tdmi + +/* TARGET_BIG_ENDIAN_DEFAULT is set in + config.gcc for big endian configurations. */ +#undef TARGET_LINKER_EMULATION +#if TARGET_BIG_ENDIAN_DEFAULT +#define TARGET_LINKER_EMULATION "armelfb_linux_eabi" +#else +#define TARGET_LINKER_EMULATION "armelf_linux_eabi" +#endif + +#undef SUBTARGET_EXTRA_LINK_SPEC +#define SUBTARGET_EXTRA_LINK_SPEC " -m " TARGET_LINKER_EMULATION + +/* Use ld-linux.so.3 so that it will be possible to run "classic" + GNU/Linux binaries on an EABI system. */ +#undef GLIBC_DYNAMIC_LINKER +#define GLIBC_DYNAMIC_LINKER "/lib/ld-linux.so.3" + +/* At this point, bpabi.h will have clobbered LINK_SPEC. We want to + use the GNU/Linux version, not the generic BPABI version. */ +#undef LINK_SPEC +#define LINK_SPEC BE8_LINK_SPEC \ + LINUX_OR_ANDROID_LD (LINUX_TARGET_LINK_SPEC, \ + LINUX_TARGET_LINK_SPEC " " ANDROID_LINK_SPEC) + +#undef CC1_SPEC +#define CC1_SPEC \ + LINUX_OR_ANDROID_CC (GNU_USER_TARGET_CC1_SPEC, \ + GNU_USER_TARGET_CC1_SPEC " " ANDROID_CC1_SPEC) + +#define CC1PLUS_SPEC \ + LINUX_OR_ANDROID_CC ("", ANDROID_CC1PLUS_SPEC) + +#undef LIB_SPEC +#define LIB_SPEC \ + LINUX_OR_ANDROID_LD (GNU_USER_TARGET_LIB_SPEC, \ + GNU_USER_TARGET_LIB_SPEC " " ANDROID_LIB_SPEC) + +#undef STARTFILE_SPEC +#define STARTFILE_SPEC \ + LINUX_OR_ANDROID_LD (GNU_USER_TARGET_STARTFILE_SPEC, ANDROID_STARTFILE_SPEC) + +#undef ENDFILE_SPEC +#define ENDFILE_SPEC \ + LINUX_OR_ANDROID_LD (GNU_USER_TARGET_ENDFILE_SPEC, ANDROID_ENDFILE_SPEC) + +/* Use the default LIBGCC_SPEC, not the version in linux-elf.h, as we + do not use -lfloat. */ +#undef LIBGCC_SPEC + +/* Clear the instruction cache from `beg' to `end'. This is + implemented in lib1funcs.asm, so ensure an error if this definition + is used. */ +#undef CLEAR_INSN_CACHE +#define CLEAR_INSN_CACHE(BEG, END) not_used diff --git a/gcc/config/arm/linux-elf.h b/gcc/config/arm/linux-elf.h new file mode 100644 index 000000000..81d27bb72 --- /dev/null +++ b/gcc/config/arm/linux-elf.h @@ -0,0 +1,120 @@ +/* Definitions for ARM running Linux-based GNU systems using ELF + Copyright (C) 1993, 1994, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, + 2005, 2006, 2007, 2008, 2009, 2010, 2011 + Free Software Foundation, Inc. + Contributed by Philip Blundell + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3, or (at your + option) any later version. + + GCC is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public + License for more details. + + You should have received a copy of the GNU General Public License + along with GCC; see the file COPYING3. If not see + . */ + +/* elfos.h should have already been included. Now just override + any conflicting definitions and add any extras. */ + +/* Run-time Target Specification. */ +#undef TARGET_VERSION +#define TARGET_VERSION fputs (" (ARM GNU/Linux with ELF)", stderr); + +#undef TARGET_DEFAULT_FLOAT_ABI +#define TARGET_DEFAULT_FLOAT_ABI ARM_FLOAT_ABI_HARD + +/* TARGET_BIG_ENDIAN_DEFAULT is set in + config.gcc for big endian configurations. */ +#if TARGET_BIG_ENDIAN_DEFAULT +#define TARGET_ENDIAN_DEFAULT MASK_BIG_END +#define TARGET_ENDIAN_OPTION "mbig-endian" +#define TARGET_LINKER_EMULATION "armelfb_linux" +#else +#define TARGET_ENDIAN_DEFAULT 0 +#define TARGET_ENDIAN_OPTION "mlittle-endian" +#define TARGET_LINKER_EMULATION "armelf_linux" +#endif + +#undef TARGET_DEFAULT +#define TARGET_DEFAULT (TARGET_ENDIAN_DEFAULT) + +#define SUBTARGET_CPU_DEFAULT TARGET_CPU_arm6 + +#define SUBTARGET_EXTRA_LINK_SPEC " -m " TARGET_LINKER_EMULATION " -p" + +#undef MULTILIB_DEFAULTS +#define MULTILIB_DEFAULTS \ + { "marm", "mlittle-endian", "mhard-float", "mno-thumb-interwork" } + +/* Now we define the strings used to build the spec file. */ +#undef LIB_SPEC +#define LIB_SPEC \ + "%{pthread:-lpthread} \ + %{shared:-lc} \ + %{!shared:%{profile:-lc_p}%{!profile:-lc}}" + +#define LIBGCC_SPEC "%{msoft-float:-lfloat} %{mfloat-abi=soft*:-lfloat} -lgcc" + +#define GLIBC_DYNAMIC_LINKER "/lib/ld-linux.so.2" + +#define LINUX_TARGET_LINK_SPEC "%{h*} \ + %{static:-Bstatic} \ + %{shared:-shared} \ + %{symbolic:-Bsymbolic} \ + %{rdynamic:-export-dynamic} \ + -dynamic-linker " LINUX_DYNAMIC_LINKER " \ + -X \ + %{mbig-endian:-EB} %{mlittle-endian:-EL}" \ + SUBTARGET_EXTRA_LINK_SPEC + +#undef LINK_SPEC +#define LINK_SPEC LINUX_TARGET_LINK_SPEC + +#define TARGET_OS_CPP_BUILTINS() \ + do \ + { \ + LINUX_TARGET_OS_CPP_BUILTINS(); \ + } \ + while (0) + +/* This is how we tell the assembler that two symbols have the same value. */ +#define ASM_OUTPUT_DEF(FILE, NAME1, NAME2) \ + do \ + { \ + assemble_name (FILE, NAME1); \ + fputs (" = ", FILE); \ + assemble_name (FILE, NAME2); \ + fputc ('\n', FILE); \ + } \ + while (0) + +/* NWFPE always understands FPA instructions. */ +#undef FPUTYPE_DEFAULT +#define FPUTYPE_DEFAULT "fpe3" + +/* Call the function profiler with a given profile label. */ +#undef ARM_FUNCTION_PROFILER +#define ARM_FUNCTION_PROFILER(STREAM, LABELNO) \ +{ \ + fprintf (STREAM, "\tbl\tmcount%s\n", \ + (TARGET_ARM && NEED_PLT_RELOC) ? "(PLT)" : ""); \ +} + +/* The GNU/Linux profiler clobbers the link register. Make sure the + prologue knows to save it. */ +#define PROFILE_HOOK(X) \ + emit_clobber (gen_rtx_REG (SImode, LR_REGNUM)) + +/* The GNU/Linux profiler needs a frame pointer. */ +#define SUBTARGET_FRAME_POINTER_REQUIRED crtl->profile + +/* Add .note.GNU-stack. */ +#undef NEED_INDICATE_EXEC_STACK +#define NEED_INDICATE_EXEC_STACK 1 diff --git a/gcc/config/arm/linux-gas.h b/gcc/config/arm/linux-gas.h new file mode 100644 index 000000000..9b6fcde2b --- /dev/null +++ b/gcc/config/arm/linux-gas.h @@ -0,0 +1,56 @@ +/* Definitions of target machine for GNU compiler. + ARM Linux-based GNU systems version. + Copyright (C) 1997, 1998, 1999, 2000, 2001, 2004, 2007 + Free Software Foundation, Inc. + Contributed by Russell King . + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3, or (at your + option) any later version. + + GCC is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public + License for more details. + + You should have received a copy of the GNU General Public License + along with GCC; see the file COPYING3. If not see + . */ + +/* This is how we tell the assembler that a symbol is weak. + GAS always supports weak symbols. */ + +/* Unsigned chars produces much better code than signed. */ +#define DEFAULT_SIGNED_CHAR 0 + +#undef SUBTARGET_CPP_SPEC +#define SUBTARGET_CPP_SPEC "%{posix:-D_POSIX_SOURCE} %{pthread:-D_REENTRANT}" + +#undef SIZE_TYPE +#define SIZE_TYPE "unsigned int" + +#undef PTRDIFF_TYPE +#define PTRDIFF_TYPE "int" + +/* Use the AAPCS type for wchar_t, or the previous Linux default for + non-AAPCS. */ +#undef WCHAR_TYPE +#define WCHAR_TYPE (TARGET_AAPCS_BASED ? "unsigned int" : "long int") + +#undef WCHAR_TYPE_SIZE +#define WCHAR_TYPE_SIZE BITS_PER_WORD + +/* Clear the instruction cache from `beg' to `end'. This makes an + inline system call to SYS_cacheflush. */ +#define CLEAR_INSN_CACHE(BEG, END) \ +{ \ + register unsigned long _beg __asm ("a1") = (unsigned long) (BEG); \ + register unsigned long _end __asm ("a2") = (unsigned long) (END); \ + register unsigned long _flg __asm ("a3") = 0; \ + __asm __volatile ("swi 0x9f0002 @ sys_cacheflush" \ + : "=r" (_beg) \ + : "0" (_beg), "r" (_end), "r" (_flg)); \ +} diff --git a/gcc/config/arm/mmintrin.h b/gcc/config/arm/mmintrin.h new file mode 100644 index 000000000..2cc500de3 --- /dev/null +++ b/gcc/config/arm/mmintrin.h @@ -0,0 +1,1254 @@ +/* Copyright (C) 2002, 2003, 2004, 2009 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3, or (at your + option) any later version. + + GCC is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public + License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +#ifndef _MMINTRIN_H_INCLUDED +#define _MMINTRIN_H_INCLUDED + +/* The data type intended for user use. */ +typedef unsigned long long __m64, __int64; + +/* Internal data types for implementing the intrinsics. */ +typedef int __v2si __attribute__ ((vector_size (8))); +typedef short __v4hi __attribute__ ((vector_size (8))); +typedef char __v8qi __attribute__ ((vector_size (8))); + +/* "Convert" __m64 and __int64 into each other. */ +static __inline __m64 +_mm_cvtsi64_m64 (__int64 __i) +{ + return __i; +} + +static __inline __int64 +_mm_cvtm64_si64 (__m64 __i) +{ + return __i; +} + +static __inline int +_mm_cvtsi64_si32 (__int64 __i) +{ + return __i; +} + +static __inline __int64 +_mm_cvtsi32_si64 (int __i) +{ + return __i; +} + +/* Pack the four 16-bit values from M1 into the lower four 8-bit values of + the result, and the four 16-bit values from M2 into the upper four 8-bit + values of the result, all with signed saturation. */ +static __inline __m64 +_mm_packs_pi16 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_arm_wpackhss ((__v4hi)__m1, (__v4hi)__m2); +} + +/* Pack the two 32-bit values from M1 in to the lower two 16-bit values of + the result, and the two 32-bit values from M2 into the upper two 16-bit + values of the result, all with signed saturation. */ +static __inline __m64 +_mm_packs_pi32 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_arm_wpackwss ((__v2si)__m1, (__v2si)__m2); +} + +/* Copy the 64-bit value from M1 into the lower 32-bits of the result, and + the 64-bit value from M2 into the upper 32-bits of the result, all with + signed saturation for values that do not fit exactly into 32-bits. */ +static __inline __m64 +_mm_packs_pi64 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_arm_wpackdss ((long long)__m1, (long long)__m2); +} + +/* Pack the four 16-bit values from M1 into the lower four 8-bit values of + the result, and the four 16-bit values from M2 into the upper four 8-bit + values of the result, all with unsigned saturation. */ +static __inline __m64 +_mm_packs_pu16 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_arm_wpackhus ((__v4hi)__m1, (__v4hi)__m2); +} + +/* Pack the two 32-bit values from M1 into the lower two 16-bit values of + the result, and the two 32-bit values from M2 into the upper two 16-bit + values of the result, all with unsigned saturation. */ +static __inline __m64 +_mm_packs_pu32 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_arm_wpackwus ((__v2si)__m1, (__v2si)__m2); +} + +/* Copy the 64-bit value from M1 into the lower 32-bits of the result, and + the 64-bit value from M2 into the upper 32-bits of the result, all with + unsigned saturation for values that do not fit exactly into 32-bits. */ +static __inline __m64 +_mm_packs_pu64 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_arm_wpackdus ((long long)__m1, (long long)__m2); +} + +/* Interleave the four 8-bit values from the high half of M1 with the four + 8-bit values from the high half of M2. */ +static __inline __m64 +_mm_unpackhi_pi8 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_arm_wunpckihb ((__v8qi)__m1, (__v8qi)__m2); +} + +/* Interleave the two 16-bit values from the high half of M1 with the two + 16-bit values from the high half of M2. */ +static __inline __m64 +_mm_unpackhi_pi16 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_arm_wunpckihh ((__v4hi)__m1, (__v4hi)__m2); +} + +/* Interleave the 32-bit value from the high half of M1 with the 32-bit + value from the high half of M2. */ +static __inline __m64 +_mm_unpackhi_pi32 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_arm_wunpckihw ((__v2si)__m1, (__v2si)__m2); +} + +/* Interleave the four 8-bit values from the low half of M1 with the four + 8-bit values from the low half of M2. */ +static __inline __m64 +_mm_unpacklo_pi8 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_arm_wunpckilb ((__v8qi)__m1, (__v8qi)__m2); +} + +/* Interleave the two 16-bit values from the low half of M1 with the two + 16-bit values from the low half of M2. */ +static __inline __m64 +_mm_unpacklo_pi16 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_arm_wunpckilh ((__v4hi)__m1, (__v4hi)__m2); +} + +/* Interleave the 32-bit value from the low half of M1 with the 32-bit + value from the low half of M2. */ +static __inline __m64 +_mm_unpacklo_pi32 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_arm_wunpckilw ((__v2si)__m1, (__v2si)__m2); +} + +/* Take the four 8-bit values from the low half of M1, sign extend them, + and return the result as a vector of four 16-bit quantities. */ +static __inline __m64 +_mm_unpackel_pi8 (__m64 __m1) +{ + return (__m64) __builtin_arm_wunpckelsb ((__v8qi)__m1); +} + +/* Take the two 16-bit values from the low half of M1, sign extend them, + and return the result as a vector of two 32-bit quantities. */ +static __inline __m64 +_mm_unpackel_pi16 (__m64 __m1) +{ + return (__m64) __builtin_arm_wunpckelsh ((__v4hi)__m1); +} + +/* Take the 32-bit value from the low half of M1, and return it sign extended + to 64 bits. */ +static __inline __m64 +_mm_unpackel_pi32 (__m64 __m1) +{ + return (__m64) __builtin_arm_wunpckelsw ((__v2si)__m1); +} + +/* Take the four 8-bit values from the high half of M1, sign extend them, + and return the result as a vector of four 16-bit quantities. */ +static __inline __m64 +_mm_unpackeh_pi8 (__m64 __m1) +{ + return (__m64) __builtin_arm_wunpckehsb ((__v8qi)__m1); +} + +/* Take the two 16-bit values from the high half of M1, sign extend them, + and return the result as a vector of two 32-bit quantities. */ +static __inline __m64 +_mm_unpackeh_pi16 (__m64 __m1) +{ + return (__m64) __builtin_arm_wunpckehsh ((__v4hi)__m1); +} + +/* Take the 32-bit value from the high half of M1, and return it sign extended + to 64 bits. */ +static __inline __m64 +_mm_unpackeh_pi32 (__m64 __m1) +{ + return (__m64) __builtin_arm_wunpckehsw ((__v2si)__m1); +} + +/* Take the four 8-bit values from the low half of M1, zero extend them, + and return the result as a vector of four 16-bit quantities. */ +static __inline __m64 +_mm_unpackel_pu8 (__m64 __m1) +{ + return (__m64) __builtin_arm_wunpckelub ((__v8qi)__m1); +} + +/* Take the two 16-bit values from the low half of M1, zero extend them, + and return the result as a vector of two 32-bit quantities. */ +static __inline __m64 +_mm_unpackel_pu16 (__m64 __m1) +{ + return (__m64) __builtin_arm_wunpckeluh ((__v4hi)__m1); +} + +/* Take the 32-bit value from the low half of M1, and return it zero extended + to 64 bits. */ +static __inline __m64 +_mm_unpackel_pu32 (__m64 __m1) +{ + return (__m64) __builtin_arm_wunpckeluw ((__v2si)__m1); +} + +/* Take the four 8-bit values from the high half of M1, zero extend them, + and return the result as a vector of four 16-bit quantities. */ +static __inline __m64 +_mm_unpackeh_pu8 (__m64 __m1) +{ + return (__m64) __builtin_arm_wunpckehub ((__v8qi)__m1); +} + +/* Take the two 16-bit values from the high half of M1, zero extend them, + and return the result as a vector of two 32-bit quantities. */ +static __inline __m64 +_mm_unpackeh_pu16 (__m64 __m1) +{ + return (__m64) __builtin_arm_wunpckehuh ((__v4hi)__m1); +} + +/* Take the 32-bit value from the high half of M1, and return it zero extended + to 64 bits. */ +static __inline __m64 +_mm_unpackeh_pu32 (__m64 __m1) +{ + return (__m64) __builtin_arm_wunpckehuw ((__v2si)__m1); +} + +/* Add the 8-bit values in M1 to the 8-bit values in M2. */ +static __inline __m64 +_mm_add_pi8 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_arm_waddb ((__v8qi)__m1, (__v8qi)__m2); +} + +/* Add the 16-bit values in M1 to the 16-bit values in M2. */ +static __inline __m64 +_mm_add_pi16 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_arm_waddh ((__v4hi)__m1, (__v4hi)__m2); +} + +/* Add the 32-bit values in M1 to the 32-bit values in M2. */ +static __inline __m64 +_mm_add_pi32 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_arm_waddw ((__v2si)__m1, (__v2si)__m2); +} + +/* Add the 8-bit values in M1 to the 8-bit values in M2 using signed + saturated arithmetic. */ +static __inline __m64 +_mm_adds_pi8 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_arm_waddbss ((__v8qi)__m1, (__v8qi)__m2); +} + +/* Add the 16-bit values in M1 to the 16-bit values in M2 using signed + saturated arithmetic. */ +static __inline __m64 +_mm_adds_pi16 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_arm_waddhss ((__v4hi)__m1, (__v4hi)__m2); +} + +/* Add the 32-bit values in M1 to the 32-bit values in M2 using signed + saturated arithmetic. */ +static __inline __m64 +_mm_adds_pi32 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_arm_waddwss ((__v2si)__m1, (__v2si)__m2); +} + +/* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned + saturated arithmetic. */ +static __inline __m64 +_mm_adds_pu8 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_arm_waddbus ((__v8qi)__m1, (__v8qi)__m2); +} + +/* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned + saturated arithmetic. */ +static __inline __m64 +_mm_adds_pu16 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_arm_waddhus ((__v4hi)__m1, (__v4hi)__m2); +} + +/* Add the 32-bit values in M1 to the 32-bit values in M2 using unsigned + saturated arithmetic. */ +static __inline __m64 +_mm_adds_pu32 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_arm_waddwus ((__v2si)__m1, (__v2si)__m2); +} + +/* Subtract the 8-bit values in M2 from the 8-bit values in M1. */ +static __inline __m64 +_mm_sub_pi8 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_arm_wsubb ((__v8qi)__m1, (__v8qi)__m2); +} + +/* Subtract the 16-bit values in M2 from the 16-bit values in M1. */ +static __inline __m64 +_mm_sub_pi16 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_arm_wsubh ((__v4hi)__m1, (__v4hi)__m2); +} + +/* Subtract the 32-bit values in M2 from the 32-bit values in M1. */ +static __inline __m64 +_mm_sub_pi32 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_arm_wsubw ((__v2si)__m1, (__v2si)__m2); +} + +/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed + saturating arithmetic. */ +static __inline __m64 +_mm_subs_pi8 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_arm_wsubbss ((__v8qi)__m1, (__v8qi)__m2); +} + +/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using + signed saturating arithmetic. */ +static __inline __m64 +_mm_subs_pi16 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_arm_wsubhss ((__v4hi)__m1, (__v4hi)__m2); +} + +/* Subtract the 32-bit values in M2 from the 32-bit values in M1 using + signed saturating arithmetic. */ +static __inline __m64 +_mm_subs_pi32 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_arm_wsubwss ((__v2si)__m1, (__v2si)__m2); +} + +/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using + unsigned saturating arithmetic. */ +static __inline __m64 +_mm_subs_pu8 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_arm_wsubbus ((__v8qi)__m1, (__v8qi)__m2); +} + +/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using + unsigned saturating arithmetic. */ +static __inline __m64 +_mm_subs_pu16 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_arm_wsubhus ((__v4hi)__m1, (__v4hi)__m2); +} + +/* Subtract the 32-bit values in M2 from the 32-bit values in M1 using + unsigned saturating arithmetic. */ +static __inline __m64 +_mm_subs_pu32 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_arm_wsubwus ((__v2si)__m1, (__v2si)__m2); +} + +/* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing + four 32-bit intermediate results, which are then summed by pairs to + produce two 32-bit results. */ +static __inline __m64 +_mm_madd_pi16 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_arm_wmadds ((__v4hi)__m1, (__v4hi)__m2); +} + +/* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing + four 32-bit intermediate results, which are then summed by pairs to + produce two 32-bit results. */ +static __inline __m64 +_mm_madd_pu16 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_arm_wmaddu ((__v4hi)__m1, (__v4hi)__m2); +} + +/* Multiply four signed 16-bit values in M1 by four signed 16-bit values in + M2 and produce the high 16 bits of the 32-bit results. */ +static __inline __m64 +_mm_mulhi_pi16 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_arm_wmulsm ((__v4hi)__m1, (__v4hi)__m2); +} + +/* Multiply four signed 16-bit values in M1 by four signed 16-bit values in + M2 and produce the high 16 bits of the 32-bit results. */ +static __inline __m64 +_mm_mulhi_pu16 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_arm_wmulum ((__v4hi)__m1, (__v4hi)__m2); +} + +/* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce + the low 16 bits of the results. */ +static __inline __m64 +_mm_mullo_pi16 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_arm_wmulul ((__v4hi)__m1, (__v4hi)__m2); +} + +/* Shift four 16-bit values in M left by COUNT. */ +static __inline __m64 +_mm_sll_pi16 (__m64 __m, __m64 __count) +{ + return (__m64) __builtin_arm_wsllh ((__v4hi)__m, __count); +} + +static __inline __m64 +_mm_slli_pi16 (__m64 __m, int __count) +{ + return (__m64) __builtin_arm_wsllhi ((__v4hi)__m, __count); +} + +/* Shift two 32-bit values in M left by COUNT. */ +static __inline __m64 +_mm_sll_pi32 (__m64 __m, __m64 __count) +{ + return (__m64) __builtin_arm_wsllw ((__v2si)__m, __count); +} + +static __inline __m64 +_mm_slli_pi32 (__m64 __m, int __count) +{ + return (__m64) __builtin_arm_wsllwi ((__v2si)__m, __count); +} + +/* Shift the 64-bit value in M left by COUNT. */ +static __inline __m64 +_mm_sll_si64 (__m64 __m, __m64 __count) +{ + return (__m64) __builtin_arm_wslld (__m, __count); +} + +static __inline __m64 +_mm_slli_si64 (__m64 __m, int __count) +{ + return (__m64) __builtin_arm_wslldi (__m, __count); +} + +/* Shift four 16-bit values in M right by COUNT; shift in the sign bit. */ +static __inline __m64 +_mm_sra_pi16 (__m64 __m, __m64 __count) +{ + return (__m64) __builtin_arm_wsrah ((__v4hi)__m, __count); +} + +static __inline __m64 +_mm_srai_pi16 (__m64 __m, int __count) +{ + return (__m64) __builtin_arm_wsrahi ((__v4hi)__m, __count); +} + +/* Shift two 32-bit values in M right by COUNT; shift in the sign bit. */ +static __inline __m64 +_mm_sra_pi32 (__m64 __m, __m64 __count) +{ + return (__m64) __builtin_arm_wsraw ((__v2si)__m, __count); +} + +static __inline __m64 +_mm_srai_pi32 (__m64 __m, int __count) +{ + return (__m64) __builtin_arm_wsrawi ((__v2si)__m, __count); +} + +/* Shift the 64-bit value in M right by COUNT; shift in the sign bit. */ +static __inline __m64 +_mm_sra_si64 (__m64 __m, __m64 __count) +{ + return (__m64) __builtin_arm_wsrad (__m, __count); +} + +static __inline __m64 +_mm_srai_si64 (__m64 __m, int __count) +{ + return (__m64) __builtin_arm_wsradi (__m, __count); +} + +/* Shift four 16-bit values in M right by COUNT; shift in zeros. */ +static __inline __m64 +_mm_srl_pi16 (__m64 __m, __m64 __count) +{ + return (__m64) __builtin_arm_wsrlh ((__v4hi)__m, __count); +} + +static __inline __m64 +_mm_srli_pi16 (__m64 __m, int __count) +{ + return (__m64) __builtin_arm_wsrlhi ((__v4hi)__m, __count); +} + +/* Shift two 32-bit values in M right by COUNT; shift in zeros. */ +static __inline __m64 +_mm_srl_pi32 (__m64 __m, __m64 __count) +{ + return (__m64) __builtin_arm_wsrlw ((__v2si)__m, __count); +} + +static __inline __m64 +_mm_srli_pi32 (__m64 __m, int __count) +{ + return (__m64) __builtin_arm_wsrlwi ((__v2si)__m, __count); +} + +/* Shift the 64-bit value in M left by COUNT; shift in zeros. */ +static __inline __m64 +_mm_srl_si64 (__m64 __m, __m64 __count) +{ + return (__m64) __builtin_arm_wsrld (__m, __count); +} + +static __inline __m64 +_mm_srli_si64 (__m64 __m, int __count) +{ + return (__m64) __builtin_arm_wsrldi (__m, __count); +} + +/* Rotate four 16-bit values in M right by COUNT. */ +static __inline __m64 +_mm_ror_pi16 (__m64 __m, __m64 __count) +{ + return (__m64) __builtin_arm_wrorh ((__v4hi)__m, __count); +} + +static __inline __m64 +_mm_rori_pi16 (__m64 __m, int __count) +{ + return (__m64) __builtin_arm_wrorhi ((__v4hi)__m, __count); +} + +/* Rotate two 32-bit values in M right by COUNT. */ +static __inline __m64 +_mm_ror_pi32 (__m64 __m, __m64 __count) +{ + return (__m64) __builtin_arm_wrorw ((__v2si)__m, __count); +} + +static __inline __m64 +_mm_rori_pi32 (__m64 __m, int __count) +{ + return (__m64) __builtin_arm_wrorwi ((__v2si)__m, __count); +} + +/* Rotate two 64-bit values in M right by COUNT. */ +static __inline __m64 +_mm_ror_si64 (__m64 __m, __m64 __count) +{ + return (__m64) __builtin_arm_wrord (__m, __count); +} + +static __inline __m64 +_mm_rori_si64 (__m64 __m, int __count) +{ + return (__m64) __builtin_arm_wrordi (__m, __count); +} + +/* Bit-wise AND the 64-bit values in M1 and M2. */ +static __inline __m64 +_mm_and_si64 (__m64 __m1, __m64 __m2) +{ + return __builtin_arm_wand (__m1, __m2); +} + +/* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the + 64-bit value in M2. */ +static __inline __m64 +_mm_andnot_si64 (__m64 __m1, __m64 __m2) +{ + return __builtin_arm_wandn (__m1, __m2); +} + +/* Bit-wise inclusive OR the 64-bit values in M1 and M2. */ +static __inline __m64 +_mm_or_si64 (__m64 __m1, __m64 __m2) +{ + return __builtin_arm_wor (__m1, __m2); +} + +/* Bit-wise exclusive OR the 64-bit values in M1 and M2. */ +static __inline __m64 +_mm_xor_si64 (__m64 __m1, __m64 __m2) +{ + return __builtin_arm_wxor (__m1, __m2); +} + +/* Compare eight 8-bit values. The result of the comparison is 0xFF if the + test is true and zero if false. */ +static __inline __m64 +_mm_cmpeq_pi8 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_arm_wcmpeqb ((__v8qi)__m1, (__v8qi)__m2); +} + +static __inline __m64 +_mm_cmpgt_pi8 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_arm_wcmpgtsb ((__v8qi)__m1, (__v8qi)__m2); +} + +static __inline __m64 +_mm_cmpgt_pu8 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_arm_wcmpgtub ((__v8qi)__m1, (__v8qi)__m2); +} + +/* Compare four 16-bit values. The result of the comparison is 0xFFFF if + the test is true and zero if false. */ +static __inline __m64 +_mm_cmpeq_pi16 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_arm_wcmpeqh ((__v4hi)__m1, (__v4hi)__m2); +} + +static __inline __m64 +_mm_cmpgt_pi16 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_arm_wcmpgtsh ((__v4hi)__m1, (__v4hi)__m2); +} + +static __inline __m64 +_mm_cmpgt_pu16 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_arm_wcmpgtuh ((__v4hi)__m1, (__v4hi)__m2); +} + +/* Compare two 32-bit values. The result of the comparison is 0xFFFFFFFF if + the test is true and zero if false. */ +static __inline __m64 +_mm_cmpeq_pi32 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_arm_wcmpeqw ((__v2si)__m1, (__v2si)__m2); +} + +static __inline __m64 +_mm_cmpgt_pi32 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_arm_wcmpgtsw ((__v2si)__m1, (__v2si)__m2); +} + +static __inline __m64 +_mm_cmpgt_pu32 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_arm_wcmpgtuw ((__v2si)__m1, (__v2si)__m2); +} + +/* Element-wise multiplication of unsigned 16-bit values __B and __C, followed + by accumulate across all elements and __A. */ +static __inline __m64 +_mm_mac_pu16 (__m64 __A, __m64 __B, __m64 __C) +{ + return __builtin_arm_wmacu (__A, (__v4hi)__B, (__v4hi)__C); +} + +/* Element-wise multiplication of signed 16-bit values __B and __C, followed + by accumulate across all elements and __A. */ +static __inline __m64 +_mm_mac_pi16 (__m64 __A, __m64 __B, __m64 __C) +{ + return __builtin_arm_wmacs (__A, (__v4hi)__B, (__v4hi)__C); +} + +/* Element-wise multiplication of unsigned 16-bit values __B and __C, followed + by accumulate across all elements. */ +static __inline __m64 +_mm_macz_pu16 (__m64 __A, __m64 __B) +{ + return __builtin_arm_wmacuz ((__v4hi)__A, (__v4hi)__B); +} + +/* Element-wise multiplication of signed 16-bit values __B and __C, followed + by accumulate across all elements. */ +static __inline __m64 +_mm_macz_pi16 (__m64 __A, __m64 __B) +{ + return __builtin_arm_wmacsz ((__v4hi)__A, (__v4hi)__B); +} + +/* Accumulate across all unsigned 8-bit values in __A. */ +static __inline __m64 +_mm_acc_pu8 (__m64 __A) +{ + return __builtin_arm_waccb ((__v8qi)__A); +} + +/* Accumulate across all unsigned 16-bit values in __A. */ +static __inline __m64 +_mm_acc_pu16 (__m64 __A) +{ + return __builtin_arm_wacch ((__v4hi)__A); +} + +/* Accumulate across all unsigned 32-bit values in __A. */ +static __inline __m64 +_mm_acc_pu32 (__m64 __A) +{ + return __builtin_arm_waccw ((__v2si)__A); +} + +static __inline __m64 +_mm_mia_si64 (__m64 __A, int __B, int __C) +{ + return __builtin_arm_tmia (__A, __B, __C); +} + +static __inline __m64 +_mm_miaph_si64 (__m64 __A, int __B, int __C) +{ + return __builtin_arm_tmiaph (__A, __B, __C); +} + +static __inline __m64 +_mm_miabb_si64 (__m64 __A, int __B, int __C) +{ + return __builtin_arm_tmiabb (__A, __B, __C); +} + +static __inline __m64 +_mm_miabt_si64 (__m64 __A, int __B, int __C) +{ + return __builtin_arm_tmiabt (__A, __B, __C); +} + +static __inline __m64 +_mm_miatb_si64 (__m64 __A, int __B, int __C) +{ + return __builtin_arm_tmiatb (__A, __B, __C); +} + +static __inline __m64 +_mm_miatt_si64 (__m64 __A, int __B, int __C) +{ + return __builtin_arm_tmiatt (__A, __B, __C); +} + +/* Extract one of the elements of A and sign extend. The selector N must + be immediate. */ +#define _mm_extract_pi8(A, N) __builtin_arm_textrmsb ((__v8qi)(A), (N)) +#define _mm_extract_pi16(A, N) __builtin_arm_textrmsh ((__v4hi)(A), (N)) +#define _mm_extract_pi32(A, N) __builtin_arm_textrmsw ((__v2si)(A), (N)) + +/* Extract one of the elements of A and zero extend. The selector N must + be immediate. */ +#define _mm_extract_pu8(A, N) __builtin_arm_textrmub ((__v8qi)(A), (N)) +#define _mm_extract_pu16(A, N) __builtin_arm_textrmuh ((__v4hi)(A), (N)) +#define _mm_extract_pu32(A, N) __builtin_arm_textrmuw ((__v2si)(A), (N)) + +/* Inserts word D into one of the elements of A. The selector N must be + immediate. */ +#define _mm_insert_pi8(A, D, N) \ + ((__m64) __builtin_arm_tinsrb ((__v8qi)(A), (D), (N))) +#define _mm_insert_pi16(A, D, N) \ + ((__m64) __builtin_arm_tinsrh ((__v4hi)(A), (D), (N))) +#define _mm_insert_pi32(A, D, N) \ + ((__m64) __builtin_arm_tinsrw ((__v2si)(A), (D), (N))) + +/* Compute the element-wise maximum of signed 8-bit values. */ +static __inline __m64 +_mm_max_pi8 (__m64 __A, __m64 __B) +{ + return (__m64) __builtin_arm_wmaxsb ((__v8qi)__A, (__v8qi)__B); +} + +/* Compute the element-wise maximum of signed 16-bit values. */ +static __inline __m64 +_mm_max_pi16 (__m64 __A, __m64 __B) +{ + return (__m64) __builtin_arm_wmaxsh ((__v4hi)__A, (__v4hi)__B); +} + +/* Compute the element-wise maximum of signed 32-bit values. */ +static __inline __m64 +_mm_max_pi32 (__m64 __A, __m64 __B) +{ + return (__m64) __builtin_arm_wmaxsw ((__v2si)__A, (__v2si)__B); +} + +/* Compute the element-wise maximum of unsigned 8-bit values. */ +static __inline __m64 +_mm_max_pu8 (__m64 __A, __m64 __B) +{ + return (__m64) __builtin_arm_wmaxub ((__v8qi)__A, (__v8qi)__B); +} + +/* Compute the element-wise maximum of unsigned 16-bit values. */ +static __inline __m64 +_mm_max_pu16 (__m64 __A, __m64 __B) +{ + return (__m64) __builtin_arm_wmaxuh ((__v4hi)__A, (__v4hi)__B); +} + +/* Compute the element-wise maximum of unsigned 32-bit values. */ +static __inline __m64 +_mm_max_pu32 (__m64 __A, __m64 __B) +{ + return (__m64) __builtin_arm_wmaxuw ((__v2si)__A, (__v2si)__B); +} + +/* Compute the element-wise minimum of signed 16-bit values. */ +static __inline __m64 +_mm_min_pi8 (__m64 __A, __m64 __B) +{ + return (__m64) __builtin_arm_wminsb ((__v8qi)__A, (__v8qi)__B); +} + +/* Compute the element-wise minimum of signed 16-bit values. */ +static __inline __m64 +_mm_min_pi16 (__m64 __A, __m64 __B) +{ + return (__m64) __builtin_arm_wminsh ((__v4hi)__A, (__v4hi)__B); +} + +/* Compute the element-wise minimum of signed 32-bit values. */ +static __inline __m64 +_mm_min_pi32 (__m64 __A, __m64 __B) +{ + return (__m64) __builtin_arm_wminsw ((__v2si)__A, (__v2si)__B); +} + +/* Compute the element-wise minimum of unsigned 16-bit values. */ +static __inline __m64 +_mm_min_pu8 (__m64 __A, __m64 __B) +{ + return (__m64) __builtin_arm_wminub ((__v8qi)__A, (__v8qi)__B); +} + +/* Compute the element-wise minimum of unsigned 16-bit values. */ +static __inline __m64 +_mm_min_pu16 (__m64 __A, __m64 __B) +{ + return (__m64) __builtin_arm_wminuh ((__v4hi)__A, (__v4hi)__B); +} + +/* Compute the element-wise minimum of unsigned 32-bit values. */ +static __inline __m64 +_mm_min_pu32 (__m64 __A, __m64 __B) +{ + return (__m64) __builtin_arm_wminuw ((__v2si)__A, (__v2si)__B); +} + +/* Create an 8-bit mask of the signs of 8-bit values. */ +static __inline int +_mm_movemask_pi8 (__m64 __A) +{ + return __builtin_arm_tmovmskb ((__v8qi)__A); +} + +/* Create an 8-bit mask of the signs of 16-bit values. */ +static __inline int +_mm_movemask_pi16 (__m64 __A) +{ + return __builtin_arm_tmovmskh ((__v4hi)__A); +} + +/* Create an 8-bit mask of the signs of 32-bit values. */ +static __inline int +_mm_movemask_pi32 (__m64 __A) +{ + return __builtin_arm_tmovmskw ((__v2si)__A); +} + +/* Return a combination of the four 16-bit values in A. The selector + must be an immediate. */ +#define _mm_shuffle_pi16(A, N) \ + ((__m64) __builtin_arm_wshufh ((__v4hi)(A), (N))) + + +/* Compute the rounded averages of the unsigned 8-bit values in A and B. */ +static __inline __m64 +_mm_avg_pu8 (__m64 __A, __m64 __B) +{ + return (__m64) __builtin_arm_wavg2br ((__v8qi)__A, (__v8qi)__B); +} + +/* Compute the rounded averages of the unsigned 16-bit values in A and B. */ +static __inline __m64 +_mm_avg_pu16 (__m64 __A, __m64 __B) +{ + return (__m64) __builtin_arm_wavg2hr ((__v4hi)__A, (__v4hi)__B); +} + +/* Compute the averages of the unsigned 8-bit values in A and B. */ +static __inline __m64 +_mm_avg2_pu8 (__m64 __A, __m64 __B) +{ + return (__m64) __builtin_arm_wavg2b ((__v8qi)__A, (__v8qi)__B); +} + +/* Compute the averages of the unsigned 16-bit values in A and B. */ +static __inline __m64 +_mm_avg2_pu16 (__m64 __A, __m64 __B) +{ + return (__m64) __builtin_arm_wavg2h ((__v4hi)__A, (__v4hi)__B); +} + +/* Compute the sum of the absolute differences of the unsigned 8-bit + values in A and B. Return the value in the lower 16-bit word; the + upper words are cleared. */ +static __inline __m64 +_mm_sad_pu8 (__m64 __A, __m64 __B) +{ + return (__m64) __builtin_arm_wsadb ((__v8qi)__A, (__v8qi)__B); +} + +/* Compute the sum of the absolute differences of the unsigned 16-bit + values in A and B. Return the value in the lower 32-bit word; the + upper words are cleared. */ +static __inline __m64 +_mm_sad_pu16 (__m64 __A, __m64 __B) +{ + return (__m64) __builtin_arm_wsadh ((__v4hi)__A, (__v4hi)__B); +} + +/* Compute the sum of the absolute differences of the unsigned 8-bit + values in A and B. Return the value in the lower 16-bit word; the + upper words are cleared. */ +static __inline __m64 +_mm_sadz_pu8 (__m64 __A, __m64 __B) +{ + return (__m64) __builtin_arm_wsadbz ((__v8qi)__A, (__v8qi)__B); +} + +/* Compute the sum of the absolute differences of the unsigned 16-bit + values in A and B. Return the value in the lower 32-bit word; the + upper words are cleared. */ +static __inline __m64 +_mm_sadz_pu16 (__m64 __A, __m64 __B) +{ + return (__m64) __builtin_arm_wsadhz ((__v4hi)__A, (__v4hi)__B); +} + +static __inline __m64 +_mm_align_si64 (__m64 __A, __m64 __B, int __C) +{ + return (__m64) __builtin_arm_walign ((__v8qi)__A, (__v8qi)__B, __C); +} + +/* Creates a 64-bit zero. */ +static __inline __m64 +_mm_setzero_si64 (void) +{ + return __builtin_arm_wzero (); +} + +/* Set and Get arbitrary iWMMXt Control registers. + Note only registers 0-3 and 8-11 are currently defined, + the rest are reserved. */ + +static __inline void +_mm_setwcx (const int __value, const int __regno) +{ + switch (__regno) + { + case 0: __builtin_arm_setwcx (__value, 0); break; + case 1: __builtin_arm_setwcx (__value, 1); break; + case 2: __builtin_arm_setwcx (__value, 2); break; + case 3: __builtin_arm_setwcx (__value, 3); break; + case 8: __builtin_arm_setwcx (__value, 8); break; + case 9: __builtin_arm_setwcx (__value, 9); break; + case 10: __builtin_arm_setwcx (__value, 10); break; + case 11: __builtin_arm_setwcx (__value, 11); break; + default: break; + } +} + +static __inline int +_mm_getwcx (const int __regno) +{ + switch (__regno) + { + case 0: return __builtin_arm_getwcx (0); + case 1: return __builtin_arm_getwcx (1); + case 2: return __builtin_arm_getwcx (2); + case 3: return __builtin_arm_getwcx (3); + case 8: return __builtin_arm_getwcx (8); + case 9: return __builtin_arm_getwcx (9); + case 10: return __builtin_arm_getwcx (10); + case 11: return __builtin_arm_getwcx (11); + default: return 0; + } +} + +/* Creates a vector of two 32-bit values; I0 is least significant. */ +static __inline __m64 +_mm_set_pi32 (int __i1, int __i0) +{ + union { + __m64 __q; + struct { + unsigned int __i0; + unsigned int __i1; + } __s; + } __u; + + __u.__s.__i0 = __i0; + __u.__s.__i1 = __i1; + + return __u.__q; +} + +/* Creates a vector of four 16-bit values; W0 is least significant. */ +static __inline __m64 +_mm_set_pi16 (short __w3, short __w2, short __w1, short __w0) +{ + unsigned int __i1 = (unsigned short)__w3 << 16 | (unsigned short)__w2; + unsigned int __i0 = (unsigned short)__w1 << 16 | (unsigned short)__w0; + return _mm_set_pi32 (__i1, __i0); + +} + +/* Creates a vector of eight 8-bit values; B0 is least significant. */ +static __inline __m64 +_mm_set_pi8 (char __b7, char __b6, char __b5, char __b4, + char __b3, char __b2, char __b1, char __b0) +{ + unsigned int __i1, __i0; + + __i1 = (unsigned char)__b7; + __i1 = __i1 << 8 | (unsigned char)__b6; + __i1 = __i1 << 8 | (unsigned char)__b5; + __i1 = __i1 << 8 | (unsigned char)__b4; + + __i0 = (unsigned char)__b3; + __i0 = __i0 << 8 | (unsigned char)__b2; + __i0 = __i0 << 8 | (unsigned char)__b1; + __i0 = __i0 << 8 | (unsigned char)__b0; + + return _mm_set_pi32 (__i1, __i0); +} + +/* Similar, but with the arguments in reverse order. */ +static __inline __m64 +_mm_setr_pi32 (int __i0, int __i1) +{ + return _mm_set_pi32 (__i1, __i0); +} + +static __inline __m64 +_mm_setr_pi16 (short __w0, short __w1, short __w2, short __w3) +{ + return _mm_set_pi16 (__w3, __w2, __w1, __w0); +} + +static __inline __m64 +_mm_setr_pi8 (char __b0, char __b1, char __b2, char __b3, + char __b4, char __b5, char __b6, char __b7) +{ + return _mm_set_pi8 (__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0); +} + +/* Creates a vector of two 32-bit values, both elements containing I. */ +static __inline __m64 +_mm_set1_pi32 (int __i) +{ + return _mm_set_pi32 (__i, __i); +} + +/* Creates a vector of four 16-bit values, all elements containing W. */ +static __inline __m64 +_mm_set1_pi16 (short __w) +{ + unsigned int __i = (unsigned short)__w << 16 | (unsigned short)__w; + return _mm_set1_pi32 (__i); +} + +/* Creates a vector of four 16-bit values, all elements containing B. */ +static __inline __m64 +_mm_set1_pi8 (char __b) +{ + unsigned int __w = (unsigned char)__b << 8 | (unsigned char)__b; + unsigned int __i = __w << 16 | __w; + return _mm_set1_pi32 (__i); +} + +/* Convert an integer to a __m64 object. */ +static __inline __m64 +_m_from_int (int __a) +{ + return (__m64)__a; +} + +#define _m_packsswb _mm_packs_pi16 +#define _m_packssdw _mm_packs_pi32 +#define _m_packuswb _mm_packs_pu16 +#define _m_packusdw _mm_packs_pu32 +#define _m_packssqd _mm_packs_pi64 +#define _m_packusqd _mm_packs_pu64 +#define _mm_packs_si64 _mm_packs_pi64 +#define _mm_packs_su64 _mm_packs_pu64 +#define _m_punpckhbw _mm_unpackhi_pi8 +#define _m_punpckhwd _mm_unpackhi_pi16 +#define _m_punpckhdq _mm_unpackhi_pi32 +#define _m_punpcklbw _mm_unpacklo_pi8 +#define _m_punpcklwd _mm_unpacklo_pi16 +#define _m_punpckldq _mm_unpacklo_pi32 +#define _m_punpckehsbw _mm_unpackeh_pi8 +#define _m_punpckehswd _mm_unpackeh_pi16 +#define _m_punpckehsdq _mm_unpackeh_pi32 +#define _m_punpckehubw _mm_unpackeh_pu8 +#define _m_punpckehuwd _mm_unpackeh_pu16 +#define _m_punpckehudq _mm_unpackeh_pu32 +#define _m_punpckelsbw _mm_unpackel_pi8 +#define _m_punpckelswd _mm_unpackel_pi16 +#define _m_punpckelsdq _mm_unpackel_pi32 +#define _m_punpckelubw _mm_unpackel_pu8 +#define _m_punpckeluwd _mm_unpackel_pu16 +#define _m_punpckeludq _mm_unpackel_pu32 +#define _m_paddb _mm_add_pi8 +#define _m_paddw _mm_add_pi16 +#define _m_paddd _mm_add_pi32 +#define _m_paddsb _mm_adds_pi8 +#define _m_paddsw _mm_adds_pi16 +#define _m_paddsd _mm_adds_pi32 +#define _m_paddusb _mm_adds_pu8 +#define _m_paddusw _mm_adds_pu16 +#define _m_paddusd _mm_adds_pu32 +#define _m_psubb _mm_sub_pi8 +#define _m_psubw _mm_sub_pi16 +#define _m_psubd _mm_sub_pi32 +#define _m_psubsb _mm_subs_pi8 +#define _m_psubsw _mm_subs_pi16 +#define _m_psubuw _mm_subs_pi32 +#define _m_psubusb _mm_subs_pu8 +#define _m_psubusw _mm_subs_pu16 +#define _m_psubusd _mm_subs_pu32 +#define _m_pmaddwd _mm_madd_pi16 +#define _m_pmadduwd _mm_madd_pu16 +#define _m_pmulhw _mm_mulhi_pi16 +#define _m_pmulhuw _mm_mulhi_pu16 +#define _m_pmullw _mm_mullo_pi16 +#define _m_pmacsw _mm_mac_pi16 +#define _m_pmacuw _mm_mac_pu16 +#define _m_pmacszw _mm_macz_pi16 +#define _m_pmacuzw _mm_macz_pu16 +#define _m_paccb _mm_acc_pu8 +#define _m_paccw _mm_acc_pu16 +#define _m_paccd _mm_acc_pu32 +#define _m_pmia _mm_mia_si64 +#define _m_pmiaph _mm_miaph_si64 +#define _m_pmiabb _mm_miabb_si64 +#define _m_pmiabt _mm_miabt_si64 +#define _m_pmiatb _mm_miatb_si64 +#define _m_pmiatt _mm_miatt_si64 +#define _m_psllw _mm_sll_pi16 +#define _m_psllwi _mm_slli_pi16 +#define _m_pslld _mm_sll_pi32 +#define _m_pslldi _mm_slli_pi32 +#define _m_psllq _mm_sll_si64 +#define _m_psllqi _mm_slli_si64 +#define _m_psraw _mm_sra_pi16 +#define _m_psrawi _mm_srai_pi16 +#define _m_psrad _mm_sra_pi32 +#define _m_psradi _mm_srai_pi32 +#define _m_psraq _mm_sra_si64 +#define _m_psraqi _mm_srai_si64 +#define _m_psrlw _mm_srl_pi16 +#define _m_psrlwi _mm_srli_pi16 +#define _m_psrld _mm_srl_pi32 +#define _m_psrldi _mm_srli_pi32 +#define _m_psrlq _mm_srl_si64 +#define _m_psrlqi _mm_srli_si64 +#define _m_prorw _mm_ror_pi16 +#define _m_prorwi _mm_rori_pi16 +#define _m_prord _mm_ror_pi32 +#define _m_prordi _mm_rori_pi32 +#define _m_prorq _mm_ror_si64 +#define _m_prorqi _mm_rori_si64 +#define _m_pand _mm_and_si64 +#define _m_pandn _mm_andnot_si64 +#define _m_por _mm_or_si64 +#define _m_pxor _mm_xor_si64 +#define _m_pcmpeqb _mm_cmpeq_pi8 +#define _m_pcmpeqw _mm_cmpeq_pi16 +#define _m_pcmpeqd _mm_cmpeq_pi32 +#define _m_pcmpgtb _mm_cmpgt_pi8 +#define _m_pcmpgtub _mm_cmpgt_pu8 +#define _m_pcmpgtw _mm_cmpgt_pi16 +#define _m_pcmpgtuw _mm_cmpgt_pu16 +#define _m_pcmpgtd _mm_cmpgt_pi32 +#define _m_pcmpgtud _mm_cmpgt_pu32 +#define _m_pextrb _mm_extract_pi8 +#define _m_pextrw _mm_extract_pi16 +#define _m_pextrd _mm_extract_pi32 +#define _m_pextrub _mm_extract_pu8 +#define _m_pextruw _mm_extract_pu16 +#define _m_pextrud _mm_extract_pu32 +#define _m_pinsrb _mm_insert_pi8 +#define _m_pinsrw _mm_insert_pi16 +#define _m_pinsrd _mm_insert_pi32 +#define _m_pmaxsb _mm_max_pi8 +#define _m_pmaxsw _mm_max_pi16 +#define _m_pmaxsd _mm_max_pi32 +#define _m_pmaxub _mm_max_pu8 +#define _m_pmaxuw _mm_max_pu16 +#define _m_pmaxud _mm_max_pu32 +#define _m_pminsb _mm_min_pi8 +#define _m_pminsw _mm_min_pi16 +#define _m_pminsd _mm_min_pi32 +#define _m_pminub _mm_min_pu8 +#define _m_pminuw _mm_min_pu16 +#define _m_pminud _mm_min_pu32 +#define _m_pmovmskb _mm_movemask_pi8 +#define _m_pmovmskw _mm_movemask_pi16 +#define _m_pmovmskd _mm_movemask_pi32 +#define _m_pshufw _mm_shuffle_pi16 +#define _m_pavgb _mm_avg_pu8 +#define _m_pavgw _mm_avg_pu16 +#define _m_pavg2b _mm_avg2_pu8 +#define _m_pavg2w _mm_avg2_pu16 +#define _m_psadbw _mm_sad_pu8 +#define _m_psadwd _mm_sad_pu16 +#define _m_psadzbw _mm_sadz_pu8 +#define _m_psadzwd _mm_sadz_pu16 +#define _m_paligniq _mm_align_si64 +#define _m_cvt_si2pi _mm_cvtsi64_m64 +#define _m_cvt_pi2si _mm_cvtm64_si64 + +#endif /* _MMINTRIN_H_INCLUDED */ diff --git a/gcc/config/arm/neon-docgen.ml b/gcc/config/arm/neon-docgen.ml new file mode 100644 index 000000000..23e37b498 --- /dev/null +++ b/gcc/config/arm/neon-docgen.ml @@ -0,0 +1,337 @@ +(* ARM NEON documentation generator. + + Copyright (C) 2006, 2007 Free Software Foundation, Inc. + Contributed by CodeSourcery. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it under + the terms of the GNU General Public License as published by the Free + Software Foundation; either version 3, or (at your option) any later + version. + + GCC is distributed in the hope that it will be useful, but WITHOUT ANY + WARRANTY; without even the implied warranty of MERCHANTABILITY or + FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + for more details. + + You should have received a copy of the GNU General Public License + along with GCC; see the file COPYING3. If not see + . + + This is an O'Caml program. The O'Caml compiler is available from: + + http://caml.inria.fr/ + + Or from your favourite OS's friendly packaging system. Tested with version + 3.09.2, though other versions will probably work too. + + Compile with: + ocamlc -c neon.ml + ocamlc -o neon-docgen neon.cmo neon-docgen.ml + + Run with: + /path/to/neon-docgen /path/to/gcc/doc/arm-neon-intrinsics.texi +*) + +open Neon + +(* The combined "ops" and "reinterp" table. *) +let ops_reinterp = reinterp @ ops + +(* Helper functions for extracting things from the "ops" table. *) +let single_opcode desired_opcode () = + List.fold_left (fun got_so_far -> + fun row -> + match row with + (opcode, _, _, _, _, _) -> + if opcode = desired_opcode then row :: got_so_far + else got_so_far + ) [] ops_reinterp + +let multiple_opcodes desired_opcodes () = + List.fold_left (fun got_so_far -> + fun desired_opcode -> + (single_opcode desired_opcode ()) @ got_so_far) + [] desired_opcodes + +let ldx_opcode number () = + List.fold_left (fun got_so_far -> + fun row -> + match row with + (opcode, _, _, _, _, _) -> + match opcode with + Vldx n | Vldx_lane n | Vldx_dup n when n = number -> + row :: got_so_far + | _ -> got_so_far + ) [] ops_reinterp + +let stx_opcode number () = + List.fold_left (fun got_so_far -> + fun row -> + match row with + (opcode, _, _, _, _, _) -> + match opcode with + Vstx n | Vstx_lane n when n = number -> + row :: got_so_far + | _ -> got_so_far + ) [] ops_reinterp + +let tbl_opcode () = + List.fold_left (fun got_so_far -> + fun row -> + match row with + (opcode, _, _, _, _, _) -> + match opcode with + Vtbl _ -> row :: got_so_far + | _ -> got_so_far + ) [] ops_reinterp + +let tbx_opcode () = + List.fold_left (fun got_so_far -> + fun row -> + match row with + (opcode, _, _, _, _, _) -> + match opcode with + Vtbx _ -> row :: got_so_far + | _ -> got_so_far + ) [] ops_reinterp + +(* The groups of intrinsics. *) +let intrinsic_groups = + [ "Addition", single_opcode Vadd; + "Multiplication", single_opcode Vmul; + "Multiply-accumulate", single_opcode Vmla; + "Multiply-subtract", single_opcode Vmls; + "Subtraction", single_opcode Vsub; + "Comparison (equal-to)", single_opcode Vceq; + "Comparison (greater-than-or-equal-to)", single_opcode Vcge; + "Comparison (less-than-or-equal-to)", single_opcode Vcle; + "Comparison (greater-than)", single_opcode Vcgt; + "Comparison (less-than)", single_opcode Vclt; + "Comparison (absolute greater-than-or-equal-to)", single_opcode Vcage; + "Comparison (absolute less-than-or-equal-to)", single_opcode Vcale; + "Comparison (absolute greater-than)", single_opcode Vcagt; + "Comparison (absolute less-than)", single_opcode Vcalt; + "Test bits", single_opcode Vtst; + "Absolute difference", single_opcode Vabd; + "Absolute difference and accumulate", single_opcode Vaba; + "Maximum", single_opcode Vmax; + "Minimum", single_opcode Vmin; + "Pairwise add", single_opcode Vpadd; + "Pairwise add, single_opcode widen and accumulate", single_opcode Vpada; + "Folding maximum", single_opcode Vpmax; + "Folding minimum", single_opcode Vpmin; + "Reciprocal step", multiple_opcodes [Vrecps; Vrsqrts]; + "Vector shift left", single_opcode Vshl; + "Vector shift left by constant", single_opcode Vshl_n; + "Vector shift right by constant", single_opcode Vshr_n; + "Vector shift right by constant and accumulate", single_opcode Vsra_n; + "Vector shift right and insert", single_opcode Vsri; + "Vector shift left and insert", single_opcode Vsli; + "Absolute value", single_opcode Vabs; + "Negation", single_opcode Vneg; + "Bitwise not", single_opcode Vmvn; + "Count leading sign bits", single_opcode Vcls; + "Count leading zeros", single_opcode Vclz; + "Count number of set bits", single_opcode Vcnt; + "Reciprocal estimate", single_opcode Vrecpe; + "Reciprocal square-root estimate", single_opcode Vrsqrte; + "Get lanes from a vector", single_opcode Vget_lane; + "Set lanes in a vector", single_opcode Vset_lane; + "Create vector from literal bit pattern", single_opcode Vcreate; + "Set all lanes to the same value", + multiple_opcodes [Vdup_n; Vmov_n; Vdup_lane]; + "Combining vectors", single_opcode Vcombine; + "Splitting vectors", multiple_opcodes [Vget_high; Vget_low]; + "Conversions", multiple_opcodes [Vcvt; Vcvt_n]; + "Move, single_opcode narrowing", single_opcode Vmovn; + "Move, single_opcode long", single_opcode Vmovl; + "Table lookup", tbl_opcode; + "Extended table lookup", tbx_opcode; + "Multiply, lane", single_opcode Vmul_lane; + "Long multiply, lane", single_opcode Vmull_lane; + "Saturating doubling long multiply, lane", single_opcode Vqdmull_lane; + "Saturating doubling multiply high, lane", single_opcode Vqdmulh_lane; + "Multiply-accumulate, lane", single_opcode Vmla_lane; + "Multiply-subtract, lane", single_opcode Vmls_lane; + "Vector multiply by scalar", single_opcode Vmul_n; + "Vector long multiply by scalar", single_opcode Vmull_n; + "Vector saturating doubling long multiply by scalar", + single_opcode Vqdmull_n; + "Vector saturating doubling multiply high by scalar", + single_opcode Vqdmulh_n; + "Vector multiply-accumulate by scalar", single_opcode Vmla_n; + "Vector multiply-subtract by scalar", single_opcode Vmls_n; + "Vector extract", single_opcode Vext; + "Reverse elements", multiple_opcodes [Vrev64; Vrev32; Vrev16]; + "Bit selection", single_opcode Vbsl; + "Transpose elements", single_opcode Vtrn; + "Zip elements", single_opcode Vzip; + "Unzip elements", single_opcode Vuzp; + "Element/structure loads, VLD1 variants", ldx_opcode 1; + "Element/structure stores, VST1 variants", stx_opcode 1; + "Element/structure loads, VLD2 variants", ldx_opcode 2; + "Element/structure stores, VST2 variants", stx_opcode 2; + "Element/structure loads, VLD3 variants", ldx_opcode 3; + "Element/structure stores, VST3 variants", stx_opcode 3; + "Element/structure loads, VLD4 variants", ldx_opcode 4; + "Element/structure stores, VST4 variants", stx_opcode 4; + "Logical operations (AND)", single_opcode Vand; + "Logical operations (OR)", single_opcode Vorr; + "Logical operations (exclusive OR)", single_opcode Veor; + "Logical operations (AND-NOT)", single_opcode Vbic; + "Logical operations (OR-NOT)", single_opcode Vorn; + "Reinterpret casts", single_opcode Vreinterp ] + +(* Given an intrinsic shape, produce a string to document the corresponding + operand shapes. *) +let rec analyze_shape shape = + let rec n_things n thing = + match n with + 0 -> [] + | n -> thing :: (n_things (n - 1) thing) + in + let rec analyze_shape_elt reg_no elt = + match elt with + Dreg -> "@var{d" ^ (string_of_int reg_no) ^ "}" + | Qreg -> "@var{q" ^ (string_of_int reg_no) ^ "}" + | Corereg -> "@var{r" ^ (string_of_int reg_no) ^ "}" + | Immed -> "#@var{0}" + | VecArray (1, elt) -> + let elt_regexp = analyze_shape_elt 0 elt in + "@{" ^ elt_regexp ^ "@}" + | VecArray (n, elt) -> + let rec f m = + match m with + 0 -> [] + | m -> (analyze_shape_elt (m - 1) elt) :: (f (m - 1)) + in + let ops = List.rev (f n) in + "@{" ^ (commas (fun x -> x) ops "") ^ "@}" + | (PtrTo elt | CstPtrTo elt) -> + "[" ^ (analyze_shape_elt reg_no elt) ^ "]" + | Element_of_dreg -> (analyze_shape_elt reg_no Dreg) ^ "[@var{0}]" + | Element_of_qreg -> (analyze_shape_elt reg_no Qreg) ^ "[@var{0}]" + | All_elements_of_dreg -> (analyze_shape_elt reg_no Dreg) ^ "[]" + | Alternatives alts -> (analyze_shape_elt reg_no (List.hd alts)) + in + match shape with + All (n, elt) -> commas (analyze_shape_elt 0) (n_things n elt) "" + | Long -> (analyze_shape_elt 0 Qreg) ^ ", " ^ (analyze_shape_elt 0 Dreg) ^ + ", " ^ (analyze_shape_elt 0 Dreg) + | Long_noreg elt -> (analyze_shape_elt 0 elt) ^ ", " ^ + (analyze_shape_elt 0 elt) + | Wide -> (analyze_shape_elt 0 Qreg) ^ ", " ^ (analyze_shape_elt 0 Qreg) ^ + ", " ^ (analyze_shape_elt 0 Dreg) + | Wide_noreg elt -> analyze_shape (Long_noreg elt) + | Narrow -> (analyze_shape_elt 0 Dreg) ^ ", " ^ (analyze_shape_elt 0 Qreg) ^ + ", " ^ (analyze_shape_elt 0 Qreg) + | Use_operands elts -> commas (analyze_shape_elt 0) (Array.to_list elts) "" + | By_scalar Dreg -> + analyze_shape (Use_operands [| Dreg; Dreg; Element_of_dreg |]) + | By_scalar Qreg -> + analyze_shape (Use_operands [| Qreg; Qreg; Element_of_dreg |]) + | By_scalar _ -> assert false + | Wide_lane -> + analyze_shape (Use_operands [| Qreg; Dreg; Element_of_dreg |]) + | Wide_scalar -> + analyze_shape (Use_operands [| Qreg; Dreg; Element_of_dreg |]) + | Pair_result elt -> + let elt_regexp = analyze_shape_elt 0 elt in + let elt_regexp' = analyze_shape_elt 1 elt in + elt_regexp ^ ", " ^ elt_regexp' + | Unary_scalar _ -> "FIXME Unary_scalar" + | Binary_imm elt -> analyze_shape (Use_operands [| elt; elt; Immed |]) + | Narrow_imm -> analyze_shape (Use_operands [| Dreg; Qreg; Immed |]) + | Long_imm -> analyze_shape (Use_operands [| Qreg; Dreg; Immed |]) + +(* Document a single intrinsic. *) +let describe_intrinsic first chan + (elt_ty, (_, features, shape, name, munge, _)) = + let c_arity, new_elt_ty = munge shape elt_ty in + let c_types = strings_of_arity c_arity in + Printf.fprintf chan "@itemize @bullet\n"; + let item_code = if first then "@item" else "@itemx" in + Printf.fprintf chan "%s %s %s_%s (" item_code (List.hd c_types) + (intrinsic_name name) (string_of_elt elt_ty); + Printf.fprintf chan "%s)\n" (commas (fun ty -> ty) (List.tl c_types) ""); + if not (List.exists (fun feature -> feature = No_op) features) then + begin + let print_one_insn name = + Printf.fprintf chan "@code{"; + let no_suffix = (new_elt_ty = NoElts) in + let name_with_suffix = + if no_suffix then name + else name ^ "." ^ (string_of_elt_dots new_elt_ty) + in + let possible_operands = analyze_all_shapes features shape + analyze_shape + in + let rec print_one_possible_operand op = + Printf.fprintf chan "%s %s}" name_with_suffix op + in + (* If the intrinsic expands to multiple instructions, we assume + they are all of the same form. *) + print_one_possible_operand (List.hd possible_operands) + in + let rec print_insns names = + match names with + [] -> () + | [name] -> print_one_insn name + | name::names -> (print_one_insn name; + Printf.fprintf chan " @emph{or} "; + print_insns names) + in + let insn_names = get_insn_names features name in + Printf.fprintf chan "@*@emph{Form of expected instruction(s):} "; + print_insns insn_names; + Printf.fprintf chan "\n" + end; + Printf.fprintf chan "@end itemize\n"; + Printf.fprintf chan "\n\n" + +(* Document a group of intrinsics. *) +let document_group chan (group_title, group_extractor) = + (* Extract the rows in question from the ops table and then turn them + into a list of intrinsics. *) + let intrinsics = + List.fold_left (fun got_so_far -> + fun row -> + match row with + (_, _, _, _, _, elt_tys) -> + List.fold_left (fun got_so_far' -> + fun elt_ty -> + (elt_ty, row) :: got_so_far') + got_so_far elt_tys + ) [] (group_extractor ()) + in + (* Emit the title for this group. *) + Printf.fprintf chan "@subsubsection %s\n\n" group_title; + (* Emit a description of each intrinsic. *) + List.iter (describe_intrinsic true chan) intrinsics; + (* Close this group. *) + Printf.fprintf chan "\n\n" + +let gnu_header chan = + List.iter (fun s -> Printf.fprintf chan "%s\n" s) [ + "@c Copyright (C) 2006 Free Software Foundation, Inc."; + "@c This is part of the GCC manual."; + "@c For copying conditions, see the file gcc.texi."; + ""; + "@c This file is generated automatically using gcc/config/arm/neon-docgen.ml"; + "@c Please do not edit manually."] + +(* Program entry point. *) +let _ = + if Array.length Sys.argv <> 2 then + failwith "Usage: neon-docgen " + else + let file = Sys.argv.(1) in + try + let chan = open_out file in + gnu_header chan; + List.iter (document_group chan) intrinsic_groups; + close_out chan + with Sys_error sys -> + failwith ("Could not create output file " ^ file ^ ": " ^ sys) diff --git a/gcc/config/arm/neon-gen.ml b/gcc/config/arm/neon-gen.ml new file mode 100644 index 000000000..112c8be6e --- /dev/null +++ b/gcc/config/arm/neon-gen.ml @@ -0,0 +1,416 @@ +(* Auto-generate ARM Neon intrinsics header file. + Copyright (C) 2006, 2007, 2009 Free Software Foundation, Inc. + Contributed by CodeSourcery. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it under + the terms of the GNU General Public License as published by the Free + Software Foundation; either version 3, or (at your option) any later + version. + + GCC is distributed in the hope that it will be useful, but WITHOUT ANY + WARRANTY; without even the implied warranty of MERCHANTABILITY or + FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + for more details. + + You should have received a copy of the GNU General Public License + along with GCC; see the file COPYING3. If not see + . + + This is an O'Caml program. The O'Caml compiler is available from: + + http://caml.inria.fr/ + + Or from your favourite OS's friendly packaging system. Tested with version + 3.09.2, though other versions will probably work too. + + Compile with: + ocamlc -c neon.ml + ocamlc -o neon-gen neon.cmo neon-gen.ml + + Run with: + ./neon-gen > arm_neon.h +*) + +open Neon + +(* The format codes used in the following functions are documented at: + http://caml.inria.fr/pub/docs/manual-ocaml/libref/Format.html\ + #6_printflikefunctionsforprettyprinting + (one line, remove the backslash.) +*) + +(* Following functions can be used to approximate GNU indentation style. *) +let start_function () = + Format.printf "@["; + ref 0 + +let end_function nesting = + match !nesting with + 0 -> Format.printf "@;@;@]" + | _ -> failwith ("Bad nesting (ending function at level " + ^ (string_of_int !nesting) ^ ")") + +let open_braceblock nesting = + begin match !nesting with + 0 -> Format.printf "@,@<0>{@[@," + | _ -> Format.printf "@,@[ @<0>{@[@," + end; + incr nesting + +let close_braceblock nesting = + decr nesting; + match !nesting with + 0 -> Format.printf "@]@,@<0>}" + | _ -> Format.printf "@]@,@<0>}@]" + +let print_function arity fnname body = + let ffmt = start_function () in + Format.printf "__extension__ static __inline "; + let inl = "__attribute__ ((__always_inline__))" in + begin match arity with + Arity0 ret -> + Format.printf "%s %s@,%s (void)" (string_of_vectype ret) inl fnname + | Arity1 (ret, arg0) -> + Format.printf "%s %s@,%s (%s __a)" (string_of_vectype ret) inl fnname + (string_of_vectype arg0) + | Arity2 (ret, arg0, arg1) -> + Format.printf "%s %s@,%s (%s __a, %s __b)" + (string_of_vectype ret) inl fnname (string_of_vectype arg0) + (string_of_vectype arg1) + | Arity3 (ret, arg0, arg1, arg2) -> + Format.printf "%s %s@,%s (%s __a, %s __b, %s __c)" + (string_of_vectype ret) inl fnname (string_of_vectype arg0) + (string_of_vectype arg1) (string_of_vectype arg2) + | Arity4 (ret, arg0, arg1, arg2, arg3) -> + Format.printf "%s %s@,%s (%s __a, %s __b, %s __c, %s __d)" + (string_of_vectype ret) inl fnname (string_of_vectype arg0) + (string_of_vectype arg1) (string_of_vectype arg2) + (string_of_vectype arg3) + end; + open_braceblock ffmt; + let rec print_lines = function + [] -> () + | [line] -> Format.printf "%s" line + | line::lines -> Format.printf "%s@," line; print_lines lines in + print_lines body; + close_braceblock ffmt; + end_function ffmt + +let return_by_ptr features = List.mem ReturnPtr features + +let union_string num elts base = + let itype = inttype_for_array num elts in + let iname = string_of_inttype itype + and sname = string_of_vectype (T_arrayof (num, elts)) in + Printf.sprintf "union { %s __i; %s __o; } %s" sname iname base + +let rec signed_ctype = function + T_uint8x8 | T_poly8x8 -> T_int8x8 + | T_uint8x16 | T_poly8x16 -> T_int8x16 + | T_uint16x4 | T_poly16x4 -> T_int16x4 + | T_uint16x8 | T_poly16x8 -> T_int16x8 + | T_uint32x2 -> T_int32x2 + | T_uint32x4 -> T_int32x4 + | T_uint64x1 -> T_int64x1 + | T_uint64x2 -> T_int64x2 + (* Cast to types defined by mode in arm.c, not random types pulled in from + the header in use. This fixes incompatible pointer errors when + compiling with C++. *) + | T_uint8 | T_int8 -> T_intQI + | T_uint16 | T_int16 -> T_intHI + | T_uint32 | T_int32 -> T_intSI + | T_uint64 | T_int64 -> T_intDI + | T_float32 -> T_floatSF + | T_poly8 -> T_intQI + | T_poly16 -> T_intHI + | T_arrayof (n, elt) -> T_arrayof (n, signed_ctype elt) + | T_ptrto elt -> T_ptrto (signed_ctype elt) + | T_const elt -> T_const (signed_ctype elt) + | x -> x + +let add_cast ctype cval = + let stype = signed_ctype ctype in + if ctype <> stype then + Printf.sprintf "(%s) %s" (string_of_vectype stype) cval + else + cval + +let cast_for_return to_ty = "(" ^ (string_of_vectype to_ty) ^ ")" + +(* Return a tuple of a list of declarations to go at the start of the function, + and a list of statements needed to return THING. *) +let return arity return_by_ptr thing = + match arity with + Arity0 (ret) | Arity1 (ret, _) | Arity2 (ret, _, _) | Arity3 (ret, _, _, _) + | Arity4 (ret, _, _, _, _) -> + match ret with + T_arrayof (num, vec) -> + if return_by_ptr then + let sname = string_of_vectype ret in + [Printf.sprintf "%s __rv;" sname], + [thing ^ ";"; "return __rv;"] + else + let uname = union_string num vec "__rv" in + [uname ^ ";"], ["__rv.__o = " ^ thing ^ ";"; "return __rv.__i;"] + | T_void -> [], [thing ^ ";"] + | _ -> + [], ["return " ^ (cast_for_return ret) ^ thing ^ ";"] + +let rec element_type ctype = + match ctype with + T_arrayof (_, v) -> element_type v + | _ -> ctype + +let params return_by_ptr ps = + let pdecls = ref [] in + let ptype t p = + match t with + T_arrayof (num, elts) -> + let uname = union_string num elts (p ^ "u") in + let decl = Printf.sprintf "%s = { %s };" uname p in + pdecls := decl :: !pdecls; + p ^ "u.__o" + | _ -> add_cast t p in + let plist = match ps with + Arity0 _ -> [] + | Arity1 (_, t1) -> [ptype t1 "__a"] + | Arity2 (_, t1, t2) -> [ptype t1 "__a"; ptype t2 "__b"] + | Arity3 (_, t1, t2, t3) -> [ptype t1 "__a"; ptype t2 "__b"; ptype t3 "__c"] + | Arity4 (_, t1, t2, t3, t4) -> + [ptype t1 "__a"; ptype t2 "__b"; ptype t3 "__c"; ptype t4 "__d"] in + match ps with + Arity0 ret | Arity1 (ret, _) | Arity2 (ret, _, _) | Arity3 (ret, _, _, _) + | Arity4 (ret, _, _, _, _) -> + if return_by_ptr then + !pdecls, add_cast (T_ptrto (element_type ret)) "&__rv.val[0]" :: plist + else + !pdecls, plist + +let modify_params features plist = + let is_flipped = + List.exists (function Flipped _ -> true | _ -> false) features in + if is_flipped then + match plist with + [ a; b ] -> [ b; a ] + | _ -> + failwith ("Don't know how to flip args " ^ (String.concat ", " plist)) + else + plist + +(* !!! Decide whether to add an extra information word based on the shape + form. *) +let extra_word shape features paramlist bits = + let use_word = + match shape with + All _ | Long | Long_noreg _ | Wide | Wide_noreg _ | Narrow + | By_scalar _ | Wide_scalar | Wide_lane | Binary_imm _ | Long_imm + | Narrow_imm -> true + | _ -> List.mem InfoWord features + in + if use_word then + paramlist @ [string_of_int bits] + else + paramlist + +(* Bit 0 represents signed (1) vs unsigned (0), or float (1) vs poly (0). + Bit 1 represents floats & polynomials (1), or ordinary integers (0). + Bit 2 represents rounding (1) vs none (0). *) +let infoword_value elttype features = + let bits01 = + match elt_class elttype with + Signed | ConvClass (Signed, _) | ConvClass (_, Signed) -> 0b001 + | Poly -> 0b010 + | Float -> 0b011 + | _ -> 0b000 + and rounding_bit = if List.mem Rounding features then 0b100 else 0b000 in + bits01 lor rounding_bit + +(* "Cast" type operations will throw an exception in mode_of_elt (actually in + elt_width, called from there). Deal with that here, and generate a suffix + with multiple modes (). *) +let rec mode_suffix elttype shape = + try + let mode = mode_of_elt elttype shape in + string_of_mode mode + with MixedMode (dst, src) -> + let dstmode = mode_of_elt dst shape + and srcmode = mode_of_elt src shape in + string_of_mode dstmode ^ string_of_mode srcmode + +let print_variant opcode features shape name (ctype, asmtype, elttype) = + let bits = infoword_value elttype features in + let modesuf = mode_suffix elttype shape in + let return_by_ptr = return_by_ptr features in + let pdecls, paramlist = params return_by_ptr ctype in + let paramlist' = modify_params features paramlist in + let paramlist'' = extra_word shape features paramlist' bits in + let parstr = String.concat ", " paramlist'' in + let builtin = Printf.sprintf "__builtin_neon_%s%s (%s)" + (builtin_name features name) modesuf parstr in + let rdecls, stmts = return ctype return_by_ptr builtin in + let body = pdecls @ rdecls @ stmts + and fnname = (intrinsic_name name) ^ "_" ^ (string_of_elt elttype) in + print_function ctype fnname body + +(* When this function processes the element types in the ops table, it rewrites + them in a list of tuples (a,b,c): + a : C type as an "arity", e.g. Arity1 (T_poly8x8, T_poly8x8) + b : Asm type : a single, processed element type, e.g. P16. This is the + type which should be attached to the asm opcode. + c : Variant type : the unprocessed type for this variant (e.g. in add + instructions which don't care about the sign, b might be i16 and c + might be s16.) +*) + +let print_op (opcode, features, shape, name, munge, types) = + let sorted_types = List.sort compare types in + let munged_types = List.map + (fun elt -> let c, asm = munge shape elt in c, asm, elt) sorted_types in + List.iter + (fun variant -> print_variant opcode features shape name variant) + munged_types + +let print_ops ops = + List.iter print_op ops + +(* Output type definitions. Table entries are: + cbase : "C" name for the type. + abase : "ARM" base name for the type (i.e. int in int8x8_t). + esize : element size. + enum : element count. +*) + +let deftypes () = + let typeinfo = [ + (* Doubleword vector types. *) + "__builtin_neon_qi", "int", 8, 8; + "__builtin_neon_hi", "int", 16, 4; + "__builtin_neon_si", "int", 32, 2; + "__builtin_neon_di", "int", 64, 1; + "__builtin_neon_sf", "float", 32, 2; + "__builtin_neon_poly8", "poly", 8, 8; + "__builtin_neon_poly16", "poly", 16, 4; + "__builtin_neon_uqi", "uint", 8, 8; + "__builtin_neon_uhi", "uint", 16, 4; + "__builtin_neon_usi", "uint", 32, 2; + "__builtin_neon_udi", "uint", 64, 1; + + (* Quadword vector types. *) + "__builtin_neon_qi", "int", 8, 16; + "__builtin_neon_hi", "int", 16, 8; + "__builtin_neon_si", "int", 32, 4; + "__builtin_neon_di", "int", 64, 2; + "__builtin_neon_sf", "float", 32, 4; + "__builtin_neon_poly8", "poly", 8, 16; + "__builtin_neon_poly16", "poly", 16, 8; + "__builtin_neon_uqi", "uint", 8, 16; + "__builtin_neon_uhi", "uint", 16, 8; + "__builtin_neon_usi", "uint", 32, 4; + "__builtin_neon_udi", "uint", 64, 2 + ] in + List.iter + (fun (cbase, abase, esize, enum) -> + let attr = + match enum with + 1 -> "" + | _ -> Printf.sprintf "\t__attribute__ ((__vector_size__ (%d)))" + (esize * enum / 8) in + Format.printf "typedef %s %s%dx%d_t%s;@\n" cbase abase esize enum attr) + typeinfo; + Format.print_newline (); + (* Extra types not in . *) + Format.printf "typedef float float32_t;\n"; + Format.printf "typedef __builtin_neon_poly8 poly8_t;\n"; + Format.printf "typedef __builtin_neon_poly16 poly16_t;\n" + +(* Output structs containing arrays, for load & store instructions etc. *) + +let arrtypes () = + let typeinfo = [ + "int", 8; "int", 16; + "int", 32; "int", 64; + "uint", 8; "uint", 16; + "uint", 32; "uint", 64; + "float", 32; "poly", 8; + "poly", 16 + ] in + let writestruct elname elsize regsize arrsize = + let elnum = regsize / elsize in + let structname = + Printf.sprintf "%s%dx%dx%d_t" elname elsize elnum arrsize in + let sfmt = start_function () in + Format.printf "typedef struct %s" structname; + open_braceblock sfmt; + Format.printf "%s%dx%d_t val[%d];" elname elsize elnum arrsize; + close_braceblock sfmt; + Format.printf " %s;" structname; + end_function sfmt; + in + for n = 2 to 4 do + List.iter + (fun (elname, elsize) -> + writestruct elname elsize 64 n; + writestruct elname elsize 128 n) + typeinfo + done + +let print_lines = List.iter (fun s -> Format.printf "%s@\n" s) + +(* Do it. *) + +let _ = + print_lines [ +"/* ARM NEON intrinsics include file. This file is generated automatically"; +" using neon-gen.ml. Please do not edit manually."; +""; +" Copyright (C) 2006, 2007, 2009 Free Software Foundation, Inc."; +" Contributed by CodeSourcery."; +""; +" This file is part of GCC."; +""; +" GCC is free software; you can redistribute it and/or modify it"; +" under the terms of the GNU General Public License as published"; +" by the Free Software Foundation; either version 3, or (at your"; +" option) any later version."; +""; +" GCC is distributed in the hope that it will be useful, but WITHOUT"; +" ANY WARRANTY; without even the implied warranty of MERCHANTABILITY"; +" or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public"; +" License for more details."; +""; +" Under Section 7 of GPL version 3, you are granted additional"; +" permissions described in the GCC Runtime Library Exception, version"; +" 3.1, as published by the Free Software Foundation."; +""; +" You should have received a copy of the GNU General Public License and"; +" a copy of the GCC Runtime Library Exception along with this program;"; +" see the files COPYING3 and COPYING.RUNTIME respectively. If not, see"; +" . */"; +""; +"#ifndef _GCC_ARM_NEON_H"; +"#define _GCC_ARM_NEON_H 1"; +""; +"#ifndef __ARM_NEON__"; +"#error You must enable NEON instructions (e.g. -mfloat-abi=softfp -mfpu=neon) to use arm_neon.h"; +"#else"; +""; +"#ifdef __cplusplus"; +"extern \"C\" {"; +"#endif"; +""; +"#include "; +""]; + deftypes (); + arrtypes (); + Format.print_newline (); + print_ops ops; + Format.print_newline (); + print_ops reinterp; + print_lines [ +"#ifdef __cplusplus"; +"}"; +"#endif"; +"#endif"; +"#endif"] diff --git a/gcc/config/arm/neon-schedgen.ml b/gcc/config/arm/neon-schedgen.ml new file mode 100644 index 000000000..3d9b04422 --- /dev/null +++ b/gcc/config/arm/neon-schedgen.ml @@ -0,0 +1,543 @@ +(* Emission of the core of the Cortex-A8 NEON scheduling description. + Copyright (C) 2007, 2010 Free Software Foundation, Inc. + Contributed by CodeSourcery. + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it under + the terms of the GNU General Public License as published by the Free + Software Foundation; either version 3, or (at your option) any later + version. + + GCC is distributed in the hope that it will be useful, but WITHOUT ANY + WARRANTY; without even the implied warranty of MERCHANTABILITY or + FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + for more details. + + You should have received a copy of the GNU General Public License + along with GCC; see the file COPYING3. If not see + . +*) + +(* This scheduling description generator works as follows. + - Each group of instructions has source and destination requirements + specified and a list of cores supported. This is then filtered + and per core scheduler descriptions are generated out. + The reservations generated are prefixed by the name of the + core and the check is performed on the basis of what the tuning + string is. Running this will generate Neon scheduler descriptions + for all cores supported. + + The source requirements may be specified using + Source (the stage at which all source operands not otherwise + described are read), Source_m (the stage at which Rm operands are + read), Source_n (likewise for Rn) and Source_d (likewise for Rd). + - For each group of instructions the earliest stage where a source + operand may be required is calculated. + - Each group of instructions is selected in turn as a producer. + The latencies between this group and every other group are then + calculated, yielding up to four values for each combination: + 1. Producer -> consumer Rn latency + 2. Producer -> consumer Rm latency + 3. Producer -> consumer Rd (as a source) latency + 4. Producer -> consumer worst-case latency. + Value 4 is calculated from the destination availability requirements + of the consumer and the earliest source availability requirements + of the producer. + - The largest Value 4 calculated for the current producer is the + worse-case latency, L, for that instruction group. This value is written + out in a define_insn_reservation for the producer group. + - For each producer and consumer pair, the latencies calculated above + are collated. The average (of up to four values) is calculated and + if this average is different from the worst-case latency, an + unguarded define_bypass construction is issued for that pair. + (For each pair only one define_bypass construction will be emitted, + and at present we do not emit specific guards.) +*) + +let find_with_result fn lst = + let rec scan = function + [] -> raise Not_found + | l::ls -> + match fn l with + Some result -> result + | _ -> scan ls in + scan lst + +let n1 = 1 and n2 = 2 and n3 = 3 and n4 = 4 and n5 = 5 and n6 = 6 + and n7 = 7 and n8 = 8 and n9 = 9 + +type availability = Source of int + | Source_n of int + | Source_m of int + | Source_d of int + | Dest of int + | Dest_n_after of int * int + +type guard = Guard_none | Guard_only_m | Guard_only_n | Guard_only_d + +(* Reservation behaviors. All but the last row here correspond to one + pipeline each. Each constructor will correspond to one + define_reservation. *) +type reservation = + Mul | Mul_2cycle | Mul_4cycle +| Shift | Shift_2cycle +| ALU | ALU_2cycle +| Fmul | Fmul_2cycle +| Fadd | Fadd_2cycle +(* | VFP *) +| Permute of int +| Ls of int +| Fmul_then_fadd | Fmul_then_fadd_2 + +type core = CortexA8 | CortexA9 +let allCores = [CortexA8; CortexA9] +let coreStr = function + CortexA8 -> "cortex_a8" + | CortexA9 -> "cortex_a9" + +let tuneStr = function + CortexA8 -> "cortexa8" + | CortexA9 -> "cortexa9" + + +(* This table must be kept as short as possible by conflating + entries with the same availability behavior. + + First components: instruction group names + Second components: availability requirements, in the order in which + they should appear in the comments in the .md file. + Third components: reservation info + Fourth components: List of supported cores. +*) +let availability_table = [ + (* NEON integer ALU instructions. *) + (* vbit vbif vbsl vorr vbic vnot vcls vclz vcnt vadd vand vorr + veor vbic vorn ddd qqq *) + "neon_int_1", [Source n2; Dest n3], ALU, allCores; + (* vadd vsub qqd vsub ddd qqq *) + "neon_int_2", [Source_m n1; Source_n n2; Dest n3], ALU, allCores; + (* vsum vneg dd qq vadd vsub qdd *) + "neon_int_3", [Source n1; Dest n3], ALU, allCores; + (* vabs vceqz vcgez vcbtz vclez vcltz vadh vradh vsbh vrsbh dqq *) + (* vhadd vrhadd vqadd vtst ddd qqq *) + "neon_int_4", [Source n2; Dest n4], ALU, allCores; + (* vabd qdd vhsub vqsub vabd vceq vcge vcgt vmax vmin vfmx vfmn ddd ddd *) + "neon_int_5", [Source_m n1; Source_n n2; Dest n4], ALU, allCores; + (* vqneg vqabs dd qq *) + "neon_vqneg_vqabs", [Source n1; Dest n4], ALU, allCores; + (* vmov vmvn *) + "neon_vmov", [Dest n3], ALU, allCores; + (* vaba *) + "neon_vaba", [Source_n n2; Source_m n1; Source_d n3; Dest n6], ALU, allCores; + "neon_vaba_qqq", + [Source_n n2; Source_m n1; Source_d n3; Dest_n_after (1, n6)], + ALU_2cycle, allCores; + (* vsma *) + "neon_vsma", [Source_m n1; Source_d n3; Dest n6], ALU, allCores; + + (* NEON integer multiply instructions. *) + (* vmul, vqdmlh, vqrdmlh *) + (* vmul, vqdmul, qdd 16/8 long 32/16 long *) + "neon_mul_ddd_8_16_qdd_16_8_long_32_16_long", [Source n2; Dest n6], + Mul, allCores; + "neon_mul_qqq_8_16_32_ddd_32", [Source n2; Dest_n_after (1, n6)], + Mul_2cycle, allCores; + (* vmul, vqdmul again *) + "neon_mul_qdd_64_32_long_qqd_16_ddd_32_scalar_64_32_long_scalar", + [Source_n n2; Source_m n1; Dest_n_after (1, n6)], Mul_2cycle, allCores; + (* vmla, vmls *) + "neon_mla_ddd_8_16_qdd_16_8_long_32_16_long", + [Source_n n2; Source_m n2; Source_d n3; Dest n6], Mul, allCores; + "neon_mla_qqq_8_16", + [Source_n n2; Source_m n2; Source_d n3; Dest_n_after (1, n6)], + Mul_2cycle, allCores; + "neon_mla_ddd_32_qqd_16_ddd_32_scalar_qdd_64_32_long_scalar_qdd_64_32_long", + [Source_n n2; Source_m n1; Source_d n3; Dest_n_after (1, n6)], + Mul_2cycle, allCores; + "neon_mla_qqq_32_qqd_32_scalar", + [Source_n n2; Source_m n1; Source_d n3; Dest_n_after (3, n6)], + Mul_4cycle, allCores; + (* vmul, vqdmulh, vqrdmulh *) + (* vmul, vqdmul *) + "neon_mul_ddd_16_scalar_32_16_long_scalar", + [Source_n n2; Source_m n1; Dest n6], Mul, allCores; + "neon_mul_qqd_32_scalar", + [Source_n n2; Source_m n1; Dest_n_after (3, n6)], Mul_4cycle, allCores; + (* vmla, vmls *) + (* vmla, vmla, vqdmla, vqdmls *) + "neon_mla_ddd_16_scalar_qdd_32_16_long_scalar", + [Source_n n2; Source_m n1; Source_d n3; Dest n6], Mul, allCores; + + (* NEON integer shift instructions. *) + (* vshr/vshl immediate, vshr_narrow, vshl_vmvh, vsli_vsri_ddd *) + "neon_shift_1", [Source n1; Dest n3], Shift, allCores; + (* vqshl, vrshr immediate; vqshr, vqmov, vrshr, vqrshr narrow, allCores; + vqshl_vrshl_vqrshl_ddd *) + "neon_shift_2", [Source n1; Dest n4], Shift, allCores; + (* vsli, vsri and vshl for qqq *) + "neon_shift_3", [Source n1; Dest_n_after (1, n3)], Shift_2cycle, allCores; + "neon_vshl_ddd", [Source n1; Dest n1], Shift, allCores; + "neon_vqshl_vrshl_vqrshl_qqq", [Source n1; Dest_n_after (1, n4)], + Shift_2cycle, allCores; + "neon_vsra_vrsra", [Source_m n1; Source_d n3; Dest n6], Shift, allCores; + + (* NEON floating-point instructions. *) + (* vadd, vsub, vabd, vmul, vceq, vcge, vcgt, vcage, vcagt, vmax, vmin *) + (* vabs, vneg, vceqz, vcgez, vcgtz, vclez, vcltz, vrecpe, vrsqrte, vcvt *) + "neon_fp_vadd_ddd_vabs_dd", [Source n2; Dest n5], Fadd, allCores; + "neon_fp_vadd_qqq_vabs_qq", [Source n2; Dest_n_after (1, n5)], + Fadd_2cycle, allCores; + (* vsum, fvmx, vfmn *) + "neon_fp_vsum", [Source n1; Dest n5], Fadd, allCores; + "neon_fp_vmul_ddd", [Source_n n2; Source_m n1; Dest n5], Fmul, allCores; + "neon_fp_vmul_qqd", [Source_n n2; Source_m n1; Dest_n_after (1, n5)], + Fmul_2cycle, allCores; + (* vmla, vmls *) + "neon_fp_vmla_ddd", + [Source_n n2; Source_m n2; Source_d n3; Dest n9], Fmul_then_fadd, allCores; + "neon_fp_vmla_qqq", + [Source_n n2; Source_m n2; Source_d n3; Dest_n_after (1, n9)], + Fmul_then_fadd_2, allCores; + "neon_fp_vmla_ddd_scalar", + [Source_n n2; Source_m n1; Source_d n3; Dest n9], Fmul_then_fadd, allCores; + "neon_fp_vmla_qqq_scalar", + [Source_n n2; Source_m n1; Source_d n3; Dest_n_after (1, n9)], + Fmul_then_fadd_2, allCores; + "neon_fp_vrecps_vrsqrts_ddd", [Source n2; Dest n9], Fmul_then_fadd, allCores; + "neon_fp_vrecps_vrsqrts_qqq", [Source n2; Dest_n_after (1, n9)], + Fmul_then_fadd_2, allCores; + + (* NEON byte permute instructions. *) + (* vmov; vtrn and vswp for dd; vzip for dd; vuzp for dd; vrev; vext for dd *) + "neon_bp_simple", [Source n1; Dest n2], Permute 1, allCores; + (* vswp for qq; vext for qqq; vtbl with {Dn} or {Dn, Dn1}, allCores; + similarly for vtbx *) + "neon_bp_2cycle", [Source n1; Dest_n_after (1, n2)], Permute 2, allCores; + (* all the rest *) + "neon_bp_3cycle", [Source n1; Dest_n_after (2, n2)], Permute 3, allCores; + + (* NEON load/store instructions. *) + "neon_ldr", [Dest n1], Ls 1, allCores; + "neon_str", [Source n1], Ls 1, allCores; + "neon_vld1_1_2_regs", [Dest_n_after (1, n1)], Ls 2, allCores; + "neon_vld1_3_4_regs", [Dest_n_after (2, n1)], Ls 3, allCores; + "neon_vld2_2_regs_vld1_vld2_all_lanes", [Dest_n_after (1, n2)], Ls 2, allCores; + "neon_vld2_4_regs", [Dest_n_after (2, n2)], Ls 3, allCores; + "neon_vld3_vld4", [Dest_n_after (3, n2)], Ls 4, allCores; + "neon_vst1_1_2_regs_vst2_2_regs", [Source n1], Ls 2, allCores; + "neon_vst1_3_4_regs", [Source n1], Ls 3, allCores; + "neon_vst2_4_regs_vst3_vst4", [Source n1], Ls 4, allCores; + "neon_vst3_vst4", [Source n1], Ls 4, allCores; + "neon_vld1_vld2_lane", [Source n1; Dest_n_after (2, n2)], Ls 3, allCores; + "neon_vld3_vld4_lane", [Source n1; Dest_n_after (4, n2)], Ls 5, allCores; + "neon_vst1_vst2_lane", [Source n1], Ls 2, allCores; + "neon_vst3_vst4_lane", [Source n1], Ls 3, allCores; + "neon_vld3_vld4_all_lanes", [Dest_n_after (1, n2)], Ls 3, allCores; + + (* NEON register transfer instructions. *) + "neon_mcr", [Dest n2], Permute 1, allCores; + "neon_mcr_2_mcrr", [Dest n2], Permute 2, allCores; + (* MRC instructions are in the .tpl file. *) +] + +(* Augment the tuples in the availability table with an extra component + that describes the earliest stage where a source operand may be + required. (It is also possible that an entry in the table has no + source requirements.) *) +let calculate_sources = + List.map (fun (name, avail, res, cores) -> + let earliest_stage = + List.fold_left + (fun cur -> fun info -> + match info with + Source stage + | Source_n stage + | Source_m stage + | Source_d stage -> + (match cur with + None -> Some stage + | Some stage' when stage < stage' -> Some stage + | _ -> cur) + | _ -> cur) None avail + in + (name, avail, res, earliest_stage)) + +(* Find the stage, if any, at the end of which a group produces a result. *) +let find_dest (attr, avail, _, _) = + try + find_with_result + (fun av -> match av with + Dest st -> Some (Some st) + | Dest_n_after (after, st) -> Some (Some (after + st)) + | _ -> None) avail + with Not_found -> None + +(* Find the worst-case latency between a producer and a consumer. *) +let worst_case_latency producer (_, _, _, earliest_required) = + let dest = find_dest producer in + match earliest_required, dest with + None, _ -> + (* The consumer doesn't have any source requirements. *) + None + | _, None -> + (* The producer doesn't produce any results (e.g. a store insn). *) + None + | Some consumed, Some produced -> Some (produced - consumed + 1) + +(* Helper function for below. *) +let latency_calc f producer (_, avail, _, _) = + try + let source_avail = find_with_result f avail in + match find_dest producer with + None -> + (* The producer does not produce a result. *) + Some 0 + | Some produced -> + let latency = produced - source_avail + 1 in + (* Latencies below zero are raised to zero since we don't have + delay slots. *) + if latency < 0 then Some 0 else Some latency + with Not_found -> None + +(* Find any Rm latency between a producer and a consumer. If no + Rm source requirement is explicitly specified for the consumer, + return "positive infinity". Also return "positive infinity" if + the latency matches the supplied worst-case latency for this + producer. *) +let get_m_latency producer consumer = + match latency_calc (fun av -> match av with Source_m stage -> Some stage + | _ -> None) producer consumer + with None -> [] | Some latency -> [(Guard_only_m, latency)] + +(* Likewise for Rn. *) +let get_n_latency producer consumer = + match latency_calc (fun av -> match av with Source_n stage -> Some stage + | _ -> None) producer consumer + with None -> [] | Some latency -> [(Guard_only_n, latency)] + +(* Likewise for Rd. *) +let get_d_latency producer consumer = + match + latency_calc (fun av -> match av with Source_d stage -> Some stage + | _ -> None) producer consumer + with None -> [] | Some latency -> [(Guard_only_d, latency)] + +(* Given a producer and a consumer, work out the latency of the producer + to the consumer in each of the four cases (availability information + permitting) identified at the top of this file. Return the + consumer, the worst-case unguarded latency and any guarded latencies. *) +let calculate_latencies producer consumer = + let worst = worst_case_latency producer consumer in + let m_latency = get_m_latency producer consumer in + let n_latency = get_n_latency producer consumer in + let d_latency = get_d_latency producer consumer in + (consumer, worst, m_latency @ n_latency @ d_latency) + +(* Helper function for below. *) +let pick_latency largest worst guards = + let guards = + match worst with + None -> guards + | Some worst -> (Guard_none, worst) :: guards + in + if List.length guards = 0 then None else + let total_latency = + List.fold_left (fun acc -> fun (_, latency) -> acc + latency) 0 guards + in + let average_latency = (float_of_int total_latency) /. + (float_of_int (List.length guards)) in + let rounded_latency = int_of_float (ceil average_latency) in + if rounded_latency = largest then None + else Some (Guard_none, rounded_latency) + +(* Collate all bypasses for a particular producer as required in + worst_case_latencies_and_bypasses. (By this stage there is a maximum + of one bypass from this producer to any particular consumer listed + in LATENCIES.) Use a hash table to collate bypasses with the + same latency and guard. *) +let collate_bypasses (producer_name, _, _, _) largest latencies core = + let ht = Hashtbl.create 42 in + let keys = ref [] in + List.iter ( + fun ((consumer, _, _, _), worst, guards) -> + (* Find out which latency to use. Ignoring latencies that match + the *overall* worst-case latency for this producer (which will + be in define_insn_reservation), we have to examine: + 1. the latency with no guard between this producer and this + consumer; and + 2. any guarded latency. *) + let guard_latency_opt = pick_latency largest worst guards in + match guard_latency_opt with + None -> () + | Some (guard, latency) -> + begin + (if (try ignore (Hashtbl.find ht (guard, latency)); false + with Not_found -> true) then + keys := (guard, latency) :: !keys); + Hashtbl.add ht (guard, latency) ((coreStr core) ^ "_" ^ consumer) + end + ) latencies; + (* The hash table now has bypasses collated so that ones with the + same latency and guard have the same keys. Walk through all the + keys, extract the associated bypasses, and concatenate the names + of the consumers for each bypass. *) + List.map ( + fun ((guard, latency) as key) -> + let consumers = Hashtbl.find_all ht key in + (producer_name, + String.concat ",\\\n " consumers, + latency, + guard) + ) !keys + +(* For every producer, find the worst-case latency between it and + *any* consumer. Also determine (if such a thing exists) the + lowest-latency bypass from each producer to each consumer. Group + the output in such a way that all bypasses with the same producer + and latency are together, and so that bypasses with the worst-case + latency are ignored. *) +let worst_case_latencies_and_bypasses core = + let rec f (worst_acc, bypasses_acc) prev xs = + match xs with + [] -> (worst_acc, bypasses_acc) + | ((producer_name, producer_avail, res_string, _) as producer)::next -> + (* For this particular producer, work out the latencies between + it and every consumer. *) + let latencies = + List.fold_left (fun acc -> fun consumer -> + (calculate_latencies producer consumer) :: acc) + [] (prev @ xs) + in + (* Now work out what the overall worst case latency was for this + particular producer. *) + match latencies with + [] -> assert false + | _ -> + let comp_fn (_, l1, _) (_, l2, _) = + if l1 > l2 then -1 else if l1 = l2 then 0 else 1 + in + let largest = + match List.hd (List.sort comp_fn latencies) with + (_, None, _) -> 0 (* Producer has no consumers. *) + | (_, Some worst, _) -> worst + in + (* Having got the largest latency, collect all bypasses for + this producer and filter out those with that larger + latency. Record the others for later emission. *) + let bypasses = collate_bypasses producer largest latencies core in + (* Go on to process remaining producers, having noted + the result for this one. *) + f ((producer_name, producer_avail, largest, + res_string) :: worst_acc, + bypasses @ bypasses_acc) + (prev @ [producer]) next + in + f ([], []) [] + +(* Emit a helpful comment for a define_insn_reservation. *) +let write_comment producer avail = + let seen_source = ref false in + let describe info = + let read = if !seen_source then "" else "read " in + match info with + Source stage -> + seen_source := true; + Printf.printf "%stheir source operands at N%d" read stage + | Source_n stage -> + seen_source := true; + Printf.printf "%stheir (D|Q)n operands at N%d" read stage + | Source_m stage -> + seen_source := true; + Printf.printf "%stheir (D|Q)m operands at N%d" read stage + | Source_d stage -> + Printf.printf "%stheir (D|Q)d operands at N%d" read stage + | Dest stage -> + Printf.printf "produce a result at N%d" stage + | Dest_n_after (after, stage) -> + Printf.printf "produce a result at N%d on cycle %d" stage (after + 1) + in + Printf.printf ";; Instructions using this reservation "; + let rec f infos x = + let sep = if x mod 2 = 1 then "" else "\n;;" in + match infos with + [] -> assert false + | [info] -> describe info; Printf.printf ".\n" + | info::(_::[] as infos) -> + describe info; Printf.printf ", and%s " sep; f infos (x+1) + | info::infos -> describe info; Printf.printf ",%s " sep; f infos (x+1) + in + f avail 0 + + +(* Emit a define_insn_reservation for each producer. The latency + written in will be its worst-case latency. *) +let emit_insn_reservations core = + let corestring = coreStr core in + let tunestring = tuneStr core + in List.iter ( + fun (producer, avail, latency, reservation) -> + write_comment producer avail; + Printf.printf "(define_insn_reservation \"%s_%s\" %d\n" + corestring producer latency; + Printf.printf " (and (eq_attr \"tune\" \"%s\")\n" tunestring; + Printf.printf " (eq_attr \"neon_type\" \"%s\"))\n" producer; + let str = + match reservation with + Mul -> "dp" | Mul_2cycle -> "dp_2" | Mul_4cycle -> "dp_4" + | Shift -> "dp" | Shift_2cycle -> "dp_2" + | ALU -> "dp" | ALU_2cycle -> "dp_2" + | Fmul -> "dp" | Fmul_2cycle -> "dp_2" + | Fadd -> "fadd" | Fadd_2cycle -> "fadd_2" + | Ls 1 -> "ls" + | Ls n -> "ls_" ^ (string_of_int n) + | Permute 1 -> "perm" + | Permute n -> "perm_" ^ (string_of_int n) + | Fmul_then_fadd -> "fmul_then_fadd" + | Fmul_then_fadd_2 -> "fmul_then_fadd_2" + in + Printf.printf " \"%s_neon_%s\")\n\n" corestring str + ) + +(* Given a guard description, return the name of the C function to + be used as the guard for define_bypass. *) +let guard_fn g = + match g with + Guard_only_m -> "arm_neon_only_m_dependency" + | Guard_only_n -> "arm_neon_only_n_dependency" + | Guard_only_d -> "arm_neon_only_d_dependency" + | Guard_none -> assert false + +(* Emit a define_bypass for each bypass. *) +let emit_bypasses core = + List.iter ( + fun (producer, consumers, latency, guard) -> + Printf.printf "(define_bypass %d \"%s_%s\"\n" + latency (coreStr core) producer; + + if guard = Guard_none then + Printf.printf " \"%s\")\n\n" consumers + else + begin + Printf.printf " \"%s\"\n" consumers; + Printf.printf " \"%s\")\n\n" (guard_fn guard) + end + ) + + +let calculate_per_core_availability_table core availability_table = + let table = calculate_sources availability_table in + let worst_cases, bypasses = worst_case_latencies_and_bypasses core table in + emit_insn_reservations core (List.rev worst_cases); + Printf.printf ";; Exceptions to the default latencies.\n\n"; + emit_bypasses core bypasses + +let calculate_core_availability_table core availability_table = +let filter_core = List.filter (fun (_, _, _, cores) + -> List.exists ((=) core) cores) +in calculate_per_core_availability_table core (filter_core availability_table) + + +(* Program entry point. *) +let main = + List.map (fun core -> calculate_core_availability_table + core availability_table) allCores diff --git a/gcc/config/arm/neon-testgen.ml b/gcc/config/arm/neon-testgen.ml new file mode 100644 index 000000000..63fbbbf2c --- /dev/null +++ b/gcc/config/arm/neon-testgen.ml @@ -0,0 +1,283 @@ +(* Auto-generate ARM Neon intrinsics tests. + Copyright (C) 2006, 2007, 2008, 2009, 2010 Free Software Foundation, Inc. + Contributed by CodeSourcery. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it under + the terms of the GNU General Public License as published by the Free + Software Foundation; either version 3, or (at your option) any later + version. + + GCC is distributed in the hope that it will be useful, but WITHOUT ANY + WARRANTY; without even the implied warranty of MERCHANTABILITY or + FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + for more details. + + You should have received a copy of the GNU General Public License + along with GCC; see the file COPYING3. If not see + . + + This is an O'Caml program. The O'Caml compiler is available from: + + http://caml.inria.fr/ + + Or from your favourite OS's friendly packaging system. Tested with version + 3.09.2, though other versions will probably work too. + + Compile with: + ocamlc -c neon.ml + ocamlc -o neon-testgen neon.cmo neon-testgen.ml + + Run with: + cd /path/to/gcc/testsuite/gcc.target/arm/neon + /path/to/neon-testgen +*) + +open Neon + +type c_type_flags = Pointer | Const + +(* Open a test source file. *) +let open_test_file dir name = + try + open_out (dir ^ "/" ^ name ^ ".c") + with Sys_error str -> + failwith ("Could not create test source file " ^ name ^ ": " ^ str) + +(* Emit prologue code to a test source file. *) +let emit_prologue chan test_name = + Printf.fprintf chan "/* Test the `%s' ARM Neon intrinsic. */\n" test_name; + Printf.fprintf chan "/* This file was autogenerated by neon-testgen. */\n\n"; + Printf.fprintf chan "/* { dg-do assemble } */\n"; + Printf.fprintf chan "/* { dg-require-effective-target arm_neon_ok } */\n"; + Printf.fprintf chan "/* { dg-options \"-save-temps -O0\" } */\n"; + Printf.fprintf chan "/* { dg-add-options arm_neon } */\n"; + Printf.fprintf chan "\n#include \"arm_neon.h\"\n\n"; + Printf.fprintf chan "void test_%s (void)\n{\n" test_name + +(* Emit declarations of local variables that are going to be passed + to an intrinsic, together with one to take a returned value if needed. *) +let emit_automatics chan c_types features = + let emit () = + ignore ( + List.fold_left (fun arg_number -> fun (flags, ty) -> + let pointer_bit = + if List.mem Pointer flags then "*" else "" + in + (* Const arguments to builtins are directly + written in as constants. *) + if not (List.mem Const flags) then + Printf.fprintf chan " %s %sarg%d_%s;\n" + ty pointer_bit arg_number ty; + arg_number + 1) + 0 (List.tl c_types)) + in + match c_types with + (_, return_ty) :: tys -> + if return_ty <> "void" then begin + (* The intrinsic returns a value. We need to do explict register + allocation for vget_low tests or they fail because of copy + elimination. *) + ((if List.mem Fixed_return_reg features then + Printf.fprintf chan " register %s out_%s asm (\"d18\");\n" + return_ty return_ty + else + Printf.fprintf chan " %s out_%s;\n" return_ty return_ty); + emit ()) + end else + (* The intrinsic does not return a value. *) + emit () + | _ -> assert false + +(* Emit code to call an intrinsic. *) +let emit_call chan const_valuator c_types name elt_ty = + (if snd (List.hd c_types) <> "void" then + Printf.fprintf chan " out_%s = " (snd (List.hd c_types)) + else + Printf.fprintf chan " "); + Printf.fprintf chan "%s_%s (" (intrinsic_name name) (string_of_elt elt_ty); + let print_arg chan arg_number (flags, ty) = + (* If the argument is of const type, then directly write in the + constant now. *) + if List.mem Const flags then + match const_valuator with + None -> + if List.mem Pointer flags then + Printf.fprintf chan "0" + else + Printf.fprintf chan "1" + | Some f -> Printf.fprintf chan "%s" (string_of_int (f arg_number)) + else + Printf.fprintf chan "arg%d_%s" arg_number ty + in + let rec print_args arg_number tys = + match tys with + [] -> () + | [ty] -> print_arg chan arg_number ty + | ty::tys -> + print_arg chan arg_number ty; + Printf.fprintf chan ", "; + print_args (arg_number + 1) tys + in + print_args 0 (List.tl c_types); + Printf.fprintf chan ");\n" + +(* Emit epilogue code to a test source file. *) +let emit_epilogue chan features regexps = + let no_op = List.exists (fun feature -> feature = No_op) features in + Printf.fprintf chan "}\n\n"; + (if not no_op then + List.iter (fun regexp -> + Printf.fprintf chan + "/* { dg-final { scan-assembler \"%s\" } } */\n" regexp) + regexps + else + () + ); + Printf.fprintf chan "/* { dg-final { cleanup-saved-temps } } */\n" + +(* Check a list of C types to determine which ones are pointers and which + ones are const. *) +let check_types tys = + let tys' = + List.map (fun ty -> + let len = String.length ty in + if len > 2 && String.get ty (len - 2) = ' ' + && String.get ty (len - 1) = '*' + then ([Pointer], String.sub ty 0 (len - 2)) + else ([], ty)) tys + in + List.map (fun (flags, ty) -> + if String.length ty > 6 && String.sub ty 0 6 = "const " + then (Const :: flags, String.sub ty 6 ((String.length ty) - 6)) + else (flags, ty)) tys' + +(* Given an intrinsic shape, produce a regexp that will match + the right-hand sides of instructions generated by an intrinsic of + that shape. *) +let rec analyze_shape shape = + let rec n_things n thing = + match n with + 0 -> [] + | n -> thing :: (n_things (n - 1) thing) + in + let rec analyze_shape_elt elt = + match elt with + Dreg -> "\\[dD\\]\\[0-9\\]+" + | Qreg -> "\\[qQ\\]\\[0-9\\]+" + | Corereg -> "\\[rR\\]\\[0-9\\]+" + | Immed -> "#\\[0-9\\]+" + | VecArray (1, elt) -> + let elt_regexp = analyze_shape_elt elt in + "((\\\\\\{" ^ elt_regexp ^ "\\\\\\})|(" ^ elt_regexp ^ "))" + | VecArray (n, elt) -> + let elt_regexp = analyze_shape_elt elt in + let alt1 = elt_regexp ^ "-" ^ elt_regexp in + let alt2 = commas (fun x -> x) (n_things n elt_regexp) "" in + "\\\\\\{((" ^ alt1 ^ ")|(" ^ alt2 ^ "))\\\\\\}" + | (PtrTo elt | CstPtrTo elt) -> + "\\\\\\[" ^ (analyze_shape_elt elt) ^ "\\\\\\]" + | Element_of_dreg -> (analyze_shape_elt Dreg) ^ "\\\\\\[\\[0-9\\]+\\\\\\]" + | Element_of_qreg -> (analyze_shape_elt Qreg) ^ "\\\\\\[\\[0-9\\]+\\\\\\]" + | All_elements_of_dreg -> (analyze_shape_elt Dreg) ^ "\\\\\\[\\\\\\]" + | Alternatives (elts) -> "(" ^ (String.concat "|" (List.map analyze_shape_elt elts)) ^ ")" + in + match shape with + All (n, elt) -> commas analyze_shape_elt (n_things n elt) "" + | Long -> (analyze_shape_elt Qreg) ^ ", " ^ (analyze_shape_elt Dreg) ^ + ", " ^ (analyze_shape_elt Dreg) + | Long_noreg elt -> (analyze_shape_elt elt) ^ ", " ^ (analyze_shape_elt elt) + | Wide -> (analyze_shape_elt Qreg) ^ ", " ^ (analyze_shape_elt Qreg) ^ + ", " ^ (analyze_shape_elt Dreg) + | Wide_noreg elt -> analyze_shape (Long_noreg elt) + | Narrow -> (analyze_shape_elt Dreg) ^ ", " ^ (analyze_shape_elt Qreg) ^ + ", " ^ (analyze_shape_elt Qreg) + | Use_operands elts -> commas analyze_shape_elt (Array.to_list elts) "" + | By_scalar Dreg -> + analyze_shape (Use_operands [| Dreg; Dreg; Element_of_dreg |]) + | By_scalar Qreg -> + analyze_shape (Use_operands [| Qreg; Qreg; Element_of_dreg |]) + | By_scalar _ -> assert false + | Wide_lane -> + analyze_shape (Use_operands [| Qreg; Dreg; Element_of_dreg |]) + | Wide_scalar -> + analyze_shape (Use_operands [| Qreg; Dreg; Element_of_dreg |]) + | Pair_result elt -> + let elt_regexp = analyze_shape_elt elt in + elt_regexp ^ ", " ^ elt_regexp + | Unary_scalar _ -> "FIXME Unary_scalar" + | Binary_imm elt -> analyze_shape (Use_operands [| elt; elt; Immed |]) + | Narrow_imm -> analyze_shape (Use_operands [| Dreg; Qreg; Immed |]) + | Long_imm -> analyze_shape (Use_operands [| Qreg; Dreg; Immed |]) + +(* Generate tests for one intrinsic. *) +let test_intrinsic dir opcode features shape name munge elt_ty = + (* Open the test source file. *) + let test_name = name ^ (string_of_elt elt_ty) in + let chan = open_test_file dir test_name in + (* Work out what argument and return types the intrinsic has. *) + let c_arity, new_elt_ty = munge shape elt_ty in + let c_types = check_types (strings_of_arity c_arity) in + (* Extract any constant valuator (a function specifying what constant + values are to be written into the intrinsic call) from the features + list. *) + let const_valuator = + try + match (List.find (fun feature -> match feature with + Const_valuator _ -> true + | _ -> false) features) with + Const_valuator f -> Some f + | _ -> assert false + with Not_found -> None + in + (* Work out what instruction name(s) to expect. *) + let insns = get_insn_names features name in + let no_suffix = (new_elt_ty = NoElts) in + let insns = + if no_suffix then insns + else List.map (fun insn -> + let suffix = string_of_elt_dots new_elt_ty in + insn ^ "\\." ^ suffix) insns + in + (* Construct a regexp to match against the expected instruction name(s). *) + let insn_regexp = + match insns with + [] -> assert false + | [insn] -> insn + | _ -> + let rec calc_regexp insns cur_regexp = + match insns with + [] -> cur_regexp + | [insn] -> cur_regexp ^ "(" ^ insn ^ "))" + | insn::insns -> calc_regexp insns (cur_regexp ^ "(" ^ insn ^ ")|") + in calc_regexp insns "(" + in + (* Construct regexps to match against the instructions that this + intrinsic expands to. Watch out for any writeback character and + comments after the instruction. *) + let regexps = List.map (fun regexp -> insn_regexp ^ "\\[ \t\\]+" ^ regexp ^ + "!?\\(\\[ \t\\]+@\\[a-zA-Z0-9 \\]+\\)?\\n") + (analyze_all_shapes features shape analyze_shape) + in + (* Emit file and function prologues. *) + emit_prologue chan test_name; + (* Emit local variable declarations. *) + emit_automatics chan c_types features; + Printf.fprintf chan "\n"; + (* Emit the call to the intrinsic. *) + emit_call chan const_valuator c_types name elt_ty; + (* Emit the function epilogue and the DejaGNU scan-assembler directives. *) + emit_epilogue chan features regexps; + (* Close the test file. *) + close_out chan + +(* Generate tests for one element of the "ops" table. *) +let test_intrinsic_group dir (opcode, features, shape, name, munge, types) = + List.iter (test_intrinsic dir opcode features shape name munge) types + +(* Program entry point. *) +let _ = + let directory = if Array.length Sys.argv <> 1 then Sys.argv.(1) else "." in + List.iter (test_intrinsic_group directory) (reinterp @ ops) + diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md new file mode 100644 index 000000000..247dc1ff4 --- /dev/null +++ b/gcc/config/arm/neon.md @@ -0,0 +1,5476 @@ +;; ARM NEON coprocessor Machine Description +;; Copyright (C) 2006, 2007, 2008, 2009, 2010 Free Software Foundation, Inc. +;; Written by CodeSourcery. +;; +;; This file is part of GCC. +;; +;; GCC is free software; you can redistribute it and/or modify it +;; under the terms of the GNU General Public License as published by +;; the Free Software Foundation; either version 3, or (at your option) +;; any later version. +;; +;; GCC is distributed in the hope that it will be useful, but +;; WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;; General Public License for more details. +;; +;; You should have received a copy of the GNU General Public License +;; along with GCC; see the file COPYING3. If not see +;; . + +;; Constants for unspecs. +(define_constants + [(UNSPEC_ASHIFT_SIGNED 65) + (UNSPEC_ASHIFT_UNSIGNED 66) + (UNSPEC_VABD 69) + (UNSPEC_VABDL 70) + (UNSPEC_VADD 72) + (UNSPEC_VADDHN 73) + (UNSPEC_VADDL 74) + (UNSPEC_VADDW 75) + (UNSPEC_VBSL 78) + (UNSPEC_VCAGE 79) + (UNSPEC_VCAGT 80) + (UNSPEC_VCEQ 81) + (UNSPEC_VCGE 82) + (UNSPEC_VCGT 83) + (UNSPEC_VCLS 84) + (UNSPEC_VCVT 88) + (UNSPEC_VCVT_N 89) + (UNSPEC_VEXT 93) + (UNSPEC_VHADD 97) + (UNSPEC_VHSUB 98) + (UNSPEC_VLD1 99) + (UNSPEC_VLD1_DUP 100) + (UNSPEC_VLD1_LANE 101) + (UNSPEC_VLD2 102) + (UNSPEC_VLD2_DUP 103) + (UNSPEC_VLD2_LANE 104) + (UNSPEC_VLD3 105) + (UNSPEC_VLD3A 106) + (UNSPEC_VLD3B 107) + (UNSPEC_VLD3_DUP 108) + (UNSPEC_VLD3_LANE 109) + (UNSPEC_VLD4 110) + (UNSPEC_VLD4A 111) + (UNSPEC_VLD4B 112) + (UNSPEC_VLD4_DUP 113) + (UNSPEC_VLD4_LANE 114) + (UNSPEC_VMAX 115) + (UNSPEC_VMIN 116) + (UNSPEC_VMLA 117) + (UNSPEC_VMLAL 118) + (UNSPEC_VMLA_LANE 119) + (UNSPEC_VMLAL_LANE 120) + (UNSPEC_VMLS 121) + (UNSPEC_VMLSL 122) + (UNSPEC_VMLS_LANE 123) + (UNSPEC_VMLSL_LANE 124) + (UNSPEC_VMOVL 125) + (UNSPEC_VMOVN 126) + (UNSPEC_VMUL 127) + (UNSPEC_VMULL 128) + (UNSPEC_VMUL_LANE 129) + (UNSPEC_VMULL_LANE 130) + (UNSPEC_VPADAL 135) + (UNSPEC_VPADD 136) + (UNSPEC_VPADDL 137) + (UNSPEC_VPMAX 138) + (UNSPEC_VPMIN 139) + (UNSPEC_VPSMAX 140) + (UNSPEC_VPSMIN 141) + (UNSPEC_VPUMAX 142) + (UNSPEC_VPUMIN 143) + (UNSPEC_VQABS 144) + (UNSPEC_VQADD 145) + (UNSPEC_VQDMLAL 146) + (UNSPEC_VQDMLAL_LANE 147) + (UNSPEC_VQDMLSL 148) + (UNSPEC_VQDMLSL_LANE 149) + (UNSPEC_VQDMULH 150) + (UNSPEC_VQDMULH_LANE 151) + (UNSPEC_VQDMULL 152) + (UNSPEC_VQDMULL_LANE 153) + (UNSPEC_VQMOVN 154) + (UNSPEC_VQMOVUN 155) + (UNSPEC_VQNEG 156) + (UNSPEC_VQSHL 157) + (UNSPEC_VQSHL_N 158) + (UNSPEC_VQSHLU_N 159) + (UNSPEC_VQSHRN_N 160) + (UNSPEC_VQSHRUN_N 161) + (UNSPEC_VQSUB 162) + (UNSPEC_VRECPE 163) + (UNSPEC_VRECPS 164) + (UNSPEC_VREV16 165) + (UNSPEC_VREV32 166) + (UNSPEC_VREV64 167) + (UNSPEC_VRSQRTE 168) + (UNSPEC_VRSQRTS 169) + (UNSPEC_VSHL 171) + (UNSPEC_VSHLL_N 172) + (UNSPEC_VSHL_N 173) + (UNSPEC_VSHR_N 174) + (UNSPEC_VSHRN_N 175) + (UNSPEC_VSLI 176) + (UNSPEC_VSRA_N 177) + (UNSPEC_VSRI 178) + (UNSPEC_VST1 179) + (UNSPEC_VST1_LANE 180) + (UNSPEC_VST2 181) + (UNSPEC_VST2_LANE 182) + (UNSPEC_VST3 183) + (UNSPEC_VST3A 184) + (UNSPEC_VST3B 185) + (UNSPEC_VST3_LANE 186) + (UNSPEC_VST4 187) + (UNSPEC_VST4A 188) + (UNSPEC_VST4B 189) + (UNSPEC_VST4_LANE 190) + (UNSPEC_VSTRUCTDUMMY 191) + (UNSPEC_VSUB 192) + (UNSPEC_VSUBHN 193) + (UNSPEC_VSUBL 194) + (UNSPEC_VSUBW 195) + (UNSPEC_VTBL 196) + (UNSPEC_VTBX 197) + (UNSPEC_VTRN1 198) + (UNSPEC_VTRN2 199) + (UNSPEC_VTST 200) + (UNSPEC_VUZP1 201) + (UNSPEC_VUZP2 202) + (UNSPEC_VZIP1 203) + (UNSPEC_VZIP2 204) + (UNSPEC_MISALIGNED_ACCESS 205) + (UNSPEC_VCLE 206) + (UNSPEC_VCLT 207)]) + + +;; Attribute used to permit string comparisons against in +;; neon_type attribute definitions. +(define_attr "vqh_mnem" "vadd,vmin,vmax" (const_string "vadd")) + +(define_insn "*neon_mov" + [(set (match_operand:VD 0 "nonimmediate_operand" + "=w,Uv,w, w, ?r,?w,?r,?r, ?Us") + (match_operand:VD 1 "general_operand" + " w,w, Dn,Uvi, w, r, r, Usi,r"))] + "TARGET_NEON + && (register_operand (operands[0], mode) + || register_operand (operands[1], mode))" +{ + if (which_alternative == 2) + { + int width, is_valid; + static char templ[40]; + + is_valid = neon_immediate_valid_for_move (operands[1], mode, + &operands[1], &width); + + gcc_assert (is_valid != 0); + + if (width == 0) + return "vmov.f32\t%P0, %1 @ "; + else + sprintf (templ, "vmov.i%d\t%%P0, %%1 @ ", width); + + return templ; + } + + /* FIXME: If the memory layout is changed in big-endian mode, output_move_vfp + below must be changed to output_move_neon (which will use the + element/structure loads/stores), and the constraint changed to 'Um' instead + of 'Uv'. */ + + switch (which_alternative) + { + case 0: return "vmov\t%P0, %P1 @ "; + case 1: case 3: return output_move_vfp (operands); + case 2: gcc_unreachable (); + case 4: return "vmov\t%Q0, %R0, %P1 @ "; + case 5: return "vmov\t%P0, %Q1, %R1 @ "; + default: return output_move_double (operands); + } +} + [(set_attr "neon_type" "neon_int_1,*,neon_vmov,*,neon_mrrc,neon_mcr_2_mcrr,*,*,*") + (set_attr "type" "*,f_stored,*,f_loadd,*,*,alu,load2,store2") + (set_attr "insn" "*,*,*,*,*,*,mov,*,*") + (set_attr "length" "4,4,4,4,4,4,8,8,8") + (set_attr "pool_range" "*,*,*,1020,*,*,*,1020,*") + (set_attr "neg_pool_range" "*,*,*,1008,*,*,*,1008,*")]) + +(define_insn "*neon_mov" + [(set (match_operand:VQXMOV 0 "nonimmediate_operand" + "=w,Un,w, w, ?r,?w,?r,?r, ?Us") + (match_operand:VQXMOV 1 "general_operand" + " w,w, Dn,Uni, w, r, r, Usi, r"))] + "TARGET_NEON + && (register_operand (operands[0], mode) + || register_operand (operands[1], mode))" +{ + if (which_alternative == 2) + { + int width, is_valid; + static char templ[40]; + + is_valid = neon_immediate_valid_for_move (operands[1], mode, + &operands[1], &width); + + gcc_assert (is_valid != 0); + + if (width == 0) + return "vmov.f32\t%q0, %1 @ "; + else + sprintf (templ, "vmov.i%d\t%%q0, %%1 @ ", width); + + return templ; + } + + switch (which_alternative) + { + case 0: return "vmov\t%q0, %q1 @ "; + case 1: case 3: return output_move_neon (operands); + case 2: gcc_unreachable (); + case 4: return "vmov\t%Q0, %R0, %e1 @ \;vmov\t%J0, %K0, %f1"; + case 5: return "vmov\t%e0, %Q1, %R1 @ \;vmov\t%f0, %J1, %K1"; + default: return output_move_quad (operands); + } +} + [(set_attr "neon_type" "neon_int_1,neon_stm_2,neon_vmov,neon_ldm_2,\ + neon_mrrc,neon_mcr_2_mcrr,*,*,*") + (set_attr "type" "*,*,*,*,*,*,alu,load4,store4") + (set_attr "insn" "*,*,*,*,*,*,mov,*,*") + (set_attr "length" "4,8,4,8,8,8,16,8,16") + (set_attr "pool_range" "*,*,*,1020,*,*,*,1020,*") + (set_attr "neg_pool_range" "*,*,*,1008,*,*,*,1008,*")]) + +(define_expand "movti" + [(set (match_operand:TI 0 "nonimmediate_operand" "") + (match_operand:TI 1 "general_operand" ""))] + "TARGET_NEON" +{ + if (can_create_pseudo_p ()) + { + if (GET_CODE (operands[0]) != REG) + operands[1] = force_reg (TImode, operands[1]); + } +}) + +(define_expand "mov" + [(set (match_operand:VSTRUCT 0 "nonimmediate_operand" "") + (match_operand:VSTRUCT 1 "general_operand" ""))] + "TARGET_NEON" +{ + if (can_create_pseudo_p ()) + { + if (GET_CODE (operands[0]) != REG) + operands[1] = force_reg (mode, operands[1]); + } +}) + +(define_insn "*neon_mov" + [(set (match_operand:VSTRUCT 0 "nonimmediate_operand" "=w,Ut,w") + (match_operand:VSTRUCT 1 "general_operand" " w,w, Ut"))] + "TARGET_NEON + && (register_operand (operands[0], mode) + || register_operand (operands[1], mode))" +{ + switch (which_alternative) + { + case 0: return "#"; + case 1: case 2: return output_move_neon (operands); + default: gcc_unreachable (); + } +} + [(set_attr "neon_type" "neon_int_1,neon_stm_2,neon_ldm_2") + (set (attr "length") (symbol_ref "arm_attr_length_move_neon (insn)"))]) + +(define_split + [(set (match_operand:EI 0 "s_register_operand" "") + (match_operand:EI 1 "s_register_operand" ""))] + "TARGET_NEON && reload_completed" + [(set (match_dup 0) (match_dup 1)) + (set (match_dup 2) (match_dup 3))] +{ + int rdest = REGNO (operands[0]); + int rsrc = REGNO (operands[1]); + rtx dest[2], src[2]; + + dest[0] = gen_rtx_REG (TImode, rdest); + src[0] = gen_rtx_REG (TImode, rsrc); + dest[1] = gen_rtx_REG (DImode, rdest + 4); + src[1] = gen_rtx_REG (DImode, rsrc + 4); + + neon_disambiguate_copy (operands, dest, src, 2); +}) + +(define_split + [(set (match_operand:OI 0 "s_register_operand" "") + (match_operand:OI 1 "s_register_operand" ""))] + "TARGET_NEON && reload_completed" + [(set (match_dup 0) (match_dup 1)) + (set (match_dup 2) (match_dup 3))] +{ + int rdest = REGNO (operands[0]); + int rsrc = REGNO (operands[1]); + rtx dest[2], src[2]; + + dest[0] = gen_rtx_REG (TImode, rdest); + src[0] = gen_rtx_REG (TImode, rsrc); + dest[1] = gen_rtx_REG (TImode, rdest + 4); + src[1] = gen_rtx_REG (TImode, rsrc + 4); + + neon_disambiguate_copy (operands, dest, src, 2); +}) + +(define_split + [(set (match_operand:CI 0 "s_register_operand" "") + (match_operand:CI 1 "s_register_operand" ""))] + "TARGET_NEON && reload_completed" + [(set (match_dup 0) (match_dup 1)) + (set (match_dup 2) (match_dup 3)) + (set (match_dup 4) (match_dup 5))] +{ + int rdest = REGNO (operands[0]); + int rsrc = REGNO (operands[1]); + rtx dest[3], src[3]; + + dest[0] = gen_rtx_REG (TImode, rdest); + src[0] = gen_rtx_REG (TImode, rsrc); + dest[1] = gen_rtx_REG (TImode, rdest + 4); + src[1] = gen_rtx_REG (TImode, rsrc + 4); + dest[2] = gen_rtx_REG (TImode, rdest + 8); + src[2] = gen_rtx_REG (TImode, rsrc + 8); + + neon_disambiguate_copy (operands, dest, src, 3); +}) + +(define_split + [(set (match_operand:XI 0 "s_register_operand" "") + (match_operand:XI 1 "s_register_operand" ""))] + "TARGET_NEON && reload_completed" + [(set (match_dup 0) (match_dup 1)) + (set (match_dup 2) (match_dup 3)) + (set (match_dup 4) (match_dup 5)) + (set (match_dup 6) (match_dup 7))] +{ + int rdest = REGNO (operands[0]); + int rsrc = REGNO (operands[1]); + rtx dest[4], src[4]; + + dest[0] = gen_rtx_REG (TImode, rdest); + src[0] = gen_rtx_REG (TImode, rsrc); + dest[1] = gen_rtx_REG (TImode, rdest + 4); + src[1] = gen_rtx_REG (TImode, rsrc + 4); + dest[2] = gen_rtx_REG (TImode, rdest + 8); + src[2] = gen_rtx_REG (TImode, rsrc + 8); + dest[3] = gen_rtx_REG (TImode, rdest + 12); + src[3] = gen_rtx_REG (TImode, rsrc + 12); + + neon_disambiguate_copy (operands, dest, src, 4); +}) + +(define_expand "movmisalign" + [(set (match_operand:VDQX 0 "nonimmediate_operand" "") + (unspec:VDQX [(match_operand:VDQX 1 "general_operand" "")] + UNSPEC_MISALIGNED_ACCESS))] + "TARGET_NEON && !BYTES_BIG_ENDIAN" +{ + /* This pattern is not permitted to fail during expansion: if both arguments + are non-registers (e.g. memory := constant, which can be created by the + auto-vectorizer), force operand 1 into a register. */ + if (!s_register_operand (operands[0], mode) + && !s_register_operand (operands[1], mode)) + operands[1] = force_reg (mode, operands[1]); +}) + +(define_insn "*movmisalign_neon_store" + [(set (match_operand:VDX 0 "memory_operand" "=Um") + (unspec:VDX [(match_operand:VDX 1 "s_register_operand" " w")] + UNSPEC_MISALIGNED_ACCESS))] + "TARGET_NEON && !BYTES_BIG_ENDIAN" + "vst1.\t{%P1}, %A0" + [(set_attr "neon_type" "neon_vst1_1_2_regs_vst2_2_regs")]) + +(define_insn "*movmisalign_neon_load" + [(set (match_operand:VDX 0 "s_register_operand" "=w") + (unspec:VDX [(match_operand:VDX 1 "memory_operand" " Um")] + UNSPEC_MISALIGNED_ACCESS))] + "TARGET_NEON && !BYTES_BIG_ENDIAN" + "vld1.\t{%P0}, %A1" + [(set_attr "neon_type" "neon_vld1_1_2_regs")]) + +(define_insn "*movmisalign_neon_store" + [(set (match_operand:VQX 0 "memory_operand" "=Um") + (unspec:VQX [(match_operand:VQX 1 "s_register_operand" " w")] + UNSPEC_MISALIGNED_ACCESS))] + "TARGET_NEON && !BYTES_BIG_ENDIAN" + "vst1.\t{%q1}, %A0" + [(set_attr "neon_type" "neon_vst1_1_2_regs_vst2_2_regs")]) + +(define_insn "*movmisalign_neon_load" + [(set (match_operand:VQX 0 "s_register_operand" "=w") + (unspec:VQX [(match_operand:VQX 1 "memory_operand" " Um")] + UNSPEC_MISALIGNED_ACCESS))] + "TARGET_NEON && !BYTES_BIG_ENDIAN" + "vld1.\t{%q0}, %A1" + [(set_attr "neon_type" "neon_vld1_1_2_regs")]) + +(define_insn "vec_set_internal" + [(set (match_operand:VD 0 "s_register_operand" "=w") + (vec_merge:VD + (vec_duplicate:VD + (match_operand: 1 "s_register_operand" "r")) + (match_operand:VD 3 "s_register_operand" "0") + (match_operand:SI 2 "immediate_operand" "i")))] + "TARGET_NEON" +{ + int elt = ffs ((int) INTVAL (operands[2])) - 1; + if (BYTES_BIG_ENDIAN) + elt = GET_MODE_NUNITS (mode) - 1 - elt; + operands[2] = GEN_INT (elt); + + return "vmov%?.\t%P0[%c2], %1"; +} + [(set_attr "predicable" "yes") + (set_attr "neon_type" "neon_mcr")]) + +(define_insn "vec_set_internal" + [(set (match_operand:VQ 0 "s_register_operand" "=w") + (vec_merge:VQ + (vec_duplicate:VQ + (match_operand: 1 "s_register_operand" "r")) + (match_operand:VQ 3 "s_register_operand" "0") + (match_operand:SI 2 "immediate_operand" "i")))] + "TARGET_NEON" +{ + HOST_WIDE_INT elem = ffs ((int) INTVAL (operands[2])) - 1; + int half_elts = GET_MODE_NUNITS (mode) / 2; + int elt = elem % half_elts; + int hi = (elem / half_elts) * 2; + int regno = REGNO (operands[0]); + + if (BYTES_BIG_ENDIAN) + elt = half_elts - 1 - elt; + + operands[0] = gen_rtx_REG (mode, regno + hi); + operands[2] = GEN_INT (elt); + + return "vmov%?.\t%P0[%c2], %1"; +} + [(set_attr "predicable" "yes") + (set_attr "neon_type" "neon_mcr")] +) + +(define_insn "vec_setv2di_internal" + [(set (match_operand:V2DI 0 "s_register_operand" "=w") + (vec_merge:V2DI + (vec_duplicate:V2DI + (match_operand:DI 1 "s_register_operand" "r")) + (match_operand:V2DI 3 "s_register_operand" "0") + (match_operand:SI 2 "immediate_operand" "i")))] + "TARGET_NEON" +{ + HOST_WIDE_INT elem = ffs ((int) INTVAL (operands[2])) - 1; + int regno = REGNO (operands[0]) + 2 * elem; + + operands[0] = gen_rtx_REG (DImode, regno); + + return "vmov%?\t%P0, %Q1, %R1"; +} + [(set_attr "predicable" "yes") + (set_attr "neon_type" "neon_mcr_2_mcrr")] +) + +(define_expand "vec_set" + [(match_operand:VDQ 0 "s_register_operand" "") + (match_operand: 1 "s_register_operand" "") + (match_operand:SI 2 "immediate_operand" "")] + "TARGET_NEON" +{ + HOST_WIDE_INT elem = (HOST_WIDE_INT) 1 << INTVAL (operands[2]); + emit_insn (gen_vec_set_internal (operands[0], operands[1], + GEN_INT (elem), operands[0])); + DONE; +}) + +(define_insn "vec_extract" + [(set (match_operand: 0 "s_register_operand" "=r") + (vec_select: + (match_operand:VD 1 "s_register_operand" "w") + (parallel [(match_operand:SI 2 "immediate_operand" "i")])))] + "TARGET_NEON" +{ + if (BYTES_BIG_ENDIAN) + { + int elt = INTVAL (operands[2]); + elt = GET_MODE_NUNITS (mode) - 1 - elt; + operands[2] = GEN_INT (elt); + } + return "vmov%?.\t%0, %P1[%c2]"; +} + [(set_attr "predicable" "yes") + (set_attr "neon_type" "neon_bp_simple")] +) + +(define_insn "vec_extract" + [(set (match_operand: 0 "s_register_operand" "=r") + (vec_select: + (match_operand:VQ 1 "s_register_operand" "w") + (parallel [(match_operand:SI 2 "immediate_operand" "i")])))] + "TARGET_NEON" +{ + int half_elts = GET_MODE_NUNITS (mode) / 2; + int elt = INTVAL (operands[2]) % half_elts; + int hi = (INTVAL (operands[2]) / half_elts) * 2; + int regno = REGNO (operands[1]); + + if (BYTES_BIG_ENDIAN) + elt = half_elts - 1 - elt; + + operands[1] = gen_rtx_REG (mode, regno + hi); + operands[2] = GEN_INT (elt); + + return "vmov%?.\t%0, %P1[%c2]"; +} + [(set_attr "predicable" "yes") + (set_attr "neon_type" "neon_bp_simple")] +) + +(define_insn "vec_extractv2di" + [(set (match_operand:DI 0 "s_register_operand" "=r") + (vec_select:DI + (match_operand:V2DI 1 "s_register_operand" "w") + (parallel [(match_operand:SI 2 "immediate_operand" "i")])))] + "TARGET_NEON" +{ + int regno = REGNO (operands[1]) + 2 * INTVAL (operands[2]); + + operands[1] = gen_rtx_REG (DImode, regno); + + return "vmov%?\t%Q0, %R0, %P1 @ v2di"; +} + [(set_attr "predicable" "yes") + (set_attr "neon_type" "neon_int_1")] +) + +(define_expand "vec_init" + [(match_operand:VDQ 0 "s_register_operand" "") + (match_operand 1 "" "")] + "TARGET_NEON" +{ + neon_expand_vector_init (operands[0], operands[1]); + DONE; +}) + +;; Doubleword and quadword arithmetic. + +;; NOTE: some other instructions also support 64-bit integer +;; element size, which we could potentially use for "long long" operations. + +(define_insn "*add3_neon" + [(set (match_operand:VDQ 0 "s_register_operand" "=w") + (plus:VDQ (match_operand:VDQ 1 "s_register_operand" "w") + (match_operand:VDQ 2 "s_register_operand" "w")))] + "TARGET_NEON && (! || flag_unsafe_math_optimizations)" + "vadd.\t%0, %1, %2" + [(set (attr "neon_type") + (if_then_else (ne (symbol_ref "") (const_int 0)) + (if_then_else (ne (symbol_ref "") (const_int 0)) + (const_string "neon_fp_vadd_ddd_vabs_dd") + (const_string "neon_fp_vadd_qqq_vabs_qq")) + (const_string "neon_int_1")))] +) + +(define_insn "adddi3_neon" + [(set (match_operand:DI 0 "s_register_operand" "=w,?&r,?&r") + (plus:DI (match_operand:DI 1 "s_register_operand" "%w,0,0") + (match_operand:DI 2 "s_register_operand" "w,r,0"))) + (clobber (reg:CC CC_REGNUM))] + "TARGET_NEON" +{ + switch (which_alternative) + { + case 0: return "vadd.i64\t%P0, %P1, %P2"; + case 1: return "#"; + case 2: return "#"; + default: gcc_unreachable (); + } +} + [(set_attr "neon_type" "neon_int_1,*,*") + (set_attr "conds" "*,clob,clob") + (set_attr "length" "*,8,8")] +) + +(define_insn "*sub3_neon" + [(set (match_operand:VDQ 0 "s_register_operand" "=w") + (minus:VDQ (match_operand:VDQ 1 "s_register_operand" "w") + (match_operand:VDQ 2 "s_register_operand" "w")))] + "TARGET_NEON && (! || flag_unsafe_math_optimizations)" + "vsub.\t%0, %1, %2" + [(set (attr "neon_type") + (if_then_else (ne (symbol_ref "") (const_int 0)) + (if_then_else (ne (symbol_ref "") (const_int 0)) + (const_string "neon_fp_vadd_ddd_vabs_dd") + (const_string "neon_fp_vadd_qqq_vabs_qq")) + (const_string "neon_int_2")))] +) + +(define_insn "subdi3_neon" + [(set (match_operand:DI 0 "s_register_operand" "=w,?&r,?&r,?&r") + (minus:DI (match_operand:DI 1 "s_register_operand" "w,0,r,0") + (match_operand:DI 2 "s_register_operand" "w,r,0,0"))) + (clobber (reg:CC CC_REGNUM))] + "TARGET_NEON" +{ + switch (which_alternative) + { + case 0: return "vsub.i64\t%P0, %P1, %P2"; + case 1: /* fall through */ + case 2: /* fall through */ + case 3: return "subs\\t%Q0, %Q1, %Q2\;sbc\\t%R0, %R1, %R2"; + default: gcc_unreachable (); + } +} + [(set_attr "neon_type" "neon_int_2,*,*,*") + (set_attr "conds" "*,clob,clob,clob") + (set_attr "length" "*,8,8,8")] +) + +(define_insn "*mul3_neon" + [(set (match_operand:VDQ 0 "s_register_operand" "=w") + (mult:VDQ (match_operand:VDQ 1 "s_register_operand" "w") + (match_operand:VDQ 2 "s_register_operand" "w")))] + "TARGET_NEON && (! || flag_unsafe_math_optimizations)" + "vmul.\t%0, %1, %2" + [(set (attr "neon_type") + (if_then_else (ne (symbol_ref "") (const_int 0)) + (if_then_else (ne (symbol_ref "") (const_int 0)) + (const_string "neon_fp_vadd_ddd_vabs_dd") + (const_string "neon_fp_vadd_qqq_vabs_qq")) + (if_then_else (ne (symbol_ref "") (const_int 0)) + (if_then_else + (ne (symbol_ref "") (const_int 0)) + (const_string "neon_mul_ddd_8_16_qdd_16_8_long_32_16_long") + (const_string "neon_mul_qqq_8_16_32_ddd_32")) + (if_then_else (ne (symbol_ref "") (const_int 0)) + (const_string "neon_mul_qqq_8_16_32_ddd_32") + (const_string "neon_mul_qqq_8_16_32_ddd_32")))))] +) + +(define_insn "mul3add_neon" + [(set (match_operand:VDQ 0 "s_register_operand" "=w") + (plus:VDQ (mult:VDQ (match_operand:VDQ 2 "s_register_operand" "w") + (match_operand:VDQ 3 "s_register_operand" "w")) + (match_operand:VDQ 1 "s_register_operand" "0")))] + "TARGET_NEON && (! || flag_unsafe_math_optimizations)" + "vmla.\t%0, %2, %3" + [(set (attr "neon_type") + (if_then_else (ne (symbol_ref "") (const_int 0)) + (if_then_else (ne (symbol_ref "") (const_int 0)) + (const_string "neon_fp_vmla_ddd") + (const_string "neon_fp_vmla_qqq")) + (if_then_else (ne (symbol_ref "") (const_int 0)) + (if_then_else + (ne (symbol_ref "") (const_int 0)) + (const_string "neon_mla_ddd_8_16_qdd_16_8_long_32_16_long") + (const_string "neon_mla_ddd_32_qqd_16_ddd_32_scalar_qdd_64_32_long_scalar_qdd_64_32_long")) + (if_then_else (ne (symbol_ref "") (const_int 0)) + (const_string "neon_mla_qqq_8_16") + (const_string "neon_mla_qqq_32_qqd_32_scalar")))))] +) + +(define_insn "mul3negadd_neon" + [(set (match_operand:VDQ 0 "s_register_operand" "=w") + (minus:VDQ (match_operand:VDQ 1 "s_register_operand" "0") + (mult:VDQ (match_operand:VDQ 2 "s_register_operand" "w") + (match_operand:VDQ 3 "s_register_operand" "w"))))] + "TARGET_NEON && (! || flag_unsafe_math_optimizations)" + "vmls.\t%0, %2, %3" + [(set (attr "neon_type") + (if_then_else (ne (symbol_ref "") (const_int 0)) + (if_then_else (ne (symbol_ref "") (const_int 0)) + (const_string "neon_fp_vmla_ddd") + (const_string "neon_fp_vmla_qqq")) + (if_then_else (ne (symbol_ref "") (const_int 0)) + (if_then_else + (ne (symbol_ref "") (const_int 0)) + (const_string "neon_mla_ddd_8_16_qdd_16_8_long_32_16_long") + (const_string "neon_mla_ddd_32_qqd_16_ddd_32_scalar_qdd_64_32_long_scalar_qdd_64_32_long")) + (if_then_else (ne (symbol_ref "") (const_int 0)) + (const_string "neon_mla_qqq_8_16") + (const_string "neon_mla_qqq_32_qqd_32_scalar")))))] +) + +(define_insn "ior3" + [(set (match_operand:VDQ 0 "s_register_operand" "=w,w") + (ior:VDQ (match_operand:VDQ 1 "s_register_operand" "w,0") + (match_operand:VDQ 2 "neon_logic_op2" "w,Dl")))] + "TARGET_NEON" +{ + switch (which_alternative) + { + case 0: return "vorr\t%0, %1, %2"; + case 1: return neon_output_logic_immediate ("vorr", &operands[2], + mode, 0, VALID_NEON_QREG_MODE (mode)); + default: gcc_unreachable (); + } +} + [(set_attr "neon_type" "neon_int_1")] +) + +(define_insn "iordi3_neon" + [(set (match_operand:DI 0 "s_register_operand" "=w,w,?&r,?&r") + (ior:DI (match_operand:DI 1 "s_register_operand" "%w,0,0,r") + (match_operand:DI 2 "neon_logic_op2" "w,Dl,r,r")))] + "TARGET_NEON" +{ + switch (which_alternative) + { + case 0: return "vorr\t%P0, %P1, %P2"; + case 1: return neon_output_logic_immediate ("vorr", &operands[2], + DImode, 0, VALID_NEON_QREG_MODE (DImode)); + case 2: return "#"; + case 3: return "#"; + default: gcc_unreachable (); + } +} + [(set_attr "neon_type" "neon_int_1,neon_int_1,*,*") + (set_attr "length" "*,*,8,8")] +) + +;; The concrete forms of the Neon immediate-logic instructions are vbic and +;; vorr. We support the pseudo-instruction vand instead, because that +;; corresponds to the canonical form the middle-end expects to use for +;; immediate bitwise-ANDs. + +(define_insn "and3" + [(set (match_operand:VDQ 0 "s_register_operand" "=w,w") + (and:VDQ (match_operand:VDQ 1 "s_register_operand" "w,0") + (match_operand:VDQ 2 "neon_inv_logic_op2" "w,DL")))] + "TARGET_NEON" +{ + switch (which_alternative) + { + case 0: return "vand\t%0, %1, %2"; + case 1: return neon_output_logic_immediate ("vand", &operands[2], + mode, 1, VALID_NEON_QREG_MODE (mode)); + default: gcc_unreachable (); + } +} + [(set_attr "neon_type" "neon_int_1")] +) + +(define_insn "anddi3_neon" + [(set (match_operand:DI 0 "s_register_operand" "=w,w,?&r,?&r") + (and:DI (match_operand:DI 1 "s_register_operand" "%w,0,0,r") + (match_operand:DI 2 "neon_inv_logic_op2" "w,DL,r,r")))] + "TARGET_NEON" +{ + switch (which_alternative) + { + case 0: return "vand\t%P0, %P1, %P2"; + case 1: return neon_output_logic_immediate ("vand", &operands[2], + DImode, 1, VALID_NEON_QREG_MODE (DImode)); + case 2: return "#"; + case 3: return "#"; + default: gcc_unreachable (); + } +} + [(set_attr "neon_type" "neon_int_1,neon_int_1,*,*") + (set_attr "length" "*,*,8,8")] +) + +(define_insn "orn3_neon" + [(set (match_operand:VDQ 0 "s_register_operand" "=w") + (ior:VDQ (match_operand:VDQ 1 "s_register_operand" "w") + (not:VDQ (match_operand:VDQ 2 "s_register_operand" "w"))))] + "TARGET_NEON" + "vorn\t%0, %1, %2" + [(set_attr "neon_type" "neon_int_1")] +) + +(define_insn "orndi3_neon" + [(set (match_operand:DI 0 "s_register_operand" "=w,?=&r,?&r") + (ior:DI (match_operand:DI 1 "s_register_operand" "w,r,0") + (not:DI (match_operand:DI 2 "s_register_operand" "w,0,r"))))] + "TARGET_NEON" + "@ + vorn\t%P0, %P1, %P2 + # + #" + [(set_attr "neon_type" "neon_int_1,*,*") + (set_attr "length" "*,8,8")] +) + +(define_insn "bic3_neon" + [(set (match_operand:VDQ 0 "s_register_operand" "=w") + (and:VDQ (match_operand:VDQ 1 "s_register_operand" "w") + (not:VDQ (match_operand:VDQ 2 "s_register_operand" "w"))))] + "TARGET_NEON" + "vbic\t%0, %1, %2" + [(set_attr "neon_type" "neon_int_1")] +) + +;; Compare to *anddi_notdi_di. +(define_insn "bicdi3_neon" + [(set (match_operand:DI 0 "s_register_operand" "=w,?=&r,?&r") + (and:DI (not:DI (match_operand:DI 2 "s_register_operand" "w,r,0")) + (match_operand:DI 1 "s_register_operand" "w,0,r")))] + "TARGET_NEON" + "@ + vbic\t%P0, %P1, %P2 + # + #" + [(set_attr "neon_type" "neon_int_1,*,*") + (set_attr "length" "*,8,8")] +) + +(define_insn "xor3" + [(set (match_operand:VDQ 0 "s_register_operand" "=w") + (xor:VDQ (match_operand:VDQ 1 "s_register_operand" "w") + (match_operand:VDQ 2 "s_register_operand" "w")))] + "TARGET_NEON" + "veor\t%0, %1, %2" + [(set_attr "neon_type" "neon_int_1")] +) + +(define_insn "xordi3_neon" + [(set (match_operand:DI 0 "s_register_operand" "=w,?&r,?&r") + (xor:DI (match_operand:DI 1 "s_register_operand" "%w,0,r") + (match_operand:DI 2 "s_register_operand" "w,r,r")))] + "TARGET_NEON" + "@ + veor\t%P0, %P1, %P2 + # + #" + [(set_attr "neon_type" "neon_int_1,*,*") + (set_attr "length" "*,8,8")] +) + +(define_insn "one_cmpl2" + [(set (match_operand:VDQ 0 "s_register_operand" "=w") + (not:VDQ (match_operand:VDQ 1 "s_register_operand" "w")))] + "TARGET_NEON" + "vmvn\t%0, %1" + [(set_attr "neon_type" "neon_int_1")] +) + +(define_insn "abs2" + [(set (match_operand:VDQW 0 "s_register_operand" "=w") + (abs:VDQW (match_operand:VDQW 1 "s_register_operand" "w")))] + "TARGET_NEON" + "vabs.\t%0, %1" + [(set (attr "neon_type") + (if_then_else (ne (symbol_ref "") (const_int 0)) + (if_then_else (ne (symbol_ref "") (const_int 0)) + (const_string "neon_fp_vadd_ddd_vabs_dd") + (const_string "neon_fp_vadd_qqq_vabs_qq")) + (const_string "neon_int_3")))] +) + +(define_insn "neg2" + [(set (match_operand:VDQW 0 "s_register_operand" "=w") + (neg:VDQW (match_operand:VDQW 1 "s_register_operand" "w")))] + "TARGET_NEON" + "vneg.\t%0, %1" + [(set (attr "neon_type") + (if_then_else (ne (symbol_ref "") (const_int 0)) + (if_then_else (ne (symbol_ref "") (const_int 0)) + (const_string "neon_fp_vadd_ddd_vabs_dd") + (const_string "neon_fp_vadd_qqq_vabs_qq")) + (const_string "neon_int_3")))] +) + +(define_insn "*umin3_neon" + [(set (match_operand:VDQIW 0 "s_register_operand" "=w") + (umin:VDQIW (match_operand:VDQIW 1 "s_register_operand" "w") + (match_operand:VDQIW 2 "s_register_operand" "w")))] + "TARGET_NEON" + "vmin.\t%0, %1, %2" + [(set_attr "neon_type" "neon_int_5")] +) + +(define_insn "*umax3_neon" + [(set (match_operand:VDQIW 0 "s_register_operand" "=w") + (umax:VDQIW (match_operand:VDQIW 1 "s_register_operand" "w") + (match_operand:VDQIW 2 "s_register_operand" "w")))] + "TARGET_NEON" + "vmax.\t%0, %1, %2" + [(set_attr "neon_type" "neon_int_5")] +) + +(define_insn "*smin3_neon" + [(set (match_operand:VDQW 0 "s_register_operand" "=w") + (smin:VDQW (match_operand:VDQW 1 "s_register_operand" "w") + (match_operand:VDQW 2 "s_register_operand" "w")))] + "TARGET_NEON" + "vmin.\t%0, %1, %2" + [(set (attr "neon_type") + (if_then_else (ne (symbol_ref "") (const_int 0)) + (const_string "neon_fp_vadd_ddd_vabs_dd") + (const_string "neon_int_5")))] +) + +(define_insn "*smax3_neon" + [(set (match_operand:VDQW 0 "s_register_operand" "=w") + (smax:VDQW (match_operand:VDQW 1 "s_register_operand" "w") + (match_operand:VDQW 2 "s_register_operand" "w")))] + "TARGET_NEON" + "vmax.\t%0, %1, %2" + [(set (attr "neon_type") + (if_then_else (ne (symbol_ref "") (const_int 0)) + (const_string "neon_fp_vadd_ddd_vabs_dd") + (const_string "neon_int_5")))] +) + +; TODO: V2DI shifts are current disabled because there are bugs in the +; generic vectorizer code. It ends up creating a V2DI constructor with +; SImode elements. + +(define_insn "vashl3" + [(set (match_operand:VDQIW 0 "s_register_operand" "=w") + (ashift:VDQIW (match_operand:VDQIW 1 "s_register_operand" "w") + (match_operand:VDQIW 2 "s_register_operand" "w")))] + "TARGET_NEON" + "vshl.\t%0, %1, %2" + [(set (attr "neon_type") + (if_then_else (ne (symbol_ref "") (const_int 0)) + (const_string "neon_vshl_ddd") + (const_string "neon_shift_3")))] +) + +; Used for implementing logical shift-right, which is a left-shift by a negative +; amount, with signed operands. This is essentially the same as ashl3 +; above, but using an unspec in case GCC tries anything tricky with negative +; shift amounts. + +(define_insn "ashl3_signed" + [(set (match_operand:VDQI 0 "s_register_operand" "=w") + (unspec:VDQI [(match_operand:VDQI 1 "s_register_operand" "w") + (match_operand:VDQI 2 "s_register_operand" "w")] + UNSPEC_ASHIFT_SIGNED))] + "TARGET_NEON" + "vshl.\t%0, %1, %2" + [(set (attr "neon_type") + (if_then_else (ne (symbol_ref "") (const_int 0)) + (const_string "neon_vshl_ddd") + (const_string "neon_shift_3")))] +) + +; Used for implementing logical shift-right, which is a left-shift by a negative +; amount, with unsigned operands. + +(define_insn "ashl3_unsigned" + [(set (match_operand:VDQI 0 "s_register_operand" "=w") + (unspec:VDQI [(match_operand:VDQI 1 "s_register_operand" "w") + (match_operand:VDQI 2 "s_register_operand" "w")] + UNSPEC_ASHIFT_UNSIGNED))] + "TARGET_NEON" + "vshl.\t%0, %1, %2" + [(set (attr "neon_type") + (if_then_else (ne (symbol_ref "") (const_int 0)) + (const_string "neon_vshl_ddd") + (const_string "neon_shift_3")))] +) + +(define_expand "vashr3" + [(set (match_operand:VDQIW 0 "s_register_operand" "") + (ashiftrt:VDQIW (match_operand:VDQIW 1 "s_register_operand" "") + (match_operand:VDQIW 2 "s_register_operand" "")))] + "TARGET_NEON" +{ + rtx neg = gen_reg_rtx (mode); + + emit_insn (gen_neg2 (neg, operands[2])); + emit_insn (gen_ashl3_signed (operands[0], operands[1], neg)); + + DONE; +}) + +(define_expand "vlshr3" + [(set (match_operand:VDQIW 0 "s_register_operand" "") + (lshiftrt:VDQIW (match_operand:VDQIW 1 "s_register_operand" "") + (match_operand:VDQIW 2 "s_register_operand" "")))] + "TARGET_NEON" +{ + rtx neg = gen_reg_rtx (mode); + + emit_insn (gen_neg2 (neg, operands[2])); + emit_insn (gen_ashl3_unsigned (operands[0], operands[1], neg)); + + DONE; +}) + +;; Widening operations + +(define_insn "widen_ssum3" + [(set (match_operand: 0 "s_register_operand" "=w") + (plus: (sign_extend: + (match_operand:VW 1 "s_register_operand" "%w")) + (match_operand: 2 "s_register_operand" "w")))] + "TARGET_NEON" + "vaddw.\t%q0, %q2, %P1" + [(set_attr "neon_type" "neon_int_3")] +) + +(define_insn "widen_usum3" + [(set (match_operand: 0 "s_register_operand" "=w") + (plus: (zero_extend: + (match_operand:VW 1 "s_register_operand" "%w")) + (match_operand: 2 "s_register_operand" "w")))] + "TARGET_NEON" + "vaddw.\t%q0, %q2, %P1" + [(set_attr "neon_type" "neon_int_3")] +) + +;; VEXT can be used to synthesize coarse whole-vector shifts with 8-bit +;; shift-count granularity. That's good enough for the middle-end's current +;; needs. + +(define_expand "vec_shr_" + [(match_operand:VDQ 0 "s_register_operand" "") + (match_operand:VDQ 1 "s_register_operand" "") + (match_operand:SI 2 "const_multiple_of_8_operand" "")] + "TARGET_NEON" +{ + rtx zero_reg; + HOST_WIDE_INT num_bits = INTVAL (operands[2]); + const int width = GET_MODE_BITSIZE (mode); + const enum machine_mode bvecmode = (width == 128) ? V16QImode : V8QImode; + rtx (*gen_ext) (rtx, rtx, rtx, rtx) = + (width == 128) ? gen_neon_vextv16qi : gen_neon_vextv8qi; + + if (num_bits == width) + { + emit_move_insn (operands[0], operands[1]); + DONE; + } + + zero_reg = force_reg (bvecmode, CONST0_RTX (bvecmode)); + operands[0] = gen_lowpart (bvecmode, operands[0]); + operands[1] = gen_lowpart (bvecmode, operands[1]); + + emit_insn (gen_ext (operands[0], operands[1], zero_reg, + GEN_INT (num_bits / BITS_PER_UNIT))); + DONE; +}) + +(define_expand "vec_shl_" + [(match_operand:VDQ 0 "s_register_operand" "") + (match_operand:VDQ 1 "s_register_operand" "") + (match_operand:SI 2 "const_multiple_of_8_operand" "")] + "TARGET_NEON" +{ + rtx zero_reg; + HOST_WIDE_INT num_bits = INTVAL (operands[2]); + const int width = GET_MODE_BITSIZE (mode); + const enum machine_mode bvecmode = (width == 128) ? V16QImode : V8QImode; + rtx (*gen_ext) (rtx, rtx, rtx, rtx) = + (width == 128) ? gen_neon_vextv16qi : gen_neon_vextv8qi; + + if (num_bits == 0) + { + emit_move_insn (operands[0], CONST0_RTX (mode)); + DONE; + } + + num_bits = width - num_bits; + + zero_reg = force_reg (bvecmode, CONST0_RTX (bvecmode)); + operands[0] = gen_lowpart (bvecmode, operands[0]); + operands[1] = gen_lowpart (bvecmode, operands[1]); + + emit_insn (gen_ext (operands[0], zero_reg, operands[1], + GEN_INT (num_bits / BITS_PER_UNIT))); + DONE; +}) + +;; Helpers for quad-word reduction operations + +; Add (or smin, smax...) the low N/2 elements of the N-element vector +; operand[1] to the high N/2 elements of same. Put the result in operand[0], an +; N/2-element vector. + +(define_insn "quad_halves_v4si" + [(set (match_operand:V2SI 0 "s_register_operand" "=w") + (vqh_ops:V2SI + (vec_select:V2SI (match_operand:V4SI 1 "s_register_operand" "w") + (parallel [(const_int 0) (const_int 1)])) + (vec_select:V2SI (match_dup 1) + (parallel [(const_int 2) (const_int 3)]))))] + "TARGET_NEON" + ".32\t%P0, %e1, %f1" + [(set_attr "vqh_mnem" "") + (set (attr "neon_type") + (if_then_else (eq_attr "vqh_mnem" "vadd") + (const_string "neon_int_1") (const_string "neon_int_5")))] +) + +(define_insn "quad_halves_v4sf" + [(set (match_operand:V2SF 0 "s_register_operand" "=w") + (vqhs_ops:V2SF + (vec_select:V2SF (match_operand:V4SF 1 "s_register_operand" "w") + (parallel [(const_int 0) (const_int 1)])) + (vec_select:V2SF (match_dup 1) + (parallel [(const_int 2) (const_int 3)]))))] + "TARGET_NEON && flag_unsafe_math_optimizations" + ".f32\t%P0, %e1, %f1" + [(set_attr "vqh_mnem" "") + (set (attr "neon_type") + (if_then_else (eq_attr "vqh_mnem" "vadd") + (const_string "neon_int_1") (const_string "neon_int_5")))] +) + +(define_insn "quad_halves_v8hi" + [(set (match_operand:V4HI 0 "s_register_operand" "+w") + (vqh_ops:V4HI + (vec_select:V4HI (match_operand:V8HI 1 "s_register_operand" "w") + (parallel [(const_int 0) (const_int 1) + (const_int 2) (const_int 3)])) + (vec_select:V4HI (match_dup 1) + (parallel [(const_int 4) (const_int 5) + (const_int 6) (const_int 7)]))))] + "TARGET_NEON" + ".16\t%P0, %e1, %f1" + [(set_attr "vqh_mnem" "") + (set (attr "neon_type") + (if_then_else (eq_attr "vqh_mnem" "vadd") + (const_string "neon_int_1") (const_string "neon_int_5")))] +) + +(define_insn "quad_halves_v16qi" + [(set (match_operand:V8QI 0 "s_register_operand" "+w") + (vqh_ops:V8QI + (vec_select:V8QI (match_operand:V16QI 1 "s_register_operand" "w") + (parallel [(const_int 0) (const_int 1) + (const_int 2) (const_int 3) + (const_int 4) (const_int 5) + (const_int 6) (const_int 7)])) + (vec_select:V8QI (match_dup 1) + (parallel [(const_int 8) (const_int 9) + (const_int 10) (const_int 11) + (const_int 12) (const_int 13) + (const_int 14) (const_int 15)]))))] + "TARGET_NEON" + ".8\t%P0, %e1, %f1" + [(set_attr "vqh_mnem" "") + (set (attr "neon_type") + (if_then_else (eq_attr "vqh_mnem" "vadd") + (const_string "neon_int_1") (const_string "neon_int_5")))] +) + +; FIXME: We wouldn't need the following insns if we could write subregs of +; vector registers. Make an attempt at removing unnecessary moves, though +; we're really at the mercy of the register allocator. + +(define_insn "neon_move_lo_quad_" + [(set (match_operand:ANY128 0 "s_register_operand" "+w") + (vec_concat:ANY128 + (match_operand: 1 "s_register_operand" "w") + (vec_select: + (match_dup 0) + (match_operand:ANY128 2 "vect_par_constant_high" ""))))] + "TARGET_NEON" +{ + int dest = REGNO (operands[0]); + int src = REGNO (operands[1]); + + if (dest != src) + return "vmov\t%e0, %P1"; + else + return ""; +} + [(set_attr "neon_type" "neon_bp_simple")] +) + +(define_insn "neon_move_hi_quad_" + [(set (match_operand:ANY128 0 "s_register_operand" "+w") + (vec_concat:ANY128 + (vec_select: + (match_dup 0) + (match_operand:ANY128 2 "vect_par_constant_low" "")) + (match_operand: 1 "s_register_operand" "w")))] + + "TARGET_NEON" +{ + int dest = REGNO (operands[0]); + int src = REGNO (operands[1]); + + if (dest != src) + return "vmov\t%f0, %P1"; + else + return ""; +} + [(set_attr "neon_type" "neon_bp_simple")] +) + +(define_expand "move_hi_quad_" + [(match_operand:ANY128 0 "s_register_operand" "") + (match_operand: 1 "s_register_operand" "")] + "TARGET_NEON" +{ + rtvec v = rtvec_alloc (/2); + rtx t1; + int i; + + for (i=0; i < (/2); i++) + RTVEC_ELT (v, i) = GEN_INT (i); + + t1 = gen_rtx_PARALLEL (mode, v); + emit_insn (gen_neon_move_hi_quad_ (operands[0], operands[1], t1)); + + DONE; +}) + +(define_expand "move_lo_quad_" + [(match_operand:ANY128 0 "s_register_operand" "") + (match_operand: 1 "s_register_operand" "")] + "TARGET_NEON" +{ + rtvec v = rtvec_alloc (/2); + rtx t1; + int i; + + for (i=0; i < (/2); i++) + RTVEC_ELT (v, i) = GEN_INT ((/2) + i); + + t1 = gen_rtx_PARALLEL (mode, v); + emit_insn (gen_neon_move_lo_quad_ (operands[0], operands[1], t1)); + + DONE; +}) + +;; Reduction operations + +(define_expand "reduc_splus_" + [(match_operand:VD 0 "s_register_operand" "") + (match_operand:VD 1 "s_register_operand" "")] + "TARGET_NEON && (! || flag_unsafe_math_optimizations)" +{ + neon_pairwise_reduce (operands[0], operands[1], mode, + &gen_neon_vpadd_internal); + DONE; +}) + +(define_expand "reduc_splus_" + [(match_operand:VQ 0 "s_register_operand" "") + (match_operand:VQ 1 "s_register_operand" "")] + "TARGET_NEON && (! || flag_unsafe_math_optimizations)" +{ + rtx step1 = gen_reg_rtx (mode); + rtx res_d = gen_reg_rtx (mode); + + emit_insn (gen_quad_halves_plus (step1, operands[1])); + emit_insn (gen_reduc_splus_ (res_d, step1)); + emit_insn (gen_move_lo_quad_ (operands[0], res_d)); + + DONE; +}) + +(define_insn "reduc_splus_v2di" + [(set (match_operand:V2DI 0 "s_register_operand" "=w") + (unspec:V2DI [(match_operand:V2DI 1 "s_register_operand" "w")] + UNSPEC_VPADD))] + "TARGET_NEON" + "vadd.i64\t%e0, %e1, %f1" + [(set_attr "neon_type" "neon_int_1")] +) + +;; NEON does not distinguish between signed and unsigned addition except on +;; widening operations. +(define_expand "reduc_uplus_" + [(match_operand:VDQI 0 "s_register_operand" "") + (match_operand:VDQI 1 "s_register_operand" "")] + "TARGET_NEON" +{ + emit_insn (gen_reduc_splus_ (operands[0], operands[1])); + DONE; +}) + +(define_expand "reduc_smin_" + [(match_operand:VD 0 "s_register_operand" "") + (match_operand:VD 1 "s_register_operand" "")] + "TARGET_NEON && (! || flag_unsafe_math_optimizations)" +{ + neon_pairwise_reduce (operands[0], operands[1], mode, + &gen_neon_vpsmin); + DONE; +}) + +(define_expand "reduc_smin_" + [(match_operand:VQ 0 "s_register_operand" "") + (match_operand:VQ 1 "s_register_operand" "")] + "TARGET_NEON && (! || flag_unsafe_math_optimizations)" +{ + rtx step1 = gen_reg_rtx (mode); + rtx res_d = gen_reg_rtx (mode); + + emit_insn (gen_quad_halves_smin (step1, operands[1])); + emit_insn (gen_reduc_smin_ (res_d, step1)); + emit_insn (gen_move_lo_quad_ (operands[0], res_d)); + + DONE; +}) + +(define_expand "reduc_smax_" + [(match_operand:VD 0 "s_register_operand" "") + (match_operand:VD 1 "s_register_operand" "")] + "TARGET_NEON && (! || flag_unsafe_math_optimizations)" +{ + neon_pairwise_reduce (operands[0], operands[1], mode, + &gen_neon_vpsmax); + DONE; +}) + +(define_expand "reduc_smax_" + [(match_operand:VQ 0 "s_register_operand" "") + (match_operand:VQ 1 "s_register_operand" "")] + "TARGET_NEON && (! || flag_unsafe_math_optimizations)" +{ + rtx step1 = gen_reg_rtx (mode); + rtx res_d = gen_reg_rtx (mode); + + emit_insn (gen_quad_halves_smax (step1, operands[1])); + emit_insn (gen_reduc_smax_ (res_d, step1)); + emit_insn (gen_move_lo_quad_ (operands[0], res_d)); + + DONE; +}) + +(define_expand "reduc_umin_" + [(match_operand:VDI 0 "s_register_operand" "") + (match_operand:VDI 1 "s_register_operand" "")] + "TARGET_NEON" +{ + neon_pairwise_reduce (operands[0], operands[1], mode, + &gen_neon_vpumin); + DONE; +}) + +(define_expand "reduc_umin_" + [(match_operand:VQI 0 "s_register_operand" "") + (match_operand:VQI 1 "s_register_operand" "")] + "TARGET_NEON" +{ + rtx step1 = gen_reg_rtx (mode); + rtx res_d = gen_reg_rtx (mode); + + emit_insn (gen_quad_halves_umin (step1, operands[1])); + emit_insn (gen_reduc_umin_ (res_d, step1)); + emit_insn (gen_move_lo_quad_ (operands[0], res_d)); + + DONE; +}) + +(define_expand "reduc_umax_" + [(match_operand:VDI 0 "s_register_operand" "") + (match_operand:VDI 1 "s_register_operand" "")] + "TARGET_NEON" +{ + neon_pairwise_reduce (operands[0], operands[1], mode, + &gen_neon_vpumax); + DONE; +}) + +(define_expand "reduc_umax_" + [(match_operand:VQI 0 "s_register_operand" "") + (match_operand:VQI 1 "s_register_operand" "")] + "TARGET_NEON" +{ + rtx step1 = gen_reg_rtx (mode); + rtx res_d = gen_reg_rtx (mode); + + emit_insn (gen_quad_halves_umax (step1, operands[1])); + emit_insn (gen_reduc_umax_ (res_d, step1)); + emit_insn (gen_move_lo_quad_ (operands[0], res_d)); + + DONE; +}) + +(define_insn "neon_vpadd_internal" + [(set (match_operand:VD 0 "s_register_operand" "=w") + (unspec:VD [(match_operand:VD 1 "s_register_operand" "w") + (match_operand:VD 2 "s_register_operand" "w")] + UNSPEC_VPADD))] + "TARGET_NEON" + "vpadd.\t%P0, %P1, %P2" + ;; Assume this schedules like vadd. + [(set (attr "neon_type") + (if_then_else (ne (symbol_ref "") (const_int 0)) + (if_then_else (ne (symbol_ref "") (const_int 0)) + (const_string "neon_fp_vadd_ddd_vabs_dd") + (const_string "neon_fp_vadd_qqq_vabs_qq")) + (const_string "neon_int_1")))] +) + +(define_insn "neon_vpsmin" + [(set (match_operand:VD 0 "s_register_operand" "=w") + (unspec:VD [(match_operand:VD 1 "s_register_operand" "w") + (match_operand:VD 2 "s_register_operand" "w")] + UNSPEC_VPSMIN))] + "TARGET_NEON" + "vpmin.\t%P0, %P1, %P2" + ;; Assume this schedules like vmin. + [(set (attr "neon_type") + (if_then_else (ne (symbol_ref "") (const_int 0)) + (const_string "neon_fp_vadd_ddd_vabs_dd") + (const_string "neon_int_5")))] +) + +(define_insn "neon_vpsmax" + [(set (match_operand:VD 0 "s_register_operand" "=w") + (unspec:VD [(match_operand:VD 1 "s_register_operand" "w") + (match_operand:VD 2 "s_register_operand" "w")] + UNSPEC_VPSMAX))] + "TARGET_NEON" + "vpmax.\t%P0, %P1, %P2" + ;; Assume this schedules like vmax. + [(set (attr "neon_type") + (if_then_else (ne (symbol_ref "") (const_int 0)) + (const_string "neon_fp_vadd_ddd_vabs_dd") + (const_string "neon_int_5")))] +) + +(define_insn "neon_vpumin" + [(set (match_operand:VDI 0 "s_register_operand" "=w") + (unspec:VDI [(match_operand:VDI 1 "s_register_operand" "w") + (match_operand:VDI 2 "s_register_operand" "w")] + UNSPEC_VPUMIN))] + "TARGET_NEON" + "vpmin.\t%P0, %P1, %P2" + ;; Assume this schedules like umin. + [(set_attr "neon_type" "neon_int_5")] +) + +(define_insn "neon_vpumax" + [(set (match_operand:VDI 0 "s_register_operand" "=w") + (unspec:VDI [(match_operand:VDI 1 "s_register_operand" "w") + (match_operand:VDI 2 "s_register_operand" "w")] + UNSPEC_VPUMAX))] + "TARGET_NEON" + "vpmax.\t%P0, %P1, %P2" + ;; Assume this schedules like umax. + [(set_attr "neon_type" "neon_int_5")] +) + +;; Saturating arithmetic + +; NOTE: Neon supports many more saturating variants of instructions than the +; following, but these are all GCC currently understands. +; FIXME: Actually, GCC doesn't know how to create saturating add/sub by itself +; yet either, although these patterns may be used by intrinsics when they're +; added. + +(define_insn "*ss_add_neon" + [(set (match_operand:VD 0 "s_register_operand" "=w") + (ss_plus:VD (match_operand:VD 1 "s_register_operand" "w") + (match_operand:VD 2 "s_register_operand" "w")))] + "TARGET_NEON" + "vqadd.\t%P0, %P1, %P2" + [(set_attr "neon_type" "neon_int_4")] +) + +(define_insn "*us_add_neon" + [(set (match_operand:VD 0 "s_register_operand" "=w") + (us_plus:VD (match_operand:VD 1 "s_register_operand" "w") + (match_operand:VD 2 "s_register_operand" "w")))] + "TARGET_NEON" + "vqadd.\t%P0, %P1, %P2" + [(set_attr "neon_type" "neon_int_4")] +) + +(define_insn "*ss_sub_neon" + [(set (match_operand:VD 0 "s_register_operand" "=w") + (ss_minus:VD (match_operand:VD 1 "s_register_operand" "w") + (match_operand:VD 2 "s_register_operand" "w")))] + "TARGET_NEON" + "vqsub.\t%P0, %P1, %P2" + [(set_attr "neon_type" "neon_int_5")] +) + +(define_insn "*us_sub_neon" + [(set (match_operand:VD 0 "s_register_operand" "=w") + (us_minus:VD (match_operand:VD 1 "s_register_operand" "w") + (match_operand:VD 2 "s_register_operand" "w")))] + "TARGET_NEON" + "vqsub.\t%P0, %P1, %P2" + [(set_attr "neon_type" "neon_int_5")] +) + +;; Conditional instructions. These are comparisons with conditional moves for +;; vectors. They perform the assignment: +;; +;; Vop0 = (Vop4 Vop5) ? Vop1 : Vop2; +;; +;; where op3 is <, <=, ==, !=, >= or >. Operations are performed +;; element-wise. + +(define_expand "vcond" + [(set (match_operand:VDQW 0 "s_register_operand" "") + (if_then_else:VDQW + (match_operator 3 "arm_comparison_operator" + [(match_operand:VDQW 4 "s_register_operand" "") + (match_operand:VDQW 5 "nonmemory_operand" "")]) + (match_operand:VDQW 1 "s_register_operand" "") + (match_operand:VDQW 2 "s_register_operand" "")))] + "TARGET_NEON && (! || flag_unsafe_math_optimizations)" +{ + rtx mask; + int inverse = 0, immediate_zero = 0; + /* See the description of "magic" bits in the 'T' case of + arm_print_operand. */ + HOST_WIDE_INT magic_word = (mode == V2SFmode || mode == V4SFmode) + ? 3 : 1; + rtx magic_rtx = GEN_INT (magic_word); + + mask = gen_reg_rtx (mode); + + if (operands[5] == CONST0_RTX (mode)) + immediate_zero = 1; + else if (!REG_P (operands[5])) + operands[5] = force_reg (mode, operands[5]); + + switch (GET_CODE (operands[3])) + { + case GE: + emit_insn (gen_neon_vcge (mask, operands[4], operands[5], + magic_rtx)); + break; + + case GT: + emit_insn (gen_neon_vcgt (mask, operands[4], operands[5], + magic_rtx)); + break; + + case EQ: + emit_insn (gen_neon_vceq (mask, operands[4], operands[5], + magic_rtx)); + break; + + case LE: + if (immediate_zero) + emit_insn (gen_neon_vcle (mask, operands[4], operands[5], + magic_rtx)); + else + emit_insn (gen_neon_vcge (mask, operands[5], operands[4], + magic_rtx)); + break; + + case LT: + if (immediate_zero) + emit_insn (gen_neon_vclt (mask, operands[4], operands[5], + magic_rtx)); + else + emit_insn (gen_neon_vcgt (mask, operands[5], operands[4], + magic_rtx)); + break; + + case NE: + emit_insn (gen_neon_vceq (mask, operands[4], operands[5], + magic_rtx)); + inverse = 1; + break; + + default: + gcc_unreachable (); + } + + if (inverse) + emit_insn (gen_neon_vbsl (operands[0], mask, operands[2], + operands[1])); + else + emit_insn (gen_neon_vbsl (operands[0], mask, operands[1], + operands[2])); + + DONE; +}) + +(define_expand "vcondu" + [(set (match_operand:VDQIW 0 "s_register_operand" "") + (if_then_else:VDQIW + (match_operator 3 "arm_comparison_operator" + [(match_operand:VDQIW 4 "s_register_operand" "") + (match_operand:VDQIW 5 "s_register_operand" "")]) + (match_operand:VDQIW 1 "s_register_operand" "") + (match_operand:VDQIW 2 "s_register_operand" "")))] + "TARGET_NEON" +{ + rtx mask; + int inverse = 0, immediate_zero = 0; + + mask = gen_reg_rtx (mode); + + if (operands[5] == CONST0_RTX (mode)) + immediate_zero = 1; + else if (!REG_P (operands[5])) + operands[5] = force_reg (mode, operands[5]); + + switch (GET_CODE (operands[3])) + { + case GEU: + emit_insn (gen_neon_vcge (mask, operands[4], operands[5], + const0_rtx)); + break; + + case GTU: + emit_insn (gen_neon_vcgt (mask, operands[4], operands[5], + const0_rtx)); + break; + + case EQ: + emit_insn (gen_neon_vceq (mask, operands[4], operands[5], + const0_rtx)); + break; + + case LEU: + if (immediate_zero) + emit_insn (gen_neon_vcle (mask, operands[4], operands[5], + const0_rtx)); + else + emit_insn (gen_neon_vcge (mask, operands[5], operands[4], + const0_rtx)); + break; + + case LTU: + if (immediate_zero) + emit_insn (gen_neon_vclt (mask, operands[4], operands[5], + const0_rtx)); + else + emit_insn (gen_neon_vcgt (mask, operands[5], operands[4], + const0_rtx)); + break; + + case NE: + emit_insn (gen_neon_vceq (mask, operands[4], operands[5], + const0_rtx)); + inverse = 1; + break; + + default: + gcc_unreachable (); + } + + if (inverse) + emit_insn (gen_neon_vbsl (operands[0], mask, operands[2], + operands[1])); + else + emit_insn (gen_neon_vbsl (operands[0], mask, operands[1], + operands[2])); + + DONE; +}) + +;; Patterns for builtins. + +; good for plain vadd, vaddq. + +(define_expand "neon_vadd" + [(match_operand:VDQX 0 "s_register_operand" "=w") + (match_operand:VDQX 1 "s_register_operand" "w") + (match_operand:VDQX 2 "s_register_operand" "w") + (match_operand:SI 3 "immediate_operand" "i")] + "TARGET_NEON" +{ + if (! || flag_unsafe_math_optimizations) + emit_insn (gen_add3 (operands[0], operands[1], operands[2])); + else + emit_insn (gen_neon_vadd_unspec (operands[0], operands[1], + operands[2])); + DONE; +}) + +; Note that NEON operations don't support the full IEEE 754 standard: in +; particular, denormal values are flushed to zero. This means that GCC cannot +; use those instructions for autovectorization, etc. unless +; -funsafe-math-optimizations is in effect (in which case flush-to-zero +; behaviour is permissible). Intrinsic operations (provided by the arm_neon.h +; header) must work in either case: if -funsafe-math-optimizations is given, +; intrinsics expand to "canonical" RTL where possible, otherwise intrinsics +; expand to unspecs (which may potentially limit the extent to which they might +; be optimized by generic code). + +; Used for intrinsics when flag_unsafe_math_optimizations is false. + +(define_insn "neon_vadd_unspec" + [(set (match_operand:VDQX 0 "s_register_operand" "=w") + (unspec:VDQX [(match_operand:VDQX 1 "s_register_operand" "w") + (match_operand:VDQX 2 "s_register_operand" "w")] + UNSPEC_VADD))] + "TARGET_NEON" + "vadd.\t%0, %1, %2" + [(set (attr "neon_type") + (if_then_else (ne (symbol_ref "") (const_int 0)) + (if_then_else (ne (symbol_ref "") (const_int 0)) + (const_string "neon_fp_vadd_ddd_vabs_dd") + (const_string "neon_fp_vadd_qqq_vabs_qq")) + (const_string "neon_int_1")))] +) + +; operand 3 represents in bits: +; bit 0: signed (vs unsigned). +; bit 1: rounding (vs none). + +(define_insn "neon_vaddl" + [(set (match_operand: 0 "s_register_operand" "=w") + (unspec: [(match_operand:VDI 1 "s_register_operand" "w") + (match_operand:VDI 2 "s_register_operand" "w") + (match_operand:SI 3 "immediate_operand" "i")] + UNSPEC_VADDL))] + "TARGET_NEON" + "vaddl.%T3%#\t%q0, %P1, %P2" + [(set_attr "neon_type" "neon_int_3")] +) + +(define_insn "neon_vaddw" + [(set (match_operand: 0 "s_register_operand" "=w") + (unspec: [(match_operand: 1 "s_register_operand" "w") + (match_operand:VDI 2 "s_register_operand" "w") + (match_operand:SI 3 "immediate_operand" "i")] + UNSPEC_VADDW))] + "TARGET_NEON" + "vaddw.%T3%#\t%q0, %q1, %P2" + [(set_attr "neon_type" "neon_int_2")] +) + +; vhadd and vrhadd. + +(define_insn "neon_vhadd" + [(set (match_operand:VDQIW 0 "s_register_operand" "=w") + (unspec:VDQIW [(match_operand:VDQIW 1 "s_register_operand" "w") + (match_operand:VDQIW 2 "s_register_operand" "w") + (match_operand:SI 3 "immediate_operand" "i")] + UNSPEC_VHADD))] + "TARGET_NEON" + "v%O3hadd.%T3%#\t%0, %1, %2" + [(set_attr "neon_type" "neon_int_4")] +) + +(define_insn "neon_vqadd" + [(set (match_operand:VDQIX 0 "s_register_operand" "=w") + (unspec:VDQIX [(match_operand:VDQIX 1 "s_register_operand" "w") + (match_operand:VDQIX 2 "s_register_operand" "w") + (match_operand:SI 3 "immediate_operand" "i")] + UNSPEC_VQADD))] + "TARGET_NEON" + "vqadd.%T3%#\t%0, %1, %2" + [(set_attr "neon_type" "neon_int_4")] +) + +(define_insn "neon_vaddhn" + [(set (match_operand: 0 "s_register_operand" "=w") + (unspec: [(match_operand:VN 1 "s_register_operand" "w") + (match_operand:VN 2 "s_register_operand" "w") + (match_operand:SI 3 "immediate_operand" "i")] + UNSPEC_VADDHN))] + "TARGET_NEON" + "v%O3addhn.\t%P0, %q1, %q2" + [(set_attr "neon_type" "neon_int_4")] +) + +;; We cannot replace this unspec with mul3 because of the odd +;; polynomial multiplication case that can specified by operand 3. +(define_insn "neon_vmul" + [(set (match_operand:VDQW 0 "s_register_operand" "=w") + (unspec:VDQW [(match_operand:VDQW 1 "s_register_operand" "w") + (match_operand:VDQW 2 "s_register_operand" "w") + (match_operand:SI 3 "immediate_operand" "i")] + UNSPEC_VMUL))] + "TARGET_NEON" + "vmul.%F3%#\t%0, %1, %2" + [(set (attr "neon_type") + (if_then_else (ne (symbol_ref "") (const_int 0)) + (if_then_else (ne (symbol_ref "") (const_int 0)) + (const_string "neon_fp_vadd_ddd_vabs_dd") + (const_string "neon_fp_vadd_qqq_vabs_qq")) + (if_then_else (ne (symbol_ref "") (const_int 0)) + (if_then_else + (ne (symbol_ref "") (const_int 0)) + (const_string "neon_mul_ddd_8_16_qdd_16_8_long_32_16_long") + (const_string "neon_mul_qqq_8_16_32_ddd_32")) + (if_then_else (ne (symbol_ref "") (const_int 0)) + (const_string "neon_mul_qqq_8_16_32_ddd_32") + (const_string "neon_mul_qqq_8_16_32_ddd_32")))))] +) + +(define_expand "neon_vmla" + [(match_operand:VDQW 0 "s_register_operand" "=w") + (match_operand:VDQW 1 "s_register_operand" "0") + (match_operand:VDQW 2 "s_register_operand" "w") + (match_operand:VDQW 3 "s_register_operand" "w") + (match_operand:SI 4 "immediate_operand" "i")] + "TARGET_NEON" +{ + if (! || flag_unsafe_math_optimizations) + emit_insn (gen_mul3add_neon (operands[0], operands[1], + operands[2], operands[3])); + else + emit_insn (gen_neon_vmla_unspec (operands[0], operands[1], + operands[2], operands[3])); + DONE; +}) + +; Used for intrinsics when flag_unsafe_math_optimizations is false. + +(define_insn "neon_vmla_unspec" + [(set (match_operand:VDQ 0 "s_register_operand" "=w") + (unspec:VDQ [(match_operand:VDQ 1 "s_register_operand" "0") + (match_operand:VDQ 2 "s_register_operand" "w") + (match_operand:VDQ 3 "s_register_operand" "w")] + UNSPEC_VMLA))] + "TARGET_NEON" + "vmla.\t%0, %2, %3" + [(set (attr "neon_type") + (if_then_else (ne (symbol_ref "") (const_int 0)) + (if_then_else (ne (symbol_ref "") (const_int 0)) + (const_string "neon_fp_vmla_ddd") + (const_string "neon_fp_vmla_qqq")) + (if_then_else (ne (symbol_ref "") (const_int 0)) + (if_then_else + (ne (symbol_ref "") (const_int 0)) + (const_string "neon_mla_ddd_8_16_qdd_16_8_long_32_16_long") + (const_string "neon_mla_ddd_32_qqd_16_ddd_32_scalar_qdd_64_32_long_scalar_qdd_64_32_long")) + (if_then_else (ne (symbol_ref "") (const_int 0)) + (const_string "neon_mla_qqq_8_16") + (const_string "neon_mla_qqq_32_qqd_32_scalar")))))] +) + +(define_insn "neon_vmlal" + [(set (match_operand: 0 "s_register_operand" "=w") + (unspec: [(match_operand: 1 "s_register_operand" "0") + (match_operand:VW 2 "s_register_operand" "w") + (match_operand:VW 3 "s_register_operand" "w") + (match_operand:SI 4 "immediate_operand" "i")] + UNSPEC_VMLAL))] + "TARGET_NEON" + "vmlal.%T4%#\t%q0, %P2, %P3" + [(set (attr "neon_type") + (if_then_else (ne (symbol_ref "") (const_int 0)) + (const_string "neon_mla_ddd_8_16_qdd_16_8_long_32_16_long") + (const_string "neon_mla_ddd_32_qqd_16_ddd_32_scalar_qdd_64_32_long_scalar_qdd_64_32_long")))] +) + +(define_expand "neon_vmls" + [(match_operand:VDQW 0 "s_register_operand" "=w") + (match_operand:VDQW 1 "s_register_operand" "0") + (match_operand:VDQW 2 "s_register_operand" "w") + (match_operand:VDQW 3 "s_register_operand" "w") + (match_operand:SI 4 "immediate_operand" "i")] + "TARGET_NEON" +{ + if (! || flag_unsafe_math_optimizations) + emit_insn (gen_mul3negadd_neon (operands[0], + operands[1], operands[2], operands[3])); + else + emit_insn (gen_neon_vmls_unspec (operands[0], operands[1], + operands[2], operands[3])); + DONE; +}) + +; Used for intrinsics when flag_unsafe_math_optimizations is false. + +(define_insn "neon_vmls_unspec" + [(set (match_operand:VDQ 0 "s_register_operand" "=w") + (unspec:VDQ [(match_operand:VDQ 1 "s_register_operand" "0") + (match_operand:VDQ 2 "s_register_operand" "w") + (match_operand:VDQ 3 "s_register_operand" "w")] + UNSPEC_VMLS))] + "TARGET_NEON" + "vmls.\t%0, %2, %3" + [(set (attr "neon_type") + (if_then_else (ne (symbol_ref "") (const_int 0)) + (if_then_else (ne (symbol_ref "") (const_int 0)) + (const_string "neon_fp_vmla_ddd") + (const_string "neon_fp_vmla_qqq")) + (if_then_else (ne (symbol_ref "") (const_int 0)) + (if_then_else + (ne (symbol_ref "") (const_int 0)) + (const_string "neon_mla_ddd_8_16_qdd_16_8_long_32_16_long") + (const_string "neon_mla_ddd_32_qqd_16_ddd_32_scalar_qdd_64_32_long_scalar_qdd_64_32_long")) + (if_then_else + (ne (symbol_ref "") (const_int 0)) + (const_string "neon_mla_qqq_8_16") + (const_string "neon_mla_qqq_32_qqd_32_scalar")))))] +) + +(define_insn "neon_vmlsl" + [(set (match_operand: 0 "s_register_operand" "=w") + (unspec: [(match_operand: 1 "s_register_operand" "0") + (match_operand:VW 2 "s_register_operand" "w") + (match_operand:VW 3 "s_register_operand" "w") + (match_operand:SI 4 "immediate_operand" "i")] + UNSPEC_VMLSL))] + "TARGET_NEON" + "vmlsl.%T4%#\t%q0, %P2, %P3" + [(set (attr "neon_type") + (if_then_else (ne (symbol_ref "") (const_int 0)) + (const_string "neon_mla_ddd_8_16_qdd_16_8_long_32_16_long") + (const_string "neon_mla_ddd_32_qqd_16_ddd_32_scalar_qdd_64_32_long_scalar_qdd_64_32_long")))] +) + +(define_insn "neon_vqdmulh" + [(set (match_operand:VMDQI 0 "s_register_operand" "=w") + (unspec:VMDQI [(match_operand:VMDQI 1 "s_register_operand" "w") + (match_operand:VMDQI 2 "s_register_operand" "w") + (match_operand:SI 3 "immediate_operand" "i")] + UNSPEC_VQDMULH))] + "TARGET_NEON" + "vq%O3dmulh.\t%0, %1, %2" + [(set (attr "neon_type") + (if_then_else (ne (symbol_ref "") (const_int 0)) + (if_then_else (ne (symbol_ref "") (const_int 0)) + (const_string "neon_mul_ddd_8_16_qdd_16_8_long_32_16_long") + (const_string "neon_mul_qqq_8_16_32_ddd_32")) + (if_then_else (ne (symbol_ref "") (const_int 0)) + (const_string "neon_mul_qqq_8_16_32_ddd_32") + (const_string "neon_mul_qqq_8_16_32_ddd_32"))))] +) + +(define_insn "neon_vqdmlal" + [(set (match_operand: 0 "s_register_operand" "=w") + (unspec: [(match_operand: 1 "s_register_operand" "0") + (match_operand:VMDI 2 "s_register_operand" "w") + (match_operand:VMDI 3 "s_register_operand" "w") + (match_operand:SI 4 "immediate_operand" "i")] + UNSPEC_VQDMLAL))] + "TARGET_NEON" + "vqdmlal.\t%q0, %P2, %P3" + [(set (attr "neon_type") + (if_then_else (ne (symbol_ref "") (const_int 0)) + (const_string "neon_mla_ddd_8_16_qdd_16_8_long_32_16_long") + (const_string "neon_mla_ddd_32_qqd_16_ddd_32_scalar_qdd_64_32_long_scalar_qdd_64_32_long")))] +) + +(define_insn "neon_vqdmlsl" + [(set (match_operand: 0 "s_register_operand" "=w") + (unspec: [(match_operand: 1 "s_register_operand" "0") + (match_operand:VMDI 2 "s_register_operand" "w") + (match_operand:VMDI 3 "s_register_operand" "w") + (match_operand:SI 4 "immediate_operand" "i")] + UNSPEC_VQDMLSL))] + "TARGET_NEON" + "vqdmlsl.\t%q0, %P2, %P3" + [(set (attr "neon_type") + (if_then_else (ne (symbol_ref "") (const_int 0)) + (const_string "neon_mla_ddd_8_16_qdd_16_8_long_32_16_long") + (const_string "neon_mla_ddd_32_qqd_16_ddd_32_scalar_qdd_64_32_long_scalar_qdd_64_32_long")))] +) + +(define_insn "neon_vmull" + [(set (match_operand: 0 "s_register_operand" "=w") + (unspec: [(match_operand:VW 1 "s_register_operand" "w") + (match_operand:VW 2 "s_register_operand" "w") + (match_operand:SI 3 "immediate_operand" "i")] + UNSPEC_VMULL))] + "TARGET_NEON" + "vmull.%T3%#\t%q0, %P1, %P2" + [(set (attr "neon_type") + (if_then_else (ne (symbol_ref "") (const_int 0)) + (const_string "neon_mul_ddd_8_16_qdd_16_8_long_32_16_long") + (const_string "neon_mul_qdd_64_32_long_qqd_16_ddd_32_scalar_64_32_long_scalar")))] +) + +(define_insn "neon_vqdmull" + [(set (match_operand: 0 "s_register_operand" "=w") + (unspec: [(match_operand:VMDI 1 "s_register_operand" "w") + (match_operand:VMDI 2 "s_register_operand" "w") + (match_operand:SI 3 "immediate_operand" "i")] + UNSPEC_VQDMULL))] + "TARGET_NEON" + "vqdmull.\t%q0, %P1, %P2" + [(set (attr "neon_type") + (if_then_else (ne (symbol_ref "") (const_int 0)) + (const_string "neon_mul_ddd_8_16_qdd_16_8_long_32_16_long") + (const_string "neon_mul_qdd_64_32_long_qqd_16_ddd_32_scalar_64_32_long_scalar")))] +) + +(define_expand "neon_vsub" + [(match_operand:VDQX 0 "s_register_operand" "=w") + (match_operand:VDQX 1 "s_register_operand" "w") + (match_operand:VDQX 2 "s_register_operand" "w") + (match_operand:SI 3 "immediate_operand" "i")] + "TARGET_NEON" +{ + if (! || flag_unsafe_math_optimizations) + emit_insn (gen_sub3 (operands[0], operands[1], operands[2])); + else + emit_insn (gen_neon_vsub_unspec (operands[0], operands[1], + operands[2])); + DONE; +}) + +; Used for intrinsics when flag_unsafe_math_optimizations is false. + +(define_insn "neon_vsub_unspec" + [(set (match_operand:VDQX 0 "s_register_operand" "=w") + (unspec:VDQX [(match_operand:VDQX 1 "s_register_operand" "w") + (match_operand:VDQX 2 "s_register_operand" "w")] + UNSPEC_VSUB))] + "TARGET_NEON" + "vsub.\t%0, %1, %2" + [(set (attr "neon_type") + (if_then_else (ne (symbol_ref "") (const_int 0)) + (if_then_else (ne (symbol_ref "") (const_int 0)) + (const_string "neon_fp_vadd_ddd_vabs_dd") + (const_string "neon_fp_vadd_qqq_vabs_qq")) + (const_string "neon_int_2")))] +) + +(define_insn "neon_vsubl" + [(set (match_operand: 0 "s_register_operand" "=w") + (unspec: [(match_operand:VDI 1 "s_register_operand" "w") + (match_operand:VDI 2 "s_register_operand" "w") + (match_operand:SI 3 "immediate_operand" "i")] + UNSPEC_VSUBL))] + "TARGET_NEON" + "vsubl.%T3%#\t%q0, %P1, %P2" + [(set_attr "neon_type" "neon_int_2")] +) + +(define_insn "neon_vsubw" + [(set (match_operand: 0 "s_register_operand" "=w") + (unspec: [(match_operand: 1 "s_register_operand" "w") + (match_operand:VDI 2 "s_register_operand" "w") + (match_operand:SI 3 "immediate_operand" "i")] + UNSPEC_VSUBW))] + "TARGET_NEON" + "vsubw.%T3%#\t%q0, %q1, %P2" + [(set_attr "neon_type" "neon_int_2")] +) + +(define_insn "neon_vqsub" + [(set (match_operand:VDQIX 0 "s_register_operand" "=w") + (unspec:VDQIX [(match_operand:VDQIX 1 "s_register_operand" "w") + (match_operand:VDQIX 2 "s_register_operand" "w") + (match_operand:SI 3 "immediate_operand" "i")] + UNSPEC_VQSUB))] + "TARGET_NEON" + "vqsub.%T3%#\t%0, %1, %2" + [(set_attr "neon_type" "neon_int_5")] +) + +(define_insn "neon_vhsub" + [(set (match_operand:VDQIW 0 "s_register_operand" "=w") + (unspec:VDQIW [(match_operand:VDQIW 1 "s_register_operand" "w") + (match_operand:VDQIW 2 "s_register_operand" "w") + (match_operand:SI 3 "immediate_operand" "i")] + UNSPEC_VHSUB))] + "TARGET_NEON" + "vhsub.%T3%#\t%0, %1, %2" + [(set_attr "neon_type" "neon_int_5")] +) + +(define_insn "neon_vsubhn" + [(set (match_operand: 0 "s_register_operand" "=w") + (unspec: [(match_operand:VN 1 "s_register_operand" "w") + (match_operand:VN 2 "s_register_operand" "w") + (match_operand:SI 3 "immediate_operand" "i")] + UNSPEC_VSUBHN))] + "TARGET_NEON" + "v%O3subhn.\t%P0, %q1, %q2" + [(set_attr "neon_type" "neon_int_4")] +) + +(define_insn "neon_vceq" + [(set (match_operand: 0 "s_register_operand" "=w,w") + (unspec: + [(match_operand:VDQW 1 "s_register_operand" "w,w") + (match_operand:VDQW 2 "nonmemory_operand" "w,Dz") + (match_operand:SI 3 "immediate_operand" "i,i")] + UNSPEC_VCEQ))] + "TARGET_NEON" + "@ + vceq.\t%0, %1, %2 + vceq.\t%0, %1, #0" + [(set (attr "neon_type") + (if_then_else (ne (symbol_ref "") (const_int 0)) + (if_then_else (ne (symbol_ref "") (const_int 0)) + (const_string "neon_fp_vadd_ddd_vabs_dd") + (const_string "neon_fp_vadd_qqq_vabs_qq")) + (const_string "neon_int_5")))] +) + +(define_insn "neon_vcge" + [(set (match_operand: 0 "s_register_operand" "=w,w") + (unspec: + [(match_operand:VDQW 1 "s_register_operand" "w,w") + (match_operand:VDQW 2 "nonmemory_operand" "w,Dz") + (match_operand:SI 3 "immediate_operand" "i,i")] + UNSPEC_VCGE))] + "TARGET_NEON" + "@ + vcge.%T3%#\t%0, %1, %2 + vcge.%T3%#\t%0, %1, #0" + [(set (attr "neon_type") + (if_then_else (ne (symbol_ref "") (const_int 0)) + (if_then_else (ne (symbol_ref "") (const_int 0)) + (const_string "neon_fp_vadd_ddd_vabs_dd") + (const_string "neon_fp_vadd_qqq_vabs_qq")) + (const_string "neon_int_5")))] +) + +(define_insn "neon_vcgt" + [(set (match_operand: 0 "s_register_operand" "=w,w") + (unspec: + [(match_operand:VDQW 1 "s_register_operand" "w,w") + (match_operand:VDQW 2 "nonmemory_operand" "w,Dz") + (match_operand:SI 3 "immediate_operand" "i,i")] + UNSPEC_VCGT))] + "TARGET_NEON" + "@ + vcgt.%T3%#\t%0, %1, %2 + vcgt.%T3%#\t%0, %1, #0" + [(set (attr "neon_type") + (if_then_else (ne (symbol_ref "") (const_int 0)) + (if_then_else (ne (symbol_ref "") (const_int 0)) + (const_string "neon_fp_vadd_ddd_vabs_dd") + (const_string "neon_fp_vadd_qqq_vabs_qq")) + (const_string "neon_int_5")))] +) + +;; VCLE and VCLT only support comparisons with immediate zero (register +;; variants are VCGE and VCGT with operands reversed). + +(define_insn "neon_vcle" + [(set (match_operand: 0 "s_register_operand" "=w") + (unspec: + [(match_operand:VDQW 1 "s_register_operand" "w") + (match_operand:VDQW 2 "nonmemory_operand" "Dz") + (match_operand:SI 3 "immediate_operand" "i")] + UNSPEC_VCLE))] + "TARGET_NEON" + "vcle.%T3%#\t%0, %1, #0" + [(set (attr "neon_type") + (if_then_else (ne (symbol_ref "") (const_int 0)) + (if_then_else (ne (symbol_ref "") (const_int 0)) + (const_string "neon_fp_vadd_ddd_vabs_dd") + (const_string "neon_fp_vadd_qqq_vabs_qq")) + (const_string "neon_int_5")))] +) + +(define_insn "neon_vclt" + [(set (match_operand: 0 "s_register_operand" "=w") + (unspec: + [(match_operand:VDQW 1 "s_register_operand" "w") + (match_operand:VDQW 2 "nonmemory_operand" "Dz") + (match_operand:SI 3 "immediate_operand" "i")] + UNSPEC_VCLT))] + "TARGET_NEON" + "vclt.%T3%#\t%0, %1, #0" + [(set (attr "neon_type") + (if_then_else (ne (symbol_ref "") (const_int 0)) + (if_then_else (ne (symbol_ref "") (const_int 0)) + (const_string "neon_fp_vadd_ddd_vabs_dd") + (const_string "neon_fp_vadd_qqq_vabs_qq")) + (const_string "neon_int_5")))] +) + +(define_insn "neon_vcage" + [(set (match_operand: 0 "s_register_operand" "=w") + (unspec: [(match_operand:VCVTF 1 "s_register_operand" "w") + (match_operand:VCVTF 2 "s_register_operand" "w") + (match_operand:SI 3 "immediate_operand" "i")] + UNSPEC_VCAGE))] + "TARGET_NEON" + "vacge.\t%0, %1, %2" + [(set (attr "neon_type") + (if_then_else (ne (symbol_ref "") (const_int 0)) + (const_string "neon_fp_vadd_ddd_vabs_dd") + (const_string "neon_fp_vadd_qqq_vabs_qq")))] +) + +(define_insn "neon_vcagt" + [(set (match_operand: 0 "s_register_operand" "=w") + (unspec: [(match_operand:VCVTF 1 "s_register_operand" "w") + (match_operand:VCVTF 2 "s_register_operand" "w") + (match_operand:SI 3 "immediate_operand" "i")] + UNSPEC_VCAGT))] + "TARGET_NEON" + "vacgt.\t%0, %1, %2" + [(set (attr "neon_type") + (if_then_else (ne (symbol_ref "") (const_int 0)) + (const_string "neon_fp_vadd_ddd_vabs_dd") + (const_string "neon_fp_vadd_qqq_vabs_qq")))] +) + +(define_insn "neon_vtst" + [(set (match_operand:VDQIW 0 "s_register_operand" "=w") + (unspec:VDQIW [(match_operand:VDQIW 1 "s_register_operand" "w") + (match_operand:VDQIW 2 "s_register_operand" "w") + (match_operand:SI 3 "immediate_operand" "i")] + UNSPEC_VTST))] + "TARGET_NEON" + "vtst.\t%0, %1, %2" + [(set_attr "neon_type" "neon_int_4")] +) + +(define_insn "neon_vabd" + [(set (match_operand:VDQW 0 "s_register_operand" "=w") + (unspec:VDQW [(match_operand:VDQW 1 "s_register_operand" "w") + (match_operand:VDQW 2 "s_register_operand" "w") + (match_operand:SI 3 "immediate_operand" "i")] + UNSPEC_VABD))] + "TARGET_NEON" + "vabd.%T3%#\t%0, %1, %2" + [(set (attr "neon_type") + (if_then_else (ne (symbol_ref "") (const_int 0)) + (if_then_else (ne (symbol_ref "") (const_int 0)) + (const_string "neon_fp_vadd_ddd_vabs_dd") + (const_string "neon_fp_vadd_qqq_vabs_qq")) + (const_string "neon_int_5")))] +) + +(define_insn "neon_vabdl" + [(set (match_operand: 0 "s_register_operand" "=w") + (unspec: [(match_operand:VW 1 "s_register_operand" "w") + (match_operand:VW 2 "s_register_operand" "w") + (match_operand:SI 3 "immediate_operand" "i")] + UNSPEC_VABDL))] + "TARGET_NEON" + "vabdl.%T3%#\t%q0, %P1, %P2" + [(set_attr "neon_type" "neon_int_5")] +) + +(define_insn "neon_vaba" + [(set (match_operand:VDQIW 0 "s_register_operand" "=w") + (plus:VDQIW (match_operand:VDQIW 1 "s_register_operand" "0") + (unspec:VDQIW [(match_operand:VDQIW 2 "s_register_operand" "w") + (match_operand:VDQIW 3 "s_register_operand" "w") + (match_operand:SI 4 "immediate_operand" "i")] + UNSPEC_VABD)))] + "TARGET_NEON" + "vaba.%T4%#\t%0, %2, %3" + [(set (attr "neon_type") + (if_then_else (ne (symbol_ref "") (const_int 0)) + (const_string "neon_vaba") (const_string "neon_vaba_qqq")))] +) + +(define_insn "neon_vabal" + [(set (match_operand: 0 "s_register_operand" "=w") + (plus: (match_operand: 1 "s_register_operand" "0") + (unspec: [(match_operand:VW 2 "s_register_operand" "w") + (match_operand:VW 3 "s_register_operand" "w") + (match_operand:SI 4 "immediate_operand" "i")] + UNSPEC_VABDL)))] + "TARGET_NEON" + "vabal.%T4%#\t%q0, %P2, %P3" + [(set_attr "neon_type" "neon_vaba")] +) + +(define_insn "neon_vmax" + [(set (match_operand:VDQW 0 "s_register_operand" "=w") + (unspec:VDQW [(match_operand:VDQW 1 "s_register_operand" "w") + (match_operand:VDQW 2 "s_register_operand" "w") + (match_operand:SI 3 "immediate_operand" "i")] + UNSPEC_VMAX))] + "TARGET_NEON" + "vmax.%T3%#\t%0, %1, %2" + [(set (attr "neon_type") + (if_then_else (ne (symbol_ref "") (const_int 0)) + (if_then_else (ne (symbol_ref "") (const_int 0)) + (const_string "neon_fp_vadd_ddd_vabs_dd") + (const_string "neon_fp_vadd_qqq_vabs_qq")) + (const_string "neon_int_5")))] +) + +(define_insn "neon_vmin" + [(set (match_operand:VDQW 0 "s_register_operand" "=w") + (unspec:VDQW [(match_operand:VDQW 1 "s_register_operand" "w") + (match_operand:VDQW 2 "s_register_operand" "w") + (match_operand:SI 3 "immediate_operand" "i")] + UNSPEC_VMIN))] + "TARGET_NEON" + "vmin.%T3%#\t%0, %1, %2" + [(set (attr "neon_type") + (if_then_else (ne (symbol_ref "") (const_int 0)) + (if_then_else (ne (symbol_ref "") (const_int 0)) + (const_string "neon_fp_vadd_ddd_vabs_dd") + (const_string "neon_fp_vadd_qqq_vabs_qq")) + (const_string "neon_int_5")))] +) + +(define_expand "neon_vpadd" + [(match_operand:VD 0 "s_register_operand" "=w") + (match_operand:VD 1 "s_register_operand" "w") + (match_operand:VD 2 "s_register_operand" "w") + (match_operand:SI 3 "immediate_operand" "i")] + "TARGET_NEON" +{ + emit_insn (gen_neon_vpadd_internal (operands[0], operands[1], + operands[2])); + DONE; +}) + +(define_insn "neon_vpaddl" + [(set (match_operand: 0 "s_register_operand" "=w") + (unspec: [(match_operand:VDQIW 1 "s_register_operand" "w") + (match_operand:SI 2 "immediate_operand" "i")] + UNSPEC_VPADDL))] + "TARGET_NEON" + "vpaddl.%T2%#\t%0, %1" + ;; Assume this schedules like vaddl. + [(set_attr "neon_type" "neon_int_3")] +) + +(define_insn "neon_vpadal" + [(set (match_operand: 0 "s_register_operand" "=w") + (unspec: [(match_operand: 1 "s_register_operand" "0") + (match_operand:VDQIW 2 "s_register_operand" "w") + (match_operand:SI 3 "immediate_operand" "i")] + UNSPEC_VPADAL))] + "TARGET_NEON" + "vpadal.%T3%#\t%0, %2" + ;; Assume this schedules like vpadd. + [(set_attr "neon_type" "neon_int_1")] +) + +(define_insn "neon_vpmax" + [(set (match_operand:VD 0 "s_register_operand" "=w") + (unspec:VD [(match_operand:VD 1 "s_register_operand" "w") + (match_operand:VD 2 "s_register_operand" "w") + (match_operand:SI 3 "immediate_operand" "i")] + UNSPEC_VPMAX))] + "TARGET_NEON" + "vpmax.%T3%#\t%0, %1, %2" + ;; Assume this schedules like vmax. + [(set (attr "neon_type") + (if_then_else (ne (symbol_ref "") (const_int 0)) + (const_string "neon_fp_vadd_ddd_vabs_dd") + (const_string "neon_int_5")))] +) + +(define_insn "neon_vpmin" + [(set (match_operand:VD 0 "s_register_operand" "=w") + (unspec:VD [(match_operand:VD 1 "s_register_operand" "w") + (match_operand:VD 2 "s_register_operand" "w") + (match_operand:SI 3 "immediate_operand" "i")] + UNSPEC_VPMIN))] + "TARGET_NEON" + "vpmin.%T3%#\t%0, %1, %2" + ;; Assume this schedules like vmin. + [(set (attr "neon_type") + (if_then_else (ne (symbol_ref "") (const_int 0)) + (const_string "neon_fp_vadd_ddd_vabs_dd") + (const_string "neon_int_5")))] +) + +(define_insn "neon_vrecps" + [(set (match_operand:VCVTF 0 "s_register_operand" "=w") + (unspec:VCVTF [(match_operand:VCVTF 1 "s_register_operand" "w") + (match_operand:VCVTF 2 "s_register_operand" "w") + (match_operand:SI 3 "immediate_operand" "i")] + UNSPEC_VRECPS))] + "TARGET_NEON" + "vrecps.\t%0, %1, %2" + [(set (attr "neon_type") + (if_then_else (ne (symbol_ref "") (const_int 0)) + (const_string "neon_fp_vrecps_vrsqrts_ddd") + (const_string "neon_fp_vrecps_vrsqrts_qqq")))] +) + +(define_insn "neon_vrsqrts" + [(set (match_operand:VCVTF 0 "s_register_operand" "=w") + (unspec:VCVTF [(match_operand:VCVTF 1 "s_register_operand" "w") + (match_operand:VCVTF 2 "s_register_operand" "w") + (match_operand:SI 3 "immediate_operand" "i")] + UNSPEC_VRSQRTS))] + "TARGET_NEON" + "vrsqrts.\t%0, %1, %2" + [(set (attr "neon_type") + (if_then_else (ne (symbol_ref "") (const_int 0)) + (const_string "neon_fp_vrecps_vrsqrts_ddd") + (const_string "neon_fp_vrecps_vrsqrts_qqq")))] +) + +(define_expand "neon_vabs" + [(match_operand:VDQW 0 "s_register_operand" "") + (match_operand:VDQW 1 "s_register_operand" "") + (match_operand:SI 2 "immediate_operand" "")] + "TARGET_NEON" +{ + emit_insn (gen_abs2 (operands[0], operands[1])); + DONE; +}) + +(define_insn "neon_vqabs" + [(set (match_operand:VDQIW 0 "s_register_operand" "=w") + (unspec:VDQIW [(match_operand:VDQIW 1 "s_register_operand" "w") + (match_operand:SI 2 "immediate_operand" "i")] + UNSPEC_VQABS))] + "TARGET_NEON" + "vqabs.\t%0, %1" + [(set_attr "neon_type" "neon_vqneg_vqabs")] +) + +(define_expand "neon_vneg" + [(match_operand:VDQW 0 "s_register_operand" "") + (match_operand:VDQW 1 "s_register_operand" "") + (match_operand:SI 2 "immediate_operand" "")] + "TARGET_NEON" +{ + emit_insn (gen_neg2 (operands[0], operands[1])); + DONE; +}) + +(define_insn "neon_vqneg" + [(set (match_operand:VDQIW 0 "s_register_operand" "=w") + (unspec:VDQIW [(match_operand:VDQIW 1 "s_register_operand" "w") + (match_operand:SI 2 "immediate_operand" "i")] + UNSPEC_VQNEG))] + "TARGET_NEON" + "vqneg.\t%0, %1" + [(set_attr "neon_type" "neon_vqneg_vqabs")] +) + +(define_insn "neon_vcls" + [(set (match_operand:VDQIW 0 "s_register_operand" "=w") + (unspec:VDQIW [(match_operand:VDQIW 1 "s_register_operand" "w") + (match_operand:SI 2 "immediate_operand" "i")] + UNSPEC_VCLS))] + "TARGET_NEON" + "vcls.\t%0, %1" + [(set_attr "neon_type" "neon_int_1")] +) + +(define_insn "clz2" + [(set (match_operand:VDQIW 0 "s_register_operand" "=w") + (clz:VDQIW (match_operand:VDQIW 1 "s_register_operand" "w")))] + "TARGET_NEON" + "vclz.\t%0, %1" + [(set_attr "neon_type" "neon_int_1")] +) + +(define_expand "neon_vclz" + [(match_operand:VDQIW 0 "s_register_operand" "") + (match_operand:VDQIW 1 "s_register_operand" "") + (match_operand:SI 2 "immediate_operand" "")] + "TARGET_NEON" +{ + emit_insn (gen_clz2 (operands[0], operands[1])); + DONE; +}) + +(define_insn "popcount2" + [(set (match_operand:VE 0 "s_register_operand" "=w") + (popcount:VE (match_operand:VE 1 "s_register_operand" "w")))] + "TARGET_NEON" + "vcnt.\t%0, %1" + [(set_attr "neon_type" "neon_int_1")] +) + +(define_expand "neon_vcnt" + [(match_operand:VE 0 "s_register_operand" "=w") + (match_operand:VE 1 "s_register_operand" "w") + (match_operand:SI 2 "immediate_operand" "i")] + "TARGET_NEON" +{ + emit_insn (gen_popcount2 (operands[0], operands[1])); + DONE; +}) + +(define_insn "neon_vrecpe" + [(set (match_operand:V32 0 "s_register_operand" "=w") + (unspec:V32 [(match_operand:V32 1 "s_register_operand" "w") + (match_operand:SI 2 "immediate_operand" "i")] + UNSPEC_VRECPE))] + "TARGET_NEON" + "vrecpe.\t%0, %1" + [(set (attr "neon_type") + (if_then_else (ne (symbol_ref "") (const_int 0)) + (const_string "neon_fp_vadd_ddd_vabs_dd") + (const_string "neon_fp_vadd_qqq_vabs_qq")))] +) + +(define_insn "neon_vrsqrte" + [(set (match_operand:V32 0 "s_register_operand" "=w") + (unspec:V32 [(match_operand:V32 1 "s_register_operand" "w") + (match_operand:SI 2 "immediate_operand" "i")] + UNSPEC_VRSQRTE))] + "TARGET_NEON" + "vrsqrte.\t%0, %1" + [(set (attr "neon_type") + (if_then_else (ne (symbol_ref "") (const_int 0)) + (const_string "neon_fp_vadd_ddd_vabs_dd") + (const_string "neon_fp_vadd_qqq_vabs_qq")))] +) + +(define_expand "neon_vmvn" + [(match_operand:VDQIW 0 "s_register_operand" "") + (match_operand:VDQIW 1 "s_register_operand" "") + (match_operand:SI 2 "immediate_operand" "")] + "TARGET_NEON" +{ + emit_insn (gen_one_cmpl2 (operands[0], operands[1])); + DONE; +}) + +(define_insn "neon_vget_lane_sext_internal" + [(set (match_operand:SI 0 "s_register_operand" "=r") + (sign_extend:SI + (vec_select: + (match_operand:VD 1 "s_register_operand" "w") + (parallel [(match_operand:SI 2 "immediate_operand" "i")]))))] + "TARGET_NEON" +{ + if (BYTES_BIG_ENDIAN) + { + int elt = INTVAL (operands[2]); + elt = GET_MODE_NUNITS (mode) - 1 - elt; + operands[2] = GEN_INT (elt); + } + return "vmov%?.s\t%0, %P1[%c2]"; +} + [(set_attr "predicable" "yes") + (set_attr "neon_type" "neon_bp_simple")] +) + +(define_insn "neon_vget_lane_zext_internal" + [(set (match_operand:SI 0 "s_register_operand" "=r") + (zero_extend:SI + (vec_select: + (match_operand:VD 1 "s_register_operand" "w") + (parallel [(match_operand:SI 2 "immediate_operand" "i")]))))] + "TARGET_NEON" +{ + if (BYTES_BIG_ENDIAN) + { + int elt = INTVAL (operands[2]); + elt = GET_MODE_NUNITS (mode) - 1 - elt; + operands[2] = GEN_INT (elt); + } + return "vmov%?.u\t%0, %P1[%c2]"; +} + [(set_attr "predicable" "yes") + (set_attr "neon_type" "neon_bp_simple")] +) + +(define_insn "neon_vget_lane_sext_internal" + [(set (match_operand:SI 0 "s_register_operand" "=r") + (sign_extend:SI + (vec_select: + (match_operand:VQ 1 "s_register_operand" "w") + (parallel [(match_operand:SI 2 "immediate_operand" "i")]))))] + "TARGET_NEON" +{ + rtx ops[3]; + int regno = REGNO (operands[1]); + unsigned int halfelts = GET_MODE_NUNITS (mode) / 2; + unsigned int elt = INTVAL (operands[2]); + unsigned int elt_adj = elt % halfelts; + + if (BYTES_BIG_ENDIAN) + elt_adj = halfelts - 1 - elt_adj; + + ops[0] = operands[0]; + ops[1] = gen_rtx_REG (mode, regno + 2 * (elt / halfelts)); + ops[2] = GEN_INT (elt_adj); + output_asm_insn ("vmov%?.s\t%0, %P1[%c2]", ops); + + return ""; +} + [(set_attr "predicable" "yes") + (set_attr "neon_type" "neon_bp_simple")] +) + +(define_insn "neon_vget_lane_zext_internal" + [(set (match_operand:SI 0 "s_register_operand" "=r") + (zero_extend:SI + (vec_select: + (match_operand:VQ 1 "s_register_operand" "w") + (parallel [(match_operand:SI 2 "immediate_operand" "i")]))))] + "TARGET_NEON" +{ + rtx ops[3]; + int regno = REGNO (operands[1]); + unsigned int halfelts = GET_MODE_NUNITS (mode) / 2; + unsigned int elt = INTVAL (operands[2]); + unsigned int elt_adj = elt % halfelts; + + if (BYTES_BIG_ENDIAN) + elt_adj = halfelts - 1 - elt_adj; + + ops[0] = operands[0]; + ops[1] = gen_rtx_REG (mode, regno + 2 * (elt / halfelts)); + ops[2] = GEN_INT (elt_adj); + output_asm_insn ("vmov%?.u\t%0, %P1[%c2]", ops); + + return ""; +} + [(set_attr "predicable" "yes") + (set_attr "neon_type" "neon_bp_simple")] +) + +(define_expand "neon_vget_lane" + [(match_operand: 0 "s_register_operand" "") + (match_operand:VDQW 1 "s_register_operand" "") + (match_operand:SI 2 "immediate_operand" "") + (match_operand:SI 3 "immediate_operand" "")] + "TARGET_NEON" +{ + HOST_WIDE_INT magic = INTVAL (operands[3]); + rtx insn; + + neon_lane_bounds (operands[2], 0, GET_MODE_NUNITS (mode)); + + if (BYTES_BIG_ENDIAN) + { + /* The intrinsics are defined in terms of a model where the + element ordering in memory is vldm order, whereas the generic + RTL is defined in terms of a model where the element ordering + in memory is array order. Convert the lane number to conform + to this model. */ + unsigned int elt = INTVAL (operands[2]); + unsigned int reg_nelts + = 64 / GET_MODE_BITSIZE (GET_MODE_INNER (mode)); + elt ^= reg_nelts - 1; + operands[2] = GEN_INT (elt); + } + + if ((magic & 3) == 3 || GET_MODE_BITSIZE (GET_MODE_INNER (mode)) == 32) + insn = gen_vec_extract (operands[0], operands[1], operands[2]); + else + { + if ((magic & 1) != 0) + insn = gen_neon_vget_lane_sext_internal (operands[0], operands[1], + operands[2]); + else + insn = gen_neon_vget_lane_zext_internal (operands[0], operands[1], + operands[2]); + } + emit_insn (insn); + DONE; +}) + +; Operand 3 (info word) is ignored because it does nothing useful with 64-bit +; elements. + +(define_expand "neon_vget_lanedi" + [(match_operand:DI 0 "s_register_operand" "=r") + (match_operand:DI 1 "s_register_operand" "w") + (match_operand:SI 2 "immediate_operand" "i") + (match_operand:SI 3 "immediate_operand" "i")] + "TARGET_NEON" +{ + neon_lane_bounds (operands[2], 0, 1); + emit_move_insn (operands[0], operands[1]); + DONE; +}) + +(define_expand "neon_vget_lanev2di" + [(match_operand:DI 0 "s_register_operand" "=r") + (match_operand:V2DI 1 "s_register_operand" "w") + (match_operand:SI 2 "immediate_operand" "i") + (match_operand:SI 3 "immediate_operand" "i")] + "TARGET_NEON" +{ + neon_lane_bounds (operands[2], 0, 2); + emit_insn (gen_vec_extractv2di (operands[0], operands[1], operands[2])); + DONE; +}) + +(define_expand "neon_vset_lane" + [(match_operand:VDQ 0 "s_register_operand" "=w") + (match_operand: 1 "s_register_operand" "r") + (match_operand:VDQ 2 "s_register_operand" "0") + (match_operand:SI 3 "immediate_operand" "i")] + "TARGET_NEON" +{ + unsigned int elt = INTVAL (operands[3]); + neon_lane_bounds (operands[3], 0, GET_MODE_NUNITS (mode)); + + if (BYTES_BIG_ENDIAN) + { + unsigned int reg_nelts + = 64 / GET_MODE_BITSIZE (GET_MODE_INNER (mode)); + elt ^= reg_nelts - 1; + } + + emit_insn (gen_vec_set_internal (operands[0], operands[1], + GEN_INT (1 << elt), operands[2])); + DONE; +}) + +; See neon_vget_lanedi comment for reasons operands 2 & 3 are ignored. + +(define_expand "neon_vset_lanedi" + [(match_operand:DI 0 "s_register_operand" "=w") + (match_operand:DI 1 "s_register_operand" "r") + (match_operand:DI 2 "s_register_operand" "0") + (match_operand:SI 3 "immediate_operand" "i")] + "TARGET_NEON" +{ + neon_lane_bounds (operands[3], 0, 1); + emit_move_insn (operands[0], operands[1]); + DONE; +}) + +(define_expand "neon_vcreate" + [(match_operand:VDX 0 "s_register_operand" "") + (match_operand:DI 1 "general_operand" "")] + "TARGET_NEON" +{ + rtx src = gen_lowpart (mode, operands[1]); + emit_move_insn (operands[0], src); + DONE; +}) + +(define_insn "neon_vdup_n" + [(set (match_operand:VX 0 "s_register_operand" "=w") + (vec_duplicate:VX (match_operand: 1 "s_register_operand" "r")))] + "TARGET_NEON" + "vdup%?.\t%0, %1" + ;; Assume this schedules like vmov. + [(set_attr "predicable" "yes") + (set_attr "neon_type" "neon_bp_simple")] +) + +(define_insn "neon_vdup_n" + [(set (match_operand:V32 0 "s_register_operand" "=w,w") + (vec_duplicate:V32 (match_operand: 1 "s_register_operand" "r,t")))] + "TARGET_NEON" + "@ + vdup%?.\t%0, %1 + vdup%?.\t%0, %y1" + ;; Assume this schedules like vmov. + [(set_attr "predicable" "yes") + (set_attr "neon_type" "neon_bp_simple")] +) + +(define_expand "neon_vdup_ndi" + [(match_operand:DI 0 "s_register_operand" "=w") + (match_operand:DI 1 "s_register_operand" "r")] + "TARGET_NEON" +{ + emit_move_insn (operands[0], operands[1]); + DONE; +} +) + +(define_insn "neon_vdup_nv2di" + [(set (match_operand:V2DI 0 "s_register_operand" "=w,w") + (vec_duplicate:V2DI (match_operand:DI 1 "s_register_operand" "r,w")))] + "TARGET_NEON" + "@ + vmov%?\t%e0, %Q1, %R1\;vmov%?\t%f0, %Q1, %R1 + vmov%?\t%e0, %P1\;vmov%?\t%f0, %P1" + [(set_attr "predicable" "yes") + (set_attr "length" "8") + (set_attr "neon_type" "neon_bp_simple")] +) + +(define_insn "neon_vdup_lane_internal" + [(set (match_operand:VDQW 0 "s_register_operand" "=w") + (vec_duplicate:VDQW + (vec_select: + (match_operand: 1 "s_register_operand" "w") + (parallel [(match_operand:SI 2 "immediate_operand" "i")]))))] + "TARGET_NEON" +{ + if (BYTES_BIG_ENDIAN) + { + int elt = INTVAL (operands[2]); + elt = GET_MODE_NUNITS (mode) - 1 - elt; + operands[2] = GEN_INT (elt); + } + if () + return "vdup.\t%P0, %P1[%c2]"; + else + return "vdup.\t%q0, %P1[%c2]"; +} + ;; Assume this schedules like vmov. + [(set_attr "neon_type" "neon_bp_simple")] +) + +(define_expand "neon_vdup_lane" + [(match_operand:VDQW 0 "s_register_operand" "=w") + (match_operand: 1 "s_register_operand" "w") + (match_operand:SI 2 "immediate_operand" "i")] + "TARGET_NEON" +{ + neon_lane_bounds (operands[2], 0, GET_MODE_NUNITS (mode)); + if (BYTES_BIG_ENDIAN) + { + unsigned int elt = INTVAL (operands[2]); + unsigned int reg_nelts + = 64 / GET_MODE_BITSIZE (GET_MODE_INNER (mode)); + elt ^= reg_nelts - 1; + operands[2] = GEN_INT (elt); + } + emit_insn (gen_neon_vdup_lane_internal (operands[0], operands[1], + operands[2])); + DONE; +}) + +; Scalar index is ignored, since only zero is valid here. +(define_expand "neon_vdup_lanedi" + [(match_operand:DI 0 "s_register_operand" "=w") + (match_operand:DI 1 "s_register_operand" "w") + (match_operand:SI 2 "immediate_operand" "i")] + "TARGET_NEON" +{ + neon_lane_bounds (operands[2], 0, 1); + emit_move_insn (operands[0], operands[1]); + DONE; +}) + +; Likewise for v2di, as the DImode second operand has only a single element. +(define_expand "neon_vdup_lanev2di" + [(match_operand:V2DI 0 "s_register_operand" "=w") + (match_operand:DI 1 "s_register_operand" "w") + (match_operand:SI 2 "immediate_operand" "i")] + "TARGET_NEON" +{ + neon_lane_bounds (operands[2], 0, 1); + emit_insn (gen_neon_vdup_nv2di (operands[0], operands[1])); + DONE; +}) + +;; In this insn, operand 1 should be low, and operand 2 the high part of the +;; dest vector. +;; FIXME: A different implementation of this builtin could make it much +;; more likely that we wouldn't actually need to output anything (we could make +;; it so that the reg allocator puts things in the right places magically +;; instead). Lack of subregs for vectors makes that tricky though, I think. + +(define_insn "neon_vcombine" + [(set (match_operand: 0 "s_register_operand" "=w") + (vec_concat: (match_operand:VDX 1 "s_register_operand" "w") + (match_operand:VDX 2 "s_register_operand" "w")))] + "TARGET_NEON" +{ + int dest = REGNO (operands[0]); + int src1 = REGNO (operands[1]); + int src2 = REGNO (operands[2]); + rtx destlo; + + if (src1 == dest && src2 == dest + 2) + return ""; + else if (src2 == dest && src1 == dest + 2) + /* Special case of reversed high/low parts. */ + return "vswp\t%P1, %P2"; + + destlo = gen_rtx_REG (mode, dest); + + if (!reg_overlap_mentioned_p (operands[2], destlo)) + { + /* Try to avoid unnecessary moves if part of the result is in the right + place already. */ + if (src1 != dest) + output_asm_insn ("vmov\t%e0, %P1", operands); + if (src2 != dest + 2) + output_asm_insn ("vmov\t%f0, %P2", operands); + } + else + { + if (src2 != dest + 2) + output_asm_insn ("vmov\t%f0, %P2", operands); + if (src1 != dest) + output_asm_insn ("vmov\t%e0, %P1", operands); + } + + return ""; +} + ;; We set the neon_type attribute based on the vmov instructions above. + [(set_attr "length" "8") + (set_attr "neon_type" "neon_bp_simple")] +) + +(define_insn "neon_vget_highv16qi" + [(set (match_operand:V8QI 0 "s_register_operand" "=w") + (vec_select:V8QI (match_operand:V16QI 1 "s_register_operand" "w") + (parallel [(const_int 8) (const_int 9) + (const_int 10) (const_int 11) + (const_int 12) (const_int 13) + (const_int 14) (const_int 15)])))] + "TARGET_NEON" +{ + int dest = REGNO (operands[0]); + int src = REGNO (operands[1]); + + if (dest != src + 2) + return "vmov\t%P0, %f1"; + else + return ""; +} + [(set_attr "neon_type" "neon_bp_simple")] +) + +(define_insn "neon_vget_highv8hi" + [(set (match_operand:V4HI 0 "s_register_operand" "=w") + (vec_select:V4HI (match_operand:V8HI 1 "s_register_operand" "w") + (parallel [(const_int 4) (const_int 5) + (const_int 6) (const_int 7)])))] + "TARGET_NEON" +{ + int dest = REGNO (operands[0]); + int src = REGNO (operands[1]); + + if (dest != src + 2) + return "vmov\t%P0, %f1"; + else + return ""; +} + [(set_attr "neon_type" "neon_bp_simple")] +) + +(define_insn "neon_vget_highv4si" + [(set (match_operand:V2SI 0 "s_register_operand" "=w") + (vec_select:V2SI (match_operand:V4SI 1 "s_register_operand" "w") + (parallel [(const_int 2) (const_int 3)])))] + "TARGET_NEON" +{ + int dest = REGNO (operands[0]); + int src = REGNO (operands[1]); + + if (dest != src + 2) + return "vmov\t%P0, %f1"; + else + return ""; +} + [(set_attr "neon_type" "neon_bp_simple")] +) + +(define_insn "neon_vget_highv4sf" + [(set (match_operand:V2SF 0 "s_register_operand" "=w") + (vec_select:V2SF (match_operand:V4SF 1 "s_register_operand" "w") + (parallel [(const_int 2) (const_int 3)])))] + "TARGET_NEON" +{ + int dest = REGNO (operands[0]); + int src = REGNO (operands[1]); + + if (dest != src + 2) + return "vmov\t%P0, %f1"; + else + return ""; +} + [(set_attr "neon_type" "neon_bp_simple")] +) + +(define_insn "neon_vget_highv2di" + [(set (match_operand:DI 0 "s_register_operand" "=w") + (vec_select:DI (match_operand:V2DI 1 "s_register_operand" "w") + (parallel [(const_int 1)])))] + "TARGET_NEON" +{ + int dest = REGNO (operands[0]); + int src = REGNO (operands[1]); + + if (dest != src + 2) + return "vmov\t%P0, %f1"; + else + return ""; +} + [(set_attr "neon_type" "neon_bp_simple")] +) + +(define_insn "neon_vget_lowv16qi" + [(set (match_operand:V8QI 0 "s_register_operand" "=w") + (vec_select:V8QI (match_operand:V16QI 1 "s_register_operand" "w") + (parallel [(const_int 0) (const_int 1) + (const_int 2) (const_int 3) + (const_int 4) (const_int 5) + (const_int 6) (const_int 7)])))] + "TARGET_NEON" +{ + int dest = REGNO (operands[0]); + int src = REGNO (operands[1]); + + if (dest != src) + return "vmov\t%P0, %e1"; + else + return ""; +} + [(set_attr "neon_type" "neon_bp_simple")] +) + +(define_insn "neon_vget_lowv8hi" + [(set (match_operand:V4HI 0 "s_register_operand" "=w") + (vec_select:V4HI (match_operand:V8HI 1 "s_register_operand" "w") + (parallel [(const_int 0) (const_int 1) + (const_int 2) (const_int 3)])))] + "TARGET_NEON" +{ + int dest = REGNO (operands[0]); + int src = REGNO (operands[1]); + + if (dest != src) + return "vmov\t%P0, %e1"; + else + return ""; +} + [(set_attr "neon_type" "neon_bp_simple")] +) + +(define_insn "neon_vget_lowv4si" + [(set (match_operand:V2SI 0 "s_register_operand" "=w") + (vec_select:V2SI (match_operand:V4SI 1 "s_register_operand" "w") + (parallel [(const_int 0) (const_int 1)])))] + "TARGET_NEON" +{ + int dest = REGNO (operands[0]); + int src = REGNO (operands[1]); + + if (dest != src) + return "vmov\t%P0, %e1"; + else + return ""; +} + [(set_attr "neon_type" "neon_bp_simple")] +) + +(define_insn "neon_vget_lowv4sf" + [(set (match_operand:V2SF 0 "s_register_operand" "=w") + (vec_select:V2SF (match_operand:V4SF 1 "s_register_operand" "w") + (parallel [(const_int 0) (const_int 1)])))] + "TARGET_NEON" +{ + int dest = REGNO (operands[0]); + int src = REGNO (operands[1]); + + if (dest != src) + return "vmov\t%P0, %e1"; + else + return ""; +} + [(set_attr "neon_type" "neon_bp_simple")] +) + +(define_insn "neon_vget_lowv2di" + [(set (match_operand:DI 0 "s_register_operand" "=w") + (vec_select:DI (match_operand:V2DI 1 "s_register_operand" "w") + (parallel [(const_int 0)])))] + "TARGET_NEON" +{ + int dest = REGNO (operands[0]); + int src = REGNO (operands[1]); + + if (dest != src) + return "vmov\t%P0, %e1"; + else + return ""; +} + [(set_attr "neon_type" "neon_bp_simple")] +) + +(define_insn "neon_vcvt" + [(set (match_operand: 0 "s_register_operand" "=w") + (unspec: [(match_operand:VCVTF 1 "s_register_operand" "w") + (match_operand:SI 2 "immediate_operand" "i")] + UNSPEC_VCVT))] + "TARGET_NEON" + "vcvt.%T2%#32.f32\t%0, %1" + [(set (attr "neon_type") + (if_then_else (ne (symbol_ref "") (const_int 0)) + (const_string "neon_fp_vadd_ddd_vabs_dd") + (const_string "neon_fp_vadd_qqq_vabs_qq")))] +) + +(define_insn "neon_vcvt" + [(set (match_operand: 0 "s_register_operand" "=w") + (unspec: [(match_operand:VCVTI 1 "s_register_operand" "w") + (match_operand:SI 2 "immediate_operand" "i")] + UNSPEC_VCVT))] + "TARGET_NEON" + "vcvt.f32.%T2%#32\t%0, %1" + [(set (attr "neon_type") + (if_then_else (ne (symbol_ref "") (const_int 0)) + (const_string "neon_fp_vadd_ddd_vabs_dd") + (const_string "neon_fp_vadd_qqq_vabs_qq")))] +) + +(define_insn "neon_vcvt_n" + [(set (match_operand: 0 "s_register_operand" "=w") + (unspec: [(match_operand:VCVTF 1 "s_register_operand" "w") + (match_operand:SI 2 "immediate_operand" "i") + (match_operand:SI 3 "immediate_operand" "i")] + UNSPEC_VCVT_N))] + "TARGET_NEON" +{ + neon_const_bounds (operands[2], 1, 33); + return "vcvt.%T3%#32.f32\t%0, %1, %2"; +} + [(set (attr "neon_type") + (if_then_else (ne (symbol_ref "") (const_int 0)) + (const_string "neon_fp_vadd_ddd_vabs_dd") + (const_string "neon_fp_vadd_qqq_vabs_qq")))] +) + +(define_insn "neon_vcvt_n" + [(set (match_operand: 0 "s_register_operand" "=w") + (unspec: [(match_operand:VCVTI 1 "s_register_operand" "w") + (match_operand:SI 2 "immediate_operand" "i") + (match_operand:SI 3 "immediate_operand" "i")] + UNSPEC_VCVT_N))] + "TARGET_NEON" +{ + neon_const_bounds (operands[2], 1, 33); + return "vcvt.f32.%T3%#32\t%0, %1, %2"; +} + [(set (attr "neon_type") + (if_then_else (ne (symbol_ref "") (const_int 0)) + (const_string "neon_fp_vadd_ddd_vabs_dd") + (const_string "neon_fp_vadd_qqq_vabs_qq")))] +) + +(define_insn "neon_vmovn" + [(set (match_operand: 0 "s_register_operand" "=w") + (unspec: [(match_operand:VN 1 "s_register_operand" "w") + (match_operand:SI 2 "immediate_operand" "i")] + UNSPEC_VMOVN))] + "TARGET_NEON" + "vmovn.\t%P0, %q1" + [(set_attr "neon_type" "neon_bp_simple")] +) + +(define_insn "neon_vqmovn" + [(set (match_operand: 0 "s_register_operand" "=w") + (unspec: [(match_operand:VN 1 "s_register_operand" "w") + (match_operand:SI 2 "immediate_operand" "i")] + UNSPEC_VQMOVN))] + "TARGET_NEON" + "vqmovn.%T2%#\t%P0, %q1" + [(set_attr "neon_type" "neon_shift_2")] +) + +(define_insn "neon_vqmovun" + [(set (match_operand: 0 "s_register_operand" "=w") + (unspec: [(match_operand:VN 1 "s_register_operand" "w") + (match_operand:SI 2 "immediate_operand" "i")] + UNSPEC_VQMOVUN))] + "TARGET_NEON" + "vqmovun.\t%P0, %q1" + [(set_attr "neon_type" "neon_shift_2")] +) + +(define_insn "neon_vmovl" + [(set (match_operand: 0 "s_register_operand" "=w") + (unspec: [(match_operand:VW 1 "s_register_operand" "w") + (match_operand:SI 2 "immediate_operand" "i")] + UNSPEC_VMOVL))] + "TARGET_NEON" + "vmovl.%T2%#\t%q0, %P1" + [(set_attr "neon_type" "neon_shift_1")] +) + +(define_insn "neon_vmul_lane" + [(set (match_operand:VMD 0 "s_register_operand" "=w") + (unspec:VMD [(match_operand:VMD 1 "s_register_operand" "w") + (match_operand:VMD 2 "s_register_operand" + "") + (match_operand:SI 3 "immediate_operand" "i") + (match_operand:SI 4 "immediate_operand" "i")] + UNSPEC_VMUL_LANE))] + "TARGET_NEON" +{ + neon_lane_bounds (operands[3], 0, GET_MODE_NUNITS (mode)); + return "vmul.\t%P0, %P1, %P2[%c3]"; +} + [(set (attr "neon_type") + (if_then_else (ne (symbol_ref "") (const_int 0)) + (const_string "neon_fp_vmul_ddd") + (if_then_else (ne (symbol_ref "") (const_int 0)) + (const_string "neon_mul_ddd_16_scalar_32_16_long_scalar") + (const_string "neon_mul_qdd_64_32_long_qqd_16_ddd_32_scalar_64_32_long_scalar"))))] +) + +(define_insn "neon_vmul_lane" + [(set (match_operand:VMQ 0 "s_register_operand" "=w") + (unspec:VMQ [(match_operand:VMQ 1 "s_register_operand" "w") + (match_operand: 2 "s_register_operand" + "") + (match_operand:SI 3 "immediate_operand" "i") + (match_operand:SI 4 "immediate_operand" "i")] + UNSPEC_VMUL_LANE))] + "TARGET_NEON" +{ + neon_lane_bounds (operands[3], 0, GET_MODE_NUNITS (mode)); + return "vmul.\t%q0, %q1, %P2[%c3]"; +} + [(set (attr "neon_type") + (if_then_else (ne (symbol_ref "") (const_int 0)) + (const_string "neon_fp_vmul_qqd") + (if_then_else (ne (symbol_ref "") (const_int 0)) + (const_string "neon_mul_qdd_64_32_long_qqd_16_ddd_32_scalar_64_32_long_scalar") + (const_string "neon_mul_qqd_32_scalar"))))] +) + +(define_insn "neon_vmull_lane" + [(set (match_operand: 0 "s_register_operand" "=w") + (unspec: [(match_operand:VMDI 1 "s_register_operand" "w") + (match_operand:VMDI 2 "s_register_operand" + "") + (match_operand:SI 3 "immediate_operand" "i") + (match_operand:SI 4 "immediate_operand" "i")] + UNSPEC_VMULL_LANE))] + "TARGET_NEON" +{ + neon_lane_bounds (operands[3], 0, GET_MODE_NUNITS (mode)); + return "vmull.%T4%#\t%q0, %P1, %P2[%c3]"; +} + [(set (attr "neon_type") + (if_then_else (ne (symbol_ref "") (const_int 0)) + (const_string "neon_mul_ddd_16_scalar_32_16_long_scalar") + (const_string "neon_mul_qdd_64_32_long_qqd_16_ddd_32_scalar_64_32_long_scalar")))] +) + +(define_insn "neon_vqdmull_lane" + [(set (match_operand: 0 "s_register_operand" "=w") + (unspec: [(match_operand:VMDI 1 "s_register_operand" "w") + (match_operand:VMDI 2 "s_register_operand" + "") + (match_operand:SI 3 "immediate_operand" "i") + (match_operand:SI 4 "immediate_operand" "i")] + UNSPEC_VQDMULL_LANE))] + "TARGET_NEON" +{ + neon_lane_bounds (operands[3], 0, GET_MODE_NUNITS (mode)); + return "vqdmull.\t%q0, %P1, %P2[%c3]"; +} + [(set (attr "neon_type") + (if_then_else (ne (symbol_ref "") (const_int 0)) + (const_string "neon_mul_ddd_16_scalar_32_16_long_scalar") + (const_string "neon_mul_qdd_64_32_long_qqd_16_ddd_32_scalar_64_32_long_scalar")))] +) + +(define_insn "neon_vqdmulh_lane" + [(set (match_operand:VMQI 0 "s_register_operand" "=w") + (unspec:VMQI [(match_operand:VMQI 1 "s_register_operand" "w") + (match_operand: 2 "s_register_operand" + "") + (match_operand:SI 3 "immediate_operand" "i") + (match_operand:SI 4 "immediate_operand" "i")] + UNSPEC_VQDMULH_LANE))] + "TARGET_NEON" +{ + neon_lane_bounds (operands[3], 0, GET_MODE_NUNITS (mode)); + return "vq%O4dmulh.%T4%#\t%q0, %q1, %P2[%c3]"; +} + [(set (attr "neon_type") + (if_then_else (ne (symbol_ref "") (const_int 0)) + (const_string "neon_mul_qdd_64_32_long_qqd_16_ddd_32_scalar_64_32_long_scalar") + (const_string "neon_mul_qqd_32_scalar")))] +) + +(define_insn "neon_vqdmulh_lane" + [(set (match_operand:VMDI 0 "s_register_operand" "=w") + (unspec:VMDI [(match_operand:VMDI 1 "s_register_operand" "w") + (match_operand:VMDI 2 "s_register_operand" + "") + (match_operand:SI 3 "immediate_operand" "i") + (match_operand:SI 4 "immediate_operand" "i")] + UNSPEC_VQDMULH_LANE))] + "TARGET_NEON" +{ + neon_lane_bounds (operands[3], 0, GET_MODE_NUNITS (mode)); + return "vq%O4dmulh.%T4%#\t%P0, %P1, %P2[%c3]"; +} + [(set (attr "neon_type") + (if_then_else (ne (symbol_ref "") (const_int 0)) + (const_string "neon_mul_ddd_16_scalar_32_16_long_scalar") + (const_string "neon_mul_qdd_64_32_long_qqd_16_ddd_32_scalar_64_32_long_scalar")))] +) + +(define_insn "neon_vmla_lane" + [(set (match_operand:VMD 0 "s_register_operand" "=w") + (unspec:VMD [(match_operand:VMD 1 "s_register_operand" "0") + (match_operand:VMD 2 "s_register_operand" "w") + (match_operand:VMD 3 "s_register_operand" + "") + (match_operand:SI 4 "immediate_operand" "i") + (match_operand:SI 5 "immediate_operand" "i")] + UNSPEC_VMLA_LANE))] + "TARGET_NEON" +{ + neon_lane_bounds (operands[4], 0, GET_MODE_NUNITS (mode)); + return "vmla.\t%P0, %P2, %P3[%c4]"; +} + [(set (attr "neon_type") + (if_then_else (ne (symbol_ref "") (const_int 0)) + (const_string "neon_fp_vmla_ddd_scalar") + (if_then_else (ne (symbol_ref "") (const_int 0)) + (const_string "neon_mla_ddd_16_scalar_qdd_32_16_long_scalar") + (const_string "neon_mla_ddd_32_qqd_16_ddd_32_scalar_qdd_64_32_long_scalar_qdd_64_32_long"))))] +) + +(define_insn "neon_vmla_lane" + [(set (match_operand:VMQ 0 "s_register_operand" "=w") + (unspec:VMQ [(match_operand:VMQ 1 "s_register_operand" "0") + (match_operand:VMQ 2 "s_register_operand" "w") + (match_operand: 3 "s_register_operand" + "") + (match_operand:SI 4 "immediate_operand" "i") + (match_operand:SI 5 "immediate_operand" "i")] + UNSPEC_VMLA_LANE))] + "TARGET_NEON" +{ + neon_lane_bounds (operands[4], 0, GET_MODE_NUNITS (mode)); + return "vmla.\t%q0, %q2, %P3[%c4]"; +} + [(set (attr "neon_type") + (if_then_else (ne (symbol_ref "") (const_int 0)) + (const_string "neon_fp_vmla_qqq_scalar") + (if_then_else (ne (symbol_ref "") (const_int 0)) + (const_string "neon_mla_ddd_32_qqd_16_ddd_32_scalar_qdd_64_32_long_scalar_qdd_64_32_long") + (const_string "neon_mla_qqq_32_qqd_32_scalar"))))] +) + +(define_insn "neon_vmlal_lane" + [(set (match_operand: 0 "s_register_operand" "=w") + (unspec: [(match_operand: 1 "s_register_operand" "0") + (match_operand:VMDI 2 "s_register_operand" "w") + (match_operand:VMDI 3 "s_register_operand" + "") + (match_operand:SI 4 "immediate_operand" "i") + (match_operand:SI 5 "immediate_operand" "i")] + UNSPEC_VMLAL_LANE))] + "TARGET_NEON" +{ + neon_lane_bounds (operands[4], 0, GET_MODE_NUNITS (mode)); + return "vmlal.%T5%#\t%q0, %P2, %P3[%c4]"; +} + [(set (attr "neon_type") + (if_then_else (ne (symbol_ref "") (const_int 0)) + (const_string "neon_mla_ddd_16_scalar_qdd_32_16_long_scalar") + (const_string "neon_mla_ddd_32_qqd_16_ddd_32_scalar_qdd_64_32_long_scalar_qdd_64_32_long")))] +) + +(define_insn "neon_vqdmlal_lane" + [(set (match_operand: 0 "s_register_operand" "=w") + (unspec: [(match_operand: 1 "s_register_operand" "0") + (match_operand:VMDI 2 "s_register_operand" "w") + (match_operand:VMDI 3 "s_register_operand" + "") + (match_operand:SI 4 "immediate_operand" "i") + (match_operand:SI 5 "immediate_operand" "i")] + UNSPEC_VQDMLAL_LANE))] + "TARGET_NEON" +{ + neon_lane_bounds (operands[4], 0, GET_MODE_NUNITS (mode)); + return "vqdmlal.\t%q0, %P2, %P3[%c4]"; +} + [(set (attr "neon_type") + (if_then_else (ne (symbol_ref "") (const_int 0)) + (const_string "neon_mla_ddd_16_scalar_qdd_32_16_long_scalar") + (const_string "neon_mla_ddd_32_qqd_16_ddd_32_scalar_qdd_64_32_long_scalar_qdd_64_32_long")))] +) + +(define_insn "neon_vmls_lane" + [(set (match_operand:VMD 0 "s_register_operand" "=w") + (unspec:VMD [(match_operand:VMD 1 "s_register_operand" "0") + (match_operand:VMD 2 "s_register_operand" "w") + (match_operand:VMD 3 "s_register_operand" + "") + (match_operand:SI 4 "immediate_operand" "i") + (match_operand:SI 5 "immediate_operand" "i")] + UNSPEC_VMLS_LANE))] + "TARGET_NEON" +{ + neon_lane_bounds (operands[4], 0, GET_MODE_NUNITS (mode)); + return "vmls.\t%P0, %P2, %P3[%c4]"; +} + [(set (attr "neon_type") + (if_then_else (ne (symbol_ref "") (const_int 0)) + (const_string "neon_fp_vmla_ddd_scalar") + (if_then_else (ne (symbol_ref "") (const_int 0)) + (const_string "neon_mla_ddd_16_scalar_qdd_32_16_long_scalar") + (const_string "neon_mla_ddd_32_qqd_16_ddd_32_scalar_qdd_64_32_long_scalar_qdd_64_32_long"))))] +) + +(define_insn "neon_vmls_lane" + [(set (match_operand:VMQ 0 "s_register_operand" "=w") + (unspec:VMQ [(match_operand:VMQ 1 "s_register_operand" "0") + (match_operand:VMQ 2 "s_register_operand" "w") + (match_operand: 3 "s_register_operand" + "") + (match_operand:SI 4 "immediate_operand" "i") + (match_operand:SI 5 "immediate_operand" "i")] + UNSPEC_VMLS_LANE))] + "TARGET_NEON" +{ + neon_lane_bounds (operands[4], 0, GET_MODE_NUNITS (mode)); + return "vmls.\t%q0, %q2, %P3[%c4]"; +} + [(set (attr "neon_type") + (if_then_else (ne (symbol_ref "") (const_int 0)) + (const_string "neon_fp_vmla_qqq_scalar") + (if_then_else (ne (symbol_ref "") (const_int 0)) + (const_string "neon_mla_ddd_32_qqd_16_ddd_32_scalar_qdd_64_32_long_scalar_qdd_64_32_long") + (const_string "neon_mla_qqq_32_qqd_32_scalar"))))] +) + +(define_insn "neon_vmlsl_lane" + [(set (match_operand: 0 "s_register_operand" "=w") + (unspec: [(match_operand: 1 "s_register_operand" "0") + (match_operand:VMDI 2 "s_register_operand" "w") + (match_operand:VMDI 3 "s_register_operand" + "") + (match_operand:SI 4 "immediate_operand" "i") + (match_operand:SI 5 "immediate_operand" "i")] + UNSPEC_VMLSL_LANE))] + "TARGET_NEON" +{ + neon_lane_bounds (operands[4], 0, GET_MODE_NUNITS (mode)); + return "vmlsl.%T5%#\t%q0, %P2, %P3[%c4]"; +} + [(set (attr "neon_type") + (if_then_else (ne (symbol_ref "") (const_int 0)) + (const_string "neon_mla_ddd_16_scalar_qdd_32_16_long_scalar") + (const_string "neon_mla_ddd_32_qqd_16_ddd_32_scalar_qdd_64_32_long_scalar_qdd_64_32_long")))] +) + +(define_insn "neon_vqdmlsl_lane" + [(set (match_operand: 0 "s_register_operand" "=w") + (unspec: [(match_operand: 1 "s_register_operand" "0") + (match_operand:VMDI 2 "s_register_operand" "w") + (match_operand:VMDI 3 "s_register_operand" + "") + (match_operand:SI 4 "immediate_operand" "i") + (match_operand:SI 5 "immediate_operand" "i")] + UNSPEC_VQDMLSL_LANE))] + "TARGET_NEON" +{ + neon_lane_bounds (operands[4], 0, GET_MODE_NUNITS (mode)); + return "vqdmlsl.\t%q0, %P2, %P3[%c4]"; +} + [(set (attr "neon_type") + (if_then_else (ne (symbol_ref "") (const_int 0)) + (const_string "neon_mla_ddd_16_scalar_qdd_32_16_long_scalar") + (const_string "neon_mla_ddd_32_qqd_16_ddd_32_scalar_qdd_64_32_long_scalar_qdd_64_32_long")))] +) + +; FIXME: For the "_n" multiply/multiply-accumulate insns, we copy a value in a +; core register into a temp register, then use a scalar taken from that. This +; isn't an optimal solution if e.g. the scalar has just been read from memory +; or extracted from another vector. The latter case it's currently better to +; use the "_lane" variant, and the former case can probably be implemented +; using vld1_lane, but that hasn't been done yet. + +(define_expand "neon_vmul_n" + [(match_operand:VMD 0 "s_register_operand" "") + (match_operand:VMD 1 "s_register_operand" "") + (match_operand: 2 "s_register_operand" "") + (match_operand:SI 3 "immediate_operand" "")] + "TARGET_NEON" +{ + rtx tmp = gen_reg_rtx (mode); + emit_insn (gen_neon_vset_lane (tmp, operands[2], tmp, const0_rtx)); + emit_insn (gen_neon_vmul_lane (operands[0], operands[1], tmp, + const0_rtx, const0_rtx)); + DONE; +}) + +(define_expand "neon_vmul_n" + [(match_operand:VMQ 0 "s_register_operand" "") + (match_operand:VMQ 1 "s_register_operand" "") + (match_operand: 2 "s_register_operand" "") + (match_operand:SI 3 "immediate_operand" "")] + "TARGET_NEON" +{ + rtx tmp = gen_reg_rtx (mode); + emit_insn (gen_neon_vset_lane (tmp, operands[2], tmp, const0_rtx)); + emit_insn (gen_neon_vmul_lane (operands[0], operands[1], tmp, + const0_rtx, const0_rtx)); + DONE; +}) + +(define_expand "neon_vmull_n" + [(match_operand: 0 "s_register_operand" "") + (match_operand:VMDI 1 "s_register_operand" "") + (match_operand: 2 "s_register_operand" "") + (match_operand:SI 3 "immediate_operand" "")] + "TARGET_NEON" +{ + rtx tmp = gen_reg_rtx (mode); + emit_insn (gen_neon_vset_lane (tmp, operands[2], tmp, const0_rtx)); + emit_insn (gen_neon_vmull_lane (operands[0], operands[1], tmp, + const0_rtx, operands[3])); + DONE; +}) + +(define_expand "neon_vqdmull_n" + [(match_operand: 0 "s_register_operand" "") + (match_operand:VMDI 1 "s_register_operand" "") + (match_operand: 2 "s_register_operand" "") + (match_operand:SI 3 "immediate_operand" "")] + "TARGET_NEON" +{ + rtx tmp = gen_reg_rtx (mode); + emit_insn (gen_neon_vset_lane (tmp, operands[2], tmp, const0_rtx)); + emit_insn (gen_neon_vqdmull_lane (operands[0], operands[1], tmp, + const0_rtx, const0_rtx)); + DONE; +}) + +(define_expand "neon_vqdmulh_n" + [(match_operand:VMDI 0 "s_register_operand" "") + (match_operand:VMDI 1 "s_register_operand" "") + (match_operand: 2 "s_register_operand" "") + (match_operand:SI 3 "immediate_operand" "")] + "TARGET_NEON" +{ + rtx tmp = gen_reg_rtx (mode); + emit_insn (gen_neon_vset_lane (tmp, operands[2], tmp, const0_rtx)); + emit_insn (gen_neon_vqdmulh_lane (operands[0], operands[1], tmp, + const0_rtx, operands[3])); + DONE; +}) + +(define_expand "neon_vqdmulh_n" + [(match_operand:VMQI 0 "s_register_operand" "") + (match_operand:VMQI 1 "s_register_operand" "") + (match_operand: 2 "s_register_operand" "") + (match_operand:SI 3 "immediate_operand" "")] + "TARGET_NEON" +{ + rtx tmp = gen_reg_rtx (mode); + emit_insn (gen_neon_vset_lane (tmp, operands[2], tmp, const0_rtx)); + emit_insn (gen_neon_vqdmulh_lane (operands[0], operands[1], tmp, + const0_rtx, operands[3])); + DONE; +}) + +(define_expand "neon_vmla_n" + [(match_operand:VMD 0 "s_register_operand" "") + (match_operand:VMD 1 "s_register_operand" "") + (match_operand:VMD 2 "s_register_operand" "") + (match_operand: 3 "s_register_operand" "") + (match_operand:SI 4 "immediate_operand" "")] + "TARGET_NEON" +{ + rtx tmp = gen_reg_rtx (mode); + emit_insn (gen_neon_vset_lane (tmp, operands[3], tmp, const0_rtx)); + emit_insn (gen_neon_vmla_lane (operands[0], operands[1], operands[2], + tmp, const0_rtx, operands[4])); + DONE; +}) + +(define_expand "neon_vmla_n" + [(match_operand:VMQ 0 "s_register_operand" "") + (match_operand:VMQ 1 "s_register_operand" "") + (match_operand:VMQ 2 "s_register_operand" "") + (match_operand: 3 "s_register_operand" "") + (match_operand:SI 4 "immediate_operand" "")] + "TARGET_NEON" +{ + rtx tmp = gen_reg_rtx (mode); + emit_insn (gen_neon_vset_lane (tmp, operands[3], tmp, const0_rtx)); + emit_insn (gen_neon_vmla_lane (operands[0], operands[1], operands[2], + tmp, const0_rtx, operands[4])); + DONE; +}) + +(define_expand "neon_vmlal_n" + [(match_operand: 0 "s_register_operand" "") + (match_operand: 1 "s_register_operand" "") + (match_operand:VMDI 2 "s_register_operand" "") + (match_operand: 3 "s_register_operand" "") + (match_operand:SI 4 "immediate_operand" "")] + "TARGET_NEON" +{ + rtx tmp = gen_reg_rtx (mode); + emit_insn (gen_neon_vset_lane (tmp, operands[3], tmp, const0_rtx)); + emit_insn (gen_neon_vmlal_lane (operands[0], operands[1], operands[2], + tmp, const0_rtx, operands[4])); + DONE; +}) + +(define_expand "neon_vqdmlal_n" + [(match_operand: 0 "s_register_operand" "") + (match_operand: 1 "s_register_operand" "") + (match_operand:VMDI 2 "s_register_operand" "") + (match_operand: 3 "s_register_operand" "") + (match_operand:SI 4 "immediate_operand" "")] + "TARGET_NEON" +{ + rtx tmp = gen_reg_rtx (mode); + emit_insn (gen_neon_vset_lane (tmp, operands[3], tmp, const0_rtx)); + emit_insn (gen_neon_vqdmlal_lane (operands[0], operands[1], operands[2], + tmp, const0_rtx, operands[4])); + DONE; +}) + +(define_expand "neon_vmls_n" + [(match_operand:VMD 0 "s_register_operand" "") + (match_operand:VMD 1 "s_register_operand" "") + (match_operand:VMD 2 "s_register_operand" "") + (match_operand: 3 "s_register_operand" "") + (match_operand:SI 4 "immediate_operand" "")] + "TARGET_NEON" +{ + rtx tmp = gen_reg_rtx (mode); + emit_insn (gen_neon_vset_lane (tmp, operands[3], tmp, const0_rtx)); + emit_insn (gen_neon_vmls_lane (operands[0], operands[1], operands[2], + tmp, const0_rtx, operands[4])); + DONE; +}) + +(define_expand "neon_vmls_n" + [(match_operand:VMQ 0 "s_register_operand" "") + (match_operand:VMQ 1 "s_register_operand" "") + (match_operand:VMQ 2 "s_register_operand" "") + (match_operand: 3 "s_register_operand" "") + (match_operand:SI 4 "immediate_operand" "")] + "TARGET_NEON" +{ + rtx tmp = gen_reg_rtx (mode); + emit_insn (gen_neon_vset_lane (tmp, operands[3], tmp, const0_rtx)); + emit_insn (gen_neon_vmls_lane (operands[0], operands[1], operands[2], + tmp, const0_rtx, operands[4])); + DONE; +}) + +(define_expand "neon_vmlsl_n" + [(match_operand: 0 "s_register_operand" "") + (match_operand: 1 "s_register_operand" "") + (match_operand:VMDI 2 "s_register_operand" "") + (match_operand: 3 "s_register_operand" "") + (match_operand:SI 4 "immediate_operand" "")] + "TARGET_NEON" +{ + rtx tmp = gen_reg_rtx (mode); + emit_insn (gen_neon_vset_lane (tmp, operands[3], tmp, const0_rtx)); + emit_insn (gen_neon_vmlsl_lane (operands[0], operands[1], operands[2], + tmp, const0_rtx, operands[4])); + DONE; +}) + +(define_expand "neon_vqdmlsl_n" + [(match_operand: 0 "s_register_operand" "") + (match_operand: 1 "s_register_operand" "") + (match_operand:VMDI 2 "s_register_operand" "") + (match_operand: 3 "s_register_operand" "") + (match_operand:SI 4 "immediate_operand" "")] + "TARGET_NEON" +{ + rtx tmp = gen_reg_rtx (mode); + emit_insn (gen_neon_vset_lane (tmp, operands[3], tmp, const0_rtx)); + emit_insn (gen_neon_vqdmlsl_lane (operands[0], operands[1], operands[2], + tmp, const0_rtx, operands[4])); + DONE; +}) + +(define_insn "neon_vext" + [(set (match_operand:VDQX 0 "s_register_operand" "=w") + (unspec:VDQX [(match_operand:VDQX 1 "s_register_operand" "w") + (match_operand:VDQX 2 "s_register_operand" "w") + (match_operand:SI 3 "immediate_operand" "i")] + UNSPEC_VEXT))] + "TARGET_NEON" +{ + neon_const_bounds (operands[3], 0, GET_MODE_NUNITS (mode)); + return "vext.\t%0, %1, %2, %3"; +} + [(set (attr "neon_type") + (if_then_else (ne (symbol_ref "") (const_int 0)) + (const_string "neon_bp_simple") + (const_string "neon_bp_2cycle")))] +) + +(define_insn "neon_vrev64" + [(set (match_operand:VDQ 0 "s_register_operand" "=w") + (unspec:VDQ [(match_operand:VDQ 1 "s_register_operand" "w") + (match_operand:SI 2 "immediate_operand" "i")] + UNSPEC_VREV64))] + "TARGET_NEON" + "vrev64.\t%0, %1" + [(set_attr "neon_type" "neon_bp_simple")] +) + +(define_insn "neon_vrev32" + [(set (match_operand:VX 0 "s_register_operand" "=w") + (unspec:VX [(match_operand:VX 1 "s_register_operand" "w") + (match_operand:SI 2 "immediate_operand" "i")] + UNSPEC_VREV32))] + "TARGET_NEON" + "vrev32.\t%0, %1" + [(set_attr "neon_type" "neon_bp_simple")] +) + +(define_insn "neon_vrev16" + [(set (match_operand:VE 0 "s_register_operand" "=w") + (unspec:VE [(match_operand:VE 1 "s_register_operand" "w") + (match_operand:SI 2 "immediate_operand" "i")] + UNSPEC_VREV16))] + "TARGET_NEON" + "vrev16.\t%0, %1" + [(set_attr "neon_type" "neon_bp_simple")] +) + +; vbsl_* intrinsics may compile to any of vbsl/vbif/vbit depending on register +; allocation. For an intrinsic of form: +; rD = vbsl_* (rS, rN, rM) +; We can use any of: +; vbsl rS, rN, rM (if D = S) +; vbit rD, rN, rS (if D = M, so 1-bits in rS choose bits from rN, else rM) +; vbif rD, rM, rS (if D = N, so 0-bits in rS choose bits from rM, else rN) + +(define_insn "neon_vbsl_internal" + [(set (match_operand:VDQX 0 "s_register_operand" "=w,w,w") + (unspec:VDQX [(match_operand:VDQX 1 "s_register_operand" " 0,w,w") + (match_operand:VDQX 2 "s_register_operand" " w,w,0") + (match_operand:VDQX 3 "s_register_operand" " w,0,w")] + UNSPEC_VBSL))] + "TARGET_NEON" + "@ + vbsl\t%0, %2, %3 + vbit\t%0, %2, %1 + vbif\t%0, %3, %1" + [(set_attr "neon_type" "neon_int_1")] +) + +(define_expand "neon_vbsl" + [(set (match_operand:VDQX 0 "s_register_operand" "") + (unspec:VDQX [(match_operand: 1 "s_register_operand" "") + (match_operand:VDQX 2 "s_register_operand" "") + (match_operand:VDQX 3 "s_register_operand" "")] + UNSPEC_VBSL))] + "TARGET_NEON" +{ + /* We can't alias operands together if they have different modes. */ + operands[1] = gen_lowpart (mode, operands[1]); +}) + +(define_insn "neon_vshl" + [(set (match_operand:VDQIX 0 "s_register_operand" "=w") + (unspec:VDQIX [(match_operand:VDQIX 1 "s_register_operand" "w") + (match_operand:VDQIX 2 "s_register_operand" "w") + (match_operand:SI 3 "immediate_operand" "i")] + UNSPEC_VSHL))] + "TARGET_NEON" + "v%O3shl.%T3%#\t%0, %1, %2" + [(set (attr "neon_type") + (if_then_else (ne (symbol_ref "") (const_int 0)) + (const_string "neon_vshl_ddd") + (const_string "neon_shift_3")))] +) + +(define_insn "neon_vqshl" + [(set (match_operand:VDQIX 0 "s_register_operand" "=w") + (unspec:VDQIX [(match_operand:VDQIX 1 "s_register_operand" "w") + (match_operand:VDQIX 2 "s_register_operand" "w") + (match_operand:SI 3 "immediate_operand" "i")] + UNSPEC_VQSHL))] + "TARGET_NEON" + "vq%O3shl.%T3%#\t%0, %1, %2" + [(set (attr "neon_type") + (if_then_else (ne (symbol_ref "") (const_int 0)) + (const_string "neon_shift_2") + (const_string "neon_vqshl_vrshl_vqrshl_qqq")))] +) + +(define_insn "neon_vshr_n" + [(set (match_operand:VDQIX 0 "s_register_operand" "=w") + (unspec:VDQIX [(match_operand:VDQIX 1 "s_register_operand" "w") + (match_operand:SI 2 "immediate_operand" "i") + (match_operand:SI 3 "immediate_operand" "i")] + UNSPEC_VSHR_N))] + "TARGET_NEON" +{ + neon_const_bounds (operands[2], 1, neon_element_bits (mode) + 1); + return "v%O3shr.%T3%#\t%0, %1, %2"; +} + [(set_attr "neon_type" "neon_shift_1")] +) + +(define_insn "neon_vshrn_n" + [(set (match_operand: 0 "s_register_operand" "=w") + (unspec: [(match_operand:VN 1 "s_register_operand" "w") + (match_operand:SI 2 "immediate_operand" "i") + (match_operand:SI 3 "immediate_operand" "i")] + UNSPEC_VSHRN_N))] + "TARGET_NEON" +{ + neon_const_bounds (operands[2], 1, neon_element_bits (mode) / 2 + 1); + return "v%O3shrn.\t%P0, %q1, %2"; +} + [(set_attr "neon_type" "neon_shift_1")] +) + +(define_insn "neon_vqshrn_n" + [(set (match_operand: 0 "s_register_operand" "=w") + (unspec: [(match_operand:VN 1 "s_register_operand" "w") + (match_operand:SI 2 "immediate_operand" "i") + (match_operand:SI 3 "immediate_operand" "i")] + UNSPEC_VQSHRN_N))] + "TARGET_NEON" +{ + neon_const_bounds (operands[2], 1, neon_element_bits (mode) / 2 + 1); + return "vq%O3shrn.%T3%#\t%P0, %q1, %2"; +} + [(set_attr "neon_type" "neon_shift_2")] +) + +(define_insn "neon_vqshrun_n" + [(set (match_operand: 0 "s_register_operand" "=w") + (unspec: [(match_operand:VN 1 "s_register_operand" "w") + (match_operand:SI 2 "immediate_operand" "i") + (match_operand:SI 3 "immediate_operand" "i")] + UNSPEC_VQSHRUN_N))] + "TARGET_NEON" +{ + neon_const_bounds (operands[2], 1, neon_element_bits (mode) / 2 + 1); + return "vq%O3shrun.%T3%#\t%P0, %q1, %2"; +} + [(set_attr "neon_type" "neon_shift_2")] +) + +(define_insn "neon_vshl_n" + [(set (match_operand:VDQIX 0 "s_register_operand" "=w") + (unspec:VDQIX [(match_operand:VDQIX 1 "s_register_operand" "w") + (match_operand:SI 2 "immediate_operand" "i") + (match_operand:SI 3 "immediate_operand" "i")] + UNSPEC_VSHL_N))] + "TARGET_NEON" +{ + neon_const_bounds (operands[2], 0, neon_element_bits (mode)); + return "vshl.\t%0, %1, %2"; +} + [(set_attr "neon_type" "neon_shift_1")] +) + +(define_insn "neon_vqshl_n" + [(set (match_operand:VDQIX 0 "s_register_operand" "=w") + (unspec:VDQIX [(match_operand:VDQIX 1 "s_register_operand" "w") + (match_operand:SI 2 "immediate_operand" "i") + (match_operand:SI 3 "immediate_operand" "i")] + UNSPEC_VQSHL_N))] + "TARGET_NEON" +{ + neon_const_bounds (operands[2], 0, neon_element_bits (mode)); + return "vqshl.%T3%#\t%0, %1, %2"; +} + [(set_attr "neon_type" "neon_shift_2")] +) + +(define_insn "neon_vqshlu_n" + [(set (match_operand:VDQIX 0 "s_register_operand" "=w") + (unspec:VDQIX [(match_operand:VDQIX 1 "s_register_operand" "w") + (match_operand:SI 2 "immediate_operand" "i") + (match_operand:SI 3 "immediate_operand" "i")] + UNSPEC_VQSHLU_N))] + "TARGET_NEON" +{ + neon_const_bounds (operands[2], 0, neon_element_bits (mode)); + return "vqshlu.%T3%#\t%0, %1, %2"; +} + [(set_attr "neon_type" "neon_shift_2")] +) + +(define_insn "neon_vshll_n" + [(set (match_operand: 0 "s_register_operand" "=w") + (unspec: [(match_operand:VW 1 "s_register_operand" "w") + (match_operand:SI 2 "immediate_operand" "i") + (match_operand:SI 3 "immediate_operand" "i")] + UNSPEC_VSHLL_N))] + "TARGET_NEON" +{ + /* The boundaries are: 0 < imm <= size. */ + neon_const_bounds (operands[2], 0, neon_element_bits (mode) + 1); + return "vshll.%T3%#\t%q0, %P1, %2"; +} + [(set_attr "neon_type" "neon_shift_1")] +) + +(define_insn "neon_vsra_n" + [(set (match_operand:VDQIX 0 "s_register_operand" "=w") + (unspec:VDQIX [(match_operand:VDQIX 1 "s_register_operand" "0") + (match_operand:VDQIX 2 "s_register_operand" "w") + (match_operand:SI 3 "immediate_operand" "i") + (match_operand:SI 4 "immediate_operand" "i")] + UNSPEC_VSRA_N))] + "TARGET_NEON" +{ + neon_const_bounds (operands[3], 1, neon_element_bits (mode) + 1); + return "v%O4sra.%T4%#\t%0, %2, %3"; +} + [(set_attr "neon_type" "neon_vsra_vrsra")] +) + +(define_insn "neon_vsri_n" + [(set (match_operand:VDQIX 0 "s_register_operand" "=w") + (unspec:VDQIX [(match_operand:VDQIX 1 "s_register_operand" "0") + (match_operand:VDQIX 2 "s_register_operand" "w") + (match_operand:SI 3 "immediate_operand" "i")] + UNSPEC_VSRI))] + "TARGET_NEON" +{ + neon_const_bounds (operands[3], 1, neon_element_bits (mode) + 1); + return "vsri.\t%0, %2, %3"; +} + [(set (attr "neon_type") + (if_then_else (ne (symbol_ref "") (const_int 0)) + (const_string "neon_shift_1") + (const_string "neon_shift_3")))] +) + +(define_insn "neon_vsli_n" + [(set (match_operand:VDQIX 0 "s_register_operand" "=w") + (unspec:VDQIX [(match_operand:VDQIX 1 "s_register_operand" "0") + (match_operand:VDQIX 2 "s_register_operand" "w") + (match_operand:SI 3 "immediate_operand" "i")] + UNSPEC_VSLI))] + "TARGET_NEON" +{ + neon_const_bounds (operands[3], 0, neon_element_bits (mode)); + return "vsli.\t%0, %2, %3"; +} + [(set (attr "neon_type") + (if_then_else (ne (symbol_ref "") (const_int 0)) + (const_string "neon_shift_1") + (const_string "neon_shift_3")))] +) + +(define_insn "neon_vtbl1v8qi" + [(set (match_operand:V8QI 0 "s_register_operand" "=w") + (unspec:V8QI [(match_operand:V8QI 1 "s_register_operand" "w") + (match_operand:V8QI 2 "s_register_operand" "w")] + UNSPEC_VTBL))] + "TARGET_NEON" + "vtbl.8\t%P0, {%P1}, %P2" + [(set_attr "neon_type" "neon_bp_2cycle")] +) + +(define_insn "neon_vtbl2v8qi" + [(set (match_operand:V8QI 0 "s_register_operand" "=w") + (unspec:V8QI [(match_operand:TI 1 "s_register_operand" "w") + (match_operand:V8QI 2 "s_register_operand" "w")] + UNSPEC_VTBL))] + "TARGET_NEON" +{ + rtx ops[4]; + int tabbase = REGNO (operands[1]); + + ops[0] = operands[0]; + ops[1] = gen_rtx_REG (V8QImode, tabbase); + ops[2] = gen_rtx_REG (V8QImode, tabbase + 2); + ops[3] = operands[2]; + output_asm_insn ("vtbl.8\t%P0, {%P1, %P2}, %P3", ops); + + return ""; +} + [(set_attr "neon_type" "neon_bp_2cycle")] +) + +(define_insn "neon_vtbl3v8qi" + [(set (match_operand:V8QI 0 "s_register_operand" "=w") + (unspec:V8QI [(match_operand:EI 1 "s_register_operand" "w") + (match_operand:V8QI 2 "s_register_operand" "w")] + UNSPEC_VTBL))] + "TARGET_NEON" +{ + rtx ops[5]; + int tabbase = REGNO (operands[1]); + + ops[0] = operands[0]; + ops[1] = gen_rtx_REG (V8QImode, tabbase); + ops[2] = gen_rtx_REG (V8QImode, tabbase + 2); + ops[3] = gen_rtx_REG (V8QImode, tabbase + 4); + ops[4] = operands[2]; + output_asm_insn ("vtbl.8\t%P0, {%P1, %P2, %P3}, %P4", ops); + + return ""; +} + [(set_attr "neon_type" "neon_bp_3cycle")] +) + +(define_insn "neon_vtbl4v8qi" + [(set (match_operand:V8QI 0 "s_register_operand" "=w") + (unspec:V8QI [(match_operand:OI 1 "s_register_operand" "w") + (match_operand:V8QI 2 "s_register_operand" "w")] + UNSPEC_VTBL))] + "TARGET_NEON" +{ + rtx ops[6]; + int tabbase = REGNO (operands[1]); + + ops[0] = operands[0]; + ops[1] = gen_rtx_REG (V8QImode, tabbase); + ops[2] = gen_rtx_REG (V8QImode, tabbase + 2); + ops[3] = gen_rtx_REG (V8QImode, tabbase + 4); + ops[4] = gen_rtx_REG (V8QImode, tabbase + 6); + ops[5] = operands[2]; + output_asm_insn ("vtbl.8\t%P0, {%P1, %P2, %P3, %P4}, %P5", ops); + + return ""; +} + [(set_attr "neon_type" "neon_bp_3cycle")] +) + +(define_insn "neon_vtbx1v8qi" + [(set (match_operand:V8QI 0 "s_register_operand" "=w") + (unspec:V8QI [(match_operand:V8QI 1 "s_register_operand" "0") + (match_operand:V8QI 2 "s_register_operand" "w") + (match_operand:V8QI 3 "s_register_operand" "w")] + UNSPEC_VTBX))] + "TARGET_NEON" + "vtbx.8\t%P0, {%P2}, %P3" + [(set_attr "neon_type" "neon_bp_2cycle")] +) + +(define_insn "neon_vtbx2v8qi" + [(set (match_operand:V8QI 0 "s_register_operand" "=w") + (unspec:V8QI [(match_operand:V8QI 1 "s_register_operand" "0") + (match_operand:TI 2 "s_register_operand" "w") + (match_operand:V8QI 3 "s_register_operand" "w")] + UNSPEC_VTBX))] + "TARGET_NEON" +{ + rtx ops[4]; + int tabbase = REGNO (operands[2]); + + ops[0] = operands[0]; + ops[1] = gen_rtx_REG (V8QImode, tabbase); + ops[2] = gen_rtx_REG (V8QImode, tabbase + 2); + ops[3] = operands[3]; + output_asm_insn ("vtbx.8\t%P0, {%P1, %P2}, %P3", ops); + + return ""; +} + [(set_attr "neon_type" "neon_bp_2cycle")] +) + +(define_insn "neon_vtbx3v8qi" + [(set (match_operand:V8QI 0 "s_register_operand" "=w") + (unspec:V8QI [(match_operand:V8QI 1 "s_register_operand" "0") + (match_operand:EI 2 "s_register_operand" "w") + (match_operand:V8QI 3 "s_register_operand" "w")] + UNSPEC_VTBX))] + "TARGET_NEON" +{ + rtx ops[5]; + int tabbase = REGNO (operands[2]); + + ops[0] = operands[0]; + ops[1] = gen_rtx_REG (V8QImode, tabbase); + ops[2] = gen_rtx_REG (V8QImode, tabbase + 2); + ops[3] = gen_rtx_REG (V8QImode, tabbase + 4); + ops[4] = operands[3]; + output_asm_insn ("vtbx.8\t%P0, {%P1, %P2, %P3}, %P4", ops); + + return ""; +} + [(set_attr "neon_type" "neon_bp_3cycle")] +) + +(define_insn "neon_vtbx4v8qi" + [(set (match_operand:V8QI 0 "s_register_operand" "=w") + (unspec:V8QI [(match_operand:V8QI 1 "s_register_operand" "0") + (match_operand:OI 2 "s_register_operand" "w") + (match_operand:V8QI 3 "s_register_operand" "w")] + UNSPEC_VTBX))] + "TARGET_NEON" +{ + rtx ops[6]; + int tabbase = REGNO (operands[2]); + + ops[0] = operands[0]; + ops[1] = gen_rtx_REG (V8QImode, tabbase); + ops[2] = gen_rtx_REG (V8QImode, tabbase + 2); + ops[3] = gen_rtx_REG (V8QImode, tabbase + 4); + ops[4] = gen_rtx_REG (V8QImode, tabbase + 6); + ops[5] = operands[3]; + output_asm_insn ("vtbx.8\t%P0, {%P1, %P2, %P3, %P4}, %P5", ops); + + return ""; +} + [(set_attr "neon_type" "neon_bp_3cycle")] +) + +(define_insn "neon_vtrn_internal" + [(set (match_operand:VDQW 0 "s_register_operand" "=w") + (unspec:VDQW [(match_operand:VDQW 1 "s_register_operand" "0") + (match_operand:VDQW 2 "s_register_operand" "w")] + UNSPEC_VTRN1)) + (set (match_operand:VDQW 3 "s_register_operand" "=2") + (unspec:VDQW [(match_dup 1) (match_dup 2)] + UNSPEC_VTRN2))] + "TARGET_NEON" + "vtrn.\t%0, %3" + [(set (attr "neon_type") + (if_then_else (ne (symbol_ref "") (const_int 0)) + (const_string "neon_bp_simple") + (const_string "neon_bp_3cycle")))] +) + +(define_expand "neon_vtrn" + [(match_operand:SI 0 "s_register_operand" "r") + (match_operand:VDQW 1 "s_register_operand" "w") + (match_operand:VDQW 2 "s_register_operand" "w")] + "TARGET_NEON" +{ + neon_emit_pair_result_insn (mode, gen_neon_vtrn_internal, + operands[0], operands[1], operands[2]); + DONE; +}) + +(define_insn "neon_vzip_internal" + [(set (match_operand:VDQW 0 "s_register_operand" "=w") + (unspec:VDQW [(match_operand:VDQW 1 "s_register_operand" "0") + (match_operand:VDQW 2 "s_register_operand" "w")] + UNSPEC_VZIP1)) + (set (match_operand:VDQW 3 "s_register_operand" "=2") + (unspec:VDQW [(match_dup 1) (match_dup 2)] + UNSPEC_VZIP2))] + "TARGET_NEON" + "vzip.\t%0, %3" + [(set (attr "neon_type") + (if_then_else (ne (symbol_ref "") (const_int 0)) + (const_string "neon_bp_simple") + (const_string "neon_bp_3cycle")))] +) + +(define_expand "neon_vzip" + [(match_operand:SI 0 "s_register_operand" "r") + (match_operand:VDQW 1 "s_register_operand" "w") + (match_operand:VDQW 2 "s_register_operand" "w")] + "TARGET_NEON" +{ + neon_emit_pair_result_insn (mode, gen_neon_vzip_internal, + operands[0], operands[1], operands[2]); + DONE; +}) + +(define_insn "neon_vuzp_internal" + [(set (match_operand:VDQW 0 "s_register_operand" "=w") + (unspec:VDQW [(match_operand:VDQW 1 "s_register_operand" "0") + (match_operand:VDQW 2 "s_register_operand" "w")] + UNSPEC_VUZP1)) + (set (match_operand:VDQW 3 "s_register_operand" "=2") + (unspec:VDQW [(match_dup 1) (match_dup 2)] + UNSPEC_VUZP2))] + "TARGET_NEON" + "vuzp.\t%0, %3" + [(set (attr "neon_type") + (if_then_else (ne (symbol_ref "") (const_int 0)) + (const_string "neon_bp_simple") + (const_string "neon_bp_3cycle")))] +) + +(define_expand "neon_vuzp" + [(match_operand:SI 0 "s_register_operand" "r") + (match_operand:VDQW 1 "s_register_operand" "w") + (match_operand:VDQW 2 "s_register_operand" "w")] + "TARGET_NEON" +{ + neon_emit_pair_result_insn (mode, gen_neon_vuzp_internal, + operands[0], operands[1], operands[2]); + DONE; +}) + +(define_expand "neon_vreinterpretv8qi" + [(match_operand:V8QI 0 "s_register_operand" "") + (match_operand:VDX 1 "s_register_operand" "")] + "TARGET_NEON" +{ + neon_reinterpret (operands[0], operands[1]); + DONE; +}) + +(define_expand "neon_vreinterpretv4hi" + [(match_operand:V4HI 0 "s_register_operand" "") + (match_operand:VDX 1 "s_register_operand" "")] + "TARGET_NEON" +{ + neon_reinterpret (operands[0], operands[1]); + DONE; +}) + +(define_expand "neon_vreinterpretv2si" + [(match_operand:V2SI 0 "s_register_operand" "") + (match_operand:VDX 1 "s_register_operand" "")] + "TARGET_NEON" +{ + neon_reinterpret (operands[0], operands[1]); + DONE; +}) + +(define_expand "neon_vreinterpretv2sf" + [(match_operand:V2SF 0 "s_register_operand" "") + (match_operand:VDX 1 "s_register_operand" "")] + "TARGET_NEON" +{ + neon_reinterpret (operands[0], operands[1]); + DONE; +}) + +(define_expand "neon_vreinterpretdi" + [(match_operand:DI 0 "s_register_operand" "") + (match_operand:VDX 1 "s_register_operand" "")] + "TARGET_NEON" +{ + neon_reinterpret (operands[0], operands[1]); + DONE; +}) + +(define_expand "neon_vreinterpretv16qi" + [(match_operand:V16QI 0 "s_register_operand" "") + (match_operand:VQX 1 "s_register_operand" "")] + "TARGET_NEON" +{ + neon_reinterpret (operands[0], operands[1]); + DONE; +}) + +(define_expand "neon_vreinterpretv8hi" + [(match_operand:V8HI 0 "s_register_operand" "") + (match_operand:VQX 1 "s_register_operand" "")] + "TARGET_NEON" +{ + neon_reinterpret (operands[0], operands[1]); + DONE; +}) + +(define_expand "neon_vreinterpretv4si" + [(match_operand:V4SI 0 "s_register_operand" "") + (match_operand:VQX 1 "s_register_operand" "")] + "TARGET_NEON" +{ + neon_reinterpret (operands[0], operands[1]); + DONE; +}) + +(define_expand "neon_vreinterpretv4sf" + [(match_operand:V4SF 0 "s_register_operand" "") + (match_operand:VQX 1 "s_register_operand" "")] + "TARGET_NEON" +{ + neon_reinterpret (operands[0], operands[1]); + DONE; +}) + +(define_expand "neon_vreinterpretv2di" + [(match_operand:V2DI 0 "s_register_operand" "") + (match_operand:VQX 1 "s_register_operand" "")] + "TARGET_NEON" +{ + neon_reinterpret (operands[0], operands[1]); + DONE; +}) + +(define_insn "neon_vld1" + [(set (match_operand:VDQX 0 "s_register_operand" "=w") + (unspec:VDQX [(mem:VDQX (match_operand:SI 1 "s_register_operand" "r"))] + UNSPEC_VLD1))] + "TARGET_NEON" + "vld1.\t%h0, [%1]" + [(set_attr "neon_type" "neon_vld1_1_2_regs")] +) + +(define_insn "neon_vld1_lane" + [(set (match_operand:VDX 0 "s_register_operand" "=w") + (unspec:VDX [(mem: (match_operand:SI 1 "s_register_operand" "r")) + (match_operand:VDX 2 "s_register_operand" "0") + (match_operand:SI 3 "immediate_operand" "i")] + UNSPEC_VLD1_LANE))] + "TARGET_NEON" +{ + HOST_WIDE_INT lane = INTVAL (operands[3]); + HOST_WIDE_INT max = GET_MODE_NUNITS (mode); + if (lane < 0 || lane >= max) + error ("lane out of range"); + if (max == 1) + return "vld1.\t%P0, [%1]"; + else + return "vld1.\t{%P0[%c3]}, [%1]"; +} + [(set (attr "neon_type") + (if_then_else (eq (const_string "") (const_int 2)) + (const_string "neon_vld1_1_2_regs") + (const_string "neon_vld1_vld2_lane")))] +) + +(define_insn "neon_vld1_lane" + [(set (match_operand:VQX 0 "s_register_operand" "=w") + (unspec:VQX [(mem: (match_operand:SI 1 "s_register_operand" "r")) + (match_operand:VQX 2 "s_register_operand" "0") + (match_operand:SI 3 "immediate_operand" "i")] + UNSPEC_VLD1_LANE))] + "TARGET_NEON" +{ + HOST_WIDE_INT lane = INTVAL (operands[3]); + HOST_WIDE_INT max = GET_MODE_NUNITS (mode); + int regno = REGNO (operands[0]); + if (lane < 0 || lane >= max) + error ("lane out of range"); + else if (lane >= max / 2) + { + lane -= max / 2; + regno += 2; + operands[3] = GEN_INT (lane); + } + operands[0] = gen_rtx_REG (mode, regno); + if (max == 2) + return "vld1.\t%P0, [%1]"; + else + return "vld1.\t{%P0[%c3]}, [%1]"; +} + [(set (attr "neon_type") + (if_then_else (eq (const_string "") (const_int 2)) + (const_string "neon_vld1_1_2_regs") + (const_string "neon_vld1_vld2_lane")))] +) + +(define_insn "neon_vld1_dup" + [(set (match_operand:VDX 0 "s_register_operand" "=w") + (unspec:VDX [(mem: (match_operand:SI 1 "s_register_operand" "r"))] + UNSPEC_VLD1_DUP))] + "TARGET_NEON" +{ + if (GET_MODE_NUNITS (mode) > 1) + return "vld1.\t{%P0[]}, [%1]"; + else + return "vld1.\t%h0, [%1]"; +} + [(set (attr "neon_type") + (if_then_else (gt (const_string "") (const_string "1")) + (const_string "neon_vld2_2_regs_vld1_vld2_all_lanes") + (const_string "neon_vld1_1_2_regs")))] +) + +(define_insn "neon_vld1_dup" + [(set (match_operand:VQX 0 "s_register_operand" "=w") + (unspec:VQX [(mem: (match_operand:SI 1 "s_register_operand" "r"))] + UNSPEC_VLD1_DUP))] + "TARGET_NEON" +{ + if (GET_MODE_NUNITS (mode) > 2) + return "vld1.\t{%e0[], %f0[]}, [%1]"; + else + return "vld1.\t%h0, [%1]"; +} + [(set (attr "neon_type") + (if_then_else (gt (const_string "") (const_string "1")) + (const_string "neon_vld2_2_regs_vld1_vld2_all_lanes") + (const_string "neon_vld1_1_2_regs")))] +) + +(define_insn "neon_vst1" + [(set (mem:VDQX (match_operand:SI 0 "s_register_operand" "r")) + (unspec:VDQX [(match_operand:VDQX 1 "s_register_operand" "w")] + UNSPEC_VST1))] + "TARGET_NEON" + "vst1.\t%h1, [%0]" + [(set_attr "neon_type" "neon_vst1_1_2_regs_vst2_2_regs")]) + +(define_insn "neon_vst1_lane" + [(set (mem: (match_operand:SI 0 "s_register_operand" "r")) + (vec_select: + (match_operand:VDX 1 "s_register_operand" "w") + (parallel [(match_operand:SI 2 "neon_lane_number" "i")])))] + "TARGET_NEON" +{ + HOST_WIDE_INT lane = INTVAL (operands[2]); + HOST_WIDE_INT max = GET_MODE_NUNITS (mode); + if (lane < 0 || lane >= max) + error ("lane out of range"); + if (max == 1) + return "vst1.\t{%P1}, [%0]"; + else + return "vst1.\t{%P1[%c2]}, [%0]"; +} + [(set (attr "neon_type") + (if_then_else (eq (const_string "") (const_int 1)) + (const_string "neon_vst1_1_2_regs_vst2_2_regs") + (const_string "neon_vst1_vst2_lane")))]) + +(define_insn "neon_vst1_lane" + [(set (mem: (match_operand:SI 0 "s_register_operand" "r")) + (vec_select: + (match_operand:VQX 1 "s_register_operand" "w") + (parallel [(match_operand:SI 2 "neon_lane_number" "i")])))] + "TARGET_NEON" +{ + HOST_WIDE_INT lane = INTVAL (operands[2]); + HOST_WIDE_INT max = GET_MODE_NUNITS (mode); + int regno = REGNO (operands[1]); + if (lane < 0 || lane >= max) + error ("lane out of range"); + else if (lane >= max / 2) + { + lane -= max / 2; + regno += 2; + operands[2] = GEN_INT (lane); + } + operands[1] = gen_rtx_REG (mode, regno); + if (max == 2) + return "vst1.\t{%P1}, [%0]"; + else + return "vst1.\t{%P1[%c2]}, [%0]"; +} + [(set_attr "neon_type" "neon_vst1_vst2_lane")] +) + +(define_insn "neon_vld2" + [(set (match_operand:TI 0 "s_register_operand" "=w") + (unspec:TI [(mem:TI (match_operand:SI 1 "s_register_operand" "r")) + (unspec:VDX [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + UNSPEC_VLD2))] + "TARGET_NEON" +{ + if ( == 64) + return "vld1.64\t%h0, [%1]"; + else + return "vld2.\t%h0, [%1]"; +} + [(set (attr "neon_type") + (if_then_else (eq (const_string "") (const_string "64")) + (const_string "neon_vld1_1_2_regs") + (const_string "neon_vld2_2_regs_vld1_vld2_all_lanes")))] +) + +(define_insn "neon_vld2" + [(set (match_operand:OI 0 "s_register_operand" "=w") + (unspec:OI [(mem:OI (match_operand:SI 1 "s_register_operand" "r")) + (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + UNSPEC_VLD2))] + "TARGET_NEON" + "vld2.\t%h0, [%1]" + [(set_attr "neon_type" "neon_vld2_2_regs_vld1_vld2_all_lanes")]) + +(define_insn "neon_vld2_lane" + [(set (match_operand:TI 0 "s_register_operand" "=w") + (unspec:TI [(mem: (match_operand:SI 1 "s_register_operand" "r")) + (match_operand:TI 2 "s_register_operand" "0") + (match_operand:SI 3 "immediate_operand" "i") + (unspec:VD [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + UNSPEC_VLD2_LANE))] + "TARGET_NEON" +{ + HOST_WIDE_INT lane = INTVAL (operands[3]); + HOST_WIDE_INT max = GET_MODE_NUNITS (mode); + int regno = REGNO (operands[0]); + rtx ops[4]; + if (lane < 0 || lane >= max) + error ("lane out of range"); + ops[0] = gen_rtx_REG (DImode, regno); + ops[1] = gen_rtx_REG (DImode, regno + 2); + ops[2] = operands[1]; + ops[3] = operands[3]; + output_asm_insn ("vld2.\t{%P0[%c3], %P1[%c3]}, [%2]", ops); + return ""; +} + [(set_attr "neon_type" "neon_vld1_vld2_lane")] +) + +(define_insn "neon_vld2_lane" + [(set (match_operand:OI 0 "s_register_operand" "=w") + (unspec:OI [(mem: (match_operand:SI 1 "s_register_operand" "r")) + (match_operand:OI 2 "s_register_operand" "0") + (match_operand:SI 3 "immediate_operand" "i") + (unspec:VMQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + UNSPEC_VLD2_LANE))] + "TARGET_NEON" +{ + HOST_WIDE_INT lane = INTVAL (operands[3]); + HOST_WIDE_INT max = GET_MODE_NUNITS (mode); + int regno = REGNO (operands[0]); + rtx ops[4]; + if (lane < 0 || lane >= max) + error ("lane out of range"); + else if (lane >= max / 2) + { + lane -= max / 2; + regno += 2; + } + ops[0] = gen_rtx_REG (DImode, regno); + ops[1] = gen_rtx_REG (DImode, regno + 4); + ops[2] = operands[1]; + ops[3] = GEN_INT (lane); + output_asm_insn ("vld2.\t{%P0[%c3], %P1[%c3]}, [%2]", ops); + return ""; +} + [(set_attr "neon_type" "neon_vld1_vld2_lane")] +) + +(define_insn "neon_vld2_dup" + [(set (match_operand:TI 0 "s_register_operand" "=w") + (unspec:TI [(mem: (match_operand:SI 1 "s_register_operand" "r")) + (unspec:VDX [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + UNSPEC_VLD2_DUP))] + "TARGET_NEON" +{ + if (GET_MODE_NUNITS (mode) > 1) + return "vld2.\t{%e0[], %f0[]}, [%1]"; + else + return "vld1.\t%h0, [%1]"; +} + [(set (attr "neon_type") + (if_then_else (gt (const_string "") (const_string "1")) + (const_string "neon_vld2_2_regs_vld1_vld2_all_lanes") + (const_string "neon_vld1_1_2_regs")))] +) + +(define_insn "neon_vst2" + [(set (mem:TI (match_operand:SI 0 "s_register_operand" "r")) + (unspec:TI [(match_operand:TI 1 "s_register_operand" "w") + (unspec:VDX [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + UNSPEC_VST2))] + "TARGET_NEON" +{ + if ( == 64) + return "vst1.64\t%h1, [%0]"; + else + return "vst2.\t%h1, [%0]"; +} + [(set (attr "neon_type") + (if_then_else (eq (const_string "") (const_string "64")) + (const_string "neon_vst1_1_2_regs_vst2_2_regs") + (const_string "neon_vst1_1_2_regs_vst2_2_regs")))] +) + +(define_insn "neon_vst2" + [(set (mem:OI (match_operand:SI 0 "s_register_operand" "r")) + (unspec:OI [(match_operand:OI 1 "s_register_operand" "w") + (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + UNSPEC_VST2))] + "TARGET_NEON" + "vst2.\t%h1, [%0]" + [(set_attr "neon_type" "neon_vst1_1_2_regs_vst2_2_regs")] +) + +(define_insn "neon_vst2_lane" + [(set (mem: (match_operand:SI 0 "s_register_operand" "r")) + (unspec: + [(match_operand:TI 1 "s_register_operand" "w") + (match_operand:SI 2 "immediate_operand" "i") + (unspec:VD [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + UNSPEC_VST2_LANE))] + "TARGET_NEON" +{ + HOST_WIDE_INT lane = INTVAL (operands[2]); + HOST_WIDE_INT max = GET_MODE_NUNITS (mode); + int regno = REGNO (operands[1]); + rtx ops[4]; + if (lane < 0 || lane >= max) + error ("lane out of range"); + ops[0] = operands[0]; + ops[1] = gen_rtx_REG (DImode, regno); + ops[2] = gen_rtx_REG (DImode, regno + 2); + ops[3] = operands[2]; + output_asm_insn ("vst2.\t{%P1[%c3], %P2[%c3]}, [%0]", ops); + return ""; +} + [(set_attr "neon_type" "neon_vst1_vst2_lane")] +) + +(define_insn "neon_vst2_lane" + [(set (mem: (match_operand:SI 0 "s_register_operand" "r")) + (unspec: + [(match_operand:OI 1 "s_register_operand" "w") + (match_operand:SI 2 "immediate_operand" "i") + (unspec:VMQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + UNSPEC_VST2_LANE))] + "TARGET_NEON" +{ + HOST_WIDE_INT lane = INTVAL (operands[2]); + HOST_WIDE_INT max = GET_MODE_NUNITS (mode); + int regno = REGNO (operands[1]); + rtx ops[4]; + if (lane < 0 || lane >= max) + error ("lane out of range"); + else if (lane >= max / 2) + { + lane -= max / 2; + regno += 2; + } + ops[0] = operands[0]; + ops[1] = gen_rtx_REG (DImode, regno); + ops[2] = gen_rtx_REG (DImode, regno + 4); + ops[3] = GEN_INT (lane); + output_asm_insn ("vst2.\t{%P1[%c3], %P2[%c3]}, [%0]", ops); + return ""; +} + [(set_attr "neon_type" "neon_vst1_vst2_lane")] +) + +(define_insn "neon_vld3" + [(set (match_operand:EI 0 "s_register_operand" "=w") + (unspec:EI [(mem:EI (match_operand:SI 1 "s_register_operand" "r")) + (unspec:VDX [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + UNSPEC_VLD3))] + "TARGET_NEON" +{ + if ( == 64) + return "vld1.64\t%h0, [%1]"; + else + return "vld3.\t%h0, [%1]"; +} + [(set (attr "neon_type") + (if_then_else (eq (const_string "") (const_string "64")) + (const_string "neon_vld1_1_2_regs") + (const_string "neon_vld3_vld4")))] +) + +(define_expand "neon_vld3" + [(match_operand:CI 0 "s_register_operand" "=w") + (match_operand:SI 1 "s_register_operand" "+r") + (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + "TARGET_NEON" +{ + emit_insn (gen_neon_vld3qa (operands[0], operands[0], + operands[1], operands[1])); + emit_insn (gen_neon_vld3qb (operands[0], operands[0], + operands[1], operands[1])); + DONE; +}) + +(define_insn "neon_vld3qa" + [(set (match_operand:CI 0 "s_register_operand" "=w") + (unspec:CI [(mem:CI (match_operand:SI 3 "s_register_operand" "2")) + (match_operand:CI 1 "s_register_operand" "0") + (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + UNSPEC_VLD3A)) + (set (match_operand:SI 2 "s_register_operand" "=r") + (plus:SI (match_dup 3) + (const_int 24)))] + "TARGET_NEON" +{ + int regno = REGNO (operands[0]); + rtx ops[4]; + ops[0] = gen_rtx_REG (DImode, regno); + ops[1] = gen_rtx_REG (DImode, regno + 4); + ops[2] = gen_rtx_REG (DImode, regno + 8); + ops[3] = operands[2]; + output_asm_insn ("vld3.\t{%P0, %P1, %P2}, [%3]!", ops); + return ""; +} + [(set_attr "neon_type" "neon_vld3_vld4")] +) + +(define_insn "neon_vld3qb" + [(set (match_operand:CI 0 "s_register_operand" "=w") + (unspec:CI [(mem:CI (match_operand:SI 3 "s_register_operand" "2")) + (match_operand:CI 1 "s_register_operand" "0") + (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + UNSPEC_VLD3B)) + (set (match_operand:SI 2 "s_register_operand" "=r") + (plus:SI (match_dup 3) + (const_int 24)))] + "TARGET_NEON" +{ + int regno = REGNO (operands[0]); + rtx ops[4]; + ops[0] = gen_rtx_REG (DImode, regno + 2); + ops[1] = gen_rtx_REG (DImode, regno + 6); + ops[2] = gen_rtx_REG (DImode, regno + 10); + ops[3] = operands[2]; + output_asm_insn ("vld3.\t{%P0, %P1, %P2}, [%3]!", ops); + return ""; +} + [(set_attr "neon_type" "neon_vld3_vld4")] +) + +(define_insn "neon_vld3_lane" + [(set (match_operand:EI 0 "s_register_operand" "=w") + (unspec:EI [(mem: (match_operand:SI 1 "s_register_operand" "r")) + (match_operand:EI 2 "s_register_operand" "0") + (match_operand:SI 3 "immediate_operand" "i") + (unspec:VD [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + UNSPEC_VLD3_LANE))] + "TARGET_NEON" +{ + HOST_WIDE_INT lane = INTVAL (operands[3]); + HOST_WIDE_INT max = GET_MODE_NUNITS (mode); + int regno = REGNO (operands[0]); + rtx ops[5]; + if (lane < 0 || lane >= max) + error ("lane out of range"); + ops[0] = gen_rtx_REG (DImode, regno); + ops[1] = gen_rtx_REG (DImode, regno + 2); + ops[2] = gen_rtx_REG (DImode, regno + 4); + ops[3] = operands[1]; + ops[4] = operands[3]; + output_asm_insn ("vld3.\t{%P0[%c4], %P1[%c4], %P2[%c4]}, [%3]", + ops); + return ""; +} + [(set_attr "neon_type" "neon_vld3_vld4_lane")] +) + +(define_insn "neon_vld3_lane" + [(set (match_operand:CI 0 "s_register_operand" "=w") + (unspec:CI [(mem: (match_operand:SI 1 "s_register_operand" "r")) + (match_operand:CI 2 "s_register_operand" "0") + (match_operand:SI 3 "immediate_operand" "i") + (unspec:VMQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + UNSPEC_VLD3_LANE))] + "TARGET_NEON" +{ + HOST_WIDE_INT lane = INTVAL (operands[3]); + HOST_WIDE_INT max = GET_MODE_NUNITS (mode); + int regno = REGNO (operands[0]); + rtx ops[5]; + if (lane < 0 || lane >= max) + error ("lane out of range"); + else if (lane >= max / 2) + { + lane -= max / 2; + regno += 2; + } + ops[0] = gen_rtx_REG (DImode, regno); + ops[1] = gen_rtx_REG (DImode, regno + 4); + ops[2] = gen_rtx_REG (DImode, regno + 8); + ops[3] = operands[1]; + ops[4] = GEN_INT (lane); + output_asm_insn ("vld3.\t{%P0[%c4], %P1[%c4], %P2[%c4]}, [%3]", + ops); + return ""; +} + [(set_attr "neon_type" "neon_vld3_vld4_lane")] +) + +(define_insn "neon_vld3_dup" + [(set (match_operand:EI 0 "s_register_operand" "=w") + (unspec:EI [(mem: (match_operand:SI 1 "s_register_operand" "r")) + (unspec:VDX [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + UNSPEC_VLD3_DUP))] + "TARGET_NEON" +{ + if (GET_MODE_NUNITS (mode) > 1) + { + int regno = REGNO (operands[0]); + rtx ops[4]; + ops[0] = gen_rtx_REG (DImode, regno); + ops[1] = gen_rtx_REG (DImode, regno + 2); + ops[2] = gen_rtx_REG (DImode, regno + 4); + ops[3] = operands[1]; + output_asm_insn ("vld3.\t{%P0[], %P1[], %P2[]}, [%3]", ops); + return ""; + } + else + return "vld1.\t%h0, [%1]"; +} + [(set (attr "neon_type") + (if_then_else (gt (const_string "") (const_string "1")) + (const_string "neon_vld3_vld4_all_lanes") + (const_string "neon_vld1_1_2_regs")))]) + +(define_insn "neon_vst3" + [(set (mem:EI (match_operand:SI 0 "s_register_operand" "r")) + (unspec:EI [(match_operand:EI 1 "s_register_operand" "w") + (unspec:VDX [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + UNSPEC_VST3))] + "TARGET_NEON" +{ + if ( == 64) + return "vst1.64\t%h1, [%0]"; + else + return "vst3.\t%h1, [%0]"; +} + [(set (attr "neon_type") + (if_then_else (eq (const_string "") (const_string "64")) + (const_string "neon_vst1_1_2_regs_vst2_2_regs") + (const_string "neon_vst2_4_regs_vst3_vst4")))]) + +(define_expand "neon_vst3" + [(match_operand:SI 0 "s_register_operand" "+r") + (match_operand:CI 1 "s_register_operand" "w") + (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + "TARGET_NEON" +{ + emit_insn (gen_neon_vst3qa (operands[0], operands[0], operands[1])); + emit_insn (gen_neon_vst3qb (operands[0], operands[0], operands[1])); + DONE; +}) + +(define_insn "neon_vst3qa" + [(set (mem:EI (match_operand:SI 1 "s_register_operand" "0")) + (unspec:EI [(match_operand:CI 2 "s_register_operand" "w") + (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + UNSPEC_VST3A)) + (set (match_operand:SI 0 "s_register_operand" "=r") + (plus:SI (match_dup 1) + (const_int 24)))] + "TARGET_NEON" +{ + int regno = REGNO (operands[2]); + rtx ops[4]; + ops[0] = operands[0]; + ops[1] = gen_rtx_REG (DImode, regno); + ops[2] = gen_rtx_REG (DImode, regno + 4); + ops[3] = gen_rtx_REG (DImode, regno + 8); + output_asm_insn ("vst3.\t{%P1, %P2, %P3}, [%0]!", ops); + return ""; +} + [(set_attr "neon_type" "neon_vst2_4_regs_vst3_vst4")] +) + +(define_insn "neon_vst3qb" + [(set (mem:EI (match_operand:SI 1 "s_register_operand" "0")) + (unspec:EI [(match_operand:CI 2 "s_register_operand" "w") + (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + UNSPEC_VST3B)) + (set (match_operand:SI 0 "s_register_operand" "=r") + (plus:SI (match_dup 1) + (const_int 24)))] + "TARGET_NEON" +{ + int regno = REGNO (operands[2]); + rtx ops[4]; + ops[0] = operands[0]; + ops[1] = gen_rtx_REG (DImode, regno + 2); + ops[2] = gen_rtx_REG (DImode, regno + 6); + ops[3] = gen_rtx_REG (DImode, regno + 10); + output_asm_insn ("vst3.\t{%P1, %P2, %P3}, [%0]!", ops); + return ""; +} + [(set_attr "neon_type" "neon_vst2_4_regs_vst3_vst4")] +) + +(define_insn "neon_vst3_lane" + [(set (mem: (match_operand:SI 0 "s_register_operand" "r")) + (unspec: + [(match_operand:EI 1 "s_register_operand" "w") + (match_operand:SI 2 "immediate_operand" "i") + (unspec:VD [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + UNSPEC_VST3_LANE))] + "TARGET_NEON" +{ + HOST_WIDE_INT lane = INTVAL (operands[2]); + HOST_WIDE_INT max = GET_MODE_NUNITS (mode); + int regno = REGNO (operands[1]); + rtx ops[5]; + if (lane < 0 || lane >= max) + error ("lane out of range"); + ops[0] = operands[0]; + ops[1] = gen_rtx_REG (DImode, regno); + ops[2] = gen_rtx_REG (DImode, regno + 2); + ops[3] = gen_rtx_REG (DImode, regno + 4); + ops[4] = operands[2]; + output_asm_insn ("vst3.\t{%P1[%c4], %P2[%c4], %P3[%c4]}, [%0]", + ops); + return ""; +} + [(set_attr "neon_type" "neon_vst3_vst4_lane")] +) + +(define_insn "neon_vst3_lane" + [(set (mem: (match_operand:SI 0 "s_register_operand" "r")) + (unspec: + [(match_operand:CI 1 "s_register_operand" "w") + (match_operand:SI 2 "immediate_operand" "i") + (unspec:VMQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + UNSPEC_VST3_LANE))] + "TARGET_NEON" +{ + HOST_WIDE_INT lane = INTVAL (operands[2]); + HOST_WIDE_INT max = GET_MODE_NUNITS (mode); + int regno = REGNO (operands[1]); + rtx ops[5]; + if (lane < 0 || lane >= max) + error ("lane out of range"); + else if (lane >= max / 2) + { + lane -= max / 2; + regno += 2; + } + ops[0] = operands[0]; + ops[1] = gen_rtx_REG (DImode, regno); + ops[2] = gen_rtx_REG (DImode, regno + 4); + ops[3] = gen_rtx_REG (DImode, regno + 8); + ops[4] = GEN_INT (lane); + output_asm_insn ("vst3.\t{%P1[%c4], %P2[%c4], %P3[%c4]}, [%0]", + ops); + return ""; +} +[(set_attr "neon_type" "neon_vst3_vst4_lane")]) + +(define_insn "neon_vld4" + [(set (match_operand:OI 0 "s_register_operand" "=w") + (unspec:OI [(mem:OI (match_operand:SI 1 "s_register_operand" "r")) + (unspec:VDX [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + UNSPEC_VLD4))] + "TARGET_NEON" +{ + if ( == 64) + return "vld1.64\t%h0, [%1]"; + else + return "vld4.\t%h0, [%1]"; +} + [(set (attr "neon_type") + (if_then_else (eq (const_string "") (const_string "64")) + (const_string "neon_vld1_1_2_regs") + (const_string "neon_vld3_vld4")))] +) + +(define_expand "neon_vld4" + [(match_operand:XI 0 "s_register_operand" "=w") + (match_operand:SI 1 "s_register_operand" "+r") + (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + "TARGET_NEON" +{ + emit_insn (gen_neon_vld4qa (operands[0], operands[0], + operands[1], operands[1])); + emit_insn (gen_neon_vld4qb (operands[0], operands[0], + operands[1], operands[1])); + DONE; +}) + +(define_insn "neon_vld4qa" + [(set (match_operand:XI 0 "s_register_operand" "=w") + (unspec:XI [(mem:XI (match_operand:SI 3 "s_register_operand" "2")) + (match_operand:XI 1 "s_register_operand" "0") + (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + UNSPEC_VLD4A)) + (set (match_operand:SI 2 "s_register_operand" "=r") + (plus:SI (match_dup 3) + (const_int 32)))] + "TARGET_NEON" +{ + int regno = REGNO (operands[0]); + rtx ops[5]; + ops[0] = gen_rtx_REG (DImode, regno); + ops[1] = gen_rtx_REG (DImode, regno + 4); + ops[2] = gen_rtx_REG (DImode, regno + 8); + ops[3] = gen_rtx_REG (DImode, regno + 12); + ops[4] = operands[2]; + output_asm_insn ("vld4.\t{%P0, %P1, %P2, %P3}, [%4]!", ops); + return ""; +} + [(set_attr "neon_type" "neon_vld3_vld4")] +) + +(define_insn "neon_vld4qb" + [(set (match_operand:XI 0 "s_register_operand" "=w") + (unspec:XI [(mem:XI (match_operand:SI 3 "s_register_operand" "2")) + (match_operand:XI 1 "s_register_operand" "0") + (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + UNSPEC_VLD4B)) + (set (match_operand:SI 2 "s_register_operand" "=r") + (plus:SI (match_dup 3) + (const_int 32)))] + "TARGET_NEON" +{ + int regno = REGNO (operands[0]); + rtx ops[5]; + ops[0] = gen_rtx_REG (DImode, regno + 2); + ops[1] = gen_rtx_REG (DImode, regno + 6); + ops[2] = gen_rtx_REG (DImode, regno + 10); + ops[3] = gen_rtx_REG (DImode, regno + 14); + ops[4] = operands[2]; + output_asm_insn ("vld4.\t{%P0, %P1, %P2, %P3}, [%4]!", ops); + return ""; +} + [(set_attr "neon_type" "neon_vld3_vld4")] +) + +(define_insn "neon_vld4_lane" + [(set (match_operand:OI 0 "s_register_operand" "=w") + (unspec:OI [(mem: (match_operand:SI 1 "s_register_operand" "r")) + (match_operand:OI 2 "s_register_operand" "0") + (match_operand:SI 3 "immediate_operand" "i") + (unspec:VD [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + UNSPEC_VLD4_LANE))] + "TARGET_NEON" +{ + HOST_WIDE_INT lane = INTVAL (operands[3]); + HOST_WIDE_INT max = GET_MODE_NUNITS (mode); + int regno = REGNO (operands[0]); + rtx ops[6]; + if (lane < 0 || lane >= max) + error ("lane out of range"); + ops[0] = gen_rtx_REG (DImode, regno); + ops[1] = gen_rtx_REG (DImode, regno + 2); + ops[2] = gen_rtx_REG (DImode, regno + 4); + ops[3] = gen_rtx_REG (DImode, regno + 6); + ops[4] = operands[1]; + ops[5] = operands[3]; + output_asm_insn ("vld4.\t{%P0[%c5], %P1[%c5], %P2[%c5], %P3[%c5]}, [%4]", + ops); + return ""; +} + [(set_attr "neon_type" "neon_vld3_vld4_lane")] +) + +(define_insn "neon_vld4_lane" + [(set (match_operand:XI 0 "s_register_operand" "=w") + (unspec:XI [(mem: (match_operand:SI 1 "s_register_operand" "r")) + (match_operand:XI 2 "s_register_operand" "0") + (match_operand:SI 3 "immediate_operand" "i") + (unspec:VMQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + UNSPEC_VLD4_LANE))] + "TARGET_NEON" +{ + HOST_WIDE_INT lane = INTVAL (operands[3]); + HOST_WIDE_INT max = GET_MODE_NUNITS (mode); + int regno = REGNO (operands[0]); + rtx ops[6]; + if (lane < 0 || lane >= max) + error ("lane out of range"); + else if (lane >= max / 2) + { + lane -= max / 2; + regno += 2; + } + ops[0] = gen_rtx_REG (DImode, regno); + ops[1] = gen_rtx_REG (DImode, regno + 4); + ops[2] = gen_rtx_REG (DImode, regno + 8); + ops[3] = gen_rtx_REG (DImode, regno + 12); + ops[4] = operands[1]; + ops[5] = GEN_INT (lane); + output_asm_insn ("vld4.\t{%P0[%c5], %P1[%c5], %P2[%c5], %P3[%c5]}, [%4]", + ops); + return ""; +} + [(set_attr "neon_type" "neon_vld3_vld4_lane")] +) + +(define_insn "neon_vld4_dup" + [(set (match_operand:OI 0 "s_register_operand" "=w") + (unspec:OI [(mem: (match_operand:SI 1 "s_register_operand" "r")) + (unspec:VDX [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + UNSPEC_VLD4_DUP))] + "TARGET_NEON" +{ + if (GET_MODE_NUNITS (mode) > 1) + { + int regno = REGNO (operands[0]); + rtx ops[5]; + ops[0] = gen_rtx_REG (DImode, regno); + ops[1] = gen_rtx_REG (DImode, regno + 2); + ops[2] = gen_rtx_REG (DImode, regno + 4); + ops[3] = gen_rtx_REG (DImode, regno + 6); + ops[4] = operands[1]; + output_asm_insn ("vld4.\t{%P0[], %P1[], %P2[], %P3[]}, [%4]", + ops); + return ""; + } + else + return "vld1.\t%h0, [%1]"; +} + [(set (attr "neon_type") + (if_then_else (gt (const_string "") (const_string "1")) + (const_string "neon_vld3_vld4_all_lanes") + (const_string "neon_vld1_1_2_regs")))] +) + +(define_insn "neon_vst4" + [(set (mem:OI (match_operand:SI 0 "s_register_operand" "r")) + (unspec:OI [(match_operand:OI 1 "s_register_operand" "w") + (unspec:VDX [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + UNSPEC_VST4))] + "TARGET_NEON" +{ + if ( == 64) + return "vst1.64\t%h1, [%0]"; + else + return "vst4.\t%h1, [%0]"; +} + [(set (attr "neon_type") + (if_then_else (eq (const_string "") (const_string "64")) + (const_string "neon_vst1_1_2_regs_vst2_2_regs") + (const_string "neon_vst2_4_regs_vst3_vst4")))] +) + +(define_expand "neon_vst4" + [(match_operand:SI 0 "s_register_operand" "+r") + (match_operand:XI 1 "s_register_operand" "w") + (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + "TARGET_NEON" +{ + emit_insn (gen_neon_vst4qa (operands[0], operands[0], operands[1])); + emit_insn (gen_neon_vst4qb (operands[0], operands[0], operands[1])); + DONE; +}) + +(define_insn "neon_vst4qa" + [(set (mem:OI (match_operand:SI 1 "s_register_operand" "0")) + (unspec:OI [(match_operand:XI 2 "s_register_operand" "w") + (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + UNSPEC_VST4A)) + (set (match_operand:SI 0 "s_register_operand" "=r") + (plus:SI (match_dup 1) + (const_int 32)))] + "TARGET_NEON" +{ + int regno = REGNO (operands[2]); + rtx ops[5]; + ops[0] = operands[0]; + ops[1] = gen_rtx_REG (DImode, regno); + ops[2] = gen_rtx_REG (DImode, regno + 4); + ops[3] = gen_rtx_REG (DImode, regno + 8); + ops[4] = gen_rtx_REG (DImode, regno + 12); + output_asm_insn ("vst4.\t{%P1, %P2, %P3, %P4}, [%0]!", ops); + return ""; +} + [(set_attr "neon_type" "neon_vst2_4_regs_vst3_vst4")] +) + +(define_insn "neon_vst4qb" + [(set (mem:OI (match_operand:SI 1 "s_register_operand" "0")) + (unspec:OI [(match_operand:XI 2 "s_register_operand" "w") + (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + UNSPEC_VST4B)) + (set (match_operand:SI 0 "s_register_operand" "=r") + (plus:SI (match_dup 1) + (const_int 32)))] + "TARGET_NEON" +{ + int regno = REGNO (operands[2]); + rtx ops[5]; + ops[0] = operands[0]; + ops[1] = gen_rtx_REG (DImode, regno + 2); + ops[2] = gen_rtx_REG (DImode, regno + 6); + ops[3] = gen_rtx_REG (DImode, regno + 10); + ops[4] = gen_rtx_REG (DImode, regno + 14); + output_asm_insn ("vst4.\t{%P1, %P2, %P3, %P4}, [%0]!", ops); + return ""; +} + [(set_attr "neon_type" "neon_vst2_4_regs_vst3_vst4")] +) + +(define_insn "neon_vst4_lane" + [(set (mem: (match_operand:SI 0 "s_register_operand" "r")) + (unspec: + [(match_operand:OI 1 "s_register_operand" "w") + (match_operand:SI 2 "immediate_operand" "i") + (unspec:VD [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + UNSPEC_VST4_LANE))] + "TARGET_NEON" +{ + HOST_WIDE_INT lane = INTVAL (operands[2]); + HOST_WIDE_INT max = GET_MODE_NUNITS (mode); + int regno = REGNO (operands[1]); + rtx ops[6]; + if (lane < 0 || lane >= max) + error ("lane out of range"); + ops[0] = operands[0]; + ops[1] = gen_rtx_REG (DImode, regno); + ops[2] = gen_rtx_REG (DImode, regno + 2); + ops[3] = gen_rtx_REG (DImode, regno + 4); + ops[4] = gen_rtx_REG (DImode, regno + 6); + ops[5] = operands[2]; + output_asm_insn ("vst4.\t{%P1[%c5], %P2[%c5], %P3[%c5], %P4[%c5]}, [%0]", + ops); + return ""; +} + [(set_attr "neon_type" "neon_vst3_vst4_lane")] +) + +(define_insn "neon_vst4_lane" + [(set (mem: (match_operand:SI 0 "s_register_operand" "r")) + (unspec: + [(match_operand:XI 1 "s_register_operand" "w") + (match_operand:SI 2 "immediate_operand" "i") + (unspec:VMQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + UNSPEC_VST4_LANE))] + "TARGET_NEON" +{ + HOST_WIDE_INT lane = INTVAL (operands[2]); + HOST_WIDE_INT max = GET_MODE_NUNITS (mode); + int regno = REGNO (operands[1]); + rtx ops[6]; + if (lane < 0 || lane >= max) + error ("lane out of range"); + else if (lane >= max / 2) + { + lane -= max / 2; + regno += 2; + } + ops[0] = operands[0]; + ops[1] = gen_rtx_REG (DImode, regno); + ops[2] = gen_rtx_REG (DImode, regno + 4); + ops[3] = gen_rtx_REG (DImode, regno + 8); + ops[4] = gen_rtx_REG (DImode, regno + 12); + ops[5] = GEN_INT (lane); + output_asm_insn ("vst4.\t{%P1[%c5], %P2[%c5], %P3[%c5], %P4[%c5]}, [%0]", + ops); + return ""; +} + [(set_attr "neon_type" "neon_vst3_vst4_lane")] +) + +(define_expand "neon_vand" + [(match_operand:VDQX 0 "s_register_operand" "") + (match_operand:VDQX 1 "s_register_operand" "") + (match_operand:VDQX 2 "neon_inv_logic_op2" "") + (match_operand:SI 3 "immediate_operand" "")] + "TARGET_NEON" +{ + emit_insn (gen_and3 (operands[0], operands[1], operands[2])); + DONE; +}) + +(define_expand "neon_vorr" + [(match_operand:VDQX 0 "s_register_operand" "") + (match_operand:VDQX 1 "s_register_operand" "") + (match_operand:VDQX 2 "neon_logic_op2" "") + (match_operand:SI 3 "immediate_operand" "")] + "TARGET_NEON" +{ + emit_insn (gen_ior3 (operands[0], operands[1], operands[2])); + DONE; +}) + +(define_expand "neon_veor" + [(match_operand:VDQX 0 "s_register_operand" "") + (match_operand:VDQX 1 "s_register_operand" "") + (match_operand:VDQX 2 "s_register_operand" "") + (match_operand:SI 3 "immediate_operand" "")] + "TARGET_NEON" +{ + emit_insn (gen_xor3 (operands[0], operands[1], operands[2])); + DONE; +}) + +(define_expand "neon_vbic" + [(match_operand:VDQX 0 "s_register_operand" "") + (match_operand:VDQX 1 "s_register_operand" "") + (match_operand:VDQX 2 "neon_logic_op2" "") + (match_operand:SI 3 "immediate_operand" "")] + "TARGET_NEON" +{ + emit_insn (gen_bic3_neon (operands[0], operands[1], operands[2])); + DONE; +}) + +(define_expand "neon_vorn" + [(match_operand:VDQX 0 "s_register_operand" "") + (match_operand:VDQX 1 "s_register_operand" "") + (match_operand:VDQX 2 "neon_inv_logic_op2" "") + (match_operand:SI 3 "immediate_operand" "")] + "TARGET_NEON" +{ + emit_insn (gen_orn3_neon (operands[0], operands[1], operands[2])); + DONE; +}) + +(define_insn "neon_vec_unpack_lo_" + [(set (match_operand: 0 "register_operand" "=w") + (SE: (vec_select: + (match_operand:VU 1 "register_operand" "w") + (match_operand:VU 2 "vect_par_constant_low" ""))))] + "TARGET_NEON" + "vmovl. %q0, %e1" + [(set_attr "neon_type" "neon_shift_1")] +) + +(define_insn "neon_vec_unpack_hi_" + [(set (match_operand: 0 "register_operand" "=w") + (SE: (vec_select: + (match_operand:VU 1 "register_operand" "w") + (match_operand:VU 2 "vect_par_constant_high" ""))))] + "TARGET_NEON" + "vmovl. %q0, %f1" + [(set_attr "neon_type" "neon_shift_1")] +) + +(define_expand "vec_unpack_hi_" + [(match_operand: 0 "register_operand" "") + (SE: (match_operand:VU 1 "register_operand"))] + "TARGET_NEON" + { + rtvec v = rtvec_alloc (/2) ; + rtx t1; + int i; + for (i = 0; i < (/2); i++) + RTVEC_ELT (v, i) = GEN_INT ((/2) + i); + + t1 = gen_rtx_PARALLEL (mode, v); + emit_insn (gen_neon_vec_unpack_hi_ (operands[0], + operands[1], + t1)); + DONE; + } +) + +(define_expand "vec_unpack_lo_" + [(match_operand: 0 "register_operand" "") + (SE: (match_operand:VU 1 "register_operand" ""))] + "TARGET_NEON" + { + rtvec v = rtvec_alloc (/2) ; + rtx t1; + int i; + for (i = 0; i < (/2) ; i++) + RTVEC_ELT (v, i) = GEN_INT (i); + t1 = gen_rtx_PARALLEL (mode, v); + emit_insn (gen_neon_vec_unpack_lo_ (operands[0], + operands[1], + t1)); + DONE; + } +) + +(define_insn "neon_vec_mult_lo_" + [(set (match_operand: 0 "register_operand" "=w") + (mult: (SE: (vec_select: + (match_operand:VU 1 "register_operand" "w") + (match_operand:VU 2 "vect_par_constant_low" ""))) + (SE: (vec_select: + (match_operand:VU 3 "register_operand" "w") + (match_dup 2)))))] + "TARGET_NEON" + "vmull. %q0, %e1, %e3" + [(set_attr "neon_type" "neon_shift_1")] +) + +(define_expand "vec_widen_mult_lo_" + [(match_operand: 0 "register_operand" "") + (SE: (match_operand:VU 1 "register_operand" "")) + (SE: (match_operand:VU 2 "register_operand" ""))] + "TARGET_NEON" + { + rtvec v = rtvec_alloc (/2) ; + rtx t1; + int i; + for (i = 0; i < (/2) ; i++) + RTVEC_ELT (v, i) = GEN_INT (i); + t1 = gen_rtx_PARALLEL (mode, v); + + emit_insn (gen_neon_vec_mult_lo_ (operands[0], + operands[1], + t1, + operands[2])); + DONE; + } +) + +(define_insn "neon_vec_mult_hi_" + [(set (match_operand: 0 "register_operand" "=w") + (mult: (SE: (vec_select: + (match_operand:VU 1 "register_operand" "w") + (match_operand:VU 2 "vect_par_constant_high" ""))) + (SE: (vec_select: + (match_operand:VU 3 "register_operand" "w") + (match_dup 2)))))] + "TARGET_NEON" + "vmull. %q0, %f1, %f3" + [(set_attr "neon_type" "neon_shift_1")] +) + +(define_expand "vec_widen_mult_hi_" + [(match_operand: 0 "register_operand" "") + (SE: (match_operand:VU 1 "register_operand" "")) + (SE: (match_operand:VU 2 "register_operand" ""))] + "TARGET_NEON" + { + rtvec v = rtvec_alloc (/2) ; + rtx t1; + int i; + for (i = 0; i < (/2) ; i++) + RTVEC_ELT (v, i) = GEN_INT (/2 + i); + t1 = gen_rtx_PARALLEL (mode, v); + + emit_insn (gen_neon_vec_mult_hi_ (operands[0], + operands[1], + t1, + operands[2])); + DONE; + + } +) + +;; Vectorize for non-neon-quad case +(define_insn "neon_unpack_" + [(set (match_operand: 0 "register_operand" "=w") + (SE: (match_operand:VDI 1 "register_operand" "w")))] + "TARGET_NEON" + "vmovl. %q0, %P1" + [(set_attr "neon_type" "neon_shift_1")] +) + +(define_expand "vec_unpack_lo_" + [(match_operand: 0 "register_operand" "") + (SE:(match_operand:VDI 1 "register_operand"))] + "TARGET_NEON" +{ + rtx tmpreg = gen_reg_rtx (mode); + emit_insn (gen_neon_unpack_ (tmpreg, operands[1])); + emit_insn (gen_neon_vget_low (operands[0], tmpreg)); + + DONE; +} +) + +(define_expand "vec_unpack_hi_" + [(match_operand: 0 "register_operand" "") + (SE:(match_operand:VDI 1 "register_operand"))] + "TARGET_NEON" +{ + rtx tmpreg = gen_reg_rtx (mode); + emit_insn (gen_neon_unpack_ (tmpreg, operands[1])); + emit_insn (gen_neon_vget_high (operands[0], tmpreg)); + + DONE; +} +) + +(define_insn "neon_vec_mult_" + [(set (match_operand: 0 "register_operand" "=w") + (mult: (SE: + (match_operand:VDI 1 "register_operand" "w")) + (SE: + (match_operand:VDI 2 "register_operand" "w"))))] + "TARGET_NEON" + "vmull. %q0, %P1, %P2" + [(set_attr "neon_type" "neon_shift_1")] +) + +(define_expand "vec_widen_mult_hi_" + [(match_operand: 0 "register_operand" "") + (SE: (match_operand:VDI 1 "register_operand" "")) + (SE: (match_operand:VDI 2 "register_operand" ""))] + "TARGET_NEON" + { + rtx tmpreg = gen_reg_rtx (mode); + emit_insn (gen_neon_vec_mult_ (tmpreg, operands[1], operands[2])); + emit_insn (gen_neon_vget_high (operands[0], tmpreg)); + + DONE; + + } +) + +(define_expand "vec_widen_mult_lo_" + [(match_operand: 0 "register_operand" "") + (SE: (match_operand:VDI 1 "register_operand" "")) + (SE: (match_operand:VDI 2 "register_operand" ""))] + "TARGET_NEON" + { + rtx tmpreg = gen_reg_rtx (mode); + emit_insn (gen_neon_vec_mult_ (tmpreg, operands[1], operands[2])); + emit_insn (gen_neon_vget_low (operands[0], tmpreg)); + + DONE; + + } +) + +;; The case when using all quad registers. +(define_insn "vec_pack_trunc_" + [(set (match_operand: 0 "register_operand" "=&w") + (vec_concat: + (truncate: + (match_operand:VN 1 "register_operand" "w")) + (truncate: + (match_operand:VN 2 "register_operand" "w"))))] + "TARGET_NEON" + "vmovn.i\t%e0, %q1\;vmovn.i\t%f0, %q2" + [(set_attr "neon_type" "neon_shift_1") + (set_attr "length" "8")] +) + +;; For the non-quad case. +(define_insn "neon_vec_pack_trunc_" + [(set (match_operand: 0 "register_operand" "=w") + (truncate: (match_operand:VN 1 "register_operand" "w")))] + "TARGET_NEON" + "vmovn.i\t%P0, %q1" + [(set_attr "neon_type" "neon_shift_1")] +) + +(define_expand "vec_pack_trunc_" + [(match_operand: 0 "register_operand" "") + (match_operand:VSHFT 1 "register_operand" "") + (match_operand:VSHFT 2 "register_operand")] + "TARGET_NEON" +{ + rtx tempreg = gen_reg_rtx (mode); + + emit_insn (gen_move_lo_quad_ (tempreg, operands[1])); + emit_insn (gen_move_hi_quad_ (tempreg, operands[2])); + emit_insn (gen_neon_vec_pack_trunc_ (operands[0], tempreg)); + DONE; +}) diff --git a/gcc/config/arm/neon.ml b/gcc/config/arm/neon.ml new file mode 100644 index 000000000..b5b9cab73 --- /dev/null +++ b/gcc/config/arm/neon.ml @@ -0,0 +1,1857 @@ +(* Common code for ARM NEON header file, documentation and test case + generators. + + Copyright (C) 2006, 2007, 2008, 2009, 2010 Free Software Foundation, Inc. + Contributed by CodeSourcery. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it under + the terms of the GNU General Public License as published by the Free + Software Foundation; either version 3, or (at your option) any later + version. + + GCC is distributed in the hope that it will be useful, but WITHOUT ANY + WARRANTY; without even the implied warranty of MERCHANTABILITY or + FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + for more details. + + You should have received a copy of the GNU General Public License + along with GCC; see the file COPYING3. If not see + . *) + +(* Shorthand types for vector elements. *) +type elts = S8 | S16 | S32 | S64 | F32 | U8 | U16 | U32 | U64 | P8 | P16 + | I8 | I16 | I32 | I64 | B8 | B16 | B32 | B64 | Conv of elts * elts + | Cast of elts * elts | NoElts + +type eltclass = Signed | Unsigned | Float | Poly | Int | Bits + | ConvClass of eltclass * eltclass | NoType + +(* These vector types correspond directly to C types. *) +type vectype = T_int8x8 | T_int8x16 + | T_int16x4 | T_int16x8 + | T_int32x2 | T_int32x4 + | T_int64x1 | T_int64x2 + | T_uint8x8 | T_uint8x16 + | T_uint16x4 | T_uint16x8 + | T_uint32x2 | T_uint32x4 + | T_uint64x1 | T_uint64x2 + | T_float32x2 | T_float32x4 + | T_poly8x8 | T_poly8x16 + | T_poly16x4 | T_poly16x8 + | T_immediate of int * int + | T_int8 | T_int16 + | T_int32 | T_int64 + | T_uint8 | T_uint16 + | T_uint32 | T_uint64 + | T_poly8 | T_poly16 + | T_float32 | T_arrayof of int * vectype + | T_ptrto of vectype | T_const of vectype + | T_void | T_intQI + | T_intHI | T_intSI + | T_intDI | T_floatSF + +(* The meanings of the following are: + TImode : "Tetra", two registers (four words). + EImode : "hExa", three registers (six words). + OImode : "Octa", four registers (eight words). + CImode : "dodeCa", six registers (twelve words). + XImode : "heXadeca", eight registers (sixteen words). +*) + +type inttype = B_TImode | B_EImode | B_OImode | B_CImode | B_XImode + +type shape_elt = Dreg | Qreg | Corereg | Immed | VecArray of int * shape_elt + | PtrTo of shape_elt | CstPtrTo of shape_elt + (* These next ones are used only in the test generator. *) + | Element_of_dreg (* Used for "lane" variants. *) + | Element_of_qreg (* Likewise. *) + | All_elements_of_dreg (* Used for "dup" variants. *) + | Alternatives of shape_elt list (* Used for multiple valid operands *) + +type shape_form = All of int * shape_elt + | Long + | Long_noreg of shape_elt + | Wide + | Wide_noreg of shape_elt + | Narrow + | Long_imm + | Narrow_imm + | Binary_imm of shape_elt + | Use_operands of shape_elt array + | By_scalar of shape_elt + | Unary_scalar of shape_elt + | Wide_lane + | Wide_scalar + | Pair_result of shape_elt + +type arity = Arity0 of vectype + | Arity1 of vectype * vectype + | Arity2 of vectype * vectype * vectype + | Arity3 of vectype * vectype * vectype * vectype + | Arity4 of vectype * vectype * vectype * vectype * vectype + +type vecmode = V8QI | V4HI | V2SI | V2SF | DI + | V16QI | V8HI | V4SI | V4SF | V2DI + | QI | HI | SI | SF + +type opcode = + (* Binary ops. *) + Vadd + | Vmul + | Vmla + | Vmls + | Vsub + | Vceq + | Vcge + | Vcgt + | Vcle + | Vclt + | Vcage + | Vcagt + | Vcale + | Vcalt + | Vtst + | Vabd + | Vaba + | Vmax + | Vmin + | Vpadd + | Vpada + | Vpmax + | Vpmin + | Vrecps + | Vrsqrts + | Vshl + | Vshr_n + | Vshl_n + | Vsra_n + | Vsri + | Vsli + (* Logic binops. *) + | Vand + | Vorr + | Veor + | Vbic + | Vorn + | Vbsl + (* Ops with scalar. *) + | Vmul_lane + | Vmla_lane + | Vmls_lane + | Vmul_n + | Vmla_n + | Vmls_n + | Vmull_n + | Vmull_lane + | Vqdmull_n + | Vqdmull_lane + | Vqdmulh_n + | Vqdmulh_lane + (* Unary ops. *) + | Vabs + | Vneg + | Vcls + | Vclz + | Vcnt + | Vrecpe + | Vrsqrte + | Vmvn + (* Vector extract. *) + | Vext + (* Reverse elements. *) + | Vrev64 + | Vrev32 + | Vrev16 + (* Transposition ops. *) + | Vtrn + | Vzip + | Vuzp + (* Loads and stores (VLD1/VST1/VLD2...), elements and structures. *) + | Vldx of int + | Vstx of int + | Vldx_lane of int + | Vldx_dup of int + | Vstx_lane of int + (* Set/extract lanes from a vector. *) + | Vget_lane + | Vset_lane + (* Initialize vector from bit pattern. *) + | Vcreate + (* Set all lanes to same value. *) + | Vdup_n + | Vmov_n (* Is this the same? *) + (* Duplicate scalar to all lanes of vector. *) + | Vdup_lane + (* Combine vectors. *) + | Vcombine + (* Get quadword high/low parts. *) + | Vget_high + | Vget_low + (* Convert vectors. *) + | Vcvt + | Vcvt_n + (* Narrow/lengthen vectors. *) + | Vmovn + | Vmovl + (* Table lookup. *) + | Vtbl of int + | Vtbx of int + (* Reinterpret casts. *) + | Vreinterp + +(* Features used for documentation, to distinguish between some instruction + variants, and to signal special requirements (e.g. swapping arguments). *) + +type features = + Halving + | Rounding + | Saturating + | Dst_unsign + | High_half + | Doubling + | Flipped of string (* Builtin name to use with flipped arguments. *) + | InfoWord (* Pass an extra word for signage/rounding etc. (always passed + for All _, Long, Wide, Narrow shape_forms. *) + | ReturnPtr (* Pass explicit pointer to return value as first argument. *) + (* A specification as to the shape of instruction expected upon + disassembly, used if it differs from the shape used to build the + intrinsic prototype. Multiple entries in the constructor's argument + indicate that the intrinsic expands to more than one assembly + instruction, each with a corresponding shape specified here. *) + | Disassembles_as of shape_form list + | Builtin_name of string (* Override the name of the builtin. *) + (* Override the name of the instruction. If more than one name + is specified, it means that the instruction can have any of those + names. *) + | Instruction_name of string list + (* Mark that the intrinsic yields no instructions, or expands to yield + behavior that the test generator cannot test. *) + | No_op + (* Mark that the intrinsic has constant arguments that cannot be set + to the defaults (zero for pointers and one otherwise) in the test + cases. The function supplied must return the integer to be written + into the testcase for the argument number (0-based) supplied to it. *) + | Const_valuator of (int -> int) + | Fixed_return_reg + +exception MixedMode of elts * elts + +let rec elt_width = function + S8 | U8 | P8 | I8 | B8 -> 8 + | S16 | U16 | P16 | I16 | B16 -> 16 + | S32 | F32 | U32 | I32 | B32 -> 32 + | S64 | U64 | I64 | B64 -> 64 + | Conv (a, b) -> + let wa = elt_width a and wb = elt_width b in + if wa = wb then wa else failwith "element width?" + | Cast (a, b) -> raise (MixedMode (a, b)) + | NoElts -> failwith "No elts" + +let rec elt_class = function + S8 | S16 | S32 | S64 -> Signed + | U8 | U16 | U32 | U64 -> Unsigned + | P8 | P16 -> Poly + | F32 -> Float + | I8 | I16 | I32 | I64 -> Int + | B8 | B16 | B32 | B64 -> Bits + | Conv (a, b) | Cast (a, b) -> ConvClass (elt_class a, elt_class b) + | NoElts -> NoType + +let elt_of_class_width c w = + match c, w with + Signed, 8 -> S8 + | Signed, 16 -> S16 + | Signed, 32 -> S32 + | Signed, 64 -> S64 + | Float, 32 -> F32 + | Unsigned, 8 -> U8 + | Unsigned, 16 -> U16 + | Unsigned, 32 -> U32 + | Unsigned, 64 -> U64 + | Poly, 8 -> P8 + | Poly, 16 -> P16 + | Int, 8 -> I8 + | Int, 16 -> I16 + | Int, 32 -> I32 + | Int, 64 -> I64 + | Bits, 8 -> B8 + | Bits, 16 -> B16 + | Bits, 32 -> B32 + | Bits, 64 -> B64 + | _ -> failwith "Bad element type" + +(* Return unsigned integer element the same width as argument. *) +let unsigned_of_elt elt = + elt_of_class_width Unsigned (elt_width elt) + +let signed_of_elt elt = + elt_of_class_width Signed (elt_width elt) + +(* Return untyped bits element the same width as argument. *) +let bits_of_elt elt = + elt_of_class_width Bits (elt_width elt) + +let non_signed_variant = function + S8 -> I8 + | S16 -> I16 + | S32 -> I32 + | S64 -> I64 + | U8 -> I8 + | U16 -> I16 + | U32 -> I32 + | U64 -> I64 + | x -> x + +let poly_unsigned_variant v = + let elclass = match elt_class v with + Poly -> Unsigned + | x -> x in + elt_of_class_width elclass (elt_width v) + +let widen_elt elt = + let w = elt_width elt + and c = elt_class elt in + elt_of_class_width c (w * 2) + +let narrow_elt elt = + let w = elt_width elt + and c = elt_class elt in + elt_of_class_width c (w / 2) + +(* If we're trying to find a mode from a "Use_operands" instruction, use the + last vector operand as the dominant mode used to invoke the correct builtin. + We must stick to this rule in neon.md. *) +let find_key_operand operands = + let rec scan opno = + match operands.(opno) with + Qreg -> Qreg + | Dreg -> Dreg + | VecArray (_, Qreg) -> Qreg + | VecArray (_, Dreg) -> Dreg + | _ -> scan (opno-1) + in + scan ((Array.length operands) - 1) + +let rec mode_of_elt elt shape = + let flt = match elt_class elt with + Float | ConvClass(_, Float) -> true | _ -> false in + let idx = + match elt_width elt with + 8 -> 0 | 16 -> 1 | 32 -> 2 | 64 -> 3 + | _ -> failwith "Bad element width" + in match shape with + All (_, Dreg) | By_scalar Dreg | Pair_result Dreg | Unary_scalar Dreg + | Binary_imm Dreg | Long_noreg Dreg | Wide_noreg Dreg -> + [| V8QI; V4HI; if flt then V2SF else V2SI; DI |].(idx) + | All (_, Qreg) | By_scalar Qreg | Pair_result Qreg | Unary_scalar Qreg + | Binary_imm Qreg | Long_noreg Qreg | Wide_noreg Qreg -> + [| V16QI; V8HI; if flt then V4SF else V4SI; V2DI |].(idx) + | All (_, (Corereg | PtrTo _ | CstPtrTo _)) -> + [| QI; HI; if flt then SF else SI; DI |].(idx) + | Long | Wide | Wide_lane | Wide_scalar + | Long_imm -> + [| V8QI; V4HI; V2SI; DI |].(idx) + | Narrow | Narrow_imm -> [| V16QI; V8HI; V4SI; V2DI |].(idx) + | Use_operands ops -> mode_of_elt elt (All (0, (find_key_operand ops))) + | _ -> failwith "invalid shape" + +(* Modify an element type dependent on the shape of the instruction and the + operand number. *) + +let shapemap shape no = + let ident = fun x -> x in + match shape with + All _ | Use_operands _ | By_scalar _ | Pair_result _ | Unary_scalar _ + | Binary_imm _ -> ident + | Long | Long_noreg _ | Wide_scalar | Long_imm -> + [| widen_elt; ident; ident |].(no) + | Wide | Wide_noreg _ -> [| widen_elt; widen_elt; ident |].(no) + | Wide_lane -> [| widen_elt; ident; ident; ident |].(no) + | Narrow | Narrow_imm -> [| narrow_elt; ident; ident |].(no) + +(* Register type (D/Q) of an operand, based on shape and operand number. *) + +let regmap shape no = + match shape with + All (_, reg) | Long_noreg reg | Wide_noreg reg -> reg + | Long -> [| Qreg; Dreg; Dreg |].(no) + | Wide -> [| Qreg; Qreg; Dreg |].(no) + | Narrow -> [| Dreg; Qreg; Qreg |].(no) + | Wide_lane -> [| Qreg; Dreg; Dreg; Immed |].(no) + | Wide_scalar -> [| Qreg; Dreg; Corereg |].(no) + | By_scalar reg -> [| reg; reg; Dreg; Immed |].(no) + | Unary_scalar reg -> [| reg; Dreg; Immed |].(no) + | Pair_result reg -> [| VecArray (2, reg); reg; reg |].(no) + | Binary_imm reg -> [| reg; reg; Immed |].(no) + | Long_imm -> [| Qreg; Dreg; Immed |].(no) + | Narrow_imm -> [| Dreg; Qreg; Immed |].(no) + | Use_operands these -> these.(no) + +let type_for_elt shape elt no = + let elt = (shapemap shape no) elt in + let reg = regmap shape no in + let rec type_for_reg_elt reg elt = + match reg with + Dreg -> + begin match elt with + S8 -> T_int8x8 + | S16 -> T_int16x4 + | S32 -> T_int32x2 + | S64 -> T_int64x1 + | U8 -> T_uint8x8 + | U16 -> T_uint16x4 + | U32 -> T_uint32x2 + | U64 -> T_uint64x1 + | F32 -> T_float32x2 + | P8 -> T_poly8x8 + | P16 -> T_poly16x4 + | _ -> failwith "Bad elt type" + end + | Qreg -> + begin match elt with + S8 -> T_int8x16 + | S16 -> T_int16x8 + | S32 -> T_int32x4 + | S64 -> T_int64x2 + | U8 -> T_uint8x16 + | U16 -> T_uint16x8 + | U32 -> T_uint32x4 + | U64 -> T_uint64x2 + | F32 -> T_float32x4 + | P8 -> T_poly8x16 + | P16 -> T_poly16x8 + | _ -> failwith "Bad elt type" + end + | Corereg -> + begin match elt with + S8 -> T_int8 + | S16 -> T_int16 + | S32 -> T_int32 + | S64 -> T_int64 + | U8 -> T_uint8 + | U16 -> T_uint16 + | U32 -> T_uint32 + | U64 -> T_uint64 + | P8 -> T_poly8 + | P16 -> T_poly16 + | F32 -> T_float32 + | _ -> failwith "Bad elt type" + end + | Immed -> + T_immediate (0, 0) + | VecArray (num, sub) -> + T_arrayof (num, type_for_reg_elt sub elt) + | PtrTo x -> + T_ptrto (type_for_reg_elt x elt) + | CstPtrTo x -> + T_ptrto (T_const (type_for_reg_elt x elt)) + (* Anything else is solely for the use of the test generator. *) + | _ -> assert false + in + type_for_reg_elt reg elt + +(* Return size of a vector type, in bits. *) +let vectype_size = function + T_int8x8 | T_int16x4 | T_int32x2 | T_int64x1 + | T_uint8x8 | T_uint16x4 | T_uint32x2 | T_uint64x1 + | T_float32x2 | T_poly8x8 | T_poly16x4 -> 64 + | T_int8x16 | T_int16x8 | T_int32x4 | T_int64x2 + | T_uint8x16 | T_uint16x8 | T_uint32x4 | T_uint64x2 + | T_float32x4 | T_poly8x16 | T_poly16x8 -> 128 + | _ -> raise Not_found + +let inttype_for_array num elttype = + let eltsize = vectype_size elttype in + let numwords = (num * eltsize) / 32 in + match numwords with + 4 -> B_TImode + | 6 -> B_EImode + | 8 -> B_OImode + | 12 -> B_CImode + | 16 -> B_XImode + | _ -> failwith ("no int type for size " ^ string_of_int numwords) + +(* These functions return pairs of (internal, external) types, where "internal" + types are those seen by GCC, and "external" are those seen by the assembler. + These types aren't necessarily the same, since the intrinsics can munge more + than one C type into each assembler opcode. *) + +let make_sign_invariant func shape elt = + let arity, elt' = func shape elt in + arity, non_signed_variant elt' + +(* Don't restrict any types. *) + +let elts_same make_arity shape elt = + let vtype = type_for_elt shape elt in + make_arity vtype, elt + +(* As sign_invar_*, but when sign matters. *) +let elts_same_io_lane = + elts_same (fun vtype -> Arity4 (vtype 0, vtype 0, vtype 1, vtype 2, vtype 3)) + +let elts_same_io = + elts_same (fun vtype -> Arity3 (vtype 0, vtype 0, vtype 1, vtype 2)) + +let elts_same_2_lane = + elts_same (fun vtype -> Arity3 (vtype 0, vtype 1, vtype 2, vtype 3)) + +let elts_same_3 = elts_same_2_lane + +let elts_same_2 = + elts_same (fun vtype -> Arity2 (vtype 0, vtype 1, vtype 2)) + +let elts_same_1 = + elts_same (fun vtype -> Arity1 (vtype 0, vtype 1)) + +(* Use for signed/unsigned invariant operations (i.e. where the operation + doesn't depend on the sign of the data. *) + +let sign_invar_io_lane = make_sign_invariant elts_same_io_lane +let sign_invar_io = make_sign_invariant elts_same_io +let sign_invar_2_lane = make_sign_invariant elts_same_2_lane +let sign_invar_2 = make_sign_invariant elts_same_2 +let sign_invar_1 = make_sign_invariant elts_same_1 + +(* Sign-sensitive comparison. *) + +let cmp_sign_matters shape elt = + let vtype = type_for_elt shape elt + and rtype = type_for_elt shape (unsigned_of_elt elt) 0 in + Arity2 (rtype, vtype 1, vtype 2), elt + +(* Signed/unsigned invariant comparison. *) + +let cmp_sign_invar shape elt = + let shape', elt' = cmp_sign_matters shape elt in + let elt'' = + match non_signed_variant elt' with + P8 -> I8 + | x -> x + in + shape', elt'' + +(* Comparison (VTST) where only the element width matters. *) + +let cmp_bits shape elt = + let vtype = type_for_elt shape elt + and rtype = type_for_elt shape (unsigned_of_elt elt) 0 + and bits_only = bits_of_elt elt in + Arity2 (rtype, vtype 1, vtype 2), bits_only + +let reg_shift shape elt = + let vtype = type_for_elt shape elt + and op2type = type_for_elt shape (signed_of_elt elt) 2 in + Arity2 (vtype 0, vtype 1, op2type), elt + +(* Genericised constant-shift type-generating function. *) + +let const_shift mkimm ?arity ?result shape elt = + let op2type = (shapemap shape 2) elt in + let op2width = elt_width op2type in + let op2 = mkimm op2width + and op1 = type_for_elt shape elt 1 + and r_elt = + match result with + None -> elt + | Some restriction -> restriction elt in + let rtype = type_for_elt shape r_elt 0 in + match arity with + None -> Arity2 (rtype, op1, op2), elt + | Some mkarity -> mkarity rtype op1 op2, elt + +(* Use for immediate right-shifts. *) + +let shift_right shape elt = + const_shift (fun imm -> T_immediate (1, imm)) shape elt + +let shift_right_acc shape elt = + const_shift (fun imm -> T_immediate (1, imm)) + ~arity:(fun dst op1 op2 -> Arity3 (dst, dst, op1, op2)) shape elt + +(* Use for immediate right-shifts when the operation doesn't care about + signedness. *) + +let shift_right_sign_invar = + make_sign_invariant shift_right + +(* Immediate right-shift; result is unsigned even when operand is signed. *) + +let shift_right_to_uns shape elt = + const_shift (fun imm -> T_immediate (1, imm)) ~result:unsigned_of_elt + shape elt + +(* Immediate left-shift. *) + +let shift_left shape elt = + const_shift (fun imm -> T_immediate (0, imm - 1)) shape elt + +(* Immediate left-shift, unsigned result. *) + +let shift_left_to_uns shape elt = + const_shift (fun imm -> T_immediate (0, imm - 1)) ~result:unsigned_of_elt + shape elt + +(* Immediate left-shift, don't care about signs. *) + +let shift_left_sign_invar = + make_sign_invariant shift_left + +(* Shift left/right and insert: only element size matters. *) + +let shift_insert shape elt = + let arity, elt = + const_shift (fun imm -> T_immediate (1, imm)) + ~arity:(fun dst op1 op2 -> Arity3 (dst, dst, op1, op2)) shape elt in + arity, bits_of_elt elt + +(* Get/set lane. *) + +let get_lane shape elt = + let vtype = type_for_elt shape elt in + Arity2 (vtype 0, vtype 1, vtype 2), + (match elt with P8 -> U8 | P16 -> U16 | S32 | U32 | F32 -> B32 | x -> x) + +let set_lane shape elt = + let vtype = type_for_elt shape elt in + Arity3 (vtype 0, vtype 1, vtype 2, vtype 3), bits_of_elt elt + +let set_lane_notype shape elt = + let vtype = type_for_elt shape elt in + Arity3 (vtype 0, vtype 1, vtype 2, vtype 3), NoElts + +let create_vector shape elt = + let vtype = type_for_elt shape U64 1 + and rtype = type_for_elt shape elt 0 in + Arity1 (rtype, vtype), elt + +let conv make_arity shape elt = + let edest, esrc = match elt with + Conv (edest, esrc) | Cast (edest, esrc) -> edest, esrc + | _ -> failwith "Non-conversion element in conversion" in + let vtype = type_for_elt shape esrc + and rtype = type_for_elt shape edest 0 in + make_arity rtype vtype, elt + +let conv_1 = conv (fun rtype vtype -> Arity1 (rtype, vtype 1)) +let conv_2 = conv (fun rtype vtype -> Arity2 (rtype, vtype 1, vtype 2)) + +(* Operation has an unsigned result even if operands are signed. *) + +let dst_unsign make_arity shape elt = + let vtype = type_for_elt shape elt + and rtype = type_for_elt shape (unsigned_of_elt elt) 0 in + make_arity rtype vtype, elt + +let dst_unsign_1 = dst_unsign (fun rtype vtype -> Arity1 (rtype, vtype 1)) + +let make_bits_only func shape elt = + let arity, elt' = func shape elt in + arity, bits_of_elt elt' + +(* Extend operation. *) + +let extend shape elt = + let vtype = type_for_elt shape elt in + Arity3 (vtype 0, vtype 1, vtype 2, vtype 3), bits_of_elt elt + +(* Table look-up operations. Operand 2 is signed/unsigned for signed/unsigned + integer ops respectively, or unsigned for polynomial ops. *) + +let table mkarity shape elt = + let vtype = type_for_elt shape elt in + let op2 = type_for_elt shape (poly_unsigned_variant elt) 2 in + mkarity vtype op2, bits_of_elt elt + +let table_2 = table (fun vtype op2 -> Arity2 (vtype 0, vtype 1, op2)) +let table_io = table (fun vtype op2 -> Arity3 (vtype 0, vtype 0, vtype 1, op2)) + +(* Operations where only bits matter. *) + +let bits_1 = make_bits_only elts_same_1 +let bits_2 = make_bits_only elts_same_2 +let bits_3 = make_bits_only elts_same_3 + +(* Store insns. *) +let store_1 shape elt = + let vtype = type_for_elt shape elt in + Arity2 (T_void, vtype 0, vtype 1), bits_of_elt elt + +let store_3 shape elt = + let vtype = type_for_elt shape elt in + Arity3 (T_void, vtype 0, vtype 1, vtype 2), bits_of_elt elt + +let make_notype func shape elt = + let arity, _ = func shape elt in + arity, NoElts + +let notype_1 = make_notype elts_same_1 +let notype_2 = make_notype elts_same_2 +let notype_3 = make_notype elts_same_3 + +(* Bit-select operations (first operand is unsigned int). *) + +let bit_select shape elt = + let vtype = type_for_elt shape elt + and itype = type_for_elt shape (unsigned_of_elt elt) in + Arity3 (vtype 0, itype 1, vtype 2, vtype 3), NoElts + +(* Common lists of supported element types. *) + +let su_8_32 = [S8; S16; S32; U8; U16; U32] +let su_8_64 = S64 :: U64 :: su_8_32 +let su_16_64 = [S16; S32; S64; U16; U32; U64] +let pf_su_8_32 = P8 :: P16 :: F32 :: su_8_32 +let pf_su_8_64 = P8 :: P16 :: F32 :: su_8_64 + +let ops = + [ + (* Addition. *) + Vadd, [], All (3, Dreg), "vadd", sign_invar_2, F32 :: su_8_32; + Vadd, [No_op], All (3, Dreg), "vadd", sign_invar_2, [S64; U64]; + Vadd, [], All (3, Qreg), "vaddQ", sign_invar_2, F32 :: su_8_64; + Vadd, [], Long, "vaddl", elts_same_2, su_8_32; + Vadd, [], Wide, "vaddw", elts_same_2, su_8_32; + Vadd, [Halving], All (3, Dreg), "vhadd", elts_same_2, su_8_32; + Vadd, [Halving], All (3, Qreg), "vhaddQ", elts_same_2, su_8_32; + Vadd, [Instruction_name ["vrhadd"]; Rounding; Halving], + All (3, Dreg), "vRhadd", elts_same_2, su_8_32; + Vadd, [Instruction_name ["vrhadd"]; Rounding; Halving], + All (3, Qreg), "vRhaddQ", elts_same_2, su_8_32; + Vadd, [Saturating], All (3, Dreg), "vqadd", elts_same_2, su_8_64; + Vadd, [Saturating], All (3, Qreg), "vqaddQ", elts_same_2, su_8_64; + Vadd, [High_half], Narrow, "vaddhn", sign_invar_2, su_16_64; + Vadd, [Instruction_name ["vraddhn"]; Rounding; High_half], + Narrow, "vRaddhn", sign_invar_2, su_16_64; + + (* Multiplication. *) + Vmul, [], All (3, Dreg), "vmul", sign_invar_2, P8 :: F32 :: su_8_32; + Vmul, [], All (3, Qreg), "vmulQ", sign_invar_2, P8 :: F32 :: su_8_32; + Vmul, [Saturating; Doubling; High_half], All (3, Dreg), "vqdmulh", + elts_same_2, [S16; S32]; + Vmul, [Saturating; Doubling; High_half], All (3, Qreg), "vqdmulhQ", + elts_same_2, [S16; S32]; + Vmul, + [Saturating; Rounding; Doubling; High_half; + Instruction_name ["vqrdmulh"]], + All (3, Dreg), "vqRdmulh", + elts_same_2, [S16; S32]; + Vmul, + [Saturating; Rounding; Doubling; High_half; + Instruction_name ["vqrdmulh"]], + All (3, Qreg), "vqRdmulhQ", + elts_same_2, [S16; S32]; + Vmul, [], Long, "vmull", elts_same_2, P8 :: su_8_32; + Vmul, [Saturating; Doubling], Long, "vqdmull", elts_same_2, [S16; S32]; + + (* Multiply-accumulate. *) + Vmla, [], All (3, Dreg), "vmla", sign_invar_io, F32 :: su_8_32; + Vmla, [], All (3, Qreg), "vmlaQ", sign_invar_io, F32 :: su_8_32; + Vmla, [], Long, "vmlal", elts_same_io, su_8_32; + Vmla, [Saturating; Doubling], Long, "vqdmlal", elts_same_io, [S16; S32]; + + (* Multiply-subtract. *) + Vmls, [], All (3, Dreg), "vmls", sign_invar_io, F32 :: su_8_32; + Vmls, [], All (3, Qreg), "vmlsQ", sign_invar_io, F32 :: su_8_32; + Vmls, [], Long, "vmlsl", elts_same_io, su_8_32; + Vmls, [Saturating; Doubling], Long, "vqdmlsl", elts_same_io, [S16; S32]; + + (* Subtraction. *) + Vsub, [], All (3, Dreg), "vsub", sign_invar_2, F32 :: su_8_32; + Vsub, [No_op], All (3, Dreg), "vsub", sign_invar_2, [S64; U64]; + Vsub, [], All (3, Qreg), "vsubQ", sign_invar_2, F32 :: su_8_64; + Vsub, [], Long, "vsubl", elts_same_2, su_8_32; + Vsub, [], Wide, "vsubw", elts_same_2, su_8_32; + Vsub, [Halving], All (3, Dreg), "vhsub", elts_same_2, su_8_32; + Vsub, [Halving], All (3, Qreg), "vhsubQ", elts_same_2, su_8_32; + Vsub, [Saturating], All (3, Dreg), "vqsub", elts_same_2, su_8_64; + Vsub, [Saturating], All (3, Qreg), "vqsubQ", elts_same_2, su_8_64; + Vsub, [High_half], Narrow, "vsubhn", sign_invar_2, su_16_64; + Vsub, [Instruction_name ["vrsubhn"]; Rounding; High_half], + Narrow, "vRsubhn", sign_invar_2, su_16_64; + + (* Comparison, equal. *) + Vceq, [], All (3, Dreg), "vceq", cmp_sign_invar, P8 :: F32 :: su_8_32; + Vceq, [], All (3, Qreg), "vceqQ", cmp_sign_invar, P8 :: F32 :: su_8_32; + + (* Comparison, greater-than or equal. *) + Vcge, [], All (3, Dreg), "vcge", cmp_sign_matters, F32 :: su_8_32; + Vcge, [], All (3, Qreg), "vcgeQ", cmp_sign_matters, F32 :: su_8_32; + + (* Comparison, less-than or equal. *) + Vcle, [Flipped "vcge"], All (3, Dreg), "vcle", cmp_sign_matters, + F32 :: su_8_32; + Vcle, [Instruction_name ["vcge"]; Flipped "vcgeQ"], + All (3, Qreg), "vcleQ", cmp_sign_matters, + F32 :: su_8_32; + + (* Comparison, greater-than. *) + Vcgt, [], All (3, Dreg), "vcgt", cmp_sign_matters, F32 :: su_8_32; + Vcgt, [], All (3, Qreg), "vcgtQ", cmp_sign_matters, F32 :: su_8_32; + + (* Comparison, less-than. *) + Vclt, [Flipped "vcgt"], All (3, Dreg), "vclt", cmp_sign_matters, + F32 :: su_8_32; + Vclt, [Instruction_name ["vcgt"]; Flipped "vcgtQ"], + All (3, Qreg), "vcltQ", cmp_sign_matters, + F32 :: su_8_32; + + (* Compare absolute greater-than or equal. *) + Vcage, [Instruction_name ["vacge"]], + All (3, Dreg), "vcage", cmp_sign_matters, [F32]; + Vcage, [Instruction_name ["vacge"]], + All (3, Qreg), "vcageQ", cmp_sign_matters, [F32]; + + (* Compare absolute less-than or equal. *) + Vcale, [Instruction_name ["vacge"]; Flipped "vcage"], + All (3, Dreg), "vcale", cmp_sign_matters, [F32]; + Vcale, [Instruction_name ["vacge"]; Flipped "vcageQ"], + All (3, Qreg), "vcaleQ", cmp_sign_matters, [F32]; + + (* Compare absolute greater-than or equal. *) + Vcagt, [Instruction_name ["vacgt"]], + All (3, Dreg), "vcagt", cmp_sign_matters, [F32]; + Vcagt, [Instruction_name ["vacgt"]], + All (3, Qreg), "vcagtQ", cmp_sign_matters, [F32]; + + (* Compare absolute less-than or equal. *) + Vcalt, [Instruction_name ["vacgt"]; Flipped "vcagt"], + All (3, Dreg), "vcalt", cmp_sign_matters, [F32]; + Vcalt, [Instruction_name ["vacgt"]; Flipped "vcagtQ"], + All (3, Qreg), "vcaltQ", cmp_sign_matters, [F32]; + + (* Test bits. *) + Vtst, [], All (3, Dreg), "vtst", cmp_bits, P8 :: su_8_32; + Vtst, [], All (3, Qreg), "vtstQ", cmp_bits, P8 :: su_8_32; + + (* Absolute difference. *) + Vabd, [], All (3, Dreg), "vabd", elts_same_2, F32 :: su_8_32; + Vabd, [], All (3, Qreg), "vabdQ", elts_same_2, F32 :: su_8_32; + Vabd, [], Long, "vabdl", elts_same_2, su_8_32; + + (* Absolute difference and accumulate. *) + Vaba, [], All (3, Dreg), "vaba", elts_same_io, su_8_32; + Vaba, [], All (3, Qreg), "vabaQ", elts_same_io, su_8_32; + Vaba, [], Long, "vabal", elts_same_io, su_8_32; + + (* Max. *) + Vmax, [], All (3, Dreg), "vmax", elts_same_2, F32 :: su_8_32; + Vmax, [], All (3, Qreg), "vmaxQ", elts_same_2, F32 :: su_8_32; + + (* Min. *) + Vmin, [], All (3, Dreg), "vmin", elts_same_2, F32 :: su_8_32; + Vmin, [], All (3, Qreg), "vminQ", elts_same_2, F32 :: su_8_32; + + (* Pairwise add. *) + Vpadd, [], All (3, Dreg), "vpadd", sign_invar_2, F32 :: su_8_32; + Vpadd, [], Long_noreg Dreg, "vpaddl", elts_same_1, su_8_32; + Vpadd, [], Long_noreg Qreg, "vpaddlQ", elts_same_1, su_8_32; + + (* Pairwise add, widen and accumulate. *) + Vpada, [], Wide_noreg Dreg, "vpadal", elts_same_2, su_8_32; + Vpada, [], Wide_noreg Qreg, "vpadalQ", elts_same_2, su_8_32; + + (* Folding maximum, minimum. *) + Vpmax, [], All (3, Dreg), "vpmax", elts_same_2, F32 :: su_8_32; + Vpmin, [], All (3, Dreg), "vpmin", elts_same_2, F32 :: su_8_32; + + (* Reciprocal step. *) + Vrecps, [], All (3, Dreg), "vrecps", elts_same_2, [F32]; + Vrecps, [], All (3, Qreg), "vrecpsQ", elts_same_2, [F32]; + Vrsqrts, [], All (3, Dreg), "vrsqrts", elts_same_2, [F32]; + Vrsqrts, [], All (3, Qreg), "vrsqrtsQ", elts_same_2, [F32]; + + (* Vector shift left. *) + Vshl, [], All (3, Dreg), "vshl", reg_shift, su_8_64; + Vshl, [], All (3, Qreg), "vshlQ", reg_shift, su_8_64; + Vshl, [Instruction_name ["vrshl"]; Rounding], + All (3, Dreg), "vRshl", reg_shift, su_8_64; + Vshl, [Instruction_name ["vrshl"]; Rounding], + All (3, Qreg), "vRshlQ", reg_shift, su_8_64; + Vshl, [Saturating], All (3, Dreg), "vqshl", reg_shift, su_8_64; + Vshl, [Saturating], All (3, Qreg), "vqshlQ", reg_shift, su_8_64; + Vshl, [Instruction_name ["vqrshl"]; Saturating; Rounding], + All (3, Dreg), "vqRshl", reg_shift, su_8_64; + Vshl, [Instruction_name ["vqrshl"]; Saturating; Rounding], + All (3, Qreg), "vqRshlQ", reg_shift, su_8_64; + + (* Vector shift right by constant. *) + Vshr_n, [], Binary_imm Dreg, "vshr_n", shift_right, su_8_64; + Vshr_n, [], Binary_imm Qreg, "vshrQ_n", shift_right, su_8_64; + Vshr_n, [Instruction_name ["vrshr"]; Rounding], Binary_imm Dreg, + "vRshr_n", shift_right, su_8_64; + Vshr_n, [Instruction_name ["vrshr"]; Rounding], Binary_imm Qreg, + "vRshrQ_n", shift_right, su_8_64; + Vshr_n, [], Narrow_imm, "vshrn_n", shift_right_sign_invar, su_16_64; + Vshr_n, [Instruction_name ["vrshrn"]; Rounding], Narrow_imm, "vRshrn_n", + shift_right_sign_invar, su_16_64; + Vshr_n, [Saturating], Narrow_imm, "vqshrn_n", shift_right, su_16_64; + Vshr_n, [Instruction_name ["vqrshrn"]; Saturating; Rounding], Narrow_imm, + "vqRshrn_n", shift_right, su_16_64; + Vshr_n, [Saturating; Dst_unsign], Narrow_imm, "vqshrun_n", + shift_right_to_uns, [S16; S32; S64]; + Vshr_n, [Instruction_name ["vqrshrun"]; Saturating; Dst_unsign; Rounding], + Narrow_imm, "vqRshrun_n", shift_right_to_uns, [S16; S32; S64]; + + (* Vector shift left by constant. *) + Vshl_n, [], Binary_imm Dreg, "vshl_n", shift_left_sign_invar, su_8_64; + Vshl_n, [], Binary_imm Qreg, "vshlQ_n", shift_left_sign_invar, su_8_64; + Vshl_n, [Saturating], Binary_imm Dreg, "vqshl_n", shift_left, su_8_64; + Vshl_n, [Saturating], Binary_imm Qreg, "vqshlQ_n", shift_left, su_8_64; + Vshl_n, [Saturating; Dst_unsign], Binary_imm Dreg, "vqshlu_n", + shift_left_to_uns, [S8; S16; S32; S64]; + Vshl_n, [Saturating; Dst_unsign], Binary_imm Qreg, "vqshluQ_n", + shift_left_to_uns, [S8; S16; S32; S64]; + Vshl_n, [], Long_imm, "vshll_n", shift_left, su_8_32; + + (* Vector shift right by constant and accumulate. *) + Vsra_n, [], Binary_imm Dreg, "vsra_n", shift_right_acc, su_8_64; + Vsra_n, [], Binary_imm Qreg, "vsraQ_n", shift_right_acc, su_8_64; + Vsra_n, [Instruction_name ["vrsra"]; Rounding], Binary_imm Dreg, + "vRsra_n", shift_right_acc, su_8_64; + Vsra_n, [Instruction_name ["vrsra"]; Rounding], Binary_imm Qreg, + "vRsraQ_n", shift_right_acc, su_8_64; + + (* Vector shift right and insert. *) + Vsri, [], Use_operands [| Dreg; Dreg; Immed |], "vsri_n", shift_insert, + P8 :: P16 :: su_8_64; + Vsri, [], Use_operands [| Qreg; Qreg; Immed |], "vsriQ_n", shift_insert, + P8 :: P16 :: su_8_64; + + (* Vector shift left and insert. *) + Vsli, [], Use_operands [| Dreg; Dreg; Immed |], "vsli_n", shift_insert, + P8 :: P16 :: su_8_64; + Vsli, [], Use_operands [| Qreg; Qreg; Immed |], "vsliQ_n", shift_insert, + P8 :: P16 :: su_8_64; + + (* Absolute value. *) + Vabs, [], All (2, Dreg), "vabs", elts_same_1, [S8; S16; S32; F32]; + Vabs, [], All (2, Qreg), "vabsQ", elts_same_1, [S8; S16; S32; F32]; + Vabs, [Saturating], All (2, Dreg), "vqabs", elts_same_1, [S8; S16; S32]; + Vabs, [Saturating], All (2, Qreg), "vqabsQ", elts_same_1, [S8; S16; S32]; + + (* Negate. *) + Vneg, [], All (2, Dreg), "vneg", elts_same_1, [S8; S16; S32; F32]; + Vneg, [], All (2, Qreg), "vnegQ", elts_same_1, [S8; S16; S32; F32]; + Vneg, [Saturating], All (2, Dreg), "vqneg", elts_same_1, [S8; S16; S32]; + Vneg, [Saturating], All (2, Qreg), "vqnegQ", elts_same_1, [S8; S16; S32]; + + (* Bitwise not. *) + Vmvn, [], All (2, Dreg), "vmvn", notype_1, P8 :: su_8_32; + Vmvn, [], All (2, Qreg), "vmvnQ", notype_1, P8 :: su_8_32; + + (* Count leading sign bits. *) + Vcls, [], All (2, Dreg), "vcls", elts_same_1, [S8; S16; S32]; + Vcls, [], All (2, Qreg), "vclsQ", elts_same_1, [S8; S16; S32]; + + (* Count leading zeros. *) + Vclz, [], All (2, Dreg), "vclz", sign_invar_1, su_8_32; + Vclz, [], All (2, Qreg), "vclzQ", sign_invar_1, su_8_32; + + (* Count number of set bits. *) + Vcnt, [], All (2, Dreg), "vcnt", bits_1, [P8; S8; U8]; + Vcnt, [], All (2, Qreg), "vcntQ", bits_1, [P8; S8; U8]; + + (* Reciprocal estimate. *) + Vrecpe, [], All (2, Dreg), "vrecpe", elts_same_1, [U32; F32]; + Vrecpe, [], All (2, Qreg), "vrecpeQ", elts_same_1, [U32; F32]; + + (* Reciprocal square-root estimate. *) + Vrsqrte, [], All (2, Dreg), "vrsqrte", elts_same_1, [U32; F32]; + Vrsqrte, [], All (2, Qreg), "vrsqrteQ", elts_same_1, [U32; F32]; + + (* Get lanes from a vector. *) + Vget_lane, + [InfoWord; Disassembles_as [Use_operands [| Corereg; Element_of_dreg |]]; + Instruction_name ["vmov"]], + Use_operands [| Corereg; Dreg; Immed |], + "vget_lane", get_lane, pf_su_8_32; + Vget_lane, + [No_op; + InfoWord; + Disassembles_as [Use_operands [| Corereg; Corereg; Dreg |]]; + Instruction_name ["vmov"]; Const_valuator (fun _ -> 0)], + Use_operands [| Corereg; Dreg; Immed |], + "vget_lane", notype_2, [S64; U64]; + Vget_lane, + [InfoWord; Disassembles_as [Use_operands [| Corereg; Element_of_dreg |]]; + Instruction_name ["vmov"]], + Use_operands [| Corereg; Qreg; Immed |], + "vgetQ_lane", get_lane, pf_su_8_32; + Vget_lane, + [InfoWord; + Disassembles_as [Use_operands [| Corereg; Corereg; Dreg |]]; + Instruction_name ["vmov"]; Const_valuator (fun _ -> 0)], + Use_operands [| Corereg; Qreg; Immed |], + "vgetQ_lane", notype_2, [S64; U64]; + + (* Set lanes in a vector. *) + Vset_lane, [Disassembles_as [Use_operands [| Element_of_dreg; Corereg |]]; + Instruction_name ["vmov"]], + Use_operands [| Dreg; Corereg; Dreg; Immed |], "vset_lane", + set_lane, pf_su_8_32; + Vset_lane, [No_op; + Disassembles_as [Use_operands [| Dreg; Corereg; Corereg |]]; + Instruction_name ["vmov"]; Const_valuator (fun _ -> 0)], + Use_operands [| Dreg; Corereg; Dreg; Immed |], "vset_lane", + set_lane_notype, [S64; U64]; + Vset_lane, [Disassembles_as [Use_operands [| Element_of_dreg; Corereg |]]; + Instruction_name ["vmov"]], + Use_operands [| Qreg; Corereg; Qreg; Immed |], "vsetQ_lane", + set_lane, pf_su_8_32; + Vset_lane, [Disassembles_as [Use_operands [| Dreg; Corereg; Corereg |]]; + Instruction_name ["vmov"]; Const_valuator (fun _ -> 0)], + Use_operands [| Qreg; Corereg; Qreg; Immed |], "vsetQ_lane", + set_lane_notype, [S64; U64]; + + (* Create vector from literal bit pattern. *) + Vcreate, + [No_op], (* Not really, but it can yield various things that are too + hard for the test generator at this time. *) + Use_operands [| Dreg; Corereg |], "vcreate", create_vector, + pf_su_8_64; + + (* Set all lanes to the same value. *) + Vdup_n, + [Disassembles_as [Use_operands [| Dreg; + Alternatives [ Corereg; + Element_of_dreg ] |]]], + Use_operands [| Dreg; Corereg |], "vdup_n", bits_1, + pf_su_8_32; + Vdup_n, + [No_op; + Instruction_name ["vmov"]; + Disassembles_as [Use_operands [| Dreg; Corereg; Corereg |]]], + Use_operands [| Dreg; Corereg |], "vdup_n", notype_1, + [S64; U64]; + Vdup_n, + [Disassembles_as [Use_operands [| Qreg; + Alternatives [ Corereg; + Element_of_dreg ] |]]], + Use_operands [| Qreg; Corereg |], "vdupQ_n", bits_1, + pf_su_8_32; + Vdup_n, + [No_op; + Instruction_name ["vmov"]; + Disassembles_as [Use_operands [| Dreg; Corereg; Corereg |]; + Use_operands [| Dreg; Corereg; Corereg |]]], + Use_operands [| Qreg; Corereg |], "vdupQ_n", notype_1, + [S64; U64]; + + (* These are just aliases for the above. *) + Vmov_n, + [Builtin_name "vdup_n"; + Disassembles_as [Use_operands [| Dreg; + Alternatives [ Corereg; + Element_of_dreg ] |]]], + Use_operands [| Dreg; Corereg |], + "vmov_n", bits_1, pf_su_8_32; + Vmov_n, + [No_op; + Builtin_name "vdup_n"; + Instruction_name ["vmov"]; + Disassembles_as [Use_operands [| Dreg; Corereg; Corereg |]]], + Use_operands [| Dreg; Corereg |], + "vmov_n", notype_1, [S64; U64]; + Vmov_n, + [Builtin_name "vdupQ_n"; + Disassembles_as [Use_operands [| Qreg; + Alternatives [ Corereg; + Element_of_dreg ] |]]], + Use_operands [| Qreg; Corereg |], + "vmovQ_n", bits_1, pf_su_8_32; + Vmov_n, + [No_op; + Builtin_name "vdupQ_n"; + Instruction_name ["vmov"]; + Disassembles_as [Use_operands [| Dreg; Corereg; Corereg |]; + Use_operands [| Dreg; Corereg; Corereg |]]], + Use_operands [| Qreg; Corereg |], + "vmovQ_n", notype_1, [S64; U64]; + + (* Duplicate, lane version. We can't use Use_operands here because the + rightmost register (always Dreg) would be picked up by find_key_operand, + when we want the leftmost register to be used in this case (otherwise + the modes are indistinguishable in neon.md, etc. *) + Vdup_lane, + [Disassembles_as [Use_operands [| Dreg; Element_of_dreg |]]], + Unary_scalar Dreg, "vdup_lane", bits_2, pf_su_8_32; + Vdup_lane, + [No_op; Const_valuator (fun _ -> 0)], + Unary_scalar Dreg, "vdup_lane", bits_2, [S64; U64]; + Vdup_lane, + [Disassembles_as [Use_operands [| Qreg; Element_of_dreg |]]], + Unary_scalar Qreg, "vdupQ_lane", bits_2, pf_su_8_32; + Vdup_lane, + [No_op; Const_valuator (fun _ -> 0)], + Unary_scalar Qreg, "vdupQ_lane", bits_2, [S64; U64]; + + (* Combining vectors. *) + Vcombine, [No_op], + Use_operands [| Qreg; Dreg; Dreg |], "vcombine", notype_2, + pf_su_8_64; + + (* Splitting vectors. *) + Vget_high, [No_op], + Use_operands [| Dreg; Qreg |], "vget_high", + notype_1, pf_su_8_64; + Vget_low, [Instruction_name ["vmov"]; + Disassembles_as [Use_operands [| Dreg; Dreg |]]; + Fixed_return_reg], + Use_operands [| Dreg; Qreg |], "vget_low", + notype_1, pf_su_8_32; + Vget_low, [No_op], + Use_operands [| Dreg; Qreg |], "vget_low", + notype_1, [S64; U64]; + + (* Conversions. *) + Vcvt, [InfoWord], All (2, Dreg), "vcvt", conv_1, + [Conv (S32, F32); Conv (U32, F32); Conv (F32, S32); Conv (F32, U32)]; + Vcvt, [InfoWord], All (2, Qreg), "vcvtQ", conv_1, + [Conv (S32, F32); Conv (U32, F32); Conv (F32, S32); Conv (F32, U32)]; + Vcvt_n, [InfoWord], Use_operands [| Dreg; Dreg; Immed |], "vcvt_n", conv_2, + [Conv (S32, F32); Conv (U32, F32); Conv (F32, S32); Conv (F32, U32)]; + Vcvt_n, [InfoWord], Use_operands [| Qreg; Qreg; Immed |], "vcvtQ_n", conv_2, + [Conv (S32, F32); Conv (U32, F32); Conv (F32, S32); Conv (F32, U32)]; + + (* Move, narrowing. *) + Vmovn, [Disassembles_as [Use_operands [| Dreg; Qreg |]]], + Narrow, "vmovn", sign_invar_1, su_16_64; + Vmovn, [Disassembles_as [Use_operands [| Dreg; Qreg |]]; Saturating], + Narrow, "vqmovn", elts_same_1, su_16_64; + Vmovn, + [Disassembles_as [Use_operands [| Dreg; Qreg |]]; Saturating; Dst_unsign], + Narrow, "vqmovun", dst_unsign_1, + [S16; S32; S64]; + + (* Move, long. *) + Vmovl, [Disassembles_as [Use_operands [| Qreg; Dreg |]]], + Long, "vmovl", elts_same_1, su_8_32; + + (* Table lookup. *) + Vtbl 1, + [Instruction_name ["vtbl"]; + Disassembles_as [Use_operands [| Dreg; VecArray (1, Dreg); Dreg |]]], + Use_operands [| Dreg; Dreg; Dreg |], "vtbl1", table_2, [U8; S8; P8]; + Vtbl 2, [Instruction_name ["vtbl"]], + Use_operands [| Dreg; VecArray (2, Dreg); Dreg |], "vtbl2", table_2, + [U8; S8; P8]; + Vtbl 3, [Instruction_name ["vtbl"]], + Use_operands [| Dreg; VecArray (3, Dreg); Dreg |], "vtbl3", table_2, + [U8; S8; P8]; + Vtbl 4, [Instruction_name ["vtbl"]], + Use_operands [| Dreg; VecArray (4, Dreg); Dreg |], "vtbl4", table_2, + [U8; S8; P8]; + + (* Extended table lookup. *) + Vtbx 1, + [Instruction_name ["vtbx"]; + Disassembles_as [Use_operands [| Dreg; VecArray (1, Dreg); Dreg |]]], + Use_operands [| Dreg; Dreg; Dreg |], "vtbx1", table_io, [U8; S8; P8]; + Vtbx 2, [Instruction_name ["vtbx"]], + Use_operands [| Dreg; VecArray (2, Dreg); Dreg |], "vtbx2", table_io, + [U8; S8; P8]; + Vtbx 3, [Instruction_name ["vtbx"]], + Use_operands [| Dreg; VecArray (3, Dreg); Dreg |], "vtbx3", table_io, + [U8; S8; P8]; + Vtbx 4, [Instruction_name ["vtbx"]], + Use_operands [| Dreg; VecArray (4, Dreg); Dreg |], "vtbx4", table_io, + [U8; S8; P8]; + + (* Multiply, lane. (note: these were undocumented at the time of + writing). *) + Vmul_lane, [], By_scalar Dreg, "vmul_lane", sign_invar_2_lane, + [S16; S32; U16; U32; F32]; + Vmul_lane, [], By_scalar Qreg, "vmulQ_lane", sign_invar_2_lane, + [S16; S32; U16; U32; F32]; + + (* Multiply-accumulate, lane. *) + Vmla_lane, [], By_scalar Dreg, "vmla_lane", sign_invar_io_lane, + [S16; S32; U16; U32; F32]; + Vmla_lane, [], By_scalar Qreg, "vmlaQ_lane", sign_invar_io_lane, + [S16; S32; U16; U32; F32]; + Vmla_lane, [], Wide_lane, "vmlal_lane", elts_same_io_lane, + [S16; S32; U16; U32]; + Vmla_lane, [Saturating; Doubling], Wide_lane, "vqdmlal_lane", + elts_same_io_lane, [S16; S32]; + + (* Multiply-subtract, lane. *) + Vmls_lane, [], By_scalar Dreg, "vmls_lane", sign_invar_io_lane, + [S16; S32; U16; U32; F32]; + Vmls_lane, [], By_scalar Qreg, "vmlsQ_lane", sign_invar_io_lane, + [S16; S32; U16; U32; F32]; + Vmls_lane, [], Wide_lane, "vmlsl_lane", elts_same_io_lane, + [S16; S32; U16; U32]; + Vmls_lane, [Saturating; Doubling], Wide_lane, "vqdmlsl_lane", + elts_same_io_lane, [S16; S32]; + + (* Long multiply, lane. *) + Vmull_lane, [], + Wide_lane, "vmull_lane", elts_same_2_lane, [S16; S32; U16; U32]; + + (* Saturating doubling long multiply, lane. *) + Vqdmull_lane, [Saturating; Doubling], + Wide_lane, "vqdmull_lane", elts_same_2_lane, [S16; S32]; + + (* Saturating doubling long multiply high, lane. *) + Vqdmulh_lane, [Saturating; Halving], + By_scalar Qreg, "vqdmulhQ_lane", elts_same_2_lane, [S16; S32]; + Vqdmulh_lane, [Saturating; Halving], + By_scalar Dreg, "vqdmulh_lane", elts_same_2_lane, [S16; S32]; + Vqdmulh_lane, [Saturating; Halving; Rounding; + Instruction_name ["vqrdmulh"]], + By_scalar Qreg, "vqRdmulhQ_lane", elts_same_2_lane, [S16; S32]; + Vqdmulh_lane, [Saturating; Halving; Rounding; + Instruction_name ["vqrdmulh"]], + By_scalar Dreg, "vqRdmulh_lane", elts_same_2_lane, [S16; S32]; + + (* Vector multiply by scalar. *) + Vmul_n, [InfoWord; + Disassembles_as [Use_operands [| Dreg; Dreg; Element_of_dreg |]]], + Use_operands [| Dreg; Dreg; Corereg |], "vmul_n", + sign_invar_2, [S16; S32; U16; U32; F32]; + Vmul_n, [InfoWord; + Disassembles_as [Use_operands [| Qreg; Qreg; Element_of_dreg |]]], + Use_operands [| Qreg; Qreg; Corereg |], "vmulQ_n", + sign_invar_2, [S16; S32; U16; U32; F32]; + + (* Vector long multiply by scalar. *) + Vmull_n, [Instruction_name ["vmull"]; + Disassembles_as [Use_operands [| Qreg; Dreg; Element_of_dreg |]]], + Wide_scalar, "vmull_n", + elts_same_2, [S16; S32; U16; U32]; + + (* Vector saturating doubling long multiply by scalar. *) + Vqdmull_n, [Saturating; Doubling; + Disassembles_as [Use_operands [| Qreg; Dreg; + Element_of_dreg |]]], + Wide_scalar, "vqdmull_n", + elts_same_2, [S16; S32]; + + (* Vector saturating doubling long multiply high by scalar. *) + Vqdmulh_n, + [Saturating; Halving; InfoWord; + Disassembles_as [Use_operands [| Qreg; Qreg; Element_of_dreg |]]], + Use_operands [| Qreg; Qreg; Corereg |], + "vqdmulhQ_n", elts_same_2, [S16; S32]; + Vqdmulh_n, + [Saturating; Halving; InfoWord; + Disassembles_as [Use_operands [| Dreg; Dreg; Element_of_dreg |]]], + Use_operands [| Dreg; Dreg; Corereg |], + "vqdmulh_n", elts_same_2, [S16; S32]; + Vqdmulh_n, + [Saturating; Halving; Rounding; InfoWord; + Instruction_name ["vqrdmulh"]; + Disassembles_as [Use_operands [| Qreg; Qreg; Element_of_dreg |]]], + Use_operands [| Qreg; Qreg; Corereg |], + "vqRdmulhQ_n", elts_same_2, [S16; S32]; + Vqdmulh_n, + [Saturating; Halving; Rounding; InfoWord; + Instruction_name ["vqrdmulh"]; + Disassembles_as [Use_operands [| Dreg; Dreg; Element_of_dreg |]]], + Use_operands [| Dreg; Dreg; Corereg |], + "vqRdmulh_n", elts_same_2, [S16; S32]; + + (* Vector multiply-accumulate by scalar. *) + Vmla_n, [InfoWord; + Disassembles_as [Use_operands [| Dreg; Dreg; Element_of_dreg |]]], + Use_operands [| Dreg; Dreg; Corereg |], "vmla_n", + sign_invar_io, [S16; S32; U16; U32; F32]; + Vmla_n, [InfoWord; + Disassembles_as [Use_operands [| Qreg; Qreg; Element_of_dreg |]]], + Use_operands [| Qreg; Qreg; Corereg |], "vmlaQ_n", + sign_invar_io, [S16; S32; U16; U32; F32]; + Vmla_n, [], Wide_scalar, "vmlal_n", elts_same_io, [S16; S32; U16; U32]; + Vmla_n, [Saturating; Doubling], Wide_scalar, "vqdmlal_n", elts_same_io, + [S16; S32]; + + (* Vector multiply subtract by scalar. *) + Vmls_n, [InfoWord; + Disassembles_as [Use_operands [| Dreg; Dreg; Element_of_dreg |]]], + Use_operands [| Dreg; Dreg; Corereg |], "vmls_n", + sign_invar_io, [S16; S32; U16; U32; F32]; + Vmls_n, [InfoWord; + Disassembles_as [Use_operands [| Qreg; Qreg; Element_of_dreg |]]], + Use_operands [| Qreg; Qreg; Corereg |], "vmlsQ_n", + sign_invar_io, [S16; S32; U16; U32; F32]; + Vmls_n, [], Wide_scalar, "vmlsl_n", elts_same_io, [S16; S32; U16; U32]; + Vmls_n, [Saturating; Doubling], Wide_scalar, "vqdmlsl_n", elts_same_io, + [S16; S32]; + + (* Vector extract. *) + Vext, [Const_valuator (fun _ -> 0)], + Use_operands [| Dreg; Dreg; Dreg; Immed |], "vext", extend, + pf_su_8_64; + Vext, [Const_valuator (fun _ -> 0)], + Use_operands [| Qreg; Qreg; Qreg; Immed |], "vextQ", extend, + pf_su_8_64; + + (* Reverse elements. *) + Vrev64, [], All (2, Dreg), "vrev64", bits_1, P8 :: P16 :: F32 :: su_8_32; + Vrev64, [], All (2, Qreg), "vrev64Q", bits_1, P8 :: P16 :: F32 :: su_8_32; + Vrev32, [], All (2, Dreg), "vrev32", bits_1, [P8; P16; S8; U8; S16; U16]; + Vrev32, [], All (2, Qreg), "vrev32Q", bits_1, [P8; P16; S8; U8; S16; U16]; + Vrev16, [], All (2, Dreg), "vrev16", bits_1, [P8; S8; U8]; + Vrev16, [], All (2, Qreg), "vrev16Q", bits_1, [P8; S8; U8]; + + (* Bit selection. *) + Vbsl, + [Instruction_name ["vbsl"; "vbit"; "vbif"]; + Disassembles_as [Use_operands [| Dreg; Dreg; Dreg |]]], + Use_operands [| Dreg; Dreg; Dreg; Dreg |], "vbsl", bit_select, + pf_su_8_64; + Vbsl, + [Instruction_name ["vbsl"; "vbit"; "vbif"]; + Disassembles_as [Use_operands [| Qreg; Qreg; Qreg |]]], + Use_operands [| Qreg; Qreg; Qreg; Qreg |], "vbslQ", bit_select, + pf_su_8_64; + + (* Transpose elements. **NOTE** ReturnPtr goes some of the way towards + generating good code for intrinsics which return structure types -- + builtins work well by themselves (and understand that the values being + stored on e.g. the stack also reside in registers, so can optimise the + stores away entirely if the results are used immediately), but + intrinsics are very much less efficient. Maybe something can be improved + re: inlining, or tweaking the ABI used for intrinsics (a special call + attribute?). + *) + Vtrn, [ReturnPtr], Pair_result Dreg, "vtrn", bits_2, pf_su_8_32; + Vtrn, [ReturnPtr], Pair_result Qreg, "vtrnQ", bits_2, pf_su_8_32; + + (* Zip elements. *) + Vzip, [ReturnPtr], Pair_result Dreg, "vzip", bits_2, pf_su_8_32; + Vzip, [ReturnPtr], Pair_result Qreg, "vzipQ", bits_2, pf_su_8_32; + + (* Unzip elements. *) + Vuzp, [ReturnPtr], Pair_result Dreg, "vuzp", bits_2, pf_su_8_32; + Vuzp, [ReturnPtr], Pair_result Qreg, "vuzpQ", bits_2, pf_su_8_32; + + (* Element/structure loads. VLD1 variants. *) + Vldx 1, + [Disassembles_as [Use_operands [| VecArray (1, Dreg); + CstPtrTo Corereg |]]], + Use_operands [| Dreg; CstPtrTo Corereg |], "vld1", bits_1, + pf_su_8_64; + Vldx 1, [Disassembles_as [Use_operands [| VecArray (2, Dreg); + CstPtrTo Corereg |]]], + Use_operands [| Qreg; CstPtrTo Corereg |], "vld1Q", bits_1, + pf_su_8_64; + + Vldx_lane 1, + [Disassembles_as [Use_operands [| VecArray (1, Element_of_dreg); + CstPtrTo Corereg |]]], + Use_operands [| Dreg; CstPtrTo Corereg; Dreg; Immed |], + "vld1_lane", bits_3, pf_su_8_32; + Vldx_lane 1, + [Disassembles_as [Use_operands [| VecArray (1, Dreg); + CstPtrTo Corereg |]]; + Const_valuator (fun _ -> 0)], + Use_operands [| Dreg; CstPtrTo Corereg; Dreg; Immed |], + "vld1_lane", bits_3, [S64; U64]; + Vldx_lane 1, + [Disassembles_as [Use_operands [| VecArray (1, Element_of_dreg); + CstPtrTo Corereg |]]], + Use_operands [| Qreg; CstPtrTo Corereg; Qreg; Immed |], + "vld1Q_lane", bits_3, pf_su_8_32; + Vldx_lane 1, + [Disassembles_as [Use_operands [| VecArray (1, Dreg); + CstPtrTo Corereg |]]], + Use_operands [| Qreg; CstPtrTo Corereg; Qreg; Immed |], + "vld1Q_lane", bits_3, [S64; U64]; + + Vldx_dup 1, + [Disassembles_as [Use_operands [| VecArray (1, All_elements_of_dreg); + CstPtrTo Corereg |]]], + Use_operands [| Dreg; CstPtrTo Corereg |], "vld1_dup", + bits_1, pf_su_8_32; + Vldx_dup 1, + [Disassembles_as [Use_operands [| VecArray (1, Dreg); + CstPtrTo Corereg |]]], + Use_operands [| Dreg; CstPtrTo Corereg |], "vld1_dup", + bits_1, [S64; U64]; + Vldx_dup 1, + [Disassembles_as [Use_operands [| VecArray (2, All_elements_of_dreg); + CstPtrTo Corereg |]]], + Use_operands [| Qreg; CstPtrTo Corereg |], "vld1Q_dup", + bits_1, pf_su_8_32; + Vldx_dup 1, + [Disassembles_as [Use_operands [| VecArray (2, Dreg); + CstPtrTo Corereg |]]], + Use_operands [| Qreg; CstPtrTo Corereg |], "vld1Q_dup", + bits_1, [S64; U64]; + + (* VST1 variants. *) + Vstx 1, [Disassembles_as [Use_operands [| VecArray (1, Dreg); + PtrTo Corereg |]]], + Use_operands [| PtrTo Corereg; Dreg |], "vst1", + store_1, pf_su_8_64; + Vstx 1, [Disassembles_as [Use_operands [| VecArray (2, Dreg); + PtrTo Corereg |]]], + Use_operands [| PtrTo Corereg; Qreg |], "vst1Q", + store_1, pf_su_8_64; + + Vstx_lane 1, + [Disassembles_as [Use_operands [| VecArray (1, Element_of_dreg); + CstPtrTo Corereg |]]], + Use_operands [| PtrTo Corereg; Dreg; Immed |], + "vst1_lane", store_3, pf_su_8_32; + Vstx_lane 1, + [Disassembles_as [Use_operands [| VecArray (1, Dreg); + CstPtrTo Corereg |]]; + Const_valuator (fun _ -> 0)], + Use_operands [| PtrTo Corereg; Dreg; Immed |], + "vst1_lane", store_3, [U64; S64]; + Vstx_lane 1, + [Disassembles_as [Use_operands [| VecArray (1, Element_of_dreg); + CstPtrTo Corereg |]]], + Use_operands [| PtrTo Corereg; Qreg; Immed |], + "vst1Q_lane", store_3, pf_su_8_32; + Vstx_lane 1, + [Disassembles_as [Use_operands [| VecArray (1, Dreg); + CstPtrTo Corereg |]]], + Use_operands [| PtrTo Corereg; Qreg; Immed |], + "vst1Q_lane", store_3, [U64; S64]; + + (* VLD2 variants. *) + Vldx 2, [], Use_operands [| VecArray (2, Dreg); CstPtrTo Corereg |], + "vld2", bits_1, pf_su_8_32; + Vldx 2, [Instruction_name ["vld1"]], + Use_operands [| VecArray (2, Dreg); CstPtrTo Corereg |], + "vld2", bits_1, [S64; U64]; + Vldx 2, [Disassembles_as [Use_operands [| VecArray (2, Dreg); + CstPtrTo Corereg |]; + Use_operands [| VecArray (2, Dreg); + CstPtrTo Corereg |]]], + Use_operands [| VecArray (2, Qreg); CstPtrTo Corereg |], + "vld2Q", bits_1, pf_su_8_32; + + Vldx_lane 2, + [Disassembles_as [Use_operands + [| VecArray (2, Element_of_dreg); + CstPtrTo Corereg |]]], + Use_operands [| VecArray (2, Dreg); CstPtrTo Corereg; + VecArray (2, Dreg); Immed |], + "vld2_lane", bits_3, P8 :: P16 :: F32 :: su_8_32; + Vldx_lane 2, + [Disassembles_as [Use_operands + [| VecArray (2, Element_of_dreg); + CstPtrTo Corereg |]]], + Use_operands [| VecArray (2, Qreg); CstPtrTo Corereg; + VecArray (2, Qreg); Immed |], + "vld2Q_lane", bits_3, [P16; F32; U16; U32; S16; S32]; + + Vldx_dup 2, + [Disassembles_as [Use_operands + [| VecArray (2, All_elements_of_dreg); CstPtrTo Corereg |]]], + Use_operands [| VecArray (2, Dreg); CstPtrTo Corereg |], + "vld2_dup", bits_1, pf_su_8_32; + Vldx_dup 2, + [Instruction_name ["vld1"]; Disassembles_as [Use_operands + [| VecArray (2, Dreg); CstPtrTo Corereg |]]], + Use_operands [| VecArray (2, Dreg); CstPtrTo Corereg |], + "vld2_dup", bits_1, [S64; U64]; + + (* VST2 variants. *) + Vstx 2, [Disassembles_as [Use_operands [| VecArray (2, Dreg); + PtrTo Corereg |]]], + Use_operands [| PtrTo Corereg; VecArray (2, Dreg) |], "vst2", + store_1, pf_su_8_32; + Vstx 2, [Disassembles_as [Use_operands [| VecArray (2, Dreg); + PtrTo Corereg |]]; + Instruction_name ["vst1"]], + Use_operands [| PtrTo Corereg; VecArray (2, Dreg) |], "vst2", + store_1, [S64; U64]; + Vstx 2, [Disassembles_as [Use_operands [| VecArray (2, Dreg); + PtrTo Corereg |]; + Use_operands [| VecArray (2, Dreg); + PtrTo Corereg |]]], + Use_operands [| PtrTo Corereg; VecArray (2, Qreg) |], "vst2Q", + store_1, pf_su_8_32; + + Vstx_lane 2, + [Disassembles_as [Use_operands + [| VecArray (2, Element_of_dreg); + CstPtrTo Corereg |]]], + Use_operands [| PtrTo Corereg; VecArray (2, Dreg); Immed |], "vst2_lane", + store_3, P8 :: P16 :: F32 :: su_8_32; + Vstx_lane 2, + [Disassembles_as [Use_operands + [| VecArray (2, Element_of_dreg); + CstPtrTo Corereg |]]], + Use_operands [| PtrTo Corereg; VecArray (2, Qreg); Immed |], "vst2Q_lane", + store_3, [P16; F32; U16; U32; S16; S32]; + + (* VLD3 variants. *) + Vldx 3, [], Use_operands [| VecArray (3, Dreg); CstPtrTo Corereg |], + "vld3", bits_1, pf_su_8_32; + Vldx 3, [Instruction_name ["vld1"]], + Use_operands [| VecArray (3, Dreg); CstPtrTo Corereg |], + "vld3", bits_1, [S64; U64]; + Vldx 3, [Disassembles_as [Use_operands [| VecArray (3, Dreg); + CstPtrTo Corereg |]; + Use_operands [| VecArray (3, Dreg); + CstPtrTo Corereg |]]], + Use_operands [| VecArray (3, Qreg); CstPtrTo Corereg |], + "vld3Q", bits_1, P8 :: P16 :: F32 :: su_8_32; + + Vldx_lane 3, + [Disassembles_as [Use_operands + [| VecArray (3, Element_of_dreg); + CstPtrTo Corereg |]]], + Use_operands [| VecArray (3, Dreg); CstPtrTo Corereg; + VecArray (3, Dreg); Immed |], + "vld3_lane", bits_3, P8 :: P16 :: F32 :: su_8_32; + Vldx_lane 3, + [Disassembles_as [Use_operands + [| VecArray (3, Element_of_dreg); + CstPtrTo Corereg |]]], + Use_operands [| VecArray (3, Qreg); CstPtrTo Corereg; + VecArray (3, Qreg); Immed |], + "vld3Q_lane", bits_3, [P16; F32; U16; U32; S16; S32]; + + Vldx_dup 3, + [Disassembles_as [Use_operands + [| VecArray (3, All_elements_of_dreg); CstPtrTo Corereg |]]], + Use_operands [| VecArray (3, Dreg); CstPtrTo Corereg |], + "vld3_dup", bits_1, pf_su_8_32; + Vldx_dup 3, + [Instruction_name ["vld1"]; Disassembles_as [Use_operands + [| VecArray (3, Dreg); CstPtrTo Corereg |]]], + Use_operands [| VecArray (3, Dreg); CstPtrTo Corereg |], + "vld3_dup", bits_1, [S64; U64]; + + (* VST3 variants. *) + Vstx 3, [Disassembles_as [Use_operands [| VecArray (4, Dreg); + PtrTo Corereg |]]], + Use_operands [| PtrTo Corereg; VecArray (3, Dreg) |], "vst3", + store_1, pf_su_8_32; + Vstx 3, [Disassembles_as [Use_operands [| VecArray (4, Dreg); + PtrTo Corereg |]]; + Instruction_name ["vst1"]], + Use_operands [| PtrTo Corereg; VecArray (3, Dreg) |], "vst3", + store_1, [S64; U64]; + Vstx 3, [Disassembles_as [Use_operands [| VecArray (3, Dreg); + PtrTo Corereg |]; + Use_operands [| VecArray (3, Dreg); + PtrTo Corereg |]]], + Use_operands [| PtrTo Corereg; VecArray (3, Qreg) |], "vst3Q", + store_1, pf_su_8_32; + + Vstx_lane 3, + [Disassembles_as [Use_operands + [| VecArray (3, Element_of_dreg); + CstPtrTo Corereg |]]], + Use_operands [| PtrTo Corereg; VecArray (3, Dreg); Immed |], "vst3_lane", + store_3, P8 :: P16 :: F32 :: su_8_32; + Vstx_lane 3, + [Disassembles_as [Use_operands + [| VecArray (3, Element_of_dreg); + CstPtrTo Corereg |]]], + Use_operands [| PtrTo Corereg; VecArray (3, Qreg); Immed |], "vst3Q_lane", + store_3, [P16; F32; U16; U32; S16; S32]; + + (* VLD4/VST4 variants. *) + Vldx 4, [], Use_operands [| VecArray (4, Dreg); CstPtrTo Corereg |], + "vld4", bits_1, pf_su_8_32; + Vldx 4, [Instruction_name ["vld1"]], + Use_operands [| VecArray (4, Dreg); CstPtrTo Corereg |], + "vld4", bits_1, [S64; U64]; + Vldx 4, [Disassembles_as [Use_operands [| VecArray (4, Dreg); + CstPtrTo Corereg |]; + Use_operands [| VecArray (4, Dreg); + CstPtrTo Corereg |]]], + Use_operands [| VecArray (4, Qreg); CstPtrTo Corereg |], + "vld4Q", bits_1, P8 :: P16 :: F32 :: su_8_32; + + Vldx_lane 4, + [Disassembles_as [Use_operands + [| VecArray (4, Element_of_dreg); + CstPtrTo Corereg |]]], + Use_operands [| VecArray (4, Dreg); CstPtrTo Corereg; + VecArray (4, Dreg); Immed |], + "vld4_lane", bits_3, P8 :: P16 :: F32 :: su_8_32; + Vldx_lane 4, + [Disassembles_as [Use_operands + [| VecArray (4, Element_of_dreg); + CstPtrTo Corereg |]]], + Use_operands [| VecArray (4, Qreg); CstPtrTo Corereg; + VecArray (4, Qreg); Immed |], + "vld4Q_lane", bits_3, [P16; F32; U16; U32; S16; S32]; + + Vldx_dup 4, + [Disassembles_as [Use_operands + [| VecArray (4, All_elements_of_dreg); CstPtrTo Corereg |]]], + Use_operands [| VecArray (4, Dreg); CstPtrTo Corereg |], + "vld4_dup", bits_1, pf_su_8_32; + Vldx_dup 4, + [Instruction_name ["vld1"]; Disassembles_as [Use_operands + [| VecArray (4, Dreg); CstPtrTo Corereg |]]], + Use_operands [| VecArray (4, Dreg); CstPtrTo Corereg |], + "vld4_dup", bits_1, [S64; U64]; + + Vstx 4, [Disassembles_as [Use_operands [| VecArray (4, Dreg); + PtrTo Corereg |]]], + Use_operands [| PtrTo Corereg; VecArray (4, Dreg) |], "vst4", + store_1, pf_su_8_32; + Vstx 4, [Disassembles_as [Use_operands [| VecArray (4, Dreg); + PtrTo Corereg |]]; + Instruction_name ["vst1"]], + Use_operands [| PtrTo Corereg; VecArray (4, Dreg) |], "vst4", + store_1, [S64; U64]; + Vstx 4, [Disassembles_as [Use_operands [| VecArray (4, Dreg); + PtrTo Corereg |]; + Use_operands [| VecArray (4, Dreg); + PtrTo Corereg |]]], + Use_operands [| PtrTo Corereg; VecArray (4, Qreg) |], "vst4Q", + store_1, pf_su_8_32; + + Vstx_lane 4, + [Disassembles_as [Use_operands + [| VecArray (4, Element_of_dreg); + CstPtrTo Corereg |]]], + Use_operands [| PtrTo Corereg; VecArray (4, Dreg); Immed |], "vst4_lane", + store_3, P8 :: P16 :: F32 :: su_8_32; + Vstx_lane 4, + [Disassembles_as [Use_operands + [| VecArray (4, Element_of_dreg); + CstPtrTo Corereg |]]], + Use_operands [| PtrTo Corereg; VecArray (4, Qreg); Immed |], "vst4Q_lane", + store_3, [P16; F32; U16; U32; S16; S32]; + + (* Logical operations. And. *) + Vand, [], All (3, Dreg), "vand", notype_2, su_8_32; + Vand, [No_op], All (3, Dreg), "vand", notype_2, [S64; U64]; + Vand, [], All (3, Qreg), "vandQ", notype_2, su_8_64; + + (* Or. *) + Vorr, [], All (3, Dreg), "vorr", notype_2, su_8_32; + Vorr, [No_op], All (3, Dreg), "vorr", notype_2, [S64; U64]; + Vorr, [], All (3, Qreg), "vorrQ", notype_2, su_8_64; + + (* Eor. *) + Veor, [], All (3, Dreg), "veor", notype_2, su_8_32; + Veor, [No_op], All (3, Dreg), "veor", notype_2, [S64; U64]; + Veor, [], All (3, Qreg), "veorQ", notype_2, su_8_64; + + (* Bic (And-not). *) + Vbic, [], All (3, Dreg), "vbic", notype_2, su_8_32; + Vbic, [No_op], All (3, Dreg), "vbic", notype_2, [S64; U64]; + Vbic, [], All (3, Qreg), "vbicQ", notype_2, su_8_64; + + (* Or-not. *) + Vorn, [], All (3, Dreg), "vorn", notype_2, su_8_32; + Vorn, [No_op], All (3, Dreg), "vorn", notype_2, [S64; U64]; + Vorn, [], All (3, Qreg), "vornQ", notype_2, su_8_64; + ] + +let reinterp = + let elems = P8 :: P16 :: F32 :: su_8_64 in + List.fold_right + (fun convto acc -> + let types = List.fold_right + (fun convfrom acc -> + if convfrom <> convto then + Cast (convto, convfrom) :: acc + else + acc) + elems + [] + in + let dconv = Vreinterp, [No_op], Use_operands [| Dreg; Dreg |], + "vreinterpret", conv_1, types + and qconv = Vreinterp, [No_op], Use_operands [| Qreg; Qreg |], + "vreinterpretQ", conv_1, types in + dconv :: qconv :: acc) + elems + [] + +(* Output routines. *) + +let rec string_of_elt = function + S8 -> "s8" | S16 -> "s16" | S32 -> "s32" | S64 -> "s64" + | U8 -> "u8" | U16 -> "u16" | U32 -> "u32" | U64 -> "u64" + | I8 -> "i8" | I16 -> "i16" | I32 -> "i32" | I64 -> "i64" + | B8 -> "8" | B16 -> "16" | B32 -> "32" | B64 -> "64" + | F32 -> "f32" | P8 -> "p8" | P16 -> "p16" + | Conv (a, b) | Cast (a, b) -> string_of_elt a ^ "_" ^ string_of_elt b + | NoElts -> failwith "No elts" + +let string_of_elt_dots elt = + match elt with + Conv (a, b) | Cast (a, b) -> string_of_elt a ^ "." ^ string_of_elt b + | _ -> string_of_elt elt + +let string_of_vectype vt = + let rec name affix = function + T_int8x8 -> affix "int8x8" + | T_int8x16 -> affix "int8x16" + | T_int16x4 -> affix "int16x4" + | T_int16x8 -> affix "int16x8" + | T_int32x2 -> affix "int32x2" + | T_int32x4 -> affix "int32x4" + | T_int64x1 -> affix "int64x1" + | T_int64x2 -> affix "int64x2" + | T_uint8x8 -> affix "uint8x8" + | T_uint8x16 -> affix "uint8x16" + | T_uint16x4 -> affix "uint16x4" + | T_uint16x8 -> affix "uint16x8" + | T_uint32x2 -> affix "uint32x2" + | T_uint32x4 -> affix "uint32x4" + | T_uint64x1 -> affix "uint64x1" + | T_uint64x2 -> affix "uint64x2" + | T_float32x2 -> affix "float32x2" + | T_float32x4 -> affix "float32x4" + | T_poly8x8 -> affix "poly8x8" + | T_poly8x16 -> affix "poly8x16" + | T_poly16x4 -> affix "poly16x4" + | T_poly16x8 -> affix "poly16x8" + | T_int8 -> affix "int8" + | T_int16 -> affix "int16" + | T_int32 -> affix "int32" + | T_int64 -> affix "int64" + | T_uint8 -> affix "uint8" + | T_uint16 -> affix "uint16" + | T_uint32 -> affix "uint32" + | T_uint64 -> affix "uint64" + | T_poly8 -> affix "poly8" + | T_poly16 -> affix "poly16" + | T_float32 -> affix "float32" + | T_immediate _ -> "const int" + | T_void -> "void" + | T_intQI -> "__builtin_neon_qi" + | T_intHI -> "__builtin_neon_hi" + | T_intSI -> "__builtin_neon_si" + | T_intDI -> "__builtin_neon_di" + | T_floatSF -> "__builtin_neon_sf" + | T_arrayof (num, base) -> + let basename = name (fun x -> x) base in + affix (Printf.sprintf "%sx%d" basename num) + | T_ptrto x -> + let basename = name affix x in + Printf.sprintf "%s *" basename + | T_const x -> + let basename = name affix x in + Printf.sprintf "const %s" basename + in + name (fun x -> x ^ "_t") vt + +let string_of_inttype = function + B_TImode -> "__builtin_neon_ti" + | B_EImode -> "__builtin_neon_ei" + | B_OImode -> "__builtin_neon_oi" + | B_CImode -> "__builtin_neon_ci" + | B_XImode -> "__builtin_neon_xi" + +let string_of_mode = function + V8QI -> "v8qi" | V4HI -> "v4hi" | V2SI -> "v2si" | V2SF -> "v2sf" + | DI -> "di" | V16QI -> "v16qi" | V8HI -> "v8hi" | V4SI -> "v4si" + | V4SF -> "v4sf" | V2DI -> "v2di" | QI -> "qi" | HI -> "hi" | SI -> "si" + | SF -> "sf" + +(* Use uppercase chars for letters which form part of the intrinsic name, but + should be omitted from the builtin name (the info is passed in an extra + argument, instead). *) +let intrinsic_name name = String.lowercase name + +(* Allow the name of the builtin to be overridden by things (e.g. Flipped) + found in the features list. *) +let builtin_name features name = + let name = List.fold_right + (fun el name -> + match el with + Flipped x | Builtin_name x -> x + | _ -> name) + features name in + let islower x = let str = String.make 1 x in (String.lowercase str) = str + and buf = Buffer.create (String.length name) in + String.iter (fun c -> if islower c then Buffer.add_char buf c) name; + Buffer.contents buf + +(* Transform an arity into a list of strings. *) +let strings_of_arity a = + match a with + | Arity0 vt -> [string_of_vectype vt] + | Arity1 (vt1, vt2) -> [string_of_vectype vt1; string_of_vectype vt2] + | Arity2 (vt1, vt2, vt3) -> [string_of_vectype vt1; + string_of_vectype vt2; + string_of_vectype vt3] + | Arity3 (vt1, vt2, vt3, vt4) -> [string_of_vectype vt1; + string_of_vectype vt2; + string_of_vectype vt3; + string_of_vectype vt4] + | Arity4 (vt1, vt2, vt3, vt4, vt5) -> [string_of_vectype vt1; + string_of_vectype vt2; + string_of_vectype vt3; + string_of_vectype vt4; + string_of_vectype vt5] + +(* Suffixes on the end of builtin names that are to be stripped in order + to obtain the name used as an instruction. They are only stripped if + preceded immediately by an underscore. *) +let suffixes_to_strip = [ "n"; "lane"; "dup" ] + +(* Get the possible names of an instruction corresponding to a "name" from the + ops table. This is done by getting the equivalent builtin name and + stripping any suffixes from the list at the top of this file, unless + the features list presents with an Instruction_name entry, in which + case that is used; or unless the features list presents with a Flipped + entry, in which case that is used. If both such entries are present, + the first in the list will be chosen. *) +let get_insn_names features name = + let names = try + begin + match List.find (fun feature -> match feature with + Instruction_name _ -> true + | Flipped _ -> true + | _ -> false) features + with + Instruction_name names -> names + | Flipped name -> [name] + | _ -> assert false + end + with Not_found -> [builtin_name features name] + in + begin + List.map (fun name' -> + try + let underscore = String.rindex name' '_' in + let our_suffix = String.sub name' (underscore + 1) + ((String.length name') - underscore - 1) + in + let rec strip remaining_suffixes = + match remaining_suffixes with + [] -> name' + | s::ss when our_suffix = s -> String.sub name' 0 underscore + | _::ss -> strip ss + in + strip suffixes_to_strip + with (Not_found | Invalid_argument _) -> name') names + end + +(* Apply a function to each element of a list and then comma-separate + the resulting strings. *) +let rec commas f elts acc = + match elts with + [] -> acc + | [elt] -> acc ^ (f elt) + | elt::elts -> + commas f elts (acc ^ (f elt) ^ ", ") + +(* Given a list of features and the shape specified in the "ops" table, apply + a function to each possible shape that the instruction may have. + By default, this is the "shape" entry in "ops". If the features list + contains a Disassembles_as entry, the shapes contained in that entry are + mapped to corresponding outputs and returned in a list. If there is more + than one Disassembles_as entry, only the first is used. *) +let analyze_all_shapes features shape f = + try + match List.find (fun feature -> + match feature with Disassembles_as _ -> true + | _ -> false) + features with + Disassembles_as shapes -> List.map f shapes + | _ -> assert false + with Not_found -> [f shape] + diff --git a/gcc/config/arm/netbsd-elf.h b/gcc/config/arm/netbsd-elf.h new file mode 100644 index 000000000..9cf186b33 --- /dev/null +++ b/gcc/config/arm/netbsd-elf.h @@ -0,0 +1,157 @@ +/* Definitions of target machine for GNU compiler, NetBSD/arm ELF version. + Copyright (C) 2002, 2003, 2004, 2005, 2007 Free Software Foundation, Inc. + Contributed by Wasabi Systems, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3, or (at your + option) any later version. + + GCC is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public + License for more details. + + You should have received a copy of the GNU General Public License + along with GCC; see the file COPYING3. If not see + . */ + +/* Run-time Target Specification. */ +#undef TARGET_VERSION +#define TARGET_VERSION fputs (" (NetBSD/arm ELF)", stderr); + +/* arm.h defaults to ARM6 CPU. */ + +/* This defaults us to little-endian. */ +#ifndef TARGET_ENDIAN_DEFAULT +#define TARGET_ENDIAN_DEFAULT 0 +#endif + +#undef MULTILIB_DEFAULTS + +/* Default it to use ATPCS with soft-VFP. */ +#undef TARGET_DEFAULT +#define TARGET_DEFAULT \ + (MASK_APCS_FRAME \ + | TARGET_ENDIAN_DEFAULT) + +#undef ARM_DEFAULT_ABI +#define ARM_DEFAULT_ABI ARM_ABI_ATPCS + +#define TARGET_OS_CPP_BUILTINS() \ + do \ + { \ + NETBSD_OS_CPP_BUILTINS_ELF(); \ + } \ + while (0) + +#undef SUBTARGET_CPP_SPEC +#define SUBTARGET_CPP_SPEC NETBSD_CPP_SPEC + +#undef SUBTARGET_EXTRA_ASM_SPEC +#define SUBTARGET_EXTRA_ASM_SPEC \ + "-matpcs %{fpic|fpie:-k} %{fPIC|fPIE:-k}" + +/* Default to full VFP if -mhard-float is specified. */ +#undef SUBTARGET_ASM_FLOAT_SPEC +#define SUBTARGET_ASM_FLOAT_SPEC \ + "%{mhard-float:{!mfpu=*:-mfpu=vfp}} \ + %{mfloat-abi=hard:{!mfpu=*:-mfpu=vfp}}" + +#undef SUBTARGET_EXTRA_SPECS +#define SUBTARGET_EXTRA_SPECS \ + { "subtarget_extra_asm_spec", SUBTARGET_EXTRA_ASM_SPEC }, \ + { "subtarget_asm_float_spec", SUBTARGET_ASM_FLOAT_SPEC }, \ + { "netbsd_link_spec", NETBSD_LINK_SPEC_ELF }, \ + { "netbsd_entry_point", NETBSD_ENTRY_POINT }, + +#define NETBSD_ENTRY_POINT "__start" + +#undef LINK_SPEC +#define LINK_SPEC \ + "-X %{mbig-endian:-EB} %{mlittle-endian:-EL} \ + %(netbsd_link_spec)" + +/* Make GCC agree with . */ + +#undef SIZE_TYPE +#define SIZE_TYPE "long unsigned int" + +#undef PTRDIFF_TYPE +#define PTRDIFF_TYPE "long int" + +/* We don't have any limit on the length as out debugger is GDB. */ +#undef DBX_CONTIN_LENGTH + +/* NetBSD does its profiling differently to the Acorn compiler. We + don't need a word following the mcount call; and to skip it + requires either an assembly stub or use of fomit-frame-pointer when + compiling the profiling functions. Since we break Acorn CC + compatibility below a little more won't hurt. */ + +#undef ARM_FUNCTION_PROFILER +#define ARM_FUNCTION_PROFILER(STREAM,LABELNO) \ +{ \ + asm_fprintf (STREAM, "\tmov\t%Rip, %Rlr\n"); \ + asm_fprintf (STREAM, "\tbl\t__mcount%s\n", \ + (TARGET_ARM && NEED_PLT_RELOC) \ + ? "(PLT)" : ""); \ +} + +/* VERY BIG NOTE: Change of structure alignment for NetBSD/arm. + There are consequences you should be aware of... + + Normally GCC/arm uses a structure alignment of 32 for compatibility + with armcc. This means that structures are padded to a word + boundary. However this causes problems with bugged NetBSD kernel + code (possibly userland code as well - I have not checked every + binary). The nature of this bugged code is to rely on sizeof() + returning the correct size of various structures rounded to the + nearest byte (SCSI and ether code are two examples, the vm system + is another). This code breaks when the structure alignment is 32 + as sizeof() will report a word=rounded size. By changing the + structure alignment to 8. GCC will conform to what is expected by + NetBSD. + + This has several side effects that should be considered. + 1. Structures will only be aligned to the size of the largest member. + i.e. structures containing only bytes will be byte aligned. + structures containing shorts will be half word aligned. + structures containing ints will be word aligned. + + This means structures should be padded to a word boundary if + alignment of 32 is required for byte structures etc. + + 2. A potential performance penalty may exist if strings are no longer + word aligned. GCC will not be able to use word load/stores to copy + short strings. + + This modification is not encouraged but with the present state of the + NetBSD source tree it is currently the only solution that meets the + requirements. */ + +#undef DEFAULT_STRUCTURE_SIZE_BOUNDARY +#define DEFAULT_STRUCTURE_SIZE_BOUNDARY 8 + +/* Clear the instruction cache from `BEG' to `END'. This makes a + call to the ARM_SYNC_ICACHE architecture specific syscall. */ +#define CLEAR_INSN_CACHE(BEG, END) \ +do \ + { \ + extern int sysarch(int number, void *args); \ + struct \ + { \ + unsigned int addr; \ + int len; \ + } s; \ + s.addr = (unsigned int)(BEG); \ + s.len = (END) - (BEG); \ + (void) sysarch (0, &s); \ + } \ +while (0) + +#undef FPUTYPE_DEFAULT +#define FPUTYPE_DEFAULT "vfp" + diff --git a/gcc/config/arm/netbsd.h b/gcc/config/arm/netbsd.h new file mode 100644 index 000000000..4a1adbae9 --- /dev/null +++ b/gcc/config/arm/netbsd.h @@ -0,0 +1,150 @@ +/* NetBSD/arm a.out version. + Copyright (C) 1993, 1994, 1997, 1998, 2003, 2004, 2005, 2007, 2008, 2010 + Free Software Foundation, Inc. + Contributed by Mark Brinicombe (amb@physig.ph.kcl.ac.uk) + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3, or (at your + option) any later version. + + GCC is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public + License for more details. + + You should have received a copy of the GNU General Public License + along with GCC; see the file COPYING3. If not see + . */ + +/* Run-time Target Specification. */ +#undef TARGET_VERSION +#define TARGET_VERSION fputs (" (ARM/NetBSD)", stderr); + +/* Unsigned chars produces much better code than signed. */ +#define DEFAULT_SIGNED_CHAR 0 + +/* Since we always use GAS as our assembler we support stabs. */ +#define DBX_DEBUGGING_INFO 1 + +/*#undef ASM_DECLARE_FUNCTION_NAME*/ + +/* ARM6 family default cpu. */ +#define SUBTARGET_CPU_DEFAULT TARGET_CPU_arm6 + +#undef TARGET_DEFAULT +#define TARGET_DEFAULT (MASK_APCS_FRAME) + +/* Some defines for CPP. + arm32 is the NetBSD port name, so we always define arm32 and __arm32__. */ +#define TARGET_OS_CPP_BUILTINS() \ + do { \ + NETBSD_OS_CPP_BUILTINS_AOUT(); \ + builtin_define_std ("arm32"); \ + builtin_define_std ("unix"); \ + builtin_define_std ("riscbsd"); \ + } while (0) + +#undef SUBTARGET_EXTRA_SPECS +#define SUBTARGET_EXTRA_SPECS \ + { "netbsd_cpp_spec", NETBSD_CPP_SPEC }, \ + { "netbsd_link_spec", NETBSD_LINK_SPEC_AOUT }, + +#undef CPP_SPEC +#define CPP_SPEC "\ +%(cpp_cpu_arch) %(cpp_float) %(cpp_endian) %(netbsd_cpp_spec) \ +" + +/* Because TARGET_DEFAULT sets MASK_SOFT_FLOAT */ +#undef CPP_FLOAT_DEFAULT_SPEC +#define CPP_FLOAT_DEFAULT_SPEC "-D__SOFTFP__" + +/* Pass -X to the linker so that it will strip symbols starting with 'L' */ +#undef LINK_SPEC +#define LINK_SPEC "-X %(netbsd_link_spec)" + +#undef SIZE_TYPE +#define SIZE_TYPE "unsigned int" + +#undef PTRDIFF_TYPE +#define PTRDIFF_TYPE "int" + +/* We don't have any limit on the length as out debugger is GDB. */ +#undef DBX_CONTIN_LENGTH + +/* NetBSD does its profiling differently to the Acorn compiler. We + don't need a word following the mcount call; and to skip it + requires either an assembly stub or use of fomit-frame-pointer when + compiling the profiling functions. Since we break Acorn CC + compatibility below a little more won't hurt. */ + +#undef ARM_FUNCTION_PROFILER +#define ARM_FUNCTION_PROFILER(STREAM,LABELNO) \ +{ \ + fprintf(STREAM, "\tmov\t%sip, %slr\n", REGISTER_PREFIX, REGISTER_PREFIX); \ + fprintf(STREAM, "\tbl\tmcount\n"); \ +} + +/* On the ARM `@' introduces a comment, so we must use something else + for .type directives. */ +#undef TYPE_OPERAND_FMT +#define TYPE_OPERAND_FMT "%%%s" + +/* NetBSD uses the old PCC style aggregate returning conventions. */ +#undef DEFAULT_PCC_STRUCT_RETURN +#define DEFAULT_PCC_STRUCT_RETURN 1 + +/* Although not normally relevant (since by default, all aggregates + are returned in memory) compiling some parts of libc requires + non-APCS style struct returns. */ +#undef TARGET_RETURN_IN_MEMORY + +/* VERY BIG NOTE : Change of structure alignment for RiscBSD. + There are consequences you should be aware of... + + Normally GCC/arm uses a structure alignment of 32 for compatibility + with armcc. This means that structures are padded to a word + boundary. However this causes problems with bugged NetBSD kernel + code (possibly userland code as well - I have not checked every + binary). The nature of this bugged code is to rely on sizeof() + returning the correct size of various structures rounded to the + nearest byte (SCSI and ether code are two examples, the vm system + is another). This code breaks when the structure alignment is 32 + as sizeof() will report a word=rounded size. By changing the + structure alignment to 8. GCC will conform to what is expected by + NetBSD. + + This has several side effects that should be considered. + 1. Structures will only be aligned to the size of the largest member. + i.e. structures containing only bytes will be byte aligned. + structures containing shorts will be half word aligned. + structures containing ints will be word aligned. + + This means structures should be padded to a word boundary if + alignment of 32 is required for byte structures etc. + + 2. A potential performance penalty may exist if strings are no longer + word aligned. GCC will not be able to use word load/stores to copy + short strings. + + This modification is not encouraged but with the present state of the + NetBSD source tree it is currently the only solution that meets the + requirements. */ +#undef DEFAULT_STRUCTURE_SIZE_BOUNDARY +#define DEFAULT_STRUCTURE_SIZE_BOUNDARY 8 + +/* Clear the instruction cache from `BEG' to `END'. This makes a + call to the ARM32_SYNC_ICACHE architecture specific syscall. */ +#define CLEAR_INSN_CACHE(BEG, END) \ +{ \ + extern int sysarch(int number, void *args); \ + struct { \ + unsigned int addr; \ + int len; \ + } s; \ + s.addr = (unsigned int)(BEG); \ + s.len = (END) - (BEG); \ + (void)sysarch(0, &s); \ +} diff --git a/gcc/config/arm/pe.c b/gcc/config/arm/pe.c new file mode 100644 index 000000000..3d9efd578 --- /dev/null +++ b/gcc/config/arm/pe.c @@ -0,0 +1,257 @@ +/* Routines for GCC for ARM/pe. + Copyright (C) 1995, 1996, 2000, 2001, 2002, 2004, 2005, 2007, 2008, 2010 + Free Software Foundation, Inc. + Contributed by Doug Evans (dje@cygnus.com). + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3, or (at your + option) any later version. + + GCC is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public + License for more details. + + You should have received a copy of the GNU General Public License + along with GCC; see the file COPYING3. If not see + . */ + +#include "config.h" +#include "system.h" +#include "coretypes.h" +#include "tm.h" +#include "rtl.h" +#include "output.h" +#include "flags.h" +#include "tree.h" +#include "expr.h" +#include "diagnostic-core.h" +#include "tm_p.h" + +extern int current_function_anonymous_args; + + +/* Return nonzero if DECL is a dllexport'd object. */ + +tree current_class_type; /* FIXME */ + +int +arm_dllexport_p (tree decl) +{ + tree exp; + + if (TREE_CODE (decl) != VAR_DECL + && TREE_CODE (decl) != FUNCTION_DECL) + return 0; + exp = lookup_attribute ("dllexport", DECL_ATTRIBUTES (decl)); + if (exp) + return 1; + + return 0; +} + +/* Return nonzero if DECL is a dllimport'd object. */ + +int +arm_dllimport_p (tree decl) +{ + tree imp; + + if (TREE_CODE (decl) == FUNCTION_DECL + && TARGET_NOP_FUN_DLLIMPORT) + return 0; + + if (TREE_CODE (decl) != VAR_DECL + && TREE_CODE (decl) != FUNCTION_DECL) + return 0; + imp = lookup_attribute ("dllimport", DECL_ATTRIBUTES (decl)); + if (imp) + return 1; + + return 0; +} + +/* Return nonzero if SYMBOL is marked as being dllexport'd. */ + +int +arm_dllexport_name_p (const char *symbol) +{ + return symbol[0] == ARM_PE_FLAG_CHAR && symbol[1] == 'e' && symbol[2] == '.'; +} + +/* Return nonzero if SYMBOL is marked as being dllimport'd. */ + +int +arm_dllimport_name_p (const char *symbol) +{ + return symbol[0] == ARM_PE_FLAG_CHAR && symbol[1] == 'i' && symbol[2] == '.'; +} + +/* Mark a DECL as being dllexport'd. + Note that we override the previous setting (e.g.: dllimport). */ + +void +arm_mark_dllexport (tree decl) +{ + const char * oldname; + char * newname; + rtx rtlname; + tree idp; + + rtlname = XEXP (DECL_RTL (decl), 0); + if (GET_CODE (rtlname) == MEM) + rtlname = XEXP (rtlname, 0); + gcc_assert (GET_CODE (rtlname) == SYMBOL_REF); + oldname = XSTR (rtlname, 0); + + if (arm_dllimport_name_p (oldname)) + oldname += 9; + else if (arm_dllexport_name_p (oldname)) + return; /* already done */ + + newname = XALLOCAVEC (char, strlen (oldname) + 4); + sprintf (newname, "%ce.%s", ARM_PE_FLAG_CHAR, oldname); + + /* We pass newname through get_identifier to ensure it has a unique + address. RTL processing can sometimes peek inside the symbol ref + and compare the string's addresses to see if two symbols are + identical. */ + /* ??? At least I think that's why we do this. */ + idp = get_identifier (newname); + + XEXP (DECL_RTL (decl), 0) = + gen_rtx_SYMBOL_REF (Pmode, IDENTIFIER_POINTER (idp)); +} + +/* Mark a DECL as being dllimport'd. */ + +void +arm_mark_dllimport (tree decl) +{ + const char * oldname; + char * newname; + tree idp; + rtx rtlname, newrtl; + + rtlname = XEXP (DECL_RTL (decl), 0); + + if (GET_CODE (rtlname) == MEM) + rtlname = XEXP (rtlname, 0); + gcc_assert (GET_CODE (rtlname) == SYMBOL_REF); + oldname = XSTR (rtlname, 0); + + gcc_assert (!arm_dllexport_name_p (oldname)); + if (arm_dllimport_name_p (oldname)) + return; /* already done */ + + /* ??? One can well ask why we're making these checks here, + and that would be a good question. */ + + /* Imported variables can't be initialized. */ + if (TREE_CODE (decl) == VAR_DECL + && !DECL_VIRTUAL_P (decl) + && DECL_INITIAL (decl)) + { + error ("initialized variable %q+D is marked dllimport", decl); + return; + } + /* Nor can they be static. */ + if (TREE_CODE (decl) == VAR_DECL + /* ??? Is this test for vtables needed? */ + && !DECL_VIRTUAL_P (decl) + && 0 /*???*/) + { + error ("static variable %q+D is marked dllimport", decl); + return; + } + + /* `extern' needn't be specified with dllimport. + Specify `extern' now and hope for the best. Sigh. */ + if (TREE_CODE (decl) == VAR_DECL + /* ??? Is this test for vtables needed? */ + && !DECL_VIRTUAL_P (decl)) + { + DECL_EXTERNAL (decl) = 1; + TREE_PUBLIC (decl) = 1; + } + + newname = XALLOCAVEC (char, strlen (oldname) + 11); + sprintf (newname, "%ci.__imp_%s", ARM_PE_FLAG_CHAR, oldname); + + /* We pass newname through get_identifier to ensure it has a unique + address. RTL processing can sometimes peek inside the symbol ref + and compare the string's addresses to see if two symbols are + identical. */ + /* ??? At least I think that's why we do this. */ + idp = get_identifier (newname); + + newrtl = gen_rtx_MEM (Pmode, + gen_rtx_SYMBOL_REF (Pmode, + IDENTIFIER_POINTER (idp))); + XEXP (DECL_RTL (decl), 0) = newrtl; +} + +void +arm_pe_encode_section_info (tree decl, rtx rtl, int first ATTRIBUTE_UNUSED) +{ + /* This bit is copied from arm_encode_section_info. */ + if (optimize > 0 && TREE_CONSTANT (decl)) + SYMBOL_REF_FLAG (XEXP (rtl, 0)) = 1; + + /* Mark the decl so we can tell from the rtl whether the object is + dllexport'd or dllimport'd. */ + if (arm_dllexport_p (decl)) + arm_mark_dllexport (decl); + else if (arm_dllimport_p (decl)) + arm_mark_dllimport (decl); + /* It might be that DECL has already been marked as dllimport, but a + subsequent definition nullified that. The attribute is gone but + DECL_RTL still has @i.__imp_foo. We need to remove that. */ + else if ((TREE_CODE (decl) == FUNCTION_DECL + || TREE_CODE (decl) == VAR_DECL) + && DECL_RTL (decl) != NULL_RTX + && GET_CODE (DECL_RTL (decl)) == MEM + && GET_CODE (XEXP (DECL_RTL (decl), 0)) == MEM + && GET_CODE (XEXP (XEXP (DECL_RTL (decl), 0), 0)) == SYMBOL_REF + && arm_dllimport_name_p (XSTR (XEXP (XEXP (DECL_RTL (decl), 0), 0), 0))) + { + const char *oldname = XSTR (XEXP (XEXP (DECL_RTL (decl), 0), 0), 0); + tree idp = get_identifier (oldname + 9); + rtx newrtl = gen_rtx_SYMBOL_REF (Pmode, IDENTIFIER_POINTER (idp)); + + XEXP (DECL_RTL (decl), 0) = newrtl; + + /* We previously set TREE_PUBLIC and DECL_EXTERNAL. + ??? We leave these alone for now. */ + } +} + +void +arm_pe_unique_section (tree decl, int reloc) +{ + int len; + const char * name; + char * string; + const char * prefix; + + name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)); + name = arm_strip_name_encoding (name); + + /* The object is put in, for example, section .text$foo. + The linker will then ultimately place them in .text + (everything from the $ on is stripped). */ + if (TREE_CODE (decl) == FUNCTION_DECL) + prefix = ".text$"; + else if (decl_readonly_section (decl, reloc)) + prefix = ".rdata$"; + else + prefix = ".data$"; + len = strlen (name) + strlen (prefix); + string = XALLOCAVEC (char, len + 1); + sprintf (string, "%s%s", prefix, name); + + DECL_SECTION_NAME (decl) = build_string (len, string); +} diff --git a/gcc/config/arm/pe.h b/gcc/config/arm/pe.h new file mode 100644 index 000000000..009c4fe43 --- /dev/null +++ b/gcc/config/arm/pe.h @@ -0,0 +1,148 @@ +/* Definitions of target machine for GNU compiler, for ARM with PE obj format. + Copyright (C) 1995, 1996, 1999, 2000, 2002, 2003, 2004, 2005, 2007 + Free Software Foundation, Inc. + Contributed by Doug Evans (dje@cygnus.com). + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3, or (at your + option) any later version. + + GCC is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public + License for more details. + + You should have received a copy of the GNU General Public License + along with GCC; see the file COPYING3. If not see + . */ + +/* Enable PE specific code. */ +#define ARM_PE 1 + +#define ARM_PE_FLAG_CHAR '@' + +/* Ensure that @x. will be stripped from the function name. */ +#undef SUBTARGET_NAME_ENCODING_LENGTHS +#define SUBTARGET_NAME_ENCODING_LENGTHS \ + case ARM_PE_FLAG_CHAR: return 3; + +#undef USER_LABEL_PREFIX +#define USER_LABEL_PREFIX "_" + + +/* Run-time Target Specification. */ +#undef TARGET_VERSION +#define TARGET_VERSION fputs (" (ARM/pe)", stderr) + +/* Get tree.c to declare a target-specific specialization of + merge_decl_attributes. */ +#define TARGET_DLLIMPORT_DECL_ATTRIBUTES 1 + +#undef SUBTARGET_CPP_SPEC +#define SUBTARGET_CPP_SPEC "-D__pe__" + +#undef TARGET_DEFAULT +#define TARGET_DEFAULT (MASK_NOP_FUN_DLLIMPORT) + +#undef MULTILIB_DEFAULTS +#define MULTILIB_DEFAULTS \ + { "marm", "mlittle-endian", "msoft-float", "mno-thumb-interwork" } + +#undef WCHAR_TYPE +#define WCHAR_TYPE "short unsigned int" +#undef WCHAR_TYPE_SIZE +#define WCHAR_TYPE_SIZE 16 + +/* r11 is fixed. */ +#undef SUBTARGET_CONDITIONAL_REGISTER_USAGE +#define SUBTARGET_CONDITIONAL_REGISTER_USAGE \ + fixed_regs [11] = 1; \ + call_used_regs [11] = 1; + + +/* PE/COFF uses explicit import from shared libraries. */ +#define MULTIPLE_SYMBOL_SPACES 1 + +#define TARGET_ASM_UNIQUE_SECTION arm_pe_unique_section +#define TARGET_ASM_FUNCTION_RODATA_SECTION default_no_function_rodata_section + +#define SUPPORTS_ONE_ONLY 1 + +/* Switch into a generic section. */ +#undef TARGET_ASM_NAMED_SECTION +#define TARGET_ASM_NAMED_SECTION default_pe_asm_named_section + +#define TARGET_ASM_FILE_START_FILE_DIRECTIVE true + +/* Output a reference to a label. */ +#undef ASM_OUTPUT_LABELREF +#define ASM_OUTPUT_LABELREF(STREAM, NAME) \ + asm_fprintf (STREAM, "%U%s", arm_strip_name_encoding (NAME)) + +/* Output a function definition label. */ +#undef ASM_DECLARE_FUNCTION_NAME +#define ASM_DECLARE_FUNCTION_NAME(STREAM, NAME, DECL) \ + do \ + { \ + if (arm_dllexport_name_p (NAME)) \ + { \ + drectve_section (); \ + fprintf (STREAM, "\t.ascii \" -export:%s\"\n", \ + arm_strip_name_encoding (NAME)); \ + switch_to_section (function_section (DECL)); \ + } \ + ARM_DECLARE_FUNCTION_NAME (STREAM, NAME, DECL); \ + if (TARGET_THUMB) \ + fprintf (STREAM, "\t.code 16\n"); \ + ASM_OUTPUT_LABEL (STREAM, NAME); \ + } \ + while (0) + +/* Output a common block. */ +#undef ASM_OUTPUT_COMMON +#define ASM_OUTPUT_COMMON(STREAM, NAME, SIZE, ROUNDED) \ + do \ + { \ + if (arm_dllexport_name_p (NAME)) \ + { \ + drectve_section (); \ + fprintf ((STREAM), "\t.ascii \" -export:%s\"\n",\ + arm_strip_name_encoding (NAME)); \ + } \ + if (! arm_dllimport_name_p (NAME)) \ + { \ + fprintf ((STREAM), "\t.comm\t"); \ + assemble_name ((STREAM), (NAME)); \ + asm_fprintf ((STREAM), ", %d\t%@ %d\n", \ + (int)(ROUNDED), (int)(SIZE)); \ + } \ + } \ + while (0) + +/* Output the label for an initialized variable. */ +#undef ASM_DECLARE_OBJECT_NAME +#define ASM_DECLARE_OBJECT_NAME(STREAM, NAME, DECL) \ + do \ + { \ + if (arm_dllexport_name_p (NAME)) \ + { \ + section *save_section = in_section; \ + drectve_section (); \ + fprintf (STREAM, "\t.ascii \" -export:%s\"\n",\ + arm_strip_name_encoding (NAME)); \ + switch_to_section (save_section); \ + } \ + ASM_OUTPUT_LABEL ((STREAM), (NAME)); \ + } \ + while (0) + +/* Support the ctors/dtors and other sections. */ + +#define DRECTVE_SECTION_ASM_OP "\t.section .drectve" + +#define drectve_section() \ + (fprintf (asm_out_file, "%s\n", DRECTVE_SECTION_ASM_OP), \ + in_section = NULL) diff --git a/gcc/config/arm/pe.opt b/gcc/config/arm/pe.opt new file mode 100644 index 000000000..560a52a81 --- /dev/null +++ b/gcc/config/arm/pe.opt @@ -0,0 +1,23 @@ +; PE-specific options for the ARM port + +; Copyright (C) 2005, 2007 Free Software Foundation, Inc. +; +; This file is part of GCC. +; +; GCC is free software; you can redistribute it and/or modify it under +; the terms of the GNU General Public License as published by the Free +; Software Foundation; either version 3, or (at your option) any later +; version. +; +; GCC is distributed in the hope that it will be useful, but WITHOUT ANY +; WARRANTY; without even the implied warranty of MERCHANTABILITY or +; FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +; for more details. +; +; You should have received a copy of the GNU General Public License +; along with GCC; see the file COPYING3. If not see +; . + +mnop-fun-dllimport +Target Report Mask(NOP_FUN_DLLIMPORT) +Ignore dllimport attribute for functions diff --git a/gcc/config/arm/pr-support.c b/gcc/config/arm/pr-support.c new file mode 100644 index 000000000..deee661e2 --- /dev/null +++ b/gcc/config/arm/pr-support.c @@ -0,0 +1,401 @@ +/* ARM EABI compliant unwinding routines + Copyright (C) 2004, 2005, 2009 Free Software Foundation, Inc. + Contributed by Paul Brook + + This file is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by the + Free Software Foundation; either version 3, or (at your option) any + later version. + + This file is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +#include "unwind.h" + +/* We add a prototype for abort here to avoid creating a dependency on + target headers. */ +extern void abort (void); + +typedef struct _ZSt9type_info type_info; /* This names C++ type_info type */ + +/* Misc constants. */ +#define R_IP 12 +#define R_SP 13 +#define R_LR 14 +#define R_PC 15 + +#define uint32_highbit (((_uw) 1) << 31) + +void __attribute__((weak)) __cxa_call_unexpected(_Unwind_Control_Block *ucbp); + +/* Unwind descriptors. */ + +typedef struct +{ + _uw16 length; + _uw16 offset; +} EHT16; + +typedef struct +{ + _uw length; + _uw offset; +} EHT32; + +/* Calculate the address encoded by a 31-bit self-relative offset at address + P. Copy of routine in unwind-arm.c. */ + +static inline _uw +selfrel_offset31 (const _uw *p) +{ + _uw offset; + + offset = *p; + /* Sign extend to 32 bits. */ + if (offset & (1 << 30)) + offset |= 1u << 31; + + return offset + (_uw) p; +} + + +/* Personality routine helper functions. */ + +#define CODE_FINISH (0xb0) + +/* Return the next byte of unwinding information, or CODE_FINISH if there is + no data remaining. */ +static inline _uw8 +next_unwind_byte (__gnu_unwind_state * uws) +{ + _uw8 b; + + if (uws->bytes_left == 0) + { + /* Load another word */ + if (uws->words_left == 0) + return CODE_FINISH; /* Nothing left. */ + uws->words_left--; + uws->data = *(uws->next++); + uws->bytes_left = 3; + } + else + uws->bytes_left--; + + /* Extract the most significant byte. */ + b = (uws->data >> 24) & 0xff; + uws->data <<= 8; + return b; +} + +/* Execute the unwinding instructions described by UWS. */ +_Unwind_Reason_Code +__gnu_unwind_execute (_Unwind_Context * context, __gnu_unwind_state * uws) +{ + _uw op; + int set_pc; + _uw reg; + + set_pc = 0; + for (;;) + { + op = next_unwind_byte (uws); + if (op == CODE_FINISH) + { + /* If we haven't already set pc then copy it from lr. */ + if (!set_pc) + { + _Unwind_VRS_Get (context, _UVRSC_CORE, R_LR, _UVRSD_UINT32, + ®); + _Unwind_VRS_Set (context, _UVRSC_CORE, R_PC, _UVRSD_UINT32, + ®); + set_pc = 1; + } + /* Drop out of the loop. */ + break; + } + if ((op & 0x80) == 0) + { + /* vsp = vsp +- (imm6 << 2 + 4). */ + _uw offset; + + offset = ((op & 0x3f) << 2) + 4; + _Unwind_VRS_Get (context, _UVRSC_CORE, R_SP, _UVRSD_UINT32, ®); + if (op & 0x40) + reg -= offset; + else + reg += offset; + _Unwind_VRS_Set (context, _UVRSC_CORE, R_SP, _UVRSD_UINT32, ®); + continue; + } + + if ((op & 0xf0) == 0x80) + { + op = (op << 8) | next_unwind_byte (uws); + if (op == 0x8000) + { + /* Refuse to unwind. */ + return _URC_FAILURE; + } + /* Pop r4-r15 under mask. */ + op = (op << 4) & 0xfff0; + if (_Unwind_VRS_Pop (context, _UVRSC_CORE, op, _UVRSD_UINT32) + != _UVRSR_OK) + return _URC_FAILURE; + if (op & (1 << R_PC)) + set_pc = 1; + continue; + } + if ((op & 0xf0) == 0x90) + { + op &= 0xf; + if (op == 13 || op == 15) + /* Reserved. */ + return _URC_FAILURE; + /* vsp = r[nnnn]. */ + _Unwind_VRS_Get (context, _UVRSC_CORE, op, _UVRSD_UINT32, ®); + _Unwind_VRS_Set (context, _UVRSC_CORE, R_SP, _UVRSD_UINT32, ®); + continue; + } + if ((op & 0xf0) == 0xa0) + { + /* Pop r4-r[4+nnn], [lr]. */ + _uw mask; + + mask = (0xff0 >> (7 - (op & 7))) & 0xff0; + if (op & 8) + mask |= (1 << R_LR); + if (_Unwind_VRS_Pop (context, _UVRSC_CORE, mask, _UVRSD_UINT32) + != _UVRSR_OK) + return _URC_FAILURE; + continue; + } + if ((op & 0xf0) == 0xb0) + { + /* op == 0xb0 already handled. */ + if (op == 0xb1) + { + op = next_unwind_byte (uws); + if (op == 0 || ((op & 0xf0) != 0)) + /* Spare. */ + return _URC_FAILURE; + /* Pop r0-r4 under mask. */ + if (_Unwind_VRS_Pop (context, _UVRSC_CORE, op, _UVRSD_UINT32) + != _UVRSR_OK) + return _URC_FAILURE; + continue; + } + if (op == 0xb2) + { + /* vsp = vsp + 0x204 + (uleb128 << 2). */ + int shift; + + _Unwind_VRS_Get (context, _UVRSC_CORE, R_SP, _UVRSD_UINT32, + ®); + op = next_unwind_byte (uws); + shift = 2; + while (op & 0x80) + { + reg += ((op & 0x7f) << shift); + shift += 7; + op = next_unwind_byte (uws); + } + reg += ((op & 0x7f) << shift) + 0x204; + _Unwind_VRS_Set (context, _UVRSC_CORE, R_SP, _UVRSD_UINT32, + ®); + continue; + } + if (op == 0xb3) + { + /* Pop VFP registers with fldmx. */ + op = next_unwind_byte (uws); + op = ((op & 0xf0) << 12) | ((op & 0xf) + 1); + if (_Unwind_VRS_Pop (context, _UVRSC_VFP, op, _UVRSD_VFPX) + != _UVRSR_OK) + return _URC_FAILURE; + continue; + } + if ((op & 0xfc) == 0xb4) + { + /* Pop FPA E[4]-E[4+nn]. */ + op = 0x40000 | ((op & 3) + 1); + if (_Unwind_VRS_Pop (context, _UVRSC_FPA, op, _UVRSD_FPAX) + != _UVRSR_OK) + return _URC_FAILURE; + continue; + } + /* op & 0xf8 == 0xb8. */ + /* Pop VFP D[8]-D[8+nnn] with fldmx. */ + op = 0x80000 | ((op & 7) + 1); + if (_Unwind_VRS_Pop (context, _UVRSC_VFP, op, _UVRSD_VFPX) + != _UVRSR_OK) + return _URC_FAILURE; + continue; + } + if ((op & 0xf0) == 0xc0) + { + if (op == 0xc6) + { + /* Pop iWMMXt D registers. */ + op = next_unwind_byte (uws); + op = ((op & 0xf0) << 12) | ((op & 0xf) + 1); + if (_Unwind_VRS_Pop (context, _UVRSC_WMMXD, op, _UVRSD_UINT64) + != _UVRSR_OK) + return _URC_FAILURE; + continue; + } + if (op == 0xc7) + { + op = next_unwind_byte (uws); + if (op == 0 || (op & 0xf0) != 0) + /* Spare. */ + return _URC_FAILURE; + /* Pop iWMMXt wCGR{3,2,1,0} under mask. */ + if (_Unwind_VRS_Pop (context, _UVRSC_WMMXC, op, _UVRSD_UINT32) + != _UVRSR_OK) + return _URC_FAILURE; + continue; + } + if ((op & 0xf8) == 0xc0) + { + /* Pop iWMMXt wR[10]-wR[10+nnn]. */ + op = 0xa0000 | ((op & 0xf) + 1); + if (_Unwind_VRS_Pop (context, _UVRSC_WMMXD, op, _UVRSD_UINT64) + != _UVRSR_OK) + return _URC_FAILURE; + continue; + } + if (op == 0xc8) + { +#ifndef __VFP_FP__ + /* Pop FPA registers. */ + op = next_unwind_byte (uws); + op = ((op & 0xf0) << 12) | ((op & 0xf) + 1); + if (_Unwind_VRS_Pop (context, _UVRSC_FPA, op, _UVRSD_FPAX) + != _UVRSR_OK) + return _URC_FAILURE; + continue; +#else + /* Pop VFPv3 registers D[16+ssss]-D[16+ssss+cccc] with vldm. */ + op = next_unwind_byte (uws); + op = (((op & 0xf0) + 16) << 12) | ((op & 0xf) + 1); + if (_Unwind_VRS_Pop (context, _UVRSC_VFP, op, _UVRSD_DOUBLE) + != _UVRSR_OK) + return _URC_FAILURE; + continue; +#endif + } + if (op == 0xc9) + { + /* Pop VFP registers with fldmd. */ + op = next_unwind_byte (uws); + op = ((op & 0xf0) << 12) | ((op & 0xf) + 1); + if (_Unwind_VRS_Pop (context, _UVRSC_VFP, op, _UVRSD_DOUBLE) + != _UVRSR_OK) + return _URC_FAILURE; + continue; + } + /* Spare. */ + return _URC_FAILURE; + } + if ((op & 0xf8) == 0xd0) + { + /* Pop VFP D[8]-D[8+nnn] with fldmd. */ + op = 0x80000 | ((op & 7) + 1); + if (_Unwind_VRS_Pop (context, _UVRSC_VFP, op, _UVRSD_DOUBLE) + != _UVRSR_OK) + return _URC_FAILURE; + continue; + } + /* Spare. */ + return _URC_FAILURE; + } + return _URC_OK; +} + + +/* Execute the unwinding instructions associated with a frame. UCBP and + CONTEXT are the current exception object and virtual CPU state + respectively. */ + +_Unwind_Reason_Code +__gnu_unwind_frame (_Unwind_Control_Block * ucbp, _Unwind_Context * context) +{ + _uw *ptr; + __gnu_unwind_state uws; + + ptr = (_uw *) ucbp->pr_cache.ehtp; + /* Skip over the personality routine address. */ + ptr++; + /* Setup the unwinder state. */ + uws.data = (*ptr) << 8; + uws.next = ptr + 1; + uws.bytes_left = 3; + uws.words_left = ((*ptr) >> 24) & 0xff; + + return __gnu_unwind_execute (context, &uws); +} + +/* Get the _Unwind_Control_Block from an _Unwind_Context. */ + +static inline _Unwind_Control_Block * +unwind_UCB_from_context (_Unwind_Context * context) +{ + return (_Unwind_Control_Block *) _Unwind_GetGR (context, R_IP); +} + +/* Get the start address of the function being unwound. */ + +_Unwind_Ptr +_Unwind_GetRegionStart (_Unwind_Context * context) +{ + _Unwind_Control_Block *ucbp; + + ucbp = unwind_UCB_from_context (context); + return (_Unwind_Ptr) ucbp->pr_cache.fnstart; +} + +/* Find the Language specific exception data. */ + +void * +_Unwind_GetLanguageSpecificData (_Unwind_Context * context) +{ + _Unwind_Control_Block *ucbp; + _uw *ptr; + + /* Get a pointer to the exception table entry. */ + ucbp = unwind_UCB_from_context (context); + ptr = (_uw *) ucbp->pr_cache.ehtp; + /* Skip the personality routine address. */ + ptr++; + /* Skip the unwind opcodes. */ + ptr += (((*ptr) >> 24) & 0xff) + 1; + + return ptr; +} + + +/* These two should never be used. */ + +_Unwind_Ptr +_Unwind_GetDataRelBase (_Unwind_Context *context __attribute__ ((unused))) +{ + abort (); +} + +_Unwind_Ptr +_Unwind_GetTextRelBase (_Unwind_Context *context __attribute__ ((unused))) +{ + abort (); +} diff --git a/gcc/config/arm/predicates.md b/gcc/config/arm/predicates.md new file mode 100644 index 000000000..e34b46da0 --- /dev/null +++ b/gcc/config/arm/predicates.md @@ -0,0 +1,688 @@ +;; Predicate definitions for ARM and Thumb +;; Copyright (C) 2004, 2007, 2008, 2010 Free Software Foundation, Inc. +;; Contributed by ARM Ltd. + +;; This file is part of GCC. + +;; GCC is free software; you can redistribute it and/or modify it +;; under the terms of the GNU General Public License as published +;; by the Free Software Foundation; either version 3, or (at your +;; option) any later version. + +;; GCC is distributed in the hope that it will be useful, but WITHOUT +;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +;; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public +;; License for more details. + +;; You should have received a copy of the GNU General Public License +;; along with GCC; see the file COPYING3. If not see +;; . + +(define_predicate "s_register_operand" + (match_code "reg,subreg") +{ + if (GET_CODE (op) == SUBREG) + op = SUBREG_REG (op); + /* We don't consider registers whose class is NO_REGS + to be a register operand. */ + /* XXX might have to check for lo regs only for thumb ??? */ + return (GET_CODE (op) == REG + && (REGNO (op) >= FIRST_PSEUDO_REGISTER + || REGNO_REG_CLASS (REGNO (op)) != NO_REGS)); +}) + +;; Any hard register. +(define_predicate "arm_hard_register_operand" + (match_code "reg") +{ + return REGNO (op) < FIRST_PSEUDO_REGISTER; +}) + +;; A low register. +(define_predicate "low_register_operand" + (and (match_code "reg") + (match_test "REGNO (op) <= LAST_LO_REGNUM"))) + +;; A low register or const_int. +(define_predicate "low_reg_or_int_operand" + (ior (match_code "const_int") + (match_operand 0 "low_register_operand"))) + +;; Any core register, or any pseudo. */ +(define_predicate "arm_general_register_operand" + (match_code "reg,subreg") +{ + if (GET_CODE (op) == SUBREG) + op = SUBREG_REG (op); + + return (GET_CODE (op) == REG + && (REGNO (op) <= LAST_ARM_REGNUM + || REGNO (op) >= FIRST_PSEUDO_REGISTER)); +}) + +(define_predicate "f_register_operand" + (match_code "reg,subreg") +{ + if (GET_CODE (op) == SUBREG) + op = SUBREG_REG (op); + + /* We don't consider registers whose class is NO_REGS + to be a register operand. */ + return (GET_CODE (op) == REG + && (REGNO (op) >= FIRST_PSEUDO_REGISTER + || REGNO_REG_CLASS (REGNO (op)) == FPA_REGS)); +}) + +(define_predicate "vfp_register_operand" + (match_code "reg,subreg") +{ + if (GET_CODE (op) == SUBREG) + op = SUBREG_REG (op); + + /* We don't consider registers whose class is NO_REGS + to be a register operand. */ + return (GET_CODE (op) == REG + && (REGNO (op) >= FIRST_PSEUDO_REGISTER + || REGNO_REG_CLASS (REGNO (op)) == VFP_D0_D7_REGS + || REGNO_REG_CLASS (REGNO (op)) == VFP_LO_REGS + || (TARGET_VFPD32 + && REGNO_REG_CLASS (REGNO (op)) == VFP_REGS))); +}) + +(define_special_predicate "subreg_lowpart_operator" + (and (match_code "subreg") + (match_test "subreg_lowpart_p (op)"))) + +;; Reg, subreg(reg) or const_int. +(define_predicate "reg_or_int_operand" + (ior (match_code "const_int") + (match_operand 0 "s_register_operand"))) + +(define_predicate "arm_immediate_operand" + (and (match_code "const_int") + (match_test "const_ok_for_arm (INTVAL (op))"))) + +;; A constant value which fits into two instructions, each taking +;; an arithmetic constant operand for one of the words. +(define_predicate "arm_immediate_di_operand" + (and (match_code "const_int,const_double") + (match_test "arm_const_double_by_immediates (op)"))) + +(define_predicate "arm_neg_immediate_operand" + (and (match_code "const_int") + (match_test "const_ok_for_arm (-INTVAL (op))"))) + +(define_predicate "arm_not_immediate_operand" + (and (match_code "const_int") + (match_test "const_ok_for_arm (~INTVAL (op))"))) + +(define_predicate "const0_operand" + (and (match_code "const_int") + (match_test "INTVAL (op) == 0"))) + +;; Something valid on the RHS of an ARM data-processing instruction +(define_predicate "arm_rhs_operand" + (ior (match_operand 0 "s_register_operand") + (match_operand 0 "arm_immediate_operand"))) + +(define_predicate "arm_rhsm_operand" + (ior (match_operand 0 "arm_rhs_operand") + (match_operand 0 "memory_operand"))) + +(define_predicate "shift_amount_operand" + (ior (and (match_test "TARGET_ARM") + (match_operand 0 "s_register_operand")) + (match_operand 0 "const_int_operand"))) + +(define_predicate "arm_add_operand" + (ior (match_operand 0 "arm_rhs_operand") + (match_operand 0 "arm_neg_immediate_operand"))) + +(define_predicate "arm_addimm_operand" + (ior (match_operand 0 "arm_immediate_operand") + (match_operand 0 "arm_neg_immediate_operand"))) + +(define_predicate "arm_not_operand" + (ior (match_operand 0 "arm_rhs_operand") + (match_operand 0 "arm_not_immediate_operand"))) + +(define_predicate "arm_di_operand" + (ior (match_operand 0 "s_register_operand") + (match_operand 0 "arm_immediate_di_operand"))) + +;; True if the operand is a memory reference which contains an +;; offsettable address. +(define_predicate "offsettable_memory_operand" + (and (match_code "mem") + (match_test + "offsettable_address_p (reload_completed | reload_in_progress, + mode, XEXP (op, 0))"))) + +;; True if the operand is a memory operand that does not have an +;; automodified base register (and thus will not generate output reloads). +(define_predicate "call_memory_operand" + (and (match_code "mem") + (and (match_test "GET_RTX_CLASS (GET_CODE (XEXP (op, 0))) + != RTX_AUTOINC") + (match_operand 0 "memory_operand")))) + +(define_predicate "arm_reload_memory_operand" + (and (match_code "mem,reg,subreg") + (match_test "(!CONSTANT_P (op) + && (true_regnum(op) == -1 + || (GET_CODE (op) == REG + && REGNO (op) >= FIRST_PSEUDO_REGISTER)))"))) + +;; True for valid operands for the rhs of an floating point insns. +;; Allows regs or certain consts on FPA, just regs for everything else. +(define_predicate "arm_float_rhs_operand" + (ior (match_operand 0 "s_register_operand") + (and (match_code "const_double") + (match_test "TARGET_FPA && arm_const_double_rtx (op)")))) + +(define_predicate "arm_float_add_operand" + (ior (match_operand 0 "arm_float_rhs_operand") + (and (match_code "const_double") + (match_test "TARGET_FPA && neg_const_double_rtx_ok_for_fpa (op)")))) + +(define_predicate "vfp_compare_operand" + (ior (match_operand 0 "s_register_operand") + (and (match_code "const_double") + (match_test "arm_const_double_rtx (op)")))) + +(define_predicate "arm_float_compare_operand" + (if_then_else (match_test "TARGET_VFP") + (match_operand 0 "vfp_compare_operand") + (match_operand 0 "arm_float_rhs_operand"))) + +;; True for valid index operands. +(define_predicate "index_operand" + (ior (match_operand 0 "s_register_operand") + (and (match_operand 0 "immediate_operand") + (match_test "(GET_CODE (op) != CONST_INT + || (INTVAL (op) < 4096 && INTVAL (op) > -4096))")))) + +;; True for operators that can be combined with a shift in ARM state. +(define_special_predicate "shiftable_operator" + (and (match_code "plus,minus,ior,xor,and") + (match_test "mode == GET_MODE (op)"))) + +;; True for logical binary operators. +(define_special_predicate "logical_binary_operator" + (and (match_code "ior,xor,and") + (match_test "mode == GET_MODE (op)"))) + +;; True for commutative operators +(define_special_predicate "commutative_binary_operator" + (and (match_code "ior,xor,and,plus") + (match_test "mode == GET_MODE (op)"))) + +;; True for shift operators. +(define_special_predicate "shift_operator" + (and (ior (ior (and (match_code "mult") + (match_test "power_of_two_operand (XEXP (op, 1), mode)")) + (and (match_code "rotate") + (match_test "GET_CODE (XEXP (op, 1)) == CONST_INT + && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) < 32"))) + (match_code "ashift,ashiftrt,lshiftrt,rotatert")) + (match_test "mode == GET_MODE (op)"))) + +;; True for MULT, to identify which variant of shift_operator is in use. +(define_special_predicate "mult_operator" + (match_code "mult")) + +;; True for operators that have 16-bit thumb variants. */ +(define_special_predicate "thumb_16bit_operator" + (match_code "plus,minus,and,ior,xor")) + +;; True for EQ & NE +(define_special_predicate "equality_operator" + (match_code "eq,ne")) + +;; True for integer comparisons and, if FP is active, for comparisons +;; other than LTGT or UNEQ. +(define_special_predicate "arm_comparison_operator" + (ior (match_code "eq,ne,le,lt,ge,gt,geu,gtu,leu,ltu") + (and (match_test "TARGET_32BIT && TARGET_HARD_FLOAT + && (TARGET_FPA || TARGET_VFP)") + (match_code "unordered,ordered,unlt,unle,unge,ungt")))) + +(define_special_predicate "lt_ge_comparison_operator" + (match_code "lt,ge")) + +(define_special_predicate "noov_comparison_operator" + (match_code "lt,ge,eq,ne")) + +(define_special_predicate "minmax_operator" + (and (match_code "smin,smax,umin,umax") + (match_test "mode == GET_MODE (op)"))) + +(define_special_predicate "cc_register" + (and (match_code "reg") + (and (match_test "REGNO (op) == CC_REGNUM") + (ior (match_test "mode == GET_MODE (op)") + (match_test "mode == VOIDmode && GET_MODE_CLASS (GET_MODE (op)) == MODE_CC"))))) + +(define_special_predicate "dominant_cc_register" + (match_code "reg") +{ + if (mode == VOIDmode) + { + mode = GET_MODE (op); + + if (GET_MODE_CLASS (mode) != MODE_CC) + return false; + } + + return (cc_register (op, mode) + && (mode == CC_DNEmode + || mode == CC_DEQmode + || mode == CC_DLEmode + || mode == CC_DLTmode + || mode == CC_DGEmode + || mode == CC_DGTmode + || mode == CC_DLEUmode + || mode == CC_DLTUmode + || mode == CC_DGEUmode + || mode == CC_DGTUmode)); +}) + +(define_special_predicate "arm_extendqisi_mem_op" + (and (match_operand 0 "memory_operand") + (match_test "arm_legitimate_address_outer_p (mode, XEXP (op, 0), + SIGN_EXTEND, 0)"))) + +(define_special_predicate "arm_reg_or_extendqisi_mem_op" + (ior (match_operand 0 "arm_extendqisi_mem_op") + (match_operand 0 "s_register_operand"))) + +(define_predicate "power_of_two_operand" + (match_code "const_int") +{ + unsigned HOST_WIDE_INT value = INTVAL (op) & 0xffffffff; + + return value != 0 && (value & (value - 1)) == 0; +}) + +(define_predicate "nonimmediate_di_operand" + (match_code "reg,subreg,mem") +{ + if (s_register_operand (op, mode)) + return true; + + if (GET_CODE (op) == SUBREG) + op = SUBREG_REG (op); + + return GET_CODE (op) == MEM && memory_address_p (DImode, XEXP (op, 0)); +}) + +(define_predicate "di_operand" + (ior (match_code "const_int,const_double") + (and (match_code "reg,subreg,mem") + (match_operand 0 "nonimmediate_di_operand")))) + +(define_predicate "nonimmediate_soft_df_operand" + (match_code "reg,subreg,mem") +{ + if (s_register_operand (op, mode)) + return true; + + if (GET_CODE (op) == SUBREG) + op = SUBREG_REG (op); + + return GET_CODE (op) == MEM && memory_address_p (DFmode, XEXP (op, 0)); +}) + +(define_predicate "soft_df_operand" + (ior (match_code "const_double") + (and (match_code "reg,subreg,mem") + (match_operand 0 "nonimmediate_soft_df_operand")))) + +(define_predicate "const_shift_operand" + (and (match_code "const_int") + (ior (match_operand 0 "power_of_two_operand") + (match_test "((unsigned HOST_WIDE_INT) INTVAL (op)) < 32")))) + + +(define_special_predicate "load_multiple_operation" + (match_code "parallel") +{ + HOST_WIDE_INT count = XVECLEN (op, 0); + unsigned dest_regno; + rtx src_addr; + HOST_WIDE_INT i = 1, base = 0; + HOST_WIDE_INT offset = 0; + rtx elt; + bool addr_reg_loaded = false; + bool update = false; + + if (count <= 1 + || GET_CODE (XVECEXP (op, 0, 0)) != SET + || !REG_P (SET_DEST (XVECEXP (op, 0, 0)))) + return false; + + /* Check to see if this might be a write-back. */ + if (GET_CODE (SET_SRC (elt = XVECEXP (op, 0, 0))) == PLUS) + { + i++; + base = 1; + update = true; + + /* Now check it more carefully. */ + if (GET_CODE (SET_DEST (elt)) != REG + || GET_CODE (XEXP (SET_SRC (elt), 0)) != REG + || GET_CODE (XEXP (SET_SRC (elt), 1)) != CONST_INT + || INTVAL (XEXP (SET_SRC (elt), 1)) != (count - 1) * 4) + return false; + } + + /* Perform a quick check so we don't blow up below. */ + if (count <= i + || GET_CODE (XVECEXP (op, 0, i - 1)) != SET + || GET_CODE (SET_DEST (XVECEXP (op, 0, i - 1))) != REG + || GET_CODE (SET_SRC (XVECEXP (op, 0, i - 1))) != MEM) + return false; + + dest_regno = REGNO (SET_DEST (XVECEXP (op, 0, i - 1))); + src_addr = XEXP (SET_SRC (XVECEXP (op, 0, i - 1)), 0); + if (GET_CODE (src_addr) == PLUS) + { + if (GET_CODE (XEXP (src_addr, 1)) != CONST_INT) + return false; + offset = INTVAL (XEXP (src_addr, 1)); + src_addr = XEXP (src_addr, 0); + } + if (!REG_P (src_addr)) + return false; + + for (; i < count; i++) + { + elt = XVECEXP (op, 0, i); + + if (GET_CODE (elt) != SET + || GET_CODE (SET_DEST (elt)) != REG + || GET_MODE (SET_DEST (elt)) != SImode + || REGNO (SET_DEST (elt)) <= dest_regno + || GET_CODE (SET_SRC (elt)) != MEM + || GET_MODE (SET_SRC (elt)) != SImode + || ((GET_CODE (XEXP (SET_SRC (elt), 0)) != PLUS + || !rtx_equal_p (XEXP (XEXP (SET_SRC (elt), 0), 0), src_addr) + || GET_CODE (XEXP (XEXP (SET_SRC (elt), 0), 1)) != CONST_INT + || INTVAL (XEXP (XEXP (SET_SRC (elt), 0), 1)) != offset + (i - base) * 4) + && (!REG_P (XEXP (SET_SRC (elt), 0)) + || offset + (i - base) * 4 != 0))) + return false; + dest_regno = REGNO (SET_DEST (elt)); + if (dest_regno == REGNO (src_addr)) + addr_reg_loaded = true; + } + /* For Thumb, we only have updating instructions. If the pattern does + not describe an update, it must be because the address register is + in the list of loaded registers - on the hardware, this has the effect + of overriding the update. */ + if (update && addr_reg_loaded) + return false; + if (TARGET_THUMB1) + return update || addr_reg_loaded; + return true; +}) + +(define_special_predicate "store_multiple_operation" + (match_code "parallel") +{ + HOST_WIDE_INT count = XVECLEN (op, 0); + unsigned src_regno; + rtx dest_addr; + HOST_WIDE_INT i = 1, base = 0, offset = 0; + rtx elt; + + if (count <= 1 + || GET_CODE (XVECEXP (op, 0, 0)) != SET) + return false; + + /* Check to see if this might be a write-back. */ + if (GET_CODE (SET_SRC (elt = XVECEXP (op, 0, 0))) == PLUS) + { + i++; + base = 1; + + /* Now check it more carefully. */ + if (GET_CODE (SET_DEST (elt)) != REG + || GET_CODE (XEXP (SET_SRC (elt), 0)) != REG + || GET_CODE (XEXP (SET_SRC (elt), 1)) != CONST_INT + || INTVAL (XEXP (SET_SRC (elt), 1)) != (count - 1) * 4) + return false; + } + + /* Perform a quick check so we don't blow up below. */ + if (count <= i + || GET_CODE (XVECEXP (op, 0, i - 1)) != SET + || GET_CODE (SET_DEST (XVECEXP (op, 0, i - 1))) != MEM + || GET_CODE (SET_SRC (XVECEXP (op, 0, i - 1))) != REG) + return false; + + src_regno = REGNO (SET_SRC (XVECEXP (op, 0, i - 1))); + dest_addr = XEXP (SET_DEST (XVECEXP (op, 0, i - 1)), 0); + + if (GET_CODE (dest_addr) == PLUS) + { + if (GET_CODE (XEXP (dest_addr, 1)) != CONST_INT) + return false; + offset = INTVAL (XEXP (dest_addr, 1)); + dest_addr = XEXP (dest_addr, 0); + } + if (!REG_P (dest_addr)) + return false; + + for (; i < count; i++) + { + elt = XVECEXP (op, 0, i); + + if (GET_CODE (elt) != SET + || GET_CODE (SET_SRC (elt)) != REG + || GET_MODE (SET_SRC (elt)) != SImode + || REGNO (SET_SRC (elt)) <= src_regno + || GET_CODE (SET_DEST (elt)) != MEM + || GET_MODE (SET_DEST (elt)) != SImode + || ((GET_CODE (XEXP (SET_DEST (elt), 0)) != PLUS + || !rtx_equal_p (XEXP (XEXP (SET_DEST (elt), 0), 0), dest_addr) + || GET_CODE (XEXP (XEXP (SET_DEST (elt), 0), 1)) != CONST_INT + || INTVAL (XEXP (XEXP (SET_DEST (elt), 0), 1)) != offset + (i - base) * 4) + && (!REG_P (XEXP (SET_DEST (elt), 0)) + || offset + (i - base) * 4 != 0))) + return false; + src_regno = REGNO (SET_SRC (elt)); + } + + return true; +}) + +(define_special_predicate "multi_register_push" + (match_code "parallel") +{ + if ((GET_CODE (XVECEXP (op, 0, 0)) != SET) + || (GET_CODE (SET_SRC (XVECEXP (op, 0, 0))) != UNSPEC) + || (XINT (SET_SRC (XVECEXP (op, 0, 0)), 1) != UNSPEC_PUSH_MULT)) + return false; + + return true; +}) + +;;------------------------------------------------------------------------- +;; +;; Thumb predicates +;; + +(define_predicate "thumb1_cmp_operand" + (ior (and (match_code "reg,subreg") + (match_operand 0 "s_register_operand")) + (and (match_code "const_int") + (match_test "((unsigned HOST_WIDE_INT) INTVAL (op)) < 256")))) + +(define_predicate "thumb1_cmpneg_operand" + (and (match_code "const_int") + (match_test "INTVAL (op) < 0 && INTVAL (op) > -256"))) + +;; Return TRUE if a result can be stored in OP without clobbering the +;; condition code register. Prior to reload we only accept a +;; register. After reload we have to be able to handle memory as +;; well, since a pseudo may not get a hard reg and reload cannot +;; handle output-reloads on jump insns. + +;; We could possibly handle mem before reload as well, but that might +;; complicate things with the need to handle increment +;; side-effects. +(define_predicate "thumb_cbrch_target_operand" + (and (match_code "reg,subreg,mem") + (ior (match_operand 0 "s_register_operand") + (and (match_test "reload_in_progress || reload_completed") + (match_operand 0 "memory_operand"))))) + +;;------------------------------------------------------------------------- +;; +;; MAVERICK predicates +;; + +(define_predicate "cirrus_register_operand" + (match_code "reg,subreg") +{ + if (GET_CODE (op) == SUBREG) + op = SUBREG_REG (op); + + return (GET_CODE (op) == REG + && (REGNO_REG_CLASS (REGNO (op)) == CIRRUS_REGS + || REGNO_REG_CLASS (REGNO (op)) == GENERAL_REGS)); +}) + +(define_predicate "cirrus_fp_register" + (match_code "reg,subreg") +{ + if (GET_CODE (op) == SUBREG) + op = SUBREG_REG (op); + + return (GET_CODE (op) == REG + && (REGNO (op) >= FIRST_PSEUDO_REGISTER + || REGNO_REG_CLASS (REGNO (op)) == CIRRUS_REGS)); +}) + +(define_predicate "cirrus_shift_const" + (and (match_code "const_int") + (match_test "((unsigned HOST_WIDE_INT) INTVAL (op)) < 64"))) + + +;; Neon predicates + +(define_predicate "const_multiple_of_8_operand" + (match_code "const_int") +{ + unsigned HOST_WIDE_INT val = INTVAL (op); + return (val & 7) == 0; +}) + +(define_predicate "imm_for_neon_mov_operand" + (match_code "const_vector") +{ + return neon_immediate_valid_for_move (op, mode, NULL, NULL); +}) + +(define_predicate "imm_for_neon_logic_operand" + (match_code "const_vector") +{ + return (TARGET_NEON + && neon_immediate_valid_for_logic (op, mode, 0, NULL, NULL)); +}) + +(define_predicate "imm_for_neon_inv_logic_operand" + (match_code "const_vector") +{ + return (TARGET_NEON + && neon_immediate_valid_for_logic (op, mode, 1, NULL, NULL)); +}) + +(define_predicate "neon_logic_op2" + (ior (match_operand 0 "imm_for_neon_logic_operand") + (match_operand 0 "s_register_operand"))) + +(define_predicate "neon_inv_logic_op2" + (ior (match_operand 0 "imm_for_neon_inv_logic_operand") + (match_operand 0 "s_register_operand"))) + +;; TODO: We could check lane numbers more precisely based on the mode. +(define_predicate "neon_lane_number" + (and (match_code "const_int") + (match_test "INTVAL (op) >= 0 && INTVAL (op) <= 15"))) +;; Predicates for named expanders that overlap multiple ISAs. + +(define_predicate "cmpdi_operand" + (if_then_else (match_test "TARGET_HARD_FLOAT && TARGET_MAVERICK") + (and (match_test "TARGET_ARM") + (match_operand 0 "cirrus_fp_register")) + (and (match_test "TARGET_32BIT") + (match_operand 0 "arm_di_operand")))) + +;; True if the operand is memory reference suitable for a ldrex/strex. +(define_predicate "arm_sync_memory_operand" + (and (match_operand 0 "memory_operand") + (match_code "reg" "0"))) + +;; Predicates for parallel expanders based on mode. +(define_special_predicate "vect_par_constant_high" + (match_code "parallel") +{ + HOST_WIDE_INT count = XVECLEN (op, 0); + int i; + int base = GET_MODE_NUNITS (mode); + + if ((count < 1) + || (count != base/2)) + return false; + + if (!VECTOR_MODE_P (mode)) + return false; + + for (i = 0; i < count; i++) + { + rtx elt = XVECEXP (op, 0, i); + int val; + + if (GET_CODE (elt) != CONST_INT) + return false; + + val = INTVAL (elt); + if (val != (base/2) + i) + return false; + } + return true; +}) + +(define_special_predicate "vect_par_constant_low" + (match_code "parallel") +{ + HOST_WIDE_INT count = XVECLEN (op, 0); + int i; + int base = GET_MODE_NUNITS (mode); + + if ((count < 1) + || (count != base/2)) + return false; + + if (!VECTOR_MODE_P (mode)) + return false; + + for (i = 0; i < count; i++) + { + rtx elt = XVECEXP (op, 0, i); + int val; + + if (GET_CODE (elt) != CONST_INT) + return false; + + val = INTVAL (elt); + if (val != i) + return false; + } + return true; +}) + +(define_special_predicate "add_operator" + (match_code "plus")) diff --git a/gcc/config/arm/rtems-eabi.h b/gcc/config/arm/rtems-eabi.h new file mode 100644 index 000000000..ced98a91b --- /dev/null +++ b/gcc/config/arm/rtems-eabi.h @@ -0,0 +1,29 @@ +/* Definitions for RTEMS based ARM systems using EABI. + Copyright (C) 2011 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3, or (at your + option) any later version. + + GCC is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public + License for more details. + + You should have received a copy of the GNU General Public License + along with GCC; see the file COPYING3. If not see + . */ + +#define HAS_INIT_SECTION + +#undef TARGET_OS_CPP_BUILTINS +#define TARGET_OS_CPP_BUILTINS() \ + do { \ + builtin_define ("__rtems__"); \ + builtin_define ("__USE_INIT_FINI__"); \ + builtin_assert ("system=rtems"); \ + TARGET_BPABI_CPP_BUILTINS(); \ + } while (0) diff --git a/gcc/config/arm/rtems-elf.h b/gcc/config/arm/rtems-elf.h new file mode 100644 index 000000000..dade74b15 --- /dev/null +++ b/gcc/config/arm/rtems-elf.h @@ -0,0 +1,45 @@ +/* Definitions for RTEMS based ARM systems using ELF + Copyright (C) 2000, 2002, 2005, 2007, 2008 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3, or (at your + option) any later version. + + GCC is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public + License for more details. + + You should have received a copy of the GNU General Public License + along with GCC; see the file COPYING3. If not see + . */ + +/* Run-time Target Specification. */ +#undef TARGET_VERSION +#define TARGET_VERSION fputs (" (ARM/ELF RTEMS)", stderr); + +#define HAS_INIT_SECTION + +#define TARGET_OS_CPP_BUILTINS() \ + do { \ + builtin_define ("__rtems__"); \ + builtin_define ("__USE_INIT_FINI__"); \ + builtin_assert ("system=rtems"); \ + } while (0) + +/* + * The default in gcc now is soft-float, but gcc misses it to + * pass it to the assembler. + */ +#undef SUBTARGET_EXTRA_ASM_SPEC +#define SUBTARGET_EXTRA_ASM_SPEC "\ + %{!mhard-float: %{!msoft-float:-mfpu=softfpa}}" + +/* + * The default includes --start-group and --end-group which conflicts + * with how this used to be defined. + */ +#undef LINK_GCC_C_SEQUENCE_SPEC diff --git a/gcc/config/arm/semi.h b/gcc/config/arm/semi.h new file mode 100644 index 000000000..1e35710c9 --- /dev/null +++ b/gcc/config/arm/semi.h @@ -0,0 +1,75 @@ +/* Definitions of target machine for GNU compiler. ARM on semi-hosted platform + Copyright (C) 1994, 1995, 1996, 1997, 2001, 2004, 2005, 2007 + Free Software Foundation, Inc. + Contributed by Richard Earnshaw (richard.earnshaw@arm.com) + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3, or (at your + option) any later version. + + GCC is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public + License for more details. + + You should have received a copy of the GNU General Public License + along with GCC; see the file COPYING3. If not see + . */ + +#define STARTFILE_SPEC "crt0.o%s" + +#ifndef LIB_SPEC +#define LIB_SPEC "-lc" +#endif + +#ifndef SUBTARGET_CPP_SPEC +#define SUBTARGET_CPP_SPEC "-D__semi__" +#endif + +#ifndef LINK_SPEC +#define LINK_SPEC "%{mbig-endian:-EB} -X" +#endif + +#ifndef TARGET_VERSION +#define TARGET_VERSION fputs (" (ARM/semi-hosted)", stderr); +#endif + +#ifndef TARGET_DEFAULT_FLOAT_ABI +#define TARGET_DEFAULT_FLOAT_ABI ARM_FLOAT_ABI_HARD +#endif + +#ifndef TARGET_DEFAULT +#define TARGET_DEFAULT (MASK_APCS_FRAME) +#endif + +#ifndef SUBTARGET_EXTRA_SPECS +#define SUBTARGET_EXTRA_SPECS \ + { "subtarget_extra_asm_spec", SUBTARGET_EXTRA_ASM_SPEC }, +#endif + +#ifndef SUBTARGET_EXTRA_ASM_SPEC +#define SUBTARGET_EXTRA_ASM_SPEC "" +#endif + +/* The compiler supports PIC code generation, even though the binutils + may not. If we are asked to compile position independent code, we + always pass -k to the assembler. If it doesn't recognize it, then + it will barf, which probably means that it doesn't know how to + assemble PIC code. This is what we want, since otherwise tools + may incorrectly assume we support PIC compilation even if the + binutils can't. */ +#ifndef ASM_SPEC +#define ASM_SPEC "\ +%{fpic|fpie: -k} %{fPIC|fPIE: -k} \ +%{mbig-endian:-EB} \ +%{mcpu=*:-mcpu=%*} \ +%{march=*:-march=%*} \ +%{mapcs-float:-mfloat} \ +%{msoft-float:-mfloat-abi=soft} %{mhard-float:-mfloat-abi=hard} \ +%{mfloat-abi=*} %{mfpu=*} \ +%{mthumb-interwork:-mthumb-interwork} \ +%(subtarget_extra_asm_spec)" +#endif diff --git a/gcc/config/arm/sfp-machine.h b/gcc/config/arm/sfp-machine.h new file mode 100644 index 000000000..a89d05a00 --- /dev/null +++ b/gcc/config/arm/sfp-machine.h @@ -0,0 +1,105 @@ +#define _FP_W_TYPE_SIZE 32 +#define _FP_W_TYPE unsigned long +#define _FP_WS_TYPE signed long +#define _FP_I_TYPE long + +/* The type of the result of a floating point comparison. This must + match `__libgcc_cmp_return__' in GCC for the target. */ +typedef int __gcc_CMPtype __attribute__ ((mode (__libgcc_cmp_return__))); +#define CMPtype __gcc_CMPtype + +#define _FP_MUL_MEAT_S(R,X,Y) \ + _FP_MUL_MEAT_1_wide(_FP_WFRACBITS_S,R,X,Y,umul_ppmm) +#define _FP_MUL_MEAT_D(R,X,Y) \ + _FP_MUL_MEAT_2_wide(_FP_WFRACBITS_D,R,X,Y,umul_ppmm) +#define _FP_MUL_MEAT_Q(R,X,Y) \ + _FP_MUL_MEAT_4_wide(_FP_WFRACBITS_Q,R,X,Y,umul_ppmm) + +#define _FP_DIV_MEAT_S(R,X,Y) _FP_DIV_MEAT_1_loop(S,R,X,Y) +#define _FP_DIV_MEAT_D(R,X,Y) _FP_DIV_MEAT_2_udiv(D,R,X,Y) +#define _FP_DIV_MEAT_Q(R,X,Y) _FP_DIV_MEAT_4_udiv(Q,R,X,Y) + +#define _FP_NANFRAC_H ((_FP_QNANBIT_H << 1) - 1) +#define _FP_NANFRAC_S ((_FP_QNANBIT_S << 1) - 1) +#define _FP_NANFRAC_D ((_FP_QNANBIT_D << 1) - 1), -1 +#define _FP_NANFRAC_Q ((_FP_QNANBIT_Q << 1) - 1), -1, -1, -1 +#define _FP_NANSIGN_H 0 +#define _FP_NANSIGN_S 0 +#define _FP_NANSIGN_D 0 +#define _FP_NANSIGN_Q 0 + +#define _FP_KEEPNANFRACP 1 + +/* Someone please check this. */ +#define _FP_CHOOSENAN(fs, wc, R, X, Y, OP) \ + do { \ + if ((_FP_FRAC_HIGH_RAW_##fs(X) & _FP_QNANBIT_##fs) \ + && !(_FP_FRAC_HIGH_RAW_##fs(Y) & _FP_QNANBIT_##fs)) \ + { \ + R##_s = Y##_s; \ + _FP_FRAC_COPY_##wc(R,Y); \ + } \ + else \ + { \ + R##_s = X##_s; \ + _FP_FRAC_COPY_##wc(R,X); \ + } \ + R##_c = FP_CLS_NAN; \ + } while (0) + +#define __LITTLE_ENDIAN 1234 +#define __BIG_ENDIAN 4321 + +#if defined __ARMEB__ +# define __BYTE_ORDER __BIG_ENDIAN +#else +# define __BYTE_ORDER __LITTLE_ENDIAN +#endif + + +/* Define ALIASNAME as a strong alias for NAME. */ +# define strong_alias(name, aliasname) _strong_alias(name, aliasname) +# define _strong_alias(name, aliasname) \ + extern __typeof (name) aliasname __attribute__ ((alias (#name))); + +#ifdef __ARM_EABI__ +/* Rename functions to their EABI names. */ +/* The comparison functions need wrappers for EABI semantics, so + leave them unmolested. */ +#define __negsf2 __aeabi_fneg +#define __subsf3 __aeabi_fsub +#define __addsf3 __aeabi_fadd +#define __floatunsisf __aeabi_ui2f +#define __floatsisf __aeabi_i2f +#define __floatundisf __aeabi_ul2f +#define __floatdisf __aeabi_l2f +#define __mulsf3 __aeabi_fmul +#define __divsf3 __aeabi_fdiv +#define __unordsf2 __aeabi_fcmpun +#define __fixsfsi __aeabi_f2iz +#define __fixunssfsi __aeabi_f2uiz +#define __fixsfdi __aeabi_f2lz +#define __fixunssfdi __aeabi_f2ulz +#define __floatdisf __aeabi_l2f + +#define __negdf2 __aeabi_dneg +#define __subdf3 __aeabi_dsub +#define __adddf3 __aeabi_dadd +#define __floatunsidf __aeabi_ui2d +#define __floatsidf __aeabi_i2d +#define __extendsfdf2 __aeabi_f2d +#define __truncdfsf2 __aeabi_d2f +#define __floatundidf __aeabi_ul2d +#define __floatdidf __aeabi_l2d +#define __muldf3 __aeabi_dmul +#define __divdf3 __aeabi_ddiv +#define __unorddf2 __aeabi_dcmpun +#define __fixdfsi __aeabi_d2iz +#define __fixunsdfsi __aeabi_d2uiz +#define __fixdfdi __aeabi_d2lz +#define __fixunsdfdi __aeabi_d2ulz +#define __floatdidf __aeabi_l2d +#define __extendhfsf2 __gnu_h2f_ieee +#define __truncsfhf2 __gnu_f2h_ieee + +#endif /* __ARM_EABI__ */ diff --git a/gcc/config/arm/symbian.h b/gcc/config/arm/symbian.h new file mode 100644 index 000000000..ff233a89f --- /dev/null +++ b/gcc/config/arm/symbian.h @@ -0,0 +1,105 @@ +/* Configuration file for Symbian OS on ARM processors. + Copyright (C) 2004, 2005, 2007, 2008 + Free Software Foundation, Inc. + Contributed by CodeSourcery, LLC + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3, or (at your + option) any later version. + + GCC is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public + License for more details. + + You should have received a copy of the GNU General Public License + along with GCC; see the file COPYING3. If not see + . */ + +/* Do not expand builtin functions (unless explicitly prefixed with + "__builtin"). Symbian OS code relies on properties of the standard + library that go beyond those guaranteed by the ANSI/ISO standard. + For example, "memcpy" works even with overlapping memory, like + "memmove". We cannot simply set flag_no_builtin in arm.c because + (a) flag_no_builtin is not declared in language-independent code, + and (b) that would prevent users from explicitly overriding the + default with -fbuiltin, which may sometimes be useful. + + Make all symbols hidden by default. Symbian OS expects that all + exported symbols will be explicitly marked with + "__declspec(dllexport)". + + Enumeration types use 4 bytes, even if the enumerals are small, + unless explicitly overridden. + + The wchar_t type is a 2-byte type, unless explicitly + overridden. */ +#define CC1_SPEC \ + "%{!fbuiltin:%{!fno-builtin:-fno-builtin}} " \ + "%{!fvisibility=*:-fvisibility=hidden} " \ + "%{!fshort-enums:%{!fno-short-enums:-fno-short-enums}} " \ + "%{!fshort-wchar:%{!fno-short-wchar:-fshort-wchar}} " +#define CC1PLUS_SPEC CC1_SPEC + +/* Symbian OS does not use crt*.o, unlike the generic unknown-elf + configuration. */ +#undef STARTFILE_SPEC +#define STARTFILE_SPEC "" + +#undef ENDFILE_SPEC +#define ENDFILE_SPEC "" + +/* Do not link with any libraries by default. On Symbian OS, the user + must supply all required libraries on the command line. */ +#undef LIB_SPEC +#define LIB_SPEC "" + +/* Support the "dllimport" attribute. */ +#define TARGET_DLLIMPORT_DECL_ATTRIBUTES 1 + +/* Symbian OS assumes ARM V5 or above. Since -march=armv5 is + equivalent to making the ARM 10TDMI core the default, we can set + SUBTARGET_CPU_DEFAULT and get an equivalent effect. */ +#undef SUBTARGET_CPU_DEFAULT +#define SUBTARGET_CPU_DEFAULT TARGET_CPU_arm10tdmi + +/* The assembler should assume VFP FPU format, and armv5t. */ +#undef SUBTARGET_ASM_FLOAT_SPEC +#define SUBTARGET_ASM_FLOAT_SPEC \ + "%{!mfpu=*:-mfpu=vfp} %{!mcpu=*:%{!march=*:-march=armv5t}}" + +/* SymbianOS provides the BPABI routines in a separate library. + Therefore, we do not need to define any of them in libgcc. */ +#undef RENAME_LIBRARY +#define RENAME_LIBRARY(GCC_NAME, AEABI_NAME) /* empty */ + +/* Define the __symbian__ macro. */ +#undef TARGET_OS_CPP_BUILTINS +#define TARGET_OS_CPP_BUILTINS() \ + do \ + { \ + /* Include the default BPABI stuff. */ \ + TARGET_BPABI_CPP_BUILTINS (); \ + /* Symbian OS does not support merging symbols across DLL \ + boundaries. */ \ + builtin_define ("__GXX_MERGED_TYPEINFO_NAMES=0"); \ + builtin_define ("__symbian__"); \ + } \ + while (false) + +/* On SymbianOS, these sections are not writable, so we use "a", + rather than "aw", for the section attributes. */ +#undef ARM_EABI_CTORS_SECTION_OP +#define ARM_EABI_CTORS_SECTION_OP \ + "\t.section\t.init_array,\"a\",%init_array" +#undef ARM_EABI_DTORS_SECTION_OP +#define ARM_EABI_DTORS_SECTION_OP \ + "\t.section\t.fini_array,\"a\",%fini_array" + +/* SymbianOS cannot merge entities with vague linkage at runtime. */ +#define TARGET_ARM_DYNAMIC_VAGUE_LINKAGE_P false + +#define TARGET_DEFAULT_WORD_RELOCATIONS 1 diff --git a/gcc/config/arm/sync.md b/gcc/config/arm/sync.md new file mode 100644 index 000000000..689a235c1 --- /dev/null +++ b/gcc/config/arm/sync.md @@ -0,0 +1,602 @@ +;; Machine description for ARM processor synchronization primitives. +;; Copyright (C) 2010 Free Software Foundation, Inc. +;; Written by Marcus Shawcroft (marcus.shawcroft@arm.com) +;; +;; This file is part of GCC. +;; +;; GCC is free software; you can redistribute it and/or modify it +;; under the terms of the GNU General Public License as published by +;; the Free Software Foundation; either version 3, or (at your option) +;; any later version. +;; +;; GCC is distributed in the hope that it will be useful, but +;; WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;; General Public License for more details. +;; +;; You should have received a copy of the GNU General Public License +;; along with GCC; see the file COPYING3. If not see +;; . */ + +;; ARMV6 introduced ldrex and strex instruction. These instruction +;; access SI width data. In order to implement synchronization +;; primitives for the narrower QI and HI modes we insert appropriate +;; AND/OR sequences into the synchronization loop to mask out the +;; relevant component of an SI access. + +(define_expand "memory_barrier" + [(set (match_dup 0) + (unspec:BLK [(match_dup 0)] UNSPEC_MEMORY_BARRIER))] + "TARGET_HAVE_MEMORY_BARRIER" +{ + operands[0] = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (Pmode)); + MEM_VOLATILE_P (operands[0]) = 1; +}) + +(define_expand "sync_compare_and_swapsi" + [(set (match_operand:SI 0 "s_register_operand") + (unspec_volatile:SI [(match_operand:SI 1 "memory_operand") + (match_operand:SI 2 "s_register_operand") + (match_operand:SI 3 "s_register_operand")] + VUNSPEC_SYNC_COMPARE_AND_SWAP))] + "TARGET_HAVE_LDREX && TARGET_HAVE_MEMORY_BARRIER" + { + struct arm_sync_generator generator; + generator.op = arm_sync_generator_omrn; + generator.u.omrn = gen_arm_sync_compare_and_swapsi; + arm_expand_sync (SImode, &generator, operands[0], operands[1], operands[2], + operands[3]); + DONE; + }) + +(define_mode_iterator NARROW [QI HI]) + +(define_expand "sync_compare_and_swap" + [(set (match_operand:NARROW 0 "s_register_operand") + (unspec_volatile:NARROW [(match_operand:NARROW 1 "memory_operand") + (match_operand:NARROW 2 "s_register_operand") + (match_operand:NARROW 3 "s_register_operand")] + VUNSPEC_SYNC_COMPARE_AND_SWAP))] + "TARGET_HAVE_LDREXBHD && TARGET_HAVE_MEMORY_BARRIER" + { + struct arm_sync_generator generator; + generator.op = arm_sync_generator_omrn; + generator.u.omrn = gen_arm_sync_compare_and_swap; + arm_expand_sync (mode, &generator, operands[0], operands[1], + operands[2], operands[3]); + DONE; + }) + +(define_expand "sync_lock_test_and_setsi" + [(match_operand:SI 0 "s_register_operand") + (match_operand:SI 1 "memory_operand") + (match_operand:SI 2 "s_register_operand")] + "TARGET_HAVE_LDREX && TARGET_HAVE_MEMORY_BARRIER" + { + struct arm_sync_generator generator; + generator.op = arm_sync_generator_omn; + generator.u.omn = gen_arm_sync_lock_test_and_setsi; + arm_expand_sync (SImode, &generator, operands[0], operands[1], NULL, + operands[2]); + DONE; + }) + +(define_expand "sync_lock_test_and_set" + [(match_operand:NARROW 0 "s_register_operand") + (match_operand:NARROW 1 "memory_operand") + (match_operand:NARROW 2 "s_register_operand")] + "TARGET_HAVE_LDREXBHD && TARGET_HAVE_MEMORY_BARRIER" + { + struct arm_sync_generator generator; + generator.op = arm_sync_generator_omn; + generator.u.omn = gen_arm_sync_lock_test_and_set; + arm_expand_sync (mode, &generator, operands[0], operands[1], NULL, + operands[2]); + DONE; + }) + +(define_code_iterator syncop [plus minus ior xor and]) + +(define_code_attr sync_optab [(ior "ior") + (xor "xor") + (and "and") + (plus "add") + (minus "sub")]) + +(define_code_attr sync_clobber [(ior "=&r") + (and "=&r") + (xor "X") + (plus "X") + (minus "X")]) + +(define_code_attr sync_t2_reqd [(ior "4") + (and "4") + (xor "*") + (plus "*") + (minus "*")]) + +(define_expand "sync_si" + [(match_operand:SI 0 "memory_operand") + (match_operand:SI 1 "s_register_operand") + (syncop:SI (match_dup 0) (match_dup 1))] + "TARGET_HAVE_LDREX && TARGET_HAVE_MEMORY_BARRIER" + { + struct arm_sync_generator generator; + generator.op = arm_sync_generator_omn; + generator.u.omn = gen_arm_sync_new_si; + arm_expand_sync (SImode, &generator, NULL, operands[0], NULL, operands[1]); + DONE; + }) + +(define_expand "sync_nandsi" + [(match_operand:SI 0 "memory_operand") + (match_operand:SI 1 "s_register_operand") + (not:SI (and:SI (match_dup 0) (match_dup 1)))] + "TARGET_HAVE_LDREX && TARGET_HAVE_MEMORY_BARRIER" + { + struct arm_sync_generator generator; + generator.op = arm_sync_generator_omn; + generator.u.omn = gen_arm_sync_new_nandsi; + arm_expand_sync (SImode, &generator, NULL, operands[0], NULL, operands[1]); + DONE; + }) + +(define_expand "sync_" + [(match_operand:NARROW 0 "memory_operand") + (match_operand:NARROW 1 "s_register_operand") + (syncop:NARROW (match_dup 0) (match_dup 1))] + "TARGET_HAVE_LDREXBHD && TARGET_HAVE_MEMORY_BARRIER" + { + struct arm_sync_generator generator; + generator.op = arm_sync_generator_omn; + generator.u.omn = gen_arm_sync_new_; + arm_expand_sync (mode, &generator, NULL, operands[0], NULL, + operands[1]); + DONE; + }) + +(define_expand "sync_nand" + [(match_operand:NARROW 0 "memory_operand") + (match_operand:NARROW 1 "s_register_operand") + (not:NARROW (and:NARROW (match_dup 0) (match_dup 1)))] + "TARGET_HAVE_LDREXBHD && TARGET_HAVE_MEMORY_BARRIER" + { + struct arm_sync_generator generator; + generator.op = arm_sync_generator_omn; + generator.u.omn = gen_arm_sync_new_nand; + arm_expand_sync (mode, &generator, NULL, operands[0], NULL, + operands[1]); + DONE; + }) + +(define_expand "sync_new_si" + [(match_operand:SI 0 "s_register_operand") + (match_operand:SI 1 "memory_operand") + (match_operand:SI 2 "s_register_operand") + (syncop:SI (match_dup 1) (match_dup 2))] + "TARGET_HAVE_LDREX && TARGET_HAVE_MEMORY_BARRIER" + { + struct arm_sync_generator generator; + generator.op = arm_sync_generator_omn; + generator.u.omn = gen_arm_sync_new_si; + arm_expand_sync (SImode, &generator, operands[0], operands[1], NULL, + operands[2]); + DONE; + }) + +(define_expand "sync_new_nandsi" + [(match_operand:SI 0 "s_register_operand") + (match_operand:SI 1 "memory_operand") + (match_operand:SI 2 "s_register_operand") + (not:SI (and:SI (match_dup 1) (match_dup 2)))] + "TARGET_HAVE_LDREX && TARGET_HAVE_MEMORY_BARRIER" + { + struct arm_sync_generator generator; + generator.op = arm_sync_generator_omn; + generator.u.omn = gen_arm_sync_new_nandsi; + arm_expand_sync (SImode, &generator, operands[0], operands[1], NULL, + operands[2]); + DONE; + }) + +(define_expand "sync_new_" + [(match_operand:NARROW 0 "s_register_operand") + (match_operand:NARROW 1 "memory_operand") + (match_operand:NARROW 2 "s_register_operand") + (syncop:NARROW (match_dup 1) (match_dup 2))] + "TARGET_HAVE_LDREXBHD && TARGET_HAVE_MEMORY_BARRIER" + { + struct arm_sync_generator generator; + generator.op = arm_sync_generator_omn; + generator.u.omn = gen_arm_sync_new_; + arm_expand_sync (mode, &generator, operands[0], operands[1], + NULL, operands[2]); + DONE; + }) + +(define_expand "sync_new_nand" + [(match_operand:NARROW 0 "s_register_operand") + (match_operand:NARROW 1 "memory_operand") + (match_operand:NARROW 2 "s_register_operand") + (not:NARROW (and:NARROW (match_dup 1) (match_dup 2)))] + "TARGET_HAVE_LDREXBHD && TARGET_HAVE_MEMORY_BARRIER" + { + struct arm_sync_generator generator; + generator.op = arm_sync_generator_omn; + generator.u.omn = gen_arm_sync_new_nand; + arm_expand_sync (mode, &generator, operands[0], operands[1], + NULL, operands[2]); + DONE; + }); + +(define_expand "sync_old_si" + [(match_operand:SI 0 "s_register_operand") + (match_operand:SI 1 "memory_operand") + (match_operand:SI 2 "s_register_operand") + (syncop:SI (match_dup 1) (match_dup 2))] + "TARGET_HAVE_LDREX && TARGET_HAVE_MEMORY_BARRIER" + { + struct arm_sync_generator generator; + generator.op = arm_sync_generator_omn; + generator.u.omn = gen_arm_sync_old_si; + arm_expand_sync (SImode, &generator, operands[0], operands[1], NULL, + operands[2]); + DONE; + }) + +(define_expand "sync_old_nandsi" + [(match_operand:SI 0 "s_register_operand") + (match_operand:SI 1 "memory_operand") + (match_operand:SI 2 "s_register_operand") + (not:SI (and:SI (match_dup 1) (match_dup 2)))] + "TARGET_HAVE_LDREX && TARGET_HAVE_MEMORY_BARRIER" + { + struct arm_sync_generator generator; + generator.op = arm_sync_generator_omn; + generator.u.omn = gen_arm_sync_old_nandsi; + arm_expand_sync (SImode, &generator, operands[0], operands[1], NULL, + operands[2]); + DONE; + }) + +(define_expand "sync_old_" + [(match_operand:NARROW 0 "s_register_operand") + (match_operand:NARROW 1 "memory_operand") + (match_operand:NARROW 2 "s_register_operand") + (syncop:NARROW (match_dup 1) (match_dup 2))] + "TARGET_HAVE_LDREXBHD && TARGET_HAVE_MEMORY_BARRIER" + { + struct arm_sync_generator generator; + generator.op = arm_sync_generator_omn; + generator.u.omn = gen_arm_sync_old_; + arm_expand_sync (mode, &generator, operands[0], operands[1], + NULL, operands[2]); + DONE; + }) + +(define_expand "sync_old_nand" + [(match_operand:NARROW 0 "s_register_operand") + (match_operand:NARROW 1 "memory_operand") + (match_operand:NARROW 2 "s_register_operand") + (not:NARROW (and:NARROW (match_dup 1) (match_dup 2)))] + "TARGET_HAVE_LDREXBHD && TARGET_HAVE_MEMORY_BARRIER" + { + struct arm_sync_generator generator; + generator.op = arm_sync_generator_omn; + generator.u.omn = gen_arm_sync_old_nand; + arm_expand_sync (mode, &generator, operands[0], operands[1], + NULL, operands[2]); + DONE; + }) + +(define_insn "arm_sync_compare_and_swapsi" + [(set (match_operand:SI 0 "s_register_operand" "=&r") + (unspec_volatile:SI + [(match_operand:SI 1 "arm_sync_memory_operand" "+Q") + (match_operand:SI 2 "s_register_operand" "r") + (match_operand:SI 3 "s_register_operand" "r")] + VUNSPEC_SYNC_COMPARE_AND_SWAP)) + (set (match_dup 1) (unspec_volatile:SI [(match_dup 2)] + VUNSPEC_SYNC_COMPARE_AND_SWAP)) + (set (reg:CC CC_REGNUM) (unspec_volatile:CC [(match_dup 1)] + VUNSPEC_SYNC_COMPARE_AND_SWAP)) + ] + "TARGET_HAVE_LDREX && TARGET_HAVE_MEMORY_BARRIER" + { + return arm_output_sync_insn (insn, operands); + } + [(set_attr "sync_result" "0") + (set_attr "sync_memory" "1") + (set_attr "sync_required_value" "2") + (set_attr "sync_new_value" "3") + (set_attr "sync_t1" "0") + (set_attr "conds" "clob") + (set_attr "predicable" "no")]) + +(define_insn "arm_sync_compare_and_swap" + [(set (match_operand:SI 0 "s_register_operand" "=&r") + (zero_extend:SI + (unspec_volatile:NARROW + [(match_operand:NARROW 1 "arm_sync_memory_operand" "+Q") + (match_operand:SI 2 "s_register_operand" "r") + (match_operand:SI 3 "s_register_operand" "r")] + VUNSPEC_SYNC_COMPARE_AND_SWAP))) + (set (match_dup 1) (unspec_volatile:NARROW [(match_dup 2)] + VUNSPEC_SYNC_COMPARE_AND_SWAP)) + (set (reg:CC CC_REGNUM) (unspec_volatile:CC [(match_dup 1)] + VUNSPEC_SYNC_COMPARE_AND_SWAP)) + ] + "TARGET_HAVE_LDREXBHD && TARGET_HAVE_MEMORY_BARRIER" + { + return arm_output_sync_insn (insn, operands); + } + [(set_attr "sync_result" "0") + (set_attr "sync_memory" "1") + (set_attr "sync_required_value" "2") + (set_attr "sync_new_value" "3") + (set_attr "sync_t1" "0") + (set_attr "conds" "clob") + (set_attr "predicable" "no")]) + +(define_insn "arm_sync_lock_test_and_setsi" + [(set (match_operand:SI 0 "s_register_operand" "=&r") + (match_operand:SI 1 "arm_sync_memory_operand" "+Q")) + (set (match_dup 1) + (unspec_volatile:SI [(match_operand:SI 2 "s_register_operand" "r")] + VUNSPEC_SYNC_LOCK)) + (clobber (reg:CC CC_REGNUM)) + (clobber (match_scratch:SI 3 "=&r"))] + "TARGET_HAVE_LDREX && TARGET_HAVE_MEMORY_BARRIER" + { + return arm_output_sync_insn (insn, operands); + } + [(set_attr "sync_release_barrier" "no") + (set_attr "sync_result" "0") + (set_attr "sync_memory" "1") + (set_attr "sync_new_value" "2") + (set_attr "sync_t1" "0") + (set_attr "sync_t2" "3") + (set_attr "conds" "clob") + (set_attr "predicable" "no")]) + +(define_insn "arm_sync_lock_test_and_set" + [(set (match_operand:SI 0 "s_register_operand" "=&r") + (zero_extend:SI (match_operand:NARROW 1 "arm_sync_memory_operand" "+Q"))) + (set (match_dup 1) + (unspec_volatile:NARROW [(match_operand:SI 2 "s_register_operand" "r")] + VUNSPEC_SYNC_LOCK)) + (clobber (reg:CC CC_REGNUM)) + (clobber (match_scratch:SI 3 "=&r"))] + "TARGET_HAVE_LDREX && TARGET_HAVE_MEMORY_BARRIER" + { + return arm_output_sync_insn (insn, operands); + } + [(set_attr "sync_release_barrier" "no") + (set_attr "sync_result" "0") + (set_attr "sync_memory" "1") + (set_attr "sync_new_value" "2") + (set_attr "sync_t1" "0") + (set_attr "sync_t2" "3") + (set_attr "conds" "clob") + (set_attr "predicable" "no")]) + +(define_insn "arm_sync_new_si" + [(set (match_operand:SI 0 "s_register_operand" "=&r") + (unspec_volatile:SI [(syncop:SI + (match_operand:SI 1 "arm_sync_memory_operand" "+Q") + (match_operand:SI 2 "s_register_operand" "r")) + ] + VUNSPEC_SYNC_NEW_OP)) + (set (match_dup 1) + (unspec_volatile:SI [(match_dup 1) (match_dup 2)] + VUNSPEC_SYNC_NEW_OP)) + (clobber (reg:CC CC_REGNUM)) + (clobber (match_scratch:SI 3 "=&r"))] + "TARGET_HAVE_LDREX && TARGET_HAVE_MEMORY_BARRIER" + { + return arm_output_sync_insn (insn, operands); + } + [(set_attr "sync_result" "0") + (set_attr "sync_memory" "1") + (set_attr "sync_new_value" "2") + (set_attr "sync_t1" "0") + (set_attr "sync_t2" "3") + (set_attr "sync_op" "") + (set_attr "conds" "clob") + (set_attr "predicable" "no")]) + +(define_insn "arm_sync_new_nandsi" + [(set (match_operand:SI 0 "s_register_operand" "=&r") + (unspec_volatile:SI [(not:SI (and:SI + (match_operand:SI 1 "arm_sync_memory_operand" "+Q") + (match_operand:SI 2 "s_register_operand" "r"))) + ] + VUNSPEC_SYNC_NEW_OP)) + (set (match_dup 1) + (unspec_volatile:SI [(match_dup 1) (match_dup 2)] + VUNSPEC_SYNC_NEW_OP)) + (clobber (reg:CC CC_REGNUM)) + (clobber (match_scratch:SI 3 "=&r"))] + "TARGET_HAVE_LDREX && TARGET_HAVE_MEMORY_BARRIER" + { + return arm_output_sync_insn (insn, operands); + } + [(set_attr "sync_result" "0") + (set_attr "sync_memory" "1") + (set_attr "sync_new_value" "2") + (set_attr "sync_t1" "0") + (set_attr "sync_t2" "3") + (set_attr "sync_op" "nand") + (set_attr "conds" "clob") + (set_attr "predicable" "no")]) + +(define_insn "arm_sync_new_" + [(set (match_operand:SI 0 "s_register_operand" "=&r") + (unspec_volatile:SI [(syncop:SI + (zero_extend:SI + (match_operand:NARROW 1 "arm_sync_memory_operand" "+Q")) + (match_operand:SI 2 "s_register_operand" "r")) + ] + VUNSPEC_SYNC_NEW_OP)) + (set (match_dup 1) + (unspec_volatile:NARROW [(match_dup 1) (match_dup 2)] + VUNSPEC_SYNC_NEW_OP)) + (clobber (reg:CC CC_REGNUM)) + (clobber (match_scratch:SI 3 "=&r"))] + "TARGET_HAVE_LDREXBHD && TARGET_HAVE_MEMORY_BARRIER" + { + return arm_output_sync_insn (insn, operands); + } + [(set_attr "sync_result" "0") + (set_attr "sync_memory" "1") + (set_attr "sync_new_value" "2") + (set_attr "sync_t1" "0") + (set_attr "sync_t2" "3") + (set_attr "sync_op" "") + (set_attr "conds" "clob") + (set_attr "predicable" "no")]) + +(define_insn "arm_sync_new_nand" + [(set (match_operand:SI 0 "s_register_operand" "=&r") + (unspec_volatile:SI + [(not:SI + (and:SI + (zero_extend:SI + (match_operand:NARROW 1 "arm_sync_memory_operand" "+Q")) + (match_operand:SI 2 "s_register_operand" "r"))) + ] VUNSPEC_SYNC_NEW_OP)) + (set (match_dup 1) + (unspec_volatile:NARROW [(match_dup 1) (match_dup 2)] + VUNSPEC_SYNC_NEW_OP)) + (clobber (reg:CC CC_REGNUM)) + (clobber (match_scratch:SI 3 "=&r"))] + "TARGET_HAVE_LDREX && TARGET_HAVE_MEMORY_BARRIER" + { + return arm_output_sync_insn (insn, operands); + } + [(set_attr "sync_result" "0") + (set_attr "sync_memory" "1") + (set_attr "sync_new_value" "2") + (set_attr "sync_t1" "0") + (set_attr "sync_t2" "3") + (set_attr "sync_op" "nand") + (set_attr "conds" "clob") + (set_attr "predicable" "no")]) + +(define_insn "arm_sync_old_si" + [(set (match_operand:SI 0 "s_register_operand" "=&r") + (unspec_volatile:SI [(syncop:SI + (match_operand:SI 1 "arm_sync_memory_operand" "+Q") + (match_operand:SI 2 "s_register_operand" "r")) + ] + VUNSPEC_SYNC_OLD_OP)) + (set (match_dup 1) + (unspec_volatile:SI [(match_dup 1) (match_dup 2)] + VUNSPEC_SYNC_OLD_OP)) + (clobber (reg:CC CC_REGNUM)) + (clobber (match_scratch:SI 3 "=&r")) + (clobber (match_scratch:SI 4 ""))] + "TARGET_HAVE_LDREX && TARGET_HAVE_MEMORY_BARRIER" + { + return arm_output_sync_insn (insn, operands); + } + [(set_attr "sync_result" "0") + (set_attr "sync_memory" "1") + (set_attr "sync_new_value" "2") + (set_attr "sync_t1" "3") + (set_attr "sync_t2" "") + (set_attr "sync_op" "") + (set_attr "conds" "clob") + (set_attr "predicable" "no")]) + +(define_insn "arm_sync_old_nandsi" + [(set (match_operand:SI 0 "s_register_operand" "=&r") + (unspec_volatile:SI [(not:SI (and:SI + (match_operand:SI 1 "arm_sync_memory_operand" "+Q") + (match_operand:SI 2 "s_register_operand" "r"))) + ] + VUNSPEC_SYNC_OLD_OP)) + (set (match_dup 1) + (unspec_volatile:SI [(match_dup 1) (match_dup 2)] + VUNSPEC_SYNC_OLD_OP)) + (clobber (reg:CC CC_REGNUM)) + (clobber (match_scratch:SI 3 "=&r")) + (clobber (match_scratch:SI 4 "=&r"))] + "TARGET_HAVE_LDREX && TARGET_HAVE_MEMORY_BARRIER" + { + return arm_output_sync_insn (insn, operands); + } + [(set_attr "sync_result" "0") + (set_attr "sync_memory" "1") + (set_attr "sync_new_value" "2") + (set_attr "sync_t1" "3") + (set_attr "sync_t2" "4") + (set_attr "sync_op" "nand") + (set_attr "conds" "clob") + (set_attr "predicable" "no")]) + +(define_insn "arm_sync_old_" + [(set (match_operand:SI 0 "s_register_operand" "=&r") + (unspec_volatile:SI [(syncop:SI + (zero_extend:SI + (match_operand:NARROW 1 "arm_sync_memory_operand" "+Q")) + (match_operand:SI 2 "s_register_operand" "r")) + ] + VUNSPEC_SYNC_OLD_OP)) + (set (match_dup 1) + (unspec_volatile:NARROW [(match_dup 1) (match_dup 2)] + VUNSPEC_SYNC_OLD_OP)) + (clobber (reg:CC CC_REGNUM)) + (clobber (match_scratch:SI 3 "=&r")) + (clobber (match_scratch:SI 4 ""))] + "TARGET_HAVE_LDREXBHD && TARGET_HAVE_MEMORY_BARRIER" + { + return arm_output_sync_insn (insn, operands); + } + [(set_attr "sync_result" "0") + (set_attr "sync_memory" "1") + (set_attr "sync_new_value" "2") + (set_attr "sync_t1" "3") + (set_attr "sync_t2" "") + (set_attr "sync_op" "") + (set_attr "conds" "clob") + (set_attr "predicable" "no")]) + +(define_insn "arm_sync_old_nand" + [(set (match_operand:SI 0 "s_register_operand" "=&r") + (unspec_volatile:SI [(not:SI (and:SI + (zero_extend:SI + (match_operand:NARROW 1 "arm_sync_memory_operand" "+Q")) + (match_operand:SI 2 "s_register_operand" "r"))) + ] + VUNSPEC_SYNC_OLD_OP)) + (set (match_dup 1) + (unspec_volatile:NARROW [(match_dup 1) (match_dup 2)] + VUNSPEC_SYNC_OLD_OP)) + (clobber (reg:CC CC_REGNUM)) + (clobber (match_scratch:SI 3 "=&r")) + (clobber (match_scratch:SI 4 "=&r"))] + "TARGET_HAVE_LDREXBHD && TARGET_HAVE_MEMORY_BARRIER" + { + return arm_output_sync_insn (insn, operands); + } + [(set_attr "sync_result" "0") + (set_attr "sync_memory" "1") + (set_attr "sync_new_value" "2") + (set_attr "sync_t1" "3") + (set_attr "sync_t2" "4") + (set_attr "sync_op" "nand") + (set_attr "conds" "clob") + (set_attr "predicable" "no")]) + +(define_insn "*memory_barrier" + [(set (match_operand:BLK 0 "" "") + (unspec:BLK [(match_dup 0)] UNSPEC_MEMORY_BARRIER))] + "TARGET_HAVE_MEMORY_BARRIER" + { + return arm_output_memory_barrier (operands); + } + [(set_attr "length" "4") + (set_attr "conds" "unconditional") + (set_attr "predicable" "no")]) + diff --git a/gcc/config/arm/t-arm b/gcc/config/arm/t-arm new file mode 100644 index 000000000..33d7e19f7 --- /dev/null +++ b/gcc/config/arm/t-arm @@ -0,0 +1,66 @@ +# Rules common to all arm targets +# +# Copyright (C) 2004, 2005, 2007, 2008, 2009, 2010 +# Free Software Foundation, Inc. +# +# This file is part of GCC. +# +# GCC is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# GCC is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with GCC; see the file COPYING3. If not see +# . + +MD_INCLUDES= $(srcdir)/config/arm/arm-tune.md \ + $(srcdir)/config/arm/predicates.md \ + $(srcdir)/config/arm/arm-generic.md \ + $(srcdir)/config/arm/arm1020e.md \ + $(srcdir)/config/arm/arm1026ejs.md \ + $(srcdir)/config/arm/arm1136jfs.md \ + $(srcdir)/config/arm/fa526.md \ + $(srcdir)/config/arm/fa606te.md \ + $(srcdir)/config/arm/fa626te.md \ + $(srcdir)/config/arm/fmp626.md \ + $(srcdir)/config/arm/fa726te.md \ + $(srcdir)/config/arm/arm926ejs.md \ + $(srcdir)/config/arm/cirrus.md \ + $(srcdir)/config/arm/fpa.md \ + $(srcdir)/config/arm/vec-common.md \ + $(srcdir)/config/arm/iwmmxt.md \ + $(srcdir)/config/arm/vfp.md \ + $(srcdir)/config/arm/neon.md \ + $(srcdir)/config/arm/thumb2.md + +LIB1ASMSRC = arm/lib1funcs.asm +LIB1ASMFUNCS = _thumb1_case_sqi _thumb1_case_uqi _thumb1_case_shi \ + _thumb1_case_uhi _thumb1_case_si +s-config s-conditions s-flags s-codes s-constants s-emit s-recog s-preds \ + s-opinit s-extract s-peep s-attr s-attrtab s-output: $(MD_INCLUDES) + +$(srcdir)/config/arm/arm-tune.md: $(srcdir)/config/arm/gentune.sh \ + $(srcdir)/config/arm/arm-cores.def + $(SHELL) $(srcdir)/config/arm/gentune.sh \ + $(srcdir)/config/arm/arm-cores.def > \ + $(srcdir)/config/arm/arm-tune.md + +arm.o: $(CONFIG_H) $(SYSTEM_H) coretypes.h $(TM_H) \ + $(RTL_H) $(TREE_H) $(OBSTACK_H) $(REGS_H) hard-reg-set.h \ + insn-config.h conditions.h output.h \ + $(INSN_ATTR_H) $(FLAGS_H) reload.h $(FUNCTION_H) \ + $(EXPR_H) $(OPTABS_H) $(RECOG_H) $(CGRAPH_H) \ + $(GGC_H) except.h $(C_PRAGMA_H) $(INTEGRATE_H) $(TM_P_H) \ + $(TARGET_H) $(TARGET_DEF_H) debug.h langhooks.h $(DF_H) \ + intl.h libfuncs.h $(PARAMS_H) + +arm-c.o: $(srcdir)/config/arm/arm-c.c $(CONFIG_H) $(SYSTEM_H) \ + coretypes.h $(TM_H) $(TREE_H) output.h $(C_COMMON_H) + $(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \ + $(srcdir)/config/arm/arm-c.c diff --git a/gcc/config/arm/t-arm-elf b/gcc/config/arm/t-arm-elf new file mode 100644 index 000000000..38c291827 --- /dev/null +++ b/gcc/config/arm/t-arm-elf @@ -0,0 +1,128 @@ +# Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, +# 2008, 2010 Free Software Foundation, Inc. +# +# This file is part of GCC. +# +# GCC is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# GCC is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with GCC; see the file COPYING3. If not see +# . + +# For most CPUs we have an assembly soft-float implementations. +# However this is not true for ARMv6M. Here we want to use the soft-fp C +# implementation. The soft-fp code is only build for ARMv6M. This pulls +# in the asm implementation for other CPUs. +LIB1ASMFUNCS += _udivsi3 _divsi3 _umodsi3 _modsi3 _dvmd_tls _bb_init_func \ + _call_via_rX _interwork_call_via_rX \ + _lshrdi3 _ashrdi3 _ashldi3 \ + _arm_negdf2 _arm_addsubdf3 _arm_muldivdf3 _arm_cmpdf2 _arm_unorddf2 \ + _arm_fixdfsi _arm_fixunsdfsi \ + _arm_truncdfsf2 _arm_negsf2 _arm_addsubsf3 _arm_muldivsf3 \ + _arm_cmpsf2 _arm_unordsf2 _arm_fixsfsi _arm_fixunssfsi \ + _arm_floatdidf _arm_floatdisf _arm_floatundidf _arm_floatundisf \ + _clzsi2 _clzdi2 + +MULTILIB_OPTIONS = marm/mthumb +MULTILIB_DIRNAMES = arm thumb +MULTILIB_EXCEPTIONS = +MULTILIB_MATCHES = + +#MULTILIB_OPTIONS += mcpu=fa526/mcpu=fa626/mcpu=fa606te/mcpu=fa626te/mcpu=fmp626/mcpu=fa726te +#MULTILIB_DIRNAMES += fa526 fa626 fa606te fa626te fmp626 fa726te +#MULTILIB_EXCEPTIONS += *mthumb*/*mcpu=fa526 *mthumb*/*mcpu=fa626 + +#MULTILIB_OPTIONS += march=armv7 +#MULTILIB_DIRNAMES += thumb2 +#MULTILIB_EXCEPTIONS += march=armv7* marm/*march=armv7* +#MULTILIB_MATCHES += march?armv7=march?armv7-a +#MULTILIB_MATCHES += march?armv7=march?armv7-r +#MULTILIB_MATCHES += march?armv7=march?armv7-m +#MULTILIB_MATCHES += march?armv7=mcpu?cortex-a8 +#MULTILIB_MATCHES += march?armv7=mcpu?cortex-r4 +#MULTILIB_MATCHES += march?armv7=mcpu?cortex-m3 + +# Not quite true. We can support hard-vfp calling in Thumb2, but how do we +# express that here? Also, we really need architecture v5e or later +# (mcrr etc). +MULTILIB_OPTIONS += mfloat-abi=hard +MULTILIB_DIRNAMES += fpu +MULTILIB_EXCEPTIONS += *mthumb/*mfloat-abi=hard* +#MULTILIB_EXCEPTIONS += *mcpu=fa526/*mfloat-abi=hard* +#MULTILIB_EXCEPTIONS += *mcpu=fa626/*mfloat-abi=hard* + +# MULTILIB_OPTIONS += mcpu=ep9312 +# MULTILIB_DIRNAMES += ep9312 +# MULTILIB_EXCEPTIONS += *mthumb/*mcpu=ep9312* +# +# MULTILIB_OPTIONS += mlittle-endian/mbig-endian +# MULTILIB_DIRNAMES += le be +# MULTILIB_MATCHES += mbig-endian=mbe mlittle-endian=mle +# +# MULTILIB_OPTIONS += mhard-float/msoft-float +# MULTILIB_DIRNAMES += fpu soft +# MULTILIB_EXCEPTIONS += *mthumb/*mhard-float* +# +# MULTILIB_OPTIONS += mno-thumb-interwork/mthumb-interwork +# MULTILIB_DIRNAMES += normal interwork +# +# MULTILIB_OPTIONS += fno-leading-underscore/fleading-underscore +# MULTILIB_DIRNAMES += elf under +# +# MULTILIB_OPTIONS += mcpu=arm7 +# MULTILIB_DIRNAMES += nofmult +# MULTILIB_EXCEPTIONS += *mthumb*/*mcpu=arm7* +# # Note: the multilib_exceptions matches both -mthumb and +# # -mthumb-interwork +# # +# # We have to match all the arm cpu variants which do not have the +# # multiply instruction and treat them as if the user had specified +# # -mcpu=arm7. Note that in the following the ? is interpreted as +# # an = for the purposes of matching command line options. +# # FIXME: There ought to be a better way to do this. +# MULTILIB_MATCHES += mcpu?arm7=mcpu?arm7d +# MULTILIB_MATCHES += mcpu?arm7=mcpu?arm7di +# MULTILIB_MATCHES += mcpu?arm7=mcpu?arm70 +# MULTILIB_MATCHES += mcpu?arm7=mcpu?arm700 +# MULTILIB_MATCHES += mcpu?arm7=mcpu?arm700i +# MULTILIB_MATCHES += mcpu?arm7=mcpu?arm710 +# MULTILIB_MATCHES += mcpu?arm7=mcpu?arm710c +# MULTILIB_MATCHES += mcpu?arm7=mcpu?arm7100 +# MULTILIB_MATCHES += mcpu?arm7=mcpu?arm7500 +# MULTILIB_MATCHES += mcpu?arm7=mcpu?arm7500fe +# MULTILIB_MATCHES += mcpu?arm7=mcpu?arm6 +# MULTILIB_MATCHES += mcpu?arm7=mcpu?arm60 +# MULTILIB_MATCHES += mcpu?arm7=mcpu?arm600 +# MULTILIB_MATCHES += mcpu?arm7=mcpu?arm610 +# MULTILIB_MATCHES += mcpu?arm7=mcpu?arm620 + +EXTRA_MULTILIB_PARTS = crtbegin.o crtend.o crti.o crtn.o + +# If EXTRA_MULTILIB_PARTS is not defined above then define EXTRA_PARTS here +# EXTRA_PARTS = crtbegin.o crtend.o crti.o crtn.o + +LIBGCC = stmp-multilib +INSTALL_LIBGCC = install-multilib + +# Currently there is a bug somewhere in GCC's alias analysis +# or scheduling code that is breaking _fpmul_parts in fp-bit.c. +# Disabling function inlining is a workaround for this problem. +TARGET_LIBGCC2_CFLAGS = -fno-inline + +# Assemble startup files. +$(T)crti.o: $(srcdir)/config/arm/crti.asm $(GCC_PASSES) + $(GCC_FOR_TARGET) $(GCC_CFLAGS) $(MULTILIB_CFLAGS) $(INCLUDES) \ + -c -o $(T)crti.o -x assembler-with-cpp $(srcdir)/config/arm/crti.asm + +$(T)crtn.o: $(srcdir)/config/arm/crtn.asm $(GCC_PASSES) + $(GCC_FOR_TARGET) $(GCC_CFLAGS) $(MULTILIB_CFLAGS) $(INCLUDES) \ + -c -o $(T)crtn.o -x assembler-with-cpp $(srcdir)/config/arm/crtn.asm + diff --git a/gcc/config/arm/t-arm-softfp b/gcc/config/arm/t-arm-softfp new file mode 100644 index 000000000..f9cace97e --- /dev/null +++ b/gcc/config/arm/t-arm-softfp @@ -0,0 +1,29 @@ +# Copyright (C) 2008 Free Software Foundation, Inc. +# +# This file is part of GCC. +# +# GCC is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# GCC is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with GCC; see the file COPYING3. If not see +# . + +softfp_float_modes := sf df +softfp_int_modes := si di +softfp_extensions := sfdf +softfp_truncations := dfsf +softfp_machine_header := arm/sfp-machine.h +softfp_exclude_libgcc2 := y +softfp_wrap_start := '\#ifdef __ARM_ARCH_6M__' +softfp_wrap_end := '\#endif' + +# softfp seems to be missing a whole bunch of prototypes. +TARGET_LIBGCC2_CFLAGS += -Wno-missing-prototypes diff --git a/gcc/config/arm/t-bpabi b/gcc/config/arm/t-bpabi new file mode 100644 index 000000000..61da9ec7b --- /dev/null +++ b/gcc/config/arm/t-bpabi @@ -0,0 +1,36 @@ +# Copyright (C) 2004, 2005 Free Software Foundation, Inc. +# +# This file is part of GCC. +# +# GCC is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# GCC is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with GCC; see the file COPYING3. If not see +# . + +# Add the bpabi.S functions. +LIB1ASMFUNCS += _aeabi_lcmp _aeabi_ulcmp _aeabi_ldivmod _aeabi_uldivmod + +# Add the BPABI C functions. +LIB2FUNCS_EXTRA = $(srcdir)/config/arm/bpabi.c \ + $(srcdir)/config/arm/unaligned-funcs.c + +LIB2FUNCS_STATIC_EXTRA = $(srcdir)/config/arm/fp16.c + +UNWIND_H = $(srcdir)/config/arm/unwind-arm.h +LIB2ADDEH = $(srcdir)/config/arm/unwind-arm.c \ + $(srcdir)/config/arm/libunwind.S \ + $(srcdir)/config/arm/pr-support.c $(srcdir)/unwind-c.c +LIB2ADDEHDEP = $(UNWIND_H) $(srcdir)/config/$(LIB1ASMSRC) + +# Add the BPABI names. +SHLIB_MAPFILES += $(srcdir)/config/arm/libgcc-bpabi.ver + diff --git a/gcc/config/arm/t-linux b/gcc/config/arm/t-linux new file mode 100644 index 000000000..a6fddad50 --- /dev/null +++ b/gcc/config/arm/t-linux @@ -0,0 +1,34 @@ +# Copyright (C) 1997, 1998, 1999, 2000, 2001, 2003, 2004, 2006, +# 2008 Free Software Foundation, Inc. +# +# This file is part of GCC. +# +# GCC is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# GCC is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with GCC; see the file COPYING3. If not see +# . + +# Just for these, we omit the frame pointer since it makes such a big +# difference. +TARGET_LIBGCC2_CFLAGS = -fomit-frame-pointer -fPIC + +LIB1ASMSRC = arm/lib1funcs.asm +LIB1ASMFUNCS = _udivsi3 _divsi3 _umodsi3 _modsi3 _dvmd_lnx _clzsi2 _clzdi2 \ + _arm_addsubdf3 _arm_addsubsf3 + +# MULTILIB_OPTIONS = mhard-float/msoft-float +# MULTILIB_DIRNAMES = hard-float soft-float + +# EXTRA_MULTILIB_PARTS = crtbegin.o crtend.o + +# LIBGCC = stmp-multilib +# INSTALL_LIBGCC = install-multilib diff --git a/gcc/config/arm/t-linux-androideabi b/gcc/config/arm/t-linux-androideabi new file mode 100644 index 000000000..8f1307c55 --- /dev/null +++ b/gcc/config/arm/t-linux-androideabi @@ -0,0 +1,10 @@ +MULTILIB_OPTIONS = march=armv7-a mthumb +MULTILIB_DIRNAMES = armv7-a thumb +MULTILIB_EXCEPTIONS = +MULTILIB_MATCHES = +MULTILIB_OSDIRNAMES = + +# The "special" multilib can be used to build native applications for Android, +# as opposed to native shared libraries that are then called via JNI. +#MULTILIB_OPTIONS += tno-android-cc +#MULTILIB_DIRNAMES += special diff --git a/gcc/config/arm/t-linux-eabi b/gcc/config/arm/t-linux-eabi new file mode 100644 index 000000000..39de9aefe --- /dev/null +++ b/gcc/config/arm/t-linux-eabi @@ -0,0 +1,43 @@ +# Copyright (C) 2005, 2009, 2010, 2012 Free Software Foundation, Inc. +# +# This file is part of GCC. +# +# GCC is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# GCC is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with GCC; see the file COPYING3. If not see +# . + +# These functions are included in shared libraries. +TARGET_LIBGCC2_CFLAGS = -fPIC + +# We do not build a Thumb multilib for Linux because the definition of +# CLEAR_INSN_CACHE in linux-gas.h does not work in Thumb mode. +MULTILIB_OPTIONS = +MULTILIB_DIRNAMES = + +#MULTILIB_OPTIONS += mcpu=fa606te/mcpu=fa626te/mcpu=fmp626/mcpu=fa726te +#MULTILIB_DIRNAMES += fa606te fa626te fmp626 fa726te +#MULTILIB_EXCEPTIONS += *mthumb/*mcpu=fa606te *mthumb/*mcpu=fa626te *mthumb/*mcpu=fmp626 *mthumb/*mcpu=fa726te* + +ifneq (,$(findstring gnueabi,$(target))) +ARM_EB = $(if $(findstring TARGET_BIG_ENDIAN_DEFAULT=1, $(tm_defines)),eb) +MULTIARCH_DIRNAME = $(call if_multiarch,arm$(ARM_EB)-linux-gnueabi$(if $(filter hard,$(with_float)),hf)) +endif + +# Use a version of div0 which raises SIGFPE, and a special __clear_cache. +LIB1ASMFUNCS := $(filter-out _dvmd_tls,$(LIB1ASMFUNCS)) _dvmd_lnx _clear_cache + +# Multilib the standard Linux files. Don't include crti.o or crtn.o, +# which are provided by glibc. +EXTRA_MULTILIB_PARTS=crtbegin.o crtend.o crtbeginS.o crtendS.o crtbeginT.o + +LIB2FUNCS_STATIC_EXTRA += $(srcdir)/config/arm/linux-atomic.c diff --git a/gcc/config/arm/t-netbsd b/gcc/config/arm/t-netbsd new file mode 100644 index 000000000..22bbbe7dd --- /dev/null +++ b/gcc/config/arm/t-netbsd @@ -0,0 +1,47 @@ +# Copyright (C) 1997, 1998, 1999, 2000, 2001, 2003, 2004, 2005, +# 2006 Free Software Foundation, Inc. +# +# This file is part of GCC. +# +# GCC is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# GCC is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with GCC; see the file COPYING3. If not see +# . + +# Just for these, we omit the frame pointer since it makes such a big +# difference. It is then pointless adding debugging. +TARGET_LIBGCC2_CFLAGS = -fomit-frame-pointer -fpic +LIBGCC2_DEBUG_CFLAGS = -g0 +LIB2FUNCS_EXTRA = $(srcdir)/config/floatunsidf.c $(srcdir)/config/floatunsisf.c + +# Build a shared libgcc library. +SHLIB_EXT = .so +SHLIB_NAME = @shlib_base_name@.so +SHLIB_SONAME = @shlib_base_name@.so.1 +SHLIB_OBJS = @shlib_objs@ + +SHLIB_LINK = $(GCC_FOR_TARGET) $(LIBGCC2_CFLAGS) -shared -nodefaultlibs \ + -Wl,-soname,$(SHLIB_SONAME) \ + -o $(SHLIB_NAME).tmp @multilib_flags@ $(SHLIB_OBJS) -lc && \ + rm -f $(SHLIB_SONAME) && \ + if [ -f $(SHLIB_NAME) ]; then \ + mv -f $(SHLIB_NAME) $(SHLIB_NAME).backup; \ + else true; fi && \ + mv $(SHLIB_NAME).tmp $(SHLIB_NAME) && \ + $(LN_S) $(SHLIB_NAME) $(SHLIB_SONAME) +# $(slibdir) double quoted to protect it from expansion while building +# libgcc.mk. We want this delayed until actual install time. +SHLIB_INSTALL = \ + $$(mkinstalldirs) $$(DESTDIR)$$(slibdir); \ + $(INSTALL_DATA) $(SHLIB_NAME) $$(DESTDIR)$$(slibdir)/$(SHLIB_SONAME); \ + rm -f $$(DESTDIR)$$(slibdir)/$(SHLIB_NAME); \ + $(LN_S) $(SHLIB_SONAME) $$(DESTDIR)$$(slibdir)/$(SHLIB_NAME) diff --git a/gcc/config/arm/t-pe b/gcc/config/arm/t-pe new file mode 100644 index 000000000..626b1d29a --- /dev/null +++ b/gcc/config/arm/t-pe @@ -0,0 +1,52 @@ +# Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2006, 2008, 2009, +# 2010 +# Free Software Foundation, Inc. +# +# This file is part of GCC. +# +# GCC is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# GCC is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with GCC; see the file COPYING3. If not see +# . + +LIB1ASMFUNCS += _udivsi3 _divsi3 _umodsi3 _modsi3 _dvmd_tls _call_via_rX _interwork_call_via_rX _clzsi2 _clzdi2 + +# We want fine grained libraries, so use the new code to build the +# floating point emulation libraries. +FPBIT = fp-bit.c +DPBIT = dp-bit.c + +fp-bit.c: $(srcdir)/config/fp-bit.c + echo '#define FLOAT' > fp-bit.c + echo '#ifndef __ARMEB__' >> fp-bit.c + echo '#define FLOAT_BIT_ORDER_MISMATCH' >> fp-bit.c + echo '#endif' >> fp-bit.c + cat $(srcdir)/config/fp-bit.c >> fp-bit.c + +dp-bit.c: $(srcdir)/config/fp-bit.c + echo '#ifndef __ARMEB__' > dp-bit.c + echo '#define FLOAT_BIT_ORDER_MISMATCH' >> dp-bit.c + echo '#define FLOAT_WORD_ORDER_MISMATCH' >> dp-bit.c + echo '#endif' >> dp-bit.c + cat $(srcdir)/config/fp-bit.c >> dp-bit.c + +pe.o: $(srcdir)/config/arm/pe.c $(CONFIG_H) $(SYSTEM_H) coretypes.h $(TM_H) \ + $(RTL_H) output.h flags.h $(TREE_H) expr.h $(TM_P_H) + $(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \ + $(srcdir)/config/arm/pe.c + +MULTILIB_OPTIONS = mhard-float mthumb +MULTILIB_DIRNAMES = fpu thumb + +LIBGCC = stmp-multilib +INSTALL_LIBGCC = install-multilib +TARGET_LIBGCC2_CFLAGS = diff --git a/gcc/config/arm/t-rtems b/gcc/config/arm/t-rtems new file mode 100644 index 000000000..52d14bab0 --- /dev/null +++ b/gcc/config/arm/t-rtems @@ -0,0 +1,10 @@ +# Custom rtems multilibs + +MULTILIB_OPTIONS = marm/mthumb +MULTILIB_DIRNAMES = arm thumb +MULTILIB_EXCEPTIONS = +MULTILIB_MATCHES = marm=mno-thumb + +MULTILIB_OPTIONS += msoft-float/mhard-float +MULTILIB_DIRNAMES += soft fpu +MULTILIB_EXCEPTIONS += *mthumb/*mhard-float* diff --git a/gcc/config/arm/t-rtems-eabi b/gcc/config/arm/t-rtems-eabi new file mode 100644 index 000000000..f0e714a9b --- /dev/null +++ b/gcc/config/arm/t-rtems-eabi @@ -0,0 +1,8 @@ +# Custom RTEMS EABI multilibs + +MULTILIB_OPTIONS = mthumb march=armv6-m/march=armv7/march=armv7-m +MULTILIB_DIRNAMES = thumb armv6-m armv7 armv7-m +MULTILIB_EXCEPTIONS = march=armv6-m march=armv7 march=armv7-m +MULTILIB_MATCHES = +MULTILIB_EXCLUSIONS = +MULTILIB_OSDIRNAMES = diff --git a/gcc/config/arm/t-strongarm-elf b/gcc/config/arm/t-strongarm-elf new file mode 100644 index 000000000..64d7ca694 --- /dev/null +++ b/gcc/config/arm/t-strongarm-elf @@ -0,0 +1,61 @@ +# Copyright (C) 2000, 2001, 2006, 2008 Free Software Foundation, Inc. +# +# This file is part of GCC. +# +# GCC is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# GCC is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with GCC; see the file COPYING3. If not see +# . + +LIB1ASMFUNCS += _udivsi3 _divsi3 _umodsi3 _modsi3 _dvmd_tls _bb_init_func _clzsi2 _clzdi2 + +# We want fine grained libraries, so use the new code to build the +# floating point emulation libraries. +FPBIT = fp-bit.c +DPBIT = dp-bit.c + +fp-bit.c: $(srcdir)/config/fp-bit.c + echo '#define FLOAT' > fp-bit.c + echo '#ifndef __ARMEB__' >> fp-bit.c + echo '#define FLOAT_BIT_ORDER_MISMATCH' >> fp-bit.c + echo '#endif' >> fp-bit.c + cat $(srcdir)/config/fp-bit.c >> fp-bit.c + +dp-bit.c: $(srcdir)/config/fp-bit.c + echo '#ifndef __ARMEB__' > dp-bit.c + echo '#define FLOAT_BIT_ORDER_MISMATCH' >> dp-bit.c + echo '#define FLOAT_WORD_ORDER_MISMATCH' >> dp-bit.c + echo '#endif' >> dp-bit.c + cat $(srcdir)/config/fp-bit.c >> dp-bit.c + +MULTILIB_OPTIONS = mlittle-endian/mbig-endian mhard-float/msoft-float +MULTILIB_DIRNAMES = le be fpu soft +MULTILIB_EXCEPTIONS = +MULTILIB_MATCHES = mbig-endian=mbe mlittle-endian=mle +EXTRA_MULTILIB_PARTS = crtbegin.o crtend.o crti.o crtn.o + +LIBGCC = stmp-multilib +INSTALL_LIBGCC = install-multilib + +# Currently there is a bug somewhere in GCC's alias analysis +# or scheduling code that is breaking _fpmul_parts in fp-bit.c. +# Disabling function inlining is a workaround for this problem. +TARGET_LIBGCC2_CFLAGS = -fno-inline + +# Assemble startup files. +$(T)crti.o: $(srcdir)/config/arm/crti.asm $(GCC_PASSES) + $(GCC_FOR_TARGET) $(GCC_CFLAGS) $(MULTILIB_CFLAGS) $(INCLUDES) \ + -c -o $(T)crti.o -x assembler-with-cpp $(srcdir)/config/arm/crti.asm + +$(T)crtn.o: $(srcdir)/config/arm/crtn.asm $(GCC_PASSES) + $(GCC_FOR_TARGET) $(GCC_CFLAGS) $(MULTILIB_CFLAGS) $(INCLUDES) \ + -c -o $(T)crtn.o -x assembler-with-cpp $(srcdir)/config/arm/crtn.asm diff --git a/gcc/config/arm/t-symbian b/gcc/config/arm/t-symbian new file mode 100644 index 000000000..4a1476f67 --- /dev/null +++ b/gcc/config/arm/t-symbian @@ -0,0 +1,53 @@ +# Copyright (C) 2004, 2005, 2006, 2008 Free Software Foundation, Inc. +# +# This file is part of GCC. +# +# GCC is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# GCC is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with GCC; see the file COPYING3. If not see +# . + +LIB1ASMFUNCS += _bb_init_func _call_via_rX _interwork_call_via_rX _clzsi2 _clzdi2 + +# These functions have __aeabi equivalents and will never be called by GCC. +# By putting them in LIB1ASMFUNCS, we avoid the standard libgcc2.c code being +# used -- and we make sure that definitions are not available in lib1funcs.asm, +# either, so they end up undefined. +LIB1ASMFUNCS += \ + _ashldi3 _ashrdi3 _divdi3 _floatdidf _udivmoddi4 _umoddi3 \ + _udivdi3 _lshrdi3 _moddi3 _muldi3 _negdi2 _cmpdi2 \ + _fixdfdi _fixsfdi _fixunsdfdi _fixunssfdi _floatdisf \ + _negdf2 _addsubdf3 _muldivdf3 _cmpdf2 _unorddf2 _fixdfsi _fixunsdfsi \ + _truncdfsf2 _negsf2 _addsubsf3 _muldivsf3 _cmpsf2 _unordsf2 \ + _fixsfsi _fixunssfsi + +# Include the gcc personality routine +UNWIND_H = $(srcdir)/config/arm/unwind-arm.h +LIB2ADDEH = $(srcdir)/unwind-c.c $(srcdir)/config/arm/pr-support.c +LIB2ADDEHDEP = $(UNWIND_H) + +# Include half-float helpers. +LIB2FUNCS_STATIC_EXTRA = $(srcdir)/config/arm/fp16.c + +# Create a multilib for processors with VFP floating-point, and a +# multilib for those without -- using the soft-float ABI in both +# cases. Symbian OS object should be compiled with interworking +# enabled, so there are no separate thumb-mode libraries. +MULTILIB_OPTIONS = mfloat-abi=softfp +MULTILIB_DIRNAMES = softfp + +# There is no C library to link against on Symbian OS -- at least when +# building GCC. +SHLIB_LC = + +# Symbian OS provides its own startup code. +EXTRA_MULTILIB_PARTS= diff --git a/gcc/config/arm/t-vxworks b/gcc/config/arm/t-vxworks new file mode 100644 index 000000000..af01ac412 --- /dev/null +++ b/gcc/config/arm/t-vxworks @@ -0,0 +1,44 @@ +# Copyright (C) 2003, 2007, 2008 Free Software Foundation, Inc. +# +# This file is part of GCC. +# +# GCC is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# GCC is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with GCC; see the file COPYING3. If not see +# . + +LIB1ASMFUNCS += _udivsi3 _divsi3 _umodsi3 _modsi3 _dvmd_tls _bb_init_func _call_via_rX _interwork_call_via_rX _clzsi2 _clzdi2 + +# We want fine grained libraries, so use the new code to build the +# floating point emulation libraries. +FPBIT = fp-bit.c +DPBIT = dp-bit.c + +fp-bit.c: $(srcdir)/config/fp-bit.c + echo '#define FLOAT' > fp-bit.c + echo '#ifndef __ARMEB__' >> fp-bit.c + echo '#define FLOAT_BIT_ORDER_MISMATCH' >> fp-bit.c + echo '#endif' >> fp-bit.c + cat $(srcdir)/config/fp-bit.c >> fp-bit.c + +dp-bit.c: $(srcdir)/config/fp-bit.c + echo '#ifndef __ARMEB__' > dp-bit.c + echo '#define FLOAT_BIT_ORDER_MISMATCH' >> dp-bit.c + echo '#endif' >> dp-bit.c + cat $(srcdir)/config/fp-bit.c >> dp-bit.c + +MULTILIB_OPTIONS = \ + mrtp fPIC \ + t4/t4be/t4t/t4tbe/t5/t5be/t5t/t5tbe/tstrongarm/txscale/txscalebe +MULTILIB_MATCHES = fPIC=fpic +# Don't build -fPIC multilibs for kernel or Thumb code. +MULTILIB_EXCEPTIONS = fPIC* mrtp/fPIC/*t[45]t* diff --git a/gcc/config/arm/t-wince-pe b/gcc/config/arm/t-wince-pe new file mode 100644 index 000000000..165bef200 --- /dev/null +++ b/gcc/config/arm/t-wince-pe @@ -0,0 +1,56 @@ +# Copyright (C) 2003, 2004, 2006, 2008, 2009, 2010 +# Free Software Foundation, Inc. +# +# This file is part of GCC. +# +# GCC is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# GCC is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with GCC; see the file COPYING3. If not see +# . + +LIB1ASMFUNCS += _udivsi3 _divsi3 _umodsi3 _modsi3 _dvmd_tls _call_via_rX _interwork_call_via_rX _clzsi2 _clzdi2 + +# We want fine grained libraries, so use the new code to build the +# floating point emulation libraries. +FPBIT = fp-bit.c +DPBIT = dp-bit.c + +fp-bit.c: $(srcdir)/config/fp-bit.c + echo '#define FLOAT' > fp-bit.c + echo '#ifndef __ARMEB__' >> fp-bit.c + echo '#define FLOAT_BIT_ORDER_MISMATCH' >> fp-bit.c + echo '#endif' >> fp-bit.c + cat $(srcdir)/config/fp-bit.c >> fp-bit.c + +dp-bit.c: $(srcdir)/config/fp-bit.c + echo '#ifndef __ARMEB__' > dp-bit.c + echo '#define FLOAT_BIT_ORDER_MISMATCH' >> dp-bit.c + echo '#define FLOAT_WORD_ORDER_MISMATCH' >> dp-bit.c + echo '#endif' >> dp-bit.c + cat $(srcdir)/config/fp-bit.c >> dp-bit.c + +pe.o: $(srcdir)/config/arm/pe.c $(CONFIG_H) $(SYSTEM_H) coretypes.h $(TM_H) \ + $(RTL_H) output.h flags.h $(TREE_H) expr.h $(TM_P_H) + $(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \ + $(srcdir)/config/arm/pe.c + +MULTILIB_OPTIONS = mhard-float +MULTILIB_DIRNAMES = fpu +# Note - Thumb multilib omitted because Thumb support for +# arm-wince-pe target does not appear to be working in binutils +# yet... +# MULTILIB_OPTIONS += thumb +# MULTILIB_DIRNAMES += thumb + +LIBGCC = stmp-multilib +INSTALL_LIBGCC = install-multilib +TARGET_LIBGCC2_CFLAGS = diff --git a/gcc/config/arm/thumb2.md b/gcc/config/arm/thumb2.md new file mode 100644 index 000000000..1b2fb2d44 --- /dev/null +++ b/gcc/config/arm/thumb2.md @@ -0,0 +1,1121 @@ +;; ARM Thumb-2 Machine Description +;; Copyright (C) 2007, 2008, 2010 Free Software Foundation, Inc. +;; Written by CodeSourcery, LLC. +;; +;; This file is part of GCC. +;; +;; GCC is free software; you can redistribute it and/or modify it +;; under the terms of the GNU General Public License as published by +;; the Free Software Foundation; either version 3, or (at your option) +;; any later version. +;; +;; GCC is distributed in the hope that it will be useful, but +;; WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;; General Public License for more details. +;; +;; You should have received a copy of the GNU General Public License +;; along with GCC; see the file COPYING3. If not see +;; . */ + +;; Note: Thumb-2 is the variant of the Thumb architecture that adds +;; 32-bit encodings of [almost all of] the Arm instruction set. +;; Some old documents refer to the relatively minor interworking +;; changes made in armv5t as "thumb2". These are considered part +;; the 16-bit Thumb-1 instruction set. + +(define_insn "*thumb2_incscc" + [(set (match_operand:SI 0 "s_register_operand" "=r,r") + (plus:SI (match_operator:SI 2 "arm_comparison_operator" + [(match_operand:CC 3 "cc_register" "") (const_int 0)]) + (match_operand:SI 1 "s_register_operand" "0,?r")))] + "TARGET_THUMB2" + "@ + it\\t%d2\;add%d2\\t%0, %1, #1 + ite\\t%D2\;mov%D2\\t%0, %1\;add%d2\\t%0, %1, #1" + [(set_attr "conds" "use") + (set_attr "length" "6,10")] +) + +(define_insn "*thumb2_decscc" + [(set (match_operand:SI 0 "s_register_operand" "=r,r") + (minus:SI (match_operand:SI 1 "s_register_operand" "0,?r") + (match_operator:SI 2 "arm_comparison_operator" + [(match_operand 3 "cc_register" "") (const_int 0)])))] + "TARGET_THUMB2" + "@ + it\\t%d2\;sub%d2\\t%0, %1, #1 + ite\\t%D2\;mov%D2\\t%0, %1\;sub%d2\\t%0, %1, #1" + [(set_attr "conds" "use") + (set_attr "length" "6,10")] +) + +;; Thumb-2 only allows shift by constant on data processing instructions +(define_insn "*thumb_andsi_not_shiftsi_si" + [(set (match_operand:SI 0 "s_register_operand" "=r") + (and:SI (not:SI (match_operator:SI 4 "shift_operator" + [(match_operand:SI 2 "s_register_operand" "r") + (match_operand:SI 3 "const_int_operand" "M")])) + (match_operand:SI 1 "s_register_operand" "r")))] + "TARGET_THUMB2" + "bic%?\\t%0, %1, %2%S4" + [(set_attr "predicable" "yes") + (set_attr "shift" "2") + (set_attr "type" "alu_shift")] +) + +(define_insn "*thumb2_smaxsi3" + [(set (match_operand:SI 0 "s_register_operand" "=r,r,r") + (smax:SI (match_operand:SI 1 "s_register_operand" "0,r,?r") + (match_operand:SI 2 "arm_rhs_operand" "rI,0,rI"))) + (clobber (reg:CC CC_REGNUM))] + "TARGET_THUMB2" + "@ + cmp\\t%1, %2\;it\\tlt\;movlt\\t%0, %2 + cmp\\t%1, %2\;it\\tge\;movge\\t%0, %1 + cmp\\t%1, %2\;ite\\tge\;movge\\t%0, %1\;movlt\\t%0, %2" + [(set_attr "conds" "clob") + (set_attr "length" "10,10,14")] +) + +(define_insn "*thumb2_sminsi3" + [(set (match_operand:SI 0 "s_register_operand" "=r,r,r") + (smin:SI (match_operand:SI 1 "s_register_operand" "0,r,?r") + (match_operand:SI 2 "arm_rhs_operand" "rI,0,rI"))) + (clobber (reg:CC CC_REGNUM))] + "TARGET_THUMB2" + "@ + cmp\\t%1, %2\;it\\tge\;movge\\t%0, %2 + cmp\\t%1, %2\;it\\tlt\;movlt\\t%0, %1 + cmp\\t%1, %2\;ite\\tlt\;movlt\\t%0, %1\;movge\\t%0, %2" + [(set_attr "conds" "clob") + (set_attr "length" "10,10,14")] +) + +(define_insn "*thumb32_umaxsi3" + [(set (match_operand:SI 0 "s_register_operand" "=r,r,r") + (umax:SI (match_operand:SI 1 "s_register_operand" "0,r,?r") + (match_operand:SI 2 "arm_rhs_operand" "rI,0,rI"))) + (clobber (reg:CC CC_REGNUM))] + "TARGET_THUMB2" + "@ + cmp\\t%1, %2\;it\\tcc\;movcc\\t%0, %2 + cmp\\t%1, %2\;it\\tcs\;movcs\\t%0, %1 + cmp\\t%1, %2\;ite\\tcs\;movcs\\t%0, %1\;movcc\\t%0, %2" + [(set_attr "conds" "clob") + (set_attr "length" "10,10,14")] +) + +(define_insn "*thumb2_uminsi3" + [(set (match_operand:SI 0 "s_register_operand" "=r,r,r") + (umin:SI (match_operand:SI 1 "s_register_operand" "0,r,?r") + (match_operand:SI 2 "arm_rhs_operand" "rI,0,rI"))) + (clobber (reg:CC CC_REGNUM))] + "TARGET_THUMB2" + "@ + cmp\\t%1, %2\;it\\tcs\;movcs\\t%0, %2 + cmp\\t%1, %2\;it\\tcc\;movcc\\t%0, %1 + cmp\\t%1, %2\;ite\\tcc\;movcc\\t%0, %1\;movcs\\t%0, %2" + [(set_attr "conds" "clob") + (set_attr "length" "10,10,14")] +) + +;; Thumb-2 does not have rsc, so use a clever trick with shifter operands. +(define_insn "*thumb2_negdi2" + [(set (match_operand:DI 0 "s_register_operand" "=&r,r") + (neg:DI (match_operand:DI 1 "s_register_operand" "?r,0"))) + (clobber (reg:CC CC_REGNUM))] + "TARGET_THUMB2" + "negs\\t%Q0, %Q1\;sbc\\t%R0, %R1, %R1, lsl #1" + [(set_attr "conds" "clob") + (set_attr "length" "8")] +) + +(define_insn "*thumb2_abssi2" + [(set (match_operand:SI 0 "s_register_operand" "=r,&r") + (abs:SI (match_operand:SI 1 "s_register_operand" "0,r"))) + (clobber (reg:CC CC_REGNUM))] + "TARGET_THUMB2" + "@ + cmp\\t%0, #0\;it\tlt\;rsblt\\t%0, %0, #0 + eor%?\\t%0, %1, %1, asr #31\;sub%?\\t%0, %0, %1, asr #31" + [(set_attr "conds" "clob,*") + (set_attr "shift" "1") + ;; predicable can't be set based on the variant, so left as no + (set_attr "length" "10,8")] +) + +(define_insn "*thumb2_neg_abssi2" + [(set (match_operand:SI 0 "s_register_operand" "=r,&r") + (neg:SI (abs:SI (match_operand:SI 1 "s_register_operand" "0,r")))) + (clobber (reg:CC CC_REGNUM))] + "TARGET_THUMB2" + "@ + cmp\\t%0, #0\;it\\tgt\;rsbgt\\t%0, %0, #0 + eor%?\\t%0, %1, %1, asr #31\;rsb%?\\t%0, %0, %1, asr #31" + [(set_attr "conds" "clob,*") + (set_attr "shift" "1") + ;; predicable can't be set based on the variant, so left as no + (set_attr "length" "10,8")] +) + +;; We have two alternatives here for memory loads (and similarly for stores) +;; to reflect the fact that the permissible constant pool ranges differ +;; between ldr instructions taking low regs and ldr instructions taking high +;; regs. The high register alternatives are not taken into account when +;; choosing register preferences in order to reflect their expense. +(define_insn "*thumb2_movsi_insn" + [(set (match_operand:SI 0 "nonimmediate_operand" "=rk,r,r,r,l ,*hk,m,*m") + (match_operand:SI 1 "general_operand" "rk ,I,K,j,mi,*mi,l,*hk"))] + "TARGET_THUMB2 && ! TARGET_IWMMXT + && !(TARGET_HARD_FLOAT && TARGET_VFP) + && ( register_operand (operands[0], SImode) + || register_operand (operands[1], SImode))" + "@ + mov%?\\t%0, %1 + mov%?\\t%0, %1 + mvn%?\\t%0, #%B1 + movw%?\\t%0, %1 + ldr%?\\t%0, %1 + ldr%?\\t%0, %1 + str%?\\t%1, %0 + str%?\\t%1, %0" + [(set_attr "type" "*,*,*,*,load1,load1,store1,store1") + (set_attr "predicable" "yes") + (set_attr "pool_range" "*,*,*,*,1020,4096,*,*") + (set_attr "neg_pool_range" "*,*,*,*,0,0,*,*")] +) + +(define_insn "tls_load_dot_plus_four" + [(set (match_operand:SI 0 "register_operand" "=l,l,r,r") + (mem:SI (unspec:SI [(match_operand:SI 2 "register_operand" "0,1,0,1") + (const_int 4) + (match_operand 3 "" "")] + UNSPEC_PIC_BASE))) + (clobber (match_scratch:SI 1 "=X,l,X,r"))] + "TARGET_THUMB2" + "* + (*targetm.asm_out.internal_label) (asm_out_file, \"LPIC\", + INTVAL (operands[3])); + return \"add\\t%2, %|pc\;ldr%?\\t%0, [%2]\"; + " + [(set_attr "length" "4,4,6,6")] +) + +;; Thumb-2 always has load/store halfword instructions, so we can avoid a lot +;; of the messiness associated with the ARM patterns. +(define_insn "*thumb2_movhi_insn" + [(set (match_operand:HI 0 "nonimmediate_operand" "=r,r,m,r") + (match_operand:HI 1 "general_operand" "rI,n,r,m"))] + "TARGET_THUMB2" + "@ + mov%?\\t%0, %1\\t%@ movhi + movw%?\\t%0, %L1\\t%@ movhi + str%(h%)\\t%1, %0\\t%@ movhi + ldr%(h%)\\t%0, %1\\t%@ movhi" + [(set_attr "type" "*,*,store1,load1") + (set_attr "predicable" "yes") + (set_attr "pool_range" "*,*,*,4096") + (set_attr "neg_pool_range" "*,*,*,250")] +) + +(define_insn "*thumb2_cmpsi_neg_shiftsi" + [(set (reg:CC CC_REGNUM) + (compare:CC (match_operand:SI 0 "s_register_operand" "r") + (neg:SI (match_operator:SI 3 "shift_operator" + [(match_operand:SI 1 "s_register_operand" "r") + (match_operand:SI 2 "const_int_operand" "M")]))))] + "TARGET_THUMB2" + "cmn%?\\t%0, %1%S3" + [(set_attr "conds" "set") + (set_attr "shift" "1") + (set_attr "type" "alu_shift")] +) + +(define_insn "*thumb2_mov_scc" + [(set (match_operand:SI 0 "s_register_operand" "=r") + (match_operator:SI 1 "arm_comparison_operator" + [(match_operand 2 "cc_register" "") (const_int 0)]))] + "TARGET_THUMB2" + "ite\\t%D1\;mov%D1\\t%0, #0\;mov%d1\\t%0, #1" + [(set_attr "conds" "use") + (set_attr "length" "10")] +) + +(define_insn "*thumb2_mov_negscc" + [(set (match_operand:SI 0 "s_register_operand" "=r") + (neg:SI (match_operator:SI 1 "arm_comparison_operator" + [(match_operand 2 "cc_register" "") (const_int 0)])))] + "TARGET_THUMB2" + "ite\\t%D1\;mov%D1\\t%0, #0\;mvn%d1\\t%0, #0" + [(set_attr "conds" "use") + (set_attr "length" "10")] +) + +(define_insn "*thumb2_mov_notscc" + [(set (match_operand:SI 0 "s_register_operand" "=r") + (not:SI (match_operator:SI 1 "arm_comparison_operator" + [(match_operand 2 "cc_register" "") (const_int 0)])))] + "TARGET_THUMB2" + "ite\\t%D1\;mvn%D1\\t%0, #0\;mvn%d1\\t%0, #1" + [(set_attr "conds" "use") + (set_attr "length" "10")] +) + +(define_insn "*thumb2_movsicc_insn" + [(set (match_operand:SI 0 "s_register_operand" "=r,r,r,r,r,r,r,r") + (if_then_else:SI + (match_operator 3 "arm_comparison_operator" + [(match_operand 4 "cc_register" "") (const_int 0)]) + (match_operand:SI 1 "arm_not_operand" "0,0,rI,K,rI,rI,K,K") + (match_operand:SI 2 "arm_not_operand" "rI,K,0,0,rI,K,rI,K")))] + "TARGET_THUMB2" + "@ + it\\t%D3\;mov%D3\\t%0, %2 + it\\t%D3\;mvn%D3\\t%0, #%B2 + it\\t%d3\;mov%d3\\t%0, %1 + it\\t%d3\;mvn%d3\\t%0, #%B1 + ite\\t%d3\;mov%d3\\t%0, %1\;mov%D3\\t%0, %2 + ite\\t%d3\;mov%d3\\t%0, %1\;mvn%D3\\t%0, #%B2 + ite\\t%d3\;mvn%d3\\t%0, #%B1\;mov%D3\\t%0, %2 + ite\\t%d3\;mvn%d3\\t%0, #%B1\;mvn%D3\\t%0, #%B2" + [(set_attr "length" "6,6,6,6,10,10,10,10") + (set_attr "conds" "use")] +) + +(define_insn "*thumb2_movsfcc_soft_insn" + [(set (match_operand:SF 0 "s_register_operand" "=r,r") + (if_then_else:SF (match_operator 3 "arm_comparison_operator" + [(match_operand 4 "cc_register" "") (const_int 0)]) + (match_operand:SF 1 "s_register_operand" "0,r") + (match_operand:SF 2 "s_register_operand" "r,0")))] + "TARGET_THUMB2 && TARGET_SOFT_FLOAT" + "@ + it\\t%D3\;mov%D3\\t%0, %2 + it\\t%d3\;mov%d3\\t%0, %1" + [(set_attr "length" "6,6") + (set_attr "conds" "use")] +) + +(define_insn "*call_reg_thumb2" + [(call (mem:SI (match_operand:SI 0 "s_register_operand" "r")) + (match_operand 1 "" "")) + (use (match_operand 2 "" "")) + (clobber (reg:SI LR_REGNUM))] + "TARGET_THUMB2" + "blx%?\\t%0" + [(set_attr "type" "call")] +) + +(define_insn "*call_value_reg_thumb2" + [(set (match_operand 0 "" "") + (call (mem:SI (match_operand:SI 1 "register_operand" "l*r")) + (match_operand 2 "" ""))) + (use (match_operand 3 "" "")) + (clobber (reg:SI LR_REGNUM))] + "TARGET_THUMB2" + "blx\\t%1" + [(set_attr "type" "call")] +) + +(define_insn "*thumb2_indirect_jump" + [(set (pc) + (match_operand:SI 0 "register_operand" "l*r"))] + "TARGET_THUMB2" + "bx\\t%0" + [(set_attr "conds" "clob")] +) +;; Don't define thumb2_load_indirect_jump because we can't guarantee label +;; addresses will have the thumb bit set correctly. + + +(define_insn "*thumb2_and_scc" + [(set (match_operand:SI 0 "s_register_operand" "=r") + (and:SI (match_operator:SI 1 "arm_comparison_operator" + [(match_operand 3 "cc_register" "") (const_int 0)]) + (match_operand:SI 2 "s_register_operand" "r")))] + "TARGET_THUMB2" + "ite\\t%D1\;mov%D1\\t%0, #0\;and%d1\\t%0, %2, #1" + [(set_attr "conds" "use") + (set_attr "length" "10")] +) + +(define_insn "*thumb2_ior_scc" + [(set (match_operand:SI 0 "s_register_operand" "=r,r") + (ior:SI (match_operator:SI 2 "arm_comparison_operator" + [(match_operand 3 "cc_register" "") (const_int 0)]) + (match_operand:SI 1 "s_register_operand" "0,?r")))] + "TARGET_THUMB2" + "@ + it\\t%d2\;orr%d2\\t%0, %1, #1 + ite\\t%D2\;mov%D2\\t%0, %1\;orr%d2\\t%0, %1, #1" + [(set_attr "conds" "use") + (set_attr "length" "6,10")] +) + +(define_insn "*thumb2_cond_move" + [(set (match_operand:SI 0 "s_register_operand" "=r,r,r") + (if_then_else:SI (match_operator 3 "equality_operator" + [(match_operator 4 "arm_comparison_operator" + [(match_operand 5 "cc_register" "") (const_int 0)]) + (const_int 0)]) + (match_operand:SI 1 "arm_rhs_operand" "0,rI,?rI") + (match_operand:SI 2 "arm_rhs_operand" "rI,0,rI")))] + "TARGET_THUMB2" + "* + if (GET_CODE (operands[3]) == NE) + { + if (which_alternative != 1) + output_asm_insn (\"it\\t%D4\;mov%D4\\t%0, %2\", operands); + if (which_alternative != 0) + output_asm_insn (\"it\\t%d4\;mov%d4\\t%0, %1\", operands); + return \"\"; + } + switch (which_alternative) + { + case 0: + output_asm_insn (\"it\\t%d4\", operands); + break; + case 1: + output_asm_insn (\"it\\t%D4\", operands); + break; + case 2: + output_asm_insn (\"ite\\t%D4\", operands); + break; + default: + abort(); + } + if (which_alternative != 0) + output_asm_insn (\"mov%D4\\t%0, %1\", operands); + if (which_alternative != 1) + output_asm_insn (\"mov%d4\\t%0, %2\", operands); + return \"\"; + " + [(set_attr "conds" "use") + (set_attr "length" "6,6,10")] +) + +(define_insn "*thumb2_cond_arith" + [(set (match_operand:SI 0 "s_register_operand" "=r,r") + (match_operator:SI 5 "shiftable_operator" + [(match_operator:SI 4 "arm_comparison_operator" + [(match_operand:SI 2 "s_register_operand" "r,r") + (match_operand:SI 3 "arm_rhs_operand" "rI,rI")]) + (match_operand:SI 1 "s_register_operand" "0,?r")])) + (clobber (reg:CC CC_REGNUM))] + "TARGET_THUMB2" + "* + if (GET_CODE (operands[4]) == LT && operands[3] == const0_rtx) + return \"%i5\\t%0, %1, %2, lsr #31\"; + + output_asm_insn (\"cmp\\t%2, %3\", operands); + if (GET_CODE (operands[5]) == AND) + { + output_asm_insn (\"ite\\t%D4\", operands); + output_asm_insn (\"mov%D4\\t%0, #0\", operands); + } + else if (GET_CODE (operands[5]) == MINUS) + { + output_asm_insn (\"ite\\t%D4\", operands); + output_asm_insn (\"rsb%D4\\t%0, %1, #0\", operands); + } + else if (which_alternative != 0) + { + output_asm_insn (\"ite\\t%D4\", operands); + output_asm_insn (\"mov%D4\\t%0, %1\", operands); + } + else + output_asm_insn (\"it\\t%d4\", operands); + return \"%i5%d4\\t%0, %1, #1\"; + " + [(set_attr "conds" "clob") + (set_attr "length" "14")] +) + +(define_insn "*thumb2_cond_sub" + [(set (match_operand:SI 0 "s_register_operand" "=r,r") + (minus:SI (match_operand:SI 1 "s_register_operand" "0,?r") + (match_operator:SI 4 "arm_comparison_operator" + [(match_operand:SI 2 "s_register_operand" "r,r") + (match_operand:SI 3 "arm_rhs_operand" "rI,rI")]))) + (clobber (reg:CC CC_REGNUM))] + "TARGET_THUMB2" + "* + output_asm_insn (\"cmp\\t%2, %3\", operands); + if (which_alternative != 0) + { + output_asm_insn (\"ite\\t%D4\", operands); + output_asm_insn (\"mov%D4\\t%0, %1\", operands); + } + else + output_asm_insn (\"it\\t%d4\", operands); + return \"sub%d4\\t%0, %1, #1\"; + " + [(set_attr "conds" "clob") + (set_attr "length" "10,14")] +) + +(define_insn "*thumb2_negscc" + [(set (match_operand:SI 0 "s_register_operand" "=r") + (neg:SI (match_operator 3 "arm_comparison_operator" + [(match_operand:SI 1 "s_register_operand" "r") + (match_operand:SI 2 "arm_rhs_operand" "rI")]))) + (clobber (reg:CC CC_REGNUM))] + "TARGET_THUMB2" + "* + if (GET_CODE (operands[3]) == LT && operands[2] == const0_rtx) + return \"asr\\t%0, %1, #31\"; + + if (GET_CODE (operands[3]) == NE) + return \"subs\\t%0, %1, %2\;it\\tne\;mvnne\\t%0, #0\"; + + output_asm_insn (\"cmp\\t%1, %2\", operands); + output_asm_insn (\"ite\\t%D3\", operands); + output_asm_insn (\"mov%D3\\t%0, #0\", operands); + return \"mvn%d3\\t%0, #0\"; + " + [(set_attr "conds" "clob") + (set_attr "length" "14")] +) + +(define_insn "*thumb2_movcond" + [(set (match_operand:SI 0 "s_register_operand" "=r,r,r") + (if_then_else:SI + (match_operator 5 "arm_comparison_operator" + [(match_operand:SI 3 "s_register_operand" "r,r,r") + (match_operand:SI 4 "arm_add_operand" "rIL,rIL,rIL")]) + (match_operand:SI 1 "arm_rhs_operand" "0,rI,?rI") + (match_operand:SI 2 "arm_rhs_operand" "rI,0,rI"))) + (clobber (reg:CC CC_REGNUM))] + "TARGET_THUMB2" + "* + if (GET_CODE (operands[5]) == LT + && (operands[4] == const0_rtx)) + { + if (which_alternative != 1 && GET_CODE (operands[1]) == REG) + { + if (operands[2] == const0_rtx) + return \"and\\t%0, %1, %3, asr #31\"; + return \"ands\\t%0, %1, %3, asr #32\;it\\tcc\;movcc\\t%0, %2\"; + } + else if (which_alternative != 0 && GET_CODE (operands[2]) == REG) + { + if (operands[1] == const0_rtx) + return \"bic\\t%0, %2, %3, asr #31\"; + return \"bics\\t%0, %2, %3, asr #32\;it\\tcs\;movcs\\t%0, %1\"; + } + /* The only case that falls through to here is when both ops 1 & 2 + are constants. */ + } + + if (GET_CODE (operands[5]) == GE + && (operands[4] == const0_rtx)) + { + if (which_alternative != 1 && GET_CODE (operands[1]) == REG) + { + if (operands[2] == const0_rtx) + return \"bic\\t%0, %1, %3, asr #31\"; + return \"bics\\t%0, %1, %3, asr #32\;it\\tcs\;movcs\\t%0, %2\"; + } + else if (which_alternative != 0 && GET_CODE (operands[2]) == REG) + { + if (operands[1] == const0_rtx) + return \"and\\t%0, %2, %3, asr #31\"; + return \"ands\\t%0, %2, %3, asr #32\;it\tcc\;movcc\\t%0, %1\"; + } + /* The only case that falls through to here is when both ops 1 & 2 + are constants. */ + } + if (GET_CODE (operands[4]) == CONST_INT + && !const_ok_for_arm (INTVAL (operands[4]))) + output_asm_insn (\"cmn\\t%3, #%n4\", operands); + else + output_asm_insn (\"cmp\\t%3, %4\", operands); + switch (which_alternative) + { + case 0: + output_asm_insn (\"it\\t%D5\", operands); + break; + case 1: + output_asm_insn (\"it\\t%d5\", operands); + break; + case 2: + output_asm_insn (\"ite\\t%d5\", operands); + break; + default: + abort(); + } + if (which_alternative != 0) + output_asm_insn (\"mov%d5\\t%0, %1\", operands); + if (which_alternative != 1) + output_asm_insn (\"mov%D5\\t%0, %2\", operands); + return \"\"; + " + [(set_attr "conds" "clob") + (set_attr "length" "10,10,14")] +) + +;; Zero and sign extension instructions. + +;; All supported Thumb2 implementations are armv6, so only that case is +;; provided. +(define_insn "*thumb2_extendqisi_v6" + [(set (match_operand:SI 0 "s_register_operand" "=r,r") + (sign_extend:SI (match_operand:QI 1 "nonimmediate_operand" "r,m")))] + "TARGET_THUMB2 && arm_arch6" + "@ + sxtb%?\\t%0, %1 + ldr%(sb%)\\t%0, %1" + [(set_attr "type" "alu_shift,load_byte") + (set_attr "predicable" "yes") + (set_attr "pool_range" "*,4096") + (set_attr "neg_pool_range" "*,250")] +) + +(define_insn "*thumb2_zero_extendhisi2_v6" + [(set (match_operand:SI 0 "s_register_operand" "=r,r") + (zero_extend:SI (match_operand:HI 1 "nonimmediate_operand" "r,m")))] + "TARGET_THUMB2 && arm_arch6" + "@ + uxth%?\\t%0, %1 + ldr%(h%)\\t%0, %1" + [(set_attr "type" "alu_shift,load_byte") + (set_attr "predicable" "yes") + (set_attr "pool_range" "*,4096") + (set_attr "neg_pool_range" "*,250")] +) + +(define_insn "thumb2_zero_extendqisi2_v6" + [(set (match_operand:SI 0 "s_register_operand" "=r,r") + (zero_extend:SI (match_operand:QI 1 "nonimmediate_operand" "r,m")))] + "TARGET_THUMB2 && arm_arch6" + "@ + uxtb%(%)\\t%0, %1 + ldr%(b%)\\t%0, %1\\t%@ zero_extendqisi2" + [(set_attr "type" "alu_shift,load_byte") + (set_attr "predicable" "yes") + (set_attr "pool_range" "*,4096") + (set_attr "neg_pool_range" "*,250")] +) + +(define_insn "thumb2_casesi_internal" + [(parallel [(set (pc) + (if_then_else + (leu (match_operand:SI 0 "s_register_operand" "r") + (match_operand:SI 1 "arm_rhs_operand" "rI")) + (mem:SI (plus:SI (mult:SI (match_dup 0) (const_int 4)) + (label_ref (match_operand 2 "" "")))) + (label_ref (match_operand 3 "" "")))) + (clobber (reg:CC CC_REGNUM)) + (clobber (match_scratch:SI 4 "=&r")) + (use (label_ref (match_dup 2)))])] + "TARGET_THUMB2 && !flag_pic" + "* return thumb2_output_casesi(operands);" + [(set_attr "conds" "clob") + (set_attr "length" "16")] +) + +(define_insn "thumb2_casesi_internal_pic" + [(parallel [(set (pc) + (if_then_else + (leu (match_operand:SI 0 "s_register_operand" "r") + (match_operand:SI 1 "arm_rhs_operand" "rI")) + (mem:SI (plus:SI (mult:SI (match_dup 0) (const_int 4)) + (label_ref (match_operand 2 "" "")))) + (label_ref (match_operand 3 "" "")))) + (clobber (reg:CC CC_REGNUM)) + (clobber (match_scratch:SI 4 "=&r")) + (clobber (match_scratch:SI 5 "=r")) + (use (label_ref (match_dup 2)))])] + "TARGET_THUMB2 && flag_pic" + "* return thumb2_output_casesi(operands);" + [(set_attr "conds" "clob") + (set_attr "length" "20")] +) + +;; Note: this is not predicable, to avoid issues with linker-generated +;; interworking stubs. +(define_insn "*thumb2_return" + [(return)] + "TARGET_THUMB2 && USE_RETURN_INSN (FALSE)" + "* + { + return output_return_instruction (const_true_rtx, TRUE, FALSE); + }" + [(set_attr "type" "load1") + (set_attr "length" "12")] +) + +(define_insn_and_split "thumb2_eh_return" + [(unspec_volatile [(match_operand:SI 0 "s_register_operand" "r")] + VUNSPEC_EH_RETURN) + (clobber (match_scratch:SI 1 "=&r"))] + "TARGET_THUMB2" + "#" + "&& reload_completed" + [(const_int 0)] + " + { + thumb_set_return_address (operands[0], operands[1]); + DONE; + }" +) + +(define_insn "*thumb2_alusi3_short" + [(set (match_operand:SI 0 "s_register_operand" "=l") + (match_operator:SI 3 "thumb_16bit_operator" + [(match_operand:SI 1 "s_register_operand" "0") + (match_operand:SI 2 "s_register_operand" "l")])) + (clobber (reg:CC CC_REGNUM))] + "TARGET_THUMB2 && reload_completed + && GET_CODE(operands[3]) != PLUS + && GET_CODE(operands[3]) != MINUS" + "%I3%!\\t%0, %1, %2" + [(set_attr "predicable" "yes") + (set_attr "length" "2")] +) + +;; Similarly for 16-bit shift instructions +;; There is no 16-bit rotate by immediate instruction. +(define_peephole2 + [(set (match_operand:SI 0 "low_register_operand" "") + (match_operator:SI 3 "shift_operator" + [(match_operand:SI 1 "low_register_operand" "") + (match_operand:SI 2 "low_reg_or_int_operand" "")]))] + "TARGET_THUMB2 + && peep2_regno_dead_p(0, CC_REGNUM) + && (CONST_INT_P (operands[2]) || operands[1] == operands[0]) + && ((GET_CODE(operands[3]) != ROTATE && GET_CODE(operands[3]) != ROTATERT) + || REG_P(operands[2]))" + [(parallel + [(set (match_dup 0) + (match_op_dup 3 + [(match_dup 1) + (match_dup 2)])) + (clobber (reg:CC CC_REGNUM))])] + "" +) + +(define_insn "*thumb2_shiftsi3_short" + [(set (match_operand:SI 0 "low_register_operand" "=l,l") + (match_operator:SI 3 "shift_operator" + [(match_operand:SI 1 "low_register_operand" "0,l") + (match_operand:SI 2 "low_reg_or_int_operand" "l,M")])) + (clobber (reg:CC CC_REGNUM))] + "TARGET_THUMB2 && reload_completed + && ((GET_CODE(operands[3]) != ROTATE && GET_CODE(operands[3]) != ROTATERT) + || REG_P(operands[2]))" + "* return arm_output_shift(operands, 2);" + [(set_attr "predicable" "yes") + (set_attr "shift" "1") + (set_attr "length" "2") + (set (attr "type") (if_then_else (match_operand 2 "const_int_operand" "") + (const_string "alu_shift") + (const_string "alu_shift_reg")))] +) + +;; 16-bit load immediate +(define_peephole2 + [(set (match_operand:QHSI 0 "low_register_operand" "") + (match_operand:QHSI 1 "const_int_operand" ""))] + "TARGET_THUMB2 + && peep2_regno_dead_p(0, CC_REGNUM) + && (unsigned HOST_WIDE_INT) INTVAL(operands[1]) < 256" + [(parallel + [(set (match_dup 0) + (match_dup 1)) + (clobber (reg:CC CC_REGNUM))])] + "" +) + +(define_insn "*thumb2_mov_shortim" + [(set (match_operand:QHSI 0 "low_register_operand" "=l") + (match_operand:QHSI 1 "const_int_operand" "I")) + (clobber (reg:CC CC_REGNUM))] + "TARGET_THUMB2 && reload_completed" + "mov%!\t%0, %1" + [(set_attr "predicable" "yes") + (set_attr "length" "2")] +) + +;; 16-bit add/sub immediate +(define_peephole2 + [(set (match_operand:SI 0 "low_register_operand" "") + (plus:SI (match_operand:SI 1 "low_register_operand" "") + (match_operand:SI 2 "const_int_operand" "")))] + "TARGET_THUMB2 + && peep2_regno_dead_p(0, CC_REGNUM) + && ((rtx_equal_p(operands[0], operands[1]) + && INTVAL(operands[2]) > -256 && INTVAL(operands[2]) < 256) + || (INTVAL(operands[2]) > -8 && INTVAL(operands[2]) < 8))" + [(parallel + [(set (match_dup 0) + (plus:SI (match_dup 1) + (match_dup 2))) + (clobber (reg:CC CC_REGNUM))])] + "" +) + +(define_insn "*thumb2_addsi_short" + [(set (match_operand:SI 0 "low_register_operand" "=l,l") + (plus:SI (match_operand:SI 1 "low_register_operand" "l,0") + (match_operand:SI 2 "low_reg_or_int_operand" "lPt,Ps"))) + (clobber (reg:CC CC_REGNUM))] + "TARGET_THUMB2 && reload_completed" + "* + HOST_WIDE_INT val; + + if (GET_CODE (operands[2]) == CONST_INT) + val = INTVAL(operands[2]); + else + val = 0; + + /* We prefer eg. subs rn, rn, #1 over adds rn, rn, #0xffffffff. */ + if (val < 0 && const_ok_for_arm(ARM_SIGN_EXTEND (-val))) + return \"sub%!\\t%0, %1, #%n2\"; + else + return \"add%!\\t%0, %1, %2\"; + " + [(set_attr "predicable" "yes") + (set_attr "length" "2")] +) + +(define_insn "divsi3" + [(set (match_operand:SI 0 "s_register_operand" "=r") + (div:SI (match_operand:SI 1 "s_register_operand" "r") + (match_operand:SI 2 "s_register_operand" "r")))] + "TARGET_THUMB2 && arm_arch_hwdiv" + "sdiv%?\t%0, %1, %2" + [(set_attr "predicable" "yes") + (set_attr "insn" "sdiv")] +) + +(define_insn "udivsi3" + [(set (match_operand:SI 0 "s_register_operand" "=r") + (udiv:SI (match_operand:SI 1 "s_register_operand" "r") + (match_operand:SI 2 "s_register_operand" "r")))] + "TARGET_THUMB2 && arm_arch_hwdiv" + "udiv%?\t%0, %1, %2" + [(set_attr "predicable" "yes") + (set_attr "insn" "udiv")] +) + +(define_insn "*thumb2_subsi_short" + [(set (match_operand:SI 0 "low_register_operand" "=l") + (minus:SI (match_operand:SI 1 "low_register_operand" "l") + (match_operand:SI 2 "low_register_operand" "l"))) + (clobber (reg:CC CC_REGNUM))] + "TARGET_THUMB2 && reload_completed" + "sub%!\\t%0, %1, %2" + [(set_attr "predicable" "yes") + (set_attr "length" "2")] +) + +(define_peephole2 + [(set (match_operand:CC 0 "cc_register" "") + (compare:CC (match_operand:SI 1 "low_register_operand" "") + (match_operand:SI 2 "const_int_operand" "")))] + "TARGET_THUMB2 + && peep2_reg_dead_p (1, operands[1]) + && satisfies_constraint_Pw (operands[2])" + [(parallel + [(set (match_dup 0) (compare:CC (match_dup 1) (match_dup 2))) + (set (match_dup 1) (plus:SI (match_dup 1) (match_dup 3)))])] + "operands[3] = GEN_INT (- INTVAL (operands[2]));" +) + +(define_peephole2 + [(match_scratch:SI 3 "l") + (set (match_operand:CC 0 "cc_register" "") + (compare:CC (match_operand:SI 1 "low_register_operand" "") + (match_operand:SI 2 "const_int_operand" "")))] + "TARGET_THUMB2 + && satisfies_constraint_Px (operands[2])" + [(parallel + [(set (match_dup 0) (compare:CC (match_dup 1) (match_dup 2))) + (set (match_dup 3) (plus:SI (match_dup 1) (match_dup 4)))])] + "operands[4] = GEN_INT (- INTVAL (operands[2]));" +) + +(define_insn "*thumb2_addsi3_compare0" + [(set (reg:CC_NOOV CC_REGNUM) + (compare:CC_NOOV + (plus:SI (match_operand:SI 1 "s_register_operand" "l, 0, r") + (match_operand:SI 2 "arm_add_operand" "lPt,Ps,rIL")) + (const_int 0))) + (set (match_operand:SI 0 "s_register_operand" "=l,l,r") + (plus:SI (match_dup 1) (match_dup 2)))] + "TARGET_THUMB2" + "* + HOST_WIDE_INT val; + + if (GET_CODE (operands[2]) == CONST_INT) + val = INTVAL (operands[2]); + else + val = 0; + + if (val < 0 && const_ok_for_arm (ARM_SIGN_EXTEND (-val))) + return \"subs\\t%0, %1, #%n2\"; + else + return \"adds\\t%0, %1, %2\"; + " + [(set_attr "conds" "set") + (set_attr "length" "2,2,4")] +) + +(define_insn "*thumb2_addsi3_compare0_scratch" + [(set (reg:CC_NOOV CC_REGNUM) + (compare:CC_NOOV + (plus:SI (match_operand:SI 0 "s_register_operand" "l, r") + (match_operand:SI 1 "arm_add_operand" "lPv,rIL")) + (const_int 0)))] + "TARGET_THUMB2" + "* + HOST_WIDE_INT val; + + if (GET_CODE (operands[1]) == CONST_INT) + val = INTVAL (operands[1]); + else + val = 0; + + if (val < 0 && const_ok_for_arm (ARM_SIGN_EXTEND (-val))) + return \"cmp\\t%0, #%n1\"; + else + return \"cmn\\t%0, %1\"; + " + [(set_attr "conds" "set") + (set_attr "length" "2,4")] +) + +;; 16-bit encodings of "muls" and "mul". We only use these when +;; optimizing for size since "muls" is slow on all known +;; implementations and since "mul" will be generated by +;; "*arm_mulsi3_v6" anyhow. The assembler will use a 16-bit encoding +;; for "mul" whenever possible anyhow. +(define_peephole2 + [(set (match_operand:SI 0 "low_register_operand" "") + (mult:SI (match_operand:SI 1 "low_register_operand" "") + (match_dup 0)))] + "TARGET_THUMB2 && optimize_size && peep2_regno_dead_p (0, CC_REGNUM)" + [(parallel + [(set (match_dup 0) + (mult:SI (match_dup 0) (match_dup 1))) + (clobber (reg:CC CC_REGNUM))])] + "" +) + +(define_peephole2 + [(set (match_operand:SI 0 "low_register_operand" "") + (mult:SI (match_dup 0) + (match_operand:SI 1 "low_register_operand" "")))] + "TARGET_THUMB2 && optimize_size && peep2_regno_dead_p (0, CC_REGNUM)" + [(parallel + [(set (match_dup 0) + (mult:SI (match_dup 0) (match_dup 1))) + (clobber (reg:CC CC_REGNUM))])] + "" +) + +(define_insn "*thumb2_mulsi_short" + [(set (match_operand:SI 0 "low_register_operand" "=l") + (mult:SI (match_operand:SI 1 "low_register_operand" "%0") + (match_operand:SI 2 "low_register_operand" "l"))) + (clobber (reg:CC CC_REGNUM))] + "TARGET_THUMB2 && optimize_size && reload_completed" + "mul%!\\t%0, %2, %0" + [(set_attr "predicable" "yes") + (set_attr "length" "2") + (set_attr "insn" "muls")]) + +(define_insn "*thumb2_mulsi_short_compare0" + [(set (reg:CC_NOOV CC_REGNUM) + (compare:CC_NOOV + (mult:SI (match_operand:SI 1 "register_operand" "%0") + (match_operand:SI 2 "register_operand" "l")) + (const_int 0))) + (set (match_operand:SI 0 "register_operand" "=l") + (mult:SI (match_dup 1) (match_dup 2)))] + "TARGET_THUMB2 && optimize_size" + "muls\\t%0, %2, %0" + [(set_attr "length" "2") + (set_attr "insn" "muls")]) + +(define_insn "*thumb2_mulsi_short_compare0_scratch" + [(set (reg:CC_NOOV CC_REGNUM) + (compare:CC_NOOV + (mult:SI (match_operand:SI 1 "register_operand" "%0") + (match_operand:SI 2 "register_operand" "l")) + (const_int 0))) + (clobber (match_scratch:SI 0 "=l"))] + "TARGET_THUMB2 && optimize_size" + "muls\\t%0, %2, %0" + [(set_attr "length" "2") + (set_attr "insn" "muls")]) + +(define_insn "*thumb2_cbz" + [(set (pc) (if_then_else + (eq (match_operand:SI 0 "s_register_operand" "l,?r") + (const_int 0)) + (label_ref (match_operand 1 "" "")) + (pc))) + (clobber (reg:CC CC_REGNUM))] + "TARGET_THUMB2" + "* + if (get_attr_length (insn) == 2) + return \"cbz\\t%0, %l1\"; + else + return \"cmp\\t%0, #0\;beq\\t%l1\"; + " + [(set (attr "length") + (if_then_else + (and (ge (minus (match_dup 1) (pc)) (const_int 2)) + (le (minus (match_dup 1) (pc)) (const_int 128)) + (eq (symbol_ref ("which_alternative")) (const_int 0))) + (const_int 2) + (const_int 8)))] +) + +(define_insn "*thumb2_cbnz" + [(set (pc) (if_then_else + (ne (match_operand:SI 0 "s_register_operand" "l,?r") + (const_int 0)) + (label_ref (match_operand 1 "" "")) + (pc))) + (clobber (reg:CC CC_REGNUM))] + "TARGET_THUMB2" + "* + if (get_attr_length (insn) == 2) + return \"cbnz\\t%0, %l1\"; + else + return \"cmp\\t%0, #0\;bne\\t%l1\"; + " + [(set (attr "length") + (if_then_else + (and (ge (minus (match_dup 1) (pc)) (const_int 2)) + (le (minus (match_dup 1) (pc)) (const_int 128)) + (eq (symbol_ref ("which_alternative")) (const_int 0))) + (const_int 2) + (const_int 8)))] +) + +;; 16-bit complement +(define_peephole2 + [(set (match_operand:SI 0 "low_register_operand" "") + (not:SI (match_operand:SI 1 "low_register_operand" "")))] + "TARGET_THUMB2 + && peep2_regno_dead_p(0, CC_REGNUM)" + [(parallel + [(set (match_dup 0) + (not:SI (match_dup 1))) + (clobber (reg:CC CC_REGNUM))])] + "" +) + +(define_insn "*thumb2_one_cmplsi2_short" + [(set (match_operand:SI 0 "low_register_operand" "=l") + (not:SI (match_operand:SI 1 "low_register_operand" "l"))) + (clobber (reg:CC CC_REGNUM))] + "TARGET_THUMB2 && reload_completed" + "mvn%!\t%0, %1" + [(set_attr "predicable" "yes") + (set_attr "length" "2")] +) + +;; 16-bit negate +(define_peephole2 + [(set (match_operand:SI 0 "low_register_operand" "") + (neg:SI (match_operand:SI 1 "low_register_operand" "")))] + "TARGET_THUMB2 + && peep2_regno_dead_p(0, CC_REGNUM)" + [(parallel + [(set (match_dup 0) + (neg:SI (match_dup 1))) + (clobber (reg:CC CC_REGNUM))])] + "" +) + +(define_insn "*thumb2_negsi2_short" + [(set (match_operand:SI 0 "low_register_operand" "=l") + (neg:SI (match_operand:SI 1 "low_register_operand" "l"))) + (clobber (reg:CC CC_REGNUM))] + "TARGET_THUMB2 && reload_completed" + "neg%!\t%0, %1" + [(set_attr "predicable" "yes") + (set_attr "length" "2")] +) + +(define_insn "*orsi_notsi_si" + [(set (match_operand:SI 0 "s_register_operand" "=r") + (ior:SI (not:SI (match_operand:SI 2 "s_register_operand" "r")) + (match_operand:SI 1 "s_register_operand" "r")))] + "TARGET_THUMB2" + "orn%?\\t%0, %1, %2" + [(set_attr "predicable" "yes")] +) + +(define_insn "*orsi_not_shiftsi_si" + [(set (match_operand:SI 0 "s_register_operand" "=r") + (ior:SI (not:SI (match_operator:SI 4 "shift_operator" + [(match_operand:SI 2 "s_register_operand" "r") + (match_operand:SI 3 "const_int_operand" "M")])) + (match_operand:SI 1 "s_register_operand" "r")))] + "TARGET_THUMB2" + "orn%?\\t%0, %1, %2%S4" + [(set_attr "predicable" "yes") + (set_attr "shift" "2") + (set_attr "type" "alu_shift")] +) + +(define_peephole2 + [(set (match_operand:CC_NOOV 0 "cc_register" "") + (compare:CC_NOOV (zero_extract:SI + (match_operand:SI 1 "low_register_operand" "") + (const_int 1) + (match_operand:SI 2 "const_int_operand" "")) + (const_int 0))) + (match_scratch:SI 3 "l") + (set (pc) + (if_then_else (match_operator:CC_NOOV 4 "equality_operator" + [(match_dup 0) (const_int 0)]) + (match_operand 5 "" "") + (match_operand 6 "" "")))] + "TARGET_THUMB2 + && (INTVAL (operands[2]) >= 0 && INTVAL (operands[2]) < 32)" + [(parallel [(set (match_dup 0) + (compare:CC_NOOV (ashift:SI (match_dup 1) (match_dup 2)) + (const_int 0))) + (clobber (match_dup 3))]) + (set (pc) + (if_then_else (match_op_dup 4 [(match_dup 0) (const_int 0)]) + (match_dup 5) (match_dup 6)))] + " + operands[2] = GEN_INT (31 - INTVAL (operands[2])); + operands[4] = gen_rtx_fmt_ee (GET_CODE (operands[4]) == NE ? LT : GE, + VOIDmode, operands[0], const0_rtx); + ") + +(define_peephole2 + [(set (match_operand:CC_NOOV 0 "cc_register" "") + (compare:CC_NOOV (zero_extract:SI + (match_operand:SI 1 "low_register_operand" "") + (match_operand:SI 2 "const_int_operand" "") + (const_int 0)) + (const_int 0))) + (match_scratch:SI 3 "l") + (set (pc) + (if_then_else (match_operator:CC_NOOV 4 "equality_operator" + [(match_dup 0) (const_int 0)]) + (match_operand 5 "" "") + (match_operand 6 "" "")))] + "TARGET_THUMB2 + && (INTVAL (operands[2]) > 0 && INTVAL (operands[2]) < 32)" + [(parallel [(set (match_dup 0) + (compare:CC_NOOV (ashift:SI (match_dup 1) (match_dup 2)) + (const_int 0))) + (clobber (match_dup 3))]) + (set (pc) + (if_then_else (match_op_dup 4 [(match_dup 0) (const_int 0)]) + (match_dup 5) (match_dup 6)))] + " + operands[2] = GEN_INT (32 - INTVAL (operands[2])); + ") diff --git a/gcc/config/arm/uclinux-eabi.h b/gcc/config/arm/uclinux-eabi.h new file mode 100644 index 000000000..4455288b8 --- /dev/null +++ b/gcc/config/arm/uclinux-eabi.h @@ -0,0 +1,66 @@ +/* Definitions for ARM EABI ucLinux + Copyright (C) 2006, 2007, 2008 Free Software Foundation, Inc. + Contributed by Paul Brook + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3, or (at your + option) any later version. + + GCC is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public + License for more details. + + You should have received a copy of the GNU General Public License + along with GCC; see the file COPYING3. If not see + . */ + +/* Override settings that are different to the uclinux-elf or + bpabi defaults. */ + +#undef TARGET_DEFAULT +#define TARGET_DEFAULT (MASK_SINGLE_PIC_BASE | MASK_INTERWORK) + +/* On EABI GNU/Linux, we want both the BPABI builtins and the + GNU/Linux builtins. */ +#undef TARGET_OS_CPP_BUILTINS +#define TARGET_OS_CPP_BUILTINS() \ + do \ + { \ + TARGET_BPABI_CPP_BUILTINS(); \ + builtin_define ("__uClinux__"); \ + builtin_define ("__gnu_linux__"); \ + builtin_define_std ("linux"); \ + builtin_define_std ("unix"); \ + builtin_assert ("system=linux"); \ + builtin_assert ("system=unix"); \ + builtin_assert ("system=posix"); \ + } \ + while (false) + +#undef SUBTARGET_EXTRA_LINK_SPEC +#define SUBTARGET_EXTRA_LINK_SPEC " -m armelf_linux_eabi -elf2flt" \ + " --pic-veneer --target2=abs" + +/* We default to the "aapcs-linux" ABI so that enums are int-sized by + default. */ +#undef ARM_DEFAULT_ABI +#define ARM_DEFAULT_ABI ARM_ABI_AAPCS_LINUX + +/* Clear the instruction cache from `beg' to `end'. This makes an + inline system call to SYS_cacheflush. */ +#undef CLEAR_INSN_CACHE +#define CLEAR_INSN_CACHE(BEG, END) \ +{ \ + register unsigned long _beg __asm ("a1") = (unsigned long) (BEG); \ + register unsigned long _end __asm ("a2") = (unsigned long) (END); \ + register unsigned long _flg __asm ("a3") = 0; \ + register unsigned long _scno __asm ("r7") = 0xf0002; \ + __asm __volatile ("swi 0x0 @ sys_cacheflush" \ + : "=r" (_beg) \ + : "0" (_beg), "r" (_end), "r" (_flg), "r" (_scno)); \ +} + diff --git a/gcc/config/arm/uclinux-elf.h b/gcc/config/arm/uclinux-elf.h new file mode 100644 index 000000000..50fd76580 --- /dev/null +++ b/gcc/config/arm/uclinux-elf.h @@ -0,0 +1,88 @@ +/* Definitions for ARM running ucLinux using ELF + Copyright (C) 1999, 2001, 2004, 2005, 2007, 2008 + Free Software Foundation, Inc. + Contributed by Philip Blundell + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3, or (at your + option) any later version. + + GCC is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public + License for more details. + + You should have received a copy of the GNU General Public License + along with GCC; see the file COPYING3. If not see + . */ + +/* We don't want a PLT. */ +#undef NEED_PLT_RELOC +#define NEED_PLT_RELOC 0 + +#undef TARGET_VERSION +#define TARGET_VERSION fputs (" (ARM/ELF ucLinux)", stderr); + +#undef TARGET_DEFAULT +#define TARGET_DEFAULT (MASK_SINGLE_PIC_BASE) + +/* NOTE: The remaining definitions in this file are needed because uclinux + does not use config/linux.h. */ + +/* Add GNU/Linux builtins. */ +#undef TARGET_OS_CPP_BUILTINS +#define TARGET_OS_CPP_BUILTINS() \ + do \ + { \ + builtin_define ("__uClinux__"); \ + builtin_define ("__gnu_linux__"); \ + builtin_define_std ("linux"); \ + builtin_define_std ("unix"); \ + builtin_assert ("system=linux"); \ + builtin_assert ("system=unix"); \ + builtin_assert ("system=posix"); \ + } \ + while (false) + +/* Do not assume anything about header files. */ +#define NO_IMPLICIT_EXTERN_C + +/* The GNU C++ standard library requires that these macros be defined. */ +#undef CPLUSPLUS_CPP_SPEC +#define CPLUSPLUS_CPP_SPEC "-D_GNU_SOURCE %(cpp)" + +#undef SUBTARGET_EXTRA_LINK_SPEC +#define SUBTARGET_EXTRA_LINK_SPEC " -m armelf_linux" + +/* Now we define the strings used to build the spec file. */ +#undef STARTFILE_SPEC +#define STARTFILE_SPEC "crt1%O%s crti%O%s crtbegin%O%s" + +#undef ENDFILE_SPEC +#define ENDFILE_SPEC "crtend%O%s crtn%O%s" + +#undef CC1_SPEC +#define CC1_SPEC "%{profile:-p}" + +#undef LINK_GCC_C_SEQUENCE_SPEC +#define LINK_GCC_C_SEQUENCE_SPEC \ + "%{static:--start-group} %G %L %{static:--end-group}%{!static:%G}" + +/* Use --as-needed -lgcc_s for eh support. */ +#ifdef HAVE_LD_AS_NEEDED +#define USE_LD_AS_NEEDED 1 +#endif + +#undef LINK_SPEC +#define LINK_SPEC "%{mbig-endian:-EB} %{mlittle-endian:-EL} -X -elf2flt" + +#undef LIB_SPEC +#define LIB_SPEC \ + "%{pthread:-lpthread} \ + %{shared:-lc} \ + %{!shared:%{profile:-lc_p}%{!profile:-lc}}" + +#define TARGET_DEFAULT_WORD_RELOCATIONS 1 diff --git a/gcc/config/arm/unaligned-funcs.c b/gcc/config/arm/unaligned-funcs.c new file mode 100644 index 000000000..4e684f4fc --- /dev/null +++ b/gcc/config/arm/unaligned-funcs.c @@ -0,0 +1,57 @@ +/* EABI unaligned read/write functions. + + Copyright (C) 2005, 2009 Free Software Foundation, Inc. + Contributed by CodeSourcery, LLC. + + This file is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by the + Free Software Foundation; either version 3, or (at your option) any + later version. + + This file is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +int __aeabi_uread4 (void *); +int __aeabi_uwrite4 (int, void *); +long long __aeabi_uread8 (void *); +long long __aeabi_uwrite8 (long long, void *); + +struct __attribute__((packed)) u4 { int data; }; +struct __attribute__((packed)) u8 { long long data; }; + +int +__aeabi_uread4 (void *ptr) +{ + return ((struct u4 *) ptr)->data; +} + +int +__aeabi_uwrite4 (int data, void *ptr) +{ + ((struct u4 *) ptr)->data = data; + return data; +} + +long long +__aeabi_uread8 (void *ptr) +{ + return ((struct u8 *) ptr)->data; +} + +long long +__aeabi_uwrite8 (long long data, void *ptr) +{ + ((struct u8 *) ptr)->data = data; + return data; +} diff --git a/gcc/config/arm/unknown-elf.h b/gcc/config/arm/unknown-elf.h new file mode 100644 index 000000000..b47455ea9 --- /dev/null +++ b/gcc/config/arm/unknown-elf.h @@ -0,0 +1,100 @@ +/* Definitions for non-Linux based ARM systems using ELF + Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2007, 2008, 2010 + Free Software Foundation, Inc. + Contributed by Catherine Moore + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3, or (at your + option) any later version. + + GCC is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public + License for more details. + + You should have received a copy of the GNU General Public License + along with GCC; see the file COPYING3. If not see + . */ + +/* elfos.h should have already been included. Now just override + any conflicting definitions and add any extras. */ + +/* Run-time Target Specification. */ +#ifndef TARGET_VERSION +#define TARGET_VERSION fputs (" (ARM/ELF)", stderr); +#endif + +/* Default to using software floating point. */ +#ifndef TARGET_DEFAULT +#define TARGET_DEFAULT (0) +#endif + +/* Now we define the strings used to build the spec file. */ +#define UNKNOWN_ELF_STARTFILE_SPEC " crti%O%s crtbegin%O%s crt0%O%s" + +#undef STARTFILE_SPEC +#define STARTFILE_SPEC UNKNOWN_ELF_STARTFILE_SPEC + +#define UNKNOWN_ELF_ENDFILE_SPEC "crtend%O%s crtn%O%s" + +#undef ENDFILE_SPEC +#define ENDFILE_SPEC UNKNOWN_ELF_ENDFILE_SPEC + +/* The __USES_INITFINI__ define is tested in newlib/libc/sys/arm/crt0.S + to see if it needs to invoked _init() and _fini(). */ +#undef SUBTARGET_CPP_SPEC +#define SUBTARGET_CPP_SPEC "-D__USES_INITFINI__" + +#undef PREFERRED_DEBUGGING_TYPE +#define PREFERRED_DEBUGGING_TYPE DWARF2_DEBUG + +/* Return a nonzero value if DECL has a section attribute. */ +#define IN_NAMED_SECTION_P(DECL) \ + ((TREE_CODE (DECL) == FUNCTION_DECL || TREE_CODE (DECL) == VAR_DECL) \ + && DECL_SECTION_NAME (DECL) != NULL_TREE) + +#undef ASM_OUTPUT_ALIGNED_BSS +#define ASM_OUTPUT_ALIGNED_BSS(FILE, DECL, NAME, SIZE, ALIGN) \ + do \ + { \ + if (IN_NAMED_SECTION_P (DECL)) \ + switch_to_section (get_named_section (DECL, NULL, 0)); \ + else \ + switch_to_section (bss_section); \ + \ + ASM_OUTPUT_ALIGN (FILE, floor_log2 (ALIGN / BITS_PER_UNIT)); \ + \ + last_assemble_variable_decl = DECL; \ + ASM_DECLARE_OBJECT_NAME (FILE, NAME, DECL); \ + ASM_OUTPUT_SKIP (FILE, SIZE ? (int)(SIZE) : 1); \ + } \ + while (0) + +#undef ASM_OUTPUT_ALIGNED_DECL_LOCAL +#define ASM_OUTPUT_ALIGNED_DECL_LOCAL(FILE, DECL, NAME, SIZE, ALIGN) \ + do \ + { \ + if ((DECL) != NULL && IN_NAMED_SECTION_P (DECL)) \ + switch_to_section (get_named_section (DECL, NULL, 0)); \ + else \ + switch_to_section (bss_section); \ + \ + ASM_OUTPUT_ALIGN (FILE, floor_log2 (ALIGN / BITS_PER_UNIT)); \ + ASM_OUTPUT_LABEL (FILE, NAME); \ + fprintf (FILE, "\t.space\t%d\n", SIZE ? (int)(SIZE) : 1); \ + } \ + while (0) + +#ifndef SUBTARGET_CPU_DEFAULT +#define SUBTARGET_CPU_DEFAULT TARGET_CPU_arm7tdmi +#endif + +/* The libgcc udivmod functions may throw exceptions. If newlib is + configured to support long longs in I/O, then printf will depend on + udivmoddi4, which will depend on the exception unwind routines, + which will depend on abort, which is defined in libc. */ +#undef LINK_GCC_C_SEQUENCE_SPEC +#define LINK_GCC_C_SEQUENCE_SPEC "--start-group %G %L --end-group" diff --git a/gcc/config/arm/unwind-arm.c b/gcc/config/arm/unwind-arm.c new file mode 100644 index 000000000..2c6e00489 --- /dev/null +++ b/gcc/config/arm/unwind-arm.c @@ -0,0 +1,1263 @@ +/* ARM EABI compliant unwinding routines. + Copyright (C) 2004, 2005, 2009 Free Software Foundation, Inc. + Contributed by Paul Brook + + This file is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by the + Free Software Foundation; either version 3, or (at your option) any + later version. + + This file is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +#include "unwind.h" + +/* We add a prototype for abort here to avoid creating a dependency on + target headers. */ +extern void abort (void); + +/* Definitions for C++ runtime support routines. We make these weak + declarations to avoid pulling in libsupc++ unnecessarily. */ +typedef unsigned char bool; + +typedef struct _ZSt9type_info type_info; /* This names C++ type_info type */ + +void __attribute__((weak)) __cxa_call_unexpected(_Unwind_Control_Block *ucbp); +bool __attribute__((weak)) __cxa_begin_cleanup(_Unwind_Control_Block *ucbp); +bool __attribute__((weak)) __cxa_type_match(_Unwind_Control_Block *ucbp, + const type_info *rttip, + bool is_reference, + void **matched_object); + +_Unwind_Ptr __attribute__((weak)) +__gnu_Unwind_Find_exidx (_Unwind_Ptr, int *); + +/* Misc constants. */ +#define R_IP 12 +#define R_SP 13 +#define R_LR 14 +#define R_PC 15 + +#define EXIDX_CANTUNWIND 1 +#define uint32_highbit (((_uw) 1) << 31) + +#define UCB_FORCED_STOP_FN(ucbp) ((ucbp)->unwinder_cache.reserved1) +#define UCB_PR_ADDR(ucbp) ((ucbp)->unwinder_cache.reserved2) +#define UCB_SAVED_CALLSITE_ADDR(ucbp) ((ucbp)->unwinder_cache.reserved3) +#define UCB_FORCED_STOP_ARG(ucbp) ((ucbp)->unwinder_cache.reserved4) + +struct core_regs +{ + _uw r[16]; +}; + +/* We use normal integer types here to avoid the compiler generating + coprocessor instructions. */ +struct vfp_regs +{ + _uw64 d[16]; + _uw pad; +}; + +struct vfpv3_regs +{ + /* Always populated via VSTM, so no need for the "pad" field from + vfp_regs (which is used to store the format word for FSTMX). */ + _uw64 d[16]; +}; + +struct fpa_reg +{ + _uw w[3]; +}; + +struct fpa_regs +{ + struct fpa_reg f[8]; +}; + +struct wmmxd_regs +{ + _uw64 wd[16]; +}; + +struct wmmxc_regs +{ + _uw wc[4]; +}; + +/* Unwind descriptors. */ + +typedef struct +{ + _uw16 length; + _uw16 offset; +} EHT16; + +typedef struct +{ + _uw length; + _uw offset; +} EHT32; + +/* The ABI specifies that the unwind routines may only use core registers, + except when actually manipulating coprocessor state. This allows + us to write one implementation that works on all platforms by + demand-saving coprocessor registers. + + During unwinding we hold the coprocessor state in the actual hardware + registers and allocate demand-save areas for use during phase1 + unwinding. */ + +typedef struct +{ + /* The first fields must be the same as a phase2_vrs. */ + _uw demand_save_flags; + struct core_regs core; + _uw prev_sp; /* Only valid during forced unwinding. */ + struct vfp_regs vfp; + struct vfpv3_regs vfp_regs_16_to_31; + struct fpa_regs fpa; + struct wmmxd_regs wmmxd; + struct wmmxc_regs wmmxc; +} phase1_vrs; + +#define DEMAND_SAVE_VFP 1 /* VFP state has been saved if not set */ +#define DEMAND_SAVE_VFP_D 2 /* VFP state is for FLDMD/FSTMD if set */ +#define DEMAND_SAVE_VFP_V3 4 /* VFPv3 state for regs 16 .. 31 has + been saved if not set */ +#define DEMAND_SAVE_WMMXD 8 /* iWMMXt data registers have been + saved if not set. */ +#define DEMAND_SAVE_WMMXC 16 /* iWMMXt control registers have been + saved if not set. */ + +/* This must match the structure created by the assembly wrappers. */ +typedef struct +{ + _uw demand_save_flags; + struct core_regs core; +} phase2_vrs; + + +/* An exception index table entry. */ + +typedef struct __EIT_entry +{ + _uw fnoffset; + _uw content; +} __EIT_entry; + +/* Assembly helper functions. */ + +/* Restore core register state. Never returns. */ +void __attribute__((noreturn)) restore_core_regs (struct core_regs *); + + +/* Coprocessor register state manipulation functions. */ + +/* Routines for FLDMX/FSTMX format... */ +void __gnu_Unwind_Save_VFP (struct vfp_regs * p); +void __gnu_Unwind_Restore_VFP (struct vfp_regs * p); +void __gnu_Unwind_Save_WMMXD (struct wmmxd_regs * p); +void __gnu_Unwind_Restore_WMMXD (struct wmmxd_regs * p); +void __gnu_Unwind_Save_WMMXC (struct wmmxc_regs * p); +void __gnu_Unwind_Restore_WMMXC (struct wmmxc_regs * p); + +/* ...and those for FLDMD/FSTMD format... */ +void __gnu_Unwind_Save_VFP_D (struct vfp_regs * p); +void __gnu_Unwind_Restore_VFP_D (struct vfp_regs * p); + +/* ...and those for VLDM/VSTM format, saving/restoring only registers + 16 through 31. */ +void __gnu_Unwind_Save_VFP_D_16_to_31 (struct vfpv3_regs * p); +void __gnu_Unwind_Restore_VFP_D_16_to_31 (struct vfpv3_regs * p); + +/* Restore coprocessor state after phase1 unwinding. */ +static void +restore_non_core_regs (phase1_vrs * vrs) +{ + if ((vrs->demand_save_flags & DEMAND_SAVE_VFP) == 0) + { + if (vrs->demand_save_flags & DEMAND_SAVE_VFP_D) + __gnu_Unwind_Restore_VFP_D (&vrs->vfp); + else + __gnu_Unwind_Restore_VFP (&vrs->vfp); + } + + if ((vrs->demand_save_flags & DEMAND_SAVE_VFP_V3) == 0) + __gnu_Unwind_Restore_VFP_D_16_to_31 (&vrs->vfp_regs_16_to_31); + + if ((vrs->demand_save_flags & DEMAND_SAVE_WMMXD) == 0) + __gnu_Unwind_Restore_WMMXD (&vrs->wmmxd); + if ((vrs->demand_save_flags & DEMAND_SAVE_WMMXC) == 0) + __gnu_Unwind_Restore_WMMXC (&vrs->wmmxc); +} + +/* A better way to do this would probably be to compare the absolute address + with a segment relative relocation of the same symbol. */ + +extern int __text_start; +extern int __data_start; + +/* The exception index table location. */ +extern __EIT_entry __exidx_start; +extern __EIT_entry __exidx_end; + +/* ABI defined personality routines. */ +extern _Unwind_Reason_Code __aeabi_unwind_cpp_pr0 (_Unwind_State, + _Unwind_Control_Block *, _Unwind_Context *);// __attribute__((weak)); +extern _Unwind_Reason_Code __aeabi_unwind_cpp_pr1 (_Unwind_State, + _Unwind_Control_Block *, _Unwind_Context *) __attribute__((weak)); +extern _Unwind_Reason_Code __aeabi_unwind_cpp_pr2 (_Unwind_State, + _Unwind_Control_Block *, _Unwind_Context *) __attribute__((weak)); + +/* ABI defined routine to store a virtual register to memory. */ + +_Unwind_VRS_Result _Unwind_VRS_Get (_Unwind_Context *context, + _Unwind_VRS_RegClass regclass, + _uw regno, + _Unwind_VRS_DataRepresentation representation, + void *valuep) +{ + phase1_vrs *vrs = (phase1_vrs *) context; + + switch (regclass) + { + case _UVRSC_CORE: + if (representation != _UVRSD_UINT32 + || regno > 15) + return _UVRSR_FAILED; + *(_uw *) valuep = vrs->core.r[regno]; + return _UVRSR_OK; + + case _UVRSC_VFP: + case _UVRSC_FPA: + case _UVRSC_WMMXD: + case _UVRSC_WMMXC: + return _UVRSR_NOT_IMPLEMENTED; + + default: + return _UVRSR_FAILED; + } +} + + +/* ABI defined function to load a virtual register from memory. */ + +_Unwind_VRS_Result _Unwind_VRS_Set (_Unwind_Context *context, + _Unwind_VRS_RegClass regclass, + _uw regno, + _Unwind_VRS_DataRepresentation representation, + void *valuep) +{ + phase1_vrs *vrs = (phase1_vrs *) context; + + switch (regclass) + { + case _UVRSC_CORE: + if (representation != _UVRSD_UINT32 + || regno > 15) + return _UVRSR_FAILED; + + vrs->core.r[regno] = *(_uw *) valuep; + return _UVRSR_OK; + + case _UVRSC_VFP: + case _UVRSC_FPA: + case _UVRSC_WMMXD: + case _UVRSC_WMMXC: + return _UVRSR_NOT_IMPLEMENTED; + + default: + return _UVRSR_FAILED; + } +} + + +/* ABI defined function to pop registers off the stack. */ + +_Unwind_VRS_Result _Unwind_VRS_Pop (_Unwind_Context *context, + _Unwind_VRS_RegClass regclass, + _uw discriminator, + _Unwind_VRS_DataRepresentation representation) +{ + phase1_vrs *vrs = (phase1_vrs *) context; + + switch (regclass) + { + case _UVRSC_CORE: + { + _uw *ptr; + _uw mask; + int i; + + if (representation != _UVRSD_UINT32) + return _UVRSR_FAILED; + + mask = discriminator & 0xffff; + ptr = (_uw *) vrs->core.r[R_SP]; + /* Pop the requested registers. */ + for (i = 0; i < 16; i++) + { + if (mask & (1 << i)) + vrs->core.r[i] = *(ptr++); + } + /* Writeback the stack pointer value if it wasn't restored. */ + if ((mask & (1 << R_SP)) == 0) + vrs->core.r[R_SP] = (_uw) ptr; + } + return _UVRSR_OK; + + case _UVRSC_VFP: + { + _uw start = discriminator >> 16; + _uw count = discriminator & 0xffff; + struct vfp_regs tmp; + struct vfpv3_regs tmp_16_to_31; + int tmp_count; + _uw *sp; + _uw *dest; + int num_vfpv3_regs = 0; + + /* We use an approximation here by bounding _UVRSD_DOUBLE + register numbers at 32 always, since we can't detect if + VFPv3 isn't present (in such a case the upper limit is 16). */ + if ((representation != _UVRSD_VFPX && representation != _UVRSD_DOUBLE) + || start + count > (representation == _UVRSD_VFPX ? 16 : 32) + || (representation == _UVRSD_VFPX && start >= 16)) + return _UVRSR_FAILED; + + /* Check if we're being asked to pop VFPv3-only registers + (numbers 16 through 31). */ + if (start >= 16) + num_vfpv3_regs = count; + else if (start + count > 16) + num_vfpv3_regs = start + count - 16; + + if (num_vfpv3_regs && representation != _UVRSD_DOUBLE) + return _UVRSR_FAILED; + + /* Demand-save coprocessor registers for stage1. */ + if (start < 16 && (vrs->demand_save_flags & DEMAND_SAVE_VFP)) + { + vrs->demand_save_flags &= ~DEMAND_SAVE_VFP; + + if (representation == _UVRSD_DOUBLE) + { + /* Save in FLDMD/FSTMD format. */ + vrs->demand_save_flags |= DEMAND_SAVE_VFP_D; + __gnu_Unwind_Save_VFP_D (&vrs->vfp); + } + else + { + /* Save in FLDMX/FSTMX format. */ + vrs->demand_save_flags &= ~DEMAND_SAVE_VFP_D; + __gnu_Unwind_Save_VFP (&vrs->vfp); + } + } + + if (num_vfpv3_regs > 0 + && (vrs->demand_save_flags & DEMAND_SAVE_VFP_V3)) + { + vrs->demand_save_flags &= ~DEMAND_SAVE_VFP_V3; + __gnu_Unwind_Save_VFP_D_16_to_31 (&vrs->vfp_regs_16_to_31); + } + + /* Restore the registers from the stack. Do this by saving the + current VFP registers to a memory area, moving the in-memory + values into that area, and restoring from the whole area. + For _UVRSD_VFPX we assume FSTMX standard format 1. */ + if (representation == _UVRSD_VFPX) + __gnu_Unwind_Save_VFP (&tmp); + else + { + /* Save registers 0 .. 15 if required. */ + if (start < 16) + __gnu_Unwind_Save_VFP_D (&tmp); + + /* Save VFPv3 registers 16 .. 31 if required. */ + if (num_vfpv3_regs) + __gnu_Unwind_Save_VFP_D_16_to_31 (&tmp_16_to_31); + } + + /* Work out how many registers below register 16 need popping. */ + tmp_count = num_vfpv3_regs > 0 ? 16 - start : count; + + /* Copy registers below 16, if needed. + The stack address is only guaranteed to be word aligned, so + we can't use doubleword copies. */ + sp = (_uw *) vrs->core.r[R_SP]; + if (tmp_count > 0) + { + tmp_count *= 2; + dest = (_uw *) &tmp.d[start]; + while (tmp_count--) + *(dest++) = *(sp++); + } + + /* Copy VFPv3 registers numbered >= 16, if needed. */ + if (num_vfpv3_regs > 0) + { + /* num_vfpv3_regs is needed below, so copy it. */ + int tmp_count_2 = num_vfpv3_regs * 2; + int vfpv3_start = start < 16 ? 16 : start; + + dest = (_uw *) &tmp_16_to_31.d[vfpv3_start - 16]; + while (tmp_count_2--) + *(dest++) = *(sp++); + } + + /* Skip the format word space if using FLDMX/FSTMX format. */ + if (representation == _UVRSD_VFPX) + sp++; + + /* Set the new stack pointer. */ + vrs->core.r[R_SP] = (_uw) sp; + + /* Reload the registers. */ + if (representation == _UVRSD_VFPX) + __gnu_Unwind_Restore_VFP (&tmp); + else + { + /* Restore registers 0 .. 15 if required. */ + if (start < 16) + __gnu_Unwind_Restore_VFP_D (&tmp); + + /* Restore VFPv3 registers 16 .. 31 if required. */ + if (num_vfpv3_regs > 0) + __gnu_Unwind_Restore_VFP_D_16_to_31 (&tmp_16_to_31); + } + } + return _UVRSR_OK; + + case _UVRSC_FPA: + return _UVRSR_NOT_IMPLEMENTED; + + case _UVRSC_WMMXD: + { + _uw start = discriminator >> 16; + _uw count = discriminator & 0xffff; + struct wmmxd_regs tmp; + _uw *sp; + _uw *dest; + + if ((representation != _UVRSD_UINT64) || start + count > 16) + return _UVRSR_FAILED; + + if (vrs->demand_save_flags & DEMAND_SAVE_WMMXD) + { + /* Demand-save resisters for stage1. */ + vrs->demand_save_flags &= ~DEMAND_SAVE_WMMXD; + __gnu_Unwind_Save_WMMXD (&vrs->wmmxd); + } + + /* Restore the registers from the stack. Do this by saving the + current WMMXD registers to a memory area, moving the in-memory + values into that area, and restoring from the whole area. */ + __gnu_Unwind_Save_WMMXD (&tmp); + + /* The stack address is only guaranteed to be word aligned, so + we can't use doubleword copies. */ + sp = (_uw *) vrs->core.r[R_SP]; + dest = (_uw *) &tmp.wd[start]; + count *= 2; + while (count--) + *(dest++) = *(sp++); + + /* Set the new stack pointer. */ + vrs->core.r[R_SP] = (_uw) sp; + + /* Reload the registers. */ + __gnu_Unwind_Restore_WMMXD (&tmp); + } + return _UVRSR_OK; + + case _UVRSC_WMMXC: + { + int i; + struct wmmxc_regs tmp; + _uw *sp; + + if ((representation != _UVRSD_UINT32) || discriminator > 16) + return _UVRSR_FAILED; + + if (vrs->demand_save_flags & DEMAND_SAVE_WMMXC) + { + /* Demand-save resisters for stage1. */ + vrs->demand_save_flags &= ~DEMAND_SAVE_WMMXC; + __gnu_Unwind_Save_WMMXC (&vrs->wmmxc); + } + + /* Restore the registers from the stack. Do this by saving the + current WMMXC registers to a memory area, moving the in-memory + values into that area, and restoring from the whole area. */ + __gnu_Unwind_Save_WMMXC (&tmp); + + sp = (_uw *) vrs->core.r[R_SP]; + for (i = 0; i < 4; i++) + if (discriminator & (1 << i)) + tmp.wc[i] = *(sp++); + + /* Set the new stack pointer. */ + vrs->core.r[R_SP] = (_uw) sp; + + /* Reload the registers. */ + __gnu_Unwind_Restore_WMMXC (&tmp); + } + return _UVRSR_OK; + + default: + return _UVRSR_FAILED; + } +} + + +/* Core unwinding functions. */ + +/* Calculate the address encoded by a 31-bit self-relative offset at address + P. */ +static inline _uw +selfrel_offset31 (const _uw *p) +{ + _uw offset; + + offset = *p; + /* Sign extend to 32 bits. */ + if (offset & (1 << 30)) + offset |= 1u << 31; + else + offset &= ~(1u << 31); + + return offset + (_uw) p; +} + + +/* Perform a binary search for RETURN_ADDRESS in TABLE. The table contains + NREC entries. */ + +static const __EIT_entry * +search_EIT_table (const __EIT_entry * table, int nrec, _uw return_address) +{ + _uw next_fn; + _uw this_fn; + int n, left, right; + + if (nrec == 0) + return (__EIT_entry *) 0; + + left = 0; + right = nrec - 1; + + while (1) + { + n = (left + right) / 2; + this_fn = selfrel_offset31 (&table[n].fnoffset); + if (n != nrec - 1) + next_fn = selfrel_offset31 (&table[n + 1].fnoffset) - 1; + else + next_fn = (_uw)0 - 1; + + if (return_address < this_fn) + { + if (n == left) + return (__EIT_entry *) 0; + right = n - 1; + } + else if (return_address <= next_fn) + return &table[n]; + else + left = n + 1; + } +} + +/* Find the exception index table eintry for the given address. + Fill in the relevant fields of the UCB. + Returns _URC_FAILURE if an error occurred, _URC_OK on success. */ + +static _Unwind_Reason_Code +get_eit_entry (_Unwind_Control_Block *ucbp, _uw return_address) +{ + const __EIT_entry * eitp; + int nrec; + + /* The return address is the address of the instruction following the + call instruction (plus one in thumb mode). If this was the last + instruction in the function the address will lie in the following + function. Subtract 2 from the address so that it points within the call + instruction itself. */ + return_address -= 2; + + if (__gnu_Unwind_Find_exidx) + { + eitp = (const __EIT_entry *) __gnu_Unwind_Find_exidx (return_address, + &nrec); + if (!eitp) + { + UCB_PR_ADDR (ucbp) = 0; + return _URC_FAILURE; + } + } + else + { + eitp = &__exidx_start; + nrec = &__exidx_end - &__exidx_start; + } + + eitp = search_EIT_table (eitp, nrec, return_address); + + if (!eitp) + { + UCB_PR_ADDR (ucbp) = 0; + return _URC_FAILURE; + } + ucbp->pr_cache.fnstart = selfrel_offset31 (&eitp->fnoffset); + + /* Can this frame be unwound at all? */ + if (eitp->content == EXIDX_CANTUNWIND) + { + UCB_PR_ADDR (ucbp) = 0; + return _URC_END_OF_STACK; + } + + /* Obtain the address of the "real" __EHT_Header word. */ + + if (eitp->content & uint32_highbit) + { + /* It is immediate data. */ + ucbp->pr_cache.ehtp = (_Unwind_EHT_Header *)&eitp->content; + ucbp->pr_cache.additional = 1; + } + else + { + /* The low 31 bits of the content field are a self-relative + offset to an _Unwind_EHT_Entry structure. */ + ucbp->pr_cache.ehtp = + (_Unwind_EHT_Header *) selfrel_offset31 (&eitp->content); + ucbp->pr_cache.additional = 0; + } + + /* Discover the personality routine address. */ + if (*ucbp->pr_cache.ehtp & (1u << 31)) + { + /* One of the predefined standard routines. */ + _uw idx = (*(_uw *) ucbp->pr_cache.ehtp >> 24) & 0xf; + if (idx == 0) + UCB_PR_ADDR (ucbp) = (_uw) &__aeabi_unwind_cpp_pr0; + else if (idx == 1) + UCB_PR_ADDR (ucbp) = (_uw) &__aeabi_unwind_cpp_pr1; + else if (idx == 2) + UCB_PR_ADDR (ucbp) = (_uw) &__aeabi_unwind_cpp_pr2; + else + { /* Failed */ + UCB_PR_ADDR (ucbp) = 0; + return _URC_FAILURE; + } + } + else + { + /* Execute region offset to PR */ + UCB_PR_ADDR (ucbp) = selfrel_offset31 (ucbp->pr_cache.ehtp); + } + return _URC_OK; +} + + +/* Perform phase2 unwinding. VRS is the initial virtual register state. */ + +static void __attribute__((noreturn)) +unwind_phase2 (_Unwind_Control_Block * ucbp, phase2_vrs * vrs) +{ + _Unwind_Reason_Code pr_result; + + do + { + /* Find the entry for this routine. */ + if (get_eit_entry (ucbp, vrs->core.r[R_PC]) != _URC_OK) + abort (); + + UCB_SAVED_CALLSITE_ADDR (ucbp) = vrs->core.r[R_PC]; + + /* Call the pr to decide what to do. */ + pr_result = ((personality_routine) UCB_PR_ADDR (ucbp)) + (_US_UNWIND_FRAME_STARTING, ucbp, (_Unwind_Context *) vrs); + } + while (pr_result == _URC_CONTINUE_UNWIND); + + if (pr_result != _URC_INSTALL_CONTEXT) + abort(); + + restore_core_regs (&vrs->core); +} + +/* Perform phase2 forced unwinding. */ + +static _Unwind_Reason_Code +unwind_phase2_forced (_Unwind_Control_Block *ucbp, phase2_vrs *entry_vrs, + int resuming) +{ + _Unwind_Stop_Fn stop_fn = (_Unwind_Stop_Fn) UCB_FORCED_STOP_FN (ucbp); + void *stop_arg = (void *)UCB_FORCED_STOP_ARG (ucbp); + _Unwind_Reason_Code pr_result = 0; + /* We use phase1_vrs here even though we do not demand save, for the + prev_sp field. */ + phase1_vrs saved_vrs, next_vrs; + + /* Save the core registers. */ + saved_vrs.core = entry_vrs->core; + /* We don't need to demand-save the non-core registers, because we + unwind in a single pass. */ + saved_vrs.demand_save_flags = 0; + + /* Unwind until we reach a propagation barrier. */ + do + { + _Unwind_State action; + _Unwind_Reason_Code entry_code; + _Unwind_Reason_Code stop_code; + + /* Find the entry for this routine. */ + entry_code = get_eit_entry (ucbp, saved_vrs.core.r[R_PC]); + + if (resuming) + { + action = _US_UNWIND_FRAME_RESUME | _US_FORCE_UNWIND; + resuming = 0; + } + else + action = _US_UNWIND_FRAME_STARTING | _US_FORCE_UNWIND; + + if (entry_code == _URC_OK) + { + UCB_SAVED_CALLSITE_ADDR (ucbp) = saved_vrs.core.r[R_PC]; + + next_vrs = saved_vrs; + + /* Call the pr to decide what to do. */ + pr_result = ((personality_routine) UCB_PR_ADDR (ucbp)) + (action, ucbp, (void *) &next_vrs); + + saved_vrs.prev_sp = next_vrs.core.r[R_SP]; + } + else + { + /* Treat any failure as the end of unwinding, to cope more + gracefully with missing EH information. Mixed EH and + non-EH within one object will usually result in failure, + because the .ARM.exidx tables do not indicate the end + of the code to which they apply; but mixed EH and non-EH + shared objects should return an unwind failure at the + entry of a non-EH shared object. */ + action |= _US_END_OF_STACK; + + saved_vrs.prev_sp = saved_vrs.core.r[R_SP]; + } + + stop_code = stop_fn (1, action, ucbp->exception_class, ucbp, + (void *)&saved_vrs, stop_arg); + if (stop_code != _URC_NO_REASON) + return _URC_FAILURE; + + if (entry_code != _URC_OK) + return entry_code; + + saved_vrs = next_vrs; + } + while (pr_result == _URC_CONTINUE_UNWIND); + + if (pr_result != _URC_INSTALL_CONTEXT) + { + /* Some sort of failure has occurred in the pr and probably the + pr returned _URC_FAILURE. */ + return _URC_FAILURE; + } + + restore_core_regs (&saved_vrs.core); +} + +/* This is a very limited implementation of _Unwind_GetCFA. It returns + the stack pointer as it is about to be unwound, and is only valid + while calling the stop function during forced unwinding. If the + current personality routine result is going to run a cleanup, this + will not be the CFA; but when the frame is really unwound, it will + be. */ + +_Unwind_Word +_Unwind_GetCFA (_Unwind_Context *context) +{ + return ((phase1_vrs *) context)->prev_sp; +} + +/* Perform phase1 unwinding. UCBP is the exception being thrown, and + entry_VRS is the register state on entry to _Unwind_RaiseException. */ + +_Unwind_Reason_Code +__gnu_Unwind_RaiseException (_Unwind_Control_Block *, phase2_vrs *); + +_Unwind_Reason_Code +__gnu_Unwind_RaiseException (_Unwind_Control_Block * ucbp, + phase2_vrs * entry_vrs) +{ + phase1_vrs saved_vrs; + _Unwind_Reason_Code pr_result; + + /* Set the pc to the call site. */ + entry_vrs->core.r[R_PC] = entry_vrs->core.r[R_LR]; + + /* Save the core registers. */ + saved_vrs.core = entry_vrs->core; + /* Set demand-save flags. */ + saved_vrs.demand_save_flags = ~(_uw) 0; + + /* Unwind until we reach a propagation barrier. */ + do + { + /* Find the entry for this routine. */ + if (get_eit_entry (ucbp, saved_vrs.core.r[R_PC]) != _URC_OK) + return _URC_FAILURE; + + /* Call the pr to decide what to do. */ + pr_result = ((personality_routine) UCB_PR_ADDR (ucbp)) + (_US_VIRTUAL_UNWIND_FRAME, ucbp, (void *) &saved_vrs); + } + while (pr_result == _URC_CONTINUE_UNWIND); + + /* We've unwound as far as we want to go, so restore the original + register state. */ + restore_non_core_regs (&saved_vrs); + if (pr_result != _URC_HANDLER_FOUND) + { + /* Some sort of failure has occurred in the pr and probably the + pr returned _URC_FAILURE. */ + return _URC_FAILURE; + } + + unwind_phase2 (ucbp, entry_vrs); +} + +/* Resume unwinding after a cleanup has been run. UCBP is the exception + being thrown and ENTRY_VRS is the register state on entry to + _Unwind_Resume. */ +_Unwind_Reason_Code +__gnu_Unwind_ForcedUnwind (_Unwind_Control_Block *, + _Unwind_Stop_Fn, void *, phase2_vrs *); + +_Unwind_Reason_Code +__gnu_Unwind_ForcedUnwind (_Unwind_Control_Block *ucbp, + _Unwind_Stop_Fn stop_fn, void *stop_arg, + phase2_vrs *entry_vrs) +{ + UCB_FORCED_STOP_FN (ucbp) = (_uw) stop_fn; + UCB_FORCED_STOP_ARG (ucbp) = (_uw) stop_arg; + + /* Set the pc to the call site. */ + entry_vrs->core.r[R_PC] = entry_vrs->core.r[R_LR]; + + return unwind_phase2_forced (ucbp, entry_vrs, 0); +} + +_Unwind_Reason_Code +__gnu_Unwind_Resume (_Unwind_Control_Block *, phase2_vrs *); + +_Unwind_Reason_Code +__gnu_Unwind_Resume (_Unwind_Control_Block * ucbp, phase2_vrs * entry_vrs) +{ + _Unwind_Reason_Code pr_result; + + /* Recover the saved address. */ + entry_vrs->core.r[R_PC] = UCB_SAVED_CALLSITE_ADDR (ucbp); + + if (UCB_FORCED_STOP_FN (ucbp)) + { + unwind_phase2_forced (ucbp, entry_vrs, 1); + + /* We can't return failure at this point. */ + abort (); + } + + /* Call the cached PR. */ + pr_result = ((personality_routine) UCB_PR_ADDR (ucbp)) + (_US_UNWIND_FRAME_RESUME, ucbp, (_Unwind_Context *) entry_vrs); + + switch (pr_result) + { + case _URC_INSTALL_CONTEXT: + /* Upload the registers to enter the landing pad. */ + restore_core_regs (&entry_vrs->core); + + case _URC_CONTINUE_UNWIND: + /* Continue unwinding the next frame. */ + unwind_phase2 (ucbp, entry_vrs); + + default: + abort (); + } +} + +_Unwind_Reason_Code +__gnu_Unwind_Resume_or_Rethrow (_Unwind_Control_Block *, phase2_vrs *); + +_Unwind_Reason_Code +__gnu_Unwind_Resume_or_Rethrow (_Unwind_Control_Block * ucbp, + phase2_vrs * entry_vrs) +{ + if (!UCB_FORCED_STOP_FN (ucbp)) + return __gnu_Unwind_RaiseException (ucbp, entry_vrs); + + /* Set the pc to the call site. */ + entry_vrs->core.r[R_PC] = entry_vrs->core.r[R_LR]; + /* Continue unwinding the next frame. */ + return unwind_phase2_forced (ucbp, entry_vrs, 0); +} + +/* Clean up an exception object when unwinding is complete. */ +void +_Unwind_Complete (_Unwind_Control_Block * ucbp __attribute__((unused))) +{ +} + + +/* Get the _Unwind_Control_Block from an _Unwind_Context. */ + +static inline _Unwind_Control_Block * +unwind_UCB_from_context (_Unwind_Context * context) +{ + return (_Unwind_Control_Block *) _Unwind_GetGR (context, R_IP); +} + + +/* Free an exception. */ + +void +_Unwind_DeleteException (_Unwind_Exception * exc) +{ + if (exc->exception_cleanup) + (*exc->exception_cleanup) (_URC_FOREIGN_EXCEPTION_CAUGHT, exc); +} + + +/* Perform stack backtrace through unwind data. */ +_Unwind_Reason_Code +__gnu_Unwind_Backtrace(_Unwind_Trace_Fn trace, void * trace_argument, + phase2_vrs * entry_vrs); +_Unwind_Reason_Code +__gnu_Unwind_Backtrace(_Unwind_Trace_Fn trace, void * trace_argument, + phase2_vrs * entry_vrs) +{ + phase1_vrs saved_vrs; + _Unwind_Reason_Code code; + + _Unwind_Control_Block ucb; + _Unwind_Control_Block *ucbp = &ucb; + + /* Set the pc to the call site. */ + entry_vrs->core.r[R_PC] = entry_vrs->core.r[R_LR]; + + /* Save the core registers. */ + saved_vrs.core = entry_vrs->core; + /* Set demand-save flags. */ + saved_vrs.demand_save_flags = ~(_uw) 0; + + do + { + /* Find the entry for this routine. */ + if (get_eit_entry (ucbp, saved_vrs.core.r[R_PC]) != _URC_OK) + { + code = _URC_FAILURE; + break; + } + + /* The dwarf unwinder assumes the context structure holds things + like the function and LSDA pointers. The ARM implementation + caches these in the exception header (UCB). To avoid + rewriting everything we make the virtual IP register point at + the UCB. */ + _Unwind_SetGR((_Unwind_Context *)&saved_vrs, 12, (_Unwind_Ptr) ucbp); + + /* Call trace function. */ + if ((*trace) ((_Unwind_Context *) &saved_vrs, trace_argument) + != _URC_NO_REASON) + { + code = _URC_FAILURE; + break; + } + + /* Call the pr to decide what to do. */ + code = ((personality_routine) UCB_PR_ADDR (ucbp)) + (_US_VIRTUAL_UNWIND_FRAME | _US_FORCE_UNWIND, + ucbp, (void *) &saved_vrs); + } + while (code != _URC_END_OF_STACK + && code != _URC_FAILURE); + + restore_non_core_regs (&saved_vrs); + return code; +} + + +/* Common implementation for ARM ABI defined personality routines. + ID is the index of the personality routine, other arguments are as defined + by __aeabi_unwind_cpp_pr{0,1,2}. */ + +static _Unwind_Reason_Code +__gnu_unwind_pr_common (_Unwind_State state, + _Unwind_Control_Block *ucbp, + _Unwind_Context *context, + int id) +{ + __gnu_unwind_state uws; + _uw *data; + _uw offset; + _uw len; + _uw rtti_count; + int phase2_call_unexpected_after_unwind = 0; + int in_range = 0; + int forced_unwind = state & _US_FORCE_UNWIND; + + state &= _US_ACTION_MASK; + + data = (_uw *) ucbp->pr_cache.ehtp; + uws.data = *(data++); + uws.next = data; + if (id == 0) + { + uws.data <<= 8; + uws.words_left = 0; + uws.bytes_left = 3; + } + else + { + uws.words_left = (uws.data >> 16) & 0xff; + uws.data <<= 16; + uws.bytes_left = 2; + data += uws.words_left; + } + + /* Restore the saved pointer. */ + if (state == _US_UNWIND_FRAME_RESUME) + data = (_uw *) ucbp->cleanup_cache.bitpattern[0]; + + if ((ucbp->pr_cache.additional & 1) == 0) + { + /* Process descriptors. */ + while (*data) + { + _uw addr; + _uw fnstart; + + if (id == 2) + { + len = ((EHT32 *) data)->length; + offset = ((EHT32 *) data)->offset; + data += 2; + } + else + { + len = ((EHT16 *) data)->length; + offset = ((EHT16 *) data)->offset; + data++; + } + + fnstart = ucbp->pr_cache.fnstart + (offset & ~1); + addr = _Unwind_GetGR (context, R_PC); + in_range = (fnstart <= addr && addr < fnstart + (len & ~1)); + + switch (((offset & 1) << 1) | (len & 1)) + { + case 0: + /* Cleanup. */ + if (state != _US_VIRTUAL_UNWIND_FRAME + && in_range) + { + /* Cleanup in range, and we are running cleanups. */ + _uw lp; + + /* Landing pad address is 31-bit pc-relative offset. */ + lp = selfrel_offset31 (data); + data++; + /* Save the exception data pointer. */ + ucbp->cleanup_cache.bitpattern[0] = (_uw) data; + if (!__cxa_begin_cleanup (ucbp)) + return _URC_FAILURE; + /* Setup the VRS to enter the landing pad. */ + _Unwind_SetGR (context, R_PC, lp); + return _URC_INSTALL_CONTEXT; + } + /* Cleanup not in range, or we are in stage 1. */ + data++; + break; + + case 1: + /* Catch handler. */ + if (state == _US_VIRTUAL_UNWIND_FRAME) + { + if (in_range) + { + /* Check for a barrier. */ + _uw rtti; + bool is_reference = (data[0] & uint32_highbit) != 0; + void *matched; + + /* Check for no-throw areas. */ + if (data[1] == (_uw) -2) + return _URC_FAILURE; + + /* The thrown object immediately follows the ECB. */ + matched = (void *)(ucbp + 1); + if (data[1] != (_uw) -1) + { + /* Match a catch specification. */ + rtti = _Unwind_decode_target2 ((_uw) &data[1]); + if (!__cxa_type_match (ucbp, (type_info *) rtti, + is_reference, + &matched)) + matched = (void *)0; + } + + if (matched) + { + ucbp->barrier_cache.sp = + _Unwind_GetGR (context, R_SP); + ucbp->barrier_cache.bitpattern[0] = (_uw) matched; + ucbp->barrier_cache.bitpattern[1] = (_uw) data; + return _URC_HANDLER_FOUND; + } + } + /* Handler out of range, or not matched. */ + } + else if (ucbp->barrier_cache.sp == _Unwind_GetGR (context, R_SP) + && ucbp->barrier_cache.bitpattern[1] == (_uw) data) + { + /* Matched a previous propagation barrier. */ + _uw lp; + + /* Setup for entry to the handler. */ + lp = selfrel_offset31 (data); + _Unwind_SetGR (context, R_PC, lp); + _Unwind_SetGR (context, 0, (_uw) ucbp); + return _URC_INSTALL_CONTEXT; + } + /* Catch handler not matched. Advance to the next descriptor. */ + data += 2; + break; + + case 2: + rtti_count = data[0] & 0x7fffffff; + /* Exception specification. */ + if (state == _US_VIRTUAL_UNWIND_FRAME) + { + if (in_range && (!forced_unwind || !rtti_count)) + { + /* Match against the exception specification. */ + _uw i; + _uw rtti; + void *matched; + + for (i = 0; i < rtti_count; i++) + { + matched = (void *)(ucbp + 1); + rtti = _Unwind_decode_target2 ((_uw) &data[i + 1]); + if (__cxa_type_match (ucbp, (type_info *) rtti, 0, + &matched)) + break; + } + + if (i == rtti_count) + { + /* Exception does not match the spec. */ + ucbp->barrier_cache.sp = + _Unwind_GetGR (context, R_SP); + ucbp->barrier_cache.bitpattern[0] = (_uw) matched; + ucbp->barrier_cache.bitpattern[1] = (_uw) data; + return _URC_HANDLER_FOUND; + } + } + /* Handler out of range, or exception is permitted. */ + } + else if (ucbp->barrier_cache.sp == _Unwind_GetGR (context, R_SP) + && ucbp->barrier_cache.bitpattern[1] == (_uw) data) + { + /* Matched a previous propagation barrier. */ + _uw lp; + /* Record the RTTI list for __cxa_call_unexpected. */ + ucbp->barrier_cache.bitpattern[1] = rtti_count; + ucbp->barrier_cache.bitpattern[2] = 0; + ucbp->barrier_cache.bitpattern[3] = 4; + ucbp->barrier_cache.bitpattern[4] = (_uw) &data[1]; + + if (data[0] & uint32_highbit) + phase2_call_unexpected_after_unwind = 1; + else + { + data += rtti_count + 1; + /* Setup for entry to the handler. */ + lp = selfrel_offset31 (data); + data++; + _Unwind_SetGR (context, R_PC, lp); + _Unwind_SetGR (context, 0, (_uw) ucbp); + return _URC_INSTALL_CONTEXT; + } + } + if (data[0] & uint32_highbit) + data++; + data += rtti_count + 1; + break; + + default: + /* Should never happen. */ + return _URC_FAILURE; + } + /* Finished processing this descriptor. */ + } + } + + if (__gnu_unwind_execute (context, &uws) != _URC_OK) + return _URC_FAILURE; + + if (phase2_call_unexpected_after_unwind) + { + /* Enter __cxa_unexpected as if called from the call site. */ + _Unwind_SetGR (context, R_LR, _Unwind_GetGR (context, R_PC)); + _Unwind_SetGR (context, R_PC, (_uw) &__cxa_call_unexpected); + return _URC_INSTALL_CONTEXT; + } + + return _URC_CONTINUE_UNWIND; +} + + +/* ABI defined personality routine entry points. */ + +_Unwind_Reason_Code +__aeabi_unwind_cpp_pr0 (_Unwind_State state, + _Unwind_Control_Block *ucbp, + _Unwind_Context *context) +{ + return __gnu_unwind_pr_common (state, ucbp, context, 0); +} + +_Unwind_Reason_Code +__aeabi_unwind_cpp_pr1 (_Unwind_State state, + _Unwind_Control_Block *ucbp, + _Unwind_Context *context) +{ + return __gnu_unwind_pr_common (state, ucbp, context, 1); +} + +_Unwind_Reason_Code +__aeabi_unwind_cpp_pr2 (_Unwind_State state, + _Unwind_Control_Block *ucbp, + _Unwind_Context *context) +{ + return __gnu_unwind_pr_common (state, ucbp, context, 2); +} diff --git a/gcc/config/arm/unwind-arm.h b/gcc/config/arm/unwind-arm.h new file mode 100644 index 000000000..a9ba1267a --- /dev/null +++ b/gcc/config/arm/unwind-arm.h @@ -0,0 +1,281 @@ +/* Header file for the ARM EABI unwinder + Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009 + Free Software Foundation, Inc. + Contributed by Paul Brook + + This file is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by the + Free Software Foundation; either version 3, or (at your option) any + later version. + + This file is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +/* Language-independent unwinder header public defines. This contains both + ABI defined objects, and GNU support routines. */ + +#ifndef UNWIND_ARM_H +#define UNWIND_ARM_H + +#define __ARM_EABI_UNWINDER__ 1 + +#ifdef __cplusplus +extern "C" { +#endif + typedef unsigned _Unwind_Word __attribute__((__mode__(__word__))); + typedef signed _Unwind_Sword __attribute__((__mode__(__word__))); + typedef unsigned _Unwind_Ptr __attribute__((__mode__(__pointer__))); + typedef unsigned _Unwind_Internal_Ptr __attribute__((__mode__(__pointer__))); + typedef _Unwind_Word _uw; + typedef unsigned _uw64 __attribute__((mode(__DI__))); + typedef unsigned _uw16 __attribute__((mode(__HI__))); + typedef unsigned _uw8 __attribute__((mode(__QI__))); + + typedef enum + { + _URC_OK = 0, /* operation completed successfully */ + _URC_FOREIGN_EXCEPTION_CAUGHT = 1, + _URC_END_OF_STACK = 5, + _URC_HANDLER_FOUND = 6, + _URC_INSTALL_CONTEXT = 7, + _URC_CONTINUE_UNWIND = 8, + _URC_FAILURE = 9 /* unspecified failure of some kind */ + } + _Unwind_Reason_Code; + + typedef enum + { + _US_VIRTUAL_UNWIND_FRAME = 0, + _US_UNWIND_FRAME_STARTING = 1, + _US_UNWIND_FRAME_RESUME = 2, + _US_ACTION_MASK = 3, + _US_FORCE_UNWIND = 8, + _US_END_OF_STACK = 16 + } + _Unwind_State; + + /* Provided only for for compatibility with existing code. */ + typedef int _Unwind_Action; +#define _UA_SEARCH_PHASE 1 +#define _UA_CLEANUP_PHASE 2 +#define _UA_HANDLER_FRAME 4 +#define _UA_FORCE_UNWIND 8 +#define _UA_END_OF_STACK 16 +#define _URC_NO_REASON _URC_OK + + typedef struct _Unwind_Control_Block _Unwind_Control_Block; + typedef struct _Unwind_Context _Unwind_Context; + typedef _uw _Unwind_EHT_Header; + + + /* UCB: */ + + struct _Unwind_Control_Block + { + char exception_class[8]; + void (*exception_cleanup)(_Unwind_Reason_Code, _Unwind_Control_Block *); + /* Unwinder cache, private fields for the unwinder's use */ + struct + { + _uw reserved1; /* Forced unwind stop fn, 0 if not forced */ + _uw reserved2; /* Personality routine address */ + _uw reserved3; /* Saved callsite address */ + _uw reserved4; /* Forced unwind stop arg */ + _uw reserved5; + } + unwinder_cache; + /* Propagation barrier cache (valid after phase 1): */ + struct + { + _uw sp; + _uw bitpattern[5]; + } + barrier_cache; + /* Cleanup cache (preserved over cleanup): */ + struct + { + _uw bitpattern[4]; + } + cleanup_cache; + /* Pr cache (for pr's benefit): */ + struct + { + _uw fnstart; /* function start address */ + _Unwind_EHT_Header *ehtp; /* pointer to EHT entry header word */ + _uw additional; /* additional data */ + _uw reserved1; + } + pr_cache; + long long int :0; /* Force alignment to 8-byte boundary */ + }; + + /* Virtual Register Set*/ + + typedef enum + { + _UVRSC_CORE = 0, /* integer register */ + _UVRSC_VFP = 1, /* vfp */ + _UVRSC_FPA = 2, /* fpa */ + _UVRSC_WMMXD = 3, /* Intel WMMX data register */ + _UVRSC_WMMXC = 4 /* Intel WMMX control register */ + } + _Unwind_VRS_RegClass; + + typedef enum + { + _UVRSD_UINT32 = 0, + _UVRSD_VFPX = 1, + _UVRSD_FPAX = 2, + _UVRSD_UINT64 = 3, + _UVRSD_FLOAT = 4, + _UVRSD_DOUBLE = 5 + } + _Unwind_VRS_DataRepresentation; + + typedef enum + { + _UVRSR_OK = 0, + _UVRSR_NOT_IMPLEMENTED = 1, + _UVRSR_FAILED = 2 + } + _Unwind_VRS_Result; + + /* Frame unwinding state. */ + typedef struct + { + /* The current word (bytes packed msb first). */ + _uw data; + /* Pointer to the next word of data. */ + _uw *next; + /* The number of bytes left in this word. */ + _uw8 bytes_left; + /* The number of words pointed to by ptr. */ + _uw8 words_left; + } + __gnu_unwind_state; + + typedef _Unwind_Reason_Code (*personality_routine) (_Unwind_State, + _Unwind_Control_Block *, _Unwind_Context *); + + _Unwind_VRS_Result _Unwind_VRS_Set(_Unwind_Context *, _Unwind_VRS_RegClass, + _uw, _Unwind_VRS_DataRepresentation, + void *); + + _Unwind_VRS_Result _Unwind_VRS_Get(_Unwind_Context *, _Unwind_VRS_RegClass, + _uw, _Unwind_VRS_DataRepresentation, + void *); + + _Unwind_VRS_Result _Unwind_VRS_Pop(_Unwind_Context *, _Unwind_VRS_RegClass, + _uw, _Unwind_VRS_DataRepresentation); + + + /* Support functions for the PR. */ +#define _Unwind_Exception _Unwind_Control_Block + typedef char _Unwind_Exception_Class[8]; + + void * _Unwind_GetLanguageSpecificData (_Unwind_Context *); + _Unwind_Ptr _Unwind_GetRegionStart (_Unwind_Context *); + + /* These two should never be used. */ + _Unwind_Ptr _Unwind_GetDataRelBase (_Unwind_Context *); + _Unwind_Ptr _Unwind_GetTextRelBase (_Unwind_Context *); + + /* Interface functions: */ + _Unwind_Reason_Code _Unwind_RaiseException(_Unwind_Control_Block *ucbp); + void __attribute__((noreturn)) _Unwind_Resume(_Unwind_Control_Block *ucbp); + _Unwind_Reason_Code _Unwind_Resume_or_Rethrow (_Unwind_Control_Block *ucbp); + + typedef _Unwind_Reason_Code (*_Unwind_Stop_Fn) + (int, _Unwind_Action, _Unwind_Exception_Class, + _Unwind_Control_Block *, struct _Unwind_Context *, void *); + _Unwind_Reason_Code _Unwind_ForcedUnwind (_Unwind_Control_Block *, + _Unwind_Stop_Fn, void *); + /* @@@ Use unwind data to perform a stack backtrace. The trace callback + is called for every stack frame in the call chain, but no cleanup + actions are performed. */ + typedef _Unwind_Reason_Code (*_Unwind_Trace_Fn) (_Unwind_Context *, void *); + _Unwind_Reason_Code _Unwind_Backtrace(_Unwind_Trace_Fn, + void*); + + _Unwind_Word _Unwind_GetCFA (struct _Unwind_Context *); + void _Unwind_Complete(_Unwind_Control_Block *ucbp); + void _Unwind_DeleteException (_Unwind_Exception *); + + _Unwind_Reason_Code __gnu_unwind_frame (_Unwind_Control_Block *, + _Unwind_Context *); + _Unwind_Reason_Code __gnu_unwind_execute (_Unwind_Context *, + __gnu_unwind_state *); + + /* Decode an R_ARM_TARGET2 relocation. */ + static inline _Unwind_Word + _Unwind_decode_target2 (_Unwind_Word ptr) + { + _Unwind_Word tmp; + + tmp = *(_Unwind_Word *) ptr; + /* Zero values are always NULL. */ + if (!tmp) + return 0; + +#if (defined(linux) && !defined(__uClinux__)) || defined(__NetBSD__) + /* Pc-relative indirect. */ + tmp += ptr; + tmp = *(_Unwind_Word *) tmp; +#elif defined(__symbian__) || defined(__uClinux__) + /* Absolute pointer. Nothing more to do. */ +#else + /* Pc-relative pointer. */ + tmp += ptr; +#endif + return tmp; + } + + static inline _Unwind_Word + _Unwind_GetGR (_Unwind_Context *context, int regno) + { + _uw val; + _Unwind_VRS_Get (context, _UVRSC_CORE, regno, _UVRSD_UINT32, &val); + return val; + } + + /* Return the address of the instruction, not the actual IP value. */ +#define _Unwind_GetIP(context) \ + (_Unwind_GetGR (context, 15) & ~(_Unwind_Word)1) + +#define _Unwind_GetIPInfo(context, ip_before_insn) \ + (*ip_before_insn = 0, _Unwind_GetGR (context, 15) & ~(_Unwind_Word)1) + + static inline void + _Unwind_SetGR (_Unwind_Context *context, int regno, _Unwind_Word val) + { + _Unwind_VRS_Set (context, _UVRSC_CORE, regno, _UVRSD_UINT32, &val); + } + + /* The dwarf unwinder doesn't understand arm/thumb state. We assume the + landing pad uses the same instruction set as the call site. */ +#define _Unwind_SetIP(context, val) \ + _Unwind_SetGR (context, 15, val | (_Unwind_GetGR (context, 15) & 1)) + +/* leb128 type numbers have a potentially unlimited size. + The target of the following definitions of _sleb128_t and _uleb128_t + is to have efficient data types large enough to hold the leb128 type + numbers used in the unwind code. */ +typedef long _sleb128_t; +typedef unsigned long _uleb128_t; + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* defined UNWIND_ARM_H */ diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md new file mode 100644 index 000000000..c27c41411 --- /dev/null +++ b/gcc/config/arm/vec-common.md @@ -0,0 +1,110 @@ +;; Machine Description for shared bits common to IWMMXT and Neon. +;; Copyright (C) 2006, 2007, 2010 Free Software Foundation, Inc. +;; Written by CodeSourcery. +;; +;; This file is part of GCC. +;; +;; GCC is free software; you can redistribute it and/or modify it +;; under the terms of the GNU General Public License as published by +;; the Free Software Foundation; either version 3, or (at your option) +;; any later version. +;; +;; GCC is distributed in the hope that it will be useful, but +;; WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;; General Public License for more details. +;; +;; You should have received a copy of the GNU General Public License +;; along with GCC; see the file COPYING3. If not see +;; . + +;; Vector Moves + +(define_expand "mov" + [(set (match_operand:VALL 0 "nonimmediate_operand" "") + (match_operand:VALL 1 "general_operand" ""))] + "TARGET_NEON + || (TARGET_REALLY_IWMMXT && VALID_IWMMXT_REG_MODE (mode))" +{ + if (can_create_pseudo_p ()) + { + if (GET_CODE (operands[0]) != REG) + operands[1] = force_reg (mode, operands[1]); + else if (TARGET_NEON && CONSTANT_P (operands[1])) + { + operands[1] = neon_make_constant (operands[1]); + gcc_assert (operands[1] != NULL_RTX); + } + } +}) + +;; Vector arithmetic. Expanders are blank, then unnamed insns implement +;; patterns separately for IWMMXT and Neon. + +(define_expand "add3" + [(set (match_operand:VALL 0 "s_register_operand" "") + (plus:VALL (match_operand:VALL 1 "s_register_operand" "") + (match_operand:VALL 2 "s_register_operand" "")))] + "(TARGET_NEON && ((mode != V2SFmode && mode != V4SFmode) + || flag_unsafe_math_optimizations)) + || (TARGET_REALLY_IWMMXT && VALID_IWMMXT_REG_MODE (mode))" +{ +}) + +(define_expand "sub3" + [(set (match_operand:VALL 0 "s_register_operand" "") + (minus:VALL (match_operand:VALL 1 "s_register_operand" "") + (match_operand:VALL 2 "s_register_operand" "")))] + "(TARGET_NEON && ((mode != V2SFmode && mode != V4SFmode) + || flag_unsafe_math_optimizations)) + || (TARGET_REALLY_IWMMXT && VALID_IWMMXT_REG_MODE (mode))" +{ +}) + +(define_expand "mul3" + [(set (match_operand:VALLW 0 "s_register_operand" "") + (mult:VALLW (match_operand:VALLW 1 "s_register_operand" "") + (match_operand:VALLW 2 "s_register_operand" "")))] + "(TARGET_NEON && ((mode != V2SFmode && mode != V4SFmode) + || flag_unsafe_math_optimizations)) + || (mode == V4HImode && TARGET_REALLY_IWMMXT)" +{ +}) + +(define_expand "smin3" + [(set (match_operand:VALLW 0 "s_register_operand" "") + (smin:VALLW (match_operand:VALLW 1 "s_register_operand" "") + (match_operand:VALLW 2 "s_register_operand" "")))] + "(TARGET_NEON && ((mode != V2SFmode && mode != V4SFmode) + || flag_unsafe_math_optimizations)) + || (TARGET_REALLY_IWMMXT && VALID_IWMMXT_REG_MODE (mode))" +{ +}) + +(define_expand "umin3" + [(set (match_operand:VINTW 0 "s_register_operand" "") + (umin:VINTW (match_operand:VINTW 1 "s_register_operand" "") + (match_operand:VINTW 2 "s_register_operand" "")))] + "TARGET_NEON + || (TARGET_REALLY_IWMMXT && VALID_IWMMXT_REG_MODE (mode))" +{ +}) + +(define_expand "smax3" + [(set (match_operand:VALLW 0 "s_register_operand" "") + (smax:VALLW (match_operand:VALLW 1 "s_register_operand" "") + (match_operand:VALLW 2 "s_register_operand" "")))] + "(TARGET_NEON && ((mode != V2SFmode && mode != V4SFmode) + || flag_unsafe_math_optimizations)) + || (TARGET_REALLY_IWMMXT && VALID_IWMMXT_REG_MODE (mode))" +{ +}) + +(define_expand "umax3" + [(set (match_operand:VINTW 0 "s_register_operand" "") + (umax:VINTW (match_operand:VINTW 1 "s_register_operand" "") + (match_operand:VINTW 2 "s_register_operand" "")))] + "TARGET_NEON + || (TARGET_REALLY_IWMMXT && VALID_IWMMXT_REG_MODE (mode))" +{ +}) diff --git a/gcc/config/arm/vfp.md b/gcc/config/arm/vfp.md new file mode 100644 index 000000000..1ac2d0c2d --- /dev/null +++ b/gcc/config/arm/vfp.md @@ -0,0 +1,1153 @@ +;; ARM VFP instruction patterns +;; Copyright (C) 2003, 2005, 2006, 2007, 2008, 2010 +;; Free Software Foundation, Inc. +;; Written by CodeSourcery. +;; +;; This file is part of GCC. +;; +;; GCC is free software; you can redistribute it and/or modify it +;; under the terms of the GNU General Public License as published by +;; the Free Software Foundation; either version 3, or (at your option) +;; any later version. +;; +;; GCC is distributed in the hope that it will be useful, but +;; WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;; General Public License for more details. +;; +;; You should have received a copy of the GNU General Public License +;; along with GCC; see the file COPYING3. If not see +;; . */ + +;; Additional register numbers +(define_constants + [(VFPCC_REGNUM 127)] +) + +;; The VFP "type" attributes differ from those used in the FPA model. +;; fcpys Single precision cpy. +;; ffariths Single precision abs, neg. +;; ffarithd Double precision abs, neg, cpy. +;; fadds Single precision add/sub. +;; faddd Double precision add/sub. +;; fconsts Single precision load immediate. +;; fconstd Double precision load immediate. +;; fcmps Single precision comparison. +;; fcmpd Double precision comparison. +;; fmuls Single precision multiply. +;; fmuld Double precision multiply. +;; fmacs Single precision multiply-accumulate. +;; fmacd Double precision multiply-accumulate. +;; fdivs Single precision sqrt or division. +;; fdivd Double precision sqrt or division. +;; f_flag fmstat operation +;; f_load[sd] Floating point load from memory. +;; f_store[sd] Floating point store to memory. +;; f_2_r Transfer vfp to arm reg. +;; r_2_f Transfer arm to vfp reg. +;; f_cvt Convert floating<->integral + +;; SImode moves +;; ??? For now do not allow loading constants into vfp regs. This causes +;; problems because small constants get converted into adds. +(define_insn "*arm_movsi_vfp" + [(set (match_operand:SI 0 "nonimmediate_operand" "=rk,r,r,r,rk,m ,*t,r,*t,*t, *Uv") + (match_operand:SI 1 "general_operand" "rk, I,K,j,mi,rk,r,*t,*t,*Uvi,*t"))] + "TARGET_ARM && TARGET_VFP && TARGET_HARD_FLOAT + && ( s_register_operand (operands[0], SImode) + || s_register_operand (operands[1], SImode))" + "* + switch (which_alternative) + { + case 0: case 1: + return \"mov%?\\t%0, %1\"; + case 2: + return \"mvn%?\\t%0, #%B1\"; + case 3: + return \"movw%?\\t%0, %1\"; + case 4: + return \"ldr%?\\t%0, %1\"; + case 5: + return \"str%?\\t%1, %0\"; + case 6: + return \"fmsr%?\\t%0, %1\\t%@ int\"; + case 7: + return \"fmrs%?\\t%0, %1\\t%@ int\"; + case 8: + return \"fcpys%?\\t%0, %1\\t%@ int\"; + case 9: case 10: + return output_move_vfp (operands); + default: + gcc_unreachable (); + } + " + [(set_attr "predicable" "yes") + (set_attr "type" "*,*,*,*,load1,store1,r_2_f,f_2_r,fcpys,f_loads,f_stores") + (set_attr "insn" "mov,mov,mvn,mov,*,*,*,*,*,*,*") + (set_attr "pool_range" "*,*,*,*,4096,*,*,*,*,1020,*") + (set_attr "neg_pool_range" "*,*,*,*,4084,*,*,*,*,1008,*")] +) + +;; See thumb2.md:thumb2_movsi_insn for an explanation of the split +;; high/low register alternatives for loads and stores here. +(define_insn "*thumb2_movsi_vfp" + [(set (match_operand:SI 0 "nonimmediate_operand" "=rk,r,r,r, l,*hk,m, *m,*t, r,*t,*t, *Uv") + (match_operand:SI 1 "general_operand" "rk, I,K,j,mi,*mi,l,*hk, r,*t,*t,*Uvi,*t"))] + "TARGET_THUMB2 && TARGET_VFP && TARGET_HARD_FLOAT + && ( s_register_operand (operands[0], SImode) + || s_register_operand (operands[1], SImode))" + "* + switch (which_alternative) + { + case 0: case 1: + return \"mov%?\\t%0, %1\"; + case 2: + return \"mvn%?\\t%0, #%B1\"; + case 3: + return \"movw%?\\t%0, %1\"; + case 4: + case 5: + return \"ldr%?\\t%0, %1\"; + case 6: + case 7: + return \"str%?\\t%1, %0\"; + case 8: + return \"fmsr%?\\t%0, %1\\t%@ int\"; + case 9: + return \"fmrs%?\\t%0, %1\\t%@ int\"; + case 10: + return \"fcpys%?\\t%0, %1\\t%@ int\"; + case 11: case 12: + return output_move_vfp (operands); + default: + gcc_unreachable (); + } + " + [(set_attr "predicable" "yes") + (set_attr "type" "*,*,*,*,load1,load1,store1,store1,r_2_f,f_2_r,fcpys,f_loads,f_stores") + (set_attr "insn" "mov,mov,mvn,mov,*,*,*,*,*,*,*,*,*") + (set_attr "pool_range" "*,*,*,*,1020,4096,*,*,*,*,*,1020,*") + (set_attr "neg_pool_range" "*,*,*,*, 0, 0,*,*,*,*,*,1008,*")] +) + + +;; DImode moves + +(define_insn "*arm_movdi_vfp" + [(set (match_operand:DI 0 "nonimmediate_di_operand" "=r, r,m,w,r,w,w, Uv") + (match_operand:DI 1 "di_operand" "rIK,mi,r,r,w,w,Uvi,w"))] + "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP + && ( register_operand (operands[0], DImode) + || register_operand (operands[1], DImode))" + "* + switch (which_alternative) + { + case 0: + return \"#\"; + case 1: + case 2: + return output_move_double (operands); + case 3: + return \"fmdrr%?\\t%P0, %Q1, %R1\\t%@ int\"; + case 4: + return \"fmrrd%?\\t%Q0, %R0, %P1\\t%@ int\"; + case 5: + if (TARGET_VFP_SINGLE) + return \"fcpys%?\\t%0, %1\\t%@ int\;fcpys%?\\t%p0, %p1\\t%@ int\"; + else + return \"fcpyd%?\\t%P0, %P1\\t%@ int\"; + case 6: case 7: + return output_move_vfp (operands); + default: + gcc_unreachable (); + } + " + [(set_attr "type" "*,load2,store2,r_2_f,f_2_r,ffarithd,f_loadd,f_stored") + (set (attr "length") (cond [(eq_attr "alternative" "0,1,2") (const_int 8) + (eq_attr "alternative" "5") + (if_then_else + (eq (symbol_ref "TARGET_VFP_SINGLE") + (const_int 1)) + (const_int 8) + (const_int 4))] + (const_int 4))) + (set_attr "predicable" "yes") + (set_attr "pool_range" "*,1020,*,*,*,*,1020,*") + (set_attr "neg_pool_range" "*,1008,*,*,*,*,1008,*")] +) + +(define_insn "*thumb2_movdi_vfp" + [(set (match_operand:DI 0 "nonimmediate_di_operand" "=r, r,m,w,r,w,w, Uv") + (match_operand:DI 1 "di_operand" "rIK,mi,r,r,w,w,Uvi,w"))] + "TARGET_THUMB2 && TARGET_HARD_FLOAT && TARGET_VFP" + "* + switch (which_alternative) + { + case 0: case 1: case 2: + return (output_move_double (operands)); + case 3: + return \"fmdrr%?\\t%P0, %Q1, %R1\\t%@ int\"; + case 4: + return \"fmrrd%?\\t%Q0, %R0, %P1\\t%@ int\"; + case 5: + if (TARGET_VFP_SINGLE) + return \"fcpys%?\\t%0, %1\\t%@ int\;fcpys%?\\t%p0, %p1\\t%@ int\"; + else + return \"fcpyd%?\\t%P0, %P1\\t%@ int\"; + case 6: case 7: + return output_move_vfp (operands); + default: + abort (); + } + " + [(set_attr "type" "*,load2,store2,r_2_f,f_2_r,ffarithd,f_loadd,f_stored") + (set (attr "length") (cond [(eq_attr "alternative" "0,1,2") (const_int 8) + (eq_attr "alternative" "5") + (if_then_else + (eq (symbol_ref "TARGET_VFP_SINGLE") + (const_int 1)) + (const_int 8) + (const_int 4))] + (const_int 4))) + (set_attr "pool_range" "*,4096,*,*,*,*,1020,*") + (set_attr "neg_pool_range" "*, 0,*,*,*,*,1008,*")] +) + +;; HFmode moves +(define_insn "*movhf_vfp_neon" + [(set (match_operand:HF 0 "nonimmediate_operand" "= t,Um,r,m,t,r,t,r,r") + (match_operand:HF 1 "general_operand" " Um, t,m,r,t,r,r,t,F"))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_NEON_FP16 + && ( s_register_operand (operands[0], HFmode) + || s_register_operand (operands[1], HFmode))" + "* + switch (which_alternative) + { + case 0: /* S register from memory */ + return \"vld1.16\\t{%z0}, %A1\"; + case 1: /* memory from S register */ + return \"vst1.16\\t{%z1}, %A0\"; + case 2: /* ARM register from memory */ + return \"ldrh\\t%0, %1\\t%@ __fp16\"; + case 3: /* memory from ARM register */ + return \"strh\\t%1, %0\\t%@ __fp16\"; + case 4: /* S register from S register */ + return \"fcpys\\t%0, %1\"; + case 5: /* ARM register from ARM register */ + return \"mov\\t%0, %1\\t%@ __fp16\"; + case 6: /* S register from ARM register */ + return \"fmsr\\t%0, %1\"; + case 7: /* ARM register from S register */ + return \"fmrs\\t%0, %1\"; + case 8: /* ARM register from constant */ + { + REAL_VALUE_TYPE r; + long bits; + rtx ops[4]; + + REAL_VALUE_FROM_CONST_DOUBLE (r, operands[1]); + bits = real_to_target (NULL, &r, HFmode); + ops[0] = operands[0]; + ops[1] = GEN_INT (bits); + ops[2] = GEN_INT (bits & 0xff00); + ops[3] = GEN_INT (bits & 0x00ff); + + if (arm_arch_thumb2) + output_asm_insn (\"movw\\t%0, %1\", ops); + else + output_asm_insn (\"mov\\t%0, %2\;orr\\t%0, %0, %3\", ops); + return \"\"; + } + default: + gcc_unreachable (); + } + " + [(set_attr "conds" "unconditional") + (set_attr "type" "*,*,load1,store1,fcpys,*,r_2_f,f_2_r,*") + (set_attr "neon_type" "neon_vld1_1_2_regs,neon_vst1_1_2_regs_vst2_2_regs,*,*,*,*,*,*,*") + (set_attr "length" "4,4,4,4,4,4,4,4,8")] +) + +;; FP16 without element load/store instructions. +(define_insn "*movhf_vfp" + [(set (match_operand:HF 0 "nonimmediate_operand" "=r,m,t,r,t,r,r") + (match_operand:HF 1 "general_operand" " m,r,t,r,r,t,F"))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FP16 && !TARGET_NEON_FP16 + && ( s_register_operand (operands[0], HFmode) + || s_register_operand (operands[1], HFmode))" + "* + switch (which_alternative) + { + case 0: /* ARM register from memory */ + return \"ldrh\\t%0, %1\\t%@ __fp16\"; + case 1: /* memory from ARM register */ + return \"strh\\t%1, %0\\t%@ __fp16\"; + case 2: /* S register from S register */ + return \"fcpys\\t%0, %1\"; + case 3: /* ARM register from ARM register */ + return \"mov\\t%0, %1\\t%@ __fp16\"; + case 4: /* S register from ARM register */ + return \"fmsr\\t%0, %1\"; + case 5: /* ARM register from S register */ + return \"fmrs\\t%0, %1\"; + case 6: /* ARM register from constant */ + { + REAL_VALUE_TYPE r; + long bits; + rtx ops[4]; + + REAL_VALUE_FROM_CONST_DOUBLE (r, operands[1]); + bits = real_to_target (NULL, &r, HFmode); + ops[0] = operands[0]; + ops[1] = GEN_INT (bits); + ops[2] = GEN_INT (bits & 0xff00); + ops[3] = GEN_INT (bits & 0x00ff); + + if (arm_arch_thumb2) + output_asm_insn (\"movw\\t%0, %1\", ops); + else + output_asm_insn (\"mov\\t%0, %2\;orr\\t%0, %0, %3\", ops); + return \"\"; + } + default: + gcc_unreachable (); + } + " + [(set_attr "conds" "unconditional") + (set_attr "type" "load1,store1,fcpys,*,r_2_f,f_2_r,*") + (set_attr "length" "4,4,4,4,4,4,8")] +) + + +;; SFmode moves +;; Disparage the w<->r cases because reloading an invalid address is +;; preferable to loading the value via integer registers. + +(define_insn "*movsf_vfp" + [(set (match_operand:SF 0 "nonimmediate_operand" "=t,?r,t ,t ,Uv,r ,m,t,r") + (match_operand:SF 1 "general_operand" " ?r,t,Dv,UvE,t, mE,r,t,r"))] + "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP + && ( s_register_operand (operands[0], SFmode) + || s_register_operand (operands[1], SFmode))" + "* + switch (which_alternative) + { + case 0: + return \"fmsr%?\\t%0, %1\"; + case 1: + return \"fmrs%?\\t%0, %1\"; + case 2: + return \"fconsts%?\\t%0, #%G1\"; + case 3: case 4: + return output_move_vfp (operands); + case 5: + return \"ldr%?\\t%0, %1\\t%@ float\"; + case 6: + return \"str%?\\t%1, %0\\t%@ float\"; + case 7: + return \"fcpys%?\\t%0, %1\"; + case 8: + return \"mov%?\\t%0, %1\\t%@ float\"; + default: + gcc_unreachable (); + } + " + [(set_attr "predicable" "yes") + (set_attr "type" + "r_2_f,f_2_r,fconsts,f_loads,f_stores,load1,store1,fcpys,*") + (set_attr "insn" "*,*,*,*,*,*,*,*,mov") + (set_attr "pool_range" "*,*,*,1020,*,4096,*,*,*") + (set_attr "neg_pool_range" "*,*,*,1008,*,4080,*,*,*")] +) + +(define_insn "*thumb2_movsf_vfp" + [(set (match_operand:SF 0 "nonimmediate_operand" "=t,?r,t, t ,Uv,r ,m,t,r") + (match_operand:SF 1 "general_operand" " ?r,t,Dv,UvE,t, mE,r,t,r"))] + "TARGET_THUMB2 && TARGET_HARD_FLOAT && TARGET_VFP + && ( s_register_operand (operands[0], SFmode) + || s_register_operand (operands[1], SFmode))" + "* + switch (which_alternative) + { + case 0: + return \"fmsr%?\\t%0, %1\"; + case 1: + return \"fmrs%?\\t%0, %1\"; + case 2: + return \"fconsts%?\\t%0, #%G1\"; + case 3: case 4: + return output_move_vfp (operands); + case 5: + return \"ldr%?\\t%0, %1\\t%@ float\"; + case 6: + return \"str%?\\t%1, %0\\t%@ float\"; + case 7: + return \"fcpys%?\\t%0, %1\"; + case 8: + return \"mov%?\\t%0, %1\\t%@ float\"; + default: + gcc_unreachable (); + } + " + [(set_attr "predicable" "yes") + (set_attr "type" + "r_2_f,f_2_r,fconsts,f_loads,f_stores,load1,store1,fcpys,*") + (set_attr "insn" "*,*,*,*,*,*,*,*,mov") + (set_attr "pool_range" "*,*,*,1020,*,4092,*,*,*") + (set_attr "neg_pool_range" "*,*,*,1008,*,0,*,*,*")] +) + + +;; DFmode moves + +(define_insn "*movdf_vfp" + [(set (match_operand:DF 0 "nonimmediate_soft_df_operand" "=w,?r,w ,r, m,w ,Uv,w,r") + (match_operand:DF 1 "soft_df_operand" " ?r,w,Dy,mF,r,UvF,w, w,r"))] + "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP + && ( register_operand (operands[0], DFmode) + || register_operand (operands[1], DFmode))" + "* + { + switch (which_alternative) + { + case 0: + return \"fmdrr%?\\t%P0, %Q1, %R1\"; + case 1: + return \"fmrrd%?\\t%Q0, %R0, %P1\"; + case 2: + gcc_assert (TARGET_VFP_DOUBLE); + return \"fconstd%?\\t%P0, #%G1\"; + case 3: case 4: + return output_move_double (operands); + case 5: case 6: + return output_move_vfp (operands); + case 7: + if (TARGET_VFP_SINGLE) + return \"fcpys%?\\t%0, %1\;fcpys%?\\t%p0, %p1\"; + else + return \"fcpyd%?\\t%P0, %P1\"; + case 8: + return \"#\"; + default: + gcc_unreachable (); + } + } + " + [(set_attr "type" + "r_2_f,f_2_r,fconstd,f_loadd,f_stored,load2,store2,ffarithd,*") + (set (attr "length") (cond [(eq_attr "alternative" "3,4,8") (const_int 8) + (eq_attr "alternative" "7") + (if_then_else + (eq (symbol_ref "TARGET_VFP_SINGLE") + (const_int 1)) + (const_int 8) + (const_int 4))] + (const_int 4))) + (set_attr "predicable" "yes") + (set_attr "pool_range" "*,*,*,1020,*,1020,*,*,*") + (set_attr "neg_pool_range" "*,*,*,1008,*,1008,*,*,*")] +) + +(define_insn "*thumb2_movdf_vfp" + [(set (match_operand:DF 0 "nonimmediate_soft_df_operand" "=w,?r,w ,r, m,w ,Uv,w,r") + (match_operand:DF 1 "soft_df_operand" " ?r,w,Dy,mF,r,UvF,w, w,r"))] + "TARGET_THUMB2 && TARGET_HARD_FLOAT && TARGET_VFP" + "* + { + switch (which_alternative) + { + case 0: + return \"fmdrr%?\\t%P0, %Q1, %R1\"; + case 1: + return \"fmrrd%?\\t%Q0, %R0, %P1\"; + case 2: + gcc_assert (TARGET_VFP_DOUBLE); + return \"fconstd%?\\t%P0, #%G1\"; + case 3: case 4: case 8: + return output_move_double (operands); + case 5: case 6: + return output_move_vfp (operands); + case 7: + if (TARGET_VFP_SINGLE) + return \"fcpys%?\\t%0, %1\;fcpys%?\\t%p0, %p1\"; + else + return \"fcpyd%?\\t%P0, %P1\"; + default: + abort (); + } + } + " + [(set_attr "type" + "r_2_f,f_2_r,fconstd,load2,store2,f_loadd,f_stored,ffarithd,*") + (set (attr "length") (cond [(eq_attr "alternative" "3,4,8") (const_int 8) + (eq_attr "alternative" "7") + (if_then_else + (eq (symbol_ref "TARGET_VFP_SINGLE") + (const_int 1)) + (const_int 8) + (const_int 4))] + (const_int 4))) + (set_attr "pool_range" "*,*,*,4096,*,1020,*,*,*") + (set_attr "neg_pool_range" "*,*,*,0,*,1008,*,*,*")] +) + + +;; Conditional move patterns + +(define_insn "*movsfcc_vfp" + [(set (match_operand:SF 0 "s_register_operand" "=t,t,t,t,t,t,?r,?r,?r") + (if_then_else:SF + (match_operator 3 "arm_comparison_operator" + [(match_operand 4 "cc_register" "") (const_int 0)]) + (match_operand:SF 1 "s_register_operand" "0,t,t,0,?r,?r,0,t,t") + (match_operand:SF 2 "s_register_operand" "t,0,t,?r,0,?r,t,0,t")))] + "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP" + "@ + fcpys%D3\\t%0, %2 + fcpys%d3\\t%0, %1 + fcpys%D3\\t%0, %2\;fcpys%d3\\t%0, %1 + fmsr%D3\\t%0, %2 + fmsr%d3\\t%0, %1 + fmsr%D3\\t%0, %2\;fmsr%d3\\t%0, %1 + fmrs%D3\\t%0, %2 + fmrs%d3\\t%0, %1 + fmrs%D3\\t%0, %2\;fmrs%d3\\t%0, %1" + [(set_attr "conds" "use") + (set_attr "length" "4,4,8,4,4,8,4,4,8") + (set_attr "type" "fcpys,fcpys,fcpys,r_2_f,r_2_f,r_2_f,f_2_r,f_2_r,f_2_r")] +) + +(define_insn "*thumb2_movsfcc_vfp" + [(set (match_operand:SF 0 "s_register_operand" "=t,t,t,t,t,t,?r,?r,?r") + (if_then_else:SF + (match_operator 3 "arm_comparison_operator" + [(match_operand 4 "cc_register" "") (const_int 0)]) + (match_operand:SF 1 "s_register_operand" "0,t,t,0,?r,?r,0,t,t") + (match_operand:SF 2 "s_register_operand" "t,0,t,?r,0,?r,t,0,t")))] + "TARGET_THUMB2 && TARGET_HARD_FLOAT && TARGET_VFP" + "@ + it\\t%D3\;fcpys%D3\\t%0, %2 + it\\t%d3\;fcpys%d3\\t%0, %1 + ite\\t%D3\;fcpys%D3\\t%0, %2\;fcpys%d3\\t%0, %1 + it\\t%D3\;fmsr%D3\\t%0, %2 + it\\t%d3\;fmsr%d3\\t%0, %1 + ite\\t%D3\;fmsr%D3\\t%0, %2\;fmsr%d3\\t%0, %1 + it\\t%D3\;fmrs%D3\\t%0, %2 + it\\t%d3\;fmrs%d3\\t%0, %1 + ite\\t%D3\;fmrs%D3\\t%0, %2\;fmrs%d3\\t%0, %1" + [(set_attr "conds" "use") + (set_attr "length" "6,6,10,6,6,10,6,6,10") + (set_attr "type" "fcpys,fcpys,fcpys,r_2_f,r_2_f,r_2_f,f_2_r,f_2_r,f_2_r")] +) + +(define_insn "*movdfcc_vfp" + [(set (match_operand:DF 0 "s_register_operand" "=w,w,w,w,w,w,?r,?r,?r") + (if_then_else:DF + (match_operator 3 "arm_comparison_operator" + [(match_operand 4 "cc_register" "") (const_int 0)]) + (match_operand:DF 1 "s_register_operand" "0,w,w,0,?r,?r,0,w,w") + (match_operand:DF 2 "s_register_operand" "w,0,w,?r,0,?r,w,0,w")))] + "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP_DOUBLE" + "@ + fcpyd%D3\\t%P0, %P2 + fcpyd%d3\\t%P0, %P1 + fcpyd%D3\\t%P0, %P2\;fcpyd%d3\\t%P0, %P1 + fmdrr%D3\\t%P0, %Q2, %R2 + fmdrr%d3\\t%P0, %Q1, %R1 + fmdrr%D3\\t%P0, %Q2, %R2\;fmdrr%d3\\t%P0, %Q1, %R1 + fmrrd%D3\\t%Q0, %R0, %P2 + fmrrd%d3\\t%Q0, %R0, %P1 + fmrrd%D3\\t%Q0, %R0, %P2\;fmrrd%d3\\t%Q0, %R0, %P1" + [(set_attr "conds" "use") + (set_attr "length" "4,4,8,4,4,8,4,4,8") + (set_attr "type" "ffarithd,ffarithd,ffarithd,r_2_f,r_2_f,r_2_f,f_2_r,f_2_r,f_2_r")] +) + +(define_insn "*thumb2_movdfcc_vfp" + [(set (match_operand:DF 0 "s_register_operand" "=w,w,w,w,w,w,?r,?r,?r") + (if_then_else:DF + (match_operator 3 "arm_comparison_operator" + [(match_operand 4 "cc_register" "") (const_int 0)]) + (match_operand:DF 1 "s_register_operand" "0,w,w,0,?r,?r,0,w,w") + (match_operand:DF 2 "s_register_operand" "w,0,w,?r,0,?r,w,0,w")))] + "TARGET_THUMB2 && TARGET_HARD_FLOAT && TARGET_VFP_DOUBLE" + "@ + it\\t%D3\;fcpyd%D3\\t%P0, %P2 + it\\t%d3\;fcpyd%d3\\t%P0, %P1 + ite\\t%D3\;fcpyd%D3\\t%P0, %P2\;fcpyd%d3\\t%P0, %P1 + it\t%D3\;fmdrr%D3\\t%P0, %Q2, %R2 + it\t%d3\;fmdrr%d3\\t%P0, %Q1, %R1 + ite\\t%D3\;fmdrr%D3\\t%P0, %Q2, %R2\;fmdrr%d3\\t%P0, %Q1, %R1 + it\t%D3\;fmrrd%D3\\t%Q0, %R0, %P2 + it\t%d3\;fmrrd%d3\\t%Q0, %R0, %P1 + ite\\t%D3\;fmrrd%D3\\t%Q0, %R0, %P2\;fmrrd%d3\\t%Q0, %R0, %P1" + [(set_attr "conds" "use") + (set_attr "length" "6,6,10,6,6,10,6,6,10") + (set_attr "type" "ffarithd,ffarithd,ffarithd,r_2_f,r_2_f,r_2_f,f_2_r,f_2_r,f_2_r")] +) + + +;; Sign manipulation functions + +(define_insn "*abssf2_vfp" + [(set (match_operand:SF 0 "s_register_operand" "=t") + (abs:SF (match_operand:SF 1 "s_register_operand" "t")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP" + "fabss%?\\t%0, %1" + [(set_attr "predicable" "yes") + (set_attr "type" "ffariths")] +) + +(define_insn "*absdf2_vfp" + [(set (match_operand:DF 0 "s_register_operand" "=w") + (abs:DF (match_operand:DF 1 "s_register_operand" "w")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP_DOUBLE" + "fabsd%?\\t%P0, %P1" + [(set_attr "predicable" "yes") + (set_attr "type" "ffarithd")] +) + +(define_insn "*negsf2_vfp" + [(set (match_operand:SF 0 "s_register_operand" "=t,?r") + (neg:SF (match_operand:SF 1 "s_register_operand" "t,r")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP" + "@ + fnegs%?\\t%0, %1 + eor%?\\t%0, %1, #-2147483648" + [(set_attr "predicable" "yes") + (set_attr "type" "ffariths")] +) + +(define_insn_and_split "*negdf2_vfp" + [(set (match_operand:DF 0 "s_register_operand" "=w,?r,?r") + (neg:DF (match_operand:DF 1 "s_register_operand" "w,0,r")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP_DOUBLE" + "@ + fnegd%?\\t%P0, %P1 + # + #" + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP_DOUBLE && reload_completed + && arm_general_register_operand (operands[0], DFmode)" + [(set (match_dup 0) (match_dup 1))] + " + if (REGNO (operands[0]) == REGNO (operands[1])) + { + operands[0] = gen_highpart (SImode, operands[0]); + operands[1] = gen_rtx_XOR (SImode, operands[0], GEN_INT (0x80000000)); + } + else + { + rtx in_hi, in_lo, out_hi, out_lo; + + in_hi = gen_rtx_XOR (SImode, gen_highpart (SImode, operands[1]), + GEN_INT (0x80000000)); + in_lo = gen_lowpart (SImode, operands[1]); + out_hi = gen_highpart (SImode, operands[0]); + out_lo = gen_lowpart (SImode, operands[0]); + + if (REGNO (in_lo) == REGNO (out_hi)) + { + emit_insn (gen_rtx_SET (SImode, out_lo, in_lo)); + operands[0] = out_hi; + operands[1] = in_hi; + } + else + { + emit_insn (gen_rtx_SET (SImode, out_hi, in_hi)); + operands[0] = out_lo; + operands[1] = in_lo; + } + } + " + [(set_attr "predicable" "yes") + (set_attr "length" "4,4,8") + (set_attr "type" "ffarithd")] +) + + +;; Arithmetic insns + +(define_insn "*addsf3_vfp" + [(set (match_operand:SF 0 "s_register_operand" "=t") + (plus:SF (match_operand:SF 1 "s_register_operand" "t") + (match_operand:SF 2 "s_register_operand" "t")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP" + "fadds%?\\t%0, %1, %2" + [(set_attr "predicable" "yes") + (set_attr "type" "fadds")] +) + +(define_insn "*adddf3_vfp" + [(set (match_operand:DF 0 "s_register_operand" "=w") + (plus:DF (match_operand:DF 1 "s_register_operand" "w") + (match_operand:DF 2 "s_register_operand" "w")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP_DOUBLE" + "faddd%?\\t%P0, %P1, %P2" + [(set_attr "predicable" "yes") + (set_attr "type" "faddd")] +) + + +(define_insn "*subsf3_vfp" + [(set (match_operand:SF 0 "s_register_operand" "=t") + (minus:SF (match_operand:SF 1 "s_register_operand" "t") + (match_operand:SF 2 "s_register_operand" "t")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP" + "fsubs%?\\t%0, %1, %2" + [(set_attr "predicable" "yes") + (set_attr "type" "fadds")] +) + +(define_insn "*subdf3_vfp" + [(set (match_operand:DF 0 "s_register_operand" "=w") + (minus:DF (match_operand:DF 1 "s_register_operand" "w") + (match_operand:DF 2 "s_register_operand" "w")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP_DOUBLE" + "fsubd%?\\t%P0, %P1, %P2" + [(set_attr "predicable" "yes") + (set_attr "type" "faddd")] +) + + +;; Division insns + +(define_insn "*divsf3_vfp" + [(set (match_operand:SF 0 "s_register_operand" "+t") + (div:SF (match_operand:SF 1 "s_register_operand" "t") + (match_operand:SF 2 "s_register_operand" "t")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP" + "fdivs%?\\t%0, %1, %2" + [(set_attr "predicable" "yes") + (set_attr "type" "fdivs")] +) + +(define_insn "*divdf3_vfp" + [(set (match_operand:DF 0 "s_register_operand" "+w") + (div:DF (match_operand:DF 1 "s_register_operand" "w") + (match_operand:DF 2 "s_register_operand" "w")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP_DOUBLE" + "fdivd%?\\t%P0, %P1, %P2" + [(set_attr "predicable" "yes") + (set_attr "type" "fdivd")] +) + + +;; Multiplication insns + +(define_insn "*mulsf3_vfp" + [(set (match_operand:SF 0 "s_register_operand" "+t") + (mult:SF (match_operand:SF 1 "s_register_operand" "t") + (match_operand:SF 2 "s_register_operand" "t")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP" + "fmuls%?\\t%0, %1, %2" + [(set_attr "predicable" "yes") + (set_attr "type" "fmuls")] +) + +(define_insn "*muldf3_vfp" + [(set (match_operand:DF 0 "s_register_operand" "+w") + (mult:DF (match_operand:DF 1 "s_register_operand" "w") + (match_operand:DF 2 "s_register_operand" "w")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP_DOUBLE" + "fmuld%?\\t%P0, %P1, %P2" + [(set_attr "predicable" "yes") + (set_attr "type" "fmuld")] +) + + +(define_insn "*mulsf3negsf_vfp" + [(set (match_operand:SF 0 "s_register_operand" "+t") + (mult:SF (neg:SF (match_operand:SF 1 "s_register_operand" "t")) + (match_operand:SF 2 "s_register_operand" "t")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP" + "fnmuls%?\\t%0, %1, %2" + [(set_attr "predicable" "yes") + (set_attr "type" "fmuls")] +) + +(define_insn "*muldf3negdf_vfp" + [(set (match_operand:DF 0 "s_register_operand" "+w") + (mult:DF (neg:DF (match_operand:DF 1 "s_register_operand" "w")) + (match_operand:DF 2 "s_register_operand" "w")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP_DOUBLE" + "fnmuld%?\\t%P0, %P1, %P2" + [(set_attr "predicable" "yes") + (set_attr "type" "fmuld")] +) + + +;; Multiply-accumulate insns + +;; 0 = 1 * 2 + 0 +(define_insn "*mulsf3addsf_vfp" + [(set (match_operand:SF 0 "s_register_operand" "=t") + (plus:SF (mult:SF (match_operand:SF 2 "s_register_operand" "t") + (match_operand:SF 3 "s_register_operand" "t")) + (match_operand:SF 1 "s_register_operand" "0")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP" + "fmacs%?\\t%0, %2, %3" + [(set_attr "predicable" "yes") + (set_attr "type" "fmacs")] +) + +(define_insn "*muldf3adddf_vfp" + [(set (match_operand:DF 0 "s_register_operand" "=w") + (plus:DF (mult:DF (match_operand:DF 2 "s_register_operand" "w") + (match_operand:DF 3 "s_register_operand" "w")) + (match_operand:DF 1 "s_register_operand" "0")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP_DOUBLE" + "fmacd%?\\t%P0, %P2, %P3" + [(set_attr "predicable" "yes") + (set_attr "type" "fmacd")] +) + +;; 0 = 1 * 2 - 0 +(define_insn "*mulsf3subsf_vfp" + [(set (match_operand:SF 0 "s_register_operand" "=t") + (minus:SF (mult:SF (match_operand:SF 2 "s_register_operand" "t") + (match_operand:SF 3 "s_register_operand" "t")) + (match_operand:SF 1 "s_register_operand" "0")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP" + "fmscs%?\\t%0, %2, %3" + [(set_attr "predicable" "yes") + (set_attr "type" "fmacs")] +) + +(define_insn "*muldf3subdf_vfp" + [(set (match_operand:DF 0 "s_register_operand" "=w") + (minus:DF (mult:DF (match_operand:DF 2 "s_register_operand" "w") + (match_operand:DF 3 "s_register_operand" "w")) + (match_operand:DF 1 "s_register_operand" "0")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP_DOUBLE" + "fmscd%?\\t%P0, %P2, %P3" + [(set_attr "predicable" "yes") + (set_attr "type" "fmacd")] +) + +;; 0 = -(1 * 2) + 0 +(define_insn "*mulsf3negsfaddsf_vfp" + [(set (match_operand:SF 0 "s_register_operand" "=t") + (minus:SF (match_operand:SF 1 "s_register_operand" "0") + (mult:SF (match_operand:SF 2 "s_register_operand" "t") + (match_operand:SF 3 "s_register_operand" "t"))))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP" + "fnmacs%?\\t%0, %2, %3" + [(set_attr "predicable" "yes") + (set_attr "type" "fmacs")] +) + +(define_insn "*fmuldf3negdfadddf_vfp" + [(set (match_operand:DF 0 "s_register_operand" "=w") + (minus:DF (match_operand:DF 1 "s_register_operand" "0") + (mult:DF (match_operand:DF 2 "s_register_operand" "w") + (match_operand:DF 3 "s_register_operand" "w"))))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP_DOUBLE" + "fnmacd%?\\t%P0, %P2, %P3" + [(set_attr "predicable" "yes") + (set_attr "type" "fmacd")] +) + + +;; 0 = -(1 * 2) - 0 +(define_insn "*mulsf3negsfsubsf_vfp" + [(set (match_operand:SF 0 "s_register_operand" "=t") + (minus:SF (mult:SF + (neg:SF (match_operand:SF 2 "s_register_operand" "t")) + (match_operand:SF 3 "s_register_operand" "t")) + (match_operand:SF 1 "s_register_operand" "0")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP" + "fnmscs%?\\t%0, %2, %3" + [(set_attr "predicable" "yes") + (set_attr "type" "fmacs")] +) + +(define_insn "*muldf3negdfsubdf_vfp" + [(set (match_operand:DF 0 "s_register_operand" "=w") + (minus:DF (mult:DF + (neg:DF (match_operand:DF 2 "s_register_operand" "w")) + (match_operand:DF 3 "s_register_operand" "w")) + (match_operand:DF 1 "s_register_operand" "0")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP_DOUBLE" + "fnmscd%?\\t%P0, %P2, %P3" + [(set_attr "predicable" "yes") + (set_attr "type" "fmacd")] +) + + +;; Conversion routines + +(define_insn "*extendsfdf2_vfp" + [(set (match_operand:DF 0 "s_register_operand" "=w") + (float_extend:DF (match_operand:SF 1 "s_register_operand" "t")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP_DOUBLE" + "fcvtds%?\\t%P0, %1" + [(set_attr "predicable" "yes") + (set_attr "type" "f_cvt")] +) + +(define_insn "*truncdfsf2_vfp" + [(set (match_operand:SF 0 "s_register_operand" "=t") + (float_truncate:SF (match_operand:DF 1 "s_register_operand" "w")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP_DOUBLE" + "fcvtsd%?\\t%0, %P1" + [(set_attr "predicable" "yes") + (set_attr "type" "f_cvt")] +) + +(define_insn "extendhfsf2" + [(set (match_operand:SF 0 "s_register_operand" "=t") + (float_extend:SF (match_operand:HF 1 "s_register_operand" "t")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FP16" + "vcvtb%?.f32.f16\\t%0, %1" + [(set_attr "predicable" "yes") + (set_attr "type" "f_cvt")] +) + +(define_insn "truncsfhf2" + [(set (match_operand:HF 0 "s_register_operand" "=t") + (float_truncate:HF (match_operand:SF 1 "s_register_operand" "t")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FP16" + "vcvtb%?.f16.f32\\t%0, %1" + [(set_attr "predicable" "yes") + (set_attr "type" "f_cvt")] +) + +(define_insn "*truncsisf2_vfp" + [(set (match_operand:SI 0 "s_register_operand" "=t") + (fix:SI (fix:SF (match_operand:SF 1 "s_register_operand" "t"))))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP" + "ftosizs%?\\t%0, %1" + [(set_attr "predicable" "yes") + (set_attr "type" "f_cvt")] +) + +(define_insn "*truncsidf2_vfp" + [(set (match_operand:SI 0 "s_register_operand" "=t") + (fix:SI (fix:DF (match_operand:DF 1 "s_register_operand" "w"))))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP_DOUBLE" + "ftosizd%?\\t%0, %P1" + [(set_attr "predicable" "yes") + (set_attr "type" "f_cvt")] +) + + +(define_insn "fixuns_truncsfsi2" + [(set (match_operand:SI 0 "s_register_operand" "=t") + (unsigned_fix:SI (fix:SF (match_operand:SF 1 "s_register_operand" "t"))))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP" + "ftouizs%?\\t%0, %1" + [(set_attr "predicable" "yes") + (set_attr "type" "f_cvt")] +) + +(define_insn "fixuns_truncdfsi2" + [(set (match_operand:SI 0 "s_register_operand" "=t") + (unsigned_fix:SI (fix:DF (match_operand:DF 1 "s_register_operand" "t"))))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP_DOUBLE" + "ftouizd%?\\t%0, %P1" + [(set_attr "predicable" "yes") + (set_attr "type" "f_cvt")] +) + + +(define_insn "*floatsisf2_vfp" + [(set (match_operand:SF 0 "s_register_operand" "=t") + (float:SF (match_operand:SI 1 "s_register_operand" "t")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP" + "fsitos%?\\t%0, %1" + [(set_attr "predicable" "yes") + (set_attr "type" "f_cvt")] +) + +(define_insn "*floatsidf2_vfp" + [(set (match_operand:DF 0 "s_register_operand" "=w") + (float:DF (match_operand:SI 1 "s_register_operand" "t")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP_DOUBLE" + "fsitod%?\\t%P0, %1" + [(set_attr "predicable" "yes") + (set_attr "type" "f_cvt")] +) + + +(define_insn "floatunssisf2" + [(set (match_operand:SF 0 "s_register_operand" "=t") + (unsigned_float:SF (match_operand:SI 1 "s_register_operand" "t")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP" + "fuitos%?\\t%0, %1" + [(set_attr "predicable" "yes") + (set_attr "type" "f_cvt")] +) + +(define_insn "floatunssidf2" + [(set (match_operand:DF 0 "s_register_operand" "=w") + (unsigned_float:DF (match_operand:SI 1 "s_register_operand" "t")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP_DOUBLE" + "fuitod%?\\t%P0, %1" + [(set_attr "predicable" "yes") + (set_attr "type" "f_cvt")] +) + + +;; Sqrt insns. + +(define_insn "*sqrtsf2_vfp" + [(set (match_operand:SF 0 "s_register_operand" "=t") + (sqrt:SF (match_operand:SF 1 "s_register_operand" "t")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP" + "fsqrts%?\\t%0, %1" + [(set_attr "predicable" "yes") + (set_attr "type" "fdivs")] +) + +(define_insn "*sqrtdf2_vfp" + [(set (match_operand:DF 0 "s_register_operand" "=w") + (sqrt:DF (match_operand:DF 1 "s_register_operand" "w")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP_DOUBLE" + "fsqrtd%?\\t%P0, %P1" + [(set_attr "predicable" "yes") + (set_attr "type" "fdivd")] +) + + +;; Patterns to split/copy vfp condition flags. + +(define_insn "*movcc_vfp" + [(set (reg CC_REGNUM) + (reg VFPCC_REGNUM))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP" + "fmstat%?" + [(set_attr "conds" "set") + (set_attr "type" "f_flag")] +) + +(define_insn_and_split "*cmpsf_split_vfp" + [(set (reg:CCFP CC_REGNUM) + (compare:CCFP (match_operand:SF 0 "s_register_operand" "t") + (match_operand:SF 1 "vfp_compare_operand" "tG")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP" + "#" + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP" + [(set (reg:CCFP VFPCC_REGNUM) + (compare:CCFP (match_dup 0) + (match_dup 1))) + (set (reg:CCFP CC_REGNUM) + (reg:CCFP VFPCC_REGNUM))] + "" +) + +(define_insn_and_split "*cmpsf_trap_split_vfp" + [(set (reg:CCFPE CC_REGNUM) + (compare:CCFPE (match_operand:SF 0 "s_register_operand" "t") + (match_operand:SF 1 "vfp_compare_operand" "tG")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP" + "#" + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP" + [(set (reg:CCFPE VFPCC_REGNUM) + (compare:CCFPE (match_dup 0) + (match_dup 1))) + (set (reg:CCFPE CC_REGNUM) + (reg:CCFPE VFPCC_REGNUM))] + "" +) + +(define_insn_and_split "*cmpdf_split_vfp" + [(set (reg:CCFP CC_REGNUM) + (compare:CCFP (match_operand:DF 0 "s_register_operand" "w") + (match_operand:DF 1 "vfp_compare_operand" "wG")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP_DOUBLE" + "#" + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP_DOUBLE" + [(set (reg:CCFP VFPCC_REGNUM) + (compare:CCFP (match_dup 0) + (match_dup 1))) + (set (reg:CCFP CC_REGNUM) + (reg:CCFP VFPCC_REGNUM))] + "" +) + +(define_insn_and_split "*cmpdf_trap_split_vfp" + [(set (reg:CCFPE CC_REGNUM) + (compare:CCFPE (match_operand:DF 0 "s_register_operand" "w") + (match_operand:DF 1 "vfp_compare_operand" "wG")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP_DOUBLE" + "#" + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP_DOUBLE" + [(set (reg:CCFPE VFPCC_REGNUM) + (compare:CCFPE (match_dup 0) + (match_dup 1))) + (set (reg:CCFPE CC_REGNUM) + (reg:CCFPE VFPCC_REGNUM))] + "" +) + + +;; Comparison patterns + +(define_insn "*cmpsf_vfp" + [(set (reg:CCFP VFPCC_REGNUM) + (compare:CCFP (match_operand:SF 0 "s_register_operand" "t,t") + (match_operand:SF 1 "vfp_compare_operand" "t,G")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP" + "@ + fcmps%?\\t%0, %1 + fcmpzs%?\\t%0" + [(set_attr "predicable" "yes") + (set_attr "type" "fcmps")] +) + +(define_insn "*cmpsf_trap_vfp" + [(set (reg:CCFPE VFPCC_REGNUM) + (compare:CCFPE (match_operand:SF 0 "s_register_operand" "t,t") + (match_operand:SF 1 "vfp_compare_operand" "t,G")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP" + "@ + fcmpes%?\\t%0, %1 + fcmpezs%?\\t%0" + [(set_attr "predicable" "yes") + (set_attr "type" "fcmps")] +) + +(define_insn "*cmpdf_vfp" + [(set (reg:CCFP VFPCC_REGNUM) + (compare:CCFP (match_operand:DF 0 "s_register_operand" "w,w") + (match_operand:DF 1 "vfp_compare_operand" "w,G")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP_DOUBLE" + "@ + fcmpd%?\\t%P0, %P1 + fcmpzd%?\\t%P0" + [(set_attr "predicable" "yes") + (set_attr "type" "fcmpd")] +) + +(define_insn "*cmpdf_trap_vfp" + [(set (reg:CCFPE VFPCC_REGNUM) + (compare:CCFPE (match_operand:DF 0 "s_register_operand" "w,w") + (match_operand:DF 1 "vfp_compare_operand" "w,G")))] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP_DOUBLE" + "@ + fcmped%?\\t%P0, %P1 + fcmpezd%?\\t%P0" + [(set_attr "predicable" "yes") + (set_attr "type" "fcmpd")] +) + + +;; Store multiple insn used in function prologue. + +(define_insn "*push_multi_vfp" + [(match_parallel 2 "multi_register_push" + [(set (match_operand:BLK 0 "memory_operand" "=m") + (unspec:BLK [(match_operand:DF 1 "vfp_register_operand" "")] + UNSPEC_PUSH_MULT))])] + "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP" + "* return vfp_output_fstmd (operands);" + [(set_attr "type" "f_stored")] +) + + +;; Unimplemented insns: +;; fldm* +;; fstm* +;; fmdhr et al (VFPv1) +;; Support for xD (single precision only) variants. +;; fmrrs, fmsrr diff --git a/gcc/config/arm/vfp11.md b/gcc/config/arm/vfp11.md new file mode 100644 index 000000000..8f863fd70 --- /dev/null +++ b/gcc/config/arm/vfp11.md @@ -0,0 +1,92 @@ +;; ARM VFP11 pipeline description +;; Copyright (C) 2003, 2005, 2007, 2008 Free Software Foundation, Inc. +;; Written by CodeSourcery. +;; +;; This file is part of GCC. + +;; GCC is free software; you can redistribute it and/or modify it +;; under the terms of the GNU General Public License as published +;; by the Free Software Foundation; either version 3, or (at your +;; option) any later version. + +;; GCC is distributed in the hope that it will be useful, but WITHOUT +;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +;; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public +;; License for more details. + +;; You should have received a copy of the GNU General Public License +;; along with GCC; see the file COPYING3. If not see +;; . + +(define_automaton "vfp11") + +;; There are 3 pipelines in the VFP11 unit. +;; +;; - A 8-stage FMAC pipeline (7 execute + writeback) with forward from +;; fourth stage for simple operations. +;; +;; - A 5-stage DS pipeline (4 execute + writeback) for divide/sqrt insns. +;; These insns also uses first execute stage of FMAC pipeline. +;; +;; - A 4-stage LS pipeline (execute + 2 memory + writeback) with forward from +;; second memory stage for loads. + +;; We do not model Write-After-Read hazards. +;; We do not do write scheduling with the arm core, so it is only necessary +;; to model the first stage of each pipeline +;; ??? Need to model LS pipeline properly for load/store multiple? +;; We do not model fmstat properly. This could be done by modeling pipelines +;; properly and defining an absence set between a dummy fmstat unit and all +;; other vfp units. + +(define_cpu_unit "fmac" "vfp11") + +(define_cpu_unit "ds" "vfp11") + +(define_cpu_unit "vfp_ls" "vfp11") + +(define_cpu_unit "fmstat" "vfp11") + +(exclusion_set "fmac,ds" "fmstat") + +(define_insn_reservation "vfp_ffarith" 4 + (and (eq_attr "generic_vfp" "yes") + (eq_attr "type" "fcpys,ffariths,ffarithd,fcmps,fcmpd")) + "fmac") + +(define_insn_reservation "vfp_farith" 8 + (and (eq_attr "generic_vfp" "yes") + (eq_attr "type" "fadds,faddd,fconsts,fconstd,f_cvt,fmuls,fmacs")) + "fmac") + +(define_insn_reservation "vfp_fmul" 9 + (and (eq_attr "generic_vfp" "yes") + (eq_attr "type" "fmuld,fmacd")) + "fmac*2") + +(define_insn_reservation "vfp_fdivs" 19 + (and (eq_attr "generic_vfp" "yes") + (eq_attr "type" "fdivs")) + "ds*15") + +(define_insn_reservation "vfp_fdivd" 33 + (and (eq_attr "generic_vfp" "yes") + (eq_attr "type" "fdivd")) + "fmac+ds*29") + +;; Moves to/from arm regs also use the load/store pipeline. +(define_insn_reservation "vfp_fload" 4 + (and (eq_attr "generic_vfp" "yes") + (eq_attr "type" "f_loads,f_loadd,r_2_f")) + "vfp_ls") + +(define_insn_reservation "vfp_fstore" 4 + (and (eq_attr "generic_vfp" "yes") + (eq_attr "type" "f_stores,f_stored,f_2_r")) + "vfp_ls") + +(define_insn_reservation "vfp_to_cpsr" 4 + (and (eq_attr "generic_vfp" "yes") + (eq_attr "type" "f_flag")) + "fmstat,vfp_ls*3") + diff --git a/gcc/config/arm/vxworks.h b/gcc/config/arm/vxworks.h new file mode 100644 index 000000000..3ceaed903 --- /dev/null +++ b/gcc/config/arm/vxworks.h @@ -0,0 +1,113 @@ +/* Definitions of target machine for GCC, + for ARM with targetting the VXWorks run time environment. + Copyright (C) 1999, 2000, 2003, 2004, 2007, 2008, 2009, 2010 + Free Software Foundation, Inc. + + Contributed by: Mike Stump + Brought up to date by CodeSourcery, LLC. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. */ + + +#define TARGET_OS_CPP_BUILTINS() \ + do { \ + if (TARGET_BIG_END) \ + builtin_define ("ARMEB"); \ + else \ + builtin_define ("ARMEL"); \ + \ + if (arm_arch_xscale) \ + builtin_define ("CPU=XSCALE"); \ + else if (arm_arch5) \ + builtin_define ("CPU=ARMARCH5"); \ + else if (arm_arch4) \ + { \ + if (thumb_code) \ + builtin_define ("CPU=ARMARCH4_T"); \ + else \ + builtin_define ("CPU=ARMARCH4"); \ + } \ + VXWORKS_OS_CPP_BUILTINS (); \ + } while (0) + +#undef SUBTARGET_OVERRIDE_OPTIONS +#define SUBTARGET_OVERRIDE_OPTIONS VXWORKS_OVERRIDE_OPTIONS + +/* Subsume the arm/elf.h definition, and add RTP hooks. */ +#undef SUBTARGET_CPP_SPEC +#define SUBTARGET_CPP_SPEC "-D__ELF__" VXWORKS_ADDITIONAL_CPP_SPEC + +#undef CC1_SPEC +#define CC1_SPEC \ +"%{tstrongarm:-mlittle-endian -mcpu=strongarm ; \ + t4: -mlittle-endian -march=armv4 ; \ + t4be: -mbig-endian -march=armv4 ; \ + t4t: -mthumb -mthumb-interwork -mlittle-endian -march=armv4t ; \ + t4tbe: -mthumb -mthumb-interwork -mbig-endian -march=armv4t ; \ + t5: -mlittle-endian -march=armv5 ; \ + t5be: -mbig-endian -march=armv5 ; \ + t5t: -mthumb -mthumb-interwork -mlittle-endian -march=armv5 ; \ + t5tbe: -mthumb -mthumb-interwork -mbig-endian -march=armv5 ; \ + txscale: -mlittle-endian -mcpu=xscale ; \ + txscalebe: -mbig-endian -mcpu=xscale ; \ + : -march=armv4}" + +/* Pass -EB for big-endian targets. */ +#define VXWORKS_ENDIAN_SPEC \ + "%{mbig-endian|t4be|t4tbe|t5be|t5tbe|txscalebe:-EB}" + +#undef SUBTARGET_EXTRA_ASM_SPEC +#define SUBTARGET_EXTRA_ASM_SPEC VXWORKS_ENDIAN_SPEC + +#undef LINK_SPEC +#define LINK_SPEC VXWORKS_LINK_SPEC " " VXWORKS_ENDIAN_SPEC + +#undef LIB_SPEC +#define LIB_SPEC VXWORKS_LIB_SPEC + +#undef STARTFILE_SPEC +#define STARTFILE_SPEC VXWORKS_STARTFILE_SPEC + +#undef ENDFILE_SPEC +#define ENDFILE_SPEC VXWORKS_ENDFILE_SPEC + +#undef TARGET_VERSION +#define TARGET_VERSION fputs (" (ARM/VxWorks)", stderr); + +/* There is no default multilib. */ +#undef MULTILIB_DEFAULTS + +#define FPUTYPE_DEFAULT "vfp" + +#undef FUNCTION_PROFILER +#define FUNCTION_PROFILER VXWORKS_FUNCTION_PROFILER + +/* We want to be compatible with a version of "2.96" at one point in + the past before this macro was changed. */ +#undef DEFAULT_STRUCTURE_SIZE_BOUNDARY +#define DEFAULT_STRUCTURE_SIZE_BOUNDARY 8 + +/* The kernel loader does not allow relocations to overflow, so we + cannot allow arbitrary relocation addends in kernel modules or RTP + executables. Also, the dynamic loader uses the resolved relocation + value to distinguish references to the text and data segments, so we + cannot allow arbitrary offsets for shared libraries either. */ +#undef ARM_OFFSETS_MUST_BE_WITHIN_SECTIONS_P +#define ARM_OFFSETS_MUST_BE_WITHIN_SECTIONS_P 1 + +#undef TARGET_DEFAULT_WORD_RELOCATIONS +#define TARGET_DEFAULT_WORD_RELOCATIONS 1 diff --git a/gcc/config/arm/vxworks.opt b/gcc/config/arm/vxworks.opt new file mode 100644 index 000000000..bc8478391 --- /dev/null +++ b/gcc/config/arm/vxworks.opt @@ -0,0 +1,60 @@ +; ARM VxWorks options. + +; Copyright (C) 2011 +; Free Software Foundation, Inc. +; +; This file is part of GCC. +; +; GCC is free software; you can redistribute it and/or modify it under +; the terms of the GNU General Public License as published by the Free +; Software Foundation; either version 3, or (at your option) any later +; version. +; +; GCC is distributed in the hope that it will be useful, but WITHOUT ANY +; WARRANTY; without even the implied warranty of MERCHANTABILITY or +; FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +; for more details. +; +; You should have received a copy of the GNU General Public License +; along with GCC; see the file COPYING3. If not see +; . + +; See the GCC internals manual (options.texi) for a description of +; this file's format. + +; Please try to keep this file in ASCII collating order. + +t4 +Driver + +t4be +Driver + +t4t +Driver + +t4tbe +Driver + +t5 +Driver + +t5be +Driver + +t5t +Driver + +t5tbe +Driver + +tstrongarm +Driver + +txscale +Driver + +txscalebe +Driver + +; This comment is to ensure we retain the blank line above. diff --git a/gcc/config/arm/wince-pe.h b/gcc/config/arm/wince-pe.h new file mode 100644 index 000000000..ffaa0c6c3 --- /dev/null +++ b/gcc/config/arm/wince-pe.h @@ -0,0 +1,26 @@ +/* Definitions of target machine for GNU compiler, for ARM with WINCE-PE obj format. + Copyright (C) 2003, 2004, 2005, 2007 Free Software Foundation, Inc. + Contributed by Nick Clifton + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3, or (at your + option) any later version. + + GCC is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public + License for more details. + + You should have received a copy of the GNU General Public License + along with GCC; see the file COPYING3. If not see + . */ + +#undef TARGET_DEFAULT +#define TARGET_DEFAULT (MASK_NOP_FUN_DLLIMPORT) + +#undef MULTILIB_DEFAULTS +#define MULTILIB_DEFAULTS \ + { "marm", "mlittle-endian", "msoft-float", "mno-thumb-interwork" } -- cgit v1.2.3