Contributed under the terms of the MIT/X11 license by
authorJerri Maine <crashfourit@gmail.com>
Tue, 14 Jul 2009 23:16:37 +0000 (23:16 -0000)
committerJerri Maine <crashfourit@gmail.com>
Tue, 14 Jul 2009 23:16:37 +0000 (23:16 -0000)
Jerry Maine <crashfourit@gail.com>.

* basic-simd.cs: Added tests for single and doulble indexers.

* cpu-amd64.md: Added simd opcode information.

* mini-amd64.c: Added IR to native simd generation code.
Added simd register names and function that returns them.

* mini-amd64.h: Added marcos to turn on simd code compilation in
amd64. Added max simd register count marco. Added caller/callee
register mask marcos. Added marcos to use simd register bank.

* mini.h: Added helper marco for shufling dwords and simple
floats.

svn path=/trunk/mono/; revision=137905

mono/mini/ChangeLog
mono/mini/basic-simd.cs
mono/mini/cpu-amd64.md
mono/mini/mini-amd64.c
mono/mini/mini-amd64.h
mono/mini/mini.h

index e9432423204fa9637a220d8d91517ac87424016d..837654ead67225c5b3bd228c11fe5d419af0bb62 100644 (file)
@@ -1,3 +1,22 @@
+2009-07-14 Jerry Maine <crashfourit@gmail.com>
+
+       Contributed under the terms of the MIT/X11 license by
+       Jerry Maine <crashfourit@gail.com>.
+
+       * basic-simd.cs: Added tests for single and doulble indexers.
+
+       * cpu-amd64.md: Added simd opcode information.
+
+       * mini-amd64.c: Added IR to native simd generation code.
+       Added simd register names and function that returns them.
+
+       * mini-amd64.h: Added marcos to turn on simd code compilation in
+       amd64. Added max simd register count marco. Added caller/callee
+       register mask marcos. Added marcos to use simd register bank.
+
+       * mini.h: Added helper marco for shufling dwords and simple
+       floats.
+
 2009-07-14  Zoltan Varga  <vargaz@gmail.com>
 
        * mini-llvm-cpp.cpp: Update to latest llvm SVN api.
index 803a0e0e22c29bf25be65674db53c4b0cc956c49..14887076fb4f63863fcf8b6a65d108c13283eb1a 100644 (file)
@@ -2914,6 +2914,58 @@ public class SimdTests {
                return 0;
        }
 
+       public static int test_0_simd_const_indexer_simple () {
+               Vector4f v = new Vector4f (1, 2, 3, 4);
+               
+               if (v[0] != 1) 
+                       return 1;
+               if (v[1] != 2) 
+                       return 2;
+               if (v[2] != 3) 
+                       return 3;
+               if (v[3] != 4) 
+                       return 4;
+               return 0;
+       }
+
+       public static int test_0_simd_var_indexer_simple () {
+               Vector4f v = new Vector4f (1, 2, 3, 4);
+
+               int index = 0;
+               
+               if (v[index++] != 1) 
+                       return 1;
+               if (v[index++] != 2) 
+                       return 2;
+               if (v[index++] != 3) 
+                       return 3;
+               if (v[index] != 4) 
+                       return 4;
+               return 0;
+       }
+
+       public static int test_0_simd_const_indexer_double () {
+               Vector2d v = new Vector2d (1, 2);
+               
+               if (v[0] != 1) 
+                       return 1;
+               if (v[1] != 2) 
+                       return 2;
+               return 0;
+       }
+
+       public static int test_0_simd_var_indexer_double () {
+               Vector2d v = new Vector2d (1, 2);
+
+               int index = 0;
+               
+               if (v[index++] != 1) 
+                       return 1;
+               if (v[index] != 2) 
+                       return 2;
+               return 0;
+       }
+
        public static int Main () {
                return TestDriver.RunTests (typeof (SimdTests));
        }
index e4638041ebab6a73107467e6fa229169076efa57..132f5ec4a50875c657dbf1e843828ff5846645e3 100644 (file)
@@ -503,5 +503,207 @@ loadi4_mem: dest:i len:16
 loadu1_mem: dest:i len:16
 loadu2_mem: dest:i len:16
 
+
+#SIMD
+#TODO: Some of these IR opcodes are marked as no clobber when they indeed do.
+
+addps: dest:x src1:x src2:x len:4 clob:1
+divps: dest:x src1:x src2:x len:4 clob:1
+mulps: dest:x src1:x src2:x len:4 clob:1
+subps: dest:x src1:x src2:x len:4 clob:1
+maxps: dest:x src1:x src2:x len:4 clob:1
+minps: dest:x src1:x src2:x len:4 clob:1
+compps: dest:x src1:x src2:x len:5 clob:1
+andps: dest:x src1:x src2:x len:4 clob:1
+andnps: dest:x src1:x src2:x len:4 clob:1
+orps: dest:x src1:x src2:x len:4 clob:1
+xorps: dest:x src1:x src2:x len:4 clob:1
+
+haddps: dest:x src1:x src2:x len:5 clob:1
+hsubps: dest:x src1:x src2:x len:5 clob:1
+addsubps: dest:x src1:x src2:x len:5 clob:1
+dupps_low: dest:x src1:x len:5
+dupps_high: dest:x src1:x len:5
+
+addpd: dest:x src1:x src2:x len:5 clob:1
+divpd: dest:x src1:x src2:x len:5 clob:1
+mulpd: dest:x src1:x src2:x len:5 clob:1
+subpd: dest:x src1:x src2:x len:5 clob:1
+maxpd: dest:x src1:x src2:x len:5 clob:1
+minpd: dest:x src1:x src2:x len:5 clob:1
+comppd: dest:x src1:x src2:x len:6 clob:1
+andpd: dest:x src1:x src2:x len:5 clob:1
+andnpd: dest:x src1:x src2:x len:5 clob:1
+orpd: dest:x src1:x src2:x len:5 clob:1
+xorpd: dest:x src1:x src2:x len:5 clob:1
+
+haddpd: dest:x src1:x src2:x len:6 clob:1
+hsubpd: dest:x src1:x src2:x len:6 clob:1
+addsubpd: dest:x src1:x src2:x len:6 clob:1
+duppd: dest:x src1:x len:6
+
+pand: dest:x src1:x src2:x len:5 clob:1
+por: dest:x src1:x src2:x len:5 clob:1
+pxor: dest:x src1:x src2:x len:5 clob:1
+
+sqrtps: dest:x src1:x len:5
+rsqrtps: dest:x src1:x len:5
+rcpps: dest:x src1:x len:5
+
+pshufflew_high: dest:x src1:x len:6
+pshufflew_low: dest:x src1:x len:6
+pshuffled: dest:x src1:x len:6
+
+extract_mask: dest:i src1:x len:6
+
+paddb: dest:x src1:x src2:x len:5 clob:1
+paddw: dest:x src1:x src2:x len:5 clob:1
+paddd: dest:x src1:x src2:x len:5 clob:1
+paddq: dest:x src1:x src2:x len:5 clob:1
+
+psubb: dest:x src1:x src2:x len:5 clob:1
+psubw: dest:x src1:x src2:x len:5 clob:1
+psubd: dest:x src1:x src2:x len:5 clob:1
+psubq: dest:x src1:x src2:x len:5 clob:1
+
+pmaxb_un: dest:x src1:x src2:x len:5 clob:1
+pmaxw_un: dest:x src1:x src2:x len:6 clob:1
+pmaxd_un: dest:x src1:x src2:x len:6 clob:1
+
+pmaxb: dest:x src1:x src2:x len:6 clob:1
+pmaxw: dest:x src1:x src2:x len:5 clob:1
+pmaxd: dest:x src1:x src2:x len:6 clob:1
+
+pavgb_un: dest:x src1:x src2:x len:5 clob:1
+pavgw_un: dest:x src1:x src2:x len:5 clob:1
+
+pminb_un: dest:x src1:x src2:x len:5 clob:1
+pminw_un: dest:x src1:x src2:x len:6 clob:1
+pmind_un: dest:x src1:x src2:x len:6 clob:1
+
+pminb: dest:x src1:x src2:x len:6 clob:1
+pminw: dest:x src1:x src2:x len:5 clob:1
+pmind: dest:x src1:x src2:x len:6 clob:1
+
+pcmpeqb: dest:x src1:x src2:x len:5 clob:1
+pcmpeqw: dest:x src1:x src2:x len:5 clob:1
+pcmpeqd: dest:x src1:x src2:x len:5 clob:1
+pcmpeqq: dest:x src1:x src2:x len:6 clob:1
+
+pcmpgtb: dest:x src1:x src2:x len:5 clob:1
+pcmpgtw: dest:x src1:x src2:x len:5 clob:1
+pcmpgtd: dest:x src1:x src2:x len:5 clob:1
+pcmpgtq: dest:x src1:x src2:x len:6 clob:1
+
+psumabsdiff: dest:x src1:x src2:x len:5 clob:1
+
+unpack_lowb: dest:x src1:x src2:x len:5 clob:1
+unpack_loww: dest:x src1:x src2:x len:5 clob:1
+unpack_lowd: dest:x src1:x src2:x len:5 clob:1
+unpack_lowq: dest:x src1:x src2:x len:5 clob:1
+unpack_lowps: dest:x src1:x src2:x len:5 clob:1
+unpack_lowpd: dest:x src1:x src2:x len:5 clob:1
+
+unpack_highb: dest:x src1:x src2:x len:5 clob:1
+unpack_highw: dest:x src1:x src2:x len:5 clob:1
+unpack_highd: dest:x src1:x src2:x len:5 clob:1
+unpack_highq: dest:x src1:x src2:x len:5 clob:1
+unpack_highps: dest:x src1:x src2:x len:5 clob:1
+unpack_highpd: dest:x src1:x src2:x len:5 clob:1
+
+packw: dest:x src1:x src2:x len:5 clob:1 
+packd: dest:x src1:x src2:x len:5 clob:1 
+
+packw_un: dest:x src1:x src2:x len:5 clob:1 
+packd_un: dest:x src1:x src2:x len:6 clob:1 
+
+paddb_sat: dest:x src1:x src2:x len:5 clob:1
+paddb_sat_un: dest:x src1:x src2:x len:5 clob:1
+
+paddw_sat: dest:x src1:x src2:x len:5 clob:1
+paddw_sat_un: dest:x src1:x src2:x len:5 clob:1
+
+psubb_sat: dest:x src1:x src2:x len:5 clob:1
+psubb_sat_un: dest:x src1:x src2:x len:5 clob:1
+
+psubw_sat: dest:x src1:x src2:x len:5 clob:1
+psubw_sat_un: dest:x src1:x src2:x len:5 clob:1
+
+pmulw: dest:x src1:x src2:x len:5 clob:1
+pmuld: dest:x src1:x src2:x len:6 clob:1
+pmulq: dest:x src1:x src2:x len:5 clob:1
+
+pmul_high_un: dest:x src1:x src2:x len:5 clob:1
+pmul_high: dest:x src1:x src2:x len:5 clob:1
+
+pshrw: dest:x src1:x len:6 clob:1
+pshrw_reg: dest:x src1:x src2:x len:5 clob:1
+
+psarw: dest:x src1:x len:6 clob:1
+psarw_reg: dest:x src1:x src2:x len:5 clob:1
+
+pshlw: dest:x src1:x len:6 clob:1
+pshlw_reg: dest:x src1:x src2:x len:5 clob:1
+
+pshrd: dest:x src1:x len:6 clob:1
+pshrd_reg: dest:x src1:x src2:x len:5 clob:1
+
+psard: dest:x src1:x len:6 clob:1
+psard_reg: dest:x src1:x src2:x len:5 clob:1
+
+pshld: dest:x src1:x len:6 clob:1
+pshld_reg: dest:x src1:x src2:x len:5 clob:1
+
+pshrq: dest:x src1:x len:6 clob:1
+pshrq_reg: dest:x src1:x src2:x len:5 clob:1
+
+pshlq: dest:x src1:x len:6 clob:1
+pshlq_reg: dest:x src1:x src2:x len:5 clob:1
+
+xmove: dest:x src1:x len:5
+xzero: dest:x len:5
+
+iconv_to_x: dest:x src1:i len:5
+extract_i4: dest:i src1:x len:5
+
+extract_i8: dest:i src1:x len:9
+
+extract_i2: dest:i src1:x len:13
+extract_u2: dest:i src1:x len:13
+extract_i1: dest:i src1:x len:13
+extract_u1: dest:i src1:x len:13
+extract_r8: dest:f src1:x len:5 
+
+iconv_to_r8_raw: dest:f src1:i len:9
+
+insert_i2: dest:x src1:x src2:i len:6 clob:1
+
+extractx_u2: dest:i src1:x len:6
+insertx_u1_slow: dest:x src1:i src2:i len:18 clob:x
+
+insertx_i4_slow: dest:x src1:x src2:i len:16 clob:x
+insertx_i8_slow: dest:x src1:x src2:i len:13
+insertx_r4_slow: dest:x src1:x src2:f len:24
+insertx_r8_slow: dest:x src1:x src2:f len:24
+
+loadx_membase: dest:x src1:b len:9
+storex_membase: dest:b src1:x len:9
+storex_membase_reg: dest:b src1:x len:9
+
+loadx_aligned_membase: dest:x src1:b len:7
+storex_aligned_membase_reg: dest:b src1:x len:7
+storex_nta_membase_reg: dest:b src1:x len:7
+
+fconv_to_r8_x: dest:x src1:f len:4 
+xconv_r8_to_i4: dest:y src1:x len:7
+
+prefetch_membase: src1:b len:4
+
+expand_i2: dest:x src1:i len:18
+expand_i4: dest:x src1:i len:11
+expand_i8: dest:x src1:i len:11
+expand_r4: dest:x src1:f len:16
+expand_r8: dest:x src1:f len:13
+
 liverange_start: len:0
 liverange_end: len:0
index d51ecaf37898b9f03c6abced860ce6032da4a520..2547707016f8c4547216f8dd0f5c6f575ebe31f8 100644 (file)
@@ -131,6 +131,13 @@ mono_arch_fregname (int reg)
                return "unknown";
 }
 
+/* TODO: Figure out away of telling this and the one above apart if things get confussing. */
+const char *
+mono_arch_xregname (int reg)
+{
+       return mono_arch_fregname (reg);
+}
+
 G_GNUC_UNUSED static void
 break_count (void)
 {
@@ -2511,6 +2518,30 @@ mono_arch_lowering_pass (MonoCompile *cfg, MonoBasicBlock *bb)
                                ins->sreg1 = temp->dreg;
                        }
                        break;
+#ifdef MONO_ARCH_SIMD_INTRINSICS
+               case OP_EXPAND_I1: {
+                               int temp_reg1 = mono_alloc_ireg (cfg);
+                               int temp_reg2 = mono_alloc_ireg (cfg);
+                               int original_reg = ins->sreg1;
+
+                               NEW_INS (cfg, ins, temp, OP_ICONV_TO_U1);
+                               temp->sreg1 = original_reg;
+                               temp->dreg = temp_reg1;
+
+                               NEW_INS (cfg, ins, temp, OP_SHL_IMM);
+                               temp->sreg1 = temp_reg1;
+                               temp->dreg = temp_reg2;
+                               temp->inst_imm = 8;
+
+                               NEW_INS (cfg, ins, temp, OP_LOR);
+                               temp->sreg1 = temp->dreg = temp_reg2;
+                               temp->sreg2 = temp_reg1;
+
+                               ins->opcode = OP_EXPAND_I2;
+                               ins->sreg1 = temp_reg2;
+                       }
+                       break;
+#endif
                default:
                        break;
                }
@@ -4482,6 +4513,7 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        break;
                }
 #ifdef MONO_ARCH_SIMD_INTRINSICS
+               /* TODO: Some of these IR opcodes are marked as no clobber when they indeed do. */
                case OP_ADDPS:
                        amd64_sse_addps_reg_reg (code, ins->sreg1, ins->sreg2);
                        break;
@@ -4632,6 +4664,428 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                case OP_PADDQ:
                        amd64_sse_paddq_reg_reg (code, ins->sreg1, ins->sreg2);
                        break;
+
+               case OP_PSUBB:
+                       amd64_sse_psubb_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PSUBW:
+                       amd64_sse_psubw_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PSUBD:
+                       amd64_sse_psubd_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PSUBQ:
+                       amd64_sse_psubq_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+
+               case OP_PMAXB_UN:
+                       amd64_sse_pmaxub_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PMAXW_UN:
+                       amd64_sse_pmaxuw_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PMAXD_UN:
+                       amd64_sse_pmaxud_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               
+               case OP_PMAXB:
+                       amd64_sse_pmaxsb_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PMAXW:
+                       amd64_sse_pmaxsw_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PMAXD:
+                       amd64_sse_pmaxsd_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+
+               case OP_PAVGB_UN:
+                       amd64_sse_pavgb_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PAVGW_UN:
+                       amd64_sse_pavgw_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+
+               case OP_PMINB_UN:
+                       amd64_sse_pminub_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PMINW_UN:
+                       amd64_sse_pminuw_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PMIND_UN:
+                       amd64_sse_pminud_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+
+               case OP_PMINB:
+                       amd64_sse_pminsb_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PMINW:
+                       amd64_sse_pminsw_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PMIND:
+                       amd64_sse_pminsd_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+
+               case OP_PCMPEQB:
+                       amd64_sse_pcmpeqb_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PCMPEQW:
+                       amd64_sse_pcmpeqw_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PCMPEQD:
+                       amd64_sse_pcmpeqd_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PCMPEQQ:
+                       amd64_sse_pcmpeqq_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+
+               case OP_PCMPGTB:
+                       amd64_sse_pcmpgtb_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PCMPGTW:
+                       amd64_sse_pcmpgtw_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PCMPGTD:
+                       amd64_sse_pcmpgtd_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PCMPGTQ:
+                       amd64_sse_pcmpgtq_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+
+               case OP_PSUM_ABS_DIFF:
+                       amd64_sse_psadbw_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+
+               case OP_UNPACK_LOWB:
+                       amd64_sse_punpcklbw_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_UNPACK_LOWW:
+                       amd64_sse_punpcklwd_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_UNPACK_LOWD:
+                       amd64_sse_punpckldq_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_UNPACK_LOWQ:
+                       amd64_sse_punpcklqdq_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_UNPACK_LOWPS:
+                       amd64_sse_unpcklps_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_UNPACK_LOWPD:
+                       amd64_sse_unpcklpd_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+
+               case OP_UNPACK_HIGHB:
+                       amd64_sse_punpckhbw_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_UNPACK_HIGHW:
+                       amd64_sse_punpckhwd_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_UNPACK_HIGHD:
+                       amd64_sse_punpckhdq_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_UNPACK_HIGHQ:
+                       amd64_sse_punpckhqdq_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_UNPACK_HIGHPS:
+                       amd64_sse_unpckhps_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_UNPACK_HIGHPD:
+                       amd64_sse_unpckhpd_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+
+               case OP_PACKW:
+                       amd64_sse_packsswb_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PACKD:
+                       amd64_sse_packssdw_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PACKW_UN:
+                       amd64_sse_packuswb_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PACKD_UN:
+                       amd64_sse_packusdw_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+
+               case OP_PADDB_SAT_UN:
+                       amd64_sse_paddusb_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PSUBB_SAT_UN:
+                       amd64_sse_psubusb_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PADDW_SAT_UN:
+                       amd64_sse_paddusw_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PSUBW_SAT_UN:
+                       amd64_sse_psubusw_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+
+               case OP_PADDB_SAT:
+                       amd64_sse_paddsb_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PSUBB_SAT:
+                       amd64_sse_psubsb_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PADDW_SAT:
+                       amd64_sse_paddsw_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PSUBW_SAT:
+                       amd64_sse_psubsw_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+                       
+               case OP_PMULW:
+                       amd64_sse_pmullw_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PMULD:
+                       amd64_sse_pmulld_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PMULQ:
+                       amd64_sse_pmuludq_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PMULW_HIGH_UN:
+                       amd64_sse_pmulhuw_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PMULW_HIGH:
+                       amd64_sse_pmulhw_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+
+               case OP_PSHRW:
+                       amd64_sse_psrlw_reg_imm (code, ins->dreg, ins->inst_imm);
+                       break;
+               case OP_PSHRW_REG:
+                       amd64_sse_psrlw_reg_reg (code, ins->dreg, ins->sreg2);
+                       break;
+
+               case OP_PSARW:
+                       amd64_sse_psraw_reg_imm (code, ins->dreg, ins->inst_imm);
+                       break;
+               case OP_PSARW_REG:
+                       amd64_sse_psraw_reg_reg (code, ins->dreg, ins->sreg2);
+                       break;
+
+               case OP_PSHLW:
+                       amd64_sse_psllw_reg_imm (code, ins->dreg, ins->inst_imm);
+                       break;
+               case OP_PSHLW_REG:
+                       amd64_sse_psllw_reg_reg (code, ins->dreg, ins->sreg2);
+                       break;
+
+               case OP_PSHRD:
+                       amd64_sse_psrld_reg_imm (code, ins->dreg, ins->inst_imm);
+                       break;
+               case OP_PSHRD_REG:
+                       amd64_sse_psrld_reg_reg (code, ins->dreg, ins->sreg2);
+                       break;
+
+               case OP_PSARD:
+                       amd64_sse_psrad_reg_imm (code, ins->dreg, ins->inst_imm);
+                       break;
+               case OP_PSARD_REG:
+                       amd64_sse_psrad_reg_reg (code, ins->dreg, ins->sreg2);
+                       break;
+
+               case OP_PSHLD:
+                       amd64_sse_pslld_reg_imm (code, ins->dreg, ins->inst_imm);
+                       break;
+               case OP_PSHLD_REG:
+                       amd64_sse_pslld_reg_reg (code, ins->dreg, ins->sreg2);
+                       break;
+
+               case OP_PSHRQ:
+                       amd64_sse_psrlq_reg_imm (code, ins->dreg, ins->inst_imm);
+                       break;
+               case OP_PSHRQ_REG:
+                       amd64_sse_psrlq_reg_reg (code, ins->dreg, ins->sreg2);
+                       break;
+               
+               /*TODO: This is appart of the sse spec but not added
+               case OP_PSARQ:
+                       amd64_sse_psraq_reg_imm (code, ins->dreg, ins->inst_imm);
+                       break;
+               case OP_PSARQ_REG:
+                       amd64_sse_psraq_reg_reg (code, ins->dreg, ins->sreg2);
+                       break;  
+               */
+       
+               case OP_PSHLQ:
+                       amd64_sse_psllq_reg_imm (code, ins->dreg, ins->inst_imm);
+                       break;
+               case OP_PSHLQ_REG:
+                       amd64_sse_psllq_reg_reg (code, ins->dreg, ins->sreg2);
+                       break;  
+
+               case OP_ICONV_TO_X:
+                       amd64_movd_xreg_reg_size (code, ins->dreg, ins->sreg1, 4);
+                       break;
+               case OP_EXTRACT_I4:
+                       amd64_movd_reg_xreg_size (code, ins->dreg, ins->sreg1, 4);
+                       break;
+               case OP_EXTRACT_I8:
+                       if (ins->inst_c0) {
+                               amd64_movhlps_reg_reg (code, AMD64_XMM15, ins->sreg1);
+                               amd64_movd_reg_xreg_size (code, ins->dreg, AMD64_XMM15, 8);
+                       } else {
+                               amd64_movd_reg_xreg_size (code, ins->dreg, ins->sreg1, 8);
+                       }
+                       break;
+               case OP_EXTRACT_I1:
+               case OP_EXTRACT_U1:
+                       amd64_movd_reg_xreg_size (code, ins->dreg, ins->sreg1, 4);
+                       if (ins->inst_c0)
+                               amd64_shift_reg_imm (code, X86_SHR, ins->dreg, ins->inst_c0 * 8);
+                       amd64_widen_reg (code, ins->dreg, ins->dreg, ins->opcode == OP_EXTRACT_I1, FALSE);
+                       break;
+               case OP_EXTRACT_I2:
+               case OP_EXTRACT_U2:
+                       /*amd64_movd_reg_xreg_size (code, ins->dreg, ins->sreg1, 4);
+                       if (ins->inst_c0)
+                               amd64_shift_reg_imm_size (code, X86_SHR, ins->dreg, 16, 4);*/
+                       amd64_sse_pextrw_reg_reg_imm (code, ins->dreg, ins->sreg1, ins->inst_c0);
+                       amd64_widen_reg_size (code, ins->dreg, ins->dreg, ins->opcode == OP_EXTRACT_I2, TRUE, 4);
+                       break;
+               case OP_EXTRACT_R8:
+                       if (ins->inst_c0)
+                               amd64_movhlps_reg_reg (code, ins->dreg, ins->sreg1);
+                       else
+                               amd64_sse_movsd_reg_reg (code, ins->dreg, ins->sreg1);
+                       break;
+               case OP_INSERT_I2:
+                       amd64_sse_pinsrw_reg_reg_imm (code, ins->sreg1, ins->sreg2, ins->inst_c0);
+                       break;
+               case OP_EXTRACTX_U2:
+                       amd64_sse_pextrw_reg_reg_imm (code, ins->dreg, ins->sreg1, ins->inst_c0);
+                       break;
+               case OP_INSERTX_U1_SLOW:
+                       /*sreg1 is the extracted ireg (scratch)
+                       /sreg2 is the to be inserted ireg (scratch)
+                       /dreg is the xreg to receive the value*/
+
+                       /*clear the bits from the extracted word*/
+                       amd64_alu_reg_imm (code, X86_AND, ins->sreg1, ins->inst_c0 & 1 ? 0x00FF : 0xFF00);
+                       /*shift the value to insert if needed*/
+                       if (ins->inst_c0 & 1)
+                               amd64_shift_reg_imm_size (code, X86_SHL, ins->sreg2, 8, 4);
+                       /*join them together*/
+                       amd64_alu_reg_reg (code, X86_OR, ins->sreg1, ins->sreg2);
+                       amd64_sse_pinsrw_reg_reg_imm (code, ins->dreg, ins->sreg1, ins->inst_c0 / 2);
+                       break;
+               case OP_INSERTX_I4_SLOW:
+                       amd64_sse_pinsrw_reg_reg_imm (code, ins->dreg, ins->sreg2, ins->inst_c0 * 2);
+                       amd64_shift_reg_imm (code, X86_SHR, ins->sreg2, 16);
+                       amd64_sse_pinsrw_reg_reg_imm (code, ins->dreg, ins->sreg2, ins->inst_c0 * 2 + 1);
+                       break;
+               case OP_INSERTX_I8_SLOW:
+                       amd64_movd_xreg_reg_size(code, AMD64_XMM15, ins->sreg2, 8);
+                       if (ins->inst_c0)
+                               amd64_movlhps_reg_reg (code, ins->dreg, AMD64_XMM15);
+                       else
+                               amd64_sse_movsd_reg_reg (code, ins->dreg, AMD64_XMM15);
+                       break;
+
+               case OP_INSERTX_R4_SLOW:
+                       switch (ins->inst_c0) {
+                       case 0:
+                               amd64_sse_cvtsd2ss_reg_reg (code, ins->dreg, ins->sreg2);
+                               break;
+                       case 1:
+                               amd64_sse_pshufd_reg_reg_imm (code, ins->dreg, ins->dreg, mono_simd_shuffle_mask(1, 0, 2, 3));
+                               amd64_sse_cvtsd2ss_reg_reg (code, ins->dreg, ins->sreg2);
+                               amd64_sse_pshufd_reg_reg_imm (code, ins->dreg, ins->dreg, mono_simd_shuffle_mask(1, 0, 2, 3));
+                               break;
+                       case 2:
+                               amd64_sse_pshufd_reg_reg_imm (code, ins->dreg, ins->dreg, mono_simd_shuffle_mask(2, 1, 0, 3));
+                               amd64_sse_cvtsd2ss_reg_reg (code, ins->dreg, ins->sreg2);
+                               amd64_sse_pshufd_reg_reg_imm (code, ins->dreg, ins->dreg, mono_simd_shuffle_mask(2, 1, 0, 3));
+                               break;
+                       case 3:
+                               amd64_sse_pshufd_reg_reg_imm (code, ins->dreg, ins->dreg, mono_simd_shuffle_mask(3, 1, 2, 0));
+                               amd64_sse_cvtsd2ss_reg_reg (code, ins->dreg, ins->sreg2);
+                               amd64_sse_pshufd_reg_reg_imm (code, ins->dreg, ins->dreg, mono_simd_shuffle_mask(3, 1, 2, 0));
+                               break;
+                       }
+                       break;
+               case OP_INSERTX_R8_SLOW:
+                       if (ins->inst_c0)
+                               amd64_movlhps_reg_reg (code, ins->dreg, ins->sreg2);
+                       else
+                               amd64_sse_movsd_reg_reg (code, ins->dreg, ins->sreg2);
+                       break;
+               case OP_STOREX_MEMBASE_REG:
+               case OP_STOREX_MEMBASE:
+                       amd64_sse_movups_membase_reg (code, ins->dreg, ins->inst_offset, ins->sreg1);
+                       break;
+               case OP_LOADX_MEMBASE:
+                       amd64_sse_movups_reg_membase (code, ins->dreg, ins->sreg1, ins->inst_offset);
+                       break;
+               case OP_LOADX_ALIGNED_MEMBASE:
+                       amd64_sse_movaps_reg_membase (code, ins->dreg, ins->sreg1, ins->inst_offset);
+                       break;
+               case OP_STOREX_ALIGNED_MEMBASE_REG:
+                       amd64_sse_movaps_membase_reg (code, ins->dreg, ins->inst_offset, ins->sreg1);
+                       break;
+               case OP_STOREX_NTA_MEMBASE_REG:
+                       amd64_sse_movntps_reg_membase (code, ins->dreg, ins->sreg1, ins->inst_offset);
+                       break;
+               case OP_PREFETCH_MEMBASE:
+                       amd64_sse_prefetch_reg_membase (code, ins->backend.arg_info, ins->sreg1, ins->inst_offset);
+                       break;
+
+               case OP_XMOVE:
+                       /*FIXME the peephole pass should have killed this*/
+                       if (ins->dreg != ins->sreg1)
+                               amd64_sse_movaps_reg_reg (code, ins->dreg, ins->sreg1);
+                       break;          
+               case OP_XZERO:
+                       amd64_sse_pxor_reg_reg (code, ins->dreg, ins->dreg);
+                       break;
+               case OP_ICONV_TO_R8_RAW:
+                       amd64_movd_xreg_reg_size (code, ins->dreg, ins->sreg1, 4);
+                       amd64_sse_cvtss2sd_reg_reg (code, ins->dreg, ins->dreg);
+                       break;
+
+               case OP_FCONV_TO_R8_X:
+                       amd64_sse_movsd_reg_reg (code, ins->dreg, ins->sreg1);
+                       break;
+
+               case OP_XCONV_R8_TO_I4:
+                       amd64_sse_cvttsd2si_reg_xreg_size (code, ins->dreg, ins->sreg1, 4);
+                       switch (ins->backend.source_opcode) {
+                       case OP_FCONV_TO_I1:
+                               amd64_widen_reg (code, ins->dreg, ins->dreg, TRUE, FALSE);
+                               break;
+                       case OP_FCONV_TO_U1:
+                               amd64_widen_reg (code, ins->dreg, ins->dreg, FALSE, FALSE);
+                               break;
+                       case OP_FCONV_TO_I2:
+                               amd64_widen_reg (code, ins->dreg, ins->dreg, TRUE, TRUE);
+                               break;
+                       case OP_FCONV_TO_U2:
+                               amd64_widen_reg (code, ins->dreg, ins->dreg, FALSE, TRUE);
+                               break;
+                       }                       
+                       break;
+
+               case OP_EXPAND_I2:
+                       amd64_sse_pinsrw_reg_reg_imm (code, ins->dreg, ins->sreg1, 0);
+                       amd64_sse_pinsrw_reg_reg_imm (code, ins->dreg, ins->sreg1, 1);
+                       amd64_sse_pshufd_reg_reg_imm (code, ins->dreg, ins->dreg, 0);
+                       break;
+               case OP_EXPAND_I4:
+                       amd64_movd_xreg_reg_size (code, ins->dreg, ins->sreg1, 4);
+                       amd64_sse_pshufd_reg_reg_imm (code, ins->dreg, ins->dreg, 0);
+                       break;
+               case OP_EXPAND_I8:
+                       amd64_movd_xreg_reg_size (code, ins->dreg, ins->sreg1, 8);
+                       amd64_sse_pshufd_reg_reg_imm (code, ins->dreg, ins->dreg, 0x44);
+                       break;
+               case OP_EXPAND_R4:
+                       amd64_sse_movsd_reg_reg (code, ins->dreg, ins->sreg1);
+                       amd64_sse_cvtsd2ss_reg_reg (code, ins->dreg, ins->dreg);
+                       amd64_sse_pshufd_reg_reg_imm (code, ins->dreg, ins->dreg, 0);
+                       break;
+               case OP_EXPAND_R8:
+                       amd64_sse_movsd_reg_reg (code, ins->dreg, ins->sreg1);
+                       amd64_sse_pshufd_reg_reg_imm (code, ins->dreg, ins->dreg, 0x44);
+                       break;
 #endif
                case OP_LIVERANGE_START: {
                        if (cfg->verbose_level > 1)
index 3907fc83bfc01855bcb80dae675292b7b5cda9a4..6f6ff467ad16ff9468f0c960d96390cd16f8dd6a 100644 (file)
@@ -74,6 +74,15 @@ struct sigcontext {
 };
 #endif  // sun, Solaris x86
 
+#define MONO_ARCH_SUPPORT_SIMD_INTRINSICS 1
+
+#ifndef DISABLE_SIMD
+#define MONO_ARCH_SIMD_INTRINSICS 1
+#define MONO_ARCH_NEED_SIMD_BANK 1
+#endif
+
+
+
 #define MONO_ARCH_SIGNAL_STACK_SIZE (16 * 1024)
 
 #define MONO_ARCH_HAVE_RESTORE_STACK_SUPPORT 1
@@ -86,10 +95,22 @@ struct sigcontext {
 
 #define MONO_ARCH_FP_RETURN_REG AMD64_XMM0
 
+/* XREG and FREG banks are the same in amd64 
+ * Until the regalloc is redone so that they can play nicely
+ * divide up the regbank and force them to play in thier own worlds.
+ *
+ * FREGS are at the top of the xmm reg bank and the XREGS are at the bottom.
+ */
 /* xmm15 is reserved for use by some opcodes */
-#define MONO_ARCH_CALLEE_FREGS 0xef
+#define MONO_ARCH_CALLEE_FREGS 0x00ff
 #define MONO_ARCH_CALLEE_SAVED_FREGS 0
 
+#define MONO_MAX_XREGS MONO_MAX_FREGS
+
+#define MONO_ARCH_CALLEE_XREGS 0x7f00
+#define MONO_ARCH_CALLEE_SAVED_XREGS 0
+
+
 #define MONO_ARCH_CALLEE_REGS AMD64_CALLEE_REGS
 #define MONO_ARCH_CALLEE_SAVED_REGS AMD64_CALLEE_SAVED_REGS
 
index e2ca9b14188b1180de8bd7ba1b00d603eee78b62..81f2e3929963f97be0bdddf1f5282a9f3b7a9d05 100644 (file)
@@ -91,6 +91,9 @@ typedef gint64 mgreg_t;
 /* Version number of the AOT file format */
 #define MONO_AOT_FILE_VERSION "57"
 
+//TODO: This is x86/amd64 specific.
+#define mono_simd_shuffle_mask(a,b,c,d) ((a) | ((b) << 2) | ((c) << 4) | ((d) << 6))
+
 /* Constants used to encode different types of methods in AOT */
 enum {
        MONO_AOT_METHODREF_MIN = 240,