2009-01-29 Zoltan Varga <vargaz@gmail.com>
[mono.git] / mono / mini / mini-x86.c
index 9603ab841dd21393343477225917d27053162932..65805930dbf92bff541191c0e61daba78e32f061 100644 (file)
@@ -1961,6 +1961,8 @@ x86_pop_reg (code, X86_EAX);
 #define LOOP_ALIGNMENT 8
 #define bb_is_loop_start(bb) ((bb)->loop_body_start && (bb)->nesting)
 
+#ifndef DISABLE_JIT
+
 void
 mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
 {
@@ -3171,6 +3173,9 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                case OP_SQRT:
                        x86_fsqrt (code);
                        break;
+               case OP_ROUND:
+                       x86_frndint (code);
+                       break;
                case OP_IMIN:
                        g_assert (cfg->opt & MONO_OPT_CMOV);
                        g_assert (ins->dreg == ins->sreg1);
@@ -4028,6 +4033,53 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                                x86_shift_reg_imm (code, X86_SHR, ins->dreg, 16);
                        x86_widen_reg (code, ins->dreg, ins->dreg, ins->opcode == OP_EXTRACT_I2, TRUE);
                        break;
+               case OP_EXTRACT_R8:
+                       if (ins->inst_c0)
+                               x86_sse_alu_pd_membase_reg (code, X86_SSE_MOVHPD_MEMBASE_REG, ins->backend.spill_var->inst_basereg, ins->backend.spill_var->inst_offset, ins->sreg1);
+                       else
+                               x86_sse_alu_sd_membase_reg (code, X86_SSE_MOVSD_MEMBASE_REG, ins->backend.spill_var->inst_basereg, ins->backend.spill_var->inst_offset, ins->sreg1);
+                       x86_fld_membase (code, ins->backend.spill_var->inst_basereg, ins->backend.spill_var->inst_offset, TRUE);
+                       break;
+
+               case OP_INSERT_I2:
+                       x86_sse_alu_pd_reg_reg_imm (code, X86_SSE_PINSRW, ins->sreg1, ins->sreg2, ins->inst_c0);
+                       break;
+               case OP_EXTRACTX_U2:
+                       x86_sse_alu_pd_reg_reg_imm (code, X86_SSE_PEXTRW, ins->dreg, ins->sreg1, ins->inst_c0);
+                       break;
+               case OP_INSERTX_U1_SLOW:
+                       /* sreg1 is the extracted ireg (scratch),
+                        * sreg2 is the ireg to be inserted (scratch),
+                        * dreg is the xreg that receives the value. */
+
+                       /*clear the bits from the extracted word*/
+                       x86_alu_reg_imm (code, X86_AND, ins->sreg1, ins->inst_c0 & 1 ? 0x00FF : 0xFF00);
+                       /*shift the value to insert if needed*/
+                       if (ins->inst_c0 & 1)
+                               x86_shift_reg_imm (code, X86_SHL, ins->sreg2, 8);
+                       /*join them together*/
+                       x86_alu_reg_reg (code, X86_OR, ins->sreg1, ins->sreg2);
+                       x86_sse_alu_pd_reg_reg_imm (code, X86_SSE_PINSRW, ins->dreg, ins->sreg1, ins->inst_c0 / 2);
+                       break;
+               case OP_INSERTX_I4_SLOW:
+                       x86_sse_alu_pd_reg_reg_imm (code, X86_SSE_PINSRW, ins->dreg, ins->sreg2, ins->inst_c0 * 2);
+                       x86_shift_reg_imm (code, X86_SHR, ins->sreg2, 16);
+                       x86_sse_alu_pd_reg_reg_imm (code, X86_SSE_PINSRW, ins->dreg, ins->sreg2, ins->inst_c0 * 2 + 1);
+                       break;
+
+               case OP_INSERTX_R4_SLOW:
+                       x86_fst_membase (code, ins->backend.spill_var->inst_basereg, ins->backend.spill_var->inst_offset, FALSE, TRUE);
+                       /*TODO if inst_c0 == 0 use movss*/
+                       x86_sse_alu_pd_reg_membase_imm (code, X86_SSE_PINSRW, ins->dreg, ins->backend.spill_var->inst_basereg, ins->backend.spill_var->inst_offset + 0, ins->inst_c0 * 2);
+                       x86_sse_alu_pd_reg_membase_imm (code, X86_SSE_PINSRW, ins->dreg, ins->backend.spill_var->inst_basereg, ins->backend.spill_var->inst_offset + 2, ins->inst_c0 * 2 + 1);
+                       break;
+               case OP_INSERTX_R8_SLOW:
+                       x86_fst_membase (code, ins->backend.spill_var->inst_basereg, ins->backend.spill_var->inst_offset, TRUE, TRUE);
+                       if (ins->inst_c0)
+                               x86_sse_alu_pd_reg_membase (code, X86_SSE_MOVHPD_REG_MEMBASE, ins->dreg, ins->backend.spill_var->inst_basereg, ins->backend.spill_var->inst_offset);
+                       else
+                               x86_sse_alu_pd_reg_membase (code, X86_SSE_MOVSD_REG_MEMBASE, ins->dreg, ins->backend.spill_var->inst_basereg, ins->backend.spill_var->inst_offset);
+                       break;
 
                case OP_STOREX_MEMBASE_REG:
                case OP_STOREX_MEMBASE:
@@ -4084,6 +4136,34 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                                break;
                        }                       
                        break;
+
+               case OP_EXPAND_I1:
+                       /*FIXME this causes a partial register stall, maybe it would not be that bad to use shift + mask + or*/
+                       /*The +4 is to get a mov ?h, ?l over the same reg.*/
+                       x86_mov_reg_reg (code, ins->dreg + 4, ins->dreg, 1);
+                       x86_sse_alu_pd_reg_reg_imm (code, X86_SSE_PINSRW, ins->dreg, ins->sreg1, 0);
+                       x86_sse_alu_pd_reg_reg_imm (code, X86_SSE_PINSRW, ins->dreg, ins->sreg1, 1);
+                       x86_sse_shift_reg_imm (code, X86_SSE_PSHUFD, ins->dreg, ins->dreg, 0);
+                       break;
+               case OP_EXPAND_I2:
+                       x86_sse_alu_pd_reg_reg_imm (code, X86_SSE_PINSRW, ins->dreg, ins->sreg1, 0);
+                       x86_sse_alu_pd_reg_reg_imm (code, X86_SSE_PINSRW, ins->dreg, ins->sreg1, 1);
+                       x86_sse_shift_reg_imm (code, X86_SSE_PSHUFD, ins->dreg, ins->dreg, 0);
+                       break;
+               case OP_EXPAND_I4:
+                       x86_movd_xreg_reg (code, ins->dreg, ins->sreg1);
+                       x86_sse_shift_reg_imm (code, X86_SSE_PSHUFD, ins->dreg, ins->dreg, 0);
+                       break;
+               case OP_EXPAND_R4:
+                       x86_fst_membase (code, ins->backend.spill_var->inst_basereg, ins->backend.spill_var->inst_offset, FALSE, TRUE);
+                       x86_movd_xreg_membase (code, ins->dreg, ins->backend.spill_var->inst_basereg, ins->backend.spill_var->inst_offset);
+                       x86_sse_shift_reg_imm (code, X86_SSE_PSHUFD, ins->dreg, ins->dreg, 0);
+                       break;
+               case OP_EXPAND_R8:
+                       x86_fst_membase (code, ins->backend.spill_var->inst_basereg, ins->backend.spill_var->inst_offset, TRUE, TRUE);
+                       x86_movsd_reg_membase (code, ins->dreg, ins->backend.spill_var->inst_basereg, ins->backend.spill_var->inst_offset);
+                       x86_sse_shift_reg_imm (code, X86_SSE_PSHUFD, ins->dreg, ins->dreg, 0x44);
+                       break;
 #endif
                default:
                        g_warning ("unknown opcode %s\n", mono_inst_name (ins->opcode));
@@ -4102,6 +4182,8 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
        cfg->code_len = code - cfg->native_code;
 }
 
+#endif /* DISABLE_JIT */
+
 void
 mono_arch_register_lowlevel_calls (void)
 {
@@ -4900,6 +4982,8 @@ mono_arch_emit_inst_for_method (MonoCompile *cfg, MonoMethod *cmethod, MonoMetho
                        opcode = OP_SQRT;
                } else if (strcmp (cmethod->name, "Abs") == 0 && fsig->params [0]->type == MONO_TYPE_R8) {
                        opcode = OP_ABS;
+               } else if (strcmp (cmethod->name, "Round") == 0 && fsig->param_count == 1 && fsig->params [0]->type == MONO_TYPE_R8) {
+                       opcode = OP_ROUND;
                }
                
                if (opcode) {
@@ -5320,8 +5404,100 @@ mono_arch_decompose_opts (MonoCompile *cfg, MonoInst *ins)
        ins->dreg = dreg;
        ins->type = STACK_I4;
        ins->backend.source_opcode = src_opcode;
+}
 
-
+void
+mono_arch_decompose_long_opts (MonoCompile *cfg, MonoInst *long_ins)
+{ /*
+ * Rewrites the 64-bit SIMD opcodes OP_EXTRACT_I8, OP_INSERTX_I8_SLOW and
+ * OP_EXPAND_I8 into sequences of other instructions that are appended to
+ * cfg->cbb, and then turns LONG_INS itself into an OP_NOP.
+ *
+ * cfg:      the compilation being processed; nothing is done unless
+ *           MONO_OPT_SIMD is set in cfg->opt.
+ * long_ins: the instruction to decompose; any opcode other than the three
+ *           listed above is left untouched.
+ *
+ * The two 32-bit parts of a long vreg are addressed here as vreg + 1 and
+ * vreg + 2 (presumably the low and the high word -- TODO confirm against
+ * the generic long decomposition code).
+ */
+       MonoInst *ins;
+       int vreg;
+       if (!(cfg->opt & MONO_OPT_SIMD))
+               return;
+       
+       /* TODO: move this to simd-intrinsic.c once SSE 4.1 dword extractors are supported, since that needs the runtime caps info. */
+       switch (long_ins->opcode) {
+       case OP_EXTRACT_I8: /* inst_c0 picks the shuffle pattern; presumably it is the 64-bit lane index (0 or 1) */
+               vreg = long_ins->sreg1; /* when inst_c0 is 0 the first half is extracted straight from the source vector */
+       
+               if (long_ins->inst_c0) { /* otherwise shuffle the source with immediate 2 into a fresh vreg first */
+                       MONO_INST_NEW (cfg, ins, OP_PSHUFLED);
+                       ins->klass = long_ins->klass;
+                       ins->sreg1 = long_ins->sreg1;
+                       ins->inst_c0 = 2;
+                       ins->type = STACK_VTYPE;
+                       ins->dreg = vreg = alloc_ireg (cfg); /* NOTE(review): result is typed STACK_VTYPE but the vreg comes from alloc_ireg -- confirm this is intended */
+                       MONO_ADD_INS (cfg->cbb, ins);
+               }
+       
+               MONO_INST_NEW (cfg, ins, OP_EXTRACT_I4); /* first 32-bit part goes to dreg + 1 */
+               ins->klass = mono_defaults.int32_class;
+               ins->sreg1 = vreg;
+               ins->type = STACK_I4;
+               ins->dreg = long_ins->dreg + 1;
+               MONO_ADD_INS (cfg->cbb, ins);
+       
+               MONO_INST_NEW (cfg, ins, OP_PSHUFLED); /* second shuffle always reads the original source, with immediate 3 or 1 */
+               ins->klass = long_ins->klass;
+               ins->sreg1 = long_ins->sreg1;
+               ins->inst_c0 = long_ins->inst_c0 ? 3 : 1;
+               ins->type = STACK_VTYPE;
+               ins->dreg = vreg = alloc_ireg (cfg); /* NOTE(review): same alloc_ireg / STACK_VTYPE combination as above */
+               MONO_ADD_INS (cfg->cbb, ins);
+       
+               MONO_INST_NEW (cfg, ins, OP_EXTRACT_I4); /* second 32-bit part goes to dreg + 2 */
+               ins->klass = mono_defaults.int32_class;
+               ins->sreg1 = vreg;
+               ins->type = STACK_I4;
+               ins->dreg = long_ins->dreg + 2;
+               MONO_ADD_INS (cfg->cbb, ins);
+       
+               long_ins->opcode = OP_NOP; /* the original instruction has been fully replaced */
+               break;
+       case OP_INSERTX_I8_SLOW: /* one 64-bit insert becomes two 32-bit inserts into the same vector (dreg is also sreg1) */
+               MONO_INST_NEW (cfg, ins, OP_INSERTX_I4_SLOW);
+               ins->dreg = long_ins->dreg;
+               ins->sreg1 = long_ins->dreg;
+               ins->sreg2 = long_ins->sreg2 + 1;
+               ins->inst_c0 = long_ins->inst_c0 * 2; /* first 32-bit slot of 64-bit element inst_c0 */
+               MONO_ADD_INS (cfg->cbb, ins);
+
+               MONO_INST_NEW (cfg, ins, OP_INSERTX_I4_SLOW);
+               ins->dreg = long_ins->dreg;
+               ins->sreg1 = long_ins->dreg;
+               ins->sreg2 = long_ins->sreg2 + 2;
+               ins->inst_c0 = long_ins->inst_c0 * 2 + 1; /* second 32-bit slot of the same 64-bit element */
+               MONO_ADD_INS (cfg->cbb, ins);
+
+               long_ins->opcode = OP_NOP;
+               break;
+       case OP_EXPAND_I8: /* build the 64-bit value in the vector from its two 32-bit parts, then shuffle with 0x44 */
+               MONO_INST_NEW (cfg, ins, OP_ICONV_TO_X); /* sreg1 + 1 is converted into dreg; presumably it lands in the lowest 32-bit slot -- TODO confirm */
+               ins->dreg = long_ins->dreg;
+               ins->sreg1 = long_ins->sreg1 + 1;
+               ins->klass = long_ins->klass;
+               ins->type = STACK_VTYPE;
+               MONO_ADD_INS (cfg->cbb, ins);
+
+               MONO_INST_NEW (cfg, ins, OP_INSERTX_I4_SLOW); /* sreg1 + 2 is inserted into 32-bit slot 1 of dreg */
+               ins->dreg = long_ins->dreg;
+               ins->sreg1 = long_ins->dreg;
+               ins->sreg2 = long_ins->sreg1 + 2;
+               ins->inst_c0 = 1;
+               ins->klass = long_ins->klass;
+               ins->type = STACK_VTYPE;
+               MONO_ADD_INS (cfg->cbb, ins);
+
+               MONO_INST_NEW (cfg, ins, OP_PSHUFLED); /* in-place shuffle: dreg is both source and destination */
+               ins->dreg = long_ins->dreg;
+               ins->sreg1 = long_ins->dreg;; /* NOTE(review): the second ';' is a stray empty statement */
+               ins->inst_c0 = 0x44; /*Magic number for swizzling (X,Y,X,Y)*/
+               ins->klass = long_ins->klass;
+               ins->type = STACK_VTYPE;
+               MONO_ADD_INS (cfg->cbb, ins);
+
+               long_ins->opcode = OP_NOP;
+               break;
+       }
+}
 #endif