2009-01-29 Zoltan Varga <vargaz@gmail.com>
[mono.git] / mono / mini / mini-x86.c
index f62b7901f513efe7ce56b0b1723bceffbb0aab2f..65805930dbf92bff541191c0e61daba78e32f061 100644 (file)
@@ -25,8 +25,8 @@
 
 #include "trace.h"
 #include "mini-x86.h"
-#include "inssel.h"
 #include "cpu-x86.h"
+#include "ir-emit.h"
 
 /* On windows, these hold the key returned by TlsAlloc () */
 static gint lmf_tls_offset = -1;
@@ -528,7 +528,7 @@ mono_arch_get_argument_info (MonoMethodSignature *csig, int param_count, MonoJit
                offset += size;
        }
 
-       if (mono_do_x86_stack_align)
+       if (mono_do_x86_stack_align && !CALLCONV_IS_STDCALL (csig))
                align = MONO_ARCH_FRAME_ALIGNMENT;
        else
                align = 4;
@@ -897,6 +897,13 @@ mono_arch_allocate_vars (MonoCompile *cfg)
                offset += (locals_stack_align - 1);
                offset &= ~(locals_stack_align - 1);
        }
+       /*
+        * EBP is at alignment 8 % MONO_ARCH_FRAME_ALIGNMENT, so if we
+        * have locals larger than 8 bytes we need to make sure that
+        * they have the appropriate offset.
+        */
+       if (MONO_ARCH_FRAME_ALIGNMENT > 8 && locals_stack_align > 8)
+               offset += MONO_ARCH_FRAME_ALIGNMENT - sizeof (gpointer) * 2;
        for (i = cfg->locals_start; i < cfg->num_varinfo; i++) {
                if (offsets [i] != -1) {
                        MonoInst *inst = cfg->varinfo [i];
@@ -986,38 +993,6 @@ mono_arch_create_vars (MonoCompile *cfg)
        }
 }
 
-static void
-emit_sig_cookie (MonoCompile *cfg, MonoCallInst *call)
-{
-       MonoInst *arg;
-       MonoMethodSignature *tmp_sig;
-       MonoInst *sig_arg;
-
-       /* FIXME: Add support for signature tokens to AOT */
-       cfg->disable_aot = TRUE;
-       MONO_INST_NEW (cfg, arg, OP_OUTARG);
-
-       /*
-        * mono_ArgIterator_Setup assumes the signature cookie is 
-        * passed first and all the arguments which were before it are
-        * passed on the stack after the signature. So compensate by 
-        * passing a different signature.
-        */
-       tmp_sig = mono_metadata_signature_dup (call->signature);
-       tmp_sig->param_count -= call->signature->sentinelpos;
-       tmp_sig->sentinelpos = 0;
-       memcpy (tmp_sig->params, call->signature->params + call->signature->sentinelpos, tmp_sig->param_count * sizeof (MonoType*));
-
-       MONO_INST_NEW (cfg, sig_arg, OP_ICONST);
-       sig_arg->inst_p0 = tmp_sig;
-
-       arg->inst_left = sig_arg;
-       arg->type = STACK_PTR;
-       /* prepend, so they get reversed */
-       arg->next = call->out_args;
-       call->out_args = arg;
-}
-
 /*
  * It is expensive to adjust esp for each individual fp argument pushed on the stack
  * so we try to do it just once when we have multiple fp arguments in a row.
@@ -1027,7 +1002,7 @@ emit_sig_cookie (MonoCompile *cfg, MonoCallInst *call)
  * fp_arg_setup is the first argument in the execution sequence where the esp register
  * is modified.
  */
-static int
+static G_GNUC_UNUSED int
 collect_fp_stack_space (MonoMethodSignature *sig, int start_arg, int *fp_arg_setup)
 {
        int fp_space = 0;
@@ -1045,157 +1020,8 @@ collect_fp_stack_space (MonoMethodSignature *sig, int start_arg, int *fp_arg_set
        return fp_space;
 }
 
-/* 
- * take the arguments and generate the arch-specific
- * instructions to properly call the function in call.
- * This includes pushing, moving arguments to the right register
- * etc.
- */
-MonoCallInst*
-mono_arch_call_opcode (MonoCompile *cfg, MonoBasicBlock* bb, MonoCallInst *call, int is_virtual) {
-       MonoInst *arg, *in;
-       MonoMethodSignature *sig;
-       int i, n;
-       CallInfo *cinfo;
-       int sentinelpos = 0;
-       int fp_args_space = 0, fp_args_offset = 0, fp_arg_setup = -1;
-
-       sig = call->signature;
-       n = sig->param_count + sig->hasthis;
-
-       cinfo = get_call_info (cfg->generic_sharing_context, cfg->mempool, sig, FALSE);
-
-       if (!sig->pinvoke && (sig->call_convention == MONO_CALL_VARARG))
-               sentinelpos = sig->sentinelpos + (is_virtual ? 1 : 0);
-
-       for (i = 0; i < n; ++i) {
-               ArgInfo *ainfo = cinfo->args + i;
-
-               /* Emit the signature cookie just before the implicit arguments */
-               if (!sig->pinvoke && (sig->call_convention == MONO_CALL_VARARG) && (i == sentinelpos)) {
-                       emit_sig_cookie (cfg, call);
-               }
-
-               if (is_virtual && i == 0) {
-                       /* the argument will be attached to the call instrucion */
-                       in = call->args [i];
-               } else {
-                       MonoType *t;
-
-                       if (i >= sig->hasthis)
-                               t = sig->params [i - sig->hasthis];
-                       else
-                               t = &mono_defaults.int_class->byval_arg;
-                       t = mini_type_get_underlying_type (cfg->generic_sharing_context, t);
-
-                       MONO_INST_NEW (cfg, arg, OP_OUTARG);
-                       in = call->args [i];
-                       arg->cil_code = in->cil_code;
-                       arg->inst_left = in;
-                       arg->type = in->type;
-                       /* prepend, so they get reversed */
-                       arg->next = call->out_args;
-                       call->out_args = arg;
-
-                       if ((i >= sig->hasthis) && (MONO_TYPE_ISSTRUCT(t))) {
-                               gint align;
-                               guint32 ialign;
-                               guint32 size;
-
-                               if (t->type == MONO_TYPE_TYPEDBYREF) {
-                                       size = sizeof (MonoTypedRef);
-                                       align = sizeof (gpointer);
-                               }
-                               else {
-                                       size = mini_type_stack_size_full (cfg->generic_sharing_context, &in->klass->byval_arg, &ialign, sig->pinvoke);
-                               }
-                               arg->opcode = OP_OUTARG_VT;
-                               arg->klass = in->klass;
-                               arg->backend.is_pinvoke = sig->pinvoke;
-                               arg->inst_imm = size; 
-                       }
-                       else {
-                               switch (ainfo->storage) {
-                               case ArgOnStack:
-                                       arg->opcode = OP_OUTARG;
-                                       if (!t->byref) {
-                                               if (t->type == MONO_TYPE_R4) {
-                                                       arg->opcode = OP_OUTARG_R4;
-                                               } else if (t->type == MONO_TYPE_R8) {
-                                                       arg->opcode = OP_OUTARG_R8;
-                                                       /* we store in the upper bits of backen.arg_info the needed
-                                                        * esp adjustment and in the lower bits the offset from esp
-                                                        * where the arg needs to be stored
-                                                        */
-                                                       if (!fp_args_space) {
-                                                               fp_args_space = collect_fp_stack_space (sig, i - sig->hasthis, &fp_arg_setup);
-                                                               fp_args_offset = fp_args_space;
-                                                       }
-                                                       arg->backend.arg_info = fp_args_space - fp_args_offset;
-                                                       fp_args_offset -= sizeof (double);
-                                                       if (i - sig->hasthis == fp_arg_setup) {
-                                                               arg->backend.arg_info |= fp_args_space << 16;
-                                                       }
-                                                       if (fp_args_offset == 0) {
-                                                               /* the allocated esp stack is finished:
-                                                                * prepare for an eventual second run of fp args
-                                                                */
-                                                               fp_args_space = 0;
-                                                       }
-                                               }
-                                       }
-                                       break;
-                               default:
-                                       g_assert_not_reached ();
-                               }
-                       }
-               }
-       }
-
-       /* Handle the case where there are no implicit arguments */
-       if (!sig->pinvoke && (sig->call_convention == MONO_CALL_VARARG) && (n == sentinelpos)) {
-               emit_sig_cookie (cfg, call);
-       }
-
-       if (sig->ret && MONO_TYPE_ISSTRUCT (sig->ret)) {
-               if (cinfo->ret.storage == ArgValuetypeInReg) {
-                       MonoInst *zero_inst;
-                       /*
-                        * After the call, the struct is in registers, but needs to be saved to the memory pointed
-                        * to by vt_arg in this_vret_args. This means that vt_arg needs to be saved somewhere
-                        * before calling the function. So we add a dummy instruction to represent pushing the 
-                        * struct return address to the stack. The return address will be saved to this stack slot 
-                        * by the code emitted in this_vret_args.
-                        */
-                       MONO_INST_NEW (cfg, arg, OP_OUTARG);
-                       MONO_INST_NEW (cfg, zero_inst, OP_ICONST);
-                       zero_inst->inst_p0 = 0;
-                       arg->inst_left = zero_inst;
-                       arg->type = STACK_PTR;
-                       /* prepend, so they get reversed */
-                       arg->next = call->out_args;
-                       call->out_args = arg;
-               }
-               else
-                       /* if the function returns a struct, the called method already does a ret $0x4 */
-                       if (sig->ret && MONO_TYPE_ISSTRUCT (sig->ret))
-                               cinfo->stack_usage -= 4;
-       }
-
-       call->stack_usage = cinfo->stack_usage;
-
-       if (cinfo->need_stack_align) {
-               MONO_INST_NEW (cfg, arg, OP_X86_OUTARG_ALIGN_STACK);
-               arg->inst_c0 = cinfo->stack_align_amount;
-               arg->next = call->out_args;
-               call->out_args = arg;
-        }
-
-       return call;
-}
-
 static void
-emit_sig_cookie2 (MonoCompile *cfg, MonoCallInst *call, CallInfo *cinfo)
+emit_sig_cookie (MonoCompile *cfg, MonoCallInst *call, CallInfo *cinfo)
 {
        MonoMethodSignature *tmp_sig;
 
@@ -1267,7 +1093,7 @@ mono_arch_emit_call (MonoCompile *cfg, MonoCallInst *call)
 
        /* Handle the case where there are no implicit arguments */
        if (!sig->pinvoke && (sig->call_convention == MONO_CALL_VARARG) && (n == sentinelpos)) {
-               emit_sig_cookie2 (cfg, call, cinfo);
+               emit_sig_cookie (cfg, call, cinfo);
        }
 
        /* Arguments are pushed in the reverse order */
@@ -1343,7 +1169,7 @@ mono_arch_emit_call (MonoCompile *cfg, MonoCallInst *call)
 
                if (!sig->pinvoke && (sig->call_convention == MONO_CALL_VARARG) && (i == sentinelpos)) {
                        /* Emit the signature cookie just before the implicit arguments */
-                       emit_sig_cookie2 (cfg, call, cinfo);
+                       emit_sig_cookie (cfg, call, cinfo);
                }
        }
 
@@ -1358,7 +1184,7 @@ mono_arch_emit_call (MonoCompile *cfg, MonoCallInst *call)
                        /* The return address is passed in a register */
                        MONO_INST_NEW (cfg, vtarg, OP_MOVE);
                        vtarg->sreg1 = call->inst.dreg;
-                       vtarg->dreg = mono_regstate_next_int (cfg->rs);
+                       vtarg->dreg = mono_alloc_ireg (cfg);
                        MONO_ADD_INS (cfg->cbb, vtarg);
                                
                        mono_call_inst_add_outarg_reg (cfg, call, vtarg->dreg, cinfo->ret.reg, FALSE);
@@ -1390,7 +1216,7 @@ mono_arch_emit_outarg_vt (MonoCompile *cfg, MonoInst *ins, MonoInst *src)
                MONO_ADD_INS (cfg->cbb, arg);
        } else if (size <= 20) {        
                MONO_EMIT_NEW_BIALU_IMM (cfg, OP_SUB_IMM, X86_ESP, X86_ESP, ALIGN_TO (size, 4));
-               mini_emit_memcpy2 (cfg, X86_ESP, 0, src->dreg, 0, size, 4);
+               mini_emit_memcpy (cfg, X86_ESP, 0, src->dreg, 0, size, 4);
        } else {
                MONO_INST_NEW (cfg, arg, OP_X86_PUSH_OBJ);
                arg->inst_basereg = src->dreg;
@@ -1770,9 +1596,6 @@ mono_arch_lowering_pass (MonoCompile *cfg, MonoBasicBlock *bb)
 {
        MonoInst *ins, *next;
 
-       if (bb->max_vreg > cfg->rs->next_vreg)
-               cfg->rs->next_vreg = bb->max_vreg;
-
        /*
         * FIXME: Need to add more instructions, but the current machine 
         * description can't model some parts of the composite instructions like
@@ -1797,7 +1620,7 @@ mono_arch_lowering_pass (MonoCompile *cfg, MonoBasicBlock *bb)
                }
        }
 
-       bb->max_vreg = cfg->rs->next_vreg;
+       bb->max_vreg = cfg->next_vreg;
 }
 
 static const int 
@@ -2034,18 +1857,19 @@ emit_move_return_value (MonoCompile *cfg, MonoInst *ins, guint8 *code)
 }
 
 /*
- * emit_tls_get:
+ * mono_x86_emit_tls_get:
  * @code: buffer to store code to
  * @dreg: hard register where to place the result
  * @tls_offset: offset info
  *
- * emit_tls_get emits in @code the native code that puts in the dreg register
- * the item in the thread local storage identified by tls_offset.
+ * mono_x86_emit_tls_get emits in @code the native code that puts in
+ * the dreg register the item in the thread local storage identified
+ * by tls_offset.
  *
  * Returns: a pointer to the end of the stored code
  */
-static guint8*
-emit_tls_get (guint8* code, int dreg, int tls_offset)
+guint8*
+mono_x86_emit_tls_get (guint8* code, int dreg, int tls_offset)
 {
 #ifdef PLATFORM_WIN32
        /* 
@@ -2137,6 +1961,8 @@ x86_pop_reg (code, X86_EAX);
 #define LOOP_ALIGNMENT 8
 #define bb_is_loop_start(bb) ((bb)->loop_body_start && (bb)->nesting)
 
+#ifndef DISABLE_JIT
+
 void
 mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
 {
@@ -2228,10 +2054,7 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        x86_mov_mem_imm (code, ins->inst_p0, ins->inst_c0, 4);
                        break;
                case OP_LOADU4_MEM:
-                       if (cfg->new_ir)
-                               x86_mov_reg_mem (code, ins->dreg, ins->inst_imm, 4);
-                       else
-                               x86_mov_reg_mem (code, ins->dreg, ins->inst_p0, 4);
+                       x86_mov_reg_mem (code, ins->dreg, ins->inst_imm, 4);
                        break;
                case OP_LOAD_MEM:
                case OP_LOADI4_MEM:
@@ -2872,7 +2695,6 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        }
                        code = emit_move_return_value (cfg, ins, code);
                        break;
-               case OP_OUTARG:
                case OP_X86_PUSH:
                        x86_push_reg (code, ins->sreg1);
                        break;
@@ -3351,6 +3173,9 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                case OP_SQRT:
                        x86_fsqrt (code);
                        break;
+               case OP_ROUND:
+                       x86_frndint (code);
+                       break;
                case OP_IMIN:
                        g_assert (cfg->opt & MONO_OPT_CMOV);
                        g_assert (ins->dreg == ins->sreg1);
@@ -3684,7 +3509,7 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        break;
                }
                case OP_TLS_GET: {
-                       code = emit_tls_get (code, ins->dreg, ins->inst_offset);
+                       code = mono_x86_emit_tls_get (code, ins->dreg, ins->inst_offset);
                        break;
                }
                case OP_MEMORY_BARRIER: {
@@ -3861,10 +3686,6 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                case OP_DUPPS_LOW:
                        x86_sse_alu_ss_reg_reg (code, X86_SSE_MOVSLDUP, ins->dreg, ins->sreg1);
                        break;
-               case OP_SHUFLEPS:
-                       g_assert (ins->inst_c0 >= 0 && ins->inst_c0 <= 0xFF);
-                       x86_pshufd_reg_reg (code, ins->dreg, ins->sreg1, ins->inst_c0);
-                       break;
 
                case OP_PSHUFLEW_HIGH:
                        g_assert (ins->inst_c0 >= 0 && ins->inst_c0 <= 0xFF);
@@ -3879,6 +3700,53 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        x86_sse_shift_reg_imm (code, X86_SSE_PSHUFD, ins->dreg, ins->sreg1, ins->inst_c0);
                        break;
 
+               case OP_ADDPD:
+                       x86_sse_alu_pd_reg_reg (code, X86_SSE_ADD, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_DIVPD:
+                       x86_sse_alu_pd_reg_reg (code, X86_SSE_DIV, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_MULPD:
+                       x86_sse_alu_pd_reg_reg (code, X86_SSE_MUL, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_SUBPD:
+                       x86_sse_alu_pd_reg_reg (code, X86_SSE_SUB, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_MAXPD:
+                       x86_sse_alu_pd_reg_reg (code, X86_SSE_MAX, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_MINPD:
+                       x86_sse_alu_pd_reg_reg (code, X86_SSE_MIN, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_COMPPD:
+                       g_assert (ins->inst_c0 >= 0 && ins->inst_c0 <= 7);
+                       x86_sse_alu_pd_reg_reg_imm (code, X86_SSE_COMP, ins->sreg1, ins->sreg2, ins->inst_c0);
+                       break;
+               case OP_ANDPD:
+                       x86_sse_alu_pd_reg_reg (code, X86_SSE_AND, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_ANDNPD:
+                       x86_sse_alu_pd_reg_reg (code, X86_SSE_ANDN, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_ORPD:
+                       x86_sse_alu_pd_reg_reg (code, X86_SSE_OR, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_XORPD:
+                       x86_sse_alu_pd_reg_reg (code, X86_SSE_XOR, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_ADDSUBPD:
+                       x86_sse_alu_pd_reg_reg (code, X86_SSE_ADDSUB, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_HADDPD:
+                       x86_sse_alu_pd_reg_reg (code, X86_SSE_HADD, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_HSUBPD:
+                       x86_sse_alu_pd_reg_reg (code, X86_SSE_HSUB, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_DUPPD:
+                       x86_sse_alu_sd_reg_reg (code, X86_SSE_MOVDDUP, ins->dreg, ins->sreg1);
+                       break;
+                       
                case OP_EXTRACT_MASK:
                        x86_sse_alu_pd_reg_reg (code, X86_SSE_PMOVMSKB, ins->dreg, ins->sreg1);
                        break;
@@ -3902,6 +3770,9 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                case OP_PADDD:
                        x86_sse_alu_pd_reg_reg (code, X86_SSE_PADDD, ins->sreg1, ins->sreg2);
                        break;
+               case OP_PADDQ:
+                       x86_sse_alu_pd_reg_reg (code, X86_SSE_PADDQ, ins->sreg1, ins->sreg2);
+                       break;
 
                case OP_PSUBB:
                        x86_sse_alu_pd_reg_reg (code, X86_SSE_PSUBB, ins->sreg1, ins->sreg2);
@@ -3912,6 +3783,9 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                case OP_PSUBD:
                        x86_sse_alu_pd_reg_reg (code, X86_SSE_PSUBD, ins->sreg1, ins->sreg2);
                        break;
+               case OP_PSUBQ:
+                       x86_sse_alu_pd_reg_reg (code, X86_SSE_PSUBQ, ins->sreg1, ins->sreg2);
+                       break;
 
                case OP_PMAXB_UN:
                        x86_sse_alu_pd_reg_reg (code, X86_SSE_PMAXUB, ins->sreg1, ins->sreg2);
@@ -3969,6 +3843,9 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                case OP_PCMPEQD:
                        x86_sse_alu_pd_reg_reg (code, X86_SSE_PCMPEQD, ins->sreg1, ins->sreg2);
                        break;
+               case OP_PCMPEQQ:
+                       x86_sse_alu_sse41_reg_reg (code, X86_SSE_PCMPEQQ, ins->sreg1, ins->sreg2);
+                       break;
 
                case OP_PCMPGTB:
                        x86_sse_alu_pd_reg_reg (code, X86_SSE_PCMPGTB, ins->sreg1, ins->sreg2);
@@ -3979,6 +3856,9 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                case OP_PCMPGTD:
                        x86_sse_alu_pd_reg_reg (code, X86_SSE_PCMPGTD, ins->sreg1, ins->sreg2);
                        break;
+               case OP_PCMPGTQ:
+                       x86_sse_alu_sse41_reg_reg (code, X86_SSE_PCMPGTQ, ins->sreg1, ins->sreg2);
+                       break;
 
                case OP_PSUM_ABS_DIFF:
                        x86_sse_alu_pd_reg_reg (code, X86_SSE_PSADBW, ins->sreg1, ins->sreg2);
@@ -3993,9 +3873,15 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                case OP_UNPACK_LOWD:
                        x86_sse_alu_pd_reg_reg (code, X86_SSE_PUNPCKLDQ, ins->sreg1, ins->sreg2);
                        break;
+               case OP_UNPACK_LOWQ:
+                       x86_sse_alu_pd_reg_reg (code, X86_SSE_PUNPCKLQDQ, ins->sreg1, ins->sreg2);
+                       break;
                case OP_UNPACK_LOWPS:
                        x86_sse_alu_ps_reg_reg (code, X86_SSE_UNPCKL, ins->sreg1, ins->sreg2);
                        break;
+               case OP_UNPACK_LOWPD:
+                       x86_sse_alu_pd_reg_reg (code, X86_SSE_UNPCKL, ins->sreg1, ins->sreg2);
+                       break;
 
                case OP_UNPACK_HIGHB:
                        x86_sse_alu_pd_reg_reg (code, X86_SSE_PUNPCKHBW, ins->sreg1, ins->sreg2);
@@ -4006,9 +3892,15 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                case OP_UNPACK_HIGHD:
                        x86_sse_alu_pd_reg_reg (code, X86_SSE_PUNPCKHDQ, ins->sreg1, ins->sreg2);
                        break;
+               case OP_UNPACK_HIGHQ:
+                       x86_sse_alu_pd_reg_reg (code, X86_SSE_PUNPCKHQDQ, ins->sreg1, ins->sreg2);
+                       break;
                case OP_UNPACK_HIGHPS:
                        x86_sse_alu_ps_reg_reg (code, X86_SSE_UNPCKH, ins->sreg1, ins->sreg2);
                        break;
+               case OP_UNPACK_HIGHPD:
+                       x86_sse_alu_pd_reg_reg (code, X86_SSE_UNPCKH, ins->sreg1, ins->sreg2);
+                       break;
 
                case OP_PACKW:
                        x86_sse_alu_pd_reg_reg (code, X86_SSE_PACKSSWB, ins->sreg1, ins->sreg2);
@@ -4055,6 +3947,9 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                case OP_PMULD:
                        x86_sse_alu_sse41_reg_reg (code, X86_SSE_PMULLD, ins->sreg1, ins->sreg2);
                        break;
+               case OP_PMULQ:
+                       x86_sse_alu_pd_reg_reg (code, X86_SSE_PMULUDQ, ins->sreg1, ins->sreg2);
+                       break;
                case OP_PMULW_HIGH_UN:
                        x86_sse_alu_pd_reg_reg (code, X86_SSE_PMULHUW, ins->sreg1, ins->sreg2);
                        break;
@@ -4104,12 +3999,88 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        x86_sse_shift_reg_reg (code, X86_SSE_PSLLD_REG, ins->dreg, ins->sreg2);
                        break;
 
+               case OP_PSHRQ:
+                       x86_sse_shift_reg_imm (code, X86_SSE_PSHIFTQ, X86_SSE_SHR, ins->dreg, ins->inst_imm);
+                       break;
+               case OP_PSHRQ_REG:
+                       x86_sse_shift_reg_reg (code, X86_SSE_PSRLQ_REG, ins->dreg, ins->sreg2);
+                       break;
+
+               case OP_PSHLQ:
+                       x86_sse_shift_reg_imm (code, X86_SSE_PSHIFTQ, X86_SSE_SHL, ins->dreg, ins->inst_imm);
+                       break;
+               case OP_PSHLQ_REG:
+                       x86_sse_shift_reg_reg (code, X86_SSE_PSLLQ_REG, ins->dreg, ins->sreg2);
+                       break;          
+                       
                case OP_ICONV_TO_X:
                        x86_movd_xreg_reg (code, ins->dreg, ins->sreg1);
                        break;
                case OP_EXTRACT_I4:
                        x86_movd_reg_xreg (code, ins->dreg, ins->sreg1);
                        break;
+               case OP_EXTRACT_I1:
+               case OP_EXTRACT_U1:
+                       x86_movd_reg_xreg (code, ins->dreg, ins->sreg1);
+                       if (ins->inst_c0)
+                               x86_shift_reg_imm (code, X86_SHR, ins->dreg, ins->inst_c0 * 8);
+                       x86_widen_reg (code, ins->dreg, ins->dreg, ins->opcode == OP_EXTRACT_I1, FALSE);
+                       break;
+               case OP_EXTRACT_I2:
+               case OP_EXTRACT_U2:
+                       x86_movd_reg_xreg (code, ins->dreg, ins->sreg1);
+                       if (ins->inst_c0)
+                               x86_shift_reg_imm (code, X86_SHR, ins->dreg, 16);
+                       x86_widen_reg (code, ins->dreg, ins->dreg, ins->opcode == OP_EXTRACT_I2, TRUE);
+                       break;
+               case OP_EXTRACT_R8:
+                       if (ins->inst_c0)
+                               x86_sse_alu_pd_membase_reg (code, X86_SSE_MOVHPD_MEMBASE_REG, ins->backend.spill_var->inst_basereg, ins->backend.spill_var->inst_offset, ins->sreg1);
+                       else
+                               x86_sse_alu_sd_membase_reg (code, X86_SSE_MOVSD_MEMBASE_REG, ins->backend.spill_var->inst_basereg, ins->backend.spill_var->inst_offset, ins->sreg1);
+                       x86_fld_membase (code, ins->backend.spill_var->inst_basereg, ins->backend.spill_var->inst_offset, TRUE);
+                       break;
+
+               case OP_INSERT_I2:
+                       x86_sse_alu_pd_reg_reg_imm (code, X86_SSE_PINSRW, ins->sreg1, ins->sreg2, ins->inst_c0);
+                       break;
+               case OP_EXTRACTX_U2:
+                       x86_sse_alu_pd_reg_reg_imm (code, X86_SSE_PEXTRW, ins->dreg, ins->sreg1, ins->inst_c0);
+                       break;
+               case OP_INSERTX_U1_SLOW:
+                       /* sreg1 is the extracted ireg (scratch)
+                        * sreg2 is the ireg to be inserted (scratch)
+                        * dreg is the xreg to receive the value */
+
+                       /*clear the bits from the extracted word*/
+                       x86_alu_reg_imm (code, X86_AND, ins->sreg1, ins->inst_c0 & 1 ? 0x00FF : 0xFF00);
+                       /*shift the value to insert if needed*/
+                       if (ins->inst_c0 & 1)
+                               x86_shift_reg_imm (code, X86_SHL, ins->sreg2, 8);
+                       /*join them together*/
+                       x86_alu_reg_reg (code, X86_OR, ins->sreg1, ins->sreg2);
+                       x86_sse_alu_pd_reg_reg_imm (code, X86_SSE_PINSRW, ins->dreg, ins->sreg1, ins->inst_c0 / 2);
+                       break;
+               case OP_INSERTX_I4_SLOW:
+                       x86_sse_alu_pd_reg_reg_imm (code, X86_SSE_PINSRW, ins->dreg, ins->sreg2, ins->inst_c0 * 2);
+                       x86_shift_reg_imm (code, X86_SHR, ins->sreg2, 16);
+                       x86_sse_alu_pd_reg_reg_imm (code, X86_SSE_PINSRW, ins->dreg, ins->sreg2, ins->inst_c0 * 2 + 1);
+                       break;
+
+               case OP_INSERTX_R4_SLOW:
+                       x86_fst_membase (code, ins->backend.spill_var->inst_basereg, ins->backend.spill_var->inst_offset, FALSE, TRUE);
+                       /*TODO if inst_c0 == 0 use movss*/
+                       x86_sse_alu_pd_reg_membase_imm (code, X86_SSE_PINSRW, ins->dreg, ins->backend.spill_var->inst_basereg, ins->backend.spill_var->inst_offset + 0, ins->inst_c0 * 2);
+                       x86_sse_alu_pd_reg_membase_imm (code, X86_SSE_PINSRW, ins->dreg, ins->backend.spill_var->inst_basereg, ins->backend.spill_var->inst_offset + 2, ins->inst_c0 * 2 + 1);
+                       break;
+               case OP_INSERTX_R8_SLOW:
+                       x86_fst_membase (code, ins->backend.spill_var->inst_basereg, ins->backend.spill_var->inst_offset, TRUE, TRUE);
+                       if (ins->inst_c0)
+                               x86_sse_alu_pd_reg_membase (code, X86_SSE_MOVHPD_REG_MEMBASE, ins->dreg, ins->backend.spill_var->inst_basereg, ins->backend.spill_var->inst_offset);
+                       else
+                               x86_sse_alu_pd_reg_membase (code, X86_SSE_MOVSD_REG_MEMBASE, ins->dreg, ins->backend.spill_var->inst_basereg, ins->backend.spill_var->inst_offset);
+                       break;
+
                case OP_STOREX_MEMBASE_REG:
                case OP_STOREX_MEMBASE:
                        x86_movups_membase_reg (code, ins->dreg, ins->inst_offset, ins->sreg1);
@@ -4122,6 +4093,13 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        break;
                case OP_STOREX_ALIGNED_MEMBASE_REG:
                        x86_movaps_membase_reg (code, ins->dreg, ins->inst_offset, ins->sreg1);
+                       break;
+               case OP_STOREX_NTA_MEMBASE_REG:
+                       x86_sse_alu_reg_membase (code, X86_SSE_MOVNTPS, ins->dreg, ins->sreg1, ins->inst_offset);
+                       break;
+               case OP_PREFETCH_MEMBASE:
+                       x86_sse_alu_reg_membase (code, X86_SSE_PREFETCH, ins->backend.arg_info, ins->sreg1, ins->inst_offset);
+
                        break;
                case OP_XMOVE:
                        /*FIXME the peephole pass should have killed this*/
@@ -4135,14 +4113,6 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        x86_mov_membase_reg (code, ins->backend.spill_var->inst_basereg, ins->backend.spill_var->inst_offset, ins->sreg1, 4);
                        x86_fld_membase (code, ins->backend.spill_var->inst_basereg, ins->backend.spill_var->inst_offset, FALSE);
                        break;
-               case OP_PUSH_R4:
-                       x86_alu_reg_imm (code, X86_SUB, X86_ESP, 4);
-                       x86_fst_membase (code, X86_ESP, 0, FALSE, TRUE);
-                       break;
-               case OP_LOADX_STACK: 
-                       x86_movups_reg_membase (code, ins->dreg, X86_ESP, 0);
-                       x86_alu_reg_imm (code, X86_ADD, X86_ESP, 16);
-                       break;
 
                case OP_FCONV_TO_R8_X:
                        x86_fst_membase (code, ins->backend.spill_var->inst_basereg, ins->backend.spill_var->inst_offset, TRUE, TRUE);
@@ -4166,6 +4136,34 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                                break;
                        }                       
                        break;
+
+               case OP_EXPAND_I1:
+                       /*FIXME this causes a partial register stall, maybe it would not be that bad to use shift + mask + or*/
+                       /*The +4 is to get a mov ?h, ?l over the same reg.*/
+                       x86_mov_reg_reg (code, ins->dreg + 4, ins->dreg, 1);
+                       x86_sse_alu_pd_reg_reg_imm (code, X86_SSE_PINSRW, ins->dreg, ins->sreg1, 0);
+                       x86_sse_alu_pd_reg_reg_imm (code, X86_SSE_PINSRW, ins->dreg, ins->sreg1, 1);
+                       x86_sse_shift_reg_imm (code, X86_SSE_PSHUFD, ins->dreg, ins->dreg, 0);
+                       break;
+               case OP_EXPAND_I2:
+                       x86_sse_alu_pd_reg_reg_imm (code, X86_SSE_PINSRW, ins->dreg, ins->sreg1, 0);
+                       x86_sse_alu_pd_reg_reg_imm (code, X86_SSE_PINSRW, ins->dreg, ins->sreg1, 1);
+                       x86_sse_shift_reg_imm (code, X86_SSE_PSHUFD, ins->dreg, ins->dreg, 0);
+                       break;
+               case OP_EXPAND_I4:
+                       x86_movd_xreg_reg (code, ins->dreg, ins->sreg1);
+                       x86_sse_shift_reg_imm (code, X86_SSE_PSHUFD, ins->dreg, ins->dreg, 0);
+                       break;
+               case OP_EXPAND_R4:
+                       x86_fst_membase (code, ins->backend.spill_var->inst_basereg, ins->backend.spill_var->inst_offset, FALSE, TRUE);
+                       x86_movd_xreg_membase (code, ins->dreg, ins->backend.spill_var->inst_basereg, ins->backend.spill_var->inst_offset);
+                       x86_sse_shift_reg_imm (code, X86_SSE_PSHUFD, ins->dreg, ins->dreg, 0);
+                       break;
+               case OP_EXPAND_R8:
+                       x86_fst_membase (code, ins->backend.spill_var->inst_basereg, ins->backend.spill_var->inst_offset, TRUE, TRUE);
+                       x86_movsd_reg_membase (code, ins->dreg, ins->backend.spill_var->inst_basereg, ins->backend.spill_var->inst_offset);
+                       x86_sse_shift_reg_imm (code, X86_SSE_PSHUFD, ins->dreg, ins->dreg, 0x44);
+                       break;
 #endif
                default:
                        g_warning ("unknown opcode %s\n", mono_inst_name (ins->opcode));
@@ -4184,6 +4182,8 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
        cfg->code_len = code - cfg->native_code;
 }
 
+#endif /* DISABLE_JIT */
+
 void
 mono_arch_register_lowlevel_calls (void)
 {
@@ -4231,6 +4231,8 @@ mono_arch_patch_code (MonoMethod *method, MonoDomain *domain, guint8 *code, Mono
                case MONO_PATCH_INFO_LABEL:
                case MONO_PATCH_INFO_RGCTX_FETCH:
                case MONO_PATCH_INFO_GENERIC_CLASS_INIT:
+               case MONO_PATCH_INFO_MONITOR_ENTER:
+               case MONO_PATCH_INFO_MONITOR_EXIT:
                        x86_patch (ip, target);
                        break;
                case MONO_PATCH_INFO_NONE:
@@ -4272,11 +4274,11 @@ mono_arch_emit_prolog (MonoCompile *cfg)
                if (appdomain_tls_offset != -1 && lmf_tls_offset != -1) {
                        guint8 *buf, *no_domain_branch;
 
-                       code = emit_tls_get (code, X86_EAX, appdomain_tls_offset);
+                       code = mono_x86_emit_tls_get (code, X86_EAX, appdomain_tls_offset);
                        x86_alu_reg_imm (code, X86_CMP, X86_EAX, GPOINTER_TO_UINT (cfg->domain));
                        no_domain_branch = code;
                        x86_branch8 (code, X86_CC_NE, 0, 0);
-                       code = emit_tls_get ( code, X86_EAX, lmf_tls_offset);
+                       code = mono_x86_emit_tls_get ( code, X86_EAX, lmf_tls_offset);
                        x86_test_reg_reg (code, X86_EAX, X86_EAX);
                        buf = code;
                        x86_branch8 (code, X86_CC_NE, 0, 0);
@@ -4336,7 +4338,7 @@ mono_arch_emit_prolog (MonoCompile *cfg)
 
                        if (lmf_addr_tls_offset != -1) {
                                /* Load lmf quicky using the GS register */
-                               code = emit_tls_get (code, X86_EAX, lmf_addr_tls_offset);
+                               code = mono_x86_emit_tls_get (code, X86_EAX, lmf_addr_tls_offset);
 #ifdef PLATFORM_WIN32
                                /* The TLS key actually contains a pointer to the MonoJitTlsData structure */
                                /* FIXME: Add a separate key for LMF to avoid this */
@@ -4503,7 +4505,7 @@ mono_arch_emit_epilog (MonoCompile *cfg)
                /* check if we need to restore protection of the stack after a stack overflow */
                if (mono_get_jit_tls_offset () != -1) {
                        guint8 *patch;
-                       code = emit_tls_get (code, X86_ECX, mono_get_jit_tls_offset ());
+                       code = mono_x86_emit_tls_get (code, X86_ECX, mono_get_jit_tls_offset ());
                        /* we load the value in a separate instruction: this mechanism may be
                         * used later as a safer way to do thread interruption
                         */
@@ -4802,66 +4804,6 @@ mono_arch_free_jit_tls_data (MonoJitTlsData *tls)
 {
 }
 
-void
-mono_arch_emit_this_vret_args (MonoCompile *cfg, MonoCallInst *inst, int this_reg, int this_type, int vt_reg)
-{
-       MonoCallInst *call = (MonoCallInst*)inst;
-       CallInfo *cinfo = get_call_info (cfg->generic_sharing_context, cfg->mempool, inst->signature, FALSE);
-
-       /* add the this argument */
-       if (this_reg != -1) {
-               if (cinfo->args [0].storage == ArgInIReg) {
-                       MonoInst *this;
-                       MONO_INST_NEW (cfg, this, OP_MOVE);
-                       this->type = this_type;
-                       this->sreg1 = this_reg;
-                       this->dreg = mono_regstate_next_int (cfg->rs);
-                       mono_bblock_add_inst (cfg->cbb, this);
-
-                       mono_call_inst_add_outarg_reg (cfg, call, this->dreg, cinfo->args [0].reg, FALSE);
-               }
-               else {
-                       MonoInst *this;
-                       MONO_INST_NEW (cfg, this, OP_OUTARG);
-                       this->type = this_type;
-                       this->sreg1 = this_reg;
-                       mono_bblock_add_inst (cfg->cbb, this);
-               }
-       }
-
-       if (vt_reg != -1) {
-               MonoInst *vtarg;
-
-               if (cinfo->ret.storage == ArgValuetypeInReg) {
-                       /*
-                        * The valuetype is in EAX:EDX after the call, needs to be copied to
-                        * the stack. Save the address here, so the call instruction can
-                        * access it.
-                        */
-                       MONO_INST_NEW (cfg, vtarg, OP_STORE_MEMBASE_REG);
-                       vtarg->inst_destbasereg = X86_ESP;
-                       vtarg->inst_offset = inst->stack_usage;
-                       vtarg->sreg1 = vt_reg;
-                       mono_bblock_add_inst (cfg->cbb, vtarg);
-               }
-               else if (cinfo->ret.storage == ArgInIReg) {
-                       /* The return address is passed in a register */
-                       MONO_INST_NEW (cfg, vtarg, OP_MOVE);
-                       vtarg->sreg1 = vt_reg;
-                       vtarg->dreg = mono_regstate_next_int (cfg->rs);
-                       mono_bblock_add_inst (cfg->cbb, vtarg);
-
-                       mono_call_inst_add_outarg_reg (cfg, call, vtarg->dreg, cinfo->ret.reg, FALSE);
-               } else {
-                       MonoInst *vtarg;
-                       MONO_INST_NEW (cfg, vtarg, OP_OUTARG);
-                       vtarg->type = STACK_MP;
-                       vtarg->sreg1 = vt_reg;
-                       mono_bblock_add_inst (cfg->cbb, vtarg);
-               }
-       }
-}
-
 #ifdef MONO_ARCH_HAVE_IMT
 
 // Linear handler, the bsearch head compare is shorter
@@ -5021,67 +4963,6 @@ mono_arch_find_static_call_vtable (gpointer *regs, guint8 *code)
        return (MonoVTable*) regs [MONO_ARCH_RGCTX_REG];
 }
 
-MonoInst*
-mono_arch_get_inst_for_method (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
-{
-       MonoInst *ins = NULL;
-
-       if (cmethod->klass == mono_defaults.math_class) {
-               if (strcmp (cmethod->name, "Sin") == 0) {
-                       MONO_INST_NEW (cfg, ins, OP_SIN);
-                       ins->inst_i0 = args [0];
-               } else if (strcmp (cmethod->name, "Cos") == 0) {
-                       MONO_INST_NEW (cfg, ins, OP_COS);
-                       ins->inst_i0 = args [0];
-               } else if (strcmp (cmethod->name, "Tan") == 0) {
-                       MONO_INST_NEW (cfg, ins, OP_TAN);
-                       ins->inst_i0 = args [0];
-               } else if (strcmp (cmethod->name, "Atan") == 0) {
-                       MONO_INST_NEW (cfg, ins, OP_ATAN);
-                       ins->inst_i0 = args [0];
-               } else if (strcmp (cmethod->name, "Sqrt") == 0) {
-                       MONO_INST_NEW (cfg, ins, OP_SQRT);
-                       ins->inst_i0 = args [0];
-               } else if (strcmp (cmethod->name, "Abs") == 0 && fsig->params [0]->type == MONO_TYPE_R8) {
-                       MONO_INST_NEW (cfg, ins, OP_ABS);
-                       ins->inst_i0 = args [0];
-               }
-
-               if (cfg->opt & MONO_OPT_CMOV) {
-                       int opcode = 0;
-
-                       if (strcmp (cmethod->name, "Min") == 0) {
-                               if (fsig->params [0]->type == MONO_TYPE_I4)
-                                       opcode = OP_IMIN;
-                               else if (fsig->params [0]->type == MONO_TYPE_U4)
-                                       opcode = OP_IMIN_UN;
-                       } else if (strcmp (cmethod->name, "Max") == 0) {
-                               if (fsig->params [0]->type == MONO_TYPE_I4)
-                                       opcode = OP_IMAX;
-                               else if (fsig->params [0]->type == MONO_TYPE_U4)
-                                       opcode = OP_IMAX_UN;
-                       }               
-
-                       if (opcode) {
-                               MONO_INST_NEW (cfg, ins, opcode);
-                               ins->inst_i0 = args [0];
-                               ins->inst_i1 = args [1];
-                       }
-               }
-
-#if 0
-               /* OP_FREM is not IEEE compatible */
-               else if (strcmp (cmethod->name, "IEEERemainder") == 0) {
-                       MONO_INST_NEW (cfg, ins, OP_FREM);
-                       ins->inst_i0 = args [0];
-                       ins->inst_i1 = args [1];
-               }
-#endif
-       }
-
-       return ins;
-}
-
 MonoInst*
 mono_arch_emit_inst_for_method (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
 {
@@ -5101,6 +4982,8 @@ mono_arch_emit_inst_for_method (MonoCompile *cfg, MonoMethod *cmethod, MonoMetho
                        opcode = OP_SQRT;
                } else if (strcmp (cmethod->name, "Abs") == 0 && fsig->params [0]->type == MONO_TYPE_R8) {
                        opcode = OP_ABS;
+               } else if (strcmp (cmethod->name, "Round") == 0 && fsig->param_count == 1 && fsig->params [0]->type == MONO_TYPE_R8) {
+                       opcode = OP_ROUND;
                }
                
                if (opcode) {
@@ -5485,9 +5368,7 @@ void
 mono_arch_decompose_opts (MonoCompile *cfg, MonoInst *ins)
 {
        MonoInst *fconv;
-
        int dreg, src_opcode;
-       g_assert (cfg->new_ir);
 
        if (!(cfg->opt & MONO_OPT_SSE2) || !(cfg->opt & MONO_OPT_SIMD))
                return;
@@ -5523,8 +5404,100 @@ mono_arch_decompose_opts (MonoCompile *cfg, MonoInst *ins)
        ins->dreg = dreg;
        ins->type = STACK_I4;
        ins->backend.source_opcode = src_opcode;
+}
 
-
+void
+mono_arch_decompose_long_opts (MonoCompile *cfg, MonoInst *long_ins)
+{
+       MonoInst *ins;
+       int vreg;
+       if (!(cfg->opt & MONO_OPT_SIMD))
+               return;
+       /* Rewrites the 64-bit SIMD opcodes handled below into sequences of 32-bit ops appended to cfg->cbb and then turns long_ins into OP_NOP; any other opcode is left untouched. */
+       /* TODO: move this to simd-intrinsic.c once we support SSE 4.1 dword extractors, since we need the runtime caps info. */
+       switch (long_ins->opcode) {
+       case OP_EXTRACT_I8:
+               vreg = long_ins->sreg1;
+               /* A non-zero inst_c0 first shuffles the source (control 2) into a fresh vreg; otherwise the source vreg is read directly. */
+               if (long_ins->inst_c0) {
+                       MONO_INST_NEW (cfg, ins, OP_PSHUFLED);
+                       ins->klass = long_ins->klass;
+                       ins->sreg1 = long_ins->sreg1;
+                       ins->inst_c0 = 2;
+                       ins->type = STACK_VTYPE;
+                       ins->dreg = vreg = alloc_ireg (cfg); /* NOTE(review): result is typed STACK_VTYPE but the vreg comes from alloc_ireg -- confirm this is intended. */
+                       MONO_ADD_INS (cfg->cbb, ins);
+               }
+               /* First 32-bit extract is written to long_ins->dreg + 1 (presumably the low word of the long vreg pair -- TODO confirm). */
+               MONO_INST_NEW (cfg, ins, OP_EXTRACT_I4);
+               ins->klass = mono_defaults.int32_class;
+               ins->sreg1 = vreg;
+               ins->type = STACK_I4;
+               ins->dreg = long_ins->dreg + 1;
+               MONO_ADD_INS (cfg->cbb, ins);
+               /* The second shuffle always reads the original source: control 3 when inst_c0 is non-zero, 1 otherwise. */
+               MONO_INST_NEW (cfg, ins, OP_PSHUFLED);
+               ins->klass = long_ins->klass;
+               ins->sreg1 = long_ins->sreg1;
+               ins->inst_c0 = long_ins->inst_c0 ? 3 : 1;
+               ins->type = STACK_VTYPE;
+               ins->dreg = vreg = alloc_ireg (cfg); /* NOTE(review): same alloc_ireg / STACK_VTYPE pairing as above -- confirm. */
+               MONO_ADD_INS (cfg->cbb, ins);
+               /* Second 32-bit extract is written to long_ins->dreg + 2 (presumably the high word -- TODO confirm). */
+               MONO_INST_NEW (cfg, ins, OP_EXTRACT_I4);
+               ins->klass = mono_defaults.int32_class;
+               ins->sreg1 = vreg;
+               ins->type = STACK_I4;
+               ins->dreg = long_ins->dreg + 2;
+               MONO_ADD_INS (cfg->cbb, ins);
+               /* The original long instruction is superseded by the sequence emitted above. */
+               long_ins->opcode = OP_NOP;
+               break;
+       case OP_INSERTX_I8_SLOW: /* Becomes two OP_INSERTX_I4_SLOW ops that both read and write long_ins->dreg. */
+               MONO_INST_NEW (cfg, ins, OP_INSERTX_I4_SLOW);
+               ins->dreg = long_ins->dreg;
+               ins->sreg1 = long_ins->dreg;
+               ins->sreg2 = long_ins->sreg2 + 1;
+               ins->inst_c0 = long_ins->inst_c0 * 2;
+               MONO_ADD_INS (cfg->cbb, ins);
+               /* Second insert: source long_ins->sreg2 + 2, with inst_c0 one past the first insert's value. */
+               MONO_INST_NEW (cfg, ins, OP_INSERTX_I4_SLOW);
+               ins->dreg = long_ins->dreg;
+               ins->sreg1 = long_ins->dreg;
+               ins->sreg2 = long_ins->sreg2 + 2;
+               ins->inst_c0 = long_ins->inst_c0 * 2 + 1;
+               MONO_ADD_INS (cfg->cbb, ins);
+
+               long_ins->opcode = OP_NOP;
+               break;
+       case OP_EXPAND_I8: /* Becomes OP_ICONV_TO_X + OP_INSERTX_I4_SLOW + OP_PSHUFLED, all targeting long_ins->dreg. */
+               MONO_INST_NEW (cfg, ins, OP_ICONV_TO_X);
+               ins->dreg = long_ins->dreg;
+               ins->sreg1 = long_ins->sreg1 + 1;
+               ins->klass = long_ins->klass;
+               ins->type = STACK_VTYPE;
+               MONO_ADD_INS (cfg->cbb, ins);
+               /* Insert long_ins->sreg1 + 2 into the same destination with inst_c0 = 1. */
+               MONO_INST_NEW (cfg, ins, OP_INSERTX_I4_SLOW);
+               ins->dreg = long_ins->dreg;
+               ins->sreg1 = long_ins->dreg;
+               ins->sreg2 = long_ins->sreg1 + 2;
+               ins->inst_c0 = 1;
+               ins->klass = long_ins->klass;
+               ins->type = STACK_VTYPE;
+               MONO_ADD_INS (cfg->cbb, ins);
+               /* Final in-place shuffle of the destination using the swizzle constant below. */
+               MONO_INST_NEW (cfg, ins, OP_PSHUFLED);
+               ins->dreg = long_ins->dreg;
+               ins->sreg1 = long_ins->dreg;; /* NOTE(review): the second ';' is a stray empty statement. */
+               ins->inst_c0 = 0x44; /*Magic number for swizzling (X,Y,X,Y)*/
+               ins->klass = long_ins->klass;
+               ins->type = STACK_VTYPE;
+               MONO_ADD_INS (cfg->cbb, ins);
+
+               long_ins->opcode = OP_NOP;
+               break;
+       }
+}
 #endif