New tests.
diff --git a/mono/mini/mini-x86.c b/mono/mini/mini-x86.c
index 6ced7129be2d031d89ec076b9002507a84d81061..4f2715a6f4849a95b341b1b28342e3198bc7587c 100644
 #include <mono/metadata/profiler-private.h>
 #include <mono/metadata/mono-debug.h>
 #include <mono/utils/mono-math.h>
+#include <mono/utils/mono-counters.h>
+#include <mono/utils/mono-mmap.h>
 
 #include "trace.h"
 #include "mini-x86.h"
-#include "inssel.h"
 #include "cpu-x86.h"
+#include "ir-emit.h"
 
 /* On windows, these hold the key returned by TlsAlloc () */
 static gint lmf_tls_offset = -1;
 static gint lmf_addr_tls_offset = -1;
 static gint appdomain_tls_offset = -1;
-static gint thread_tls_offset = -1;
 
 #ifdef MONO_XEN_OPT
 static gboolean optimize_for_xen = TRUE;
@@ -39,7 +40,7 @@ static gboolean optimize_for_xen = TRUE;
 #define optimize_for_xen 0
 #endif
 
-#ifdef PLATFORM_WIN32
+#ifdef TARGET_WIN32
 static gboolean is_win32 = TRUE;
 #else
 static gboolean is_win32 = FALSE;
@@ -54,7 +55,7 @@ static CRITICAL_SECTION mini_arch_mutex;
 
 #define ARGS_OFFSET 8
 
-#ifdef PLATFORM_WIN32
+#ifdef TARGET_WIN32
 /* Under windows, the default pinvoke calling convention is stdcall */
 #define CALLCONV_IS_STDCALL(sig) ((((sig)->call_convention) == MONO_CALL_STDCALL) || ((sig)->pinvoke && ((sig)->call_convention) == MONO_CALL_DEFAULT))
 #else
@@ -64,6 +65,15 @@ static CRITICAL_SECTION mini_arch_mutex;
 MonoBreakpointInfo
 mono_breakpoint_info [MONO_BREAKPOINT_ARRAY_SIZE];
 
+/*
+ * The code generated for sequence points reads from this location; the page
+ * is protected (made inaccessible) when single stepping is enabled, so the
+ * read faults and the runtime can intercept it.
+ */
+static gpointer ss_trigger_page;
+
+/* Enabled breakpoints read from this trigger page */
+static gpointer bp_trigger_page;
+
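As an aside (not part of the patch), the trigger-page mechanism used by the two declarations above can be sketched in plain C: generated code reads from a page that is normally readable, and dropping the page protection turns every such read into a SIGSEGV that a handler recognizes by the fault address. The sketch below uses POSIX mmap/mprotect/sigaction rather than mono_valloc/mono_mprotect, and the names trigger_page and segv_handler are illustrative only.

/* Minimal, self-contained sketch of the trigger-page idea (POSIX, not mono APIs). */
#include <signal.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

static void *trigger_page;

static void
segv_handler (int sig, siginfo_t *info, void *ctx)
{
	if (info->si_addr == trigger_page) {
		/* "Single step" event: make the page readable again so the faulting load retries and succeeds. */
		mprotect (trigger_page, getpagesize (), PROT_READ);
		write (STDOUT_FILENO, "trigger hit\n", 12);
	}
}

int
main (void)
{
	struct sigaction sa;

	memset (&sa, 0, sizeof (sa));
	sa.sa_sigaction = segv_handler;
	sa.sa_flags = SA_SIGINFO;
	sigaction (SIGSEGV, &sa, NULL);

	trigger_page = mmap (NULL, getpagesize (), PROT_READ,
	                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	*(volatile char *) trigger_page;                     /* stepping disabled: no fault */
	mprotect (trigger_page, getpagesize (), PROT_NONE);  /* "enable single stepping" */
	*(volatile char *) trigger_page;                     /* faults; handler re-enables reads */
	return 0;
}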
 const char*
 mono_arch_regname (int reg)
 {
@@ -105,6 +115,32 @@ mono_arch_fregname (int reg)
        }
 }
 
+const char *
+mono_arch_xregname (int reg)
+{
+       switch (reg) {
+       case 0:
+               return "%xmm0";
+       case 1:
+               return "%xmm1";
+       case 2:
+               return "%xmm2";
+       case 3:
+               return "%xmm3";
+       case 4:
+               return "%xmm4";
+       case 5:
+               return "%xmm5";
+       case 6:
+               return "%xmm6";
+       case 7:
+               return "%xmm7";
+       default:
+               return "unknown";
+       }
+}
+
+
 typedef enum {
        ArgInIReg,
        ArgInFloatSSEReg,
@@ -144,7 +180,7 @@ typedef struct {
 
 static X86_Reg_No param_regs [] = { 0 };
 
-#if defined(PLATFORM_WIN32) || defined(__APPLE__) || defined(__FreeBSD__)
+#if defined(TARGET_WIN32) || defined(__APPLE__) || defined(__FreeBSD__)
 #define SMALL_STRUCTS_IN_REGS
 static X86_Reg_No return_regs [] = { X86_EAX, X86_EDX };
 #endif
@@ -206,10 +242,7 @@ add_valuetype (MonoGenericSharingContext *gsctx, MonoMethodSignature *sig, ArgIn
        MonoClass *klass;
 
        klass = mono_class_from_mono_type (type);
-       if (sig->pinvoke) 
-               size = mono_type_native_stack_size (&klass->byval_arg, NULL);
-       else 
-               size = mini_type_stack_size (gsctx, &klass->byval_arg, NULL);
+       size = mini_type_stack_size_full (gsctx, &klass->byval_arg, NULL, sig->pinvoke);
 
 #ifdef SMALL_STRUCTS_IN_REGS
        if (sig->pinvoke && is_return) {
@@ -263,27 +296,19 @@ add_valuetype (MonoGenericSharingContext *gsctx, MonoMethodSignature *sig, ArgIn
  * For x86 win32, see ???.
  */
 static CallInfo*
-get_call_info (MonoCompile *cfg, MonoMemPool *mp, MonoMethodSignature *sig, gboolean is_pinvoke)
+get_call_info_internal (MonoGenericSharingContext *gsctx, CallInfo *cinfo, MonoMethodSignature *sig, gboolean is_pinvoke)
 {
        guint32 i, gr, fr;
        MonoType *ret_type;
        int n = sig->hasthis + sig->param_count;
        guint32 stack_size = 0;
-       CallInfo *cinfo;
-       MonoGenericSharingContext *gsctx = cfg ? cfg->generic_sharing_context : NULL;
-
-       if (mp)
-               cinfo = mono_mempool_alloc0 (mp, sizeof (CallInfo) + (sizeof (ArgInfo) * n));
-       else
-               cinfo = g_malloc0 (sizeof (CallInfo) + (sizeof (ArgInfo) * n));
 
        gr = 0;
        fr = 0;
 
        /* return value */
        {
-               ret_type = mono_type_get_underlying_type (sig->ret);
-               ret_type = mini_get_basic_type_from_generic (gsctx, ret_type);
+               ret_type = mini_type_get_underlying_type (gsctx, sig->ret);
                switch (ret_type->type) {
                case MONO_TYPE_BOOLEAN:
                case MONO_TYPE_I1:
@@ -317,7 +342,7 @@ get_call_info (MonoCompile *cfg, MonoMemPool *mp, MonoMethodSignature *sig, gboo
                        cinfo->ret.storage = ArgOnDoubleFpStack;
                        break;
                case MONO_TYPE_GENERICINST:
-                       if (!mono_type_generic_inst_is_valuetype (sig->ret)) {
+                       if (!mono_type_generic_inst_is_valuetype (ret_type)) {
                                cinfo->ret.storage = ArgInIReg;
                                cinfo->ret.reg = X86_EAX;
                                break;
@@ -378,8 +403,7 @@ get_call_info (MonoCompile *cfg, MonoMemPool *mp, MonoMethodSignature *sig, gboo
                        add_general (&gr, &stack_size, ainfo);
                        continue;
                }
-               ptype = mono_type_get_underlying_type (sig->params [i]);
-               ptype = mini_get_basic_type_from_generic (gsctx, ptype);
+               ptype = mini_type_get_underlying_type (gsctx, sig->params [i]);
                switch (ptype->type) {
                case MONO_TYPE_BOOLEAN:
                case MONO_TYPE_I1:
@@ -407,7 +431,7 @@ get_call_info (MonoCompile *cfg, MonoMemPool *mp, MonoMethodSignature *sig, gboo
                        add_general (&gr, &stack_size, ainfo);
                        break;
                case MONO_TYPE_GENERICINST:
-                       if (!mono_type_generic_inst_is_valuetype (sig->params [i])) {
+                       if (!mono_type_generic_inst_is_valuetype (ptype)) {
                                add_general (&gr, &stack_size, ainfo);
                                break;
                        }
@@ -443,12 +467,11 @@ get_call_info (MonoCompile *cfg, MonoMemPool *mp, MonoMethodSignature *sig, gboo
                add_general (&gr, &stack_size, &cinfo->sig_cookie);
        }
 
-#if defined(__APPLE__)
-       if ((stack_size % 16) != 0) { 
+       if (mono_do_x86_stack_align && (stack_size % MONO_ARCH_FRAME_ALIGNMENT) != 0) {
                cinfo->need_stack_align = TRUE;
-               stack_size += cinfo->stack_align_amount = 16-(stack_size % 16);
+               cinfo->stack_align_amount = MONO_ARCH_FRAME_ALIGNMENT - (stack_size % MONO_ARCH_FRAME_ALIGNMENT);
+               stack_size += cinfo->stack_align_amount;
        }
-#endif
 
        cinfo->stack_usage = stack_size;
        cinfo->reg_usage = gr;
@@ -456,6 +479,20 @@ get_call_info (MonoCompile *cfg, MonoMemPool *mp, MonoMethodSignature *sig, gboo
        return cinfo;
 }
 
+static CallInfo*
+get_call_info (MonoGenericSharingContext *gsctx, MonoMemPool *mp, MonoMethodSignature *sig, gboolean is_pinvoke)
+{
+       int n = sig->hasthis + sig->param_count;
+       CallInfo *cinfo;
+
+       if (mp)
+               cinfo = mono_mempool_alloc0 (mp, sizeof (CallInfo) + (sizeof (ArgInfo) * n));
+       else
+               cinfo = g_malloc0 (sizeof (CallInfo) + (sizeof (ArgInfo) * n));
+
+       return get_call_info_internal (gsctx, cinfo, sig, is_pinvoke);
+}
+
 /*
  * mono_arch_get_argument_info:
  * @csig:  a method signature
@@ -465,49 +502,48 @@ get_call_info (MonoCompile *cfg, MonoMemPool *mp, MonoMethodSignature *sig, gboo
  * Gathers information on parameters such as size, alignment and
 * padding. arg_info should be large enough to hold param_count + 1 entries.
  *
- * Returns the size of the activation frame.
+ * Returns the size of the argument area on the stack.
+ * This should be signal safe, since it is called from
+ * mono_arch_find_jit_info_ext ().
+ * FIXME: The metadata calls might not be signal safe.
  */
 int
 mono_arch_get_argument_info (MonoMethodSignature *csig, int param_count, MonoJitArgumentInfo *arg_info)
 {
-       int k, frame_size = 0;
+       int k, args_size = 0;
        int size, pad;
        guint32 align;
        int offset = 8;
        CallInfo *cinfo;
 
-       cinfo = get_call_info (NULL, NULL, csig, FALSE);
+       /* Avoid g_malloc as it is not signal safe */
+       cinfo = (CallInfo*)g_newa (guint8*, sizeof (CallInfo) + (sizeof (ArgInfo) * (csig->param_count + 1)));
+
+       cinfo = get_call_info_internal (NULL, cinfo, csig, FALSE);
 
        if (MONO_TYPE_ISSTRUCT (csig->ret) && (cinfo->ret.storage == ArgOnStack)) {
-               frame_size += sizeof (gpointer);
+               args_size += sizeof (gpointer);
                offset += 4;
        }
 
        arg_info [0].offset = offset;
 
        if (csig->hasthis) {
-               frame_size += sizeof (gpointer);
+               args_size += sizeof (gpointer);
                offset += 4;
        }
 
-       arg_info [0].size = frame_size;
+       arg_info [0].size = args_size;
 
        for (k = 0; k < param_count; k++) {
-               
-               if (csig->pinvoke)
-                       size = mono_type_native_stack_size (csig->params [k], &align);
-               else {
-                       int ialign;
-                       size = mini_type_stack_size (NULL, csig->params [k], &ialign);
-                       align = ialign;
-               }
+               size = mini_type_stack_size_full (NULL, csig->params [k], &align, csig->pinvoke);
 
                /* ignore alignment for now */
                align = 1;
 
-               frame_size += pad = (align - (frame_size & (align - 1))) & (align - 1); 
+               args_size += pad = (align - (args_size & (align - 1))) & (align - 1);   
                arg_info [k].pad = pad;
-               frame_size += size;
+               args_size += size;
                arg_info [k + 1].pad = 0;
                arg_info [k + 1].size = size;
                offset += pad;
@@ -515,13 +551,14 @@ mono_arch_get_argument_info (MonoMethodSignature *csig, int param_count, MonoJit
                offset += size;
        }
 
-       align = MONO_ARCH_FRAME_ALIGNMENT;
-       frame_size += pad = (align - (frame_size & (align - 1))) & (align - 1);
+       if (mono_do_x86_stack_align && !CALLCONV_IS_STDCALL (csig))
+               align = MONO_ARCH_FRAME_ALIGNMENT;
+       else
+               align = 4;
+       args_size += pad = (align - (args_size & (align - 1))) & (align - 1);
        arg_info [k].pad = pad;
 
-       g_free (cinfo);
-
-       return frame_size;
+       return args_size;
 }
 
 static const guchar cpuid_impl [] = {
@@ -633,6 +670,10 @@ void
 mono_arch_init (void)
 {
        InitializeCriticalSection (&mini_arch_mutex);
+
+       ss_trigger_page = mono_valloc (NULL, mono_pagesize (), MONO_MMAP_READ);
+       bp_trigger_page = mono_valloc (NULL, mono_pagesize (), MONO_MMAP_READ|MONO_MMAP_32BIT);
+       mono_mprotect (bp_trigger_page, mono_pagesize (), 0);
 }
 
 /*
@@ -668,10 +709,59 @@ mono_arch_cpu_optimizazions (guint32 *exclude_mask)
                        opts |= MONO_OPT_SSE2;
                else
                        *exclude_mask |= MONO_OPT_SSE2;
+
+#ifdef MONO_ARCH_SIMD_INTRINSICS
+               /*SIMD intrinsics require at least SSE2.*/
+               if (!(opts & MONO_OPT_SSE2))
+                       *exclude_mask |= MONO_OPT_SIMD;
+#endif
        }
        return opts;
 }
 
+/*
+ * This function tests which SSE versions are supported.
+ *
+ * Returns a bitmask corresponding to all supported versions.
+ */
+guint32
+mono_arch_cpu_enumerate_simd_versions (void)
+{
+       int eax, ebx, ecx, edx;
+       guint32 sse_opts = 0;
+
+       if (cpuid (1, &eax, &ebx, &ecx, &edx)) {
+               if (edx & (1 << 25))
+                       sse_opts |= SIMD_VERSION_SSE1;
+               if (edx & (1 << 26))
+                       sse_opts |= SIMD_VERSION_SSE2;
+               if (ecx & (1 << 0))
+                       sse_opts |= SIMD_VERSION_SSE3;
+               if (ecx & (1 << 9))
+                       sse_opts |= SIMD_VERSION_SSSE3;
+               if (ecx & (1 << 19))
+                       sse_opts |= SIMD_VERSION_SSE41;
+               if (ecx & (1 << 20))
+                       sse_opts |= SIMD_VERSION_SSE42;
+       }
+
+       /* Yes, all this needs to be done to check for SSE4a.
+          See: "AMD CPUID Specification"
+        */
+       if (cpuid (0x80000000, &eax, &ebx, &ecx, &edx)) {
+               /* eax greater than or equal to 0x80000001, ebx = 'htuA', ecx = 'DMAc', edx = 'itne' */
+               if ((((unsigned int) eax) >= 0x80000001) && (ebx == 0x68747541) && (ecx == 0x444D4163) && (edx == 0x69746E65)) {
+                       cpuid (0x80000001, &eax, &ebx, &ecx, &edx);
+                       if (ecx & (1 << 6))
+                               sse_opts |= SIMD_VERSION_SSE4a;
+               }
+       }
+
+       return sse_opts;
+}
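A side note (not part of the patch): the vendor check above works because CPUID leaf 0x80000000 returns the vendor string in EBX/EDX/ECX, and read as little-endian 32-bit constants 0x68747541, 0x69746E65 and 0x444D4163 spell "Auth", "enti" and "cAMD", i.e. "AuthenticAMD". A rough standalone equivalent of the SSE4a probe, using GCC/Clang's <cpuid.h> helper instead of the hand-rolled cpuid() above (names here are illustrative), could look like this:

#include <cpuid.h>
#include <stdio.h>

static int
has_sse4a (void)
{
	unsigned int eax, ebx, ecx, edx;

	if (!__get_cpuid (0x80000000, &eax, &ebx, &ecx, &edx))
		return 0;
	/* Extended leaves present and vendor is AMD? */
	if (eax < 0x80000001 || ebx != 0x68747541 || ecx != 0x444D4163 || edx != 0x69746E65)
		return 0;
	if (!__get_cpuid (0x80000001, &eax, &ebx, &ecx, &edx))
		return 0;
	return (ecx & (1 << 6)) != 0;   /* ECX bit 6 = SSE4a */
}

int
main (void)
{
	printf ("SSE4a: %s\n", has_sse4a () ? "yes" : "no");
	return 0;
}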
+
 /*
 * Determine whether the trap whose info is in SIGINFO is caused by
  * integer overflow.
@@ -785,7 +875,78 @@ mono_arch_regalloc_cost (MonoCompile *cfg, MonoMethodVar *vmv)
                /* push+pop+possible load if it is an argument */
                return (ins->opcode == OP_ARG) ? 3 : 2;
 }
+
+static void
+set_needs_stack_frame (MonoCompile *cfg, gboolean flag)
+{
+       static int inited = FALSE;
+       static int count = 0;
+
+       if (cfg->arch.need_stack_frame_inited) {
+               g_assert (cfg->arch.need_stack_frame == flag);
+               return;
+       }
+
+       cfg->arch.need_stack_frame = flag;
+       cfg->arch.need_stack_frame_inited = TRUE;
+
+       if (flag)
+               return;
+
+       if (!inited) {
+               mono_counters_register ("Could eliminate stack frame", MONO_COUNTER_INT|MONO_COUNTER_JIT, &count);
+               inited = TRUE;
+       }
+       ++count;
+
+       //g_print ("will eliminate %s.%s.%s\n", cfg->method->klass->name_space, cfg->method->klass->name, cfg->method->name);
+}
+
+static gboolean
+needs_stack_frame (MonoCompile *cfg)
+{
+       MonoMethodSignature *sig;
+       MonoMethodHeader *header;
+       gboolean result = FALSE;
+
+#if defined(__APPLE__)
+       /* OSX requires a stack frame so that the stack stays correctly aligned. */
+       return TRUE;
+#endif
+
+       if (cfg->arch.need_stack_frame_inited)
+               return cfg->arch.need_stack_frame;
+
+       header = cfg->header;
+       sig = mono_method_signature (cfg->method);
+
+       if (cfg->disable_omit_fp)
+               result = TRUE;
+       else if (cfg->flags & MONO_CFG_HAS_ALLOCA)
+               result = TRUE;
+       else if (cfg->method->save_lmf)
+               result = TRUE;
+       else if (cfg->stack_offset)
+               result = TRUE;
+       else if (cfg->param_area)
+               result = TRUE;
+       else if (cfg->flags & (MONO_CFG_HAS_CALLS | MONO_CFG_HAS_ALLOCA | MONO_CFG_HAS_TAIL))
+               result = TRUE;
+       else if (header->num_clauses)
+               result = TRUE;
+       else if (sig->param_count + sig->hasthis)
+               result = TRUE;
+       else if (!sig->pinvoke && (sig->call_convention == MONO_CALL_VARARG))
+               result = TRUE;
+       else if ((mono_jit_trace_calls != NULL && mono_trace_eval (cfg->method)) ||
+               (cfg->prof_options & MONO_PROFILE_ENTER_LEAVE))
+               result = TRUE;
+
+       set_needs_stack_frame (cfg, result);
+
+       return cfg->arch.need_stack_frame;
+}
+
 /*
  * Set var information according to the calling convention. X86 version.
  * The locals var stuff should most likely be split in another method.
@@ -801,12 +962,12 @@ mono_arch_allocate_vars (MonoCompile *cfg)
        gint32 *offsets;
        CallInfo *cinfo;
 
-       header = mono_method_get_header (cfg->method);
+       header = cfg->header;
        sig = mono_method_signature (cfg->method);
 
-       cinfo = get_call_info (cfg, cfg->mempool, sig, FALSE);
+       cinfo = get_call_info (cfg->generic_sharing_context, cfg->mempool, sig, FALSE);
 
-       cfg->frame_reg = MONO_ARCH_BASEREG;
+       cfg->frame_reg = X86_EBP;
        offset = 0;
 
        /* Reserve space to save LMF and caller saved registers */
@@ -841,10 +1002,24 @@ mono_arch_allocate_vars (MonoCompile *cfg)
 
        /* Allocate locals */
        offsets = mono_allocate_stack_slots (cfg, &locals_stack_size, &locals_stack_align);
+       if (locals_stack_size > MONO_ARCH_MAX_FRAME_SIZE) {
+               char *mname = mono_method_full_name (cfg->method, TRUE);
+               cfg->exception_type = MONO_EXCEPTION_INVALID_PROGRAM;
+               cfg->exception_message = g_strdup_printf ("Method %s stack is too big.", mname);
+               g_free (mname);
+               return;
+       }
        if (locals_stack_align) {
                offset += (locals_stack_align - 1);
                offset &= ~(locals_stack_align - 1);
        }
+       /*
+        * EBP is at alignment 8 % MONO_ARCH_FRAME_ALIGNMENT, so if we
+        * have locals larger than 8 bytes we need to make sure that
+        * they have the appropriate offset.
+        */
+       if (MONO_ARCH_FRAME_ALIGNMENT > 8 && locals_stack_align > 8)
+               offset += MONO_ARCH_FRAME_ALIGNMENT - sizeof (gpointer) * 2;
        for (i = cfg->locals_start; i < cfg->num_varinfo; i++) {
                if (offsets [i] != -1) {
                        MonoInst *inst = cfg->varinfo [i];
@@ -863,15 +1038,30 @@ mono_arch_allocate_vars (MonoCompile *cfg)
 
        switch (cinfo->ret.storage) {
        case ArgOnStack:
-               cfg->ret->opcode = OP_REGOFFSET;
-               cfg->ret->inst_basereg = X86_EBP;
-               cfg->ret->inst_offset = cinfo->ret.offset + ARGS_OFFSET;
+               if (MONO_TYPE_ISSTRUCT (sig->ret)) {
+                       /* 
+                        * In the new IR, the cfg->vret_addr variable represents the
+                        * vtype return value.
+                        */
+                       cfg->vret_addr->opcode = OP_REGOFFSET;
+                       cfg->vret_addr->inst_basereg = cfg->frame_reg;
+                       cfg->vret_addr->inst_offset = cinfo->ret.offset + ARGS_OFFSET;
+                       if (G_UNLIKELY (cfg->verbose_level > 1)) {
+                               printf ("vret_addr =");
+                               mono_print_ins (cfg->vret_addr);
+                       }
+               } else {
+                       cfg->ret->opcode = OP_REGOFFSET;
+                       cfg->ret->inst_basereg = X86_EBP;
+                       cfg->ret->inst_offset = cinfo->ret.offset + ARGS_OFFSET;
+               }
                break;
        case ArgValuetypeInReg:
                break;
        case ArgInIReg:
                cfg->ret->opcode = OP_REGVAR;
                cfg->ret->inst_c0 = cinfo->ret.reg;
+               cfg->ret->dreg = cinfo->ret.reg;
                break;
        case ArgNone:
        case ArgOnFloatFpStack:
@@ -896,9 +1086,6 @@ mono_arch_allocate_vars (MonoCompile *cfg)
                inst->inst_offset = ainfo->offset + ARGS_OFFSET;
        }
 
-       offset += (MONO_ARCH_FRAME_ALIGNMENT - 1);
-       offset &= ~(MONO_ARCH_FRAME_ALIGNMENT - 1);
-
        cfg->stack_offset = offset;
 }
 
@@ -910,26 +1097,49 @@ mono_arch_create_vars (MonoCompile *cfg)
 
        sig = mono_method_signature (cfg->method);
 
-       cinfo = get_call_info (cfg, cfg->mempool, sig, FALSE);
+       cinfo = get_call_info (cfg->generic_sharing_context, cfg->mempool, sig, FALSE);
 
        if (cinfo->ret.storage == ArgValuetypeInReg)
                cfg->ret_var_is_local = TRUE;
+       if ((cinfo->ret.storage != ArgValuetypeInReg) && MONO_TYPE_ISSTRUCT (sig->ret)) {
+               cfg->vret_addr = mono_compile_create_var (cfg, &mono_defaults.int_class->byval_arg, OP_ARG);
+       }
 }
 
-/* Fixme: we need an alignment solution for enter_method and mono_arch_call_opcode,
- * currently alignment in mono_arch_call_opcode is computed without arch_get_argument_info 
+/*
+ * It is expensive to adjust esp for each individual fp argument pushed on the stack,
+ * so we try to do it just once when we have multiple fp arguments in a row.
+ * We don't use this mechanism generally because for int arguments the generated code
+ * is slightly bigger and newer-generation CPUs optimize away the dependency chains
+ * created by push instructions on the esp value.
+ * fp_arg_setup is the first argument in the execution sequence where the esp register
+ * is modified.
+ */
+static G_GNUC_UNUSED int
+collect_fp_stack_space (MonoMethodSignature *sig, int start_arg, int *fp_arg_setup)
+{
+       int fp_space = 0;
+       MonoType *t;
+
+       for (; start_arg < sig->param_count; ++start_arg) {
+               t = mini_type_get_underlying_type (NULL, sig->params [start_arg]);
+               if (!t->byref && t->type == MONO_TYPE_R8) {
+                       fp_space += sizeof (double);
+                       *fp_arg_setup = start_arg;
+               } else {
+                       break;
+               }
+       }
+       return fp_space;
+}
 
 static void
-emit_sig_cookie (MonoCompile *cfg, MonoCallInst *call)
+emit_sig_cookie (MonoCompile *cfg, MonoCallInst *call, CallInfo *cinfo)
 {
-       MonoInst *arg;
        MonoMethodSignature *tmp_sig;
-       MonoInst *sig_arg;
 
        /* FIXME: Add support for signature tokens to AOT */
        cfg->disable_aot = TRUE;
-       MONO_INST_NEW (cfg, arg, OP_OUTARG);
 
        /*
         * mono_ArgIterator_Setup assumes the signature cookie is 
@@ -942,189 +1152,323 @@ emit_sig_cookie (MonoCompile *cfg, MonoCallInst *call)
        tmp_sig->sentinelpos = 0;
        memcpy (tmp_sig->params, call->signature->params + call->signature->sentinelpos, tmp_sig->param_count * sizeof (MonoType*));
 
-       MONO_INST_NEW (cfg, sig_arg, OP_ICONST);
-       sig_arg->inst_p0 = tmp_sig;
-
-       arg->inst_left = sig_arg;
-       arg->type = STACK_PTR;
-       MONO_INST_LIST_ADD (&arg->node, &call->out_args);
+       MONO_EMIT_NEW_BIALU_IMM (cfg, OP_X86_PUSH_IMM, -1, -1, tmp_sig);
 }
 
-/*
- * It is expensive to adjust esp for each individual fp argument pushed on the stack
- * so we try to do it just once when we have multiple fp arguments in a row.
- * We don't use this mechanism generally because for int arguments the generated code
- * is slightly bigger and new generation cpus optimize away the dependency chains
- * created by push instructions on the esp value.
- * fp_arg_setup is the first argument in the execution sequence where the esp register
- * is modified.
- */
-static int
-collect_fp_stack_space (MonoMethodSignature *sig, int start_arg, int *fp_arg_setup)
+#ifdef ENABLE_LLVM
+LLVMCallInfo*
+mono_arch_get_llvm_call_info (MonoCompile *cfg, MonoMethodSignature *sig)
 {
-       int fp_space = 0;
+       int i, n;
+       CallInfo *cinfo;
+       ArgInfo *ainfo;
+       LLVMCallInfo *linfo;
        MonoType *t;
 
-       for (; start_arg < sig->param_count; ++start_arg) {
-               t = mono_type_get_underlying_type (sig->params [start_arg]);
-               if (!t->byref && t->type == MONO_TYPE_R8) {
-                       fp_space += sizeof (double);
-                       *fp_arg_setup = start_arg;
-               } else {
+       n = sig->param_count + sig->hasthis;
+
+       cinfo = get_call_info (cfg->generic_sharing_context, cfg->mempool, sig, sig->pinvoke);
+
+       linfo = mono_mempool_alloc0 (cfg->mempool, sizeof (LLVMCallInfo) + (sizeof (LLVMArgInfo) * n));
+
+       /*
+        * LLVM always uses the native ABI while we use our own ABI; the
+        * only difference is the handling of vtypes:
+        * - we only pass/receive them in registers in some cases, and only
+        *   in 1 or 2 integer registers.
+        */
+       if (cinfo->ret.storage == ArgValuetypeInReg) {
+               if (sig->pinvoke) {
+                       cfg->exception_message = g_strdup ("pinvoke + vtypes");
+                       cfg->disable_llvm = TRUE;
+                       return linfo;
+               }
+
+               cfg->exception_message = g_strdup ("vtype ret in call");
+               cfg->disable_llvm = TRUE;
+               /*
+               linfo->ret.storage = LLVMArgVtypeInReg;
+               for (j = 0; j < 2; ++j)
+                       linfo->ret.pair_storage [j] = arg_storage_to_llvm_arg_storage (cfg, cinfo->ret.pair_storage [j]);
+               */
+       }
+
+       if (MONO_TYPE_ISSTRUCT (sig->ret) && cinfo->ret.storage == ArgInIReg) {
+               /* Vtype returned using a hidden argument */
+               linfo->ret.storage = LLVMArgVtypeRetAddr;
+       }
+
+       if (MONO_TYPE_ISSTRUCT (sig->ret) && cinfo->ret.storage != ArgInIReg) {
+               // FIXME:
+               cfg->exception_message = g_strdup ("vtype ret in call");
+               cfg->disable_llvm = TRUE;
+       }
+
+       for (i = 0; i < n; ++i) {
+               ainfo = cinfo->args + i;
+
+               if (i >= sig->hasthis)
+                       t = sig->params [i - sig->hasthis];
+               else
+                       t = &mono_defaults.int_class->byval_arg;
+
+               linfo->args [i].storage = LLVMArgNone;
+
+               switch (ainfo->storage) {
+               case ArgInIReg:
+                       linfo->args [i].storage = LLVMArgInIReg;
+                       break;
+               case ArgInDoubleSSEReg:
+               case ArgInFloatSSEReg:
+                       linfo->args [i].storage = LLVMArgInFPReg;
+                       break;
+               case ArgOnStack:
+                       if (MONO_TYPE_ISSTRUCT (t)) {
+                               if (mono_class_value_size (mono_class_from_mono_type (t), NULL) == 0)
+                               /* LLVM seems to allocate argument space for empty structures too */
+                               if (mono_class_value_size (mono_class_from_mono_type (t), NULL) == 0)
+                                       linfo->args [i].storage = LLVMArgNone;
+                                       linfo->args [i].storage = LLVMArgVtypeByVal;
+                       } else {
+                               linfo->args [i].storage = LLVMArgInIReg;
+                               if (t->byref) {
+                                       if (t->type == MONO_TYPE_R4)
+                                               linfo->args [i].storage = LLVMArgInFPReg;
+                                       else if (t->type == MONO_TYPE_R8)
+                                               linfo->args [i].storage = LLVMArgInFPReg;
+                               }
+                       }
+                       break;
+               case ArgValuetypeInReg:
+                       if (sig->pinvoke) {
+                               cfg->exception_message = g_strdup ("pinvoke + vtypes");
+                               cfg->disable_llvm = TRUE;
+                               return linfo;
+                       }
+
+                       cfg->exception_message = g_strdup ("vtype arg");
+                       cfg->disable_llvm = TRUE;
+                       /*
+                       linfo->args [i].storage = LLVMArgVtypeInReg;
+                       for (j = 0; j < 2; ++j)
+                               linfo->args [i].pair_storage [j] = arg_storage_to_llvm_arg_storage (cfg, ainfo->pair_storage [j]);
+                       */
+                       break;
+               default:
+                       cfg->exception_message = g_strdup ("ainfo->storage");
+                       cfg->disable_llvm = TRUE;
                        break;
                }
        }
-       return fp_space;
+
+       return linfo;
 }
+#endif
 
-/* 
- * take the arguments and generate the arch-specific
- * instructions to properly call the function in call.
- * This includes pushing, moving arguments to the right register
- * etc.
- */
-MonoCallInst*
-mono_arch_call_opcode (MonoCompile *cfg, MonoBasicBlock* bb, MonoCallInst *call, int is_virtual) {
+void
+mono_arch_emit_call (MonoCompile *cfg, MonoCallInst *call)
+{
        MonoInst *arg, *in;
        MonoMethodSignature *sig;
        int i, n;
        CallInfo *cinfo;
        int sentinelpos = 0;
-       int fp_args_space = 0, fp_args_offset = 0, fp_arg_setup = -1;
 
        sig = call->signature;
        n = sig->param_count + sig->hasthis;
 
-       cinfo = get_call_info (cfg, cfg->mempool, sig, FALSE);
+       cinfo = get_call_info (cfg->generic_sharing_context, cfg->mempool, sig, FALSE);
 
        if (!sig->pinvoke && (sig->call_convention == MONO_CALL_VARARG))
-               sentinelpos = sig->sentinelpos + (is_virtual ? 1 : 0);
+               sentinelpos = sig->sentinelpos + (sig->hasthis ? 1 : 0);
 
-       for (i = 0; i < n; ++i) {
-               ArgInfo *ainfo = cinfo->args + i;
+       if (cinfo->need_stack_align) {
+               MONO_INST_NEW (cfg, arg, OP_SUB_IMM);
+               arg->dreg = X86_ESP;
+               arg->sreg1 = X86_ESP;
+               arg->inst_imm = cinfo->stack_align_amount;
+               MONO_ADD_INS (cfg->cbb, arg);
+       }
 
-               /* Emit the signature cookie just before the implicit arguments */
-               if (!sig->pinvoke && (sig->call_convention == MONO_CALL_VARARG) && (i == sentinelpos)) {
-                       emit_sig_cookie (cfg, call);
+       if (sig->ret && MONO_TYPE_ISSTRUCT (sig->ret)) {
+               if (cinfo->ret.storage == ArgValuetypeInReg) {
+                       /*
+                        * Tell the JIT to use a more efficient calling convention: call using
+                        * OP_CALL, compute the result location after the call, and save the 
+                        * result there.
+                        */
+                       call->vret_in_reg = TRUE;
+                       if (call->vret_var)
+                               NULLIFY_INS (call->vret_var);
                }
+       }
 
-               if (is_virtual && i == 0) {
-                       /* the argument will be attached to the call instrucion */
-                       in = call->args [i];
-               } else {
-                       MonoType *t;
+       /* Handle the case where there are no implicit arguments */
+       if (!sig->pinvoke && (sig->call_convention == MONO_CALL_VARARG) && (n == sentinelpos)) {
+               emit_sig_cookie (cfg, call, cinfo);
+       }
 
-                       if (i >= sig->hasthis)
-                               t = sig->params [i - sig->hasthis];
-                       else
-                               t = &mono_defaults.int_class->byval_arg;
-                       t = mono_type_get_underlying_type (t);
-
-                       MONO_INST_NEW (cfg, arg, OP_OUTARG);
-                       in = call->args [i];
-                       arg->cil_code = in->cil_code;
-                       arg->inst_left = in;
-                       arg->type = in->type;
-                       MONO_INST_LIST_ADD (&arg->node, &call->out_args);
-
-                       if ((i >= sig->hasthis) && (MONO_TYPE_ISSTRUCT(t))) {
-                               guint32 size, align;
-
-                               if (t->type == MONO_TYPE_TYPEDBYREF) {
-                                       size = sizeof (MonoTypedRef);
-                                       align = sizeof (gpointer);
-                               }
-                               else
-                                       if (sig->pinvoke)
-                                               size = mono_type_native_stack_size (&in->klass->byval_arg, &align);
-                                       else {
-                                               int ialign;
-                                               size = mini_type_stack_size (cfg->generic_sharing_context, &in->klass->byval_arg, &ialign);
-                                               align = ialign;
-                                       }
+       /* Arguments are pushed in the reverse order */
+       for (i = n - 1; i >= 0; i --) {
+               ArgInfo *ainfo = cinfo->args + i;
+               MonoType *t;
+
+               if (i >= sig->hasthis)
+                       t = sig->params [i - sig->hasthis];
+               else
+                       t = &mono_defaults.int_class->byval_arg;
+               t = mini_type_get_underlying_type (cfg->generic_sharing_context, t);
+
+               MONO_INST_NEW (cfg, arg, OP_X86_PUSH);
+
+               in = call->args [i];
+               arg->cil_code = in->cil_code;
+               arg->sreg1 = in->dreg;
+               arg->type = in->type;
+
+               g_assert (in->dreg != -1);
+
+               if ((i >= sig->hasthis) && (MONO_TYPE_ISSTRUCT(t))) {
+                       guint32 align;
+                       guint32 size;
+
+                       g_assert (in->klass);
+
+                       if (t->type == MONO_TYPE_TYPEDBYREF) {
+                               size = sizeof (MonoTypedRef);
+                               align = sizeof (gpointer);
+                       }
+                       else {
+                               size = mini_type_stack_size_full (cfg->generic_sharing_context, &in->klass->byval_arg, &align, sig->pinvoke);
+                       }
+
+                       if (size > 0) {
                                arg->opcode = OP_OUTARG_VT;
+                               arg->sreg1 = in->dreg;
                                arg->klass = in->klass;
-                               arg->backend.is_pinvoke = sig->pinvoke;
-                               arg->inst_imm = size; 
+                               arg->backend.size = size;
+
+                               MONO_ADD_INS (cfg->cbb, arg);
                        }
-                       else {
-                               switch (ainfo->storage) {
-                               case ArgOnStack:
-                                       arg->opcode = OP_OUTARG;
-                                       if (!t->byref) {
-                                               if (t->type == MONO_TYPE_R4) {
-                                                       arg->opcode = OP_OUTARG_R4;
-                                               } else if (t->type == MONO_TYPE_R8) {
-                                                       arg->opcode = OP_OUTARG_R8;
-                                                       /* we store in the upper bits of backen.arg_info the needed
-                                                        * esp adjustment and in the lower bits the offset from esp
-                                                        * where the arg needs to be stored
-                                                        */
-                                                       if (!fp_args_space) {
-                                                               fp_args_space = collect_fp_stack_space (sig, i - sig->hasthis, &fp_arg_setup);
-                                                               fp_args_offset = fp_args_space;
-                                                       }
-                                                       arg->backend.arg_info = fp_args_space - fp_args_offset;
-                                                       fp_args_offset -= sizeof (double);
-                                                       if (i - sig->hasthis == fp_arg_setup) {
-                                                               arg->backend.arg_info |= fp_args_space << 16;
-                                                       }
-                                                       if (fp_args_offset == 0) {
-                                                               /* the allocated esp stack is finished:
-                                                                * prepare for an eventual second run of fp args
-                                                                */
-                                                               fp_args_space = 0;
-                                                       }
-                                               }
+               }
+               else {
+                       switch (ainfo->storage) {
+                       case ArgOnStack:
+                               arg->opcode = OP_X86_PUSH;
+                               if (!t->byref) {
+                                       if (t->type == MONO_TYPE_R4) {
+                                               MONO_EMIT_NEW_BIALU_IMM (cfg, OP_SUB_IMM, X86_ESP, X86_ESP, 4);
+                                               arg->opcode = OP_STORER4_MEMBASE_REG;
+                                               arg->inst_destbasereg = X86_ESP;
+                                               arg->inst_offset = 0;
+                                       } else if (t->type == MONO_TYPE_R8) {
+                                               MONO_EMIT_NEW_BIALU_IMM (cfg, OP_SUB_IMM, X86_ESP, X86_ESP, 8);
+                                               arg->opcode = OP_STORER8_MEMBASE_REG;
+                                               arg->inst_destbasereg = X86_ESP;
+                                               arg->inst_offset = 0;
+                                       } else if (t->type == MONO_TYPE_I8 || t->type == MONO_TYPE_U8) {
+                                               arg->sreg1 ++;
+                                               MONO_EMIT_NEW_UNALU (cfg, OP_X86_PUSH, -1, in->dreg + 2);
                                        }
-                                       break;
-                               default:
-                                       g_assert_not_reached ();
                                }
+                               break;
+                       default:
+                               g_assert_not_reached ();
                        }
+                       
+                       MONO_ADD_INS (cfg->cbb, arg);
                }
-       }
 
-       /* Handle the case where there are no implicit arguments */
-       if (!sig->pinvoke && (sig->call_convention == MONO_CALL_VARARG) && (n == sentinelpos)) {
-               emit_sig_cookie (cfg, call);
+               if (!sig->pinvoke && (sig->call_convention == MONO_CALL_VARARG) && (i == sentinelpos)) {
+                       /* Emit the signature cookie just before the implicit arguments */
+                       emit_sig_cookie (cfg, call, cinfo);
+               }
        }
 
        if (sig->ret && MONO_TYPE_ISSTRUCT (sig->ret)) {
+               MonoInst *vtarg;
+
                if (cinfo->ret.storage == ArgValuetypeInReg) {
-                       MonoInst *zero_inst;
-                       /*
-                        * After the call, the struct is in registers, but needs to be saved to the memory pointed
-                        * to by vt_arg in this_vret_args. This means that vt_arg needs to be saved somewhere
-                        * before calling the function. So we add a dummy instruction to represent pushing the 
-                        * struct return address to the stack. The return address will be saved to this stack slot 
-                        * by the code emitted in this_vret_args.
-                        */
-                       MONO_INST_NEW (cfg, arg, OP_OUTARG);
-                       MONO_INST_NEW (cfg, zero_inst, OP_ICONST);
-                       zero_inst->inst_p0 = 0;
-                       arg->inst_left = zero_inst;
-                       arg->type = STACK_PTR;
-                       MONO_INST_LIST_ADD (&arg->node, &call->out_args);
+                       /* Already done */
+               }
+               else if (cinfo->ret.storage == ArgInIReg) {
+                       NOT_IMPLEMENTED;
+                       /* The return address is passed in a register */
+                       MONO_INST_NEW (cfg, vtarg, OP_MOVE);
+                       vtarg->sreg1 = call->inst.dreg;
+                       vtarg->dreg = mono_alloc_ireg (cfg);
+                       MONO_ADD_INS (cfg->cbb, vtarg);
+                               
+                       mono_call_inst_add_outarg_reg (cfg, call, vtarg->dreg, cinfo->ret.reg, FALSE);
                } else {
-                       /* if the function returns a struct, the called method already does a ret $0x4 */
-                       if (sig->ret && MONO_TYPE_ISSTRUCT (sig->ret))
-                               cinfo->stack_usage -= 4;
+                       MonoInst *vtarg;
+                       MONO_INST_NEW (cfg, vtarg, OP_X86_PUSH);
+                       vtarg->type = STACK_MP;
+                       vtarg->sreg1 = call->vret_var->dreg;
+                       MONO_ADD_INS (cfg->cbb, vtarg);
                }
+
+               /* if the function returns a struct on stack, the called method already does a ret $0x4 */
+               if (cinfo->ret.storage != ArgValuetypeInReg)
+                       cinfo->stack_usage -= 4;
        }
-       
+
        call->stack_usage = cinfo->stack_usage;
+}
 
-#if defined(__APPLE__)
-       if (cinfo->need_stack_align) {
-               MONO_INST_NEW (cfg, arg, OP_X86_OUTARG_ALIGN_STACK);
-               arg->inst_c0 = cinfo->stack_align_amount;
-               MONO_INST_LIST_ADD (&arg->node, &call->out_args);
-        }
-#endif 
+void
+mono_arch_emit_outarg_vt (MonoCompile *cfg, MonoInst *ins, MonoInst *src)
+{
+       MonoInst *arg;
+       int size = ins->backend.size;
 
-       return call;
-}
+       if (size <= 4) {
+               MONO_INST_NEW (cfg, arg, OP_X86_PUSH_MEMBASE);
+               arg->sreg1 = src->dreg;
+
+               MONO_ADD_INS (cfg->cbb, arg);
+       } else if (size <= 20) {        
+               MONO_EMIT_NEW_BIALU_IMM (cfg, OP_SUB_IMM, X86_ESP, X86_ESP, ALIGN_TO (size, 4));
+               mini_emit_memcpy (cfg, X86_ESP, 0, src->dreg, 0, size, 4);
+       } else {
+               MONO_INST_NEW (cfg, arg, OP_X86_PUSH_OBJ);
+               arg->inst_basereg = src->dreg;
+               arg->inst_offset = 0;
+               arg->inst_imm = size;
+                                       
+               MONO_ADD_INS (cfg->cbb, arg);
+       }
+}
+
+void
+mono_arch_emit_setret (MonoCompile *cfg, MonoMethod *method, MonoInst *val)
+{
+       MonoType *ret = mini_type_get_underlying_type (cfg->generic_sharing_context, mono_method_signature (method)->ret);
+
+       if (!ret->byref) {
+               if (ret->type == MONO_TYPE_R4) {
+                       if (COMPILE_LLVM (cfg))
+                               MONO_EMIT_NEW_UNALU (cfg, OP_FMOVE, cfg->ret->dreg, val->dreg);
+                       /* Nothing to do */
+                       return;
+               } else if (ret->type == MONO_TYPE_R8) {
+                       if (COMPILE_LLVM (cfg))
+                               MONO_EMIT_NEW_UNALU (cfg, OP_FMOVE, cfg->ret->dreg, val->dreg);
+                       /* Nothing to do */
+                       return;
+               } else if (ret->type == MONO_TYPE_I8 || ret->type == MONO_TYPE_U8) {
+                       if (COMPILE_LLVM (cfg))
+                               MONO_EMIT_NEW_UNALU (cfg, OP_LMOVE, cfg->ret->dreg, val->dreg);
+                       else {
+                               MONO_EMIT_NEW_UNALU (cfg, OP_MOVE, X86_EAX, val->dreg + 1);
+                               MONO_EMIT_NEW_UNALU (cfg, OP_MOVE, X86_EDX, val->dreg + 2);
+                       }
+                       return;
+               }
+       }
+                       
+       MONO_EMIT_NEW_UNALU (cfg, OP_MOVE, cfg->ret->dreg, val->dreg);
+}
 
 /*
  * Allow tracing to work with this interface (with an optional argument)
@@ -1134,9 +1478,8 @@ mono_arch_instrument_prolog (MonoCompile *cfg, void *func, void *p, gboolean ena
 {
        guchar *code = p;
 
-#if __APPLE__
-       x86_alu_reg_imm (code, X86_SUB, X86_ESP, 8);
-#endif
+       g_assert (MONO_ARCH_FRAME_ALIGNMENT >= 8);
+       x86_alu_reg_imm (code, X86_SUB, X86_ESP, MONO_ARCH_FRAME_ALIGNMENT - 8);
 
        /* if some args are passed in registers, we need to save them here */
        x86_push_reg (code, X86_EBP);
@@ -1151,11 +1494,7 @@ mono_arch_instrument_prolog (MonoCompile *cfg, void *func, void *p, gboolean ena
                mono_add_patch_info (cfg, code-cfg->native_code, MONO_PATCH_INFO_ABS, func);
                x86_call_code (code, 0);
        }
-#if __APPLE__
-       x86_alu_reg_imm (code, X86_ADD, X86_ESP, 16);
-#else
-       x86_alu_reg_imm (code, X86_ADD, X86_ESP, 8);
-#endif
+       x86_alu_reg_imm (code, X86_ADD, X86_ESP, MONO_ARCH_FRAME_ALIGNMENT);
 
        return code;
 }
@@ -1169,42 +1508,52 @@ enum {
 };
 
 void*
-mono_arch_instrument_epilog (MonoCompile *cfg, void *func, void *p, gboolean enable_arguments)
+mono_arch_instrument_epilog_full (MonoCompile *cfg, void *func, void *p, gboolean enable_arguments, gboolean preserve_argument_registers)
 {
        guchar *code = p;
-       int arg_size = 0, save_mode = SAVE_NONE;
+       int arg_size = 0, stack_usage = 0, save_mode = SAVE_NONE;
        MonoMethod *method = cfg->method;
-       
-       switch (mono_type_get_underlying_type (mono_method_signature (method)->ret)->type) {
+       MonoType *ret_type = mini_type_get_underlying_type (cfg->generic_sharing_context, mono_method_signature (method)->ret);
+
+       switch (ret_type->type) {
        case MONO_TYPE_VOID:
                /* special case string .ctor icall */
-               if (strcmp (".ctor", method->name) && method->klass == mono_defaults.string_class)
+               if (strcmp (".ctor", method->name) && method->klass == mono_defaults.string_class) {
                        save_mode = SAVE_EAX;
-               else
+                       stack_usage = enable_arguments ? 8 : 4;
+               } else
                        save_mode = SAVE_NONE;
                break;
        case MONO_TYPE_I8:
        case MONO_TYPE_U8:
                save_mode = SAVE_EAX_EDX;
+               stack_usage = enable_arguments ? 16 : 8;
                break;
        case MONO_TYPE_R4:
        case MONO_TYPE_R8:
                save_mode = SAVE_FP;
+               stack_usage = enable_arguments ? 16 : 8;
                break;
        case MONO_TYPE_GENERICINST:
-               if (!mono_type_generic_inst_is_valuetype (mono_method_signature (method)->ret)) {
+               if (!mono_type_generic_inst_is_valuetype (ret_type)) {
                        save_mode = SAVE_EAX;
+                       stack_usage = enable_arguments ? 8 : 4;
                        break;
                }
                /* Fall through */
        case MONO_TYPE_VALUETYPE:
+               // FIXME: Handle SMALL_STRUCT_IN_REG here for proper alignment on darwin-x86
                save_mode = SAVE_STRUCT;
+               stack_usage = enable_arguments ? 4 : 0;
                break;
        default:
                save_mode = SAVE_EAX;
+               stack_usage = enable_arguments ? 8 : 4;
                break;
        }
 
+       x86_alu_reg_imm (code, X86_SUB, X86_ESP, MONO_ARCH_FRAME_ALIGNMENT - stack_usage - 4);
+
        switch (save_mode) {
        case SAVE_EAX_EDX:
                x86_push_reg (code, X86_EDX);
@@ -1252,6 +1601,7 @@ mono_arch_instrument_epilog (MonoCompile *cfg, void *func, void *p, gboolean ena
                mono_add_patch_info (cfg, code-cfg->native_code, MONO_PATCH_INFO_ABS, func);
                x86_call_code (code, 0);
        }
+
        x86_alu_reg_imm (code, X86_ADD, X86_ESP, arg_size + 4);
 
        switch (save_mode) {
@@ -1270,33 +1620,22 @@ mono_arch_instrument_epilog (MonoCompile *cfg, void *func, void *p, gboolean ena
        default:
                break;
        }
+       
+       x86_alu_reg_imm (code, X86_ADD, X86_ESP, MONO_ARCH_FRAME_ALIGNMENT - stack_usage);
 
        return code;
 }
 
 #define EMIT_COND_BRANCH(ins,cond,sign) \
-if (ins->flags & MONO_INST_BRLABEL) { \
-        if (ins->inst_i0->inst_c0) { \
-               x86_branch (code, cond, cfg->native_code + ins->inst_i0->inst_c0, sign); \
-        } else { \
-               mono_add_patch_info (cfg, code - cfg->native_code, MONO_PATCH_INFO_LABEL, ins->inst_i0); \
-               if ((cfg->opt & MONO_OPT_BRANCH) && \
-                    x86_is_imm8 (ins->inst_i0->inst_c1 - cpos)) \
-                       x86_branch8 (code, cond, 0, sign); \
-                else \
-                       x86_branch32 (code, cond, 0, sign); \
-        } \
+if (ins->inst_true_bb->native_offset) { \
+       x86_branch (code, cond, cfg->native_code + ins->inst_true_bb->native_offset, sign); \
 } else { \
-        if (ins->inst_true_bb->native_offset) { \
-               x86_branch (code, cond, cfg->native_code + ins->inst_true_bb->native_offset, sign); \
-        } else { \
-               mono_add_patch_info (cfg, code - cfg->native_code, MONO_PATCH_INFO_BB, ins->inst_true_bb); \
-               if ((cfg->opt & MONO_OPT_BRANCH) && \
-                    x86_is_imm8 (ins->inst_true_bb->max_offset - cpos)) \
-                       x86_branch8 (code, cond, 0, sign); \
-                else \
-                       x86_branch32 (code, cond, 0, sign); \
-        } \
+       mono_add_patch_info (cfg, code - cfg->native_code, MONO_PATCH_INFO_BB, ins->inst_true_bb); \
+       if ((cfg->opt & MONO_OPT_BRANCH) && \
+            x86_is_imm8 (ins->inst_true_bb->max_offset - cpos)) \
+               x86_branch8 (code, cond, 0, sign); \
+        else \
+               x86_branch32 (code, cond, 0, sign); \
 }
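A brief aside (not part of the patch): when the target basic block has no native offset yet, EMIT_COND_BRANCH records a patch entry and emits a branch whose displacement is filled in later, once the target address is known. A minimal illustration of that rel32 back-patching idea, with illustrative names (PatchInfo, emit_jcc_rel32, patch_branch), might look like:

#include <stdint.h>
#include <string.h>

typedef struct {
	uint8_t *site;      /* address of the 4-byte displacement to fix up */
} PatchInfo;

static uint8_t *
emit_jcc_rel32 (uint8_t *code, uint8_t cc, PatchInfo *patch)
{
	*code++ = 0x0f;
	*code++ = 0x80 | cc;           /* two-byte Jcc opcode, rel32 form */
	patch->site = code;
	memset (code, 0, 4);           /* displacement patched later */
	return code + 4;
}

static void
patch_branch (PatchInfo *patch, uint8_t *target)
{
	/* rel32 is relative to the first byte after the displacement */
	int32_t disp = (int32_t)(target - (patch->site + 4));
	memcpy (patch->site, &disp, 4);
}

int
main (void)
{
	uint8_t buf [32];
	PatchInfo patch;
	uint8_t *code;

	code = emit_jcc_rel32 (buf, 0x4 /* JE */, &patch);   /* forward branch, target unknown */
	/* ... emit more code, then the target block starts ... */
	patch_branch (&patch, code + 5);                     /* pretend the target is 5 bytes further on */
	return 0;
}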
 
 /*  
@@ -1333,17 +1672,18 @@ emit_call (MonoCompile *cfg, guint8 *code, guint32 patch_type, gconstpointer dat
 #define INST_IGNORES_CFLAGS(opcode) (!(((opcode) == OP_ADC) || ((opcode) == OP_IADC) || ((opcode) == OP_ADC_IMM) || ((opcode) == OP_IADC_IMM) || ((opcode) == OP_SBB) || ((opcode) == OP_ISBB) || ((opcode) == OP_SBB_IMM) || ((opcode) == OP_ISBB_IMM)))
 
 /*
- * peephole_pass_1:
+ * mono_peephole_pass_1:
  *
  *   Perform peephole opts which should/can be performed before local regalloc
  */
-static void
-peephole_pass_1 (MonoCompile *cfg, MonoBasicBlock *bb)
+void
+mono_arch_peephole_pass_1 (MonoCompile *cfg, MonoBasicBlock *bb)
 {
        MonoInst *ins, *n;
 
-       MONO_INST_LIST_FOR_EACH_ENTRY_SAFE (ins, n, &bb->ins_list, node) {
-               MonoInst *last_ins = mono_inst_list_prev (&ins->node, &bb->ins_list);
+       MONO_BB_FOR_EACH_INS_SAFE (bb, n, ins) {
+               MonoInst *last_ins = ins->prev;
+
                switch (ins->opcode) {
                case OP_IADD_IMM:
                case OP_ADD_IMM:
@@ -1396,136 +1736,7 @@ peephole_pass_1 (MonoCompile *cfg, MonoBasicBlock *bb)
                                                ins->opcode = OP_X86_TEST_NULL;
                                }
 
-                       break;
-               case OP_LOAD_MEMBASE:
-               case OP_LOADI4_MEMBASE:
-                       /* 
-                        * Note: if reg1 = reg2 the load op is removed
-                        *
-                        * OP_STORE_MEMBASE_REG reg1, offset(basereg) 
-                        * OP_LOAD_MEMBASE offset(basereg), reg2
-                        * -->
-                        * OP_STORE_MEMBASE_REG reg1, offset(basereg)
-                        * OP_MOVE reg1, reg2
-                        */
-                       if (last_ins && (last_ins->opcode == OP_STOREI4_MEMBASE_REG 
-                                        || last_ins->opcode == OP_STORE_MEMBASE_REG) &&
-                           ins->inst_basereg == last_ins->inst_destbasereg &&
-                           ins->inst_offset == last_ins->inst_offset) {
-                               if (ins->dreg == last_ins->sreg1) {
-                                       MONO_DEL_INS (ins);
-                                       continue;
-                               } else {
-                                       //static int c = 0; printf ("MATCHX %s %d\n", cfg->method->name,c++);
-                                       ins->opcode = OP_MOVE;
-                                       ins->sreg1 = last_ins->sreg1;
-                               }
-
-                       /* 
-                        * Note: reg1 must be different from the basereg in the second load
-                        * Note: if reg1 = reg2 is equal then second load is removed
-                        *
-                        * OP_LOAD_MEMBASE offset(basereg), reg1
-                        * OP_LOAD_MEMBASE offset(basereg), reg2
-                        * -->
-                        * OP_LOAD_MEMBASE offset(basereg), reg1
-                        * OP_MOVE reg1, reg2
-                        */
-                       } if (last_ins && (last_ins->opcode == OP_LOADI4_MEMBASE
-                                          || last_ins->opcode == OP_LOAD_MEMBASE) &&
-                             ins->inst_basereg != last_ins->dreg &&
-                             ins->inst_basereg == last_ins->inst_basereg &&
-                             ins->inst_offset == last_ins->inst_offset) {
-
-                               if (ins->dreg == last_ins->dreg) {
-                                       MONO_DEL_INS (ins);
-                                       continue;
-                               } else {
-                                       ins->opcode = OP_MOVE;
-                                       ins->sreg1 = last_ins->dreg;
-                               }
-
-                               //g_assert_not_reached ();
-
-#if 0
-                       /* 
-                        * OP_STORE_MEMBASE_IMM imm, offset(basereg) 
-                        * OP_LOAD_MEMBASE offset(basereg), reg
-                        * -->
-                        * OP_STORE_MEMBASE_IMM imm, offset(basereg) 
-                        * OP_ICONST reg, imm
-                        */
-                       } else if (last_ins && (last_ins->opcode == OP_STOREI4_MEMBASE_IMM
-                                               || last_ins->opcode == OP_STORE_MEMBASE_IMM) &&
-                                  ins->inst_basereg == last_ins->inst_destbasereg &&
-                                  ins->inst_offset == last_ins->inst_offset) {
-                               //static int c = 0; printf ("MATCHX %s %d\n", cfg->method->name,c++);
-                               ins->opcode = OP_ICONST;
-                               ins->inst_c0 = last_ins->inst_imm;
-                               g_assert_not_reached (); // check this rule
-#endif
-                       }
-                       break;
-               case OP_LOADU1_MEMBASE:
-               case OP_LOADI1_MEMBASE:
-                       /* 
-                        * OP_STORE_MEMBASE_REG reg1, offset(basereg) 
-                        * OP_LOAD_MEMBASE offset(basereg), reg2
-                        * -->
-                        * OP_STORE_MEMBASE_REG reg1, offset(basereg)
-                        * CONV_I2/U2 reg1, reg2
-                        */
-                       if (last_ins && X86_IS_BYTE_REG (last_ins->sreg1) &&
-                               (last_ins->opcode == OP_STOREI1_MEMBASE_REG) &&
-                                       ins->inst_basereg == last_ins->inst_destbasereg &&
-                                       ins->inst_offset == last_ins->inst_offset) {
-                               ins->opcode = (ins->opcode == OP_LOADI1_MEMBASE) ? CEE_CONV_I1 : CEE_CONV_U1;
-                               ins->sreg1 = last_ins->sreg1;
-                       }
-                       break;
-               case OP_LOADU2_MEMBASE:
-               case OP_LOADI2_MEMBASE:
-                       /* 
-                        * OP_STORE_MEMBASE_REG reg1, offset(basereg) 
-                        * OP_LOAD_MEMBASE offset(basereg), reg2
-                        * -->
-                        * OP_STORE_MEMBASE_REG reg1, offset(basereg)
-                        * CONV_I2/U2 reg1, reg2
-                        */
-                       if (last_ins && (last_ins->opcode == OP_STOREI2_MEMBASE_REG) &&
-                                       ins->inst_basereg == last_ins->inst_destbasereg &&
-                                       ins->inst_offset == last_ins->inst_offset) {
-                               ins->opcode = (ins->opcode == OP_LOADI2_MEMBASE) ? CEE_CONV_I2 : CEE_CONV_U2;
-                               ins->sreg1 = last_ins->sreg1;
-                       }
-                       break;
-               case CEE_CONV_I4:
-               case CEE_CONV_U4:
-               case OP_ICONV_TO_I4:
-               case OP_MOVE:
-                       /*
-                        * Removes:
-                        *
-                        * OP_MOVE reg, reg 
-                        */
-                       if (ins->dreg == ins->sreg1) {
-                               MONO_DEL_INS (ins);
-                               continue;
-                       }
-                       /* 
-                        * Removes:
-                        *
-                        * OP_MOVE sreg, dreg 
-                        * OP_MOVE dreg, sreg
-                        */
-                       if (last_ins && last_ins->opcode == OP_MOVE &&
-                           ins->sreg1 == last_ins->dreg &&
-                           ins->dreg == last_ins->sreg1) {
-                               MONO_DEL_INS (ins);
-                               continue;
-                       }
-                       break;
-                       
+                       break;                  
                case OP_X86_PUSH_MEMBASE:
                        if (last_ins && (last_ins->opcode == OP_STOREI4_MEMBASE_REG ||
                                         last_ins->opcode == OP_STORE_MEMBASE_REG) &&
@@ -1536,26 +1747,22 @@ peephole_pass_1 (MonoCompile *cfg, MonoBasicBlock *bb)
                        }
                        break;
                }
+
+               mono_peephole_ins (bb, ins);
        }
 }
 
-static void
-peephole_pass (MonoCompile *cfg, MonoBasicBlock *bb)
+void
+mono_arch_peephole_pass_2 (MonoCompile *cfg, MonoBasicBlock *bb)
 {
        MonoInst *ins, *n;
 
-       MONO_INST_LIST_FOR_EACH_ENTRY_SAFE (ins, n, &bb->ins_list, node) {
-               MonoInst *last_ins = mono_inst_list_prev (&ins->node, &bb->ins_list);
-
+       MONO_BB_FOR_EACH_INS_SAFE (bb, n, ins) {
                switch (ins->opcode) {
-               case OP_ICONST: {
-                       MonoInst *next;
-
+               case OP_ICONST:
                        /* reg = 0 -> XOR (reg, reg) */
                        /* XOR sets cflags on x86, so we can't always do it */
-                       next = mono_inst_list_next (&ins->node, &bb->ins_list);
-                       if (ins->inst_c0 == 0 && (!next ||
-                                       (next && INST_IGNORES_CFLAGS (next->opcode)))) {
+                       if (ins->inst_c0 == 0 && (!ins->next || (ins->next && INST_IGNORES_CFLAGS (ins->next->opcode)))) {
                                MonoInst *ins2;
 
                                ins->opcode = OP_IXOR;
@@ -1566,22 +1773,23 @@ peephole_pass (MonoCompile *cfg, MonoBasicBlock *bb)
                                 * Convert succeeding STORE_MEMBASE_IMM 0 ins to STORE_MEMBASE_REG 
                                 * since it takes 3 bytes instead of 7.
                                 */
-                               for (ins2 = mono_inst_list_next (&ins->node, &bb->ins_list); ins2;
-                                               ins2 = mono_inst_list_next (&ins2->node, &bb->ins_list)) {
+                               for (ins2 = ins->next; ins2; ins2 = ins2->next) {
                                        if ((ins2->opcode == OP_STORE_MEMBASE_IMM) && (ins2->inst_imm == 0)) {
                                                ins2->opcode = OP_STORE_MEMBASE_REG;
                                                ins2->sreg1 = ins->dreg;
-                                       } else if ((ins2->opcode == OP_STOREI4_MEMBASE_IMM) && (ins2->inst_imm == 0)) {
+                                       }
+                                       else if ((ins2->opcode == OP_STOREI4_MEMBASE_IMM) && (ins2->inst_imm == 0)) {
                                                ins2->opcode = OP_STOREI4_MEMBASE_REG;
                                                ins2->sreg1 = ins->dreg;
-                                       } else if ((ins2->opcode == OP_STOREI1_MEMBASE_IMM) || (ins2->opcode == OP_STOREI2_MEMBASE_IMM)) {
+                                       }
+                                       else if ((ins2->opcode == OP_STOREI1_MEMBASE_IMM) || (ins2->opcode == OP_STOREI2_MEMBASE_IMM)) {
                                                /* Continue iteration */
-                                       } else
+                                       }
+                                       else
                                                break;
                                }
                        }
                        break;
-               }
                case OP_IADD_IMM:
                case OP_ADD_IMM:
                        if ((ins->inst_imm == 1) && (ins->dreg == ins->sreg1))
@@ -1592,167 +1800,48 @@ peephole_pass (MonoCompile *cfg, MonoBasicBlock *bb)
                        if ((ins->inst_imm == 1) && (ins->dreg == ins->sreg1))
                                ins->opcode = OP_X86_DEC_REG;
                        break;
-               case OP_X86_COMPARE_MEMBASE_IMM:
-                       /* 
-                        * OP_STORE_MEMBASE_REG reg, offset(basereg)
-                        * OP_X86_COMPARE_MEMBASE_IMM offset(basereg), imm
-                        * -->
-                        * OP_STORE_MEMBASE_REG reg, offset(basereg)
-                        * OP_COMPARE_IMM reg, imm
-                        *
-                        * Note: if imm = 0 then OP_COMPARE_IMM replaced with OP_X86_TEST_NULL
-                        */
-                       if (last_ins && (last_ins->opcode == OP_STOREI4_MEMBASE_REG) &&
-                           ins->inst_basereg == last_ins->inst_destbasereg &&
-                           ins->inst_offset == last_ins->inst_offset) {
-                                       ins->opcode = OP_COMPARE_IMM;
-                                       ins->sreg1 = last_ins->sreg1;
-
-                                       /* check if we can remove cmp reg,0 with test null */
-                                       if (!ins->inst_imm)
-                                               ins->opcode = OP_X86_TEST_NULL;
-                               }
-
-                       break;
-               case OP_LOAD_MEMBASE:
-               case OP_LOADI4_MEMBASE:
-                       /* 
-                        * Note: if reg1 = reg2 the load op is removed
-                        *
-                        * OP_STORE_MEMBASE_REG reg1, offset(basereg) 
-                        * OP_LOAD_MEMBASE offset(basereg), reg2
-                        * -->
-                        * OP_STORE_MEMBASE_REG reg1, offset(basereg)
-                        * OP_MOVE reg1, reg2
-                        */
-                       if (last_ins && (last_ins->opcode == OP_STOREI4_MEMBASE_REG 
-                                        || last_ins->opcode == OP_STORE_MEMBASE_REG) &&
-                           ins->inst_basereg == last_ins->inst_destbasereg &&
-                           ins->inst_offset == last_ins->inst_offset) {
-                               if (ins->dreg == last_ins->sreg1) {
-                                       MONO_DEL_INS (ins);
-                                       continue;
-                               } else {
-                                       //static int c = 0; printf ("MATCHX %s %d\n", cfg->method->name,c++);
-                                       ins->opcode = OP_MOVE;
-                                       ins->sreg1 = last_ins->sreg1;
-                               }
+               }
 
-                       /* 
-                        * Note: reg1 must be different from the basereg in the second load
-                        * Note: if reg1 = reg2 is equal then second load is removed
-                        *
-                        * OP_LOAD_MEMBASE offset(basereg), reg1
-                        * OP_LOAD_MEMBASE offset(basereg), reg2
-                        * -->
-                        * OP_LOAD_MEMBASE offset(basereg), reg1
-                        * OP_MOVE reg1, reg2
-                        */
-                       } if (last_ins && (last_ins->opcode == OP_LOADI4_MEMBASE
-                                          || last_ins->opcode == OP_LOAD_MEMBASE) &&
-                             ins->inst_basereg != last_ins->dreg &&
-                             ins->inst_basereg == last_ins->inst_basereg &&
-                             ins->inst_offset == last_ins->inst_offset) {
-
-                               if (ins->dreg == last_ins->dreg) {
-                                       MONO_DEL_INS (ins);
-                                       continue;
-                               } else {
-                                       ins->opcode = OP_MOVE;
-                                       ins->sreg1 = last_ins->dreg;
-                               }
+               mono_peephole_ins (bb, ins);
+       }
+}
 
-                               //g_assert_not_reached ();
+/*
+ * mono_arch_lowering_pass:
+ *
+ *  Converts complex opcodes into simpler ones so that each IR instruction
+ * corresponds to one machine instruction.
+ */
+void
+mono_arch_lowering_pass (MonoCompile *cfg, MonoBasicBlock *bb)
+{
+       MonoInst *ins, *next;
 
-#if 0
-                       /* 
-                        * OP_STORE_MEMBASE_IMM imm, offset(basereg) 
-                        * OP_LOAD_MEMBASE offset(basereg), reg
-                        * -->
-                        * OP_STORE_MEMBASE_IMM imm, offset(basereg) 
-                        * OP_ICONST reg, imm
-                        */
-                       } else if (last_ins && (last_ins->opcode == OP_STOREI4_MEMBASE_IMM
-                                               || last_ins->opcode == OP_STORE_MEMBASE_IMM) &&
-                                  ins->inst_basereg == last_ins->inst_destbasereg &&
-                                  ins->inst_offset == last_ins->inst_offset) {
-                               //static int c = 0; printf ("MATCHX %s %d\n", cfg->method->name,c++);
-                               ins->opcode = OP_ICONST;
-                               ins->inst_c0 = last_ins->inst_imm;
-                               g_assert_not_reached (); // check this rule
-#endif
-                       }
-                       break;
-               case OP_LOADU1_MEMBASE:
-               case OP_LOADI1_MEMBASE:
-                       /* 
-                        * OP_STORE_MEMBASE_REG reg1, offset(basereg) 
-                        * OP_LOAD_MEMBASE offset(basereg), reg2
-                        * -->
-                        * OP_STORE_MEMBASE_REG reg1, offset(basereg)
-                        * CONV_I2/U2 reg1, reg2
-                        */
-                       if (last_ins && X86_IS_BYTE_REG (last_ins->sreg1) &&
-                               (last_ins->opcode == OP_STOREI1_MEMBASE_REG) &&
-                                       ins->inst_basereg == last_ins->inst_destbasereg &&
-                                       ins->inst_offset == last_ins->inst_offset) {
-                               ins->opcode = (ins->opcode == OP_LOADI1_MEMBASE) ? CEE_CONV_I1 : CEE_CONV_U1;
-                               ins->sreg1 = last_ins->sreg1;
-                       }
-                       break;
-               case OP_LOADU2_MEMBASE:
-               case OP_LOADI2_MEMBASE:
-                       /* 
-                        * OP_STORE_MEMBASE_REG reg1, offset(basereg) 
-                        * OP_LOAD_MEMBASE offset(basereg), reg2
-                        * -->
-                        * OP_STORE_MEMBASE_REG reg1, offset(basereg)
-                        * CONV_I2/U2 reg1, reg2
-                        */
-                       if (last_ins && (last_ins->opcode == OP_STOREI2_MEMBASE_REG) &&
-                                       ins->inst_basereg == last_ins->inst_destbasereg &&
-                                       ins->inst_offset == last_ins->inst_offset) {
-                               ins->opcode = (ins->opcode == OP_LOADI2_MEMBASE) ? CEE_CONV_I2 : CEE_CONV_U2;
-                               ins->sreg1 = last_ins->sreg1;
-                       }
-                       break;
-               case CEE_CONV_I4:
-               case CEE_CONV_U4:
-               case OP_ICONV_TO_I4:
-               case OP_MOVE:
-                       /*
-                        * Removes:
-                        *
-                        * OP_MOVE reg, reg 
-                        */
-                       if (ins->dreg == ins->sreg1) {
-                               MONO_DEL_INS (ins);
-                               continue;
-                       }
+       /*
+        * FIXME: Need to add more instructions, but the current machine 
+        * description can't model some parts of the composite instructions like
+        * cdq.
+        */
+       MONO_BB_FOR_EACH_INS_SAFE (bb, next, ins) {
+               switch (ins->opcode) {
+               case OP_IREM_IMM:
+               case OP_IDIV_IMM:
+               case OP_IDIV_UN_IMM:
+               case OP_IREM_UN_IMM:
                        /* 
-                        * Removes:
-                        *
-                        * OP_MOVE sreg, dreg 
-                        * OP_MOVE dreg, sreg
+                        * Keep the cases where we can generate optimized code; otherwise convert
+                        * to the non-imm variant.
                         */
-                       if (last_ins && last_ins->opcode == OP_MOVE &&
-                           ins->sreg1 == last_ins->dreg &&
-                           ins->dreg == last_ins->sreg1) {
-                               MONO_DEL_INS (ins);
-                               continue;
-                       }
+                       if ((ins->opcode == OP_IREM_IMM) && mono_is_power_of_two (ins->inst_imm) >= 0)
+                               break;
+                       mono_decompose_op_imm (cfg, bb, ins);
                        break;
-               case OP_X86_PUSH_MEMBASE:
-                       if (last_ins && (last_ins->opcode == OP_STOREI4_MEMBASE_REG ||
-                                        last_ins->opcode == OP_STORE_MEMBASE_REG) &&
-                           ins->inst_basereg == last_ins->inst_destbasereg &&
-                           ins->inst_offset == last_ins->inst_offset) {
-                                   ins->opcode = OP_X86_PUSH;
-                                   ins->sreg1 = last_ins->sreg1;
-                       }
+               default:
                        break;
                }
        }
+
+       bb->max_vreg = cfg->next_vreg;
 }
 
 static const int 
@@ -1775,20 +1864,13 @@ cc_signed_table [] = {
        FALSE, FALSE, FALSE, FALSE
 };
 
-void
-mono_arch_local_regalloc (MonoCompile *cfg, MonoBasicBlock *bb)
-{
-       if (cfg->opt & MONO_OPT_PEEPHOLE)
-               peephole_pass_1 (cfg, bb);
-
-       mono_local_regalloc (cfg, bb);
-}
-
 static unsigned char*
 emit_float_to_int (MonoCompile *cfg, guchar *code, int dreg, int size, gboolean is_signed)
 {
 #define XMM_TEMP_REG 0
-       if (cfg->opt & MONO_OPT_SSE2 && size < 8) {
+       /* This SSE2 optimization must not be done with OPT_SIMD in place, as it clobbers xmm0. */
+       /* The xmm pass decomposes OP_FCONV_ ops anyway. */
+       if (cfg->opt & MONO_OPT_SSE2 && size < 8 && !(cfg->opt & MONO_OPT_SIMD)) {
                /* optimize by assigning a local var for this use so we avoid
                 * the stack manipulations */
                x86_alu_reg_imm (code, X86_SUB, X86_ESP, 8);
@@ -1836,7 +1918,7 @@ mono_emit_stack_alloc (guchar *code, MonoInst* tree)
        int sreg = tree->sreg1;
        int need_touch = FALSE;
 
-#if defined(PLATFORM_WIN32) || defined(MONO_ARCH_SIGSEGV_ON_ALTSTACK)
+#if defined(TARGET_WIN32) || defined(MONO_ARCH_SIGSEGV_ON_ALTSTACK)
        need_touch = TRUE;
 #endif
 
@@ -1941,38 +2023,14 @@ mono_emit_stack_alloc (guchar *code, MonoInst* tree)
 static guint8*
 emit_move_return_value (MonoCompile *cfg, MonoInst *ins, guint8 *code)
 {
-       CallInfo *cinfo;
-       int quad;
-
        /* Move return value to the target register */
        switch (ins->opcode) {
-       case CEE_CALL:
+       case OP_CALL:
        case OP_CALL_REG:
        case OP_CALL_MEMBASE:
                if (ins->dreg != X86_EAX)
                        x86_mov_reg_reg (code, ins->dreg, X86_EAX, 4);
                break;
-       case OP_VCALL:
-       case OP_VCALL_REG:
-       case OP_VCALL_MEMBASE:
-               cinfo = get_call_info (cfg, cfg->mempool, ((MonoCallInst*)ins)->signature, FALSE);
-               if (cinfo->ret.storage == ArgValuetypeInReg) {
-                       /* Pop the destination address from the stack */
-                       x86_pop_reg (code, X86_ECX);
-                       
-                       for (quad = 0; quad < 2; quad ++) {
-                               switch (cinfo->ret.pair_storage [quad]) {
-                               case ArgInIReg:
-                                       g_assert (cinfo->ret.pair_regs [quad] != X86_ECX);
-                                       x86_mov_membase_reg (code, X86_ECX, (quad * sizeof (gpointer)), cinfo->ret.pair_regs [quad], sizeof (gpointer));
-                                       break;
-                               case ArgNone:
-                                       break;
-                               default:
-                                       g_assert_not_reached ();
-                               }
-                       }
-               }
        default:
                break;
        }
@@ -1980,21 +2038,42 @@ emit_move_return_value (MonoCompile *cfg, MonoInst *ins, guint8 *code)
        return code;
 }
 
+gboolean
+mono_x86_have_tls_get (void)
+{
+#ifdef __APPLE__
+       guint32 *ins = (guint32*)pthread_getspecific;
+       /*
+        * We're looking for these two instructions:
+        *
+        * mov    0x4(%esp),%eax
+        * mov    %gs:0x48(,%eax,4),%eax
+        */
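+       /*
+        * In little-endian byte order those two instructions encode as
+        * 8b 44 24 04 and 65 8b 04 85 48 00 00 00, which is what the three
+        * 32-bit word compares below check for.
+        */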
+       return ins [0] == 0x0424448b && ins [1] == 0x85048b65 && ins [2] == 0x00000048;
+#else
+       return TRUE;
+#endif
+}
+
 /*
- * emit_tls_get:
+ * mono_x86_emit_tls_get:
  * @code: buffer to store code to
  * @dreg: hard register where to place the result
  * @tls_offset: offset info
  *
- * emit_tls_get emits in @code the native code that puts in the dreg register
- * the item in the thread local storage identified by tls_offset.
+ * mono_x86_emit_tls_get emits in @code the native code that puts in
+ * the dreg register the item in the thread local storage identified
+ * by tls_offset.
  *
  * Returns: a pointer to the end of the stored code
  */
-static guint8*
-emit_tls_get (guint8* code, int dreg, int tls_offset)
+guint8*
+mono_x86_emit_tls_get (guint8* code, int dreg, int tls_offset)
 {
-#ifdef PLATFORM_WIN32
+#if defined(__APPLE__)
+       x86_prefix (code, X86_GS_PREFIX);
+       x86_mov_reg_mem (code, dreg, 0x48 + tls_offset * 4, 4);
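+       /* This matches the %gs:0x48(,%eax,4) access pattern that
+          mono_x86_have_tls_get () looks for in pthread_getspecific () above. */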
+#elif defined(TARGET_WIN32)
        /* 
         * See the Under the Hood article in the May 1996 issue of Microsoft Systems 
         * Journal and/or a disassembly of the TlsGet () function.
@@ -2037,7 +2116,7 @@ emit_load_volatile_arguments (MonoCompile *cfg, guint8 *code)
 
        sig = mono_method_signature (method);
 
-       cinfo = get_call_info (cfg, cfg->mempool, sig, FALSE);
+       cinfo = get_call_info (cfg->generic_sharing_context, cfg->mempool, sig, FALSE);
        
        /* This is the opposite of the code in emit_prolog */
 
@@ -2084,6 +2163,8 @@ x86_pop_reg (code, X86_EAX);
 #define LOOP_ALIGNMENT 8
 #define bb_is_loop_start(bb) ((bb)->loop_body_start && (bb)->nesting)
 
+#ifndef DISABLE_JIT
+
 void
 mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
 {
@@ -2093,9 +2174,6 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
        guint8 *code = cfg->native_code + cfg->code_len;
        int max_len, cpos;
 
-       if (cfg->opt & MONO_OPT_PEEPHOLE)
-               peephole_pass (cfg, bb);
-
        if (cfg->opt & MONO_OPT_LOOP) {
                int pad, align = LOOP_ALIGNMENT;
                /* set alignment depending on cpu */
@@ -2174,14 +2252,22 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                case OP_STOREI4_MEMBASE_REG:
                        x86_mov_membase_reg (code, ins->inst_destbasereg, ins->inst_offset, ins->sreg1, 4);
                        break;
-               case CEE_LDIND_I:
-               case CEE_LDIND_I4:
-               case CEE_LDIND_U4:
-                       x86_mov_reg_mem (code, ins->dreg, ins->inst_p0, 4);
+               case OP_STORE_MEM_IMM:
+                       x86_mov_mem_imm (code, ins->inst_p0, ins->inst_c0, 4);
                        break;
                case OP_LOADU4_MEM:
-                       x86_mov_reg_imm (code, ins->dreg, ins->inst_p0);
-                       x86_mov_reg_membase (code, ins->dreg, ins->dreg, 0, 4);
+                       x86_mov_reg_mem (code, ins->dreg, ins->inst_imm, 4);
+                       break;
+               case OP_LOAD_MEM:
+               case OP_LOADI4_MEM:
+                       /* These are created by the cprop pass so they use inst_imm as the source */
+                       x86_mov_reg_mem (code, ins->dreg, ins->inst_imm, 4);
+                       break;
+               case OP_LOADU1_MEM:
+                       x86_widen_mem (code, ins->dreg, ins->inst_imm, FALSE, FALSE);
+                       break;
+               case OP_LOADU2_MEM:
+                       x86_widen_mem (code, ins->dreg, ins->inst_imm, FALSE, TRUE);
                        break;
                case OP_LOAD_MEMBASE:
                case OP_LOADI4_MEMBASE:
@@ -2200,24 +2286,26 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                case OP_LOADI2_MEMBASE:
                        x86_widen_membase (code, ins->dreg, ins->inst_basereg, ins->inst_offset, TRUE, TRUE);
                        break;
-               case CEE_CONV_I1:
+               case OP_ICONV_TO_I1:
                case OP_SEXT_I1:
                        x86_widen_reg (code, ins->dreg, ins->sreg1, TRUE, FALSE);
                        break;
-               case CEE_CONV_I2:
+               case OP_ICONV_TO_I2:
                case OP_SEXT_I2:
                        x86_widen_reg (code, ins->dreg, ins->sreg1, TRUE, TRUE);
                        break;
-               case CEE_CONV_U1:
+               case OP_ICONV_TO_U1:
                        x86_widen_reg (code, ins->dreg, ins->sreg1, FALSE, FALSE);
                        break;
-               case CEE_CONV_U2:
+               case OP_ICONV_TO_U2:
                        x86_widen_reg (code, ins->dreg, ins->sreg1, FALSE, TRUE);
                        break;
                case OP_COMPARE:
+               case OP_ICOMPARE:
                        x86_alu_reg_reg (code, X86_CMP, ins->sreg1, ins->sreg2);
                        break;
                case OP_COMPARE_IMM:
+               case OP_ICOMPARE_IMM:
                        x86_alu_reg_imm (code, X86_CMP, ins->sreg1, ins->inst_imm);
                        break;
                case OP_X86_COMPARE_MEMBASE_REG:
@@ -2241,13 +2329,13 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                case OP_X86_ADD_MEMBASE_IMM:
                        x86_alu_membase_imm (code, X86_ADD, ins->inst_basereg, ins->inst_offset, ins->inst_imm);
                        break;
-               case OP_X86_ADD_MEMBASE:
+               case OP_X86_ADD_REG_MEMBASE:
                        x86_alu_reg_membase (code, X86_ADD, ins->sreg1, ins->sreg2, ins->inst_offset);
                        break;
                case OP_X86_SUB_MEMBASE_IMM:
                        x86_alu_membase_imm (code, X86_SUB, ins->inst_basereg, ins->inst_offset, ins->inst_imm);
                        break;
-               case OP_X86_SUB_MEMBASE:
+               case OP_X86_SUB_REG_MEMBASE:
                        x86_alu_reg_membase (code, X86_SUB, ins->sreg1, ins->sreg2, ins->inst_offset);
                        break;
                case OP_X86_AND_MEMBASE_IMM:
@@ -2259,6 +2347,21 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                case OP_X86_XOR_MEMBASE_IMM:
                        x86_alu_membase_imm (code, X86_XOR, ins->inst_basereg, ins->inst_offset, ins->inst_imm);
                        break;
+               case OP_X86_ADD_MEMBASE_REG:
+                       x86_alu_membase_reg (code, X86_ADD, ins->inst_basereg, ins->inst_offset, ins->sreg2);
+                       break;
+               case OP_X86_SUB_MEMBASE_REG:
+                       x86_alu_membase_reg (code, X86_SUB, ins->inst_basereg, ins->inst_offset, ins->sreg2);
+                       break;
+               case OP_X86_AND_MEMBASE_REG:
+                       x86_alu_membase_reg (code, X86_AND, ins->inst_basereg, ins->inst_offset, ins->sreg2);
+                       break;
+               case OP_X86_OR_MEMBASE_REG:
+                       x86_alu_membase_reg (code, X86_OR, ins->inst_basereg, ins->inst_offset, ins->sreg2);
+                       break;
+               case OP_X86_XOR_MEMBASE_REG:
+                       x86_alu_membase_reg (code, X86_XOR, ins->inst_basereg, ins->inst_offset, ins->sreg2);
+                       break;
                case OP_X86_INC_MEMBASE:
                        x86_inc_membase (code, ins->inst_basereg, ins->inst_offset);
                        break;
@@ -2271,48 +2374,104 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                case OP_X86_DEC_REG:
                        x86_dec_reg (code, ins->dreg);
                        break;
-               case OP_X86_MUL_MEMBASE:
+               case OP_X86_MUL_REG_MEMBASE:
                        x86_imul_reg_membase (code, ins->sreg1, ins->sreg2, ins->inst_offset);
                        break;
+               case OP_X86_AND_REG_MEMBASE:
+                       x86_alu_reg_membase (code, X86_AND, ins->sreg1, ins->sreg2, ins->inst_offset);
+                       break;
+               case OP_X86_OR_REG_MEMBASE:
+                       x86_alu_reg_membase (code, X86_OR, ins->sreg1, ins->sreg2, ins->inst_offset);
+                       break;
+               case OP_X86_XOR_REG_MEMBASE:
+                       x86_alu_reg_membase (code, X86_XOR, ins->sreg1, ins->sreg2, ins->inst_offset);
+                       break;
                case OP_BREAK:
                        x86_breakpoint (code);
                        break;
+               case OP_RELAXED_NOP:
+                       x86_prefix (code, X86_REP_PREFIX);
+                       x86_nop (code);
+                       break;
+               case OP_HARD_NOP:
+                       x86_nop (code);
+                       break;
+               case OP_NOP:
+               case OP_DUMMY_USE:
+               case OP_DUMMY_STORE:
+               case OP_NOT_REACHED:
+               case OP_NOT_NULL:
+                       break;
+               case OP_SEQ_POINT: {
+                       int i;
+
+                       if (cfg->compile_aot)
+                               NOT_IMPLEMENTED;
+
+                       /* 
+                        * Read from the single stepping trigger page. This will cause a
+                        * SIGSEGV when single stepping is enabled.
+                        * We do this _before_ the breakpoint, so single stepping after
+                        * a breakpoint is hit will step to the next IL offset.
+                        */
+                       if (ins->flags & MONO_INST_SINGLE_STEP_LOC)
+                               x86_alu_reg_mem (code, X86_CMP, X86_EAX, (guint32)ss_trigger_page);
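+                       /* The result of the cmp is never consumed; the instruction only serves
+                          to perform a read from the trigger page. */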
+
+                       mono_add_seq_point (cfg, bb, ins, code - cfg->native_code);
+
+                       /* 
+                        * A placeholder for a possible breakpoint inserted by
+                        * mono_arch_set_breakpoint ().
+                        */
+                       for (i = 0; i < 6; ++i)
+                               x86_nop (code);
+                       break;
+               }
                case OP_ADDCC:
-               case CEE_ADD:
+               case OP_IADDCC:
+               case OP_IADD:
                        x86_alu_reg_reg (code, X86_ADD, ins->sreg1, ins->sreg2);
                        break;
                case OP_ADC:
+               case OP_IADC:
                        x86_alu_reg_reg (code, X86_ADC, ins->sreg1, ins->sreg2);
                        break;
                case OP_ADDCC_IMM:
                case OP_ADD_IMM:
+               case OP_IADD_IMM:
                        x86_alu_reg_imm (code, X86_ADD, ins->dreg, ins->inst_imm);
                        break;
                case OP_ADC_IMM:
+               case OP_IADC_IMM:
                        x86_alu_reg_imm (code, X86_ADC, ins->dreg, ins->inst_imm);
                        break;
                case OP_SUBCC:
-               case CEE_SUB:
+               case OP_ISUBCC:
+               case OP_ISUB:
                        x86_alu_reg_reg (code, X86_SUB, ins->sreg1, ins->sreg2);
                        break;
                case OP_SBB:
+               case OP_ISBB:
                        x86_alu_reg_reg (code, X86_SBB, ins->sreg1, ins->sreg2);
                        break;
                case OP_SUBCC_IMM:
                case OP_SUB_IMM:
+               case OP_ISUB_IMM:
                        x86_alu_reg_imm (code, X86_SUB, ins->dreg, ins->inst_imm);
                        break;
                case OP_SBB_IMM:
+               case OP_ISBB_IMM:
                        x86_alu_reg_imm (code, X86_SBB, ins->dreg, ins->inst_imm);
                        break;
-               case CEE_AND:
+               case OP_IAND:
                        x86_alu_reg_reg (code, X86_AND, ins->sreg1, ins->sreg2);
                        break;
                case OP_AND_IMM:
+               case OP_IAND_IMM:
                        x86_alu_reg_imm (code, X86_AND, ins->sreg1, ins->inst_imm);
                        break;
-               case CEE_DIV:
-               case CEE_REM:
+               case OP_IDIV:
+               case OP_IREM:
                        /* 
                         * The code is the same for div/rem, the allocator will allocate dreg
                         * to RAX/RDX as appropriate.
@@ -2328,8 +2487,8 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                                x86_div_reg (code, ins->sreg2, TRUE);
                        }
                        break;
-               case CEE_DIV_UN:
-               case CEE_REM_UN:
+               case OP_IDIV_UN:
+               case OP_IREM_UN:
                        if (ins->sreg2 == X86_EDX) {
                                x86_push_reg (code, ins->sreg2);
                                x86_alu_reg_reg (code, X86_XOR, X86_EDX, X86_EDX);
@@ -2345,43 +2504,75 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        x86_cdq (code);
                        x86_div_reg (code, ins->sreg2, TRUE);
                        break;
-               case OP_REM_IMM:
-                       x86_mov_reg_imm (code, ins->sreg2, ins->inst_imm);
-                       x86_cdq (code);
-                       x86_div_reg (code, ins->sreg2, TRUE);
-                       break;
-               case CEE_OR:
-                       x86_alu_reg_reg (code, X86_OR, ins->sreg1, ins->sreg2);
-                       break;
-               case OP_OR_IMM:
-                       x86_alu_reg_imm (code, X86_OR, ins->sreg1, ins->inst_imm);
-                       break;
-               case CEE_XOR:
-               case OP_IXOR:
-                       x86_alu_reg_reg (code, X86_XOR, ins->sreg1, ins->sreg2);
+               case OP_IREM_IMM: {
+                       int power = mono_is_power_of_two (ins->inst_imm);
+
+                       g_assert (ins->sreg1 == X86_EAX);
+                       g_assert (ins->dreg == X86_EAX);
+                       g_assert (power >= 0);
+
+                       if (power == 1) {
+                               /* Based on http://compilers.iecc.com/comparch/article/93-04-079 */
+                               x86_cdq (code);
+                               x86_alu_reg_imm (code, X86_AND, X86_EAX, 1);
+                               /* 
+                                * If the dividend is >= 0, this does nothing. If it is negative, it
+                                * transforms %eax=0 into %eax=0, and %eax=1 into %eax=-1.
+                                */
+                               x86_alu_reg_reg (code, X86_XOR, X86_EAX, X86_EDX);
+                               x86_alu_reg_reg (code, X86_SUB, X86_EAX, X86_EDX);
+                       } else if (power == 0) {
+                               x86_alu_reg_reg (code, X86_XOR, ins->dreg, ins->dreg);
+                       } else {
+                               /* Based on gcc code */
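+                               /*
+                                * Illustrative example (not from the original source): for imm = 8
+                                * (power = 3) and %eax = -5, cdq sets %edx = 0xffffffff, the shift
+                                * leaves %edx = 7, the add gives %eax = 2, the and keeps 2 and the
+                                * final sub yields -5, i.e. -5 % 8 == -5.
+                                */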
+
+                               /* Add compensation for negative dividends */
+                               x86_cdq (code);
+                               x86_shift_reg_imm (code, X86_SHR, X86_EDX, 32 - power);
+                               x86_alu_reg_reg (code, X86_ADD, X86_EAX, X86_EDX);
+                               /* Compute remainder */
+                               x86_alu_reg_imm (code, X86_AND, X86_EAX, (1 << power) - 1);
+                               /* Remove compensation */
+                               x86_alu_reg_reg (code, X86_SUB, X86_EAX, X86_EDX);
+                       }
+                       break;
+               }
+               case OP_IOR:
+                       x86_alu_reg_reg (code, X86_OR, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_OR_IMM:
+               case OP_IOR_IMM:
+                       x86_alu_reg_imm (code, X86_OR, ins->sreg1, ins->inst_imm);
+                       break;
+               case OP_IXOR:
+                       x86_alu_reg_reg (code, X86_XOR, ins->sreg1, ins->sreg2);
                        break;
                case OP_XOR_IMM:
+               case OP_IXOR_IMM:
                        x86_alu_reg_imm (code, X86_XOR, ins->sreg1, ins->inst_imm);
                        break;
-               case CEE_SHL:
+               case OP_ISHL:
                        g_assert (ins->sreg2 == X86_ECX);
                        x86_shift_reg (code, X86_SHL, ins->dreg);
                        break;
-               case CEE_SHR:
+               case OP_ISHR:
                        g_assert (ins->sreg2 == X86_ECX);
                        x86_shift_reg (code, X86_SAR, ins->dreg);
                        break;
                case OP_SHR_IMM:
+               case OP_ISHR_IMM:
                        x86_shift_reg_imm (code, X86_SAR, ins->dreg, ins->inst_imm);
                        break;
                case OP_SHR_UN_IMM:
+               case OP_ISHR_UN_IMM:
                        x86_shift_reg_imm (code, X86_SHR, ins->dreg, ins->inst_imm);
                        break;
-               case CEE_SHR_UN:
+               case OP_ISHR_UN:
                        g_assert (ins->sreg2 == X86_ECX);
                        x86_shift_reg (code, X86_SHR, ins->dreg);
                        break;
                case OP_SHL_IMM:
+               case OP_ISHL_IMM:
                        x86_shift_reg_imm (code, X86_SHL, ins->dreg, ins->inst_imm);
                        break;
                case OP_LSHL: {
@@ -2465,17 +2656,18 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                                x86_shift_reg_imm (code, X86_SHR, ins->backend.reg3, ins->inst_imm);
                        }
                        break;
-               case CEE_NOT:
+               case OP_INOT:
                        x86_not_reg (code, ins->sreg1);
                        break;
-               case CEE_NEG:
+               case OP_INEG:
                        x86_neg_reg (code, ins->sreg1);
                        break;
 
-               case CEE_MUL:
+               case OP_IMUL:
                        x86_imul_reg_reg (code, ins->sreg1, ins->sreg2);
                        break;
                case OP_MUL_IMM:
+               case OP_IMUL_IMM:
                        switch (ins->inst_imm) {
                        case 2:
                                /* MOV r1, r2 */
@@ -2533,11 +2725,11 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                                break;
                        }
                        break;
-               case CEE_MUL_OVF:
+               case OP_IMUL_OVF:
                        x86_imul_reg_reg (code, ins->sreg1, ins->sreg2);
                        EMIT_COND_SYSTEM_EXCEPTION (X86_CC_O, FALSE, "OverflowException");
                        break;
-               case CEE_MUL_OVF_UN: {
+               case OP_IMUL_OVF_UN: {
                        /* the mul operation and the exception check should most likely be split */
                        int non_eax_reg, saved_eax = FALSE, saved_edx = FALSE;
                        /*g_assert (ins->sreg2 == X86_EAX);
@@ -2583,6 +2775,10 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        mono_add_patch_info (cfg, offset, (MonoJumpInfoType)ins->inst_i1, ins->inst_p0);
                        x86_mov_reg_imm (code, ins->dreg, 0);
                        break;
+               case OP_JUMP_TABLE:
+                       mono_add_patch_info (cfg, offset, (MonoJumpInfoType)ins->inst_i1, ins->inst_p0);
+                       x86_mov_reg_imm (code, ins->dreg, 0);
+                       break;
                case OP_LOAD_GOTADDR:
                        x86_call_imm (code, 0);
                        /* 
@@ -2601,12 +2797,9 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        mono_add_patch_info (cfg, offset, (MonoJumpInfoType)ins->inst_right->inst_i1, ins->inst_right->inst_p0);
                        x86_push_membase (code, ins->inst_basereg, 0xf0f0f0f0);
                        break;
-               case CEE_CONV_I4:
                case OP_MOVE:
                        x86_mov_reg_reg (code, ins->dreg, ins->sreg1, 4);
                        break;
-               case CEE_CONV_U4:
-                       g_assert_not_reached ();
                case OP_JMP: {
                        /*
                         * Note: this 'frame destruction' logic is useful for tail calls, too.
@@ -2645,6 +2838,8 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        offset = code - cfg->native_code;
                        mono_add_patch_info (cfg, offset, MONO_PATCH_INFO_METHOD_JUMP, ins->inst_p0);
                        x86_jump32 (code, 0);
+
+                       cfg->disable_aot = TRUE;
                        break;
                }
                case OP_CHECK_THIS:
@@ -2665,8 +2860,9 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                case OP_FCALL:
                case OP_LCALL:
                case OP_VCALL:
+               case OP_VCALL2:
                case OP_VOIDCALL:
-               case CEE_CALL:
+               case OP_CALL:
                        call = (MonoCallInst*)ins;
                        if (ins->flags & MONO_INST_HAS_METHOD)
                                code = emit_call (cfg, code, MONO_PATCH_INFO_METHOD, call->method);
@@ -2699,6 +2895,7 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                case OP_FCALL_REG:
                case OP_LCALL_REG:
                case OP_VCALL_REG:
+               case OP_VCALL2_REG:
                case OP_VOIDCALL_REG:
                case OP_CALL_REG:
                        call = (MonoCallInst*)ins;
@@ -2714,9 +2911,18 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                case OP_FCALL_MEMBASE:
                case OP_LCALL_MEMBASE:
                case OP_VCALL_MEMBASE:
+               case OP_VCALL2_MEMBASE:
                case OP_VOIDCALL_MEMBASE:
                case OP_CALL_MEMBASE:
                        call = (MonoCallInst*)ins;
+
+                       /* 
+                        * Emit a few nops to simplify get_vcall_slot ().
+                        */
+                       x86_nop (code);
+                       x86_nop (code);
+                       x86_nop (code);
+
                        x86_call_membase (code, ins->sreg1, ins->inst_offset);
                        if (call->stack_usage && !CALLCONV_IS_STDCALL (call->signature)) {
                                if (call->stack_usage == 4)
@@ -2726,7 +2932,6 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        }
                        code = emit_move_return_value (cfg, ins, code);
                        break;
-               case OP_OUTARG:
                case OP_X86_PUSH:
                        x86_push_reg (code, ins->sreg1);
                        break;
@@ -2770,31 +2975,43 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        code = mono_emit_stack_alloc (code, ins);
                        x86_mov_reg_reg (code, ins->dreg, X86_ESP, 4);
                        break;
-               case CEE_RET:
-                       x86_ret (code);
+               case OP_LOCALLOC_IMM: {
+                       guint32 size = ins->inst_imm;
+                       size = (size + (MONO_ARCH_FRAME_ALIGNMENT - 1)) & ~ (MONO_ARCH_FRAME_ALIGNMENT - 1);
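+                       /* Round the request up to a multiple of MONO_ARCH_FRAME_ALIGNMENT so the
+                          stack pointer stays aligned after the allocation. */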
+
+                       if (ins->flags & MONO_INST_INIT) {
+                               /* FIXME: Optimize this */
+                               x86_mov_reg_imm (code, ins->dreg, size);
+                               ins->sreg1 = ins->dreg;
+
+                               code = mono_emit_stack_alloc (code, ins);
+                               x86_mov_reg_reg (code, ins->dreg, X86_ESP, 4);
+                       } else {
+                               x86_alu_reg_imm (code, X86_SUB, X86_ESP, size);
+                               x86_mov_reg_reg (code, ins->dreg, X86_ESP, 4);
+                       }
                        break;
+               }
                case OP_THROW: {
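+                       /* Reserve MONO_ARCH_FRAME_ALIGNMENT - 4 bytes so that, together with the
+                          4-byte push of the exception object, the stack stays frame-aligned for
+                          the call into the runtime. */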
+                       x86_alu_reg_imm (code, X86_SUB, X86_ESP, MONO_ARCH_FRAME_ALIGNMENT - 4);
                        x86_push_reg (code, ins->sreg1);
                        code = emit_call (cfg, code, MONO_PATCH_INFO_INTERNAL_METHOD, 
                                                          (gpointer)"mono_arch_throw_exception");
                        break;
                }
                case OP_RETHROW: {
+                       x86_alu_reg_imm (code, X86_SUB, X86_ESP, MONO_ARCH_FRAME_ALIGNMENT - 4);
                        x86_push_reg (code, ins->sreg1);
                        code = emit_call (cfg, code, MONO_PATCH_INFO_INTERNAL_METHOD, 
                                                          (gpointer)"mono_arch_rethrow_exception");
                        break;
                }
-               case OP_CALL_HANDLER: 
-                       /* Align stack */
-#ifdef __APPLE__
-                       x86_alu_reg_imm (code, X86_SUB, X86_ESP, 12);
-#endif
+               case OP_CALL_HANDLER:
+                       x86_alu_reg_imm (code, X86_SUB, X86_ESP, MONO_ARCH_FRAME_ALIGNMENT - 4);
                        mono_add_patch_info (cfg, code - cfg->native_code, MONO_PATCH_INFO_BB, ins->inst_target_bb);
                        x86_call_imm (code, 0);
-#ifdef __APPLE__
-                       x86_alu_reg_imm (code, X86_ADD, X86_ESP, 12);
-#endif
+                       mono_cfg_add_try_hole (cfg, ins->inst_eh_block, code, bb);
+                       x86_alu_reg_imm (code, X86_ADD, X86_ESP, MONO_ARCH_FRAME_ALIGNMENT - 4);
                        break;
                case OP_START_HANDLER: {
                        MonoInst *spvar = mono_find_spvar_for_region (cfg, bb->region);
@@ -2819,28 +3036,15 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        ins->inst_c0 = code - cfg->native_code;
                        break;
                case OP_BR:
-                       if (ins->flags & MONO_INST_BRLABEL) {
-                               if (ins->inst_i0->inst_c0) {
-                                       x86_jump_code (code, cfg->native_code + ins->inst_i0->inst_c0);
-                               } else {
-                                       mono_add_patch_info (cfg, offset, MONO_PATCH_INFO_LABEL, ins->inst_i0);
-                                       if ((cfg->opt & MONO_OPT_BRANCH) &&
-                                           x86_is_imm8 (ins->inst_i0->inst_c1 - cpos))
-                                               x86_jump8 (code, 0);
-                                       else 
-                                               x86_jump32 (code, 0);
-                               }
+                       if (ins->inst_target_bb->native_offset) {
+                               x86_jump_code (code, cfg->native_code + ins->inst_target_bb->native_offset); 
                        } else {
-                               if (ins->inst_target_bb->native_offset) {
-                                       x86_jump_code (code, cfg->native_code + ins->inst_target_bb->native_offset); 
-                               } else {
-                                       mono_add_patch_info (cfg, offset, MONO_PATCH_INFO_BB, ins->inst_target_bb);
-                                       if ((cfg->opt & MONO_OPT_BRANCH) &&
-                                           x86_is_imm8 (ins->inst_target_bb->max_offset - cpos))
-                                               x86_jump8 (code, 0);
-                                       else 
-                                               x86_jump32 (code, 0);
-                               } 
+                               mono_add_patch_info (cfg, offset, MONO_PATCH_INFO_BB, ins->inst_target_bb);
+                               if ((cfg->opt & MONO_OPT_BRANCH) &&
+                                   x86_is_imm8 (ins->inst_target_bb->max_offset - cpos))
+                                       x86_jump8 (code, 0);
+                               else 
+                                       x86_jump32 (code, 0);
                        }
                        break;
                case OP_BR_REG:
@@ -2852,6 +3056,11 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                case OP_CGT:
                case OP_CGT_UN:
                case OP_CNE:
+               case OP_ICEQ:
+               case OP_ICLT:
+               case OP_ICLT_UN:
+               case OP_ICGT:
+               case OP_ICGT_UN:
                        x86_set_reg (code, cc_table [mono_opcode_to_cond (ins->opcode)], ins->dreg, cc_signed_table [mono_opcode_to_cond (ins->opcode)]);
                        x86_widen_reg (code, ins->dreg, ins->dreg, FALSE, FALSE);
                        break;
@@ -2865,6 +3074,16 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                case OP_COND_EXC_GE_UN:
                case OP_COND_EXC_LE:
                case OP_COND_EXC_LE_UN:
+               case OP_COND_EXC_IEQ:
+               case OP_COND_EXC_INE_UN:
+               case OP_COND_EXC_ILT:
+               case OP_COND_EXC_ILT_UN:
+               case OP_COND_EXC_IGT:
+               case OP_COND_EXC_IGT_UN:
+               case OP_COND_EXC_IGE:
+               case OP_COND_EXC_IGE_UN:
+               case OP_COND_EXC_ILE:
+               case OP_COND_EXC_ILE_UN:
                        EMIT_COND_SYSTEM_EXCEPTION (cc_table [mono_opcode_to_cond (ins->opcode)], cc_signed_table [mono_opcode_to_cond (ins->opcode)], ins->inst_p1);
                        break;
                case OP_COND_EXC_OV:
@@ -2873,19 +3092,39 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                case OP_COND_EXC_NC:
                        EMIT_COND_SYSTEM_EXCEPTION (branch_cc_table [ins->opcode - OP_COND_EXC_EQ], (ins->opcode < OP_COND_EXC_NE_UN), ins->inst_p1);
                        break;
-               case CEE_BEQ:
-               case CEE_BNE_UN:
-               case CEE_BLT:
-               case CEE_BLT_UN:
-               case CEE_BGT:
-               case CEE_BGT_UN:
-               case CEE_BGE:
-               case CEE_BGE_UN:
-               case CEE_BLE:
-               case CEE_BLE_UN:
+               case OP_COND_EXC_IOV:
+               case OP_COND_EXC_INO:
+               case OP_COND_EXC_IC:
+               case OP_COND_EXC_INC:
+                       EMIT_COND_SYSTEM_EXCEPTION (branch_cc_table [ins->opcode - OP_COND_EXC_IEQ], (ins->opcode < OP_COND_EXC_INE_UN), ins->inst_p1);
+                       break;
+               case OP_IBEQ:
+               case OP_IBNE_UN:
+               case OP_IBLT:
+               case OP_IBLT_UN:
+               case OP_IBGT:
+               case OP_IBGT_UN:
+               case OP_IBGE:
+               case OP_IBGE_UN:
+               case OP_IBLE:
+               case OP_IBLE_UN:
                        EMIT_COND_BRANCH (ins, cc_table [mono_opcode_to_cond (ins->opcode)], cc_signed_table [mono_opcode_to_cond (ins->opcode)]);
                        break;
 
+               case OP_CMOV_IEQ:
+               case OP_CMOV_IGE:
+               case OP_CMOV_IGT:
+               case OP_CMOV_ILE:
+               case OP_CMOV_ILT:
+               case OP_CMOV_INE_UN:
+               case OP_CMOV_IGE_UN:
+               case OP_CMOV_IGT_UN:
+               case OP_CMOV_ILE_UN:
+               case OP_CMOV_ILT_UN:
+                       g_assert (ins->dreg == ins->sreg1);
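+                       /* cmov only writes dreg when the condition holds, so dreg must already
+                          contain the first operand, hence the constraint asserted above. */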
+                       x86_cmov_reg (code, cc_table [mono_opcode_to_cond (ins->opcode)], cc_signed_table [mono_opcode_to_cond (ins->opcode)], ins->dreg, ins->sreg2);
+                       break;
+
                /* floating point opcodes */
                case OP_R8CONST: {
                        double d = *(double *)ins->inst_p0;
@@ -2933,10 +3172,6 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                case OP_STORER8_MEMBASE_REG:
                        x86_fst_membase (code, ins->inst_destbasereg, ins->inst_offset, TRUE, TRUE);
                        break;
-               case OP_LOADR8_SPILL_MEMBASE:
-                       x86_fld_membase (code, ins->inst_basereg, ins->inst_offset, TRUE);
-                       x86_fxch (code, 1);
-                       break;
                case OP_LOADR8_MEMBASE:
                        x86_fld_membase (code, ins->inst_basereg, ins->inst_offset, TRUE);
                        break;
@@ -2946,18 +3181,38 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                case OP_LOADR4_MEMBASE:
                        x86_fld_membase (code, ins->inst_basereg, ins->inst_offset, FALSE);
                        break;
-               case CEE_CONV_R4: /* FIXME: change precision */
-               case CEE_CONV_R8:
+               case OP_ICONV_TO_R4:
+                       x86_push_reg (code, ins->sreg1);
+                       x86_fild_membase (code, X86_ESP, 0, FALSE);
+                       /* Change precision */
+                       x86_fst_membase (code, X86_ESP, 0, FALSE, TRUE);
+                       x86_fld_membase (code, X86_ESP, 0, FALSE);
+                       x86_alu_reg_imm (code, X86_ADD, X86_ESP, 4);
+                       break;
+               case OP_ICONV_TO_R8:
                        x86_push_reg (code, ins->sreg1);
                        x86_fild_membase (code, X86_ESP, 0, FALSE);
                        x86_alu_reg_imm (code, X86_ADD, X86_ESP, 4);
                        break;
+               case OP_ICONV_TO_R_UN:
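+                       /* Push a zero high word so the 64-bit fild below sees the 32-bit value
+                          as an unsigned quantity. */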
+                       x86_push_imm (code, 0);
+                       x86_push_reg (code, ins->sreg1);
+                       x86_fild_membase (code, X86_ESP, 0, TRUE);
+                       x86_alu_reg_imm (code, X86_ADD, X86_ESP, 8);
+                       break;
                case OP_X86_FP_LOAD_I8:
                        x86_fild_membase (code, ins->inst_basereg, ins->inst_offset, TRUE);
                        break;
                case OP_X86_FP_LOAD_I4:
                        x86_fild_membase (code, ins->inst_basereg, ins->inst_offset, FALSE);
                        break;
+               case OP_FCONV_TO_R4:
+                       /* Change precision */
+                       x86_alu_reg_imm (code, X86_SUB, X86_ESP, 4);
+                       x86_fst_membase (code, X86_ESP, 0, FALSE, TRUE);
+                       x86_fld_membase (code, X86_ESP, 0, FALSE);
+                       x86_alu_reg_imm (code, X86_ADD, X86_ESP, 4);
+                       break;
                case OP_FCONV_TO_I1:
                        code = emit_float_to_int (cfg, code, ins->dreg, 1, TRUE);
                        break;
@@ -2988,17 +3243,32 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        x86_fldcw_membase (code, X86_ESP, 0);
                        x86_alu_reg_imm (code, X86_ADD, X86_ESP, 4);
                        break;
-               case OP_LCONV_TO_R_UN: { 
+               case OP_LCONV_TO_R8_2:
+                       x86_push_reg (code, ins->sreg2);
+                       x86_push_reg (code, ins->sreg1);
+                       x86_fild_membase (code, X86_ESP, 0, TRUE);
+                       /* Change precision */
+                       x86_fst_membase (code, X86_ESP, 0, TRUE, TRUE);
+                       x86_fld_membase (code, X86_ESP, 0, TRUE);
+                       x86_alu_reg_imm (code, X86_ADD, X86_ESP, 8);
+                       break;
+               case OP_LCONV_TO_R4_2:
+                       x86_push_reg (code, ins->sreg2);
+                       x86_push_reg (code, ins->sreg1);
+                       x86_fild_membase (code, X86_ESP, 0, TRUE);
+                       /* Change precision */
+                       x86_fst_membase (code, X86_ESP, 0, FALSE, TRUE);
+                       x86_fld_membase (code, X86_ESP, 0, FALSE);
+                       x86_alu_reg_imm (code, X86_ADD, X86_ESP, 8);
+                       break;
+               case OP_LCONV_TO_R_UN_2: { 
                        static guint8 mn[] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x3f, 0x40 };
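+                       /* mn encodes 2^64 as an 80-bit extended double; it is added below when the
+                          64-bit operand is negative, converting the signed fild result into the
+                          intended unsigned value. */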
                        guint8 *br;
 
                        /* load 64bit integer to FP stack */
-                       x86_push_imm (code, 0);
                        x86_push_reg (code, ins->sreg2);
                        x86_push_reg (code, ins->sreg1);
                        x86_fild_membase (code, X86_ESP, 0, TRUE);
-                       /* store as 80bit FP value */
-                       x86_fst80_membase (code, X86_ESP, 0);
                        
                        /* test if lreg is negative */
                        x86_test_reg_reg (code, ins->sreg2, ins->sreg2);
@@ -3006,18 +3276,20 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
        
                        /* add correction constant mn */
                        x86_fld80_mem (code, mn);
-                       x86_fld80_membase (code, X86_ESP, 0);
                        x86_fp_op_reg (code, X86_FADD, 1, TRUE);
-                       x86_fst80_membase (code, X86_ESP, 0);
 
                        x86_patch (br, code);
 
-                       x86_fld80_membase (code, X86_ESP, 0);
-                       x86_alu_reg_imm (code, X86_ADD, X86_ESP, 12);
+                       /* Change precision */
+                       x86_fst_membase (code, X86_ESP, 0, TRUE, TRUE);
+                       x86_fld_membase (code, X86_ESP, 0, TRUE);
+
+                       x86_alu_reg_imm (code, X86_ADD, X86_ESP, 8);
 
                        break;
                }
-               case OP_LCONV_TO_OVF_I: {
+               case OP_LCONV_TO_OVF_I:
+               case OP_LCONV_TO_OVF_I4_2: {
                        guint8 *br [3], *label [1];
                        MonoInst *tins;
 
@@ -3060,6 +3332,9 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                                x86_mov_reg_reg (code, ins->dreg, ins->sreg1, 4);
                        break;
                }
+               case OP_FMOVE:
+                       /* Not needed on the fp stack */
+                       break;
                case OP_FADD:
                        x86_fp_op_reg (code, X86_FADD, 1, TRUE);
                        break;
@@ -3131,10 +3406,40 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        break;          
                case OP_SQRT:
                        x86_fsqrt (code);
-                       break;          
+                       break;
+               case OP_ROUND:
+                       x86_frndint (code);
+                       break;
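+               /* Branchless min/max: cmp sreg1, sreg2 followed by a conditional move of
+                * sreg2 into dreg (tied to sreg1 by the register allocator) when it wins.
+                * Only emitted when MONO_OPT_CMOV is on, since CMOV needs a P6+ CPU. */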
+               case OP_IMIN:
+                       g_assert (cfg->opt & MONO_OPT_CMOV);
+                       g_assert (ins->dreg == ins->sreg1);
+                       x86_alu_reg_reg (code, X86_CMP, ins->sreg1, ins->sreg2);
+                       x86_cmov_reg (code, X86_CC_GT, TRUE, ins->dreg, ins->sreg2);
+                       break;
+               case OP_IMIN_UN:
+                       g_assert (cfg->opt & MONO_OPT_CMOV);
+                       g_assert (ins->dreg == ins->sreg1);
+                       x86_alu_reg_reg (code, X86_CMP, ins->sreg1, ins->sreg2);
+                       x86_cmov_reg (code, X86_CC_GT, FALSE, ins->dreg, ins->sreg2);
+                       break;
+               case OP_IMAX:
+                       g_assert (cfg->opt & MONO_OPT_CMOV);
+                       g_assert (ins->dreg == ins->sreg1);
+                       x86_alu_reg_reg (code, X86_CMP, ins->sreg1, ins->sreg2);
+                       x86_cmov_reg (code, X86_CC_LT, TRUE, ins->dreg, ins->sreg2);
+                       break;
+               case OP_IMAX_UN:
+                       g_assert (cfg->opt & MONO_OPT_CMOV);
+                       g_assert (ins->dreg == ins->sreg1);
+                       x86_alu_reg_reg (code, X86_CMP, ins->sreg1, ins->sreg2);
+                       x86_cmov_reg (code, X86_CC_LT, FALSE, ins->dreg, ins->sreg2);
+                       break;
                case OP_X86_FPOP:
                        x86_fstp (code, 0);
-                       break;          
+                       break;
+               case OP_X86_FXCH:
+                       x86_fxch (code, ins->inst_imm);
+                       break;
                case OP_FREM: {
                        guint8 *l1, *l2;
 
@@ -3438,7 +3743,7 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        break;
                }
                case OP_TLS_GET: {
-                       code = emit_tls_get (code, ins->dreg, ins->inst_offset);
+                       code = mono_x86_emit_tls_get (code, ins->dreg, ins->inst_offset);
                        break;
                }
                case OP_MEMORY_BARRIER: {
@@ -3518,20 +3823,19 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                         * hack to overcome limits in x86 reg allocator 
                         * (req: dreg == eax and sreg2 != eax and breg != eax) 
                         */
-                       if (ins->dreg != X86_EAX)
-                               x86_push_reg (code, X86_EAX);
+                       g_assert (ins->dreg == X86_EAX);
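+                       /* dreg is now pinned to EAX by the allocator; if the exchange value
+                        * or the base register also happens to be EAX it is moved to a
+                        * scratch register below and restored afterwards. */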
                        
                        /* We need the EAX reg for the cmpxchg */
                        if (ins->sreg2 == X86_EAX) {
-                               x86_push_reg (code, X86_EDX);
-                               x86_mov_reg_reg (code, X86_EDX, X86_EAX, 4);
-                               sreg2 = X86_EDX;
+                               sreg2 = (breg == X86_EDX) ? X86_EBX : X86_EDX;
+                               x86_push_reg (code, sreg2);
+                               x86_mov_reg_reg (code, sreg2, X86_EAX, 4);
                        }
 
                        if (breg == X86_EAX) {
-                               x86_push_reg (code, X86_ESI);
-                               x86_mov_reg_reg (code, X86_ESI, X86_EAX, 4);
-                               breg = X86_ESI;
+                               breg = (sreg2 == X86_ESI) ? X86_EDI : X86_ESI;
+                               x86_push_reg (code, breg);
+                               x86_mov_reg_reg (code, breg, X86_EAX, 4);
                        }
 
                        x86_mov_reg_membase (code, X86_EAX, breg, ins->inst_offset, 4);
@@ -3542,126 +3846,711 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        x86_patch (br [1], br [0]);
 
                        if (breg != ins->inst_basereg)
-                               x86_pop_reg (code, X86_ESI);
-
-                       if (ins->dreg != X86_EAX) {
-                               x86_mov_reg_reg (code, ins->dreg, X86_EAX, 4);
-                               x86_pop_reg (code, X86_EAX);
-                       }
+                               x86_pop_reg (code, breg);
 
                        if (ins->sreg2 != sreg2)
-                               x86_pop_reg (code, X86_EDX);
+                               x86_pop_reg (code, sreg2);
 
                        break;
                }
-               default:
-                       g_warning ("unknown opcode %s\n", mono_inst_name (ins->opcode));
-                       g_assert_not_reached ();
-               }
-
-               if (G_UNLIKELY ((code - cfg->native_code - offset) > max_len)) {
-                       g_warning ("wrong maximal instruction length of instruction %s (expected %d, got %d)",
-                                  mono_inst_name (ins->opcode), max_len, code - cfg->native_code - offset);
-                       g_assert_not_reached ();
-               }
-              
-               cpos += max_len;
-       }
+               case OP_ATOMIC_CAS_I4: {
+                       g_assert (ins->sreg3 == X86_EAX);
+                       g_assert (ins->sreg1 != X86_EAX);
+                       g_assert (ins->sreg1 != ins->sreg2);
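+                       /* Emits lock cmpxchg [sreg1 + offset], sreg2: the comparand is in
+                        * EAX (sreg3); on success sreg2 is stored, otherwise the current
+                        * memory value is loaded into EAX, so EAX always ends up with the
+                        * old value, which is the CAS result. */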
 
-       cfg->code_len = code - cfg->native_code;
-}
+                       x86_prefix (code, X86_LOCK_PREFIX);
+                       x86_cmpxchg_membase_reg (code, ins->sreg1, ins->inst_offset, ins->sreg2);
 
-void
-mono_arch_register_lowlevel_calls (void)
-{
-}
+                       if (ins->dreg != X86_EAX)
+                               x86_mov_reg_reg (code, ins->dreg, X86_EAX, 4);
+                       break;
+               }
+#ifdef MONO_ARCH_SIMD_INTRINSICS
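+               /* Mono.Simd intrinsics.  The binary ops below use the two-operand SSE
+                * encoding, where sreg1 doubles as the destination, so the register
+                * allocator is expected to assign dreg == sreg1 for them. */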
+               case OP_ADDPS:
+                       x86_sse_alu_ps_reg_reg (code, X86_SSE_ADD, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_DIVPS:
+                       x86_sse_alu_ps_reg_reg (code, X86_SSE_DIV, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_MULPS:
+                       x86_sse_alu_ps_reg_reg (code, X86_SSE_MUL, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_SUBPS:
+                       x86_sse_alu_ps_reg_reg (code, X86_SSE_SUB, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_MAXPS:
+                       x86_sse_alu_ps_reg_reg (code, X86_SSE_MAX, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_MINPS:
+                       x86_sse_alu_ps_reg_reg (code, X86_SSE_MIN, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_COMPPS:
+                       g_assert (ins->inst_c0 >= 0 && ins->inst_c0 <= 7);
+                       x86_sse_alu_ps_reg_reg_imm (code, X86_SSE_COMP, ins->sreg1, ins->sreg2, ins->inst_c0);
+                       break;
+               case OP_ANDPS:
+                       x86_sse_alu_ps_reg_reg (code, X86_SSE_AND, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_ANDNPS:
+                       x86_sse_alu_ps_reg_reg (code, X86_SSE_ANDN, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_ORPS:
+                       x86_sse_alu_ps_reg_reg (code, X86_SSE_OR, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_XORPS:
+                       x86_sse_alu_ps_reg_reg (code, X86_SSE_XOR, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_SQRTPS:
+                       x86_sse_alu_ps_reg_reg (code, X86_SSE_SQRT, ins->dreg, ins->sreg1);
+                       break;
+               case OP_RSQRTPS:
+                       x86_sse_alu_ps_reg_reg (code, X86_SSE_RSQRT, ins->dreg, ins->sreg1);
+                       break;
+               case OP_RCPPS:
+                       x86_sse_alu_ps_reg_reg (code, X86_SSE_RCP, ins->dreg, ins->sreg1);
+                       break;
+               case OP_ADDSUBPS:
+                       x86_sse_alu_sd_reg_reg (code, X86_SSE_ADDSUB, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_HADDPS:
+                       x86_sse_alu_sd_reg_reg (code, X86_SSE_HADD, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_HSUBPS:
+                       x86_sse_alu_sd_reg_reg (code, X86_SSE_HSUB, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_DUPPS_HIGH:
+                       x86_sse_alu_ss_reg_reg (code, X86_SSE_MOVSHDUP, ins->dreg, ins->sreg1);
+                       break;
+               case OP_DUPPS_LOW:
+                       x86_sse_alu_ss_reg_reg (code, X86_SSE_MOVSLDUP, ins->dreg, ins->sreg1);
+                       break;
 
-void
-mono_arch_patch_code (MonoMethod *method, MonoDomain *domain, guint8 *code, MonoJumpInfo *ji, gboolean run_cctors)
-{
-       MonoJumpInfo *patch_info;
-       gboolean compile_aot = !run_cctors;
+               case OP_PSHUFLEW_HIGH:
+                       g_assert (ins->inst_c0 >= 0 && ins->inst_c0 <= 0xFF);
+                       x86_pshufw_reg_reg (code, ins->dreg, ins->sreg1, ins->inst_c0, 1);
+                       break;
+               case OP_PSHUFLEW_LOW:
+                       g_assert (ins->inst_c0 >= 0 && ins->inst_c0 <= 0xFF);
+                       x86_pshufw_reg_reg (code, ins->dreg, ins->sreg1, ins->inst_c0, 0);
+                       break;
+               case OP_PSHUFLED:
+                       g_assert (ins->inst_c0 >= 0 && ins->inst_c0 <= 0xFF);
+                       x86_sse_shift_reg_imm (code, X86_SSE_PSHUFD, ins->dreg, ins->sreg1, ins->inst_c0);
+                       break;
 
-       for (patch_info = ji; patch_info; patch_info = patch_info->next) {
-               unsigned char *ip = patch_info->ip.i + code;
-               const unsigned char *target;
+               case OP_ADDPD:
+                       x86_sse_alu_pd_reg_reg (code, X86_SSE_ADD, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_DIVPD:
+                       x86_sse_alu_pd_reg_reg (code, X86_SSE_DIV, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_MULPD:
+                       x86_sse_alu_pd_reg_reg (code, X86_SSE_MUL, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_SUBPD:
+                       x86_sse_alu_pd_reg_reg (code, X86_SSE_SUB, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_MAXPD:
+                       x86_sse_alu_pd_reg_reg (code, X86_SSE_MAX, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_MINPD:
+                       x86_sse_alu_pd_reg_reg (code, X86_SSE_MIN, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_COMPPD:
+                       g_assert (ins->inst_c0 >= 0 && ins->inst_c0 <= 7);
+                       x86_sse_alu_pd_reg_reg_imm (code, X86_SSE_COMP, ins->sreg1, ins->sreg2, ins->inst_c0);
+                       break;
+               case OP_ANDPD:
+                       x86_sse_alu_pd_reg_reg (code, X86_SSE_AND, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_ANDNPD:
+                       x86_sse_alu_pd_reg_reg (code, X86_SSE_ANDN, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_ORPD:
+                       x86_sse_alu_pd_reg_reg (code, X86_SSE_OR, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_XORPD:
+                       x86_sse_alu_pd_reg_reg (code, X86_SSE_XOR, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_SQRTPD:
+                       x86_sse_alu_pd_reg_reg (code, X86_SSE_SQRT, ins->dreg, ins->sreg1);
+                       break;
+               case OP_ADDSUBPD:
+                       x86_sse_alu_pd_reg_reg (code, X86_SSE_ADDSUB, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_HADDPD:
+                       x86_sse_alu_pd_reg_reg (code, X86_SSE_HADD, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_HSUBPD:
+                       x86_sse_alu_pd_reg_reg (code, X86_SSE_HSUB, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_DUPPD:
+                       x86_sse_alu_sd_reg_reg (code, X86_SSE_MOVDDUP, ins->dreg, ins->sreg1);
+                       break;
+                       
+               case OP_EXTRACT_MASK:
+                       x86_sse_alu_pd_reg_reg (code, X86_SSE_PMOVMSKB, ins->dreg, ins->sreg1);
+                       break;
+       
+               case OP_PAND:
+                       x86_sse_alu_pd_reg_reg (code, X86_SSE_PAND, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_POR:
+                       x86_sse_alu_pd_reg_reg (code, X86_SSE_POR, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PXOR:
+                       x86_sse_alu_pd_reg_reg (code, X86_SSE_PXOR, ins->sreg1, ins->sreg2);
+                       break;
 
-               target = mono_resolve_patch_target (method, domain, code, patch_info, run_cctors);
+               case OP_PADDB:
+                       x86_sse_alu_pd_reg_reg (code, X86_SSE_PADDB, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PADDW:
+                       x86_sse_alu_pd_reg_reg (code, X86_SSE_PADDW, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PADDD:
+                       x86_sse_alu_pd_reg_reg (code, X86_SSE_PADDD, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PADDQ:
+                       x86_sse_alu_pd_reg_reg (code, X86_SSE_PADDQ, ins->sreg1, ins->sreg2);
+                       break;
 
-               if (compile_aot) {
-                       switch (patch_info->type) {
-                       case MONO_PATCH_INFO_BB:
-                       case MONO_PATCH_INFO_LABEL:
-                               break;
-                       default:
-                               /* No need to patch these */
-                               continue;
-                       }
-               }
+               case OP_PSUBB:
+                       x86_sse_alu_pd_reg_reg (code, X86_SSE_PSUBB, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PSUBW:
+                       x86_sse_alu_pd_reg_reg (code, X86_SSE_PSUBW, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PSUBD:
+                       x86_sse_alu_pd_reg_reg (code, X86_SSE_PSUBD, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PSUBQ:
+                       x86_sse_alu_pd_reg_reg (code, X86_SSE_PSUBQ, ins->sreg1, ins->sreg2);
+                       break;
 
-               switch (patch_info->type) {
-               case MONO_PATCH_INFO_IP:
-                       *((gconstpointer *)(ip)) = target;
+               case OP_PMAXB_UN:
+                       x86_sse_alu_pd_reg_reg (code, X86_SSE_PMAXUB, ins->sreg1, ins->sreg2);
                        break;
-               case MONO_PATCH_INFO_CLASS_INIT: {
-                       guint8 *code = ip;
-                       /* Might already been changed to a nop */
-                       x86_call_code (code, 0);
-                       x86_patch (ip, target);
+               case OP_PMAXW_UN:
+                       x86_sse_alu_sse41_reg_reg (code, X86_SSE_PMAXUW, ins->sreg1, ins->sreg2);
                        break;
-               }
-               case MONO_PATCH_INFO_ABS:
-               case MONO_PATCH_INFO_METHOD:
-               case MONO_PATCH_INFO_METHOD_JUMP:
-               case MONO_PATCH_INFO_INTERNAL_METHOD:
-               case MONO_PATCH_INFO_BB:
-               case MONO_PATCH_INFO_LABEL:
-                       x86_patch (ip, target);
+               case OP_PMAXD_UN:
+                       x86_sse_alu_sse41_reg_reg (code, X86_SSE_PMAXUD, ins->sreg1, ins->sreg2);
                        break;
-               case MONO_PATCH_INFO_NONE:
+               
+               case OP_PMAXB:
+                       x86_sse_alu_sse41_reg_reg (code, X86_SSE_PMAXSB, ins->sreg1, ins->sreg2);
                        break;
-               default: {
-                       guint32 offset = mono_arch_get_patch_offset (ip);
-                       *((gconstpointer *)(ip + offset)) = target;
+               case OP_PMAXW:
+                       x86_sse_alu_pd_reg_reg (code, X86_SSE_PMAXSW, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PMAXD:
+                       x86_sse_alu_sse41_reg_reg (code, X86_SSE_PMAXSD, ins->sreg1, ins->sreg2);
                        break;
-               }
-               }
-       }
-}
-
-guint8 *
-mono_arch_emit_prolog (MonoCompile *cfg)
-{
-       MonoMethod *method = cfg->method;
-       MonoBasicBlock *bb;
-       MonoMethodSignature *sig;
-       MonoInst *inst;
-       int alloc_size, pos, max_offset, i;
-       guint8 *code;
 
-       cfg->code_size =  MAX (mono_method_get_header (method)->code_size * 4, 1024);
+               case OP_PAVGB_UN:
+                       x86_sse_alu_pd_reg_reg (code, X86_SSE_PAVGB, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PAVGW_UN:
+                       x86_sse_alu_pd_reg_reg (code, X86_SSE_PAVGW, ins->sreg1, ins->sreg2);
+                       break;
 
-       if (cfg->prof_options & MONO_PROFILE_ENTER_LEAVE)
-               cfg->code_size += 512;
+               case OP_PMINB_UN:
+                       x86_sse_alu_pd_reg_reg (code, X86_SSE_PMINUB, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PMINW_UN:
+                       x86_sse_alu_sse41_reg_reg (code, X86_SSE_PMINUW, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PMIND_UN:
+                       x86_sse_alu_sse41_reg_reg (code, X86_SSE_PMINUD, ins->sreg1, ins->sreg2);
+                       break;
 
-       code = cfg->native_code = g_malloc (cfg->code_size);
+               case OP_PMINB:
+                       x86_sse_alu_sse41_reg_reg (code, X86_SSE_PMINSB, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PMINW:
+                       x86_sse_alu_pd_reg_reg (code, X86_SSE_PMINSW, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PMIND:
+                       x86_sse_alu_sse41_reg_reg (code, X86_SSE_PMINSD, ins->sreg1, ins->sreg2);
+                       break;
 
-       x86_push_reg (code, X86_EBP);
-       x86_mov_reg_reg (code, X86_EBP, X86_ESP, 4);
+               case OP_PCMPEQB:
+                       x86_sse_alu_pd_reg_reg (code, X86_SSE_PCMPEQB, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PCMPEQW:
+                       x86_sse_alu_pd_reg_reg (code, X86_SSE_PCMPEQW, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PCMPEQD:
+                       x86_sse_alu_pd_reg_reg (code, X86_SSE_PCMPEQD, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PCMPEQQ:
+                       x86_sse_alu_sse41_reg_reg (code, X86_SSE_PCMPEQQ, ins->sreg1, ins->sreg2);
+                       break;
 
-       alloc_size = cfg->stack_offset;
-       pos = 0;
+               case OP_PCMPGTB:
+                       x86_sse_alu_pd_reg_reg (code, X86_SSE_PCMPGTB, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PCMPGTW:
+                       x86_sse_alu_pd_reg_reg (code, X86_SSE_PCMPGTW, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PCMPGTD:
+                       x86_sse_alu_pd_reg_reg (code, X86_SSE_PCMPGTD, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PCMPGTQ:
+                       x86_sse_alu_sse41_reg_reg (code, X86_SSE_PCMPGTQ, ins->sreg1, ins->sreg2);
+                       break;
 
-       if (method->wrapper_type == MONO_WRAPPER_NATIVE_TO_MANAGED) {
-               /* Might need to attach the thread to the JIT  or change the domain for the callback */
-               if (appdomain_tls_offset != -1 && lmf_tls_offset != -1) {
-                       guint8 *buf, *no_domain_branch;
+               case OP_PSUM_ABS_DIFF:
+                       x86_sse_alu_pd_reg_reg (code, X86_SSE_PSADBW, ins->sreg1, ins->sreg2);
+                       break;
 
-                       code = emit_tls_get (code, X86_EAX, appdomain_tls_offset);
-                       x86_alu_reg_imm (code, X86_CMP, X86_EAX, GPOINTER_TO_UINT (cfg->domain));
-                       no_domain_branch = code;
+               case OP_UNPACK_LOWB:
+                       x86_sse_alu_pd_reg_reg (code, X86_SSE_PUNPCKLBW, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_UNPACK_LOWW:
+                       x86_sse_alu_pd_reg_reg (code, X86_SSE_PUNPCKLWD, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_UNPACK_LOWD:
+                       x86_sse_alu_pd_reg_reg (code, X86_SSE_PUNPCKLDQ, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_UNPACK_LOWQ:
+                       x86_sse_alu_pd_reg_reg (code, X86_SSE_PUNPCKLQDQ, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_UNPACK_LOWPS:
+                       x86_sse_alu_ps_reg_reg (code, X86_SSE_UNPCKL, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_UNPACK_LOWPD:
+                       x86_sse_alu_pd_reg_reg (code, X86_SSE_UNPCKL, ins->sreg1, ins->sreg2);
+                       break;
+
+               case OP_UNPACK_HIGHB:
+                       x86_sse_alu_pd_reg_reg (code, X86_SSE_PUNPCKHBW, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_UNPACK_HIGHW:
+                       x86_sse_alu_pd_reg_reg (code, X86_SSE_PUNPCKHWD, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_UNPACK_HIGHD:
+                       x86_sse_alu_pd_reg_reg (code, X86_SSE_PUNPCKHDQ, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_UNPACK_HIGHQ:
+                       x86_sse_alu_pd_reg_reg (code, X86_SSE_PUNPCKHQDQ, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_UNPACK_HIGHPS:
+                       x86_sse_alu_ps_reg_reg (code, X86_SSE_UNPCKH, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_UNPACK_HIGHPD:
+                       x86_sse_alu_pd_reg_reg (code, X86_SSE_UNPCKH, ins->sreg1, ins->sreg2);
+                       break;
+
+               case OP_PACKW:
+                       x86_sse_alu_pd_reg_reg (code, X86_SSE_PACKSSWB, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PACKD:
+                       x86_sse_alu_pd_reg_reg (code, X86_SSE_PACKSSDW, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PACKW_UN:
+                       x86_sse_alu_pd_reg_reg (code, X86_SSE_PACKUSWB, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PACKD_UN:
+                       x86_sse_alu_sse41_reg_reg (code, X86_SSE_PACKUSDW, ins->sreg1, ins->sreg2);
+                       break;
+
+               case OP_PADDB_SAT_UN:
+                       x86_sse_alu_pd_reg_reg (code, X86_SSE_PADDUSB, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PSUBB_SAT_UN:
+                       x86_sse_alu_pd_reg_reg (code, X86_SSE_PSUBUSB, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PADDW_SAT_UN:
+                       x86_sse_alu_pd_reg_reg (code, X86_SSE_PADDUSW, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PSUBW_SAT_UN:
+                       x86_sse_alu_pd_reg_reg (code, X86_SSE_PSUBUSW, ins->sreg1, ins->sreg2);
+                       break;
+
+               case OP_PADDB_SAT:
+                       x86_sse_alu_pd_reg_reg (code, X86_SSE_PADDSB, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PSUBB_SAT:
+                       x86_sse_alu_pd_reg_reg (code, X86_SSE_PSUBSB, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PADDW_SAT:
+                       x86_sse_alu_pd_reg_reg (code, X86_SSE_PADDSW, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PSUBW_SAT:
+                       x86_sse_alu_pd_reg_reg (code, X86_SSE_PSUBSW, ins->sreg1, ins->sreg2);
+                       break;
+                       
+               case OP_PMULW:
+                       x86_sse_alu_pd_reg_reg (code, X86_SSE_PMULLW, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PMULD:
+                       x86_sse_alu_sse41_reg_reg (code, X86_SSE_PMULLD, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PMULQ:
+                       x86_sse_alu_pd_reg_reg (code, X86_SSE_PMULUDQ, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PMULW_HIGH_UN:
+                       x86_sse_alu_pd_reg_reg (code, X86_SSE_PMULHUW, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PMULW_HIGH:
+                       x86_sse_alu_pd_reg_reg (code, X86_SSE_PMULHW, ins->sreg1, ins->sreg2);
+                       break;
+
+               case OP_PSHRW:
+                       x86_sse_shift_reg_imm (code, X86_SSE_PSHIFTW, X86_SSE_SHR, ins->dreg, ins->inst_imm);
+                       break;
+               case OP_PSHRW_REG:
+                       x86_sse_shift_reg_reg (code, X86_SSE_PSRLW_REG, ins->dreg, ins->sreg2);
+                       break;
+
+               case OP_PSARW:
+                       x86_sse_shift_reg_imm (code, X86_SSE_PSHIFTW, X86_SSE_SAR, ins->dreg, ins->inst_imm);
+                       break;
+               case OP_PSARW_REG:
+                       x86_sse_shift_reg_reg (code, X86_SSE_PSRAW_REG, ins->dreg, ins->sreg2);
+                       break;
+
+               case OP_PSHLW:
+                       x86_sse_shift_reg_imm (code, X86_SSE_PSHIFTW, X86_SSE_SHL, ins->dreg, ins->inst_imm);
+                       break;
+               case OP_PSHLW_REG:
+                       x86_sse_shift_reg_reg (code, X86_SSE_PSLLW_REG, ins->dreg, ins->sreg2);
+                       break;
+
+               case OP_PSHRD:
+                       x86_sse_shift_reg_imm (code, X86_SSE_PSHIFTD, X86_SSE_SHR, ins->dreg, ins->inst_imm);
+                       break;
+               case OP_PSHRD_REG:
+                       x86_sse_shift_reg_reg (code, X86_SSE_PSRLD_REG, ins->dreg, ins->sreg2);
+                       break;
+
+               case OP_PSARD:
+                       x86_sse_shift_reg_imm (code, X86_SSE_PSHIFTD, X86_SSE_SAR, ins->dreg, ins->inst_imm);
+                       break;
+               case OP_PSARD_REG:
+                       x86_sse_shift_reg_reg (code, X86_SSE_PSRAD_REG, ins->dreg, ins->sreg2);
+                       break;
+
+               case OP_PSHLD:
+                       x86_sse_shift_reg_imm (code, X86_SSE_PSHIFTD, X86_SSE_SHL, ins->dreg, ins->inst_imm);
+                       break;
+               case OP_PSHLD_REG:
+                       x86_sse_shift_reg_reg (code, X86_SSE_PSLLD_REG, ins->dreg, ins->sreg2);
+                       break;
+
+               case OP_PSHRQ:
+                       x86_sse_shift_reg_imm (code, X86_SSE_PSHIFTQ, X86_SSE_SHR, ins->dreg, ins->inst_imm);
+                       break;
+               case OP_PSHRQ_REG:
+                       x86_sse_shift_reg_reg (code, X86_SSE_PSRLQ_REG, ins->dreg, ins->sreg2);
+                       break;
+
+               case OP_PSHLQ:
+                       x86_sse_shift_reg_imm (code, X86_SSE_PSHIFTQ, X86_SSE_SHL, ins->dreg, ins->inst_imm);
+                       break;
+               case OP_PSHLQ_REG:
+                       x86_sse_shift_reg_reg (code, X86_SSE_PSLLQ_REG, ins->dreg, ins->sreg2);
+                       break;          
+                       
+               case OP_ICONV_TO_X:
+                       x86_movd_xreg_reg (code, ins->dreg, ins->sreg1);
+                       break;
+               case OP_EXTRACT_I4:
+                       x86_movd_reg_xreg (code, ins->dreg, ins->sreg1);
+                       break;
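+               /* Sub-dword extracts: movd copies the low 32 bits of the vector, the
+                * requested element is shifted into the low bits, then sign- or
+                * zero-extended depending on the opcode. */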
+               case OP_EXTRACT_I1:
+               case OP_EXTRACT_U1:
+                       x86_movd_reg_xreg (code, ins->dreg, ins->sreg1);
+                       if (ins->inst_c0)
+                               x86_shift_reg_imm (code, X86_SHR, ins->dreg, ins->inst_c0 * 8);
+                       x86_widen_reg (code, ins->dreg, ins->dreg, ins->opcode == OP_EXTRACT_I1, FALSE);
+                       break;
+               case OP_EXTRACT_I2:
+               case OP_EXTRACT_U2:
+                       x86_movd_reg_xreg (code, ins->dreg, ins->sreg1);
+                       if (ins->inst_c0)
+                               x86_shift_reg_imm (code, X86_SHR, ins->dreg, 16);
+                       x86_widen_reg (code, ins->dreg, ins->dreg, ins->opcode == OP_EXTRACT_I2, TRUE);
+                       break;
+               case OP_EXTRACT_R8:
+                       if (ins->inst_c0)
+                               x86_sse_alu_pd_membase_reg (code, X86_SSE_MOVHPD_MEMBASE_REG, ins->backend.spill_var->inst_basereg, ins->backend.spill_var->inst_offset, ins->sreg1);
+                       else
+                               x86_sse_alu_sd_membase_reg (code, X86_SSE_MOVSD_MEMBASE_REG, ins->backend.spill_var->inst_basereg, ins->backend.spill_var->inst_offset, ins->sreg1);
+                       x86_fld_membase (code, ins->backend.spill_var->inst_basereg, ins->backend.spill_var->inst_offset, TRUE);
+                       break;
+
+               case OP_INSERT_I2:
+                       x86_sse_alu_pd_reg_reg_imm (code, X86_SSE_PINSRW, ins->sreg1, ins->sreg2, ins->inst_c0);
+                       break;
+               case OP_EXTRACTX_U2:
+                       x86_sse_alu_pd_reg_reg_imm (code, X86_SSE_PEXTRW, ins->dreg, ins->sreg1, ins->inst_c0);
+                       break;
+               case OP_INSERTX_U1_SLOW:
+                       /* sreg1 is the extracted ireg (scratch),
+                        * sreg2 is the ireg to be inserted (scratch),
+                        * dreg is the xreg that receives the value */
+
+                       /*clear the bits from the extracted word*/
+                       x86_alu_reg_imm (code, X86_AND, ins->sreg1, ins->inst_c0 & 1 ? 0x00FF : 0xFF00);
+                       /*shift the value to insert if needed*/
+                       if (ins->inst_c0 & 1)
+                               x86_shift_reg_imm (code, X86_SHL, ins->sreg2, 8);
+                       /*join them together*/
+                       x86_alu_reg_reg (code, X86_OR, ins->sreg1, ins->sreg2);
+                       x86_sse_alu_pd_reg_reg_imm (code, X86_SSE_PINSRW, ins->dreg, ins->sreg1, ins->inst_c0 / 2);
+                       break;
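+               /* Insert a 32-bit element without SSE4.1 PINSRD: the low and high halves
+                * go in as two PINSRW at word positions inst_c0 * 2 and inst_c0 * 2 + 1. */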
+               case OP_INSERTX_I4_SLOW:
+                       x86_sse_alu_pd_reg_reg_imm (code, X86_SSE_PINSRW, ins->dreg, ins->sreg2, ins->inst_c0 * 2);
+                       x86_shift_reg_imm (code, X86_SHR, ins->sreg2, 16);
+                       x86_sse_alu_pd_reg_reg_imm (code, X86_SSE_PINSRW, ins->dreg, ins->sreg2, ins->inst_c0 * 2 + 1);
+                       break;
+
+               case OP_INSERTX_R4_SLOW:
+                       x86_fst_membase (code, ins->backend.spill_var->inst_basereg, ins->backend.spill_var->inst_offset, FALSE, TRUE);
+                       /*TODO if inst_c0 == 0 use movss*/
+                       x86_sse_alu_pd_reg_membase_imm (code, X86_SSE_PINSRW, ins->dreg, ins->backend.spill_var->inst_basereg, ins->backend.spill_var->inst_offset + 0, ins->inst_c0 * 2);
+                       x86_sse_alu_pd_reg_membase_imm (code, X86_SSE_PINSRW, ins->dreg, ins->backend.spill_var->inst_basereg, ins->backend.spill_var->inst_offset + 2, ins->inst_c0 * 2 + 1);
+                       break;
+               case OP_INSERTX_R8_SLOW:
+                       x86_fst_membase (code, ins->backend.spill_var->inst_basereg, ins->backend.spill_var->inst_offset, TRUE, TRUE);
+                       if (ins->inst_c0)
+                               x86_sse_alu_pd_reg_membase (code, X86_SSE_MOVHPD_REG_MEMBASE, ins->dreg, ins->backend.spill_var->inst_basereg, ins->backend.spill_var->inst_offset);
+                       else
+                               x86_sse_alu_pd_reg_membase (code, X86_SSE_MOVSD_REG_MEMBASE, ins->dreg, ins->backend.spill_var->inst_basereg, ins->backend.spill_var->inst_offset);
+                       break;
+
+               case OP_STOREX_MEMBASE_REG:
+               case OP_STOREX_MEMBASE:
+                       x86_movups_membase_reg (code, ins->dreg, ins->inst_offset, ins->sreg1);
+                       break;
+               case OP_LOADX_MEMBASE:
+                       x86_movups_reg_membase (code, ins->dreg, ins->sreg1, ins->inst_offset);
+                       break;
+               case OP_LOADX_ALIGNED_MEMBASE:
+                       x86_movaps_reg_membase (code, ins->dreg, ins->sreg1, ins->inst_offset);
+                       break;
+               case OP_STOREX_ALIGNED_MEMBASE_REG:
+                       x86_movaps_membase_reg (code, ins->dreg, ins->inst_offset, ins->sreg1);
+                       break;
+               case OP_STOREX_NTA_MEMBASE_REG:
+                       x86_sse_alu_reg_membase (code, X86_SSE_MOVNTPS, ins->dreg, ins->sreg1, ins->inst_offset);
+                       break;
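+               /* backend.arg_info appears to carry the prefetch locality hint
+                * (nta/t0/t1/t2) and selects which prefetch variant is emitted. */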
+               case OP_PREFETCH_MEMBASE:
+                       x86_sse_alu_reg_membase (code, X86_SSE_PREFETCH, ins->backend.arg_info, ins->sreg1, ins->inst_offset);
+
+                       break;
+               case OP_XMOVE:
+                       /*FIXME the peephole pass should have killed this*/
+                       if (ins->dreg != ins->sreg1)
+                               x86_movaps_reg_reg (code, ins->dreg, ins->sreg1);
+                       break;          
+               case OP_XZERO:
+                       x86_sse_alu_pd_reg_reg (code, X86_SSE_PXOR, ins->dreg, ins->dreg);
+                       break;
+               case OP_ICONV_TO_R8_RAW:
+                       x86_mov_membase_reg (code, ins->backend.spill_var->inst_basereg, ins->backend.spill_var->inst_offset, ins->sreg1, 4);
+                       x86_fld_membase (code, ins->backend.spill_var->inst_basereg, ins->backend.spill_var->inst_offset, FALSE);
+                       break;
+
+               case OP_FCONV_TO_R8_X:
+                       x86_fst_membase (code, ins->backend.spill_var->inst_basereg, ins->backend.spill_var->inst_offset, TRUE, TRUE);
+                       x86_movsd_reg_membase (code, ins->dreg, ins->backend.spill_var->inst_basereg, ins->backend.spill_var->inst_offset);
+                       break;
+
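+               /* cvttsd2si truncates the scalar double straight to an int32; the switch
+                * below then narrows it to the integer type the original FCONV targeted. */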
+               case OP_XCONV_R8_TO_I4:
+                       x86_cvttsd2si (code, ins->dreg, ins->sreg1);
+                       switch (ins->backend.source_opcode) {
+                       case OP_FCONV_TO_I1:
+                               x86_widen_reg (code, ins->dreg, ins->dreg, TRUE, FALSE);
+                               break;
+                       case OP_FCONV_TO_U1:
+                               x86_widen_reg (code, ins->dreg, ins->dreg, FALSE, FALSE);
+                               break;
+                       case OP_FCONV_TO_I2:
+                               x86_widen_reg (code, ins->dreg, ins->dreg, TRUE, TRUE);
+                               break;
+                       case OP_FCONV_TO_U2:
+                               x86_widen_reg (code, ins->dreg, ins->dreg, FALSE, TRUE);
+                               break;
+                       }                       
+                       break;
+
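+               /* The EXPAND opcodes broadcast a scalar into every lane: the value is
+                * inserted into the low element(s) and PSHUFD replicates it (control 0
+                * for 32-bit lanes, 0x44 to duplicate the low qword for doubles). */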
+               case OP_EXPAND_I1:
+                       /* FIXME: this causes a partial register stall, maybe it would not be that bad to use shift + mask + or */
+                       /* The +4 turns a ?l register number into the matching ?h one, so this emits e.g. mov ah, al over the same register. */
+                       x86_mov_reg_reg (code, ins->dreg + 4, ins->dreg, 1);
+                       x86_sse_alu_pd_reg_reg_imm (code, X86_SSE_PINSRW, ins->dreg, ins->sreg1, 0);
+                       x86_sse_alu_pd_reg_reg_imm (code, X86_SSE_PINSRW, ins->dreg, ins->sreg1, 1);
+                       x86_sse_shift_reg_imm (code, X86_SSE_PSHUFD, ins->dreg, ins->dreg, 0);
+                       break;
+               case OP_EXPAND_I2:
+                       x86_sse_alu_pd_reg_reg_imm (code, X86_SSE_PINSRW, ins->dreg, ins->sreg1, 0);
+                       x86_sse_alu_pd_reg_reg_imm (code, X86_SSE_PINSRW, ins->dreg, ins->sreg1, 1);
+                       x86_sse_shift_reg_imm (code, X86_SSE_PSHUFD, ins->dreg, ins->dreg, 0);
+                       break;
+               case OP_EXPAND_I4:
+                       x86_movd_xreg_reg (code, ins->dreg, ins->sreg1);
+                       x86_sse_shift_reg_imm (code, X86_SSE_PSHUFD, ins->dreg, ins->dreg, 0);
+                       break;
+               case OP_EXPAND_R4:
+                       x86_fst_membase (code, ins->backend.spill_var->inst_basereg, ins->backend.spill_var->inst_offset, FALSE, TRUE);
+                       x86_movd_xreg_membase (code, ins->dreg, ins->backend.spill_var->inst_basereg, ins->backend.spill_var->inst_offset);
+                       x86_sse_shift_reg_imm (code, X86_SSE_PSHUFD, ins->dreg, ins->dreg, 0);
+                       break;
+               case OP_EXPAND_R8:
+                       x86_fst_membase (code, ins->backend.spill_var->inst_basereg, ins->backend.spill_var->inst_offset, TRUE, TRUE);
+                       x86_movsd_reg_membase (code, ins->dreg, ins->backend.spill_var->inst_basereg, ins->backend.spill_var->inst_offset);
+                       x86_sse_shift_reg_imm (code, X86_SSE_PSHUFD, ins->dreg, ins->dreg, 0x44);
+                       break;
+#endif
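+               /* The LIVERANGE pseudo ops emit no machine code; they just record the
+                * native offsets that delimit a variable's live range for debug info. */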
+               case OP_LIVERANGE_START: {
+                       if (cfg->verbose_level > 1)
+                               printf ("R%d START=0x%x\n", MONO_VARINFO (cfg, ins->inst_c0)->vreg, (int)(code - cfg->native_code));
+                       MONO_VARINFO (cfg, ins->inst_c0)->live_range_start = code - cfg->native_code;
+                       break;
+               }
+               case OP_LIVERANGE_END: {
+                       if (cfg->verbose_level > 1)
+                               printf ("R%d END=0x%x\n", MONO_VARINFO (cfg, ins->inst_c0)->vreg, (int)(code - cfg->native_code));
+                       MONO_VARINFO (cfg, ins->inst_c0)->live_range_end = code - cfg->native_code;
+                       break;
+               }
+               default:
+                       g_warning ("unknown opcode %s\n", mono_inst_name (ins->opcode));
+                       g_assert_not_reached ();
+               }
+
+               if (G_UNLIKELY ((code - cfg->native_code - offset) > max_len)) {
+                       g_warning ("wrong maximal instruction length of instruction %s (expected %d, got %d)",
+                                  mono_inst_name (ins->opcode), max_len, code - cfg->native_code - offset);
+                       g_assert_not_reached ();
+               }
+              
+               cpos += max_len;
+       }
+
+       cfg->code_len = code - cfg->native_code;
+}
+
+#endif /* DISABLE_JIT */
+
+void
+mono_arch_register_lowlevel_calls (void)
+{
+}
+
+void
+mono_arch_patch_code (MonoMethod *method, MonoDomain *domain, guint8 *code, MonoJumpInfo *ji, gboolean run_cctors)
+{
+       MonoJumpInfo *patch_info;
+       gboolean compile_aot = !run_cctors;
+
+       for (patch_info = ji; patch_info; patch_info = patch_info->next) {
+               unsigned char *ip = patch_info->ip.i + code;
+               const unsigned char *target;
+
+               target = mono_resolve_patch_target (method, domain, code, patch_info, run_cctors);
+
+               if (compile_aot) {
+                       switch (patch_info->type) {
+                       case MONO_PATCH_INFO_BB:
+                       case MONO_PATCH_INFO_LABEL:
+                               break;
+                       default:
+                               /* No need to patch these */
+                               continue;
+                       }
+               }
+
+               switch (patch_info->type) {
+               case MONO_PATCH_INFO_IP:
+                       *((gconstpointer *)(ip)) = target;
+                       break;
+               case MONO_PATCH_INFO_CLASS_INIT: {
+                       guint8 *code = ip;
+                       /* Might already have been changed to a nop */
+                       x86_call_code (code, 0);
+                       x86_patch (ip, target);
+                       break;
+               }
+               case MONO_PATCH_INFO_ABS:
+               case MONO_PATCH_INFO_METHOD:
+               case MONO_PATCH_INFO_METHOD_JUMP:
+               case MONO_PATCH_INFO_INTERNAL_METHOD:
+               case MONO_PATCH_INFO_BB:
+               case MONO_PATCH_INFO_LABEL:
+               case MONO_PATCH_INFO_RGCTX_FETCH:
+               case MONO_PATCH_INFO_GENERIC_CLASS_INIT:
+               case MONO_PATCH_INFO_MONITOR_ENTER:
+               case MONO_PATCH_INFO_MONITOR_EXIT:
+                       x86_patch (ip, target);
+                       break;
+               case MONO_PATCH_INFO_NONE:
+                       break;
+               default: {
+                       guint32 offset = mono_arch_get_patch_offset (ip);
+                       *((gconstpointer *)(ip + offset)) = target;
+                       break;
+               }
+               }
+       }
+}
+
+guint8 *
+mono_arch_emit_prolog (MonoCompile *cfg)
+{
+       MonoMethod *method = cfg->method;
+       MonoBasicBlock *bb;
+       MonoMethodSignature *sig;
+       MonoInst *inst;
+       int alloc_size, pos, max_offset, i, cfa_offset;
+       guint8 *code;
+       gboolean need_stack_frame;
+
+       cfg->code_size = MAX (cfg->header->code_size * 4, 10240);
+
+       if (cfg->prof_options & MONO_PROFILE_ENTER_LEAVE)
+               cfg->code_size += 512;
+
+       code = cfg->native_code = g_malloc (cfg->code_size);
+
+       /* Offset between RSP and the CFA */
+       cfa_offset = 0;
+
+       // CFA = sp + 4
+       cfa_offset = sizeof (gpointer);
+       mono_emit_unwind_op_def_cfa (cfg, code, X86_ESP, sizeof (gpointer));
+       // IP saved at CFA - 4
+       /* There is no IP reg on x86 */
+       mono_emit_unwind_op_offset (cfg, code, X86_NREG, -cfa_offset);
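+       /* The unwind ops emitted here and after each push below build DWARF-style
+        * CFI, so JITted frames can be unwound even without a frame pointer. */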
+
+       need_stack_frame = needs_stack_frame (cfg);
+
+       if (need_stack_frame) {
+               x86_push_reg (code, X86_EBP);
+               cfa_offset += sizeof (gpointer);
+               mono_emit_unwind_op_def_cfa_offset (cfg, code, cfa_offset);
+               mono_emit_unwind_op_offset (cfg, code, X86_EBP, - cfa_offset);
+               x86_mov_reg_reg (code, X86_EBP, X86_ESP, 4);
+               mono_emit_unwind_op_def_cfa_reg (cfg, code, X86_EBP);
+       }
+
+       alloc_size = cfg->stack_offset;
+       pos = 0;
+
+       if (method->wrapper_type == MONO_WRAPPER_NATIVE_TO_MANAGED) {
+               /* Might need to attach the thread to the JIT  or change the domain for the callback */
+               if (appdomain_tls_offset != -1 && lmf_tls_offset != -1) {
+                       guint8 *buf, *no_domain_branch;
+
+                       code = mono_x86_emit_tls_get (code, X86_EAX, appdomain_tls_offset);
+                       x86_alu_reg_imm (code, X86_CMP, X86_EAX, GPOINTER_TO_UINT (cfg->domain));
+                       no_domain_branch = code;
                        x86_branch8 (code, X86_CC_NE, 0, 0);
-                       code = emit_tls_get ( code, X86_EAX, lmf_tls_offset);
+                       code = mono_x86_emit_tls_get ( code, X86_EAX, lmf_tls_offset);
                        x86_test_reg_reg (code, X86_EAX, X86_EAX);
                        buf = code;
                        x86_branch8 (code, X86_CC_NE, 0, 0);
@@ -3670,12 +4559,13 @@ mono_arch_emit_prolog (MonoCompile *cfg)
                        code = emit_call (cfg, code, MONO_PATCH_INFO_INTERNAL_METHOD, (gpointer)"mono_jit_thread_attach");
                        x86_alu_reg_imm (code, X86_ADD, X86_ESP, 4);
                        x86_patch (buf, code);
-#ifdef PLATFORM_WIN32
+#ifdef TARGET_WIN32
                        /* The TLS key actually contains a pointer to the MonoJitTlsData structure */
                        /* FIXME: Add a separate key for LMF to avoid this */
                        x86_alu_reg_imm (code, X86_ADD, X86_EAX, G_STRUCT_OFFSET (MonoJitTlsData, lmf));
 #endif
-               } else {
+               }
+               else {
                        g_assert (!cfg->compile_aot);
                        x86_push_imm (code, cfg->domain);
                        code = emit_call (cfg, code, MONO_PATCH_INFO_INTERNAL_METHOD, (gpointer)"mono_jit_thread_attach");
@@ -3686,15 +4576,26 @@ mono_arch_emit_prolog (MonoCompile *cfg)
        if (method->save_lmf) {
                pos += sizeof (MonoLMF);
 
+               if (cfg->compile_aot)
+                       cfg->disable_aot = TRUE;
+
                /* save the current IP */
                mono_add_patch_info (cfg, code + 1 - cfg->native_code, MONO_PATCH_INFO_IP, NULL);
                x86_push_imm_template (code);
+               cfa_offset += sizeof (gpointer);
 
                /* save all caller saved regs */
                x86_push_reg (code, X86_EBP);
+               cfa_offset += sizeof (gpointer);
                x86_push_reg (code, X86_ESI);
+               cfa_offset += sizeof (gpointer);
+               mono_emit_unwind_op_offset (cfg, code, X86_ESI, - cfa_offset);
                x86_push_reg (code, X86_EDI);
+               cfa_offset += sizeof (gpointer);
+               mono_emit_unwind_op_offset (cfg, code, X86_EDI, - cfa_offset);
                x86_push_reg (code, X86_EBX);
+               cfa_offset += sizeof (gpointer);
+               mono_emit_unwind_op_offset (cfg, code, X86_EBX, - cfa_offset);
 
                if ((lmf_tls_offset != -1) && !is_win32 && !optimize_for_xen) {
                        /*
@@ -3720,8 +4621,8 @@ mono_arch_emit_prolog (MonoCompile *cfg)
 
                        if (lmf_addr_tls_offset != -1) {
                                /* Load lmf quicky using the GS register */
-                               code = emit_tls_get (code, X86_EAX, lmf_addr_tls_offset);
-#ifdef PLATFORM_WIN32
+                               code = mono_x86_emit_tls_get (code, X86_EAX, lmf_addr_tls_offset);
+#ifdef TARGET_WIN32
                                /* The TLS key actually contains a pointer to the MonoJitTlsData structure */
                                /* FIXME: Add a separate key for LMF to avoid this */
                                x86_alu_reg_imm (code, X86_ADD, X86_EAX, G_STRUCT_OFFSET (MonoJitTlsData, lmf));
@@ -3745,39 +4646,51 @@ mono_arch_emit_prolog (MonoCompile *cfg)
                if (cfg->used_int_regs & (1 << X86_EBX)) {
                        x86_push_reg (code, X86_EBX);
                        pos += 4;
+                       cfa_offset += sizeof (gpointer);
+                       mono_emit_unwind_op_offset (cfg, code, X86_EBX, - cfa_offset);
                }
 
                if (cfg->used_int_regs & (1 << X86_EDI)) {
                        x86_push_reg (code, X86_EDI);
                        pos += 4;
+                       cfa_offset += sizeof (gpointer);
+                       mono_emit_unwind_op_offset (cfg, code, X86_EDI, - cfa_offset);
                }
 
                if (cfg->used_int_regs & (1 << X86_ESI)) {
                        x86_push_reg (code, X86_ESI);
                        pos += 4;
+                       cfa_offset += sizeof (gpointer);
+                       mono_emit_unwind_op_offset (cfg, code, X86_ESI, - cfa_offset);
                }
        }
 
        alloc_size -= pos;
 
-#if __APPLE__
        /* the original alloc_size is already aligned: there is %ebp and retip pushed, so realign */
-       {
-               int tot = alloc_size + pos + 4 + 4; /* ret ip + ebp */
-               if (tot & 4) {
-                       tot += 4;
-                       alloc_size += 4;
-               }
-               if (tot & 8) {
-                       alloc_size += 8;
-               }
+       if (mono_do_x86_stack_align && need_stack_frame) {
+               int tot = alloc_size + pos + 4; /* ret ip */
+               if (need_stack_frame)
+                       tot += 4; /* ebp */
+               tot &= MONO_ARCH_FRAME_ALIGNMENT - 1;
+               if (tot)
+                       alloc_size += MONO_ARCH_FRAME_ALIGNMENT - tot;
        }
-#endif
 
        if (alloc_size) {
                /* See mono_emit_stack_alloc */
-#if defined(PLATFORM_WIN32) || defined(MONO_ARCH_SIGSEGV_ON_ALTSTACK)
+#if defined(TARGET_WIN32) || defined(MONO_ARCH_SIGSEGV_ON_ALTSTACK)
                guint32 remaining_size = alloc_size;
+               /* FIXME: handle unbounded code expansion, we should use a loop in case of more than X iterations */
+               guint32 required_code_size = ((remaining_size / 0x1000) + 1) * 8; /*8 is the max size of x86_alu_reg_imm + x86_test_membase_reg*/
+               guint32 offset = code - cfg->native_code;
+               if (G_UNLIKELY (required_code_size >= (cfg->code_size - offset))) {
+                       while (required_code_size >= (cfg->code_size - offset))
+                               cfg->code_size *= 2;
+                       cfg->native_code = g_realloc (cfg->native_code, cfg->code_size);
+                       code = cfg->native_code + offset;
+                       mono_jit_stats.code_reallocs++;
+               }
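+               /* Touch the stack one page at a time so the guard page is hit in order;
+                * this is what lets Windows (and the altstack SIGSEGV handler) grow the
+                * stack during the probe. */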
                while (remaining_size >= 0x1000) {
                        x86_alu_reg_imm (code, X86_SUB, X86_ESP, 0x1000);
                        x86_test_membase_reg (code, X86_ESP, 0, X86_ESP);
@@ -3788,15 +4701,24 @@ mono_arch_emit_prolog (MonoCompile *cfg)
 #else
                x86_alu_reg_imm (code, X86_SUB, X86_ESP, alloc_size);
 #endif
+
+               g_assert (need_stack_frame);
        }
 
-#if __APPLE_
+       if (cfg->method->wrapper_type == MONO_WRAPPER_NATIVE_TO_MANAGED ||
+                       cfg->method->wrapper_type == MONO_WRAPPER_RUNTIME_INVOKE) {
+               x86_alu_reg_imm (code, X86_AND, X86_ESP, -MONO_ARCH_FRAME_ALIGNMENT);
+       }
+
+#if DEBUG_STACK_ALIGNMENT
        /* check the stack is aligned */
-       x86_mov_reg_reg (code, X86_EDX, X86_ESP, 4);
-       x86_alu_reg_imm (code, X86_AND, X86_EDX, 15);
-       x86_alu_reg_imm (code, X86_CMP, X86_EDX, 0);
-       x86_branch_disp (code, X86_CC_EQ, 3, FALSE);
-       x86_breakpoint (code);
+       if (need_stack_frame && method->wrapper_type == MONO_WRAPPER_NONE) {
+               x86_mov_reg_reg (code, X86_ECX, X86_ESP, 4);
+               x86_alu_reg_imm (code, X86_AND, X86_ECX, MONO_ARCH_FRAME_ALIGNMENT - 1);
+               x86_alu_reg_imm (code, X86_CMP, X86_ECX, 0);
+               x86_branch_disp (code, X86_CC_EQ, 3, FALSE);
+               x86_breakpoint (code);
+       }
 #endif
 
         /* compute max_offset in order to use short forward jumps */
@@ -3821,6 +4743,13 @@ mono_arch_emit_prolog (MonoCompile *cfg)
                }
        }
 
+       /* store runtime generic context */
+       if (cfg->rgctx_var) {
+               g_assert (cfg->rgctx_var->opcode == OP_REGOFFSET && cfg->rgctx_var->inst_basereg == X86_EBP);
+
+               x86_mov_membase_reg (code, X86_EBP, cfg->rgctx_var->inst_offset, MONO_ARCH_RGCTX_REG, 4);
+       }
+
        if (mono_jit_trace_calls != NULL && mono_trace_eval (method))
                code = mono_arch_instrument_prolog (cfg, mono_trace_enter_method, code, TRUE);
 
@@ -3831,6 +4760,7 @@ mono_arch_emit_prolog (MonoCompile *cfg)
        for (i = 0; i < sig->param_count + sig->hasthis; ++i) {
                inst = cfg->args [pos];
                if (inst->opcode == OP_REGVAR) {
+                       g_assert (need_stack_frame);
                        x86_mov_reg_membase (code, inst->dreg, X86_EBP, inst->inst_offset, 4);
                        if (cfg->verbose_level > 2)
                                g_print ("Argument %d assigned to register %s\n", pos, mono_arch_regname (inst->dreg));
@@ -3840,6 +4770,8 @@ mono_arch_emit_prolog (MonoCompile *cfg)
 
        cfg->code_len = code - cfg->native_code;
 
+       g_assert (cfg->code_len < cfg->code_size);
+
        return code;
 }
 
@@ -3853,7 +4785,8 @@ mono_arch_emit_epilog (MonoCompile *cfg)
        guint8 *code;
        int max_epilog_size = 16;
        CallInfo *cinfo;
-       
+       gboolean need_stack_frame = needs_stack_frame (cfg);
+
        if (cfg->method->save_lmf)
                max_epilog_size += 128;
 
@@ -3875,6 +4808,23 @@ mono_arch_emit_epilog (MonoCompile *cfg)
                gint32 prev_lmf_reg;
                gint32 lmf_offset = -sizeof (MonoLMF);
 
+               /* check if we need to restore protection of the stack after a stack overflow */
+               if (mono_get_jit_tls_offset () != -1) {
+                       guint8 *patch;
+                       code = mono_x86_emit_tls_get (code, X86_ECX, mono_get_jit_tls_offset ());
+                       /* we load the value in a separate instruction: this mechanism may be
+                        * used later as a safer way to do thread interruption
+                        */
+                       x86_mov_reg_membase (code, X86_ECX, X86_ECX, G_STRUCT_OFFSET (MonoJitTlsData, restore_stack_prot), 4);
+                       x86_alu_reg_imm (code, X86_CMP, X86_ECX, 0);
+                       patch = code;
+                       x86_branch8 (code, X86_CC_Z, 0, FALSE);
+                       /* note that the call trampoline will preserve eax/edx */
+                       x86_call_reg (code, X86_ECX);
+                       x86_patch (patch, code);
+               } else {
+                       /* FIXME: maybe save the jit tls in the prolog */
+               }
                if ((lmf_tls_offset != -1) && !is_win32 && !optimize_for_xen) {
                        /*
                         * Optimized version which uses the mono_lmf TLS variable instead of indirection
@@ -3888,7 +4838,7 @@ mono_arch_emit_epilog (MonoCompile *cfg)
                        x86_mov_mem_reg (code, lmf_tls_offset, X86_ECX, 4);
                } else {
                        /* Find a spare register */
-                       switch (mono_type_get_underlying_type (sig->ret)->type) {
+                       switch (mini_type_get_underlying_type (cfg->generic_sharing_context, sig->ret)->type) {
                        case MONO_TYPE_I8:
                        case MONO_TYPE_U8:
                                prev_lmf_reg = X86_EDI;
@@ -3933,8 +4883,10 @@ mono_arch_emit_epilog (MonoCompile *cfg)
                        pos -= 4;
                }
 
-               if (pos)
+               if (pos) {
+                       g_assert (need_stack_frame);
                        x86_lea_membase (code, X86_ESP, X86_EBP, pos);
+               }
 
                if (cfg->used_int_regs & (1 << X86_ESI)) {
                        x86_pop_reg (code, X86_ESI);
@@ -3948,7 +4900,7 @@ mono_arch_emit_epilog (MonoCompile *cfg)
        }
 
        /* Load returned vtypes into registers if needed */
-       cinfo = get_call_info (cfg, cfg->mempool, sig, FALSE);
+       cinfo = get_call_info (cfg->generic_sharing_context, cfg->mempool, sig, FALSE);
        if (cinfo->ret.storage == ArgValuetypeInReg) {
                for (quad = 0; quad < 2; quad ++) {
                        switch (cinfo->ret.pair_storage [quad]) {
@@ -3969,7 +4921,8 @@ mono_arch_emit_epilog (MonoCompile *cfg)
                }
        }
 
-       x86_leave (code);
+       if (need_stack_frame)
+               x86_leave (code);
 
        if (CALLCONV_IS_STDCALL (sig)) {
                MonoJitArgumentInfo *arg_info = alloca (sizeof (MonoJitArgumentInfo) * (sig->param_count + 1));
@@ -3980,10 +4933,12 @@ mono_arch_emit_epilog (MonoCompile *cfg)
        else
                stack_to_pop = 0;
 
-       if (stack_to_pop)
+       if (stack_to_pop) {
+               g_assert (need_stack_frame);
                x86_ret_imm (code, stack_to_pop);
-       else
+       } else {
                x86_ret (code);
+       }
 
        cfg->code_len = code - cfg->native_code;
 
@@ -4053,6 +5008,8 @@ mono_arch_emit_exceptions (MonoCompile *cfg)
                                /* Compute size of code following the push <OFFSET> */
                                size = 5 + 5;
 
+                               /* This is aligned to 16 bytes by the callee. This way we save a few bytes here. */
+
                                if ((code - cfg->native_code) - throw_ip < 126 - size) {
                                        /* Use the shorter form */
                                        buf = buf2 = code;
@@ -4107,6 +5064,12 @@ mono_arch_flush_register_windows (void)
 {
 }
 
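+/* Any immediate can be encoded directly on x86; 64-bit operations are decomposed into 32-bit parts, so this always returns TRUE. */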
+gboolean 
+mono_arch_is_inst_imm (gint64 imm)
+{
+       return TRUE;
+}
+
 /*
  * Support for fast access to the thread-local lmf structure using the GS
  * segment register on NPTL + kernel 2.6.x.
@@ -4119,22 +5082,19 @@ mono_arch_setup_jit_tls_data (MonoJitTlsData *tls)
 {
        if (!tls_offset_inited) {
                if (!getenv ("MONO_NO_TLS")) {
-#ifdef PLATFORM_WIN32
+#ifdef TARGET_WIN32
                        /* 
                         * We need to init this multiple times, since when we are first called, the key might not
                         * be initialized yet.
                         */
                        appdomain_tls_offset = mono_domain_get_tls_key ();
                        lmf_tls_offset = mono_get_jit_tls_key ();
-                       thread_tls_offset = mono_thread_get_tls_key ();
 
                        /* Only 64 tls entries can be accessed using inline code */
                        if (appdomain_tls_offset >= 64)
                                appdomain_tls_offset = -1;
                        if (lmf_tls_offset >= 64)
                                lmf_tls_offset = -1;
-                       if (thread_tls_offset >= 64)
-                               thread_tls_offset = -1;
 #else
 #if MONO_XEN_OPT
                        optimize_for_xen = access ("/proc/xen", F_OK) == 0;
@@ -4143,7 +5103,6 @@ mono_arch_setup_jit_tls_data (MonoJitTlsData *tls)
                        appdomain_tls_offset = mono_domain_get_tls_offset ();
                        lmf_tls_offset = mono_get_lmf_tls_offset ();
                        lmf_addr_tls_offset = mono_get_lmf_addr_tls_offset ();
-                       thread_tls_offset = mono_thread_get_tls_offset ();
 #endif
                }
        }               
@@ -4154,67 +5113,7 @@ mono_arch_free_jit_tls_data (MonoJitTlsData *tls)
 {
 }
 
-void
-mono_arch_emit_this_vret_args (MonoCompile *cfg, MonoCallInst *inst, int this_reg, int this_type, int vt_reg)
-{
-       MonoCallInst *call = (MonoCallInst*)inst;
-       CallInfo *cinfo = get_call_info (cfg, cfg->mempool, inst->signature, FALSE);
-
-       /* add the this argument */
-       if (this_reg != -1) {
-               if (cinfo->args [0].storage == ArgInIReg) {
-                       MonoInst *this;
-                       MONO_INST_NEW (cfg, this, OP_MOVE);
-                       this->type = this_type;
-                       this->sreg1 = this_reg;
-                       this->dreg = mono_regstate_next_int (cfg->rs);
-                       mono_bblock_add_inst (cfg->cbb, this);
-
-                       mono_call_inst_add_outarg_reg (cfg, call, this->dreg, cinfo->args [0].reg, FALSE);
-               }
-               else {
-                       MonoInst *this;
-                       MONO_INST_NEW (cfg, this, OP_OUTARG);
-                       this->type = this_type;
-                       this->sreg1 = this_reg;
-                       mono_bblock_add_inst (cfg->cbb, this);
-               }
-       }
-
-       if (vt_reg != -1) {
-               MonoInst *vtarg;
-
-               if (cinfo->ret.storage == ArgValuetypeInReg) {
-                       /*
-                        * The valuetype is in EAX:EDX after the call, needs to be copied to
-                        * the stack. Save the address here, so the call instruction can
-                        * access it.
-                        */
-                       MONO_INST_NEW (cfg, vtarg, OP_STORE_MEMBASE_REG);
-                       vtarg->inst_destbasereg = X86_ESP;
-                       vtarg->inst_offset = inst->stack_usage;
-                       vtarg->sreg1 = vt_reg;
-                       mono_bblock_add_inst (cfg->cbb, vtarg);
-               }
-               else if (cinfo->ret.storage == ArgInIReg) {
-                       /* The return address is passed in a register */
-                       MONO_INST_NEW (cfg, vtarg, OP_MOVE);
-                       vtarg->sreg1 = vt_reg;
-                       vtarg->dreg = mono_regstate_next_int (cfg->rs);
-                       mono_bblock_add_inst (cfg->cbb, vtarg);
-
-                       mono_call_inst_add_outarg_reg (cfg, call, vtarg->dreg, cinfo->ret.reg, FALSE);
-               } else {
-                       MonoInst *vtarg;
-                       MONO_INST_NEW (cfg, vtarg, OP_OUTARG);
-                       vtarg->type = STACK_MP;
-                       vtarg->sreg1 = vt_reg;
-                       mono_bblock_add_inst (cfg->cbb, vtarg);
-               }
-       }
-}
-
-#ifdef MONO_ARCH_HAVE_IMT
+#ifdef MONO_ARCH_HAVE_IMT
 
 // Linear handler, the bsearch head compare is shorter
 //[2 + 4] x86_alu_reg_imm (code, X86_CMP, ins->sreg1, ins->inst_imm);
@@ -4227,6 +5126,7 @@ mono_arch_emit_this_vret_args (MonoCompile *cfg, MonoCallInst *inst, int this_re
 #define BR_LARGE_SIZE 5
 #define JUMP_IMM_SIZE 6
 #define ENABLE_WRONG_METHOD_CHECK 0
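+/* Set DEBUG_IMT to 1 to disassemble every IMT thunk that gets built. */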
+#define DEBUG_IMT 0
 
 static int
 imt_branch_distance (MonoIMTCheckItem **imt_entries, int start, int target)
@@ -4241,7 +5141,8 @@ imt_branch_distance (MonoIMTCheckItem **imt_entries, int start, int target)
  * LOCKING: called with the domain lock held
  */
 gpointer
-mono_arch_build_imt_thunk (MonoVTable *vtable, MonoDomain *domain, MonoIMTCheckItem **imt_entries, int count)
+mono_arch_build_imt_thunk (MonoVTable *vtable, MonoDomain *domain, MonoIMTCheckItem **imt_entries, int count,
+       gpointer fail_tramp)
 {
        int i;
        int size = 0;
@@ -4255,10 +5156,14 @@ mono_arch_build_imt_thunk (MonoVTable *vtable, MonoDomain *domain, MonoIMTCheckI
                                        item->chunk_size += CMP_SIZE;
                                item->chunk_size += BR_SMALL_SIZE + JUMP_IMM_SIZE;
                        } else {
-                               item->chunk_size += JUMP_IMM_SIZE;
+                               if (fail_tramp) {
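+                                       /* The fail_tramp case needs a compare, a short branch and two jumps per entry. */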
+                                       item->chunk_size += CMP_SIZE + BR_SMALL_SIZE + JUMP_IMM_SIZE * 2;
+                               } else {
+                                       item->chunk_size += JUMP_IMM_SIZE;
 #if ENABLE_WRONG_METHOD_CHECK
-                               item->chunk_size += CMP_SIZE + BR_SMALL_SIZE + 1;
+                                       item->chunk_size += CMP_SIZE + BR_SMALL_SIZE + 1;
 #endif
+                               }
                        }
                } else {
                        item->chunk_size += CMP_SIZE + BR_LARGE_SIZE;
@@ -4266,7 +5171,10 @@ mono_arch_build_imt_thunk (MonoVTable *vtable, MonoDomain *domain, MonoIMTCheckI
                }
                size += item->chunk_size;
        }
-       code = mono_code_manager_reserve (domain->code_mp, size);
+       if (fail_tramp)
+               code = mono_method_alloc_generic_virtual_thunk (domain, size);
+       else
+               code = mono_domain_code_reserve (domain, size);
        start = code;
        for (i = 0; i < count; ++i) {
                MonoIMTCheckItem *item = imt_entries [i];
@@ -4274,26 +5182,45 @@ mono_arch_build_imt_thunk (MonoVTable *vtable, MonoDomain *domain, MonoIMTCheckI
                if (item->is_equals) {
                        if (item->check_target_idx) {
                                if (!item->compare_done)
-                                       x86_alu_reg_imm (code, X86_CMP, MONO_ARCH_IMT_REG, (guint32)item->method);
+                                       x86_alu_reg_imm (code, X86_CMP, MONO_ARCH_IMT_REG, (guint32)item->key);
                                item->jmp_code = code;
                                x86_branch8 (code, X86_CC_NE, 0, FALSE);
-                               x86_jump_mem (code, & (vtable->vtable [item->vtable_slot]));
+                               if (item->has_target_code)
+                                       x86_jump_code (code, item->value.target_code);
+                               else
+                                       x86_jump_mem (code, & (vtable->vtable [item->value.vtable_slot]));
                        } else {
-                               /* enable the commented code to assert on wrong method */
+                               if (fail_tramp) {
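+                                       /* On a key mismatch we branch over the jump to the target and land on the jump to fail_tramp. */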
+                                       x86_alu_reg_imm (code, X86_CMP, MONO_ARCH_IMT_REG, (guint32)item->key);
+                                       item->jmp_code = code;
+                                       x86_branch8 (code, X86_CC_NE, 0, FALSE);
+                                       if (item->has_target_code)
+                                               x86_jump_code (code, item->value.target_code);
+                                       else
+                                               x86_jump_mem (code, & (vtable->vtable [item->value.vtable_slot]));
+                                       x86_patch (item->jmp_code, code);
+                                       x86_jump_code (code, fail_tramp);
+                                       item->jmp_code = NULL;
+                               } else {
+                                       /* enable the commented code to assert on wrong method */
 #if ENABLE_WRONG_METHOD_CHECK
-                               x86_alu_reg_imm (code, X86_CMP, MONO_ARCH_IMT_REG, (guint32)item->method);
-                               item->jmp_code = code;
-                               x86_branch8 (code, X86_CC_NE, 0, FALSE);
+                                       x86_alu_reg_imm (code, X86_CMP, MONO_ARCH_IMT_REG, (guint32)item->key);
+                                       item->jmp_code = code;
+                                       x86_branch8 (code, X86_CC_NE, 0, FALSE);
 #endif
-                               x86_jump_mem (code, & (vtable->vtable [item->vtable_slot]));
+                                       if (item->has_target_code)
+                                               x86_jump_code (code, item->value.target_code);
+                                       else
+                                               x86_jump_mem (code, & (vtable->vtable [item->value.vtable_slot]));
 #if ENABLE_WRONG_METHOD_CHECK
-                               x86_patch (item->jmp_code, code);
-                               x86_breakpoint (code);
-                               item->jmp_code = NULL;
+                                       x86_patch (item->jmp_code, code);
+                                       x86_breakpoint (code);
+                                       item->jmp_code = NULL;
 #endif
+                               }
                        }
                } else {
-                       x86_alu_reg_imm (code, X86_CMP, MONO_ARCH_IMT_REG, (guint32)item->method);
+                       x86_alu_reg_imm (code, X86_CMP, MONO_ARCH_IMT_REG, (guint32)item->key);
                        item->jmp_code = code;
                        if (x86_is_imm8 (imt_branch_distance (imt_entries, i, item->check_target_idx)))
                                x86_branch8 (code, X86_CC_GE, 0, FALSE);
@@ -4310,66 +5237,87 @@ mono_arch_build_imt_thunk (MonoVTable *vtable, MonoDomain *domain, MonoIMTCheckI
                        }
                }
        }
-               
-       mono_stats.imt_thunks_size += code - start;
+
+       if (!fail_tramp)
+               mono_stats.imt_thunks_size += code - start;
        g_assert (code - start <= size);
+
+#if DEBUG_IMT
+       {
+               char *buff = g_strdup_printf ("thunk_for_class_%s_%s_entries_%d", vtable->klass->name_space, vtable->klass->name, count);
+               mono_disassemble_code (NULL, (guint8*)start, code - start, buff);
+               g_free (buff);
+       }
+#endif
+
        return start;
 }
 
 MonoMethod*
-mono_arch_find_imt_method (gpointer *regs, guint8 *code)
+mono_arch_find_imt_method (mgreg_t *regs, guint8 *code)
 {
        return (MonoMethod*) regs [MONO_ARCH_IMT_REG];
 }
+#endif
 
-MonoObject*
-mono_arch_find_this_argument (gpointer *regs, MonoMethod *method)
+MonoVTable*
+mono_arch_find_static_call_vtable (mgreg_t *regs, guint8 *code)
 {
-       MonoMethodSignature *sig = mono_method_signature (method);
-       CallInfo *cinfo = get_call_info (NULL, NULL, sig, FALSE);
-       int this_argument_offset;
-       MonoObject *this_argument;
-
-       /* 
-        * this is the offset of the this arg from esp as saved at the start of 
-        * mono_arch_create_trampoline_code () in tramp-x86.c.
-        */
-       this_argument_offset = 5;
-       if (MONO_TYPE_ISSTRUCT (sig->ret) && (cinfo->ret.storage == ArgOnStack))
-               this_argument_offset++;
-
-       this_argument = * (MonoObject**) (((guint8*) regs [X86_ESP]) + this_argument_offset * sizeof (gpointer));
-
-       g_free (cinfo);
-       return this_argument;
+       return (MonoVTable*) regs [MONO_ARCH_RGCTX_REG];
 }
-#endif
 
 MonoInst*
-mono_arch_get_inst_for_method (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
+mono_arch_emit_inst_for_method (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
 {
        MonoInst *ins = NULL;
+       int opcode = 0;
 
        if (cmethod->klass == mono_defaults.math_class) {
                if (strcmp (cmethod->name, "Sin") == 0) {
-                       MONO_INST_NEW (cfg, ins, OP_SIN);
-                       ins->inst_i0 = args [0];
+                       opcode = OP_SIN;
                } else if (strcmp (cmethod->name, "Cos") == 0) {
-                       MONO_INST_NEW (cfg, ins, OP_COS);
-                       ins->inst_i0 = args [0];
+                       opcode = OP_COS;
                } else if (strcmp (cmethod->name, "Tan") == 0) {
-                       MONO_INST_NEW (cfg, ins, OP_TAN);
-                       ins->inst_i0 = args [0];
+                       opcode = OP_TAN;
                } else if (strcmp (cmethod->name, "Atan") == 0) {
-                       MONO_INST_NEW (cfg, ins, OP_ATAN);
-                       ins->inst_i0 = args [0];
+                       opcode = OP_ATAN;
                } else if (strcmp (cmethod->name, "Sqrt") == 0) {
-                       MONO_INST_NEW (cfg, ins, OP_SQRT);
-                       ins->inst_i0 = args [0];
+                       opcode = OP_SQRT;
                } else if (strcmp (cmethod->name, "Abs") == 0 && fsig->params [0]->type == MONO_TYPE_R8) {
-                       MONO_INST_NEW (cfg, ins, OP_ABS);
-                       ins->inst_i0 = args [0];
+                       opcode = OP_ABS;
+               } else if (strcmp (cmethod->name, "Round") == 0 && fsig->param_count == 1 && fsig->params [0]->type == MONO_TYPE_R8) {
+                       opcode = OP_ROUND;
+               }
+               
+               if (opcode) {
+                       MONO_INST_NEW (cfg, ins, opcode);
+                       ins->type = STACK_R8;
+                       ins->dreg = mono_alloc_freg (cfg);
+                       ins->sreg1 = args [0]->dreg;
+                       MONO_ADD_INS (cfg->cbb, ins);
                }
+
+               if (cfg->opt & MONO_OPT_CMOV) {
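+                       /* Math.Min/Max over int32 map to OP_IMIN/OP_IMAX, which rely on cmov being enabled. */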
+                       int opcode = 0;
+
+                       if (strcmp (cmethod->name, "Min") == 0) {
+                               if (fsig->params [0]->type == MONO_TYPE_I4)
+                                       opcode = OP_IMIN;
+                       } else if (strcmp (cmethod->name, "Max") == 0) {
+                               if (fsig->params [0]->type == MONO_TYPE_I4)
+                                       opcode = OP_IMAX;
+                       }               
+
+                       if (opcode) {
+                               MONO_INST_NEW (cfg, ins, opcode);
+                               ins->type = STACK_I4;
+                               ins->dreg = mono_alloc_ireg (cfg);
+                               ins->sreg1 = args [0]->dreg;
+                               ins->sreg2 = args [1]->dreg;
+                               MONO_ADD_INS (cfg->cbb, ins);
+                       }
+               }
+
 #if 0
                /* OP_FREM is not IEEE compatible */
                else if (strcmp (cmethod->name, "IEEERemainder") == 0) {
@@ -4383,7 +5331,6 @@ mono_arch_get_inst_for_method (MonoCompile *cfg, MonoMethod *cmethod, MonoMethod
        return ins;
 }
 
-
 gboolean
 mono_arch_print_tree (MonoInst *tree, int arity)
 {
@@ -4393,24 +5340,14 @@ mono_arch_print_tree (MonoInst *tree, int arity)
 MonoInst* mono_arch_get_domain_intrinsic (MonoCompile* cfg)
 {
        MonoInst* ins;
-       
-       if (appdomain_tls_offset == -1)
-               return NULL;
 
-       MONO_INST_NEW (cfg, ins, OP_TLS_GET);
-       ins->inst_offset = appdomain_tls_offset;
-       return ins;
-}
+       return NULL;
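+       /* NOTE: the TLS-based fast path below is unreachable because of the early return above. */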
 
-MonoInst* mono_arch_get_thread_intrinsic (MonoCompile* cfg)
-{
-       MonoInst* ins;
-
-       if (thread_tls_offset == -1)
+       if (appdomain_tls_offset == -1)
                return NULL;
 
        MONO_INST_NEW (cfg, ins, OP_TLS_GET);
-       ins->inst_offset = thread_tls_offset;
+       ins->inst_offset = appdomain_tls_offset;
        return ins;
 }
 
@@ -4439,18 +5376,42 @@ mono_arch_get_patch_offset (guint8 *code)
        else if ((code [0] >= 0x58) && (code [0] <= 0x58 + X86_NREG) && (code [1] == 0x81))
                /* pop <REG>; add <OFFSET>, <REG> */
                return 3;
+       else if ((code [0] >= 0xb8) && (code [0] < 0xb8 + 8))
+               /* mov <REG>, imm */
+               return 1;
        else {
                g_assert_not_reached ();
                return -1;
        }
 }
 
+/**
+ * mono_breakpoint_clean_code:
+ *
+ * Copy @size bytes from @code - @offset to the buffer @buf. If the debugger inserted software
+ * breakpoints in the original code, they are removed in the copy.
+ *
+ * Returns TRUE if no sw breakpoint was present.
+ */
 gboolean
-mono_breakpoint_clean_code (guint8 *code, guint8 *buf, int size)
+mono_breakpoint_clean_code (guint8 *method_start, guint8 *code, int offset, guint8 *buf, int size)
 {
        int i;
        gboolean can_write = TRUE;
-       memcpy (buf, code, size);
+       /*
+        * If method_start is non-NULL we need to perform bound checks, since accessing memory at
+        * code - offset could take us before the start of the method and into a different page
+        * that is not mapped, or make us read incorrect data. In that case we zero-fill the
+        * out-of-range bytes instead.
+        */
+       if (!method_start || code - offset >= method_start) {
+               memcpy (buf, code - offset, size);
+       } else {
+               int diff = code - method_start;
+               memset (buf, 0, size);
+               memcpy (buf + offset - diff, method_start, diff + size - offset);
+       }
+       code -= offset;
        for (i = 0; i < MONO_BREAKPOINT_ARRAY_SIZE; ++i) {
                int idx = mono_breakpoint_info_index [i];
                guint8 *ptr;
@@ -4468,93 +5429,98 @@ mono_breakpoint_clean_code (guint8 *code, guint8 *buf, int size)
 }
 
 gpointer
-mono_arch_get_vcall_slot (guint8 *code, gpointer *regs, int *displacement)
+mono_arch_get_vcall_slot (guint8 *code, mgreg_t *regs, int *displacement)
 {
        guint8 buf [8];
        guint8 reg = 0;
        gint32 disp = 0;
 
-       mono_breakpoint_clean_code (code - 8, buf, sizeof (buf));
+       mono_breakpoint_clean_code (NULL, code, 8, buf, sizeof (buf));
        code = buf + 8;
 
        *displacement = 0;
 
-       /* go to the start of the call instruction
-        *
-        * address_byte = (m << 6) | (o << 3) | reg
-        * call opcode: 0xff address_byte displacement
-        * 0xff m=1,o=2 imm8
-        * 0xff m=2,o=2 imm32
-        */
        code -= 6;
 
        /* 
         * A given byte sequence can match more than one case here, so we have to be
         * really careful about the ordering of the cases. Longer sequences
         * come first.
+        * There are two types of calls:
+        * - direct calls: 0xff address_byte 8/32 bits displacement
+        * - indirect calls: nop nop nop <call>
+        * The nops make sure we don't confuse the instruction preceding an indirect
+        * call with a direct call.
         */
-       if ((code [-2] == 0x8b) && (x86_modrm_mod (code [-1]) == 0x2) && (code [4] == 0xff) && (x86_modrm_reg (code [5]) == 0x2) && (x86_modrm_mod (code [5]) == 0x0)) {
-               /*
-                * This is an interface call
-                * 8b 80 0c e8 ff ff       mov    0xffffe80c(%eax),%eax
-                * ff 10                   call   *(%eax)
-                */
-               reg = x86_modrm_rm (code [5]);
-               disp = 0;
-#ifdef MONO_ARCH_HAVE_IMT
-       } else if ((code [-2] == 0xba) && (code [3] == 0xff) && (x86_modrm_mod (code [4]) == 1) && (x86_modrm_reg (code [4]) == 2) && ((signed char)code [5] < 0)) {
-               /* IMT-based interface calls: with MONO_ARCH_IMT_REG == edx
-                * ba 14 f8 28 08          mov    $0x828f814,%edx
-                * ff 50 fc                call   *0xfffffffc(%eax)
-                */
-               reg = code [4] & 0x07;
-               disp = (signed char)code [5];
-#endif
-       } else if ((code [1] != 0xe8) && (code [3] == 0xff) && ((code [4] & 0x18) == 0x10) && ((code [4] >> 6) == 1)) {
+       if ((code [1] != 0xe8) && (code [3] == 0xff) && ((code [4] & 0x18) == 0x10) && ((code [4] >> 6) == 1)) {
                reg = code [4] & 0x07;
                disp = (signed char)code [5];
-       } else {
-               if ((code [0] == 0xff) && ((code [1] & 0x18) == 0x10) && ((code [1] >> 6) == 2)) {
-                       reg = code [1] & 0x07;
-                       disp = *((gint32*)(code + 2));
-               } else if ((code [1] == 0xe8)) {
-                       return NULL;
-               } else if ((code [4] == 0xff) && (((code [5] >> 6) & 0x3) == 0) && (((code [5] >> 3) & 0x7) == 2)) {
-                       /*
-                        * This is a interface call
-                        * 8b 40 30   mov    0x30(%eax),%eax
-                        * ff 10      call   *(%eax)
-                        */
-                       disp = 0;
-                       reg = code [5] & 0x07;
-               }
-               else
+       } else if ((code [0] == 0xff) && ((code [1] & 0x18) == 0x10) && ((code [1] >> 6) == 2)) {
+               reg = code [1] & 0x07;
+               disp = *((gint32*)(code + 2));
+       } else if ((code [1] == 0xe8)) {
                        return NULL;
+       } else if ((code [4] == 0xff) && (((code [5] >> 6) & 0x3) == 0) && (((code [5] >> 3) & 0x7) == 2)) {
+               /*
+                * This is an interface call
+                * 8b 40 30   mov    0x30(%eax),%eax
+                * ff 10      call   *(%eax)
+                */
+               disp = 0;
+               reg = code [5] & 0x07;
        }
+       else
+               return NULL;
 
        *displacement = disp;
-       return regs [reg];
+       return (gpointer)regs [reg];
 }
 
-gpointer*
-mono_arch_get_vcall_slot_addr (guint8 *code, gpointer *regs)
+/*
+ * mono_x86_get_this_arg_offset:
+ *
+ *   Return the offset of the stack location where the 'this' argument is passed during a
+ * virtual call.
+ */
+guint32
+mono_x86_get_this_arg_offset (MonoGenericSharingContext *gsctx, MonoMethodSignature *sig)
 {
-       gpointer vt;
-       int displacement;
-       vt = mono_arch_get_vcall_slot (code, regs, &displacement);
-       if (!vt)
-               return NULL;
-       return (gpointer*)((char*)vt + displacement);
+       CallInfo *cinfo = NULL;
+       int offset;
+
+       if (MONO_TYPE_ISSTRUCT (sig->ret)) {
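+               /* A vtype return is passed as a hidden argument, so the this pointer is not the first stack slot. */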
+               cinfo = get_call_info (gsctx, NULL, sig, FALSE);
+
+               offset = cinfo->args [0].offset;
+       } else {
+               offset = 0;
+       }
+
+       return offset;
 }
 
 gpointer
-mono_arch_get_this_arg_from_call (MonoMethodSignature *sig, gssize *regs, guint8 *code)
+mono_arch_get_this_arg_from_call (MonoGenericSharingContext *gsctx, MonoMethodSignature *sig,
+               mgreg_t *regs, guint8 *code)
 {
        guint32 esp = regs [X86_ESP];
-       CallInfo *cinfo;
+       CallInfo *cinfo = NULL;
        gpointer res;
+       int offset;
 
-       cinfo = get_call_info (NULL, NULL, sig, FALSE);
+       /* 
+        * Avoid expensive calls to get_generic_context_from_code () + get_call_info 
+        * if possible.
+        */
+       if (MONO_TYPE_ISSTRUCT (sig->ret)) {
+               if (!gsctx && code)
+                       gsctx = mono_get_generic_context_from_code (code);
+               cinfo = get_call_info (gsctx, NULL, sig, FALSE);
+
+               offset = cinfo->args [0].offset;
+       } else {
+               offset = 0;
+       }
 
        /*
         * The stack looks like:
@@ -4564,8 +5530,9 @@ mono_arch_get_this_arg_from_call (MonoMethodSignature *sig, gssize *regs, guint8
         * <return addr>
         * <4 pointers pushed by mono_arch_create_trampoline_code ()>
         */
-       res = (((MonoObject**)esp) [5 + (cinfo->args [0].offset / 4)]);
-       g_free (cinfo);
+       res = (((MonoObject**)esp) [5 + (offset / 4)]);
+       if (cinfo)
+               g_free (cinfo);
        return res;
 }
 
@@ -4591,11 +5558,8 @@ mono_arch_get_delegate_invoke_impl (MonoMethodSignature *sig, gboolean has_targe
 
        if (has_target) {
                static guint8* cached = NULL;
-               mono_mini_arch_lock ();
-               if (cached) {
-                       mono_mini_arch_unlock ();
+               if (cached)
                        return cached;
-               }
                
                start = code = mono_global_codeman_reserve (64);
 
@@ -4607,9 +5571,11 @@ mono_arch_get_delegate_invoke_impl (MonoMethodSignature *sig, gboolean has_targe
 
                g_assert ((code - start) < 64);
 
-               cached = start;
                mono_debug_add_delegate_trampoline (start, code - start);
-               mono_mini_arch_unlock ();
+
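+               /* Make sure the trampoline code is fully written out before other threads can observe the cached pointer. */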
+               mono_memory_barrier ();
+
+               cached = start;
        } else {
                static guint8* cache [MAX_ARCH_DELEGATE_PARAMS + 1] = {NULL};
                int i = 0;
@@ -4620,12 +5586,9 @@ mono_arch_get_delegate_invoke_impl (MonoMethodSignature *sig, gboolean has_targe
                        if (!mono_is_regsize_var (sig->params [i]))
                                return NULL;
 
-               mono_mini_arch_lock ();
                code = cache [sig->param_count];
-               if (code) {
-                       mono_mini_arch_unlock ();
+               if (code)
                        return code;
-               }
 
                /*
                 * The stack contains:
@@ -4658,11 +5621,416 @@ mono_arch_get_delegate_invoke_impl (MonoMethodSignature *sig, gboolean has_targe
 
                g_assert ((code - start) < code_reserve);
 
-               cache [sig->param_count] = start;
-
                mono_debug_add_delegate_trampoline (start, code - start);
-               mono_mini_arch_unlock ();
+
+               mono_memory_barrier ();
+
+               cache [sig->param_count] = start;
        }
 
        return start;
 }
+
+gpointer
+mono_arch_context_get_int_reg (MonoContext *ctx, int reg)
+{
+       switch (reg) {
+       case X86_EAX: return (gpointer)ctx->eax;
+       case X86_EBX: return (gpointer)ctx->ebx;
+       case X86_ECX: return (gpointer)ctx->ecx;
+       case X86_EDX: return (gpointer)ctx->edx;
+       case X86_ESP: return (gpointer)ctx->esp;
+       case X86_EBP: return (gpointer)ctx->ebp;
+       case X86_ESI: return (gpointer)ctx->esi;
+       case X86_EDI: return (gpointer)ctx->edi;
+       default: g_assert_not_reached ();
+       }
+}
+
+#ifdef MONO_ARCH_SIMD_INTRINSICS
+
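+/* Lazily create the spill slot used to move a value from the x87 stack into an XMM register. */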
+static MonoInst*
+get_float_to_x_spill_area (MonoCompile *cfg)
+{
+       if (!cfg->fconv_to_r8_x_var) {
+               cfg->fconv_to_r8_x_var = mono_compile_create_var (cfg, &mono_defaults.double_class->byval_arg, OP_LOCAL);
+               cfg->fconv_to_r8_x_var->flags |= MONO_INST_VOLATILE; /* FIXME: use a flag that only blocks register allocation */
+       }       
+       return cfg->fconv_to_r8_x_var;
+}
+
+/*
+ * Convert all fconv opts that MONO_OPT_SSE2 would get wrong. 
+ */
+void
+mono_arch_decompose_opts (MonoCompile *cfg, MonoInst *ins)
+{
+       MonoInst *fconv;
+       int dreg, src_opcode;
+
+       if (!(cfg->opt & MONO_OPT_SSE2) || !(cfg->opt & MONO_OPT_SIMD) || COMPILE_LLVM (cfg))
+               return;
+
+       switch (src_opcode = ins->opcode) {
+       case OP_FCONV_TO_I1:
+       case OP_FCONV_TO_U1:
+       case OP_FCONV_TO_I2:
+       case OP_FCONV_TO_U2:
+       case OP_FCONV_TO_I4:
+       case OP_FCONV_TO_I:
+               break;
+       default:
+               return;
+       }
+
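+       /* Emit OP_FCONV_TO_R8_X to move the x87 value into an xmm register via the spill slot,
+        * then turn the original ins into OP_XCONV_R8_TO_I4 so the integer conversion is done with SSE. */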
+       /* dreg is the IREG and sreg1 is the FREG */
+       MONO_INST_NEW (cfg, fconv, OP_FCONV_TO_R8_X);
+       fconv->klass = NULL; /* FIXME: what can we use here, since the Mono.Simd lib might not be loaded yet? */
+       fconv->sreg1 = ins->sreg1;
+       fconv->dreg = mono_alloc_ireg (cfg);
+       fconv->type = STACK_VTYPE;
+       fconv->backend.spill_var = get_float_to_x_spill_area (cfg);
+
+       mono_bblock_insert_before_ins (cfg->cbb, ins, fconv);
+
+       dreg = ins->dreg;
+       NULLIFY_INS (ins);
+       ins->opcode = OP_XCONV_R8_TO_I4;
+
+       ins->klass = mono_defaults.int32_class;
+       ins->sreg1 = fconv->dreg;
+       ins->dreg = dreg;
+       ins->type = STACK_I4;
+       ins->backend.source_opcode = src_opcode;
+}
+
+#endif /* #ifdef MONO_ARCH_SIMD_INTRINSICS */
+
+void
+mono_arch_decompose_long_opts (MonoCompile *cfg, MonoInst *long_ins)
+{
+       MonoInst *ins;
+       int vreg;
+
+       if (long_ins->opcode == OP_LNEG) {
+               ins = long_ins;
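+               /* 64-bit negation with 32-bit ops: negate the low word, propagate the carry into the high word, then negate the high word. */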
+               MONO_EMIT_NEW_UNALU (cfg, OP_INEG, ins->dreg + 1, ins->sreg1 + 1);
+               MONO_EMIT_NEW_BIALU_IMM (cfg, OP_ADC_IMM, ins->dreg + 2, ins->sreg1 + 2, 0);
+               MONO_EMIT_NEW_UNALU (cfg, OP_INEG, ins->dreg + 2, ins->dreg + 2);
+               NULLIFY_INS (ins);
+               return;
+       }
+
+#ifdef MONO_ARCH_SIMD_INTRINSICS
+
+       if (!(cfg->opt & MONO_OPT_SIMD))
+               return;
+       
+       /* TODO: move this to simd-intrinsic.c once we support SSE 4.1 dword extractors, since we need the runtime caps info */
+       switch (long_ins->opcode) {
+       case OP_EXTRACT_I8:
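+               /* Extract a 64-bit lane as two 32-bit extracts, shuffling the wanted dwords into the low positions first when needed. */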
+               vreg = long_ins->sreg1;
+       
+               if (long_ins->inst_c0) {
+                       MONO_INST_NEW (cfg, ins, OP_PSHUFLED);
+                       ins->klass = long_ins->klass;
+                       ins->sreg1 = long_ins->sreg1;
+                       ins->inst_c0 = 2;
+                       ins->type = STACK_VTYPE;
+                       ins->dreg = vreg = alloc_ireg (cfg);
+                       MONO_ADD_INS (cfg->cbb, ins);
+               }
+       
+               MONO_INST_NEW (cfg, ins, OP_EXTRACT_I4);
+               ins->klass = mono_defaults.int32_class;
+               ins->sreg1 = vreg;
+               ins->type = STACK_I4;
+               ins->dreg = long_ins->dreg + 1;
+               MONO_ADD_INS (cfg->cbb, ins);
+       
+               MONO_INST_NEW (cfg, ins, OP_PSHUFLED);
+               ins->klass = long_ins->klass;
+               ins->sreg1 = long_ins->sreg1;
+               ins->inst_c0 = long_ins->inst_c0 ? 3 : 1;
+               ins->type = STACK_VTYPE;
+               ins->dreg = vreg = alloc_ireg (cfg);
+               MONO_ADD_INS (cfg->cbb, ins);
+       
+               MONO_INST_NEW (cfg, ins, OP_EXTRACT_I4);
+               ins->klass = mono_defaults.int32_class;
+               ins->sreg1 = vreg;
+               ins->type = STACK_I4;
+               ins->dreg = long_ins->dreg + 2;
+               MONO_ADD_INS (cfg->cbb, ins);
+       
+               long_ins->opcode = OP_NOP;
+               break;
+       case OP_INSERTX_I8_SLOW:
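+               /* Insert a 64-bit value as two 32-bit inserts into adjacent dword slots. */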
+               MONO_INST_NEW (cfg, ins, OP_INSERTX_I4_SLOW);
+               ins->dreg = long_ins->dreg;
+               ins->sreg1 = long_ins->dreg;
+               ins->sreg2 = long_ins->sreg2 + 1;
+               ins->inst_c0 = long_ins->inst_c0 * 2;
+               MONO_ADD_INS (cfg->cbb, ins);
+
+               MONO_INST_NEW (cfg, ins, OP_INSERTX_I4_SLOW);
+               ins->dreg = long_ins->dreg;
+               ins->sreg1 = long_ins->dreg;
+               ins->sreg2 = long_ins->sreg2 + 2;
+               ins->inst_c0 = long_ins->inst_c0 * 2 + 1;
+               MONO_ADD_INS (cfg->cbb, ins);
+
+               long_ins->opcode = OP_NOP;
+               break;
+       case OP_EXPAND_I8:
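+               /* Expand a 64-bit value: load the low and high words into the first two dwords, then swizzle them to (X,Y,X,Y). */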
+               MONO_INST_NEW (cfg, ins, OP_ICONV_TO_X);
+               ins->dreg = long_ins->dreg;
+               ins->sreg1 = long_ins->sreg1 + 1;
+               ins->klass = long_ins->klass;
+               ins->type = STACK_VTYPE;
+               MONO_ADD_INS (cfg->cbb, ins);
+
+               MONO_INST_NEW (cfg, ins, OP_INSERTX_I4_SLOW);
+               ins->dreg = long_ins->dreg;
+               ins->sreg1 = long_ins->dreg;
+               ins->sreg2 = long_ins->sreg1 + 2;
+               ins->inst_c0 = 1;
+               ins->klass = long_ins->klass;
+               ins->type = STACK_VTYPE;
+               MONO_ADD_INS (cfg->cbb, ins);
+
+               MONO_INST_NEW (cfg, ins, OP_PSHUFLED);
+               ins->dreg = long_ins->dreg;
+               ins->sreg1 = long_ins->dreg;
+               ins->inst_c0 = 0x44; /*Magic number for swizzling (X,Y,X,Y)*/
+               ins->klass = long_ins->klass;
+               ins->type = STACK_VTYPE;
+               MONO_ADD_INS (cfg->cbb, ins);
+
+               long_ins->opcode = OP_NOP;
+               break;
+       }
+#endif /* MONO_ARCH_SIMD_INTRINSICS */
+}
+
+/*MONO_ARCH_HAVE_HANDLER_BLOCK_GUARD*/
+gpointer
+mono_arch_install_handler_block_guard (MonoJitInfo *ji, MonoJitExceptionInfo *clause, MonoContext *ctx, gpointer new_value)
+{
+       int offset;
+       gpointer *sp, old_value;
+       char *bp;
+       const unsigned char *handler;
+
+       /* Decode the first instruction to figure out where we stored the spvar. */
+       /* Our JIT MUST generate the following:
+        mov %esp, -?(%ebp)
+        Which is encoded as: 0x89 mod_rm.
+        mod_rm (esp, ebp, imm) which can be: (imm will never be zero)
+               mod (reg + imm8):  01 reg(esp): 100 rm(ebp): 101 -> 01100101 (0x65)
+               mod (reg + imm32): 10 reg(esp): 100 rm(ebp): 101 -> 10100101 (0xA5)
+       */
+       handler = clause->handler_start;
+
+       if (*handler != 0x89)
+               return NULL;
+
+       ++handler;
+
+       if (*handler == 0x65)
+               offset = *(signed char*)(handler + 1);
+       else if (*handler == 0xA5)
+               offset = *(int*)(handler + 1);
+       else
+               return NULL;
+
+       /*Load the spvar*/
+       bp = MONO_CONTEXT_GET_BP (ctx);
+       sp = *(gpointer*)(bp + offset);
+
+       old_value = *sp;
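+       /* Only install the guard if the saved address still points into this method's code. */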
+       if (old_value < ji->code_start || (char*)old_value > ((char*)ji->code_start + ji->code_size))
+               return old_value;
+
+       *sp = new_value;
+
+       return old_value;
+}
+
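+/* On OS X, faults on the protected trigger pages are delivered as SIGBUS rather than SIGSEGV. */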
+#if __APPLE__
+#define DBG_SIGNAL SIGBUS
+#else
+#define DBG_SIGNAL SIGSEGV
+#endif
+
+/* Soft Debug support */
+#ifdef MONO_ARCH_SOFT_DEBUG_SUPPORTED
+
+/*
+ * mono_arch_set_breakpoint:
+ *
+ *   Set a breakpoint at the native code address IP, which must belong to the method described by JI.
+ * The location should contain code emitted by OP_SEQ_POINT.
+ */
+void
+mono_arch_set_breakpoint (MonoJitInfo *ji, guint8 *ip)
+{
+       guint8 *code = ip;
+
+       /* 
+        * In production we will use int3 (the instruction size in the md file has to be
+        * fixed accordingly). But int3 could confuse gdb, so during development we emit an
+        * access that raises SIGSEGV instead.
+        */
+       g_assert (code [0] == 0x90);
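+       /* Overwrite the nop placeholder emitted for the sequence point with a cmp that reads bp_trigger_page. */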
+       x86_alu_reg_mem (code, X86_CMP, X86_EAX, (guint32)bp_trigger_page);
+}
+
+/*
+ * mono_arch_clear_breakpoint:
+ *
+ *   Clear the breakpoint at IP.
+ */
+void
+mono_arch_clear_breakpoint (MonoJitInfo *ji, guint8 *ip)
+{
+       guint8 *code = ip;
+       int i;
+
+       for (i = 0; i < 6; ++i)
+               x86_nop (code);
+}
+       
+/*
+ * mono_arch_start_single_stepping:
+ *
+ *   Start single stepping.
+ */
+void
+mono_arch_start_single_stepping (void)
+{
+       mono_mprotect (ss_trigger_page, mono_pagesize (), 0);
+}
+       
+/*
+ * mono_arch_stop_single_stepping:
+ *
+ *   Stop single stepping.
+ */
+void
+mono_arch_stop_single_stepping (void)
+{
+       mono_mprotect (ss_trigger_page, mono_pagesize (), MONO_MMAP_READ);
+}
+
+/*
+ * mono_arch_is_single_step_event:
+ *
+ *   Return whether the machine state in SIGCTX corresponds to a single
+ * step event.
+ */
+gboolean
+mono_arch_is_single_step_event (void *info, void *sigctx)
+{
+#ifdef TARGET_WIN32
+       EXCEPTION_RECORD* einfo = (EXCEPTION_RECORD*)info;      /* Sometimes the address is off by 4 */
+       if ((einfo->ExceptionInformation[1] >= ss_trigger_page && (guint8*)einfo->ExceptionInformation[1] <= (guint8*)ss_trigger_page + 128))
+               return TRUE;
+       else
+               return FALSE;
+#else
+       siginfo_t* sinfo = (siginfo_t*) info;
+       /* Sometimes the address is off by 4 */
+       if (sinfo->si_signo == DBG_SIGNAL && (sinfo->si_addr >= ss_trigger_page && (guint8*)sinfo->si_addr <= (guint8*)ss_trigger_page + 128))
+               return TRUE;
+       else
+               return FALSE;
+#endif
+}
+
+gboolean
+mono_arch_is_breakpoint_event (void *info, void *sigctx)
+{
+#ifdef TARGET_WIN32
+       EXCEPTION_RECORD* einfo = (EXCEPTION_RECORD*)info;      /* Sometimes the address is off by 4 */
+       if ((einfo->ExceptionInformation[1] >= bp_trigger_page && (guint8*)einfo->ExceptionInformation[1] <= (guint8*)bp_trigger_page + 128))
+               return TRUE;
+       else
+               return FALSE;
+#else
+       siginfo_t* sinfo = (siginfo_t*)info;
+       /* Sometimes the address is off by 4 */
+       if (sinfo->si_signo == DBG_SIGNAL && (sinfo->si_addr >= bp_trigger_page && (guint8*)sinfo->si_addr <= (guint8*)bp_trigger_page + 128))
+               return TRUE;
+       else
+               return FALSE;
+#endif
+}
+
+/*
+ * mono_arch_get_ip_for_breakpoint:
+ *
+ *   See mini-amd64.c for docs.
+ */
+guint8*
+mono_arch_get_ip_for_breakpoint (MonoJitInfo *ji, MonoContext *ctx)
+{
+       guint8 *ip = MONO_CONTEXT_GET_IP (ctx);
+
+       return ip;
+}
+
+#define BREAKPOINT_SIZE 6
+
+/*
+ * mono_arch_get_ip_for_single_step:
+ *
+ *   See mini-amd64.c for docs.
+ */
+guint8*
+mono_arch_get_ip_for_single_step (MonoJitInfo *ji, MonoContext *ctx)
+{
+       guint8 *ip = MONO_CONTEXT_GET_IP (ctx);
+
+       /* Size of x86_alu_reg_imm */
+       ip += 6;
+
+       return ip;
+}
+
+/*
+ * mono_arch_skip_breakpoint:
+ *
+ *   See mini-amd64.c for docs.
+ */
+void
+mono_arch_skip_breakpoint (MonoContext *ctx)
+{
+       MONO_CONTEXT_SET_IP (ctx, (guint8*)MONO_CONTEXT_GET_IP (ctx) + BREAKPOINT_SIZE);
+}
+
+/*
+ * mono_arch_skip_single_step:
+ *
+ *   See mini-amd64.c for docs.
+ */
+void
+mono_arch_skip_single_step (MonoContext *ctx)
+{
+       MONO_CONTEXT_SET_IP (ctx, (guint8*)MONO_CONTEXT_GET_IP (ctx) + 6);
+}
+
+/*
+ * mono_arch_get_seq_point_info:
+ *
+ *   See mini-amd64.c for docs.
+ */
+gpointer
+mono_arch_get_seq_point_info (MonoDomain *domain, guint8 *code)
+{
+       NOT_IMPLEMENTED;
+       return NULL;
+}
+
+#endif
+