Contributed under the terms of the MIT/X11 license by
[mono.git] / mono / mini / mini-amd64.c
index ad4e5836688cec05993c0694ea66ac9ff608d3d1..2547707016f8c4547216f8dd0f5c6f575ebe31f8 100644 (file)
@@ -26,8 +26,8 @@
 #include <mono/utils/mono-math.h>
 
 #include "trace.h"
+#include "ir-emit.h"
 #include "mini-amd64.h"
-#include "inssel.h"
 #include "cpu-amd64.h"
 
 /* 
@@ -93,8 +93,6 @@ mono_breakpoint_info [MONO_BREAKPOINT_ARRAY_SIZE];
  * UNORDERED        1  1  1
  */
 
-void mini_emit_memcpy2 (MonoCompile *cfg, int destreg, int doffset, int srcreg, int soffset, int size, int align);
-
 const char*
 mono_arch_regname (int reg)
 {
@@ -133,6 +131,13 @@ mono_arch_fregname (int reg)
                return "unknown";
 }
 
+/* Returns the name mono_arch_fregname () gives for REG, so both name sets are currently identical. TODO: Figure out a way of telling this and the one above apart if things get confusing. */
+const char *
+mono_arch_xregname (int reg)
+{
+       return mono_arch_fregname (reg);
+}
+
 G_GNUC_UNUSED static void
 break_count (void)
 {
@@ -250,13 +255,6 @@ typedef struct {
 
 #define DEBUG(a) if (cfg->verbose_level > 1) a
 
-#define NEW_ICONST(cfg,dest,val) do {  \
-               (dest) = mono_mempool_alloc0 ((cfg)->mempool, sizeof (MonoInst));       \
-               (dest)->opcode = OP_ICONST;     \
-               (dest)->inst_c0 = (val);        \
-               (dest)->type = STACK_I4;        \
-       } while (0)
-
 #ifdef PLATFORM_WIN32
 #define PARAM_REGS 4
 
@@ -326,7 +324,7 @@ merge_argument_class_from_type (MonoType *type, ArgumentClass class1)
        ArgumentClass class2 = ARG_CLASS_NO_CLASS;
        MonoType *ptype;
 
-       ptype = mono_type_get_underlying_type (type);
+       ptype = mini_type_get_underlying_type (NULL, type);
        switch (ptype->type) {
        case MONO_TYPE_BOOLEAN:
        case MONO_TYPE_CHAR:
@@ -398,26 +396,39 @@ merge_argument_class_from_type (MonoType *type, ArgumentClass class1)
 
 static void
 add_valuetype (MonoGenericSharingContext *gsctx, MonoMethodSignature *sig, ArgInfo *ainfo, MonoType *type,
-              gboolean is_return,
-              guint32 *gr, guint32 *fr, guint32 *stack_size)
+                          gboolean is_return,
+                          guint32 *gr, guint32 *fr, guint32 *stack_size)
 {
        guint32 size, quad, nquads, i;
        ArgumentClass args [2];
-       MonoMarshalType *info;
+       MonoMarshalType *info = NULL;
        MonoClass *klass;
+       MonoGenericSharingContext tmp_gsctx;
+       gboolean pass_on_stack = FALSE;
+       
+       /* 
+        * The gsctx currently contains no data; it is only used for checking whether
+        * open types are allowed. Some callers like mono_arch_get_argument_info ()
+        * don't pass it to us, so work around that.
+        */
+       if (!gsctx)
+               gsctx = &tmp_gsctx;
 
        klass = mono_class_from_mono_type (type);
-       if (sig->pinvoke) 
-               size = mono_type_native_stack_size (&klass->byval_arg, NULL);
-       else 
-               size = mini_type_stack_size (gsctx, &klass->byval_arg, NULL);
+       size = mini_type_stack_size_full (gsctx, &klass->byval_arg, NULL, sig->pinvoke);
 #ifndef PLATFORM_WIN32
        if (!sig->pinvoke && !disable_vtypes_in_regs && ((is_return && (size == 8)) || (!is_return && (size <= 16)))) {
                /* We pass and return vtypes of size 8 in a register */
        } else if (!sig->pinvoke || (size == 0) || (size > 16)) {
+               pass_on_stack = TRUE;
+       }
 #else
        if (!sig->pinvoke) {
+               pass_on_stack = TRUE;
+       }
 #endif
+
+       if (pass_on_stack) {
                /* Allways pass in memory */
                ainfo->offset = *stack_size;
                *stack_size += ALIGN_TO (size, 8);
@@ -605,8 +616,7 @@ get_call_info (MonoGenericSharingContext *gsctx, MonoMemPool *mp, MonoMethodSign
 
        /* return value */
        {
-               ret_type = mono_type_get_underlying_type (sig->ret);
-               ret_type = mini_get_basic_type_from_generic (gsctx, ret_type);
+               ret_type = mini_type_get_underlying_type (gsctx, sig->ret);
                switch (ret_type->type) {
                case MONO_TYPE_BOOLEAN:
                case MONO_TYPE_I1:
@@ -710,8 +720,7 @@ get_call_info (MonoGenericSharingContext *gsctx, MonoMemPool *mp, MonoMethodSign
                        add_general (&gr, &stack_size, ainfo);
                        continue;
                }
-               ptype = mono_type_get_underlying_type (sig->params [i]);
-               ptype = mini_get_basic_type_from_generic (gsctx, ptype);
+               ptype = mini_type_get_underlying_type (gsctx, sig->params [i]);
                switch (ptype->type) {
                case MONO_TYPE_BOOLEAN:
                case MONO_TYPE_I1:
@@ -910,13 +919,40 @@ mono_arch_cpu_optimizazions (guint32 *exclude_mask)
                } else
                        *exclude_mask |= MONO_OPT_CMOV;
        }
-#ifdef PLATFORM_WIN32
-       /* FIXME */
-       *exclude_mask |= (MONO_OPT_PEEPHOLE | MONO_OPT_BRANCH);
-#endif
+
        return opts;
 }
 
+/*
+ * Tests which SSE versions the CPU reports through CPUID leaf 1.
+ *
+ * Returns a bitmask with bit SIMD_VERSION_<x> set for every version detected;
+ * the mask is 0 when the cpuid () helper returns zero.
+ * TODO detect other versions like SSE4a.
+ */
+guint32
+mono_arch_cpu_enumerate_simd_versions (void)
+{
+       int eax, ebx, ecx, edx;
+       guint32 sse_opts = 0;
+
+       if (cpuid (1, &eax, &ebx, &ecx, &edx)) { /* leaf 1; only EDX and ECX are inspected below */
+               if (edx & (1 << 25)) /* EDX bit 25 */
+                       sse_opts |= 1 << SIMD_VERSION_SSE1;
+               if (edx & (1 << 26)) /* EDX bit 26 */
+                       sse_opts |= 1 << SIMD_VERSION_SSE2;
+               if (ecx & (1 << 0)) /* ECX bit 0 */
+                       sse_opts |= 1 << SIMD_VERSION_SSE3;
+               if (ecx & (1 << 9)) /* ECX bit 9 */
+                       sse_opts |= 1 << SIMD_VERSION_SSSE3;
+               if (ecx & (1 << 19)) /* ECX bit 19 */
+                       sse_opts |= 1 << SIMD_VERSION_SSE41;
+               if (ecx & (1 << 20)) /* ECX bit 20 */
+                       sse_opts |= 1 << SIMD_VERSION_SSE42;
+       }
+       return sse_opts;        
+}
+
 GList *
 mono_arch_get_allocatable_int_vars (MonoCompile *cfg)
 {
@@ -1016,11 +1052,6 @@ mono_arch_compute_omit_fp (MonoCompile *cfg)
 
                locals_size += mono_type_size (ins->inst_vtype, &ialign);
        }
-
-       if ((cfg->num_varinfo > 10000) || (locals_size >= (1 << 15))) {
-               /* Avoid hitting the stack_alloc_size < (1 << 16) assertion in emit_epilog () */
-               cfg->arch.omit_fp = FALSE;
-       }
 }
 
 GList *
@@ -1058,6 +1089,10 @@ mono_arch_get_global_int_regs (MonoCompile *cfg)
                regs = g_list_prepend (regs, (gpointer)AMD64_R13);
                regs = g_list_prepend (regs, (gpointer)AMD64_R14);
                regs = g_list_prepend (regs, (gpointer)AMD64_R15);
+#ifdef PLATFORM_WIN32
+               regs = g_list_prepend (regs, (gpointer)AMD64_RDI);
+               regs = g_list_prepend (regs, (gpointer)AMD64_RSI);
+#endif
        }
 
        return regs;
@@ -1250,7 +1285,18 @@ mono_arch_allocate_vars (MonoCompile *cfg)
 
        /*
         * We use the ABI calling conventions for managed code as well.
-        * Exception: valuetypes are never passed or returned in registers.
+        * Exception: valuetypes are only sometimes passed or returned in registers.
+        */
+
+       /*
+        * The stack looks like this:
+        * <incoming arguments passed on the stack>
+        * <return value>
+        * <lmf/caller saved registers>
+        * <locals>
+        * <spill area>
+        * <localloc area>  -> grows dynamically
+        * <params area>
         */
 
        if (cfg->arch.omit_fp) {
@@ -1265,8 +1311,6 @@ mono_arch_allocate_vars (MonoCompile *cfg)
 
        if (cfg->method->save_lmf) {
                /* Reserve stack space for saving LMF */
-               /* mono_arch_find_jit_info () expects to find the LMF at a fixed offset */
-               g_assert (offset == 0);
                if (cfg->arch.omit_fp) {
                        cfg->arch.lmf_offset = offset;
                        offset += sizeof (MonoLMF);
@@ -1342,6 +1386,14 @@ mono_arch_allocate_vars (MonoCompile *cfg)
                        offset += (locals_stack_align - 1);
                        offset &= ~(locals_stack_align - 1);
                }
+               if (cfg->arch.omit_fp) {
+                       cfg->locals_min_stack_offset = offset;
+                       cfg->locals_max_stack_offset = offset + locals_stack_size;
+               } else {
+                       cfg->locals_min_stack_offset = - (offset + locals_stack_size);
+                       cfg->locals_max_stack_offset = - offset;
+               }
+               
                for (i = cfg->locals_start; i < cfg->num_varinfo; i++) {
                        if (offsets [i] != -1) {
                                MonoInst *ins = cfg->varinfo [i];
@@ -1508,37 +1560,23 @@ mono_arch_create_vars (MonoCompile *cfg)
                        mono_print_ins (cfg->vret_addr);
                }
        }
-}
 
-static void
-add_outarg_reg (MonoCompile *cfg, MonoCallInst *call, MonoInst *arg, ArgStorage storage, int reg, MonoInst *tree)
-{
-       switch (storage) {
-       case ArgInIReg:
-               arg->opcode = OP_OUTARG_REG;
-               arg->inst_left = tree;
-               arg->inst_call = call;
-               arg->backend.reg3 = reg;
-               break;
-       case ArgInFloatSSEReg:
-               arg->opcode = OP_AMD64_OUTARG_XMMREG_R4;
-               arg->inst_left = tree;
-               arg->inst_call = call;
-               arg->backend.reg3 = reg;
-               break;
-       case ArgInDoubleSSEReg:
-               arg->opcode = OP_AMD64_OUTARG_XMMREG_R8;
-               arg->inst_left = tree;
-               arg->inst_call = call;
-               arg->backend.reg3 = reg;
-               break;
-       default:
-               g_assert_not_reached ();
-       }
+#ifdef MONO_AMD64_NO_PUSHES
+       /*
+        * When this is set, we pass arguments on the stack by moves, and by allocating 
+        * a bigger stack frame, instead of pushes.
+        * Pushes complicate exception handling because the arguments on the stack have
+        * to be popped each time a frame is unwound. They also make fp elimination
+        * impossible.
+        * FIXME: This doesn't work inside filter/finally clauses, since those execute
+        * on a new frame which doesn't include a param area.
+        */
+       cfg->arch.no_pushes = TRUE;
+#endif
 }
 
 static void
-add_outarg_reg2 (MonoCompile *cfg, MonoCallInst *call, ArgStorage storage, int reg, MonoInst *tree)
+add_outarg_reg (MonoCompile *cfg, MonoCallInst *call, ArgStorage storage, int reg, MonoInst *tree)
 {
        MonoInst *ins;
 
@@ -1572,23 +1610,6 @@ add_outarg_reg2 (MonoCompile *cfg, MonoCallInst *call, ArgStorage storage, int r
        }
 }
 
-static int
-arg_storage_to_ldind (ArgStorage storage)
-{
-       switch (storage) {
-       case ArgInIReg:
-               return CEE_LDIND_I;
-       case ArgInDoubleSSEReg:
-               return CEE_LDIND_R8;
-       case ArgInFloatSSEReg:
-               return CEE_LDIND_R4;
-       default:
-               g_assert_not_reached ();
-       }
-
-       return -1;
-}
-
 static int
 arg_storage_to_load_membase (ArgStorage storage)
 {
@@ -1612,12 +1633,15 @@ emit_sig_cookie (MonoCompile *cfg, MonoCallInst *call, CallInfo *cinfo)
        MonoInst *arg;
        MonoMethodSignature *tmp_sig;
        MonoInst *sig_arg;
-                       
+
+       if (call->tail_call)
+               NOT_IMPLEMENTED;
+
        /* FIXME: Add support for signature tokens to AOT */
        cfg->disable_aot = TRUE;
 
        g_assert (cinfo->sig_cookie.storage == ArgOnStack);
-
+                       
        /*
         * mono_ArgIterator_Setup assumes the signature cookie is 
         * passed first and all the arguments which were before it are
@@ -1630,309 +1654,120 @@ emit_sig_cookie (MonoCompile *cfg, MonoCallInst *call, CallInfo *cinfo)
        memcpy (tmp_sig->params, call->signature->params + call->signature->sentinelpos, tmp_sig->param_count * sizeof (MonoType*));
 
        MONO_INST_NEW (cfg, sig_arg, OP_ICONST);
+       sig_arg->dreg = mono_alloc_ireg (cfg);
        sig_arg->inst_p0 = tmp_sig;
+       MONO_ADD_INS (cfg->cbb, sig_arg);
 
-       MONO_INST_NEW (cfg, arg, OP_OUTARG);
-       arg->inst_left = sig_arg;
-       arg->type = STACK_PTR;
+       if (cfg->arch.no_pushes) {
+               MONO_EMIT_NEW_STORE_MEMBASE (cfg, OP_STORE_MEMBASE_REG, AMD64_RSP, cinfo->sig_cookie.offset, sig_arg->dreg);
+       } else {
+               MONO_INST_NEW (cfg, arg, OP_X86_PUSH);
+               arg->sreg1 = sig_arg->dreg;
+               MONO_ADD_INS (cfg->cbb, arg);
+       }
+}
 
-       /* prepend, so they get reversed */
-       arg->next = call->out_args;
-       call->out_args = arg;
+static inline LLVMArgStorage
+arg_storage_to_llvm_arg_storage (MonoCompile *cfg, ArgStorage storage)
+{ /* Maps an ArgStorage value to the LLVMArgStorage value of the same name; cfg is not used. */
+       switch (storage) {
+       case ArgInIReg:
+               return LLVMArgInIReg;
+       case ArgNone:
+               return LLVMArgNone;
+       default: /* only the two storages above are handled; anything else asserts */
+               g_assert_not_reached ();
+               return LLVMArgNone; /* return after the assertion so every path yields a value */
+       }
 }
 
-/* 
- * take the arguments and generate the arch-specific
- * instructions to properly call the function in call.
- * This includes pushing, moving arguments to the right register
- * etc.
- */
-MonoCallInst*
-mono_arch_call_opcode (MonoCompile *cfg, MonoBasicBlock* bb, MonoCallInst *call, int is_virtual) {
-       MonoInst *arg, *in;
-       MonoMethodSignature *sig;
-       int i, n, stack_size;
+#ifdef ENABLE_LLVM
+LLVMCallInfo*
+mono_arch_get_llvm_call_info (MonoCompile *cfg, MonoMethodSignature *sig)
+{
+       int i, n;
        CallInfo *cinfo;
        ArgInfo *ainfo;
+       int j;
+       LLVMCallInfo *linfo;
 
-       stack_size = 0;
-
-       sig = call->signature;
        n = sig->param_count + sig->hasthis;
 
        cinfo = get_call_info (cfg->generic_sharing_context, cfg->mempool, sig, sig->pinvoke);
 
-       if (cfg->method->save_lmf) {
-               MONO_INST_NEW (cfg, arg, OP_AMD64_SAVE_SP_TO_LMF);
-               arg->next = call->out_args;
-               call->out_args = arg;
-       }
-
-       for (i = 0; i < n; ++i) {
-               ainfo = cinfo->args + i;
+       linfo = mono_mempool_alloc0 (cfg->mempool, sizeof (LLVMCallInfo) + (sizeof (LLVMArgInfo) * n));
 
-               if (!sig->pinvoke && (sig->call_convention == MONO_CALL_VARARG) && (i == sig->sentinelpos)) {
-                       /* Emit the signature cookie just before the implicit arguments */
-                       emit_sig_cookie (cfg, call, cinfo);
+       /*
+        * LLVM always uses the native ABI while we use our own ABI, the
+        * only difference is the handling of vtypes:
+        * - we only pass/receive them in registers in some cases, and only 
+        *   in 1 or 2 integer registers.
+        */
+       if (cinfo->ret.storage == ArgValuetypeInReg) {
+               if (sig->pinvoke) {
+                       cfg->exception_message = g_strdup ("pinvoke + vtypes");
+                       cfg->disable_llvm = TRUE;
+                       return linfo;
                }
 
-               if (is_virtual && i == 0) {
-                       /* the argument will be attached to the call instruction */
-                       in = call->args [i];
-               } else {
-                       MONO_INST_NEW (cfg, arg, OP_OUTARG);
-                       in = call->args [i];
-                       arg->cil_code = in->cil_code;
-                       arg->inst_left = in;
-                       arg->type = in->type;
-                       /* prepend, so they get reversed */
-                       arg->next = call->out_args;
-                       call->out_args = arg;
-#if 0
-                       if (!cinfo->stack_usage)
-                               /* Keep the assignments to the arg registers in order if possible */
-                               MONO_INST_LIST_ADD_TAIL (&arg->node, &call->out_args);
-                       else
-                               MONO_INST_LIST_ADD (&arg->node, &call->out_args);
-#endif
-
-                       if ((i >= sig->hasthis) && (MONO_TYPE_ISSTRUCT(sig->params [i - sig->hasthis]))) {
-                               guint32 align;
-                               guint32 size;
-
-                               if (sig->params [i - sig->hasthis]->type == MONO_TYPE_TYPEDBYREF) {
-                                       size = sizeof (MonoTypedRef);
-                                       align = sizeof (gpointer);
-                               }
-                               else
-                               if (sig->pinvoke)
-                                       size = mono_type_native_stack_size (&in->klass->byval_arg, &align);
-                               else {
-                                       /* 
-                                        * Other backends use mini_type_stack_size (), but that
-                                        * aligns the size to 8, which is larger than the size of
-                                        * the source, leading to reads of invalid memory if the
-                                        * source is at the end of address space.
-                                        */
-                                       size = mono_class_value_size (in->klass, &align);
-                               }
-                               if (ainfo->storage == ArgValuetypeInReg) {
-                                       if (ainfo->pair_storage [1] == ArgNone) {
-                                               MonoInst *load;
+               linfo->ret.storage = LLVMArgVtypeInReg;
+               for (j = 0; j < 2; ++j)
+                       linfo->ret.pair_storage [j] = arg_storage_to_llvm_arg_storage (cfg, cinfo->ret.pair_storage [j]);
+       }
 
-                                               /* Simpler case */
+       if (MONO_TYPE_ISSTRUCT (sig->ret) && cinfo->ret.storage == ArgInIReg) {
+               /* Vtype returned using a hidden argument */
+               linfo->ret.storage = LLVMArgVtypeRetAddr;
+       }
 
-                                               MONO_INST_NEW (cfg, load, arg_storage_to_ldind (ainfo->pair_storage [0]));
-                                               load->inst_left = in;
+       for (i = 0; i < n; ++i) {
+               ainfo = cinfo->args + i;
 
-                                               add_outarg_reg (cfg, call, arg, ainfo->pair_storage [0], ainfo->pair_regs [0], load);
-                                       }
-                                       else {
-                                               /* Trees can't be shared so make a copy */
-                                               MonoInst *vtaddr = mono_compile_create_var (cfg, &mono_defaults.int_class->byval_arg, OP_LOCAL);
-                                               MonoInst *load, *load2, *offset_ins;
-
-                                               /* Reg1 */
-                                               MONO_INST_NEW (cfg, load, CEE_LDIND_I);
-                                               load->ssa_op = MONO_SSA_LOAD;
-                                               load->inst_i0 = (cfg)->varinfo [vtaddr->inst_c0];
-
-                                               NEW_ICONST (cfg, offset_ins, 0);
-                                               MONO_INST_NEW (cfg, load2, CEE_ADD);
-                                               load2->inst_left = load;
-                                               load2->inst_right = offset_ins;
-
-                                               MONO_INST_NEW (cfg, load, arg_storage_to_ldind (ainfo->pair_storage [0]));
-                                               load->inst_left = load2;
-
-                                               add_outarg_reg (cfg, call, arg, ainfo->pair_storage [0], ainfo->pair_regs [0], load);
-
-                                               /* Reg2 */
-                                               MONO_INST_NEW (cfg, load, CEE_LDIND_I);
-                                               load->ssa_op = MONO_SSA_LOAD;
-                                               load->inst_i0 = (cfg)->varinfo [vtaddr->inst_c0];
-
-                                               NEW_ICONST (cfg, offset_ins, 8);
-                                               MONO_INST_NEW (cfg, load2, CEE_ADD);
-                                               load2->inst_left = load;
-                                               load2->inst_right = offset_ins;
-
-                                               MONO_INST_NEW (cfg, load, arg_storage_to_ldind (ainfo->pair_storage [1]));
-                                               load->inst_left = load2;
-
-                                               MONO_INST_NEW (cfg, arg, OP_OUTARG);
-                                               arg->cil_code = in->cil_code;
-                                               arg->type = in->type;
-                                               /* prepend, so they get reversed */
-                                               arg->next = call->out_args;
-                                               call->out_args = arg;
-
-                                               add_outarg_reg (cfg, call, arg, ainfo->pair_storage [1], ainfo->pair_regs [1], load);
-
-                                               /* Prepend a copy inst */
-                                               MONO_INST_NEW (cfg, arg, CEE_STIND_I);
-                                               arg->cil_code = in->cil_code;
-                                               arg->ssa_op = MONO_SSA_STORE;
-                                               arg->inst_left = vtaddr;
-                                               arg->inst_right = in;
-                                               arg->type = in->type;
-
-                                               /* prepend, so they get reversed */
-                                               arg->next = call->out_args;
-                                               call->out_args = arg;
-                                       }
-                               }
-                               else if (ainfo->storage == ArgValuetypeAddrInIReg){
+               linfo->args [i].storage = LLVMArgNone;
 
-                                       /* Add a temp variable to the method*/
-                                       MonoInst *load;
-                                       MonoInst *vtaddr = mono_compile_create_var (cfg, &in->klass->byval_arg, OP_LOCAL);
-                                       
-                                       MONO_INST_NEW (cfg, load, OP_LDADDR);
-                                       load->ssa_op = MONO_SSA_LOAD;
-                                       load->inst_left = vtaddr;
-                                       
-                                       if (ainfo->pair_storage [0] == ArgInIReg) {
-                                               /* Inserted after the copy.  Load the address of the temp to the argument regster.*/
-                                               arg->opcode = OP_OUTARG_REG;
-                                               arg->inst_left = load;
-                                               arg->inst_call = call;
-                                               arg->backend.reg3 =  ainfo->pair_regs [0];
-                                       } 
-                                       else {
-                                               /* Inserted after the copy.  Load the address of the temp on the stack.*/
-                                               arg->opcode = OP_OUTARG_VT;
-                                               arg->inst_left = load;
-                                               arg->type = STACK_PTR;
-                                               arg->klass = mono_defaults.int_class;
-                                               arg->backend.is_pinvoke = sig->pinvoke;
-                                               arg->inst_imm = size;
+               switch (ainfo->storage) {
+               case ArgInIReg:
+                       linfo->args [i].storage = LLVMArgInIReg;
+                       break;
+               case ArgInDoubleSSEReg:
+               case ArgInFloatSSEReg:
+                       linfo->args [i].storage = LLVMArgInFPReg;
+                       break;
+               case ArgOnStack:
+                       if ((i >= sig->hasthis) && (MONO_TYPE_ISSTRUCT(sig->params [i - sig->hasthis]))) {
+                               linfo->args [i].storage = LLVMArgVtypeByVal;
+                       } else {
+                               linfo->args [i].storage = LLVMArgInIReg;
+                               if (!sig->params [i - sig->hasthis]->byref) {
+                                       if (sig->params [i - sig->hasthis]->type == MONO_TYPE_R4) {
+                                               linfo->args [i].storage = LLVMArgInFPReg;
+                                       } else if (sig->params [i - sig->hasthis]->type == MONO_TYPE_R8) {
+                                               linfo->args [i].storage = LLVMArgInFPReg;
                                        }
-
-                                       /*Copy the argument to the temp variable.*/
-                                       MONO_INST_NEW (cfg, load, OP_MEMCPY);
-                                       load->backend.memcpy_args = mono_mempool_alloc0 (cfg->mempool, sizeof (MonoMemcpyArgs));
-                                       load->backend.memcpy_args->size = size;
-                                       load->backend.memcpy_args->align = align;
-                                       load->inst_left = (cfg)->varinfo [vtaddr->inst_c0];
-                                       load->inst_right = in->inst_i0;
-
-                                       // FIXME:
-                                       g_assert_not_reached ();
-                                       //MONO_INST_LIST_ADD (&load->node, &call->out_args);
-                               }
-                               else {
-                                       arg->opcode = OP_OUTARG_VT;
-                                       arg->klass = in->klass;
-                                       arg->backend.is_pinvoke = sig->pinvoke;
-                                       arg->inst_imm = size;
                                }
                        }
-                       else {
-                               switch (ainfo->storage) {
-                               case ArgInIReg:
-                                       add_outarg_reg (cfg, call, arg, ainfo->storage, ainfo->reg, in);
-                                       break;
-                               case ArgInFloatSSEReg:
-                               case ArgInDoubleSSEReg:
-                                       add_outarg_reg (cfg, call, arg, ainfo->storage, ainfo->reg, in);
-                                       break;
-                               case ArgOnStack:
-                                       arg->opcode = OP_OUTARG;
-                                       if (!sig->params [i - sig->hasthis]->byref) {
-                                               if (sig->params [i - sig->hasthis]->type == MONO_TYPE_R4)
-                                                       arg->opcode = OP_OUTARG_R4;
-                                               else
-                                                       if (sig->params [i - sig->hasthis]->type == MONO_TYPE_R8)
-                                                               arg->opcode = OP_OUTARG_R8;
-                                       }
-                                       break;
-                               default:
-                                       g_assert_not_reached ();
-                               }
+                       break;
+               case ArgValuetypeInReg:
+                       if (sig->pinvoke) {
+                               cfg->exception_message = g_strdup ("pinvoke + vtypes");
+                               cfg->disable_llvm = TRUE;
+                               return linfo;
                        }
-               }
-       }
-
-       /* Handle the case where there are no implicit arguments */
-       if (!sig->pinvoke && (sig->call_convention == MONO_CALL_VARARG) && (n == sig->sentinelpos)) {
-               emit_sig_cookie (cfg, call, cinfo);
-       }
 
-       if (cinfo->ret.storage == ArgValuetypeInReg) {
-               /* This is needed by mono_arch_emit_this_vret_args () */
-               if (!cfg->arch.vret_addr_loc) {
-                       cfg->arch.vret_addr_loc = mono_compile_create_var (cfg, &mono_defaults.int_class->byval_arg, OP_LOCAL);
-                       /* Prevent it from being register allocated or optimized away */
-                       ((MonoInst*)cfg->arch.vret_addr_loc)->flags |= MONO_INST_VOLATILE;
+                       linfo->args [i].storage = LLVMArgVtypeInReg;
+                       for (j = 0; j < 2; ++j)
+                               linfo->args [i].pair_storage [j] = arg_storage_to_llvm_arg_storage (cfg, ainfo->pair_storage [j]);
+                       break;
+               default:
+                       cfg->exception_message = g_strdup ("ainfo->storage");
+                       cfg->disable_llvm = TRUE;
+                       break;
                }
        }
 
-       if (cinfo->need_stack_align) {
-               MONO_INST_NEW (cfg, arg, OP_AMD64_OUTARG_ALIGN_STACK);
-               arg->inst_c0 = 8;
-               /* prepend, so they get reversed */
-               arg->next = call->out_args;
-               call->out_args = arg;
-       }
-
-#ifdef PLATFORM_WIN32
-       /* Always reserve 32 bytes of stack space on Win64 */
-       /*MONO_INST_NEW (cfg, arg, OP_AMD64_OUTARG_ALIGN_STACK);
-       arg->inst_c0 = 32;
-       MONO_INST_LIST_ADD_TAIL (&arg->node, &call->out_args);*/
-       NOT_IMPLEMENTED;
-#endif
-
-#if 0
-       if (cfg->method->save_lmf) {
-               MONO_INST_NEW (cfg, arg, OP_AMD64_SAVE_SP_TO_LMF);
-               MONO_INST_LIST_ADD_TAIL (&arg->node, &call->out_args);
-       }
-#endif
-
-       call->stack_usage = cinfo->stack_usage;
-       cfg->param_area = MAX (cfg->param_area, call->stack_usage);
-       cfg->flags |= MONO_CFG_HAS_CALLS;
-
-       return call;
-}
-
-static void
-emit_sig_cookie2 (MonoCompile *cfg, MonoCallInst *call, CallInfo *cinfo)
-{
-       MonoInst *arg;
-       MonoMethodSignature *tmp_sig;
-       MonoInst *sig_arg;
-
-       if (call->tail_call)
-               NOT_IMPLEMENTED;
-
-       /* FIXME: Add support for signature tokens to AOT */
-       cfg->disable_aot = TRUE;
-
-       g_assert (cinfo->sig_cookie.storage == ArgOnStack);
-                       
-       /*
-        * mono_ArgIterator_Setup assumes the signature cookie is 
-        * passed first and all the arguments which were before it are
-        * passed on the stack after the signature. So compensate by 
-        * passing a different signature.
-        */
-       tmp_sig = mono_metadata_signature_dup (call->signature);
-       tmp_sig->param_count -= call->signature->sentinelpos;
-       tmp_sig->sentinelpos = 0;
-       memcpy (tmp_sig->params, call->signature->params + call->signature->sentinelpos, tmp_sig->param_count * sizeof (MonoType*));
-
-       MONO_INST_NEW (cfg, sig_arg, OP_ICONST);
-       sig_arg->dreg = mono_alloc_ireg (cfg);
-       sig_arg->inst_p0 = tmp_sig;
-       MONO_ADD_INS (cfg->cbb, sig_arg);
-
-       MONO_INST_NEW (cfg, arg, OP_X86_PUSH);
-       arg->sreg1 = sig_arg->dreg;
-       MONO_ADD_INS (cfg->cbb, arg);
+       return linfo;
 }
+#endif
 
 void
 mono_arch_emit_call (MonoCompile *cfg, MonoCallInst *call)
@@ -1950,8 +1785,46 @@ mono_arch_emit_call (MonoCompile *cfg, MonoCallInst *call)
 
        cinfo = get_call_info (cfg->generic_sharing_context, cfg->mempool, sig, sig->pinvoke);
 
+       if (COMPILE_LLVM (cfg)) {
+               /* We shouldn't be called in the llvm case */
+               cfg->disable_llvm = TRUE;
+               return;
+       }
+
        if (cinfo->need_stack_align) {
-               MONO_EMIT_NEW_BIALU_IMM (cfg, OP_SUB_IMM, X86_ESP, X86_ESP, 8);
+               if (!cfg->arch.no_pushes)
+                       MONO_EMIT_NEW_BIALU_IMM (cfg, OP_SUB_IMM, X86_ESP, X86_ESP, 8);
+       }
+
+       /* 
+        * Emit all arguments which are passed on the stack to prevent register
+        * allocation problems.
+        */
+       if (cfg->arch.no_pushes) {
+               for (i = 0; i < n; ++i) {
+                       MonoType *t;
+                       ainfo = cinfo->args + i;
+
+                       in = call->args [i];
+
+                       if (sig->hasthis && i == 0)
+                               t = &mono_defaults.object_class->byval_arg;
+                       else
+                               t = sig->params [i - sig->hasthis];
+
+                       if (ainfo->storage == ArgOnStack && !MONO_TYPE_ISSTRUCT (t) && !call->tail_call) {
+                               if (!t->byref) {
+                                       if (t->type == MONO_TYPE_R4)
+                                               MONO_EMIT_NEW_STORE_MEMBASE (cfg, OP_STORER4_MEMBASE_REG, AMD64_RSP, ainfo->offset, in->dreg);
+                                       else if (t->type == MONO_TYPE_R8)
+                                               MONO_EMIT_NEW_STORE_MEMBASE (cfg, OP_STORER8_MEMBASE_REG, AMD64_RSP, ainfo->offset, in->dreg);
+                                       else
+                                               MONO_EMIT_NEW_STORE_MEMBASE (cfg, OP_STORE_MEMBASE_REG, AMD64_RSP, ainfo->offset, in->dreg);
+                               } else {
+                                       MONO_EMIT_NEW_STORE_MEMBASE (cfg, OP_STORE_MEMBASE_REG, AMD64_RSP, ainfo->offset, in->dreg);
+                               }
+                       }
+               }
        }
 
        /*
@@ -1964,7 +1837,7 @@ mono_arch_emit_call (MonoCompile *cfg, MonoCallInst *call)
                in = call->args [i];
 
                if (ainfo->storage == ArgInIReg)
-                       add_outarg_reg2 (cfg, call, ainfo->storage, ainfo->reg, in);
+                       add_outarg_reg (cfg, call, ainfo->storage, ainfo->reg, in);
        }
 
        for (i = n - 1; i >= 0; --i) {
@@ -1978,14 +1851,16 @@ mono_arch_emit_call (MonoCompile *cfg, MonoCallInst *call)
                        break;
                case ArgInFloatSSEReg:
                case ArgInDoubleSSEReg:
-                       add_outarg_reg2 (cfg, call, ainfo->storage, ainfo->reg, in);
+                       add_outarg_reg (cfg, call, ainfo->storage, ainfo->reg, in);
                        break;
                case ArgOnStack:
                case ArgValuetypeInReg:
                case ArgValuetypeAddrInIReg:
-                       if (ainfo->storage == ArgOnStack && call->tail_call)
-                               NOT_IMPLEMENTED;
-                       if ((i >= sig->hasthis) && (MONO_TYPE_ISSTRUCT(sig->params [i - sig->hasthis]))) {
+                       if (ainfo->storage == ArgOnStack && call->tail_call) {
+                               MonoInst *call_inst = (MonoInst*)call;
+                               cfg->args [i]->flags |= MONO_INST_VOLATILE;
+                               EMIT_NEW_ARGSTORE (cfg, call_inst, i, in);
+                       } else if ((i >= sig->hasthis) && (MONO_TYPE_ISSTRUCT(sig->params [i - sig->hasthis]))) {
                                guint32 align;
                                guint32 size;
 
@@ -2020,38 +1895,40 @@ mono_arch_emit_call (MonoCompile *cfg, MonoCallInst *call)
                                        MONO_ADD_INS (cfg->cbb, arg);
                                }
                        } else {
-                               MONO_INST_NEW (cfg, arg, OP_X86_PUSH);
-                               arg->sreg1 = in->dreg;
-                               if (!sig->params [i - sig->hasthis]->byref) {
-                                       if (sig->params [i - sig->hasthis]->type == MONO_TYPE_R4) {
-                                               MONO_EMIT_NEW_BIALU_IMM (cfg, OP_SUB_IMM, X86_ESP, X86_ESP, 8);
-                                               arg->opcode = OP_STORER4_MEMBASE_REG;
-                                               arg->inst_destbasereg = X86_ESP;
-                                               arg->inst_offset = 0;
-                                       } else if (sig->params [i - sig->hasthis]->type == MONO_TYPE_R8) {
-                                               MONO_EMIT_NEW_BIALU_IMM (cfg, OP_SUB_IMM, X86_ESP, X86_ESP, 8);
-                                               arg->opcode = OP_STORER8_MEMBASE_REG;
-                                               arg->inst_destbasereg = X86_ESP;
-                                               arg->inst_offset = 0;
+                               if (cfg->arch.no_pushes) {
+                                       /* Already done */
+                               } else {
+                                       MONO_INST_NEW (cfg, arg, OP_X86_PUSH);
+                                       arg->sreg1 = in->dreg;
+                                       if (!sig->params [i - sig->hasthis]->byref) {
+                                               if (sig->params [i - sig->hasthis]->type == MONO_TYPE_R4) {
+                                                       MONO_EMIT_NEW_BIALU_IMM (cfg, OP_SUB_IMM, X86_ESP, X86_ESP, 8);
+                                                       arg->opcode = OP_STORER4_MEMBASE_REG;
+                                                       arg->inst_destbasereg = X86_ESP;
+                                                       arg->inst_offset = 0;
+                                               } else if (sig->params [i - sig->hasthis]->type == MONO_TYPE_R8) {
+                                                       MONO_EMIT_NEW_BIALU_IMM (cfg, OP_SUB_IMM, X86_ESP, X86_ESP, 8);
+                                                       arg->opcode = OP_STORER8_MEMBASE_REG;
+                                                       arg->inst_destbasereg = X86_ESP;
+                                                       arg->inst_offset = 0;
+                                               }
                                        }
+                                       MONO_ADD_INS (cfg->cbb, arg);
                                }
-                               MONO_ADD_INS (cfg->cbb, arg);
                        }
                        break;
                default:
                        g_assert_not_reached ();
                }
 
-               if (!sig->pinvoke && (sig->call_convention == MONO_CALL_VARARG) && (i == sig->sentinelpos)) {
+               if (!sig->pinvoke && (sig->call_convention == MONO_CALL_VARARG) && (i == sig->sentinelpos))
                        /* Emit the signature cookie just before the implicit arguments */
-                       emit_sig_cookie2 (cfg, call, cinfo);
-               }
+                       emit_sig_cookie (cfg, call, cinfo);
        }
 
        /* Handle the case where there are no implicit arguments */
-       if (!sig->pinvoke && (sig->call_convention == MONO_CALL_VARARG) && (n == sig->sentinelpos)) {
-               emit_sig_cookie2 (cfg, call, cinfo);
-       }
+       if (!sig->pinvoke && (sig->call_convention == MONO_CALL_VARARG) && (n == sig->sentinelpos))
+               emit_sig_cookie (cfg, call, cinfo);
 
        if (sig->ret && MONO_TYPE_ISSTRUCT (sig->ret)) {
                MonoInst *vtarg;
@@ -2144,12 +2021,14 @@ mono_arch_emit_outarg_vt (MonoCompile *cfg, MonoInst *ins, MonoInst *src)
                        }
                        MONO_ADD_INS (cfg->cbb, load);
 
-                       add_outarg_reg2 (cfg, call, ainfo->pair_storage [part], ainfo->pair_regs [part], load);
+                       add_outarg_reg (cfg, call, ainfo->pair_storage [part], ainfo->pair_regs [part], load);
                }
        } else if (ainfo->storage == ArgValuetypeAddrInIReg) {
                MonoInst *vtaddr, *load;
                vtaddr = mono_compile_create_var (cfg, &ins->klass->byval_arg, OP_LOCAL);
                
+               g_assert (!cfg->arch.no_pushes);
+
                MONO_INST_NEW (cfg, load, OP_LDADDR);
                load->inst_p0 = vtaddr;
                vtaddr->flags |= MONO_INST_INDIRECT;
@@ -2157,14 +2036,15 @@ mono_arch_emit_outarg_vt (MonoCompile *cfg, MonoInst *ins, MonoInst *src)
                load->klass = vtaddr->klass;
                load->dreg = mono_alloc_ireg (cfg);
                MONO_ADD_INS (cfg->cbb, load);
-               mini_emit_memcpy2 (cfg, load->dreg, 0, src->dreg, 0, size, 4);
+               mini_emit_memcpy (cfg, load->dreg, 0, src->dreg, 0, size, 4);
 
                if (ainfo->pair_storage [0] == ArgInIReg) {
                        MONO_INST_NEW (cfg, arg, OP_X86_LEA_MEMBASE);
-                       arg->dreg = ainfo->pair_regs [0];
+                       arg->dreg = mono_alloc_ireg (cfg);
                        arg->sreg1 = load->dreg;
                        arg->inst_imm = 0;
                        MONO_ADD_INS (cfg->cbb, arg);
+                       mono_call_inst_add_outarg_reg (cfg, call, arg->dreg, ainfo->pair_regs [0], FALSE);
                } else {
                        MONO_INST_NEW (cfg, arg, OP_X86_PUSH);
                        arg->sreg1 = load->dreg;
@@ -2172,20 +2052,36 @@ mono_arch_emit_outarg_vt (MonoCompile *cfg, MonoInst *ins, MonoInst *src)
                }
        } else {
                if (size == 8) {
-                       /* Can't use this for < 8 since it does an 8 byte memory load */
-                       MONO_INST_NEW (cfg, arg, OP_X86_PUSH_MEMBASE);
-                       arg->inst_basereg = src->dreg;
-                       arg->inst_offset = 0;
-                       MONO_ADD_INS (cfg->cbb, arg);
+                       if (cfg->arch.no_pushes) {
+                               int dreg = mono_alloc_ireg (cfg);
+
+                               MONO_EMIT_NEW_LOAD_MEMBASE (cfg, dreg, src->dreg, 0);
+                               MONO_EMIT_NEW_STORE_MEMBASE (cfg, OP_STORE_MEMBASE_REG, AMD64_RSP, ainfo->offset, dreg);
+                       } else {
+                               /* Can't use this for < 8 since it does an 8 byte memory load */
+                               MONO_INST_NEW (cfg, arg, OP_X86_PUSH_MEMBASE);
+                               arg->inst_basereg = src->dreg;
+                               arg->inst_offset = 0;
+                               MONO_ADD_INS (cfg->cbb, arg);
+                       }
                } else if (size <= 40) {
-                       MONO_EMIT_NEW_BIALU_IMM (cfg, OP_SUB_IMM, X86_ESP, X86_ESP, ALIGN_TO (size, 8));
-                       mini_emit_memcpy2 (cfg, X86_ESP, 0, src->dreg, 0, size, 4);
+                       if (cfg->arch.no_pushes) {
+                               mini_emit_memcpy (cfg, AMD64_RSP, ainfo->offset, src->dreg, 0, size, 4);
+                       } else {
+                               MONO_EMIT_NEW_BIALU_IMM (cfg, OP_SUB_IMM, X86_ESP, X86_ESP, ALIGN_TO (size, 8));
+                               mini_emit_memcpy (cfg, X86_ESP, 0, src->dreg, 0, size, 4);
+                       }
                } else {
-                       MONO_INST_NEW (cfg, arg, OP_X86_PUSH_OBJ);
-                       arg->inst_basereg = src->dreg;
-                       arg->inst_offset = 0;
-                       arg->inst_imm = size;
-                       MONO_ADD_INS (cfg->cbb, arg);
+                       if (cfg->arch.no_pushes) {
+                               // FIXME: Code growth
+                               mini_emit_memcpy (cfg, AMD64_RSP, ainfo->offset, src->dreg, 0, size, 4);
+                       } else {
+                               MONO_INST_NEW (cfg, arg, OP_X86_PUSH_OBJ);
+                               arg->inst_basereg = src->dreg;
+                               arg->inst_offset = 0;
+                               arg->inst_imm = size;
+                               MONO_ADD_INS (cfg->cbb, arg);
+                       }
                }
        }
 }
@@ -2193,11 +2089,14 @@ mono_arch_emit_outarg_vt (MonoCompile *cfg, MonoInst *ins, MonoInst *src)
 void
 mono_arch_emit_setret (MonoCompile *cfg, MonoMethod *method, MonoInst *val)
 {
-       MonoType *ret = mono_type_get_underlying_type (mono_method_signature (method)->ret);
+       MonoType *ret = mini_type_get_underlying_type (NULL, mono_method_signature (method)->ret);
 
        if (!ret->byref) {
                if (ret->type == MONO_TYPE_R4) {
-                       MONO_EMIT_NEW_UNALU (cfg, OP_AMD64_SET_XMMREG_R4, cfg->ret->dreg, val->dreg);
+                       if (COMPILE_LLVM (cfg))
+                               MONO_EMIT_NEW_UNALU (cfg, OP_FMOVE, cfg->ret->dreg, val->dreg);
+                       else
+                               MONO_EMIT_NEW_UNALU (cfg, OP_AMD64_SET_XMMREG_R4, cfg->ret->dreg, val->dreg);
                        return;
                } else if (ret->type == MONO_TYPE_R8) {
                        MONO_EMIT_NEW_UNALU (cfg, OP_FMOVE, cfg->ret->dreg, val->dreg);
@@ -2209,28 +2108,15 @@ mono_arch_emit_setret (MonoCompile *cfg, MonoMethod *method, MonoInst *val)
 }
 
 #define EMIT_COND_BRANCH(ins,cond,sign) \
-if (ins->flags & MONO_INST_BRLABEL) { \
-        if (ins->inst_i0->inst_c0) { \
-               x86_branch (code, cond, cfg->native_code + ins->inst_i0->inst_c0, sign); \
-        } else { \
-               mono_add_patch_info (cfg, code - cfg->native_code, MONO_PATCH_INFO_LABEL, ins->inst_i0); \
-               if ((cfg->opt & MONO_OPT_BRANCH) && \
-                    x86_is_imm8 (ins->inst_i0->inst_c1 - cpos)) \
-                       x86_branch8 (code, cond, 0, sign); \
-                else \
-                       x86_branch32 (code, cond, 0, sign); \
-        } \
-} else { \
         if (ins->inst_true_bb->native_offset) { \
                x86_branch (code, cond, cfg->native_code + ins->inst_true_bb->native_offset, sign); \
         } else { \
                mono_add_patch_info (cfg, code - cfg->native_code, MONO_PATCH_INFO_BB, ins->inst_true_bb); \
                if ((cfg->opt & MONO_OPT_BRANCH) && \
-                    x86_is_imm8 (ins->inst_true_bb->max_offset - cpos)) \
+            x86_is_imm8 (ins->inst_true_bb->max_offset - offset)) \
                        x86_branch8 (code, cond, 0, sign); \
                 else \
                        x86_branch32 (code, cond, 0, sign); \
-        } \
 }
 
 /* emit an exception if condition is fail */
@@ -2303,9 +2189,13 @@ emit_call_body (MonoCompile *cfg, guint8 *code, guint32 patch_type, gconstpointe
                        }
                }
                else {
-                       if (mono_find_class_init_trampoline_by_addr (data))
+                       if (cfg->abs_patches && g_hash_table_lookup (cfg->abs_patches, data)) {
+                               /* 
+                                * This is not really an optimization, but required because the
+                                * generic class init trampolines use R11 to pass the vtable.
+                                */
                                near_call = TRUE;
-                       else {
+                       } else {
                                MonoJitICallInfo *info = mono_find_jit_icall_by_addr (data);
                                if (info) {
                                        if ((cfg->method->wrapper_type == MONO_WRAPPER_MANAGED_TO_NATIVE) && 
@@ -2337,8 +2227,10 @@ emit_call_body (MonoCompile *cfg, guint8 *code, guint32 patch_type, gconstpointe
                        /* These methods are allocated using malloc */
                        near_call = FALSE;
 
-               if (cfg->compile_aot)
+               if (cfg->compile_aot) {
                        near_call = TRUE;
+                       no_patch = TRUE;
+               }
 
 #ifdef MONO_ARCH_NOMAP32BIT
                near_call = FALSE;
@@ -2577,9 +2469,6 @@ mono_arch_lowering_pass (MonoCompile *cfg, MonoBasicBlock *bb)
 {
        MonoInst *ins, *n, *temp;
 
-       if (bb->max_vreg > cfg->rs->next_vreg)
-               cfg->rs->next_vreg = bb->max_vreg;
-
        /*
         * FIXME: Need to add more instructions, but the current machine 
         * description can't model some parts of the composite instructions like
@@ -2590,20 +2479,21 @@ mono_arch_lowering_pass (MonoCompile *cfg, MonoBasicBlock *bb)
                case OP_DIV_IMM:
                case OP_REM_IMM:
                case OP_IDIV_IMM:
-               case OP_IREM_IMM:
                case OP_IDIV_UN_IMM:
                case OP_IREM_UN_IMM:
                        mono_decompose_op_imm (cfg, bb, ins);
                        break;
+               case OP_IREM_IMM:
+                       /* Keep the opcode if we can implement it efficiently */
+                       if (!((ins->inst_imm > 0) && (mono_is_power_of_two (ins->inst_imm) != -1)))
+                               mono_decompose_op_imm (cfg, bb, ins);
+                       break;
                case OP_COMPARE_IMM:
                case OP_LCOMPARE_IMM:
                        if (!amd64_is_imm32 (ins->inst_imm)) {
                                NEW_INS (cfg, ins, temp, OP_I8CONST);
                                temp->inst_c0 = ins->inst_imm;
-                               if (cfg->globalra)
-                                       temp->dreg = mono_alloc_ireg (cfg);
-                               else
-                                       temp->dreg = mono_regstate_next_int (cfg->rs);
+                               temp->dreg = mono_alloc_ireg (cfg);
                                ins->opcode = OP_COMPARE;
                                ins->sreg2 = temp->dreg;
                        }
@@ -2613,10 +2503,7 @@ mono_arch_lowering_pass (MonoCompile *cfg, MonoBasicBlock *bb)
                        if (!amd64_is_imm32 (ins->inst_offset)) {
                                NEW_INS (cfg, ins, temp, OP_I8CONST);
                                temp->inst_c0 = ins->inst_offset;
-                               if (cfg->globalra)
-                                       temp->dreg = mono_alloc_ireg (cfg);
-                               else
-                                       temp->dreg = mono_regstate_next_int (cfg->rs);
+                               temp->dreg = mono_alloc_ireg (cfg);
                                ins->opcode = OP_AMD64_LOADI8_MEMINDEX;
                                ins->inst_indexreg = temp->dreg;
                        }
@@ -2626,20 +2513,41 @@ mono_arch_lowering_pass (MonoCompile *cfg, MonoBasicBlock *bb)
                        if (!amd64_is_imm32 (ins->inst_imm)) {
                                NEW_INS (cfg, ins, temp, OP_I8CONST);
                                temp->inst_c0 = ins->inst_imm;
-                               if (cfg->globalra)
-                                       temp->dreg = mono_alloc_ireg (cfg);
-                               else
-                                       temp->dreg = mono_regstate_next_int (cfg->rs);
+                               temp->dreg = mono_alloc_ireg (cfg);
                                ins->opcode = OP_STOREI8_MEMBASE_REG;
                                ins->sreg1 = temp->dreg;
                        }
                        break;
+#ifdef MONO_ARCH_SIMD_INTRINSICS
+               case OP_EXPAND_I1: {
+                               int temp_reg1 = mono_alloc_ireg (cfg);
+                               int temp_reg2 = mono_alloc_ireg (cfg);
+                               int original_reg = ins->sreg1;
+
+                               NEW_INS (cfg, ins, temp, OP_ICONV_TO_U1);
+                               temp->sreg1 = original_reg;
+                               temp->dreg = temp_reg1;
+
+                               NEW_INS (cfg, ins, temp, OP_SHL_IMM);
+                               temp->sreg1 = temp_reg1;
+                               temp->dreg = temp_reg2;
+                               temp->inst_imm = 8;
+
+                               NEW_INS (cfg, ins, temp, OP_LOR);
+                               temp->sreg1 = temp->dreg = temp_reg2;
+                               temp->sreg2 = temp_reg1;
+
+                               ins->opcode = OP_EXPAND_I2;
+                               ins->sreg1 = temp_reg2;
+                       }
+                       break;
+#endif
                default:
                        break;
                }
        }
 
-       bb->max_vreg = cfg->rs->next_vreg;
+       bb->max_vreg = cfg->next_vreg;
 }
 
 static const int 
@@ -2677,7 +2585,7 @@ emit_float_to_int (MonoCompile *cfg, guchar *code, int dreg, int sreg, int size,
 }
 
 static unsigned char*
-mono_emit_stack_alloc (guchar *code, MonoInst* tree)
+mono_emit_stack_alloc (MonoCompile *cfg, guchar *code, MonoInst* tree)
 {
        int sreg = tree->sreg1;
        int need_touch = FALSE;
@@ -2749,6 +2657,8 @@ mono_emit_stack_alloc (guchar *code, MonoInst* tree)
                amd64_alu_reg_reg (code, X86_XOR, AMD64_RAX, AMD64_RAX);
                                
                amd64_lea_membase (code, AMD64_RDI, AMD64_RSP, offset);
+               if (cfg->param_area && cfg->arch.no_pushes)
+                       amd64_alu_reg_imm (code, X86_ADD, AMD64_RDI, cfg->param_area);
                amd64_cld (code);
                amd64_prefix (code, X86_REP_PREFIX);
                amd64_stosl (code);
@@ -2830,18 +2740,19 @@ emit_move_return_value (MonoCompile *cfg, MonoInst *ins, guint8 *code)
 }
 
 /*
- * emit_tls_get:
+ * mono_amd64_emit_tls_get:
  * @code: buffer to store code to
  * @dreg: hard register where to place the result
  * @tls_offset: offset info
  *
- * emit_tls_get emits in @code the native code that puts in the dreg register
- * the item in the thread local storage identified by tls_offset.
+ * mono_amd64_emit_tls_get emits in @code the native code that puts in
+ * the dreg register the item in the thread local storage identified
+ * by tls_offset.
  *
  * Returns: a pointer to the end of the stored code
  */
-static guint8*
-emit_tls_get (guint8* code, int dreg, int tls_offset)
+guint8*
+mono_amd64_emit_tls_get (guint8* code, int dreg, int tls_offset)
 {
 #ifdef PLATFORM_WIN32
        g_assert (tls_offset < 64);
@@ -2860,93 +2771,6 @@ emit_tls_get (guint8* code, int dreg, int tls_offset)
        return code;
 }
 
-/*
- * emit_load_volatile_arguments:
- *
- *  Load volatile arguments from the stack to the original input registers.
- * Required before a tail call.
- */
-static guint8*
-emit_load_volatile_arguments (MonoCompile *cfg, guint8 *code)
-{
-       MonoMethod *method = cfg->method;
-       MonoMethodSignature *sig;
-       MonoInst *ins;
-       CallInfo *cinfo;
-       guint32 i, quad;
-
-       /* FIXME: Generate intermediate code instead */
-
-       sig = mono_method_signature (method);
-
-       cinfo = cfg->arch.cinfo;
-       
-       /* This is the opposite of the code in emit_prolog */
-       if (sig->ret->type != MONO_TYPE_VOID) {
-               if (cfg->vret_addr && (cfg->vret_addr->opcode != OP_REGVAR))
-                       amd64_mov_reg_membase (code, cinfo->ret.reg, cfg->vret_addr->inst_basereg, cfg->vret_addr->inst_offset, 8);
-       }
-
-       for (i = 0; i < sig->param_count + sig->hasthis; ++i) {
-               ArgInfo *ainfo = cinfo->args + i;
-               MonoType *arg_type;
-               ins = cfg->args [i];
-
-               if (sig->hasthis && (i == 0))
-                       arg_type = &mono_defaults.object_class->byval_arg;
-               else
-                       arg_type = sig->params [i - sig->hasthis];
-
-               if (ins->opcode != OP_REGVAR) {
-                       switch (ainfo->storage) {
-                       case ArgInIReg: {
-                               guint32 size = 8;
-
-                               /* FIXME: I1 etc */
-                               amd64_mov_reg_membase (code, ainfo->reg, ins->inst_basereg, ins->inst_offset, size);
-                               break;
-                       }
-                       case ArgInFloatSSEReg:
-                               amd64_movss_reg_membase (code, ainfo->reg, ins->inst_basereg, ins->inst_offset);
-                               break;
-                       case ArgInDoubleSSEReg:
-                               amd64_movsd_reg_membase (code, ainfo->reg, ins->inst_basereg, ins->inst_offset);
-                               break;
-                       case ArgValuetypeInReg:
-                               for (quad = 0; quad < 2; quad ++) {
-                                       switch (ainfo->pair_storage [quad]) {
-                                       case ArgInIReg:
-                                               amd64_mov_reg_membase (code, ainfo->pair_regs [quad], ins->inst_basereg, ins->inst_offset + (quad * sizeof (gpointer)), sizeof (gpointer));
-                                               break;
-                                       case ArgInFloatSSEReg:
-                                       case ArgInDoubleSSEReg:
-                                               g_assert_not_reached ();
-                                               break;
-                                       case ArgNone:
-                                               break;
-                                       default:
-                                               g_assert_not_reached ();
-                                       }
-                               }
-                               break;
-                       case ArgValuetypeAddrInIReg:
-                               if (ainfo->pair_storage [0] == ArgInIReg)
-                                       amd64_mov_reg_membase (code, ainfo->pair_regs [0], ins->inst_left->inst_basereg, ins->inst_left->inst_offset,  sizeof (gpointer));
-                               break;
-                       default:
-                               break;
-                       }
-               }
-               else {
-                       g_assert (ainfo->storage == ArgInIReg);
-
-                       amd64_mov_reg_reg (code, ainfo->reg, ins->dreg, 8);
-               }
-       }
-
-       return code;
-}
-
 #define REAL_PRINT_REG(text,reg) \
 mono_assert (reg >= 0); \
 amd64_push_reg (code, AMD64_RAX); \
@@ -2966,6 +2790,8 @@ amd64_pop_reg (code, AMD64_RAX);
 #define LOOP_ALIGNMENT 8
 #define bb_is_loop_start(bb) ((bb)->loop_body_start && (bb)->nesting)
 
+#ifndef DISABLE_JIT
+
 void
 mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
 {
@@ -2975,7 +2801,17 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
        guint8 *code = cfg->native_code + cfg->code_len;
        MonoInst *last_ins = NULL;
        guint last_offset = 0;
-       int max_len, cpos;
+       int max_len;
+
+       /* Fix max_offset estimate for each successor bb */
+       if (cfg->opt & MONO_OPT_BRANCH) {
+               int current_offset = cfg->code_len;
+               MonoBasicBlock *current_bb;
+               for (current_bb = bb; current_bb != NULL; current_bb = current_bb->next_bb) {
+                       current_bb->max_offset = current_offset;
+                       current_offset += current_bb->max_length;
+               }
+       }
 
        if (cfg->opt & MONO_OPT_LOOP) {
                int pad, align = LOOP_ALIGNMENT;
@@ -2992,12 +2828,9 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
        if (cfg->verbose_level > 2)
                g_print ("Basic block %d starting at offset 0x%x\n", bb->block_num, bb->native_offset);
 
-       cpos = bb->max_offset;
-
        if (cfg->prof_options & MONO_PROFILE_COVERAGE) {
                MonoProfileCoverageInfo *cov = cfg->coverage_info;
                g_assert (!cfg->compile_aot);
-               cpos += 6;
 
                cov->data [bb->dfn].cil_code = bb->cil_code;
                amd64_mov_reg_imm (code, AMD64_R11, (guint64)&cov->data [bb->dfn].count);
@@ -3009,6 +2842,9 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
 
        mono_debug_open_block (cfg, bb, offset);
 
+    if (mono_break_at_bb_method && mono_method_desc_full_match (mono_break_at_bb_method, cfg->method) && bb->block_num == mono_break_at_bb_bb_num)
+               x86_breakpoint (code);
+
        MONO_BB_FOR_EACH_INS (bb, ins) {
                offset = code - cfg->native_code;
 
@@ -3077,15 +2913,10 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        break;
                case OP_LOADU4_MEM:
                        // FIXME: Decompose this earlier
-                       if (cfg->new_ir) {
-                               if (amd64_is_imm32 (ins->inst_imm))
-                                       amd64_mov_reg_mem (code, ins->dreg, ins->inst_imm, 4);
-                               else {
-                                       amd64_mov_reg_imm (code, ins->dreg, ins->inst_imm);
-                                       amd64_mov_reg_membase (code, ins->dreg, ins->dreg, 0, 4);
-                               }
-                       } else {
-                               amd64_mov_reg_imm (code, ins->dreg, ins->inst_p0);
+                       if (amd64_is_imm32 (ins->inst_imm))
+                               amd64_mov_reg_mem (code, ins->dreg, ins->inst_imm, 4);
+                       else {
+                               amd64_mov_reg_imm (code, ins->dreg, ins->inst_imm);
                                amd64_mov_reg_membase (code, ins->dreg, ins->dreg, 0, 4);
                        }
                        break;
@@ -3315,6 +3146,13 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                case OP_BREAK:
                        amd64_breakpoint (code);
                        break;
+               case OP_RELAXED_NOP:
+                       x86_prefix (code, X86_REP_PREFIX);
+                       x86_nop (code);
+                       break;
+               case OP_HARD_NOP:
+                       x86_nop (code);
+                       break;
                case OP_NOP:
                case OP_DUMMY_USE:
                case OP_DUMMY_STORE:
@@ -3472,6 +3310,32 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                                amd64_div_reg_size (code, ins->sreg2, FALSE, 4);
                        }
                        break;
+               case OP_IREM_IMM: {
+                       int power = mono_is_power_of_two (ins->inst_imm);
+
+                       g_assert (ins->sreg1 == X86_EAX);
+                       g_assert (ins->dreg == X86_EAX);
+                       g_assert (power >= 0);
+
+                       if (power == 0) {
+                               amd64_mov_reg_imm (code, ins->dreg, 0);
+                               break;
+                       }
+
+                       /* Based on gcc code */
+
+                       /* Add compensation for negative dividends */
+                       amd64_mov_reg_reg_size (code, AMD64_RDX, AMD64_RAX, 4);
+                       if (power > 1)
+                               amd64_shift_reg_imm_size (code, X86_SAR, AMD64_RDX, 31, 4);
+                       amd64_shift_reg_imm_size (code, X86_SHR, AMD64_RDX, 32 - power, 4);
+                       amd64_alu_reg_reg_size (code, X86_ADD, AMD64_RAX, AMD64_RDX, 4);
+                       /* Compute remainder */
+                       amd64_alu_reg_imm_size (code, X86_AND, AMD64_RAX, (1 << power) - 1, 4);
+                       /* Remove compensation */
+                       amd64_alu_reg_reg_size (code, X86_SUB, AMD64_RAX, AMD64_RDX, 4);
+                       break;
+               }
                case OP_LMUL_OVF:
                        amd64_imul_reg_reg (code, ins->sreg1, ins->sreg2);
                        EMIT_COND_SYSTEM_EXCEPTION (X86_CC_O, FALSE, "OverflowException");
@@ -3735,7 +3599,6 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                                amd64_sse_movsd_reg_reg (code, ins->dreg, ins->sreg1);
                        break;
                }
-               case OP_JMP:
                case OP_TAILCALL: {
                        /*
                         * Note: this 'frame destruction' logic is useful for tail calls, too.
@@ -3745,13 +3608,10 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
 
                        /* FIXME: no tracing support... */
                        if (cfg->prof_options & MONO_PROFILE_ENTER_LEAVE)
-                               code = mono_arch_instrument_epilog (cfg, mono_profiler_method_leave, code, FALSE);
+                               code = mono_arch_instrument_epilog_full (cfg, mono_profiler_method_leave, code, FALSE, FALSE);
 
                        g_assert (!cfg->method->save_lmf);
 
-                       if (ins->opcode == OP_JMP)
-                               code = emit_load_volatile_arguments (cfg, code);
-
                        if (cfg->arch.omit_fp) {
                                guint32 save_offset = 0;
                                /* Pop callee-saved registers */
@@ -3830,7 +3690,7 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                                code = emit_call (cfg, code, MONO_PATCH_INFO_METHOD, call->method, FALSE);
                        else
                                code = emit_call (cfg, code, MONO_PATCH_INFO_ABS, call->fptr, FALSE);
-                       if (call->stack_usage && !CALLCONV_IS_STDCALL (call->signature->call_convention))
+                       if (call->stack_usage && !CALLCONV_IS_STDCALL (call->signature->call_convention) && !cfg->arch.no_pushes)
                                amd64_alu_reg_imm (code, X86_ADD, AMD64_RSP, call->stack_usage);
                        code = emit_move_return_value (cfg, ins, code);
                        break;
@@ -3878,7 +3738,7 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        }
 
                        amd64_call_reg (code, ins->sreg1);
-                       if (call->stack_usage && !CALLCONV_IS_STDCALL (call->signature->call_convention))
+                       if (call->stack_usage && !CALLCONV_IS_STDCALL (call->signature->call_convention) && !cfg->arch.no_pushes)
                                amd64_alu_reg_imm (code, X86_ADD, AMD64_RSP, call->stack_usage);
                        code = emit_move_return_value (cfg, ins, code);
                        break;
@@ -3899,52 +3759,40 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                                ins->sreg1 = AMD64_RAX;
                        }
 
-                       if (call->method && ins->inst_offset < 0) {
-                               gssize val;
-
-                               /* 
-                                * This is a possible IMT call so save the IMT method in the proper
-                                * register. We don't use the generic code in method-to-ir.c, because
-                                * we need to disassemble this in get_vcall_slot_addr (), so we have to
-                                * maintain control over the layout of the code.
-                                * Also put the base reg in %rax to simplify find_imt_method ().
-                                */
-                               if (ins->sreg1 != AMD64_RAX) {
-                                       amd64_mov_reg_reg (code, AMD64_RAX, ins->sreg1, 8);
-                                       ins->sreg1 = AMD64_RAX;
-                               }
-                               val = (gssize)(gpointer)call->method;
-
-                               // FIXME: Generics sharing
-#if 0
-                               if ((((guint64)val) >> 32) == 0)
-                                       amd64_mov_reg_imm_size (code, MONO_ARCH_IMT_REG, val, 4);
-                               else
-                                       amd64_mov_reg_imm_size (code, MONO_ARCH_IMT_REG, val, 8);
-#endif
-                       }
+                       /* 
+                        * Emit a few nops to simplify get_vcall_slot ().
+                        */
+                       amd64_nop (code);
+                       amd64_nop (code);
+                       amd64_nop (code);
 
                        amd64_call_membase (code, ins->sreg1, ins->inst_offset);
-                       if (call->stack_usage && !CALLCONV_IS_STDCALL (call->signature->call_convention))
+                       if (call->stack_usage && !CALLCONV_IS_STDCALL (call->signature->call_convention) && !cfg->arch.no_pushes)
                                amd64_alu_reg_imm (code, X86_ADD, AMD64_RSP, call->stack_usage);
                        code = emit_move_return_value (cfg, ins, code);
                        break;
                case OP_AMD64_SAVE_SP_TO_LMF:
                        amd64_mov_membase_reg (code, cfg->frame_reg, cfg->arch.lmf_offset + G_STRUCT_OFFSET (MonoLMF, rsp), AMD64_RSP, 8);
                        break;
-               case OP_OUTARG:
                case OP_X86_PUSH:
+                       g_assert (!cfg->arch.no_pushes);
                        amd64_push_reg (code, ins->sreg1);
                        break;
                case OP_X86_PUSH_IMM:
+                       g_assert (!cfg->arch.no_pushes);
                        g_assert (amd64_is_imm32 (ins->inst_imm));
                        amd64_push_imm (code, ins->inst_imm);
                        break;
                case OP_X86_PUSH_MEMBASE:
+                       g_assert (!cfg->arch.no_pushes);
                        amd64_push_membase (code, ins->inst_basereg, ins->inst_offset);
                        break;
-               case OP_X86_PUSH_OBJ: 
-                       amd64_alu_reg_imm (code, X86_SUB, AMD64_RSP, ins->inst_imm);
+               case OP_X86_PUSH_OBJ: {
+                       int size = ALIGN_TO (ins->inst_imm, 8);
+
+                       g_assert (!cfg->arch.no_pushes);
+
+                       amd64_alu_reg_imm (code, X86_SUB, AMD64_RSP, size);
                        amd64_push_reg (code, AMD64_RDI);
                        amd64_push_reg (code, AMD64_RSI);
                        amd64_push_reg (code, AMD64_RCX);
@@ -3952,8 +3800,8 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                                amd64_lea_membase (code, AMD64_RSI, ins->inst_basereg, ins->inst_offset);
                        else
                                amd64_mov_reg_reg (code, AMD64_RSI, ins->inst_basereg, 8);
-                       amd64_lea_membase (code, AMD64_RDI, AMD64_RSP, 3 * 8);
-                       amd64_mov_reg_imm (code, AMD64_RCX, (ins->inst_imm >> 3));
+                       amd64_lea_membase (code, AMD64_RDI, AMD64_RSP, (3 * 8));
+                       amd64_mov_reg_imm (code, AMD64_RCX, (size >> 3));
                        amd64_cld (code);
                        amd64_prefix (code, X86_REP_PREFIX);
                        amd64_movsd (code);
@@ -3961,6 +3809,7 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        amd64_pop_reg (code, AMD64_RSI);
                        amd64_pop_reg (code, AMD64_RDI);
                        break;
+               }
                case OP_X86_LEA:
                        amd64_lea_memindex (code, ins->dreg, ins->sreg1, ins->inst_imm, ins->sreg2, ins->backend.shift_amount);
                        break;
@@ -3974,8 +3823,10 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        /* keep alignment */
                        amd64_alu_reg_imm (code, X86_ADD, ins->sreg1, MONO_ARCH_FRAME_ALIGNMENT - 1);
                        amd64_alu_reg_imm (code, X86_AND, ins->sreg1, ~(MONO_ARCH_FRAME_ALIGNMENT - 1));
-                       code = mono_emit_stack_alloc (code, ins);
+                       code = mono_emit_stack_alloc (cfg, code, ins);
                        amd64_mov_reg_reg (code, ins->dreg, AMD64_RSP, 8);
+                       if (cfg->param_area && cfg->arch.no_pushes)
+                               amd64_alu_reg_imm (code, X86_ADD, ins->dreg, cfg->param_area);
                        break;
                case OP_LOCALLOC_IMM: {
                        guint32 size = ins->inst_imm;
@@ -3995,13 +3846,15 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                                        amd64_mov_reg_imm (code, ins->dreg, size);
                                        ins->sreg1 = ins->dreg;
 
-                                       code = mono_emit_stack_alloc (code, ins);
+                                       code = mono_emit_stack_alloc (cfg, code, ins);
                                        amd64_mov_reg_reg (code, ins->dreg, AMD64_RSP, 8);
                                }
                        } else {
                                amd64_alu_reg_imm (code, X86_SUB, AMD64_RSP, size);
                                amd64_mov_reg_reg (code, ins->dreg, AMD64_RSP, 8);
                        }
+                       if (cfg->param_area && cfg->arch.no_pushes)
+                               amd64_alu_reg_imm (code, X86_ADD, ins->dreg, cfg->param_area);
                        break;
                }
                case OP_THROW: {
@@ -4027,6 +3880,12 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                case OP_START_HANDLER: {
                        MonoInst *spvar = mono_find_spvar_for_region (cfg, bb->region);
                        amd64_mov_membase_reg (code, spvar->inst_basereg, spvar->inst_offset, AMD64_RSP, 8);
+
+                       if ((MONO_BBLOCK_IS_IN_REGION (bb, MONO_REGION_FINALLY) ||
+                                MONO_BBLOCK_IS_IN_REGION (bb, MONO_REGION_FINALLY)) &&
+                               cfg->param_area && cfg->arch.no_pushes) {
+                               amd64_alu_reg_imm (code, X86_SUB, AMD64_RSP, ALIGN_TO (cfg->param_area, MONO_ARCH_FRAME_ALIGNMENT));
+                       }
                        break;
                }
                case OP_ENDFINALLY: {
@@ -4050,28 +3909,15 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        //g_print ("target: %p, next: %p, curr: %p, last: %p\n", ins->inst_target_bb, bb->next_bb, ins, bb->last_ins);
                        //if ((ins->inst_target_bb == bb->next_bb) && ins == bb->last_ins)
                        //break;
-                       if (ins->flags & MONO_INST_BRLABEL) {
-                               if (ins->inst_i0->inst_c0) {
-                                       amd64_jump_code (code, cfg->native_code + ins->inst_i0->inst_c0);
-                               } else {
-                                       mono_add_patch_info (cfg, offset, MONO_PATCH_INFO_LABEL, ins->inst_i0);
-                                       if ((cfg->opt & MONO_OPT_BRANCH) &&
-                                           x86_is_imm8 (ins->inst_i0->inst_c1 - cpos))
-                                               x86_jump8 (code, 0);
-                                       else 
-                                               x86_jump32 (code, 0);
-                               }
-                       } else {
                                if (ins->inst_target_bb->native_offset) {
                                        amd64_jump_code (code, cfg->native_code + ins->inst_target_bb->native_offset); 
                                } else {
                                        mono_add_patch_info (cfg, offset, MONO_PATCH_INFO_BB, ins->inst_target_bb);
                                        if ((cfg->opt & MONO_OPT_BRANCH) &&
-                                           x86_is_imm8 (ins->inst_target_bb->max_offset - cpos))
+                                           x86_is_imm8 (ins->inst_target_bb->max_offset - offset))
                                                x86_jump8 (code, 0);
                                        else 
                                                x86_jump32 (code, 0);
-                               } 
                        }
                        break;
                case OP_BR_REG:
@@ -4537,7 +4383,7 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        amd64_alu_reg_imm (code, X86_ADD, AMD64_RSP, 16);
                        break;
                case OP_TLS_GET: {
-                       code = emit_tls_get (code, ins->dreg, ins->inst_offset);
+                       code = mono_amd64_emit_tls_get (code, ins->dreg, ins->inst_offset);
                        break;
                }
                case OP_MEMORY_BARRIER: {
@@ -4583,8 +4429,7 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        break;
                }
                case OP_ATOMIC_EXCHANGE_I4:
-               case OP_ATOMIC_EXCHANGE_I8:
-               case OP_ATOMIC_CAS_IMM_I4: {
+               case OP_ATOMIC_EXCHANGE_I8: {
                        guchar *br[2];
                        int sreg2 = ins->sreg2;
                        int breg = ins->inst_basereg;
@@ -4631,94 +4476,695 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                                }
                        }
 
-                       if (ins->opcode == OP_ATOMIC_CAS_IMM_I4) {
-                               if (ins->backend.data == NULL)
-                                       amd64_alu_reg_reg (code, X86_XOR, AMD64_RAX, AMD64_RAX);
-                               else
-                                       amd64_mov_reg_imm (code, AMD64_RAX, ins->backend.data);
-
-                               amd64_prefix (code, X86_LOCK_PREFIX);
-                               amd64_cmpxchg_membase_reg_size (code, breg, ins->inst_offset, sreg2, size);
-                       } else {
-                               amd64_mov_reg_membase (code, AMD64_RAX, breg, ins->inst_offset, size);
+                       amd64_mov_reg_membase (code, AMD64_RAX, breg, ins->inst_offset, size);
 
-                               br [0] = code; amd64_prefix (code, X86_LOCK_PREFIX);
-                               amd64_cmpxchg_membase_reg_size (code, breg, ins->inst_offset, sreg2, size);
-                               br [1] = code; amd64_branch8 (code, X86_CC_NE, -1, FALSE);
-                               amd64_patch (br [1], br [0]);
-                       }
+                       br [0] = code; amd64_prefix (code, X86_LOCK_PREFIX);
+                       amd64_cmpxchg_membase_reg_size (code, breg, ins->inst_offset, sreg2, size);
+                       br [1] = code; amd64_branch8 (code, X86_CC_NE, -1, FALSE);
+                       amd64_patch (br [1], br [0]);
 
                        if (rdx_pushed)
                                amd64_pop_reg (code, AMD64_RDX);
 
                        break;
                }
-               default:
-                       g_warning ("unknown opcode %s in %s()\n", mono_inst_name (ins->opcode), __FUNCTION__);
-                       g_assert_not_reached ();
-               }
-
-               if ((code - cfg->native_code - offset) > max_len) {
-                       g_warning ("wrong maximal instruction length of instruction %s (expected %d, got %ld)",
-                                  mono_inst_name (ins->opcode), max_len, code - cfg->native_code - offset);
-                       g_assert_not_reached ();
-               }
-              
-               cpos += max_len;
-
-               last_ins = ins;
-               last_offset = offset;
-       }
-
-       cfg->code_len = code - cfg->native_code;
-}
-
-void
-mono_arch_register_lowlevel_calls (void)
-{
-       /* The signature doesn't matter */
-       mono_register_jit_icall (mono_amd64_throw_exception, "mono_amd64_throw_exception", mono_create_icall_signature ("void"), TRUE);
-}
+               case OP_ATOMIC_CAS_I4:
+               case OP_ATOMIC_CAS_I8: {
+                       guint32 size;
 
-void
-mono_arch_patch_code (MonoMethod *method, MonoDomain *domain, guint8 *code, MonoJumpInfo *ji, gboolean run_cctors)
-{
-       MonoJumpInfo *patch_info;
-       gboolean compile_aot = !run_cctors;
+                       if (ins->opcode == OP_ATOMIC_CAS_I8)
+                               size = 8;
+                       else
+                               size = 4;
 
-       for (patch_info = ji; patch_info; patch_info = patch_info->next) {
-               unsigned char *ip = patch_info->ip.i + code;
-               unsigned char *target;
+                       /* 
+                        * See http://msdn.microsoft.com/en-us/magazine/cc302329.aspx for
+                        * an explanation of how this works.
+                        */
+                       g_assert (ins->sreg3 == AMD64_RAX);
+                       g_assert (ins->sreg1 != AMD64_RAX);
+                       g_assert (ins->sreg1 != ins->sreg2);
 
-               target = mono_resolve_patch_target (method, domain, code, patch_info, run_cctors);
+                       amd64_prefix (code, X86_LOCK_PREFIX);
+                       amd64_cmpxchg_membase_reg_size (code, ins->sreg1, ins->inst_offset, ins->sreg2, size);
 
-               if (compile_aot) {
-                       switch (patch_info->type) {
-                       case MONO_PATCH_INFO_BB:
-                       case MONO_PATCH_INFO_LABEL:
-                               break;
-                       default:
-                               /* No need to patch these */
-                               continue;
-                       }
+                       if (ins->dreg != AMD64_RAX)
+                               amd64_mov_reg_reg (code, ins->dreg, AMD64_RAX, size);
+                       break;
                }
+#ifdef MONO_ARCH_SIMD_INTRINSICS
+               /* TODO: Some of these IR opcodes are marked as no-clobber when they do in fact clobber. */
+               case OP_ADDPS:
+                       amd64_sse_addps_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_DIVPS:
+                       amd64_sse_divps_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_MULPS:
+                       amd64_sse_mulps_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_SUBPS:
+                       amd64_sse_subps_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_MAXPS:
+                       amd64_sse_maxps_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_MINPS:
+                       amd64_sse_minps_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_COMPPS:
+                       g_assert (ins->inst_c0 >= 0 && ins->inst_c0 <= 7);
+                       amd64_sse_cmpps_reg_reg_imm (code, ins->sreg1, ins->sreg2, ins->inst_c0);
+                       break;
+               case OP_ANDPS:
+                       amd64_sse_andps_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_ANDNPS:
+                       amd64_sse_andnps_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_ORPS:
+                       amd64_sse_orps_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_XORPS:
+                       amd64_sse_xorps_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_SQRTPS:
+                       amd64_sse_sqrtps_reg_reg (code, ins->dreg, ins->sreg1);
+                       break;
+               case OP_RSQRTPS:
+                       amd64_sse_rsqrtps_reg_reg (code, ins->dreg, ins->sreg1);
+                       break;
+               case OP_RCPPS:
+                       amd64_sse_rcpps_reg_reg (code, ins->dreg, ins->sreg1);
+                       break;
+               case OP_ADDSUBPS:
+                       amd64_sse_addsubps_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_HADDPS:
+                       amd64_sse_haddps_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_HSUBPS:
+                       amd64_sse_hsubps_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_DUPPS_HIGH:
+                       amd64_sse_movshdup_reg_reg (code, ins->dreg, ins->sreg1);
+                       break;
+               case OP_DUPPS_LOW:
+                       amd64_sse_movsldup_reg_reg (code, ins->dreg, ins->sreg1);
+                       break;
 
-               switch (patch_info->type) {
-               case MONO_PATCH_INFO_NONE:
-                       continue;
-               case MONO_PATCH_INFO_METHOD_REL:
-               case MONO_PATCH_INFO_R8:
-               case MONO_PATCH_INFO_R4:
-                       g_assert_not_reached ();
-                       continue;
-               case MONO_PATCH_INFO_BB:
+               case OP_PSHUFLEW_HIGH:
+                       g_assert (ins->inst_c0 >= 0 && ins->inst_c0 <= 0xFF);
+                       amd64_sse_pshufhw_reg_reg_imm (code, ins->dreg, ins->sreg1, ins->inst_c0);
                        break;
-               default:
+               case OP_PSHUFLEW_LOW:
+                       g_assert (ins->inst_c0 >= 0 && ins->inst_c0 <= 0xFF);
+                       amd64_sse_pshuflw_reg_reg_imm (code, ins->dreg, ins->sreg1, ins->inst_c0);
+                       break;
+               case OP_PSHUFLED:
+                       g_assert (ins->inst_c0 >= 0 && ins->inst_c0 <= 0xFF);
+                       amd64_sse_pshufd_reg_reg_imm (code, ins->dreg, ins->sreg1, ins->inst_c0);
                        break;
-               }
 
-               /* 
-                * Debug code to help track down problems where the target of a near call is
+               case OP_ADDPD:
+                       amd64_sse_addpd_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_DIVPD:
+                       amd64_sse_divpd_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_MULPD:
+                       amd64_sse_mulpd_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_SUBPD:
+                       amd64_sse_subpd_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_MAXPD:
+                       amd64_sse_maxpd_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_MINPD:
+                       amd64_sse_minpd_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_COMPPD:
+                       g_assert (ins->inst_c0 >= 0 && ins->inst_c0 <= 7);
+                       amd64_sse_cmppd_reg_reg_imm (code, ins->sreg1, ins->sreg2, ins->inst_c0);
+                       break;
+               case OP_ANDPD:
+                       amd64_sse_andpd_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_ANDNPD:
+                       amd64_sse_andnpd_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_ORPD:
+                       amd64_sse_orpd_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_XORPD:
+                       amd64_sse_xorpd_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               /* TODO: This op is in the AMD64 manual but has not been implemented.
+               case OP_SQRTPD:
+                       amd64_sse_sqrtpd_reg_reg (code, ins->dreg, ins->sreg1);
+                       break;
+               */
+               case OP_ADDSUBPD:
+                       amd64_sse_addsubpd_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_HADDPD:
+                       amd64_sse_haddpd_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_HSUBPD:
+                       amd64_sse_hsubpd_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_DUPPD:
+                       amd64_sse_movddup_reg_reg (code, ins->dreg, ins->sreg1);
+                       break;
+
+               case OP_EXTRACT_MASK:
+                       amd64_sse_pmovmskb_reg_reg (code, ins->dreg, ins->sreg1);
+                       break;
+
+               case OP_PAND:
+                       amd64_sse_pand_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_POR:
+                       amd64_sse_por_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PXOR:
+                       amd64_sse_pxor_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+
+               case OP_PADDB:
+                       amd64_sse_paddb_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PADDW:
+                       amd64_sse_paddw_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PADDD:
+                       amd64_sse_paddd_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PADDQ:
+                       amd64_sse_paddq_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+
+               case OP_PSUBB:
+                       amd64_sse_psubb_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PSUBW:
+                       amd64_sse_psubw_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PSUBD:
+                       amd64_sse_psubd_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PSUBQ:
+                       amd64_sse_psubq_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+
+               case OP_PMAXB_UN:
+                       amd64_sse_pmaxub_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PMAXW_UN:
+                       amd64_sse_pmaxuw_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PMAXD_UN:
+                       amd64_sse_pmaxud_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               
+               case OP_PMAXB:
+                       amd64_sse_pmaxsb_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PMAXW:
+                       amd64_sse_pmaxsw_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PMAXD:
+                       amd64_sse_pmaxsd_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+
+               case OP_PAVGB_UN:
+                       amd64_sse_pavgb_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PAVGW_UN:
+                       amd64_sse_pavgw_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+
+               case OP_PMINB_UN:
+                       amd64_sse_pminub_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PMINW_UN:
+                       amd64_sse_pminuw_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PMIND_UN:
+                       amd64_sse_pminud_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+
+               case OP_PMINB:
+                       amd64_sse_pminsb_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PMINW:
+                       amd64_sse_pminsw_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PMIND:
+                       amd64_sse_pminsd_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+
+               case OP_PCMPEQB:
+                       amd64_sse_pcmpeqb_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PCMPEQW:
+                       amd64_sse_pcmpeqw_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PCMPEQD:
+                       amd64_sse_pcmpeqd_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PCMPEQQ:
+                       amd64_sse_pcmpeqq_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+
+               case OP_PCMPGTB:
+                       amd64_sse_pcmpgtb_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PCMPGTW:
+                       amd64_sse_pcmpgtw_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PCMPGTD:
+                       amd64_sse_pcmpgtd_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PCMPGTQ:
+                       amd64_sse_pcmpgtq_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+
+               case OP_PSUM_ABS_DIFF:
+                       amd64_sse_psadbw_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+
+               case OP_UNPACK_LOWB:
+                       amd64_sse_punpcklbw_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_UNPACK_LOWW:
+                       amd64_sse_punpcklwd_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_UNPACK_LOWD:
+                       amd64_sse_punpckldq_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_UNPACK_LOWQ:
+                       amd64_sse_punpcklqdq_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_UNPACK_LOWPS:
+                       amd64_sse_unpcklps_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_UNPACK_LOWPD:
+                       amd64_sse_unpcklpd_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+
+               case OP_UNPACK_HIGHB:
+                       amd64_sse_punpckhbw_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_UNPACK_HIGHW:
+                       amd64_sse_punpckhwd_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_UNPACK_HIGHD:
+                       amd64_sse_punpckhdq_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_UNPACK_HIGHQ:
+                       amd64_sse_punpckhqdq_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_UNPACK_HIGHPS:
+                       amd64_sse_unpckhps_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_UNPACK_HIGHPD:
+                       amd64_sse_unpckhpd_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+
+               case OP_PACKW:
+                       amd64_sse_packsswb_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PACKD:
+                       amd64_sse_packssdw_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PACKW_UN:
+                       amd64_sse_packuswb_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PACKD_UN:
+                       amd64_sse_packusdw_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+
+               case OP_PADDB_SAT_UN:
+                       amd64_sse_paddusb_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PSUBB_SAT_UN:
+                       amd64_sse_psubusb_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PADDW_SAT_UN:
+                       amd64_sse_paddusw_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PSUBW_SAT_UN:
+                       amd64_sse_psubusw_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+
+               case OP_PADDB_SAT:
+                       amd64_sse_paddsb_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PSUBB_SAT:
+                       amd64_sse_psubsb_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PADDW_SAT:
+                       amd64_sse_paddsw_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PSUBW_SAT:
+                       amd64_sse_psubsw_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+                       
+               case OP_PMULW:
+                       amd64_sse_pmullw_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PMULD:
+                       amd64_sse_pmulld_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PMULQ:
+                       amd64_sse_pmuludq_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PMULW_HIGH_UN:
+                       amd64_sse_pmulhuw_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+               case OP_PMULW_HIGH:
+                       amd64_sse_pmulhw_reg_reg (code, ins->sreg1, ins->sreg2);
+                       break;
+
+               case OP_PSHRW:
+                       amd64_sse_psrlw_reg_imm (code, ins->dreg, ins->inst_imm);
+                       break;
+               case OP_PSHRW_REG:
+                       amd64_sse_psrlw_reg_reg (code, ins->dreg, ins->sreg2);
+                       break;
+
+               case OP_PSARW:
+                       amd64_sse_psraw_reg_imm (code, ins->dreg, ins->inst_imm);
+                       break;
+               case OP_PSARW_REG:
+                       amd64_sse_psraw_reg_reg (code, ins->dreg, ins->sreg2);
+                       break;
+
+               case OP_PSHLW:
+                       amd64_sse_psllw_reg_imm (code, ins->dreg, ins->inst_imm);
+                       break;
+               case OP_PSHLW_REG:
+                       amd64_sse_psllw_reg_reg (code, ins->dreg, ins->sreg2);
+                       break;
+
+               case OP_PSHRD:
+                       amd64_sse_psrld_reg_imm (code, ins->dreg, ins->inst_imm);
+                       break;
+               case OP_PSHRD_REG:
+                       amd64_sse_psrld_reg_reg (code, ins->dreg, ins->sreg2);
+                       break;
+
+               case OP_PSARD:
+                       amd64_sse_psrad_reg_imm (code, ins->dreg, ins->inst_imm);
+                       break;
+               case OP_PSARD_REG:
+                       amd64_sse_psrad_reg_reg (code, ins->dreg, ins->sreg2);
+                       break;
+
+               case OP_PSHLD:
+                       amd64_sse_pslld_reg_imm (code, ins->dreg, ins->inst_imm);
+                       break;
+               case OP_PSHLD_REG:
+                       amd64_sse_pslld_reg_reg (code, ins->dreg, ins->sreg2);
+                       break;
+
+               case OP_PSHRQ:
+                       amd64_sse_psrlq_reg_imm (code, ins->dreg, ins->inst_imm);
+                       break;
+               case OP_PSHRQ_REG:
+                       amd64_sse_psrlq_reg_reg (code, ins->dreg, ins->sreg2);
+                       break;
+               
+               /*TODO: This is appart of the sse spec but not added
+               case OP_PSARQ:
+                       amd64_sse_psraq_reg_imm (code, ins->dreg, ins->inst_imm);
+                       break;
+               case OP_PSARQ_REG:
+                       amd64_sse_psraq_reg_reg (code, ins->dreg, ins->sreg2);
+                       break;  
+               */
+       
+               case OP_PSHLQ:
+                       amd64_sse_psllq_reg_imm (code, ins->dreg, ins->inst_imm);
+                       break;
+               case OP_PSHLQ_REG:
+                       amd64_sse_psllq_reg_reg (code, ins->dreg, ins->sreg2);
+                       break;  
+
+               case OP_ICONV_TO_X:
+                       amd64_movd_xreg_reg_size (code, ins->dreg, ins->sreg1, 4);
+                       break;
+               case OP_EXTRACT_I4:
+                       amd64_movd_reg_xreg_size (code, ins->dreg, ins->sreg1, 4);
+                       break;
+               case OP_EXTRACT_I8:
+                       if (ins->inst_c0) {
+                               amd64_movhlps_reg_reg (code, AMD64_XMM15, ins->sreg1);
+                               amd64_movd_reg_xreg_size (code, ins->dreg, AMD64_XMM15, 8);
+                       } else {
+                               amd64_movd_reg_xreg_size (code, ins->dreg, ins->sreg1, 8);
+                       }
+                       break;
+               case OP_EXTRACT_I1:
+               case OP_EXTRACT_U1:
+                       amd64_movd_reg_xreg_size (code, ins->dreg, ins->sreg1, 4);
+                       if (ins->inst_c0)
+                               amd64_shift_reg_imm (code, X86_SHR, ins->dreg, ins->inst_c0 * 8);
+                       amd64_widen_reg (code, ins->dreg, ins->dreg, ins->opcode == OP_EXTRACT_I1, FALSE);
+                       break;
+               case OP_EXTRACT_I2:
+               case OP_EXTRACT_U2:
+                       /*amd64_movd_reg_xreg_size (code, ins->dreg, ins->sreg1, 4);
+                       if (ins->inst_c0)
+                               amd64_shift_reg_imm_size (code, X86_SHR, ins->dreg, 16, 4);*/
+                       amd64_sse_pextrw_reg_reg_imm (code, ins->dreg, ins->sreg1, ins->inst_c0);
+                       amd64_widen_reg_size (code, ins->dreg, ins->dreg, ins->opcode == OP_EXTRACT_I2, TRUE, 4);
+                       break;
+               case OP_EXTRACT_R8:
+                       if (ins->inst_c0)
+                               amd64_movhlps_reg_reg (code, ins->dreg, ins->sreg1);
+                       else
+                               amd64_sse_movsd_reg_reg (code, ins->dreg, ins->sreg1);
+                       break;
+               case OP_INSERT_I2:
+                       amd64_sse_pinsrw_reg_reg_imm (code, ins->sreg1, ins->sreg2, ins->inst_c0);
+                       break;
+               case OP_EXTRACTX_U2:
+                       amd64_sse_pextrw_reg_reg_imm (code, ins->dreg, ins->sreg1, ins->inst_c0);
+                       break;
+               case OP_INSERTX_U1_SLOW:
+                       /* sreg1 is the extracted ireg (scratch)
+                        * sreg2 is the ireg to be inserted (scratch)
+                        * dreg is the xreg to receive the value */
+
+                       /*clear the bits from the extracted word*/
+                       amd64_alu_reg_imm (code, X86_AND, ins->sreg1, ins->inst_c0 & 1 ? 0x00FF : 0xFF00);
+                       /*shift the value to insert if needed*/
+                       if (ins->inst_c0 & 1)
+                               amd64_shift_reg_imm_size (code, X86_SHL, ins->sreg2, 8, 4);
+                       /*join them together*/
+                       amd64_alu_reg_reg (code, X86_OR, ins->sreg1, ins->sreg2);
+                       amd64_sse_pinsrw_reg_reg_imm (code, ins->dreg, ins->sreg1, ins->inst_c0 / 2);
+                       break;
+               case OP_INSERTX_I4_SLOW:
+                       amd64_sse_pinsrw_reg_reg_imm (code, ins->dreg, ins->sreg2, ins->inst_c0 * 2);
+                       amd64_shift_reg_imm (code, X86_SHR, ins->sreg2, 16);
+                       amd64_sse_pinsrw_reg_reg_imm (code, ins->dreg, ins->sreg2, ins->inst_c0 * 2 + 1);
+                       break;
+               case OP_INSERTX_I8_SLOW:
+                       amd64_movd_xreg_reg_size(code, AMD64_XMM15, ins->sreg2, 8);
+                       if (ins->inst_c0)
+                               amd64_movlhps_reg_reg (code, ins->dreg, AMD64_XMM15);
+                       else
+                               amd64_sse_movsd_reg_reg (code, ins->dreg, AMD64_XMM15);
+                       break;
+
+               case OP_INSERTX_R4_SLOW:
+                       switch (ins->inst_c0) {
+                       case 0:
+                               amd64_sse_cvtsd2ss_reg_reg (code, ins->dreg, ins->sreg2);
+                               break;
+                       case 1:
+                               amd64_sse_pshufd_reg_reg_imm (code, ins->dreg, ins->dreg, mono_simd_shuffle_mask(1, 0, 2, 3));
+                               amd64_sse_cvtsd2ss_reg_reg (code, ins->dreg, ins->sreg2);
+                               amd64_sse_pshufd_reg_reg_imm (code, ins->dreg, ins->dreg, mono_simd_shuffle_mask(1, 0, 2, 3));
+                               break;
+                       case 2:
+                               amd64_sse_pshufd_reg_reg_imm (code, ins->dreg, ins->dreg, mono_simd_shuffle_mask(2, 1, 0, 3));
+                               amd64_sse_cvtsd2ss_reg_reg (code, ins->dreg, ins->sreg2);
+                               amd64_sse_pshufd_reg_reg_imm (code, ins->dreg, ins->dreg, mono_simd_shuffle_mask(2, 1, 0, 3));
+                               break;
+                       case 3:
+                               amd64_sse_pshufd_reg_reg_imm (code, ins->dreg, ins->dreg, mono_simd_shuffle_mask(3, 1, 2, 0));
+                               amd64_sse_cvtsd2ss_reg_reg (code, ins->dreg, ins->sreg2);
+                               amd64_sse_pshufd_reg_reg_imm (code, ins->dreg, ins->dreg, mono_simd_shuffle_mask(3, 1, 2, 0));
+                               break;
+                       }
+                       break;
+               case OP_INSERTX_R8_SLOW:
+                       if (ins->inst_c0)
+                               amd64_movlhps_reg_reg (code, ins->dreg, ins->sreg2);
+                       else
+                               amd64_sse_movsd_reg_reg (code, ins->dreg, ins->sreg2);
+                       break;
+               case OP_STOREX_MEMBASE_REG:
+               case OP_STOREX_MEMBASE:
+                       amd64_sse_movups_membase_reg (code, ins->dreg, ins->inst_offset, ins->sreg1);
+                       break;
+               case OP_LOADX_MEMBASE:
+                       amd64_sse_movups_reg_membase (code, ins->dreg, ins->sreg1, ins->inst_offset);
+                       break;
+               case OP_LOADX_ALIGNED_MEMBASE:
+                       amd64_sse_movaps_reg_membase (code, ins->dreg, ins->sreg1, ins->inst_offset);
+                       break;
+               case OP_STOREX_ALIGNED_MEMBASE_REG:
+                       amd64_sse_movaps_membase_reg (code, ins->dreg, ins->inst_offset, ins->sreg1);
+                       break;
+               case OP_STOREX_NTA_MEMBASE_REG:
+                       amd64_sse_movntps_reg_membase (code, ins->dreg, ins->sreg1, ins->inst_offset);
+                       break;
+               case OP_PREFETCH_MEMBASE:
+                       amd64_sse_prefetch_reg_membase (code, ins->backend.arg_info, ins->sreg1, ins->inst_offset);
+                       break;
+
+               case OP_XMOVE:
+                       /*FIXME the peephole pass should have killed this*/
+                       if (ins->dreg != ins->sreg1)
+                               amd64_sse_movaps_reg_reg (code, ins->dreg, ins->sreg1);
+                       break;          
+               case OP_XZERO:
+                       amd64_sse_pxor_reg_reg (code, ins->dreg, ins->dreg);
+                       break;
+               case OP_ICONV_TO_R8_RAW:
+                       amd64_movd_xreg_reg_size (code, ins->dreg, ins->sreg1, 4);
+                       amd64_sse_cvtss2sd_reg_reg (code, ins->dreg, ins->dreg);
+                       break;
+
+               case OP_FCONV_TO_R8_X:
+                       amd64_sse_movsd_reg_reg (code, ins->dreg, ins->sreg1);
+                       break;
+
+               case OP_XCONV_R8_TO_I4:
+                       amd64_sse_cvttsd2si_reg_xreg_size (code, ins->dreg, ins->sreg1, 4);
+                       switch (ins->backend.source_opcode) {
+                       case OP_FCONV_TO_I1:
+                               amd64_widen_reg (code, ins->dreg, ins->dreg, TRUE, FALSE);
+                               break;
+                       case OP_FCONV_TO_U1:
+                               amd64_widen_reg (code, ins->dreg, ins->dreg, FALSE, FALSE);
+                               break;
+                       case OP_FCONV_TO_I2:
+                               amd64_widen_reg (code, ins->dreg, ins->dreg, TRUE, TRUE);
+                               break;
+                       case OP_FCONV_TO_U2:
+                               amd64_widen_reg (code, ins->dreg, ins->dreg, FALSE, TRUE);
+                               break;
+                       }                       
+                       break;
+
+               case OP_EXPAND_I2:
+                       amd64_sse_pinsrw_reg_reg_imm (code, ins->dreg, ins->sreg1, 0);
+                       amd64_sse_pinsrw_reg_reg_imm (code, ins->dreg, ins->sreg1, 1);
+                       amd64_sse_pshufd_reg_reg_imm (code, ins->dreg, ins->dreg, 0);
+                       break;
+               case OP_EXPAND_I4:
+                       amd64_movd_xreg_reg_size (code, ins->dreg, ins->sreg1, 4);
+                       amd64_sse_pshufd_reg_reg_imm (code, ins->dreg, ins->dreg, 0);
+                       break;
+               case OP_EXPAND_I8:
+                       amd64_movd_xreg_reg_size (code, ins->dreg, ins->sreg1, 8);
+                       amd64_sse_pshufd_reg_reg_imm (code, ins->dreg, ins->dreg, 0x44);
+                       break;
+               case OP_EXPAND_R4:
+                       amd64_sse_movsd_reg_reg (code, ins->dreg, ins->sreg1);
+                       amd64_sse_cvtsd2ss_reg_reg (code, ins->dreg, ins->dreg);
+                       amd64_sse_pshufd_reg_reg_imm (code, ins->dreg, ins->dreg, 0);
+                       break;
+               case OP_EXPAND_R8:
+                       amd64_sse_movsd_reg_reg (code, ins->dreg, ins->sreg1);
+                       amd64_sse_pshufd_reg_reg_imm (code, ins->dreg, ins->dreg, 0x44);
+                       break;
+#endif
+               case OP_LIVERANGE_START: {
+                       if (cfg->verbose_level > 1)
+                               printf ("R%d START=0x%x\n", MONO_VARINFO (cfg, ins->inst_c0)->vreg, (int)(code - cfg->native_code));
+                       MONO_VARINFO (cfg, ins->inst_c0)->live_range_start = code - cfg->native_code;
+                       break;
+               }
+               case OP_LIVERANGE_END: {
+                       if (cfg->verbose_level > 1)
+                               printf ("R%d END=0x%x\n", MONO_VARINFO (cfg, ins->inst_c0)->vreg, (int)(code - cfg->native_code));
+                       MONO_VARINFO (cfg, ins->inst_c0)->live_range_end = code - cfg->native_code;
+                       break;
+               }
+               default:
+                       g_warning ("unknown opcode %s in %s()\n", mono_inst_name (ins->opcode), __FUNCTION__);
+                       g_assert_not_reached ();
+               }
+
+               if ((code - cfg->native_code - offset) > max_len) {
+                       g_warning ("wrong maximal instruction length of instruction %s (expected %d, got %ld)",
+                                  mono_inst_name (ins->opcode), max_len, code - cfg->native_code - offset);
+                       g_assert_not_reached ();
+               }
+              
+               last_ins = ins;
+               last_offset = offset;
+       }
+
+       cfg->code_len = code - cfg->native_code;
+}
+
+#endif /* DISABLE_JIT */
+
+void
+mono_arch_register_lowlevel_calls (void)
+{
+       /* Register mono_amd64_throw_exception as a JIT icall; the signature doesn't matter, so a plain "void" one is passed */
+       mono_register_jit_icall (mono_amd64_throw_exception, "mono_amd64_throw_exception", mono_create_icall_signature ("void"), TRUE);
+}
+
+void
+mono_arch_patch_code (MonoMethod *method, MonoDomain *domain, guint8 *code, MonoJumpInfo *ji, gboolean run_cctors)
+{
+       MonoJumpInfo *patch_info;
+       gboolean compile_aot = !run_cctors;
+
+       for (patch_info = ji; patch_info; patch_info = patch_info->next) {
+               unsigned char *ip = patch_info->ip.i + code;
+               unsigned char *target;
+
+               target = mono_resolve_patch_target (method, domain, code, patch_info, run_cctors);
+
+               if (compile_aot) {
+                       switch (patch_info->type) {
+                       case MONO_PATCH_INFO_BB:
+                       case MONO_PATCH_INFO_LABEL:
+                               break;
+                       default:
+                               /* No need to patch these */
+                               continue;
+                       }
+               }
+
+               switch (patch_info->type) {
+               case MONO_PATCH_INFO_NONE:
+                       continue;
+               case MONO_PATCH_INFO_METHOD_REL:
+               case MONO_PATCH_INFO_R8:
+               case MONO_PATCH_INFO_R4:
+                       g_assert_not_reached ();
+                       continue;
+               case MONO_PATCH_INFO_BB:
+                       break;
+               default:
+                       break;
+               }
+
+               /* 
+                * Debug code to help track down problems where the target of a near call
                 * is not valid.
                 */
                if (amd64_is_near_call (ip)) {
@@ -4783,7 +5229,7 @@ mono_arch_emit_prolog (MonoCompile *cfg)
        MonoBasicBlock *bb;
        MonoMethodSignature *sig;
        MonoInst *ins;
-       int alloc_size, pos, max_offset, i, quad, max_epilog_size;
+       int alloc_size, pos, i, cfa_offset, quad, max_epilog_size;
        guint8 *code;
        CallInfo *cinfo;
        gint32 lmf_offset = cfg->arch.lmf_offset;
@@ -4800,6 +5246,9 @@ mono_arch_emit_prolog (MonoCompile *cfg)
        /* Amount of stack space allocated by register saving code */
        pos = 0;
 
+       /* Offset between RSP and the CFA */
+       cfa_offset = 0;
+
        /* 
         * The prolog consists of the following parts:
         * FP present:
@@ -4815,16 +5264,25 @@ mono_arch_emit_prolog (MonoCompile *cfg)
         * - save callee saved regs using moves
         */
 
+       // CFA = sp + 8
+       cfa_offset = 8;
+       mono_emit_unwind_op_def_cfa (cfg, code, AMD64_RSP, 8);
+       // IP saved at CFA - 8
+       mono_emit_unwind_op_offset (cfg, code, AMD64_RIP, -cfa_offset);
        async_exc_point (code);
 
        if (!cfg->arch.omit_fp) {
                amd64_push_reg (code, AMD64_RBP);
+               cfa_offset += 8;
+               mono_emit_unwind_op_def_cfa_offset (cfg, code, cfa_offset);
+               mono_emit_unwind_op_offset (cfg, code, AMD64_RBP, - cfa_offset);
                async_exc_point (code);
 #ifdef PLATFORM_WIN32
                mono_arch_unwindinfo_add_push_nonvol (&cfg->arch.unwindinfo, cfg->native_code, code, AMD64_RBP);
 #endif
                
                amd64_mov_reg_reg (code, AMD64_RBP, AMD64_RSP, sizeof (gpointer));
+               mono_emit_unwind_op_def_cfa_reg (cfg, code, AMD64_RBP);
                async_exc_point (code);
 #ifdef PLATFORM_WIN32
                mono_arch_unwindinfo_add_set_fpreg (&cfg->arch.unwindinfo, cfg->native_code, code, AMD64_RBP);
@@ -4833,14 +5291,27 @@ mono_arch_emit_prolog (MonoCompile *cfg)
 
        /* Save callee saved registers */
        if (!cfg->arch.omit_fp && !method->save_lmf) {
+               int offset = cfa_offset;
+
                for (i = 0; i < AMD64_NREG; ++i)
                        if (AMD64_IS_CALLEE_SAVED_REG (i) && (cfg->used_int_regs & (1 << i))) {
                                amd64_push_reg (code, i);
                                pos += sizeof (gpointer);
+                               offset += 8;
+                               mono_emit_unwind_op_offset (cfg, code, i, - offset);
                                async_exc_point (code);
                        }
        }
 
+       /* The param area is always at offset 0 from sp */
+       /* This needs to be allocated here, since it has to come after the spill area */
+       if (cfg->arch.no_pushes && cfg->param_area) {
+               if (cfg->arch.omit_fp)
+                       // FIXME:
+                       g_assert_not_reached ();
+               cfg->stack_offset += ALIGN_TO (cfg->param_area, sizeof (gpointer));
+       }
+
        if (cfg->arch.omit_fp) {
                /* 
                 * On enter, the stack is misaligned by the the pushing of the return
@@ -4865,6 +5336,10 @@ mono_arch_emit_prolog (MonoCompile *cfg)
                guint32 remaining_size = alloc_size;
                while (remaining_size >= 0x1000) {
                        amd64_alu_reg_imm (code, X86_SUB, AMD64_RSP, 0x1000);
+                       if (cfg->arch.omit_fp) {
+                               cfa_offset += 0x1000;
+                               mono_emit_unwind_op_def_cfa_offset (cfg, code, cfa_offset);
+                       }
                        async_exc_point (code);
 #ifdef PLATFORM_WIN32
                        if (cfg->arch.omit_fp) 
@@ -4876,7 +5351,11 @@ mono_arch_emit_prolog (MonoCompile *cfg)
                }
                if (remaining_size) {
                        amd64_alu_reg_imm (code, X86_SUB, AMD64_RSP, remaining_size);
-                       async_exc_point (code);
+                       if (cfg->arch.omit_fp) {
+                               cfa_offset += remaining_size;
+                               mono_emit_unwind_op_def_cfa_offset (cfg, code, cfa_offset);
+                               async_exc_point (code);
+                       }
 #ifdef PLATFORM_WIN32
                        if (cfg->arch.omit_fp) 
                                mono_arch_unwindinfo_add_alloc_stack (&cfg->arch.unwindinfo, cfg->native_code, code, remaining_size);
@@ -4884,7 +5363,11 @@ mono_arch_emit_prolog (MonoCompile *cfg)
                }
 #else
                amd64_alu_reg_imm (code, X86_SUB, AMD64_RSP, alloc_size);
-               async_exc_point (code);
+               if (cfg->arch.omit_fp) {
+                       cfa_offset += alloc_size;
+                       mono_emit_unwind_op_def_cfa_offset (cfg, code, cfa_offset);
+                       async_exc_point (code);
+               }
 #endif
        }
 
@@ -4907,12 +5390,31 @@ mono_arch_emit_prolog (MonoCompile *cfg)
                /* sp is saved right before calls */
                /* Skip method (only needed for trampoline LMF frames) */
                /* Save callee saved regs */
-               amd64_mov_membase_reg (code, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, rbx), AMD64_RBX, 8);
-               amd64_mov_membase_reg (code, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, rbp), AMD64_RBP, 8);
-               amd64_mov_membase_reg (code, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, r12), AMD64_R12, 8);
-               amd64_mov_membase_reg (code, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, r13), AMD64_R13, 8);
-               amd64_mov_membase_reg (code, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, r14), AMD64_R14, 8);
-               amd64_mov_membase_reg (code, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, r15), AMD64_R15, 8);
+               for (i = 0; i < MONO_MAX_IREGS; ++i) {
+                       int offset;
+
+                       switch (i) {
+                       case AMD64_RBX: offset = G_STRUCT_OFFSET (MonoLMF, rbx); break;
+                       case AMD64_RBP: offset = G_STRUCT_OFFSET (MonoLMF, rbp); break;
+                       case AMD64_R12: offset = G_STRUCT_OFFSET (MonoLMF, r12); break;
+                       case AMD64_R13: offset = G_STRUCT_OFFSET (MonoLMF, r13); break;
+                       case AMD64_R14: offset = G_STRUCT_OFFSET (MonoLMF, r14); break;
+                       case AMD64_R15: offset = G_STRUCT_OFFSET (MonoLMF, r15); break;
+#ifdef PLATFORM_WIN32
+                       case AMD64_RDI: offset = G_STRUCT_OFFSET (MonoLMF, rdi); break;
+                       case AMD64_RSI: offset = G_STRUCT_OFFSET (MonoLMF, rsi); break;
+#endif
+                       default:
+                               offset = -1;
+                               break;
+                       }
+
+                       if (offset != -1) {
+                               amd64_mov_membase_reg (code, cfg->frame_reg, lmf_offset + offset, i, 8);
+                               if (cfg->arch.omit_fp || (i != AMD64_RBP))
+                                       mono_emit_unwind_op_offset (cfg, code, i, - (cfa_offset - (lmf_offset + offset)));
+                       }
+               }
        }
 
        /* Save callee saved registers */
@@ -4925,6 +5427,7 @@ mono_arch_emit_prolog (MonoCompile *cfg)
                for (i = 0; i < AMD64_NREG; ++i)
                        if (AMD64_IS_CALLEE_SAVED_REG (i) && (cfg->used_int_regs & (1 << i))) {
                                amd64_mov_membase_reg (code, AMD64_RSP, save_area_offset, i, 8);
+                               mono_emit_unwind_op_offset (cfg, code, i, - (cfa_offset - save_area_offset));
                                save_area_offset += 8;
                                async_exc_point (code);
                        }
@@ -4938,30 +5441,28 @@ mono_arch_emit_prolog (MonoCompile *cfg)
                amd64_mov_membase_reg (code, cfg->rgctx_var->inst_basereg, cfg->rgctx_var->inst_offset, MONO_ARCH_RGCTX_REG, 8);
        }
 
-       /* compute max_offset in order to use short forward jumps */
-       max_offset = 0;
+       /* compute max_length in order to use short forward jumps */
        max_epilog_size = get_max_epilog_size (cfg);
        if (cfg->opt & MONO_OPT_BRANCH) {
                for (bb = cfg->bb_entry; bb; bb = bb->next_bb) {
                        MonoInst *ins;
-                       bb->max_offset = max_offset;
+                       int max_length = 0;
 
                        if (cfg->prof_options & MONO_PROFILE_COVERAGE)
-                               max_offset += 6;
+                               max_length += 6;
                        /* max alignment for loops */
                        if ((cfg->opt & MONO_OPT_LOOP) && bb_is_loop_start (bb))
-                               max_offset += LOOP_ALIGNMENT;
+                               max_length += LOOP_ALIGNMENT;
 
                        MONO_BB_FOR_EACH_INS (bb, ins) {
-                               if (ins->opcode == OP_LABEL)
-                                       ins->inst_c1 = max_offset;
-                               
-                               max_offset += ((guint8 *)ins_get_spec (ins->opcode))[MONO_INST_LEN];
+                               max_length += ((guint8 *)ins_get_spec (ins->opcode))[MONO_INST_LEN];
                        }
 
-                       if (mono_jit_trace_calls && bb == cfg->bb_exit)
-                               /* The tracing code can be quite large */
-                               max_offset += max_epilog_size;
+                       /* Take prolog and epilog instrumentation into account */
+                       if (bb == cfg->bb_entry || bb == cfg->bb_exit)
+                               max_length += max_epilog_size;
+                       
+                       bb->max_length = max_length;
                }
        }
 
@@ -5107,15 +5608,20 @@ mono_arch_emit_prolog (MonoCompile *cfg)
                if (appdomain_tls_offset != -1 && lmf_tls_offset != -1) {
                        guint8 *buf, *no_domain_branch;
 
-                       code = emit_tls_get (code, AMD64_RAX, appdomain_tls_offset);
-                       if ((domain >> 32) == 0)
-                               amd64_mov_reg_imm_size (code, AMD64_ARG_REG1, domain, 4);
-                       else
-                               amd64_mov_reg_imm_size (code, AMD64_ARG_REG1, domain, 8);
+                       code = mono_amd64_emit_tls_get (code, AMD64_RAX, appdomain_tls_offset);
+                       if (cfg->compile_aot) {
+                               /* AOT code is only used in the root domain */
+                               amd64_mov_reg_imm (code, AMD64_ARG_REG1, 0);
+                       } else {
+                               if ((domain >> 32) == 0)
+                                       amd64_mov_reg_imm_size (code, AMD64_ARG_REG1, domain, 4);
+                               else
+                                       amd64_mov_reg_imm_size (code, AMD64_ARG_REG1, domain, 8);
+                       }
                        amd64_alu_reg_reg (code, X86_CMP, AMD64_RAX, AMD64_ARG_REG1);
                        no_domain_branch = code;
                        x86_branch8 (code, X86_CC_NE, 0, 0);
-                       code = emit_tls_get ( code, AMD64_RAX, lmf_addr_tls_offset);
+                       code = mono_amd64_emit_tls_get ( code, AMD64_RAX, lmf_addr_tls_offset);
                        amd64_test_reg_reg (code, AMD64_RAX, AMD64_RAX);
                        buf = code;
                        x86_branch8 (code, X86_CC_NE, 0, 0);
@@ -5130,10 +5636,15 @@ mono_arch_emit_prolog (MonoCompile *cfg)
 #endif
                } else {
                        g_assert (!cfg->compile_aot);
-                       if ((domain >> 32) == 0)
-                               amd64_mov_reg_imm_size (code, AMD64_ARG_REG1, domain, 4);
-                       else
-                               amd64_mov_reg_imm_size (code, AMD64_ARG_REG1, domain, 8);
+                       if (cfg->compile_aot) {
+                               /* AOT code is only used in the root domain */
+                               amd64_mov_reg_imm (code, AMD64_ARG_REG1, 0);
+                       } else {
+                               if ((domain >> 32) == 0)
+                                       amd64_mov_reg_imm_size (code, AMD64_ARG_REG1, domain, 4);
+                               else
+                                       amd64_mov_reg_imm_size (code, AMD64_ARG_REG1, domain, 8);
+                       }
                        code = emit_call (cfg, code, MONO_PATCH_INFO_INTERNAL_METHOD,
                                          (gpointer)"mono_jit_thread_attach", TRUE);
                }
@@ -5142,8 +5653,8 @@ mono_arch_emit_prolog (MonoCompile *cfg)
        if (method->save_lmf) {
                if ((lmf_tls_offset != -1) && !optimize_for_xen) {
                        /*
-                        * Optimized version which uses the mono_lmf TLS variable instead of indirection
-                        * through the mono_lmf_addr TLS variable.
+                        * Optimized version which uses the mono_lmf TLS variable instead of 
+                        * indirection through the mono_lmf_addr TLS variable.
                         */
                        /* %rax = previous_lmf */
                        x86_prefix (code, X86_FS_PREFIX);
@@ -5163,7 +5674,7 @@ mono_arch_emit_prolog (MonoCompile *cfg)
                } else {
                        if (lmf_addr_tls_offset != -1) {
                                /* Load lmf quicky using the FS register */
-                               code = emit_tls_get (code, AMD64_RAX, lmf_addr_tls_offset);
+                               code = mono_amd64_emit_tls_get (code, AMD64_RAX, lmf_addr_tls_offset);
 #ifdef PLATFORM_WIN32
                                /* The TLS key actually contains a pointer to the MonoJitTlsData structure */
                                /* FIXME: Add a separate key for LMF to avoid this */
@@ -5300,6 +5811,23 @@ mono_arch_emit_epilog (MonoCompile *cfg)
        pos = 0;
        
        if (method->save_lmf) {
+               /* check if we need to restore protection of the stack after a stack overflow */
+               if (mono_get_jit_tls_offset () != -1) {
+                       guint8 *patch;
+                       code = mono_amd64_emit_tls_get (code, X86_ECX, mono_get_jit_tls_offset ());
+                       /* we load the value in a separate instruction: this mechanism may be
+                        * used later as a safer way to do thread interruption
+                        */
+                       amd64_mov_reg_membase (code, X86_ECX, X86_ECX, G_STRUCT_OFFSET (MonoJitTlsData, restore_stack_prot), 8);
+                       x86_alu_reg_imm (code, X86_CMP, X86_ECX, 0);
+                       patch = code;
+                       x86_branch8 (code, X86_CC_Z, 0, FALSE);
+                       /* note that the call trampoline will preserve eax/edx */
+                       x86_call_reg (code, X86_ECX);
+                       x86_patch (patch, code);
+               } else {
+                       /* FIXME: maybe save the jit tls in the prolog */
+               }
                if ((lmf_tls_offset != -1) && !optimize_for_xen) {
                        /*
                         * Optimized version which uses the mono_lmf TLS variable instead of indirection
@@ -5335,6 +5863,14 @@ mono_arch_emit_epilog (MonoCompile *cfg)
                if (cfg->used_int_regs & (1 << AMD64_R15)) {
                        amd64_mov_reg_membase (code, AMD64_R15, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, r15), 8);
                }
+#ifdef PLATFORM_WIN32
+               if (cfg->used_int_regs & (1 << AMD64_RDI)) {
+                       amd64_mov_reg_membase (code, AMD64_RDI, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, rdi), 8);
+               }
+               if (cfg->used_int_regs & (1 << AMD64_RSI)) {
+                       amd64_mov_reg_membase (code, AMD64_RSI, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, rsi), 8);
+               }
+#endif
        } else {
 
                if (cfg->arch.omit_fp) {
@@ -5409,15 +5945,6 @@ mono_arch_emit_epilog (MonoCompile *cfg)
        cfg->code_len = code - cfg->native_code;
 
        g_assert (cfg->code_len < cfg->code_size);
-
-       if (cfg->arch.omit_fp) {
-               /* 
-                * Encode the stack size into used_int_regs so the exception handler
-                * can access it.
-                */
-               g_assert (cfg->arch.stack_alloc_size < (1 << 16));
-               cfg->used_int_regs |= (1 << 31) | (cfg->arch.stack_alloc_size << 16);
-       }
 }
 
 void
@@ -5613,12 +6140,12 @@ enum {
 };
 
 void*
-mono_arch_instrument_epilog (MonoCompile *cfg, void *func, void *p, gboolean enable_arguments)
+mono_arch_instrument_epilog_full (MonoCompile *cfg, void *func, void *p, gboolean enable_arguments, gboolean preserve_argument_registers)
 {
        guchar *code = p;
        int save_mode = SAVE_NONE;
        MonoMethod *method = cfg->method;
-       int rtype = mono_type_get_underlying_type (mono_method_signature (method)->ret)->type;
+       int rtype = mini_type_get_underlying_type (NULL, mono_method_signature (method)->ret)->type;
        
        switch (rtype) {
        case MONO_TYPE_VOID:
@@ -5686,10 +6213,20 @@ mono_arch_instrument_epilog (MonoCompile *cfg, void *func, void *p, gboolean ena
        else
                amd64_mov_reg_imm (code, AMD64_RAX, 0);
 
+       if (preserve_argument_registers) {
+               amd64_push_reg (code, MONO_AMD64_ARG_REG1);
+               amd64_push_reg (code, MONO_AMD64_ARG_REG2);
+       }
+
        mono_add_patch_info (cfg, code-cfg->native_code, MONO_PATCH_INFO_METHODCONST, method);
        amd64_set_reg_template (code, AMD64_ARG_REG1);
        code = emit_call (cfg, code, MONO_PATCH_INFO_ABS, (gpointer)func, TRUE);
 
+       if (preserve_argument_registers) {
+               amd64_pop_reg (code, MONO_AMD64_ARG_REG2);
+               amd64_pop_reg (code, MONO_AMD64_ARG_REG1);
+       }
+
        /* Restore result */
        switch (save_mode) {
        case SAVE_EAX:
@@ -5856,7 +6393,7 @@ mono_breakpoint_clean_code (guint8 *method_start, guint8 *code, int offset, guin
 }
 
 gpointer
-mono_arch_get_vcall_slot (guint8 *code, gpointer *regs, int *displacement)
+mono_arch_get_vcall_slot (guint8 *code, mgreg_t *regs, int *displacement)
 {
        guint8 buf [10];
        guint32 reg;
@@ -5868,56 +6405,30 @@ mono_arch_get_vcall_slot (guint8 *code, gpointer *regs, int *displacement)
 
        *displacement = 0;
 
-       /* go to the start of the call instruction
-        *
-        * address_byte = (m << 6) | (o << 3) | reg
-        * call opcode: 0xff address_byte displacement
-        * 0xff m=1,o=2 imm8
-        * 0xff m=2,o=2 imm32
-        */
        code -= 7;
 
        /* 
         * A given byte sequence can match more than case here, so we have to be
         * really careful about the ordering of the cases. Longer sequences
         * come first.
+        * There are two types of calls:
+        * - direct calls: 0xff address_byte 8/32 bits displacement
+        * - indirect calls: nop nop nop <call>
+        * The nops make sure we don't confuse the instruction preceding an indirect
+        * call with a direct call.
         */
-#ifdef MONO_ARCH_HAVE_IMT
-       if ((code [-2] == 0x41) && (code [-1] == 0xbb) && (code [4] == 0xff) && (x86_modrm_mod (code [5]) == 1) && (x86_modrm_reg (code [5]) == 2) && ((signed char)code [6] < 0)) {
-               /* IMT-based interface calls: with MONO_ARCH_IMT_REG == r11
-                * 41 bb 14 f8 28 08       mov    $0x828f814,%r11d
-                * ff 50 fc                call   *0xfffffffc(%rax)
-                */
-               reg = amd64_modrm_rm (code [5]);
-               disp = (signed char)code [6];
-               /* R10 is clobbered by the IMT thunk code */
-               g_assert (reg != AMD64_R10);
-       }
-#else
-       if (0) {
-       }
-#endif
-       else if ((code [-1] == 0x8b) && (amd64_modrm_mod (code [0]) == 0x2) && (code [5] == 0xff) && (amd64_modrm_reg (code [6]) == 0x2) && (amd64_modrm_mod (code [6]) == 0x0)) {
-                       /*
-                        * This is a interface call
-                        * 48 8b 80 f0 e8 ff ff   mov    0xffffffffffffe8f0(%rax),%rax
-                        * ff 10                  callq  *(%rax)
-                        */
-               if (IS_REX (code [4]))
-                       rex = code [4];
-               reg = amd64_modrm_rm (code [6]);
-               disp = 0;
-               /* R10 is clobbered by the IMT thunk code */
-               g_assert (reg != AMD64_R10);
-       } else if ((code [0] == 0x41) && (code [1] == 0xff) && (code [2] == 0x15)) {
+       if ((code [0] == 0x41) && (code [1] == 0xff) && (code [2] == 0x15)) {
                /* call OFFSET(%rip) */
                disp = *(guint32*)(code + 3);
                return (gpointer*)(code + disp + 7);
-       } else if ((code [0] == 0xff) && (amd64_modrm_reg (code [1]) == 0x2) && (amd64_modrm_mod (code [1]) == 0x2) && (amd64_modrm_reg (code [2]) == X86_ESP) && (amd64_modrm_mod (code [2]) == 0) && (amd64_modrm_rm (code [2]) == X86_ESP)) {
-               /* call *[r12+disp32] */
-               if (IS_REX (code [-1]))
+       } else if ((code [0] == 0xff) && (amd64_modrm_reg (code [1]) == 0x2) && (amd64_modrm_mod (code [1]) == 0x2) && (amd64_sib_index (code [2]) == 4) && (amd64_sib_scale (code [2]) == 0)) {
+               /* call *[reg+disp32] using indexed addressing */
+               /* The LLVM JIT emits this, and we emit it too for %r12 */
+               if (IS_REX (code [-1])) {
                        rex = code [-1];
-               reg = AMD64_RSP;
+                       g_assert (amd64_rex_x (rex) == 0);
+               }                       
+               reg = amd64_sib_base (code [2]);
                disp = *(gint32*)(code + 3);
        } else if ((code [1] == 0xff) && (amd64_modrm_reg (code [2]) == 0x2) && (amd64_modrm_mod (code [2]) == 0x2)) {
                /* call *[reg+disp32] */
@@ -5930,11 +6441,11 @@ mono_arch_get_vcall_slot (guint8 *code, gpointer *regs, int *displacement)
        } else if (code [2] == 0xe8) {
                /* call <ADDR> */
                return NULL;
-       } else if ((code [3] == 0xff) && (amd64_modrm_reg (code [4]) == 0x2) && (amd64_modrm_mod (code [4]) == 0x1) && (amd64_modrm_reg (code [5]) == X86_ESP) && (amd64_modrm_mod (code [5]) == 0) && (amd64_modrm_rm (code [5]) == X86_ESP)) {
-               /* call *[r12+disp32] */
+       } else if ((code [3] == 0xff) && (amd64_modrm_reg (code [4]) == 0x2) && (amd64_modrm_mod (code [4]) == 0x1) && (amd64_sib_index (code [5]) == 4) && (amd64_sib_scale (code [5]) == 0)) {
+               /* call *[reg+disp8] using indexed addressing */
                if (IS_REX (code [2]))
                        rex = code [2];
-               reg = AMD64_RSP;
+               reg = amd64_sib_base (code [5]);
                disp = *(gint8*)(code + 6);
        } else if (IS_REX (code [4]) && (code [5] == 0xff) && (amd64_modrm_reg (code [6]) == 0x2) && (amd64_modrm_mod (code [6]) == 0x3)) {
                /* call *%reg */
@@ -5948,11 +6459,7 @@ mono_arch_get_vcall_slot (guint8 *code, gpointer *regs, int *displacement)
                //printf ("B: [%%r%d+0x%x]\n", reg, disp);
        }
        else if ((code [5] == 0xff) && (amd64_modrm_reg (code [6]) == 0x2) && (amd64_modrm_mod (code [6]) == 0x0)) {
-                       /*
-                        * This is a interface call: should check the above code can't catch it earlier 
-                        * 8b 40 30   mov    0x30(%eax),%eax
-                        * ff 10      call   *(%eax)
-                        */
+               /* call *(%reg) */
                if (IS_REX (code [4]))
                        rex = code [4];
                reg = amd64_modrm_rm (code [6]);
@@ -5967,18 +6474,7 @@ mono_arch_get_vcall_slot (guint8 *code, gpointer *regs, int *displacement)
        g_assert (reg != AMD64_R11);
 
        *displacement = disp;
-       return regs [reg];
-}
-
-gpointer*
-mono_arch_get_vcall_slot_addr (guint8* code, gpointer *regs)
-{
-       gpointer vt;
-       int displacement;
-       vt = mono_arch_get_vcall_slot (code, regs, &displacement);
-       if (!vt)
-               return NULL;
-       return (gpointer*)((char*)vt + displacement);
+       return (gpointer)regs [reg];
 }
 
 int
@@ -6003,13 +6499,85 @@ mono_arch_get_this_arg_reg (MonoMethodSignature *sig, MonoGenericSharingContext
 }
 
 gpointer
-mono_arch_get_this_arg_from_call (MonoGenericSharingContext *gsctx, MonoMethodSignature *sig, gssize *regs, guint8 *code)
+mono_arch_get_this_arg_from_call (MonoGenericSharingContext *gsctx, MonoMethodSignature *sig, mgreg_t *regs, guint8 *code)
 {
        return (gpointer)regs [mono_arch_get_this_arg_reg (sig, gsctx, code)];
 }
 
 #define MAX_ARCH_DELEGATE_PARAMS 10
 
+static gpointer
+get_delegate_invoke_impl (gboolean has_target, guint32 param_count, guint32 *code_len)
+{
+       guint8 *code, *start;
+       int i;
+
+       if (has_target) {
+               start = code = mono_global_codeman_reserve (64);
+
+               /* Replace the this argument with the target */
+               amd64_mov_reg_reg (code, AMD64_RAX, AMD64_ARG_REG1, 8);
+               amd64_mov_reg_membase (code, AMD64_ARG_REG1, AMD64_RAX, G_STRUCT_OFFSET (MonoDelegate, target), 8);
+               amd64_jump_membase (code, AMD64_RAX, G_STRUCT_OFFSET (MonoDelegate, method_ptr));
+
+               g_assert ((code - start) < 64);
+       } else {
+               start = code = mono_global_codeman_reserve (64);
+
+               if (param_count == 0) {
+                       amd64_jump_membase (code, AMD64_ARG_REG1, G_STRUCT_OFFSET (MonoDelegate, method_ptr));
+               } else {
+                       /* We have to shift the arguments left */
+                       amd64_mov_reg_reg (code, AMD64_RAX, AMD64_ARG_REG1, 8);
+                       for (i = 0; i < param_count; ++i) {
+#ifdef PLATFORM_WIN32
+                               if (i < 3)
+                                       amd64_mov_reg_reg (code, param_regs [i], param_regs [i + 1], 8);
+                               else
+                                       amd64_mov_reg_membase (code, param_regs [i], AMD64_RSP, 0x28, 8);
+#else
+                               amd64_mov_reg_reg (code, param_regs [i], param_regs [i + 1], 8);
+#endif
+                       }
+
+                       amd64_jump_membase (code, AMD64_RAX, G_STRUCT_OFFSET (MonoDelegate, method_ptr));
+               }
+               g_assert ((code - start) < 64);
+       }
+
+       mono_debug_add_delegate_trampoline (start, code - start);
+
+       if (code_len)
+               *code_len = code - start;
+
+       return start;
+}
+
+/*
+ * mono_arch_get_delegate_invoke_impls:
+ *
+ *   Return a list of MonoAotTrampInfo structures for the delegate invoke impl
+ * trampolines.
+ */
+GSList*
+mono_arch_get_delegate_invoke_impls (void)
+{
+       GSList *res = NULL;
+       guint8 *code;
+       guint32 code_len;
+       int i;
+
+       code = get_delegate_invoke_impl (TRUE, 0, &code_len);
+       res = g_slist_prepend (res, mono_aot_tramp_info_create (g_strdup ("delegate_invoke_impl_has_target"), code, code_len));
+
+       for (i = 0; i < MAX_ARCH_DELEGATE_PARAMS; ++i) {
+               code = get_delegate_invoke_impl (FALSE, i, &code_len);
+               res = g_slist_prepend (res, mono_aot_tramp_info_create (g_strdup_printf ("delegate_invoke_impl_target_%d", i), code, code_len));
+       }
+
+       return res;
+}
+
 gpointer
 mono_arch_get_delegate_invoke_impl (MonoMethodSignature *sig, gboolean has_target)
 {
@@ -6029,16 +6597,10 @@ mono_arch_get_delegate_invoke_impl (MonoMethodSignature *sig, gboolean has_targe
                if (cached)
                        return cached;
 
-               start = code = mono_global_codeman_reserve (64);
-
-               /* Replace the this argument with the target */
-               amd64_mov_reg_reg (code, AMD64_RAX, AMD64_ARG_REG1, 8);
-               amd64_mov_reg_membase (code, AMD64_ARG_REG1, AMD64_RAX, G_STRUCT_OFFSET (MonoDelegate, target), 8);
-               amd64_jump_membase (code, AMD64_RAX, G_STRUCT_OFFSET (MonoDelegate, method_ptr));
-
-               g_assert ((code - start) < 64);
-
-               mono_debug_add_delegate_trampoline (start, code - start);
+               if (mono_aot_only)
+                       start = mono_aot_get_named_code ("delegate_invoke_impl_has_target");
+               else
+                       start = get_delegate_invoke_impl (TRUE, 0, NULL);
 
                mono_memory_barrier ();
 
@@ -6055,21 +6617,13 @@ mono_arch_get_delegate_invoke_impl (MonoMethodSignature *sig, gboolean has_targe
                if (code)
                        return code;
 
-               start = code = mono_global_codeman_reserve (64);
-
-               if (sig->param_count == 0) {
-                       amd64_jump_membase (code, AMD64_ARG_REG1, G_STRUCT_OFFSET (MonoDelegate, method_ptr));
+               if (mono_aot_only) {
+                       char *name = g_strdup_printf ("delegate_invoke_impl_target_%d", sig->param_count);
+                       start = mono_aot_get_named_code (name);
+                       g_free (name);
                } else {
-                       /* We have to shift the arguments left */
-                       amd64_mov_reg_reg (code, AMD64_RAX, AMD64_ARG_REG1, 8);
-                       for (i = 0; i < sig->param_count; ++i)
-                               amd64_mov_reg_reg (code, param_regs [i], param_regs [i + 1], 8);
-
-                       amd64_jump_membase (code, AMD64_RAX, G_STRUCT_OFFSET (MonoDelegate, method_ptr));
+                       start = get_delegate_invoke_impl (FALSE, sig->param_count, NULL);
                }
-               g_assert ((code - start) < 64);
-
-               mono_debug_add_delegate_trampoline (start, code - start);
 
                mono_memory_barrier ();
 
@@ -6125,50 +6679,6 @@ mono_arch_free_jit_tls_data (MonoJitTlsData *tls)
 {
 }
 
-void
-mono_arch_emit_this_vret_args (MonoCompile *cfg, MonoCallInst *inst, int this_reg, int this_type, int vt_reg)
-{
-       MonoCallInst *call = (MonoCallInst*)inst;
-       CallInfo * cinfo = get_call_info (cfg->generic_sharing_context, cfg->mempool, inst->signature, FALSE);
-
-       if (vt_reg != -1) {
-               MonoInst *vtarg;
-
-               if (cinfo->ret.storage == ArgValuetypeInReg) {
-                       /*
-                        * The valuetype is in RAX:RDX after the call, need to be copied to
-                        * the stack. Save the address here, so the call instruction can
-                        * access it.
-                        */
-                       MonoInst *loc = cfg->arch.vret_addr_loc;
-
-                       g_assert (loc);
-                       g_assert (loc->opcode == OP_REGOFFSET);
-
-                       MONO_EMIT_NEW_STORE_MEMBASE (cfg, OP_STORE_MEMBASE_REG, loc->inst_basereg, loc->inst_offset, vt_reg);
-               } else {
-                       MONO_INST_NEW (cfg, vtarg, OP_MOVE);
-                       vtarg->sreg1 = vt_reg;
-                       vtarg->dreg = mono_regstate_next_int (cfg->rs);
-                       mono_bblock_add_inst (cfg->cbb, vtarg);
-
-                       mono_call_inst_add_outarg_reg (cfg, call, vtarg->dreg, cinfo->ret.reg, FALSE);
-               }
-       }
-
-       /* add the this argument */
-       if (this_reg != -1) {
-               MonoInst *this;
-               MONO_INST_NEW (cfg, this, OP_MOVE);
-               this->type = this_type;
-               this->sreg1 = this_reg;
-               this->dreg = mono_regstate_next_int (cfg->rs);
-               mono_bblock_add_inst (cfg->cbb, this);
-
-               mono_call_inst_add_outarg_reg (cfg, call, this->dreg, cinfo->args [0].reg, FALSE);
-       }
-}
-
 #ifdef MONO_ARCH_HAVE_IMT
 
 #define CMP_SIZE (6 + 1)
@@ -6192,7 +6702,8 @@ imt_branch_distance (MonoIMTCheckItem **imt_entries, int start, int target)
  * LOCKING: called with the domain lock held
  */
 gpointer
-mono_arch_build_imt_thunk (MonoVTable *vtable, MonoDomain *domain, MonoIMTCheckItem **imt_entries, int count)
+mono_arch_build_imt_thunk (MonoVTable *vtable, MonoDomain *domain, MonoIMTCheckItem **imt_entries, int count,
+       gpointer fail_tramp)
 {
        int i;
        int size = 0;
@@ -6204,28 +6715,37 @@ mono_arch_build_imt_thunk (MonoVTable *vtable, MonoDomain *domain, MonoIMTCheckI
                if (item->is_equals) {
                        if (item->check_target_idx) {
                                if (!item->compare_done) {
-                                       if (amd64_is_imm32 (item->method))
+                                       if (amd64_is_imm32 (item->key))
                                                item->chunk_size += CMP_SIZE;
                                        else
                                                item->chunk_size += MOV_REG_IMM_SIZE + CMP_REG_REG_SIZE;
                                }
-                               if (vtable_is_32bit)
-                                       item->chunk_size += MOV_REG_IMM_32BIT_SIZE;
-                               else
+                               if (item->has_target_code) {
                                        item->chunk_size += MOV_REG_IMM_SIZE;
+                               } else {
+                                       if (vtable_is_32bit)
+                                               item->chunk_size += MOV_REG_IMM_32BIT_SIZE;
+                                       else
+                                               item->chunk_size += MOV_REG_IMM_SIZE;
+                               }
                                item->chunk_size += BR_SMALL_SIZE + JUMP_REG_SIZE;
                        } else {
-                               if (vtable_is_32bit)
-                                       item->chunk_size += MOV_REG_IMM_32BIT_SIZE;
-                               else
-                                       item->chunk_size += MOV_REG_IMM_SIZE;
-                               item->chunk_size += JUMP_REG_SIZE;
-                               /* with assert below:
-                                * item->chunk_size += CMP_SIZE + BR_SMALL_SIZE + 1;
-                                */
+                               if (fail_tramp) {
+                                       item->chunk_size += MOV_REG_IMM_SIZE * 3 + CMP_REG_REG_SIZE +
+                                               BR_SMALL_SIZE + JUMP_REG_SIZE * 2;
+                               } else {
+                                       if (vtable_is_32bit)
+                                               item->chunk_size += MOV_REG_IMM_32BIT_SIZE;
+                                       else
+                                               item->chunk_size += MOV_REG_IMM_SIZE;
+                                       item->chunk_size += JUMP_REG_SIZE;
+                                       /* with assert below:
+                                        * item->chunk_size += CMP_SIZE + BR_SMALL_SIZE + 1;
+                                        */
+                               }
                        }
                } else {
-                       if (amd64_is_imm32 (item->method))
+                       if (amd64_is_imm32 (item->key))
                                item->chunk_size += CMP_SIZE;
                        else
                                item->chunk_size += MOV_REG_IMM_SIZE + CMP_REG_REG_SIZE;
@@ -6234,39 +6754,56 @@ mono_arch_build_imt_thunk (MonoVTable *vtable, MonoDomain *domain, MonoIMTCheckI
                }
                size += item->chunk_size;
        }
-       code = mono_code_manager_reserve (domain->code_mp, size);
+       if (fail_tramp)
+               code = mono_method_alloc_generic_virtual_thunk (domain, size);
+       else
+               code = mono_domain_code_reserve (domain, size);
        start = code;
        for (i = 0; i < count; ++i) {
                MonoIMTCheckItem *item = imt_entries [i];
                item->code_target = code;
                if (item->is_equals) {
-                       if (item->check_target_idx) {
-                               if (!item->compare_done) {
-                                       if (amd64_is_imm32 (item->method))
-                                               amd64_alu_reg_imm (code, X86_CMP, MONO_ARCH_IMT_REG, (guint32)(gssize)item->method);
+                       gboolean fail_case = !item->check_target_idx && fail_tramp;
+
+                       if (item->check_target_idx || fail_case) {
+                               if (!item->compare_done || fail_case) {
+                                       if (amd64_is_imm32 (item->key))
+                                               amd64_alu_reg_imm (code, X86_CMP, MONO_ARCH_IMT_REG, (guint32)(gssize)item->key);
                                        else {
-                                               amd64_mov_reg_imm (code, AMD64_R10, item->method);
+                                               amd64_mov_reg_imm (code, AMD64_R10, item->key);
                                                amd64_alu_reg_reg (code, X86_CMP, MONO_ARCH_IMT_REG, AMD64_R10);
                                        }
                                }
                                item->jmp_code = code;
                                amd64_branch8 (code, X86_CC_NE, 0, FALSE);
                                /* See the comment below about R10 */
-                               amd64_mov_reg_imm (code, AMD64_R10, & (vtable->vtable [item->vtable_slot]));
-                               amd64_jump_membase (code, AMD64_R10, 0);
+                               if (item->has_target_code) {
+                                       amd64_mov_reg_imm (code, AMD64_R10, item->value.target_code);
+                                       amd64_jump_reg (code, AMD64_R10);
+                               } else {
+                                       amd64_mov_reg_imm (code, AMD64_R10, & (vtable->vtable [item->value.vtable_slot]));
+                                       amd64_jump_membase (code, AMD64_R10, 0);
+                               }
+
+                               if (fail_case) {
+                                       amd64_patch (item->jmp_code, code);
+                                       amd64_mov_reg_imm (code, AMD64_R10, fail_tramp);
+                                       amd64_jump_reg (code, AMD64_R10);
+                                       item->jmp_code = NULL;
+                               }
                        } else {
                                /* enable the commented code to assert on wrong method */
 #if 0
-                               if (amd64_is_imm32 (item->method))
-                                       amd64_alu_reg_imm (code, X86_CMP, MONO_ARCH_IMT_REG, (guint32)(gssize)item->method);
+                               if (amd64_is_imm32 (item->key))
+                                       amd64_alu_reg_imm (code, X86_CMP, MONO_ARCH_IMT_REG, (guint32)(gssize)item->key);
                                else {
-                                       amd64_mov_reg_imm (code, AMD64_R10, item->method);
+                                       amd64_mov_reg_imm (code, AMD64_R10, item->key);
                                        amd64_alu_reg_reg (code, X86_CMP, MONO_ARCH_IMT_REG, AMD64_R10);
                                }
                                item->jmp_code = code;
                                amd64_branch8 (code, X86_CC_NE, 0, FALSE);
                                /* See the comment below about R10 */
-                               amd64_mov_reg_imm (code, AMD64_R10, & (vtable->vtable [item->vtable_slot]));
+                               amd64_mov_reg_imm (code, AMD64_R10, & (vtable->vtable [item->value.vtable_slot]));
                                amd64_jump_membase (code, AMD64_R10, 0);
                                amd64_patch (item->jmp_code, code);
                                amd64_breakpoint (code);
@@ -6277,15 +6814,15 @@ mono_arch_build_imt_thunk (MonoVTable *vtable, MonoDomain *domain, MonoIMTCheckI
                                   to be preserved for calls which
                                   require a runtime generic context,
                                   but interface calls don't. */
-                               amd64_mov_reg_imm (code, AMD64_R10, & (vtable->vtable [item->vtable_slot]));
+                               amd64_mov_reg_imm (code, AMD64_R10, & (vtable->vtable [item->value.vtable_slot]));
                                amd64_jump_membase (code, AMD64_R10, 0);
 #endif
                        }
                } else {
-                       if (amd64_is_imm32 (item->method))
-                               amd64_alu_reg_imm (code, X86_CMP, MONO_ARCH_IMT_REG, (guint32)(gssize)item->method);
+                       if (amd64_is_imm32 (item->key))
+                               amd64_alu_reg_imm (code, X86_CMP, MONO_ARCH_IMT_REG, (guint32)(gssize)item->key);
                        else {
-                               amd64_mov_reg_imm (code, AMD64_R10, item->method);
+                               amd64_mov_reg_imm (code, AMD64_R10, item->key);
                                amd64_alu_reg_reg (code, X86_CMP, MONO_ARCH_IMT_REG, AMD64_R10);
                        }
                        item->jmp_code = code;
@@ -6305,101 +6842,33 @@ mono_arch_build_imt_thunk (MonoVTable *vtable, MonoDomain *domain, MonoIMTCheckI
                        }
                }
        }
-               
-       mono_stats.imt_thunks_size += code - start;
+
+       if (!fail_tramp)
+               mono_stats.imt_thunks_size += code - start;
        g_assert (code - start <= size);
 
        return start;
 }
 
 MonoMethod*
-mono_arch_find_imt_method (gpointer *regs, guint8 *code)
+mono_arch_find_imt_method (mgreg_t *regs, guint8 *code)
 {
-       return regs [MONO_ARCH_IMT_REG];
+       return (MonoMethod*)regs [MONO_ARCH_IMT_REG];
 }
 
 MonoObject*
-mono_arch_find_this_argument (gpointer *regs, MonoMethod *method, MonoGenericSharingContext *gsctx)
-{
-       return mono_arch_get_this_arg_from_call (gsctx, mono_method_signature (method), (gssize*)regs, NULL);
-}
-
-void
-mono_arch_emit_imt_argument (MonoCompile *cfg, MonoCallInst *call)
+mono_arch_find_this_argument (mgreg_t *regs, MonoMethod *method, MonoGenericSharingContext *gsctx)
 {
-       /* Done by the implementation of the CALL_MEMBASE opcodes */
+       return mono_arch_get_this_arg_from_call (gsctx, mono_method_signature (method), regs, NULL);
 }
 #endif
 
 MonoVTable*
-mono_arch_find_static_call_vtable (gpointer *regs, guint8 *code)
+mono_arch_find_static_call_vtable (mgreg_t *regs, guint8 *code)
 {
        return (MonoVTable*) regs [MONO_ARCH_RGCTX_REG];
 }
 
-MonoInst*
-mono_arch_get_inst_for_method (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
-{
-       MonoInst *ins = NULL;
-
-       if (cmethod->klass == mono_defaults.math_class) {
-               if (strcmp (cmethod->name, "Sin") == 0) {
-                       MONO_INST_NEW (cfg, ins, OP_SIN);
-                       ins->inst_i0 = args [0];
-               } else if (strcmp (cmethod->name, "Cos") == 0) {
-                       MONO_INST_NEW (cfg, ins, OP_COS);
-                       ins->inst_i0 = args [0];
-               } else if (strcmp (cmethod->name, "Sqrt") == 0) {
-                       MONO_INST_NEW (cfg, ins, OP_SQRT);
-                       ins->inst_i0 = args [0];
-               } else if (strcmp (cmethod->name, "Abs") == 0 && fsig->params [0]->type == MONO_TYPE_R8) {
-                       MONO_INST_NEW (cfg, ins, OP_ABS);
-                       ins->inst_i0 = args [0];
-               }
-
-               if (cfg->opt & MONO_OPT_CMOV) {
-                       int opcode = 0;
-
-                       if (strcmp (cmethod->name, "Min") == 0) {
-                               if (fsig->params [0]->type == MONO_TYPE_I4)
-                                       opcode = OP_IMIN;
-                               if (fsig->params [0]->type == MONO_TYPE_U4)
-                                       opcode = OP_IMIN_UN;
-                               else if (fsig->params [0]->type == MONO_TYPE_I8)
-                                       opcode = OP_LMIN;
-                               else if (fsig->params [0]->type == MONO_TYPE_U8)
-                                       opcode = OP_LMIN_UN;
-                       } else if (strcmp (cmethod->name, "Max") == 0) {
-                               if (fsig->params [0]->type == MONO_TYPE_I4)
-                                       opcode = OP_IMAX;
-                               if (fsig->params [0]->type == MONO_TYPE_U4)
-                                       opcode = OP_IMAX_UN;
-                               else if (fsig->params [0]->type == MONO_TYPE_I8)
-                                       opcode = OP_LMAX;
-                               else if (fsig->params [0]->type == MONO_TYPE_U8)
-                                       opcode = OP_LMAX_UN;
-                       }               
-
-                       if (opcode) {
-                               MONO_INST_NEW (cfg, ins, opcode);
-                               ins->inst_i0 = args [0];
-                               ins->inst_i1 = args [1];
-                       }
-               }
-
-#if 0
-               /* OP_FREM is not IEEE compatible */
-               else if (strcmp (cmethod->name, "IEEERemainder") == 0) {
-                       MONO_INST_NEW (cfg, ins, OP_FREM);
-                       ins->inst_i0 = args [0];
-                       ins->inst_i1 = args [1];
-               }
-#endif
-       }
-
-       return ins;
-}
-
 MonoInst*
 mono_arch_emit_inst_for_method (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
 {