2009-04-03 Zoltan Varga <vargaz@gmail.com>
[mono.git] / mono / mini / mini-amd64.c
index f53c28e2a9d3e5e363e047879363e458941614cc..92183fceb14ac6a5354fe9607b6c8b485bd7100f 100644 (file)
@@ -26,8 +26,8 @@
 #include <mono/utils/mono-math.h>
 
 #include "trace.h"
+#include "ir-emit.h"
 #include "mini-amd64.h"
-#include "inssel.h"
 #include "cpu-amd64.h"
 
 /* 
@@ -93,8 +93,6 @@ mono_breakpoint_info [MONO_BREAKPOINT_ARRAY_SIZE];
  * UNORDERED        1  1  1
  */
 
-void mini_emit_memcpy2 (MonoCompile *cfg, int destreg, int doffset, int srcreg, int soffset, int size, int align);
-
 const char*
 mono_arch_regname (int reg)
 {
@@ -250,13 +248,6 @@ typedef struct {
 
 #define DEBUG(a) if (cfg->verbose_level > 1) a
 
-#define NEW_ICONST(cfg,dest,val) do {  \
-               (dest) = mono_mempool_alloc0 ((cfg)->mempool, sizeof (MonoInst));       \
-               (dest)->opcode = OP_ICONST;     \
-               (dest)->inst_c0 = (val);        \
-               (dest)->type = STACK_I4;        \
-       } while (0)
-
 #ifdef PLATFORM_WIN32
 #define PARAM_REGS 4
 
@@ -326,7 +317,7 @@ merge_argument_class_from_type (MonoType *type, ArgumentClass class1)
        ArgumentClass class2 = ARG_CLASS_NO_CLASS;
        MonoType *ptype;
 
-       ptype = mono_type_get_underlying_type (type);
+       ptype = mini_type_get_underlying_type (NULL, type);
        switch (ptype->type) {
        case MONO_TYPE_BOOLEAN:
        case MONO_TYPE_CHAR:
@@ -403,7 +394,7 @@ add_valuetype (MonoGenericSharingContext *gsctx, MonoMethodSignature *sig, ArgIn
 {
        guint32 size, quad, nquads, i;
        ArgumentClass args [2];
-       MonoMarshalType *info;
+       MonoMarshalType *info = NULL;
        MonoClass *klass;
        MonoGenericSharingContext tmp_gsctx;
 
@@ -416,10 +407,7 @@ add_valuetype (MonoGenericSharingContext *gsctx, MonoMethodSignature *sig, ArgIn
                gsctx = &tmp_gsctx;
 
        klass = mono_class_from_mono_type (type);
-       if (sig->pinvoke) 
-               size = mono_type_native_stack_size (&klass->byval_arg, NULL);
-       else 
-               size = mini_type_stack_size (gsctx, &klass->byval_arg, NULL);
+       size = mini_type_stack_size_full (gsctx, &klass->byval_arg, NULL, sig->pinvoke);
 #ifndef PLATFORM_WIN32
        if (!sig->pinvoke && !disable_vtypes_in_regs && ((is_return && (size == 8)) || (!is_return && (size <= 16)))) {
                /* We pass and return vtypes of size 8 in a register */
@@ -614,8 +602,7 @@ get_call_info (MonoGenericSharingContext *gsctx, MonoMemPool *mp, MonoMethodSign
 
        /* return value */
        {
-               ret_type = mono_type_get_underlying_type (sig->ret);
-               ret_type = mini_get_basic_type_from_generic (gsctx, ret_type);
+               ret_type = mini_type_get_underlying_type (gsctx, sig->ret);
                switch (ret_type->type) {
                case MONO_TYPE_BOOLEAN:
                case MONO_TYPE_I1:
@@ -719,8 +706,7 @@ get_call_info (MonoGenericSharingContext *gsctx, MonoMemPool *mp, MonoMethodSign
                        add_general (&gr, &stack_size, ainfo);
                        continue;
                }
-               ptype = mono_type_get_underlying_type (sig->params [i]);
-               ptype = mini_get_basic_type_from_generic (gsctx, ptype);
+               ptype = mini_type_get_underlying_type (gsctx, sig->params [i]);
                switch (ptype->type) {
                case MONO_TYPE_BOOLEAN:
                case MONO_TYPE_I1:
@@ -919,10 +905,7 @@ mono_arch_cpu_optimizazions (guint32 *exclude_mask)
                } else
                        *exclude_mask |= MONO_OPT_CMOV;
        }
-#ifdef PLATFORM_WIN32
-       /* FIXME */
-       *exclude_mask |= (MONO_OPT_PEEPHOLE | MONO_OPT_BRANCH);
-#endif
+
        return opts;
 }
 
@@ -1025,11 +1008,6 @@ mono_arch_compute_omit_fp (MonoCompile *cfg)
 
                locals_size += mono_type_size (ins->inst_vtype, &ialign);
        }
-
-       if ((cfg->num_varinfo > 10000) || (locals_size >= (1 << 15))) {
-               /* Avoid hitting the stack_alloc_size < (1 << 16) assertion in emit_epilog () */
-               cfg->arch.omit_fp = FALSE;
-       }
 }
 
 GList *
@@ -1067,6 +1045,10 @@ mono_arch_get_global_int_regs (MonoCompile *cfg)
                regs = g_list_prepend (regs, (gpointer)AMD64_R13);
                regs = g_list_prepend (regs, (gpointer)AMD64_R14);
                regs = g_list_prepend (regs, (gpointer)AMD64_R15);
+#ifdef PLATFORM_WIN32
+               regs = g_list_prepend (regs, (gpointer)AMD64_RDI);
+               regs = g_list_prepend (regs, (gpointer)AMD64_RSI);
+#endif
        }
 
        return regs;
@@ -1520,34 +1502,7 @@ mono_arch_create_vars (MonoCompile *cfg)
 }
 
 static void
-add_outarg_reg (MonoCompile *cfg, MonoCallInst *call, MonoInst *arg, ArgStorage storage, int reg, MonoInst *tree)
-{
-       switch (storage) {
-       case ArgInIReg:
-               arg->opcode = OP_OUTARG_REG;
-               arg->inst_left = tree;
-               arg->inst_call = call;
-               arg->backend.reg3 = reg;
-               break;
-       case ArgInFloatSSEReg:
-               arg->opcode = OP_AMD64_OUTARG_XMMREG_R4;
-               arg->inst_left = tree;
-               arg->inst_call = call;
-               arg->backend.reg3 = reg;
-               break;
-       case ArgInDoubleSSEReg:
-               arg->opcode = OP_AMD64_OUTARG_XMMREG_R8;
-               arg->inst_left = tree;
-               arg->inst_call = call;
-               arg->backend.reg3 = reg;
-               break;
-       default:
-               g_assert_not_reached ();
-       }
-}
-
-static void
-add_outarg_reg2 (MonoCompile *cfg, MonoCallInst *call, ArgStorage storage, int reg, MonoInst *tree)
+add_outarg_reg (MonoCompile *cfg, MonoCallInst *call, ArgStorage storage, int reg, MonoInst *tree)
 {
        MonoInst *ins;
 
@@ -1581,23 +1536,6 @@ add_outarg_reg2 (MonoCompile *cfg, MonoCallInst *call, ArgStorage storage, int r
        }
 }
 
-static int
-arg_storage_to_ldind (ArgStorage storage)
-{
-       switch (storage) {
-       case ArgInIReg:
-               return CEE_LDIND_I;
-       case ArgInDoubleSSEReg:
-               return CEE_LDIND_R8;
-       case ArgInFloatSSEReg:
-               return CEE_LDIND_R4;
-       default:
-               g_assert_not_reached ();
-       }
-
-       return -1;
-}
-
 static int
 arg_storage_to_load_membase (ArgStorage storage)
 {
@@ -1617,298 +1555,6 @@ arg_storage_to_load_membase (ArgStorage storage)
 
 static void
 emit_sig_cookie (MonoCompile *cfg, MonoCallInst *call, CallInfo *cinfo)
-{
-       MonoInst *arg;
-       MonoMethodSignature *tmp_sig;
-       MonoInst *sig_arg;
-                       
-       /* FIXME: Add support for signature tokens to AOT */
-       cfg->disable_aot = TRUE;
-
-       g_assert (cinfo->sig_cookie.storage == ArgOnStack);
-
-       /*
-        * mono_ArgIterator_Setup assumes the signature cookie is 
-        * passed first and all the arguments which were before it are
-        * passed on the stack after the signature. So compensate by 
-        * passing a different signature.
-        */
-       tmp_sig = mono_metadata_signature_dup (call->signature);
-       tmp_sig->param_count -= call->signature->sentinelpos;
-       tmp_sig->sentinelpos = 0;
-       memcpy (tmp_sig->params, call->signature->params + call->signature->sentinelpos, tmp_sig->param_count * sizeof (MonoType*));
-
-       MONO_INST_NEW (cfg, sig_arg, OP_ICONST);
-       sig_arg->inst_p0 = tmp_sig;
-
-       MONO_INST_NEW (cfg, arg, OP_OUTARG);
-       arg->inst_left = sig_arg;
-       arg->type = STACK_PTR;
-
-       /* prepend, so they get reversed */
-       arg->next = call->out_args;
-       call->out_args = arg;
-}
-
-/* 
- * take the arguments and generate the arch-specific
- * instructions to properly call the function in call.
- * This includes pushing, moving arguments to the right register
- * etc.
- */
-MonoCallInst*
-mono_arch_call_opcode (MonoCompile *cfg, MonoBasicBlock* bb, MonoCallInst *call, int is_virtual) {
-       MonoInst *arg, *in;
-       MonoMethodSignature *sig;
-       int i, n, stack_size;
-       CallInfo *cinfo;
-       ArgInfo *ainfo;
-
-       stack_size = 0;
-
-       sig = call->signature;
-       n = sig->param_count + sig->hasthis;
-
-       cinfo = get_call_info (cfg->generic_sharing_context, cfg->mempool, sig, sig->pinvoke);
-
-       if (cfg->method->save_lmf) {
-               MONO_INST_NEW (cfg, arg, OP_AMD64_SAVE_SP_TO_LMF);
-               arg->next = call->out_args;
-               call->out_args = arg;
-       }
-
-       for (i = 0; i < n; ++i) {
-               ainfo = cinfo->args + i;
-
-               if (!sig->pinvoke && (sig->call_convention == MONO_CALL_VARARG) && (i == sig->sentinelpos)) {
-                       /* Emit the signature cookie just before the implicit arguments */
-                       emit_sig_cookie (cfg, call, cinfo);
-               }
-
-               if (is_virtual && i == 0) {
-                       /* the argument will be attached to the call instruction */
-                       in = call->args [i];
-               } else {
-                       MONO_INST_NEW (cfg, arg, OP_OUTARG);
-                       in = call->args [i];
-                       arg->cil_code = in->cil_code;
-                       arg->inst_left = in;
-                       arg->type = in->type;
-                       /* prepend, so they get reversed */
-                       arg->next = call->out_args;
-                       call->out_args = arg;
-#if 0
-                       if (!cinfo->stack_usage)
-                               /* Keep the assignments to the arg registers in order if possible */
-                               MONO_INST_LIST_ADD_TAIL (&arg->node, &call->out_args);
-                       else
-                               MONO_INST_LIST_ADD (&arg->node, &call->out_args);
-#endif
-
-                       if ((i >= sig->hasthis) && (MONO_TYPE_ISSTRUCT(sig->params [i - sig->hasthis]))) {
-                               guint32 align;
-                               guint32 size;
-
-                               if (sig->params [i - sig->hasthis]->type == MONO_TYPE_TYPEDBYREF) {
-                                       size = sizeof (MonoTypedRef);
-                                       align = sizeof (gpointer);
-                               }
-                               else
-                               if (sig->pinvoke)
-                                       size = mono_type_native_stack_size (&in->klass->byval_arg, &align);
-                               else {
-                                       /* 
-                                        * Other backends use mini_type_stack_size (), but that
-                                        * aligns the size to 8, which is larger than the size of
-                                        * the source, leading to reads of invalid memory if the
-                                        * source is at the end of address space.
-                                        */
-                                       size = mono_class_value_size (in->klass, &align);
-                               }
-                               if (ainfo->storage == ArgValuetypeInReg) {
-                                       if (ainfo->pair_storage [1] == ArgNone) {
-                                               MonoInst *load;
-
-                                               /* Simpler case */
-
-                                               MONO_INST_NEW (cfg, load, arg_storage_to_ldind (ainfo->pair_storage [0]));
-                                               load->inst_left = in;
-
-                                               add_outarg_reg (cfg, call, arg, ainfo->pair_storage [0], ainfo->pair_regs [0], load);
-                                       }
-                                       else {
-                                               /* Trees can't be shared so make a copy */
-                                               MonoInst *vtaddr = mono_compile_create_var (cfg, &mono_defaults.int_class->byval_arg, OP_LOCAL);
-                                               MonoInst *load, *load2, *offset_ins;
-
-                                               /* Reg1 */
-                                               MONO_INST_NEW (cfg, load, CEE_LDIND_I);
-                                               load->ssa_op = MONO_SSA_LOAD;
-                                               load->inst_i0 = (cfg)->varinfo [vtaddr->inst_c0];
-
-                                               NEW_ICONST (cfg, offset_ins, 0);
-                                               MONO_INST_NEW (cfg, load2, CEE_ADD);
-                                               load2->inst_left = load;
-                                               load2->inst_right = offset_ins;
-
-                                               MONO_INST_NEW (cfg, load, arg_storage_to_ldind (ainfo->pair_storage [0]));
-                                               load->inst_left = load2;
-
-                                               add_outarg_reg (cfg, call, arg, ainfo->pair_storage [0], ainfo->pair_regs [0], load);
-
-                                               /* Reg2 */
-                                               MONO_INST_NEW (cfg, load, CEE_LDIND_I);
-                                               load->ssa_op = MONO_SSA_LOAD;
-                                               load->inst_i0 = (cfg)->varinfo [vtaddr->inst_c0];
-
-                                               NEW_ICONST (cfg, offset_ins, 8);
-                                               MONO_INST_NEW (cfg, load2, CEE_ADD);
-                                               load2->inst_left = load;
-                                               load2->inst_right = offset_ins;
-
-                                               MONO_INST_NEW (cfg, load, arg_storage_to_ldind (ainfo->pair_storage [1]));
-                                               load->inst_left = load2;
-
-                                               MONO_INST_NEW (cfg, arg, OP_OUTARG);
-                                               arg->cil_code = in->cil_code;
-                                               arg->type = in->type;
-                                               /* prepend, so they get reversed */
-                                               arg->next = call->out_args;
-                                               call->out_args = arg;
-
-                                               add_outarg_reg (cfg, call, arg, ainfo->pair_storage [1], ainfo->pair_regs [1], load);
-
-                                               /* Prepend a copy inst */
-                                               MONO_INST_NEW (cfg, arg, CEE_STIND_I);
-                                               arg->cil_code = in->cil_code;
-                                               arg->ssa_op = MONO_SSA_STORE;
-                                               arg->inst_left = vtaddr;
-                                               arg->inst_right = in;
-                                               arg->type = in->type;
-
-                                               /* prepend, so they get reversed */
-                                               arg->next = call->out_args;
-                                               call->out_args = arg;
-                                       }
-                               }
-                               else if (ainfo->storage == ArgValuetypeAddrInIReg){
-
-                                       /* Add a temp variable to the method*/
-                                       MonoInst *load;
-                                       MonoInst *vtaddr = mono_compile_create_var (cfg, &in->klass->byval_arg, OP_LOCAL);
-                                       
-                                       MONO_INST_NEW (cfg, load, OP_LDADDR);
-                                       load->ssa_op = MONO_SSA_LOAD;
-                                       load->inst_left = vtaddr;
-                                       
-                                       if (ainfo->pair_storage [0] == ArgInIReg) {
-                                               /* Inserted after the copy.  Load the address of the temp to the argument regster.*/
-                                               arg->opcode = OP_OUTARG_REG;
-                                               arg->inst_left = load;
-                                               arg->inst_call = call;
-                                               arg->backend.reg3 =  ainfo->pair_regs [0];
-                                       } 
-                                       else {
-                                               /* Inserted after the copy.  Load the address of the temp on the stack.*/
-                                               arg->opcode = OP_OUTARG_VT;
-                                               arg->inst_left = load;
-                                               arg->type = STACK_PTR;
-                                               arg->klass = mono_defaults.int_class;
-                                               arg->backend.is_pinvoke = sig->pinvoke;
-                                               arg->inst_imm = size;
-                                       }
-
-                                       /*Copy the argument to the temp variable.*/
-                                       MONO_INST_NEW (cfg, load, OP_MEMCPY);
-                                       load->backend.memcpy_args = mono_mempool_alloc0 (cfg->mempool, sizeof (MonoMemcpyArgs));
-                                       load->backend.memcpy_args->size = size;
-                                       load->backend.memcpy_args->align = align;
-                                       load->inst_left = (cfg)->varinfo [vtaddr->inst_c0];
-                                       load->inst_right = in->inst_i0;
-
-                                       // FIXME:
-                                       g_assert_not_reached ();
-                                       //MONO_INST_LIST_ADD (&load->node, &call->out_args);
-                               }
-                               else {
-                                       arg->opcode = OP_OUTARG_VT;
-                                       arg->klass = in->klass;
-                                       arg->backend.is_pinvoke = sig->pinvoke;
-                                       arg->inst_imm = size;
-                               }
-                       }
-                       else {
-                               switch (ainfo->storage) {
-                               case ArgInIReg:
-                                       add_outarg_reg (cfg, call, arg, ainfo->storage, ainfo->reg, in);
-                                       break;
-                               case ArgInFloatSSEReg:
-                               case ArgInDoubleSSEReg:
-                                       add_outarg_reg (cfg, call, arg, ainfo->storage, ainfo->reg, in);
-                                       break;
-                               case ArgOnStack:
-                                       arg->opcode = OP_OUTARG;
-                                       if (!sig->params [i - sig->hasthis]->byref) {
-                                               if (sig->params [i - sig->hasthis]->type == MONO_TYPE_R4)
-                                                       arg->opcode = OP_OUTARG_R4;
-                                               else
-                                                       if (sig->params [i - sig->hasthis]->type == MONO_TYPE_R8)
-                                                               arg->opcode = OP_OUTARG_R8;
-                                       }
-                                       break;
-                               default:
-                                       g_assert_not_reached ();
-                               }
-                       }
-               }
-       }
-
-       /* Handle the case where there are no implicit arguments */
-       if (!sig->pinvoke && (sig->call_convention == MONO_CALL_VARARG) && (n == sig->sentinelpos)) {
-               emit_sig_cookie (cfg, call, cinfo);
-       }
-
-       if (cinfo->ret.storage == ArgValuetypeInReg) {
-               /* This is needed by mono_arch_emit_this_vret_args () */
-               if (!cfg->arch.vret_addr_loc) {
-                       cfg->arch.vret_addr_loc = mono_compile_create_var (cfg, &mono_defaults.int_class->byval_arg, OP_LOCAL);
-                       /* Prevent it from being register allocated or optimized away */
-                       ((MonoInst*)cfg->arch.vret_addr_loc)->flags |= MONO_INST_VOLATILE;
-               }
-       }
-
-       if (cinfo->need_stack_align) {
-               MONO_INST_NEW (cfg, arg, OP_AMD64_OUTARG_ALIGN_STACK);
-               arg->inst_c0 = 8;
-               /* prepend, so they get reversed */
-               arg->next = call->out_args;
-               call->out_args = arg;
-       }
-
-#ifdef PLATFORM_WIN32
-       /* Always reserve 32 bytes of stack space on Win64 */
-       /*MONO_INST_NEW (cfg, arg, OP_AMD64_OUTARG_ALIGN_STACK);
-       arg->inst_c0 = 32;
-       MONO_INST_LIST_ADD_TAIL (&arg->node, &call->out_args);*/
-       NOT_IMPLEMENTED;
-#endif
-
-#if 0
-       if (cfg->method->save_lmf) {
-               MONO_INST_NEW (cfg, arg, OP_AMD64_SAVE_SP_TO_LMF);
-               MONO_INST_LIST_ADD_TAIL (&arg->node, &call->out_args);
-       }
-#endif
-
-       call->stack_usage = cinfo->stack_usage;
-       cfg->param_area = MAX (cfg->param_area, call->stack_usage);
-       cfg->flags |= MONO_CFG_HAS_CALLS;
-
-       return call;
-}
-
-static void
-emit_sig_cookie2 (MonoCompile *cfg, MonoCallInst *call, CallInfo *cinfo)
 {
        MonoInst *arg;
        MonoMethodSignature *tmp_sig;
@@ -1959,6 +1605,51 @@ mono_arch_emit_call (MonoCompile *cfg, MonoCallInst *call)
 
        cinfo = get_call_info (cfg->generic_sharing_context, cfg->mempool, sig, sig->pinvoke);
 
+       if (cfg->compile_llvm) {
+               for (i = 0; i < n; ++i) {
+                       MonoInst *ins;
+
+                       ainfo = cinfo->args + i;
+
+                       in = call->args [i];
+
+                       /* Simply remember the arguments */
+                       switch (ainfo->storage) {
+                       case ArgInIReg:
+                               MONO_INST_NEW (cfg, ins, OP_MOVE);
+                               ins->dreg = mono_alloc_ireg (cfg);
+                               ins->sreg1 = in->dreg;
+                               break;
+                       case ArgInDoubleSSEReg:
+                       case ArgInFloatSSEReg:
+                               MONO_INST_NEW (cfg, ins, OP_FMOVE);
+                               ins->dreg = mono_alloc_freg (cfg);
+                               ins->sreg1 = in->dreg;
+                               break;
+                       case ArgOnStack:
+                               if ((i >= sig->hasthis) && (MONO_TYPE_ISSTRUCT(sig->params [i - sig->hasthis]))) {
+                                       cfg->exception_message = g_strdup ("vtype argument");
+                                       cfg->disable_llvm = TRUE;
+                               } else {
+                                       MONO_INST_NEW (cfg, ins, OP_MOVE);
+                                       ins->dreg = mono_alloc_ireg (cfg);
+                                       ins->sreg1 = in->dreg;
+                               }
+                               break;
+                       default:
+                               cfg->exception_message = g_strdup ("ainfo->storage");
+                               cfg->disable_llvm = TRUE;
+                               return;
+                       }
+
+                       if (!cfg->disable_llvm) {
+                               MONO_ADD_INS (cfg->cbb, ins);
+                               mono_call_inst_add_outarg_reg (cfg, call, ins->dreg, 0, FALSE);
+                       }
+               }
+               return;
+       }
+
        if (cinfo->need_stack_align) {
                MONO_EMIT_NEW_BIALU_IMM (cfg, OP_SUB_IMM, X86_ESP, X86_ESP, 8);
        }
@@ -1973,7 +1664,7 @@ mono_arch_emit_call (MonoCompile *cfg, MonoCallInst *call)
                in = call->args [i];
 
                if (ainfo->storage == ArgInIReg)
-                       add_outarg_reg2 (cfg, call, ainfo->storage, ainfo->reg, in);
+                       add_outarg_reg (cfg, call, ainfo->storage, ainfo->reg, in);
        }
 
        for (i = n - 1; i >= 0; --i) {
@@ -1987,14 +1678,16 @@ mono_arch_emit_call (MonoCompile *cfg, MonoCallInst *call)
                        break;
                case ArgInFloatSSEReg:
                case ArgInDoubleSSEReg:
-                       add_outarg_reg2 (cfg, call, ainfo->storage, ainfo->reg, in);
+                       add_outarg_reg (cfg, call, ainfo->storage, ainfo->reg, in);
                        break;
                case ArgOnStack:
                case ArgValuetypeInReg:
                case ArgValuetypeAddrInIReg:
-                       if (ainfo->storage == ArgOnStack && call->tail_call)
-                               NOT_IMPLEMENTED;
-                       if ((i >= sig->hasthis) && (MONO_TYPE_ISSTRUCT(sig->params [i - sig->hasthis]))) {
+                       if (ainfo->storage == ArgOnStack && call->tail_call) {
+                               MonoInst *call_inst = (MonoInst*)call;
+                               cfg->args [i]->flags |= MONO_INST_VOLATILE;
+                               EMIT_NEW_ARGSTORE (cfg, call_inst, i, in);
+                       } else if ((i >= sig->hasthis) && (MONO_TYPE_ISSTRUCT(sig->params [i - sig->hasthis]))) {
                                guint32 align;
                                guint32 size;
 
@@ -2051,16 +1744,14 @@ mono_arch_emit_call (MonoCompile *cfg, MonoCallInst *call)
                        g_assert_not_reached ();
                }
 
-               if (!sig->pinvoke && (sig->call_convention == MONO_CALL_VARARG) && (i == sig->sentinelpos)) {
+               if (!sig->pinvoke && (sig->call_convention == MONO_CALL_VARARG) && (i == sig->sentinelpos))
                        /* Emit the signature cookie just before the implicit arguments */
-                       emit_sig_cookie2 (cfg, call, cinfo);
-               }
+                       emit_sig_cookie (cfg, call, cinfo);
        }
 
        /* Handle the case where there are no implicit arguments */
-       if (!sig->pinvoke && (sig->call_convention == MONO_CALL_VARARG) && (n == sig->sentinelpos)) {
-               emit_sig_cookie2 (cfg, call, cinfo);
-       }
+       if (!sig->pinvoke && (sig->call_convention == MONO_CALL_VARARG) && (n == sig->sentinelpos))
+               emit_sig_cookie (cfg, call, cinfo);
 
        if (sig->ret && MONO_TYPE_ISSTRUCT (sig->ret)) {
                MonoInst *vtarg;
@@ -2153,7 +1844,7 @@ mono_arch_emit_outarg_vt (MonoCompile *cfg, MonoInst *ins, MonoInst *src)
                        }
                        MONO_ADD_INS (cfg->cbb, load);
 
-                       add_outarg_reg2 (cfg, call, ainfo->pair_storage [part], ainfo->pair_regs [part], load);
+                       add_outarg_reg (cfg, call, ainfo->pair_storage [part], ainfo->pair_regs [part], load);
                }
        } else if (ainfo->storage == ArgValuetypeAddrInIReg) {
                MonoInst *vtaddr, *load;
@@ -2166,14 +1857,15 @@ mono_arch_emit_outarg_vt (MonoCompile *cfg, MonoInst *ins, MonoInst *src)
                load->klass = vtaddr->klass;
                load->dreg = mono_alloc_ireg (cfg);
                MONO_ADD_INS (cfg->cbb, load);
-               mini_emit_memcpy2 (cfg, load->dreg, 0, src->dreg, 0, size, 4);
+               mini_emit_memcpy (cfg, load->dreg, 0, src->dreg, 0, size, 4);
 
                if (ainfo->pair_storage [0] == ArgInIReg) {
                        MONO_INST_NEW (cfg, arg, OP_X86_LEA_MEMBASE);
-                       arg->dreg = ainfo->pair_regs [0];
+                       arg->dreg = mono_alloc_ireg (cfg);
                        arg->sreg1 = load->dreg;
                        arg->inst_imm = 0;
                        MONO_ADD_INS (cfg->cbb, arg);
+                       mono_call_inst_add_outarg_reg (cfg, call, arg->dreg, ainfo->pair_regs [0], FALSE);
                } else {
                        MONO_INST_NEW (cfg, arg, OP_X86_PUSH);
                        arg->sreg1 = load->dreg;
@@ -2188,7 +1880,7 @@ mono_arch_emit_outarg_vt (MonoCompile *cfg, MonoInst *ins, MonoInst *src)
                        MONO_ADD_INS (cfg->cbb, arg);
                } else if (size <= 40) {
                        MONO_EMIT_NEW_BIALU_IMM (cfg, OP_SUB_IMM, X86_ESP, X86_ESP, ALIGN_TO (size, 8));
-                       mini_emit_memcpy2 (cfg, X86_ESP, 0, src->dreg, 0, size, 4);
+                       mini_emit_memcpy (cfg, X86_ESP, 0, src->dreg, 0, size, 4);
                } else {
                        MONO_INST_NEW (cfg, arg, OP_X86_PUSH_OBJ);
                        arg->inst_basereg = src->dreg;
@@ -2202,7 +1894,7 @@ mono_arch_emit_outarg_vt (MonoCompile *cfg, MonoInst *ins, MonoInst *src)
 void
 mono_arch_emit_setret (MonoCompile *cfg, MonoMethod *method, MonoInst *val)
 {
-       MonoType *ret = mono_type_get_underlying_type (mono_method_signature (method)->ret);
+       MonoType *ret = mini_type_get_underlying_type (NULL, mono_method_signature (method)->ret);
 
        if (!ret->byref) {
                if (ret->type == MONO_TYPE_R4) {
@@ -2312,9 +2004,7 @@ emit_call_body (MonoCompile *cfg, guint8 *code, guint32 patch_type, gconstpointe
                        }
                }
                else {
-                       if (!cfg->new_ir && mono_find_class_init_trampoline_by_addr (data))
-                               near_call = TRUE;
-                       else if (cfg->abs_patches && g_hash_table_lookup (cfg->abs_patches, data)) {
+                       if (cfg->abs_patches && g_hash_table_lookup (cfg->abs_patches, data)) {
                                /* 
                                 * This is not really an optimization, but required because the
                                 * generic class init trampolines use R11 to pass the vtable.
@@ -2352,8 +2042,10 @@ emit_call_body (MonoCompile *cfg, guint8 *code, guint32 patch_type, gconstpointe
                        /* These methods are allocated using malloc */
                        near_call = FALSE;
 
-               if (cfg->compile_aot)
+               if (cfg->compile_aot) {
                        near_call = TRUE;
+                       no_patch = TRUE;
+               }
 
 #ifdef MONO_ARCH_NOMAP32BIT
                near_call = FALSE;
@@ -2592,9 +2284,6 @@ mono_arch_lowering_pass (MonoCompile *cfg, MonoBasicBlock *bb)
 {
        MonoInst *ins, *n, *temp;
 
-       if (bb->max_vreg > cfg->rs->next_vreg)
-               cfg->rs->next_vreg = bb->max_vreg;
-
        /*
         * FIXME: Need to add more instructions, but the current machine 
         * description can't model some parts of the composite instructions like
@@ -2605,20 +2294,21 @@ mono_arch_lowering_pass (MonoCompile *cfg, MonoBasicBlock *bb)
                case OP_DIV_IMM:
                case OP_REM_IMM:
                case OP_IDIV_IMM:
-               case OP_IREM_IMM:
                case OP_IDIV_UN_IMM:
                case OP_IREM_UN_IMM:
                        mono_decompose_op_imm (cfg, bb, ins);
                        break;
+               case OP_IREM_IMM:
+                       /* Keep the opcode if we can implement it efficiently */
+                       if (!((ins->inst_imm > 0) && (mono_is_power_of_two (ins->inst_imm) != -1)))
+                               mono_decompose_op_imm (cfg, bb, ins);
+                       break;
                case OP_COMPARE_IMM:
                case OP_LCOMPARE_IMM:
                        if (!amd64_is_imm32 (ins->inst_imm)) {
                                NEW_INS (cfg, ins, temp, OP_I8CONST);
                                temp->inst_c0 = ins->inst_imm;
-                               if (cfg->globalra)
-                                       temp->dreg = mono_alloc_ireg (cfg);
-                               else
-                                       temp->dreg = mono_regstate_next_int (cfg->rs);
+                               temp->dreg = mono_alloc_ireg (cfg);
                                ins->opcode = OP_COMPARE;
                                ins->sreg2 = temp->dreg;
                        }
@@ -2628,10 +2318,7 @@ mono_arch_lowering_pass (MonoCompile *cfg, MonoBasicBlock *bb)
                        if (!amd64_is_imm32 (ins->inst_offset)) {
                                NEW_INS (cfg, ins, temp, OP_I8CONST);
                                temp->inst_c0 = ins->inst_offset;
-                               if (cfg->globalra)
-                                       temp->dreg = mono_alloc_ireg (cfg);
-                               else
-                                       temp->dreg = mono_regstate_next_int (cfg->rs);
+                               temp->dreg = mono_alloc_ireg (cfg);
                                ins->opcode = OP_AMD64_LOADI8_MEMINDEX;
                                ins->inst_indexreg = temp->dreg;
                        }
@@ -2641,10 +2328,7 @@ mono_arch_lowering_pass (MonoCompile *cfg, MonoBasicBlock *bb)
                        if (!amd64_is_imm32 (ins->inst_imm)) {
                                NEW_INS (cfg, ins, temp, OP_I8CONST);
                                temp->inst_c0 = ins->inst_imm;
-                               if (cfg->globalra)
-                                       temp->dreg = mono_alloc_ireg (cfg);
-                               else
-                                       temp->dreg = mono_regstate_next_int (cfg->rs);
+                               temp->dreg = mono_alloc_ireg (cfg);
                                ins->opcode = OP_STOREI8_MEMBASE_REG;
                                ins->sreg1 = temp->dreg;
                        }
@@ -2654,7 +2338,7 @@ mono_arch_lowering_pass (MonoCompile *cfg, MonoBasicBlock *bb)
                }
        }
 
-       bb->max_vreg = cfg->rs->next_vreg;
+       bb->max_vreg = cfg->next_vreg;
 }
 
 static const int 
@@ -2845,18 +2529,19 @@ emit_move_return_value (MonoCompile *cfg, MonoInst *ins, guint8 *code)
 }
 
 /*
- * emit_tls_get:
+ * mono_amd64_emit_tls_get:
  * @code: buffer to store code to
  * @dreg: hard register where to place the result
  * @tls_offset: offset info
  *
- * emit_tls_get emits in @code the native code that puts in the dreg register
- * the item in the thread local storage identified by tls_offset.
+ * mono_amd64_emit_tls_get emits in @code the native code that puts in
+ * the dreg register the item in the thread local storage identified
+ * by tls_offset.
  *
  * Returns: a pointer to the end of the stored code
  */
-static guint8*
-emit_tls_get (guint8* code, int dreg, int tls_offset)
+guint8*
+mono_amd64_emit_tls_get (guint8* code, int dreg, int tls_offset)
 {
 #ifdef PLATFORM_WIN32
        g_assert (tls_offset < 64);
@@ -2875,93 +2560,6 @@ emit_tls_get (guint8* code, int dreg, int tls_offset)
        return code;
 }
 
-/*
- * emit_load_volatile_arguments:
- *
- *  Load volatile arguments from the stack to the original input registers.
- * Required before a tail call.
- */
-static guint8*
-emit_load_volatile_arguments (MonoCompile *cfg, guint8 *code)
-{
-       MonoMethod *method = cfg->method;
-       MonoMethodSignature *sig;
-       MonoInst *ins;
-       CallInfo *cinfo;
-       guint32 i, quad;
-
-       /* FIXME: Generate intermediate code instead */
-
-       sig = mono_method_signature (method);
-
-       cinfo = cfg->arch.cinfo;
-       
-       /* This is the opposite of the code in emit_prolog */
-       if (sig->ret->type != MONO_TYPE_VOID) {
-               if (cfg->vret_addr && (cfg->vret_addr->opcode != OP_REGVAR))
-                       amd64_mov_reg_membase (code, cinfo->ret.reg, cfg->vret_addr->inst_basereg, cfg->vret_addr->inst_offset, 8);
-       }
-
-       for (i = 0; i < sig->param_count + sig->hasthis; ++i) {
-               ArgInfo *ainfo = cinfo->args + i;
-               MonoType *arg_type;
-               ins = cfg->args [i];
-
-               if (sig->hasthis && (i == 0))
-                       arg_type = &mono_defaults.object_class->byval_arg;
-               else
-                       arg_type = sig->params [i - sig->hasthis];
-
-               if (ins->opcode != OP_REGVAR) {
-                       switch (ainfo->storage) {
-                       case ArgInIReg: {
-                               guint32 size = 8;
-
-                               /* FIXME: I1 etc */
-                               amd64_mov_reg_membase (code, ainfo->reg, ins->inst_basereg, ins->inst_offset, size);
-                               break;
-                       }
-                       case ArgInFloatSSEReg:
-                               amd64_movss_reg_membase (code, ainfo->reg, ins->inst_basereg, ins->inst_offset);
-                               break;
-                       case ArgInDoubleSSEReg:
-                               amd64_movsd_reg_membase (code, ainfo->reg, ins->inst_basereg, ins->inst_offset);
-                               break;
-                       case ArgValuetypeInReg:
-                               for (quad = 0; quad < 2; quad ++) {
-                                       switch (ainfo->pair_storage [quad]) {
-                                       case ArgInIReg:
-                                               amd64_mov_reg_membase (code, ainfo->pair_regs [quad], ins->inst_basereg, ins->inst_offset + (quad * sizeof (gpointer)), sizeof (gpointer));
-                                               break;
-                                       case ArgInFloatSSEReg:
-                                       case ArgInDoubleSSEReg:
-                                               g_assert_not_reached ();
-                                               break;
-                                       case ArgNone:
-                                               break;
-                                       default:
-                                               g_assert_not_reached ();
-                                       }
-                               }
-                               break;
-                       case ArgValuetypeAddrInIReg:
-                               if (ainfo->pair_storage [0] == ArgInIReg)
-                                       amd64_mov_reg_membase (code, ainfo->pair_regs [0], ins->inst_left->inst_basereg, ins->inst_left->inst_offset,  sizeof (gpointer));
-                               break;
-                       default:
-                               break;
-                       }
-               }
-               else {
-                       g_assert (ainfo->storage == ArgInIReg);
-
-                       amd64_mov_reg_reg (code, ainfo->reg, ins->dreg, 8);
-               }
-       }
-
-       return code;
-}
-
 #define REAL_PRINT_REG(text,reg) \
 mono_assert (reg >= 0); \
 amd64_push_reg (code, AMD64_RAX); \
@@ -2981,6 +2579,8 @@ amd64_pop_reg (code, AMD64_RAX);
 #define LOOP_ALIGNMENT 8
 #define bb_is_loop_start(bb) ((bb)->loop_body_start && (bb)->nesting)
 
+#ifndef DISABLE_JIT
+
 void
 mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
 {
@@ -3024,6 +2624,9 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
 
        mono_debug_open_block (cfg, bb, offset);
 
+    if (mono_break_at_bb_method && mono_method_desc_full_match (mono_break_at_bb_method, cfg->method) && bb->block_num == mono_break_at_bb_bb_num)
+               x86_breakpoint (code);
+
        MONO_BB_FOR_EACH_INS (bb, ins) {
                offset = code - cfg->native_code;
 
@@ -3092,15 +2695,10 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        break;
                case OP_LOADU4_MEM:
                        // FIXME: Decompose this earlier
-                       if (cfg->new_ir) {
-                               if (amd64_is_imm32 (ins->inst_imm))
-                                       amd64_mov_reg_mem (code, ins->dreg, ins->inst_imm, 4);
-                               else {
-                                       amd64_mov_reg_imm (code, ins->dreg, ins->inst_imm);
-                                       amd64_mov_reg_membase (code, ins->dreg, ins->dreg, 0, 4);
-                               }
-                       } else {
-                               amd64_mov_reg_imm (code, ins->dreg, ins->inst_p0);
+                       if (amd64_is_imm32 (ins->inst_imm))
+                               amd64_mov_reg_mem (code, ins->dreg, ins->inst_imm, 4);
+                       else {
+                               amd64_mov_reg_imm (code, ins->dreg, ins->inst_imm);
                                amd64_mov_reg_membase (code, ins->dreg, ins->dreg, 0, 4);
                        }
                        break;
@@ -3330,6 +2928,13 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                case OP_BREAK:
                        amd64_breakpoint (code);
                        break;
+               case OP_RELAXED_NOP:
+                       x86_prefix (code, X86_REP_PREFIX);
+                       x86_nop (code);
+                       break;
+               case OP_HARD_NOP:
+                       x86_nop (code);
+                       break;
                case OP_NOP:
                case OP_DUMMY_USE:
                case OP_DUMMY_STORE:
@@ -3487,6 +3092,32 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                                amd64_div_reg_size (code, ins->sreg2, FALSE, 4);
                        }
                        break;
+               case OP_IREM_IMM: {
+                       int power = mono_is_power_of_two (ins->inst_imm);
+
+                       g_assert (ins->sreg1 == X86_EAX);
+                       g_assert (ins->dreg == X86_EAX);
+                       g_assert (power >= 0);
+
+                       if (power == 0) {
+                               amd64_mov_reg_imm (code, ins->dreg, 0);
+                               break;
+                       }
+
+                       /* Based on gcc code */
+
+                       /* Add compensation for negative dividents */
+                       amd64_mov_reg_reg_size (code, AMD64_RDX, AMD64_RAX, 4);
+                       if (power > 1)
+                               amd64_shift_reg_imm_size (code, X86_SAR, AMD64_RDX, 31, 4);
+                       amd64_shift_reg_imm_size (code, X86_SHR, AMD64_RDX, 32 - power, 4);
+                       amd64_alu_reg_reg_size (code, X86_ADD, AMD64_RAX, AMD64_RDX, 4);
+                       /* Compute remainder */
+                       amd64_alu_reg_imm_size (code, X86_AND, AMD64_RAX, (1 << power) - 1, 4);
+                       /* Remove compensation */
+                       amd64_alu_reg_reg_size (code, X86_SUB, AMD64_RAX, AMD64_RDX, 4);
+                       break;
+               }
                case OP_LMUL_OVF:
                        amd64_imul_reg_reg (code, ins->sreg1, ins->sreg2);
                        EMIT_COND_SYSTEM_EXCEPTION (X86_CC_O, FALSE, "OverflowException");
@@ -3750,7 +3381,6 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                                amd64_sse_movsd_reg_reg (code, ins->dreg, ins->sreg1);
                        break;
                }
-               case OP_JMP:
                case OP_TAILCALL: {
                        /*
                         * Note: this 'frame destruction' logic is useful for tail calls, too.
@@ -3764,9 +3394,6 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
 
                        g_assert (!cfg->method->save_lmf);
 
-                       if (ins->opcode == OP_JMP)
-                               code = emit_load_volatile_arguments (cfg, code);
-
                        if (cfg->arch.omit_fp) {
                                guint32 save_offset = 0;
                                /* Pop callee-saved registers */
@@ -3914,30 +3541,12 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                                ins->sreg1 = AMD64_RAX;
                        }
 
-                       if (call->method && ins->inst_offset < 0) {
-                               gssize val;
-
-                               /* 
-                                * This is a possible IMT call so save the IMT method in the proper
-                                * register. We don't use the generic code in method-to-ir.c, because
-                                * we need to disassemble this in get_vcall_slot_addr (), so we have to
-                                * maintain control over the layout of the code.
-                                * Also put the base reg in %rax to simplify find_imt_method ().
-                                */
-                               if (ins->sreg1 != AMD64_RAX) {
-                                       amd64_mov_reg_reg (code, AMD64_RAX, ins->sreg1, 8);
-                                       ins->sreg1 = AMD64_RAX;
-                               }
-                               val = (gssize)(gpointer)call->method;
-
-                               // FIXME: Generics sharing
-#if 0
-                               if ((((guint64)val) >> 32) == 0)
-                                       amd64_mov_reg_imm_size (code, MONO_ARCH_IMT_REG, val, 4);
-                               else
-                                       amd64_mov_reg_imm_size (code, MONO_ARCH_IMT_REG, val, 8);
-#endif
-                       }
+                       /* 
+                        * Emit a few nops to simplify get_vcall_slot ().
+                        */
+                       amd64_nop (code);
+                       amd64_nop (code);
+                       amd64_nop (code);
 
                        amd64_call_membase (code, ins->sreg1, ins->inst_offset);
                        if (call->stack_usage && !CALLCONV_IS_STDCALL (call->signature->call_convention))
@@ -3947,7 +3556,6 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                case OP_AMD64_SAVE_SP_TO_LMF:
                        amd64_mov_membase_reg (code, cfg->frame_reg, cfg->arch.lmf_offset + G_STRUCT_OFFSET (MonoLMF, rsp), AMD64_RSP, 8);
                        break;
-               case OP_OUTARG:
                case OP_X86_PUSH:
                        amd64_push_reg (code, ins->sreg1);
                        break;
@@ -3958,8 +3566,9 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                case OP_X86_PUSH_MEMBASE:
                        amd64_push_membase (code, ins->inst_basereg, ins->inst_offset);
                        break;
-               case OP_X86_PUSH_OBJ: 
-                       amd64_alu_reg_imm (code, X86_SUB, AMD64_RSP, ins->inst_imm);
+               case OP_X86_PUSH_OBJ: {
+                       int size = ALIGN_TO (ins->inst_imm, 8);
+                       amd64_alu_reg_imm (code, X86_SUB, AMD64_RSP, size);
                        amd64_push_reg (code, AMD64_RDI);
                        amd64_push_reg (code, AMD64_RSI);
                        amd64_push_reg (code, AMD64_RCX);
@@ -3967,8 +3576,8 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                                amd64_lea_membase (code, AMD64_RSI, ins->inst_basereg, ins->inst_offset);
                        else
                                amd64_mov_reg_reg (code, AMD64_RSI, ins->inst_basereg, 8);
-                       amd64_lea_membase (code, AMD64_RDI, AMD64_RSP, 3 * 8);
-                       amd64_mov_reg_imm (code, AMD64_RCX, (ins->inst_imm >> 3));
+                       amd64_lea_membase (code, AMD64_RDI, AMD64_RSP, (3 * 8));
+                       amd64_mov_reg_imm (code, AMD64_RCX, (size >> 3));
                        amd64_cld (code);
                        amd64_prefix (code, X86_REP_PREFIX);
                        amd64_movsd (code);
@@ -3976,6 +3585,7 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        amd64_pop_reg (code, AMD64_RSI);
                        amd64_pop_reg (code, AMD64_RDI);
                        break;
+               }
                case OP_X86_LEA:
                        amd64_lea_memindex (code, ins->dreg, ins->sreg1, ins->inst_imm, ins->sreg2, ins->backend.shift_amount);
                        break;
@@ -4552,7 +4162,7 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        amd64_alu_reg_imm (code, X86_ADD, AMD64_RSP, 16);
                        break;
                case OP_TLS_GET: {
-                       code = emit_tls_get (code, ins->dreg, ins->inst_offset);
+                       code = mono_amd64_emit_tls_get (code, ins->dreg, ins->inst_offset);
                        break;
                }
                case OP_MEMORY_BARRIER: {
@@ -4598,8 +4208,7 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        break;
                }
                case OP_ATOMIC_EXCHANGE_I4:
-               case OP_ATOMIC_EXCHANGE_I8:
-               case OP_ATOMIC_CAS_IMM_I4: {
+               case OP_ATOMIC_EXCHANGE_I8: {
                        guchar *br[2];
                        int sreg2 = ins->sreg2;
                        int breg = ins->inst_basereg;
@@ -4646,28 +4255,54 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                                }
                        }
 
-                       if (ins->opcode == OP_ATOMIC_CAS_IMM_I4) {
-                               if (ins->backend.data == NULL)
-                                       amd64_alu_reg_reg (code, X86_XOR, AMD64_RAX, AMD64_RAX);
-                               else
-                                       amd64_mov_reg_imm (code, AMD64_RAX, ins->backend.data);
-
-                               amd64_prefix (code, X86_LOCK_PREFIX);
-                               amd64_cmpxchg_membase_reg_size (code, breg, ins->inst_offset, sreg2, size);
-                       } else {
-                               amd64_mov_reg_membase (code, AMD64_RAX, breg, ins->inst_offset, size);
+                       amd64_mov_reg_membase (code, AMD64_RAX, breg, ins->inst_offset, size);
 
-                               br [0] = code; amd64_prefix (code, X86_LOCK_PREFIX);
-                               amd64_cmpxchg_membase_reg_size (code, breg, ins->inst_offset, sreg2, size);
-                               br [1] = code; amd64_branch8 (code, X86_CC_NE, -1, FALSE);
-                               amd64_patch (br [1], br [0]);
-                       }
+                       br [0] = code; amd64_prefix (code, X86_LOCK_PREFIX);
+                       amd64_cmpxchg_membase_reg_size (code, breg, ins->inst_offset, sreg2, size);
+                       br [1] = code; amd64_branch8 (code, X86_CC_NE, -1, FALSE);
+                       amd64_patch (br [1], br [0]);
 
                        if (rdx_pushed)
                                amd64_pop_reg (code, AMD64_RDX);
 
                        break;
                }
+               case OP_ATOMIC_CAS_I4:
+               case OP_ATOMIC_CAS_I8: {
+                       guint32 size;
+
+                       if (ins->opcode == OP_ATOMIC_CAS_I8)
+                               size = 8;
+                       else
+                               size = 4;
+
+                       /* 
+                        * See http://msdn.microsoft.com/en-us/magazine/cc302329.aspx for
+                        * an explanation of how this works.
+                        */
+                       g_assert (ins->sreg3 == AMD64_RAX);
+                       g_assert (ins->sreg1 != AMD64_RAX);
+                       g_assert (ins->sreg1 != ins->sreg2);
+
+                       amd64_prefix (code, X86_LOCK_PREFIX);
+                       amd64_cmpxchg_membase_reg_size (code, ins->sreg1, ins->inst_offset, ins->sreg2, size);
+
+                       if (ins->dreg != AMD64_RAX)
+                               amd64_mov_reg_reg (code, ins->dreg, AMD64_RAX, size);
+                       break;
+               }
+               case OP_LIVERANGE_START: {
+                       if (cfg->verbose_level > 1)
+                               printf ("R%d START=0x%x\n", MONO_VARINFO (cfg, ins->inst_c0)->vreg, (int)(code - cfg->native_code));
+                       MONO_VARINFO (cfg, ins->inst_c0)->live_range_start = code - cfg->native_code;
+                       break;
+               }
+               case OP_LIVERANGE_END: {
+                       if (cfg->verbose_level > 1)
+                               printf ("R%d END=0x%x\n", MONO_VARINFO (cfg, ins->inst_c0)->vreg, (int)(code - cfg->native_code));
+                       MONO_VARINFO (cfg, ins->inst_c0)->live_range_end = code - cfg->native_code;
+                       break;
+               }
                default:
                        g_warning ("unknown opcode %s in %s()\n", mono_inst_name (ins->opcode), __FUNCTION__);
                        g_assert_not_reached ();
@@ -4688,6 +4323,8 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
        cfg->code_len = code - cfg->native_code;
 }
 
+#endif /* DISABLE_JIT */
+
 void
 mono_arch_register_lowlevel_calls (void)
 {
@@ -4798,7 +4435,7 @@ mono_arch_emit_prolog (MonoCompile *cfg)
        MonoBasicBlock *bb;
        MonoMethodSignature *sig;
        MonoInst *ins;
-       int alloc_size, pos, max_offset, i, quad, max_epilog_size;
+       int alloc_size, pos, max_offset, i, cfa_offset, quad, max_epilog_size;
        guint8 *code;
        CallInfo *cinfo;
        gint32 lmf_offset = cfg->arch.lmf_offset;
@@ -4815,6 +4452,9 @@ mono_arch_emit_prolog (MonoCompile *cfg)
        /* Amount of stack space allocated by register saving code */
        pos = 0;
 
+       /* Offset between RSP and the CFA */
+       cfa_offset = 0;
+
        /* 
         * The prolog consists of the following parts:
         * FP present:
@@ -4830,16 +4470,25 @@ mono_arch_emit_prolog (MonoCompile *cfg)
         * - save callee saved regs using moves
         */
 
+       // CFA = sp + 8
+       cfa_offset = 8;
+       mono_emit_unwind_op_def_cfa (cfg, code, AMD64_RSP, 8);
+       // IP saved at CFA - 8
+       mono_emit_unwind_op_offset (cfg, code, AMD64_RIP, -cfa_offset);
        async_exc_point (code);
 
        if (!cfg->arch.omit_fp) {
                amd64_push_reg (code, AMD64_RBP);
+               cfa_offset += 8;
+               mono_emit_unwind_op_def_cfa_offset (cfg, code, cfa_offset);
+               mono_emit_unwind_op_offset (cfg, code, AMD64_RBP, - cfa_offset);
                async_exc_point (code);
 #ifdef PLATFORM_WIN32
                mono_arch_unwindinfo_add_push_nonvol (&cfg->arch.unwindinfo, cfg->native_code, code, AMD64_RBP);
 #endif
                
                amd64_mov_reg_reg (code, AMD64_RBP, AMD64_RSP, sizeof (gpointer));
+               mono_emit_unwind_op_def_cfa_reg (cfg, code, AMD64_RBP);
                async_exc_point (code);
 #ifdef PLATFORM_WIN32
                mono_arch_unwindinfo_add_set_fpreg (&cfg->arch.unwindinfo, cfg->native_code, code, AMD64_RBP);
@@ -4848,10 +4497,14 @@ mono_arch_emit_prolog (MonoCompile *cfg)
 
        /* Save callee saved registers */
        if (!cfg->arch.omit_fp && !method->save_lmf) {
+               int offset = cfa_offset;
+
                for (i = 0; i < AMD64_NREG; ++i)
                        if (AMD64_IS_CALLEE_SAVED_REG (i) && (cfg->used_int_regs & (1 << i))) {
                                amd64_push_reg (code, i);
                                pos += sizeof (gpointer);
+                               offset += 8;
+                               mono_emit_unwind_op_offset (cfg, code, i, - offset);
                                async_exc_point (code);
                        }
        }
@@ -4880,6 +4533,10 @@ mono_arch_emit_prolog (MonoCompile *cfg)
                guint32 remaining_size = alloc_size;
                while (remaining_size >= 0x1000) {
                        amd64_alu_reg_imm (code, X86_SUB, AMD64_RSP, 0x1000);
+                       if (cfg->arch.omit_fp) {
+                               cfa_offset += 0x1000;
+                               mono_emit_unwind_op_def_cfa_offset (cfg, code, cfa_offset);
+                       }
                        async_exc_point (code);
 #ifdef PLATFORM_WIN32
                        if (cfg->arch.omit_fp) 
@@ -4891,7 +4548,11 @@ mono_arch_emit_prolog (MonoCompile *cfg)
                }
                if (remaining_size) {
                        amd64_alu_reg_imm (code, X86_SUB, AMD64_RSP, remaining_size);
-                       async_exc_point (code);
+                       if (cfg->arch.omit_fp) {
+                               cfa_offset += remaining_size;
+                               mono_emit_unwind_op_def_cfa_offset (cfg, code, cfa_offset);
+                               async_exc_point (code);
+                       }
 #ifdef PLATFORM_WIN32
                        if (cfg->arch.omit_fp) 
                                mono_arch_unwindinfo_add_alloc_stack (&cfg->arch.unwindinfo, cfg->native_code, code, remaining_size);
@@ -4899,7 +4560,11 @@ mono_arch_emit_prolog (MonoCompile *cfg)
                }
 #else
                amd64_alu_reg_imm (code, X86_SUB, AMD64_RSP, alloc_size);
-               async_exc_point (code);
+               if (cfg->arch.omit_fp) {
+                       cfa_offset += alloc_size;
+                       mono_emit_unwind_op_def_cfa_offset (cfg, code, cfa_offset);
+                       async_exc_point (code);
+               }
 #endif
        }
 
@@ -4922,12 +4587,31 @@ mono_arch_emit_prolog (MonoCompile *cfg)
                /* sp is saved right before calls */
                /* Skip method (only needed for trampoline LMF frames) */
                /* Save callee saved regs */
-               amd64_mov_membase_reg (code, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, rbx), AMD64_RBX, 8);
-               amd64_mov_membase_reg (code, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, rbp), AMD64_RBP, 8);
-               amd64_mov_membase_reg (code, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, r12), AMD64_R12, 8);
-               amd64_mov_membase_reg (code, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, r13), AMD64_R13, 8);
-               amd64_mov_membase_reg (code, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, r14), AMD64_R14, 8);
-               amd64_mov_membase_reg (code, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, r15), AMD64_R15, 8);
+               for (i = 0; i < MONO_MAX_IREGS; ++i) {
+                       int offset;
+
+                       switch (i) {
+                       case AMD64_RBX: offset = G_STRUCT_OFFSET (MonoLMF, rbx); break;
+                       case AMD64_RBP: offset = G_STRUCT_OFFSET (MonoLMF, rbp); break;
+                       case AMD64_R12: offset = G_STRUCT_OFFSET (MonoLMF, r12); break;
+                       case AMD64_R13: offset = G_STRUCT_OFFSET (MonoLMF, r13); break;
+                       case AMD64_R14: offset = G_STRUCT_OFFSET (MonoLMF, r14); break;
+                       case AMD64_R15: offset = G_STRUCT_OFFSET (MonoLMF, r15); break;
+#ifdef PLATFORM_WIN32
+                       case AMD64_RDI: offset = G_STRUCT_OFFSET (MonoLMF, rdi); break;
+                       case AMD64_RSI: offset = G_STRUCT_OFFSET (MonoLMF, rsi); break;
+#endif
+                       default:
+                               offset = -1;
+                               break;
+                       }
+
+                       if (offset != -1) {
+                               amd64_mov_membase_reg (code, cfg->frame_reg, lmf_offset + offset, i, 8);
+                               if (cfg->arch.omit_fp || (i != AMD64_RBP))
+                                       mono_emit_unwind_op_offset (cfg, code, i, - (cfa_offset - (lmf_offset + offset)));
+                       }
+               }
        }
 
        /* Save callee saved registers */
@@ -4940,6 +4624,7 @@ mono_arch_emit_prolog (MonoCompile *cfg)
                for (i = 0; i < AMD64_NREG; ++i)
                        if (AMD64_IS_CALLEE_SAVED_REG (i) && (cfg->used_int_regs & (1 << i))) {
                                amd64_mov_membase_reg (code, AMD64_RSP, save_area_offset, i, 8);
+                               mono_emit_unwind_op_offset (cfg, code, i, - (cfa_offset - save_area_offset));
                                save_area_offset += 8;
                                async_exc_point (code);
                        }
@@ -5122,7 +4807,7 @@ mono_arch_emit_prolog (MonoCompile *cfg)
                if (appdomain_tls_offset != -1 && lmf_tls_offset != -1) {
                        guint8 *buf, *no_domain_branch;
 
-                       code = emit_tls_get (code, AMD64_RAX, appdomain_tls_offset);
+                       code = mono_amd64_emit_tls_get (code, AMD64_RAX, appdomain_tls_offset);
                        if ((domain >> 32) == 0)
                                amd64_mov_reg_imm_size (code, AMD64_ARG_REG1, domain, 4);
                        else
@@ -5130,7 +4815,7 @@ mono_arch_emit_prolog (MonoCompile *cfg)
                        amd64_alu_reg_reg (code, X86_CMP, AMD64_RAX, AMD64_ARG_REG1);
                        no_domain_branch = code;
                        x86_branch8 (code, X86_CC_NE, 0, 0);
-                       code = emit_tls_get ( code, AMD64_RAX, lmf_addr_tls_offset);
+                       code = mono_amd64_emit_tls_get ( code, AMD64_RAX, lmf_addr_tls_offset);
                        amd64_test_reg_reg (code, AMD64_RAX, AMD64_RAX);
                        buf = code;
                        x86_branch8 (code, X86_CC_NE, 0, 0);
@@ -5178,7 +4863,7 @@ mono_arch_emit_prolog (MonoCompile *cfg)
                } else {
                        if (lmf_addr_tls_offset != -1) {
                                /* Load lmf quicky using the FS register */
-                               code = emit_tls_get (code, AMD64_RAX, lmf_addr_tls_offset);
+                               code = mono_amd64_emit_tls_get (code, AMD64_RAX, lmf_addr_tls_offset);
 #ifdef PLATFORM_WIN32
                                /* The TLS key actually contains a pointer to the MonoJitTlsData structure */
                                /* FIXME: Add a separate key for LMF to avoid this */
@@ -5315,6 +5000,23 @@ mono_arch_emit_epilog (MonoCompile *cfg)
        pos = 0;
        
        if (method->save_lmf) {
+               /* check if we need to restore protection of the stack after a stack overflow */
+               if (mono_get_jit_tls_offset () != -1) {
+                       guint8 *patch;
+                       code = mono_amd64_emit_tls_get (code, X86_ECX, mono_get_jit_tls_offset ());
+                       /* we load the value in a separate instruction: this mechanism may be
+                        * used later as a safer way to do thread interruption
+                        */
+                       amd64_mov_reg_membase (code, X86_ECX, X86_ECX, G_STRUCT_OFFSET (MonoJitTlsData, restore_stack_prot), 8);
+                       x86_alu_reg_imm (code, X86_CMP, X86_ECX, 0);
+                       patch = code;
+                       x86_branch8 (code, X86_CC_Z, 0, FALSE);
+                       /* note that the call trampoline will preserve eax/edx */
+                       x86_call_reg (code, X86_ECX);
+                       x86_patch (patch, code);
+               } else {
+                       /* FIXME: maybe save the jit tls in the prolog */
+               }
                if ((lmf_tls_offset != -1) && !optimize_for_xen) {
                        /*
                         * Optimized version which uses the mono_lmf TLS variable instead of indirection
@@ -5350,6 +5052,14 @@ mono_arch_emit_epilog (MonoCompile *cfg)
                if (cfg->used_int_regs & (1 << AMD64_R15)) {
                        amd64_mov_reg_membase (code, AMD64_R15, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, r15), 8);
                }
+#ifdef PLATFORM_WIN32
+               if (cfg->used_int_regs & (1 << AMD64_RDI)) {
+                       amd64_mov_reg_membase (code, AMD64_RDI, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, rdi), 8);
+               }
+               if (cfg->used_int_regs & (1 << AMD64_RSI)) {
+                       amd64_mov_reg_membase (code, AMD64_RSI, cfg->frame_reg, lmf_offset + G_STRUCT_OFFSET (MonoLMF, rsi), 8);
+               }
+#endif
        } else {
 
                if (cfg->arch.omit_fp) {
@@ -5424,15 +5134,6 @@ mono_arch_emit_epilog (MonoCompile *cfg)
        cfg->code_len = code - cfg->native_code;
 
        g_assert (cfg->code_len < cfg->code_size);
-
-       if (cfg->arch.omit_fp) {
-               /* 
-                * Encode the stack size into used_int_regs so the exception handler
-                * can access it.
-                */
-               g_assert (cfg->arch.stack_alloc_size < (1 << 16));
-               cfg->used_int_regs |= (1 << 31) | (cfg->arch.stack_alloc_size << 16);
-       }
 }
 
 void
@@ -5633,7 +5334,7 @@ mono_arch_instrument_epilog (MonoCompile *cfg, void *func, void *p, gboolean ena
        guchar *code = p;
        int save_mode = SAVE_NONE;
        MonoMethod *method = cfg->method;
-       int rtype = mono_type_get_underlying_type (mono_method_signature (method)->ret)->type;
+       int rtype = mini_type_get_underlying_type (NULL, mono_method_signature (method)->ret)->type;
        
        switch (rtype) {
        case MONO_TYPE_VOID:
@@ -5883,56 +5584,30 @@ mono_arch_get_vcall_slot (guint8 *code, gpointer *regs, int *displacement)
 
        *displacement = 0;
 
-       /* go to the start of the call instruction
-        *
-        * address_byte = (m << 6) | (o << 3) | reg
-        * call opcode: 0xff address_byte displacement
-        * 0xff m=1,o=2 imm8
-        * 0xff m=2,o=2 imm32
-        */
        code -= 7;
 
        /* 
         * A given byte sequence can match more than case here, so we have to be
         * really careful about the ordering of the cases. Longer sequences
         * come first.
+        * There are two types of calls:
+        * - direct calls: 0xff address_byte 8/32 bits displacement
+        * - indirect calls: nop nop nop <call>
+        * The nops make sure we don't confuse the instruction preceding an indirect
+        * call with a direct call.
         */
-#ifdef MONO_ARCH_HAVE_IMT
-       if ((code [-2] == 0x41) && (code [-1] == 0xbb) && (code [4] == 0xff) && (x86_modrm_mod (code [5]) == 1) && (x86_modrm_reg (code [5]) == 2) && ((signed char)code [6] < 0)) {
-               /* IMT-based interface calls: with MONO_ARCH_IMT_REG == r11
-                * 41 bb 14 f8 28 08       mov    $0x828f814,%r11d
-                * ff 50 fc                call   *0xfffffffc(%rax)
-                */
-               reg = amd64_modrm_rm (code [5]);
-               disp = (signed char)code [6];
-               /* R10 is clobbered by the IMT thunk code */
-               g_assert (reg != AMD64_R10);
-       }
-#else
-       if (0) {
-       }
-#endif
-       else if ((code [-1] == 0x8b) && (amd64_modrm_mod (code [0]) == 0x2) && (code [5] == 0xff) && (amd64_modrm_reg (code [6]) == 0x2) && (amd64_modrm_mod (code [6]) == 0x0)) {
-                       /*
-                        * This is a interface call
-                        * 48 8b 80 f0 e8 ff ff   mov    0xffffffffffffe8f0(%rax),%rax
-                        * ff 10                  callq  *(%rax)
-                        */
-               if (IS_REX (code [4]))
-                       rex = code [4];
-               reg = amd64_modrm_rm (code [6]);
-               disp = 0;
-               /* R10 is clobbered by the IMT thunk code */
-               g_assert (reg != AMD64_R10);
-       } else if ((code [0] == 0x41) && (code [1] == 0xff) && (code [2] == 0x15)) {
+       if ((code [0] == 0x41) && (code [1] == 0xff) && (code [2] == 0x15)) {
                /* call OFFSET(%rip) */
                disp = *(guint32*)(code + 3);
                return (gpointer*)(code + disp + 7);
-       } else if ((code [0] == 0xff) && (amd64_modrm_reg (code [1]) == 0x2) && (amd64_modrm_mod (code [1]) == 0x2) && (amd64_modrm_reg (code [2]) == X86_ESP) && (amd64_modrm_mod (code [2]) == 0) && (amd64_modrm_rm (code [2]) == X86_ESP)) {
-               /* call *[r12+disp32] */
-               if (IS_REX (code [-1]))
+       } else if ((code [0] == 0xff) && (amd64_modrm_reg (code [1]) == 0x2) && (amd64_modrm_mod (code [1]) == 0x2) && (amd64_sib_index (code [2]) == 4) && (amd64_sib_scale (code [2]) == 0)) {
+               /* call *[reg+disp32] using indexed addressing */
+               /* The LLVM JIT emits this, and we emit it too for %r12 */
+               if (IS_REX (code [-1])) {
                        rex = code [-1];
-               reg = AMD64_RSP;
+                       g_assert (amd64_rex_x (rex) == 0);
+               }
+               reg = amd64_sib_base (code [2]);
                disp = *(gint32*)(code + 3);
        } else if ((code [1] == 0xff) && (amd64_modrm_reg (code [2]) == 0x2) && (amd64_modrm_mod (code [2]) == 0x2)) {
                /* call *[reg+disp32] */
@@ -5945,11 +5620,11 @@ mono_arch_get_vcall_slot (guint8 *code, gpointer *regs, int *displacement)
        } else if (code [2] == 0xe8) {
                /* call <ADDR> */
                return NULL;
-       } else if ((code [3] == 0xff) && (amd64_modrm_reg (code [4]) == 0x2) && (amd64_modrm_mod (code [4]) == 0x1) && (amd64_modrm_reg (code [5]) == X86_ESP) && (amd64_modrm_mod (code [5]) == 0) && (amd64_modrm_rm (code [5]) == X86_ESP)) {
-               /* call *[r12+disp32] */
+       } else if ((code [3] == 0xff) && (amd64_modrm_reg (code [4]) == 0x2) && (amd64_modrm_mod (code [4]) == 0x1) && (amd64_sib_index (code [5]) == 4) && (amd64_sib_scale (code [5]) == 0)) {
+               /* call *[r12+disp8] using indexed addressing */
                if (IS_REX (code [2]))
                        rex = code [2];
-               reg = AMD64_RSP;
+               reg = amd64_sib_base (code [5]);
                disp = *(gint8*)(code + 6);
        } else if (IS_REX (code [4]) && (code [5] == 0xff) && (amd64_modrm_reg (code [6]) == 0x2) && (amd64_modrm_mod (code [6]) == 0x3)) {
                /* call *%reg */
@@ -5963,11 +5638,7 @@ mono_arch_get_vcall_slot (guint8 *code, gpointer *regs, int *displacement)
                //printf ("B: [%%r%d+0x%x]\n", reg, disp);
        }
        else if ((code [5] == 0xff) && (amd64_modrm_reg (code [6]) == 0x2) && (amd64_modrm_mod (code [6]) == 0x0)) {
-                       /*
-                        * This is a interface call: should check the above code can't catch it earlier 
-                        * 8b 40 30   mov    0x30(%eax),%eax
-                        * ff 10      call   *(%eax)
-                        */
+               /* call *%reg */
                if (IS_REX (code [4]))
                        rex = code [4];
                reg = amd64_modrm_rm (code [6]);
@@ -6077,8 +5748,16 @@ mono_arch_get_delegate_invoke_impl (MonoMethodSignature *sig, gboolean has_targe
                } else {
                        /* We have to shift the arguments left */
                        amd64_mov_reg_reg (code, AMD64_RAX, AMD64_ARG_REG1, 8);
-                       for (i = 0; i < sig->param_count; ++i)
+                       for (i = 0; i < sig->param_count; ++i) {
+#ifdef PLATFORM_WIN32
+                               if (i < 3)
+                                       amd64_mov_reg_reg (code, param_regs [i], param_regs [i + 1], 8);
+                               else
+                                       amd64_mov_reg_membase (code, param_regs [i], AMD64_RSP, 0x28, 8);
+#else
                                amd64_mov_reg_reg (code, param_regs [i], param_regs [i + 1], 8);
+#endif
+                       }
 
                        amd64_jump_membase (code, AMD64_RAX, G_STRUCT_OFFSET (MonoDelegate, method_ptr));
                }
@@ -6140,50 +5819,6 @@ mono_arch_free_jit_tls_data (MonoJitTlsData *tls)
 {
 }
 
-void
-mono_arch_emit_this_vret_args (MonoCompile *cfg, MonoCallInst *inst, int this_reg, int this_type, int vt_reg)
-{
-       MonoCallInst *call = (MonoCallInst*)inst;
-       CallInfo * cinfo = get_call_info (cfg->generic_sharing_context, cfg->mempool, inst->signature, FALSE);
-
-       if (vt_reg != -1) {
-               MonoInst *vtarg;
-
-               if (cinfo->ret.storage == ArgValuetypeInReg) {
-                       /*
-                        * The valuetype is in RAX:RDX after the call, need to be copied to
-                        * the stack. Save the address here, so the call instruction can
-                        * access it.
-                        */
-                       MonoInst *loc = cfg->arch.vret_addr_loc;
-
-                       g_assert (loc);
-                       g_assert (loc->opcode == OP_REGOFFSET);
-
-                       MONO_EMIT_NEW_STORE_MEMBASE (cfg, OP_STORE_MEMBASE_REG, loc->inst_basereg, loc->inst_offset, vt_reg);
-               } else {
-                       MONO_INST_NEW (cfg, vtarg, OP_MOVE);
-                       vtarg->sreg1 = vt_reg;
-                       vtarg->dreg = mono_regstate_next_int (cfg->rs);
-                       mono_bblock_add_inst (cfg->cbb, vtarg);
-
-                       mono_call_inst_add_outarg_reg (cfg, call, vtarg->dreg, cinfo->ret.reg, FALSE);
-               }
-       }
-
-       /* add the this argument */
-       if (this_reg != -1) {
-               MonoInst *this;
-               MONO_INST_NEW (cfg, this, OP_MOVE);
-               this->type = this_type;
-               this->sreg1 = this_reg;
-               this->dreg = mono_regstate_next_int (cfg->rs);
-               mono_bblock_add_inst (cfg->cbb, this);
-
-               mono_call_inst_add_outarg_reg (cfg, call, this->dreg, cinfo->args [0].reg, FALSE);
-       }
-}
-
 #ifdef MONO_ARCH_HAVE_IMT
 
 #define CMP_SIZE (6 + 1)
@@ -6207,7 +5842,8 @@ imt_branch_distance (MonoIMTCheckItem **imt_entries, int start, int target)
  * LOCKING: called with the domain lock held
  */
 gpointer
-mono_arch_build_imt_thunk (MonoVTable *vtable, MonoDomain *domain, MonoIMTCheckItem **imt_entries, int count)
+mono_arch_build_imt_thunk (MonoVTable *vtable, MonoDomain *domain, MonoIMTCheckItem **imt_entries, int count,
+       gpointer fail_tramp)
 {
        int i;
        int size = 0;
@@ -6219,28 +5855,37 @@ mono_arch_build_imt_thunk (MonoVTable *vtable, MonoDomain *domain, MonoIMTCheckI
                if (item->is_equals) {
                        if (item->check_target_idx) {
                                if (!item->compare_done) {
-                                       if (amd64_is_imm32 (item->method))
+                                       if (amd64_is_imm32 (item->key))
                                                item->chunk_size += CMP_SIZE;
                                        else
                                                item->chunk_size += MOV_REG_IMM_SIZE + CMP_REG_REG_SIZE;
                                }
-                               if (vtable_is_32bit)
-                                       item->chunk_size += MOV_REG_IMM_32BIT_SIZE;
-                               else
+                               if (item->has_target_code) {
                                        item->chunk_size += MOV_REG_IMM_SIZE;
+                               } else {
+                                       if (vtable_is_32bit)
+                                               item->chunk_size += MOV_REG_IMM_32BIT_SIZE;
+                                       else
+                                               item->chunk_size += MOV_REG_IMM_SIZE;
+                               }
                                item->chunk_size += BR_SMALL_SIZE + JUMP_REG_SIZE;
                        } else {
-                               if (vtable_is_32bit)
-                                       item->chunk_size += MOV_REG_IMM_32BIT_SIZE;
-                               else
-                                       item->chunk_size += MOV_REG_IMM_SIZE;
-                               item->chunk_size += JUMP_REG_SIZE;
-                               /* with assert below:
-                                * item->chunk_size += CMP_SIZE + BR_SMALL_SIZE + 1;
-                                */
+                               if (fail_tramp) {
+                                       item->chunk_size += MOV_REG_IMM_SIZE * 3 + CMP_REG_REG_SIZE +
+                                               BR_SMALL_SIZE + JUMP_REG_SIZE * 2;
+                               } else {
+                                       if (vtable_is_32bit)
+                                               item->chunk_size += MOV_REG_IMM_32BIT_SIZE;
+                                       else
+                                               item->chunk_size += MOV_REG_IMM_SIZE;
+                                       item->chunk_size += JUMP_REG_SIZE;
+                                       /* with assert below:
+                                        * item->chunk_size += CMP_SIZE + BR_SMALL_SIZE + 1;
+                                        */
+                               }
                        }
                } else {
-                       if (amd64_is_imm32 (item->method))
+                       if (amd64_is_imm32 (item->key))
                                item->chunk_size += CMP_SIZE;
                        else
                                item->chunk_size += MOV_REG_IMM_SIZE + CMP_REG_REG_SIZE;
@@ -6249,39 +5894,56 @@ mono_arch_build_imt_thunk (MonoVTable *vtable, MonoDomain *domain, MonoIMTCheckI
                }
                size += item->chunk_size;
        }
-       code = mono_code_manager_reserve (domain->code_mp, size);
+       if (fail_tramp)
+               code = mono_method_alloc_generic_virtual_thunk (domain, size);
+       else
+               code = mono_domain_code_reserve (domain, size);
        start = code;
        for (i = 0; i < count; ++i) {
                MonoIMTCheckItem *item = imt_entries [i];
                item->code_target = code;
                if (item->is_equals) {
-                       if (item->check_target_idx) {
-                               if (!item->compare_done) {
-                                       if (amd64_is_imm32 (item->method))
-                                               amd64_alu_reg_imm (code, X86_CMP, MONO_ARCH_IMT_REG, (guint32)(gssize)item->method);
+                       gboolean fail_case = !item->check_target_idx && fail_tramp;
+
+                       if (item->check_target_idx || fail_case) {
+                               if (!item->compare_done || fail_case) {
+                                       if (amd64_is_imm32 (item->key))
+                                               amd64_alu_reg_imm (code, X86_CMP, MONO_ARCH_IMT_REG, (guint32)(gssize)item->key);
                                        else {
-                                               amd64_mov_reg_imm (code, AMD64_R10, item->method);
+                                               amd64_mov_reg_imm (code, AMD64_R10, item->key);
                                                amd64_alu_reg_reg (code, X86_CMP, MONO_ARCH_IMT_REG, AMD64_R10);
                                        }
                                }
                                item->jmp_code = code;
                                amd64_branch8 (code, X86_CC_NE, 0, FALSE);
                                /* See the comment below about R10 */
-                               amd64_mov_reg_imm (code, AMD64_R10, & (vtable->vtable [item->vtable_slot]));
-                               amd64_jump_membase (code, AMD64_R10, 0);
+                               if (item->has_target_code) {
+                                       amd64_mov_reg_imm (code, AMD64_R10, item->value.target_code);
+                                       amd64_jump_reg (code, AMD64_R10);
+                               } else {
+                                       amd64_mov_reg_imm (code, AMD64_R10, & (vtable->vtable [item->value.vtable_slot]));
+                                       amd64_jump_membase (code, AMD64_R10, 0);
+                               }
+
+                               if (fail_case) {
+                                       amd64_patch (item->jmp_code, code);
+                                       amd64_mov_reg_imm (code, AMD64_R10, fail_tramp);
+                                       amd64_jump_reg (code, AMD64_R10);
+                                       item->jmp_code = NULL;
+                               }
                        } else {
                                /* enable the commented code to assert on wrong method */
 #if 0
-                               if (amd64_is_imm32 (item->method))
-                                       amd64_alu_reg_imm (code, X86_CMP, MONO_ARCH_IMT_REG, (guint32)(gssize)item->method);
+                               if (amd64_is_imm32 (item->key))
+                                       amd64_alu_reg_imm (code, X86_CMP, MONO_ARCH_IMT_REG, (guint32)(gssize)item->key);
                                else {
-                                       amd64_mov_reg_imm (code, AMD64_R10, item->method);
+                                       amd64_mov_reg_imm (code, AMD64_R10, item->key);
                                        amd64_alu_reg_reg (code, X86_CMP, MONO_ARCH_IMT_REG, AMD64_R10);
                                }
                                item->jmp_code = code;
                                amd64_branch8 (code, X86_CC_NE, 0, FALSE);
                                /* See the comment below about R10 */
-                               amd64_mov_reg_imm (code, AMD64_R10, & (vtable->vtable [item->vtable_slot]));
+                               amd64_mov_reg_imm (code, AMD64_R10, & (vtable->vtable [item->value.vtable_slot]));
                                amd64_jump_membase (code, AMD64_R10, 0);
                                amd64_patch (item->jmp_code, code);
                                amd64_breakpoint (code);
@@ -6292,15 +5954,15 @@ mono_arch_build_imt_thunk (MonoVTable *vtable, MonoDomain *domain, MonoIMTCheckI
                                   to be preserved for calls which
                                   require a runtime generic context,
                                   but interface calls don't. */
-                               amd64_mov_reg_imm (code, AMD64_R10, & (vtable->vtable [item->vtable_slot]));
+                               amd64_mov_reg_imm (code, AMD64_R10, & (vtable->vtable [item->value.vtable_slot]));
                                amd64_jump_membase (code, AMD64_R10, 0);
 #endif
                        }
                } else {
-                       if (amd64_is_imm32 (item->method))
-                               amd64_alu_reg_imm (code, X86_CMP, MONO_ARCH_IMT_REG, (guint32)(gssize)item->method);
+                       if (amd64_is_imm32 (item->key))
+                               amd64_alu_reg_imm (code, X86_CMP, MONO_ARCH_IMT_REG, (guint32)(gssize)item->key);
                        else {
-                               amd64_mov_reg_imm (code, AMD64_R10, item->method);
+                               amd64_mov_reg_imm (code, AMD64_R10, item->key);
                                amd64_alu_reg_reg (code, X86_CMP, MONO_ARCH_IMT_REG, AMD64_R10);
                        }
                        item->jmp_code = code;
@@ -6320,8 +5982,9 @@ mono_arch_build_imt_thunk (MonoVTable *vtable, MonoDomain *domain, MonoIMTCheckI
                        }
                }
        }
-               
-       mono_stats.imt_thunks_size += code - start;
+
+       if (!fail_tramp)
+               mono_stats.imt_thunks_size += code - start;
        g_assert (code - start <= size);
 
        return start;
@@ -6338,12 +6001,6 @@ mono_arch_find_this_argument (gpointer *regs, MonoMethod *method, MonoGenericSha
 {
        return mono_arch_get_this_arg_from_call (gsctx, mono_method_signature (method), (gssize*)regs, NULL);
 }
-
-void
-mono_arch_emit_imt_argument (MonoCompile *cfg, MonoCallInst *call)
-{
-       /* Done by the implementation of the CALL_MEMBASE opcodes */
-}
 #endif
 
 MonoVTable*
@@ -6352,69 +6009,6 @@ mono_arch_find_static_call_vtable (gpointer *regs, guint8 *code)
        return (MonoVTable*) regs [MONO_ARCH_RGCTX_REG];
 }
 
-MonoInst*
-mono_arch_get_inst_for_method (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
-{
-       MonoInst *ins = NULL;
-
-       if (cmethod->klass == mono_defaults.math_class) {
-               if (strcmp (cmethod->name, "Sin") == 0) {
-                       MONO_INST_NEW (cfg, ins, OP_SIN);
-                       ins->inst_i0 = args [0];
-               } else if (strcmp (cmethod->name, "Cos") == 0) {
-                       MONO_INST_NEW (cfg, ins, OP_COS);
-                       ins->inst_i0 = args [0];
-               } else if (strcmp (cmethod->name, "Sqrt") == 0) {
-                       MONO_INST_NEW (cfg, ins, OP_SQRT);
-                       ins->inst_i0 = args [0];
-               } else if (strcmp (cmethod->name, "Abs") == 0 && fsig->params [0]->type == MONO_TYPE_R8) {
-                       MONO_INST_NEW (cfg, ins, OP_ABS);
-                       ins->inst_i0 = args [0];
-               }
-
-               if (cfg->opt & MONO_OPT_CMOV) {
-                       int opcode = 0;
-
-                       if (strcmp (cmethod->name, "Min") == 0) {
-                               if (fsig->params [0]->type == MONO_TYPE_I4)
-                                       opcode = OP_IMIN;
-                               if (fsig->params [0]->type == MONO_TYPE_U4)
-                                       opcode = OP_IMIN_UN;
-                               else if (fsig->params [0]->type == MONO_TYPE_I8)
-                                       opcode = OP_LMIN;
-                               else if (fsig->params [0]->type == MONO_TYPE_U8)
-                                       opcode = OP_LMIN_UN;
-                       } else if (strcmp (cmethod->name, "Max") == 0) {
-                               if (fsig->params [0]->type == MONO_TYPE_I4)
-                                       opcode = OP_IMAX;
-                               if (fsig->params [0]->type == MONO_TYPE_U4)
-                                       opcode = OP_IMAX_UN;
-                               else if (fsig->params [0]->type == MONO_TYPE_I8)
-                                       opcode = OP_LMAX;
-                               else if (fsig->params [0]->type == MONO_TYPE_U8)
-                                       opcode = OP_LMAX_UN;
-                       }               
-
-                       if (opcode) {
-                               MONO_INST_NEW (cfg, ins, opcode);
-                               ins->inst_i0 = args [0];
-                               ins->inst_i1 = args [1];
-                       }
-               }
-
-#if 0
-               /* OP_FREM is not IEEE compatible */
-               else if (strcmp (cmethod->name, "IEEERemainder") == 0) {
-                       MONO_INST_NEW (cfg, ins, OP_FREM);
-                       ins->inst_i0 = args [0];
-                       ins->inst_i1 = args [1];
-               }
-#endif
-       }
-
-       return ins;
-}
-
 MonoInst*
 mono_arch_emit_inst_for_method (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
 {